From 771eb4fe8b420bb8563863e242861e635c742bc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jul 2018 23:27:33 -0400 Subject: fs: factor out d_mark_tmpfile() New helper for bcachefs - bcachefs doesn't want the inode_dec_link_count() call that d_tmpfile does, it handles i_nlink on its own atomically with other btree updates Signed-off-by: Kent Overstreet Cc: Alexander Viro Cc: Christian Brauner Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christian Brauner --- fs/dcache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 25ac74d30bff..796e23761ba0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3246,11 +3246,10 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill); } -void d_tmpfile(struct file *file, struct inode *inode) +void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); @@ -3260,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode) (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); +} +EXPORT_SYMBOL(d_mark_tmpfile); + +void d_tmpfile(struct file *file, struct inode *inode) +{ + struct dentry *dentry = file->f_path.dentry; + + inode_dec_link_count(inode); + d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); -- cgit From 1c6fdbd8f2465ddfb73a01ec620cbf3d14044e1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Mar 2017 22:18:50 -0800 Subject: bcachefs: Initial commit Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write filesystem with every feature you could possibly want. 
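The d_mark_tmpfile() helper from the previous patch exists for exactly this: bcachefs maintains i_nlink itself, atomically with its other btree updates, so its O_TMPFILE path wants d_tmpfile()'s dentry setup without the inode_dec_link_count() call. Roughly (a sketch only - bch2_tmpfile, __bch2_create and its argument list are illustrative assumptions here, not the literal code in fs.c):

  static int bch2_tmpfile(struct mnt_idmap *idmap, struct inode *vdir,
                          struct file *file, umode_t mode)
  {
          /* create the inode; i_nlink is accounted for in the btree update */
          struct bch_inode_info *inode =
                  __bch2_create(idmap, to_bch_ei(vdir), file->f_path.dentry,
                                mode, 0, true /* tmpfile */);

          if (IS_ERR(inode))
                  return PTR_ERR(inode);

          /* dentry bookkeeping only - no inode_dec_link_count() here */
          d_mark_tmpfile(file, &inode->v);
          d_instantiate(file->f_path.dentry, &inode->v);
          return finish_open_simple(file, 0);
  }
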
Website: https://bcachefs.org Signed-off-by: Kent Overstreet --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/bcachefs/Kconfig | 52 + fs/bcachefs/Makefile | 53 + fs/bcachefs/acl.c | 387 +++++ fs/bcachefs/acl.h | 59 + fs/bcachefs/alloc.c | 2205 +++++++++++++++++++++++++++ fs/bcachefs/alloc.h | 141 ++ fs/bcachefs/alloc_types.h | 90 ++ fs/bcachefs/bcachefs.h | 785 ++++++++++ fs/bcachefs/bcachefs_format.h | 1498 ++++++++++++++++++ fs/bcachefs/bcachefs_ioctl.h | 310 ++++ fs/bcachefs/bkey.c | 1164 ++++++++++++++ fs/bcachefs/bkey.h | 627 ++++++++ fs/bcachefs/bkey_methods.c | 192 +++ fs/bcachefs/bkey_methods.h | 87 ++ fs/bcachefs/bset.c | 1849 ++++++++++++++++++++++ fs/bcachefs/bset.h | 668 ++++++++ fs/bcachefs/btree_cache.c | 941 ++++++++++++ fs/bcachefs/btree_cache.h | 91 ++ fs/bcachefs/btree_gc.c | 1099 ++++++++++++++ fs/bcachefs/btree_gc.h | 113 ++ fs/bcachefs/btree_io.c | 2095 +++++++++++++++++++++++++ fs/bcachefs/btree_io.h | 197 +++ fs/bcachefs/btree_iter.c | 1844 ++++++++++++++++++++++ fs/bcachefs/btree_iter.h | 314 ++++ fs/bcachefs/btree_locking.h | 196 +++ fs/bcachefs/btree_types.h | 479 ++++++ fs/bcachefs/btree_update.h | 168 ++ fs/bcachefs/btree_update_interior.c | 2171 ++++++++++++++++++++++++++ fs/bcachefs/btree_update_interior.h | 374 +++++ fs/bcachefs/btree_update_leaf.c | 737 +++++++++ fs/bcachefs/buckets.c | 975 ++++++++++++ fs/bcachefs/buckets.h | 276 ++++ fs/bcachefs/buckets_types.h | 96 ++ fs/bcachefs/chardev.c | 663 ++++++++ fs/bcachefs/chardev.h | 31 + fs/bcachefs/checksum.c | 753 +++++++++ fs/bcachefs/checksum.h | 184 +++ fs/bcachefs/clock.c | 180 +++ fs/bcachefs/clock.h | 25 + fs/bcachefs/clock_types.h | 36 + fs/bcachefs/compress.c | 621 ++++++++ fs/bcachefs/compress.h | 18 + fs/bcachefs/debug.c | 425 ++++++ fs/bcachefs/debug.h | 63 + fs/bcachefs/dirent.c | 426 ++++++ fs/bcachefs/dirent.h | 55 + fs/bcachefs/disk_groups.c | 494 ++++++ fs/bcachefs/disk_groups.h | 74 + fs/bcachefs/error.c | 159 ++ fs/bcachefs/error.h | 229 +++ fs/bcachefs/extents.c | 2395 +++++++++++++++++++++++++++++ fs/bcachefs/extents.h | 539 +++++++ fs/bcachefs/extents_types.h | 27 + fs/bcachefs/eytzinger.h | 283 ++++ fs/bcachefs/fifo.h | 125 ++ fs/bcachefs/fs-io.c | 2862 +++++++++++++++++++++++++++++++++++ fs/bcachefs/fs-io.h | 47 + fs/bcachefs/fs-ioctl.c | 312 ++++ fs/bcachefs/fs-ioctl.h | 10 + fs/bcachefs/fs.c | 1773 ++++++++++++++++++++++ fs/bcachefs/fs.h | 99 ++ fs/bcachefs/fsck.c | 1306 ++++++++++++++++ fs/bcachefs/fsck.h | 8 + fs/bcachefs/inode.c | 517 +++++++ fs/bcachefs/inode.h | 101 ++ fs/bcachefs/io.c | 1875 +++++++++++++++++++++++ fs/bcachefs/io.h | 144 ++ fs/bcachefs/io_types.h | 148 ++ fs/bcachefs/journal.c | 1140 ++++++++++++++ fs/bcachefs/journal.h | 383 +++++ fs/bcachefs/journal_io.c | 1392 +++++++++++++++++ fs/bcachefs/journal_io.h | 44 + fs/bcachefs/journal_reclaim.c | 402 +++++ fs/bcachefs/journal_reclaim.h | 42 + fs/bcachefs/journal_seq_blacklist.c | 360 +++++ fs/bcachefs/journal_seq_blacklist.h | 14 + fs/bcachefs/journal_types.h | 242 +++ fs/bcachefs/keylist.c | 67 + fs/bcachefs/keylist.h | 76 + fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/migrate.c | 178 +++ fs/bcachefs/migrate.h | 7 + fs/bcachefs/move.c | 761 ++++++++++ fs/bcachefs/move.h | 63 + fs/bcachefs/move_types.h | 15 + fs/bcachefs/movinggc.c | 309 ++++ fs/bcachefs/movinggc.h | 9 + fs/bcachefs/opts.c | 381 +++++ fs/bcachefs/opts.h | 296 ++++ fs/bcachefs/quota.c | 790 ++++++++++ fs/bcachefs/quota.h | 76 + fs/bcachefs/quota_types.h | 37 + fs/bcachefs/rebalance.c | 342 +++++ fs/bcachefs/rebalance.h | 28 + 
fs/bcachefs/rebalance_types.h | 27 + fs/bcachefs/recovery.c | 377 +++++ fs/bcachefs/recovery.h | 8 + fs/bcachefs/replicas.c | 698 +++++++++ fs/bcachefs/replicas.h | 52 + fs/bcachefs/siphash.c | 173 +++ fs/bcachefs/siphash.h | 87 ++ fs/bcachefs/six.c | 780 ++++++++++ fs/bcachefs/six.h | 215 +++ fs/bcachefs/str_hash.h | 319 ++++ fs/bcachefs/super-io.c | 971 ++++++++++++ fs/bcachefs/super-io.h | 142 ++ fs/bcachefs/super.c | 1754 +++++++++++++++++++++ fs/bcachefs/super.h | 228 +++ fs/bcachefs/super_types.h | 63 + fs/bcachefs/sysfs.c | 1027 +++++++++++++ fs/bcachefs/sysfs.h | 44 + fs/bcachefs/tests.c | 531 +++++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 12 + fs/bcachefs/trace.h | 536 +++++++ fs/bcachefs/util.c | 942 ++++++++++++ fs/bcachefs/util.h | 737 +++++++++ fs/bcachefs/vstructs.h | 63 + fs/bcachefs/xattr.c | 485 ++++++ fs/bcachefs/xattr.h | 49 + 122 files changed, 57147 insertions(+) create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c create mode 100644 fs/bcachefs/acl.h create mode 100644 fs/bcachefs/alloc.c create mode 100644 fs/bcachefs/alloc.h create mode 100644 fs/bcachefs/alloc_types.h create mode 100644 fs/bcachefs/bcachefs.h create mode 100644 fs/bcachefs/bcachefs_format.h create mode 100644 fs/bcachefs/bcachefs_ioctl.h create mode 100644 fs/bcachefs/bkey.c create mode 100644 fs/bcachefs/bkey.h create mode 100644 fs/bcachefs/bkey_methods.c create mode 100644 fs/bcachefs/bkey_methods.h create mode 100644 fs/bcachefs/bset.c create mode 100644 fs/bcachefs/bset.h create mode 100644 fs/bcachefs/btree_cache.c create mode 100644 fs/bcachefs/btree_cache.h create mode 100644 fs/bcachefs/btree_gc.c create mode 100644 fs/bcachefs/btree_gc.h create mode 100644 fs/bcachefs/btree_io.c create mode 100644 fs/bcachefs/btree_io.h create mode 100644 fs/bcachefs/btree_iter.c create mode 100644 fs/bcachefs/btree_iter.h create mode 100644 fs/bcachefs/btree_locking.h create mode 100644 fs/bcachefs/btree_types.h create mode 100644 fs/bcachefs/btree_update.h create mode 100644 fs/bcachefs/btree_update_interior.c create mode 100644 fs/bcachefs/btree_update_interior.h create mode 100644 fs/bcachefs/btree_update_leaf.c create mode 100644 fs/bcachefs/buckets.c create mode 100644 fs/bcachefs/buckets.h create mode 100644 fs/bcachefs/buckets_types.h create mode 100644 fs/bcachefs/chardev.c create mode 100644 fs/bcachefs/chardev.h create mode 100644 fs/bcachefs/checksum.c create mode 100644 fs/bcachefs/checksum.h create mode 100644 fs/bcachefs/clock.c create mode 100644 fs/bcachefs/clock.h create mode 100644 fs/bcachefs/clock_types.h create mode 100644 fs/bcachefs/compress.c create mode 100644 fs/bcachefs/compress.h create mode 100644 fs/bcachefs/debug.c create mode 100644 fs/bcachefs/debug.h create mode 100644 fs/bcachefs/dirent.c create mode 100644 fs/bcachefs/dirent.h create mode 100644 fs/bcachefs/disk_groups.c create mode 100644 fs/bcachefs/disk_groups.h create mode 100644 fs/bcachefs/error.c create mode 100644 fs/bcachefs/error.h create mode 100644 fs/bcachefs/extents.c create mode 100644 fs/bcachefs/extents.h create mode 100644 fs/bcachefs/extents_types.h create mode 100644 fs/bcachefs/eytzinger.h create mode 100644 fs/bcachefs/fifo.h create mode 100644 fs/bcachefs/fs-io.c create mode 100644 fs/bcachefs/fs-io.h create mode 100644 fs/bcachefs/fs-ioctl.c create mode 100644 fs/bcachefs/fs-ioctl.h create mode 100644 fs/bcachefs/fs.c create mode 100644 fs/bcachefs/fs.h create mode 100644 fs/bcachefs/fsck.c create mode 100644 
fs/bcachefs/fsck.h create mode 100644 fs/bcachefs/inode.c create mode 100644 fs/bcachefs/inode.h create mode 100644 fs/bcachefs/io.c create mode 100644 fs/bcachefs/io.h create mode 100644 fs/bcachefs/io_types.h create mode 100644 fs/bcachefs/journal.c create mode 100644 fs/bcachefs/journal.h create mode 100644 fs/bcachefs/journal_io.c create mode 100644 fs/bcachefs/journal_io.h create mode 100644 fs/bcachefs/journal_reclaim.c create mode 100644 fs/bcachefs/journal_reclaim.h create mode 100644 fs/bcachefs/journal_seq_blacklist.c create mode 100644 fs/bcachefs/journal_seq_blacklist.h create mode 100644 fs/bcachefs/journal_types.h create mode 100644 fs/bcachefs/keylist.c create mode 100644 fs/bcachefs/keylist.h create mode 100644 fs/bcachefs/keylist_types.h create mode 100644 fs/bcachefs/migrate.c create mode 100644 fs/bcachefs/migrate.h create mode 100644 fs/bcachefs/move.c create mode 100644 fs/bcachefs/move.h create mode 100644 fs/bcachefs/move_types.h create mode 100644 fs/bcachefs/movinggc.c create mode 100644 fs/bcachefs/movinggc.h create mode 100644 fs/bcachefs/opts.c create mode 100644 fs/bcachefs/opts.h create mode 100644 fs/bcachefs/quota.c create mode 100644 fs/bcachefs/quota.h create mode 100644 fs/bcachefs/quota_types.h create mode 100644 fs/bcachefs/rebalance.c create mode 100644 fs/bcachefs/rebalance.h create mode 100644 fs/bcachefs/rebalance_types.h create mode 100644 fs/bcachefs/recovery.c create mode 100644 fs/bcachefs/recovery.h create mode 100644 fs/bcachefs/replicas.c create mode 100644 fs/bcachefs/replicas.h create mode 100644 fs/bcachefs/siphash.c create mode 100644 fs/bcachefs/siphash.h create mode 100644 fs/bcachefs/six.c create mode 100644 fs/bcachefs/six.h create mode 100644 fs/bcachefs/str_hash.h create mode 100644 fs/bcachefs/super-io.c create mode 100644 fs/bcachefs/super-io.h create mode 100644 fs/bcachefs/super.c create mode 100644 fs/bcachefs/super.h create mode 100644 fs/bcachefs/super_types.h create mode 100644 fs/bcachefs/sysfs.c create mode 100644 fs/bcachefs/sysfs.h create mode 100644 fs/bcachefs/tests.c create mode 100644 fs/bcachefs/tests.h create mode 100644 fs/bcachefs/trace.c create mode 100644 fs/bcachefs/trace.h create mode 100644 fs/bcachefs/util.c create mode 100644 fs/bcachefs/util.h create mode 100644 fs/bcachefs/vstructs.h create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index aa7e03cc1941..0d6cb927872a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" +source "fs/bcachefs/Kconfig" source "fs/zonefs/Kconfig" endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile index f9541f40be4e..75522f88e763 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ +obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 index 000000000000..c13f2cfa6489 --- /dev/null +++ b/fs/bcachefs/Kconfig @@ -0,0 +1,52 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" + depends on BLOCK + select EXPORTFS + select CLOSURES + select LIBCRC32C + select FS_POSIX_ACL + select LZ4_COMPRESS + select LZ4_DECOMPRESS + select ZLIB_DEFLATE + select ZLIB_INFLATE + select ZSTD_COMPRESS + 
select ZSTD_DECOMPRESS + select CRYPTO_SHA256 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select KEYS + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. + +config BCACHEFS_QUOTA + bool "bcachefs quota support" + depends on BCACHEFS_FS + select QUOTACTL + +config BCACHEFS_POSIX_ACL + bool "bcachefs POSIX ACL support" + depends on BCACHEFS_FS + select FS_POSIX_ACL + +config BCACHEFS_DEBUG + bool "bcachefs debugging" + depends on BCACHEFS_FS + help + Enables many extra debugging checks and assertions. + + The resulting code will be significantly slower than normal; you + probably shouldn't select this option unless you're a developer. + +config BCACHEFS_TESTS + bool "bcachefs unit and performance tests" + depends on BCACHEFS_FS + help + Include some unit and performance tests for the core btree code + +config BCACHEFS_NO_LATENCY_ACCT + bool "disable latency accounting and time stats" + depends on BCACHEFS_FS + help + This disables device latency tracking and time stats, only for performance testing diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 index 000000000000..13cd6d2cdc91 --- /dev/null +++ b/fs/bcachefs/Makefile @@ -0,0 +1,53 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + +bcachefs-y := \ + acl.o \ + alloc.o \ + bkey.o \ + bkey_methods.o \ + bset.o \ + btree_cache.o \ + btree_gc.o \ + btree_io.o \ + btree_iter.o \ + btree_update_interior.o \ + btree_update_leaf.o \ + buckets.o \ + chardev.o \ + checksum.o \ + clock.o \ + compress.o \ + debug.o \ + dirent.o \ + disk_groups.o \ + error.o \ + extents.o \ + fs.o \ + fs-ioctl.o \ + fs-io.o \ + fsck.o \ + inode.o \ + io.o \ + journal.o \ + journal_io.o \ + journal_reclaim.o \ + journal_seq_blacklist.o \ + keylist.o \ + migrate.o \ + move.o \ + movinggc.o \ + opts.o \ + quota.o \ + rebalance.o \ + recovery.o \ + replicas.o \ + siphash.o \ + six.o \ + super.o \ + super-io.o \ + sysfs.o \ + tests.o \ + trace.o \ + util.o \ + xattr.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 index 000000000000..eaf5c8e138fb --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "bcachefs.h" + +#include +#include +#include +#include +#include + +#include "acl.h" +#include "fs.h" +#include "xattr.h" + +static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) +{ + return sizeof(bch_acl_header) + + sizeof(bch_acl_entry_short) * nr_short + + sizeof(bch_acl_entry) * nr_long; +} + +static inline int acl_to_xattr_type(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + default: + BUG(); + } +} + +/* + * Convert from filesystem to in-memory representation. 
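+ *
+ * The on-disk encoding (see the typedefs in acl.h) is a bch_acl_header
+ * holding just a version number, followed by a packed array of entries:
+ * ACL_USER and ACL_GROUP entries use the full bch_acl_entry, which carries
+ * a uid/gid in e_id, while ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK and
+ * ACL_OTHER use the shorter bch_acl_entry_short with no e_id. All fields
+ * are little endian.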
+ */ +static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) +{ + const void *p, *end = value + size; + struct posix_acl *acl; + struct posix_acl_entry *out; + unsigned count = 0; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + goto invalid; + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + goto invalid; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *entry = p; + + if (p + sizeof(bch_acl_entry_short) > end) + goto invalid; + + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + case ACL_GROUP: + p += sizeof(bch_acl_entry); + break; + default: + goto invalid; + } + + count++; + } + + if (p > end) + goto invalid; + + if (!count) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + out = acl->a_entries; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + + out->e_tag = le16_to_cpu(in->e_tag); + out->e_perm = le16_to_cpu(in->e_perm); + + switch (out->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + out->e_uid = make_kuid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + out->e_gid = make_kgid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + out++; + } + + BUG_ON(out != acl->a_entries + acl->a_count); + + return acl; +invalid: + pr_err("invalid acl entry"); + return ERR_PTR(-EINVAL); +} + +#define acl_for_each_entry(acl, acl_e) \ + for (acl_e = acl->a_entries; \ + acl_e < acl->a_entries + acl->a_count; \ + acl_e++) + +/* + * Convert from in-memory to filesystem representation. 
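+ *
+ * The result is an ordinary xattr key with a zero length name: the ACL type
+ * is encoded in x_type (via acl_to_xattr_type()) and the packed entries make
+ * up the xattr value, so ACLs are stored and looked up through the same
+ * hash/btree machinery as regular xattrs.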
+ */ +static struct bkey_i_xattr * +bch2_acl_to_xattr(struct btree_trans *trans, + const struct posix_acl *acl, + int type) +{ + struct bkey_i_xattr *xattr; + bch_acl_header *acl_header; + const struct posix_acl_entry *acl_e; + void *outptr; + unsigned nr_short = 0, nr_long = 0, acl_len, u64s; + + acl_for_each_entry(acl, acl_e) { + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: + nr_long++; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + nr_short++; + break; + default: + return ERR_PTR(-EINVAL); + } + } + + acl_len = bch2_acl_size(nr_short, nr_long); + u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); + + if (u64s > U8_MAX) + return ERR_PTR(-E2BIG); + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return xattr; + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = acl_to_xattr_type(type); + xattr->v.x_name_len = 0, + xattr->v.x_val_len = cpu_to_le16(acl_len); + + acl_header = xattr_val(&xattr->v); + acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); + + outptr = (void *) acl_header + sizeof(*acl_header); + + acl_for_each_entry(acl, acl_e) { + bch_acl_entry *entry = outptr; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + outptr += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + outptr += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + outptr += sizeof(bch_acl_entry_short); + break; + } + } + + BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); + + return xattr; +} + +struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + struct posix_acl *acl = NULL; + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (IS_ERR(iter)) { + if (PTR_ERR(iter) == -EINTR) + goto retry; + + if (PTR_ERR(iter) != -ENOENT) + acl = ERR_CAST(iter); + goto out; + } + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + + if (!IS_ERR(acl)) + set_cached_acl(&inode->v, type, acl); +out: + bch2_trans_exit(&trans); + return acl; +} + +int bch2_set_acl_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) +{ + int ret; + + if (type == ACL_TYPE_DEFAULT && + !S_ISDIR(inode_u->bi_mode)) + return acl ? -EACCES : 0; + + if (acl) { + struct bkey_i_xattr *xattr = + bch2_acl_to_xattr(trans, acl, type); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inode_u->bi_inum, &xattr->k_i, 0); + } else { + struct xattr_search_key search = + X_SEARCH(acl_to_xattr_type(type), "", 0); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, + inode_u->bi_inum, &search); + } + + return ret == -ENOENT ? 
0 : ret; +} + +static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec64 now = current_time(&inode->v); + umode_t mode = (unsigned long) p; + + bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mode = mode; + return 0; +} + +int bch2_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + struct posix_acl *acl, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + umode_t mode = inode->v.i_mode; + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); + if (ret) + return ret; + } + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = bch2_set_acl_trans(&trans, + &inode->ei_inode, + &inode->ei_str_hash, + acl, type) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_set_acl_fn, + (void *)(unsigned long) mode) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +err: + bch2_trans_exit(&trans); + + return ret; +} + +int bch2_acl_chmod(struct btree_trans *trans, + struct bch_inode_info *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; + int ret = 0; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + goto err; + + new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto err; + } + + bch2_trans_update(trans, iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; +err: + kfree(acl); + return ret; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h new file mode 100644 index 000000000000..73739e38e2d5 --- /dev/null +++ b/fs/bcachefs/acl.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ACL_H +#define _BCACHEFS_ACL_H + +struct bch_inode_unpacked; +struct bch_hash_info; +struct bch_inode_info; +struct posix_acl; + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); + +int bch2_set_acl_trans(struct btree_trans *, + struct bch_inode_unpacked *, + const struct bch_hash_info *, + struct posix_acl *, int); +int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); +int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, + umode_t, struct posix_acl **); + +#else + +static inline int bch2_set_acl_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) +{ + return 0; +} + +static inline int bch2_acl_chmod(struct btree_trans *trans, + struct bch_inode_info *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + return 0; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ + +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c new file mode 100644 index 000000000000..e6e506e4a8a3 --- /dev/null +++ b/fs/bcachefs/alloc.c @@ -0,0 +1,2205 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. 
+ * + * free_inc isn't the only freelist - if it was, we'd often have to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * It's important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + * + * invalidate_buckets() drives all the processes described above. It's called + * from bch2_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "journal_io.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); + +/* Ratelimiting/PD controllers */ + +static void pd_controllers_update(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, + pd_controllers_update); + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + + u64 free = bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC + */ + s64 fragmented = (bucket_to_sector(ca, + stats.buckets[BCH_DATA_USER] + + stats.buckets[BCH_DATA_CACHED]) - + (stats.sectors[BCH_DATA_USER] + + stats.sectors[BCH_DATA_CACHED])) << 9; + + fragmented = max(0LL, fragmented); + + bch2_pd_controller_update(&ca->copygc_pd, + free, fragmented, -1); + } + + schedule_delayed_work(&c->pd_controllers_update, + c->pd_controllers_update_seconds * HZ); +} + +/* Persistent alloc info: */ + +static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +{ + unsigned bytes = offsetof(struct bch_alloc, data); + + if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + bytes += 2; + if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + bytes += 2; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + switch (k.k->type) { + case BCH_ALLOC: { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) + return "incorrect value size"; + break; + } 
+ default: + return "invalid type"; + } + + return NULL; +} + +void bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + buf[0] = '\0'; + + switch (k.k->type) { + case BCH_ALLOC: + break; + } +} + +static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) +{ + unsigned v; + + switch (bytes) { + case 1: + v = **p; + break; + case 2: + v = le16_to_cpup((void *) *p); + break; + case 4: + v = le32_to_cpup((void *) *p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +{ + switch (bytes) { + case 1: + **p = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + default: + BUG(); + } + + *p += bytes; +} + +static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_dev *ca; + struct bkey_s_c_alloc a; + struct bucket_mark new; + struct bucket *g; + const u8 *d; + + if (k.k->type != BCH_ALLOC) + return; + + a = bkey_s_c_to_alloc(k); + ca = bch_dev_bkey_exists(c, a.k->p.inode); + + if (a.k->p.offset >= ca->mi.nbuckets) + return; + + percpu_down_read(&c->usage_lock); + + g = bucket(ca, a.k->p.offset); + bucket_cmpxchg(g, new, ({ + new.gen = a.v->gen; + new.gen_valid = 1; + })); + + d = a.v->data; + if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + g->io_time[READ] = get_alloc_field(&d, 2); + if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + g->io_time[WRITE] = get_alloc_field(&d, 2); + + percpu_up_read(&c->usage_lock); +} + +int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) +{ + struct journal_replay *r; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { + bch2_alloc_read_key(c, k); + bch2_btree_iter_cond_resched(&iter); + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + list_for_each_entry(r, journal_replay_list, list) { + struct bkey_i *k, *n; + struct jset_entry *entry; + + for_each_jset_key(k, n, entry, &r->j) + if (entry->btree_id == BTREE_ID_ALLOC) + bch2_alloc_read_key(c, bkey_i_to_s_c(k)); + } + + mutex_lock(&c->bucket_clock[READ].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, READ); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[READ].lock); + + mutex_lock(&c->bucket_clock[WRITE].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, WRITE); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[WRITE].lock); + + return 0; +} + +static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct btree_iter *iter, + u64 *journal_seq, bool nowait) +{ + struct bucket_mark m; + __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + struct bucket *g; + struct bkey_i_alloc *a; + u8 *d; + int ret; + unsigned flags = BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE; + + if (nowait) + flags |= BTREE_INSERT_NOWAIT; + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + + do { + ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); + if (ret) + break; + + percpu_down_read(&c->usage_lock); + g = bucket(ca, b); + + /* read mark under btree node lock: */ + m = READ_ONCE(g->mark); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + a->v.fields = 0; + a->v.gen = m.gen; + 
set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->io_time[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_up_read(&c->usage_lock); + + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_btree_iter_cond_resched(iter); + } while (ret == -EINTR); + + return ret; +} + +int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + struct btree_iter iter; + int ret; + + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return 0; + + ca = bch_dev_bkey_exists(c, pos.inode); + + if (pos.offset >= ca->mi.nbuckets) + return 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, + NULL, false); + bch2_btree_iter_unlock(&iter); + return ret; +} + +int bch2_alloc_write(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + + for_each_rw_member(ca, c, i) { + struct btree_iter iter; + unsigned long bucket; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + down_read(&ca->bucket_lock); + for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { + ret = __bch2_alloc_write_key(c, ca, bucket, &iter, + NULL, false); + if (ret) + break; + + clear_bit(bucket, ca->buckets_dirty); + } + up_read(&ca->bucket_lock); + bch2_btree_iter_unlock(&iter); + + if (ret) { + percpu_ref_put(&ca->io_ref); + break; + } + } + + return ret; +} + +/* Bucket IO clocks: */ + +static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets = bucket_array(ca); + struct bucket *g; + u16 max_last_io = 0; + unsigned i; + + lockdep_assert_held(&c->bucket_clock[rw].lock); + + /* Recalculate max_last_io for this device: */ + for_each_bucket(g, buckets) + max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); + + ca->max_last_bucket_io[rw] = max_last_io; + + /* Recalculate global max_last_io: */ + max_last_io = 0; + + for_each_member_device(ca, c, i) + max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); + + clock->max_last_io = max_last_io; +} + +static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets; + struct bch_dev *ca; + struct bucket *g; + unsigned i; + + trace_rescale_prios(c); + + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + g->io_time[rw] = clock->hand - + bucket_last_io(c, g, rw) / 2; + + bch2_recalc_oldest_io(c, ca, rw); + + up_read(&ca->bucket_lock); + } +} + +static void bch2_inc_clock_hand(struct io_timer *timer) +{ + struct bucket_clock *clock = container_of(timer, + struct bucket_clock, rescale); + struct bch_fs *c = container_of(clock, + struct bch_fs, bucket_clock[clock->rw]); + struct bch_dev *ca; + u64 capacity; + unsigned i; + + mutex_lock(&clock->lock); + + /* if clock cannot be advanced more, rescale prio */ + if (clock->max_last_io >= U16_MAX - 2) + bch2_rescale_bucket_io_times(c, clock->rw); + + BUG_ON(clock->max_last_io >= U16_MAX - 2); + + for_each_member_device(ca, c, i) + ca->max_last_bucket_io[clock->rw]++; + clock->max_last_io++; + clock->hand++; + + mutex_unlock(&clock->lock); + + 
capacity = READ_ONCE(c->capacity); + + if (!capacity) + return; + + /* + * we only increment when 0.1% of the filesystem capacity has been read + * or written too, this determines if it's time + * + * XXX: we shouldn't really be going off of the capacity of devices in + * RW mode (that will be 0 when we're RO, yet we can still service + * reads) + */ + timer->expire += capacity >> 10; + + bch2_io_timer_add(&c->io_clock[clock->rw], timer); +} + +static void bch2_bucket_clock_init(struct bch_fs *c, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + + clock->hand = 1; + clock->rw = rw; + clock->rescale.fn = bch2_inc_clock_hand; + clock->rescale.expire = c->capacity >> 10; + mutex_init(&clock->lock); +} + +/* Background allocator thread: */ + +/* + * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens + * (marking them as invalidated on disk), then optionally issues discard + * commands to the newly free buckets, then puts them on the various freelists. + */ + +static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ + if (expensive_debug_checks(c) && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + size_t iter; + long i; + unsigned j; + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + BUG_ON(i == bucket); + fifo_for_each_entry(i, &ca->free_inc, iter) + BUG_ON(i == bucket); + } +} + +#define BUCKET_GC_GEN_MAX 96U + +/** + * wait_buckets_available - wait on reclaimable buckets + * + * If there aren't enough available buckets to fill up free_inc, wait until + * there are. + */ +static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned long gc_count = c->gc_count; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + ret = 1; + break; + } + + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + + if ((ssize_t) (dev_buckets_available(c, ca) - + ca->inc_gen_really_needs_gc) >= + (ssize_t) fifo_free(&ca->free_inc)) + break; + + up_read(&c->gc_lock); + schedule(); + try_to_freeze(); + down_read(&c->gc_lock); + } + + __set_current_state(TASK_RUNNING); + return ret; +} + +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, + size_t bucket, + struct bucket_mark mark) +{ + u8 gc_gen; + + if (!is_available_bucket(mark)) + return false; + + gc_gen = bucket_gc_gen(ca, bucket); + + if (gc_gen >= BUCKET_GC_GEN_MAX / 2) + ca->inc_gen_needs_gc++; + + if (gc_gen >= BUCKET_GC_GEN_MAX) + ca->inc_gen_really_needs_gc++; + + return gc_gen < BUCKET_GC_GEN_MAX; +} + +static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ + struct bucket_mark m; + + percpu_down_read(&c->usage_lock); + spin_lock(&c->freelist_lock); + + if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->usage_lock); + return; + } + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->usage_lock); + + /* gc lock held: */ + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + if (m.cached_sectors) { + ca->allocator_invalidating_data = true; + } else if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + ca->allocator_journal_seq_flush = + 
max(ca->allocator_journal_seq_flush, bucket_seq); + } +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_key() + * first. + * + * + * - We take into account the read prio of the bucket, which gives us an + * indication of how hot the data is -- we scale the prio so that the prio + * farthest from the clock is worth 1/8th of the closest. + * + * - The number of sectors of cached data in the bucket, which gives us an + * indication of the cost in cache misses this eviction will cause. + * + * - If hotness * sectors used compares equal, we pick the bucket with the + * smallest bucket_gc_gen() - since incrementing the same bucket's generation + * number repeatedly forces us to run mark and sweep gc to avoid generation + * number wraparound. + */ + +static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark m) +{ + unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); + unsigned max_last_io = ca->max_last_bucket_io[READ]; + + /* + * Time since last read, scaled to [0, 8) where larger value indicates + * more recently read data: + */ + unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + + /* How much we want to keep the data in this bucket: */ + unsigned long data_wantness = + (hotness + 1) * bucket_sectors_used(m); + + unsigned long needs_journal_commit = + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + + return (data_wantness << 9) | + (needs_journal_commit << 8) | + bucket_gc_gen(ca, b); +} + +static inline int bucket_alloc_cmp(alloc_heap *h, + struct alloc_heap_entry l, + struct alloc_heap_entry r) +{ + return (l.key > r.key) - (l.key < r.key) ?: + (l.nr < r.nr) - (l.nr > r.nr) ?: + (l.bucket > r.bucket) - (l.bucket < r.bucket); +} + +static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets; + struct alloc_heap_entry e = { 0 }; + size_t b; + + ca->alloc_heap.used = 0; + + mutex_lock(&c->bucket_clock[READ].lock); + down_read(&ca->bucket_lock); + + buckets = bucket_array(ca); + + bch2_recalc_oldest_io(c, ca, READ); + + /* + * Find buckets with lowest read priority, by building a maxheap sorted + * by read priority and repeatedly replacing the maximum element until + * all buckets have been visited. 
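+ *
+ * Because the heap is a fixed size and it's the current maximum that gets
+ * replaced, the entries that survive the scan are the buckets with the
+ * smallest sort keys - i.e. the cheapest to evict. Runs of adjacent buckets
+ * with identical keys are coalesced into a single heap entry; after the scan
+ * the heap is re-sorted so entries can be popped in ascending key order to
+ * fill free_inc.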
+ */ + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + unsigned long key = bucket_sort_key(c, ca, b, m); + + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; + + if (e.nr && e.bucket + e.nr == b && e.key == key) { + e.nr++; + } else { + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + + e = (struct alloc_heap_entry) { + .bucket = b, + .nr = 1, + .key = key, + }; + } + + cond_resched(); + } + + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + + up_read(&ca->bucket_lock); + mutex_unlock(&c->bucket_clock[READ].lock); + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + + while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { + for (b = e.bucket; + b < e.bucket + e.nr; + b++) { + if (fifo_full(&ca->free_inc)) + return; + + bch2_invalidate_one_bucket(c, ca, b); + } + } +} + +static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct bucket_mark m; + size_t b, checked; + + for (checked = 0; + checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); + checked++) { + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + b = ca->fifo_last_bucket++; + + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) + bch2_invalidate_one_bucket(c, ca, b); + + cond_resched(); + } +} + +static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct bucket_mark m; + size_t checked; + + for (checked = 0; + checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); + checked++) { + size_t b = bch2_rand_range(ca->mi.nbuckets - + ca->mi.first_bucket) + + ca->mi.first_bucket; + + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) + bch2_invalidate_one_bucket(c, ca, b); + + cond_resched(); + } +} + +static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; + + switch (ca->mi.replacement) { + case CACHE_REPLACEMENT_LRU: + find_reclaimable_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + find_reclaimable_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + find_reclaimable_buckets_random(c, ca); + break; + } +} + +static int size_t_cmp(const void *_l, const void *_r) +{ + const size_t *l = _l, *r = _r; + + return (*l > *r) - (*l < *r); +} + +static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +{ + BUG_ON(ca->free_inc.front); + + spin_lock(&c->freelist_lock); + sort(ca->free_inc.data, + ca->free_inc.back, + sizeof(ca->free_inc.data[0]), + size_t_cmp, NULL); + spin_unlock(&c->freelist_lock); +} + +static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq, size_t nr, + bool nowait) +{ + struct btree_iter iter; + int ret = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + /* Only use nowait if we've already invalidated at least one bucket: */ + while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { + size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); + + ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq, + nowait && ca->nr_invalidated); + if (ret) + break; + + ca->nr_invalidated++; + } + + bch2_btree_iter_unlock(&iter); + + /* If we used NOWAIT, 
don't return the error: */ + return ca->nr_invalidated ? 0 : ret; +} + +static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + unsigned i; + + /* + * Don't remove from free_inc until after it's added to + * freelist, so gc can find it: + */ + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + --ca->nr_invalidated; + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + return true; + } + spin_unlock(&c->freelist_lock); + + return false; +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (__push_invalidated_bucket(c, ca, bucket)) + break; + + if ((current->flags & PF_KTHREAD) && + kthread_should_stop()) { + ret = 1; + break; + } + + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + return ret; +} + +/* + * Given an invalidated, ready to use bucket: issue a discard to it if enabled, + * then add it to the freelist, waiting until there's room if necessary: + */ +static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + while (ca->nr_invalidated) { + size_t bucket = fifo_peek(&ca->free_inc); + + BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); + + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO); + + if (push_invalidated_bucket(c, ca, bucket)) + return 1; + } + + return 0; +} + +/** + * bch_allocator_thread - move buckets from free_inc to reserves + * + * The free_inc FIFO is populated by find_reclaimable_buckets(), and + * the reserves are depleted by bucket allocation. When we run out + * of free_inc, try to invalidate some buckets and write out + * prios and gens. 
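+ *
+ * Invalidating a bucket means writing its new generation number as a
+ * BTREE_ID_ALLOC key (__bch2_alloc_write_key()); if any of the buckets being
+ * invalidated still held cached data, the journal is flushed up to the
+ * relevant sequence number before they are pushed onto the freelists, so that
+ * stale pointers into them can't reappear after a crash.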
+ */ +static int bch2_allocator_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + u64 journal_seq; + int ret; + + set_freezable(); + + while (1) { + while (1) { + cond_resched(); + + pr_debug("discarding %zu invalidated buckets", + ca->nr_invalidated); + + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; + + if (fifo_empty(&ca->free_inc)) + break; + + pr_debug("invalidating %zu buckets", + fifo_used(&ca->free_inc)); + + journal_seq = 0; + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, + SIZE_MAX, true); + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + goto stop; + } + + if (!ca->nr_invalidated) { + bch_err(ca, "allocator thread unable to make forward progress!"); + goto stop; + } + + if (ca->allocator_invalidating_data) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + else if (ca->allocator_journal_seq_flush) + ret = bch2_journal_flush_seq(&c->journal, + ca->allocator_journal_seq_flush); + + /* + * journal error - buckets haven't actually been + * invalidated, can't discard them: + */ + if (ret) { + bch_err(ca, "journal error: %i", ret); + goto stop; + } + } + + pr_debug("free_inc now empty"); + + /* Reset front/back so we can easily sort fifo entries later: */ + ca->free_inc.front = ca->free_inc.back = 0; + ca->allocator_journal_seq_flush = 0; + ca->allocator_invalidating_data = false; + + down_read(&c->gc_lock); + while (1) { + size_t prev = fifo_used(&ca->free_inc); + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + bch_err(ca, "gc failure"); + goto stop; + } + + /* + * Find some buckets that we can invalidate, either + * they're completely unused, or only contain clean data + * that's been written back to the backing device or + * another cache tier + */ + + pr_debug("scanning for reclaimable buckets"); + + find_reclaimable_buckets(c, ca); + + pr_debug("found %zu buckets (free_inc %zu/%zu)", + fifo_used(&ca->free_inc) - prev, + fifo_used(&ca->free_inc), ca->free_inc.size); + + trace_alloc_batch(ca, fifo_used(&ca->free_inc), + ca->free_inc.size); + + if ((ca->inc_gen_needs_gc >= ca->free_inc.size || + (!fifo_full(&ca->free_inc) && + ca->inc_gen_really_needs_gc >= + fifo_free(&ca->free_inc))) && + c->gc_thread) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + + if (fifo_full(&ca->free_inc)) + break; + + if (!fifo_empty(&ca->free_inc) && + !fifo_full(&ca->free[RESERVE_MOVINGGC])) + break; + + /* + * copygc may be waiting until either its reserve fills + * up, or we can't make forward progress: + */ + ca->allocator_blocked = true; + closure_wake_up(&c->freelist_wait); + + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } + } + + ca->allocator_blocked = false; + up_read(&c->gc_lock); + + pr_debug("free_inc now %zu/%zu", + fifo_used(&ca->free_inc), + ca->free_inc.size); + + sort_free_inc(c, ca); + + /* + * free_inc is now full of newly-invalidated buckets: next, + * write out the new bucket gens: + */ + } + +stop: + pr_debug("alloc thread stopping (ret %i)", ret); + return 0; +} + +/* Allocation */ + +/* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + percpu_down_read(&c->usage_lock); + spin_lock(&ob->lock); + + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), + false, gc_pos_alloc(c, ob), 0); + ob->valid = false; + + spin_unlock(&ob->lock); + percpu_up_read(&c->usage_lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + + c->open_buckets_nr_free--; + return ob; +} + +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ + struct bucket_array *buckets; + ssize_t b; + + rcu_read_lock(); + buckets = bucket_array(ca); + + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark)) + goto success; + b = -1; +success: + rcu_read_unlock(); + return b; +} + +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_RESERVE / 2; + default: + return BTREE_NODE_RESERVE; + } +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct bucket_array *buckets; + struct open_bucket *ob; + long bucket; + + spin_lock(&c->freelist_lock); + + if (may_alloc_partial && + ca->open_buckets_partial_nr) { + int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + c->open_buckets[ret].on_partial_list = false; + spin_unlock(&c->freelist_lock); + return ret; + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return OPEN_BUCKETS_EMPTY; + } + + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + goto out; + + switch (reserve) { + case RESERVE_ALLOC: + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_BTREE: + if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= + ca->free[RESERVE_BTREE].size && + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + goto out; + break; + default: + break; + } + + if (cl) + 
closure_wait(&c->freelist_wait, cl); + + spin_unlock(&c->freelist_lock); + + trace_bucket_alloc_fail(ca, reserve); + return FREELIST_EMPTY; +out: + verify_not_on_freelist(c, ca, bucket); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + buckets = bucket_array(ca); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .gen = buckets->b[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + spin_unlock(&ob->lock); + + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); + + trace_bucket_alloc(ca, reserve); + return ob - c->open_buckets; +} + +static int __dev_alloc_cmp(struct write_point *wp, + unsigned l, unsigned r) +{ + return ((wp->next_alloc[l] > wp->next_alloc[r]) - + (wp->next_alloc[l] < wp->next_alloc[r])); +} + +#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) + +struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs) +{ + struct dev_alloc_list ret = { .nr = 0 }; + struct bch_dev *ca; + unsigned i; + + for_each_member_device_rcu(ca, c, i, devs) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); + return ret; +} + +void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + u64 *v = wp->next_alloc + ca->dev_idx; + u64 free_space = dev_buckets_free(c, ca); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; + u64 scale = *v / 4; + + if (*v + free_space_inv >= *v) + *v += free_space_inv; + else + *v = U64_MAX; + + for (v = wp->next_alloc; + v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) + *v = *v < scale ? 
0 : *v - scale; +} + +static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, + struct write_point *wp, + unsigned nr_replicas, + enum alloc_reserve reserve, + struct bch_devs_mask *devs, + struct closure *cl) +{ + enum bucket_alloc_ret ret = NO_DEVICES; + struct dev_alloc_list devs_sorted; + struct bch_dev *ca; + unsigned i, nr_ptrs_effective = 0; + bool have_cache_dev = false; + + BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); + + for (i = wp->first_ptr; i < wp->nr_ptrs; i++) { + ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); + + nr_ptrs_effective += ca->mi.durability; + have_cache_dev |= !ca->mi.durability; + } + + if (nr_ptrs_effective >= nr_replicas) + return ALLOC_SUCCESS; + + devs_sorted = bch2_wp_alloc_list(c, wp, devs); + + for (i = 0; i < devs_sorted.nr; i++) { + int ob; + + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability && + (have_cache_dev || + wp->type != BCH_DATA_USER)) + continue; + + ob = bch2_bucket_alloc(c, ca, reserve, + wp->type == BCH_DATA_USER, cl); + if (ob < 0) { + ret = ob; + if (ret == OPEN_BUCKETS_EMPTY) + break; + continue; + } + + BUG_ON(ob <= 0 || ob > U8_MAX); + BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + + wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; + + bch2_wp_rescale(c, ca, wp); + + nr_ptrs_effective += ca->mi.durability; + have_cache_dev |= !ca->mi.durability; + + __clear_bit(ca->dev_idx, devs->d); + + if (nr_ptrs_effective >= nr_replicas) { + ret = ALLOC_SUCCESS; + break; + } + } + + EBUG_ON(reserve == RESERVE_MOVINGGC && + ret != ALLOC_SUCCESS && + ret != OPEN_BUCKETS_EMPTY); + + switch (ret) { + case ALLOC_SUCCESS: + return 0; + case NO_DEVICES: + return -EROFS; + case FREELIST_EMPTY: + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; + default: + BUG(); + } +} + +/* Sector allocator */ + +static void writepoint_drop_ptr(struct bch_fs *c, + struct write_point *wp, + unsigned i) +{ + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ca->open_buckets_partial_nr >= + ARRAY_SIZE(ca->open_buckets_partial)); + + if (wp->type == BCH_DATA_USER) { + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); + } else { + bch2_open_bucket_put(c, ob); + } + + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + + if (i < wp->first_ptr) + wp->first_ptr--; +} + +static void writepoint_drop_ptrs(struct bch_fs *c, + struct write_point *wp, + u16 target, bool in_target) +{ + int i; + + for (i = wp->first_ptr - 1; i >= 0; --i) + if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target) == in_target) + writepoint_drop_ptr(c, wp, i); +} + +static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr_all(wp, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ptr_stale(ca, &ob->ptr)); + } +#endif +} + +static int open_bucket_add_buckets(struct bch_fs *c, + u16 target, + struct write_point *wp, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + enum alloc_reserve reserve, + struct closure *cl) +{ + struct bch_devs_mask devs = c->rw_devs[wp->type]; + const struct bch_devs_mask *t; + struct open_bucket *ob; + unsigned i; + int ret; + + percpu_down_read(&c->usage_lock); + rcu_read_lock(); + + 
/* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + writepoint_for_each_ptr_all(wp, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + t = bch2_target_to_mask(c, target); + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + + ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); + + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + + return ret; +} + +static struct write_point *__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; + + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) + return wp; + + return NULL; +} + +static struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static struct write_point *writepoint_find(struct bch_fs *c, + unsigned long write_point) +{ + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; + mutex_lock(&wp->lock); + return wp; + } + + head = writepoint_hash(c, write_point); +restart_find: + wp = __writepoint_find(head, write_point); + if (wp) { +lock_wp: + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } + + oldest = NULL; + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + mutex_lock(&oldest->lock); + mutex_lock(&c->write_points_hash_lock); + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } + + wp = oldest; + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = sched_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + struct write_point *wp; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned nr_ptrs_have, nr_ptrs_effective; + int ret, i, cache_idx = -1; + + BUG_ON(!nr_replicas || !nr_replicas_required); + + wp = writepoint_find(c, write_point.v); + + wp->first_ptr = 0; + + /* does writepoint have ptrs we can't use? */ + writepoint_for_each_ptr(wp, ob, i) + if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + } + + nr_ptrs_have = wp->first_ptr; + + /* does writepoint have ptrs we don't want to use? 
*/ + if (target) + writepoint_for_each_ptr(wp, ob, i) + if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + } + + if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { + ret = open_bucket_add_buckets(c, target, wp, devs_have, + nr_replicas, reserve, cl); + } else { + ret = open_bucket_add_buckets(c, target, wp, devs_have, + nr_replicas, reserve, NULL); + if (!ret) + goto alloc_done; + + wp->first_ptr = nr_ptrs_have; + + ret = open_bucket_add_buckets(c, 0, wp, devs_have, + nr_replicas, reserve, cl); + } + + if (ret && ret != -EROFS) + goto err; +alloc_done: + /* check for more than one cache: */ + for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) { + ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); + + if (ca->mi.durability) + continue; + + /* + * if we ended up with more than one cache device, prefer the + * one in the target we want: + */ + if (cache_idx >= 0) { + if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target)) { + writepoint_drop_ptr(c, wp, i); + } else { + writepoint_drop_ptr(c, wp, cache_idx); + cache_idx = i; + } + } else { + cache_idx = i; + } + } + + /* we might have more effective replicas than required: */ + nr_ptrs_effective = 0; + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + nr_ptrs_effective += ca->mi.durability; + } + + if (ret == -EROFS && + nr_ptrs_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + + if (nr_ptrs_effective > nr_replicas) { + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (ca->mi.durability && + ca->mi.durability <= nr_ptrs_effective - nr_replicas && + !bch2_dev_in_target(c, ob->ptr.dev, target)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + nr_ptrs_effective -= ca->mi.durability; + } + } + } + + if (nr_ptrs_effective > nr_replicas) { + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (ca->mi.durability && + ca->mi.durability <= nr_ptrs_effective - nr_replicas) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + nr_ptrs_effective -= ca->mi.durability; + } + } + } + + /* Remove pointers we don't want to use: */ + if (target) + writepoint_drop_ptrs(c, wp, target, false); + + BUG_ON(wp->first_ptr >= wp->nr_ptrs); + BUG_ON(nr_ptrs_effective < nr_replicas_required); + + wp->sectors_free = UINT_MAX; + + writepoint_for_each_ptr(wp, ob, i) + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + + verify_not_stale(c, wp); + + return wp; +err: + mutex_unlock(&wp->lock); + return ERR_PTR(ret); +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i_extent *e, unsigned sectors) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + + writepoint_for_each_ptr(wp, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_extent_ptr tmp = ob->ptr; + + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); + + tmp.cached = bkey_extent_is_cached(&e->k) || + (!ca->mi.durability && wp->type == BCH_DATA_USER); + + tmp.offset += ca->mi.bucket_size - ob->sectors_free; + extent_ptr_append(e, tmp); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + +/* + * Append pointers to the space 
we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) +{ + int i; + + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; + + if (!ob->sectors_free) { + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + bch2_open_bucket_put(c, ob); + } + } + + mutex_unlock(&wp->lock); +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 total_capacity, capacity = 0, reserved_sectors = 0; + unsigned long ra_pages = 0; + unsigned i, j; + + lockdep_assert_held(&c->state_lock); + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + + ra_pages += bdi->ra_pages; + } + + bch2_set_ra_pages(c, ra_pages); + + for_each_rw_member(ca, c, i) { + size_t reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. + */ + for (j = 0; j < RESERVE_NONE; j++) + reserve += ca->free[j].size; + + reserve += ca->free_inc.size; + + reserve += ARRAY_SIZE(c->write_points); + + reserve += 1; /* btree write point */ + + reserved_sectors += bucket_to_sector(ca, reserve); + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + } + + total_capacity = capacity; + + capacity *= (100 - c->opts.gc_reserve_percent); + capacity = div64_u64(capacity, 100); + + BUG_ON(reserved_sectors > total_capacity); + + capacity = min(capacity, total_capacity - reserved_sectors); + + c->capacity = capacity; + + if (c->capacity) { + bch2_io_timer_add(&c->io_clock[READ], + &c->bucket_clock[READ].rescale); + bch2_io_timer_add(&c->io_clock[WRITE], + &c->bucket_clock[WRITE].rescale); + } else { + bch2_io_timer_del(&c->io_clock[READ], + &c->bucket_clock[READ].rescale); + bch2_io_timer_del(&c->io_clock[WRITE], + &c->bucket_clock[WRITE].rescale); + } + + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); +} + +static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + struct bch_devs_mask not_self; + + bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); + + mutex_lock(&wp->lock); + wp->first_ptr = wp->nr_ptrs; + writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true); + mutex_unlock(&wp->lock); +} + +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) +{ + struct open_bucket *ob; + bool ret = false; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->ptr.dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } + + return ret; +} + +/* device goes ro: */ +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + BUG_ON(ca->alloc_thread); + + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + clear_bit(ca->dev_idx, c->rw_devs[i].d); + + /* + * Capacity is calculated based off of devices in allocation groups: + */ 
+ bch2_recalc_capacity(c); + + /* Next, close write points that point to this device... */ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_stop_write_point(c, ca, &c->write_points[i]); + + bch2_stop_write_point(c, ca, &ca->copygc_write_point); + bch2_stop_write_point(c, ca, &c->rebalance_write_point); + bch2_stop_write_point(c, ca, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ + closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ + wake_up(&c->journal.wait); + + /* Now wait for any in flight writes: */ + + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); +} + +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (ca->mi.data_allowed & (1 << i)) + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + +/* stop allocator thread: */ +void bch2_dev_allocator_stop(struct bch_dev *ca) +{ + struct task_struct *p; + + p = rcu_dereference_protected(ca->alloc_thread, 1); + ca->alloc_thread = NULL; + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid bch2_wake_allocator() racing: + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + synchronize_rcu(); + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +/* start allocator thread: */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct task_struct *p; + + /* + * allocator thread already started? 
+ */ + if (ca->alloc_thread) + return 0; + + p = kthread_create(bch2_allocator_thread, ca, + "bch_alloc[%s]", ca->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + rcu_assign_pointer(ca->alloc_thread, p); + wake_up_process(p); + return 0; +} + +static void allocator_start_issue_discards(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + size_t i, bu; + + for_each_rw_member(ca, c, dev_iter) { + unsigned done = 0; + + fifo_for_each_entry(bu, &ca->free_inc, i) { + if (done == ca->nr_invalidated) + break; + + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bu), + ca->mi.bucket_size, GFP_NOIO); + done++; + } + } +} + +static int __bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + size_t bu, i; + unsigned dev_iter; + u64 journal_seq = 0; + bool invalidating_data = false; + int ret = 0; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return -1; + + /* Scan for buckets that are already invalidated: */ + for_each_rw_member(ca, c, dev_iter) { + struct btree_iter iter; + struct bucket_mark m; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { + if (k.k->type != BCH_ALLOC) + continue; + + bu = k.k->p.offset; + m = READ_ONCE(bucket(ca, bu)->mark); + + if (!is_available_bucket(m) || m.cached_sectors) + continue; + + percpu_down_read(&c->usage_lock); + bch2_mark_alloc_bucket(c, ca, bu, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + percpu_up_read(&c->usage_lock); + + fifo_push(&ca->free_inc, bu); + ca->nr_invalidated++; + + if (fifo_full(&ca->free_inc)) + break; + } + bch2_btree_iter_unlock(&iter); + } + + /* did we find enough buckets? */ + for_each_rw_member(ca, c, dev_iter) + if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { + percpu_ref_put(&ca->io_ref); + goto not_enough; + } + + return 0; +not_enough: + pr_debug("did not find enough empty buckets; issuing discards"); + + /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + for_each_rw_member(ca, c, dev_iter) + discard_invalidated_buckets(c, ca); + + pr_debug("scanning for reclaimable buckets"); + + for_each_rw_member(ca, c, dev_iter) { + BUG_ON(!fifo_empty(&ca->free_inc)); + ca->free_inc.front = ca->free_inc.back = 0; + + find_reclaimable_buckets(c, ca); + sort_free_inc(c, ca); + + invalidating_data |= ca->allocator_invalidating_data; + + fifo_for_each_entry(bu, &ca->free_inc, i) + if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) + break; + } + + pr_debug("done scanning for reclaimable buckets"); + + /* + * We're moving buckets to freelists _before_ they've been marked as + * invalidated on disk - we have to so that we can allocate new btree + * nodes to mark them as invalidated on disk. + * + * However, we can't _write_ to any of these buckets yet - they might + * have cached data in them, which is live until they're marked as + * invalidated on disk: + */ + if (invalidating_data) { + pr_debug("invalidating existing data"); + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + } else { + pr_debug("issuing discards"); + allocator_start_issue_discards(c); + } + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? 
+ */ + + for_each_rw_member(ca, c, dev_iter) { + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, + ca->free[RESERVE_BTREE].size, + false); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + if (invalidating_data) { + pr_debug("flushing journal"); + + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) + return ret; + + pr_debug("issuing discards"); + allocator_start_issue_discards(c); + } + + for_each_rw_member(ca, c, dev_iter) + while (ca->nr_invalidated) { + BUG_ON(!fifo_pop(&ca->free_inc, bu)); + ca->nr_invalidated--; + } + + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + + /* now flush dirty btree nodes: */ + if (invalidating_data) { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool flush_updates; + size_t nr_pending_updates; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } + } + rcu_read_unlock(); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... + */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } + } + + return 0; +} + +int bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + down_read(&c->gc_lock); + ret = __bch2_fs_allocator_start(c); + up_read(&c->gc_lock); + + if (ret) + return ret; + + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return bch2_alloc_write(c); +} + +void bch2_fs_allocator_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + spin_lock_init(&c->freelist_lock); + bch2_bucket_clock_init(c, READ); + bch2_bucket_clock_init(c, WRITE); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { + writepoint_init(wp, BCH_DATA_USER); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + } + + c->pd_controllers_update_seconds = 5; + INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); +} diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h new file mode 100644 index 000000000000..00d01f464c68 --- /dev/null +++ b/fs/bcachefs/alloc.h @@ -0,0 +1,141 @@ +#ifndef _BCACHEFS_ALLOC_H +#define _BCACHEFS_ALLOC_H + +#include "bcachefs.h" +#include "alloc_types.h" + +struct bkey; +struct bch_dev; +struct bch_fs; +struct bch_devs_List; + +const char 
*bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_alloc_ops (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+struct dev_alloc_list {
+ unsigned nr;
+ u8 devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
+ struct write_point *,
+ struct bch_devs_mask *);
+void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
+ struct write_point *);
+
+int bch2_alloc_read(struct bch_fs *, struct list_head *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
+
+enum bucket_alloc_ret {
+ ALLOC_SUCCESS = 0,
+ OPEN_BUCKETS_EMPTY = -1,
+ FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
+ NO_DEVICES = -3, /* -EROFS */
+};
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
+ struct closure *);
+
+#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \
+ for ((_i) = (_start); \
+ (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
+ (_i)++)
+
+#define writepoint_for_each_ptr_all(_wp, _ob, _i) \
+ __writepoint_for_each_ptr(_wp, _ob, _i, 0)
+
+#define writepoint_for_each_ptr(_wp, _ob, _i) \
+ __writepoint_for_each_ptr(_wp, _ob, _i, (_wp)->first_ptr)
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ if (atomic_dec_and_test(&ob->pin))
+ __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
+{
+ unsigned i;
+
+ for (i = 0; i < *nr; i++)
+ bch2_open_bucket_put(c, c->open_buckets + refs[i]);
+
+ *nr = 0;
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+ struct write_point *wp,
+ u8 *nr, u8 *refs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ writepoint_for_each_ptr(wp, ob, i) {
+ atomic_inc(&ob->pin);
+ refs[(*nr)++] = ob - c->open_buckets;
+ }
+}
+
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+ unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *);
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+ struct bkey_i_extent *, unsigned);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+static inline void bch2_wake_allocator(struct bch_dev *ca)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = rcu_dereference(ca->alloc_thread);
+ if (p)
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+ return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
+void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_dev_allocator_stop(struct bch_dev *);
+int bch2_dev_allocator_start(struct bch_dev *);
+
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->type = type;
+}
+
+int bch2_alloc_write(struct bch_fs *);
+int bch2_fs_allocator_start(struct bch_fs *);
+void bch2_fs_allocator_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_H */
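The sector allocator API declared above is driven as a start/append/done sequence: bch2_alloc_sectors_start() returns a locked write point (or an ERR_PTR), bch2_alloc_sectors_append_ptrs() adds pointers into the newly allocated space to an extent, and bch2_alloc_sectors_done() releases the write point. A minimal sketch of a caller follows; it is illustrative only and not part of this patch, and the helper name, replica counts, and use of the current task as the write point key are assumptions:

static int example_extent_alloc(struct bch_fs *c, struct bkey_i_extent *e,
				unsigned sectors, struct closure *cl)
{
	struct bch_devs_list devs_have = { .nr = 0 };	/* no devices to avoid */
	struct write_point *wp;

	/* Returns with wp->lock held on success, or an ERR_PTR on failure: */
	wp = bch2_alloc_sectors_start(c,
				      0,		/* target: any device */
				      writepoint_hashed((unsigned long) current),
				      &devs_have,
				      2,		/* nr_replicas */
				      2,		/* nr_replicas_required */
				      RESERVE_NONE,
				      0,		/* flags */
				      cl);
	if (IS_ERR(wp))
		return PTR_ERR(wp);

	/* Never append more than the write point has available: */
	sectors = min(sectors, wp->sectors_free);

	/* Append pointers to the newly allocated space to @e: */
	bch2_alloc_sectors_append_ptrs(c, wp, e, sectors);

	/* Drop open buckets that are now full, and unlock the write point: */
	bch2_alloc_sectors_done(c, wp);

	return 0;
}

Hashing the write point specifier on the submitting task (rather than passing a writepoint_ptr() to a dedicated write point such as the btree or copygc ones) simply groups writes from the same context onto the same open buckets.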
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 index 000000000000..035c50052167 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H + +#include +#include + +#include "clock_types.h" +#include "fifo.h" + +/* There's two of these clocks, one for reads and one for writes: */ +struct bucket_clock { + /* + * "now" in (read/write) IO time - incremented whenever we do X amount + * of reads or writes. + * + * Goes with the bucket read/write prios: when we read or write to a + * bucket we reset the bucket's prio to the current hand; thus hand - + * prio = time since bucket was last read/written. + * + * The units are some amount (bytes/sectors) of data read/written, and + * the units can change on the fly if we need to rescale to fit + * everything in a u16 - your only guarantee is that the units are + * consistent. + */ + u16 hand; + u16 max_last_io; + + int rw; + + struct io_timer rescale; + struct mutex lock; +}; + +/* There is one reserve for each type of btree, one for prios and gens + * and one for moving GC */ +enum alloc_reserve { + RESERVE_ALLOC = -1, + RESERVE_BTREE = 0, + RESERVE_MOVINGGC = 1, + RESERVE_NONE = 2, + RESERVE_NR = 3, +}; + +typedef FIFO(long) alloc_fifo; + +/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ +#define OPEN_BUCKETS_COUNT 256 +#define WRITE_POINT_COUNT 32 + +struct open_bucket { + spinlock_t lock; + atomic_t pin; + u8 freelist; + bool valid; + bool on_partial_list; + unsigned sectors_free; + struct bch_extent_ptr ptr; +}; + +struct write_point { + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; + enum bch_data_type type; + + u8 nr_ptrs; + u8 first_ptr; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + + struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + +struct write_point_specifier { + unsigned long v; +}; + +struct alloc_heap_entry { + size_t bucket; + size_t nr; + unsigned long key; +}; + +typedef HEAP(struct alloc_heap_entry) alloc_heap; + +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 index 000000000000..b5e119d09a83 --- /dev/null +++ b/fs/bcachefs/bcachefs.h @@ -0,0 +1,785 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. 
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown.
This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#undef pr_fmt +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bcachefs_format.h" +#include "fifo.h" +#include "opts.h" +#include "util.h" + +#define dynamic_fault(...) 0 +#define race_fault(...) 0 + +#define bch2_fs_init_fault(name) \ + dynamic_fault("bcachefs:bch_fs_init:" name) +#define bch2_meta_read_fault(name) \ + dynamic_fault("bcachefs:meta:read:" name) +#define bch2_meta_write_fault(name) \ + dynamic_fault("bcachefs:meta:write:" name) + +#ifdef __KERNEL__ +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#else +#define bch2_fmt(_c, fmt) fmt "\n" +#endif + +#define bch_info(c, fmt, ...) \ + printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_notice(c, fmt, ...) \ + printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn(c, fmt, ...) \ + printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err(c, fmt, ...) \ + printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + +#define bch_verbose(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose_recovery) \ + bch_info(c, fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_verbose_init(opts, fmt, ...) 
\ +do { \ + if (opt_get(opts, verbose_init)) \ + pr_info(fmt, ##__VA_ARGS__); \ +} while (0) + +/* Parameters that are useful for debugging, but should always be compiled in: */ +#define BCH_DEBUG_PARAMS_ALWAYS() \ + BCH_DEBUG_PARAM(key_merging_disabled, \ + "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ + "Causes mark and sweep to compact and rewrite every " \ + "btree node it traverses") \ + BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ + "Disables rewriting of btree nodes during mark and sweep")\ + BCH_DEBUG_PARAM(btree_shrinker_disabled, \ + "Disables the shrinker callback for the btree node cache") + +/* Parameters that should only be compiled in in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_bkeys, \ + "Run bkey_debugcheck (primarily checking GC/allocation "\ + "information) when iterating over keys") \ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(inject_invalid_keys, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + +#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() +#else +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() +#endif + +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_gc) \ + x(btree_split) \ + x(btree_sort) \ + x(btree_read) \ + x(btree_lock_contended_read) \ + x(btree_lock_contended_intent) \ + x(btree_lock_contended_write) \ + x(data_write) \ + x(data_read) \ + x(data_promote) \ + x(journal_write) \ + x(journal_delay) \ + x(journal_blocked) \ + x(journal_flush_seq) + +enum bch_time_stats { +#define x(name) BCH_TIME_##name, + BCH_TIME_STATS() +#undef x + BCH_TIME_STAT_NR +}; + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "clock_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "super_types.h" + +/* Number of nodes btree coalesce will try to coalesce at once */ +#define GC_MERGE_NODES 4U + +/* Maximum number of nodes we might need to allocate atomically: */ +#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) + +/* Size of the freelist we allocate btree nodes from: */ +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) + +struct btree; + +enum gc_phase { + GC_PHASE_START, + GC_PHASE_SB, + +#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd, + DEFINE_BCH_BTREE_IDS() +#undef DEF_BTREE_ID + + GC_PHASE_PENDING_DELETE, + GC_PHASE_ALLOC, + GC_PHASE_DONE +}; + +struct gc_pos { + enum gc_phase phase; + struct bpos pos; + unsigned level; +}; + +struct io_count { + u64 sectors[2][BCH_DATA_NR]; +}; + +struct bch_dev { + struct kobject kobj; + struct percpu_ref ref; + struct completion ref_completion; + struct percpu_ref io_ref; + struct completion io_ref_completion; + + struct bch_fs *fs; + + u8 dev_idx; + /* + * Cached version of this 
device's member info from superblock + * Committed by bch2_write_super() -> bch_fs_mi_update() + */ + struct bch_member_cpu mi; + __uuid_t uuid; + char name[BDEVNAME_SIZE]; + + struct bch_sb_handle disk_sb; + int sb_write_error; + + struct bch_devs_mask self; + + /* biosets used in cloned bios for writing multiple replicas */ + struct bio_set replica_set; + + /* + * Buckets: + * Per-bucket arrays are protected by c->usage_lock, bucket_lock and + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets; + unsigned long *buckets_dirty; + /* most out of date gen in the btree */ + u8 *oldest_gens; + struct rw_semaphore bucket_lock; + + struct bch_dev_usage __percpu *usage_percpu; + struct bch_dev_usage usage_cached; + + /* Allocator: */ + struct task_struct __rcu *alloc_thread; + + /* + * free: Buckets that are ready to be used + * + * free_inc: Incoming buckets - these are buckets that currently have + * cached data in them, and we can't reuse them until after we write + * their new gen to disk. After prio_write() finishes writing the new + * gens/prios, they'll be moved to the free list (and possibly discarded + * in the process) + */ + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; + spinlock_t freelist_lock; + size_t nr_invalidated; + + u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; + unsigned open_buckets_partial_nr; + + size_t fifo_last_bucket; + + /* last calculated minimum prio */ + u16 max_last_bucket_io[2]; + + atomic_long_t saturated_count; + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + u64 allocator_journal_seq_flush; + bool allocator_invalidating_data; + bool allocator_blocked; + + alloc_heap alloc_heap; + + /* Copying GC: */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; + + atomic64_t rebalance_work; + + struct journal_device journal; + + struct work_struct io_error_work; + + /* The rest of this all shows up in sysfs */ + atomic64_t cur_latency[2]; + struct bch2_time_stats io_latency[2]; + +#define CONGESTED_MAX 1024 + atomic_t congested; + u64 congested_last; + + struct io_count __percpu *io_done; +}; + +/* + * Flag bits for what phase of startup/shutdown the cache set is at, how we're + * shutting down, etc.: + * + * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). 
+ */ +enum { + /* startup: */ + BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOCATOR_STARTED, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, + + /* shutdown: */ + BCH_FS_EMERGENCY_RO, + BCH_FS_WRITE_DISABLE_COMPLETE, + + /* errors: */ + BCH_FS_ERROR, + BCH_FS_GC_FAILURE, + + /* misc: */ + BCH_FS_BDEV_MOUNTED, + BCH_FS_FSCK_FIXED_ERRORS, + BCH_FS_FIXED_GENS, + BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, +}; + +struct btree_debug { + unsigned id; + struct dentry *btree; + struct dentry *btree_format; + struct dentry *failed; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + +struct bch_fs { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct kobject opts_dir; + struct kobject time_stats; + unsigned long flags; + + int minor; + struct device *chardev; + struct super_block *vfs_sb; + char name[40]; + + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + + /* Counts outstanding writes, for clean transition to read-only */ + struct percpu_ref writes; + struct work_struct read_only_work; + + struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + + struct bch_replicas_cpu __rcu *replicas; + struct bch_replicas_cpu __rcu *replicas_gc; + struct mutex replicas_gc_lock; + + struct bch_disk_groups_cpu __rcu *disk_groups; + + struct bch_opts opts; + + /* Updated by bch2_sb_update():*/ + struct { + __uuid_t uuid; + __uuid_t user_uuid; + + u16 encoded_extent_max; + + u8 nr_devices; + u8 clean; + + u8 encryption_type; + + u64 time_base_lo; + u32 time_base_hi; + u32 time_precision; + u64 features; + } sb; + + struct bch_sb_handle disk_sb; + + unsigned short block_bits; /* ilog2(block_size) */ + + u16 btree_foreground_merge_threshold; + + struct closure sb_write; + struct mutex sb_lock; + + /* BTREE CACHE */ + struct bio_set btree_bio; + + struct btree_root btree_roots[BTREE_ID_NR]; + bool btree_roots_dirty; + struct mutex btree_root_lock; + + struct btree_cache btree_cache; + + mempool_t btree_reserve_pool; + + /* + * Cache of allocated btree nodes - if we allocate a btree node and + * don't use it, if we free it that space can't be reused until going + * _all_ the way through the allocator (which exposes us to a livelock + * when allocating btree reserves fail halfway through) - instead, we + * can stick them here: + */ + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + unsigned btree_reserve_cache_nr; + struct mutex btree_reserve_cache_lock; + + mempool_t btree_interior_update_pool; + struct list_head btree_interior_update_list; + struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; + + struct workqueue_struct *wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; + + /* ALLOCATION */ + struct delayed_work pd_controllers_update; + unsigned pd_controllers_update_seconds; + + struct bch_devs_mask rw_devs[BCH_DATA_NR]; + + u64 capacity; /* sectors */ + + /* + * When capacity _decreases_ (due to a disk being removed), we + * increment capacity_gen - this invalidates outstanding reservations + * and forces them to be revalidated + */ + u32 capacity_gen; + + atomic64_t sectors_available; + + struct bch_fs_usage __percpu *usage_percpu; + struct bch_fs_usage usage_cached; + struct percpu_rw_semaphore usage_lock; + + struct closure_waitlist freelist_wait; + + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. + */ + struct bucket_clock bucket_clock[2]; + + struct io_clock io_clock[2]; + + /* ALLOCATOR */ + spinlock_t freelist_lock; + u8 open_buckets_freelist; + u8 open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; + + struct write_point write_points[WRITE_POINT_COUNT]; + struct hlist_head write_points_hash[WRITE_POINT_COUNT]; + struct mutex write_points_hash_lock; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; + unsigned long gc_count; + + /* + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. + * + * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) + * + * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not + * currently running, and gc marks are currently valid + * + * Protected by gc_pos_lock. Only written to by GC thread, so GC thread + * can read without a lock. + */ + seqcount_t gc_pos_lock; + struct gc_pos gc_pos; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. 
+ */ + struct rw_semaphore gc_lock; + + /* IO PATH */ + struct bio_set bio_read; + struct bio_set bio_read_split; + struct bio_set bio_write; + struct mutex bio_bounce_pages_lock; + mempool_t bio_bounce_pages; + struct rhashtable promote_table; + + mempool_t compression_bounce[2]; + mempool_t compress_workspace[BCH_COMPRESSION_NR]; + mempool_t decompress_workspace; + ZSTD_parameters zstd_params; + + struct crypto_shash *sha256; + struct crypto_sync_skcipher *chacha20; + struct crypto_shash *poly1305; + + atomic64_t key_version; + + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + + struct bio_list btree_write_error_list; + struct work_struct btree_write_error_work; + spinlock_t btree_write_error_lock; + + /* ERRORS */ + struct list_head fsck_errors; + struct mutex fsck_error_lock; + bool fsck_alloc_err; + + /* FILESYSTEM */ + atomic_long_t nr_inodes; + + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + + /* DEBUG JUNK */ + struct dentry *debug; + struct btree_debug btree_debug[BTREE_ID_NR]; +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree *verify_data; + struct btree_node *verify_ondisk; + struct mutex verify_lock; +#endif + + u64 unused_inode_hint; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them + */ + mempool_t fill_iter; + + mempool_t btree_bounce_pool; + + struct journal journal; + + unsigned bucket_journal_seq; + + /* The rest of this all shows up in sysfs */ + atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; + bool promote_whole_extents; + +#define BCH_DEBUG_PARAM(name, description) bool name; + BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + + struct bch2_time_stats times[BCH_TIME_STAT_NR]; +}; + +static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) +{ +#ifndef NO_BCACHEFS_FS + if (c->vfs_sb) + c->vfs_sb->s_bdi->ra_pages = ra_pages; +#endif +} + +static inline bool bch2_fs_running(struct bch_fs *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + +static inline unsigned bucket_bytes(const struct bch_dev *ca) +{ + return ca->mi.bucket_size << 9; +} + +static inline unsigned block_bytes(const struct bch_fs *c) +{ + return c->opts.block_size << 9; +} + +static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) +{ + return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); +} + +static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +{ + s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; + + if (c->sb.time_precision == 1) + return ns; + + return div_s64(ns, c->sb.time_precision); +} + +static inline s64 bch2_current_time(struct bch_fs *c) +{ + struct timespec64 now; + + ktime_get_real_ts64(&now); + return timespec_to_bch2_time(c, now); +} + +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 index 000000000000..eb14dba87402 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h @@ -0,0 +1,1498 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H + +/* + * bcachefs on disk data structures + * + * OVERVIEW: + * + * There are three main types of on disk data structures in bcachefs (this is + * reduced from 5 in bcache) + * + 
* - superblock + * - journal + * - btree + * + * The btree is the primary structure; most metadata exists as keys in the + * various btrees. There are only a small number of btrees, they're not + * sharded - we have one btree for extents, another for inodes, et cetera. + * + * SUPERBLOCK: + * + * The superblock contains the location of the journal, the list of devices in + * the filesystem, and in general any metadata we need in order to decide + * whether we can start a filesystem or prior to reading the journal/btree + * roots. + * + * The superblock is extensible, and most of the contents of the superblock are + * in variable length, type tagged fields; see struct bch_sb_field. + * + * Backup superblocks do not reside in a fixed location; also, superblocks do + * not have a fixed size. To locate backup superblocks we have struct + * bch_sb_layout; we store a copy of this inside every superblock, and also + * before the first superblock. + * + * JOURNAL: + * + * The journal primarily records btree updates in the order they occurred; + * journal replay consists of just iterating over all the keys in the open + * journal entries and re-inserting them into the btrees. + * + * The journal also contains entry types for the btree roots, and blacklisted + * journal sequence numbers (see journal_seq_blacklist.c). + * + * BTREE: + * + * bcachefs btrees are copy on write b+ trees, where nodes are big (typically + * 128k-256k) and log structured. We use struct btree_node for writing the first + * entry in a given node (offset 0), and struct btree_node_entry for all + * subsequent writes. + * + * After the header, btree node entries contain a list of keys in sorted order. + * Values are stored inline with the keys; since values are variable length (and + * keys effectively are variable length too, due to packing) we can't do random + * access without building up additional in memory tables in the btree node read + * path. + * + * BTREE KEYS (struct bkey): + * + * The various btrees share a common format for the key - so as to avoid + * switching in fastpath lookup/comparison code - but define their own + * structures for the key values. + * + * The size of a key/value pair is stored as a u8 in units of u64s, so the max + * size is just under 2k. The common part also contains a type tag for the + * value, and a format field indicating whether the key is packed or not (and + * also meant to allow adding new key fields in the future, if desired). + * + * bkeys, when stored within a btree node, may also be packed. In that case, the + * bkey_format in that node is used to unpack it. Packed bkeys mean that we can + * be generous with field sizes in the common part of the key format (64 bit + * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
+ */ + +#include +#include +#include + +#ifdef __KERNEL__ +typedef uuid_t __uuid_t; +#endif + +#define LE_BITMASK(_bits, name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ +static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (__le##_bits##_to_cpu(k->field) >> offset) & \ + ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + __u##_bits new = __le##_bits##_to_cpu(k->field); \ + \ + new &= ~(~(~0ULL << (end - offset)) << offset); \ + new |= (v & ~(~0ULL << (end - offset))) << offset; \ + k->field = __cpu_to_le##_bits(new); \ +} + +#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) +#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) +#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) + +struct bkey_format { + __u8 key_u64s; + __u8 nr_fields; + /* One unused slot for now: */ + __u8 bits_per_field[6]; + __le64 field_offset[6]; +}; + +/* Btree keys - all units are in sectors */ + +struct bpos { + /* + * Word order matches machine byte order - btree code treats a bpos as a + * single large integer, for search/comparison purposes + * + * Note that wherever a bpos is embedded in another on disk data + * structure, it has to be byte swabbed when reading in metadata that + * wasn't written in native endian order: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u32 snapshot; + __u64 offset; + __u64 inode; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u64 inode; + __u64 offset; /* Points to end of extent - sectors */ + __u32 snapshot; +#else +#error edit for your odd byteorder. +#endif +} __attribute__((packed, aligned(4))); + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) +#define KEY_SNAPSHOT_MAX ((__u32)~0U) +#define KEY_SIZE_MAX ((__u32)~0U) + +static inline struct bpos POS(__u64 inode, __u64 offset) +{ + struct bpos ret; + + ret.inode = inode; + ret.offset = offset; + ret.snapshot = 0; + + return ret; +} + +#define POS_MIN POS(0, 0) +#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) + +/* Empty placeholder struct, for container_of() */ +struct bch_val { + __u64 __nothing[0]; +}; + +struct bversion { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u64 lo; + __u32 hi; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u32 hi; + __u64 lo; +#endif +} __attribute__((packed, aligned(4))); + +struct bkey { + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#else +#error edit for your odd byteorder. 
+#endif + + /* Type of the value */ + __u8 type; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u8 pad[1]; + + struct bversion version; + __u32 size; /* extent size, in sectors */ + struct bpos p; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + struct bpos p; + __u32 size; /* extent size, in sectors */ + struct bversion version; + + __u8 pad[1]; +#endif +} __attribute__((packed, aligned(8))); + +struct bkey_packed { + __u64 _data[0]; + + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ + + /* + * XXX: next incompat on disk format change, switch format and + * needs_whiteout - bkey_packed() will be cheaper if format is the high + * bits of the bitfield + */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#endif + + /* Type of the value */ + __u8 type; + __u8 key_start[0]; + + /* + * We copy bkeys with struct assignment in various places, and while + * that shouldn't be done with packed bkeys we can't disallow it in C, + * and it's legal to cast a bkey to a bkey_packed - so padding it out + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; +} __attribute__((packed, aligned(8))); + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define KEY_PACKED_BITS_START 24 + +#define KEY_FORMAT_LOCAL_BTREE 0 +#define KEY_FORMAT_CURRENT 1 + +enum bch_bkey_fields { + BKEY_FIELD_INODE, + BKEY_FIELD_OFFSET, + BKEY_FIELD_SNAPSHOT, + BKEY_FIELD_SIZE, + BKEY_FIELD_VERSION_HI, + BKEY_FIELD_VERSION_LO, + BKEY_NR_FIELDS, +}; + +#define bkey_format_field(name, field) \ + [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) + +#define BKEY_FORMAT_CURRENT \ +((struct bkey_format) { \ + .key_u64s = BKEY_U64s, \ + .nr_fields = BKEY_NR_FIELDS, \ + .bits_per_field = { \ + bkey_format_field(INODE, p.inode), \ + bkey_format_field(OFFSET, p.offset), \ + bkey_format_field(SNAPSHOT, p.snapshot), \ + bkey_format_field(SIZE, size), \ + bkey_format_field(VERSION_HI, version.hi), \ + bkey_format_field(VERSION_LO, version.lo), \ + }, \ +}) + +/* bkey with inline value */ +struct bkey_i { + __u64 _data[0]; + + union { + struct { + /* Size of combined key and value, in u64s */ + __u8 u64s; + }; + struct { + struct bkey k; + struct bch_val v; + }; + }; +}; + +#define KEY(_inode, _offset, _size) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = POS(_inode, _offset), \ + .size = _size, \ +}) + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) + +#define __BKEY_PADDED(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#define BKEY_VAL_TYPE(name, nr) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +} + +/* + * - DELETED keys are used internally to mark keys that should be ignored but + * override keys in composition order. Their version number is ignored. + * + * - DISCARDED keys indicate that the data is all 0s because it has been + * discarded. DISCARDs may have a version; if the version is nonzero the key + * will be persistent, otherwise the key will be dropped whenever the btree + * node is rewritten (like DELETED keys). + * + * - ERROR: any read of the data returns a read error, as the data was lost due + * to a failing device. 
Like DISCARDED keys, they can be removed (overridden) + * by new writes or cluster-wide GC. Node repair can also overwrite them with + * the same or a more recent version number, but not with an older version + * number. +*/ +#define KEY_TYPE_DELETED 0 +#define KEY_TYPE_DISCARD 1 +#define KEY_TYPE_ERROR 2 +#define KEY_TYPE_COOKIE 3 +#define KEY_TYPE_PERSISTENT_DISCARD 4 +#define KEY_TYPE_GENERIC_NR 128 + +struct bch_cookie { + struct bch_val v; + __le64 cookie; +}; +BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); + +/* Extents */ + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. 
We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +/* 128 bits, sufficient for cryptographic MACs: */ +struct bch_csum { + __le64 lo; + __le64 hi; +} __attribute__((packed, aligned(8))); + +enum bch_csum_type { + BCH_CSUM_NONE = 0, + BCH_CSUM_CRC32C_NONZERO = 1, + BCH_CSUM_CRC64_NONZERO = 2, + BCH_CSUM_CHACHA20_POLY1305_80 = 3, + BCH_CSUM_CHACHA20_POLY1305_128 = 4, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, + BCH_CSUM_NR = 7, +}; + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C_NONZERO] = 4, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64_NONZERO] = 8, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + +static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +{ + switch (type) { + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: + return true; + default: + return false; + } +} + +enum bch_compression_type { + BCH_COMPRESSION_NONE = 0, + BCH_COMPRESSION_LZ4_OLD = 1, + BCH_COMPRESSION_GZIP = 2, + BCH_COMPRESSION_LZ4 = 3, + BCH_COMPRESSION_ZSTD = 4, + BCH_COMPRESSION_NR = 5, +}; + +enum bch_extent_entry_type { + BCH_EXTENT_ENTRY_ptr = 0, + BCH_EXTENT_ENTRY_crc32 = 1, + BCH_EXTENT_ENTRY_crc64 = 2, + BCH_EXTENT_ENTRY_crc128 = 3, +}; + +#define BCH_EXTENT_ENTRY_MAX 4 + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __attribute__((packed, aligned(8))); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __attribute__((packed, aligned(8))); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __attribute__((packed, aligned(8))); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer 
hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + erasure_coded:1, + reservation:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + reservation:1, + erasure_coded:1, + cached:1, + type:1; +#endif +} __attribute__((packed, aligned(8))); + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + unused:23, + replicas:4, + generation:32; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 generation:32, + replicas:4, + unused:23, + type:5; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; + struct bch_extent_ptr ptr; +}; + +enum { + BCH_EXTENT = 128, + + /* + * This is kind of a hack, we're overloading the type for a boolean that + * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED + * have the same value type: + */ + BCH_EXTENT_CACHED = 129, + + /* + * Persistent reservation: + */ + BCH_RESERVATION = 130, +}; + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(extent, BCH_EXTENT); + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(reservation, BCH_RESERVATION); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +/* Inodes */ + +#define BLOCKDEV_INODE_MAX 4096 + +#define BCACHEFS_ROOT_INO 4096 + +enum bch_inode_types { + BCH_INODE_FS = 128, + BCH_INODE_BLOCKDEV = 129, + BCH_INODE_GENERATION = 130, +}; + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(inode, BCH_INODE_FS); + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); + +#define BCH_INODE_FIELDS() \ + BCH_INODE_FIELD(bi_atime, 64) \ + BCH_INODE_FIELD(bi_ctime, 64) \ + BCH_INODE_FIELD(bi_mtime, 64) \ + BCH_INODE_FIELD(bi_otime, 64) \ + BCH_INODE_FIELD(bi_size, 64) \ + BCH_INODE_FIELD(bi_sectors, 64) \ + BCH_INODE_FIELD(bi_uid, 32) \ + BCH_INODE_FIELD(bi_gid, 32) \ + BCH_INODE_FIELD(bi_nlink, 32) \ + BCH_INODE_FIELD(bi_generation, 32) \ + BCH_INODE_FIELD(bi_dev, 32) \ + BCH_INODE_FIELD(bi_data_checksum, 8) \ + 
BCH_INODE_FIELD(bi_compression, 8) \ + BCH_INODE_FIELD(bi_project, 32) \ + BCH_INODE_FIELD(bi_background_compression, 8) \ + BCH_INODE_FIELD(bi_data_replicas, 8) \ + BCH_INODE_FIELD(bi_promote_target, 16) \ + BCH_INODE_FIELD(bi_foreground_target, 16) \ + BCH_INODE_FIELD(bi_background_target, 16) + +#define BCH_INODE_FIELDS_INHERIT() \ + BCH_INODE_FIELD(bi_data_checksum) \ + BCH_INODE_FIELD(bi_compression) \ + BCH_INODE_FIELD(bi_project) \ + BCH_INODE_FIELD(bi_background_compression) \ + BCH_INODE_FIELD(bi_data_replicas) \ + BCH_INODE_FIELD(bi_promote_target) \ + BCH_INODE_FIELD(bi_foreground_target) \ + BCH_INODE_FIELD(bi_background_target) + +enum { + /* + * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL + * flags) + */ + __BCH_INODE_SYNC = 0, + __BCH_INODE_IMMUTABLE = 1, + __BCH_INODE_APPEND = 2, + __BCH_INODE_NODUMP = 3, + __BCH_INODE_NOATIME = 4, + + __BCH_INODE_I_SIZE_DIRTY= 5, + __BCH_INODE_I_SECTORS_DIRTY= 6, + __BCH_INODE_UNLINKED = 7, + + /* bits 20+ reserved for packed fields below: */ +}; + +#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) +#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) +#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) +#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) +#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) +#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) +#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) +#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); + +struct bch_inode_blockdev { + struct bch_val v; + + __le64 i_size; + __le64 i_flags; + + /* Seconds: */ + __le64 i_ctime; + __le64 i_mtime; + + __uuid_t i_uuid; + __u8 i_label[32]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); + +/* Thin provisioned volume, or cache for another block device? */ +LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) + +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. 
+ * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +enum { + BCH_DIRENT = 128, + BCH_DIRENT_WHITEOUT = 129, +}; + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + __le64 d_inum; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(dirent, BCH_DIRENT); + +#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ + sizeof(struct bkey) - \ + offsetof(struct bch_dirent, d_name)) + + +/* Xattrs */ + +enum { + BCH_XATTR = 128, + BCH_XATTR_WHITEOUT = 129, +}; + +#define BCH_XATTR_INDEX_USER 0 +#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define BCH_XATTR_INDEX_TRUSTED 3 +#define BCH_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(xattr, BCH_XATTR); + +/* Bucket/allocation information: */ + +enum { + BCH_ALLOC = 128, +}; + +enum { + BCH_ALLOC_FIELD_READ_TIME = 0, + BCH_ALLOC_FIELD_WRITE_TIME = 1, +}; + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(alloc, BCH_ALLOC); + +/* Quotas: */ + +enum { + BCH_QUOTA = 128, +}; + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(quota, BCH_QUOTA); + +/* Optional/variable size superblock sections: */ + +struct bch_sb_field { + __u64 _data[0]; + __le32 u64s; + __le32 type; +}; + +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, + BCH_SB_FIELDS() +#undef x + BCH_SB_FIELD_NR +}; + +/* BCH_SB_FIELD_journal: */ + +struct bch_sb_field_journal { + struct bch_sb_field field; + __le64 buckets[0]; +}; + +/* BCH_SB_FIELD_members: */ + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __le32 pad; + __le64 last_mount; /* time_t */ + + __le64 flags[2]; +}; + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +/* 4-10 unused, was TIER, HAS_(META)DATA */ +LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) + +#define BCH_TIER_MAX 4U + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +enum bch_member_state { + BCH_MEMBER_STATE_RW = 0, + BCH_MEMBER_STATE_RO = 1, + BCH_MEMBER_STATE_FAILED = 2, + BCH_MEMBER_STATE_SPARE = 3, + BCH_MEMBER_STATE_NR = 4, +}; + +enum cache_replacement { + CACHE_REPLACEMENT_LRU = 0, + 
CACHE_REPLACEMENT_FIFO = 1, + CACHE_REPLACEMENT_RANDOM = 2, + CACHE_REPLACEMENT_NR = 3, +}; + +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; +}; + +/* BCH_SB_FIELD_crypt: */ + +struct nonce { + __le32 d[4]; +}; + +struct bch_key { + __le64 key[4]; +}; + +#define BCH_KEY_MAGIC \ + (((u64) 'b' << 0)|((u64) 'c' << 8)| \ + ((u64) 'h' << 16)|((u64) '*' << 24)| \ + ((u64) '*' << 32)|((u64) 'k' << 40)| \ + ((u64) 'e' << 48)|((u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; + struct bch_key key; +}; + +/* + * If this field is present in the superblock, it stores an encryption key which + * is used encrypt all other data/metadata. The key will normally be encrypted + * with the key userspace provides, but if encryption has been turned off we'll + * just store the master key unencrypted in the superblock so we can access the + * previously encrypted data. + */ +struct bch_sb_field_crypt { + struct bch_sb_field field; + + __le64 flags; + __le64 kdf_flags; + struct bch_encrypted_key key; +}; + +LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); + +enum bch_kdf_types { + BCH_KDF_SCRYPT = 0, + BCH_KDF_NR = 1, +}; + +/* stored as base 2 log of scrypt params: */ +LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); +LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); +LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + +/* BCH_SB_FIELD_replicas: */ + +enum bch_data_type { + BCH_DATA_NONE = 0, + BCH_DATA_SB = 1, + BCH_DATA_JOURNAL = 2, + BCH_DATA_BTREE = 3, + BCH_DATA_USER = 4, + BCH_DATA_CACHED = 5, + BCH_DATA_NR = 6, +}; + +struct bch_replicas_entry { + u8 data_type; + u8 nr; + u8 devs[]; +}; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[]; +}; + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __attribute__((packed, aligned(8))); + +/* BCH_SB_FIELD_disk_groups: */ + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +}; + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[0]; +}; + +/* + * On clean shutdown, store btree roots and current journal sequence number in + * the superblock: + */ +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; + + union { + struct bkey_i start[0]; + __u64 _data[0]; + }; +}; + +struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; + __le16 read_clock; + __le16 write_clock; + __le64 journal_seq; + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +}; + +/* Superblock: */ + +/* + * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS + * BCH_MEMBER_DATA_ALLOWED + * Version 9: incompatible extent nonce change + */ + +#define BCH_SB_VERSION_MIN 7 +#define BCH_SB_VERSION_EXTENT_MAX 8 +#define BCH_SB_VERSION_EXTENT_NONCE_V1 9 +#define BCH_SB_VERSION_MAX 9 + +#define BCH_SB_SECTOR 8 +#define 
BCH_SB_MEMBERS_MAX 64 /* XXX kill */ + +struct bch_sb_layout { + __uuid_t magic; /* bcachefs superblock UUID */ + __u8 layout_type; + __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ + __u8 nr_superblocks; + __u8 pad[5]; + __le64 sb_offset[61]; +} __attribute__((packed, aligned(8))); + +#define BCH_SB_LAYOUT_SECTOR 7 + +/* + * @offset - sector where this sb was written + * @version - on disk format version + * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) + * @seq - incremented each time superblock is written + * @uuid - used for generating various magic numbers and identifying + * member devices, never changes + * @user_uuid - user visible UUID, may be changed + * @label - filesystem label + * @seq - identifies most recent superblock, incremented each time + * superblock is written + * @features - enabled incompatible features + */ +struct bch_sb { + struct bch_csum csum; + __le16 version; + __le16 version_min; + __le16 pad[2]; + __uuid_t magic; + __uuid_t uuid; + __uuid_t user_uuid; + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 offset; + __le64 seq; + + __le16 block_size; + __u8 dev_idx; + __u8 nr_devices; + __le32 u64s; + + __le64 time_base_lo; + __le32 time_base_hi; + __le32 time_precision; + + __le64 flags[8]; + __le64 features[2]; + __le64 compat[2]; + + struct bch_sb_layout layout; + + union { + struct bch_sb_field start[0]; + __le64 _data[0]; + }; +} __attribute__((packed, aligned(8))); + +/* + * Flags: + * BCH_SB_INITALIZED - set on first mount + * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect + * behaviour of mount/recovery path: + * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits + * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 + * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides + * DATA/META_CSUM_TYPE. 
Also indicates encryption + * algorithm in use, if/when we get more than one + */ + +LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); + +LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); +LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); +LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); +LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); + +LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); + +LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); +LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); + +LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); +LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); + +LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); + +LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); +LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); +LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); +LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + +/* 60-64 unused */ + +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); + +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); +LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); + +/* + * Max size of an extent that may require bouncing to read or write + * (checksummed, compressed): 64k + */ +LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, + struct bch_sb, flags[1], 14, 20); + +LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); + +LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); +LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); +LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); + +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, + struct bch_sb, flags[2], 0, 4); + +/* Features: */ +enum bch_sb_features { + BCH_FEATURE_LZ4 = 0, + BCH_FEATURE_GZIP = 1, + BCH_FEATURE_ZSTD = 2, + BCH_FEATURE_ATOMIC_NLINK = 3, +}; + +/* options: */ + +#define BCH_REPLICAS_MAX 4U + +enum bch_error_actions { + BCH_ON_ERROR_CONTINUE = 0, + BCH_ON_ERROR_RO = 1, + BCH_ON_ERROR_PANIC = 2, + BCH_NR_ERROR_ACTIONS = 3, +}; + +enum bch_csum_opts { + BCH_CSUM_OPT_NONE = 0, + BCH_CSUM_OPT_CRC32C = 1, + BCH_CSUM_OPT_CRC64 = 2, + BCH_CSUM_OPT_NR = 3, +}; + +enum bch_str_hash_opts { + BCH_STR_HASH_CRC32C = 0, + BCH_STR_HASH_CRC64 = 1, + BCH_STR_HASH_SIPHASH = 2, + BCH_STR_HASH_NR = 3, +}; + +#define BCH_COMPRESSION_TYPES() \ + x(NONE) \ + x(LZ4) \ + x(GZIP) \ + x(ZSTD) + +enum bch_compression_opts { +#define x(t) BCH_COMPRESSION_OPT_##t, + BCH_COMPRESSION_TYPES() +#undef x + BCH_COMPRESSION_OPT_NR +}; + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define BCACHE_MAGIC \ + UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCHFS_MAGIC \ + UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + +#define BCACHEFS_STATFS_MAGIC 0xca451a4e + +#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) +#define BSET_MAGIC 
__cpu_to_le64(0x90135c78b99e07f5ULL) + +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} + +static inline __u64 __jset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); +} + +static inline __u64 __bset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); +} + +/* Journal */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION_JKEYS 2 +#define BCACHE_JSET_VERSION 2 + +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +#define BCH_JSET_ENTRY_TYPES() \ + x(btree_keys, 0) \ + x(btree_root, 1) \ + x(prio_ptrs, 2) \ + x(blacklist, 3) \ + x(blacklist_v2, 4) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, + BCH_JSET_ENTRY_TYPES() +#undef x + BCH_JSET_ENTRY_NR +}; + +/* + * Journal sequence numbers can be blacklisted: bsets record the max sequence + * number of all the journal entries they contain updates for, so that on + * recovery we can ignore those bsets that contain index updates newer that what + * made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, and + * then record that we skipped it so that the next time we crash and recover we + * don't think there was a missing journal entry. + */ +struct jset_entry_blacklist { + struct jset_entry entry; + __le64 seq; +}; + +struct jset_entry_blacklist_v2 { + struct jset_entry entry; + __le64 start; + __le64 end; +}; + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ +struct jset { + struct bch_csum csum; + + __le64 magic; + __le64 seq; + __le32 version; + __le32 flags; + + __le32 u64s; /* size of d[] in u64s */ + + __u8 encrypted_start[0]; + + __le16 read_clock; + __le16 write_clock; + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; + + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +} __attribute__((packed, aligned(8))); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); + +#define BCH_JOURNAL_BUCKETS_MIN 20 + +/* Btree: */ + +#define DEFINE_BCH_BTREE_IDS() \ + DEF_BTREE_ID(EXTENTS, 0, "extents") \ + DEF_BTREE_ID(INODES, 1, "inodes") \ + DEF_BTREE_ID(DIRENTS, 2, "dirents") \ + DEF_BTREE_ID(XATTRS, 3, "xattrs") \ + DEF_BTREE_ID(ALLOC, 4, "alloc") \ + DEF_BTREE_ID(QUOTAS, 5, "quotas") + +#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, + +enum btree_id { + DEFINE_BCH_BTREE_IDS() + BTREE_ID_NR +}; + +#undef DEF_BTREE_ID + +#define BTREE_MAX_DEPTH 4U + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_KEY_v1 2 +#define BCACHE_BSET_JOURNAL_SEQ 3 +#define BCACHE_BSET_VERSION 3 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __le64 seq; + + /* + * Highest journal entry this bset contains keys for. 
+ * If on recovery we don't see that journal entry, this bset is ignored: + * this allows us to preserve the order of all index updates after a + * crash, since the journal records a total order of all index updates + * and anything that didn't make it to the journal doesn't get used. + */ + __le64 journal_seq; + + __le32 flags; + __le16 version; + __le16 u64s; /* count of d[] in u64s */ + + union { + struct bkey_packed start[0]; + __u64 _data[0]; + }; +} __attribute__((packed, aligned(8))); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + +LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); +LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, + struct bset, flags, 5, 6); + +struct btree_node { + struct bch_csum csum; + __le64 magic; + + /* this flags field is encrypted, unlike bset->flags: */ + __le64 flags; + + /* Closed interval: */ + struct bpos min_key; + struct bpos max_key; + struct bch_extent_ptr ptr; + struct bkey_format format; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __attribute__((packed, aligned(8))); + +LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); +/* 8-32 unused */ +LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); + +struct btree_node_entry { + struct bch_csum csum; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __attribute__((packed, aligned(8))); + +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 index 000000000000..c65104ed454a --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IOCTL_H +#define _BCACHEFS_IOCTL_H + +#include +#include +#include "bcachefs_format.h" + +/* + * Flags common to multiple ioctls: + */ +#define BCH_FORCE_IF_DATA_LOST (1 << 0) +#define BCH_FORCE_IF_METADATA_LOST (1 << 1) +#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) +#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) + +#define BCH_FORCE_IF_DEGRADED \ + (BCH_FORCE_IF_DATA_DEGRADED| \ + BCH_FORCE_IF_METADATA_DEGRADED) + +/* + * If cleared, ioctl that refer to a device pass it as a pointer to a pathname + * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX			(1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
+#define BCH_READ_DEV			(1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE	_IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL	_IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+	__u32			flags;
+	__u32			nr_devs;
+	__u64			pad;
+	__u64			devs[];
+};
+
+struct bch_ioctl_incremental {
+	__u32			flags;
+	__u64			pad;
+	__u64			dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc, 1, struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START		_IOW(0xbc, 2, struct bch_ioctl_start)
+#define BCH_IOCTL_STOP		_IO(0xbc, 3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD	_IOW(0xbc, 4, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc, 5, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc, 6, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc, 7, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA		_IOW(0xbc, 10, struct bch_ioctl_data)
+#define BCH_IOCTL_USAGE		_IOWR(0xbc, 11, struct bch_ioctl_usage)
+#define BCH_IOCTL_READ_SUPER	_IOW(0xbc, 12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc, 13, struct bch_ioctl_disk_resize)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+	__uuid_t		uuid;
+};
+
+#if 0
+struct bch_ioctl_start {
+	__u32			flags;
+	__u32			pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read write devices
+ * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
+ * set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state		- one of the bch_member_state states (rw, ro, failed,
+ *			  spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+	__u32			flags;
+	__u8			new_state;
+	__u8			pad[3];
+	__u64			dev;
+};
+
+enum bch_data_ops {
+	BCH_DATA_OP_SCRUB	= 0,
+	BCH_DATA_OP_REREPLICATE	= 1,
+	BCH_DATA_OP_MIGRATE	= 2,
+	BCH_DATA_OP_NR		= 3,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
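+ *
+ * A minimal userspace sketch (illustrative only, not part of the original
+ * header; fs_fd is assumed to be an open fd referring to the mounted
+ * filesystem, and whether read() blocks until progress changes or returns the
+ * current progress immediately is not specified here):
+ *
+ *	struct bch_ioctl_data op = {
+ *		.op	= BCH_DATA_OP_REREPLICATE,
+ *		.start	= POS_MIN,
+ *		.end	= POS_MAX,
+ *	};
+ *	int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &op);
+ *	struct bch_ioctl_data_event e;
+ *
+ *	while (read(progress_fd, &e, sizeof(e)) == sizeof(e) &&
+ *	       e.p.sectors_done < e.p.sectors_total)
+ *		;
+ *	close(progress_fd);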
+ */ +struct bch_ioctl_data { + __u32 op; + __u32 flags; + + struct bpos start; + struct bpos end; + + union { + struct { + __u32 dev; + __u32 pad; + } migrate; + struct { + __u64 pad[8]; + }; + }; +} __attribute__((packed, aligned(8))); + +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, + /* XXX: add an event for reporting errors */ + BCH_DATA_EVENT_NR = 1, +}; + +struct bch_ioctl_data_progress { + __u8 data_type; + __u8 btree_id; + __u8 pad[2]; + struct bpos pos; + + __u64 sectors_done; + __u64 sectors_total; +} __attribute__((packed, aligned(8))); + +struct bch_ioctl_data_event { + __u8 type; + __u8 pad[7]; + union { + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; +} __attribute__((packed, aligned(8))); + +struct bch_ioctl_dev_usage { + __u8 state; + __u8 alive; + __u8 pad[6]; + __u32 dev; + + __u32 bucket_size; + __u64 nr_buckets; + + __u64 buckets[BCH_DATA_NR]; + __u64 sectors[BCH_DATA_NR]; +}; + +struct bch_ioctl_fs_usage { + __u64 capacity; + __u64 used; + __u64 online_reserved; + __u64 persistent_reserved[BCH_REPLICAS_MAX]; + __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; +}; + +/* + * BCH_IOCTL_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @nr_devices - number of devices userspace allocated space for in @devs + * + * On success, @fs and @devs will be filled out appropriately and devs[i].alive + * will indicate if a device was present in that slot + * + * Returns -ERANGE if @nr_devices was too small + */ +struct bch_ioctl_usage { + __u16 nr_devices; + __u16 pad[3]; + + struct bch_ioctl_fs_usage fs; + struct bch_ioctl_dev_usage devs[0]; +}; + +/* + * BCH_IOCTL_READ_SUPER: read filesystem superblock + * + * Equivalent to reading the superblock directly from the block device, except + * avoids racing with the kernel writing the superblock or having to figure out + * which block device to read + * + * @sb - buffer to read into + * @size - size of userspace allocated buffer + * @dev - device to read superblock for, if BCH_READ_DEV flag is + * specified + * + * Returns -ERANGE if buffer provided is too small + */ +struct bch_ioctl_read_super { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 size; + __u64 sb; +}; + +/* + * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to + * determine if disk is a (online) member - if so, returns device's index + * + * Returns -ENOENT if not found + */ +struct bch_ioctl_disk_get_idx { + __u64 dev; +}; + +/* + * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 index 000000000000..c0e86ada1c53 --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1164 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "bset.h" +#include "util.h" + +#undef EBUG_ON + +#ifdef DEBUG_BKEYS +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) +{ + unsigned bit = high_bit_offset, done = 0; + + while (1) { + while (bit < 64) { + if (done 
&& !(done % 8)) + *out++ = ' '; + *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; + bit++; + done++; + if (done == nr_bits) { + *out++ = '\0'; + return; + } + } + + p = next_word(p); + bit = 0; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) +{ + struct bkey tmp; + + BUG_ON(bkeyp_val_u64s(format, packed) != + bkey_val_u64s(unpacked)); + + BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); + + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { + char buf1[160], buf2[160]; + char buf3[160], buf4[160]; + + bch2_bkey_to_text(buf1, sizeof(buf1), unpacked); + bch2_bkey_to_text(buf2, sizeof(buf2), &tmp); + bch2_to_binary(buf3, (void *) unpacked, 80); + bch2_to_binary(buf4, high_word(format, packed), 80); + + panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", + format->key_u64s, + format->bits_per_field[0], + format->bits_per_field[1], + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4], + buf1, buf2, buf3, buf4); + } +} + +#else +static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) {} +#endif + +struct pack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct pack_state pack_state_init(const struct bkey_format *format, + struct bkey_packed *k) +{ + u64 *p = high_word(format, k); + + return (struct pack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = 0, + .p = p, + }; +} + +__always_inline +static void pack_state_finish(struct pack_state *state, + struct bkey_packed *k) +{ + EBUG_ON(state->p < k->_data); + EBUG_ON(state->p >= k->_data + state->format->key_u64s); + + *state->p = state->w; +} + +struct unpack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + const u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct unpack_state unpack_state_init(const struct bkey_format *format, + const struct bkey_packed *k) +{ + const u64 *p = high_word(format, k); + + return (struct unpack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = *p << high_bit_offset, + .p = p, + }; +} + +__always_inline +static u64 get_inc_field(struct unpack_state *state, unsigned field) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); + + if (bits >= state->bits) { + v = state->w >> (64 - bits); + bits -= state->bits; + + state->p = next_word(state->p); + state->w = *state->p; + state->bits = 64; + } + + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + v |= (state->w >> 1) >> (63 - bits); + state->w <<= bits; + state->bits -= bits; + + return v + offset; +} + +__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + + if (v < offset) + return false; + + v -= offset; + + if (fls64(v) > bits) + return false; + + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = 
state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return true; +} + +/* + * Note: does NOT set out->format (we don't know what it should be here!) + * + * Also: doesn't work on extents - it doesn't preserve the invariant that + * if k is packed bkey_start_pos(k) will successfully pack + */ +static bool bch2_bkey_transform_key(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + struct pack_state out_s = pack_state_init(out_f, out); + struct unpack_state in_s = unpack_state_init(in_f, in); + u64 *w = out->_data; + unsigned i; + + *w = 0; + + for (i = 0; i < BKEY_NR_FIELDS; i++) + if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) + return false; + + /* Can't happen because the val would be too big to unpack: */ + EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); + + pack_state_finish(&out_s, out); + out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + return true; +} + +bool bch2_bkey_transform(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + if (!bch2_bkey_transform_key(out_f, out, in_f, in)) + return false; + + memcpy_u64s((u64 *) out + out_f->key_u64s, + (u64 *) in + in_f->key_u64s, + (in->u64s - in_f->key_u64s)); + return true; +} + +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bkey out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); + + out.u64s = BKEY_U64s + in->u64s - format->key_u64s; + out.format = KEY_FORMAT_CURRENT; + out.needs_whiteout = in->needs_whiteout; + out.type = in->type; + out.pad[0] = 0; + +#define x(id, field) out.field = get_inc_field(&state, id); + bkey_fields() +#undef x + + return out; +} + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bpos out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + + out.inode = get_inc_field(&state, BKEY_FIELD_INODE); + out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); + out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); + + return out; +} +#endif + +/** + * bch2_bkey_pack_key -- pack just the key, not the value + */ +bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, + const struct bkey_format *format) +{ + struct pack_state state = pack_state_init(format, out); + u64 *w = out->_data; + + EBUG_ON((void *) in == (void *) out); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->format != KEY_FORMAT_CURRENT); + + *w = 0; + +#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; + bkey_fields() +#undef x + + /* + * Extents - we have to guarantee that if an extent 
is packed, a trimmed + * version will also pack: + */ + if (bkey_start_offset(in) < + le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) + return false; + + pack_state_finish(&state, out); + out->u64s = format->key_u64s + in->u64s - BKEY_U64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + bch2_bkey_pack_verify(out, in, format); + return true; +} + +/** + * bch2_bkey_unpack -- unpack the key and the value + */ +void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, + const struct bkey_packed *src) +{ + dst->k = bkey_unpack_key(b, src); + + memcpy_u64s(&dst->v, + bkeyp_val(&b->format, src), + bkeyp_val_u64s(&b->format, src)); +} + +/** + * bch2_bkey_pack -- pack the key and the value + */ +bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, + const struct bkey_format *format) +{ + struct bkey_packed tmp; + + if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + return false; + + memmove_u64s((u64 *) out + format->key_u64s, + &in->v, + bkey_val_u64s(&in->k)); + memcpy_u64s(out, &tmp, format->key_u64s); + + return true; +} + +__always_inline +static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + bool ret = true; + + EBUG_ON(v < offset); + v -= offset; + + if (fls64(v) > bits) { + v = ~(~0ULL << bits); + ret = false; + } + + if (bits > state->bits) { + bits -= state->bits; + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static bool bkey_packed_successor(struct bkey_packed *out, + const struct btree *b, + struct bkey_packed k) +{ + const struct bkey_format *f = &b->format; + unsigned nr_key_bits = b->nr_key_bits; + unsigned first_bit, offset; + u64 *p; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + if (!nr_key_bits) + return false; + + *out = k; + + first_bit = high_bit_offset + nr_key_bits - 1; + p = nth_word(high_word(f, out), first_bit >> 6); + offset = 63 - (first_bit & 63); + + while (nr_key_bits) { + unsigned bits = min(64 - offset, nr_key_bits); + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if ((*p & mask) != mask) { + *p += 1ULL << offset; + EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + + *p &= ~mask; + p = prev_word(p); + nr_key_bits -= bits; + offset = 0; + } + + return false; +} +#endif + +/* + * Returns a packed key that compares <= in + * + * This is used in bset_search_tree(), where we need a packed pos in order to be + * able to compare against the keys in the auxiliary search tree - and it's + * legal to use a packed pos that isn't equivalent to the original pos, + * _provided_ it compares <= to the original pos. 
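+ *
+ * An illustrative example (not part of the original comment): if the node's
+ * format allots 8 bits to the inode field with a field_offset of 0 and we ask
+ * to pack POS(300, 5), 300 needs 9 bits, so the inode field is clamped to 255
+ * and the remaining fields are saturated to the largest values the format can
+ * hold; the result still compares less than (300, 5), and
+ * BKEY_PACK_POS_SMALLER is returned. If every field fits, the result is
+ * BKEY_PACK_POS_EXACT; if no packable pos compares <= the original (the inode
+ * is below the format's inode field_offset), BKEY_PACK_POS_FAIL is returned.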
+ */ +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + struct bpos in, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + struct pack_state state = pack_state_init(f, out); + u64 *w = out->_data; +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos orig = in; +#endif + bool exact = true; + + *w = 0; + + if (unlikely(in.snapshot < + le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { + if (!in.offset-- && + !in.inode--) + return BKEY_PACK_POS_FAIL; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.offset < + le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { + if (!in.inode--) + return BKEY_PACK_POS_FAIL; + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.inode < + le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) + return BKEY_PACK_POS_FAIL; + + if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) + exact = false; + + pack_state_finish(&state, out); + out->u64s = f->key_u64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->type = KEY_TYPE_DELETED; + +#ifdef CONFIG_BCACHEFS_DEBUG + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0); + } +#endif + + return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; +} + +void bch2_bkey_format_init(struct bkey_format_state *s) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) + s->field_min[i] = U64_MAX; + + for (i = 0; i < ARRAY_SIZE(s->field_max); i++) + s->field_max[i] = 0; + + /* Make sure we can store a size of 0: */ + s->field_min[BKEY_FIELD_SIZE] = 0; +} + +static void __bkey_format_add(struct bkey_format_state *s, + unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x + __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) +{ + unsigned field = 0; + + __bkey_format_add(s, field++, p.inode); + __bkey_format_add(s, field++, p.offset); + __bkey_format_add(s, field++, p.snapshot); +} + +/* + * We don't want it to be possible for the packed format to represent fields + * bigger than a u64... that will cause confusion and issues (like with + * bkey_packed_successor()) + */ +static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, + unsigned bits, u64 offset) +{ + offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); + + f->bits_per_field[i] = bits; + f->field_offset[i] = cpu_to_le64(offset); +} + +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + struct bkey_format ret = { + .nr_fields = BKEY_NR_FIELDS, + }; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { + s->field_min[i] = min(s->field_min[i], s->field_max[i]); + + set_format_field(&ret, i, + fls64(s->field_max[i] - s->field_min[i]), + s->field_min[i]); + + bits += ret.bits_per_field[i]; + } + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { + ret.bits_per_field[BKEY_FIELD_SIZE] += 4; + bits += 4; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); + + /* if we have enough spare bits, round fields up to nearest byte */ + bits = ret.key_u64s * 64 - bits; + + for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { + unsigned r = round_up(ret.bits_per_field[i], 8) - + ret.bits_per_field[i]; + + if (r <= bits) { + set_format_field(&ret, i, + ret.bits_per_field[i] + r, + le64_to_cpu(ret.field_offset[i])); + bits -= r; + } + } + + EBUG_ON(bch2_bkey_format_validate(&ret)); + return ret; +} + +const char *bch2_bkey_format_validate(struct bkey_format *f) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + + if (f->nr_fields != BKEY_NR_FIELDS) + return "incorrect number of fields"; + + for (i = 0; i < f->nr_fields; i++) { + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > 64) + return "field too large"; + + if (field_offset && + (f->bits_per_field[i] == 64 || + (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < + field_offset))) + return "offset + bits overflow"; + + bits += f->bits_per_field[i]; + } + + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) + return "incorrect key_u64s"; + + return NULL; +} + +/* + * Most significant differing bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, + const struct bkey_packed *l_k, + const struct bkey_packed *r_k) +{ + const u64 *l = high_word(&b->format, l_k); + const u64 *r = high_word(&b->format, r_k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned word_bits = 64 - high_bit_offset; + u64 l_v, r_v; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + /* for big endian, skip past header */ + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (nr_key_bits) { + if (nr_key_bits < word_bits) { + l_v >>= word_bits - nr_key_bits; + r_v >>= word_bits - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= word_bits; + } + + if (l_v != r_v) + return fls64(l_v ^ r_v) - 1 + nr_key_bits; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + word_bits = 64; + } + + return 0; +} + +/* + * First set bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) +{ + const u64 *p = high_word(&b->format, k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned ret = 0, offset; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + offset = nr_key_bits; + while (offset > 64) { + p = next_word(p); + offset -= 64; + } + + offset = 64 - offset; + + while (nr_key_bits) { + unsigned bits = nr_key_bits + offset < 64 + ? 
nr_key_bits + : 64 - offset; + + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if (*p & mask) + return ret + __ffs64(*p & mask) - offset; + + p = prev_word(p); + nr_key_bits -= bits; + ret += bits; + offset = 0; + } + + return 0; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + "1:;" + "mov r8, [rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} + +#define I(_x) (*(out)++ = (_x)) +#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) +#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) +#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) +#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) + +static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, + enum bch_bkey_fields field, + unsigned dst_offset, unsigned dst_size, + bool *eax_zeroed) +{ + unsigned bits = format->bits_per_field[field]; + u64 offset = le64_to_cpu(format->field_offset[field]); + unsigned i, byte, bit_offset, align, shl, shr; + + if (!bits && !offset) { + if (!*eax_zeroed) { + /* xor eax, eax */ + I2(0x31, 0xc0); + } + + *eax_zeroed = true; + goto set_field; + } + + if (!bits) { + /* just return offset: */ + + switch (dst_size) { + case 8: + if (offset > S32_MAX) { + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + + I3(0xc7, 0x47, dst_offset + 4); + memcpy(out, (void *) &offset + 4, 4); + out += 4; + } else { + /* mov [rdi + dst_offset], offset */ + /* sign extended */ + I4(0x48, 0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + } + break; + case 4: + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + break; + default: + BUG(); + } + + return out; + } + + bit_offset = format->key_u64s * 64; + for (i = 0; i <= field; i++) + bit_offset -= format->bits_per_field[i]; + + byte = bit_offset / 8; + bit_offset -= byte * 8; + + *eax_zeroed = false; + + if (bit_offset == 0 && bits == 8) { + /* movzx eax, BYTE PTR [rsi + imm8] */ + I4(0x0f, 0xb6, 0x46, byte); + } else if (bit_offset == 0 && bits == 16) { + /* movzx eax, WORD PTR [rsi + imm8] */ + I4(0x0f, 0xb7, 0x46, byte); + } else if (bit_offset + bits <= 32) { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 32); + + /* mov eax, [rsi + imm8] */ + I3(0x8b, 0x46, byte); + + if (bit_offset) { + /* shr eax, imm8 */ + I3(0xc1, 0xe8, bit_offset); + } + + if (bit_offset + bits < 32) { + unsigned mask = ~0U >> (32 - bits); + + /* and eax, imm32 */ + I1(0x25); + memcpy(out, &mask, 4); + out += 4; + } + } else if (bit_offset + bits <= 64) { + align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 64); + + /* mov rax, [rsi + imm8] */ + I4(0x48, 0x8b, 0x46, byte); + + shl = 64 - bit_offset - 
bits; + shr = bit_offset + shl; + + if (shl) { + /* shl rax, imm8 */ + I4(0x48, 0xc1, 0xe0, shl); + } + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } else { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 96); + + /* mov rax, [rsi + byte] */ + I4(0x48, 0x8b, 0x46, byte); + + /* mov edx, [rsi + byte + 8] */ + I3(0x8b, 0x56, byte + 8); + + /* bits from next word: */ + shr = bit_offset + bits - 64; + BUG_ON(shr > bit_offset); + + /* shr rax, bit_offset */ + I4(0x48, 0xc1, 0xe8, shr); + + /* shl rdx, imm8 */ + I4(0x48, 0xc1, 0xe2, 64 - shr); + + /* or rax, rdx */ + I3(0x48, 0x09, 0xd0); + + shr = bit_offset - shr; + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } + + /* rax += offset: */ + if (offset > S32_MAX) { + /* mov rdx, imm64 */ + I2(0x48, 0xba); + memcpy(out, &offset, 8); + out += 8; + /* add %rdx, %rax */ + I3(0x48, 0x01, 0xd0); + } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { + /* add rax, imm32 */ + I2(0x48, 0x05); + memcpy(out, &offset, 4); + out += 4; + } else if (offset) { + /* add eax, imm32 */ + I1(0x05); + memcpy(out, &offset, 4); + out += 4; + } +set_field: + switch (dst_size) { + case 8: + /* mov [rdi + dst_offset], rax */ + I4(0x48, 0x89, 0x47, dst_offset); + break; + case 4: + /* mov [rdi + dst_offset], eax */ + I3(0x89, 0x47, dst_offset); + break; + default: + BUG(); + } + + return out; +} + +int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) +{ + bool eax_zeroed = false; + u8 *out = _out; + + /* + * rdi: dst - unpacked key + * rsi: src - packed key + */ + + /* k->u64s, k->format, k->type */ + + /* mov eax, [rsi] */ + I2(0x8b, 0x06); + + /* add eax, BKEY_U64s - format->key_u64s */ + I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); + + /* and eax, imm32: mask out k->pad: */ + I5(0x25, 0xff, 0xff, 0xff, 0); + + /* mov [rdi], eax */ + I2(0x89, 0x07); + +#define x(id, field) \ + out = compile_bkey_field(format, out, id, \ + offsetof(struct bkey, field), \ + sizeof(((struct bkey *) NULL)->field), \ + &eax_zeroed); + bkey_fields() +#undef x + + /* retq */ + I1(0xc3); + + return (void *) out - _out; +} + +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (l_v != r_v) + return l_v < r_v ? 
-1 : 1; + + if (!nr_key_bits) + return 0; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } +} +#endif + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); +} + +__pure __flatten +int __bch2_bkey_cmp_packed(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + int packed = bkey_lr_packed(l, r); + + if (likely(packed == BKEY_PACKED_BOTH)) + return __bch2_bkey_cmp_packed_format_checked(l, r, b); + + switch (packed) { + case BKEY_PACKED_NONE: + return bkey_cmp(((struct bkey *) l)->p, + ((struct bkey *) r)->p); + case BKEY_PACKED_LEFT: + return __bch2_bkey_cmp_left_packed_format_checked(b, + (struct bkey_packed *) l, + &((struct bkey *) r)->p); + case BKEY_PACKED_RIGHT: + return -__bch2_bkey_cmp_left_packed_format_checked(b, + (struct bkey_packed *) r, + &((struct bkey *) l)->p); + default: + unreachable(); + } +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + const struct bkey *l_unpacked; + + return unlikely(l_unpacked = packed_to_bkey_c(l)) + ? bkey_cmp(l_unpacked->p, *r) + : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +void bch2_bpos_swab(struct bpos *p) +{ + u8 *l = (u8 *) p; + u8 *h = ((u8 *) &p[1]) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) +{ + const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; + u8 *l = k->key_start; + u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void) +{ + struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); + struct bkey_packed p; + + struct bkey_format test_format = { + .key_u64s = 2, + .nr_fields = BKEY_NR_FIELDS, + .bits_per_field = { + 13, + 64, + }, + }; + + struct unpack_state in_s = + unpack_state_init(&bch2_bkey_format_current, (void *) &t); + struct pack_state out_s = pack_state_init(&test_format, &p); + unsigned i; + + for (i = 0; i < out_s.format->nr_fields; i++) { + u64 a, v = get_inc_field(&in_s, i); + + switch (i) { +#define x(id, field) case id: a = t.field; break; + bkey_fields() +#undef x + default: + BUG(); + } + + if (a != v) + panic("got %llu actual %llu i %u\n", v, a, i); + + if (!set_inc_field(&out_s, i, v)) + panic("failed at %u\n", i); + } + + BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); +} +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 index 000000000000..9a0286d86784 --- /dev/null +++ b/fs/bcachefs/bkey.h @@ -0,0 +1,627 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H + +#include +#include "bcachefs_format.h" + +#include "util.h" +#include "vstructs.h" + +#if 0 + +/* + * compiled unpack functions are disabled, pending a new interface for + * dynamically allocating executable memory: + */ + +#ifdef CONFIG_X86_64 +#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#endif +#endif + +void bch2_to_binary(char *, const u64 *, unsigned); + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_next(_k) vstruct_next(_k) + +static inline unsigned bkey_val_u64s(const struct bkey *k) +{ + return k->u64s - BKEY_U64s; +} + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + k->u64s = BKEY_U64s + val_u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); +} + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD) + +#define bkey_packed_typecheck(_k) \ +({ \ + BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ + !type_is(_k, struct bkey_packed *)); \ + type_is(_k, struct bkey_packed *); \ +}) + +enum bkey_lr_packed { + BKEY_PACKED_BOTH, + BKEY_PACKED_RIGHT, + BKEY_PACKED_LEFT, + BKEY_PACKED_NONE, +}; + +#define bkey_lr_packed_typecheck(_l, _r) \ + (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) + +#define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + +#define bkey_copy(_dst, _src) \ +do { \ + BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ + !type_is(_dst, struct bkey_packed *)); \ + BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ + !type_is(_src, struct bkey_packed *)); \ + EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ + (u64 *) (_dst) < (u64 *) (_src) + \ + ((struct bkey *) (_src))->u64s); \ + \ + __memmove_u64s_down((_dst), (_src), \ + ((struct bkey *) (_src))->u64s); \ +} while (0) + +struct btree; + +struct bkey_format_state { + u64 
field_min[BKEY_NR_FIELDS]; + u64 field_max[BKEY_NR_FIELDS]; +}; + +void bch2_bkey_format_init(struct bkey_format_state *); +void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); +const char *bch2_bkey_format_validate(struct bkey_format *); + +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +__pure +unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, + const struct bkey_packed *, + const struct btree *); + +__pure +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +__pure +int __bch2_bkey_cmp_packed(const struct bkey_packed *, + const struct bkey_packed *, + const struct btree *); + +__pure +int __bch2_bkey_cmp_left_packed(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +static inline __pure +int bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, const struct bpos *r) +{ + return __bch2_bkey_cmp_left_packed(b, l, r); +} + +/* + * we prefer to pass bpos by ref, but it's often enough terribly convenient to + * pass it by by val... as much as I hate c++, const ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, + const struct bkey_packed *l, + struct bpos r) +{ + return bkey_cmp_left_packed(b, l, &r); +} + +/* + * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to + * skip dispatching on k->format: + */ +#define bkey_cmp_packed(_b, _l, _r) \ +({ \ + int _cmp; \ + \ + switch (bkey_lr_packed_typecheck(_l, _r)) { \ + case BKEY_PACKED_NONE: \ + _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ + ((struct bkey *) (_r))->p); \ + break; \ + case BKEY_PACKED_LEFT: \ + _cmp = bkey_cmp_left_packed((_b), \ + (struct bkey_packed *) (_l), \ + &((struct bkey *) (_r))->p); \ + break; \ + case BKEY_PACKED_RIGHT: \ + _cmp = -bkey_cmp_left_packed((_b), \ + (struct bkey_packed *) (_r), \ + &((struct bkey *) (_l))->p); \ + break; \ + case BKEY_PACKED_BOTH: \ + _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ + (void *) (_r), (_b)); \ + break; \ + } \ + _cmp; \ +}) + +#if 1 +static __always_inline int bkey_cmp(struct bpos l, struct bpos r) +{ + if (l.inode != r.inode) + return l.inode < r.inode ? -1 : 1; + if (l.offset != r.offset) + return l.offset < r.offset ? -1 : 1; + if (l.snapshot != r.snapshot) + return l.snapshot < r.snapshot ? -1 : 1; + return 0; +} +#else +int bkey_cmp(struct bpos l, struct bpos r); +#endif + +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bkey_cmp(l, r) < 0 ? l : r; +} + +void bch2_bpos_swab(struct bpos *); +void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +static __always_inline int bversion_cmp(struct bversion l, struct bversion r) +{ + return (l.hi > r.hi) - (l.hi < r.hi) ?: + (l.lo > r.lo) - (l.lo < r.lo); +} + +#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) +#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) + +static __always_inline int bversion_zero(struct bversion v) +{ + return !bversion_cmp(v, ZERO_VERSION); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +/* statement expressions confusing unlikely()? 
*/ +#define bkey_packed(_k) \ + ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ + (_k)->format != KEY_FORMAT_CURRENT; }) +#else +#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) +#endif + +/* + * It's safe to treat an unpacked bkey as a packed one, but not the reverse + */ +static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) +{ + return (struct bkey_packed *) k; +} + +static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) +{ + return (const struct bkey_packed *) k; +} + +static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (struct bkey_i *) k; +} + +static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (const struct bkey *) k; +} + +static inline unsigned bkey_format_key_bits(const struct bkey_format *format) +{ + return format->bits_per_field[BKEY_FIELD_INODE] + + format->bits_per_field[BKEY_FIELD_OFFSET] + + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; +} + +static inline struct bpos bkey_successor(struct bpos p) +{ + struct bpos ret = p; + + if (!++ret.offset) + BUG_ON(!++ret.inode); + + return ret; +} + +static inline struct bpos bkey_predecessor(struct bpos p) +{ + struct bpos ret = p; + + if (!ret.offset--) + BUG_ON(!ret.inode--); + + return ret; +} + +static inline u64 bkey_start_offset(const struct bkey *k) +{ + return k->p.offset - k->size; +} + +static inline struct bpos bkey_start_pos(const struct bkey *k) +{ + return (struct bpos) { + .inode = k->p.inode, + .offset = bkey_start_offset(k), + .snapshot = k->p.snapshot, + }; +} + +/* Packed helpers */ + +static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; + + EBUG_ON(k->u64s < ret); + return ret; +} + +static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_key_u64s(format, k) * sizeof(u64); +} + +static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return k->u64s - bkeyp_key_u64s(format, k); +} + +static inline size_t bkeyp_val_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_val_u64s(format, k) * sizeof(u64); +} + +static inline void set_bkeyp_val_u64s(const struct bkey_format *format, + struct bkey_packed *k, unsigned val_u64s) +{ + k->u64s = bkeyp_key_u64s(format, k) + val_u64s; +} + +#define bkeyp_val(_format, _k) \ + ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + +extern const struct bkey_format bch2_bkey_format_current; + +bool bch2_bkey_transform(const struct bkey_format *, + struct bkey_packed *, + const struct bkey_format *, + const struct bkey_packed *); + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *, + const struct bkey_packed *); +#endif + +bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, + const struct bkey_format *); + +enum bkey_pack_pos_ret { + BKEY_PACK_POS_EXACT, + BKEY_PACK_POS_SMALLER, + BKEY_PACK_POS_FAIL, +}; + +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, + const struct btree *); + +static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, + const struct btree *b) +{ + return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; +} + +void bch2_bkey_unpack(const struct btree *, struct bkey_i *, + const struct bkey_packed *); +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ + return f->bits_per_field[nr] < 64 + ? (le64_to_cpu(f->field_offset[nr]) + + ~(~0ULL << f->bits_per_field[nr])) + : U64_MAX; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +int bch2_compile_bkey_format(const struct bkey_format *, void *); + +#else + +static inline int bch2_compile_bkey_format(const struct bkey_format *format, + void *out) { return 0; } + +#endif + +static inline void bkey_reassemble(struct bkey_i *dst, + struct bkey_s_c src) +{ + BUG_ON(bkey_packed(src.k)); + dst->k = *src.k; + memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); +} + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. 
struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + _assert(k.k->type, nr); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + _assert(k.k->type, nr); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bch_##name * \ +bkey_p_##name##_val(const struct bkey_format *f, \ + struct bkey_packed *k) \ +{ \ + return container_of(bkeyp_val(f, k), struct bch_##name, v); \ +} \ + \ +static inline const struct bch_##name * \ +bkey_p_c_##name##_val(const struct bkey_format *f, \ + const struct bkey_packed *k) \ +{ \ + return container_of(bkeyp_val(f, k), struct bch_##name, v); \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = nr; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr) + +#define BKEY_VAL_ACCESSORS(name, _nr) \ + static inline void __bch_##name##_assert(u8 type, u8 nr) \ + { \ + EBUG_ON(type != _nr); \ + } \ + \ + __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert) + +BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE); + +static inline void __bch2_extent_assert(u8 type, u8 nr) +{ + EBUG_ON(type != BCH_EXTENT && type != 
BCH_EXTENT_CACHED); +} + +__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert); +BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); + +BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); +BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); +BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION); + +BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); + +BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); + +BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); + +BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); + +/* byte order helpers */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return f->key_u64s - 1; +} + +#define high_bit_offset 0 +#define nth_word(p, n) ((p) - (n)) + +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return 0; +} + +#define high_bit_offset KEY_PACKED_BITS_START +#define nth_word(p, n) ((p) + (n)) + +#else +#error edit for your odd byteorder. +#endif + +#define high_word(f, k) ((k)->_data + high_word_offset(f)) +#define next_word(p) nth_word(p, 1) +#define prev_word(p) nth_word(p, -1) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void); +#else +static inline void bch2_bkey_pack_test(void) {} +#endif + +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 index 000000000000..017425a534c6 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "alloc.h" +#include "dirent.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "quota.h" +#include "xattr.h" + +const struct bkey_ops bch2_bkey_ops[] = { + [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops, + [BKEY_TYPE_INODES] = bch2_bkey_inode_ops, + [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops, + [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, + [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, + [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, + [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, +}; + +const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[type]; + + switch (k.k->type) { + case KEY_TYPE_DELETED: + case KEY_TYPE_DISCARD: + return NULL; + + case KEY_TYPE_ERROR: + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; + + case KEY_TYPE_COOKIE: + return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) + ? 
"incorrect value size" + : NULL; + + default: + if (k.k->type < KEY_TYPE_GENERIC_NR) + return "invalid type"; + + return ops->key_invalid(c, k); + } +} + +const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[type]; + + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + + if (!ops->is_extents) { + if (k.k->size) + return "nonzero size field"; + } else { + if ((k.k->size == 0) != bkey_deleted(k.k)) + return "bad size field"; + } + + if (ops->is_extents && + !k.k->size && + !bkey_deleted(k.k)) + return "zero size field"; + + if (k.k->p.snapshot) + return "nonzero snapshot"; + + if (type != BKEY_TYPE_BTREE && + !bkey_cmp(k.k->p, POS_MAX)) + return "POS_MAX key"; + + return NULL; +} + +const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + return __bch2_bkey_invalid(c, type, k) ?: + bch2_bkey_val_invalid(c, type, k); +} + +const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) +{ + if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) + return "key before start of btree node"; + + if (bkey_cmp(k.k->p, b->data->max_key) > 0) + return "key past end of btree node"; + + return NULL; +} + +void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +{ + enum bkey_type type = btree_node_type(b); + const struct bkey_ops *ops = &bch2_bkey_ops[type]; + const char *invalid; + + BUG_ON(!k.k->u64s); + + invalid = bch2_bkey_invalid(c, type, k) ?: + bch2_bkey_in_btree_node(b, k); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k); + bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); + return; + } + + if (k.k->type >= KEY_TYPE_GENERIC_NR && + ops->key_debugcheck) + ops->key_debugcheck(c, b, k); +} + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + +int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) +{ + char *out = buf, *end = buf + size; + + p("u64s %u type %u ", k->u64s, k->type); + + if (bkey_cmp(k->p, POS_MAX)) + p("%llu:%llu", k->p.inode, k->p.offset); + else + p("POS_MAX"); + + p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo); + + return out - buf; +} + +int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, + char *buf, size_t size, struct bkey_s_c k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[type]; + char *out = buf, *end = buf + size; + + switch (k.k->type) { + case KEY_TYPE_DELETED: + p(" deleted"); + break; + case KEY_TYPE_DISCARD: + p(" discard"); + break; + case KEY_TYPE_ERROR: + p(" error"); + break; + case KEY_TYPE_COOKIE: + p(" cookie"); + break; + default: + if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text) + ops->val_to_text(c, buf, size, k); + break; + } + + return out - buf; +} + +int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, + char *buf, size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + + out += bch2_bkey_to_text(out, end - out, k.k); + out += scnprintf(out, end - out, ": "); + out += bch2_val_to_text(c, type, out, end - out, k); + + return out - buf; +} + +void bch2_bkey_swab(enum bkey_type type, + const struct bkey_format *f, + struct bkey_packed *k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[type]; + + bch2_bkey_swab_key(f, k); + + if (ops->swab) + ops->swab(f, k); +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 index 000000000000..04c80f3603cc --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H + +#include "bkey.h" + +#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val, + +enum bkey_type { + DEFINE_BCH_BTREE_IDS() + BKEY_TYPE_BTREE, +}; + +#undef DEF_BTREE_ID + +/* Type of a key in btree @id at level @level: */ +static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) +{ + return level ? 
BKEY_TYPE_BTREE : (enum bkey_type) id; +} + +static inline bool btree_type_has_ptrs(enum bkey_type type) +{ + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + return true; + default: + return false; + } +} + +struct bch_fs; +struct btree; +struct bkey; + +enum merge_result { + BCH_MERGE_NOMERGE, + + /* + * The keys were mergeable, but would have overflowed size - so instead + * l was changed to the maximum size, and both keys were modified: + */ + BCH_MERGE_PARTIAL, + BCH_MERGE_MERGE, +}; + +typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *, + struct bkey_s); +typedef enum merge_result (*key_merge_fn)(struct bch_fs *, + struct btree *, + struct bkey_i *, struct bkey_i *); + +struct bkey_ops { + /* Returns reason for being invalid if invalid, else NULL: */ + const char * (*key_invalid)(const struct bch_fs *, + struct bkey_s_c); + void (*key_debugcheck)(struct bch_fs *, struct btree *, + struct bkey_s_c); + void (*val_to_text)(struct bch_fs *, char *, + size_t, struct bkey_s_c); + void (*swab)(const struct bkey_format *, struct bkey_packed *); + key_filter_fn key_normalize; + key_merge_fn key_merge; + bool is_extents; +}; + +const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type, + struct bkey_s_c); +const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); +const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); +const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); + +void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); + +int bch2_bkey_to_text(char *, size_t, const struct bkey *); +int bch2_val_to_text(struct bch_fs *, enum bkey_type, + char *, size_t, struct bkey_s_c); +int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type, + char *, size_t, struct bkey_s_c); + +void bch2_bkey_swab(enum bkey_type, const struct bkey_format *, + struct bkey_packed *); + +extern const struct bkey_ops bch2_bkey_ops[]; + +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 index 000000000000..faf58b4c0eb4 --- /dev/null +++ b/fs/bcachefs/bset.c @@ -0,0 +1,1849 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "btree_cache.h" +#include "bset.h" +#include "eytzinger.h" +#include "trace.h" +#include "util.h" + +#include +#include +#include +#include + +struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) +{ + struct bset_tree *t; + + for_each_bset(b, t) + if (k >= btree_bkey_first(b, t) && + k < btree_bkey_last(b, t)) + return t; + + BUG(); +} + +/* + * There are never duplicate live keys in the btree - but including keys that + * have been flagged as deleted (and will be cleaned up later) we _will_ see + * duplicates. + * + * Thus the sort order is: usual key comparison first, but for keys that compare + * equal the deleted key(s) come first, and the (at most one) live version comes + * last. + * + * The main reason for this is insertion: to handle overwrites, we first iterate + * over keys that compare equal to our insert key, and then insert immediately + * prior to the first key greater than the key we're inserting - our insert + * position will be after all keys that compare equal to our insert key, which + * by the time we actually do the insert will all be deleted. 
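+ *
+ * A purely illustrative example (positions are made up): if a bset holds
+ * three keys at position 5:10 - two already deleted and one live - a new
+ * key at 5:10 is inserted after all three; by the time the insert actually
+ * happens, the overwrite handling will have marked the live one deleted too.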
+ */ + +void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) +{ + struct bkey_packed *_k, *_n; + struct bkey k, n; + char buf[120]; + + if (!i->u64s) + return; + + for (_k = i->start, k = bkey_unpack_key(b, _k); + _k < vstruct_last(i); + _k = _n, k = n) { + _n = bkey_next(_k); + + bch2_bkey_to_text(buf, sizeof(buf), &k); + printk(KERN_ERR "block %u key %zi/%u: %s\n", set, + _k->_data - i->_data, i->u64s, buf); + + if (_n == vstruct_last(i)) + continue; + + n = bkey_unpack_key(b, _n); + + if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + + /* + * Weird check for duplicate non extent keys: extents are + * deleted iff they have 0 size, so if it has zero size and it's + * not deleted these aren't extents: + */ + if (((!k.size && !bkey_deleted(&k)) || + (!n.size && !bkey_deleted(&n))) && + !bkey_deleted(&k) && + !bkey_cmp(n.p, k.p)) + printk(KERN_ERR "Duplicate keys\n"); + } +} + +void bch2_dump_btree_node(struct btree *b) +{ + struct bset_tree *t; + + console_lock(); + for_each_bset(b, t) + bch2_dump_bset(b, bset(b, t), t - b->set); + console_unlock(); +} + +void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; + + printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); + + btree_node_iter_for_each(iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); + char buf[100]; + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, + k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static bool keys_out_of_order(struct btree *b, + const struct bkey_packed *prev, + const struct bkey_packed *next, + bool is_extents) +{ + struct bkey nextu = bkey_unpack_key(b, next); + + return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || + ((is_extents + ? 
!bkey_deleted(next) + : !bkey_deleted(prev)) && + !bkey_cmp_packed(b, prev, next)); +} + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct bset_tree *t; + struct bkey_packed *k; + struct btree_nr_keys nr = { 0 }; + + for_each_bset(b, t) + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) + if (!bkey_whiteout(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); +} + +static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b, + struct bkey_packed *k) +{ + const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); + + bkey_unpack_key(b, k); + + if (n && + keys_out_of_order(b, k, n, iter->is_extents)) { + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); + char buf1[80], buf2[80]; + + bch2_dump_btree_node(b); + bch2_bkey_to_text(buf1, sizeof(buf1), &ku); + bch2_bkey_to_text(buf2, sizeof(buf2), &nu); + panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); + } +} + +void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) +{ + struct btree_node_iter_set *set, *prev = NULL; + struct bset_tree *t; + struct bkey_packed *k, *first; + + if (bch2_btree_node_iter_end(iter)) + return; + + btree_node_iter_for_each(iter, set) { + k = __btree_node_offset_to_key(b, set->k); + t = bch2_bkey_to_bset(b, k); + + BUG_ON(__btree_node_offset_to_key(b, set->end) != + btree_bkey_last(b, t)); + + BUG_ON(prev && + btree_node_iter_cmp(iter, b, *prev, *set) > 0); + + prev = set; + } + + first = __btree_node_offset_to_key(b, iter->data[0].k); + + for_each_bset(b, t) + if (bch2_btree_node_iter_bset_pos(iter, b, t) == + btree_bkey_last(b, t) && + (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) + BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, + k, first) > 0); +} + +void bch2_verify_key_order(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *k, *prev; + struct bkey uk, uw = bkey_unpack_key(b, where); + + k = bch2_bkey_prev_all(b, t, where); + if (k && + keys_out_of_order(b, k, where, iter->is_extents)) { + char buf1[100], buf2[100]; + + bch2_dump_btree_node(b); + uk = bkey_unpack_key(b, k); + bch2_bkey_to_text(buf1, sizeof(buf1), &uk); + bch2_bkey_to_text(buf2, sizeof(buf2), &uw); + panic("out of order with prev:\n%s\n%s\n", + buf1, buf2); + } + + k = bkey_next(where); + BUG_ON(k != btree_bkey_last(b, t) && + keys_out_of_order(b, where, k, iter->is_extents)); + + for_each_bset(b, t) { + if (where >= btree_bkey_first(b, t) || + where < btree_bkey_last(b, t)) + continue; + + k = bch2_btree_node_iter_bset_pos(iter, b, t); + + if (k == btree_bkey_last(b, t)) + k = bch2_bkey_prev_all(b, t, k); + + while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && + (prev = bch2_bkey_prev_all(b, t, k))) + k = prev; + + for (; + k != btree_bkey_last(b, t); + k = bkey_next(k)) { + uk = bkey_unpack_key(b, k); + + if (iter->is_extents) { + BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || + bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); + } else { + BUG_ON(!bkey_cmp(uw.p, uk.p) && + !bkey_deleted(&uk)); + } + + if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) + break; + } + } +} + +#else + +static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b, + struct bkey_packed *k) {} + +#endif + +/* Auxiliary search trees */ + +#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) +#define 
BFLOAT_FAILED_PREV (U8_MAX - 1) +#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) +#define BFLOAT_FAILED (U8_MAX - 2) + +#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) + +struct bkey_float { + u8 exponent; + u8 key_offset; + union { + u32 mantissa32; + struct { + u16 mantissa16; + u16 _pad; + }; + }; +} __packed; + +#define BFLOAT_32BIT_NR 32U + +static unsigned bkey_float_byte_offset(unsigned idx) +{ + int d = (idx - BFLOAT_32BIT_NR) << 1; + + d &= ~(d >> 31); + + return idx * 6 - d; +} + +struct ro_aux_tree { + struct bkey_float _d[0]; +}; + +struct rw_aux_tree { + u16 offset; + struct bpos k; +}; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree *b) +{ + return PAGE_SIZE << b->page_order; +} + +static inline size_t btree_keys_cachelines(struct btree *b) +{ + return btree_keys_bytes(b) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + +static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) +{ + BUG_ON(t->aux_data_offset == U16_MAX); + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + return t->aux_data_offset; + case BSET_RO_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(bkey_float_byte_offset(t->size) + + sizeof(u8) * t->size, 8); + case BSET_RW_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); + default: + BUG(); + } +} + +static unsigned bset_aux_tree_buf_start(const struct btree *b, + const struct bset_tree *t) +{ + return t == b->set + ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) + : bset_aux_tree_buf_end(t - 1); +} + +static void *__aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + return b->aux_data + t->aux_data_offset * 8; +} + +static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t); +} + +static u8 *ro_aux_tree_prev(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); +} + +static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, + unsigned idx) +{ + return (void *) b + bkey_float_byte_offset(idx); +} + +static struct bkey_float *bkey_float(const struct btree *b, + const struct bset_tree *t, + unsigned idx) +{ + return bkey_float_get(ro_aux_tree_base(b, t), idx); +} + +static void bset_aux_tree_verify(struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) + continue; + + BUG_ON(t != b->set && + t[-1].aux_data_offset == U16_MAX); + + BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); + BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); + BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); + } +#endif +} + +/* Memory allocation */ + +void bch2_btree_keys_free(struct btree *b) +{ + kvfree(b->aux_data); + b->aux_data = NULL; +} + +int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) +{ + b->page_order = page_order; + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); + if (!b->aux_data) + return -ENOMEM; + + return 0; +} + +void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +{ + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); +#ifdef CONFIG_BCACHEFS_DEBUG + b->expensive_debug_checks = expensive_debug_checks; +#endif + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +/* Binary tree stuff for auxiliary search trees */ + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and + * then bkey_float->m gives us the offset within that cacheline, in units of 8 + * bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
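+ *
+ * Hypothetical worked example: with BSET_CACHELINE = 128, a tree node j
+ * whose inorder position maps to cacheline 3 and whose key_offset is 5
+ * points at the key starting 3 * 128 + 5 * 8 bytes past the cacheline-
+ * aligned start of the bset's keys.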
+ */ + +static inline void *bset_cacheline(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline) +{ + return (void *) round_down((unsigned long) btree_bkey_first(b, t), + L1_CACHE_BYTES) + + cacheline * BSET_CACHELINE; +} + +static struct bkey_packed *cacheline_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + unsigned offset) +{ + return bset_cacheline(b, t, cacheline) + offset * 8; +} + +static unsigned bkey_to_cacheline(const struct btree *b, + const struct bset_tree *t, + const struct bkey_packed *k) +{ + return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; +} + +static ssize_t __bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); +} + +static unsigned bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); + + EBUG_ON(m > U8_MAX); + return m; +} + +static inline struct bkey_packed *tree_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + return cacheline_to_bkey(b, t, + __eytzinger1_to_inorder(j, t->size, t->extra), + bkey_float(b, t, j)->key_offset); +} + +static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; + + return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); +} + +static struct rw_aux_tree *rw_aux_tree(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + return __aux_tree_base(b, t); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. 
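+ *
+ * (Each lookup table entry records a key's offset within the btree node
+ * along with its unpacked bpos; lookups binary search this flat array
+ * rather than walking an eytzinger tree.)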
+ */ +static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, + struct bset_tree *t, + unsigned j) +{ + return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); +} + +static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, + unsigned j, struct bkey_packed *k) +{ + EBUG_ON(k >= btree_bkey_last(b, t)); + + rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { + .offset = __btree_node_key_to_offset(b, k), + .k = bkey_unpack_pos(b, k), + }; +} + +static void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + + if (!btree_keys_expensive_checks(b)) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + BUG_ON(t->size < 1); + BUG_ON(rw_aux_to_bkey(b, t, j) != k); + + goto start; + while (1) { + if (rw_aux_to_bkey(b, t, j) == k) { + BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + bkey_unpack_pos(b, k))); +start: + if (++j == t->size) + break; + + BUG_ON(rw_aux_tree(b, t)[j].offset <= + rw_aux_tree(b, t)[j - 1].offset); + } + + k = bkey_next(k); + BUG_ON(k >= btree_bkey_last(b, t)); + } +} + +/* returns idx of first entry >= offset: */ +static unsigned rw_aux_tree_bsearch(struct btree *b, + struct bset_tree *t, + unsigned offset) +{ + unsigned l = 0, r = t->size; + + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + while (l < r) { + unsigned m = (l + r) >> 1; + + if (rw_aux_tree(b, t)[m].offset < offset) + l = m + 1; + else + r = m; + } + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset < offset); + EBUG_ON(l && + rw_aux_tree(b, t)[l - 1].offset >= offset); + + EBUG_ON(l > r); + EBUG_ON(l > t->size); + + return l; +} + +static inline unsigned bfloat_mantissa(const struct bkey_float *f, + unsigned idx) +{ + return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; +} + +static inline void bfloat_mantissa_set(struct bkey_float *f, + unsigned idx, unsigned mantissa) +{ + if (idx < BFLOAT_32BIT_NR) + f->mantissa32 = mantissa; + else + f->mantissa16 = mantissa; +} + +static inline unsigned bkey_mantissa(const struct bkey_packed *k, + const struct bkey_float *f, + unsigned idx) +{ + u64 v; + + EBUG_ON(!bkey_packed(k)); + + v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); + + /* + * In little endian, we're shifting off low bits (and then the bits we + * want are at the low end), in big endian we're shifting off high bits + * (and then the bits we want are at the high end, so we shift them + * back down): + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + v >>= f->exponent & 7; +#else + v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); +#endif + return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; +} + +static void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); + struct bkey_packed *p = tree_to_prev_bkey(b, t, j); + struct bkey_packed *l, *r; + unsigned bits = j < BFLOAT_32BIT_NR ? 
32 : 16; + unsigned mantissa; + int shift, exponent, high_bit; + + EBUG_ON(bkey_next(p) != m); + + if (is_power_of_2(j)) { + l = min_key; + + if (!l->u64s) { + if (!bkey_pack_pos(l, b->data->min_key, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = b->data->min_key; + bkey_copy(l, &tmp); + } + } + } else { + l = tree_to_prev_bkey(b, t, j >> ffs(j)); + + EBUG_ON(m < l); + } + + if (is_power_of_2(j + 1)) { + r = max_key; + + if (!r->u64s) { + if (!bkey_pack_pos(r, t->max_key, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = t->max_key; + bkey_copy(r, &tmp); + } + } + } else { + r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + EBUG_ON(m > r); + } + + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. + */ + + if (!bkey_packed(l) || !bkey_packed(r) || + !bkey_packed(p) || !bkey_packed(m) || + !b->nr_key_bits) { + f->exponent = BFLOAT_FAILED_UNPACKED; + return; + } + + /* + * The greatest differing bit of l and r is the first bit we must + * include in the bfloat mantissa we're creating in order to do + * comparisons - that bit always becomes the high bit of + * bfloat->mantissa, and thus the exponent we're calculating here is + * the position of what will become the low bit in bfloat->mantissa: + * + * Note that this may be negative - we may be running off the low end + * of the key: we handle this later: + */ + high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), + min_t(unsigned, bits, b->nr_key_bits) - 1); + exponent = high_bit - (bits - 1); + + /* + * Then we calculate the actual shift value, from the start of the key + * (k->_data), to get the key bits starting at exponent: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; + + EBUG_ON(shift + bits > b->format.key_u64s * 64); +#else + shift = high_bit_offset + + b->nr_key_bits - + exponent - + bits; + + EBUG_ON(shift < KEY_PACKED_BITS_START); +#endif + EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); + + f->exponent = shift; + mantissa = bkey_mantissa(m, f, j); + + /* + * If we've got garbage bits, set them to all 1s - it's legal for the + * bfloat to compare larger than the original key, but not smaller: + */ + if (exponent < 0) + mantissa |= ~(~0U << -exponent); + + bfloat_mantissa_set(f, j, mantissa); + + /* + * The bfloat must be able to tell its key apart from the previous key - + * if its key and the previous key don't differ in the required bits, + * flag as failed - unless the keys are actually equal, in which case + * we aren't required to return a specific one: + */ + if (exponent > 0 && + bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && + bkey_cmp_packed(b, p, m)) { + f->exponent = BFLOAT_FAILED_PREV; + return; + } + + /* + * f->mantissa must compare >= the original key - for transitivity with + * the comparison in bset_search_tree. If we're dropping set bits, + * increment it: + */ + if (exponent > (int) bch2_bkey_ffs(b, m)) { + if (j < BFLOAT_32BIT_NR + ? 
f->mantissa32 == U32_MAX + : f->mantissa16 == U16_MAX) + f->exponent = BFLOAT_FAILED_OVERFLOW; + + if (j < BFLOAT_32BIT_NR) + f->mantissa32++; + else + f->mantissa16++; + } +} + +/* bytes remaining - only valid for last bset: */ +static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) +{ + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); +} + +static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +{ + unsigned bytes = __bset_tree_capacity(b, t); + + if (bytes < 7 * BFLOAT_32BIT_NR) + return bytes / 7; + + bytes -= 7 * BFLOAT_32BIT_NR; + + return BFLOAT_32BIT_NR + bytes / 5; +} + +static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); +} + +static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k; + + t->size = 1; + t->extra = BSET_RW_AUX_TREE_VAL; + rw_aux_tree(b, t)[0].offset = + __btree_node_key_to_offset(b, btree_bkey_first(b, t)); + + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) { + if (t->size == bset_rw_tree_capacity(b, t)) + break; + + if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > + L1_CACHE_BYTES) + rw_aux_tree_set(b, t, t->size++, k); + } +} + +static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_packed min_key, max_key; + unsigned j, cacheline = 1; + + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); +retry: + if (t->size < 2) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + eytzinger1_for_each(j, t->size) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_next(k); + + if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ + t->size--; + goto retry; + } + + ro_aux_tree_prev(b, t)[j] = prev->u64s; + bkey_float(b, t, j)->key_offset = + bkey_to_cacheline_offset(b, t, cacheline++, k); + + EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); + EBUG_ON(tree_to_bkey(b, t, j) != k); + } + + while (bkey_next(k) != btree_bkey_last(b, t)) + k = bkey_next(k); + + t->max_key = bkey_unpack_pos(b, k); + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) + make_bfloat(b, t, j, &min_key, &max_key); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + struct bset_tree *i; + + for (i = b->set; i != t; i++) + BUG_ON(bset_has_rw_aux_tree(i)); + + bch2_bset_set_no_aux_tree(b, t); + + /* round up to next cacheline: */ + t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), + SMP_CACHE_BYTES / sizeof(u64)); + + bset_aux_tree_verify(b); +} + +void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, + bool writeable) +{ + if (writeable + ? 
bset_has_rw_aux_tree(t) + : bset_has_ro_aux_tree(t)) + return; + + bset_alloc_tree(b, t); + + if (!__bset_tree_capacity(b, t)) + return; + + if (writeable) + __build_rw_aux_tree(b, t); + else + __build_ro_aux_tree(b, t); + + bset_aux_tree_verify(b); +} + +void bch2_bset_init_first(struct btree *b, struct bset *i) +{ + struct bset_tree *t; + + BUG_ON(b->nsets); + + memset(i, 0, sizeof(*i)); + get_random_bytes(&i->seq, sizeof(i->seq)); + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +void bch2_bset_init_next(struct bch_fs *c, struct btree *b, + struct btree_node_entry *bne) +{ + struct bset *i = &bne->keys; + struct bset_tree *t; + + BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); + BUG_ON(b->nsets >= MAX_BSETS); + + memset(i, 0, sizeof(*i)); + i->seq = btree_bset_first(b)->seq; + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +/* + * find _some_ key in the same bset as @k that precedes @k - not necessarily the + * immediate predecessor: + */ +static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed *p; + unsigned offset; + int j; + + EBUG_ON(k < btree_bkey_first(b, t) || + k > btree_bkey_last(b, t)); + + if (k == btree_bkey_first(b, t)) + return NULL; + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + p = btree_bkey_first(b, t); + break; + case BSET_RO_AUX_TREE: + j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); + + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, + t->size, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; + case BSET_RW_AUX_TREE: + offset = __btree_node_key_to_offset(b, k); + j = rw_aux_tree_bsearch(b, t, offset); + p = j ? rw_aux_to_bkey(b, t, j - 1) + : btree_bkey_first(b, t); + break; + } + + return p; +} + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k, + unsigned min_key_type) +{ + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + + while ((p = __bkey_prev(b, t, k)) && !ret) { + for (i = p; i != k; i = bkey_next(i)) + if (i->type >= min_key_type) + ret = i; + + k = p; + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + BUG_ON(ret >= orig_k); + + for (i = ret ? 
bkey_next(ret) : btree_bkey_first(b, t); + i != orig_k; + i = bkey_next(i)) + BUG_ON(i->type >= min_key_type); + } + + return ret; +} + +/* Insert */ + +static void rw_aux_tree_fix_invalidated_key(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + unsigned j = rw_aux_tree_bsearch(b, t, offset); + + if (j < t->size && + rw_aux_tree(b, t)[j].offset == offset) + rw_aux_tree_set(b, t, j, k); + + bch2_bset_verify_rw_aux_tree(b, t); +} + +static void ro_aux_tree_fix_invalidated_key(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed min_key, max_key; + unsigned inorder, j; + + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + + if (bkey_next(k) == btree_bkey_last(b, t)) { + t->max_key = bkey_unpack_pos(b, k); + + for (j = 1; j < t->size; j = j * 2 + 1) + make_bfloat(b, t, j, &min_key, &max_key); + } + + inorder = bkey_to_cacheline(b, t, k); + + if (inorder && + inorder < t->size) { + j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + + if (k == tree_to_bkey(b, t, j)) { + /* Fix the node this key corresponds to */ + make_bfloat(b, t, j, &min_key, &max_key); + + /* Children for which this key is the right boundary */ + for (j = eytzinger1_left_child(j); + j < t->size; + j = eytzinger1_right_child(j)) + make_bfloat(b, t, j, &min_key, &max_key); + } + } + + if (inorder + 1 < t->size) { + j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); + + if (k == tree_to_prev_bkey(b, t, j)) { + make_bfloat(b, t, j, &min_key, &max_key); + + /* Children for which this key is the left boundary */ + for (j = eytzinger1_right_child(j); + j < t->size; + j = eytzinger1_left_child(j)) + make_bfloat(b, t, j, &min_key, &max_key); + } + } +} + +/** + * bch2_bset_fix_invalidated_key() - given an existing key @k that has been + * modified, fix any auxiliary search tree by remaking all the nodes in the + * auxiliary search tree that @k corresponds to + */ +void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + break; + case BSET_RO_AUX_TREE: + ro_aux_tree_fix_invalidated_key(b, t, k); + break; + case BSET_RW_AUX_TREE: + rw_aux_tree_fix_invalidated_key(b, t, k); + break; + } +} + +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned l, j, where = __btree_node_key_to_offset(b, _where); + + EBUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + l = rw_aux_tree_bsearch(b, t, where); + + /* l is first >= than @where */ + + EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); + EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); + + if (!l) /* never delete first entry */ + l++; + else if (l < t->size && + where < t->end_offset && + rw_aux_tree(b, t)[l].offset == where) + rw_aux_tree_set(b, t, l++, _where); + + /* l now > where */ + + for (j = l; + j < t->size && + rw_aux_tree(b, t)[j].offset < where + clobber_u64s; + j++) + ; + + if (j < t->size && + rw_aux_tree(b, t)[j].offset + shift == + rw_aux_tree(b, t)[l - 1].offset) + j++; + + memmove(&rw_aux_tree(b, t)[l], + &rw_aux_tree(b, t)[j], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[j]); + t->size -= j - l; + + for (j = l; j < t->size; 
j++) + rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == + rw_aux_tree(b, t)[l - 1].offset); + + if (t->size < bset_rw_tree_capacity(b, t) && + (l < t->size + ? rw_aux_tree(b, t)[l].offset + : t->end_offset) - + rw_aux_tree(b, t)[l - 1].offset > + L1_CACHE_BYTES / sizeof(u64)) { + struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); + struct bkey_packed *end = l < t->size + ? rw_aux_to_bkey(b, t, l) + : btree_bkey_last(b, t); + struct bkey_packed *k = start; + + while (1) { + k = bkey_next(k); + if (k == end) + break; + + if ((void *) k - (void *) start >= L1_CACHE_BYTES) { + memmove(&rw_aux_tree(b, t)[l + 1], + &rw_aux_tree(b, t)[l], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[l]); + t->size++; + rw_aux_tree_set(b, t, l, k); + break; + } + } + } + + bch2_bset_verify_rw_aux_tree(b, t); + bset_aux_tree_verify(b); +} + +void bch2_bset_insert(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where, + struct bkey_i *insert, + unsigned clobber_u64s) +{ + struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed packed, *src = bkey_to_packed(insert); + + bch2_bset_verify_rw_aux_tree(b, t); + + if (bch2_bkey_pack_key(&packed, &insert->k, f)) + src = &packed; + + if (!bkey_whiteout(&insert->k)) + btree_keys_account_key_add(&b->nr, t - b->set, src); + + if (src->u64s != clobber_u64s) { + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data + src->u64s; + + EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < + (int) clobber_u64s - src->u64s); + + memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); + set_btree_bset_end(b, t); + } + + memcpy_u64s(where, src, + bkeyp_key_u64s(f, src)); + memcpy_u64s(bkeyp_val(f, where), &insert->v, + bkeyp_val_u64s(f, src)); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + + bch2_verify_key_order(b, iter, where); + bch2_verify_btree_nr_keys(b); +} + +void bch2_bset_delete(struct btree *b, + struct bkey_packed *where, + unsigned clobber_u64s) +{ + struct bset_tree *t = bset_tree_last(b); + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data; + + bch2_bset_verify_rw_aux_tree(b, t); + + EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); + + memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); + set_btree_bset_end(b, t); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); +} + +/* Lookup */ + +__flatten +static struct bkey_packed *bset_search_write_set(const struct btree *b, + struct bset_tree *t, + struct bpos search, + const struct bkey_packed *packed_search) +{ + unsigned l = 0, r = t->size; + + while (l + 1 != r) { + unsigned m = (l + r) >> 1; + + if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) + l = m; + else + r = m; + } + + return rw_aux_to_bkey(b, t, l); +} + +noinline +static int bset_search_tree_slowpath(const struct btree *b, + struct bset_tree *t, struct bpos *search, + const struct bkey_packed *packed_search, + unsigned n) +{ + return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), + packed_search, search) < 0; +} + +__flatten +static struct bkey_packed *bset_search_tree(const struct btree *b, + struct bset_tree *t, + struct bpos search, + const struct bkey_packed *packed_search) +{ + struct ro_aux_tree *base = ro_aux_tree_base(b, t); + struct bkey_float *f = bkey_float_get(base, 1); + void *p; + unsigned inorder, n = 
1; + + while (1) { + if (likely(n << 4 < t->size)) { + p = bkey_float_get(base, n << 4); + prefetch(p); + } else if (n << 3 < t->size) { + inorder = __eytzinger1_to_inorder(n, t->size, t->extra); + p = bset_cacheline(b, t, inorder); +#ifdef CONFIG_X86_64 + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" + "prefetcht0 [%0 - 127 + 64 * 2];" + "prefetcht0 [%0 - 127 + 64 * 3];" + ".att_syntax prefix;" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif + } else if (n >= t->size) + break; + + f = bkey_float_get(base, n); + + if (packed_search && + likely(f->exponent < BFLOAT_FAILED)) + n = n * 2 + (bfloat_mantissa(f, n) < + bkey_mantissa(packed_search, f, n)); + else + n = n * 2 + bset_search_tree_slowpath(b, t, + &search, packed_search, n); + } while (n < t->size); + + inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (n & 1) { + return cacheline_to_bkey(b, t, inorder, f->key_offset); + } else { + if (--inorder) { + n = eytzinger1_prev(n >> 1, t->size); + f = bkey_float_get(base, n); + return cacheline_to_bkey(b, t, inorder, f->key_offset); + } else + return btree_bkey_first(b, t); + } +} + +/* + * Returns the first key greater than or equal to @search + */ +__always_inline __flatten +static struct bkey_packed *bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + bool strictly_greater) +{ + struct bkey_packed *m; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). 
+ * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + m = btree_bkey_first(b, t); + break; + case BSET_RW_AUX_TREE: + m = bset_search_write_set(b, t, search, lossy_packed_search); + break; + case BSET_RO_AUX_TREE: + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (bkey_cmp(search, t->max_key) > 0) + return btree_bkey_last(b, t); + + m = bset_search_tree(b, t, search, lossy_packed_search); + break; + } + + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && + !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, + m, strictly_greater)) + m = bkey_next(m); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) + m = bkey_next(m); + + if (btree_keys_expensive_checks(b)) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && + btree_iter_pos_cmp_p_or_unp(b, search, packed_search, + prev, strictly_greater)); + } + + return m; +} + +/* Btree node iterator */ + +void bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + __bch2_btree_node_iter_push(iter, b, k, end); + bch2_btree_node_iter_sort(iter, b); +} + +noinline __flatten __attribute__((cold)) +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos search, + bool strictly_greater, bool is_extents) +{ + struct bset_tree *t; + + trace_bkey_pack_pos_fail(&search); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + bch2_bset_search(b, t, search, NULL, NULL, + strictly_greater), + btree_bkey_last(b, t)); + + bch2_btree_node_iter_sort(iter, b); +} + +/** + * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * given position + * + * Main entry point to the lookup code for individual btree nodes: + * + * NOTE: + * + * When you don't filter out deleted keys, btree nodes _do_ contain duplicate + * keys. This doesn't matter for most code, but it does matter for lookups. + * + * Some adjacent keys with a string of equal keys: + * i j k k k k l m + * + * If you search for k, the lookup code isn't guaranteed to return you any + * specific k. The lookup code is conceptually doing a binary search and + * iterating backwards is very expensive so if the pivot happens to land at the + * last k that's what you'll get. + * + * This works out ok, but it's something to be aware of: + * + * - For non extents, we guarantee that the live key comes last - see + * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't + * see will only be deleted keys you don't care about. + * + * - For extents, deleted keys sort last (see the comment at the top of this + * file). But when you're searching for extents, you actually want the first + * key strictly greater than your search key - an extent that compares equal + * to the search key is going to have 0 sectors after the search key. 
+ * + * But this does mean that we can't just search for + * bkey_successor(start_of_range) to get the first extent that overlaps with + * the range we want - if we're unlucky and there's an extent that ends + * exactly where we searched, then there could be a deleted key at the same + * position and we'd get that when we search instead of the preceding extent + * we needed. + * + * So we've got to search for start_of_range, then after the lookup iterate + * past any extents that compare equal to the position we searched for. + */ +void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct btree *b, struct bpos search, + bool strictly_greater, bool is_extents) +{ + struct bset_tree *t; + struct bkey_packed p, *packed_search = NULL; + + EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); + bset_aux_tree_verify(b); + + __bch2_btree_node_iter_init(iter, is_extents); + + switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { + case BKEY_PACK_POS_EXACT: + packed_search = &p; + break; + case BKEY_PACK_POS_SMALLER: + packed_search = NULL; + break; + case BKEY_PACK_POS_FAIL: + btree_node_iter_init_pack_failed(iter, b, search, + strictly_greater, is_extents); + return; + } + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + bch2_bset_search(b, t, search, + packed_search, &p, + strictly_greater), + btree_bkey_last(b, t)); + + bch2_btree_node_iter_sort(iter, b); +} + +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, + struct btree *b, + bool is_extents) +{ + struct bset_tree *t; + + __bch2_btree_node_iter_init(iter, is_extents); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + bch2_btree_node_iter_sort(iter, b); +} + +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) + return __btree_node_offset_to_key(b, set->k); + + return btree_bkey_last(b, t); +} + +static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, + struct btree *b, + unsigned first) +{ + bool ret; + + if ((ret = (btree_node_iter_cmp(iter, b, + iter->data[first], + iter->data[first + 1]) > 0))) + swap(iter->data[first], iter->data[first + 1]); + return ret; +} + +void bch2_btree_node_iter_sort(struct btree_node_iter *iter, + struct btree *b) +{ + /* unrolled bubble sort: */ + + if (!__btree_node_iter_set_end(iter, 2)) { + btree_node_iter_sort_two(iter, b, 0); + btree_node_iter_sort_two(iter, b, 1); + } + + if (!__btree_node_iter_set_end(iter, 1)) + btree_node_iter_sort_two(iter, b, 0); +} + +void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, + struct btree_node_iter_set *set) +{ + struct btree_node_iter_set *last = + iter->data + ARRAY_SIZE(iter->data) - 1; + + memmove(&set[0], &set[1], (void *) last - (void *) set); + *last = (struct btree_node_iter_set) { 0, 0 }; +} + +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; + + EBUG_ON(iter->data->k > iter->data->end); + + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + bch2_btree_node_iter_set_drop(iter, iter->data); + return; + } + + if (__btree_node_iter_set_end(iter, 1)) + return; + + if (!btree_node_iter_sort_two(iter, b, 0)) + return; + + if (__btree_node_iter_set_end(iter, 2)) + return; + + btree_node_iter_sort_two(iter, b, 1); +} + +/** + * 
bch_btree_node_iter_advance - advance @iter by one key + * + * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might + * momentarily have out of order extents. + */ +void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); + + __bch2_btree_node_iter_advance(iter, b); + bch2_btree_node_iter_next_check(iter, b, k); +#else + __bch2_btree_node_iter_advance(iter, b); +#endif +} + +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ + unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + +/* + * Expensive: + */ +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + struct bkey_packed *k, *prev = NULL; + struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); + struct btree_node_iter_set *set; + struct bset_tree *t; + unsigned end; + + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_filter(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t), + min_key_type); + if (k && + (!prev || __btree_node_iter_cmp(iter->is_extents, b, + k, prev) > 0)) { + prev = k; + end = t->end_offset; + } + } + + if (!prev) + goto out; + + /* + * We're manually memmoving instead of just calling sort() to ensure the + * prev we picked ends up in slot 0 - sort won't necessarily put it + * there because of duplicate deleted keys: + */ + btree_node_iter_for_each(iter, set) + if (set->end == end) + goto found; + + BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); +found: + BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); + + memmove(&iter->data[1], + &iter->data[0], + (void *) set - (void *) &iter->data[0]); + + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; +out: + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct btree_node_iter iter2 = *iter; + + if (prev) + bch2_btree_node_iter_advance(&iter2, b); + + while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { + BUG_ON(k->type >= min_key_type); + bch2_btree_node_iter_advance(&iter2, b); + } + } + + return prev; +} + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, + struct btree *b, + struct bkey *u) +{ + struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); + + return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; +} + +/* Mergesort */ + +void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) +{ + struct bset_tree *t; + + for_each_bset(b, t) { + enum bset_aux_tree_type type = bset_aux_tree_type(t); + size_t j; + + stats->sets[type].nr++; + stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * + sizeof(u64); + + if (bset_has_ro_aux_tree(t)) { + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + stats->failed_unpacked++; + break; + case BFLOAT_FAILED_PREV: + stats->failed_prev++; + break; + case BFLOAT_FAILED_OVERFLOW: + stats->failed_overflow++; + break; + } + } + } +} + +int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, + char *buf, size_t size) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey_packed *l, *r, *p; + struct bkey uk, up; + char buf1[200], buf2[200]; + unsigned j; + + if (!size) + return 0; + + if (!bset_has_ro_aux_tree(t)) + goto out; + + j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); + if (j && + j < t->size && + k == tree_to_bkey(b, t, j)) + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + case BFLOAT_FAILED_PREV: + p = tree_to_prev_bkey(b, t, j); + l = is_power_of_2(j) + ? btree_bkey_first(b, t) + : tree_to_prev_bkey(b, t, j >> ffs(j)); + r = is_power_of_2(j + 1) + ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + up = bkey_unpack_key(b, p); + uk = bkey_unpack_key(b, k); + bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); + bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); + + return scnprintf(buf, size, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + } +out: + *buf = '\0'; + return 0; +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 index 000000000000..2fa71d7c0e8a --- /dev/null +++ b/fs/bcachefs/bset.h @@ -0,0 +1,668 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H + +#include +#include + +#include "bcachefs_format.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "util.h" /* for time_stats */ +#include "vstructs.h" + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bkey_invalid and + * bkey_deleted(). + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. 
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyway, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance-wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b don't
+ * differ higher than bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
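+ *
+ * As a rough illustration of the tree order <-> inorder mapping, assume a full
+ * tree, say t->size == 8, i.e. 7 tree nodes laid out in the array in
+ * breadth-first (eytzinger) order:
+ *
+ *               1
+ *           2       3
+ *         4   5   6   7
+ *
+ * An inorder walk then visits array indices 4 2 5 1 6 3 7, so (apart from the
+ * special cased first cacheline) the key describing cacheline i of the bset is
+ * the one at array index __inorder_to_eytzinger1(i, t->size, t->extra) - and
+ * since a node's children (2n and 2n + 1) are adjacent in the array, walking
+ * down the tree touches memory in a pattern that prefetching handles well.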
+ */ + +extern bool bch2_expensive_debug_checks; + +static inline bool btree_keys_expensive_checks(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return bch2_expensive_debug_checks || *b->expensive_debug_checks; +#else + return false; +#endif +} + +enum bset_aux_tree_type { + BSET_NO_AUX_TREE, + BSET_RO_AUX_TREE, + BSET_RW_AUX_TREE, +}; + +#define BSET_TREE_NR_TYPES 3 + +#define BSET_NO_AUX_TREE_VAL (U16_MAX) +#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) + +static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) +{ + switch (t->extra) { + case BSET_NO_AUX_TREE_VAL: + EBUG_ON(t->size); + return BSET_NO_AUX_TREE; + case BSET_RW_AUX_TREE_VAL: + EBUG_ON(!t->size); + return BSET_RW_AUX_TREE; + default: + EBUG_ON(!t->size); + return BSET_RO_AUX_TREE; + } +} + +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + + if (btree_keys_expensive_checks(b)) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + /* + * hack around a harmless race when compacting whiteouts + * for a write: + */ + dst2.needs_whiteout = dst->needs_whiteout; + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } +#else + *dst = __bch2_bkey_unpack_key(&b->format, src); +#endif +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); + return dst; +} + +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? 
bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +static inline bool bset_has_ro_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; +} + +static inline bool bset_has_rw_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; +} + +static inline void bch2_bset_set_no_aux_tree(struct btree *b, + struct bset_tree *t) +{ + BUG_ON(t < b->set); + + for (; t < b->set + ARRAY_SIZE(b->set); t++) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + t->aux_data_offset = U16_MAX; + } +} + +static inline void btree_node_set_format(struct btree *b, + struct bkey_format f) +{ + int len; + + b->format = f; + b->nr_key_bits = bkey_format_key_bits(&f); + + len = bch2_compile_bkey_format(&b->format, b->aux_data); + BUG_ON(len < 0 || len > U8_MAX); + + b->unpack_fn_len = len; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +static inline struct bset *bset_next_set(struct btree *b, + unsigned block_bytes) +{ + struct bset *i = btree_bset_last(b); + + EBUG_ON(!is_power_of_2(block_bytes)); + + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); +} + +void bch2_btree_keys_free(struct btree *); +int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); +void bch2_btree_keys_init(struct btree *, bool *); + +void bch2_bset_init_first(struct btree *, struct bset *); +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); +void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, + struct bkey_packed *); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); + +/* Bkey utility code */ + +/* packed or unpacked */ +static inline int bkey_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + struct bpos *r) +{ + EBUG_ON(r_packed && !bkey_packed(r_packed)); + + if (unlikely(!bkey_packed(l))) + return bkey_cmp(packed_to_bkey_c(l)->p, *r); + + if (likely(r_packed)) + return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); + + return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp_packed(const struct btree *b, + struct bpos *pos, + const struct bkey_packed *k, + bool strictly_greater) +{ + int cmp = bkey_cmp_left_packed(b, k, pos); + + return cmp > 0 || + (cmp == 0 && !strictly_greater && !bkey_deleted(k)); +} + +static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, + struct bpos pos, + const struct bkey_packed *pos_packed, + const struct bkey_packed *k, + bool strictly_greater) +{ + int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos); + + return cmp > 0 || + (cmp == 0 && !strictly_greater && !bkey_deleted(k)); +} + +struct 
bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, + struct bkey_packed *, unsigned); + +static inline struct bkey_packed * +bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 0); +} + +static inline struct bkey_packed * +bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1); +} + +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_cmp(k->p, m->p) < 0; + int cmp2 = bkey_cmp(bkey_start_pos(k), + bkey_start_pos(m)) > 0; + + return (cmp1 << 1) + cmp2; +} + +/* Btree key iteration */ + +static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, + bool is_extents) +{ + iter->is_extents = is_extents; + memset(iter->data, 0, sizeof(iter->data)); +} + +void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, + struct bpos, bool, bool); +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, + struct btree *, bool); +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, + struct btree *, + struct bset_tree *); + +void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); +void bch2_btree_node_iter_set_drop(struct btree_node_iter *, + struct btree_node_iter_set *); +void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); + +#define btree_node_iter_for_each(_iter, _set) \ + for (_set = (_iter)->data; \ + _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ + (_set)->k != (_set)->end; \ + _set++) + +static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, + unsigned i) +{ + return iter->data[i].k == iter->data[i].end; +} + +static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) +{ + return __btree_node_iter_set_end(iter, 0); +} + +static inline int __btree_node_iter_cmp(bool is_extents, + struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + /* + * For non extents, when keys compare equal the deleted keys have to + * come first - so that bch2_btree_node_iter_next_check() can detect + * duplicate nondeleted keys (and possibly other reasons?) + * + * For extents, bkey_deleted() is used as a proxy for k->size == 0, so + * deleted keys have to sort last. + */ + return bkey_cmp_packed(b, l, r) + ?: (is_extents + ? 
(int) bkey_deleted(l) - (int) bkey_deleted(r) + : (int) bkey_deleted(r) - (int) bkey_deleted(l)) + ?: (l > r) - (l < r); +} + +static inline int btree_node_iter_cmp(struct btree_node_iter *iter, + struct btree *b, + struct btree_node_iter_set l, + struct btree_node_iter_set r) +{ + return __btree_node_iter_cmp(iter->is_extents, b, + __btree_node_offset_to_key(b, l.k), + __btree_node_offset_to_key(b, r.k)); +} + +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos; + + btree_node_iter_for_each(iter, pos) + ; + + BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); + *pos = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } +} + +static inline struct bkey_packed * +__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return __btree_node_offset_to_key(b, iter->data->k); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + while (!bch2_btree_node_iter_end(iter)) { + struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); + + if (k->type >= min_key_type) + return k; + + bch2_btree_node_iter_advance(iter, b); + } + + return NULL; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return bch2_btree_node_iter_peek_filter(iter, b, 0); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_advance(iter, b); + + return ret; +} + +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, + struct btree *, unsigned); + +static inline struct bkey_packed * +bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, 0); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1); +} + +/* + * Iterates over all _live_ keys - skipping deleted (and potentially + * overlapping) keys + */ +#define for_each_btree_node_key(b, k, iter, _is_extents) \ + for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ + ((k) = bch2_btree_node_iter_peek(iter, b)); \ + bch2_btree_node_iter_advance(iter, b)) + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, + struct btree *, + struct bkey *); + +#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ + for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ + (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ + bch2_btree_node_iter_advance(iter, b)) + +/* Accounting: */ + +static inline void btree_keys_account_key(struct btree_nr_keys *n, + unsigned bset, + struct bkey_packed *k, + int sign) +{ + n->live_u64s += k->u64s * sign; + n->bset_u64s[bset] += k->u64s * sign; + + if (bkey_packed(k)) + n->packed_keys += sign; + else + n->unpacked_keys += sign; +} + +#define 
btree_keys_account_key_add(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, 1) +#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, -1) + +struct bset_stats { + struct { + size_t nr, bytes; + } sets[BSET_TREE_NR_TYPES]; + + size_t floats; + size_t failed_unpacked; + size_t failed_prev; + size_t failed_overflow; +}; + +void bch2_btree_keys_stats(struct btree *, struct bset_stats *); +int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *, + char *, size_t); + +/* Debug stuff */ + +void bch2_dump_bset(struct btree *, struct bset *, unsigned); +void bch2_dump_btree_node(struct btree *); +void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *); +void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +void bch2_verify_key_order(struct btree *, struct btree_node_iter *, + struct bkey_packed *); + +#else + +static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} +static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) {} +static inline void bch2_verify_key_order(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where) {} +#endif + +static inline void bch2_verify_btree_nr_keys(struct btree *b) +{ + if (btree_keys_expensive_checks(b)) + __bch2_verify_btree_nr_keys(b); +} + +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 index 000000000000..f9afae6c710d --- /dev/null +++ b/fs/bcachefs/btree_cache.c @@ -0,0 +1,941 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" +#include "trace.h" + +#include + +#define DEF_BTREE_ID(kwd, val, name) name, + +const char * const bch2_btree_ids[] = { + DEFINE_BCH_BTREE_IDS() + NULL +}; + +#undef DEF_BTREE_ID + +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; + + if (!c->btree_roots[0].b) + reserve += 8; + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) + reserve += min_t(unsigned, 1, + c->btree_roots[i].b->level) * 8; + + c->btree_cache.reserve = reserve; +} + +static inline unsigned btree_cache_can_free(struct btree_cache *bc) +{ + return max_t(int, 0, bc->used - bc->reserve); +} + +static void __btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + EBUG_ON(btree_node_write_in_flight(b)); + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + bch2_btree_keys_free(b); +} + +static void btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + __btree_node_data_free(c, b); + bc->used--; + list_move(&b->list, &bc->freed); +} + +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct btree *b = obj; + const u64 *v = arg->key; + + return PTR_HASH(&b->key) == *v ? 
0 : 1; +} + +static const struct rhashtable_params bch_btree_cache_params = { + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, key.v), + .key_len = sizeof(struct bch_extent_ptr), + .obj_cmpfn = bch2_btree_cache_cmp_fn, +}; + +static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + struct btree_cache *bc = &c->btree_cache; + + b->data = kvpmalloc(btree_bytes(c), gfp); + if (!b->data) + goto err; + + if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) + goto err; + + bc->used++; + list_move(&b->list, &bc->freeable); + return; +err: + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + list_move(&b->list, &bc->freed); +} + +static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + bkey_extent_init(&b->key); + six_lock_init(&b->lock); + lockdep_set_novalidate_class(&b->lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + + btree_node_data_alloc(c, b, gfp); + return b->data ? b : NULL; +} + +/* Btree in memory cache - hash table */ + +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + + /* Cause future lookups for this node to fail: */ + bkey_i_to_extent(&b->key)->v._data[0] = 0; +} + +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) +{ + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); +} + +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + unsigned level, enum btree_id id) +{ + int ret; + + b->level = level; + b->btree_id = id; + + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) + list_add(&b->list, &bc->live); + mutex_unlock(&bc->lock); + + return ret; +} + +__flatten +static inline struct btree *btree_cache_find(struct btree_cache *bc, + const struct bkey_i *k) +{ + return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), + bch_btree_cache_params); +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); + + if (!six_trylock_intent(&b->lock)) + return -ENOMEM; + + if (!six_trylock_write(&b->lock)) + goto out_unlock_intent; + + if (btree_node_noevict(b)) + goto out_unlock; + + if (!btree_node_may_write(b)) + goto out_unlock; + + if (btree_node_dirty(b) || + btree_node_write_in_flight(b) || + btree_node_read_in_flight(b)) { + if (!flush) + goto out_unlock; + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (verify_btree_ondisk(c)) + bch2_btree_node_write(c, b, SIX_LOCK_intent); + else + __bch2_btree_node_write(c, b, SIX_LOCK_read); + + /* wait for any in flight btree write */ + btree_node_wait_on_io(b); + } +out: + if (PTR_HASH(&b->key) && !ret) + trace_btree_node_reap(c, b); + return ret; +out_unlock: + six_unlock_write(&b->lock); +out_unlock_intent: + six_unlock_intent(&b->lock); + ret = -ENOMEM; + goto out; +} + +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +{ 
+ return __btree_node_reclaim(c, b, false); +} + +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ + return __btree_node_reclaim(c, b, true); +} + +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *t; + unsigned long nr = sc->nr_to_scan; + unsigned long can_free; + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i; + + if (btree_shrinker_disabled(c)) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + return -1; + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + nr /= btree_pages(c); + can_free = btree_cache_can_free(bc); + nr = min_t(unsigned long, nr, can_free); + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { + touched++; + + if (freed >= nr) + break; + + if (++i > 3 && + !btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + freed++; + } + } +restart: + list_for_each_entry_safe(b, t, &bc->live, list) { + touched++; + + if (freed >= nr) { + /* Save position */ + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); + break; + } + + if (!btree_node_accessed(b) && + !btree_node_reclaim(c, b)) { + /* can't call bch2_btree_node_hash_remove under lock */ + freed++; + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); + + btree_node_data_free(c, b); + mutex_unlock(&bc->lock); + + bch2_btree_node_hash_remove(bc, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + + if (freed >= nr) + goto out; + + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + goto out; + goto restart; + } else + clear_btree_node_accessed(b); + } + + mutex_unlock(&bc->lock); +out: + return (unsigned long) freed * btree_pages(c); +} + +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + + if (btree_shrinker_disabled(c)) + return 0; + + return btree_cache_can_free(bc) * btree_pages(c); +} + +void bch2_fs_btree_cache_exit(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + unsigned i; + + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + +#ifdef CONFIG_BCACHEFS_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &bc->live); + + kvpfree(c->verify_ondisk, btree_bytes(c)); +#endif + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) + list_add(&c->btree_roots[i].b->list, &bc->live); + + list_splice(&bc->freeable, &bc->live); + + while (!list_empty(&bc->live)) { + b = list_first_entry(&bc->live, struct btree, list); + + BUG_ON(btree_node_read_in_flight(b) || + btree_node_write_in_flight(b)); + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); + clear_btree_node_dirty(b); + + btree_node_data_free(c, b); + } + + while 
(!list_empty(&bc->freed)) { + b = list_first_entry(&bc->freed, struct btree, list); + list_del(&b->list); + kfree(b); + } + + mutex_unlock(&bc->lock); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); +} + +int bch2_fs_btree_cache_init(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + unsigned i; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + ret = rhashtable_init(&bc->table, &bch_btree_cache_params); + if (ret) + goto out; + + bc->table_init_done = true; + + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) + if (!btree_node_mem_alloc(c, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + list_splice_init(&bc->live, &bc->freeable); + +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_init(&c->verify_lock); + + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!c->verify_ondisk) { + ret = -ENOMEM; + goto out; + } + + c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); + if (!c->verify_data) { + ret = -ENOMEM; + goto out; + } + + list_del_init(&c->verify_data->list); +#endif + + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; + bc->shrink.batch = btree_pages(c) * 2; + register_shrinker(&bc->shrink, "%s/btree_cache", c->name); +out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} + +void bch2_fs_btree_cache_init_early(struct btree_cache *bc) +{ + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); + INIT_LIST_HEAD(&bc->freed); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + + if (bc->alloc_lock == current) { + trace_btree_node_cannibalize_unlock(c); + bc->alloc_lock = NULL; + closure_wake_up(&bc->alloc_wait); + } +} + +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +{ + struct btree_cache *bc = &c->btree_cache; + struct task_struct *old; + + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) + goto success; + + if (!cl) { + trace_btree_node_cannibalize_lock_fail(c); + return -ENOMEM; + } + + closure_wait(&bc->alloc_wait, cl); + + /* Try again, after adding ourselves to waitlist */ + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) { + /* We raced */ + closure_wake_up(&bc->alloc_wait); + goto success; + } + + trace_btree_node_cannibalize_lock_fail(c); + return -EAGAIN; + +success: + trace_btree_node_cannibalize_lock(c); + return 0; +} + +static struct btree *btree_node_cannibalize(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_reclaim(c, b)) + return b; + + while (1) { + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; + + /* + * Rare case: all nodes were intent-locked. + * Just busy-wait. 
+ */ + WARN_ONCE(1, "btree cache cannibalize failed\n"); + cond_resched(); + } +} + +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + u64 start_time = local_clock(); + + mutex_lock(&bc->lock); + + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &bc->freeable, list) + if (!btree_node_reclaim(c, b)) + goto out_unlock; + + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &bc->freed, list) + if (!btree_node_reclaim(c, b)) { + btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); + if (b->data) + goto out_unlock; + + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + goto err; + } + + b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!six_trylock_intent(&b->lock)); + BUG_ON(!six_trylock_write(&b->lock)); +out_unlock: + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_write_in_flight(b)); + + list_del_init(&b->list); + mutex_unlock(&bc->lock); +out: + b->flags = 0; + b->written = 0; + b->nsets = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; + b->uncompacted_whiteout_u64s = 0; + bch2_btree_keys_init(b, &c->expensive_debug_checks); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); + + return b; +err: + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { + b = btree_node_cannibalize(c); + list_del_init(&b->list); + mutex_unlock(&bc->lock); + + bch2_btree_node_hash_remove(bc, b); + + trace_btree_node_cannibalize(c); + goto out; + } + + mutex_unlock(&bc->lock); + return ERR_PTR(-ENOMEM); +} + +/* Slowpath, don't want it inlined into btree_iter_traverse() */ +static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_iter *iter, + const struct bkey_i *k, + unsigned level, + enum six_lock_type lock_type, + bool sync) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + /* + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ + BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) + return b; + + bkey_copy(&b->key, k); + if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... */ + bkey_i_to_extent(&b->key)->v._data[0] = 0; + + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + return NULL; + } + + /* + * If the btree node wasn't cached, we can't drop our lock on + * the parent until after it's added to the cache - because + * otherwise we could race with a btree_split() freeing the node + * we're trying to lock. 
+ * + * But the deadlock described below doesn't exist in this case, + * so it's safe to not drop the parent lock until here: + */ + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + + bch2_btree_node_read(c, b, sync); + + six_unlock_write(&b->lock); + + if (!sync) { + six_unlock_intent(&b->lock); + return NULL; + } + + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->lock); + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. + */ +struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + bool may_drop_locks) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + + /* + * XXX: locking optimization + * + * we can make the locking looser here - caller can drop lock on parent + * node before locking child node (and potentially blocking): we just + * have to have bch2_btree_node_fill() call relock on the parent and + * return -EINTR if that fails + */ + EBUG_ON(!btree_node_locked(iter, level + 1)); + EBUG_ON(level >= BTREE_MAX_DEPTH); +retry: + rcu_read_lock(); + b = btree_cache_find(bc, k); + rcu_read_unlock(); + + if (unlikely(!b)) { + /* + * We must have the parent locked to call bch2_btree_node_fill(), + * else we could read in a btree node from disk that's been + * freed: + */ + b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { + /* + * There's a potential deadlock with splits and insertions into + * interior nodes we have to avoid: + * + * The other thread might be holding an intent lock on the node + * we want, and they want to update its parent node so they're + * going to upgrade their intent lock on the parent node to a + * write lock. + * + * But if we're holding a read lock on the parent, and we're + * trying to get the intent lock they're holding, we deadlock. + * + * So to avoid this we drop the read locks on parent nodes when + * we're starting to take intent locks - and handle the race. + * + * The race is that they might be about to free the node we + * want, and dropping our read lock on the parent node lets them + * update the parent marking the node we want as freed, and then + * free it: + * + * To guard against this, btree nodes are evicted from the cache + * when they're freed - and PTR_HASH() is zeroed out, which we + * check for after we lock the node. 
+ * + * Then, bch2_btree_node_relock() on the parent will fail - because + * the parent was modified, when the pointer to the node we want + * was removed - and we'll bail out: + */ + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + + if (!btree_node_lock(b, k->k.p, level, iter, + lock_type, may_drop_locks)) + return ERR_PTR(-EINTR); + + if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || + b->level != level || + race_fault())) { + six_unlock_type(&b->lock, lock_type); + if (bch2_btree_node_relock(iter, level + 1)) + goto retry; + + return ERR_PTR(-EINTR); + } + } + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->btree_id != iter->btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + + return b; +} + +struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + struct btree_iter *iter, + struct btree *b, + bool may_drop_locks, + enum btree_node_sibling sib) +{ + struct btree *parent; + struct btree_node_iter node_iter; + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + struct btree *ret = NULL; + unsigned level = b->level; + + parent = btree_iter_node(iter, level + 1); + if (!parent) + return NULL; + + if (!bch2_btree_node_relock(iter, level + 1)) + goto out_upgrade; + + node_iter = iter->l[parent->level].iter; + + k = bch2_btree_node_iter_peek_all(&node_iter, parent); + BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); + + k = sib == btree_prev_sib + ? 
bch2_btree_node_iter_prev(&node_iter, parent) + : (bch2_btree_node_iter_advance(&node_iter, parent), + bch2_btree_node_iter_peek(&node_iter, parent)); + if (!k) + goto out; + + bch2_bkey_unpack(parent, &tmp.k, k); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent, may_drop_locks); + + if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) { + struct btree_iter *linked; + + if (!bch2_btree_node_relock(iter, level + 1)) + goto out_upgrade; + + /* + * We might have got -EINTR because trylock failed, and we're + * holding other locks that would cause us to deadlock: + */ + for_each_linked_btree_iter(iter, linked) + if (btree_iter_cmp(iter, linked) < 0) + __bch2_btree_iter_unlock(linked); + + if (sib == btree_prev_sib) + btree_node_unlock(iter, level); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent, may_drop_locks); + + /* + * before btree_iter_relock() calls btree_iter_verify_locks(): + */ + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); + + if (!bch2_btree_node_relock(iter, level)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + if (!IS_ERR(ret)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } + } + + bch2_btree_iter_relock(iter); + } +out: + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); + + bch2_btree_iter_verify_locks(iter); + + BUG_ON((!may_drop_locks || !IS_ERR(ret)) && + (iter->uptodate >= BTREE_ITER_NEED_RELOCK || + !btree_node_locked(iter, level))); + + if (!IS_ERR_OR_NULL(ret)) { + struct btree *n1 = ret, *n2 = b; + + if (sib != btree_prev_sib) + swap(n1, n2); + + BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, + n1->key.k.p), + n2->data->min_key)); + } + + return ret; +out_upgrade: + if (may_drop_locks) + bch2_btree_iter_upgrade(iter, level + 2, true); + ret = ERR_PTR(-EINTR); + goto out; +} + +void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + rcu_read_lock(); + b = btree_cache_find(bc, k); + rcu_read_unlock(); + + if (b) + return; + + bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); +} + +int bch2_print_btree_node(struct bch_fs *c, struct btree *b, + char *buf, size_t len) +{ + const struct bkey_format *f = &b->format; + struct bset_stats stats; + char ptrs[100]; + + memset(&stats, 0, sizeof(stats)); + + bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs), + bkey_i_to_s_c(&b->key)); + bch2_btree_keys_stats(b, &stats); + + return scnprintf(buf, len, + "l %u %llu:%llu - %llu:%llu:\n" + " ptrs: %s\n" + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %zu)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n" + " failed prev %zu\n" + " failed overflow %zu\n", + b->level, + b->data->min_key.inode, + b->data->min_key.offset, + b->data->max_key.inode, + b->data->max_key.offset, + ptrs, + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], + f->bits_per_field[2], + f->bits_per_field[3], + f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], + BTREE_FOREGROUND_MERGE_THRESHOLD(c), + 
b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed_unpacked, + stats.failed_prev, + stats.failed_overflow); +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 index 000000000000..f7b9bcfe09a3 --- /dev/null +++ b/fs/bcachefs/btree_cache.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" +#include "extents.h" + +struct btree_iter; + +extern const char * const bch2_btree_ids[]; + +void bch2_recalc_btree_reserve(struct bch_fs *); + +void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); +int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); +int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, + unsigned, enum btree_id); + +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); + +struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned, + enum six_lock_type, bool); + +struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, + struct btree *, bool, + enum btree_node_sibling); + +void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned); + +void bch2_fs_btree_cache_exit(struct bch_fs *); +int bch2_fs_btree_cache_init(struct bch_fs *); +void bch2_fs_btree_cache_init_early(struct btree_cache *); + +#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) + +/* is btree node in hash table? */ +static inline bool btree_node_hashed(struct btree *b) +{ + return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key); +} + +#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ + &(_c)->btree_cache.table), \ + _iter = 0; _iter < (_tbl)->size; _iter++) \ + rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) + +static inline size_t btree_bytes(struct bch_fs *c) +{ + return c->opts.btree_node_size << 9; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) +{ + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); +} + +static inline size_t btree_page_order(struct bch_fs *c) +{ + return get_order(btree_bytes(c)); +} + +static inline size_t btree_pages(struct bch_fs *c) +{ + return 1 << btree_page_order(c); +} + +static inline unsigned btree_blocks(struct bch_fs *c) +{ + return c->opts.btree_node_size >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) + +#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) +#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + +#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) + +int bch2_print_btree_node(struct bch_fs *, struct btree *, + char *, size_t); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 index 000000000000..155e69056d96 --- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * Copyright (C) 2014 Datera Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include + +struct range_checks { + struct range_level { + struct bpos min; + struct bpos max; + } l[BTREE_MAX_DEPTH]; + unsigned depth; +}; + +static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) +{ + unsigned i; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + r->l[i].min = r->l[i].max = POS_MIN; + r->depth = depth; +} + +static void btree_node_range_checks(struct bch_fs *c, struct btree *b, + struct range_checks *r) +{ + struct range_level *l = &r->l[b->level]; + + struct bpos expected_min = bkey_cmp(l->min, l->max) + ? btree_type_successor(b->btree_id, l->max) + : l->max; + + bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, + "btree node has incorrect min key: %llu:%llu != %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + expected_min.inode, + expected_min.offset); + + l->max = b->data->max_key; + + if (b->level > r->depth) { + l = &r->l[b->level - 1]; + + bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, + "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + l->min.inode, + l->min.offset); + + bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, + "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", + b->data->max_key.inode, + b->data->max_key.offset, + l->max.inode, + l->max.offset); + + if (bkey_cmp(b->data->max_key, POS_MAX)) + l->min = l->max = + btree_type_successor(b->btree_id, + b->data->max_key); + } +} + +u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_ptr *ptr; + u8 max_stale = 0; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + + if (gen_after(ca->oldest_gens[b], ptr->gen)) + ca->oldest_gens[b] = ptr->gen; + + max_stale = max(max_stale, ptr_stale(ca, ptr)); + } + } + + return max_stale; +} + +/* + * For runtime mark and sweep: + */ +static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, unsigned flags) +{ + struct gc_pos pos = { 0 }; + u8 ret = 0; + + switch (type) { + case BKEY_TYPE_BTREE: + bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + break; + case BKEY_TYPE_EXTENTS: + bch2_mark_key(c, k, k.k->size, false, pos, NULL, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + ret = bch2_btree_key_recalc_oldest_gen(c, k); + break; + default: + BUG(); + } + + return ret; +} + +int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + enum bch_data_type data_type = type == BKEY_TYPE_BTREE + ? 
BCH_DATA_BTREE : BCH_DATA_USER; + int ret = 0; + + BUG_ON(journal_seq_verify(c) && + k.k->version.lo > journal_cur_seq(&c->journal)); + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c, + "superblock not marked as containing replicas (type %u)", + data_type)) { + ret = bch2_mark_bkey_replicas(c, data_type, k); + if (ret) + return ret; + } + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr); + + if (mustfix_fsck_err_on(!g->mark.gen_valid, c, + "found ptr with missing gen in alloc btree,\n" + "type %s gen %u", + bch2_data_types[data_type], + ptr->gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + } + + if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + "%s ptr gen in the future: %u > %u", + bch2_data_types[data_type], + ptr->gen, g->mark.gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + set_bit(BCH_FS_FIXED_GENS, &c->flags); + } + + } + break; + } + } + + atomic64_set(&c->key_version, + max_t(u64, k.k->version.lo, + atomic64_read(&c->key_version))); + + bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); +fsck_err: + return ret; +} + +static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) +{ + enum bkey_type type = btree_node_type(b); + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; + u8 stale = 0; + + if (btree_node_has_ptrs(b)) + for_each_btree_node_key_unpack(b, k, &iter, + btree_node_is_extents(b), + &unpacked) { + bch2_bkey_debugcheck(c, b, k); + stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); + } + + return stale; +} + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + preempt_disable(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); + preempt_enable(); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) +{ + struct btree_iter iter; + struct btree *b; + struct range_checks r; + unsigned depth = btree_id == BTREE_ID_EXTENTS ? 
0 : 1; + unsigned max_stale; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + + if (!c->btree_roots[btree_id].b) + return 0; + + /* + * if expensive_debug_checks is on, run range_checks on all leaf nodes: + */ + if (expensive_debug_checks(c)) + depth = 0; + + btree_node_range_checks_init(&r, depth); + + __for_each_btree_node(&iter, c, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b) { + btree_node_range_checks(c, b, &r); + + bch2_verify_btree_nr_keys(b); + + max_stale = btree_gc_mark_node(c, b); + + gc_pos_set(c, gc_pos_btree_node(b)); + + if (max_stale > 64) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!btree_gc_rewrite_disabled(c) && + (btree_gc_always_rewrite(c) || max_stale > 16)) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + + bch2_btree_iter_cond_resched(&iter); + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + mutex_lock(&c->btree_root_lock); + + b = c->btree_roots[btree_id].b; + if (!btree_node_fake(b)) + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + gc_pos_set(c, gc_pos_btree_root(b->btree_id)); + + mutex_unlock(&c->btree_root_lock); + return 0; +} + +static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + unsigned flags) +{ + u64 b = sector_to_bucket(ca, start); + + do { + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + bch2_mark_metadata_bucket(c, ca, b, type, sectors, + gc_phase(GC_PHASE_SB), flags); + b++; + start += sectors; + } while (start < end); +} + +void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + + if (c) { + lockdep_assert_held(&c->sb_lock); + percpu_down_read(&c->usage_lock); + } + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, + BCH_DATA_SB, flags); + + mark_metadata_sectors(c, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_SB, flags); + } + + if (c) + spin_lock(&c->journal.lock); + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } + + if (c) { + spin_unlock(&c->journal.lock); + percpu_up_read(&c->usage_lock); + } +} + +static void bch2_mark_superblocks(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + gc_pos_set(c, gc_phase(GC_PHASE_SB)); + + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + mutex_unlock(&c->sb_lock); +} + +/* Also see bch2_pending_btree_node_free_insert_done() */ +static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) +{ + struct gc_pos pos = { 0 }; + struct bch_fs_usage stats = { 0 }; + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); + + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + c->opts.btree_node_size, true, pos, + &stats, 0, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + 
BCH_BUCKET_MARK_GC_LOCK_HELD); + /* + * Don't apply stats - pending deletes aren't tracked in + * bch_alloc_stats: + */ + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void bch2_mark_allocator_buckets(struct bch_fs *c) +{ + struct bch_dev *ca; + struct open_bucket *ob; + size_t i, j, iter; + unsigned ci; + + percpu_down_read(&c->usage_lock); + + spin_lock(&c->freelist_lock); + gc_pos_set(c, gc_pos_alloc(c, NULL)); + + for_each_member_device(ca, c, ci) { + fifo_for_each_entry(i, &ca->free_inc, iter) + bch2_mark_alloc_bucket(c, ca, i, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + + + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + bch2_mark_alloc_bucket(c, ca, i, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + + spin_unlock(&c->freelist_lock); + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid) { + gc_pos_set(c, gc_pos_alloc(c, ob)); + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, + gc_pos_alloc(c, ob), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + spin_unlock(&ob->lock); + } + + percpu_up_read(&c->usage_lock); +} + +static void bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca; + struct bucket_array *buckets; + struct bucket_mark new; + unsigned i; + size_t b; + int cpu; + + percpu_down_write(&c->usage_lock); + + /* + * Indicates to buckets code that gc is now in progress - done under + * usage_lock to avoid racing with bch2_mark_key(): + */ + __gc_pos_set(c, gc_phase(GC_PHASE_START)); + + /* Save a copy of the existing bucket stats while we recompute them: */ + for_each_member_device(ca, c, i) { + ca->usage_cached = __bch2_dev_usage_read(ca); + for_each_possible_cpu(cpu) { + struct bch_dev_usage *p = + per_cpu_ptr(ca->usage_percpu, cpu); + memset(p, 0, sizeof(*p)); + } + } + + c->usage_cached = __bch2_fs_usage_read(c); + for_each_possible_cpu(cpu) { + struct bch_fs_usage *p = + per_cpu_ptr(c->usage_percpu, cpu); + + memset(p->s, 0, sizeof(p->s)); + } + + percpu_up_write(&c->usage_lock); + + /* Clear bucket marks: */ + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + bucket_cmpxchg(buckets->b + b, new, ({ + new.owned_by_allocator = 0; + new.data_type = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + })); + ca->oldest_gens[b] = new.gen; + } + up_read(&ca->bucket_lock); + } +} + +/** + * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + */ +void bch2_gc(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 start_time = local_clock(); + unsigned i; + + /* + * Walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. 
+ * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ + trace_gc_start(c); + + /* + * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on + * gc_lock if sectors_available goes to 0: + */ + bch2_recalc_sectors_available(c); + + down_write(&c->gc_lock); + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + goto out; + + bch2_gc_start(c); + + bch2_mark_superblocks(c); + + /* Walk btree: */ + for (i = 0; i < BTREE_ID_NR; i++) { + int ret = bch2_gc_btree(c, i); + if (ret) { + bch_err(c, "btree gc failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + goto out; + } + } + + bch2_mark_pending_btree_node_frees(c); + bch2_mark_allocator_buckets(c); + + for_each_member_device(ca, c, i) + atomic_long_set(&ca->saturated_count, 0); + + /* Indicates that gc is no longer in progress: */ + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); + c->gc_count++; +out: + up_write(&c->gc_lock); + trace_gc_end(c); + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + + /* + * Wake up allocator in case it was waiting for buckets + * because of not being able to inc gens + */ + for_each_member_device(ca, c, i) + bch2_wake_allocator(ca); + + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ + closure_wake_up(&c->freelist_wait); +} + +/* Btree coalescing */ + +static void recalc_packed_keys(struct btree *b) +{ + struct bkey_packed *k; + + memset(&b->nr, 0, sizeof(b->nr)); + + BUG_ON(b->nsets != 1); + + for (k = btree_bkey_first(b, b->set); + k != btree_bkey_last(b, b->set); + k = bkey_next(k)) + btree_keys_account_key_add(&b->nr, 0, k); +} + +static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + struct btree *old_nodes[GC_MERGE_NODES]) +{ + struct btree *parent = btree_node_parent(iter, old_nodes[0]); + unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; + unsigned blocks = btree_blocks(c) * 2 / 3; + struct btree *new_nodes[GC_MERGE_NODES]; + struct btree_update *as; + struct keylist keylist; + struct bkey_format_state format_state; + struct bkey_format new_format; + + memset(new_nodes, 0, sizeof(new_nodes)); + bch2_keylist_init(&keylist, NULL); + + /* Count keys that are not deleted */ + for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) + u64s += old_nodes[i]->nr.live_u64s; + + nr_old_nodes = nr_new_nodes = i; + + /* Check if all keys in @old_nodes could fit in one fewer node */ + if (nr_old_nodes <= 1 || + __vstruct_blocks(struct btree_node, c->block_bits, + DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) + return; + + /* Find a format that all keys in @old_nodes can pack into */ + bch2_bkey_format_init(&format_state); + + for (i = 0; i < nr_old_nodes; i++) + __bch2_btree_calc_format(&format_state, old_nodes[i]); + + new_format = bch2_bkey_format_done(&format_state); + + /* Check if repacking would make any nodes too big to fit */ + for (i = 0; i < nr_old_nodes; i++) + if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS); + return; + } + + if (bch2_keylist_realloc(&keylist, NULL, 0, + (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); + return; + } + + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, parent) + nr_old_nodes, + BTREE_INSERT_NOFAIL| + 
BTREE_INSERT_USE_RESERVE, + NULL); + if (IS_ERR(as)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_RESERVE_GET); + bch2_keylist_free(&keylist, NULL); + return; + } + + trace_btree_gc_coalesce(c, old_nodes[0]); + + for (i = 0; i < nr_old_nodes; i++) + bch2_btree_interior_update_will_free_node(as, old_nodes[i]); + + /* Repack everything with @new_format and sort down to one bset */ + for (i = 0; i < nr_old_nodes; i++) + new_nodes[i] = + __bch2_btree_node_alloc_replacement(as, old_nodes[i], + new_format); + + /* + * Conceptually we concatenate the nodes together and slice them + * up at different boundaries. + */ + for (i = nr_new_nodes - 1; i > 0; --i) { + struct btree *n1 = new_nodes[i]; + struct btree *n2 = new_nodes[i - 1]; + + struct bset *s1 = btree_bset_first(n1); + struct bset *s2 = btree_bset_first(n2); + struct bkey_packed *k, *last = NULL; + + /* Calculate how many keys from @n2 we could fit inside @n1 */ + u64s = 0; + + for (k = s2->start; + k < vstruct_last(s2) && + vstruct_blocks_plus(n1->data, c->block_bits, + u64s + k->u64s) <= blocks; + k = bkey_next(k)) { + last = k; + u64s += k->u64s; + } + + if (u64s == le16_to_cpu(s2->u64s)) { + /* n2 fits entirely in n1 */ + n1->key.k.p = n1->data->max_key = n2->data->max_key; + + memcpy_u64s(vstruct_last(s1), + s2->start, + le16_to_cpu(s2->u64s)); + le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); + + set_btree_bset_end(n1, n1->set); + + six_unlock_write(&n2->lock); + bch2_btree_node_free_never_inserted(c, n2); + six_unlock_intent(&n2->lock); + + memmove(new_nodes + i - 1, + new_nodes + i, + sizeof(new_nodes[0]) * (nr_new_nodes - i)); + new_nodes[--nr_new_nodes] = NULL; + } else if (u64s) { + /* move part of n2 into n1 */ + n1->key.k.p = n1->data->max_key = + bkey_unpack_pos(n1, last); + + n2->data->min_key = + btree_type_successor(iter->btree_id, + n1->data->max_key); + + memcpy_u64s(vstruct_last(s1), + s2->start, u64s); + le16_add_cpu(&s1->u64s, u64s); + + memmove(s2->start, + vstruct_idx(s2, u64s), + (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); + s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); + + set_btree_bset_end(n1, n1->set); + set_btree_bset_end(n2, n2->set); + } + } + + for (i = 0; i < nr_new_nodes; i++) { + struct btree *n = new_nodes[i]; + + recalc_packed_keys(n); + btree_node_reset_sib_u64s(n); + + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->lock); + + bch2_btree_node_write(c, n, SIX_LOCK_intent); + } + + /* + * The keys for the old nodes get deleted. We don't want to insert keys + * that compare equal to the keys for the new nodes we'll also be + * inserting - we can't because keys on a keylist must be strictly + * greater than the previous keys, and we also don't need to since the + * key for the new node will serve the same purpose (overwriting the key + * for the old node). 
+ */ + for (i = 0; i < nr_old_nodes; i++) { + struct bkey_i delete; + unsigned j; + + for (j = 0; j < nr_new_nodes; j++) + if (!bkey_cmp(old_nodes[i]->key.k.p, + new_nodes[j]->key.k.p)) + goto next; + + bkey_init(&delete.k); + delete.k.p = old_nodes[i]->key.k.p; + bch2_keylist_add_in_order(&keylist, &delete); +next: + i = i; + } + + /* + * Keys for the new nodes get inserted: bch2_btree_insert_keys() only + * does the lookup once and thus expects the keys to be in sorted order + * so we have to make sure the new keys are correctly ordered with + * respect to the deleted keys added in the previous loop + */ + for (i = 0; i < nr_new_nodes; i++) + bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); + + /* Insert the newly coalesced nodes */ + bch2_btree_insert_node(as, parent, iter, &keylist, 0); + + BUG_ON(!bch2_keylist_empty(&keylist)); + + BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); + + bch2_btree_iter_node_replace(iter, new_nodes[0]); + + for (i = 0; i < nr_new_nodes; i++) + bch2_btree_open_bucket_put(c, new_nodes[i]); + + /* Free the old nodes and update our sliding window */ + for (i = 0; i < nr_old_nodes; i++) { + bch2_btree_node_free_inmem(c, old_nodes[i], iter); + six_unlock_intent(&old_nodes[i]->lock); + + /* + * the index update might have triggered a split, in which case + * the nodes we coalesced - the new nodes we just created - + * might not be sibling nodes anymore - don't add them to the + * sliding window (except the first): + */ + if (!i) { + old_nodes[i] = new_nodes[i]; + } else { + old_nodes[i] = NULL; + if (new_nodes[i]) + six_unlock_intent(&new_nodes[i]->lock); + } + } + + bch2_btree_update_done(as); + bch2_keylist_free(&keylist, NULL); +} + +static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) +{ + struct btree_iter iter; + struct btree *b; + bool kthread = (current->flags & PF_KTHREAD) != 0; + unsigned i; + + /* Sliding window of adjacent btree nodes */ + struct btree *merge[GC_MERGE_NODES]; + u32 lock_seq[GC_MERGE_NODES]; + + /* + * XXX: We don't have a good way of positively matching on sibling nodes + * that have the same parent - this code works by handling the cases + * where they might not have the same parent, and is thus fragile. Ugh. + * + * Perhaps redo this to use multiple linked iterators? 
+ */ + memset(merge, 0, sizeof(merge)); + + __for_each_btree_node(&iter, c, btree_id, POS_MIN, + BTREE_MAX_DEPTH, 0, + BTREE_ITER_PREFETCH, b) { + memmove(merge + 1, merge, + sizeof(merge) - sizeof(merge[0])); + memmove(lock_seq + 1, lock_seq, + sizeof(lock_seq) - sizeof(lock_seq[0])); + + merge[0] = b; + + for (i = 1; i < GC_MERGE_NODES; i++) { + if (!merge[i] || + !six_relock_intent(&merge[i]->lock, lock_seq[i])) + break; + + if (merge[i]->level != merge[0]->level) { + six_unlock_intent(&merge[i]->lock); + break; + } + } + memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); + + bch2_coalesce_nodes(c, &iter, merge); + + for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { + lock_seq[i] = merge[i]->lock.state.seq; + six_unlock_intent(&merge[i]->lock); + } + + lock_seq[0] = merge[0]->lock.state.seq; + + if (kthread && kthread_should_stop()) { + bch2_btree_iter_unlock(&iter); + return -ESHUTDOWN; + } + + bch2_btree_iter_cond_resched(&iter); + + /* + * If the parent node wasn't relocked, it might have been split + * and the nodes in our sliding window might not have the same + * parent anymore - blow away the sliding window: + */ + if (btree_iter_node(&iter, iter.level + 1) && + !btree_node_intent_locked(&iter, iter.level + 1)) + memset(merge + 1, 0, + (GC_MERGE_NODES - 1) * sizeof(merge[0])); + } + return bch2_btree_iter_unlock(&iter); +} + +/** + * bch_coalesce - coalesce adjacent nodes with low occupancy + */ +void bch2_coalesce(struct bch_fs *c) +{ + enum btree_id id; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return; + + down_read(&c->gc_lock); + trace_gc_coalesce_start(c); + + for (id = 0; id < BTREE_ID_NR; id++) { + int ret = c->btree_roots[id].b + ? bch2_coalesce_btree(c, id) + : 0; + + if (ret) { + if (ret != -ESHUTDOWN) + bch_err(c, "btree coalescing failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + return; + } + } + + trace_gc_coalesce_end(c); + up_read(&c->gc_lock); +} + +static int bch2_gc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last = atomic_long_read(&clock->now); + unsigned last_kick = atomic_read(&c->kick_gc); + + set_freezable(); + + while (1) { + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + if (atomic_read(&c->kick_gc) != last_kick) + break; + + if (c->btree_gc_periodic) { + unsigned long next = last + c->capacity / 16; + + if (atomic_long_read(&clock->now) >= next) + break; + + bch2_io_clock_schedule_timeout(clock, next); + } else { + schedule(); + } + + try_to_freeze(); + } + __set_current_state(TASK_RUNNING); + + last = atomic_long_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + + bch2_gc(c); + + debug_check_no_locks_held(); + } + + return 0; +} + +void bch2_gc_thread_stop(struct bch_fs *c) +{ + struct task_struct *p; + + p = c->gc_thread; + c->gc_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_gc_thread_start(struct bch_fs *c) +{ + struct task_struct *p; + + BUG_ON(c->gc_thread); + + p = kthread_create(bch2_gc_thread, c, "bch_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + c->gc_thread = p; + wake_up_process(p); + return 0; +} + +/* Initial GC computes bucket marks during startup */ + +static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) +{ + struct btree_iter iter; + struct btree *b; + struct range_checks r; + int ret = 0; + + btree_node_range_checks_init(&r, 0); + + gc_pos_set(c, 
gc_pos_btree(id, POS_MIN, 0)); + + if (!c->btree_roots[id].b) + return 0; + + b = c->btree_roots[id].b; + if (!btree_node_fake(b)) + ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key)); + if (ret) + return ret; + + /* + * We have to hit every btree node before starting journal replay, in + * order for the journal seq blacklist machinery to work: + */ + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + btree_node_range_checks(c, b, &r); + + if (btree_node_has_ptrs(b)) { + struct btree_node_iter node_iter; + struct bkey unpacked; + struct bkey_s_c k; + + for_each_btree_node_key_unpack(b, k, &node_iter, + btree_node_is_extents(b), + &unpacked) { + ret = bch2_btree_mark_key_initial(c, + btree_node_type(b), k); + if (ret) + goto err; + } + } + + bch2_btree_iter_cond_resched(&iter); + } +err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) +{ + unsigned iter = 0; + enum btree_id id; + int ret = 0; + + down_write(&c->gc_lock); +again: + bch2_gc_start(c); + + bch2_mark_superblocks(c); + + for (id = 0; id < BTREE_ID_NR; id++) { + ret = bch2_initial_gc_btree(c, id); + if (ret) + goto err; + } + + ret = bch2_journal_mark(c, journal); + if (ret) + goto err; + + if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto err; + } + + bch_info(c, "Fixed gens, restarting initial mark and sweep:"); + clear_bit(BCH_FS_FIXED_GENS, &c->flags); + goto again; + } + + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type) + atomic64_add(1 << 16, &c->key_version); + + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); +err: + up_write(&c->gc_lock); + return ret; +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 index 000000000000..9d2b9d5953d2 --- /dev/null +++ b/fs/bcachefs/btree_gc.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H + +#include "btree_types.h" + +enum bkey_type; + +void bch2_coalesce(struct bch_fs *); +void bch2_gc(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int bch2_gc_thread_start(struct bch_fs *); +int bch2_initial_gc(struct bch_fs *, struct list_head *); +u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); +int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, + struct bkey_s_c); +void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); + +/* + * For concurrent mark and sweep (with other index updates), we define a total + * ordering of _all_ references GC walks: + * + * Note that some references will have the same GC position as others - e.g. + * everything within the same btree node; in those cases we're relying on + * whatever locking exists for where those references live, i.e. the write lock + * on a btree node. + * + * That locking is also required to ensure GC doesn't pass the updater in + * between the updater adding/removing the reference and updating the GC marks; + * without that, we would at best double count sometimes. + * + * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ + * be held that prevents GC from passing the position the updater is at. + * + * (What about the start of gc, when we're clearing all the marks? 
GC clears the + * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc + * position inside its cmpxchg loop, so crap magically works). + */ + +/* Position of (the start of) a gc phase: */ +static inline struct gc_pos gc_phase(enum gc_phase phase) +{ + return (struct gc_pos) { + .phase = phase, + .pos = POS_MIN, + .level = 0, + }; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + if (l.phase != r.phase) + return l.phase < r.phase ? -1 : 1; + if (bkey_cmp(l.pos, r.pos)) + return bkey_cmp(l.pos, r.pos); + if (l.level != r.level) + return l.level < r.level ? -1 : 1; + return 0; +} + +static inline struct gc_pos gc_pos_btree(enum btree_id id, + struct bpos pos, unsigned level) +{ + return (struct gc_pos) { + .phase = GC_PHASE_BTREE_EXTENTS + id, + .pos = pos, + .level = level, + }; +} + +/* + * GC position of the pointers within a btree node: note, _not_ for &b->key + * itself, that lives in the parent node: + */ +static inline struct gc_pos gc_pos_btree_node(struct btree *b) +{ + return gc_pos_btree(b->btree_id, b->key.k.p, b->level); +} + +/* + * GC position of the pointer to a btree root: we don't use + * gc_pos_pointer_to_btree_node() here to avoid a potential race with + * btree_split() increasing the tree depth - the new root will have level > the + * old root and thus have a greater gc position than the old root, but that + * would be incorrect since once gc has marked the root it's not coming back. + */ +static inline struct gc_pos gc_pos_btree_root(enum btree_id id) +{ + return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); +} + +static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) +{ + return (struct gc_pos) { + .phase = GC_PHASE_ALLOC, + .pos = POS(ob ? ob - c->open_buckets : 0, 0), + }; +} + +static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +{ + unsigned seq; + bool ret; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + ret = gc_pos_cmp(c->gc_pos, pos) < 0; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + return ret; +} + +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 index 000000000000..2d004941c52e --- /dev/null +++ b/fs/bcachefs/btree_io.c @@ -0,0 +1,2095 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "super-io.h" +#include "trace.h" + +/* btree_node_iter_large: */ + +#define btree_node_iter_cmp_heap(h, _l, _r) \ + __btree_node_iter_cmp((iter)->is_extents, b, \ + __btree_node_offset_to_key(b, (_l).k), \ + __btree_node_offset_to_key(b, (_r).k)) + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set n = + ((struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }); + + __heap_add(iter, n, btree_node_iter_cmp_heap); + } +} + +void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, + struct btree *b) +{ + iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; + + 
EBUG_ON(!iter->used); + EBUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + heap_del(iter, 0, btree_node_iter_cmp_heap); + else + heap_sift_down(iter, 0, btree_node_iter_cmp_heap); +} + +static void verify_no_dups(struct btree *b, + struct bkey_packed *start, + struct bkey_packed *end) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k; + + for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { + struct bkey l = bkey_unpack_key(b, k); + struct bkey r = bkey_unpack_key(b, bkey_next(k)); + + BUG_ON(btree_node_is_extents(b) + ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + } +#endif +} + +static void clear_needs_whiteout(struct bset *i) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + k->needs_whiteout = false; +} + +static void set_needs_whiteout(struct bset *i) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + k->needs_whiteout = true; +} + +static void btree_bounce_free(struct bch_fs *c, unsigned order, + bool used_mempool, void *p) +{ + if (used_mempool) + mempool_free(p, &c->btree_bounce_pool); + else + vpfree(p, PAGE_SIZE << order); +} + +static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, + bool *used_mempool) +{ + void *p; + + BUG_ON(order > btree_page_order(c)); + + *used_mempool = false; + p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); + if (p) + return p; + + *used_mempool = true; + return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); +} + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +struct sort_iter { + struct btree *b; + unsigned used; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[MAX_BSETS + 1]; +}; + +static void sort_iter_init(struct sort_iter *iter, struct btree *b) +{ + memset(iter, 0, sizeof(*iter)); + iter->b = b; +} + +static inline void __sort_iter_sift(struct sort_iter *iter, + unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) +{ + + __sort_iter_sift(iter, 0, cmp); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + __sort_iter_sift(iter, i, cmp); +} + +static void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return iter->used ? 
iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + iter->data->k = bkey_next(iter->data->k); + + BUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + array_remove_item(iter->data, iter->used, 0); + else + sort_iter_sift(iter, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +static inline int sort_key_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r); +} + +static unsigned sort_key_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_key_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extent_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); +} + +static unsigned sort_extent_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *out = dst; + struct bkey_i l, r; + bool prev = false, l_packed = false; + u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); + u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); + u64 new_size; + + max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); + + sort_iter_sort(iter, sort_extent_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + EBUG_ON(bkeyp_val_u64s(f, in)); + EBUG_ON(in->type != KEY_TYPE_DISCARD); + + r.k = bkey_unpack_key(iter->b, in); + + if (prev && + bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + new_size = l_packed + ? 
min(max_packed_size, max_packed_offset - + bkey_start_offset(&l.k)) + : KEY_SIZE_MAX; + + new_size = min(new_size, r.k.p.offset - + bkey_start_offset(&l.k)); + + BUG_ON(new_size < l.k.size); + + bch2_key_resize(&l.k, new_size); + + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + bch2_cut_front(l.k.p, &r); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + l = r; + prev = true; + l_packed = bkey_packed(in); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, + enum compact_mode mode) +{ + unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); + unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + + if (mode == COMPACT_LAZY) { + if (should_compact_bset_lazy(b, t) || + (compacting && bset_unwritten(b, bset(b, t)))) + return dead_u64s; + } else { + if (bset_written(b, bset(b, t))) + return dead_u64s; + } + + return 0; +} + +bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + const struct bkey_format *f = &b->format; + struct bset_tree *t; + struct bkey_packed *whiteouts = NULL; + struct bkey_packed *u_start, *u_pos; + struct sort_iter sort_iter; + unsigned order, whiteout_u64s = 0, u64s; + bool used_mempool, compacting = false; + + for_each_bset(b, t) + whiteout_u64s += should_compact_bset(b, t, + whiteout_u64s != 0, mode); + + if (!whiteout_u64s) + return false; + + sort_iter_init(&sort_iter, b); + + whiteout_u64s += b->whiteout_u64s; + order = get_order(whiteout_u64s * sizeof(u64)); + + whiteouts = btree_bounce_alloc(c, order, &used_mempool); + u_start = u_pos = whiteouts; + + memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), + b->whiteout_u64s); + u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); + + sort_iter_add(&sort_iter, u_start, u_pos); + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && bset_unwritten(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t -1)); + } + + if (!should_compact_bset(b, t, compacting, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + continue; + } + + compacting = true; + u_start = u_pos; + start = i->start; + end = vstruct_last(i); + + if (src != dst) { + memmove(dst, src, sizeof(*src)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_next(k); + + if (bkey_deleted(k) && btree_node_is_extents(b)) + continue; + + if (bkey_whiteout(k) && !k->needs_whiteout) + continue; + + if (bkey_whiteout(k)) { + unreserve_whiteout(b, t, k); + memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); + set_bkeyp_val_u64s(f, u_pos, 0); + u_pos = bkey_next(u_pos); + } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + bkey_copy(out, k); + out = bkey_next(out); + } + } + + sort_iter_add(&sort_iter, u_start, u_pos); + + if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } + } + + b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; + + BUG_ON((void *) 
unwritten_whiteouts_start(c, b) < + (void *) btree_bkey_last(b, bset_tree_last(b))); + + u64s = btree_node_is_extents(b) + ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter) + : sort_key_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter); + + BUG_ON(u64s > b->whiteout_u64s); + BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); + BUG_ON(u_pos != whiteouts && !u64s); + + if (u64s != b->whiteout_u64s) { + void *src = unwritten_whiteouts_start(c, b); + + b->whiteout_u64s = u64s; + memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); + } + + verify_no_dups(b, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + + btree_bounce_free(c, order, used_mempool, whiteouts); + + if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) + bch2_btree_build_aux_trees(b); + + bch_btree_keys_u64s_remaining(c, b); + bch2_verify_btree_nr_keys(b); + + return true; +} + +static bool bch2_drop_whiteouts(struct btree *b) +{ + struct bset_tree *t; + bool ret = false; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + + if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) + continue; + + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); + + if (bset_unwritten(b, i) && + t != b->set) { + struct bset *dst = + max_t(struct bset *, write_block(b), + (void *) btree_bkey_last(b, t -1)); + + memmove(dst, i, sizeof(struct bset)); + i = dst; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_next(k); + + if (!bkey_whiteout(k)) { + bkey_copy(out, k); + out = bkey_next(out); + } + } + + i->u64s = cpu_to_le16((u64 *) out - i->_data); + bch2_bset_set_no_aux_tree(b, t); + ret = true; + } + + bch2_verify_btree_nr_keys(b); + + return ret; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +static unsigned sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + if (bkey_whiteout(in) && + (next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + /* + * XXX racy, called with read lock from write path + * + * leads to spurious BUG_ON() in bkey_unpack_key() in + * debug mode + */ + next->needs_whiteout |= in->needs_whiteout; + continue; + } + + if (bkey_whiteout(in)) { + memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_copy(out, in); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); +} + +static unsigned sort_extents(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_extents_cmp); + + while ((in = sort_iter_next(iter, sort_extents_cmp))) { + if (bkey_deleted(in)) + continue; + + if (bkey_whiteout(in) && + (filter_whiteouts || 
!in->needs_whiteout)) + continue; + + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static void btree_node_sort(struct bch_fs *c, struct btree *b, + struct btree_iter *iter, + unsigned start_idx, + unsigned end_idx, + bool filter_whiteouts) +{ + struct btree_node *out; + struct sort_iter sort_iter; + struct bset_tree *t; + struct bset *start_bset = bset(b, &b->set[start_idx]); + bool used_mempool = false; + u64 start_time, seq = 0; + unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; + bool sorting_entire_node = start_idx == 0 && + end_idx == b->nsets; + + sort_iter_init(&sort_iter, b); + + for (t = b->set + start_idx; + t < b->set + end_idx; + t++) { + u64s += le16_to_cpu(bset(b, t)->u64s); + sort_iter_add(&sort_iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + } + + order = sorting_entire_node + ? btree_page_order(c) + : get_order(__vstruct_bytes(struct btree_node, u64s)); + + out = btree_bounce_alloc(c, order, &used_mempool); + + start_time = local_clock(); + + if (btree_node_is_extents(b)) + filter_whiteouts = bset_written(b, start_bset); + + u64s = btree_node_is_extents(b) + ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) + : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + + out->keys.u64s = cpu_to_le16(u64s); + + BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); + + if (sorting_entire_node) + bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], + start_time); + + /* Make sure we preserve bset journal_seq: */ + for (t = b->set + start_idx; t < b->set + end_idx; t++) + seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); + start_bset->journal_seq = cpu_to_le64(seq); + + if (sorting_entire_node) { + unsigned u64s = le16_to_cpu(out->keys.u64s); + + BUG_ON(order != btree_page_order(c)); + + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + *out = *b->data; + out->keys.u64s = cpu_to_le16(u64s); + swap(out, b->data); + set_btree_bset(b, b->set, &b->data->keys); + } else { + start_bset->u64s = out->keys.u64s; + memcpy_u64s(start_bset->start, + out->keys.start, + le16_to_cpu(out->keys.u64s)); + } + + for (i = start_idx + 1; i < end_idx; i++) + b->nr.bset_u64s[start_idx] += + b->nr.bset_u64s[i]; + + b->nsets -= shift; + + for (i = start_idx + 1; i < b->nsets; i++) { + b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; + b->set[i] = b->set[i + shift]; + } + + for (i = b->nsets; i < MAX_BSETS; i++) + b->nr.bset_u64s[i] = 0; + + set_btree_bset_end(b, &b->set[start_idx]); + bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); + + btree_bounce_free(c, order, used_mempool, out); + + bch2_verify_btree_nr_keys(b); +} + +/* Sort + repack in a new format: */ +static struct btree_nr_keys sort_repack(struct bset *dst, + struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_whiteout(in)) + continue; + + if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort, repack, and merge: */ +static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, + struct bset *dst, + struct btree *src, + struct btree_node_iter *iter, + struct bkey_format *out_f, + bool filter_whiteouts, + key_filter_fn filter, + key_merge_fn merge) +{ + struct bkey_packed *k, *prev = NULL, *out; + struct btree_nr_keys nr; + BKEY_PADDED(k) tmp; + + memset(&nr, 0, sizeof(nr)); + + while ((k = bch2_btree_node_iter_next_all(iter, src))) { + if (filter_whiteouts && bkey_whiteout(k)) + continue; + + /* + * The filter might modify pointers, so we have to unpack the + * key and values to &tmp.k: + */ + bch2_bkey_unpack(src, &tmp.k, k); + + if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) + continue; + + /* prev is always unpacked, for key merging: */ + + if (prev && + merge && + merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) + continue; + + /* + * the current key becomes the new prev: advance prev, then + * copy the current key - but first pack prev (in place): + */ + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + + btree_keys_account_key_add(&nr, 0, prev); + prev = bkey_next(prev); + } else { + prev = vstruct_last(dst); + } + + bkey_copy(prev, &tmp.k); + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = vstruct_last(dst); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +void bch2_btree_sort_into(struct bch_fs *c, + struct btree *dst, + struct btree *src) +{ + struct btree_nr_keys nr; + struct btree_node_iter src_iter; + u64 start_time = local_clock(); + + BUG_ON(dst->nsets != 1); + + bch2_bset_set_no_aux_tree(dst, dst->set); + + bch2_btree_node_iter_init_from_start(&src_iter, src, + btree_node_is_extents(src)); + + if (btree_node_ops(src)->key_normalize || + btree_node_ops(src)->key_merge) + nr = sort_repack_merge(c, btree_bset_first(dst), + src, &src_iter, + &dst->format, + true, + btree_node_ops(src)->key_normalize, + btree_node_ops(src)->key_merge); + else + nr = sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); + + set_btree_bset_end(dst, dst->set); + + dst->nr.live_u64s += nr.live_u64s; + dst->nr.bset_u64s[0] += nr.bset_u64s[0]; + dst->nr.packed_keys += nr.packed_keys; + dst->nr.unpacked_keys += nr.unpacked_keys; + + bch2_verify_btree_nr_keys(dst); +} + +#define SORT_CRIT (4096 / sizeof(u64)) + +/* + * We're about to add another bset to the btree node, so if there's currently + * too many bsets - sort some of them together: + */ +static bool btree_node_compact(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + unsigned unwritten_idx; + bool ret = false; + + for (unwritten_idx = 0; + unwritten_idx < b->nsets; + unwritten_idx++) + if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + break; + + if (b->nsets - unwritten_idx > 1) { + btree_node_sort(c, b, iter, unwritten_idx, + b->nsets, false); + ret = true; + } + + if (unwritten_idx > 1) { + btree_node_sort(c, b, iter, 0, unwritten_idx, false); + ret = true; + } + + return ret; +} + +void bch2_btree_build_aux_trees(struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + 
bch2_bset_build_aux_tree(b, t, + bset_unwritten(b, bset(b, t)) && + t == bset_tree_last(b)); +} + +/* + * @bch_btree_init_next - initialize a new (unwritten) bset that can then be + * inserted into + * + * Safe to call if there already is an unwritten bset - will only add a new bset + * if @b doesn't already have one. + * + * Returns true if we sorted (i.e. invalidated iterators + */ +void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + struct btree_node_entry *bne; + bool did_sort; + + EBUG_ON(!(b->lock.state.seq & 1)); + EBUG_ON(iter && iter->l[b->level].b != b); + + did_sort = btree_node_compact(c, b, iter); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + if (iter && did_sort) + bch2_btree_iter_reinit_node(iter, b); +} + +static struct nonce btree_nonce(struct bset *i, unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, + bytes); + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + +static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i, + unsigned offset, int write, char *buf, size_t len) +{ + char *out = buf, *end = buf + len; + + out += scnprintf(out, end - out, + "error validating btree node %s" + "at btree %u level %u/%u\n" + "pos %llu:%llu node offset %u", + write ? "before write " : "", + b->btree_id, b->level, + c->btree_roots[b->btree_id].level, + b->key.k.p.inode, b->key.k.p.offset, + b->written); + if (i) + out += scnprintf(out, end - out, + " bset u64s %u", + le16_to_cpu(i->u64s)); + + return out - buf; +} + +enum btree_err_type { + BTREE_ERR_FIXABLE, + BTREE_ERR_WANT_RETRY, + BTREE_ERR_MUST_RETRY, + BTREE_ERR_FATAL, +}; + +enum btree_validate_ret { + BTREE_RETRY_READ = 64, +}; + +#define btree_err(type, c, b, i, msg, ...) \ +({ \ + __label__ out; \ + char _buf[300], *out = _buf, *end = out + sizeof(_buf); \ + \ + out += btree_err_msg(c, b, i, b->written, write, out, end - out);\ + out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ + mustfix_fsck_err(c, "%s", _buf); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ + bch_err(c, "%s", _buf); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + case BTREE_ERR_WANT_RETRY: \ + if (have_retry) { \ + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + } \ + break; \ + case BTREE_ERR_MUST_RETRY: \ + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + case BTREE_ERR_FATAL: \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write: %s", _buf); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + } \ +out: \ + true; \ +}) + +#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) + +static int validate_bset(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors, + unsigned *whiteout_u64s, int write, + bool have_retry) +{ + struct bkey_packed *k, *prev = NULL; + struct bpos prev_pos = POS_MIN; + enum bkey_type type = btree_node_type(b); + bool seen_non_whiteout = false; + const char *err; + int ret = 0; + + if (i == &b->data->keys) { + /* These indicate that we read the wrong btree node: */ + btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect level"); + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { + u64 *p = (u64 *) &b->data->ptr; + + *p = swab64(*p); + bch2_bpos_swab(&b->data->min_key); + bch2_bpos_swab(&b->data->max_key); + } + + btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect max key"); + + /* XXX: ideally we would be validating min_key too */ +#if 0 + /* + * not correct anymore, due to btree node write error + * handling + * + * need to add b->data->seq to btree keys and verify + * against that + */ + btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), + b->data->ptr), + BTREE_ERR_FATAL, c, b, i, + "incorrect backpointer"); +#endif + err = bch2_bkey_format_validate(&b->data->format); + btree_err_on(err, + BTREE_ERR_FATAL, c, b, i, + "invalid bkey format: %s", err); + } + + if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION, + BTREE_ERR_FIXABLE, c, b, i, + "unsupported bset version")) { + i->version = cpu_to_le16(BCACHE_BSET_VERSION); + i->u64s = 0; + return 0; + } + + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + + btree_err_on(b->written && !i->u64s, + BTREE_ERR_FIXABLE, c, b, i, + "empty bset"); + + if (!BSET_SEPARATE_WHITEOUTS(i)) { + seen_non_whiteout = true; + *whiteout_u64s = 0; + } + + for (k = i->start; + k != vstruct_last(i);) { + struct bkey_s_c u; + struct bkey tmp; + const char *invalid; + + if (btree_err_on(!k->u64s, + BTREE_ERR_FIXABLE, c, b, i, + "KEY_U64s 0: %zu bytes of metadata lost", + vstruct_end(i) - (void *) k)) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(bkey_next(k) > vstruct_last(i), + BTREE_ERR_FIXABLE, c, b, i, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, + BTREE_ERR_FIXABLE, c, b, i, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) + bch2_bkey_swab(type, &b->format, k); + + u = bkey_disassemble(b, k, &tmp); + + invalid = __bch2_bkey_invalid(c, type, u) ?: + bch2_bkey_in_btree_node(b, u) ?: + (write ? 
bch2_bkey_val_invalid(c, type, u) : NULL); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + btree_err(BTREE_ERR_FIXABLE, c, b, i, + "invalid bkey:\n%s\n%s", invalid, buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + /* + * with the separate whiteouts thing (used for extents), the + * second set of keys actually can have whiteouts too, so we + * can't solely go off bkey_whiteout()... + */ + + if (!seen_non_whiteout && + (!bkey_whiteout(k) || + (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; + } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { + btree_err(BTREE_ERR_FATAL, c, b, i, + "keys out of order: %llu:%llu > %llu:%llu", + prev_pos.inode, + prev_pos.offset, + u.k->p.inode, + bkey_start_offset(u.k)); + /* XXX: repair this */ + } + + prev_pos = u.k->p; + prev = k; + k = bkey_next(k); + } + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); +fsck_err: + return ret; +} + +int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) +{ + struct btree_node_entry *bne; + struct btree_node_iter_large *iter; + struct btree_node *sorted; + struct bkey_packed *k; + struct bset *i; + bool used_mempool; + unsigned u64s; + int ret, retry_read = 0, write = READ; + + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); + __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b)); + + if (bch2_meta_read_fault("btree")) + btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, b, NULL, + "bad magic"); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, b, NULL, + "bad btree header"); + + while (b->written < c->opts.btree_node_size) { + unsigned sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; + bool first = !b->written; + + if (!b->written) { + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, b, i, + "unknown checksum type"); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + btree_err_on(bch2_crc_cmp(csum, b->data->csum), + BTREE_ERR_WANT_RETRY, c, b, i, + "invalid checksum"); + + bset_encrypt(c, i, b->written << 9); + + sectors = vstruct_sectors(b->data, c->block_bits); + + btree_node_set_format(b, b->data->format); + } else { + bne = write_block(b); + i = &bne->keys; + + if (i->seq != b->data->keys.seq) + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, b, i, + "unknown checksum type"); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + btree_err_on(bch2_crc_cmp(csum, bne->csum), + BTREE_ERR_WANT_RETRY, c, b, i, + "invalid checksum"); + + bset_encrypt(c, i, b->written << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + ret = validate_bset(c, b, i, sectors, &whiteout_u64s, + READ, have_retry); + if (ret) + goto fsck_err; + + b->written += sectors; + + ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); + if (ret < 0) { + btree_err(BTREE_ERR_FATAL, c, b, i, + "insufficient memory"); + goto err; + } + + if (ret) { + btree_err_on(first, + BTREE_ERR_FIXABLE, c, b, i, + "first btree node bset has blacklisted journal seq"); + if (!first) + continue; + } + + 
bch2_btree_node_iter_large_push(iter, b, + i->start, + vstruct_idx(i, whiteout_u64s)); + + bch2_btree_node_iter_large_push(iter, b, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); + } + + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq, + BTREE_ERR_WANT_RETRY, c, b, NULL, + "found bset signature after last bset"); + + sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); + sorted->keys.u64s = 0; + + set_btree_bset(b, b->set, &b->data->keys); + + b->nr = btree_node_is_extents(b) + ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) + : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; + sorted->keys.u64s = cpu_to_le16(u64s); + swap(sorted, b->data); + set_btree_bset(b, b->set, &b->data->keys); + b->nsets = 1; + + BUG_ON(b->nr.live_u64s != u64s); + + btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); + + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + enum bkey_type type = btree_node_type(b); + struct bkey tmp; + struct bkey_s_c u = bkey_disassemble(b, k, &tmp); + const char *invalid = bch2_bkey_val_invalid(c, type, u); + + if (invalid || + (inject_invalid_keys(c) && + !bversion_cmp(u.k->version, MAX_VERSION))) { + char buf[160]; + + bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + btree_err(BTREE_ERR_FIXABLE, c, b, i, + "invalid bkey %s: %s", buf, invalid); + + btree_keys_account_key_drop(&b->nr, 0, k); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); + continue; + } + + k = bkey_next(k); + } + + bch2_bset_build_aux_tree(b, b->set, false); + + set_needs_whiteout(btree_bset_first(b)); + + btree_node_reset_sib_u64s(b); +out: + mempool_free(iter, &c->fill_iter); + return retry_read; +err: +fsck_err: + if (ret == BTREE_RETRY_READ) { + retry_read = 1; + } else { + bch2_inconsistent_error(c); + set_btree_node_read_error(b); + } + goto out; +} + +static void btree_node_read_work(struct work_struct *work) +{ + struct btree_read_bio *rb = + container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct btree *b = rb->bio.bi_private; + struct bio *bio = &rb->bio; + struct bch_devs_mask avoid; + bool can_retry; + + memset(&avoid, 0, sizeof(avoid)); + + goto start; + while (1) { + bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = rb->pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + __set_bit(rb->pick.ptr.dev, avoid.d); + can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; + + if (!bio->bi_status && + !bch2_btree_node_read_done(c, b, can_retry)) + break; + + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } + + bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time); + bio_put(&rb->bio); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, 
BTREE_NODE_read_in_flight); +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } + + queue_work(system_unbound_wq, &rb->work); +} + +void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) +{ + struct extent_pick_ptr pick; + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; + int ret; + + trace_btree_read(c, b); + + ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, + "btree node read error: no device to read from")) { + set_btree_node_read_error(b); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + bio = bio_alloc_bioset(NULL, + buf_pages(b->data, btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOIO, + &c->btree_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->pick = pick; + INIT_WORK(&rb->work, btree_node_read_work); + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = b; + bch2_bio_map(bio, b->data); + + set_btree_node_read_in_flight(b); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + bio->bi_private = b; + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } + } else { + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(system_unbound_wq, &rb->work); + + } +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(c); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); + + bkey_copy(&b->key, k); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + + bch2_btree_node_read(c, b, true); + + if (btree_node_read_error(b)) { + bch2_btree_node_hash_remove(&c->btree_cache, b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + ret = -EIO; + goto err; + } + + bch2_btree_set_root_for_read(c, b); +err: + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + + return ret; +} + +void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) +{ + unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + + do { + old = new = v; + if (!(old & 1)) + break; + + new &= ~1UL; + } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + + if (old & 1) + closure_put(&((struct btree_update *) new)->cl); + + bch2_journal_pin_drop(&c->journal, &w->journal); + closure_wake_up(&w->wait); +} + +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + + bch2_btree_complete_write(c, b, w); + btree_node_io_unlock(b); +} + +static void bch2_btree_node_write_error(struct bch_fs *c, + struct btree_write_bio *wbio) +{ + struct btree *b = wbio->wbio.bio.bi_private; + __BKEY_PADDED(k, 
BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_i_extent *new_key; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct btree_iter iter; + int ret; + + __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p, + BTREE_MAX_DEPTH, + b->level, BTREE_ITER_NODES); +retry: + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + /* has node been freed? */ + if (iter.l[b->level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + bkey_copy(&tmp.k, &b->key); + + new_key = bkey_i_to_extent(&tmp.k); + e = extent_i_to_s(new_key); + extent_for_each_ptr_backwards(e, ptr) + if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) + bch2_extent_drop_ptr(e, ptr); + + if (!bch2_extent_nr_ptrs(e.c)) + goto err; + + ret = bch2_btree_node_update_key(c, &iter, b, new_key); + if (ret == -EINTR) + goto retry; + if (ret) + goto err; +out: + bch2_btree_iter_unlock(&iter); + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +err: + set_btree_node_noevict(b); + bch2_fs_fatal_error(c, "fatal error writing btree node"); + goto out; +} + +void bch2_btree_write_error_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + btree_write_error_work); + struct bio *bio; + + while (1) { + spin_lock_irq(&c->btree_write_error_lock); + bio = bio_list_pop(&c->btree_write_error_list); + spin_unlock_irq(&c->btree_write_error_lock); + + if (!bio) + break; + + bch2_btree_node_write_error(c, + container_of(bio, struct btree_write_bio, wbio.bio)); + } +} + +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; + + btree_bounce_free(c, + wbio->wbio.order, + wbio->wbio.used_mempool, + wbio->data); + + if (wbio->wbio.failed.nr) { + unsigned long flags; + + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + + queue_work(c->wq, &c->btree_write_error_work); + return; + } + + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + unsigned long flags; + + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + + if (bio->bi_status == BLK_STS_REMOVED || + bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + } + + if (wbio->have_ioref) + percpu_ref_put(&ca->io_ref); + + if (parent) { + bio_put(bio); + bio_endio(&parent->bio); + } else { + struct btree_write_bio *wb = + container_of(orig, struct btree_write_bio, wbio); + + INIT_WORK(&wb->work, btree_node_write_work); + queue_work(system_unbound_wq, &wb->work); + } +} + +static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) +{ + const struct bch_extent_ptr *ptr; + unsigned whiteout_u64s = 0; + int ret; + + extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr) + break; + + ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); + if (ret) + bch2_inconsistent_error(c); + + return ret; +} + +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_type_held) +{ + struct btree_write_bio *wbio; + struct bset_tree *t; + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; + BKEY_PADDED(key) k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; + unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; + u64 seq = 0; + bool used_mempool; + unsigned long old, new; + void *data; + + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start + * a write, we do a write iff we clear the dirty bit. 
Since setting the + * dirty bit requires a write lock, we can't race with other threads + * redirtying it: + */ + do { + old = new = READ_ONCE(b->flags); + + if (!(old & (1 << BTREE_NODE_dirty))) + return; + + if (b->written && + !btree_node_may_write(b)) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) { + btree_node_wait_on_io(b); + continue; + } + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); + new |= (1 << BTREE_NODE_write_in_flight); + new |= (1 << BTREE_NODE_just_written); + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + + BUG_ON(btree_node_fake(b)); + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + + BUG_ON(b->written >= c->opts.btree_node_size); + BUG_ON(b->written & (c->opts.block_size - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); + + /* + * We can't block on six_lock_write() here; another thread might be + * trying to get a journal reservation with read locks held, and getting + * a journal reservation might be blocked on flushing the journal and + * doing btree writes: + */ + if (lock_type_held == SIX_LOCK_intent && + six_trylock_write(&b->lock)) { + __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); + six_unlock_write(&b->lock); + } else { + __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); + } + + BUG_ON(b->uncompacted_whiteout_u64s); + + sort_iter_init(&sort_iter, b); + + bytes = !b->written + ? sizeof(struct btree_node) + : sizeof(struct btree_node_entry); + + bytes += b->whiteout_u64s * sizeof(u64); + + for_each_bset(b, t) { + i = bset(b, t); + + if (bset_written(b, i)) + continue; + + bytes += le16_to_cpu(i->u64s) * sizeof(u64); + sort_iter_add(&sort_iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + + order = get_order(bytes); + data = btree_bounce_alloc(c, order, &used_mempool); + + if (!b->written) { + bn = data; + *bn = *b->data; + i = &bn->keys; + } else { + bne = data; + bne->keys = b->data->keys; + i = &bne->keys; + } + + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + + if (!btree_node_is_extents(b)) { + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); + } else { + memcpy_u64s(i->start, + unwritten_whiteouts_start(c, b), + b->whiteout_u64s); + i->u64s = cpu_to_le16(b->whiteout_u64s); + SET_BSET_SEPARATE_WHITEOUTS(i, true); + } + + b->whiteout_u64s = 0; + + u64s = btree_node_is_extents(b) + ? sort_extents(vstruct_last(i), &sort_iter, false) + : sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + + clear_needs_whiteout(i); + + /* do we have data to write? 
*/ + if (b->written && !i->u64s) + goto nowrite; + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + + BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + + i->version = cpu_to_le16(BCACHE_BSET_VERSION); + SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + + /* if we're going to be encrypting, check metadata validity first: */ + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + bset_encrypt(c, i, b->written << 9); + + nonce = btree_nonce(i, b->written << 9); + + if (bn) + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + else + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + /* if we're not encrypting, check metadata after checksumming: */ + if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + /* + * We handle btree write errors by immediately halting the journal - + * after we've done that, we can't issue any subsequent btree writes + * because they might have pointers to new nodes that failed to write. + * + * Furthermore, there's no point in doing any more btree writes because + * with the journal stopped, we're never going to update the journal to + * reflect that those writes were done and the data flushed from the + * journal: + * + * Make sure to update b->written so bch2_btree_init_next() doesn't + * break: + */ + if (bch2_journal_error(&c->journal) || + c->opts.nochanges) + goto err; + + trace_btree_write(b, bytes_to_write, sectors_to_write); + + wbio = container_of(bio_alloc_bioset(NULL, 1 << order, + REQ_OP_WRITE|REQ_META|REQ_FUA, + GFP_NOIO, + &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->wbio.order = order; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + + bch2_bio_map(&wbio->wbio.bio, data); + + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. 
+ */ + + bkey_copy(&k.key, &b->key); + e = bkey_i_to_s_extent(&k.key); + + extent_for_each_ptr(e, ptr) + ptr->offset += b->written; + + b->written += sectors_to_write; + + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); + return; +err: + set_btree_node_noevict(b); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, order, used_mempool, data); + btree_node_write_done(c, b); +} + +/* + * Work that must be done with write lock held: + */ +bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) +{ + bool invalidated_iter = false; + struct btree_node_entry *bne; + struct bset_tree *t; + + if (!btree_node_just_written(b)) + return false; + + BUG_ON(b->whiteout_u64s); + BUG_ON(b->uncompacted_whiteout_u64s); + + clear_btree_node_just_written(b); + + /* + * Note: immediately after write, bset_unwritten()/bset_written() don't + * work - the amount of data we had to write after compaction might have + * been smaller than the offset of the last bset. + * + * However, we know that all bsets have been written here, as long as + * we're still holding the write lock: + */ + + /* + * XXX: decide if we really want to unconditionally sort down to a + * single bset: + */ + if (b->nsets > 1) { + btree_node_sort(c, b, NULL, 0, b->nsets, true); + invalidated_iter = true; + } else { + invalidated_iter = bch2_drop_whiteouts(b); + } + + for_each_bset(b, t) + set_needs_whiteout(bset(b, t)); + + bch2_btree_verify(c, b); + + /* + * If later we don't unconditionally sort down to a single bset, we have + * to ensure this is still true: + */ + BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + return invalidated_iter; +} + +/* + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_type_held) +{ + BUG_ON(lock_type_held == SIX_LOCK_write); + + if (lock_type_held == SIX_LOCK_intent || + six_lock_tryupgrade(&b->lock)) { + __bch2_btree_node_write(c, b, SIX_LOCK_intent); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->lock)) { + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->lock); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->lock); + } else { + __bch2_btree_node_write(c, b, SIX_LOCK_read); + } +} + +static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; +restart: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (test_bit(flag, &b->flags)) { + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; + + } + rcu_read_unlock(); +} + +void bch2_btree_flush_all_reads(struct bch_fs *c) +{ + __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); +} + +void bch2_btree_flush_all_writes(struct bch_fs *c) +{ + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} + +void bch2_btree_verify_flushed(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); + + BUG_ON((flags & (1 << BTREE_NODE_dirty)) || + (flags & (1 << BTREE_NODE_write_in_flight))); + } + rcu_read_unlock(); +} + +ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) +{ + char *out = buf, 
*end = buf + PAGE_SIZE; + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); + unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; + + if (//!(flags & (1 << BTREE_NODE_dirty)) && + !b->writes[0].wait.list.first && + !b->writes[1].wait.list.first && + !(b->will_make_reachable & 1)) + continue; + + out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + b->level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, + b->will_make_reachable & 1, + b->writes[ idx].wait.list.first != NULL, + b->writes[!idx].wait.list.first != NULL); + } + rcu_read_unlock(); + + return out - buf; +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 index 000000000000..0688ce420610 --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H + +#include "bset.h" +#include "extents.h" +#include "io_types.h" + +struct bch_fs; +struct btree_write; +struct btree; +struct btree_iter; + +struct btree_read_bio { + struct bch_fs *c; + u64 start_time; + unsigned have_ioref:1; + struct extent_pick_ptr pick; + struct work_struct work; + struct bio bio; +}; + +struct btree_write_bio { + void *data; + struct work_struct work; + struct bch_write_bio wbio; +}; + +static inline void btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +static inline void btree_node_io_lock(struct btree *b) +{ + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +static inline void btree_node_wait_on_io(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +static inline bool btree_node_may_write(struct btree *b) +{ + return list_empty_careful(&b->write_blocked) && + !b->will_make_reachable; +} + +enum compact_mode { + COMPACT_LAZY, + COMPACT_WRITTEN, + COMPACT_WRITTEN_NO_WRITE_LOCK, +}; + +bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); + +static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) +{ + unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); + unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + + return dead_u64s > 128 && dead_u64s * 3 > bset_u64s; +} + +static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + if (should_compact_bset_lazy(b, t)) + return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); + + return false; +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + +void bch2_btree_build_aux_trees(struct btree *); +void bch2_btree_init_next(struct bch_fs *, struct btree *, + struct btree_iter *); + +int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); +void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); + +void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); +void bch2_btree_write_error_work(struct work_struct *); + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, + enum 
six_lock_type); +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + +void bch2_btree_node_write(struct bch_fs *, struct btree *, + enum six_lock_type); + +/* + * btree_node_dirty() can be cleared with only a read lock, + * and for bch2_btree_node_write_cond() we want to set need_write iff it's + * still dirty: + */ +static inline void set_btree_node_need_write_if_dirty(struct btree *b) +{ + unsigned long old, new, v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty))) + return; + + new |= (1 << BTREE_NODE_need_write); + } while ((v = cmpxchg(&b->flags, old, new)) != old); +} + +#define bch2_btree_node_write_cond(_c, _b, cond) \ +do { \ + while ((_b)->written && btree_node_dirty(_b) && (cond)) { \ + if (!btree_node_may_write(_b)) { \ + set_btree_node_need_write_if_dirty(_b); \ + break; \ + } \ + \ + if (!btree_node_write_in_flight(_b)) { \ + bch2_btree_node_write(_c, _b, SIX_LOCK_read); \ + break; \ + } \ + \ + six_unlock_read(&(_b)->lock); \ + btree_node_wait_on_io(_b); \ + btree_node_lock_type(c, b, SIX_LOCK_read); \ + } \ +} while (0) + +void bch2_btree_flush_all_reads(struct bch_fs *); +void bch2_btree_flush_all_writes(struct bch_fs *); +void bch2_btree_verify_flushed(struct bch_fs *); +ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); + +/* Sorting */ + +struct btree_node_iter_large { + u8 is_extents; + u16 used; + + struct btree_node_iter_set data[MAX_BSETS]; +}; + +static inline void +__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter, + bool is_extents) +{ + iter->used = 0; + iter->is_extents = is_extents; +} + +void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *, + struct btree *); + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, + struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); + +static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) +{ + return !iter->used; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + return bch2_btree_node_iter_large_end(iter) + ? 
NULL + : __btree_node_offset_to_key(b, iter->data->k); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_large_advance(iter, b); + + return ret; +} + +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 index 000000000000..2b4ba41149cf --- /dev/null +++ b/fs/bcachefs/btree_iter.c @@ -0,0 +1,1844 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" +#include "trace.h" + +#include + +static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, + struct btree_iter_level *, + struct bkey *); + +#define BTREE_ITER_NOT_END ((struct btree *) 1) + +static inline bool is_btree_node(struct btree_iter *iter, unsigned l) +{ + return l < BTREE_MAX_DEPTH && + iter->l[l].b && + iter->l[l].b != BTREE_ITER_NOT_END; +} + +/* Btree node locking: */ + +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + + EBUG_ON(iter->l[b->level].b != b); + EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); + + for_each_btree_iter_with_node(iter, b, linked) + linked->lock_seq[b->level] += 2; + + six_unlock_write(&b->lock); +} + +void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +{ + struct bch_fs *c = iter->c; + struct btree_iter *linked; + unsigned readers = 0; + + EBUG_ON(btree_node_read_locked(iter, b->level)); + + for_each_linked_btree_iter(iter, linked) + if (linked->l[b->level].b == b && + btree_node_read_locked(linked, b->level)) + readers++; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); + btree_node_lock_type(c, b, SIX_LOCK_write); + atomic64_add(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); +} + +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree *b, unsigned level, + enum btree_node_locked_type want) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, (enum six_lock_type) want); + return true; + } + + return false; +} + +bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +{ + struct btree *b = btree_iter_node(iter, level); + int want = __btree_lock_want(iter, level); + + if (!b || b == BTREE_ITER_NOT_END) + return false; + + if (race_fault()) + return false; + + if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) && + !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + btree_node_lock_increment(iter, b, level, want))) + return false; + + mark_btree_node_locked(iter, level, want); + return true; +} + +static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) +{ + struct btree *b = iter->l[level].b; + + EBUG_ON(btree_lock_want(iter, 
level) != BTREE_NODE_INTENT_LOCKED); + + if (!is_btree_node(iter, level)) + return false; + + if (btree_node_intent_locked(iter, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(iter, level) + ? six_lock_tryupgrade(&b->lock) + : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level])) + goto success; + + if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(iter, level); + goto success; + } + + return false; +success: + mark_btree_node_intent_locked(iter, level); + return true; +} + +static inline bool btree_iter_get_locks(struct btree_iter *iter, + bool upgrade) +{ + unsigned l = iter->level; + int fail_idx = -1; + + do { + if (!btree_iter_node(iter, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(iter, l) + : bch2_btree_node_relock(iter, l))) { + fail_idx = l; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + } + + l++; + } while (l < iter->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * the node that we failed to relock: + */ + while (fail_idx >= 0) { + btree_node_unlock(iter, fail_idx); + iter->l[fail_idx].b = BTREE_ITER_NOT_END; + --fail_idx; + } + + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + iter->uptodate = BTREE_ITER_NEED_PEEK; + + bch2_btree_iter_verify_locks(iter); + return iter->uptodate < BTREE_ITER_NEED_RELOCK; +} + +/* Slowpath: */ +bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type, + bool may_drop_locks) +{ + struct bch_fs *c = iter->c; + struct btree_iter *linked; + bool ret = true; + + /* Can't have children locked before ancestors: */ + EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); + + /* + * Can't hold any read locks while we block taking an intent lock - see + * below for reasoning, and we should have already dropped any read + * locks in the current iterator + */ + EBUG_ON(type == SIX_LOCK_intent && + iter->nodes_locked != iter->nodes_intent_locked); + + if (btree_node_lock_increment(iter, b, level, (enum btree_node_locked_type) type)) + return true; + + /* + * Must lock btree nodes in key order - this case happens when locking + * the prev sibling in btree node merging: + */ + if (iter->nodes_locked && + __ffs(iter->nodes_locked) <= level && + __btree_iter_cmp(iter->btree_id, pos, iter)) + return false; + + for_each_linked_btree_iter(iter, linked) { + if (!linked->nodes_locked) + continue; + + /* We have to lock btree nodes in key order: */ + if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + ret = false; + + /* + * Can't block taking an intent lock if we have _any_ nodes read + * locked: + * + * - Our read lock blocks another thread with an intent lock on + * the same node from getting a write lock, and thus from + * dropping its intent lock + * + * - And the other thread may have multiple nodes intent locked: + * both the node we want to intent lock, and the node we + * already have read locked - deadlock: + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { + if (may_drop_locks) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + btree_iter_get_locks(linked, true); + } + ret = false; + } + + /* + * Interior nodes must be locked before their descendants: if + * another iterator has 
possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (linked->btree_id == iter->btree_id && + level > __fls(linked->nodes_locked)) { + if (may_drop_locks) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + iter->locks_want); + btree_iter_get_locks(linked, true); + } + ret = false; + } + } + + if (ret) + __btree_node_lock_type(c, b, type); + return ret; +} + +/* Btree iterator locking: */ + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_iter_verify_locks(struct btree_iter *iter) +{ + unsigned l; + + for (l = 0; btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) + continue; + + BUG_ON(btree_lock_want(iter, l) != + btree_node_locked_type(iter, l)); + } +} +#endif + +__flatten +static bool __bch2_btree_iter_relock(struct btree_iter *iter) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? btree_iter_get_locks(iter, false) + : true; +} + +bool bch2_btree_iter_relock(struct btree_iter *iter) +{ + struct btree_iter *linked; + bool ret = true; + + for_each_btree_iter(iter, linked) + ret &= __bch2_btree_iter_relock(linked); + + return ret; +} + +bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) +{ + struct btree_iter *linked; + + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + + if (btree_iter_get_locks(iter, true)) + return true; + + /* + * Ancestor nodes must be locked before child nodes, so set locks_want + * on iterators that might lock ancestors before us to avoid getting + * -EINTR later: + */ + for_each_linked_btree_iter(iter, linked) + if (linked->btree_id == iter->btree_id && + btree_iter_cmp(linked, iter) <= 0 && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_iter_get_locks(linked, true); + } + + return false; +} + +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, + unsigned new_locks_want) +{ + unsigned l = iter->level; + + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + + do { + if (!btree_iter_node(iter, l)) + break; + + if (!bch2_btree_node_upgrade(iter, l)) { + iter->locks_want = l; + return false; + } + + l++; + } while (l < iter->locks_want); + + return true; +} + +void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned downgrade_to) +{ + struct btree_iter *linked; + unsigned l; + + /* + * We downgrade linked iterators as well because btree_iter_upgrade + * might have had to modify locks_want on linked iterators due to lock + * ordering: + */ + for_each_btree_iter(iter, linked) { + unsigned new_locks_want = downgrade_to ?: + (linked->flags & BTREE_ITER_INTENT ? 1 : 0); + + if (linked->locks_want <= new_locks_want) + continue; + + linked->locks_want = new_locks_want; + + while (linked->nodes_locked && + (l = __fls(linked->nodes_locked)) >= linked->locks_want) { + if (l > linked->level) { + btree_node_unlock(linked, l); + } else { + if (btree_node_intent_locked(linked, l)) { + six_lock_downgrade(&linked->l[l].b->lock); + linked->nodes_intent_locked ^= 1 << l; + } + break; + } + } + + bch2_btree_iter_verify_locks(linked); + } +} + +int bch2_btree_iter_unlock(struct btree_iter *iter) +{ + struct btree_iter *linked; + + for_each_btree_iter(iter, linked) + __bch2_btree_iter_unlock(linked); + + return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; +} + +/* Btree iterator: */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) +{ + struct btree_iter_level *l = &iter->l[b->level]; + struct btree_node_iter tmp = l->iter; + struct bkey_packed *k; + + bch2_btree_node_iter_verify(&l->iter, b); + + /* + * For interior nodes, the iterator will have skipped past + * deleted keys: + */ + k = b->level + ? bch2_btree_node_iter_prev(&tmp, b) + : bch2_btree_node_iter_prev_all(&tmp, b); + if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, + iter->flags & BTREE_ITER_IS_EXTENTS)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("prev key should be before after pos:\n%s\n%llu:%llu\n", + buf, iter->pos.inode, iter->pos.offset); + } + + k = bch2_btree_node_iter_peek_all(&l->iter, b); + if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, + iter->flags & BTREE_ITER_IS_EXTENTS)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("next key should be before iter pos:\n%llu:%llu\n%s\n", + iter->pos.inode, iter->pos.offset, buf); + } + + if (iter->uptodate == BTREE_ITER_UPTODATE && + (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) { + BUG_ON(!bkey_whiteout(&iter->k) && + bch2_btree_node_iter_end(&l->iter)); + } +} + +void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + + for_each_btree_iter_with_node(iter, b, linked) + __bch2_btree_iter_verify(linked, b); +} + +#endif + +static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + const struct bkey_packed *end = btree_bkey_last(b, t); + struct btree_node_iter_set *set; + unsigned offset = __btree_node_key_to_offset(b, where); + int shift = new_u64s - clobber_u64s; + unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) + goto found; + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + bch2_btree_node_iter_push(node_iter, b, where, end); + + if (!b->level && + node_iter == &iter->l[0].iter) + bkey_disassemble(b, + bch2_btree_node_iter_peek_all(node_iter, b), + &iter->k); + } + return; +found: + set->end = (int) set->end + shift; + + /* Iterator hasn't gotten to the key that changed yet: */ + if (set->k < offset) + return; + + if (new_u64s && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->flags & BTREE_ITER_IS_EXTENTS)) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; + if (set->k == set->end) + bch2_btree_node_iter_set_drop(node_iter, set); + } else { + set->k = (int) set->k + shift; + goto iter_current_key_not_modified; + } + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + bch2_btree_node_iter_sort(node_iter, b); + if (!b->level && node_iter == &iter->l[0].iter) + __btree_iter_peek_all(iter, &iter->l[0], &iter->k); +iter_current_key_not_modified: + + /* + * Interior nodes are special because iterators for interior nodes don't + * obey the usual invariants regarding the iterator position: + * + * We may have whiteouts that compare greater than the iterator + 
* position, and logically should be in the iterator, but that we + * skipped past to find the first live key greater than the iterator + * position. This becomes an issue when we insert a new key that is + * greater than the current iterator position, but smaller than the + * whiteouts we've already skipped past - this happens in the course of + * a btree split. + * + * We have to rewind the iterator past to before those whiteouts here, + * else bkey_node_iter_prev() is not going to work and who knows what + * else would happen. And we have to do it manually, because here we've + * already done the insert and the iterator is currently inconsistent: + * + * We've got multiple competing invariants, here - we have to be careful + * about rewinding iterators for interior nodes, because they should + * always point to the key for the child node the btree iterator points + * to. + */ + if (b->level && new_u64s && !bkey_deleted(where) && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bset_tree *t; + struct bkey_packed *k; + + for_each_bset(b, t) { + if (bch2_bkey_to_bset(b, where) == t) + continue; + + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(node_iter, b, t)); + if (k && + __btree_node_iter_cmp(node_iter, b, + k, where) > 0) { + struct btree_node_iter_set *set; + unsigned offset = + __btree_node_key_to_offset(b, bkey_next(k)); + + btree_node_iter_for_each(node_iter, set) + if (set->k == offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(node_iter, b); + goto next_bset; + } + + bch2_btree_node_iter_push(node_iter, b, k, + btree_bkey_last(b, t)); + } +next_bset: + t = t; + } + } +} + +void bch2_btree_node_iter_fix(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + struct btree_iter *linked; + + if (node_iter != &iter->l[b->level].iter) + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); + + for_each_btree_iter_with_node(iter, b, linked) + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->level].iter, t, + where, clobber_u64s, new_u64s); + + /* interior node iterators are... special... 
*/ + if (!b->level) + bch2_btree_iter_verify(iter, b); +} + +static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u, + struct bkey_packed *k) +{ + struct bkey_s_c ret; + + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at + * a hole + */ + u->type = KEY_TYPE_DELETED; + return bkey_s_c_null; + } + + ret = bkey_disassemble(l->b, k, u); + + if (debug_check_bkeys(iter->c)) + bch2_bkey_debugcheck(iter->c, l->b, ret); + + return ret; +} + +/* peek_all() doesn't skip deleted keys */ +static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u) +{ + return __btree_iter_unpack(iter, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} + +static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, + struct btree_iter_level *l) +{ + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&l->iter, l->b)); +} + +static inline void __btree_iter_advance(struct btree_iter_level *l) +{ + bch2_btree_node_iter_advance(&l->iter, l->b); +} + +/* + * Verify that iterator for parent node points to child node: + */ +static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter_level *l; + unsigned plevel; + bool parent_locked; + struct bkey_packed *k; + + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + return; + + plevel = b->level + 1; + if (!btree_iter_node(iter, plevel)) + return; + + parent_locked = btree_node_locked(iter, plevel); + + if (!bch2_btree_node_relock(iter, plevel)) + return; + + l = &iter->l[plevel]; + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", + buf, b->key.k.p.inode, b->key.k.p.offset); + } + + if (!parent_locked) + btree_node_unlock(iter, b->level + 1); +} + +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp(struct btree_iter *iter, + const struct bkey *k) +{ + int cmp = bkey_cmp(k->p, iter->pos); + + return cmp > 0 || + (cmp == 0 && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); +} + +static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) +{ + return !btree_iter_pos_cmp(iter, &b->key.k) && + bkey_cmp(b->key.k.p, POS_MAX); +} + +static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + struct btree *b) +{ + return iter->btree_id == b->btree_id && + bkey_cmp(iter->pos, b->data->min_key) >= 0 && + !btree_iter_pos_after_node(iter, b); +} + +static inline void __btree_iter_init(struct btree_iter *iter, + struct btree *b) +{ + struct btree_iter_level *l = &iter->l[b->level]; + + bch2_btree_node_iter_init(&l->iter, b, iter->pos, + iter->flags & BTREE_ITER_IS_EXTENTS, + btree_node_is_extents(b)); + + /* Skip to first non whiteout: */ + if (b->level) + bch2_btree_node_iter_peek(&l->iter, b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +} + +static inline void btree_iter_node_set(struct btree_iter *iter, + struct btree *b) +{ + btree_iter_verify_new_node(iter, b); + + EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(b->lock.state.seq & 1); + + iter->lock_seq[b->level] = b->lock.state.seq; + iter->l[b->level].b = b; + __btree_iter_init(iter, b); +} + +/* + * A btree node is being 
replaced - update the iterator to point to the new + * node: + */ +void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +{ + enum btree_node_locked_type t; + struct btree_iter *linked; + + for_each_btree_iter(iter, linked) + if (btree_iter_pos_in_node(linked, b)) { + /* + * bch2_btree_iter_node_drop() has already been called - + * the old node we're replacing has already been + * unlocked and the pointer invalidated + */ + BUG_ON(btree_node_locked(linked, b->level)); + + t = btree_lock_want(linked, b->level); + if (t != BTREE_NODE_UNLOCKED) { + six_lock_increment(&b->lock, (enum six_lock_type) t); + mark_btree_node_locked(linked, b->level, (enum six_lock_type) t); + } + + btree_iter_node_set(linked, b); + } + + six_unlock_intent(&b->lock); +} + +void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + unsigned level = b->level; + + for_each_btree_iter(iter, linked) + if (linked->l[level].b == b) { + btree_node_unlock(linked, level); + linked->l[level].b = BTREE_ITER_NOT_END; + } +} + +/* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + + for_each_btree_iter_with_node(iter, b, linked) + __btree_iter_init(linked, b); +} + +static inline int btree_iter_lock_root(struct btree_iter *iter, + unsigned depth_want) +{ + struct bch_fs *c = iter->c; + struct btree *b; + enum six_lock_type lock_type; + unsigned i; + + EBUG_ON(iter->nodes_locked); + + while (1) { + b = READ_ONCE(c->btree_roots[iter->btree_id].b); + iter->level = READ_ONCE(b->level); + + if (unlikely(iter->level < depth_want)) { + /* + * the root is at a lower depth than the depth we want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ + iter->level = depth_want; + iter->l[iter->level].b = NULL; + return 0; + } + + lock_type = __btree_lock_want(iter, iter->level); + if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, + iter, lock_type, true))) + return -EINTR; + + if (likely(b == c->btree_roots[iter->btree_id].b && + b->level == iter->level && + !race_fault())) { + for (i = 0; i < iter->level; i++) + iter->l[i].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = b; + + mark_btree_node_locked(iter, iter->level, lock_type); + btree_iter_node_set(iter, b); + return 0; + + } + + six_unlock_type(&b->lock, lock_type); + } +} + +noinline +static void btree_iter_prefetch(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags) + ? (iter->level > 1 ? 0 : 2) + : (iter->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(iter, iter->level); + + while (nr) { + if (!bch2_btree_node_relock(iter, iter->level)) + return; + + bch2_btree_node_iter_advance(&node_iter, l->b); + k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!k) + break; + + bch2_bkey_unpack(l->b, &tmp.k, k); + bch2_btree_node_prefetch(iter->c, iter, &tmp.k, + iter->level - 1); + } + + if (!was_locked) + btree_node_unlock(iter, iter->level); +} + +static inline int btree_iter_down(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree *b; + unsigned level = iter->level - 1; + enum six_lock_type lock_type = __btree_lock_want(iter, level); + BKEY_PADDED(k) tmp; + + BUG_ON(!btree_node_locked(iter, iter->level)); + + bch2_bkey_unpack(l->b, &tmp.k, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type, true); + if (unlikely(IS_ERR(b))) + return PTR_ERR(b); + + mark_btree_node_locked(iter, level, lock_type); + btree_iter_node_set(iter, b); + + if (iter->flags & BTREE_ITER_PREFETCH) + btree_iter_prefetch(iter); + + iter->level = level; + + return 0; +} + +static void btree_iter_up(struct btree_iter *iter) +{ + btree_node_unlock(iter, iter->level++); +} + +int __must_check __bch2_btree_iter_traverse(struct btree_iter *); + +static int btree_iter_traverse_error(struct btree_iter *iter, int ret) +{ + struct bch_fs *c = iter->c; + struct btree_iter *linked, *sorted_iters, **i; +retry_all: + bch2_btree_iter_unlock(iter); + + if (ret != -ENOMEM && ret != -EINTR) + goto io_error; + + if (ret == -ENOMEM) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + } + + /* + * Linked iters are normally a circular singly linked list - break cycle + * while we sort them: + */ + linked = iter->next; + iter->next = NULL; + sorted_iters = NULL; + + while (linked) { + iter = linked; + linked = linked->next; + + i = &sorted_iters; + while (*i && btree_iter_cmp(iter, *i) > 0) + i = &(*i)->next; + + iter->next = *i; + *i = iter; + } + + /* Make list circular again: */ + iter = sorted_iters; + while (iter->next) + iter = iter->next; + iter->next = sorted_iters; + + /* Now, redo traversals in correct order: */ + + iter = sorted_iters; + do { +retry: + ret = __bch2_btree_iter_traverse(iter); + if (unlikely(ret)) { + if (ret == -EINTR) + goto retry; + goto retry_all; + } + + iter = iter->next; + } while (iter != sorted_iters); + + ret = btree_iter_linked(iter) ? -EINTR : 0; +out: + bch2_btree_cache_cannibalize_unlock(c); + return ret; +io_error: + BUG_ON(ret != -EIO); + + iter->flags |= BTREE_ITER_ERROR; + iter->l[iter->level].b = BTREE_ITER_NOT_END; + goto out; +} + +static unsigned btree_iter_up_until_locked(struct btree_iter *iter, + bool check_pos) +{ + unsigned l = iter->level; + + while (btree_iter_node(iter, l) && + !(is_btree_node(iter, l) && + bch2_btree_node_relock(iter, l) && + (!check_pos || + btree_iter_pos_in_node(iter, iter->l[l].b)))) { + btree_node_unlock(iter, l); + iter->l[l].b = BTREE_ITER_NOT_END; + l++; + } + + return l; +} + +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth + * + * Returns 0 on success, -EIO on error (error reading in a btree node). + * + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_btree_iter_unlock(). 
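+ *
+ * (Illustrative sketch, not part of the original patch: the stashed-error
+ * convention means callers outside this file typically check for errors
+ * once, when they drop their locks - roughly
+ *
+ *	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) {
+ *		... use k ...
+ *	}
+ *	ret = bch2_btree_iter_unlock(&iter);
+ *
+ * using the for_each_btree_key() and bch2_btree_iter_unlock() helpers
+ * declared in btree_iter.h below.)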
+ */ +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +{ + unsigned depth_want = iter->level; + + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + return 0; + + if (__bch2_btree_iter_relock(iter)) + return 0; + + iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; + + /* + * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos + * here unnecessary + */ + iter->level = btree_iter_up_until_locked(iter, true); + + /* + * If we've got a btree node locked (i.e. we aren't about to relock the + * root) - advance its node iterator if necessary: + * + * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary + */ + if (btree_iter_node(iter, iter->level)) { + struct btree_iter_level *l = &iter->l[iter->level]; + struct bkey_s_c k; + struct bkey u; + + while ((k = __btree_iter_peek_all(iter, l, &u)).k && + !btree_iter_pos_cmp(iter, k.k)) + __btree_iter_advance(l); + } + + /* + * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, + * here it indicates that relocking the root failed - it's critical that + * btree_iter_lock_root() comes next and that it can't fail + */ + while (iter->level > depth_want) { + int ret = btree_iter_node(iter, iter->level) + ? btree_iter_down(iter) + : btree_iter_lock_root(iter, depth_want); + if (unlikely(ret)) { + iter->level = depth_want; + iter->l[iter->level].b = BTREE_ITER_NOT_END; + return ret; + } + } + + iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_verify_locks(iter); + return 0; +} + +int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) +{ + int ret; + + ret = __bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + ret = btree_iter_traverse_error(iter, ret); + + BUG_ON(ret == -EINTR && !btree_iter_linked(iter)); + + return ret; +} + +static inline void bch2_btree_iter_checks(struct btree_iter *iter, + enum btree_iter_type type) +{ + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type); + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS && + type != BTREE_ITER_NODES)); + + bch2_btree_iter_verify_locks(iter); +} + +/* Iterate across nodes (leaf and interior nodes) */ + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +{ + struct btree *b; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return iter->l[iter->level].b; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + b = btree_iter_node(iter, iter->level); + if (!b) + return NULL; + + BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + + return b; +} + +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) +{ + struct btree *b; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + + /* already got to end? */ + if (!btree_iter_node(iter, iter->level)) + return NULL; + + btree_iter_up(iter); + + if (!bch2_btree_node_relock(iter, iter->level)) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + /* got to end? 
*/ + b = btree_iter_node(iter, iter->level); + if (!b) + return NULL; + + if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ + + /* + * We don't really want to be unlocking here except we can't + * directly tell btree_iter_traverse() "traverse to this level" + * except by setting iter->level, so we have to unlock so we + * don't screw up our lock invariants: + */ + if (btree_node_read_locked(iter, iter->level)) + btree_node_unlock(iter, iter->level); + + /* ick: */ + iter->pos = iter->btree_id == BTREE_ID_INODES + ? btree_type_successor(iter->btree_id, iter->pos) + : bkey_successor(iter->pos); + iter->level = depth; + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + b = iter->l[iter->level].b; + } + + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + + return b; +} + +/* Iterate across keys (in leaf nodes only) */ + +void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k; + + EBUG_ON(iter->level != 0); + EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); + EBUG_ON(!btree_node_locked(iter, 0)); + EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); + + iter->pos = new_pos; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + !btree_iter_pos_cmp_packed(l->b, &iter->pos, k, + iter->flags & BTREE_ITER_IS_EXTENTS)) + __btree_iter_advance(l); + + if (!k && btree_iter_pos_after_node(iter, l->b)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + iter->flags |= BTREE_ITER_AT_END_OF_LEAF; + } +} + +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned level; + + if (!cmp) + return; + + iter->pos = new_pos; + + level = btree_iter_up_until_locked(iter, true); + + if (btree_iter_node(iter, level)) { + unsigned nr_advanced = 0; + struct btree_iter_level *l = &iter->l[level]; + struct bkey_s_c k; + struct bkey u; + + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). 
+ */ + if (cmp > 0) { + while ((k = __btree_iter_peek_all(iter, l, &u)).k && + !btree_iter_pos_cmp(iter, k.k)) { + if (nr_advanced > 8) + goto reinit_node; + + __btree_iter_advance(l); + nr_advanced++; + } + } else { +reinit_node: + __btree_iter_init(iter, iter->l[level].b); + } + + /* Don't leave it locked if we're not supposed to: */ + if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level); + } + + if (level != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +} + +static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c ret = { .k = &iter->k }; + + if (!bkey_deleted(&iter->k)) { + EBUG_ON(bch2_btree_node_iter_end(&l->iter)); + ret.v = bkeyp_val(&l->b->format, + __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + + if (debug_check_bkeys(iter->c) && + !bkey_deleted(ret.k)) + bch2_bkey_debugcheck(iter->c, l->b, ret); + return ret; +} + +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + k = __btree_iter_peek(iter, l); + if (likely(k.k)) + break; + + /* got to the end of the leaf, iterator needs to be traversed: */ + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) + return bkey_s_c_null; + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + } + + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + +static noinline +struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) + return bkey_s_c_null; + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + + return bch2_btree_iter_peek(iter); +} + +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *p; + struct bkey_s_c k; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + k = bch2_btree_iter_peek(iter); + if (IS_ERR_OR_NULL(k.k)) + return k; + } + + do { + __btree_iter_advance(l); + p = bch2_btree_node_iter_peek_all(&l->iter, l->b); + if (unlikely(!p)) + return bch2_btree_iter_peek_next_leaf(iter); + } while (bkey_whiteout(p)); + + k = __btree_iter_unpack(iter, l, &iter->k, p); + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); + iter->pos = bkey_start_pos(k.k); + return k; +} + +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *p; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + k = bch2_btree_iter_peek(iter); + if (IS_ERR(k.k)) + return k; + } + + while (1) { + p = 
bch2_btree_node_iter_prev(&l->iter, l->b); + if (likely(p)) + break; + + iter->pos = l->b->data->min_key; + if (!bkey_cmp(iter->pos, POS_MIN)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_predecessor(iter->btree_id, iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + p = bch2_btree_node_iter_peek(&l->iter, l->b); + if (p) + break; + } + + k = __btree_iter_unpack(iter, l, &iter->k, p); + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + + iter->pos = bkey_start_pos(k.k); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + +static inline struct bkey_s_c +__bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + struct bkey n; + int ret; + +recheck: + while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && + bkey_deleted(k.k) && + bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) + __btree_iter_advance(l); + + /* + * If we got to the end of the node, check if we need to traverse to the + * next node: + */ + if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + goto recheck; + } + + if (k.k && + !bkey_whiteout(k.k) && + bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); + EBUG_ON(bkey_deleted(k.k)); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } + + /* hole */ + bkey_init(&n); + n.p = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (n.p.offset == KEY_OFFSET_MAX) { + if (n.p.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + iter->pos = bkey_successor(iter->pos); + goto recheck; + } + + if (k.k && bkey_whiteout(k.k)) { + struct btree_node_iter node_iter = l->iter; + + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + } + + if (!k.k) + k.k = &l->b->key.k; + + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? 
bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); + + EBUG_ON(!n.size); + } + + iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; +} + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + return __bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +{ + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + + return bch2_btree_iter_peek_slot(iter); + } + + if (!bkey_deleted(&iter->k)) + __btree_iter_advance(&iter->l[0]); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + return __bch2_btree_iter_peek_slot(iter); +} + +void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned depth, + unsigned flags) +{ + unsigned i; + + EBUG_ON(depth >= BTREE_MAX_DEPTH); + EBUG_ON(locks_want > BTREE_MAX_DEPTH); + + iter->c = c; + iter->pos = pos; + bkey_init(&iter->k); + iter->k.p = pos; + iter->flags = flags; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + iter->btree_id = btree_id; + iter->level = depth; + iter->locks_want = locks_want; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->next = iter; + + prefetch(c->btree_roots[btree_id].b); +} + +void bch2_btree_iter_unlink(struct btree_iter *iter) +{ + struct btree_iter *linked; + + __bch2_btree_iter_unlock(iter); + + if (!btree_iter_linked(iter)) + return; + + for_each_linked_btree_iter(iter, linked) + if (linked->next == iter) { + linked->next = iter->next; + iter->next = iter; + return; + } + + BUG(); +} + +void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) +{ + BUG_ON(btree_iter_linked(new)); + + new->next = iter->next; + iter->next = new; +} + +void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +{ + unsigned i; + + __bch2_btree_iter_unlock(dst); + memcpy(dst, src, offsetof(struct btree_iter, next)); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->lock, + __btree_lock_want(dst, i)); +} + +/* new transactional stuff: */ + +static void btree_trans_verify(struct btree_trans *trans) +{ + unsigned i; + + for (i = 0; i < trans->nr_iters; i++) { + struct btree_iter *iter = &trans->iters[i]; + + BUG_ON(btree_iter_linked(iter) != + ((trans->iters_linked & (1 << i)) && + !is_power_of_2(trans->iters_linked))); + } +} + +void bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) +{ + unsigned idx; + + for (idx = 0; idx < trans->nr_iters; idx++) + if (&trans->iters[idx] == iter) + goto found; + BUG(); +found: + BUG_ON(!(trans->iters_linked & (1U << idx))); + + trans->iters_live &= ~(1U << idx); + trans->iters_linked &= ~(1U << idx); + bch2_btree_iter_unlink(iter); +} + +static int btree_trans_realloc_iters(struct 
btree_trans *trans) +{ + struct btree_iter *new_iters; + unsigned i; + + bch2_trans_unlock(trans); + + new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX, + GFP_NOFS); + if (!new_iters) + return -ENOMEM; + + memcpy(new_iters, trans->iters, + sizeof(struct btree_iter) * trans->nr_iters); + trans->iters = new_iters; + + for (i = 0; i < trans->nr_iters; i++) + trans->iters[i].next = &trans->iters[i]; + + if (trans->iters_linked) { + unsigned first_linked = __ffs(trans->iters_linked); + + for (i = first_linked + 1; i < trans->nr_iters; i++) + if (trans->iters_linked & (1 << i)) + bch2_btree_iter_link(&trans->iters[first_linked], + &trans->iters[i]); + } + + btree_trans_verify(trans); + + return trans->iters_live ? -EINTR : 0; +} + +int bch2_trans_preload_iters(struct btree_trans *trans) +{ + if (trans->iters != trans->iters_onstack) + return 0; + + return btree_trans_realloc_iters(trans); +} + +static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, + unsigned flags, u64 iter_id) +{ + struct btree_iter *iter; + int idx; + + BUG_ON(trans->nr_iters > BTREE_ITER_MAX); + + for (idx = 0; idx < trans->nr_iters; idx++) + if (trans->iter_ids[idx] == iter_id) + goto found; + idx = -1; +found: + if (idx < 0) { + idx = ffz(trans->iters_linked); + if (idx < trans->nr_iters) + goto got_slot; + + BUG_ON(trans->nr_iters == BTREE_ITER_MAX); + + if (trans->iters == trans->iters_onstack && + trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) { + int ret = btree_trans_realloc_iters(trans); + if (ret) + return ERR_PTR(ret); + } + + idx = trans->nr_iters++; +got_slot: + trans->iter_ids[idx] = iter_id; + iter = &trans->iters[idx]; + + bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags); + } else { + iter = &trans->iters[idx]; + + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & + (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS)); + + iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + } + + BUG_ON(trans->iters_live & (1 << idx)); + trans->iters_live |= 1 << idx; + + if (trans->iters_linked && + !(trans->iters_linked & (1 << idx))) + bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], + iter); + + trans->iters_linked |= 1 << idx; + + btree_trans_verify(trans); + + return iter; +} + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags, + u64 iter_id) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, flags, iter_id); + + if (!IS_ERR(iter)) + bch2_btree_iter_set_pos(iter, pos); + return iter; +} + +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *src, + u64 iter_id) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, src->btree_id, + src->flags, iter_id); + + if (!IS_ERR(iter)) + bch2_btree_iter_copy(iter, src); + return iter; +} + +void *bch2_trans_kmalloc(struct btree_trans *trans, + size_t size) +{ + void *ret; + + if (trans->mem_top + size > trans->mem_bytes) { + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(trans->mem_top + size); + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + + if (!new_mem) + return ERR_PTR(-ENOMEM); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + if (old_bytes) + return ERR_PTR(-EINTR); + } + + ret = trans->mem + trans->mem_top; + trans->mem_top += size; + return ret; +} + +int bch2_trans_unlock(struct btree_trans *trans) +{ + 
unsigned iters = trans->iters_linked; + int ret = 0; + + while (iters) { + unsigned idx = __ffs(iters); + struct btree_iter *iter = &trans->iters[idx]; + + if (iter->flags & BTREE_ITER_ERROR) + ret = -EIO; + + __bch2_btree_iter_unlock(iter); + iters ^= 1 << idx; + } + + return ret; +} + +void bch2_trans_begin(struct btree_trans *trans) +{ + unsigned idx; + + btree_trans_verify(trans); + + /* + * On transaction restart, the transaction isn't required to allocate + * all the same iterators it on the last iteration: + * + * Unlink any iterators it didn't use this iteration, assuming it got + * further (allocated an iter with a higher idx) than where the iter + * was originally allocated: + */ + if (!trans->iters_live) + return; + + while (trans->iters_linked && + (idx = __fls(trans->iters_linked)) > + __fls(trans->iters_live)) { + trans->iters_linked ^= 1 << idx; + bch2_btree_iter_unlink(&trans->iters[idx]); + } + + trans->iters_live = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + + btree_trans_verify(trans); +} + +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) +{ + trans->c = c; + trans->nr_iters = 0; + trans->iters_live = 0; + trans->iters_linked = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + trans->mem_bytes = 0; + trans->mem = NULL; + trans->iters = trans->iters_onstack; +} + +int bch2_trans_exit(struct btree_trans *trans) +{ + int ret = bch2_trans_unlock(trans); + + kfree(trans->mem); + if (trans->iters != trans->iters_onstack) + kfree(trans->iters); + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; + return ret; +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 index 000000000000..e686a7ad5b3d --- /dev/null +++ b/fs/bcachefs/btree_iter.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H + +#include "btree_types.h" + +static inline void btree_iter_set_dirty(struct btree_iter *iter, + enum btree_iter_uptodate u) +{ + iter->uptodate = max_t(unsigned, iter->uptodate, u); +} + +static inline struct btree *btree_iter_node(struct btree_iter *iter, + unsigned level) +{ + return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; +} + +static inline struct btree *btree_node_parent(struct btree_iter *iter, + struct btree *b) +{ + return btree_iter_node(iter, b->level + 1); +} + +static inline bool btree_iter_linked(const struct btree_iter *iter) +{ + return iter->next != iter; +} + +static inline bool __iter_has_node(const struct btree_iter *iter, + const struct btree *b) +{ + /* + * We don't compare the low bits of the lock sequence numbers because + * @iter might have taken a write lock on @b, and we don't want to skip + * the linked iterator if the sequence numbers were equal before taking + * that write lock. The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ + + return iter->l[b->level].b == b && + iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1; +} + +static inline struct btree_iter * +__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked) +{ + return linked->next != iter ? 
linked->next : NULL; +} + +static inline struct btree_iter * +__next_iter_with_node(struct btree_iter *iter, struct btree *b, + struct btree_iter *linked) +{ + while (linked && !__iter_has_node(linked, b)) + linked = __next_linked_iter(iter, linked); + + return linked; +} + +/** + * for_each_btree_iter - iterate over all iterators linked with @_iter, + * including @_iter + */ +#define for_each_btree_iter(_iter, _linked) \ + for ((_linked) = (_iter); (_linked); \ + (_linked) = __next_linked_iter(_iter, _linked)) + +/** + * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter + * that also point to @_b + * + * @_b is assumed to be locked by @_iter + * + * Filters out iterators that don't have a valid btree_node iterator for @_b - + * i.e. iterators for which bch2_btree_node_relock() would not succeed. + */ +#define for_each_btree_iter_with_node(_iter, _b, _linked) \ + for ((_linked) = (_iter); \ + ((_linked) = __next_iter_with_node(_iter, _b, _linked)); \ + (_linked) = __next_linked_iter(_iter, _linked)) + +/** + * for_each_linked_btree_iter - iterate over all iterators linked with @_iter, + * _not_ including @_iter + */ +#define for_each_linked_btree_iter(_iter, _linked) \ + for ((_linked) = (_iter)->next; \ + (_linked) != (_iter); \ + (_linked) = (_linked)->next) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_iter_verify_locks(struct btree_iter *); +#else +static inline void bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) {} +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +#endif + +void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bset_tree *, + struct bkey_packed *, unsigned, unsigned); + +int bch2_btree_iter_unlock(struct btree_iter *); + +bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); + +static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want, + bool may_drop_locks) +{ + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + return iter->locks_want < new_locks_want + ? (may_drop_locks + ? __bch2_btree_iter_upgrade(iter, new_locks_want) + : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) + : iter->uptodate <= BTREE_ITER_NEED_PEEK; +} + +void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); + +static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +{ + if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 
1 : 0) + __bch2_btree_iter_downgrade(iter, 0); +} + +void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); +void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); + +void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); + +int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); + +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + +void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); +void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *, + enum btree_id, struct bpos, + unsigned , unsigned, unsigned); + +static inline void bch2_btree_iter_init(struct btree_iter *iter, + struct bch_fs *c, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + __bch2_btree_iter_init(iter, c, btree_id, pos, + flags & BTREE_ITER_INTENT ? 1 : 0, 0, + (btree_id == BTREE_ID_EXTENTS + ? BTREE_ITER_IS_EXTENTS : 0)|flags); +} + +void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); +void bch2_btree_iter_unlink(struct btree_iter *); +void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); + +static inline struct bpos btree_type_successor(enum btree_id id, + struct bpos pos) +{ + if (id == BTREE_ID_INODES) { + pos.inode++; + pos.offset = 0; + } else if (id != BTREE_ID_EXTENTS) { + pos = bkey_successor(pos); + } + + return pos; +} + +static inline struct bpos btree_type_predecessor(enum btree_id id, + struct bpos pos) +{ + if (id == BTREE_ID_INODES) { + --pos.inode; + pos.offset = 0; + } else /* if (id != BTREE_ID_EXTENTS) */ { + pos = bkey_predecessor(pos); + } + + return pos; +} + +static inline int __btree_iter_cmp(enum btree_id id, + struct bpos pos, + const struct btree_iter *r) +{ + if (id != r->btree_id) + return id < r->btree_id ? -1 : 1; + return bkey_cmp(pos, r->pos); +} + +static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return __btree_iter_cmp(l->btree_id, l->pos, r); +} + +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) +{ + if (need_resched()) { + bch2_btree_iter_unlock(iter); + schedule(); + } else if (race_fault()) { + bch2_btree_iter_unlock(iter); + } +} + +#define __for_each_btree_node(_iter, _c, _btree_id, _start, \ + _locks_want, _depth, _flags, _b) \ + for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ + _locks_want, _depth, \ + _flags|BTREE_ITER_NODES), \ + _b = bch2_btree_iter_peek_node(_iter); \ + (_b); \ + (_b) = bch2_btree_iter_next_node(_iter, _depth)) + +#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \ + __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b) + +static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) +{ + return flags & BTREE_ITER_SLOTS + ? 
bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); +} + +static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, + unsigned flags) +{ + bch2_btree_iter_cond_resched(iter); + + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_next_slot(iter) + : bch2_btree_iter_next(iter); +} + +#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ + for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + (_start), (_flags)), \ + (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !IS_ERR_OR_NULL((_k).k); \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) + +#define for_each_btree_key_continue(_iter, _flags, _k) \ + for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !IS_ERR_OR_NULL((_k).k); \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) + +static inline int btree_iter_err(struct bkey_s_c k) +{ + return PTR_ERR_OR_ZERO(k.k); +} + +/* new multiple iterator interface: */ + +int bch2_trans_preload_iters(struct btree_trans *); +void bch2_trans_iter_free(struct btree_trans *, + struct btree_iter *); + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned, u64); +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, + struct btree_iter *, u64); + +static __always_inline u64 __btree_iter_id(void) +{ + u64 ret = 0; + + ret <<= 32; + ret |= _RET_IP_ & U32_MAX; + ret <<= 32; + ret |= _THIS_IP_ & U32_MAX; + return ret; +} + +static __always_inline struct btree_iter * +bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + return __bch2_trans_get_iter(trans, btree_id, pos, flags, + __btree_iter_id()); +} + +static __always_inline struct btree_iter * +bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +{ + + return __bch2_trans_copy_iter(trans, src, __btree_iter_id()); +} + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +int bch2_trans_unlock(struct btree_trans *); +void bch2_trans_begin(struct btree_trans *); +void bch2_trans_init(struct btree_trans *, struct bch_fs *); +int bch2_trans_exit(struct btree_trans *); + +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 index 000000000000..de3fc0a239da --- /dev/null +++ b/fs/bcachefs/btree_locking.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H + +/* + * Only for internal btree use: + * + * The btree iterator tracks what locks it wants to take, and what locks it + * currently has - here we have wrappers for locking/unlocking btree nodes and + * updating the iterator state + */ + +#include "btree_iter.h" +#include "btree_io.h" +#include "six.h" + +/* matches six lock types */ +enum btree_node_locked_type { + BTREE_NODE_UNLOCKED = -1, + BTREE_NODE_READ_LOCKED = SIX_LOCK_read, + BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, +}; + +static inline int btree_node_locked_type(struct btree_iter *iter, + unsigned level) +{ + /* + * We're relying on the fact that if nodes_intent_locked is set + * nodes_locked must be set as well, so that we can compute without + * branches: + */ + return BTREE_NODE_UNLOCKED + + ((iter->nodes_locked >> level) & 1) + + ((iter->nodes_intent_locked >> level) & 1); +} + +static inline bool btree_node_intent_locked(struct btree_iter *iter, + unsigned level) +{ + return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; +} + +static inline bool btree_node_read_locked(struct btree_iter 
*iter, + unsigned level) +{ + return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; +} + +static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) +{ + return iter->nodes_locked & (1 << level); +} + +static inline void mark_btree_node_unlocked(struct btree_iter *iter, + unsigned level) +{ + iter->nodes_locked &= ~(1 << level); + iter->nodes_intent_locked &= ~(1 << level); +} + +static inline void mark_btree_node_locked(struct btree_iter *iter, + unsigned level, + enum six_lock_type type) +{ + /* relying on this to avoid a branch */ + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + + iter->nodes_locked |= 1 << level; + iter->nodes_intent_locked |= type << level; +} + +static inline void mark_btree_node_intent_locked(struct btree_iter *iter, + unsigned level) +{ + mark_btree_node_locked(iter, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) +{ + return level < iter->locks_want + ? SIX_LOCK_intent + : SIX_LOCK_read; +} + +static inline enum btree_node_locked_type +btree_lock_want(struct btree_iter *iter, int level) +{ + if (level < iter->level) + return BTREE_NODE_UNLOCKED; + if (level < iter->locks_want) + return BTREE_NODE_INTENT_LOCKED; + if (level == iter->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; +} + +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +{ + int lock_type = btree_node_locked_type(iter, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (lock_type != BTREE_NODE_UNLOCKED) + six_unlock_type(&iter->l[level].b->lock, lock_type); + mark_btree_node_unlocked(iter, level); +} + +static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +{ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + while (iter->nodes_locked) + btree_node_unlock(iter, __ffs(iter->nodes_locked)); +} + +static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +{ + switch (type) { + case SIX_LOCK_read: + return BCH_TIME_btree_lock_contended_read; + case SIX_LOCK_intent: + return BCH_TIME_btree_lock_contended_intent; + case SIX_LOCK_write: + return BCH_TIME_btree_lock_contended_write; + default: + BUG(); + } +} + +/* + * wrapper around six locks that just traces lock contended time + */ +static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, + enum six_lock_type type) +{ + u64 start_time = local_clock(); + + six_lock_type(&b->lock, type, NULL, NULL); + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); +} + +static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, + enum six_lock_type type) +{ + if (!six_trylock_type(&b->lock, type)) + __btree_node_lock_type(c, b, type); +} + +bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, + struct btree_iter *, enum six_lock_type, bool); + +static inline bool btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type, + bool may_drop_locks) +{ + EBUG_ON(level >= BTREE_MAX_DEPTH); + + return likely(six_trylock_type(&b->lock, type)) || + __bch2_btree_node_lock(b, pos, level, iter, + type, may_drop_locks); +} + +bool __bch2_btree_node_relock(struct btree_iter *, unsigned); + +static inline bool bch2_btree_node_relock(struct btree_iter *iter, + unsigned level) +{ + EBUG_ON(btree_node_locked(iter, level) && + btree_node_locked_type(iter, level) != + __btree_lock_want(iter, level)); + + return 
likely(btree_node_locked(iter, level)) || + __bch2_btree_node_relock(iter, level); +} + +bool bch2_btree_iter_relock(struct btree_iter *); + +void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); + +void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); + +static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +{ + EBUG_ON(iter->l[b->level].b != b); + EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + + if (!six_trylock_write(&b->lock)) + __bch2_btree_node_lock_write(b, iter); +} + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ + + diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 index 000000000000..b922a8c104d4 --- /dev/null +++ b/fs/bcachefs/btree_types.h @@ -0,0 +1,479 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H + +#include +#include + +#include "bkey_methods.h" +#include "journal_types.h" +#include "six.h" + +struct open_bucket; +struct btree_update; + +#define MAX_BSETS 3U + +struct btree_nr_keys { + + /* + * Amount of live metadata (i.e. size of node after a compaction) in + * units of u64s + */ + u16 live_u64s; + u16 bset_u64s[MAX_BSETS]; + + /* live keys only: */ + u16 packed_keys; + u16 unpacked_keys; +}; + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + u16 size; + + /* function of size - precalculated for to_inorder() */ + u16 extra; + + u16 data_offset; + u16 aux_data_offset; + u16 end_offset; + + struct bpos max_key; +}; + +struct btree_write { + struct journal_entry_pin journal; + struct closure_waitlist wait; +}; + +struct btree_ob_ref { + u8 nr; + u8 refs[BCH_REPLICAS_MAX]; +}; + +struct btree_alloc { + struct btree_ob_ref ob; + BKEY_PADDED(k); +}; + +struct btree { + /* Hottest entries first */ + struct rhash_head hash; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + + struct six_lock lock; + + unsigned long flags; + u16 written; + u8 level; + u8 btree_id; + u8 nsets; + u8 nr_key_bits; + + struct bkey_format format; + + struct btree_node *data; + void *aux_data; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; + + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; + u16 uncompacted_whiteout_u64s; + u8 page_order; + u8 unpack_fn_len; + + /* + * XXX: add a delete sequence number, so when bch2_btree_node_relock() + * fails because the lock sequence number has changed - i.e. 
the + * contents were modified - we can still relock the node if it's still + * the one we want, without redoing the traversal + */ + + /* + * For asynchronous splits/interior node updates: + * When we do a split, we allocate new child nodes and update the parent + * node to point to them: we update the parent in memory immediately, + * but then we must wait until the children have been written out before + * the update to the parent can be written - this is a list of the + * btree_updates that are blocking this node from being + * written: + */ + struct list_head write_blocked; + + /* + * Also for asynchronous splits/interior node updates: + * If a btree node isn't reachable yet, we don't want to kick off + * another write - because that write also won't yet be reachable and + * marking it as completed before it's reachable would be incorrect: + */ + unsigned long will_make_reachable; + + struct btree_ob_ref ob; + + /* lru list */ + struct list_head list; + + struct btree_write writes[2]; + +#ifdef CONFIG_BCACHEFS_DEBUG + bool *expensive_debug_checks; +#endif +}; + +struct btree_cache { + struct rhashtable table; + bool table_init_done; + /* + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. 
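+ *
+ * (Illustrative note, not part of the original patch: when a node
+ * allocation fails outright, callers fall back to the cannibalize lock
+ * below - e.g. the -ENOMEM retry path in btree_iter_traverse_error() in
+ * btree_iter.c does roughly
+ *
+ *	do {
+ *		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ *		closure_sync(&cl);
+ *	} while (ret);
+ *
+ * and calls bch2_btree_cache_cannibalize_unlock(c) once its traversals
+ * have been redone.)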
+ */ + struct mutex lock; + struct list_head live; + struct list_head freeable; + struct list_head freed; + + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; + struct shrinker shrink; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + struct task_struct *alloc_lock; + struct closure_waitlist alloc_wait; +}; + +struct btree_node_iter { + u8 is_extents; + + struct btree_node_iter_set { + u16 k, end; + } data[MAX_BSETS]; +}; + +enum btree_iter_type { + BTREE_ITER_KEYS, + BTREE_ITER_SLOTS, + BTREE_ITER_NODES, +}; + +#define BTREE_ITER_TYPE ((1 << 2) - 1) + +#define BTREE_ITER_INTENT (1 << 2) +#define BTREE_ITER_PREFETCH (1 << 3) +/* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +#define BTREE_ITER_IS_EXTENTS (1 << 4) +/* + * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: + */ +#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) +#define BTREE_ITER_ERROR (1 << 6) + +enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_PEEK = 1, + BTREE_ITER_NEED_RELOCK = 2, + BTREE_ITER_NEED_TRAVERSE = 3, +}; + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct bch_fs *c; + struct bpos pos; + + u8 flags; + enum btree_iter_uptodate uptodate:4; + enum btree_id btree_id:4; + unsigned level:4, + locks_want:4, + nodes_locked:4, + nodes_intent_locked:4; + + struct btree_iter_level { + struct btree *b; + struct btree_node_iter iter; + } l[BTREE_MAX_DEPTH]; + + u32 lock_seq[BTREE_MAX_DEPTH]; + + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; + + /* + * Circular linked list of linked iterators: linked iterators share + * locks (e.g. two linked iterators may have the same node intent + * locked, or read and write locked, at the same time), and insertions + * through one iterator won't invalidate the other linked iterators. 
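+ *
+ * (Illustrative note, not part of the original patch: since ->next points
+ * back to the iterator itself while unlinked, walking the ring is simply
+ *
+ *	for (linked = iter->next; linked != iter; linked = linked->next)
+ *		...;
+ *
+ * which is what for_each_linked_btree_iter() in btree_iter.h expands to,
+ * and btree_iter_linked() is just "iter->next != iter".)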
+ */ + + /* Must come last: */ + struct btree_iter *next; +}; + +#define BTREE_ITER_MAX 8 + +struct btree_insert_entry { + struct btree_iter *iter; + struct bkey_i *k; + unsigned extra_res; + /* + * true if entire key was inserted - can only be false for + * extents + */ + bool done; +}; + +struct btree_trans { + struct bch_fs *c; + + u8 nr_iters; + u8 iters_live; + u8 iters_linked; + u8 nr_updates; + + unsigned mem_top; + unsigned mem_bytes; + void *mem; + + struct btree_iter *iters; + u64 iter_ids[BTREE_ITER_MAX]; + + struct btree_insert_entry updates[BTREE_ITER_MAX]; + + struct btree_iter iters_onstack[2]; +}; + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + +enum btree_flags { + BTREE_NODE_read_in_flight, + BTREE_NODE_read_error, + BTREE_NODE_dirty, + BTREE_NODE_need_write, + BTREE_NODE_noevict, + BTREE_NODE_write_idx, + BTREE_NODE_accessed, + BTREE_NODE_write_in_flight, + BTREE_NODE_just_written, + BTREE_NODE_dying, + BTREE_NODE_fake, +}; + +BTREE_FLAG(read_in_flight); +BTREE_FLAG(read_error); +BTREE_FLAG(dirty); +BTREE_FLAG(need_write); +BTREE_FLAG(noevict); +BTREE_FLAG(write_idx); +BTREE_FLAG(accessed); +BTREE_FLAG(write_in_flight); +BTREE_FLAG(just_written); +BTREE_FLAG(dying); +BTREE_FLAG(fake); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset_tree *bset_tree_last(struct btree *b) +{ + EBUG_ON(!b->nsets); + return b->set + b->nsets - 1; +} + +static inline struct bset *bset(const struct btree *b, + const struct bset_tree *t) +{ + return (void *) b->data + t->data_offset * sizeof(u64); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return bset(b, b->set); +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset(b, bset_tree_last(b)); +} + +static inline u16 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) +{ + size_t ret = (u64 *) k - (u64 *) b->data - 1; + + EBUG_ON(ret > U16_MAX); + return ret; +} + +static inline struct bkey_packed * +__btree_node_offset_to_key(const struct btree *b, u16 k) +{ + return (void *) ((u64 *) b->data + k + 1); +} + +#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) + +#define btree_bkey_last(_b, _t) \ +({ \ + EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ + vstruct_last(bset(_b, _t))); \ + \ + __btree_node_offset_to_key(_b, (_t)->end_offset); \ +}) + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); + btree_bkey_last(b, t); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = (u64 *) i - (u64 *) b->data; + + EBUG_ON(bset(b, t) != i); + + set_btree_bset_end(b, t); +} + +static inline unsigned bset_byte_offset(struct btree *b, void *i) +{ + return i - (void *) b->data; +} + +/* Type of keys @b contains: */ +static inline enum bkey_type btree_node_type(struct btree *b) +{ + return b->level ? 
BKEY_TYPE_BTREE : b->btree_id; +} + +static inline const struct bkey_ops *btree_node_ops(struct btree *b) +{ + return &bch2_bkey_ops[btree_node_type(b)]; +} + +static inline bool btree_node_has_ptrs(struct btree *b) +{ + return btree_type_has_ptrs(btree_node_type(b)); +} + +static inline bool btree_node_is_extents(struct btree *b) +{ + return btree_node_type(b) == BKEY_TYPE_EXTENTS; +} + +struct btree_root { + struct btree *b; + + struct btree_update *as; + + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; + u8 alive; +}; + +/* + * Optional hook that will be called just prior to a btree node update, when + * we're holding the write lock and we know what key is about to be overwritten: + */ + +struct btree_iter; +struct btree_node_iter; + +enum btree_insert_ret { + BTREE_INSERT_OK, + /* extent spanned multiple leaf nodes: have to traverse to next node: */ + BTREE_INSERT_NEED_TRAVERSE, + /* write lock held for too long */ + BTREE_INSERT_NEED_RESCHED, + /* leaf node needs to be split */ + BTREE_INSERT_BTREE_NODE_FULL, + BTREE_INSERT_JOURNAL_RES_FULL, + BTREE_INSERT_ENOSPC, + BTREE_INSERT_NEED_GC_LOCK, +}; + +struct extent_insert_hook { + enum btree_insert_ret + (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, + struct bkey_s_c, const struct bkey_i *); +}; + +enum btree_gc_coalesce_fail_reason { + BTREE_GC_COALESCE_FAIL_RESERVE_GET, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS, +}; + +enum btree_node_sibling { + btree_prev_sib, + btree_next_sib, +}; + +typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, + struct btree *, + struct btree_node_iter *); + +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 index 000000000000..451d486fb032 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H + +#include "btree_iter.h" +#include "journal.h" + +struct bch_fs; +struct btree; +struct btree_insert; + +void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, + struct btree_iter *); +bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_i *); +void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, + struct bkey_i *); + +/* Normal update interface: */ + +struct btree_insert { + struct bch_fs *c; + struct disk_reservation *disk_res; + struct journal_res journal_res; + u64 *journal_seq; + struct extent_insert_hook *hook; + unsigned flags; + bool did_work; + + unsigned short nr; + struct btree_insert_entry *entries; +}; + +int __bch2_btree_insert_at(struct btree_insert *); + +#define BTREE_INSERT_ENTRY(_iter, _k) \ + ((struct btree_insert_entry) { \ + .iter = (_iter), \ + .k = (_k), \ + .done = false, \ + }) + +#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ + ((struct btree_insert_entry) { \ + .iter = (_iter), \ + .k = (_k), \ + .extra_res = (_extra), \ + .done = false, \ + }) + +/** + * bch_btree_insert_at - insert one or more keys at iterator positions + * @iter: btree iterator + * @insert_key: key to insert + * @disk_res: disk reservation + * @hook: extent insert callback + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. 
+ * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +#define bch2_btree_insert_at(_c, _disk_res, _hook, \ + _journal_seq, _flags, ...) \ + __bch2_btree_insert_at(&(struct btree_insert) { \ + .c = (_c), \ + .disk_res = (_disk_res), \ + .journal_seq = (_journal_seq), \ + .hook = (_hook), \ + .flags = (_flags), \ + .nr = COUNT_ARGS(__VA_ARGS__), \ + .entries = (struct btree_insert_entry[]) { \ + __VA_ARGS__ \ + }}) + +enum { + __BTREE_INSERT_ATOMIC, + __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +/* + * Don't drop/retake locks before doing btree update, instead return -EINTR if + * we had to drop locks for any reason + */ +#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) + +/* + * Don't drop locks _after_ successfully updating btree: + */ +#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) + +/* Don't check for -ENOSPC: */ +#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) + +/* for copygc, or when merging btree nodes */ +#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) + +/* + * Insert is for journal replay: don't get journal reservations, or mark extents + * (bch_mark_key) + */ +#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) + +/* Don't block on allocation failure (for new btree nodes: */ +#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) + +#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) + +int bch2_btree_delete_at(struct btree_iter *, unsigned); + +int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, + struct disk_reservation *, + struct extent_insert_hook *, u64 *, unsigned); + +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, + struct extent_insert_hook *, u64 *, int flags); + +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, struct bversion, + struct disk_reservation *, + struct extent_insert_hook *, u64 *); + +int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, + __le64, unsigned); +int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, + struct btree *, struct bkey_i_extent *); + +/* new transactional interface: */ + +void bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, unsigned); +int bch2_trans_commit(struct btree_trans *, + struct disk_reservation *, + struct extent_insert_hook *, + u64 *, unsigned); + +#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c)); \ + \ + do { \ + bch2_trans_begin(&trans); \ + \ + _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \ + (_journal_seq), (_flags)); \ + } while (_ret == -EINTR); \ + \ + bch2_trans_exit(&trans); \ + _ret; \ +}) + +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 index 000000000000..1710efd7c687 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c @@ -0,0 +1,2171 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include 
"bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "extents.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include + +static void btree_node_will_make_reachable(struct btree_update *, + struct btree *); +static void btree_update_drop_new_node(struct bch_fs *, struct btree *); +static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int); + +/* Debug code: */ + +static void btree_node_interior_verify(struct btree *b) +{ + struct btree_node_iter iter; + struct bkey_packed *k; + + BUG_ON(!b->level); + + bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); +#if 1 + BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || + bkey_cmp_left_packed(b, k, &b->key.k.p)); + + BUG_ON((bch2_btree_node_iter_advance(&iter, b), + !bch2_btree_node_iter_end(&iter))); +#else + const char *msg; + + msg = "not found"; + k = bch2_btree_node_iter_peek(&iter, b); + if (!k) + goto err; + + msg = "isn't what it should be"; + if (bkey_cmp_left_packed(b, k, &b->key.k.p)) + goto err; + + bch2_btree_node_iter_advance(&iter, b); + + msg = "isn't last key"; + if (!bch2_btree_node_iter_end(&iter)) + goto err; + return; +err: + bch2_dump_btree_node(b); + printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, + b->key.k.p.offset, msg); + BUG(); +#endif +} + +/* Calculate ideal packed bkey format for new btree nodes: */ + +void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) +{ + struct bkey_packed *k; + struct bset_tree *t; + struct bkey uk; + + bch2_bkey_format_add_pos(s, b->data->min_key); + + for_each_bset(b, t) + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) + if (!bkey_whiteout(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); + } +} + +static struct bkey_format bch2_btree_calc_format(struct btree *b) +{ + struct bkey_format_state s; + + bch2_bkey_format_init(&s); + __bch2_btree_calc_format(&s, b); + + return bch2_bkey_format_done(&s); +} + +static size_t btree_node_u64s_with_format(struct btree *b, + struct bkey_format *new_f) +{ + struct bkey_format *old_f = &b->format; + + /* stupid integer promotion rules */ + ssize_t delta = + (((int) new_f->key_u64s - old_f->key_u64s) * + (int) b->nr.packed_keys) + + (((int) new_f->key_u64s - BKEY_U64s) * + (int) b->nr.unpacked_keys); + + BUG_ON(delta + b->nr.live_u64s < 0); + + return b->nr.live_u64s + delta; +} + +/** + * btree_node_format_fits - check if we could rewrite node with a new format + * + * This assumes all keys can pack with the new format -- it just checks if + * the re-packed keys would fit inside the node itself. 
+ */ +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + struct bkey_format *new_f) +{ + size_t u64s = btree_node_u64s_with_format(b, new_f); + + return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); +} + +/* Btree node freeing/allocation: */ + +static bool btree_key_matches(struct bch_fs *c, + struct bkey_s_c_extent l, + struct bkey_s_c_extent r) +{ + const struct bch_extent_ptr *ptr1, *ptr2; + + extent_for_each_ptr(l, ptr1) + extent_for_each_ptr(r, ptr2) + if (ptr1->dev == ptr2->dev && + ptr1->gen == ptr2->gen && + ptr1->offset == ptr2->offset) + return true; + + return false; +} + +/* + * We're doing the index update that makes @b unreachable, update stuff to + * reflect that: + * + * Must be called _before_ btree_update_updated_root() or + * btree_update_updated_node: + */ +static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, + struct bkey_s_c k, + struct bch_fs_usage *stats) +{ + struct bch_fs *c = as->c; + struct pending_btree_node_free *d; + unsigned replicas; + + /* + * btree_update lock is only needed here to avoid racing with + * gc: + */ + mutex_lock(&c->btree_interior_update_lock); + + for (d = as->pending; d < as->pending + as->nr_pending; d++) + if (!bkey_cmp(k.k->p, d->key.k.p) && + btree_key_matches(c, bkey_s_c_to_extent(k), + bkey_i_to_s_c_extent(&d->key))) + goto found; + BUG(); +found: + BUG_ON(d->index_update_done); + d->index_update_done = true; + + /* + * Btree nodes are accounted as freed in bch_alloc_stats when they're + * freed from the index: + */ + replicas = bch2_extent_nr_dirty_ptrs(k); + if (replicas) + stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size; + + /* + * We're dropping @k from the btree, but it's still live until the + * index update is persistent so we need to keep a reference around for + * mark and sweep to find - that's primarily what the + * btree_node_pending_free list is for. + * + * So here (when we set index_update_done = true), we're moving an + * existing reference to a different part of the larger "gc keyspace" - + * and the new position comes after the old position, since GC marks + * the pending free list after it walks the btree. + * + * If we move the reference while mark and sweep is _between_ the old + * and the new position, mark and sweep will see the reference twice + * and it'll get double accounted - so check for that here and subtract + * to cancel out one of mark and sweep's markings if necessary: + */ + + /* + * bch2_mark_key() compares the current gc pos to the pos we're + * moving this reference from, hence one comparison here: + */ + if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { + struct bch_fs_usage tmp = { 0 }; + + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + -c->opts.btree_node_size, true, b + ? 
gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id), + &tmp, 0, 0); + /* + * Don't apply tmp - pending deletes aren't tracked in + * bch_alloc_stats: + */ + } + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void __btree_node_free(struct bch_fs *c, struct btree *b) +{ + trace_btree_node_free(c, b); + + BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_need_write(b)); + BUG_ON(b == btree_node_root(c, b)); + BUG_ON(b->ob.nr); + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable); + + clear_btree_node_noevict(b); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +} + +void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) +{ + struct btree_ob_ref ob = b->ob; + + btree_update_drop_new_node(c, b); + + b->ob.nr = 0; + + clear_btree_node_dirty(b); + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); + + bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); +} + +void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + /* + * Is this a node that isn't reachable on disk yet? + * + * Nodes that aren't reachable yet have writes blocked until they're + * reachable - now that we've cancelled any pending writes and moved + * things waiting on that write to wait on this update, we can drop this + * node from the list of nodes that the other update is making + * reachable, prior to freeing it: + */ + btree_update_drop_new_node(c, b); + + __bch2_btree_node_lock_write(b, iter); + __btree_node_free(c, b); + six_unlock_write(&b->lock); + + bch2_btree_iter_node_drop(iter, b); +} + +static void bch2_btree_node_free_ondisk(struct bch_fs *c, + struct pending_btree_node_free *pending) +{ + struct bch_fs_usage stats = { 0 }; + + BUG_ON(!pending->index_update_done); + + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + -c->opts.btree_node_size, true, + gc_phase(GC_PHASE_PENDING_DELETE), + &stats, 0, 0); + /* + * Don't apply stats - pending deletes aren't tracked in + * bch_alloc_stats: + */ +} + +void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) +{ + bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); +} + +static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + struct disk_reservation *res, + struct closure *cl, + unsigned flags) +{ + struct write_point *wp; + struct btree *b; + BKEY_PADDED(k) tmp; + struct bkey_i_extent *e; + struct btree_ob_ref ob; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; + + if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + nr_reserve = 0; + alloc_reserve = RESERVE_ALLOC; + } else if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = BTREE_NODE_RESERVE / 2; + alloc_reserve = RESERVE_BTREE; + } else { + nr_reserve = BTREE_NODE_RESERVE; + alloc_reserve = RESERVE_NONE; + } + + mutex_lock(&c->btree_reserve_cache_lock); + if (c->btree_reserve_cache_nr > nr_reserve) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + ob = a->ob; + bkey_copy(&tmp.k, &a->k); + mutex_unlock(&c->btree_reserve_cache_lock); + goto mem_alloc; + } + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + alloc_reserve, 0, cl); + if 
(IS_ERR(wp)) + return ERR_CAST(wp); + + if (wp->sectors_free < c->opts.btree_node_size) { + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr(wp, ob, i) + if (ob->sectors_free < c->opts.btree_node_size) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); + goto retry; + } + + e = bkey_extent_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + + ob.nr = 0; + bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_alloc_sectors_done(c, wp); +mem_alloc: + b = bch2_btree_node_mem_alloc(c); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); + BUG_ON(b->ob.nr); + + bkey_copy(&b->key, &tmp.k); + b->ob = ob; + + return b; +} + +static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) +{ + struct bch_fs *c = as->c; + struct btree *b; + + BUG_ON(level >= BTREE_MAX_DEPTH); + BUG_ON(!as->reserve->nr); + + b = as->reserve->b[--as->reserve->nr]; + + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); + + set_btree_node_accessed(b); + set_btree_node_dirty(b); + + bch2_bset_init_first(b, &b->data->keys); + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); + b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; + + bch2_btree_build_aux_trees(b); + + btree_node_will_make_reachable(as, b); + + trace_btree_node_alloc(c, b); + return b; +} + +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree *b, + struct bkey_format format) +{ + struct btree *n; + + n = bch2_btree_node_alloc(as, b->level); + + n->data->min_key = b->data->min_key; + n->data->max_key = b->data->max_key; + n->data->format = format; + SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + + btree_node_set_format(n, format); + + bch2_btree_sort_into(as->c, n, b); + + btree_node_reset_sib_u64s(n); + + n->key.k.p = b->key.k.p; + return n; +} + +static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree *b) +{ + struct bkey_format new_f = bch2_btree_calc_format(b); + + /* + * The keys might expand with the new format - if they wouldn't fit in + * the btree node anymore, use the old format for now: + */ + if (!bch2_btree_node_format_fits(as->c, b, &new_f)) + new_f = b->format; + + return __bch2_btree_node_alloc_replacement(as, b, new_f); +} + +static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) +{ + struct btree *b = bch2_btree_node_alloc(as, level); + + b->data->min_key = POS_MIN; + b->data->max_key = POS_MAX; + b->data->format = bch2_btree_calc_format(b); + b->key.k.p = POS_MAX; + + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + + six_unlock_write(&b->lock); + + return b; +} + +static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) +{ + bch2_disk_reservation_put(c, &reserve->disk_res); + + mutex_lock(&c->btree_reserve_cache_lock); + + while (reserve->nr) { + struct btree *b = reserve->b[--reserve->nr]; + + six_unlock_write(&b->lock); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_btree_open_bucket_put(c, b); + } + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); + + six_unlock_intent(&b->lock); 
+ } + + mutex_unlock(&c->btree_reserve_cache_lock); + + mempool_free(reserve, &c->btree_reserve_pool); +} + +static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, + unsigned nr_nodes, + unsigned flags, + struct closure *cl) +{ + struct btree_reserve *reserve; + struct btree *b; + struct disk_reservation disk_res = { 0, 0 }; + unsigned sectors = nr_nodes * c->opts.btree_node_size; + int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD; + + if (flags & BTREE_INSERT_NOFAIL) + disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; + + /* + * This check isn't necessary for correctness - it's just to potentially + * prevent us from doing a lot of work that'll end up being wasted: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + return ERR_PTR(ret); + + if (bch2_disk_reservation_get(c, &disk_res, sectors, + c->opts.metadata_replicas, + disk_res_flags)) + return ERR_PTR(-ENOSPC); + + BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: + */ + ret = bch2_btree_cache_cannibalize_lock(c, cl); + if (ret) { + bch2_disk_reservation_put(c, &disk_res); + return ERR_PTR(ret); + } + + reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); + + reserve->disk_res = disk_res; + reserve->nr = 0; + + while (reserve->nr < nr_nodes) { + b = __bch2_btree_node_alloc(c, &disk_res, + flags & BTREE_INSERT_NOWAIT + ? NULL : cl, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err_free; + } + + ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + bkey_i_to_s_c(&b->key)); + if (ret) + goto err_free; + + reserve->b[reserve->nr++] = b; + } + + bch2_btree_cache_cannibalize_unlock(c); + return reserve; +err_free: + bch2_btree_reserve_put(c, reserve); + bch2_btree_cache_cannibalize_unlock(c); + trace_btree_reserve_get_fail(c, nr_nodes, cl); + return ERR_PTR(ret); +} + +/* Asynchronous interior node update machinery */ + +static void bch2_btree_update_free(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + BUG_ON(as->nr_new_nodes); + BUG_ON(as->nr_pending); + + if (as->reserve) + bch2_btree_reserve_put(c, as->reserve); + + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->list); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + percpu_ref_put(&c->writes); + + closure_wake_up(&c->btree_interior_update_wait); + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_nodes_reachable(struct closure *cl) +{ + struct btree_update *as = container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; + + bch2_journal_pin_drop(&c->journal, &as->journal); + + mutex_lock(&c->btree_interior_update_lock); + + while (as->nr_new_nodes) { + struct btree *b = as->new_nodes[--as->nr_new_nodes]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; + mutex_unlock(&c->btree_interior_update_lock); + + /* + * b->will_make_reachable prevented it from being written, so + * write it now if it needs to be written: + */ + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); + six_unlock_read(&b->lock); + mutex_lock(&c->btree_interior_update_lock); + } + + while (as->nr_pending) + bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); + + mutex_unlock(&c->btree_interior_update_lock); + + closure_wake_up(&as->wait); + + bch2_btree_update_free(as); +} + +static void btree_update_wait_on_journal(struct closure *cl) +{ + struct btree_update *as = container_of(cl, 
struct btree_update, cl);
+ struct bch_fs *c = as->c;
+ int ret;
+
+ ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
+ if (ret < 0)
+ goto err;
+ if (!ret) {
+ continue_at(cl, btree_update_wait_on_journal, system_wq);
+ return;
+ }
+
+ bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
+err:
+ continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
+static void btree_update_nodes_written(struct closure *cl)
+{
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct bch_fs *c = as->c;
+ struct btree *b;
+
+ /*
+ * We did an update to a parent node where the pointers we added pointed
+ * to child nodes that weren't written yet: now, the child nodes have
+ * been written so we can write out the update to the interior node.
+ */
+retry:
+ mutex_lock(&c->btree_interior_update_lock);
+ as->nodes_written = true;
+
+ switch (as->mode) {
+ case BTREE_INTERIOR_NO_UPDATE:
+ BUG();
+ case BTREE_INTERIOR_UPDATING_NODE:
+ /* The usual case: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(!btree_node_dirty(b));
+ closure_wait(&btree_current_write(b)->wait, cl);
+
+ list_del(&as->write_blocked_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * b->write_blocked prevented it from being written, so
+ * write it now if it needs to be written:
+ */
+ bch2_btree_node_write_cond(c, b, true);
+ six_unlock_read(&b->lock);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_AS:
+ /*
+ * The btree node we originally updated has been freed and is
+ * being rewritten - so we don't need to write anything here, we
+ * just need to signal to that btree_update that it's ok to make
+ * the new replacement node visible:
+ */
+ closure_put(&as->parent_as->cl);
+
+ /*
+ * and then we have to wait on that btree_update to finish:
+ */
+ closure_wait(&as->parent_as->wait, cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_ROOT:
+ /* b is the new btree root: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(c->btree_roots[b->btree_id].as != as);
+ c->btree_roots[b->btree_id].as = NULL;
+
+ bch2_btree_set_root_ondisk(c, b, WRITE);
+
+ /*
+ * We don't have to wait on anything here (before
+ * btree_update_nodes_reachable frees the old nodes
+ * ondisk) - we've ensured that the very next journal write will
+ * have the pointer to the new root, and before the allocator
+ * can reuse the old nodes it'll have to do a journal commit:
+ */
+ six_unlock_read(&b->lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Bit of funny circularity going on here we have to break:
+ *
+ * We have to drop our journal pin before writing the journal
+ * entry that points to the new btree root: else, we could
+ * deadlock if the journal currently happens to be full.
+ * + * This mean we're dropping the journal pin _before_ the new + * nodes are technically reachable - but this is safe, because + * after the bch2_btree_set_root_ondisk() call above they will + * be reachable as of the very next journal write: + */ + bch2_journal_pin_drop(&c->journal, &as->journal); + + as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); + + btree_update_wait_on_journal(cl); + return; + } + + continue_at(cl, btree_update_nodes_reachable, system_wq); +} + +/* + * We're updating @b with pointers to nodes that haven't finished writing yet: + * block @b from being written until @as completes + */ +static void btree_update_updated_node(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); + + /* + * In general, when you're staging things in a journal that will later + * be written elsewhere, and you also want to guarantee ordering: that + * is, if you have updates a, b, c, after a crash you should never see c + * and not a or b - there's a problem: + * + * If the final destination of the update(s) (i.e. btree node) can be + * written/flushed _before_ the relevant journal entry - oops, that + * breaks ordering, since the various leaf nodes can be written in any + * order. + * + * Normally we use bset->journal_seq to deal with this - if during + * recovery we find a btree node write that's newer than the newest + * journal entry, we just ignore it - we don't need it, anything we're + * supposed to have (that we reported as completed via fsync()) will + * still be in the journal, and as far as the state of the journal is + * concerned that btree node write never happened. + * + * That breaks when we're rewriting/splitting/merging nodes, since we're + * mixing btree node writes that haven't happened yet with previously + * written data that has been reported as completed to the journal. + * + * Thus, before making the new nodes reachable, we have to wait the + * newest journal sequence number we have data for to be written (if it + * hasn't been yet). + */ + bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); +} + +static void interior_update_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct btree_update *as = + container_of(pin, struct btree_update, journal); + + bch2_journal_flush_seq_async(j, as->journal_seq, NULL); +} + +static void btree_update_reparent(struct btree_update *as, + struct btree_update *child) +{ + struct bch_fs *c = as->c; + + child->b = NULL; + child->mode = BTREE_INTERIOR_UPDATING_AS; + child->parent_as = as; + closure_get(&as->cl); + + /* + * When we write a new btree root, we have to drop our journal pin + * _before_ the new nodes are technically reachable; see + * btree_update_nodes_written(). + * + * This goes for journal pins that are recursively blocked on us - so, + * just transfer the journal pin to the new interior update so + * btree_update_nodes_written() can drop it. 
+ */ + bch2_journal_pin_add_if_older(&c->journal, &child->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &child->journal); + + as->journal_seq = max(as->journal_seq, child->journal_seq); +} + +static void btree_update_updated_root(struct btree_update *as) +{ + struct bch_fs *c = as->c; + struct btree_root *r = &c->btree_roots[as->btree_id]; + + mutex_lock(&c->btree_interior_update_lock); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + + /* + * Old root might not be persistent yet - if so, redirect its + * btree_update operation to point to us: + */ + if (r->as) + btree_update_reparent(as, r->as); + + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->b = r->b; + r->as = as; + + mutex_unlock(&c->btree_interior_update_lock); + + /* + * When we're rewriting nodes and updating interior nodes, there's an + * issue with updates that haven't been written in the journal getting + * mixed together with older data - see btree_update_updated_node() + * for the explanation. + * + * However, this doesn't affect us when we're writing a new btree root - + * because to make that new root reachable we have to write out a new + * journal entry, which must necessarily be newer than as->journal_seq. + */ +} + +static void btree_node_will_make_reachable(struct btree_update *as, + struct btree *b) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; + + closure_get(&as->cl); + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) +{ + struct btree_update *as; + unsigned long v; + unsigned i; + + mutex_lock(&c->btree_interior_update_lock); + v = xchg(&b->will_make_reachable, 0); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } + + for (i = 0; i < as->nr_new_nodes; i++) + if (as->new_nodes[i] == b) + goto found; + + BUG(); +found: + array_remove_item(as->new_nodes, as->nr_new_nodes, i); + mutex_unlock(&c->btree_interior_update_lock); + + if (v & 1) + closure_put(&as->cl); +} + +static void btree_interior_update_add_node_reference(struct btree_update *as, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + + /* Add this node to the list of nodes being freed: */ + BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); + + d = &as->pending[as->nr_pending++]; + d->index_update_done = false; + d->seq = b->data->keys.seq; + d->btree_id = b->btree_id; + d->level = b->level; + bkey_copy(&d->key, &b->key); + + mutex_unlock(&c->btree_interior_update_lock); +} + +/* + * @b is being split/rewritten: it may have pointers to not-yet-written btree + * nodes and thus outstanding btree_updates - redirect @b's + * btree_updates to point to this btree_update: + */ +void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct closure *cl, *cl_n; + struct btree_update *p, *n; + struct btree_write *w; + struct bset_tree *t; + + set_btree_node_dying(b); + + if (btree_node_fake(b)) + return; + + btree_interior_update_add_node_reference(as, b); + + /* + * Does this node have data that hasn't been written in the journal? 
+ * + * If so, we have to wait for the corresponding journal entry to be + * written before making the new nodes reachable - we can't just carry + * over the bset->journal_seq tracking, since we'll be mixing those keys + * in with keys that aren't in the journal anymore: + */ + for_each_bset(b, t) + as->journal_seq = max(as->journal_seq, + le64_to_cpu(bset(b, t)->journal_seq)); + + mutex_lock(&c->btree_interior_update_lock); + + /* + * Does this node have any btree_update operations preventing + * it from being written? + * + * If so, redirect them to point to this btree_update: we can + * write out our new nodes, but we won't make them visible until those + * operations complete + */ + list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { + list_del(&p->write_blocked_list); + btree_update_reparent(as, p); + } + + clear_btree_node_dirty(b); + clear_btree_node_need_write(b); + w = btree_current_write(b); + + /* + * Does this node have any btree_update operations waiting on this node + * to be written? + * + * If so, wake them up when this btree_update operation is reachable: + */ + llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list) + llist_add(&cl->list, &as->wait.list); + + /* + * Does this node have unwritten data that has a pin on the journal? + * + * If so, transfer that pin to the btree_update operation - + * note that if we're freeing multiple nodes, we only need to keep the + * oldest pin of any of the nodes we're freeing. We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ + bch2_journal_pin_add_if_older(&c->journal, &w->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &w->journal); + + w = btree_prev_write(b); + bch2_journal_pin_add_if_older(&c->journal, &w->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); +} + +void bch2_btree_update_done(struct btree_update *as) +{ + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + bch2_btree_reserve_put(as->c, as->reserve); + as->reserve = NULL; + + continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq); +} + +struct btree_update * +bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + unsigned nr_nodes, unsigned flags, + struct closure *cl) +{ + struct btree_reserve *reserve; + struct btree_update *as; + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return ERR_PTR(-EROFS); + + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); + if (IS_ERR(reserve)) { + percpu_ref_put(&c->writes); + return ERR_CAST(reserve); + } + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->btree_id = id; + as->reserve = reserve; + INIT_LIST_HEAD(&as->write_blocked_list); + + bch2_keylist_init(&as->parent_keys, as->inline_keys); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + return as; +} + +/* Btree root updates: */ + +static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) +{ + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache.lock); + + mutex_lock(&c->btree_root_lock); + BUG_ON(btree_node_root(c, b) && + (b->level < btree_node_root(c, b)->level || + !btree_node_dying(btree_node_root(c, 
b)))); + + btree_node_root(c, b) = b; + mutex_unlock(&c->btree_root_lock); + + bch2_recalc_btree_reserve(c); +} + +static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree *old = btree_node_root(c, b); + struct bch_fs_usage stats = { 0 }; + + __bch2_btree_set_root_inmem(c, b); + + bch2_mark_key(c, bkey_i_to_s_c(&b->key), + c->opts.btree_node_size, true, + gc_pos_btree_root(b->btree_id), + &stats, 0, 0); + + if (old && !btree_node_fake(old)) + bch2_btree_node_free_index(as, NULL, + bkey_i_to_s_c(&old->key), + &stats); + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_root(b->btree_id)); +} + +static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) +{ + struct btree_root *r = &c->btree_roots[b->btree_id]; + + mutex_lock(&c->btree_root_lock); + + BUG_ON(b != r->b); + bkey_copy(&r->key, &b->key); + r->level = b->level; + r->alive = true; + if (rw == WRITE) + c->btree_roots_dirty = true; + + mutex_unlock(&c->btree_root_lock); +} + +/** + * bch_btree_set_root - update the root in memory and on disk + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. However, you must hold an intent lock on the + * old root. + * + * Note: This allocates a journal entry but doesn't add any keys to + * it. All the btree roots are part of every journal write, so there + * is nothing new to be done. This just guarantees that there is a + * journal write. + */ +static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + struct btree_iter *iter) +{ + struct bch_fs *c = as->c; + struct btree *old; + + trace_btree_set_root(c, b); + BUG_ON(!b->written); + + old = btree_node_root(c, b); + + /* + * Ensure no one is using the old root while we switch to the + * new root: + */ + bch2_btree_node_lock_write(old, iter); + + bch2_btree_set_root_inmem(as, b); + + btree_update_updated_root(as); + + /* + * Unlock old root after new root is visible: + * + * The new root isn't persistent, but that's ok: we still have + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. 
+ */ + bch2_btree_node_unlock_write(old, iter); +} + +/* Interior node updates: */ + +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, + struct btree_iter *iter, + struct bkey_i *insert, + struct btree_node_iter *node_iter) +{ + struct bch_fs *c = as->c; + struct bch_fs_usage stats = { 0 }; + struct bkey_packed *k; + struct bkey tmp; + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); + + if (bkey_extent_is_data(&insert->k)) + bch2_mark_key(c, bkey_i_to_s_c(insert), + c->opts.btree_node_size, true, + gc_pos_btree_node(b), &stats, 0, 0); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) + bch2_btree_node_iter_advance(node_iter, b); + + /* + * If we're overwriting, look up pending delete and mark so that gc + * marks it on the pending delete list: + */ + if (k && !bkey_cmp_packed(b, k, &insert->k)) + bch2_btree_node_free_index(as, b, + bkey_disassemble(b, k, &tmp), + &stats); + + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_node(b)); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); + set_btree_node_dirty(b); + set_btree_node_need_write(b); +} + +/* + * Move keys from n1 (original replacement node, now lower node) to n2 (higher + * node) + */ +static struct btree *__btree_split_node(struct btree_update *as, + struct btree *n1, + struct btree_iter *iter) +{ + size_t nr_packed = 0, nr_unpacked = 0; + struct btree *n2; + struct bset *set1, *set2; + struct bkey_packed *k, *prev = NULL; + + n2 = bch2_btree_node_alloc(as, n1->level); + + n2->data->max_key = n1->data->max_key; + n2->data->format = n1->format; + SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); + n2->key.k.p = n1->key.k.p; + + btree_node_set_format(n2, n2->data->format); + + set1 = btree_bset_first(n1); + set2 = btree_bset_first(n2); + + /* + * Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + k = set1->start; + while (1) { + if (bkey_next(k) == vstruct_last(set1)) + break; + if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) + break; + + if (bkey_packed(k)) + nr_packed++; + else + nr_unpacked++; + + prev = k; + k = bkey_next(k); + } + + BUG_ON(!prev); + + n1->key.k.p = bkey_unpack_pos(n1, prev); + n1->data->max_key = n1->key.k.p; + n2->data->min_key = + btree_type_successor(n1->btree_id, n1->key.k.p); + + set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); + set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); + + set_btree_bset_end(n1, n1->set); + set_btree_bset_end(n2, n2->set); + + n2->nr.live_u64s = le16_to_cpu(set2->u64s); + n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); + n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; + n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; + + n1->nr.live_u64s = le16_to_cpu(set1->u64s); + n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); + n1->nr.packed_keys = nr_packed; + n1->nr.unpacked_keys = nr_unpacked; + + BUG_ON(!set1->u64s); + BUG_ON(!set2->u64s); + + memcpy_u64s(set2->start, + vstruct_end(set1), + le16_to_cpu(set2->u64s)); + + btree_node_reset_sib_u64s(n1); + btree_node_reset_sib_u64s(n2); + + bch2_verify_btree_nr_keys(n1); + bch2_verify_btree_nr_keys(n2); + + if (n1->level) { + btree_node_interior_verify(n1); + btree_node_interior_verify(n2); + } + + return n2; +} + +/* + * For updates to interior nodes, we've got to do the insert before we split + * because the stuff we're inserting has to be inserted atomically. 
Post split, + * the keys might have to go in different nodes and the split would no longer be + * atomic. + * + * Worse, if the insert is from btree node coalescing, if we do the insert after + * we do the split (and pick the pivot) - the pivot we pick might be between + * nodes that were coalesced, and thus in the middle of a child node post + * coalescing: + */ +static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct btree_iter *iter, + struct keylist *keys) +{ + struct btree_node_iter node_iter; + struct bkey_i *k = bch2_keylist_front(keys); + struct bkey_packed *p; + struct bset *i; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + + bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false); + + while (!bch2_keylist_empty(keys)) { + k = bch2_keylist_front(keys); + + BUG_ON(bch_keylist_u64s(keys) > + bch_btree_keys_u64s_remaining(as->c, b)); + BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); + + bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); + bch2_keylist_pop_front(keys); + } + + /* + * We can't tolerate whiteouts here - with whiteouts there can be + * duplicate keys, and it would be rather bad if we picked a duplicate + * for the pivot: + */ + i = btree_bset_first(b); + p = i->start; + while (p != vstruct_last(i)) + if (bkey_deleted(p)) { + le16_add_cpu(&i->u64s, -p->u64s); + set_btree_bset_end(b, b->set); + memmove_u64s_down(p, bkey_next(p), + (u64 *) vstruct_last(i) - + (u64 *) p); + } else + p = bkey_next(p); + + BUG_ON(b->nsets != 1 || + b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); + + btree_node_interior_verify(b); +} + +static void btree_split(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys, + unsigned flags) +{ + struct bch_fs *c = as->c; + struct btree *parent = btree_node_parent(iter, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + u64 start_time = local_clock(); + + BUG_ON(!parent && (b != btree_node_root(c, b))); + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + + bch2_btree_interior_update_will_free_node(as, b); + + n1 = bch2_btree_node_alloc_replacement(as, b); + + if (keys) + btree_split_insert_keys(as, n1, iter, keys); + + if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { + trace_btree_split(c, b); + + n2 = __btree_split_node(as, n1, iter); + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n2->lock); + six_unlock_write(&n1->lock); + + bch2_btree_node_write(c, n2, SIX_LOCK_intent); + + /* + * Note that on recursive parent_keys == keys, so we + * can't start adding new keys to parent_keys before emptying it + * out (which we did with btree_split_insert_keys() above) + */ + bch2_keylist_add(&as->parent_keys, &n1->key); + bch2_keylist_add(&as->parent_keys, &n2->key); + + if (!parent) { + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(as, b->level + 1); + + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(as, n3, iter, &as->parent_keys); + + bch2_btree_node_write(c, n3, SIX_LOCK_intent); + } + } else { + trace_btree_compact(c, b); + + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->lock); + + bch2_keylist_add(&as->parent_keys, &n1->key); + } + + bch2_btree_node_write(c, n1, SIX_LOCK_intent); + + /* New nodes all written, now make them visible: */ + + if (parent) { + /* Split a non root node */ + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + } else if 
(n3) {
+ bch2_btree_set_root(as, n3, iter);
+ } else {
+ /* Root filled up but didn't need to be split */
+ bch2_btree_set_root(as, n1, iter);
+ }
+
+ bch2_btree_open_bucket_put(c, n1);
+ if (n2)
+ bch2_btree_open_bucket_put(c, n2);
+ if (n3)
+ bch2_btree_open_bucket_put(c, n3);
+
+ /*
+ * Note - at this point other linked iterators could still have @b read
+ * locked; we're depending on the bch2_btree_iter_node_replace() calls
+ * below removing all references to @b so we don't return with other
+ * iterators pointing to a node they have locked that's been freed.
+ *
+ * We have to free the node first because the bch2_iter_node_replace()
+ * calls will drop _our_ iterator's reference - and intent lock - to @b.
+ */
+ bch2_btree_node_free_inmem(c, b, iter);
+
+ /* Successful split, update the iterator to point to the new nodes: */
+
+ if (n3)
+ bch2_btree_iter_node_replace(iter, n3);
+ if (n2)
+ bch2_btree_iter_node_replace(iter, n2);
+ bch2_btree_iter_node_replace(iter, n1);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
+}
+
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
+ struct btree_iter *iter, struct keylist *keys)
+{
+ struct btree_iter *linked;
+ struct btree_node_iter node_iter;
+ struct bkey_i *insert = bch2_keylist_front(keys);
+ struct bkey_packed *k;
+
+ /* Don't screw up @iter's position: */
+ node_iter = iter->l[b->level].iter;
+
+ /*
+ * btree_split(), btree_gc_coalesce() will insert keys before
+ * the iterator's current position - they know the keys go in
+ * the node the iterator points to:
+ */
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ ;
+
+ while (!bch2_keylist_empty(keys)) {
+ insert = bch2_keylist_front(keys);
+
+ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+ bch2_keylist_pop_front(keys);
+ }
+
+ btree_update_updated_node(as, b);
+
+ for_each_btree_iter_with_node(iter, b, linked)
+ bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
+
+ bch2_btree_iter_verify(iter, b);
+}
+
+/**
+ * bch_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @as: btree_update this insert is part of
+ * @b: btree node to insert into
+ * @iter: btree iterator
+ * @keys: list of keys to insert
+ * @flags: BTREE_INSERT_* flags
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
+ * If a split occurred, this function will return early. This can only happen
+ * for leaf nodes -- inserts into interior nodes have to be atomic.
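+ *
+ * A minimal usage sketch, as in btree_split() and
+ * __bch2_foreground_maybe_merge() in this file: keys destined for the parent
+ * are staged on as->parent_keys and then inserted in one call:
+ *
+ *	bch2_keylist_add(&as->parent_keys, &n->key);
+ *	bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);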
+ */ +void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys, + unsigned flags) +{ + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + BUG_ON(!b->level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + + if (as->must_rewrite) + goto split; + + bch2_btree_node_lock_for_insert(c, b, iter); + + if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(b, iter); + goto split; + } + + bch2_btree_insert_keys_interior(as, b, iter, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + bch2_btree_node_unlock_write(b, iter); + + btree_node_interior_verify(b); + + bch2_foreground_maybe_merge(c, iter, b->level, flags); + return; +split: + btree_split(as, b, iter, keys, flags); +} + +int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + unsigned flags) +{ + struct btree *b = iter->l[0].b; + struct btree_update *as; + struct closure cl; + int ret = 0; + struct btree_iter *linked; + + /* + * We already have a disk reservation and open buckets pinned; this + * allocation must not block: + */ + for_each_btree_iter(iter, linked) + if (linked->btree_id == BTREE_ID_EXTENTS) + flags |= BTREE_INSERT_USE_RESERVE; + + closure_init_stack(&cl); + + /* Hack, because gc and splitting nodes doesn't mix yet: */ + if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return -EINTR; + + bch2_btree_iter_unlock(iter); + down_read(&c->gc_lock); + + if (btree_iter_linked(iter)) + ret = -EINTR; + } + + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_iter_upgrade(iter, U8_MAX, + !(flags & BTREE_INSERT_NOUNLOCK))) { + ret = -EINTR; + goto out; + } + + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, b), flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); + if (IS_ERR(as)) { + ret = PTR_ERR(as); + if (ret == -EAGAIN) { + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + bch2_btree_iter_unlock(iter); + ret = -EINTR; + } + goto out; + } + + btree_split(as, b, iter, NULL, flags); + bch2_btree_update_done(as); + + /* + * We haven't successfully inserted yet, so don't downgrade all the way + * back to read locks; + */ + __bch2_btree_iter_downgrade(iter, 1); +out: + up_read(&c->gc_lock); + closure_sync(&cl); + return ret; +} + +void __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) +{ + struct btree_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; + struct closure cl; + size_t sib_u64s; + int ret = 0; + + closure_init_stack(&cl); +retry: + BUG_ON(!btree_node_locked(iter, level)); + + b = iter->l[level].b; + + parent = btree_node_parent(iter, b); + if (!parent) + goto out; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) + goto out; + + /* XXX: can't be holding read locks */ + m = bch2_btree_node_get_sibling(c, iter, b, + !(flags & BTREE_INSERT_NOUNLOCK), sib); + if (IS_ERR(m)) { + ret = PTR_ERR(m); + goto err; + } + + /* NULL means no sibling: */ + if (!m) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } + + if (sib == btree_prev_sib) { + prev = m; + next = b; + } else { + prev = b; + next = m; + } + + bch2_bkey_format_init(&new_s); + __bch2_btree_calc_format(&new_s, b); + __bch2_btree_calc_format(&new_s, m); + new_f = bch2_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b, &new_f) + + btree_node_u64s_with_format(m, &new_f); + + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { + sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + sib_u64s /= 2; + sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { + six_unlock_intent(&m->lock); + goto out; + } + + /* We're changing btree topology, doesn't mix with gc: */ + if (!down_read_trylock(&c->gc_lock)) + goto err_cycle_gc_lock; + + if (!bch2_btree_iter_upgrade(iter, U8_MAX, + !(flags & BTREE_INSERT_NOUNLOCK))) { + ret = -EINTR; + goto err_unlock; + } + + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, parent) + 1, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); + if (IS_ERR(as)) { + ret = PTR_ERR(as); + goto err_unlock; + } + + trace_btree_merge(c, b); + + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + + n = bch2_btree_node_alloc(as, b->level); + + n->data->min_key = prev->data->min_key; + n->data->max_key = next->data->max_key; + n->data->format = new_f; + n->key.k.p = next->key.k.p; + + btree_node_set_format(n, new_f); + + bch2_btree_sort_into(c, n, prev); + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->lock); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + + bch2_btree_node_write(c, n, SIX_LOCK_intent); + + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + + bch2_btree_open_bucket_put(c, n); + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_iter_node_replace(iter, n); + + bch2_btree_iter_verify(iter, n); + + bch2_btree_update_done(as); + + six_unlock_intent(&m->lock); + up_read(&c->gc_lock); +out: + /* + * Don't downgrade locks here: we're called after successful insert, + * and the caller will downgrade locks after a successful insert + * anyways (in case e.g. a split was required first) + * + * And we're also called when inserting into interior nodes in the + * split path, and downgrading to read locks in there is potentially + * confusing: + */ + closure_sync(&cl); + return; + +err_cycle_gc_lock: + six_unlock_intent(&m->lock); + + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + bch2_btree_iter_unlock(iter); + + down_read(&c->gc_lock); + up_read(&c->gc_lock); + ret = -EINTR; + goto err; + +err_unlock: + six_unlock_intent(&m->lock); + up_read(&c->gc_lock); +err: + BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + + if ((ret == -EAGAIN || ret == -EINTR) && + !(flags & BTREE_INSERT_NOUNLOCK)) { + bch2_btree_iter_unlock(iter); + closure_sync(&cl); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + goto retry; + } + + goto out; +} + +static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + struct btree *b, unsigned flags, + struct closure *cl) +{ + struct btree *n, *parent = btree_node_parent(iter, b); + struct btree_update *as; + + as = bch2_btree_update_start(c, iter->btree_id, + (parent + ? btree_update_reserve_required(c, parent) + : 0) + 1, + flags, cl); + if (IS_ERR(as)) { + trace_btree_gc_rewrite_node_fail(c, b); + return PTR_ERR(as); + } + + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, b); + + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->lock); + + trace_btree_gc_rewrite_node(c, b); + + bch2_btree_node_write(c, n, SIX_LOCK_intent); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + } else { + bch2_btree_set_root(as, n, iter); + } + + bch2_btree_open_bucket_put(c, n); + + bch2_btree_node_free_inmem(c, b, iter); + + bch2_btree_iter_node_replace(iter, n); + + bch2_btree_update_done(as); + return 0; +} + +/** + * bch_btree_node_rewrite - Rewrite/move a btree node + * + * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
+ * btree_check_reserve() has to wait) + */ +int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + __le64 seq, unsigned flags) +{ + struct closure cl; + struct btree *b; + int ret; + + flags |= BTREE_INSERT_NOFAIL; + + closure_init_stack(&cl); + + bch2_btree_iter_upgrade(iter, U8_MAX, true); + + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { + if (!down_read_trylock(&c->gc_lock)) { + bch2_btree_iter_unlock(iter); + down_read(&c->gc_lock); + } + } + + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (ret) + break; + + b = bch2_btree_iter_peek_node(iter); + if (!b || b->data->keys.seq != seq) + break; + + ret = __btree_node_rewrite(c, iter, b, flags, &cl); + if (ret != -EAGAIN && + ret != -EINTR) + break; + + bch2_btree_iter_unlock(iter); + closure_sync(&cl); + } + + bch2_btree_iter_downgrade(iter); + + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); + + closure_sync(&cl); + return ret; +} + +static void __bch2_btree_node_update_key(struct bch_fs *c, + struct btree_update *as, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i_extent *new_key) +{ + struct btree *parent; + int ret; + + /* + * Two corner cases that need to be thought about here: + * + * @b may not be reachable yet - there might be another interior update + * operation waiting on @b to be written, and we're gonna deliver the + * write completion to that interior update operation _before_ + * persisting the new_key update + * + * That ends up working without us having to do anything special here: + * the reason is, we do kick off (and do the in memory updates) for the + * update for @new_key before we return, creating a new interior_update + * operation here. + * + * The new interior update operation here will in effect override the + * previous one. The previous one was going to terminate - make @b + * reachable - in one of two ways: + * - updating the btree root pointer + * In that case, + * no, this doesn't work. argh. 
+ */ + + if (b->will_make_reachable) + as->must_rewrite = true; + + btree_interior_update_add_node_reference(as, b); + + parent = btree_node_parent(iter, b); + if (parent) { + if (new_hash) { + bkey_copy(&new_hash->key, &new_key->k_i); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->level, b->btree_id); + BUG_ON(ret); + } + + bch2_keylist_add(&as->parent_keys, &new_key->k_i); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new_key->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, &new_key->k_i); + } + } else { + struct bch_fs_usage stats = { 0 }; + + BUG_ON(btree_node_root(c, b) != b); + + bch2_btree_node_lock_write(b, iter); + + bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), + c->opts.btree_node_size, true, + gc_pos_btree_root(b->btree_id), + &stats, 0, 0); + bch2_btree_node_free_index(as, NULL, + bkey_i_to_s_c(&b->key), + &stats); + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_root(b->btree_id)); + + if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new_key->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, &new_key->k_i); + } + + btree_update_updated_root(as); + bch2_btree_node_unlock_write(b, iter); + } + + bch2_btree_update_done(as); +} + +int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + struct btree *b, struct bkey_i_extent *new_key) +{ + struct btree *parent = btree_node_parent(iter, b); + struct btree_update *as = NULL; + struct btree *new_hash = NULL; + struct closure cl; + int ret; + + closure_init_stack(&cl); + + if (!bch2_btree_iter_upgrade(iter, U8_MAX, true)) + return -EINTR; + + if (!down_read_trylock(&c->gc_lock)) { + bch2_btree_iter_unlock(iter); + down_read(&c->gc_lock); + + if (!bch2_btree_iter_relock(iter)) { + ret = -EINTR; + goto err; + } + } + + /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ + if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + /* bch2_btree_reserve_get will unlock */ + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { + ret = -EINTR; + + bch2_btree_iter_unlock(iter); + up_read(&c->gc_lock); + closure_sync(&cl); + down_read(&c->gc_lock); + + if (!bch2_btree_iter_relock(iter)) + goto err; + } + + new_hash = bch2_btree_node_mem_alloc(c); + } + + as = bch2_btree_update_start(c, iter->btree_id, + parent ? 
btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE, + &cl); + + if (IS_ERR(as)) { + ret = PTR_ERR(as); + if (ret == -EAGAIN) + ret = -EINTR; + + if (ret != -EINTR) + goto err; + + bch2_btree_iter_unlock(iter); + up_read(&c->gc_lock); + closure_sync(&cl); + down_read(&c->gc_lock); + + if (!bch2_btree_iter_relock(iter)) + goto err; + } + + ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + extent_i_to_s_c(new_key).s_c); + if (ret) + goto err_free_update; + + __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + + bch2_btree_iter_downgrade(iter); +err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + list_move(&new_hash->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&new_hash->lock); + six_unlock_intent(&new_hash->lock); + } + up_read(&c->gc_lock); + closure_sync(&cl); + return ret; +err_free_update: + bch2_btree_update_free(as); + goto err; +} + +/* Init code: */ + +/* + * Only for filesystem bringup, when first reading the btree roots or allocating + * btree roots when initializing a new filesystem: + */ +void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) +{ + BUG_ON(btree_node_root(c, b)); + + __bch2_btree_set_root_inmem(c, b); + bch2_btree_set_root_ondisk(c, b, READ); +} + +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +{ + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(c); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); + b->level = 0; + b->btree_id = id; + + bkey_extent_init(&b->key); + b->key.k.p = POS_MAX; + bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id; + + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); + + b->data->min_key = POS_MIN; + b->data->max_key = POS_MAX; + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); + BUG_ON(ret); + + __bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); +} + +ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct btree_update *as; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) + out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + bch2_journal_pin_seq(&c->journal, &as->journal)); + mutex_unlock(&c->btree_interior_update_lock); + + return out - buf; +} + +size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +{ + size_t ret = 0; + struct list_head *i; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each(i, &c->btree_interior_update_list) + ret++; + mutex_unlock(&c->btree_interior_update_lock); + + return ret; +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 index 000000000000..7a19a52bbcff --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H + +#include "btree_cache.h" +#include "btree_locking.h" +#include "btree_update.h" + +struct 
btree_reserve { + struct disk_reservation disk_res; + unsigned nr; + struct btree *b[BTREE_RESERVE_MAX]; +}; + +void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, + struct bkey_format *); + +/* Btree node freeing/allocation: */ + +/* + * Tracks a btree node that has been (or is about to be) freed in memory, but + * has _not_ yet been freed on disk (because the write that makes the new + * node(s) visible and frees the old hasn't completed yet) + */ +struct pending_btree_node_free { + bool index_update_done; + + __le64 seq; + enum btree_id btree_id; + unsigned level; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); +}; + +/* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: + * + * When we split/rewrite a node, we do all the updates in memory without + * waiting for any writes to complete - we allocate the new node(s) and update + * the parent node, possibly recursively up to the root. + * + * The end result is that we have one or more new nodes being written - + * possibly several, if there were multiple splits - and then a write (updating + * an interior node) which will make all these new nodes visible. + * + * Additionally, as we split/rewrite nodes we free the old nodes - but the old + * nodes can't be freed (their space on disk can't be reclaimed) until the + * update to the interior node that makes the new node visible completes - + * until then, the old nodes are still reachable on disk. + * + */ +struct btree_update { + struct closure cl; + struct bch_fs *c; + + struct list_head list; + + /* What kind of update are we doing? */ + enum { + BTREE_INTERIOR_NO_UPDATE, + BTREE_INTERIOR_UPDATING_NODE, + BTREE_INTERIOR_UPDATING_ROOT, + BTREE_INTERIOR_UPDATING_AS, + } mode; + + unsigned must_rewrite:1; + unsigned nodes_written:1; + + enum btree_id btree_id; + + struct btree_reserve *reserve; + + /* + * BTREE_INTERIOR_UPDATING_NODE: + * The update that made the new nodes visible was a regular update to an + * existing interior node - @b. 
We can't write out the update to @b + * until the new nodes we created are finished writing, so we block @b + * from writing by putting this btree_interior update on the + * @b->write_blocked list with @write_blocked_list: + */ + struct btree *b; + struct list_head write_blocked_list; + + /* + * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now + * we're now blocking another btree_update + * @parent_as - btree_update that's waiting on our nodes to finish + * writing, before it can make new nodes visible on disk + * @wait - list of child btree_updates that are waiting on this + * btree_update to make all the new nodes visible before they can free + * their old btree nodes + */ + struct btree_update *parent_as; + struct closure_waitlist wait; + + /* + * We may be freeing nodes that were dirty, and thus had journal entries + * pinned: we need to transfer the oldest of those pins to the + * btree_update operation, and release it when the new node(s) + * are all persistent and reachable: + */ + struct journal_entry_pin journal; + + u64 journal_seq; + + /* + * Nodes being freed: + * Protected by c->btree_node_pending_free_lock + */ + struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; + unsigned nr_pending; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + unsigned nr_new_nodes; + + /* Only here to reduce stack usage on recursive splits: */ + struct keylist parent_keys; + /* + * Enough room for btree_split's keys without realloc - btree node + * pointers never have crc/compression info, so we only need to acount + * for the pointers for three keys + */ + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; +}; + +#define for_each_pending_btree_node_free(c, as, p) \ + list_for_each_entry(as, &c->btree_interior_update_list, list) \ + for (p = as->pending; p < as->pending + as->nr_pending; p++) + +void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, + struct btree_iter *); +void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); +void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *); + +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree *, + struct bkey_format); + +void bch2_btree_update_done(struct btree_update *); +struct btree_update * +bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, + unsigned, struct closure *); + +void bch2_btree_interior_update_will_free_node(struct btree_update *, + struct btree *); + +void bch2_btree_insert_node(struct btree_update *, struct btree *, + struct btree_iter *, struct keylist *, + unsigned); +int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); + +void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); + +static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, unsigned flags, + enum btree_node_sibling sib) +{ + struct btree *b; + + /* + * iterators are inconsistent when they hit end of leaf, until + * traversed again + * + * XXX inconsistent how? 
+ */ + if (iter->flags & BTREE_ITER_AT_END_OF_LEAF) + return; + + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return; + + if (!bch2_btree_node_relock(iter, level)) + return; + + b = iter->l[level].b; + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + return; + + __bch2_foreground_maybe_merge(c, iter, level, flags, sib); +} + +static inline void bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags) +{ + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); +} + +void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); +void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); + +static inline unsigned btree_update_reserve_required(struct bch_fs *c, + struct btree *b) +{ + unsigned depth = btree_node_root(c, b)->level + 1; + + /* + * Number of nodes we might have to allocate in a worst case btree + * split operation - we split all the way up to the root, then allocate + * a new root, unless we're already at max depth: + */ + if (depth < BTREE_MAX_DEPTH) + return (depth - b->level) * 2 + 1; + else + return (depth - b->level) * 2 - 1; +} + +static inline void btree_node_reset_sib_u64s(struct btree *b) +{ + b->sib_u64s[0] = b->nr.live_u64s; + b->sib_u64s[1] = b->nr.live_u64s; +} + +static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +{ + return (void *) b->data + btree_bytes(c); +} + +static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, + struct btree *b) +{ + return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); +} + +static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, + struct btree *b) +{ + return btree_data_end(c, b); +} + +static inline void *write_block(struct btree *b) +{ + return (void *) b->data + (b->written << 9); +} + +static inline bool bset_written(struct btree *b, struct bset *i) +{ + return (void *) i < write_block(b); +} + +static inline bool bset_unwritten(struct btree *b, struct bset *i) +{ + return (void *) i > write_block(b); +} + +static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + struct btree *b, + void *end) +{ + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s + + b->uncompacted_whiteout_u64s; + ssize_t total = c->opts.btree_node_size << 6; + + return total - used; +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + ssize_t remaining = __bch_btree_u64s_remaining(c, b, + btree_bkey_last(b, bset_tree_last(b))); + + BUG_ON(remaining < 0); + + if (bset_written(b, btree_bset_last(b))) + return 0; + + return remaining; +} + +static inline unsigned btree_write_set_buffer(struct btree *b) +{ + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ + return 4 << 10; +} + +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + struct btree *b) +{ + struct bset *i = btree_bset_last(b); + struct btree_node_entry *bne = max(write_block(b), + (void *) btree_bkey_last(b, bset_tree_last(b))); + ssize_t remaining_space = + __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); + + if (unlikely(bset_written(b, i))) { + if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + return bne; + } else { + if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) + return bne; + } + + return NULL; +} + 
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + if (bset_written(b, bset(b, t))) { + EBUG_ON(b->uncompacted_whiteout_u64s < + bkeyp_key_u64s(&b->format, k)); + b->uncompacted_whiteout_u64s -= + bkeyp_key_u64s(&b->format, k); + } +} + +static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + if (bset_written(b, bset(b, t))) { + BUG_ON(!k->needs_whiteout); + b->uncompacted_whiteout_u64s += + bkeyp_key_u64s(&b->format, k); + } +} + +/* + * write lock must be held on @b (else the dirty bset that we were going to + * insert into could be written out from under us) + */ +static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) +{ + if (unlikely(btree_node_fake(b))) + return false; + + if (btree_node_is_extents(b)) { + /* The insert key might split an existing key + * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: + */ + u64s += BKEY_EXTENT_U64s_MAX; + } + + return u64s <= bch_btree_keys_u64s_remaining(c, b); +} + +static inline bool journal_res_insert_fits(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + unsigned u64s = 0; + struct btree_insert_entry *i; + + /* + * If we didn't get a journal reservation, we're in journal replay and + * we're not journalling updates: + */ + if (!trans->journal_res.ref) + return true; + + for (i = insert; i < trans->entries + trans->nr; i++) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + return u64s <= trans->journal_res.u64s; +} + +ssize_t bch2_btree_updates_print(struct bch_fs *, char *); + +size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 index 000000000000..4d1d0954efbf --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "trace.h" + +#include + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + const struct bkey_format *f = &b->format; + struct bkey_packed *k; + struct bset_tree *t; + unsigned clobber_u64s; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || + bkey_cmp(insert->k.p, b->data->max_key) > 0); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && !bkey_cmp_packed(b, k, &insert->k)) { + BUG_ON(bkey_whiteout(k)); + + t = bch2_bkey_to_bset(b, k); + + if (bset_unwritten(b, bset(b, t)) && + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && + !bkey_whiteout(&insert->k)) { + k->type = insert->k.type; + memcpy_u64s(bkeyp_val(f, k), &insert->v, + bkey_val_u64s(&insert->k)); + return true; + } + + insert->k.needs_whiteout = k->needs_whiteout; + + btree_keys_account_key_drop(&b->nr, t - b->set, k); + + if (t == bset_tree_last(b)) { + clobber_u64s = k->u64s; + + /* + * If 
we're deleting, and the key we're deleting doesn't + * need a whiteout (it wasn't overwriting a key that had + * been written to disk) - just delete it: + */ + if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { + bch2_bset_delete(b, k, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + k, clobber_u64s, 0); + return true; + } + + goto overwrite; + } + + k->type = KEY_TYPE_DELETED; + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + k->u64s, k->u64s); + + if (bkey_whiteout(&insert->k)) { + reserve_whiteout(b, t, k); + return true; + } else { + k->needs_whiteout = false; + } + } else { + /* + * Deleting, but the key to delete wasn't found - nothing to do: + */ + if (bkey_whiteout(&insert->k)) + return false; + + insert->k.needs_whiteout = false; + } + + t = bset_tree_last(b); + k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + clobber_u64s = 0; +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + clobber_u64s, k->u64s); + return true; +} + +static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, + (btree_current_write(b) == w && + w->journal.pin_list == journal_seq_pin(j, seq))); + six_unlock_read(&b->lock); +} + +static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +void bch2_btree_journal_key(struct btree_insert *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree *b = iter->l[0].b; + struct btree_write *w = btree_current_write(b); + + EBUG_ON(iter->level || b->level); + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + iter->btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = seq; + btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + } + + if (unlikely(!journal_pin_active(&w->journal))) { + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? trans->journal_res.seq + : j->replay_journal_seq; + + bch2_journal_pin_add(j, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? 
btree_node_flush0 + : btree_node_flush1); + } + + if (unlikely(!btree_node_dirty(b))) + set_btree_node_dirty(b); +} + +static enum btree_insert_ret +bch2_insert_fixup_key(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct btree_iter *iter = insert->iter; + struct btree_iter_level *l = &iter->l[0]; + + EBUG_ON(iter->level); + EBUG_ON(insert->k->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, l->b)); + + if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, + insert->k)) + bch2_btree_journal_key(trans, iter, insert->k); + + trans->did_work = true; + return BTREE_INSERT_OK; +} + +/** + * btree_insert_key - insert a key one key into a leaf node + */ +static enum btree_insert_ret +btree_insert_key_leaf(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + enum btree_insert_ret ret; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + ret = !btree_node_is_extents(b) + ? bch2_insert_fixup_key(trans, insert) + : bch2_insert_fixup_extent(trans, insert); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + trace_btree_insert_key(c, b, insert->k); + return ret; +} + +#define trans_for_each_entry(trans, i) \ + for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) + +/* + * We sort transaction entries so that if multiple iterators point to the same + * leaf node they'll be adjacent: + */ +static bool same_leaf_as_prev(struct btree_insert *trans, + struct btree_insert_entry *i) +{ + return i != trans->entries && + i[0].iter->l[0].b == i[-1].iter->l[0].b; +} + +static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans, + struct btree_insert_entry *i) +{ + struct btree *b = i->iter->l[0].b; + + do { + i++; + } while (i < trans->entries + trans->nr && b == i->iter->l[0].b); + + return i; +} + +#define trans_for_each_leaf(trans, i) \ + for ((i) = (trans)->entries; \ + (i) < (trans)->entries + (trans)->nr; \ + (i) = trans_next_leaf(trans, i)) + +inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + bch2_btree_node_lock_write(b, iter); + + if (btree_node_just_written(b) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(c, b, iter); +} + +static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_leaf(trans, i) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); +} + +static void multi_unlock_write(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_leaf(trans, i) + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); +} + +static inline int btree_trans_cmp(struct btree_insert_entry l, + struct btree_insert_entry r) +{ + return 
btree_iter_cmp(l.iter, r.iter); +} + +/* Normal update interface: */ + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_btree_insert_at(struct btree_insert *trans, + struct btree_iter **split, + bool *cycle_gc_lock) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + unsigned u64s; + int ret; + + trans_for_each_entry(trans, i) { + BUG_ON(i->done); + BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + } + + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) + ? bch2_journal_res_get(&c->journal, + &trans->journal_res, + u64s, u64s) + : 0; + if (ret) + return ret; + + multi_lock_write(c, trans); + + if (race_fault()) { + ret = -EINTR; + goto out; + } + + u64s = 0; + trans_for_each_entry(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + /* + * bch2_btree_node_insert_fits() must be called under write lock: + * with only an intent lock, another thread can still call + * bch2_btree_node_write(), converting an unwritten bset to a + * written one + */ + u64s += i->k->k.u64s + i->extra_res; + if (!bch2_btree_node_insert_fits(c, + i->iter->l[0].b, u64s)) { + ret = -EINTR; + *split = i->iter; + goto out; + } + } + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (journal_seq_verify(c)) + trans_for_each_entry(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (inject_invalid_keys(c)) + trans_for_each_entry(trans, i) + i->k->k.version = MAX_VERSION; + } + + trans_for_each_entry(trans, i) { + switch (btree_insert_key_leaf(trans, i)) { + case BTREE_INSERT_OK: + i->done = true; + break; + case BTREE_INSERT_JOURNAL_RES_FULL: + case BTREE_INSERT_NEED_TRAVERSE: + case BTREE_INSERT_NEED_RESCHED: + ret = -EINTR; + break; + case BTREE_INSERT_BTREE_NODE_FULL: + ret = -EINTR; + *split = i->iter; + break; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_GC_LOCK: + ret = -EINTR; + *cycle_gc_lock = true; + break; + default: + BUG(); + } + + /* + * If we did some work (i.e. inserted part of an extent), + * we have to do all the other updates as well: + */ + if (!trans->did_work && (ret || *split)) + break; + } +out: + multi_unlock_write(trans); + bch2_journal_res_put(&c->journal, &trans->journal_res); + + return ret; +} + +static inline void btree_insert_entry_checks(struct bch_fs *c, + struct btree_insert_entry *i) +{ + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && + bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id, + bkey_i_to_s_c(i->k))); +} + +/** + * __bch_btree_insert_at - insert keys at given iterator positions + * + * This is main entry point for btree updates. + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. 
+ * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +int __bch2_btree_insert_at(struct btree_insert *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_iter *linked, *split = NULL; + bool cycle_gc_lock = false; + unsigned flags; + int ret; + + BUG_ON(!trans->nr); + + for_each_btree_iter(trans->entries[0].iter, linked) + bch2_btree_iter_verify_locks(linked); + + /* for the sake of sanity: */ + BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + + trans_for_each_entry(trans, i) + btree_insert_entry_checks(c, i); + + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; +retry: + split = NULL; + cycle_gc_lock = false; + + trans_for_each_entry(trans, i) { + if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { + ret = -EINTR; + goto err; + } + + if (i->iter->flags & BTREE_ITER_ERROR) { + ret = -EIO; + goto err; + } + } + + ret = do_btree_insert_at(trans, &split, &cycle_gc_lock); + if (unlikely(ret)) + goto err; + + trans_for_each_leaf(trans, i) + bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + + trans_for_each_entry(trans, i) + bch2_btree_iter_downgrade(i->iter); +out: + percpu_ref_put(&c->writes); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + /* make sure we didn't drop or screw up locks: */ + for_each_btree_iter(trans->entries[0].iter, linked) { + bch2_btree_iter_verify_locks(linked); + BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) && + trans->did_work && + linked->uptodate >= BTREE_ITER_NEED_RELOCK); + } + + /* make sure we didn't lose an error: */ + if (!ret) + trans_for_each_entry(trans, i) + BUG_ON(!i->done); + } + + BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + + return ret; +err: + flags = trans->flags; + + /* + * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree + * update; if we haven't done anything yet it doesn't apply + */ + if (!trans->did_work) + flags &= ~BTREE_INSERT_NOUNLOCK; + + if (split) { + ret = bch2_btree_split_leaf(c, split, flags); + + /* + * if the split succeeded without dropping locks the insert will + * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the + * caller peeked() and is overwriting won't have changed) + */ +#if 0 + /* + * XXX: + * split -> btree node merging (of parent node) might still drop + * locks when we're not passing it BTREE_INSERT_NOUNLOCK + */ + if (!ret && !trans->did_work) + goto retry; +#endif + + /* + * don't care if we got ENOSPC because we told split it + * couldn't block: + */ + if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) + ret = -EINTR; + } + + if (cycle_gc_lock) { + if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + bch2_btree_iter_unlock(trans->entries[0].iter); + down_read(&c->gc_lock); + } + up_read(&c->gc_lock); + } + + if (ret == -EINTR) { + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + trans_for_each_entry(trans, i) { + int ret2 = bch2_btree_iter_traverse(i->iter); + if (ret2) { + ret = ret2; + goto out; + } + + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + } + + /* + * BTREE_ITER_ATOMIC means we have to return -EINTR if we + * dropped locks: + */ + if (!(flags & BTREE_INSERT_ATOMIC)) + goto retry; + } + + goto out; +} + +void bch2_trans_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + unsigned extra_journal_res) +{ + struct btree_insert_entry *i; + + BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); + + i = 
&trans->updates[trans->nr_updates++]; + + *i = (struct btree_insert_entry) { + .iter = iter, + .k = k, + .extra_res = extra_journal_res, + }; + + btree_insert_entry_checks(trans->c, i); +} + +int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, + unsigned flags) +{ + struct btree_insert insert = { + .c = trans->c, + .disk_res = disk_res, + .journal_seq = journal_seq, + .flags = flags, + .nr = trans->nr_updates, + .entries = trans->updates, + }; + + if (!trans->nr_updates) + return 0; + + trans->nr_updates = 0; + + return __bch2_btree_insert_at(&insert); +} + +int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags, + BTREE_INSERT_ENTRY(iter, &k)); +} + +int bch2_btree_insert_list_at(struct btree_iter *iter, + struct keylist *keys, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, unsigned flags) +{ + BUG_ON(flags & BTREE_INSERT_ATOMIC); + BUG_ON(bch2_keylist_empty(keys)); + bch2_verify_keylist_sorted(keys); + + while (!bch2_keylist_empty(keys)) { + int ret = bch2_btree_insert_at(iter->c, disk_res, hook, + journal_seq, flags, + BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); + if (ret) + return ret; + + bch2_keylist_pop_front(keys); + } + + return 0; +} + +/** + * bch_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @insert_keys: list of keys to insert + * @hook: insert callback + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + struct bkey_i *k, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, int flags) +{ + struct btree_iter iter; + int ret; + + bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, + BTREE_INSERT_ENTRY(&iter, k)); + bch2_btree_iter_unlock(&iter); + + return ret; +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, + struct bpos end, + struct bversion version, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_btree_iter_init(&iter, c, id, start, + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + /* really shouldn't be using a bare, unpadded bkey_i */ + struct bkey_i delete; + + if (bkey_cmp(iter.pos, end) >= 0) + break; + + bkey_init(&delete.k); + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). 
+ */ + delete.k.p = iter.pos; + delete.k.version = version; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) { + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + } + + ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &delete)); + if (ret) + break; + + bch2_btree_iter_cond_resched(&iter); + } + + bch2_btree_iter_unlock(&iter); + return ret; +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 index 000000000000..f347c93e0c6e --- /dev/null +++ b/fs/bcachefs/buckets.c @@ -0,0 +1,975 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. + * + * Bucket states: + * - free bucket: mark == 0 + * The bucket contains no data and will not be read + * + * - allocator bucket: owned_by_allocator == 1 + * The bucket is on a free list, or it is an open bucket + * + * - cached bucket: owned_by_allocator == 0 && + * dirty_sectors == 0 && + * cached_sectors > 0 + * The bucket contains data but may be safely discarded as there are + * enough replicas of the data on other cache devices, or it has been + * written back to the backing device + * + * - dirty bucket: owned_by_allocator == 0 && + * dirty_sectors > 0 + * The bucket contains data that we must not discard (either only copy, + * or one of the 'main copies' for data requiring multiple replicas) + * + * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 + * This is a btree node, journal or gen/prio bucket + * + * Lifecycle: + * + * bucket invalidated => bucket on freelist => open bucket => + * [dirty bucket =>] cached bucket => bucket invalidated => ... + * + * Note that cache promotion can skip the dirty bucket step, as data + * is copied from a deeper tier to a shallower tier, onto a cached + * bucket. + * Note also that a cached bucket can spontaneously become dirty -- + * see below. + * + * Only a traversal of the key space can determine whether a bucket is + * truly dirty or cached. 
+ * + * Transitions: + * + * - free => allocator: bucket was invalidated + * - cached => allocator: bucket was invalidated + * + * - allocator => dirty: open bucket was filled up + * - allocator => cached: open bucket was filled up + * - allocator => metadata: metadata was allocated + * + * - dirty => cached: dirty sectors were copied to a deeper tier + * - dirty => free: dirty sectors were overwritten or moved (copy gc) + * - cached => free: cached sectors were overwritten + * + * - metadata => free: metadata was freed + * + * Oddities: + * - cached => dirty: a device was removed so formerly replicated data + * is no longer sufficiently replicated + * - free => cached: cannot happen + * - free => dirty: cannot happen + * - free => metadata: cannot happen + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "buckets.h" +#include "error.h" +#include "movinggc.h" +#include "trace.h" + +#include + +#ifdef DEBUG_BUCKETS + +#define lg_local_lock lg_global_lock +#define lg_local_unlock lg_global_unlock + +static void bch2_fs_stats_verify(struct bch_fs *c) +{ + struct bch_fs_usage stats = + __bch2_fs_usage_read(c); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(stats.s); i++) { + if ((s64) stats.s[i].data[S_META] < 0) + panic("replicas %u meta underflow: %lli\n", + i + 1, stats.s[i].data[S_META]); + + if ((s64) stats.s[i].data[S_DIRTY] < 0) + panic("replicas %u dirty underflow: %lli\n", + i + 1, stats.s[i].data[S_DIRTY]); + + if ((s64) stats.s[i].persistent_reserved < 0) + panic("replicas %u reserved underflow: %lli\n", + i + 1, stats.s[i].persistent_reserved); + } + + if ((s64) stats.online_reserved < 0) + panic("sectors_online_reserved underflow: %lli\n", + stats.online_reserved); +} + +static void bch2_dev_stats_verify(struct bch_dev *ca) +{ + struct bch_dev_usage stats = + __bch2_dev_usage_read(ca); + u64 n = ca->mi.nbuckets - ca->mi.first_bucket; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(stats.buckets); i++) + BUG_ON(stats.buckets[i] > n); + BUG_ON(stats.buckets_alloc > n); + BUG_ON(stats.buckets_unavailable > n); +} + +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) +{ + if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) { + u64 used = __bch2_fs_sectors_used(c); + u64 cached = 0; + u64 avail = atomic64_read(&c->sectors_available); + int cpu; + + for_each_possible_cpu(cpu) + cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache; + + if (used + avail + cached > c->capacity) + panic("used %llu avail %llu cached %llu capacity %llu\n", + used, avail, cached, c->capacity); + } +} + +#else + +static void bch2_fs_stats_verify(struct bch_fs *c) {} +static void bch2_dev_stats_verify(struct bch_dev *ca) {} +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} + +#endif + +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ +void bch2_bucket_seq_cleanup(struct bch_fs *c) +{ + u16 last_seq_ondisk = c->journal.last_seq_ondisk; + struct bch_dev *ca; + struct bucket_array *buckets; + struct bucket *g; + struct bucket_mark m; + unsigned i; + + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) { + bucket_cmpxchg(g, m, ({ + if (!m.journal_seq_valid || + bucket_needs_journal_commit(m, last_seq_ondisk)) + break; + + m.journal_seq_valid = 0; + })); + } + up_read(&ca->bucket_lock); + } +} + +#define bch2_usage_add(_acc, _stats) \ +do { \ + typeof(_acc) _a = (_acc), _s = (_stats); \ + unsigned i; \ + \ + for (i = 
0; i < sizeof(*_a) / sizeof(u64); i++) \ + ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \ +} while (0) + +#define bch2_usage_read_raw(_stats) \ +({ \ + typeof(*this_cpu_ptr(_stats)) _acc; \ + int cpu; \ + \ + memset(&_acc, 0, sizeof(_acc)); \ + \ + for_each_possible_cpu(cpu) \ + bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ + \ + _acc; \ +}) + +#define bch2_usage_read_cached(_c, _cached, _uncached) \ +({ \ + typeof(_cached) _ret; \ + unsigned _seq; \ + \ + do { \ + _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ + _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ + ? bch2_usage_read_raw(_uncached) \ + : (_cached); \ + } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ + \ + _ret; \ +}) + +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) +{ + return bch2_usage_read_raw(ca->usage_percpu); +} + +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) +{ + return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); +} + +struct bch_fs_usage +__bch2_fs_usage_read(struct bch_fs *c) +{ + return bch2_usage_read_raw(c->usage_percpu); +} + +struct bch_fs_usage +bch2_fs_usage_read(struct bch_fs *c) +{ + return bch2_usage_read_cached(c, + c->usage_cached, + c->usage_percpu); +} + +struct fs_usage_sum { + u64 data; + u64 reserved; +}; + +static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) +{ + struct fs_usage_sum sum = { 0 }; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(stats.s); i++) { + sum.data += (stats.s[i].data[S_META] + + stats.s[i].data[S_DIRTY]) * (i + 1); + sum.reserved += stats.s[i].persistent_reserved * (i + 1); + } + + sum.reserved += stats.online_reserved; + return sum; +} + +#define RESERVE_FACTOR 6 + +static u64 reserve_factor(u64 r) +{ + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); +} + +static u64 avail_factor(u64 r) +{ + return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; +} + +u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +{ + struct fs_usage_sum sum = __fs_usage_sum(stats); + + return sum.data + reserve_factor(sum.reserved); +} + +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +{ + return min(c->capacity, __bch2_fs_sectors_used(c, stats)); +} + +u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +{ + return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); +} + +static inline int is_unavailable_bucket(struct bucket_mark m) +{ + return !is_available_bucket(m); +} + +static inline int is_fragmented_bucket(struct bucket_mark m, + struct bch_dev *ca) +{ + if (!m.owned_by_allocator && + m.data_type == BCH_DATA_USER && + bucket_sectors_used(m)) + return max_t(int, 0, (int) ca->mi.bucket_size - + bucket_sectors_used(m)); + return 0; +} + +static inline enum bch_data_type bucket_type(struct bucket_mark m) +{ + return m.cached_sectors && !m.dirty_sectors + ? BCH_DATA_CACHED + : m.data_type; +} + +static bool bucket_became_unavailable(struct bch_fs *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + (!c || c->gc_pos.phase == GC_PHASE_DONE); +} + +void bch2_fs_usage_apply(struct bch_fs *c, + struct bch_fs_usage *stats, + struct disk_reservation *disk_res, + struct gc_pos gc_pos) +{ + struct fs_usage_sum sum = __fs_usage_sum(*stats); + s64 added = sum.data + sum.reserved; + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + BUG_ON(added > (s64) (disk_res ? 
disk_res->sectors : 0)); + + if (added > 0) { + disk_res->sectors -= added; + stats->online_reserved -= added; + } + + percpu_down_read(&c->usage_lock); + preempt_disable(); + /* online_reserved not subject to gc: */ + this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved); + stats->online_reserved = 0; + + if (!gc_will_visit(c, gc_pos)) + bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + + bch2_fs_stats_verify(c); + preempt_enable(); + percpu_up_read(&c->usage_lock); + + memset(stats, 0, sizeof(*stats)); +} + +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bucket_mark old, struct bucket_mark new) +{ + struct bch_dev_usage *dev_usage; + + if (c) + percpu_rwsem_assert_held(&c->usage_lock); + + if (old.data_type && new.data_type && + old.data_type != new.data_type) { + BUG_ON(!c); + bch2_fs_inconsistent(c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); + } + + preempt_disable(); + dev_usage = this_cpu_ptr(ca->usage_percpu); + + dev_usage->buckets[bucket_type(old)]--; + dev_usage->buckets[bucket_type(new)]++; + + dev_usage->buckets_alloc += + (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); + + dev_usage->sectors[old.data_type] -= old.dirty_sectors; + dev_usage->sectors[new.data_type] += new.dirty_sectors; + dev_usage->sectors[BCH_DATA_CACHED] += + (int) new.cached_sectors - (int) old.cached_sectors; + dev_usage->sectors_fragmented += + is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + preempt_enable(); + + if (!is_available_bucket(old) && is_available_bucket(new)) + bch2_wake_allocator(ca); + + bch2_dev_stats_verify(ca); +} + +#define bucket_data_cmpxchg(c, ca, g, new, expr) \ +({ \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bch2_dev_usage_update(c, ca, _old, new); \ + _old; \ +}) + +bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old) +{ + struct bucket *g; + struct bucket_mark new; + + percpu_rwsem_assert_held(&c->usage_lock); + + g = bucket(ca, b); + + *old = bucket_data_cmpxchg(c, ca, g, new, ({ + if (!is_available_bucket(new)) + return false; + + new.owned_by_allocator = 1; + new.data_type = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + new.gen++; + })); + + if (!old->owned_by_allocator && old->cached_sectors) + trace_invalidate(ca, bucket_to_sector(ca, b), + old->cached_sectors); + return true; +} + +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) +{ + struct bucket *g; + struct bucket_mark old, new; + + percpu_rwsem_assert_held(&c->usage_lock); + g = bucket(ca, b); + + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) + return; + + old = bucket_data_cmpxchg(c, ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); + + BUG_ON(!owned_by_allocator && !old.owned_by_allocator && + c->gc_pos.phase == GC_PHASE_DONE); +} + +#define saturated_add(ca, dst, src, max) \ +do { \ + BUG_ON((int) (dst) + (src) < 0); \ + if ((dst) == (max)) \ + ; \ + else if ((dst) + (src) <= (max)) \ + dst += (src); \ + else { \ + dst = (max); \ + trace_sectors_saturated(ca); \ + } \ +} while (0) + +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, struct gc_pos pos, + unsigned flags) +{ 
+ struct bucket *g; + struct bucket_mark old, new; + + BUG_ON(!type); + + if (likely(c)) { + percpu_rwsem_assert_held(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) + return; + } + + rcu_read_lock(); + + g = bucket(ca, b); + old = bucket_data_cmpxchg(c, ca, g, new, ({ + saturated_add(ca, new.dirty_sectors, sectors, + GC_MAX_SECTORS_USED); + new.data_type = type; + })); + + rcu_read_unlock(); + + BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && + bucket_became_unavailable(c, old, new)); +} + +/* Reverting this until the copygc + compression issue is fixed: */ + +static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) +{ + if (!sectors) + return 0; + + return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, + crc.uncompressed_size)); +} + +/* + * Checking against gc's position has to be done here, inside the cmpxchg() + * loop, to avoid racing with the start of gc clearing all the marks - GC does + * that with the gc pos seqlock held. + */ +static void bch2_mark_pointer(struct bch_fs *c, + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + s64 sectors, enum s_alloc type, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + struct bucket_mark old, new; + unsigned saturated; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr); + enum bch_data_type data_type = type == S_META + ? BCH_DATA_BTREE : BCH_DATA_USER; + u64 v; + + if (crc.compression_type) { + unsigned old_sectors, new_sectors; + + if (sectors > 0) { + old_sectors = 0; + new_sectors = sectors; + } else { + old_sectors = e.k->size; + new_sectors = e.k->size + sectors; + } + + sectors = -__disk_sectors(crc, old_sectors) + +__disk_sectors(crc, new_sectors); + } + + if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { + if (journal_seq) + bucket_cmpxchg(g, new, ({ + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + })); + + return; + } + + v = atomic64_read(&g->_mark.v); + do { + new.v.counter = old.v.counter = v; + saturated = 0; + + /* + * Check this after reading bucket mark to guard against + * the allocator invalidating a bucket after we've already + * checked the gen + */ + if (gen_after(new.gen, ptr->gen)) { + BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); + EBUG_ON(!ptr->cached && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + return; + } + + if (!ptr->cached && + new.dirty_sectors == GC_MAX_SECTORS_USED && + sectors < 0) + saturated = -sectors; + + if (ptr->cached) + saturated_add(ca, new.cached_sectors, sectors, + GC_MAX_SECTORS_USED); + else + saturated_add(ca, new.dirty_sectors, sectors, + GC_MAX_SECTORS_USED); + + if (!new.dirty_sectors && + !new.cached_sectors) { + new.data_type = 0; + + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + } else { + new.data_type = data_type; + } + + if (flags & BCH_BUCKET_MARK_NOATOMIC) { + g->_mark = new; + break; + } + } while ((v = atomic64_cmpxchg(&g->_mark.v, + old.v.counter, + new.v.counter)) != old.v.counter); + + bch2_dev_usage_update(c, ca, old, new); + + BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && + bucket_became_unavailable(c, old, new)); + + if (saturated && + atomic_long_add_return(saturated, + &ca->saturated_count) >= + bucket_to_sector(ca, ca->free_inc.size)) { + if (c->gc_thread) { + trace_gc_sectors_saturated(c); + wake_up_process(c->gc_thread); + } + } +} + +void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + 
s64 sectors, bool metadata, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + /* + * synchronization w.r.t. GC: + * + * Normally, bucket sector counts/marks are updated on the fly, as + * references are added/removed from the btree, the lists of buckets the + * allocator owns, other metadata buckets, etc. + * + * When GC is in progress and going to mark this reference, we do _not_ + * mark this reference here, to avoid double counting - GC will count it + * when it gets to it. + * + * To know whether we should mark a given reference (GC either isn't + * running, or has already marked references at this position) we + * construct a total order for everything GC walks. Then, we can simply + * compare the position of the reference we're marking - @pos - with + * GC's current position. If GC is going to mark this reference, GC's + * current position will be less than @pos; if GC's current position is + * greater than @pos GC has either already walked this position, or + * isn't running. + * + * To avoid racing with GC's position changing, we have to deal with + * - GC's position being set to GC_POS_MIN when GC starts: + * usage_lock guards against this + * - GC's position overtaking @pos: we guard against this with + * whatever lock protects the data structure the reference lives in + * (e.g. the btree node lock, or the relevant allocator lock). + */ + + percpu_down_read(&c->usage_lock); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) + flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; + + if (!stats) + stats = this_cpu_ptr(c->usage_percpu); + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + enum s_alloc type = metadata ? 
S_META : S_DIRTY; + unsigned replicas = 0; + + BUG_ON(metadata && bkey_extent_is_cached(e.k)); + BUG_ON(!sectors); + + extent_for_each_ptr_crc(e, ptr, crc) { + bch2_mark_pointer(c, e, ptr, crc, sectors, type, + stats, journal_seq, flags); + replicas += !ptr->cached; + } + + if (replicas) { + BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s)); + stats->s[replicas - 1].data[type] += sectors; + } + break; + } + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (r.v->nr_replicas) { + BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s)); + stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; + } + break; + } + } + percpu_up_read(&c->usage_lock); +} + +/* Disk reservations: */ + +static u64 __recalc_sectors_available(struct bch_fs *c) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; + + return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); +} + +/* Used by gc when it's starting: */ +void bch2_recalc_sectors_available(struct bch_fs *c) +{ + percpu_down_write(&c->usage_lock); + atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); + percpu_up_write(&c->usage_lock); +} + +void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) +{ + percpu_down_read(&c->usage_lock); + this_cpu_sub(c->usage_percpu->online_reserved, + res->sectors); + + bch2_fs_stats_verify(c); + percpu_up_read(&c->usage_lock); + + res->sectors = 0; +} + +#define SECTORS_CACHE 1024 + +int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + unsigned sectors, int flags) +{ + struct bch_fs_usage *stats; + u64 old, v, get; + s64 sectors_available; + int ret; + + percpu_down_read(&c->usage_lock); + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); + + if (sectors <= stats->available_cache) + goto out; + + v = atomic64_read(&c->sectors_available); + do { + old = v; + get = min((u64) sectors + SECTORS_CACHE, old); + + if (get < sectors) { + preempt_enable(); + percpu_up_read(&c->usage_lock); + goto recalculate; + } + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, old - get)) != old); + + stats->available_cache += get; + +out: + stats->available_cache -= sectors; + stats->online_reserved += sectors; + res->sectors += sectors; + + bch2_disk_reservations_verify(c, flags); + bch2_fs_stats_verify(c); + preempt_enable(); + percpu_up_read(&c->usage_lock); + return 0; + +recalculate: + /* + * GC recalculates sectors_available when it starts, so that hopefully + * we don't normally end up blocking here: + */ + + /* + * Piss fuck, we can be called from extent_insert_fixup() with btree + * locks held: + */ + + if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { + if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) + down_read(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) + return -EINTR; + } + + percpu_down_write(&c->usage_lock); + sectors_available = __recalc_sectors_available(c); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); + stats->online_reserved += sectors; + res->sectors += sectors; + ret = 0; + + bch2_disk_reservations_verify(c, flags); + } else { + atomic64_set(&c->sectors_available, sectors_available); + ret = -ENOSPC; + } + + bch2_fs_stats_verify(c); + percpu_up_write(&c->usage_lock); + + if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) + up_read(&c->gc_lock); + + return ret; +} + +/* Startup/shutdown: */ + +static void 
buckets_free_rcu(struct rcu_head *rcu) +{ + struct bucket_array *buckets = + container_of(rcu, struct bucket_array, rcu); + + kvpfree(buckets, + sizeof(struct bucket_array) + + buckets->nbuckets * sizeof(struct bucket)); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bucket_array *buckets = NULL, *old_buckets = NULL; + unsigned long *buckets_dirty = NULL; + u8 *oldest_gens = NULL; + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; + alloc_heap alloc_heap; + copygc_heap copygc_heap; + + size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / c->opts.btree_node_size); + /* XXX: these should be tunable */ + size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); + size_t free_inc_reserve = copygc_reserve / 2; + bool resize = ca->buckets != NULL, + start_copygc = ca->copygc_thread != NULL; + int ret = -ENOMEM; + unsigned i; + + memset(&free, 0, sizeof(free)); + memset(&free_inc, 0, sizeof(free_inc)); + memset(&alloc_heap, 0, sizeof(alloc_heap)); + memset(©gc_heap, 0, sizeof(copygc_heap)); + + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + + nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO)) || + !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), + GFP_KERNEL|__GFP_ZERO)) || + !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || + !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_MOVINGGC], + copygc_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || + !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) + goto err; + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = nbuckets; + + bch2_copygc_stop(ca); + + if (resize) { + down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + percpu_down_write(&c->usage_lock); + } + + old_buckets = bucket_array(ca); + + if (resize) { + size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + + memcpy(buckets->b, + old_buckets->b, + n * sizeof(struct bucket)); + memcpy(oldest_gens, + ca->oldest_gens, + n * sizeof(u8)); + memcpy(buckets_dirty, + ca->buckets_dirty, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->buckets, buckets); + buckets = old_buckets; + + swap(ca->oldest_gens, oldest_gens); + swap(ca->buckets_dirty, buckets_dirty); + + if (resize) + percpu_up_write(&c->usage_lock); + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) { + fifo_move(&free[i], &ca->free[i]); + swap(ca->free[i], free[i]); + } + fifo_move(&free_inc, &ca->free_inc); + swap(ca->free_inc, free_inc); + spin_unlock(&c->freelist_lock); + + /* with gc lock held, alloc_heap can't be in use: */ + swap(ca->alloc_heap, alloc_heap); + + /* and we shut down copygc: */ + swap(ca->copygc_heap, copygc_heap); + + nbuckets = ca->mi.nbuckets; + + if (resize) { + up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + + if (start_copygc && + bch2_copygc_start(c, ca)) + bch_err(ca, "error restarting copygc thread"); + + ret = 0; +err: + free_heap(©gc_heap); + free_heap(&alloc_heap); + free_fifo(&free_inc); + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&free[i]); + kvpfree(buckets_dirty, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvpfree(oldest_gens, + nbuckets * sizeof(u8)); + if (buckets) + 
call_rcu(&old_buckets->rcu, buckets_free_rcu); + + return ret; +} + +void bch2_dev_buckets_free(struct bch_dev *ca) +{ + unsigned i; + + free_heap(&ca->copygc_heap); + free_heap(&ca->alloc_heap); + free_fifo(&ca->free_inc); + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); + kvpfree(ca->buckets_dirty, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); + kvpfree(rcu_dereference_protected(ca->buckets, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + + free_percpu(ca->usage_percpu); +} + +int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + return -ENOMEM; + + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 index 000000000000..a4ba6d787b0b --- /dev/null +++ b/fs/bcachefs/buckets.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. + */ + +#ifndef _BUCKETS_H +#define _BUCKETS_H + +#include "buckets_types.h" +#include "super.h" + +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + +#define bucket_cmpxchg(g, new, expr) \ +({ \ + u64 _v = atomic64_read(&(g)->_mark.v); \ + struct bucket_mark _old; \ + \ + do { \ + (new).v.counter = _old.v.counter = _v; \ + expr; \ + } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \ + _old.v.counter, \ + (new).v.counter)) != _old.v.counter);\ + _old; \ +}) + +static inline struct bucket_array *bucket_array(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->buckets, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->usage_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + struct bucket_array *buckets = bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + +static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, + size_t b, int rw) +{ + bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; +} + +static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) +{ + return c->bucket_clock[rw].hand - g->io_time[rw]; +} + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree. + */ + +static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +{ + return bucket(ca, b)->mark.gen - ca->oldest_gens[b]; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return sector_to_bucket(ca, ptr->offset); +} + +static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + struct bucket_mark m; + + rcu_read_lock(); + m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark); + rcu_read_unlock(); + + return m; +} + +static inline int gen_cmp(u8 a, u8 b) +{ + return (s8) (a - b); +} + +static inline int gen_after(u8 a, u8 b) +{ + int r = gen_cmp(a, b); + + return r > 0 ? 
r : 0; +} + +/** + * ptr_stale() - check if a pointer points into a bucket that has been + * invalidated. + */ +static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); +} + +/* bucket gc marks */ + +/* The dirty and cached sector counts saturate. If this occurs, + * reference counting alone will not free the bucket, and a btree + * GC must be performed. */ +#define GC_MAX_SECTORS_USED ((1U << 15) - 1) + +static inline unsigned bucket_sectors_used(struct bucket_mark mark) +{ + return mark.dirty_sectors + mark.cached_sectors; +} + +static inline bool bucket_unused(struct bucket_mark mark) +{ + return !mark.owned_by_allocator && + !mark.data_type && + !bucket_sectors_used(mark); +} + +/* Device usage: */ + +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); + +static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage stats) +{ + u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + + if (WARN_ONCE(stats.buckets_unavailable > total, + "buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) + return 0; + + return total - stats.buckets_unavailable; +} + +/* + * Number of reclaimable buckets - only for use by the allocator thread: + */ +static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) +{ + return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); +} + +static inline u64 __dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage stats) +{ + return __dev_buckets_available(ca, stats) + + fifo_used(&ca->free[RESERVE_NONE]) + + fifo_used(&ca->free_inc); +} + +static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) +{ + return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); +} + +/* Filesystem usage: */ + +static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s) +{ + switch (s) { + case S_META: + return BCH_DATA_BTREE; + case S_DIRTY: + return BCH_DATA_USER; + default: + BUG(); + } +} + +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); +void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, struct gc_pos); + +u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); + +static inline bool is_available_bucket(struct bucket_mark mark) +{ + return (!mark.owned_by_allocator && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +} + +void bch2_bucket_seq_cleanup(struct bch_fs *); + +bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, + size_t, struct bucket_mark *); +void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, + size_t, bool, struct gc_pos, unsigned); +void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +#define BCH_BUCKET_MARK_NOATOMIC (1 << 0) +#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) +#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) +#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) + +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, + struct 
bch_fs_usage *, u64, unsigned); + +void bch2_recalc_sectors_available(struct bch_fs *); + +void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); + +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) + __bch2_disk_reservation_put(c, res); +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1) +#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2) + +int bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + unsigned, int); + +static inline struct disk_reservation +bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) +{ + return (struct disk_reservation) { + .sectors = 0, +#if 0 + /* not used yet: */ + .gen = c->capacity_gen, +#endif + .nr_replicas = nr_replicas, + }; +} + +static inline int bch2_disk_reservation_get(struct bch_fs *c, + struct disk_reservation *res, + unsigned sectors, + unsigned nr_replicas, + int flags) +{ + *res = bch2_disk_reservation_init(c, nr_replicas); + + return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); +} + +int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); +void bch2_dev_buckets_free(struct bch_dev *); +int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); + +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 index 000000000000..5be90139dd0d --- /dev/null +++ b/fs/bcachefs/buckets_types.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H + +#include "util.h" + +struct bucket_mark { + union { + struct { + atomic64_t v; + }; + + struct { + u8 gen; + u8 data_type:3, + gen_valid:1, + owned_by_allocator:1, + nouse:1, + journal_seq_valid:1; + u16 dirty_sectors; + u16 cached_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: + */ + u16 journal_seq; + }; + }; +}; + +struct bucket { + union { + struct bucket_mark _mark; + const struct bucket_mark mark; + }; + + u16 io_time[2]; +}; + +struct bucket_array { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + struct bucket b[]; +}; + +struct bch_dev_usage { + u64 buckets[BCH_DATA_NR]; + u64 buckets_alloc; + u64 buckets_unavailable; + + /* _compressed_ sectors: */ + u64 sectors[BCH_DATA_NR]; + u64 sectors_fragmented; +}; + +/* kill, switch to bch_data_type? 
*/ +enum s_alloc { + S_META, + S_DIRTY, + S_ALLOC_NR, +}; + +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + /* _uncompressed_ sectors: */ + u64 online_reserved; + u64 available_cache; + + struct { + u64 data[S_ALLOC_NR]; + u64 persistent_reserved; + } s[BCH_REPLICAS_MAX]; +}; + +/* + * A reservation for space on disk: + */ +struct disk_reservation { + u64 sectors; + u32 gen; + unsigned nr_replicas; +}; + +struct copygc_heap_entry { + u8 gen; + u32 sectors; + u64 offset; +}; + +typedef HEAP(struct copygc_heap_entry) copygc_heap; + +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 index 000000000000..2aa86331969a --- /dev/null +++ b/fs/bcachefs/chardev.c @@ -0,0 +1,663 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + +#include "bcachefs.h" +#include "alloc.h" +#include "bcachefs_ioctl.h" +#include "buckets.h" +#include "chardev.h" +#include "move.h" +#include "super.h" +#include "super-io.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* returns with ref on ca->ref */ +static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, + unsigned flags) +{ + struct bch_dev *ca; + + if (flags & BCH_BY_INDEX) { + if (dev >= c->sb.nr_devices) + return ERR_PTR(-EINVAL); + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + if (!ca) + return ERR_PTR(-EINVAL); + } else { + char *path; + + path = strndup_user((const char __user *) + (unsigned long) dev, PATH_MAX); + if (IS_ERR(path)) + return ERR_CAST(path); + + ca = bch2_dev_lookup(c, path); + kfree(path); + } + + return ca; +} + +#if 0 +static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) +{ + struct bch_ioctl_assemble arg; + struct bch_fs *c; + u64 *user_devs = NULL; + char **devs = NULL; + unsigned i; + int ret = -EFAULT; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); + if (!user_devs) + return -ENOMEM; + + devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); + + if (copy_from_user(user_devs, user_arg->devs, + sizeof(u64) * arg.nr_devs)) + goto err; + + for (i = 0; i < arg.nr_devs; i++) { + devs[i] = strndup_user((const char __user *)(unsigned long) + user_devs[i], + PATH_MAX); + if (!devs[i]) { + ret = -ENOMEM; + goto err; + } + } + + c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); + ret = PTR_ERR_OR_ZERO(c); + if (!ret) + closure_put(&c->cl); +err: + if (devs) + for (i = 0; i < arg.nr_devs; i++) + kfree(devs[i]); + kfree(devs); + return ret; +} + +static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) +{ + struct bch_ioctl_incremental arg; + const char *err; + char *path; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + err = bch2_fs_open_incremental(path); + kfree(path); + + if (err) { + pr_err("Could not register bcachefs devices: %s", err); + return -EINVAL; + } + + return 0; +} +#endif + +static long bch2_global_ioctl(unsigned cmd, void __user *arg) +{ + switch (cmd) { +#if 0 + case BCH_IOCTL_ASSEMBLE: + return bch2_ioctl_assemble(arg); + case BCH_IOCTL_INCREMENTAL: + return bch2_ioctl_incremental(arg); 
+#endif + default: + return -ENOTTY; + } +} + +static long bch2_ioctl_query_uuid(struct bch_fs *c, + struct bch_ioctl_query_uuid __user *user_arg) +{ + return copy_to_user(&user_arg->uuid, + &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); +} + +#if 0 +static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) +{ + if (arg.flags || arg.pad) + return -EINVAL; + + return bch2_fs_start(c) ? -EIO : 0; +} + +static long bch2_ioctl_stop(struct bch_fs *c) +{ + bch2_fs_stop(c); + return 0; +} +#endif + +static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_add(c, path); + kfree(path); + + return ret; +} + +static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + return bch2_dev_remove(c, ca, arg.flags); +} + +static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_online(c, path); + kfree(path); + return ret; +} + +static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_offline(c, ca, arg.flags); + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_ioctl_disk_set_state arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad[0] || arg.pad[1] || arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + + percpu_ref_put(&ca->ref); + return ret; +} + +struct bch_data_ctx { + struct bch_fs *c; + struct bch_ioctl_data arg; + struct bch_move_stats stats; + + int ret; + + struct task_struct *thread; +}; + +static int bch2_data_thread(void *arg) +{ + struct bch_data_ctx *ctx = arg; + + ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + + ctx->stats.data_type = U8_MAX; + return 0; +} + +static int bch2_data_job_release(struct inode *inode, struct file *file) +{ + struct bch_data_ctx *ctx = file->private_data; + + kthread_stop(ctx->thread); + put_task_struct(ctx->thread); + kfree(ctx); + return 0; +} + +static ssize_t bch2_data_job_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct bch_data_ctx *ctx = file->private_data; + struct bch_fs *c = ctx->c; + struct bch_ioctl_data_event e = { + .type = BCH_DATA_EVENT_PROGRESS, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.iter.btree_id, + .p.pos = ctx->stats.iter.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + 
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), + }; + + if (len < sizeof(e)) + return -EINVAL; + + return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); +} + +static const struct file_operations bcachefs_data_ops = { + .release = bch2_data_job_release, + .read = bch2_data_job_read, + .llseek = no_llseek, +}; + +static long bch2_ioctl_data(struct bch_fs *c, + struct bch_ioctl_data arg) +{ + struct bch_data_ctx *ctx = NULL; + struct file *file = NULL; + unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; + int ret, fd = -1; + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->c = c; + ctx->arg = arg; + + ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); + if (IS_ERR(ctx->thread)) { + ret = PTR_ERR(ctx->thread); + goto err; + } + + ret = get_unused_fd_flags(flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err; + } + + fd_install(fd, file); + + get_task_struct(ctx->thread); + wake_up_process(ctx->thread); + + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (!IS_ERR_OR_NULL(ctx->thread)) + kthread_stop(ctx->thread); + kfree(ctx); + return ret; +} + +static long bch2_ioctl_usage(struct bch_fs *c, + struct bch_ioctl_usage __user *user_arg) +{ + struct bch_ioctl_usage arg; + struct bch_dev *ca; + unsigned i, j; + int ret; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + for (i = 0; i < arg.nr_devices; i++) { + struct bch_ioctl_dev_usage dst = { .alive = 0 }; + + ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); + if (ret) + return ret; + } + + { + struct bch_fs_usage src = bch2_fs_usage_read(c); + struct bch_ioctl_fs_usage dst = { + .capacity = c->capacity, + .used = bch2_fs_sectors_used(c, src), + .online_reserved = src.online_reserved, + }; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst.persistent_reserved[i] = + src.s[i].persistent_reserved; + + for (j = 0; j < S_ALLOC_NR; j++) + dst.sectors[s_alloc_to_data_type(j)][i] = + src.s[i].data[j]; + } + + ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); + if (ret) + return ret; + } + + for_each_member_device(ca, c, i) { + struct bch_dev_usage src = bch2_dev_usage_read(c, ca); + struct bch_ioctl_dev_usage dst = { + .alive = 1, + .state = ca->mi.state, + .bucket_size = ca->mi.bucket_size, + .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket, + }; + + if (ca->dev_idx >= arg.nr_devices) { + percpu_ref_put(&ca->ref); + return -ERANGE; + } + + if (percpu_ref_tryget(&ca->io_ref)) { + dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev); + percpu_ref_put(&ca->io_ref); + } + + for (j = 0; j < BCH_DATA_NR; j++) { + dst.buckets[j] = src.buckets[j]; + dst.sectors[j] = src.sectors[j]; + } + + ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); + if (ret) + return ret; + } + + return 0; +} + +static long bch2_ioctl_read_super(struct bch_fs *c, + struct bch_ioctl_read_super arg) +{ + struct bch_dev *ca = NULL; + struct bch_sb *sb; + int ret = 0; + + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || + arg.pad) + return -EINVAL; + + mutex_lock(&c->sb_lock); + + if (arg.flags & BCH_READ_DEV) { + ca = bch2_device_lookup(c, arg.dev, arg.flags); + + if (IS_ERR(ca)) { + ret = PTR_ERR(ca); + goto err; + } + + sb = ca->disk_sb.sb; + } else { + sb = c->disk_sb.sb; + } + + if (vstruct_bytes(sb) > 
arg.size) { + ret = -ERANGE; + goto err; + } + + ret = copy_to_user((void __user *)(unsigned long)arg.sb, + sb, vstruct_bytes(sb)); +err: + if (ca) + percpu_ref_put(&ca->ref); + mutex_unlock(&c->sb_lock); + return ret; +} + +static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + struct bch_ioctl_disk_get_idx arg) +{ + dev_t dev = huge_decode_dev(arg.dev); + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) + if (ca->disk_sb.bdev->bd_dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } + + return -ENOENT; +} + +static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_ioctl_disk_resize arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_resize(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + +#define BCH_IOCTL(_name, _argtype) \ +do { \ + _argtype i; \ + \ + if (copy_from_user(&i, arg, sizeof(i))) \ + return -EFAULT; \ + return bch2_ioctl_##_name(c, i); \ +} while (0) + +long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) +{ + /* ioctls that don't require admin cap: */ + switch (cmd) { + case BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); + case BCH_IOCTL_USAGE: + return bch2_ioctl_usage(c, arg); + } + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { +#if 0 + case BCH_IOCTL_START: + BCH_IOCTL(start, struct bch_ioctl_start); + case BCH_IOCTL_STOP: + return bch2_ioctl_stop(c); +#endif + case BCH_IOCTL_READ_SUPER: + BCH_IOCTL(read_super, struct bch_ioctl_read_super); + case BCH_IOCTL_DISK_GET_IDX: + BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); + } + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + /* ioctls that do require admin cap: */ + switch (cmd) { + case BCH_IOCTL_DISK_ADD: + BCH_IOCTL(disk_add, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_REMOVE: + BCH_IOCTL(disk_remove, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_ONLINE: + BCH_IOCTL(disk_online, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_OFFLINE: + BCH_IOCTL(disk_offline, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_SET_STATE: + BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); + case BCH_IOCTL_DATA: + BCH_IOCTL(data, struct bch_ioctl_data); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + + default: + return -ENOTTY; + } +} + +static DEFINE_IDR(bch_chardev_minor); + +static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) +{ + unsigned minor = iminor(file_inode(filp)); + struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; + void __user *arg = (void __user *) v; + + return c + ? 
bch2_fs_ioctl(c, cmd, arg) + : bch2_global_ioctl(cmd, arg); +} + +static const struct file_operations bch_chardev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = bch2_chardev_ioctl, + .open = nonseekable_open, +}; + +static int bch_chardev_major; +static struct class *bch_chardev_class; +static struct device *bch_chardev; + +void bch2_fs_chardev_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->chardev)) + device_unregister(c->chardev); + if (c->minor >= 0) + idr_remove(&bch_chardev_minor, c->minor); +} + +int bch2_fs_chardev_init(struct bch_fs *c) +{ + c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); + if (c->minor < 0) + return c->minor; + + c->chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, c->minor), c, + "bcachefs%u-ctl", c->minor); + if (IS_ERR(c->chardev)) + return PTR_ERR(c->chardev); + + return 0; +} + +void bch2_chardev_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_chardev_class)) + device_destroy(bch_chardev_class, + MKDEV(bch_chardev_major, U8_MAX)); + if (!IS_ERR_OR_NULL(bch_chardev_class)) + class_destroy(bch_chardev_class); + if (bch_chardev_major > 0) + unregister_chrdev(bch_chardev_major, "bcachefs"); +} + +int __init bch2_chardev_init(void) +{ + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); + if (bch_chardev_major < 0) + return bch_chardev_major; + + bch_chardev_class = class_create("bcachefs"); + if (IS_ERR(bch_chardev_class)) + return PTR_ERR(bch_chardev_class); + + bch_chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, U8_MAX), + NULL, "bcachefs-ctl"); + if (IS_ERR(bch_chardev)) + return PTR_ERR(bch_chardev); + + return 0; +} + +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 index 000000000000..3a4890d39ff9 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHARDEV_H +#define _BCACHEFS_CHARDEV_H + +#ifndef NO_BCACHEFS_FS + +long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); + +void bch2_fs_chardev_exit(struct bch_fs *); +int bch2_fs_chardev_init(struct bch_fs *); + +void bch2_chardev_exit(void); +int __init bch2_chardev_init(void); + +#else + +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ + return -ENOSYS; +} + +static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} +static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } + +static inline void bch2_chardev_exit(void) {} +static inline int __init bch2_chardev_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 index 000000000000..3733cbfa1c91 --- /dev/null +++ b/fs/bcachefs/checksum.c @@ -0,0 +1,753 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "super.h" +#include "super-io.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) 
+ * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const u64 crc_table[256] = { + 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, + 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, + 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, + 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, + 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, + 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, + 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, + 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, + 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, + 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, + 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, + 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, + 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, + 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, + 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, + 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, + 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, + 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, + 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, + 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, + 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, + 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, + 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, + 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, + 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, + 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, + 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, + 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, + 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, + 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, + 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, + 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, + 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, + 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, + 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, + 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, + 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, + 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, + 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, + 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, + 0xE224479F47CB76A0ULL, 
0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, + 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, + 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, + 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, + 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, + 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, + 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, + 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, + 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, + 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, + 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, + 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, + 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, + 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, + 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, + 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, + 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, + 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, + 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, + 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, + 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, + 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, + 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, + 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, + 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, + 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, + 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, + 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, + 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, + 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, + 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, + 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, + 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, + 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, + 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, + 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, + 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, + 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, + 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, + 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, + 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, + 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, + 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, + 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, + 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, + 0x9AFCE626CE85B507ULL, +}; + +u64 bch2_crc64_update(u64 crc, const void *_data, size_t len) +{ + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc; +} + +static u64 bch2_checksum_init(unsigned type) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case 
BCH_CSUM_CRC32C_NONZERO: + return U32_MAX; + case BCH_CSUM_CRC64_NONZERO: + return U64_MAX; + case BCH_CSUM_CRC32C: + return 0; + case BCH_CSUM_CRC64: + return 0; + default: + BUG(); + } +} + +static u64 bch2_checksum_final(unsigned type, u64 crc) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C_NONZERO: + return crc ^ U32_MAX; + case BCH_CSUM_CRC64_NONZERO: + return crc ^ U64_MAX; + case BCH_CSUM_CRC32C: + return crc; + case BCH_CSUM_CRC64: + return crc; + default: + BUG(); + } +} + +static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC32C: + return crc32c(crc, data, len); + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC64: + return bch2_crc64_update(crc, data, len); + default: + BUG(); + } +} + +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); + BUG_ON(ret); +} + +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, buf, len); + do_encrypt_sg(tfm, nonce, &sg, len); +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + void *buf, size_t len) +{ + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); + int ret; + + if (!chacha20) { + pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); + return PTR_ERR(chacha20); + } + + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); + if (ret) { + pr_err("crypto_skcipher_setkey() error: %i", ret); + goto err; + } + + do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_sync_skcipher(chacha20); + return ret; +} + +static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); + do_encrypt(c->chacha20, nonce, key, sizeof(key)); + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + struct nonce nonce, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch2_checksum_init(type); + + crc = bch2_checksum_update(type, crc, data, len); + crc = bch2_checksum_final(type, crc); + + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + crypto_shash_update(desc, data, len); + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +void bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if (!bch2_csum_type_is_encryption(type)) + return; + + do_encrypt(c->chacha20, nonce, data, len); +} + +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct 
bio *bio, + struct bvec_iter *iter) +{ + struct bio_vec bv; + + switch (type) { + case BCH_CSUM_NONE: + return (struct bch_csum) { 0 }; + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch2_checksum_init(type); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + crc = bch2_checksum_update(type, + crc, p, bv.bv_len); + kunmap_atomic(p); + } +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crc = bch2_checksum_update(type, crc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + crc = bch2_checksum_final(type, crc); + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); + kunmap_atomic(p); + } +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crypto_shash_update(desc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bvec_iter iter = bio->bi_iter; + + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + +void bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; + + if (!bch2_csum_type_is_encryption(type)) + return; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + nonce = nonce_add(nonce, bytes); + bytes = 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + sg = sgl; + } + + sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); + bytes += bv.bv_len; + } + + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +static struct bch_csum bch2_checksum_merge(unsigned type, + struct bch_csum a, + struct bch_csum b, size_t b_len) +{ + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + + a.lo = bch2_checksum_update(type, a.lo, + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } + + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} + +int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + struct bversion version, + struct bch_extent_crc_unpacked crc_old, + struct bch_extent_crc_unpacked *crc_a, + struct bch_extent_crc_unpacked *crc_b, + unsigned len_a, unsigned len_b, + unsigned new_csum_type) +{ + struct bvec_iter iter = bio->bi_iter; + struct nonce nonce = extent_nonce(version, crc_old); + struct bch_csum merged = { 0 }; + struct crc_split { + struct bch_extent_crc_unpacked *crc; + unsigned len; + unsigned csum_type; + struct bch_csum csum; + } splits[3] = { + { crc_a, len_a, 
new_csum_type }, + { crc_b, len_b, new_csum_type }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + }, *i; + bool mergeable = crc_old.csum_type == new_csum_type && + bch2_checksum_mergeable(new_csum_type); + unsigned crc_nonce = crc_old.nonce; + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); + BUG_ON(crc_old.compression_type); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + iter.bi_size = i->len << 9; + if (mergeable || i->crc) + i->csum = __bch2_checksum_bio(c, i->csum_type, + nonce, bio, &iter); + else + bio_advance_iter(bio, &iter, i->len << 9); + nonce = nonce_add(nonce, i->len << 9); + } + + if (mergeable) + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) + merged = bch2_checksum_merge(new_csum_type, merged, + i->csum, i->len << 9); + else + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum)) + return -EIO; + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, + .live_size = i->len, + .nonce = crc_nonce, + .csum = i->csum, + }; + + if (bch2_csum_type_is_encryption(new_csum_type)) + crc_nonce += i->len; + } + + return 0; +} + +#ifdef __KERNEL__ +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + + snprintf(key_description, sizeof(key_description), + "bcachefs:%pUb", &sb->user_uuid); + + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + down_read(&keyring_key->sem); + ukp = dereference_key_locked(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } + up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +} +#else +#include +#include + +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + key_serial_t key_id; + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcachefs:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); + if (key_id < 0) + return -errno; + + if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) + return -1; + + return 0; +} +#endif + +int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +{ + struct bch_encrypted_key sb_key = crypt->key; + struct bch_key user_key; + int ret = 0; + + /* is key encrypted? 
*/ + if (!bch2_key_is_encrypted(&sb_key)) + goto out; + + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %i", ret); + goto err; + } + + /* decrypt real key: */ + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &sb_key, sizeof(sb_key)); + if (ret) + goto err; + + if (bch2_key_is_encrypted(&sb_key)) { + bch_err(c, "incorrect encryption key"); + ret = -EINVAL; + goto err; + } +out: + *key = sb_key.key; +err: + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&user_key, sizeof(user_key)); + return ret; +} + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ + if (!c->chacha20) + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); + if (IS_ERR(c->chacha20)) { + bch_err(c, "error requesting chacha20 module: %li", + PTR_ERR(c->chacha20)); + return PTR_ERR(c->chacha20); + } + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); + if (IS_ERR(c->poly1305)) { + bch_err(c, "error requesting poly1305 module: %li", + PTR_ERR(c->poly1305)); + return PTR_ERR(c->poly1305); + } + + return 0; +} + +int bch2_disable_encryption(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + crypt = bch2_sb_get_crypt(c->disk_sb.sb); + if (!crypt) + goto out; + + /* is key encrypted? */ + ret = 0; + if (bch2_key_is_encrypted(&crypt->key)) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_enable_encryption(struct bch_fs *c, bool keyed) +{ + struct bch_encrypted_key key; + struct bch_key user_key; + struct bch_sb_field_crypt *crypt; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + /* Do we already have an encryption key? 
*/ + if (bch2_sb_get_crypt(c->disk_sb.sb)) + goto err; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto err; + + key.magic = BCH_KEY_MAGIC; + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %i", ret); + goto err; + } + + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &key, sizeof(key)); + if (ret) + goto err; + } + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto err; + + crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); + if (!crypt) { + ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ + goto err; + } + + crypt->key = key; + + /* write superblock */ + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + memzero_explicit(&user_key, sizeof(user_key)); + memzero_explicit(&key, sizeof(key)); + return ret; +} + +void bch2_fs_encryption_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->poly1305)) + crypto_free_shash(c->poly1305); + if (!IS_ERR_OR_NULL(c->chacha20)) + crypto_free_sync_skcipher(c->chacha20); + if (!IS_ERR_OR_NULL(c->sha256)) + crypto_free_shash(c->sha256); +} + +int bch2_fs_encryption_init(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(c->sha256)) { + bch_err(c, "error requesting sha256 module"); + ret = PTR_ERR(c->sha256); + goto out; + } + + crypt = bch2_sb_get_crypt(c->disk_sb.sb); + if (!crypt) + goto out; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto out; +out: + memzero_explicit(&key, sizeof(key)); + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 index 000000000000..42c86466293e --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H + +#include "bcachefs.h" +#include "extents_types.h" +#include "super-io.h" + +#include + +u64 bch2_crc64_update(u64, const void *, size_t); + +#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) +#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) +#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) +#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) +#define BCH_NONCE_POLY cpu_to_le32(1 << 31) + +struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, + const void *, size_t); + +/* + * This is used for various on disk data structures - bch_sb, prio_set, bset, + * jset: The checksum is _always_ the first field of these structs + */ +#define csum_vstruct(_c, _type, _nonce, _i) \ +({ \ + const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ + const void *end = vstruct_end(_i); \ + \ + bch2_checksum(_c, _type, _nonce, start, end - start); \ +}) + +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); + +void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + 
struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked *, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + +void bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); + +int bch2_disable_encryption(struct bch_fs *); +int bch2_enable_encryption(struct bch_fs *, bool); + +void bch2_fs_encryption_exit(struct bch_fs *); +int bch2_fs_encryption_init(struct bch_fs *); + +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) +{ + switch (type) { + case BCH_CSUM_OPT_NONE: + return BCH_CSUM_NONE; + case BCH_CSUM_OPT_CRC32C: + return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + case BCH_CSUM_OPT_CRC64: + return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + default: + BUG(); + } +} + +static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, + unsigned opt) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_CHACHA20_POLY1305_128 + : BCH_CSUM_CHACHA20_POLY1305_80; + + return bch2_csum_opt_to_type(opt, true); +} + +static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) +{ + if (c->sb.encryption_type) + return BCH_CSUM_CHACHA20_POLY1305_128; + + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); +} + +static const unsigned bch2_compression_opt_to_type[] = { +#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t, + BCH_COMPRESSION_TYPES() +#undef x +}; + +static inline bool bch2_checksum_type_valid(const struct bch_fs *c, + unsigned type) +{ + if (type >= BCH_CSUM_NR) + return false; + + if (bch2_csum_type_is_encryption(type) && !c->chacha20) + return false; + + return true; +} + +/* returns true if not equal */ +static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) +{ + /* + * XXX: need some way of preventing the compiler from optimizing this + * into a form that isn't constant time.. + */ + return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; +} + +/* for skipping ahead and encrypting/decrypting at an offset: */ +static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) +{ + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); + + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); + return nonce; +} + +static inline struct nonce null_nonce(void) +{ + struct nonce ret; + + memset(&ret, 0, sizeof(ret)); + return ret; +} + +static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + unsigned size = crc.compression_type ? 
crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (crc.compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +} + +static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) +{ + return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; +} + +static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) +{ + __le64 magic = __bch2_sb_magic(sb); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) +{ + __le64 magic = bch2_sb_magic(c); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c new file mode 100644 index 000000000000..96f8030384fa --- /dev/null +++ b/fs/bcachefs/clock.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "clock.h" + +#include +#include +#include + +static inline long io_timer_cmp(io_timer_heap *h, + struct io_timer *l, + struct io_timer *r) +{ + return l->expire - r->expire; +} + +void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) + goto out; + + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); +out: + spin_unlock(&clock->timer_lock); +} + +void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) { + heap_del(&clock->timers, i, io_timer_cmp); + break; + } + + spin_unlock(&clock->timer_lock); +} + +struct io_clock_wait { + struct io_timer io_timer; + struct timer_list cpu_timer; + struct task_struct *task; + int expired; +}; + +static void io_clock_wait_fn(struct io_timer *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, io_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +static void io_clock_cpu_timeout(struct timer_list *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, cpu_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +{ + struct io_clock_wait wait; + + /* XXX: calculate sleep time rigorously */ + wait.io_timer.expire = until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + schedule(); + + bch2_io_timer_del(clock, &wait.io_timer); +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + unsigned long io_until, + unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct io_clock_wait wait; + + wait.io_timer.expire = io_until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); + + if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) + mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread && kthread_should_stop()) + break; + + if 
(wait.expired) + break; + + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + del_timer_sync(&wait.cpu_timer); + destroy_timer_on_stack(&wait.cpu_timer); + bch2_io_timer_del(clock, &wait.io_timer); +} + +static struct io_timer *get_expired_timer(struct io_clock *clock, + unsigned long now) +{ + struct io_timer *ret = NULL; + + spin_lock(&clock->timer_lock); + + if (clock->timers.used && + time_after_eq(now, clock->timers.data[0]->expire)) + heap_pop(&clock->timers, ret, io_timer_cmp); + + spin_unlock(&clock->timer_lock); + + return ret; +} + +void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + struct io_timer *timer; + unsigned long now; + + /* Buffer up one megabyte worth of IO in the percpu counter */ + preempt_disable(); + + if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < + IO_CLOCK_PCPU_SECTORS)) { + preempt_enable(); + return; + } + + sectors = this_cpu_xchg(*clock->pcpu_buf, 0); + preempt_enable(); + now = atomic_long_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} + +void bch2_io_clock_exit(struct io_clock *clock) +{ + free_heap(&clock->timers); + free_percpu(clock->pcpu_buf); +} + +int bch2_io_clock_init(struct io_clock *clock) +{ + atomic_long_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -ENOMEM; + + if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h new file mode 100644 index 000000000000..5cb043c579d8 --- /dev/null +++ b/fs/bcachefs/clock.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_H +#define _BCACHEFS_CLOCK_H + +void bch2_io_timer_add(struct io_clock *, struct io_timer *); +void bch2_io_timer_del(struct io_clock *, struct io_timer *); +void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); +void bch2_increment_clock(struct bch_fs *, unsigned, int); + +void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + +#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +void bch2_io_clock_exit(struct io_clock *); +int bch2_io_clock_init(struct io_clock *); + +#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h new file mode 100644 index 000000000000..2b5e499e12b4 --- /dev/null +++ b/fs/bcachefs/clock_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_TYPES_H +#define _BCACHEFS_CLOCK_TYPES_H + +#include "util.h" + +#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) + +/* + * Clocks/timers in units of sectors of IO: + * + * Note - they use percpu batching, so they're only approximate. 
+ */ + +struct io_timer; +typedef void (*io_timer_fn)(struct io_timer *); + +struct io_timer { + io_timer_fn fn; + unsigned long expire; +}; + +/* Amount to buffer up on a percpu counter */ +#define IO_CLOCK_PCPU_SECTORS 128 + +typedef HEAP(struct io_timer *) io_timer_heap; + +struct io_clock { + atomic_long_t now; + u16 __percpu *pcpu_buf; + + spinlock_t timer_lock; + io_timer_heap timers; +}; + +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 index 000000000000..42ae4cfdcb6b --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "compress.h" +#include "extents.h" +#include "io.h" +#include "super-io.h" + +#include +#include +#include + +/* Bounce buffer: */ +struct bbuf { + void *b; + enum { + BB_NONE, + BB_VMAP, + BB_KMALLOC, + BB_VMALLOC, + BB_MEMPOOL, + } type; + int rw; +}; + +static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) +{ + void *b; + + BUG_ON(size > c->sb.encoded_extent_max << 9); + + b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (b) + return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; + + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); + b = b ? page_address(b) : NULL; + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; + + b = vmalloc(size); + if (b) + return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; + + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); + b = b ? page_address(b) : NULL; + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; + + BUG(); +} + +static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bvec_iter start, int rw) +{ + struct bbuf ret; + struct bio_vec bv; + struct bvec_iter iter; + unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; + bool first = true; + unsigned prev_end = PAGE_SIZE; + void *data; + + BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + +#ifndef CONFIG_HIGHMEM + __bio_for_each_contig_segment(bv, bio, iter, start) { + if (bv.bv_len == start.bi_size) + return (struct bbuf) { + .b = page_address(bv.bv_page) + bv.bv_offset, + .type = BB_NONE, .rw = rw + }; + } +#endif + __bio_for_each_segment(bv, bio, iter, start) { + if ((!first && bv.bv_offset) || + prev_end != PAGE_SIZE) + goto bounce; + + prev_end = bv.bv_offset + bv.bv_len; + nr_pages++; + } + + BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); + + pages = nr_pages > ARRAY_SIZE(stack_pages) + ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) + : stack_pages; + if (!pages) + goto bounce; + + nr_pages = 0; + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (pages != stack_pages) + kfree(pages); + + if (data) + return (struct bbuf) { + .b = data + bio_iter_offset(bio, start), + .type = BB_VMAP, .rw = rw + }; +bounce: + ret = __bounce_alloc(c, start.bi_size, rw); + + if (rw == READ) + memcpy_from_bio(ret.b, bio, start); + + return ret; +} + +static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) +{ + return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); +} + +static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) +{ + switch (buf.type) { + case BB_NONE: + break; + case BB_VMAP: + vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); + break; + case BB_KMALLOC: + kfree(buf.b); + break; + case BB_VMALLOC: + vfree(buf.b); + break; + case BB_MEMPOOL: + mempool_free(virt_to_page(buf.b), + &c->compression_bounce[buf.rw]); + break; + } +} + +static inline void zlib_set_workspace(z_stream *strm, void *workspace) +{ +#ifdef __KERNEL__ + strm->workspace = workspace; +#endif +} + +static int __bio_uncompress(struct bch_fs *c, struct bio *src, + void *dst_data, struct bch_extent_crc_unpacked crc) +{ + struct bbuf src_data = { NULL }; + size_t src_len = src->bi_iter.bi_size; + size_t dst_len = crc.uncompressed_size << 9; + void *workspace; + int ret; + + src_data = bio_map_or_bounce(c, src, READ); + + switch (crc.compression_type) { + case BCH_COMPRESSION_LZ4_OLD: + case BCH_COMPRESSION_LZ4: + ret = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret != dst_len) + goto err; + break; + case BCH_COMPRESSION_GZIP: { + z_stream strm = { + .next_in = src_data.b, + .avail_in = src_len, + .next_out = dst_data, + .avail_out = dst_len, + }; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + + zlib_set_workspace(&strm, workspace); + zlib_inflateInit2(&strm, -MAX_WBITS); + ret = zlib_inflate(&strm, Z_FINISH); + + mempool_free(workspace, &c->decompress_workspace); + + if (ret != Z_STREAM_END) + goto err; + break; + } + case BCH_COMPRESSION_ZSTD: { + ZSTD_DCtx *ctx; + size_t len; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); + + src_len = le32_to_cpup(src_data.b); + + len = zstd_decompress_dctx(ctx, + dst_data, dst_len, + src_data.b + 4, src_len); + + mempool_free(workspace, &c->decompress_workspace); + + if (len != dst_len) + goto err; + break; + } + default: + BUG(); + } + ret = 0; +out: + bio_unmap_or_unbounce(c, src_data); + return ret; +err: + ret = -EIO; + goto out; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + struct bch_extent_crc_unpacked *crc) +{ + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; + + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + + if (crc->uncompressed_size > c->sb.encoded_extent_max || + crc->compressed_size > c->sb.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } + + data = __bounce_alloc(c, dst_len, WRITE); + + if (__bio_uncompress(c, bio, data.b, *crc)) { + bch_err(c, "error rewriting existing data: decompression error"); + bio_unmap_or_unbounce(c, data); + return -EIO; + } + + /* + * might have to free existing 
pages and retry allocation from mempool - + * do this _after_ decompressing: + */ + bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9); + + memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); + + crc->csum_type = 0; + crc->compression_type = 0; + crc->compressed_size = crc->live_size; + crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; + + bio_unmap_or_unbounce(c, data); + return 0; +} + +int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + struct bio *dst, struct bvec_iter dst_iter, + struct bch_extent_crc_unpacked crc) +{ + struct bbuf dst_data = { NULL }; + size_t dst_len = crc.uncompressed_size << 9; + int ret = -ENOMEM; + + if (crc.uncompressed_size > c->sb.encoded_extent_max || + crc.compressed_size > c->sb.encoded_extent_max) + return -EIO; + + dst_data = dst_len == dst_iter.bi_size + ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) + : __bounce_alloc(c, dst_len, WRITE); + + ret = __bio_uncompress(c, src, dst_data.b, crc); + if (ret) + goto err; + + if (dst_data.type != BB_NONE) + memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); +err: + bio_unmap_or_unbounce(c, dst_data); + return ret; +} + +static int attempt_compress(struct bch_fs *c, + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, + unsigned compression_type) +{ + switch (compression_type) { + case BCH_COMPRESSION_LZ4: { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, + &len, dst_len, + workspace); + + if (len < src_len) + return -len; + + return ret; + } + case BCH_COMPRESSION_GZIP: { + z_stream strm = { + .next_in = src, + .avail_in = src_len, + .next_out = dst, + .avail_out = dst_len, + }; + + zlib_set_workspace(&strm, workspace); + zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + + if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) + return 0; + + if (zlib_deflateEnd(&strm) != Z_OK) + return 0; + + return strm.total_out; + } + case BCH_COMPRESSION_ZSTD: { + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + + size_t len = zstd_compress_cctx(ctx, + dst + 4, dst_len - 4, + src, src_len, + &c->zstd_params); + if (zstd_is_error(len)) + return 0; + + *((__le32 *) dst) = cpu_to_le32(len); + return len + 4; + } + default: + BUG(); + } +} + +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) +{ + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; + unsigned pad; + int ret = 0; + + BUG_ON(compression_type >= BCH_COMPRESSION_NR); + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ + if (bio_sectors(src) <= c->opts.block_size) + return 0; + + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); + + workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); + + *src_len = src->bi_iter.bi_size; + *dst_len = dst->bi_iter.bi_size; + + /* + * XXX: this algorithm sucks when the compression code doesn't tell us + * how much would fit, like LZ4 does: + */ + while (1) { + if (*src_len <= block_bytes(c)) { + ret = -1; + break; + } + + ret = attempt_compress(c, workspace, + dst_data.b, *dst_len, + src_data.b, *src_len, + compression_type); + if (ret > 0) { + *dst_len = ret; + ret = 0; + break; + } + + /* Didn't fit: should we retry 
with a smaller amount? */ + if (*src_len <= *dst_len) { + ret = -1; + break; + } + + /* + * If ret is negative, it's a hint as to how much data would fit + */ + BUG_ON(-ret >= *src_len); + + if (ret < 0) + *src_len = -ret; + else + *src_len -= (*src_len - *dst_len) / 2; + *src_len = round_down(*src_len, block_bytes(c)); + } + + mempool_free(workspace, &c->compress_workspace[compression_type]); + + if (ret) + goto err; + + /* Didn't get smaller: */ + if (round_up(*dst_len, block_bytes(c)) >= *src_len) + goto err; + + pad = round_up(*dst_len, block_bytes(c)) - *dst_len; + + memset(dst_data.b + *dst_len, 0, pad); + *dst_len += pad; + + if (dst_data.type != BB_NONE) + memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); +out: + bio_unmap_or_unbounce(c, src_data); + bio_unmap_or_unbounce(c, dst_data); + return compression_type; +err: + compression_type = 0; + goto out; +} + +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) +{ + unsigned orig_dst = dst->bi_iter.bi_size; + unsigned orig_src = src->bi_iter.bi_size; + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, + c->sb.encoded_extent_max << 9); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + if (compression_type == BCH_COMPRESSION_LZ4_OLD) + compression_type = BCH_COMPRESSION_LZ4; + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, compression_type); + + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; + return compression_type; +} + +static int __bch2_fs_compress_init(struct bch_fs *, u64); + +#define BCH_FEATURE_NONE 0 + +static const unsigned bch2_compression_opt_to_feature[] = { +#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, + BCH_COMPRESSION_TYPES() +#undef x +}; + +#undef BCH_FEATURE_NONE + +static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) +{ + int ret = 0; + + if ((c->sb.features & f) == f) + return 0; + + mutex_lock(&c->sb_lock); + + if ((c->sb.features & f) == f) { + mutex_unlock(&c->sb_lock); + return 0; + } + + ret = __bch2_fs_compress_init(c, c->sb.features|f); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } + + c->disk_sb.sb->features[0] |= cpu_to_le64(f); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_type) +{ + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + + return compression_type + ? 
__bch2_check_set_has_compressed_data(c, + 1ULL << bch2_compression_opt_to_feature[compression_type]) + : 0; +} + +void bch2_fs_compress_exit(struct bch_fs *c) +{ + unsigned i; + + mempool_exit(&c->decompress_workspace); + for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) + mempool_exit(&c->compress_workspace[i]); + mempool_exit(&c->compression_bounce[WRITE]); + mempool_exit(&c->compression_bounce[READ]); +} + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ + size_t max_extent = c->sb.encoded_extent_max << 9; + size_t order = get_order(max_extent); + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; + ZSTD_parameters params = zstd_get_params(0, max_extent); + struct { + unsigned feature; + unsigned type; + size_t compress_workspace; + size_t decompress_workspace; + } compression_types[] = { + { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 }, + { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP, + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize(), }, + { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD, + zstd_cctx_workspace_bound(¶ms.cParams), + zstd_dctx_workspace_bound() }, + }, *i; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + c->zstd_params = params; + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) + if (features & (1 << i->feature)) + goto have_compressed; + + goto out; +have_compressed: + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_page_pool(&c->compression_bounce[READ], + 1, order); + if (ret) + goto out; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_page_pool(&c->compression_bounce[WRITE], + 1, order); + if (ret) + goto out; + } + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) { + decompress_workspace_size = + max(decompress_workspace_size, i->decompress_workspace); + + if (!(features & (1 << i->feature))) + continue; + + if (i->decompress_workspace) + decompress_workspace_needed = true; + + if (mempool_initialized(&c->compress_workspace[i->type])) + continue; + + ret = mempool_init_kvpmalloc_pool( + &c->compress_workspace[i->type], + 1, i->compress_workspace); + if (ret) + goto out; + } + + ret = mempool_init_kmalloc_pool( + &c->decompress_workspace, + 1, decompress_workspace_size); + if (ret) + goto out; +out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} + +int bch2_fs_compress_init(struct bch_fs *c) +{ + u64 f = c->sb.features; + + if (c->opts.compression) + f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; + + if (c->opts.background_compression) + f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; + + return __bch2_fs_compress_init(c, f); + +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 index 000000000000..4bab1f61b3b5 --- /dev/null +++ b/fs/bcachefs/compress.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H + +#include "extents_types.h" + +int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, + struct bch_extent_crc_unpacked *); +int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct bvec_iter, struct bch_extent_crc_unpacked); +unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned); + +int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); +void 
bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 index 000000000000..7db0e65927c6 --- /dev/null +++ b/fs/bcachefs/debug.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "super.h" + +#include +#include +#include +#include +#include + +static struct dentry *bch_debug; + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct btree *v = c->verify_data; + struct btree_node *n_ondisk, *n_sorted, *n_inmemory; + struct bset *sorted, *inmemory; + struct extent_pick_ptr pick; + struct bch_dev *ca; + struct bio *bio; + + if (c->opts.nochanges) + return; + + btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + n_ondisk = c->verify_ondisk; + n_sorted = c->verify_data->data; + n_inmemory = b->data; + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + v->btree_id = b->btree_id; + bch2_btree_keys_init(v, &c->expensive_debug_checks); + + if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + return; + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) + return; + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_sorted, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOIO, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + bch2_bio_map(bio, n_sorted); + + submit_bio_wait(bio); + + bio_put(bio); + percpu_ref_put(&ca->io_ref); + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + + if (bch2_btree_node_read_done(c, v, false)) + goto out; + + n_sorted = c->verify_data->data; + sorted = &n_sorted->keys; + inmemory = &n_inmemory->keys; + + if (inmemory->u64s != sorted->u64s || + memcmp(inmemory->start, + sorted->start, + vstruct_end(inmemory) - (void *) inmemory->start)) { + unsigned offset = 0, sectors; + struct bset *i; + unsigned j; + + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); + bch2_dump_bset(b, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch2_dump_bset(v, sorted, 0); + + while (offset < b->written) { + if (!offset ) { + i = &n_ondisk->keys; + sectors = vstruct_blocks(n_ondisk, c->block_bits) << + c->block_bits; + } else { + struct btree_node_entry *bne = + (void *) n_ondisk + (offset << 9); + i = &bne->keys; + + sectors = vstruct_blocks(bne, c->block_bits) << + c->block_bits; + } + + printk(KERN_ERR "*** on disk block %u:\n", offset); + bch2_dump_bset(b, i, offset); + + offset += sectors; + } + + printk(KERN_ERR "*** block %u/%u not written\n", + offset >> c->block_bits, btree_blocks(c)); + + for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) + if (inmemory->_data[j] != sorted->_data[j]) + break; + + printk(KERN_ERR "b->written %u\n", b->written); + + console_unlock(); + panic("verify failed at %u\n", j); + } +out: + mutex_unlock(&c->verify_lock); + btree_node_io_unlock(b); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: bch_fs refcounting */ + +struct dump_iter { + struct bpos from; + struct bch_fs *c; + enum btree_id id; + + char buf[PAGE_SIZE]; + 
size_t bytes; /* what's currently in buf */ + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +static int flush_buf(struct dump_iter *i) +{ + if (i->bytes) { + size_t bytes = min(i->bytes, i->size); + int err = copy_to_user(i->ubuf, i->buf, bytes); + + if (err) + return err; + + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + } + + return 0; +} + +static int bch2_dump_open(struct inode *inode, struct file *file) +{ + struct btree_debug *bd = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t bch2_read_btree(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct bkey_s_c k; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + return i->ret; + + bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + k = bch2_btree_iter_peek(&iter); + + while (k.k && !(err = btree_iter_err(k))) { + bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), + i->buf, sizeof(i->buf), k); + i->bytes = strlen(i->buf); + BUG_ON(i->bytes >= PAGE_SIZE); + i->buf[i->bytes] = '\n'; + i->bytes++; + + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; + + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? err : i->ret; +} + +static const struct file_operations btree_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree, +}; + +static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct btree *b; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size || !bkey_cmp(POS_MAX, i->from)) + return i->ret; + + for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { + i->bytes = bch2_print_btree_node(i->c, b, i->buf, + sizeof(i->buf)); + err = flush_buf(i); + if (err) + break; + + /* + * can't easily correctly restart a btree node traversal across + * all nodes, meh + */ + i->from = bkey_cmp(POS_MAX, b->key.k.p) + ? bkey_successor(b->key.k.p) + : b->key.k.p; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? 
err : i->ret; +} + +static const struct file_operations btree_format_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree_formats, +}; + +static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct bkey_s_c k; + struct btree *prev_node = NULL; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + return i->ret; + + bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = btree_iter_err(k))) { + struct btree_iter_level *l = &iter.l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (l->b != prev_node) { + i->bytes = bch2_print_btree_node(i->c, l->b, i->buf, + sizeof(i->buf)); + err = flush_buf(i); + if (err) + break; + } + prev_node = l->b; + + i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf, + sizeof(i->buf)); + + err = flush_buf(i); + if (err) + break; + + bch2_btree_iter_next(&iter); + i->from = iter.pos; + + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? err : i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_bfloat_failed, +}; + +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->debug)) + debugfs_remove_recursive(c->debug); +} + +void bch2_fs_debug_init(struct bch_fs *c) +{ + struct btree_debug *bd; + char name[100]; + + if (IS_ERR_OR_NULL(bch_debug)) + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + c->debug = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->debug)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; + bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->debug, bd, + &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + + bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, + &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + + bd->failed = debugfs_create_file(name, 0400, c->debug, bd, + &bfloat_failed_debug_ops); + } +} + +#endif + +void bch2_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_debug)) + debugfs_remove_recursive(bch_debug); +} + +int __init bch2_debug_init(void) +{ + int ret = 0; + + bch_debug = debugfs_create_dir("bcachefs", NULL); + return ret; +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 index 000000000000..56c2d1ab5f63 --- /dev/null +++ b/fs/bcachefs/debug.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DEBUG_H +#define _BCACHEFS_DEBUG_H + +#include "bcachefs.h" + +struct bio; +struct btree; +struct bch_fs; + +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) \ + { return bch2_##name || c->name; } +BCH_DEBUG_PARAMS_ALWAYS() +#undef BCH_DEBUG_PARAM + +#ifdef CONFIG_BCACHEFS_DEBUG + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) \ + { return bch2_##name || c->name; 
} +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM + +void __bch2_btree_verify(struct bch_fs *, struct btree *); + +#define bypass_torture_test(d) ((d)->bypass_torture_test) + +#else /* DEBUG */ + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) { return false; } +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM + +static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} + +#define bypass_torture_test(d) 0 + +#endif + +static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + if (verify_btree_ondisk(c)) + __bch2_btree_verify(c, b); +} + +#ifdef CONFIG_DEBUG_FS +void bch2_fs_debug_exit(struct bch_fs *); +void bch2_fs_debug_init(struct bch_fs *); +#else +static inline void bch2_fs_debug_exit(struct bch_fs *c) {} +static inline void bch2_fs_debug_init(struct bch_fs *c) {} +#endif + +void bch2_debug_exit(void); +int bch2_debug_init(void); + +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 index 000000000000..9e5936faf1af --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "dirent.h" +#include "fs.h" +#include "keylist.h" +#include "str_hash.h" + +#include + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned len = bkey_val_bytes(d.k) - + offsetof(struct bch_dirent, d_name); + + while (len && !d.v->d_name[len - 1]) + --len; + + return len; +} + +static unsigned dirent_val_u64s(unsigned len) +{ + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, + const struct qstr *name) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, name->name, name->len); + + /* [0,2) reserved for dots */ + return max_t(u64, bch2_str_hash_end(&ctx, info), 2); +} + +static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_dirent_hash(info, key); +} + +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + + return bch2_dirent_hash(info, &name); +} + +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + int len = bch2_dirent_name_bytes(l); + const struct qstr *r = _r; + + return len - r->len ?: memcmp(l.v->d_name, r->name, len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); + int l_len = bch2_dirent_name_bytes(l); + int r_len = bch2_dirent_name_bytes(r); + + return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); +} + +const struct bch_hash_desc bch2_dirent_hash_desc = { + .btree_id = BTREE_ID_DIRENTS, + .key_type = BCH_DIRENT, + .whiteout_type = BCH_DIRENT_WHITEOUT, + .hash_key = dirent_hash_key, + .hash_bkey = dirent_hash_bkey, + .cmp_key = dirent_cmp_key, + .cmp_bkey = dirent_cmp_bkey, +}; + +const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d; + unsigned len; + + switch (k.k->type) { + case BCH_DIRENT: + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) + return "value too small"; + + d = 
bkey_s_c_to_dirent(k); + len = bch2_dirent_name_bytes(d); + + if (!len) + return "empty name"; + + /* + * older versions of bcachefs were buggy and creating dirent + * keys that were bigger than necessary: + */ + if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) + return "value too big"; + + if (len > BCH_NAME_MAX) + return "dirent name too big"; + + if (memchr(d.v->d_name, '/', len)) + return "dirent name has invalid characters"; + + return NULL; + case BCH_DIRENT_WHITEOUT: + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; + + default: + return "invalid type"; + } +} + +void bch2_dirent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d; + size_t n = 0; + + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + n += bch_scnmemcpy(buf + n, size - n, d.v->d_name, + bch2_dirent_name_bytes(d)); + n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum); + break; + case BCH_DIRENT_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + u8 type, const struct qstr *name, u64 dst) +{ + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + BUG_ON(u64s > U8_MAX); + + dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; + + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = u64s; + dirent->v.d_inum = cpu_to_le64(dst); + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); + + EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + + return dirent; +} + +int __bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + int flags) +{ + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); +} + +int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *journal_seq, int flags) +{ + return bch2_trans_do(c, journal_seq, flags, + __bch2_dirent_create(&trans, dir_inum, hash_info, + type, name, dst_inum, flags)); +} + +static void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + +static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, + const struct qstr *name) +{ + return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); +} + +int bch2_dirent_rename(struct btree_trans *trans, + struct bch_inode_info *src_dir, const struct qstr *src_name, + struct bch_inode_info *dst_dir, const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct btree_iter *src_iter, *dst_iter; + struct bkey_s_c old_src, old_dst; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); + int ret; + + /* + * Lookup dst: + * + * Note that in BCH_RENAME mode, we're _not_ checking if + * the target already exists - we're relying on the VFS + * to do that check for us for 
correctness: + */ + dst_iter = mode == BCH_RENAME + ? bch2_hash_hole(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name) + : bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name, + BTREE_ITER_INTENT); + if (IS_ERR(dst_iter)) + return PTR_ERR(dst_iter); + old_dst = bch2_btree_iter_peek_slot(dst_iter); + + /* Lookup src: */ + src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_dir->v.i_ino, src_name, + BTREE_ITER_INTENT); + if (IS_ERR(src_iter)) + return PTR_ERR(src_iter); + old_src = bch2_btree_iter_peek_slot(src_iter); + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, 0, dst_name, 0); + if (IS_ERR(new_dst)) + return PTR_ERR(new_dst); + + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter->pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, 0, src_name, 0); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + new_src->k.p = src_iter->pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); + bkey_init(&new_src->k); + new_src->k.p = src_iter->pos; + + if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && + bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { + /* + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between + * new_dst's hashed slot and the slot we're going to be + * inserting it into - oops. This will break the hash + * table if we don't deal with it: + */ + if (mode == BCH_RENAME) { + /* + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); + return 0; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to + * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ + new_src->k.type = BCH_DIRENT_WHITEOUT; + } + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_iter); + if (ret < 0) + return ret; + + if (ret) + new_src->k.type = BCH_DIRENT_WHITEOUT; + } + } + + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + return 0; +} + +int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, name); +} + +int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name, + u64 *journal_seq) +{ + return bch2_trans_do(c, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); +} + +u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 inum = 0; + + bch2_trans_init(&trans, c); + + iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, 0); + if (IS_ERR(iter)) { + BUG_ON(PTR_ERR(iter) == -EINTR); + goto out; + } + + k = bch2_btree_iter_peek_slot(iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); 
+out: + bch2_trans_exit(&trans); + return inum; +} + +int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) { + if (k.k->p.inode > dir_inum) + break; + + if (k.k->type == BCH_DIRENT) { + ret = -ENOTEMPTY; + break; + } + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + +int bch2_readdir(struct bch_fs *c, struct file *file, + struct dir_context *ctx) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + unsigned len; + + if (!dir_emit_dots(file, ctx)) + return 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(inode->v.i_ino, ctx->pos), 0, k) { + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0) + continue; + + if (k.k->p.inode > inode->v.i_ino) + break; + + len = bch2_dirent_name_bytes(dirent); + + /* + * XXX: dir_emit() can fault and block, while we're holding + * locks + */ + if (!dir_emit(ctx, dirent.v->d_name, len, + le64_to_cpu(dirent.v->d_inum), + dirent.v->d_type)) + break; + + ctx->pos = k.k->p.offset + 1; + } + bch2_btree_iter_unlock(&iter); + + return 0; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 index 000000000000..d02dc3e10d95 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_H +#define _BCACHEFS_DIRENT_H + +#include "str_hash.h" + +extern const struct bch_hash_desc bch2_dirent_hash_desc; + +const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_dirent_ops (struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ +} + +struct qstr; +struct file; +struct dir_context; +struct bch_fs; +struct bch_hash_info; +struct bch_inode_info; + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); + +int __bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); +int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, + u8, const struct qstr *, u64, u64 *, int); + +int __bch2_dirent_delete(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *); +int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, + const struct qstr *, u64 *); + +enum bch_rename_mode { + BCH_RENAME, + BCH_RENAME_OVERWRITE, + BCH_RENAME_EXCHANGE, +}; + +int bch2_dirent_rename(struct btree_trans *, + struct bch_inode_info *, const struct qstr *, + struct bch_inode_info *, const struct qstr *, + enum bch_rename_mode); + +u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, + const struct qstr *); + +int bch2_empty_dir(struct bch_fs *, u64); +int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); + +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 index 000000000000..48f472a384f1 --- /dev/null +++ b/fs/bcachefs/disk_groups.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" +#include "super-io.h" + +#include + +static int group_cmp(const void *_l, const void *_r) +{ + const struct bch_disk_group *l = _l; + const struct bch_disk_group *r = _r; + + 
return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - + (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: + ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - + (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: + strncmp(l->label, r->label, sizeof(l->label)); +} + +static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; + struct bch_sb_field_members *mi; + struct bch_member *m; + unsigned i, nr_groups, len; + const char *err = NULL; + + mi = bch2_sb_get_members(sb); + groups = bch2_sb_get_disk_groups(sb); + nr_groups = disk_groups_nr(groups); + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) { + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) + continue; + + g = BCH_MEMBER_GROUP(m) - 1; + + if (g >= nr_groups || + BCH_GROUP_DELETED(&groups->entries[g])) + return "disk has invalid group"; + } + + if (!nr_groups) + return NULL; + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { + err = "group with empty label"; + goto err; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) + return "cannot allocate memory"; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + + for (i = 0; i + 1 < nr_groups; i++) + if (!BCH_GROUP_DELETED(sorted + i) && + !group_cmp(sorted + i, sorted + i + 1)) { + err = "duplicate groups"; + goto err; + } + + err = NULL; +err: + kfree(sorted); + return err; +} + +static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + char *out = buf, *end = buf + size; + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g; + unsigned nr_groups = disk_groups_nr(groups); + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (g != groups->entries) + out += scnprintf(out, end - out, " "); + + if (BCH_GROUP_DELETED(g)) + out += scnprintf(out, end - out, "[deleted]"); + else + out += scnprintf(out, end - out, + "[parent %llu name %s]", + BCH_GROUP_PARENT(g), + g->label); + } + + return out - buf; +} + +const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { + .validate = bch2_sb_disk_groups_validate, + .to_text = bch2_sb_disk_groups_to_text +}; + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field_disk_groups *groups; + struct bch_disk_groups_cpu *cpu_g, *old_g; + unsigned i, g, nr_groups; + + lockdep_assert_held(&c->sb_lock); + + mi = bch2_sb_get_members(c->disk_sb.sb); + groups = bch2_sb_get_disk_groups(c->disk_sb.sb); + nr_groups = disk_groups_nr(groups); + + if (!groups) + return 0; + + cpu_g = kzalloc(sizeof(*cpu_g) + + sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); + if (!cpu_g) + return -ENOMEM; + + cpu_g->nr = nr_groups; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *src = &groups->entries[i]; + struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; + + dst->deleted = BCH_GROUP_DELETED(src); + dst->parent = BCH_GROUP_PARENT(src); + } + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + struct bch_disk_group_cpu *dst = + &cpu_g->entries[BCH_MEMBER_GROUP(m)]; + + if (!bch2_member_exists(m)) + continue; + + g = 
BCH_MEMBER_GROUP(m); + while (g) { + dst = &cpu_g->entries[g - 1]; + __set_bit(i, dst->devs.d); + g = dst->parent; + } + } + + old_g = rcu_dereference_protected(c->disk_groups, + lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->disk_groups, cpu_g); + if (old_g) + kfree_rcu(old_g, rcu); + + return 0; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return NULL; + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + return ca ? &ca->self : NULL; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + return t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + } + default: + BUG(); + } +} + +bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return false; + case TARGET_DEV: + return dev == t.dev; + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g; + const struct bch_devs_mask *m; + bool ret; + + rcu_read_lock(); + g = rcu_dereference(c->disk_groups); + m = t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + + ret = m ? test_bit(dev, m->d) : false; + rcu_read_unlock(); + + return ret; + } + default: + BUG(); + } +} + +static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, + unsigned parent, + const char *name, unsigned namelen) +{ + unsigned i, nr_groups = disk_groups_nr(groups); + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + if (!BCH_GROUP_DELETED(g) && + BCH_GROUP_PARENT(g) == parent && + strnlen(g->label, sizeof(g->label)) == namelen && + !memcmp(name, g->label, namelen)) + return i; + } + + return -1; +} + +static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, + const char *name, unsigned namelen) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + unsigned i, nr_groups = disk_groups_nr(groups); + struct bch_disk_group *g; + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; + i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); + i++) + ; + + if (i == nr_groups) { + unsigned u64s = + (sizeof(struct bch_sb_field_disk_groups) + + sizeof(struct bch_disk_group) * (nr_groups + 1)) / + sizeof(u64); + + groups = bch2_sb_resize_disk_groups(sb, u64s); + if (!groups) + return -ENOSPC; + + nr_groups = disk_groups_nr(groups); + } + + BUG_ON(i >= nr_groups); + + g = &groups->entries[i]; + + memcpy(g->label, name, namelen); + if (namelen < sizeof(g->label)) + g->label[namelen] = '\0'; + SET_BCH_GROUP_DELETED(g, 0); + SET_BCH_GROUP_PARENT(g, parent); + SET_BCH_GROUP_DATA_ALLOWED(g, ~0); + + return i; +} + +int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + v = __bch2_disk_group_find(groups, v + 1, name, len); + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups; + 
unsigned parent = 0; + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + groups = bch2_sb_get_disk_groups(sb->sb); + + v = __bch2_disk_group_find(groups, parent, name, len); + if (v < 0) + v = __bch2_disk_group_add(sb, parent, name, len); + if (v < 0) + return v; + + parent = v + 1; + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_print(struct bch_sb_handle *sb, + char *buf, size_t len, unsigned v) +{ + char *out = buf, *end = out + len; + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto inval; + + if (v >= disk_groups_nr(groups)) + goto inval; + + g = groups->entries + v; + + if (BCH_GROUP_DELETED(g)) + goto inval; + + path[nr++] = v; + + if (!BCH_GROUP_PARENT(g)) + break; + + v = BCH_GROUP_PARENT(g) - 1; + } + + while (nr) { + unsigned b = 0; + + v = path[--nr]; + g = groups->entries + v; + + if (end != out) + b = min_t(size_t, end - out, + strnlen(g->label, sizeof(g->label))); + memcpy(out, g->label, b); + if (b < end - out) + out[b] = '\0'; + out += b; + + if (nr) + out += scnprintf(out, end - out, "."); + } + + return out - buf; +inval: + return scnprintf(buf, len, "invalid group %u", v); +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + struct bch_member *mi; + int v = -1; + + mutex_lock(&c->sb_lock); + + if (!strlen(name) || !strcmp(name, "none")) + goto write_sb; + + v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) { + mutex_unlock(&c->sb_lock); + return v; + } + +write_sb: + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_GROUP(mi, v + 1); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +{ + struct bch_dev *ca; + int g; + + if (!strlen(buf) || !strcmp(buf, "none")) { + *v = 0; + return 0; + } + + /* Is it a device? */ + ca = bch2_dev_lookup(c, buf); + if (!IS_ERR(ca)) { + *v = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + mutex_lock(&c->sb_lock); + g = bch2_disk_path_find(&c->disk_sb, buf); + mutex_unlock(&c->sb_lock); + + if (g >= 0) { + *v = group_to_target(g); + return 0; + } + + return -EINVAL; +} + +int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) +{ + struct target t = target_decode(v); + int ret; + + switch (t.type) { + case TARGET_NULL: + return scnprintf(buf, len, "none"); + case TARGET_DEV: { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? 
rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + ret = scnprintf(buf, len, "/dev/%pg", + ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + ret = scnprintf(buf, len, "offline device %u", t.dev); + } else { + ret = scnprintf(buf, len, "invalid device %u", t.dev); + } + + rcu_read_unlock(); + break; + } + case TARGET_GROUP: + mutex_lock(&c->sb_lock); + ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group); + mutex_unlock(&c->sb_lock); + break; + default: + BUG(); + } + + return ret; +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 index 000000000000..d202eb3a9de6 --- /dev/null +++ b/fs/bcachefs/disk_groups.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H + +extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; + +static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) +{ + return groups + ? (vstruct_end(&groups->field) - + (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) + : 0; +} + +struct target { + enum { + TARGET_NULL, + TARGET_DEV, + TARGET_GROUP, + } type; + union { + unsigned dev; + unsigned group; + }; +}; + +#define TARGET_DEV_START 1 +#define TARGET_GROUP_START (256 + TARGET_DEV_START) + +static inline u16 dev_to_target(unsigned dev) +{ + return TARGET_DEV_START + dev; +} + +static inline u16 group_to_target(unsigned group) +{ + return TARGET_GROUP_START + group; +} + +static inline struct target target_decode(unsigned target) +{ + if (target >= TARGET_GROUP_START) + return (struct target) { + .type = TARGET_GROUP, + .group = target - TARGET_GROUP_START + }; + + if (target >= TARGET_DEV_START) + return (struct target) { + .type = TARGET_DEV, + .group = target - TARGET_DEV_START + }; + + return (struct target) { .type = TARGET_NULL }; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); +bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); + +int bch2_disk_path_find(struct bch_sb_handle *, const char *); +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); +int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); + +const char *bch2_sb_validate_disk_groups(struct bch_sb *, + struct bch_sb_field *); + +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 index 000000000000..e975fab43d49 --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" +#include "io.h" +#include "super.h" + +bool bch2_inconsistent_error(struct bch_fs *c) +{ + set_bit(BCH_FS_ERROR, &c->flags); + + switch (c->opts.errors) { + case BCH_ON_ERROR_CONTINUE: + return false; + case BCH_ON_ERROR_RO: + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only"); + return true; + case BCH_ON_ERROR_PANIC: + panic(bch2_fmt(c, "panic after error")); + return true; + default: + BUG(); + } +} + +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only"); +} + +void bch2_io_error_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, 
struct bch_dev, io_error_work); + struct bch_fs *c = ca->fs; + bool dev; + + mutex_lock(&c->state_lock); + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, + BCH_FORCE_IF_DEGRADED); + if (dev + ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, + BCH_FORCE_IF_DEGRADED) + : bch2_fs_emergency_read_only(c)) + bch_err(ca, + "too many IO errors, setting %s RO", + dev ? "device" : "filesystem"); + mutex_unlock(&c->state_lock); +} + +void bch2_io_error(struct bch_dev *ca) +{ + //queue_work(system_long_wq, &ca->io_error_work); +} + +#ifdef __KERNEL__ +#define ask_yn() false +#else +#include "tools-util.h" +#endif + +enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, + const char *fmt, ...) +{ + struct fsck_err_state *s; + va_list args; + bool fix = false, print = true, suppressing = false; + char _buf[sizeof(s->buf)], *buf = _buf; + + mutex_lock(&c->fsck_error_lock); + + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + goto print; + + list_for_each_entry(s, &c->fsck_errors, list) + if (s->fmt == fmt) + goto found; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + if (!c->fsck_alloc_err) + bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); + c->fsck_alloc_err = true; + buf = _buf; + goto print; + } + + INIT_LIST_HEAD(&s->list); + s->fmt = fmt; +found: + list_move(&s->list, &c->fsck_errors); + s->nr++; + suppressing = s->nr == 10; + print = s->nr <= 10; + buf = s->buf; +print: + va_start(args, fmt); + vscnprintf(buf, sizeof(_buf), fmt, args); + va_end(args); + + if (c->opts.fix_errors == FSCK_OPT_EXIT) { + bch_err(c, "%s, exiting", buf); + mutex_unlock(&c->fsck_error_lock); + return FSCK_ERR_EXIT; + } + + if (flags & FSCK_CAN_FIX) { + if (c->opts.fix_errors == FSCK_OPT_ASK) { + printk(KERN_ERR "%s: fix?", buf); + fix = ask_yn(); + } else if (c->opts.fix_errors == FSCK_OPT_YES || + (c->opts.nochanges && + !(flags & FSCK_CAN_IGNORE))) { + if (print) + bch_err(c, "%s, fixing", buf); + fix = true; + } else { + if (print) + bch_err(c, "%s, not fixing", buf); + fix = false; + } + } else if (flags & FSCK_NEED_FSCK) { + if (print) + bch_err(c, "%s (run fsck to correct)", buf); + } else { + if (print) + bch_err(c, "%s (repair unimplemented)", buf); + } + + if (suppressing) + bch_err(c, "Ratelimiting new instances of previous error"); + + mutex_unlock(&c->fsck_error_lock); + + if (fix) + set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags); + + return fix ? FSCK_ERR_FIX + : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE + : FSCK_ERR_EXIT; +} + +void bch2_flush_fsck_errs(struct bch_fs *c) +{ + struct fsck_err_state *s, *n; + + mutex_lock(&c->fsck_error_lock); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + list_for_each_entry_safe(s, n, &c->fsck_errors, list) { + if (s->nr > 10) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); + + list_del(&s->list); + kfree(s); + } + + mutex_unlock(&c->fsck_error_lock); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 index 000000000000..2591e12305b7 --- /dev/null +++ b/fs/bcachefs/error.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H + +#include +#include + +struct bch_dev; +struct bch_fs; +struct work_struct; + +/* + * XXX: separate out errors that indicate on disk data is inconsistent, and flag + * superblock as such + */ + +/* Error messages: */ + +/* + * Very fatal logic/inconsistency errors: these indicate that we've majorly + * screwed up at runtime, i.e. it's not likely that it was just caused by the + * data on disk being inconsistent. 
These BUG(): + * + * XXX: audit and convert to inconsistent() checks + */ + +#define bch2_fs_bug(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + BUG(); \ +} while (0) + +#define bch2_fs_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + bch2_fs_bug(c, __VA_ARGS__); \ +} while (0) + +/* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all + * the metadata before modifying anything. If they occur at runtime, they + * indicate either a bug in the running code or (less likely) data is being + * silently corrupted under us. + * + * XXX: audit all inconsistent errors and make sure they're all recoverable, in + * BCH_ON_ERROR_CONTINUE mode + */ + +bool bch2_inconsistent_error(struct bch_fs *); + +#define bch2_fs_inconsistent(c, ...) \ +({ \ + bch_err(c, __VA_ARGS__); \ + bch2_inconsistent_error(c); \ +}) + +#define bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Later we might want to mark only the particular device inconsistent, not the + * entire filesystem: + */ + +#define bch2_dev_inconsistent(ca, ...) \ +do { \ + bch_err(ca, __VA_ARGS__); \ + bch2_inconsistent_error((ca)->fs); \ +} while (0) + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ + +enum { + BCH_FSCK_OK = 0, + BCH_FSCK_ERRORS_NOT_FIXED = 1, + BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, + BCH_FSCK_REPAIR_IMPOSSIBLE = 3, + BCH_FSCK_UNKNOWN_VERSION = 4, +}; + +enum fsck_err_opts { + FSCK_OPT_EXIT, + FSCK_OPT_YES, + FSCK_OPT_NO, + FSCK_OPT_ASK, +}; + +enum fsck_err_ret { + FSCK_ERR_IGNORE = 0, + FSCK_ERR_FIX = 1, + FSCK_ERR_EXIT = 2, +}; + +struct fsck_err_state { + struct list_head list; + const char *fmt; + u64 nr; + char buf[512]; +}; + +#define FSCK_CAN_FIX (1 << 0) +#define FSCK_CAN_IGNORE (1 << 1) +#define FSCK_NEED_FSCK (1 << 2) + +enum fsck_err_ret bch2_fsck_err(struct bch_fs *, + unsigned, const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + +#define __fsck_err(c, _flags, msg, ...) \ +({ \ + int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ + \ + if (_fix == FSCK_ERR_EXIT) { \ + bch_err(c, "Unable to continue, halting"); \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + \ + _fix; \ +}) + +/* These macros return true if error should be fixed: */ + +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#define __fsck_err_on(cond, c, _flags, ...) \ + ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + +#define need_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + +#define mustfix_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) + +#define fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + +#define fsck_err_on(cond, c, ...) 
\ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW + * mode - pretty much just due to metadata IO errors: + */ + +void bch2_fatal_error(struct bch_fs *); + +#define bch2_fs_fatal_error(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_fatal_error(c); \ +} while (0) + +#define bch2_fs_fatal_err_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * IO errors: either recoverable metadata IO (because we have replicas), or data + * IO - we need to log it and print out a message, but we don't (necessarily) + * want to shut down the fs: + */ + +void bch2_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ +void bch2_io_error(struct bch_dev *); + +/* Logs message and handles the error: */ +#define bch2_dev_io_error(ca, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ + "IO error on %s for " fmt), \ + (ca)->name, ##__VA_ARGS__); \ + bch2_io_error(ca); \ +} while (0) + +#define bch2_dev_io_err_on(cond, ca, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) \ + bch2_dev_io_error(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* kill? */ + +#define __bcache_io_error(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, \ + "IO error: " fmt), ##__VA_ARGS__) + +#define bcache_io_error(c, bio, fmt, ...) \ +do { \ + __bcache_io_error(c, fmt, ##__VA_ARGS__); \ + (bio)->bi_status = BLK_STS_IOERR; \ +} while (0) + +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 index 000000000000..2c1cf29e265a --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,2395 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * + * Code for managing the extent btree and dynamically updating the writeback + * dirty sector count. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "dirent.h" +#include "disk_groups.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "journal.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" +#include "util.h" +#include "xattr.h" + +static void sort_key_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + i->k += __btree_node_offset_to_key(b, i->k)->u64s; + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. 
+ */ +#define key_sort_cmp(h, l, r) \ +({ \ + bkey_cmp_packed(b, \ + __btree_node_offset_to_key(b, (l).k), \ + __btree_node_offset_to_key(b, (r).k)) \ + \ + ?: (l).k - (r).k; \ +}) + +static inline bool should_drop_next_key(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; + struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); + + if (bkey_whiteout(k)) + return true; + + if (iter->used < 2) + return false; + + if (iter->used > 2 && + key_sort_cmp(iter, r[0], r[1]) >= 0) + r++; + + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older and + * should be dropped. + */ + return !bkey_cmp_packed(b, + __btree_node_offset_to_key(b, l->k), + __btree_node_offset_to_key(b, r->k)); +} + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_packed *out = dst->start; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, key_sort_cmp); + + while (!bch2_btree_node_iter_large_end(iter)) { + if (!should_drop_next_key(iter, b)) { + struct bkey_packed *k = + __btree_node_offset_to_key(b, iter->data->k); + + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + sort_key_next(iter, b, iter->data); + heap_sift_down(iter, 0, key_sort_cmp); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Common among btree and extent ptrs */ + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +{ + struct bch_extent_ptr *ptr; + bool dropped = false; + + extent_for_each_ptr_backwards(e, ptr) + if (ptr->dev == dev) { + __bch2_extent_drop_ptr(e, ptr); + dropped = true; + } + + if (dropped) + bch2_extent_drop_redundant_crcs(e); + return dropped; +} + +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.group && + ca->mi.group - 1 == group) + return ptr; + } + + return NULL; +} + +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return ptr; + + return NULL; +} + +unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + extent_for_each_ptr(e, ptr) + nr_ptrs++; + + return nr_ptrs; +} + +unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) + nr_ptrs += !ptr->cached; + break; + + case BCH_RESERVATION: + nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return nr_ptrs; +} + +unsigned bch2_extent_ptr_durability(struct bch_fs *c, + const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca; 
+ + if (ptr->cached) + return 0; + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + return 0; + + return ca->mi.durability; +} + +unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + unsigned durability = 0; + + extent_for_each_ptr(e, ptr) + durability += bch2_extent_ptr_durability(c, ptr); + + return durability; +} + +unsigned bch2_extent_is_compressed(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned ret = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (!ptr->cached && + crc.compression_type != BCH_COMPRESSION_NONE && + crc.compressed_size < crc.live_size) + ret = max_t(unsigned, ret, crc.compressed_size); + } + + return ret; +} + +bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, + struct bch_extent_ptr m, u64 offset) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + if (ptr->dev == m.dev && + ptr->gen == m.gen && + (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + (s64) m.offset - offset) + return ptr; + + return NULL; +} + +/* Doesn't cleanup redundant crcs */ +void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +{ + EBUG_ON(ptr < &e.v->start->ptr || + ptr >= &extent_entry_last(e)->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + memmove_u64s_down(ptr, ptr + 1, + (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); + e.k->u64s -= sizeof(*ptr) / sizeof(u64); +} + +void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +{ + __bch2_extent_drop_ptr(e, ptr); + bch2_extent_drop_redundant_crcs(e); +} + +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) +{ + return !u.compression_type && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == + bch2_csum_type_is_encryption(n.csum_type); +} + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, + struct bch_extent_crc_unpacked n) +{ + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + if (!n.csum_type) + return false; + + extent_for_each_crc(e, crc, i) + if (can_narrow_crc(crc, n)) + return true; + + return false; +} + +/* + * We're writing another replica for this extent, so while we've got the data in + * memory we'll be computing a new checksum for the currently live data. 
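+ * (bch2_extent_narrow_crcs() below requires the new checksum to start at offset 0 and cover exactly the live extent - see the BUG_ON()s.)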
+ * + * If there are other replicas we aren't moving, and they are checksummed but + * not compressed, we can modify them to point to only the data that is + * currently live (so that readers won't have to bounce) while we've got the + * checksum we need: + */ +bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, + struct bch_extent_crc_unpacked n) +{ + struct bch_extent_crc_unpacked u; + struct bch_extent_ptr *ptr; + union bch_extent_entry *i; + + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) + extent_for_each_crc(extent_i_to_s(e), u, i) + if (!u.compression_type && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; + break; + } + + if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) + return false; + + BUG_ON(n.compression_type); + BUG_ON(n.offset); + BUG_ON(n.live_size != e->k.size); + + bch2_extent_crc_append(e, n); +restart_narrow_pointers: + extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) + if (can_narrow_crc(u, n)) { + ptr->offset += u.offset; + extent_ptr_append(e, *ptr); + __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + goto restart_narrow_pointers; + } + + bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); + return true; +} + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); +} + +void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) +{ + union bch_extent_entry *entry = e.v->start; + union bch_extent_crc *crc, *prev = NULL; + struct bch_extent_crc_unpacked u, prev_u = { 0 }; + + while (entry != extent_entry_last(e)) { + union bch_extent_entry *next = extent_entry_next(entry); + size_t crc_u64s = extent_entry_u64s(entry); + + if (!extent_entry_is_crc(entry)) + goto next; + + crc = entry_to_crc(entry); + u = bch2_extent_crc_unpack(e.k, crc); + + if (next == extent_entry_last(e)) { + /* crc entry with no pointers after it: */ + goto drop; + } + + if (extent_entry_is_crc(next)) { + /* no pointers before next crc entry: */ + goto drop; + } + + if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { + /* identical to previous crc entry: */ + goto drop; + } + + if (!prev && + !u.csum_type && + !u.compression_type) { + /* null crc entry: */ + union bch_extent_entry *e2; + + extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { + if (!extent_entry_is_ptr(e2)) + break; + + e2->ptr.offset += u.offset; + } + goto drop; + } + + prev = crc; + prev_u = u; +next: + entry = next; + continue; +drop: + memmove_u64s_down(crc, next, + (u64 *) extent_entry_last(e) - (u64 *) next); + e.k->u64s -= crc_u64s; + } + + EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); +} + +static bool should_drop_ptr(const struct bch_fs *c, + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr) +{ + return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); +} + +static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) +{ + struct bch_extent_ptr *ptr = &e.v->start->ptr; + bool dropped = false; + + while ((ptr = extent_ptr_next(e, ptr))) + if (should_drop_ptr(c, e.c, ptr)) { + __bch2_extent_drop_ptr(e, ptr); + dropped = true; + } else + ptr++; + + if (dropped) + bch2_extent_drop_redundant_crcs(e); +} + +bool 
bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) +{ + return bch2_extent_normalize(c, k); +} + +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +{ + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; + + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); + + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_ptr: + break; + } + } + break; + } + } +} + +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) +{ + const struct bch_extent_ptr *ptr2; + struct bch_dev *ca; + + if (ptr->dev >= c->sb.nr_devices || + !c->devs[ptr->dev]) + return "pointer to invalid device"; + + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; + + extent_for_each_ptr(e, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; + + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; + + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; + + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; + + return NULL; +} + +static size_t extent_print_ptrs(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c_extent e) +{ + char *out = buf, *end = buf + size; + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + struct bch_dev *ca; + bool first = true; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + extent_for_each_entry(e, entry) { + if (!first) + p(" "); + + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + + p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); + break; + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + p("ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? 
" stale" : ""); + break; + default: + p("(invalid extent entry %.16llx)", *((u64 *) entry)); + goto out; + } + + first = false; + } +out: + if (bkey_extent_is_cached(e.k)) + p(" cached"); +#undef p + return out - buf; +} + +static inline bool dev_latency_better(struct bch_fs *c, + const struct bch_extent_ptr *ptr1, + const struct bch_extent_ptr *ptr2) +{ + struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; +} + +static int extent_pick_read_device(struct bch_fs *c, + struct bkey_s_c_extent e, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; + int ret = 0; + + extent_for_each_ptr_crc(e, ptr, crc) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr->cached && ptr_stale(ca, ptr)) + continue; + + if (avoid && test_bit(ptr->dev, avoid->d)) + continue; + + if (ret && !dev_latency_better(c, ptr, &pick->ptr)) + continue; + + *pick = (struct extent_pick_ptr) { + .ptr = *ptr, + .crc = crc, + }; + + ret = 1; + } + + return ret; +} + +/* Btree ptrs */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_extent_is_cached(k.k)) + return "cached"; + + if (k.k->size) + return "nonzero key size"; + + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + switch (k.k->type) { + case BCH_EXTENT: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + const struct bch_extent_ptr *ptr; + const char *reason; + + extent_for_each_entry(e, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; + + if (extent_entry_is_crc(entry)) + return "has crc field"; + } + + extent_for_each_ptr(e, ptr) { + reason = extent_ptr_invalid(c, e, ptr, + c->opts.btree_node_size, + true); + if (reason) + return reason; + } + + return NULL; + } + + default: + return "invalid value type"; + } +} + +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + unsigned seq; + const char *err; + char buf[160]; + struct bucket_mark mark; + struct bch_dev *ca; + unsigned replicas = 0; + bool bad; + + extent_for_each_ptr(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + replicas++; + + if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) + continue; + + err = "stale"; + if (ptr_stale(ca, ptr)) + goto err; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + mark = ptr_bucket_mark(ca, ptr); + + bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size); + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + err = "inconsistent"; + if (bad) + goto err; + } + + if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { + bch2_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), k); + bch2_fs_bug(c, + "btree key bad (replicas not marked in superblock):\n%s", + buf); + return; + } + + return; +err: + bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " + "gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, 
(unsigned) mark.v.counter); +} + +void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch2_btree_ptr_invalid(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + +int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) +{ + return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), + avoid, pick); +} + +/* Extents */ + +static bool __bch2_cut_front(struct bpos where, struct bkey_s k) +{ + u64 len = 0; + + if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + return false; + + EBUG_ON(bkey_cmp(where, k.k->p) > 0); + + len = k.k->p.offset - where.offset; + + BUG_ON(len > k.k->size); + + /* + * Don't readjust offset if the key size is now 0, because that could + * cause offset to point to the next bucket: + */ + if (!len) + k.k->type = KEY_TYPE_DELETED; + else if (bkey_extent_is_data(k.k)) { + struct bkey_s_extent e = bkey_s_to_extent(k); + union bch_extent_entry *entry; + bool seen_crc = false; + + extent_for_each_entry(e, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += e.k->size - len; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += e.k->size - len; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += e.k->size - len; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += e.k->size - len; + break; + } + + if (extent_entry_is_crc(entry)) + seen_crc = true; + } + } + + k.k->size = len; + + return true; +} + +bool bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + return __bch2_cut_front(where, bkey_i_to_s(k)); +} + +bool bch2_cut_back(struct bpos where, struct bkey *k) +{ + u64 len = 0; + + if (bkey_cmp(where, k->p) >= 0) + return false; + + EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); + + len = where.offset - bkey_start_offset(k); + + BUG_ON(len > k->size); + + k->p = where; + k->size = len; + + if (!len) + k->type = KEY_TYPE_DELETED; + + return true; +} + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +void bch2_key_resize(struct bkey *k, + unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} + +/* + * In extent_sort_fix_overlapping(), insert_fixup_extent(), + * extent_merge_inline() - we're modifying keys in place that are packed. To do + * that we have to unpack the key, modify the unpacked key - then this + * copies/repacks the unpacked to the original as necessary. 
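+ * (Repacking can fail if the modified key no longer fits the key format; __extent_save() returns false in that case, while extent_save() BUG()s on failure.)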
+ */ +static bool __extent_save(struct btree *b, struct btree_node_iter *iter, + struct bkey_packed *dst, struct bkey *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + bool ret; + + if ((dst_unpacked = packed_to_bkey(dst))) { + dst_unpacked->k = *src; + ret = true; + } else { + ret = bch2_bkey_pack_key(dst, src, f); + } + + if (ret && iter) + bch2_verify_key_order(b, iter, dst); + + return ret; +} + +static void extent_save(struct btree *b, struct btree_node_iter *iter, + struct bkey_packed *dst, struct bkey *src) +{ + BUG_ON(!__extent_save(b, iter, dst, src)); +} + +/* + * If keys compare equal, compare by pointer order: + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +#define extent_sort_cmp(h, l, r) \ +({ \ + struct bkey _ul = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (l).k)); \ + struct bkey _ur = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (r).k)); \ + \ + bkey_cmp(bkey_start_pos(&_ul), \ + bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ +}) + +static inline void extent_sort_sift(struct btree_node_iter_large *iter, + struct btree *b, size_t i) +{ + heap_sift_down(iter, i, extent_sort_cmp); +} + +static inline void extent_sort_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + sort_key_next(iter, b, i); + heap_sift_down(iter, i - iter->data, extent_sort_cmp); +} + +static void extent_sort_append(struct bch_fs *c, + struct btree *b, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev, + struct bkey_packed *k) +{ + struct bkey_format *f = &b->format; + BKEY_PADDED(k) tmp; + + if (bkey_whiteout(k)) + return; + + bch2_bkey_unpack(b, &tmp.k, k); + + if (*prev && + bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) + return; + + if (*prev) { + bch2_bkey_pack(*prev, (void *) *prev, f); + + btree_keys_account_key_add(nr, 0, *prev); + *prev = bkey_next(*prev); + } else { + *prev = start; + } + + bkey_copy(*prev, &tmp.k); +} + +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_format *f = &b->format; + struct btree_node_iter_set *_l = iter->data, *_r; + struct bkey_packed *prev = NULL, *out, *lk, *rk; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, extent_sort_cmp); + + while (!bch2_btree_node_iter_large_end(iter)) { + lk = __btree_node_offset_to_key(b, _l->k); + + if (iter->used == 1) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + _r = iter->data + 1; + if (iter->used > 2 && + extent_sort_cmp(iter, _r[0], _r[1]) >= 0) + _r++; + + rk = __btree_node_offset_to_key(b, _r->k); + + l = __bkey_disassemble(b, lk, &l_unpacked); + r = __bkey_disassemble(b, rk, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + extent_sort_next(iter, b, _r); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so they + * don't overlap. comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. 
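+ * (The "pointers" here are the key offsets _l->k and _r->k within the node: a larger offset means a later bset, and therefore a newer key.)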
+ */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + sort_key_next(iter, b, _r); + } else { + __bch2_cut_front(l.k->p, r); + extent_save(b, NULL, rk, r.k); + } + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + BKEY_PADDED(k) tmp; + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_reassemble(&tmp.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + + __bch2_cut_front(r.k->p, l); + extent_save(b, NULL, lk, l.k); + + extent_sort_sift(iter, b, 0); + + extent_sort_append(c, b, &nr, dst->start, &prev, + bkey_to_packed(&tmp.k)); + } else { + bch2_cut_back(bkey_start_pos(r.k), l.k); + extent_save(b, NULL, lk, l.k); + } + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = dst->start; + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +struct extent_insert_state { + struct btree_insert *trans; + struct btree_insert_entry *insert; + struct bpos committed; + struct bch_fs_usage stats; + + /* for deleting: */ + struct bkey_i whiteout; + bool do_journal; + bool deleting; +}; + +static void bch2_add_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) +{ + struct bch_fs *c = s->trans->c; + struct btree *b = s->insert->iter->l[0].b; + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); + + if (!sectors) + return; + + bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), + &s->stats, s->trans->journal_res.seq, 0); +} + +static void bch2_subtract_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) +{ + bch2_add_sectors(s, k, offset, -sectors); +} + +/* These wrappers subtract exactly the sectors that we're removing from @k */ +static void bch2_cut_subtract_back(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) +{ + bch2_subtract_sectors(s, k.s_c, where.offset, + k.k->p.offset - where.offset); + bch2_cut_back(where, k.k); +} + +static void bch2_cut_subtract_front(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) +{ + bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), + where.offset - bkey_start_offset(k.k)); + __bch2_cut_front(where, k); +} + +static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) +{ + if (k.k->size) + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + k.k->size = 0; + k.k->type = KEY_TYPE_DELETED; +} + +static bool bch2_extent_merge_inline(struct bch_fs *, + struct btree_iter *, + struct bkey_packed *, + struct bkey_packed *, + bool); + +#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) + +static enum btree_insert_ret +extent_insert_should_stop(struct extent_insert_state *s) +{ + struct btree *b = s->insert->iter->l[0].b; + + /* + * Check if we have sufficient space in both the btree node and the + * journal reservation: + * + * Each insert checks for room in the journal entry, but we check for + * room in the btree node up-front. In the worst case, bkey_cmpxchg() + * will insert two keys, and one iteration of this room will insert one + * key, so we need room for three keys. 
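+ * (If there isn't room we stop early; s->committed records how far we got, and iter->pos is set to it so the caller can retry the rest of the insert.)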
+ */ + if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) + return BTREE_INSERT_BTREE_NODE_FULL; + else if (!journal_res_insert_fits(s->trans, s->insert)) + return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ + else + return BTREE_INSERT_OK; +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bset_tree *t = bset_tree_last(l->b); + struct bkey_packed *where = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); + struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where, + KEY_TYPE_DISCARD); + struct bkey_packed *next_live_key = where; + unsigned clobber_u64s; + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + if (prev) + where = bkey_next(prev); + + while (next_live_key != btree_bkey_last(l->b, t) && + bkey_deleted(next_live_key)) + next_live_key = bkey_next(next_live_key); + + /* + * Everything between where and next_live_key is now deleted keys, and + * is overwritten: + */ + clobber_u64s = (u64 *) next_live_key - (u64 *) where; + + if (prev && + bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) + goto drop_deleted_keys; + + if (next_live_key != btree_bkey_last(l->b, t) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), + next_live_key, false)) + goto drop_deleted_keys; + + bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, + clobber_u64s, where->u64s); + return; +drop_deleted_keys: + bch2_bset_delete(l->b, where, clobber_u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, + where, clobber_u64s, 0); +} + +static void extent_insert_committed(struct extent_insert_state *s) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct bkey_i *insert = !s->deleting + ? s->insert->k + : &s->whiteout; + BKEY_PADDED(k) split; + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); + EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); + + if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) + return; + + if (s->deleting && !s->do_journal) { + bch2_cut_front(s->committed, insert); + goto done; + } + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + bkey_copy(&split.k, insert); + + if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && + bkey_cmp(s->committed, insert->k.p) && + bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { + /* XXX: possibly need to increase our reservation? 
*/ + bch2_cut_subtract_back(s, s->committed, + bkey_i_to_s(&split.k)); + bch2_cut_front(s->committed, insert); + bch2_add_sectors(s, bkey_i_to_s_c(insert), + bkey_start_offset(&insert->k), + insert->k.size); + } else { + bch2_cut_back(s->committed, &split.k.k); + bch2_cut_front(s->committed, insert); + } + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); + + bch2_btree_journal_key(s->trans, iter, &split.k); + + if (!s->deleting) + extent_bset_insert(c, iter, &split.k); +done: + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + + insert->k.needs_whiteout = false; + s->do_journal = false; + s->trans->did_work = true; +} + +static enum btree_insert_ret +__extent_insert_advance_pos(struct extent_insert_state *s, + struct bpos next_pos, + struct bkey_s_c k) +{ + struct extent_insert_hook *hook = s->trans->hook; + enum btree_insert_ret ret; + + if (hook) + ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); + else + ret = BTREE_INSERT_OK; + + if (ret == BTREE_INSERT_OK) + s->committed = next_pos; + + return ret; +} + +/* + * Update iter->pos, marking how much of @insert we've processed, and call hook + * fn: + */ +static enum btree_insert_ret +extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) +{ + struct btree *b = s->insert->iter->l[0].b; + struct bpos next_pos = bpos_min(s->insert->k->k.p, + k.k ? k.k->p : b->key.k.p); + enum btree_insert_ret ret; + + if (race_fault()) + return BTREE_INSERT_NEED_TRAVERSE; + + /* hole? */ + if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { + ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), + bkey_s_c_null); + if (ret != BTREE_INSERT_OK) + return ret; + } + + /* avoid redundant calls to hook fn: */ + if (!bkey_cmp(s->committed, next_pos)) + return BTREE_INSERT_OK; + + return __extent_insert_advance_pos(s, next_pos, k); +} + +static enum btree_insert_ret +extent_insert_check_split_compressed(struct extent_insert_state *s, + struct bkey_s_c k, + enum bch_extent_overlap overlap) +{ + struct bch_fs *c = s->trans->c; + unsigned sectors; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_extent_is_compressed(k))) { + int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; + + if (s->trans->flags & BTREE_INSERT_NOFAIL) + flags |= BCH_DISK_RESERVATION_NOFAIL; + + switch (bch2_disk_reservation_add(c, + s->trans->disk_res, + sectors * bch2_extent_nr_dirty_ptrs(k), + flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + case -EINTR: + return BTREE_INSERT_NEED_GC_LOCK; + default: + BUG(); + } + } + + return BTREE_INSERT_OK; +} + +static enum btree_insert_ret +extent_squash(struct extent_insert_state *s, struct bkey_i *insert, + struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; + enum btree_insert_ret ret; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ + bch2_cut_subtract_front(s, insert->k.p, k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + break; + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ + bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + + /* + * As the auxiliary tree is indexed by the end of the + * key and 
we've just changed the end, update the + * auxiliary tree. + */ + bch2_bset_fix_invalidated_key(b, t, _k); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + _k, _k->u64s, _k->u64s); + break; + + case BCH_EXTENT_OVERLAP_ALL: { + struct bpos orig_pos = k.k->p; + + /* The insert key completely covers k, invalidate k */ + if (!bkey_whiteout(k.k)) + btree_keys_account_key_drop(&b->nr, + t - b->set, _k); + + bch2_drop_subtract(s, k); + k.k->p = bkey_start_pos(&insert->k); + if (!__extent_save(b, node_iter, _k, k.k)) { + /* + * Couldn't repack: we aren't necessarily able + * to repack if the new key is outside the range + * of the old extent, so we have to split + * @insert: + */ + k.k->p = orig_pos; + extent_save(b, node_iter, _k, k.k); + + ret = extent_insert_advance_pos(s, k.s_c); + if (ret != BTREE_INSERT_OK) + return ret; + + extent_insert_committed(s); + /* + * We split and inserted upto at k.k->p - that + * has to coincide with iter->pos, so that we + * don't have anything more we have to insert + * until we recheck our journal reservation: + */ + EBUG_ON(bkey_cmp(s->committed, k.k->p)); + } else { + bch2_bset_fix_invalidated_key(b, t, _k); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + _k, _k->u64s, _k->u64s); + } + + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { + BKEY_PADDED(k) split; + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: + * - start only in k, preserve + * - middle common section, invalidate in k + * - end only in k, preserve + * + * We update the old key to preserve the start, + * insert will be the new common section, + * we manually insert the end that we are preserving. + * + * modify k _before_ doing the insert (which will move + * what k points to) + */ + bkey_reassemble(&split.k, k.s_c); + split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + + bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); + BUG_ON(bkey_deleted(&split.k.k)); + + bch2_cut_subtract_front(s, insert->k.p, k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + + bch2_add_sectors(s, bkey_i_to_s_c(&split.k), + bkey_start_offset(&split.k.k), + split.k.k.size); + extent_bset_insert(c, iter, &split.k); + break; + } + } + + return BTREE_INSERT_OK; +} + +static enum btree_insert_ret +__bch2_delete_fixup_extent(struct extent_insert_state *s) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_i *insert = s->insert->k; + enum btree_insert_ret ret = BTREE_INSERT_OK; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + s->whiteout = *insert; + s->whiteout.k.type = KEY_TYPE_DISCARD; + + while (bkey_cmp(s->committed, insert->k.p) < 0 && + (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && + (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + struct bset_tree *t = bch2_bkey_to_bset(b, _k); + struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + enum bch_extent_overlap overlap; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + if (bkey_whiteout(k.k)) { + s->committed = bpos_min(insert->k.p, k.k->p); + goto next; + } + + overlap = bch2_extent_overlap(&insert->k, k.k); + + ret = extent_insert_check_split_compressed(s, k.s_c, overlap); + if (ret) + break; + + ret = 
extent_insert_advance_pos(s, k.s_c); + if (ret) + break; + + s->do_journal = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL) { + btree_keys_account_key_drop(&b->nr, + t - b->set, _k); + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + _k->type = KEY_TYPE_DISCARD; + reserve_whiteout(b, t, _k); + } else if (k.k->needs_whiteout || + bset_written(b, bset(b, t))) { + struct bkey_i discard = *insert; + + discard.k.type = KEY_TYPE_DISCARD; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + bch2_cut_front(bkey_start_pos(k.k), &discard); + break; + case BCH_EXTENT_OVERLAP_BACK: + bch2_cut_back(k.k->p, &discard.k); + break; + default: + break; + } + + discard.k.needs_whiteout = true; + + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); + + extent_bset_insert(c, iter, &discard); + } else { + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); + } +next: + bch2_cut_front(s->committed, insert); + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + } + + return ret; +} + +static enum btree_insert_ret +__bch2_insert_fixup_extent(struct extent_insert_state *s) +{ + struct btree_iter *iter = s->insert->iter; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_i *insert = s->insert->k; + enum btree_insert_ret ret = BTREE_INSERT_OK; + + while (bkey_cmp(s->committed, insert->k.p) < 0 && + (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && + (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + struct bset_tree *t = bch2_bkey_to_bset(b, _k); + struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + enum bch_extent_overlap overlap; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + overlap = bch2_extent_overlap(&insert->k, k.k); + + ret = extent_insert_check_split_compressed(s, k.s_c, overlap); + if (ret) + break; + + if (!k.k->size) + goto squash; + + /* + * Only call advance pos & call hook for nonzero size extents: + */ + ret = extent_insert_advance_pos(s, k.s_c); + if (ret) + break; + + if (k.k->size && + (k.k->needs_whiteout || bset_written(b, bset(b, t)))) + insert->k.needs_whiteout = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(b, t, _k); + _k->needs_whiteout = false; + } +squash: + ret = extent_squash(s, insert, t, _k, k, overlap); + if (ret != BTREE_INSERT_OK) + break; + } + + return ret; +} + +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. 
full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. + */ +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + enum btree_insert_ret ret = BTREE_INSERT_OK; + + struct extent_insert_state s = { + .trans = trans, + .insert = insert, + .committed = insert->iter->pos, + .deleting = bkey_whiteout(&insert->k->k), + }; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k->k.size); + + /* + * As we process overlapping extents, we advance @iter->pos both to + * signal to our caller (btree_insert_key()) how much of @insert->k has + * been inserted, and also to keep @iter->pos consistent with + * @insert->k and the node iterator that we're advancing: + */ + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + + if (!s.deleting && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), + bkey_start_offset(&insert->k->k), + insert->k->k.size); + + ret = !s.deleting + ? __bch2_insert_fixup_extent(&s) + : __bch2_delete_fixup_extent(&s); + + if (ret == BTREE_INSERT_OK && + bkey_cmp(s.committed, insert->k->k.p) < 0) + ret = extent_insert_advance_pos(&s, bkey_s_c_null); + + extent_insert_committed(&s); + + if (s.deleting) + bch2_cut_front(iter->pos, insert->k); + + /* + * Subtract any remaining sectors from @insert, if we bailed out early + * and didn't fully insert @insert: + */ + if (!s.deleting && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && + insert->k->k.size) + bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), + bkey_start_offset(&insert->k->k), + insert->k->k.size); + + bch2_fs_usage_apply(c, &s.stats, trans->disk_res, + gc_pos_btree_node(b)); + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + EBUG_ON(bkey_cmp(iter->pos, s.committed)); + EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != + !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); + + if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) + ret = BTREE_INSERT_NEED_TRAVERSE; + + WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), + "ret %u insert->k.size %u", ret, insert->k->k.size); + + return ret; +} + +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; + + if (!k.k->size) + return "zero key size"; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + unsigned size_ondisk = e.k->size; + const char *reason; + unsigned nonce = UINT_MAX; + + extent_for_each_entry(e, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; + + if 
(extent_entry_is_crc(entry)) { + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + + if (crc.offset + e.k->size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; + + size_ondisk = crc.compressed_size; + + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; + + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + } else { + ptr = entry_to_ptr(entry); + + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + } + } + + return NULL; + } + + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; + } + + default: + return "invalid value type"; + } +} + +static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, + struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + struct bch_dev *ca; + struct bucket_mark mark; + unsigned seq, stale; + char buf[160]; + bool bad; + unsigned replicas = 0; + + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ + + extent_for_each_ptr(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + replicas++; + + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + continue; + + stale = 0; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + mark = ptr_bucket_mark(ca, ptr); + + /* between mark and bucket gen */ + smp_rmb(); + + stale = ptr_stale(ca, ptr); + + bch2_fs_bug_on(stale && !ptr->cached, c, + "stale dirty pointer"); + + bch2_fs_bug_on(stale > 96, c, + "key too stale: %i", + stale); + + if (stale) + break; + + bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + (mark.data_type != BCH_DATA_USER || + !(ptr->cached + ? 
mark.cached_sectors + : mark.dirty_sectors)); + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + if (bad) + goto bad_ptr; + } + + if (replicas > BCH_REPLICAS_MAX) { + bch2_bkey_val_to_text(c, btree_node_type(b), buf, + sizeof(buf), e.s_c); + bch2_fs_bug(c, + "extent key bad (too many replicas: %u): %s", + replicas, buf); + return; + } + + if (!bkey_extent_is_cached(e.k) && + !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { + bch2_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), e.s_c); + bch2_fs_bug(c, + "extent key bad (replicas not marked in superblock):\n%s", + buf); + return; + } + + return; + +bad_ptr: + bch2_bkey_val_to_text(c, btree_node_type(b), buf, + sizeof(buf), e.s_c); + bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " + "gen %i type %u", buf, + PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); + return; +} + +void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); + break; + case BCH_RESERVATION: + break; + default: + BUG(); + } +} + +void bch2_extent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch2_extent_invalid(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + +static void bch2_extent_crc_init(union bch_extent_crc *crc, + struct bch_extent_crc_unpacked new) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + ._compressed_size = _crc.compressed_size - 1, \ + ._uncompressed_size = _crc.uncompressed_size - 1, \ + .offset = _crc.offset + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) { + crc->crc32 = (struct bch_extent_crc32) { + .type = 1 << BCH_EXTENT_ENTRY_crc32, + common_fields(new), + .csum = *((__le32 *) &new.csum.lo), + }; + return; + } + + if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) { + crc->crc64 = (struct bch_extent_crc64) { + .type = 1 << BCH_EXTENT_ENTRY_crc64, + common_fields(new), + .nonce = new.nonce, + .csum_lo = new.csum.lo, + .csum_hi = *((__le16 *) &new.csum.hi), + }; + return; + } + + if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) { + crc->crc128 = (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + common_fields(new), + .nonce = new.nonce, + .csum = new.csum, + }; + return; + } +#undef common_fields + BUG(); +} + +void bch2_extent_crc_append(struct bkey_i_extent *e, + struct bch_extent_crc_unpacked new) +{ + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + BUG_ON(new.compressed_size > new.uncompressed_size); + BUG_ON(new.live_size != e->k.size); + BUG_ON(!new.compressed_size || !new.uncompressed_size); + + /* + * Look up the last crc entry, so we can check if we need to add + * another: + */ + extent_for_each_crc(extent_i_to_s(e), crc, i) + ; + + if (!bch2_crc_unpacked_cmp(crc, new)) + return; + + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); + __extent_entry_push(e); +} + +/* + * bch_extent_normalize - clean up an extent, 
dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bkey_s_extent e; + + switch (k.k->type) { + case KEY_TYPE_ERROR: + return false; + + case KEY_TYPE_DELETED: + return true; + case KEY_TYPE_DISCARD: + return bversion_zero(k.k->version); + case KEY_TYPE_COOKIE: + return false; + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_to_extent(k); + + bch2_extent_drop_stale(c, e); + + if (!bkey_val_u64s(e.k)) { + if (bkey_extent_is_cached(e.k)) { + k.k->type = KEY_TYPE_DISCARD; + if (bversion_zero(k.k->version)) + return true; + } else { + k.k->type = KEY_TYPE_ERROR; + } + } + + return false; + case BCH_RESERVATION: + return false; + default: + BUG(); + } +} + +void bch2_extent_mark_replicas_cached(struct bch_fs *c, + struct bkey_s_extent e, + unsigned target, + unsigned nr_desired_replicas) +{ + struct bch_extent_ptr *ptr; + int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; + + if (target && extra > 0) + extent_for_each_ptr(e, ptr) { + int n = bch2_extent_ptr_durability(c, ptr); + + if (n && n <= extra && + !bch2_dev_in_target(c, ptr->dev, target)) { + ptr->cached = true; + extra -= n; + } + } + + if (extra > 0) + extent_for_each_ptr(e, ptr) { + int n = bch2_extent_ptr_durability(c, ptr); + + if (n && n <= extra) { + ptr->cached = true; + extra -= n; + } + } +} + +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) +{ + int ret; + + switch (k.k->type) { + case KEY_TYPE_ERROR: + return -EIO; + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), + avoid, pick); + + if (!ret && !bkey_extent_is_cached(k.k)) + ret = -EIO; + + return ret; + + default: + return 0; + } +} + +enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, + struct bkey_i *l, struct bkey_i *r) +{ + struct bkey_s_extent el, er; + union bch_extent_entry *en_l, *en_r; + + if (key_merging_disabled(c)) + return BCH_MERGE_NOMERGE; + + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + + if (l->k.u64s != r->k.u64s || + l->k.type != r->k.type || + bversion_cmp(l->k.version, r->k.version) || + bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + return BCH_MERGE_NOMERGE; + + switch (l->k.type) { + case KEY_TYPE_DISCARD: + case KEY_TYPE_ERROR: + /* These types are mergeable, and no val to check */ + break; + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + el = bkey_i_to_s_extent(l); + er = bkey_i_to_s_extent(r); + + extent_for_each_entry(el, en_l) { + struct bch_extent_ptr *lp, *rp; + struct bch_dev *ca; + + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + + if ((extent_entry_type(en_l) != + extent_entry_type(en_r)) || + extent_entry_is_crc(en_l)) + return BCH_MERGE_NOMERGE; + + lp = &en_l->ptr; + rp = &en_r->ptr; + + if (lp->offset + el.k->size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if 
(PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + } + + break; + case BCH_RESERVATION: { + struct bkey_i_reservation *li = bkey_i_to_reservation(l); + struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + + if (li->v.generation != ri->v.generation || + li->v.nr_replicas != ri->v.nr_replicas) + return BCH_MERGE_NOMERGE; + break; + } + default: + return BCH_MERGE_NOMERGE; + } + + l->k.needs_whiteout |= r->k.needs_whiteout; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { + bch2_key_resize(&l->k, KEY_SIZE_MAX); + bch2_cut_front(l->k.p, r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(&l->k, l->k.size + r->k.size); + + return BCH_MERGE_MERGE; +} + +static void extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + + BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); + + /* + * We don't want the bch2_verify_key_order() call in extent_save(), + * because we may be out of order with deleted keys that are about to be + * removed by extent_bset_insert() + */ + + if ((dst_unpacked = packed_to_bkey(dst))) + bkey_copy(dst_unpacked, src); + else + BUG_ON(!bch2_bkey_pack(dst, src, f)); +} + +static bool extent_merge_one_overlapping(struct btree_iter *iter, + struct bpos new_pos, + struct bset_tree *t, + struct bkey_packed *k, struct bkey uk, + bool check, bool could_pack) +{ + struct btree_iter_level *l = &iter->l[0]; + + BUG_ON(!bkey_deleted(k)); + + if (check) { + return !bkey_packed(k) || could_pack; + } else { + uk.p = new_pos; + extent_save(l->b, &l->iter, k, &uk); + bch2_bset_fix_invalidated_key(l->b, t, k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, + k, k->u64s, k->u64s); + return true; + } +} + +static bool extent_merge_do_overlapping(struct btree_iter *iter, + struct bkey *m, bool back_merge) +{ + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; + struct bset_tree *t; + struct bkey_packed *k; + struct bkey uk; + struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); + bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); + bool check = true; + + /* + * @m is the new merged extent: + * + * The merge took place in the last bset; we know there can't be any 0 + * size extents overlapping with m there because if so they would have + * been between the two extents we merged. + * + * But in the other bsets, we have to check for and fix such extents: + */ +do_fixup: + for_each_bset(b, t) { + if (t == bset_tree_last(b)) + break; + + /* + * if we don't find this bset in the iterator we already got to + * the end of that bset, so start searching from the end. 
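+ * (That case is handled just below: when bch2_btree_node_iter_bset_pos() returns btree_bkey_last(), step back with bch2_bkey_prev_all().)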
+ */ + k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + if (k == btree_bkey_last(b, t)) + k = bch2_bkey_prev_all(b, t, k); + if (!k) + continue; + + if (back_merge) { + /* + * Back merge: 0 size extents will be before the key + * that was just inserted (and thus the iterator + * position) - walk backwards to find them + */ + for (; + k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(m)) > 0); + k = bch2_bkey_prev_all(b, t, k)) { + if (bkey_cmp(uk.p, m->p) >= 0) + continue; + + if (!extent_merge_one_overlapping(iter, new_pos, + t, k, uk, check, could_pack)) + return false; + } + } else { + /* Front merge - walk forwards */ + for (; + k != btree_bkey_last(b, t) && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, m->p) < 0); + k = bkey_next(k)) { + if (bkey_cmp(uk.p, + bkey_start_pos(m)) <= 0) + continue; + + if (!extent_merge_one_overlapping(iter, new_pos, + t, k, uk, check, could_pack)) + return false; + } + } + } + + if (check) { + check = false; + goto do_fixup; + } + + return true; +} + +/* + * When merging an extent that we're inserting into a btree node, the new merged + * extent could overlap with an existing 0 size extent - if we don't fix that, + * it'll break the btree node iterator so this code finds those 0 size extents + * and shifts them out of the way. + * + * Also unpacks and repacks. + */ +static bool bch2_extent_merge_inline(struct bch_fs *c, + struct btree_iter *iter, + struct bkey_packed *l, + struct bkey_packed *r, + bool back_merge) +{ + struct btree *b = iter->l[0].b; + struct btree_node_iter *node_iter = &iter->l[0].iter; + const struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed *m; + BKEY_PADDED(k) li; + BKEY_PADDED(k) ri; + struct bkey_i *mi; + struct bkey tmp; + + /* + * We need to save copies of both l and r, because we might get a + * partial merge (which modifies both) and then fails to repack + */ + bch2_bkey_unpack(b, &li.k, l); + bch2_bkey_unpack(b, &ri.k, r); + + m = back_merge ? l : r; + mi = back_merge ? 
&li.k : &ri.k; + + /* l & r should be in last bset: */ + EBUG_ON(bch2_bkey_to_bset(b, m) != t); + + switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { + case BCH_MERGE_NOMERGE: + return false; + case BCH_MERGE_PARTIAL: + if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + return false; + + if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + return false; + + extent_i_save(b, m, mi); + bch2_bset_fix_invalidated_key(b, t, m); + + /* + * Update iterator to reflect what we just inserted - otherwise, + * the iter_fix() call is going to put us _before_ the key we + * just partially merged with: + */ + if (back_merge) + bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); + + bch2_btree_node_iter_fix(iter, b, node_iter, + t, m, m->u64s, m->u64s); + + if (!back_merge) + bkey_copy(packed_to_bkey(l), &li.k); + else + bkey_copy(packed_to_bkey(r), &ri.k); + return false; + case BCH_MERGE_MERGE: + if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) + return false; + + if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + return false; + + extent_i_save(b, m, &li.k); + bch2_bset_fix_invalidated_key(b, t, m); + + bch2_btree_node_iter_fix(iter, b, node_iter, + t, m, m->u64s, m->u64s); + return true; + default: + BUG(); + } +} + +int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) +{ + struct btree_iter iter; + struct bpos end = pos; + struct bkey_s_c k; + int ret = 0; + + end.offset += size; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (!bch2_extent_is_fully_allocated(k)) { + ret = -ENOSPC; + break; + } + } + bch2_btree_iter_unlock(&iter); + + return ret; +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 index 000000000000..15aed3c0665b --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,539 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H + +#include "bcachefs.h" +#include "bkey.h" +#include "extents_types.h" + +struct bch_fs; +struct journal_res; +struct btree_node_iter; +struct btree_node_iter_large; +struct btree_insert; +struct btree_insert_entry; +struct extent_insert_hook; +struct bch_devs_mask; +union bch_extent_crc; + +const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, + struct bkey_s_c); +void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +#define bch2_bkey_btree_ops (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + +const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); +enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, + struct bkey_i *, struct bkey_i *); + +#define bch2_bkey_extent_ops (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ + .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_ptr_normalize, \ + .key_merge = bch2_extent_merge, \ + .is_extents = 
true, \ +} + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, + struct btree *, + struct btree_node_iter_large *); +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *, + struct btree *, + struct btree_node_iter_large *); + +int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *); + +int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_devs_mask *, + struct extent_pick_ptr *); + +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *, + struct btree_insert_entry *); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, + unsigned, unsigned); + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +bool bch2_extent_drop_device(struct bkey_s_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); + +unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); +unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_extent_is_compressed(struct bkey_s_c); + +unsigned bch2_extent_ptr_durability(struct bch_fs *, + const struct bch_extent_ptr *); +unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); + +bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, + struct bch_extent_ptr, u64); + +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_allocation(const struct bkey *k) +{ + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + case BCH_RESERVATION: + return true; + default: + return false; + } +} + +static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) +{ + return bkey_extent_is_allocation(k.k) && + !bch2_extent_is_compressed(k); +} + +static inline bool bkey_extent_is_cached(const struct bkey *k) +{ + return k->type == BCH_EXTENT_CACHED; +} + +static inline void bkey_extent_set_cached(struct bkey *k, bool cached) +{ + EBUG_ON(k->type != BCH_EXTENT && + k->type != BCH_EXTENT_CACHED); + + k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT; +} + +static inline unsigned +__extent_entry_type(const union bch_extent_entry *e) +{ + return e->type ? 
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; +} + +static inline enum bch_extent_entry_type +extent_entry_type(const union bch_extent_entry *e) +{ + int ret = __ffs(e->type); + + EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); + + return ret; +} + +static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) +{ + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + return sizeof(struct bch_extent_crc32); + case BCH_EXTENT_ENTRY_crc64: + return sizeof(struct bch_extent_crc64); + case BCH_EXTENT_ENTRY_crc128: + return sizeof(struct bch_extent_crc128); + case BCH_EXTENT_ENTRY_ptr: + return sizeof(struct bch_extent_ptr); + default: + BUG(); + } +} + +static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) +{ + return extent_entry_bytes(entry) / sizeof(u64); +} + +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +{ + return !extent_entry_is_ptr(e); +} + +union bch_extent_crc { + u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; +}; + +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *)), \ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) + +#define __entry_to_crc(_entry) \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const union bch_extent_crc *) (_entry), \ + (union bch_extent_crc *) (_entry)) + +#define entry_to_crc(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ + \ + __entry_to_crc(_entry); \ +}) + +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) + +/* checksum entries: */ + +enum bch_extent_crc_type { + BCH_EXTENT_CRC_NONE, + BCH_EXTENT_CRC32, + BCH_EXTENT_CRC64, + BCH_EXTENT_CRC128, +}; + +static inline enum bch_extent_crc_type +__extent_crc_type(const union bch_extent_crc *crc) +{ + if (!crc) + return BCH_EXTENT_CRC_NONE; + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: + return BCH_EXTENT_CRC32; + case BCH_EXTENT_ENTRY_crc64: + return BCH_EXTENT_CRC64; + case BCH_EXTENT_ENTRY_crc128: + return BCH_EXTENT_CRC128; + default: + BUG(); + } +} + +#define extent_crc_type(_crc) \ +({ \ + BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ + !type_is(_crc, struct bch_extent_crc64 *) && \ + !type_is(_crc, struct bch_extent_crc128 *) && \ + !type_is(_crc, union bch_extent_crc *)); \ + \ + type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ + : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ + : type_is(_crc, struct bch_extent_crc128 *) ? 
BCH_EXTENT_CRC128 \ + : __extent_crc_type((union bch_extent_crc *) _crc); \ +}) + +static inline struct bch_extent_crc_unpacked +bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + .compressed_size = _crc._compressed_size + 1, \ + .uncompressed_size = _crc._uncompressed_size + 1, \ + .offset = _crc.offset, \ + .live_size = k->size + + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_extent_crc_unpacked) { + .compressed_size = k->size, + .uncompressed_size = k->size, + .live_size = k->size, + }; + case BCH_EXTENT_CRC32: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc32), + }; + + *((__le32 *) &ret.csum.lo) = crc->crc32.csum; + + memcpy(&ret.csum.lo, &crc->crc32.csum, + sizeof(crc->crc32.csum)); + + return ret; + } + case BCH_EXTENT_CRC64: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc64), + .nonce = crc->crc64.nonce, + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + + *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; + + return ret; + } + case BCH_EXTENT_CRC128: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc128), + .nonce = crc->crc128.nonce, + .csum = crc->crc128.csum, + }; + + return ret; + } + default: + BUG(); + } +#undef common_fields +} + +/* Extent entry iteration: */ + +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + +#define extent_entry_last(_e) \ + vstruct_idx((_e).v, bkey_val_u64s((_e).k)) + +/* Iterate over all entries: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + for ((_entry) = _start; \ + (_entry) < extent_entry_last(_e); \ + (_entry) = extent_entry_next(_entry)) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +/* Iterate over crcs only: */ + +#define __extent_crc_next(_e, _p) \ +({ \ + typeof(&(_e).v->start[0]) _entry = _p; \ + \ + while ((_entry) < extent_entry_last(_e) && \ + !extent_entry_is_crc(_entry)) \ + (_entry) = extent_entry_next(_entry); \ + \ + entry_to_crc(_entry < extent_entry_last(_e) ? 
_entry : NULL); \ +}) + +#define __extent_for_each_crc(_e, _crc) \ + for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ + (_crc); \ + (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + +#define extent_crc_next(_e, _crc, _iter) \ +({ \ + extent_for_each_entry_from(_e, _iter, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ + break; \ + } \ + \ + (_iter) < extent_entry_last(_e); \ +}) + +#define extent_for_each_crc(_e, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_iter) = (_e).v->start; \ + extent_crc_next(_e, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +/* Iterate over pointers, with crcs: */ + +#define extent_ptr_crc_next(_e, _ptr, _crc) \ +({ \ + __label__ out; \ + typeof(&(_e).v->start[0]) _entry; \ + \ + extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ + if (extent_entry_is_crc(_entry)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ + } else { \ + _ptr = entry_to_ptr(_entry); \ + goto out; \ + } \ + \ + _ptr = NULL; \ +out: \ + _ptr; \ +}) + +#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ + (_ptr)++) + +/* Iterate over pointers only, and from a given position: */ + +#define extent_ptr_next(_e, _ptr) \ +({ \ + struct bch_extent_crc_unpacked _crc; \ + \ + extent_ptr_crc_next(_e, _ptr, _crc); \ +}) + +#define extent_for_each_ptr(_e, _ptr) \ + for ((_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_next(_e, _ptr)); \ + (_ptr)++) + +#define extent_ptr_prev(_e, _ptr) \ +({ \ + typeof(&(_e).v->start->ptr) _p; \ + typeof(&(_e).v->start->ptr) _prev = NULL; \ + \ + extent_for_each_ptr(_e, _p) { \ + if (_p == (_ptr)) \ + break; \ + _prev = _p; \ + } \ + \ + _prev; \ +}) + +/* + * Use this when you'll be dropping pointers as you iterate. 
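+ *
+ * Illustrative usage (an assumption for clarity, not taken from the
+ * original patch) - dropping every pointer to a given device while
+ * walking the extent backwards:
+ *
+ *	extent_for_each_ptr_backwards(e, ptr)
+ *		if (ptr->dev == dev)
+ *			__bch2_extent_drop_ptr(e, ptr);
+ *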
Quadratic, + * unfortunately: + */ +#define extent_for_each_ptr_backwards(_e, _ptr) \ + for ((_ptr) = extent_ptr_prev(_e, NULL); \ + (_ptr); \ + (_ptr) = extent_ptr_prev(_e, _ptr)) + +void bch2_extent_crc_append(struct bkey_i_extent *, + struct bch_extent_crc_unpacked); + +static inline void __extent_entry_push(struct bkey_i_extent *e) +{ + union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); + + EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > + BKEY_EXTENT_VAL_U64s_MAX); + + e->k.u64s += extent_entry_u64s(entry); +} + +static inline void extent_ptr_append(struct bkey_i_extent *e, + struct bch_extent_ptr ptr) +{ + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + extent_entry_last(extent_i_to_s(e))->ptr = ptr; + __extent_entry_push(e); +} + +static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return bch2_extent_devs(bkey_s_c_to_extent(k)); + default: + return (struct bch_devs_list) { .nr = 0 }; + } +} + +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return bch2_extent_dirty_devs(bkey_s_c_to_extent(k)); + default: + return (struct bch_devs_list) { .nr = 0 }; + } +} + +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return bch2_extent_cached_devs(bkey_s_c_to_extent(k)); + default: + return (struct bch_devs_list) { .nr = 0 }; + } +} + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, + struct bch_extent_crc_unpacked); +bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); + +void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); + +bool bch2_cut_front(struct bpos, struct bkey_i *); +bool bch2_cut_back(struct bpos, struct bkey *); +void bch2_key_resize(struct bkey *, unsigned); + +int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64); + +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 index 000000000000..27b2bde85e5c --- /dev/null +++ b/fs/bcachefs/extents_types.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_TYPES_H +#define _BCACHEFS_EXTENTS_TYPES_H + +#include "bcachefs_format.h" + +struct bch_extent_crc_unpacked { + u8 csum_type; + u8 compression_type; + + u16 compressed_size; + u16 uncompressed_size; + + u16 offset; + u16 live_size; + + u16 
nonce; + + struct bch_csum csum; +}; + +struct extent_pick_ptr { + struct bch_extent_ptr ptr; + struct bch_extent_crc_unpacked crc; +}; + +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 index 000000000000..7cb4942cacf7 --- /dev/null +++ b/fs/bcachefs/eytzinger.h @@ -0,0 +1,283 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H + +#include +#include + +#include "util.h" + +/* + * Traversal for trees in eytzinger layout - a full binary tree layed out in an + * array + */ + +/* + * One based indexing version: + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: + * + * Size parameter is treated as if we were using 0 based indexing, however: + * valid nodes, and inorder indices, are in the range [1..size) - that is, there + * are actually size - 1 elements + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + child; +} + +static inline unsigned eytzinger1_left_child(unsigned i) +{ + return eytzinger1_child(i, 0); +} + +static inline unsigned eytzinger1_right_child(unsigned i) +{ + return eytzinger1_child(i, 1); +} + +static inline unsigned eytzinger1_first(unsigned size) +{ + return rounddown_pow_of_two(size - 1); +} + +static inline unsigned eytzinger1_last(unsigned size) +{ + return rounddown_pow_of_two(size) - 1; +} + +/* + * eytzinger1_next() and eytzinger1_prev() have the nice properties that + * + * eytzinger1_next(0) == eytzinger1_first()) + * eytzinger1_prev(0) == eytzinger1_last()) + * + * eytzinger1_prev(eytzinger1_first()) == 0 + * eytzinger1_next(eytzinger1_last()) == 0 + */ + +static inline unsigned eytzinger1_next(unsigned i, unsigned size) +{ + EBUG_ON(i >= size); + + if (eytzinger1_right_child(i) < size) { + i = eytzinger1_right_child(i); + + i <<= __fls(size) - __fls(i); + i >>= i >= size; + } else { + i >>= ffz(i) + 1; + } + + return i; +} + +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) +{ + EBUG_ON(i >= size); + + if (eytzinger1_left_child(i) < size) { + i = eytzinger1_left_child(i) + 1; + + i <<= __fls(size) - __fls(i); + i -= 1; + i >>= i >= size; + } else { + i >>= __ffs(i) + 1; + } + + return i; +} + +static inline unsigned eytzinger1_extra(unsigned size) +{ + return (size - rounddown_pow_of_two(size - 1)) << 1; +} + +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + unsigned b = __fls(i); + unsigned shift = __fls(size - 1) - b; + int s; + + EBUG_ON(!i || i >= size); + + i ^= 1U << b; + i <<= 1; + i |= 1; + i <<= shift; + + /* + * sign bit trick: + * + * if (i > extra) + * i -= (i - extra) >> 1; + */ + s = extra - i; + i += (s >> 1) & (s >> 31); + + return i; +} + +static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + unsigned extra) +{ + unsigned shift; + int s; + + EBUG_ON(!i || i >= size); + + /* + * sign bit trick: + * + * if (i > extra) + * i += i - extra; + */ + s = extra - i; + i -= s & (s >> 31); + + shift = __ffs(i); + + i >>= shift + 1; + i |= 1U << (__fls(size - 1) - shift); + + return i; +} + +static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); +} + +static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); +} + +#define eytzinger1_for_each(_i, _size) \ + for 
((_i) = eytzinger1_first((_size)); \ + (_i) != 0; \ + (_i) = eytzinger1_next((_i), (_size))) + +/* Zero based indexing version: */ + +static inline unsigned eytzinger0_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + 1 + child; +} + +static inline unsigned eytzinger0_left_child(unsigned i) +{ + return eytzinger0_child(i, 0); +} + +static inline unsigned eytzinger0_right_child(unsigned i) +{ + return eytzinger0_child(i, 1); +} + +static inline unsigned eytzinger0_first(unsigned size) +{ + return eytzinger1_first(size + 1) - 1; +} + +static inline unsigned eytzinger0_last(unsigned size) +{ + return eytzinger1_last(size + 1) - 1; +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ + return eytzinger1_next(i + 1, size + 1) - 1; +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ + return eytzinger1_prev(i + 1, size + 1) - 1; +} + +static inline unsigned eytzinger0_extra(unsigned size) +{ + return eytzinger1_extra(size + 1); +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ + return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); +} + +static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); +} + +#define eytzinger0_for_each(_i, _size) \ + for ((_i) = eytzinger0_first((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + +typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); + +/* return greatest node <= @search, or -1 if not found */ +static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void *search) +{ + unsigned i, n = 0; + + if (!nr) + return -1; + + do { + i = n; + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + } while (n < nr); + + if (n & 1) { + /* @i was greater than @search, return previous node: */ + + if (i == eytzinger0_first(nr)) + return -1; + + return eytzinger0_prev(i, nr); + } else { + return i; + } +} + +static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void *search) +{ + size_t i = 0; + int res; + + while (i < nr && + (res = cmp(search, base + i * size, size))) + i = eytzinger0_child(i, res > 0); + + return i; +} + +void eytzinger0_sort(void *, size_t, size_t, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 index 000000000000..bd1534ecadb6 --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FIFO_H +#define _BCACHEFS_FIFO_H + +#include "util.h" + +#define FIFO(type) \ +struct { \ + size_t front, back, size, mask; \ + type *data; \ +} + +#define DECLARE_FIFO(type, name) FIFO(type) name + +#define fifo_buf_size(fifo) \ + (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])) + +#define init_fifo(fifo, _size, _gfp) \ +({ \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->size = (_size); \ + (fifo)->mask = (fifo)->size \ + ? 
roundup_pow_of_two((fifo)->size) - 1 \ + : 0; \ + (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ +}) + +#define free_fifo(fifo) \ +do { \ + kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) +#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) + +#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) +#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_entry_idx_abs(fifo, p) \ + ((((p) >= &fifo_peek_front(fifo) \ + ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ + (((p) - (fifo)->data))) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) +#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] + +#define fifo_push_back_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) + +#define fifo_push_front_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) + +#define fifo_push_back(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_push_front(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \ + _r; \ +}) + +#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) +#define fifo_peek(fifo) fifo_peek_front(fifo) + +#define fifo_for_each_entry(_entry, _fifo, _iter) \ + for (((void) (&(_iter) == &(_fifo)->front)), \ + _iter = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + _iter++) + +#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ + for (((void) (&(_iter) == &(_fifo)->front)), \ + _iter = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + _iter++) + +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 index 000000000000..56d21175058c --- /dev/null +++ b/fs/bcachefs/fs-io.c @@ -0,0 +1,2862 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "io.h" +#include "keylist.h" +#include "quota.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct quota_res { + u64 sectors; +}; + +struct 
i_sectors_hook { + struct extent_insert_hook hook; + struct bch_inode_info *inode; + struct quota_res quota_res; + s64 sectors; + u64 new_i_size; + unsigned flags; + unsigned appending:1; +}; + +struct bchfs_write_op { + struct bch_inode_info *inode; + s64 sectors_added; + bool is_dio; + bool unalloc; + u64 new_i_size; + + /* must be last: */ + struct bch_write_op op; +}; + +struct bch_writepage_io { + struct closure cl; + u64 new_sectors; + + /* must be last: */ + struct bchfs_write_op op; +}; + +struct dio_write { + struct closure cl; + struct kiocb *req; + struct task_struct *task; + unsigned loop:1, + sync:1, + free_iov:1; + struct quota_res quota_res; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bchfs_write_op iop; +}; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bch_read_bio rbio; +}; + +/* pagecache_block must be held */ +static int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +/* quotas */ + +#ifdef CONFIG_BCACHEFS_QUOTA + +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (!res->sectors) + return; + + mutex_lock(&inode->ei_quota_lock); + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), BCH_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + mutex_unlock(&inode->ei_quota_lock); + + res->sectors = 0; +} + +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + int ret; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + +#else + +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ +} + +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} + +#endif + +/* i_size updates: */ + +static int inode_set_size(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + loff_t *new_i_size = p; + + lockdep_assert_held(&inode->ei_update_lock); + + bi->bi_size = *new_i_size; + return 0; +} + +static int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size) +{ + return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0); +} + +static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, int sectors) +{ + mutex_lock(&inode->ei_quota_lock); +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); + + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN); + } +#endif + inode->v.i_blocks += sectors; + mutex_unlock(&inode->ei_quota_lock); +} + +/* i_sectors accounting: */ + +static enum btree_insert_ret +i_sectors_hook_fn(struct extent_insert_hook *hook, + struct bpos committed_pos, + struct bpos next_pos, + struct bkey_s_c k, + const struct bkey_i *insert) +{ + struct i_sectors_hook *h = container_of(hook, + struct i_sectors_hook, hook); + s64 sectors = next_pos.offset - committed_pos.offset; + int sign = bkey_extent_is_allocation(&insert->k) - + (k.k && bkey_extent_is_allocation(k.k)); + + EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY)); + + h->sectors += sectors * sign; + + return BTREE_INSERT_OK; +} + +static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct i_sectors_hook *h = p; + + if (h->new_i_size != U64_MAX && + (!h->appending || + h->new_i_size > bi->bi_size)) + bi->bi_size = h->new_i_size; + bi->bi_sectors += h->sectors; + bi->bi_flags &= ~h->flags; + return 0; +} + +static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) +{ + int ret; + + mutex_lock(&h->inode->ei_update_lock); + i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); + + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); + + if (!ret && h->new_i_size != U64_MAX) + i_size_write(&h->inode->v, h->new_i_size); + mutex_unlock(&h->inode->ei_update_lock); + + bch2_quota_reservation_put(c, h->inode, &h->quota_res); + + h->sectors = 0; + + return ret; +} + +static int i_sectors_dirty_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct i_sectors_hook *h = p; + + if (h->flags & BCH_INODE_I_SIZE_DIRTY) + bi->bi_size = h->new_i_size; + + bi->bi_flags |= h->flags; + return 0; +} + +static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) +{ + int ret; + + mutex_lock(&h->inode->ei_update_lock); + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); + mutex_unlock(&h->inode->ei_update_lock); + + return ret; +} + +static inline struct i_sectors_hook +i_sectors_hook_init(struct 
bch_inode_info *inode, unsigned flags) +{ + return (struct i_sectors_hook) { + .hook.fn = i_sectors_hook_fn, + .inode = inode, + .sectors = 0, + .new_i_size = U64_MAX, + .flags = flags|BCH_INODE_I_SECTORS_DIRTY, + }; +} + +/* normal i_size/i_sectors update machinery: */ + +struct bchfs_extent_trans_hook { + struct bchfs_write_op *op; + struct extent_insert_hook hook; + + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + + bool need_inode_update; +}; + +static enum btree_insert_ret +bchfs_extent_update_hook(struct extent_insert_hook *hook, + struct bpos committed_pos, + struct bpos next_pos, + struct bkey_s_c k, + const struct bkey_i *insert) +{ + struct bchfs_extent_trans_hook *h = container_of(hook, + struct bchfs_extent_trans_hook, hook); + struct bch_inode_info *inode = h->op->inode; + int sign = bkey_extent_is_allocation(&insert->k) - + (k.k && bkey_extent_is_allocation(k.k)); + s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; + u64 offset = min(next_pos.offset << 9, h->op->new_i_size); + bool do_pack = false; + + if (h->op->unalloc && + !bch2_extent_is_fully_allocated(k)) + return BTREE_INSERT_ENOSPC; + + BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); + + /* XXX: inode->i_size locking */ + if (offset > inode->ei_inode.bi_size) { + if (!h->need_inode_update) { + h->need_inode_update = true; + return BTREE_INSERT_NEED_TRAVERSE; + } + + /* truncate in progress? */ + if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) + goto no_i_size_update; + + h->inode_u.bi_size = offset; + do_pack = true; + + inode->ei_inode.bi_size = offset; + + spin_lock(&inode->v.i_lock); + if (offset > inode->v.i_size) { + if (h->op->is_dio) + i_size_write(&inode->v, offset); + else + BUG(); + } + spin_unlock(&inode->v.i_lock); + } +no_i_size_update: + if (sectors) { + if (!h->need_inode_update) { + h->need_inode_update = true; + return BTREE_INSERT_NEED_TRAVERSE; + } + + h->inode_u.bi_sectors += sectors; + do_pack = true; + + h->op->sectors_added += sectors; + } + + if (do_pack) + bch2_inode_pack(&h->inode_p, &h->inode_u); + + return BTREE_INSERT_OK; +} + +static int bchfs_write_index_update(struct bch_write_op *wop) +{ + struct bchfs_write_op *op = container_of(wop, + struct bchfs_write_op, op); + struct keylist *keys = &op->op.insert_keys; + struct btree_iter extent_iter, inode_iter; + struct bchfs_extent_trans_hook hook; + struct bkey_i *k = bch2_keylist_front(keys); + s64 orig_sectors_added = op->sectors_added; + int ret; + + BUG_ON(k->k.p.inode != op->inode->v.i_ino); + + bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, + POS(extent_iter.pos.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + hook.op = op; + hook.hook.fn = bchfs_extent_update_hook; + hook.need_inode_update = false; + + do { + /* XXX: inode->i_size locking */ + k = bch2_keylist_front(keys); + if (min(k->k.p.offset << 9, op->new_i_size) > + op->inode->ei_inode.bi_size) + hook.need_inode_update = true; + + /* optimization for fewer transaction restarts: */ + ret = bch2_btree_iter_traverse(&extent_iter); + if (ret) + goto err; + + if (hook.need_inode_update) { + struct bkey_s_c inode; + + if (!btree_iter_linked(&inode_iter)) + bch2_btree_iter_link(&extent_iter, &inode_iter); + + inode = bch2_btree_iter_peek_slot(&inode_iter); + if ((ret = btree_iter_err(inode))) + goto err; + + if (WARN_ONCE(inode.k->type != BCH_INODE_FS, + "inode %llu not found when 
updating", + extent_iter.pos.inode)) { + ret = -ENOENT; + break; + } + + if (WARN_ONCE(bkey_bytes(inode.k) > + sizeof(hook.inode_p), + "inode %llu too big (%zu bytes, buf %zu)", + extent_iter.pos.inode, + bkey_bytes(inode.k), + sizeof(hook.inode_p))) { + ret = -ENOENT; + break; + } + + bkey_reassemble(&hook.inode_p.inode.k_i, inode); + ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), + &hook.inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", + ret, extent_iter.pos.inode)) { + ret = -ENOENT; + break; + } + + ret = bch2_btree_insert_at(wop->c, &wop->res, + &hook.hook, op_journal_seq(wop), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&extent_iter, k), + BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, + &hook.inode_p.inode.k_i, 2)); + } else { + ret = bch2_btree_insert_at(wop->c, &wop->res, + &hook.hook, op_journal_seq(wop), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&extent_iter, k)); + } + + BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + + if (WARN_ONCE(!ret != !k->k.size, + "ret %i k->size %u", ret, k->k.size)) + ret = k->k.size ? -EINTR : 0; +err: + if (ret == -EINTR) + continue; + if (ret) + break; + + BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_btree_iter_unlock(&extent_iter); + bch2_btree_iter_unlock(&inode_iter); + + if (op->is_dio) { + struct dio_write *dio = container_of(op, struct dio_write, iop); + + i_sectors_acct(wop->c, op->inode, &dio->quota_res, + op->sectors_added - orig_sectors_added); + } + + return ret; +} + +static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, + struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_io_opts opts, + bool is_dio) +{ + op->inode = inode; + op->sectors_added = 0; + op->is_dio = is_dio; + op->unalloc = false; + op->new_i_size = U64_MAX; + + bch2_write_op_init(&op->op, c, opts); + op->op.target = opts.foreground_target; + op->op.index_update_fn = bchfs_write_index_update; + op_journal_seq_set(&op->op, &inode->ei_journal_seq); +} + +static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode) +{ + struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + + bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode)); + return opts; +} + +/* page state: */ + +/* stored in page->private: */ + +/* + * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could + * almost protected it with the page lock, except that bch2_writepage_io_done has + * to update the sector counts (and from interrupt/bottom half context). 
+ */ +struct bch_page_state { +union { struct { + /* existing data: */ + unsigned sectors:PAGE_SECTOR_SHIFT + 1; + unsigned nr_replicas:4; + unsigned compressed:1; + + /* Owns PAGE_SECTORS sized reservation: */ + unsigned reserved:1; + unsigned reservation_replicas:4; + + /* Owns PAGE_SECTORS sized quota reservation: */ + unsigned quota_reserved:1; + + /* + * Number of sectors on disk - for i_blocks + * Uncompressed size, not compressed size: + */ + unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1; +}; + /* for cmpxchg: */ + unsigned long v; +}; +}; + +#define page_state_cmpxchg(_ptr, _new, _expr) \ +({ \ + unsigned long _v = READ_ONCE((_ptr)->v); \ + struct bch_page_state _old; \ + \ + do { \ + _old.v = _new.v = _v; \ + _expr; \ + \ + EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\ + } while (_old.v != _new.v && \ + (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \ + \ + _old; \ +}) + +static inline struct bch_page_state *page_state(struct page *page) +{ + struct bch_page_state *s = (void *) &page->private; + + BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); + + if (!PagePrivate(page)) + SetPagePrivate(page); + + return s; +} + +static inline unsigned page_res_sectors(struct bch_page_state s) +{ + + return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0; +} + +static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, + struct bch_page_state s) +{ + struct disk_reservation res = { .sectors = page_res_sectors(s) }; + struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 }; + + bch2_quota_reservation_put(c, inode, "a_res); + bch2_disk_reservation_put(c, &res); +} + +static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, + struct page *page) +{ + struct bch_page_state s; + + s = page_state_cmpxchg(page_state(page), s, { + s.reserved = 0; + s.quota_reserved = 0; + }); + + __bch2_put_page_reservation(c, inode, s); +} + +static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, + struct page *page, bool check_enospc) +{ + struct bch_page_state *s = page_state(page), new, old; + + /* XXX: this should not be open coded */ + unsigned nr_replicas = inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; + + struct disk_reservation disk_res = bch2_disk_reservation_init(c, + nr_replicas); + struct quota_res quota_res = { 0 }; + int ret = 0; + + /* + * XXX: this could likely be quite a bit simpler, page reservations + * _should_ only be manipulated with page locked: + */ + + old = page_state_cmpxchg(s, new, { + if (new.reserved + ? (new.reservation_replicas < disk_res.nr_replicas) + : (new.sectors < PAGE_SECTORS || + new.nr_replicas < disk_res.nr_replicas || + new.compressed)) { + int sectors = (disk_res.nr_replicas * PAGE_SECTORS - + page_res_sectors(new) - + disk_res.sectors); + + if (sectors > 0) { + ret = bch2_disk_reservation_add(c, &disk_res, sectors, + !check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (unlikely(ret)) + goto err; + } + + new.reserved = 1; + new.reservation_replicas = disk_res.nr_replicas; + } + + if (!new.quota_reserved && + new.sectors + new.dirty_sectors < PAGE_SECTORS) { + ret = bch2_quota_reservation_add(c, inode, "a_res, + PAGE_SECTORS - quota_res.sectors, + check_enospc); + if (unlikely(ret)) + goto err; + + new.quota_reserved = 1; + } + }); + + quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS; + disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old); +err: + bch2_quota_reservation_put(c, inode, "a_res); + bch2_disk_reservation_put(c, &disk_res); + return ret; +} + +static void bch2_clear_page_bits(struct page *page) +{ + struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_page_state s; + + if (!PagePrivate(page)) + return; + + s.v = xchg(&page_state(page)->v, 0); + ClearPagePrivate(page); + + if (s.dirty_sectors) + i_sectors_acct(c, inode, NULL, -s.dirty_sectors); + + __bch2_put_page_reservation(c, inode, s); +} + +bool bch2_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct quota_res quota_res = { 0 }; + struct bch_page_state old, new; + + old = page_state_cmpxchg(page_state(&folio->page), new, + new.dirty_sectors = PAGE_SECTORS - new.sectors; + new.quota_reserved = 0; + ); + + quota_res.sectors += old.quota_reserved * PAGE_SECTORS; + + if (old.dirty_sectors != new.dirty_sectors) + i_sectors_acct(c, inode, "a_res, + new.dirty_sectors - old.dirty_sectors); + bch2_quota_reservation_put(c, inode, "a_res); + + return filemap_dirty_folio(mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + int ret; + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = filemap_fault(vmf); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * write_invalidate_inode_pages_range() - can drop this if/when we get + * a write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + + lock_page(page); + if (page->mapping != mapping || + page_offset(page) > i_size_read(&inode->v)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (bch2_get_page_reservation(c, inode, page, true)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + + if (!PageDirty(page)) + set_page_dirty(page); + wait_for_stable_page(page); +out: + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + sb_end_pagefault(inode->v.i_sb); + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + EBUG_ON(!PageLocked(&folio->page)); + EBUG_ON(folio_test_writeback(folio)); + + if (offset || length < folio_size(folio)) + return; + + bch2_clear_page_bits(&folio->page); +} + +bool 
bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + /* XXX: this can't take locks that are held while we allocate memory */ + EBUG_ON(!PageLocked(&folio->page)); + EBUG_ON(folio_test_writeback(folio)); + + if (folio_test_dirty(folio)) + return false; + + bch2_clear_page_bits(&folio->page); + return true; +} + +/* readpages/writepages: */ + +static bool bio_can_add_page_contig(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; + + return bio->bi_vcnt < bio->bi_max_vecs && + bio_end_sector(bio) == offset; +} + +static int bio_add_page_contig(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; + + EBUG_ON(!bio->bi_max_vecs); + + if (!bio->bi_vcnt) + bio->bi_iter.bi_sector = offset; + else if (!bio_can_add_page_contig(bio, page)) + return -1; + + __bio_add_page(bio, page, PAGE_SIZE, 0); + return 0; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) { + struct page *page = bv->bv_page; + + if (!bio->bi_status) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +static inline void page_state_init_for_read(struct page *page) +{ + struct bch_page_state *s = page_state(page); + + BUG_ON(s->reserved); + s->sectors = 0; + s->compressed = 0; +} + +struct readpages_iter { + struct address_space *mapping; + struct page **pages; + unsigned nr_pages; + unsigned idx; + pgoff_t offset; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + unsigned i, nr_pages = readahead_count(ractl); + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; + + iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!iter->pages) + return -ENOMEM; + + __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + put_page(iter->pages[i]); + } + + return 0; +} + +static inline struct page *readpage_iter_next(struct readpages_iter *iter) +{ + if (iter->idx >= iter->nr_pages) + return NULL; + + EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); + + page_state_init_for_read(iter->pages[iter->idx]); + return iter->pages[iter->idx]; +} + +static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + bool compressed = bch2_extent_is_compressed(k); + unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k); + + bio_for_each_segment(bv, bio, iter) { + struct bch_page_state *s = page_state(bv.bv_page); + + /* sectors in @k from the start of this page: */ + unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset); + + unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); + + s->nr_replicas = !s->sectors + ? 
nr_ptrs + : min_t(unsigned, s->nr_replicas, nr_ptrs); + + BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); + s->sectors += page_sectors; + + s->compressed |= compressed; + } +} + +static void readpage_bio_extend(struct readpages_iter *iter, + struct bio *bio, u64 offset, + bool get_more) +{ + while (bio_end_sector(bio) < offset && + bio->bi_vcnt < bio->bi_max_vecs) { + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + struct page *page = readpage_iter_next(iter); + int ret; + + if (page) { + if (iter->offset + iter->idx != page_offset) + break; + + iter->idx++; + } else { + if (!get_more) + break; + + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) + break; + + page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); + if (!page) + break; + + page_state_init_for_read(page); + + ret = add_to_page_cache_lru(page, iter->mapping, + page_offset, GFP_NOFS); + if (ret) { + ClearPagePrivate(page); + put_page(page); + break; + } + + put_page(page); + } + + __bio_add_page(bio, page, PAGE_SIZE, 0); + } +} + +static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, + struct bch_read_bio *rbio, u64 inum, + struct readpages_iter *readpages_iter) +{ + struct bio *bio = &rbio->bio; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + + rbio->c = c; + rbio->start_time = local_clock(); + + while (1) { + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + unsigned bytes; + + bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + BUG_ON(!k.k); + + if (IS_ERR(k.k)) { + int ret = bch2_btree_iter_unlock(iter); + BUG_ON(!ret); + bcache_io_error(c, bio, "btree IO error %i", ret); + bio_endio(bio); + return; + } + + bkey_reassemble(&tmp.k, k); + bch2_btree_iter_unlock(iter); + k = bkey_i_to_s_c(&tmp.k); + + if (readpages_iter) { + bool want_full_extent = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + extent_for_each_crc(e, crc, i) + want_full_extent |= ((crc.csum_type != 0) | + (crc.compression_type != 0)); + } + + readpage_bio_extend(readpages_iter, + bio, k.k->p.offset, + want_full_extent); + } + + bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - + bio->bi_iter.bi_sector) << 9; + swap(bio->bi_iter.bi_size, bytes); + + if (bytes == bio->bi_iter.bi_size) + flags |= BCH_READ_LAST_FRAGMENT; + + if (bkey_extent_is_allocation(k.k)) + bch2_add_page_sectors(bio, k); + + bch2_read_extent(c, rbio, k, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + return; + + swap(bio->bi_iter.bi_size, bytes); + bio_advance(bio, bytes); + } +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, inode); + struct btree_iter iter; + struct page *page; + struct readpages_iter readpages_iter; + int ret; + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + + while ((page = readpage_iter_next(&readpages_iter))) { + pgoff_t index = readpages_iter.offset + readpages_iter.idx; + unsigned n = min_t(unsigned, + readpages_iter.nr_pages - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_NOFS, &c->bio_read), + 
opts); + + readpages_iter.idx++; + + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; + rbio->bio.bi_end_io = bch2_readpages_end_io; + __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); + + bchfs_read(c, &iter, rbio, inode->v.i_ino, &readpages_iter); + } + + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + kfree(readpages_iter.pages); +} + +static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inum, struct page *page) +{ + struct btree_iter iter; + + page_state_init_for_read(page); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + bio_add_page_contig(&rbio->bio, page); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); + bchfs_read(c, &iter, rbio, inum, NULL); +} + +static void bch2_read_single_page_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int bch2_read_single_page(struct page *page, + struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), + io_opts(c, inode)); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_page_end_io; + + __bchfs_readpage(c, rbio, inode->v.i_ino, page); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + SetPageUptodate(page); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + int ret; + + ret = bch2_read_single_page(page, page->mapping); + folio_unlock(folio); + return ret; +} + +/* writepages: */ + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + return (struct bch_writepage_state) { .opts = io_opts(c, inode) }; +} + +static void bch2_writepage_io_free(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + + bio_put(&io->op.op.wbio.bio); +} + +static void bch2_writepage_io_done(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct bch_fs *c = io->op.op.c; + struct bio *bio = &io->op.op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bvec; + + if (io->op.op.error) { + bio_for_each_segment_all(bvec, bio, iter) + SetPageError(bvec->bv_page); + set_bit(AS_EIO, &io->op.inode->v.i_mapping->flags); + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + BUG_ON(io->op.sectors_added > (s64) io->new_sectors); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? 
+ BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + if (io->op.sectors_added != io->new_sectors) + i_sectors_acct(c, io->op.inode, NULL, + io->op.sectors_added - (s64) io->new_sectors); + + bio_for_each_segment_all(bvec, bio, iter) + end_page_writeback(bvec->bv_page); + + closure_return_with_destructor(&io->cl, bch2_writepage_io_free); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); + continue_at(&io->cl, bch2_writepage_io_done, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + struct page *page, + unsigned nr_replicas) +{ + struct bch_write_op *op; + u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_NOFS, + &c->writepage_bioset), + struct bch_writepage_io, op.op.wbio.bio); + + closure_init(&w->io->cl, NULL); + w->io->new_sectors = 0; + bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false); + op = &w->io->op.op; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->pos = POS(inode->v.i_ino, offset); + op->wbio.bio.bi_iter.bi_sector = offset; +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct page *page = &folio->page; + struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_page_state new, old; + unsigned offset; + loff_t i_size = i_size_read(&inode->v); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + EBUG_ON(!PageUptodate(page)); + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto do_io; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_SIZE - 1); + if (page->index > end_index || !offset) { + unlock_page(page); + return 0; + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
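+ *
+ * Worked example (illustrative; assumes PAGE_SIZE == 4096): with
+ * i_size == 10000, end_index == 10000 >> PAGE_SHIFT == 2 and
+ * offset == 10000 & (PAGE_SIZE - 1) == 1808, so the page at index 2
+ * straddles i_size and bytes [1808, 4096) of it are zeroed below before
+ * it goes out for writeback.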
+ */ + zero_user_segment(page, offset, PAGE_SIZE); +do_io: + /* Before unlocking the page, transfer reservation to w->io: */ + old = page_state_cmpxchg(page_state(page), new, { + EBUG_ON(!new.reserved && + (new.sectors != PAGE_SECTORS || + new.compressed)); + + if (new.reserved) + new.nr_replicas = new.reservation_replicas; + new.reserved = 0; + + new.compressed |= w->opts.compression != 0; + + new.sectors += new.dirty_sectors; + new.dirty_sectors = 0; + }); + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + if (w->io && + (w->io->op.op.res.nr_replicas != new.nr_replicas || + !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas); + + w->io->new_sectors += new.sectors - old.sectors; + + BUG_ON(inode != w->io->op.inode); + BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); + + if (old.reserved) + w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS; + + w->io->op.new_i_size = i_size; + + if (wbc->sync_mode == WB_SYNC_ALL) + w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + return ret; +} + +int bch2_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); + int ret; + + ret = __bch2_writepage(page_folio(page), wbc, &w); + if (w.io) + bch2_writepage_do_io(&w); + + return ret; +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + struct page *page; + int ret = -ENOMEM; + + BUG_ON(inode_unhashed(&inode->v)); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + + page = grab_cache_page_write_begin(mapping, index); + if (!page) + goto err_unlock; + + if (PageUptodate(page)) + goto out; + + /* If we're writing entire page, don't need to read it in first: */ + if (len == PAGE_SIZE) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + zero_user_segment(page, len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } + + if (index > inode->v.i_size >> PAGE_SHIFT) { + zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } +readpage: + ret = bch2_read_single_page(page, mapping); + if (ret) + goto err; +out: + ret = bch2_get_page_reservation(c, inode, page, true); + if (ret) { + if (!PageUptodate(page)) { + /* + * If the page hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the page is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = page; + return 0; +err: + unlock_page(page); + put_page(page); + *pagep = NULL; +err_unlock: + 
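+ /* + * If we got here via err, the page was already unlocked and released above; if we jumped straight to err_unlock, we never grabbed one. Either way, all that's left is dropping the pagecache add lock taken at the top of bch2_write_begin(). + */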
bch2_pagecache_add_put(&inode->ei_pagecache_lock); + return ret; +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + lockdep_assert_held(&inode->v.i_rwsem); + + if (unlikely(copied < len && !PageUptodate(page))) { + /* + * The page needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + zero_user(page, 0, PAGE_SIZE); + flush_dcache_page(page); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) + set_page_dirty(page); + + inode->ei_last_dirtied = (unsigned long) current; + } else { + bch2_put_page_reservation(c, inode, page); + } + + unlock_page(page); + put_page(page); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + return copied; +} + +#define WRITE_BATCH_PAGES 32 + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct page *pages[WRITE_BATCH_PAGES]; + unsigned long index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + unsigned i, copied = 0, nr_pages_copied = 0; + int ret = 0; + + BUG_ON(!len); + BUG_ON(nr_pages > ARRAY_SIZE(pages)); + + for (i = 0; i < nr_pages; i++) { + pages[i] = grab_cache_page_write_begin(mapping, index + i); + if (!pages[i]) { + nr_pages = i; + ret = -ENOMEM; + goto out; + } + } + + if (offset && !PageUptodate(pages[0])) { + ret = bch2_read_single_page(pages[0], mapping); + if (ret) + goto out; + } + + if ((pos + len) & (PAGE_SIZE - 1) && + !PageUptodate(pages[nr_pages - 1])) { + if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { + zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + } else { + ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + if (ret) + goto out; + } + } + + for (i = 0; i < nr_pages; i++) { + ret = bch2_get_page_reservation(c, inode, pages[i], true); + + if (ret && !PageUptodate(pages[i])) { + ret = bch2_read_single_page(pages[i], mapping); + if (ret) + goto out; + + ret = bch2_get_page_reservation(c, inode, pages[i], true); + } + + if (ret) + goto out; + } + + if (mapping_writably_mapped(mapping)) + for (i = 0; i < nr_pages; i++) + flush_dcache_page(pages[i]); + + while (copied < len) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); + unsigned pg_bytes = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); + unsigned pg_copied = copy_page_from_iter_atomic(page, + pg_offset, pg_bytes, iter); + + flush_dcache_page(page); + copied += pg_copied; + + if (pg_copied != pg_bytes) + break; + } + + if (!copied) + goto out; + + nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + inode->ei_last_dirtied = (unsigned long) current; + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied < len && + ((offset + copied) & (PAGE_SIZE - 1))) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + + if 
(!PageUptodate(page)) { + zero_user(page, 0, PAGE_SIZE); + copied -= (offset + copied) & (PAGE_SIZE - 1); + } + } +out: + for (i = 0; i < nr_pages_copied; i++) { + if (!PageUptodate(pages[i])) + SetPageUptodate(pages[i]); + if (!PageDirty(pages[i])) + set_page_dirty(pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + for (i = nr_pages_copied; i < nr_pages; i++) { + if (!PageDirty(pages[i])) + bch2_put_page_reservation(c, inode, pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE * WRITE_BATCH_PAGES - offset); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + return written ? 
written : ret; +} + +/* O_DIRECT reads */ + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + bch2_direct_IO_read_endio(bio); + bio_check_pages_dirty(bio); /* transfers ownership */ +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, inode); + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + iov_iter_npages(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = ret; + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + iov_iter_npages(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos 
+= ret; + } else { + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + } + + return ret; +} + +/* O_DIRECT writes */ + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static void bch2_dio_write_loop_async(struct closure *); + +static long bch2_dio_write_loop(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct address_space *mapping = req->ki_filp->f_mapping; + struct bch_inode_info *inode = dio->iop.inode; + struct bio *bio = &dio->iop.op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; + bool sync; + long ret; + + if (dio->loop) + goto loop; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + /* Write and invalidate pagecache range that we're writing to: */ + ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, + req->ki_pos + iov_iter_count(&dio->iter) - 1); + if (unlikely(ret)) + goto err; + + while (1) { + if (current != dio->task) + kthread_use_mm(dio->task->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + current->faults_disabled_mapping = NULL; + if (current != dio->task) + kthread_unuse_mm(dio->task->mm); + + if (unlikely(ret < 0)) + goto err; + + /* gup might have faulted pages back in: */ + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos + (dio->iop.op.written << 9), + req->ki_pos + iov_iter_count(&dio->iter) - 1); + if (unlikely(ret)) + goto err; + + dio->iop.op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->iop.op.written); + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + + if (!dio->sync && !dio->loop && dio->iter.count) { + if (bch2_dio_write_copy_iov(dio)) { + dio->iop.op.error = -ENOMEM; + goto err_wait_io; + } + } +err_wait_io: + dio->loop = true; + + if (!dio->sync) { + continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); + return -EIOCBQUEUED; + } + + closure_sync(&dio->cl); +loop: + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + if (!dio->iter.count || dio->iop.op.error) + break; + bio_reset(bio, NULL, REQ_OP_WRITE); + } + + ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); +err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res); + bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + closure_debug_destroy(&dio->cl); + + sync = 
dio->sync; + bio_put(bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static void bch2_dio_write_loop_async(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, cl); + + bch2_dio_write_loop(dio); +} + +static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + ssize_t ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + if (unlikely(!iter->count)) + return 0; + + if (unlikely((offset|iter->count) & (block_bytes(c) - 1))) + return -EINVAL; + + bio = bio_alloc_bioset(NULL, + iov_iter_npages(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, iop.op.wbio.bio); + closure_init(&dio->cl, NULL); + dio->req = req; + dio->task = current; + dio->loop = false; + dio->sync = is_sync_kiocb(req) || + offset + iter->count > inode->v.i_size; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->iter = *iter; + bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); + dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); + dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; + + if ((req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->iop.op.flags |= BCH_WRITE_FLUSH; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + iter->count >> 9, true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, + dio->iop.op.opts.data_replicas, 0); + if (unlikely(ret)) { + if (bch2_check_range_allocated(c, POS(inode->v.i_ino, + offset >> 9), + iter->count >> 9)) + goto err; + + dio->iop.unalloc = true; + } + + dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; + + return bch2_dio_write_loop(dio); +err: + bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + closure_debug_destroy(&dio->cl); + bio_put(bio); + return ret; +} + +static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) + return bch2_direct_write(iocb, from); + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + ret = iocb->ki_flags & IOCB_DIRECT + ? 
bch2_direct_write(iocb, from) + : bch2_buffered_write(iocb, from); + + if (likely(ret > 0)) + iocb->ki_pos += ret; + + return ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp); + bool direct = iocb->ki_flags & IOCB_DIRECT; + ssize_t ret; + + inode_lock(&inode->v); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __bch2_write_iter(iocb, from); + inode_unlock(&inode->v); + + if (ret > 0 && !direct) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +/* fsync: */ + +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; + + ret = file_write_and_wait_range(file, start, end); + if (ret) + return ret; + + if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) + goto out; + + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + return ret; +out: + if (c->opts.journal_flush_disabled) + return 0; + + return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); +} + +/* truncate: */ + +static inline int range_has_data(struct bch_fs *c, + struct bpos start, + struct bpos end) +{ + + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + start, 0, k) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (bkey_extent_is_data(k.k)) { + ret = 1; + break; + } + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +static int __bch2_truncate_page(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + unsigned start_offset = start & (PAGE_SIZE - 1); + unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + struct page *page; + int ret = 0; + + /* Page boundary? Nothing to do */ + if (!((index == start >> PAGE_SHIFT && start_offset) || + (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) + return 0; + + /* Above i_size? */ + if (index << PAGE_SHIFT >= inode->v.i_size) + return 0; + + page = find_lock_page(mapping, index); + if (!page) { + /* + * XXX: we're doing two index lookups when we end up reading the + * page + */ + ret = range_has_data(c, + POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + if (ret <= 0) + return ret; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (unlikely(!page)) { + ret = -ENOMEM; + goto out; + } + } + + if (!PageUptodate(page)) { + ret = bch2_read_single_page(page, mapping); + if (ret) + goto unlock; + } + + /* + * Bit of a hack - we don't want truncate to fail due to -ENOSPC. + * + * XXX: because we aren't currently tracking whether the page has actual + * data in it (vs. just 0s, or only partially written) this wrong. ick. 
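+ * (i.e. we can end up taking a reservation here for a page that only contains zeroes and doesn't actually need one)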
+ */ + ret = bch2_get_page_reservation(c, inode, page, false); + BUG_ON(ret); + + if (index == start >> PAGE_SHIFT && + index == end >> PAGE_SHIFT) + zero_user_segment(page, start_offset, end_offset); + else if (index == start >> PAGE_SHIFT) + zero_user_segment(page, start_offset, PAGE_SIZE); + else if (index == end >> PAGE_SHIFT) + zero_user_segment(page, 0, end_offset); + + if (!PageDirty(page)) + set_page_dirty(page); +unlock: + unlock_page(page); + put_page(page); +out: + return ret; +} + +static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) +{ + return __bch2_truncate_page(inode, from >> PAGE_SHIFT, + from, from + PAGE_SIZE); +} + +static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + int ret; + + ret = filemap_write_and_wait_range(mapping, + inode->ei_inode.bi_size, S64_MAX); + if (ret) + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); + /* ATTR_MODE will never be set here, ns argument isn't needed: */ + setattr_copy(NULL, &inode->v, iattr); + + mutex_lock(&inode->ei_update_lock); + inode_set_ctime_current(&inode->v); + inode->v.i_mtime = inode_get_ctime(&inode->v); + ret = bch2_write_inode_size(c, inode, inode->v.i_size); + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + +int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct i_sectors_hook i_sectors_hook = + i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); + bool shrink; + int ret = 0; + + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + + shrink = iattr->ia_size <= inode->v.i_size; + + if (!shrink) { + ret = bch2_extend(inode, iattr); + goto err_put_pagecache; + } + + ret = bch2_truncate_page(inode, iattr->ia_size); + if (unlikely(ret)) + goto err_put_pagecache; + + if (iattr->ia_size > inode->ei_inode.bi_size) + ret = filemap_write_and_wait_range(mapping, + inode->ei_inode.bi_size, + iattr->ia_size - 1); + else if (iattr->ia_size & (PAGE_SIZE - 1)) + ret = filemap_write_and_wait_range(mapping, + round_down(iattr->ia_size, PAGE_SIZE), + iattr->ia_size - 1); + if (ret) + goto err_put_pagecache; + + i_sectors_hook.new_i_size = iattr->ia_size; + + ret = i_sectors_dirty_start(c, &i_sectors_hook); + if (unlikely(ret)) + goto err_put_pagecache; + + truncate_setsize(&inode->v, iattr->ia_size); + + ret = bch2_inode_truncate(c, inode->v.i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + &i_sectors_hook.hook, + &inode->ei_journal_seq); + if (unlikely(ret)) + goto err_put_sectors_dirty; + + /* ATTR_MODE will never be set here, ns argument isn't needed: */ + setattr_copy(NULL, &inode->v, iattr); + inode_set_ctime_current(&inode->v); + inode->v.i_mtime = inode_get_ctime(&inode->v); +out: + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; +err_put_pagecache: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + return ret; +err_put_sectors_dirty: + /* + * On error - in particular, bch2_truncate_page() error - don't clear + * I_SIZE_DIRTY, as we've left data above i_size!: + */ + i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; + goto out; +} + +/* fallocate: */ + +static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + u64 ino = inode->v.i_ino; + u64 discard_start = 
round_up(offset, PAGE_SIZE) >> 9; + u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + int ret = 0; + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + ret = __bch2_truncate_page(inode, + offset >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto err; + + if (offset >> PAGE_SHIFT != + (offset + len) >> PAGE_SHIFT) { + ret = __bch2_truncate_page(inode, + (offset + len) >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto err; + } + + truncate_pagecache_range(&inode->v, offset, offset + len - 1); + + if (discard_start < discard_end) { + /* + * We need to pass in a disk reservation here because we might + * be splitting a compressed extent into two. This isn't a + * problem with truncate because truncate will never split an + * extent, only truncate it... + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct i_sectors_hook i_sectors_hook = + i_sectors_hook_init(inode, 0); + int ret; + + ret = i_sectors_dirty_start(c, &i_sectors_hook); + if (unlikely(ret)) + goto err; + + ret = bch2_btree_delete_range(c, + BTREE_ID_EXTENTS, + POS(ino, discard_start), + POS(ino, discard_end), + ZERO_VERSION, + &disk_res, + &i_sectors_hook.hook, + &inode->ei_journal_seq); + + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + } +err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + + return ret; +} + +static long bch2_fcollapse(struct bch_inode_info *inode, + loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct btree_iter src; + struct btree_iter dst; + BKEY_PADDED(k) copy; + struct bkey_s_c k; + struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); + loff_t new_size; + int ret; + + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + + bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + /* position will be set from dst iter's position: */ + bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); + bch2_btree_iter_link(&src, &dst); + + /* + * We need i_mutex to keep the page cache consistent with the extents + * btree, and the btree consistent with i_size - we don't need outside + * locking for the extents btree itself, because we're using linked + * iterators + */ + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + ret = -EINVAL; + if (offset + len >= inode->v.i_size) + goto err; + + if (inode->v.i_size < len) + goto err; + + new_size = inode->v.i_size - len; + + ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) + goto err; + + ret = i_sectors_dirty_start(c, &i_sectors_hook); + if (ret) + goto err; + + while (bkey_cmp(dst.pos, + POS(inode->v.i_ino, + round_up(new_size, PAGE_SIZE) >> 9)) < 0) { + struct disk_reservation disk_res; + + bch2_btree_iter_set_pos(&src, + POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + + k = bch2_btree_iter_peek_slot(&src); + if ((ret = btree_iter_err(k))) + goto btree_iter_err; + + bkey_reassemble(©.k, k); + + bch2_cut_front(src.pos, ©.k); + copy.k.k.p.offset -= len >> 9; + + BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + + ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, + bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + ret = 
bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&dst, ©.k)); + bch2_disk_reservation_put(c, &disk_res); +btree_iter_err: + if (ret == -EINTR) + ret = 0; + if (ret) { + bch2_btree_iter_unlock(&src); + bch2_btree_iter_unlock(&dst); + goto err_put_sectors_dirty; + } + /* + * XXX: if we error here we've left data with multiple + * pointers... which isn't a _super_ serious problem... + */ + + bch2_btree_iter_cond_resched(&src); + } + + bch2_btree_iter_unlock(&src); + bch2_btree_iter_unlock(&dst); + + ret = bch2_inode_truncate(c, inode->v.i_ino, + round_up(new_size, block_bytes(c)) >> 9, + &i_sectors_hook.hook, + &inode->ei_journal_seq); + if (ret) + goto err_put_sectors_dirty; + + i_sectors_hook.new_i_size = new_size; +err_put_sectors_dirty: + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; +err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +} + +static long bch2_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->v.i_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); + struct btree_iter iter; + struct bpos end_pos; + loff_t block_start, block_end; + loff_t end = offset + len; + unsigned sectors; + unsigned replicas = io_opts(c, inode).data_replicas; + int ret; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) + goto err; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = __bch2_truncate_page(inode, + offset >> PAGE_SHIFT, + offset, end); + + if (!ret && + offset >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + offset, end); + + if (unlikely(ret)) + goto err; + + truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, PAGE_SIZE); + block_end = round_down(end, PAGE_SIZE); + } else { + block_start = round_down(offset, PAGE_SIZE); + block_end = round_up(end, PAGE_SIZE); + } + + bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9)); + end_pos = POS(inode->v.i_ino, block_end >> 9); + + ret = i_sectors_dirty_start(c, &i_sectors_hook); + if (unlikely(ret)) + goto err; + + while (bkey_cmp(iter.pos, end_pos) < 0) { + struct disk_reservation disk_res = { 0 }; + struct bkey_i_reservation reservation; + struct bkey_s_c k; + + k = bch2_btree_iter_peek_slot(&iter); + if ((ret = btree_iter_err(k))) + goto btree_iter_err; + + /* already reserved */ + if (k.k->type == BCH_RESERVATION && + bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { + bch2_btree_iter_next_slot(&iter); + continue; + } + + if (bkey_extent_is_data(k.k)) { + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_next_slot(&iter); + continue; + } + } + + bkey_reservation_init(&reservation.k_i); + reservation.k.type = BCH_RESERVATION; + reservation.k.p = k.k->p; + reservation.k.size = k.k->size; + + bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_back(end_pos, &reservation.k); + + sectors = reservation.k.size; + reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); + + if (!bkey_extent_is_allocation(k.k)) { + ret = bch2_quota_reservation_add(c, inode, + 
&i_sectors_hook.quota_res, + sectors, true); + if (unlikely(ret)) + goto btree_iter_err; + } + + if (reservation.v.nr_replicas < replicas || + bch2_extent_is_compressed(k)) { + ret = bch2_disk_reservation_get(c, &disk_res, sectors, + replicas, 0); + if (unlikely(ret)) + goto btree_iter_err; + + reservation.v.nr_replicas = disk_res.nr_replicas; + } + + ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); + bch2_disk_reservation_put(c, &disk_res); +btree_iter_err: + if (ret == -EINTR) + ret = 0; + if (ret) { + bch2_btree_iter_unlock(&iter); + goto err_put_sectors_dirty; + } + + } + bch2_btree_iter_unlock(&iter); + + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + end > inode->v.i_size) { + i_size_write(&inode->v, end); + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, inode->v.i_size); + mutex_unlock(&inode->ei_update_lock); + } + + /* blech */ + if ((mode & FALLOC_FL_KEEP_SIZE) && + (mode & FALLOC_FL_ZERO_RANGE) && + inode->ei_inode.bi_size != inode->v.i_size) { + /* sync appends.. */ + ret = filemap_write_and_wait_range(mapping, + inode->ei_inode.bi_size, S64_MAX); + if (ret) + goto err; + + if (inode->ei_inode.bi_size != inode->v.i_size) { + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, inode->v.i_size); + mutex_unlock(&inode->ei_update_lock); + } + } + + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + + return 0; +err_put_sectors_dirty: + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; +err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +} + +long bch2_fallocate_dispatch(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct bch_inode_info *inode = file_bch_inode(file); + + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + return bch2_fallocate(inode, mode, offset, len); + + if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + return bch2_fpunch(inode, offset, len); + + if (mode == FALLOC_FL_COLLAPSE_RANGE) + return bch2_fcollapse(inode, offset, len); + + return -EOPNOTSUPP; +} + +/* fseek: */ + +static bool folio_is_data(struct folio *folio) +{ + /* XXX: should only have to check PageDirty */ + return folio_test_private(folio) && + (page_state(&folio->page)->sectors || + page_state(&folio->page)->dirty_sectors); +} + +static loff_t bch2_next_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + folio_lock(folio); + if (folio_is_data(folio)) { + end_offset = + min(end_offset, + max(start_offset, + ((loff_t) index) << PAGE_SHIFT)); + folio_unlock(folio); + folio_batch_release(&fbatch); + return end_offset; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_iter iter; + struct 
bkey_s_c k; + u64 isize, next_data = MAX_LFS_FILESIZE; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), 0, k) { + if (k.k->p.inode != inode->v.i_ino) { + break; + } else if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (next_data > offset) + next_data = bch2_next_pagecache_data(&inode->v, + offset, next_data); + + if (next_data > isize) + return -ENXIO; + + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + +static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) +{ + struct page *page; + bool ret; + + page = find_lock_page(mapping, index); + if (!page) + return false; + + ret = folio_is_data(page_folio(page)); + unlock_page(page); + + return ret; +} + +static loff_t bch2_next_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset) +{ + struct address_space *mapping = vinode->i_mapping; + pgoff_t index; + + for (index = start_offset >> PAGE_SHIFT; + index < end_offset >> PAGE_SHIFT; + index++) + if (!page_slot_is_data(mapping, index)) + end_offset = max(start_offset, + ((loff_t) index) << PAGE_SHIFT); + + return end_offset; +} + +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + u64 isize, next_hole = MAX_LFS_FILESIZE; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS, k) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_next_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_next_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9); + + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (next_hole > isize) + next_hole = isize; + + return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); +} + +loff_t bch2_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + return bch2_seek_data(file, offset); + case SEEK_HOLE: + return bch2_seek_hole(file, offset); + } + + return -EINVAL; +} + +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ + int ret = 0; + + pr_verbose_init(c->opts, ""); + + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.op.wbio.bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, iop.op.wbio.bio), + BIOSET_NEED_BVECS)) + ret = -ENOMEM; + + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 index 
000000000000..2e4bfee877d9 --- /dev/null +++ b/fs/bcachefs/fs-io.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H +#define _BCACHEFS_FS_IO_H + +#ifndef NO_BCACHEFS_FS + +#include "buckets.h" +#include "io_types.h" + +#include + +bool bch2_dirty_folio(struct address_space *, struct folio *); + +int bch2_writepage(struct page *, struct writeback_control *); +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bch2_truncate(struct bch_inode_info *, struct iattr *); +long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); + +loff_t bch2_llseek(struct file *, loff_t, int); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else +static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} +static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 index 000000000000..895ccc79e782 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "chardev.h" +#include "fs.h" +#include "fs-ioctl.h" +#include "quota.h" + +#include +#include + +#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) + +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define 
map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} + +struct flags_set { + unsigned mask; + unsigned flags; + + unsigned projid; +}; + +static int bch2_inode_flags_set(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not inode->i_flags: + */ + struct flags_set *s = p; + unsigned newflags = s->flags; + unsigned oldflags = bi->bi_flags & s->mask; + + if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (!S_ISREG(inode->v.i_mode) && + !S_ISDIR(inode->v.i_mode) && + (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + return -EINVAL; + + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + inode_set_ctime_current(&inode->v); + return 0; +} + +static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) +{ + unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); + + return put_user(flags, arg); +} + +static int bch2_ioc_setflags(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + void __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; + unsigned uflags; + int ret; + + if (get_user(uflags, (int __user *) arg)) + return -EFAULT; + + s.flags = map_flags_rev(bch_flags_to_uflags, uflags); + if (uflags) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto setflags_out; + } + + mutex_lock(&inode->ei_update_lock); + ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0); + + if (!ret) + bch2_inode_flags_to_vfs(inode); + mutex_unlock(&inode->ei_update_lock); + +setflags_out: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct fsxattr fa = { 0 }; + + fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + + return copy_to_user(arg, &fa, sizeof(fa)); +} + +static int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + int ret; + + if (projid == inode->ei_qid.q[QTYP_PRJ]) + return 0; + + qid.q[QTYP_PRJ] = projid; + + return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved); + if (ret) + return ret; + + inode->ei_qid.q[QTYP_PRJ] = projid; + return 0; +} + +static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct flags_set *s = p; + + bi->bi_project = s->projid; + + return bch2_inode_flags_set(inode, bi, p); +} + +static int bch2_ioc_fssetxattr(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; + struct fsxattr fa; + int ret; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); + if (fa.fsx_xflags) + return -EOPNOTSUPP; + + s.projid 
= fa.fsx_projid; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto err; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + + ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0); + if (!ret) + bch2_inode_flags_to_vfs(inode); +err_unlock: + mutex_unlock(&inode->ei_update_lock); +err: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct super_block *sb = inode->v.i_sb; + struct bch_fs *c = sb->s_fs_info; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return bch2_ioc_getflags(inode, (int __user *) arg); + + case FS_IOC_SETFLAGS: + return bch2_ioc_setflags(c, file, inode, (int __user *) arg); + + case FS_IOC_FSGETXATTR: + return bch2_ioc_fsgetxattr(inode, (void __user *) arg); + case FS_IOC_FSSETXATTR: + return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg); + + case FS_IOC_GETVERSION: + return -ENOTTY; + case FS_IOC_SETVERSION: + return -ENOTTY; + + case FS_IOC_GOINGDOWN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + down_write(&sb->s_umount); + sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + up_write(&sb->s_umount); + return 0; + + default: + return bch2_fs_ioctl(c, cmd, (void __user *) arg); + } +} + +#ifdef CONFIG_COMPAT +long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case FS_IOC_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 index 000000000000..2d117ef80ab2 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IOCTL_H +#define _BCACHEFS_FS_IOCTL_H + +void bch2_inode_flags_to_vfs(struct bch_inode_info *); + +long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); +long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); + +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 index 000000000000..3f3d916e0d37 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1773 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "buckets.h" +#include "chardev.h" +#include "dirent.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-ioctl.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "quota.h" +#include "super.h" +#include "xattr.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *bch2_inode_cache; + +static void bch2_vfs_inode_init(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +static void journal_seq_copy(struct bch_inode_info *dst, + u64 journal_seq) +{ + u64 old, v = READ_ONCE(dst->ei_journal_seq); + + do { + old = v; + + if (old >= journal_seq) + 
break; + } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); +} + +static void __pagecache_lock_put(struct pagecache_lock *lock, long i) +{ + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) +{ + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +static void __pagecache_lock_get(struct pagecache_lock *lock, long i) +{ + wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); +} + +void bch2_pagecache_add_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, 1); +} + +void bch2_pagecache_add_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, 1); +} + +void bch2_pagecache_block_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, -1); +} + +void bch2_pagecache_block_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, -1); +} + +/* + * I_SIZE_DIRTY requires special handling: + * + * To the recovery code, the flag means that there is stale data past i_size + * that needs to be deleted; it's used for implementing atomic appends and + * truncates. + * + * On append, we set I_SIZE_DIRTY before doing the write, then after the write + * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size + * that exposes the data we just wrote. + * + * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting + * i_size to the new smaller size, then we delete the data that we just made + * invisible, and then we clear I_SIZE_DIRTY. + * + * Because there can be multiple appends in flight at a time, we need a refcount + * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero + * refcount means I_SIZE_DIRTY is set, zero means it's cleared. + * + * Because write_inode() can be called at any time, i_size_dirty_count means + * something different to the runtime code - it means to write_inode() "don't + * update i_size yet". + * + * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when + * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must + * be set explicitly. + */ + +void bch2_inode_update_after_write(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) +{ + set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED + ? 
0 + : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; + + if (fields & ATTR_ATIME) + inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + if (fields & ATTR_MTIME) + inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + if (fields & ATTR_CTIME) + inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); + + inode->ei_inode = *bi; + inode->ei_qid = bch_qid(bi); +} + +int __must_check bch2_write_inode_trans(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + inode_set_fn set, + void *p) +{ + struct btree_iter *iter; + struct bkey_inode_buf *inode_p; + struct bkey_s_c k; + u64 inum = inode->v.i_ino; + int ret; + + lockdep_assert_held(&inode->ei_update_lock); + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + k = bch2_btree_iter_peek_slot(iter); + if ((ret = btree_iter_err(k))) + return ret; + + if (WARN_ONCE(k.k->type != BCH_INODE_FS, + "inode %llu not found when updating", inum)) + return -ENOENT; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", ret, inum)) + return -ENOENT; + + BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size); + + BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size && + !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) && + inode_u->bi_size > i_size_read(&inode->v)); + + if (set) { + ret = set(inode, inode_u, p); + if (ret) + return ret; + } + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; +} + +int __must_check __bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) +{ + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + int ret; + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) + bch2_inode_update_after_write(c, inode, &inode_u, fields); + + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} + +static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +{ + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; + int ret; + + inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + + ret = bch2_inode_find_by_inum(c, inum, &inode_u); + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(ret); + } + + bch2_vfs_inode_init(c, inode, &inode_u); + + inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); + + unlock_new_inode(&inode->v); + + return &inode->v; +} + +static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, + const struct inode *dir, umode_t mode) +{ + kuid_t uid = current_fsuid(); + kgid_t gid; + + if (dir && dir->i_mode & S_ISGID) { + gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + gid = current_fsgid(); + + inode_u->bi_uid = from_kuid(i_user_ns(dir), uid); + inode_u->bi_gid = from_kgid(i_user_ns(dir), gid); + inode_u->bi_mode = mode; +} + +static int inode_update_for_create_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_unpacked *new_inode = p; + struct timespec64 now = current_time(&inode->v); + + bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + + if (S_ISDIR(new_inode->bi_mode)) + bi->bi_nlink++; + + return 0; +} + +static int inum_test(struct inode *inode, void *p) +{ + unsigned long *ino = p; + + return *ino == inode->i_ino; +} + +static struct bch_inode_info * +__bch2_create(struct mnt_idmap *idmap, + struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, bool tmpfile) +{ + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode, *old; + struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; + struct posix_acl *default_acl = NULL, *acl = NULL; + int ret; + + bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); + bch2_inode_init_owner(&inode_u, &dir->v, mode); + + inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; + + hash_info = bch2_hash_info_init(c, &inode_u); + + if (tmpfile) + inode_u.bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); + if (ret) + return ERR_PTR(ret); + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); + if (ret) + goto err; +#endif + + /* + * preallocate vfs inode before btree transaction, so that nothing can + * fail after the transaction succeeds: + */ + inode = to_bch_ei(new_inode(c->vfs_sb)); + if (unlikely(!inode)) { + ret = -ENOMEM; + goto err; + } + + if (!tmpfile) + mutex_lock(&dir->ei_update_lock); + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_inode_create(&trans, &inode_u, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint) ?: + (default_acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + default_acl, ACL_TYPE_DEFAULT) + : 0) ?: + (acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + acl, ACL_TYPE_ACCESS) + : 0) ?: + (!tmpfile + ? __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(mode), + &dentry->d_name, + inode_u.bi_inum, + BCH_HASH_SET_MUST_CREATE) + : 0) ?: + (!tmpfile + ? 
bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_for_create_fn, + &inode_u) + : 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err_trans; + + atomic_long_inc(&c->nr_inodes); + + if (!tmpfile) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(dir, inode->ei_journal_seq); + mutex_unlock(&dir->ei_update_lock); + } + + bch2_vfs_inode_init(c, inode, &inode_u); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); + + /* + * we must insert the new inode into the inode cache before calling + * bch2_trans_exit() and dropping locks, else we could race with another + * thread pulling the inode in and modifying it: + */ + + inode->v.i_state |= I_CREATING; + old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, + inum_test, NULL, &inode->v.i_ino)); + BUG_ON(!old); + + if (unlikely(old != inode)) { + /* + * We raced, another process pulled the new inode into cache + * before us: + */ + old->ei_journal_seq = inode->ei_journal_seq; + make_bad_inode(&inode->v); + iput(&inode->v); + + inode = old; + } else { + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } + + bch2_trans_exit(&trans); +out: + posix_acl_release(default_acl); + posix_acl_release(acl); + return inode; +err_trans: + bch2_trans_exit(&trans); + make_bad_inode(&inode->v); + iput(&inode->v); +err: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); + inode = ERR_PTR(ret); + goto out; +} + +/* methods */ + +static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + unsigned int flags) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct inode *vinode = NULL; + u64 inum; + + inum = bch2_dirent_lookup(c, dir->v.i_ino, + &dir->ei_str_hash, + &dentry->d_name); + + if (inum) + vinode = bch2_vfs_inode_get(c, inum); + + return d_splice_alias(vinode, dentry); +} + +static int bch2_create(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int inode_update_for_link_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec64 now = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, now); + + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; + + return 0; +} + +static int __bch2_link(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_info *dir, + struct dentry *dentry) +{ + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(inode->v.i_mode), + &dentry->d_name, + inode->v.i_ino, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_link_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + 
BTREE_INSERT_NOUNLOCK); + + if (ret == -EINTR) + goto retry; + + if (likely(!ret)) + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + + bch2_trans_exit(&trans); + return ret; +} + +static int bch2_link(struct dentry *old_dentry, struct inode *vdir, + struct dentry *dentry) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); + int ret; + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + return ret; + + ihold(&inode->v); + d_instantiate(dentry, &inode->v); + return 0; +} + +static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_info *unlink_inode = p; + struct timespec64 now = current_time(&inode->v); + + bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + + bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); + + return 0; +} + +static int inode_update_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec64 now = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, now); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; + + return 0; +} + +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_unpacked dir_u, inode_u; + struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_delete(&trans, dir->v.i_ino, + &dir->ei_str_hash, + &dentry->d_name) ?: + bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_dir_for_unlink_fn, + inode) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_unlink_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + if (ret) + goto err; + + if (dir->ei_journal_seq > inode->ei_journal_seq) + inode->ei_journal_seq = dir->ei_journal_seq; + + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); +err: + bch2_trans_exit(&trans); + + return ret; +} + +static int bch2_symlink(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + const char *symname) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir), *inode; + int ret; + + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + inode_lock(&inode->v); + ret = page_symlink(&inode->v, symname, strlen(symname) + 1); + inode_unlock(&inode->v); + + if (unlikely(ret)) + goto err; + + ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); + if (unlikely(ret)) + goto err; + + journal_seq_copy(dir, inode->ei_journal_seq); + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + goto err; + + d_instantiate(dentry, &inode->v); + return 0; +err: + iput(&inode->v); + return ret; +} + +static int bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), 
dentry, mode|S_IFDIR, 0, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + + if (bch2_empty_dir(c, dentry->d_inode->i_ino)) + return -ENOTEMPTY; + + return bch2_unlink(vdir, dentry); +} + +static int bch2_mknod(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; +} + +struct rename_info { + u64 now; + struct bch_inode_info *src_dir; + struct bch_inode_info *dst_dir; + struct bch_inode_info *src_inode; + struct bch_inode_info *dst_inode; + enum bch_rename_mode mode; +}; + +static int inode_update_for_rename_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct rename_info *info = p; + + if (inode == info->src_dir) { + bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink += info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode) && + info->mode == BCH_RENAME_EXCHANGE; + } + + if (inode == info->dst_dir) { + bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink -= info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode); + } + + if (inode == info->dst_inode && + info->mode == BCH_RENAME_OVERWRITE) { + BUG_ON(bi->bi_nlink && + S_ISDIR(info->dst_inode->v.i_mode)); + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; + } + + if (inode == info->src_dir || + inode == info->dst_dir) + bi->bi_mtime = info->now; + bi->bi_ctime = info->now; + + return 0; +} + +static int bch2_rename2(struct mnt_idmap *idmap, + struct inode *src_vdir, struct dentry *src_dentry, + struct inode *dst_vdir, struct dentry *dst_dentry, + unsigned flags) +{ + struct bch_fs *c = src_vdir->i_sb->s_fs_info; + struct rename_info i = { + .now = timespec_to_bch2_time(c, + current_time(src_vdir)), + .src_dir = to_bch_ei(src_vdir), + .dst_dir = to_bch_ei(dst_vdir), + .src_inode = to_bch_ei(src_dentry->d_inode), + .dst_inode = to_bch_ei(dst_dentry->d_inode), + .mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? BCH_RENAME_OVERWRITE : BCH_RENAME, + }; + struct btree_trans trans; + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u; + u64 journal_seq = 0; + int ret; + + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + return -EINVAL; + + if (i.mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(i.src_inode->v.i_mode) != + S_ISDIR(i.dst_inode->v.i_mode)) + return -ENOTDIR; + + if (S_ISDIR(i.src_inode->v.i_mode) && + bch2_empty_dir(c, i.dst_inode->v.i_ino)) + return -ENOTEMPTY; + + ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + 0, LLONG_MAX); + if (ret) + return ret; + } + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + i.now = timespec_to_bch2_time(c, current_time(src_vdir)), + + ret = bch2_dirent_rename(&trans, + i.src_dir, &src_dentry->d_name, + i.dst_dir, &dst_dentry->d_name, + i.mode) ?: + bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, + inode_update_for_rename_fn, &i) ?: + (i.src_dir != i.dst_dir + ? 
bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, + inode_update_for_rename_fn, &i) ?: + (i.dst_inode + ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_trans_commit(&trans, NULL, NULL, + &journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.src_dir, journal_seq); + + if (i.src_dir != i.dst_dir) { + bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.dst_dir, journal_seq); + } + + bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + ATTR_CTIME); + if (i.dst_inode) + bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + ATTR_CTIME); +err: + bch2_trans_exit(&trans); + + return ret; +} + +struct inode_write_setattr { + struct iattr *attr; + struct mnt_idmap *idmap; +}; + +static int inode_update_for_setattr_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct inode_write_setattr *s = p; + unsigned int ia_valid = s->attr->ia_valid; + + if (ia_valid & ATTR_UID) + bi->bi_uid = from_kuid(i_user_ns(&inode->v), s->attr->ia_uid); + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(i_user_ns(&inode->v), s->attr->ia_gid); + + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, s->attr->ia_atime); + if (ia_valid & ATTR_MTIME) + bi->bi_mtime = timespec_to_bch2_time(c, s->attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + bi->bi_ctime = timespec_to_bch2_time(c, s->attr->ia_ctime); + + if (ia_valid & ATTR_MODE) { + umode_t mode = s->attr->ia_mode; + kgid_t gid = ia_valid & ATTR_GID + ? s->attr->ia_gid + : inode->v.i_gid; + + if (!in_group_p(gid) && + !capable_wrt_inode_uidgid(s->idmap, &inode->v, CAP_FSETID)) + mode &= ~S_ISGID; + bi->bi_mode = mode; + } + + return 0; +} + +static int bch2_setattr_nonsize(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid = inode->ei_qid; + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; + struct inode_write_setattr s = { iattr, idmap }; + unsigned qtypes = 0; + int ret; + + mutex_lock(&inode->ei_update_lock); + + if (c->opts.usrquota && + (iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->v.i_uid)) { + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid), + qtypes |= 1 << QTYP_USR; + } + + if (c->opts.grpquota && + (iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), iattr->ia_gid); + qtypes |= 1 << QTYP_GRP; + } + + if (qtypes) { + ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved); + if (ret) + goto err; + } + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + kfree(acl); + acl = NULL; + + ret = bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_setattr_fn, &s) ?: + (iattr->ia_valid & ATTR_MODE + ? 
bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) + : 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err_trans; + + bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +err_trans: + bch2_trans_exit(&trans); +err: + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + +static int bch2_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned query_flags) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + stat->dev = inode->v.i_sb->s_dev; + stat->ino = inode->v.i_ino; + stat->mode = inode->v.i_mode; + stat->nlink = inode->v.i_nlink; + stat->uid = inode->v.i_uid; + stat->gid = inode->v.i_gid; + stat->rdev = inode->v.i_rdev; + stat->size = i_size_read(&inode->v); + stat->atime = inode->v.i_atime; + stat->mtime = inode->v.i_mtime; + stat->ctime = inode_get_ctime(&inode->v); + stat->blksize = block_bytes(c); + stat->blocks = inode->v.i_blocks; + + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); + } + + if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + return 0; +} + +static int bch2_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + ret = setattr_prepare(idmap, dentry, iattr); + if (ret) + return ret; + + return iattr->ia_valid & ATTR_SIZE + ? 
bch2_truncate(inode, iattr) + : bch2_setattr_nonsize(idmap, inode, iattr); +} + +static int bch2_tmpfile(struct mnt_idmap *idmap, + struct inode *vdir, struct file *file, umode_t mode) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), + file->f_path.dentry, mode, 0, true); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_mark_tmpfile(file, &inode->v); + d_instantiate(file->f_path.dentry, &inode->v); + return finish_open_simple(file, 0); +} + +static int bch2_fill_extent(struct fiemap_extent_info *info, + const struct bkey_i *k, unsigned flags) +{ + if (bkey_extent_is_data(&k->k)) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + int ret; + + extent_for_each_ptr_crc(e, ptr, crc) { + int flags2 = 0; + u64 offset = ptr->offset; + + if (crc.compression_type) + flags2 |= FIEMAP_EXTENT_ENCODED; + else + offset += crc.offset; + + if ((offset & (PAGE_SECTORS - 1)) || + (e.k->size & (PAGE_SECTORS - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, + bkey_start_offset(e.k) << 9, + offset << 9, + e.k->size << 9, flags|flags2); + if (ret) + return ret; + } + + return 0; + } else if (k->k.type == BCH_RESERVATION) { + return fiemap_fill_next_extent(info, + bkey_start_offset(&k->k) << 9, + 0, k->k.size << 9, + flags| + FIEMAP_EXTENT_DELALLOC| + FIEMAP_EXTENT_UNWRITTEN); + } else { + BUG(); + } +} + +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct btree_iter iter; + struct bkey_s_c k; + BKEY_PADDED(k) tmp; + bool have_extent = false; + int ret = 0; + + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + if (start + len < start) + return -EINVAL; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), 0, k) + if (bkey_extent_is_data(k.k) || + k.k->type == BCH_RESERVATION) { + if (bkey_cmp(bkey_start_pos(k.k), + POS(ei->v.i_ino, (start + len) >> 9)) >= 0) + break; + + if (have_extent) { + ret = bch2_fill_extent(info, &tmp.k, 0); + if (ret) + goto out; + } + + bkey_reassemble(&tmp.k, k); + have_extent = true; + } + + if (have_extent) + ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); +out: + bch2_btree_iter_unlock(&iter); + return ret < 0 ? 
ret : 0; +} + +static const struct vm_operations_struct bch_vm_ops = { + .fault = bch2_page_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = bch2_page_mkwrite, +}; + +static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + + vma->vm_ops = &bch_vm_ops; + return 0; +} + +/* Directories: */ + +static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_file_llseek_size(file, offset, whence, + S64_MAX, S64_MAX); +} + +static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct bch_fs *c = file_inode(file)->i_sb->s_fs_info; + + return bch2_readdir(c, file, ctx); +} + +static const struct file_operations bch_file_operations = { + .llseek = bch2_llseek, + .read_iter = bch2_read_iter, + .write_iter = bch2_write_iter, + .mmap = bch2_mmap, + .open = generic_file_open, + .fsync = bch2_fsync, + .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = bch2_fallocate_dispatch, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_file_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .fiemap = bch2_fiemap, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations bch_dir_inode_operations = { + .lookup = bch2_lookup, + .create = bch2_create, + .link = bch2_link, + .unlink = bch2_unlink, + .symlink = bch2_symlink, + .mkdir = bch2_mkdir, + .rmdir = bch2_rmdir, + .mknod = bch2_mknod, + .rename = bch2_rename2, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .tmpfile = bch2_tmpfile, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct file_operations bch_dir_file_operations = { + .llseek = bch2_dir_llseek, + .read = generic_read_dir, + .iterate_shared = bch2_vfs_readdir, + .fsync = bch2_fsync, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_symlink_inode_operations = { + .get_link = page_get_link, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations bch_special_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct address_space_operations bch_address_space_operations = { + .writepage = bch2_writepage, + .read_folio = bch2_read_folio, + .writepages = bch2_writepages, + .readahead = bch2_readahead, + .dirty_folio = bch2_dirty_folio, + .write_begin = bch2_write_begin, + .write_end = bch2_write_end, + .invalidate_folio = bch2_invalidate_folio, + .release_folio = bch2_release_folio, + .direct_IO = noop_direct_IO, +#ifdef CONFIG_MIGRATION + .migrate_folio = filemap_migrate_folio, +#endif + .error_remove_page = generic_error_remove_page, +}; + +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode; + + if (ino < BCACHEFS_ROOT_INO) + return ERR_PTR(-ESTALE); + + vinode = 
bch2_vfs_inode_get(c, ino); + if (IS_ERR(vinode)) + return ERR_CAST(vinode); + if (generation && vinode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(vinode); + return ERR_PTR(-ESTALE); + } + return vinode; +} + +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + bch2_nfs_get_inode); +} + +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + bch2_nfs_get_inode); +} + +static const struct export_operations bch_export_ops = { + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + //.get_parent = bch2_get_parent, +}; + +static void bch2_vfs_inode_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi) +{ + bch2_inode_update_after_write(c, inode, bi, ~0); + + inode->v.i_blocks = bi->bi_sectors; + inode->v.i_ino = bi->bi_inum; + inode->v.i_rdev = bi->bi_dev; + inode->v.i_generation = bi->bi_generation; + inode->v.i_size = bi->bi_size; + + inode->ei_journal_seq = 0; + inode->ei_quota_reserved = 0; + inode->ei_str_hash = bch2_hash_info_init(c, bi); + + bch2_inode_flags_to_vfs(inode); + + inode->v.i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->v.i_mode & S_IFMT) { + case S_IFREG: + inode->v.i_op = &bch_file_inode_operations; + inode->v.i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->v.i_op = &bch_dir_inode_operations; + inode->v.i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(&inode->v); + inode->v.i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); + inode->v.i_op = &bch_special_inode_operations; + break; + } +} + +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + struct bch_inode_info *inode; + + inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + pagecache_lock_init(&inode->ei_pagecache_lock); + mutex_init(&inode->ei_quota_lock); + inode->ei_journal_seq = 0; + + return &inode->v; +} + +static void bch2_i_callback(struct rcu_head *head) +{ + struct inode *vinode = container_of(head, struct inode, i_rcu); + struct bch_inode_info *inode = to_bch_ei(vinode); + + kmem_cache_free(bch2_inode_cache, inode); +} + +static void bch2_destroy_inode(struct inode *vinode) +{ + call_rcu(&vinode->i_rcu, bch2_i_callback); +} + +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); + bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); + bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); + + return 0; +} + +static int bch2_vfs_write_inode(struct inode *vinode, + struct writeback_control *wbc) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + int ret; + + mutex_lock(&inode->ei_update_lock); + ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + + if (c->opts.journal_flush_disabled) + return ret; + + if (!ret && wbc->sync_mode == WB_SYNC_ALL) + ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + + return 
ret; +} + +static void bch2_evict_inode(struct inode *vinode) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + + truncate_inode_pages_final(&inode->v.i_data); + + clear_inode(&inode->v); + + BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), + BCH_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + BCH_QUOTA_WARN); + bch2_inode_rm(c, inode->v.i_ino); + + WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0, + "nr_inodes < 0"); + } +} + +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct bch_fs *c = sb->s_fs_info; + u64 fsid; + + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; + buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> + PAGE_SECTOR_SHIFT; + buf->f_bavail = buf->f_bfree; + buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_ffree = U64_MAX; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = BCH_NAME_MAX; + + return 0; +} + +static int bch2_sync_fs(struct super_block *sb, int wait) +{ + struct bch_fs *c = sb->s_fs_info; + + if (!wait) { + bch2_journal_flush_async(&c->journal, NULL); + return 0; + } + + return bch2_journal_flush(&c->journal); +} + +static struct bch_fs *bch2_path_to_fs(const char *path) +{ + struct bch_fs *c; + dev_t dev; + int ret; + + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); + + c = bch2_dev_to_fs(dev); + return c ?: ERR_PTR(-ENOENT); +} + +static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, + unsigned nr_devs, struct bch_opts opts) +{ + struct bch_fs *c, *c1, *c2; + size_t i; + + if (!nr_devs) + return ERR_PTR(-EINVAL); + + c = bch2_fs_open(devs, nr_devs, opts); + + if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { + /* + * Already open? 
+ * Look up each block device, make sure they all belong to a + * filesystem and they all belong to the _same_ filesystem + */ + + c1 = bch2_path_to_fs(devs[0]); + if (!c1) + return c; + + for (i = 1; i < nr_devs; i++) { + c2 = bch2_path_to_fs(devs[i]); + if (!IS_ERR(c2)) + closure_put(&c2->cl); + + if (c1 != c2) { + closure_put(&c1->cl); + return c; + } + } + + c = c1; + } + + if (IS_ERR(c)) + return c; + + mutex_lock(&c->state_lock); + + if (!bch2_fs_running(c)) { + mutex_unlock(&c->state_lock); + closure_put(&c->cl); + pr_err("err mounting %s: incomplete filesystem", dev_name); + return ERR_PTR(-EINVAL); + } + + mutex_unlock(&c->state_lock); + + set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + return c; +} + +static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, + struct bch_opts opts) +{ + char *dev_name = NULL, **devs = NULL, *s; + struct bch_fs *c = ERR_PTR(-ENOMEM); + size_t i, nr_devs = 0; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + goto err; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); + if (!devs) + goto err; + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + + c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); +err: + kfree(devs); + kfree(dev_name); + return c; +} + +static int bch2_remount(struct super_block *sb, int *flags, char *data) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_opts opts = bch2_opts_empty(); + int ret; + + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(&opts, data); + if (ret) + return ret; + + if (opts.read_only != c->opts.read_only) { + const char *err = NULL; + + mutex_lock(&c->state_lock); + + if (opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + err = bch2_fs_read_write(c); + if (err) { + bch_err(c, "error going rw: %s", err); + return -EINVAL; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts.read_only; + + mutex_unlock(&c->state_lock); + } + + if (opts.errors >= 0) + c->opts.errors = opts.errors; + + return ret; +} + +static int bch2_show_options(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; + char buf[512]; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (opt->mode < OPT_MOUNT) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + bch2_opt_to_text(c, buf, sizeof(buf), opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); + seq_puts(seq, buf); + } + + return 0; + +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, + .show_options = bch2_show_options, + .remount_fs = bch2_remount, +#if 0 + .put_super = bch2_put_super, + .freeze_fs = bch2_freeze, + .unfreeze_fs = bch2_unfreeze, +#endif +}; + +static int bch2_test_super(struct super_block *s, void *data) +{ + return s->s_fs_info == data; +} + +static int bch2_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return 0; +} + +static struct dentry *bch2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct bch_fs *c; + struct bch_dev *ca; + struct super_block *sb; + struct inode 
*vinode; + struct bch_opts opts = bch2_opts_empty(); + unsigned i; + int ret; + + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(&opts, data); + if (ret) + return ERR_PTR(ret); + + c = bch2_open_as_blockdevs(dev_name, opts); + if (IS_ERR(c)) + return ERR_CAST(c); + + sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); + if (IS_ERR(sb)) { + closure_put(&c->cl); + return ERR_CAST(sb); + } + + BUG_ON(sb->s_fs_info != c); + + if (sb->s_root) { + closure_put(&c->cl); + + if ((flags ^ sb->s_flags) & SB_RDONLY) { + ret = -EBUSY; + goto err_put_super; + } + goto out; + } + + /* XXX: blocksize */ + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_export_op = &bch_export_ops; +#ifdef CONFIG_BCACHEFS_QUOTA + sb->s_qcop = &bch2_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; +#endif + sb->s_xattr = bch2_xattr_handlers; + sb->s_magic = BCACHEFS_STATFS_MAGIC; + sb->s_time_gran = c->sb.time_precision; + c->vfs_sb = sb; + strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); + + ret = super_setup_bdi(sb); + if (ret) + goto err_put_super; + + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + + for_each_online_member(ca, c, i) { + struct block_device *bdev = ca->disk_sb.bdev; + + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + percpu_ref_put(&ca->io_ref); + break; + } + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + if (c->opts.acl) + sb->s_flags |= SB_POSIXACL; +#endif + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); + if (IS_ERR(vinode)) { + ret = PTR_ERR(vinode); + goto err_put_super; + } + + sb->s_root = d_make_root(vinode); + if (!sb->s_root) { + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= SB_ACTIVE; +out: + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); + return ERR_PTR(ret); +} + +static void bch2_kill_sb(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + generic_shutdown_super(sb); + + if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) + bch2_fs_stop(c); + else + closure_put(&c->cl); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcachefs", + .mount = bch2_mount, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_ALIAS_FS("bcachefs"); + +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + if (bch2_inode_cache) + kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) +{ + int ret = -ENOMEM; + + bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); + if (!bch2_inode_cache) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch2_vfs_exit(); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 index 000000000000..e8dd566285fc --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H + +#include "opts.h" +#include "str_hash.h" +#include "quota_types.h" + +#include +#include + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +struct pagecache_lock { + atomic_long_t v; + wait_queue_head_t wait; +}; + +static inline void pagecache_lock_init(struct pagecache_lock *lock) +{ + atomic_long_set(&lock->v, 0); + 
init_waitqueue_head(&lock->wait); +} + +void bch2_pagecache_add_put(struct pagecache_lock *); +void bch2_pagecache_add_get(struct pagecache_lock *); +void bch2_pagecache_block_put(struct pagecache_lock *); +void bch2_pagecache_block_get(struct pagecache_lock *); + +struct bch_inode_info { + struct inode v; + + struct mutex ei_update_lock; + u64 ei_journal_seq; + u64 ei_quota_reserved; + unsigned long ei_last_dirtied; + struct pagecache_lock ei_pagecache_lock; + + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + + struct bch_hash_info ei_str_hash; + + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; +}; + +#define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +static inline struct bch_inode_info *file_bch_inode(struct file *file) +{ + return to_bch_ei(file_inode(file)); +} + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +struct bch_inode_unpacked; + +#ifndef NO_BCACHEFS_FS + +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode_trans(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + inode_set_fn, void *); +int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); +int __must_check bch2_write_inode(struct bch_fs *, + struct bch_inode_info *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); + +#else + +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 index 000000000000..eb01284a841f --- /dev/null +++ b/fs/bcachefs/fsck.c @@ -0,0 +1,1306 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "dirent.h" +#include "error.h" +#include "fs.h" +#include "fsck.h" +#include "inode.h" +#include "keylist.h" +#include "super.h" +#include "xattr.h" + +#include /* struct qstr */ +#include + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, + struct bkey_s_c_dirent dirent) +{ + struct qstr name; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + u64 dir_inum = dirent.k->p.inode; + int ret; + char *buf; + + name.len = bch2_dirent_name_bytes(dirent); + buf = kmalloc(name.len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memcpy(buf, dirent.v->d_name, name.len); + buf[name.len] = '\0'; + name.name = buf; + + /* Unlock iter so we don't deadlock, after copying name: */ + bch2_btree_iter_unlock(iter); + + ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); + if (ret) { + bch_err(c, "remove_dirent: err %i looking up directory inode", ret); + goto err; + } + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); + if (ret) + bch_err(c, "remove_dirent: err %i deleting dirent", ret); +err: + kfree(buf); + return ret; +} + +static int reattach_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + u64 inum) +{ + struct bch_hash_info lostfound_hash_info = + 
bch2_hash_info_init(c, lostfound_inode); + struct bkey_inode_buf packed; + char name_buf[20]; + struct qstr name; + int ret; + + snprintf(name_buf, sizeof(name_buf), "%llu", inum); + name = (struct qstr) QSTR(name_buf); + + lostfound_inode->bi_nlink++; + + bch2_inode_pack(&packed, lostfound_inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (ret) { + bch_err(c, "error %i reattaching inode %llu while updating lost+found", + ret, inum); + return ret; + } + + ret = bch2_dirent_create(c, lostfound_inode->bi_inum, + &lostfound_hash_info, + DT_DIR, &name, inum, NULL, + BTREE_INSERT_NOFAIL); + if (ret) { + bch_err(c, "error %i reattaching inode %llu while creating new dirent", + ret, inum); + return ret; + } + return ret; +} + +struct inode_walker { + bool first_this_inode; + bool have_inode; + u64 cur_inum; + struct bch_inode_unpacked inode; +}; + +static struct inode_walker inode_walker_init(void) +{ + return (struct inode_walker) { + .cur_inum = -1, + .have_inode = false, + }; +} + +static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) +{ + w->first_this_inode = inum != w->cur_inum; + w->cur_inum = inum; + + if (w->first_this_inode) { + int ret = bch2_inode_find_by_inum(c, inum, &w->inode); + + if (ret && ret != -ENOENT) + return ret; + + w->have_inode = !ret; + } + + return 0; +} + +struct hash_check { + struct bch_hash_info info; + struct btree_iter chain; + struct btree_iter iter; + u64 next; +}; + +static void hash_check_init(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c) +{ + bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0); + bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0); +} + +static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, + const struct bch_inode_unpacked *bi) +{ + h->info = bch2_hash_info_init(c, bi); + h->next = -1; +} + +static int hash_redo_key(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c, + struct btree_iter *k_iter, struct bkey_s_c k, + u64 hashed) +{ + struct bkey_i *tmp; + int ret = 0; + + tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + bkey_reassemble(tmp, k); + + ret = bch2_btree_delete_at(k_iter, 0); + if (ret) + goto err; + + bch2_btree_iter_unlock(k_iter); + + bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL, tmp, + BTREE_INSERT_NOFAIL| + BCH_HASH_SET_MUST_CREATE); +err: + kfree(tmp); + return ret; +} + +/* fsck hasn't been converted to new transactions yet: */ +static int fsck_hash_delete_at(const struct bch_hash_desc desc, + struct bch_hash_info *info, + struct btree_iter *orig_iter) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_btree_iter_unlock(orig_iter); + + bch2_trans_init(&trans, orig_iter->c); +retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_copy_iter(&trans, orig_iter); + if (IS_ERR(iter)) { + ret = PTR_ERR(iter); + goto err; + } + + ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: + bch2_trans_commit(&trans, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_exit(&trans); + return ret; +} + +static int hash_check_key(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + char buf[200]; + u64 hashed; + int ret = 0; + + if (k.k->type != desc.whiteout_type && + k.k->type != desc.key_type) + return 0; + + if (k.k->p.offset != h->next) { + if 
(!btree_iter_linked(&h->chain)) { + bch2_btree_iter_link(k_iter, &h->chain); + bch2_btree_iter_link(k_iter, &h->iter); + } + bch2_btree_iter_copy(&h->chain, k_iter); + } + h->next = k.k->p.offset + 1; + + if (k.k->type != desc.key_type) + return 0; + + hashed = desc.hash_bkey(&h->info, k); + + if (fsck_err_on(hashed < h->chain.pos.offset || + hashed > k.k->p.offset, c, + "hash table key at wrong offset: %llu, " + "hashed to %llu chain starts at %llu\n%s", + k.k->p.offset, hashed, h->chain.pos.offset, + (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), + buf, sizeof(buf), k), buf))) { + ret = hash_redo_key(desc, h, c, k_iter, k, hashed); + if (ret) { + bch_err(c, "hash_redo_key err %i", ret); + return ret; + } + return 1; + } + + if (!bkey_cmp(h->chain.pos, k_iter->pos)) + return 0; + + bch2_btree_iter_copy(&h->iter, &h->chain); + while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) { + struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter); + + if (fsck_err_on(k2.k->type == desc.key_type && + !desc.cmp_bkey(k, k2), c, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), + buf, sizeof(buf), k), buf))) { + ret = fsck_hash_delete_at(desc, &h->info, &h->iter); + if (ret) + return ret; + return 1; + } + bch2_btree_iter_next(&h->iter); + } +fsck_err: + return ret; +} + +/* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent + */ +noinline_for_stack +static int check_extents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct btree_iter iter; + struct bkey_s_c k; + u64 i_sectors; + int ret = 0; + + bch_verbose(c, "checking extents"); + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(BCACHEFS_ROOT_INO, 0), 0, k) { + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + if (fsck_err_on(!w.have_inode, c, + "extent type %u for missing inode %llu", + k.k->type, k.k->p.inode) || + fsck_err_on(w.have_inode && + !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, + "extent type %u for non regular file, inode %llu mode %o", + k.k->type, k.k->p.inode, w.inode.bi_mode)) { + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); + if (ret) + goto err; + continue; + } + + if (fsck_err_on(w.first_this_inode && + w.have_inode && + !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && + w.inode.bi_sectors != + (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), + c, "i_sectors wrong: got %llu, should be %llu", + w.inode.bi_sectors, i_sectors)) { + struct bkey_inode_buf p; + + w.inode.bi_sectors = i_sectors; + + bch2_btree_iter_unlock(&iter); + + bch2_inode_pack(&p, &w.inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &p.inode.k_i, + NULL, + NULL, + NULL, + BTREE_INSERT_NOFAIL); + if (ret) { + bch_err(c, "error in fs gc: error %i " + "updating inode", ret); + goto err; + } + + /* revalidate iterator: */ + k = bch2_btree_iter_peek(&iter); + } + + if (fsck_err_on(w.have_inode && + !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != BCH_RESERVATION && + k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, + round_up(w.inode.bi_size, PAGE_SIZE) >> 9, + NULL, NULL); + if (ret) + goto err; + continue; + } + } +err: +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * Walk 
dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +noinline_for_stack +static int check_dirents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct hash_check h; + struct btree_iter iter; + struct bkey_s_c k; + unsigned name_len; + char buf[200]; + int ret = 0; + + bch_verbose(c, "checking dirents"); + + hash_check_init(bch2_dirent_hash_desc, &h, c); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0, k) { + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + bool have_target; + u64 d_inum; + + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + if (fsck_err_on(!w.have_inode, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, + buf, sizeof(buf), k), buf)) || + fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, + "dirent in non directory inode type %u:\n%s", + mode_to_type(w.inode.bi_mode), + (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, + buf, sizeof(buf), k), buf))) { + ret = bch2_btree_delete_at(&iter, 0); + if (ret) + goto err; + continue; + } + + if (w.first_this_inode && w.have_inode) + hash_check_set_inode(&h, c, &w.inode); + + ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k); + if (ret > 0) { + ret = 0; + continue; + } + + if (ret) + goto fsck_err; + + if (k.k->type != BCH_DIRENT) + continue; + + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + + name_len = bch2_dirent_name_bytes(d); + + if (fsck_err_on(!name_len, c, "empty dirent") || + fsck_err_on(name_len == 1 && + !memcmp(d.v->d_name, ".", 1), c, + ". dirent") || + fsck_err_on(name_len == 2 && + !memcmp(d.v->d_name, "..", 2), c, + ".. dirent")) { + ret = remove_dirent(c, &iter, d); + if (ret) + goto err; + continue; + } + + if (fsck_err_on(d_inum == d.k->p.inode, c, + "dirent points to own directory:\n%s", + (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, + buf, sizeof(buf), k), buf))) { + ret = remove_dirent(c, &iter, d); + if (ret) + goto err; + continue; + } + + ret = bch2_inode_find_by_inum(c, d_inum, &target); + if (ret && ret != -ENOENT) + break; + + have_target = !ret; + ret = 0; + + if (fsck_err_on(!have_target, c, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, + buf, sizeof(buf), k), buf))) { + ret = remove_dirent(c, &iter, d); + if (ret) + goto err; + continue; + } + + if (fsck_err_on(have_target && + d.v->d_type != + mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), + (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, + buf, sizeof(buf), k), buf))) { + struct bkey_i_dirent *n; + + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto err; + } + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target.bi_mode); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &n->k_i)); + kfree(n); + if (ret) + goto err; + + } + } +err: +fsck_err: + bch2_btree_iter_unlock(&h.chain); + bch2_btree_iter_unlock(&h.iter); + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * Walk xattrs: verify that they all have a corresponding inode + */ +noinline_for_stack +static int check_xattrs(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct hash_check h; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking xattrs"); + + 
hash_check_init(bch2_xattr_hash_desc, &h, c); + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, + POS(BCACHEFS_ROOT_INO, 0), 0, k) { + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + if (fsck_err_on(!w.have_inode, c, + "xattr for missing inode %llu", + k.k->p.inode)) { + ret = bch2_btree_delete_at(&iter, 0); + if (ret) + goto err; + continue; + } + + if (w.first_this_inode && w.have_inode) + hash_check_set_inode(&h, c, &w.inode); + + ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k); + if (ret) + goto fsck_err; + } +err: +fsck_err: + bch2_btree_iter_unlock(&h.chain); + bch2_btree_iter_unlock(&h.iter); + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* Get root directory, create if it doesn't exist: */ +static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +{ + struct bkey_inode_buf packed; + int ret; + + bch_verbose(c, "checking root directory"); + + ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, "root directory missing")) + goto create_root; + + if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, + "root inode not a directory")) + goto create_root; + + return 0; +fsck_err: + return ret; +create_root: + bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + 0, NULL); + root_inode->bi_inum = BCACHEFS_ROOT_INO; + + bch2_inode_pack(&packed, root_inode); + + return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, BTREE_INSERT_NOFAIL); +} + +/* Get lost+found, create if it doesn't exist: */ +static int check_lostfound(struct bch_fs *c, + struct bch_inode_unpacked *root_inode, + struct bch_inode_unpacked *lostfound_inode) +{ + struct qstr lostfound = QSTR("lost+found"); + struct bch_hash_info root_hash_info = + bch2_hash_info_init(c, root_inode); + struct bkey_inode_buf packed; + u64 inum; + int ret; + + bch_verbose(c, "checking lost+found"); + + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, + &lostfound); + if (!inum) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, "lost+found missing")) + goto create_lostfound; + + if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, + "lost+found inode not a directory")) + goto create_lostfound; + + return 0; +fsck_err: + return ret; +create_lostfound: + root_inode->bi_nlink++; + + bch2_inode_pack(&packed, root_inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + if (ret) + return ret; + + bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + 0, root_inode); + + ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) + return ret; + + ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, + &lostfound, lostfound_inode->bi_inum, NULL, + BTREE_INSERT_NOFAIL); + if (ret) + return ret; + + return 0; +} + +struct inode_bitmap { + unsigned long *bits; + size_t size; +}; + +static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +{ + return nr < b->size ? 
test_bit(nr, b->bits) : false; +} + +static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +{ + if (nr >= b->size) { + size_t new_size = max_t(size_t, max_t(size_t, + PAGE_SIZE * 8, + b->size * 2), + nr + 1); + void *n; + + new_size = roundup_pow_of_two(new_size); + n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); + if (!n) { + return -ENOMEM; + } + + b->bits = n; + b->size = new_size; + } + + __set_bit(nr, b->bits); + return 0; +} + +struct pathbuf { + size_t nr; + size_t size; + + struct pathbuf_entry { + u64 inum; + u64 offset; + } *entries; +}; + +static int path_down(struct pathbuf *p, u64 inum) +{ + if (p->nr == p->size) { + size_t new_size = max_t(size_t, 256UL, p->size * 2); + void *n = krealloc(p->entries, + new_size * sizeof(p->entries[0]), + GFP_KERNEL); + if (!n) + return -ENOMEM; + + p->entries = n; + p->size = new_size; + }; + + p->entries[p->nr++] = (struct pathbuf_entry) { + .inum = inum, + .offset = 0, + }; + return 0; +} + +noinline_for_stack +static int check_directory_structure(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) +{ + struct inode_bitmap dirs_done = { NULL, 0 }; + struct pathbuf path = { 0, 0, NULL }; + struct pathbuf_entry *e; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + bool had_unreachable; + u64 d_inum; + int ret = 0; + + bch_verbose(c, "checking directory structure"); + + /* DFS: */ +restart_dfs: + had_unreachable = false; + + ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); + if (ret) { + bch_err(c, "memory allocation failure in inode_bitmap_set()"); + goto err; + } + + ret = path_down(&path, BCACHEFS_ROOT_INO); + if (ret) { + return ret; + } + + while (path.nr) { +next: + e = &path.entries[path.nr - 1]; + + if (e->offset == U64_MAX) + goto up; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(e->inum, e->offset + 1), 0, k) { + if (k.k->p.inode != e->inum) + break; + + e->offset = k.k->p.offset; + + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + if (dirent.v->d_type != DT_DIR) + continue; + + d_inum = le64_to_cpu(dirent.v->d_inum); + + if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, + "directory %llu has multiple hardlinks", + d_inum)) { + ret = remove_dirent(c, &iter, dirent); + if (ret) + goto err; + continue; + } + + ret = inode_bitmap_set(&dirs_done, d_inum); + if (ret) { + bch_err(c, "memory allocation failure in inode_bitmap_set()"); + goto err; + } + + ret = path_down(&path, d_inum); + if (ret) { + goto err; + } + + bch2_btree_iter_unlock(&iter); + goto next; + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) { + bch_err(c, "btree error %i in fsck", ret); + goto err; + } +up: + path.nr--; + } + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { + if (k.k->type != BCH_INODE_FS) + continue; + + if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) + continue; + + if (!bch2_empty_dir(c, k.k->p.inode)) + continue; + + if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, + "unreachable directory found (inum %llu)", + k.k->p.inode)) { + bch2_btree_iter_unlock(&iter); + + ret = reattach_inode(c, lostfound_inode, k.k->p.inode); + if (ret) { + goto err; + } + + had_unreachable = true; + } + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + goto err; + + if (had_unreachable) { + bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); + kfree(dirs_done.bits); + kfree(path.entries); + memset(&dirs_done, 0, sizeof(dirs_done)); + memset(&path, 0, 
sizeof(path)); + goto restart_dfs; + } + +out: + kfree(dirs_done.bits); + kfree(path.entries); + return ret; +err: +fsck_err: + ret = bch2_btree_iter_unlock(&iter) ?: ret; + goto out; +} + +struct nlink { + u32 count; + u32 dir_count; +}; + +typedef GENRADIX(struct nlink) nlink_table; + +static void inc_link(struct bch_fs *c, nlink_table *links, + u64 range_start, u64 *range_end, + u64 inum, bool dir) +{ + struct nlink *link; + + if (inum < range_start || inum >= *range_end) + return; + + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); + if (!link) { + bch_verbose(c, "allocation failed during fs gc - will need another pass"); + *range_end = inum; + return; + } + + if (dir) + link->dir_count++; + else + link->count++; +} + +noinline_for_stack +static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + u64 range_start, u64 *range_end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 d_inum; + int ret; + + inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + + if (d.v->d_type == DT_DIR) + inc_link(c, links, range_start, range_end, + d.k->p.inode, true); + + inc_link(c, links, range_start, range_end, + d_inum, false); + + break; + } + + bch2_btree_iter_cond_resched(&iter); + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); + + return ret; +} + +s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 sectors = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { + if (k.k->p.inode != inum) + break; + + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + } + + return bch2_btree_iter_unlock(&iter) ?: sectors; +} + +static int check_inode_nlink(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct bch_inode_unpacked *u, + struct nlink *link, + bool *do_update) +{ + u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED + ? 
0 + : u->bi_nlink + nlink_bias(u->bi_mode); + u32 real_i_nlink = + link->count * nlink_bias(u->bi_mode) + + link->dir_count; + int ret = 0; + + /* + * These should have been caught/fixed by earlier passes, we don't + * repair them here: + */ + if (S_ISDIR(u->bi_mode) && link->count > 1) { + need_fsck_err(c, "directory %llu with multiple hardlinks: %u", + u->bi_inum, link->count); + return 0; + } + + if (S_ISDIR(u->bi_mode) && !link->count) { + need_fsck_err(c, "unreachable directory found (inum %llu)", + u->bi_inum); + return 0; + } + + if (!S_ISDIR(u->bi_mode) && link->dir_count) { + need_fsck_err(c, "non directory with subdirectories", + u->bi_inum); + return 0; + } + + if (i_nlink < link->count) { + if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", + u->bi_inum, i_nlink, link->count, + mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (i_nlink != real_i_nlink && + c->sb.clean) { + if (fsck_err(c, "filesystem marked clean, " + "but inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (real_i_nlink && i_nlink != real_i_nlink) + bch_verbose(c, "setting inode %llu nlink from %u to %u", + u->bi_inum, i_nlink, real_i_nlink); +set_i_nlink: + if (i_nlink != real_i_nlink) { + if (real_i_nlink) { + u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); + u->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + u->bi_nlink = 0; + u->bi_flags |= BCH_INODE_UNLINKED; + } + + *do_update = true; + } +fsck_err: + return ret; +} + +static int check_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, + struct nlink *link) +{ + struct bch_inode_unpacked u; + bool do_update = false; + int ret = 0; + + ret = bch2_inode_unpack(inode, &u); + if (bch2_fs_inconsistent_on(ret, c, + "error unpacking inode %llu in fsck", + inode.k->p.inode)) + return ret; + + if (link) { + ret = check_inode_nlink(c, lostfound_inode, &u, link, + &do_update); + if (ret) + return ret; + } + + if (u.bi_flags & BCH_INODE_UNLINKED) { + bch_verbose(c, "deleting inode %llu", u.bi_inum); + + ret = bch2_inode_rm(c, u.bi_inum); + if (ret) + bch_err(c, "error in fs gc: error %i " + "while deleting inode", ret); + return ret; + } + + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu has i_size dirty", + u.bi_inum); + + bch_verbose(c, "truncating inode %llu", u.bi_inum); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + + ret = bch2_inode_truncate(c, u.bi_inum, + round_up(u.bi_size, PAGE_SIZE) >> 9, + NULL, NULL); + if (ret) { + bch_err(c, "error in fs gc: error %i " + "truncating inode", ret); + return ret; + } + + /* + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ + u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + + u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) { + s64 sectors; + + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu has i_sectors dirty", + u.bi_inum); + + bch_verbose(c, "recounting sectors for inode %llu", + u.bi_inum); + + sectors = bch2_count_inode_sectors(c, u.bi_inum); + if (sectors < 0) { + bch_err(c, "error in fs gc: error %i " + "recounting inode sectors", + (int) 
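/*
 * Units for the repairs in this function: bi_size is in bytes, while
 * bch2_inode_truncate() and bi_sectors are in 512-byte sectors.  E.g. with
 * 4KiB pages, an i_size-dirty inode with bi_size = 10000 is truncated at
 * round_up(10000, 4096) >> 9 = 12288 >> 9 = 24 sectors; the
 * I_SECTORS_DIRTY repair in this block then rebuilds bi_sectors by summing
 * k.k->size over the inode's remaining allocated extents
 * (see bch2_count_inode_sectors()).
 */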
sectors); + return sectors; + } + + u.bi_sectors = sectors; + u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + do_update = true; + } + + if (do_update) { + struct bkey_inode_buf p; + + bch2_inode_pack(&p, &u); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + if (ret && ret != -EINTR) + bch_err(c, "error in fs gc: error %i " + "updating inode", ret); + } +fsck_err: + return ret; +} + +noinline_for_stack +static int bch2_gc_walk_inodes(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct nlink *link, zero_links = { 0, 0 }; + struct genradix_iter nlinks_iter; + int ret = 0, ret2 = 0; + u64 nlinks_pos; + + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); + nlinks_iter = genradix_iter_init(links, 0); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !btree_iter_err(k)) { +peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + + if (!link && (!k.k || iter.pos.inode >= range_end)) + break; + + nlinks_pos = range_start + nlinks_iter.pos; + if (iter.pos.inode > nlinks_pos) { + /* Should have been caught by dirents pass: */ + need_fsck_err_on(link && link->count, c, + "missing inode %llu (nlink %u)", + nlinks_pos, link->count); + genradix_iter_advance(&nlinks_iter, links); + goto peek_nlinks; + } + + if (iter.pos.inode < nlinks_pos || !link) + link = &zero_links; + + if (k.k && k.k->type == BCH_INODE_FS) { + /* + * Avoid potential deadlocks with iter for + * truncate/rm/etc.: + */ + bch2_btree_iter_unlock(&iter); + + ret = check_inode(c, lostfound_inode, &iter, + bkey_s_c_to_inode(k), link); + BUG_ON(ret == -EINTR); + if (ret) + break; + + if (link->count) + atomic_long_inc(&c->nr_inodes); + } else { + /* Should have been caught by dirents pass: */ + need_fsck_err_on(link->count, c, + "missing inode %llu (nlink %u)", + nlinks_pos, link->count); + } + + if (nlinks_pos == iter.pos.inode) + genradix_iter_advance(&nlinks_iter, links); + + bch2_btree_iter_next(&iter); + bch2_btree_iter_cond_resched(&iter); + } +fsck_err: + ret2 = bch2_btree_iter_unlock(&iter); + if (ret2) + bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); + + return ret ?: ret2; +} + +noinline_for_stack +static int check_inode_nlinks(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) +{ + nlink_table links; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + + bch_verbose(c, "checking inode nlinks"); + + genradix_init(&links); + + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; + + ret = bch2_gc_walk_dirents(c, &links, + this_iter_range_start, + &next_iter_range_start); + if (ret) + break; + + ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + genradix_free(&links); + } while (next_iter_range_start != U64_MAX); + + genradix_free(&links); + + return ret; +} + +noinline_for_stack +static int check_inodes_fast(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + unsigned long nr_inodes = 0; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { + if (k.k->type != BCH_INODE_FS) + continue; + + inode = bkey_s_c_to_inode(k); + + if (!(inode.v->bi_flags & BCH_INODE_UNLINKED)) + nr_inodes++; + + if (inode.v->bi_flags & + (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + 
BCH_INODE_UNLINKED)) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean but found inode %llu with flags %x", + inode.k->p.inode, inode.v->bi_flags); + ret = check_inode(c, NULL, &iter, inode, NULL); + BUG_ON(ret == -EINTR); + if (ret) + break; + } + } + atomic_long_set(&c->nr_inodes, nr_inodes); +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * Checks for inconsistencies that shouldn't happen, unless we have a bug. + * Doesn't fix them yet, mainly because they haven't yet been observed: + */ +static int bch2_fsck_full(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + int ret; + + bch_verbose(c, "starting fsck:"); + ret = check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_directory_structure(c, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); + + bch2_flush_fsck_errs(c); + bch_verbose(c, "fsck done"); + + return ret; +} + +static int bch2_fsck_inode_nlink(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + int ret; + + bch_verbose(c, "checking inode link counts:"); + ret = check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); + + bch2_flush_fsck_errs(c); + bch_verbose(c, "done"); + + return ret; +} + +static int bch2_fsck_walk_inodes_only(struct bch_fs *c) +{ + int ret; + + bch_verbose(c, "walking inodes:"); + ret = check_inodes_fast(c); + + bch2_flush_fsck_errs(c); + bch_verbose(c, "done"); + + return ret; +} + +int bch2_fsck(struct bch_fs *c) +{ + if (!c->opts.nofsck) + return bch2_fsck_full(c); + + if (!c->sb.clean) + return bch2_fsck_inode_nlink(c); + + return bch2_fsck_walk_inodes_only(c); +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 index 000000000000..88da06762d7d --- /dev/null +++ b/fs/bcachefs/fsck.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H + +s64 bch2_count_inode_sectors(struct bch_fs *, u64); +int bch2_fsck(struct bch_fs *); + +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 index 000000000000..2d635555bffb --- /dev/null +++ b/fs/bcachefs/inode.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "io.h" +#include "keylist.h" + +#include + +#include + +#define FIELD_BYTES() \ + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; +static const u8 bits_table[8] = { + 1 * 8 - 1, + 2 * 8 - 2, + 3 * 8 - 3, + 4 * 8 - 4, + 6 * 8 - 5, + 8 * 8 - 6, + 10 * 8 - 7, + 13 * 8 - 8, +}; + +static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) +{ + __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; + unsigned shift, bytes, bits = likely(!hi) + ? 
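/*
 * Worked example of this variable-length field encoding: the value 1000
 * needs fls64(1000) = 10 bits, and the smallest shift whose capacity
 * bits_table[shift - 1] exceeds that is shift = 2 (14 bits, 2 bytes).
 * Big-endian 1000 is 0x03 0xe8, and the length tag (1 << 8) >> 2 = 0x40
 * is or'd into the high byte:
 *
 *	encoded bytes: 0x43 0xe8
 *
 * inode_decode_field() recovers the length from the position of the
 * highest set bit: __fls(0x43) = 6, so shift = 8 - 6 = 2 and two bytes are
 * read; xoring the 0x40 tag back off yields 0x03e8 = 1000.
 */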
fls64(lo) + : fls64(hi) + 64; + + for (shift = 1; shift <= 8; shift++) + if (bits < bits_table[shift - 1]) + goto got_shift; + + BUG(); +got_shift: + bytes = byte_table[shift - 1]; + + BUG_ON(out + bytes > end); + + memcpy(out, (u8 *) in + 16 - bytes, bytes); + *out |= (1 << 8) >> shift; + + return bytes; +} + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +{ + __be64 be[2] = { 0, 0 }; + unsigned bytes, shift; + u8 *p; + + if (in >= end) + return -1; + + if (!*in) + return -1; + + /* + * position of highest set bit indicates number of bytes: + * shift = number of bits to remove in high byte: + */ + shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ + bytes = byte_table[shift - 1]; + + if (in + bytes > end) + return -1; + + p = (u8 *) be + 16 - bytes; + memcpy(p, in, bytes); + *p ^= (1 << 8) >> shift; + + out[0] = be64_to_cpu(be[0]); + out[1] = be64_to_cpu(be[1]); + *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); + + return bytes; +} + +void bch2_inode_pack(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + u8 *out = packed->inode.v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.inode = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + out += inode_encode_field(out, end, 0, inode->_name); \ + nr_fields++; \ + \ + if (inode->_name) { \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); + memset(out, 0, + (u8 *) &packed->inode.v + + bkey_val_bytes(&packed->inode.k) - out); + + SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; + + int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), + &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_mode != inode->bi_mode); + +#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + } +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + + unpacked->bi_inum = inode.k->p.inode; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ + memset((void *) unpacked + offset, 0, \ + sizeof(*unpacked) - offset); \ + return 0; \ + } \ + \ + ret = inode_decode_field(in, end, field, &field_bits); \ + if (ret < 0) \ + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ + return -1; \ + \ + unpacked->_name = field[1]; \ + in += ret; + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + /* XXX: signal if there were more fields than expected? 
*/ + + return 0; +} + +const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.offset) + return "nonzero offset"; + + switch (k.k->type) { + case BCH_INODE_FS: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + + if (k.k->p.inode < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; + + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + + if (bch2_inode_unpack(inode, &unpacked)) + return "invalid variable length fields"; + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + + return NULL; + } + case BCH_INODE_BLOCKDEV: + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) + return "incorrect value size"; + + if (k.k->p.inode >= BLOCKDEV_INODE_MAX) + return "blockdev inode in fs range"; + + return NULL; + case BCH_INODE_GENERATION: + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) + return "incorrect value size"; + + return NULL; + default: + return "invalid type"; + } +} + +void bch2_inode_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = out + size; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked unpacked; + + switch (k.k->type) { + case BCH_INODE_FS: + inode = bkey_s_c_to_inode(k); + if (bch2_inode_unpack(inode, &unpacked)) { + out += scnprintf(out, end - out, "(unpack error)"); + break; + } + +#define BCH_INODE_FIELD(_name, _bits) \ + out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name); + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + break; + } +} + +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + s64 now = bch2_current_time(c); + + memset(inode_u, 0, sizeof(*inode_u)); + + /* ick */ + inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; + get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); + + inode_u->bi_mode = mode; + inode_u->bi_uid = uid; + inode_u->bi_gid = gid; + inode_u->bi_dev = rdev; + inode_u->bi_atime = now; + inode_u->bi_mtime = now; + inode_u->bi_ctime = now; + inode_u->bi_otime = now; + + if (parent) { +#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name; + BCH_INODE_FIELDS_INHERIT() +#undef BCH_INODE_FIELD + } +} + +static inline u32 bkey_generation(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_INODE_BLOCKDEV: + case BCH_INODE_FS: + BUG(); + case BCH_INODE_GENERATION: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + default: + return 0; + } +} + +int __bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) +{ + struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; + struct btree_iter *iter; + u64 start; + int ret; + + if (!max) + max = ULLONG_MAX; + + if (c->opts.inodes_32bit) + max = min_t(u64, max, U32_MAX); + + start = READ_ONCE(*hint); + + if (start >= max || start < min) + start = min; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + iter = 
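/*
 * Allocation strategy, in brief: scan inode slots starting at *hint (a
 * cursor that persists across allocations) and take the first slot not
 * occupied by a live inode.  If the free slot holds a BCH_INODE_GENERATION
 * key left behind by bch2_inode_rm(), its generation is carried into the
 * new inode so stale references to the old inode number can be detected.
 * If the scan reaches max it retries once from min before returning
 * -ENOSPC; e.g. with hint = 1000 and max = 4096 the search covers
 * [1000, 4096) and then restarts from min.
 */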
bch2_trans_get_iter(trans, + BTREE_ID_INODES, POS(start, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); +again: + while (1) { + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + ret = btree_iter_err(k); + if (ret) + return ret; + + switch (k.k->type) { + case BCH_INODE_BLOCKDEV: + case BCH_INODE_FS: + /* slot used */ + if (iter->pos.inode >= max) + goto out; + + bch2_btree_iter_next_slot(iter); + break; + + default: + *hint = k.k->p.inode; + inode_u->bi_inum = k.k->p.inode; + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; + } + } +out: + if (start != min) { + /* Retry from start */ + start = min; + bch2_btree_iter_set_pos(iter, POS(start, 0)); + goto again; + } + + return -ENOSPC; +} + +int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) +{ + return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + __bch2_inode_create(&trans, inode_u, min, max, hint)); +} + +int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, + struct extent_insert_hook *hook, u64 *journal_seq) +{ + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(inode_nr, new_size), + POS(inode_nr + 1, 0), + ZERO_VERSION, NULL, hook, + journal_seq); +} + +int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +{ + struct btree_iter iter; + struct bkey_i_inode_generation delete; + int ret; + + ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL); + if (ret < 0) + return ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + POS(inode_nr, 0), + POS(inode_nr + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret < 0) + return ret; + + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should + * delete: + * + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS(inode_nr, 0), + POS(inode_nr + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret < 0) + return ret; + + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + do { + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + u32 bi_generation = 0; + + ret = btree_iter_err(k); + if (ret) { + bch2_btree_iter_unlock(&iter); + return ret; + } + + bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c, + "inode %llu not found when deleting", + inode_nr); + + switch (k.k->type) { + case BCH_INODE_FS: { + struct bch_inode_unpacked inode_u; + + if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) + bi_generation = inode_u.bi_generation + 1; + break; + } + case BCH_INODE_GENERATION: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + bi_generation = le32_to_cpu(g.v->bi_generation); + break; + } + } + + if (!bi_generation) { + bkey_init(&delete.k); + delete.k.p.inode = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); + delete.k.p.inode = inode_nr; + delete.v.bi_generation = cpu_to_le32(bi_generation); + } + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &delete.k_i)); + } while (ret == -EINTR); + + bch2_btree_iter_unlock(&iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = -ENOENT; + 
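/*
 * The lookup below peeks the slot at POS(inode_nr, 0): a BCH_INODE_FS key
 * there is unpacked into *inode, a hole means the inode doesn't exist.  A
 * minimal (hypothetical) caller:
 *
 *	struct bch_inode_unpacked u;
 *
 *	if (!bch2_inode_find_by_inum(c, inum, &u))
 *		pr_info("inode %llu: i_size %llu\n", u.bi_inum, u.bi_size);
 *	else
 *		pr_info("inode %llu not found\n", inum);
 */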
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, + POS(inode_nr, 0), + BTREE_ITER_SLOTS, k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + break; + default: + /* hole, not found */ + break; + } + + break; + + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_inode_pack_test(void) +{ + struct bch_inode_unpacked *u, test_inodes[] = { + { + .bi_atime = U64_MAX, + .bi_ctime = U64_MAX, + .bi_mtime = U64_MAX, + .bi_otime = U64_MAX, + .bi_size = U64_MAX, + .bi_sectors = U64_MAX, + .bi_uid = U32_MAX, + .bi_gid = U32_MAX, + .bi_nlink = U32_MAX, + .bi_generation = U32_MAX, + .bi_dev = U32_MAX, + }, + }; + + for (u = test_inodes; + u < test_inodes + ARRAY_SIZE(test_inodes); + u++) { + struct bkey_inode_buf p; + + bch2_inode_pack(&p, u); + } +} +#endif diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 index 000000000000..bd6166c40e6f --- /dev/null +++ b/fs/bcachefs/inode.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + +#include "opts.h" + +#include + +const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_inode_ops (struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ +} + +struct bch_inode_unpacked { + u64 bi_inum; + __le64 bi_hash_seed; + u32 bi_flags; + u16 bi_mode; + +#define BCH_INODE_FIELD(_name, _bits) u##_bits _name; + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD +}; + +struct bkey_inode_buf { + struct bkey_i_inode inode; + +#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8 + u8 _pad[0 + BCH_INODE_FIELDS()]; +#undef BCH_INODE_FIELD +} __attribute__((packed, aligned(8))); + +void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + +void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +int __bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, + u64, u64, u64 *); +int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, + u64, u64, u64 *); + +int bch2_inode_truncate(struct bch_fs *, u64, u64, + struct extent_insert_hook *, u64 *); +int bch2_inode_rm(struct bch_fs *, u64); + +int bch2_inode_find_by_inum(struct bch_fs *, u64, + struct bch_inode_unpacked *); + +static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) +{ + struct bch_io_opts ret = { 0 }; + +#define BCH_INODE_OPT(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + return ret; +} + +static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum bch_opt_id id, u64 v) +{ + switch (id) { +#define BCH_INODE_OPT(_name, ...) 
\ + case Opt_##_name: \ + inode->bi_##_name = v; \ + break; + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + default: + BUG(); + } +} + +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum bch_opt_id id, u64 v) +{ + return __bch2_inode_opt_set(inode, id, v + 1); +} + +static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode, + enum bch_opt_id id) +{ + return __bch2_inode_opt_set(inode, id, 0); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_inode_pack_test(void); +#else +static inline void bch2_inode_pack_test(void) {} +#endif + +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 index 000000000000..d1935ef1d6c3 --- /dev/null +++ b/fs/bcachefs/io.c @@ -0,0 +1,1875 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "compress.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "rebalance.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" + +#include +#include + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target); + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; + u64 now = local_clock(); + u64 io_latency = time_after64(now, submit_time) + ? 
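/*
 * The running latency average below is an EWMA with roughly a 1/32 weight
 * per accepted sample (ewma_add(..., 5)), and most samples are dropped
 * cheaply: the cmpxchg loop is only entered when the new sample differs
 * from the current average by more than half of it, or when the low 5 bits
 * of the clock happen to be zero (about 1 call in 32).  E.g. with
 * old = 1000 and io_latency = 1100 (arbitrary time units), the difference
 * of 100 is below old >> 1 = 500, so the update is usually skipped.
 */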
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0 << 5)) + break; + + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, + bool *using_mempool) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; + + if (likely(!*using_mempool)) { + bv->bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bv->bv_page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t bytes) +{ + bool using_mempool = false; + + BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + + bio->bi_iter.bi_size = bytes; + + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) + bch2_bio_alloc_page_pool(c, bio, &using_mempool); + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, + size_t bytes) +{ + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + + bv->bv_page = alloc_page(GFP_NOIO); + if (!bv->bv_page) { + /* + * We already allocated from mempool, we can't allocate from it again + * without freeing the pages we already allocated or else we could + * deadlock: + */ + bch2_bio_free_pages_pool(c, bio); + bch2_bio_alloc_pages_pool(c, bio, bytes); + return; + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; + bio->bi_vcnt++; + } + + bio->bi_iter.bi_size = bytes; +} + +/* Writes */ + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + extent_for_each_ptr(e, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr + 1 < &extent_entry_last(e)->ptr) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOIO, &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->parent = wbio; + n->split = true; + n->bounce = false; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + bio_inc_remaining(&wbio->bio); + } else { + n = wbio; + n->split = false; + } + + n->c = c; + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->submit_time = local_clock(); + 
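/*
 * At this point n is either a clone of the caller's bio (every pointer but
 * the last one; bio_inc_remaining() keeps the parent alive) or the
 * original wbio itself (the last pointer).  E.g. an extent replicated to
 * devices 0 and 2 submits one clone to device 0 and the original to
 * device 2, and the caller's end_io only runs once both have completed.
 */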
n->bio.bi_iter.bi_sector = ptr->offset; + + if (!journal_flushes_device(ca)) + n->bio.bi_opf |= REQ_FUA; + + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + + bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_BTREE && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + + submit_bio(&n->bio); + } else { + n->bio.bi_status = BLK_STS_REMOVED; + bio_endio(&n->bio); + } + } +} + +static void __bch2_write(struct closure *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + if (!op->error && (op->flags & BCH_WRITE_FLUSH)) + op->error = bch2_journal_error(&c->journal); + + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + + closure_return(cl); +} + +int bch2_write_index_default(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct btree_iter iter; + int ret; + + bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + + ret = bch2_btree_insert_list_at(&iter, keys, &op->res, + NULL, op_journal_seq(op), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); + bch2_btree_iter_unlock(&iter); + + return ret; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n, *k; + int ret; + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + bkey_copy(dst, src); + + e = bkey_i_to_s_extent(dst); + extent_for_each_ptr_backwards(e, ptr) + if (test_bit(ptr->dev, op->failed.d)) + bch2_extent_drop_ptr(e, ptr); + + if (!bch2_extent_nr_ptrs(e.c)) { + ret = -EIO; + goto err; + } + + if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c); + if (ret) + goto err; + } + + dst = bkey_next(dst); + } + + keys->top = dst; + + /* + * probably not the ideal place to hook this in, but I don't + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ + for_each_keylist_key(keys, k) + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + int ret = op->index_update_fn(op); + + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret) { + __bcache_io_error(c, "btree IO error %i", ret); + op->error = ret; + } + } +out: + bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + goto out; +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + __bch2_write_index(op); + + if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + bch2_journal_flush_seq_async(&c->journal, + *op_journal_seq(op), + cl); + continue_at(cl, bch2_write_done, index_update_wq(op)); + } else { + continue_at_nobarrier(cl, bch2_write_done, NULL); + } +} + +static void bch2_write_endio(struct bio *bio) 
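/*
 * bch2_write_endio() records failing devices in op->failed, and
 * __bch2_write_index() above drops the corresponding pointers before the
 * key is inserted.  E.g. a 2-replica write where one device errors still
 * indexes the extent with the single surviving pointer (a degraded write);
 * only if every pointer failed does the write return -EIO.
 */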
+{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + set_bit(wbio->dev, op->failed.d); + + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); + percpu_ref_put(&ca->io_ref); + } + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (parent) + bio_endio(&parent->bio); + else + closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + + op->pos.offset += crc.uncompressed_size; + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); + + bch2_extent_crc_append(e, crc); + bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); + + bch2_keylist_push(&op->insert_keys); +} + +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed) +{ + struct bch_write_bio *wbio; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOIO, &c->bio_write); + wbio = wbio_init(bio); + wbio->bounce = true; + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + while (bio->bi_iter.bi_size < output_available) { + unsigned len = min_t(unsigned, PAGE_SIZE, + output_available - bio->bi_iter.bi_size); + struct page *p; + + p = alloc_page(GFP_NOIO); + if (!p) { + unsigned pool_max = + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9); + + if (bio_sectors(bio) < pool_max) + bch2_bio_alloc_pages_pool(c, bio, pool_max); + break; + } + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = p, + .bv_len = len, + .bv_offset = 0, + }; + bio->bi_iter.bi_size += len; + } + + *page_alloc_failed = bio->bi_vcnt < pages; + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; + + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to 
verify the existing checksum (poly1305 mac, in this case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + + bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return 0; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && + op->crc.compression_type == op->compression_type) { + if (!op->crc.compression_type && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; + } + + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (op->crc.compression_type) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return PREP_ENCODED_CHECKSUM_ERR; + + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } + + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ + + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_type || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_OK; +} + +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + unsigned total_output = 0; + bool bounce = false, page_alloc_failed = false; + int ret, more = 0; + + BUG_ON(!bio_sectors(src)); + + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } + + if (op->compression_type || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + bounce = true; + } + + saved_iter = dst->bi_iter; + + do { + 
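/*
 * Each iteration of the loop below emits one extent of at most
 * wp->sectors_free sectors and, when checksumming, at most
 * c->sb.encoded_extent_max, so large writes become a chain of
 * independently checksummed keys.  E.g. assuming a 128KiB
 * encoded_extent_max, a 1MiB checksummed write produces at least eight
 * extents, each with its own crc entry.
 */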
struct bch_extent_crc_unpacked crc = + (struct bch_extent_crc_unpacked) { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + + if (page_alloc_failed && + bio_sectors(dst) < wp->sectors_free && + bio_sectors(dst) < c->sb.encoded_extent_max) + break; + + BUG_ON(op->compression_type && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_type && !bounce); + + crc.compression_type = op->compression_type + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) + : 0; + if (!crc.compression_type) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->sb.encoded_extent_max << 9); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } + + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version) + 1; + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc.compression_type && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (!bounce && more) { + dst = bio_split(src, total_output >> 9, + GFP_NOIO, &c->bio_write); + wbio_init(dst)->put_bio = true; + } + + dst->bi_iter.bi_size = total_output; + + /* Free unneeded pages after compressing: */ + if (bounce) + while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) + mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, + &c->bio_bounce_pages); +do_write: + /* might have done a realloc... 
*/ + + key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + + dst->bi_end_io = bch2_write_endio; + dst->bi_private = &op->cl; + dst->bi_opf = REQ_OP_WRITE; + + closure_get(dst->bi_private); + + bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, + key_to_write); + return more; +csum_err: + bch_err(c, "error verifying existing checksum while " + "rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (bounce) { + bch2_bio_free_pages_pool(c, dst); + bio_put(dst); + } + + return ret; +} + +static void __bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct write_point *wp; + int ret; +again: + do { + /* +1 for possible cache device: */ + if (op->open_buckets_nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets)) + goto flush_io; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + goto flush_io; + + wp = bch2_alloc_sectors_start(c, + op->target, + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->alloc_reserve, + op->flags, + (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + EBUG_ON(!wp); + + if (unlikely(IS_ERR(wp))) { + if (unlikely(PTR_ERR(wp) != -EAGAIN)) { + ret = PTR_ERR(wp); + goto err; + } + + goto flush_io; + } + + ret = bch2_write_extent(op, wp); + + BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr > + ARRAY_SIZE(op->open_buckets)); + bch2_open_bucket_get(c, wp, + &op->open_buckets_nr, + op->open_buckets); + bch2_alloc_sectors_done(c, wp); + + if (ret < 0) + goto err; + } while (ret); + + continue_at(cl, bch2_write_index, index_update_wq(op)); + return; +err: + op->error = ret; + + continue_at(cl, !bch2_keylist_empty(&op->insert_keys) + ? bch2_write_index + : bch2_write_done, index_update_wq(op)); + return; +flush_io: + closure_sync(cl); + + if (!bch2_keylist_empty(&op->insert_keys)) { + __bch2_write_index(op); + + if (op->error) { + continue_at_nobarrier(cl, bch2_write_done, NULL); + return; + } + } + + goto again; +} + +/** + * bch_write - handle a write to a cache device or flash only volume + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data won't fit in a single open bucket, there will be multiple keys); + * after the data is written it calls bch_journal, and after the keys have been + * added to the next journal write they're inserted into the btree. + * + * If op->discard is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. 
+ */ +void bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + + op->start_time = local_clock(); + + memset(&op->failed, 0, sizeof(op->failed)); + + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(&op->wbio.bio)->put_bio = false; + + if (c->opts.nochanges || + !percpu_ref_tryget(&c->writes)) { + __bcache_io_error(c, "read only"); + op->error = -EROFS; + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); + closure_return(cl); + return; + } + + bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); + + continue_at_nobarrier(cl, __bch2_write, NULL); +} + +/* Cache promotion on read */ + +struct promote_op { + struct closure cl; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct migrate_write write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + if (!opts.promote_target) + return false; + + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + if (!bkey_extent_is_data(k.k)) + return false; + + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) + return false; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + percpu_ref_put(&c->writes); + kfree(op); +} + +static void promote_done(struct closure *cl) +{ + struct promote_op *op = + container_of(cl, struct promote_op, cl); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + trace_promote(&rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_migrate_read_done(&op->write, rbio); + + closure_init(cl, NULL); + closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, promote_done); +} + +noinline +static struct promote_op *__promote_alloc(struct bch_fs *c, + struct bpos pos, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned rbio_sectors, + struct bch_read_bio **rbio) +{ + struct promote_op *op = NULL; + struct bio *bio; + unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); + /* data might have to be decompressed in the write path: */ + unsigned wbio_pages = 
DIV_ROUND_UP(pick->crc.uncompressed_size, + PAGE_SECTORS); + int ret; + + if (!percpu_ref_tryget(&c->writes)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, + GFP_NOIO); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * promotes require bouncing, but if the extent isn't + * checksummed/compressed it might be too big for the mempool: + */ + if (rbio_sectors > c->sb.encoded_extent_max) { + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * rbio_pages, + GFP_NOIO); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, rbio_pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, rbio_sectors << 9, + GFP_NOIO)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + } + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, wbio_pages, 0); + + ret = bch2_migrate_write_init(c, &op->write, + writepoint_hashed((unsigned long) current), + opts, + DATA_PROMOTE, + (struct data_opts) { + .target = opts.promote_target + }, + bkey_s_c_null); + BUG_ON(ret); + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + percpu_ref_put(&c->writes); + return NULL; +} + +static inline struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + unsigned sectors = promote_full + ? pick->crc.compressed_size + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? 
rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; + goto out; +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + +static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; +retry: + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, bvec_iter.bi_size, + (k.k->p.offset - bvec_iter.bi_sector) << 9); + swap(bvec_iter.bi_size, bytes); + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; + + if (bytes == bvec_iter.bi_size) + goto out; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + + /* + * If we get here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error %i", ret); +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; 
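/*
 * Retry example: a read that failed with a device error gets here with
 * retry == READ_RETRY_AVOID, so the failing device is added to the avoid
 * mask and the extent is looked up and read again from another replica.
 * The retry always re-walks the extents btree (the extent may have moved
 * or been overwritten in the meantime), and promotion is disabled for it
 * (BCH_READ_MAY_PROMOTE is cleared below).
 */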
+ u64 inode = rbio->pos.inode; + struct bch_devs_mask avoid; + + trace_read_retry(&rbio->bio); + + memset(&avoid, 0, sizeof(avoid)); + + if (rbio->retry == READ_RETRY_AVOID) + __set_bit(rbio->pick.ptr.dev, avoid.d); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) + bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); + else + bch2_read_retry(c, rbio, iter, inode, &avoid, flags); +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_extent *e; + BKEY_PADDED(k) new; + struct bch_extent_crc_unpacked new_crc; + unsigned offset; + int ret; + + if (rbio->pick.crc.compression_type) + return; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); +retry: + k = bch2_btree_iter_peek(&iter); + if (IS_ERR_OR_NULL(k.k)) + goto out; + + if (!bkey_extent_is_data(k.k)) + goto out; + + bkey_reassemble(&new.k, k); + e = bkey_i_to_extent(&new.k); + + if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset) || + bversion_cmp(e->k.version, rbio->version)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(&e->k) < rbio->pos.offset || + e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + goto out; + + /* The extent might have been partially overwritten since we read it: */ + offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + offset, e->k.size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + goto out; + } + + if (!bch2_extent_narrow_crcs(e, new_crc)) + goto out; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT, + BTREE_INSERT_ENTRY(&iter, &e->k_i)); + if (ret == -EINTR) + goto retry; +out: + bch2_btree_iter_unlock(&iter); +} + +static bool should_narrow_crcs(struct bkey_s_c k, + struct extent_pick_ptr *pick, + unsigned flags) +{ + return !(flags & BCH_READ_IN_RETRY) && + bkey_extent_is_data(k.k) && + bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + struct bch_csum csum; + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } 
else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc.compression_type != BCH_COMPRESSION_NONE) { + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + return; + } + + bch2_dev_io_error(ca, + "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", + rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + return; +decompression_err: + __bcache_io_error(c, "decompression error, inode %llu offset %llu", + rbio->pos.inode, + (u64) rbio->bvec_iter.bi_sector); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + return; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (rbio->pick.ptr.cached && + (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr))) { + atomic_long_inc(&c->read_realloc_races); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->pick.crc.compression_type || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + 
context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + struct bvec_iter iter, struct bkey_s_c k, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct extent_pick_ptr pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; + + pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) + goto no_device; + + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_sector = pos.offset; + iter.bi_size = pick.crc.compressed_size << 9; + goto noclone; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = should_narrow_crcs(k, &pick, flags); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); + + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(pick.crc.compression_type); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + iter.bi_sector != pos.offset)); + + pick.ptr.offset += pick.crc.offset + + (iter.bi_sector - pos.offset); + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + pos.offset = iter.bi_sector; + } + + if (rbio) { + /* promote already allocated bounce rbio */ + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOIO, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { +noclone: + rbio = orig; + rbio->bio.bi_iter = iter; + BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + 
rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->pos = pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_read_bounce(&rbio->bio); + + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + if (!rbio->have_ioref) + goto no_device_postclone; + + percpu_down_read(&c->usage_lock); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); + percpu_up_read(&c->usage_lock); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (!(flags & BCH_READ_LAST_FRAGMENT)) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + + if (unlikely(c->opts.no_data_io)) { + bio_endio(&rbio->bio); + return 0; + } + + submit_bio(&rbio->bio); + return 0; + } else { + int ret; + + submit_bio_wait(&rbio->bio); + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + __set_bit(pick.ptr.dev, avoid->d); + ret = READ_RETRY; + } + + return ret; + } + +no_device_postclone: + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + bch2_rbio_free(rbio); +no_device: + __bcache_io_error(c, "no device to read from"); + + if (likely(!(flags & BCH_READ_IN_RETRY))) { + orig->bio.bi_status = BLK_STS_IOERR; + + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; + } else { + return READ_ERR; + } + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED; + int ret; + + BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_IN_RETRY); + + rbio->c = c; + rbio->start_time = local_clock(); + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, + (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_read_extent(c, rbio, k, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + return; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + } + + /* + * If we get 
here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bch2_rbio_done(rbio); +} + +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 index 000000000000..f814226a5196 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_H +#define _BCACHEFS_IO_H + +#include "alloc.h" +#include "checksum.h" +#include "io_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +#define to_rbio(_bio) \ + container_of((_bio), struct bch_read_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *); + +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +enum bch_write_flags { + BCH_WRITE_ALLOC_NOWAIT = (1 << 0), + BCH_WRITE_CACHED = (1 << 1), + BCH_WRITE_FLUSH = (1 << 2), + BCH_WRITE_DATA_ENCODED = (1 << 3), + BCH_WRITE_PAGES_STABLE = (1 << 4), + BCH_WRITE_PAGES_OWNED = (1 << 5), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), + BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_NOMARK_REPLICAS = (1 << 8), + + /* Internal: */ + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), +}; + +static inline u64 *op_journal_seq(struct bch_write_op *op) +{ + return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) + ? op->journal_seq_p : &op->journal_seq; +} + +static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) +{ + op->journal_seq_p = journal_seq; + op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; +} + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_MOVINGGC + ? 
op->c->copygc_wq + : op->c->wq; +} + +int bch2_write_index_default(struct bch_write_op *); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->io_wq = index_update_wq(op); + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); + op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->alloc_reserve = RESERVE_NONE; + op->open_buckets_nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->journal_seq = 0; + op->index_update_fn = bch2_write_index_default; +} + +void bch2_write(struct closure *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_pick_ptr; + +int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + struct bkey_s_c, struct bch_devs_mask *, unsigned); +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +static inline void bch2_read_extent(struct bch_fs *c, + struct bch_read_bio *rbio, + struct bkey_s_c k, + unsigned flags) +{ + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_exit(struct bch_fs *); +int bch2_fs_io_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 index 000000000000..b313128ed857 --- /dev/null +++ b/fs/bcachefs/io_types.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_TYPES_H +#define _BCACHEFS_IO_TYPES_H + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "extents_types.h" +#include "keylist_types.h" +#include "opts.h" +#include "super_types.h" + +#include +#include + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. 
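+ *
+ * A split rbio points back at the rbio it was split from via @parent;
+ * completion and errors are propagated back up through the parent when
+ * the split is freed.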
+ * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_pick_ptr pick; + /* start pos of data we read (may not be pos of data we want) */ + struct bpos pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; + struct bch_write_bio *parent; + + u64 submit_time; + + struct bch_devs_list failed; + u8 order; + u8 dev; + + unsigned split:1, + bounce:1, + put_bio:1, + have_ioref:1, + used_mempool:1; + ); + + struct bio bio; +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + struct workqueue_struct *io_wq; + u64 start_time; + + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + + unsigned csum_type:4; + unsigned compression_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned alloc_reserve:4; + + u8 open_buckets_nr; + struct bch_devs_list devs_have; + u16 target; + u16 nonce; + + struct bch_io_opts opts; + + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; + + struct disk_reservation res; + + u8 open_buckets[16]; + + /* + * If caller wants to flush but hasn't passed us a journal_seq ptr, we + * still need to stash the journal_seq somewhere: + */ + union { + u64 *journal_seq_p; + u64 journal_seq; + }; + + int (*index_update_fn)(struct bch_write_op *); + + struct bch_devs_mask failed; + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* Must be last: */ + struct bch_write_bio wbio; +}; + +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 index 000000000000..697f601c2cdf --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "buckets.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "super-io.h" +#include "trace.h" + +static bool journal_entry_is_open(struct journal *j) +{ + return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) +{ + struct journal_buf *w = journal_prev_buf(j); + + atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); + + if (!need_write_just_set && + test_bit(JOURNAL_NEED_WRITE, &j->flags)) + bch2_time_stats_update(j->delay_time, + j->need_write_time); +#if 0 + closure_call(&j->io, bch2_journal_write, NULL, NULL); +#else + /* Shut sparse up: */ + closure_init(&j->io, NULL); + set_closure_fn(&j->io, bch2_journal_write, NULL); + bch2_journal_write(&j->io); +#endif +} + +static void journal_pin_new_entry(struct journal *j, int count) +{ + struct journal_entry_pin_list *p; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + p = fifo_push_ref(&j->pin); + + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, count); + p->devs.nr = 0; +} + +static void bch2_journal_buf_init(struct journal *j) +{ + struct journal_buf *buf = journal_cur_buf(j); + + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; +} + +static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) +{ + return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); +} + +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type || i->u64s) + return false; + return true; +} + +static enum { + JOURNAL_ENTRY_ERROR, + JOURNAL_ENTRY_INUSE, + JOURNAL_ENTRY_CLOSED, + JOURNAL_UNLOCKED, +} journal_buf_switch(struct journal *j, bool need_write_just_set) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) + return JOURNAL_ENTRY_CLOSED; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return JOURNAL_ENTRY_ERROR; + + if (new.prev_buf_unwritten) + return JOURNAL_ENTRY_INUSE; + + /* + * avoid race between setting buf->data->u64s and + * journal_res_put starting write: + */ + journal_state_inc(&new); + + new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; + new.idx++; + new.prev_buf_unwritten = 1; + + BUG_ON(journal_state_count(new, new.idx)); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + buf = &j->buf[old.idx]; + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + + j->prev_buf_sectors = + vstruct_blocks_plus(buf->data, c->block_bits, + journal_entry_u64s_reserve(buf)) * + c->opts.block_size; + BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + + bch2_journal_reclaim_fast(j); + /* XXX: why set this here, and not in bch2_journal_write()? 
*/ + buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + + if (journal_entry_empty(buf->data)) + clear_bit(JOURNAL_NOT_EMPTY, &j->flags); + else + set_bit(JOURNAL_NOT_EMPTY, &j->flags); + + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); + + cancel_delayed_work(&j->write_work); + spin_unlock(&j->lock); + + if (c->bucket_journal_seq > 1 << 14) { + c->bucket_journal_seq = 0; + bch2_bucket_seq_cleanup(c); + } + + c->bucket_journal_seq++; + + /* ugh - might be called from __journal_res_get() under wait_event() */ + __set_current_state(TASK_RUNNING); + bch2_journal_buf_put(j, old.idx, need_write_just_set); + + return JOURNAL_UNLOCKED; +} + +void bch2_journal_halt(struct journal *j) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); + closure_wake_up(&journal_prev_buf(j)->wait); +} + +/* + * should _only_ called from journal_res_get() - when we actually want a + * journal reservation - journal entry is open means journal is dirty: + * + * returns: + * 1: success + * 0: journal currently full (must wait) + * -EROFS: insufficient rw devices + * -EIO: journal error + */ +static int journal_entry_open(struct journal *j) +{ + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + ssize_t u64s; + int sectors; + u64 v; + + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + + if (!fifo_free(&j->pin)) + return 0; + + sectors = bch2_journal_entry_sectors(j); + if (sectors <= 0) + return sectors; + + buf->disk_sectors = sectors; + + sectors = min_t(unsigned, sectors, buf->size >> 9); + j->cur_buf_sectors = sectors; + + u64s = (sectors << 9) / sizeof(u64); + + /* Subtract the journal header */ + u64s -= sizeof(struct jset) / sizeof(u64); + /* + * Btree roots, prio pointers don't get added until right before we do + * the write: + */ + u64s -= journal_entry_u64s_reserve(buf); + u64s = max_t(ssize_t, 0L, u64s); + + BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); + + if (u64s <= le32_to_cpu(buf->data->u64s)) + return 0; + + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; + + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return -EIO; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + if (j->res_get_blocked_start) + bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; + + mod_delayed_work(system_freezable_wq, + &j->write_work, + msecs_to_jiffies(j->write_delay_ms)); + journal_wake(j); + return 1; +} + +/* + * returns true if there's nothing to flush and no journal write still in flight + */ +static bool journal_flush_write(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + ret = !j->reservations.prev_buf_unwritten; + + if (!journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return ret; + } + + set_bit(JOURNAL_NEED_WRITE, &j->flags); + if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) + ret = false; + else + spin_unlock(&j->lock); + return ret; +} + +static void journal_write_work(struct 
work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); + + journal_flush_write(j); +} + +/* + * Given an inode number, if that inode number has data in the journal that + * hasn't yet been flushed, return the journal sequence number that needs to be + * flushed: + */ +u64 bch2_inode_journal_seq(struct journal *j, u64 inode) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + u64 seq = 0; + + if (!test_bit(h, j->buf[0].has_inode) && + !test_bit(h, j->buf[1].has_inode)) + return 0; + + spin_lock(&j->lock); + if (test_bit(h, journal_cur_buf(j)->has_inode)) + seq = journal_cur_seq(j); + else if (test_bit(h, journal_prev_buf(j)->has_inode)) + seq = journal_cur_seq(j) - 1; + spin_unlock(&j->lock); + + return seq; +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + int ret; +retry: + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call journal_entry_close() + * unnecessarily + */ + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) { + spin_unlock(&j->lock); + return 1; + } + + /* + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->size >> 9 < buf->disk_sectors && + buf->size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->size << 1); + + /* + * Close the current journal entry if necessary, then try to start a new + * one: + */ + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EROFS; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + spin_unlock(&j->lock); + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + + /* We now have a new, closed journal buf - see if we can open it: */ + ret = journal_entry_open(j); + spin_unlock(&j->lock); + + if (ret < 0) + return ret; + if (ret) + goto retry; + + /* Journal's full, we have to wait */ + + /* + * Direct reclaim - can't rely on reclaim from work item + * due to freezing.. + */ + bch2_journal_reclaim_work(&j->reclaim_work.work); + + trace_journal_full(c); +blocked: + if (!j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + return 0; +} + +/* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. + * Journal write is the structure used set up journal writes. The calling + * function will then add its keys to the structure, queuing them for the next + * write. + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. + */ +int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + int ret; + + wait_event(j->wait, + (ret = __journal_res_get(j, res, u64s_min, + u64s_max))); + return ret < 0 ? 
ret : 0; +} + +u64 bch2_journal_last_unwritten_seq(struct journal *j) +{ + u64 seq; + + spin_lock(&j->lock); + seq = journal_cur_seq(j); + if (j->reservations.prev_buf_unwritten) + seq--; + spin_unlock(&j->lock); + + return seq; +} + +/** + * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't + * open yet, or wait if we cannot + * + * used by the btree interior update machinery, when it needs to write a new + * btree root - every journal entry contains the roots of all the btrees, so it + * doesn't need to bother with getting a journal reservation + */ +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + int ret; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq < journal_cur_seq(j) || + journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return 1; + } + + ret = journal_entry_open(j); + if (!ret) + closure_wait(&j->async_wait, parent); + spin_unlock(&j->lock); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +/** + * bch2_journal_wait_on_seq - wait for a journal entry to be written + * + * does _not_ cause @seq to be written immediately - if there is no other + * activity to cause the relevant journal entry to be filled up or flushed it + * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is + * configurable). + */ +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +{ + spin_lock(&j->lock); + + BUG_ON(seq > journal_cur_seq(j)); + + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } + + if (seq == journal_cur_seq(j)) { + if (!closure_wait(&journal_cur_buf(j)->wait, parent)) + BUG(); + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&journal_prev_buf(j)->wait); + } + + spin_unlock(&j->lock); +} + +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + struct journal_buf *buf; + + spin_lock(&j->lock); + + BUG_ON(seq > journal_cur_seq(j)); + + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } + + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + buf = journal_cur_buf(j); + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + if (parent) + closure_wake_up(&buf->wait); + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return; + } + } else if (parent && + seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + buf = journal_prev_buf(j); + + if (!closure_wait(&buf->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } + + spin_unlock(&j->lock); +} + +static int 
journal_seq_flushed(struct journal *j, u64 seq) +{ + struct journal_buf *buf; + int ret = 1; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + ret = 0; + + buf = journal_cur_buf(j); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + ret = -EIO; + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return 0; + } + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + ret = bch2_journal_error(j); + } + + spin_unlock(&j->lock); + + return ret; +} + +int bch2_journal_flush_seq(struct journal *j, u64 seq) +{ + u64 start_time = local_clock(); + int ret, ret2; + + ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? ret2 : 0; +} + +/** + * bch2_journal_meta_async - force a journal entry to be written + */ +void bch2_journal_meta_async(struct journal *j, struct closure *parent) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + + memset(&res, 0, sizeof(res)); + + bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_put(j, &res); + + bch2_journal_flush_seq_async(j, res.seq, parent); +} + +int bch2_journal_meta(struct journal *j) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, u64s, u64s); + if (ret) + return ret; + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return; + } + spin_unlock(&j->lock); + + bch2_journal_flush_seq_async(j, seq, parent); +} + +int bch2_journal_flush(struct journal *j) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return 0; + } + spin_unlock(&j->lock); + + return bch2_journal_flush_seq(j, seq); +} + +/* allocate journal on a device: */ + +static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; + + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + return 0; + + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + if (c) + 
spin_lock(&c->journal.lock); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + if (c) + spin_unlock(&c->journal.lock); + + while (ja->nr < nr) { + struct open_bucket *ob = NULL; + long bucket; + + if (new_fs) { + percpu_down_read(&c->usage_lock); + bucket = bch2_bucket_alloc_new_fs(ca); + percpu_up_read(&c->usage_lock); + + if (bucket < 0) { + ret = -ENOSPC; + goto err; + } + } else { + int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); + if (ob_idx < 0) { + ret = cl ? -EAGAIN : -ENOSPC; + goto err; + } + + ob = c->open_buckets + ob_idx; + bucket = sector_to_bucket(ca, ob->ptr.offset); + } + + if (c) { + percpu_down_read(&c->usage_lock); + spin_lock(&c->journal.lock); + } + + __array_insert_item(ja->buckets, ja->nr, ja->last_idx); + __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); + __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); + + ja->buckets[ja->last_idx] = bucket; + ja->bucket_seq[ja->last_idx] = 0; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + new_fs + ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE + : 0); + + if (c) { + spin_unlock(&c->journal.lock); + percpu_up_read(&c->usage_lock); + } + + if (!new_fs) + bch2_open_bucket_put(c, ob); + } + + ret = 0; +err: + kfree(new_bucket_seq); + kfree(new_buckets); + + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + struct journal_device *ja = &ca->journal; + struct closure cl; + unsigned current_nr; + int ret; + + closure_init_stack(&cl); + + do { + struct disk_reservation disk_res = { 0, 0 }; + + closure_sync(&cl); + + mutex_lock(&c->sb_lock); + current_nr = ja->nr; + + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ + + if (bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + mutex_unlock(&c->sb_lock); + return -ENOSPC; + } + + ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + + bch2_disk_reservation_put(c, &disk_res); + + if (ja->nr != current_nr) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } while (ret == -EAGAIN); + + return ret; +} + +int bch2_dev_journal_alloc(struct bch_dev *ca) +{ + unsigned nr; + + if (dynamic_fault("bcachefs:add:journal_alloc")) + return -ENOMEM; + + /* + * clamp journal size to 1024 buckets or 512MB (in sectors), whichever + * is smaller: + */ + nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, + BCH_JOURNAL_BUCKETS_MIN, + min(1 << 10, + (1 << 20) / ca->mi.bucket_size)); + + return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); +} + +/* startup/shutdown: */ + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) +{ + union journal_res_state state; + struct journal_buf *w; + bool ret; + + spin_lock(&j->lock); + state = READ_ONCE(j->reservations); + w = j->buf + !state.idx; + + ret = 
state.prev_buf_unwritten && + bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); + spin_unlock(&j->lock); + + return ret; +} + +void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) +{ + spin_lock(&j->lock); + bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx); + spin_unlock(&j->lock); + + wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); +} + +void bch2_fs_journal_stop(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + wait_event(j->wait, journal_flush_write(j)); + + /* do we need to write another journal entry? */ + if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || + c->btree_roots_dirty) + bch2_journal_meta(j); + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_NOT_EMPTY, &j->flags)); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch2_fs_journal_start(struct journal *j) +{ + struct journal_seq_blacklist *bl; + u64 blacklist = 0; + + list_for_each_entry(bl, &j->seq_blacklist, list) + blacklist = max(blacklist, bl->end); + + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + + while (journal_cur_seq(j) < blacklist) + journal_pin_new_entry(j, 0); + + /* + * journal_buf_switch() only inits the next journal entry when it + * closes an open journal entry - the very first journal entry gets + * initialized here: + */ + journal_pin_new_entry(j, 1); + bch2_journal_buf_init(j); + + spin_unlock(&j->lock); + + /* + * Adding entries to the next journal entry before allocating space on + * disk for the next journal entry - this is ok, because these entries + * only have to go down with the next journal entry we write: + */ + bch2_journal_seq_blacklist_write(j); + + queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); +} + +/* init/exit: */ + +void bch2_dev_journal_exit(struct bch_dev *ca) +{ + kfree(ca->journal.bio); + kfree(ca->journal.buckets); + kfree(ca->journal.bucket_seq); + + ca->journal.bio = NULL; + ca->journal.buckets = NULL; + ca->journal.bucket_seq = NULL; +} + +int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); + unsigned i, nr_bvecs; + + ja->nr = bch2_nr_journal_buckets(journal_buckets); + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) + return -ENOMEM; + + nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + + ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!ca->journal.bio) + return -ENOMEM; + + bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) + return -ENOMEM; + + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + + return 0; +} + +void bch2_fs_journal_exit(struct journal *j) +{ + kvpfree(j->buf[1].data, j->buf[1].size); + kvpfree(j->buf[0].data, j->buf[0].size); + free_fifo(&j->pin); +} + +int bch2_fs_journal_init(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + static struct lock_class_key res_key; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + spin_lock_init(&j->lock); + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + 
mutex_init(&j->reclaim_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN; + j->write_delay_ms = 1000; + j->reclaim_delay_ms = 100; + + bkey_extent_init(&j->key); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) || + !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) { + ret = -ENOMEM; + goto out; + } + + j->pin.front = j->pin.back = 1; +out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} + +/* debug: */ + +ssize_t bch2_journal_print_debug(struct journal *j, char *buf) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state *s = &j->reservations; + struct bch_dev *ca; + unsigned iter; + ssize_t ret = 0; + + rcu_read_lock(); + spin_lock(&j->lock); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "active journal entries:\t%llu\n" + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "reservation count:\t%u\n" + "reservation offset:\t%u\n" + "current entry u64s:\t%u\n" + "io in flight:\t\t%i\n" + "need write:\t\t%i\n" + "dirty:\t\t\t%i\n" + "replay done:\t\t%i\n", + fifo_used(&j->pin), + journal_cur_seq(j), + journal_last_seq(j), + j->last_seq_ondisk, + journal_state_count(*s, s->idx), + s->cur_entry_offset, + j->cur_entry_u64s, + s->prev_buf_unwritten, + test_bit(JOURNAL_NEED_WRITE, &j->flags), + journal_entry_is_open(j), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + for_each_member_device_rcu(ca, c, iter, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "dev %u:\n" + "\tnr\t\t%u\n" + "\tcur_idx\t\t%u (seq %llu)\n" + "\tlast_idx\t%u (seq %llu)\n", + iter, ja->nr, + ja->cur_idx, ja->bucket_seq[ja->cur_idx], + ja->last_idx, ja->bucket_seq[ja->last_idx]); + } + + spin_unlock(&j->lock); + rcu_read_unlock(); + + return ret; +} + +ssize_t bch2_journal_print_pins(struct journal *j, char *buf) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + ssize_t ret = 0; + u64 i; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, i) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%llu: count %u\n", + i, atomic_read(&pin_list->count)); + + list_for_each_entry(pin, &pin_list->list, list) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "\t%p %pf\n", + pin, pin->flush); + + if (!list_empty(&pin_list->flushed)) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "flushed:\n"); + + list_for_each_entry(pin, &pin_list->flushed, list) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "\t%p %pf\n", + pin, pin->flush); + } + spin_unlock(&j->lock); + + return ret; +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 index 000000000000..f39b37e6e3d5 --- /dev/null +++ b/fs/bcachefs/journal.h @@ -0,0 +1,383 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H + +/* + * THE JOURNAL: + * + * The primary purpose of the journal is to log updates (insertions) to the + * b-tree, to avoid having to do synchronous updates to the b-tree on disk. 
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in the same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after 10 ms have elapsed, by default (the delay_ms field
+ * in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ * + * On disk, this is represented with the "last_seq" field of struct jset; + * last_seq is the first sequence number that journal replay has to replay. + * + * To avoid overwriting dirty journal entries on disk, we keep a mapping (in + * journal_device->seq) of for each journal bucket, the highest sequence number + * any journal entry it contains. Then, by comparing that against last_seq we + * can determine whether that journal bucket contains dirty journal entries or + * not. + * + * To track which journal entries are dirty, we maintain a fifo of refcounts + * (where each entry corresponds to a specific sequence number) - when a ref + * goes to 0, that journal entry is no longer dirty. + * + * Journalling of index updates is done at the same time as the b-tree itself is + * being modified (see btree_insert_key()); when we add the key to the journal + * the pending b-tree write takes a ref on the journal entry the key was added + * to. If a pending b-tree write would need to take refs on multiple dirty + * journal entries, it only keeps the ref on the oldest one (since a newer + * journal entry will still be replayed if an older entry was dirty). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +#include + +#include "journal_types.h" + +struct bch_fs; + +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); +} + +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +static inline struct journal_buf *journal_prev_buf(struct journal *j) +{ + return j->buf + !j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; +} + +u64 bch2_inode_journal_seq(struct journal *, u64); + +static inline int journal_state_count(union journal_res_state s, int idx) +{ + return idx == 0 ? s.buf0_count : s.buf1_count; +} + +static inline void journal_state_inc(union journal_res_state *s) +{ + s->buf0_count += s->idx == 0; + s->buf1_count += s->idx == 1; +} + +static inline void bch2_journal_set_has_inode(struct journal *j, + struct journal_res *res, + u64 inum) +{ + struct journal_buf *buf = &j->buf[res->idx]; + unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); + + /* avoid atomic op if possible */ + if (unlikely(!test_bit(bit, buf->has_inode))) + set_bit(bit, buf->has_inode); +} + +/* + * Amount of space that will be taken up by some keys in the journal (i.e. 
+ * including the jset header) + */ +static inline unsigned jset_u64s(unsigned u64s) +{ + return u64s + sizeof(struct jset_entry) / sizeof(u64); +} + +static inline struct jset_entry * +bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) +{ + struct jset *jset = buf->data; + struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); + + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); + + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); + + return entry; +} + +static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, + const void *data, unsigned u64s) +{ + struct journal_buf *buf = &j->buf[res->idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res->offset); + unsigned actual = jset_u64s(u64s); + + EBUG_ON(!res->ref); + EBUG_ON(actual > res->u64s); + + res->offset += actual; + res->u64s -= actual; + + entry->u64s = cpu_to_le16(u64s); + entry->btree_id = id; + entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; + memcpy_u64s(entry->_data, data, u64s); +} + +static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, + enum btree_id id, const struct bkey_i *k) +{ + bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, + id, 0, k, k->k.u64s); +} + +void bch2_journal_buf_put_slowpath(struct journal *, bool); + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, + bool need_write_just_set) +{ + union journal_res_state s; + + s.v = atomic64_sub_return(((union journal_res_state) { + .buf0_count = idx == 0, + .buf1_count = idx == 1, + }).v, &j->reservations.counter); + + EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); + + /* + * Do not initiate a journal write if the journal is in an error state + * (previous journal entry write may have failed) + */ + if (s.idx != idx && + !journal_state_count(s, idx) && + s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) + bch2_journal_buf_put_slowpath(j, need_write_just_set); +} + +/* + * This function releases the journal write structure so other threads can + * then proceed to add their keys as well. 
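+ *
+ * Rough usage sketch (illustrative only - the real caller is the btree
+ * insert path, @btree_id and @k are hypothetical locals, and error
+ * handling is omitted):
+ *
+ *	struct journal_res res = { 0 };
+ *
+ *	if (!bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s),
+ *				  jset_u64s(k->k.u64s))) {
+ *		bch2_journal_add_keys(j, &res, btree_id, k);
+ *		bch2_journal_res_put(j, &res);
+ *	}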
+ */ +static inline void bch2_journal_res_put(struct journal *j, + struct journal_res *res) +{ + if (!res->ref) + return; + + lock_release(&j->res_map, _RET_IP_); + + while (res->u64s) + bch2_journal_add_entry(j, res, + BCH_JSET_ENTRY_btree_keys, + 0, 0, NULL, 0); + + bch2_journal_buf_put(j, res->idx, false); + + res->ref = 0; +} + +int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned, unsigned); + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, + unsigned u64s_min, + unsigned u64s_max) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + + /* + * Check if there is still room in the current journal + * entry: + */ + if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) + return 0; + + res->offset = old.cur_entry_offset; + res->u64s = min(u64s_max, j->cur_entry_u64s - + old.cur_entry_offset); + + journal_state_inc(&new); + new.cur_entry_offset += res->u64s; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + res->ref = true; + res->idx = new.idx; + res->seq = le64_to_cpu(j->buf[res->idx].data->seq); + return 1; +} + +static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + int ret; + + EBUG_ON(res->ref); + EBUG_ON(u64s_max < u64s_min); + EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + + if (journal_res_get_fast(j, res, u64s_min, u64s_max)) + goto out; + + ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max); + if (ret) + return ret; +out: + lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + EBUG_ON(!res->ref); + return 0; +} + +u64 bch2_journal_last_unwritten_seq(struct journal *); +int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); + +void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); +void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +void bch2_journal_flush_async(struct journal *, struct closure *); +void bch2_journal_meta_async(struct journal *, struct closure *); + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); +int bch2_journal_meta(struct journal *); + +void bch2_journal_halt(struct journal *); + +static inline int bch2_journal_error(struct journal *j) +{ + return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL + ? 
-EIO : 0; +} + +struct bch_dev; + +static inline bool journal_flushes_device(struct bch_dev *ca) +{ + return true; +} + +int bch2_journal_mark(struct bch_fs *, struct list_head *); +void bch2_journal_entries_free(struct list_head *); +int bch2_journal_replay(struct bch_fs *, struct list_head *); + +static inline void bch2_journal_set_replay_done(struct journal *j) +{ + BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + set_bit(JOURNAL_REPLAY_DONE, &j->flags); +} + +ssize_t bch2_journal_print_debug(struct journal *, char *); +ssize_t bch2_journal_print_pins(struct journal *, char *); + +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +int bch2_dev_journal_alloc(struct bch_dev *); + +void bch2_dev_journal_stop(struct journal *, struct bch_dev *); +void bch2_fs_journal_stop(struct journal *); +void bch2_fs_journal_start(struct journal *); +void bch2_dev_journal_exit(struct bch_dev *); +int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); +void bch2_fs_journal_exit(struct journal *); +int bch2_fs_journal_init(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 index 000000000000..320f4f2933c1 --- /dev/null +++ b/fs/bcachefs/journal_io.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" +#include "trace.h" + +struct journal_list { + struct closure cl; + struct mutex lock; + struct list_head *head; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_list *jlist, struct jset *j) +{ + struct journal_replay *i, *pos; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + __le64 last_seq; + int ret; + + last_seq = !list_empty(jlist->head) + ? list_last_entry(jlist->head, struct journal_replay, + list)->j.last_seq + : 0; + + /* Is this entry older than the range we need? */ + if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } + + list_for_each_entry_reverse(i, jlist->head, list) { + /* Duplicate? 
*/ + if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + fsck_err_on(bytes != vstruct_bytes(&i->j) || + memcmp(j, &i->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + goto found; + } + + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) { + ret = -ENOMEM; + goto out; + } + + list_add(&i->list, where); + i->devs.nr = 0; + unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); +found: + if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) + bch2_dev_list_add_dev(&i->devs, ca->dev_idx); + else + fsck_err_on(1, c, "duplicate journal entries on same device"); + ret = JOURNAL_ENTRY_ADD_OK; +out: +fsck_err: + return ret; +} + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +#define journal_entry_err(c, msg, ...) \ +({ \ + switch (write) { \ + case READ: \ + mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write:\n" \ + msg, ##__VA_ARGS__); \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + } \ + true; \ +}) + +#define journal_entry_err_on(cond, c, msg, ...) \ + ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) + +static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, + struct bkey_i *k, enum bkey_type key_type, + const char *type, int write) +{ + void *next = vstruct_next(entry); + const char *invalid; + char buf[160]; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, + "invalid %s in journal: k->u64s 0", type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, + "invalid %s in journal: extends past end of journal entry", + type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, + "invalid %s in journal: bad format %u", + type, k->k.format)) { + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) + bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + + invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (invalid) { + bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), + bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", + type, invalid, buf); + + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } +fsck_err: + return ret; +} + +static int journal_entry_validate_btree_keys(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k; + + vstruct_for_each(entry, k) { + int ret = journal_validate_key(c, jset, entry, k, + bkey_type(entry->level, + entry->btree_id), + "key", write); + if (ret) + return ret; + } + + return 0; +} + +static int journal_entry_validate_btree_root(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, c, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, + "btree root", write); +fsck_err: + return ret; +} + +static int journal_entry_validate_prio_ptrs(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + /* obsolete, don't care: */ + return 0; +} + +static int journal_entry_validate_blacklist(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +fsck_err: + return ret; +} + +static int journal_entry_validate_blacklist_v2(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + "invalid journal seq blacklist 
entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), c, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + +fsck_err: + return ret; +} + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, int); +}; + +static const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ + .validate = journal_entry_validate_##f, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x +}; + +static int journal_entry_validate(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, int write) +{ + int ret = 0; + + if (entry->type >= BCH_JSET_ENTRY_NR) { + journal_entry_err(c, "invalid journal entry type %u", + entry->type); + journal_entry_null_range(entry, vstruct_next(entry)); + return 0; + } + + ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); +fsck_err: + return ret; +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > + vstruct_last(jset), c, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = journal_entry_validate(c, jset, entry, write); + if (ret) + break; + } +fsck_err: + return ret; +} + +static int jset_validate(struct bch_fs *c, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read, + int write) +{ + size_t bytes = vstruct_bytes(jset); + struct bch_csum csum; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { + bch_err(c, "unknown journal entry version %u", + le32_to_cpu(jset->version)); + return BCH_FSCK_UNKNOWN_VERSION; + } + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "journal entry too big (%zu bytes), sector %lluu", + bytes, sector)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + if (bytes > sectors_read << 9) + return JOURNAL_ENTRY_REREAD; + + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "journal entry with unknown csum type %llu sector %lluu", + JSET_CSUM_TYPE(jset), sector)) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "journal checksum bad, sector %llu", sector)) { + /* XXX: retry IO, when we start retrying checksum errors */ + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq")) + jset->last_seq = jset->seq; + + return 0; +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return 
-ENOMEM; + + new_size = roundup_pow_of_two(new_size); + n = kvpmalloc(new_size, GFP_KERNEL); + if (!n) + return -ENOMEM; + + kvpfree(b->data, b->size); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket, u64 *seq, bool *entries_found) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { +reread: sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = sectors_read << 9; + bch2_bio_map(bio, buf->data); + + ret = submit_bio_wait(bio); + + if (bch2_dev_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || + bch2_meta_read_fault("journal")) + return -EIO; + + j = buf->data; + } + + ret = jset_validate(c, j, offset, + end - offset, sectors_read, + READ); + switch (ret) { + case BCH_FSCK_OK: + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + sectors = c->opts.block_size; + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; + sectors = c->opts.block_size; + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + *entries_found = true; + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } + + if (le64_to_cpu(j->seq) > *seq) + *seq = le64_to_cpu(j->seq); + + sectors = vstruct_sectors(j, c->block_bits); +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static void bch2_journal_read_device(struct closure *cl) +{ +#define read_bucket(b) \ + ({ \ + bool entries_found = false; \ + ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ + &entries_found); \ + if (ret) \ + goto err; \ + __set_bit(b, bitmap); \ + entries_found; \ + }) + + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); + struct journal_read_buf buf = { NULL, 0 }; + unsigned long *bitmap; + unsigned i, l, r; + u64 seq = 0; + int ret; + + if (!ja->nr) + goto out; + + bitmap = kcalloc(BITS_TO_LONGS(ja->nr), ja->nr, GFP_KERNEL); + if (!bitmap) { + ret = -ENOMEM; + goto err; + } + + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + /* + * If the device supports discard but not secure discard, we can't do + * the fancy fibonacci hash/binary search because the live journal + * entries might not form a contiguous range: + */ + for (i = 0; i < ja->nr; i++) + read_bucket(i); + goto search_done; + + if (!blk_queue_nonrot(q)) + goto linear_scan; + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ja->nr; i++) { + l = (i * 2654435769U) % ja->nr; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); +linear_scan: + for (l = find_first_zero_bit(bitmap, ja->nr); + l < ja->nr; + l = find_next_zero_bit(bitmap, ja->nr, l + 1)) + if (read_bucket(l)) + goto bsearch; + + /* no journal entries on this device? */ + if (l == ja->nr) + goto out; +bsearch: + /* Binary search */ + r = find_next_bit(bitmap, ja->nr, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + unsigned m = (l + r) >> 1; + u64 cur_seq = seq; + + read_bucket(m); + + if (cur_seq != seq) + l = m; + else + r = m; + } + +search_done: + /* + * Find the journal bucket with the highest sequence number: + * + * If there's duplicate journal entries in multiple buckets (which + * definitely isn't supposed to happen, but...) 
- make sure to start + * cur_idx at the last of those buckets, so we don't deadlock trying to + * allocate + */ + seq = 0; + + for (i = 0; i < ja->nr; i++) + if (ja->bucket_seq[i] >= seq && + ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { + /* + * When journal_next_bucket() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + seq = ja->bucket_seq[i]; + } + + /* + * Set last_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->last_idx = (ja->cur_idx + 1) % ja->nr; + + /* + * Read buckets in reverse order until we stop finding more journal + * entries: + */ + for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; + i != ja->cur_idx; + i = (i + ja->nr - 1) % ja->nr) + if (!test_bit(i, bitmap) && + !read_bucket(i)) + break; +out: + kvpfree(buf.data, buf.size); + kfree(bitmap); + percpu_ref_put(&ca->io_ref); + closure_return(cl); + return; +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +#undef read_bucket +} + +void bch2_journal_entries_free(struct list_head *list) +{ + + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } +} + +int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *p; + u64 seq, nr = end_seq - last_seq + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + atomic64_set(&j->seq, end_seq); + j->last_seq_ondisk = last_seq; + + j->pin.front = last_seq; + j->pin.back = end_seq + 1; + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + return 0; +} + +int bch2_journal_read(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_list jlist; + struct journal_replay *i; + struct journal_entry_pin_list *p; + struct bch_dev *ca; + u64 cur_seq, end_seq; + unsigned iter; + size_t keys = 0, entries = 0; + bool degraded = false; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.head = list; + jlist.ret = 0; + + for_each_member_device(ca, c, iter) { + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_RW || + ca->mi.state == BCH_MEMBER_STATE_RO) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + else + degraded = true; + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + if (list_empty(list)){ + bch_err(c, "no journal entries found"); + return BCH_FSCK_REPAIR_IMPOSSIBLE; + } + + list_for_each_entry(i, list, list) { + ret = jset_validate_entries(c, &i->j, READ); + if (ret) + goto fsck_err; + + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: + */ + + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, + i->devs), c, + "superblock not marked as containing replicas (type %u)", + 
BCH_DATA_JOURNAL))) { + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + if (ret) + return ret; + } + } + + i = list_last_entry(list, struct journal_replay, list); + + ret = bch2_journal_set_seq(c, + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq)); + if (ret) + return ret; + + mutex_lock(&j->blacklist_lock); + + list_for_each_entry(i, list, list) { + p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); + + atomic_set(&p->count, 1); + p->devs = i->devs; + + if (bch2_journal_seq_blacklist_read(j, i)) { + mutex_unlock(&j->blacklist_lock); + return -ENOMEM; + } + } + + mutex_unlock(&j->blacklist_lock); + + cur_seq = journal_last_seq(j); + end_seq = le64_to_cpu(list_last_entry(list, + struct journal_replay, list)->j.seq); + + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + bool blacklisted; + + mutex_lock(&j->blacklist_lock); + while (cur_seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_blacklist_find(j, cur_seq)) + cur_seq++; + + blacklisted = bch2_journal_seq_blacklist_find(j, + le64_to_cpu(i->j.seq)); + mutex_unlock(&j->blacklist_lock); + + fsck_err_on(blacklisted, c, + "found blacklisted journal entry %llu", + le64_to_cpu(i->j.seq)); + + fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + cur_seq, le64_to_cpu(i->j.seq) - 1, + journal_last_seq(j), end_seq); + + cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; + entries++; + } + + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, journal_cur_seq(j)); +fsck_err: + return ret; +} + +/* journal replay: */ + +int bch2_journal_mark(struct bch_fs *c, struct list_head *list) +{ + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + int ret; + + list_for_each_entry(r, list, list) + for_each_jset_key(k, n, j, &r->j) { + enum bkey_type type = bkey_type(j->level, j->btree_id); + struct bkey_s_c k_s_c = bkey_i_to_s_c(k); + + if (btree_type_has_ptrs(type)) { + ret = bch2_btree_mark_key_initial(c, type, k_s_c); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *pin_list; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + int ret = 0; + + list_for_each_entry_safe(i, n, list, list) { + + j->replay_journal_seq = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) { + + if (entry->btree_id == BTREE_ID_ALLOC) { + /* + * allocation code handles replay for + * BTREE_ID_ALLOC keys: + */ + ret = bch2_alloc_replay_key(c, k->k.p); + } else { + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + + ret = bch2_btree_insert(c, entry->btree_id, k, + &disk_res, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); + } + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + goto err; + } + + cond_resched(); + } + + pin_list = journal_seq_pin(j, j->replay_journal_seq); + + if (atomic_dec_and_test(&pin_list->count)) + journal_wake(j); + } + + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); +err: + bch2_journal_entries_free(list); + return ret; +} + +/* journal write: */ + +static void bch2_journal_add_btree_root(struct journal_buf 
*buf, + enum btree_id id, struct bkey_i *k, + unsigned level) +{ + struct jset_entry *entry; + + entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); + entry->type = BCH_JSET_ENTRY_btree_root; + entry->btree_id = id; + entry->level = level; + memcpy_u64s(entry->_data, k, k->k.u64s); +} + +static unsigned journal_dev_buckets_available(struct journal *j, + struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Hack to avoid a deadlock during journal replay: + * journal replay might require setting a new btree + * root, which requires writing another journal entry - + * thus, if the journal is full (and this happens when + * replaying the first journal bucket's entries) we're + * screwed. + * + * So don't let the journal fill up unless we're in + * replay: + */ + if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) + available = max((int) available - 2, 0); + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) + available = max((int) available - 1, 0); + + return available; +} + +/* returns number of sectors available for next journal entry: */ +int bch2_journal_entry_sectors(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + unsigned sectors_available = UINT_MAX; + unsigned i, nr_online = 0, nr_devs = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_required = 0; + + if (!ja->nr) + continue; + + sectors_available = min_t(unsigned, sectors_available, + ca->mi.bucket_size); + + /* + * Note that we don't allocate the space for a journal entry + * until we write it out - thus, if we haven't started the write + * for the previous entry we have to make sure we have space for + * it too: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx)) { + if (j->prev_buf_sectors > ja->sectors_free) + buckets_required++; + + if (j->prev_buf_sectors + sectors_available > + ja->sectors_free) + buckets_required++; + } else { + if (j->prev_buf_sectors + sectors_available > + ca->mi.bucket_size) + buckets_required++; + + buckets_required++; + } + + if (journal_dev_buckets_available(j, ca) >= buckets_required) + nr_devs++; + nr_online++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) + return -EROFS; + + if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) + return 0; + + return sectors_available; +} + +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned i, replicas, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + spin_lock(&j->lock); + e = bkey_i_to_s_extent(&j->key); + + /* + * Drop any pointers to devices that have been removed, are no longer + * empty, or filled up their current journal bucket: + * + * Note that a device may have had a small amount of free space (perhaps + * one sector) that wasn't enough for 
the smallest possible journal + * entry - that's why we drop pointers to devices <= current free space, + * i.e. whichever device was limiting the current journal entry size. + */ + extent_for_each_ptr_backwards(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors) + __bch2_extent_drop_ptr(e, ptr); + else + ca->journal.sectors_free -= sectors; + } + + replicas = bch2_extent_nr_ptrs(e.c); + + rcu_read_lock(); + devs_sorted = bch2_wp_alloc_list(c, &j->wp, + &c->rw_devs[BCH_DATA_JOURNAL]); + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability) + continue; + + ja = &ca->journal; + if (!ja->nr) + continue; + + if (replicas >= replicas_want) + break; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx) || + !journal_dev_buckets_available(j, ca) || + sectors > ca->mi.bucket_size) + continue; + + j->wp.next_alloc[ca->dev_idx] += U32_MAX; + bch2_wp_rescale(c, ca, &j->wp); + + ja->sectors_free = ca->mi.bucket_size - sectors; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + extent_ptr_append(bkey_i_to_extent(&j->key), + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]), + .dev = ca->dev_idx, + }); + + replicas += ca->mi.durability; + } + rcu_read_unlock(); + + j->prev_buf_sectors = 0; + + bkey_copy(&w->key, &j->key); + spin_unlock(&j->lock); + + if (replicas < c->opts.metadata_replicas_required) + return -EROFS; + + BUG_ON(!replicas); + + return 0; +} + +static void journal_write_compact(struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? 
vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->size); + kvpfree(buf->data, buf->size); + buf->data = new_buf; + buf->size = new_size; +} + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_prev_buf(j); + struct bch_devs_list devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + u64 seq = le64_to_cpu(w->data->seq); + + if (!devs.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + goto err; + } + + if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + goto err; +out: + bch2_time_stats_update(j->write_time, j->write_start_time); + + spin_lock(&j->lock); + j->last_seq_ondisk = seq; + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + + BUG_ON(!j->reservations.prev_buf_unwritten); + atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, + &j->reservations.counter); + + closure_wake_up(&w->wait); + journal_wake(j); + + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); + return; +err: + bch2_fatal_error(c); + bch2_journal_halt(j); + goto out; +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; + + spin_lock_irqsave(&j->err_lock, flags); + bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_prev_buf(j); + struct jset *jset; + struct bio *bio; + struct bch_extent_ptr *ptr; + unsigned i, sectors, bytes; + + journal_buf_realloc(j, w); + jset = w->data; + + j->write_start_time = local_clock(); + mutex_lock(&c->btree_root_lock); + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (r->alive) + bch2_journal_add_btree_root(w, i, &r->key, r->level); + } + c->btree_roots_dirty = false; + mutex_unlock(&c->btree_root_lock); + + journal_write_compact(jset); + + jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, 
bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + sectors = vstruct_sectors(jset, c->block_bits); + BUG_ON(sectors > j->prev_buf_sectors); + + bytes = vstruct_bytes(w->data); + memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + + if (journal_write_alloc(j, w, sectors)) { + bch2_journal_halt(j); + bch_err(c, "Unable to allocate journal write"); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); + return; + } + + /* + * XXX: we really should just disable the entire journal in nochanges + * mode + */ + if (c->opts.nochanges) + goto no_io; + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + sectors); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_iter.bi_size = sectors << 9; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bch2_bio_map(bio, jset); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + } + + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) + ptr->offset += sectors; + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +err: + bch2_inconsistent_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 index 000000000000..35f90c96008a --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + struct bch_devs_list devs; + /* must be last: */ + struct jset j; +}; + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (entry->type == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define for_each_jset_key(k, _n, entry, jset) \ + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + +int bch2_journal_set_seq(struct bch_fs *c, u64, u64); +int 
bch2_journal_read(struct bch_fs *, struct list_head *); + +int bch2_journal_entry_sectors(struct journal *); +void bch2_journal_write(struct closure *); + +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 index 000000000000..e5b8666fa052 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "super.h" + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +static inline u64 journal_pin_seq(struct journal *j, + struct journal_entry_pin_list *pin_list) +{ + return fifo_entry_idx_abs(&j->pin, pin_list); +} + +u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) +{ + u64 ret = 0; + + spin_lock(&j->lock); + if (journal_pin_active(pin)) + ret = journal_pin_seq(j, pin->pin_list); + spin_unlock(&j->lock); + + return ret; +} + +static inline void __journal_pin_add(struct journal *j, + struct journal_entry_pin_list *pin_list, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + BUG_ON(journal_pin_active(pin)); + BUG_ON(!atomic_read(&pin_list->count)); + + atomic_inc(&pin_list->count); + pin->pin_list = pin_list; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + INIT_LIST_HEAD(&pin->list); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + spin_unlock(&j->lock); +} + +static inline void __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list = pin->pin_list; + + if (!journal_pin_active(pin)) + return; + + pin->pin_list = NULL; + list_del_init(&pin->list); + + /* + * Unpinning a journal entry make make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + spin_lock(&j->lock); + __journal_pin_drop(j, pin); + spin_unlock(&j->lock); +} + +void bch2_journal_pin_add_if_older(struct journal *j, + struct journal_entry_pin *src_pin, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (journal_pin_active(src_pin) && + (!journal_pin_active(pin) || + journal_pin_seq(j, src_pin->pin_list) < + journal_pin_seq(j, pin->pin_list))) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + } + + spin_unlock(&j->lock); +} + +/* + * Journal reclaim: flush references to open journal entries to reclaim space in + * the journal + * + * May be done by the journal code in the background as needed to free up space + * for more journal entries, or as part of doing a clean shutdown, or to migrate + * data off of a specific device: + */ + +/** + * bch2_journal_reclaim_fast - do the fast part of journal reclaim + * + * Called from IO submission context, does not block. 
Cleans up after btree + * write completions by advancing the journal pin and each cache's last_idx, + * kicking off discards and background reclaim as necessary. + */ +void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + journal_wake(j); +} + +static struct journal_entry_pin * +__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret; + u64 iter; + + /* no need to iterate over empty fifo entries: */ + bch2_journal_reclaim_fast(j); + + fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { + if (iter > seq_to_flush) + break; + + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) { + /* must be list_del_init(), see bch2_journal_pin_drop() */ + list_move(&ret->list, &pin_list->flushed); + *seq = iter; + return ret; + } + } + + return NULL; +} + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin *ret; + + spin_lock(&j->lock); + ret = __journal_get_next_pin(j, seq_to_flush, seq); + spin_unlock(&j->lock); + + return ret; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + (ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); + spin_unlock(&j->lock); + + return ret; +} + +/** + * bch2_journal_reclaim_work - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
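+ *
+ * As a rough illustration (hypothetical numbers): with ja->nr = 8 journal
+ * buckets on a device and ja->cur_idx = 2, the code below picks
+ * bucket_to_flush = (2 + 4) % 8 = 6 and flushes journal pins up to
+ * ja->bucket_seq[6] - i.e. it aims to make about half of that device's
+ * journal buckets reclaimable. The pin FIFO contributes a similar 50%
+ * target via journal_cur_seq(j) - (j->pin.size >> 1).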
+ */ +void bch2_journal_reclaim_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, journal.reclaim_work); + struct journal *j = &c->journal; + struct bch_dev *ca; + struct journal_entry_pin *pin; + u64 seq, seq_to_flush = 0; + unsigned iter, bucket_to_flush; + unsigned long next_flush; + bool reclaim_lock_held = false, need_flush; + + /* + * Advance last_idx to point to the oldest journal entry containing + * btree node updates that have not yet been written out + */ + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (should_discard_bucket(j, ja)) { + if (!reclaim_lock_held) { + /* + * ugh: + * might be called from __journal_res_get() + * under wait_event() - have to go back to + * TASK_RUNNING before doing something that + * would block, but only if we're doing work: + */ + __set_current_state(TASK_RUNNING); + + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + /* recheck under reclaim_lock: */ + continue; + } + + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + spin_unlock(&j->lock); + + journal_wake(j); + } + + /* + * Write out enough btree nodes to free up 50% journal + * buckets + */ + spin_lock(&j->lock); + bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + seq_to_flush = max_t(u64, seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + spin_unlock(&j->lock); + } + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + + /* Also flush if the pin fifo is more than half full */ + spin_lock(&j->lock); + seq_to_flush = max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); + spin_unlock(&j->lock); + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + need_flush = time_after(jiffies, next_flush); + + while ((pin = journal_get_next_pin(j, need_flush + ? 
U64_MAX + : seq_to_flush, &seq))) { + __set_current_state(TASK_RUNNING); + pin->flush(j, pin, seq); + need_flush = false; + + j->last_flushed = jiffies; + } + + if (!test_bit(BCH_FS_RO, &c->flags)) + queue_delayed_work(system_freezable_wq, &j->reclaim_work, + msecs_to_jiffies(j->reclaim_delay_ms)); +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + struct journal_entry_pin **pin, + u64 *pin_seq) +{ + int ret; + + *pin = NULL; + + ret = bch2_journal_error(j); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * If journal replay hasn't completed, the unreplayed journal entries + * hold refs on their corresponding sequence numbers + */ + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || + !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || + (fifo_used(&j->pin) == 1 && + atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); + + return ret; +} + +void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +{ + struct journal_entry_pin *pin; + u64 pin_seq; + + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + while (1) { + wait_event(j->wait, journal_flush_done(j, seq_to_flush, + &pin, &pin_seq)); + if (!pin) + break; + + pin->flush(j, pin, pin_seq); + } +} + +int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct bch_devs_list devs; + u64 iter, seq = 0; + int ret = 0; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ? bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; + spin_unlock(&j->lock); + + bch2_journal_flush_pins(j, seq); + + ret = bch2_journal_error(j); + if (ret) + return ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + + seq = 0; + + spin_lock(&j->lock); + while (!ret && seq < j->pin.back) { + seq = max(seq, journal_last_seq(j)); + devs = journal_seq_pin(j, seq)->devs; + seq++; + + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); + spin_lock(&j->lock); + } + spin_unlock(&j->lock); + + ret = bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 index 000000000000..a93ed43cfc78 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H + +#define JOURNAL_PIN (32 * 1024) + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->pin_list != NULL; +} + +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->pin.front || seq >= j->pin.back); + + return &j->pin.data[seq & j->pin.mask]; +} + +u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); + +void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); +void bch2_journal_pin_add_if_older(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); + +void bch2_journal_reclaim_fast(struct journal *); +void bch2_journal_reclaim_work(struct work_struct *); + +void bch2_journal_flush_pins(struct journal *, u64); + +static inline 
void bch2_journal_flush_all_pins(struct journal *j) +{ + bch2_journal_flush_pins(j, U64_MAX); +} + +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 index 000000000000..c26f36d58633 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. + * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. + * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + * + * Blacklisted journal sequence numbers are themselves recorded as entries in + * the journal. 
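+ *
+ * Worked example (hypothetical numbers): the newest journal entry that made
+ * it to disk has seq 100, but on recovery the btree code finds a bset
+ * claiming journal seq 103. Those updates were never journalled, so the
+ * bset is ignored and seqs 101..103 are blacklisted: the journal will never
+ * write entries with those sequence numbers, and a blacklist entry recording
+ * the range is added to a future journal write so that later recoveries
+ * neither replay those seqs nor report them as missing.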
+ */ + +/* + * Called when journal needs to evict a blacklist entry to reclaim space: find + * any btree nodes that refer to the blacklist journal sequence numbers, and + * rewrite them: + */ +static void journal_seq_blacklist_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = + container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl = + container_of(pin, struct journal_seq_blacklist, pin); + struct blacklisted_node n; + struct closure cl; + unsigned i; + int ret; + + closure_init_stack(&cl); + + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, + 0, 0, BTREE_ITER_NODES); + + b = bch2_btree_iter_peek_node(&iter); + + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq) { + ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + bch2_fs_fatal_error(c, + "error %i rewriting btree node with blacklisted journal seq", + ret); + bch2_journal_halt(j); + return; + } + } + + bch2_btree_iter_unlock(&iter); + } + + for (i = 0;; i++) { + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + goto redo_wait; + } + + mutex_unlock(&c->btree_interior_update_lock); + } + + mutex_lock(&j->blacklist_lock); + + bch2_journal_pin_drop(j, &bl->pin); + list_del(&bl->list); + kfree(bl->entries); + kfree(bl); + + mutex_unlock(&j->blacklist_lock); +} + +/* + * Determine if a particular sequence number is blacklisted - if so, return + * blacklist entry: + */ +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (seq >= bl->start && seq <= bl->end) + return bl; + + return NULL; +} + +/* + * Allocate a new, in memory blacklist entry: + */ +static struct journal_seq_blacklist * +bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + /* + * When we start the journal, bch2_journal_start() will skip over @seq: + */ + + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + + bl->start = start; + bl->end = end; + + list_add_tail(&bl->list, &j->seq_blacklist); + return bl; +} + +/* + * Returns true if @seq is newer than the most recent journal entry that got + * written, and data corresponding to @seq should be ignored - also marks @seq + * as blacklisted so that on future restarts the corresponding data will still + * be ignored: + */ +int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +{ + struct journal *j = &c->journal; + 
struct journal_seq_blacklist *bl = NULL; + struct blacklisted_node *n; + u64 journal_seq; + int ret = 0; + + if (!seq) + return 0; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + /* Interier updates aren't journalled: */ + BUG_ON(b->level); + BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + + /* + * Decrease this back to j->seq + 2 when we next rev the on disk format: + * increasing it temporarily to work around bug in old kernels + */ + fsck_err_on(seq > journal_seq + 4, c, + "bset journal seq too far in the future: %llu > %llu", + seq, journal_seq); + + if (seq <= journal_seq && + list_empty_careful(&j->seq_blacklist)) + return 0; + + mutex_lock(&j->blacklist_lock); + + if (seq <= journal_seq) { + bl = bch2_journal_seq_blacklist_find(j, seq); + if (!bl) + goto out; + } else { + bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", + b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); + + if (!j->new_blacklist) { + j->new_blacklist = bch2_journal_seq_blacklisted_new(j, + journal_seq + 1, + journal_seq + 1); + if (!j->new_blacklist) { + ret = -ENOMEM; + goto out; + } + } + bl = j->new_blacklist; + bl->end = max(bl->end, seq); + } + + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = (struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: + ret = 1; +out: +fsck_err: + mutex_unlock(&j->blacklist_lock); + return ret; +} + +static int __bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i, + u64 start, u64 end) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl; + + bch_verbose(c, "blacklisting existing journal seq %llu-%llu", + start, end); + + bl = bch2_journal_seq_blacklisted_new(j, start, end); + if (!bl) + return -ENOMEM; + + bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, + journal_seq_blacklist_flush); + return 0; +} + +/* + * After reading the journal, find existing journal seq blacklist entries and + * read them into memory: + */ +int bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(&i->j, entry) { + switch (entry->type) { + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq)); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end)); + break; + } + } + + if (ret) + break; + } + + return ret; +} + +/* + * After reading the journal and walking the btree, we might have new journal + * sequence numbers to blacklist - add entries to the next journal entry to be + * written: + */ +void bch2_journal_seq_blacklist_write(struct journal *j) +{ + struct 
journal_seq_blacklist *bl = j->new_blacklist; + struct jset_entry_blacklist_v2 *bl_entry; + struct jset_entry *entry; + + if (!bl) + return; + + entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), + (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; + bl_entry->start = cpu_to_le64(bl->start); + bl_entry->end = cpu_to_le64(bl->end); + + bch2_journal_pin_add(j, + journal_cur_seq(j), + &bl->pin, + journal_seq_blacklist_flush); + + j->new_blacklist = NULL; +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 index 000000000000..b4a3b270e9d2 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +struct journal_replay; + +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *, u64); +int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +int bch2_journal_seq_blacklist_read(struct journal *, + struct journal_replay *); +void bch2_journal_seq_blacklist_write(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 index 000000000000..cf291227cffb --- /dev/null +++ b/fs/bcachefs/journal_types.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H + +#include +#include + +#include "alloc_types.h" +#include "super_types.h" +#include "fifo.h" + +struct journal_res; + +/* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. + */ +struct journal_buf { + struct jset *data; + + BKEY_PADDED(key); + + struct closure_waitlist wait; + + unsigned size; + unsigned disk_sectors; + /* bloom filter: */ + unsigned long has_inode[1024 / sizeof(unsigned long)]; +}; + +/* + * Something that makes a journal entry dirty - i.e. 
a btree node that has to be + * flushed: + */ + +struct journal_entry_pin_list { + struct list_head list; + struct list_head flushed; + atomic_t count; + struct bch_devs_list devs; +}; + +struct journal; +struct journal_entry_pin; +typedef void (*journal_pin_flush_fn)(struct journal *j, + struct journal_entry_pin *, u64); + +struct journal_entry_pin { + struct list_head list; + journal_pin_flush_fn flush; + struct journal_entry_pin_list *pin_list; +}; + +/* corresponds to a btree node with a blacklisted bset: */ +struct blacklisted_node { + __le64 seq; + enum btree_id btree_id; + struct bpos pos; +}; + +struct journal_seq_blacklist { + struct list_head list; + u64 start; + u64 end; + + struct journal_entry_pin pin; + + struct blacklisted_node *entries; + size_t nr_entries; +}; + +struct journal_res { + bool ref; + u8 idx; + u16 u64s; + u32 offset; + u64 seq; +}; + +union journal_res_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 cur_entry_offset:20, + idx:1, + prev_buf_unwritten:1, + buf0_count:21, + buf1_count:21; + }; +}; + +/* bytes: */ +#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ + +/* + * We stash some journal state as sentinal values in cur_entry_offset: + * note - cur_entry_offset is in units of u64s + */ +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) + +#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) +#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) + +/* + * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, + * either because something's waiting on the write to complete or because it's + * been dirty too long and the timer's expired. + */ + +enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, + JOURNAL_NEED_WRITE, + JOURNAL_NOT_EMPTY, +}; + +/* Embedded in struct bch_fs */ +struct journal { + /* Fastpath stuff up front: */ + + unsigned long flags; + + union journal_res_state reservations; + unsigned cur_entry_u64s; + unsigned prev_buf_sectors; + unsigned cur_buf_sectors; + unsigned buf_size_want; + + /* + * Two journal entries -- one is currently open for new entries, the + * other is possibly being written out. + */ + struct journal_buf buf[2]; + + spinlock_t lock; + + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + struct closure_waitlist async_wait; + + struct closure io; + struct delayed_work write_work; + + /* Sequence number of most recent journal entry (last entry in @pin) */ + atomic64_t seq; + + /* last_seq from the most recent journal entry written */ + u64 last_seq_ondisk; + + /* + * FIFO of journal entries whose btree updates have not yet been + * written out. + * + * Each entry is a reference count. The position in the FIFO is the + * entry's sequence number relative to @seq. + * + * The journal entry itself holds a reference count, put when the + * journal entry is written out. Each btree node modified by the journal + * entry also holds a reference count, put when the btree node is + * written. + * + * When a reference count reaches zero, the journal entry is no longer + * needed. When all journal entries in the oldest journal bucket are no + * longer needed, the bucket can be discarded and reused. 
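+ *
+ * Illustrative example (hypothetical numbers): if the newest entry is seq 7
+ * but a btree node dirtied by the update journalled in seq 5 has not been
+ * written back yet, the pin list for seq 5 still has a nonzero count, so
+ * last_seq cannot advance past 5 and the journal bucket holding entry 5
+ * cannot be discarded or reused yet.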
+ */ + struct { + u64 front, back, size, mask; + struct journal_entry_pin_list *data; + } pin; + u64 replay_journal_seq; + + struct mutex blacklist_lock; + struct list_head seq_blacklist; + struct journal_seq_blacklist *new_blacklist; + + BKEY_PADDED(key); + struct write_point wp; + spinlock_t err_lock; + + struct delayed_work reclaim_work; + unsigned long last_flushed; + + /* protects advancing ja->last_idx: */ + struct mutex reclaim_lock; + unsigned write_delay_ms; + unsigned reclaim_delay_ms; + + u64 res_get_blocked_start; + u64 need_write_time; + u64 write_start_time; + + struct bch2_time_stats *write_time; + struct bch2_time_stats *delay_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map res_map; +#endif +}; + +/* + * Embedded in struct bch_dev. First three fields refer to the array of journal + * buckets, in bch_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + u64 *bucket_seq; + + unsigned sectors_free; + + /* Journal bucket we're currently writing to */ + unsigned cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + + /* + * j->lock and j->reclaim_lock must both be held to modify, j->lock + * sufficient to read: + */ + unsigned last_idx; + unsigned nr; + u64 *buckets; + + /* Bio for journal reads/writes to this device */ + struct bio *bio; + + /* for bch_journal_read_device */ + struct closure read; +}; + +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 index 000000000000..5da54ced9cad --- /dev/null +++ b/fs/bcachefs/keylist.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, + size_t nr_inline_u64s, size_t new_u64s) +{ + size_t oldsize = bch_keylist_u64s(l); + size_t newsize = oldsize + new_u64s; + u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; + u64 *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= nr_inline_u64s || + (old_buf && roundup_pow_of_two(oldsize) == newsize)) + return 0; + + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); + if (!new_keys) + return -ENOMEM; + + if (!old_buf) + memcpy_u64s(new_keys, inline_u64s, oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) +{ + struct bkey_i *where; + + for_each_keylist_key(l, where) + if (bkey_cmp(insert->k.p, where->k.p) < 0) + break; + + memmove_u64s_up((u64 *) where + insert->k.u64s, + where, + ((u64 *) l->top) - ((u64 *) where)); + + l->top_p += insert->k.u64s; + bkey_copy(where, insert); +} + +void bch2_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bch2_keylist_front(l)->k.u64s; + + memmove_u64s_down(l->keys, + bkey_next(l->keys), + bch_keylist_u64s(l)); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *l) +{ + struct bkey_i *k; + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && + bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); +} +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 index 000000000000..a7ff86b08abc --- /dev/null +++ b/fs/bcachefs/keylist.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H + +#include "keylist_types.h" + +int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); +void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); +void bch2_keylist_pop_front(struct keylist *); + +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) +{ + l->top_p = l->keys_p = inline_keys; +} + +static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); + bch2_keylist_init(l, inline_keys); +} + +static inline void bch2_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) +{ + bkey_copy(l->top, k); + bch2_keylist_push(l); +} + +static inline bool bch2_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline size_t bch_keylist_u64s(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch2_keylist_bytes(struct keylist *l) +{ + return bch_keylist_u64s(l) * sizeof(u64); +} + +static inline struct bkey_i *bch2_keylist_front(struct keylist *l) +{ + return l->keys; +} + +#define for_each_keylist_key(_keylist, _k) \ + for (_k = (_keylist)->keys; \ + _k != (_keylist)->top; \ + _k = bkey_next(_k)) + +static inline u64 keylist_sectors(struct keylist *keys) +{ + struct bkey_i *k; + u64 ret = 0; + + for_each_keylist_key(keys, k) + ret += k->k.size; + + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *); +#else +static inline void bch2_verify_keylist_sorted(struct keylist *l) {} +#endif + +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h new file mode 100644 index 000000000000..4b3ff7d8a875 --- /dev/null +++ b/fs/bcachefs/keylist_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_TYPES_H +#define _BCACHEFS_KEYLIST_TYPES_H + +struct keylist { + union { + struct bkey_i *keys; + u64 *keys_p; + }; + union { + struct bkey_i *top; + u64 *top_p; + }; +}; + +#endif /* 
_BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 index 000000000000..8f618dc5160d --- /dev/null +++ b/fs/bcachefs/migrate.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for moving data off a device. + */ + +#include "bcachefs.h" +#include "btree_update.h" +#include "buckets.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "migrate.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" + +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, + unsigned dev_idx, int flags, bool metadata) +{ + unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; + unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; + unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; + unsigned nr_good; + + bch2_extent_drop_device(e, dev_idx); + + nr_good = bch2_extent_durability(c, e.c); + if ((!nr_good && !(flags & lost)) || + (nr_good < replicas && !(flags & degraded))) + return -EINVAL; + + return 0; +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct bkey_s_c k; + struct bkey_s_extent e; + BKEY_PADDED(key) tmp; + struct btree_iter iter; + int ret = 0; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_PREFETCH); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { + if (!bkey_extent_is_data(k.k) || + !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + if (ret) + break; + bch2_btree_iter_next(&iter); + continue; + } + + bkey_reassemble(&tmp.key, k); + e = bkey_i_to_s_extent(&tmp.key); + + ret = drop_dev_ptrs(c, e, dev_idx, flags, false); + if (ret) + break; + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_ERROR key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, e.s); + + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + bkey_i_to_s_c(&tmp.key)); + if (ret) + break; + + iter.pos = bkey_start_pos(&tmp.key.k); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &tmp.key)); + + /* + * don't want to leave ret == -EINTR, since if we raced and + * something else overwrote the key we could spuriously return + * -EINTR below: + */ + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } + + bch2_btree_iter_unlock(&iter); + + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} + +static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct btree_iter iter; + struct closure cl; + struct btree *b; + unsigned id; + int ret; + + /* don't handle this yet: */ + if (flags & BCH_FORCE_IF_METADATA_LOST) + return -EINVAL; + + closure_init_stack(&cl); + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); + + for (id = 0; id < BTREE_ID_NR; id++) { + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_i_extent *new_key; +retry: + if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key), + dev_idx)) { + /* + * we might have found a btree 
node key we + * needed to update, and then tried to update it + * but got -EINTR after upgrading the iter, but + * then raced and the node is now gone: + */ + bch2_btree_iter_downgrade(&iter); + + ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + bkey_i_to_s_c(&b->key)); + if (ret) + goto err; + } else { + bkey_copy(&tmp.k, &b->key); + new_key = bkey_i_to_extent(&tmp.k); + + ret = drop_dev_ptrs(c, extent_i_to_s(new_key), + dev_idx, flags, true); + if (ret) + goto err; + + ret = bch2_btree_node_update_key(c, &iter, b, new_key); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(&iter); + goto retry; + } + if (ret) + goto err; + } + } + bch2_btree_iter_unlock(&iter); + } + + ret = 0; +out: + ret = bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +err: + bch2_btree_iter_unlock(&iter); + goto out; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, dev_idx, flags); +} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h new file mode 100644 index 000000000000..027efaa0d575 --- /dev/null +++ b/fs/bcachefs/migrate.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MIGRATE_H +#define _BCACHEFS_MIGRATE_H + +int bch2_dev_data_drop(struct bch_fs *, unsigned, int); + +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 index 000000000000..b6310a60d5b7 --- /dev/null +++ b/fs/bcachefs/move.c @@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "inode.h" +#include "io.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include +#include + +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 + +struct moving_io { + struct list_head list; + struct closure cl; + bool read_completed; + + unsigned read_sectors; + unsigned write_sectors; + + struct bch_read_bio rbio; + + struct migrate_write write; + /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; + +struct moving_context { + /* Closure for waiting on all reads and writes to complete */ + struct closure cl; + + struct bch_move_stats *stats; + + struct list_head reads; + + /* in flight sectors: */ + atomic_t read_sectors; + atomic_t write_sectors; + + wait_queue_head_t wait; +}; + +static int bch2_migrate_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct migrate_write *m = + container_of(op, struct migrate_write, op); + struct keylist *keys = &op->insert_keys; + struct btree_iter iter; + int ret = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + struct bkey_i_extent *insert, *new = + bkey_i_to_extent(bch2_keylist_front(keys)); + BKEY_PADDED(k) _new, _insert; + struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + bool did_work = false; + int nr; + + if (btree_iter_err(k)) { + ret = bch2_btree_iter_unlock(&iter); + break; + } + + if (bversion_cmp(k.k->version, new->k.version) || + !bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), + m->ptr, m->offset)) + goto nomatch; + + if (m->data_cmd == DATA_REWRITE && + !bch2_extent_has_device(bkey_s_c_to_extent(k), + 
m->data_opts.rewrite_dev)) + goto nomatch; + + bkey_reassemble(&_insert.k, k); + insert = bkey_i_to_extent(&_insert.k); + + bkey_copy(&_new.k, bch2_keylist_front(keys)); + new = bkey_i_to_extent(&_new.k); + + bch2_cut_front(iter.pos, &insert->k_i); + bch2_cut_back(new->k.p, &insert->k); + bch2_cut_back(insert->k.p, &new->k); + + if (m->data_cmd == DATA_REWRITE) { + ptr = (struct bch_extent_ptr *) + bch2_extent_has_device(extent_i_to_s_c(insert), + m->data_opts.rewrite_dev); + bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); + } + + extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + /* + * raced with another move op? extent already + * has a pointer to the device we just wrote + * data to + */ + continue; + } + + bch2_extent_crc_append(insert, crc); + extent_ptr_append(insert, *ptr); + did_work = true; + } + + if (!did_work) + goto nomatch; + + bch2_extent_narrow_crcs(insert, + (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, extent_i_to_s(insert).s); + bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); + + /* + * It's possible we race, and for whatever reason the extent now + * has fewer replicas than when we last looked at it - meaning + * we need to get a disk reservation here: + */ + nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + (bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); + if (nr > 0) { + /* + * can't call bch2_disk_reservation_add() with btree + * locks held, at least not without a song and dance + */ + bch2_btree_iter_unlock(&iter); + + ret = bch2_disk_reservation_add(c, &op->res, + keylist_sectors(keys) * nr, 0); + if (ret) + goto out; + + m->nr_ptrs_reserved += nr; + goto next; + } + + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + extent_i_to_s_c(insert).s_c); + if (ret) + break; + + ret = bch2_btree_insert_at(c, &op->res, + NULL, op_journal_seq(op), + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + m->data_opts.btree_insert_flags, + BTREE_INSERT_ENTRY(&iter, &insert->k_i)); + if (!ret) + atomic_long_inc(&c->extent_migrate_done); + if (ret == -EINTR) + ret = 0; + if (ret) + break; +next: + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + + bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + continue; +nomatch: + if (m->ctxt) + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->stats->sectors_raced); + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); + bch2_btree_iter_next_slot(&iter); + goto next; + } +out: + bch2_btree_iter_unlock(&iter); + return ret; +} + +void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->ptr = rbio->pick.ptr; + m->offset = rbio->pos.offset - rbio->pick.crc.offset; + m->op.devs_have = rbio->devs_have; + m->op.pos = rbio->pos; + m->op.version = rbio->version; + m->op.crc = rbio->pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { + m->op.nonce = m->op.crc.nonce + m->op.crc.offset; + m->op.csum_type = m->op.crc.csum_type; + } + + if (m->data_cmd == DATA_REWRITE) + bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); +} + +int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + struct 
write_point_specifier wp, + struct bch_io_opts io_opts, + enum data_cmd data_cmd, + struct data_opts data_opts, + struct bkey_s_c k) +{ + int ret; + + m->data_cmd = data_cmd; + m->data_opts = data_opts; + m->nr_ptrs_reserved = 0; + + bch2_write_op_init(&m->op, c, io_opts); + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + m->op.target = data_opts.target, + m->op.write_point = wp; + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + m->op.alloc_reserve = RESERVE_MOVINGGC; + + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| + BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_NOMARK_REPLICAS; + + m->op.nr_replicas = 1; + m->op.nr_replicas_required = 1; + m->op.index_update_fn = bch2_migrate_index_update; + + switch (data_cmd) { + case DATA_ADD_REPLICAS: { + int nr = (int) io_opts.data_replicas - + bch2_extent_nr_dirty_ptrs(k); + + if (nr > 0) { + m->op.nr_replicas = m->nr_ptrs_reserved = nr; + + ret = bch2_disk_reservation_get(c, &m->op.res, + k.k->size, m->op.nr_replicas, 0); + if (ret) + return ret; + } + break; + } + case DATA_REWRITE: + break; + case DATA_PROMOTE: + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + m->op.flags |= BCH_WRITE_CACHED; + break; + default: + BUG(); + } + + return 0; +} + +static void move_free(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; + struct bio_vec *bv; + + bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); + + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) + if (bv->bv_page) + __free_page(bv->bv_page); + + wake_up(&ctxt->wait); + + kfree(io); +} + +static void move_write_done(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + closure_return_with_destructor(cl, move_free); +} + +static void move_write(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + closure_return_with_destructor(cl, move_free); + return; + } + + bch2_migrate_read_done(&io->write, &io->rbio); + + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + closure_call(&io->write.op.cl, bch2_write, NULL, cl); + continue_at(cl, move_write_done, NULL); +} + +static inline struct moving_io *next_pending_write(struct moving_context *ctxt) +{ + struct moving_io *io = + list_first_entry_or_null(&ctxt->reads, struct moving_io, list); + + return io && io->read_completed ? 
io : NULL; +} + +static void move_read_endio(struct bio *bio) +{ + struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); + io->read_completed = true; + + if (next_pending_write(ctxt)) + wake_up(&ctxt->wait); + + closure_put(&ctxt->cl); +} + +static void do_pending_writes(struct moving_context *ctxt) +{ + struct moving_io *io; + + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); + } +} + +#define move_ctxt_wait_event(_ctxt, _cond) \ +do { \ + do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + + move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} + +static int bch2_move_extent(struct bch_fs *c, + struct moving_context *ctxt, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + struct bkey_s_c_extent e, + enum data_cmd data_cmd, + struct data_opts data_opts) +{ + struct moving_io *io; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned sectors = e.k->size, pages; + int ret = -ENOMEM; + + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + /* write path might have to decompress data: */ + extent_for_each_ptr_crc(e, ptr, crc) + sectors = max_t(unsigned, sectors, crc.uncompressed_size); + + pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + io = kzalloc(sizeof(struct moving_io) + + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!io) + goto err; + + io->write.ctxt = ctxt; + io->read_sectors = e.k->size; + io->write_sectors = e.k->size; + + bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); + bio_set_prio(&io->write.op.wbio.bio, + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, + GFP_KERNEL)) + goto err_free; + + io->rbio.opts = io_opts; + bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); + io->rbio.bio.bi_vcnt = pages; + bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_iter.bi_size = sectors << 9; + + io->rbio.bio.bi_opf = REQ_OP_READ; + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k); + io->rbio.bio.bi_end_io = move_read_endio; + + ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, + data_cmd, data_opts, e.s_c); + if (ret) + goto err_free_pages; + + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(e.k->size, &ctxt->stats->sectors_moved); + + trace_move_extent(e.k); + + atomic_add(io->read_sectors, &ctxt->read_sectors); + list_add_tail(&io->list, &ctxt->reads); + + /* + * dropped by move_read_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); + bch2_read_extent(c, &io->rbio, e.s_c, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; +err_free_pages: + bio_free_pages(&io->write.op.wbio.bio); +err_free: + kfree(io); +err: + trace_move_alloc_fail(e.k); + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, 
+ move_pred_fn pred, void *arg, + struct bch_move_stats *stats) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct moving_context ctxt = { .stats = stats }; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + struct bkey_s_c_extent e; + struct data_opts data_opts; + enum data_cmd data_cmd; + u64 cur_inum = U64_MAX; + int ret = 0, ret2; + + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_USER; + bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start, + BTREE_ITER_PREFETCH); + + if (rate) + bch2_ratelimit_reset(rate); + + while (!kthread || !(ret = kthread_should_stop())) { + if (rate && + bch2_ratelimit_delay(rate) && + (bch2_btree_iter_unlock(&stats->iter), + (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) + break; +peek: + k = bch2_btree_iter_peek(&stats->iter); + if (!k.k) + break; + ret = btree_iter_err(k); + if (ret) + break; + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (!bkey_extent_is_data(k.k)) + goto next_nondata; + + e = bkey_s_c_to_extent(k); + + if (cur_inum != k.k->p.inode) { + struct bch_inode_unpacked inode; + + /* don't hold btree locks while looking up inode: */ + bch2_btree_iter_unlock(&stats->iter); + + io_opts = bch2_opts_to_inode_opts(c->opts); + if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) + bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); + cur_inum = k.k->p.inode; + goto peek; + } + + switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e, + &io_opts, &data_opts))) { + case DATA_SKIP: + goto next; + case DATA_SCRUB: + BUG(); + case DATA_ADD_REPLICAS: + case DATA_REWRITE: + case DATA_PROMOTE: + break; + default: + BUG(); + } + + /* unlock before doing IO: */ + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&stats->iter); + + ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, + bkey_s_c_to_extent(k), + data_cmd, data_opts); + if (ret2) { + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } + + /* XXX signal failure */ + goto next; + } + + if (rate) + bch2_ratelimit_increment(rate, k.k->size); +next: + atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), + &stats->sectors_seen); +next_nondata: + bch2_btree_iter_next(&stats->iter); + bch2_btree_iter_cond_resched(&stats->iter); + } + + bch2_btree_iter_unlock(&stats->iter); + + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); + + trace_move_data(c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); + + return ret; +} + +static int bch2_gc_data_replicas(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH, k) { + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + if (ret) + break; + } + ret = bch2_btree_iter_unlock(&iter) ?: ret; + + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} + +static int bch2_gc_btree_replicas(struct bch_fs *c) +{ + struct btree_iter iter; + struct btree *b; + unsigned id; + int ret = 0; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); + + for (id = 0; id < BTREE_ID_NR; id++) { + 
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + bkey_i_to_s_c(&b->key)); + + bch2_btree_iter_cond_resched(&iter); + } + + ret = bch2_btree_iter_unlock(&iter) ?: ret; + } + + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} + +static int bch2_move_btree(struct bch_fs *c, + move_pred_fn pred, + void *arg, + struct bch_move_stats *stats) +{ + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree *b; + unsigned id; + struct data_opts data_opts; + enum data_cmd cmd; + int ret = 0; + + stats->data_type = BCH_DATA_BTREE; + + for (id = 0; id < BTREE_ID_NR; id++) { + for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE, + bkey_i_to_s_c_extent(&b->key), + &io_opts, + &data_opts))) { + case DATA_SKIP: + goto next; + case DATA_SCRUB: + BUG(); + case DATA_ADD_REPLICAS: + case DATA_REWRITE: + break; + default: + BUG(); + } + + ret = bch2_btree_node_rewrite(c, &stats->iter, + b->data->keys.seq, 0) ?: ret; +next: + bch2_btree_iter_cond_resched(&stats->iter); + } + + ret = bch2_btree_iter_unlock(&stats->iter) ?: ret; + } + + return ret; +} + +#if 0 +static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return DATA_SCRUB; +} +#endif + +static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + unsigned nr_good = bch2_extent_durability(c, e); + unsigned replicas = type == BKEY_TYPE_BTREE + ? c->opts.metadata_replicas + : io_opts->data_replicas; + + if (!nr_good || nr_good >= replicas) + return DATA_SKIP; + + data_opts->target = 0; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; +} + +static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + struct bch_ioctl_data *op = arg; + + if (!bch2_extent_has_device(e, op->migrate.dev)) + return DATA_SKIP; + + data_opts->target = 0; + data_opts->btree_insert_flags = 0; + data_opts->rewrite_dev = op->migrate.dev; + return DATA_REWRITE; +} + +int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +{ + int ret = 0; + + switch (op.op) { + case BCH_DATA_OP_REREPLICATE: + stats->data_type = BCH_DATA_JOURNAL; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + + ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + ret = bch2_gc_btree_replicas(c) ?: ret; + + ret = bch2_move_data(c, NULL, + writepoint_hashed((unsigned long) current), + op.start, + op.end, + rereplicate_pred, c, stats) ?: ret; + ret = bch2_gc_data_replicas(c) ?: ret; + break; + case BCH_DATA_OP_MIGRATE: + if (op.migrate.dev >= c->sb.nr_devices) + return -EINVAL; + + stats->data_type = BCH_DATA_JOURNAL; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + + ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; + ret = bch2_gc_btree_replicas(c) ?: ret; + + ret = bch2_move_data(c, NULL, + writepoint_hashed((unsigned long) current), + op.start, + op.end, + migrate_pred, &op, stats) ?: ret; + ret = bch2_gc_data_replicas(c) ?: ret; + break; + default: + ret = -EINVAL; + } + + return ret; +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file 
mode 100644 index 000000000000..3f7e31cc8f6e --- /dev/null +++ b/fs/bcachefs/move.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H + +#include "btree_iter.h" +#include "buckets.h" +#include "io_types.h" +#include "move_types.h" + +struct bch_read_bio; +struct moving_context; + +enum data_cmd { + DATA_SKIP, + DATA_SCRUB, + DATA_ADD_REPLICAS, + DATA_REWRITE, + DATA_PROMOTE, +}; + +struct data_opts { + u16 target; + unsigned rewrite_dev; + int btree_insert_flags; +}; + +struct migrate_write { + enum data_cmd data_cmd; + struct data_opts data_opts; + + unsigned nr_ptrs_reserved; + + struct moving_context *ctxt; + + /* what we read: */ + struct bch_extent_ptr ptr; + u64 offset; + + struct bch_write_op op; +}; + +void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); +int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, + struct write_point_specifier, + struct bch_io_opts, + enum data_cmd, struct data_opts, + struct bkey_s_c); + +typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, + enum bkey_type, struct bkey_s_c_extent, + struct bch_io_opts *, struct data_opts *); + +int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, + struct write_point_specifier, + struct bpos, struct bpos, + move_pred_fn, void *, + struct bch_move_stats *); + +int bch2_data_job(struct bch_fs *, + struct bch_move_stats *, + struct bch_ioctl_data); + +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 index 000000000000..8dbeb6ef727c --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_TYPES_H +#define _BCACHEFS_MOVE_TYPES_H + +struct bch_move_stats { + enum bch_data_type data_type; + struct btree_iter iter; + + atomic64_t keys_moved; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +}; + +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 index 000000000000..8b61b163faf5 --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "extents.h" +#include "eytzinger.h" +#include "io.h" +#include "keylist.h" +#include "move.h" +#include "movinggc.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include + +/* + * We can't use the entire copygc reserve in one iteration of copygc: we may + * need the buckets we're freeing up to go back into the copygc reserve to make + * forward progress, but if the copygc reserve is full they'll be available for + * any allocation - and it's possible that in a given iteration, we free up most + * of the buckets we're going to free before we allocate most of the buckets + * we're going to allocate. 
+ * + * If we only use half of the reserve per iteration, then in steady state we'll + * always have room in the reserve for the buckets we're going to need in the + * next iteration: + */ +#define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) + +/* + * Max sectors to move per iteration: Have to take into account internal + * fragmentation from the multiple write points for each generation: + */ +#define COPYGC_SECTORS_PER_ITER(ca) \ + ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) + +static inline int sectors_used_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) +{ + return (l.sectors > r.sectors) - (l.sectors < r.sectors); +} + +static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) +{ + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; + + return (l->offset > r->offset) - (l->offset < r->offset); +} + +static bool __copygc_pred(struct bch_dev *ca, + struct bkey_s_c_extent e) +{ + copygc_heap *h = &ca->copygc_heap; + const struct bch_extent_ptr *ptr = + bch2_extent_has_device(e, ca->dev_idx); + + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; + + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); + + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); + } + + return false; +} + +static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + struct bch_dev *ca = arg; + + if (!__copygc_pred(ca, e)) + return DATA_SKIP; + + data_opts->target = dev_to_target(ca->dev_idx); + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = ca->dev_idx; + return DATA_REWRITE; +} + +static bool have_copygc_reserve(struct bch_dev *ca) +{ + bool ret; + + spin_lock(&ca->freelist_lock); + ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || + ca->allocator_blocked; + spin_unlock(&ca->freelist_lock); + + return ret; +} + +static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) +{ + copygc_heap *h = &ca->copygc_heap; + struct copygc_heap_entry e, *i; + struct bucket_array *buckets; + struct bch_move_stats move_stats; + u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + size_t b; + int ret; + + memset(&move_stats, 0, sizeof(move_stats)); + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have been visited. 
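+ *
+ * (Hypothetical illustration: with a 512-bucket copygc reserve,
+ * COPYGC_BUCKETS_PER_ITER is 256; after the scan below, buckets are popped
+ * off the heap - fullest first - until at most COPYGC_SECTORS_PER_ITER
+ * worth of used sectors remain, so only the emptiest buckets are evacuated
+ * this iteration.)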
+ */ + h->used = 0; + + /* + * We need bucket marks to be up to date - gc can't be recalculating + * them: + */ + down_read(&c->gc_lock); + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct copygc_heap_entry e; + + if (m.owned_by_allocator || + m.data_type != BCH_DATA_USER || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + + e = (struct copygc_heap_entry) { + .gen = m.gen, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), + }; + heap_add_or_replace(h, e, -sectors_used_cmp); + } + up_read(&ca->bucket_lock); + up_read(&c->gc_lock); + + for (i = h->data; i < h->data + h->used; i++) + sectors_to_move += i->sectors; + + while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { + BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); + sectors_to_move -= e.sectors; + } + + buckets_to_move = h->used; + + if (!buckets_to_move) + return; + + eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, NULL); + + ret = bch2_move_data(c, &ca->copygc_pd.rate, + writepoint_ptr(&ca->copygc_write_point), + POS_MIN, POS_MAX, + copygc_pred, ca, + &move_stats); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for (i = h->data; i < h->data + h->used; i++) { + size_t b = sector_to_bucket(ca, i->offset); + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } + } + up_read(&ca->bucket_lock); + + if (sectors_not_moved && !ret) + bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + sectors_not_moved, sectors_to_move, + buckets_not_moved, buckets_to_move); + + trace_copygc(ca, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, + buckets_to_move, buckets_not_moved); +} + +static int bch2_copygc_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + struct io_clock *clock = &c->io_clock[WRITE]; + struct bch_dev_usage usage; + unsigned long last; + u64 available, fragmented, reserve, next; + + set_freezable(); + + while (!kthread_should_stop()) { + if (kthread_wait_freezable(c->copy_gc_enabled)) + break; + + last = atomic_long_read(&clock->now); + + reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * + ca->mi.bucket_size * + c->opts.gc_reserve_percent, 200); + + usage = bch2_dev_usage_read(c, ca); + + /* + * don't start copygc until less than half the gc reserve is + * available: + */ + available = __dev_buckets_available(ca, usage) * + ca->mi.bucket_size; + if (available > reserve) { + next = last + available - reserve; + bch2_kthread_io_clock_wait(clock, next, + MAX_SCHEDULE_TIMEOUT); + continue; + } + + /* + * don't start copygc until there's more than half the copygc + * reserve of fragmented space: + */ + fragmented = usage.sectors_fragmented; + if (fragmented < reserve) { + next = last + reserve - fragmented; + bch2_kthread_io_clock_wait(clock, next, + MAX_SCHEDULE_TIMEOUT); + continue; + } + + bch2_copygc(c, ca); + } + + return 0; +} + +void bch2_copygc_stop(struct bch_dev *ca) +{ + ca->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&ca->copygc_pd.rate); + + if (ca->copygc_thread) { + kthread_stop(ca->copygc_thread); + put_task_struct(ca->copygc_thread); + } + ca->copygc_thread = NULL; +} + +int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) +{ + struct task_struct *t; 
+ + BUG_ON(ca->copygc_thread); + + if (c->opts.nochanges) + return 0; + + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, ca, + "bch_copygc[%s]", ca->name); + if (IS_ERR(t)) + return PTR_ERR(t); + + get_task_struct(t); + + ca->copygc_thread = t; + wake_up_process(ca->copygc_thread); + + return 0; +} + +void bch2_dev_copygc_init(struct bch_dev *ca) +{ + bch2_pd_controller_init(&ca->copygc_pd); + ca->copygc_pd.d_term = 0; +} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h new file mode 100644 index 000000000000..dcd479632cf1 --- /dev/null +++ b/fs/bcachefs/movinggc.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H + +void bch2_copygc_stop(struct bch_dev *); +int bch2_copygc_start(struct bch_fs *, struct bch_dev *); +void bch2_dev_copygc_init(struct bch_dev *); + +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 index 000000000000..9351caeb6630 --- /dev/null +++ b/fs/bcachefs/opts.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "bcachefs.h" +#include "disk_groups.h" +#include "opts.h" +#include "super-io.h" +#include "util.h" + +const char * const bch2_error_actions[] = { + "continue", + "remount-ro", + "panic", + NULL +}; + +const char * const bch2_csum_types[] = { + "none", + "crc32c", + "crc64", + NULL +}; + +const char * const bch2_compression_types[] = { + "none", + "lz4", + "gzip", + "zstd", + NULL +}; + +const char * const bch2_str_hash_types[] = { + "crc32c", + "crc64", + "siphash", + NULL +}; + +const char * const bch2_data_types[] = { + "none", + "sb", + "journal", + "btree", + "data", + "cached", + NULL +}; + +const char * const bch2_cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +const char * const bch2_cache_modes[] = { + "default", + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +const char * const bch2_dev_state[] = { + "readwrite", + "readonly", + "failed", + "spare", + NULL +}; + +void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define BCH_OPT(_name, ...) \ + if (opt_defined(src, _name)) \ + opt_set(*dst, _name, src._name); + + BCH_OPTS() +#undef BCH_OPT +} + +bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define BCH_OPT(_name, ...) \ + case Opt_##_name: \ + return opt_defined(*opts, _name); + BCH_OPTS() +#undef BCH_OPT + default: + BUG(); + } +} + +u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define BCH_OPT(_name, ...) \ + case Opt_##_name: \ + return opts->_name; + BCH_OPTS() +#undef BCH_OPT + default: + BUG(); + } +} + +void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) +{ + switch (id) { +#define BCH_OPT(_name, ...) 
\ + case Opt_##_name: \ + opt_set(*opts, _name, v); \ + break; + BCH_OPTS() +#undef BCH_OPT + default: + BUG(); + } +} + +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) +{ + struct bch_opts opts = bch2_opts_empty(); + +#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ + if (_sb_opt != NO_SB_OPT) \ + opt_set(opts, _name, _sb_opt(sb)); + BCH_OPTS() +#undef BCH_OPT + + return opts; +} + +const struct bch_option bch2_opt_table[] = { +#define OPT_BOOL() .type = BCH_OPT_BOOL +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, \ + .parse = _fn##_parse, \ + .print = _fn##_print + +#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ + .mode = _mode == OPT_RUNTIME ? 0644 : 0444, \ + }, \ + .mode = _mode, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, + + BCH_OPTS() +#undef BCH_OPT +}; + +int bch2_opt_lookup(const char *name) +{ + const struct bch_option *i; + + for (i = bch2_opt_table; + i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); + i++) + if (!strcmp(name, i->attr.name)) + return i - bch2_opt_table; + + return -1; +} + +struct synonym { + const char *s1, *s2; +}; + +static const struct synonym bch_opt_synonyms[] = { + { "quota", "usrquota" }, +}; + +static int bch2_mount_opt_lookup(const char *name) +{ + const struct synonym *i; + + for (i = bch_opt_synonyms; + i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; + + return bch2_opt_lookup(name); +} + +int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, + const char *val, u64 *res) +{ + ssize_t ret; + + switch (opt->type) { + case BCH_OPT_BOOL: + ret = kstrtou64(val, 10, res); + if (ret < 0) + return ret; + + if (*res > 1) + return -ERANGE; + break; + case BCH_OPT_UINT: + ret = kstrtou64(val, 10, res); + if (ret < 0) + return ret; + + if (*res < opt->min || *res >= opt->max) + return -ERANGE; + break; + case BCH_OPT_STR: + ret = match_string(opt->choices, -1, val); + if (ret < 0) + return ret; + + *res = ret; + break; + case BCH_OPT_FN: + if (!c) + return -EINVAL; + + return opt->parse(c, val, res); + } + + return 0; +} + +int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len, + const struct bch_option *opt, u64 v, + unsigned flags) +{ + char *out = buf, *end = buf + len; + + if (flags & OPT_SHOW_MOUNT_STYLE) { + if (opt->type == BCH_OPT_BOOL) + return scnprintf(out, end - out, "%s%s", + v ? "" : "no", + opt->attr.name); + + out += scnprintf(out, end - out, "%s=", opt->attr.name); + } + + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: + out += scnprintf(out, end - out, "%lli", v); + break; + case BCH_OPT_STR: + out += (flags & OPT_SHOW_FULL_LIST) + ? 
bch2_scnprint_string_list(out, end - out, opt->choices, v) + : scnprintf(out, end - out, opt->choices[v]); + break; + case BCH_OPT_FN: + return opt->print(c, out, end - out, v); + default: + BUG(); + } + + return out - buf; +} + +int bch2_parse_mount_opts(struct bch_opts *opts, char *options) +{ + char *opt, *name, *val; + int ret, id; + u64 v; + + while ((opt = strsep(&options, ",")) != NULL) { + name = strsep(&opt, "="); + val = opt; + + if (val) { + id = bch2_mount_opt_lookup(name); + if (id < 0) + goto bad_opt; + + ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); + if (ret < 0) + goto bad_val; + } else { + id = bch2_mount_opt_lookup(name); + v = 1; + + if (id < 0 && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + v = 0; + } + + if (id < 0) + goto bad_opt; + + if (bch2_opt_table[id].type != BCH_OPT_BOOL) + goto no_val; + } + + if (bch2_opt_table[id].mode < OPT_MOUNT) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + bch2_opt_set_by_id(opts, id, v); + } + + return 0; +bad_opt: + pr_err("Bad mount option %s", name); + return -1; +bad_val: + pr_err("Invalid value %s for mount option %s", val, name); + return -1; +no_val: + pr_err("Mount option %s requires a value", name); + return -1; +} + +/* io opts: */ + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +{ + struct bch_io_opts ret = { 0 }; +#define BCH_INODE_OPT(_name, _bits) \ + if (opt_defined(src, _name)) \ + opt_set(ret, _name, src._name); + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + return ret; +} + +struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) +{ + struct bch_opts ret = { 0 }; +#define BCH_INODE_OPT(_name, _bits) \ + if (opt_defined(src, _name)) \ + opt_set(ret, _name, src._name); + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + return ret; +} + +void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) +{ +#define BCH_INODE_OPT(_name, _bits) \ + if (opt_defined(src, _name)) \ + opt_set(*dst, _name, src._name); + BCH_INODE_OPTS() +#undef BCH_INODE_OPT +} + +bool bch2_opt_is_inode_opt(enum bch_opt_id id) +{ + static const enum bch_opt_id inode_opt_list[] = { +#define BCH_INODE_OPT(_name, _bits) Opt_##_name, + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + }; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) + if (inode_opt_list[i] == id) + return true; + + return false; +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 index 000000000000..3b5eddbf56bf --- /dev/null +++ b/fs/bcachefs/opts.h @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H + +#include +#include +#include +#include +#include "bcachefs_format.h" + +extern const char * const bch2_error_actions[]; +extern const char * const bch2_csum_types[]; +extern const char * const bch2_compression_types[]; +extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_data_types[]; +extern const char * const bch2_cache_replacement_policies[]; +extern const char * const bch2_cache_modes[]; +extern const char * const bch2_dev_state[]; + +/* + * Mount options; we also store defaults in the superblock. + * + * Also exposed via sysfs: if an option is writeable, and it's also stored in + * the superblock, changing it via sysfs (currently? might change this) also + * updates the superblock. 
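+ *
+ * Hypothetical usage example (assuming CONFIG_BCACHEFS_POSIX_ACL is set):
+ * mounting with "-o compression=lz4,noacl" goes through
+ * bch2_parse_mount_opts(); "compression=lz4" is parsed as an OPT_STR option
+ * via bch2_opt_parse(), and "noacl" sets the boolean "acl" option to 0 via
+ * the "no" prefix handling for OPT_BOOL options.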
+ * + * We store options as signed integers, where -1 means undefined. This means we + * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only + * apply the options from that struct that are defined. + */ + +/* dummy option, for options that aren't stored in the superblock */ +LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); + +enum opt_mode { + OPT_INTERNAL, + OPT_FORMAT, + OPT_MOUNT, + OPT_RUNTIME, +}; + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, + BCH_OPT_STR, + BCH_OPT_FN, +}; + +/** + * BCH_OPT(name, type, in mem type, mode, sb_opt) + * + * @name - name of mount option, sysfs attribute, and struct bch_opts + * member + * + * @mode - when opt may be set + * + * @sb_option - name of corresponding superblock option + * + * @type - one of OPT_BOOL, OPT_UINT, OPT_STR + */ + +/* + * XXX: add fields for + * - default value + * - helptext + */ + +#define BCH_OPTS() \ + BCH_OPT(block_size, u16, OPT_FORMAT, \ + OPT_UINT(1, 128), \ + BCH_SB_BLOCK_SIZE, 8) \ + BCH_OPT(btree_node_size, u16, OPT_FORMAT, \ + OPT_UINT(1, 128), \ + BCH_SB_BTREE_NODE_SIZE, 512) \ + BCH_OPT(errors, u8, OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \ + BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1) \ + BCH_OPT(data_replicas, u8, OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1) \ + BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1) \ + BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1) \ + BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ + BCH_OPT(data_checksum, u8, OPT_RUNTIME, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ + BCH_OPT(compression, u8, OPT_RUNTIME, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\ + BCH_OPT(background_compression, u8, OPT_RUNTIME, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\ + BCH_OPT(str_hash, u8, OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_types), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \ + BCH_OPT(foreground_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0) \ + BCH_OPT(background_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0) \ + BCH_OPT(promote_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0) \ + BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, false) \ + BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8) \ + BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0) \ + BCH_OPT(wide_macs, u8, OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false) \ + BCH_OPT(acl, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true) \ + BCH_OPT(usrquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false) \ + BCH_OPT(grpquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false) \ + BCH_OPT(prjquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false) \ + BCH_OPT(degraded, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(discard, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(verbose_recovery, u8, OPT_MOUNT, 
\ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(verbose_init, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(nofsck, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(fix_errors, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(nochanges, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(noreplay, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(norecovery, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(noexcl, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(sb, u64, OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ + NO_SB_OPT, BCH_SB_SECTOR) \ + BCH_OPT(read_only, u8, OPT_INTERNAL, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(nostart, u8, OPT_INTERNAL, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(no_data_io, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) + +struct bch_opts { +#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1; + BCH_OPTS() +#undef BCH_OPT + +#define BCH_OPT(_name, _bits, ...) _bits _name; + BCH_OPTS() +#undef BCH_OPT +}; + +static const struct bch_opts bch2_opts_default = { +#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ + ._name##_defined = true, \ + ._name = _default, \ + + BCH_OPTS() +#undef BCH_OPT +}; + +#define opt_defined(_opts, _name) ((_opts)._name##_defined) + +#define opt_get(_opts, _name) \ + (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) + +#define opt_set(_opts, _name, _v) \ +do { \ + (_opts)._name##_defined = true; \ + (_opts)._name = _v; \ +} while (0) + +static inline struct bch_opts bch2_opts_empty(void) +{ + return (struct bch_opts) { 0 }; +} + +void bch2_opts_apply(struct bch_opts *, struct bch_opts); + +enum bch_opt_id { +#define BCH_OPT(_name, ...) 
Opt_##_name, + BCH_OPTS() +#undef BCH_OPT + bch2_opts_nr +}; + +struct bch_fs; + +struct bch_option { + struct attribute attr; + void (*set_sb)(struct bch_sb *, u64); + enum opt_mode mode; + enum opt_type type; + + union { + struct { + u64 min, max; + }; + struct { + const char * const *choices; + }; + struct { + int (*parse)(struct bch_fs *, const char *, u64 *); + int (*print)(struct bch_fs *, char *, size_t, u64); + }; + }; + +}; + +extern const struct bch_option bch2_opt_table[]; + +bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); +u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); +void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + +struct bch_opts bch2_opts_from_sb(struct bch_sb *); + +int bch2_opt_lookup(const char *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + +int bch2_opt_to_text(struct bch_fs *, char *, size_t, + const struct bch_option *, u64, unsigned); + +int bch2_parse_mount_opts(struct bch_opts *, char *); + +/* inode opts: */ + +#define BCH_INODE_OPTS() \ + BCH_INODE_OPT(data_checksum, 8) \ + BCH_INODE_OPT(compression, 8) \ + BCH_INODE_OPT(background_compression, 8) \ + BCH_INODE_OPT(data_replicas, 8) \ + BCH_INODE_OPT(promote_target, 16) \ + BCH_INODE_OPT(foreground_target, 16) \ + BCH_INODE_OPT(background_target, 16) + +struct bch_io_opts { +#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + +#define BCH_INODE_OPT(_name, _bits) u##_bits _name; + BCH_INODE_OPTS() +#undef BCH_INODE_OPT +}; + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); +struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); +void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); +bool bch2_opt_is_inode_opt(enum bch_opt_id); + +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 index 000000000000..0adbfe523f51 --- /dev/null +++ b/fs/bcachefs/quota.c @@ -0,0 +1,790 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "inode.h" +#include "quota.h" +#include "super-io.h" + +static const char *bch2_sb_validate_quota(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) != sizeof(*q)) + return "invalid field quota: wrong size"; + + return NULL; +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { + .validate = bch2_sb_validate_quota, +}; + +const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + + if (k.k->p.inode >= QTYP_NR) + return "invalid quota type"; + + switch (k.k->type) { + case BCH_QUOTA: { + dq = bkey_s_c_to_quota(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +void bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end= buf + size; + struct bkey_s_c_quota dq; + unsigned i; + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + + for (i = 0; i < Q_COUNTERS; i++) + out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); + 
break; + } +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +#include +#include +#include + +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; + return qtypes ? i + __ffs(qtypes) : QTYP_NR; +} + +#define for_each_set_qtype(_c, _i, _q, _qtypes) \ + for (_i = 0; \ + (_i = __next_qtype(_i, _qtypes), \ + _q = &(_c)->quotas[_i], \ + _i < QTYP_NR); \ + _i++) + +static bool ignore_hardlimit(struct bch_memquota_type *q) +{ + if (capable(CAP_SYS_RESOURCE)) + return true; +#if 0 + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +#endif + return false; +} + +enum quota_msg { + SOFTWARN, /* Softlimit reached */ + SOFTLONGWARN, /* Grace time expired */ + HARDWARN, /* Hardlimit reached */ + + HARDBELOW, /* Usage got below inode hardlimit */ + SOFTBELOW, /* Usage got below inode softlimit */ +}; + +static int quota_nl[][Q_COUNTERS] = { + [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, + [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, + [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, + [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, + [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, + + [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, + [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, + [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, + [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, + [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, +}; + +struct quota_msgs { + u8 nr; + struct { + u8 qtype; + u8 msg; + } m[QTYP_NR * Q_COUNTERS]; +}; + +static void prepare_msg(unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); + + msgs->m[msgs->nr].qtype = qtype; + msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; + msgs->nr++; +} + +static void prepare_warning(struct memquota_counter *qc, + unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + if (qc->warning_issued & (1 << msg_type)) + return; + + prepare_msg(qtype, counter, msgs, msg_type); +} + +static void flush_warnings(struct bch_qid qid, + struct super_block *sb, + struct quota_msgs *msgs) +{ + unsigned i; + + for (i = 0; i < msgs->nr; i++) + quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), + sb->s_dev, msgs->m[i].msg); +} + +static int bch2_quota_check_limit(struct bch_fs *c, + unsigned qtype, + struct bch_memquota *mq, + struct quota_msgs *msgs, + enum quota_counters counter, + s64 v, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q = &c->quotas[qtype]; + struct memquota_counter *qc = &mq->c[counter]; + u64 n = qc->v + v; + + BUG_ON((s64) n < 0); + + if (mode == BCH_QUOTA_NOCHECK) + return 0; + + if (v <= 0) { + if (n < qc->hardlimit && + (qc->warning_issued & (1 << HARDWARN))) { + qc->warning_issued &= ~(1 << HARDWARN); + prepare_msg(qtype, counter, msgs, HARDBELOW); + } + + if (n < qc->softlimit && + (qc->warning_issued & (1 << SOFTWARN))) { + qc->warning_issued &= ~(1 << SOFTWARN); + prepare_msg(qtype, counter, msgs, SOFTBELOW); + } + + qc->warning_issued = 0; + return 0; + } + + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, HARDWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer && + ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + 
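+		/*
+		 * Soft limit exceeded and the grace timer has run out:
+		 * BCH_QUOTA_PREALLOC allocations already failed with -EDQUOT
+		 * above, so all that is left here is to queue a SOFTLONGWARN
+		 * message for quota_send_warning().
+		 */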
prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer == 0) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + + /* XXX is this the right one? */ + qc->timer = ktime_get_real_seconds() + + q->limits[counter].warnlimit; + } + + return 0; +} + +int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + unsigned qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct bch_memquota *mq[QTYP_NR]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); + if (!mq[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) + mq[i]->c[counter].v += v; +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(qid, c->vfs_sb, &msgs); + + return ret; +} + +static void __bch2_quota_transfer(struct bch_memquota *src_q, + struct bch_memquota *dst_q, + enum quota_counters counter, s64 v) +{ + BUG_ON(v > src_q->c[counter].v); + BUG_ON(v + dst_q->c[counter].v < v); + + src_q->c[counter].v -= v; + dst_q->c[counter].v += v; +} + +int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space) +{ + struct bch_memquota_type *q; + struct bch_memquota *src_q[3], *dst_q[3]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + qtypes &= enabled_qtypes(c); + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); + + if (!src_q[i] || !dst_q[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, + dst_q[i]->c[Q_SPC].v + space, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, + dst_q[i]->c[Q_INO].v + 1, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) { + __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); + __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); + } + +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(dst, c->vfs_sb, &msgs); + + return ret; +} + +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; + struct bch_memquota *mq; + unsigned i; + + BUG_ON(k.k->p.inode >= QTYP_NR); + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + + mutex_lock(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); + if (!mq) { + mutex_unlock(&q->lock); + return -ENOMEM; + } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + + mutex_unlock(&q->lock); + } + + return 0; +} + +static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, 
BTREE_ID_QUOTAS, POS(type, 0), + BTREE_ITER_PREFETCH, k) { + if (k.k->p.inode != type) + break; + + ret = __bch2_quota_set(c, k); + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + genradix_free(&c->quotas[i].table); +} + +void bch2_fs_quota_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + mutex_init(&c->quotas[i].lock); +} + +static void bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + unsigned i, j; + + sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + if (!sb_quota) + return; + + for (i = 0; i < QTYP_NR; i++) { + struct bch_memquota_type *q = &c->quotas[i]; + + for (j = 0; j < Q_COUNTERS; j++) { + q->limits[j].timelimit = + le32_to_cpu(sb_quota->q[i].c[j].timelimit); + q->limits[j].warnlimit = + le32_to_cpu(sb_quota->q[i].c[j].warnlimit); + } + } +} + +int bch2_fs_quota_read(struct bch_fs *c) +{ + unsigned i, qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct btree_iter iter; + struct bch_inode_unpacked u; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_init_type(c, i); + if (ret) + return ret; + } + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, + BTREE_ITER_PREFETCH, k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + BCH_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + BCH_QUOTA_NOCHECK); + } + } + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ + +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + /* Accounting must be enabled at mount time: */ + if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) + return -EINVAL; + + /* Can't enable enforcement without accounting: */ + if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) + return -EINVAL; + + if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) + return -EINVAL; + + if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) + return -EINVAL; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_remove(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (uflags & FS_USER_QUOTA) { + if (c->opts.usrquota) + return -EINVAL; + + ret = 
bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_GROUP_QUOTA) { + if (c->opts.grpquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_PROJ_QUOTA) { + if (c->opts.prjquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + return 0; +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. + */ +static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct bch_fs *c = sb->s_fs_info; + unsigned qtypes = enabled_qtypes(c); + unsigned i; + + memset(state, 0, sizeof(*state)); + + for (i = 0; i < QTYP_NR; i++) { + state->s_state[i].flags |= QCI_SYSFILE; + + if (!(qtypes & (1 << i))) + continue; + + state->s_state[i].flags |= QCI_ACCT_ENABLED; + + state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; + state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; + + state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; + state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; + } + + return 0; +} + +/* + * Adjust quota timers & warnings + */ +static int bch2_quota_set_info(struct super_block *sb, int type, + struct qc_info *info) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (type >= QTYP_NR) + return -EINVAL; + + if (!((1 << type) & enabled_qtypes(c))) + return -ESRCH; + + if (info->i_fieldmask & + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + if (!sb_quota) { + sb_quota = bch2_sb_resize_quota(&c->disk_sb, + sizeof(*sb_quota) / sizeof(u64)); + if (!sb_quota) + return -ENOSPC; + } + + if (info->i_fieldmask & QC_SPC_TIMER) + sb_quota->q[type].c[Q_SPC].timelimit = + cpu_to_le32(info->i_spc_timelimit); + + if (info->i_fieldmask & QC_SPC_WARNS) + sb_quota->q[type].c[Q_SPC].warnlimit = + cpu_to_le32(info->i_spc_warnlimit); + + if (info->i_fieldmask & QC_INO_TIMER) + sb_quota->q[type].c[Q_INO].timelimit = + cpu_to_le32(info->i_ino_timelimit); + + if (info->i_fieldmask & QC_INO_WARNS) + sb_quota->q[type].c[Q_INO].warnlimit = + cpu_to_le32(info->i_ino_warnlimit); + + bch2_sb_quota_read(c); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* Get/set individual quotas: */ + +static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) +{ + dst->d_space = src->c[Q_SPC].v << 9; + dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; + dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; + dst->d_spc_timer = src->c[Q_SPC].timer; + dst->d_spc_warns = src->c[Q_SPC].warns; + + dst->d_ino_count = src->c[Q_INO].v; + dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; + dst->d_ino_softlimit = src->c[Q_INO].softlimit; + dst->d_ino_timer = src->c[Q_INO].timer; + dst->d_ino_warns = src->c[Q_INO].warns; +} + +static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t 
qid = from_kqid(&init_user_ns, kqid); + struct bch_memquota *mq; + + memset(qdq, 0, sizeof(*qdq)); + + mutex_lock(&q->lock); + mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); + mutex_unlock(&q->lock); + + return 0; +} + +static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid->type]; + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter = genradix_iter_init(&q->table, qid); + struct bch_memquota *mq; + int ret = 0; + + mutex_lock(&q->lock); + + while ((mq = genradix_iter_peek(&iter, &q->table))) { + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); + goto found; + } + + genradix_iter_advance(&iter, &q->table); + } + + ret = -ENOENT; +found: + mutex_unlock(&q->lock); + return ret; +} + +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_quota new_quota; + int ret; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + + bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + + ret = btree_iter_err(k); + if (unlikely(ret)) + return ret; + + switch (k.k->type) { + case BCH_QUOTA: + new_quota.v = *bkey_s_c_to_quota(k).v; + break; + } + + if (qdq->d_fieldmask & QC_SPC_SOFT) + new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); + if (qdq->d_fieldmask & QC_SPC_HARD) + new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); + + if (qdq->d_fieldmask & QC_INO_SOFT) + new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + return ret; + + ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + + return ret; +} + +const struct quotactl_ops bch2_quotactl_operations = { + .quota_enable = bch2_quota_enable, + .quota_disable = bch2_quota_disable, + .rm_xquota = bch2_quota_remove, + + .get_state = bch2_quota_get_state, + .set_info = bch2_quota_set_info, + + .get_dqblk = bch2_get_quota, + .get_nextdqblk = bch2_get_next_quota, + .set_dqblk = bch2_set_quota, +}; + +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 index 000000000000..4a76b49f9e00 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H + +#include "inode.h" +#include "quota_types.h" + +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_quota_ops (struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ +} + +enum quota_acct_mode { + BCH_QUOTA_PREALLOC, + BCH_QUOTA_WARN, + BCH_QUOTA_NOCHECK, +}; + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ + 
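+	/*
+	 * Map an inode's uid, gid and project id to the per-quota-type ids
+	 * used by bch2_quota_acct() and bch2_quota_transfer().
+	 */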
return (struct bch_qid) { + .q[QTYP_USR] = u->bi_uid, + .q[QTYP_GRP] = u->bi_gid, + .q[QTYP_PRJ] = u->bi_project, + }; +} + +static inline unsigned enabled_qtypes(struct bch_fs *c) +{ + return ((c->opts.usrquota << QTYP_USR)| + (c->opts.grpquota << QTYP_GRP)| + (c->opts.prjquota << QTYP_PRJ)); +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, + s64, enum quota_acct_mode); + +int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, + struct bch_qid, u64); + +void bch2_fs_quota_exit(struct bch_fs *); +void bch2_fs_quota_init(struct bch_fs *); +int bch2_fs_quota_read(struct bch_fs *); + +extern const struct quotactl_ops bch2_quotactl_operations; + +#else + +static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + return 0; +} + +static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space) +{ + return 0; +} + +static inline void bch2_fs_quota_exit(struct bch_fs *c) {} +static inline void bch2_fs_quota_init(struct bch_fs *c) {} +static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } + +#endif + +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h new file mode 100644 index 000000000000..9eda6c363736 --- /dev/null +++ b/fs/bcachefs/quota_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_TYPES_H +#define _BCACHEFS_QUOTA_TYPES_H + +#include + +struct bch_qid { + u32 q[QTYP_NR]; +}; + +struct memquota_counter { + u64 v; + u64 hardlimit; + u64 softlimit; + s64 timer; + int warns; + int warning_issued; +}; + +struct bch_memquota { + struct memquota_counter c[Q_COUNTERS]; +}; + +typedef GENRADIX(struct bch_memquota) bch_memquota_table; + +struct quota_limit { + u32 timelimit; + u32 warnlimit; +}; + +struct bch_memquota_type { + struct quota_limit limits[Q_COUNTERS]; + bch_memquota_table table; + struct mutex lock; +}; + +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 index 000000000000..04824f667693 --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "extents.h" +#include "io.h" +#include "move.h" +#include "rebalance.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include + +static inline bool rebalance_ptr_pred(struct bch_fs *c, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + struct bch_io_opts *io_opts) +{ + if (io_opts->background_target && + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && + !ptr->cached) + return true; + + if (io_opts->background_compression && + crc.compression_type != + bch2_compression_opt_to_type[io_opts->background_compression]) + return true; + + return false; +} + +void bch2_rebalance_add_key(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + struct bkey_s_c_extent e; + + if (!bkey_extent_is_data(k.k)) + return; + + if (!io_opts->background_target && + !io_opts->background_compression) + return; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { + struct 
bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (atomic64_add_return(crc.compressed_size, + &ca->rebalance_work) == + crc.compressed_size) + rebalance_wakeup(c); + } +} + +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == + sectors) + rebalance_wakeup(c); +} + +static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return DATA_SKIP; + + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) + goto found; + + return DATA_SKIP; +found: + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; +} + +struct rebalance_work { + int dev_most_full_idx; + unsigned dev_most_full_percent; + u64 dev_most_full_work; + u64 dev_most_full_capacity; + u64 total_work; +}; + +static void rebalance_work_accumulate(struct rebalance_work *w, + u64 dev_work, u64 unknown_dev, u64 capacity, int idx) +{ + unsigned percent_full; + u64 work = dev_work + unknown_dev; + + if (work < dev_work || work < unknown_dev) + work = U64_MAX; + work = min(work, capacity); + + percent_full = div_u64(work * 100, capacity); + + if (percent_full >= w->dev_most_full_percent) { + w->dev_most_full_idx = idx; + w->dev_most_full_percent = percent_full; + w->dev_most_full_work = work; + w->dev_most_full_capacity = capacity; + } + + if (w->total_work + dev_work >= w->total_work && + w->total_work + dev_work >= dev_work) + w->total_work += dev_work; +} + +static struct rebalance_work rebalance_work(struct bch_fs *c) +{ + struct bch_dev *ca; + struct rebalance_work ret = { .dev_most_full_idx = -1 }; + u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); + unsigned i; + + for_each_online_member(ca, c, i) + rebalance_work_accumulate(&ret, + atomic64_read(&ca->rebalance_work), + unknown_dev, + bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket), + i); + + rebalance_work_accumulate(&ret, + unknown_dev, 0, c->capacity, -1); + + return ret; +} + +static void rebalance_work_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) + atomic64_set(&ca->rebalance_work, 0); + + atomic64_set(&c->rebalance.work_unknown_dev, 0); +} + +static unsigned long curr_cputime(void) +{ + u64 utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return nsecs_to_jiffies(utime + stime); +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct bch_fs_rebalance *r = &c->rebalance; + struct io_clock *clock = &c->io_clock[WRITE]; + struct rebalance_work w, p; + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; + unsigned long io_start; + long throttle; + + set_freezable(); + + io_start = atomic_long_read(&clock->now); + p = rebalance_work(c); + prev_start = jiffies; + prev_cputime = curr_cputime(); + + while (!kthread_wait_freezable(r->enabled)) { + start = jiffies; + cputime = curr_cputime(); + + prev_run_time = start - prev_start; + prev_run_cputime = cputime - prev_cputime; + + w = rebalance_work(c); + BUG_ON(!w.dev_most_full_capacity); + + if (!w.total_work) { + r->state = 
REBALANCE_WAITING; + kthread_wait_freezable(rebalance_work(c).total_work); + continue; + } + + /* + * If there isn't much work to do, throttle cpu usage: + */ + throttle = prev_run_cputime * 100 / + max(1U, w.dev_most_full_percent) - + prev_run_time; + + if (w.dev_most_full_percent < 20 && throttle > 0) { + r->state = REBALANCE_THROTTLED; + r->throttled_until_iotime = io_start + + div_u64(w.dev_most_full_capacity * + (20 - w.dev_most_full_percent), + 50); + r->throttled_until_cputime = start + throttle; + + bch2_kthread_io_clock_wait(clock, + r->throttled_until_iotime, + throttle); + continue; + } + + /* minimum 1 mb/sec: */ + r->pd.rate.rate = + max_t(u64, 1 << 11, + r->pd.rate.rate * + max(p.dev_most_full_percent, 1U) / + max(w.dev_most_full_percent, 1U)); + + io_start = atomic_long_read(&clock->now); + p = w; + prev_start = start; + prev_cputime = cputime; + + r->state = REBALANCE_RUNNING; + memset(&r->move_stats, 0, sizeof(r->move_stats)); + rebalance_work_reset(c); + + bch2_move_data(c, + /* ratelimiting disabled for now */ + NULL, /* &r->pd.rate, */ + writepoint_ptr(&c->rebalance_write_point), + POS_MIN, POS_MAX, + rebalance_pred, NULL, + &r->move_stats); + } + + return 0; +} + +ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = out + PAGE_SIZE; + struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); + char h1[21], h2[21]; + + bch2_hprint(h1, w.dev_most_full_work << 9); + bch2_hprint(h2, w.dev_most_full_capacity << 9); + out += scnprintf(out, end - out, + "fullest_dev (%i):\t%s/%s\n", + w.dev_most_full_idx, h1, h2); + + bch2_hprint(h1, w.total_work << 9); + bch2_hprint(h2, c->capacity << 9); + out += scnprintf(out, end - out, + "total work:\t\t%s/%s\n", + h1, h2); + + out += scnprintf(out, end - out, + "rate:\t\t\t%u\n", + r->pd.rate.rate); + + switch (r->state) { + case REBALANCE_WAITING: + out += scnprintf(out, end - out, "waiting\n"); + break; + case REBALANCE_THROTTLED: + bch2_hprint(h1, + (r->throttled_until_iotime - + atomic_long_read(&c->io_clock[WRITE].now)) << 9); + out += scnprintf(out, end - out, + "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); + break; + case REBALANCE_RUNNING: + out += scnprintf(out, end - out, "running\n"); + out += scnprintf(out, end - out, "pos %llu:%llu\n", + r->move_stats.iter.pos.inode, + r->move_stats.iter.pos.offset); + break; + } + + return out - buf; +} + +void bch2_rebalance_stop(struct bch_fs *c) +{ + struct task_struct *p; + + c->rebalance.pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->rebalance.pd.rate); + + p = rcu_dereference_protected(c->rebalance.thread, 1); + c->rebalance.thread = NULL; + + if (p) { + /* for sychronizing with rebalance_wakeup() */ + synchronize_rcu(); + + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; + + if (c->opts.nochanges) + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + rcu_assign_pointer(c->rebalance.thread, p); + wake_up_process(p); + return 0; +} + +void bch2_fs_rebalance_init(struct bch_fs *c) +{ + bch2_pd_controller_init(&c->rebalance.pd); + + atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 index 000000000000..99e2a1fb6084 --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
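+/*
+ * Interface to the background rebalance thread, which rewrites extents that
+ * don't match an inode's background_target or background_compression options;
+ * rebalance_wakeup() pokes the thread through an RCU-protected task pointer.
+ */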
+#ifndef _BCACHEFS_REBALANCE_H +#define _BCACHEFS_REBALANCE_H + +#include "rebalance_types.h" + +static inline void rebalance_wakeup(struct bch_fs *c) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(c->rebalance.thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, + struct bch_io_opts *); +void bch2_rebalance_add_work(struct bch_fs *, u64); + +ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); + +void bch2_rebalance_stop(struct bch_fs *); +int bch2_rebalance_start(struct bch_fs *); +void bch2_fs_rebalance_init(struct bch_fs *); + +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 index 000000000000..192c6be20ced --- /dev/null +++ b/fs/bcachefs/rebalance_types.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_TYPES_H +#define _BCACHEFS_REBALANCE_TYPES_H + +#include "move_types.h" + +enum rebalance_state { + REBALANCE_WAITING, + REBALANCE_THROTTLED, + REBALANCE_RUNNING, +}; + +struct bch_fs_rebalance { + struct task_struct __rcu *thread; + struct bch_pd_controller pd; + + atomic64_t work_unknown_dev; + + enum rebalance_state state; + unsigned long throttled_until_iotime; + unsigned long throttled_until_cputime; + struct bch_move_stats move_stats; + + unsigned enabled:1; +}; + +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 index 000000000000..2596c3c26064 --- /dev/null +++ b/fs/bcachefs/recovery.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "dirent.h" +#include "error.h" +#include "fsck.h" +#include "journal_io.h" +#include "quota.h" +#include "recovery.h" +#include "super-io.h" + +#include + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j) +{ + unsigned i; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) + bch2_fs_mark_clean(c, false); + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + 
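+		/*
+		 * k1 is the btree root recorded in the superblock's clean
+		 * section, k2 the one found in the journal; after a clean
+		 * shutdown they must agree in level and key contents.
+		 */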
+ if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static bool journal_empty(struct list_head *journal) +{ + struct journal_replay *i; + struct jset_entry *entry; + + if (list_empty(journal)) + return true; + + i = list_last_entry(journal, struct journal_replay, list); + + if (i->j.last_seq != i->j.seq) + return false; + + list_for_each_entry(i, journal, list) { + vstruct_for_each(&i->j, entry) { + if (entry->type == BCH_JSET_ENTRY_btree_root) + continue; + + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + return false; + } + } + + return true; +} + +int bch2_fs_recovery(struct bch_fs *c) +{ + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL; + LIST_HEAD(journal); + struct jset *j = NULL; + unsigned i; + int ret; + + mutex_lock(&c->sb_lock); + if (!bch2_sb_get_replicas(c->disk_sb.sb)) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (c->sb.clean) + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + if (sb_clean) { + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + ret = -ENOMEM; + mutex_unlock(&c->sb_lock); + goto err; + } + } + mutex_unlock(&c->sb_lock); + + if (clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + + if (!clean || !c->opts.nofsck) { + ret = bch2_journal_read(c, &journal); + if (ret) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + } else { + ret = bch2_journal_set_seq(c, + le64_to_cpu(clean->journal_seq), + le64_to_cpu(clean->journal_seq)); + BUG_ON(ret); + } + + ret = verify_superblock_clean(c, clean, j); + if (ret) + goto err; + + fsck_err_on(clean && !journal_empty(&journal), c, + "filesystem marked clean but journal not empty"); + + if (clean) { + c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); + } else { + c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); + } + + for (i = 0; i < BTREE_ID_NR; i++) { + unsigned level; + struct bkey_i *k; + + k = btree_root_find(c, clean, j, i, &level); + if (!k) + continue; + + err = "invalid btree root pointer"; + if (IS_ERR(k)) + goto err; + + err = "error reading btree root"; + if (bch2_btree_root_read(c, i, k, level)) { + if (i != BTREE_ID_ALLOC) + goto err; + + mustfix_fsck_err(c, "error reading btree root"); + } + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); + + err = "error reading allocation information"; + ret = bch2_alloc_read(c, &journal); + if (ret) + goto err; + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + bch_verbose(c, "starting mark and sweep:"); + err = "error in recovery"; + ret = bch2_initial_gc(c, &journal); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + + if (c->opts.noreplay) + goto out; + + /* + * Mark dirty before journal replay, fsck: + * XXX: after a clean shutdown, this could be done lazily only when fsck + * finds an error + */ + bch2_fs_mark_clean(c, false); + + /* + * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() + * will give spurious errors about oldest_gen > bucket_gen - + 
* this is a hack but oh well. + */ + bch2_fs_journal_start(&c->journal); + + err = "error starting allocator"; + ret = bch2_fs_allocator_start(c); + if (ret) + goto err; + + bch_verbose(c, "starting journal replay:"); + err = "journal replay failed"; + ret = bch2_journal_replay(c, &journal); + if (ret) + goto err; + bch_verbose(c, "journal replay done"); + + if (c->opts.norecovery) + goto out; + + err = "error in fsck"; + ret = bch2_fsck(c); + if (ret) + goto err; + + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas:"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } + +out: + bch2_journal_entries_free(&journal); + kfree(clean); + return ret; +err: +fsck_err: + BUG_ON(!ret); + goto out; +} + +int bch2_fs_initialize(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + struct bkey_inode_buf packed_inode; + struct bch_hash_info root_hash_info; + struct qstr lostfound = QSTR("lost+found"); + const char *err = "cannot allocate memory"; + struct bch_dev *ca; + LIST_HEAD(journal); + unsigned i; + int ret; + + bch_notice(c, "initializing new filesystem"); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + ret = bch2_initial_gc(c, &journal); + if (ret) + goto err; + + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) + if (bch2_dev_journal_alloc(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch2_fs_journal_start(&c->journal); + bch2_journal_set_replay_done(&c->journal); + + err = "error starting allocator"; + ret = bch2_fs_allocator_start(c); + if (ret) + goto err; + + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_nlink++; /* lost+found */ + bch2_inode_pack(&packed_inode, &root_inode); + + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) + goto err; + + bch2_inode_init(c, &lostfound_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, + &root_inode); + lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; + bch2_inode_pack(&packed_inode, &lostfound_inode); + + err = "error creating lost+found"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) + goto err; + + root_hash_info = bch2_hash_info_init(c, &root_inode); + + ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, + &lostfound, lostfound_inode.bi_inum, NULL, + BTREE_INSERT_NOFAIL); + if (ret) + goto err; + + atomic_long_set(&c->nr_inodes, 2); + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + + err = "error writing first journal entry"; + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +err: + BUG_ON(!ret); + return ret; +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 index 000000000000..912929117c37 --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + +int bch2_fs_recovery(struct bch_fs *); +int 
bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 index 000000000000..4b87aa8e1f75 --- /dev/null +++ b/fs/bcachefs/replicas.c @@ -0,0 +1,698 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "replicas.h" +#include "super-io.h" + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, + struct bch_replicas_cpu *); + +/* Replicas tracking - in memory: */ + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) + +static inline struct bch_replicas_cpu_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) +{ + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + +static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; +} + +static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + e->devs[dev >> 3] |= 1 << (dev & 7); +} + +static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) +{ + return (r->entry_size - + offsetof(struct bch_replicas_cpu_entry, devs)) * 8; +} + +int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, + char *buf, size_t size) +{ + char *out = buf, *end = out + size; + struct bch_replicas_cpu_entry *e; + bool first = true; + unsigned i; + + for_each_cpu_replicas_entry(r, e) { + bool first_e = true; + + if (!first) + out += scnprintf(out, end - out, " "); + first = false; + + out += scnprintf(out, end - out, "%u: [", e->data_type); + + for (i = 0; i < replicas_dev_slots(r); i++) + if (replicas_test_dev(e, i)) { + if (!first_e) + out += scnprintf(out, end - out, " "); + first_e = false; + out += scnprintf(out, end - out, "%u", i); + } + out += scnprintf(out, end - out, "]"); + } + + return out - buf; +} + +static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, + enum bch_data_type data_type, + struct bch_replicas_cpu_entry *r, + unsigned *max_dev) +{ + const struct bch_extent_ptr *ptr; + unsigned nr = 0; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + memset(r, 0, sizeof(*r)); + r->data_type = data_type; + + *max_dev = 0; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) { + *max_dev = max_t(unsigned, *max_dev, ptr->dev); + replicas_set_dev(r, ptr->dev); + nr++; + } + return nr; +} + +static inline void devlist_to_replicas(struct bch_devs_list devs, + enum bch_data_type data_type, + struct bch_replicas_cpu_entry *r, + unsigned *max_dev) +{ + unsigned i; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + memset(r, 0, sizeof(*r)); + r->data_type = data_type; + + *max_dev = 0; + + for (i = 0; i < devs.nr; i++) { + *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); + replicas_set_dev(r, devs.devs[i]); + } +} + +static struct bch_replicas_cpu * +cpu_replicas_add_entry(struct bch_replicas_cpu *old, + struct bch_replicas_cpu_entry new_entry, + unsigned max_dev) +{ + struct bch_replicas_cpu *new; + unsigned i, nr, entry_size; + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + entry_size = max(entry_size, old->entry_size); + nr = old->nr + 1; + + new = kzalloc(sizeof(struct bch_replicas_cpu) + + nr 
* entry_size, GFP_NOIO); + if (!new) + return NULL; + + new->nr = nr; + new->entry_size = entry_size; + + for (i = 0; i < old->nr; i++) + memcpy(cpu_replicas_entry(new, i), + cpu_replicas_entry(old, i), + min(new->entry_size, old->entry_size)); + + memcpy(cpu_replicas_entry(new, old->nr), + &new_entry, + new->entry_size); + + bch2_cpu_replicas_sort(new); + return new; +} + +static bool replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_cpu_entry search, + unsigned max_dev) +{ + return max_dev < replicas_dev_slots(r) && + eytzinger0_find(r->entries, r->nr, + r->entry_size, + memcmp, &search) < r->nr; +} + +noinline +static int bch2_mark_replicas_slowpath(struct bch_fs *c, + struct bch_replicas_cpu_entry new_entry, + unsigned max_dev) +{ + struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; + int ret = -ENOMEM; + + mutex_lock(&c->sb_lock); + + old_gc = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { + new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); + if (!new_gc) + goto err; + } + + old_r = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + if (!replicas_has_entry(old_r, new_entry, max_dev)) { + new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); + if (!new_r) + goto err; + + ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); + if (ret) + goto err; + } + + /* allocations done, now commit: */ + + if (new_r) + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + + if (new_gc) { + rcu_assign_pointer(c->replicas_gc, new_gc); + kfree_rcu(old_gc, rcu); + } + + if (new_r) { + rcu_assign_pointer(c->replicas, new_r); + kfree_rcu(old_r, rcu); + } + + mutex_unlock(&c->sb_lock); + return 0; +err: + mutex_unlock(&c->sb_lock); + kfree(new_gc); + kfree(new_r); + return ret; +} + +int bch2_mark_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + struct bch_replicas_cpu_entry search; + struct bch_replicas_cpu *r, *gc_r; + unsigned max_dev; + bool marked; + + if (!devs.nr) + return 0; + + BUG_ON(devs.nr >= BCH_REPLICAS_MAX); + + devlist_to_replicas(devs, data_type, &search, &max_dev); + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + gc_r = rcu_dereference(c->replicas_gc); + marked = replicas_has_entry(r, search, max_dev) && + (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); + rcu_read_unlock(); + + return likely(marked) ? 
0 + : bch2_mark_replicas_slowpath(c, search, max_dev); +} + +int bch2_mark_bkey_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bkey_s_c k) +{ + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + int ret; + + for (i = 0; i < cached.nr; i++) + if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i])))) + return ret; + + return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); +} + +int bch2_replicas_gc_end(struct bch_fs *c, int ret) +{ + struct bch_replicas_cpu *new_r, *old_r; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + + new_r = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->replicas_gc, NULL); + + if (ret) + goto err; + + if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { + ret = -ENOSPC; + goto err; + } + + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + + old_r = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + + rcu_assign_pointer(c->replicas, new_r); + kfree_rcu(old_r, rcu); +out: + mutex_unlock(&c->sb_lock); + return ret; +err: + kfree_rcu(new_r, rcu); + goto out; +} + +int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) +{ + struct bch_replicas_cpu *dst, *src; + struct bch_replicas_cpu_entry *e; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + BUG_ON(c->replicas_gc); + + src = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + + dst = kzalloc(sizeof(struct bch_replicas_cpu) + + src->nr * src->entry_size, GFP_NOIO); + if (!dst) { + mutex_unlock(&c->sb_lock); + return -ENOMEM; + } + + dst->nr = 0; + dst->entry_size = src->entry_size; + + for_each_cpu_replicas_entry(src, e) + if (!((1 << e->data_type) & typemask)) + memcpy(cpu_replicas_entry(dst, dst->nr++), + e, dst->entry_size); + + bch2_cpu_replicas_sort(dst); + + rcu_assign_pointer(c->replicas_gc, dst); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* Replicas tracking - superblock: */ + +static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, + unsigned *nr, + unsigned *bytes, + unsigned *max_dev) +{ + struct bch_replicas_entry *i; + unsigned j; + + *nr = 0; + *bytes = sizeof(*r); + *max_dev = 0; + + if (!r) + return; + + for_each_replicas_entry(r, i) { + for (j = 0; j < i->nr; j++) + *max_dev = max_t(unsigned, *max_dev, i->devs[j]); + (*nr)++; + } + + *bytes = (void *) i - (void *) r; +} + +static struct bch_replicas_cpu * +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +{ + struct bch_replicas_cpu *cpu_r; + unsigned i, nr, bytes, max_dev, entry_size; + + bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + + cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!cpu_r) + return NULL; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + if (nr) { + struct bch_replicas_cpu_entry *dst = + cpu_replicas_entry(cpu_r, 0); + struct bch_replicas_entry *src = sb_r->entries; + + while (dst < cpu_replicas_entry(cpu_r, nr)) { + dst->data_type = src->data_type; + for (i = 0; i < src->nr; i++) + replicas_set_dev(dst, src->devs[i]); + + src = replicas_entry_next(src); + dst = (void *) dst + entry_size; + } + } + + bch2_cpu_replicas_sort(cpu_r); + return cpu_r; +} + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) +{ + struct bch_sb_field_replicas *sb_r; + 
struct bch_replicas_cpu *cpu_r, *old_r; + + sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + return -ENOMEM; + + old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->replicas, cpu_r); + if (old_r) + kfree_rcu(old_r, rcu); + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_entry *sb_e; + struct bch_replicas_cpu_entry *e; + size_t i, bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, e) { + bytes += sizeof(struct bch_replicas_entry); + for (i = 0; i < r->entry_size - 1; i++) + bytes += hweight8(e->devs[i]); + } + + sb_r = bch2_sb_resize_replicas(&c->disk_sb, + DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + if (!sb_r) + return -ENOSPC; + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + sb_e = sb_r->entries; + for_each_cpu_replicas_entry(r, e) { + sb_e->data_type = e->data_type; + + for (i = 0; i < replicas_dev_slots(r); i++) + if (replicas_test_dev(e, i)) + sb_e->devs[sb_e->nr++] = i; + + sb_e = replicas_entry_next(sb_e); + + BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_entry *e; + const char *err; + unsigned i; + + for_each_replicas_entry(sb_r, e) { + err = "invalid replicas entry: invalid data type"; + if (e->data_type >= BCH_DATA_NR) + goto err; + + err = "invalid replicas entry: no devices"; + if (!e->nr) + goto err; + + err = "invalid replicas entry: too many devices"; + if (e->nr >= BCH_REPLICAS_MAX) + goto err; + + err = "invalid replicas entry: invalid device"; + for (i = 0; i < e->nr; i++) + if (!bch2_dev_exists(sb, mi, e->devs[i])) + goto err; + } + + err = "cannot allocate memory"; + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + goto err; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_cpu_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_cpu_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + err = "duplicate replicas entry"; + if (!memcmp(l, r, cpu_r->entry_size)) + goto err; + } + + err = NULL; +err: + kfree(cpu_r); + return err; +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_validate_replicas, +}; + +int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size) +{ + char *out = buf, *end = out + size; + struct bch_replicas_entry *e; + bool first = true; + unsigned i; + + if (!r) { + out += scnprintf(out, end - out, "(no replicas section found)"); + return out - buf; + } + + for_each_replicas_entry(r, e) { + if (!first) + out += scnprintf(out, end - out, " "); + first = false; + + out += scnprintf(out, end - out, "%u: [", e->data_type); + + for (i = 0; i < e->nr; i++) + out += scnprintf(out, end - out, + i ? 
" %u" : "%u", e->devs[i]); + out += scnprintf(out, end - out, "]"); + } + + return out - buf; +} + +/* Query replicas: */ + +bool bch2_replicas_marked(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + struct bch_replicas_cpu_entry search; + unsigned max_dev; + bool ret; + + if (!devs.nr) + return true; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + + rcu_read_lock(); + ret = replicas_has_entry(rcu_dereference(c->replicas), + search, max_dev); + rcu_read_unlock(); + + return ret; +} + +bool bch2_bkey_replicas_marked(struct bch_fs *c, + enum bch_data_type data_type, + struct bkey_s_c k) +{ + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + + for (i = 0; i < cached.nr; i++) + if (!bch2_replicas_marked(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i]))) + return false; + + return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); +} + +struct replicas_status __bch2_replicas_status(struct bch_fs *c, + struct bch_devs_mask online_devs) +{ + struct bch_sb_field_members *mi; + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned i, dev, dev_slots, nr_online, nr_offline; + struct replicas_status ret; + + memset(&ret, 0, sizeof(ret)); + + for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) + ret.replicas[i].nr_online = UINT_MAX; + + mi = bch2_sb_get_members(c->disk_sb.sb); + rcu_read_lock(); + + r = rcu_dereference(c->replicas); + dev_slots = replicas_dev_slots(r); + + for_each_cpu_replicas_entry(r, e) { + if (e->data_type >= ARRAY_SIZE(ret.replicas)) + panic("e %p data_type %u\n", e, e->data_type); + + nr_online = nr_offline = 0; + + for (dev = 0; dev < dev_slots; dev++) { + if (!replicas_test_dev(e, dev)) + continue; + + BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev)); + + if (test_bit(dev, online_devs.d)) + nr_online++; + else + nr_offline++; + } + + ret.replicas[e->data_type].nr_online = + min(ret.replicas[e->data_type].nr_online, + nr_online); + + ret.replicas[e->data_type].nr_offline = + max(ret.replicas[e->data_type].nr_offline, + nr_offline); + } + + rcu_read_unlock(); + + return ret; +} + +struct replicas_status bch2_replicas_status(struct bch_fs *c) +{ + return __bch2_replicas_status(c, bch2_online_devs(c)); +} + +static bool have_enough_devs(struct replicas_status s, + enum bch_data_type type, + bool force_if_degraded, + bool force_if_lost) +{ + return (!s.replicas[type].nr_offline || force_if_degraded) && + (s.replicas[type].nr_online || force_if_lost); +} + +bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) +{ + return (have_enough_devs(s, BCH_DATA_JOURNAL, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && + have_enough_devs(s, BCH_DATA_BTREE, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && + have_enough_devs(s, BCH_DATA_USER, + flags & BCH_FORCE_IF_DATA_DEGRADED, + flags & BCH_FORCE_IF_DATA_LOST)); +} + +unsigned bch2_replicas_online(struct bch_fs *c, bool meta) +{ + struct replicas_status s = bch2_replicas_status(c); + + return meta + ? 
min(s.replicas[BCH_DATA_JOURNAL].nr_online, + s.replicas[BCH_DATA_BTREE].nr_online) + : s.replicas[BCH_DATA_USER].nr_online; +} + +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned ret = 0; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + + if (ca->dev_idx >= replicas_dev_slots(r)) + goto out; + + for_each_cpu_replicas_entry(r, e) + if (replicas_test_dev(e, ca->dev_idx)) + ret |= 1 << e->data_type; +out: + rcu_read_unlock(); + + return ret; +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 index 000000000000..de506cf9e11d --- /dev/null +++ b/fs/bcachefs/replicas.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + +bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, + struct bkey_s_c); +int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, + struct bkey_s_c); + +int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); +int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); + +struct replicas_status { + struct { + unsigned nr_online; + unsigned nr_offline; + } replicas[BCH_DATA_NR]; +}; + +struct replicas_status __bch2_replicas_status(struct bch_fs *, + struct bch_devs_mask); +struct replicas_status bch2_replicas_status(struct bch_fs *); +bool bch2_have_enough_devs(struct replicas_status, unsigned); + +unsigned bch2_replicas_online(struct bch_fs *, bool); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); + +/* iterate over superblock replicas - used by userspace tools: */ + +static inline struct bch_replicas_entry * +replicas_entry_next(struct bch_replicas_entry *i) +{ + return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; +} + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; + +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 index 000000000000..c062edb3fbc2 --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. Bernstein, + * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa + * https://131002.net/siphash/siphash.pdf + * https://131002.net/siphash/ + */ + +#include +#include +#include +#include + +#include "siphash.h" + +static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) +{ + while (rounds--) { + ctx->v[0] += ctx->v[1]; + ctx->v[2] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 13); + ctx->v[3] = rol64(ctx->v[3], 16); + + ctx->v[1] ^= ctx->v[0]; + ctx->v[3] ^= ctx->v[2]; + ctx->v[0] = rol64(ctx->v[0], 32); + + ctx->v[2] += ctx->v[1]; + ctx->v[0] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 17); + ctx->v[3] = rol64(ctx->v[3], 21); + + ctx->v[1] ^= ctx->v[2]; + ctx->v[3] ^= ctx->v[0]; + ctx->v[2] = rol64(ctx->v[2], 32); + } +} + +static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) +{ + u64 m = get_unaligned_le64(ptr); + + ctx->v[3] ^= m; + SipHash_Rounds(ctx, rounds); + ctx->v[0] ^= m; +} + +void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) +{ + u64 k0, k1; + + k0 = le64_to_cpu(key->k0); + k1 = le64_to_cpu(key->k1); + + ctx->v[0] = 0x736f6d6570736575ULL ^ k0; + ctx->v[1] = 0x646f72616e646f6dULL ^ k1; + ctx->v[2] = 0x6c7967656e657261ULL ^ k0; + ctx->v[3] = 0x7465646279746573ULL ^ k1; + + memset(ctx->buf, 0, sizeof(ctx->buf)); + ctx->bytes = 0; +} + +void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, + const void *src, size_t len) +{ + const u8 *ptr = src; + size_t left, used; + + if (len == 0) + return; + + used = ctx->bytes % sizeof(ctx->buf); + ctx->bytes += len; + + if (used > 0) { + left = sizeof(ctx->buf) - used; + + if (len >= left) { + memcpy(&ctx->buf[used], ptr, left); + SipHash_CRounds(ctx, ctx->buf, rc); + len -= left; + ptr += left; + } else { + memcpy(&ctx->buf[used], ptr, len); + return; + } + } + + while (len >= sizeof(ctx->buf)) { + SipHash_CRounds(ctx, ptr, rc); + len -= sizeof(ctx->buf); + ptr += sizeof(ctx->buf); + } + + if (len > 0) + memcpy(&ctx->buf[used], ptr, len); +} + +void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + + r = SipHash_End(ctx, rc, rf); + + *((__le64 *) dst) = cpu_to_le64(r); +} + +u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + size_t left, used; + + used = ctx->bytes % sizeof(ctx->buf); 
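/*
 * Illustrative aside (not part of the patch): minimal usage of the SipHash
 * API defined in this file -- SipHash-2-4 (2 compression rounds, 4
 * finalization rounds) of a short buffer, once via the one-shot helper and
 * once via the incremental context.  The key bytes here are arbitrary
 * example values, and example_siphash24() is a hypothetical caller.
 */
static u64 example_siphash24(const void *buf, size_t len)
{
	SIPHASH_KEY key = {
		.k0 = cpu_to_le64(0x0706050403020100ULL),
		.k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
	};
	SIPHASH_CTX ctx;
	u64 a, b;

	/* one-shot: */
	a = SipHash(&key, 2, 4, buf, len);

	/* incremental, same result: */
	SipHash_Init(&ctx, &key);
	SipHash_Update(&ctx, 2, 4, buf, len);
	b = SipHash_End(&ctx, 2, 4);

	BUG_ON(a != b);
	return a;
}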
+ left = sizeof(ctx->buf) - used; + memset(&ctx->buf[used], 0, left - 1); + ctx->buf[7] = ctx->bytes; + + SipHash_CRounds(ctx, ctx->buf, rc); + ctx->v[2] ^= 0xff; + SipHash_Rounds(ctx, rf); + + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + memset(ctx, 0, sizeof(*ctx)); + return (r); +} + +u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) +{ + SIPHASH_CTX ctx; + + SipHash_Init(&ctx, key); + SipHash_Update(&ctx, rc, rf, src, len); + return SipHash_End(&ctx, rc, rf); +} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h new file mode 100644 index 000000000000..3dfaf34a43b2 --- /dev/null +++ b/fs/bcachefs/siphash.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ +/*- + * Copyright (c) 2013 Andre Oppermann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) + * optimized for speed on short messages returning a 64bit hash/digest value. 
+ * + * The number of rounds is defined during the initialization: + * SipHash24_Init() for the fast and resonable strong version + * SipHash48_Init() for the strong version (half as fast) + * + * struct SIPHASH_CTX ctx; + * SipHash24_Init(&ctx); + * SipHash_SetKey(&ctx, "16bytes long key"); + * SipHash_Update(&ctx, pointer_to_string, length_of_string); + * SipHash_Final(output, &ctx); + */ + +#ifndef _SIPHASH_H_ +#define _SIPHASH_H_ + +#include + +#define SIPHASH_BLOCK_LENGTH 8 +#define SIPHASH_KEY_LENGTH 16 +#define SIPHASH_DIGEST_LENGTH 8 + +typedef struct _SIPHASH_CTX { + u64 v[4]; + u8 buf[SIPHASH_BLOCK_LENGTH]; + u32 bytes; +} SIPHASH_CTX; + +typedef struct { + __le64 k0; + __le64 k1; +} SIPHASH_KEY; + +void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); +void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); +u64 SipHash_End(SIPHASH_CTX *, int, int); +void SipHash_Final(void *, SIPHASH_CTX *, int, int); +u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); + +#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) +#define SipHash24_End(_d) SipHash_End((_d), 2, 4) +#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) +#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) + +#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) +#define SipHash48_End(_d) SipHash_End((_d), 4, 8) +#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) +#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) + +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c new file mode 100644 index 000000000000..9dd4b71e63ab --- /dev/null +++ b/fs/bcachefs/six.c @@ -0,0 +1,780 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "six.h" + +#ifdef DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) do {} while (0) +#endif + +#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) +#define six_release(l) lock_release(l, _RET_IP_) + +struct six_lock_vals { + /* Value we add to the lock in order to take the lock: */ + u64 lock_val; + + /* If the lock has this value (used as a mask), taking the lock fails: */ + u64 lock_fail; + + /* Value we add to the lock in order to release the lock: */ + u64 unlock_val; + + /* Mask that indicates lock is held for this type: */ + u64 held_mask; + + /* Waitlist we wakeup when releasing the lock: */ + enum six_lock_type unlock_wakeup; +}; + +#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) +#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) +#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) + +#define LOCK_VALS { \ + [SIX_LOCK_read] = { \ + .lock_val = __SIX_VAL(read_lock, 1), \ + .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ + .unlock_val = -__SIX_VAL(read_lock, 1), \ + .held_mask = __SIX_LOCK_HELD_read, \ + .unlock_wakeup = SIX_LOCK_write, \ + }, \ + [SIX_LOCK_intent] = { \ + .lock_val = __SIX_VAL(intent_lock, 1), \ + .lock_fail = __SIX_LOCK_HELD_intent, \ + .unlock_val = -__SIX_VAL(intent_lock, 1), \ + .held_mask = __SIX_LOCK_HELD_intent, \ + .unlock_wakeup = SIX_LOCK_intent, \ + }, \ + [SIX_LOCK_write] = { \ + .lock_val = __SIX_VAL(seq, 1), \ + .lock_fail = __SIX_LOCK_HELD_read, \ + .unlock_val = __SIX_VAL(seq, 1), \ + .held_mask = __SIX_LOCK_HELD_write, \ + .unlock_wakeup = SIX_LOCK_read, \ + }, \ 
+} + +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, + union six_lock_state old) +{ + if (type != SIX_LOCK_intent) + return; + + if (!old.intent_lock) { + EBUG_ON(lock->owner); + lock->owner = current; + } else { + EBUG_ON(lock->owner != current); + } +} + +static inline unsigned pcpu_read_count(struct six_lock *lock) +{ + unsigned read_count = 0; + int cpu; + + for_each_possible_cpu(cpu) + read_count += *per_cpu_ptr(lock->readers, cpu); + return read_count; +} + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; +}; + +/* This is probably up there with the more evil things I've done */ +#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + unsigned waitlist_id) +{ + if (waitlist_id == SIX_LOCK_write) { + if (state.write_locking && !state.read_lock) { + struct task_struct *p = READ_ONCE(lock->owner); + if (p) + wake_up_process(p); + } + } else { + struct list_head *wait_list = &lock->wait_list[waitlist_id]; + struct six_lock_waiter *w, *next; + + if (!(state.waiters & (1 << waitlist_id))) + return; + + clear_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, wait_list, list) { + list_del_init(&w->list); + + if (wake_up_process(w->task) && + waitlist_id != SIX_LOCK_read) { + if (!list_empty(wait_list)) + set_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + break; + } + } + + raw_spin_unlock(&lock->wait_lock); + } +} + +static __always_inline bool do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + bool try) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old, new; + bool ret; + u64 v; + + EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); + + EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); + + /* + * Percpu reader mode: + * + * The basic idea behind this algorithm is that you can implement a lock + * between two threads without any atomics, just memory barriers: + * + * For two threads you'll need two variables, one variable for "thread a + * has the lock" and another for "thread b has the lock". + * + * To take the lock, a thread sets its variable indicating that it holds + * the lock, then issues a full memory barrier, then reads from the + * other thread's variable to check if the other thread thinks it has + * the lock. If we raced, we backoff and retry/sleep. 
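	 *
	 * A minimal sketch of that two-flag protocol (illustration only, with
	 * hypothetical names -- this is not the six lock code itself):
	 *
	 *	bool locked[2];				// one flag per thread
	 *
	 *	bool trylock(int me)			// me is 0 or 1
	 *	{
	 *		WRITE_ONCE(locked[me], true);	// claim the lock
	 *		smp_mb();			// full barrier
	 *		if (READ_ONCE(locked[!me])) {	// other side got there too
	 *			WRITE_ONCE(locked[me], false);
	 *			return false;		// back off, retry or sleep
	 *		}
	 *		return true;
	 *	}
	 *
	 * Here the reader's flag is, roughly, its percpu entry in lock->readers
	 * and the writer's flag is the write_locking bit in lock->state.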
+ */ + + if (type == SIX_LOCK_read && lock->readers) { +retry: + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ + + smp_mb(); + + old.v = READ_ONCE(lock->state.v); + ret = !(old.v & l[type].lock_fail); + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + /* + * If we failed because a writer was trying to take the + * lock, issue a wakeup because we might have caused a + * spurious trylock failure: + */ + if (old.write_locking) { + struct task_struct *p = READ_ONCE(lock->owner); + + if (p) + wake_up_process(p); + } + + /* + * If we failed from the lock path and the waiting bit wasn't + * set, set it: + */ + if (!try && !ret) { + v = old.v; + + do { + new.v = old.v = v; + + if (!(old.v & l[type].lock_fail)) + goto retry; + + if (new.waiters & (1 << type)) + break; + + new.waiters |= 1 << type; + } while ((v = atomic64_cmpxchg(&lock->state.counter, + old.v, new.v)) != old.v); + } + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic64_add(__SIX_VAL(write_locking, 1), + &lock->state.counter); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + /* + * On success, we increment lock->seq; also we clear + * write_locking unless we failed from the lock path: + */ + v = 0; + if (ret) + v += __SIX_VAL(seq, 1); + if (ret || try) + v -= __SIX_VAL(write_locking, 1); + + if (try && !ret) { + old.v = atomic64_add_return(v, &lock->state.counter); + six_lock_wakeup(lock, old, SIX_LOCK_read); + } else { + atomic64_add(v, &lock->state.counter); + } + } else { + v = READ_ONCE(lock->state.v); + do { + new.v = old.v = v; + + if (!(old.v & l[type].lock_fail)) { + new.v += l[type].lock_val; + + if (type == SIX_LOCK_write) + new.write_locking = 0; + } else if (!try && type != SIX_LOCK_write && + !(new.waiters & (1 << type))) + new.waiters |= 1 << type; + else + break; /* waiting bit already set */ + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, new.v)) != old.v); + + ret = !(old.v & l[type].lock_fail); + + EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); + } + + if (ret) + six_set_owner(lock, type, old); + + EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); + + return ret; +} + +__always_inline __flatten +static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + if (!do_six_trylock_type(lock, type, true)) + return false; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1); + return true; +} + +__always_inline __flatten +static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old; + u64 v; + + EBUG_ON(type == SIX_LOCK_write); + + if (type == SIX_LOCK_read && + lock->readers) { + bool ret; + + preempt_disable(); + this_cpu_inc(*lock->readers); + + smp_mb(); + + old.v = READ_ONCE(lock->state.v); + ret = !(old.v & l[type].lock_fail) && old.seq == seq; + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + /* + * Similar to the lock path, we may have caused a spurious write + * lock fail and need to issue a wakeup: + */ + if (old.write_locking) { + struct task_struct *p = READ_ONCE(lock->owner); + + if (p) + wake_up_process(p); + } + + if (ret) + six_acquire(&lock->dep_map, 1); + + return ret; + } + + v = READ_ONCE(lock->state.v); + do { + old.v = v; + + if (old.seq != seq || old.v & l[type].lock_fail) + return false; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, + old.v + l[type].lock_val)) != 
old.v); + + six_set_owner(lock, type, old); + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1); + return true; +} + +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER + +static inline int six_can_spin_on_owner(struct six_lock *lock) +{ + struct task_struct *owner; + int retval = 1; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = READ_ONCE(lock->owner); + if (owner) + retval = owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} + +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner) +{ + bool ret = true; + + rcu_read_lock(); + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; + break; + } + + cpu_relax(); + } + rcu_read_unlock(); + + return ret; +} + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + struct task_struct *task = current; + + if (type == SIX_LOCK_write) + return false; + + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + while (1) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner)) + break; + + if (do_six_trylock_type(lock, type, false)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. This avoids getting + * scheduled out right after we obtained the lock. + */ + if (need_resched()) + schedule(); + + return false; +} + +#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + return false; +} + +#endif + +noinline +static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + union six_lock_state old; + struct six_lock_waiter wait; + int ret = 0; + + if (type == SIX_LOCK_write) { + EBUG_ON(lock->state.write_locking); + atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); + smp_mb__after_atomic(); + } + + ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; + if (ret) + goto out_before_sleep; + + if (six_optimistic_spin(lock, type)) + goto out_before_sleep; + + lock_contended(&lock->dep_map, _RET_IP_); + + INIT_LIST_HEAD(&wait.list); + wait.task = current; + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (type == SIX_LOCK_write) + EBUG_ON(lock->owner != current); + else if (list_empty_careful(&wait.list)) { + raw_spin_lock(&lock->wait_lock); + list_add_tail(&wait.list, &lock->wait_list[type]); + raw_spin_unlock(&lock->wait_lock); + } + + if (do_six_trylock_type(lock, type, false)) + break; + + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (ret) + break; + + schedule(); + } + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait.list)) { + raw_spin_lock(&lock->wait_lock); + list_del_init(&wait.list); + raw_spin_unlock(&lock->wait_lock); + } +out_before_sleep: + if (ret && type == SIX_LOCK_write) { + old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), + &lock->state.counter); + six_lock_wakeup(lock, old, SIX_LOCK_read); + } + + return ret; +} + +__always_inline +static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + int ret; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 0); + + ret = do_six_trylock_type(lock, type, true) ? 0 + : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); + + if (ret && type != SIX_LOCK_write) + six_release(&lock->dep_map); + if (!ret) + lock_acquired(&lock->dep_map, _RET_IP_); + + return ret; +} + +__always_inline __flatten +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state state; + + EBUG_ON(type == SIX_LOCK_write && + !(lock->state.v & __SIX_LOCK_HELD_intent)); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map); + + if (type == SIX_LOCK_intent) { + EBUG_ON(lock->owner != current); + + if (lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + lock->owner = NULL; + } + + if (type == SIX_LOCK_read && + lock->readers) { + smp_mb(); /* unlock barrier */ + this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ + state.v = READ_ONCE(lock->state.v); + } else { + EBUG_ON(!(lock->state.v & l[type].held_mask)); + state.v = atomic64_add_return_release(l[type].unlock_val, + &lock->state.counter); + } + + six_lock_wakeup(lock, state, l[type].unlock_wakeup); +} + +#define __SIX_LOCK(type) \ +bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return __six_trylock_type(lock, SIX_LOCK_##type); \ +} \ +EXPORT_SYMBOL_GPL(six_trylock_##type); \ + \ +bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return __six_relock_type(lock, SIX_LOCK_##type, seq); \ +} \ +EXPORT_SYMBOL_GPL(six_relock_##type); \ + \ +int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p) \ +{ \ + return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ +} \ +EXPORT_SYMBOL_GPL(six_lock_##type); \ + \ +void six_unlock_##type(struct six_lock *lock) \ +{ \ + __six_unlock_type(lock, SIX_LOCK_##type); \ +} \ +EXPORT_SYMBOL_GPL(six_unlock_##type); + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) + +#undef __SIX_LOCK + +/* Convert from intent to read: */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} +EXPORT_SYMBOL_GPL(six_lock_downgrade); + +bool six_lock_tryupgrade(struct 
six_lock *lock) +{ + union six_lock_state old, new; + u64 v = READ_ONCE(lock->state.v); + + do { + new.v = old.v = v; + + if (new.intent_lock) + return false; + + if (!lock->readers) { + EBUG_ON(!new.read_lock); + new.read_lock--; + } + + new.intent_lock = 1; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, new.v)) != old.v); + + if (lock->readers) + this_cpu_dec(*lock->readers); + + six_set_owner(lock, SIX_LOCK_intent, old); + + return true; +} +EXPORT_SYMBOL_GPL(six_lock_tryupgrade); + +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); + + if (to == from) + return true; + + if (to == SIX_LOCK_read) { + six_lock_downgrade(lock); + return true; + } else { + return six_lock_tryupgrade(lock); + } +} +EXPORT_SYMBOL_GPL(six_trylock_convert); + +/* + * Increment read/intent lock count, assuming we already have it read or intent + * locked: + */ +void six_lock_increment(struct six_lock *lock, enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + + six_acquire(&lock->dep_map, 0); + + /* XXX: assert already locked, and that we don't overflow: */ + + switch (type) { + case SIX_LOCK_read: + if (lock->readers) { + this_cpu_inc(*lock->readers); + } else { + EBUG_ON(!lock->state.read_lock && + !lock->state.intent_lock); + atomic64_add(l[type].lock_val, &lock->state.counter); + } + break; + case SIX_LOCK_intent: + EBUG_ON(!lock->state.intent_lock); + lock->intent_lock_recurse++; + break; + case SIX_LOCK_write: + BUG(); + break; + } +} +EXPORT_SYMBOL_GPL(six_lock_increment); + +void six_lock_wakeup_all(struct six_lock *lock) +{ + struct six_lock_waiter *w; + + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry(w, &lock->wait_list[0], list) + wake_up_process(w->task); + list_for_each_entry(w, &lock->wait_list[1], list) + wake_up_process(w->task); + + raw_spin_unlock(&lock->wait_lock); +} +EXPORT_SYMBOL_GPL(six_lock_wakeup_all); + +struct free_pcpu_rcu { + struct rcu_head rcu; + void __percpu *p; +}; + +static void free_pcpu_rcu_fn(struct rcu_head *_rcu) +{ + struct free_pcpu_rcu *rcu = + container_of(_rcu, struct free_pcpu_rcu, rcu); + + free_percpu(rcu->p); + kfree(rcu); +} + +void six_lock_pcpu_free_rcu(struct six_lock *lock) +{ + struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); + + if (!rcu) + return; + + rcu->p = lock->readers; + lock->readers = NULL; + + call_rcu(&rcu->rcu, free_pcpu_rcu_fn); +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); + +void six_lock_pcpu_free(struct six_lock *lock) +{ + BUG_ON(lock->readers && pcpu_read_count(lock)); + BUG_ON(lock->state.read_lock); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free); + +void six_lock_pcpu_alloc(struct six_lock *lock) +{ +#ifdef __KERNEL__ + if (!lock->readers) + lock->readers = alloc_percpu(unsigned); +#endif +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); + +/* + * Returns lock held counts, for both read and intent + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) +{ + struct six_lock_count ret = { 0, lock->state.intent_lock }; + + if (!lock->readers) + ret.read += lock->state.read_lock; + else { + int cpu; + + for_each_possible_cpu(cpu) + ret.read += *per_cpu_ptr(lock->readers, cpu); + } + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_counts); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h new file mode 100644 index 000000000000..08d0e0c7f2b4 --- /dev/null +++ b/fs/bcachefs/six.h @@ -0,0 +1,215 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_SIX_H +#define _LINUX_SIX_H + +/* + * Shared/intent/exclusive locks: sleepable read/write locks, much like rw + * semaphores, except with a third intermediate state, intent. Basic operations + * are: + * + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * + * six_lock_intent(&foo->lock); + * six_unlock_intent(&foo->lock); + * + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * + * Intent locks block other intent locks, but do not block read locks, and you + * must have an intent lock held before taking a write lock, like so: + * + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); + * + * Other operations: + * + * six_trylock_read() + * six_trylock_intent() + * six_trylock_write() + * + * six_lock_downgrade(): convert from intent to read + * six_lock_tryupgrade(): attempt to convert from read to intent + * + * Locks also embed a sequence number, which is incremented when the lock is + * locked or unlocked for write. The current sequence number can be grabbed + * while a lock is held from lock->state.seq; then, if you drop the lock you can + * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock + * iff it hasn't been locked for write in the meantime. + * + * There are also operations that take the lock type as a parameter, where the + * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: + * + * six_lock_type(lock, type) + * six_unlock_type(lock, type) + * six_relock(lock, type, seq) + * six_trylock_type(lock, type) + * six_trylock_convert(lock, from, to) + * + * A lock may be held multiple times by the same thread (for read or intent, + * not write). However, the six locks code does _not_ implement the actual + * recursive checks itself though - rather, if your code (e.g. btree iterator + * code) knows that the current thread already has a lock held, and for the + * correct type, six_lock_increment() may be used to bump up the counter for + * that type - the only effect is that one more call to unlock will be required + * before the lock is unlocked. + */ + +#include +#include +#include + +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER +#include +#endif + +#define SIX_LOCK_SEPARATE_LOCKFNS + +union six_lock_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + /* for waitlist_bitnr() */ + unsigned long l; + }; + + struct { + unsigned read_lock:27; + unsigned write_locking:1; + unsigned intent_lock:1; + unsigned waiters:3; + /* + * seq works much like in seqlocks: it's incremented every time + * we lock and unlock for write. + * + * If it's odd write lock is held, even unlocked. + * + * Thus readers can unlock, and then lock again later iff it + * hasn't been modified in the meantime. 
+ */ + u32 seq; + }; +}; + +enum six_lock_type { + SIX_LOCK_read, + SIX_LOCK_intent, + SIX_LOCK_write, +}; + +struct six_lock { + union six_lock_state state; + unsigned intent_lock_recurse; + struct task_struct *owner; +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER + struct optimistic_spin_queue osq; +#endif + unsigned __percpu *readers; + + raw_spinlock_t wait_lock; + struct list_head wait_list[2]; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); + +static __always_inline void __six_lock_init(struct six_lock *lock, + const char *name, + struct lock_class_key *key) +{ + atomic64_set(&lock->state.counter, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} + +#define six_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __six_lock_init((lock), #lock, &__key); \ +} while (0) + +#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) + +#define __SIX_LOCK(type) \ +bool six_trylock_##type(struct six_lock *); \ +bool six_relock_##type(struct six_lock *, u32); \ +int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ +void six_unlock_##type(struct six_lock *); + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + +#define SIX_LOCK_DISPATCH(type, fn, ...) \ + switch (type) { \ + case SIX_LOCK_read: \ + return fn##_read(__VA_ARGS__); \ + case SIX_LOCK_intent: \ + return fn##_intent(__VA_ARGS__); \ + case SIX_LOCK_write: \ + return fn##_write(__VA_ARGS__); \ + default: \ + BUG(); \ + } + +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + SIX_LOCK_DISPATCH(type, six_trylock, lock); +} + +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + SIX_LOCK_DISPATCH(type, six_relock, lock, seq); +} + +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); +} + +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + SIX_LOCK_DISPATCH(type, six_unlock, lock); +} + +void six_lock_downgrade(struct six_lock *); +bool six_lock_tryupgrade(struct six_lock *); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); + +void six_lock_increment(struct six_lock *, enum six_lock_type); + +void six_lock_wakeup_all(struct six_lock *); + +void six_lock_pcpu_free_rcu(struct six_lock *); +void six_lock_pcpu_free(struct six_lock *); +void six_lock_pcpu_alloc(struct six_lock *); + +struct six_lock_count { + unsigned read; + unsigned intent; +}; + +struct six_lock_count six_lock_counts(struct six_lock *); + +#endif /* _LINUX_SIX_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 index 000000000000..0947fdcdc4cd --- /dev/null +++ b/fs/bcachefs/str_hash.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "checksum.h" +#include "error.h" +#include "inode.h" +#include "siphash.h" +#include "super.h" + +#include +#include +#include + +struct bch_hash_info { + 
u8 type; + union { + __le64 crc_key; + SIPHASH_KEY siphash_key; + }; +}; + +static inline struct bch_hash_info +bch2_hash_info_init(struct bch_fs *c, + const struct bch_inode_unpacked *bi) +{ + /* XXX ick */ + struct bch_hash_info info = { + .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & + ~(~0U << INODE_STR_HASH_BITS) + }; + + switch (info.type) { + case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_CRC64: + info.crc_key = bi->bi_hash_seed; + break; + case BCH_STR_HASH_SIPHASH: { + SHASH_DESC_ON_STACK(desc, c->sha256); + u8 digest[SHA256_DIGEST_SIZE]; + + desc->tfm = c->sha256; + + crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); + memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); + break; + } + default: + BUG(); + } + + return info; +} + +struct bch_str_hash_ctx { + union { + u32 crc32c; + u64 crc64; + SIPHASH_CTX siphash; + }; +}; + +static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); + break; + case BCH_STR_HASH_CRC64: + ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key)); + break; + case BCH_STR_HASH_SIPHASH: + SipHash24_Init(&ctx->siphash, &info->siphash_key); + break; + default: + BUG(); + } +} + +static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info, + const void *data, size_t len) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + ctx->crc32c = crc32c(ctx->crc32c, data, len); + break; + case BCH_STR_HASH_CRC64: + ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len); + break; + case BCH_STR_HASH_SIPHASH: + SipHash24_Update(&ctx->siphash, data, len); + break; + default: + BUG(); + } +} + +static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + return ctx->crc32c; + case BCH_STR_HASH_CRC64: + return ctx->crc64 >> 1; + case BCH_STR_HASH_SIPHASH: + return SipHash24_End(&ctx->siphash) >> 1; + default: + BUG(); + } +} + +struct bch_hash_desc { + enum btree_id btree_id; + u8 key_type; + u8 whiteout_type; + + u64 (*hash_key)(const struct bch_hash_info *, const void *); + u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); + bool (*cmp_key)(struct bkey_s_c, const void *); + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); +}; + +static inline struct btree_iter * +bch2_hash_lookup(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key, + unsigned flags) +{ + struct btree_iter *iter; + struct bkey_s_c k; + + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|flags); + if (IS_ERR(iter)) + return iter; + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_key(k, key)) + return iter; + } else if (k.k->type == desc.whiteout_type) { + ; + } else { + /* hole, not found */ + break; + } + } + + return IS_ERR(k.k) ? 
ERR_CAST(k.k) : ERR_PTR(-ENOENT); +} + +static inline struct btree_iter * +bch2_hash_hole(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) +{ + struct btree_iter *iter; + struct bkey_s_c k; + + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return iter; + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; + + if (k.k->type != desc.key_type) + return iter; + } + + return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC); +} + +static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) +{ + struct btree_iter *iter; + struct bkey_s_c k; + + iter = bch2_trans_copy_iter(trans, start); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + bch2_btree_iter_next_slot(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (k.k->type != desc.key_type && + k.k->type != desc.whiteout_type) + return false; + + if (k.k->type == desc.key_type && + desc.hash_bkey(info, k) <= start->pos.offset) + return true; + } + return btree_iter_err(k); +} + +static inline int __bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) +{ + struct btree_iter *iter, *slot = NULL; + struct bkey_s_c k; + + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + + /* hash collision: */ + continue; + } + + if (!slot && + !(flags & BCH_HASH_SET_MUST_REPLACE)) { + slot = bch2_trans_copy_iter(trans, iter); + if (IS_ERR(slot)) + return PTR_ERR(slot); + } + + if (k.k->type != desc.whiteout_type) + goto not_found; + } + + return btree_iter_err(k) ?: -ENOSPC; +not_found: + if (flags & BCH_HASH_SET_MUST_REPLACE) + return -ENOENT; + + insert->k.p = slot->pos; + bch2_trans_update(trans, slot, insert, 0); + return 0; +found: + if (flags & BCH_HASH_SET_MUST_CREATE) + return -EEXIST; + + insert->k.p = iter->pos; + bch2_trans_update(trans, iter, insert, 0); + return 0; +} + +static inline int bch2_hash_set(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + u64 *journal_seq, + struct bkey_i *insert, int flags) +{ + return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC, + __bch2_hash_set(&trans, desc, info, + inode, insert, flags)); +} + +static inline int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter) +{ + struct bkey_i *delete; + int ret; + + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); + if (ret < 0) + return ret; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); + + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? 
desc.whiteout_type : KEY_TYPE_DELETED; + + bch2_trans_update(trans, iter, delete, 0); + return 0; +} + +static inline int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) +{ + struct btree_iter *iter; + + iter = bch2_hash_lookup(trans, desc, info, inode, key, + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + return bch2_hash_delete_at(trans, desc, info, iter); +} + +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 index 000000000000..64c2375302a0 --- /dev/null +++ b/fs/bcachefs/super-io.c @@ -0,0 +1,971 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "checksum.h" +#include "disk_groups.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "replicas.h" +#include "quota.h" +#include "super-io.h" +#include "super.h" +#include "vstructs.h" + +#include +#include + +static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { +}; + +const char * const bch2_sb_fields[] = { +#define x(name, nr) #name, + BCH_SB_FIELDS() +#undef x + NULL +}; + +static const char *bch2_sb_field_validate(struct bch_sb *, + struct bch_sb_field *); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f; + + /* XXX: need locking around superblock to access optional fields */ + + vstruct_for_each(sb, f) + if (le32_to_cpu(f->type) == type) + return f; + return NULL; +} + +static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; + + BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > + sb->page_order); + + if (!f) { + f = vstruct_last(sb->sb); + memset(f, 0, sizeof(u64) * u64s); + f->u64s = cpu_to_le32(u64s); + f->type = 0; + } else { + void *src, *dst; + + src = vstruct_end(f); + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + + memmove(dst, src, vstruct_end(sb->sb) - src); + + if (dst > src) + memset(src, 0, dst - src); + } + + sb->sb->u64s = cpu_to_le32(sb_u64s); + + return f; +} + +/* Superblock realloc/free: */ + +void bch2_free_super(struct bch_sb_handle *sb) +{ + if (sb->bio) + kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->holder); + kfree(sb->holder); + + free_pages((unsigned long) sb->sb, sb->page_order); + memset(sb, 0, sizeof(*sb)); +} + +int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) +{ + size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); + unsigned order = get_order(new_bytes); + struct bch_sb *new_sb; + struct bio *bio; + + if (sb->sb && sb->page_order >= order) + return 0; + + if (sb->have_layout) { + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + pr_err("%pg: superblock too big: want %zu but have %llu", + sb->bdev, new_bytes, max_bytes); + return -ENOSPC; + } + } + + if (sb->page_order >= order && sb->sb) + return 0; + + if (dynamic_fault("bcachefs:add:super_realloc")) + return -ENOMEM; + + if (sb->have_bio) { + unsigned nr_bvecs = 1 << order; + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -ENOMEM; + + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + + if (sb->bio) + kfree(sb->bio); + sb->bio = bio; + } + + new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!new_sb) + return -ENOMEM; 
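/*
 * Illustrative aside (not part of the patch): __bch2_sb_field_resize() above
 * resizes one variable-length field in the middle of the superblock by
 * shifting everything after it and zeroing any newly exposed bytes.  The
 * same idea on a plain byte buffer, with hypothetical parameters (buf holds
 * used_bytes of packed fields, the field starts at off and changes from
 * old_len to new_len; the caller has already checked the buffer is big
 * enough); assumes <linux/string.h>:
 */
static void resize_field(char *buf, size_t used_bytes,
			 size_t off, size_t old_len, size_t new_len)
{
	char *old_end = buf + off + old_len;
	char *new_end = buf + off + new_len;
	size_t tail = used_bytes - (off + old_len);

	/* shift the fields that follow to their new position */
	memmove(new_end, old_end, tail);

	/* when growing, zero the bytes the field just gained */
	if (new_len > old_len)
		memset(old_end, 0, new_len - old_len);
}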
+ + if (sb->sb) + memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); + + free_pages((unsigned long) sb->sb, sb->page_order); + sb->sb = new_sb; + + sb->page_order = order; + + return 0; +} + +struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + if (sb->fs_sb) { + struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + /* XXX: we're not checking that offline device have enough space */ + + for_each_online_member(ca, c, i) { + struct bch_sb_handle *sb = &ca->disk_sb; + + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + percpu_ref_put(&ca->ref); + return NULL; + } + } + } + + f = __bch2_sb_field_resize(sb, f, u64s); + f->type = cpu_to_le32(type); + return f; +} + +/* Superblock validate: */ + +static inline void __bch2_sb_layout_size_assert(void) +{ + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + +static const char *validate_sb_layout(struct bch_sb_layout *layout) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) + return "Not a bcachefs superblock layout"; + + if (layout->layout_type != 0) + return "Invalid superblock layout type"; + + if (!layout->nr_superblocks) + return "Invalid superblock layout: no superblocks"; + + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) + return "Invalid superblock layout: too many superblocks"; + + max_sectors = 1 << layout->sb_max_size_bits; + + prev_offset = le64_to_cpu(layout->sb_offset[0]); + + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset < prev_offset + max_sectors) + return "Invalid superblock layout: superblocks overlap"; + prev_offset = offset; + } + + return NULL; +} + +const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; + const char *err; + u16 block_size; + + if (le16_to_cpu(sb->version) < BCH_SB_VERSION_MIN || + le16_to_cpu(sb->version) > BCH_SB_VERSION_MAX) + return "Unsupported superblock version"; + + if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) { + SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7); + SET_BCH_SB_POSIX_ACL(sb, 1); + } + + block_size = le16_to_cpu(sb->block_size); + + if (!is_power_of_2(block_size) || + block_size > PAGE_SECTORS) + return "Bad block size"; + + if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) + return "Bad user UUID"; + + if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) + return "Bad internal UUID"; + + if (!sb->nr_devices || + sb->nr_devices <= sb->dev_idx || + sb->nr_devices > BCH_SB_MEMBERS_MAX) + return "Bad number of member devices"; + + if (!BCH_SB_META_REPLICAS_WANT(sb) || + BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_META_REPLICAS_REQ(sb) || + BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_DATA_REPLICAS_WANT(sb) || + BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + return "Invalid number of data replicas"; + + if (!BCH_SB_DATA_REPLICAS_REQ(sb) || + BCH_SB_DATA_REPLICAS_REQ(sb) >= 
BCH_REPLICAS_MAX) + return "Invalid number of data replicas"; + + if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) + return "Invalid metadata checksum type"; + + if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) + return "Invalid metadata checksum type"; + + if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) + return "Invalid compression type"; + + if (!BCH_SB_BTREE_NODE_SIZE(sb)) + return "Btree node size not set"; + + if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) + return "Btree node size not a power of two"; + + if (BCH_SB_GC_RESERVE(sb) < 5) + return "gc reserve percentage too small"; + + if (!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) + return "invalid time precision"; + + /* validate layout */ + err = validate_sb_layout(&sb->layout); + if (err) + return err; + + vstruct_for_each(sb, f) { + if (!f->u64s) + return "Invalid superblock: invalid optional field"; + + if (vstruct_next(f) > vstruct_last(sb)) + return "Invalid superblock: invalid optional field"; + } + + /* members must be validated first: */ + mi = bch2_sb_get_members(sb); + if (!mi) + return "Invalid superblock: member info area missing"; + + err = bch2_sb_field_validate(sb, &mi->field); + if (err) + return err; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + continue; + + err = bch2_sb_field_validate(sb, f); + if (err) + return err; + } + + if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 && + bch2_sb_get_crypt(sb) && + BCH_SB_INITIALIZED(sb)) + return "Incompatible extent nonces"; + + sb->version = cpu_to_le16(BCH_SB_VERSION_MAX); + + return NULL; +} + +/* device open: */ + +static void bch2_sb_update(struct bch_fs *c) +{ + struct bch_sb *src = c->disk_sb.sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(src); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); + c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); + c->sb.time_precision = le32_to_cpu(src->time_precision); + c->sb.features = le64_to_cpu(src->features[0]); + + for_each_member_device(ca, c, i) + ca->mi = bch2_mi_to_cpu(mi->members + i); +} + +/* doesn't copy member info */ +static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +{ + struct bch_sb_field *src_f, *dst_f; + struct bch_sb *dst = dst_handle->sb; + + dst->version = src->version; + dst->seq = src->seq; + dst->uuid = src->uuid; + dst->user_uuid = src->user_uuid; + memcpy(dst->label, src->label, sizeof(dst->label)); + + dst->block_size = src->block_size; + dst->nr_devices = src->nr_devices; + + dst->time_base_lo = src->time_base_lo; + dst->time_base_hi = src->time_base_hi; + dst->time_precision = src->time_precision; + + memcpy(dst->flags, src->flags, sizeof(dst->flags)); + memcpy(dst->features, src->features, sizeof(dst->features)); + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + vstruct_for_each(src, src_f) { + if (src_f->type == BCH_SB_FIELD_journal) + continue; + + dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type)); + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, + le32_to_cpu(src_f->u64s)); + + memcpy(dst_f, src_f, vstruct_bytes(src_f)); + } +} + +int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) +{ + struct 
bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(src); + unsigned journal_u64s = journal_buckets + ? le32_to_cpu(journal_buckets->field.u64s) + : 0; + int ret; + + lockdep_assert_held(&c->sb_lock); + + ret = bch2_sb_realloc(&c->disk_sb, + le32_to_cpu(src->u64s) - journal_u64s); + if (ret) + return ret; + + __copy_super(&c->disk_sb, src); + + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; + + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + return ret; + + bch2_sb_update(c); + return 0; +} + +int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(dst); + unsigned journal_u64s = journal_buckets + ? le32_to_cpu(journal_buckets->field.u64s) + : 0; + unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; + int ret; + + ret = bch2_sb_realloc(&ca->disk_sb, u64s); + if (ret) + return ret; + + __copy_super(&ca->disk_sb, src); + return 0; +} + +/* read superblock: */ + +static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) +{ + struct bch_csum csum; + size_t bytes; +reread: + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = offset; + sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; + bch2_bio_map(sb->bio, sb->sb); + + if (submit_bio_wait(sb->bio)) + return "IO error"; + + if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && + !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) + return "Not a bcachefs superblock"; + + if (le16_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN || + le16_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX) + return "Unsupported superblock version"; + + bytes = vstruct_bytes(sb->sb); + + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) + return "Bad superblock: too big"; + + if (get_order(bytes) > sb->page_order) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) + return "cannot allocate memory"; + goto reread; + } + + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) + return "unknown csum type"; + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + + if (bch2_crc_cmp(csum, sb->sb->csum)) + return "bad checksum reading superblock"; + + return NULL; +} + +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; + const char *err; + __le64 *i; + int ret; + + pr_verbose_init(*opts, ""); + + memset(sb, 0, sizeof(*sb)); + sb->mode = BLK_OPEN_READ; + sb->have_bio = true; + sb->holder = kmalloc(1, GFP_KERNEL); + if (!sb->holder) + return -ENOMEM; + + if (!opt_get(*opts, noexcl)) + sb->mode |= BLK_OPEN_EXCL; + + if (!opt_get(*opts, nochanges)) + sb->mode |= BLK_OPEN_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { + sb->mode &= ~BLK_OPEN_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } + + if (IS_ERR(sb->bdev)) { + ret = PTR_ERR(sb->bdev); + goto out; + } + + err = "cannot allocate memory"; + ret = bch2_sb_realloc(sb, 0); + if (ret) + goto err; + + ret = -EFAULT; + err = "dynamic fault"; + if (bch2_fs_init_fault("read_super")) + goto err; + + ret = -EINVAL; + err = read_one_super(sb, offset); + if (!err) + goto got_super; + + if (opt_defined(*opts, sb)) + goto 
err; + + pr_err("error reading default superblock: %s", err); + + /* + * Error reading primary superblock - read location of backup + * superblocks: + */ + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); + /* + * use sb buffer to read layout, since sb buffer is page aligned but + * layout won't be: + */ + bch2_bio_map(sb->bio, sb->sb); + + err = "IO error"; + if (submit_bio_wait(sb->bio)) + goto err; + + memcpy(&layout, sb->sb, sizeof(layout)); + err = validate_sb_layout(&layout); + if (err) + goto err; + + for (i = layout.sb_offset; + i < layout.sb_offset + layout.nr_superblocks; i++) { + offset = le64_to_cpu(*i); + + if (offset == opt_get(*opts, sb)) + continue; + + err = read_one_super(sb, offset); + if (!err) + goto got_super; + } + + ret = -EINVAL; + goto err; + +got_super: + err = "Superblock block size smaller than device block size"; + ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) + goto err; + + ret = 0; + sb->have_layout = true; +out: + pr_verbose_init(*opts, "ret %i", ret); + return ret; +err: + bch2_free_super(sb); + pr_err("error reading superblock: %s", err); + goto out; +} + +/* write superblock: */ + +static void write_super_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + + /* XXX: return errors directly */ + + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) + ca->sb_write_error = 1; + + closure_put(&ca->fs->sb_write); + percpu_ref_put(&ca->io_ref); +} + +static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + sb->offset = sb->layout.sb_offset[idx]; + + SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + null_nonce(), sb); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_iter.bi_size = + roundup((size_t) vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev)); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, sb); + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + +void bch2_write_super(struct bch_fs *c) +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; + unsigned i, sb = 0, nr_wrote; + const char *err; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); + memset(&sb_written, 0, sizeof(sb_written)); + + le64_add_cpu(&c->disk_sb.sb->seq, 1); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { + err = bch2_sb_validate(&ca->disk_sb); + if (err) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err); + goto out; + } + } + + if (c->opts.nochanges || + test_bit(BCH_FS_ERROR, &c->flags)) + goto out; + + for_each_online_member(ca, c, i) { + __set_bit(ca->dev_idx, sb_written.d); + ca->sb_write_error = 0; + } + + do { + wrote = false; + for_each_online_member(ca, c, i) + if (sb < ca->disk_sb.sb->layout.nr_superblocks) { + write_one_super(c, ca, sb); + wrote = true; + } + closure_sync(cl); + sb++; + } while (wrote); + + for_each_online_member(ca, c, i) + if (ca->sb_write_error) + __clear_bit(ca->dev_idx, 
sb_written.d); + + nr_wrote = dev_mask_nr(&sb_written); + + can_mount_with_written = + bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), + BCH_FORCE_IF_DEGRADED); + + for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + sb_written.d[i] = ~sb_written.d[i]; + + can_mount_without_written = + bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), + BCH_FORCE_IF_DEGRADED); + + /* + * If we would be able to mount _without_ the devices we successfully + * wrote superblocks to, we weren't able to write to enough devices: + * + * Exception: if we can mount without the successes because we haven't + * written anything (new filesystem), we continue if we'd be able to + * mount with the devices we did successfully write to: + */ + bch2_fs_fatal_err_on(!nr_wrote || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices"); +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); +} + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + u64 l = *((const u64 *) _l), r = *((const u64 *) _r); + + return l < r ? -1 : l > r ? 1 : 0; +} + +static const char *bch2_sb_validate_journal(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + const char *err; + unsigned nr; + unsigned i; + u64 *b; + + journal = bch2_sb_get_journal(sb); + if (!journal) + return NULL; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return NULL; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return "cannot allocate memory"; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + err = "journal bucket at sector 0"; + if (!b[0]) + goto err; + + err = "journal bucket before first bucket"; + if (m && b[0] < le16_to_cpu(m->first_bucket)) + goto err; + + err = "journal bucket past end of device"; + if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) + goto err; + + err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) + goto err; + + err = NULL; +err: + kfree(b); + return err; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_validate_journal, +}; + +/* BCH_SB_FIELD_members: */ + +static const char *bch2_sb_validate_members(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_member *m; + + if ((void *) (mi->members + sb->nr_devices) > + vstruct_end(&mi->field)) + return "Invalid superblock: bad member info"; + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) { + if (!bch2_member_exists(m)) + continue; + + if (le64_to_cpu(m->nbuckets) > LONG_MAX) + return "Too many buckets"; + + if (le64_to_cpu(m->nbuckets) - + le16_to_cpu(m->first_bucket) < 1 << 10) + return "Not enough buckets"; + + if (le16_to_cpu(m->bucket_size) < + le16_to_cpu(sb->block_size)) + return "bucket size smaller than block size"; + + if (le16_to_cpu(m->bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) + return "bucket size smaller than btree node size"; + } + + if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) + SET_BCH_MEMBER_DATA_ALLOWED(m, ~0); + + return NULL; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_validate_members, +}; + +/* 
BCH_SB_FIELD_crypt: */ + +static const char *bch2_sb_validate_crypt(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) + return "invalid field crypt: wrong size"; + + if (BCH_CRYPT_KDF_TYPE(crypt)) + return "invalid field crypt: bad kdf type"; + + return NULL; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_validate_crypt, +}; + +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *c, bool clean) +{ + struct bch_sb_field_clean *sb_clean; + unsigned u64s = sizeof(*sb_clean) / sizeof(u64); + struct jset_entry *entry; + struct btree_root *r; + + mutex_lock(&c->sb_lock); + if (clean == BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, clean); + + if (!clean) + goto write_super; + + mutex_lock(&c->btree_root_lock); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) + u64s += jset_u64s(r->key.u64s); + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1; + + entry = sb_clean->start; + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) { + entry->u64s = r->key.u64s; + entry->btree_id = r - c->btree_roots; + entry->level = r->level; + entry->type = BCH_JSET_ENTRY_btree_root; + bkey_copy(&entry->start[0], &r->key); + entry = vstruct_next(entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + } + + BUG_ON(entry != vstruct_end(&sb_clean->field)); + + mutex_unlock(&c->btree_root_lock); +write_super: + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} + +static const char *bch2_sb_validate_clean(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) + return "invalid field crypt: wrong size"; + + return NULL; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_validate_clean, +}; + +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, + BCH_SB_FIELDS() +#undef x +}; + +static const char *bch2_sb_field_validate(struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + + return type < BCH_SB_FIELD_NR + ? bch2_sb_field_ops[type]->validate(sb, f) + : NULL; +} + +size_t bch2_sb_field_to_text(char *buf, size_t size, + struct bch_sb *sb, struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + size_t (*to_text)(char *, size_t, struct bch_sb *, + struct bch_sb_field *) = + type < BCH_SB_FIELD_NR + ? 
bch2_sb_field_ops[type]->to_text + : NULL; + + if (!to_text) { + if (size) + buf[0] = '\0'; + return 0; + } + + return to_text(buf, size, sb, f); +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 index 000000000000..1ea91f71f3b0 --- /dev/null +++ b/fs/bcachefs/super-io.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H + +#include "extents.h" +#include "eytzinger.h" +#include "super_types.h" +#include "super.h" + +#include + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define x(_name, _nr) \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_get_##_name(struct bch_sb *sb) \ +{ \ + return field_to_type(bch2_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ +{ \ + return field_to_type(bch2_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} + +BCH_SB_FIELDS() +#undef x + +extern const char * const bch2_sb_fields[]; + +struct bch_sb_field_ops { + const char * (*validate)(struct bch_sb *, struct bch_sb_field *); + size_t (*to_text)(char *, size_t, struct bch_sb *, + struct bch_sb_field *); +}; + +static inline bool bch2_sb_test_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + unsigned w = f / 64; + unsigned b = f % 64; + + return le64_to_cpu(sb->features[w]) & (1ULL << b); +} + +static inline void bch2_sb_set_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + if (!bch2_sb_test_feature(sb, f)) { + unsigned w = f / 64; + unsigned b = f % 64; + + le64_add_cpu(&sb->features[w], 1ULL << b); + } +} + +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} + +static inline __u64 jset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); +} + +static inline __u64 bset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); +} + +int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); +int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); + +void bch2_free_super(struct bch_sb_handle *); +int bch2_sb_realloc(struct bch_sb_handle *, unsigned); + +const char *bch2_sb_validate(struct bch_sb_handle *); + +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +void bch2_write_super(struct bch_fs *); + +/* BCH_SB_FIELD_journal: */ + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? 
(__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +/* BCH_SB_FIELD_members: */ + +static inline bool bch2_member_exists(struct bch_member *m) +{ + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); +} + +static inline bool bch2_dev_exists(struct bch_sb *sb, + struct bch_sb_field_members *mi, + unsigned dev) +{ + return dev < sb->nr_devices && + bch2_member_exists(&mi->members[dev]); +} + +static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), + .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .valid = bch2_member_exists(mi), + }; +} + +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *, bool); + +size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, + struct bch_sb_field *); + +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 index 000000000000..3191d4cc8140 --- /dev/null +++ b/fs/bcachefs/super.c @@ -0,0 +1,1754 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "move.h" +#include "migrate.h" +#include "movinggc.h" +#include "quota.h" +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" +#include "sysfs.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet "); + +#define KTYPE(type) \ +static const struct attribute_group type ## _group = { \ + .attrs = type ## _files \ +}; \ + \ +static const struct attribute_group *type ## _groups[] = { \ + &type ## _group, \ + NULL \ +}; \ + \ +static const struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &type ## _sysfs_ops, \ + .default_groups = type ## _groups \ +} + +static void bch2_fs_release(struct kobject *); +static void bch2_dev_release(struct kobject *); + +static void bch2_fs_internal_release(struct kobject *k) +{ +} + +static void bch2_fs_opts_dir_release(struct kobject *k) +{ +} + +static void bch2_fs_time_stats_release(struct kobject *k) +{ +} + +KTYPE(bch2_fs); +KTYPE(bch2_fs_internal); +KTYPE(bch2_fs_opts_dir); +KTYPE(bch2_fs_time_stats); +KTYPE(bch2_dev); + +static struct kset *bcachefs_kset; +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); + +static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); + +static void bch2_dev_free(struct bch_dev *); +static int bch2_dev_alloc(struct bch_fs *, unsigned); +static int bch2_dev_sysfs_online(struct 
bch_fs *, struct bch_dev *); +static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); + +struct bch_fs *bch2_dev_to_fs(dev_t dev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev->bd_dev == dev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) + return c; + + return NULL; +} + +struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +/* Filesystem RO/RW: */ + +/* + * For startup/shutdown of RW stuff, the dependencies are: + * + * - foreground writes depend on copygc and rebalance (to free up space) + * + * - copygc and rebalance depend on mark and sweep gc (they actually probably + * don't because they either reserve ahead of time or don't block if + * allocations fail, but allocations can require mark and sweep gc to run + * because of generation number wraparound) + * + * - all of the above depends on the allocator threads + * + * - allocator depends on the journal (when it rewrites prios and gens) + */ + +static void __bch2_fs_read_only(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + bch2_rebalance_stop(c); + + for_each_member_device(ca, c, i) + bch2_copygc_stop(ca); + + bch2_gc_thread_stop(c); + + /* + * Flush journal before stopping allocators, because flushing journal + * blacklist entries involves allocating new btree nodes: + */ + bch2_journal_flush_all_pins(&c->journal); + + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); + + bch2_journal_flush_all_pins(&c->journal); + + /* + * We need to explicitly wait on btree interior updates to complete + * before stopping the journal, flushing all journal pins isn't + * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree + * interior updates have to drop their journal pin before they're + * fully complete: + */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + + bch2_fs_journal_stop(&c->journal); + + /* + * the journal kicks off btree writes via reclaim - wait for in flight + * writes after stopping journal: + */ + if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + bch2_btree_flush_all_writes(c); + else + bch2_btree_verify_flushed(c); + + /* + * After stopping journal: + */ + for_each_member_device(ca, c, i) + bch2_dev_allocator_remove(c, ca); +} + +static void bch2_writes_disabled(struct percpu_ref *writes) +{ + struct bch_fs *c = container_of(writes, struct bch_fs, writes); + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch_read_only_wait); +} + +void bch2_fs_read_only(struct bch_fs *c) +{ + if (c->state == BCH_FS_RO) + return; + + BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + * + * (This is really blocking new _allocations_, writes to previously + * allocated space can still happen until stopping the allocator in + * bch2_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); + + cancel_delayed_work(&c->pd_controllers_update); + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch2_fs_read_only(c); + + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + bch2_fs_mark_clean(c, true); + + if (c->state != BCH_FS_STOPPING) + c->state = BCH_FS_RO; +} + +static void bch2_fs_read_only_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + + mutex_lock(&c->state_lock); + bch2_fs_read_only(c); + mutex_unlock(&c->state_lock); +} + +static void bch2_fs_read_only_async(struct bch_fs *c) +{ + queue_work(system_long_wq, &c->read_only_work); +} + +bool bch2_fs_emergency_read_only(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + + bch2_fs_read_only_async(c); + bch2_journal_halt(&c->journal); + + wake_up(&bch_read_only_wait); + return ret; +} + +const char *bch2_fs_read_write(struct bch_fs *c) +{ + struct bch_dev *ca; + const char *err = NULL; + unsigned i; + + if (c->state == BCH_FS_RW) + return NULL; + + bch2_fs_mark_clean(c, false); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + err = "error starting allocator thread"; + for_each_rw_member(ca, c, i) + if (bch2_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + err = "error starting btree GC thread"; + if (bch2_gc_thread_start(c)) + goto err; + + err = "error starting copygc thread"; + for_each_rw_member(ca, c, i) + if (bch2_copygc_start(c, ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + err = "error starting rebalance thread"; + if (bch2_rebalance_start(c)) + goto err; + + schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + return NULL; +err: + __bch2_fs_read_only(c); + return err; +} + +/* Filesystem startup/shutdown: */ + +static void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); + + bch2_fs_quota_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); + percpu_free_rwsem(&c->usage_lock); + free_percpu(c->usage_percpu); + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); + mempool_exit(&c->btree_interior_update_pool); + mempool_exit(&c->btree_reserve_pool); + mempool_exit(&c->fill_iter); + percpu_ref_exit(&c->writes); + kfree(rcu_dereference_protected(c->replicas, 1)); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + + if 
(c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->wq) + destroy_workqueue(c->wq); + + free_pages((unsigned long) c->disk_sb.sb, + c->disk_sb.page_order); + kvpfree(c, sizeof(*c)); + module_put(THIS_MODULE); +} + +static void bch2_fs_release(struct kobject *kobj) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + bch2_fs_free(c); +} + +void bch2_fs_stop(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + bch2_fs_debug_exit(c); + bch2_fs_chardev_exit(c); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + + mutex_lock(&c->state_lock); + bch2_fs_read_only(c); + mutex_unlock(&c->state_lock); + + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); + + for_each_member_device(ca, c, i) + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->btree_write_error_work); + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) + bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + + kobject_put(&c->kobj); +} + +static const char *bch2_fs_online(struct bch_fs *c) +{ + struct bch_dev *ca; + const char *err = NULL; + unsigned i; + int ret; + + lockdep_assert_held(&bch_fs_list_lock); + + if (!list_empty(&c->list)) + return NULL; + + if (__bch2_uuid_to_fs(c->sb.uuid)) + return "filesystem UUID already open"; + + ret = bch2_fs_chardev_init(c); + if (ret) + return "error creating character device"; + + bch2_fs_debug_init(c); + + if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || + kobject_add(&c->internal, &c->kobj, "internal") || + kobject_add(&c->opts_dir, &c->kobj, "options") || + kobject_add(&c->time_stats, &c->kobj, "time_stats") || + bch2_opts_create_sysfs_files(&c->opts_dir)) + return "error creating sysfs objects"; + + mutex_lock(&c->state_lock); + + err = "error creating sysfs objects"; + __for_each_member_device(ca, c, i, NULL) + if (bch2_dev_sysfs_online(c, ca)) + goto err; + + list_add(&c->list, &bch_fs_list); + err = NULL; +err: + mutex_unlock(&c->state_lock); + return err; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +{ + struct bch_sb_field_members *mi; + struct bch_fs *c; + unsigned i, iter_size; + const char *err; + + pr_verbose_init(opts, ""); + + c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + if (!c) + goto out; + + __module_get(THIS_MODULE); + + c->minor = -1; + c->disk_sb.fs_sb = true; + + mutex_init(&c->state_lock); + mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); + mutex_init(&c->btree_root_lock); + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + + bch2_fs_allocator_init(c); + bch2_fs_rebalance_init(c); + bch2_fs_quota_init(c); + + INIT_LIST_HEAD(&c->list); + + INIT_LIST_HEAD(&c->btree_interior_update_list); + mutex_init(&c->btree_reserve_cache_lock); + mutex_init(&c->btree_interior_update_lock); + + mutex_init(&c->bio_bounce_pages_lock); + + bio_list_init(&c->btree_write_error_list); + 
spin_lock_init(&c->btree_write_error_lock); + INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); + + INIT_LIST_HEAD(&c->fsck_errors); + mutex_init(&c->fsck_error_lock); + + seqcount_init(&c->gc_pos_lock); + + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; + c->promote_whole_extents = true; + + c->journal.write_time = &c->times[BCH_TIME_journal_write]; + c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; + c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + + mutex_lock(&c->sb_lock); + + if (bch2_sb_to_fs(c, sb)) { + mutex_unlock(&c->sb_lock); + goto err; + } + + mutex_unlock(&c->sb_lock); + + scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + + c->opts = bch2_opts_default; + bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); + bch2_opts_apply(&c->opts, opts); + + c->block_bits = ilog2(c->opts.block_size); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + c->opts.nochanges |= c->opts.noreplay; + c->opts.read_only |= c->opts.nochanges; + + if (bch2_fs_init_fault("fs_alloc")) + goto err; + + iter_size = sizeof(struct btree_node_iter_large) + + (btree_blocks(c) + 1) * 2 * + sizeof(struct btree_node_iter_set); + + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + !(c->copygc_wq = alloc_workqueue("bcache_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || + mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, + sizeof(struct btree_reserve)) || + mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update)) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + percpu_init_rwsem(&c->usage_lock) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + bch2_io_clock_init(&c->io_clock[READ]) || + bch2_io_clock_init(&c->io_clock[WRITE]) || + bch2_fs_journal_init(&c->journal) || + bch2_fs_btree_cache_init(c) || + bch2_fs_io_init(c) || + bch2_fs_encryption_init(c) || + bch2_fs_compress_init(c) || + bch2_fs_fsio_init(c)) + goto err; + + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + bch2_dev_alloc(c, i)) + goto err; + + /* + * Now that all allocations have succeeded, init various refcounty + * things that let us shutdown: + */ + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + + mutex_lock(&bch_fs_list_lock); + err = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); + if (err) { + bch_err(c, "bch2_fs_online() error: %s", err); + goto err; + } +out: + pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); + return c; +err: + bch2_fs_free(c); + c = NULL; + goto out; +} + +const char *bch2_fs_start(struct bch_fs *c) +{ + const char *err = "cannot allocate memory"; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + time64_t now = ktime_get_seconds(); + unsigned i; + int ret = -EINVAL; + + mutex_lock(&c->state_lock); + + BUG_ON(c->state != BCH_FS_STARTING); + + mutex_lock(&c->sb_lock); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + mi = bch2_sb_get_members(c->disk_sb.sb); + for_each_online_member(ca, c, i) + mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + + mutex_unlock(&c->sb_lock); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); + if (ret) + goto err; + + err = "dynamic fault"; + if (bch2_fs_init_fault("fs_start")) + goto err; + + if (c->opts.read_only) { + bch2_fs_read_only(c); + } else { + err = bch2_fs_read_write(c); + if (err) + goto err; + } + + set_bit(BCH_FS_STARTED, &c->flags); + + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; +err: + switch (ret) { + case BCH_FSCK_ERRORS_NOT_FIXED: + bch_err(c, "filesystem contains errors: please report this to the developers"); + pr_cont("mount with -o fix_errors to repair\n"); + err = "fsck error"; + break; + case BCH_FSCK_REPAIR_UNIMPLEMENTED: + bch_err(c, "filesystem contains errors: please report this to the developers"); + pr_cont("repair unimplemented: inform the developers so that it can be added\n"); + err = "fsck error"; + break; + case BCH_FSCK_REPAIR_IMPOSSIBLE: + bch_err(c, "filesystem contains errors, but repair impossible"); + err = "fsck error"; + break; + case BCH_FSCK_UNKNOWN_VERSION: + err = "unknown metadata version";; + break; + case -ENOMEM: + err = "cannot allocate memory"; + break; + case -EIO: + err = "IO error"; + break; + } + + BUG_ON(!err); + set_bit(BCH_FS_ERROR, &c->flags); + goto out; +} + +static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +{ + struct bch_sb_field_members *sb_mi; + + sb_mi = bch2_sb_get_members(sb); + if (!sb_mi) + return "Invalid superblock: member info area missing"; + + if (le16_to_cpu(sb->block_size) != c->opts.block_size) + return "mismatched block size"; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) + return "new cache bucket size is too small"; + + return NULL; +} + +static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +{ + struct bch_sb *newest = + le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(newest); + + if (!uuid_equal(&fs->uuid, &sb->uuid)) + return "device not a member of filesystem"; + + if (!bch2_dev_exists(newest, mi, sb->dev_idx)) + return "device has been removed"; + + if (fs->block_size != sb->block_size) + return "mismatched block size"; + + return NULL; +} + +/* Device startup/shutdown: */ + +static void bch2_dev_release(struct kobject *kobj) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + + kfree(ca); +} + +static void bch2_dev_free(struct bch_dev *ca) +{ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); + + free_percpu(ca->io_done); + bioset_exit(&ca->replica_set); + bch2_dev_buckets_free(ca); + + bch2_time_stats_exit(&ca->io_latency[WRITE]); + bch2_time_stats_exit(&ca->io_latency[READ]); + + percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->ref); + kobject_put(&ca->kobj); +} + +static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) +{ + + lockdep_assert_held(&c->state_lock); + + if (percpu_ref_is_zero(&ca->io_ref)) + return; + + __bch2_dev_read_only(c, ca); + + reinit_completion(&ca->io_ref_completion); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->io_ref_completion); + + if (ca->kobj.state_in_sysfs) { + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); +} + +static void bch2_dev_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); + + complete(&ca->ref_completion); +} + +static void bch2_dev_io_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + + complete(&ca->io_ref_completion); +} + +static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) +{ + int ret; + + if (!c->kobj.state_in_sysfs) + return 0; + + if (!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &c->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } + + if (ca->disk_sb.bdev) { + struct kobject *block = bdev_kobj(ca->disk_sb.bdev); + + ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); + if (ret) + return ret; + + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } + + return 0; +} + +static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + struct bch_member *member) +{ + struct bch_dev *ca; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return NULL; + + kobject_init(&ca->kobj, &bch2_dev_ktype); + init_completion(&ca->ref_completion); + init_completion(&ca->io_ref_completion); + + init_rwsem(&ca->bucket_lock); + + writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); + + spin_lock_init(&ca->freelist_lock); + bch2_dev_copygc_init(ca); + + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + + bch2_time_stats_init(&ca->io_latency[READ]); + bch2_time_stats_init(&ca->io_latency[WRITE]); + + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + + if (opt_defined(c->opts, discard)) + ca->mi.discard = opt_get(c->opts, discard); + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + bch2_dev_buckets_alloc(c, ca) || + 
bioset_init(&ca->replica_set, 4, + offsetof(struct bch_write_bio, bio), 0) || + !(ca->io_done = alloc_percpu(*ca->io_done))) + goto err; + + return ca; +err: + bch2_dev_free(ca); + return NULL; +} + +static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, + unsigned dev_idx) +{ + ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch2_dev_sysfs_online(c, ca)) + pr_warn("error creating sysfs objects"); +} + +static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +{ + struct bch_member *member = + bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; + struct bch_dev *ca = NULL; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + if (bch2_fs_init_fault("dev_alloc")) + goto err; + + ca = __bch2_dev_alloc(c, member); + if (!ca) + goto err; + + bch2_dev_attach(c, ca, dev_idx); +out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +err: + if (ca) + bch2_dev_free(ca); + ret = -ENOMEM; + goto out; +} + +static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) +{ + unsigned ret; + + if (bch2_dev_is_online(ca)) { + bch_err(ca, "already have device online in slot %u", + sb->sb->dev_idx); + return -EINVAL; + } + + if (get_capacity(sb->bdev->bd_disk) < + ca->mi.bucket_size * ca->mi.nbuckets) { + bch_err(ca, "cannot online: device too small"); + return -EINVAL; + } + + BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + + if (get_capacity(sb->bdev->bd_disk) < + ca->mi.bucket_size * ca->mi.nbuckets) { + bch_err(ca, "device too small"); + return -EINVAL; + } + + ret = bch2_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + + /* Commit: */ + ca->disk_sb = *sb; + memset(sb, 0, sizeof(*sb)); + + if (ca->fs) + mutex_lock(&ca->fs->sb_lock); + + bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + + if (ca->fs) + mutex_unlock(&ca->fs->sb_lock); + + percpu_ref_reinit(&ca->io_ref); + + return 0; +} + +static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->state_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb.sb->seq)) + bch2_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = bch_dev_locked(c, sb->sb->dev_idx); + + ret = __bch2_dev_attach_bdev(ca, sb); + if (ret) + return ret; + + bch2_dev_sysfs_online(c, ca); + + if (c->sb.nr_devices == 1) + snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); + snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + + rebalance_wakeup(c); + return 0; +} + +/* Device management: */ + +/* + * Note: this function is also used by the error paths - when a particular + * device sees an error, we call it to determine whether we can just set the + * device RO, or - if this function returns false - we'll set the whole + * filesystem RO: + * + * XXX: maybe we should be more explicit about whether we're changing state + * because we got an error or what have you? 
+ */ +bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_devs_mask new_online_devs; + struct replicas_status s; + struct bch_dev *ca2; + int i, nr_rw = 0, required; + + lockdep_assert_held(&c->state_lock); + + switch (new_state) { + case BCH_MEMBER_STATE_RW: + return true; + case BCH_MEMBER_STATE_RO: + if (ca->mi.state != BCH_MEMBER_STATE_RW) + return true; + + /* do we have enough devices to write to? */ + for_each_member_device(ca2, c, i) + if (ca2 != ca) + nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; + + required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) + ? c->opts.metadata_replicas + : c->opts.metadata_replicas_required, + !(flags & BCH_FORCE_IF_DATA_DEGRADED) + ? c->opts.data_replicas + : c->opts.data_replicas_required); + + return nr_rw >= required; + case BCH_MEMBER_STATE_FAILED: + case BCH_MEMBER_STATE_SPARE: + if (ca->mi.state != BCH_MEMBER_STATE_RW && + ca->mi.state != BCH_MEMBER_STATE_RO) + return true; + + /* do we have enough devices to read from? */ + new_online_devs = bch2_online_devs(c); + __clear_bit(ca->dev_idx, new_online_devs.d); + + s = __bch2_replicas_status(c, new_online_devs); + + return bch2_have_enough_devs(s, flags); + default: + BUG(); + } +} + +static bool bch2_fs_may_start(struct bch_fs *c) +{ + struct replicas_status s; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + unsigned i, flags = c->opts.degraded + ? BCH_FORCE_IF_DEGRADED + : 0; + + if (!c->opts.degraded) { + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) + continue; + + ca = bch_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_RW || + ca->mi.state == BCH_MEMBER_STATE_RO)) { + mutex_unlock(&c->sb_lock); + return false; + } + } + mutex_unlock(&c->sb_lock); + } + + s = bch2_replicas_status(c); + + return bch2_have_enough_devs(s, flags); +} + +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + bch2_copygc_stop(ca); + + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_stop(ca); + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); +} + +static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + if (bch2_dev_allocator_start(ca)) + return "error starting allocator thread"; + + if (bch2_copygc_start(c, ca)) + return "error starting copygc thread"; + + return NULL; +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_sb_field_members *mi; + int ret = 0; + + if (ca->mi.state == new_state) + return 0; + + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) + return -EINVAL; + + if (new_state != BCH_MEMBER_STATE_RW) + __bch2_dev_read_only(c, ca); + + bch_notice(ca, "%s", bch2_dev_state[new_state]); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_RW && + __bch2_dev_read_write(c, ca)) + ret = -ENOMEM; + + rebalance_wakeup(c); + + return ret; +} + +int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state 
new_state, int flags) +{ + int ret; + + mutex_lock(&c->state_lock); + ret = __bch2_dev_set_state(c, ca, new_state, flags); + mutex_unlock(&c->state_lock); + + return ret; +} + +/* Device add/removal: */ + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + struct bch_sb_field_members *mi; + unsigned dev_idx = ca->dev_idx, data; + int ret = -EINVAL; + + mutex_lock(&c->state_lock); + + percpu_ref_put(&ca->ref); /* XXX */ + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot remove without losing data"); + goto err; + } + + __bch2_dev_read_only(c, ca); + + /* + * XXX: verify that dev_idx is really not in use anymore, anywhere + * + * flag_data_bad() does not check btree pointers + */ + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { + bch_err(ca, "Remove failed: error %i dropping data", ret); + goto err; + } + + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + if (ret) { + bch_err(ca, "Remove failed: error %i flushing journal", ret); + goto err; + } + + data = bch2_dev_has_data(c, ca); + if (data) { + char data_has_str[100]; + bch2_scnprint_flag_list(data_has_str, + sizeof(data_has_str), + bch2_data_types, + data); + bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + ret = -EBUSY; + goto err; + } + + ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), + ZERO_VERSION, + NULL, NULL, NULL); + if (ret) { + bch_err(ca, "Remove failed, error deleting alloc info"); + goto err; + } + + /* + * must flush all existing journal entries, they might have + * (overwritten) keys that point to the device we're removing: + */ + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_error(&c->journal); + if (ret) { + bch_err(ca, "Remove failed, journal error"); + goto err; + } + + __bch2_dev_offline(c, ca); + + mutex_lock(&c->sb_lock); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + mutex_unlock(&c->sb_lock); + + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->ref_completion); + + bch2_dev_free(ca); + + /* + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); + return 0; +err: + if (ca->mi.state == BCH_MEMBER_STATE_RW) + __bch2_dev_read_write(c, ca); + mutex_unlock(&c->state_lock); + return ret; +} + +/* Add new device to running filesystem: */ +int bch2_dev_add(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb; + const char *err; + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; + unsigned dev_idx, nr_devices, u64s; + int ret; + + ret = bch2_read_super(path, &opts, &sb); + if (ret) + return ret; + + err = bch2_sb_validate(&sb); + if (err) + return -EINVAL; + + dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + + err = bch2_dev_may_add(sb.sb, c); + if (err) + return -EINVAL; + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); + return -ENOMEM; + } + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); + return ret; + } + + err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); + if (ret) + goto err; + + mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); + + err = "insufficient space in 
new superblock"; + ret = bch2_sb_from_fs(c, ca); + if (ret) + goto err_unlock; + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { + ret = -ENOSPC; + goto err_unlock; + } + + if (dynamic_fault("bcachefs:add:no_slot")) + goto no_slot; + + mi = bch2_sb_get_members(c->disk_sb.sb); + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + goto have_slot; +no_slot: + err = "no slots available in superblock"; + ret = -ENOSPC; + goto err_unlock; + +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + + err = "no space in superblock for member info"; + ret = -ENOSPC; + + mi = bch2_sb_resize_members(&c->disk_sb, u64s); + if (!mi) + goto err_unlock; + + /* success: */ + + mi->members[dev_idx] = dev_mi; + mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds()); + c->disk_sb.sb->nr_devices = nr_devices; + + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); + if (err) + goto err_late; + } + + mutex_unlock(&c->state_lock); + return 0; + +err_unlock: + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); +err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); + bch_err(c, "Unable to add device: %s", err); + return ret; +err_late: + bch_err(c, "Error going rw after adding device: %s", err); + return -EINVAL; +} + +/* Hot add existing device to running filesystem: */ +int bch2_dev_online(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb = { NULL }; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + unsigned dev_idx; + const char *err; + int ret; + + mutex_lock(&c->state_lock); + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + mutex_unlock(&c->state_lock); + return ret; + } + + dev_idx = sb.sb->dev_idx; + + err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + if (err) + goto err; + + if (bch2_dev_attach_bdev(c, &sb)) { + err = "bch2_dev_attach_bdev() error"; + goto err; + } + + ca = bch_dev_locked(c, dev_idx); + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); + if (err) + goto err; + } + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + mi->members[ca->dev_idx].last_mount = + cpu_to_le64(ktime_get_seconds()); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + mutex_unlock(&c->state_lock); + return 0; +err: + mutex_unlock(&c->state_lock); + bch2_free_super(&sb); + bch_err(c, "error bringing %s online: %s", path, err); + return -EINVAL; +} + +int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + mutex_lock(&c->state_lock); + + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); + mutex_unlock(&c->state_lock); + return 0; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot offline required disk"); + mutex_unlock(&c->state_lock); + return -EINVAL; + } + + __bch2_dev_offline(c, ca); + + mutex_unlock(&c->state_lock); + return 0; +} + +int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bch_member *mi; + int ret = 0; + + mutex_lock(&c->state_lock); + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot 
shrink yet"); + ret = -EINVAL; + goto err; + } + + if (bch2_dev_is_online(ca) && + get_capacity(ca->disk_sb.bdev->bd_disk) < + ca->mi.bucket_size * nbuckets) { + bch_err(ca, "New size larger than device"); + ret = -EINVAL; + goto err; + } + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { + bch_err(ca, "Resize error: %i", ret); + goto err; + } + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi->nbuckets = cpu_to_le64(nbuckets); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch2_recalc_capacity(c); +err: + mutex_unlock(&c->state_lock); + return ret; +} + +/* return with ref on ca->ref: */ +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) +{ + + struct bch_dev *ca; + dev_t dev; + unsigned i; + int ret; + + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); + + for_each_member_device(ca, c, i) + if (ca->disk_sb.bdev->bd_dev == dev) + goto found; + + ca = ERR_PTR(-ENOENT); +found: + return ca; +} + +/* Filesystem open: */ + +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) +{ + struct bch_sb_handle *sb = NULL; + struct bch_fs *c = NULL; + unsigned i, best_sb = 0; + const char *err; + int ret = -ENOMEM; + + pr_verbose_init(opts, ""); + + if (!nr_devices) { + c = ERR_PTR(-EINVAL); + goto out2; + } + + if (!try_module_get(THIS_MODULE)) { + c = ERR_PTR(-ENODEV); + goto out2; + } + + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); + if (!sb) + goto err; + + for (i = 0; i < nr_devices; i++) { + ret = bch2_read_super(devices[i], &opts, &sb[i]); + if (ret) + goto err; + + err = bch2_sb_validate(&sb[i]); + if (err) + goto err_print; + } + + for (i = 1; i < nr_devices; i++) + if (le64_to_cpu(sb[i].sb->seq) > + le64_to_cpu(sb[best_sb].sb->seq)) + best_sb = i; + + for (i = 0; i < nr_devices; i++) { + err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (err) + goto err_print; + } + + ret = -ENOMEM; + c = bch2_fs_alloc(sb[best_sb].sb, opts); + if (!c) + goto err; + + err = "bch2_dev_online() error"; + mutex_lock(&c->state_lock); + for (i = 0; i < nr_devices; i++) + if (bch2_dev_attach_bdev(c, &sb[i])) { + mutex_unlock(&c->state_lock); + goto err_print; + } + mutex_unlock(&c->state_lock); + + err = "insufficient devices"; + if (!bch2_fs_may_start(c)) + goto err_print; + + if (!c->opts.nostart) { + err = bch2_fs_start(c); + if (err) + goto err_print; + } +out: + kfree(sb); + module_put(THIS_MODULE); +out2: + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", + devices[0], err); + ret = -EINVAL; +err: + if (c) + bch2_fs_stop(c); + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); + c = ERR_PTR(ret); + goto out; +} + +static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, + struct bch_opts opts) +{ + const char *err; + struct bch_fs *c; + bool allocated_fs = false; + + err = bch2_sb_validate(sb); + if (err) + return err; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(sb->sb->uuid); + if (c) { + closure_get(&c->cl); + + err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); + if (err) + goto err; + } else { + c = bch2_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; + if (!c) + goto err; + + allocated_fs = true; + } + + err = "bch2_dev_online() error"; + + mutex_lock(&c->sb_lock); + if (bch2_dev_attach_bdev(c, sb)) { + mutex_unlock(&c->sb_lock); + goto err; + } + mutex_unlock(&c->sb_lock); + + if (!c->opts.nostart && bch2_fs_may_start(c)) { + err = 
bch2_fs_start(c); + if (err) + goto err; + } + + closure_put(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return NULL; +err: + mutex_unlock(&bch_fs_list_lock); + + if (allocated_fs) + bch2_fs_stop(c); + else if (c) + closure_put(&c->cl); + + return err; +} + +const char *bch2_fs_open_incremental(const char *path) +{ + struct bch_sb_handle sb; + struct bch_opts opts = bch2_opts_empty(); + const char *err; + + if (bch2_read_super(path, &opts, &sb)) + return "error reading superblock"; + + err = __bch2_fs_open_incremental(&sb, opts); + bch2_free_super(&sb); + + return err; +} + +/* Global interfaces/init */ + +static void bcachefs_exit(void) +{ + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); +} + +static int __init bcachefs_init(void) +{ + bch2_bkey_pack_test(); + bch2_inode_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) + goto err; + + return 0; +err: + bcachefs_exit(); + return -ENOMEM; +} + +#define BCH_DEBUG_PARAM(name, description) \ + bool bch2_##name; \ + module_param_named(name, bch2_##name, bool, 0644); \ + MODULE_PARM_DESC(name, description); +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +unsigned bch2_metadata_version = BCH_SB_VERSION_MAX; +module_param_named(version, bch2_metadata_version, uint, 0400); + +module_exit(bcachefs_exit); +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 index 000000000000..3f730164ca69 --- /dev/null +++ b/fs/bcachefs/super.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H + +#include "extents.h" + +#include "bcachefs_ioctl.h" + +#include + +static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +{ + return div_u64(s, ca->mi.bucket_size); +} + +static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) +{ + return ((sector_t) b) * ca->mi.bucket_size; +} + +static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) +{ + u32 remainder; + + div_u64_rem(s, ca->mi.bucket_size, &remainder); + return remainder; +} + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_FAILED; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_RW || + (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + BUG_ON(bch2_dev_list_has_dev(*devs, dev)); + BUG_ON(devs->nr >= BCH_REPLICAS_MAX); + devs->devs[devs->nr++] 
= dev; +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define __for_each_member_device(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + __for_each_member_device(ca, c, iter, mask) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs *bch2_uuid_to_fs(__uuid_t); + +bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); + +int bch2_dev_fail(struct bch_dev *, int); +int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_add(struct bch_fs *, const char *); +int bch2_dev_online(struct bch_fs 
*, const char *); +int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); +struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); + +bool bch2_fs_emergency_read_only(struct bch_fs *); +void bch2_fs_read_only(struct bch_fs *); +const char *bch2_fs_read_write(struct bch_fs *); + +void bch2_fs_stop(struct bch_fs *); + +const char *bch2_fs_start(struct bch_fs *); +struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +const char *bch2_fs_open_incremental(const char *path); + +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 index 000000000000..4d8265bb3154 --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H + +struct bch_sb_handle { + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; + void *holder; + unsigned page_order; + fmode_t mode; + unsigned have_layout:1; + unsigned have_bio:1; + unsigned fs_sb:1; +}; + +struct bch_devs_mask { + unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; +}; + +struct bch_devs_list { + u8 nr; + u8 devs[BCH_REPLICAS_MAX + 1]; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 replacement; + u8 discard; + u8 data_allowed; + u8 durability; + u8 valid; +}; + +struct bch_replicas_cpu_entry { + u8 data_type; + u8 devs[BCH_SB_MEMBERS_MAX / 8]; +}; + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_cpu_entry entries[]; +}; + +struct bch_disk_group_cpu { + bool deleted; + u16 parent; + struct bch_devs_mask devs; +}; + +struct bch_disk_groups_cpu { + struct rcu_head rcu; + unsigned nr; + struct bch_disk_group_cpu entries[]; +}; + +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 index 000000000000..430dcbcb6e8a --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,1027 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#ifndef NO_BCACHEFS_SYSFS + +#include "bcachefs.h" +#include "alloc.h" +#include "compress.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "disk_groups.h" +#include "inode.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "opts.h" +#include "rebalance.h" +#include "replicas.h" +#include "super-io.h" +#include "tests.h" + +#include +#include +#include + +#include "util.h" + +#define SYSFS_OPS(type) \ +struct sysfs_ops type ## _sysfs_ops = { \ + .show = type ## _show, \ + .store = type ## _store \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) +#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) +#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) + +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ + return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprint(buf, PAGE_SIZE, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) { \ + ssize_t ret = bch2_hprint(buf, val); \ + strcat(buf, "\n"); \ + return ret + 1; \ + } \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoul_restrict_or_return(cp, min, max) \ +({ \ + unsigned long __v = 0; \ + int _r = strtoul_safe_restrict(cp, __v, min, max); \ + if (_r) \ + return _r; \ + __v; \ +}) + +#define strtoi_h_or_return(cp) \ +({ \ + u64 _v; \ + int _r = strtoi_h(cp, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +write_attribute(trigger_journal_flush); +write_attribute(trigger_btree_coalesce); +write_attribute(trigger_gc); +write_attribute(prune_cache); +rw_attribute(btree_gc_periodic); + +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); +read_attribute(block_size); +read_attribute(btree_node_size); +read_attribute(first_bucket); +read_attribute(nbuckets); +read_attribute(durability); +read_attribute(iodone); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); +read_attribute(io_latency_stats_read); +read_attribute(io_latency_stats_write); +read_attribute(congested); + +read_attribute(bucket_quantiles_last_read); +read_attribute(bucket_quantiles_last_write); 
+read_attribute(bucket_quantiles_fragmentation); +read_attribute(bucket_quantiles_oldest_gen); + +read_attribute(reserve_stats); +read_attribute(btree_cache_size); +read_attribute(compression_stats); +read_attribute(journal_debug); +read_attribute(journal_pins); +read_attribute(btree_updates); +read_attribute(dirty_btree_nodes); + +read_attribute(internal_uuid); + +read_attribute(has_data); +read_attribute(alloc_debug); +write_attribute(wake_allocator); + +read_attribute(read_realloc_races); +read_attribute(extent_migrate_done); +read_attribute(extent_migrate_raced); + +rw_attribute(journal_write_delay_ms); +rw_attribute(journal_reclaim_delay_ms); + +rw_attribute(discard); +rw_attribute(cache_replacement_policy); +rw_attribute(label); + +rw_attribute(copy_gc_enabled); +sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(rebalance_enabled); +sysfs_pd_controller_attribute(rebalance); +read_attribute(rebalance_work); +rw_attribute(promote_whole_extents); + +rw_attribute(pd_controllers_update_seconds); + +read_attribute(meta_replicas_have); +read_attribute(data_replicas_have); + +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + +#define BCH_DEBUG_PARAM(name, description) \ + rw_attribute(name); + + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = S_IRUGO }; + BCH_TIME_STATS() +#undef x + +static struct attribute sysfs_state_rw = { + .name = "state", + .mode = S_IRUGO +}; + +static size_t bch2_btree_cache_size(struct bch_fs *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->btree_cache.lock); + list_for_each_entry(b, &c->btree_cache.live, list) + ret += btree_bytes(c); + + mutex_unlock(&c->btree_cache.lock); + return ret; +} + +static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) +{ + struct bch_fs_usage stats = bch2_fs_usage_read(c); + + return scnprintf(buf, PAGE_SIZE, + "capacity:\t\t%llu\n" + "1 replicas:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\treserved:\t%llu\n" + "2 replicas:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\treserved:\t%llu\n" + "3 replicas:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\treserved:\t%llu\n" + "4 replicas:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\treserved:\t%llu\n" + "online reserved:\t%llu\n", + c->capacity, + stats.s[0].data[S_META], + stats.s[0].data[S_DIRTY], + stats.s[0].persistent_reserved, + stats.s[1].data[S_META], + stats.s[1].data[S_DIRTY], + stats.s[1].persistent_reserved, + stats.s[2].data[S_META], + stats.s[2].data[S_DIRTY], + stats.s[2].persistent_reserved, + stats.s[3].data[S_META], + stats.s[3].data[S_DIRTY], + stats.s[3].persistent_reserved, + stats.online_reserved); +} + +static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + nr_compressed_extents = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + + if (!bch2_fs_running(c)) + return -EPERM; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) + if (k.k->type == BCH_EXTENT) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) { + if (crc.compression_type == BCH_COMPRESSION_NONE) { + nr_uncompressed_extents++; + uncompressed_sectors += e.k->size; + } else { + nr_compressed_extents++; + 
compressed_sectors_compressed += + crc.compressed_size; + compressed_sectors_uncompressed += + crc.uncompressed_size; + } + + /* only looking at the first ptr */ + break; + } + } + bch2_btree_iter_unlock(&iter); + + return scnprintf(buf, PAGE_SIZE, + "uncompressed data:\n" + " nr extents: %llu\n" + " size (bytes): %llu\n" + "compressed data:\n" + " nr extents: %llu\n" + " compressed size (bytes): %llu\n" + " uncompressed size (bytes): %llu\n", + nr_uncompressed_extents, + uncompressed_sectors << 9, + nr_compressed_extents, + compressed_sectors_compressed << 9, + compressed_sectors_uncompressed << 9); +} + +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); + sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); + + sysfs_print(block_size, block_bytes(c)); + sysfs_print(btree_node_size, btree_bytes(c)); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + + sysfs_print(read_realloc_races, + atomic_long_read(&c->read_realloc_races)); + sysfs_print(extent_migrate_done, + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + sysfs_print(pd_controllers_update_seconds, + c->pd_controllers_update_seconds); + + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + + if (attr == &sysfs_rebalance_work) + return bch2_rebalance_work_show(c, buf); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); + sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); + + /* Debugging: */ + + if (attr == &sysfs_alloc_debug) + return show_fs_alloc_debug(c, buf); + + if (attr == &sysfs_journal_debug) + return bch2_journal_print_debug(&c->journal, buf); + + if (attr == &sysfs_journal_pins) + return bch2_journal_print_pins(&c->journal, buf); + + if (attr == &sysfs_btree_updates) + return bch2_btree_updates_print(c, buf); + + if (attr == &sysfs_dirty_btree_nodes) + return bch2_dirty_btree_nodes_print(c, buf); + + if (attr == &sysfs_compression_stats) + return bch2_compression_stats(c, buf); + +#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + return 0; +} + +STORE(__bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); + sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); + + if (attr == &sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; + + wake_up_process(c->gc_thread); + return ret; + } + + if (attr == &sysfs_copy_gc_enabled) { + struct bch_dev *ca; + unsigned i; + ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) + ?: (ssize_t) size; + + for_each_member_device(ca, c, i) + if (ca->copygc_thread) + wake_up_process(ca->copygc_thread); + return ret; + } + + if (attr == &sysfs_rebalance_enabled) { + ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) + ?: (ssize_t) size; + + rebalance_wakeup(c); + return ret; + } + + sysfs_strtoul(pd_controllers_update_seconds, + c->pd_controllers_update_seconds); + 
sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + +#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + if (!bch2_fs_running(c)) + return -EPERM; + + /* Debugging: */ + + if (attr == &sysfs_trigger_journal_flush) + bch2_journal_meta_async(&c->journal, NULL); + + if (attr == &sysfs_trigger_btree_coalesce) + bch2_coalesce(c); + + if (attr == &sysfs_trigger_gc) + bch2_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; + char *test = strsep(&p, " \t\n"); + char *nr_str = strsep(&p, " \t\n"); + char *threads_str = strsep(&p, " \t\n"); + unsigned threads; + u64 nr; + int ret = -EINVAL; + + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) + bch2_btree_perf_test(c, test, nr, threads); + else + size = ret; + kfree(tmp); + } +#endif + return size; +} + +STORE(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + mutex_lock(&c->state_lock); + size = __bch2_fs_store(kobj, attr, buf, size); + mutex_unlock(&c->state_lock); + + return size; +} +SYSFS_OPS(bch2_fs); + +struct attribute *bch2_fs_files[] = { + &sysfs_minor, + &sysfs_block_size, + &sysfs_btree_node_size, + &sysfs_btree_cache_size, + + &sysfs_meta_replicas_have, + &sysfs_data_replicas_have, + + &sysfs_journal_write_delay_ms, + &sysfs_journal_reclaim_delay_ms, + + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, + +#ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +#endif + NULL +}; + +/* internal dir - just a wrapper */ + +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_show(&c->kobj, attr, buf); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_store(&c->kobj, attr, buf, size); +} +SYSFS_OPS(bch2_fs_internal); + +struct attribute *bch2_fs_internal_files[] = { + &sysfs_alloc_debug, + &sysfs_journal_debug, + &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, + + &sysfs_trigger_journal_flush, + &sysfs_trigger_btree_coalesce, + &sysfs_trigger_gc, + &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, + + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + + &sysfs_internal_uuid, + +#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + NULL +}; + +/* options */ + +SHOW(bch2_fs_opts_dir) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + + out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST); + out += scnprintf(out, end - out, "\n"); + + return out - buf; +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int ret, id = opt - 
bch2_opt_table; + char *tmp; + u64 v; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_opt_parse(c, opt, strim(tmp), &v); + kfree(tmp); + + if (ret < 0) + return ret; + + if (id == Opt_compression || + id == Opt_background_compression) { + int ret = bch2_check_set_has_compressed_data(c, v); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } + } + + if (opt->set_sb != SET_NO_SB_OPT) { + mutex_lock(&c->sb_lock); + opt->set_sb(c->disk_sb.sb, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || + id == Opt_background_compression) && v) { + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } + + return size; +} +SYSFS_OPS(bch2_fs_opts_dir); + +struct attribute *bch2_fs_opts_dir_files[] = { NULL }; + +int bch2_opts_create_sysfs_files(struct kobject *kobj) +{ + const struct bch_option *i; + int ret; + + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { + if (i->mode == OPT_INTERNAL) + continue; + + ret = sysfs_create_file(kobj, &i->attr); + if (ret) + return ret; + } + + return 0; +} + +/* time stats */ + +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ + buf, PAGE_SIZE); + BCH_TIME_STATS() +#undef x + + return 0; +} + +STORE(bch2_fs_time_stats) +{ + return size; +} +SYSFS_OPS(bch2_fs_time_stats); + +struct attribute *bch2_fs_time_stats_files[] = { +#define x(name) \ + &sysfs_time_stat_##name, + BCH_TIME_STATS() +#undef x + NULL +}; + +typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, + size_t, void *); + +static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) +{ + int rw = (private ? 
1 : 0); + + return bucket_last_io(c, bucket(ca, b), rw); +} + +static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) +{ + struct bucket *g = bucket(ca, b); + return bucket_sectors_used(g->mark); +} + +static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) +{ + return bucket_gc_gen(ca, b); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((unsigned *) _l); + unsigned r = *((unsigned *) _r); + + return (l > r) - (l < r); +} + +static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, + char *buf, bucket_map_fn *fn, void *private) +{ + size_t i, n; + /* Compute 31 quantiles */ + unsigned q[31], *p; + ssize_t ret = 0; + + down_read(&ca->bucket_lock); + n = ca->mi.nbuckets; + + p = vzalloc(n * sizeof(unsigned)); + if (!p) { + up_read(&ca->bucket_lock); + return -ENOMEM; + } + + for (i = ca->mi.first_bucket; i < n; i++) + p[i] = fn(c, ca, i, private); + + sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); + up_read(&ca->bucket_lock); + + while (n && + !p[n - 1]) + --n; + + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; + + vfree(p); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + buf[ret - 1] = '\n'; + + return ret; +} + +static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) +{ + enum alloc_reserve i; + ssize_t ret; + + spin_lock(&ca->freelist_lock); + + ret = scnprintf(buf, PAGE_SIZE, + "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), + ca->free_inc.size); + + for (i = 0; i < RESERVE_NR; i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "free[%u]:\t%zu\t%zu\n", i, + fifo_used(&ca->free[i]), + ca->free[i].size); + + spin_unlock(&ca->freelist_lock); + + return ret; +} + +static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + + return scnprintf(buf, PAGE_SIZE, + "free_inc: %zu/%zu\n" + "free[RESERVE_BTREE]: %zu/%zu\n" + "free[RESERVE_MOVINGGC]: %zu/%zu\n" + "free[RESERVE_NONE]: %zu/%zu\n" + "buckets:\n" + " capacity: %llu\n" + " alloc: %llu\n" + " sb: %llu\n" + " journal: %llu\n" + " meta: %llu\n" + " user: %llu\n" + " cached: %llu\n" + " available: %llu\n" + "sectors:\n" + " sb: %llu\n" + " journal: %llu\n" + " meta: %llu\n" + " user: %llu\n" + " cached: %llu\n" + "freelist_wait: %s\n" + "open buckets: %u/%u (reserved %u)\n" + "open_buckets_wait: %s\n", + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_alloc, + stats.buckets[BCH_DATA_SB], + stats.buckets[BCH_DATA_JOURNAL], + stats.buckets[BCH_DATA_BTREE], + stats.buckets[BCH_DATA_USER], + stats.buckets[BCH_DATA_CACHED], + __dev_buckets_available(ca, stats), + stats.sectors[BCH_DATA_SB], + stats.sectors[BCH_DATA_JOURNAL], + stats.sectors[BCH_DATA_BTREE], + stats.sectors[BCH_DATA_USER], + stats.sectors[BCH_DATA_CACHED], + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, + c->open_buckets_wait.list.first ? 
"waiting" : "empty"); +} + +static const char * const bch2_rw[] = { + "read", + "write", + NULL +}; + +static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + int rw, i, cpu; + + for (rw = 0; rw < 2; rw++) { + out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]); + + for (i = 1; i < BCH_DATA_NR; i++) { + u64 n = 0; + + for_each_possible_cpu(cpu) + n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i]; + + out += scnprintf(out, end - out, "%-12s:%12llu\n", + bch2_data_types[i], n << 9); + } + } + + return out - buf; +} + +SHOW(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + char *out = buf, *end = buf + PAGE_SIZE; + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); + sysfs_print(block_size, block_bytes(c)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); + sysfs_print(discard, ca->mi.discard); + + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); + out += bch2_disk_path_print(&c->disk_sb, out, end - out, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } else { + out += scnprintf(out, end - out, "none"); + } + + out += scnprintf(out, end - out, "\n"); + return out - buf; + } + + if (attr == &sysfs_has_data) { + out += bch2_scnprint_flag_list(out, end - out, + bch2_data_types, + bch2_dev_has_data(c, ca)); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } + + sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); + + if (attr == &sysfs_cache_replacement_policy) { + out += bch2_scnprint_string_list(out, end - out, + bch2_cache_replacement_policies, + ca->mi.replacement); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } + + if (attr == &sysfs_state_rw) { + out += bch2_scnprint_string_list(out, end - out, + bch2_dev_state, + ca->mi.state); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } + + if (attr == &sysfs_iodone) + return show_dev_iodone(ca, buf); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + + if (attr == &sysfs_io_latency_stats_read) + return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_write) + return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_bucket_quantiles_last_read) + return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); + if (attr == &sysfs_bucket_quantiles_last_write) + return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); + if (attr == &sysfs_bucket_quantiles_fragmentation) + return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); + if (attr == &sysfs_bucket_quantiles_oldest_gen) + return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); + + if (attr == &sysfs_reserve_stats) + return show_reserve_stats(ca, buf); + if (attr == &sysfs_alloc_debug) + return show_dev_alloc_debug(ca, buf); + + return 0; +} + +STORE(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + struct bch_member *mi; + + sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = 
&bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DISCARD(mi)) { + SET_BCH_MEMBER_DISCARD(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_cache_replacement_policy) { + ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); + + if (v < 0) + return v; + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { + SET_BCH_MEMBER_REPLACEMENT(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_label) { + char *tmp; + int ret; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_dev_group_set(c, ca, strim(tmp)); + kfree(tmp); + if (ret) + return ret; + } + + if (attr == &sysfs_wake_allocator) + bch2_wake_allocator(ca); + + return size; +} +SYSFS_OPS(bch2_dev); + +struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, + + /* settings: */ + &sysfs_discard, + &sysfs_cache_replacement_policy, + &sysfs_state_rw, + &sysfs_label, + + &sysfs_has_data, + &sysfs_iodone, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, + &sysfs_congested, + + /* alloc info - other stats: */ + &sysfs_bucket_quantiles_last_read, + &sysfs_bucket_quantiles_last_write, + &sysfs_bucket_quantiles_fragmentation, + &sysfs_bucket_quantiles_oldest_gen, + + &sysfs_reserve_stats, + + /* debug: */ + &sysfs_alloc_debug, + &sysfs_wake_allocator, + + sysfs_pd_controller_files(copy_gc), + NULL +}; + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h new file mode 100644 index 000000000000..525fd05d91f7 --- /dev/null +++ b/fs/bcachefs/sysfs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SYSFS_H_ +#define _BCACHEFS_SYSFS_H_ + +#include + +#ifndef NO_BCACHEFS_SYSFS + +struct attribute; +struct sysfs_ops; + +extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_internal_files[]; +extern struct attribute *bch2_fs_opts_dir_files[]; +extern struct attribute *bch2_fs_time_stats_files[]; +extern struct attribute *bch2_dev_files[]; + +extern struct sysfs_ops bch2_fs_sysfs_ops; +extern struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern struct sysfs_ops bch2_dev_sysfs_ops; + +int bch2_opts_create_sysfs_files(struct kobject *); + +#else + +static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_internal_files[] = {}; +static struct attribute *bch2_fs_opts_dir_files[] = {}; +static struct attribute *bch2_fs_time_stats_files[] = {}; +static struct attribute *bch2_dev_files[] = {}; + +static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_internal_sysfs_ops; +static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +static const struct sysfs_ops bch2_dev_sysfs_ops; + +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } + +#endif /* NO_BCACHEFS_SYSFS */ + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 index 000000000000..c522fb795e63 --- /dev/null +++ b/fs/bcachefs/tests.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0 
+#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" +#include "tests.h" + +#include "linux/kthread.h" +#include "linux/random.h" + +static void delete_test_keys(struct bch_fs *c) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(0, 0), POS(0, U64_MAX), + ZERO_VERSION, NULL, NULL, NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS(0, 0), POS(0, U64_MAX), + ZERO_VERSION, NULL, NULL, NULL); + BUG_ON(ret); +} + +/* unit tests */ + +static void test_delete(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter); + BUG_ON(ret); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &k.k_i)); + BUG_ON(ret); + + pr_info("deleting once"); + ret = bch2_btree_delete_at(&iter, 0); + BUG_ON(ret); + + pr_info("deleting twice"); + ret = bch2_btree_delete_at(&iter, 0); + BUG_ON(ret); + + bch2_btree_iter_unlock(&iter); +} + +static void test_delete_written(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter); + BUG_ON(ret); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &k.k_i)); + BUG_ON(ret); + + bch2_journal_flush_all_pins(&c->journal); + + ret = bch2_btree_delete_at(&iter, 0); + BUG_ON(ret); + + bch2_btree_iter_unlock(&iter); +} + +static void test_iterate(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 i; + int ret; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) + BUG_ON(k.k->p.offset != i++); + bch2_btree_iter_unlock(&iter); + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) + BUG_ON(k.k->p.offset != --i); + bch2_btree_iter_unlock(&iter); + + BUG_ON(i); +} + +static void test_iterate_extents(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 i; + int ret; + + delete_test_keys(c); + + pr_info("inserting test extents"); + + for (i = 0; i < nr; i += 8) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 8; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + } + bch2_btree_iter_unlock(&iter); + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + } + bch2_btree_iter_unlock(&iter); + + BUG_ON(i); +} + +static void test_iterate_slots(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 i; + int ret; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i 
= 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i * 2; + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) { + BUG_ON(k.k->p.offset != i); + i += 2; + } + bch2_btree_iter_unlock(&iter); + + BUG_ON(i != nr * 2); + + pr_info("iterating forwards by slots"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), + BTREE_ITER_SLOTS, k) { + BUG_ON(bkey_deleted(k.k) != (i & 1)); + BUG_ON(k.k->p.offset != i++); + + if (i == nr * 2) + break; + } + bch2_btree_iter_unlock(&iter); +} + +static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 i; + int ret; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i += 16) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 16; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + } + bch2_btree_iter_unlock(&iter); + + BUG_ON(i != nr); + + pr_info("iterating forwards by slots"); + + i = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), + BTREE_ITER_SLOTS, k) { + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + + if (i == nr) + break; + } + bch2_btree_iter_unlock(&iter); +} + +/* perf tests */ + +static u64 test_rand(void) +{ + u64 v; +#if 0 + v = prandom_u32_max(U32_MAX); +#else + get_random_bytes(&v, sizeof(v)); +#endif + return v; +} + +static void rand_insert(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void rand_lookup(struct bch_fs *c, u64 nr) +{ + u64 i; + + for (i = 0; i < nr; i++) { + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(&iter); + bch2_btree_iter_unlock(&iter); + } +} + +static void rand_mixed(struct bch_fs *c, u64 nr) +{ + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(&iter); + + if (!(i & 3) && k.k) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &k.k_i)); + BUG_ON(ret); + } + + bch2_btree_iter_unlock(&iter); + } + +} + +static void rand_delete(struct bch_fs *c, u64 nr) +{ + struct bkey_i k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_init(&k.k); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void seq_insert(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret; + u64 i = 0; + + bkey_cookie_init(&insert.k_i); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, + 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + insert.k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &insert.k_i)); + BUG_ON(ret); + + if (++i == nr) + break; + } + bch2_btree_iter_unlock(&iter); +} + +static void seq_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) + ; + bch2_btree_iter_unlock(&iter); +} + +static void seq_overwrite(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT, k) { + struct bkey_i_cookie u; + + bkey_reassemble(&u.k_i, k); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &u.k_i)); + BUG_ON(ret); + } + bch2_btree_iter_unlock(&iter); +} + +static void seq_delete(struct bch_fs *c, u64 nr) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS(0, 0), POS(0, U64_MAX), + ZERO_VERSION, NULL, NULL, NULL); + BUG_ON(ret); +} + +typedef void (*perf_test_fn)(struct bch_fs *, u64); + +struct test_job { + struct bch_fs *c; + u64 nr; + unsigned nr_threads; + perf_test_fn fn; + + atomic_t ready; + wait_queue_head_t ready_wait; + + atomic_t done; + struct completion done_completion; + + u64 start; + u64 finish; +}; + +static int btree_perf_test_thread(void *data) +{ + struct test_job *j = data; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); + j->start = sched_clock(); + } else { + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + + j->fn(j->c, j->nr / j->nr_threads); + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); + complete(&j->done_completion); + } + + return 0; +} + +void bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20], nr_buf[20], per_sec_buf[20]; + unsigned i; + u64 time; + + atomic_set(&j.ready, nr_threads); + init_waitqueue_head(&j.ready_wait); + + atomic_set(&j.done, nr_threads); + init_completion(&j.done_completion); + +#define perf_test(_test) \ + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); + + perf_test(seq_insert); + perf_test(seq_lookup); + perf_test(seq_overwrite); + perf_test(seq_delete); + + /* a unit test, not a perf test: */ + perf_test(test_delete); + perf_test(test_delete_written); + perf_test(test_iterate); + perf_test(test_iterate_extents); + perf_test(test_iterate_slots); + perf_test(test_iterate_slots_extents); + + if (!j.fn) { + pr_err("unknown test %s", testname); + return; + } + + //pr_info("running test %s:", testname); + + if (nr_threads == 1) + btree_perf_test_thread(&j); + else + for (i = 0; i < nr_threads; i++) + kthread_run(btree_perf_test_thread, &j, + "bcachefs perf test[%u]", i); + + while (wait_for_completion_interruptible(&j.done_completion)) + ; + + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + bch2_hprint(nr_buf, nr); + bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf, nr_threads, + time / NSEC_PER_SEC, + time * nr_threads / nr, + per_sec_buf); +} + +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 index 
000000000000..551d0764225e --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TEST_H +#define _BCACHEFS_TEST_H + +struct bch_fs; + +#ifdef CONFIG_BCACHEFS_TESTS + +void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); + +#else + +#endif /* CONFIG_BCACHEFS_TESTS */ + +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 index 000000000000..b770973faa14 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_types.h" +#include "buckets.h" +#include "btree_types.h" +#include "keylist.h" + +#include +#include "keylist.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 index 000000000000..d0b99c692063 --- /dev/null +++ b/fs/bcachefs/trace.h @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs + +#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHEFS_H + +#include + +DECLARE_EVENT_CLASS(bpos, + TP_PROTO(struct bpos *p), + TP_ARGS(p), + + TP_STRUCT__entry( + __field(u64, inode ) + __field(u64, offset ) + ), + + TP_fast_assign( + __entry->inode = p->inode; + __entry->offset = p->offset; + ), + + TP_printk("%llu:%llu", __entry->inode, __entry->offset) +); + +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(const struct bkey *k), + TP_ARGS(k), + + TP_STRUCT__entry( + __field(u64, inode ) + __field(u64, offset ) + __field(u32, size ) + ), + + TP_fast_assign( + __entry->inode = k->p.inode; + __entry->offset = k->p.offset; + __entry->size = k->size; + ), + + TP_printk("%llu:%llu len %u", __entry->inode, + __entry->offset, __entry->size) +); + +DECLARE_EVENT_CLASS(bch_dev, + TP_PROTO(struct bch_dev *ca), + TP_ARGS(ca), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, ca->uuid.b, 16); + ), + + TP_printk("%pU", __entry->uuid) +); + +DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + ), + + TP_printk("%pU", __entry->uuid) +); + +DECLARE_EVENT_CLASS(bio, + TP_PROTO(struct bio *bio), + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev ? 
bio_dev(bio) : 0; + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); + ), + + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) +); + +/* io.c: */ + +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* Journal */ + +DEFINE_EVENT(bch_fs, journal_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, journal_entry_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bio, journal_write, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, + TP_PROTO(struct bpos *p), + TP_ARGS(p) +); + +/* Btree */ + +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u8, level ) + __field(u8, id ) + __field(u64, inode ) + __field(u64, offset ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->level = b->level; + __entry->id = b->btree_id; + __entry->inode = b->key.k.p.inode; + __entry->offset = b->key.k.p.offset; + ), + + TP_printk("%pU %u id %u %llu:%llu", + __entry->uuid, __entry->level, __entry->id, + __entry->inode, __entry->offset) +); + +DEFINE_EVENT(btree_node, btree_read, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_write, + TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), + TP_ARGS(b, bytes, sectors), + + TP_STRUCT__entry( + __field(enum bkey_type, type) + __field(unsigned, bytes ) + __field(unsigned, sectors ) + ), + + TP_fast_assign( + __entry->type = btree_node_type(b); + __entry->bytes = bytes; + __entry->sectors = sectors; + ), + + TP_printk("bkey type %u bytes %u sectors %u", + __entry->type , __entry->bytes, __entry->sectors) +); + +DEFINE_EVENT(btree_node, btree_node_alloc, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_free, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_reap, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + ), + + TP_printk("%pU", __entry->uuid) +); + +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), + TP_ARGS(c, required, cl), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(size_t, required ) + __field(struct closure *, cl ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, 
c->sb.user_uuid.b, 16); + __entry->required = required; + __entry->cl = cl; + ), + + TP_printk("%pU required %zu by %p", __entry->uuid, + __entry->required, __entry->cl) +); + +TRACE_EVENT(btree_insert_key, + TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), + TP_ARGS(c, b, k), + + TP_STRUCT__entry( + __field(u8, id ) + __field(u64, inode ) + __field(u64, offset ) + __field(u32, size ) + ), + + TP_fast_assign( + __entry->id = b->btree_id; + __entry->inode = k->k.p.inode; + __entry->offset = k->k.p.offset; + __entry->size = k->k.size; + ), + + TP_printk("btree %u: %llu:%llu len %u", __entry->id, + __entry->inode, __entry->offset, __entry->size) +); + +DEFINE_EVENT(btree_node, btree_split, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_compact, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_merge, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_set_root, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +/* Garbage collection */ + +DEFINE_EVENT(btree_node, btree_gc_coalesce, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_gc_coalesce_fail, + TP_PROTO(struct bch_fs *c, int reason), + TP_ARGS(c, reason), + + TP_STRUCT__entry( + __field(u8, reason ) + __array(char, uuid, 16 ) + ), + + TP_fast_assign( + __entry->reason = reason; + memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); + ), + + TP_printk("%pU: %u", __entry->uuid, __entry->reason) +); + +DEFINE_EVENT(btree_node, btree_gc_rewrite_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(bch_fs, gc_start, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_end, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_coalesce_start, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_coalesce_end, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_dev, sectors_saturated, + TP_PROTO(struct bch_dev *ca), + TP_ARGS(ca) +); + +DEFINE_EVENT(bch_fs, gc_sectors_saturated, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Allocator */ + +TRACE_EVENT(alloc_batch, + TP_PROTO(struct bch_dev *ca, size_t free, size_t total), + TP_ARGS(ca, free, total), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(size_t, free ) + __field(size_t, total ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->free = free; + __entry->total = total; + ), + + TP_printk("%pU free %zu total %zu", + __entry->uuid, __entry->free, __entry->total) +); + +TRACE_EVENT(invalidate, + TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), + TP_ARGS(ca, offset, sectors), + + TP_STRUCT__entry( + __field(unsigned, sectors ) + __field(dev_t, dev ) + __field(__u64, offset ) + ), + + TP_fast_assign( + __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->offset = offset, + __entry->sectors = sectors; + ), + + TP_printk("invalidated %u sectors at %d,%d sector=%llu", + __entry->sectors, MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->offset) +); + +DEFINE_EVENT(bch_fs, rescale_prios, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DECLARE_EVENT_CLASS(bucket_alloc, + TP_PROTO(struct bch_dev 
*ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve), + + TP_STRUCT__entry( + __array(char, uuid, 16) + __field(enum alloc_reserve, reserve ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->reserve = reserve; + ), + + TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) +); + +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) +); + +/* Moving IO */ + +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_alloc_fail, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_race, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +TRACE_EVENT(move_data, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, + u64 keys_moved), + TP_ARGS(c, sectors_moved, keys_moved), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, sectors_moved ) + __field(u64, keys_moved ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->sectors_moved = sectors_moved; + __entry->keys_moved = keys_moved; + ), + + TP_printk("%pU sectors_moved %llu keys_moved %llu", + __entry->uuid, __entry->sectors_moved, __entry->keys_moved) +); + +TRACE_EVENT(copygc, + TP_PROTO(struct bch_dev *ca, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), + TP_ARGS(ca, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) + __field(u64, buckets_not_moved ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; + ), + + TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", + __entry->uuid, + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) +); + +#endif /* _TRACE_BCACHEFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../fs/bcachefs + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 index 000000000000..6666c3aed05f --- /dev/null +++ b/fs/bcachefs/util.c @@ -0,0 +1,942 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eytzinger.h" +#include "util.h" + +#define simple_strtoint(c, end, base) simple_strtol(c, end, base) +#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) + +static const char si_units[] = "?kMGTPEZY"; + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + unsigned u; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + if (!isdigit(*cp)) + return -EINVAL; + + do { + if (v > U64_MAX / 10) + return -ERANGE; + v *= 10; + if (v > U64_MAX - (*cp - '0')) + return -ERANGE; + v += *cp - '0'; + cp++; + } while (isdigit(*cp)); + + for (u = 1; u < strlen(si_units); u++) + if (*cp == si_units[u]) { + cp++; + goto got_unit; + } + u = 0; +got_unit: + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + + if (fls64(v) + u * 10 > 64) + return -ERANGE; + + v <<= u * 10; + + if (positive) { + if (v > t_max) + return -ERANGE; + } else { + if (v && !t_signed) + return -ERANGE; + + if (v > t_max + 1) + return -ERANGE; + v = -v; + } + + *res = v; + return 0; +} + +#define STRTO_H(name, type) \ +int bch2_ ## name ## _h(const char *cp, type *res) \ +{ \ + u64 v; \ + int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ + ANYSINT_MAX(type) != ((type) ~0ULL)); \ + *res = v; \ + return ret; \ +} + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) + +ssize_t bch2_hprint(char *buf, s64 v) +{ + char dec[4] = ""; + int u, t = 0; + + for (u = 0; v >= 1024 || v <= -1024; u++) { + t = v & ~(~0U << 10); + v >>= 10; + } + + if (!u) + return sprintf(buf, "%lli", v); + + /* + * 103 is magic: t is in the range [-1023, 1023] and we want + * to turn it into [-9, 9] + */ + if (v < 100 && v > -100) + scnprintf(dec, sizeof(dec), ".%i", t / 103); + + return sprintf(buf, "%lli%s%c", v, dec, si_units[u]); +} + +ssize_t bch2_scnprint_string_list(char *buf, size_t size, + const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + if (size) + *out = '\0'; + + for (i = 0; list[i]; i++) + out += scnprintf(out, buf + size - out, + i == selected ? 
"[%s] " : "%s ", list[i]); + + if (out != buf) + *--out = '\0'; + + return out - buf; +} + +ssize_t bch2_scnprint_flag_list(char *buf, size_t size, + const char * const list[], u64 flags) +{ + char *out = buf, *end = buf + size; + unsigned bit, nr = 0; + + while (list[nr]) + nr++; + + if (size) + *out = '\0'; + + while (flags && (bit = __ffs(flags)) < nr) { + out += scnprintf(out, end - out, "%s,", list[bit]); + flags ^= 1 << bit; + } + + if (out != buf) + *--out = '\0'; + + return out - buf; +} + +u64 bch2_read_flag_list(char *opt, const char * const list[]) +{ + u64 ret = 0; + char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); + + if (!d) + return -ENOMEM; + + s = strim(d); + + while ((p = strsep(&s, ","))) { + int flag = match_string(list, -1, p); + if (flag < 0) { + ret = -1; + break; + } + + ret |= 1 << flag; + } + + kfree(d); + + return ret; +} + +bool bch2_is_zero(const void *_p, size_t n) +{ + const char *p = _p; + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +/* time stats: */ + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct bch2_quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static void bch2_time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + + duration = time_after64(end, start) + ? end - start : 0; + freq = time_after64(end, stats->last_event) + ? end - stats->last_event : 0; + + stats->count++; + + stats->average_duration = stats->average_duration + ? ewma_add(stats->average_duration, duration, 6) + : duration; + + stats->average_frequency = stats->average_frequency + ? 
ewma_add(stats->average_frequency, freq, 6) + : freq; + + stats->max_duration = max(stats->max_duration, duration); + + stats->last_event = end; + + bch2_quantiles_update(&stats->quantiles, duration); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + + if (stats->average_frequency < 32 && + stats->count > 1024) + stats->buffer = + alloc_percpu_gfp(struct bch2_time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct bch2_time_stat_buffer_entry *i; + struct bch2_time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (b->nr == ARRAY_SIZE(b->entries)) { + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; + } + + preempt_enable(); + } +} +#endif + +static const struct time_unit { + const char *name; + u32 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "sec", NSEC_PER_SEC }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static size_t pr_time_units(char *buf, size_t len, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +size_t bch2_time_stats_print(struct bch2_time_stats *stats, char *buf, size_t len) +{ + char *out = buf, *end = buf + len; + const struct time_unit *u; + u64 freq = READ_ONCE(stats->average_frequency); + u64 q, last_q = 0; + int i; + + out += scnprintf(out, end - out, "count:\t\t%llu\n", + stats->count); + out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + + out += scnprintf(out, end - out, "frequency:\t"); + out += pr_time_units(out, end - out, freq); + + out += scnprintf(out, end - out, "\navg duration:\t"); + out += pr_time_units(out, end - out, stats->average_duration); + + out += scnprintf(out, end - out, "\nmax duration:\t"); + out += pr_time_units(out, end - out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + + out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); + out += scnprintf(out, end - out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? 
"\n" : " "); + last_q = q; + } + + return out - buf; +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + spin_lock_init(&stats->lock); +} + +/* ratelimit: */ + +/** + * bch2_ratelimit_delay() - return how long to delay until the next time to do + * some work + * + * @d - the struct bch_ratelimit to update + * + * Returns the amount of time to delay by, in jiffies + */ +u64 bch2_ratelimit_delay(struct bch_ratelimit *d) +{ + u64 now = local_clock(); + + return time_after64(d->next, now) + ? nsecs_to_jiffies(d->next - now) + : 0; +} + +/** + * bch2_ratelimit_increment() - increment @d by the amount of work done + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + */ +void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) +{ + u64 now = local_clock(); + + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; +} + +int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (1) { + u64 delay = bch2_ratelimit_delay(d); + + if (delay) + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread && kthread_should_stop()) + return 1; + + if (!delay) + return 0; + + schedule_timeout(delay); + try_to_freeze(); + } +} + +/* pd controller: */ + +/* + * Updates pd_controller. Attempts to scale inputed values to units per second. + * @target: desired value + * @actual: current value + * + * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing + * it makes actual go down. 
+ */ +void bch2_pd_controller_update(struct bch_pd_controller *pd, + s64 target, s64 actual, int sign) +{ + s64 proportional, derivative, change; + + unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; + + if (seconds_since_update == 0) + return; + + pd->last_update = jiffies; + + proportional = actual - target; + proportional *= seconds_since_update; + proportional = div_s64(proportional, pd->p_term_inverse); + + derivative = actual - pd->last_actual; + derivative = div_s64(derivative, seconds_since_update); + derivative = ewma_add(pd->smoothed_derivative, derivative, + (pd->d_term / seconds_since_update) ?: 1); + derivative = derivative * pd->d_term; + derivative = div_s64(derivative, pd->p_term_inverse); + + change = proportional + derivative; + + /* Don't increase rate if not keeping up */ + if (change > 0 && + pd->backpressure && + time_after64(local_clock(), + pd->rate.next + NSEC_PER_MSEC)) + change = 0; + + change *= (sign * -1); + + pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, + 1, UINT_MAX); + + pd->last_actual = actual; + pd->last_derivative = derivative; + pd->last_proportional = proportional; + pd->last_change = change; + pd->last_target = target; +} + +void bch2_pd_controller_init(struct bch_pd_controller *pd) +{ + pd->rate.rate = 1024; + pd->last_update = jiffies; + pd->p_term_inverse = 6000; + pd->d_term = 30; + pd->d_smooth = pd->d_term; + pd->backpressure = 1; +} + +size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) +{ + /* 2^64 - 1 is 20 digits, plus null byte */ + char rate[21]; + char actual[21]; + char target[21]; + char proportional[21]; + char derivative[21]; + char change[21]; + s64 next_io; + + bch2_hprint(rate, pd->rate.rate); + bch2_hprint(actual, pd->last_actual); + bch2_hprint(target, pd->last_target); + bch2_hprint(proportional, pd->last_proportional); + bch2_hprint(derivative, pd->last_derivative); + bch2_hprint(change, pd->last_change); + + next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); + + return sprintf(buf, + "rate:\t\t%s/sec\n" + "target:\t\t%s\n" + "actual:\t\t%s\n" + "proportional:\t%s\n" + "derivative:\t%s\n" + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, target, actual, proportional, + derivative, change, next_io); +} + +/* misc: */ + +void bch2_bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_iter.bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? offset_in_page(base) : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? 
vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} + +int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { + struct page *page = alloc_pages(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) + return -ENOMEM; + + if (unlikely(!bio_add_page(bio, page, len, 0))) { + __free_page(page); + break; + } + + size -= len; + } + + return 0; +} + +size_t bch2_rand_range(size_t max) +{ + size_t rand; + + if (!max) + return 0; + + do { + rand = get_random_long(); + rand &= roundup_pow_of_two(max) - 1; + } while (rand >= max); + + return rand; +} + +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { + void *dstp = kmap_atomic(bv.bv_page); + memcpy(dstp + bv.bv_offset, src, bv.bv_len); + kunmap_atomic(dstp); + + src += bv.bv_len; + } +} + +void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { + void *srcp = kmap_atomic(bv.bv_page); + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); + kunmap_atomic(srcp); + + dst += bv.bv_len; + } +} + +size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len) +{ + size_t n; + + if (!size) + return 0; + + n = min(size - 1, len); + memcpy(buf, src, n); + buf[n] = '\0'; + + return n; +} + +#include "eytzinger.h" + +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + ((unsigned long)base & (align - 1)) == 0; +} + +static void u32_swap(void *a, void *b, size_t size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +static void u64_swap(void *a, void *b, size_t size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + +static void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +static inline int do_cmp(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + size_t l, size_t r) +{ + return cmp_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +static inline void do_swap(void *base, size_t n, size_t size, + void (*swap_func)(void *, void *, size_t), + size_t l, size_t r) +{ + swap_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)) +{ + int i, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + do_swap(base, n, size, swap_func, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, 
size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t size)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) * size, n = num * size, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(base, base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } +} + +static void mempool_free_vp(void *element, void *pool_data) +{ + size_t size = (size_t) pool_data; + + vpfree(element, size); +} + +static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + + return vpmalloc(size, gfp_mask); +} + +int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return size < PAGE_SIZE + ? mempool_init_kmalloc_pool(pool, min_nr, size) + : mempool_init(pool, min_nr, mempool_alloc_vp, + mempool_free_vp, (void *) size); +} + +#if 0 +void eytzinger1_test(void) +{ + unsigned inorder, eytz, size; + + pr_info("1 based eytzinger test:"); + + for (size = 2; + size < 65536; + size++) { + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); + BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); + + BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); + BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + + inorder = 1; + eytzinger1_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); + BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger1_last(size) && + eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +void eytzinger0_test(void) +{ + + unsigned inorder, eytz, size; + + pr_info("0 based eytzinger test:"); + + for (size = 1; + size < 65536; + size++) { + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); + BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); + + BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); + BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + + inorder = 0; + eytzinger0_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); + BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger0_last(size) && + eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +static inline int cmp_u16(const void *_l, const void *_r, size_t size) +{ + const u16 *l = _l, *r = _r; + + return (*l > *r) - (*r - *l); +} + +static void eytzinger0_find_test_val(u16 
*test_array, unsigned nr, u16 search) +{ + int i, c1 = -1, c2 = -1; + ssize_t r; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) + c1 = test_array[r]; + + for (i = 0; i < nr; i++) + if (test_array[i] <= search && test_array[i] > c2) + c2 = test_array[i]; + + if (c1 != c2) { + eytzinger0_for_each(i, nr) + pr_info("[%3u] = %12u", i, test_array[i]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + i, r, c1, c2); + } +} + +void eytzinger0_find_test(void) +{ + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { + pr_info("testing %u elems", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ + eytzinger0_for_each(i, nr) + BUG_ON(i != eytzinger0_last(nr) && + test_array[i] > test_array[eytzinger0_next(i, nr)]); + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); + + for (i = 0; i < nr; i++) { + eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); + eytzinger0_find_test_val(test_array, nr, test_array[i]); + eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); + } + } + + kfree(test_array); +} +#endif diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 index 000000000000..c0b26123af4c --- /dev/null +++ b/fs/bcachefs/util.h @@ -0,0 +1,737 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) + +struct closure; + +#ifdef CONFIG_BCACHEFS_DEBUG + +#define EBUG_ON(cond) BUG_ON(cond) +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) +#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) +#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) +#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) +#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) +#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) +#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) +#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) +#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) + +#else /* DEBUG */ + +#define EBUG_ON(cond) +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) +#define atomic_sub_bug(i, v) atomic_sub(i, v) +#define atomic_add_bug(i, v) atomic_add(i, v) +#define atomic_long_dec_bug(v) atomic_long_dec(v) +#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) +#define atomic64_dec_bug(v) atomic64_dec(v) +#define atomic64_inc_bug(v, i) atomic64_inc(v) +#define atomic64_sub_bug(i, v) atomic64_sub(i, v) +#define atomic64_add_bug(i, v) atomic64_add(i, v) + +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define CPU_BIG_ENDIAN 0 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CPU_BIG_ENDIAN 1 +#endif + +/* type hackery */ + +#define type_is_exact(_val, _type) \ + __builtin_types_compatible_p(typeof(_val), _type) + +#define type_is(_val, _type) \ + (__builtin_types_compatible_p(typeof(_val), _type) || \ + __builtin_types_compatible_p(typeof(_val), const _type)) + +/* Userspace doesn't align 
allocations as nicely as the kernel allocators: */ +static inline size_t buf_pages(void *p, size_t len) +{ + return DIV_ROUND_UP(len + + ((unsigned long) p & (PAGE_SIZE - 1)), + PAGE_SIZE); +} + +static inline void vpfree(void *p, size_t size) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + free_pages((unsigned long) p, get_order(size)); +} + +static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +{ + return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc(size, gfp_mask); +} + +static inline void kvpfree(void *p, size_t size) +{ + if (size < PAGE_SIZE) + kfree(p); + else + vpfree(p, size); +} + +static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE + ? kmalloc(size, gfp_mask) + : vpmalloc(size, gfp_mask); +} + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + +#define HEAP(type) \ +struct { \ + size_t size, used; \ + type *data; \ +} + +#define DECLARE_HEAP(type, name) HEAP(type) name + +#define init_heap(heap, _size, gfp) \ +({ \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (gfp)); \ +}) + +#define free_heap(heap) \ +do { \ + kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_peek(h) \ +({ \ + EBUG_ON(!(h)->used); \ + (h)->data[0]; \ +}) + +#define heap_full(h) ((h)->used == (h)->size) + +#define heap_sift_down(h, i, cmp) \ +do { \ + size_t _c, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _c) { \ + _c = _j * 2 + 1; \ + if (_c + 1 < (h)->used && \ + cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ + _c++; \ + \ + if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ + break; \ + heap_swap(h, _c, _j); \ + } \ +} while (0) + +#define heap_sift_up(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define __heap_add(h, d, cmp) \ +do { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_up(h, _i, cmp); \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) \ + __heap_add(h, d, cmp); \ + _r; \ +}) + +#define heap_add_or_replace(h, new, cmp) \ +do { \ + if (!heap_add(h, new, cmp) && \ + cmp(h, new, heap_peek(h)) >= 0) { \ + (h)->data[0] = new; \ + heap_sift_down(h, 0, cmp); \ + } \ +} while (0) + +#define heap_del(h, i, cmp) \ +do { \ + size_t _i = (i); \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ + heap_swap(h, _i, (h)->used); \ + heap_sift_up(h, _i, cmp); \ + heap_sift_down(h, _i, cmp); \ +} while (0) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + heap_del(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_resort(heap, cmp) \ +do { \ + ssize_t _i; \ + for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ + heap_sift_down(heap, _i, cmp); \ +} while (0) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); +int bch2_strtoll_h(const char *, long long *); +int bch2_strtoull_h(const char *, unsigned long long *); + +static inline int bch2_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtoint_h(cp, (int *) res); +#else + return bch2_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int 
bch2_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtouint_h(cp, (unsigned int *) res); +#else + return bch2_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ + : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ + : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ + : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ + : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ + : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ + : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define strtoul_safe_restrict(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r && _v >= min && _v <= max) \ + var = _v; \ + else \ + _r = -EINVAL; \ + _r; \ +}) + +#define snprint(buf, size, var) \ + snprintf(buf, size, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? "%li\n" \ + : type_is(var, unsigned long) ? "%lu\n" \ + : type_is(var, s64) ? "%lli\n" \ + : type_is(var, u64) ? "%llu\n" \ + : type_is(var, char *) ? "%s\n" \ + : "%i\n", var) + +ssize_t bch2_hprint(char *buf, s64 v); + +bool bch2_is_zero(const void *, size_t); + +ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t); + +ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); +u64 bch2_read_flag_list(char *, const char * const[]); + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct bch2_quantiles { + struct bch2_quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct bch2_time_stat_buffer { + unsigned nr; + struct bch2_time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + +struct bch2_time_stats { + spinlock_t lock; + u64 count; + /* all fields are in nanoseconds */ + u64 average_duration; + u64 average_frequency; + u64 max_duration; + u64 last_event; + struct bch2_quantiles quantiles; + + struct bch2_time_stat_buffer __percpu *buffer; +}; + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +#endif + +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +size_t bch2_time_stats_print(struct bch2_time_stats *, char *, size_t); + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +#define ewma_add(ewma, val, weight) \ +({ \ + typeof(ewma) _ewma = (ewma); \ + typeof(weight) _weight = (weight); \ + \ + (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + u64 next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to + * bch2_ratelimit_increment() + */ + unsigned rate; +}; + 
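+
+/*
+ * Rough usage sketch (illustrative; have_work() and do_work() are
+ * placeholders, not real helpers): a throttled worker pairs
+ * bch2_ratelimit_increment() with the wait helper below so that it averages
+ * d.rate units of work per second:
+ *
+ *	bch2_ratelimit_reset(&d);
+ *	while (have_work()) {
+ *		u64 done = do_work();
+ *
+ *		bch2_ratelimit_increment(&d, done);
+ *		if (bch2_ratelimit_wait_freezable_stoppable(&d))
+ *			break;
+ *	}
+ *
+ * bch2_ratelimit_wait_freezable_stoppable() returns nonzero when the kthread
+ * has been asked to stop.  bch2_ratelimit_increment() pushes d.next forward
+ * by done / rate seconds, clamped to at most one second ahead of (and two
+ * seconds behind) the local clock, and the wait sleeps until d.next, so
+ * short bursts are smoothed rather than hard-capped.
+ */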
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +u64 bch2_ratelimit_delay(struct bch_ratelimit *); +void bch2_ratelimit_increment(struct bch_ratelimit *, u64); +int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *); + +struct bch_pd_controller { + struct bch_ratelimit rate; + unsigned long last_update; + + s64 last_actual; + s64 smoothed_derivative; + + unsigned p_term_inverse; + unsigned d_smooth; + unsigned d_term; + + /* for exporting to sysfs (no effect on behavior) */ + s64 last_derivative; + s64 last_proportional; + s64 last_change; + s64 last_target; + + /* If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. */ + bool backpressure; +}; + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); +size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ + rw_attribute(name##_rate_bytes); \ + rw_attribute(name##_rate_d_term); \ + rw_attribute(name##_rate_p_term_inverse); \ + read_attribute(name##_rate_debug) + +#define sysfs_pd_controller_files(name) \ + &sysfs_##name##_rate, \ + &sysfs_##name##_rate_bytes, \ + &sysfs_##name##_rate_d_term, \ + &sysfs_##name##_rate_p_term_inverse, \ + &sysfs_##name##_rate_debug + +#define sysfs_pd_controller_show(name, var) \ +do { \ + sysfs_hprint(name##_rate, (var)->rate.rate); \ + sysfs_print(name##_rate_bytes, (var)->rate.rate); \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ + return bch2_pd_controller_print_debug(var, buf); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ +do { \ + sysfs_strtoul_clamp(name##_rate, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul_clamp(name##_rate_bytes, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ + sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ + (var)->p_term_inverse, 1, INT_MAX); \ +} while (0) + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? &(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? 
n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +void bch2_bio_map(struct bio *bio, void *base); +int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl) \ +do { \ + closure_get(cl); \ + submit_bio(bio); \ +} while (0) + +#define kthread_wait_freezable(cond) \ +({ \ + int _ret = 0; \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + try_to_freeze(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +size_t bch2_rand_range(size_t); + +void memcpy_to_bio(struct bio *, struct bvec_iter, void *); +void memcpy_from_bio(void *, struct bio *, struct bvec_iter); + +static inline void __memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +#endif +} + +static inline void memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(!(dst >= src + u64s * sizeof(u64) || + dst + u64s * sizeof(u64) <= src)); + + __memcpy_u64s(dst, src, u64s); +} + +static inline void __memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + __memcpy_u64s(dst, src, u64s); +} + +static inline void memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down(dst, src, u64s); +} + +static inline void __memmove_u64s_up(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s - 1; + u64 *src = (u64 *) _src + u64s - 1; + +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + while (u64s--) + *dst-- = *src--; +#endif +} + +static inline void memmove_u64s_up(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up(dst, src, u64s); +} + +static inline void memmove_u64s(void *dst, const void *src, + unsigned u64s) +{ + if (dst < src) + __memmove_u64s_down(dst, src, u64s); + else + __memmove_u64s_up(dst, src, u64s); +} + +static inline struct bio_vec next_contig_bvec(struct bio *bio, + struct bvec_iter *iter) +{ + struct bio_vec bv = 
bio_iter_iovec(bio, *iter); + + bio_advance_iter(bio, iter, bv.bv_len); +#ifndef CONFIG_HIGHMEM + while (iter->bi_size) { + struct bio_vec next = bio_iter_iovec(bio, *iter); + + if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != + page_address(next.bv_page) + next.bv_offset) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter(bio, iter, next.bv_len); + } +#endif + return bv; +} + +#define __bio_for_each_contig_segment(bv, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bv = next_contig_bvec((bio), &(iter))), 1);) + +#define bio_for_each_contig_segment(bv, bio, iter) \ + __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) + +size_t bch_scnmemcpy(char *, size_t, const char *, size_t); + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + +#define bubble_sort(_base, _nr, _cmp) \ +do { \ + ssize_t _i, _end; \ + bool _swapped = true; \ + \ + for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ + _swapped = false; \ + for (_i = 0; _i < _end; _i++) \ + if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ + swap((_base)[_i], (_base)[_i + 1]); \ + _swapped = true; \ + } \ + } \ +} while (0) + +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 index 000000000000..c099cdc0605f --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VSTRUCTS_H +#define _VSTRUCTS_H + +#include "util.h" + +/* + * NOTE: we can't differentiate between __le64 and u64 with type_is - this + * assumes u64 is little endian: + */ +#define __vstruct_u64s(_s) \ +({ \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ + : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ + : ((__force u8) ((_s)->u64s))); \ +}) + +#define __vstruct_bytes(_type, _u64s) \ +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ + (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ + __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) + +#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ + (round_up(__vstruct_bytes(_type, _u64s), \ + 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) + +#define vstruct_blocks(_s, _sector_block_bits) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) + +#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ + __vstruct_u64s(_s) + (_u64s)) + +#define vstruct_sectors(_s, _sector_block_bits) \ + (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) + +#define vstruct_next(_s) \ + ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_last(_s) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_end(_s) \ + ((void *) ((_s)->_data + __vstruct_u64s(_s))) + +#define vstruct_for_each(_s, _i) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s); \ + _i = vstruct_next(_i)) + +#define vstruct_for_each_safe(_s, _i, _t) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ + _i = _t) + +#define vstruct_idx(_s, _idx) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) + +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 index 000000000000..f0440d12a031 --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "compress.h" +#include "extents.h" +#include "fs.h" +#include "rebalance.h" +#include "str_hash.h" +#include "xattr.h" + +#include +#include +#include + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); + +static u64 bch2_xattr_hash(const struct bch_hash_info *info, + const struct xattr_search_key *key) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); + bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); + + return bch2_str_hash_end(&ctx, info); +} + +static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_xattr_hash(info, key); +} + +static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); + + return bch2_xattr_hash(info, + &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); +} + +static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + const struct xattr_search_key *r = _r; + + return l.v->x_type != r->type || + l.v->x_name_len != r->name.len || + memcmp(l.v->x_name, r->name.name, r->name.len); +} + +static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); + + return l.v->x_type != r.v->x_type || + l.v->x_name_len != r.v->x_name_len || + memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); +} + +const struct bch_hash_desc bch2_xattr_hash_desc = { + .btree_id = BTREE_ID_XATTRS, + .key_type = BCH_XATTR, + .whiteout_type = BCH_XATTR_WHITEOUT, + .hash_key = xattr_hash_key, + .hash_bkey = 
xattr_hash_bkey, + .cmp_key = xattr_cmp_key, + .cmp_bkey = xattr_cmp_bkey, +}; + +const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr; + + switch (k.k->type) { + case BCH_XATTR: + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) + return "value too small"; + + xattr = bkey_s_c_to_xattr(k); + + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))) + return "value too small"; + + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)) + return "value too big"; + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (!handler) + return "invalid type"; + + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) + return "xattr name has invalid characters"; + + return NULL; + case BCH_XATTR_WHITEOUT: + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; + + default: + return "invalid type"; + } +} + +void bch2_xattr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr; + size_t n = 0; + + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k); + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (handler && handler->prefix) + n += scnprintf(buf + n, size - n, "%s", handler->prefix); + else if (handler) + n += scnprintf(buf + n, size - n, "(type %u)", + xattr.v->x_type); + else + n += scnprintf(buf + n, size - n, "(unknown type %u)", + xattr.v->x_type); + + n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name, + xattr.v->x_name_len); + n += scnprintf(buf + n, size - n, ":"); + n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + break; + case BCH_XATTR_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + int ret; + + bch2_trans_init(&trans, c); + + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); + if (IS_ERR(iter)) { + bch2_trans_exit(&trans); + BUG_ON(PTR_ERR(iter) == -EINTR); + + return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); + } + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + ret = le16_to_cpu(xattr.v->x_val_len); + if (buffer) { + if (ret > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr.v), ret); + } + + bch2_trans_exit(&trans); + return ret; +} + +int bch2_xattr_set(struct btree_trans *trans, u64 inum, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) +{ + int ret; + + if (value) { + struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); + unsigned u64s = BKEY_U64s + + xattr_val_u64s(namelen, size); + + if (u64s > U8_MAX) + return -ERANGE; + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = type; + xattr->v.x_name_len = namelen; + xattr->v.x_val_len = cpu_to_le16(size); + memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr_val(&xattr->v), value, size); + + ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inum, &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + } else { + struct xattr_search_key search = + X_SEARCH(type, name, strlen(name)); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, + hash_info, inum, &search); + } + + if (ret == -ENOENT) + ret = flags & XATTR_REPLACE ? -ENODATA : 0; + + return ret; +} + +static size_t bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler = + bch2_xattr_type_to_handler(xattr->x_type); + + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + xattr->x_name_len + 1; + + if (buffer && total_len <= buffer_size) { + memcpy(buffer, prefix, prefix_len); + memcpy(buffer + prefix_len, + xattr->x_name, xattr->x_name_len); + buffer[prefix_len + xattr->x_name_len] = '\0'; + } + + return total_len; + } else { + return 0; + } +} + +ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct bch_fs *c = dentry->d_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + u64 inum = dentry->d_inode->i_ino; + ssize_t ret = 0; + size_t len; + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) { + BUG_ON(k.k->p.inode < inum); + + if (k.k->p.inode > inum) + break; + + if (k.k->type != BCH_XATTR) + continue; + + xattr = bkey_s_c_to_xattr(k).v; + + len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size); + if (buffer) { + if (len > buffer_size) { + bch2_btree_iter_unlock(&iter); + return -ERANGE; + } + + buffer += len; + buffer_size -= len; + } + + ret += len; + + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + +static int bch2_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = 
to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, + bch2_xattr_set(&trans, inode->v.i_ino, + &inode->ei_str_hash, + name, value, size, + handler->flags, flags)); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = BCH_XATTR_INDEX_USER, +}; + +static bool bch2_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler bch_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = bch2_xattr_trusted_list, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = BCH_XATTR_INDEX_TRUSTED, +}; + +static const struct xattr_handler bch_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = BCH_XATTR_INDEX_SECURITY, +}; + +#ifndef NO_BCACHEFS_FS + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_opts opts = + bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + const struct bch_option *opt; + int ret, id; + u64 v; + + id = bch2_opt_lookup(name); + if (id < 0 || !bch2_opt_is_inode_opt(id)) + return -EINVAL; + + opt = bch2_opt_table + id; + + if (!bch2_opt_defined_by_id(&opts, id)) + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); + + ret = bch2_opt_to_text(c, buffer, size, opt, v, 0); + + return ret < size || !buffer ? ret : -ERANGE; +} + +struct inode_opt_set { + int id; + u64 v; + bool defined; +}; + +static int inode_opt_set_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_opt_set *s = p; + + if (s->defined) + bch2_inode_opt_set(bi, s->id, s->v); + else + bch2_inode_opt_clear(bi, s->id); + return 0; +} + +static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + const struct bch_option *opt; + char *buf; + struct inode_opt_set s; + int ret; + + s.id = bch2_opt_lookup(name); + if (s.id < 0 || !bch2_opt_is_inode_opt(s.id)) + return -EINVAL; + + opt = bch2_opt_table + s.id; + + if (value) { + buf = kmalloc(size + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, value, size); + buf[size] = '\0'; + + ret = bch2_opt_parse(c, opt, buf, &s.v); + kfree(buf); + + if (ret < 0) + return ret; + + if (s.id == Opt_compression || + s.id == Opt_background_compression) { + ret = bch2_check_set_has_compressed_data(c, s.v); + if (ret) + return ret; + } + + s.defined = true; + } else { + s.defined = false; + } + + mutex_lock(&inode->ei_update_lock); + ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + mutex_unlock(&inode->ei_update_lock); + + if (value && + (s.id == Opt_background_compression || + s.id == Opt_background_target)) + bch2_rebalance_add_work(c, inode->v.i_blocks); + + return ret; +} + +static const struct xattr_handler bch_xattr_bcachefs_handler = { + .prefix = "bcachefs.", + .get = bch2_xattr_bcachefs_get, + .set = bch2_xattr_bcachefs_set, +}; + +#endif /* 
NO_BCACHEFS_FS */ + +const struct xattr_handler *bch2_xattr_handlers[] = { + &bch_xattr_user_handler, + &nop_posix_acl_access, + &nop_posix_acl_default, + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, +#ifndef NO_BCACHEFS_FS + &bch_xattr_bcachefs_handler, +#endif + NULL +}; + +static const struct xattr_handler *bch_xattr_handler_map[] = { + [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = + &nop_posix_acl_access, + [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &nop_posix_acl_default, + [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, +}; + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) +{ + return type < ARRAY_SIZE(bch_xattr_handler_map) + ? bch_xattr_handler_map[type] + : NULL; +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 index 000000000000..0e7d2fa86213 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_H +#define _BCACHEFS_XATTR_H + +#include "str_hash.h" + +extern const struct bch_hash_desc bch2_xattr_hash_desc; + +const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_xattr_ops (struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ +} + +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + name_len + val_len, sizeof(u64)); +} + +#define xattr_val(_xattr) \ + ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + +struct xattr_search_key { + u8 type; + struct qstr name; +}; + +#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ + { .type = _type, .name = QSTR_INIT(_name, _len) }) + +struct dentry; +struct xattr_handler; +struct bch_hash_info; +struct bch_inode_info; + +int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, + const char *, void *, size_t, int); + +int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + +ssize_t bch2_xattr_list(struct dentry *, char *, size_t); + +extern const struct xattr_handler *bch2_xattr_handlers[]; + +#endif /* _BCACHEFS_XATTR_H */ -- cgit From 88c07f739786d00c7526d598956955c8310d72d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Jul 2018 21:06:51 -0400 Subject: bcachefs: Only check inode i_nlink during full fsck Now that all filesystem operations that manipulate the filesystem hierarchy and i_nlink are fully atomic, we can add a feature bit to indicate i_nlink doesn't need to be checked.
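In outline (sketch distilled from the hunks below, not new code): recovery advertises the feature once an fsck pass finishes with nothing left unfixed, and later fsck runs use the bit to skip the expensive nlink scan:

	/* recovery (under sb_lock): claim atomic nlink if fsck left no unfixed errors */
	if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags))
		c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;

	/* fsck entry point: only old filesystems still need the nlink pass */
	if (!c->sb.clean &&
	    !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK)))
		return bch2_fsck_inode_nlink(c);

	return bch2_fsck_walk_inodes_only(c);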
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/error.c | 5 +++-- fs/bcachefs/fsck.c | 30 +++++++++++++++++++++++++++++- fs/bcachefs/recovery.c | 7 +++++++ 4 files changed, 40 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b5e119d09a83..57132c79c4b9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -472,6 +472,7 @@ enum { /* misc: */ BCH_FS_BDEV_MOUNTED, BCH_FS_FSCK_FIXED_ERRORS, + BCH_FS_FSCK_UNFIXED_ERRORS, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index e975fab43d49..08e79166dae4 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -132,8 +132,9 @@ print: mutex_unlock(&c->fsck_error_lock); - if (fix) - set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags); + set_bit(fix + ? BCH_FS_FSCK_FIXED_ERRORS + : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags); return fix ? FSCK_ERR_FIX : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eb01284a841f..c352fa01fb5a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -954,6 +954,23 @@ static int check_inode_nlink(struct bch_fs *c, return 0; } + if (!link->count && + !(u->bi_flags & BCH_INODE_UNLINKED) && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", + u->bi_inum, mode_to_type(u->bi_mode)) == + FSCK_ERR_IGNORE) + return 0; + + ret = reattach_inode(c, lostfound_inode, u->bi_inum); + if (ret) + return ret; + + link->count = 1; + real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; + goto set_i_nlink; + } + if (i_nlink < link->count) { if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", u->bi_inum, i_nlink, link->count, @@ -973,6 +990,16 @@ static int check_inode_nlink(struct bch_fs *c, goto set_i_nlink; } + if (i_nlink != real_i_nlink && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + if (real_i_nlink && i_nlink != real_i_nlink) bch_verbose(c, "setting inode %llu nlink from %u to %u", u->bi_inum, i_nlink, real_i_nlink); @@ -1299,7 +1326,8 @@ int bch2_fsck(struct bch_fs *c) if (!c->opts.nofsck) return bch2_fsck_full(c); - if (!c->sb.clean) + if (!c->sb.clean && + !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) return bch2_fsck_inode_nlink(c); return bch2_fsck_walk_inodes_only(c); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2596c3c26064..624d97dc4537 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -256,6 +256,12 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + mutex_unlock(&c->sb_lock); + } + if (enabled_qtypes(c)) { bch_verbose(c, "reading quotas:"); ret = bch2_fs_quota_read(c); @@ -366,6 +372,7 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From d69f41d6bbd7849253cd823525543bd3a1a307f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Jul 2018 
19:19:41 -0400 Subject: bcachefs: Convert raw uses of bch2_btree_iter_link() to new transactions Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 84 +++++++++++++++++-------------- fs/bcachefs/fsck.c | 139 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 130 insertions(+), 93 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 56d21175058c..b53fbdc15c87 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -390,7 +390,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); struct keylist *keys = &op->op.insert_keys; - struct btree_iter extent_iter, inode_iter; + struct btree_trans trans; + struct btree_iter *extent_iter, *inode_iter = NULL; struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); s64 orig_sectors_added = op->sectors_added; @@ -398,12 +399,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != op->inode->v.i_ino); - bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, - POS(extent_iter.pos.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, wop->c); + + extent_iter = bch2_trans_get_iter(&trans, + BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + BUG_ON(IS_ERR(extent_iter)); hook.op = op; hook.hook.fn = bchfs_extent_update_hook; @@ -417,23 +419,28 @@ static int bchfs_write_index_update(struct bch_write_op *wop) hook.need_inode_update = true; /* optimization for fewer transaction restarts: */ - ret = bch2_btree_iter_traverse(&extent_iter); + ret = bch2_btree_iter_traverse(extent_iter); if (ret) goto err; if (hook.need_inode_update) { struct bkey_s_c inode; - if (!btree_iter_linked(&inode_iter)) - bch2_btree_iter_link(&extent_iter, &inode_iter); + if (!inode_iter) { + inode_iter = bch2_trans_get_iter(&trans, + BTREE_ID_INODES, + POS(extent_iter->pos.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR(inode_iter)); + } - inode = bch2_btree_iter_peek_slot(&inode_iter); + inode = bch2_btree_iter_peek_slot(inode_iter); if ((ret = btree_iter_err(inode))) goto err; if (WARN_ONCE(inode.k->type != BCH_INODE_FS, "inode %llu not found when updating", - extent_iter.pos.inode)) { + extent_iter->pos.inode)) { ret = -ENOENT; break; } @@ -441,7 +448,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) if (WARN_ONCE(bkey_bytes(inode.k) > sizeof(hook.inode_p), "inode %llu too big (%zu bytes, buf %zu)", - extent_iter.pos.inode, + extent_iter->pos.inode, bkey_bytes(inode.k), sizeof(hook.inode_p))) { ret = -ENOENT; @@ -453,7 +460,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) &hook.inode_u); if (WARN_ONCE(ret, "error %i unpacking inode %llu", - ret, extent_iter.pos.inode)) { + ret, extent_iter->pos.inode)) { ret = -ENOENT; break; } @@ -463,8 +470,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, + BTREE_INSERT_ENTRY(extent_iter, k), + BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter, &hook.inode_p.inode.k_i, 2)); } else { ret = bch2_btree_insert_at(wop->c, &wop->res, @@ -472,10 +479,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL| 
BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k)); + BTREE_INSERT_ENTRY(extent_iter, k)); } - BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k))); if (WARN_ONCE(!ret != !k->k.size, "ret %i k->size %u", ret, k->k.size)) @@ -486,12 +493,11 @@ err: if (ret) break; - BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); + BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0); bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); - bch2_btree_iter_unlock(&extent_iter); - bch2_btree_iter_unlock(&inode_iter); + bch2_trans_exit(&trans); if (op->is_dio) { struct dio_write *dio = container_of(op, struct dio_write, iop); @@ -2363,8 +2369,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct btree_iter src; - struct btree_iter dst; + struct btree_trans trans; + struct btree_iter *src, *dst; BKEY_PADDED(k) copy; struct bkey_s_c k; struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); @@ -2374,13 +2380,17 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c); + + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR(dst)); + /* position will be set from dst iter's position: */ - bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); - bch2_btree_iter_link(&src, &dst); + BUG_ON(IS_ERR(src)); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2409,24 +2419,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - while (bkey_cmp(dst.pos, + while (bkey_cmp(dst->pos, POS(inode->v.i_ino, round_up(new_size, PAGE_SIZE) >> 9)) < 0) { struct disk_reservation disk_res; - bch2_btree_iter_set_pos(&src, - POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + bch2_btree_iter_set_pos(src, + POS(dst->pos.inode, dst->pos.offset + (len >> 9))); - k = bch2_btree_iter_peek_slot(&src); + k = bch2_btree_iter_peek_slot(src); if ((ret = btree_iter_err(k))) goto btree_iter_err; bkey_reassemble(©.k, k); - bch2_cut_front(src.pos, ©.k); + bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; - BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), @@ -2437,14 +2447,13 @@ static long bch2_fcollapse(struct bch_inode_info *inode, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&dst, ©.k)); + BTREE_INSERT_ENTRY(dst, ©.k)); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; if (ret) { - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); + bch2_trans_exit(&trans); goto err_put_sectors_dirty; } /* @@ -2452,11 +2461,10 @@ btree_iter_err: * pointers... which isn't a _super_ serious problem... 
*/ - bch2_btree_iter_cond_resched(&src); + bch2_btree_iter_cond_resched(src); } - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); + bch2_trans_exit(&trans); ret = bch2_inode_truncate(c, inode->v.i_ino, round_up(new_size, block_bytes(c)) >> 9, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c352fa01fb5a..50e310fea4cf 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -127,16 +127,22 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) struct hash_check { struct bch_hash_info info; - struct btree_iter chain; - struct btree_iter iter; + struct btree_trans *trans; + + /* start of current chain of hash collisions: */ + struct btree_iter *chain; + + /* next offset in current chain of hash collisions: */ u64 next; }; static void hash_check_init(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c) + struct btree_trans *trans, + struct hash_check *h) { - bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0); - bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0); + h->trans = trans; + h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0); + h->next = -1; } static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, @@ -207,6 +213,42 @@ err: return ret; } +static int hash_check_duplicates(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + struct btree_iter *iter; + struct bkey_s_c k2; + char buf[200]; + int ret = 0; + + if (!bkey_cmp(h->chain->pos, k_iter->pos)) + return 0; + + iter = bch2_trans_copy_iter(h->trans, h->chain); + BUG_ON(IS_ERR(iter)); + + for_each_btree_key_continue(iter, 0, k2) { + if (bkey_cmp(k2.k->p, k.k->p) >= 0) + break; + + if (fsck_err_on(k2.k->type == desc.key_type && + !desc.cmp_bkey(k, k2), c, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), + buf, sizeof(buf), k), buf))) { + ret = fsck_hash_delete_at(desc, &h->info, k_iter); + if (ret) + return ret; + ret = 1; + break; + } + } +fsck_err: + bch2_trans_iter_free(h->trans, iter); + return ret; +} + static int hash_check_key(const struct bch_hash_desc desc, struct hash_check *h, struct bch_fs *c, struct btree_iter *k_iter, struct bkey_s_c k) @@ -219,13 +261,8 @@ static int hash_check_key(const struct bch_hash_desc desc, k.k->type != desc.key_type) return 0; - if (k.k->p.offset != h->next) { - if (!btree_iter_linked(&h->chain)) { - bch2_btree_iter_link(k_iter, &h->chain); - bch2_btree_iter_link(k_iter, &h->iter); - } - bch2_btree_iter_copy(&h->chain, k_iter); - } + if (k.k->p.offset != h->next) + bch2_btree_iter_copy(h->chain, k_iter); h->next = k.k->p.offset + 1; if (k.k->type != desc.key_type) @@ -233,11 +270,11 @@ static int hash_check_key(const struct bch_hash_desc desc, hashed = desc.hash_bkey(&h->info, k); - if (fsck_err_on(hashed < h->chain.pos.offset || + if (fsck_err_on(hashed < h->chain->pos.offset || hashed > k.k->p.offset, c, "hash table key at wrong offset: %llu, " "hashed to %llu chain starts at %llu\n%s", - k.k->p.offset, hashed, h->chain.pos.offset, + k.k->p.offset, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), buf, sizeof(buf), k), buf))) { ret = hash_redo_key(desc, h, c, k_iter, k, hashed); @@ -248,25 +285,7 @@ static int hash_check_key(const struct bch_hash_desc desc, return 1; } - if (!bkey_cmp(h->chain.pos, k_iter->pos)) - return 0; - - bch2_btree_iter_copy(&h->iter, &h->chain); - while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) { - 
struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter); - - if (fsck_err_on(k2.k->type == desc.key_type && - !desc.cmp_bkey(k, k2), c, - "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { - ret = fsck_hash_delete_at(desc, &h->info, &h->iter); - if (ret) - return ret; - return 1; - } - bch2_btree_iter_next(&h->iter); - } + ret = hash_check_duplicates(desc, h, c, k_iter, k); fsck_err: return ret; } @@ -368,7 +387,8 @@ static int check_dirents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned name_len; char buf[200]; @@ -376,10 +396,16 @@ static int check_dirents(struct bch_fs *c) bch_verbose(c, "checking dirents"); - hash_check_init(bch2_dirent_hash_desc, &h, c); + bch2_trans_init(&trans, c); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { + BUG_ON(bch2_trans_preload_iters(&trans)); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); + + hash_check_init(bch2_dirent_hash_desc, &trans, &h); + + for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -398,7 +424,7 @@ static int check_dirents(struct bch_fs *c) mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(iter, 0); if (ret) goto err; continue; @@ -407,7 +433,7 @@ static int check_dirents(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k); + ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k); if (ret > 0) { ret = 0; continue; @@ -431,7 +457,7 @@ static int check_dirents(struct bch_fs *c) fsck_err_on(name_len == 2 && !memcmp(d.v->d_name, "..", 2), c, ".. 
dirent")) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -441,7 +467,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to own directory:\n%s", (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -458,7 +484,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -484,7 +510,7 @@ static int check_dirents(struct bch_fs *c) ret = bch2_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &n->k_i)); + BTREE_INSERT_ENTRY(iter, &n->k_i)); kfree(n); if (ret) goto err; @@ -493,9 +519,7 @@ static int check_dirents(struct bch_fs *c) } err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* @@ -506,16 +530,23 @@ static int check_xattrs(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking xattrs"); - hash_check_init(bch2_xattr_hash_desc, &h, c); + bch2_trans_init(&trans, c); - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { + BUG_ON(bch2_trans_preload_iters(&trans)); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + POS(BCACHEFS_ROOT_INO, 0), 0); + + hash_check_init(bch2_xattr_hash_desc, &trans, &h); + + for_each_btree_key_continue(iter, 0, k) { ret = walk_inode(c, &w, k.k->p.inode); if (ret) break; @@ -523,7 +554,7 @@ static int check_xattrs(struct bch_fs *c) if (fsck_err_on(!w.have_inode, c, "xattr for missing inode %llu", k.k->p.inode)) { - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(iter, 0); if (ret) goto err; continue; @@ -532,15 +563,13 @@ static int check_xattrs(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k); + ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k); if (ret) goto fsck_err; } err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* Get root directory, create if it doesn't exist: */ -- cgit From 1c7a0adf3112090c42ef93ac84aad97bf4d414d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Jul 2018 23:30:45 -0400 Subject: bcachefs: trace transaction restarts exceptionally crappy "tracing", but it's a start at documenting the places restarts can be triggered Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 + fs/bcachefs/btree_iter.c | 21 +++++++++++++++------ fs/bcachefs/btree_iter.h | 23 ++++++++++++++++++++++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 17 +++++++++++++++-- 5 files changed, 54 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f9afae6c710d..3cb3da363d11 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -732,6 +732,7 @@ retry: if (bch2_btree_node_relock(iter, level + 1)) goto 
retry; + trans_restart(); return ERR_PTR(-EINTR); } } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2b4ba41149cf..7bead41b226f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -263,6 +263,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (ret) __btree_node_lock_type(c, b, type); + else + trans_restart(); + return ret; } @@ -1646,7 +1649,12 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) btree_trans_verify(trans); - return trans->iters_live ? -EINTR : 0; + if (trans->iters_live) { + trans_restart(); + return -EINTR; + } + + return 0; } int bch2_trans_preload_iters(struct btree_trans *trans) @@ -1759,8 +1767,10 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, trans->mem = new_mem; trans->mem_bytes = new_bytes; - if (old_bytes) + if (old_bytes) { + trans_restart(); return ERR_PTR(-EINTR); + } } ret = trans->mem + trans->mem_top; @@ -1787,7 +1797,7 @@ int bch2_trans_unlock(struct btree_trans *trans) return ret; } -void bch2_trans_begin(struct btree_trans *trans) +void __bch2_trans_begin(struct btree_trans *trans) { unsigned idx; @@ -1801,10 +1811,8 @@ void bch2_trans_begin(struct btree_trans *trans) * further (allocated an iter with a higher idx) than where the iter * was originally allocated: */ - if (!trans->iters_live) - return; - while (trans->iters_linked && + trans->iters_live && (idx = __fls(trans->iters_linked)) > __fls(trans->iters_live)) { trans->iters_linked ^= 1 << idx; @@ -1821,6 +1829,7 @@ void bch2_trans_begin(struct btree_trans *trans) void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) { trans->c = c; + trans->nr_restarts = 0; trans->nr_iters = 0; trans->iters_live = 0; trans->iters_linked = 0; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e686a7ad5b3d..315cba28f6b2 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -305,10 +305,31 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) return __bch2_trans_copy_iter(trans, src, __btree_iter_id()); } +void __bch2_trans_begin(struct btree_trans *); + void *bch2_trans_kmalloc(struct btree_trans *, size_t); int bch2_trans_unlock(struct btree_trans *); -void bch2_trans_begin(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *); int bch2_trans_exit(struct btree_trans *); +#ifdef TRACE_TRANSACTION_RESTARTS +#define bch2_trans_begin(_trans) \ +do { \ + if (is_power_of_2((_trans)->nr_restarts) && \ + (_trans)->nr_restarts >= 8) \ + pr_info("nr restarts: %zu", (_trans)->nr_restarts); \ + \ + (_trans)->nr_restarts++; \ + __bch2_trans_begin(_trans); \ +} while (0) +#else +#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans) +#endif + +#ifdef TRACE_TRANSACTION_RESTARTS_ALL +#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__) +#else +#define trans_restart(...) 
no_printk("transaction restart" __VA_ARGS__) +#endif + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b922a8c104d4..438ef0c07623 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -269,6 +269,7 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; + size_t nr_restarts; u8 nr_iters; u8 iters_live; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4d1d0954efbf..6c48518e8c0b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -333,6 +333,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, if (race_fault()) { ret = -EINTR; + trans_restart(" (race)"); goto out; } @@ -456,7 +457,12 @@ retry: cycle_gc_lock = false; trans_for_each_entry(trans, i) { + unsigned old_locks_want = i->iter->locks_want; + unsigned old_uptodate = i->iter->uptodate; + if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { + trans_restart(" (failed upgrade, locks_want %u uptodate %u)", + old_locks_want, old_uptodate); ret = -EINTR; goto err; } @@ -529,8 +535,10 @@ err: * don't care if we got ENOSPC because we told split it * couldn't block: */ - if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) + if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) { + trans_restart(" (split)"); ret = -EINTR; + } } if (cycle_gc_lock) { @@ -545,13 +553,16 @@ err: } if (ret == -EINTR) { - if (flags & BTREE_INSERT_NOUNLOCK) + if (flags & BTREE_INSERT_NOUNLOCK) { + trans_restart(" (can't unlock)"); goto out; + } trans_for_each_entry(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; + trans_restart(" (traverse)"); goto out; } @@ -564,6 +575,8 @@ err: */ if (!(flags & BTREE_INSERT_ATOMIC)) goto retry; + + trans_restart(" (atomic)"); } goto out; -- cgit From 97446a242a56f9350be7b8985ed933fe9118f41e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jul 2018 22:08:17 -0400 Subject: bcachefs: Fix device add Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++++++++ fs/bcachefs/journal.c | 11 ++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 155e69056d96..00c28a0a4d9d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -327,9 +327,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, unsigned i; u64 b; + /* + * This conditional is kind of gross, but we may be called from the + * device add path, before the new device has actually been added to the + * running filesystem: + */ if (c) { lockdep_assert_held(&c->sb_lock); percpu_down_read(&c->usage_lock); + } else { + preempt_disable(); } for (i = 0; i < layout->nr_superblocks; i++) { @@ -357,6 +364,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->usage_lock); + } else { + preempt_enable(); } } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 697f601c2cdf..fe2926095770 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -727,6 +727,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!journal_buckets) goto err; + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: + */ if (c) spin_lock(&c->journal.lock); @@ -743,10 +747,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long bucket; if (new_fs) { - 
percpu_down_read(&c->usage_lock); bucket = bch2_bucket_alloc_new_fs(ca); - percpu_up_read(&c->usage_lock); - if (bucket < 0) { ret = -ENOSPC; goto err; @@ -765,6 +766,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { percpu_down_read(&c->usage_lock); spin_lock(&c->journal.lock); + } else { + preempt_disable(); } __array_insert_item(ja->buckets, ja->nr, ja->last_idx); @@ -792,6 +795,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->usage_lock); + } else { + preempt_enable(); } if (!new_fs) -- cgit From 4077991c8536595b50b52bab739ef1e9ac6a72cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 12:19:14 -0400 Subject: bcachefs: Fix a use after free in the journal code Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/btree_update_leaf.c | 3 +- fs/bcachefs/fifo.h | 12 ++-- fs/bcachefs/journal.c | 21 +++++- fs/bcachefs/journal_reclaim.c | 135 ++++++++++++++++++++---------------- fs/bcachefs/journal_reclaim.h | 7 +- fs/bcachefs/journal_types.h | 6 +- 7 files changed, 112 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1710efd7c687..cc1f8b9a9e09 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -579,6 +579,8 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + bch2_journal_pin_flush(&c->journal, &as->journal); + BUG_ON(as->nr_new_nodes); BUG_ON(as->nr_pending); @@ -2151,7 +2153,7 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) as->mode, as->nodes_written, atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, - bch2_journal_pin_seq(&c->journal, &as->journal)); + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); return out - buf; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6c48518e8c0b..5cd20b572759 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -111,8 +111,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && - w->journal.pin_list == journal_seq_pin(j, seq))); + (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->lock); } diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index bd1534ecadb6..00d245efe72a 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -109,17 +109,17 @@ do { \ #define fifo_peek(fifo) fifo_peek_front(fifo) #define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index fe2926095770..3878ceb37dcf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -138,8 
+138,26 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * + * A threads may replace an old pin with a new pin on their current + * journal reservation - the expectation being that the journal will + * contain either what the old pin protected or what the new pin + * protects. + * + * After the old pin is dropped journal_last_seq() won't include the old + * pin, so we can only write the updated last_seq on the entry that + * contains whatever the new pin protects. + * + * Restated, we can _not_ update last_seq for a given entry if there + * could be a newer entry open with reservations/pins that have been + * taken against it. + * + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ bch2_journal_reclaim_fast(j); - /* XXX: why set this here, and not in bch2_journal_write()? */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); if (journal_entry_empty(buf->data)) @@ -1022,6 +1040,7 @@ int bch2_fs_journal_init(struct journal *j) init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e5b8666fa052..e1d5d41ba118 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -11,34 +11,18 @@ * entry, holding it open to ensure it gets replayed during recovery: */ -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) -{ - return fifo_entry_idx_abs(&j->pin, pin_list); -} - -u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) -{ - u64 ret = 0; - - spin_lock(&j->lock); - if (journal_pin_active(pin)) - ret = journal_pin_seq(j, pin->pin_list); - spin_unlock(&j->lock); - - return ret; -} - static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, + u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + BUG_ON(journal_pin_active(pin)); BUG_ON(!atomic_read(&pin_list->count)); atomic_inc(&pin_list->count); - pin->pin_list = pin_list; + pin->seq = seq; pin->flush = flush_fn; if (flush_fn) @@ -58,19 +42,20 @@ void bch2_journal_pin_add(struct journal *j, u64 seq, journal_pin_flush_fn flush_fn) { spin_lock(&j->lock); - __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + __journal_pin_add(j, seq, pin, flush_fn); spin_unlock(&j->lock); } static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - struct journal_entry_pin_list *pin_list = pin->pin_list; + struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) return; - pin->pin_list = NULL; + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; list_del_init(&pin->list); /* @@ -83,7 +68,7 @@ static inline void __journal_pin_drop(struct journal *j, } void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) + struct journal_entry_pin *pin) { spin_lock(&j->lock); __journal_pin_drop(j, pin); @@ -99,15 +84,21 @@ void bch2_journal_pin_add_if_older(struct journal *j, if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || - journal_pin_seq(j, src_pin->pin_list) < - journal_pin_seq(j, 
pin->pin_list))) { + src_pin->seq < pin->seq)) { __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + __journal_pin_add(j, src_pin->seq, pin, flush_fn); } spin_unlock(&j->lock); } +void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) +{ + BUG_ON(journal_pin_active(pin)); + + wait_event(j->pin_flush_wait, j->flush_in_progress != pin); +} + /* * Journal reclaim: flush references to open journal entries to reclaim space in * the journal @@ -145,41 +136,42 @@ void bch2_journal_reclaim_fast(struct journal *j) journal_wake(j); } -static struct journal_entry_pin * -__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +static void journal_pin_mark_flushing(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) { - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret; - u64 iter; - - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + lockdep_assert_held(&j->reclaim_lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (iter > seq_to_flush) - break; + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; +} - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch2_journal_pin_drop() */ - list_move(&ret->list, &pin_list->flushed); - *seq = iter; - return ret; - } - } +static void journal_pin_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) +{ + pin->flush(j, pin, seq); - return NULL; + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); } static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { - struct journal_entry_pin *ret; + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; - spin_lock(&j->lock); - ret = __journal_get_next_pin(j, seq_to_flush, seq); - spin_unlock(&j->lock); + /* no need to iterate over empty fifo entries: */ + bch2_journal_reclaim_fast(j); + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) + if (*seq > seq_to_flush || + (ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list))) + break; return ret; } @@ -279,15 +271,11 @@ void bch2_journal_reclaim_work(struct work_struct *work) spin_unlock(&j->lock); } - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - /* Also flush if the pin fifo is more than half full */ spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); - spin_unlock(&j->lock); /* * If it's been longer than j->reclaim_delay_ms since we last flushed, @@ -299,13 +287,31 @@ void bch2_journal_reclaim_work(struct work_struct *work) while ((pin = journal_get_next_pin(j, need_flush ? 
U64_MAX : seq_to_flush, &seq))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin, seq); - need_flush = false; + if (!reclaim_lock_held) { + spin_unlock(&j->lock); + __set_current_state(TASK_RUNNING); + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + spin_lock(&j->lock); + continue; + } + journal_pin_mark_flushing(j, pin, seq); + spin_unlock(&j->lock); + + journal_pin_flush(j, pin, seq); + + need_flush = false; j->last_flushed = jiffies; + + spin_lock(&j->lock); } + spin_unlock(&j->lock); + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(system_freezable_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); @@ -328,11 +334,14 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || + ret = (*pin = journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || (fifo_used(&j->pin) == 1 && atomic_read(&fifo_peek_front(&j->pin).count) == 1); + if (*pin) + journal_pin_mark_flushing(j, *pin, *pin_seq); + spin_unlock(&j->lock); return ret; @@ -346,14 +355,18 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_STARTED, &j->flags)) return; + mutex_lock(&j->reclaim_lock); + while (1) { wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); if (!pin) break; - pin->flush(j, pin, pin_seq); + journal_pin_flush(j, pin, pin_seq); } + + mutex_unlock(&j->reclaim_lock); } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index a93ed43cfc78..f5af4252c88a 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -6,19 +6,17 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin) { - return pin->pin_list != NULL; + return pin->seq != 0; } static inline struct journal_entry_pin_list * journal_seq_pin(struct journal *j, u64 seq) { - BUG_ON(seq < j->pin.front || seq >= j->pin.back); + EBUG_ON(seq < j->pin.front || seq >= j->pin.back); return &j->pin.data[seq & j->pin.mask]; } -u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); - void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); @@ -26,6 +24,7 @@ void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_reclaim_fast(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index cf291227cffb..dae8b8a65d75 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -48,7 +48,7 @@ typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin { struct list_head list; journal_pin_flush_fn flush; - struct journal_entry_pin_list *pin_list; + u64 seq; }; /* corresponds to a btree node with a blacklisted bset: */ @@ -174,6 +174,10 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; + + struct journal_entry_pin *flush_in_progress; + 
wait_queue_head_t pin_flush_wait; + u64 replay_journal_seq; struct mutex blacklist_lock; -- cgit From af1c6871814eb3088bcd3c2afd2fc4b7d4e4df97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Jul 2018 03:56:57 -0400 Subject: bcachefs: add bch_verbose() statements for shutdown Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3191d4cc8140..fe95b8b026e8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -404,6 +404,8 @@ void bch2_fs_stop(struct bch_fs *c) struct bch_dev *ca; unsigned i; + bch_verbose(c, "shutting down"); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -444,6 +446,8 @@ void bch2_fs_stop(struct bch_fs *c) if (c->devs[i]) bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + bch_verbose(c, "shutdown complete"); + kobject_put(&c->kobj); } -- cgit From 4e1ec2cc0d82f1d4344e7b5a53229c9ccde8437d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 14:03:47 -0400 Subject: bcachefs: Simplify bch2_write_inode_trans, fix lockdep splat ei_update_lock isn't currently needed for write inode (but it will be needed again when deferred btree updates are used for inode updates) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 ++++-- fs/bcachefs/fs.c | 33 +++++++++------------------------ 2 files changed, 13 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b53fbdc15c87..29d289b0dfa5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -355,8 +355,6 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, h->inode_u.bi_size = offset; do_pack = true; - inode->ei_inode.bi_size = offset; - spin_lock(&inode->v.i_lock); if (offset > inode->v.i_size) { if (h->op->is_dio) @@ -478,6 +476,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) &hook.hook, op_journal_seq(wop), BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| BTREE_INSERT_USE_RESERVE, BTREE_INSERT_ENTRY(extent_iter, k)); } @@ -493,6 +492,9 @@ err: if (ret) break; + if (hook.need_inode_update) + op->inode->ei_inode = hook.inode_u; + BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0); bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3f3d916e0d37..53107d02cbb6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,35 +157,20 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_inode_buf *inode_p; - struct bkey_s_c k; - u64 inum = inode->v.i_ino; int ret; - lockdep_assert_held(&inode->ei_update_lock); - - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); - if ((ret = btree_iter_err(k))) + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) return ret; - if (WARN_ONCE(k.k->type != BCH_INODE_FS, - "inode %llu not found when updating", inum)) - return -ENOENT; - - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", ret, inum)) - return -ENOENT; - - BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size); - - BUG_ON(inode_u->bi_size != 
inode->ei_inode.bi_size && - !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) && - inode_u->bi_size > i_size_read(&inode->v)); + *inode_u = inode->ei_inode; if (set) { ret = set(inode, inode_u, p); @@ -505,8 +490,6 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - lockdep_assert_held(&inode->v.i_rwsem); - bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -543,6 +526,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); int ret; + lockdep_assert_held(&inode->v.i_rwsem); + ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return ret; -- cgit From 2ea9004864b918be34e742e38fb08d868600d020 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 14:12:42 -0400 Subject: bcachefs: Fix mtime/ctime updates Also make inode flags consistent with how the rest of the inode is updated Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 +- fs/bcachefs/fs-io.c | 45 ++++++++++++++++-------- fs/bcachefs/fs-ioctl.c | 92 +++++--------------------------------------------- fs/bcachefs/fs-ioctl.h | 73 ++++++++++++++++++++++++++++++++++++++- fs/bcachefs/fs.c | 32 +++++++----------- fs/bcachefs/fs.h | 7 ++-- fs/bcachefs/xattr.c | 2 +- 7 files changed, 130 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index eaf5c8e138fb..7ee2022d9501 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -286,10 +286,9 @@ static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); umode_t mode = (unsigned long) p; - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); bi->bi_mode = mode; return 0; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 29d289b0dfa5..33c379ecf5a1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -177,23 +177,40 @@ static int bch2_quota_reservation_add(struct bch_fs *c, /* i_size updates: */ +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + static int inode_set_size(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - loff_t *new_i_size = p; + struct inode_new_size *s = p; - lockdep_assert_held(&inode->ei_update_lock); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - bi->bi_size = *new_i_size; return 0; } static int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, - loff_t new_size) + loff_t new_size, unsigned fields) { - return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, @@ -241,6 +258,7 @@ static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct i_sectors_hook *h = p; if (h->new_i_size != U64_MAX && @@ -249,6 +267,7 @@ static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, bi->bi_size = h->new_i_size; bi->bi_sectors += h->sectors; bi->bi_flags &= ~h->flags; + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 
return 0; } @@ -259,7 +278,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) mutex_lock(&h->inode->ei_update_lock); i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); + ret = bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); if (!ret && h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); @@ -289,7 +308,7 @@ static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) int ret; mutex_lock(&h->inode->ei_update_lock); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); + ret = bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); mutex_unlock(&h->inode->ei_update_lock); return ret; @@ -2223,9 +2242,8 @@ static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) setattr_copy(NULL, &inode->v, iattr); mutex_lock(&inode->ei_update_lock); - inode_set_ctime_current(&inode->v); - inode->v.i_mtime = inode_get_ctime(&inode->v); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); return ret; @@ -2284,8 +2302,6 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) /* ATTR_MODE will never be set here, ns argument isn't needed: */ setattr_copy(NULL, &inode->v, iattr); - inode_set_ctime_current(&inode->v); - inode->v.i_mtime = inode_get_ctime(&inode->v); out: ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err_put_pagecache: @@ -2617,7 +2633,7 @@ btree_iter_err: i_size_write(&inode->v, end); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } @@ -2633,7 +2649,8 @@ btree_iter_err: if (inode->ei_inode.bi_size != inode->v.i_size) { mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, + inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 895ccc79e782..a89786f295cf 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -12,79 +12,6 @@ #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - 
unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -/* Set VFS inode flags from bcachefs inode: */ -void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - struct flags_set { unsigned mask; unsigned flags; @@ -96,6 +23,7 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; /* * We're relying on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not inode->i_flags: @@ -108,14 +36,15 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - if (!S_ISREG(inode->v.i_mode) && - !S_ISDIR(inode->v.i_mode) && + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; - inode_set_ctime_current(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); return 0; } @@ -153,10 +82,8 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0); - - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); setflags_out: @@ -242,9 +169,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (ret) goto err_unlock; - ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0); - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); err_unlock: mutex_unlock(&inode->ei_update_lock); err: diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index 2d117ef80ab2..f201980ef2c3 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -2,7 +2,78 @@ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -void bch2_inode_flags_to_vfs(struct bch_inode_info *); +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= 
~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 53107d02cbb6..2e2a5acae0eb 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -147,6 +147,8 @@ void bch2_inode_update_after_write(struct bch_fs *c, inode->ei_inode = *bi; inode->ei_qid = bch_qid(bi); + + bch2_inode_flags_to_vfs(inode); } int __must_check bch2_write_inode_trans(struct btree_trans *trans, @@ -187,10 +189,10 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return 0; } -int __must_check __bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) { struct btree_trans trans; struct bch_inode_unpacked inode_u; @@ -271,9 +273,8 @@ static int inode_update_for_create_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_unpacked *new_inode = p; - struct timespec64 now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); if (S_ISDIR(new_inode->bi_mode)) bi->bi_nlink++; @@ -469,9 +470,8 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_flags & BCH_INODE_UNLINKED) bi->bi_flags &= ~BCH_INODE_UNLINKED; @@ -543,9 +543,8 @@ static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_info *unlink_inode = p; - struct timespec64 now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); @@ -557,9 +556,8 @@ static int inode_update_for_unlink_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_nlink) bi->bi_nlink--; else @@ -740,8 +738,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, { struct bch_fs *c = src_vdir->i_sb->s_fs_info; struct rename_info i = { - .now = timespec_to_bch2_time(c, - current_time(src_vdir)), .src_dir = to_bch_ei(src_vdir), .dst_dir = to_bch_ei(dst_vdir), .src_inode = to_bch_ei(src_dentry->d_inode), @@ -778,7 +774,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); - i.now = timespec_to_bch2_time(c, 
current_time(src_vdir)), + i.now = bch2_current_time(c); ret = bch2_dirent_rename(&trans, i.src_dir, &src_dentry->d_name, @@ -1271,8 +1267,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); - bch2_inode_flags_to_vfs(inode); - inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1346,8 +1340,8 @@ static int bch2_vfs_write_inode(struct inode *vinode, int ret; mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); if (c->opts.journal_flush_disabled) diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e8dd566285fc..4fdc11762cd7 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_FS_H #define _BCACHEFS_FS_H +#include "inode.h" #include "opts.h" #include "str_hash.h" #include "quota_types.h" @@ -81,10 +82,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, inode_set_fn, void *); -int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); -int __must_check bch2_write_inode(struct bch_fs *, - struct bch_inode_info *); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f0440d12a031..cb84bdabb6ed 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -436,7 +436,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); mutex_unlock(&inode->ei_update_lock); if (value && -- cgit From fc88796d1ce84181bbf4fb3618305a417454b806 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 15:28:11 -0400 Subject: bcachefs: bch2_trans_update() now takes struct btree_insert_entry Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/btree_update.h | 11 +++++++++-- fs/bcachefs/btree_update_leaf.c | 20 -------------------- fs/bcachefs/dirent.c | 8 +++++--- fs/bcachefs/fs.c | 2 +- fs/bcachefs/inode.c | 3 ++- fs/bcachefs/str_hash.h | 6 +++--- 7 files changed, 21 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 7ee2022d9501..c81e5365ec84 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -375,7 +375,7 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } - bch2_trans_update(trans, iter, &new->k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 451d486fb032..31b72895f6eb 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -140,8 +140,15 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, /* new transactional interface: */ -void bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, unsigned); +static inline void +bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); + + 
trans->updates[trans->nr_updates++] = entry; +} + int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, struct extent_insert_hook *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5cd20b572759..7ce2e35dafa2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -581,26 +581,6 @@ err: goto out; } -void bch2_trans_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k, - unsigned extra_journal_res) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); - - i = &trans->updates[trans->nr_updates++]; - - *i = (struct btree_insert_entry) { - .iter = iter, - .k = k, - .extra_res = extra_journal_res, - }; - - btree_insert_entry_checks(trans->c, i); -} - int bch2_trans_commit(struct btree_trans *trans, struct disk_reservation *disk_res, struct extent_insert_hook *hook, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 9e5936faf1af..18078cc2ca62 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -290,7 +290,9 @@ int bch2_dirent_rename(struct btree_trans *trans, * new_dst at the src position: */ new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(src_iter, + &new_dst->k_i)); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -313,8 +315,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i, 0); - bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); return 0; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2e2a5acae0eb..f10ee147d389 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -185,7 +185,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); return 0; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2d635555bffb..f40ec37d7f0f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -346,7 +346,8 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); return 0; } } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0947fdcdc4cd..fbd6c3372677 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -255,14 +255,14 @@ not_found: return -ENOENT; insert->k.p = slot->pos; - bch2_trans_update(trans, slot, insert, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(slot, insert)); return 0; found: if (flags & BCH_HASH_SET_MUST_CREATE) return -EEXIST; insert->k.p = iter->pos; - bch2_trans_update(trans, iter, insert, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); return 0; } @@ -297,7 +297,7 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
desc.whiteout_type : KEY_TYPE_DELETED; - bch2_trans_update(trans, iter, delete, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); return 0; } -- cgit From 19ee5f2ac4b34658b417073f4edc27ade11a01ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jul 2018 22:23:42 -0400 Subject: bcachefs: Use ei_update_lock consistently This is prep work for using deferred btree updates for inode updates - the way inodes are done now we're relying on btree locking for ei_inode and ei_update_lock could probably be removed, but it'll actually be needed when we switch to deferred updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 8 +++++--- fs/bcachefs/fs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index c81e5365ec84..2856736f7224 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -304,13 +304,14 @@ int bch2_set_acl(struct mnt_idmap *idmap, umode_t mode = inode->v.i_mode; int ret; + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c); + if (type == ACL_TYPE_ACCESS && acl) { ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); if (ret) - return ret; + goto err; } - - bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -336,6 +337,7 @@ retry: set_cached_acl(&inode->v, type, acl); err: bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f10ee147d389..a4d82252bc49 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -51,6 +51,30 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } +static inline int ptrcmp(void *l, void *r) +{ + return (l > r) - (l < r); +} + +#define __bch2_lock_inodes(_lock, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + \ + for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + if (a[i] != a[i - 1]) { \ + if (_lock) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + else \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) +#define bch2_unlock_inodes(...) 
__bch2_lock_inodes(false, __VA_ARGS__) + static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { BUG_ON(atomic_long_read(&lock->v) == 0); @@ -161,6 +185,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, struct bkey_inode_buf *inode_p; int ret; + lockdep_assert_held(&inode->ei_update_lock); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode->v.i_ino, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -422,6 +448,9 @@ out: posix_acl_release(acl); return inode; err_trans: + if (!tmpfile) + mutex_unlock(&dir->ei_update_lock); + bch2_trans_exit(&trans); make_bad_inode(&inode->v); iput(&inode->v); @@ -490,6 +519,7 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; + mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -515,6 +545,7 @@ retry: bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -575,6 +606,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct btree_trans trans; int ret; + bch2_lock_inodes(dir, inode); bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -607,6 +639,7 @@ retry: ATTR_MTIME); err: bch2_trans_exit(&trans); + bch2_unlock_inodes(dir, inode); return ret; } @@ -771,6 +804,11 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } + bch2_lock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); + bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -818,6 +856,10 @@ retry: ATTR_CTIME); err: bch2_trans_exit(&trans); + bch2_unlock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); return ret; } -- cgit From d96b3ffe3884d8402ca64d1a1ae880460339ec63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Jul 2018 13:15:51 -0400 Subject: bcachefs: fix rename + fsync Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a4d82252bc49..cd29404e0b9b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -849,6 +849,10 @@ retry: journal_seq_copy(i.dst_dir, journal_seq); } + journal_seq_copy(i.src_inode, journal_seq); + if (i.dst_inode) + journal_seq_copy(i.dst_inode, journal_seq); + bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, ATTR_CTIME); if (i.dst_inode) -- cgit From 8bb4dff72d07f4f46e5627870a9614c4cee5a1bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Jul 2018 13:04:00 -0400 Subject: bcachefs: Fix an assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7ce2e35dafa2..3d0c6f5c98ad 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -490,7 +490,7 @@ out: bch2_btree_iter_verify_locks(linked); BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) && trans->did_work && - linked->uptodate >= BTREE_ITER_NEED_RELOCK); + !btree_node_locked(linked, 0)); } /* make sure we didn't lose an error: */ -- cgit From c6923995296e5f06a47aa36e684ef0eccd17adea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Jul 2018 22:57:20 -0400 Subject: bcachefs: don't call bch2_bucket_seq_cleanup from journal_buf_switch journal_buf_switch is called from the foreground when getting a journal reservation and thus is somewhat latency sensitive; bch2_bucket_seq_cleanup 
has to run infrequently but is a bit expensive when it does run. Call it from the journal write path instead, and punt the journal write to worqueue context. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/buckets.c | 7 +++++++ fs/bcachefs/buckets_types.h | 2 ++ fs/bcachefs/journal.c | 20 +++++--------------- fs/bcachefs/journal_io.c | 2 ++ 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 57132c79c4b9..0c55cc914907 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -719,7 +719,7 @@ struct bch_fs { struct journal journal; - unsigned bucket_journal_seq; + u64 last_bucket_seq_cleanup; /* The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f347c93e0c6e..4a910f773953 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -147,6 +147,7 @@ static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} */ void bch2_bucket_seq_cleanup(struct bch_fs *c) { + u64 journal_seq = atomic64_read(&c->journal.seq); u16 last_seq_ondisk = c->journal.last_seq_ondisk; struct bch_dev *ca; struct bucket_array *buckets; @@ -154,6 +155,12 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) struct bucket_mark m; unsigned i; + if (journal_seq - c->last_bucket_seq_cleanup < + (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) + return; + + c->last_bucket_seq_cleanup = journal_seq; + for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); buckets = bucket_array(ca); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 5be90139dd0d..cad35a70192d 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -4,6 +4,8 @@ #include "util.h" +#define BUCKET_JOURNAL_SEQ_BITS 16 + struct bucket_mark { union { struct { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3878ceb37dcf..a83c45b82f95 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -32,14 +32,8 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) test_bit(JOURNAL_NEED_WRITE, &j->flags)) bch2_time_stats_update(j->delay_time, j->need_write_time); -#if 0 - closure_call(&j->io, bch2_journal_write, NULL, NULL); -#else - /* Shut sparse up: */ - closure_init(&j->io, NULL); - set_closure_fn(&j->io, bch2_journal_write, NULL); - bch2_journal_write(&j->io); -#endif + + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } static void journal_pin_new_entry(struct journal *j, int count) @@ -172,13 +166,6 @@ static enum { cancel_delayed_work(&j->write_work); spin_unlock(&j->lock); - if (c->bucket_journal_seq > 1 << 14) { - c->bucket_journal_seq = 0; - bch2_bucket_seq_cleanup(c); - } - - c->bucket_journal_seq++; - /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); bch2_journal_buf_put(j, old.idx, need_write_just_set); @@ -943,6 +930,7 @@ void bch2_fs_journal_stop(struct journal *j) void bch2_fs_journal_start(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_seq_blacklist *bl; u64 blacklist = 0; @@ -964,6 +952,8 @@ void bch2_fs_journal_start(struct journal *j) journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); + c->last_bucket_seq_cleanup = journal_cur_seq(j); + spin_unlock(&j->lock); /* diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 320f4f2933c1..d479d946eea7 100644 --- a/fs/bcachefs/journal_io.c +++ 
b/fs/bcachefs/journal_io.c @@ -1384,6 +1384,8 @@ no_io: extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) ptr->offset += sectors; + bch2_bucket_seq_cleanup(c); + continue_at(cl, journal_write_done, system_highpri_wq); return; err: -- cgit From b2be7c8b731262c5342e9f068b490d61e540ad0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Jul 2018 06:10:52 -0400 Subject: bcachefs: kill bucket mark sector count saturation Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 3 --- fs/bcachefs/buckets.c | 45 ++++++++++----------------------------------- fs/bcachefs/buckets.h | 5 ----- fs/bcachefs/trace.h | 25 ------------------------- 5 files changed, 10 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0c55cc914907..7c6b1925f67b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -413,7 +413,6 @@ struct bch_dev { /* last calculated minimum prio */ u16 max_last_bucket_io[2]; - atomic_long_t saturated_count; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; u64 allocator_journal_seq_flush; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 00c28a0a4d9d..5053247a6b42 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -570,9 +570,6 @@ void bch2_gc(struct bch_fs *c) bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); - for_each_member_device(ca, c, i) - atomic_long_set(&ca->saturated_count, 0); - /* Indicates that gc is no longer in progress: */ gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4a910f773953..eec2f6cb4f5b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -454,17 +454,11 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, c->gc_pos.phase == GC_PHASE_DONE); } -#define saturated_add(ca, dst, src, max) \ +#define checked_add(a, b) \ do { \ - BUG_ON((int) (dst) + (src) < 0); \ - if ((dst) == (max)) \ - ; \ - else if ((dst) + (src) <= (max)) \ - dst += (src); \ - else { \ - dst = (max); \ - trace_sectors_saturated(ca); \ - } \ + unsigned _res = (unsigned) (a) + (b); \ + (a) = _res; \ + BUG_ON((a) != _res); \ } while (0) void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -489,9 +483,9 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g = bucket(ca, b); old = bucket_data_cmpxchg(c, ca, g, new, ({ - saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); - new.data_type = type; + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + new.dirty_sectors += sectors; })); rcu_read_unlock(); @@ -525,7 +519,6 @@ static void bch2_mark_pointer(struct bch_fs *c, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; - unsigned saturated; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr); enum bch_data_type data_type = type == S_META @@ -560,7 +553,6 @@ static void bch2_mark_pointer(struct bch_fs *c, v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; - saturated = 0; /* * Check this after reading bucket mark to guard against @@ -574,17 +566,10 @@ static void bch2_mark_pointer(struct bch_fs *c, return; } - if (!ptr->cached && - new.dirty_sectors == GC_MAX_SECTORS_USED && - sectors < 0) - saturated = -sectors; - - if (ptr->cached) - saturated_add(ca, new.cached_sectors, sectors, - GC_MAX_SECTORS_USED); + if (!ptr->cached) + checked_add(new.dirty_sectors, sectors); else - 
saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); + checked_add(new.cached_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -610,16 +595,6 @@ static void bch2_mark_pointer(struct bch_fs *c, BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); - - if (saturated && - atomic_long_add_return(saturated, - &ca->saturated_count) >= - bucket_to_sector(ca, ca->free_inc.size)) { - if (c->gc_thread) { - trace_gc_sectors_saturated(c); - wake_up_process(c->gc_thread); - } - } } void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a4ba6d787b0b..d0dc9c8b4f0b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -115,11 +115,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, /* bucket gc marks */ -/* The dirty and cached sector counts saturate. If this occurs, - * reference counting alone will not free the bucket, and a btree - * GC must be performed. */ -#define GC_MAX_SECTORS_USED ((1U << 15) - 1) - static inline unsigned bucket_sectors_used(struct bucket_mark mark) { return mark.dirty_sectors + mark.cached_sectors; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d0b99c692063..9730540f7375 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -44,21 +44,6 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); -DECLARE_EVENT_CLASS(bch_dev, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -361,16 +346,6 @@ DEFINE_EVENT(bch_fs, gc_coalesce_end, TP_ARGS(c) ); -DEFINE_EVENT(bch_dev, sectors_saturated, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(bch_fs, gc_sectors_saturated, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) -- cgit From b29e197aafd95fc5cd50f0fd85c6275e3aa319a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Jul 2018 10:43:01 -0400 Subject: bcachefs: Invalidate buckets when writing to alloc btree Prep work for persistent alloc information. Refactoring also lets us make free_inc much smaller, which means a lot fewer buckets stranded on freelists. 
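As an aside, the batching pattern this commit moves to - scan reclaimable buckets into a small heap, invalidate them while updating the alloc btree, then push them onto the free_inc fifo - can be sketched in a few lines of plain C. The names below (bucket_range, reclaim_state, etc.) are invented for the illustration only and are not the actual bcachefs structures; the real code operates on struct bch_dev / struct bch_fs and orders the heap by reclaim priority rather than taking index 0.

	/* Simplified, self-contained sketch of the alloc_heap -> free_inc flow. */
	#include <stdio.h>

	#define HEAP_SIZE	8
	#define FREE_INC_SIZE	4

	struct bucket_range { size_t bucket; size_t nr; };	/* run of reclaimable buckets */

	struct reclaim_state {
		struct bucket_range	heap[HEAP_SIZE];	/* stand-in for ca->alloc_heap */
		unsigned		heap_used;
		size_t			free_inc[FREE_INC_SIZE];/* stand-in for ca->free_inc */
		unsigned		free_inc_nr;
	};

	/* Hand out buckets one at a time from the current heap entry: */
	static long next_alloc_bucket(struct reclaim_state *s)
	{
		while (s->heap_used) {
			struct bucket_range *top = &s->heap[0];

			if (top->nr) {
				size_t b = top->bucket++;
				top->nr--;
				return (long) b;
			}
			/* range exhausted: drop it (the real code re-heapifies) */
			s->heap[0] = s->heap[--s->heap_used];
		}
		return -1;
	}

	/* Invalidate buckets until free_inc fills up or the heap runs dry: */
	static void invalidate_buckets(struct reclaim_state *s)
	{
		long b;

		while (s->free_inc_nr < FREE_INC_SIZE &&
		       (b = next_alloc_bucket(s)) >= 0) {
			/* real code marks the bucket and writes its alloc key here */
			s->free_inc[s->free_inc_nr++] = (size_t) b;
		}
	}

	int main(void)
	{
		struct reclaim_state s = {
			.heap		= { { .bucket = 100, .nr = 3 },
					    { .bucket = 200, .nr = 2 } },
			.heap_used	= 2,
		};
		unsigned i;

		invalidate_buckets(&s);

		for (i = 0; i < s.free_inc_nr; i++)
			printf("free_inc[%u] = %zu\n", i, s.free_inc[i]);
		return 0;
	}

Keeping free_inc small is what avoids stranding buckets: only a handful are invalidated ahead of actual demand (the resize below caps it at roughly nbuckets >> 12), and the heap holds the larger candidate set instead.
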
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 610 +++++++++++++++++------------------- fs/bcachefs/alloc.h | 2 + fs/bcachefs/bcachefs.h | 7 +- fs/bcachefs/btree_update_interior.c | 3 +- fs/bcachefs/buckets.c | 13 +- fs/bcachefs/buckets.h | 2 +- 6 files changed, 300 insertions(+), 337 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index e6e506e4a8a3..19523226afd8 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -288,53 +288,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct btree_iter *iter, - u64 *journal_seq, bool nowait) + u64 *journal_seq, unsigned flags) { struct bucket_mark m; __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; struct bucket *g; struct bkey_i_alloc *a; u8 *d; - int ret; - unsigned flags = BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE; - - if (nowait) - flags |= BTREE_INSERT_NOWAIT; - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + percpu_down_read(&c->usage_lock); + g = bucket(ca, b); + + m = READ_ONCE(g->mark); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = POS(ca->dev_idx, b); + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->io_time[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_up_read(&c->usage_lock); - do { - ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); - if (ret) - break; + bch2_btree_iter_cond_resched(iter); - percpu_down_read(&c->usage_lock); - g = bucket(ca, b); - - /* read mark under btree node lock: */ - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); - - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read(&c->usage_lock); - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); - bch2_btree_iter_cond_resched(iter); - } while (ret == -EINTR); + bch2_btree_iter_set_pos(iter, a->k.p); - return ret; + return bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); } int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) @@ -354,8 +342,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); bch2_btree_iter_unlock(&iter); return ret; } @@ -375,8 +362,8 @@ int bch2_alloc_write(struct bch_fs *c) down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, bucket, + &iter, NULL, 0); if (ret) break; @@ -582,47 +569,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, return gc_gen < BUCKET_GC_GEN_MAX; } -static void 
bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - struct bucket_mark m; - - percpu_down_read(&c->usage_lock); - spin_lock(&c->freelist_lock); - - if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->usage_lock); - return; - } - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->usage_lock); - - /* gc lock held: */ - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - if (m.cached_sectors) { - ca->allocator_invalidating_data = true; - } else if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - ca->allocator_journal_seq_flush = - max(ca->allocator_journal_seq_flush, bucket_seq); - } -} - /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. @@ -674,11 +620,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h, (l.bucket > r.bucket) - (l.bucket < r.bucket); } +static inline int bucket_idx_cmp(const void *_l, const void *_r) +{ + const struct alloc_heap_entry *l = _l, *r = _r; + + return (l->bucket > r->bucket) - (l->bucket < r->bucket); +} + static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; - size_t b; + size_t b, i, nr = 0; ca->alloc_heap.used = 0; @@ -720,55 +673,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) if (e.nr) heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - - while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { - for (b = e.bucket; - b < e.bucket + e.nr; - b++) { - if (fifo_full(&ca->free_inc)) - return; + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; - bch2_invalidate_one_bucket(c, ca, b); - } + while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { + nr -= ca->alloc_heap.data[0].nr; + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); } + + up_read(&ca->bucket_lock); + mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t b, checked; + size_t b, start; - for (checked = 0; - checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); - checked++) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + start = ca->fifo_last_bucket; - b = ca->fifo_last_bucket++; + do { + ca->fifo_last_bucket++; + if (ca->fifo_last_bucket == ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + b = ca->fifo_last_bucket; m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); - } + } while 
(ca->fifo_last_bucket != start); } static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t checked; + size_t checked, i; for (checked = 0; - checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); + checked < ca->mi.nbuckets / 2; checked++) { size_t b = bch2_rand_range(ca->mi.nbuckets - ca->mi.first_bucket) + @@ -776,17 +732,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); } + + sort(ca->alloc_heap.data, + ca->alloc_heap.used, + sizeof(ca->alloc_heap.data[0]), + bucket_idx_cmp, NULL); + + /* remove duplicates: */ + for (i = 0; i + 1 < ca->alloc_heap.used; i++) + if (ca->alloc_heap.data[i].bucket == + ca->alloc_heap.data[i + 1].bucket) + ca->alloc_heap.data[i].nr = 0; } -static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { + size_t i, nr = 0; + ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: @@ -799,86 +772,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) find_reclaimable_buckets_random(c, ca); break; } + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + return nr; } -static int size_t_cmp(const void *_l, const void *_r) +static inline long next_alloc_bucket(struct bch_dev *ca) { - const size_t *l = _l, *r = _r; + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + while (ca->alloc_heap.used) { + if (top->nr) { + size_t b = top->bucket; + + top->bucket++; + top->nr--; + return b; + } - return (*l > *r) - (*l < *r); + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + } + + return -1; } -static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket, u64 *flush_seq) { - BUG_ON(ca->free_inc.front); + struct bucket_mark m; + percpu_down_read(&c->usage_lock); spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); + + bch2_invalidate_bucket(c, ca, bucket, &m); + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + spin_unlock(&c->freelist_lock); + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + percpu_up_read(&c->usage_lock); + + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + *flush_seq = max(*flush_seq, bucket_seq); + } + + return m.cached_sectors != 0; } -static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, size_t nr, - bool nowait) +/* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { struct btree_iter iter; + u64 journal_seq = 0; int 
ret = 0; + long b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ - while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { - size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq, - nowait && ca->nr_invalidated); - if (ret) - break; - - ca->nr_invalidated++; + while (!ret && + !fifo_full(&ca->free_inc) && + (b = next_alloc_bucket(ca)) >= 0) { + bool must_flush = + bch2_invalidate_one_bucket(c, ca, b, &journal_seq); + + ret = __bch2_alloc_write_key(c, ca, b, &iter, + must_flush ? &journal_seq : NULL, + !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); } bch2_btree_iter_unlock(&iter); /* If we used NOWAIT, don't return the error: */ - return ca->nr_invalidated ? 0 : ret; -} - -static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - unsigned i; + if (!fifo_empty(&ca->free_inc)) + ret = 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + return ret; + } - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - --ca->nr_invalidated; - closure_wake_up(&c->freelist_wait); - spin_unlock(&c->freelist_lock); - return true; - } - spin_unlock(&c->freelist_lock); + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) { + bch_err(ca, "journal error: %i", ret); + return ret; + } - return false; + return 0; } static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { + unsigned i; int ret = 0; while (1) { set_current_state(TASK_INTERRUPTIBLE); - if (__push_invalidated_bucket(c, ca, bucket)) - break; + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + goto out; + } + spin_unlock(&c->freelist_lock); if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { @@ -889,22 +908,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t schedule(); try_to_freeze(); } - +out: __set_current_state(TASK_RUNNING); return ret; } /* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: + * Pulls buckets off free_inc, discards them (if enabled), then adds them to + * freelists, waiting until there's room if necessary: */ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) { - while (ca->nr_invalidated) { + while (!fifo_empty(&ca->free_inc)) { size_t bucket = fifo_peek(&ca->free_inc); - BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); - if (ca->mi.discard && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, @@ -930,68 +947,32 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; - u64 journal_seq; + size_t nr; int ret; set_freezable(); while (1) { - while (1) { - cond_resched(); - - pr_debug("discarding %zu invalidated buckets", - ca->nr_invalidated); - - ret = discard_invalidated_buckets(c, ca); - if (ret) - goto stop; - - if (fifo_empty(&ca->free_inc)) - break; + cond_resched(); - 
pr_debug("invalidating %zu buckets", - fifo_used(&ca->free_inc)); + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); - journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - SIZE_MAX, true); - if (ret) { - bch_err(ca, "error invalidating buckets: %i", ret); - goto stop; - } - - if (!ca->nr_invalidated) { - bch_err(ca, "allocator thread unable to make forward progress!"); - goto stop; - } + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; - if (ca->allocator_invalidating_data) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - else if (ca->allocator_journal_seq_flush) - ret = bch2_journal_flush_seq(&c->journal, - ca->allocator_journal_seq_flush); + ret = bch2_invalidate_buckets(c, ca); + if (ret) + goto stop; - /* - * journal error - buckets haven't actually been - * invalidated, can't discard them: - */ - if (ret) { - bch_err(ca, "journal error: %i", ret); - goto stop; - } - } + if (!fifo_empty(&ca->free_inc)) + continue; pr_debug("free_inc now empty"); - /* Reset front/back so we can easily sort fifo entries later: */ - ca->free_inc.front = ca->free_inc.back = 0; - ca->allocator_journal_seq_flush = 0; - ca->allocator_invalidating_data = false; - down_read(&c->gc_lock); - while (1) { - size_t prev = fifo_used(&ca->free_inc); - + do { if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { up_read(&c->gc_lock); bch_err(ca, "gc failure"); @@ -1007,56 +988,46 @@ static int bch2_allocator_thread(void *arg) pr_debug("scanning for reclaimable buckets"); - find_reclaimable_buckets(c, ca); + nr = find_reclaimable_buckets(c, ca); - pr_debug("found %zu buckets (free_inc %zu/%zu)", - fifo_used(&ca->free_inc) - prev, - fifo_used(&ca->free_inc), ca->free_inc.size); + pr_debug("found %zu buckets", nr); - trace_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); + trace_alloc_batch(ca, nr, ca->alloc_heap.size); - if ((ca->inc_gen_needs_gc >= ca->free_inc.size || - (!fifo_full(&ca->free_inc) && - ca->inc_gen_really_needs_gc >= - fifo_free(&ca->free_inc))) && + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && c->gc_thread) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } - if (fifo_full(&ca->free_inc)) - break; - - if (!fifo_empty(&ca->free_inc) && - !fifo_full(&ca->free[RESERVE_MOVINGGC])) - break; - /* - * copygc may be waiting until either its reserve fills - * up, or we can't make forward progress: + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: */ - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; + if (!nr || + (nr < ALLOC_SCAN_BATCH(ca) && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + ca->allocator_blocked = true; + closure_wake_up(&c->freelist_wait); + + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } } - } + } while (!nr); ca->allocator_blocked = false; up_read(&c->gc_lock); - pr_debug("free_inc now %zu/%zu", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - sort_free_inc(c, ca); + pr_debug("%zu buckets to invalidate", nr); /* - * free_inc is now full of newly-invalidated buckets: next, + * alloc_heap is now full of newly-invalidated buckets: next, * write out the new bucket gens: */ } @@ -1946,39 +1917,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } +static 
void flush_held_btree_writes(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool flush_updates; + size_t i, nr_pending_updates; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } + } + rcu_read_unlock(); + + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... + */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } + +} + static void allocator_start_issue_discards(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_iter; - size_t i, bu; - - for_each_rw_member(ca, c, dev_iter) { - unsigned done = 0; - - fifo_for_each_entry(bu, &ca->free_inc, i) { - if (done == ca->nr_invalidated) - break; + size_t bu; + for_each_rw_member(ca, c, dev_iter) + while (fifo_pop(&ca->free_inc, bu)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, bu), ca->mi.bucket_size, GFP_NOIO); - done++; - } - } } static int __bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; - size_t bu, i; unsigned dev_iter; u64 journal_seq = 0; + long bu; bool invalidating_data = false; int ret = 0; if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return -1; + if (test_alloc_startup(c)) { + invalidating_data = true; + goto not_enough; + } + /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { struct btree_iter iter; @@ -2003,7 +2018,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) percpu_up_read(&c->usage_lock); fifo_push(&ca->free_inc, bu); - ca->nr_invalidated++; if (fifo_full(&ca->free_inc)) break; @@ -2022,24 +2036,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) not_enough: pr_debug("did not find enough empty buckets; issuing discards"); - /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + /* clear out free_inc, we'll be using it again below: */ for_each_rw_member(ca, c, dev_iter) discard_invalidated_buckets(c, ca); pr_debug("scanning for reclaimable buckets"); for_each_rw_member(ca, c, dev_iter) { - BUG_ON(!fifo_empty(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; - find_reclaimable_buckets(c, ca); - sort_free_inc(c, ca); - invalidating_data |= ca->allocator_invalidating_data; + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + invalidating_data |= + bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); - fifo_for_each_entry(bu, &ca->free_inc, i) - if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) - break; + fifo_push(&ca->free[RESERVE_BTREE], bu); + set_bit(bu, ca->buckets_dirty); + } } pr_debug("done scanning for reclaimable buckets"); @@ -2065,16 +2078,9 @@ not_enough: * XXX: it's possible for this to deadlock waiting on journal reclaim, * since we're holding btree writes. What then? 
*/ - - for_each_rw_member(ca, c, dev_iter) { - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - ca->free[RESERVE_BTREE].size, - false); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } + ret = bch2_alloc_write(c); + if (ret) + return ret; if (invalidating_data) { pr_debug("flushing journal"); @@ -2087,57 +2093,11 @@ not_enough: allocator_start_issue_discards(c); } - for_each_rw_member(ca, c, dev_iter) - while (ca->nr_invalidated) { - BUG_ON(!fifo_pop(&ca->free_inc, bu)); - ca->nr_invalidated--; - } - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); /* now flush dirty btree nodes: */ - if (invalidating_data) { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool flush_updates; - size_t nr_pending_updates; - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); -again: - pr_debug("flushing dirty btree nodes"); - cond_resched(); - - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); - - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - flush_updates = true; - } - } - rcu_read_unlock(); - - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... - */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); - goto again; - } - } + if (invalidating_data) + flush_held_btree_writes(c); return 0; } diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h index 00d01f464c68..2a6500d6f97a 100644 --- a/fs/bcachefs/alloc.h +++ b/fs/bcachefs/alloc.h @@ -9,6 +9,8 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) + const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7c6b1925f67b..8dd96a2de1a3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -270,6 +270,10 @@ do { \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -403,7 +407,6 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; spinlock_t freelist_lock; - size_t nr_invalidated; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -415,8 +418,6 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - u64 allocator_journal_seq_flush; - bool allocator_invalidating_data; bool allocator_blocked; alloc_heap alloc_heap; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cc1f8b9a9e09..bc667ac70f57 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1145,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written); + BUG_ON(!b->written && + 
!test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); old = btree_node_root(c, b); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eec2f6cb4f5b..6a7e8b7b6a79 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -405,7 +405,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, _old; \ }) -bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { struct bucket *g; @@ -416,8 +416,7 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, g = bucket(ca, b); *old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_available_bucket(new)) - return false; + BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; new.data_type = 0; @@ -429,7 +428,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); - return true; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -822,7 +820,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - size_t free_inc_reserve = copygc_reserve / 2; + size_t free_inc_nr = max(max_t(size_t, 16, ca->mi.nbuckets >> 12), + btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; @@ -845,8 +844,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) goto err; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d0dc9c8b4f0b..2671ad29edf9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -205,7 +205,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, +void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); -- cgit From 3142e7ef4b39f03c1e1aac90c2ac8f07f55ffa64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 05:28:40 -0400 Subject: bcachefs: fix nbuckets usage on device resize Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6a7e8b7b6a79..65232f5e61bc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -818,9 +818,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - size_t free_inc_nr = 
max(max_t(size_t, 16, ca->mi.nbuckets >> 12), + size_t reserve_none = max_t(size_t, 4, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 16, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 16, nbuckets >> 12), btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; -- cgit From 73ab6f356db737d9997c830730ab927bbdf6b678 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 05:48:35 -0400 Subject: bcachefs: fix fsync after create Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cd29404e0b9b..5963f88b8156 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -327,6 +327,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_unpacked inode_u; struct bch_hash_info hash_info; struct posix_acl *default_acl = NULL, *acl = NULL; + u64 journal_seq = 0; int ret; bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); @@ -391,7 +392,7 @@ retry: &inode_u) : 0) ?: bch2_trans_commit(&trans, NULL, NULL, - &inode->ei_journal_seq, + &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (ret == -EINTR) @@ -409,6 +410,7 @@ retry: } bch2_vfs_inode_init(c, inode, &inode_u); + journal_seq_copy(inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); -- cgit From 658971f2769a8c9beaa09dd52218b5b2d17ca497 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 05:48:53 -0400 Subject: bcachefs: fix mtime/ctime update on truncate Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 33c379ecf5a1..ed028b5b7613 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -278,7 +278,8 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) mutex_lock(&h->inode->ei_update_lock); i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - ret = bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); + ret = bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, + h, ATTR_MTIME|ATTR_CTIME); if (!ret && h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); -- cgit From 60476b14b4c4e7a733047374aa135fb16c6ab340 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 07:38:06 -0400 Subject: bcachefs: fix last_seq_ondisk Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d479d946eea7..dd423e79a65c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1207,6 +1207,7 @@ static void journal_write_done(struct closure *cl) struct bch_devs_list devs = bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1219,7 +1220,7 @@ out: bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); - j->last_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; -- cgit From 9c859dc91b5a85bab4dcb72087528c6cfd7207b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 07:52:00 -0400 Subject: bcachefs: Assorted journal refactoring 
Also improve error reporting - only return an error from bch2_journal_flush_seq() if we had an error writing that entry (i.e. not if there was an error with a newer entry). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 214 ++++++++++++++++++-------------------------- fs/bcachefs/journal_io.c | 9 +- fs/bcachefs/journal_types.h | 3 +- 3 files changed, 96 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a83c45b82f95..b83548ae33b2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -90,7 +90,7 @@ static enum { } journal_buf_switch(struct journal *j, bool need_write_just_set) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; + struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -101,8 +101,11 @@ static enum { if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) return JOURNAL_ENTRY_CLOSED; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { + /* this entry will never be written: */ + closure_wake_up(&buf->wait); return JOURNAL_ENTRY_ERROR; + } if (new.prev_buf_unwritten) return JOURNAL_ENTRY_INUSE; @@ -123,7 +126,6 @@ static enum { clear_bit(JOURNAL_NEED_WRITE, &j->flags); - buf = &j->buf[old.idx]; buf->data->u64s = cpu_to_le32(old.cur_entry_offset); j->prev_buf_sectors = @@ -270,34 +272,42 @@ static int journal_entry_open(struct journal *j) return 1; } -/* - * returns true if there's nothing to flush and no journal write still in flight - */ -static bool journal_flush_write(struct journal *j) +static bool __journal_entry_close(struct journal *j) { - bool ret; - - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten; + bool set_need_write; if (!journal_entry_is_open(j)) { spin_unlock(&j->lock); - return ret; + return true; } - set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) - ret = false; - else + set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags); + if (set_need_write) + j->need_write_time = local_clock(); + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_INUSE: spin_unlock(&j->lock); - return ret; + return false; + default: + spin_unlock(&j->lock); + fallthrough; + case JOURNAL_UNLOCKED: + return true; + } +} + +static bool journal_entry_close(struct journal *j) +{ + spin_lock(&j->lock); + return __journal_entry_close(j); } static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - journal_flush_write(j); + journal_entry_close(j); } /* @@ -467,6 +477,37 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare return ret; } +static int journal_seq_error(struct journal *j, u64 seq) +{ + union journal_res_state state = READ_ONCE(j->reservations); + + if (seq == journal_cur_seq(j)) + return bch2_journal_error(j); + + if (seq + 1 == journal_cur_seq(j) && + !state.prev_buf_unwritten && + seq > j->seq_ondisk) + return -EIO; + + return 0; +} + +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + /* seq should be for a journal entry that has been opened: */ + BUG_ON(seq > journal_cur_seq(j)); + BUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (seq == journal_cur_seq(j)) + return journal_cur_buf(j); + if (seq + 1 == journal_cur_seq(j) && + 
j->reservations.prev_buf_unwritten) + return journal_prev_buf(j); + return NULL; +} + /** * bch2_journal_wait_on_seq - wait for a journal entry to be written * @@ -475,31 +516,22 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is * configurable). */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, + struct closure *parent) { - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); + struct journal_buf *buf; - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } + spin_lock(&j->lock); - if (seq == journal_cur_seq(j)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + if ((buf = journal_seq_to_buf(j, seq))) { + if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); + if (seq == journal_cur_seq(j)) { + smp_mb(); + if (bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } } spin_unlock(&j->lock); @@ -511,108 +543,35 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent * like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) { struct journal_buf *buf; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - buf = journal_cur_buf(j); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&buf->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - buf = journal_prev_buf(j); - + if (parent && + (buf = journal_seq_to_buf(j, seq))) if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - - spin_unlock(&j->lock); + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); + else + spin_unlock(&j->lock); } static int journal_seq_flushed(struct journal *j, u64 seq) { - struct journal_buf *buf; - int ret = 1; + int ret; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - ret = 0; - - buf = journal_cur_buf(j); + ret = seq <= j->seq_ondisk ? 
1 : journal_seq_error(j, seq); - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - ret = -EIO; - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return 0; - } - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - ret = bch2_journal_error(j); - } - - spin_unlock(&j->lock); + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); + else + spin_unlock(&j->lock); return ret; } @@ -914,13 +873,16 @@ void bch2_fs_journal_stop(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - wait_event(j->wait, journal_flush_write(j)); + wait_event(j->wait, journal_entry_close(j)); /* do we need to write another journal entry? */ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || c->btree_roots_dirty) bch2_journal_meta(j); + BUG_ON(journal_entry_is_open(j) || + j->reservations.prev_buf_unwritten); + BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_NOT_EMPTY, &j->flags)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index dd423e79a65c..00c454673a04 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1209,6 +1209,8 @@ static void journal_write_done(struct closure *cl) u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); + bch2_time_stats_update(j->write_time, j->write_start_time); + if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); goto err; @@ -1216,11 +1218,11 @@ static void journal_write_done(struct closure *cl) if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) goto err; -out: - bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); + j->seq_ondisk = seq; j->last_seq_ondisk = last_seq; + if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; @@ -1232,7 +1234,7 @@ out: * bch2_fs_journal_stop(): */ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - +out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1250,6 +1252,7 @@ out: err: bch2_fatal_error(c); bch2_journal_halt(j); + spin_lock(&j->lock); goto out; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index dae8b8a65d75..8502a930a05e 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,7 +151,8 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; - /* last_seq from the most recent journal entry written */ + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; u64 last_seq_ondisk; /* -- cgit From bb1b3658aa7259bdacf7500abdeb8fdff61a51ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 07:53:29 -0400 Subject: bcachefs: minor fsync fix Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ed028b5b7613..cc99eb1b36e0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2107,7 +2107,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, ret2; ret = 
file_write_and_wait_range(file, start, end); if (ret) @@ -2123,7 +2123,10 @@ out: if (c->opts.journal_flush_disabled) return 0; - return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + ret2 = file_check_and_advance_wb_err(file); + + return ret ?: ret2; } /* truncate: */ -- cgit From 277c981c634f3e64dd99523aabfd9ed5e6c5be55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 09:13:07 -0400 Subject: bcachefs: fix bch2_val_to_text() was returning wrong value Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 8 ++++++-- fs/bcachefs/alloc.h | 2 +- fs/bcachefs/bkey_methods.c | 21 ++++++++++++++++----- fs/bcachefs/bkey_methods.h | 3 ++- fs/bcachefs/dirent.c | 16 +++++++++------- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/extents.c | 10 ++++++---- fs/bcachefs/extents.h | 4 ++-- fs/bcachefs/inode.c | 6 ++++-- fs/bcachefs/inode.h | 2 +- fs/bcachefs/quota.c | 8 +++++--- fs/bcachefs/quota.h | 2 +- fs/bcachefs/xattr.c | 32 +++++++++++++++++--------------- fs/bcachefs/xattr.h | 2 +- 14 files changed, 72 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 19523226afd8..ea1dc52e5ff6 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -154,8 +154,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { buf[0] = '\0'; @@ -163,6 +163,8 @@ void bch2_alloc_to_text(struct bch_fs *c, char *buf, case BCH_ALLOC: break; } + + return 0; } static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) @@ -2067,6 +2069,8 @@ not_enough: * invalidated on disk: */ if (invalidating_data) { + BUG(); + pr_info("holding writes"); pr_debug("invalidating existing data"); set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); } else { diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h index 2a6500d6f97a..739df233236c 100644 --- a/fs/bcachefs/alloc.h +++ b/fs/bcachefs/alloc.h @@ -12,7 +12,7 @@ struct bch_devs_List; #define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_alloc_ops (struct bkey_ops) { \ .key_invalid = bch2_alloc_invalid, \ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 017425a534c6..8c6c2ca3c992 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -123,16 +123,27 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) #define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) +int bch2_bpos_to_text(char *buf, size_t size, struct bpos pos) +{ + char *out = buf, *end = buf + size; + + if (!bkey_cmp(pos, POS_MIN)) + p("POS_MIN"); + else if (!bkey_cmp(pos, POS_MAX)) + p("POS_MAX"); + else + p("%llu:%llu", pos.inode, pos.offset); + + return out - buf; +} + int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) { char *out = buf, *end = buf + size; p("u64s %u type %u ", k->u64s, k->type); - if (bkey_cmp(k->p, POS_MAX)) - p("%llu:%llu", k->p.inode, k->p.offset); - else - p("POS_MAX"); + out += bch2_bpos_to_text(out, end - out, k->p); p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo); @@ -160,7 +171,7 @@ int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, break; default: if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text) - ops->val_to_text(c, buf, size, k); + out += ops->val_to_text(c, out, end - out, k); break; } diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 04c80f3603cc..989b577da928 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -57,7 +57,7 @@ struct bkey_ops { struct bkey_s_c); void (*key_debugcheck)(struct bch_fs *, struct btree *, struct bkey_s_c); - void (*val_to_text)(struct bch_fs *, char *, + int (*val_to_text)(struct bch_fs *, char *, size_t, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); key_filter_fn key_normalize; @@ -73,6 +73,7 @@ const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +int bch2_bpos_to_text(char *, size_t, struct bpos); int bch2_bkey_to_text(char *, size_t, const struct bkey *); int bch2_val_to_text(struct bch_fs *, enum bkey_type, char *, size_t, struct bkey_s_c); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 18078cc2ca62..d5e174e1e59f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -122,24 +122,26 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_dirent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { + char *out = buf, *end = buf + size; struct bkey_s_c_dirent d; - size_t n = 0; switch (k.k->type) { case BCH_DIRENT: d = bkey_s_c_to_dirent(k); - n += bch_scnmemcpy(buf + n, size - n, d.v->d_name, - bch2_dirent_name_bytes(d)); - n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum); + out += bch_scnmemcpy(out, end - out, d.v->d_name, + bch2_dirent_name_bytes(d)); + out += scnprintf(out, end - out, " -> %llu", d.v->d_inum); break; case BCH_DIRENT_WHITEOUT: - scnprintf(buf, size, "whiteout"); + out += scnprintf(out, end - out, "whiteout"); break; } + + return out - buf; } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index d02dc3e10d95..ac28f83d6b2d 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_dirent_ops (struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2c1cf29e265a..e0150fbe85af 100644 --- 
a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -733,8 +733,8 @@ err: mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -748,6 +748,7 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, if (invalid) p(" invalid: %s", invalid); #undef p + return out - buf; } int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, @@ -1877,8 +1878,8 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k } } -void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_extent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -1892,6 +1893,7 @@ void bch2_extent_to_text(struct bch_fs *c, char *buf, if (invalid) p(" invalid: %s", invalid); #undef p + return out - buf; } static void bch2_extent_crc_init(union bch_extent_crc *crc, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 15aed3c0665b..0598d6309697 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -19,7 +19,7 @@ union bch_extent_crc; const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); #define bch2_bkey_btree_ops (struct bkey_ops) { \ @@ -31,7 +31,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, struct bkey_i *, struct bkey_i *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f40ec37d7f0f..002232ffed62 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -228,8 +228,8 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_inode_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = out + size; struct bkey_s_c_inode inode; @@ -249,6 +249,8 @@ void bch2_inode_to_text(struct bch_fs *c, char *buf, #undef BCH_INODE_FIELD break; } + + return out - buf; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bd6166c40e6f..ce423a5f2af5 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,7 +7,7 @@ #include const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_inode_ops (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c 
index 0adbfe523f51..0a305ad08188 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -46,10 +46,10 @@ static const char * const bch2_quota_counters[] = { "inodes", }; -void bch2_quota_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { - char *out = buf, *end= buf + size; + char *out = buf, *end = buf + size; struct bkey_s_c_quota dq; unsigned i; @@ -64,6 +64,8 @@ void bch2_quota_to_text(struct bch_fs *c, char *buf, le64_to_cpu(dq.v->c[i].softlimit)); break; } + + return out - buf; } #ifdef CONFIG_BCACHEFS_QUOTA diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 4a76b49f9e00..9650e518cd64 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,7 +8,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_quota_ops (struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index cb84bdabb6ed..44bf4a2f3c84 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -111,12 +111,12 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_xattr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { + char *out = buf, *end = buf + size; const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; - size_t n = 0; switch (k.k->type) { case BCH_XATTR: @@ -124,24 +124,26 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (handler && handler->prefix) - n += scnprintf(buf + n, size - n, "%s", handler->prefix); + out += scnprintf(out, end - out, "%s", handler->prefix); else if (handler) - n += scnprintf(buf + n, size - n, "(type %u)", - xattr.v->x_type); + out += scnprintf(out, end - out, "(type %u)", + xattr.v->x_type); else - n += scnprintf(buf + n, size - n, "(unknown type %u)", - xattr.v->x_type); - - n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name, - xattr.v->x_name_len); - n += scnprintf(buf + n, size - n, ":"); - n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + out += scnprintf(out, end - out, "(unknown type %u)", + xattr.v->x_type); + + out += bch_scnmemcpy(out, end - out, xattr.v->x_name, + xattr.v->x_name_len); + out += scnprintf(out, end - out, ":"); + out += bch_scnmemcpy(out, end - out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); break; case BCH_XATTR_WHITEOUT: - scnprintf(buf, size, "whiteout"); + out += scnprintf(out, end - out, "whiteout"); break; } + + return out - buf; } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 0e7d2fa86213..b2fe1dc42b83 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_xattr_ops (struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ -- cgit From 
94c1f4adec42c03c8fb1b7dc41f2fc07481a5395 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 19:45:22 -0400 Subject: bcachefs: Fix locking in allocator thread gc lock must be held while invalidating buckets - fixes "1f7a95698e bcachefs: Invalidate buckets when writing to alloc btree" Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index ea1dc52e5ff6..192ab655ca23 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -964,16 +964,21 @@ static int bch2_allocator_thread(void *arg) if (ret) goto stop; + down_read(&c->gc_lock); + ret = bch2_invalidate_buckets(c, ca); - if (ret) + if (ret) { + up_read(&c->gc_lock); goto stop; + } - if (!fifo_empty(&ca->free_inc)) + if (!fifo_empty(&ca->free_inc)) { + up_read(&c->gc_lock); continue; + } pr_debug("free_inc now empty"); - down_read(&c->gc_lock); do { if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { up_read(&c->gc_lock); -- cgit From 647d7b60b193967f113a47fcfd20102c4498f109 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 16:42:27 -0400 Subject: bcachefs: Fix an assertion in the btree node merge path Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 55 ++++++--------------------------------------- fs/bcachefs/btree_locking.h | 21 +++++++++++++++++ 2 files changed, 28 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7bead41b226f..8918268f99f4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -69,26 +69,6 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) &b->lock.state.counter); } -/* - * Lock a btree node if we already have it locked on one of our linked - * iterators: - */ -static inline bool btree_node_lock_increment(struct btree_iter *iter, - struct btree *b, unsigned level, - enum btree_node_locked_type want) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) >= want) { - six_lock_increment(&b->lock, (enum six_lock_type) want); - return true; - } - - return false; -} - bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { struct btree *b = btree_iter_node(iter, level); @@ -190,34 +170,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *linked; bool ret = true; - /* Can't have children locked before ancestors: */ - EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); - - /* - * Can't hold any read locks while we block taking an intent lock - see - * below for reasoning, and we should have already dropped any read - * locks in the current iterator - */ - EBUG_ON(type == SIX_LOCK_intent && - iter->nodes_locked != iter->nodes_intent_locked); - - if (btree_node_lock_increment(iter, b, level, (enum btree_node_locked_type) type)) - return true; - - /* - * Must lock btree nodes in key order - this case happens when locking - * the prev sibling in btree node merging: - */ - if (iter->nodes_locked && - __ffs(iter->nodes_locked) <= level && - __btree_iter_cmp(iter->btree_id, pos, iter)) - return false; - - for_each_linked_btree_iter(iter, linked) { + /* Check if it's safe to block: */ + for_each_btree_iter(iter, linked) { if (!linked->nodes_locked) continue; - /* We have to lock btree nodes in key order: */ + /* * Must lock btree nodes in key order: */ if 
(__btree_iter_cmp(iter->btree_id, pos, linked) < 0) ret = false; @@ -252,9 +210,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { if (may_drop_locks) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); btree_iter_get_locks(linked, true); } ret = false; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index de3fc0a239da..f262e4431414 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -147,6 +147,26 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, __btree_node_lock_type(c, b, type); } +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree *b, unsigned level, + enum btree_node_locked_type want) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, want); + return true; + } + + return false; +} + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, bool); @@ -159,6 +179,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(iter, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, may_drop_locks); } -- cgit From a7c7a3092eb2a0313b535d05fb98817ef1efed0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 12:59:13 -0400 Subject: bcachefs: bch2_mark_key() now takes bch_data_type Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 +++++--- fs/bcachefs/btree_update_interior.c | 10 +++++----- fs/bcachefs/buckets.c | 12 +++++------- fs/bcachefs/buckets.h | 4 ++-- fs/bcachefs/extents.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5053247a6b42..a82677d053b0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -123,13 +123,14 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, switch (type) { case BKEY_TYPE_BTREE: - bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL, + bch2_mark_key(c, k, c->opts.btree_node_size, + BCH_DATA_BTREE, pos, NULL, 0, flags| BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); break; case BKEY_TYPE_EXTENTS: - bch2_mark_key(c, k, k.k->size, false, pos, NULL, + bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL, 0, flags| BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -398,7 +399,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, true, pos, + c->opts.btree_node_size, + BCH_DATA_BTREE, pos, &stats, 0, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bc667ac70f57..c0f1c77b340f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -211,7 +211,7 @@ found: struct bch_fs_usage tmp = { 0 }; bch2_mark_key(c, 
bkey_i_to_s_c(&d->key), - -c->opts.btree_node_size, true, b + -c->opts.btree_node_size, BCH_DATA_BTREE, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id), &tmp, 0, 0); @@ -290,7 +290,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->opts.btree_node_size, true, + -c->opts.btree_node_size, BCH_DATA_BTREE, gc_phase(GC_PHASE_PENDING_DELETE), &stats, 0, 0); /* @@ -1098,7 +1098,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -1186,7 +1186,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (bkey_extent_is_data(&insert->k)) bch2_mark_key(c, bkey_i_to_s_c(insert), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && @@ -1967,7 +1967,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_root(b->btree_id), &stats, 0, 0); bch2_btree_node_free_index(as, NULL, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 65232f5e61bc..06ef268fd991 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -512,15 +512,13 @@ static void bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c_extent e, const struct bch_extent_ptr *ptr, struct bch_extent_crc_unpacked crc, - s64 sectors, enum s_alloc type, + s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr); - enum bch_data_type data_type = type == S_META - ? BCH_DATA_BTREE : BCH_DATA_USER; u64 v; if (crc.compression_type) { @@ -596,7 +594,7 @@ static void bch2_mark_pointer(struct bch_fs *c, } void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, + s64 sectors, enum bch_data_type data_type, struct gc_pos pos, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags) @@ -643,14 +641,14 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; - enum s_alloc type = metadata ? S_META : S_DIRTY; + enum s_alloc type = data_type == BCH_DATA_USER + ? 
S_DIRTY : S_META; unsigned replicas = 0; - BUG_ON(metadata && bkey_extent_is_cached(e.k)); BUG_ON(!sectors); extent_for_each_ptr_crc(e, ptr, crc) { - bch2_mark_pointer(c, e, ptr, crc, sectors, type, + bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, stats, journal_seq, flags); replicas += !ptr->cached; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2671ad29edf9..6b312d322389 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -218,8 +218,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) #define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, enum bch_data_type, + struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e0150fbe85af..276545dfa246 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1071,7 +1071,7 @@ static void bch2_add_sectors(struct extent_insert_state *s, if (!sectors) return; - bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), + bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b), &s->stats, s->trans->journal_res.seq, 0); } -- cgit From 09f3297ac90aae99d8f7e776c8df5dd0d32c1ba9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 13:33:07 -0400 Subject: bcachefs: kill s_alloc, use bch_data_type Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 30 ++++++++++---------- fs/bcachefs/buckets.h | 12 -------- fs/bcachefs/buckets_types.h | 10 ++----- fs/bcachefs/chardev.c | 5 ++-- fs/bcachefs/sysfs.c | 55 ++++++++++++++++--------------------- 6 files changed, 44 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c0f1c77b340f..aba01a77e4af 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -184,7 +184,7 @@ found: */ replicas = bch2_extent_nr_dirty_ptrs(k); if (replicas) - stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size; + stats->s[replicas - 1].data[BCH_DATA_BTREE] -= c->opts.btree_node_size; /* * We're dropping @k from the btree, but it's still live until the diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 06ef268fd991..c0dc0ce1f585 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -82,16 +82,14 @@ static void bch2_fs_stats_verify(struct bch_fs *c) { struct bch_fs_usage stats = __bch2_fs_usage_read(c); - unsigned i; + unsigned i, j; for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - if ((s64) stats.s[i].data[S_META] < 0) - panic("replicas %u meta underflow: %lli\n", - i + 1, stats.s[i].data[S_META]); - - if ((s64) stats.s[i].data[S_DIRTY] < 0) - panic("replicas %u dirty underflow: %lli\n", - i + 1, stats.s[i].data[S_DIRTY]); + for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++) + if ((s64) stats.s[i].data[j] < 0) + panic("replicas %u %s underflow: %lli\n", + i + 1, bch_data_types[j], + stats.s[i].data[j]); if ((s64) stats.s[i].persistent_reserved < 0) panic("replicas %u reserved underflow: %lli\n", @@ -247,12 +245,16 @@ struct fs_usage_sum { static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) { struct fs_usage_sum sum = { 0 }; - unsigned i; + unsigned i, j; for (i = 0; i < ARRAY_SIZE(stats.s); 
i++) { - sum.data += (stats.s[i].data[S_META] + - stats.s[i].data[S_DIRTY]) * (i + 1); - sum.reserved += stats.s[i].persistent_reserved * (i + 1); + u64 a = 0; + + for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++) + a += stats.s[i].data[j]; + + sum.data += a * (i + 1); + sum.reserved += stats.s[i].persistent_reserved * (i + 1); } sum.reserved += stats.online_reserved; @@ -641,8 +643,6 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; - enum s_alloc type = data_type == BCH_DATA_USER - ? S_DIRTY : S_META; unsigned replicas = 0; BUG_ON(!sectors); @@ -655,7 +655,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, if (replicas) { BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[replicas - 1].data[type] += sectors; + stats->s[replicas - 1].data[data_type] += sectors; } break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6b312d322389..016201ba1b8b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -168,18 +168,6 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s) -{ - switch (s) { - case S_META: - return BCH_DATA_BTREE; - case S_DIRTY: - return BCH_DATA_USER; - default: - BUG(); - } -} - struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index cad35a70192d..d528194ccf7e 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -2,6 +2,7 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +#include "bcachefs_format.h" #include "util.h" #define BUCKET_JOURNAL_SEQ_BITS 16 @@ -59,13 +60,6 @@ struct bch_dev_usage { u64 sectors_fragmented; }; -/* kill, switch to bch_data_type? 
*/ -enum s_alloc { - S_META, - S_DIRTY, - S_ALLOC_NR, -}; - struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ /* _uncompressed_ sectors: */ @@ -73,7 +67,7 @@ struct bch_fs_usage { u64 available_cache; struct { - u64 data[S_ALLOC_NR]; + u64 data[BCH_DATA_NR]; u64 persistent_reserved; } s[BCH_REPLICAS_MAX]; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2aa86331969a..283828fe2dc3 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -406,9 +406,8 @@ static long bch2_ioctl_usage(struct bch_fs *c, dst.persistent_reserved[i] = src.s[i].persistent_reserved; - for (j = 0; j < S_ALLOC_NR; j++) - dst.sectors[s_alloc_to_data_type(j)][i] = - src.s[i].data[j]; + for (j = 0; j < BCH_DATA_NR; j++) + dst.sectors[j][i] = src.s[i].data[j]; } ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 430dcbcb6e8a..db8af44c7921 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -230,41 +230,34 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { + char *out = buf, *end = buf + PAGE_SIZE; struct bch_fs_usage stats = bch2_fs_usage_read(c); + unsigned replicas, type; + + out += scnprintf(out, end - out, + "capacity:\t\t%llu\n", + c->capacity); + + for (replicas = 0; replicas < ARRAY_SIZE(stats.s); replicas++) { + out += scnprintf(out, end - out, + "%u replicas:\n", + replicas + 1); + + for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) + out += scnprintf(out, end - out, + "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.s[replicas].data[type]); + out += scnprintf(out, end - out, + "\treserved:\t%llu\n", + stats.s[replicas].persistent_reserved); + } - return scnprintf(buf, PAGE_SIZE, - "capacity:\t\t%llu\n" - "1 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "2 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "3 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "4 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" + out += scnprintf(out, end - out, "online reserved:\t%llu\n", - c->capacity, - stats.s[0].data[S_META], - stats.s[0].data[S_DIRTY], - stats.s[0].persistent_reserved, - stats.s[1].data[S_META], - stats.s[1].data[S_DIRTY], - stats.s[1].persistent_reserved, - stats.s[2].data[S_META], - stats.s[2].data[S_DIRTY], - stats.s[2].persistent_reserved, - stats.s[3].data[S_META], - stats.s[3].data[S_DIRTY], - stats.s[3].persistent_reserved, stats.online_reserved); + + return out - buf; } static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) -- cgit From 5b650fd11a00271b9d4c033d1d0780826e050137 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 14:54:39 -0400 Subject: bcachefs: Account for internal fragmentation better Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 3 +- fs/bcachefs/btree_update_interior.c | 3 +- fs/bcachefs/buckets.c | 115 +++++++++++++++++++++--------------- fs/bcachefs/buckets.h | 2 - fs/bcachefs/buckets_types.h | 4 +- fs/bcachefs/chardev.c | 4 +- fs/bcachefs/fs.c | 9 ++- fs/bcachefs/sysfs.c | 14 ++++- 8 files changed, 93 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a82677d053b0..1fbb9c657fc6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -493,7 +493,8 @@ static void bch2_gc_start(struct bch_fs *c) struct bch_fs_usage *p = 
per_cpu_ptr(c->usage_percpu, cpu); - memset(p->s, 0, sizeof(p->s)); + memset(p->replicas, 0, sizeof(p->replicas)); + memset(p->buckets, 0, sizeof(p->buckets)); } percpu_up_write(&c->usage_lock); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index aba01a77e4af..a37b5edea699 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -184,7 +184,8 @@ found: */ replicas = bch2_extent_nr_dirty_ptrs(k); if (replicas) - stats->s[replicas - 1].data[BCH_DATA_BTREE] -= c->opts.btree_node_size; + stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= + c->opts.btree_node_size; /* * We're dropping @k from the btree, but it's still live until the diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c0dc0ce1f585..56b197bff4f0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -73,6 +73,8 @@ #include +static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); + #ifdef DEBUG_BUCKETS #define lg_local_lock lg_global_lock @@ -84,18 +86,24 @@ static void bch2_fs_stats_verify(struct bch_fs *c) __bch2_fs_usage_read(c); unsigned i, j; - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++) - if ((s64) stats.s[i].data[j] < 0) - panic("replicas %u %s underflow: %lli\n", + for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { + for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++) + if ((s64) stats.replicas[i].data[j] < 0) + panic("replicas %u %s sectors underflow: %lli\n", i + 1, bch_data_types[j], - stats.s[i].data[j]); + stats.replicas[i].data[j]); - if ((s64) stats.s[i].persistent_reserved < 0) + if ((s64) stats.replicas[i].persistent_reserved < 0) panic("replicas %u reserved underflow: %lli\n", - i + 1, stats.s[i].persistent_reserved); + i + 1, stats.replicas[i].persistent_reserved); } + for (j = 0; j < ARRAY_SIZE(stats.buckets); j++) + if ((s64) stats.replicas[i].data_buckets[j] < 0) + panic("%s buckets underflow: %lli\n", + bch_data_types[j], + stats.buckets[j]); + if ((s64) stats.online_reserved < 0) panic("sectors_online_reserved underflow: %lli\n", stats.online_reserved); @@ -238,6 +246,7 @@ bch2_fs_usage_read(struct bch_fs *c) } struct fs_usage_sum { + u64 hidden; u64 data; u64 reserved; }; @@ -247,14 +256,21 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) struct fs_usage_sum sum = { 0 }; unsigned i, j; - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - u64 a = 0; + /* + * For superblock and journal we count bucket usage, not sector usage, + * because any internal fragmentation should _not_ be counted as + * free space: + */ + for (j = 1; j < BCH_DATA_BTREE; j++) + sum.hidden += stats.buckets[j]; - for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++) - a += stats.s[i].data[j]; + for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { + for (j = BCH_DATA_BTREE; + j < ARRAY_SIZE(stats.replicas[i].data); + j++) + sum.data += stats.replicas[i].data[j] * (i + 1); - sum.data += a * (i + 1); - sum.reserved += stats.s[i].persistent_reserved * (i + 1); + sum.reserved += stats.replicas[i].persistent_reserved * (i + 1); } sum.reserved += stats.online_reserved; @@ -270,14 +286,14 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; + return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) { struct 
fs_usage_sum sum = __fs_usage_sum(stats); - return sum.data + reserve_factor(sum.reserved); + return sum.hidden + sum.data + reserve_factor(sum.reserved); } u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) @@ -285,9 +301,9 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } -u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) { - return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); + return c->capacity - bch2_fs_sectors_used(c, stats); } static inline int is_unavailable_bucket(struct bucket_mark m) @@ -323,9 +339,9 @@ static bool bucket_became_unavailable(struct bch_fs *c, } void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) + struct bch_fs_usage *stats, + struct disk_reservation *disk_res, + struct gc_pos gc_pos) { struct fs_usage_sum sum = __fs_usage_sum(*stats); s64 added = sum.data + sum.reserved; @@ -358,6 +374,7 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_fs_usage *stats, struct bucket_mark old, struct bucket_mark new) { struct bch_dev_usage *dev_usage; @@ -374,6 +391,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[new.data_type]); } + stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; + stats->buckets[bucket_type(new)] += ca->mi.bucket_size; + preempt_disable(); dev_usage = this_cpu_ptr(ca->usage_percpu); @@ -399,17 +419,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(c, ca, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, stats, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, _old, new); \ + bch2_dev_usage_update(c, ca, stats, _old, new); \ _old; \ }) void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { + struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); struct bucket *g; struct bucket_mark new; @@ -417,7 +438,7 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, g = bucket(ca, b); - *old = bucket_data_cmpxchg(c, ca, g, new, ({ + *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; @@ -436,6 +457,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { + struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); struct bucket *g; struct bucket_mark old, new; @@ -446,7 +468,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, gc_will_visit(c, pos)) return; - old = bucket_data_cmpxchg(c, ca, g, new, ({ + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); @@ -466,10 +488,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, unsigned sectors, struct gc_pos pos, unsigned flags) { + struct bch_fs_usage *stats; struct bucket *g; struct bucket_mark old, new; - BUG_ON(!type); + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); if (likely(c)) { percpu_rwsem_assert_held(&c->usage_lock); @@ -479,16 +503,17 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, return; } - 
rcu_read_lock(); + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, g, new, ({ + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.data_type = type; checked_add(new.dirty_sectors, sectors); - new.dirty_sectors += sectors; })); - rcu_read_unlock(); + stats->replicas[0].data[type] += sectors; + preempt_enable(); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -589,7 +614,7 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new); + bch2_dev_usage_update(c, ca, stats, old, new); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -601,6 +626,10 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags) { + unsigned replicas = bch2_extent_nr_dirty_ptrs(k); + + BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas)); + /* * synchronization w.r.t. GC: * @@ -643,32 +672,22 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; - unsigned replicas = 0; BUG_ON(!sectors); - extent_for_each_ptr_crc(e, ptr, crc) { + extent_for_each_ptr_crc(e, ptr, crc) bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, stats, journal_seq, flags); - replicas += !ptr->cached; - } - if (replicas) { - BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[replicas - 1].data[data_type] += sectors; - } + if (replicas) + stats->replicas[replicas - 1].data[data_type] += sectors; break; } - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (r.v->nr_replicas) { - BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; - } + case BCH_RESERVATION: + if (replicas) + stats->replicas[replicas - 1].persistent_reserved += sectors; break; } - } percpu_up_read(&c->usage_lock); } @@ -681,7 +700,7 @@ static u64 __recalc_sectors_available(struct bch_fs *c) for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); } /* Used by gc when it's starting: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 016201ba1b8b..9aeccbb11d54 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -173,9 +173,7 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); -u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); static inline bool is_available_bucket(struct bucket_mark mark) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index d528194ccf7e..9968570832e3 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -69,7 +69,9 @@ struct bch_fs_usage { struct { u64 data[BCH_DATA_NR]; u64 persistent_reserved; - } s[BCH_REPLICAS_MAX]; + } replicas[BCH_REPLICAS_MAX]; + + u64 buckets[BCH_DATA_NR]; }; /* diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 283828fe2dc3..db0f990bebf4 100644 --- a/fs/bcachefs/chardev.c +++ 
b/fs/bcachefs/chardev.c @@ -404,10 +404,10 @@ static long bch2_ioctl_usage(struct bch_fs *c, for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = - src.s[i].persistent_reserved; + src.replicas[i].persistent_reserved; for (j = 0; j < BCH_DATA_NR; j++) - dst.sectors[j][i] = src.s[i].data[j]; + dst.sectors[j][i] = src.replicas[i].data[j]; } ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5963f88b8156..67ddad95f91a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1428,13 +1428,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage usage = bch2_fs_usage_read(c); + u64 hidden_metadata = usage.buckets[BCH_DATA_SB] + + usage.buckets[BCH_DATA_JOURNAL]; + unsigned shift = sb->s_blocksize_bits - 9; u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> - PAGE_SECTOR_SHIFT; + buf->f_blocks = (c->capacity - hidden_metadata) >> shift; + buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift; buf->f_bavail = buf->f_bfree; buf->f_files = atomic_long_read(&c->nr_inodes); buf->f_ffree = U64_MAX; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index db8af44c7921..4ce7168e930b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -238,7 +238,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) "capacity:\t\t%llu\n", c->capacity); - for (replicas = 0; replicas < ARRAY_SIZE(stats.s); replicas++) { + for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) { out += scnprintf(out, end - out, "%u replicas:\n", replicas + 1); @@ -247,12 +247,20 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) out += scnprintf(out, end - out, "\t%s:\t\t%llu\n", bch2_data_types[type], - stats.s[replicas].data[type]); + stats.replicas[replicas].data[type]); out += scnprintf(out, end - out, "\treserved:\t%llu\n", - stats.s[replicas].persistent_reserved); + stats.replicas[replicas].persistent_reserved); } + out += scnprintf(out, end - out, "bucket usage\n"); + + for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) + out += scnprintf(out, end - out, + "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.buckets[type]); + out += scnprintf(out, end - out, "online reserved:\t%llu\n", stats.online_reserved); -- cgit From 6eac2c2e2440280ca551d4936807a8a130970469 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 16:42:49 -0400 Subject: bcachefs: Change how replicated data is accounted Due to compression, the different replicas of a replicated extent don't necessarily have to take up the same amount of space - so replicated data sector counts shouldn't be stored divided by the number of replicas. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 99 +++++++++++++++++++++++-------------- fs/bcachefs/buckets_types.h | 1 - fs/bcachefs/super.c | 51 ++++++++++++++++--- fs/bcachefs/sysfs.c | 4 +- 5 files changed, 107 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a37b5edea699..b60eb3d33c7b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -185,7 +185,7 @@ found: replicas = bch2_extent_nr_dirty_ptrs(k); if (replicas) stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= - c->opts.btree_node_size; + c->opts.btree_node_size * replicas; /* * We're dropping @k from the btree, but it's still live until the diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 56b197bff4f0..ab61abdf975d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -248,29 +248,28 @@ bch2_fs_usage_read(struct bch_fs *c) struct fs_usage_sum { u64 hidden; u64 data; + u64 cached; u64 reserved; }; static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) { struct fs_usage_sum sum = { 0 }; - unsigned i, j; + unsigned i; /* * For superblock and journal we count bucket usage, not sector usage, * because any internal fragmentation should _not_ be counted as * free space: */ - for (j = 1; j < BCH_DATA_BTREE; j++) - sum.hidden += stats.buckets[j]; + sum.hidden += stats.buckets[BCH_DATA_SB]; + sum.hidden += stats.buckets[BCH_DATA_JOURNAL]; for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { - for (j = BCH_DATA_BTREE; - j < ARRAY_SIZE(stats.replicas[i].data); - j++) - sum.data += stats.replicas[i].data[j] * (i + 1); - - sum.reserved += stats.replicas[i].persistent_reserved * (i + 1); + sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; + sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; + sum.reserved += stats.replicas[i].persistent_reserved; } sum.reserved += stats.online_reserved; @@ -379,17 +378,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage *dev_usage; - if (c) - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->usage_lock); - if (old.data_type && new.data_type && - old.data_type != new.data_type) { - BUG_ON(!c); - bch2_fs_inconsistent(c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[new.data_type]); - } + bch2_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; stats->buckets[bucket_type(new)] += ca->mi.bucket_size; @@ -448,6 +443,12 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); + /* + * This isn't actually correct yet, since fs usage is still + * uncompressed sectors: + */ + stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); @@ -501,26 +502,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && gc_will_visit(c, pos)) return; - } - preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); - g = 
bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); + g = bucket(ca, b); + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); - stats->replicas[0].data[type] += sectors; - preempt_enable(); + stats->replicas[0].data[type] += sectors; + preempt_enable(); + } else { + rcu_read_lock(); + + g = bucket(ca, b); + old = bucket_cmpxchg(g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + rcu_read_unlock(); + } BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); } -/* Reverting this until the copygc + compression issue is fixed: */ - static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) { if (!sectors) @@ -540,12 +549,14 @@ static void bch2_mark_pointer(struct bch_fs *c, const struct bch_extent_ptr *ptr, struct bch_extent_crc_unpacked crc, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *stats, + unsigned replicas, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr); + s64 uncompressed_sectors = sectors; u64 v; if (crc.compression_type) { @@ -563,6 +574,20 @@ static void bch2_mark_pointer(struct bch_fs *c, +__disk_sectors(crc, new_sectors); } + /* + * fs level usage (which determines free space) is in uncompressed + * sectors, until copygc + compression is sorted out: + * + * note also that we always update @fs_usage, even when we otherwise + * wouldn't do anything because gc is running - this is because the + * caller still needs to account w.r.t. its disk reservation. It is + * caller's responsibility to not apply @fs_usage if gc is in progress. + */ + fs_usage->replicas + [!ptr->cached && replicas ? replicas - 1 : 0].data + [!ptr->cached ? 
data_type : BCH_DATA_CACHED] += + uncompressed_sectors; + if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ @@ -614,7 +639,7 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, stats, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -677,15 +702,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, extent_for_each_ptr_crc(e, ptr, crc) bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, - stats, journal_seq, flags); - - if (replicas) - stats->replicas[replicas - 1].data[data_type] += sectors; + replicas, stats, journal_seq, flags); break; } case BCH_RESERVATION: if (replicas) - stats->replicas[replicas - 1].persistent_reserved += sectors; + stats->replicas[replicas - 1].persistent_reserved += + sectors * replicas; break; } percpu_up_read(&c->usage_lock); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9968570832e3..49f3ab9009ea 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -62,7 +62,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* _uncompressed_ sectors: */ u64 online_reserved; u64 available_cache; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fe95b8b026e8..e44bc95d8deb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -985,14 +985,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - - bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - percpu_ref_reinit(&ca->io_ref); return 0; @@ -1018,6 +1010,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + mutex_unlock(&c->sb_lock); + bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1295,6 +1292,24 @@ err: return ret; } +static void dev_usage_clear(struct bch_dev *ca) +{ + struct bucket_array *buckets; + int cpu; + + for_each_possible_cpu(cpu) { + struct bch_dev_usage *p = + per_cpu_ptr(ca->usage_percpu, cpu); + memset(p, 0, sizeof(*p)); + } + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); + up_read(&ca->bucket_lock); +} + /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1333,11 +1348,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } + /* + * We want to allocate journal on the new device before adding the new + * device to the filesystem because allocating after we attach requires + * spinning up the allocator thread, and the allocator thread requires + * doing btree writes, which if the existing devices are RO isn't going + * to work + * + * So we have to mark where the superblocks are, but marking allocated + * data normally updates the filesystem usage too, so we have to mark, + * allocate the journal, reset all the marks, then remark after we + * attach... 
+ */ + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; + dev_usage_clear(ca); + mutex_lock(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1388,6 +1420,9 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4ce7168e930b..582e281694a9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -781,7 +781,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" - " available: %llu\n" + " available: %lli\n" "sectors:\n" " sb: %llu\n" " journal: %llu\n" @@ -802,7 +802,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], - __dev_buckets_available(ca, stats), + ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], stats.sectors[BCH_DATA_BTREE], -- cgit From a9bec5208b4379c87fa8361f813cb71b5581540e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Aug 2018 14:26:55 -0400 Subject: bcachefs: Better calculation of copygc threshold Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 42 ++++++++++++++++++++++++++---------------- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/movinggc.c | 8 +------- fs/bcachefs/sysfs.c | 4 ++++ 4 files changed, 32 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 192ab655ca23..bde22df25134 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -1711,7 +1711,7 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; + u64 capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1726,7 +1726,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); for_each_rw_member(ca, c, i) { - size_t reserve = 0; + u64 dev_capacity, dev_reserve = 0; /* * We need to reserve buckets (from the number @@ -1745,30 +1745,40 @@ void bch2_recalc_capacity(struct bch_fs *c) * not -ENOSPC calculations. 
*/ for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; + dev_reserve += ca->free[j].size; - reserve += ca->free_inc.size; + dev_reserve += ca->free_inc.size; - reserve += ARRAY_SIZE(c->write_points); + dev_reserve += ARRAY_SIZE(c->write_points); - reserve += 1; /* btree write point */ + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + dev_reserve += WRITE_POINT_COUNT; - reserved_sectors += bucket_to_sector(ca, reserve); + dev_reserve *= ca->mi.bucket_size; - capacity += bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - } + dev_reserve *= 2; + + dev_capacity = bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); - total_capacity = capacity; + ca->copygc_threshold = + max(div64_u64(dev_capacity * + c->opts.gc_reserve_percent, 100), + dev_reserve) / 2; - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); + capacity += dev_capacity; + reserved_sectors += dev_reserve; + } - BUG_ON(reserved_sectors > total_capacity); + reserved_sectors = max(div64_u64(capacity * + c->opts.gc_reserve_percent, 100), + reserved_sectors); - capacity = min(capacity, total_capacity - reserved_sectors); + BUG_ON(reserved_sectors > capacity); - c->capacity = capacity; + c->capacity = capacity - reserved_sectors; if (c->capacity) { bch2_io_timer_add(&c->io_clock[READ], diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8dd96a2de1a3..a9ac68c17533 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -427,6 +427,7 @@ struct bch_dev { copygc_heap copygc_heap; struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; + u64 copygc_threshold; atomic64_t rebalance_work; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 8b61b163faf5..26b8e95db1db 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -228,16 +228,10 @@ static int bch2_copygc_thread(void *arg) last = atomic_long_read(&clock->now); - reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - ca->mi.bucket_size * - c->opts.gc_reserve_percent, 200); + reserve = ca->copygc_threshold; usage = bch2_dev_usage_read(c, ca); - /* - * don't start copygc until less than half the gc reserve is - * available: - */ available = __dev_buckets_available(ca, usage) * ca->mi.bucket_size; if (available > reserve) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 582e281694a9..a472e454099b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -788,6 +788,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " fragmented: %llu\n" + " copygc threshold: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" "open_buckets_wait: %s\n", @@ -808,6 +810,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.sectors[BCH_DATA_BTREE], stats.sectors[BCH_DATA_USER], stats.sectors[BCH_DATA_CACHED], + stats.sectors_fragmented, + ca->copygc_threshold, c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); -- cgit From a50ed7c8e83e52dbfd54a47b5e123f85f5cd91f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jul 2018 14:55:05 -0400 Subject: bcachefs: BCH_SB_RESERVE_BYTES Add an option, gc_reserve_bytes, to set the copygc reserve as a size instead of a percent Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 28 ++++++++++++---------------- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/opts.h | 5 ++++- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index bde22df25134..e6aeab0b47c7 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -1711,7 +1711,7 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned long ra_pages = 0; unsigned i, j; @@ -1726,7 +1726,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); for_each_rw_member(ca, c, i) { - u64 dev_capacity, dev_reserve = 0; + u64 dev_reserve = 0; /* * We need to reserve buckets (from the number @@ -1758,25 +1758,21 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - dev_reserve *= 2; + ca->copygc_threshold = dev_reserve; - dev_capacity = bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); - ca->copygc_threshold = - max(div64_u64(dev_capacity * - c->opts.gc_reserve_percent, 100), - dev_reserve) / 2; - - capacity += dev_capacity; - reserved_sectors += dev_reserve; + reserved_sectors += dev_reserve * 2; } - reserved_sectors = max(div64_u64(capacity * - c->opts.gc_reserve_percent, 100), - reserved_sectors); + gc_reserve = c->opts.gc_reserve_bytes + ? 
c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); + + reserved_sectors = max(gc_reserve, reserved_sectors); - BUG_ON(reserved_sectors > capacity); + reserved_sectors = min(reserved_sectors, capacity); c->capacity = capacity - reserved_sectors; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index eb14dba87402..ac0c7d6a07fb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1221,6 +1221,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); /* Features: */ enum bch_sb_features { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 3b5eddbf56bf..01f1cb53eb5f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -114,9 +114,12 @@ enum opt_type { BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) \ - BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \ + BCH_OPT(gc_reserve_percent, u8, OPT_RUNTIME, \ OPT_UINT(5, 21), \ BCH_SB_GC_RESERVE, 8) \ + BCH_OPT(gc_reserve_bytes, u64, OPT_RUNTIME, \ + OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0) \ BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \ OPT_UINT(0, 100), \ BCH_SB_ROOT_RESERVE, 0) \ -- cgit From b0004d8dfac514f8591b8a45dc470becf3356150 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Aug 2018 19:41:44 -0400 Subject: bcachefs: Factor out btree_key_can_insert() working on getting rid of all the reasons bch2_insert_fixup_extent() can fail/stop partway, which is needed for other refactorings. One of the reasons we could have to bail out is if we're splitting a compressed extent we might need to add to our disk reservation - but we can check that before actually starting the insert. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_leaf.c | 55 +++++++++++++++++++++++++++++++---------- fs/bcachefs/extents.c | 46 ++++++++++++++++++++-------------- fs/bcachefs/extents.h | 6 +++-- 4 files changed, 73 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 438ef0c07623..2ca3b1f0236f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -448,7 +448,6 @@ enum btree_insert_ret { /* extent spanned multiple leaf nodes: have to traverse to next node: */ BTREE_INSERT_NEED_TRAVERSE, /* write lock held for too long */ - BTREE_INSERT_NEED_RESCHED, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_JOURNAL_RES_FULL, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3d0c6f5c98ad..32126b02ce3a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -297,6 +297,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, /* Normal update interface: */ +static enum btree_insert_ret +btree_key_can_insert(struct btree_insert *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct bch_fs *c = trans->c; + struct btree *b = insert->iter->l[0].b; + static enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + ret = !btree_node_is_extents(b) + ? 
BTREE_INSERT_OK + : bch2_extent_can_insert(trans, insert, u64s); + if (ret) + return ret; + + if (*u64s > bch_btree_keys_u64s_remaining(c, b)) + return BTREE_INSERT_BTREE_NODE_FULL; + + return BTREE_INSERT_OK; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -336,24 +360,34 @@ static inline int do_btree_insert_at(struct btree_insert *trans, goto out; } + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ u64s = 0; trans_for_each_entry(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; - /* - * bch2_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch2_btree_node_write(), converting an unwritten bset to a - * written one - */ u64s += i->k->k.u64s + i->extra_res; - if (!bch2_btree_node_insert_fits(c, - i->iter->l[0].b, u64s)) { + switch (btree_key_can_insert(trans, i, &u64s)) { + case BTREE_INSERT_OK: + break; + case BTREE_INSERT_BTREE_NODE_FULL: ret = -EINTR; *split = i->iter; goto out; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + goto out; + case BTREE_INSERT_NEED_GC_LOCK: + ret = -EINTR; + *cycle_gc_lock = true; + goto out; + default: + BUG(); } } @@ -373,7 +407,6 @@ static inline int do_btree_insert_at(struct btree_insert *trans, break; case BTREE_INSERT_JOURNAL_RES_FULL: case BTREE_INSERT_NEED_TRAVERSE: - case BTREE_INSERT_NEED_RESCHED: ret = -EINTR; break; case BTREE_INSERT_BTREE_NODE_FULL: @@ -383,10 +416,6 @@ static inline int do_btree_insert_at(struct btree_insert *trans, case BTREE_INSERT_ENOSPC: ret = -ENOSPC; break; - case BTREE_INSERT_NEED_GC_LOCK: - ret = -EINTR; - *cycle_gc_lock = true; - break; default: BUG(); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 276545dfa246..02a49d9845fb 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1113,8 +1113,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *, struct bkey_packed *, bool); -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - static enum btree_insert_ret extent_insert_should_stop(struct extent_insert_state *s) { @@ -1287,23 +1285,41 @@ extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) return __extent_insert_advance_pos(s, next_pos, k); } -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) +enum btree_insert_ret +bch2_extent_can_insert(struct btree_insert *trans, + struct btree_insert_entry *insert, + unsigned *u64s) { - struct bch_fs *c = s->trans->c; - unsigned sectors; + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; + + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_DISCARD); + if (!_k) + return BTREE_INSERT_OK; + + k = bkey_disassemble(l->b, _k, &unpacked); + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - if (s->trans->flags & BTREE_INSERT_NOFAIL) + if (trans->flags & BTREE_INSERT_NOFAIL) flags |= 
BCH_DISK_RESERVATION_NOFAIL; - switch (bch2_disk_reservation_add(c, - s->trans->disk_res, + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, sectors * bch2_extent_nr_dirty_ptrs(k), flags)) { case 0: @@ -1471,10 +1487,6 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s) overlap = bch2_extent_overlap(&insert->k, k.k); - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - ret = extent_insert_advance_pos(s, k.s_c); if (ret) break; @@ -1550,10 +1562,6 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) overlap = bch2_extent_overlap(&insert->k, k.k); - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - if (!k.k->size) goto squash; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 0598d6309697..fddf25c3fa4b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -63,8 +63,10 @@ int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, struct extent_pick_ptr *); enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *, - struct btree_insert_entry *); +bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, + unsigned *); +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, -- cgit From 617391baa50c5bd8f239115bf4a7b45e1ee1bcaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 22:34:03 -0400 Subject: bcachefs: improved rw_aux_tree_bsearch() shouldn't be any reason for an actual binary search here Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 40 +++++++++++++++++++--------------------- fs/bcachefs/btree_types.h | 5 +++++ 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index faf58b4c0eb4..a74e93a7215c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -614,28 +614,30 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, unsigned offset) { - unsigned l = 0, r = t->size; + unsigned bset_offs = offset - btree_bkey_first_offset(t); + unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); + unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(!t->size); + EBUG_ON(idx > t->size); - while (l < r) { - unsigned m = (l + r) >> 1; + while (idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset) + idx++; - if (rw_aux_tree(b, t)[m].offset < offset) - l = m + 1; - else - r = m; - } + while (idx && + rw_aux_tree(b, t)[idx - 1].offset >= offset) + idx--; - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset < offset); - EBUG_ON(l && - rw_aux_tree(b, t)[l - 1].offset >= offset); - - EBUG_ON(l > r); - EBUG_ON(l > t->size); + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset); + EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); + EBUG_ON(idx + 1 < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx + 1].offset); - return l; + return idx; } static inline unsigned bfloat_mantissa(const struct bkey_float *f, @@ -1150,13 +1152,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b, if (!bset_has_rw_aux_tree(t)) return; + /* returns first entry >= where */ l = rw_aux_tree_bsearch(b, t, where); - /* l is first >= than @where */ - - EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); - EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); - if (!l) /* never delete first entry */ l++; else if (l < t->size && diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 2ca3b1f0236f..fcd660470e52 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -371,6 +371,11 @@ __btree_node_offset_to_key(const struct btree *b, u16 k) return (void *) ((u64 *) b->data + k + 1); } +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) +{ + return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); +} + #define btree_bkey_first(_b, _t) (bset(_b, _t)->start) #define btree_bkey_last(_b, _t) \ -- cgit From 1fe08f31b2af8ca93e7ee211ac2799d2ef2aae24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 22:23:44 -0400 Subject: bcachefs: bkey_written() also cleanups of btree node offsets Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 45 +++++++++++++-------------- fs/bcachefs/btree_io.c | 18 +++++------ fs/bcachefs/btree_types.h | 62 +++++++++++++++++++++++-------------- fs/bcachefs/btree_update_interior.h | 21 +++++++------ fs/bcachefs/btree_update_leaf.c | 4 +-- fs/bcachefs/extents.c | 10 +++--- 6 files changed, 87 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index a74e93a7215c..cf83911b3f5d 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -20,12 +20,14 @@ struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { + unsigned offset = __btree_node_key_to_offset(b, k); struct bset_tree *t; for_each_bset(b, t) - if (k >= btree_bkey_first(b, t) && - k < btree_bkey_last(b, t)) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); return t; + } BUG(); } @@ -172,34 +174,29 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) { - struct btree_node_iter_set *set, *prev = NULL; + struct btree_node_iter_set *set, *s2; struct bset_tree *t; - struct bkey_packed *k, *first; - if (bch2_btree_node_iter_end(iter)) - return; + /* Verify no duplicates: */ + btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); + /* Verify that set->end is correct: */ 
btree_node_iter_for_each(iter, set) { - k = __btree_node_offset_to_key(b, set->k); - t = bch2_bkey_to_bset(b, k); - - BUG_ON(__btree_node_offset_to_key(b, set->end) != - btree_bkey_last(b, t)); - - BUG_ON(prev && - btree_node_iter_cmp(iter, b, *prev, *set) > 0); - - prev = set; + for_each_bset(b, t) + if (set->end == t->end_offset) + goto found; + BUG(); +found: + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); } - first = __btree_node_offset_to_key(b, iter->data[0].k); - - for_each_bset(b, t) - if (bch2_btree_node_iter_bset_pos(iter, b, t) == - btree_bkey_last(b, t) && - (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) - BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, - k, first) > 0); + /* Verify iterator is sorted: */ + btree_node_iter_for_each(iter, set) + BUG_ON(set != iter->data && + btree_node_iter_cmp(iter, b, set[-1], set[0]) > 0); } void bch2_verify_key_order(struct btree *b, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2d004941c52e..5c36acef2b13 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -309,7 +309,7 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, if (mode == COMPACT_LAZY) { if (should_compact_bset_lazy(b, t) || - (compacting && bset_unwritten(b, bset(b, t)))) + (compacting && !bset_written(b, bset(b, t)))) return dead_u64s; } else { if (bset_written(b, bset(b, t))) @@ -356,7 +356,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, struct bkey_packed *k, *n, *out, *start, *end; struct btree_node_entry *src = NULL, *dst = NULL; - if (t != b->set && bset_unwritten(b, i)) { + if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), (void *) btree_bkey_last(b, t -1)); @@ -396,7 +396,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, continue; if (bkey_whiteout(k)) { - unreserve_whiteout(b, t, k); + unreserve_whiteout(b, k); memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); @@ -467,7 +467,7 @@ static bool bch2_drop_whiteouts(struct btree *b) start = btree_bkey_first(b, t); end = btree_bkey_last(b, t); - if (bset_unwritten(b, i) && + if (!bset_written(b, i) && t != b->set) { struct bset *dst = max_t(struct bset *, write_block(b), @@ -829,7 +829,7 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, for (unwritten_idx = 0; unwritten_idx < b->nsets; unwritten_idx++) - if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) break; if (b->nsets - unwritten_idx > 1) { @@ -852,7 +852,7 @@ void bch2_btree_build_aux_trees(struct btree *b) for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, - bset_unwritten(b, bset(b, t)) && + !bset_written(b, bset(b, t)) && t == bset_tree_last(b)); } @@ -1949,9 +1949,9 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); /* - * Note: immediately after write, bset_unwritten()/bset_written() don't - * work - the amount of data we had to write after compaction might have - * been smaller than the offset of the last bset. + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. 
* * However, we know that all bsets have been written here, as long as * we're still holding the write lock: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index fcd660470e52..5376388e91e6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -340,10 +340,38 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) return b->set + b->nsets - 1; } +static inline void * +__btree_node_offset_to_ptr(const struct btree *b, u16 offset) +{ + return (void *) ((u64 *) b->data + 1 + offset); +} + +static inline u16 +__btree_node_ptr_to_offset(const struct btree *b, const void *p) +{ + u16 ret = (u64 *) p - 1 - (u64 *) b->data; + + EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); + return ret; +} + static inline struct bset *bset(const struct btree *b, const struct bset_tree *t) { - return (void *) b->data + t->data_offset * sizeof(u64); + return __btree_node_offset_to_ptr(b, t->data_offset); +} + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = __btree_node_ptr_to_offset(b, i); + set_btree_bset_end(b, t); } static inline struct bset *btree_bset_first(struct btree *b) @@ -359,16 +387,13 @@ static inline struct bset *btree_bset_last(struct btree *b) static inline u16 __btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) { - size_t ret = (u64 *) k - (u64 *) b->data - 1; - - EBUG_ON(ret > U16_MAX); - return ret; + return __btree_node_ptr_to_offset(b, k); } static inline struct bkey_packed * __btree_node_offset_to_key(const struct btree *b, u16 k) { - return (void *) ((u64 *) b->data + k + 1); + return __btree_node_offset_to_ptr(b, k); } static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) @@ -376,7 +401,13 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); } -#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) +#define btree_bkey_first(_b, _t) \ +({ \ + EBUG_ON(bset(_b, _t)->start != \ + __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ + \ + bset(_b, _t)->start; \ +}) #define btree_bkey_last(_b, _t) \ ({ \ @@ -386,23 +417,6 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); - btree_bkey_last(b, t); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = (u64 *) i - (u64 *) b->data; - - EBUG_ON(bset(b, t) != i); - - set_btree_bset_end(b, t); -} - static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7a19a52bbcff..711fbe63eb3a 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -241,14 +241,19 @@ static inline void *write_block(struct btree *b) return (void *) b->data + (b->written << 9); } +static inline bool __btree_addr_written(struct btree *b, void *p) +{ + return p < write_block(b); +} + static inline bool bset_written(struct btree *b, struct bset *i) { - return (void *) i < write_block(b); + 
return __btree_addr_written(b, i); } -static inline bool bset_unwritten(struct btree *b, struct bset *i) +static inline bool bkey_written(struct btree *b, struct bkey_packed *k) { - return (void *) i > write_block(b); + return __btree_addr_written(b, k); } static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, @@ -307,10 +312,9 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { EBUG_ON(b->uncompacted_whiteout_u64s < bkeyp_key_u64s(&b->format, k)); b->uncompacted_whiteout_u64s -= @@ -318,10 +322,9 @@ static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, } } -static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { BUG_ON(!k->needs_whiteout); b->uncompacted_whiteout_u64s += bkeyp_key_u64s(&b->format, k); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 32126b02ce3a..ce0223bd52b5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -40,7 +40,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, t = bch2_bkey_to_bset(b, k); - if (bset_unwritten(b, bset(b, t)) && + if (!bkey_written(b, k) && bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && !bkey_whiteout(&insert->k)) { k->type = insert->k.type; @@ -76,7 +76,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->u64s, k->u64s); if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, t, k); + reserve_whiteout(b, k); return true; } else { k->needs_whiteout = false; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 02a49d9845fb..803272b10e61 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1429,7 +1429,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * what k points to) */ bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + split.k.k.needs_whiteout |= bkey_written(b, _k); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); @@ -1499,9 +1499,9 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s) bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); + reserve_whiteout(b, _k); } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { + bkey_written(b, _k)) { struct bkey_i discard = *insert; discard.k.type = KEY_TYPE_DISCARD; @@ -1573,13 +1573,13 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) break; if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) + (k.k->needs_whiteout || bkey_written(b, _k))) insert->k.needs_whiteout = true; if (overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); + unreserve_whiteout(b, _k); _k->needs_whiteout = false; } squash: -- cgit From 0fdf18047fd38e7b5cc6adba3a81704c88333e1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Aug 2018 23:03:41 -0400 Subject: bcachefs: extent unit tests Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 
62 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c522fb795e63..a408fa9ed8b5 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -271,6 +271,63 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) bch2_btree_iter_unlock(&iter); } +/* extent unit tests */ + +u64 test_version; + +static void insert_test_extent(struct bch_fs *c, + u64 start, u64 end) +{ + struct bkey_i_cookie k; + int ret; + + //pr_info("inserting %llu-%llu v %llu", start, end, test_version); + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); +} + +static void __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) +{ + insert_test_extent(c, e1_start, e1_end); + insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); +} + +static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 0, 32); + __test_extent_overwrite(c, 8, 64, 0, 32); +} + +static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 64); + __test_extent_overwrite(c, 0, 64, 32, 72); +} + +static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 40); +} + +static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 32, 64, 0, 64); + __test_extent_overwrite(c, 32, 64, 0, 128); + __test_extent_overwrite(c, 32, 64, 32, 64); + __test_extent_overwrite(c, 32, 64, 32, 128); +} + /* perf tests */ static u64 test_rand(void) @@ -499,6 +556,11 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_iterate_slots); perf_test(test_iterate_slots_extents); + perf_test(test_extent_overwrite_front); + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + if (!j.fn) { pr_err("unknown test %s", testname); return; -- cgit From 271a3d3a4b30dcd9fd274a923fb382f5f113d279 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jul 2016 19:05:06 -0800 Subject: bcachefs: lift ordering restriction on 0 size extents This lifts the restriction that 0 size extents must not overlap with other extents, which means we can now sort extents and non extents the same way, and will let us simplify a bunch of other stuff as well. 
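Concretely, the unified order is the one __btree_node_iter_cmp() implements in the diff below: keys sort by position, and when positions compare equal, deleted keys (the representation used for 0 size extents/whiteouts) now sort first for extent and non-extent btrees alike, where previously extent btrees had to sort them last. A simplified, standalone sketch of that rule, using a hypothetical flat key type in place of the packed on-disk format:

	/*
	 * Illustrative sketch only - demo_key is a made-up flat key standing
	 * in for struct bkey_packed; the real comparator goes through
	 * bkey_cmp_packed() on the packed representation.
	 */
	struct demo_key {
		u64	pos;
		int	deleted;
	};

	static int demo_key_cmp(const struct demo_key *l, const struct demo_key *r)
	{
		if (l->pos != r->pos)
			return l->pos < r->pos ? -1 : 1;

		/* positions equal: deleted keys sort first */
		if (l->deleted != r->deleted)
			return r->deleted - l->deleted;

		/* final tiebreak on address so the order is total */
		return (l > r) - (l < r);
	}
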
Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 193 ++++++------- fs/bcachefs/bset.h | 68 ++--- fs/bcachefs/btree_gc.c | 2 - fs/bcachefs/btree_io.c | 10 +- fs/bcachefs/btree_io.h | 9 - fs/bcachefs/btree_iter.c | 175 ++++++++---- fs/bcachefs/btree_types.h | 5 - fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/btree_update_leaf.c | 9 +- fs/bcachefs/extents.c | 547 +++++++++++++++--------------------- 10 files changed, 475 insertions(+), 547 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index cf83911b3f5d..27fa3e230e6e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -18,6 +18,9 @@ #include #include +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, + struct btree *); + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); @@ -63,8 +66,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _n = bkey_next(_k); bch2_bkey_to_text(buf, sizeof(buf), &k); - printk(KERN_ERR "block %u key %zi/%u: %s\n", set, - _k->_data - i->_data, i->u64s, buf); + printk(KERN_ERR "block %u key %5u: %s\n", set, + __btree_node_key_to_offset(b, _k), buf); if (_n == vstruct_last(i)) continue; @@ -120,20 +123,6 @@ void bch2_dump_btree_node_iter(struct btree *b, #ifdef CONFIG_BCACHEFS_DEBUG -static bool keys_out_of_order(struct btree *b, - const struct bkey_packed *prev, - const struct bkey_packed *next, - bool is_extents) -{ - struct bkey nextu = bkey_unpack_key(b, next); - - return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || - ((is_extents - ? !bkey_deleted(next) - : !bkey_deleted(prev)) && - !bkey_cmp_packed(b, prev, next)); -} - void __bch2_verify_btree_nr_keys(struct btree *b) { struct bset_tree *t; @@ -150,16 +139,21 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) +static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree *b) { - const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); + struct btree_node_iter iter = *_iter; + const struct bkey_packed *k, *n; + + k = bch2_btree_node_iter_peek_all(&iter, b); + __bch2_btree_node_iter_advance(&iter, b); + n = bch2_btree_node_iter_peek_all(&iter, b); bkey_unpack_key(b, k); if (n && - keys_out_of_order(b, k, n, iter->is_extents)) { + __btree_node_iter_cmp(b, k, n) > 0) { + struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); char buf1[80], buf2[80]; @@ -167,12 +161,22 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, bch2_dump_btree_node(b); bch2_bkey_to_text(buf1, sizeof(buf1), &ku); bch2_bkey_to_text(buf2, sizeof(buf2), &nu); - panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", + buf1, buf2); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + printk(" [%zi %zi]", t - b->set, + k->_data - bset(b, t)->_data); + } + panic("\n"); } } void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) + struct btree *b) { struct btree_node_iter_set *set, *s2; struct bset_tree *t; @@ -196,72 +200,72 @@ found: /* Verify iterator is sorted: */ 
btree_node_iter_for_each(iter, set) BUG_ON(set != iter->data && - btree_node_iter_cmp(iter, b, set[-1], set[0]) > 0); + btree_node_iter_cmp(b, set[-1], set[0]) > 0); } -void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) +void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *k, *prev; - struct bkey uk, uw = bkey_unpack_key(b, where); - - k = bch2_bkey_prev_all(b, t, where); - if (k && - keys_out_of_order(b, k, where, iter->is_extents)) { - char buf1[100], buf2[100]; + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); +#if 0 + BUG_ON(prev && + __btree_node_iter_cmp(b, prev, insert) > 0); +#else + if (prev && + __btree_node_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); + char buf1[100]; + char buf2[100]; bch2_dump_btree_node(b); - uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf1, sizeof(buf1), &uk); - bch2_bkey_to_text(buf2, sizeof(buf2), &uw); - panic("out of order with prev:\n%s\n%s\n", - buf1, buf2); + bch2_bkey_to_text(buf1, sizeof(buf1), &k1); + bch2_bkey_to_text(buf2, sizeof(buf2), &k2); + + panic("prev > insert:\n" + "prev key %5u %s\n" + "insert key %5u %s\n", + __btree_node_key_to_offset(b, prev), buf1, + __btree_node_key_to_offset(b, insert), buf2); } +#endif +#if 0 + BUG_ON(next != btree_bkey_last(b, t) && + __btree_node_iter_cmp(b, insert, next) > 0); +#else + if (next != btree_bkey_last(b, t) && + __btree_node_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); + char buf1[100]; + char buf2[100]; - k = bkey_next(where); - BUG_ON(k != btree_bkey_last(b, t) && - keys_out_of_order(b, where, k, iter->is_extents)); - - for_each_bset(b, t) { - if (where >= btree_bkey_first(b, t) || - where < btree_bkey_last(b, t)) - continue; - - k = bch2_btree_node_iter_bset_pos(iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - - while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && - (prev = bch2_bkey_prev_all(b, t, k))) - k = prev; - - for (; - k != btree_bkey_last(b, t); - k = bkey_next(k)) { - uk = bkey_unpack_key(b, k); - - if (iter->is_extents) { - BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || - bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); - } else { - BUG_ON(!bkey_cmp(uw.p, uk.p) && - !bkey_deleted(&uk)); - } - - if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) - break; - } + bch2_dump_btree_node(b); + bch2_bkey_to_text(buf1, sizeof(buf1), &k1); + bch2_bkey_to_text(buf2, sizeof(buf2), &k2); + + panic("insert > next:\n" + "insert key %5u %s\n" + "next key %5u %s\n", + __btree_node_key_to_offset(b, insert), buf1, + __btree_node_key_to_offset(b, next), buf2); } +#endif +} + +void bch2_verify_key_order(struct btree *b, + struct btree_node_iter *_iter, + struct bkey_packed *where) +{ + bch2_verify_insert_pos(b, where, where, where->u64s); } #else static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) {} + struct btree *b) {} #endif @@ -1229,6 +1233,7 @@ void bch2_bset_insert(struct btree *b, struct bkey_packed packed, *src = bkey_to_packed(insert); bch2_bset_verify_rw_aux_tree(b, t); + bch2_verify_insert_pos(b, where, bkey_to_packed(insert), 
clobber_u64s); if (bch2_bkey_pack_key(&packed, &insert->k, f)) src = &packed; @@ -1255,7 +1260,6 @@ void bch2_bset_insert(struct btree *b, bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - bch2_verify_key_order(b, iter, where); bch2_verify_btree_nr_keys(b); } @@ -1461,7 +1465,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + bool strictly_greater) { struct bset_tree *t; @@ -1518,7 +1522,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, */ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + bool strictly_greater) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; @@ -1526,7 +1530,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); bset_aux_tree_verify(b); - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { case BKEY_PACK_POS_EXACT: @@ -1537,7 +1541,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, break; case BKEY_PACK_POS_FAIL: btree_node_iter_init_pack_failed(iter, b, search, - strictly_greater, is_extents); + strictly_greater); return; } @@ -1552,12 +1556,11 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, } void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b, - bool is_extents) + struct btree *b) { struct bset_tree *t; - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, @@ -1585,7 +1588,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, { bool ret; - if ((ret = (btree_node_iter_cmp(iter, b, + if ((ret = (btree_node_iter_cmp(b, iter->data[first], iter->data[first + 1]) > 0))) swap(iter->data[first], iter->data[first + 1]); @@ -1640,23 +1643,14 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, btree_node_iter_sort_two(iter, b, 1); } -/** - * bch_btree_node_iter_advance - advance @iter by one key - * - * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might - * momentarily have out of order extents. 
- */ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); - - __bch2_btree_node_iter_advance(iter, b); - bch2_btree_node_iter_next_check(iter, b, k); -#else - __bch2_btree_node_iter_advance(iter, b); + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); #endif + __bch2_btree_node_iter_advance(iter, b); } static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) @@ -1689,8 +1683,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_bset_pos(iter, b, t), min_key_type); if (k && - (!prev || __btree_node_iter_cmp(iter->is_extents, b, - k, prev) > 0)) { + (!prev || __btree_node_iter_cmp(b, k, prev) > 0)) { prev = k; end = t->end_offset; } @@ -1723,11 +1716,11 @@ out: struct btree_node_iter iter2 = *iter; if (prev) - bch2_btree_node_iter_advance(&iter2, b); + __bch2_btree_node_iter_advance(&iter2, b); while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { BUG_ON(k->type >= min_key_type); - bch2_btree_node_iter_advance(&iter2, b); + __bch2_btree_node_iter_advance(&iter2, b); } } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 2fa71d7c0e8a..0787030ccc7e 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -369,6 +369,17 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp(struct btree_iter *iter, + const struct bkey *k) +{ + int cmp = bkey_cmp(k->p, iter->pos); + + return cmp > 0 || + (cmp == 0 && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); +} + /* Returns true if @k is after iterator position @pos */ static inline bool btree_iter_pos_cmp_packed(const struct btree *b, struct bpos *pos, @@ -419,7 +430,7 @@ enum bch_extent_overlap { /* Returns how k overlaps with m */ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) + const struct bkey *m) { int cmp1 = bkey_cmp(k->p, m->p) < 0; int cmp2 = bkey_cmp(bkey_start_pos(k), @@ -430,20 +441,13 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, /* Btree key iteration */ -static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, - bool is_extents) -{ - iter->is_extents = is_extents; - memset(iter->data, 0, sizeof(iter->data)); -} - void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, const struct bkey_packed *, const struct bkey_packed *); void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos, bool, bool); + struct bpos, bool); void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *, bool); + struct btree *); struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, struct btree *, struct bset_tree *); @@ -470,32 +474,21 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) return __btree_node_iter_set_end(iter, 0); } -static inline int __btree_node_iter_cmp(bool is_extents, - struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int __btree_node_iter_cmp(struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { - /* - * For non extents, when keys compare equal the deleted keys have to - * come first - so that bch2_btree_node_iter_next_check() can 
detect - * duplicate nondeleted keys (and possibly other reasons?) - * - * For extents, bkey_deleted() is used as a proxy for k->size == 0, so - * deleted keys have to sort last. - */ + /* When keys compare equal deleted keys come first */ return bkey_cmp_packed(b, l, r) - ?: (is_extents - ? (int) bkey_deleted(l) - (int) bkey_deleted(r) - : (int) bkey_deleted(r) - (int) bkey_deleted(l)) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (l > r) - (l < r); } -static inline int btree_node_iter_cmp(struct btree_node_iter *iter, - struct btree *b, +static inline int btree_node_iter_cmp(struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { - return __btree_node_iter_cmp(iter->is_extents, b, + return __btree_node_iter_cmp(b, __btree_node_offset_to_key(b, l.k), __btree_node_offset_to_key(b, r.k)); } @@ -582,21 +575,12 @@ bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1); } -/* - * Iterates over all _live_ keys - skipping deleted (and potentially - * overlapping) keys - */ -#define for_each_btree_node_key(b, k, iter, _is_extents) \ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ - ((k) = bch2_btree_node_iter_peek(iter, b)); \ - bch2_btree_node_iter_advance(iter, b)) - struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, struct bkey *); -#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ +#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ bch2_btree_node_iter_advance(iter, b)) @@ -646,6 +630,8 @@ void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); void __bch2_verify_btree_nr_keys(struct btree *); void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, + struct bkey_packed *, unsigned); void bch2_verify_key_order(struct btree *, struct btree_node_iter *, struct bkey_packed *); @@ -654,6 +640,10 @@ void bch2_verify_key_order(struct btree *, struct btree_node_iter *, static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) {} static inline void bch2_verify_key_order(struct btree *b, struct btree_node_iter *iter, struct bkey_packed *where) {} diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1fbb9c657fc6..2526118fe9ce 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -217,7 +217,6 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) if (btree_node_has_ptrs(b)) for_each_btree_node_key_unpack(b, k, &iter, - btree_node_is_extents(b), &unpacked) { bch2_bkey_debugcheck(c, b, k); stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); @@ -1044,7 +1043,6 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) struct bkey_s_c k; for_each_btree_node_key_unpack(b, k, &node_iter, - btree_node_is_extents(b), &unpacked) { ret = bch2_btree_mark_key_initial(c, btree_node_type(b), k); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5c36acef2b13..889870582566 
100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -22,7 +22,7 @@ /* btree_node_iter_large: */ #define btree_node_iter_cmp_heap(h, _l, _r) \ - __btree_node_iter_cmp((iter)->is_extents, b, \ + __btree_node_iter_cmp(b, \ __btree_node_offset_to_key(b, (_l).k), \ __btree_node_offset_to_key(b, (_r).k)) @@ -248,6 +248,9 @@ static unsigned sort_extent_whiteouts(struct bkey_packed *dst, sort_iter_sort(iter, sort_extent_whiteouts_cmp); while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + if (bkey_deleted(in)) + continue; + EBUG_ON(bkeyp_val_u64s(f, in)); EBUG_ON(in->type != KEY_TYPE_DISCARD); @@ -785,8 +788,7 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_bset_set_no_aux_tree(dst, dst->set); - bch2_btree_node_iter_init_from_start(&src_iter, src, - btree_node_is_extents(src)); + bch2_btree_node_iter_init_from_start(&src_iter, src); if (btree_node_ops(src)->key_normalize || btree_node_ops(src)->key_merge) @@ -1171,7 +1173,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry int ret, retry_read = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b)); + iter->used = 0; if (bch2_meta_read_fault("btree")) btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 0688ce420610..7835f8a9e3a0 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -146,20 +146,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); /* Sorting */ struct btree_node_iter_large { - u8 is_extents; u16 used; struct btree_node_iter_set data[MAX_BSETS]; }; -static inline void -__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter, - bool is_extents) -{ - iter->used = 0; - iter->is_extents = is_extents; -} - void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *, struct btree *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8918268f99f4..9d92826181dc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -375,14 +375,20 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree_node_iter tmp = l->iter; struct bkey_packed *k; + if (iter->uptodate > BTREE_ITER_NEED_PEEK) + return; + bch2_btree_node_iter_verify(&l->iter, b); /* * For interior nodes, the iterator will have skipped past * deleted keys: + * + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) */ - k = b->level - ? bch2_btree_node_iter_prev(&tmp, b) + k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS + ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, iter->flags & BTREE_ITER_IS_EXTENTS)) { @@ -390,7 +396,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("prev key should be before after pos:\n%s\n%llu:%llu\n", + panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } @@ -401,15 +407,16 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("next key should be before iter pos:\n%llu:%llu\n%s\n", + panic("iter should be after current key:\n" + "iter pos %llu:%llu\n" + "cur key %s\n", iter->pos.inode, iter->pos.offset, buf); } - if (iter->uptodate == BTREE_ITER_UPTODATE && - (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) { - BUG_ON(!bkey_whiteout(&iter->k) && - bch2_btree_node_iter_end(&l->iter)); - } + BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && + (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS && + !bkey_whiteout(&iter->k) && + bch2_btree_node_iter_end(&l->iter)); } void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) @@ -420,6 +427,11 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) __bch2_btree_iter_verify(linked, b); } +#else + +static inline void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) {} + #endif static void __bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -434,7 +446,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; - unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; + unsigned old_end = t->end_offset - shift; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -456,7 +468,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, } return; found: - set->end = (int) set->end + shift; + set->end = t->end_offset; /* Iterator hasn't gotten to the key that changed yet: */ if (set->k < offset) @@ -517,8 +529,7 @@ iter_current_key_not_modified: k = bch2_bkey_prev_all(b, t, bch2_btree_node_iter_bset_pos(node_iter, b, t)); if (k && - __btree_node_iter_cmp(node_iter, b, - k, where) > 0) { + __btree_node_iter_cmp(b, k, where) > 0) { struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, bkey_next(k)); @@ -557,10 +568,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); - - /* interior node iterators are... special... 
*/ - if (!b->level) - bch2_btree_iter_verify(iter, b); } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, @@ -647,17 +654,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct btree_iter *iter, - const struct bkey *k) -{ - int cmp = bkey_cmp(k->p, iter->pos); - - return cmp > 0 || - (cmp == 0 && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); -} - static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { @@ -679,8 +675,7 @@ static inline void __btree_iter_init(struct btree_iter *iter, struct btree_iter_level *l = &iter->l[b->level]; bch2_btree_node_iter_init(&l->iter, b, iter->pos, - iter->flags & BTREE_ITER_IS_EXTENTS, - btree_node_is_extents(b)); + iter->flags & BTREE_ITER_IS_EXTENTS); /* Skip to first non whiteout: */ if (b->level) @@ -1022,7 +1017,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_verify_locks(iter); + __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } @@ -1363,9 +1360,10 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) } static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; struct bkey_s_c k; struct bkey n; int ret; @@ -1376,6 +1374,17 @@ recheck: bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) __btree_iter_advance(l); + /* + * iterator is now at the correct position for inserting at iter->pos, + * but we need to keep iterating until we find the first non whiteout so + * we know how big a hole we have, if any: + */ + + node_iter = l->iter; + if (k.k && bkey_whiteout(k.k)) + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + /* * If we got to the end of the node, check if we need to traverse to the * next node: @@ -1392,6 +1401,13 @@ recheck: if (k.k && !bkey_whiteout(k.k) && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* + * if we skipped forward to find the first non whiteout and + * there _wasn't_ actually a hole, we want the iterator to be + * pointed at the key we found: + */ + l->iter = node_iter; + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); EBUG_ON(bkey_deleted(k.k)); iter->uptodate = BTREE_ITER_UPTODATE; @@ -1399,41 +1415,88 @@ recheck: } /* hole */ + + /* holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + iter->pos = bkey_successor(iter->pos); + goto recheck; + } + + if (!k.k) + k.k = &l->b->key.k; + bkey_init(&n); n.p = iter->pos; + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? 
bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); + + //EBUG_ON(!n.size); + if (!n.size) { + char buf[100]; + bch2_dump_btree_node(iter->l[0].b); + + bch2_bkey_to_text(buf, sizeof(buf), k.k); + panic("iter at %llu:%llu\n" + "next key %s\n", + iter->pos.inode, + iter->pos.offset, + buf); + } - if (iter->flags & BTREE_ITER_IS_EXTENTS) { - if (n.p.offset == KEY_OFFSET_MAX) { - if (n.p.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - iter->pos = bkey_successor(iter->pos); - goto recheck; - } + iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; +} - if (k.k && bkey_whiteout(k.k)) { - struct btree_node_iter node_iter = l->iter; +static inline struct bkey_s_c +__bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); - } + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); - if (!k.k) - k.k = &l->b->key.k; +recheck: + while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && + bkey_deleted(k.k) && + bkey_cmp(k.k->p, iter->pos) == 0) + __btree_iter_advance(l); - bch2_key_resize(&n, - min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) - : KEY_OFFSET_MAX) - - n.p.offset)); + /* + * If we got to the end of the node, check if we need to traverse to the + * next node: + */ + if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); - EBUG_ON(!n.size); + goto recheck; } - iter->k = n; - iter->uptodate = BTREE_ITER_UPTODATE; - return (struct bkey_s_c) { &iter->k, NULL }; + if (k.k && + !bkey_deleted(k.k) && + !bkey_cmp(iter->pos, k.k->p)) { + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } else { + /* hole */ + bkey_init(&iter->k); + iter->k.p = iter->pos; + + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; + } } struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5376388e91e6..d57ca3d08c16 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -176,8 +176,6 @@ struct btree_cache { }; struct btree_node_iter { - u8 is_extents; - struct btree_node_iter_set { u16 k, end; } data[MAX_BSETS]; @@ -459,9 +457,6 @@ struct btree_root { * we're holding the write lock and we know what key is about to be overwritten: */ -struct btree_iter; -struct btree_node_iter; - enum btree_insert_ret { BTREE_INSERT_OK, /* extent spanned multiple leaf nodes: have to traverse to next node: */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b60eb3d33c7b..1fe6f1e3e843 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -35,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->level); - bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); + bch2_btree_node_iter_init(&iter, b, b->key.k.p, false); #if 1 BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || bkey_cmp_left_packed(b, k, &b->key.k.p)); @@ -1322,7 +1322,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); - bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false); + 
bch2_btree_node_iter_init(&node_iter, b, k->k.p, false); while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ce0223bd52b5..0ef519e8feed 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -64,7 +64,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { bch2_bset_delete(b, k, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, t, - k, clobber_u64s, 0); + k, clobber_u64s, 0); + bch2_btree_iter_verify(iter, b); return true; } @@ -73,7 +74,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_DELETED; bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - k->u64s, k->u64s); + k->u64s, k->u64s); + bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { reserve_whiteout(b, k); @@ -98,7 +100,8 @@ overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - clobber_u64s, k->u64s); + clobber_u64s, k->u64s); + bch2_btree_iter_verify(iter, b); return true; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 803272b10e61..df04af882c16 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -858,30 +858,34 @@ void bch2_key_resize(struct bkey *k, * that we have to unpack the key, modify the unpacked key - then this * copies/repacks the unpacked to the original as necessary. */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static void extent_save(struct btree *b, struct bkey_packed *dst, + struct bkey *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; - bool ret; - if ((dst_unpacked = packed_to_bkey(dst))) { + if ((dst_unpacked = packed_to_bkey(dst))) dst_unpacked->k = *src; - ret = true; - } else { - ret = bch2_bkey_pack_key(dst, src, f); - } - - if (ret && iter) - bch2_verify_key_order(b, iter, dst); - - return ret; + else + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static bool extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) { - BUG_ON(!__extent_save(b, iter, dst, src)); + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + struct bkey_packed tmp; + + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = src->k; + else if (bch2_bkey_pack_key(&tmp, &src->k, f)) + memcpy_u64s(dst, &tmp, f->key_u64s); + else + return false; + + memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); + return true; } /* @@ -1010,7 +1014,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, sort_key_next(iter, b, _r); } else { __bch2_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); + extent_save(b, rk, r.k); } extent_sort_sift(iter, b, _r - iter->data); @@ -1024,7 +1028,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); __bch2_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); + extent_save(b, lk, l.k); extent_sort_sift(iter, b, 0); @@ -1032,7 +1036,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, bkey_to_packed(&tmp.k)); } else { bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); + extent_save(b, lk, l.k); } } @@ -1135,6 
+1139,55 @@ extent_insert_should_stop(struct extent_insert_state *s) return BTREE_INSERT_OK; } +static void verify_extent_nonoverlapping(struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(buf1, sizeof(buf1), &insert->k); + bch2_bkey_to_text(buf2, sizeof(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void verify_modified_extent(struct btree_iter *iter, + struct bkey_packed *k) +{ + bch2_btree_iter_verify(iter, iter->l[0].b); + bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); +} + static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { @@ -1148,6 +1201,14 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, unsigned clobber_u64s; EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(l->b, &l->iter, insert); + + if (!prev) { + while ((prev = bch2_bkey_prev_all(l->b, t, where)) && + (bkey_cmp_left_packed(l->b, prev, &insert->k.p) ?: + ((int) bkey_deleted(&insert->k) - (int) bkey_deleted(prev))) > 0) + where = prev; + } if (prev) where = bkey_next(prev); @@ -1173,12 +1234,15 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); + clobber_u64s, where->u64s); + bch2_verify_key_order(l->b, &l->iter, where); + bch2_btree_iter_verify(iter, l->b); return; drop_deleted_keys: bch2_bset_delete(l->b, where, clobber_u64s); bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, clobber_u64s, 0); + bch2_btree_iter_verify(iter, l->b); } static void extent_insert_committed(struct extent_insert_state *s) @@ -1226,8 +1290,10 @@ static void extent_insert_committed(struct extent_insert_state *s) bch2_btree_journal_key(s->trans, iter, &split.k); - if (!s->deleting) + if (!s->deleting) { + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); extent_bset_insert(c, iter, &split.k); + } done: bch2_btree_iter_set_pos_same_leaf(iter, s->committed); @@ -1345,22 +1411,21 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(b, _k, k.k); + bch2_verify_key_order(b, &l->iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + 
extent_save(b, _k, k.k); /* * As the auxiliary tree is indexed by the end of the @@ -1368,46 +1433,31 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * auxiliary tree. */ bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + bch2_btree_node_iter_fix(iter, b, &l->iter, t, + _k, _k->u64s, _k->u64s); + bch2_verify_key_order(b, &l->iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) btree_keys_account_key_drop(&b->nr, t - b->set, _k); bch2_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - ret = extent_insert_advance_pos(s, k.s_c); - if (ret != BTREE_INSERT_OK) - return ret; + if (t == bset_tree_last(l->b)) { + unsigned u64s = _k->u64s; - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, b, &l->iter, t, + _k, u64s, 0); + bch2_btree_iter_verify(iter, b); } else { - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + extent_save(b, _k, k.k); + bch2_btree_node_iter_fix(iter, b, &l->iter, t, + _k, _k->u64s, _k->u64s); + bch2_verify_key_order(b, &l->iter, _k); } break; @@ -1436,7 +1486,8 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(b, _k, k.k); + bch2_verify_key_order(b, &l->iter, _k); bch2_add_sectors(s, bkey_i_to_s_c(&split.k), bkey_start_offset(&split.k.k), @@ -1450,26 +1501,20 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, } static enum btree_insert_ret -__bch2_delete_fixup_extent(struct extent_insert_state *s) +__bch2_insert_fixup_extent(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; enum btree_insert_ret ret = BTREE_INSERT_OK; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - s->whiteout.k.type = KEY_TYPE_DISCARD; - while (bkey_cmp(s->committed, insert->k.p) < 0 && (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, KEY_TYPE_DISCARD))) { struct bset_tree *t = bch2_bkey_to_bset(b, _k); struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); enum bch_extent_overlap overlap; @@ -1480,112 +1525,92 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s) if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - if (bkey_whiteout(k.k)) { - s->committed = bpos_min(insert->k.p, k.k->p); - goto next; - } - - overlap = bch2_extent_overlap(&insert->k, k.k); - ret = extent_insert_advance_pos(s, 
k.s_c); if (ret) break; - s->do_journal = true; + overlap = bch2_extent_overlap(&insert->k, k.k); - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, _k); - } else if (k.k->needs_whiteout || - bkey_written(b, _k)) { - struct bkey_i discard = *insert; - - discard.k.type = KEY_TYPE_DISCARD; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } + if (!s->deleting) { + if (k.k->needs_whiteout || bkey_written(b, _k)) + insert->k.needs_whiteout = true; - discard.k.needs_whiteout = true; + if (overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(b, _k); + _k->needs_whiteout = false; + } ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - - extent_bset_insert(c, iter, &discard); } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } -next: - bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - } - - return ret; -} - -static enum btree_insert_ret -__bch2_insert_fixup_extent(struct extent_insert_state *s) -{ - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + if (bkey_whiteout(k.k)) + goto next; + + s->do_journal = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL) { + btree_keys_account_key_drop(&b->nr, + t - b->set, _k); + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + _k->type = KEY_TYPE_DISCARD; + reserve_whiteout(b, _k); + } else if (k.k->needs_whiteout || + bkey_written(b, _k)) { + struct bkey_i discard = *insert; + + discard.k.type = KEY_TYPE_DISCARD; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + bch2_cut_front(bkey_start_pos(k.k), &discard); + break; + case BCH_EXTENT_OVERLAP_BACK: + bch2_cut_back(k.k->p, &discard.k); + break; + default: + break; + } - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; + discard.k.needs_whiteout = true; - overlap = bch2_extent_overlap(&insert->k, k.k); + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); - if (!k.k->size) - goto squash; + extent_bset_insert(c, iter, &discard); + } else { + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); + } +next: + bch2_cut_front(s->committed, insert); + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + } - /* - * Only call advance pos & call hook for nonzero size extents: - */ - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) + if (ret != BTREE_INSERT_OK || + overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == 
BCH_EXTENT_OVERLAP_MIDDLE) break; + } - if (k.k->size && - (k.k->needs_whiteout || bkey_written(b, _k))) - insert->k.needs_whiteout = true; + if (ret == BTREE_INSERT_OK && + bkey_cmp(s->committed, insert->k.p) < 0) + ret = extent_insert_advance_pos(s, bkey_s_c_null); - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, _k); - _k->needs_whiteout = false; - } -squash: - ret = extent_squash(s, insert, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) - break; + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + { + struct btree_node_iter node_iter = l->iter; + struct bkey uk; + + while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + (uk = bkey_unpack_key(l->b, _k), + bkey_cmp(uk.p, s->committed) > 0)) + l->iter = node_iter; } return ret; @@ -1647,6 +1672,11 @@ bch2_insert_fixup_extent(struct btree_insert *trans, .deleting = bkey_whiteout(&insert->k->k), }; + if (s.deleting) { + s.whiteout = *insert->k; + s.whiteout.k.type = KEY_TYPE_DISCARD; + } + EBUG_ON(iter->level); EBUG_ON(!insert->k->k.size); @@ -1657,6 +1687,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans, * @insert->k and the node iterator that we're advancing: */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + bch2_btree_iter_verify(iter, b); if (!s.deleting && !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) @@ -1664,13 +1695,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans, bkey_start_offset(&insert->k->k), insert->k->k.size); - ret = !s.deleting - ? __bch2_insert_fixup_extent(&s) - : __bch2_delete_fixup_extent(&s); - - if (ret == BTREE_INSERT_OK && - bkey_cmp(s.committed, insert->k->k.p) < 0) - ret = extent_insert_advance_pos(&s, bkey_s_c_null); + ret = __bch2_insert_fixup_extent(&s); extent_insert_committed(&s); @@ -2172,130 +2197,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, return BCH_MERGE_MERGE; } -static void extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); - - /* - * We don't want the bch2_verify_key_order() call in extent_save(), - * because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ - - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bch2_bkey_pack(dst, src, f)); -} - -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) -{ - struct btree_iter_level *l = &iter->l[0]; - - BUG_ON(!bkey_deleted(k)); - - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(l->b, &l->iter, k, &uk); - bch2_bset_fix_invalidated_key(l->b, t, k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - k, k->u64s, k->u64s); - return true; - } -} - -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? 
m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; - - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; - - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. - */ - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - if (!k) - continue; - - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bch2_bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } - } - - if (check) { - check = false; - goto do_fixup; - } - - return true; -} - /* * When merging an extent that we're inserting into a btree node, the new merged * extent could overlap with an existing 0 size extent - if we don't fix that, @@ -2312,13 +2213,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, { struct btree *b = iter->l[0].b; struct btree_node_iter *node_iter = &iter->l[0].iter; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; + BKEY_PADDED(k) li, ri; + struct bkey_packed *m = back_merge ? l : r; + struct bkey_i *mi = back_merge ? &li.k : &ri.k; + struct bset_tree *t = bch2_bkey_to_bset(b, m); + enum merge_result ret; + + EBUG_ON(bkey_written(b, m)); /* * We need to save copies of both l and r, because we might get a @@ -2327,57 +2228,49 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - m = back_merge ? l : r; - mi = back_merge ? 
&li.k : &ri.k; + ret = bch2_extent_merge(c, b, &li.k, &ri.k); + if (ret == BCH_MERGE_NOMERGE) + return false; - /* l & r should be in last bset: */ - EBUG_ON(bch2_bkey_to_bset(b, m) != t); + /* + * check if we overlap with deleted extents - would break the sort + * order: + */ + if (back_merge) { + struct bkey_packed *n = bkey_next(m); - switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: - return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + if (n != btree_bkey_last(b, t) && + bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && + bkey_deleted(n)) return false; + } else if (ret == BCH_MERGE_MERGE) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + if (prev && + bkey_cmp_left_packed_byval(b, prev, + bkey_start_pos(&li.k.k)) > 0) return false; + } - extent_i_save(b, m, mi); - bch2_bset_fix_invalidated_key(b, t, m); - - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + if (ret == BCH_MERGE_PARTIAL) { + if (!extent_i_save(b, m, mi)) + return false; if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); else bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) - return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + } else { + if (!extent_i_save(b, m, &li.k)) return false; + } - extent_i_save(b, m, &li.k); - bch2_bset_fix_invalidated_key(b, t, m); + bch2_bset_fix_invalidated_key(b, t, m); + bch2_btree_node_iter_fix(iter, b, node_iter, + t, m, m->u64s, m->u64s); + verify_modified_extent(iter, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); - } + return ret == BCH_MERGE_MERGE; } int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) -- cgit From e4ccb251312003a9feed402e1958d0fef24987f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 14:41:29 -0400 Subject: bcachefs: make struct btree_iter a bit smaller Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 +++++++------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_types.h | 3 +-- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9d92826181dc..ae19ba125a71 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -35,10 +35,10 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); for_each_btree_iter_with_node(iter, b, linked) - linked->lock_seq[b->level] += 2; + linked->l[b->level].lock_seq += 2; six_unlock_write(&b->lock); } @@ -80,8 +80,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (race_fault()) return false; - if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) && - !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) && + 
!(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, want))) return false; @@ -106,10 +106,10 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) if (btree_node_locked(iter, level) ? six_lock_tryupgrade(&b->lock) - : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level])) + : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; @@ -692,7 +692,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter, EBUG_ON(!btree_iter_pos_in_node(iter, b)); EBUG_ON(b->lock.state.seq & 1); - iter->lock_seq[b->level] = b->lock.state.seq; + iter->l[b->level].lock_seq = b->lock.state.seq; iter->l[b->level].b = b; __btree_iter_init(iter, b); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 315cba28f6b2..1667ba448a18 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -39,7 +39,7 @@ static inline bool __iter_has_node(const struct btree_iter *iter, */ return iter->l[b->level].b == b && - iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1; + iter->l[b->level].lock_seq >> 1 == b->lock.state.seq >> 1; } static inline struct btree_iter * diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index f262e4431414..c1d16411154e 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -206,7 +206,7 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); if (!six_trylock_write(&b->lock)) __bch2_btree_node_lock_write(b, iter); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d57ca3d08c16..14d8c75a4e8d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -231,10 +231,9 @@ struct btree_iter { struct btree_iter_level { struct btree *b; struct btree_node_iter iter; + u32 lock_seq; } l[BTREE_MAX_DEPTH]; - u32 lock_seq[BTREE_MAX_DEPTH]; - /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. 
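The size win in this patch comes from folding the per-level lock sequence number into struct btree_iter_level instead of keeping a parallel lock_seq[] array alongside l[]. Below is a minimal user-space sketch of that layout change; the struct definitions and the MAX_BSETS / BTREE_MAX_DEPTH values are simplified stand-ins chosen for illustration, not the kernel's actual definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only -- not the kernel's definitions */
#define MAX_BSETS		3
#define BTREE_MAX_DEPTH		4

struct btree_node_iter_set {
	uint16_t k, end;
};

struct btree_node_iter {
	struct btree_node_iter_set data[MAX_BSETS];
};

/* Old layout: lock_seq kept in a parallel array next to l[] */
struct iter_old {
	struct {
		void			*b;
		struct btree_node_iter	iter;
	}				l[BTREE_MAX_DEPTH];
	uint32_t			lock_seq[BTREE_MAX_DEPTH];
};

/* New layout: lock_seq folded into the per-level struct */
struct iter_new {
	struct {
		void			*b;
		struct btree_node_iter	iter;
		uint32_t		lock_seq;
	}				l[BTREE_MAX_DEPTH];
};

int main(void)
{
	/*
	 * In this sketch the u32 lands in padding left after the node
	 * iterator, so each level stays the same size while the separate
	 * array disappears entirely.
	 */
	printf("old: %zu bytes, new: %zu bytes\n",
	       sizeof(struct iter_old), sizeof(struct iter_new));
	return 0;
}

On an LP64 target this sketch prints 112 vs 96 bytes. Whether the real btree_iter shrinks by exactly that amount depends on its surrounding fields, but the design point is the same: the per-level data that is always used together now lives in one struct, and the parallel array is gone.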
-- cgit From df8a42393e038574a84d256720b2ba9745af0ad5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 15:28:29 -0400 Subject: bcachefs: extent_squash() can no longer fail Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index df04af882c16..6cc6961047cf 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1402,7 +1402,7 @@ bch2_extent_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } -static enum btree_insert_ret +static void extent_squash(struct extent_insert_state *s, struct bkey_i *insert, struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) @@ -1496,8 +1496,6 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, break; } } - - return BTREE_INSERT_OK; } static enum btree_insert_ret @@ -1542,7 +1540,7 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) _k->needs_whiteout = false; } - ret = extent_squash(s, insert, t, _k, k, overlap); + extent_squash(s, insert, t, _k, k, overlap); } else { if (bkey_whiteout(k.k)) goto next; @@ -1575,21 +1573,18 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) discard.k.needs_whiteout = true; - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); + extent_squash(s, insert, t, _k, k, overlap); extent_bset_insert(c, iter, &discard); } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); + extent_squash(s, insert, t, _k, k, overlap); } next: bch2_cut_front(s->committed, insert); bch2_btree_iter_set_pos_same_leaf(iter, s->committed); } - if (ret != BTREE_INSERT_OK || - overlap == BCH_EXTENT_OVERLAP_FRONT || + if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } -- cgit From cc1add4a80935e020b0aaf358b4341314abc7ef4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 15:21:52 -0400 Subject: bcachefs: BTREE_INSERT_JOURNAL_RES_FULL is no longer possible Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 7 - fs/bcachefs/bset.h | 5 - fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_interior.h | 28 +--- fs/bcachefs/btree_update_leaf.c | 1 - fs/bcachefs/extents.c | 286 +++++++++++++----------------------- 6 files changed, 107 insertions(+), 221 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 27fa3e230e6e..b95cfe7ece9a 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -255,13 +255,6 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif } -void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *_iter, - struct bkey_packed *where) -{ - bch2_verify_insert_pos(b, where, where, where->u64s); -} - #else static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 0787030ccc7e..66a8da2192ed 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -632,8 +632,6 @@ void __bch2_verify_btree_nr_keys(struct btree *); void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, struct bkey_packed *, unsigned); -void bch2_verify_key_order(struct btree *, struct btree_node_iter *, - struct bkey_packed *); #else @@ -644,9 +642,6 @@ static inline void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, 
struct bkey_packed *insert, unsigned clobber_u64s) {} -static inline void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) {} #endif static inline void bch2_verify_btree_nr_keys(struct btree *b) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 14d8c75a4e8d..03c319611d72 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -463,7 +463,6 @@ enum btree_insert_ret { /* write lock held for too long */ /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_JOURNAL_RES_FULL, BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_GC_LOCK, }; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 711fbe63eb3a..4125cddded61 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -336,40 +336,14 @@ static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) * insert into could be written out from under us) */ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) + struct btree *b, unsigned u64s) { if (unlikely(btree_node_fake(b))) return false; - if (btree_node_is_extents(b)) { - /* The insert key might split an existing key - * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: - */ - u64s += BKEY_EXTENT_U64s_MAX; - } - return u64s <= bch_btree_keys_u64s_remaining(c, b); } -static inline bool journal_res_insert_fits(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - unsigned u64s = 0; - struct btree_insert_entry *i; - - /* - * If we didn't get a journal reservation, we're in journal replay and - * we're not journalling updates: - */ - if (!trans->journal_res.ref) - return true; - - for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - return u64s <= trans->journal_res.u64s; -} - ssize_t bch2_btree_updates_print(struct bch_fs *, char *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0ef519e8feed..598d7a107792 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -408,7 +408,6 @@ static inline int do_btree_insert_at(struct btree_insert *trans, case BTREE_INSERT_OK: i->done = true; break; - case BTREE_INSERT_JOURNAL_RES_FULL: case BTREE_INSERT_NEED_TRAVERSE: ret = -EINTR; break; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6cc6961047cf..b2f50e162784 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1060,7 +1060,8 @@ struct extent_insert_state { /* for deleting: */ struct bkey_i whiteout; - bool do_journal; + bool update_journal; + bool update_btree; bool deleting; }; @@ -1117,28 +1118,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *, struct bkey_packed *, bool); -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) -{ - struct btree *b = s->insert->iter->l[0].b; - - /* - * Check if we have sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. 
- */ - if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; -} - static void verify_extent_nonoverlapping(struct btree *b, struct btree_node_iter *_iter, struct bkey_i *insert) @@ -1193,55 +1172,30 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, { struct btree_iter_level *l = &iter->l[0]; struct bset_tree *t = bset_tree_last(l->b); - struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where, - KEY_TYPE_DISCARD); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; + struct btree_node_iter node_iter; + struct bkey_packed *k; + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); verify_extent_nonoverlapping(l->b, &l->iter, insert); - if (!prev) { - while ((prev = bch2_bkey_prev_all(l->b, t, where)) && - (bkey_cmp_left_packed(l->b, prev, &insert->k.p) ?: - ((int) bkey_deleted(&insert->k) - (int) bkey_deleted(prev))) > 0) - where = prev; - } - - if (prev) - where = bkey_next(prev); - - while (next_live_key != btree_bkey_last(l->b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); - - /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: - */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; + node_iter = l->iter; + k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) + return; - if (prev && - bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; + node_iter = l->iter; + k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) + return; - if (next_live_key != btree_bkey_last(l->b, t) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); - bch2_verify_key_order(l->b, &l->iter, where); - bch2_btree_iter_verify(iter, l->b); - return; -drop_deleted_keys: - bch2_bset_delete(l->b, where, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - where, clobber_u64s, 0); + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, k, 0, k->u64s); bch2_btree_iter_verify(iter, l->b); } @@ -1249,56 +1203,52 @@ static void extent_insert_committed(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? 
s->insert->k - : &s->whiteout; + struct bkey_i *insert = s->insert->k; BKEY_PADDED(k) split; - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) + bkey_copy(&split.k, insert); + if (s->deleting) + split.k.k.type = KEY_TYPE_DISCARD; + + if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_cut_subtract_back(s, s->committed, + bkey_i_to_s(&split.k)); + else + bch2_cut_back(s->committed, &split.k.k); + + if (!bkey_cmp(s->committed, iter->pos)) return; - if (s->deleting && !s->do_journal) { - bch2_cut_front(s->committed, insert); - goto done; - } + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + if (s->update_btree) { + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, + bkey_i_to_s_c(&split.k)); - bkey_copy(&split.k, insert); + EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? */ - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch2_cut_front(s->committed, insert); - bch2_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch2_cut_back(s->committed, &split.k.k); - bch2_cut_front(s->committed, insert); + extent_bset_insert(c, iter, &split.k); } - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); + if (s->update_journal) { + bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout); + if (s->deleting) + split.k.k.type = KEY_TYPE_DISCARD; - bch2_btree_journal_key(s->trans, iter, &split.k); + bch2_cut_back(s->committed, &split.k.k); - if (!s->deleting) { - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - extent_bset_insert(c, iter, &split.k); + EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + + bch2_btree_journal_key(s->trans, iter, &split.k); } -done: - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + + bch2_cut_front(s->committed, insert); insert->k.needs_whiteout = false; - s->do_journal = false; s->trans->did_work = true; } @@ -1333,9 +1283,6 @@ extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) k.k ? k.k->p : b->key.k.p); enum btree_insert_ret ret; - if (race_fault()) - return BTREE_INSERT_NEED_TRAVERSE; - /* hole? 
*/ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), @@ -1364,6 +1311,15 @@ bch2_extent_can_insert(struct btree_insert *trans, struct bkey_s_c k; int sectors; + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD); if (!_k) @@ -1418,7 +1374,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, _k, k.k); - bch2_verify_key_order(b, &l->iter, _k); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: @@ -1435,7 +1391,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_bset_fix_invalidated_key(b, t, _k); bch2_btree_node_iter_fix(iter, b, &l->iter, t, _k, _k->u64s, _k->u64s); - bch2_verify_key_order(b, &l->iter, _k); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { @@ -1457,7 +1413,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, extent_save(b, _k, k.k); bch2_btree_node_iter_fix(iter, b, &l->iter, t, _k, _k->u64s, _k->u64s); - bch2_verify_key_order(b, &l->iter, _k); + verify_modified_extent(iter, _k); } break; @@ -1487,7 +1443,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(b, _k, k.k); - bch2_verify_key_order(b, &l->iter, _k); + verify_modified_extent(iter, _k); bch2_add_sectors(s, bkey_i_to_s_c(&split.k), bkey_start_offset(&split.k.k), @@ -1501,7 +1457,6 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, static enum btree_insert_ret __bch2_insert_fixup_extent(struct extent_insert_state *s) { - struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; struct btree *b = l->b; @@ -1511,13 +1466,12 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) enum btree_insert_ret ret = BTREE_INSERT_OK; while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, KEY_TYPE_DISCARD))) { + (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, + KEY_TYPE_DISCARD))) { struct bset_tree *t = bch2_bkey_to_bset(b, _k); struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; + enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) @@ -1527,63 +1481,53 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) if (ret) break; - overlap = bch2_extent_overlap(&insert->k, k.k); - - if (!s->deleting) { - if (k.k->needs_whiteout || bkey_written(b, _k)) - insert->k.needs_whiteout = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, _k); - _k->needs_whiteout = false; - } - - extent_squash(s, insert, t, _k, k, overlap); - } else { - if (bkey_whiteout(k.k)) - goto next; + if (!bkey_whiteout(k.k)) + s->update_journal = true; - s->do_journal = true; + if (!s->update_journal) { + 
bch2_cut_front(s->committed, insert); + bch2_cut_front(s->committed, &s->whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + goto next; + } - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (s->deleting && + !s->update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_keys_account_key_drop(&b->nr, t - b->set, _k); bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); + bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; reserve_whiteout(b, _k); - } else if (k.k->needs_whiteout || - bkey_written(b, _k)) { - struct bkey_i discard = *insert; - - discard.k.type = KEY_TYPE_DISCARD; + } + break; + } - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } + if (k.k->needs_whiteout || bkey_written(b, _k)) { + insert->k.needs_whiteout = true; + s->update_btree = true; + } - discard.k.needs_whiteout = true; + if (s->update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(b, _k); + _k->needs_whiteout = false; + } - extent_squash(s, insert, t, _k, k, overlap); + extent_squash(s, insert, t, _k, k, overlap); - extent_bset_insert(c, iter, &discard); - } else { - extent_squash(s, insert, t, _k, k, overlap); - } -next: + if (!s->update_btree) bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - } - +next: if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; @@ -1600,11 +1544,9 @@ next: */ { struct btree_node_iter node_iter = l->iter; - struct bkey uk; while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - (uk = bkey_unpack_key(l->b, _k), - bkey_cmp(uk.p, s->committed) > 0)) + bkey_cmp_left_packed(b, _k, &s->committed) > 0) l->iter = node_iter; } @@ -1664,14 +1606,13 @@ bch2_insert_fixup_extent(struct btree_insert *trans, .trans = trans, .insert = insert, .committed = insert->iter->pos, + + .whiteout = *insert->k, + .update_journal = !bkey_whiteout(&insert->k->k), + .update_btree = !bkey_whiteout(&insert->k->k), .deleting = bkey_whiteout(&insert->k->k), }; - if (s.deleting) { - s.whiteout = *insert->k; - s.whiteout.k.type = KEY_TYPE_DISCARD; - } - EBUG_ON(iter->level); EBUG_ON(!insert->k->k.size); @@ -1682,7 +1623,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans, * @insert->k and the node iterator that we're advancing: */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - bch2_btree_iter_verify(iter, b); if (!s.deleting && !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) @@ -1694,20 +1634,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans, extent_insert_committed(&s); - if (s.deleting) - bch2_cut_front(iter->pos, insert->k); - - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - insert->k->k.size) - bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, gc_pos_btree_node(b)); -- 
cgit From 581edb634140fe3da0bc3c8430116e735ebff897 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Aug 2018 21:22:46 -0400 Subject: bcachefs: mempoolify btree_trans Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/btree_iter.c | 15 +++++---------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/fsck.c | 4 ++-- fs/bcachefs/super.c | 3 +++ 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a9ac68c17533..770b26f28c75 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -580,6 +580,8 @@ struct bch_fs { struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; + mempool_t btree_iters_pool; + struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ae19ba125a71..6cde68537c3e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1648,10 +1648,7 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) bch2_trans_unlock(trans); - new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX, - GFP_NOFS); - if (!new_iters) - return -ENOMEM; + new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1679,12 +1676,10 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) return 0; } -int bch2_trans_preload_iters(struct btree_trans *trans) +void bch2_trans_preload_iters(struct btree_trans *trans) { - if (trans->iters != trans->iters_onstack) - return 0; - - return btree_trans_realloc_iters(trans); + if (trans->iters == trans->iters_onstack) + btree_trans_realloc_iters(trans); } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, @@ -1868,7 +1863,7 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->mem); if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + mempool_free(trans->iters, &trans->c->btree_iters_pool); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return ret; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1667ba448a18..63ff89644fe4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -270,7 +270,7 @@ static inline int btree_iter_err(struct bkey_s_c k) /* new multiple iterator interface: */ -int bch2_trans_preload_iters(struct btree_trans *); +void bch2_trans_preload_iters(struct btree_trans *); void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 50e310fea4cf..33fff198858a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -398,7 +398,7 @@ static int check_dirents(struct bch_fs *c) bch2_trans_init(&trans, c); - BUG_ON(bch2_trans_preload_iters(&trans)); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(BCACHEFS_ROOT_INO, 0), 0); @@ -539,7 +539,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c); - BUG_ON(bch2_trans_preload_iters(&trans)); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e44bc95d8deb..63e4d97d15d7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -372,6 +372,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_compress_exit(c); 
percpu_free_rwsem(&c->usage_lock); free_percpu(c->usage_percpu); + mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); @@ -600,6 +601,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) percpu_init_rwsem(&c->usage_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * BTREE_ITER_MAX) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || -- cgit From e2d9912c6f25301923783e7e785870f821d31c40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 17:46:41 -0400 Subject: bcachefs: bch2_extent_trim_atomic() Prep work for extents insert hook removal Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 14 ++++++++++++++ fs/bcachefs/extents.h | 11 +++++++++++ fs/bcachefs/fs-io.c | 30 ++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b2f50e162784..59d2eaea9edf 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1298,6 +1298,17 @@ extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) return __extent_insert_advance_pos(s, next_pos, k); } +void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct btree *b = iter->l[0].b; + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + bch2_cut_back(b->key.k.p, &k->k); + + BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); +} + enum btree_insert_ret bch2_extent_can_insert(struct btree_insert *trans, struct btree_insert_entry *insert, @@ -1311,6 +1322,9 @@ bch2_extent_can_insert(struct btree_insert *trans, struct bkey_s_c k; int sectors; + BUG_ON(trans->flags & BTREE_INSERT_ATOMIC && + !bch2_extent_is_atomic(&insert->k->k, insert->iter)); + /* * We avoid creating whiteouts whenever possible when deleting, but * those optimizations mean we may potentially insert two whiteouts diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fddf25c3fa4b..0721d1829f98 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -62,6 +62,17 @@ int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, struct bch_devs_mask *, struct extent_pick_ptr *); +void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); + +static inline bool bch2_extent_is_atomic(struct bkey *k, + struct btree_iter *iter) +{ + struct btree *b = iter->l[0].b; + + return bkey_cmp(k->p, b->key.k.p) <= 0 && + bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0; +} + enum btree_insert_ret bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, unsigned *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index cc99eb1b36e0..da8c1917c760 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "clock.h" #include "error.h" +#include "extents.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" @@ -430,17 +431,22 @@ static int bchfs_write_index_update(struct bch_write_op *wop) hook.need_inode_update = false; do { - /* XXX: inode->i_size locking */ - k = bch2_keylist_front(keys); - if (min(k->k.p.offset << 9, op->new_i_size) > - op->inode->ei_inode.bi_size) - hook.need_inode_update = true; + BKEY_PADDED(k) tmp; - /* optimization for fewer transaction restarts: */ ret = bch2_btree_iter_traverse(extent_iter); 
if (ret) goto err; + bkey_copy(&tmp.k, bch2_keylist_front(keys)); + k = &tmp.k; + + bch2_extent_trim_atomic(k, extent_iter); + + /* XXX: inode->i_size locking */ + if (min(k->k.p.offset << 9, op->new_i_size) > + op->inode->ei_inode.bi_size) + hook.need_inode_update = true; + if (hook.need_inode_update) { struct bkey_s_c inode; @@ -515,8 +521,10 @@ err: if (hook.need_inode_update) op->inode->ei_inode = hook.inode_u; - BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0); - bch2_keylist_pop_front(keys); + if (bkey_cmp(extent_iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(extent_iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); @@ -2458,6 +2466,12 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; + ret = bch2_btree_iter_traverse(dst); + if (ret) + goto btree_iter_err; + + bch2_extent_trim_atomic(©.k, dst); + BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, -- cgit From 08af47dfc26e244160a9ff85e8181234190abf98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Aug 2018 18:42:04 -0400 Subject: bcachefs: convert bchfs_write_index_update() to bch2_extent_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 +++--- fs/bcachefs/btree_iter.h | 9 +- fs/bcachefs/fs-io.c | 293 ++++++++++++++++++++++------------------------- 3 files changed, 165 insertions(+), 173 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6cde68537c3e..aad7d8ff3f53 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1040,7 +1040,6 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, enum btree_iter_type type) { EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type); EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS && type != BTREE_ITER_NODES)); @@ -1624,17 +1623,29 @@ static void btree_trans_verify(struct btree_trans *trans) } } +static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, + struct btree_iter *iter) +{ + ssize_t idx = iter - trans->iters; + + BUG_ON(idx < 0 || idx >= trans->nr_iters); + BUG_ON(!(trans->iters_live & (1U << idx))); + + return idx; +} + +void bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) +{ + ssize_t idx = btree_trans_iter_idx(trans, iter); + + trans->iters_live &= ~(1U << idx); +} + void bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { - unsigned idx; - - for (idx = 0; idx < trans->nr_iters; idx++) - if (&trans->iters[idx] == iter) - goto found; - BUG(); -found: - BUG_ON(!(trans->iters_linked & (1U << idx))); + ssize_t idx = btree_trans_iter_idx(trans, iter); trans->iters_live &= ~(1U << idx); trans->iters_linked &= ~(1U << idx); @@ -1719,10 +1730,6 @@ got_slot: } else { iter = &trans->iters[idx]; - BUG_ON(iter->btree_id != btree_id); - BUG_ON((iter->flags ^ flags) & - (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS)); - iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); } @@ -1739,6 +1746,9 @@ got_slot: btree_trans_verify(trans); + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 63ff89644fe4..775fdf4260cc 100644 --- 
a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -271,8 +271,8 @@ static inline int btree_iter_err(struct bkey_s_c k) /* new multiple iterator interface: */ void bch2_trans_preload_iters(struct btree_trans *); -void bch2_trans_iter_free(struct btree_trans *, - struct btree_iter *); +void bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, u64); @@ -307,6 +307,11 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) void __bch2_trans_begin(struct btree_trans *); +static inline void bch2_trans_begin_updates(struct btree_trans *trans) +{ + trans->nr_updates = 0; +} + void *bch2_trans_kmalloc(struct btree_trans *, size_t); int bch2_trans_unlock(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index da8c1917c760..acee8fc2102b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -330,212 +330,189 @@ i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags) /* normal i_size/i_sectors update machinery: */ -struct bchfs_extent_trans_hook { - struct bchfs_write_op *op; - struct extent_insert_hook hook; +static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, + bool *allocating) +{ + struct btree_iter iter; + struct bkey_s_c old; + s64 delta = 0; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; + bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); - bool need_inode_update; -}; + bch2_btree_iter_link(_iter, &iter); + bch2_btree_iter_copy(&iter, _iter); -static enum btree_insert_ret -bchfs_extent_update_hook(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) -{ - struct bchfs_extent_trans_hook *h = container_of(hook, - struct bchfs_extent_trans_hook, hook); - struct bch_inode_info *inode = h->op->inode; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; - u64 offset = min(next_pos.offset << 9, h->op->new_i_size); - bool do_pack = false; + for_each_btree_key_continue(&iter, BTREE_ITER_SLOTS, old) { + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + break; - if (h->op->unalloc && - !bch2_extent_is_fully_allocated(k)) - return BTREE_INSERT_ENOSPC; + if (allocating && + !bch2_extent_is_fully_allocated(old)) + *allocating = true; - BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); + delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + } - /* XXX: inode->i_size locking */ - if (offset > inode->ei_inode.bi_size) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; - } + bch2_btree_iter_unlink(&iter); - /* truncate in progress? 
*/ - if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) - goto no_i_size_update; + return delta; +} - h->inode_u.bi_size = offset; - do_pack = true; +static int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) +{ + struct btree_iter *inode_iter = NULL; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + bool allocating = false; + bool extended = false; + s64 i_sectors_delta; + int ret; + + bch2_trans_begin_updates(trans); - spin_lock(&inode->v.i_lock); - if (offset > inode->v.i_size) { - if (h->op->is_dio) - i_size_write(&inode->v, offset); - else - BUG(); + ret = bch2_btree_iter_traverse(extent_iter); + if (ret) + return ret; + + bch2_extent_trim_atomic(k, extent_iter); + + i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating); + if (!may_allocate && allocating) + return -ENOSPC; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); + + new_i_size = min(k->k.p.offset << 9, new_i_size); + + /* XXX: inode->i_size locking */ + if (i_sectors_delta || + new_i_size > inode->ei_inode.bi_size) { + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; + + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; + + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; } - spin_unlock(&inode->v.i_lock); + + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); } -no_i_size_update: - if (sectors) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; - } - h->inode_u.bi_sectors += sectors; - do_pack = true; + ret = bch2_trans_commit(trans, disk_res, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_USE_RESERVE); + if (ret) + goto err; + + inode->ei_inode.bi_sectors += i_sectors_delta; + + EBUG_ON(i_sectors_delta && + inode->ei_inode.bi_sectors != inode_u.bi_sectors); + + if (extended) { + inode->ei_inode.bi_size = new_i_size; - h->op->sectors_added += sectors; + if (direct) { + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + } } - if (do_pack) - bch2_inode_pack(&h->inode_p, &h->inode_u); + if (direct) + i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta); - return BTREE_INSERT_OK; + if (total_delta) + *total_delta += i_sectors_delta; +err: + if (!IS_ERR_OR_NULL(inode_iter)) + bch2_trans_iter_put(trans, inode_iter); + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); + struct quota_res *quota_res = op->is_dio + ? 
&container_of(op, struct dio_write, iop)->quota_res + : NULL; + struct bch_inode_info *inode = op->inode; struct keylist *keys = &op->op.insert_keys; - struct btree_trans trans; - struct btree_iter *extent_iter, *inode_iter = NULL; - struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); - s64 orig_sectors_added = op->sectors_added; + struct btree_trans trans; + struct btree_iter *iter; int ret; - BUG_ON(k->k.p.inode != op->inode->v.i_ino); + BUG_ON(k->k.p.inode != inode->v.i_ino); bch2_trans_init(&trans, wop->c); + bch2_trans_preload_iters(&trans); - extent_iter = bch2_trans_get_iter(&trans, + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), + bkey_start_pos(&k->k), BTREE_ITER_INTENT); - BUG_ON(IS_ERR(extent_iter)); - - hook.op = op; - hook.hook.fn = bchfs_extent_update_hook; - hook.need_inode_update = false; do { BKEY_PADDED(k) tmp; - ret = bch2_btree_iter_traverse(extent_iter); - if (ret) - goto err; - bkey_copy(&tmp.k, bch2_keylist_front(keys)); - k = &tmp.k; - - bch2_extent_trim_atomic(k, extent_iter); - - /* XXX: inode->i_size locking */ - if (min(k->k.p.offset << 9, op->new_i_size) > - op->inode->ei_inode.bi_size) - hook.need_inode_update = true; - if (hook.need_inode_update) { - struct bkey_s_c inode; - - if (!inode_iter) { - inode_iter = bch2_trans_get_iter(&trans, - BTREE_ID_INODES, - POS(extent_iter->pos.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - BUG_ON(IS_ERR(inode_iter)); - } - - inode = bch2_btree_iter_peek_slot(inode_iter); - if ((ret = btree_iter_err(inode))) - goto err; - - if (WARN_ONCE(inode.k->type != BCH_INODE_FS, - "inode %llu not found when updating", - extent_iter->pos.inode)) { - ret = -ENOENT; - break; - } - - if (WARN_ONCE(bkey_bytes(inode.k) > - sizeof(hook.inode_p), - "inode %llu too big (%zu bytes, buf %zu)", - extent_iter->pos.inode, - bkey_bytes(inode.k), - sizeof(hook.inode_p))) { - ret = -ENOENT; - break; - } - - bkey_reassemble(&hook.inode_p.inode.k_i, inode); - ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), - &hook.inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", - ret, extent_iter->pos.inode)) { - ret = -ENOENT; - break; - } - - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter, - &hook.inode_p.inode.k_i, 2)); - } else { - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(extent_iter, k)); - } - - BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k))); - - if (WARN_ONCE(!ret != !k->k.size, - "ret %i k->size %u", ret, k->k.size)) - ret = k->k.size ? 
-EINTR : 0; -err: + ret = bch2_extent_update(&trans, inode, + &wop->res, quota_res, + iter, &tmp.k, + op->new_i_size, + !op->unalloc, + op->is_dio, + &op->sectors_added); if (ret == -EINTR) continue; if (ret) break; - if (hook.need_inode_update) - op->inode->ei_inode = hook.inode_u; - - if (bkey_cmp(extent_iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(extent_iter->pos, bch2_keylist_front(keys)); + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); else bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - if (op->is_dio) { - struct dio_write *dio = container_of(op, struct dio_write, iop); - - i_sectors_acct(wop->c, op->inode, &dio->quota_res, - op->sectors_added - orig_sectors_added); - } - return ret; } -- cgit From 54e2264e17941c3b21b1240d719cd2a3b330bec1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Aug 2018 21:09:31 -0400 Subject: bcachefs: convert truncate to bch2_extent_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 116 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index acee8fc2102b..2389b1f69aa5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2116,6 +2116,55 @@ out: /* truncate: */ +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset, u64 *journal_seq) +{ + struct bpos start = POS(inode->v.i_ino, start_offset); + struct bpos end = POS(inode->v.i_ino, end_offset); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = btree_iter_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, iter, &delete, + 0, true, true, NULL); + bch2_disk_reservation_put(c, &disk_res); + + if (ret == -EINTR) + ret = 0; + if (ret) + break; + + bch2_btree_iter_cond_resched(iter); + } + + bch2_trans_exit(&trans); + + return ret; +} + static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) @@ -2238,12 +2287,32 @@ static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) return ret; } +static int bch2_truncate_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + +static int bch2_truncate_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; + + bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; + bi->bi_size = *new_i_size; + return 0; +} + int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct i_sectors_hook i_sectors_hook = - 
i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); + u64 new_i_size = iattr->ia_size; bool shrink; int ret = 0; @@ -2256,12 +2325,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (!shrink) { ret = bch2_extend(inode, iattr); - goto err_put_pagecache; + goto err; } ret = bch2_truncate_page(inode, iattr->ia_size); if (unlikely(ret)) - goto err_put_pagecache; + goto err; if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, @@ -2272,37 +2341,38 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) round_down(iattr->ia_size, PAGE_SIZE), iattr->ia_size - 1); if (ret) - goto err_put_pagecache; + goto err; - i_sectors_hook.new_i_size = iattr->ia_size; + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, + &new_i_size, 0); + mutex_unlock(&inode->ei_update_lock); - ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) - goto err_put_pagecache; + goto err; truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); + /* + * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() + * here: + */ + ret = __bch2_fpunch(c, inode, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + U64_MAX, &inode->ei_journal_seq); if (unlikely(ret)) - goto err_put_sectors_dirty; + goto err; /* ATTR_MODE will never be set here, ns argument isn't needed: */ setattr_copy(NULL, &inode->v, iattr); -out: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; -err_put_pagecache: + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); +err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; -err_put_sectors_dirty: - /* - * On error - in particular, bch2_truncate_page() error - don't clear - * I_SIZE_DIRTY, as we've left data above i_size!: - */ - i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; - goto out; } /* fallocate: */ -- cgit From 5f461e01b830a6730765d4a19ab0865749733289 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Aug 2018 21:11:43 -0400 Subject: bcachefs: convert fpunch to bch2_extent_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2389b1f69aa5..7954455bea93 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2380,7 +2380,6 @@ err: static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 ino = inode->v.i_ino; u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; int ret = 0; @@ -2406,34 +2405,9 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) { - /* - * We need to pass in a disk reservation here because we might - * be splitting a compressed extent into two. This isn't a - * problem with truncate because truncate will never split an - * extent, only truncate it... 
- */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct i_sectors_hook i_sectors_hook = - i_sectors_hook_init(inode, 0); - int ret; - - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - ret = bch2_btree_delete_range(c, - BTREE_ID_EXTENTS, - POS(ino, discard_start), - POS(ino, discard_end), - ZERO_VERSION, - &disk_res, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; - } + if (discard_start < discard_end) + ret = __bch2_fpunch(c, inode, discard_start, discard_end, + &inode->ei_journal_seq); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); -- cgit From 8ef231bd5174935686903888a73559503c3a2a64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Aug 2018 17:26:11 -0400 Subject: bcachefs: convert fcollapse to bch2_extent_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 67 ++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7954455bea93..c2f8345d1e69 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2424,7 +2424,6 @@ static long bch2_fcollapse(struct bch_inode_info *inode, struct btree_iter *src, *dst; BKEY_PADDED(k) copy; struct bkey_s_c k; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); loff_t new_size; int ret; @@ -2432,16 +2431,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, return -EINVAL; bch2_trans_init(&trans, c); - - dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - BUG_ON(IS_ERR(dst)); - - /* position will be set from dst iter's position: */ - src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); - BUG_ON(IS_ERR(src)); + bch2_trans_preload_iters(&trans); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2466,15 +2456,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (ret) - goto err; + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR_OR_NULL(dst)); + + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_SLOTS); + BUG_ON(IS_ERR_OR_NULL(src)); while (bkey_cmp(dst->pos, POS(inode->v.i_ino, round_up(new_size, PAGE_SIZE) >> 9)) < 0) { struct disk_reservation disk_res; + ret = bch2_btree_iter_traverse(dst); + if (ret) + goto btree_iter_err; + bch2_btree_iter_set_pos(src, POS(dst->pos.inode, dst->pos.offset + (len >> 9))); @@ -2487,10 +2486,6 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_cut_front(src->pos, &copy.k); copy.k.k.p.offset -= len >> 9; - ret = bch2_btree_iter_traverse(dst); - if (ret) - goto btree_iter_err; - bch2_extent_trim_atomic(&copy.k, dst); BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k))); @@ -2500,19 +2495,16 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(dst, &copy.k)); + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, + dst, &copy.k, + 0, true, true, NULL); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; - if (ret) 
{ - bch2_trans_exit(&trans); - goto err_put_sectors_dirty; - } + if (ret) + goto err; /* * XXX: if we error here we've left data with multiple * pointers... which isn't a _super_ serious problem... @@ -2520,20 +2512,21 @@ btree_iter_err: bch2_btree_iter_cond_resched(src); } + bch2_trans_unlock(&trans); - bch2_trans_exit(&trans); - - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(new_size, block_bytes(c)) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, + round_up(new_size, block_bytes(c)) >> 9, + U64_MAX, &inode->ei_journal_seq); if (ret) - goto err_put_sectors_dirty; + goto err; - i_sectors_hook.new_i_size = new_size; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); err: + bch2_trans_exit(&trans); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; -- cgit From 190fa7af39a41867f98166661422f0cf84812358 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Aug 2018 17:48:00 -0400 Subject: bcachefs: kill i_sectors_hook Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 176 +++++++++------------------------------------------- 1 file changed, 31 insertions(+), 145 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c2f8345d1e69..2ab2d612e90c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -34,16 +34,6 @@ struct quota_res { u64 sectors; }; -struct i_sectors_hook { - struct extent_insert_hook hook; - struct bch_inode_info *inode; - struct quota_res quota_res; - s64 sectors; - u64 new_i_size; - unsigned flags; - unsigned appending:1; -}; - struct bchfs_write_op { struct bch_inode_info *inode; s64 sectors_added; @@ -215,8 +205,11 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) + struct quota_res *quota_res, s64 sectors) { + if (!sectors) + return; + mutex_lock(&inode->ei_quota_lock); #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { @@ -233,101 +226,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -/* i_sectors accounting: */ - -static enum btree_insert_ret -i_sectors_hook_fn(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) -{ - struct i_sectors_hook *h = container_of(hook, - struct i_sectors_hook, hook); - s64 sectors = next_pos.offset - committed_pos.offset; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - - EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY)); - - h->sectors += sectors * sign; - - return BTREE_INSERT_OK; -} - -static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct i_sectors_hook *h = p; - - if (h->new_i_size != U64_MAX && - (!h->appending || - h->new_i_size > bi->bi_size)) - bi->bi_size = h->new_i_size; - bi->bi_sectors += h->sectors; - bi->bi_flags &= ~h->flags; - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - return 0; -} - -static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) -{ - int ret; - - 
mutex_lock(&h->inode->ei_update_lock); - i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - - ret = bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, - h, ATTR_MTIME|ATTR_CTIME); - - if (!ret && h->new_i_size != U64_MAX) - i_size_write(&h->inode->v, h->new_i_size); - mutex_unlock(&h->inode->ei_update_lock); - - bch2_quota_reservation_put(c, h->inode, &h->quota_res); - - h->sectors = 0; - - return ret; -} - -static int i_sectors_dirty_start_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - struct i_sectors_hook *h = p; - - if (h->flags & BCH_INODE_I_SIZE_DIRTY) - bi->bi_size = h->new_i_size; - - bi->bi_flags |= h->flags; - return 0; -} - -static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) -{ - int ret; - - mutex_lock(&h->inode->ei_update_lock); - ret = bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); - mutex_unlock(&h->inode->ei_update_lock); - - return ret; -} - -static inline struct i_sectors_hook -i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags) -{ - return (struct i_sectors_hook) { - .hook.fn = i_sectors_hook_fn, - .inode = inode, - .sectors = 0, - .new_i_size = U64_MAX, - .flags = flags|BCH_INODE_I_SECTORS_DIRTY, - }; -} - /* normal i_size/i_sectors update machinery: */ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, @@ -2537,8 +2435,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end_pos; loff_t block_start, block_end; loff_t end = offset + len; @@ -2546,8 +2444,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, inode).data_replicas; int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2582,34 +2480,32 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, block_end = round_up(end, PAGE_SIZE); } - bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9)); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, block_start >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - while (bkey_cmp(iter.pos, end_pos) < 0) { + while (bkey_cmp(iter->pos, end_pos) < 0) { struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(iter); if ((ret = btree_iter_err(k))) goto btree_iter_err; /* already reserved */ if (k.k->type == BCH_RESERVATION && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); continue; } - if (bkey_extent_is_data(k.k)) { - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_next_slot(&iter); - continue; - } + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_next_slot(iter); + continue; } bkey_reservation_init(&reservation.k_i); @@ -2617,7 +2513,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, 
int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_front(iter->pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k); sectors = reservation.k.size; @@ -2625,7 +2521,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, - &i_sectors_hook.quota_res, + &quota_res, sectors, true); if (unlikely(ret)) goto btree_iter_err; @@ -2641,24 +2537,20 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); + ret = bch2_extent_update(&trans, inode, + &disk_res, &quota_res, + iter, &reservation.k_i, + 0, true, true, NULL); + + bch2_quota_reservation_put(c, inode, &quota_res); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; - if (ret) { - bch2_btree_iter_unlock(&iter); - goto err_put_sectors_dirty; - } - + if (ret) + goto err; } - bch2_btree_iter_unlock(&iter); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + bch2_trans_unlock(&trans); if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { @@ -2686,14 +2578,8 @@ btree_iter_err: mutex_unlock(&inode->ei_update_lock); } } - - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - - return 0; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err: + bch2_trans_exit(&trans); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; -- cgit From fc3268c13c1925df9bdc427ffe9bd5466f672b83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Aug 2018 19:53:30 -0400 Subject: bcachefs: kill extent_insert_hook Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc.c | 2 +- fs/bcachefs/btree_iter.c | 6 +-- fs/bcachefs/btree_types.h | 18 +------- fs/bcachefs/btree_update.h | 27 +++--------- fs/bcachefs/btree_update_interior.h | 9 ---- fs/bcachefs/btree_update_leaf.c | 57 ++++++------------------ fs/bcachefs/extents.c | 88 +++++++------------------------------ fs/bcachefs/extents.h | 1 - fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/fs.c | 12 ++--- fs/bcachefs/fsck.c | 34 +++++++------- fs/bcachefs/inode.c | 37 +++++----------- fs/bcachefs/inode.h | 2 - fs/bcachefs/io.c | 4 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/quota.c | 8 ++-- fs/bcachefs/recovery.c | 4 +- fs/bcachefs/super.c | 3 +- fs/bcachefs/tests.c | 30 ++++++------- 22 files changed, 98 insertions(+), 254 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 2856736f7224..0074b3eb196d 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -322,7 +322,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_set_acl_fn, (void *)(unsigned long) mode) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index e6aeab0b47c7..82f27a57dc61 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -319,7 +319,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, bch2_btree_iter_set_pos(iter, a->k.p); - return bch2_btree_insert_at(c, NULL, NULL, 
journal_seq, + return bch2_btree_insert_at(c, NULL, journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index aad7d8ff3f53..754f35f6b56c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -975,8 +975,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) if (__bch2_btree_iter_relock(iter)) return 0; - iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; - /* * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary @@ -1155,10 +1153,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ iter->flags & BTREE_ITER_IS_EXTENTS)) __btree_iter_advance(l); - if (!k && btree_iter_pos_after_node(iter, l->b)) { + if (!k && btree_iter_pos_after_node(iter, l->b)) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - iter->flags |= BTREE_ITER_AT_END_OF_LEAF; - } } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 03c319611d72..5053ed5f2762 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -196,11 +196,7 @@ enum btree_iter_type { * @pos or the first key strictly greater than @pos */ #define BTREE_ITER_IS_EXTENTS (1 << 4) -/* - * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: - */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_ERROR (1 << 5) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -256,12 +252,6 @@ struct btree_iter { struct btree_insert_entry { struct btree_iter *iter; struct bkey_i *k; - unsigned extra_res; - /* - * true if entire key was inserted - can only be false for - * extents - */ - bool done; }; struct btree_trans { @@ -467,12 +457,6 @@ enum btree_insert_ret { BTREE_INSERT_NEED_GC_LOCK, }; -struct extent_insert_hook { - enum btree_insert_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); -}; - enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 31b72895f6eb..f6b0082235af 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -23,7 +23,6 @@ struct btree_insert { struct disk_reservation *disk_res; struct journal_res journal_res; u64 *journal_seq; - struct extent_insert_hook *hook; unsigned flags; bool did_work; @@ -37,15 +36,6 @@ int __bch2_btree_insert_at(struct btree_insert *); ((struct btree_insert_entry) { \ .iter = (_iter), \ .k = (_k), \ - .done = false, \ - }) - -#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - .extra_res = (_extra), \ - .done = false, \ }) /** @@ -61,13 +51,11 @@ int __bch2_btree_insert_at(struct btree_insert *); * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ -#define bch2_btree_insert_at(_c, _disk_res, _hook, \ - _journal_seq, _flags, ...) \ +#define bch2_btree_insert_at(_c, _disk_res, _journal_seq, _flags, ...) 
\ __bch2_btree_insert_at(&(struct btree_insert) { \ .c = (_c), \ .disk_res = (_disk_res), \ .journal_seq = (_journal_seq), \ - .hook = (_hook), \ .flags = (_flags), \ .nr = COUNT_ARGS(__VA_ARGS__), \ .entries = (struct btree_insert_entry[]) { \ @@ -121,17 +109,13 @@ enum { int bch2_btree_delete_at(struct btree_iter *, unsigned); int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); + struct disk_reservation *, u64 *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, int flags); + struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, struct bversion, - struct disk_reservation *, - struct extent_insert_hook *, u64 *); + struct bpos, struct bpos, u64 *); int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); @@ -151,7 +135,6 @@ bch2_trans_update(struct btree_trans *trans, int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ @@ -164,7 +147,7 @@ int bch2_trans_commit(struct btree_trans *, do { \ bch2_trans_begin(&trans); \ \ - _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \ + _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ (_journal_seq), (_flags)); \ } while (_ret == -EINTR); \ \ diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4125cddded61..b24988352b03 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -161,15 +161,6 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, { struct btree *b; - /* - * iterators are inconsistent when they hit end of leaf, until - * traversed again - * - * XXX inconsistent how? - */ - if (iter->flags & BTREE_ITER_AT_END_OF_LEAF) - return; - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) return; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 598d7a107792..6b8954493e05 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -336,14 +336,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans, unsigned u64s; int ret; - trans_for_each_entry(trans, i) { - BUG_ON(i->done); + trans_for_each_entry(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - } u64s = 0; trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); + u64s += jset_u64s(i->k->k.u64s); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); @@ -374,7 +372,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, if (!same_leaf_as_prev(trans, i)) u64s = 0; - u64s += i->k->k.u64s + i->extra_res; + u64s += i->k->k.u64s; switch (btree_key_can_insert(trans, i, &u64s)) { case BTREE_INSERT_OK: break; @@ -406,28 +404,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans, trans_for_each_entry(trans, i) { switch (btree_insert_key_leaf(trans, i)) { case BTREE_INSERT_OK: - i->done = true; break; case BTREE_INSERT_NEED_TRAVERSE: + BUG_ON((trans->flags & BTREE_INSERT_ATOMIC)); ret = -EINTR; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - ret = -EINTR; - *split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; + goto out; default: BUG(); } - - /* - * If we did some work (i.e. 
inserted part of an extent), - * we have to do all the other updates as well: - */ - if (!trans->did_work && (ret || *split)) - break; } out: multi_unlock_write(trans); @@ -523,11 +507,6 @@ out: trans->did_work && !btree_node_locked(linked, 0)); } - - /* make sure we didn't lose an error: */ - if (!ret) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); } BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); @@ -614,7 +593,6 @@ err: int bch2_trans_commit(struct btree_trans *trans, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, unsigned flags) { @@ -642,7 +620,7 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) bkey_init(&k.k); k.k.p = iter->pos; - return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, + return bch2_btree_insert_at(iter->c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags, BTREE_INSERT_ENTRY(iter, &k)); @@ -651,7 +629,6 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) int bch2_btree_insert_list_at(struct btree_iter *iter, struct keylist *keys, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, unsigned flags) { BUG_ON(flags & BTREE_INSERT_ATOMIC); @@ -659,7 +636,7 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, bch2_verify_keylist_sorted(keys); while (!bch2_keylist_empty(keys)) { - int ret = bch2_btree_insert_at(iter->c, disk_res, hook, + int ret = bch2_btree_insert_at(iter->c, disk_res, journal_seq, flags, BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); if (ret) @@ -681,7 +658,6 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, int flags) { struct btree_iter iter; @@ -689,7 +665,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, + ret = bch2_btree_insert_at(c, disk_res, journal_seq, flags, BTREE_INSERT_ENTRY(&iter, k)); bch2_btree_iter_unlock(&iter); @@ -702,12 +678,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, * Range is a half open interval - [start, end) */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, - struct bpos end, - struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) + struct bpos start, struct bpos end, + u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; @@ -717,14 +689,12 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { + !(ret = btree_iter_err(k)) && + bkey_cmp(iter.pos, end) < 0) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); /* really shouldn't be using a bare, unpadded bkey_i */ struct bkey_i delete; - if (bkey_cmp(iter.pos, end) >= 0) - break; - bkey_init(&delete.k); /* @@ -738,7 +708,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, * bkey_start_pos(k.k)). 
*/ delete.k.p = iter.pos; - delete.k.version = version; if (iter.flags & BTREE_ITER_IS_EXTENTS) { /* create the biggest key we can */ @@ -746,7 +715,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, bch2_cut_back(end, &delete.k); } - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, + ret = bch2_btree_insert_at(c, NULL, journal_seq, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &delete)); if (ret) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 59d2eaea9edf..9f39e9dea51a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1252,52 +1252,6 @@ static void extent_insert_committed(struct extent_insert_state *s) s->trans->did_work = true; } -static enum btree_insert_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) -{ - struct extent_insert_hook *hook = s->trans->hook; - enum btree_insert_ret ret; - - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_INSERT_OK; - - if (ret == BTREE_INSERT_OK) - s->committed = next_pos; - - return ret; -} - -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum btree_insert_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) -{ - struct btree *b = s->insert->iter->l[0].b; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - enum btree_insert_ret ret; - - /* hole? */ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null); - if (ret != BTREE_INSERT_OK) - return ret; - } - - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) - return BTREE_INSERT_OK; - - return __extent_insert_advance_pos(s, next_pos, k); -} - void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { struct btree *b = iter->l[0].b; @@ -1468,8 +1422,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, } } -static enum btree_insert_ret -__bch2_insert_fixup_extent(struct extent_insert_state *s) +static void __bch2_insert_fixup_extent(struct extent_insert_state *s) { struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; @@ -1477,7 +1430,6 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; while (bkey_cmp(s->committed, insert->k.p) < 0 && (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, @@ -1491,9 +1443,7 @@ __bch2_insert_fixup_extent(struct extent_insert_state *s) if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) - break; + s->committed = bpos_min(s->insert->k->k.p, k.k->p); if (!bkey_whiteout(k.k)) s->update_journal = true; @@ -1547,9 +1497,8 @@ next: break; } - if (ret == BTREE_INSERT_OK && - bkey_cmp(s->committed, insert->k.p) < 0) - ret = extent_insert_advance_pos(s, bkey_s_c_null); + if (bkey_cmp(s->committed, insert->k.p) < 0) + s->committed = bpos_min(s->insert->k->k.p, b->key.k.p); /* * may have skipped past some deleted extents greater than the insert @@ -1563,8 +1512,6 @@ next: bkey_cmp_left_packed(b, _k, &s->committed) > 0) l->iter = node_iter; } - - return ret; } /** @@ -1610,16 +1557,13 @@ enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_insert_entry *insert) { - struct bch_fs *c = 
trans->c; - struct btree_iter *iter = insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - enum btree_insert_ret ret = BTREE_INSERT_OK; - + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; struct extent_insert_state s = { .trans = trans, .insert = insert, - .committed = insert->iter->pos, + .committed = iter->pos, .whiteout = *insert->k, .update_journal = !bkey_whiteout(&insert->k->k), @@ -1644,7 +1588,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans, bkey_start_offset(&insert->k->k), insert->k->k.size); - ret = __bch2_insert_fixup_extent(&s); + __bch2_insert_fixup_extent(&s); extent_insert_committed(&s); @@ -1653,16 +1597,14 @@ bch2_insert_fixup_extent(struct btree_insert *trans, EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); - - if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; - WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), - "ret %u insert->k.size %u", ret, insert->k->k.size); + if (insert->k->k.size) { + /* got to the end of this leaf node */ + BUG_ON(bkey_cmp(iter->pos, b->key.k.p)); + return BTREE_INSERT_NEED_TRAVERSE; + } - return ret; + return BTREE_INSERT_OK; } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 0721d1829f98..66143d8d3895 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -12,7 +12,6 @@ struct btree_node_iter; struct btree_node_iter_large; struct btree_insert; struct btree_insert_entry; -struct extent_insert_hook; struct bch_devs_mask; union bch_extent_crc; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2ab2d612e90c..195af78cb474 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -326,7 +326,7 @@ static int bch2_extent_update(struct btree_trans *trans, BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); } - ret = bch2_trans_commit(trans, disk_res, NULL, + ret = bch2_trans_commit(trans, disk_res, &inode->ei_journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 67ddad95f91a..b67cf83f7fcd 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -229,7 +229,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| @@ -391,7 +391,7 @@ retry: inode_update_for_create_fn, &inode_u) : 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -535,7 +535,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_link_fn, NULL) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -622,7 +622,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_unlink_fn, NULL) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| @@ -832,7 +832,7 @@ retry: ? 
bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, inode_update_for_rename_fn, &i) : 0 ) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -958,7 +958,7 @@ retry: (iattr->ia_valid & ATTR_MODE ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) : 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 33fff198858a..2430833dbce8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -73,8 +73,7 @@ static int reattach_inode(struct bch_fs *c, bch2_inode_pack(&packed, lostfound_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, - BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) { bch_err(c, "error %i reattaching inode %llu while updating lost+found", ret, inum); @@ -202,7 +201,7 @@ retry: } ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: - bch2_trans_commit(&trans, NULL, NULL, NULL, + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); err: @@ -290,6 +289,13 @@ fsck_err: return ret; } +static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) +{ + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), + POS(inode_nr + 1, 0), NULL); +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -320,7 +326,7 @@ static int check_extents(struct bch_fs *c) k.k->type, k.k->p.inode, w.inode.bi_mode)) { bch2_btree_iter_unlock(&iter); - ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); + ret = bch2_inode_truncate(c, k.k->p.inode, 0); if (ret) goto err; continue; @@ -342,10 +348,7 @@ static int check_extents(struct bch_fs *c) bch2_inode_pack(&p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, - &p.inode.k_i, - NULL, - NULL, - NULL, + &p.inode.k_i, NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) { bch_err(c, "error in fs gc: error %i " @@ -366,8 +369,7 @@ static int check_extents(struct bch_fs *c) bch2_btree_iter_unlock(&iter); ret = bch2_inode_truncate(c, k.k->p.inode, - round_up(w.inode.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + w.inode.bi_size); if (ret) goto err; continue; @@ -508,7 +510,7 @@ static int check_dirents(struct bch_fs *c) bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(iter, &n->k_i)); kfree(n); @@ -602,7 +604,7 @@ create_root: bch2_inode_pack(&packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); } /* Get lost+found, create if it doesn't exist: */ @@ -646,7 +648,7 @@ create_lostfound: bch2_inode_pack(&packed, root_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) return ret; @@ -1094,9 +1096,7 @@ static int check_inode(struct bch_fs *c, * just switch units to bytes and that issue goes away */ - ret = bch2_inode_truncate(c, u.bi_inum, - round_up(u.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); if (ret) { bch_err(c, "error in fs gc: error %i " "truncating inode", ret); @@ -1142,7 
+1142,7 @@ static int check_inode(struct bch_fs *c, bch2_inode_pack(&p, &u); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); if (ret && ret != -EINTR) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 002232ffed62..debdbf58dd79 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -371,33 +371,14 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, __bch2_inode_create(&trans, inode_u, min, max, hint)); } -int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, - struct extent_insert_hook *hook, u64 *journal_seq) -{ - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - POS(inode_nr, new_size), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, hook, - journal_seq); -} - int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { struct btree_iter iter; struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); int ret; - ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL); - if (ret < 0) - return ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) - return ret; - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -406,11 +387,13 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) + ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_XATTRS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + start, end, NULL); + if (ret) return ret; bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), @@ -454,7 +437,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &delete.k_i)); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index ce423a5f2af5..8713b51d3af7 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -46,8 +46,6 @@ int __bch2_inode_create(struct btree_trans *, int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, u64, u64, u64 *); -int bch2_inode_truncate(struct bch_fs *, u64, u64, - struct extent_insert_hook *, u64 *); int bch2_inode_rm(struct bch_fs *, u64); int bch2_inode_find_by_inum(struct bch_fs *, u64, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d1935ef1d6c3..50cc87b7875d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -302,7 +302,7 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_INTENT); ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), + op_journal_seq(op), BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); bch2_btree_iter_unlock(&iter); @@ -1403,7 +1403,7 @@ retry: if (!bch2_extent_narrow_crcs(e, new_crc)) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOWAIT, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 
00c454673a04..16ea32dc1fa4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -904,7 +904,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) bch2_disk_reservation_init(c, 0); ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, + &disk_res, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_REPLAY); } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 8f618dc5160d..9337a8729a5b 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -79,7 +79,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) iter.pos = bkey_start_pos(&tmp.key.k); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &tmp.key)); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b6310a60d5b7..93083cfff9bf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -158,7 +158,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) break; ret = bch2_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), + op_journal_seq(op), BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 0a305ad08188..79a7f82868d6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -541,7 +541,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -553,7 +553,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -565,7 +565,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -764,7 +764,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); bch2_btree_iter_unlock(&iter); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 624d97dc4537..3deb59a675e1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -331,7 +331,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); if (ret) goto err; @@ -344,7 +344,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); if (ret) goto err; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 63e4d97d15d7..ffeffd50b083 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1246,8 +1246,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), - ZERO_VERSION, - NULL, NULL, NULL); + NULL); if (ret) { bch_err(ca, "Remove failed, error deleting alloc info"); goto err; diff --git 
a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index a408fa9ed8b5..f0d28b45a610 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -15,12 +15,12 @@ static void delete_test_keys(struct bch_fs *c) ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); } @@ -40,7 +40,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(&iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); @@ -69,7 +69,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(&iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); @@ -99,7 +99,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) k.k.p.offset = i; ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -141,7 +141,7 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -186,7 +186,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) k.k.p.offset = i * 2; ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -236,7 +236,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -289,7 +289,7 @@ static void insert_test_extent(struct bch_fs *c, k.k_i.k.version.lo = test_version++; ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -352,7 +352,7 @@ static void rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } } @@ -393,7 +393,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); } @@ -414,7 +414,7 @@ static void rand_delete(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } } @@ -433,7 +433,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { insert.k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &insert.k_i)); BUG_ON(ret); @@ -465,7 +465,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &u.k_i)); BUG_ON(ret); } @@ -478,7 +478,7 @@ static void seq_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); } -- cgit From 216c9facfd7568f6b91d43784994830b235abd91 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Aug 2018 19:12:05 -0400 Subject: bcachefs: Pass around bset_tree less Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 16 +++--------- fs/bcachefs/bset.h | 25 ++++++++++++++++-- fs/bcachefs/btree_gc.c | 5 ++-- fs/bcachefs/btree_iter.c | 16 ++++++------ fs/bcachefs/btree_iter.h | 4 +-- fs/bcachefs/btree_update_leaf.c | 16 +++++------- fs/bcachefs/extents.c | 57 +++++++++++++++++++---------------------- 7 files changed, 71 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index b95cfe7ece9a..91c6bc2c8418 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -23,16 +23,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { - unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); + return bch2_bkey_to_bset_inlined(b, k); } /* @@ -1117,9 +1108,10 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, * modified, fix any auxiliary search tree by remaking all the nodes in the * auxiliary search tree that @k corresponds to */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) { + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, k); + switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: break; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 66a8da2192ed..d273a1fd0f1c 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -343,8 +343,7 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, - struct bkey_packed *); +void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); @@ -404,6 +403,21 @@ static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, (cmp == 0 && !strictly_greater && !bkey_deleted(k)); } +static inline struct bset_tree * +bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + struct bset_tree *t; + + for_each_bset(b, t) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); + return t; + } + + BUG(); +} + struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, @@ -605,6 +619,13 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, -1) +#define btree_account_key_add(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) +#define btree_account_key_drop(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) + struct bset_stats { struct { size_t nr, bytes; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2526118fe9ce..5b1e1aab36e9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ 
-598,15 +598,14 @@ out: static void recalc_packed_keys(struct btree *b) { + struct bset *i = btree_bset_first(b); struct bkey_packed *k; memset(&b->nr, 0, sizeof(b->nr)); BUG_ON(b->nsets != 1); - for (k = btree_bkey_first(b, b->set); - k != btree_bkey_last(b, b->set); - k = bkey_next(k)) + vstruct_for_each(i, k) btree_keys_account_key_add(&b->nr, 0, k); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 754f35f6b56c..b450f936347c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -519,11 +519,11 @@ iter_current_key_not_modified: if (b->level && new_u64s && !bkey_deleted(where) && btree_iter_pos_cmp_packed(b, &iter->pos, where, iter->flags & BTREE_ITER_IS_EXTENTS)) { - struct bset_tree *t; + struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where); struct bkey_packed *k; for_each_bset(b, t) { - if (bch2_bkey_to_bset(b, where) == t) + if (where_set == t) continue; k = bch2_bkey_prev_all(b, t, @@ -551,13 +551,13 @@ next_bset: } void bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_iter *linked; if (node_iter != &iter->l[b->level].iter) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 775fdf4260cc..912292dad6e5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -99,8 +99,8 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bset_tree *, - struct bkey_packed *, unsigned, unsigned); + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6b8954493e05..104c0b91da75 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -25,7 +25,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, { const struct bkey_format *f = &b->format; struct bkey_packed *k; - struct bset_tree *t; unsigned clobber_u64s; EBUG_ON(btree_node_just_written(b)); @@ -38,8 +37,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (k && !bkey_cmp_packed(b, k, &insert->k)) { BUG_ON(bkey_whiteout(k)); - t = bch2_bkey_to_bset(b, k); - if (!bkey_written(b, k) && bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && !bkey_whiteout(&insert->k)) { @@ -51,9 +48,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = k->needs_whiteout; - btree_keys_account_key_drop(&b->nr, t - b->set, k); + btree_account_key_drop(b, k); - if (t == bset_tree_last(b)) { + if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; /* @@ -63,7 +60,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, */ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, + bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, 0); bch2_btree_iter_verify(iter, b); return true; @@ -73,7 +70,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, } k->type = KEY_TYPE_DELETED; - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + 
bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); bch2_btree_iter_verify(iter, b); @@ -93,13 +90,12 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = false; } - t = bset_tree_last(b); - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, k->u64s); bch2_btree_iter_verify(iter, b); return true; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9f39e9dea51a..77bc33d0a344 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1171,7 +1171,6 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; - struct bset_tree *t = bset_tree_last(l->b); struct btree_node_iter node_iter; struct bkey_packed *k; @@ -1192,10 +1191,10 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, k, 0, k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); bch2_btree_iter_verify(iter, l->b); } @@ -1328,20 +1327,19 @@ bch2_extent_can_insert(struct btree_insert *trans, static void extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, + struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, _k, k.k); + extent_save(l->b, _k, k.k); verify_modified_extent(iter, _k); break; @@ -1349,15 +1347,15 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, /* insert overlaps with end of k: */ bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, _k, k.k); + extent_save(l->b, _k, k.k); /* * As the auxiliary tree is indexed by the end of the * key and we've just changed the end, update the * auxiliary tree. 
*/ - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, &l->iter, t, + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); verify_modified_extent(iter, _k); break; @@ -1365,21 +1363,20 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, case BCH_EXTENT_OVERLAP_ALL: { /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); + btree_account_key_drop(l->b, _k); bch2_drop_subtract(s, k); - if (t == bset_tree_last(l->b)) { + if (_k >= btree_bset_last(l->b)->start) { unsigned u64s = _k->u64s; bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, b, &l->iter, t, + bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - bch2_btree_iter_verify(iter, b); + bch2_btree_iter_verify(iter, l->b); } else { - extent_save(b, _k, k.k); - bch2_btree_node_iter_fix(iter, b, &l->iter, t, + extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); verify_modified_extent(iter, _k); } @@ -1403,14 +1400,14 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * what k points to) */ bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(b, _k); + split.k.k.needs_whiteout |= bkey_written(l->b, _k); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, _k, k.k); + extent_save(l->b, _k, k.k); verify_modified_extent(iter, _k); bch2_add_sectors(s, bkey_i_to_s_c(&split.k), @@ -1426,16 +1423,14 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) { struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; while (bkey_cmp(s->committed, insert->k.p) < 0 && - (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, + (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_DISCARD))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); @@ -1465,16 +1460,16 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) !bkey_cmp(insert->k.p, k.k->p) && !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { if (!bkey_whiteout(k.k)) { - btree_keys_account_key_drop(&b->nr, t - b->set, _k); + btree_account_key_drop(l->b, _k); bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, _k); + reserve_whiteout(l->b, _k); } break; } - if (k.k->needs_whiteout || bkey_written(b, _k)) { + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { insert->k.needs_whiteout = true; s->update_btree = true; } @@ -1483,11 +1478,11 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { - unreserve_whiteout(b, _k); + unreserve_whiteout(l->b, _k); _k->needs_whiteout = false; } - extent_squash(s, insert, t, _k, k, overlap); + extent_squash(s, insert, _k, k, overlap); if (!s->update_btree) bch2_cut_front(s->committed, insert); @@ -1498,7 +1493,7 @@ next: } if (bkey_cmp(s->committed, 
insert->k.p) < 0) - s->committed = bpos_min(s->insert->k->k.p, b->key.k.p); + s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p); /* * may have skipped past some deleted extents greater than the insert @@ -1509,7 +1504,7 @@ next: struct btree_node_iter node_iter = l->iter; while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(b, _k, &s->committed) > 0) + bkey_cmp_left_packed(l->b, _k, &s->committed) > 0) l->iter = node_iter; } } @@ -2142,9 +2137,9 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, return false; } - bch2_bset_fix_invalidated_key(b, t, m); + bch2_bset_fix_invalidated_key(b, m); bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + m, m->u64s, m->u64s); verify_modified_extent(iter, m); return ret == BCH_MERGE_MERGE; -- cgit From f84306a5700fba38050ad14ab46ed86aea321aba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Aug 2018 15:19:33 -0400 Subject: bcachefs: Prioritize fragmentation in bucket allocator Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 82f27a57dc61..73c11f808edc 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -610,7 +610,7 @@ static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, return (data_wantness << 9) | (needs_journal_commit << 8) | - bucket_gc_gen(ca, b); + (bucket_gc_gen(ca, b) / 16); } static inline int bucket_alloc_cmp(alloc_heap *h, -- cgit From a00fd8c535d91c59913756582ed9a4bfbb3c8a95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Aug 2018 16:30:14 -0400 Subject: bcachefs: Comparison function cleanups Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 77 ++++++++++++--------- fs/bcachefs/bset.h | 81 +++++++--------------- fs/bcachefs/btree_io.c | 5 +- fs/bcachefs/btree_iter.c | 131 +++++++++++++++++++----------------- fs/bcachefs/btree_update_interior.c | 6 +- fs/bcachefs/tests.c | 38 +++++++++++ 6 files changed, 183 insertions(+), 155 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 91c6bc2c8418..8e14e4be0b5c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -143,7 +143,7 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, bkey_unpack_key(b, k); if (n && - __btree_node_iter_cmp(b, k, n) > 0) { + bkey_iter_cmp(b, k, n) > 0) { struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); @@ -202,10 +202,10 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, struct bkey_packed *next = (void *) (where->_data + clobber_u64s); #if 0 BUG_ON(prev && - __btree_node_iter_cmp(b, prev, insert) > 0); + bkey_iter_cmp(b, prev, insert) > 0); #else if (prev && - __btree_node_iter_cmp(b, prev, insert) > 0) { + bkey_iter_cmp(b, prev, insert) > 0) { struct bkey k1 = bkey_unpack_key(b, prev); struct bkey k2 = bkey_unpack_key(b, insert); char buf1[100]; @@ -224,10 +224,10 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif #if 0 BUG_ON(next != btree_bkey_last(b, t) && - __btree_node_iter_cmp(b, insert, next) > 0); + bkey_iter_cmp(b, insert, next) > 0); #else if (next != btree_bkey_last(b, t) && - __btree_node_iter_cmp(b, insert, next) > 0) { + bkey_iter_cmp(b, insert, next) > 0) { struct bkey k1 = bkey_unpack_key(b, insert); struct bkey k2 = bkey_unpack_key(b, next); char buf1[100]; @@ -1272,7 +1272,7 @@ void 
bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, const struct bkey_packed *packed_search) { unsigned l = 0, r = t->size; @@ -1280,7 +1280,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) + if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1302,7 +1302,7 @@ static int bset_search_tree_slowpath(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -1343,7 +1343,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, bkey_mantissa(packed_search, f, n)); else n = n * 2 + bset_search_tree_slowpath(b, t, - &search, packed_search, n); + search, packed_search, n); } while (n < t->size); inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); @@ -1370,10 +1370,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, __always_inline __flatten static struct bkey_packed *bch2_bset_search(struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - bool strictly_greater) + const struct bkey_packed *lossy_packed_search) { struct bkey_packed *m; @@ -1407,7 +1406,7 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, * start and end - handle that here: */ - if (bkey_cmp(search, t->max_key) > 0) + if (bkey_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); m = bset_search_tree(b, t, search, lossy_packed_search); @@ -1416,21 +1415,21 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, if (lossy_packed_search) while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, - m, strictly_greater)) + bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, + m) > 0) m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) + bkey_iter_pos_cmp(b, search, m) > 0) m = bkey_next(m); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && - btree_iter_pos_cmp_p_or_unp(b, search, packed_search, - prev, strictly_greater)); + bkey_iter_cmp_p_or_unp(b, search, packed_search, + prev) <= 0); } return m; @@ -1438,6 +1437,25 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, /* Btree node iterator */ +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos; + + btree_node_iter_for_each(iter, pos) + ; + + BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); + *pos = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } +} + void bch2_btree_node_iter_push(struct btree_node_iter *iter, struct btree *b, const struct bkey_packed *k, @@ -1449,17 +1467,15 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool 
strictly_greater) + struct btree *b, struct bpos *search) { struct bset_tree *t; - trace_bkey_pack_pos_fail(&search); + trace_bkey_pack_pos_fail(search); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL, - strictly_greater), + bch2_bset_search(b, t, search, NULL, NULL), btree_bkey_last(b, t)); bch2_btree_node_iter_sort(iter, b); @@ -1506,18 +1522,17 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * past any extents that compare equal to the position we searched for. */ void bch2_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool strictly_greater) + struct btree *b, struct bpos *search) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; - EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); + EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); - switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { + switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { case BKEY_PACK_POS_EXACT: packed_search = &p; break; @@ -1525,16 +1540,14 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, packed_search = NULL; break; case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search, - strictly_greater); + btree_node_iter_init_pack_failed(iter, b, search); return; } for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, bch2_bset_search(b, t, search, - packed_search, &p, - strictly_greater), + packed_search, &p), btree_bkey_last(b, t)); bch2_btree_node_iter_sort(iter, b); @@ -1668,7 +1681,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_bset_pos(iter, b, t), min_key_type); if (k && - (!prev || __btree_node_iter_cmp(b, k, prev) > 0)) { + (!prev || bkey_iter_cmp(b, k, prev) > 0)) { prev = k; end = t->end_offset; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index d273a1fd0f1c..1b0122dad2bc 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -368,41 +368,6 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct btree_iter *iter, - const struct bkey *k) -{ - int cmp = bkey_cmp(k->p, iter->pos); - - return cmp > 0 || - (cmp == 0 && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); -} - -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp_packed(const struct btree *b, - struct bpos *pos, - const struct bkey_packed *k, - bool strictly_greater) -{ - int cmp = bkey_cmp_left_packed(b, k, pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - -static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, - struct bpos pos, - const struct bkey_packed *pos_packed, - const struct bkey_packed *k, - bool strictly_greater) -{ - int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - static inline struct bset_tree * bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) { @@ -459,7 +424,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, const struct bkey_packed *, const struct bkey_packed *); void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos, bool); + struct bpos *); void bch2_btree_node_iter_init_from_start(struct 
btree_node_iter *, struct btree *); struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, @@ -488,11 +453,16 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) return __btree_node_iter_set_end(iter, 0); } -static inline int __btree_node_iter_cmp(struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) +/* + * When keys compare equal, deleted keys compare first: + * + * XXX: only need to compare pointers for keys that are both within a + * btree_node_iterator - we need to break ties for prev() to work correctly + */ +static inline int bkey_iter_cmp(struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { - /* When keys compare equal deleted keys come first */ return bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (l > r) - (l < r); @@ -502,28 +472,27 @@ static inline int btree_node_iter_cmp(struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { - return __btree_node_iter_cmp(b, + return bkey_iter_cmp(b, __btree_node_offset_to_key(b, l.k), __btree_node_offset_to_key(b, r.k)); } -static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) +/* These assume l (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(struct btree *b, + struct bpos *l, + const struct bkey_packed *r) { - if (k != end) { - struct btree_node_iter_set *pos; - - btree_node_iter_for_each(iter, pos) - ; + return -bkey_cmp_left_packed(b, r, l) + ?: (int) bkey_deleted(r); +} - BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); - *pos = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; - } +static inline int bkey_iter_cmp_p_or_unp(struct btree *b, + struct bpos *l, + const struct bkey_packed *l_packed, + const struct bkey_packed *r) +{ + return -bkey_cmp_p_or_unp(b, r, l_packed, l) + ?: (int) bkey_deleted(r); } static inline struct bkey_packed * diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 889870582566..1db103815dd3 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -21,10 +21,7 @@ /* btree_node_iter_large: */ -#define btree_node_iter_cmp_heap(h, _l, _r) \ - __btree_node_iter_cmp(b, \ - __btree_node_offset_to_key(b, (_l).k), \ - __btree_node_offset_to_key(b, (_r).k)) +#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, struct btree *b, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b450f936347c..1ab468a29d21 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -24,6 +24,30 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l) iter->l[l].b != BTREE_ITER_NOT_END; } +/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ +static inline int __btree_iter_pos_cmp(struct btree_iter *iter, + const struct btree *b, + const struct bkey_packed *k, + bool interior_node) +{ + int cmp = bkey_cmp_left_packed(b, k, &iter->pos); + + if (cmp) + return cmp; + if (bkey_deleted(k)) + return -1; + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return -1; + return 1; +} + +static inline int btree_iter_pos_cmp(struct btree_iter *iter, + const struct btree *b, + const struct bkey_packed *k) +{ + return __btree_iter_pos_cmp(iter, b, k, b->level != 0); +} + /* Btree node locking: */ /* @@ -390,8 +414,7 @@ static void 
__bch2_btree_iter_verify(struct btree_iter *iter, k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD) : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (k && btree_iter_pos_cmp(iter, b, k) > 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -401,8 +424,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, } k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (k && btree_iter_pos_cmp(iter, b, k) < 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -454,8 +476,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_pos_cmp(iter, b, where) > 0) { btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_node_iter_push(node_iter, b, where, end); @@ -475,8 +496,7 @@ found: return; if (new_u64s && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_pos_cmp(iter, b, where) > 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -516,9 +536,8 @@ iter_current_key_not_modified: * always point to the key for the child node the btree iterator points * to. */ - if (b->level && new_u64s && !bkey_deleted(where) && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (b->level && new_u64s && + btree_iter_pos_cmp(iter, b, where) > 0) { struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where); struct bkey_packed *k; @@ -529,7 +548,7 @@ iter_current_key_not_modified: k = bch2_bkey_prev_all(b, t, bch2_btree_node_iter_bset_pos(node_iter, b, t)); if (k && - __btree_node_iter_cmp(b, k, where) > 0) { + bkey_iter_cmp(b, k, where) > 0) { struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, bkey_next(k)); @@ -610,9 +629,23 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); } -static inline void __btree_iter_advance(struct btree_iter_level *l) +static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + struct btree_iter_level *l, + int max_advance) { - bch2_btree_node_iter_advance(&l->iter, l->b); + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + btree_iter_pos_cmp(iter, l->b, k) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + + bch2_btree_node_iter_advance(&l->iter, l->b); + nr_advanced++; + } + + return true; } /* @@ -657,7 +690,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return !btree_iter_pos_cmp(iter, &b->key.k) && + return __btree_iter_pos_cmp(iter, NULL, + bkey_to_packed(&b->key), true) < 0 && bkey_cmp(b->key.k.p, POS_MAX); } @@ -670,16 +704,18 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, } static inline void __btree_iter_init(struct btree_iter *iter, - struct btree *b) + unsigned level) { - struct btree_iter_level *l = &iter->l[b->level]; + struct btree_iter_level *l = &iter->l[level]; + + 
bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); - bch2_btree_node_iter_init(&l->iter, b, iter->pos, - iter->flags & BTREE_ITER_IS_EXTENTS); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + btree_iter_advance_to_pos(iter, l, -1); /* Skip to first non whiteout: */ - if (b->level) - bch2_btree_node_iter_peek(&l->iter, b); + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } @@ -694,7 +730,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter, iter->l[b->level].lock_seq = b->lock.state.seq; iter->l[b->level].b = b; - __btree_iter_init(iter, b); + __btree_iter_init(iter, b->level); } /* @@ -748,7 +784,7 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; for_each_btree_iter_with_node(iter, b, linked) - __btree_iter_init(linked, b); + __btree_iter_init(linked, b->level); } static inline int btree_iter_lock_root(struct btree_iter *iter, @@ -987,15 +1023,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) { - struct btree_iter_level *l = &iter->l[iter->level]; - struct bkey_s_c k; - struct bkey u; - - while ((k = __btree_iter_peek_all(iter, l, &u)).k && - !btree_iter_pos_cmp(iter, k.k)) - __btree_iter_advance(l); - } + if (btree_iter_node(iter, iter->level)) + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that @@ -1138,7 +1167,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k; EBUG_ON(iter->level != 0); EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); @@ -1148,12 +1176,10 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ iter->pos = new_pos; btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - !btree_iter_pos_cmp_packed(l->b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) - __btree_iter_advance(l); + btree_iter_advance_to_pos(iter, l, -1); - if (!k && btree_iter_pos_after_node(iter, l->b)) + if (bch2_btree_node_iter_end(&l->iter) && + btree_iter_pos_after_node(iter, l->b)) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } @@ -1170,30 +1196,15 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) level = btree_iter_up_until_locked(iter, true); if (btree_iter_node(iter, level)) { - unsigned nr_advanced = 0; - struct btree_iter_level *l = &iter->l[level]; - struct bkey_s_c k; - struct bkey u; - /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too * many keys just reinit it (or if we're rewinding, since that * is expensive). 
*/ - if (cmp > 0) { - while ((k = __btree_iter_peek_all(iter, l, &u)).k && - !btree_iter_pos_cmp(iter, k.k)) { - if (nr_advanced > 8) - goto reinit_node; - - __btree_iter_advance(l); - nr_advanced++; - } - } else { -reinit_node: - __btree_iter_init(iter, iter->l[level].b); - } + if (cmp < 0 || + !btree_iter_advance_to_pos(iter, &iter->l[level], 8)) + __btree_iter_init(iter, level); /* Don't leave it locked if we're not supposed to: */ if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) @@ -1296,7 +1307,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) } do { - __btree_iter_advance(l); + bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (unlikely(!p)) return bch2_btree_iter_peek_next_leaf(iter); @@ -1367,7 +1378,7 @@ recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && bkey_deleted(k.k) && bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) - __btree_iter_advance(l); + bch2_btree_node_iter_advance(&l->iter, l->b); /* * iterator is now at the correct position for inserting at iter->pos, @@ -1464,7 +1475,7 @@ recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0) - __btree_iter_advance(l); + bch2_btree_node_iter_advance(&l->iter, l->b); /* * If we got to the end of the node, check if we need to traverse to the @@ -1528,7 +1539,7 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) } if (!bkey_deleted(&iter->k)) - __btree_iter_advance(&iter->l[0]); + bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1fe6f1e3e843..6a2fcc4201b6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -35,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->level); - bch2_btree_node_iter_init(&iter, b, b->key.k.p, false); + bch2_btree_node_iter_init(&iter, b, &b->key.k.p); #if 1 BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || bkey_cmp_left_packed(b, k, &b->key.k.p)); @@ -1191,7 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) + bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) bch2_btree_node_iter_advance(node_iter, b); /* @@ -1322,7 +1322,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); - bch2_btree_node_iter_init(&node_iter, b, k->k.p, false); + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index f0d28b45a610..dc8abce94ff0 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -271,6 +271,42 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) bch2_btree_iter_unlock(&iter); } +/* + * XXX: we really want to make sure we've got a btree with depth > 0 for these + * tests + */ +static void test_peek_end(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + + bch2_btree_iter_unlock(&iter); +} + +static void 
test_peek_end_extents(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + + bch2_btree_iter_unlock(&iter); +} + /* extent unit tests */ u64 test_version; @@ -555,6 +591,8 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_iterate_extents); perf_test(test_iterate_slots); perf_test(test_iterate_slots_extents); + perf_test(test_peek_end); + perf_test(test_peek_end_extents); perf_test(test_extent_overwrite_front); perf_test(test_extent_overwrite_back); -- cgit From cbdf24cef19be489f6885cecb4887fe407cebdfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Aug 2018 17:38:41 -0400 Subject: bcachefs: Fix a btree iter bug when iter pos == POS_MAX Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1ab468a29d21..1ba59c53c36f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -36,7 +36,16 @@ static inline int __btree_iter_pos_cmp(struct btree_iter *iter, return cmp; if (bkey_deleted(k)) return -1; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + + /* + * Normally, for extents we want the first key strictly greater than + * the iterator position - with the exception that for interior nodes, + * we don't want to advance past the last key if the iterator position + * is POS_MAX: + */ + if (iter->flags & BTREE_ITER_IS_EXTENTS && + (!interior_node || + bkey_cmp_left_packed_byval(b, k, POS_MAX))) return -1; return 1; } @@ -691,8 +700,7 @@ static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { return __btree_iter_pos_cmp(iter, NULL, - bkey_to_packed(&b->key), true) < 0 && - bkey_cmp(b->key.k.p, POS_MAX); + bkey_to_packed(&b->key), true) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, -- cgit From 741daa5be5c8d9cccaaac9759eb99893f1beb09b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Aug 2018 19:42:00 -0400 Subject: bcachefs: Dirent repair code There was a bug for awhile in previous kernels where we weren't computing dirent name lengths correctly and we weren't zeroing out padding at the end of dirents (due to struct bch_dirent changing size by adding __attribute__((aligned)), and not updating other code to use offsetof). This patch fixes dirents with junk at the end, by going off of the dirent's hash. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 14 +----- fs/bcachefs/dirent.h | 6 +++ fs/bcachefs/fsck.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 126 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d5e174e1e59f..0651f5575131 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -16,16 +16,7 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) unsigned len = bkey_val_bytes(d.k) - offsetof(struct bch_dirent, d_name); - while (len && !d.v->d_name[len - 1]) - --len; - - return len; -} - -static unsigned dirent_val_u64s(unsigned len) -{ - return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, - sizeof(u64)); + return strnlen(d.v->d_name, len); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -108,9 +99,6 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (len > BCH_NAME_MAX) return "dirent name too big"; - if (memchr(d.v->d_name, '/', len)) - return "dirent name has invalid characters"; - return NULL; case BCH_DIRENT_WHITEOUT: return bkey_val_bytes(k.k) != 0 diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index ac28f83d6b2d..30d2143d4ca7 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,6 +23,12 @@ struct bch_inode_info; unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); +static inline unsigned dirent_val_u64s(unsigned len) +{ + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); +} + int __bch2_dirent_create(struct btree_trans *, u64, const struct bch_hash_info *, u8, const struct qstr *, u64, int); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2430833dbce8..1bdb31c5d5de 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -248,6 +248,29 @@ fsck_err: return ret; } +static bool key_has_correct_hash(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + u64 hash; + + if (k.k->type != desc.whiteout_type && + k.k->type != desc.key_type) + return true; + + if (k.k->p.offset != h->next) + bch2_btree_iter_copy(h->chain, k_iter); + h->next = k.k->p.offset + 1; + + if (k.k->type != desc.key_type) + return true; + + hash = desc.hash_bkey(&h->info, k); + + return hash >= h->chain->pos.offset && + hash <= k.k->p.offset; +} + static int hash_check_key(const struct bch_hash_desc desc, struct hash_check *h, struct bch_fs *c, struct btree_iter *k_iter, struct bkey_s_c k) @@ -271,9 +294,10 @@ static int hash_check_key(const struct bch_hash_desc desc, if (fsck_err_on(hashed < h->chain->pos.offset || hashed > k.k->p.offset, c, - "hash table key at wrong offset: %llu, " + "hash table key at wrong offset: btree %u, %llu, " "hashed to %llu chain starts at %llu\n%s", - k.k->p.offset, hashed, h->chain->pos.offset, + desc.btree_id, k.k->p.offset, + hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), buf, sizeof(buf), k), buf))) { ret = hash_redo_key(desc, h, c, k_iter, k, hashed); @@ -289,6 +313,90 @@ fsck_err: return ret; } +static int check_dirent_hash(struct hash_check *h, struct bch_fs *c, + struct btree_iter *iter, struct bkey_s_c *k) +{ + struct bkey_i_dirent *d = NULL; + int ret = -EINVAL; + char buf[200]; + unsigned len; + u64 hash; + + if (key_has_correct_hash(bch2_dirent_hash_desc, h, c, iter, *k)) + return 0; + + len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); + BUG_ON(!len); + + memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); + 
buf[len] = '\0'; + + d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!d) { + bch_err(c, "memory allocation failure"); + return -ENOMEM; + } + + bkey_reassemble(&d->k_i, *k); + + do { + --len; + if (!len) + goto err_redo; + + d->k.u64s = BKEY_U64s + dirent_val_u64s(len); + + BUG_ON(bkey_val_bytes(&d->k) < + offsetof(struct bch_dirent, d_name) + len); + + memset(d->v.d_name + len, 0, + bkey_val_bytes(&d->k) - + offsetof(struct bch_dirent, d_name) - len); + + hash = bch2_dirent_hash_desc.hash_bkey(&h->info, + bkey_i_to_s_c(&d->k_i)); + } while (hash < h->chain->pos.offset || + hash > k->k->p.offset); + + if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", + buf, strlen(buf), d->v.d_name, len)) { + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(iter, &d->k_i)); + if (ret) + goto err; + + *k = bch2_btree_iter_peek(iter); + + BUG_ON(k->k->type != BCH_DIRENT); + } +err: +fsck_err: + kfree(d); + return ret; +err_redo: + bch_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)", + buf, strlen(buf)); + + hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); + + if (fsck_err(c, "hash table key at wrong offset: btree %u, offset %llu, " + "hashed to %llu chain starts at %llu\n%s", + BTREE_ID_DIRENTS, + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(c, bkey_type(0, BTREE_ID_DIRENTS), + buf, sizeof(buf), *k), buf))) { + ret = hash_redo_key(bch2_dirent_hash_desc, + h, c, iter, *k, hash); + if (ret) + bch_err(c, "hash_redo_key err %i", ret); + else + ret = 1; + } + + goto err; +} + static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) { return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, @@ -435,11 +543,13 @@ static int check_dirents(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k); + ret = check_dirent_hash(&h, c, iter, &k); if (ret > 0) { ret = 0; continue; } + if (ret) + goto fsck_err; if (ret) goto fsck_err; @@ -458,7 +568,12 @@ static int check_dirents(struct bch_fs *c) ". dirent") || fsck_err_on(name_len == 2 && !memcmp(d.v->d_name, "..", 2), c, - ".. dirent")) { + ".. dirent") || + fsck_err_on(name_len == 2 && + !memcmp(d.v->d_name, "..", 2), c, + ".. 
dirent") || + fsck_err_on(memchr(d.v->d_name, '/', name_len), c, + "dirent name has invalid chars")) { ret = remove_dirent(c, iter, d); if (ret) goto err; -- cgit From bc230209d36eec7f226e183dc2935857dc080464 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Aug 2018 18:54:42 -0400 Subject: bcachefs: make fsck spew less Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1bdb31c5d5de..99852e0eb22f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -375,17 +375,15 @@ fsck_err: kfree(d); return ret; err_redo: - bch_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)", - buf, strlen(buf)); - hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); - if (fsck_err(c, "hash table key at wrong offset: btree %u, offset %llu, " - "hashed to %llu chain starts at %llu\n%s", - BTREE_ID_DIRENTS, - k->k->p.offset, hash, h->chain->pos.offset, - (bch2_bkey_val_to_text(c, bkey_type(0, BTREE_ID_DIRENTS), - buf, sizeof(buf), *k), buf))) { + if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" + "hash table key at wrong offset: btree %u, offset %llu, " + "hashed to %llu chain starts at %llu\n%s", + buf, strlen(buf), BTREE_ID_DIRENTS, + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(c, bkey_type(0, BTREE_ID_DIRENTS), + buf, sizeof(buf), *k), buf))) { ret = hash_redo_key(bch2_dirent_hash_desc, h, c, iter, *k, hash); if (ret) -- cgit From cf0517af15e1ef43a0ece3e3dcbfc94a19e89bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2018 17:09:07 -0400 Subject: bcachefs: fix a divide Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 04824f667693..74702e753f60 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -113,7 +113,7 @@ static void rebalance_work_accumulate(struct rebalance_work *w, work = U64_MAX; work = min(work, capacity); - percent_full = div_u64(work * 100, capacity); + percent_full = div64_u64(work * 100, capacity); if (percent_full >= w->dev_most_full_percent) { w->dev_most_full_idx = idx; -- cgit From 34b8e552764d6f759072b8452831c6f72a28111c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Sep 2018 17:57:22 -0400 Subject: bcachefs: Fix a deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6a2fcc4201b6..6d99f3d191d3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1550,7 +1550,13 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, btree_node_interior_verify(b); - bch2_foreground_maybe_merge(c, iter, b->level, flags); + /* + * when called from the btree_split path the new nodes aren't added to + * the btree iterator yet, so the merge path's unlock/wait/relock dance + * won't work: + */ + bch2_foreground_maybe_merge(c, iter, b->level, + flags|BTREE_INSERT_NOUNLOCK); return; split: btree_split(as, b, iter, keys, flags); -- cgit From d06182cadb5bbd9ab7fa3d3e59608bb573bffbee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Sep 2018 17:37:13 -0400 Subject: bcachefs: fix bch2_acl_chmod() Signed-off-by: Kent Overstreet --- 
fs/bcachefs/acl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 0074b3eb196d..eb6fa4d7c1f6 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -377,6 +377,7 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } + new->k.p = iter->pos; bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); *new_acl = acl; acl = NULL; -- cgit From c2fcff5973c93af7ffa87ad28eca2fddd2be83c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Sep 2018 23:27:57 -0400 Subject: bcachefs: Fix suspend when moving data faster than ratelimit Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 34 ++++++++++++++++++++++++++-------- fs/bcachefs/util.c | 21 --------------------- fs/bcachefs/util.h | 1 - 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 93083cfff9bf..b29e7c322e9a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -470,7 +470,7 @@ int bch2_move_data(struct bch_fs *c, struct bkey_s_c_extent e; struct data_opts data_opts; enum data_cmd data_cmd; - u64 cur_inum = U64_MAX; + u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; closure_init_stack(&ctxt.cl); @@ -484,12 +484,30 @@ int bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_reset(rate); - while (!kthread || !(ret = kthread_should_stop())) { - if (rate && - bch2_ratelimit_delay(rate) && - (bch2_btree_iter_unlock(&stats->iter), - (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) - break; + while (1) { + do { + delay = rate ? bch2_ratelimit_delay(rate) : 0; + + if (delay) { + bch2_btree_iter_unlock(&stats->iter); + set_current_state(TASK_INTERRUPTIBLE); + } + + if (kthread && (ret = kthread_should_stop())) { + __set_current_state(TASK_RUNNING); + goto out; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + bch2_btree_iter_unlock(&stats->iter); + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + try_to_freeze(); + } + } while (delay); peek: k = bch2_btree_iter_peek(&stats->iter); if (!k.k) @@ -560,7 +578,7 @@ next_nondata: bch2_btree_iter_next(&stats->iter); bch2_btree_iter_cond_resched(&stats->iter); } - +out: bch2_btree_iter_unlock(&stats->iter); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 6666c3aed05f..75053322d0f0 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -424,27 +424,6 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) d->next = now - NSEC_PER_SEC * 2; } -int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - - while (1) { - u64 delay = bch2_ratelimit_delay(d); - - if (delay) - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread && kthread_should_stop()) - return 1; - - if (!delay) - return 0; - - schedule_timeout(delay); - try_to_freeze(); - } -} - /* pd controller: */ /* diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c0b26123af4c..446216eb8c76 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -377,7 +377,6 @@ static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) u64 bch2_ratelimit_delay(struct bch_ratelimit *); void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *); struct bch_pd_controller { struct bch_ratelimit rate; -- cgit From f43cc5be6e08bd3e0425cc848a7c2a4d3c1974f3 Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Mon, 1 Oct 2018 00:33:42 -0400 Subject: bcachefs: Fix failure to suspend Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b29e7c322e9a..9402b45bf868 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -504,7 +504,6 @@ int bch2_move_data(struct bch_fs *c, if (unlikely(freezing(current))) { bch2_btree_iter_unlock(&stats->iter); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); - closure_sync(&ctxt.cl); try_to_freeze(); } } while (delay); -- cgit From 7b3f84ea7d3f9bcbb7f0f1264a4c228a27a32703 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Oct 2018 00:46:55 -0400 Subject: bcachefs: Split out alloc_background.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 +- fs/bcachefs/alloc.c | 2180 ----------------------------------- fs/bcachefs/alloc.h | 143 --- fs/bcachefs/alloc_background.c | 1428 +++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 62 + fs/bcachefs/alloc_foreground.c | 741 ++++++++++++ fs/bcachefs/alloc_foreground.h | 116 ++ fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/chardev.c | 1 - fs/bcachefs/fs-io.c | 1 + fs/bcachefs/io.c | 2 +- fs/bcachefs/io.h | 1 - fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 3 +- fs/bcachefs/move.c | 1 + fs/bcachefs/movinggc.c | 1 + fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/super.c | 3 +- fs/bcachefs/sysfs.c | 2 +- 23 files changed, 2365 insertions(+), 2337 deletions(-) delete mode 100644 fs/bcachefs/alloc.c delete mode 100644 fs/bcachefs/alloc.h create mode 100644 fs/bcachefs/alloc_background.c create mode 100644 fs/bcachefs/alloc_background.h create mode 100644 fs/bcachefs/alloc_foreground.c create mode 100644 fs/bcachefs/alloc_foreground.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 13cd6d2cdc91..5318287c5ac4 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -3,7 +3,8 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o bcachefs-y := \ acl.o \ - alloc.o \ + alloc_background.o \ + alloc_foreground.o \ bkey.o \ bkey_methods.o \ bset.o \ diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c deleted file mode 100644 index 73c11f808edc..000000000000 --- a/fs/bcachefs/alloc.c +++ /dev/null @@ -1,2180 +0,0 @@ -/* - * Primary bucket allocation code - * - * Copyright 2012 Google, Inc. - * - * Allocation in bcache is done in terms of buckets: - * - * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in - * btree pointers - they must match for the pointer to be considered valid. - * - * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a - * bucket simply by incrementing its gen. - * - * The gens (along with the priorities; it's really the gens are important but - * the code is named as if it's the priorities) are written in an arbitrary list - * of buckets on disk, with a pointer to them in the journal header. - * - * When we invalidate a bucket, we have to write its new gen to disk and wait - * for that write to complete before we use it - otherwise after a crash we - * could have pointers that appeared to be good but pointed to data that had - * been overwritten. 
- * - * Since the gens and priorities are all stored contiguously on disk, we can - * batch this up: We fill up the free_inc list with freshly invalidated buckets, - * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. - * - * free_inc isn't the only freelist - if it was, we'd often have to sleep while - * priorities and gens were being written before we could allocate. c->free is a - * smaller freelist, and buckets on that list are always ready to be used. - * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * - * It's important to ensure that gens don't wrap around - with respect to - * either the oldest gen in the btree or the gen on disk. This is quite - * difficult to do in practice, but we explicitly guard against it anyways - if - * a bucket is in danger of wrapping around we simply skip invalidating it that - * time around, and we garbage collect or rewrite the priorities sooner than we - * would have otherwise. - * - * bch2_bucket_alloc() allocates a single bucket from a specific device. - * - * bch2_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. - * - * invalidate_buckets() drives all the processes described above. It's called - * from bch2_bucket_alloc() and a few other places that need to make sure free - * buckets are ready. - * - * invalidate_buckets_(lru|fifo)() find buckets that are available to be - * invalidated, and then invalidate them and stick them on the free_inc list - - * in either lru or fifo order. - */ - -#include "bcachefs.h" -#include "alloc.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "debug.h" -#include "disk_groups.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "journal_io.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - -/* Ratelimiting/PD controllers */ - -static void pd_controllers_update(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, - pd_controllers_update); - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); - - u64 free = bucket_to_sector(ca, - __dev_buckets_free(ca, stats)) << 9; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - s64 fragmented = (bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; - - fragmented = max(0LL, fragmented); - - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); - } - - schedule_delayed_work(&c->pd_controllers_update, - c->pd_controllers_update_seconds * HZ); -} - -/* Persistent alloc info: */ - -static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) -{ - unsigned bytes = offsetof(struct bch_alloc, data); - - if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - bytes += 2; - if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - bytes += 2; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - 
if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; - - switch (k.k->type) { - case BCH_ALLOC: { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - - if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) - return "incorrect value size"; - break; - } - default: - return "invalid type"; - } - - return NULL; -} - -int bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - buf[0] = '\0'; - - switch (k.k->type) { - case BCH_ALLOC: - break; - } - - return 0; -} - -static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) -{ - unsigned v; - - switch (bytes) { - case 1: - v = **p; - break; - case 2: - v = le16_to_cpup((void *) *p); - break; - case 4: - v = le32_to_cpup((void *) *p); - break; - default: - BUG(); - } - - *p += bytes; - return v; -} - -static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) -{ - switch (bytes) { - case 1: - **p = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - default: - BUG(); - } - - *p += bytes; -} - -static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_dev *ca; - struct bkey_s_c_alloc a; - struct bucket_mark new; - struct bucket *g; - const u8 *d; - - if (k.k->type != BCH_ALLOC) - return; - - a = bkey_s_c_to_alloc(k); - ca = bch_dev_bkey_exists(c, a.k->p.inode); - - if (a.k->p.offset >= ca->mi.nbuckets) - return; - - percpu_down_read(&c->usage_lock); - - g = bucket(ca, a.k->p.offset); - bucket_cmpxchg(g, new, ({ - new.gen = a.v->gen; - new.gen_valid = 1; - })); - - d = a.v->data; - if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - g->io_time[READ] = get_alloc_field(&d, 2); - if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - g->io_time[WRITE] = get_alloc_field(&d, 2); - - percpu_up_read(&c->usage_lock); -} - -int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) -{ - struct journal_replay *r; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; - int ret; - - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { - bch2_alloc_read_key(c, k); - bch2_btree_iter_cond_resched(&iter); - } - - ret = bch2_btree_iter_unlock(&iter); - if (ret) - return ret; - - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_ALLOC) - bch2_alloc_read_key(c, bkey_i_to_s_c(k)); - } - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - - return 0; -} - -static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct btree_iter *iter, - u64 *journal_seq, unsigned flags) -{ - struct bucket_mark m; - __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; - struct bucket *g; - struct bkey_i_alloc *a; - u8 *d; - - percpu_down_read(&c->usage_lock); - g = bucket(ca, b); - - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = POS(ca->dev_idx, b); - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); 
- - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read(&c->usage_lock); - - bch2_btree_iter_cond_resched(iter); - - bch2_btree_iter_set_pos(iter, a->k.p); - - return bch2_btree_insert_at(c, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); -} - -int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) -{ - struct bch_dev *ca; - struct btree_iter iter; - int ret; - - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) - return 0; - - ca = bch_dev_bkey_exists(c, pos.inode); - - if (pos.offset >= ca->mi.nbuckets) - return 0; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); - bch2_btree_iter_unlock(&iter); - return ret; -} - -int bch2_alloc_write(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; - - for_each_rw_member(ca, c, i) { - struct btree_iter iter; - unsigned long bucket; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - down_read(&ca->bucket_lock); - for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, - &iter, NULL, 0); - if (ret) - break; - - clear_bit(bucket, ca->buckets_dirty); - } - up_read(&ca->bucket_lock); - bch2_btree_iter_unlock(&iter); - - if (ret) { - percpu_ref_put(&ca->io_ref); - break; - } - } - - return ret; -} - -/* Bucket IO clocks: */ - -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - 
mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += capacity >> 10; - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = c->capacity >> 10; - mutex_init(&clock->lock); -} - -/* Background allocator thread: */ - -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. - */ - -static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (expensive_debug_checks(c) && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { - size_t iter; - long i; - unsigned j; - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned long gc_count = c->gc_count; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = 1; - break; - } - - if (gc_count != c->gc_count) - ca->inc_gen_really_needs_gc = 0; - - if ((ssize_t) (dev_buckets_available(c, ca) - - ca->inc_gen_really_needs_gc) >= - (ssize_t) fifo_free(&ca->free_inc)) - break; - - up_read(&c->gc_lock); - schedule(); - try_to_freeze(); - down_read(&c->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) -{ - u8 gc_gen; - - if (!is_available_bucket(mark)) - return false; - - gc_gen = bucket_gc_gen(ca, bucket); - - if (gc_gen >= BUCKET_GC_GEN_MAX / 2) - ca->inc_gen_needs_gc++; - - if (gc_gen >= BUCKET_GC_GEN_MAX) - ca->inc_gen_really_needs_gc++; - - return gc_gen < BUCKET_GC_GEN_MAX; -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. 
- */ - -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) -{ - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; - - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; - - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); - - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); -} - -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return (l.key > r.key) - (l.key < r.key) ?: - (l.nr < r.nr) - (l.nr > r.nr) ?: - (l.bucket > r.bucket) - (l.bucket < r.bucket); -} - -static inline int bucket_idx_cmp(const void *_l, const void *_r) -{ - const struct alloc_heap_entry *l = _l, *r = _r; - - return (l->bucket > r->bucket) - (l->bucket < r->bucket); -} - -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - size_t b, i, nr = 0; - - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); - down_read(&ca->bucket_lock); - - buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); - - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; - - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - - cond_resched(); - } - - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); - } - - up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); -} - -static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t b, start; - - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - start = ca->fifo_last_bucket; - - do { - ca->fifo_last_bucket++; - if (ca->fifo_last_bucket == ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - b = ca->fifo_last_bucket; - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } while (ca->fifo_last_bucket != start); -} - -static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array 
*buckets = bucket_array(ca); - struct bucket_mark m; - size_t checked, i; - - for (checked = 0; - checked < ca->mi.nbuckets / 2; - checked++) { - size_t b = bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } - - sort(ca->alloc_heap.data, - ca->alloc_heap.used, - sizeof(ca->alloc_heap.data[0]), - bucket_idx_cmp, NULL); - - /* remove duplicates: */ - for (i = 0; i + 1 < ca->alloc_heap.used; i++) - if (ca->alloc_heap.data[i].bucket == - ca->alloc_heap.data[i + 1].bucket) - ca->alloc_heap.data[i].nr = 0; -} - -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - size_t i, nr = 0; - - ca->inc_gen_needs_gc = 0; - - switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - find_reclaimable_buckets_lru(c, ca); - break; - case CACHE_REPLACEMENT_FIFO: - find_reclaimable_buckets_fifo(c, ca); - break; - case CACHE_REPLACEMENT_RANDOM: - find_reclaimable_buckets_random(c, ca); - break; - } - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - return nr; -} - -static inline long next_alloc_bucket(struct bch_dev *ca) -{ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - while (ca->alloc_heap.used) { - if (top->nr) { - size_t b = top->bucket; - - top->bucket++; - top->nr--; - return b; - } - - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); - } - - return -1; -} - -static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket, u64 *flush_seq) -{ - struct bucket_mark m; - - percpu_down_read(&c->usage_lock); - spin_lock(&c->freelist_lock); - - bch2_invalidate_bucket(c, ca, bucket, &m); - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - percpu_up_read(&c->usage_lock); - - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - *flush_seq = max(*flush_seq, bucket_seq); - } - - return m.cached_sectors != 0; -} - -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - struct btree_iter iter; - u64 journal_seq = 0; - int ret = 0; - long b; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - (b = next_alloc_bucket(ca)) >= 0) { - bool must_flush = - bch2_invalidate_one_bucket(c, ca, b, &journal_seq); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, - must_flush ? &journal_seq : NULL, - !fifo_empty(&ca->free_inc) ? 
BTREE_INSERT_NOWAIT : 0); - } - - bch2_btree_iter_unlock(&iter); - - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret) { - bch_err(ca, "error invalidating buckets: %i", ret); - return ret; - } - - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } - - return 0; -} - -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - unsigned i; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - closure_wake_up(&c->freelist_wait); - spin_unlock(&c->freelist_lock); - goto out; - } - spin_unlock(&c->freelist_lock); - - if ((current->flags & PF_KTHREAD) && - kthread_should_stop()) { - ret = 1; - break; - } - - schedule(); - try_to_freeze(); - } -out: - __set_current_state(TASK_RUNNING); - return ret; -} - -/* - * Pulls buckets off free_inc, discards them (if enabled), then adds them to - * freelists, waiting until there's room if necessary: - */ -static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - while (!fifo_empty(&ca->free_inc)) { - size_t bucket = fifo_peek(&ca->free_inc); - - if (ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO); - - if (push_invalidated_bucket(c, ca, bucket)) - return 1; - } - - return 0; -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. 
- */ -static int bch2_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - size_t nr; - int ret; - - set_freezable(); - - while (1) { - cond_resched(); - - pr_debug("discarding %zu invalidated buckets", - fifo_used(&ca->free_inc)); - - ret = discard_invalidated_buckets(c, ca); - if (ret) - goto stop; - - down_read(&c->gc_lock); - - ret = bch2_invalidate_buckets(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } - - if (!fifo_empty(&ca->free_inc)) { - up_read(&c->gc_lock); - continue; - } - - pr_debug("free_inc now empty"); - - do { - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - bch_err(ca, "gc failure"); - goto stop; - } - - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - - pr_debug("scanning for reclaimable buckets"); - - nr = find_reclaimable_buckets(c, ca); - - pr_debug("found %zu buckets", nr); - - trace_alloc_batch(ca, nr, ca->alloc_heap.size); - - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - - /* - * If we found any buckets, we have to invalidate them - * before we scan for more - but if we didn't find very - * many we may want to wait on more buckets being - * available so we don't spin: - */ - if (!nr || - (nr < ALLOC_SCAN_BATCH(ca) && - !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } - } - } while (!nr); - - ca->allocator_blocked = false; - up_read(&c->gc_lock); - - pr_debug("%zu buckets to invalidate", nr); - - /* - * alloc_heap is now full of newly-invalidated buckets: next, - * write out the new bucket gens: - */ - } - -stop: - pr_debug("alloc thread stopping (ret %i)", ret); - return 0; -} - -/* Allocation */ - -/* - * Open buckets represent a bucket that's currently being allocated from. They - * serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ - -void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - percpu_down_read(&c->usage_lock); - spin_lock(&ob->lock); - - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), - false, gc_pos_alloc(c, ob), 0); - ob->valid = false; - - spin_unlock(&ob->lock); - percpu_up_read(&c->usage_lock); - - spin_lock(&c->freelist_lock); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); -} - -static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -{ - struct open_bucket *ob; - - BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); - - ob = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ob->freelist; - atomic_set(&ob->pin, 1); - - c->open_buckets_nr_free--; - return ob; -} - -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - struct bucket_array *buckets; - ssize_t b; - - rcu_read_lock(); - buckets = bucket_array(ca); - - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) - goto success; - b = -1; -success: - rcu_read_unlock(); - return b; -} - -static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) -{ - switch (reserve) { - case RESERVE_ALLOC: - return 0; - case RESERVE_BTREE: - return BTREE_NODE_RESERVE / 2; - default: - return BTREE_NODE_RESERVE; - } -} - -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) -{ - struct bucket_array *buckets; - struct open_bucket *ob; - long bucket; - - spin_lock(&c->freelist_lock); - - if (may_alloc_partial && - ca->open_buckets_partial_nr) { - int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - c->open_buckets[ret].on_partial_list = false; - spin_unlock(&c->freelist_lock); - return ret; - } - - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, reserve); - return OPEN_BUCKETS_EMPTY; - } - - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) - goto out; - - switch (reserve) { - case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_BTREE: - if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= - ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) - goto out; - break; - default: - break; - } - - if (cl) - closure_wait(&c->freelist_wait, cl); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, reserve); - return FREELIST_EMPTY; -out: - verify_not_on_freelist(c, ca, bucket); - - ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); - buckets = bucket_array(ca); - - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->ptr = (struct bch_extent_ptr) { - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - spin_unlock(&ob->lock); - 
- spin_unlock(&c->freelist_lock); - - bch2_wake_allocator(ca); - - trace_bucket_alloc(ca, reserve); - return ob - c->open_buckets; -} - -static int __dev_alloc_cmp(struct write_point *wp, - unsigned l, unsigned r) -{ - return ((wp->next_alloc[l] > wp->next_alloc[r]) - - (wp->next_alloc[l] < wp->next_alloc[r])); -} - -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) - -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs) -{ - struct dev_alloc_list ret = { .nr = 0 }; - struct bch_dev *ca; - unsigned i; - - for_each_member_device_rcu(ca, c, i, devs) - ret.devs[ret.nr++] = i; - - bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); - return ret; -} - -void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) -{ - u64 *v = wp->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_free(c, ca); - u64 free_space_inv = free_space - ? div64_u64(1ULL << 48, free_space) - : 1ULL << 48; - u64 scale = *v / 4; - - if (*v + free_space_inv >= *v) - *v += free_space_inv; - else - *v = U64_MAX; - - for (v = wp->next_alloc; - v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; -} - -static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct bch_devs_mask *devs, - struct closure *cl) -{ - enum bucket_alloc_ret ret = NO_DEVICES; - struct dev_alloc_list devs_sorted; - struct bch_dev *ca; - unsigned i, nr_ptrs_effective = 0; - bool have_cache_dev = false; - - BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - - for (i = wp->first_ptr; i < wp->nr_ptrs; i++) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - } - - if (nr_ptrs_effective >= nr_replicas) - return ALLOC_SUCCESS; - - devs_sorted = bch2_wp_alloc_list(c, wp, devs); - - for (i = 0; i < devs_sorted.nr; i++) { - int ob; - - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - if (!ca->mi.durability && - (have_cache_dev || - wp->type != BCH_DATA_USER)) - continue; - - ob = bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, cl); - if (ob < 0) { - ret = ob; - if (ret == OPEN_BUCKETS_EMPTY) - break; - continue; - } - - BUG_ON(ob <= 0 || ob > U8_MAX); - BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); - - wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; - - bch2_wp_rescale(c, ca, wp); - - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - - __clear_bit(ca->dev_idx, devs->d); - - if (nr_ptrs_effective >= nr_replicas) { - ret = ALLOC_SUCCESS; - break; - } - } - - EBUG_ON(reserve == RESERVE_MOVINGGC && - ret != ALLOC_SUCCESS && - ret != OPEN_BUCKETS_EMPTY); - - switch (ret) { - case ALLOC_SUCCESS: - return 0; - case NO_DEVICES: - return -EROFS; - case FREELIST_EMPTY: - case OPEN_BUCKETS_EMPTY: - return cl ? 
-EAGAIN : -ENOSPC; - default: - BUG(); - } -} - -/* Sector allocator */ - -static void writepoint_drop_ptr(struct bch_fs *c, - struct write_point *wp, - unsigned i) -{ - struct open_bucket *ob = wp->ptrs[i]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ca->open_buckets_partial_nr >= - ARRAY_SIZE(ca->open_buckets_partial)); - - if (wp->type == BCH_DATA_USER) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } - - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - - if (i < wp->first_ptr) - wp->first_ptr--; -} - -static void writepoint_drop_ptrs(struct bch_fs *c, - struct write_point *wp, - u16 target, bool in_target) -{ - int i; - - for (i = wp->first_ptr - 1; i >= 0; --i) - if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target) == in_target) - writepoint_drop_ptr(c, wp, i); -} - -static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct open_bucket *ob; - unsigned i; - - writepoint_for_each_ptr_all(wp, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ptr_stale(ca, &ob->ptr)); - } -#endif -} - -static int open_bucket_add_buckets(struct bch_fs *c, - u16 target, - struct write_point *wp, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct bch_devs_mask devs = c->rw_devs[wp->type]; - const struct bch_devs_mask *t; - struct open_bucket *ob; - unsigned i; - int ret; - - percpu_down_read(&c->usage_lock); - rcu_read_lock(); - - /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); - - writepoint_for_each_ptr_all(wp, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - - t = bch2_target_to_mask(c, target); - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - - ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); - - rcu_read_unlock(); - percpu_up_read(&c->usage_lock); - - return ret; -} - -static struct write_point *__writepoint_find(struct hlist_head *head, - unsigned long write_point) -{ - struct write_point *wp; - - hlist_for_each_entry_rcu(wp, head, node) - if (wp->write_point == write_point) - return wp; - - return NULL; -} - -static struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - -static struct write_point *writepoint_find(struct bch_fs *c, - unsigned long write_point) -{ - struct write_point *wp, *oldest; - struct hlist_head *head; - - if (!(write_point & 1UL)) { - wp = (struct write_point *) write_point; - mutex_lock(&wp->lock); - return wp; - } - - head = writepoint_hash(c, write_point); -restart_find: - wp = __writepoint_find(head, write_point); - if (wp) { -lock_wp: - mutex_lock(&wp->lock); - if (wp->write_point == write_point) - goto out; - mutex_unlock(&wp->lock); - goto restart_find; - } - - oldest = NULL; - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) - if (!oldest || time_before64(wp->last_used, oldest->last_used)) - oldest = wp; - - mutex_lock(&oldest->lock); - mutex_lock(&c->write_points_hash_lock); - wp = 
__writepoint_find(head, write_point); - if (wp && wp != oldest) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto lock_wp; - } - - wp = oldest; - hlist_del_rcu(&wp->node); - wp->write_point = write_point; - hlist_add_head_rcu(&wp->node, head); - mutex_unlock(&c->write_points_hash_lock); -out: - wp->last_used = sched_clock(); - return wp; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - unsigned target, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - struct open_bucket *ob; - struct bch_dev *ca; - unsigned nr_ptrs_have, nr_ptrs_effective; - int ret, i, cache_idx = -1; - - BUG_ON(!nr_replicas || !nr_replicas_required); - - wp = writepoint_find(c, write_point.v); - - wp->first_ptr = 0; - - /* does writepoint have ptrs we can't use? */ - writepoint_for_each_ptr(wp, ob, i) - if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - nr_ptrs_have = wp->first_ptr; - - /* does writepoint have ptrs we don't want to use? */ - if (target) - writepoint_for_each_ptr(wp, ob, i) - if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, cl); - } else { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, NULL); - if (!ret) - goto alloc_done; - - wp->first_ptr = nr_ptrs_have; - - ret = open_bucket_add_buckets(c, 0, wp, devs_have, - nr_replicas, reserve, cl); - } - - if (ret && ret != -EROFS) - goto err; -alloc_done: - /* check for more than one cache: */ - for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - if (ca->mi.durability) - continue; - - /* - * if we ended up with more than one cache device, prefer the - * one in the target we want: - */ - if (cache_idx >= 0) { - if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target)) { - writepoint_drop_ptr(c, wp, i); - } else { - writepoint_drop_ptr(c, wp, cache_idx); - cache_idx = i; - } - } else { - cache_idx = i; - } - } - - /* we might have more effective replicas than required: */ - nr_ptrs_effective = 0; - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - nr_ptrs_effective += ca->mi.durability; - } - - if (ret == -EROFS && - nr_ptrs_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas && - !bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - /* Remove pointers we don't want to use: */ - if 
(target) - writepoint_drop_ptrs(c, wp, target, false); - - BUG_ON(wp->first_ptr >= wp->nr_ptrs); - BUG_ON(nr_ptrs_effective < nr_replicas_required); - - wp->sectors_free = UINT_MAX; - - writepoint_for_each_ptr(wp, ob, i) - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); - - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - - verify_not_stale(c, wp); - - return wp; -err: - mutex_unlock(&wp->lock); - return ERR_PTR(ret); -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i_extent *e, unsigned sectors) -{ - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - - writepoint_for_each_ptr(wp, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr tmp = ob->ptr; - - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); - - tmp.cached = bkey_extent_is_cached(&e->k) || - (!ca->mi.durability && wp->type == BCH_DATA_USER); - - tmp.offset += ca->mi.bucket_size - ob->sectors_free; - extent_ptr_append(e, tmp); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -{ - int i; - - for (i = wp->nr_ptrs - 1; i >= 0; --i) { - struct open_bucket *ob = wp->ptrs[i]; - - if (!ob->sectors_free) { - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - bch2_open_bucket_put(c, ob); - } - } - - mutex_unlock(&wp->lock); -} - -/* Startup/shutdown (ro/rw): */ - -void bch2_recalc_capacity(struct bch_fs *c) -{ - struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve; - unsigned long ra_pages = 0; - unsigned i, j; - - lockdep_assert_held(&c->state_lock); - - for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; - - ra_pages += bdi->ra_pages; - } - - bch2_set_ra_pages(c, ra_pages); - - for_each_rw_member(ca, c, i) { - u64 dev_reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run against when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. - */ - for (j = 0; j < RESERVE_NONE; j++) - dev_reserve += ca->free[j].size; - - dev_reserve += ca->free_inc.size; - - dev_reserve += ARRAY_SIZE(c->write_points); - - dev_reserve += 1; /* btree write point */ - dev_reserve += 1; /* copygc write point */ - dev_reserve += 1; /* rebalance write point */ - dev_reserve += WRITE_POINT_COUNT; - - dev_reserve *= ca->mi.bucket_size; - - ca->copygc_threshold = dev_reserve; - - capacity += bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - - reserved_sectors += dev_reserve * 2; - } - - gc_reserve = c->opts.gc_reserve_bytes - ? 
c->opts.gc_reserve_bytes >> 9 - : div64_u64(capacity * c->opts.gc_reserve_percent, 100); - - reserved_sectors = max(gc_reserve, reserved_sectors); - - reserved_sectors = min(reserved_sectors, capacity); - - c->capacity = capacity - reserved_sectors; - - if (c->capacity) { - bch2_io_timer_add(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } else { - bch2_io_timer_del(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } - - /* Wake up case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) -{ - struct bch_devs_mask not_self; - - bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); - - mutex_lock(&wp->lock); - wp->first_ptr = wp->nr_ptrs; - writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true); - mutex_unlock(&wp->lock); -} - -static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -{ - struct open_bucket *ob; - bool ret = false; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->ptr.dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); - } - - return ret; -} - -/* device goes ro: */ -void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - - BUG_ON(ca->alloc_thread); - - /* First, remove device from allocation groups: */ - - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); - - /* - * Capacity is calculated based off of devices in allocation groups: - */ - bch2_recalc_capacity(c); - - /* Next, close write points that point to this device... 
*/ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_stop_write_point(c, ca, &c->write_points[i]); - - bch2_stop_write_point(c, ca, &ca->copygc_write_point); - bch2_stop_write_point(c, ca, &c->rebalance_write_point); - bch2_stop_write_point(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - /* - * Wake up threads that were blocked on allocation, so they can notice - * the device can no longer be removed and the capacity has changed: - */ - closure_wake_up(&c->freelist_wait); - - /* - * journal_res_get() can block waiting for free space in the journal - - * it needs to notice there may not be devices to allocate from anymore: - */ - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - closure_wait_event(&c->open_buckets_wait, - !bch2_dev_has_open_write_point(c, ca)); -} - -/* device goes rw: */ -void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); - if (IS_ERR(p)) - return PTR_ERR(p); - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - -static void flush_held_btree_writes(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool flush_updates; - size_t i, nr_pending_updates; - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); -again: - pr_debug("flushing dirty btree nodes"); - cond_resched(); - - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - flush_updates = true; - } - } - rcu_read_unlock(); - - if (c->btree_roots_dirty) - bch2_journal_meta(&c->journal); - - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... 
- */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); - goto again; - } - -} - -static void allocator_start_issue_discards(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - size_t bu; - - for_each_rw_member(ca, c, dev_iter) - while (fifo_pop(&ca->free_inc, bu)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bu), - ca->mi.bucket_size, GFP_NOIO); -} - -static int __bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - u64 journal_seq = 0; - long bu; - bool invalidating_data = false; - int ret = 0; - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return -1; - - if (test_alloc_startup(c)) { - invalidating_data = true; - goto not_enough; - } - - /* Scan for buckets that are already invalidated: */ - for_each_rw_member(ca, c, dev_iter) { - struct btree_iter iter; - struct bucket_mark m; - struct bkey_s_c k; - - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { - if (k.k->type != BCH_ALLOC) - continue; - - bu = k.k->p.offset; - m = READ_ONCE(bucket(ca, bu)->mark); - - if (!is_available_bucket(m) || m.cached_sectors) - continue; - - percpu_down_read(&c->usage_lock); - bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - percpu_up_read(&c->usage_lock); - - fifo_push(&ca->free_inc, bu); - - if (fifo_full(&ca->free_inc)) - break; - } - bch2_btree_iter_unlock(&iter); - } - - /* did we find enough buckets? */ - for_each_rw_member(ca, c, dev_iter) - if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { - percpu_ref_put(&ca->io_ref); - goto not_enough; - } - - return 0; -not_enough: - pr_debug("did not find enough empty buckets; issuing discards"); - - /* clear out free_inc, we'll be using it again below: */ - for_each_rw_member(ca, c, dev_iter) - discard_invalidated_buckets(c, ca); - - pr_debug("scanning for reclaimable buckets"); - - for_each_rw_member(ca, c, dev_iter) { - find_reclaimable_buckets(c, ca); - - while (!fifo_full(&ca->free[RESERVE_BTREE]) && - (bu = next_alloc_bucket(ca)) >= 0) { - invalidating_data |= - bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); - - fifo_push(&ca->free[RESERVE_BTREE], bu); - set_bit(bu, ca->buckets_dirty); - } - } - - pr_debug("done scanning for reclaimable buckets"); - - /* - * We're moving buckets to freelists _before_ they've been marked as - * invalidated on disk - we have to so that we can allocate new btree - * nodes to mark them as invalidated on disk. - * - * However, we can't _write_ to any of these buckets yet - they might - * have cached data in them, which is live until they're marked as - * invalidated on disk: - */ - if (invalidating_data) { - BUG(); - pr_info("holding writes"); - pr_debug("invalidating existing data"); - set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - } else { - pr_debug("issuing discards"); - allocator_start_issue_discards(c); - } - - /* - * XXX: it's possible for this to deadlock waiting on journal reclaim, - * since we're holding btree writes. What then? 
- */ - ret = bch2_alloc_write(c); - if (ret) - return ret; - - if (invalidating_data) { - pr_debug("flushing journal"); - - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) - return ret; - - pr_debug("issuing discards"); - allocator_start_issue_discards(c); - } - - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); - - /* now flush dirty btree nodes: */ - if (invalidating_data) - flush_held_btree_writes(c); - - return 0; -} - -int bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - int ret; - - down_read(&c->gc_lock); - ret = __bch2_fs_allocator_start(c); - up_read(&c->gc_lock); - - if (ret) - return ret; - - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } - - return bch2_alloc_write(c); -} - -void bch2_fs_allocator_init(struct bch_fs *c) -{ - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); - spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); - - /* open bucket 0 is a sentinal NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); - - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - writepoint_init(wp, BCH_DATA_USER); - - wp->last_used = sched_clock(); - wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - } - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); -} diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h deleted file mode 100644 index 739df233236c..000000000000 --- a/fs/bcachefs/alloc.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef _BCACHEFS_ALLOC_H -#define _BCACHEFS_ALLOC_H - -#include "bcachefs.h" -#include "alloc_types.h" - -struct bkey; -struct bch_dev; -struct bch_fs; -struct bch_devs_List; - -#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) - -const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); - -#define bch2_bkey_alloc_ops (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ - .val_to_text = bch2_alloc_to_text, \ -} - -struct dev_alloc_list { - unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; -}; - -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, - struct write_point *, - struct bch_devs_mask *); -void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, - struct write_point *); - -int bch2_alloc_read(struct bch_fs *, struct list_head *); -int bch2_alloc_replay_key(struct bch_fs *, struct bpos); - -enum bucket_alloc_ret { - ALLOC_SUCCESS = 0, - OPEN_BUCKETS_EMPTY = -1, - FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ - NO_DEVICES = -3, /* -EROFS */ -}; - -long bch2_bucket_alloc_new_fs(struct bch_dev *); - -int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, - struct closure *); - -#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \ - for ((_i) = (_start); \ - (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ - (_i)++) - -#define 
writepoint_for_each_ptr_all(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, 0) - -#define writepoint_for_each_ptr(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr) - -void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - if (atomic_dec_and_test(&ob->pin)) - __bch2_open_bucket_put(c, ob); -} - -static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) -{ - unsigned i; - - for (i = 0; i < *nr; i++) - bch2_open_bucket_put(c, c->open_buckets + refs[i]); - - *nr = 0; -} - -static inline void bch2_open_bucket_get(struct bch_fs *c, - struct write_point *wp, - u8 *nr, u8 *refs) -{ - struct open_bucket *ob; - unsigned i; - - writepoint_for_each_ptr(wp, ob, i) { - atomic_inc(&ob->pin); - refs[(*nr)++] = ob - c->open_buckets; - } -} - -struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i_extent *, unsigned); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); - -static inline void bch2_wake_allocator(struct bch_dev *ca) -{ - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); -} - -static inline struct write_point_specifier writepoint_hashed(unsigned long v) -{ - return (struct write_point_specifier) { .v = v | 1 }; -} - -static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -{ - return (struct write_point_specifier) { .v = (unsigned long) wp }; -} - -void bch2_recalc_capacity(struct bch_fs *); - -void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->type = type; -} - -int bch2_alloc_write(struct bch_fs *); -int bch2_fs_allocator_start(struct bch_fs *); -void bch2_fs_allocator_init(struct bch_fs *); - -#endif /* _BCACHEFS_ALLOC_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 index 000000000000..d22b2b72b0d1 --- /dev/null +++ b/fs/bcachefs/alloc_background.c @@ -0,0 +1,1428 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "error.h" +#include "journal_io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include + +static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); + +/* Ratelimiting/PD controllers */ + +static void pd_controllers_update(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, + pd_controllers_update); + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + + u64 free = bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; + /* + * Bytes of 
internal fragmentation, which can be + * reclaimed by copy GC + */ + s64 fragmented = (bucket_to_sector(ca, + stats.buckets[BCH_DATA_USER] + + stats.buckets[BCH_DATA_CACHED]) - + (stats.sectors[BCH_DATA_USER] + + stats.sectors[BCH_DATA_CACHED])) << 9; + + fragmented = max(0LL, fragmented); + + bch2_pd_controller_update(&ca->copygc_pd, + free, fragmented, -1); + } + + schedule_delayed_work(&c->pd_controllers_update, + c->pd_controllers_update_seconds * HZ); +} + +/* Persistent alloc info: */ + +static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +{ + unsigned bytes = offsetof(struct bch_alloc, data); + + if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + bytes += 2; + if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + bytes += 2; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + switch (k.k->type) { + case BCH_ALLOC: { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) + return "incorrect value size"; + break; + } + default: + return "invalid type"; + } + + return NULL; +} + +int bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + buf[0] = '\0'; + + switch (k.k->type) { + case BCH_ALLOC: + break; + } + + return 0; +} + +static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) +{ + unsigned v; + + switch (bytes) { + case 1: + v = **p; + break; + case 2: + v = le16_to_cpup((void *) *p); + break; + case 4: + v = le32_to_cpup((void *) *p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +{ + switch (bytes) { + case 1: + **p = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + default: + BUG(); + } + + *p += bytes; +} + +static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_dev *ca; + struct bkey_s_c_alloc a; + struct bucket_mark new; + struct bucket *g; + const u8 *d; + + if (k.k->type != BCH_ALLOC) + return; + + a = bkey_s_c_to_alloc(k); + ca = bch_dev_bkey_exists(c, a.k->p.inode); + + if (a.k->p.offset >= ca->mi.nbuckets) + return; + + percpu_down_read(&c->usage_lock); + + g = bucket(ca, a.k->p.offset); + bucket_cmpxchg(g, new, ({ + new.gen = a.v->gen; + new.gen_valid = 1; + })); + + d = a.v->data; + if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + g->io_time[READ] = get_alloc_field(&d, 2); + if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + g->io_time[WRITE] = get_alloc_field(&d, 2); + + percpu_up_read(&c->usage_lock); +} + +int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) +{ + struct journal_replay *r; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { + bch2_alloc_read_key(c, k); + bch2_btree_iter_cond_resched(&iter); + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + list_for_each_entry(r, journal_replay_list, list) { + struct bkey_i *k, *n; + struct jset_entry *entry; + + for_each_jset_key(k, n, entry, &r->j) + if (entry->btree_id == BTREE_ID_ALLOC) + bch2_alloc_read_key(c, bkey_i_to_s_c(k)); + } + + mutex_lock(&c->bucket_clock[READ].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, 
READ); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[READ].lock); + + mutex_lock(&c->bucket_clock[WRITE].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, WRITE); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[WRITE].lock); + + return 0; +} + +static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct btree_iter *iter, + u64 *journal_seq, unsigned flags) +{ + struct bucket_mark m; + __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + struct bucket *g; + struct bkey_i_alloc *a; + u8 *d; + + percpu_down_read(&c->usage_lock); + g = bucket(ca, b); + + m = READ_ONCE(g->mark); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = POS(ca->dev_idx, b); + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->io_time[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_up_read(&c->usage_lock); + + bch2_btree_iter_cond_resched(iter); + + bch2_btree_iter_set_pos(iter, a->k.p); + + return bch2_btree_insert_at(c, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); +} + +int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + struct btree_iter iter; + int ret; + + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return 0; + + ca = bch_dev_bkey_exists(c, pos.inode); + + if (pos.offset >= ca->mi.nbuckets) + return 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); + bch2_btree_iter_unlock(&iter); + return ret; +} + +int bch2_alloc_write(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + + for_each_rw_member(ca, c, i) { + struct btree_iter iter; + unsigned long bucket; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + down_read(&ca->bucket_lock); + for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { + ret = __bch2_alloc_write_key(c, ca, bucket, + &iter, NULL, 0); + if (ret) + break; + + clear_bit(bucket, ca->buckets_dirty); + } + up_read(&ca->bucket_lock); + bch2_btree_iter_unlock(&iter); + + if (ret) { + percpu_ref_put(&ca->io_ref); + break; + } + } + + return ret; +} + +/* Bucket IO clocks: */ + +static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets = bucket_array(ca); + struct bucket *g; + u16 max_last_io = 0; + unsigned i; + + lockdep_assert_held(&c->bucket_clock[rw].lock); + + /* Recalculate max_last_io for this device: */ + for_each_bucket(g, buckets) + max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); + + ca->max_last_bucket_io[rw] = max_last_io; + + /* Recalculate global max_last_io: */ + max_last_io = 0; + + for_each_member_device(ca, c, i) + max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); + + clock->max_last_io = max_last_io; +} + +static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets; + struct bch_dev *ca; + struct bucket *g; + unsigned i; + + trace_rescale_prios(c); + + 
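/*
+ * Pull every bucket's io_time halfway toward the current clock hand:
+ * bucket_last_io() is halved for each bucket, relative order between
+ * buckets is preserved, and max_last_io drops back below the
+ * U16_MAX - 2 limit checked in bch2_inc_clock_hand().
+ */ +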
for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + g->io_time[rw] = clock->hand - + bucket_last_io(c, g, rw) / 2; + + bch2_recalc_oldest_io(c, ca, rw); + + up_read(&ca->bucket_lock); + } +} + +static void bch2_inc_clock_hand(struct io_timer *timer) +{ + struct bucket_clock *clock = container_of(timer, + struct bucket_clock, rescale); + struct bch_fs *c = container_of(clock, + struct bch_fs, bucket_clock[clock->rw]); + struct bch_dev *ca; + u64 capacity; + unsigned i; + + mutex_lock(&clock->lock); + + /* if clock cannot be advanced more, rescale prio */ + if (clock->max_last_io >= U16_MAX - 2) + bch2_rescale_bucket_io_times(c, clock->rw); + + BUG_ON(clock->max_last_io >= U16_MAX - 2); + + for_each_member_device(ca, c, i) + ca->max_last_bucket_io[clock->rw]++; + clock->max_last_io++; + clock->hand++; + + mutex_unlock(&clock->lock); + + capacity = READ_ONCE(c->capacity); + + if (!capacity) + return; + + /* + * we only increment when 0.1% of the filesystem capacity has been read + * or written too, this determines if it's time + * + * XXX: we shouldn't really be going off of the capacity of devices in + * RW mode (that will be 0 when we're RO, yet we can still service + * reads) + */ + timer->expire += capacity >> 10; + + bch2_io_timer_add(&c->io_clock[clock->rw], timer); +} + +static void bch2_bucket_clock_init(struct bch_fs *c, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + + clock->hand = 1; + clock->rw = rw; + clock->rescale.fn = bch2_inc_clock_hand; + clock->rescale.expire = c->capacity >> 10; + mutex_init(&clock->lock); +} + +/* Background allocator thread: */ + +/* + * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens + * (marking them as invalidated on disk), then optionally issues discard + * commands to the newly free buckets, then puts them on the various freelists. + */ + +#define BUCKET_GC_GEN_MAX 96U + +/** + * wait_buckets_available - wait on reclaimable buckets + * + * If there aren't enough available buckets to fill up free_inc, wait until + * there are. + */ +static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned long gc_count = c->gc_count; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + ret = 1; + break; + } + + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + + if ((ssize_t) (dev_buckets_available(c, ca) - + ca->inc_gen_really_needs_gc) >= + (ssize_t) fifo_free(&ca->free_inc)) + break; + + up_read(&c->gc_lock); + schedule(); + try_to_freeze(); + down_read(&c->gc_lock); + } + + __set_current_state(TASK_RUNNING); + return ret; +} + +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, + size_t bucket, + struct bucket_mark mark) +{ + u8 gc_gen; + + if (!is_available_bucket(mark)) + return false; + + gc_gen = bucket_gc_gen(ca, bucket); + + if (gc_gen >= BUCKET_GC_GEN_MAX / 2) + ca->inc_gen_needs_gc++; + + if (gc_gen >= BUCKET_GC_GEN_MAX) + ca->inc_gen_really_needs_gc++; + + return gc_gen < BUCKET_GC_GEN_MAX; +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_key() + * first. + * + * + * - We take into account the read prio of the bucket, which gives us an + * indication of how hot the data is -- we scale the prio so that the prio + * farthest from the clock is worth 1/8th of the closest. 
+ * + * - The number of sectors of cached data in the bucket, which gives us an + * indication of the cost in cache misses this eviction will cause. + * + * - If hotness * sectors used compares equal, we pick the bucket with the + * smallest bucket_gc_gen() - since incrementing the same bucket's generation + * number repeatedly forces us to run mark and sweep gc to avoid generation + * number wraparound. + */ + +static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark m) +{ + unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); + unsigned max_last_io = ca->max_last_bucket_io[READ]; + + /* + * Time since last read, scaled to [0, 8) where larger value indicates + * more recently read data: + */ + unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + + /* How much we want to keep the data in this bucket: */ + unsigned long data_wantness = + (hotness + 1) * bucket_sectors_used(m); + + unsigned long needs_journal_commit = + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + + return (data_wantness << 9) | + (needs_journal_commit << 8) | + (bucket_gc_gen(ca, b) / 16); +} + +static inline int bucket_alloc_cmp(alloc_heap *h, + struct alloc_heap_entry l, + struct alloc_heap_entry r) +{ + return (l.key > r.key) - (l.key < r.key) ?: + (l.nr < r.nr) - (l.nr > r.nr) ?: + (l.bucket > r.bucket) - (l.bucket < r.bucket); +} + +static inline int bucket_idx_cmp(const void *_l, const void *_r) +{ + const struct alloc_heap_entry *l = _l, *r = _r; + + return (l->bucket > r->bucket) - (l->bucket < r->bucket); +} + +static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets; + struct alloc_heap_entry e = { 0 }; + size_t b, i, nr = 0; + + ca->alloc_heap.used = 0; + + mutex_lock(&c->bucket_clock[READ].lock); + down_read(&ca->bucket_lock); + + buckets = bucket_array(ca); + + bch2_recalc_oldest_io(c, ca, READ); + + /* + * Find buckets with lowest read priority, by building a maxheap sorted + * by read priority and repeatedly replacing the maximum element until + * all buckets have been visited. 
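+ * Runs of consecutive buckets that share the same sort key are coalesced
+ * into a single alloc_heap_entry (e.nr counts the length of the run), so
+ * the heap stays small even when many buckets look equally good.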
+ */ + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + unsigned long key = bucket_sort_key(c, ca, b, m); + + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; + + if (e.nr && e.bucket + e.nr == b && e.key == key) { + e.nr++; + } else { + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + + e = (struct alloc_heap_entry) { + .bucket = b, + .nr = 1, + .key = key, + }; + } + + cond_resched(); + } + + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { + nr -= ca->alloc_heap.data[0].nr; + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); + } + + up_read(&ca->bucket_lock); + mutex_unlock(&c->bucket_clock[READ].lock); +} + +static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct bucket_mark m; + size_t b, start; + + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + start = ca->fifo_last_bucket; + + do { + ca->fifo_last_bucket++; + if (ca->fifo_last_bucket == ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + b = ca->fifo_last_bucket; + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } + + cond_resched(); + } while (ca->fifo_last_bucket != start); +} + +static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct bucket_mark m; + size_t checked, i; + + for (checked = 0; + checked < ca->mi.nbuckets / 2; + checked++) { + size_t b = bch2_rand_range(ca->mi.nbuckets - + ca->mi.first_bucket) + + ca->mi.first_bucket; + + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } + + cond_resched(); + } + + sort(ca->alloc_heap.data, + ca->alloc_heap.used, + sizeof(ca->alloc_heap.data[0]), + bucket_idx_cmp, NULL); + + /* remove duplicates: */ + for (i = 0; i + 1 < ca->alloc_heap.used; i++) + if (ca->alloc_heap.data[i].bucket == + ca->alloc_heap.data[i + 1].bucket) + ca->alloc_heap.data[i].nr = 0; +} + +static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + size_t i, nr = 0; + + ca->inc_gen_needs_gc = 0; + + switch (ca->mi.replacement) { + case CACHE_REPLACEMENT_LRU: + find_reclaimable_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + find_reclaimable_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + find_reclaimable_buckets_random(c, ca); + break; + } + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + return nr; +} + +static inline long next_alloc_bucket(struct bch_dev *ca) +{ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + while (ca->alloc_heap.used) { + if (top->nr) { + size_t b = top->bucket; + + top->bucket++; + top->nr--; + return b; + } + + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + } + + return -1; +} + +static bool 
bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket, u64 *flush_seq) +{ + struct bucket_mark m; + + percpu_down_read(&c->usage_lock); + spin_lock(&c->freelist_lock); + + bch2_invalidate_bucket(c, ca, bucket, &m); + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + + spin_unlock(&c->freelist_lock); + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + percpu_up_read(&c->usage_lock); + + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + *flush_seq = max(*flush_seq, bucket_seq); + } + + return m.cached_sectors != 0; +} + +/* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + struct btree_iter iter; + u64 journal_seq = 0; + int ret = 0; + long b; + + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + /* Only use nowait if we've already invalidated at least one bucket: */ + while (!ret && + !fifo_full(&ca->free_inc) && + (b = next_alloc_bucket(ca)) >= 0) { + bool must_flush = + bch2_invalidate_one_bucket(c, ca, b, &journal_seq); + + ret = __bch2_alloc_write_key(c, ca, b, &iter, + must_flush ? &journal_seq : NULL, + !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); + } + + bch2_btree_iter_unlock(&iter); + + /* If we used NOWAIT, don't return the error: */ + if (!fifo_empty(&ca->free_inc)) + ret = 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + return ret; + } + + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) { + bch_err(ca, "journal error: %i", ret); + return ret; + } + + return 0; +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + unsigned i; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + goto out; + } + spin_unlock(&c->freelist_lock); + + if ((current->flags & PF_KTHREAD) && + kthread_should_stop()) { + ret = 1; + break; + } + + schedule(); + try_to_freeze(); + } +out: + __set_current_state(TASK_RUNNING); + return ret; +} + +/* + * Pulls buckets off free_inc, discards them (if enabled), then adds them to + * freelists, waiting until there's room if necessary: + */ +static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + while (!fifo_empty(&ca->free_inc)) { + size_t bucket = fifo_peek(&ca->free_inc); + + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO); + + if (push_invalidated_bucket(c, ca, bucket)) + return 1; + } + + return 0; +} + +/** + * bch_allocator_thread - move buckets from free_inc to reserves + * + * The free_inc FIFO is populated by find_reclaimable_buckets(), and + * the reserves are depleted by bucket allocation. When we run out + * of free_inc, try to invalidate some buckets and write out + * prios and gens. 
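+ * (More precisely: find_reclaimable_buckets() fills ca->alloc_heap;
+ * bch2_invalidate_buckets() then moves those buckets onto free_inc and
+ * persists their new generation numbers via __bch2_alloc_write_key().)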
+ */ +static int bch2_allocator_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + size_t nr; + int ret; + + set_freezable(); + + while (1) { + cond_resched(); + + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); + + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; + + down_read(&c->gc_lock); + + ret = bch2_invalidate_buckets(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } + + if (!fifo_empty(&ca->free_inc)) { + up_read(&c->gc_lock); + continue; + } + + pr_debug("free_inc now empty"); + + do { + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + bch_err(ca, "gc failure"); + goto stop; + } + + /* + * Find some buckets that we can invalidate, either + * they're completely unused, or only contain clean data + * that's been written back to the backing device or + * another cache tier + */ + + pr_debug("scanning for reclaimable buckets"); + + nr = find_reclaimable_buckets(c, ca); + + pr_debug("found %zu buckets", nr); + + trace_alloc_batch(ca, nr, ca->alloc_heap.size); + + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && + c->gc_thread) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + + /* + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: + */ + if (!nr || + (nr < ALLOC_SCAN_BATCH(ca) && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + ca->allocator_blocked = true; + closure_wake_up(&c->freelist_wait); + + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } + } + } while (!nr); + + ca->allocator_blocked = false; + up_read(&c->gc_lock); + + pr_debug("%zu buckets to invalidate", nr); + + /* + * alloc_heap is now full of newly-invalidated buckets: next, + * write out the new bucket gens: + */ + } + +stop: + pr_debug("alloc thread stopping (ret %i)", ret); + return 0; +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned long ra_pages = 0; + unsigned i, j; + + lockdep_assert_held(&c->state_lock); + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + + ra_pages += bdi->ra_pages; + } + + bch2_set_ra_pages(c, ra_pages); + + for_each_rw_member(ca, c, i) { + u64 dev_reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. 
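+ * The reserve is built below from the freelist and free_inc sizes plus
+ * one bucket per write point; the per-device total is doubled, and the
+ * sum across devices is clamped between the configured gc reserve and
+ * the raw capacity before being subtracted from it.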
+ */ + for (j = 0; j < RESERVE_NONE; j++) + dev_reserve += ca->free[j].size; + + dev_reserve += ca->free_inc.size; + + dev_reserve += ARRAY_SIZE(c->write_points); + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + dev_reserve += WRITE_POINT_COUNT; + + dev_reserve *= ca->mi.bucket_size; + + ca->copygc_threshold = dev_reserve; + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + + reserved_sectors += dev_reserve * 2; + } + + gc_reserve = c->opts.gc_reserve_bytes + ? c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); + + reserved_sectors = max(gc_reserve, reserved_sectors); + + reserved_sectors = min(reserved_sectors, capacity); + + c->capacity = capacity - reserved_sectors; + + if (c->capacity) { + bch2_io_timer_add(&c->io_clock[READ], + &c->bucket_clock[READ].rescale); + bch2_io_timer_add(&c->io_clock[WRITE], + &c->bucket_clock[WRITE].rescale); + } else { + bch2_io_timer_del(&c->io_clock[READ], + &c->bucket_clock[READ].rescale); + bch2_io_timer_del(&c->io_clock[WRITE], + &c->bucket_clock[WRITE].rescale); + } + + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); +} + +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) +{ + struct open_bucket *ob; + bool ret = false; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->ptr.dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } + + return ret; +} + +/* device goes ro: */ +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + BUG_ON(ca->alloc_thread); + + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + clear_bit(ca->dev_idx, c->rw_devs[i].d); + + /* + * Capacity is calculated based off of devices in allocation groups: + */ + bch2_recalc_capacity(c); + + /* Next, close write points that point to this device... 
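(bch2_writepoint_stop() drops any open buckets on this device from the write point)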
*/ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, &ca->copygc_write_point); + bch2_writepoint_stop(c, ca, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ + closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ + wake_up(&c->journal.wait); + + /* Now wait for any in flight writes: */ + + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); +} + +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (ca->mi.data_allowed & (1 << i)) + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + +/* stop allocator thread: */ +void bch2_dev_allocator_stop(struct bch_dev *ca) +{ + struct task_struct *p; + + p = rcu_dereference_protected(ca->alloc_thread, 1); + ca->alloc_thread = NULL; + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid bch2_wake_allocator() racing: + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + synchronize_rcu(); + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +/* start allocator thread: */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct task_struct *p; + + /* + * allocator thread already started? + */ + if (ca->alloc_thread) + return 0; + + p = kthread_create(bch2_allocator_thread, ca, + "bch_alloc[%s]", ca->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + rcu_assign_pointer(ca->alloc_thread, p); + wake_up_process(p); + return 0; +} + +static void flush_held_btree_writes(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool flush_updates; + size_t i, nr_pending_updates; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } + } + rcu_read_unlock(); + + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... 
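+ * Each pass either writes out a node that is ready, or waits for the
+ * number of in-flight interior updates to drop before rescanning.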
+ */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } + +} + +static void allocator_start_issue_discards(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + size_t bu; + + for_each_rw_member(ca, c, dev_iter) + while (fifo_pop(&ca->free_inc, bu)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bu), + ca->mi.bucket_size, GFP_NOIO); +} + +static int __bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + u64 journal_seq = 0; + long bu; + bool invalidating_data = false; + int ret = 0; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return -1; + + if (test_alloc_startup(c)) { + invalidating_data = true; + goto not_enough; + } + + /* Scan for buckets that are already invalidated: */ + for_each_rw_member(ca, c, dev_iter) { + struct btree_iter iter; + struct bucket_mark m; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { + if (k.k->type != BCH_ALLOC) + continue; + + bu = k.k->p.offset; + m = READ_ONCE(bucket(ca, bu)->mark); + + if (!is_available_bucket(m) || m.cached_sectors) + continue; + + percpu_down_read(&c->usage_lock); + bch2_mark_alloc_bucket(c, ca, bu, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + percpu_up_read(&c->usage_lock); + + fifo_push(&ca->free_inc, bu); + + if (fifo_full(&ca->free_inc)) + break; + } + bch2_btree_iter_unlock(&iter); + } + + /* did we find enough buckets? */ + for_each_rw_member(ca, c, dev_iter) + if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { + percpu_ref_put(&ca->io_ref); + goto not_enough; + } + + return 0; +not_enough: + pr_debug("did not find enough empty buckets; issuing discards"); + + /* clear out free_inc, we'll be using it again below: */ + for_each_rw_member(ca, c, dev_iter) + discard_invalidated_buckets(c, ca); + + pr_debug("scanning for reclaimable buckets"); + + for_each_rw_member(ca, c, dev_iter) { + find_reclaimable_buckets(c, ca); + + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + invalidating_data |= + bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); + + fifo_push(&ca->free[RESERVE_BTREE], bu); + set_bit(bu, ca->buckets_dirty); + } + } + + pr_debug("done scanning for reclaimable buckets"); + + /* + * We're moving buckets to freelists _before_ they've been marked as + * invalidated on disk - we have to so that we can allocate new btree + * nodes to mark them as invalidated on disk. + * + * However, we can't _write_ to any of these buckets yet - they might + * have cached data in them, which is live until they're marked as + * invalidated on disk: + */ + if (invalidating_data) { + BUG(); + pr_info("holding writes"); + pr_debug("invalidating existing data"); + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + } else { + pr_debug("issuing discards"); + allocator_start_issue_discards(c); + } + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? 
+ */ + ret = bch2_alloc_write(c); + if (ret) + return ret; + + if (invalidating_data) { + pr_debug("flushing journal"); + + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) + return ret; + + pr_debug("issuing discards"); + allocator_start_issue_discards(c); + } + + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + + /* now flush dirty btree nodes: */ + if (invalidating_data) + flush_held_btree_writes(c); + + return 0; +} + +int bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + down_read(&c->gc_lock); + ret = __bch2_fs_allocator_start(c); + up_read(&c->gc_lock); + + if (ret) + return ret; + + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return bch2_alloc_write(c); +} + +void bch2_fs_allocator_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + spin_lock_init(&c->freelist_lock); + bch2_bucket_clock_init(c, READ); + bch2_bucket_clock_init(c, WRITE); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { + writepoint_init(wp, BCH_DATA_USER); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + } + + c->pd_controllers_update_seconds = 5; + INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 index 000000000000..6dbabe83cab7 --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "debug.h" + +#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) + +const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_alloc_ops (struct bkey_ops) { \ + .key_invalid = bch2_alloc_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +int bch2_alloc_read(struct bch_fs *, struct list_head *); +int bch2_alloc_replay_key(struct bch_fs *, struct bpos); + +static inline void bch2_wake_allocator(struct bch_dev *ca) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(ca->alloc_thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ + if (expensive_debug_checks(c) && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + size_t iter; + long i; + unsigned j; + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + BUG_ON(i == bucket); + fifo_for_each_entry(i, &ca->free_inc, iter) + BUG_ON(i == bucket); + } +} + +void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void 
bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +void bch2_dev_allocator_stop(struct bch_dev *); +int bch2_dev_allocator_start(struct bch_dev *); + +int bch2_alloc_write(struct bch_fs *); +int bch2_fs_allocator_start(struct bch_fs *); +void bch2_fs_allocator_init(struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 index 000000000000..be25e01a7d00 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often have to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * It's important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + * + * invalidate_buckets() drives all the processes described above. It's called + * from bch2_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. 
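+ * In this tree the scanning and invalidating described above lives in
+ * alloc_background.c - see find_reclaimable_buckets_{lru,fifo,random}()
+ * and bch2_invalidate_buckets().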
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "io.h" +#include "trace.h" + +#include +#include +#include + +enum bucket_alloc_ret { + ALLOC_SUCCESS = 0, + OPEN_BUCKETS_EMPTY = -1, + FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ + NO_DEVICES = -3, /* -EROFS */ +}; + +/* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + percpu_down_read(&c->usage_lock); + spin_lock(&ob->lock); + + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), + false, gc_pos_alloc(c, ob), 0); + ob->valid = false; + + spin_unlock(&ob->lock); + percpu_up_read(&c->usage_lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + + c->open_buckets_nr_free--; + return ob; +} + +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ + struct bucket_array *buckets; + ssize_t b; + + rcu_read_lock(); + buckets = bucket_array(ca); + + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark)) + goto success; + b = -1; +success: + rcu_read_unlock(); + return b; +} + +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_RESERVE / 2; + default: + return BTREE_NODE_RESERVE; + } +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct bucket_array *buckets; + struct open_bucket *ob; + long bucket; + + spin_lock(&c->freelist_lock); + + if (may_alloc_partial && + ca->open_buckets_partial_nr) { + int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + c->open_buckets[ret].on_partial_list = false; + spin_unlock(&c->freelist_lock); + return ret; + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return 
OPEN_BUCKETS_EMPTY; + } + + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + goto out; + + switch (reserve) { + case RESERVE_ALLOC: + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_BTREE: + if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= + ca->free[RESERVE_BTREE].size && + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + goto out; + break; + default: + break; + } + + if (cl) + closure_wait(&c->freelist_wait, cl); + + spin_unlock(&c->freelist_lock); + + trace_bucket_alloc_fail(ca, reserve); + return FREELIST_EMPTY; +out: + verify_not_on_freelist(c, ca, bucket); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + buckets = bucket_array(ca); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .gen = buckets->b[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + spin_unlock(&ob->lock); + + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); + + trace_bucket_alloc(ca, reserve); + return ob - c->open_buckets; +} + +static int __dev_alloc_cmp(struct write_point *wp, + unsigned l, unsigned r) +{ + return ((wp->next_alloc[l] > wp->next_alloc[r]) - + (wp->next_alloc[l] < wp->next_alloc[r])); +} + +#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) + +struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs) +{ + struct dev_alloc_list ret = { .nr = 0 }; + struct bch_dev *ca; + unsigned i; + + for_each_member_device_rcu(ca, c, i, devs) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); + return ret; +} + +void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + u64 *v = wp->next_alloc + ca->dev_idx; + u64 free_space = dev_buckets_free(c, ca); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; + u64 scale = *v / 4; + + if (*v + free_space_inv >= *v) + *v += free_space_inv; + else + *v = U64_MAX; + + for (v = wp->next_alloc; + v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) + *v = *v < scale ? 
0 : *v - scale; +} + +static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, + struct write_point *wp, + unsigned nr_replicas, + enum alloc_reserve reserve, + struct bch_devs_mask *devs, + struct closure *cl) +{ + enum bucket_alloc_ret ret = NO_DEVICES; + struct dev_alloc_list devs_sorted; + struct bch_dev *ca; + unsigned i, nr_ptrs_effective = 0; + bool have_cache_dev = false; + + BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); + + for (i = wp->first_ptr; i < wp->nr_ptrs; i++) { + ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); + + nr_ptrs_effective += ca->mi.durability; + have_cache_dev |= !ca->mi.durability; + } + + if (nr_ptrs_effective >= nr_replicas) + return ALLOC_SUCCESS; + + devs_sorted = bch2_wp_alloc_list(c, wp, devs); + + for (i = 0; i < devs_sorted.nr; i++) { + int ob; + + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability && + (have_cache_dev || + wp->type != BCH_DATA_USER)) + continue; + + ob = bch2_bucket_alloc(c, ca, reserve, + wp->type == BCH_DATA_USER, cl); + if (ob < 0) { + ret = ob; + if (ret == OPEN_BUCKETS_EMPTY) + break; + continue; + } + + BUG_ON(ob <= 0 || ob > U8_MAX); + BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + + wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; + + bch2_wp_rescale(c, ca, wp); + + nr_ptrs_effective += ca->mi.durability; + have_cache_dev |= !ca->mi.durability; + + __clear_bit(ca->dev_idx, devs->d); + + if (nr_ptrs_effective >= nr_replicas) { + ret = ALLOC_SUCCESS; + break; + } + } + + EBUG_ON(reserve == RESERVE_MOVINGGC && + ret != ALLOC_SUCCESS && + ret != OPEN_BUCKETS_EMPTY); + + switch (ret) { + case ALLOC_SUCCESS: + return 0; + case NO_DEVICES: + return -EROFS; + case FREELIST_EMPTY: + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; + default: + BUG(); + } +} + +/* Sector allocator */ + +static void bch2_writepoint_drop_ptr(struct bch_fs *c, + struct write_point *wp, + unsigned i) +{ + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ca->open_buckets_partial_nr >= + ARRAY_SIZE(ca->open_buckets_partial)); + + if (wp->type == BCH_DATA_USER) { + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); + } else { + bch2_open_bucket_put(c, ob); + } + + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + + if (i < wp->first_ptr) + wp->first_ptr--; +} + +void bch2_writepoint_drop_ptrs(struct bch_fs *c, + struct write_point *wp, + u16 target, bool in_target) +{ + int i; + + for (i = wp->first_ptr - 1; i >= 0; --i) + if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target) == in_target) + bch2_writepoint_drop_ptr(c, wp, i); +} + +static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr_all(wp, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ptr_stale(ca, &ob->ptr)); + } +#endif +} + +static int open_bucket_add_buckets(struct bch_fs *c, + u16 target, + struct write_point *wp, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + enum alloc_reserve reserve, + struct closure *cl) +{ + struct bch_devs_mask devs = c->rw_devs[wp->type]; + const struct bch_devs_mask *t; + struct open_bucket *ob; + unsigned i; + int ret; + + percpu_down_read(&c->usage_lock); + 
rcu_read_lock(); + + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + writepoint_for_each_ptr_all(wp, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + t = bch2_target_to_mask(c, target); + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + + ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); + + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + + return ret; +} + +void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + struct bch_devs_mask not_self; + + bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); + + mutex_lock(&wp->lock); + wp->first_ptr = wp->nr_ptrs; + bch2_writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true); + mutex_unlock(&wp->lock); +} + +static struct write_point *__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; + + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) + return wp; + + return NULL; +} + +static struct write_point *writepoint_find(struct bch_fs *c, + unsigned long write_point) +{ + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; + mutex_lock(&wp->lock); + return wp; + } + + head = writepoint_hash(c, write_point); +restart_find: + wp = __writepoint_find(head, write_point); + if (wp) { +lock_wp: + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } + + oldest = NULL; + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + mutex_lock(&oldest->lock); + mutex_lock(&c->write_points_hash_lock); + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } + + wp = oldest; + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = sched_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + struct write_point *wp; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned nr_ptrs_have, nr_ptrs_effective; + int ret, i, cache_idx = -1; + + BUG_ON(!nr_replicas || !nr_replicas_required); + + wp = writepoint_find(c, write_point.v); + + wp->first_ptr = 0; + + /* does writepoint have ptrs we can't use? */ + writepoint_for_each_ptr(wp, ob, i) + if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + } + + nr_ptrs_have = wp->first_ptr; + + /* does writepoint have ptrs we don't want to use? 
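(such pointers get swapped in front of wp->first_ptr; writepoint_for_each_ptr() only walks entries from first_ptr onwards)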
*/ + if (target) + writepoint_for_each_ptr(wp, ob, i) + if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + } + + if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { + ret = open_bucket_add_buckets(c, target, wp, devs_have, + nr_replicas, reserve, cl); + } else { + ret = open_bucket_add_buckets(c, target, wp, devs_have, + nr_replicas, reserve, NULL); + if (!ret) + goto alloc_done; + + wp->first_ptr = nr_ptrs_have; + + ret = open_bucket_add_buckets(c, 0, wp, devs_have, + nr_replicas, reserve, cl); + } + + if (ret && ret != -EROFS) + goto err; +alloc_done: + /* check for more than one cache: */ + for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) { + ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); + + if (ca->mi.durability) + continue; + + /* + * if we ended up with more than one cache device, prefer the + * one in the target we want: + */ + if (cache_idx >= 0) { + if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target)) { + bch2_writepoint_drop_ptr(c, wp, i); + } else { + bch2_writepoint_drop_ptr(c, wp, cache_idx); + cache_idx = i; + } + } else { + cache_idx = i; + } + } + + /* we might have more effective replicas than required: */ + nr_ptrs_effective = 0; + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + nr_ptrs_effective += ca->mi.durability; + } + + if (ret == -EROFS && + nr_ptrs_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + + if (nr_ptrs_effective > nr_replicas) { + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (ca->mi.durability && + ca->mi.durability <= nr_ptrs_effective - nr_replicas && + !bch2_dev_in_target(c, ob->ptr.dev, target)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + nr_ptrs_effective -= ca->mi.durability; + } + } + } + + if (nr_ptrs_effective > nr_replicas) { + writepoint_for_each_ptr(wp, ob, i) { + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (ca->mi.durability && + ca->mi.durability <= nr_ptrs_effective - nr_replicas) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + nr_ptrs_effective -= ca->mi.durability; + } + } + } + + /* Remove pointers we don't want to use: */ + if (target) + bch2_writepoint_drop_ptrs(c, wp, target, false); + + BUG_ON(wp->first_ptr >= wp->nr_ptrs); + BUG_ON(nr_ptrs_effective < nr_replicas_required); + + wp->sectors_free = UINT_MAX; + + writepoint_for_each_ptr(wp, ob, i) + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + + verify_not_stale(c, wp); + + return wp; +err: + mutex_unlock(&wp->lock); + return ERR_PTR(ret); +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i_extent *e, unsigned sectors) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + + writepoint_for_each_ptr(wp, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_extent_ptr tmp = ob->ptr; + + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); + + tmp.cached = bkey_extent_is_cached(&e->k) || + (!ca->mi.durability && wp->type == BCH_DATA_USER); + + tmp.offset += ca->mi.bucket_size - ob->sectors_free; + extent_ptr_append(e, tmp); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + +/* + * Append 
pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) +{ + int i; + + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; + + if (!ob->sectors_free) { + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + bch2_open_bucket_put(c, ob); + } + } + + mutex_unlock(&wp->lock); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 index 000000000000..1c738e4ba6c9 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" + +struct bkey; +struct bch_dev; +struct bch_fs; +struct bch_devs_List; + +struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +}; + +struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, + struct write_point *, + struct bch_devs_mask *); +void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, + struct write_point *); + +long bch2_bucket_alloc_new_fs(struct bch_dev *); + +int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, + struct closure *); + +#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \ + for ((_i) = (_start); \ + (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ + (_i)++) + +#define writepoint_for_each_ptr_all(_wp, _ob, _i) \ + __writepoint_for_each_ptr(_wp, _ob, _i, 0) + +#define writepoint_for_each_ptr(_wp, _ob, _i) \ + __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr) + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) +{ + unsigned i; + + for (i = 0; i < *nr; i++) + bch2_open_bucket_put(c, c->open_buckets + refs[i]); + + *nr = 0; +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + u8 *nr, u8 *refs) +{ + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr(wp, ob, i) { + atomic_inc(&ob->pin); + refs[(*nr)++] = ob - c->open_buckets; + } +} + +struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *); + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i_extent *, unsigned); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, + struct write_point *); + +void bch2_writepoint_drop_ptrs(struct bch_fs *, struct write_point *, + u16, bool); + +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} + +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + 
mutex_init(&wp->lock); + wp->type = type; +} + +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8c6c2ca3c992..b3f5f28b8761 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_types.h" -#include "alloc.h" +#include "alloc_background.h" #include "dirent.h" #include "error.h" #include "extents.h" diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5b1e1aab36e9..7fd75435542b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -5,7 +5,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" #include "bkey_methods.h" #include "btree_locking.h" #include "btree_update_interior.h" diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6d99f3d191d3..0ca998035bab 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ab61abdf975d..c6544f35eb09 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -64,7 +64,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" #include "btree_gc.h" #include "buckets.h" #include "error.h" diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index db0f990bebf4..d24cff52ba96 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_CHARDEV #include "bcachefs.h" -#include "alloc.h" #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 195af78cb474..2902e5f925ef 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2,6 +2,7 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 50cc87b7875d..dfd2d3b708c5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -7,7 +7,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index f814226a5196..62f5861005ea 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -2,7 +2,6 @@ #ifndef _BCACHEFS_IO_H #define _BCACHEFS_IO_H -#include "alloc.h" #include "checksum.h" #include "io_types.h" diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b83548ae33b2..97fbc2698dc0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -6,7 +6,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" #include "buckets.h" diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 16ea32dc1fa4..9dd881c0410e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9402b45bf868..96f04f349fb1 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,6 
+1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 26b8e95db1db..b7def19bdd85 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 74702e753f60..461af44dbde7 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "btree_iter.h" #include "buckets.h" #include "clock.h" diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3deb59a675e1..6d603654d150 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ffeffd50b083..9d9d4fb8348b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -8,7 +8,8 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "alloc_foreground.h" #include "btree_cache.h" #include "btree_gc.h" #include "btree_update_interior.h" diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a472e454099b..b7a65bc20430 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -9,7 +9,7 @@ #ifndef NO_BCACHEFS_SYSFS #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" #include "compress.h" #include "sysfs.h" #include "btree_cache.h" -- cgit From ef337c54c6d18d4c6ce0aef8f4f327d4cf42ae08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Oct 2018 04:12:42 -0400 Subject: bcachefs: Allocation code refactoring bch2_alloc_sectors_start() was a nightmare to work with - it's got some tricky stuff to do, since it wants to use the buckets the writepoint already has, unless they're not in the target it wants to write to, unless it can't allocate from any other devices in which case it will use those buckets if it has to - et cetera. This restructures the code to start with a new empty list of open buckets we're going to use for the new allocation, pulling buckets from the write point's list as we decide that we really are going to use them - making the code somewhat more functional and drastically easier to understand. Also fixes a bug where we could end up waiting on c->freelist_wait (because allocating from one device failed) but return success from bch2_bucket_alloc(), because allocating from a different device succeeded. 
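In outline, the new flow builds a fresh open_buckets list for each allocation and only moves a bucket onto that list once we know we will actually use it; whatever remains on the write point afterwards is either kept or freed. The standalone sketch below illustrates the pattern with toy types; toy_open_buckets, ob_push(), bucket_usable() and the even/odd target check are placeholders for illustration only, not the real bcachefs structures or allocation policy:

/*
 * Minimal sketch of the restructured allocation flow: a fixed-size index
 * list is built up from scratch, buckets we decide to use are pushed onto
 * it, and whatever the write point still holds afterwards is set aside.
 */
#include <stdbool.h>
#include <stdio.h>

#define OPEN_BUCKET_LIST_MAX	15

struct toy_open_buckets {
	unsigned	nr;
	unsigned	v[OPEN_BUCKET_LIST_MAX];
};

static void ob_push(struct toy_open_buckets *obs, unsigned ob)
{
	/* append an index, ignoring overflow in this toy version */
	if (obs->nr < OPEN_BUCKET_LIST_MAX)
		obs->v[obs->nr++] = ob;
}

/* stand-in for "is this bucket on a device in the target we want?" */
static bool bucket_usable(unsigned ob, unsigned target)
{
	return ob % 2 == target % 2;
}

int main(void)
{
	struct toy_open_buckets wp   = { 3, { 10, 11, 12 } };	/* write point's current buckets */
	struct toy_open_buckets ptrs = { 0 };			/* buckets for this allocation */
	struct toy_open_buckets keep = { 0 };			/* buckets the write point retains */
	unsigned target = 0, i;

	/* Pull usable buckets from the write point onto the new list: */
	for (i = 0; i < wp.nr; i++)
		ob_push(bucket_usable(wp.v[i], target) ? &ptrs : &keep, wp.v[i]);

	/* (Newly allocated buckets would also be ob_push()ed onto ptrs here.) */

	wp = keep;	/* unused buckets stay behind (or get freed) */

	for (i = 0; i < ptrs.nr; i++)
		printf("using bucket %u\n", ptrs.v[i]);
	return 0;
}

The actual diff does the same kind of filtering via get_buckets_from_writepoint() and open_bucket_free_unused(), with any additional buckets appended by bch2_bucket_alloc_set().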
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/alloc_foreground.c | 415 ++++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 44 ++-- fs/bcachefs/alloc_types.h | 12 +- fs/bcachefs/btree_gc.c | 3 +- fs/bcachefs/btree_types.h | 9 +- fs/bcachefs/btree_update_interior.c | 28 +-- fs/bcachefs/btree_update_interior.h | 1 - fs/bcachefs/io.c | 12 +- fs/bcachefs/io.h | 2 +- fs/bcachefs/io_types.h | 3 +- fs/bcachefs/journal.c | 6 +- 12 files changed, 238 insertions(+), 299 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d22b2b72b0d1..45e8b124a9f3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1101,7 +1101,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); + bch2_open_buckets_put(c, &a->ob); } mutex_unlock(&c->btree_reserve_cache_lock); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index be25e01a7d00..562c1317aa9e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -70,10 +70,9 @@ #include enum bucket_alloc_ret { - ALLOC_SUCCESS = 0, - OPEN_BUCKETS_EMPTY = -1, - FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ - NO_DEVICES = -3, /* -EROFS */ + ALLOC_SUCCESS, + OPEN_BUCKETS_EMPTY, + FREELIST_EMPTY, /* Allocator thread not keeping up */ }; /* @@ -129,6 +128,43 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static void open_bucket_free_unused(struct bch_fs *c, + struct write_point *wp, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ca->open_buckets_partial_nr >= + ARRAY_SIZE(ca->open_buckets_partial)); + + if (wp->type == BCH_DATA_USER) { + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); + } else { + bch2_open_bucket_put(c, ob); + } +} + +static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ptr_stale(ca, &ob->ptr)); + } +#endif +} + /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { @@ -164,10 +200,10 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) * * Returns index of bucket on success, 0 on failure * */ -int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) { struct bucket_array *buckets; struct open_bucket *ob; @@ -177,10 +213,11 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, if (may_alloc_partial && ca->open_buckets_partial_nr) { - int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - c->open_buckets[ret].on_partial_list = false; + ob = c->open_buckets + + ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + ob->on_partial_list = false; 
spin_unlock(&c->freelist_lock); - return ret; + return ob; } if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { @@ -188,7 +225,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, closure_wait(&c->open_buckets_wait, cl); spin_unlock(&c->freelist_lock); trace_open_bucket_alloc_fail(ca, reserve); - return OPEN_BUCKETS_EMPTY; + return ERR_PTR(-OPEN_BUCKETS_EMPTY); } if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) @@ -219,7 +256,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, spin_unlock(&c->freelist_lock); trace_bucket_alloc_fail(ca, reserve); - return FREELIST_EMPTY; + return ERR_PTR(-FREELIST_EMPTY); out: verify_not_on_freelist(c, ca, bucket); @@ -245,7 +282,7 @@ out: bch2_wake_allocator(ca); trace_bucket_alloc(ca, reserve); - return ob - c->open_buckets; + return ob; } static int __dev_alloc_cmp(struct write_point *wp, @@ -292,155 +329,114 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, *v = *v < scale ? 0 : *v - scale; } -static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct bch_devs_mask *devs, - struct closure *cl) +static int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + struct closure *cl) { - enum bucket_alloc_ret ret = NO_DEVICES; - struct dev_alloc_list devs_sorted; + struct dev_alloc_list devs_sorted = + bch2_wp_alloc_list(c, wp, devs_may_alloc); struct bch_dev *ca; - unsigned i, nr_ptrs_effective = 0; - bool have_cache_dev = false; - - BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - - for (i = wp->first_ptr; i < wp->nr_ptrs; i++) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - } - - if (nr_ptrs_effective >= nr_replicas) - return ALLOC_SUCCESS; + bool alloc_failure = false; + unsigned i; - devs_sorted = bch2_wp_alloc_list(c, wp, devs); + BUG_ON(*nr_effective >= nr_replicas); for (i = 0; i < devs_sorted.nr; i++) { - int ob; + struct open_bucket *ob; ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); if (!ca) continue; if (!ca->mi.durability && - (have_cache_dev || + (*have_cache || wp->type != BCH_DATA_USER)) continue; ob = bch2_bucket_alloc(c, ca, reserve, wp->type == BCH_DATA_USER, cl); - if (ob < 0) { - ret = ob; + if (IS_ERR(ob)) { + enum bucket_alloc_ret ret = -PTR_ERR(ob); + + WARN_ON(reserve == RESERVE_MOVINGGC && + ret != OPEN_BUCKETS_EMPTY); + + if (cl) + return -EAGAIN; if (ret == OPEN_BUCKETS_EMPTY) - break; + return -ENOSPC; + alloc_failure = true; continue; } - BUG_ON(ob <= 0 || ob > U8_MAX); - BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + __clear_bit(ca->dev_idx, devs_may_alloc->d); + *nr_effective += ca->mi.durability; + *have_cache |= !ca->mi.durability; - wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; + ob_push(c, ptrs, ob); bch2_wp_rescale(c, ca, wp); - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - - __clear_bit(ca->dev_idx, devs->d); - - if (nr_ptrs_effective >= nr_replicas) { - ret = ALLOC_SUCCESS; - break; - } + if (*nr_effective >= nr_replicas) + return 0; } - EBUG_ON(reserve == RESERVE_MOVINGGC && - ret != ALLOC_SUCCESS && - ret != OPEN_BUCKETS_EMPTY); - - switch (ret) { - case ALLOC_SUCCESS: - return 0; - case NO_DEVICES: - return -EROFS; - case FREELIST_EMPTY: - case 
OPEN_BUCKETS_EMPTY: - return cl ? -EAGAIN : -ENOSPC; - default: - BUG(); - } + return alloc_failure ? -ENOSPC : -EROFS; } /* Sector allocator */ -static void bch2_writepoint_drop_ptr(struct bch_fs *c, - struct write_point *wp, - unsigned i) +static int get_buckets_from_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache) { - struct open_bucket *ob = wp->ptrs[i]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ca->open_buckets_partial_nr >= - ARRAY_SIZE(ca->open_buckets_partial)); - - if (wp->type == BCH_DATA_USER) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } - - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - - if (i < wp->first_ptr) - wp->first_ptr--; -} - -void bch2_writepoint_drop_ptrs(struct bch_fs *c, - struct write_point *wp, - u16 target, bool in_target) -{ - int i; - - for (i = wp->first_ptr - 1; i >= 0; --i) - if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target) == in_target) - bch2_writepoint_drop_ptr(c, wp, i); -} - -static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) -{ -#ifdef CONFIG_BCACHEFS_DEBUG + struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; - writepoint_for_each_ptr_all(wp, ob, i) { + open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - BUG_ON(ptr_stale(ca, &ob->ptr)); + if (*nr_effective < nr_replicas && + test_bit(ob->ptr.dev, devs_may_alloc->d) && + (ca->mi.durability || + (wp->type == BCH_DATA_USER && !*have_cache))) { + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += ca->mi.durability; + *have_cache |= !ca->mi.durability; + + ob_push(c, ptrs, ob); + } else { + ob_push(c, &ptrs_skip, ob); + } } -#endif + wp->ptrs = ptrs_skip; + + return *nr_effective < nr_replicas ? 
-ENOSPC : 0; } static int open_bucket_add_buckets(struct bch_fs *c, - u16 target, + struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, + u16 target, unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, enum alloc_reserve reserve, struct closure *cl) { - struct bch_devs_mask devs = c->rw_devs[wp->type]; + struct bch_devs_mask devs; const struct bch_devs_mask *t; struct open_bucket *ob; unsigned i; @@ -449,19 +445,38 @@ static int open_bucket_add_buckets(struct bch_fs *c, percpu_down_read(&c->usage_lock); rcu_read_lock(); + devs = c->rw_devs[wp->type]; + /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) __clear_bit(devs_have->devs[i], devs.d); - writepoint_for_each_ptr_all(wp, ob, i) + open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->ptr.dev, devs.d); t = bch2_target_to_mask(c, target); if (t) bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); + ret = get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, have_cache); + if (!ret) + goto out; + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, + nr_replicas, nr_effective, have_cache, + reserve, NULL); + if (!ret || ret == -EROFS || !cl) + goto out; + + ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, + nr_replicas, nr_effective, have_cache, + reserve, cl); +out: rcu_read_unlock(); percpu_up_read(&c->usage_lock); @@ -471,13 +486,18 @@ static int open_bucket_add_buckets(struct bch_fs *c, void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { - struct bch_devs_mask not_self; - - bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; mutex_lock(&wp->lock); - wp->first_ptr = wp->nr_ptrs; - bch2_writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ob->ptr.dev == ca->dev_idx) + open_bucket_free_unused(c, wp, ob); + else + ob_push(c, &ptrs, ob); + + wp->ptrs = ptrs; mutex_unlock(&wp->lock); } @@ -558,134 +578,64 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, { struct write_point *wp; struct open_bucket *ob; - struct bch_dev *ca; - unsigned nr_ptrs_have, nr_ptrs_effective; - int ret, i, cache_idx = -1; + unsigned nr_effective = 0; + struct open_buckets ptrs = { .nr = 0 }; + bool have_cache = false; + int ret = 0, i; BUG_ON(!nr_replicas || !nr_replicas_required); wp = writepoint_find(c, write_point.v); - wp->first_ptr = 0; - - /* does writepoint have ptrs we can't use? */ - writepoint_for_each_ptr(wp, ob, i) - if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - nr_ptrs_have = wp->first_ptr; - - /* does writepoint have ptrs we don't want to use? 
*/ - if (target) - writepoint_for_each_ptr(wp, ob, i) - if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, cl); + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + nr_replicas, &nr_effective, + &have_cache, reserve, cl); } else { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, NULL); + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + nr_replicas, &nr_effective, + &have_cache, reserve, NULL); if (!ret) goto alloc_done; - wp->first_ptr = nr_ptrs_have; - - ret = open_bucket_add_buckets(c, 0, wp, devs_have, - nr_replicas, reserve, cl); + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0, + nr_replicas, &nr_effective, + &have_cache, reserve, cl); } - - if (ret && ret != -EROFS) - goto err; alloc_done: - /* check for more than one cache: */ - for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - if (ca->mi.durability) - continue; - - /* - * if we ended up with more than one cache device, prefer the - * one in the target we want: - */ - if (cache_idx >= 0) { - if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target)) { - bch2_writepoint_drop_ptr(c, wp, i); - } else { - bch2_writepoint_drop_ptr(c, wp, cache_idx); - cache_idx = i; - } - } else { - cache_idx = i; - } - } - - /* we might have more effective replicas than required: */ - nr_ptrs_effective = 0; - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - nr_ptrs_effective += ca->mi.durability; - } + BUG_ON(!ret && nr_effective < nr_replicas); if (ret == -EROFS && - nr_ptrs_effective >= nr_replicas_required) + nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas && - !bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - /* Remove pointers we don't want to use: */ - if (target) - bch2_writepoint_drop_ptrs(c, wp, target, false); + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, wp, ob); - BUG_ON(wp->first_ptr >= wp->nr_ptrs); - BUG_ON(nr_ptrs_effective < nr_replicas_required); + wp->ptrs = ptrs; wp->sectors_free = UINT_MAX; - writepoint_for_each_ptr(wp, ob, i) + open_bucket_for_each(c, &wp->ptrs, ob, i) wp->sectors_free = min(wp->sectors_free, ob->sectors_free); BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - verify_not_stale(c, wp); + verify_not_stale(c, &wp->ptrs); return wp; err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) + ob_push(c, &ptrs, ob); + else + open_bucket_free_unused(c, wp, ob); + wp->ptrs = ptrs; + mutex_unlock(&wp->lock); return ERR_PTR(ret); } @@ -703,7 
+653,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, BUG_ON(sectors > wp->sectors_free); wp->sectors_free -= sectors; - writepoint_for_each_ptr(wp, ob, i) { + open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); struct bch_extent_ptr tmp = ob->ptr; @@ -726,16 +676,15 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - int i; + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; - for (i = wp->nr_ptrs - 1; i >= 0; --i) { - struct open_bucket *ob = wp->ptrs[i]; - - if (!ob->sectors_free) { - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - bch2_open_bucket_put(c, ob); - } - } + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); + wp->ptrs = keep; mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1c738e4ba6c9..609685d08642 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -23,19 +23,23 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, long bch2_bucket_alloc_new_fs(struct bch_dev *); -int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, - struct closure *); +struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum alloc_reserve, bool, + struct closure *); -#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \ - for ((_i) = (_start); \ - (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ - (_i)++) +static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, + struct open_bucket *ob) +{ + BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); -#define writepoint_for_each_ptr_all(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, 0) + obs->v[obs->nr++] = ob - c->open_buckets; +} -#define writepoint_for_each_ptr(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr) +#define open_bucket_for_each(_c, _obs, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_obs)->nr && \ + ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ + (_i)++) void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); @@ -45,26 +49,27 @@ static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob __bch2_open_bucket_put(c, ob); } -static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) +static inline void bch2_open_buckets_put(struct bch_fs *c, + struct open_buckets *ptrs) { + struct open_bucket *ob; unsigned i; - for (i = 0; i < *nr; i++) - bch2_open_bucket_put(c, c->open_buckets + refs[i]); - - *nr = 0; + open_bucket_for_each(c, ptrs, ob, i) + bch2_open_bucket_put(c, ob); + ptrs->nr = 0; } static inline void bch2_open_bucket_get(struct bch_fs *c, struct write_point *wp, - u8 *nr, u8 *refs) + struct open_buckets *ptrs) { struct open_bucket *ob; unsigned i; - writepoint_for_each_ptr(wp, ob, i) { + open_bucket_for_each(c, &wp->ptrs, ob, i) { atomic_inc(&ob->pin); - refs[(*nr)++] = ob - c->open_buckets; + ob_push(c, ptrs, ob); } } @@ -84,9 +89,6 @@ void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); -void bch2_writepoint_drop_ptrs(struct bch_fs *, struct write_point *, - u16, bool); - static inline struct hlist_head *writepoint_hash(struct bch_fs *c, unsigned long write_point) { diff 
--git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 035c50052167..e0306d68ae9f 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -58,6 +58,13 @@ struct open_bucket { struct bch_extent_ptr ptr; }; +#define OPEN_BUCKET_LIST_MAX 15 + +struct open_buckets { + u8 nr; + u8 v[OPEN_BUCKET_LIST_MAX]; +}; + struct write_point { struct hlist_node node; struct mutex lock; @@ -65,13 +72,10 @@ struct write_point { unsigned long write_point; enum bch_data_type type; - u8 nr_ptrs; - u8 first_ptr; - /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; - struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; + struct open_buckets ptrs; u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7fd75435542b..d07a6b297078 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_locking.h" #include "btree_update_interior.h" @@ -803,7 +804,7 @@ next: bch2_btree_iter_node_replace(iter, new_nodes[0]); for (i = 0; i < nr_new_nodes; i++) - bch2_btree_open_bucket_put(c, new_nodes[i]); + bch2_open_buckets_put(c, &new_nodes[i]->ob); /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5053ed5f2762..dd9660a9f12b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -54,13 +54,8 @@ struct btree_write { struct closure_waitlist wait; }; -struct btree_ob_ref { - u8 nr; - u8 refs[BCH_REPLICAS_MAX]; -}; - struct btree_alloc { - struct btree_ob_ref ob; + struct open_buckets ob; BKEY_PADDED(k); }; @@ -127,7 +122,7 @@ struct btree { */ unsigned long will_make_reachable; - struct btree_ob_ref ob; + struct open_buckets ob; /* lru list */ struct list_head list; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0ca998035bab..26721c5a871c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -247,7 +247,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { - struct btree_ob_ref ob = b->ob; + struct open_buckets ob = b->ob; btree_update_drop_new_node(c, b); @@ -259,7 +259,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) __btree_node_free(c, b); six_unlock_write(&b->lock); - bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); + bch2_open_buckets_put(c, &ob); } void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, @@ -300,11 +300,6 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, */ } -void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) -{ - bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); -} - static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct disk_reservation *res, struct closure *cl, @@ -314,7 +309,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct btree *b; BKEY_PADDED(k) tmp; struct bkey_i_extent *e; - struct btree_ob_ref ob; + struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; @@ -356,7 +351,7 @@ retry: struct open_bucket *ob; unsigned i; - writepoint_for_each_ptr(wp, ob, i) + open_bucket_for_each(c, &wp->ptrs, ob, i) if (ob->sectors_free < 
c->opts.btree_node_size) ob->sectors_free = 0; @@ -367,8 +362,7 @@ retry: e = bkey_extent_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); - ob.nr = 0; - bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); @@ -489,7 +483,7 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser b->ob.nr = 0; bkey_copy(&a->k, &b->key); } else { - bch2_btree_open_bucket_put(c, b); + bch2_open_buckets_put(c, &b->ob); } btree_node_lock_type(c, b, SIX_LOCK_write); @@ -1432,11 +1426,11 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_set_root(as, n1, iter); } - bch2_btree_open_bucket_put(c, n1); + bch2_open_buckets_put(c, &n1->ob); if (n2) - bch2_btree_open_bucket_put(c, n2); + bch2_open_buckets_put(c, &n2->ob); if (n3) - bch2_btree_open_bucket_put(c, n3); + bch2_open_buckets_put(c, &n3->ob); /* * Note - at this point other linked iterators could still have @b read @@ -1751,7 +1745,7 @@ retry: bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); - bch2_btree_open_bucket_put(c, n); + bch2_open_buckets_put(c, &n->ob); bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); bch2_btree_iter_node_replace(iter, n); @@ -1843,7 +1837,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_set_root(as, n, iter); } - bch2_btree_open_bucket_put(c, n); + bch2_open_buckets_put(c, &n->ob); bch2_btree_node_free_inmem(c, b, iter); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index b24988352b03..e5156e908110 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -132,7 +132,6 @@ struct btree_update { void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, struct btree_iter *); void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *); struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index dfd2d3b708c5..d5c17024c884 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -369,7 +369,7 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: - bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); + bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; @@ -816,8 +816,8 @@ static void __bch2_write(struct closure *cl) again: do { /* +1 for possible cache device: */ - if (op->open_buckets_nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets)) + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) goto flush_io; if (bch2_keylist_realloc(&op->insert_keys, @@ -848,11 +848,7 @@ again: ret = bch2_write_extent(op, wp); - BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr > - ARRAY_SIZE(op->open_buckets)); - bch2_open_bucket_get(c, wp, - &op->open_buckets_nr, - op->open_buckets); + bch2_open_bucket_get(c, wp, &op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 62f5861005ea..1cc040a413ee 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -75,7 +75,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->alloc_reserve = RESERVE_NONE; - 
op->open_buckets_nr = 0; + op->open_buckets.nr = 0; op->devs_have.nr = 0; op->target = 0; op->opts = opts; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index b313128ed857..48273bb68c94 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -106,7 +106,6 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:4; - u8 open_buckets_nr; struct bch_devs_list devs_have; u16 target; u16 nonce; @@ -123,7 +122,7 @@ struct bch_write_op { struct disk_reservation res; - u8 open_buckets[16]; + struct open_buckets open_buckets; /* * If caller wants to flush but hasn't passed us a journal_seq ptr, we diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 97fbc2698dc0..7499e15a2982 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -717,13 +717,13 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } } else { - int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); - if (ob_idx < 0) { + ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + false, cl); + if (IS_ERR(ob)) { ret = cl ? -EAGAIN : -ENOSPC; goto err; } - ob = c->open_buckets + ob_idx; bucket = sector_to_bucket(ca, ob->ptr.offset); } -- cgit From 6278a46da3b39dee17d067a9634c3c68051e916e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2018 14:53:25 -0400 Subject: bcachefs: fix a spurious gcc warning Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 562c1317aa9e..920d9ff3c53b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -207,7 +207,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, { struct bucket_array *buckets; struct open_bucket *ob; - long bucket; + long bucket = 0; spin_lock(&c->freelist_lock); -- cgit From deb3318b0ea4b288431f09f8dbcf5de90a14b27b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2018 14:57:57 -0400 Subject: bcachefs: fix missing include Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 609685d08642..636fe686dc48 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -5,6 +5,8 @@ #include "bcachefs.h" #include "alloc_types.h" +#include + struct bkey; struct bch_dev; struct bch_fs; -- cgit From 4cb13156665d73963fc608af0429d29fa77072c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Oct 2018 11:03:39 -0400 Subject: bcachefs: extent_ptr_decoded Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_io.h | 2 +- fs/bcachefs/debug.c | 2 +- fs/bcachefs/extents.c | 8 ++++---- fs/bcachefs/extents.h | 4 ++-- fs/bcachefs/extents_types.h | 4 ++-- fs/bcachefs/io.c | 8 ++++---- fs/bcachefs/io.h | 2 +- fs/bcachefs/io_types.h | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1db103815dd3..96bcdf5339e8 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1407,7 +1407,7 @@ static void btree_node_read_endio(struct bio *bio) void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bool sync) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; diff --git 
a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 7835f8a9e3a0..34d0c0fe8b25 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -15,7 +15,7 @@ struct btree_read_bio { struct bch_fs *c; u64 start_time; unsigned have_ioref:1; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; }; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7db0e65927c6..550bb10bbb7b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -36,7 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree *v = c->verify_data; struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_dev *ca; struct bio *bio; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 77bc33d0a344..45d1f9f29d7c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -603,7 +603,7 @@ static inline bool dev_latency_better(struct bch_fs *c, static int extent_pick_read_device(struct bch_fs *c, struct bkey_s_c_extent e, struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct extent_ptr_decoded *pick) { const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; @@ -622,7 +622,7 @@ static int extent_pick_read_device(struct bch_fs *c, if (ret && !dev_latency_better(c, ptr, &pick->ptr)) continue; - *pick = (struct extent_pick_ptr) { + *pick = (struct extent_ptr_decoded) { .ptr = *ptr, .crc = crc, }; @@ -753,7 +753,7 @@ int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct extent_ptr_decoded *pick) { return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), avoid, pick); @@ -1958,7 +1958,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, */ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) + struct extent_ptr_decoded *pick) { int ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 66143d8d3895..7a61c20b101b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -55,11 +55,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, struct bch_devs_mask *avoid, - struct extent_pick_ptr *); + struct extent_ptr_decoded *); int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, struct bch_devs_mask *, - struct extent_pick_ptr *); + struct extent_ptr_decoded *); void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 27b2bde85e5c..3540e2558c0f 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -19,9 +19,9 @@ struct bch_extent_crc_unpacked { struct bch_csum csum; }; -struct extent_pick_ptr { - struct bch_extent_ptr ptr; +struct extent_ptr_decoded { struct bch_extent_crc_unpacked crc; + struct bch_extent_ptr ptr; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d5c17024c884..d64463751b84 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1026,7 +1026,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, struct bpos pos, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts 
opts, unsigned rbio_sectors, struct bch_read_bio **rbio) @@ -1104,7 +1104,7 @@ err: static inline struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned flags, struct bch_read_bio **rbio, @@ -1411,7 +1411,7 @@ out: } static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && @@ -1566,7 +1566,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, struct bch_devs_mask *avoid, unsigned flags) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; struct promote_op *promote = NULL; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 1cc040a413ee..c832b7291005 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,7 +99,7 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) struct bch_devs_mask; struct cache_promote_op; -struct extent_pick_ptr; +struct extent_ptr_decoded; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, struct bkey_s_c, struct bch_devs_mask *, unsigned); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 48273bb68c94..c697191172b0 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -55,7 +55,7 @@ struct bch_read_bio { struct bch_devs_list devs_have; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; /* start pos of data we read (may not be pos of data we want) */ struct bpos pos; struct bversion version; -- cgit From 642d66d1bbf930ae3b2f4a099cc1359b6489a3d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Oct 2018 16:40:12 -0400 Subject: bcachefs: kill bch_extent_crc_type Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 49 ++++++------------------------------------------- 1 file changed, 6 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 7a61c20b101b..6c8498d4b295 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -235,44 +235,6 @@ union bch_extent_crc { /* checksum entries: */ -enum bch_extent_crc_type { - BCH_EXTENT_CRC_NONE, - BCH_EXTENT_CRC32, - BCH_EXTENT_CRC64, - BCH_EXTENT_CRC128, -}; - -static inline enum bch_extent_crc_type -__extent_crc_type(const union bch_extent_crc *crc) -{ - if (!crc) - return BCH_EXTENT_CRC_NONE; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: - return BCH_EXTENT_CRC32; - case BCH_EXTENT_ENTRY_crc64: - return BCH_EXTENT_CRC64; - case BCH_EXTENT_ENTRY_crc128: - return BCH_EXTENT_CRC128; - default: - BUG(); - } -} - -#define extent_crc_type(_crc) \ -({ \ - BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ - !type_is(_crc, struct bch_extent_crc64 *) && \ - !type_is(_crc, struct bch_extent_crc128 *) && \ - !type_is(_crc, union bch_extent_crc *)); \ - \ - type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ - : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ - : type_is(_crc, struct bch_extent_crc128 *) ? 
BCH_EXTENT_CRC128 \ - : __extent_crc_type((union bch_extent_crc *) _crc); \ -}) - static inline struct bch_extent_crc_unpacked bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) { @@ -284,14 +246,15 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .offset = _crc.offset, \ .live_size = k->size - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: + if (!crc) return (struct bch_extent_crc_unpacked) { .compressed_size = k->size, .uncompressed_size = k->size, .live_size = k->size, }; - case BCH_EXTENT_CRC32: { + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc32), }; @@ -303,7 +266,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC64: { + case BCH_EXTENT_ENTRY_crc64: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc64), .nonce = crc->crc64.nonce, @@ -314,7 +277,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC128: { + case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc128), .nonce = crc->crc128.nonce, -- cgit From 1742237ba1db942b84a697509543fc5a9a25fcfa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Sep 2018 21:08:39 -0400 Subject: bcachefs: extent_for_each_ptr_decode() Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 31 ++++++++-------- fs/bcachefs/extents.c | 76 +++++++++++++++++++-------------------- fs/bcachefs/extents.h | 95 ++++++++++++++++++++++++++++--------------------- fs/bcachefs/fs-io.c | 8 ++--- fs/bcachefs/fs.c | 18 +++++----- fs/bcachefs/move.c | 23 ++++++------ fs/bcachefs/rebalance.c | 31 ++++++++-------- fs/bcachefs/sysfs.c | 12 +++---- 8 files changed, 154 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c6544f35eb09..84972b67f193 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -546,20 +546,19 @@ static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) */ static void bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, + struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, unsigned replicas, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr); s64 uncompressed_sectors = sectors; u64 v; - if (crc.compression_type) { + if (p.crc.compression_type) { unsigned old_sectors, new_sectors; if (sectors > 0) { @@ -570,8 +569,8 @@ static void bch2_mark_pointer(struct bch_fs *c, new_sectors = e.k->size + sectors; } - sectors = -__disk_sectors(crc, old_sectors) - +__disk_sectors(crc, new_sectors); + sectors = -__disk_sectors(p.crc, old_sectors) + +__disk_sectors(p.crc, new_sectors); } /* @@ -584,8 +583,8 @@ static void bch2_mark_pointer(struct bch_fs *c, * caller's responsibility to not apply @fs_usage if gc is in progress. */ fs_usage->replicas - [!ptr->cached && replicas ? replicas - 1 : 0].data - [!ptr->cached ? data_type : BCH_DATA_CACHED] += + [!p.ptr.cached && replicas ? 
replicas - 1 : 0].data + [!p.ptr.cached ? data_type : BCH_DATA_CACHED] += uncompressed_sectors; if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { @@ -607,14 +606,14 @@ static void bch2_mark_pointer(struct bch_fs *c, * the allocator invalidating a bucket after we've already * checked the gen */ - if (gen_after(new.gen, ptr->gen)) { + if (gen_after(new.gen, p.ptr.gen)) { BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); - EBUG_ON(!ptr->cached && + EBUG_ON(!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); return; } - if (!ptr->cached) + if (!p.ptr.cached) checked_add(new.dirty_sectors, sectors); else checked_add(new.cached_sectors, sectors); @@ -695,13 +694,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT: case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; BUG_ON(!sectors); - extent_for_each_ptr_crc(e, ptr, crc) - bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, + extent_for_each_ptr_decode(e, p, entry) + bch2_mark_pointer(c, e, p, sectors, data_type, replicas, stats, journal_seq, flags); break; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 45d1f9f29d7c..0441e42bb1c4 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -231,21 +231,21 @@ unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) unsigned bch2_extent_is_compressed(struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; unsigned ret = 0; switch (k.k->type) { case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc.compression_type != BCH_COMPRESSION_NONE && - crc.compressed_size < crc.live_size) - ret = max_t(unsigned, ret, crc.compressed_size); + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE && + p.crc.compressed_size < p.crc.live_size) + ret = max_t(unsigned, ret, p.crc.compressed_size); + } } return ret; @@ -254,17 +254,17 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, struct bch_extent_ptr m, u64 offset) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (ptr->dev == m.dev && - ptr->gen == m.gen && - (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + extent_for_each_ptr_decode(e, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == (s64) m.offset - offset) - return ptr; + return true; - return NULL; + return false; } /* Doesn't cleanup redundant crcs */ @@ -323,7 +323,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, struct bch_extent_crc_unpacked n) { struct bch_extent_crc_unpacked u; - struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; union bch_extent_entry *i; /* Find a checksum entry that covers only live data: */ @@ -345,11 +345,11 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, bch2_extent_crc_append(e, n); restart_narrow_pointers: - 
extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) - if (can_narrow_crc(u, n)) { - ptr->offset += u.offset; - extent_ptr_append(e, *ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + if (can_narrow_crc(p.crc, n)) { + i->ptr.offset += p.crc.offset; + extent_ptr_append(e, i->ptr); + __bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); goto restart_narrow_pointers; } @@ -475,6 +475,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); entry = extent_entry_next(entry)) { switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; case BCH_EXTENT_ENTRY_crc32: entry->crc32.csum = swab32(entry->crc32.csum); break; @@ -488,8 +490,6 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; - case BCH_EXTENT_ENTRY_ptr: - break; } } break; @@ -605,28 +605,28 @@ static int extent_pick_read_device(struct bch_fs *c, struct bch_devs_mask *avoid, struct extent_ptr_decoded *pick) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; struct bch_dev *ca; int ret = 0; - extent_for_each_ptr_crc(e, ptr, crc) { - ca = bch_dev_bkey_exists(c, ptr->dev); + extent_for_each_ptr_decode(e, p, entry) { + ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ptr->cached && ptr_stale(ca, ptr)) + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) continue; - if (avoid && test_bit(ptr->dev, avoid->d)) - continue; + /* + * XXX: need to make avoid work correctly for stripe ptrs + */ - if (ret && !dev_latency_better(c, ptr, &pick->ptr)) + if (avoid && test_bit(p.ptr.dev, avoid->d)) continue; - *pick = (struct extent_ptr_decoded) { - .ptr = *ptr, - .crc = crc, - }; + if (ret && !dev_latency_better(c, &p.ptr, &pick->ptr)) + continue; + *pick = p; ret = 1; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6c8498d4b295..b1b9c189867a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -182,12 +182,24 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_ptr: + return true; + default: + return false; + } } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { - return !extent_entry_is_ptr(e); + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + return true; + default: + return false; + } } union bch_extent_crc { @@ -310,23 +322,25 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #define extent_for_each_entry(_e, _entry) \ extent_for_each_entry_from(_e, _entry, (_e).v->start) -/* Iterate over crcs only: */ +/* Iterate over pointers only: */ -#define __extent_crc_next(_e, _p) \ +#define extent_ptr_next(_e, _ptr) \ ({ \ - typeof(&(_e).v->start[0]) _entry = _p; \ + typeof(&(_e).v->start[0]) _entry; \ \ - while ((_entry) < extent_entry_last(_e) && \ - !extent_entry_is_crc(_entry)) \ - (_entry) = extent_entry_next(_entry); \ + extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ + if (extent_entry_is_ptr(_entry)) \ + break; \ \ - entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ + _entry < extent_entry_last(_e) ? 
entry_to_ptr(_entry) : NULL; \ }) -#define __extent_for_each_crc(_e, _crc) \ - for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ - (_crc); \ - (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) +#define extent_for_each_ptr(_e, _ptr) \ + for ((_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_next(_e, _ptr)); \ + (_ptr)++) + +/* Iterate over crcs only: */ #define extent_crc_next(_e, _crc, _iter) \ ({ \ @@ -347,43 +361,44 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) /* Iterate over pointers, with crcs: */ -#define extent_ptr_crc_next(_e, _ptr, _crc) \ +static inline struct extent_ptr_decoded +__extent_ptr_decoded_init(const struct bkey *k) +{ + return (struct extent_ptr_decoded) { + .crc = bch2_extent_crc_unpack(k, NULL), + }; +} + +#define EXTENT_ITERATE_EC (1 << 0) + +#define __extent_ptr_next_decode(_e, _ptr, _entry) \ ({ \ __label__ out; \ - typeof(&(_e).v->start[0]) _entry; \ \ - extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ - if (extent_entry_is_crc(_entry)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ - } else { \ - _ptr = entry_to_ptr(_entry); \ + extent_for_each_entry_from(_e, _entry, _entry) \ + switch (extent_entry_type(_entry)) { \ + case BCH_EXTENT_ENTRY_ptr: \ + (_ptr).ptr = _entry->ptr; \ goto out; \ + case BCH_EXTENT_ENTRY_crc32: \ + case BCH_EXTENT_ENTRY_crc64: \ + case BCH_EXTENT_ENTRY_crc128: \ + (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ + entry_to_crc(_entry)); \ + break; \ } \ \ - _ptr = NULL; \ out: \ - _ptr; \ + _entry < extent_entry_last(_e); \ }) -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ - (_ptr)++) - -/* Iterate over pointers only, and from a given position: */ - -#define extent_ptr_next(_e, _ptr) \ -({ \ - struct bch_extent_crc_unpacked _crc; \ - \ - extent_ptr_crc_next(_e, _ptr, _crc); \ -}) +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + for ((_ptr) = __extent_ptr_decoded_init((_e).k), \ + (_entry) = (_e).v->start; \ + __extent_ptr_next_decode(_e, _ptr, _entry); \ + (_entry) = extent_entry_next(_entry)) -#define extent_for_each_ptr(_e, _ptr) \ - for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next(_e, _ptr)); \ - (_ptr)++) +/* Iterate over pointers backwards: */ #define extent_ptr_prev(_e, _ptr) \ ({ \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2902e5f925ef..eecf792198e4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -920,12 +920,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; + struct extent_ptr_decoded p; - extent_for_each_crc(e, crc, i) - want_full_extent |= ((crc.csum_type != 0) | - (crc.compression_type != 0)); + extent_for_each_ptr_decode(e, p, i) + want_full_extent |= ((p.crc.csum_type != 0) | + (p.crc.compression_type != 0)); } readpage_bio_extend(readpages_iter, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b67cf83f7fcd..88bf88c047ae 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1053,27 +1053,27 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, { if (bkey_extent_is_data(&k->k)) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + 
struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_crc(e, ptr, crc) { + extent_for_each_ptr_decode(e, p, entry) { int flags2 = 0; - u64 offset = ptr->offset; + u64 offset = p.ptr.offset; - if (crc.compression_type) + if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else - offset += crc.offset; + offset += p.crc.offset; if ((offset & (PAGE_SECTORS - 1)) || (e.k->size & (PAGE_SECTORS - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, - offset << 9, - e.k->size << 9, flags|flags2); + bkey_start_offset(e.k) << 9, + offset << 9, + e.k->size << 9, flags|flags2); if (ret) return ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 96f04f349fb1..1e63d0e5ce53 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -67,8 +67,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_i_extent *insert, *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; - struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; bool did_work = false; int nr; @@ -99,14 +99,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) { - ptr = (struct bch_extent_ptr *) + struct bch_extent_ptr *ptr = (void *) bch2_extent_has_device(extent_i_to_s_c(insert), m->data_opts.rewrite_dev); + BUG_ON(!ptr); bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); } - extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? 
extent already * has a pointer to the device we just wrote @@ -115,8 +116,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_crc_append(insert, crc); - extent_ptr_append(insert, *ptr); + bch2_extent_crc_append(insert, p.crc); + extent_ptr_append(insert, p.ptr); did_work = true; } @@ -379,8 +380,8 @@ static int bch2_move_extent(struct bch_fs *c, struct data_opts data_opts) { struct moving_io *io; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned sectors = e.k->size, pages; int ret = -ENOMEM; @@ -393,8 +394,8 @@ static int bch2_move_extent(struct bch_fs *c, SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ - extent_for_each_ptr_crc(e, ptr, crc) - sectors = max_t(unsigned, sectors, crc.uncompressed_size); + extent_for_each_ptr_decode(e, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); io = kzalloc(sizeof(struct moving_io) + diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 461af44dbde7..570dbae5a240 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -18,17 +18,16 @@ #include static inline bool rebalance_ptr_pred(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, + struct extent_ptr_decoded p, struct bch_io_opts *io_opts) { if (io_opts->background_target && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && - !ptr->cached) + !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && + !p.ptr.cached) return true; if (io_opts->background_compression && - crc.compression_type != + p.crc.compression_type != bch2_compression_opt_to_type[io_opts->background_compression]) return true; @@ -39,8 +38,8 @@ void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; struct bkey_s_c_extent e; if (!bkey_extent_is_data(k.k)) @@ -52,13 +51,13 @@ void bch2_rebalance_add_key(struct bch_fs *c, e = bkey_s_c_to_extent(k); - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (atomic64_add_return(crc.compressed_size, + if (atomic64_add_return(p.crc.compressed_size, &ca->rebalance_work) == - crc.compressed_size) + p.crc.compressed_size) rebalance_wakeup(c); } } @@ -76,16 +75,16 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; /* Make sure we have room to add a new pointer: */ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > BKEY_EXTENT_VAL_U64s_MAX) return DATA_SKIP; - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) goto found; return DATA_SKIP; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b7a65bc20430..ee91bcc6433c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -283,19 +283,19 @@ static ssize_t 
bch2_compression_stats(struct bch_fs *c, char *buf) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) if (k.k->type == BCH_EXTENT) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) { - if (crc.compression_type == BCH_COMPRESSION_NONE) { + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_NONE) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { nr_compressed_extents++; compressed_sectors_compressed += - crc.compressed_size; + p.crc.compressed_size; compressed_sectors_uncompressed += - crc.uncompressed_size; + p.crc.uncompressed_size; } /* only looking at the first ptr */ -- cgit From a2753581f6c5c05ea93978f8217a29115450ac58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Sep 2018 18:28:23 -0400 Subject: bcachefs: bch2_extent_drop_ptrs() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 6 ++-- fs/bcachefs/extents.c | 73 ++++++++++++++++++++++-------------------------- fs/bcachefs/extents.h | 43 +++++++++++----------------- fs/bcachefs/io.c | 6 ++-- fs/bcachefs/journal_io.c | 19 ++++++++----- fs/bcachefs/move.c | 10 ++----- 6 files changed, 71 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 96bcdf5339e8..e64e53e9d9ab 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1570,9 +1570,9 @@ retry: new_key = bkey_i_to_extent(&tmp.k); e = extent_i_to_s(new_key); - extent_for_each_ptr_backwards(e, ptr) - if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) - bch2_extent_drop_ptr(e, ptr); + + bch2_extent_drop_ptrs(e, ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_extent_nr_ptrs(e.c)) goto err; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0441e42bb1c4..8b84c5e00a26 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -122,20 +122,11 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) return NULL; } -bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +void bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) { struct bch_extent_ptr *ptr; - bool dropped = false; - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } - - if (dropped) - bch2_extent_drop_redundant_crcs(e); - return dropped; + bch2_extent_drop_ptrs(e, ptr, ptr->dev == dev); } const struct bch_extent_ptr * @@ -267,21 +258,37 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, return false; } -/* Doesn't cleanup redundant crcs */ -void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, + struct bch_extent_ptr *ptr) { + union bch_extent_entry *dst; + union bch_extent_entry *src; + EBUG_ON(ptr < &e.v->start->ptr || ptr >= &extent_entry_last(e)->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); -} -void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) -{ - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); + src = to_entry(ptr + 1); + + if (src != extent_entry_last(e) && + extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) { + dst = 
to_entry(ptr); + } else { + extent_for_each_entry(e, dst) { + if (dst == to_entry(ptr)) + break; + + if (extent_entry_next(dst) == to_entry(ptr) && + extent_entry_is_crc(dst)) + break; + } + } + + memmove_u64s_down(dst, src, + (u64 *) extent_entry_last(e) - (u64 *) src); + e.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -349,7 +356,7 @@ restart_narrow_pointers: if (can_narrow_crc(p.crc, n)) { i->ptr.offset += p.crc.offset; extent_ptr_append(e, i->ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); + bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); goto restart_narrow_pointers; } @@ -431,27 +438,13 @@ drop: EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); } -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) -{ - return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); -} - static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) { - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; + struct bch_extent_ptr *ptr; - if (dropped) - bch2_extent_drop_redundant_crcs(e); + bch2_extent_drop_ptrs(e, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); } bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index b1b9c189867a..c45d70657a89 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -84,7 +84,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -bool bch2_extent_drop_device(struct bkey_s_extent, unsigned); +void bch2_extent_drop_device(struct bkey_s_extent, unsigned); const struct bch_extent_ptr * bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); const struct bch_extent_ptr * @@ -400,29 +400,6 @@ out: \ /* Iterate over pointers backwards: */ -#define extent_ptr_prev(_e, _ptr) \ -({ \ - typeof(&(_e).v->start->ptr) _p; \ - typeof(&(_e).v->start->ptr) _prev = NULL; \ - \ - extent_for_each_ptr(_e, _p) { \ - if (_p == (_ptr)) \ - break; \ - _prev = _p; \ - } \ - \ - _prev; \ -}) - -/* - * Use this when you'll be dropping pointers as you iterate. 
Quadratic, - * unfortunately: - */ -#define extent_for_each_ptr_backwards(_e, _ptr) \ - for ((_ptr) = extent_ptr_prev(_e, NULL); \ - (_ptr); \ - (_ptr) = extent_ptr_prev(_e, _ptr)) - void bch2_extent_crc_append(struct bkey_i_extent *, struct bch_extent_crc_unpacked); @@ -517,8 +494,22 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); -void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); -void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent , + struct bch_extent_ptr *); + +#define bch2_extent_drop_ptrs(_e, _ptr, _cond) \ +do { \ + _ptr = &(_e).v->start->ptr; \ + \ + while ((_ptr = extent_ptr_next(e, _ptr))) { \ + if (_cond) { \ + _ptr = (void *) bch2_extent_drop_ptr(_e, _ptr); \ + continue; \ + } \ + \ + (_ptr)++; \ + } \ +} while (0) bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d64463751b84..d17128f50f98 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -327,9 +327,9 @@ static void __bch2_write_index(struct bch_write_op *op) bkey_copy(dst, src); e = bkey_i_to_s_extent(dst); - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); + + bch2_extent_drop_ptrs(e, ptr, + test_bit(ptr->dev, op->failed.d)); if (!bch2_extent_nr_ptrs(e.c)) { ret = -EIO; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9dd881c0410e..648c4ac58a2c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1067,14 +1067,19 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, * entry - that's why we drop pointers to devices <= current free space, * i.e. whichever device was limiting the current journal entry size. 
*/ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + bch2_extent_drop_ptrs(e, ptr, ({ + ca = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; + ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors; + })); + + extent_for_each_ptr(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors); + ca->journal.sectors_free -= sectors; } replicas = bch2_extent_nr_ptrs(e.c); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1e63d0e5ce53..edc45201faa6 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -98,13 +98,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); - if (m->data_cmd == DATA_REWRITE) { - struct bch_extent_ptr *ptr = (void *) - bch2_extent_has_device(extent_i_to_s_c(insert), - m->data_opts.rewrite_dev); - BUG_ON(!ptr); - bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); - } + if (m->data_cmd == DATA_REWRITE) + bch2_extent_drop_device(extent_i_to_s(insert), + m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { -- cgit From 71c9e0ba427ae0572693c133e33dad30efaf3aba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Sep 2018 21:08:39 -0400 Subject: bcachefs: bch2_extent_ptr_decoded_append() This new helper for the move path avoids creating a new CRC entry when we already have one that matches the pointer being added. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 129 +++++++++++++++++--------------------------------- fs/bcachefs/extents.h | 9 ++-- fs/bcachefs/io.c | 12 +++-- fs/bcachefs/move.c | 3 +- fs/bcachefs/util.h | 10 ++++ 5 files changed, 69 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8b84c5e00a26..ae6b1a17abfa 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -332,36 +332,36 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; + bool ret = false; /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) + if (!n.csum_type) { extent_for_each_crc(extent_i_to_s(e), u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { n = u; - break; + goto found; } - - if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) return false; - + } +found: BUG_ON(n.compression_type); BUG_ON(n.offset); BUG_ON(n.live_size != e->k.size); - bch2_extent_crc_append(e, n); restart_narrow_pointers: extent_for_each_ptr_decode(extent_i_to_s(e), p, i) if (can_narrow_crc(p.crc, n)) { - i->ptr.offset += p.crc.offset; - extent_ptr_append(e, i->ptr); bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(e, &p); + ret = true; goto restart_narrow_pointers; } - bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); - return true; + return ret; } /* returns true if not equal */ @@ -378,66 +378,6 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) -{ - union bch_extent_entry *entry = e.v->start; - union 
bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u = { 0 }; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; - - crc = entry_to_crc(entry); - u = bch2_extent_crc_unpack(e.k, crc); - - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } - - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { - /* identical to previous crc entry: */ - goto drop; - } - - if (!prev && - !u.csum_type && - !u.compression_type) { - /* null crc entry: */ - union bch_extent_entry *e2; - - extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { - if (!extent_entry_is_ptr(e2)) - break; - - e2->ptr.offset += u.offset; - } - goto drop; - } - - prev = crc; - prev_u = u; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; - } - - EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); -} - static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) { struct bch_extent_ptr *ptr; @@ -1846,25 +1786,44 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc, void bch2_extent_crc_append(struct bkey_i_extent *e, struct bch_extent_crc_unpacked new) { - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); + __extent_entry_push(e); +} - BUG_ON(new.compressed_size > new.uncompressed_size); - BUG_ON(new.live_size != e->k.size); - BUG_ON(!new.compressed_size || !new.uncompressed_size); +static inline void __extent_entry_insert(struct bkey_i_extent *e, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc, i) - ; + memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + e->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} - if (!bch2_crc_unpacked_cmp(crc, new)) - return; +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, + struct extent_ptr_decoded *p) +{ + struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); + union bch_extent_entry *pos; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = e->v.start; + goto found; + } + + extent_for_each_crc(extent_i_to_s(e), crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(e, p->crc); + pos = extent_entry_last(extent_i_to_s(e)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ptr)); } /* diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c45d70657a89..fe5eb32b6ed9 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -213,11 +213,13 @@ union bch_extent_crc { #define to_entry(_entry) \ ({ \ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *)); \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ \ 
__builtin_choose_expr( \ (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *)), \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ (const union bch_extent_entry *) (_entry), \ (union bch_extent_entry *) (_entry)); \ }) @@ -402,6 +404,8 @@ out: \ void bch2_extent_crc_append(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, + struct extent_ptr_decoded *); static inline void __extent_entry_push(struct bkey_i_extent *e) { @@ -492,7 +496,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent , struct bch_extent_ptr *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d17128f50f98..549a179b85e6 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -431,12 +431,16 @@ static void init_append_extent(struct bch_write_op *op, struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); op->pos.offset += crc.uncompressed_size; - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); - bch2_extent_crc_append(e, crc); + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(e, crc); + bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); bch2_keylist_push(&op->insert_keys); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index edc45201faa6..c7132a65566b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -112,8 +112,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_crc_append(insert, p.crc); - extent_ptr_append(insert, p.ptr); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 446216eb8c76..44e2c96b6509 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -581,6 +581,16 @@ size_t bch2_rand_range(size_t); void memcpy_to_bio(struct bio *, struct bvec_iter, void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +static inline void memcpy_u64s_small(void *dst, const void *src, + unsigned u64s) +{ + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +} + static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -- cgit From abce30b79b6f9661c4a84f8f8ee20c26165b6f71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Sep 2018 18:39:20 -0400 Subject: bcachefs: BCH_EXTENT_ENTRY_TYPES() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 24 ++++++++++++++---------- fs/bcachefs/extents.h | 13 +++++-------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ac0c7d6a07fb..72df3fe1572b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -461,15 +461,19 @@ enum bch_compression_type { BCH_COMPRESSION_NR = 5, }; +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) +#define BCH_EXTENT_ENTRY_MAX 4 + 
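(Aside, not part of the patch: for readers unfamiliar with the x-macro pattern introduced by BCH_EXTENT_ENTRY_TYPES() above, this is roughly what the preprocessor produces from the list, using only the names defined in the patch itself:)

	/* enum bch_extent_entry_type, after expanding the x() list: */
	enum bch_extent_entry_type {
		BCH_EXTENT_ENTRY_ptr	= 0,
		BCH_EXTENT_ENTRY_crc32	= 1,
		BCH_EXTENT_ENTRY_crc64	= 2,
		BCH_EXTENT_ENTRY_crc128	= 3,
	};

	/* members generated for union bch_extent_entry: */
	struct bch_extent_ptr		ptr;
	struct bch_extent_crc32		crc32;
	struct bch_extent_crc64		crc64;
	struct bch_extent_crc128	crc128;

The point of the conversion is that adding a new entry type later only needs one new x(name, nr) line; the enum, the union members, and extent_entry_bytes() all pick it up from the single list.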
enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_ptr = 0, - BCH_EXTENT_ENTRY_crc32 = 1, - BCH_EXTENT_ENTRY_crc64 = 2, - BCH_EXTENT_ENTRY_crc128 = 3, +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x }; -#define BCH_EXTENT_ENTRY_MAX 4 - /* Compressed/uncompressed size are stored biased by 1: */ struct bch_extent_crc32 { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -594,10 +598,10 @@ union bch_extent_entry { #else #error edit for your odd byteorder. #endif - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; - struct bch_extent_ptr ptr; + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x }; enum { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fe5eb32b6ed9..e2f6caefcb31 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -162,14 +162,11 @@ extent_entry_type(const union bch_extent_entry *e) static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) { switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - return sizeof(struct bch_extent_crc32); - case BCH_EXTENT_ENTRY_crc64: - return sizeof(struct bch_extent_crc64); - case BCH_EXTENT_ENTRY_crc128: - return sizeof(struct bch_extent_crc128); - case BCH_EXTENT_ENTRY_ptr: - return sizeof(struct bch_extent_ptr); +#define x(f, n) \ + case BCH_EXTENT_ENTRY_##f: \ + return sizeof(struct bch_extent_##f); + BCH_EXTENT_ENTRY_TYPES() +#undef x default: BUG(); } -- cgit From 2252aa271c1761589ae584ca738233c7d89c083c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Oct 2018 10:56:11 -0400 Subject: bcachefs: btree gc refactoring prep work for erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 11 -- fs/bcachefs/btree_gc.c | 298 ++++++++++++++++++++++++--------------------- fs/bcachefs/btree_gc.h | 2 - fs/bcachefs/btree_types.h | 5 - fs/bcachefs/journal.h | 4 - fs/bcachefs/journal_io.c | 22 ---- fs/bcachefs/journal_io.h | 2 + 7 files changed, 160 insertions(+), 184 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 989b577da928..6ee774ba3d7a 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -19,17 +19,6 @@ static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) return level ? 
BKEY_TYPE_BTREE : (enum bkey_type) id; } -static inline bool btree_type_has_ptrs(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - return true; - default: - return false; - } -} - struct bch_fs; struct btree; struct bkey; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d07a6b297078..757a170e7508 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -18,6 +18,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "keylist.h" #include "move.h" #include "replicas.h" @@ -32,6 +33,23 @@ #include #include +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + preempt_disable(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); + preempt_enable(); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +/* range_checks - for validating min/max pos of each btree node: */ + struct range_checks { struct range_level { struct bpos min; @@ -91,6 +109,19 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, } } +/* marking of btree keys/nodes: */ + +static bool bkey_type_needs_gc(enum bkey_type type) +{ + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + return true; + default: + return false; + } +} + u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) { const struct bch_extent_ptr *ptr; @@ -113,39 +144,8 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) return max_stale; } -/* - * For runtime mark and sweep: - */ -static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, unsigned flags) -{ - struct gc_pos pos = { 0 }; - u8 ret = 0; - - switch (type) { - case BKEY_TYPE_BTREE: - bch2_mark_key(c, k, c->opts.btree_node_size, - BCH_DATA_BTREE, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - break; - case BKEY_TYPE_EXTENTS: - bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - ret = bch2_btree_key_recalc_oldest_gen(c, k); - break; - default: - BUG(); - } - - return ret; -} - -int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static int bch2_btree_mark_ptrs_initial(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) { enum bch_data_type data_type = type == BKEY_TYPE_BTREE ? BCH_DATA_BTREE : BCH_DATA_USER; @@ -199,54 +199,90 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, } } - atomic64_set(&c->key_version, - max_t(u64, k.k->version.lo, - atomic64_read(&c->key_version))); - - bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); + if (k.k->version.lo > atomic64_read(&c->key_version)) + atomic64_set(&c->key_version, k.k->version.lo); fsck_err: return ret; } -static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) +/* + * For runtime mark and sweep: + */ +static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, bool initial) +{ + struct gc_pos pos = { 0 }; + unsigned flags = initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0; + int ret = 0; + + switch (type) { + case BKEY_TYPE_BTREE: + if (initial) { + ret = bch2_btree_mark_ptrs_initial(c, type, k); + if (ret < 0) + return ret; + } + + bch2_mark_key(c, k, c->opts.btree_node_size, + BCH_DATA_BTREE, pos, NULL, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + break; + case BKEY_TYPE_EXTENTS: + if (initial) { + ret = bch2_btree_mark_ptrs_initial(c, type, k); + if (ret < 0) + return ret; + } + + bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + ret = bch2_btree_key_recalc_oldest_gen(c, k); + break; + default: + break; + } + + return ret; +} + +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, + bool initial) { enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; u8 stale = 0; + int ret; - if (btree_node_has_ptrs(b)) - for_each_btree_node_key_unpack(b, k, &iter, - &unpacked) { - bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); - } + if (!bkey_type_needs_gc(type)) + return 0; - return stale; -} + for_each_btree_node_key_unpack(b, k, &iter, + &unpacked) { + bch2_bkey_debugcheck(c, b, k); -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - preempt_disable(); - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); - preempt_enable(); -} + ret = bch2_gc_mark_key(c, type, k, initial); + if (ret < 0) + return ret; -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); - __gc_pos_set(c, new_pos); + stale = max_t(u8, stale, ret); + } + + return stale; } -static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + bool initial) { struct btree_iter iter; struct btree *b; struct range_checks r; - unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1; + unsigned depth = bkey_type_needs_gc(btree_id) ? 
0 : 1; unsigned max_stale; int ret = 0; @@ -257,8 +293,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) /* * if expensive_debug_checks is on, run range_checks on all leaf nodes: + * + * and on startup, we have to read every btree node (XXX: only if it was + * an unclean shutdown) */ - if (expensive_debug_checks(c)) + if (initial || expensive_debug_checks(c)) depth = 0; btree_node_range_checks_init(&r, depth); @@ -269,22 +308,24 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) bch2_verify_btree_nr_keys(b); - max_stale = btree_gc_mark_node(c, b); + max_stale = btree_gc_mark_node(c, b, initial); gc_pos_set(c, gc_pos_btree_node(b)); - if (max_stale > 64) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); + if (!initial) { + if (max_stale > 64) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!btree_gc_rewrite_disabled(c) && + (btree_gc_always_rewrite(c) || max_stale > 16)) + bch2_btree_node_rewrite(c, &iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } bch2_btree_iter_cond_resched(&iter); } @@ -296,13 +337,47 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key), initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); return 0; } +static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, + bool initial) +{ + unsigned i; + + for (i = 0; i < BTREE_ID_NR; i++) { + enum bkey_type type = bkey_type(0, i); + + int ret = bch2_gc_btree(c, i, initial); + if (ret) + return ret; + + if (journal && bkey_type_needs_gc(type)) { + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + int ret; + + list_for_each_entry(r, journal, list) + for_each_jset_key(k, n, j, &r->j) { + if (type == bkey_type(j->level, j->btree_id)) { + ret = bch2_gc_mark_key(c, type, + bkey_i_to_s_c(k), initial); + if (ret < 0) + return ret; + } + } + } + } + + return 0; +} + static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, u64 start, u64 end, enum bch_data_type type, @@ -525,6 +600,7 @@ void bch2_gc(struct bch_fs *c) struct bch_dev *ca; u64 start_time = local_clock(); unsigned i; + int ret; /* * Walk _all_ references to buckets, and recompute them: @@ -560,14 +636,11 @@ void bch2_gc(struct bch_fs *c) bch2_mark_superblocks(c); - /* Walk btree: */ - for (i = 0; i < BTREE_ID_NR; i++) { - int ret = bch2_gc_btree(c, i); - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); - goto out; - } + ret = bch2_gc_btrees(c, NULL, false); + if (ret) { + bch_err(c, "btree gc failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + goto out; } bch2_mark_pending_btree_node_frees(c); @@ -1009,58 +1082,9 @@ int bch2_gc_thread_start(struct bch_fs *c) /* Initial GC computes bucket marks during startup */ -static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - struct range_checks r; - int ret = 0; - - 
btree_node_range_checks_init(&r, 0); - - gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0)); - - if (!c->btree_roots[id].b) - return 0; - - b = c->btree_roots[id].b; - if (!btree_node_fake(b)) - ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); - if (ret) - return ret; - - /* - * We have to hit every btree node before starting journal replay, in - * order for the journal seq blacklist machinery to work: - */ - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - - if (btree_node_has_ptrs(b)) { - struct btree_node_iter node_iter; - struct bkey unpacked; - struct bkey_s_c k; - - for_each_btree_node_key_unpack(b, k, &node_iter, - &unpacked) { - ret = bch2_btree_mark_key_initial(c, - btree_node_type(b), k); - if (ret) - goto err; - } - } - - bch2_btree_iter_cond_resched(&iter); - } -err: - return bch2_btree_iter_unlock(&iter) ?: ret; -} - int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { unsigned iter = 0; - enum btree_id id; int ret = 0; down_write(&c->gc_lock); @@ -1069,13 +1093,7 @@ again: bch2_mark_superblocks(c); - for (id = 0; id < BTREE_ID_NR; id++) { - ret = bch2_initial_gc_btree(c, id); - if (ret) - goto err; - } - - ret = bch2_journal_mark(c, journal); + ret = bch2_gc_btrees(c, journal, true); if (ret) goto err; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9d2b9d5953d2..54c6bc845930 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -12,8 +12,6 @@ void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); -int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, - struct bkey_s_c); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index dd9660a9f12b..467c619f7f6d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -415,11 +415,6 @@ static inline const struct bkey_ops *btree_node_ops(struct btree *b) return &bch2_bkey_ops[btree_node_type(b)]; } -static inline bool btree_node_has_ptrs(struct btree *b) -{ - return btree_type_has_ptrs(btree_node_type(b)); -} - static inline bool btree_node_is_extents(struct btree *b) { return btree_node_type(b) == BKEY_TYPE_EXTENTS; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index f39b37e6e3d5..77cf39cc64ff 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -355,10 +355,6 @@ static inline bool journal_flushes_device(struct bch_dev *ca) return true; } -int bch2_journal_mark(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 648c4ac58a2c..3dc24b39022f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -852,28 +852,6 @@ fsck_err: /* journal replay: */ -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if 
(btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 35f90c96008a..e19e549baf8a 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -37,6 +37,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); +void bch2_journal_entries_free(struct list_head *); +int bch2_journal_replay(struct bch_fs *, struct list_head *); int bch2_journal_entry_sectors(struct journal *); void bch2_journal_write(struct closure *); -- cgit From 198d67006b6015724a840e8586a484c6590fc975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Oct 2018 16:32:51 -0400 Subject: bcachefs: add functionality for heaps to update backpointers Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 +++++++----- fs/bcachefs/btree_io.c | 6 ++--- fs/bcachefs/clock.c | 6 ++--- fs/bcachefs/extents.c | 10 +++---- fs/bcachefs/movinggc.c | 4 +-- fs/bcachefs/util.h | 59 ++++++++++++++++++++++++++---------------- 6 files changed, 59 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 45e8b124a9f3..88be5f4be4b1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -583,7 +583,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) e.nr++; } else { if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); e = (struct alloc_heap_entry) { .bucket = b, @@ -596,14 +597,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); for (i = 0; i < ca->alloc_heap.used; i++) nr += ca->alloc_heap.data[i].nr; while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); } up_read(&ca->bucket_lock); @@ -633,7 +635,7 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) if (bch2_can_invalidate_bucket(ca, b, m)) { struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); if (heap_full(&ca->alloc_heap)) break; } @@ -660,7 +662,7 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca if (bch2_can_invalidate_bucket(ca, b, m)) { struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); if (heap_full(&ca->alloc_heap)) break; } @@ -698,7 +700,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) break; } - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); for (i = 0; i < ca->alloc_heap.used; i++) nr += ca->alloc_heap.data[i].nr; @@ -719,7 +721,7 @@ static inline long next_alloc_bucket(struct bch_dev *ca) return b; } - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + 
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); } return -1; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e64e53e9d9ab..8f8e5fab1086 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -35,7 +35,7 @@ void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, __btree_node_key_to_offset(b, end) }); - __heap_add(iter, n, btree_node_iter_cmp_heap); + __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); } } @@ -48,9 +48,9 @@ void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, EBUG_ON(iter->data->k > iter->data->end); if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap); + heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap); + heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); } static void verify_no_dups(struct btree *b, diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 96f8030384fa..e4486fcbea19 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -22,7 +22,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) if (clock->timers.data[i] == timer) goto out; - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); out: spin_unlock(&clock->timer_lock); } @@ -35,7 +35,7 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp); + heap_del(&clock->timers, i, io_timer_cmp, NULL); break; } @@ -128,7 +128,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp); + heap_pop(&clock->timers, ret, io_timer_cmp, NULL); spin_unlock(&clock->timer_lock); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ae6b1a17abfa..5dd552bf1d1b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -88,7 +88,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, memset(&nr, 0, sizeof(nr)); - heap_resort(iter, key_sort_cmp); + heap_resort(iter, key_sort_cmp, NULL); while (!bch2_btree_node_iter_large_end(iter)) { if (!should_drop_next_key(iter, b)) { @@ -101,7 +101,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, } sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp); + heap_sift_down(iter, 0, key_sort_cmp, NULL); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -841,7 +841,7 @@ static bool extent_i_save(struct btree *b, struct bkey_packed *dst, static inline void extent_sort_sift(struct btree_node_iter_large *iter, struct btree *b, size_t i) { - heap_sift_down(iter, i, extent_sort_cmp); + heap_sift_down(iter, i, extent_sort_cmp, NULL); } static inline void extent_sort_next(struct btree_node_iter_large *iter, @@ -849,7 +849,7 @@ static inline void extent_sort_next(struct btree_node_iter_large *iter, struct btree_node_iter_set *i) { sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp); + heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); } static void extent_sort_append(struct bch_fs *c, @@ -897,7 +897,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, memset(&nr, 0, sizeof(nr)); - heap_resort(iter, extent_sort_cmp); + heap_resort(iter, extent_sort_cmp, NULL); while (!bch2_btree_node_iter_large_end(iter)) { lk = 
__btree_node_offset_to_key(b, _l->k); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b7def19bdd85..80577661e008 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -161,7 +161,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; - heap_add_or_replace(h, e, -sectors_used_cmp); + heap_add_or_replace(h, e, -sectors_used_cmp, NULL); } up_read(&ca->bucket_lock); up_read(&c->gc_lock); @@ -170,7 +170,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) sectors_to_move += i->sectors; while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); + BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 44e2c96b6509..9caf2487ee63 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -127,7 +127,19 @@ do { \ (heap)->data = NULL; \ } while (0) -#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) +#define heap_set_backpointer(h, i, _fn) \ +do { \ + void (*fn)(typeof(h), size_t) = _fn; \ + if (fn) \ + fn(h, i); \ +} while (0) + +#define heap_swap(h, i, j, set_backpointer) \ +do { \ + swap((h)->data[i], (h)->data[j]); \ + heap_set_backpointer(h, i, set_backpointer); \ + heap_set_backpointer(h, j, set_backpointer); \ +} while (0) #define heap_peek(h) \ ({ \ @@ -137,7 +149,7 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_sift_down(h, i, cmp) \ +#define heap_sift_down(h, i, cmp, set_backpointer) \ do { \ size_t _c, _j = i; \ \ @@ -149,72 +161,75 @@ do { \ \ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ break; \ - heap_swap(h, _c, _j); \ + heap_swap(h, _c, _j, set_backpointer); \ } \ } while (0) -#define heap_sift_up(h, i, cmp) \ +#define heap_sift_up(h, i, cmp, set_backpointer) \ do { \ while (i) { \ size_t p = (i - 1) / 2; \ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ break; \ - heap_swap(h, i, p); \ + heap_swap(h, i, p, set_backpointer); \ i = p; \ } \ } while (0) -#define __heap_add(h, d, cmp) \ -do { \ +#define __heap_add(h, d, cmp, set_backpointer) \ +({ \ size_t _i = (h)->used++; \ (h)->data[_i] = d; \ + heap_set_backpointer(h, _i, set_backpointer); \ \ - heap_sift_up(h, _i, cmp); \ -} while (0) + heap_sift_up(h, _i, cmp, set_backpointer); \ + _i; \ +}) -#define heap_add(h, d, cmp) \ +#define heap_add(h, d, cmp, set_backpointer) \ ({ \ bool _r = !heap_full(h); \ if (_r) \ - __heap_add(h, d, cmp); \ + __heap_add(h, d, cmp, set_backpointer); \ _r; \ }) -#define heap_add_or_replace(h, new, cmp) \ +#define heap_add_or_replace(h, new, cmp, set_backpointer) \ do { \ - if (!heap_add(h, new, cmp) && \ + if (!heap_add(h, new, cmp, set_backpointer) && \ cmp(h, new, heap_peek(h)) >= 0) { \ (h)->data[0] = new; \ - heap_sift_down(h, 0, cmp); \ + heap_set_backpointer(h, 0, set_backpointer); \ + heap_sift_down(h, 0, cmp, set_backpointer); \ } \ } while (0) -#define heap_del(h, i, cmp) \ +#define heap_del(h, i, cmp, set_backpointer) \ do { \ size_t _i = (i); \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used); \ - heap_sift_up(h, _i, cmp); \ - heap_sift_down(h, _i, cmp); \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ } while (0) -#define heap_pop(h, d, cmp) \ +#define heap_pop(h, d, cmp, set_backpointer) \ ({ \ bool _r = (h)->used; \ if (_r) { \ (d) = (h)->data[0]; \ - heap_del(h, 0, cmp); \ + 
heap_del(h, 0, cmp, set_backpointer); \ } \ _r; \ }) -#define heap_resort(heap, cmp) \ +#define heap_resort(heap, cmp, set_backpointer) \ do { \ ssize_t _i; \ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift_down(heap, _i, cmp); \ + heap_sift_down(heap, _i, cmp, set_backpointer); \ } while (0) #define ANYSINT_MAX(t) \ -- cgit From 7a920560d727701c4397a5448085f99bf9f060d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Oct 2018 14:14:19 -0400 Subject: bcachefs: kill struct bch_replicas_cpu_entry Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 6 +- fs/bcachefs/replicas.c | 326 ++++++++++++++++-------------------------- fs/bcachefs/replicas.h | 12 +- fs/bcachefs/replicas_types.h | 11 ++ fs/bcachefs/super_types.h | 12 -- 6 files changed, 149 insertions(+), 219 deletions(-) create mode 100644 fs/bcachefs/replicas_types.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 770b26f28c75..95d505aaf82f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -315,6 +315,7 @@ enum bch_time_stats { #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "replicas_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 72df3fe1572b..46355f006793 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1016,9 +1016,9 @@ enum bch_data_type { }; struct bch_replicas_entry { - u8 data_type; - u8 nr; - u8 devs[]; + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; }; struct bch_sb_field_replicas { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 4b87aa8e1f75..a1ece679954c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -4,17 +4,32 @@ #include "replicas.h" #include "super-io.h" +struct bch_replicas_entry_padded { + struct bch_replicas_entry e; + u8 pad[BCH_SB_MEMBERS_MAX]; +}; + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Replicas tracking - in memory: */ +static inline int u8_cmp(u8 l, u8 r) +{ + return (l > r) - (l < r); +} + +static void replicas_entry_sort(struct bch_replicas_entry *e) +{ + bubble_sort(e->devs, e->nr_devs, u8_cmp); +} + #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ _i = (void *) (_i) + (_r)->entry_size) -static inline struct bch_replicas_cpu_entry * +static inline struct bch_replicas_entry * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; @@ -25,84 +40,43 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) +static int replicas_entry_to_text(struct bch_replicas_entry *e, + char *buf, size_t size) { - return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; -} + char *out = buf, *end = out + size; + unsigned i; -static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - e->devs[dev >> 3] |= 1 << (dev & 7); -} + out += scnprintf(out, end - out, "%u: [", e->data_type); -static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) -{ - return (r->entry_size - - offsetof(struct bch_replicas_cpu_entry, devs)) * 8; + for (i = 0; i < e->nr_devs; i++) 
+ out += scnprintf(out, end - out, + i ? " %u" : "%u", e->devs[i]); + out += scnprintf(out, end - out, "]"); + + return out - buf; } int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, char *buf, size_t size) { char *out = buf, *end = out + size; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; bool first = true; - unsigned i; for_each_cpu_replicas_entry(r, e) { - bool first_e = true; - if (!first) out += scnprintf(out, end - out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) { - if (!first_e) - out += scnprintf(out, end - out, " "); - first_e = false; - out += scnprintf(out, end - out, "%u", i); - } - out += scnprintf(out, end - out, "]"); + out += replicas_entry_to_text(e, out, end - out); } return out - buf; } -static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) -{ - const struct bch_extent_ptr *ptr; - unsigned nr = 0; - - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); - - memset(r, 0, sizeof(*r)); - r->data_type = data_type; - - *max_dev = 0; - - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - *max_dev = max_t(unsigned, *max_dev, ptr->dev); - replicas_set_dev(r, ptr->dev); - nr++; - } - return nr; -} - static inline void devlist_to_replicas(struct bch_devs_list devs, enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) + struct bch_replicas_entry *e) { unsigned i; @@ -110,28 +84,24 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, data_type == BCH_DATA_SB || data_type >= BCH_DATA_NR); - memset(r, 0, sizeof(*r)); - r->data_type = data_type; + e->data_type = data_type; + e->nr_devs = 0; - *max_dev = 0; + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; - for (i = 0; i < devs.nr; i++) { - *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); - replicas_set_dev(r, devs.devs[i]); - } + replicas_entry_sort(e); } static struct bch_replicas_cpu * cpu_replicas_add_entry(struct bch_replicas_cpu *old, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry *new_entry) { struct bch_replicas_cpu *new; unsigned i, nr, entry_size; - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); - entry_size = max(entry_size, old->entry_size); + entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)); nr = old->nr + 1; new = kzalloc(sizeof(struct bch_replicas_cpu) + @@ -145,30 +115,28 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, for (i = 0; i < old->nr; i++) memcpy(cpu_replicas_entry(new, i), cpu_replicas_entry(old, i), - min(new->entry_size, old->entry_size)); + old->entry_size); memcpy(cpu_replicas_entry(new, old->nr), - &new_entry, - new->entry_size); + new_entry, + replicas_entry_bytes(new_entry)); bch2_cpu_replicas_sort(new); return new; } static bool replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_cpu_entry search, - unsigned max_dev) + struct bch_replicas_entry *search) { - return max_dev < replicas_dev_slots(r) && + return replicas_entry_bytes(search) <= r->entry_size && eytzinger0_find(r->entries, r->nr, r->entry_size, - memcmp, &search) < r->nr; + memcmp, search) < r->nr; } noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry 
*new_entry) { struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; int ret = -ENOMEM; @@ -177,16 +145,16 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, old_gc = rcu_dereference_protected(c->replicas_gc, lockdep_is_held(&c->sb_lock)); - if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); + if (old_gc && !replicas_has_entry(old_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(old_gc, new_entry); if (!new_gc) goto err; } old_r = rcu_dereference_protected(c->replicas, lockdep_is_held(&c->sb_lock)); - if (!replicas_has_entry(old_r, new_entry, max_dev)) { - new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); + if (!replicas_has_entry(old_r, new_entry)) { + new_r = cpu_replicas_add_entry(old_r, new_entry); if (!new_r) goto err; @@ -225,27 +193,28 @@ int bch2_mark_replicas(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { - struct bch_replicas_cpu_entry search; + struct bch_replicas_entry_padded search; struct bch_replicas_cpu *r, *gc_r; - unsigned max_dev; bool marked; if (!devs.nr) return 0; + memset(&search, 0, sizeof(search)); + BUG_ON(devs.nr >= BCH_REPLICAS_MAX); - devlist_to_replicas(devs, data_type, &search, &max_dev); + devlist_to_replicas(devs, data_type, &search.e); rcu_read_lock(); r = rcu_dereference(c->replicas); gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, search, max_dev) && - (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); + marked = replicas_has_entry(r, &search.e) && + (!likely(gc_r) || replicas_has_entry(gc_r, &search.e)); rcu_read_unlock(); return likely(marked) ? 0 - : bch2_mark_replicas_slowpath(c, search, max_dev); + : bch2_mark_replicas_slowpath(c, &search.e); } int bch2_mark_bkey_replicas(struct bch_fs *c, @@ -304,7 +273,7 @@ err: int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { struct bch_replicas_cpu *dst, *src; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; lockdep_assert_held(&c->replicas_gc_lock); @@ -339,40 +308,19 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) /* Replicas tracking - superblock: */ -static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, - unsigned *nr, - unsigned *bytes, - unsigned *max_dev) -{ - struct bch_replicas_entry *i; - unsigned j; - - *nr = 0; - *bytes = sizeof(*r); - *max_dev = 0; - - if (!r) - return; - - for_each_replicas_entry(r, i) { - for (j = 0; j < i->nr; j++) - *max_dev = max_t(unsigned, *max_dev, i->devs[j]); - (*nr)++; - } - - *bytes = (void *) i - (void *) r; -} - static struct bch_replicas_cpu * __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) { + struct bch_replicas_entry *e, *dst; struct bch_replicas_cpu *cpu_r; - unsigned i, nr, bytes, max_dev, entry_size; - - bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + unsigned nr = 0, entry_size = 0; - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); + if (sb_r) + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + nr * entry_size, GFP_NOIO); @@ -382,20 +330,14 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) cpu_r->nr = nr; cpu_r->entry_size = entry_size; - if (nr) { - struct bch_replicas_cpu_entry *dst = - cpu_replicas_entry(cpu_r, 0); - struct bch_replicas_entry *src = sb_r->entries; - - while (dst < 
cpu_replicas_entry(cpu_r, nr)) { - dst->data_type = src->data_type; - for (i = 0; i < src->nr; i++) - replicas_set_dev(dst, src->devs[i]); + nr = 0; - src = replicas_entry_next(src); - dst = (void *) dst + entry_size; + if (sb_r) + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, nr++); + memcpy(dst, e, replicas_entry_bytes(e)); + replicas_entry_sort(dst); } - } bch2_cpu_replicas_sort(cpu_r); return cpu_r; @@ -423,20 +365,16 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *sb_e; - struct bch_replicas_cpu_entry *e; - size_t i, bytes; + struct bch_replicas_entry *dst, *src; + size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); - for_each_cpu_replicas_entry(r, e) { - bytes += sizeof(struct bch_replicas_entry); - for (i = 0; i < r->entry_size - 1; i++) - bytes += hweight8(e->devs[i]); - } + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src); sb_r = bch2_sb_resize_replicas(&c->disk_sb, - DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -ENOSPC; @@ -444,22 +382,42 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); - sb_e = sb_r->entries; - for_each_cpu_replicas_entry(r, e) { - sb_e->data_type = e->data_type; - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) - sb_e->devs[sb_e->nr++] = i; + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + memcpy(dst, src, replicas_entry_bytes(src)); - sb_e = replicas_entry_next(sb_e); + dst = replicas_entry_next(dst); - BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); } return 0; } +static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +{ + unsigned i; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + if (!memcmp(l, r, cpu_r->entry_size)) + return "duplicate replicas entry"; + } + + return NULL; +} + static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); @@ -475,15 +433,15 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi goto err; err = "invalid replicas entry: no devices"; - if (!e->nr) + if (!e->nr_devs) goto err; err = "invalid replicas entry: too many devices"; - if (e->nr >= BCH_REPLICAS_MAX) + if (e->nr_devs >= BCH_REPLICAS_MAX) goto err; err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr; i++) + for (i = 0; i < e->nr_devs; i++) if (!bch2_dev_exists(sb, mi, e->devs[i])) goto err; } @@ -493,25 +451,7 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi if (!cpu_r) goto err; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - memcmp, NULL); - - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_cpu_entry *l = - cpu_replicas_entry(cpu_r, i); - struct bch_replicas_cpu_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - - err = "duplicate replicas entry"; - if (!memcmp(l, r, cpu_r->entry_size)) - goto err; - } - - err = NULL; + err = 
check_dup_replicas_entries(cpu_r); err: kfree(cpu_r); return err; @@ -526,7 +466,6 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t char *out = buf, *end = out + size; struct bch_replicas_entry *e; bool first = true; - unsigned i; if (!r) { out += scnprintf(out, end - out, "(no replicas section found)"); @@ -538,12 +477,7 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t out += scnprintf(out, end - out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < e->nr; i++) - out += scnprintf(out, end - out, - i ? " %u" : "%u", e->devs[i]); - out += scnprintf(out, end - out, "]"); + out += replicas_entry_to_text(e, out, end - out); } return out - buf; @@ -555,18 +489,18 @@ bool bch2_replicas_marked(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { - struct bch_replicas_cpu_entry search; - unsigned max_dev; + struct bch_replicas_entry_padded search; bool ret; if (!devs.nr) return true; - devlist_to_replicas(devs, data_type, &search, &max_dev); + memset(&search, 0, sizeof(search)); + + devlist_to_replicas(devs, data_type, &search.e); rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), - search, max_dev); + ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); rcu_read_unlock(); return ret; @@ -591,9 +525,9 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { struct bch_sb_field_members *mi; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; struct bch_replicas_cpu *r; - unsigned i, dev, dev_slots, nr_online, nr_offline; + unsigned i, nr_online, nr_offline; struct replicas_status ret; memset(&ret, 0, sizeof(ret)); @@ -603,9 +537,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, mi = bch2_sb_get_members(c->disk_sb.sb); rcu_read_lock(); - r = rcu_dereference(c->replicas); - dev_slots = replicas_dev_slots(r); for_each_cpu_replicas_entry(r, e) { if (e->data_type >= ARRAY_SIZE(ret.replicas)) @@ -613,13 +545,11 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, nr_online = nr_offline = 0; - for (dev = 0; dev < dev_slots; dev++) { - if (!replicas_test_dev(e, dev)) - continue; - - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev)); + for (i = 0; i < e->nr_devs; i++) { + BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, + e->devs[i])); - if (test_bit(dev, online_devs.d)) + if (test_bit(e->devs[i], online_devs.d)) nr_online++; else nr_offline++; @@ -678,20 +608,18 @@ unsigned bch2_replicas_online(struct bch_fs *c, bool meta) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; struct bch_replicas_cpu *r; - unsigned ret = 0; + unsigned i, ret = 0; rcu_read_lock(); r = rcu_dereference(c->replicas); - if (ca->dev_idx >= replicas_dev_slots(r)) - goto out; - for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) - ret |= 1 << e->data_type; -out: + for (i = 0; i < e->nr_devs; i++) + if (e->devs[i] == ca->dev_idx) + ret |= 1 << e->data_type; + rcu_read_unlock(); return ret; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index de506cf9e11d..6c01f35296e7 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H +#include "replicas_types.h" + bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, struct bch_devs_list); bool 
bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, @@ -34,11 +36,11 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); /* iterate over superblock replicas - used by userspace tools: */ -static inline struct bch_replicas_entry * -replicas_entry_next(struct bch_replicas_entry *i) -{ - return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; -} +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) #define for_each_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 index 000000000000..3061840b6a02 --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,11 @@ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry entries[]; +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 4d8265bb3154..04a15729a244 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -36,18 +36,6 @@ struct bch_member_cpu { u8 valid; }; -struct bch_replicas_cpu_entry { - u8 data_type; - u8 devs[BCH_SB_MEMBERS_MAX / 8]; -}; - -struct bch_replicas_cpu { - struct rcu_head rcu; - unsigned nr; - unsigned entry_size; - struct bch_replicas_cpu_entry entries[]; -}; - struct bch_disk_group_cpu { bool deleted; u16 parent; -- cgit From 103e212785561df4ee3f29024868d8e3468f1f40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Oct 2018 14:32:21 -0400 Subject: bcachefs: replicas: prep work for stripes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 +- fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/extents.c | 5 +- fs/bcachefs/io.c | 3 +- fs/bcachefs/migrate.c | 6 +- fs/bcachefs/move.c | 6 +- fs/bcachefs/replicas.c | 119 ++++++++++++++++++++++++++++-------- fs/bcachefs/replicas.h | 4 +- 8 files changed, 108 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 757a170e7508..c9a013f43374 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -155,10 +155,10 @@ static int bch2_btree_mark_ptrs_initial(struct bch_fs *c, enum bkey_type type, k.k->version.lo > journal_cur_seq(&c->journal)); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, type, k), c, "superblock not marked as containing replicas (type %u)", data_type)) { - ret = bch2_mark_bkey_replicas(c, data_type, k); + ret = bch2_mark_bkey_replicas(c, type, k); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 26721c5a871c..c9facec494ef 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -551,7 +551,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); if (ret) goto err_free; @@ -2063,7 +2063,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, extent_i_to_s_c(new_key).s_c); if (ret) goto 
err_free_update; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 5dd552bf1d1b..4e4918524a77 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -648,7 +648,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { + if (!bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, @@ -1681,8 +1681,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; } - if (!bkey_extent_is_cached(e.k) && - !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { + if (!bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 549a179b85e6..a4660746be0d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -337,7 +337,8 @@ static void __bch2_write_index(struct bch_write_op *op) } if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, + e.s_c); if (ret) goto err; } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 9337a8729a5b..38b392472521 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -51,7 +51,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); if (ret) break; bch2_btree_iter_next(&iter); @@ -72,7 +72,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, e.s); - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, bkey_i_to_s_c(&tmp.key)); if (ret) break; @@ -135,7 +135,7 @@ retry: */ bch2_btree_iter_downgrade(&iter); - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); if (ret) goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c7132a65566b..b2bf0944d59d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -149,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, extent_i_to_s_c(insert).s_c); if (ret) break; @@ -600,7 +600,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); if (ret) break; } @@ -624,7 +624,7 @@ static int bch2_gc_btree_replicas(struct bch_fs *c) for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); bch2_btree_iter_cond_resched(&iter); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index a1ece679954c..72dd70b00abb 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -74,6 +74,42 @@ int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, return out - buf; } +static void 
extent_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached) + r->devs[r->nr_devs++] = p.ptr.dev; + } +} + +static void bkey_to_replicas(enum bkey_type type, + struct bkey_s_c k, + struct bch_replicas_entry *e) +{ + e->nr_devs = 0; + + switch (type) { + case BKEY_TYPE_BTREE: + e->data_type = BCH_DATA_BTREE; + extent_to_replicas(k, e); + break; + case BKEY_TYPE_EXTENTS: + e->data_type = BCH_DATA_USER; + extent_to_replicas(k, e); + break; + default: + break; + } + + replicas_entry_sort(e); +} + static inline void devlist_to_replicas(struct bch_devs_list devs, enum bch_data_type data_type, struct bch_replicas_entry *e) @@ -189,13 +225,28 @@ err: return ret; } +static int __bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *devs) +{ + struct bch_replicas_cpu *r, *gc_r; + bool marked; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + gc_r = rcu_dereference(c->replicas_gc); + marked = replicas_has_entry(r, devs) && + (!likely(gc_r) || replicas_has_entry(gc_r, devs)); + rcu_read_unlock(); + + return likely(marked) ? 0 + : bch2_mark_replicas_slowpath(c, devs); +} + int bch2_mark_replicas(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { struct bch_replicas_entry_padded search; - struct bch_replicas_cpu *r, *gc_r; - bool marked; if (!devs.nr) return 0; @@ -206,31 +257,31 @@ int bch2_mark_replicas(struct bch_fs *c, devlist_to_replicas(devs, data_type, &search.e); - rcu_read_lock(); - r = rcu_dereference(c->replicas); - gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, &search.e) && - (!likely(gc_r) || replicas_has_entry(gc_r, &search.e)); - rcu_read_unlock(); - - return likely(marked) ? 0 - : bch2_mark_replicas_slowpath(c, &search.e); + return __bch2_mark_replicas(c, &search.e); } int bch2_mark_bkey_replicas(struct bch_fs *c, - enum bch_data_type data_type, + enum bkey_type type, struct bkey_s_c k) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + struct bch_replicas_entry_padded search; int ret; - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) - return ret; + if (type == BKEY_TYPE_EXTENTS) { + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + + for (i = 0; i < cached.nr; i++) + if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i])))) + return ret; + } + + bkey_to_replicas(type, k, &search.e); - return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); + return search.e.nr_devs + ? 
__bch2_mark_replicas(c, &search.e) + : 0; } int bch2_replicas_gc_end(struct bch_fs *c, int ret) @@ -507,18 +558,32 @@ bool bch2_replicas_marked(struct bch_fs *c, } bool bch2_bkey_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, + enum bkey_type type, struct bkey_s_c k) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + struct bch_replicas_entry_padded search; + bool ret; + + if (type == BKEY_TYPE_EXTENTS) { + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + + for (i = 0; i < cached.nr; i++) + if (!bch2_replicas_marked(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i]))) + return false; + } + + bkey_to_replicas(type, k, &search.e); - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]))) - return false; + if (!search.e.nr_devs) + return true; + + rcu_read_lock(); + ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); + rcu_read_unlock(); - return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); + return ret; } struct replicas_status __bch2_replicas_status(struct bch_fs *c, diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 6c01f35296e7..ebbb1334cc2c 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -6,11 +6,11 @@ bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, struct bch_devs_list); -bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, +bool bch2_bkey_replicas_marked(struct bch_fs *, enum bkey_type, struct bkey_s_c); int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, struct bch_devs_list); -int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, +int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type, struct bkey_s_c); int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); -- cgit From 47799326bcdccd44f34845fd81814c1d3689a0a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Nov 2018 15:21:48 -0400 Subject: bcachefs: more key marking refactoring prep work for erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 39 +++++----- fs/bcachefs/btree_update_interior.c | 37 +++++----- fs/bcachefs/buckets.c | 139 ++++++++++++++++++++++-------------- fs/bcachefs/buckets.h | 5 +- fs/bcachefs/extents.c | 5 +- 5 files changed, 132 insertions(+), 93 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c9a013f43374..0fb89e03fac8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -212,34 +212,31 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k, bool initial) { struct gc_pos pos = { 0 }; - unsigned flags = initial ? BCH_BUCKET_MARK_NOATOMIC : 0; + unsigned flags = + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD| + (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); int ret = 0; switch (type) { case BKEY_TYPE_BTREE: - if (initial) { - ret = bch2_btree_mark_ptrs_initial(c, type, k); - if (ret < 0) - return ret; - } - - bch2_mark_key(c, k, c->opts.btree_node_size, - BCH_DATA_BTREE, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - break; case BKEY_TYPE_EXTENTS: if (initial) { ret = bch2_btree_mark_ptrs_initial(c, type, k); if (ret < 0) return ret; } + break; + default: + break; + } + + bch2_mark_key(c, type, k, true, k.k->size, + pos, NULL, 0, flags); - bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: ret = bch2_btree_key_recalc_oldest_gen(c, k); break; default: @@ -473,10 +470,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, - BCH_DATA_BTREE, pos, - &stats, 0, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&d->key), + true, 0, + pos, &stats, 0, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); /* diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c9facec494ef..4ec448718fd8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -211,11 +211,12 @@ found: if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { struct bch_fs_usage tmp = { 0 }; - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - -c->opts.btree_node_size, BCH_DATA_BTREE, b - ? gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&d->key), + false, 0, b + ? 
gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id), + &tmp, 0, 0); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -290,10 +291,11 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->opts.btree_node_size, BCH_DATA_BTREE, - gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&pending->key), + false, 0, + gc_phase(GC_PHASE_PENDING_DELETE), + &stats, 0, 0); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -1092,8 +1094,9 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); - bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->opts.btree_node_size, BCH_DATA_BTREE, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key), + true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -1180,9 +1183,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); if (bkey_extent_is_data(&insert->k)) - bch2_mark_key(c, bkey_i_to_s_c(insert), - c->opts.btree_node_size, BCH_DATA_BTREE, - gc_pos_btree_node(b), &stats, 0, 0); + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(insert), + true, 0, + gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1967,8 +1971,9 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); - bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), - c->opts.btree_node_size, BCH_DATA_BTREE, + bch2_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&new_key->k_i), + true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); bch2_btree_node_free_index(as, NULL, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 84972b67f193..6f40c4bd16ec 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -539,24 +539,10 @@ static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) crc.uncompressed_size)); } -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. - */ -static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, - unsigned replicas, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +static s64 ptr_disk_sectors(struct bkey_s_c_extent e, + struct extent_ptr_decoded p, + s64 sectors) { - struct bucket_mark old, new; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr); - s64 uncompressed_sectors = sectors; - u64 v; if (p.crc.compression_type) { unsigned old_sectors, new_sectors; @@ -573,19 +559,25 @@ static void bch2_mark_pointer(struct bch_fs *c, +__disk_sectors(p.crc, new_sectors); } - /* - * fs level usage (which determines free space) is in uncompressed - * sectors, until copygc + compression is sorted out: - * - * note also that we always update @fs_usage, even when we otherwise - * wouldn't do anything because gc is running - this is because the - * caller still needs to account w.r.t. its disk reservation. It is - * caller's responsibility to not apply @fs_usage if gc is in progress. - */ - fs_usage->replicas - [!p.ptr.cached && replicas ? 
replicas - 1 : 0].data - [!p.ptr.cached ? data_type : BCH_DATA_CACHED] += - uncompressed_sectors; + return sectors; +} + +/* + * Checking against gc's position has to be done here, inside the cmpxchg() + * loop, to avoid racing with the start of gc clearing all the marks - GC does + * that with the gc pos seqlock held. + */ +static void bch2_mark_pointer(struct bch_fs *c, + struct bkey_s_c_extent e, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr); + u64 v; if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) @@ -644,16 +636,64 @@ static void bch2_mark_pointer(struct bch_fs *c, bucket_became_unavailable(c, old, new)); } -void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { unsigned replicas = bch2_extent_nr_dirty_ptrs(k); BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas)); + BUG_ON(!sectors); + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) { + s64 disk_sectors = ptr_disk_sectors(e, p, sectors); + + /* + * fs level usage (which determines free space) is in + * uncompressed sectors, until copygc + compression is + * sorted out: + * + * note also that we always update @fs_usage, even when + * we otherwise wouldn't do anything because gc is + * running - this is because the caller still needs to + * account w.r.t. its disk reservation. It is caller's + * responsibility to not apply @fs_usage if gc is in + * progress. + */ + stats->replicas + [!p.ptr.cached && replicas ? replicas - 1 : 0].data + [!p.ptr.cached ? data_type : BCH_DATA_CACHED] += + sectors; + + bch2_mark_pointer(c, e, p, disk_sectors, data_type, + stats, journal_seq, flags); + } + break; + } + case BCH_RESERVATION: + if (replicas) + stats->replicas[replicas - 1].persistent_reserved += + sectors * replicas; + break; + } +} +void bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ /* * synchronization w.r.t. GC: * @@ -690,24 +730,19 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, if (!stats) stats = this_cpu_ptr(c->usage_percpu); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - BUG_ON(!sectors); - - extent_for_each_ptr_decode(e, p, entry) - bch2_mark_pointer(c, e, p, sectors, data_type, - replicas, stats, journal_seq, flags); + switch (type) { + case BKEY_TYPE_BTREE: + bch2_mark_extent(c, k, inserting + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + pos, stats, journal_seq, flags); break; - } - case BCH_RESERVATION: - if (replicas) - stats->replicas[replicas - 1].persistent_reserved += - sectors * replicas; + case BKEY_TYPE_EXTENTS: + bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + pos, stats, journal_seq, flags); + break; + default: break; } percpu_up_read(&c->usage_lock); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 9aeccbb11d54..e22c51972c31 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -204,8 +204,9 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) #define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, enum bch_data_type, - struct gc_pos, struct bch_fs_usage *, u64, unsigned); +void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, + bool, s64, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4e4918524a77..4a1ec3bba91b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1009,8 +1009,9 @@ static void bch2_add_sectors(struct extent_insert_state *s, if (!sectors) return; - bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq, 0); + bch2_mark_key(c, BKEY_TYPE_EXTENTS, k, sectors > 0, sectors, + gc_pos_btree_node(b), &s->stats, + s->trans->journal_res.seq, 0); } static void bch2_subtract_sectors(struct extent_insert_state *s, -- cgit From 5bd95a371856ef3ade9cb417a6151499672c8a90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Nov 2018 15:28:45 -0400 Subject: bcachefs: new avoid mechanism for io retries Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 9 +++---- fs/bcachefs/extents.c | 63 +++++++++++++++++++++++++++++++++------------ fs/bcachefs/extents.h | 7 ++--- fs/bcachefs/extents_types.h | 9 +++++++ fs/bcachefs/io.c | 25 +++++++++--------- fs/bcachefs/io.h | 2 +- 6 files changed, 77 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8f8e5fab1086..c8809a59a765 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1345,11 +1345,9 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; bool can_retry; - memset(&avoid, 0, sizeof(avoid)); - goto start; while (1) { bch_info(c, "retrying read"); @@ -1371,8 +1369,9 @@ start: percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - __set_bit(rb->pick.ptr.dev, avoid.d); - can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; + bch2_mark_io_failure(&failed, &rb->pick); + + can_retry = bch2_btree_pick_ptr(c, b, &failed, &rb->pick) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, b, can_retry)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4a1ec3bba91b..4a62eefd40cd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -519,12 +519,45 @@ out: return out - buf; } -static inline bool dev_latency_better(struct bch_fs *c, - const struct bch_extent_ptr *ptr1, - const struct bch_extent_ptr *ptr2) +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) { - struct bch_dev *dev1 = 
bch_dev_bkey_exists(c, ptr1->dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) +{ + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } +} + +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) +{ + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); u64 l2 = atomic64_read(&dev2->cur_latency[READ]); @@ -535,11 +568,12 @@ static inline bool dev_latency_better(struct bch_fs *c, static int extent_pick_read_device(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, + struct bch_io_failures *failed, struct extent_ptr_decoded *pick) { const union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; struct bch_dev *ca; int ret = 0; @@ -549,14 +583,11 @@ static int extent_pick_read_device(struct bch_fs *c, if (p.ptr.cached && ptr_stale(ca, &p.ptr)) continue; - /* - * XXX: need to make avoid work correctly for stripe ptrs - */ - - if (avoid && test_bit(p.ptr.dev, avoid->d)) + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f && f->nr_failed >= f->nr_retries) continue; - if (ret && !dev_latency_better(c, &p.ptr, &pick->ptr)) + if (ret && !ptr_better(c, p, *pick)) continue; *pick = p; @@ -685,11 +716,11 @@ int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, } int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid, + struct bch_io_failures *failed, struct extent_ptr_decoded *pick) { return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, pick); + failed, pick); } /* Extents */ @@ -1909,7 +1940,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. 
*/ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, + struct bch_io_failures *failed, struct extent_ptr_decoded *pick) { int ret; @@ -1921,7 +1952,7 @@ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT: case BCH_EXTENT_CACHED: ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), - avoid, pick); + failed, pick); if (!ret && !bkey_extent_is_cached(k.k)) ret = -EIO; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index e2f6caefcb31..8754a940a476 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -53,12 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid, + struct bch_io_failures *, struct extent_ptr_decoded *); - int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, + struct bch_io_failures *, struct extent_ptr_decoded *); void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 3540e2558c0f..5738738d7953 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -24,4 +24,13 @@ struct extent_ptr_decoded { struct bch_extent_ptr ptr; }; +struct bch_io_failures { + u8 nr; + struct bch_dev_io_failures { + u8 dev; + u8 nr_failed; + u8 nr_retries; + } devs[BCH_REPLICAS_MAX]; +}; + #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a4660746be0d..133b702299dd 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1203,7 +1203,8 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, + unsigned flags) { struct btree_iter iter; BKEY_PADDED(k) tmp; @@ -1237,7 +1238,7 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1251,7 +1252,7 @@ out: static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { struct btree_iter iter; struct bkey_s_c k; @@ -1274,7 +1275,7 @@ retry: (k.k->p.offset - bvec_iter.bi_sector) << 9); swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1310,14 +1311,12 @@ static void bch2_rbio_retry(struct work_struct *work) struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; u64 inode = rbio->pos.inode; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); - memset(&avoid, 0, sizeof(avoid)); - if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ptr.dev, avoid.d); + bch2_mark_io_failure(&failed, &rbio->pick); rbio->bio.bi_status = 0; @@ -1327,9 +1326,9 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, 
flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); else - bch2_read_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &failed, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1569,7 +1568,7 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; @@ -1579,7 +1578,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick); /* hole or reservation - just zero fill: */ if (!pick_ret) @@ -1750,7 +1749,7 @@ noclone: rbio = bch2_rbio_free(rbio); if (ret == READ_RETRY_AVOID) { - __set_bit(pick.ptr.dev, avoid->d); + bch2_mark_io_failure(failed, &pick); ret = READ_RETRY; } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index c832b7291005..8a7f246e8823 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -102,7 +102,7 @@ struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_io_failures *, unsigned); void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { -- cgit From b564513cf990d2d30305ac63a72a013fc197e7da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Nov 2018 16:02:02 -0400 Subject: bcachefs: fix bch2_bkey_print_bfloat was popping an assertion in the eytzinger code Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 100 ++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 8e14e4be0b5c..74d8871528a6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1774,7 +1774,7 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, struct bkey_packed *l, *r, *p; struct bkey uk, up; char buf1[200], buf2[200]; - unsigned j; + unsigned j, inorder; if (!size) return 0; @@ -1782,53 +1782,57 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, if (!bset_has_ro_aux_tree(t)) goto out; - j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); - if (j && - j < t->size && - k == tree_to_bkey(b, t, j)) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - return scnprintf(buf, size, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - } + inorder = bkey_to_cacheline(b, t, k); + if (!inorder || inorder >= t->size) + goto out; + + j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + if (k != tree_to_bkey(b, t, j)) + goto out; + + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + case BFLOAT_FAILED_PREV: + p = tree_to_prev_bkey(b, t, j); + l = is_power_of_2(j) + ? btree_bkey_first(b, t) + : tree_to_prev_bkey(b, t, j >> ffs(j)); + r = is_power_of_2(j + 1) + ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + up = bkey_unpack_key(b, p); + uk = bkey_unpack_key(b, k); + bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); + bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); + + return scnprintf(buf, size, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + } out: *buf = '\0'; return 0; -- cgit From ac10a9611d8794c849092a777a5febc4f69788ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 20:04:54 -0400 Subject: bcachefs: Some fixes for building in userspace userspace allocators don't align allocations as nicely as kernel allocators, which meant that in some cases we weren't allocating big enough bvec arrays - just make the calculations more rigorous and explicit to fix it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- fs/bcachefs/btree_io.c | 3 ++- fs/bcachefs/journal_io.c | 12 +++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 74d8871528a6..7fc8fb85069f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1672,7 +1672,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; - unsigned end; + unsigned end = 0; bch2_btree_node_iter_verify(iter, b); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c8809a59a765..1036b72f1ae6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1881,7 +1881,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - wbio = container_of(bio_alloc_bioset(NULL, 1 << order, + wbio = container_of(bio_alloc_bioset(NULL, + buf_pages(data, sectors_to_write << 9), REQ_OP_WRITE|REQ_META|REQ_FUA, GFP_NOIO, &c->btree_bio), diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3dc24b39022f..0bcc4346285c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -429,7 +429,6 @@ static int journal_read_bucket(struct bch_dev *ca, { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; struct jset *j = NULL; unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), @@ -441,15 +440,22 @@ static int journal_read_bucket(struct bch_dev *ca, while (offset < end) { if (!sectors_read) { -reread: sectors_read = min_t(unsigned, + struct bio *bio; + unsigned nr_bvecs; +reread: + sectors_read = min_t(unsigned, end - offset, buf->size >> 9); + nr_bvecs = buf_pages(buf->data, sectors_read << 9); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); bio->bi_iter.bi_sector = offset; bio->bi_iter.bi_size = sectors_read << 9; bch2_bio_map(bio, buf->data); ret = submit_bio_wait(bio); + kfree(bio); if (bch2_dev_io_err_on(ret, ca, "journal read from sector %llu", -- cgit From 636ad1d391b9e0f22107ace04e6dbc07d8875739 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 20:19:04 -0400 Subject: bcachefs: fix bounds checks in bch2_bio_map() Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 75053322d0f0..ed90bd3a5d18 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -529,15 +529,17 @@ void bch2_bio_map(struct bio *bio, void *base) BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); + BUG_ON(!bio->bi_max_vecs); bv->bv_offset = base ? offset_in_page(base) : 0; goto start; for (; size; bio->bi_vcnt++, bv++) { + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + bv->bv_offset = 0; start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, size); - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); if (base) { bv->bv_page = is_vmalloc_addr(base) ? 
vmalloc_to_page(base) -- cgit From 6bdbfa87a832c0d3766e1b680729fede21eca7dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 21:51:31 -0400 Subject: bcachefs: Fix journal replay when replicas sb section missing Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0bcc4346285c..eb2fbe235483 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -746,7 +746,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) jlist.ret = 0; for_each_member_device(ca, c, iter) { - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || -- cgit From 02f1a96c135a7c99518261bf8d244091416c30c6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 21:52:52 -0400 Subject: bcachefs: Rename nofsck opt to fsck Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/opts.h | 4 ++-- fs/bcachefs/recovery.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 99852e0eb22f..7e08592253a6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1465,7 +1465,7 @@ static int bch2_fsck_walk_inodes_only(struct bch_fs *c) int bch2_fsck(struct bch_fs *c) { - if (!c->opts.nofsck) + if (c->opts.fsck) return bch2_fsck_full(c); if (!c->sb.clean && diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 01f1cb53eb5f..52fb9781d933 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -153,9 +153,9 @@ enum opt_type { BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \ OPT_BOOL(), \ NO_SB_OPT, false) \ - BCH_OPT(nofsck, u8, OPT_MOUNT, \ + BCH_OPT(fsck, u8, OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false) \ + NO_SB_OPT, true) \ BCH_OPT(fix_errors, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6d603654d150..2fd68a39d76a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -152,7 +152,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!clean || !c->opts.nofsck) { + if (!clean || c->opts.fsck) { ret = bch2_journal_read(c, &journal); if (ret) goto err; -- cgit From 72644db153e6d3fc146d2c805dcba5b5d676cd99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Nov 2018 22:00:50 -0400 Subject: bcachefs: Fix an assertion when rebuilding replicas Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 6 ++++-- fs/bcachefs/recovery.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4a62eefd40cd..e2bb1502eaad 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -679,7 +679,8 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (!bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, @@ -1713,7 +1714,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; } - if (!bch2_bkey_replicas_marked(c, btree_node_type(b), 
e.s_c)) { + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2fd68a39d76a..e21551e8d6cc 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -219,6 +219,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "mark and sweep done"); + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + if (c->opts.noreplay) goto out; -- cgit From b092dadd55fb242a480f81c421303a9e53302156 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Nov 2018 21:55:35 -0500 Subject: bcachefs: Scale down number of writepoints when low on space this means we don't have to reserve space for them when calculating filesystem capacity Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 41 +++------------ fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_foreground.c | 113 +++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/alloc_foreground.h | 11 +--- fs/bcachefs/alloc_types.h | 4 +- fs/bcachefs/bcachefs.h | 6 ++- fs/bcachefs/buckets.c | 5 -- fs/bcachefs/buckets.h | 6 +++ fs/bcachefs/super.c | 3 +- 9 files changed, 133 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 88be5f4be4b1..1eb39283e7e2 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -975,6 +975,7 @@ void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1012,12 +1013,9 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve += ca->free_inc.size; - dev_reserve += ARRAY_SIZE(c->write_points); - dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ dev_reserve += 1; /* rebalance write point */ - dev_reserve += WRITE_POINT_COUNT; dev_reserve *= ca->mi.bucket_size; @@ -1027,6 +1025,9 @@ void bch2_recalc_capacity(struct bch_fs *c) ca->mi.first_bucket); reserved_sectors += dev_reserve * 2; + + bucket_size_max = max_t(unsigned, bucket_size_max, + ca->mi.bucket_size); } gc_reserve = c->opts.gc_reserve_bytes @@ -1039,6 +1040,8 @@ void bch2_recalc_capacity(struct bch_fs *c) c->capacity = capacity - reserved_sectors; + c->bucket_size_max = bucket_size_max; + if (c->capacity) { bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); @@ -1330,8 +1333,6 @@ not_enough: * invalidated on disk: */ if (invalidating_data) { - BUG(); - pr_info("holding writes"); pr_debug("invalidating existing data"); set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); } else { @@ -1391,40 +1392,12 @@ int bch2_fs_allocator_start(struct bch_fs *c) return bch2_alloc_write(c); } -void bch2_fs_allocator_init(struct bch_fs *c) +void bch2_fs_allocator_background_init(struct bch_fs *c) { - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); spin_lock_init(&c->freelist_lock); bch2_bucket_clock_init(c, READ); bch2_bucket_clock_init(c, WRITE); - /* open bucket 0 is a sentinal NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - 
writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); - - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - writepoint_init(wp, BCH_DATA_USER); - - wp->last_used = sched_clock(); - wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - } - c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 6dbabe83cab7..245e037fbaea 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -57,6 +57,6 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *); int bch2_fs_allocator_start(struct bch_fs *); -void bch2_fs_allocator_init(struct bch_fs *); +void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 920d9ff3c53b..df74e41ec890 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -492,7 +492,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, mutex_lock(&wp->lock); open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->ptr.dev == ca->dev_idx) + if (!ca || ob->ptr.dev == ca->dev_idx) open_bucket_free_unused(c, wp, ob); else ob_push(c, &ptrs, ob); @@ -501,6 +501,15 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&wp->lock); } +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + static struct write_point *__writepoint_find(struct hlist_head *head, unsigned long write_point) { @@ -513,6 +522,53 @@ static struct write_point *__writepoint_find(struct hlist_head *head, return NULL; } +static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +{ + u64 stranded = c->write_points_nr * c->bucket_size_max; + u64 free = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + + return stranded * factor > free; +} + +static bool try_increase_writepoints(struct bch_fs *c) +{ + struct write_point *wp; + + if (c->write_points_nr == ARRAY_SIZE(c->write_points) || + too_many_writepoints(c, 32)) + return false; + + wp = c->write_points + c->write_points_nr++; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + return true; +} + +static bool try_decrease_writepoints(struct bch_fs *c, + unsigned old_nr) +{ + struct write_point *wp; + + mutex_lock(&c->write_points_hash_lock); + if (c->write_points_nr < old_nr) { + mutex_unlock(&c->write_points_hash_lock); + return true; + } + + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) { + mutex_unlock(&c->write_points_hash_lock); + return false; + } + + wp = c->write_points + --c->write_points_nr; + + hlist_del_rcu(&wp->node); + mutex_unlock(&c->write_points_hash_lock); + + bch2_writepoint_stop(c, NULL, wp); + return true; +} + static struct write_point *writepoint_find(struct bch_fs *c, unsigned long write_point) { @@ -536,16 +592,22 @@ lock_wp: mutex_unlock(&wp->lock); goto restart_find; } - +restart_find_oldest: oldest = NULL; for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) + wp < c->write_points + c->write_points_nr; wp++) if (!oldest || time_before64(wp->last_used, 
oldest->last_used)) oldest = wp; mutex_lock(&oldest->lock); mutex_lock(&c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto restart_find_oldest; + } + wp = __writepoint_find(head, write_point); if (wp && wp != oldest) { mutex_unlock(&c->write_points_hash_lock); @@ -581,10 +643,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective = 0; struct open_buckets ptrs = { .nr = 0 }; bool have_cache = false; + unsigned write_points_nr; int ret = 0, i; BUG_ON(!nr_replicas || !nr_replicas_required); - +retry: + write_points_nr = c->write_points_nr; wp = writepoint_find(c, write_point.v); if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -637,6 +701,11 @@ err: wp->ptrs = ptrs; mutex_unlock(&wp->lock); + + if (ret == -ENOSPC && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + return ERR_PTR(ret); } @@ -688,3 +757,37 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) bch2_open_buckets_put(c, &ptrs); } + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + c->write_points_nr = ARRAY_SIZE(c->write_points); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_USER); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); + } +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 636fe686dc48..6672101cbe26 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -91,15 +91,6 @@ void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); -static inline struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - static inline struct write_point_specifier writepoint_hashed(unsigned long v) { return (struct write_point_specifier) { .v = v | 1 }; @@ -117,4 +108,6 @@ static inline void writepoint_init(struct write_point *wp, wp->type = type; } +void bch2_fs_allocator_foreground_init(struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index e0306d68ae9f..2a9c6f0344ed 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -46,7 +46,9 @@ typedef FIFO(long) alloc_fifo; /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 -#define WRITE_POINT_COUNT 32 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 struct open_bucket { spinlock_t lock; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 
95d505aaf82f..5665b93f200b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -601,6 +601,7 @@ struct bch_fs { * and forces them to be revalidated */ u32 capacity_gen; + unsigned bucket_size_max; atomic64_t sectors_available; @@ -630,9 +631,10 @@ struct bch_fs { struct write_point btree_write_point; struct write_point rebalance_write_point; - struct write_point write_points[WRITE_POINT_COUNT]; - struct hlist_head write_points_hash[WRITE_POINT_COUNT]; + struct write_point write_points[WRITE_POINT_MAX]; + struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; struct mutex write_points_hash_lock; + unsigned write_points_nr; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6f40c4bd16ec..cfbe3ed41d0e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -300,11 +300,6 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } -static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) -{ - return c->capacity - bch2_fs_sectors_used(c, stats); -} - static inline int is_unavailable_bucket(struct bucket_mark m) { return !is_available_bucket(m); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e22c51972c31..c40ffe862a06 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -175,6 +175,12 @@ void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +static inline u64 bch2_fs_sectors_free(struct bch_fs *c, + struct bch_fs_usage stats) +{ + return c->capacity - bch2_fs_sectors_used(c, stats); +} + static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9d9d4fb8348b..a2ee698970a8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -524,7 +524,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_allocator_init(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); -- cgit From 8b335baef22768deb7140e45f32f37ea51a1faf4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Nov 2018 22:09:51 -0500 Subject: bcachefs: Assorted fixes for running on very small devices It's now possible to create and use a filesystem on a 512k device with 4k buckets (though at that size we still waste almost half to internal reserves) Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 11 +++++++---- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/bcachefs_format.h | 4 +++- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/recovery.c | 4 ++-- fs/bcachefs/super-io.c | 2 +- 7 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1eb39283e7e2..a4c4a08aed59 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -374,6 +374,11 @@ static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) } } +static inline u64 bucket_clock_freq(u64 capacity) +{ + return max(capacity >> 10, 2028ULL); +} + static void bch2_inc_clock_hand(struct io_timer *timer) { struct bucket_clock *clock = container_of(timer, @@ -412,7 +417,7 @@ static void bch2_inc_clock_hand(struct io_timer 
*timer) * RW mode (that will be 0 when we're RO, yet we can still service * reads) */ - timer->expire += capacity >> 10; + timer->expire += bucket_clock_freq(capacity); bch2_io_timer_add(&c->io_clock[clock->rw], timer); } @@ -424,7 +429,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) clock->hand = 1; clock->rw = rw; clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = c->capacity >> 10; + clock->rescale.expire = bucket_clock_freq(c->capacity); mutex_init(&clock->lock); } @@ -1011,8 +1016,6 @@ void bch2_recalc_capacity(struct bch_fs *c) for (j = 0; j < RESERVE_NONE; j++) dev_reserve += ca->free[j].size; - dev_reserve += ca->free_inc.size; - dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ dev_reserve += 1; /* rebalance write point */ diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 245e037fbaea..33224070e827 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -6,7 +6,7 @@ #include "alloc_types.h" #include "debug.h" -#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) +#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5665b93f200b..22df84b78f4b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -325,7 +325,7 @@ enum bch_time_stats { #define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) /* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) +#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX struct btree; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 46355f006793..d74f1e5c21e0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -909,6 +909,8 @@ struct bch_sb_field_journal { /* BCH_SB_FIELD_members: */ +#define BCH_MIN_NR_NBUCKETS (1 << 6) + struct bch_member { __uuid_t uuid; __le64 nbuckets; /* device size */ @@ -1391,7 +1393,7 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -#define BCH_JOURNAL_BUCKETS_MIN 20 +#define BCH_JOURNAL_BUCKETS_MIN 8 /* Btree: */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index cfbe3ed41d0e..28ec8a58319c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -887,9 +887,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 4, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 16, nbuckets >> 7); - size_t free_inc_nr = max(max_t(size_t, 16, nbuckets >> 12), + size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e21551e8d6cc..696e01f4962f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -278,7 +278,7 @@ out: return ret; err: fsck_err: - BUG_ON(!ret); + pr_err("Error in recovery: %s (%i)", err, ret); goto out; } @@ -381,6 +381,6 @@ int 
bch2_fs_initialize(struct bch_fs *c) return 0; err: - BUG_ON(!ret); + pr_err("Error initializing new filesystem: %s (%i)", err, ret); return ret; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 64c2375302a0..58c35d9665eb 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -811,7 +811,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return "Too many buckets"; if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < 1 << 10) + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) return "Not enough buckets"; if (le16_to_cpu(m->bucket_size) < -- cgit From 4628529f152782933865257796000f1f6702a9ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Nov 2018 23:10:09 -0500 Subject: bcachefs: Disk usage in compressed sectors, not uncompressed Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 9 +++++++-- fs/bcachefs/extents.c | 5 ++--- fs/bcachefs/move.c | 21 ++++++++++++++++++++- 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 28ec8a58319c..54eb1b6b820b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -339,12 +339,17 @@ void bch2_fs_usage_apply(struct bch_fs *c, { struct fs_usage_sum sum = __fs_usage_sum(*stats); s64 added = sum.data + sum.reserved; + s64 should_not_have_added; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0)); + should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); + if (WARN_ON(should_not_have_added > 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + } if (added > 0) { disk_res->sectors -= added; @@ -667,7 +672,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, stats->replicas [!p.ptr.cached && replicas ? replicas - 1 : 0].data [!p.ptr.cached ? 
data_type : BCH_DATA_CACHED] += - sectors; + disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, stats, journal_seq, flags); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e2bb1502eaad..1606826e7802 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -235,7 +235,7 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) if (!p.ptr.cached && p.crc.compression_type != BCH_COMPRESSION_NONE && p.crc.compressed_size < p.crc.live_size) - ret = max_t(unsigned, ret, p.crc.compressed_size); + ret += p.crc.compressed_size; } } @@ -1275,8 +1275,7 @@ bch2_extent_can_insert(struct btree_insert *trans, switch (bch2_disk_reservation_add(trans->c, trans->disk_res, - sectors * bch2_extent_nr_dirty_ptrs(k), - flags)) { + sectors, flags)) { case 0: break; case -ENOSPC: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b2bf0944d59d..1f6bad1ae388 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -5,6 +5,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "disk_groups.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -260,8 +261,26 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, } break; } - case DATA_REWRITE: + case DATA_REWRITE: { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned compressed_sectors = 0; + + extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE && + bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + compressed_sectors += p.crc.compressed_size; + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, + compressed_sectors, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } break; + } case DATA_PROMOTE: m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; m->op.flags |= BCH_WRITE_CACHED; -- cgit From a420eea6890d709b7b9b33222d4fecc7a7ad9bd4 Mon Sep 17 00:00:00 2001 From: Tim Schlueter Date: Sun, 4 Nov 2018 20:14:46 -0800 Subject: bcachefs: Set the last mount time using the realtime clock This way the last mount time is actually meaningful instead of just being various times from 1970 (which happens with the monotonic clock). Also, roundup_pow_of_two() is undefined when passed in 0, so check before calling it. Signed-off-by: Tim Schlueter Signed-off-by: Kent Overstreet --- fs/bcachefs/fifo.h | 4 +++- fs/bcachefs/super.c | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 00d245efe72a..0cd5f1931aac 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -13,7 +13,9 @@ struct { \ #define DECLARE_FIFO(type, name) FIFO(type) name #define fifo_buf_size(fifo) \ - (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])) + ((fifo)->size \ + ? 
roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ + : 0) #define init_fifo(fifo, _size, _gfp) \ ({ \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a2ee698970a8..54d23cf46f95 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -654,7 +654,7 @@ const char *bch2_fs_start(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; - time64_t now = ktime_get_seconds(); + time64_t now = ktime_get_real_seconds(); unsigned i; int ret = -EINVAL; @@ -1418,7 +1418,7 @@ have_slot: /* success: */ mi->members[dev_idx] = dev_mi; - mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds()); + mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); c->disk_sb.sb->nr_devices = nr_devices; ca->disk_sb.sb->dev_idx = dev_idx; @@ -1494,7 +1494,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) mi = bch2_sb_get_members(c->disk_sb.sb); mi->members[ca->dev_idx].last_mount = - cpu_to_le64(ktime_get_seconds()); + cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From 82ebd49cd6c68a5af44ada2487e00da606423404 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Nov 2018 00:55:20 -0500 Subject: bcachefs: fix a replicas bug Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 72dd70b00abb..a7c3aca1bf01 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -267,6 +267,8 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bch_replicas_entry_padded search; int ret; + memset(&search, 0, sizeof(search)); + if (type == BKEY_TYPE_EXTENTS) { struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; @@ -564,6 +566,8 @@ bool bch2_bkey_replicas_marked(struct bch_fs *c, struct bch_replicas_entry_padded search; bool ret; + memset(&search, 0, sizeof(search)); + if (type == BKEY_TYPE_EXTENTS) { struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; -- cgit From 75369d4ec3d2dfc52af18a2d20cd0af14c935ac9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Nov 2018 01:42:49 -0500 Subject: bcachefs: delete some dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 84 ------------------------------------------------------ 1 file changed, 84 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 9caf2487ee63..cb6bed68abf8 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -455,96 +455,12 @@ do { \ (var)->p_term_inverse, 1, INT_MAX); \ } while (0) -#define __DIV_SAFE(n, d, zero) \ -({ \ - typeof(n) _n = (n); \ - typeof(d) _d = (d); \ - _d ? _n / _d : zero; \ -}) - -#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) - #define container_of_or_null(ptr, type, member) \ ({ \ typeof(ptr) _ptr = ptr; \ _ptr ? container_of(_ptr, type, member) : NULL; \ }) -#define RB_INSERT(root, new, member, cmp) \ -({ \ - __label__ dup; \ - struct rb_node **n = &(root)->rb_node, *parent = NULL; \ - typeof(new) this; \ - int res, ret = -1; \ - \ - while (*n) { \ - parent = *n; \ - this = container_of(*n, typeof(*(new)), member); \ - res = cmp(new, this); \ - if (!res) \ - goto dup; \ - n = res < 0 \ - ? 
&(*n)->rb_left \ - : &(*n)->rb_right; \ - } \ - \ - rb_link_node(&(new)->member, parent, n); \ - rb_insert_color(&(new)->member, root); \ - ret = 0; \ -dup: \ - ret; \ -}) - -#define RB_SEARCH(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (!res) { \ - ret = this; \ - break; \ - } \ - n = res < 0 \ - ? n->rb_left \ - : n->rb_right; \ - } \ - ret; \ -}) - -#define RB_GREATER(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (res < 0) { \ - ret = this; \ - n = n->rb_left; \ - } else \ - n = n->rb_right; \ - } \ - ret; \ -}) - -#define RB_FIRST(root, type, member) \ - container_of_or_null(rb_first(root), type, member) - -#define RB_LAST(root, type, member) \ - container_of_or_null(rb_last(root), type, member) - -#define RB_NEXT(ptr, member) \ - container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) - -#define RB_PREV(ptr, member) \ - container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) - /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { -- cgit From 319f9ac38eaba628d69b6ddbf402b35487315fc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Nov 2018 01:24:07 -0500 Subject: bcachefs: revamp to_text methods Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 ++--- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/bkey.c | 4 +- fs/bcachefs/bkey_methods.c | 60 +++++++------------ fs/bcachefs/bkey_methods.h | 16 ++--- fs/bcachefs/bset.c | 82 +++++++++++++------------- fs/bcachefs/bset.h | 4 +- fs/bcachefs/btree_cache.c | 85 +++++++++++++-------------- fs/bcachefs/btree_cache.h | 4 +- fs/bcachefs/btree_io.c | 65 ++++++++++---------- fs/bcachefs/btree_iter.c | 19 ++---- fs/bcachefs/btree_update_interior.c | 16 ++--- fs/bcachefs/debug.c | 17 +++--- fs/bcachefs/dirent.c | 15 ++--- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/disk_groups.c | 56 +++++++----------- fs/bcachefs/disk_groups.h | 5 +- fs/bcachefs/extents.c | 92 ++++++++++++----------------- fs/bcachefs/extents.h | 5 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 40 ++++++++----- fs/bcachefs/inode.c | 11 ++-- fs/bcachefs/inode.h | 2 +- fs/bcachefs/journal.c | 88 +++++++++++++--------------- fs/bcachefs/journal_io.c | 7 ++- fs/bcachefs/opts.c | 36 ++++++------ fs/bcachefs/opts.h | 7 ++- fs/bcachefs/quota.c | 15 ++--- fs/bcachefs/quota.h | 2 +- fs/bcachefs/rebalance.c | 34 +++++------ fs/bcachefs/replicas.c | 51 +++++++--------- fs/bcachefs/replicas.h | 3 +- fs/bcachefs/super-io.c | 25 ++++---- fs/bcachefs/super-io.h | 6 +- fs/bcachefs/super.c | 7 +-- fs/bcachefs/sysfs.c | 114 ++++++++++++++++-------------------- fs/bcachefs/util.c | 89 ++++++++++++---------------- fs/bcachefs/util.h | 33 +++++++++-- fs/bcachefs/xattr.c | 48 ++++++++------- fs/bcachefs/xattr.h | 2 +- 40 files changed, 550 insertions(+), 635 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a4c4a08aed59..291d352ee370 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -94,17 +94,17 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -int 
bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - buf[0] = '\0'; - switch (k.k->type) { - case BCH_ALLOC: + case BCH_ALLOC: { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + pr_buf(out, "gen %u", a.v->gen); break; } - - return 0; + } } static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 33224070e827..99535fa60214 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -9,7 +9,7 @@ #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_alloc_ops (struct bkey_ops) { \ .key_invalid = bch2_alloc_invalid, \ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index c0e86ada1c53..d7e022ba2027 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -60,8 +60,8 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, char buf1[160], buf2[160]; char buf3[160], buf4[160]; - bch2_bkey_to_text(buf1, sizeof(buf1), unpacked); - bch2_bkey_to_text(buf2, sizeof(buf2), &tmp); + bch2_bkey_to_text(&PBUF(buf1), unpacked); + bch2_bkey_to_text(&PBUF(buf2), &tmp); bch2_to_binary(buf3, (void *) unpacked, 80); bch2_to_binary(buf4, high_word(format, packed), 80); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index b3f5f28b8761..7335fbbb3f61 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -111,7 +111,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) if (invalid) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k); + bch2_bkey_val_to_text(&PBUF(buf), c, type, k); bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); return; } @@ -121,73 +121,57 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) ops->key_debugcheck(c, b, k); } -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - -int bch2_bpos_to_text(char *buf, size_t size, struct bpos pos) +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - char *out = buf, *end = buf + size; - if (!bkey_cmp(pos, POS_MIN)) - p("POS_MIN"); + pr_buf(out, "POS_MIN"); else if (!bkey_cmp(pos, POS_MAX)) - p("POS_MAX"); + pr_buf(out, "POS_MAX"); else - p("%llu:%llu", pos.inode, pos.offset); - - return out - buf; + pr_buf(out, "%llu:%llu", pos.inode, pos.offset); } -int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) +void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { - char *out = buf, *end = buf + size; - - p("u64s %u type %u ", k->u64s, k->type); + pr_buf(out, "u64s %u type %u ", k->u64s, k->type); - out += bch2_bpos_to_text(out, end - out, k->p); + bch2_bpos_to_text(out, k->p); - p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo); - - return out - buf; + pr_buf(out, " snap %u len %u ver %llu", + k->p.snapshot, k->size, k->version.lo); } -int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k) { const struct bkey_ops *ops = &bch2_bkey_ops[type]; - char *out = buf, *end = buf + size; switch (k.k->type) { case KEY_TYPE_DELETED: - p(" deleted"); + pr_buf(out, " deleted"); break; case KEY_TYPE_DISCARD: - p(" discard"); + pr_buf(out, " discard"); break; case KEY_TYPE_ERROR: - p(" error"); + pr_buf(out, " error"); break; case KEY_TYPE_COOKIE: - p(" cookie"); + pr_buf(out, " cookie"); break; default: if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text) - out += ops->val_to_text(c, out, end - out, k); + ops->val_to_text(out, c, k); break; } - - return out - buf; } -int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k) { - char *out = buf, *end = buf + size; - - out += bch2_bkey_to_text(out, end - out, k.k); - out += scnprintf(out, end - out, ": "); - out += bch2_val_to_text(c, type, out, end - out, k); - - return out - buf; + bch2_bkey_to_text(out, k.k); + pr_buf(out, ": "); + bch2_val_to_text(out, c, type, k); } void bch2_bkey_swab(enum bkey_type type, diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 6ee774ba3d7a..be6041e92c05 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -46,8 +46,8 @@ struct bkey_ops { struct bkey_s_c); void (*key_debugcheck)(struct bch_fs *, struct btree *, struct bkey_s_c); - int (*val_to_text)(struct bch_fs *, char *, - size_t, struct bkey_s_c); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); key_filter_fn key_normalize; key_merge_fn key_merge; @@ -62,12 +62,12 @@ const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -int bch2_bpos_to_text(char *, size_t, struct bpos); -int bch2_bkey_to_text(char *, size_t, const struct bkey *); -int bch2_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); -int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void 
bch2_val_to_text(struct printbuf *, struct bch_fs *, enum bkey_type, + struct bkey_s_c); +void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, + enum bkey_type, struct bkey_s_c); void bch2_bkey_swab(enum bkey_type, const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 7fc8fb85069f..ac84aac4a263 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -56,7 +56,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _k = _n, k = n) { _n = bkey_next(_k); - bch2_bkey_to_text(buf, sizeof(buf), &k); + bch2_bkey_to_text(&PBUF(buf), &k); printk(KERN_ERR "block %u key %5u: %s\n", set, __btree_node_key_to_offset(b, _k), buf); @@ -106,7 +106,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct bkey uk = bkey_unpack_key(b, k); char buf[100]; - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); } @@ -150,8 +150,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, char buf1[80], buf2[80]; bch2_dump_btree_node(b); - bch2_bkey_to_text(buf1, sizeof(buf1), &ku); - bch2_bkey_to_text(buf2, sizeof(buf2), &nu); + bch2_bkey_to_text(&PBUF(buf1), &ku); + bch2_bkey_to_text(&PBUF(buf2), &nu); printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", buf1, buf2); printk(KERN_ERR "iter was:"); @@ -212,8 +212,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, char buf2[100]; bch2_dump_btree_node(b); - bch2_bkey_to_text(buf1, sizeof(buf1), &k1); - bch2_bkey_to_text(buf2, sizeof(buf2), &k2); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); panic("prev > insert:\n" "prev key %5u %s\n" @@ -234,8 +234,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, char buf2[100]; bch2_dump_btree_node(b); - bch2_bkey_to_text(buf1, sizeof(buf1), &k1); - bch2_bkey_to_text(buf2, sizeof(buf2), &k2); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); panic("insert > next:\n" "insert key %5u %s\n" @@ -1767,8 +1767,8 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) } } -int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, - char *buf, size_t size) +void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey_packed *k) { struct bset_tree *t = bch2_bkey_to_bset(b, k); struct bkey_packed *l, *r, *p; @@ -1776,28 +1776,29 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, char buf1[200], buf2[200]; unsigned j, inorder; - if (!size) - return 0; + if (out->pos != out->end) + *out->pos = '\0'; if (!bset_has_ro_aux_tree(t)) - goto out; + return; inorder = bkey_to_cacheline(b, t, k); if (!inorder || inorder >= t->size) - goto out; + return; j = __inorder_to_eytzinger1(inorder, t->size, t->extra); if (k != tree_to_bkey(b, t, j)) - goto out; + return; switch (bkey_float(b, t, j)->exponent) { case BFLOAT_FAILED_UNPACKED: uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); + pr_buf(out, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + break; case BFLOAT_FAILED_PREV: p = tree_to_prev_bkey(b, t, j); l = is_power_of_2(j) @@ -1812,28 +1813,27 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); 
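[Aside, not part of the patch: every hunk in this commit converts scnprintf-into-a-buffer code to the printbuf helpers PBUF(), _PBUF() and pr_buf(). Those helpers are added in the fs/bcachefs/util.h portion of the diffstat, which is not reproduced in this section, so the following is a minimal userspace sketch of what that interface appears to be, reconstructed from how it is used here; the field names and exact truncation behaviour are assumptions:

	#include <stdarg.h>
	#include <stdio.h>

	struct printbuf {
		char	*pos;	/* next byte to write */
		char	*end;	/* one past the end of the buffer */
	};

	/* wrap an existing buffer; PBUF() infers the size of an array */
	#define _PBUF(_buf, _len)	((struct printbuf) { .pos = (_buf), .end = (_buf) + (_len) })
	#define PBUF(_buf)		_PBUF(_buf, sizeof(_buf))

	/* append formatted output without ever writing past end */
	static void pr_buf(struct printbuf *out, const char *fmt, ...)
	{
		size_t room = out->pos < out->end ? (size_t) (out->end - out->pos) : 0;
		va_list args;
		int ret;

		va_start(args, fmt);
		ret = vsnprintf(out->pos, room, fmt, args);
		va_end(args);

		/* advance by what was actually written, like the kernel's vscnprintf */
		if (ret > 0)
			out->pos += (size_t) ret < room ? (size_t) ret
							: (room ? room - 1 : 0);
	}

Callers then write char buf[100]; bch2_bkey_to_text(&PBUF(buf), k); and the sysfs/debugfs paths in this commit compute their return value as out.pos - buf, i.e. the number of bytes appended so far. End of aside.]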
bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - return scnprintf(buf, size, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); + pr_buf(out, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + break; case BFLOAT_FAILED_OVERFLOW: uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); + pr_buf(out, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + break; } -out: - *buf = '\0'; - return 0; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 1b0122dad2bc..5d03036620b9 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -607,8 +607,8 @@ struct bset_stats { }; void bch2_btree_keys_stats(struct btree *, struct bset_stats *); -int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *, - char *, size_t); +void bch2_bfloat_to_text(struct printbuf *, struct btree *, + struct bkey_packed *); /* Debug stuff */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 3cb3da363d11..846d5e816aa2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -888,55 +888,54 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); } -int bch2_print_btree_node(struct bch_fs *c, struct btree *b, - char *buf, size_t len) +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) { const struct bkey_format *f = &b->format; struct bset_stats stats; - char ptrs[100]; memset(&stats, 0, sizeof(stats)); - bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs), - bkey_i_to_s_c(&b->key)); bch2_btree_keys_stats(b, &stats); - return scnprintf(buf, len, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: %s\n" - " format: u64s %u fields %u %u %u %u %u\n" - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n" - " failed prev %zu\n" - " failed overflow %zu\n", - b->level, - b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset, - ptrs, - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed_unpacked, - stats.failed_prev, - stats.failed_overflow); + pr_buf(out, + "l %u %llu:%llu - %llu:%llu:\n" + " ptrs: ", + b->level, + b->data->min_key.inode, + b->data->min_key.offset, + b->data->max_key.inode, + b->data->max_key.offset); + bch2_val_to_text(out, c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key)); + pr_buf(out, "\n" + " 
format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %zu)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n" + " failed prev %zu\n" + " failed overflow %zu\n", + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], + f->bits_per_field[2], + f->bits_per_field[3], + f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], + BTREE_FOREGROUND_MERGE_THRESHOLD(c), + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed_unpacked, + stats.failed_prev, + stats.failed_overflow); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index f7b9bcfe09a3..cb7f66fc8bd4 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -85,7 +85,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) -int bch2_print_btree_node(struct bch_fs *, struct btree *, - char *, size_t); +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1036b72f1ae6..f1c31e74348a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -913,26 +913,20 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) vstruct_end(i) - (void *) i->_data); } -static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i, - unsigned offset, int write, char *buf, size_t len) +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct btree *b, struct bset *i, + unsigned offset, int write) { - char *out = buf, *end = buf + len; - - out += scnprintf(out, end - out, - "error validating btree node %s" - "at btree %u level %u/%u\n" - "pos %llu:%llu node offset %u", - write ? "before write " : "", - b->btree_id, b->level, - c->btree_roots[b->btree_id].level, - b->key.k.p.inode, b->key.k.p.offset, - b->written); + pr_buf(out, "error validating btree node %s" + "at btree %u level %u/%u\n" + "pos %llu:%llu node offset %u", + write ? "before write " : "", + b->btree_id, b->level, + c->btree_roots[b->btree_id].level, + b->key.k.p.inode, b->key.k.p.offset, + b->written); if (i) - out += scnprintf(out, end - out, - " bset u64s %u", - le16_to_cpu(i->u64s)); - - return out - buf; + pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } enum btree_err_type { @@ -949,10 +943,11 @@ enum btree_validate_ret { #define btree_err(type, c, b, i, msg, ...) 
\ ({ \ __label__ out; \ - char _buf[300], *out = _buf, *end = out + sizeof(_buf); \ + char _buf[300]; \ + struct printbuf out = PBUF(_buf); \ \ - out += btree_err_msg(c, b, i, b->written, write, out, end - out);\ - out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__); \ + btree_err_msg(&out, c, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -1117,7 +1112,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (invalid) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + bch2_bkey_val_to_text(&PBUF(buf), c, type, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey:\n%s\n%s", invalid, buf); @@ -1302,7 +1297,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + bch2_bkey_val_to_text(&PBUF(buf), c, type, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey %s: %s", buf, invalid); @@ -2060,7 +2055,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c) ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; @@ -2077,18 +2072,18 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) !(b->will_make_reachable & 1)) continue; - out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n", - b, - (flags & (1 << BTREE_NODE_dirty)) != 0, - b->level, - b->written, - !list_empty_careful(&b->write_blocked), - b->will_make_reachable != 0, - b->will_make_reachable & 1, - b->writes[ idx].wait.list.first != NULL, - b->writes[!idx].wait.list.first != NULL); + pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + b->level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, + b->will_make_reachable & 1, + b->writes[ idx].wait.list.first != NULL, + b->writes[!idx].wait.list.first != NULL); } rcu_read_unlock(); - return out - buf; + return out.pos - buf; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1ba59c53c36f..ea37fa21ed6e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -427,7 +427,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } @@ -437,7 +437,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); panic("iter should be after current key:\n" "iter pos %llu:%llu\n" "cur key %s\n", @@ -687,7 +687,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", buf, b->key.k.p.inode, b->key.k.p.offset); } @@ -1451,18 +1451,7 @@ recheck: : KEY_OFFSET_MAX) - n.p.offset)); - //EBUG_ON(!n.size); - if (!n.size) { - char buf[100]; - bch2_dump_btree_node(iter->l[0].b); - - 
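[Aside, not part of the patch: the surrounding btree_iter.c hunk replaces a hand-rolled dump-and-panic block with EBUG_ON(!n.size). EBUG_ON() is bcachefs's debug-build-only assertion; its definition is not shown anywhere in this series, so the sketch below is an assumption about what it expands to, the point being that the check vanishes from non-debug builds:

	/* assumed definition, from fs/bcachefs/bcachefs.h (not part of this patch) */
	#ifdef CONFIG_BCACHEFS_DEBUG
	#define EBUG_ON(cond)		BUG_ON(cond)
	#else
	#define EBUG_ON(cond)		do {} while (0)
	#endif

End of aside.]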
bch2_bkey_to_text(buf, sizeof(buf), k.k); - panic("iter at %llu:%llu\n" - "next key %s\n", - iter->pos.inode, - iter->pos.offset, - buf); - } + EBUG_ON(!n.size); iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4ec448718fd8..92bacd16fdc3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2150,20 +2150,20 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, - as->journal.seq); + pr_buf(&out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); - return out - buf; + return out.pos - buf; } size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 550bb10bbb7b..0a9efe57d5a9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -223,8 +223,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, k = bch2_btree_iter_peek(&iter); while (k.k && !(err = btree_iter_err(k))) { - bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), - i->buf, sizeof(i->buf), k); + bch2_bkey_val_to_text(&PBUF(i->buf), i->c, + bkey_type(0, i->id), k); i->bytes = strlen(i->buf); BUG_ON(i->bytes >= PAGE_SIZE); i->buf[i->bytes] = '\n'; @@ -272,8 +272,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, return i->ret; for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { - i->bytes = bch2_print_btree_node(i->c, b, i->buf, - sizeof(i->buf)); + bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; @@ -330,17 +330,16 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_btree_node_iter_peek(&l->iter, l->b); if (l->b != prev_node) { - i->bytes = bch2_print_btree_node(i->c, l->b, i->buf, - sizeof(i->buf)); + bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; } prev_node = l->b; - i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf, - sizeof(i->buf)); - + bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 0651f5575131..c1a611b4d9ec 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -110,26 +110,23 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -int bch2_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; struct bkey_s_c_dirent d; switch (k.k->type) { case BCH_DIRENT: d = bkey_s_c_to_dirent(k); - out += bch_scnmemcpy(out, end - out, d.v->d_name, - bch2_dirent_name_bytes(d)); - out += scnprintf(out, end - out, " -> %llu", d.v->d_inum); + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); + pr_buf(out, " -> 
%llu", d.v->d_inum); break; case BCH_DIRENT_WHITEOUT: - out += scnprintf(out, end - out, "whiteout"); + pr_buf(out, "whiteout"); break; } - - return out - buf; } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 30d2143d4ca7..2afb0baed11a 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_dirent_ops (struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 48f472a384f1..ee10308131e9 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -83,11 +83,10 @@ err: return err; } -static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, +static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { - char *out = buf, *end = buf + size; struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g; @@ -97,18 +96,14 @@ static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, g < groups->entries + nr_groups; g++) { if (g != groups->entries) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); if (BCH_GROUP_DELETED(g)) - out += scnprintf(out, end - out, "[deleted]"); + pr_buf(out, "[deleted]"); else - out += scnprintf(out, end - out, - "[parent %llu name %s]", - BCH_GROUP_PARENT(g), - g->label); + pr_buf(out, "[parent %llu name %s]", + BCH_GROUP_PARENT(g), g->label); } - - return out - buf; } const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { @@ -343,10 +338,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -int bch2_disk_path_print(struct bch_sb_handle *sb, - char *buf, size_t len, unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, + struct bch_sb_handle *sb, + unsigned v) { - char *out = buf, *end = out + len; struct bch_sb_field_disk_groups *groups = bch2_sb_get_disk_groups(sb->sb); struct bch_disk_group *g; @@ -374,26 +369,18 @@ int bch2_disk_path_print(struct bch_sb_handle *sb, } while (nr) { - unsigned b = 0; - v = path[--nr]; g = groups->entries + v; - if (end != out) - b = min_t(size_t, end - out, - strnlen(g->label, sizeof(g->label))); - memcpy(out, g->label, b); - if (b < end - out) - out[b] = '\0'; - out += b; + bch_scnmemcpy(out, g->label, + strnlen(g->label, sizeof(g->label))); if (nr) - out += scnprintf(out, end - out, "."); + pr_buf(out, "."); } - - return out - buf; + return; inval: - return scnprintf(buf, len, "invalid group %u", v); + pr_buf(out, "invalid group %u", v); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ -452,14 +439,14 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } -int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) +void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) { struct target t = target_decode(v); - int ret; switch (t.type) { case TARGET_NULL: - return scnprintf(buf, len, "none"); + pr_buf(out, "none"); + break; case TARGET_DEV: { struct bch_dev *ca; @@ -469,13 +456,12 @@ int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) : NULL; if (ca && 
percpu_ref_tryget(&ca->io_ref)) { - ret = scnprintf(buf, len, "/dev/%pg", - ca->disk_sb.bdev); + pr_buf(out, "/dev/%pg", ca->disk_sb.bdev); percpu_ref_put(&ca->io_ref); } else if (ca) { - ret = scnprintf(buf, len, "offline device %u", t.dev); + pr_buf(out, "offline device %u", t.dev); } else { - ret = scnprintf(buf, len, "invalid device %u", t.dev); + pr_buf(out, "invalid device %u", t.dev); } rcu_read_unlock(); @@ -483,12 +469,10 @@ int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) } case TARGET_GROUP: mutex_lock(&c->sb_lock); - ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group); + bch2_disk_path_to_text(out, &c->disk_sb, t.group); mutex_unlock(&c->sb_lock); break; default: BUG(); } - - return ret; } diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index d202eb3a9de6..ceb75f86b615 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -59,10 +59,11 @@ bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, + unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); int bch2_sb_disk_groups_to_cpu(struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1606826e7802..a7223e7c8793 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -464,21 +464,18 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, return NULL; } -static size_t extent_print_ptrs(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c_extent e) +static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c_extent e) { - char *out = buf, *end = buf + size; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; struct bch_dev *ca; bool first = true; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - extent_for_each_entry(e, entry) { if (!first) - p(" "); + pr_buf(out, " "); switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_crc32: @@ -486,12 +483,12 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); break; case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); @@ -499,14 +496,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - p("ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? 
" stale" : ""); break; default: - p("(invalid extent entry %.16llx)", *((u64 *) entry)); + pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); goto out; } @@ -514,9 +511,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, } out: if (bkey_extent_is_cached(e.k)) - p(" cached"); -#undef p - return out - buf; + pr_buf(out, " cached"); } static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, @@ -681,8 +676,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); + bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k); bch2_fs_bug(c, "btree key bad (replicas not marked in superblock):\n%s", buf); @@ -691,29 +685,23 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, return; err: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " - "gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); + bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; const char *invalid; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + extent_print_ptrs(out, c, bkey_s_c_to_extent(k)); invalid = bch2_btree_ptr_invalid(c, k); if (invalid) - p(" invalid: %s", invalid); -#undef p - return out - buf; + pr_buf(out, " invalid: %s", invalid); } int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, @@ -1112,8 +1100,8 @@ static void verify_extent_nonoverlapping(struct btree *b, char buf1[100]; char buf2[100]; - bch2_bkey_to_text(buf1, sizeof(buf1), &insert->k); - bch2_bkey_to_text(buf2, sizeof(buf2), &uk); + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_dump_btree_node(b); panic("insert > next :\n" @@ -1705,8 +1693,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); + bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), + e.s_c); bch2_fs_bug(c, "extent key bad (too many replicas: %u): %s", replicas, buf); @@ -1715,8 +1703,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), e.s_c); + bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), + e.s_c); bch2_fs_bug(c, "extent key bad (replicas not marked in superblock):\n%s", buf); @@ -1726,12 +1714,11 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; bad_ptr: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); + bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), + e.s_c); bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " "gen %i type %u", buf, 
PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); - return; } void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) @@ -1748,22 +1735,17 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k } } -int bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; const char *invalid; -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + extent_print_ptrs(out, c, bkey_s_c_to_extent(k)); invalid = bch2_extent_invalid(c, k); if (invalid) - p(" invalid: %s", invalid); -#undef p - return out - buf; + pr_buf(out, " invalid: %s", invalid); } static void bch2_extent_crc_init(union bch_extent_crc *crc, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 8754a940a476..d121ce5b3225 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -18,7 +18,8 @@ union bch_extent_crc; const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -int bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); #define bch2_bkey_btree_ops (struct bkey_ops) { \ @@ -30,7 +31,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -int bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, struct bkey_i *, struct bkey_i *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 88bf88c047ae..b6fe2059fe5f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1619,7 +1619,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - bch2_opt_to_text(c, buf, sizeof(buf), opt, v, + bch2_opt_to_text(&PBUF(buf), c, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); seq_puts(seq, buf); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7e08592253a6..74b83201c213 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -234,8 +234,9 @@ static int hash_check_duplicates(const struct bch_hash_desc desc, if (fsck_err_on(k2.k->type == desc.key_type && !desc.cmp_bkey(k, k2), c, "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + bkey_type(0, desc.btree_id), + k), buf))) { ret = fsck_hash_delete_at(desc, &h->info, k_iter); if (ret) return ret; @@ -298,8 +299,9 @@ static int hash_check_key(const struct bch_hash_desc desc, "hashed to %llu chain starts at %llu\n%s", desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + bkey_type(0, desc.btree_id), + k), buf))) { ret = hash_redo_key(desc, h, c, k_iter, k, 
hashed); if (ret) { bch_err(c, "hash_redo_key err %i", ret); @@ -382,8 +384,9 @@ err_redo: "hashed to %llu chain starts at %llu\n%s", buf, strlen(buf), BTREE_ID_DIRENTS, k->k->p.offset, hash, h->chain->pos.offset, - (bch2_bkey_val_to_text(c, bkey_type(0, BTREE_ID_DIRENTS), - buf, sizeof(buf), *k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + bkey_type(0, BTREE_ID_DIRENTS), + *k), buf))) { ret = hash_redo_key(bch2_dirent_hash_desc, h, c, iter, *k, hash); if (ret) @@ -525,13 +528,15 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(!w.have_inode, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf)) || + (bch2_bkey_val_to_text(&PBUF(buf), c, + (enum bkey_type) BTREE_ID_DIRENTS, + k), buf)) || fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, "dirent in non directory inode type %u:\n%s", mode_to_type(w.inode.bi_mode), - (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + (enum bkey_type) BTREE_ID_DIRENTS, + k), buf))) { ret = bch2_btree_delete_at(iter, 0); if (ret) goto err; @@ -580,8 +585,9 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(d_inum == d.k->p.inode, c, "dirent points to own directory:\n%s", - (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + (enum bkey_type) BTREE_ID_DIRENTS, + k), buf))) { ret = remove_dirent(c, iter, d); if (ret) goto err; @@ -597,8 +603,9 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(!have_target, c, "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + (enum bkey_type) BTREE_ID_DIRENTS, + k), buf))) { ret = remove_dirent(c, iter, d); if (ret) goto err; @@ -610,8 +617,9 @@ static int check_dirents(struct bch_fs *c) mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(c, (enum bkey_type) BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + (enum bkey_type) BTREE_ID_DIRENTS, + k), buf))) { struct bkey_i_dirent *n; n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index debdbf58dd79..0a350c6d0932 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -228,10 +228,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -int bch2_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = out + size; struct bkey_s_c_inode inode; struct bch_inode_unpacked unpacked; @@ -239,18 +238,16 @@ int bch2_inode_to_text(struct bch_fs *c, char *buf, case BCH_INODE_FS: inode = bkey_s_c_to_inode(k); if (bch2_inode_unpack(inode, &unpacked)) { - out += scnprintf(out, end - out, "(unpack error)"); + pr_buf(out, "(unpack error)"); break; } #define BCH_INODE_FIELD(_name, _bits) \ - out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name); + pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() #undef BCH_INODE_FIELD break; } - - return out - buf; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 8713b51d3af7..897ff65d01cb 100644 --- 
a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,7 +7,7 @@ #include const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_inode_ops (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7499e15a2982..b4d037664628 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1027,38 +1027,38 @@ out: ssize_t bch2_journal_print_debug(struct journal *j, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state *s = &j->reservations; struct bch_dev *ca; unsigned iter; - ssize_t ret = 0; rcu_read_lock(); spin_lock(&j->lock); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "active journal entries:\t%llu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", - fifo_used(&j->pin), - journal_cur_seq(j), - journal_last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, - test_bit(JOURNAL_NEED_WRITE, &j->flags), - journal_entry_is_open(j), - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + pr_buf(&out, + "active journal entries:\t%llu\n" + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "reservation count:\t%u\n" + "reservation offset:\t%u\n" + "current entry u64s:\t%u\n" + "io in flight:\t\t%i\n" + "need write:\t\t%i\n" + "dirty:\t\t\t%i\n" + "replay done:\t\t%i\n", + fifo_used(&j->pin), + journal_cur_seq(j), + journal_last_seq(j), + j->last_seq_ondisk, + journal_state_count(*s, s->idx), + s->cur_entry_offset, + j->cur_entry_u64s, + s->prev_buf_unwritten, + test_bit(JOURNAL_NEED_WRITE, &j->flags), + journal_entry_is_open(j), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, &c->rw_devs[BCH_DATA_JOURNAL]) { @@ -1067,50 +1067,46 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) if (!ja->nr) continue; - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", - iter, ja->nr, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); + pr_buf(&out, + "dev %u:\n" + "\tnr\t\t%u\n" + "\tcur_idx\t\t%u (seq %llu)\n" + "\tlast_idx\t%u (seq %llu)\n", + iter, ja->nr, + ja->cur_idx, ja->bucket_seq[ja->cur_idx], + ja->last_idx, ja->bucket_seq[ja->last_idx]); } spin_unlock(&j->lock); rcu_read_unlock(); - return ret; + return out.pos - buf; } ssize_t bch2_journal_print_pins(struct journal *j, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - ssize_t ret = 0; u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%llu: count %u\n", - i, atomic_read(&pin_list->count)); + pr_buf(&out, "%llu: count %u\n", + i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); + pr_buf(&out, "\t%p %pf\n", + pin, pin->flush); if 
(!list_empty(&pin_list->flushed)) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "flushed:\n"); + pr_buf(&out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); + pr_buf(&out, "\t%p %pf\n", + pin, pin->flush); } spin_unlock(&j->lock); - return ret; + return out.pos - buf; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index eb2fbe235483..4555d55b23dd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -146,7 +146,6 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, { void *next = vstruct_next(entry); const char *invalid; - char buf[160]; int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, @@ -179,8 +178,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); if (invalid) { - bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, key_type, + bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", type, invalid, buf); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 9351caeb6630..c12af1a86f0b 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -145,7 +145,7 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ - .print = _fn##_print + .to_text = _fn##_to_text #define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ [Opt_##_name] = { \ @@ -235,38 +235,38 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, return 0; } -int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len, - const struct bch_option *opt, u64 v, - unsigned flags) +void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_option *opt, u64 v, + unsigned flags) { - char *out = buf, *end = buf + len; - if (flags & OPT_SHOW_MOUNT_STYLE) { - if (opt->type == BCH_OPT_BOOL) - return scnprintf(out, end - out, "%s%s", - v ? "" : "no", - opt->attr.name); + if (opt->type == BCH_OPT_BOOL) { + pr_buf(out, "%s%s", + v ? "" : "no", + opt->attr.name); + return; + } - out += scnprintf(out, end - out, "%s=", opt->attr.name); + pr_buf(out, "%s=", opt->attr.name); } switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: - out += scnprintf(out, end - out, "%lli", v); + pr_buf(out, "%lli", v); break; case BCH_OPT_STR: - out += (flags & OPT_SHOW_FULL_LIST) - ? 
bch2_scnprint_string_list(out, end - out, opt->choices, v) - : scnprintf(out, end - out, opt->choices[v]); + if (flags & OPT_SHOW_FULL_LIST) + bch2_string_opt_to_text(out, opt->choices, v); + else + pr_buf(out, opt->choices[v]); break; case BCH_OPT_FN: - return opt->print(c, out, end - out, v); + opt->to_text(out, c, v); + break; default: BUG(); } - - return out - buf; } int bch2_parse_mount_opts(struct bch_opts *opts, char *options) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 52fb9781d933..47617cd011ff 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -229,6 +229,7 @@ enum bch_opt_id { }; struct bch_fs; +struct printbuf; struct bch_option { struct attribute attr; @@ -245,7 +246,7 @@ struct bch_option { }; struct { int (*parse)(struct bch_fs *, const char *, u64 *); - int (*print)(struct bch_fs *, char *, size_t, u64); + void (*to_text)(struct printbuf *, struct bch_fs *, u64); }; }; @@ -265,8 +266,8 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) -int bch2_opt_to_text(struct bch_fs *, char *, size_t, - const struct bch_option *, u64, unsigned); +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, + const struct bch_option *, u64, unsigned); int bch2_parse_mount_opts(struct bch_opts *, char *); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 79a7f82868d6..8127f4454dac 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -46,10 +46,9 @@ static const char * const bch2_quota_counters[] = { "inodes", }; -int bch2_quota_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; struct bkey_s_c_quota dq; unsigned i; @@ -58,14 +57,12 @@ int bch2_quota_to_text(struct bch_fs *c, char *buf, dq = bkey_s_c_to_quota(k); for (i = 0; i < Q_COUNTERS; i++) - out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); + pr_buf(out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); break; } - - return out - buf; } #ifdef CONFIG_BCACHEFS_QUOTA diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 9650e518cd64..9c06eb07bccb 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,7 +8,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_quota_ops (struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 570dbae5a240..5d246c5b8186 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -252,49 +252,43 @@ static int bch2_rebalance_thread(void *arg) ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) { - char *out = buf, *end = out + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; bch2_hprint(h1, w.dev_most_full_work << 9); bch2_hprint(h2, w.dev_most_full_capacity << 9); - out += scnprintf(out, end - out, - "fullest_dev (%i):\t%s/%s\n", - 
w.dev_most_full_idx, h1, h2); + pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", + w.dev_most_full_idx, h1, h2); bch2_hprint(h1, w.total_work << 9); bch2_hprint(h2, c->capacity << 9); - out += scnprintf(out, end - out, - "total work:\t\t%s/%s\n", - h1, h2); + pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); - out += scnprintf(out, end - out, - "rate:\t\t\t%u\n", - r->pd.rate.rate); + pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); switch (r->state) { case REBALANCE_WAITING: - out += scnprintf(out, end - out, "waiting\n"); + pr_buf(&out, "waiting\n"); break; case REBALANCE_THROTTLED: bch2_hprint(h1, (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); - out += scnprintf(out, end - out, - "throttled for %lu sec or %s io\n", - (r->throttled_until_cputime - jiffies) / HZ, - h1); + pr_buf(&out, "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); break; case REBALANCE_RUNNING: - out += scnprintf(out, end - out, "running\n"); - out += scnprintf(out, end - out, "pos %llu:%llu\n", - r->move_stats.iter.pos.inode, - r->move_stats.iter.pos.offset); + pr_buf(&out, "running\n"); + pr_buf(&out, "pos %llu:%llu\n", + r->move_stats.iter.pos.inode, + r->move_stats.iter.pos.offset); break; } - return out - buf; + return out.pos - buf; } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index a7c3aca1bf01..fb11b97cdeee 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -40,38 +40,31 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -static int replicas_entry_to_text(struct bch_replicas_entry *e, - char *buf, size_t size) +static void replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) { - char *out = buf, *end = out + size; unsigned i; - out += scnprintf(out, end - out, "%u: [", e->data_type); + pr_buf(out, "%u: [", e->data_type); for (i = 0; i < e->nr_devs; i++) - out += scnprintf(out, end - out, - i ? " %u" : "%u", e->devs[i]); - out += scnprintf(out, end - out, "]"); - - return out - buf; + pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); + pr_buf(out, "]"); } -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, - char *buf, size_t size) +void bch2_cpu_replicas_to_text(struct printbuf *out, + struct bch_replicas_cpu *r) { - char *out = buf, *end = out + size; struct bch_replicas_entry *e; bool first = true; for_each_cpu_replicas_entry(r, e) { if (!first) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); first = false; - out += replicas_entry_to_text(e, out, end - out); + replicas_entry_to_text(out, e); } - - return out - buf; } static void extent_to_replicas(struct bkey_s_c k, @@ -510,32 +503,28 @@ err: return err; } -const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_validate_replicas, -}; - -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size) +static void bch2_sb_replicas_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) { - char *out = buf, *end = out + size; + struct bch_sb_field_replicas *r = field_to_type(f, replicas); struct bch_replicas_entry *e; bool first = true; - if (!r) { - out += scnprintf(out, end - out, "(no replicas section found)"); - return out - buf; - } - for_each_replicas_entry(r, e) { if (!first) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); first = false; - out += replicas_entry_to_text(e, out, end - out); + replicas_entry_to_text(out, e); } - - return out - buf; } +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_validate_replicas, + .to_text = bch2_sb_replicas_to_text, +}; + /* Query replicas: */ bool bch2_replicas_marked(struct bch_fs *c, diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index ebbb1334cc2c..d3d81a1a39cd 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -13,8 +13,7 @@ int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type, struct bkey_s_c); -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); +void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); struct replicas_status { struct { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 58c35d9665eb..0c2b20c9e8c4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -951,21 +951,20 @@ static const char *bch2_sb_field_validate(struct bch_sb *sb, : NULL; } -size_t bch2_sb_field_to_text(char *buf, size_t size, - struct bch_sb *sb, struct bch_sb_field *f) +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); - size_t (*to_text)(char *, size_t, struct bch_sb *, - struct bch_sb_field *) = - type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type]->to_text - : NULL; + const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR + ? 
bch2_sb_field_ops[type] : NULL; - if (!to_text) { - if (size) - buf[0] = '\0'; - return 0; - } + if (ops) + pr_buf(out, "%s", bch2_sb_fields[type]); + else + pr_buf(out, "(unknown field %u)", type); + + pr_buf(out, " (size %llu):", vstruct_bytes(f)); - return to_text(buf, size, sb, f); + if (ops && ops->to_text) + bch2_sb_field_ops[type]->to_text(out, sb, f); } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 1ea91f71f3b0..ceef650d55dd 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -38,7 +38,7 @@ extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { const char * (*validate)(struct bch_sb *, struct bch_sb_field *); - size_t (*to_text)(char *, size_t, struct bch_sb *, + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; @@ -136,7 +136,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) void bch2_fs_mark_clean(struct bch_fs *, bool); -size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, - struct bch_sb_field *); +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 54d23cf46f95..a22beff7cc96 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1236,10 +1236,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) data = bch2_dev_has_data(c, ca); if (data) { char data_has_str[100]; - bch2_scnprint_flag_list(data_has_str, - sizeof(data_has_str), - bch2_data_types, - data); + + bch2_string_opt_to_text(&PBUF(data_has_str), + bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ret = -EBUSY; goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ee91bcc6433c..4ca84de6ab0e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -230,42 +230,34 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_usage stats = bch2_fs_usage_read(c); unsigned replicas, type; - out += scnprintf(out, end - out, - "capacity:\t\t%llu\n", - c->capacity); + pr_buf(&out, "capacity:\t\t%llu\n", c->capacity); for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) { - out += scnprintf(out, end - out, - "%u replicas:\n", - replicas + 1); + pr_buf(&out, "%u replicas:\n", replicas + 1); for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) - out += scnprintf(out, end - out, - "\t%s:\t\t%llu\n", - bch2_data_types[type], - stats.replicas[replicas].data[type]); - out += scnprintf(out, end - out, - "\treserved:\t%llu\n", - stats.replicas[replicas].persistent_reserved); + pr_buf(&out, "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.replicas[replicas].data[type]); + pr_buf(&out, "\treserved:\t%llu\n", + stats.replicas[replicas].persistent_reserved); } - out += scnprintf(out, end - out, "bucket usage\n"); + pr_buf(&out, "bucket usage\n"); for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) - out += scnprintf(out, end - out, - "\t%s:\t\t%llu\n", - bch2_data_types[type], - stats.buckets[type]); + pr_buf(&out, "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.buckets[type]); - out += scnprintf(out, end - out, - "online reserved:\t%llu\n", - stats.online_reserved); + pr_buf(&out, "online reserved:\t%llu\n", + stats.online_reserved); - return out - buf; + return out.pos - buf; } static ssize_t 
bch2_compression_stats(struct bch_fs *c, char *buf) @@ -559,16 +551,16 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST); - out += scnprintf(out, end - out, "\n"); + bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); + pr_buf(&out, "\n"); - return out - buf; + return out.pos - buf; } STORE(bch2_fs_opts_dir) @@ -742,25 +734,23 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); enum alloc_reserve i; - ssize_t ret; spin_lock(&ca->freelist_lock); - ret = scnprintf(buf, PAGE_SIZE, - "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); + pr_buf(&out, "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), + ca->free_inc.size); for (i = 0; i < RESERVE_NR; i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); + pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, + fifo_used(&ca->free[i]), + ca->free[i].size); spin_unlock(&ca->freelist_lock); - return ret; + return out.pos - buf; } static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) @@ -825,11 +815,11 @@ static const char * const bch2_rw[] = { static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); int rw, i, cpu; for (rw = 0; rw < 2; rw++) { - out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]); + pr_buf(&out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) { u64 n = 0; @@ -837,19 +827,19 @@ static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) for_each_possible_cpu(cpu) n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i]; - out += scnprintf(out, end - out, "%-12s:%12llu\n", - bch2_data_types[i], n << 9); + pr_buf(&out, "%-12s:%12llu\n", + bch2_data_types[i], n << 9); } } - return out - buf; + return out.pos - buf; } SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -863,41 +853,39 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - out += bch2_disk_path_print(&c->disk_sb, out, end - out, - ca->mi.group - 1); + bch2_disk_path_to_text(&out, &c->disk_sb, + ca->mi.group - 1); mutex_unlock(&c->sb_lock); } else { - out += scnprintf(out, end - out, "none"); + pr_buf(&out, "none"); } - out += scnprintf(out, end - out, "\n"); - return out - buf; + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_has_data) { - out += bch2_scnprint_flag_list(out, end - out, - bch2_data_types, - bch2_dev_has_data(c, ca)); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_flags_to_text(&out, bch2_data_types, + bch2_dev_has_data(c, ca)); + pr_buf(&out, "\n"); + return out.pos - buf; } sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); if (attr == &sysfs_cache_replacement_policy) { - out += bch2_scnprint_string_list(out, end - out, - bch2_cache_replacement_policies, - 
ca->mi.replacement); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_string_opt_to_text(&out, + bch2_cache_replacement_policies, + ca->mi.replacement); + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_state_rw) { - out += bch2_scnprint_string_list(out, end - out, - bch2_dev_state, - ca->mi.state); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_string_opt_to_text(&out, bch2_dev_state, + ca->mi.state); + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_iodone) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index ed90bd3a5d18..bb6b4383d33f 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -124,47 +124,31 @@ ssize_t bch2_hprint(char *buf, s64 v) return sprintf(buf, "%lli%s%c", v, dec, si_units[u]); } -ssize_t bch2_scnprint_string_list(char *buf, size_t size, - const char * const list[], - size_t selected) +void bch2_string_opt_to_text(struct printbuf *out, + const char * const list[], + size_t selected) { - char *out = buf; size_t i; - if (size) - *out = '\0'; - for (i = 0; list[i]; i++) - out += scnprintf(out, buf + size - out, - i == selected ? "[%s] " : "%s ", list[i]); - - if (out != buf) - *--out = '\0'; - - return out - buf; + pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); } -ssize_t bch2_scnprint_flag_list(char *buf, size_t size, - const char * const list[], u64 flags) +void bch2_flags_to_text(struct printbuf *out, + const char * const list[], u64 flags) { - char *out = buf, *end = buf + size; unsigned bit, nr = 0; + if (out->pos != out->end) + *out->pos = '\0'; + while (list[nr]) nr++; - if (size) - *out = '\0'; - while (flags && (bit = __ffs(flags)) < nr) { - out += scnprintf(out, end - out, "%s,", list[bit]); + pr_buf(out, "%s,", list[bit]); flags ^= 1 << bit; } - - if (out != buf) - *--out = '\0'; - - return out - buf; } u64 bch2_read_flag_list(char *opt, const char * const list[]) @@ -329,50 +313,50 @@ static const struct time_unit *pick_time_units(u64 ns) return u; } -static size_t pr_time_units(char *buf, size_t len, u64 ns) +static void pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); - return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name); + pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } size_t bch2_time_stats_print(struct bch2_time_stats *stats, char *buf, size_t len) { - char *out = buf, *end = buf + len; + struct printbuf out = _PBUF(buf, len); const struct time_unit *u; u64 freq = READ_ONCE(stats->average_frequency); u64 q, last_q = 0; int i; - out += scnprintf(out, end - out, "count:\t\t%llu\n", + pr_buf(&out, "count:\t\t%llu\n", stats->count); - out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n", - freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + pr_buf(&out, "rate:\t\t%llu/sec\n", + freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); - out += scnprintf(out, end - out, "frequency:\t"); - out += pr_time_units(out, end - out, freq); + pr_buf(&out, "frequency:\t"); + pr_time_units(&out, freq); - out += scnprintf(out, end - out, "\navg duration:\t"); - out += pr_time_units(out, end - out, stats->average_duration); + pr_buf(&out, "\navg duration:\t"); + pr_time_units(&out, stats->average_duration); - out += scnprintf(out, end - out, "\nmax duration:\t"); - out += pr_time_units(out, end - out, stats->max_duration); + pr_buf(&out, "\nmax duration:\t"); + pr_time_units(&out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name); + pr_buf(&out, "\nquantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - out += scnprintf(out, end - out, "%llu%s", - div_u64(q, u->nsecs), - is_last ? "\n" : " "); + pr_buf(&out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? "\n" : " "); last_q = q; } - return out - buf; + return out.pos - buf; } void bch2_time_stats_exit(struct bch2_time_stats *stats) @@ -615,18 +599,17 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len) +void bch_scnmemcpy(struct printbuf *out, + const char *src, size_t len) { - size_t n; - - if (!size) - return 0; + size_t n = printbuf_remaining(out); - n = min(size - 1, len); - memcpy(buf, src, n); - buf[n] = '\0'; - - return n; + if (n) { + n = min(n - 1, len); + memcpy(out->pos, src, n); + out->pos += n; + *out->pos = '\0'; + } } #include "eytzinger.h" diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index cb6bed68abf8..47afd3955c7a 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -235,6 +235,32 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +struct printbuf { + char *pos; + char *end; +}; + +static inline size_t printbuf_remaining(struct printbuf *buf) +{ + return buf->end - buf->pos; +} + +#define _PBUF(_buf, _len) \ + ((struct printbuf) { \ + .pos = _buf, \ + .end = _buf + _len, \ + }) + +#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) + +#define pr_buf(_out, ...) 
\ +do { \ + (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ + __VA_ARGS__); \ +} while (0) + +void bch_scnmemcpy(struct printbuf *, const char *, size_t); + int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); @@ -311,9 +337,10 @@ ssize_t bch2_hprint(char *buf, s64 v); bool bch2_is_zero(const void *, size_t); -ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t); +void bch2_string_opt_to_text(struct printbuf *, + const char * const [], size_t); -ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); +void bch2_flags_to_text(struct printbuf *, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); #define NR_QUANTILES 15 @@ -629,8 +656,6 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio, #define bio_for_each_contig_segment(bv, bio, iter) \ __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) -size_t bch_scnmemcpy(char *, size_t, const char *, size_t); - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 44bf4a2f3c84..7f6258e09a0d 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -111,10 +111,9 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -int bch2_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; @@ -124,26 +123,22 @@ int bch2_xattr_to_text(struct bch_fs *c, char *buf, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (handler && handler->prefix) - out += scnprintf(out, end - out, "%s", handler->prefix); + pr_buf(out, "%s", handler->prefix); else if (handler) - out += scnprintf(out, end - out, "(type %u)", - xattr.v->x_type); + pr_buf(out, "(type %u)", xattr.v->x_type); else - out += scnprintf(out, end - out, "(unknown type %u)", - xattr.v->x_type); - - out += bch_scnmemcpy(out, end - out, xattr.v->x_name, - xattr.v->x_name_len); - out += scnprintf(out, end - out, ":"); - out += bch_scnmemcpy(out, end - out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + pr_buf(out, "(unknown type %u)", xattr.v->x_type); + + bch_scnmemcpy(out, xattr.v->x_name, + xattr.v->x_name_len); + pr_buf(out, ":"); + bch_scnmemcpy(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); break; case BCH_XATTR_WHITEOUT: - out += scnprintf(out, end - out, "whiteout"); + pr_buf(out, "whiteout"); break; } - - return out - buf; } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, @@ -355,7 +350,7 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, struct bch_opts opts = bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; - int ret, id; + int id; u64 v; id = bch2_opt_lookup(name); @@ -369,9 +364,22 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, v = bch2_opt_get_by_id(&opts, id); - ret = bch2_opt_to_text(c, buffer, size, opt, v, 0); + if (!buffer) { + char buf[512]; + struct printbuf out = PBUF(buf); - return ret < size || !buffer ? 
ret : -ERANGE; + bch2_opt_to_text(&out, c, opt, v, 0); + + return out.pos - buf; + } else { + struct printbuf out = _PBUF(buffer, size); + + bch2_opt_to_text(&out, c, opt, v, 0); + + return printbuf_remaining(&out) + ? (void *) out.pos - buffer + : -ERANGE; + } } struct inode_opt_set { diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index b2fe1dc42b83..63be44b02a2b 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -int bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_xattr_ops (struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ -- cgit From c258f28ebab6be176f20173aac725092b39cbd2c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Nov 2018 18:30:55 -0500 Subject: bcachefs: Check for unsupported features Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/opts.c | 15 +++++++++++++++ fs/bcachefs/opts.h | 1 + fs/bcachefs/super-io.c | 4 ++++ fs/bcachefs/sysfs.c | 12 +++--------- fs/bcachefs/xattr.c | 10 +++------- 6 files changed, 28 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d74f1e5c21e0..eb14fcf15a96 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1234,7 +1234,8 @@ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, - BCH_FEATURE_ATOMIC_NLINK = 3, + BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ + BCH_FEATURE_NR, }; /* options: */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index c12af1a86f0b..74e92a196ccd 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -3,6 +3,7 @@ #include #include "bcachefs.h" +#include "compress.h" #include "disk_groups.h" #include "opts.h" #include "super-io.h" @@ -269,6 +270,20 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, } } +int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +{ + int ret = 0; + + switch (id) { + case Opt_compression: + case Opt_background_compression: + ret = bch2_check_set_has_compressed_data(c, v); + break; + } + + return ret; +} + int bch2_parse_mount_opts(struct bch_opts *opts, char *options) { char *opt, *name, *val; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 47617cd011ff..8f4fab7f7dc8 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -269,6 +269,7 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 void bch2_opt_to_text(struct printbuf *, struct bch_fs *, const struct bch_option *, u64, unsigned); +int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0c2b20c9e8c4..22e28d1eeadc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -230,6 +230,10 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) le16_to_cpu(sb->version) > BCH_SB_VERSION_MAX) return "Unsupported superblock version"; + if (sb->features[1] || + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) + return "Filesystem has incompatible features"; + if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) { SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7); SET_BCH_SB_POSIX_ACL(sb, 1); diff --git a/fs/bcachefs/sysfs.c 
b/fs/bcachefs/sysfs.c index 4ca84de6ab0e..4273aad16675 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -10,7 +10,6 @@ #include "bcachefs.h" #include "alloc_background.h" -#include "compress.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_io.h" @@ -581,14 +580,9 @@ STORE(bch2_fs_opts_dir) if (ret < 0) return ret; - if (id == Opt_compression || - id == Opt_background_compression) { - int ret = bch2_check_set_has_compressed_data(c, v); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } - } + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) + return ret; if (opt->set_sb != SET_NO_SB_OPT) { mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 7f6258e09a0d..ab358c434753 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -3,7 +3,6 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_update.h" -#include "compress.h" #include "extents.h" #include "fs.h" #include "rebalance.h" @@ -433,12 +432,9 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) return ret; - if (s.id == Opt_compression || - s.id == Opt_background_compression) { - ret = bch2_check_set_has_compressed_data(c, s.v); - if (ret) - return ret; - } + ret = bch2_opt_check_may_set(c, s.id, s.v); + if (ret < 0) + return ret; s.defined = true; } else { -- cgit From af9d3bc203c9ecb66f5ca344090b61722d9b755a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Oct 2018 14:32:47 -0400 Subject: bcachefs: stripe support for replicas tracking Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 21 ++++- fs/bcachefs/recovery.c | 3 +- fs/bcachefs/replicas.c | 205 +++++++++++++++++++++++++++++++++++------- fs/bcachefs/replicas.h | 10 ++- fs/bcachefs/super-io.c | 36 ++++++-- fs/bcachefs/super-io.h | 1 + fs/bcachefs/sysfs.c | 4 +- 7 files changed, 230 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index eb14fcf15a96..ecb7a97ee533 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -888,10 +888,11 @@ struct bch_sb_field { x(journal, 0) \ x(members, 1) \ x(crypt, 2) \ - x(replicas, 3) \ + x(replicas_v0, 3) \ x(quota, 4) \ x(disk_groups, 5) \ - x(clean, 6) + x(clean, 6) \ + x(replicas, 7) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1017,16 +1018,28 @@ enum bch_data_type { BCH_DATA_NR = 6, }; +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __attribute__((packed)); + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __attribute__((packed, aligned(8))); + struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; + __u8 nr_required; __u8 devs[]; -}; +} __attribute__((packed)); struct bch_sb_field_replicas { struct bch_sb_field field; struct bch_replicas_entry entries[]; -}; +} __attribute__((packed, aligned(8))); /* BCH_SB_FIELD_quota: */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 696e01f4962f..1ae8133a1ef7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -130,7 +130,8 @@ int bch2_fs_recovery(struct bch_fs *c) int ret; mutex_lock(&c->sb_lock); - if (!bch2_sb_get_replicas(c->disk_sb.sb)) { + if (!rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock))->nr) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 
fb11b97cdeee..ef62756e8908 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -45,7 +45,10 @@ static void replicas_entry_to_text(struct printbuf *out, { unsigned i; - pr_buf(out, "%u: [", e->data_type); + pr_buf(out, "%s: %u/%u [", + bch2_data_types[e->data_type], + e->nr_required, + e->nr_devs); for (i = 0; i < e->nr_devs; i++) pr_buf(out, i ? " %u" : "%u", e->devs[i]); @@ -75,6 +78,8 @@ static void extent_to_replicas(struct bkey_s_c k, const union bch_extent_entry *entry; struct extent_ptr_decoded p; + r->nr_required = 1; + extent_for_each_ptr_decode(e, p, entry) if (!p.ptr.cached) r->devs[r->nr_devs++] = p.ptr.dev; @@ -115,6 +120,7 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, e->data_type = data_type; e->nr_devs = 0; + e->nr_required = 1; for (i = 0; i < devs.nr; i++) e->devs[e->nr_devs++] = devs.devs[i]; @@ -359,14 +365,13 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) { struct bch_replicas_entry *e, *dst; struct bch_replicas_cpu *cpu_r; - unsigned nr = 0, entry_size = 0; + unsigned nr = 0, entry_size = 0, idx = 0; - if (sb_r) - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + nr * entry_size, GFP_NOIO); @@ -376,29 +381,71 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) cpu_r->nr = nr; cpu_r->entry_size = entry_size; - nr = 0; + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); + replicas_entry_sort(dst); + } - if (sb_r) - for_each_replicas_entry(sb_r, e) { - dst = cpu_replicas_entry(cpu_r, nr++); - memcpy(dst, e, replicas_entry_bytes(e)); - replicas_entry_sort(dst); - } + return cpu_r; +} + +static struct bch_replicas_cpu * +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) +{ + struct bch_replicas_entry_v0 *e; + struct bch_replicas_cpu *cpu_r; + unsigned nr = 0, entry_size = 0, idx = 0; + + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } + + entry_size += sizeof(struct bch_replicas_entry) - + sizeof(struct bch_replicas_entry_v0); + + cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!cpu_r) + return NULL; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + for_each_replicas_entry(sb_r, e) { + struct bch_replicas_entry *dst = + cpu_replicas_entry(cpu_r, idx++); + + dst->data_type = e->data_type; + dst->nr_devs = e->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, e->devs, e->nr_devs); + replicas_entry_sort(dst); + } - bch2_cpu_replicas_sort(cpu_r); return cpu_r; } int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { - struct bch_sb_field_replicas *sb_r; + struct bch_sb_field_replicas *sb_v1; + struct bch_sb_field_replicas_v0 *sb_v0; struct bch_replicas_cpu *cpu_r, *old_r; - sb_r = bch2_sb_get_replicas(c->disk_sb.sb); - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_v1); + else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) + cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0); + else + cpu_r = kzalloc(sizeof(struct bch_replicas_cpu), GFP_NOIO); + if (!cpu_r) return -ENOMEM; + bch2_cpu_replicas_sort(cpu_r); + old_r = 
rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); rcu_assign_pointer(c->replicas, cpu_r); if (old_r) @@ -407,23 +454,72 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) return 0; } +static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas_v0 *sb_r; + struct bch_replicas_entry_v0 *dst; + struct bch_replicas_entry *src; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src) - 1; + + sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -ENOSPC; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); + sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + memcpy(dst->devs, src->devs, src->nr_devs); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } + + return 0; +} + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; struct bch_replicas_entry *dst, *src; + bool need_v1 = false; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); - for_each_cpu_replicas_entry(r, src) + for_each_cpu_replicas_entry(r, src) { bytes += replicas_entry_bytes(src); + if (src->nr_required != 1) + need_v1 = true; + } + + if (!need_v1) + return bch2_cpu_replicas_to_sb_replicas_v0(c, r); sb_r = bch2_sb_resize_replicas(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -ENOSPC; + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); + sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); @@ -482,8 +578,10 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi if (!e->nr_devs) goto err; - err = "invalid replicas entry: too many devices"; - if (e->nr_devs >= BCH_REPLICAS_MAX) + err = "invalid replicas entry: bad nr_required"; + if (!e->nr_required || + (e->nr_required > 1 && + e->nr_required >= e->nr_devs)) goto err; err = "invalid replicas entry: invalid device"; @@ -525,6 +623,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { .to_text = bch2_sb_replicas_to_text, }; +static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_entry_v0 *e; + const char *err; + unsigned i; + + for_each_replicas_entry_v0(sb_r, e) { + err = "invalid replicas entry: invalid data type"; + if (e->data_type >= BCH_DATA_NR) + goto err; + + err = "invalid replicas entry: no devices"; + if (!e->nr_devs) + goto err; + + err = "invalid replicas entry: invalid device"; + for (i = 0; i < e->nr_devs; i++) + if (!bch2_dev_exists(sb, mi, e->devs[i])) + goto err; + } + + err = "cannot allocate memory"; + cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r); + if (!cpu_r) + goto err; + + err = check_dup_replicas_entries(cpu_r); +err: + kfree(cpu_r); + return err; +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { + .validate = bch2_sb_validate_replicas_v0, +}; + /* Query replicas: */ bool 
bch2_replicas_marked(struct bch_fs *c, @@ -591,7 +728,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, memset(&ret, 0, sizeof(ret)); for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].nr_online = UINT_MAX; + ret.replicas[i].redundancy = INT_MAX; mi = bch2_sb_get_members(c->disk_sb.sb); rcu_read_lock(); @@ -613,9 +750,9 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, nr_offline++; } - ret.replicas[e->data_type].nr_online = - min(ret.replicas[e->data_type].nr_online, - nr_online); + ret.replicas[e->data_type].redundancy = + min(ret.replicas[e->data_type].redundancy, + (int) nr_online - (int) e->nr_required); ret.replicas[e->data_type].nr_offline = max(ret.replicas[e->data_type].nr_offline, @@ -624,6 +761,10 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) + if (ret.replicas[i].redundancy == INT_MAX) + ret.replicas[i].redundancy = 0; + return ret; } @@ -638,7 +779,7 @@ static bool have_enough_devs(struct replicas_status s, bool force_if_lost) { return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].nr_online || force_if_lost); + (s.replicas[type].redundancy >= 0 || force_if_lost); } bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) @@ -654,14 +795,14 @@ bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) flags & BCH_FORCE_IF_DATA_LOST)); } -unsigned bch2_replicas_online(struct bch_fs *c, bool meta) +int bch2_replicas_online(struct bch_fs *c, bool meta) { struct replicas_status s = bch2_replicas_status(c); - return meta - ? min(s.replicas[BCH_DATA_JOURNAL].nr_online, - s.replicas[BCH_DATA_BTREE].nr_online) - : s.replicas[BCH_DATA_USER].nr_online; + return (meta + ? 
min(s.replicas[BCH_DATA_JOURNAL].redundancy, + s.replicas[BCH_DATA_BTREE].redundancy) + : s.replicas[BCH_DATA_USER].redundancy) + 1; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index d3d81a1a39cd..a343dd9cd97f 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -17,7 +17,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); struct replicas_status { struct { - unsigned nr_online; + int redundancy; unsigned nr_offline; } replicas[BCH_DATA_NR]; }; @@ -27,7 +27,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *, struct replicas_status bch2_replicas_status(struct bch_fs *); bool bch2_have_enough_devs(struct replicas_status, unsigned); -unsigned bch2_replicas_online(struct bch_fs *, bool); +int bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); @@ -46,8 +46,14 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ (_i) = replicas_entry_next(_i)) +#define for_each_replicas_entry_v0(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 22e28d1eeadc..071543033096 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -60,8 +60,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, void *src, *dst; src = vstruct_end(f); - f->u64s = cpu_to_le32(u64s); - dst = vstruct_end(f); + + if (u64s) { + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + } else { + dst = f; + } memmove(dst, src, vstruct_end(sb->sb) - src); @@ -71,7 +76,16 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, sb->sb->u64s = cpu_to_le32(sb_u64s); - return f; + return u64s ? f : NULL; +} + +void bch2_sb_field_delete(struct bch_sb_handle *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + + if (f) + __bch2_sb_field_resize(sb, f, 0); } /* Superblock realloc/free: */ @@ -174,7 +188,8 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, } f = __bch2_sb_field_resize(sb, f, u64s); - f->type = cpu_to_le32(type); + if (f) + f->type = cpu_to_le32(type); return f; } @@ -366,6 +381,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; struct bch_sb *dst = dst_handle->sb; + unsigned i; dst->version = src->version; dst->seq = src->seq; @@ -384,15 +400,17 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->features, src->features, sizeof(dst->features)); memcpy(dst->compat, src->compat, sizeof(dst->compat)); - vstruct_for_each(src, src_f) { - if (src_f->type == BCH_SB_FIELD_journal) + for (i = 0; i < BCH_SB_FIELD_NR; i++) { + if (i == BCH_SB_FIELD_journal) continue; - dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type)); + src_f = bch2_sb_field_get(src, i); + dst_f = bch2_sb_field_get(dst, i); dst_f = __bch2_sb_field_resize(dst_handle, dst_f, - le32_to_cpu(src_f->u64s)); + src_f ? 
le32_to_cpu(src_f->u64s) : 0); - memcpy(dst_f, src_f, vstruct_bytes(src_f)); + if (src_f) + memcpy(dst_f, src_f, vstruct_bytes(src_f)); } } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index ceef650d55dd..aa618fe9cd22 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -12,6 +12,7 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4273aad16675..6a5da0f12713 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -346,8 +346,8 @@ SHOW(bch2_fs) sysfs_print(promote_whole_extents, c->promote_whole_extents); - sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); - sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); + sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); + sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); /* Debugging: */ -- cgit From b35b1925832e6384fcb64c347a70ee205f6e89ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Nov 2018 02:31:48 -0500 Subject: bcachefs: Move key marking out of extents.c Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 + fs/bcachefs/buckets.c | 125 ++++++++++++++++++++++++++++++++-------- fs/bcachefs/buckets.h | 1 + fs/bcachefs/extents.c | 79 ++----------------------- 4 files changed, 110 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 104c0b91da75..44501e98a4ac 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -6,6 +6,7 @@ #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" +#include "buckets.h" #include "debug.h" #include "extents.h" #include "journal.h" @@ -204,6 +205,8 @@ btree_insert_key_leaf(struct btree_insert *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + bch2_mark_update(trans, insert); + ret = !btree_node_is_extents(b) ? bch2_insert_fixup_key(trans, insert) : bch2_insert_fixup_extent(trans, insert); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54eb1b6b820b..ea28788b26dd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -65,7 +65,9 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "bset.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" #include "error.h" #include "movinggc.h" @@ -346,7 +348,8 @@ void bch2_fs_usage_apply(struct bch_fs *c, * reservation: */ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); - if (WARN_ON(should_not_have_added > 0)) { + if (WARN_ONCE(should_not_have_added > 0, + "disk usage increased without a reservation")) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; } @@ -642,9 +645,6 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags) { - unsigned replicas = bch2_extent_nr_dirty_ptrs(k); - - BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas)); BUG_ON(!sectors); switch (k.k->type) { @@ -653,38 +653,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; + s64 cached_sectors = 0; + s64 dirty_sectors = 0; + unsigned replicas = 0; extent_for_each_ptr_decode(e, p, entry) { s64 disk_sectors = ptr_disk_sectors(e, p, sectors); - /* - * fs level usage (which determines free space) is in - * uncompressed sectors, until copygc + compression is - * sorted out: - * - * note also that we always update @fs_usage, even when - * we otherwise wouldn't do anything because gc is - * running - this is because the caller still needs to - * account w.r.t. its disk reservation. It is caller's - * responsibility to not apply @fs_usage if gc is in - * progress. - */ - stats->replicas - [!p.ptr.cached && replicas ? replicas - 1 : 0].data - [!p.ptr.cached ? data_type : BCH_DATA_CACHED] += - disk_sectors; - bch2_mark_pointer(c, e, p, disk_sectors, data_type, stats, journal_seq, flags); + + if (!p.ptr.cached) + replicas++; + + if (p.ptr.cached) + cached_sectors += disk_sectors; + else + dirty_sectors += disk_sectors; } + + replicas = clamp_t(unsigned, replicas, + 1, ARRAY_SIZE(stats->replicas)); + + stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; + stats->replicas[replicas - 1].data[data_type] += dirty_sectors; break; } - case BCH_RESERVATION: - if (replicas) - stats->replicas[replicas - 1].persistent_reserved += - sectors * replicas; + case BCH_RESERVATION: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, + 1, ARRAY_SIZE(stats->replicas)); + + stats->replicas[replicas - 1].persistent_reserved += sectors; break; } + } } void bch2_mark_key(struct bch_fs *c, @@ -748,6 +753,76 @@ void bch2_mark_key(struct bch_fs *c, percpu_up_read(&c->usage_lock); } +void bch2_mark_update(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bch_fs_usage stats = { 0 }; + struct gc_pos pos = gc_pos_btree_node(b); + struct bkey_packed *_k; + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_mark_key(c, btree_node_type(b), bkey_i_to_s_c(insert->k), + true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), + pos, &stats, trans->journal_res.seq, 0); + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_DISCARD))) { + struct bkey unpacked; + struct bkey_s_c k; + s64 sectors = 0; + + k = bkey_disassemble(b, _k, &unpacked); + + if (btree_node_is_extents(b) + ? 
bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k->k.p, k.k->p)) + break; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&insert->k->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) k.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(k.k) - + insert->k->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = k.k->p.offset - insert->k->k.p.offset; + BUG_ON(sectors <= 0); + + bch2_mark_key(c, btree_node_type(b), k, + true, sectors, + pos, &stats, trans->journal_res.seq, 0); + + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + } + + BUG_ON(sectors >= 0); + } + + bch2_mark_key(c, btree_node_type(b), k, + false, sectors, + pos, &stats, trans->journal_res.seq, 0); + + bch2_btree_node_iter_advance(&node_iter, b); + } + + bch2_fs_usage_apply(c, &stats, trans->disk_res, pos); +} + /* Disk reservations: */ static u64 __recalc_sectors_available(struct bch_fs *c) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c40ffe862a06..8fe6871ad165 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -213,6 +213,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); +void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); void bch2_recalc_sectors_available(struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a7223e7c8793..0cf343624793 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1009,7 +1009,6 @@ struct extent_insert_state { struct btree_insert *trans; struct btree_insert_entry *insert; struct bpos committed; - struct bch_fs_usage stats; /* for deleting: */ struct bkey_i whiteout; @@ -1018,54 +1017,6 @@ struct extent_insert_state { bool deleting; }; -static void bch2_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->l[0].b; - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); - - if (!sectors) - return; - - bch2_mark_key(c, BKEY_TYPE_EXTENTS, k, sectors > 0, sectors, - gc_pos_btree_node(b), &s->stats, - s->trans->journal_res.seq, 0); -} - -static void bch2_subtract_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - bch2_add_sectors(s, k, offset, -sectors); -} - -/* These wrappers subtract exactly the sectors that we're removing from @k */ -static void bch2_cut_subtract_back(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, where.offset, - k.k->p.offset - where.offset); - bch2_cut_back(where, k.k); -} - -static void bch2_cut_subtract_front(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k)); - __bch2_cut_front(where, k); -} - -static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) -{ - if (k.k->size) - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - k.k->size = 0; - k.k->type = KEY_TYPE_DELETED; -} - static bool bch2_extent_merge_inline(struct bch_fs *, struct btree_iter *, struct bkey_packed *, @@ -1166,11 +1117,7 @@ static void extent_insert_committed(struct 
extent_insert_state *s) if (s->deleting) split.k.k.type = KEY_TYPE_DISCARD; - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - else - bch2_cut_back(s->committed, &split.k.k); + bch2_cut_back(s->committed, &split.k.k); if (!bkey_cmp(s->committed, iter->pos)) return; @@ -1290,7 +1237,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); verify_modified_extent(iter, _k); @@ -1298,7 +1245,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ - bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); + bch2_cut_back(bkey_start_pos(&insert->k), k.k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); @@ -1318,7 +1265,8 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, if (!bkey_whiteout(k.k)) btree_account_key_drop(l->b, _k); - bch2_drop_subtract(s, k); + k.k->size = 0; + k.k->type = KEY_TYPE_DELETED; if (_k >= btree_bset_last(l->b)->start) { unsigned u64s = _k->u64s; @@ -1358,14 +1306,11 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); verify_modified_extent(iter, _k); - bch2_add_sectors(s, bkey_i_to_s_c(&split.k), - bkey_start_offset(&split.k.k), - split.k.k.size); extent_bset_insert(c, iter, &split.k); break; } @@ -1414,8 +1359,6 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { if (!bkey_whiteout(k.k)) { btree_account_key_drop(l->b, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); _k->type = KEY_TYPE_DISCARD; reserve_whiteout(l->b, _k); } @@ -1505,7 +1448,6 @@ enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_insert_entry *insert) { - struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct extent_insert_state s = { @@ -1530,19 +1472,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans, */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - __bch2_insert_fixup_extent(&s); extent_insert_committed(&s); - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, - gc_pos_btree_node(b)); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); -- cgit From 1d25849c2c2d552b09494b984da915be4a703a18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Nov 2018 17:48:32 -0500 Subject: bcachefs: Centralize marking of replicas in btree update path Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_leaf.c | 68 +++++++++++++++++++++++------------------ fs/bcachefs/extents.c | 6 ++-- fs/bcachefs/io.c | 8 ----- fs/bcachefs/io.h | 3 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/migrate.c | 5 --- fs/bcachefs/move.c | 8 +---- 
fs/bcachefs/replicas.c | 67 ++++++++++++++++++++-------------------- fs/bcachefs/replicas.h | 4 +-- 11 files changed, 84 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0fb89e03fac8..b0d04ed5f2a6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -155,7 +155,7 @@ static int bch2_btree_mark_ptrs_initial(struct bch_fs *c, enum bkey_type type, k.k->version.lo > journal_cur_seq(&c->journal)); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, type, k), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, type, k, false), c, "superblock not marked as containing replicas (type %u)", data_type)) { ret = bch2_mark_bkey_replicas(c, type, k); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 467c619f7f6d..7e9ba60288aa 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -440,11 +440,11 @@ enum btree_insert_ret { BTREE_INSERT_OK, /* extent spanned multiple leaf nodes: have to traverse to next node: */ BTREE_INSERT_NEED_TRAVERSE, - /* write lock held for too long */ /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_GC_LOCK, + BTREE_INSERT_NEED_MARK_REPLICAS, }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 44501e98a4ac..093e480977c7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -12,6 +12,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "replicas.h" #include "trace.h" #include @@ -301,8 +302,8 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, static enum btree_insert_ret btree_key_can_insert(struct btree_insert *trans, - struct btree_insert_entry *insert, - unsigned *u64s) + struct btree_insert_entry *insert, + unsigned *u64s) { struct bch_fs *c = trans->c; struct btree *b = insert->iter->l[0].b; @@ -311,6 +312,12 @@ btree_key_can_insert(struct btree_insert *trans, if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; + if (!bch2_bkey_replicas_marked(c, + insert->iter->btree_id, + bkey_i_to_s_c(insert->k), + true)) + return BTREE_INSERT_NEED_MARK_REPLICAS; + ret = !btree_node_is_extents(b) ? 
BTREE_INSERT_OK : bch2_extent_can_insert(trans, insert, u64s); @@ -327,8 +334,7 @@ btree_key_can_insert(struct btree_insert *trans, * Get journal reservation, take write locks, and attempt to do btree update(s): */ static inline int do_btree_insert_at(struct btree_insert *trans, - struct btree_iter **split, - bool *cycle_gc_lock) + struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -372,22 +378,10 @@ static inline int do_btree_insert_at(struct btree_insert *trans, u64s = 0; u64s += i->k->k.u64s; - switch (btree_key_can_insert(trans, i, &u64s)) { - case BTREE_INSERT_OK: - break; - case BTREE_INSERT_BTREE_NODE_FULL: - ret = -EINTR; - *split = i->iter; - goto out; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; goto out; - case BTREE_INSERT_NEED_GC_LOCK: - ret = -EINTR; - *cycle_gc_lock = true; - goto out; - default: - BUG(); } } @@ -445,8 +439,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *linked, *split = NULL; - bool cycle_gc_lock = false; + struct btree_iter *linked; unsigned flags; int ret; @@ -466,9 +459,6 @@ int __bch2_btree_insert_at(struct btree_insert *trans) if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; retry: - split = NULL; - cycle_gc_lock = false; - trans_for_each_entry(trans, i) { unsigned old_locks_want = i->iter->locks_want; unsigned old_uptodate = i->iter->uptodate; @@ -486,7 +476,7 @@ retry: } } - ret = do_btree_insert_at(trans, &split, &cycle_gc_lock); + ret = do_btree_insert_at(trans, &i); if (unlikely(ret)) goto err; @@ -521,8 +511,9 @@ err: if (!trans->did_work) flags &= ~BTREE_INSERT_NOUNLOCK; - if (split) { - ret = bch2_btree_split_leaf(c, split, flags); + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: + ret = bch2_btree_split_leaf(c, i->iter, flags); /* * if the split succeeded without dropping locks the insert will @@ -547,9 +538,10 @@ err: trans_restart(" (split)"); ret = -EINTR; } - } + break; + case BTREE_INSERT_NEED_GC_LOCK: + ret = -EINTR; - if (cycle_gc_lock) { if (!down_read_trylock(&c->gc_lock)) { if (flags & BTREE_INSERT_NOUNLOCK) goto out; @@ -558,6 +550,24 @@ err: down_read(&c->gc_lock); } up_read(&c->gc_lock); + break; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + if (flags & BTREE_INSERT_NOUNLOCK) { + ret = -EINTR; + goto out; + } + + bch2_btree_iter_unlock(trans->entries[0].iter); + ret = bch2_mark_bkey_replicas(c, i->iter->btree_id, + bkey_i_to_s_c(i->k)) + ?: -EINTR; + break; + default: + BUG_ON(ret >= 0); + break; } if (ret == -EINTR) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0cf343624793..df0ca1fcf2e8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -675,7 +675,8 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, } if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { + !bch2_bkey_replicas_marked(c, btree_node_type(b), + e.s_c, false)) { bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k); bch2_fs_bug(c, "btree key bad (replicas not marked in superblock):\n%s", @@ -1635,7 +1636,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, btree_node_type(b), e.s_c)) { + !bch2_bkey_replicas_marked(c, btree_node_type(b), + e.s_c, 
false)) { bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), e.s_c); bch2_fs_bug(c, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 133b702299dd..fbd0a82fdeac 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -23,7 +23,6 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" -#include "replicas.h" #include "super.h" #include "super-io.h" #include "trace.h" @@ -336,13 +335,6 @@ static void __bch2_write_index(struct bch_write_op *op) goto err; } - if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, - e.s_c); - if (ret) - goto err; - } - dst = bkey_next(dst); } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 8a7f246e8823..84070b674187 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -35,10 +35,9 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), - BCH_WRITE_NOMARK_REPLICAS = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4555d55b23dd..b1f6433cf9e9 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -785,7 +785,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs), c, + i->devs, false), c, "superblock not marked as containing replicas (type %u)", BCH_DATA_JOURNAL))) { ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 38b392472521..46878590327d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -72,11 +72,6 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, e.s); - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, - bkey_i_to_s_c(&tmp.key)); - if (ret) - break; - iter.pos = bkey_start_pos(&tmp.key.k); ret = bch2_btree_insert_at(c, NULL, NULL, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1f6bad1ae388..7de3c6c475be 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -150,11 +150,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, - extent_i_to_s_c(insert).s_c); - if (ret) - break; - ret = bch2_btree_insert_at(c, &op->res, op_journal_seq(op), BTREE_INSERT_ATOMIC| @@ -239,8 +234,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_NOMARK_REPLICAS; + BCH_WRITE_DATA_ENCODED; m->op.nr_replicas = 1; m->op.nr_replicas_required = 1; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index ef62756e8908..83fc9c93d295 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -160,8 +160,8 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, return new; } -static bool replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) +static bool __replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) { return replicas_entry_bytes(search) <= r->entry_size && eytzinger0_find(r->entries, r->nr, @@ -169,6 +169,24 @@ static bool replicas_has_entry(struct bch_replicas_cpu *r, memcmp, search) < r->nr; } +static 
bool replicas_has_entry(struct bch_fs *c, + struct bch_replicas_entry *search, + bool check_gc_replicas) +{ + struct bch_replicas_cpu *r, *gc_r; + bool marked; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + marked = __replicas_has_entry(r, search) && + (!check_gc_replicas || + likely(!(gc_r = rcu_dereference(c->replicas_gc))) || + __replicas_has_entry(gc_r, search)); + rcu_read_unlock(); + + return marked; +} + noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) @@ -180,7 +198,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, old_gc = rcu_dereference_protected(c->replicas_gc, lockdep_is_held(&c->sb_lock)); - if (old_gc && !replicas_has_entry(old_gc, new_entry)) { + if (old_gc && !__replicas_has_entry(old_gc, new_entry)) { new_gc = cpu_replicas_add_entry(old_gc, new_entry); if (!new_gc) goto err; @@ -188,7 +206,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, old_r = rcu_dereference_protected(c->replicas, lockdep_is_held(&c->sb_lock)); - if (!replicas_has_entry(old_r, new_entry)) { + if (!__replicas_has_entry(old_r, new_entry)) { new_r = cpu_replicas_add_entry(old_r, new_entry); if (!new_r) goto err; @@ -227,17 +245,8 @@ err: static int __bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *devs) { - struct bch_replicas_cpu *r, *gc_r; - bool marked; - - rcu_read_lock(); - r = rcu_dereference(c->replicas); - gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, devs) && - (!likely(gc_r) || replicas_has_entry(gc_r, devs)); - rcu_read_unlock(); - - return likely(marked) ? 0 + return likely(replicas_has_entry(c, devs, true)) + ? 0 : bch2_mark_replicas_slowpath(c, devs); } @@ -666,10 +675,10 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_replicas_marked(struct bch_fs *c, enum bch_data_type data_type, - struct bch_devs_list devs) + struct bch_devs_list devs, + bool check_gc_replicas) { struct bch_replicas_entry_padded search; - bool ret; if (!devs.nr) return true; @@ -678,19 +687,15 @@ bool bch2_replicas_marked(struct bch_fs *c, devlist_to_replicas(devs, data_type, &search.e); - rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); - rcu_read_unlock(); - - return ret; + return replicas_has_entry(c, &search.e, check_gc_replicas); } bool bch2_bkey_replicas_marked(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) + struct bkey_s_c k, + bool check_gc_replicas) { struct bch_replicas_entry_padded search; - bool ret; memset(&search, 0, sizeof(search)); @@ -700,20 +705,16 @@ bool bch2_bkey_replicas_marked(struct bch_fs *c, for (i = 0; i < cached.nr; i++) if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]))) + bch2_dev_list_single(cached.devs[i]), + check_gc_replicas)) return false; } bkey_to_replicas(type, k, &search.e); - if (!search.e.nr_devs) - return true; - - rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e); - rcu_read_unlock(); - - return ret; + return search.e.nr_devs + ? 
replicas_has_entry(c, &search.e, check_gc_replicas) + : true; } struct replicas_status __bch2_replicas_status(struct bch_fs *c, diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index a343dd9cd97f..e22d2d7cd08a 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -5,9 +5,9 @@ #include "replicas_types.h" bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); + struct bch_devs_list, bool); bool bch2_bkey_replicas_marked(struct bch_fs *, enum bkey_type, - struct bkey_s_c); + struct bkey_s_c, bool); int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, struct bch_devs_list); int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type, -- cgit From 91f8b5677b5d831cff34b25ef03322ae49e03256 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Nov 2018 17:26:36 -0500 Subject: bcachefs: More btree gc refactorings more prep work for erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 174 +++++++++++++++++++++++++++---------------------- fs/bcachefs/btree_gc.h | 1 - 2 files changed, 97 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b0d04ed5f2a6..92b82eaee69d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -122,86 +122,105 @@ static bool bkey_type_needs_gc(enum bkey_type type) } } -u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) +static void ptr_gen_recalc_oldest(struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u8 *max_stale) { - const struct bch_extent_ptr *ptr; - u8 max_stale = 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + if (gen_after(ca->oldest_gens[b], ptr->gen)) + ca->oldest_gens[b] = ptr->gen; - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); +} - if (gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; +static u8 ptr_gens_recalc_oldest(struct bch_fs *c, + enum bkey_type type, + struct bkey_s_c k) +{ + const struct bch_extent_ptr *ptr; + u8 max_stale = 0; - max_stale = max(max_stale, ptr_stale(ca, ptr)); + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) + ptr_gen_recalc_oldest(c, ptr, &max_stale); + break; } + } + break; + default: + break; } return max_stale; } -static int bch2_btree_mark_ptrs_initial(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static int ptr_gen_check(struct bch_fs *c, + enum bkey_type type, + const struct bch_extent_ptr *ptr) { - enum bch_data_type data_type = type == BKEY_TYPE_BTREE - ? 
BCH_DATA_BTREE : BCH_DATA_USER; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr); int ret = 0; - BUG_ON(journal_seq_verify(c) && - k.k->version.lo > journal_cur_seq(&c->journal)); + if (mustfix_fsck_err_on(!g->mark.gen_valid, c, + "found ptr with missing gen in alloc btree,\n" + "type %u gen %u", + type, ptr->gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + } - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, type, k, false), c, - "superblock not marked as containing replicas (type %u)", - data_type)) { - ret = bch2_mark_bkey_replicas(c, type, k); - if (ret) - return ret; + if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + "%u ptr gen in the future: %u > %u", + type, ptr->gen, g->mark.gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + set_bit(BCH_FS_FIXED_GENS, &c->flags); } +fsck_err: + return ret; +} - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = PTR_BUCKET(ca, ptr); - - if (mustfix_fsck_err_on(!g->mark.gen_valid, c, - "found ptr with missing gen in alloc btree,\n" - "type %s gen %u", - bch2_data_types[data_type], - ptr->gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); - } +static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + const struct bch_extent_ptr *ptr; + int ret = 0; - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%s ptr gen in the future: %u > %u", - bch2_data_types[data_type], - ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); - set_bit(BCH_FS_FIXED_GENS, &c->flags); - } + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + extent_for_each_ptr(e, ptr) { + ret = ptr_gen_check(c, type, ptr); + if (ret) + return ret; + + } + break; + } } break; - } + default: + break; } - if (k.k->version.lo > atomic64_read(&c->key_version)) - atomic64_set(&c->key_version, k.k->version.lo); -fsck_err: return ret; } @@ -218,31 +237,32 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); int ret = 0; - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - if (initial) { - ret = bch2_btree_mark_ptrs_initial(c, type, k); - if (ret < 0) + if (initial) { + BUG_ON(journal_seq_verify(c) && + k.k->version.lo > journal_cur_seq(&c->journal)); + + if (k.k->version.lo > atomic64_read(&c->key_version)) + atomic64_set(&c->key_version, k.k->version.lo); + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_bkey_replicas_marked(c, type, k, + false), c, + "superblock not marked as containing replicas (type %u)", + type)) { + ret = bch2_mark_bkey_replicas(c, type, k); + if (ret) return ret; } - break; - default: - break; - } - bch2_mark_key(c, type, k, true, k.k->size, - pos, NULL, 0, flags); - - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - ret = bch2_btree_key_recalc_oldest_gen(c, k); - break; - default: - break; + ret = ptr_gens_check(c, type, k); + if (ret) + return ret; } + bch2_mark_key(c, type, k, true, k.k->size, pos, NULL, 0, flags); + + ret = ptr_gens_recalc_oldest(c, type, k); +fsck_err: return ret; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 54c6bc845930..86b80e32e310 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -11,7 +11,6 @@ void bch2_gc(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); -u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* -- cgit From cd575ddf57af004913ff5a994aa5f3203216fa68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Nov 2018 15:13:19 -0400 Subject: bcachefs: Erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 + fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_background.c | 19 + fs/bcachefs/alloc_foreground.c | 353 ++++++++-- fs/bcachefs/alloc_foreground.h | 31 +- fs/bcachefs/alloc_types.h | 11 +- fs/bcachefs/bcachefs.h | 33 +- fs/bcachefs/bcachefs_format.h | 69 +- fs/bcachefs/bkey.h | 2 + fs/bcachefs/bkey_methods.c | 2 + fs/bcachefs/btree_gc.c | 45 +- fs/bcachefs/btree_gc.h | 13 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 142 +++- fs/bcachefs/buckets.h | 1 + fs/bcachefs/buckets_types.h | 5 +- fs/bcachefs/disk_groups.h | 13 + fs/bcachefs/ec.c | 1265 +++++++++++++++++++++++++++++++++++ fs/bcachefs/ec.h | 109 +++ fs/bcachefs/ec_types.h | 31 + fs/bcachefs/extents.c | 196 ++++-- fs/bcachefs/extents.h | 21 +- fs/bcachefs/extents_types.h | 4 + fs/bcachefs/io.c | 143 ++-- fs/bcachefs/journal_io.c | 5 +- fs/bcachefs/opts.c | 26 + fs/bcachefs/opts.h | 7 +- fs/bcachefs/recovery.c | 6 + fs/bcachefs/replicas.c | 34 +- fs/bcachefs/super-io.c | 1 + fs/bcachefs/super.c | 12 + fs/bcachefs/sysfs.c | 47 ++ 32 files changed, 2418 insertions(+), 233 deletions(-) create mode 100644 fs/bcachefs/ec.c create mode 100644 fs/bcachefs/ec.h create mode 100644 fs/bcachefs/ec_types.h (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index c13f2cfa6489..2f8300b60807 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -16,6 +16,8 @@ config BCACHEFS_FS select CRYPTO_CHACHA20 select CRYPTO_POLY1305 select KEYS + select RAID6_PQ + select XOR_BLOCKS help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. 
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 5318287c5ac4..b9521d772db1 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -22,6 +22,7 @@ bcachefs-y := \ debug.o \ dirent.o \ disk_groups.o \ + ec.o \ error.o \ extents.o \ fs.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 291d352ee370..b49d0cd84b78 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "journal_io.h" #include "trace.h" @@ -1113,6 +1114,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); + while (1) { + struct open_bucket *ob; + + spin_lock(&c->freelist_lock); + if (!ca->open_buckets_partial_nr) { + spin_unlock(&c->freelist_lock); + break; + } + ob = c->open_buckets + + ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + + bch2_open_bucket_put(c, ob); + } + + bch2_ec_stop_dev(c, ca); + /* * Wake up threads that were blocked on allocation, so they can notice * the device can no longer be removed and the capacity has changed: diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index df74e41ec890..6e5f6e57da56 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -62,6 +62,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "io.h" #include "trace.h" @@ -95,6 +96,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + if (ob->ec) { + bch2_ec_bucket_written(c, ob); + return; + } + percpu_down_read(&c->usage_lock); spin_lock(&ob->lock); @@ -114,6 +120,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->open_buckets_wait); } +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ptr.dev == dev && + ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) { struct open_bucket *ob; @@ -129,15 +148,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) } static void open_bucket_free_unused(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob) + struct open_bucket *ob, + bool may_realloc) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - if (wp->type == BCH_DATA_USER) { + if (ca->open_buckets_partial_nr < + ARRAY_SIZE(ca->open_buckets_partial) && + may_realloc) { spin_lock(&c->freelist_lock); ob->on_partial_list = true; ca->open_buckets_partial[ca->open_buckets_partial_nr++] = @@ -285,18 +306,18 @@ out: return ob; } -static int __dev_alloc_cmp(struct write_point *wp, - unsigned l, unsigned r) +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) { - return ((wp->next_alloc[l] > wp->next_alloc[r]) - - (wp->next_alloc[l] < wp->next_alloc[r])); + return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); } -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs 
*c, - struct write_point *wp, - struct bch_devs_mask *devs) +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) { struct dev_alloc_list ret = { .nr = 0 }; struct bch_dev *ca; @@ -305,14 +326,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, for_each_member_device_rcu(ca, c, i, devs) ret.devs[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); return ret; } -void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, + struct dev_stripe_state *stripe) { - u64 *v = wp->next_alloc + ca->dev_idx; + u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = dev_buckets_free(c, ca); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) @@ -324,26 +345,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, else *v = U64_MAX; - for (v = wp->next_alloc; - v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) *v = *v < scale ? 0 : *v - scale; } +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + static int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, - struct write_point *wp, + struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, + unsigned flags, struct closure *cl) { struct dev_alloc_list devs_sorted = - bch2_wp_alloc_list(c, wp, devs_may_alloc); + bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; bool alloc_failure = false; - unsigned i; + unsigned i, durability; BUG_ON(*nr_effective >= nr_replicas); @@ -354,13 +379,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, if (!ca) continue; - if (!ca->mi.durability && - (*have_cache || - wp->type != BCH_DATA_USER)) + if (!ca->mi.durability && *have_cache) continue; ob = bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, cl); + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { enum bucket_alloc_ret ret = -PTR_ERR(ob); @@ -375,13 +398,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, continue; } + durability = (flags & BUCKET_ALLOC_USE_DURABILITY) + ? ca->mi.durability : 1; + __clear_bit(ca->dev_idx, devs_may_alloc->d); - *nr_effective += ca->mi.durability; - *have_cache |= !ca->mi.durability; + *nr_effective += durability; + *have_cache |= !durability; ob_push(c, ptrs, ob); - bch2_wp_rescale(c, ca, wp); + bch2_dev_stripe_increment(c, ca, stripe); if (*nr_effective >= nr_replicas) return 0; @@ -390,15 +416,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, return alloc_failure ? 
-ENOSPC : -EROFS; } +/* Allocate from stripes: */ + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have = 0, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + BUG_ON(h->blocks.nr > nr_data); + BUG_ON(h->parity.nr > h->redundancy); + + devs = h->devs; + + open_bucket_for_each(c, &h->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->usage_lock); + rcu_read_lock(); + + if (h->parity.nr < h->redundancy) { + nr_have = h->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->blocks.nr < nr_data) { + nr_have = h->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + + return bch2_ec_stripe_new_alloc(c, h); +err: + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + return -1; +} + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static void bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache) +{ + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned i, ec_idx; + + if (!erasure_code) + return; + + if (nr_replicas < 2) + return; + + if (ec_open_bucket(c, ptrs)) + return; + + h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + if (!h) + return; + + if (!h->s && ec_stripe_alloc(c, h)) + goto out_put_head; + + rcu_read_lock(); + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + rcu_read_unlock(); + + for (i = 0; i < devs_sorted.nr; i++) + open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + if (ob->ptr.dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + goto out_put_head; +got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; + + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += ca->mi.durability; + *have_cache |= !ca->mi.durability; + + ob_push(c, ptrs, ob); + atomic_inc(&h->s->pin); +out_put_head: + bch2_ec_stripe_head_put(h); +} + /* Sector allocator */ -static int get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache) +static void get_buckets_from_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool need_ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -410,7 +571,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c, if 
(*nr_effective < nr_replicas && test_bit(ob->ptr.dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_USER && !*have_cache))) { + (wp->type == BCH_DATA_USER && !*have_cache)) && + (ob->ec || !need_ec)) { __clear_bit(ob->ptr.dev, devs_may_alloc->d); *nr_effective += ca->mi.durability; *have_cache |= !ca->mi.durability; @@ -421,8 +583,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c, } } wp->ptrs = ptrs_skip; - - return *nr_effective < nr_replicas ? -ENOSPC : 0; } static int open_bucket_add_buckets(struct bch_fs *c, @@ -430,22 +590,25 @@ static int open_bucket_add_buckets(struct bch_fs *c, struct write_point *wp, struct bch_devs_list *devs_have, u16 target, + unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, - struct closure *cl) + struct closure *_cl) { struct bch_devs_mask devs; - const struct bch_devs_mask *t; struct open_bucket *ob; - unsigned i; + struct closure *cl = NULL; + unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY; int ret; - percpu_down_read(&c->usage_lock); - rcu_read_lock(); + if (wp->type == BCH_DATA_USER) + flags |= BUCKET_MAY_ALLOC_PARTIAL; - devs = c->rw_devs[wp->type]; + rcu_read_lock(); + devs = target_rw_devs(c, wp->type, target); + rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) @@ -454,50 +617,83 @@ static int open_bucket_add_buckets(struct bch_fs *c, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->ptr.dev, devs.d); - t = bch2_target_to_mask(c, target); - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + if (erasure_code) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, true); + if (*nr_effective >= nr_replicas) + return 0; - ret = get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache); - if (!ret) - goto out; + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache); + if (*nr_effective >= nr_replicas) + return 0; + } + + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, false); + if (*nr_effective >= nr_replicas) + return 0; + + percpu_down_read(&c->usage_lock); + rcu_read_lock(); +retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, - reserve, NULL); - if (!ret || ret == -EROFS || !cl) - goto out; + reserve, flags, cl); + if (ret && ret != -EROFS && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } - ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache, - reserve, cl); -out: rcu_read_unlock(); percpu_up_read(&c->usage_lock); return ret; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + struct open_buckets *obs, + enum bch_data_type data_type) { struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; + struct open_bucket *ob, *ob2; + unsigned i, j; - mutex_lock(&wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (!ca || ob->ptr.dev == ca->dev_idx) - open_bucket_free_unused(c, wp, ob); + open_bucket_for_each(c, obs, ob, i) { + bool drop = !ca || ob->ptr.dev == ca->dev_idx; + + if (!drop && ob->ec) { + 
mutex_lock(&ob->ec->lock); + open_bucket_for_each(c, &ob->ec->blocks, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + open_bucket_for_each(c, &ob->ec->parity, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + mutex_unlock(&ob->ec->lock); + } + + if (drop) + bch2_open_bucket_put(c, ob); else ob_push(c, &ptrs, ob); + } - wp->ptrs = ptrs; + *obs = ptrs; +} + +void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + mutex_lock(&wp->lock); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); mutex_unlock(&wp->lock); } @@ -630,6 +826,7 @@ out: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned target, + unsigned erasure_code, struct write_point_specifier write_point, struct bch_devs_list *devs_have, unsigned nr_replicas, @@ -649,26 +846,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, BUG_ON(!nr_replicas || !nr_replicas_required); retry: write_points_nr = c->write_points_nr; + wp = writepoint_find(c, write_point.v); + /* metadata may not allocate on cache devices: */ + if (wp->type != BCH_DATA_USER) + have_cache = true; + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } else { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, NULL); if (!ret) goto alloc_done; - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + if (ret == -EROFS && nr_effective >= nr_replicas_required) ret = 0; @@ -678,7 +886,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); wp->ptrs = ptrs; @@ -697,7 +905,8 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, + wp->type == BCH_DATA_USER); wp->ptrs = ptrs; mutex_unlock(&wp->lock); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6672101cbe26..c71cf7381729 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -17,11 +17,11 @@ struct dev_alloc_list { u8 devs[BCH_SB_MEMBERS_MAX]; }; -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, - struct write_point *, - struct bch_devs_mask *); -void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, - struct write_point *); +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, + struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); @@ -43,6 +43,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ (_i)++) +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if 
(ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) @@ -76,7 +92,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, + unsigned, unsigned, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, @@ -88,6 +104,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, + struct open_buckets *, enum bch_data_type); + void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 2a9c6f0344ed..ef3e400c7d3d 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,6 +8,8 @@ #include "clock_types.h" #include "fifo.h" +struct ec_bucket_buf; + /* There's two of these clocks, one for reads and one for writes: */ struct bucket_clock { /* @@ -56,8 +58,10 @@ struct open_bucket { u8 freelist; bool valid; bool on_partial_list; + u8 ec_idx; unsigned sectors_free; struct bch_extent_ptr ptr; + struct ec_stripe_new *ec; }; #define OPEN_BUCKET_LIST_MAX 15 @@ -67,18 +71,23 @@ struct open_buckets { u8 v[OPEN_BUCKET_LIST_MAX]; }; +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + struct write_point { struct hlist_node node; struct mutex lock; u64 last_used; unsigned long write_point; enum bch_data_type type; + bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; struct open_buckets ptrs; - u64 next_alloc[BCH_SB_MEMBERS_MAX]; + struct dev_stripe_state stripe; }; struct write_point_specifier { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 22df84b78f4b..b33fbf709705 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -204,7 +204,7 @@ #define dynamic_fault(...) 0 #define race_fault(...) 
0 -#define bch2_fs_init_fault(name) \ +#define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ dynamic_fault("bcachefs:meta:read:" name) @@ -273,7 +273,10 @@ do { \ BCH_DEBUG_PARAM(test_alloc_startup, \ "Force allocator startup to use the slowpath where it" \ "can't find enough free buckets without invalidating" \ - "cached data") + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -311,6 +314,7 @@ enum bch_time_stats { #include "btree_types.h" #include "buckets_types.h" #include "clock_types.h" +#include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" @@ -333,9 +337,13 @@ enum gc_phase { GC_PHASE_START, GC_PHASE_SB, -#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd, - DEFINE_BCH_BTREE_IDS() -#undef DEF_BTREE_ID + GC_PHASE_BTREE_EC, + GC_PHASE_BTREE_EXTENTS, + GC_PHASE_BTREE_INODES, + GC_PHASE_BTREE_DIRENTS, + GC_PHASE_BTREE_XATTRS, + GC_PHASE_BTREE_ALLOC, + GC_PHASE_BTREE_QUOTAS, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -684,6 +692,21 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* ERASURE CODING */ + struct list_head ec_new_stripe_list; + struct mutex ec_new_stripe_lock; + + GENRADIX(struct ec_stripe) ec_stripes; + struct mutex ec_stripes_lock; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; + + struct bio_set ec_bioset; + + struct work_struct ec_stripe_delete_work; + struct llist_head ec_stripe_delete_list; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ecb7a97ee533..a00e77fa1d37 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -238,6 +238,9 @@ struct bkey_packed { } __attribute__((packed, aligned(8))); #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + #define KEY_PACKED_BITS_START 24 #define KEY_FORMAT_LOCAL_BTREE 0 @@ -465,8 +468,9 @@ enum bch_compression_type { x(ptr, 0) \ x(crc32, 1) \ x(crc64, 2) \ - x(crc128, 3) -#define BCH_EXTENT_ENTRY_MAX 4 + x(crc128, 3) \ + x(stripe_ptr, 4) +#define BCH_EXTENT_ENTRY_MAX 5 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -557,7 +561,7 @@ struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:1, cached:1, - erasure_coded:1, + unused:1, reservation:1, offset:44, /* 8 petabytes */ dev:8, @@ -567,23 +571,35 @@ struct bch_extent_ptr { dev:8, offset:44, reservation:1, - erasure_coded:1, + unused:1, cached:1, type:1; #endif } __attribute__((packed, aligned(8))); -struct bch_extent_reservation { +struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, - unused:23, + block:8, + idx:51; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:51, + block:8, + type:5; +#endif +}; + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:22, replicas:4, generation:32; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 generation:32, replicas:4, - unused:23, - type:5; + unused:22, + type:6; #endif }; @@ -706,7 +722,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas, 8) \ BCH_INODE_FIELD(bi_promote_target, 16) \ 
BCH_INODE_FIELD(bi_foreground_target, 16) \ - BCH_INODE_FIELD(bi_background_target, 16) + BCH_INODE_FIELD(bi_background_target, 16) \ + BCH_INODE_FIELD(bi_erasure_code, 16) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ @@ -716,7 +733,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas) \ BCH_INODE_FIELD(bi_promote_target) \ BCH_INODE_FIELD(bi_foreground_target) \ - BCH_INODE_FIELD(bi_background_target) + BCH_INODE_FIELD(bi_background_target) \ + BCH_INODE_FIELD(bi_erasure_code) enum { /* @@ -876,6 +894,27 @@ struct bch_quota { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(quota, BCH_QUOTA); +/* Erasure coding */ + +enum { + BCH_STRIPE = 128, +}; + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[0]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(stripe, BCH_STRIPE); + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1065,7 +1104,7 @@ struct bch_sb_field_quota { struct bch_disk_group { __u8 label[BCH_SB_LABEL_SIZE]; __le64 flags[2]; -}; +} __attribute__((packed, aligned(8))); LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -1074,7 +1113,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; struct bch_disk_group entries[0]; -}; +} __attribute__((packed, aligned(8))); /* * On clean shutdown, store btree roots and current journal sequence number in @@ -1242,12 +1281,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ + BCH_FEATURE_EC = 4, BCH_FEATURE_NR, }; @@ -1417,7 +1459,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); DEF_BTREE_ID(DIRENTS, 2, "dirents") \ DEF_BTREE_ID(XATTRS, 3, "xattrs") \ DEF_BTREE_ID(ALLOC, 4, "alloc") \ - DEF_BTREE_ID(QUOTAS, 5, "quotas") + DEF_BTREE_ID(QUOTAS, 5, "quotas") \ + DEF_BTREE_ID(EC, 6, "erasure_coding") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 9a0286d86784..9679631a7e89 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -588,6 +588,8 @@ BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); +BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE); + /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 7335fbbb3f61..81c66950668c 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -5,6 +5,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -18,6 +19,7 @@ const struct bkey_ops bch2_bkey_ops[] = { [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, + [BKEY_TYPE_EC] = bch2_bkey_ec_ops, [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, }; diff --git a/fs/bcachefs/btree_gc.c 
b/fs/bcachefs/btree_gc.c index 92b82eaee69d..e900fd4ffd06 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -15,6 +15,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -116,6 +117,7 @@ static bool bkey_type_needs_gc(enum bkey_type type) switch (type) { case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_EC: return true; default: return false; @@ -156,6 +158,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + ptr_gen_recalc_oldest(c, ptr, &max_stale); + } + } default: break; } @@ -217,6 +230,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) { + ret = ptr_gen_check(c, type, ptr); + if (ret) + return ret; + } + } + } + break; default: break; } @@ -362,15 +390,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return 0; } +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, bool initial) { + enum btree_id ids[BTREE_ID_NR]; unsigned i; + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + for (i = 0; i < BTREE_ID_NR; i++) { - enum bkey_type type = bkey_type(0, i); + enum btree_id id = ids[i]; + enum bkey_type type = bkey_type(0, id); - int ret = bch2_gc_btree(c, i, initial); + int ret = bch2_gc_btree(c, id, initial); if (ret) return ret; @@ -602,6 +642,7 @@ static void bch2_gc_start(struct bch_fs *c) new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; + new.stripe = 0; })); ca->oldest_gens[b] = new.gen; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 86b80e32e310..47a590015325 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -55,11 +55,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) return 0; } +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; + DEFINE_BCH_BTREE_IDS() +#undef DEF_BTREE_ID + default: + BUG(); + } +} + static inline struct gc_pos gc_pos_btree(enum btree_id id, struct bpos pos, unsigned level) { return (struct gc_pos) { - .phase = GC_PHASE_BTREE_EXTENTS + id, + .phase = btree_id_to_gc_phase(id), .pos = pos, .level = level, }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 92bacd16fdc3..01e476d72595 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -340,7 +340,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ea28788b26dd..9558129e77ba 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c 
@@ -69,6 +69,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "ec.h" #include "error.h" #include "movinggc.h" #include "trace.h" @@ -270,6 +271,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.data += stats.replicas[i].ec_data; sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; sum.reserved += stats.replicas[i].persistent_reserved; } @@ -400,6 +402,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_ec += + (int) new.stripe - (int) old.stripe; dev_usage->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -639,6 +643,49 @@ static void bch2_mark_pointer(struct bch_fs *c, bucket_became_unavailable(c, old, new)); } +static void bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + s64 sectors, unsigned flags, + s64 *adjusted_disk_sectors, + unsigned *redundancy) +{ + struct ec_stripe *m; + unsigned old, new, nr_data; + int blocks_nonempty_delta; + s64 parity_sectors; + + m = genradix_ptr(&c->ec_stripes, p.idx); + if (WARN_ON(!m)) + return; + + if (WARN_ON(!m->alive)) + return; + + nr_data = m->nr_blocks - m->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + + *adjusted_disk_sectors += parity_sectors; + + *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1); + + new = atomic_add_return(sectors, &m->block_sectors[p.block]); + old = new - sectors; + + blocks_nonempty_delta = (int) !!new - (int) !!old; + if (!blocks_nonempty_delta) + return; + + atomic_add(blocks_nonempty_delta, &m->blocks_nonempty); + + BUG_ON(atomic_read(&m->blocks_nonempty) < 0); + + bch2_stripes_heap_update(c, m, p.idx); +} + static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, struct gc_pos pos, @@ -655,28 +702,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; s64 cached_sectors = 0; s64 dirty_sectors = 0; + s64 ec_sectors = 0; unsigned replicas = 0; + unsigned ec_redundancy = 0; + unsigned i; extent_for_each_ptr_decode(e, p, entry) { s64 disk_sectors = ptr_disk_sectors(e, p, sectors); + s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, stats, journal_seq, flags); + if (!p.ptr.cached) + for (i = 0; i < p.ec_nr; i++) + bch2_mark_stripe_ptr(c, p.ec[i], + disk_sectors, flags, + &adjusted_disk_sectors, + &ec_redundancy); if (!p.ptr.cached) replicas++; if (p.ptr.cached) - cached_sectors += disk_sectors; + cached_sectors += adjusted_disk_sectors; + else if (!p.ec_nr) + dirty_sectors += adjusted_disk_sectors; else - dirty_sectors += disk_sectors; + ec_sectors += adjusted_disk_sectors; } replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(stats->replicas)); + ec_redundancy = clamp_t(unsigned, ec_redundancy, + 1, ARRAY_SIZE(stats->replicas)); stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; stats->replicas[replicas - 1].data[data_type] += dirty_sectors; + stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; break; } case BCH_RESERVATION: { @@ -692,6 +754,78 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } +static void 
bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + bool enabled, + struct bch_fs_usage *fs_usage, + u64 journal_seq) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g; + struct bucket_mark new, old; + + BUG_ON(ptr_stale(ca, ptr)); + + rcu_read_lock(); + g = PTR_BUCKET(ca, ptr); + + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + rcu_read_unlock(); + + BUG_ON(old.stripe == enabled); + + bch2_dev_usage_update(c, ca, fs_usage, old, new); + } +} + +static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + bool inserting, struct gc_pos pos, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + unsigned i; + + BUG_ON(!m); + BUG_ON(m->alive == inserting); + + BUG_ON(atomic_read(&m->blocks_nonempty)); + + for (i = 0; i < EC_STRIPE_MAX; i++) + BUG_ON(atomic_read(&m->block_sectors[i])); + + if (inserting) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; + m->nr_redundant = s.v->nr_redundant; + } + + if (inserting) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_del(c, m, idx); + + bucket_set_stripe(c, s.v, inserting, fs_usage, 0); + break; + } + } +} + void bch2_mark_key(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k, bool inserting, s64 sectors, @@ -747,6 +881,10 @@ void bch2_mark_key(struct bch_fs *c, bch2_mark_extent(c, k, sectors, BCH_DATA_USER, pos, stats, journal_seq, flags); break; + case BKEY_TYPE_EC: + bch2_mark_stripe(c, k, inserting, + pos, stats, journal_seq, flags); + break; default: break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8fe6871ad165..b48960fa5ce7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -185,6 +185,7 @@ static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && !mark.dirty_sectors && + !mark.stripe && !mark.nouse); } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 49f3ab9009ea..9ec96dbab0e8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -19,7 +19,8 @@ struct bucket_mark { gen_valid:1, owned_by_allocator:1, nouse:1, - journal_seq_valid:1; + journal_seq_valid:1, + stripe:1; u16 dirty_sectors; u16 cached_sectors; @@ -53,6 +54,7 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; /* _compressed_ sectors: */ @@ -67,6 +69,7 @@ struct bch_fs_usage { struct { u64 data[BCH_DATA_NR]; + u64 ec_data; u64 persistent_reserved; } replicas[BCH_REPLICAS_MAX]; diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index ceb75f86b615..c8e0c37a5e1a 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -55,6 +55,19 @@ static inline struct target target_decode(unsigned target) } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, 
target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 index 000000000000..f6314aa6a0f1 --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1265 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "keylist.h" +#include "super-io.h" +#include "util.h" + +#include <linux/sort.h> +#include <linux/raid/pq.h> +#include <linux/raid/xor.h> + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +static unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + unsigned bytes = sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s); + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + void *csums = s->ptrs + s->nr_blocks; + + BUG_ON(!csum_bytes); + + return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.inode) + return "invalid stripe key"; + + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*s)) + return "incorrect value size"; + + if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i; + + pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + s->nr_blocks - s->nr_redundant, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) + pr_buf(out, " %u:%llu", s->ptrs[i].dev, + (u64) s->ptrs[i].offset); + } + } +} + +static int ptr_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + const struct bch_extent_ptr *ptr) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { + const struct bch_extent_ptr *ptr2 = v->ptrs + i; + + if (ptr->dev == ptr2->dev && + ptr->gen == ptr2->gen && + ptr->offset >= ptr2->offset && + ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) + return i; + } + + return -1; +} + +static int extent_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + int idx; + + if (!bkey_extent_is_data(k.k)) + return -1; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + + return -1; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + struct open_buckets *blocks, + struct
open_buckets *parity, + unsigned stripe_size) +{ + struct open_bucket *ob; + unsigned i, u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = parity->nr + blocks->nr; + s->v.nr_redundant = parity->nr; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + open_bucket_for_each(c, blocks, ob, i) + s->v.ptrs[i] = ob->ptr; + + open_bucket_for_each(c, parity, ob, i) + s->v.ptrs[blocks->nr + i] = ob->ptr; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +/* Checksumming: */ + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csums_per_device = stripe_csums_per_device(v); + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i, j; + + if (!csum_bytes) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) { + for (j = 0; j < csums_per_device; j++) { + unsigned offset = j << v->csum_granularity_bits; + unsigned len = min(csum_granularity, buf->size - offset); + + struct bch_csum csum = + bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + (offset << 9), + len << 9); + + memcpy(stripe_csum(v, i, j), &csum, csum_bytes); + } + } +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i; + + if (!csum_bytes) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum csum; + + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + ((offset + len) & (csum_granularity - 1))); + + csum = bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + ((offset - buf->offset) << 9), + len << 9); + + if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { + __bcache_io_error(c, + "checksum error while doing reconstruct read (%u:%u)", + i, j); + clear_bit(i, buf->valid); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void raid5_recov(unsigned disks, unsigned bytes, + unsigned failed, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed >= disks); + + swap(data[0], data[failed]); + memcpy(data[0], data[1], bytes); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, bytes, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed]); +} + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + switch (v->nr_redundant) { + case 2: + raid6_call.gen_syndrome(v->nr_blocks, bytes, buf->data); + fallthrough; + case 1: + raid5_recov(v->nr_blocks, bytes, nr_data, buf->data); + break; + default: + BUG(); + } +} + +static unsigned __ec_nr_failed(struct 
ec_stripe_buf *buf, unsigned nr) +{ + return nr - bitmap_weight(buf->valid, nr); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + return __ec_nr_failed(buf, buf->key.v.nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + switch (nr_failed) { + case 0: + break; + case 1: + if (test_bit(nr_data, buf->valid)) + raid5_recov(nr_data + 1, bytes, failed[0], buf->data); + else + raid6_datap_recov(v->nr_blocks, bytes, failed[0], buf->data); + break; + case 2: + /* data+data failure. */ + raid6_2data_recov(v->nr_blocks, bytes, failed[0], failed[1], buf->data); + break; + + default: + BUG(); + } + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, buf->valid)) { + ec_generate_ec(buf); + break; + } + + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + unsigned rw, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, + nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, + rw, + GFP_KERNEL, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_iter.bi_size = b; + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ + struct btree_iter iter; + struct ec_stripe_buf *buf; + struct closure cl; + struct bkey_s_c k; + struct bch_stripe *v; + unsigned stripe_idx; + unsigned offset, end; + unsigned i, nr_data, csum_granularity; + int ret = 0, idx; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.idx || + rbio->pick.idx - 1 >= rbio->pick.ec_nr); + + stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + + buf = kzalloc(sizeof(*buf), GFP_NOIO); + if (!buf) + return -ENOMEM; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, stripe_idx), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) { + 
__bcache_io_error(c, + "error doing reconstruct read: stripe not found"); + kfree(buf); + return bch2_btree_iter_unlock(&iter) ?: -EIO; + } + + bkey_reassemble(&buf->key.k_i, k); + bch2_btree_iter_unlock(&iter); + + v = &buf->key.v; + + nr_data = v->nr_blocks - v->nr_redundant; + + idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); + BUG_ON(idx < 0); + + csum_granularity = 1U << v->csum_granularity_bits; + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; + end = offset + bio_sectors(&rbio->bio); + + BUG_ON(end > le16_to_cpu(v->sectors)); + + buf->offset = round_down(offset, csum_granularity); + buf->size = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)) - buf->offset; + + for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); + if (!buf->data[i]) { + ret = -ENOMEM; + goto err; + } + } + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < v->nr_blocks; i++) { + struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr)) { + __bcache_io_error(c, + "error doing reconstruct read: stale pointer"); + clear_bit(i, buf->valid); + continue; + } + + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + } + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[idx] + ((offset - buf->offset) << 9)); +err: + for (i = 0; i < v->nr_blocks; i++) + kfree(buf->data[i]); + kfree(buf); + return ret; +} + +/* ec_stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return -ENOMEM; + + spin_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + spin_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp)) + return -ENOMEM; + + return 0; +} + +static int ec_stripe_mem_alloc(struct bch_fs *c, + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; + + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) + return 0; + + bch2_btree_iter_unlock(iter); + + if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + return -EINTR; + return -ENOMEM; +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + return h->data[0].blocks_nonempty == 0 ? 
h->data[0].idx : -1; +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +static inline unsigned stripe_entry_blocks(struct ec_stripe *m) +{ + return atomic_read(&m->pin) + ? UINT_MAX : atomic_read(&m->blocks_nonempty); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + bool queue_delete; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + + if (!m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); + return; + } + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = + stripe_entry_blocks(m); + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + queue_delete = stripe_idx_to_delete(c) >= 0; + spin_unlock(&c->ec_stripes_heap_lock); + + if (queue_delete) + schedule_work(&c->ec_stripe_delete_work); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + m->alive = false; + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + spin_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + + BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = stripe_entry_blocks(m), + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + m->alive = true; + + heap_verify_backpointer(c, idx); + + spin_unlock(&c->ec_stripes_heap_lock); +} + +static void ec_stripe_delete(struct bch_fs *c, unsigned idx) +{ + struct btree_iter iter; + struct bch_stripe *v = NULL; + struct bkey_s_c k; + struct bkey_i delete; + u64 journal_seq = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, idx), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) + goto out; + + v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL); + BUG_ON(!v); + memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k)); + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + bch2_btree_insert_at(c, NULL, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOUNLOCK, + BTREE_INSERT_ENTRY(&iter, &delete)); +out: + bch2_btree_iter_unlock(&iter); + kfree(v); +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + ssize_t idx; + + down_read(&c->gc_lock); + + while (1) { + spin_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + 
spin_unlock(&c->ec_stripes_heap_lock); + + if (idx < 0) + break; + + ec_stripe_delete(c, idx); + } + + up_read(&c->gc_lock); +} + +static int ec_stripe_bkey_insert(struct bch_fs *c, + struct bkey_i_stripe *stripe) +{ + struct ec_stripe *m; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* XXX: start pos hint */ +retry: + for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + bch2_btree_iter_unlock(&iter); + return -ENOSPC; + } + + if (bkey_deleted(k.k)) + goto found_slot; + } + + return bch2_btree_iter_unlock(&iter) ?: -ENOSPC; +found_slot: + mutex_lock(&c->ec_stripes_lock); + ret = ec_stripe_mem_alloc(c, &iter); + mutex_unlock(&c->ec_stripes_lock); + + if (ret == -EINTR) + goto retry; + if (ret) + return ret; + + m = genradix_ptr(&c->ec_stripes, iter.pos.offset); + atomic_inc(&m->pin); + + stripe->k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &stripe->k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + atomic_dec(&m->pin); + + return ret; +} + +/* stripe creation: */ + +static void extent_stripe_ptr_add(struct bkey_s_extent e, + struct ec_stripe_buf *s, + struct bch_extent_ptr *ptr, + unsigned block) +{ + struct bch_extent_stripe_ptr *dst = (void *) ptr; + union bch_extent_entry *end = extent_entry_last(e); + + memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); + e.k->u64s += sizeof(*dst) / sizeof(u64); + + *dst = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .idx = s->key.k.p.offset, + }; +} + +static int ec_stripe_update_ptrs(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + BKEY_PADDED(k) tmp; + int ret = 0, dev, idx; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !btree_iter_err(k) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + idx = extent_matches_stripe(c, &s->key.v, k); + if (idx < 0) { + bch2_btree_iter_next(&iter); + continue; + } + + dev = s->key.v.ptrs[idx].dev; + + bkey_reassemble(&tmp.k, k); + e = bkey_i_to_s_extent(&tmp.k); + + extent_for_each_ptr(e, ptr) + if (ptr->dev != dev) + ptr->cached = true; + + ptr = (void *) bch2_extent_has_device(e.c, dev); + BUG_ON(!ptr); + + extent_stripe_ptr_add(e, s, ptr, idx); + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &tmp.k)); + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct ec_stripe *ec_stripe; + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bkey_i *k; + struct bch_stripe *v = &s->stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + struct closure cl; + int ret; + + BUG_ON(s->h->s == s); + + closure_init_stack(&cl); + + if (s->err) { + bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + + if (!percpu_ref_tryget(&c->writes)) + goto err; + + BUG_ON(bitmap_weight(s->blocks_allocated, + s->blocks.nr) != s->blocks.nr); + + ec_generate_ec(&s->stripe); + + ec_generate_checksums(&s->stripe); + + /* write 
p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); + + closure_sync(&cl); + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, s->stripe.valid)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } + + ret = ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; + } + + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + if (ret) + break; + } + + ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset); + + atomic_dec(&ec_stripe->pin); + bch2_stripes_heap_update(c, ec_stripe, + s->stripe.key.k.p.offset); + +err_put_writes: + percpu_ref_put(&c->writes); +err: + open_bucket_for_each(c, &s->blocks, ob, i) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } + + bch2_open_buckets_put(c, &s->parity); + + bch2_keylist_free(&s->keys, s->inline_keys); + + mutex_lock(&s->h->lock); + list_del(&s->list); + mutex_unlock(&s->h->lock); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); +} + +static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + list_add(&s->list, &h->stripes); + h->s = NULL; + + return s; +} + +static void ec_stripe_new_put(struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); + if (atomic_dec_and_test(&s->pin)) + ec_stripe_create(s); +} + +/* have a full bucket - hand it off to be erasure coded: */ +void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + if (ob->sectors_free) + s->err = -1; + + ec_stripe_new_put(s); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); +} + +void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + struct bpos pos, unsigned sectors) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct ec_stripe_new *ec; + + if (!ob) + return; + + ec = ob->ec; + mutex_lock(&ec->lock); + + if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, + ARRAY_SIZE(ec->inline_keys), + BKEY_U64s)) { + BUG(); + } + + bkey_init(&ec->keys.top->k); + ec->keys.top->k.p = pos; + bch2_key_resize(&ec->keys.top->k, sectors); + bch2_keylist_push(&ec->keys); + + mutex_unlock(&ec->lock); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return (l > r) - (l < r); +} + +/* pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; + unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = 
sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + unsigned i; + + BUG_ON(h->parity.nr != h->redundancy); + BUG_ON(!h->blocks.nr); + BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + mutex_init(&s->lock); + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; + s->blocks = h->blocks; + s->parity = h->parity; + + memset(&h->blocks, 0, sizeof(h->blocks)); + memset(&h->parity, 0, sizeof(h->parity)); + + bch2_keylist_init(&s->keys, s->inline_keys); + + s->stripe.offset = 0; + s->stripe.size = h->blocksize; + memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + + ec_stripe_key_init(c, &s->stripe.key, + &s->blocks, &s->parity, + h->blocksize); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { + s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); + if (!s->stripe.data[i]) + goto err; + } + + h->s = s; + + return 0; +err: + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); + return -ENOMEM; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + mutex_lock(&h->lock); + INIT_LIST_HEAD(&h->stripes); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_USER, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + list_add(&h->list, &c->ec_new_stripe_list); + return h; +} + +void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = NULL; + + if (h->s && + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) + s = ec_stripe_set_pending(h); + + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct ec_stripe_head *h; + + if (!redundancy) + return NULL; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy) { + mutex_lock(&h->lock); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy); +found: + mutex_unlock(&c->ec_new_stripe_lock); + return h; +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + struct ec_stripe_new *s = NULL; + + mutex_lock(&h->lock); + bch2_open_buckets_stop_dev(c, ca, + &h->blocks, + BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, + &h->parity, + BCH_DATA_USER); + + if (!h->s) + goto unlock; + + open_bucket_for_each(c, &h->s->blocks, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + open_bucket_for_each(c, &h->s->parity, ob, i) + if (ob->ptr.dev == ca->dev_idx) + 
goto found; + goto unlock; +found: + h->s->err = -1; + s = ec_stripe_set_pending(h); +unlock: + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); + } + mutex_unlock(&c->ec_new_stripe_lock); +} + +int bch2_fs_ec_start(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t i, idx = 0; + int ret = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(&iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), + GFP_KERNEL)) + return -ENOMEM; +#if 0 + ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL); +#else + for (i = 0; i < idx; i++) + if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL)) + return -ENOMEM; +#endif + return 0; +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + + while (1) { + mutex_lock(&c->ec_new_stripe_lock); + h = list_first_entry_or_null(&c->ec_new_stripe_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_new_stripe_lock); + if (!h) + break; + + BUG_ON(h->s); + BUG_ON(!list_empty(&h->stripes)); + kfree(h); + } + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->ec_stripes); + bioset_exit(&c->ec_bioset); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); + + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 index 000000000000..bcf06529dcfc --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "keylist_types.h" + +const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ec_ops (struct bkey_ops) { \ + .key_invalid = bch2_ec_key_invalid, \ + .val_to_text = bch2_ec_key_to_text, \ +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + void *data[EC_STRIPE_MAX]; + + union { + struct bkey_i_stripe key; + u64 pad[255]; + }; +}; + +struct ec_stripe_head; + +struct ec_stripe_new { + struct bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + /* counts in flight writes, stripe is created when pin == 0 */ + atomic_t pin; + + int err; + + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; + struct open_buckets parity; + + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; + + struct ec_stripe_buf stripe; +}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + struct list_head stripes; + + unsigned target; + unsigned algo; + unsigned redundancy; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct open_buckets blocks; + struct open_buckets parity; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); +void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, + struct bpos, unsigned); + +void 
bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, + unsigned, unsigned); + +void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t); + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); + +void bch2_ec_flush_new_stripes(struct bch_fs *); + +int bch2_fs_ec_start(struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 index 000000000000..00e89c3b7767 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include + +#define EC_STRIPE_MAX 16 + +struct ec_stripe { + size_t heap_idx; + + u16 sectors; + u8 algorithm; + + u8 nr_blocks; + u8 nr_redundant; + + u8 alive; + atomic_t pin; + atomic_t blocks_nonempty; + atomic_t block_sectors[EC_STRIPE_MAX]; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index df0ca1fcf2e8..9bb4e10283e1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { + unsigned i, durability = 0; struct bch_dev *ca; - if (ptr->cached) + if (p.ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return 0; + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + for (i = 0; i < p.ec_nr; i++) { + struct ec_stripe *s = + genradix_ptr(&c->ec_stripes, p.idx); - return ca->mi.durability; + if (WARN_ON(!s)) + continue; + + durability = max_t(unsigned, durability, s->nr_redundant); + } + + return durability; } unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) { - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned durability = 0; - extent_for_each_ptr(e, ptr) - durability += bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) + durability += bch2_extent_ptr_durability(c, p); return durability; } @@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, return false; } +static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = e.v->start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { - union bch_extent_entry *dst; - union bch_extent_entry *src; + 
union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; EBUG_ON(ptr < &e.v->start->ptr || ptr >= &extent_entry_last(e)->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = to_entry(ptr + 1); - + src = extent_entry_next(to_entry(ptr)); if (src != extent_entry_last(e) && - extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) { - dst = to_entry(ptr); - } else { - extent_for_each_entry(e, dst) { - if (dst == to_entry(ptr)) - break; + !extent_entry_is_crc(src)) + drop_crc = false; - if (extent_entry_next(dst) == to_entry(ptr) && - extent_entry_is_crc(dst)) - break; + dst = to_entry(ptr); + while ((prev = extent_entry_prev(e, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; } + + dst = prev; } memmove_u64s_down(dst, src, @@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } break; @@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; struct bch_dev *ca; bool first = true; @@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, pr_buf(out, " "); switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: @@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, crc.csum_type, crc.compression_type); break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? 
" stale" : ""); + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); break; default: pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); @@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f = &failed->devs[failed->nr++]; f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; f->nr_failed = 1; f->nr_retries = 0; } else { @@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, const struct extent_ptr_decoded p2) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } - /* Pick at random, biased in favor of the faster device: */ + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; - return bch2_rand_range(l1 + l2) > l1; + return p1.idx < p2.idx; } static int extent_pick_read_device(struct bch_fs *c, @@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c, continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; - if (f && f->nr_failed >= f->nr_retries) + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (!p.idx && p.ec_nr) + p.idx++; + + if (force_reconstruct_read(c) && + p.idx >= p.ec_nr + 1) continue; if (ret && !ptr_better(c, p, *pick)) @@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) - return "has crc field"; + if (!extent_entry_is_ptr(entry)) + return "has non ptr field"; } extent_for_each_ptr(e, ptr) { @@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += e.k->size - len; break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } if (extent_entry_is_crc(entry)) @@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); if (crc.offset + e.k->size > @@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) else if (nonce != crc.offset + crc.nonce) return "incorrect nonce"; } - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } @@ -1756,6 +1827,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent 
*e, { struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); union bch_extent_entry *pos; + unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = e->v.start; @@ -1773,6 +1845,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(e, pos, to_entry(&p->ptr)); + + for (i = 0; i < p->ec_nr; i++) { + p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + } } /* @@ -1827,26 +1904,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, unsigned target, unsigned nr_desired_replicas) { - struct bch_extent_ptr *ptr; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && - !bch2_dev_in_target(c, ptr->dev, target)) { - ptr->cached = true; + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; extra -= n; } } if (extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { - ptr->cached = true; + entry->ptr.cached = true; extra -= n; } } @@ -1922,7 +2000,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, if ((extent_entry_type(en_l) != extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) + !extent_entry_is_ptr(en_l)) return BCH_MERGE_NOMERGE; lp = &en_l->ptr; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index d121ce5b3225..15865b27847d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -96,8 +96,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_extent_is_compressed(struct bkey_s_c); -unsigned bch2_extent_ptr_durability(struct bch_fs *, - const struct bch_extent_ptr *); unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, @@ -362,20 +360,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) /* Iterate over pointers, with crcs: */ -static inline struct extent_ptr_decoded -__extent_ptr_decoded_init(const struct bkey *k) -{ - return (struct extent_ptr_decoded) { - .crc = bch2_extent_crc_unpack(k, NULL), - }; -} - -#define EXTENT_ITERATE_EC (1 << 0) - #define __extent_ptr_next_decode(_e, _ptr, _entry) \ ({ \ __label__ out; \ \ + (_ptr).idx = 0; \ + (_ptr).ec_nr = 0; \ + \ extent_for_each_entry_from(_e, _entry, _entry) \ switch (extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ @@ -387,14 +378,16 @@ __extent_ptr_decoded_init(const struct bkey *k) (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ entry_to_crc(_entry)); \ break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + break; \ } \ - \ out: \ _entry < extent_entry_last(_e); \ }) #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - for ((_ptr) = __extent_ptr_decoded_init((_e).k), \ + for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \ (_entry) = (_e).v->start; \ __extent_ptr_next_decode(_e, _ptr, _entry); \ (_entry) = extent_entry_next(_entry)) diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 
5738738d7953..a85cda0e7a6a 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,14 +20,18 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { + unsigned idx; + unsigned ec_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec[4]; }; struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; + u8 idx; u8 nr_failed; u8 nr_retries; } devs[BCH_REPLICAS_MAX]; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index fbd0a82fdeac..2fee2f2efd38 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -16,6 +16,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "io.h" @@ -319,6 +320,7 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; for (src = keys->keys; src != keys->top; src = n) { @@ -362,6 +364,10 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_buckets_put(c, &op->open_buckets); return; err: @@ -442,7 +448,8 @@ static void init_append_extent(struct bch_write_op *op, static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; @@ -453,11 +460,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bio->bi_iter.bi_size = output_available; + bch2_bio_map(bio, buf); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: @@ -622,14 +636,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; struct bkey_i *key_to_write; + void *ec_buf; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -639,16 +657,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -751,7 
+779,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -764,16 +793,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; /* Free unneeded pages after compressing: */ - if (bounce) + if (to_wbio(dst)->bounce) while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, &c->bio_bounce_pages); @@ -782,6 +815,10 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + bch2_ec_add_backpointer(c, wp, + bkey_start_pos(&key_to_write->k), + total_input >> 9); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; dst->bi_opf = REQ_OP_WRITE; @@ -796,10 +833,10 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -811,6 +848,8 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; int ret; again: + memset(&op->failed, 0, sizeof(op->failed)); + do { /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > @@ -825,6 +864,7 @@ again: wp = bch2_alloc_sectors_start(c, op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, @@ -904,8 +944,6 @@ void bch2_write(struct closure *cl) op->start_time = local_clock(); - memset(&op->failed, 0, sizeof(op->failed)); - bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(&op->wbio.bio)->put_bio = false; @@ -1576,8 +1614,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (!pick_ret) goto hole; - if (pick_ret < 0) - goto no_device; + if (pick_ret < 0) { + __bcache_io_error(c, "no device to read from"); + goto err; + } if (pick_ret > 0) ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1704,36 +1744,51 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (!rbio->have_ioref) - goto no_device_postclone; - percpu_down_read(&c->usage_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); percpu_up_read(&c->usage_lock); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); - - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } - if (likely(!(flags & BCH_READ_IN_RETRY))) { - if (!(flags & BCH_READ_LAST_FRAGMENT)) { - bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + __bcache_io_error(c, "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (unlikely(c->opts.no_data_io)) { - 
bio_endio(&rbio->bio); - return 0; + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } - submit_bio(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { int ret; - submit_bio_wait(&rbio->bio); - rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); @@ -1748,22 +1803,12 @@ noclone: return ret; } -no_device_postclone: - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - bch2_rbio_free(rbio); -no_device: - __bcache_io_error(c, "no device to read from"); - - if (likely(!(flags & BCH_READ_IN_RETRY))) { - orig->bio.bi_status = BLK_STS_IOERR; - - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; - } else { +err: + if (flags & BCH_READ_IN_RETRY) return READ_ERR; - } + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; hole: /* @@ -1775,7 +1820,7 @@ hole: orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); - +out_read_done: if (flags & BCH_READ_LAST_FRAGMENT) bch2_rbio_done(orig); return 0; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b1f6433cf9e9..6eea96ad03fb 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1071,7 +1071,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, replicas = bch2_extent_nr_ptrs(e.c); rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &c->rw_devs[BCH_DATA_JOURNAL]); for (i = 0; i < devs_sorted.nr; i++) { @@ -1098,8 +1098,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, sectors > ca->mi.bucket_size) continue; - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); + bch2_dev_stripe_increment(c, ca, &j->wp.stripe); ja->sectors_free = ca->mi.bucket_size - sectors; ja->cur_idx = (ja->cur_idx + 1) % ja->nr; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 74e92a196ccd..4d86c4bc4a5f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -279,11 +279,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); break; + case Opt_erasure_code: + if (v && + !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_EC); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; } return ret; } +int bch2_opts_check_may_set(struct bch_fs *c) +{ + unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } + + return 0; +} + int bch2_parse_mount_opts(struct bch_opts *opts, char *options) { char *opt, *name, *val; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8f4fab7f7dc8..80869e34e3b6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -111,6 +111,9 @@ enum opt_type { BCH_OPT(promote_target, u16, OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0) \ + BCH_OPT(erasure_code, u16, OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false) \ BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) 
\ @@ -270,6 +273,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ @@ -281,7 +285,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *); BCH_INODE_OPT(data_replicas, 8) \ BCH_INODE_OPT(promote_target, 16) \ BCH_INODE_OPT(foreground_target, 16) \ - BCH_INODE_OPT(background_target, 16) + BCH_INODE_OPT(background_target, 16) \ + BCH_INODE_OPT(erasure_code, 16) struct bch_io_opts { #define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1ae8133a1ef7..ddfba16a2998 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_io.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "fsck.h" #include "journal_io.h" @@ -213,6 +214,11 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + err = "cannot allocate memory"; + ret = bch2_fs_ec_start(c); + if (ret) + goto err; + bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_initial_gc(c, &journal); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 83fc9c93d295..0296931b6b8c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -80,9 +80,33 @@ static void extent_to_replicas(struct bkey_s_c k, r->nr_required = 1; - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached) - r->devs[r->nr_devs++] = p.ptr.dev; + extent_for_each_ptr_decode(e, p, entry) { + if (p.ptr.cached) + continue; + + if (p.ec_nr) { + r->nr_devs = 0; + break; + } + + r->devs[r->nr_devs++] = p.ptr.dev; + } + } +} + +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + if (k.k->type == BCH_STRIPE) { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; + + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; } } @@ -101,6 +125,10 @@ static void bkey_to_replicas(enum bkey_type type, e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; + case BKEY_TYPE_EC: + e->data_type = BCH_DATA_USER; + stripe_to_replicas(k, e); + break; default: break; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 071543033096..3dbcb6d7d261 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "checksum.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "io.h" #include "journal.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a22beff7cc96..931e50e8ad57 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -20,6 +20,7 @@ #include "compress.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -364,6 +365,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); bch2_fs_btree_cache_exit(c); @@ -544,6 +546,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->ec_new_stripe_list); + mutex_init(&c->ec_new_stripe_lock); + 
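Aside on the BCH_INODE_OPTS()/bch_io_opts pattern extended in this hunk: each BCH_INODE_OPT(_name, _bits) entry is expanded twice, once into a `_name##_defined` "was this option explicitly set" flag and once into the value bitfield itself. A minimal standalone sketch of that X-macro expansion, with illustrative DEMO_* names rather than the real bcachefs definitions:

#include <stdio.h>

/* Hypothetical option list in the same X-macro style as BCH_INODE_OPTS() */
#define DEMO_INODE_OPTS()		\
	DEMO_OPT(data_replicas,  8)	\
	DEMO_OPT(erasure_code,  16)

struct demo_io_opts {
	/* first expansion: one "option was set" flag per option */
#define DEMO_OPT(_name, _bits)	unsigned _name##_defined:1;
	DEMO_INODE_OPTS()
#undef DEMO_OPT
	/* second expansion: the option values themselves */
#define DEMO_OPT(_name, _bits)	unsigned _name:_bits;
	DEMO_INODE_OPTS()
#undef DEMO_OPT
};

int main(void)
{
	struct demo_io_opts opts = { 0 };

	/* apply an inode-level override, as a mount or ioctl path might */
	opts.erasure_code	  = 1;
	opts.erasure_code_defined = 1;

	printf("erasure_code=%u (defined=%u), data_replicas=%u (defined=%u)\n",
	       opts.erasure_code, opts.erasure_code_defined,
	       opts.data_replicas, opts.data_replicas_defined);
	return 0;
}

Adding BCH_INODE_OPT(erasure_code, 16) to the list is therefore enough to give every inode both an erasure_code value and a flag saying whether it overrides the filesystem-wide default.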
mutex_init(&c->ec_stripes_lock); + spin_lock_init(&c->ec_stripes_heap_lock); + seqcount_init(&c->gc_pos_lock); c->copy_gc_enabled = 1; @@ -612,6 +619,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || + bch2_fs_ec_init(c) || bch2_fs_fsio_init(c)) goto err; @@ -683,6 +691,10 @@ const char *bch2_fs_start(struct bch_fs *c) if (ret) goto err; + ret = bch2_opts_check_may_set(c); + if (ret) + goto err; + err = "dynamic fault"; if (bch2_fs_init_fault("fs_start")) goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6a5da0f12713..188e19572d91 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -19,6 +19,7 @@ #include "btree_gc.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "journal.h" #include "keylist.h" @@ -188,6 +189,8 @@ sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_work); rw_attribute(promote_whole_extents); +read_attribute(new_stripes); + rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); @@ -242,6 +245,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "\t%s:\t\t%llu\n", bch2_data_types[type], stats.replicas[replicas].data[type]); + pr_buf(&out, "\terasure coded:\t%llu\n", + stats.replicas[replicas].ec_data); pr_buf(&out, "\treserved:\t%llu\n", stats.replicas[replicas].persistent_reserved); } @@ -310,6 +315,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) compressed_sectors_uncompressed << 9); } +static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + out += scnprintf(out, end - out, + "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + if (h->s) + out += scnprintf(out, end - out, + "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + + mutex_lock(&h->lock); + list_for_each_entry(s, &h->stripes, list) + out += scnprintf(out, end - out, + "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); + mutex_unlock(&h->lock); + + } + mutex_unlock(&c->ec_new_stripe_lock); + + return out - buf; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -369,6 +409,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); + if (attr == &sysfs_new_stripes) + return bch2_new_stripes(c, buf); + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM @@ -537,6 +580,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + &sysfs_new_stripes, + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -765,6 +810,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " erasure coded: %llu\n" " available: %lli\n" "sectors:\n" " sb: %llu\n" @@ -788,6 +834,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], + stats.buckets_ec, ca->mi.nbuckets - ca->mi.first_bucket - 
stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], -- cgit From 283cca30a8ae878e6f78cfbd73886fd78c1b13f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Nov 2018 22:10:27 -0500 Subject: bcachefs: fix typo when picking read method Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9bb4e10283e1..9e3ac910572e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -637,11 +637,11 @@ static int extent_pick_read_device(struct bch_fs *c, !bch2_dev_is_readable(ca)) p.idx++; - if (!p.idx && p.ec_nr) + if (force_reconstruct_read(c) && + !p.idx && p.ec_nr) p.idx++; - if (force_reconstruct_read(c) && - p.idx >= p.ec_nr + 1) + if (p.idx >= p.ec_nr + 1) continue; if (ret && !ptr_better(c, p, *pick)) -- cgit From b1ba2359fb86b2f87b55d7d8cc9847b826242b33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Nov 2018 21:53:25 -0500 Subject: bcachefs: Fix an error path Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index eecf792198e4..a65e6a6d54e7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2542,10 +2542,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, &disk_res, "a_res, iter, &reservation.k_i, 0, true, true, NULL); - +btree_iter_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); -btree_iter_err: if (ret == -EINTR) ret = 0; if (ret) -- cgit From f81b648d1f8f285d1471bed0d47bb4316044b864 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Nov 2018 21:53:40 -0500 Subject: bcachefs: Clean up, possixly fix page disk reservation accounting Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 146 ++++++++++++++++++++++++++-------------------------- 1 file changed, 72 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a65e6a6d54e7..40d3f02d86d8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -454,12 +454,12 @@ struct bch_page_state { union { struct { /* existing data: */ unsigned sectors:PAGE_SECTOR_SHIFT + 1; + + /* Uncompressed, fully allocated replicas: */ unsigned nr_replicas:4; - unsigned compressed:1; - /* Owns PAGE_SECTORS sized reservation: */ - unsigned reserved:1; - unsigned reservation_replicas:4; + /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ + unsigned replicas_reserved:4; /* Owns PAGE_SECTORS sized quota reservation: */ unsigned quota_reserved:1; @@ -506,7 +506,7 @@ static inline struct bch_page_state *page_state(struct page *page) static inline unsigned page_res_sectors(struct bch_page_state s) { - return s.reserved ? 
s.reservation_replicas * PAGE_SECTORS : 0; + return s.replicas_reserved * PAGE_SECTORS; } static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, @@ -524,8 +524,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i { struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + s = page_state_cmpxchg(page_state(page), s, { - s.reserved = 0; + s.replicas_reserved = 0; s.quota_reserved = 0; }); @@ -535,62 +537,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { - struct bch_page_state *s = page_state(page), new, old; + struct bch_page_state *s = page_state(page), new; /* XXX: this should not be open coded */ unsigned nr_replicas = inode->ei_inode.bi_data_replicas ? inode->ei_inode.bi_data_replicas - 1 : c->opts.data_replicas; - - struct disk_reservation disk_res = bch2_disk_reservation_init(c, - nr_replicas); + struct disk_reservation disk_res; struct quota_res quota_res = { 0 }; - int ret = 0; + int ret; - /* - * XXX: this could likely be quite a bit simpler, page reservations - * _should_ only be manipulated with page locked: - */ + EBUG_ON(!PageLocked(page)); - old = page_state_cmpxchg(s, new, { - if (new.reserved - ? (new.reservation_replicas < disk_res.nr_replicas) - : (new.sectors < PAGE_SECTORS || - new.nr_replicas < disk_res.nr_replicas || - new.compressed)) { - int sectors = (disk_res.nr_replicas * PAGE_SECTORS - - page_res_sectors(new) - - disk_res.sectors); - - if (sectors > 0) { - ret = bch2_disk_reservation_add(c, &disk_res, sectors, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (unlikely(ret)) - goto err; - } + if (s->replicas_reserved < nr_replicas) { + ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS, + nr_replicas - s->replicas_reserved, + !check_enospc ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (unlikely(ret)) + return ret; - new.reserved = 1; - new.reservation_replicas = disk_res.nr_replicas; - } + page_state_cmpxchg(s, new, ({ + BUG_ON(new.replicas_reserved + + disk_res.nr_replicas != nr_replicas); + new.replicas_reserved += disk_res.nr_replicas; + })); + } - if (!new.quota_reserved && - new.sectors + new.dirty_sectors < PAGE_SECTORS) { - ret = bch2_quota_reservation_add(c, inode, "a_res, - PAGE_SECTORS - quota_res.sectors, - check_enospc); - if (unlikely(ret)) - goto err; + if (!s->quota_reserved && + s->sectors + s->dirty_sectors < PAGE_SECTORS) { + ret = bch2_quota_reservation_add(c, inode, "a_res, + PAGE_SECTORS, + check_enospc); + if (unlikely(ret)) + return ret; + page_state_cmpxchg(s, new, ({ + BUG_ON(new.quota_reserved); new.quota_reserved = 1; - } - }); + })); + } - quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS; - disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old); -err: - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &disk_res); return ret; } @@ -600,6 +586,8 @@ static void bch2_clear_page_bits(struct page *page) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + if (!PagePrivate(page)) return; @@ -763,11 +751,8 @@ static void bch2_readpages_end_io(struct bio *bio) static inline void page_state_init_for_read(struct page *page) { - struct bch_page_state *s = page_state(page); - - BUG_ON(s->reserved); - s->sectors = 0; - s->compressed = 0; + SetPagePrivate(page); + page->private = 0; } struct readpages_iter { @@ -816,10 +801,13 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - bool compressed = bch2_extent_is_compressed(k); - unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k); + unsigned nr_ptrs = !bch2_extent_is_compressed(k) + ? bch2_extent_nr_dirty_ptrs(k) + : 0; bio_for_each_segment(bv, bio, iter) { + /* brand new pages, don't need to be locked: */ + struct bch_page_state *s = page_state(bv.bv_page); /* sectors in @k from the start of this page: */ @@ -827,14 +815,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - s->nr_replicas = !s->sectors - ? nr_ptrs - : min_t(unsigned, s->nr_replicas, nr_ptrs); + s->nr_replicas = page_sectors == PAGE_SECTORS + ? nr_ptrs : 0; BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); s->sectors += page_sectors; - - s->compressed |= compressed; } } @@ -1163,7 +1148,7 @@ static int __bch2_writepage(struct folio *folio, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_page_state new, old; - unsigned offset; + unsigned offset, nr_replicas_this_write; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -1189,19 +1174,31 @@ static int __bch2_writepage(struct folio *folio, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: + EBUG_ON(!PageLocked(page)); + /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { - EBUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - new.compressed)); + /* + * If we didn't get a reservation, we can only write out the + * number of (fully allocated) replicas that currently exist, + * and only if the entire page has been written: + */ + nr_replicas_this_write = + max_t(unsigned, + new.replicas_reserved, + (new.sectors == PAGE_SECTORS + ? 
new.nr_replicas : 0)); + + BUG_ON(!nr_replicas_this_write); - if (new.reserved) - new.nr_replicas = new.reservation_replicas; - new.reserved = 0; + new.nr_replicas = w->opts.compression + ? 0 + : nr_replicas_this_write; - new.compressed |= w->opts.compression != 0; + new.replicas_reserved = 0; new.sectors += new.dirty_sectors; + BUG_ON(new.sectors != PAGE_SECTORS); new.dirty_sectors = 0; }); @@ -1210,21 +1207,20 @@ do_io: unlock_page(page); if (w->io && - (w->io->op.op.res.nr_replicas != new.nr_replicas || + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas); + bch2_writepage_io_alloc(c, w, inode, page, + nr_replicas_this_write); w->io->new_sectors += new.sectors - old.sectors; BUG_ON(inode != w->io->op.inode); BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); - if (old.reserved) - w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS; - + w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) @@ -2606,6 +2602,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode, static bool folio_is_data(struct folio *folio) { + EBUG_ON(!PageLocked(&folio->page)); + /* XXX: should only have to check PageDirty */ return folio_test_private(folio) && (page_state(&folio->page)->sectors || -- cgit From 129550c4d08fcc518c7cbe747657ed18470f712a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Nov 2018 20:42:02 -0500 Subject: bcachefs: start erasure coding after journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ddfba16a2998..f00e327d4d35 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -214,11 +214,6 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - err = "cannot allocate memory"; - ret = bch2_fs_ec_start(c); - if (ret) - goto err; - bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_initial_gc(c, &journal); @@ -279,6 +274,11 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "quotas done"); } + err = "cannot allocate memory"; + ret = bch2_fs_ec_start(c); + if (ret) + goto err; + out: bch2_journal_entries_free(&journal); kfree(clean); -- cgit From f1a79365a7416c5046f88d0db025e1d84c32b252 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Nov 2018 21:35:59 -0500 Subject: bcachefs: Don't block on journal reservation with btree locks held Fixes a deadlock between the allocator thread, when it first starts up, and journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 +-- fs/bcachefs/btree_update_leaf.c | 38 ++++++++++++----- fs/bcachefs/journal.c | 84 ++++++++++++++++++++++++------------- fs/bcachefs/journal.h | 39 +++++++++-------- 4 files changed, 108 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 01e476d72595..af31819c88c7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -638,12 +638,12 @@ static void btree_update_wait_on_journal(struct closure *cl) int ret; ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); - if (ret < 0) - goto err; - if (!ret) { + if (ret == -EAGAIN) { 
continue_at(cl, btree_update_wait_on_journal, system_wq); return; } + if (ret < 0) + goto err; bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); err: diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 093e480977c7..41691bebf679 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -344,19 +344,35 @@ static inline int do_btree_insert_at(struct btree_insert *trans, trans_for_each_entry(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - u64s = 0; - trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? bch2_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; - if (ret) - return ret; + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s); + + while ((ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { + struct btree_iter *iter = trans->entries[0].iter; + + bch2_btree_iter_unlock(iter); + + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_CHECK); + if (ret) + return ret; + + if (!bch2_btree_iter_relock(iter)) { + trans_restart(" (iter relock after journal res get blocked)"); + return -EINTR; + } + } + + if (ret) + return ret; + } multi_lock_write(c, trans); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b4d037664628..5db0a469ac24 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -335,15 +335,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) } static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret; retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; + if (journal_res_get_fast(j, res, flags)) + return 0; spin_lock(&j->lock); /* @@ -351,10 +350,9 @@ retry: * that just did journal_entry_open() and call journal_entry_close() * unnecessarily */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { + if (journal_res_get_fast(j, res, flags)) { spin_unlock(&j->lock); - return 1; + return 0; } /* @@ -377,7 +375,12 @@ retry: spin_unlock(&j->lock); return -EROFS; case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ + /* + * The current journal entry is still open, but we failed to get + * a journal reservation because there's not enough space in it, + * and we can't close it and start another because we haven't + * finished writing out the previous entry: + */ spin_unlock(&j->lock); trace_journal_entry_full(c); goto blocked; @@ -408,7 +411,7 @@ retry: blocked: if (!j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; - return 0; + return -EAGAIN; } /* @@ -422,14 +425,14 @@ blocked: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { int ret; wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? 
ret : 0; + (ret = __journal_res_get(j, res, flags)) != -EAGAIN || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; } u64 bch2_journal_last_unwritten_seq(struct journal *j) @@ -453,28 +456,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j) * btree root - every journal entry contains the roots of all the btrees, so it * doesn't need to bother with getting a journal reservation */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { - int ret; - + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool need_reclaim = false; +retry: spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); if (seq < journal_cur_seq(j) || journal_entry_is_open(j)) { spin_unlock(&j->lock); - return 1; + return 0; + } + + if (journal_cur_seq(j) < seq) { + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EROFS; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + } + + BUG_ON(journal_cur_seq(j) < seq); + + if (!journal_entry_open(j)) { + need_reclaim = true; + goto blocked; } - ret = journal_entry_open(j); - if (!ret) - closure_wait(&j->async_wait, parent); spin_unlock(&j->lock); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + return 0; +blocked: + if (!j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; - return ret; + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + + if (need_reclaim) + bch2_journal_reclaim_work(&j->reclaim_work.work); + return -EAGAIN; } static int journal_seq_error(struct journal *j, u64 seq) @@ -594,11 +624,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) void bch2_journal_meta_async(struct journal *j, struct closure *parent) { struct journal_res res; - unsigned u64s = jset_u64s(0); memset(&res, 0, sizeof(res)); - bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_get(j, &res, jset_u64s(0), 0); bch2_journal_res_put(j, &res); bch2_journal_flush_seq_async(j, res.seq, parent); @@ -607,12 +636,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent) int bch2_journal_meta(struct journal *j) { struct journal_res res; - unsigned u64s = jset_u64s(0); int ret; memset(&res, 0, sizeof(res)); - ret = bch2_journal_res_get(j, &res, u64s, u64s); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 77cf39cc64ff..d9c094ba2ca0 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -272,12 +272,14 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, unsigned); + unsigned); + +#define JOURNAL_RES_GET_NONBLOCK (1 << 0) +#define JOURNAL_RES_GET_CHECK (1 << 1) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, - unsigned u64s_min, - unsigned u64s_max) + unsigned flags) { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -289,42 +291,45 @@ static inline int journal_res_get_fast(struct journal *j, * Check if there is still room in the current journal * entry: */ - if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; - res->offset = 
old.cur_entry_offset; - res->u64s = min(u64s_max, j->cur_entry_u64s - - old.cur_entry_offset); + if (flags & JOURNAL_RES_GET_CHECK) + return 1; - journal_state_inc(&new); new.cur_entry_offset += res->u64s; + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - res->ref = true; - res->idx = new.idx; - res->seq = le64_to_cpu(j->buf[res->idx].data->seq); + res->ref = true; + res->idx = old.idx; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); return 1; } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned u64s, unsigned flags) { int ret; EBUG_ON(res->ref); - EBUG_ON(u64s_max < u64s_min); EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - if (journal_res_get_fast(j, res, u64s_min, u64s_max)) + res->u64s = u64s; + + if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max); + ret = bch2_journal_res_get_slowpath(j, res, flags); if (ret) return ret; out: - lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); - EBUG_ON(!res->ref); + if (!(flags & JOURNAL_RES_GET_CHECK)) { + lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } return 0; } -- cgit From a9ec3454011f218e583832d93244057257c7dfe2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Nov 2018 18:32:16 -0500 Subject: bcachefs: Journal refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 8 +- fs/bcachefs/journal_io.c | 342 +++++++++++++++--------------------------- fs/bcachefs/journal_reclaim.c | 3 +- fs/bcachefs/journal_types.h | 1 - 4 files changed, 124 insertions(+), 230 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5db0a469ac24..939caa3b8183 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -134,6 +134,8 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + bkey_extent_init(&buf->key); + /* * We have to set last_seq here, _before_ opening a new journal entry: * @@ -890,10 +892,6 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) { - spin_lock(&j->lock); - bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx); - spin_unlock(&j->lock); - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); } @@ -1032,8 +1030,6 @@ int bch2_fs_journal_init(struct journal *j) j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - bkey_extent_init(&j->key); - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 6eea96ad03fb..60fc2eced71a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -426,7 +426,7 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, static int journal_read_bucket(struct bch_dev *ca, struct journal_read_buf *buf, struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) + unsigned bucket) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -511,7 +511,6 @@ reread: switch (ret) { case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; break; case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; @@ -519,9 +518,6 @@ reread: return ret; } - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - sectors = 
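A rough standalone sketch of the lockless scheme that journal_res_get_fast() above implements: the current entry offset and the reference count live in one 64-bit word and a reservation is claimed with compare-and-swap (the real code does this with union journal_res_state and atomic64_cmpxchg; the packing and field widths below are made up for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy journal state: low 32 bits = entry offset, high 32 bits = ref count */
static _Atomic uint64_t res_state;
static const uint32_t cur_entry_u64s = 512;	/* space in the open entry */

static bool toy_res_get_fast(uint32_t u64s, uint32_t *offset)
{
	uint64_t old = atomic_load(&res_state), new;

	do {
		uint32_t cur_offset = (uint32_t) old;
		uint32_t refs	    = (uint32_t) (old >> 32);

		if (cur_offset + u64s > cur_entry_u64s)
			return false;	/* entry full: caller takes the slow path */

		*offset = cur_offset;
		new = ((uint64_t) (refs + 1) << 32) | (cur_offset + u64s);
	} while (!atomic_compare_exchange_weak(&res_state, &old, new));

	return true;
}

int main(void)
{
	uint32_t off;

	if (toy_res_get_fast(16, &off))
		printf("reserved 16 u64s at offset %u\n", off);
	if (!toy_res_get_fast(1024, &off))
		printf("too big for the open entry, slow path needed\n");
	return 0;
}

The point of the JOURNAL_RES_GET_CHECK flag in the real version is to run exactly this size check without bumping the reference count, so a caller can wait for space with no locks held and only take the reservation afterwards.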
vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); @@ -535,124 +531,51 @@ next_block: static void bch2_journal_read_device(struct closure *cl) { -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); struct journal_read_buf buf = { NULL, 0 }; - unsigned long *bitmap; - unsigned i, l, r; - u64 seq = 0; + u64 min_seq = U64_MAX; + unsigned i; int ret; if (!ja->nr) goto out; - bitmap = kcalloc(BITS_TO_LONGS(ja->nr), ja->nr, GFP_KERNEL); - if (!bitmap) { - ret = -ENOMEM; - goto err; - } - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); if (ret) goto err; pr_debug("%u journal buckets", ja->nr); - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); + /* Find the journal bucket with the highest sequence number: */ + for (i = 0; i < ja->nr; i++) { + if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) + ja->cur_idx = i; - if (cur_seq != seq) - l = m; - else - r = m; + min_seq = min(ja->bucket_seq[i], min_seq); } -search_done: /* - * Find the journal bucket with the highest sequence number: - * * If there's duplicate journal entries in multiple buckets (which * definitely isn't supposed to happen, but...) 
- make sure to start * cur_idx at the last of those buckets, so we don't deadlock trying to * allocate */ - seq = 0; - - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } + while (ja->bucket_seq[ja->cur_idx] > min_seq && + ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx++; + + ja->sectors_free = 0; /* * Set last_idx to indicate the entire journal is full and needs to be @@ -660,20 +583,8 @@ search_done: * pinned when it first runs: */ ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; out: kvpfree(buf.data, buf.size); - kfree(bitmap); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -682,7 +593,6 @@ err: jlist->ret = ret; mutex_unlock(&jlist->lock); goto out; -#undef read_bucket } void bch2_journal_entries_free(struct list_head *list) @@ -937,32 +847,18 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf, } static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) + struct journal_device *ja) { - struct journal_device *ja = &ca->journal; unsigned next = (ja->cur_idx + 1) % ja->nr; unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
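For reference, the ring arithmetic in the reworked journal_dev_buckets_available() can be checked in isolation. A small sketch with made-up numbers; the last parameter stands in for the "journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]" test in the real function:

#include <stdio.h>

/* Same modular arithmetic as journal_dev_buckets_available(), standalone */
static unsigned buckets_available(unsigned nr, unsigned cur_idx,
				  unsigned last_idx, int last_bucket_pinned)
{
	unsigned next	   = (cur_idx + 1) % nr;
	unsigned available = (last_idx + nr - next) % nr;

	/* don't count the last bucket if writing a new last_seq wouldn't free it */
	if (available && last_bucket_pinned)
		--available;

	return available;
}

int main(void)
{
	/* 8 buckets, currently writing bucket 6, oldest live entry in bucket 2 */
	printf("available = %u\n", buckets_available(8, 6, 2, 0));	/* -> 3 */
	printf("available = %u\n", buckets_available(8, 6, 2, 1));	/* -> 2 */
	return 0;
}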
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); + if (available && + journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + --available; return available; } @@ -972,7 +868,6 @@ int bch2_journal_entry_sectors(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); unsigned sectors_available = UINT_MAX; unsigned i, nr_online = 0, nr_devs = 0; @@ -982,38 +877,39 @@ int bch2_journal_entry_sectors(struct journal *j) for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_JOURNAL]) { struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; + unsigned buckets_this_device, sectors_this_device; if (!ja->nr) continue; - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); + buckets_this_device = journal_dev_buckets_available(j, ja); + sectors_this_device = ja->sectors_free; + + nr_online++; /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; + if (j->prev_buf_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; } - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; + sectors_this_device -= j->prev_buf_sectors; + + if (buckets_this_device) + sectors_this_device = ca->mi.bucket_size; + + if (!sectors_this_device) + continue; + + sectors_available = min(sectors_available, + sectors_this_device); + nr_devs++; } rcu_read_unlock(); @@ -1026,106 +922,111 @@ int bch2_journal_entry_sectors(struct journal *j) return sectors_available; } -/** - * journal_next_bucket - move on to the next journal bucket if possible - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; + struct bkey_i_extent *e = bkey_i_to_extent(&w->key); struct journal_device *ja; struct bch_dev *ca; - struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't 
enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. - */ - bch2_extent_drop_ptrs(e, ptr, ({ - ca = bch_dev_bkey_exists(c, ptr->dev); - - ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors; - })); - - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + unsigned i; - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors); - ca->journal.sectors_free -= sectors; - } - - replicas = bch2_extent_nr_ptrs(e.c); - - rcu_read_lock(); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_JOURNAL]); + if (*replicas >= replicas_want) + return; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); if (!ca) continue; - if (!ca->mi.durability) - continue; - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; /* * Check that we can use this device, and aren't already using * it: */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_RW || + !ja->nr || + bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) || + sectors > ja->sectors_free) continue; bch2_dev_stripe_increment(c, ca, &j->wp.stripe); - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - - extent_ptr_append(bkey_i_to_extent(&j->key), + extent_ptr_append(e, (struct bch_extent_ptr) { .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, .dev = ca->dev_idx, }); - replicas += ca->mi.durability; + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + *replicas += ca->mi.durability; + + if (*replicas >= replicas_want) + break; } - rcu_read_unlock(); +} - j->prev_buf_sectors = 0; +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned i, replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); + rcu_read_lock(); - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, + &c->rw_devs[BCH_DATA_JOURNAL]); - BUG_ON(!replicas); + spin_lock(&j->lock); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); - return 0; + if (replicas >= replicas_want) + goto done; + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + journal_dev_buckets_available(j, ja)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + } + } + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +done: + if (replicas >= replicas_want) + j->prev_buf_sectors = 0; + + 
spin_unlock(&j->lock); + rcu_read_unlock(); + + return replicas >= replicas_want ? 0 : -EROFS; } static void journal_write_compact(struct jset *jset) @@ -1376,9 +1277,6 @@ void bch2_journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; - bch2_bucket_seq_cleanup(c); continue_at(cl, journal_write_done, system_highpri_wq); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e1d5d41ba118..6ada63f1bb25 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -126,7 +126,8 @@ void bch2_journal_reclaim_fast(struct journal *j) * Unpin journal entries whose reference counts reached zero, meaning * all btree nodes got written out */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); BUG_ON(!fifo_pop(&j->pin, temp)); popped = true; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8502a930a05e..51e453652d67 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -185,7 +185,6 @@ struct journal { struct list_head seq_blacklist; struct journal_seq_blacklist *new_blacklist; - BKEY_PADDED(key); struct write_point wp; spinlock_t err_lock; -- cgit From 61274e9d45547e741cfafc80fb78a81275c8394a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Nov 2018 23:20:21 -0500 Subject: bcachefs: Allocator startup improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 83 +++++++++++++++++++++++++----------------- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 12 ++++++ fs/bcachefs/journal_io.c | 9 ++--- 5 files changed, 68 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b49d0cd84b78..c17fba1eae96 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -237,6 +237,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; struct bucket *g; struct bkey_i_alloc *a; + int ret; u8 *d; percpu_down_read(&c->usage_lock); @@ -260,32 +261,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, bch2_btree_iter_set_pos(iter, a->k.p); - return bch2_btree_insert_at(c, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_btree_insert_at(c, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + + if (!ret && ca->buckets_written) + set_bit(b, ca->buckets_written); + + return ret; } -int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) { struct bch_dev *ca; struct btree_iter iter; int ret; - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + if (k->k.p.inode >= c->sb.nr_devices || + !c->devs[k->k.p.inode]) return 0; - ca = bch_dev_bkey_exists(c, pos.inode); + ca = bch_dev_bkey_exists(c, k->k.p.inode); - if (pos.offset >= ca->mi.nbuckets) + if (k->k.p.offset >= ca->mi.nbuckets) return 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p, + 
BTREE_ITER_INTENT); - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + /* check buckets_written with btree node locked: */ + + ret = test_bit(k->k.p.offset, ca->buckets_written) + ? 0 + : bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY, + BTREE_INSERT_ENTRY(&iter, k)); +err: bch2_btree_iter_unlock(&iter); return ret; } @@ -1284,51 +1303,49 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { - struct btree_iter iter; + struct bucket_array *buckets; struct bucket_mark m; - struct bkey_s_c k; - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { - if (k.k->type != BCH_ALLOC) - continue; + down_read(&ca->bucket_lock); + percpu_down_read(&c->usage_lock); + + buckets = bucket_array(ca); - bu = k.k->p.offset; - m = READ_ONCE(bucket(ca, bu)->mark); + for (bu = buckets->first_bucket; + bu < buckets->nbuckets; bu++) { + m = READ_ONCE(buckets->b[bu].mark); - if (!is_available_bucket(m) || m.cached_sectors) + if (!m.gen_valid || + !is_available_bucket(m) || + m.cached_sectors) continue; - percpu_down_read(&c->usage_lock); bch2_mark_alloc_bucket(c, ca, bu, true, gc_pos_alloc(c, NULL), BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); - percpu_up_read(&c->usage_lock); fifo_push(&ca->free_inc, bu); - if (fifo_full(&ca->free_inc)) + discard_invalidated_buckets(c, ca); + + if (fifo_full(&ca->free[RESERVE_BTREE])) break; } - bch2_btree_iter_unlock(&iter); + percpu_up_read(&c->usage_lock); + up_read(&ca->bucket_lock); } /* did we find enough buckets? */ for_each_rw_member(ca, c, dev_iter) - if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { + if (!fifo_full(&ca->free[RESERVE_BTREE])) { percpu_ref_put(&ca->io_ref); goto not_enough; } return 0; not_enough: - pr_debug("did not find enough empty buckets; issuing discards"); - - /* clear out free_inc, we'll be using it again below: */ - for_each_rw_member(ca, c, dev_iter) - discard_invalidated_buckets(c, ca); - - pr_debug("scanning for reclaimable buckets"); + pr_debug("not enough empty buckets; scanning for reclaimable buckets"); for_each_rw_member(ca, c, dev_iter) { find_reclaimable_buckets(c, ca); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 99535fa60214..59b6a5f2f890 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -17,7 +17,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); } int bch2_alloc_read(struct bch_fs *, struct list_head *); -int bch2_alloc_replay_key(struct bch_fs *, struct bpos); +int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); static inline void bch2_wake_allocator(struct bch_dev *ca) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b33fbf709705..cdea3a1d9176 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -394,6 +394,7 @@ struct bch_dev { */ struct bucket_array __rcu *buckets; unsigned long *buckets_dirty; + unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; struct rw_semaphore bucket_lock; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9558129e77ba..201798866242 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1096,6 +1096,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, 
*old_buckets = NULL; unsigned long *buckets_dirty = NULL; + unsigned long *buckets_written = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; @@ -1127,6 +1128,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || + !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || @@ -1161,6 +1165,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets_dirty, ca->buckets_dirty, BITS_TO_LONGS(n) * sizeof(unsigned long)); + memcpy(buckets_written, + ca->buckets_written, + BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets, buckets); @@ -1168,6 +1175,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) swap(ca->oldest_gens, oldest_gens); swap(ca->buckets_dirty, buckets_dirty); + swap(ca->buckets_written, buckets_written); if (resize) percpu_up_write(&c->usage_lock); @@ -1207,6 +1215,8 @@ err: free_fifo(&free[i]); kvpfree(buckets_dirty, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvpfree(buckets_written, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(oldest_gens, nbuckets * sizeof(u8)); if (buckets) @@ -1224,6 +1234,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + kvpfree(ca->buckets_written, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 60fc2eced71a..a74566764630 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -780,7 +780,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) int ret = 0; list_for_each_entry_safe(i, n, list, list) { - j->replay_journal_seq = le64_to_cpu(i->j.seq); for_each_jset_key(k, _n, entry, &i->j) { @@ -790,7 +789,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) * allocation code handles replay for * BTREE_ID_ALLOC keys: */ - ret = bch2_alloc_replay_key(c, k->k.p); + ret = bch2_alloc_replay_key(c, k); } else { /* * We might cause compressed extents to be @@ -801,9 +800,9 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) bch2_disk_reservation_init(c, 0); ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); + &disk_res, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); } if (ret) { -- cgit From 8812600c2953bf7e394ad11d44c7d71fbdd719cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Nov 2018 02:59:07 -0500 Subject: bcachefs: fix btree iterator bug when using depth > 0 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ea37fa21ed6e..a50a6a51a3a5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -818,7 +818,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, */ iter->level = depth_want; iter->l[iter->level].b = NULL; - return 0; + return 1; } lock_type = 
__btree_lock_want(iter, iter->level); @@ -1045,6 +1045,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ? btree_iter_down(iter) : btree_iter_lock_root(iter, depth_want); if (unlikely(ret)) { + if (ret == 1) + return 0; + iter->level = depth_want; iter->l[iter->level].b = BTREE_ITER_NOT_END; return ret; -- cgit From e647369168e02a06ff5ee229cc14ad72b2f5ddfd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Nov 2018 07:16:28 -0500 Subject: bcachefs: fix mempool double initialization Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 42ae4cfdcb6b..6b5b61f10fcb 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -596,11 +596,13 @@ have_compressed: goto out; } - ret = mempool_init_kmalloc_pool( - &c->decompress_workspace, - 1, decompress_workspace_size); - if (ret) - goto out; + if (!mempool_initialized(&c->decompress_workspace)) { + ret = mempool_init_kmalloc_pool( + &c->decompress_workspace, + 1, decompress_workspace_size); + if (ret) + goto out; + } out: pr_verbose_init(c->opts, "ret %i", ret); return ret; -- cgit From 9ca53b55f7415783c6cc8b751c99f2af6cc0a932 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jul 2018 05:32:01 -0400 Subject: bcachefs: gc now operates on second set of bucket marks This means we can now use gc to verify the allocation information - important for testing persistant alloc info Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 13 +- fs/bcachefs/bcachefs.h | 14 +- fs/bcachefs/btree_gc.c | 399 ++++++++++++++++++++++++------------ fs/bcachefs/btree_gc.h | 6 +- fs/bcachefs/btree_update_interior.c | 50 ++--- fs/bcachefs/buckets.c | 340 ++++++++++++++---------------- fs/bcachefs/buckets.h | 29 ++- fs/bcachefs/buckets_types.h | 6 +- fs/bcachefs/journal.c | 4 +- fs/bcachefs/super.c | 15 +- fs/bcachefs/sysfs.c | 2 +- 11 files changed, 495 insertions(+), 383 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c17fba1eae96..3f0e2dd29fde 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg) pr_debug("free_inc now empty"); do { - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - bch_err(ca, "gc failure"); - goto stop; - } - /* * Find some buckets that we can invalidate, either * they're completely unused, or only contain clean data @@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) bool invalidating_data = false; int ret = 0; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return -1; - if (test_alloc_startup(c)) { invalidating_data = true; goto not_enough; @@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) continue; bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + gc_pos_alloc(c, NULL), 0); fifo_push(&ca->free_inc, bu); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cdea3a1d9176..eaa2055000b6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -347,7 +347,6 @@ enum gc_phase { GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, - GC_PHASE_DONE }; struct gc_pos { @@ -392,15 +391,14 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for 
ptr_stale(): */ - struct bucket_array __rcu *buckets; + struct bucket_array __rcu *buckets[2]; unsigned long *buckets_dirty; unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; + struct bch_dev_usage __percpu *usage[2]; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -478,7 +476,6 @@ enum { /* errors: */ BCH_FS_ERROR, - BCH_FS_GC_FAILURE, /* misc: */ BCH_FS_BDEV_MOUNTED, @@ -614,8 +611,8 @@ struct bch_fs { atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage_percpu; - struct bch_fs_usage usage_cached; + struct bch_fs_usage __percpu *usage[2]; + struct percpu_rw_semaphore usage_lock; struct closure_waitlist freelist_wait; @@ -656,9 +653,6 @@ struct bch_fs { * * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) * - * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not - * currently running, and gc marks are currently valid - * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e900fd4ffd06..6eba65fcb52c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, { struct gc_pos pos = { 0 }; unsigned flags = - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD| + BCH_BUCKET_MARK_GC| (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); int ret = 0; @@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, BCH_DATA_SB, flags); } - if (c) - spin_lock(&c->journal.lock); - for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, @@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, } if (c) { - spin_unlock(&c->journal.lock); percpu_up_read(&c->usage_lock); } else { preempt_enable(); @@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); mutex_unlock(&c->sb_lock); } @@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c) static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { struct gc_pos pos = { 0 }; - struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), true, 0, - pos, &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + pos, NULL, 0, + BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); @@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - 
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&c->freelist_lock); @@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&ob->lock); } @@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) percpu_up_read(&c->usage_lock); } -static void bch2_gc_start(struct bch_fs *c) +static void bch2_gc_free(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets[1] = NULL; + + free_percpu(ca->usage[1]); + ca->usage[1] = NULL; + } + + free_percpu(c->usage[1]); + c->usage[1] = NULL; +} + +static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket_mark new; unsigned i; - size_t b; int cpu; - percpu_down_write(&c->usage_lock); + for_each_member_device(ca, c, i) { + struct bucket_array *src = __bucket_array(ca, 1); - /* - * Indicates to buckets code that gc is now in progress - done under - * usage_lock to avoid racing with bch2_mark_key(): - */ - __gc_pos_set(c, gc_phase(GC_PHASE_START)); + memcpy(__bucket_array(ca, 0), src, + sizeof(struct bucket_array) + + sizeof(struct bucket) * src->nbuckets); + }; - /* Save a copy of the existing bucket stats while we recompute them: */ for_each_member_device(ca, c, i) { - ca->usage_cached = __bch2_dev_usage_read(ca); + struct bch_dev_usage *p; + for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); + p = per_cpu_ptr(ca->usage[0], cpu); memset(p, 0, sizeof(*p)); } + + preempt_disable(); + *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1); + preempt_enable(); } - c->usage_cached = __bch2_fs_usage_read(c); - for_each_possible_cpu(cpu) { - struct bch_fs_usage *p = - per_cpu_ptr(c->usage_percpu, cpu); + { + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; - memset(p->replicas, 0, sizeof(p->replicas)); - memset(p->buckets, 0, sizeof(p->buckets)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + memcpy(this_cpu_ptr(c->usage[0]), + &src, + offsetof(typeof(*p), online_reserved)); + preempt_enable(); } +} + +static void bch2_gc_done(struct bch_fs *c, bool initial) +{ + struct bch_dev *ca; + unsigned i; + int cpu; + +#define copy_field(_f, _msg, ...) \ + if (dst._f != src._f) { \ + pr_info(_msg ": got %llu, should be %llu, fixing" \ + , ##__VA_ARGS__, dst._f, src._f); \ + dst._f = src._f; \ + } +#define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + pr_info("dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u, fixing", \ + i, b, dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) 
\ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + percpu_down_write(&c->usage_lock); + + if (initial) { + bch2_gc_done_nocheck(c); + goto out; + } + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; + + if (initial) { + memcpy(dst, src, + sizeof(struct bucket_array) + + sizeof(struct bucket) * dst->nbuckets); + } + + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(owned_by_allocator); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + } + }; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0); + struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1); + struct bch_dev_usage *p; + unsigned b; + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + copy_dev_field(buckets_alloc, "buckets_alloc"); + copy_dev_field(buckets_ec, "buckets_ec"); + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(sectors[b], + "sectors[%s]", bch2_data_types[b]); + copy_dev_field(sectors_fragmented, + "sectors_fragmented"); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ca->usage[0], cpu); + memset(p, 0, sizeof(*p)); + } + + preempt_disable(); + p = this_cpu_ptr(ca->usage[0]); + *p = dst; + preempt_enable(); + } + + { + struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; + unsigned r, b; + + for (r = 0; r < BCH_REPLICAS_MAX; r++) { + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(replicas[r].data[b], + "replicas[%i].data[%s]", + r, bch2_data_types[b]); + copy_fs_field(replicas[r].ec_data, + "replicas[%i].ec_data", r); + copy_fs_field(replicas[r].persistent_reserved, + "replicas[%i].persistent_reserved", r); + } + + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + p = this_cpu_ptr(c->usage[0]); + memcpy(p, &dst, offsetof(typeof(*p), online_reserved)); + preempt_enable(); + } +out: percpu_up_write(&c->usage_lock); - /* Clear bucket marks: */ +#undef copy_field +#undef copy_fs_field +#undef copy_dev_field +#undef copy_bucket_field +} + +static int bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + BUG_ON(c->usage[1]); + + c->usage[1] = alloc_percpu(struct bch_fs_usage); + if (!c->usage[1]) + return -ENOMEM; + for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - bucket_cmpxchg(buckets->b + b, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.stripe = 0; - })); - ca->oldest_gens[b] = new.gen; + BUG_ON(ca->buckets[1]); + BUG_ON(ca->usage[1]); + + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; + } + + ca->usage[1] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; } - up_read(&ca->bucket_lock); } + + percpu_down_write(&c->usage_lock); + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = 
__bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); + size_t b; + + dst->first_bucket = src->first_bucket; + dst->nbuckets = src->nbuckets; + + for (b = 0; b < src->nbuckets; b++) + dst->b[b]._mark.gen = src->b[b].mark.gen; + }; + + percpu_up_write(&c->usage_lock); + + return 0; } /** - * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. + * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them */ -void bch2_gc(struct bch_fs *c) +int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i; + unsigned i, iter = 0; int ret; - /* - * Walk _all_ references to buckets, and recompute them: - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. 
- * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ trace_gc_start(c); - /* - * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on - * gc_lock if sectors_available goes to 0: - */ - bch2_recalc_sectors_available(c); - down_write(&c->gc_lock); - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) +again: + ret = bch2_gc_start(c); + if (ret) goto out; - bch2_gc_start(c); - bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, NULL, false); - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); + ret = bch2_gc_btrees(c, journal, initial); + if (ret) goto out; - } bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); - /* Indicates that gc is no longer in progress: */ - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; out: + if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) { + /* + * XXX: make sure gens we fixed got saved + */ + if (iter++ <= 2) { + bch_info(c, "Fixed gens, restarting mark and sweep:"); + clear_bit(BCH_FS_FIXED_GENS, &c->flags); + goto again; + } + + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + } + + if (!ret) + bch2_gc_done(c, initial); + + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_gc_free(c); up_write(&c->gc_lock); + + if (!ret && initial) + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); @@ -724,6 +896,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + return ret; } /* Btree coalescing */ @@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c) { enum btree_id id; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - down_read(&c->gc_lock); trace_gc_coalesce_start(c); @@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c) if (ret) { if (ret != -ESHUTDOWN) bch_err(c, "btree coalescing failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); return; } } @@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic_long_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); + int ret; set_freezable(); @@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - bch2_gc(c); + ret = bch2_gc(c, NULL, false); + if (ret) + bch_err(c, "btree gc failed: %i", ret); debug_check_no_locks_held(); } @@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c) int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { - unsigned iter = 0; - int ret = 0; - - down_write(&c->gc_lock); -again: - bch2_gc_start(c); - - bch2_mark_superblocks(c); - - ret = bch2_gc_btrees(c, journal, true); - if (ret) - goto err; - - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto err; - } - - bch_info(c, "Fixed gens, restarting initial mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); - goto again; - } + int ret = bch2_gc(c, journal, true); /* * Skip past versions that might have possibly been used (as nonces), @@ -1174,9 +1323,5 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -err: - 
up_write(&c->gc_lock); return ret; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 47a590015325..bb77564b9463 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -7,7 +7,7 @@ enum bkey_type; void bch2_coalesce(struct bch_fs *); -void bch2_gc(struct bch_fs *); +int bch2_gc(struct bch_fs *, struct list_head *, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); @@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o }; } -static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; bool ret; do { seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(c->gc_pos, pos) < 0; + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); return ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index af31819c88c7..2631b0732d4b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - unsigned replicas; /* * btree_update lock is only needed here to avoid racing with @@ -178,15 +177,6 @@ found: BUG_ON(d->index_update_done); d->index_update_done = true; - /* - * Btree nodes are accounted as freed in bch_alloc_stats when they're - * freed from the index: - */ - replicas = bch2_extent_nr_dirty_ptrs(k); - if (replicas) - stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= - c->opts.btree_node_size * replicas; - /* * We're dropping @k from the btree, but it's still live until the * index update is persistent so we need to keep a reference around for @@ -208,15 +198,16 @@ found: * bch2_mark_key() compares the current gc pos to the pos we're * moving this reference from, hence one comparison here: */ - if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct bch_fs_usage tmp = { 0 }; + if (gc_pos_cmp(c->gc_pos, b + ? gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id)) >= 0 && + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { + struct gc_pos pos = { 0 }; bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), - false, 0, b - ? 
gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); + false, 0, pos, + NULL, 0, BCH_BUCKET_MARK_GC); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, static void bch2_btree_node_free_ondisk(struct bch_fs *c, struct pending_btree_node_free *pending) { - struct bch_fs_usage stats = { 0 }; - BUG_ON(!pending->index_update_done); bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&pending->key), false, 0, gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + NULL, 0, 0); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, btree_interior_update_add_node_reference(as, b); + /* + * XXX: the rest of the update path treats this like we're actually + * inserting a new node and deleting the existing node, so the + * reservation needs to include enough space for @b + * + * that is actually sketch as fuck though and I am surprised the code + * seems to work like that, definitely need to go back and rework it + * into something saner. + * + * (I think @b is just getting double counted until the btree update + * finishes and "deletes" @b on disk) + */ + ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, + c->opts.btree_node_size * + bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)), + BCH_DISK_RESERVATION_NOFAIL| + BCH_DISK_RESERVATION_GC_LOCK_HELD); + BUG_ON(ret); + parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 201798866242..2ebe8bad978e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); static void bch2_fs_stats_verify(struct bch_fs *c) { - struct bch_fs_usage stats = - __bch2_fs_usage_read(c); + struct bch_fs_usage stats =_bch2_fs_usage_read(c); unsigned i, j; for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { @@ -209,43 +208,24 @@ do { \ _acc; \ }) -#define bch2_usage_read_cached(_c, _cached, _uncached) \ -({ \ - typeof(_cached) _ret; \ - unsigned _seq; \ - \ - do { \ - _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ - _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ - ? 
bch2_usage_read_raw(_uncached) \ - : (_cached); \ - } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ - \ - _ret; \ -}) - -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc) { - return bch2_usage_read_raw(ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[gc]); } struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[0]); } -struct bch_fs_usage -__bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc) { - return bch2_usage_read_raw(c->usage_percpu); + return bch2_usage_read_raw(c->usage[gc]); } -struct bch_fs_usage -bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) { - return bch2_usage_read_cached(c, - c->usage_cached, - c->usage_percpu); + return bch2_usage_read_raw(c->usage[0]); } struct fs_usage_sum { @@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bch_fs *c, - struct bucket_mark old, +static bool bucket_became_unavailable(struct bucket_mark old, struct bucket_mark new) { return is_available_bucket(old) && - !is_available_bucket(new) && - (!c || c->gc_pos.phase == GC_PHASE_DONE); + !is_available_bucket(new); } void bch2_fs_usage_apply(struct bch_fs *c, @@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c, percpu_down_read(&c->usage_lock); preempt_disable(); /* online_reserved not subject to gc: */ - this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved); + this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved); stats->online_reserved = 0; - if (!gc_will_visit(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + bch2_usage_add(this_cpu_ptr(c->usage[0]), stats); + + if (gc_visited(c, gc_pos)) + bch2_usage_add(this_cpu_ptr(c->usage[1]), stats); bch2_fs_stats_verify(c); preempt_enable(); @@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_fs_usage *stats, - struct bucket_mark old, struct bucket_mark new) + struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, + bool gc) { struct bch_dev_usage *dev_usage; @@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type], bch2_data_types[new.data_type]); - stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; - stats->buckets[bucket_type(new)] += ca->mi.bucket_size; - preempt_disable(); - dev_usage = this_cpu_ptr(ca->usage_percpu); + dev_usage = this_cpu_ptr(ca->usage[gc]); - dev_usage->buckets[bucket_type(old)]--; - dev_usage->buckets[bucket_type(new)]++; + if (bucket_type(old) != bucket_type(new)) { + if (bucket_type(old)) { + fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; + dev_usage->buckets[bucket_type(old)]--; + } else { + fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; + dev_usage->buckets[bucket_type(new)]++; + } + } dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; @@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, stats, _old, new); \ + bch2_dev_usage_update(c, ca, 
stats, _old, new, gc); \ _old; \ }) -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) +static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new; - percpu_rwsem_assert_held(&c->usage_lock); - - g = bucket(ca, b); - *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ BUG_ON(!is_available_bucket(new)); @@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - /* - * This isn't actually correct yet, since fs usage is still - * uncompressed sectors: - */ stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; +} + +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + __bch2_invalidate_bucket(c, ca, b, old, false); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - struct gc_pos pos, unsigned flags) +static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - percpu_rwsem_assert_held(&c->usage_lock); - g = bucket(ca, b); - - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); - BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - c->gc_pos.phase == GC_PHASE_DONE); + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); +} + +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true); } #define checked_add(a, b) \ @@ -491,37 +484,49 @@ do { \ BUG_ON((a) != _res); \ } while (0) +static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) +{ + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + fs_usage->replicas[0].data[type] += sectors; +} + void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bch_fs_usage *stats; - struct bucket *g; - struct bucket_mark old, new; - BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); + preempt_disable(); + if (likely(c)) { percpu_rwsem_assert_held(&c->usage_lock); - 
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - - preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); - - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); - - stats->replicas[0].data[type] += sectors; - preempt_enable(); + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + false); + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + true); } else { + struct bucket *g; + struct bucket_mark old, new; + rcu_read_lock(); g = bucket(ca, b); @@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, rcu_read_unlock(); } - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + preempt_enable(); } static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) @@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr); + size_t b = PTR_BUCKET_NR(ca, &p.ptr); + struct bucket *g = __bucket(ca, b, gc); u64 v; - if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { - if (journal_seq) - bucket_cmpxchg(g, new, ({ - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - })); - - return; - } - v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; @@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + BUG_ON(!gc && bucket_became_unavailable(old, new)); } static void bch2_mark_stripe_ptr(struct bch_fs *c, @@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c, static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, - struct gc_pos pos, struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { BUG_ON(!sectors); @@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, - stats, journal_seq, flags); + stats, journal_seq, flags, gc); if (!p.ptr.cached) for (i = 0; i < p.ec_nr; i++) @@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, bool enabled, struct bch_fs_usage *fs_usage, - u64 journal_seq) + u64 journal_seq, + bool gc) { unsigned i; for (i = 0; i < v->nr_blocks; i++) { const struct bch_extent_ptr *ptr = v->ptrs + i; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new, old; BUG_ON(ptr_stale(ca, ptr)); - rcu_read_lock(); - g = PTR_BUCKET(ca, ptr); - old = bucket_cmpxchg(g, new, ({ new.stripe = enabled; if (journal_seq) { @@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c, new.journal_seq = journal_seq; } })); - rcu_read_unlock(); 
BUG_ON(old.stripe == enabled); - bch2_dev_usage_update(c, ca, fs_usage, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); } } static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct gc_pos pos, + bool inserting, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { switch (k.k->type) { case BCH_STRIPE: { @@ -820,74 +814,64 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, else bch2_stripes_heap_del(c, m, idx); - bucket_set_stripe(c, s.v, inserting, fs_usage, 0); + bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); break; } } } -void bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void __bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags, + bool gc) { - /* - * synchronization w.r.t. GC: - * - * Normally, bucket sector counts/marks are updated on the fly, as - * references are added/removed from the btree, the lists of buckets the - * allocator owns, other metadata buckets, etc. - * - * When GC is in progress and going to mark this reference, we do _not_ - * mark this reference here, to avoid double counting - GC will count it - * when it gets to it. - * - * To know whether we should mark a given reference (GC either isn't - * running, or has already marked references at this position) we - * construct a total order for everything GC walks. Then, we can simply - * compare the position of the reference we're marking - @pos - with - * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @pos; if GC's current position is - * greater than @pos GC has either already walked this position, or - * isn't running. - * - * To avoid racing with GC's position changing, we have to deal with - * - GC's position being set to GC_POS_MIN when GC starts: - * usage_lock guards against this - * - GC's position overtaking @pos: we guard against this with - * whatever lock protects the data structure the reference lives in - * (e.g. the btree node lock, or the relevant allocator lock). - */ - - percpu_down_read(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; - - if (!stats) - stats = this_cpu_ptr(c->usage_percpu); - switch (type) { case BKEY_TYPE_BTREE: bch2_mark_extent(c, k, inserting ? 
c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EXTENTS: bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EC: bch2_mark_stripe(c, k, inserting, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; default: break; } +} + +void bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + percpu_down_read(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) { + if (!stats) + stats = this_cpu_ptr(c->usage[0]); + + __bch2_mark_key(c, type, k, inserting, sectors, + stats, journal_seq, flags, false); + } + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) { + __bch2_mark_key(c, type, k, inserting, sectors, + this_cpu_ptr(c->usage[1]), + journal_seq, flags, true); + } + percpu_up_read(&c->usage_lock); } @@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans, /* Disk reservations: */ -static u64 __recalc_sectors_available(struct bch_fs *c) +static u64 bch2_recalc_sectors_available(struct bch_fs *c) { int cpu; for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; + per_cpu_ptr(c->usage[0], cpu)->available_cache = 0; return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); } -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) -{ - percpu_down_write(&c->usage_lock); - atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); - percpu_up_write(&c->usage_lock); -} - void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); bch2_fs_stats_verify(c); @@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, percpu_down_read(&c->usage_lock); preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); + stats = this_cpu_ptr(c->usage[0]); if (sectors <= stats->available_cache) goto out; @@ -1055,7 +1031,7 @@ recalculate: } percpu_down_write(&c->usage_lock); - sectors_available = __recalc_sectors_available(c); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { @@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve); - bool resize = ca->buckets != NULL, + bool resize = ca->buckets[0] != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; unsigned i; @@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets, buckets); + rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; swap(ca->oldest_gens, oldest_gens); @@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca) kvpfree(ca->buckets_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - kvpfree(rcu_dereference_protected(ca->buckets, 1), + kvpfree(rcu_dereference_protected(ca->buckets[0], 1), 
sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage_percpu); + free_percpu(ca->usage[0]); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) return -ENOMEM; return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index b48960fa5ce7..813e0c44e107 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -29,23 +29,34 @@ _old; \ }) -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline struct bucket_array *__bucket_array(struct bch_dev *ca, + bool gc) { - return rcu_dereference_check(ca->buckets, + return rcu_dereference_check(ca->buckets[gc], !ca->fs || percpu_rwsem_is_held(&ca->fs->usage_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +static inline struct bucket_array *bucket_array(struct bch_dev *ca) +{ + return __bucket_array(ca, false); +} + +static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) { - struct bucket_array *buckets = bucket_array(ca); + struct bucket_array *buckets = __bucket_array(ca, gc); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, false); +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark) /* Device usage: */ -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, @@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); @@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) -#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 1) void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); -void bch2_recalc_sectors_available(struct bch_fs *); - void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9ec96dbab0e8..0187f465d23f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -64,8 +64,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - u64 online_reserved; - u64 available_cache; struct { u64 data[BCH_DATA_NR]; @@ -74,6 +72,10 @@ struct 
bch_fs_usage { } replicas[BCH_REPLICAS_MAX]; u64 buckets[BCH_DATA_NR]; + + /* fields starting here aren't touched by gc: */ + u64 online_reserved; + u64 available_cache; }; /* diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 939caa3b8183..4045c0e68462 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), - new_fs - ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE - : 0); + 0); if (c) { spin_unlock(&c->journal.lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 931e50e8ad57..59f2aa7e047c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->usage_lock); - free_percpu(c->usage_percpu); + free_percpu(c->usage[0]); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || percpu_init_rwsem(&c->usage_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || @@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); bch2_dev_sysfs_online(c, ca); @@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca) for_each_possible_cpu(cpu) { struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); + per_cpu_ptr(ca->usage[0], cpu); memset(p, 0, sizeof(*p)); } @@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... 
*/ - bch2_mark_dev_superblock(ca->fs, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(ca->fs, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); @@ -1435,8 +1433,7 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(c, ca, 0); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 188e19572d91..8eacc0d2550b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -478,7 +478,7 @@ STORE(__bch2_fs) bch2_coalesce(c); if (attr == &sysfs_trigger_gc) - bch2_gc(c); + bch2_gc(c, NULL, false); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit From e88973373aaabebba6e59ff0ff74333eacd7bffb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Nov 2018 23:05:13 -0500 Subject: bcachefs: Allow for new alloc fields Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3f0e2dd29fde..390b008b0200 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -84,7 +84,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) case BCH_ALLOC: { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) + /* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) return "incorrect value size"; break; } -- cgit From 4e65431c855e959700cc9456f305fcfd94ee6241 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Nov 2018 02:50:33 -0500 Subject: Revert "bcachefs: start erasure coding after journal replay" This reverts commit 36f389604294dfc953e6f5624ceb683818d32f28. 
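As the diff below shows, the net effect is that bch2_fs_ec_start() is once again called before the initial mark and sweep in bch2_fs_recovery(), rather than at the end of recovery after fsck and the quota checks.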
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f00e327d4d35..ddfba16a2998 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -214,6 +214,11 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + err = "cannot allocate memory"; + ret = bch2_fs_ec_start(c); + if (ret) + goto err; + bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_initial_gc(c, &journal); @@ -274,11 +279,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "quotas done"); } - err = "cannot allocate memory"; - ret = bch2_fs_ec_start(c); - if (ret) - goto err; - out: bch2_journal_entries_free(&journal); kfree(clean); -- cgit From de5bb710f93fb87aef8303336a49d09323286822 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Nov 2018 02:06:18 -0500 Subject: bcachefs: shim for userspace raid library Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 127 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f6314aa6a0f1..727324f15f43 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -16,9 +16,79 @@ #include "super-io.h" #include "util.h" +#include + +#ifdef __KERNEL__ + #include #include -#include + +static void raid5_recov(unsigned disks, unsigned failed_idx, + size_t size, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed_idx >= disks); + + swap(data[0], data[failed_idx]); + memcpy(data[0], data[1], size); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, size, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed_idx]); +} + +static void raid_gen(int nd, int np, size_t size, void **v) +{ + if (np >= 1) + raid5_recov(nd + np, nd, size, v); + if (np >= 2) + raid6_call.gen_syndrome(nd + np, size, v); + BUG_ON(np > 2); +} + +static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) +{ + switch (nr) { + case 0: + break; + case 1: + if (ir[0] < nd + 1) + raid5_recov(nd + 1, ir[0], size, v); + else + raid6_call.gen_syndrome(nd + np, size, v); + break; + case 2: + if (ir[1] < nd) { + /* data+data failure. 
*/ + raid6_2data_recov(nd + np, size, ir[0], ir[1], v); + } else if (ir[0] < nd) { + /* data + p/q failure */ + + if (ir[1] == nd) /* data + p failure */ + raid6_datap_recov(nd + np, size, ir[0], v); + else { /* data + q failure */ + raid5_recov(nd + 1, ir[0], size, v); + raid6_call.gen_syndrome(nd + np, size, v); + } + } else { + raid_gen(nd, np, size, v); + } + break; + default: + BUG(); + } +} + +#else + +#include + +#endif struct ec_bio { struct bch_dev *ca; @@ -251,41 +321,13 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) /* Erasure coding: */ -static void raid5_recov(unsigned disks, unsigned bytes, - unsigned failed, void **data) -{ - unsigned i = 2, nr; - - BUG_ON(failed >= disks); - - swap(data[0], data[failed]); - memcpy(data[0], data[1], bytes); - - while (i < disks) { - nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); - xor_blocks(nr, bytes, data[0], data + i); - i += nr; - } - - swap(data[0], data[failed]); -} - static void ec_generate_ec(struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = le16_to_cpu(v->sectors) << 9; - switch (v->nr_redundant) { - case 2: - raid6_call.gen_syndrome(v->nr_blocks, bytes, buf->data); - fallthrough; - case 1: - raid5_recov(v->nr_blocks, bytes, nr_data, buf->data); - break; - default: - BUG(); - } + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); } static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) @@ -315,30 +357,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) if (!test_bit(i, buf->valid)) failed[nr_failed++] = i; - switch (nr_failed) { - case 0: - break; - case 1: - if (test_bit(nr_data, buf->valid)) - raid5_recov(nr_data + 1, bytes, failed[0], buf->data); - else - raid6_datap_recov(v->nr_blocks, bytes, failed[0], buf->data); - break; - case 2: - /* data+data failure. */ - raid6_2data_recov(v->nr_blocks, bytes, failed[0], failed[1], buf->data); - break; - - default: - BUG(); - } - - for (i = nr_data; i < v->nr_blocks; i++) - if (!test_bit(i, buf->valid)) { - ec_generate_ec(buf); - break; - } - + raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); return 0; } -- cgit From ad7ae8d63fa82e5d713e73a1a6a4ca9728f84898 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Nov 2018 05:19:25 -0500 Subject: bcachefs: Btree locking fix, refactoring Hit an assertion, probably spurious, indicating an iterator was unlocked when it shouldn't have been (spurious because it wasn't locked at all when the caller called btree_insert_at()). 
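(Judging from the diff below, the assertion in question is the BUG_ON at the end of __bch2_btree_insert_at() that checks the leaf node is still locked when BTREE_INSERT_NOUNLOCK was set and the insert did work.)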
Add a flag, BTREE_ITER_NOUNLOCK, and tighten up the assertions Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/btree_iter.c | 28 ++++++++++++++++++++++---- fs/bcachefs/btree_locking.h | 9 ++++++++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 40 +++++++++++++++++++------------------ fs/bcachefs/btree_update_leaf.c | 34 +++++++++++++++++++------------ fs/bcachefs/extents.c | 1 - 7 files changed, 75 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6eba65fcb52c..55d49677d5fe 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1113,7 +1113,6 @@ next: /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { bch2_btree_node_free_inmem(c, old_nodes[i], iter); - six_unlock_intent(&old_nodes[i]->lock); /* * the index update might have triggered a split, in which case diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a50a6a51a3a5..afc43722c1fc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -264,10 +264,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify_locks(struct btree_iter *iter) +void __bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; + BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) && + !btree_node_locked(iter, 0)); + for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) @@ -277,6 +280,15 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter) btree_node_locked_type(iter, l)); } } + +void bch2_btree_iter_verify_locks(struct btree_iter *iter) +{ + struct btree_iter *linked; + + for_each_btree_iter(iter, linked) + __bch2_btree_iter_verify_locks(linked); + +} #endif __flatten @@ -382,9 +394,9 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, break; } } - - bch2_btree_iter_verify_locks(linked); } + + bch2_btree_iter_verify_locks(iter); } int bch2_btree_iter_unlock(struct btree_iter *iter) @@ -776,9 +788,17 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; unsigned level = b->level; + /* caller now responsible for unlocking @b */ + + BUG_ON(iter->l[level].b != b); + BUG_ON(!btree_node_intent_locked(iter, level)); + + iter->l[level].b = BTREE_ITER_NOT_END; + mark_btree_node_unlocked(iter, level); + for_each_btree_iter(iter, linked) if (linked->l[level].b == b) { - btree_node_unlock(linked, level); + __btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NOT_END; } } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index c1d16411154e..3871e14e480d 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level) return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); @@ -106,6 +106,13 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +{ + BUG_ON(!level && iter->flags & BTREE_ITER_NOUNLOCK); + + __btree_node_unlock(iter, level); +} + static inline void __bch2_btree_iter_unlock(struct 
btree_iter *iter) { btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7e9ba60288aa..7eecaa6cd5a2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -192,6 +192,7 @@ enum btree_iter_type { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_ERROR (1 << 5) +#define BTREE_ITER_NOUNLOCK (1 << 6) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2631b0732d4b..4fcda31290b2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -257,6 +257,11 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { + struct btree_iter *linked; + + for_each_btree_iter(iter, linked) + BUG_ON(linked->l[b->level].b == b); + /* * Is this a node that isn't reachable on disk yet? * @@ -268,11 +273,10 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, */ btree_update_drop_new_node(c, b); - __bch2_btree_node_lock_write(b, iter); + six_lock_write(&b->lock, NULL, NULL); __btree_node_free(c, b); six_unlock_write(&b->lock); - - bch2_btree_iter_node_drop(iter, b); + six_unlock_intent(&b->lock); } static void bch2_btree_node_free_ondisk(struct bch_fs *c, @@ -1421,25 +1425,19 @@ static void btree_split(struct btree_update *as, struct btree *b, if (n3) bch2_open_buckets_put(c, &n3->ob); - /* - * Note - at this point other linked iterators could still have @b read - * locked; we're depending on the bch2_btree_iter_node_replace() calls - * below removing all references to @b so we don't return with other - * iterators pointing to a node they have locked that's been freed. - * - * We have to free the node first because the bch2_iter_node_replace() - * calls will drop _our_ iterator's reference - and intent lock - to @b. 
- */ - bch2_btree_node_free_inmem(c, b, iter); - /* Successful split, update the iterator to point to the new nodes: */ + bch2_btree_iter_node_drop(iter, b); if (n3) bch2_btree_iter_node_replace(iter, n3); if (n2) bch2_btree_iter_node_replace(iter, n2); bch2_btree_iter_node_replace(iter, n1); + bch2_btree_node_free_inmem(c, b, iter); + + bch2_btree_iter_verify_locks(iter); + bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time); } @@ -1735,17 +1733,21 @@ retry: bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); bch2_open_buckets_put(c, &n->ob); - bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_node_free_inmem(c, m, iter); + + bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_iter_verify(iter, n); + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_update_done(as); - six_unlock_intent(&m->lock); up_read(&c->gc_lock); out: + bch2_btree_iter_verify_locks(iter); + /* * Don't downgrade locks here: we're called after successful insert, * and the caller will downgrade locks after a successful insert @@ -1828,9 +1830,9 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_open_buckets_put(c, &n->ob); - bch2_btree_node_free_inmem(c, b, iter); - + bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); + bch2_btree_node_free_inmem(c, b, iter); bch2_btree_update_done(as); return 0; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 41691bebf679..4b0d674472db 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -187,7 +187,6 @@ bch2_insert_fixup_key(struct btree_insert *trans, insert->k)) bch2_btree_journal_key(trans, iter, insert->k); - trans->did_work = true; return BTREE_INSERT_OK; } @@ -338,6 +337,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + struct btree_iter *linked; unsigned u64s; int ret; @@ -410,12 +410,25 @@ static inline int do_btree_insert_at(struct btree_insert *trans, i->k->k.version = MAX_VERSION; } + if (trans->flags & BTREE_INSERT_NOUNLOCK) { + /* + * linked iterators that weren't being updated may or may not + * have been traversed/locked, depending on what the caller was + * doing: + */ + for_each_btree_iter(trans->entries[0].iter, linked) + if (linked->uptodate < BTREE_ITER_NEED_RELOCK) + linked->flags |= BTREE_ITER_NOUNLOCK; + } + trans->did_work = true; + trans_for_each_entry(trans, i) { switch (btree_insert_key_leaf(trans, i)) { case BTREE_INSERT_OK: break; case BTREE_INSERT_NEED_TRAVERSE: - BUG_ON((trans->flags & BTREE_INSERT_ATOMIC)); + BUG_ON((trans->flags & + (BTREE_INSERT_ATOMIC|BTREE_INSERT_NOUNLOCK))); ret = -EINTR; goto out; default: @@ -461,8 +474,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans) BUG_ON(!trans->nr); - for_each_btree_iter(trans->entries[0].iter, linked) - bch2_btree_iter_verify_locks(linked); + bch2_btree_iter_verify_locks(trans->entries[0].iter); /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); @@ -504,15 +516,11 @@ retry: out: percpu_ref_put(&c->writes); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - /* make sure we didn't drop or screw up locks: */ - for_each_btree_iter(trans->entries[0].iter, linked) { - bch2_btree_iter_verify_locks(linked); - BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) && - trans->did_work && - !btree_node_locked(linked, 0)); - } - } + /* make sure we didn't drop or 
screw up locks: */ + bch2_btree_iter_verify_locks(trans->entries[0].iter); + + for_each_btree_iter(trans->entries[0].iter, linked) + linked->flags &= ~BTREE_ITER_NOUNLOCK; BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9e3ac910572e..eeeebfaa4557 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1214,7 +1214,6 @@ static void extent_insert_committed(struct extent_insert_state *s) bch2_cut_front(s->committed, insert); insert->k.needs_whiteout = false; - s->trans->did_work = true; } void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -- cgit From dfe9bfb32e380df67d25cd5afb887b3466230e03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Nov 2018 17:09:44 -0500 Subject: bcachefs: Stripes now properly subject to gc gc now verifies the contents of the stripes radix tree, important for persistent alloc info Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 16 +++-- fs/bcachefs/btree_gc.c | 79 ++++++++++++++++++--- fs/bcachefs/buckets.c | 184 +++++++++++++++++++++++++++++-------------------- fs/bcachefs/buckets.h | 6 +- fs/bcachefs/ec.c | 71 +++++++++---------- fs/bcachefs/ec.h | 8 ++- fs/bcachefs/ec_types.h | 3 +- fs/bcachefs/extents.c | 4 +- fs/bcachefs/quota.c | 7 +- fs/bcachefs/recovery.c | 6 +- fs/bcachefs/super.c | 2 +- 11 files changed, 244 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index eaa2055000b6..258a67d4437b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -225,6 +225,8 @@ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) 
\ do { \ @@ -334,6 +336,7 @@ enum bch_time_stats { struct btree; enum gc_phase { + GC_PHASE_NOT_RUNNING, GC_PHASE_START, GC_PHASE_SB, @@ -687,16 +690,17 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; - /* ERASURE CODING */ - struct list_head ec_new_stripe_list; - struct mutex ec_new_stripe_lock; - - GENRADIX(struct ec_stripe) ec_stripes; - struct mutex ec_stripes_lock; + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; + struct mutex ec_stripe_create_lock; ec_stripes_heap ec_stripes_heap; spinlock_t ec_stripes_heap_lock; + /* ERASURE CODING */ + struct list_head ec_new_stripe_list; + struct mutex ec_new_stripe_lock; + struct bio_set ec_bioset; struct work_struct ec_stripe_delete_work; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 55d49677d5fe..f350634ce7a0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -332,9 +332,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - if (!c->btree_roots[btree_id].b) - return 0; - /* * if expensive_debug_checks is on, run range_checks on all leaf nodes: * @@ -582,6 +579,8 @@ static void bch2_gc_free(struct bch_fs *c) struct bch_dev *ca; unsigned i; + genradix_free(&c->stripes[1]); + for_each_member_device(ca, c, i) { kvpfree(rcu_dereference_protected(ca->buckets[1], 1), sizeof(struct bucket_array) + @@ -602,6 +601,25 @@ static void bch2_gc_done_nocheck(struct bch_fs *c) unsigned i; int cpu; + { + struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); + struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + + c->ec_stripes_heap.used = 0; + + while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && + (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { + *dst = *src; + + if (dst->alive) + bch2_stripes_heap_insert(c, dst, dst_iter.pos); + + genradix_iter_advance(&dst_iter, &c->stripes[0]); + genradix_iter_advance(&src_iter, &c->stripes[1]); + } + } + for_each_member_device(ca, c, i) { struct bucket_array *src = __bucket_array(ca, 1); @@ -649,13 +667,21 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_field(_f, _msg, ...) \ if (dst._f != src._f) { \ - pr_info(_msg ": got %llu, should be %llu, fixing" \ + bch_err(c, _msg ": got %llu, should be %llu, fixing"\ , ##__VA_ARGS__, dst._f, src._f); \ dst._f = src._f; \ } +#define copy_stripe_field(_f, _msg, ...) 
\ + if (dst->_f != src->_f) { \ + bch_err_ratelimited(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u, fixing", \ + dst_iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ + } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ - pr_info("dev %u bucket %zu has wrong " #_f \ + bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ ": got %u, should be %u, fixing", \ i, b, dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ @@ -672,6 +698,36 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) goto out; } + { + struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); + struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + unsigned i; + + c->ec_stripes_heap.used = 0; + + while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && + (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { + copy_stripe_field(alive, "alive"); + copy_stripe_field(sectors, "sectors"); + copy_stripe_field(algorithm, "algorithm"); + copy_stripe_field(nr_blocks, "nr_blocks"); + copy_stripe_field(nr_redundant, "nr_redundant"); + copy_stripe_field(blocks_nonempty.counter, + "blocks_nonempty"); + + for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) + copy_stripe_field(block_sectors[i].counter, + "block_sectors[%u]", i); + + if (dst->alive) + bch2_stripes_heap_insert(c, dst, dst_iter.pos); + + genradix_iter_advance(&dst_iter, &c->stripes[0]); + genradix_iter_advance(&src_iter, &c->stripes[1]); + } + } + for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 0); struct bucket_array *src = __bucket_array(ca, 1); @@ -756,10 +812,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) out: percpu_up_write(&c->usage_lock); -#undef copy_field #undef copy_fs_field #undef copy_dev_field #undef copy_bucket_field +#undef copy_stripe_field +#undef copy_field } static int bch2_gc_start(struct bch_fs *c) @@ -767,6 +824,12 @@ static int bch2_gc_start(struct bch_fs *c) struct bch_dev *ca; unsigned i; + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + BUG_ON(c->usage[1]); c->usage[1] = alloc_percpu(struct bch_fs_usage); @@ -808,7 +871,7 @@ static int bch2_gc_start(struct bch_fs *c) percpu_up_write(&c->usage_lock); - return 0; + return bch2_ec_mem_alloc(c, true); } /** @@ -873,7 +936,7 @@ out: bch2_gc_done(c, initial); /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_START)); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_free(c); up_write(&c->gc_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2ebe8bad978e..87ff4b2c8434 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -303,7 +303,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m, static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors - ? BCH_DATA_CACHED + ? 
BCH_DATA_CACHED : m.data_type; } @@ -375,14 +375,14 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); dev_usage = this_cpu_ptr(ca->usage[gc]); - if (bucket_type(old) != bucket_type(new)) { - if (bucket_type(old)) { - fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; - dev_usage->buckets[bucket_type(old)]--; - } else { - fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; - dev_usage->buckets[bucket_type(new)]++; - } + if (bucket_type(old)) { + fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; + dev_usage->buckets[bucket_type(old)]--; + } + + if (bucket_type(new)) { + fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; + dev_usage->buckets[bucket_type(new)]++; } dev_usage->buckets_alloc += @@ -406,11 +406,11 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(c, ca, stats, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, stats, _old, new, gc); \ + bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \ _old; \ }) @@ -638,23 +638,25 @@ static void bch2_mark_pointer(struct bch_fs *c, BUG_ON(!gc && bucket_became_unavailable(old, new)); } -static void bch2_mark_stripe_ptr(struct bch_fs *c, - struct bch_extent_stripe_ptr p, - s64 sectors, unsigned flags, - s64 *adjusted_disk_sectors, - unsigned *redundancy) +static int bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + s64 sectors, unsigned flags, + s64 *adjusted_disk_sectors, + unsigned *redundancy, + bool gc) { - struct ec_stripe *m; + struct stripe *m; unsigned old, new, nr_data; int blocks_nonempty_delta; s64 parity_sectors; - m = genradix_ptr(&c->ec_stripes, p.idx); - if (WARN_ON(!m)) - return; + m = genradix_ptr(&c->stripes[gc], p.idx); - if (WARN_ON(!m->alive)) - return; + if (!m || !m->alive) { + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + return -1; + } nr_data = m->nr_blocks - m->nr_redundant; @@ -672,20 +674,23 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c, blocks_nonempty_delta = (int) !!new - (int) !!old; if (!blocks_nonempty_delta) - return; + return 0; atomic_add(blocks_nonempty_delta, &m->blocks_nonempty); BUG_ON(atomic_read(&m->blocks_nonempty) < 0); - bch2_stripes_heap_update(c, m, p.idx); + if (!gc) + bch2_stripes_heap_update(c, m, p.idx); + + return 0; } -static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, - bool gc) +static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags, + bool gc) { BUG_ON(!sectors); @@ -701,6 +706,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, unsigned replicas = 0; unsigned ec_redundancy = 0; unsigned i; + int ret; extent_for_each_ptr_decode(e, p, entry) { s64 disk_sectors = ptr_disk_sectors(e, p, sectors); @@ -710,11 +716,14 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, stats, journal_seq, flags, gc); if (!p.ptr.cached) - for (i = 0; i < p.ec_nr; i++) - bch2_mark_stripe_ptr(c, p.ec[i], + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_mark_stripe_ptr(c, p.ec[i], disk_sectors, flags, &adjusted_disk_sectors, - &ec_redundancy); + &ec_redundancy, gc); + if (ret) + return ret; + } if 
(!p.ptr.cached) replicas++; @@ -747,6 +756,8 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, break; } } + + return 0; } static void bucket_set_stripe(struct bch_fs *c, @@ -767,7 +778,7 @@ static void bucket_set_stripe(struct bch_fs *c, BUG_ON(ptr_stale(ca, ptr)); - old = bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ new.stripe = enabled; if (journal_seq) { new.journal_seq_valid = 1; @@ -776,26 +787,33 @@ static void bucket_set_stripe(struct bch_fs *c, })); BUG_ON(old.stripe == enabled); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); } } -static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, - bool inserting, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool gc) +static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + bool inserting, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags, + bool gc) { switch (k.k->type) { case BCH_STRIPE: { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); size_t idx = s.k->p.offset; - struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; - BUG_ON(!m); - BUG_ON(m->alive == inserting); + if (!m || (!inserting && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); + return -1; + } + + if (inserting && m->alive) { + bch_err_ratelimited(c, "error marking stripe %zu: already exists", + idx); + return -1; + } BUG_ON(atomic_read(&m->blocks_nonempty)); @@ -809,70 +827,88 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, m->nr_redundant = s.v->nr_redundant; } - if (inserting) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_del(c, m, idx); + if (!gc) { + if (inserting) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_del(c, m, idx); + } else { + m->alive = inserting; + } bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); break; } } + + return 0; } -static void __bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, - bool gc) +static int __bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags, + bool gc) { + int ret = 0; + switch (type) { case BKEY_TYPE_BTREE: - bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - stats, journal_seq, flags, gc); + ret = bch2_mark_extent(c, k, inserting + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EXTENTS: - bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - stats, journal_seq, flags, gc); + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EC: - bch2_mark_stripe(c, k, inserting, - stats, journal_seq, flags, gc); + ret = bch2_mark_stripe(c, k, inserting, + stats, journal_seq, flags, gc); break; default: break; } + + return ret; } -void bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +int bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { + int ret = 0; + percpu_down_read(&c->usage_lock); if (!(flags & BCH_BUCKET_MARK_GC)) { if (!stats) stats = this_cpu_ptr(c->usage[0]); - __bch2_mark_key(c, type, k, inserting, sectors, - stats, journal_seq, flags, false); + ret = __bch2_mark_key(c, type, k, inserting, sectors, + stats, journal_seq, flags, false); + if (ret) + goto out; } if ((flags & BCH_BUCKET_MARK_GC) || gc_visited(c, pos)) { - __bch2_mark_key(c, type, k, inserting, sectors, - this_cpu_ptr(c->usage[1]), - journal_seq, flags, true); + ret = __bch2_mark_key(c, type, k, inserting, sectors, + this_cpu_ptr(c->usage[1]), + journal_seq, flags, true); + if (ret) + goto out; } - +out: percpu_up_read(&c->usage_lock); + + return ret; } void bch2_mark_update(struct btree_insert *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 813e0c44e107..4eec96101bf6 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -220,9 +220,9 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) #define BCH_BUCKET_MARK_GC (1 << 1) -void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, - bool, s64, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, + bool, s64, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 727324f15f43..091a1f0a0432 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -530,7 +530,7 @@ err: return ret; } -/* ec_stripe bucket accounting: */ +/* stripe bucket accounting: */ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { @@ -551,7 +551,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) free_heap(&n); } - if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp)) + if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) + return -ENOMEM; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) return -ENOMEM; return 0; @@ -592,27 +596,26 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, { struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i; + genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; } static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; - struct ec_stripe *m = genradix_ptr(&c->ec_stripes, 
idx); + struct stripe *m = genradix_ptr(&c->stripes[0], idx); BUG_ON(!m->alive); BUG_ON(m->heap_idx >= h->used); BUG_ON(h->data[m->heap_idx].idx != idx); } -static inline unsigned stripe_entry_blocks(struct ec_stripe *m) +static inline unsigned stripe_entry_blocks(struct stripe *m) { - return atomic_read(&m->pin) - ? UINT_MAX : atomic_read(&m->blocks_nonempty); + return atomic_read(&m->blocks_nonempty); } void bch2_stripes_heap_update(struct bch_fs *c, - struct ec_stripe *m, size_t idx) + struct stripe *m, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; bool queue_delete; @@ -646,7 +649,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, } void bch2_stripes_heap_del(struct bch_fs *c, - struct ec_stripe *m, size_t idx) + struct stripe *m, size_t idx) { spin_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); @@ -659,7 +662,7 @@ void bch2_stripes_heap_del(struct bch_fs *c, } void bch2_stripes_heap_insert(struct bch_fs *c, - struct ec_stripe *m, size_t idx) + struct stripe *m, size_t idx) { spin_lock(&c->ec_stripes_heap_lock); @@ -678,7 +681,9 @@ void bch2_stripes_heap_insert(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); } -static void ec_stripe_delete(struct bch_fs *c, unsigned idx) +/* stripe deletion */ + +static void ec_stripe_delete(struct bch_fs *c, size_t idx) { struct btree_iter iter; struct bch_stripe *v = NULL; @@ -717,6 +722,7 @@ static void ec_stripe_delete_work(struct work_struct *work) ssize_t idx; down_read(&c->gc_lock); + mutex_lock(&c->ec_stripe_create_lock); while (1) { spin_lock(&c->ec_stripes_heap_lock); @@ -729,13 +735,15 @@ static void ec_stripe_delete_work(struct work_struct *work) ec_stripe_delete(c, idx); } + mutex_unlock(&c->ec_stripe_create_lock); up_read(&c->gc_lock); } +/* stripe creation: */ + static int ec_stripe_bkey_insert(struct bch_fs *c, struct bkey_i_stripe *stripe) { - struct ec_stripe *m; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -755,18 +763,13 @@ retry: return bch2_btree_iter_unlock(&iter) ?: -ENOSPC; found_slot: - mutex_lock(&c->ec_stripes_lock); ret = ec_stripe_mem_alloc(c, &iter); - mutex_unlock(&c->ec_stripes_lock); if (ret == -EINTR) goto retry; if (ret) return ret; - m = genradix_ptr(&c->ec_stripes, iter.pos.offset); - atomic_inc(&m->pin); - stripe->k.p = iter.pos; ret = bch2_btree_insert_at(c, NULL, NULL, @@ -775,14 +778,9 @@ found_slot: BTREE_INSERT_ENTRY(&iter, &stripe->k_i)); bch2_btree_iter_unlock(&iter); - if (ret) - atomic_dec(&m->pin); - return ret; } -/* stripe creation: */ - static void extent_stripe_ptr_add(struct bkey_s_extent e, struct ec_stripe_buf *s, struct bch_extent_ptr *ptr, @@ -858,7 +856,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, */ static void ec_stripe_create(struct ec_stripe_new *s) { - struct ec_stripe *ec_stripe; struct bch_fs *c = s->c; struct open_bucket *ob; struct bkey_i *k; @@ -898,10 +895,12 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } + mutex_lock(&c->ec_stripe_create_lock); + ret = ec_stripe_bkey_insert(c, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_put_writes; + goto err_unlock; } for_each_keylist_key(&s->keys, k) { @@ -910,12 +909,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) break; } - ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset); - - atomic_dec(&ec_stripe->pin); - bch2_stripes_heap_update(c, ec_stripe, - s->stripe.key.k.p.offset); - +err_unlock: + mutex_unlock(&c->ec_stripe_create_lock); err_put_writes: 
percpu_ref_put(&c->writes); err: @@ -1222,7 +1217,7 @@ unlock: mutex_unlock(&c->ec_new_stripe_lock); } -int bch2_fs_ec_start(struct bch_fs *c) +int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) { struct btree_iter iter; struct bkey_s_c k; @@ -1238,19 +1233,25 @@ int bch2_fs_ec_start(struct bch_fs *c) if (ret) return ret; - if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), + if (!gc && + !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), GFP_KERNEL)) return -ENOMEM; #if 0 - ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL); + ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); #else for (i = 0; i < idx; i++) - if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL)) + if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) return -ENOMEM; #endif return 0; } +int bch2_fs_ec_start(struct bch_fs *c) +{ + return bch2_ec_mem_alloc(c, false); +} + void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; @@ -1271,7 +1272,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) } free_heap(&c->ec_stripes_heap); - genradix_free(&c->ec_stripes); + genradix_free(&c->stripes[0]); bioset_exit(&c->ec_bioset); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index bcf06529dcfc..c35de8b1ef64 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -93,14 +93,16 @@ void bch2_ec_stripe_head_put(struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, unsigned, unsigned); -void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); +int bch2_ec_mem_alloc(struct bch_fs *, bool); + int bch2_fs_ec_start(struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 00e89c3b7767..a3216ca01913 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -6,7 +6,7 @@ #define EC_STRIPE_MAX 16 -struct ec_stripe { +struct stripe { size_t heap_idx; u16 sectors; @@ -16,7 +16,6 @@ struct ec_stripe { u8 nr_redundant; u8 alive; - atomic_t pin; atomic_t blocks_nonempty; atomic_t block_sectors[EC_STRIPE_MAX]; }; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index eeeebfaa4557..30852090ce75 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -208,8 +208,8 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, durability = max_t(unsigned, durability, ca->mi.durability); for (i = 0; i < p.ec_nr; i++) { - struct ec_stripe *s = - genradix_ptr(&c->ec_stripes, p.idx); + struct stripe *s = + genradix_ptr(&c->stripes[0], p.idx); if (WARN_ON(!s)) continue; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8127f4454dac..cc20742d542b 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -700,22 +700,19 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, struct bch_fs *c = sb->s_fs_info; struct bch_memquota_type *q = &c->quotas[kqid->type]; qid_t qid = from_kqid(&init_user_ns, *kqid); - struct genradix_iter iter = genradix_iter_init(&q->table, qid); + struct genradix_iter iter; struct bch_memquota *mq; int ret = 0; mutex_lock(&q->lock); - 
while ((mq = genradix_iter_peek(&iter, &q->table))) { + genradix_for_each_from(&q->table, iter, mq, qid) if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { __bch2_quota_get(qdq, mq); *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); goto found; } - genradix_iter_advance(&iter, &q->table); - } - ret = -ENOENT; found: mutex_unlock(&q->lock); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ddfba16a2998..1cb0c9940ec1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -305,6 +305,9 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + ret = bch2_initial_gc(c, &journal); if (ret) goto err; @@ -316,9 +319,6 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; } - for (i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); - /* * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 59f2aa7e047c..78a2668fc03e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -548,7 +548,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_new_stripe_list); mutex_init(&c->ec_new_stripe_lock); - mutex_init(&c->ec_stripes_lock); + mutex_init(&c->ec_stripe_create_lock); spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); -- cgit From eeb83e25bb07ff1d00297c541c03e35c8c3c762c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Nov 2018 22:50:35 -0500 Subject: bcachefs: Hold usage_lock over mark_key and fs_usage_apply Fixes an inconsistency at the end of gc Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 12 +++++ fs/bcachefs/btree_gc.c | 12 ----- fs/bcachefs/btree_update_interior.c | 44 ++++++++++-------- fs/bcachefs/buckets.c | 93 ++++++++++++++++++++++++++----------- fs/bcachefs/buckets.h | 3 ++ 5 files changed, 104 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index be6041e92c05..62b86a8e2ba8 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -54,6 +54,18 @@ struct bkey_ops { bool is_extents; }; +static inline bool bkey_type_needs_gc(enum bkey_type type) +{ + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_EC: + return true; + default: + return false; + } +} + const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f350634ce7a0..73775cbd1daf 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -112,18 +112,6 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, /* marking of btree keys/nodes: */ -static bool bkey_type_needs_gc(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_EC: - return true; - default: - return false; - } -} - static void ptr_gen_recalc_oldest(struct bch_fs *c, const struct bch_extent_ptr *ptr, u8 *max_stale) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4fcda31290b2..7d7a021416f3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -160,12 +160,7 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { 
struct bch_fs *c = as->c; struct pending_btree_node_free *d; - - /* - * btree_update lock is only needed here to avoid racing with - * gc: - */ - mutex_lock(&c->btree_interior_update_lock); + struct gc_pos pos = { 0 }; for (d = as->pending; d < as->pending + as->nr_pending; d++) if (!bkey_cmp(k.k->p, d->key.k.p) && @@ -201,20 +196,11 @@ found: if (gc_pos_cmp(c->gc_pos, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id)) >= 0 && - gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct gc_pos pos = { 0 }; - - bch2_mark_key(c, BKEY_TYPE_BTREE, + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) + bch2_mark_key_locked(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), false, 0, pos, NULL, 0, BCH_BUCKET_MARK_GC); - /* - * Don't apply tmp - pending deletes aren't tracked in - * bch_alloc_stats: - */ - } - - mutex_unlock(&c->btree_interior_update_lock); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -1083,7 +1069,10 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); - bch2_mark_key(c, BKEY_TYPE_BTREE, + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->usage_lock); + + bch2_mark_key_locked(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), true, 0, gc_pos_btree_root(b->btree_id), @@ -1095,6 +1084,9 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) &stats); bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); + + percpu_up_read(&c->usage_lock); + mutex_unlock(&c->btree_interior_update_lock); } static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) @@ -1171,8 +1163,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->usage_lock); + if (bkey_extent_is_data(&insert->k)) - bch2_mark_key(c, BKEY_TYPE_BTREE, + bch2_mark_key_locked(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(insert), true, 0, gc_pos_btree_node(b), &stats, 0, 0); @@ -1193,6 +1188,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_node(b)); + percpu_up_read(&c->usage_lock); + mutex_unlock(&c->btree_interior_update_lock); + bch2_btree_bset_insert_key(iter, b, node_iter, insert); set_btree_node_dirty(b); set_btree_node_need_write(b); @@ -1977,7 +1975,10 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); - bch2_mark_key(c, BKEY_TYPE_BTREE, + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->usage_lock); + + bch2_mark_key_locked(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&new_key->k_i), true, 0, gc_pos_btree_root(b->btree_id), @@ -1988,6 +1989,9 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); + percpu_up_read(&c->usage_lock); + mutex_unlock(&c->btree_interior_update_lock); + if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 87ff4b2c8434..3f4bbf280a78 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -323,6 +323,8 @@ void bch2_fs_usage_apply(struct bch_fs *c, s64 added = sum.data + sum.reserved; s64 should_not_have_added; + 
percpu_rwsem_assert_held(&c->usage_lock); + /* * Not allowed to reduce sectors_available except by getting a * reservation: @@ -339,7 +341,6 @@ void bch2_fs_usage_apply(struct bch_fs *c, stats->online_reserved -= added; } - percpu_down_read(&c->usage_lock); preempt_disable(); /* online_reserved not subject to gc: */ this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved); @@ -352,7 +353,6 @@ void bch2_fs_usage_apply(struct bch_fs *c, bch2_fs_stats_verify(c); preempt_enable(); - percpu_up_read(&c->usage_lock); memset(stats, 0, sizeof(*stats)); } @@ -406,7 +406,24 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ +void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_mark old = { .v.counter = 0 }; + struct bch_fs_usage *fs_usage; + struct bucket_array *buckets; + struct bucket *g; + + percpu_down_read(&c->usage_lock); + fs_usage = this_cpu_ptr(c->usage[0]); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + if (g->mark.data_type) + bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); + percpu_up_read(&c->usage_lock); +} + +#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ @@ -490,12 +507,12 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark old, new; + struct bucket_mark new; BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ new.data_type = type; checked_add(new.dirty_sectors, sectors); })); @@ -876,16 +893,14 @@ static int __bch2_mark_key(struct bch_fs *c, return ret; } -int bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +int bch2_mark_key_locked(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { - int ret = 0; - - percpu_down_read(&c->usage_lock); + int ret; if (!(flags & BCH_BUCKET_MARK_GC)) { if (!stats) @@ -894,7 +909,7 @@ int bch2_mark_key(struct bch_fs *c, ret = __bch2_mark_key(c, type, k, inserting, sectors, stats, journal_seq, flags, false); if (ret) - goto out; + return ret; } if ((flags & BCH_BUCKET_MARK_GC) || @@ -903,9 +918,24 @@ int bch2_mark_key(struct bch_fs *c, this_cpu_ptr(c->usage[1]), journal_seq, flags, true); if (ret) - goto out; + return ret; } -out: + + return 0; +} + +int bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + int ret; + + percpu_down_read(&c->usage_lock); + ret = bch2_mark_key_locked(c, type, k, inserting, sectors, + pos, stats, journal_seq, flags); percpu_up_read(&c->usage_lock); return ret; @@ -922,12 +952,17 @@ void bch2_mark_update(struct btree_insert *trans, struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; + if (!bkey_type_needs_gc(iter->btree_id)) + return; + + percpu_down_read(&c->usage_lock); + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_mark_key(c, btree_node_type(b), 
bkey_i_to_s_c(insert->k), - true, - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), - pos, &stats, trans->journal_res.seq, 0); + bch2_mark_key_locked(c, btree_node_type(b), + bkey_i_to_s_c(insert->k), true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), + pos, &stats, trans->journal_res.seq, 0); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_DISCARD))) { @@ -959,9 +994,9 @@ void bch2_mark_update(struct btree_insert *trans, sectors = k.k->p.offset - insert->k->k.p.offset; BUG_ON(sectors <= 0); - bch2_mark_key(c, btree_node_type(b), k, - true, sectors, - pos, &stats, trans->journal_res.seq, 0); + bch2_mark_key_locked(c, btree_node_type(b), + k, true, sectors, pos, &stats, + trans->journal_res.seq, 0); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -971,14 +1006,16 @@ void bch2_mark_update(struct btree_insert *trans, BUG_ON(sectors >= 0); } - bch2_mark_key(c, btree_node_type(b), k, - false, sectors, - pos, &stats, trans->journal_res.seq, 0); + bch2_mark_key_locked(c, btree_node_type(b), + k, false, sectors, pos, &stats, + trans->journal_res.seq, 0); bch2_btree_node_iter_advance(&node_iter, b); } bch2_fs_usage_apply(c, &stats, trans->disk_res, pos); + + percpu_up_read(&c->usage_lock); } /* Disk reservations: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4eec96101bf6..884041b53eb9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -220,6 +220,9 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) #define BCH_BUCKET_MARK_GC (1 << 1) +int bch2_mark_key_locked(struct bch_fs *, enum bkey_type, struct bkey_s_c, + bool, s64, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); int bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); -- cgit From d034c09b268398df2a395ca2308f6791a4745e7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Nov 2018 08:14:51 -0500 Subject: bcachefs: return errors correctly from gc Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 52 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 73775cbd1daf..a849f9e320b3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -125,12 +125,10 @@ static void ptr_gen_recalc_oldest(struct bch_fs *c, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } -static u8 ptr_gens_recalc_oldest(struct bch_fs *c, - enum bkey_type type, - struct bkey_s_c k) +static void ptr_gens_recalc_oldest(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, u8 *max_stale) { const struct bch_extent_ptr *ptr; - u8 max_stale = 0; switch (type) { case BKEY_TYPE_BTREE: @@ -141,7 +139,7 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); extent_for_each_ptr(e, ptr) - ptr_gen_recalc_oldest(c, ptr, &max_stale); + ptr_gen_recalc_oldest(c, ptr, max_stale); break; } } @@ -154,14 +152,12 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c, for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) - ptr_gen_recalc_oldest(c, ptr, &max_stale); + ptr_gen_recalc_oldest(c, ptr, max_stale); } } default: break; } - - return max_stale; } static int ptr_gen_check(struct bch_fs *c, @@ -244,7 +240,8 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, * For runtime 
mark and sweep: */ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, bool initial) + struct bkey_s_c k, + u8 *max_stale, bool initial) { struct gc_pos pos = { 0 }; unsigned flags = @@ -276,20 +273,21 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, bch2_mark_key(c, type, k, true, k.k->size, pos, NULL, 0, flags); - ret = ptr_gens_recalc_oldest(c, type, k); + ptr_gens_recalc_oldest(c, type, k, max_stale); fsck_err: return ret; } static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - bool initial) + u8 *max_stale, bool initial) { enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - u8 stale = 0; - int ret; + int ret = 0; + + *max_stale = 0; if (!bkey_type_needs_gc(type)) return 0; @@ -298,14 +296,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, &unpacked) { bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, type, k, initial); - if (ret < 0) - return ret; - - stale = max_t(u8, stale, ret); + ret = bch2_gc_mark_key(c, type, k, max_stale, initial); + if (ret) + break; } - return stale; + return ret; } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, @@ -315,7 +311,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree *b; struct range_checks r; unsigned depth = bkey_type_needs_gc(btree_id) ? 0 : 1; - unsigned max_stale; + u8 max_stale; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -337,7 +333,9 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_verify_btree_nr_keys(b); - max_stale = btree_gc_mark_node(c, b, initial); + ret = btree_gc_mark_node(c, b, &max_stale, initial); + if (ret) + break; gc_pos_set(c, gc_pos_btree_node(b)); @@ -358,7 +356,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_btree_iter_cond_resched(&iter); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_btree_iter_unlock(&iter) ?: ret; if (ret) return ret; @@ -366,8 +364,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key), initial); + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), + &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); @@ -384,6 +382,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, bool initial) { enum btree_id ids[BTREE_ID_NR]; + u8 max_stale; unsigned i; for (i = 0; i < BTREE_ID_NR; i++) @@ -408,8 +407,9 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, for_each_jset_key(k, n, j, &r->j) { if (type == bkey_type(j->level, j->btree_id)) { ret = bch2_gc_mark_key(c, type, - bkey_i_to_s_c(k), initial); - if (ret < 0) + bkey_i_to_s_c(k), + &max_stale, initial); + if (ret) return ret; } } -- cgit From 9d11058a789a86eb580d2b0684604a1a5d795fe3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Nov 2018 10:06:18 -0500 Subject: bcachefs: fix waiting on an open journal entry Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 4045c0e68462..2f8dae4013af 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -461,7 +461,7 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j) int 
bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool need_reclaim = false; + int ret; retry: spin_lock(&j->lock); @@ -489,14 +489,11 @@ retry: BUG_ON(journal_cur_seq(j) < seq); - if (!journal_entry_open(j)) { - need_reclaim = true; - goto blocked; + ret = journal_entry_open(j); + if (ret) { + spin_unlock(&j->lock); + return ret < 0 ? ret : 0; } - - spin_unlock(&j->lock); - - return 0; blocked: if (!j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; @@ -504,8 +501,7 @@ blocked: closure_wait(&j->async_wait, cl); spin_unlock(&j->lock); - if (need_reclaim) - bch2_journal_reclaim_work(&j->reclaim_work.work); + bch2_journal_reclaim_work(&j->reclaim_work.work); return -EAGAIN; } -- cgit From 5b8a9227f8a4acd9652d6d89a608fbf4c39c6f44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Nov 2018 18:30:56 -0500 Subject: bcachefs: Split out bkey_sort.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bkey_sort.c | 658 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/bkey_sort.h | 68 +++++ fs/bcachefs/btree_io.c | 438 ++------------------------------ fs/bcachefs/btree_io.h | 42 ---- fs/bcachefs/extents.c | 270 +------------------- fs/bcachefs/extents.h | 39 +-- fs/bcachefs/super.c | 1 + 8 files changed, 772 insertions(+), 745 deletions(-) create mode 100644 fs/bcachefs/bkey_sort.c create mode 100644 fs/bcachefs/bkey_sort.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b9521d772db1..c29ccdb45965 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -7,6 +7,7 @@ bcachefs-y := \ alloc_foreground.o \ bkey.o \ bkey_methods.o \ + bkey_sort.o \ bset.o \ btree_cache.o \ btree_gc.o \ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 index 000000000000..706ca77d4b17 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" + +/* too many iterators, need to clean this up */ + +/* btree_node_iter_large: */ + +#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) + +static inline bool +bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) +{ + return !iter->used; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + return bch2_btree_node_iter_large_end(iter) + ? 
NULL + : __btree_node_offset_to_key(b, iter->data->k); +} + +static void +bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, + struct btree *b) +{ + iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; + + EBUG_ON(!iter->used); + EBUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); + else + heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_large_advance(iter, b); + + return ret; +} + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set n = + ((struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }); + + __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); + } +} + +static void sort_key_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + i->k += __btree_node_offset_to_key(b, i->k)->u64s; + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +/* regular sort_iters */ + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +static inline void __sort_iter_sift(struct sort_iter *iter, + unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) +{ + + __sort_iter_sift(iter, 0, cmp); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + __sort_iter_sift(iter, i, cmp); +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return iter->used ? iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + iter->data->k = bkey_next(iter->data->k); + + BUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + array_remove_item(iter->data, iter->used, 0); + else + sort_iter_sift(iter, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. 
+ */ +#define key_sort_cmp(h, l, r) \ +({ \ + bkey_cmp_packed(b, \ + __btree_node_offset_to_key(b, (l).k), \ + __btree_node_offset_to_key(b, (r).k)) \ + \ + ?: (l).k - (r).k; \ +}) + +static inline bool should_drop_next_key(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; + struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); + + if (bkey_whiteout(k)) + return true; + + if (iter->used < 2) + return false; + + if (iter->used > 2 && + key_sort_cmp(iter, r[0], r[1]) >= 0) + r++; + + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older and + * should be dropped. + */ + return !bkey_cmp_packed(b, + __btree_node_offset_to_key(b, l->k), + __btree_node_offset_to_key(b, r->k)); +} + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_packed *out = dst->start; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, key_sort_cmp, NULL); + + while (!bch2_btree_node_iter_large_end(iter)) { + if (!should_drop_next_key(iter, b)) { + struct bkey_packed *k = + __btree_node_offset_to_key(b, iter->data->k); + + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + sort_key_next(iter, b, iter->data); + heap_sift_down(iter, 0, key_sort_cmp, NULL); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* + * If keys compare equal, compare by pointer order: + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +#define extent_sort_cmp(h, l, r) \ +({ \ + struct bkey _ul = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (l).k)); \ + struct bkey _ur = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (r).k)); \ + \ + bkey_cmp(bkey_start_pos(&_ul), \ + bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ +}) + +static inline void extent_sort_sift(struct btree_node_iter_large *iter, + struct btree *b, size_t i) +{ + heap_sift_down(iter, i, extent_sort_cmp, NULL); +} + +static inline void extent_sort_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + sort_key_next(iter, b, i); + heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); +} + +static void extent_sort_append(struct bch_fs *c, + struct btree *b, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev, + struct bkey_packed *k) +{ + struct bkey_format *f = &b->format; + BKEY_PADDED(k) tmp; + + if (bkey_whiteout(k)) + return; + + bch2_bkey_unpack(b, &tmp.k, k); + + if (*prev && + bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) + return; + + if (*prev) { + bch2_bkey_pack(*prev, (void *) *prev, f); + + btree_keys_account_key_add(nr, 0, *prev); + *prev = bkey_next(*prev); + } else { + *prev = start; + } + + bkey_copy(*prev, &tmp.k); +} + +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_format *f = &b->format; + struct btree_node_iter_set *_l = iter->data, *_r; + struct bkey_packed *prev = NULL, *out, *lk, *rk; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, extent_sort_cmp, NULL); + + while 
(!bch2_btree_node_iter_large_end(iter)) { + lk = __btree_node_offset_to_key(b, _l->k); + + if (iter->used == 1) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + _r = iter->data + 1; + if (iter->used > 2 && + extent_sort_cmp(iter, _r[0], _r[1]) >= 0) + _r++; + + rk = __btree_node_offset_to_key(b, _r->k); + + l = __bkey_disassemble(b, lk, &l_unpacked); + r = __bkey_disassemble(b, rk, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + extent_sort_next(iter, b, _r); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so they + * don't overlap. comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. + */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + sort_key_next(iter, b, _r); + } else { + __bch2_cut_front(l.k->p, r); + extent_save(b, rk, r.k); + } + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + BKEY_PADDED(k) tmp; + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_reassemble(&tmp.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + + __bch2_cut_front(r.k->p, l); + extent_save(b, lk, l.k); + + extent_sort_sift(iter, b, 0); + + extent_sort_append(c, b, &nr, dst->start, &prev, + bkey_to_packed(&tmp.k)); + } else { + bch2_cut_back(bkey_start_pos(r.k), l.k); + extent_save(b, lk, l.k); + } + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = dst->start; + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort + repack in a new format: */ +static struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_whiteout(in)) + continue; + + if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort, repack, and merge: */ +struct btree_nr_keys +bch2_sort_repack_merge(struct bch_fs *c, + struct bset *dst, struct btree *src, + struct btree_node_iter *iter, + struct bkey_format *out_f, + bool filter_whiteouts, + key_filter_fn filter, + key_merge_fn merge) +{ + struct bkey_packed *k, *prev = NULL, *out; + struct btree_nr_keys nr; + BKEY_PADDED(k) tmp; + + if (!filter && !merge) + return bch2_sort_repack(dst, src, iter, out_f, + filter_whiteouts); + + memset(&nr, 0, sizeof(nr)); + + while ((k = bch2_btree_node_iter_next_all(iter, src))) { + if (filter_whiteouts && bkey_whiteout(k)) + continue; + + /* + * The filter might modify pointers, so we have to unpack the + * key and values to &tmp.k: + */ + bch2_bkey_unpack(src, &tmp.k, k); + + if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) + continue; + + /* prev is always unpacked, for key merging: */ + + if (prev && + merge && + merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) + continue; + + /* + * the current key becomes the new prev: advance prev, then + * copy the current key - but first pack prev (in place): + */ + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + + btree_keys_account_key_add(&nr, 0, prev); + prev = bkey_next(prev); + } else { + prev = vstruct_last(dst); + } + + bkey_copy(prev, &tmp.k); + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = vstruct_last(dst); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +unsigned bch2_sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + if (bkey_whiteout(in) && + (next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + /* + * XXX racy, called with read lock from write path + * + * leads to spurious BUG_ON() in bkey_unpack_key() in + * debug mode + */ + next->needs_whiteout |= in->needs_whiteout; + continue; + } + + if (bkey_whiteout(in)) { + memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_copy(out, in); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); +} + +unsigned bch2_sort_extents(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_extents_cmp); + + while ((in = sort_iter_next(iter, sort_extents_cmp))) { + if (bkey_deleted(in)) + continue; + + if (bkey_whiteout(in) && + 
(filter_whiteouts || !in->needs_whiteout)) + continue; + + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_key_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r); +} + +unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_key_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extent_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); +} + +unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *out = dst; + struct bkey_i l, r; + bool prev = false, l_packed = false; + u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); + u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); + u64 new_size; + + max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); + + sort_iter_sort(iter, sort_extent_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + if (bkey_deleted(in)) + continue; + + EBUG_ON(bkeyp_val_u64s(f, in)); + EBUG_ON(in->type != KEY_TYPE_DISCARD); + + r.k = bkey_unpack_key(iter->b, in); + + if (prev && + bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + new_size = l_packed + ? min(max_packed_size, max_packed_offset - + bkey_start_offset(&l.k)) + : KEY_SIZE_MAX; + + new_size = min(new_size, r.k.p.offset - + bkey_start_offset(&l.k)); + + BUG_ON(new_size < l.k.size); + + bch2_key_resize(&l.k, new_size); + + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + bch2_cut_front(l.k.p, &r); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + l = r; + prev = true; + l_packed = bkey_packed(in); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 index 000000000000..6b1661dd221a --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H + +struct btree_node_iter_large { + u16 used; + + struct btree_node_iter_set data[MAX_BSETS]; +}; + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, + struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); + +struct sort_iter { + struct btree *b; + unsigned used; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[MAX_BSETS + 1]; +}; + +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +{ + memset(iter, 0, sizeof(*iter)); + iter->b = b; +} + +static inline void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bset *, 
struct btree *, + struct btree_node_iter_large *); +struct btree_nr_keys +bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct btree *, + struct btree_node_iter_large *); + +struct btree_nr_keys +bch2_sort_repack_merge(struct bch_fs *, + struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, + bool, + key_filter_fn, + key_merge_fn); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); +unsigned bch2_sort_extents(struct bkey_packed *, + struct sort_iter *, bool); + +unsigned bch2_sort_key_whiteouts(struct bkey_packed *, + struct sort_iter *); +unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, + struct sort_iter *); + +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f1c31e74348a..506bf9e8df38 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_sort.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" @@ -19,40 +20,6 @@ #include "super-io.h" #include "trace.h" -/* btree_node_iter_large: */ - -#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set n = - ((struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }); - - __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); - } -} - -void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, - struct btree *b) -{ - iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; - - EBUG_ON(!iter->used); - EBUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); - else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); -} - static void verify_no_dups(struct btree *b, struct bkey_packed *start, struct bkey_packed *end) @@ -113,193 +80,6 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); } -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - -struct sort_iter { - struct btree *b; - unsigned used; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; -}; - -static void sort_iter_init(struct sort_iter *iter, struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - iter->b = b; -} - -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = from; - i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - __sort_iter_sift(iter, i, cmp); -} - -static void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return iter->used ? 
iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - iter->data->k = bkey_next(iter->data->k); - - BUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - array_remove_item(iter->data, iter->used, 0); - else - sort_iter_sift(iter, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r); -} - -static unsigned sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_key_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); -} - -static unsigned sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - if (bkey_deleted(in)) - continue; - - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_DISCARD); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - new_size = l_packed - ? min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; - - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); - - BUG_ON(new_size < l.k.size); - - bch2_key_resize(&l.k, new_size); - - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - bch2_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - l = r; - prev = true; - l_packed = bkey_packed(in); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, bool compacting, enum compact_mode mode) @@ -420,11 +200,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, BUG_ON((void *) unwritten_whiteouts_start(c, b) < (void *) btree_bkey_last(b, bset_tree_last(b))); - u64s = btree_node_is_extents(b) - ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter) - : sort_key_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); + u64s = (btree_node_is_extents(b) + ? 
bch2_sort_extent_whiteouts + : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), + &sort_iter); BUG_ON(u64s > b->whiteout_u64s); BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); @@ -499,87 +278,6 @@ static bool bch2_drop_whiteouts(struct btree *b) return ret; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; -} - -static unsigned sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, sort_keys_cmp); - - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - /* - * XXX racy, called with read lock from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; - } - - if (bkey_whiteout(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_copy(out, in); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); -} - -static unsigned sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - continue; - - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static void btree_node_sort(struct bch_fs *c, struct btree *b, struct btree_iter *iter, unsigned start_idx, @@ -618,9 +316,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (btree_node_is_extents(b)) filter_whiteouts = bset_written(b, start_bset); - u64s = btree_node_is_extents(b) - ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) - : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = (btree_node_is_extents(b) + ? bch2_sort_extents + : bch2_sort_keys)(out->keys.start, + &sort_iter, + filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -678,101 +378,6 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bch2_verify_btree_nr_keys(b); } -/* Sort + repack in a new format: */ -static struct btree_nr_keys sort_repack(struct bset *dst, - struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) - continue; - - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? 
in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort, repack, and merge: */ -static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, - struct bset *dst, - struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts, - key_filter_fn filter, - key_merge_fn merge) -{ - struct bkey_packed *k, *prev = NULL, *out; - struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; - - memset(&nr, 0, sizeof(nr)); - - while ((k = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k)) - continue; - - /* - * The filter might modify pointers, so we have to unpack the - * key and values to &tmp.k: - */ - bch2_bkey_unpack(src, &tmp.k, k); - - if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) - continue; - - /* prev is always unpacked, for key merging: */ - - if (prev && - merge && - merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) - continue; - - /* - * the current key becomes the new prev: advance prev, then - * copy the current key - but first pack prev (in place): - */ - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - - btree_keys_account_key_add(&nr, 0, prev); - prev = bkey_next(prev); - } else { - prev = vstruct_last(dst); - } - - bkey_copy(prev, &tmp.k); - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = vstruct_last(dst); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - void bch2_btree_sort_into(struct bch_fs *c, struct btree *dst, struct btree *src) @@ -787,19 +392,12 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_ops(src)->key_normalize || - btree_node_ops(src)->key_merge) - nr = sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true, - btree_node_ops(src)->key_normalize, - btree_node_ops(src)->key_merge); - else - nr = sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); + nr = bch2_sort_repack_merge(c, btree_bset_first(dst), + src, &src_iter, + &dst->format, + true, + btree_node_ops(src)->key_normalize, + btree_node_ops(src)->key_merge); bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); @@ -1815,8 +1413,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->whiteout_u64s = 0; u64s = btree_node_is_extents(b) - ? sort_extents(vstruct_last(i), &sort_iter, false) - : sort_keys(i->start, &sort_iter, false); + ? 
bch2_sort_extents(vstruct_last(i), &sort_iter, false) + : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); clear_needs_whiteout(i); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 34d0c0fe8b25..9c5a6f9471bd 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -143,46 +143,4 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); -/* Sorting */ - -struct btree_node_iter_large { - u16 used; - - struct btree_node_iter_set data[MAX_BSETS]; -}; - -void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *, - struct btree *); - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, - struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) -{ - return !iter->used; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - return bch2_btree_node_iter_large_end(iter) - ? NULL - : __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_large_advance(iter, b); - - return ret; -} - #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 30852090ce75..582499b08f31 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -27,87 +27,6 @@ #include "util.h" #include "xattr.h" -static void sort_key_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - i->k += __btree_node_offset_to_key(b, i->k)->u64s; - - if (i->k == i->end) - *i = iter->data[--iter->used]; -} - -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter_large *iter, - struct btree *b) -{ - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; - - if (iter->used < 2) - return false; - - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; - - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. 
- */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); -} - -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) -{ - struct bkey_packed *out = dst->start; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, key_sort_cmp, NULL); - - while (!bch2_btree_node_iter_large_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); - - bkey_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp, NULL); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - /* Common among btree and extent ptrs */ const struct bch_extent_ptr * @@ -777,7 +696,7 @@ int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, /* Extents */ -static bool __bch2_cut_front(struct bpos where, struct bkey_s k) +bool __bch2_cut_front(struct bpos where, struct bkey_s k) { u64 len = 0; @@ -830,11 +749,6 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) return true; } -bool bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - return __bch2_cut_front(where, bkey_i_to_s(k)); -} - bool bch2_cut_back(struct bpos where, struct bkey *k) { u64 len = 0; @@ -870,24 +784,6 @@ void bch2_key_resize(struct bkey *k, k->size = new_size; } -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static void extent_save(struct btree *b, struct bkey_packed *dst, - struct bkey *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = *src; - else - BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -} - static bool extent_i_save(struct btree *b, struct bkey_packed *dst, struct bkey_i *src) { @@ -906,170 +802,6 @@ static bool extent_i_save(struct btree *b, struct bkey_packed *dst, return true; } -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. 
- */ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter_large *iter, - struct btree *b, size_t i) -{ - heap_sift_down(iter, i, extent_sort_cmp, NULL); -} - -static inline void extent_sort_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); -} - -static void extent_sort_append(struct bch_fs *c, - struct btree *b, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, - struct bkey_packed *k) -{ - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) - return; - - bch2_bkey_unpack(b, &tmp.k, k); - - if (*prev && - bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) - return; - - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; - } - - bkey_copy(*prev, &tmp.k); -} - -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) -{ - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, extent_sort_cmp, NULL); - - while (!bch2_btree_node_iter_large_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - - if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. 
- */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - __bch2_cut_front(l.k->p, r); - extent_save(b, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); - - __bch2_cut_front(r.k->p, l); - extent_save(b, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); - } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, lk, l.k); - } - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - struct extent_insert_state { struct btree_insert *trans; struct btree_insert_entry *insert; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 15865b27847d..389604f25630 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -7,13 +7,8 @@ #include "extents_types.h" struct bch_fs; -struct journal_res; -struct btree_node_iter; -struct btree_node_iter_large; struct btree_insert; struct btree_insert_entry; -struct bch_devs_mask; -union bch_extent_crc; const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, @@ -46,14 +41,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, .is_extents = true, \ } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, - struct btree *, - struct btree_node_iter_large *); -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *, - struct btree *, - struct btree_node_iter_large *); - void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, @@ -506,10 +493,34 @@ do { \ } \ } while (0) -bool bch2_cut_front(struct bpos, struct bkey_i *); +bool __bch2_cut_front(struct bpos, struct bkey_s); + +static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + return __bch2_cut_front(where, bkey_i_to_s(k)); +} + bool bch2_cut_back(struct bpos, struct bkey *); void bch2_key_resize(struct bkey *, unsigned); +/* + * In extent_sort_fix_overlapping(), insert_fixup_extent(), + * extent_merge_inline() - we're modifying keys in place that are packed. To do + * that we have to unpack the key, modify the unpacked key - then this + * copies/repacks the unpacked to the original as necessary. 
+ */ +static inline void extent_save(struct btree *b, struct bkey_packed *dst, + struct bkey *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = *src; + else + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); +} + int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64); #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 78a2668fc03e..cadbc5481bcb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -10,6 +10,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" #include "btree_update_interior.h" -- cgit From 01a0108f0139a2f6dbace54dd5d592d2d76415c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Nov 2018 03:24:06 -0500 Subject: bcachefs: Fix a btree iter usage error previously, if the code traversed to the next btree node, that could return an error (due to lock restarts) - which was not being checked for. fix is to rework it so it never iterates past the current leaf node, and pops an assertion if it ever sees an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 40d3f02d86d8..e7d7c5fe6db7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -242,9 +242,15 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, bch2_btree_iter_link(_iter, &iter); bch2_btree_iter_copy(&iter, _iter); - for_each_btree_key_continue(&iter, BTREE_ITER_SLOTS, old) { - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; + old = bch2_btree_iter_peek_slot(&iter); + + while (1) { + /* + * should not be possible to get an error here, since we're + * carefully not advancing past @new and thus whatever leaf node + * @_iter currently points to: + */ + BUG_ON(btree_iter_err(old)); if (allocating && !bch2_extent_is_fully_allocated(old)) @@ -256,6 +262,11 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, bkey_start_offset(old.k))) * (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); + + if (bkey_cmp(old.k->p, new->k.p) >= 0) + break; + + old = bch2_btree_iter_next_slot(&iter); } bch2_btree_iter_unlink(&iter); -- cgit From 26609b619fa2301eb7eb5855a7005d99f8a07a73 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Nov 2018 15:10:01 -0400 Subject: bcachefs: Make bkey types globally unique this lets us get rid of a lot of extra switch statements - in a lot of places we dispatch on the btree node type, and then the key type, so this is a nice cleanup across a lot of code. Also improve the on disk format versioning stuff. 
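The switch-statement cleanup described above hinges on two things this patch adds further down: one global list of key types, BCH_BKEY_TYPES(), expanded through an x() macro, and a single bch2_bkey_ops[] table indexed by that enum, so code that used to dispatch on the btree node type and then the key type can do one table lookup instead. As a rough illustration of that x-macro technique only - the names and types below are simplified stand-ins, not the actual bcachefs definitions in this patch - a self-contained sketch could look like this:

/*
 * Standalone sketch (not bcachefs code) of the x-macro pattern: a single
 * list of (name, nr) pairs generates both the key type enum and the ops
 * dispatch table, so callers index one table by key type instead of
 * switching on btree type and then key type.
 */
#include <stdio.h>

struct demo_bkey_ops {
	/* returns reason for being invalid, or NULL if the key is valid */
	const char *(*key_invalid)(const void *k);
};

static const char *demo_always_valid(const void *k)
{
	(void) k;
	return NULL;
}

#define DEMO_BKEY_TYPES()		\
	x(deleted,	0)		\
	x(cookie,	3)		\
	x(extent,	6)

enum demo_bkey_type {
#define x(name, nr)	DEMO_KEY_TYPE_##name = nr,
	DEMO_BKEY_TYPES()
#undef x
	DEMO_KEY_TYPE_MAX,
};

static const struct demo_bkey_ops demo_bkey_ops[] = {
#define x(name, nr)	[DEMO_KEY_TYPE_##name] = { .key_invalid = demo_always_valid },
	DEMO_BKEY_TYPES()
#undef x
};

static const char *demo_bkey_invalid(unsigned type, const void *k)
{
	/* gaps in the numbering are zero-initialized, so reject them too */
	if (type >= DEMO_KEY_TYPE_MAX || !demo_bkey_ops[type].key_invalid)
		return "invalid type";

	return demo_bkey_ops[type].key_invalid(k);
}

int main(void)
{
	const char *err = demo_bkey_invalid(DEMO_KEY_TYPE_extent, NULL);

	printf("extent:  %s\n", err ? err : "valid");
	printf("type 99: %s\n", demo_bkey_invalid(99, NULL));
	return 0;
}

The patch itself applies the same expansion trick to the bch_bkey_types[] name strings and to BCH_BTREE_IDS(), keeping each per-type list in a single place.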
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 6 +- fs/bcachefs/alloc_background.c | 28 +- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_foreground.c | 11 +- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 184 +++---- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey.h | 88 ++- fs/bcachefs/bkey_methods.c | 218 +++++--- fs/bcachefs/bkey_methods.h | 65 +-- fs/bcachefs/bkey_sort.c | 21 +- fs/bcachefs/bkey_sort.h | 9 +- fs/bcachefs/bset.h | 6 +- fs/bcachefs/btree_cache.c | 18 +- fs/bcachefs/btree_cache.h | 6 +- fs/bcachefs/btree_gc.c | 197 ++----- fs/bcachefs/btree_gc.h | 8 +- fs/bcachefs/btree_io.c | 97 ++-- fs/bcachefs/btree_iter.c | 4 +- fs/bcachefs/btree_types.h | 35 +- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_interior.c | 56 +- fs/bcachefs/btree_update_leaf.c | 10 +- fs/bcachefs/buckets.c | 222 ++++---- fs/bcachefs/buckets.h | 4 +- fs/bcachefs/debug.c | 6 +- fs/bcachefs/dirent.c | 72 +-- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/ec.c | 60 +- fs/bcachefs/ec.h | 10 +- fs/bcachefs/extents.c | 1031 ++++++++++++++++------------------- fs/bcachefs/extents.h | 529 ++++++++++-------- fs/bcachefs/fs-io.c | 16 +- fs/bcachefs/fs.c | 12 +- fs/bcachefs/fsck.c | 28 +- fs/bcachefs/inode.c | 108 ++-- fs/bcachefs/inode.h | 12 +- fs/bcachefs/io.c | 18 +- fs/bcachefs/journal_io.c | 58 +- fs/bcachefs/migrate.c | 30 +- fs/bcachefs/move.c | 58 +- fs/bcachefs/move.h | 2 +- fs/bcachefs/movinggc.c | 34 +- fs/bcachefs/opts.h | 3 + fs/bcachefs/quota.c | 56 +- fs/bcachefs/quota.h | 8 +- fs/bcachefs/rebalance.c | 36 +- fs/bcachefs/recovery.c | 23 +- fs/bcachefs/replicas.c | 96 ++-- fs/bcachefs/replicas.h | 5 +- fs/bcachefs/str_hash.h | 9 +- fs/bcachefs/super-io.c | 51 +- fs/bcachefs/super-io.h | 2 + fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/trace.h | 2 +- fs/bcachefs/xattr.c | 102 ++-- fs/bcachefs/xattr.h | 2 +- 59 files changed, 1777 insertions(+), 2010 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index eb6fa4d7c1f6..bcfc9fdce35e 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -24,9 +24,9 @@ static inline int acl_to_xattr_type(int type) { switch (type) { case ACL_TYPE_ACCESS: - return BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; case ACL_TYPE_DEFAULT: - return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; default: BUG(); } @@ -355,7 +355,7 @@ int bch2_acl_chmod(struct btree_trans *trans, iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &inode->ei_str_hash, inode->v.i_ino, - &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 390b008b0200..885aff511f97 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -76,22 +76,15 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + if (k.k->p.inode >= c->sb.nr_devices || !c->devs[k.k->p.inode]) return "invalid device"; - switch (k.k->type) { - case BCH_ALLOC: { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - - /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) - return "incorrect value size"; - break; - } - default: - return "invalid type"; - } + /* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) + return "incorrect value size"; return NULL; } @@ -99,14 +92,9 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - switch (k.k->type) { - case BCH_ALLOC: { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - pr_buf(out, "gen %u", a.v->gen); - break; - } - } + pr_buf(out, "gen %u", a.v->gen); } static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) @@ -158,7 +146,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) struct bucket *g; const u8 *d; - if (k.k->type != BCH_ALLOC) + if (k.k->type != KEY_TYPE_alloc) return; a = bkey_s_c_to_alloc(k); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 59b6a5f2f890..8ced4e845281 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -11,7 +11,7 @@ const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_alloc_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_invalid, \ .val_to_text = bch2_alloc_to_text, \ } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6e5f6e57da56..ddcf2c407764 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -923,7 +923,8 @@ err: * as allocated out of @ob */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i_extent *e, unsigned sectors) + struct bkey_i *k, unsigned sectors) + { struct open_bucket *ob; unsigned i; @@ -935,13 +936,11 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); struct bch_extent_ptr tmp = ob->ptr; - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); - - tmp.cached = bkey_extent_is_cached(&e->k) || - (!ca->mi.durability && wp->type == BCH_DATA_USER); + tmp.cached = !ca->mi.durability && + wp->type == BCH_DATA_USER; tmp.offset += ca->mi.bucket_size - ob->sectors_free; - extent_ptr_append(e, tmp); + bch2_bkey_append_ptr(k, tmp); BUG_ON(sectors > ob->sectors_free); ob->sectors_free -= sectors; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index c71cf7381729..94389052fa94 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -101,7 +101,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, struct closure *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct 
write_point *, - struct bkey_i_extent *, unsigned); + struct bkey_i *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 258a67d4437b..cd2fff851bbe 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -541,6 +541,7 @@ struct bch_fs { __uuid_t uuid; __uuid_t user_uuid; + u16 version; u16 encoded_extent_max; u8 nr_devices; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a00e77fa1d37..801156b74335 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -307,15 +307,6 @@ static inline void bkey_init(struct bkey *k) #define __BKEY_PADDED(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -#define BKEY_VAL_TYPE(name, nr) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -} - /* * - DELETED keys are used internally to mark keys that should be ignored but * override keys in composition order. Their version number is ignored. @@ -330,19 +321,37 @@ struct bkey_i_##name { \ * by new writes or cluster-wide GC. Node repair can also overwrite them with * the same or a more recent version number, but not with an older version * number. + * + * - WHITEOUT: for hash table btrees */ -#define KEY_TYPE_DELETED 0 -#define KEY_TYPE_DISCARD 1 -#define KEY_TYPE_ERROR 2 -#define KEY_TYPE_COOKIE 3 -#define KEY_TYPE_PERSISTENT_DISCARD 4 -#define KEY_TYPE_GENERIC_NR 128 +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(discard, 1) \ + x(error, 2) \ + x(cookie, 3) \ + x(whiteout, 4) \ + x(btree_ptr, 5) \ + x(extent, 6) \ + x(reservation, 7) \ + x(inode, 8) \ + x(inode_generation, 9) \ + x(dirent, 10) \ + x(xattr, 11) \ + x(alloc, 12) \ + x(quota, 13) \ + x(stripe, 14) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, + BCH_BKEY_TYPES() +#undef x + KEY_TYPE_MAX, +}; struct bch_cookie { struct bch_val v; __le64 cookie; }; -BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); /* Extents */ @@ -620,21 +629,12 @@ union bch_extent_entry { #undef x }; -enum { - BCH_EXTENT = 128, - - /* - * This is kind of a hack, we're overloading the type for a boolean that - * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED - * have the same value type: - */ - BCH_EXTENT_CACHED = 129, +struct bch_btree_ptr { + struct bch_val v; - /* - * Persistent reservation: - */ - BCH_RESERVATION = 130, -}; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __attribute__((packed, aligned(8))); struct bch_extent { struct bch_val v; @@ -642,7 +642,6 @@ struct bch_extent { __u64 _data[0]; union bch_extent_entry start[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(extent, BCH_EXTENT); struct bch_reservation { struct bch_val v; @@ -651,7 +650,6 @@ struct bch_reservation { __u8 nr_replicas; __u8 pad[3]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(reservation, BCH_RESERVATION); /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ @@ -679,12 +677,6 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION); #define BCACHEFS_ROOT_INO 4096 -enum bch_inode_types { - BCH_INODE_FS = 128, - BCH_INODE_BLOCKDEV = 129, - BCH_INODE_GENERATION = 130, -}; - struct bch_inode { struct bch_val v; @@ -693,7 +685,6 @@ struct bch_inode { __le16 bi_mode; __u8 fields[0]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode, BCH_INODE_FS); struct bch_inode_generation { struct bch_val v; @@ 
-701,7 +692,6 @@ struct bch_inode_generation { __le32 bi_generation; __le32 pad; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); #define BCH_INODE_FIELDS() \ BCH_INODE_FIELD(bi_atime, 64) \ @@ -766,24 +756,6 @@ enum { LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); -struct bch_inode_blockdev { - struct bch_val v; - - __le64 i_size; - __le64 i_flags; - - /* Seconds: */ - __le64 i_ctime; - __le64 i_mtime; - - __uuid_t i_uuid; - __u8 i_label[32]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); - -/* Thin provisioned volume, or cache for another block device? */ -LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) - /* Dirents */ /* @@ -797,11 +769,6 @@ LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) * collision: */ -enum { - BCH_DIRENT = 128, - BCH_DIRENT_WHITEOUT = 129, -}; - struct bch_dirent { struct bch_val v; @@ -816,7 +783,6 @@ struct bch_dirent { __u8 d_name[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(dirent, BCH_DIRENT); #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ @@ -825,16 +791,11 @@ BKEY_VAL_TYPE(dirent, BCH_DIRENT); /* Xattrs */ -enum { - BCH_XATTR = 128, - BCH_XATTR_WHITEOUT = 129, -}; - -#define BCH_XATTR_INDEX_USER 0 -#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define BCH_XATTR_INDEX_TRUSTED 3 -#define BCH_XATTR_INDEX_SECURITY 4 +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 struct bch_xattr { struct bch_val v; @@ -843,14 +804,9 @@ struct bch_xattr { __le16 x_val_len; __u8 x_name[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(xattr, BCH_XATTR); /* Bucket/allocation information: */ -enum { - BCH_ALLOC = 128, -}; - enum { BCH_ALLOC_FIELD_READ_TIME = 0, BCH_ALLOC_FIELD_WRITE_TIME = 1, @@ -862,14 +818,9 @@ struct bch_alloc { __u8 gen; __u8 data[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(alloc, BCH_ALLOC); /* Quotas: */ -enum { - BCH_QUOTA = 128, -}; - enum quota_types { QTYP_USR = 0, QTYP_GRP = 1, @@ -892,14 +843,9 @@ struct bch_quota { struct bch_val v; struct bch_quota_counter c[Q_COUNTERS]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(quota, BCH_QUOTA); /* Erasure coding */ -enum { - BCH_STRIPE = 128, -}; - struct bch_stripe { struct bch_val v; __le16 sectors; @@ -913,7 +859,6 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(stripe, BCH_STRIPE); /* Optional/variable size superblock sections: */ @@ -1149,15 +1094,21 @@ struct bch_sb_field_clean { /* Superblock: */ /* - * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS - * BCH_MEMBER_DATA_ALLOWED - * Version 9: incompatible extent nonce change + * New versioning scheme: + * One common version number for all on disk data structures - superblock, btree + * nodes, journal entries */ +#define BCH_JSET_VERSION_OLD 2 +#define BCH_BSET_VERSION_OLD 3 + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, + bcachefs_metadata_version_new_versioning = 10, + bcachefs_metadata_version_bkey_renumber = 10, + bcachefs_metadata_version_max = 11, +}; -#define BCH_SB_VERSION_MIN 7 -#define BCH_SB_VERSION_EXTENT_MAX 8 -#define BCH_SB_VERSION_EXTENT_NONCE_V1 9 -#define BCH_SB_VERSION_MAX 9 
+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 #define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ @@ -1176,6 +1127,9 @@ struct bch_sb_layout { /* * @offset - sector where this sb was written * @version - on disk format version + * @version_min - Oldest metadata version this filesystem contains; so we can + * safely drop compatibility code and refuse to mount filesystems + * we'd need it for * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying @@ -1369,11 +1323,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb) /* Journal */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ -#define BCACHE_JSET_VERSION_JKEYS 2 -#define BCACHE_JSET_VERSION 2 - #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) #define BCH_JSET_ENTRY_TYPES() \ @@ -1453,35 +1402,26 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); /* Btree: */ -#define DEFINE_BCH_BTREE_IDS() \ - DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") \ - DEF_BTREE_ID(DIRENTS, 2, "dirents") \ - DEF_BTREE_ID(XATTRS, 3, "xattrs") \ - DEF_BTREE_ID(ALLOC, 4, "alloc") \ - DEF_BTREE_ID(QUOTAS, 5, "quotas") \ - DEF_BTREE_ID(EC, 6, "erasure_coding") - -#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, +#define BCH_BTREE_IDS() \ + x(EXTENTS, 0, "extents") \ + x(INODES, 1, "inodes") \ + x(DIRENTS, 2, "dirents") \ + x(XATTRS, 3, "xattrs") \ + x(ALLOC, 4, "alloc") \ + x(QUOTAS, 5, "quotas") \ + x(EC, 6, "erasure_coding") enum btree_id { - DEFINE_BCH_BTREE_IDS() +#define x(kwd, val, name) BTREE_ID_##kwd = val, + BCH_BTREE_IDS() +#undef x BTREE_ID_NR }; -#undef DEF_BTREE_ID - #define BTREE_MAX_DEPTH 4U /* Btree nodes */ -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_CSUM 1 -#define BCACHE_BSET_KEY_v1 2 -#define BCACHE_BSET_JOURNAL_SEQ 3 -#define BCACHE_BSET_VERSION 3 - /* * Btree nodes * diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index d7e022ba2027..d35cdde299c4 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -488,7 +488,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, pack_state_finish(&state, out); out->u64s = f->key_u64s; out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_DELETED; + out->type = KEY_TYPE_deleted; #ifdef CONFIG_BCACHEFS_DEBUG if (exact) { diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 9679631a7e89..44044fcd6f9f 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -61,10 +61,12 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED) +#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k)) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) #define bkey_packed_typecheck(_k) \ ({ \ @@ -439,7 +441,15 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion * functions. 
*/ -#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \ +#define BKEY_VAL_ACCESSORS(name) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ struct bkey_s_c_##name { \ union { \ struct { \ @@ -464,20 +474,20 @@ struct bkey_s_##name { \ \ static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline const struct bkey_i_##name * \ bkey_i_to_##name##_c(const struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ { \ - _assert(k.k->type, nr); \ + EBUG_ON(k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -486,7 +496,7 @@ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ \ static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ { \ - _assert(k.k->type, nr); \ + EBUG_ON(k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -512,7 +522,7 @@ name##_i_to_s_c(const struct bkey_i_##name *k) \ \ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ @@ -522,27 +532,13 @@ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ static inline struct bkey_s_c_##name \ bkey_i_to_s_c_##name(const struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ }; \ } \ \ -static inline struct bch_##name * \ -bkey_p_##name##_val(const struct bkey_format *f, \ - struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ -static inline const struct bch_##name * \ -bkey_p_c_##name##_val(const struct bkey_format *f, \ - const struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ { \ struct bkey_i_##name *k = \ @@ -550,45 +546,23 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ \ bkey_init(&k->k); \ memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = nr; \ + k->k.type = KEY_TYPE_##name; \ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ \ return k; \ } -#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr) - -#define BKEY_VAL_ACCESSORS(name, _nr) \ - static inline void __bch_##name##_assert(u8 type, u8 nr) \ - { \ - EBUG_ON(type != _nr); \ - } \ - \ - __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert) - -BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE); - -static inline void __bch2_extent_assert(u8 type, u8 nr) -{ - EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED); -} - -__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert); -BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); - -BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); -BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); -BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION); - -BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); - 
-BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); - -BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); - -BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); - -BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE); +BKEY_VAL_ACCESSORS(cookie); +BKEY_VAL_ACCESSORS(btree_ptr); +BKEY_VAL_ACCESSORS(extent); +BKEY_VAL_ACCESSORS(reservation); +BKEY_VAL_ACCESSORS(inode); +BKEY_VAL_ACCESSORS(inode_generation); +BKEY_VAL_ACCESSORS(dirent); +BKEY_VAL_ACCESSORS(xattr); +BKEY_VAL_ACCESSORS(alloc); +BKEY_VAL_ACCESSORS(quota); +BKEY_VAL_ACCESSORS(stripe); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 81c66950668c..f518062d896b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -12,66 +12,84 @@ #include "quota.h" #include "xattr.h" -const struct bkey_ops bch2_bkey_ops[] = { - [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops, - [BKEY_TYPE_INODES] = bch2_bkey_inode_ops, - [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops, - [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, - [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, - [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, - [BKEY_TYPE_EC] = bch2_bkey_ec_ops, - [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, +const char * const bch_bkey_types[] = { +#define x(name, nr) #name, + BCH_BKEY_TYPES() +#undef x + NULL }; -const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static const char *deleted_key_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + return NULL; +} + +const struct bkey_ops bch2_bkey_ops_deleted = { + .key_invalid = deleted_key_invalid, +}; + +const struct bkey_ops bch2_bkey_ops_discard = { + .key_invalid = deleted_key_invalid, +}; - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - return NULL; +static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k)) + return "value size should be zero"; - case KEY_TYPE_ERROR: - return bkey_val_bytes(k.k) != 0 - ? "value size should be zero" - : NULL; + return NULL; +} - case KEY_TYPE_COOKIE: - return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) - ? 
"incorrect value size" - : NULL; +const struct bkey_ops bch2_bkey_ops_error = { + .key_invalid = empty_val_key_invalid, +}; - default: - if (k.k->type < KEY_TYPE_GENERIC_NR) - return "invalid type"; +static const char *key_type_cookie_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) + return "incorrect value size"; - return ops->key_invalid(c, k); - } + return NULL; } -const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +const struct bkey_ops bch2_bkey_ops_cookie = { + .key_invalid = key_type_cookie_invalid, +}; + +const struct bkey_ops bch2_bkey_ops_whiteout = { + .key_invalid = empty_val_key_invalid, +}; + +static const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +#undef x +}; + +const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + if (k.k->type >= KEY_TYPE_MAX) + return "invalid type"; + + return bch2_bkey_ops[k.k->type].key_invalid(c, k); +} +const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) +{ if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (!ops->is_extents) { - if (k.k->size) - return "nonzero size field"; - } else { + if (btree_node_type_is_extents(type)) { if ((k.k->size == 0) != bkey_deleted(k.k)) return "bad size field"; + } else { + if (k.k->size) + return "nonzero size field"; } - if (ops->is_extents && - !k.k->size && - !bkey_deleted(k.k)) - return "zero size field"; - if (k.k->p.snapshot) return "nonzero snapshot"; @@ -82,11 +100,11 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, return NULL; } -const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) { - return __bch2_bkey_invalid(c, type, k) ?: - bch2_bkey_val_invalid(c, type, k); + return __bch2_bkey_invalid(c, k, type) ?: + bch2_bkey_val_invalid(c, k); } const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) @@ -102,24 +120,22 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - enum bkey_type type = btree_node_type(b); - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; const char *invalid; BUG_ON(!k.k->u64s); - invalid = bch2_bkey_invalid(c, type, k) ?: + invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, k); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, type, k); + bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); return; } - if (k.k->type >= KEY_TYPE_GENERIC_NR && - ops->key_debugcheck) + if (ops->key_debugcheck) ops->key_debugcheck(c, b, k); } @@ -144,46 +160,90 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) } void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k) -{ - const struct bkey_ops *ops = &bch2_bkey_ops[type]; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - pr_buf(out, " deleted"); - break; - case KEY_TYPE_DISCARD: - pr_buf(out, " discard"); - break; - case KEY_TYPE_ERROR: - pr_buf(out, " error"); - break; - case KEY_TYPE_COOKIE: - pr_buf(out, " cookie"); - break; - default: - if (k.k->type >= 
KEY_TYPE_GENERIC_NR && ops->val_to_text) - ops->val_to_text(out, c, k); - break; - } + struct bkey_s_c k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); + else + pr_buf(out, " %s", bch_bkey_types[k.k->type]); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k) + struct bkey_s_c k) { bch2_bkey_to_text(out, k.k); pr_buf(out, ": "); - bch2_val_to_text(out, c, type, k); + bch2_val_to_text(out, c, k); } -void bch2_bkey_swab(enum bkey_type type, - const struct bkey_format *f, - struct bkey_packed *k) +void bch2_bkey_swab(const struct bkey_format *f, + struct bkey_packed *k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; bch2_bkey_swab_key(f, k); if (ops->swab) ops->swab(f, k); } + +bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + + return ops->key_normalize + ? ops->key_normalize(c, k) + : false; +} + +enum merge_result bch2_bkey_merge(struct bch_fs *c, + struct bkey_i *l, struct bkey_i *r) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[l->k.type]; + + if (!key_merging_disabled(c) && + ops->key_merge && + l->k.type == r->k.type && + !bversion_cmp(l->k.version, r->k.version) && + !bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + return ops->key_merge(c, l, r); + + return BCH_MERGE_NOMERGE; +} + +static const struct old_bkey_type { + u8 btree_node_type; + u8 old; + u8 new; +} bkey_renumber_table[] = { + {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, + {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, + {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, + {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, + {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, +}; + +void bch2_bkey_renumber(enum btree_node_type btree_node_type, + struct bkey_packed *k, + int write) +{ + const struct old_bkey_type *i; + + for (i = bkey_renumber_table; + i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); + i++) + if (btree_node_type == i->btree_node_type && + k->type == (write ? i->new : i->old)) { + k->type = write ? i->old : i->new; + break; + } +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 62b86a8e2ba8..a4bfd2aef5bf 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -4,24 +4,12 @@ #include "bkey.h" -#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val, - -enum bkey_type { - DEFINE_BCH_BTREE_IDS() - BKEY_TYPE_BTREE, -}; - -#undef DEF_BTREE_ID - -/* Type of a key in btree @id at level @level: */ -static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) -{ - return level ? 
BKEY_TYPE_BTREE : (enum bkey_type) id; -} - struct bch_fs; struct btree; struct bkey; +enum btree_node_type; + +extern const char * const bch_bkey_types[]; enum merge_result { BCH_MERGE_NOMERGE, @@ -34,12 +22,6 @@ enum merge_result { BCH_MERGE_MERGE, }; -typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *, - struct bkey_s); -typedef enum merge_result (*key_merge_fn)(struct bch_fs *, - struct btree *, - struct bkey_i *, struct bkey_i *); - struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, @@ -49,41 +31,34 @@ struct bkey_ops { void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); - key_filter_fn key_normalize; - key_merge_fn key_merge; - bool is_extents; + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + enum merge_result (*key_merge)(struct bch_fs *, + struct bkey_i *, struct bkey_i *); }; -static inline bool bkey_type_needs_gc(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_EC: - return true; - default: - return false; - } -} - -const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type, - struct bkey_s_c); -const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); -const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); +const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); +const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); +const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -void bch2_val_to_text(struct printbuf *, struct bch_fs *, enum bkey_type, +void bch2_val_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, - enum bkey_type, struct bkey_s_c); + struct bkey_s_c); + +void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); + +bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -void bch2_bkey_swab(enum bkey_type, const struct bkey_format *, - struct bkey_packed *); +enum merge_result bch2_bkey_merge(struct bch_fs *, + struct bkey_i *, struct bkey_i *); -extern const struct bkey_ops bch2_bkey_ops[]; +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); #endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 706ca77d4b17..12825c1b292f 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -257,7 +257,7 @@ static void extent_sort_append(struct bch_fs *c, bch2_bkey_unpack(b, &tmp.k, k); if (*prev && - bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) + bch2_bkey_merge(c, (void *) *prev, &tmp.k)) return; if (*prev) { @@ -375,7 +375,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, } /* Sort + repack in a new format: */ -static struct btree_nr_keys +struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, struct btree_node_iter *src_iter, struct bkey_format *out_f, @@ -411,18 +411,12 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bset *dst, struct btree *src, struct btree_node_iter *iter, struct bkey_format *out_f, - 
bool filter_whiteouts, - key_filter_fn filter, - key_merge_fn merge) + bool filter_whiteouts) { struct bkey_packed *k, *prev = NULL, *out; struct btree_nr_keys nr; BKEY_PADDED(k) tmp; - if (!filter && !merge) - return bch2_sort_repack(dst, src, iter, out_f, - filter_whiteouts); - memset(&nr, 0, sizeof(nr)); while ((k = bch2_btree_node_iter_next_all(iter, src))) { @@ -435,14 +429,15 @@ bch2_sort_repack_merge(struct bch_fs *c, */ bch2_bkey_unpack(src, &tmp.k, k); - if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) + if (filter_whiteouts && + bch2_bkey_normalize(c, bkey_i_to_s(&tmp.k))) continue; /* prev is always unpacked, for key merging: */ if (prev && - merge && - merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) + bch2_bkey_merge(c, (void *) prev, &tmp.k) == + BCH_MERGE_MERGE) continue; /* @@ -606,7 +601,7 @@ unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, continue; EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_DISCARD); + EBUG_ON(in->type != KEY_TYPE_discard); r.k = bkey_unpack_key(iter->b, in); diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 6b1661dd221a..397009181eae 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -47,13 +47,14 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, struct btree_node_iter_large *); struct btree_nr_keys +bch2_sort_repack(struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); +struct btree_nr_keys bch2_sort_repack_merge(struct bch_fs *, struct bset *, struct btree *, struct btree_node_iter *, - struct bkey_format *, - bool, - key_filter_fn, - key_merge_fn); + struct bkey_format *, bool); unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5d03036620b9..329ffb0b6b3d 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -397,7 +397,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) static inline struct bkey_packed * bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { - return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1); + return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); } enum bch_extent_overlap { @@ -529,7 +529,7 @@ bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, static inline struct bkey_packed * bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1); + return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); } static inline struct bkey_packed * @@ -555,7 +555,7 @@ bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) static inline struct bkey_packed * bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1); + return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); } struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 846d5e816aa2..b748afc778f4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -6,20 +6,17 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" -#include "extents.h" #include "trace.h" #include -#define DEF_BTREE_ID(kwd, val, name) name, - const char * const bch2_btree_ids[] = { - DEFINE_BCH_BTREE_IDS() +#define x(kwd, val, name) name, + BCH_BTREE_IDS() +#undef x NULL 
}; -#undef DEF_BTREE_ID - void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -100,7 +97,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) if (!b) return NULL; - bkey_extent_init(&b->key); + bkey_btree_ptr_init(&b->key); six_lock_init(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); @@ -117,7 +114,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - bkey_i_to_extent(&b->key)->v._data[0] = 0; + PTR_HASH(&b->key) = 0; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -604,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, /* raced with another fill: */ /* mark as unhashed... */ - bkey_i_to_extent(&b->key)->v._data[0] = 0; + PTR_HASH(&b->key) = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); @@ -906,8 +903,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->data->min_key.offset, b->data->max_key.inode, b->data->max_key.offset); - bch2_val_to_text(out, c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); pr_buf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index cb7f66fc8bd4..7bd2bc84160d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -4,7 +4,6 @@ #include "bcachefs.h" #include "btree_types.h" -#include "extents.h" struct btree_iter; @@ -37,12 +36,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); void bch2_fs_btree_cache_init_early(struct btree_cache *); -#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) +#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) /* is btree node in hash table? 
*/ static inline bool btree_node_hashed(struct btree *b) { - return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key); + return b->key.k.type == KEY_TYPE_btree_ptr && + PTR_HASH(&b->key); } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a849f9e320b3..85fc181e76a8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -112,137 +112,11 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, /* marking of btree keys/nodes: */ -static void ptr_gen_recalc_oldest(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - u8 *max_stale) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - - if (gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; - - *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -} - -static void ptr_gens_recalc_oldest(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, u8 *max_stale) -{ - const struct bch_extent_ptr *ptr; - - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - extent_for_each_ptr(e, ptr) - ptr_gen_recalc_oldest(c, ptr, max_stale); - break; - } - } - break; - case BKEY_TYPE_EC: - switch (k.k->type) { - case BCH_STRIPE: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) - ptr_gen_recalc_oldest(c, ptr, max_stale); - } - } - default: - break; - } -} - -static int ptr_gen_check(struct bch_fs *c, - enum bkey_type type, - const struct bch_extent_ptr *ptr) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = PTR_BUCKET(ca, ptr); - int ret = 0; - - if (mustfix_fsck_err_on(!g->mark.gen_valid, c, - "found ptr with missing gen in alloc btree,\n" - "type %u gen %u", - type, ptr->gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); - } - - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%u ptr gen in the future: %u > %u", - type, ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); - set_bit(BCH_FS_FIXED_GENS, &c->flags); - } -fsck_err: - return ret; -} - -static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) -{ - const struct bch_extent_ptr *ptr; - int ret = 0; - - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - extent_for_each_ptr(e, ptr) { - ret = ptr_gen_check(c, type, ptr); - if (ret) - return ret; - - } - break; - } - } - break; - case BKEY_TYPE_EC: - switch (k.k->type) { - case BCH_STRIPE: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) { - ret = ptr_gen_check(c, type, ptr); - if (ret) - return ret; - } - } - } - break; - default: - break; - } - - return ret; -} - -/* - * For runtime mark and sweep: - */ -static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, +static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, u8 *max_stale, bool initial) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; struct gc_pos pos = { 0 }; unsigned flags = BCH_BUCKET_MARK_GC| @@ -257,23 
+131,50 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, type, k, - false), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, "superblock not marked as containing replicas (type %u)", - type)) { - ret = bch2_mark_bkey_replicas(c, type, k); + k.k->type)) { + ret = bch2_mark_bkey_replicas(c, k); if (ret) return ret; } - ret = ptr_gens_check(c, type, k); - if (ret) - return ret; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr); + + if (mustfix_fsck_err_on(!g->mark.gen_valid, c, + "found ptr with missing gen in alloc btree,\n" + "type %u gen %u", + k.k->type, ptr->gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + } + + if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + "%u ptr gen in the future: %u > %u", + k.k->type, ptr->gen, g->mark.gen)) { + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; + set_bit(b, ca->buckets_dirty); + set_bit(BCH_FS_FIXED_GENS, &c->flags); + } + } } - bch2_mark_key(c, type, k, true, k.k->size, pos, NULL, 0, flags); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + + if (gen_after(ca->oldest_gens[b], ptr->gen)) + ca->oldest_gens[b] = ptr->gen; + + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } - ptr_gens_recalc_oldest(c, type, k, max_stale); + bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags); fsck_err: return ret; } @@ -281,7 +182,6 @@ fsck_err: static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { - enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -289,14 +189,14 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, *max_stale = 0; - if (!bkey_type_needs_gc(type)) + if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, type, k, max_stale, initial); + ret = bch2_gc_mark_key(c, k, max_stale, initial); if (ret) break; } @@ -310,7 +210,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter iter; struct btree *b; struct range_checks r; - unsigned depth = bkey_type_needs_gc(btree_id) ? 0 : 1; + unsigned depth = btree_node_type_needs_gc(btree_id) ? 
0 : 1; u8 max_stale; int ret = 0; @@ -364,7 +264,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), + bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); @@ -391,13 +291,13 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; - enum bkey_type type = bkey_type(0, id); + enum btree_node_type type = __btree_node_type(0, id); int ret = bch2_gc_btree(c, id, initial); if (ret) return ret; - if (journal && bkey_type_needs_gc(type)) { + if (journal && btree_node_type_needs_gc(type)) { struct bkey_i *k, *n; struct jset_entry *j; struct journal_replay *r; @@ -405,8 +305,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, list_for_each_entry(r, journal, list) for_each_jset_key(k, n, j, &r->j) { - if (type == bkey_type(j->level, j->btree_id)) { - ret = bch2_gc_mark_key(c, type, + if (type == __btree_node_type(j->level, j->btree_id)) { + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(k), &max_stale, initial); if (ret) @@ -507,8 +407,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&d->key), + bch2_mark_key(c, bkey_i_to_s_c(&d->key), true, 0, pos, NULL, 0, BCH_BUCKET_MARK_GC); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index bb77564b9463..89ee72ac49f6 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -4,8 +4,6 @@ #include "btree_types.h" -enum bkey_type; - void bch2_coalesce(struct bch_fs *); int bch2_gc(struct bch_fs *, struct list_head *, bool); void bch2_gc_thread_stop(struct bch_fs *); @@ -58,9 +56,9 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) { switch (id) { -#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; - DEFINE_BCH_BTREE_IDS() -#undef DEF_BTREE_ID +#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; + BCH_BTREE_IDS() +#undef x default: BUG(); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 506bf9e8df38..f205bddd814d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -392,12 +392,16 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_btree_node_iter_init_from_start(&src_iter, src); - nr = bch2_sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true, - btree_node_ops(src)->key_normalize, - btree_node_ops(src)->key_merge); + if (btree_node_is_extents(src)) + nr = bch2_sort_repack_merge(c, btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); + else + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); @@ -598,8 +602,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, { struct bkey_packed *k, *prev = NULL; struct bpos prev_pos = POS_MIN; - enum bkey_type type = btree_node_type(b); bool seen_non_whiteout = false; + unsigned version; const char *err; int ret = 0; @@ -645,13 +649,12 @@ static int validate_bset(struct bch_fs *c, struct btree *b, "invalid bkey format: %s", err); } - if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION, - BTREE_ERR_FIXABLE, c, b, i, - "unsupported bset version")) { - 
i->version = cpu_to_le16(BCACHE_BSET_VERSION); - i->u64s = 0; - return 0; - } + version = le16_to_cpu(i->version); + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + BTREE_ERR_FATAL, c, b, i, + "unsupported bset version"); if (btree_err_on(b->written + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, b, i, @@ -700,17 +703,21 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab(type, &b->format, k); + bch2_bkey_swab(&b->format, k); + + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(btree_node_type(b), k, write); u = bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, type, u) ?: + invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, u) ?: - (write ? bch2_bkey_val_invalid(c, type, u) : NULL); + (write ? bch2_bkey_val_invalid(c, u) : NULL); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, type, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey:\n%s\n%s", invalid, buf); @@ -720,6 +727,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } + if (write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(btree_node_type(b), k, write); + /* * with the separate whiteouts thing (used for extents), the * second set of keys actually can have whiteouts too, so we @@ -885,17 +896,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { - enum bkey_type type = btree_node_type(b); struct bkey tmp; struct bkey_s_c u = bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, type, u); + const char *invalid = bch2_bkey_val_invalid(c, u); if (invalid || (inject_invalid_keys(c) && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, type, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey %s: %s", buf, invalid); @@ -964,7 +974,9 @@ start: bch2_mark_io_failure(&failed, &rb->pick); - can_retry = bch2_btree_pick_ptr(c, b, &failed, &rb->pick) > 0; + can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, b, can_retry)) @@ -1007,7 +1019,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, trace_btree_read(c, b); - ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from")) { set_btree_node_read_error(b); @@ -1135,8 +1148,8 @@ static void bch2_btree_node_write_error(struct bch_fs *c, { struct btree *b = wbio->wbio.bio.bi_private; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_extent *new_key; - struct bkey_s_extent e; + struct bkey_i_btree_ptr *new_key; + struct bkey_s_btree_ptr bp; struct bch_extent_ptr *ptr; struct btree_iter iter; int ret; @@ -1160,13 +1173,13 @@ retry: bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_extent(&tmp.k); - e = extent_i_to_s(new_key); + new_key = bkey_i_to_btree_ptr(&tmp.k); + bp = btree_ptr_i_to_s(new_key); - bch2_extent_drop_ptrs(e, ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); 
- if (!bch2_extent_nr_ptrs(e.c)) + if (!bch2_bkey_nr_ptrs(bp.s_c)) goto err; ret = bch2_btree_node_update_key(c, &iter, b, new_key); @@ -1269,12 +1282,11 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - const struct bch_extent_ptr *ptr; unsigned whiteout_u64s = 0; int ret; - extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr) - break; + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + return -1; ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); if (ret) @@ -1292,7 +1304,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; BKEY_PADDED(key) k; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1300,6 +1311,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, u64 seq = 0; bool used_mempool; unsigned long old, new; + bool validate_before_checksum = false; void *data; if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) @@ -1433,11 +1445,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = cpu_to_le16(BCACHE_BSET_VERSION); + i->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) + validate_before_checksum = true; + + /* validate_bset will be modifying: */ + if (le16_to_cpu(i->version) < + bcachefs_metadata_version_bkey_renumber) + validate_before_checksum = true; + /* if we're going to be encrypting, check metadata validity first: */ - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + if (validate_before_checksum && validate_bset_for_write(c, b, i, sectors_to_write)) goto err; @@ -1451,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); /* if we're not encrypting, check metadata after checksumming: */ - if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + if (!validate_before_checksum && validate_bset_for_write(c, b, i, sectors_to_write)) goto err; @@ -1506,9 +1528,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, */ bkey_copy(&k.key, &b->key); - e = bkey_i_to_s_extent(&k.key); - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ptr->offset += b->written; b->written += sectors_to_write; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index afc43722c1fc..4720061e9562 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -433,7 +433,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * whiteouts) */ k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS - ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD) + ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && btree_iter_pos_cmp(iter, b, k) > 0) { char buf[100]; @@ -622,7 +622,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, * signal to bch2_btree_iter_peek_slot() that we're currently at * a hole */ - u->type = KEY_TYPE_DELETED; + u->type = KEY_TYPE_deleted; return bkey_s_c_null; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7eecaa6cd5a2..b4a826369a57 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -405,20 +405,45 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) return i - (void *) b->data; } +enum btree_node_type { +#define x(kwd, val, name) BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_BTREE, +}; + +/* Type of a key in btree @id at level @level: */ +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) +{ + return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; +} + /* Type of keys @b contains: */ -static inline enum bkey_type btree_node_type(struct btree *b) +static inline enum btree_node_type btree_node_type(struct btree *b) { - return b->level ? BKEY_TYPE_BTREE : b->btree_id; + return __btree_node_type(b->level, b->btree_id); } -static inline const struct bkey_ops *btree_node_ops(struct btree *b) +static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return &bch2_bkey_ops[btree_node_type(b)]; + return type == BKEY_TYPE_EXTENTS; } static inline bool btree_node_is_extents(struct btree *b) { - return btree_node_type(b) == BKEY_TYPE_EXTENTS; + return btree_node_type_is_extents(btree_node_type(b)); +} + +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ + switch (type) { + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_EC: + return true; + default: + return false; + } } struct btree_root { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index f6b0082235af..d1647f6eb476 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -120,7 +120,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, - struct btree *, struct bkey_i_extent *); + struct btree *, struct bkey_i_btree_ptr *); /* new transactional interface: */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7d7a021416f3..22f087098776 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -132,13 +132,15 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ static bool btree_key_matches(struct bch_fs *c, - struct bkey_s_c_extent l, - struct bkey_s_c_extent r) + struct bkey_s_c l, + struct bkey_s_c r) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r); const struct bch_extent_ptr *ptr1, *ptr2; - extent_for_each_ptr(l, ptr1) - extent_for_each_ptr(r, ptr2) + bkey_for_each_ptr(ptrs1, ptr1) + bkey_for_each_ptr(ptrs2, ptr2) if (ptr1->dev == ptr2->dev && ptr1->gen == ptr2->gen && ptr1->offset == ptr2->offset) @@ -164,8 +166,7 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, for (d = as->pending; d < as->pending + as->nr_pending; d++) if (!bkey_cmp(k.k->p, d->key.k.p) && - btree_key_matches(c, 
bkey_s_c_to_extent(k), - bkey_i_to_s_c_extent(&d->key))) + btree_key_matches(c, k, bkey_i_to_s_c(&d->key))) goto found; BUG(); found: @@ -197,7 +198,7 @@ found: ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) - bch2_mark_key_locked(c, BKEY_TYPE_BTREE, + bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), false, 0, pos, NULL, 0, BCH_BUCKET_MARK_GC); @@ -270,8 +271,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&pending->key), + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), false, 0, gc_phase(GC_PHASE_PENDING_DELETE), NULL, 0, 0); @@ -285,7 +285,6 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct write_point *wp; struct btree *b; BKEY_PADDED(k) tmp; - struct bkey_i_extent *e; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; @@ -336,8 +335,8 @@ retry: goto retry; } - e = bkey_extent_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + bkey_btree_ptr_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); @@ -375,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; + b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; bch2_btree_build_aux_trees(b); @@ -528,8 +527,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); if (ret) goto err_free; @@ -1072,8 +1070,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->usage_lock); - bch2_mark_key_locked(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key), + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -1166,11 +1163,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->usage_lock); - if (bkey_extent_is_data(&insert->k)) - bch2_mark_key_locked(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(insert), - true, 0, - gc_pos_btree_node(b), &stats, 0, 0); + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + true, 0, + gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1893,7 +1888,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree_update *as, struct btree_iter *iter, struct btree *b, struct btree *new_hash, - struct bkey_i_extent *new_key) + struct bkey_i_btree_ptr *new_key) { struct btree *parent; int ret; @@ -1938,7 +1933,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, */ ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, c->opts.btree_node_size * - bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)), + bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), BCH_DISK_RESERVATION_NOFAIL| BCH_DISK_RESERVATION_GC_LOCK_HELD); BUG_ON(ret); @@ -1978,8 +1973,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, 
mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->usage_lock); - bch2_mark_key_locked(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&new_key->k_i), + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -2012,7 +2006,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, } int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, struct bkey_i_extent *new_key) + struct btree *b, + struct bkey_i_btree_ptr *new_key) { struct btree *parent = btree_node_parent(iter, b); struct btree_update *as = NULL; @@ -2078,8 +2073,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, - extent_i_to_s_c(new_key).s_c); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); if (ret) goto err_free_update; @@ -2137,9 +2131,9 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->level = 0; b->btree_id = id; - bkey_extent_init(&b->key); + bkey_btree_ptr_init(&b->key); b->key.k.p = POS_MAX; - bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id; + PTR_HASH(&b->key) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4b0d674472db..fd27334cf2a4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -71,7 +71,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, goto overwrite; } - k->type = KEY_TYPE_DELETED; + k->type = KEY_TYPE_deleted; bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); bch2_btree_iter_verify(iter, b); @@ -312,7 +312,6 @@ btree_key_can_insert(struct btree_insert *trans, return BTREE_INSERT_BTREE_NODE_FULL; if (!bch2_bkey_replicas_marked(c, - insert->iter->btree_id, bkey_i_to_s_c(insert->k), true)) return BTREE_INSERT_NEED_MARK_REPLICAS; @@ -449,8 +448,8 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id, - bkey_i_to_s_c(i->k))); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->iter->btree_id)); } /** @@ -585,8 +584,7 @@ err: } bch2_btree_iter_unlock(trans->entries[0].iter); - ret = bch2_mark_bkey_replicas(c, i->iter->btree_id, - bkey_i_to_s_c(i->k)) + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)) ?: -EINTR; break; default: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3f4bbf280a78..d08e95020cef 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -567,7 +567,7 @@ static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) crc.uncompressed_size)); } -static s64 ptr_disk_sectors(struct bkey_s_c_extent e, +static s64 ptr_disk_sectors(const struct bkey *k, struct extent_ptr_decoded p, s64 sectors) { @@ -579,8 +579,8 @@ static s64 ptr_disk_sectors(struct bkey_s_c_extent e, old_sectors = 0; new_sectors = sectors; } else { - old_sectors = e.k->size; - new_sectors = e.k->size + sectors; + old_sectors = k->size; + new_sectors = k->size + sectors; } sectors = -__disk_sectors(p.crc, old_sectors) @@ -596,7 +596,6 @@ static s64 ptr_disk_sectors(struct bkey_s_c_extent e, * that with the gc pos seqlock held. 
*/ static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, @@ -709,70 +708,54 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, u64 journal_seq, unsigned flags, bool gc) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + s64 cached_sectors = 0; + s64 dirty_sectors = 0; + s64 ec_sectors = 0; + unsigned replicas = 0; + unsigned ec_redundancy = 0; + unsigned i; + int ret; + BUG_ON(!sectors); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - s64 cached_sectors = 0; - s64 dirty_sectors = 0; - s64 ec_sectors = 0; - unsigned replicas = 0; - unsigned ec_redundancy = 0; - unsigned i; - int ret; - - extent_for_each_ptr_decode(e, p, entry) { - s64 disk_sectors = ptr_disk_sectors(e, p, sectors); - s64 adjusted_disk_sectors = disk_sectors; - - bch2_mark_pointer(c, e, p, disk_sectors, data_type, - stats, journal_seq, flags, gc); - - if (!p.ptr.cached) - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_mark_stripe_ptr(c, p.ec[i], - disk_sectors, flags, - &adjusted_disk_sectors, - &ec_redundancy, gc); - if (ret) - return ret; - } - if (!p.ptr.cached) - replicas++; - - if (p.ptr.cached) - cached_sectors += adjusted_disk_sectors; - else if (!p.ec_nr) - dirty_sectors += adjusted_disk_sectors; - else - ec_sectors += adjusted_disk_sectors; - } + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(k.k, p, sectors); + s64 adjusted_disk_sectors = disk_sectors; - replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); - ec_redundancy = clamp_t(unsigned, ec_redundancy, - 1, ARRAY_SIZE(stats->replicas)); + bch2_mark_pointer(c, p, disk_sectors, data_type, + stats, journal_seq, flags, gc); - stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; - stats->replicas[replicas - 1].data[data_type] += dirty_sectors; - stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; - break; + if (!p.ptr.cached) + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_mark_stripe_ptr(c, p.ec[i], + disk_sectors, flags, + &adjusted_disk_sectors, + &ec_redundancy, gc); + if (ret) + return ret; + } + if (!p.ptr.cached) + replicas++; + + if (p.ptr.cached) + cached_sectors += adjusted_disk_sectors; + else if (!p.ec_nr) + dirty_sectors += adjusted_disk_sectors; + else + ec_sectors += adjusted_disk_sectors; } - case BCH_RESERVATION: { - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - sectors *= replicas; - replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); + replicas = clamp_t(unsigned, replicas, + 1, ARRAY_SIZE(stats->replicas)); + ec_redundancy = clamp_t(unsigned, ec_redundancy, + 1, ARRAY_SIZE(stats->replicas)); - stats->replicas[replicas - 1].persistent_reserved += sectors; - break; - } - } + stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; + stats->replicas[replicas - 1].data[data_type] += dirty_sectors; + stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; return 0; } @@ -813,56 +796,49 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, u64 journal_seq, unsigned flags, bool gc) { - switch (k.k->type) { - case BCH_STRIPE: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - size_t idx = s.k->p.offset; - struct stripe *m = genradix_ptr(&c->stripes[gc], idx); - 
unsigned i; - - if (!m || (!inserting && !m->alive)) { - bch_err_ratelimited(c, "error marking nonexistent stripe %zu", - idx); - return -1; - } - - if (inserting && m->alive) { - bch_err_ratelimited(c, "error marking stripe %zu: already exists", - idx); - return -1; - } + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; - BUG_ON(atomic_read(&m->blocks_nonempty)); + if (!m || (!inserting && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); + return -1; + } - for (i = 0; i < EC_STRIPE_MAX; i++) - BUG_ON(atomic_read(&m->block_sectors[i])); + if (inserting && m->alive) { + bch_err_ratelimited(c, "error marking stripe %zu: already exists", + idx); + return -1; + } - if (inserting) { - m->sectors = le16_to_cpu(s.v->sectors); - m->algorithm = s.v->algorithm; - m->nr_blocks = s.v->nr_blocks; - m->nr_redundant = s.v->nr_redundant; - } + BUG_ON(atomic_read(&m->blocks_nonempty)); - if (!gc) { - if (inserting) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_del(c, m, idx); - } else { - m->alive = inserting; - } + for (i = 0; i < EC_STRIPE_MAX; i++) + BUG_ON(atomic_read(&m->block_sectors[i])); - bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); - break; + if (inserting) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; + m->nr_redundant = s.v->nr_redundant; } + + if (!gc) { + if (inserting) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_del(c, m, idx); + } else { + m->alive = inserting; } + bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); return 0; } -static int __bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, +static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags, @@ -870,22 +846,32 @@ static int __bch2_mark_key(struct bch_fs *c, { int ret = 0; - switch (type) { - case BKEY_TYPE_BTREE: + switch (k.k->type) { + case KEY_TYPE_btree_ptr: ret = bch2_mark_extent(c, k, inserting ? 
c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, stats, journal_seq, flags, gc); break; - case BKEY_TYPE_EXTENTS: + case KEY_TYPE_extent: ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, stats, journal_seq, flags, gc); break; - case BKEY_TYPE_EC: + case KEY_TYPE_stripe: ret = bch2_mark_stripe(c, k, inserting, stats, journal_seq, flags, gc); break; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, + 1, ARRAY_SIZE(stats->replicas)); + + stats->replicas[replicas - 1].persistent_reserved += sectors; + break; + } default: break; } @@ -894,7 +880,7 @@ static int __bch2_mark_key(struct bch_fs *c, } int bch2_mark_key_locked(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, + struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, struct bch_fs_usage *stats, @@ -906,7 +892,7 @@ int bch2_mark_key_locked(struct bch_fs *c, if (!stats) stats = this_cpu_ptr(c->usage[0]); - ret = __bch2_mark_key(c, type, k, inserting, sectors, + ret = __bch2_mark_key(c, k, inserting, sectors, stats, journal_seq, flags, false); if (ret) return ret; @@ -914,7 +900,7 @@ int bch2_mark_key_locked(struct bch_fs *c, if ((flags & BCH_BUCKET_MARK_GC) || gc_visited(c, pos)) { - ret = __bch2_mark_key(c, type, k, inserting, sectors, + ret = __bch2_mark_key(c, k, inserting, sectors, this_cpu_ptr(c->usage[1]), journal_seq, flags, true); if (ret) @@ -924,8 +910,7 @@ int bch2_mark_key_locked(struct bch_fs *c, return 0; } -int bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, struct bch_fs_usage *stats, @@ -934,7 +919,7 @@ int bch2_mark_key(struct bch_fs *c, int ret; percpu_down_read(&c->usage_lock); - ret = bch2_mark_key_locked(c, type, k, inserting, sectors, + ret = bch2_mark_key_locked(c, k, inserting, sectors, pos, stats, journal_seq, flags); percpu_up_read(&c->usage_lock); @@ -952,20 +937,19 @@ void bch2_mark_update(struct btree_insert *trans, struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; - if (!bkey_type_needs_gc(iter->btree_id)) + if (!btree_node_type_needs_gc(iter->btree_id)) return; percpu_down_read(&c->usage_lock); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_mark_key_locked(c, btree_node_type(b), - bkey_i_to_s_c(insert->k), true, + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), pos, &stats, trans->journal_res.seq, 0); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_DISCARD))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; s64 sectors = 0; @@ -994,9 +978,8 @@ void bch2_mark_update(struct btree_insert *trans, sectors = k.k->p.offset - insert->k->k.p.offset; BUG_ON(sectors <= 0); - bch2_mark_key_locked(c, btree_node_type(b), - k, true, sectors, pos, &stats, - trans->journal_res.seq, 0); + bch2_mark_key_locked(c, k, true, sectors, + pos, &stats, trans->journal_res.seq, 0); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -1006,9 +989,8 @@ void bch2_mark_update(struct btree_insert *trans, BUG_ON(sectors >= 0); } - bch2_mark_key_locked(c, btree_node_type(b), - k, false, sectors, pos, &stats, - trans->journal_res.seq, 0); + bch2_mark_key_locked(c, k, false, sectors, + pos, &stats, trans->journal_res.seq, 0); bch2_btree_node_iter_advance(&node_iter, b); } diff --git 
a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 884041b53eb9..c584ad1b4375 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -220,10 +220,10 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) #define BCH_BUCKET_MARK_GC (1 << 1) -int bch2_mark_key_locked(struct bch_fs *, enum bkey_type, struct bkey_s_c, +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); -int bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 0a9efe57d5a9..f15c29878a9e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -56,7 +56,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->btree_id = b->btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); - if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick) <= 0) return; ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -223,8 +224,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, k = bch2_btree_iter_peek(&iter); while (k.k && !(err = btree_iter_err(k))) { - bch2_bkey_val_to_text(&PBUF(i->buf), i->c, - bkey_type(0, i->id), k); + bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); i->bytes = strlen(i->buf); BUG_ON(i->bytes >= PAGE_SIZE); i->buf[i->bytes] = '\n'; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index c1a611b4d9ec..80d37c568272 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -65,8 +65,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct bch_hash_desc bch2_dirent_hash_desc = { .btree_id = BTREE_ID_DIRENTS, - .key_type = BCH_DIRENT, - .whiteout_type = BCH_DIRENT_WHITEOUT, + .key_type = KEY_TYPE_dirent, .hash_key = dirent_hash_key, .hash_bkey = dirent_hash_bkey, .cmp_key = dirent_cmp_key, @@ -75,58 +74,37 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_dirent d; + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - switch (k.k->type) { - case BCH_DIRENT: - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) - return "value too small"; - - d = bkey_s_c_to_dirent(k); - len = bch2_dirent_name_bytes(d); - - if (!len) - return "empty name"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) + return "value too small"; - /* - * older versions of bcachefs were buggy and creating dirent - * keys that were bigger than necessary: - */ - if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) - return "value too big"; + len = bch2_dirent_name_bytes(d); + if (!len) + return "empty name"; - if (len > BCH_NAME_MAX) - return "dirent name too big"; + /* + * older versions of bcachefs were buggy and creating dirent + * keys that were bigger than necessary: + */ + if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) + return "value too big"; - return NULL; - case BCH_DIRENT_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; + if (len > BCH_NAME_MAX) + return "dirent name too big"; - default: - return "invalid type"; - } + return NULL; } void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_dirent d; - - switch (k.k->type) { - case BCH_DIRENT: - d = bkey_s_c_to_dirent(k); - - bch_scnmemcpy(out, d.v->d_name, - bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu", d.v->d_inum); - break; - case BCH_DIRENT_WHITEOUT: - pr_buf(out, "whiteout"); - break; - } + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); + pr_buf(out, " -> %llu", d.v->d_inum); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, @@ -287,7 +265,7 @@ int bch2_dirent_rename(struct btree_trans *trans, * overwrite old_dst - just make sure to use a * whiteout when deleting src: */ - new_src->k.type = BCH_DIRENT_WHITEOUT; + new_src->k.type = KEY_TYPE_whiteout; } } else { /* Check if we need a whiteout to delete src: */ @@ -298,7 +276,7 @@ int bch2_dirent_rename(struct btree_trans *trans, return ret; if (ret) - new_src->k.type = BCH_DIRENT_WHITEOUT; + new_src->k.type = KEY_TYPE_whiteout; } } @@ -361,7 +339,7 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) if (k.k->p.inode > dir_inum) break; - if (k.k->type == BCH_DIRENT) { + if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; } @@ -385,7 +363,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(inode->v.i_ino, ctx->pos), 0, k) { - if (k.k->type != BCH_DIRENT) + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 2afb0baed11a..7b47573dcc46 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -9,7 +9,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_dirent_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_dirent (struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ .val_to_text = bch2_dirent_to_text, \ } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 091a1f0a0432..010b9b90f2fc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -123,49 +123,39 @@ static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; } -const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + if (k.k->p.inode) return "invalid stripe key"; - switch (k.k->type) { - case BCH_STRIPE: { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - - if (bkey_val_bytes(k.k) < sizeof(*s)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) < sizeof(*s)) + return "incorrect value size"; - if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) - return "incorrect value size"; + if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + return "incorrect value size"; - return NULL; - } - default: - return "invalid type"; - } + return NULL; } -void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c, +void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - switch (k.k->type) { - case BCH_STRIPE: { - const struct bch_stripe *s = 
bkey_s_c_to_stripe(k).v; - unsigned i; - - pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - s->nr_blocks - s->nr_redundant, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); - - for (i = 0; i < s->nr_blocks; i++) - pr_buf(out, " %u:%llu", s->ptrs[i].dev, - (u64) s->ptrs[i].offset); - } - } + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i; + + pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + s->nr_blocks - s->nr_redundant, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) + pr_buf(out, " %u:%llu", s->ptrs[i].dev, + (u64) s->ptrs[i].offset); } static int ptr_matches_stripe(struct bch_fs *c, @@ -454,7 +444,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) POS(0, stripe_idx), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k) || k.k->type != BCH_STRIPE) { + if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) { __bcache_io_error(c, "error doing reconstruct read: stripe not found"); kfree(buf); @@ -695,7 +685,7 @@ static void ec_stripe_delete(struct bch_fs *c, size_t idx) POS(0, idx), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k) || k.k->type != BCH_STRIPE) + if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) goto out; v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index c35de8b1ef64..4a8cade37c7a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -5,13 +5,13 @@ #include "ec_types.h" #include "keylist_types.h" -const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *, +const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ec_ops (struct bkey_ops) { \ - .key_invalid = bch2_ec_key_invalid, \ - .val_to_text = bch2_ec_key_to_text, \ +#define bch2_bkey_ops_stripe (struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ } struct bch_read_bio; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 582499b08f31..c9a6f6e4a165 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -27,84 +27,34 @@ #include "util.h" #include "xattr.h" -/* Common among btree and extent ptrs */ - -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -void bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) -{ - struct bch_extent_ptr *ptr; - - bch2_extent_drop_ptrs(e, ptr, ptr->dev == dev); -} - -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; - } - - return NULL; -} - -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, 
ptr->dev), ptr))) - return ptr; - - return NULL; -} - -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; unsigned nr_ptrs = 0; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) nr_ptrs++; return nr_ptrs; } -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; unsigned nr_ptrs = 0; switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: { + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) nr_ptrs += !ptr->cached; + BUG_ON(!nr_ptrs); break; - - case BCH_RESERVATION: + } + case KEY_TYPE_reservation: nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; break; } @@ -139,25 +89,216 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, return durability; } -unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned durability = 0; - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, p); return durability; } +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +{ + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) +{ + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } +} + +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) +{ + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } + + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; +} + +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. 
+ */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; + + if (k.k->type == KEY_TYPE_error) + return -EIO; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; + + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; + + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (force_reconstruct_read(c) && + !p.idx && p.ec_nr) + p.idx++; + + if (p.idx >= p.ec_nr + 1) + continue; + + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; +} + +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +/* extent specific utility code */ + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.group && + ca->mi.group - 1 == group) + return ptr; + } + + return NULL; +} + +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return ptr; + + return NULL; +} + unsigned bch2_extent_is_compressed(struct bkey_s_c k) { unsigned ret = 0; switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { + case KEY_TYPE_extent: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -189,10 +330,10 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, return false; } -static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e, +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, union bch_extent_entry *entry) { - union bch_extent_entry *i = e.v->start; + union bch_extent_entry *i = ptrs.start; if (i == entry) return NULL; @@ -202,23 +343,24 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e, return i; } -union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) +union bch_extent_entry 
*bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *dst, *src, *prev; bool drop_crc = true; - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); src = extent_entry_next(to_entry(ptr)); - if (src != extent_entry_last(e) && + if (src != ptrs.end && !extent_entry_is_crc(src)) drop_crc = false; dst = to_entry(ptr); - while ((prev = extent_entry_prev(e, dst))) { + while ((prev = extent_entry_prev(ptrs, dst))) { if (extent_entry_is_ptr(prev)) break; @@ -232,8 +374,8 @@ union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, } memmove_u64s_down(dst, src, - (u64 *) extent_entry_last(e) - (u64 *) src); - e.k->u64s -= (u64 *) src - (u64 *) dst; + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; return dst; } @@ -300,7 +442,7 @@ found: restart_narrow_pointers: extent_for_each_ptr_decode(extent_i_to_s(e), p, i) if (can_narrow_crc(p.crc, n)) { - bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr); + bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(e, &p); @@ -325,302 +467,165 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) -{ - struct bch_extent_ptr *ptr; - - bch2_extent_drop_ptrs(e, ptr, - ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); -} - -bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) -{ - return bch2_extent_normalize(c, k); -} - void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; - - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); - - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } - break; - } - } -} - -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; - - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - - extent_for_each_ptr(e, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset 
before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; -} - -static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c_extent e) -{ - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; - bool first = true; + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - extent_for_each_entry(e, entry) { - if (!first) - pr_buf(out, " "); + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - switch (__extent_entry_type(entry)) { + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); break; case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); break; case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; - - pr_buf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); break; - default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - goto out; } - - first = false; } -out: - if (bkey_extent_is_cached(e.k)) - pr_buf(out, " cached"); } -static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, - unsigned dev) -{ - struct bch_dev_io_failures *i; - - for (i = f->devs; i < f->devs + f->nr; i++) - if (i->dev == dev) - return i; - - return NULL; -} - -void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) -{ - struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); - - if (!f) { - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; - } -} - -/* - * returns true if p1 is better than p2: - */ -static inline bool ptr_better(struct bch_fs *c, - const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { - if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr 
*ptr2; + struct bch_dev *ca; - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + if (ptr->dev >= c->sb.nr_devices || + !c->devs[ptr->dev]) + return "pointer to invalid device"; - /* Pick at random, biased in favor of the faster device: */ + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; - return bch2_rand_range(l1 + l2) > l1; - } + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; - if (force_reconstruct_read(c)) - return p1.idx > p2.idx; + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; - return p1.idx < p2.idx; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; + + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; + + return NULL; } -static int extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) +static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; struct bch_dev *ca; - int ret = 0; - - extent_for_each_ptr_decode(e, p, entry) { - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (p.ptr.cached && ptr_stale(ca, &p.ptr)) - continue; + bool first = true; - f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + pr_buf(out, " "); - if (!p.idx && - !bch2_dev_is_readable(ca)) - p.idx++; + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; - if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) - p.idx++; + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? 
" stale" : ""); + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (p.idx >= p.ec_nr + 1) - continue; + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; - if (ret && !ptr_better(c, p, *pick)) - continue; + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; + } - *pick = p; - ret = 1; + first = false; } - - return ret; } /* Btree ptrs */ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_extent_is_cached(k.k)) - return "cached"; - - if (k.k->size) - return "nonzero key size"; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + const struct bch_extent_ptr *ptr; + const char *reason; if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - switch (k.k->type) { - case BCH_EXTENT: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const char *reason; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (!extent_entry_is_ptr(entry)) - return "has non ptr field"; - } - - extent_for_each_ptr(e, ptr) { - reason = extent_ptr_invalid(c, e, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; - } + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - return NULL; + if (!extent_entry_is_ptr(entry)) + return "has non ptr field"; } - default: - return "invalid value type"; + bkey_for_each_ptr(ptrs, ptr) { + reason = extent_ptr_invalid(c, k, ptr, + c->opts.btree_node_size, + true); + if (reason) + return reason; } + + return NULL; } void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; unsigned seq; const char *err; @@ -630,7 +635,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, unsigned replicas = 0; bool bad; - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); replicas++; @@ -656,9 +661,8 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, } if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, btree_node_type(b), - e.s_c, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k); + !bch2_bkey_replicas_marked(c, k, false)) { + bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_bug(c, "btree key bad (replicas not marked in superblock):\n%s", buf); @@ -667,7 +671,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, return; err: - bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k); + bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", err, buf, PTR_BUCKET_NR(ca, ptr), mark.gen, (unsigned) mark.v.counter); @@ -678,22 +682,13 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, { const char *invalid; - if 
(bkey_extent_is_data(k.k)) - extent_print_ptrs(out, c, bkey_s_c_to_extent(k)); + bkey_ptrs_to_text(out, c, k); invalid = bch2_btree_ptr_invalid(c, k); if (invalid) pr_buf(out, " invalid: %s", invalid); } -int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) -{ - return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - failed, pick); -} - /* Extents */ bool __bch2_cut_front(struct bpos where, struct bkey_s k) @@ -714,7 +709,7 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k) * cause offset to point to the next bucket: */ if (!len) - k.k->type = KEY_TYPE_DELETED; + k.k->type = KEY_TYPE_deleted; else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); union bch_extent_entry *entry; @@ -766,7 +761,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) k->size = len; if (!len) - k->type = KEY_TYPE_DELETED; + k->type = KEY_TYPE_deleted; return true; } @@ -830,13 +825,13 @@ static void verify_extent_nonoverlapping(struct btree *b, struct bkey uk; iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD); + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); BUG_ON(k && (uk = bkey_unpack_key(b, k), bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD); + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); #if 0 BUG_ON(k && (uk = bkey_unpack_key(b, k), @@ -882,13 +877,13 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, verify_extent_nonoverlapping(l->b, &l->iter, insert); node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); if (k && !bkey_written(l->b, k) && bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) return; node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); if (k && !bkey_written(l->b, k) && bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; @@ -912,7 +907,7 @@ static void extent_insert_committed(struct extent_insert_state *s) bkey_copy(&split.k, insert); if (s->deleting) - split.k.k.type = KEY_TYPE_DISCARD; + split.k.k.type = KEY_TYPE_discard; bch2_cut_back(s->committed, &split.k.k); @@ -934,7 +929,7 @@ static void extent_insert_committed(struct extent_insert_state *s) if (s->update_journal) { bkey_copy(&split.k, !s->deleting ? 
insert : &s->whiteout); if (s->deleting) - split.k.k.type = KEY_TYPE_DISCARD; + split.k.k.type = KEY_TYPE_discard; bch2_cut_back(s->committed, &split.k.k); @@ -985,7 +980,7 @@ bch2_extent_can_insert(struct btree_insert *trans, *u64s += BKEY_U64s; _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_DISCARD); + KEY_TYPE_discard); if (!_k) return BTREE_INSERT_OK; @@ -1062,7 +1057,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, btree_account_key_drop(l->b, _k); k.k->size = 0; - k.k->type = KEY_TYPE_DELETED; + k.k->type = KEY_TYPE_deleted; if (_k >= btree_bset_last(l->b)->start) { unsigned u64s = _k->u64s; @@ -1123,7 +1118,7 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) while (bkey_cmp(s->committed, insert->k.p) < 0 && (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_DISCARD))) { + KEY_TYPE_discard))) { struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); @@ -1155,7 +1150,7 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { if (!bkey_whiteout(k.k)) { btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_DISCARD; + _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); } break; @@ -1286,88 +1281,66 @@ bch2_insert_fixup_extent(struct btree_insert *trans, const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - if (!k.k->size) - return "zero key size"; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + unsigned size_ondisk = e.k->size; + const char *reason; + unsigned nonce = UINT_MAX; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; + if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + extent_for_each_entry(e, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; + if (crc.offset + e.k->size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; - size_ondisk = crc.compressed_size; + size_ondisk = 
crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } - - return NULL; - } - - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; } - default: - return "invalid value type"; - } + return NULL; } -static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, - struct bkey_s_c_extent e) +void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; struct bch_dev *ca; struct bucket_mark mark; @@ -1429,8 +1402,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), - e.s_c); + bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); bch2_fs_bug(c, "extent key bad (too many replicas: %u): %s", replicas, buf); @@ -1438,10 +1410,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, btree_node_type(b), - e.s_c, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), - e.s_c); + !bch2_bkey_replicas_marked(c, e.s_c, false)) { + bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); bch2_fs_bug(c, "extent key bad (replicas not marked in superblock):\n%s", buf); @@ -1451,34 +1421,18 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; bad_ptr: - bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), - e.s_c); + bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " "gen %i type %u", buf, PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -{ - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); - break; - case BCH_RESERVATION: - break; - default: - BUG(); - } -} - void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const char *invalid; - if (bkey_extent_is_data(k.k)) - extent_print_ptrs(out, c, bkey_s_c_to_extent(k)); + bkey_ptrs_to_text(out, c, k); invalid = bch2_extent_invalid(c, k); if (invalid) @@ -1593,41 +1547,17 @@ found: */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bkey_s_extent e; - - switch (k.k->type) { - case KEY_TYPE_ERROR: - return false; - - case KEY_TYPE_DELETED: - return true; - case KEY_TYPE_DISCARD: - 
return bversion_zero(k.k->version); - case KEY_TYPE_COOKIE: - return false; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_to_extent(k); + struct bch_extent_ptr *ptr; - bch2_extent_drop_stale(c, e); + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - if (!bkey_val_u64s(e.k)) { - if (bkey_extent_is_cached(e.k)) { - k.k->type = KEY_TYPE_DISCARD; - if (bversion_zero(k.k->version)) - return true; - } else { - k.k->type = KEY_TYPE_ERROR; - } - } + /* will only happen if all pointers were cached: */ + if (!bkey_val_u64s(k.k)) + k.k->type = KEY_TYPE_deleted; - return false; - case BCH_RESERVATION: - return false; - default: - BUG(); - } + return false; } void bch2_extent_mark_replicas_cached(struct bch_fs *c, @@ -1637,7 +1567,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, { union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; if (target && extra > 0) extent_for_each_ptr_decode(e, p, entry) { @@ -1661,106 +1591,40 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } } -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. - */ -int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) -{ - int ret; - - switch (k.k->type) { - case KEY_TYPE_ERROR: - return -EIO; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), - failed, pick); - - if (!ret && !bkey_extent_is_cached(k.k)) - ret = -EIO; - - return ret; - - default: - return 0; - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, +enum merge_result bch2_extent_merge(struct bch_fs *c, struct bkey_i *l, struct bkey_i *r) { - struct bkey_s_extent el, er; + struct bkey_s_extent el = bkey_i_to_s_extent(l); + struct bkey_s_extent er = bkey_i_to_s_extent(r); union bch_extent_entry *en_l, *en_r; - if (key_merging_disabled(c)) - return BCH_MERGE_NOMERGE; - - /* - * Generic header checks - * Assumes left and right are in order - * Left and right must be exactly aligned - */ - - if (l->k.u64s != r->k.u64s || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) return BCH_MERGE_NOMERGE; - switch (l->k.type) { - case KEY_TYPE_DISCARD: - case KEY_TYPE_ERROR: - /* These types are mergeable, and no val to check */ - break; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - el = bkey_i_to_s_extent(l); - er = bkey_i_to_s_extent(r); - - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; - - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - !extent_entry_is_ptr(en_l)) - return BCH_MERGE_NOMERGE; + extent_for_each_entry(el, en_l) { + struct bch_extent_ptr *lp, *rp; + struct bch_dev *ca; - lp = &en_l->ptr; - rp = &en_r->ptr; + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + if ((extent_entry_type(en_l) != + extent_entry_type(en_r)) || + !extent_entry_is_ptr(en_l)) + return 
BCH_MERGE_NOMERGE; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + lp = &en_l->ptr; + rp = &en_r->ptr; - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } + if (lp->offset + el.k->size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; - break; - case BCH_RESERVATION: { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) return BCH_MERGE_NOMERGE; - break; - } - default: - return BCH_MERGE_NOMERGE; } l->k.needs_whiteout |= r->k.needs_whiteout; @@ -1810,7 +1674,7 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - ret = bch2_extent_merge(c, b, &li.k, &ri.k); + ret = bch2_bkey_merge(c, &li.k, &ri.k); if (ret == BCH_MERGE_NOMERGE) return false; @@ -1878,3 +1742,54 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) return ret; } + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_i *l, struct bkey_i *r) +{ + struct bkey_i_reservation *li = bkey_i_to_reservation(l); + struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + + if (li->v.generation != ri->v.generation || + li->v.nr_replicas != ri->v.nr_replicas) + return BCH_MERGE_NOMERGE; + + l->k.needs_whiteout |= r->k.needs_whiteout; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { + bch2_key_resize(&l->k, KEY_SIZE_MAX); + bch2_cut_front(l->k.p, r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(&l->k, l->k.size + r->k.size); + + return BCH_MERGE_MERGE; +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 389604f25630..57eb35699545 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -10,125 +10,34 @@ struct bch_fs; struct btree_insert; struct btree_insert_entry; -const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, - struct bkey_s_c); -void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); - -#define bch2_bkey_btree_ops (struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ -} - -const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct 
bkey_s_c); -void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); -enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, - struct bkey_i *, struct bkey_i *); - -#define bch2_bkey_extent_ops (struct bkey_ops) { \ - .key_invalid = bch2_extent_invalid, \ - .key_debugcheck = bch2_extent_debugcheck, \ - .val_to_text = bch2_extent_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_ptr_normalize, \ - .key_merge = bch2_extent_merge, \ - .is_extents = true, \ -} - -void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); -int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_io_failures *, - struct extent_ptr_decoded *); -int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, - struct extent_ptr_decoded *); - -void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); - -static inline bool bch2_extent_is_atomic(struct bkey *k, - struct btree_iter *iter) -{ - struct btree *b = iter->l[0].b; - - return bkey_cmp(k->p, b->key.k.p) <= 0 && - bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0; -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, - unsigned *); -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); - -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); - -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -void bch2_extent_drop_device(struct bkey_s_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); - -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); -unsigned bch2_extent_is_compressed(struct bkey_s_c); - -unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); - -bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, - struct bch_extent_ptr, u64); - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - case BCH_RESERVATION: - return true; - default: - return false; - } -} - -static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) -{ - return bkey_extent_is_allocation(k.k) && - !bch2_extent_is_compressed(k); -} +/* extent entries: */ -static inline bool bkey_extent_is_cached(const struct bkey *k) -{ - return k->type == BCH_EXTENT_CACHED; -} +#define extent_entry_last(_e) bkey_val_end(_e) -static inline void bkey_extent_set_cached(struct bkey *k, bool cached) -{ - EBUG_ON(k->type != BCH_EXTENT && - k->type != BCH_EXTENT_CACHED); +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) - k->type = cached ? 
BCH_EXTENT_CACHED : BCH_EXTENT; -} +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) static inline unsigned __extent_entry_type(const union bch_extent_entry *e) @@ -193,21 +102,6 @@ union bch_extent_crc { struct bch_extent_crc128 crc128; }; -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *) && \ - !type_is(_entry, struct bch_extent_stripe_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *) ||\ - type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - #define __entry_to_crc(_entry) \ __builtin_choose_expr( \ type_is_exact(_entry, const union bch_extent_entry *), \ @@ -221,18 +115,6 @@ union bch_extent_crc { __entry_to_crc(_entry); \ }) -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ -}) - -/* checksum entries: */ - static inline struct bch_extent_crc_unpacked bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) { @@ -290,71 +172,64 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #undef common_fields } -/* Extent entry iteration: */ - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +/* bkey_ptrs: generically over any key type that has ptrs */ -#define extent_entry_last(_e) \ - vstruct_idx((_e).v, bkey_val_u64s((_e).k)) +struct bkey_ptrs_c { + const union bch_extent_entry *start; + const union bch_extent_entry *end; +}; -/* Iterate over all entries: */ +struct bkey_ptrs { + union bch_extent_entry *start; + union bch_extent_entry *end; +}; -#define extent_for_each_entry_from(_e, _entry, _start) \ - for ((_entry) = _start; \ - (_entry) < extent_entry_last(_e); \ - (_entry) = extent_entry_next(_entry)) +/* iterate over bkey ptrs */ -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -/* Iterate over pointers only: */ +#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ + for ((_entry) = (_start); \ + (_entry) < (_end); \ + (_entry) = extent_entry_next(_entry)) -#define extent_ptr_next(_e, _ptr) \ +#define __bkey_ptr_next(_ptr, _end) \ ({ \ - typeof(&(_e).v->start[0]) _entry; \ + typeof(_end) _entry; \ \ - extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ + __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ if (extent_entry_is_ptr(_entry)) \ break; \ \ - _entry < extent_entry_last(_e) ? entry_to_ptr(_entry) : NULL; \ + _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ }) -#define extent_for_each_ptr(_e, _ptr) \ - for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next(_e, _ptr)); \ - (_ptr)++) +#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -/* Iterate over crcs only: */ +#define bkey_extent_entry_for_each(_p, _entry) \ + bkey_extent_entry_for_each_from(_p, _entry, _p.start) -#define extent_crc_next(_e, _crc, _iter) \ -({ \ - extent_for_each_entry_from(_e, _iter, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ - break; \ - } \ - \ - (_iter) < extent_entry_last(_e); \ -}) +#define __bkey_for_each_ptr(_start, _end, _ptr) \ + for ((_ptr) = (_start); \ + ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ + (_ptr)++) -#define extent_for_each_crc(_e, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_iter) = (_e).v->start; \ - extent_crc_next(_e, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) +#define bkey_ptr_next(_p, _ptr) \ + __bkey_ptr_next(_ptr, (_p).end) -/* Iterate over pointers, with crcs: */ +#define bkey_for_each_ptr(_p, _ptr) \ + __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) -#define __extent_ptr_next_decode(_e, _ptr, _entry) \ +#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ({ \ __label__ out; \ \ (_ptr).idx = 0; \ (_ptr).ec_nr = 0; \ \ - extent_for_each_entry_from(_e, _entry, _entry) \ + __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ @@ -362,7 +237,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) case BCH_EXTENT_ENTRY_crc32: \ case BCH_EXTENT_ENTRY_crc64: \ case BCH_EXTENT_ENTRY_crc128: \ - (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ + (_ptr).crc = bch2_extent_crc_unpack(_k, \ entry_to_crc(_entry)); \ break; \ case BCH_EXTENT_ENTRY_stripe_ptr: \ @@ -370,122 +245,298 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) break; \ } \ out: \ - _entry < extent_entry_last(_e); \ + _entry < (_end); \ }) -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \ - (_entry) = (_e).v->start; \ - __extent_ptr_next_decode(_e, _ptr, _entry); \ +#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ + for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ + (_entry) = _start; \ + __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ (_entry) = extent_entry_next(_entry)) -/* Iterate over pointers backwards: */ +#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ + __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ + _ptr, _entry) -void bch2_extent_crc_append(struct bkey_i_extent *, - struct bch_extent_crc_unpacked); -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, - struct extent_ptr_decoded *); +/* utility code common to all keys with pointers: */ -static inline void __extent_entry_push(struct bkey_i_extent *e) +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) { - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(bkey_val_end(e)) + }; + } + 
case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } } -static inline void extent_ptr_append(struct bkey_i_extent *e, - struct bch_extent_ptr ptr) +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) { - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - extent_entry_last(extent_i_to_s(e))->ptr = ptr; - __extent_entry_push(e); + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; } -static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) if (!ptr->cached) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) if (ptr->cached) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_devs(bkey_s_c_to_extent(k)); - default: - return (struct bch_devs_list) { .nr = 0 }; - } + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; } -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); + +/* bch_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, + struct bkey_s_c); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + +/* 
bch_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +enum merge_result bch2_extent_merge(struct bch_fs *, + struct bkey_i *, struct bkey_i *); + +#define bch2_bkey_ops_extent (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ + .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ +} + +/* bch_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +enum merge_result bch2_reservation_merge(struct bch_fs *, + struct bkey_i *, struct bkey_i *); + +#define bch2_bkey_ops_reservation (struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ +} + +void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); + +static inline bool bch2_extent_is_atomic(struct bkey *k, + struct btree_iter *iter) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_dirty_devs(bkey_s_c_to_extent(k)); + struct btree *b = iter->l[0].b; + + return bkey_cmp(k->p, b->key.k.p) <= 0 && + bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0; +} + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, + unsigned *); +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); + +void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, + unsigned, unsigned); + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); + +unsigned bch2_extent_is_compressed(struct bkey_s_c); + +bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, + struct bch_extent_ptr, u64); + +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + return true; default: - return (struct bch_devs_list) { .nr = 0 }; + return false; } } -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +static inline bool bkey_extent_is_allocation(const struct bkey *k) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_cached_devs(bkey_s_c_to_extent(k)); + switch (k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reservation: + return true; default: - return (struct bch_devs_list) { .nr = 0 }; + return false; } } +static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) +{ + return bkey_extent_is_allocation(k.k) && + !bch2_extent_is_compressed(k); +} + +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_bkey_drop_device(struct bkey_s, unsigned); + +/* Extent entry iteration: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e),_entry) + +#define extent_for_each_entry(_e, _entry) \ + 
extent_for_each_entry_from(_e, _entry, (_e).v->start) + +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +#define extent_crc_next(_e, _crc, _iter) \ +({ \ + extent_for_each_entry_from(_e, _iter, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ + break; \ + } \ + \ + (_iter) < extent_entry_last(_e); \ +}) + +#define extent_for_each_crc(_e, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_iter) = (_e).v->start; \ + extent_crc_next(_e, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) + +void bch2_extent_crc_append(struct bkey_i_extent *, + struct bch_extent_crc_unpacked); +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, + struct extent_ptr_decoded *); + +static inline void __extent_entry_push(struct bkey_i_extent *e) +{ + union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); + + EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > + BKEY_EXTENT_VAL_U64s_MAX); + + e->k.u64s += extent_entry_u64s(entry); +} + bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); -union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent , - struct bch_extent_ptr *); +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); -#define bch2_extent_drop_ptrs(_e, _ptr, _cond) \ +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ do { \ - _ptr = &(_e).v->start->ptr; \ + struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + \ + _ptr = &_ptrs.start->ptr; \ \ - while ((_ptr = extent_ptr_next(e, _ptr))) { \ + while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ if (_cond) { \ - _ptr = (void *) bch2_extent_drop_ptr(_e, _ptr); \ + _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ + _ptrs = bch2_bkey_ptrs(_k); \ continue; \ } \ \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e7d7c5fe6db7..ad06db069fcf 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -121,7 +121,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c, BUG_ON(res->sectors > inode->ei_quota_reserved); bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), BCH_QUOTA_PREALLOC); + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); inode->ei_quota_reserved -= res->sectors; mutex_unlock(&inode->ei_quota_lock); @@ -138,7 +138,7 @@ static int bch2_quota_reservation_add(struct bch_fs *c, mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK); + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); if (likely(!ret)) { inode->ei_quota_reserved += sectors; res->sectors += sectors; @@ -220,7 +220,7 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, quota_res->sectors -= sectors; inode->ei_quota_reserved -= sectors; } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif inode->v.i_blocks += sectors; @@ -813,7 +813,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bvec_iter iter; struct bio_vec bv; unsigned nr_ptrs = !bch2_extent_is_compressed(k) - ? bch2_extent_nr_dirty_ptrs(k) + ? bch2_bkey_nr_dirty_ptrs(k) : 0; bio_for_each_segment(bv, bio, iter) { @@ -2397,7 +2397,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, - bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); @@ -2504,7 +2504,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, goto btree_iter_err; /* already reserved */ - if (k.k->type == BCH_RESERVATION && + if (k.k->type == KEY_TYPE_reservation && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { bch2_btree_iter_next_slot(iter); continue; @@ -2517,7 +2517,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, } bkey_reservation_init(&reservation.k_i); - reservation.k.type = BCH_RESERVATION; + reservation.k.type = KEY_TYPE_reservation; reservation.k.p = k.k->p; reservation.k.size = k.k->size; @@ -2525,7 +2525,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, bch2_cut_back(end_pos, &reservation.k); sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); + reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b6fe2059fe5f..93e1f3aaacd4 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -340,7 +340,7 @@ __bch2_create(struct mnt_idmap *idmap, if (tmpfile) inode_u.bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (ret) return ERR_PTR(ret); @@ -457,7 +457,7 @@ err_trans: make_bad_inode(&inode->v); iput(&inode->v); err: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); inode = ERR_PTR(ret); goto out; } @@ -1079,7 +1079,7 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, } return 0; - } else if (k->k.type == BCH_RESERVATION) { + } else if (k->k.type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(&k->k) << 9, 0, k->k.size << 9, @@ -1112,7 +1112,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(ei->v.i_ino, start >> 9), 0, k) if (bkey_extent_is_data(k.k) || - k.k->type == BCH_RESERVATION) { + k.k->type == KEY_TYPE_reservation) { if (bkey_cmp(bkey_start_pos(k.k), POS(ei->v.i_ino, (start + len) >> 9)) >= 0) break; @@ -1414,9 +1414,9 @@ static void bch2_evict_inode(struct inode *vinode) if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, 
Q_SPC, -((s64) inode->v.i_blocks), - BCH_QUOTA_WARN); + KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, - BCH_QUOTA_WARN); + KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 74b83201c213..57ab8f088415 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -235,7 +235,6 @@ static int hash_check_duplicates(const struct bch_hash_desc desc, !desc.cmp_bkey(k, k2), c, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, - bkey_type(0, desc.btree_id), k), buf))) { ret = fsck_hash_delete_at(desc, &h->info, k_iter); if (ret) @@ -255,7 +254,7 @@ static bool key_has_correct_hash(const struct bch_hash_desc desc, { u64 hash; - if (k.k->type != desc.whiteout_type && + if (k.k->type != KEY_TYPE_whiteout && k.k->type != desc.key_type) return true; @@ -280,7 +279,7 @@ static int hash_check_key(const struct bch_hash_desc desc, u64 hashed; int ret = 0; - if (k.k->type != desc.whiteout_type && + if (k.k->type != KEY_TYPE_whiteout && k.k->type != desc.key_type) return 0; @@ -300,7 +299,6 @@ static int hash_check_key(const struct bch_hash_desc desc, desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, - bkey_type(0, desc.btree_id), k), buf))) { ret = hash_redo_key(desc, h, c, k_iter, k, hashed); if (ret) { @@ -370,7 +368,7 @@ static int check_dirent_hash(struct hash_check *h, struct bch_fs *c, *k = bch2_btree_iter_peek(iter); - BUG_ON(k->k->type != BCH_DIRENT); + BUG_ON(k->k->type != KEY_TYPE_dirent); } err: fsck_err: @@ -385,7 +383,6 @@ err_redo: buf, strlen(buf), BTREE_ID_DIRENTS, k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, - bkey_type(0, BTREE_ID_DIRENTS), *k), buf))) { ret = hash_redo_key(bch2_dirent_hash_desc, h, c, iter, *k, hash); @@ -471,7 +468,7 @@ static int check_extents(struct bch_fs *c) if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != BCH_RESERVATION && + k.k->type != KEY_TYPE_reservation && k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { @@ -529,13 +526,11 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(!w.have_inode, c, "dirent in nonexisting directory:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, - (enum bkey_type) BTREE_ID_DIRENTS, k), buf)) || fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, "dirent in non directory inode type %u:\n%s", mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, - (enum bkey_type) BTREE_ID_DIRENTS, k), buf))) { ret = bch2_btree_delete_at(iter, 0); if (ret) @@ -557,7 +552,7 @@ static int check_dirents(struct bch_fs *c) if (ret) goto fsck_err; - if (k.k->type != BCH_DIRENT) + if (k.k->type != KEY_TYPE_dirent) continue; d = bkey_s_c_to_dirent(k); @@ -586,7 +581,6 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(d_inum == d.k->p.inode, c, "dirent points to own directory:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, - (enum bkey_type) BTREE_ID_DIRENTS, k), buf))) { ret = remove_dirent(c, iter, d); if (ret) @@ -604,7 +598,6 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(!have_target, c, "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, - (enum bkey_type) BTREE_ID_DIRENTS, k), buf))) { ret = remove_dirent(c, iter, d); if (ret) @@ -618,7 +611,6 @@ static int 
check_dirents(struct bch_fs *c) "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, - (enum bkey_type) BTREE_ID_DIRENTS, k), buf))) { struct bkey_i_dirent *n; @@ -899,7 +891,7 @@ next: e->offset = k.k->p.offset; - if (k.k->type != BCH_DIRENT) + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); @@ -942,7 +934,7 @@ up: } for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { - if (k.k->type != BCH_INODE_FS) + if (k.k->type != KEY_TYPE_inode) continue; if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) @@ -1030,7 +1022,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { switch (k.k->type) { - case BCH_DIRENT: + case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); @@ -1310,7 +1302,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (iter.pos.inode < nlinks_pos || !link) link = &zero_links; - if (k.k && k.k->type == BCH_INODE_FS) { + if (k.k && k.k->type == KEY_TYPE_inode) { /* * Avoid potential deadlocks with iter for * truncate/rm/etc.: @@ -1392,7 +1384,7 @@ static int check_inodes_fast(struct bch_fs *c) int ret = 0; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { - if (k.k->type != BCH_INODE_FS) + if (k.k->type != KEY_TYPE_inode) continue; inode = bkey_s_c_to_inode(k); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0a350c6d0932..30f93fbe280d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -178,76 +178,69 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (k.k->p.offset) - return "nonzero offset"; - - switch (k.k->type) { - case BCH_INODE_FS: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; - - if (k.k->p.inode < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + if (k.k->p.offset) + return "nonzero offset"; - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; - if (bch2_inode_unpack(inode, &unpacked)) - return "invalid variable length fields"; + if (k.k->p.inode < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; + if (bch2_inode_unpack(inode, &unpacked)) + return "invalid variable length fields"; - if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) - return "flagged as unlinked but bi_nlink != 0"; + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; - return NULL; - } - case BCH_INODE_BLOCKDEV: - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) - return "incorrect value size"; + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - return "blockdev inode in fs range"; + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; - return NULL; - case BCH_INODE_GENERATION: - if 
(bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) - return "incorrect value size"; - - return NULL; - default: - return "invalid type"; - } + return NULL; } void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - switch (k.k->type) { - case BCH_INODE_FS: - inode = bkey_s_c_to_inode(k); - if (bch2_inode_unpack(inode, &unpacked)) { - pr_buf(out, "(unpack error)"); - break; - } + if (bch2_inode_unpack(inode, &unpacked)) { + pr_buf(out, "(unpack error)"); + return; + } #define BCH_INODE_FIELD(_name, _bits) \ - pr_buf(out, #_name ": %llu ", (u64) unpacked._name); - BCH_INODE_FIELDS() + pr_buf(out, #_name ": %llu ", (u64) unpacked._name); + BCH_INODE_FIELDS() #undef BCH_INODE_FIELD - break; - } +} + +const char *bch2_inode_generation_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (k.k->p.offset) + return "nonzero offset"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) + return "incorrect value size"; + + return NULL; +} + +void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -281,10 +274,9 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { - case BCH_INODE_BLOCKDEV: - case BCH_INODE_FS: + case KEY_TYPE_inode: BUG(); - case BCH_INODE_GENERATION: + case KEY_TYPE_inode_generation: return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); default: return 0; @@ -330,8 +322,7 @@ again: return ret; switch (k.k->type) { - case BCH_INODE_BLOCKDEV: - case BCH_INODE_FS: + case KEY_TYPE_inode: /* slot used */ if (iter->pos.inode >= max) goto out; @@ -405,19 +396,19 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; } - bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c, + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, "inode %llu not found when deleting", inode_nr); switch (k.k->type) { - case BCH_INODE_FS: { + case KEY_TYPE_inode: { struct bch_inode_unpacked inode_u; if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) bi_generation = inode_u.bi_generation + 1; break; } - case BCH_INODE_GENERATION: { + case KEY_TYPE_inode_generation: { struct bkey_s_c_inode_generation g = bkey_s_c_to_inode_generation(k); bi_generation = le32_to_cpu(g.v->bi_generation); @@ -455,7 +446,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, POS(inode_nr, 0), BTREE_ITER_SLOTS, k) { switch (k.k->type) { - case BCH_INODE_FS: + case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); break; default: @@ -464,7 +455,6 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, } break; - } return bch2_btree_iter_unlock(&iter) ?: ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 897ff65d01cb..0bc852e69355 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,11 +9,21 @@ const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_inode_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_inode (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ } +const char *bch2_inode_generation_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_inode_generation_to_text(struct 
printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ +} + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 2fee2f2efd38..3e990709fedb 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -213,20 +213,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; BUG_ON(c->opts.nochanges); - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || !c->devs[ptr->dev]); ca = bch_dev_bkey_exists(c, ptr->dev); - if (ptr + 1 < &extent_entry_last(e)->ptr) { + if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOIO, &ca->replica_set)); @@ -317,7 +317,6 @@ static void __bch2_write_index(struct bch_write_op *op) { struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; unsigned dev; @@ -327,12 +326,10 @@ static void __bch2_write_index(struct bch_write_op *op) n = bkey_next(src); bkey_copy(dst, src); - e = bkey_i_to_s_extent(dst); - - bch2_extent_drop_ptrs(e, ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, test_bit(ptr->dev, op->failed.d)); - if (!bch2_extent_nr_ptrs(e.c)) { + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { ret = -EIO; goto err; } @@ -433,14 +430,13 @@ static void init_append_extent(struct bch_write_op *op, e->k.p = op->pos; e->k.size = crc.uncompressed_size; e->k.version = version; - bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); if (crc.csum_type || crc.compression_type || crc.nonce) bch2_extent_crc_append(e, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); + bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size); bch2_keylist_push(&op->insert_keys); } @@ -1608,7 +1604,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick); + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ if (!pick_ret) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a74566764630..f3bb28f32c6e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -141,11 +141,12 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, + struct bkey_i *k, enum btree_node_type key_type, const char *type, int write) { void *next = vstruct_next(entry); const char *invalid; + unsigned version = le32_to_cpu(jset->version); int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, @@ -174,14 +175,17 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + bch2_bkey_swab(NULL, bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + 
bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, key_type, - bkey_i_to_s_c(k)); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", type, invalid, buf); @@ -190,6 +194,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, journal_entry_null_range(vstruct_next(entry), next); return 0; } + + if (write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(key_type, bkey_to_packed(k), write); fsck_err: return ret; } @@ -203,8 +211,8 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, vstruct_for_each(entry, k) { int ret = journal_validate_key(c, jset, entry, k, - bkey_type(entry->level, - entry->btree_id), + __btree_node_type(entry->level, + entry->btree_id), "key", write); if (ret) return ret; @@ -351,14 +359,17 @@ static int jset_validate(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); struct bch_csum csum; + unsigned version; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - le32_to_cpu(jset->version)); + version = le32_to_cpu(jset->version); + if ((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max) { + bch_err(c, "unknown journal entry version %u", jset->version); return BCH_FSCK_UNKNOWN_VERSION; } @@ -929,7 +940,6 @@ static void __journal_write_alloc(struct journal *j, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_i_extent *e = bkey_i_to_extent(&w->key); struct journal_device *ja; struct bch_dev *ca; unsigned i; @@ -951,13 +961,14 @@ static void __journal_write_alloc(struct journal *j, if (!ca->mi.durability || ca->mi.state != BCH_MEMBER_STATE_RW || !ja->nr || - bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) || + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), + ca->dev_idx) || sectors > ja->sectors_free) continue; bch2_dev_stripe_increment(c, ca, &j->wp.stripe); - extent_ptr_append(e, + bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) { .offset = bucket_to_sector(ca, ja->buckets[ja->cur_idx]) + @@ -1096,7 +1107,7 @@ static void journal_write_done(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); struct bch_devs_list devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); @@ -1158,7 +1169,7 @@ static void journal_write_endio(struct bio *bio) unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); - bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } @@ -1175,6 +1186,7 @@ void bch2_journal_write(struct closure *cl) struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; + bool validate_before_checksum = false; unsigned i, sectors, bytes; journal_buf_realloc(j, w); @@ -1196,12 +1208,22 @@ void bch2_journal_write(struct closure *cl) jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = 
cpu_to_le32(BCACHE_JSET_VERSION); + + jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + + if (le32_to_cpu(jset->version) < + bcachefs_metadata_version_bkey_renumber) + validate_before_checksum = true; + + if (validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; @@ -1212,7 +1234,7 @@ void bch2_journal_write(struct closure *cl) jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (!validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 46878590327d..63fe8cbb0564 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -15,7 +15,7 @@ #include "replicas.h" #include "super-io.h" -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, unsigned dev_idx, int flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; @@ -23,9 +23,9 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; unsigned nr_good; - bch2_extent_drop_device(e, dev_idx); + bch2_bkey_drop_device(k, dev_idx); - nr_good = bch2_extent_durability(c, e.c); + nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) return -EINVAL; @@ -36,7 +36,6 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct bkey_s_c k; - struct bkey_s_extent e; BKEY_PADDED(key) tmp; struct btree_iter iter; int ret = 0; @@ -51,7 +50,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); + ret = bch2_mark_bkey_replicas(c, k); if (ret) break; bch2_btree_iter_next(&iter); @@ -59,18 +58,18 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) } bkey_reassemble(&tmp.key, k); - e = bkey_i_to_s_extent(&tmp.key); - ret = drop_dev_ptrs(c, e, dev_idx, flags, false); + ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), + dev_idx, flags, false); if (ret) break; /* * If the new extent no longer has any pointers, bch2_extent_normalize() * will do the appropriate thing with it (turning it into a - * KEY_TYPE_ERROR key, or just a discard if it was a cached extent) + * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, e.s); + bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); iter.pos = bkey_start_pos(&tmp.key.k); @@ -118,10 +117,10 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_extent *new_key; + struct bkey_i_btree_ptr *new_key; retry: - if 
(!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key), - dev_idx)) { + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) { /* * we might have found a btree node key we * needed to update, and then tried to update it @@ -130,15 +129,14 @@ retry: */ bch2_btree_iter_downgrade(&iter); - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); if (ret) goto err; } else { bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_extent(&tmp.k); + new_key = bkey_i_to_btree_ptr(&tmp.k); - ret = drop_dev_ptrs(c, extent_i_to_s(new_key), + ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), dev_idx, flags, true); if (ret) goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7de3c6c475be..aff611c908ef 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -100,8 +100,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_extent_drop_device(extent_i_to_s(insert), - m->data_opts.rewrite_dev); + bch2_bkey_drop_device(extent_i_to_s(insert).s, + m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { @@ -132,8 +132,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) * has fewer replicas than when we last looked at it - meaning * we need to get a disk reservation here: */ - nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - - (bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + (bch2_bkey_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); if (nr > 0) { /* * can't call bch2_disk_reservation_add() with btree @@ -243,7 +243,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, switch (data_cmd) { case DATA_ADD_REPLICAS: { int nr = (int) io_opts.data_replicas - - bch2_extent_nr_dirty_ptrs(k); + bch2_bkey_nr_dirty_ptrs(k); if (nr > 0) { m->op.nr_replicas = m->nr_ptrs_reserved = nr; @@ -477,7 +477,6 @@ int bch2_move_data(struct bch_fs *c, struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct bkey_s_c k; - struct bkey_s_c_extent e; struct data_opts data_opts; enum data_cmd data_cmd; u64 delay, cur_inum = U64_MAX; @@ -530,8 +529,6 @@ peek: if (!bkey_extent_is_data(k.k)) goto next_nondata; - e = bkey_s_c_to_extent(k); - if (cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; @@ -545,8 +542,7 @@ peek: goto peek; } - switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e, - &io_opts, &data_opts))) { + switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -581,7 +577,7 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), + atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), &stats->sectors_seen); next_nondata: bch2_btree_iter_next(&stats->iter); @@ -613,7 +609,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k); + ret = bch2_mark_bkey_replicas(c, k); if (ret) break; } @@ -637,8 +633,7 @@ static int bch2_gc_btree_replicas(struct bch_fs *c) for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); + 
ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); bch2_btree_iter_cond_resched(&iter); } @@ -668,10 +663,9 @@ static int bch2_move_btree(struct bch_fs *c, for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE, - bkey_i_to_s_c_extent(&b->key), - &io_opts, - &data_opts))) { + switch ((cmd = pred(c, arg, + bkey_i_to_s_c(&b->key), + &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -697,8 +691,7 @@ next: #if 0 static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { @@ -707,33 +700,38 @@ static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, #endif static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - unsigned nr_good = bch2_extent_durability(c, e); - unsigned replicas = type == BKEY_TYPE_BTREE - ? c->opts.metadata_replicas - : io_opts->data_replicas; + unsigned nr_good = bch2_bkey_durability(c, k); + unsigned replicas = 0; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + replicas = c->opts.metadata_replicas; + break; + case KEY_TYPE_extent: + replicas = io_opts->data_replicas; + break; + } if (!nr_good || nr_good >= replicas) return DATA_SKIP; data_opts->target = 0; - data_opts->btree_insert_flags = 0; + data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { struct bch_ioctl_data *op = arg; - if (!bch2_extent_has_device(e, op->migrate.dev)) + if (!bch2_bkey_has_device(k, op->migrate.dev)) return DATA_SKIP; data_opts->target = 0; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 3f7e31cc8f6e..71b3d2b2ddb6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -47,7 +47,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, - enum bkey_type, struct bkey_s_c_extent, + struct bkey_s_c, struct bch_io_opts *, struct data_opts *); int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 80577661e008..4bf4cc33dbb1 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -66,36 +66,42 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) } static bool __copygc_pred(struct bch_dev *ca, - struct bkey_s_c_extent e) + struct bkey_s_c k) { copygc_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr = - bch2_extent_has_device(e, ca->dev_idx); - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr = + bch2_extent_has_device(e, ca->dev_idx); - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + 
bucket_offset_cmp, &search); + + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); + } + break; + } } return false; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { struct bch_dev *ca = arg; - if (!__copygc_pred(ca, e)) + if (!__copygc_pred(ca, k)) return DATA_SKIP; data_opts->target = dev_to_target(ca->dev_idx); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 80869e34e3b6..acdc952c48be 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -184,6 +184,9 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false) \ BCH_OPT(no_data_io, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ + BCH_OPT(version_upgrade, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index cc20742d542b..7c38daac1cac 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -22,23 +22,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_quota dq; - if (k.k->p.inode >= QTYP_NR) return "invalid quota type"; - switch (k.k->type) { - case BCH_QUOTA: { - dq = bkey_s_c_to_quota(k); + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) + return "incorrect value size"; - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) - return "incorrect value size"; - - return NULL; - } - default: - return "invalid type"; - } + return NULL; } static const char * const bch2_quota_counters[] = { @@ -49,20 +39,14 @@ static const char * const bch2_quota_counters[] = { void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_quota dq; + struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); unsigned i; - switch (k.k->type) { - case BCH_QUOTA: - dq = bkey_s_c_to_quota(k); - - for (i = 0; i < Q_COUNTERS; i++) - pr_buf(out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); - break; - } + for (i = 0; i < Q_COUNTERS; i++) + pr_buf(out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); } #ifdef CONFIG_BCACHEFS_QUOTA @@ -178,7 +162,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, BUG_ON((s64) n < 0); - if (mode == BCH_QUOTA_NOCHECK) + if (mode == KEY_TYPE_QUOTA_NOCHECK) return 0; if (v <= 0) { @@ -201,7 +185,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->hardlimit && qc->hardlimit < n && !ignore_hardlimit(q)) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, HARDWARN); @@ -212,7 +196,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, qc->timer && ktime_get_real_seconds() >= qc->timer && !ignore_hardlimit(q)) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); @@ -221,7 +205,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->softlimit && qc->softlimit < n && qc->timer == 0) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, SOFTWARN); @@ -312,13 +296,13 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ret = bch2_quota_check_limit(c, i, dst_q[i], 
&msgs, Q_SPC, dst_q[i]->c[Q_SPC].v + space, - BCH_QUOTA_PREALLOC); + KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, dst_q[i]->c[Q_INO].v + 1, - BCH_QUOTA_PREALLOC); + KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } @@ -347,7 +331,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) BUG_ON(k.k->p.inode >= QTYP_NR); switch (k.k->type) { - case BCH_QUOTA: + case KEY_TYPE_quota: dq = bkey_s_c_to_quota(k); q = &c->quotas[k.k->p.inode]; @@ -447,15 +431,15 @@ int bch2_fs_quota_read(struct bch_fs *c) for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, BTREE_ITER_PREFETCH, k) { switch (k.k->type) { - case BCH_INODE_FS: + case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); if (ret) return ret; bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - BCH_QUOTA_NOCHECK); + KEY_TYPE_QUOTA_NOCHECK); bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - BCH_QUOTA_NOCHECK); + KEY_TYPE_QUOTA_NOCHECK); } } return bch2_btree_iter_unlock(&iter) ?: ret; @@ -743,7 +727,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, return ret; switch (k.k->type) { - case BCH_QUOTA: + case KEY_TYPE_quota: new_quota.v = *bkey_s_c_to_quota(k).v; break; } diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 9c06eb07bccb..294a04db84bf 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -10,15 +10,15 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_quota_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_quota (struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ .val_to_text = bch2_quota_to_text, \ } enum quota_acct_mode { - BCH_QUOTA_PREALLOC, - BCH_QUOTA_WARN, - BCH_QUOTA_NOCHECK, + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, }; static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 5d246c5b8186..eec74d4a5712 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -70,28 +70,34 @@ void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) } static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return DATA_SKIP; + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return DATA_SKIP; - extent_for_each_ptr_decode(e, p, entry) - if (rebalance_ptr_pred(c, p, io_opts)) - goto found; + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) + goto found; - return DATA_SKIP; + return DATA_SKIP; found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } + default: + return DATA_SKIP; + } } struct rebalance_work { 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1cb0c9940ec1..172770606294 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -147,6 +147,10 @@ int bch2_fs_recovery(struct bch_fs *c) mutex_unlock(&c->sb_lock); goto err; } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); } mutex_unlock(&c->sb_lock); @@ -265,12 +269,18 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; - mutex_unlock(&c->sb_lock); + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { + if (c->sb.version < bcachefs_metadata_version_new_versioning) + c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); } + if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + mutex_unlock(&c->sb_lock); + if (enabled_qtypes(c)) { bch_verbose(c, "reading quotas:"); ret = bch2_fs_quota_read(c); @@ -379,9 +389,12 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 0296931b6b8c..77d175f34b2b 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -73,64 +73,57 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, static void extent_to_replicas(struct bkey_s_c k, struct bch_replicas_entry *r) { - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - r->nr_required = 1; + r->nr_required = 1; - extent_for_each_ptr_decode(e, p, entry) { - if (p.ptr.cached) - continue; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; - if (p.ec_nr) { - r->nr_devs = 0; - break; - } - - r->devs[r->nr_devs++] = p.ptr.dev; + if (p.ec_nr) { + r->nr_devs = 0; + break; } + + r->devs[r->nr_devs++] = p.ptr.dev; } } static void stripe_to_replicas(struct bkey_s_c k, struct bch_replicas_entry *r) { - if (k.k->type == BCH_STRIPE) { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - const struct bch_extent_ptr *ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; - r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) - r->devs[r->nr_devs++] = ptr->dev; - } + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; } -static void bkey_to_replicas(enum bkey_type type, - struct bkey_s_c k, +static void bkey_to_replicas(struct bkey_s_c k, struct bch_replicas_entry *e) { e->nr_devs = 0; - switch (type) { - case BKEY_TYPE_BTREE: + switch (k.k->type) { + case KEY_TYPE_btree_ptr: e->data_type = BCH_DATA_BTREE; 
extent_to_replicas(k, e); break; - case BKEY_TYPE_EXTENTS: + case KEY_TYPE_extent: e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; - case BKEY_TYPE_EC: + case KEY_TYPE_stripe: e->data_type = BCH_DATA_USER; stripe_to_replicas(k, e); break; - default: - break; } replicas_entry_sort(e); @@ -296,26 +289,21 @@ int bch2_mark_replicas(struct bch_fs *c, return __bch2_mark_replicas(c, &search.e); } -int bch2_mark_bkey_replicas(struct bch_fs *c, - enum bkey_type type, - struct bkey_s_c k) +int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bch_replicas_entry_padded search; + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; int ret; memset(&search, 0, sizeof(search)); - if (type == BKEY_TYPE_EXTENTS) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + for (i = 0; i < cached.nr; i++) + if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i])))) + return ret; - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) - return ret; - } - - bkey_to_replicas(type, k, &search.e); + bkey_to_replicas(k, &search.e); return search.e.nr_devs ? __bch2_mark_replicas(c, &search.e) @@ -719,26 +707,22 @@ bool bch2_replicas_marked(struct bch_fs *c, } bool bch2_bkey_replicas_marked(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, bool check_gc_replicas) { struct bch_replicas_entry_padded search; + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; memset(&search, 0, sizeof(search)); - if (type == BKEY_TYPE_EXTENTS) { - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]), - check_gc_replicas)) - return false; - } + for (i = 0; i < cached.nr; i++) + if (!bch2_replicas_marked(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i]), + check_gc_replicas)) + return false; - bkey_to_replicas(type, k, &search.e); + bkey_to_replicas(k, &search.e); return search.e.nr_devs ? 
replicas_has_entry(c, &search.e, check_gc_replicas) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index e22d2d7cd08a..03aaafdc7c17 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -6,12 +6,11 @@ bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, struct bch_devs_list, bool); -bool bch2_bkey_replicas_marked(struct bch_fs *, enum bkey_type, +bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool); int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, struct bch_devs_list); -int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type, - struct bkey_s_c); +int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fbd6c3372677..6f30fbe44eb8 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -118,7 +118,6 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, struct bch_hash_desc { enum btree_id btree_id; u8 key_type; - u8 whiteout_type; u64 (*hash_key)(const struct bch_hash_info *, const void *); u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); @@ -149,7 +148,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, key)) return iter; - } else if (k.k->type == desc.whiteout_type) { + } else if (k.k->type == KEY_TYPE_whiteout) { ; } else { /* hole, not found */ @@ -202,7 +201,7 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (k.k->type != desc.key_type && - k.k->type != desc.whiteout_type) + k.k->type != KEY_TYPE_whiteout) return false; if (k.k->type == desc.key_type && @@ -245,7 +244,7 @@ static inline int __bch2_hash_set(struct btree_trans *trans, return PTR_ERR(slot); } - if (k.k->type != desc.whiteout_type) + if (k.k->type != KEY_TYPE_whiteout) goto not_found; } @@ -295,7 +294,7 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = iter->pos; - delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED; + delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); return 0; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 3dbcb6d7d261..dafdc45b442c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -240,21 +240,25 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) struct bch_sb_field *f; struct bch_sb_field_members *mi; const char *err; + u32 version, version_min; u16 block_size; - if (le16_to_cpu(sb->version) < BCH_SB_VERSION_MIN || - le16_to_cpu(sb->version) > BCH_SB_VERSION_MAX) + version = le16_to_cpu(sb->version); + version_min = version >= bcachefs_metadata_version_new_versioning + ? 
le16_to_cpu(sb->version_min) + : version; + + if (version >= bcachefs_metadata_version_max || + version_min < bcachefs_metadata_version_min) return "Unsupported superblock version"; + if (version_min > version) + return "Bad minimum version"; + if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) return "Filesystem has incompatible features"; - if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) { - SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7); - SET_BCH_SB_POSIX_ACL(sb, 1); - } - block_size = le16_to_cpu(sb->block_size); if (!is_power_of_2(block_size) || @@ -341,13 +345,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return err; } - if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 && - bch2_sb_get_crypt(sb) && - BCH_SB_INITIALIZED(sb)) - return "Incompatible extent nonces"; - - sb->version = cpu_to_le16(BCH_SB_VERSION_MAX); - return NULL; } @@ -364,6 +361,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -385,6 +383,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) unsigned i; dst->version = src->version; + dst->version_min = src->version_min; dst->seq = src->seq; dst->uuid = src->uuid; dst->user_uuid = src->user_uuid; @@ -483,8 +482,8 @@ reread: !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) return "Not a bcachefs superblock"; - if (le16_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN || - le16_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX) + if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || + le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) return "Unsupported superblock version"; bytes = vstruct_bytes(sb->sb); @@ -846,12 +845,6 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return "bucket size smaller than btree node size"; } - if (le16_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) - SET_BCH_MEMBER_DATA_ALLOWED(m, ~0); - return NULL; } @@ -881,6 +874,16 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { /* BCH_SB_FIELD_clean: */ +void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) +{ + struct jset_entry *entry; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) + bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); +} + void bch2_fs_mark_clean(struct bch_fs *c, bool clean) { struct bch_sb_field_clean *sb_clean; @@ -935,6 +938,10 @@ void bch2_fs_mark_clean(struct bch_fs *c, bool clean) BUG_ON(entry != vstruct_end(&sb_clean->field)); + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(sb_clean, WRITE); + mutex_unlock(&c->btree_root_lock); write_super: bch2_write_super(c); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index aa618fe9cd22..ac3b704f0540 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -135,6 +135,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_clean: */ +void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); + void bch2_fs_mark_clean(struct bch_fs *, bool); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c 
index cadbc5481bcb..7405b5cdd1bf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1799,7 +1799,7 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -unsigned bch2_metadata_version = BCH_SB_VERSION_MAX; +unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); module_exit(bcachefs_exit); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8eacc0d2550b..7e46b254da38 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -277,7 +277,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) return -EPERM; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) - if (k.k->type == BCH_EXTENT) { + if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9730540f7375..1aa6ac05d50e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -165,7 +165,7 @@ TRACE_EVENT(btree_write, TP_ARGS(b, bytes, sectors), TP_STRUCT__entry( - __field(enum bkey_type, type) + __field(enum btree_node_type, type) __field(unsigned, bytes ) __field(unsigned, sectors ) ), diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ab358c434753..ff2d59ee1658 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -62,8 +62,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct bch_hash_desc bch2_xattr_hash_desc = { .btree_id = BTREE_ID_XATTRS, - .key_type = BCH_XATTR, - .whiteout_type = BCH_XATTR_WHITEOUT, + .key_type = KEY_TYPE_xattr, .hash_key = xattr_hash_key, .hash_bkey = xattr_hash_bkey, .cmp_key = xattr_cmp_key, @@ -73,71 +72,50 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr; - - switch (k.k->type) { - case BCH_XATTR: - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) - return "value too small"; - - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_u64s(k.k) < - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) + return "value too small"; - if (bkey_val_u64s(k.k) > - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)) - return "value too big"; + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))) + return "value too small"; - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) - return "invalid type"; + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)) + return "value too big"; - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) - return "xattr name has invalid characters"; + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (!handler) + return "invalid type"; - return NULL; - case BCH_XATTR_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) + return "xattr name has invalid characters"; - default: - return "invalid type"; - } + return NULL; } void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - switch (k.k->type) { - case BCH_XATTR: - xattr = bkey_s_c_to_xattr(k); + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (handler && handler->prefix) + pr_buf(out, "%s", handler->prefix); + else if (handler) + pr_buf(out, "(type %u)", xattr.v->x_type); + else + pr_buf(out, "(unknown type %u)", xattr.v->x_type); - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (handler && handler->prefix) - pr_buf(out, "%s", handler->prefix); - else if (handler) - pr_buf(out, "(type %u)", xattr.v->x_type); - else - pr_buf(out, "(unknown type %u)", xattr.v->x_type); - - bch_scnmemcpy(out, xattr.v->x_name, - xattr.v->x_name_len); - pr_buf(out, ":"); - bch_scnmemcpy(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - break; - case BCH_XATTR_WHITEOUT: - pr_buf(out, "whiteout"); - break; - } + bch_scnmemcpy(out, xattr.v->x_name, + xattr.v->x_name_len); + pr_buf(out, ":"); + bch_scnmemcpy(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, @@ -261,7 +239,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (k.k->p.inode > inum) break; - if (k.k->type != BCH_XATTR) + if (k.k->type != KEY_TYPE_xattr) continue; xattr = bkey_s_c_to_xattr(k).v; @@ -315,7 +293,7 @@ static const struct xattr_handler bch_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_USER, + .flags = KEY_TYPE_XATTR_INDEX_USER, }; static bool bch2_xattr_trusted_list(struct dentry *dentry) @@ -328,14 +306,14 @@ static const struct xattr_handler bch_xattr_trusted_handler = { .list = bch2_xattr_trusted_list, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_TRUSTED, + .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, }; static const struct xattr_handler bch_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_SECURITY, + .flags = KEY_TYPE_XATTR_INDEX_SECURITY, }; #ifndef NO_BCACHEFS_FS @@ -474,13 +452,13 @@ const struct xattr_handler *bch2_xattr_handlers[] = { }; static const struct xattr_handler *bch_xattr_handler_map[] = { - [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = + [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, - [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, - [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, - [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, + [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, }; static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 63be44b02a2b..4151065ab853 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -9,7 +9,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; const char 
*bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_xattr_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_xattr (struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ .val_to_text = bch2_xattr_to_text, \ } -- cgit From f0cfb963ec0370b021bb21c899b5fdcd020014cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Nov 2018 02:14:31 -0500 Subject: bcachefs: Track nr_inodes with the key marking machinery Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 --- fs/bcachefs/btree_gc.c | 1 + fs/bcachefs/btree_types.h | 1 + fs/bcachefs/buckets.c | 6 ++++++ fs/bcachefs/buckets_types.h | 2 ++ fs/bcachefs/fs.c | 7 +------ fs/bcachefs/fsck.c | 8 -------- fs/bcachefs/recovery.c | 2 -- 8 files changed, 11 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cd2fff851bbe..d774ddf6cbb3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -721,9 +721,6 @@ struct bch_fs { struct mutex fsck_error_lock; bool fsck_alloc_err; - /* FILESYSTEM */ - atomic_long_t nr_inodes; - /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 85fc181e76a8..65cf64f22522 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -685,6 +685,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) for (b = 0; b < BCH_DATA_NR; b++) copy_fs_field(buckets[b], "buckets[%s]", bch2_data_types[b]); + copy_fs_field(nr_inodes, "nr_inodes"); for_each_possible_cpu(cpu) { p = per_cpu_ptr(c->usage[0], cpu); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b4a826369a57..f34f340ff034 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -439,6 +439,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) switch (type) { case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_INODES: case BKEY_TYPE_EC: return true; default: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d08e95020cef..8cbc1c5c8af5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -862,6 +862,12 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ret = bch2_mark_stripe(c, k, inserting, stats, journal_seq, flags, gc); break; + case KEY_TYPE_alloc: + if (inserting) + stats->nr_inodes++; + else + stats->nr_inodes--; + break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 0187f465d23f..9f7812c69bbc 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -73,6 +73,8 @@ struct bch_fs_usage { u64 buckets[BCH_DATA_NR]; + u64 nr_inodes; + /* fields starting here aren't touched by gc: */ u64 online_reserved; u64 available_cache; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 93e1f3aaacd4..db3c5962ad31 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -400,8 +400,6 @@ retry: if (unlikely(ret)) goto err_trans; - atomic_long_inc(&c->nr_inodes); - if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); @@ -1418,9 +1416,6 @@ static void bch2_evict_inode(struct inode *vinode) bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); - - WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0, - "nr_inodes < 0"); } } @@ -1439,7 +1434,7 @@ static int bch2_statfs(struct dentry 
*dentry, struct kstatfs *buf) buf->f_blocks = (c->capacity - hidden_metadata) >> shift; buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_files = usage.nr_inodes; buf->f_ffree = U64_MAX; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 57ab8f088415..810e1c3f4c49 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1314,9 +1314,6 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); BUG_ON(ret == -EINTR); if (ret) break; - - if (link->count) - atomic_long_inc(&c->nr_inodes); } else { /* Should have been caught by dirents pass: */ need_fsck_err_on(link->count, c, @@ -1380,7 +1377,6 @@ static int check_inodes_fast(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; - unsigned long nr_inodes = 0; int ret = 0; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { @@ -1389,9 +1385,6 @@ static int check_inodes_fast(struct bch_fs *c) inode = bkey_s_c_to_inode(k); - if (!(inode.v->bi_flags & BCH_INODE_UNLINKED)) - nr_inodes++; - if (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| BCH_INODE_I_SECTORS_DIRTY| @@ -1405,7 +1398,6 @@ static int check_inodes_fast(struct bch_fs *c) break; } } - atomic_long_set(&c->nr_inodes, nr_inodes); fsck_err: return bch2_btree_iter_unlock(&iter) ?: ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 172770606294..2d0736caa5ef 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -375,8 +375,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - atomic_long_set(&c->nr_inodes, 2); - if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); if (ret) -- cgit From eb8632657f79ee29941f4013b81cdd4aaeeca1a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Nov 2018 18:27:16 -0500 Subject: bcachefs: drop bogus percpu_ref_tryget caller should already be guarding against rw, and checking here breaks when caller needs to finish updates for going RO Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 22f087098776..83a72cccb870 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -562,7 +562,6 @@ static void bch2_btree_update_free(struct btree_update *as) closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); - percpu_ref_put(&c->writes); closure_wake_up(&c->btree_interior_update_wait); mutex_unlock(&c->btree_interior_update_lock); @@ -1012,14 +1011,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, struct btree_reserve *reserve; struct btree_update *as; - if (unlikely(!percpu_ref_tryget(&c->writes))) - return ERR_PTR(-EROFS); - reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); - if (IS_ERR(reserve)) { - percpu_ref_put(&c->writes); + if (IS_ERR(reserve)) return ERR_CAST(reserve); - } as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); -- cgit From 3636ed489ac05e61d59be29b8e69111ef781d528 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 13:50:15 -0400 Subject: bcachefs: Deferred btree updates Will be used in the future for inode updates, which will be very helpful for multithreaded workloads that have to update the inode with every extent update (appends, or updates 
that change i_sectors) Also will be used eventually for fully persistent alloc info However - we still need a mechanism for reserving space in the journal prior to getting a journal reservation, so it's not technically safe to make use of this just yet, we could deadlock with the journal full (although not likely to be an issue in practice) Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 23 +++- fs/bcachefs/btree_update.h | 12 ++ fs/bcachefs/btree_update_leaf.c | 258 +++++++++++++++++++++++++++++++--------- fs/bcachefs/journal_reclaim.c | 19 +++ fs/bcachefs/journal_reclaim.h | 2 + 5 files changed, 259 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f34f340ff034..ce5127301cb2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -245,9 +245,28 @@ struct btree_iter { #define BTREE_ITER_MAX 8 +struct deferred_update { + struct journal_entry_pin journal; + + spinlock_t lock; + unsigned gen; + + u8 allocated_u64s; + enum btree_id btree_id; + + /* must be last: */ + struct bkey_i k; +}; + struct btree_insert_entry { - struct btree_iter *iter; - struct bkey_i *k; + struct bkey_i *k; + + union { + struct btree_iter *iter; + struct deferred_update *d; + }; + + bool deferred; }; struct btree_trans { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d1647f6eb476..824fb0d1b7f0 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -16,6 +16,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, struct bkey_i *); +void bch2_deferred_update_free(struct bch_fs *, + struct deferred_update *); +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); + /* Normal update interface: */ struct btree_insert { @@ -38,6 +43,13 @@ int __bch2_btree_insert_at(struct btree_insert *); .k = (_k), \ }) +#define BTREE_INSERT_DEFERRED(_d, _k) \ + ((struct btree_insert_entry) { \ + .k = (_k), \ + .d = (_d), \ + .deferred = true, \ + }) + /** * bch_btree_insert_at - insert one or more keys at iterator positions * @iter: btree iterator diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fd27334cf2a4..12fd7fba3e9a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -8,6 +8,7 @@ #include "btree_locking.h" #include "buckets.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" @@ -126,6 +127,27 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } +static inline void __btree_journal_key(struct btree_insert *trans, + enum btree_id btree_id, + struct bkey_i *insert) +{ + struct journal *j = &trans->c->journal; + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = seq; +} + void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -140,21 +162,9 @@ void bch2_btree_journal_key(struct btree_insert *trans, !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); if (likely(!(trans->flags & 
BTREE_INSERT_JOURNAL_REPLAY))) { - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* ick */ - insert->k.needs_whiteout = false; - bch2_journal_add_keys(j, &trans->journal_res, - iter->btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); - - if (trans->journal_seq) - *trans->journal_seq = seq; - btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + __btree_journal_key(trans, iter->btree_id, insert); + btree_bset_last(b)->journal_seq = + cpu_to_le64(trans->journal_res.seq); } if (unlikely(!journal_pin_active(&w->journal))) { @@ -227,8 +237,109 @@ btree_insert_key_leaf(struct btree_insert *trans, return ret; } -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) +/* Deferred btree updates: */ + +static void deferred_update_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct deferred_update *d = + container_of(pin, struct deferred_update, journal); + u64 tmp[32]; + struct bkey_i *k = (void *) tmp; + unsigned gen; + int ret; + + if (d->allocated_u64s > ARRAY_SIZE(tmp)) { + k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); + + BUG_ON(!k); /* XXX */ + } + + spin_lock(&d->lock); + gen = d->gen; + + if (journal_pin_active(&d->journal)) { + BUG_ON(d->k.k.u64s > d->allocated_u64s); + bkey_copy(k, &d->k); + + spin_unlock(&d->lock); + + ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, + BTREE_INSERT_NOFAIL); + bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), + c, "error flushing deferred btree update: %i", ret); + + spin_lock(&d->lock); + } + + if (gen == d->gen) + bch2_journal_pin_drop(j, &d->journal); + spin_unlock(&d->lock); + + if (k != (void *) tmp) + kfree(k); +} + +static enum btree_insert_ret +btree_insert_key_deferred(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct deferred_update *d = insert->d; + + BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); + BUG_ON(insert->k->u64s > d->allocated_u64s); + + __btree_journal_key(trans, d->btree_id, insert->k); + + spin_lock(&d->lock); + d->gen++; + bkey_copy(&d->k, insert->k); + spin_unlock(&d->lock); + + bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, + deferred_update_flush); + + return BTREE_INSERT_OK; +} + +void bch2_deferred_update_free(struct bch_fs *c, + struct deferred_update *d) +{ + deferred_update_flush(&c->journal, &d->journal, 0); + + BUG_ON(journal_pin_active(&d->journal)); + + bch2_journal_pin_flush(&c->journal, &d->journal); + kfree(d); +} + +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *c, + enum btree_id btree_id, + unsigned u64s) +{ + struct deferred_update *d; + + BUG_ON(u64s > U8_MAX); + + d = kmalloc(offsetof(struct deferred_update, k) + + u64s * sizeof(u64), GFP_NOFS); + BUG_ON(!d); + + memset(d, 0, offsetof(struct deferred_update, k)); + + spin_lock_init(&d->lock); + d->allocated_u64s = u64s; + d->btree_id = btree_id; + + return d; +} + +/* struct btree_insert operations: */ /* * We sort transaction entries so that if multiple iterators point to the same @@ -238,25 +349,32 @@ static bool same_leaf_as_prev(struct btree_insert *trans, struct btree_insert_entry *i) { return i != trans->entries && + !i->deferred && i[0].iter->l[0].b == i[-1].iter->l[0].b; } -static inline struct btree_insert_entry 
*trans_next_leaf(struct btree_insert *trans, - struct btree_insert_entry *i) -{ - struct btree *b = i->iter->l[0].b; +#define __trans_next_entry(_trans, _i, _filter) \ +({ \ + while ((_i) < (_trans)->entries + (_trans->nr) && !(_filter)) \ + (_i)++; \ + \ + (_i) < (_trans)->entries + (_trans->nr); \ +}) - do { - i++; - } while (i < trans->entries + trans->nr && b == i->iter->l[0].b); +#define __trans_for_each_entry(_trans, _i, _filter) \ + for ((_i) = (_trans)->entries; \ + __trans_next_entry(_trans, _i, _filter); \ + (_i)++) - return i; -} +#define trans_for_each_entry(trans, i) \ + __trans_for_each_entry(trans, i, true) + +#define trans_for_each_iter(trans, i) \ + __trans_for_each_entry(trans, i, !(i)->deferred) #define trans_for_each_leaf(trans, i) \ - for ((i) = (trans)->entries; \ - (i) < (trans)->entries + (trans)->nr; \ - (i) = trans_next_leaf(trans, i)) + __trans_for_each_entry(trans, i, !(i)->deferred && \ + !same_leaf_as_prev(trans, i)) inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) @@ -294,7 +412,8 @@ static void multi_unlock_write(struct btree_insert *trans) static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { - return btree_iter_cmp(l.iter, r.iter); + return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + btree_iter_cmp(l.iter, r.iter); } /* Normal update interface: */ @@ -328,6 +447,15 @@ btree_key_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } +static inline enum btree_insert_ret +do_btree_insert_one(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + return likely(!insert->deferred) + ? btree_insert_key_leaf(trans, insert) + : btree_insert_key_deferred(trans, insert); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -340,9 +468,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans, unsigned u64s; int ret; - trans_for_each_entry(trans, i) + trans_for_each_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + /* reserve space for deferred updates */ + __trans_for_each_entry(trans, i, i->deferred) { + + } + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { @@ -353,9 +486,13 @@ static inline int do_btree_insert_at(struct btree_insert *trans, while ((ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { - struct btree_iter *iter = trans->entries[0].iter; + struct btree_iter *iter = NULL; + + trans_for_each_iter(trans, i) + iter = i->iter; - bch2_btree_iter_unlock(iter); + if (iter) + bch2_btree_iter_unlock(iter); ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, @@ -363,7 +500,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, if (ret) return ret; - if (!bch2_btree_iter_relock(iter)) { + if (iter && !bch2_btree_iter_relock(iter)) { trans_restart(" (iter relock after journal res get blocked)"); return -EINTR; } @@ -387,7 +524,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, * amount of space available: */ u64s = 0; - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -415,14 +552,17 @@ static inline int do_btree_insert_at(struct btree_insert *trans, * have been traversed/locked, depending on what the caller was * doing: */ - for_each_btree_iter(trans->entries[0].iter, linked) 
- if (linked->uptodate < BTREE_ITER_NEED_RELOCK) - linked->flags |= BTREE_ITER_NOUNLOCK; + trans_for_each_iter(trans, i) { + for_each_btree_iter(i->iter, linked) + if (linked->uptodate < BTREE_ITER_NEED_RELOCK) + linked->flags |= BTREE_ITER_NOUNLOCK; + break; + } } trans->did_work = true; trans_for_each_entry(trans, i) { - switch (btree_insert_key_leaf(trans, i)) { + switch (do_btree_insert_one(trans, i)) { case BTREE_INSERT_OK: break; case BTREE_INSERT_NEED_TRAVERSE: @@ -444,12 +584,20 @@ out: static inline void btree_insert_entry_checks(struct bch_fs *c, struct btree_insert_entry *i) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + enum btree_id btree_id = !i->deferred + ? i->iter->btree_id + : i->d->btree_id; + + if (!i->deferred) { + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + + bch2_btree_iter_verify_locks(i->iter); + } + BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->iter->btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); } /** @@ -473,20 +621,18 @@ int __bch2_btree_insert_at(struct btree_insert *trans) BUG_ON(!trans->nr); - bch2_btree_iter_verify_locks(trans->entries[0].iter); - /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); + trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); - bubble_sort(trans->entries, trans->nr, btree_trans_cmp); - if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; retry: - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { unsigned old_locks_want = i->iter->locks_want; unsigned old_uptodate = i->iter->uptodate; @@ -510,16 +656,22 @@ retry: trans_for_each_leaf(trans, i) bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); - trans_for_each_entry(trans, i) + trans_for_each_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: percpu_ref_put(&c->writes); /* make sure we didn't drop or screw up locks: */ - bch2_btree_iter_verify_locks(trans->entries[0].iter); + trans_for_each_iter(trans, i) { + bch2_btree_iter_verify_locks(i->iter); + break; + } - for_each_btree_iter(trans->entries[0].iter, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; + trans_for_each_iter(trans, i) { + for_each_btree_iter(i->iter, linked) + linked->flags &= ~BTREE_ITER_NOUNLOCK; + break; + } BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); @@ -598,7 +750,7 @@ err: goto out; } - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6ada63f1bb25..770a6e0c7d97 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -75,6 +75,25 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } +void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (pin->seq != seq) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, seq, pin, flush_fn); + } else { + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, seq); + + list_move(&pin->list, &pin_list->list); + } + + spin_unlock(&j->lock); +} + void bch2_journal_pin_add_if_older(struct journal *j, struct journal_entry_pin *src_pin, struct journal_entry_pin *pin, diff --git a/fs/bcachefs/journal_reclaim.h 
b/fs/bcachefs/journal_reclaim.h index f5af4252c88a..e06ac0492960 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -19,6 +19,8 @@ journal_seq_pin(struct journal *j, u64 seq) void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, -- cgit From 90541a741d74373b8cca2bcd56c469927d093064 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Jul 2018 23:36:11 -0400 Subject: bcachefs: Add new alloc fields prep work for persistent alloc info Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 192 +++++++++++++++++++++++++---------------- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/bcachefs_format.h | 31 +++++-- fs/bcachefs/btree_gc.c | 6 +- fs/bcachefs/buckets_types.h | 38 ++++---- 5 files changed, 166 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 885aff511f97..bd3070539e28 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -23,6 +23,13 @@ #include #include +static const char * const bch2_alloc_field_names[] = { +#define x(name, bytes) #name, + BCH_ALLOC_FIELDS() +#undef x + NULL +}; + static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); /* Ratelimiting/PD controllers */ @@ -62,14 +69,73 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ +static inline u64 get_alloc_field(const struct bch_alloc *a, + const void **p, unsigned field) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) + return 0; + + switch (bytes) { + case 1: + v = *((const u8 *) *p); + break; + case 2: + v = le16_to_cpup(*p); + break; + case 4: + v = le32_to_cpup(*p); + break; + case 8: + v = le64_to_cpup(*p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + + if (!v) + return; + + a->v.fields |= 1 << field; + + switch (bytes) { + case 1: + *((u8 *) *p) = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + case 8: + *((__le64 *) *p) = cpu_to_le64(v); + break; + default: + BUG(); + } + + *p += bytes; +} + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { - unsigned bytes = offsetof(struct bch_alloc, data); + unsigned i, bytes = offsetof(struct bch_alloc, data); - if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - bytes += 2; - if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - bytes += 2; + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } @@ -93,58 +159,55 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + const void *d = a.v->data; + unsigned i; pr_buf(out, "gen %u", a.v->gen); + + for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) + if (a.v->fields & (1 << i)) + pr_buf(out, " %s %llu", + bch2_alloc_field_names[i], + get_alloc_field(a.v, &d, i)); } -static inline unsigned get_alloc_field(const u8 **p, unsigned 
bytes) +static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) { - unsigned v; - - switch (bytes) { - case 1: - v = **p; - break; - case 2: - v = le16_to_cpup((void *) *p); - break; - case 4: - v = le32_to_cpup((void *) *p); - break; - default: - BUG(); - } - - *p += bytes; - return v; + const void *d = a->data; + unsigned idx = 0; + + g->_mark.gen = a->gen; + g->gen_valid = 1; + g->io_time[READ] = get_alloc_field(a, &d, idx++); + g->io_time[WRITE] = get_alloc_field(a, &d, idx++); + g->_mark.data_type = get_alloc_field(a, &d, idx++); + g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++); + g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); } -static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g) { - switch (bytes) { - case 1: - **p = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - default: - BUG(); - } + struct bucket_mark m = READ_ONCE(g->mark); + unsigned idx = 0; + void *d = a->v.data; - *p += bytes; + a->v.fields = 0; + a->v.gen = m.gen; + + d = a->v.data; + put_alloc_field(a, &d, idx++, g->io_time[READ]); + put_alloc_field(a, &d, idx++, g->io_time[WRITE]); + put_alloc_field(a, &d, idx++, m.data_type); + put_alloc_field(a, &d, idx++, m.dirty_sectors); + put_alloc_field(a, &d, idx++, m.cached_sectors); + + set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); } static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) { struct bch_dev *ca; struct bkey_s_c_alloc a; - struct bucket_mark new; - struct bucket *g; - const u8 *d; if (k.k->type != KEY_TYPE_alloc) return; @@ -156,19 +219,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) return; percpu_down_read(&c->usage_lock); - - g = bucket(ca, a.k->p.offset); - bucket_cmpxchg(g, new, ({ - new.gen = a.v->gen; - new.gen_valid = 1; - })); - - d = a.v->data; - if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - g->io_time[READ] = get_alloc_field(&d, 2); - if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - g->io_time[WRITE] = get_alloc_field(&d, 2); - + __alloc_read_key(bucket(ca, a.k->p.offset), a.v); percpu_up_read(&c->usage_lock); } @@ -222,28 +273,21 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { - struct bucket_mark m; - __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; - struct bucket *g; - struct bkey_i_alloc *a; +#if 0 + __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; +#else + /* hack: */ + __BKEY_PADDED(k, 8) alloc_key; +#endif + struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); int ret; - u8 *d; - percpu_down_read(&c->usage_lock); - g = bucket(ca, b); + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = POS(ca->dev_idx, b); - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + a->k.p = POS(ca->dev_idx, b); - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_down_read(&c->usage_lock); + __alloc_write_key(a, bucket(ca, b)); percpu_up_read(&c->usage_lock); bch2_btree_iter_cond_resched(iter); @@ -1295,7 +1339,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) bu < buckets->nbuckets; bu++) { m = 
READ_ONCE(buckets->b[bu].mark); - if (!m.gen_valid || + if (!buckets->b[bu].gen_valid || !is_available_bucket(m) || m.cached_sectors) continue; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d774ddf6cbb3..e9ae7e6d53b2 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -619,8 +619,6 @@ struct bch_fs { struct percpu_rw_semaphore usage_lock; - struct closure_waitlist freelist_wait; - /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight @@ -633,6 +631,7 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; + struct closure_waitlist freelist_wait; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 801156b74335..162a0a307f1b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -807,11 +807,6 @@ struct bch_xattr { /* Bucket/allocation information: */ -enum { - BCH_ALLOC_FIELD_READ_TIME = 0, - BCH_ALLOC_FIELD_WRITE_TIME = 1, -}; - struct bch_alloc { struct bch_val v; __u8 fields; @@ -819,6 +814,32 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); +#define BCH_ALLOC_FIELDS() \ + x(read_time, 2) \ + x(write_time, 2) \ + x(data_type, 1) \ + x(dirty_sectors, 2) \ + x(cached_sectors, 2) + +enum { +#define x(name, bytes) BCH_ALLOC_FIELD_##name, + BCH_ALLOC_FIELDS() +#undef x + BCH_ALLOC_FIELD_NR +}; + +static const unsigned BCH_ALLOC_FIELD_BYTES[] = { +#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes, + BCH_ALLOC_FIELDS() +#undef x +}; + +#define x(name, bytes) + bytes +static const unsigned BKEY_ALLOC_VAL_U64s_MAX = + DIV_ROUND_UP(offsetof(struct bch_alloc, data) + BCH_ALLOC_FIELDS(), sizeof(u64)); +#undef x + /* Quotas: */ enum quota_types { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 65cf64f22522..997e72aa4cb1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -144,12 +144,12 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, size_t b = PTR_BUCKET_NR(ca, ptr); struct bucket *g = PTR_BUCKET(ca, ptr); - if (mustfix_fsck_err_on(!g->mark.gen_valid, c, + if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" "type %u gen %u", k.k->type, ptr->gen)) { g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; + g->gen_valid = 1; set_bit(b, ca->buckets_dirty); } @@ -157,7 +157,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; + g->gen_valid = 1; set_bit(b, ca->buckets_dirty); set_bit(BCH_FS_FIXED_GENS, &c->flags); } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9f7812c69bbc..a47a7856eee4 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -9,28 +9,25 @@ struct bucket_mark { union { - struct { - atomic64_t v; - }; + atomic64_t v; struct { - u8 gen; - u8 data_type:3, - gen_valid:1, - owned_by_allocator:1, - nouse:1, - journal_seq_valid:1, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket - * can't be reused until the journal sequence number written to - * disk is >= the bucket's journal sequence number: - */ - u16 journal_seq; + u8 gen; + u8 data_type:3, + owned_by_allocator:1, + 
nouse:1, + journal_seq_valid:1, + stripe:1; + u16 dirty_sectors; + u16 cached_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: if journal_seq_valid is set, this bucket can't be + * reused until the journal sequence number written to disk is >= the + * bucket's journal sequence number: + */ + u16 journal_seq; }; }; }; @@ -42,6 +39,7 @@ struct bucket { }; u16 io_time[2]; + unsigned gen_valid:1; }; struct bucket_array { -- cgit From 8eb7f3ee46f23207d3e0ae6428b780a0708c53c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Nov 2018 01:16:07 -0500 Subject: bcachefs: move dirty into bucket_mark Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 35 +++++++++++++++++++++++++---------- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/buckets.c | 14 +++++++------- fs/bcachefs/buckets.h | 15 +++++++++++++-- fs/bcachefs/buckets_types.h | 2 +- 6 files changed, 49 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bd3070539e28..8756b2efb53e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -185,9 +185,9 @@ static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); } -static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g) +static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, + struct bucket_mark m) { - struct bucket_mark m = READ_ONCE(g->mark); unsigned idx = 0; void *d = a->v.data; @@ -280,6 +280,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, __BKEY_PADDED(k, 8) alloc_key; #endif struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); + struct bucket *g; + struct bucket_mark m; int ret; BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); @@ -287,7 +289,10 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, a->k.p = POS(ca->dev_idx, b); percpu_down_read(&c->usage_lock); - __alloc_write_key(a, bucket(ca, b)); + g = bucket(ca, b); + m = bucket_cmpxchg(g, m, m.dirty = false); + + __alloc_write_key(a, g, m); percpu_up_read(&c->usage_lock); bch2_btree_iter_cond_resched(iter); @@ -350,19 +355,24 @@ int bch2_alloc_write(struct bch_fs *c) for_each_rw_member(ca, c, i) { struct btree_iter iter; - unsigned long bucket; + struct bucket_array *buckets; + size_t b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); down_read(&ca->bucket_lock); - for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, - &iter, NULL, 0); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; + b < buckets->nbuckets; + b++) { + if (!buckets->b[b].mark.dirty) + continue; + + ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0); if (ret) break; - - clear_bit(bucket, ca->buckets_dirty); } up_read(&ca->bucket_lock); bch2_btree_iter_unlock(&iter); @@ -541,6 +551,10 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, if (!is_available_bucket(mark)) return false; + if (ca->buckets_nouse && + test_bit(bucket, ca->buckets_nouse)) + return false; + gc_gen = bucket_gc_gen(ca, bucket); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) @@ -1340,6 +1354,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) m = READ_ONCE(buckets->b[bu].mark); if (!buckets->b[bu].gen_valid || + !test_bit(bu, ca->buckets_nouse) || !is_available_bucket(m) || m.cached_sectors) continue; @@ -1378,7 
+1393,7 @@ not_enough: bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); fifo_push(&ca->free[RESERVE_BTREE], bu); - set_bit(bu, ca->buckets_dirty); + bucket_set_dirty(ca, bu); } } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e9ae7e6d53b2..48926fda44ff 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -395,7 +395,7 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; - unsigned long *buckets_dirty; + unsigned long *buckets_nouse; unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 997e72aa4cb1..1f99c5cb3439 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -150,7 +150,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, k.k->type, ptr->gen)) { g->_mark.gen = ptr->gen; g->gen_valid = 1; - set_bit(b, ca->buckets_dirty); + bucket_set_dirty(ca, b); } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, @@ -158,7 +158,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, k.k->type, ptr->gen, g->mark.gen)) { g->_mark.gen = ptr->gen; g->gen_valid = 1; - set_bit(b, ca->buckets_dirty); + bucket_set_dirty(ca, b); set_bit(BCH_FS_FIXED_GENS, &c->flags); } } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8cbc1c5c8af5..cc0a318ff133 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1132,7 +1132,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; - unsigned long *buckets_dirty = NULL; + unsigned long *buckets_nouse = NULL; unsigned long *buckets_written = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; @@ -1162,7 +1162,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) GFP_KERNEL|__GFP_ZERO)) || !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * @@ -1199,8 +1199,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(oldest_gens, ca->oldest_gens, n * sizeof(u8)); - memcpy(buckets_dirty, - ca->buckets_dirty, + memcpy(buckets_nouse, + ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); memcpy(buckets_written, ca->buckets_written, @@ -1211,7 +1211,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) buckets = old_buckets; swap(ca->oldest_gens, oldest_gens); - swap(ca->buckets_dirty, buckets_dirty); + swap(ca->buckets_nouse, buckets_nouse); swap(ca->buckets_written, buckets_written); if (resize) @@ -1250,7 +1250,7 @@ err: free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&free[i]); - kvpfree(buckets_dirty, + kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(buckets_written, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); @@ -1273,7 +1273,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free[i]); kvpfree(ca->buckets_written, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->buckets_dirty, + kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); 
kvpfree(rcu_dereference_protected(ca->buckets[0], 1), diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c584ad1b4375..d34181c78f9b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -57,6 +57,18 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } +static inline void bucket_set_dirty(struct bch_dev *ca, size_t b) +{ + struct bucket *g; + struct bucket_mark m; + + rcu_read_lock(); + g = bucket(ca, b); + bucket_cmpxchg(g, m, m.dirty = true); + rcu_read_unlock(); + +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -196,8 +208,7 @@ static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && !mark.dirty_sectors && - !mark.stripe && - !mark.nouse); + !mark.stripe); } static inline bool bucket_needs_journal_commit(struct bucket_mark m, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a47a7856eee4..35d5cf48003b 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -15,7 +15,7 @@ struct bucket_mark { u8 gen; u8 data_type:3, owned_by_allocator:1, - nouse:1, + dirty:1, journal_seq_valid:1, stripe:1; u16 dirty_sectors; -- cgit From 76640280ac45e2d56f5bf7c519b10653b9361fcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Nov 2018 19:01:45 -0500 Subject: bcachefs: New blockcount field for bch_stripe Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 010b9b90f2fc..5a5baba8d79b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,22 +105,32 @@ static unsigned stripe_csums_per_device(const struct bch_stripe *s) 1 << s->csum_granularity_bits); } -static unsigned stripe_val_u64s(const struct bch_stripe *s) +static unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) { - unsigned bytes = sizeof(struct bch_stripe) + + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + sizeof(struct bch_extent_ptr) * s->nr_blocks + - bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s); - return DIV_ROUND_UP(bytes, sizeof(u64)); + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; } -static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +static unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) { - unsigned csum_bytes = bch_crc_bytes[s->csum_type]; - void *csums = s->ptrs + s->nr_blocks; + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(16) * idx; +} - BUG_ON(!csum_bytes); +static unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} - return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +{ + return (void *) s + stripe_csum_offset(s, dev, csum_idx); } const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -133,7 +143,8 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_bytes(k.k) < sizeof(*s)) return "incorrect value size"; - if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + if (bkey_val_bytes(k.k) < sizeof(*s) || + bkey_val_u64s(k.k) < stripe_val_u64s(s)) return "incorrect value size"; return NULL; -- cgit From 
9166b41db1ded0ed284ae40fbe5ec5b83191cc65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Nov 2018 00:13:33 -0500 Subject: bcachefs: s/usage_lock/mark_lock better describes what it's for, and we're going to call a new lock usage_lock Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 +++++++-------- fs/bcachefs/alloc_foreground.c | 14 ++++++------- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/btree_gc.c | 16 +++++++-------- fs/bcachefs/btree_update_interior.c | 12 +++++------ fs/bcachefs/buckets.c | 40 ++++++++++++++++++------------------- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/io.c | 4 ++-- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/super.c | 4 ++-- 10 files changed, 58 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8756b2efb53e..9c9464efd333 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -218,9 +218,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) if (a.k->p.offset >= ca->mi.nbuckets) return; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); __alloc_read_key(bucket(ca, a.k->p.offset), a.v); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) @@ -288,12 +288,12 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, a->k.p = POS(ca->dev_idx, b); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); g = bucket(ca, b); m = bucket_cmpxchg(g, m, m.dirty = false); __alloc_write_key(a, g, m); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); bch2_btree_iter_cond_resched(iter); @@ -804,7 +804,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, { struct bucket_mark m; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_invalidate_bucket(c, ca, bucket, &m); @@ -817,7 +817,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, bucket_io_clock_reset(c, ca, bucket, READ); bucket_io_clock_reset(c, ca, bucket, WRITE); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); if (m.journal_seq_valid) { u64 journal_seq = atomic64_read(&c->journal.seq); @@ -1345,7 +1345,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) struct bucket_mark m; down_read(&ca->bucket_lock); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); buckets = bucket_array(ca); @@ -1369,7 +1369,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) if (fifo_full(&ca->free[RESERVE_BTREE])) break; } - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); up_read(&ca->bucket_lock); } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ddcf2c407764..3e77af4305a5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -101,7 +101,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), @@ -109,7 +109,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ob->valid = false; spin_unlock(&ob->lock); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); ob->freelist = c->open_buckets_freelist; @@ -441,7 +441,7 @@ static int 
ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) open_bucket_for_each(c, &h->blocks, ob, i) __clear_bit(ob->ptr.dev, devs.d); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); rcu_read_lock(); if (h->parity.nr < h->redundancy) { @@ -477,12 +477,12 @@ static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) } rcu_read_unlock(); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); return bch2_ec_stripe_new_alloc(c, h); err: rcu_read_unlock(); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); return -1; } @@ -638,7 +638,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); rcu_read_lock(); retry_blocking: @@ -655,7 +655,7 @@ retry_blocking: } rcu_read_unlock(); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 48926fda44ff..829b2c8b067b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -390,7 +390,7 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->usage_lock, bucket_lock and + * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ @@ -617,7 +617,7 @@ struct bch_fs { struct bch_fs_usage __percpu *usage[2]; - struct percpu_rw_semaphore usage_lock; + struct percpu_rw_semaphore mark_lock; /* * When we invalidate buckets, we use both the priority and the amount diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1f99c5cb3439..75ea243d4bbc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -351,7 +351,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, */ if (c) { lockdep_assert_held(&c->sb_lock); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); } else { preempt_disable(); } @@ -376,7 +376,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, } if (c) { - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } else { preempt_enable(); } @@ -422,7 +422,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) size_t i, j, iter; unsigned ci; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); gc_pos_set(c, gc_pos_alloc(c, NULL)); @@ -458,7 +458,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) spin_unlock(&ob->lock); } - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } static void bch2_gc_free(struct bch_fs *c) @@ -578,7 +578,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); if (initial) { bch2_gc_done_nocheck(c); @@ -698,7 +698,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) preempt_enable(); } out: - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); #undef copy_fs_field #undef copy_dev_field @@ -743,7 +743,7 @@ static int bch2_gc_start(struct bch_fs *c) } } - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); @@ -757,7 +757,7 @@ static int bch2_gc_start(struct bch_fs *c) dst->b[b]._mark.gen = src->b[b].mark.gen; }; - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); return bch2_ec_mem_alloc(c, true); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 83a72cccb870..a314bda544dd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1062,7 +1062,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, @@ -1076,7 +1076,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); } @@ -1155,7 +1155,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), true, 0, @@ -1177,7 +1177,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_node(b)); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); bch2_btree_bset_insert_key(iter, b, node_iter, insert); @@ -1965,7 +1965,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, @@ -1977,7 +1977,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index cc0a318ff133..c4fe703d1681 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -323,7 +323,7 @@ void bch2_fs_usage_apply(struct bch_fs *c, s64 added = sum.data + sum.reserved; s64 should_not_have_added; - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); /* * Not allowed to reduce sectors_available except by getting a @@ -364,7 +364,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage *dev_usage; - 
percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); bch2_fs_inconsistent_on(old.data_type && new.data_type && old.data_type != new.data_type, c, @@ -413,14 +413,14 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) struct bucket_array *buckets; struct bucket *g; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); fs_usage = this_cpu_ptr(c->usage[0]); buckets = bucket_array(ca); for_each_bucket(g, buckets) if (g->mark.data_type) bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ @@ -455,7 +455,7 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); __bch2_invalidate_bucket(c, ca, b, old, false); @@ -484,7 +484,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); if (!(flags & BCH_BUCKET_MARK_GC)) __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); @@ -531,7 +531,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); if (likely(c)) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); if (!(flags & BCH_BUCKET_MARK_GC)) __bch2_mark_metadata_bucket(c, ca, b, type, sectors, @@ -924,10 +924,10 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, { int ret; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); ret = bch2_mark_key_locked(c, k, inserting, sectors, pos, stats, journal_seq, flags); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); return ret; } @@ -946,7 +946,7 @@ void bch2_mark_update(struct btree_insert *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, @@ -1003,7 +1003,7 @@ void bch2_mark_update(struct btree_insert *trans, bch2_fs_usage_apply(c, &stats, trans->disk_res, pos); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } /* Disk reservations: */ @@ -1020,12 +1020,12 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); this_cpu_sub(c->usage[0]->online_reserved, res->sectors); bch2_fs_stats_verify(c); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); res->sectors = 0; } @@ -1040,7 +1040,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, s64 sectors_available; int ret; - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); preempt_disable(); stats = this_cpu_ptr(c->usage[0]); @@ -1054,7 +1054,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (get < sectors) { preempt_enable(); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, @@ -1070,7 +1070,7 @@ out: bch2_disk_reservations_verify(c, flags); 
bch2_fs_stats_verify(c); preempt_enable(); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); return 0; recalculate: @@ -1091,7 +1091,7 @@ recalculate: return -EINTR; } - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || @@ -1109,7 +1109,7 @@ recalculate: } bch2_fs_stats_verify(c); - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) up_read(&c->gc_lock); @@ -1185,7 +1185,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { down_write(&c->gc_lock); down_write(&ca->bucket_lock); - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); } old_buckets = bucket_array(ca); @@ -1215,7 +1215,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) swap(ca->buckets_written, buckets_written); if (resize) - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d34181c78f9b..a13f7e068c61 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -34,7 +34,7 @@ static inline struct bucket_array *__bucket_array(struct bch_dev *ca, { return rcu_dereference_check(ca->buckets[gc], !ca->fs || - percpu_rwsem_is_held(&ca->fs->usage_lock) || + percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 3e990709fedb..542d8ee6144c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1740,9 +1740,9 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { bio_inc_remaining(&orig->bio); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2f8dae4013af..310553bd5323 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -754,7 +754,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } if (c) { - percpu_down_read(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&c->journal.lock); } else { preempt_disable(); @@ -782,7 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { spin_unlock(&c->journal.lock); - percpu_up_read(&c->usage_lock); + percpu_up_read(&c->mark_lock); } else { preempt_enable(); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7405b5cdd1bf..cc14d4310436 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - percpu_free_rwsem(&c->usage_lock); + percpu_free_rwsem(&c->mark_lock); free_percpu(c->usage[0]); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); @@ -608,7 +608,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || - percpu_init_rwsem(&c->usage_lock) || + percpu_init_rwsem(&c->mark_lock) || 
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -- cgit From 56338d3dc3abb9ddece01ddc7cc892102dda2842 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Dec 2018 12:28:35 -0500 Subject: bcachefs: propagate BCH_WRITE_CACHED Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 542d8ee6144c..74c3a848e153 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -425,6 +425,7 @@ static void init_append_extent(struct bch_write_op *op, struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct bch_extent_ptr *ptr; op->pos.offset += crc.uncompressed_size; e->k.p = op->pos; @@ -438,6 +439,10 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size); + if (op->flags & BCH_WRITE_CACHED) + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->cached = true; + bch2_keylist_push(&op->insert_keys); } -- cgit From db636adb2e74fd54489f00667722e09a75702aaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Dec 2018 12:30:02 -0500 Subject: bcachefs: Compression fixes regressions from switching disk space accounting to be in compressed sectors Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 3 +-- fs/bcachefs/move.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c9a6f6e4a165..1076d32945f8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -305,8 +305,7 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) extent_for_each_ptr_decode(e, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE && - p.crc.compressed_size < p.crc.live_size) + p.crc.compression_type != BCH_COMPRESSION_NONE) ret += p.crc.compressed_size; } } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index aff611c908ef..9081952316b0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -128,13 +128,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.data_replicas); /* - * It's possible we race, and for whatever reason the extent now - * has fewer replicas than when we last looked at it - meaning - * we need to get a disk reservation here: + * If we're not fully overwriting @k, and it's compressed, we + * need a reservation for all the pointers in @insert */ nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - - (bch2_bkey_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); - if (nr > 0) { + m->nr_ptrs_reserved; + + if (insert->k.size < k.k->size && + bch2_extent_is_compressed(k) && + nr > 0) { /* * can't call bch2_disk_reservation_add() with btree * locks held, at least not without a song and dance @@ -242,8 +244,16 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, switch (data_cmd) { case DATA_ADD_REPLICAS: { + /* + * DATA_ADD_REPLICAS is used for moving data to a different + * device in the background, and due to compression the new copy + * might take up more space than the old copy: + */ +#if 0 int nr = (int) io_opts.data_replicas - bch2_bkey_nr_dirty_ptrs(k); +#endif + int nr = (int) io_opts.data_replicas; if (nr > 0) { m->op.nr_replicas = m->nr_ptrs_reserved = nr; -- cgit From 57cb2142ed1aadf2bf737f732bc74e5649dbcb15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Dec 2018 09:58:03 -0500 
Subject: bcachefs: Fix for running in degraded mode Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f3bb28f32c6e..4178dd9ceb8e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1036,7 +1036,7 @@ done: spin_unlock(&j->lock); rcu_read_unlock(); - return replicas >= replicas_want ? 0 : -EROFS; + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; } static void journal_write_compact(struct jset *jset) -- cgit From 641ab736437a3f9881467c0005b0d677194fff63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Dec 2018 11:52:58 -0500 Subject: bcachefs: improve/clarify ptr_disk_sectors() Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 47 ++++++++++++++++++--------------------------- fs/bcachefs/buckets.h | 14 ++++++++++++++ fs/bcachefs/extents_types.h | 8 ++++---- 3 files changed, 37 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c4fe703d1681..ab68c5138ade 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -558,36 +558,25 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) +static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + s64 delta) { - if (!sectors) - return 0; - - return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, - crc.uncompressed_size)); -} - -static s64 ptr_disk_sectors(const struct bkey *k, - struct extent_ptr_decoded p, - s64 sectors) -{ - - if (p.crc.compression_type) { - unsigned old_sectors, new_sectors; - - if (sectors > 0) { - old_sectors = 0; - new_sectors = sectors; - } else { - old_sectors = k->size; - new_sectors = k->size + sectors; - } + if (delta > 0) { + /* + * marking a new extent, which _will have size_ @delta + * + * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE + * case, we haven't actually created the key we'll be inserting + * yet (for the split) - so we don't want to be using + * k->size/crc.live_size here: + */ + return __ptr_disk_sectors(p, delta); + } else { + BUG_ON(-delta > p.crc.live_size); - sectors = -__disk_sectors(p.crc, old_sectors) - +__disk_sectors(p.crc, new_sectors); + return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - + (s64) ptr_disk_sectors(p); } - - return sectors; } /* @@ -722,7 +711,9 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(k.k, p, sectors); + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? sectors + : ptr_disk_sectors_delta(p, sectors); s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, p, disk_sectors, data_type, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a13f7e068c61..d76e65316245 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -136,6 +136,20 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } +static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) +{ + return live_size && p.crc.compression_type + ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, + p.crc.uncompressed_size)) + : live_size; +} + +static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +{ + return __ptr_disk_sectors(p, p.crc.live_size); +} + /* bucket gc marks */ static inline unsigned bucket_sectors_used(struct bucket_mark mark) diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index a85cda0e7a6a..a8dd6952d989 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -5,14 +5,14 @@ #include "bcachefs_format.h" struct bch_extent_crc_unpacked { + u32 compressed_size; + u32 uncompressed_size; + u32 live_size; + u8 csum_type; u8 compression_type; - u16 compressed_size; - u16 uncompressed_size; - u16 offset; - u16 live_size; u16 nonce; -- cgit From 45dbb73e407fe2e9f2819daf4f74e36e9e998a59 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Dec 2018 10:24:22 -0500 Subject: bcachefs: improve extent debugcheck fn Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 141 ++++++++++++++++---------------------------------- 1 file changed, 46 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1076d32945f8..67f6250ef91a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -626,48 +626,34 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - unsigned seq; const char *err; char buf[160]; struct bucket_mark mark; struct bch_dev *ca; - unsigned replicas = 0; - bool bad; + + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) - continue; + mark = ptr_bucket_mark(ca, ptr); err = "stale"; - if (ptr_stale(ca, ptr)) + if (gen_after(mark.gen, ptr->gen)) goto err; - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - err = "inconsistent"; - if (bad) + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) goto err; } - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, - "btree key bad (replicas not marked in superblock):\n%s", - buf); - return; - } - return; err: bch2_bkey_val_to_text(&PBUF(buf), c, k); @@ -1340,13 +1326,9 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; char buf[160]; - bool bad; - unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, @@ -1357,73 +1339,42 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - 
replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; - - stale = 0; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); - - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); - - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); - - if (stale) - break; - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - if (bad) - goto bad_ptr; - } - - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; - } - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
mark.cached_sectors + : mark.dirty_sectors; + + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } - - return; - -bad_ptr: - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); } void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From 56e0e7c79f492df2a5ad207a0ba16f13511b6416 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Dec 2018 12:01:29 -0500 Subject: bcachefs: fix an incorrect bkey_debugcheck() call Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4720061e9562..3eb51724f9e1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -531,8 +531,24 @@ found: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) + if (!b->level && node_iter == &iter->l[0].iter) { + /* + * not legal to call bkey_debugcheck() here, because we're + * called midway through the update path after update has been + * marked but before deletes have actually happened: + */ +#if 0 __btree_iter_peek_all(iter, &iter->l[0], &iter->k); +#endif + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(&l->iter, l->b); + + if (unlikely(!k)) + iter->k.type = KEY_TYPE_deleted; + else + bkey_disassemble(l->b, k, &iter->k); + } iter_current_key_not_modified: /* -- cgit From 73e6ab95640a7d370b7af481e8ed44be7c76b898 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Dec 2018 10:32:48 -0500 Subject: bcachefs: Switch replicas to mark_lock Prep work for upcoming disk accounting changes Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +- fs/bcachefs/recovery.c | 3 +- fs/bcachefs/replicas.c | 245 ++++++++++++++++++++----------------------- fs/bcachefs/replicas_types.h | 3 +- fs/bcachefs/super.c | 7 +- 5 files changed, 124 insertions(+), 138 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 829b2c8b067b..512498c275a5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -528,8 +528,8 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_replicas_cpu __rcu *replicas; - struct bch_replicas_cpu __rcu *replicas_gc; + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; struct bch_disk_groups_cpu __rcu *disk_groups; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2d0736caa5ef..cfdf9b563637 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -131,8 +131,7 @@ int bch2_fs_recovery(struct bch_fs *c) int ret; mutex_lock(&c->sb_lock); - if (!rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock))->nr) { + if (!c->replicas.entries) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } diff --git a/fs/bcachefs/replicas.c 
b/fs/bcachefs/replicas.c index 77d175f34b2b..a694b0fcd6a1 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -149,35 +149,31 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, replicas_entry_sort(e); } -static struct bch_replicas_cpu * +static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_replicas_cpu *old, struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *new; - unsigned i, nr, entry_size; - - entry_size = max_t(unsigned, old->entry_size, - replicas_entry_bytes(new_entry)); - nr = old->nr + 1; - - new = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!new) - return NULL; + unsigned i; + struct bch_replicas_cpu new = { + .nr = old->nr + 1, + .entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)), + }; - new->nr = nr; - new->entry_size = entry_size; + new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + if (!new.entries) + return new; for (i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(new, i), + memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), old->entry_size); - memcpy(cpu_replicas_entry(new, old->nr), + memcpy(cpu_replicas_entry(&new, old->nr), new_entry, replicas_entry_bytes(new_entry)); - bch2_cpu_replicas_sort(new); + bch2_cpu_replicas_sort(&new); return new; } @@ -194,16 +190,14 @@ static bool replicas_has_entry(struct bch_fs *c, struct bch_replicas_entry *search, bool check_gc_replicas) { - struct bch_replicas_cpu *r, *gc_r; bool marked; - rcu_read_lock(); - r = rcu_dereference(c->replicas); - marked = __replicas_has_entry(r, search) && + percpu_down_read(&c->mark_lock); + marked = __replicas_has_entry(&c->replicas, search) && (!check_gc_replicas || - likely(!(gc_r = rcu_dereference(c->replicas_gc))) || - __replicas_has_entry(gc_r, search)); - rcu_read_unlock(); + likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); + percpu_up_read(&c->mark_lock); return marked; } @@ -212,54 +206,55 @@ noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; + struct bch_replicas_cpu new_r, new_gc; int ret = -ENOMEM; + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + mutex_lock(&c->sb_lock); - old_gc = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - if (old_gc && !__replicas_has_entry(old_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry); - if (!new_gc) + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + if (!new_gc.entries) goto err; } - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - if (!__replicas_has_entry(old_r, new_entry)) { - new_r = cpu_replicas_add_entry(old_r, new_entry); - if (!new_r) + if (!__replicas_has_entry(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + if (!new_r.entries) goto err; - ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; } + if (!new_r.entries && + !new_gc.entries) + goto out; + /* allocations done, now commit: */ - if (new_r) + if (new_r.entries) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ - - if (new_gc) { - rcu_assign_pointer(c->replicas_gc, new_gc); - kfree_rcu(old_gc, rcu); - } - - if (new_r) { - 
rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); - } - - mutex_unlock(&c->sb_lock); - return 0; + percpu_down_write(&c->mark_lock); + if (new_r.entries) + swap(new_r, c->replicas); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); +out: + ret = 0; err: mutex_unlock(&c->sb_lock); - kfree(new_gc); - kfree(new_r); + + kfree(new_r.entries); + kfree(new_gc.entries); + return ret; } @@ -312,20 +307,14 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - struct bch_replicas_cpu *new_r, *old_r; - lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - new_r = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas_gc, NULL); - if (ret) goto err; - if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { + if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ret = -ENOSPC; goto err; } @@ -333,51 +322,54 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ +err: + percpu_down_write(&c->mark_lock); + if (!ret) + swap(c->replicas, c->replicas_gc); - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + percpu_up_write(&c->mark_lock); - rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); -out: mutex_unlock(&c->sb_lock); return ret; -err: - kfree_rcu(new_r, rcu); - goto out; } int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_cpu *dst, *src; struct bch_replicas_entry *e; + unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc); + BUG_ON(c->replicas_gc.entries); - src = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; - dst = kzalloc(sizeof(struct bch_replicas_cpu) + - src->nr * src->entry_size, GFP_NOIO); - if (!dst) { + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_NOIO); + if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); return -ENOMEM; } - dst->nr = 0; - dst->entry_size = src->entry_size; - - for_each_cpu_replicas_entry(src, e) + for_each_cpu_replicas_entry(&c->replicas, e) if (!((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(dst, dst->nr++), - e, dst->entry_size); - - bch2_cpu_replicas_sort(dst); + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); - rcu_assign_pointer(c->replicas_gc, dst); + bch2_cpu_replicas_sort(&c->replicas_gc); mutex_unlock(&c->sb_lock); return 0; @@ -385,11 +377,11 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) /* Replicas tracking - superblock: */ -static struct bch_replicas_cpu * -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +static int +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry *e, *dst; - struct bch_replicas_cpu *cpu_r; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -398,10 +390,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct 
bch_sb_field_replicas *sb_r) nr++; } - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -412,14 +403,14 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) replicas_entry_sort(dst); } - return cpu_r; + return 0; } -static struct bch_replicas_cpu * -__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) +static int +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry_v0 *e; - struct bch_replicas_cpu *cpu_r; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -431,10 +422,9 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -450,31 +440,31 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) replicas_entry_sort(dst); } - return cpu_r; + return 0; } int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu *cpu_r, *old_r; + struct bch_replicas_cpu new_r = { 0, 0, NULL }; + int ret = 0; if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_v1); + ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) - cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0); - else - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu), GFP_NOIO); + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (!cpu_r) + if (ret) return -ENOMEM; - bch2_cpu_replicas_sort(cpu_r); + bch2_cpu_replicas_sort(&new_r); + + percpu_down_write(&c->mark_lock); + swap(c->replicas, new_r); + percpu_up_write(&c->mark_lock); - old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas, cpu_r); - if (old_r) - kfree_rcu(old_r, rcu); + kfree(new_r.entries); return 0; } @@ -589,7 +579,7 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_cpu cpu_r = { .entries = NULL }; struct bch_replicas_entry *e; const char *err; unsigned i; @@ -616,13 +606,12 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi } err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) goto err; - err = check_dup_replicas_entries(cpu_r); + err = check_dup_replicas_entries(&cpu_r); err: - kfree(cpu_r); + kfree(cpu_r.entries); return err; } @@ -652,7 +641,7 @@ static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; + struct 
bch_replicas_cpu cpu_r = { .entries = NULL }; struct bch_replicas_entry_v0 *e; const char *err; unsigned i; @@ -673,13 +662,12 @@ static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb } err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r); - if (!cpu_r) + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) goto err; - err = check_dup_replicas_entries(cpu_r); + err = check_dup_replicas_entries(&cpu_r); err: - kfree(cpu_r); + kfree(cpu_r.entries); return err; } @@ -734,7 +722,6 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, { struct bch_sb_field_members *mi; struct bch_replicas_entry *e; - struct bch_replicas_cpu *r; unsigned i, nr_online, nr_offline; struct replicas_status ret; @@ -744,10 +731,10 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, ret.replicas[i].redundancy = INT_MAX; mi = bch2_sb_get_members(c->disk_sb.sb); - rcu_read_lock(); - r = rcu_dereference(c->replicas); - for_each_cpu_replicas_entry(r, e) { + percpu_down_read(&c->mark_lock); + + for_each_cpu_replicas_entry(&c->replicas, e) { if (e->data_type >= ARRAY_SIZE(ret.replicas)) panic("e %p data_type %u\n", e, e->data_type); @@ -772,7 +759,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, nr_offline); } - rcu_read_unlock(); + percpu_up_read(&c->mark_lock); for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) if (ret.replicas[i].redundancy == INT_MAX) @@ -821,18 +808,16 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { struct bch_replicas_entry *e; - struct bch_replicas_cpu *r; unsigned i, ret = 0; - rcu_read_lock(); - r = rcu_dereference(c->replicas); + percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(r, e) + for_each_cpu_replicas_entry(&c->replicas, e) for (i = 0; i < e->nr_devs; i++) if (e->devs[i] == ca->dev_idx) ret |= 1 << e->data_type; - rcu_read_unlock(); + percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index 3061840b6a02..0535b1d3760e 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -2,10 +2,9 @@ #define _BCACHEFS_REPLICAS_TYPES_H struct bch_replicas_cpu { - struct rcu_head rcu; unsigned nr; unsigned entry_size; - struct bch_replicas_entry entries[]; + struct bch_replicas_entry *entries; }; #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cc14d4310436..3887e63c0756 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -383,7 +383,8 @@ static void bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->replicas, 1)); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); if (c->copygc_wq) @@ -565,6 +566,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init_early(&c->btree_cache); + if (percpu_init_rwsem(&c->mark_lock)) + goto err; + mutex_lock(&c->sb_lock); if (bch2_sb_to_fs(c, sb)) { @@ -608,7 +612,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || - percpu_init_rwsem(&c->mark_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || 
mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -- cgit From 5663a4152138fdf23a300934128d77a1bf784237 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Nov 2018 08:23:22 -0500 Subject: bcachefs: refactor bch_fs_usage Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 10 +++- fs/bcachefs/buckets.c | 131 ++++++++++++++++++++++------------------- fs/bcachefs/buckets.h | 42 +++++++------ fs/bcachefs/buckets_types.h | 14 +++-- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/fs.c | 8 +-- fs/bcachefs/super.c | 2 + 8 files changed, 119 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3e77af4305a5..36aa7a5f2806 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -721,7 +721,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head, static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) { u64 stranded = c->write_points_nr * c->bucket_size_max; - u64 free = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + u64 free = bch2_fs_sectors_free(c); return stranded * factor > free; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 512498c275a5..92a0ecd8fbc3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -503,6 +503,10 @@ enum bch_fs_state { BCH_FS_RW, }; +struct bch_fs_pcpu { + u64 sectors_available; +}; + struct bch_fs { struct closure cl; @@ -615,9 +619,11 @@ struct bch_fs { atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_pcpu __percpu *pcpu; + + struct bch_fs_usage __percpu *usage[2]; - struct percpu_rw_semaphore mark_lock; + struct percpu_rw_semaphore mark_lock; /* * When we invalidate buckets, we use both the priority and the amount diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ab68c5138ade..c53d7a030832 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -272,16 +272,31 @@ static u64 avail_factor(u64 r) return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) { - struct fs_usage_sum sum = __fs_usage_sum(stats); + struct fs_usage_sum sum = __fs_usage_sum(fs_usage); return sum.hidden + sum.data + reserve_factor(sum.reserved); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) { - return min(c->capacity, __bch2_fs_sectors_used(c, stats)); + return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage)); +} + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage usage = bch2_fs_usage_read(c); + struct fs_usage_sum sum = __fs_usage_sum(usage); + struct bch_fs_usage_short ret; + + ret.capacity = READ_ONCE(c->capacity) - sum.hidden; + ret.used = min(ret.capacity, sum.data + + reserve_factor(sum.reserved)); + ret.nr_inodes = usage.nr_inodes; + + return ret; } static inline int is_unavailable_bucket(struct bucket_mark m) @@ -315,11 +330,11 @@ static bool bucket_became_unavailable(struct bucket_mark old, } void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, struct disk_reservation *disk_res, struct gc_pos gc_pos) { - struct fs_usage_sum sum = __fs_usage_sum(*stats); + struct fs_usage_sum sum = __fs_usage_sum(*fs_usage); s64 added = 
sum.data + sum.reserved; s64 should_not_have_added; @@ -337,24 +352,20 @@ void bch2_fs_usage_apply(struct bch_fs *c, } if (added > 0) { - disk_res->sectors -= added; - stats->online_reserved -= added; + disk_res->sectors -= added; + fs_usage->online_reserved -= added; } preempt_disable(); - /* online_reserved not subject to gc: */ - this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved); - stats->online_reserved = 0; - - bch2_usage_add(this_cpu_ptr(c->usage[0]), stats); + bch2_usage_add(this_cpu_ptr(c->usage[0]), fs_usage); if (gc_visited(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage[1]), stats); + bch2_usage_add(this_cpu_ptr(c->usage[1]), fs_usage); bch2_fs_stats_verify(c); preempt_enable(); - memset(stats, 0, sizeof(*stats)); + memset(fs_usage, 0, sizeof(*fs_usage)); } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, @@ -435,11 +446,11 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old, bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new; - *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; @@ -449,7 +460,7 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -468,11 +479,11 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); @@ -588,7 +599,7 @@ static void bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, + unsigned journal_seq, unsigned flags, bool gc) { struct bucket_mark old, new; @@ -693,8 +704,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, bool gc) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -717,7 +728,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, p, disk_sectors, data_type, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); if (!p.ptr.cached) for (i = 0; i < p.ec_nr; i++) { @@ -740,13 +751,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); ec_redundancy = clamp_t(unsigned, ec_redundancy, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); - stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; - stats->replicas[replicas - 
1].data[data_type] += dirty_sectors; - stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; + fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; + fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors; + fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors; return 0; } @@ -831,8 +842,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, bool gc) { int ret = 0; @@ -843,30 +854,30 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ? c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); break; case KEY_TYPE_extent: ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); break; case KEY_TYPE_stripe: ret = bch2_mark_stripe(c, k, inserting, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); break; case KEY_TYPE_alloc: if (inserting) - stats->nr_inodes++; + fs_usage->nr_inodes++; else - stats->nr_inodes--; + fs_usage->nr_inodes--; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; sectors *= replicas; replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); - stats->replicas[replicas - 1].persistent_reserved += sectors; + fs_usage->replicas[replicas - 1].persistent_reserved += sectors; break; } default: @@ -880,17 +891,15 @@ int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; if (!(flags & BCH_BUCKET_MARK_GC)) { - if (!stats) - stats = this_cpu_ptr(c->usage[0]); - ret = __bch2_mark_key(c, k, inserting, sectors, - stats, journal_seq, flags, false); + fs_usage ?: this_cpu_ptr(c->usage[0]), + journal_seq, flags, false); if (ret) return ret; } @@ -910,14 +919,14 @@ int bch2_mark_key_locked(struct bch_fs *c, int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read(&c->mark_lock); ret = bch2_mark_key_locked(c, k, inserting, sectors, - pos, stats, journal_seq, flags); + pos, fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); return ret; @@ -930,7 +939,7 @@ void bch2_mark_update(struct btree_insert *trans, struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage fs_usage = { 0 }; struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; @@ -943,7 +952,7 @@ void bch2_mark_update(struct btree_insert *trans, bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - pos, &stats, trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { @@ -976,7 +985,7 @@ void bch2_mark_update(struct btree_insert *trans, BUG_ON(sectors <= 0); bch2_mark_key_locked(c, k, true, sectors, - pos, &stats, 
trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -987,12 +996,12 @@ void bch2_mark_update(struct btree_insert *trans, } bch2_mark_key_locked(c, k, false, sectors, - pos, &stats, trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); bch2_btree_node_iter_advance(&node_iter, b); } - bch2_fs_usage_apply(c, &stats, trans->disk_res, pos); + bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos); percpu_up_read(&c->mark_lock); } @@ -1004,9 +1013,9 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) int cpu; for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage[0], cpu)->available_cache = 0; + per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0; - return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); + return avail_factor(bch2_fs_sectors_free(c)); } void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) @@ -1026,16 +1035,16 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, unsigned sectors, int flags) { - struct bch_fs_usage *stats; + struct bch_fs_pcpu *pcpu; u64 old, v, get; s64 sectors_available; int ret; percpu_down_read(&c->mark_lock); preempt_disable(); - stats = this_cpu_ptr(c->usage[0]); + pcpu = this_cpu_ptr(c->pcpu); - if (sectors <= stats->available_cache) + if (sectors <= pcpu->sectors_available) goto out; v = atomic64_read(&c->sectors_available); @@ -1051,12 +1060,12 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, } while ((v = atomic64_cmpxchg(&c->sectors_available, old, old - get)) != old); - stats->available_cache += get; + pcpu->sectors_available += get; out: - stats->available_cache -= sectors; - stats->online_reserved += sectors; - res->sectors += sectors; + pcpu->sectors_available -= sectors; + this_cpu_add(c->usage[0]->online_reserved, sectors); + res->sectors += sectors; bch2_disk_reservations_verify(c, flags); bch2_fs_stats_verify(c); @@ -1089,8 +1098,8 @@ recalculate: (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - stats->online_reserved += sectors; - res->sectors += sectors; + this_cpu_add(c->usage[0]->online_reserved, sectors); + res->sectors += sectors; ret = 0; bch2_disk_reservations_verify(c, flags); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d76e65316245..3db0e3b8a180 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -164,6 +164,20 @@ static inline bool bucket_unused(struct bucket_mark mark) !bucket_sectors_used(mark); } +static inline bool is_available_bucket(struct bucket_mark mark) +{ + return (!mark.owned_by_allocator && + !mark.dirty_sectors && + !mark.stripe); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +} + /* Device usage: */ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); @@ -207,31 +221,21 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); -void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -static inline u64 
bch2_fs_sectors_free(struct bch_fs *c, - struct bch_fs_usage stats) -{ - return c->capacity - bch2_fs_sectors_used(c, stats); -} +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *); -static inline bool is_available_bucket(struct bucket_mark mark) +static inline u64 bch2_fs_sectors_free(struct bch_fs *c) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe); -} + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); + return usage.capacity - usage.used; } +/* key/bucket marking: */ + void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, @@ -252,6 +256,10 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); +void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, struct gc_pos); + +/* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 35d5cf48003b..f451a96f432c 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -73,18 +73,22 @@ struct bch_fs_usage { u64 nr_inodes; - /* fields starting here aren't touched by gc: */ u64 online_reserved; - u64 available_cache; +}; + +struct bch_fs_usage_short { + u64 capacity; + u64 used; + u64 nr_inodes; }; /* * A reservation for space on disk: */ struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; + u64 sectors; + u32 gen; + unsigned nr_replicas; }; struct copygc_heap_entry { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index d24cff52ba96..c11f8f4d24cf 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -306,7 +306,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, .p.btree_id = ctx->stats.iter.btree_id, .p.pos = ctx->stats.iter.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), + .p.sectors_total = bch2_fs_usage_read_short(c).used, }; if (len < sizeof(e)) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index db3c5962ad31..8f0b049aa1ec 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1423,16 +1423,14 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; - struct bch_fs_usage usage = bch2_fs_usage_read(c); - u64 hidden_metadata = usage.buckets[BCH_DATA_SB] + - usage.buckets[BCH_DATA_JOURNAL]; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = (c->capacity - hidden_metadata) >> shift; - buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift; + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; buf->f_files = usage.nr_inodes; buf->f_ffree = U64_MAX; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3887e63c0756..b2113c5426ca 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -376,6 +376,7 @@ static void bch2_fs_free(struct bch_fs 
*c) bch2_fs_compress_exit(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->usage[0]); + free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -612,6 +613,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -- cgit From 06b7345cc282ec383942afb3b5b8d42bd9eec1b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Dec 2018 11:32:12 -0500 Subject: bcachefs: Include summarized counts in fs_usage Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 46 +++++++++++------- fs/bcachefs/buckets.c | 112 ++++++++++++++++++++------------------------ fs/bcachefs/buckets_types.h | 19 ++++++-- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/sysfs.c | 2 +- 5 files changed, 97 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 75ea243d4bbc..c353fbbed975 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -482,6 +482,24 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } +static void fs_usage_reset(struct bch_fs_usage *fs_usage) +{ + unsigned offset = offsetof(typeof(*fs_usage), s.gc_start); + + memset((void *) fs_usage + offset, 0, + sizeof(*fs_usage) - offset); +} + +static void fs_usage_cpy(struct bch_fs_usage *dst, + struct bch_fs_usage *src) +{ + unsigned offset = offsetof(typeof(*dst), s.gc_start); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(*dst) - offset); +} + static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; @@ -530,17 +548,12 @@ static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); - struct bch_fs_usage *p; - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(c->usage[0], cpu); - memset(p, 0, offsetof(typeof(*p), online_reserved)); - } + for_each_possible_cpu(cpu) + fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); preempt_disable(); - memcpy(this_cpu_ptr(c->usage[0]), - &src, - offsetof(typeof(*p), online_reserved)); + fs_usage_cpy(this_cpu_ptr(c->usage[0]), &src); preempt_enable(); } @@ -668,9 +681,14 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); - struct bch_fs_usage *p; unsigned r, b; + copy_fs_field(s.hidden, "hidden"); + copy_fs_field(s.data, "data"); + copy_fs_field(s.cached, "cached"); + copy_fs_field(s.reserved, "reserved"); + copy_fs_field(s.nr_inodes, "nr_inodes"); + for (r = 0; r < BCH_REPLICAS_MAX; r++) { for (b = 0; b < BCH_DATA_NR; b++) copy_fs_field(replicas[r].data[b], @@ -685,16 +703,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) for (b = 0; b < BCH_DATA_NR; b++) copy_fs_field(buckets[b], "buckets[%s]", bch2_data_types[b]); - copy_fs_field(nr_inodes, "nr_inodes"); - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(c->usage[0], cpu); - memset(p, 0, offsetof(typeof(*p), online_reserved)); - } + for_each_possible_cpu(cpu) + fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); preempt_disable(); - p = this_cpu_ptr(c->usage[0]); - memcpy(p, &dst, offsetof(typeof(*p), online_reserved)); + fs_usage_cpy(this_cpu_ptr(c->usage[0]), &dst); preempt_enable(); } out: diff --git 
a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c53d7a030832..16aafe8502a0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -106,9 +106,9 @@ static void bch2_fs_stats_verify(struct bch_fs *c) bch_data_types[j], stats.buckets[j]); - if ((s64) stats.online_reserved < 0) + if ((s64) stats.s.online_reserved < 0) panic("sectors_online_reserved underflow: %lli\n", - stats.online_reserved); + stats.s.online_reserved); } static void bch2_dev_stats_verify(struct bch_dev *ca) @@ -228,38 +228,6 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) return bch2_usage_read_raw(c->usage[0]); } -struct fs_usage_sum { - u64 hidden; - u64 data; - u64 cached; - u64 reserved; -}; - -static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) -{ - struct fs_usage_sum sum = { 0 }; - unsigned i; - - /* - * For superblock and journal we count bucket usage, not sector usage, - * because any internal fragmentation should _not_ be counted as - * free space: - */ - sum.hidden += stats.buckets[BCH_DATA_SB]; - sum.hidden += stats.buckets[BCH_DATA_JOURNAL]; - - for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { - sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; - sum.data += stats.replicas[i].data[BCH_DATA_USER]; - sum.data += stats.replicas[i].ec_data; - sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; - sum.reserved += stats.replicas[i].persistent_reserved; - } - - sum.reserved += stats.online_reserved; - return sum; -} - #define RESERVE_FACTOR 6 static u64 reserve_factor(u64 r) @@ -274,9 +242,10 @@ static u64 avail_factor(u64 r) static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) { - struct fs_usage_sum sum = __fs_usage_sum(fs_usage); - - return sum.hidden + sum.data + reserve_factor(sum.reserved); + return fs_usage.s.hidden + + fs_usage.s.data + + reserve_factor(fs_usage.s.reserved + + fs_usage.s.online_reserved); } u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) @@ -287,13 +256,14 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *c) { - struct bch_fs_usage usage = bch2_fs_usage_read(c); - struct fs_usage_sum sum = __fs_usage_sum(usage); + struct bch_fs_usage_summarized usage = + bch2_usage_read_raw(&c->usage[0]->s); struct bch_fs_usage_short ret; - ret.capacity = READ_ONCE(c->capacity) - sum.hidden; - ret.used = min(ret.capacity, sum.data + - reserve_factor(sum.reserved)); + ret.capacity = READ_ONCE(c->capacity) - usage.hidden; + ret.used = min(ret.capacity, usage.data + + reserve_factor(usage.reserved + + usage.online_reserved)); ret.nr_inodes = usage.nr_inodes; return ret; @@ -334,8 +304,7 @@ void bch2_fs_usage_apply(struct bch_fs *c, struct disk_reservation *disk_res, struct gc_pos gc_pos) { - struct fs_usage_sum sum = __fs_usage_sum(*fs_usage); - s64 added = sum.data + sum.reserved; + s64 added = fs_usage->s.data + fs_usage->s.reserved; s64 should_not_have_added; percpu_rwsem_assert_held(&c->mark_lock); @@ -353,7 +322,7 @@ void bch2_fs_usage_apply(struct bch_fs *c, if (added > 0) { disk_res->sectors -= added; - fs_usage->online_reserved -= added; + fs_usage->s.online_reserved -= added; } preempt_disable(); @@ -368,6 +337,18 @@ void bch2_fs_usage_apply(struct bch_fs *c, memset(fs_usage, 0, sizeof(*fs_usage)); } +static inline void account_bucket(struct bch_fs_usage *fs_usage, + struct bch_dev_usage *dev_usage, + enum bch_data_type type, + int nr, s64 size) +{ + if (type == BCH_DATA_SB || type == 
BCH_DATA_JOURNAL) + fs_usage->s.hidden += size; + + fs_usage->buckets[type] += size; + dev_usage->buckets[type] += nr; +} + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, @@ -386,15 +367,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); dev_usage = this_cpu_ptr(ca->usage[gc]); - if (bucket_type(old)) { - fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; - dev_usage->buckets[bucket_type(old)]--; - } + if (bucket_type(old)) + account_bucket(fs_usage, dev_usage, bucket_type(old), + -1, -ca->mi.bucket_size); - if (bucket_type(new)) { - fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; - dev_usage->buckets[bucket_type(new)]++; - } + if (bucket_type(new)) + account_bucket(fs_usage, dev_usage, bucket_type(new), + 1, ca->mi.bucket_size); dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; @@ -460,7 +439,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + fs_usage->s.cached -= old->cached_sectors; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -528,7 +508,10 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, checked_add(new.dirty_sectors, sectors); })); - fs_usage->replicas[0].data[type] += sectors; + if (type == BCH_DATA_BTREE || + type == BCH_DATA_USER) + fs_usage->s.data += sectors; + fs_usage->replicas[0].data[type] += sectors; } void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -755,8 +738,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ec_redundancy = clamp_t(unsigned, ec_redundancy, 1, ARRAY_SIZE(fs_usage->replicas)); + fs_usage->s.cached += cached_sectors; fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; + + fs_usage->s.data += dirty_sectors; fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors; + + fs_usage->s.data += ec_sectors; fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors; return 0; @@ -866,9 +854,9 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, break; case KEY_TYPE_alloc: if (inserting) - fs_usage->nr_inodes++; + fs_usage->s.nr_inodes++; else - fs_usage->nr_inodes--; + fs_usage->s.nr_inodes--; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -877,7 +865,8 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->replicas)); - fs_usage->replicas[replicas - 1].persistent_reserved += sectors; + fs_usage->s.reserved += sectors; + fs_usage->replicas[replicas - 1].persistent_reserved += sectors; break; } default: @@ -1021,8 +1010,7 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->online_reserved, - res->sectors); + this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors); bch2_fs_stats_verify(c); percpu_up_read(&c->mark_lock); @@ -1064,7 +1052,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(c->usage[0]->s.online_reserved, 
sectors); res->sectors += sectors; bch2_disk_reservations_verify(c, flags); @@ -1098,7 +1086,7 @@ recalculate: (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(c->usage[0]->s.online_reserved, sectors); res->sectors += sectors; ret = 0; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index f451a96f432c..196f07f41728 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -63,6 +63,21 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ + /* summarized: */ + struct bch_fs_usage_summarized { + u64 online_reserved; + + /* fields after online_reserved are cleared/recalculated by gc: */ + u64 gc_start[0]; + + u64 hidden; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; + } s; + + /* broken out: */ struct { u64 data[BCH_DATA_NR]; u64 ec_data; @@ -70,10 +85,6 @@ struct bch_fs_usage { } replicas[BCH_REPLICAS_MAX]; u64 buckets[BCH_DATA_NR]; - - u64 nr_inodes; - - u64 online_reserved; }; struct bch_fs_usage_short { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index c11f8f4d24cf..7f79f020d904 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -398,7 +398,7 @@ static long bch2_ioctl_usage(struct bch_fs *c, struct bch_ioctl_fs_usage dst = { .capacity = c->capacity, .used = bch2_fs_sectors_used(c, src), - .online_reserved = src.online_reserved, + .online_reserved = src.s.online_reserved, }; for (i = 0; i < BCH_REPLICAS_MAX; i++) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 7e46b254da38..a423159b6ed5 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -259,7 +259,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) stats.buckets[type]); pr_buf(&out, "online reserved:\t%llu\n", - stats.online_reserved); + stats.s.online_reserved); return out.pos - buf; } -- cgit From f9ccc30824a6b9f44b975c8ce952938eff5920f3 Mon Sep 17 00:00:00 2001 From: Tim Schlueter Date: Sun, 9 Dec 2018 13:20:52 -0800 Subject: bcachefs: Fix bkey_method compilation on gcc 7.3.0 Signed-off-by: Tim Schlueter Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f518062d896b..37c44f087a0b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -25,13 +25,13 @@ static const char *deleted_key_invalid(const struct bch_fs *c, return NULL; } -const struct bkey_ops bch2_bkey_ops_deleted = { - .key_invalid = deleted_key_invalid, -}; +#define bch2_bkey_ops_deleted (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} -const struct bkey_ops bch2_bkey_ops_discard = { - .key_invalid = deleted_key_invalid, -}; +#define bch2_bkey_ops_discard (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) { @@ -41,9 +41,9 @@ static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c return NULL; } -const struct bkey_ops bch2_bkey_ops_error = { - .key_invalid = empty_val_key_invalid, -}; +#define bch2_bkey_ops_error (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +} static const char *key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -54,13 +54,13 @@ static const char 
*key_type_cookie_invalid(const struct bch_fs *c, return NULL; } -const struct bkey_ops bch2_bkey_ops_cookie = { - .key_invalid = key_type_cookie_invalid, -}; +#define bch2_bkey_ops_cookie (struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ +} -const struct bkey_ops bch2_bkey_ops_whiteout = { - .key_invalid = empty_val_key_invalid, -}; +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +} static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -- cgit From 7121643e4fe20cbf916bf7ff032873dfbc19ee8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Dec 2018 06:23:25 -0500 Subject: bcachefs: Fix for building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 162a0a307f1b..8ef17c7ddaf7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -74,6 +74,7 @@ #include #include +#include #include #ifdef __KERNEL__ -- cgit From a3e70fb287ee62ee14512a69e9a3e1870a057e11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Dec 2018 06:01:30 -0500 Subject: bcachefs: use x-macros more consistently Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 64 +++++++++++++++++++++---------------------- fs/bcachefs/inode.c | 20 +++++++------- fs/bcachefs/inode.h | 16 +++++------ fs/bcachefs/opts.c | 16 +++++------ fs/bcachefs/opts.h | 26 +++++++++--------- 5 files changed, 71 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8ef17c7ddaf7..2e048d25d3a9 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -694,38 +694,38 @@ struct bch_inode_generation { __le32 pad; } __attribute__((packed, aligned(8))); -#define BCH_INODE_FIELDS() \ - BCH_INODE_FIELD(bi_atime, 64) \ - BCH_INODE_FIELD(bi_ctime, 64) \ - BCH_INODE_FIELD(bi_mtime, 64) \ - BCH_INODE_FIELD(bi_otime, 64) \ - BCH_INODE_FIELD(bi_size, 64) \ - BCH_INODE_FIELD(bi_sectors, 64) \ - BCH_INODE_FIELD(bi_uid, 32) \ - BCH_INODE_FIELD(bi_gid, 32) \ - BCH_INODE_FIELD(bi_nlink, 32) \ - BCH_INODE_FIELD(bi_generation, 32) \ - BCH_INODE_FIELD(bi_dev, 32) \ - BCH_INODE_FIELD(bi_data_checksum, 8) \ - BCH_INODE_FIELD(bi_compression, 8) \ - BCH_INODE_FIELD(bi_project, 32) \ - BCH_INODE_FIELD(bi_background_compression, 8) \ - BCH_INODE_FIELD(bi_data_replicas, 8) \ - BCH_INODE_FIELD(bi_promote_target, 16) \ - BCH_INODE_FIELD(bi_foreground_target, 16) \ - BCH_INODE_FIELD(bi_background_target, 16) \ - BCH_INODE_FIELD(bi_erasure_code, 16) - -#define BCH_INODE_FIELDS_INHERIT() \ - BCH_INODE_FIELD(bi_data_checksum) \ - BCH_INODE_FIELD(bi_compression) \ - BCH_INODE_FIELD(bi_project) \ - BCH_INODE_FIELD(bi_background_compression) \ - BCH_INODE_FIELD(bi_data_replicas) \ - BCH_INODE_FIELD(bi_promote_target) \ - BCH_INODE_FIELD(bi_foreground_target) \ - BCH_INODE_FIELD(bi_background_target) \ - BCH_INODE_FIELD(bi_erasure_code) +#define BCH_INODE_FIELDS() \ + x(bi_atime, 64) \ + x(bi_ctime, 64) \ + x(bi_mtime, 64) \ + x(bi_otime, 64) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + 
x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) + +#define BCH_INODE_FIELDS_INHERIT() \ + x(bi_data_checksum) \ + x(bi_compression) \ + x(bi_project) \ + x(bi_background_compression) \ + x(bi_data_replicas) \ + x(bi_promote_target) \ + x(bi_foreground_target) \ + x(bi_background_target) \ + x(bi_erasure_code) enum { /* diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 30f93fbe280d..f967029584a7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -98,7 +98,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) \ +#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -108,7 +108,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, } BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x out = last_nonzero_field; nr_fields = last_nonzero_fieldnr; @@ -130,9 +130,9 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x } } @@ -150,7 +150,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) \ +#define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ memset((void *) unpacked + offset, 0, \ @@ -169,7 +169,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, in += ret; BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x /* XXX: signal if there were more fields than expected? 
*/ @@ -220,10 +220,10 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } -#define BCH_INODE_FIELD(_name, _bits) \ +#define x(_name, _bits) \ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x } const char *bch2_inode_generation_invalid(const struct bch_fs *c, @@ -265,9 +265,9 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_otime = now; if (parent) { -#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name; +#define x(_name) inode_u->_name = parent->_name; BCH_INODE_FIELDS_INHERIT() -#undef BCH_INODE_FIELD +#undef x } } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 0bc852e69355..74fa6ff84111 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -30,17 +30,17 @@ struct bch_inode_unpacked { u32 bi_flags; u16 bi_mode; -#define BCH_INODE_FIELD(_name, _bits) u##_bits _name; +#define x(_name, _bits) u##_bits _name; BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x }; struct bkey_inode_buf { struct bkey_i_inode inode; -#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8 +#define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; -#undef BCH_INODE_FIELD +#undef x } __attribute__((packed, aligned(8))); void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); @@ -65,11 +65,11 @@ static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked * { struct bch_io_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (inode->bi_##_name) \ opt_set(ret, _name, inode->bi_##_name - 1); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } @@ -77,12 +77,12 @@ static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, enum bch_opt_id id, u64 v) { switch (id) { -#define BCH_INODE_OPT(_name, ...) \ +#define x(_name, ...) 
\ case Opt_##_name: \ inode->bi_##_name = v; \ break; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x default: BUG(); } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 4d86c4bc4a5f..17245e0b4a73 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -377,40 +377,40 @@ no_val: struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { struct bch_io_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (opt_defined(src, _name)) \ opt_set(ret, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) { struct bch_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (opt_defined(src, _name)) \ opt_set(ret, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) { -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (opt_defined(src, _name)) \ opt_set(*dst, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x } bool bch2_opt_is_inode_opt(enum bch_opt_id id) { static const enum bch_opt_id inode_opt_list[] = { -#define BCH_INODE_OPT(_name, _bits) Opt_##_name, +#define x(_name, _bits) Opt_##_name, BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x }; unsigned i; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index acdc952c48be..222c130c2054 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -281,24 +281,24 @@ int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ -#define BCH_INODE_OPTS() \ - BCH_INODE_OPT(data_checksum, 8) \ - BCH_INODE_OPT(compression, 8) \ - BCH_INODE_OPT(background_compression, 8) \ - BCH_INODE_OPT(data_replicas, 8) \ - BCH_INODE_OPT(promote_target, 16) \ - BCH_INODE_OPT(foreground_target, 16) \ - BCH_INODE_OPT(background_target, 16) \ - BCH_INODE_OPT(erasure_code, 16) +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) struct bch_io_opts { -#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; +#define x(_name, _bits) unsigned _name##_defined:1; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x -#define BCH_INODE_OPT(_name, _bits) u##_bits _name; +#define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x }; struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -- cgit From d42dd4ad6d7e15a7742ed008d1be3c37247547c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Dec 2018 08:24:21 -0500 Subject: bcachefs: merge BCH_INODE_FIELDS_INHERIT/BCH_INODE_OPTS Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 21 +++++++++++---------- fs/bcachefs/inode.c | 4 ++-- fs/bcachefs/opts.h | 15 ++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2e048d25d3a9..19d05a1a0224 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -716,16 +716,17 @@ struct bch_inode_generation { x(bi_background_target, 16) \ x(bi_erasure_code, 16) -#define BCH_INODE_FIELDS_INHERIT() \ - x(bi_data_checksum) \ - x(bi_compression) \ - x(bi_project) \ - x(bi_background_compression) \ - x(bi_data_replicas) \ - x(bi_promote_target) \ - x(bi_foreground_target) \ - x(bi_background_target) \ - 
x(bi_erasure_code) +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) enum { /* diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f967029584a7..a85b7a683a3a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -265,8 +265,8 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_otime = now; if (parent) { -#define x(_name) inode_u->_name = parent->_name; - BCH_INODE_FIELDS_INHERIT() +#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; + BCH_INODE_OPTS() #undef x } } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 222c130c2054..faa2a72c8c3b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -188,7 +188,10 @@ enum opt_type { NO_SB_OPT, false) \ BCH_OPT(version_upgrade, u8, OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false) + NO_SB_OPT, false) \ + BCH_OPT(project, u8, OPT_INTERNAL, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ struct bch_opts { #define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1; @@ -281,16 +284,6 @@ int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) - struct bch_io_opts { #define x(_name, _bits) unsigned _name##_defined:1; BCH_INODE_OPTS() -- cgit From 0f5254aa98befa5187cc4d02584ab0f19d18ff68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 05:43:00 -0500 Subject: bcachefs: bch2_fs_quota_transfer improve quota transfer locking & make ei_qid usage more consistent Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 15 +++-------- fs/bcachefs/fs.c | 67 ++++++++++++++++++++++++++++++++--------------- fs/bcachefs/fs.h | 6 +++++ fs/bcachefs/quota.c | 7 ++--- fs/bcachefs/quota.h | 11 +++----- fs/bcachefs/quota_types.h | 6 +++++ 6 files changed, 68 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a89786f295cf..701882ce6024 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -108,21 +108,12 @@ static int bch2_set_projid(struct bch_fs *c, u32 projid) { struct bch_qid qid = inode->ei_qid; - int ret; - - if (projid == inode->ei_qid.q[QTYP_PRJ]) - return 0; qid.q[QTYP_PRJ] = projid; - return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - return ret; - - inode->ei_qid.q[QTYP_PRJ] = projid; - return 0; + return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); } static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 8f0b049aa1ec..d22b9e7e2082 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -170,7 +170,6 @@ void bch2_inode_update_after_write(struct bch_fs *c, inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; - inode->ei_qid = bch_qid(bi); bch2_inode_flags_to_vfs(inode); } @@ -248,6 +247,41 @@ retry: return ret < 0 ? 
ret : 0; } +int bch2_fs_quota_transfer(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_qid new_qid, + unsigned qtypes, + enum quota_acct_mode mode) +{ + unsigned i; + int ret; + + qtypes &= enabled_qtypes(c); + + for (i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + + mutex_lock(&inode->ei_quota_lock); + + ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) + for (i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -913,37 +947,27 @@ static int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid = inode->ei_qid; + struct bch_qid qid; struct btree_trans trans; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; struct inode_write_setattr s = { iattr, idmap }; - unsigned qtypes = 0; int ret; mutex_lock(&inode->ei_update_lock); - if (c->opts.usrquota && - (iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->v.i_uid)) { - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid), - qtypes |= 1 << QTYP_USR; - } + qid = inode->ei_qid; + + if (iattr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid); - if (c->opts.grpquota && - (iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + if (iattr->ia_valid & ATTR_GID) qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), iattr->ia_gid); - qtypes |= 1 << QTYP_GRP; - } - if (qtypes) { - ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - goto err; - } + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; bch2_trans_init(&trans, c); retry: @@ -1312,6 +1336,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); + inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4fdc11762cd7..fbb31976bc55 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -70,6 +70,12 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, + unsigned, + enum quota_acct_mode); + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7c38daac1cac..113a2ca88ffc 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -270,7 +270,8 @@ static void __bch2_quota_transfer(struct bch_memquota *src_q, int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { struct bch_memquota_type *q; struct bch_memquota *src_q[3], *dst_q[3]; @@ -296,13 +297,13 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, dst_q[i]->c[Q_SPC].v + space, - KEY_TYPE_QUOTA_PREALLOC); + mode); if (ret) goto err; ret = bch2_quota_check_limit(c, 
i, dst_q[i], &msgs, Q_INO, dst_q[i]->c[Q_INO].v + 1, - KEY_TYPE_QUOTA_PREALLOC); + mode); if (ret) goto err; } diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 294a04db84bf..72b5ea0d77c5 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -15,12 +15,6 @@ void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_quota_to_text, \ } -enum quota_acct_mode { - KEY_TYPE_QUOTA_PREALLOC, - KEY_TYPE_QUOTA_WARN, - KEY_TYPE_QUOTA_NOCHECK, -}; - static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) { return (struct bch_qid) { @@ -43,7 +37,7 @@ int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, s64, enum quota_acct_mode); int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64); + struct bch_qid, u64, enum quota_acct_mode); void bch2_fs_quota_exit(struct bch_fs *); void bch2_fs_quota_init(struct bch_fs *); @@ -62,7 +56,8 @@ static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { return 0; } diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h index 9eda6c363736..6a136083d389 100644 --- a/fs/bcachefs/quota_types.h +++ b/fs/bcachefs/quota_types.h @@ -8,6 +8,12 @@ struct bch_qid { u32 q[QTYP_NR]; }; +enum quota_acct_mode { + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, +}; + struct memquota_counter { u64 v; u64 hardlimit; -- cgit From 721d4ad8eb55bf66ef55b31438b6c8361acf283f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Dec 2018 08:32:11 -0500 Subject: bcachefs: Add flags to indicate if inode opts were inherited or explicitly set Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 11 ++++++++++- fs/bcachefs/fs-ioctl.c | 9 ++++++++- fs/bcachefs/inode.h | 4 ++++ 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 19d05a1a0224..f6cf4ccedcb1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -714,7 +714,8 @@ struct bch_inode_generation { x(bi_promote_target, 16) \ x(bi_foreground_target, 16) \ x(bi_background_target, 16) \ - x(bi_erasure_code, 16) + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -728,6 +729,14 @@ struct bch_inode_generation { x(background_target, 16) \ x(erasure_code, 16) +enum inode_opt_id { +#define x(name, ...) 
\ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + enum { /* * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 701882ce6024..d6563370bec4 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -122,7 +122,14 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, { struct flags_set *s = p; - bi->bi_project = s->projid; + if (s->projid != bi->bi_project) { + if (s->projid) + bi->bi_fields_set |= 1U << Inode_opt_project; + else + bi->bi_fields_set &= ~(1U << Inode_opt_project); + + bi->bi_project = s->projid; + } return bch2_inode_flags_set(inode, bi, p); } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 74fa6ff84111..7bf95f889d35 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -80,6 +80,10 @@ static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, #define x(_name, ...) \ case Opt_##_name: \ inode->bi_##_name = v; \ + if (v) \ + inode->bi_fields_set |= 1U << Inode_opt_##_name;\ + else \ + inode->bi_fields_set &= ~(1U << Inode_opt_##_name);\ break; BCH_INODE_OPTS() #undef x -- cgit From 4d269918ed502cba80ddad998bdb087a633c63ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 05:31:09 -0500 Subject: bcachefs: add bcachefs_effective xattrs Allows seeing xattrs that were inherited, not explicitly set Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 7 +- fs/bcachefs/inode.h | 34 ++++----- fs/bcachefs/xattr.c | 203 ++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 180 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a85b7a683a3a..23d3668b4567 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -13,7 +13,12 @@ #include -#define FIELD_BYTES() \ +const char * const bch2_inode_opts[] = { +#define x(name, ...) #name, + BCH_INODE_OPTS() +#undef x + NULL, +}; static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static const u8 bits_table[8] = { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 7bf95f889d35..07d7020f230d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,6 +6,8 @@ #include +extern const char * const bch2_inode_opts[]; + const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -73,17 +75,13 @@ static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked * return ret; } -static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum bch_opt_id id, u64 v) +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum inode_opt_id id, u64 v) { switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ +#define x(_name, ...) 
\ + case Inode_opt_##_name: \ inode->bi_##_name = v; \ - if (v) \ - inode->bi_fields_set |= 1U << Inode_opt_##_name;\ - else \ - inode->bi_fields_set &= ~(1U << Inode_opt_##_name);\ break; BCH_INODE_OPTS() #undef x @@ -92,16 +90,18 @@ static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, } } -static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum bch_opt_id id, u64 v) -{ - return __bch2_inode_opt_set(inode, id, v + 1); -} - -static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode, - enum bch_opt_id id) +static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, + enum inode_opt_id id) { - return __bch2_inode_opt_set(inode, id, 0); + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + return inode->bi_##_name; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ff2d59ee1658..dfb5c385e8c3 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -198,40 +198,83 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum, return ret; } -static size_t bch2_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - char *buffer, size_t buffer_size) +static void __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + char **buffer, size_t *buffer_size, + ssize_t *ret) +{ + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + name_len + 1; + + if (*buffer) { + if (total_len > *buffer_size) { + *ret = -ERANGE; + return; + } + + memcpy(*buffer, prefix, prefix_len); + memcpy(*buffer + prefix_len, + name, name_len); + (*buffer)[prefix_len + name_len] = '\0'; + + *buffer += total_len; + *buffer_size -= total_len; + } + + *ret += total_len; +} + +static void bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + char **buffer, size_t *buffer_size, + ssize_t *ret) { const struct xattr_handler *handler = bch2_xattr_type_to_handler(xattr->x_type); - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + xattr->x_name_len + 1; + if (handler && (!handler->list || handler->list(dentry))) + __bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, + buffer, buffer_size, ret); +} - if (buffer && total_len <= buffer_size) { - memcpy(buffer, prefix, prefix_len); - memcpy(buffer + prefix_len, - xattr->x_name, xattr->x_name_len); - buffer[prefix_len + xattr->x_name_len] = '\0'; - } +static void bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_info *inode, + char **buffer, + size_t *buffer_size, + ssize_t *ret, + bool all) +{ + const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; + unsigned id; + u64 v; - return total_len; - } else { - return 0; + for (id = 0; id < Inode_opt_nr; id++) { + v = bch2_inode_opt_get(&inode->ei_inode, id); + if (!v) + continue; + + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << id))) + continue; + + __bch2_xattr_emit(prefix, + bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), + buffer, buffer_size, ret); + if (*ret < 0) + break; } } ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct btree_iter iter; struct bkey_s_c k; - const struct bch_xattr *xattr; u64 inum = dentry->d_inode->i_ino; ssize_t ret = 0; - size_t len; for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) { BUG_ON(k.k->p.inode < inum); @@ -242,23 +285,25 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (k.k->type != KEY_TYPE_xattr) continue; - xattr = bkey_s_c_to_xattr(k).v; - - len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size); - if (buffer) { - if (len > buffer_size) { - bch2_btree_iter_unlock(&iter); - return -ERANGE; - } + bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, + &buffer, &buffer_size, &ret); + if (ret < 0) + break; + } + bch2_btree_iter_unlock(&iter); - buffer += len; - buffer_size -= len; - } + if (ret < 0) + return ret; - ret += len; + bch2_xattr_list_bcachefs(c, inode, &buffer, + &buffer_size, &ret, false); + if (ret < 0) + return ret; - } - bch2_btree_iter_unlock(&iter); + bch2_xattr_list_bcachefs(c, inode, &buffer, + &buffer_size, &ret, true); + if (ret < 0) + return ret; return ret; } @@ -318,27 +363,48 @@ static const struct xattr_handler bch_xattr_security_handler = { #ifndef NO_BCACHEFS_FS -static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) +static int opt_to_inode_opt(int id) +{ + switch (id) { +#define x(name, ...) 
\ + case Opt_##name: return Inode_opt_##name; + BCH_INODE_OPTS() +#undef x + default: + return -1; + } +} + +static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size, + bool all) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_opts opts = bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; - int id; + int id, inode_opt_id; u64 v; id = bch2_opt_lookup(name); if (id < 0 || !bch2_opt_is_inode_opt(id)) return -EINVAL; + inode_opt_id = opt_to_inode_opt(id); + if (inode_opt_id < 0) + return -EINVAL; + opt = bch2_opt_table + id; if (!bch2_opt_defined_by_id(&opts, id)) return -ENODATA; + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) + return -ENODATA; + v = bch2_opt_get_by_id(&opts, id); if (!buffer) { @@ -359,6 +425,14 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, } } +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, false); +} + struct inode_opt_set { int id; u64 v; @@ -372,9 +446,12 @@ static int inode_opt_set_fn(struct bch_inode_info *inode, struct inode_opt_set *s = p; if (s->defined) - bch2_inode_opt_set(bi, s->id, s->v); + bi->bi_fields_set |= 1U << s->id; else - bch2_inode_opt_clear(bi, s->id); + bi->bi_fields_set &= ~(1U << s->id); + + bch2_inode_opt_set(bi, s->id, s->v); + return 0; } @@ -389,33 +466,51 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, const struct bch_option *opt; char *buf; struct inode_opt_set s; - int ret; + int opt_id, inode_opt_id, ret; + + opt_id = bch2_opt_lookup(name); + if (opt_id < 0) + return -EINVAL; - s.id = bch2_opt_lookup(name); - if (s.id < 0 || !bch2_opt_is_inode_opt(s.id)) + opt = bch2_opt_table + opt_id; + + inode_opt_id = opt_to_inode_opt(opt_id); + if (inode_opt_id < 0) return -EINVAL; - opt = bch2_opt_table + s.id; + s.id = inode_opt_id; if (value) { + u64 v = 0; + buf = kmalloc(size + 1, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, opt, buf, &s.v); + ret = bch2_opt_parse(c, opt, buf, &v); kfree(buf); if (ret < 0) return ret; - ret = bch2_opt_check_may_set(c, s.id, s.v); + ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) return ret; + s.v = v + 1; s.defined = true; } else { + if (!IS_ROOT(dentry)) { + struct bch_inode_info *dir = + to_bch_ei(d_inode(dentry->d_parent)); + + s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); + } else { + s.v = 0; + } + s.defined = false; } @@ -424,8 +519,8 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, mutex_unlock(&inode->ei_update_lock); if (value && - (s.id == Opt_background_compression || - s.id == Opt_background_target)) + (opt_id == Opt_background_compression || + opt_id == Opt_background_target)) bch2_rebalance_add_work(c, inode->v.i_blocks); return ret; @@ -437,6 +532,21 @@ static const struct xattr_handler bch_xattr_bcachefs_handler = { .set = bch2_xattr_bcachefs_set, }; +static int bch2_xattr_bcachefs_get_effective( + const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, 
buffer, size, true); +} + +static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + .prefix = "bcachefs_effective.", + .get = bch2_xattr_bcachefs_get_effective, + .set = bch2_xattr_bcachefs_set, +}; + #endif /* NO_BCACHEFS_FS */ const struct xattr_handler *bch2_xattr_handlers[] = { @@ -447,6 +557,7 @@ const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS &bch_xattr_bcachefs_handler, + &bch_xattr_bcachefs_effective_handler, #endif NULL }; -- cgit From 96012e143e699db1a7644e4c5903b63bdde33772 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 05:31:49 -0500 Subject: bcachefs: rename keeps inheritable inode opts consistent Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fs.h | 25 +++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d22b9e7e2082..033582a87852 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -282,6 +282,32 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } +int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + u64 src, dst; + unsigned id; + int ret = 1; + + for (id = 0; id < Inode_opt_nr; id++) { + if (bi->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(&dir->ei_inode, id); + dst = bch2_inode_opt_get(bi, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(bi, id, src); + ret = 0; + } + + return ret; +} + static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -765,6 +791,7 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, void *p) { struct rename_info *info = p; + int ret; if (inode == info->src_dir) { bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); @@ -779,6 +806,19 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, S_ISDIR(info->dst_inode->v.i_mode); } + if (inode == info->src_inode) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); + + BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); + } + + if (inode == info->dst_inode && + info->mode == BCH_RENAME_EXCHANGE) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); + + BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); + } + if (inode == info->dst_inode && info->mode == BCH_RENAME_OVERWRITE) { BUG_ON(bi->bi_nlink && @@ -844,6 +884,39 @@ static int bch2_rename2(struct mnt_idmap *idmap, i.dst_inode); bch2_trans_init(&trans, c); + + if (S_ISDIR(i.src_inode->v.i_mode) && + inode_attrs_changing(i.dst_dir, i.src_inode)) { + ret = -EXDEV; + goto err; + } + + if (i.mode == BCH_RENAME_EXCHANGE && + S_ISDIR(i.dst_inode->v.i_mode) && + inode_attrs_changing(i.src_dir, i.dst_inode)) { + ret = -EXDEV; + goto err; + } + + if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.src_inode, + i.dst_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + if (i.mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.dst_inode, + i.src_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + retry: bch2_trans_begin(&trans); i.now = bch2_current_time(c); @@ -894,6 +967,17 @@ retry: ATTR_CTIME); err: bch2_trans_exit(&trans); + + bch2_fs_quota_transfer(c, i.src_inode, + 
bch_qid(&i.src_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + if (i.dst_inode) + bch2_fs_quota_transfer(c, i.dst_inode, + bch_qid(&i.dst_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + bch2_unlock_inodes(i.src_dir, i.dst_dir, i.src_inode, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index fbb31976bc55..18e41609c89d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -66,6 +66,27 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 2 : 1; } +static inline bool inode_attr_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode, + enum inode_opt_id id) +{ + return !(inode->ei_inode.bi_fields_set & (1 << id)) && + bch2_inode_opt_get(&dir->ei_inode, id) != + bch2_inode_opt_get(&inode->ei_inode, id); +} + +static inline bool inode_attrs_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode) +{ + unsigned id; + + for (id = 0; id < Inode_opt_nr; id++) + if (inode_attr_changing(dir, inode, id)) + return true; + + return false; +} + struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS @@ -91,6 +112,10 @@ int __must_check bch2_write_inode_trans(struct btree_trans *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); +int bch2_reinherit_attrs_fn(struct bch_inode_info *, + struct bch_inode_unpacked *, + void *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); -- cgit From 8095708fce72a911e20799078639e95c1a008176 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 06:11:14 -0500 Subject: bcachefs: bch2_ioc_reinherit_attrs() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 2 ++ fs/bcachefs/fs-ioctl.c | 77 +++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/fs.c | 31 ++---------------- fs/bcachefs/fs.h | 26 +++++++++++++++ fs/bcachefs/inode.c | 3 +- 5 files changed, 109 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index c65104ed454a..2dca4bb0362b 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -307,4 +307,6 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 14, const char __user *) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index d6563370bec4..92939befe507 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "chardev.h" +#include "dirent.h" #include "fs.h" #include "fs-ioctl.h" #include "quota.h" @@ -177,6 +178,75 @@ err: return ret; } +static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct file *file, + struct bch_inode_info *src, + const char __user *name) +{ + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; + struct qstr qstr; + int ret = 0; + u64 inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) + return -ENOMEM; + + ret = strncpy_from_user(kname, name, BCH_NAME_MAX); + if (unlikely(ret < 0)) + goto err1; + + qstr.hash_len = ret; + qstr.name = kname; + + ret = -ENOENT; + inum = bch2_dirent_lookup(c, src->v.i_ino, + &src->ei_str_hash, + &qstr); + if (!inum) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) + goto err1; + + dst = to_bch_ei(vinode); + + ret = mnt_want_write_file(file); + if (ret) + goto err2; + + bch2_lock_inodes(src, dst); + + if (inode_attr_changing(src, dst, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst, + 
src->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err3; + } + + ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); +err3: + bch2_unlock_inodes(src, dst); + + /* return true if we did work */ + if (ret >= 0) + ret = !ret; + + mnt_drop_write_file(file); +err2: + iput(vinode); +err1: + kfree(kname); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -193,7 +263,12 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_FSGETXATTR: return bch2_ioc_fsgetxattr(inode, (void __user *) arg); case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg); + return bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + + case BCHFS_IOC_REINHERIT_ATTRS: + return bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); case FS_IOC_GETVERSION: return -ENOTTY; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 033582a87852..d23a82d94c5e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -51,30 +51,6 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } -static inline int ptrcmp(void *l, void *r) -{ - return (l > r) - (l < r); -} - -#define __bch2_lock_inodes(_lock, ...) \ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ - \ - for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ - if (a[i] != a[i - 1]) { \ - if (_lock) \ - mutex_lock_nested(&a[i]->ei_update_lock, i);\ - else \ - mutex_unlock(&a[i]->ei_update_lock); \ - } \ -} while (0) - -#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) -#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) - static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { BUG_ON(atomic_long_read(&lock->v) == 0); @@ -308,7 +284,7 @@ int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, return ret; } -static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; @@ -393,14 +369,13 @@ __bch2_create(struct mnt_idmap *idmap, bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); bch2_inode_init_owner(&inode_u, &dir->v, mode); - inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; - hash_info = bch2_hash_info_init(c, &inode_u); if (tmpfile) inode_u.bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); if (ret) return ERR_PTR(ret); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 18e41609c89d..4c584d3a27c3 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -51,6 +51,30 @@ struct bch_inode_info { #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) +static inline int ptrcmp(void *l, void *r) +{ + return (l > r) - (l < r); +} + +#define __bch2_lock_inodes(_lock, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + \ + for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + if (a[i] != a[i - 1]) { \ + if (_lock) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + else \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +#define bch2_lock_inodes(...) 
__bch2_lock_inodes(true, __VA_ARGS__) +#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) + static inline struct bch_inode_info *file_bch_inode(struct file *file) { return to_bch_ei(file_inode(file)); @@ -97,6 +121,8 @@ int bch2_fs_quota_transfer(struct bch_fs *, unsigned, enum quota_acct_mode); +struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 23d3668b4567..6acb487312a8 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -258,7 +258,8 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, /* ick */ inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); + get_random_bytes(&inode_u->bi_hash_seed, + sizeof(inode_u->bi_hash_seed)); inode_u->bi_mode = mode; inode_u->bi_uid = uid; -- cgit From 19b505a93434500469d9e2726967f28685f777a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 09:24:49 -0500 Subject: bcachefs: Fix duplicate ioctl nr Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 2dca4bb0362b..8c0599618404 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -71,7 +71,11 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) + +/* ioctl below act on a particular file, not the filesystem as a whole: */ + +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) /* * BCH_IOCTL_QUERY_UUID: get filesystem UUID @@ -307,6 +311,4 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; -#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 14, const char __user *) - #endif /* _BCACHEFS_IOCTL_H */ -- cgit From d3bb629d04ec4ec49556fe3d974e0744a0dd0084 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Dec 2018 08:41:58 -0500 Subject: bcachefs: fix device remove error path Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b2113c5426ca..4e811ffdec8e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1306,7 +1306,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->state_lock); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_RW) + if (ca->mi.state == BCH_MEMBER_STATE_RW && + !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); mutex_unlock(&c->state_lock); return ret; -- cgit From a36d3685bbc3a5193f0c284233386d56136363c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Dec 2018 10:15:35 -0500 Subject: bcachefs: fix ja->cur_idx use while reading journal Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 
4178dd9ceb8e..67ff2633ba16 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -584,7 +584,7 @@ static void bch2_journal_read_device(struct closure *cl) while (ja->bucket_seq[ja->cur_idx] > min_seq && ja->bucket_seq[ja->cur_idx] > ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) - ja->cur_idx++; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = 0; -- cgit From 69d46f903120d4aab0e0ad239191245d839224cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Dec 2018 14:43:00 -0500 Subject: bcachefs: fix an rcu usage bug Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 74c3a848e153..e1c7d572fbff 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -967,6 +967,7 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + struct rcu_head rcu; u64 start_time; struct rhash_head hash; @@ -1020,7 +1021,7 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) bch_promote_params); BUG_ON(ret); percpu_ref_put(&c->writes); - kfree(op); + kfree_rcu(op, rcu); } static void promote_done(struct closure *cl) -- cgit From 2fab25cdd70be6868936639dfb03eaa9fa0245c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Dec 2018 08:43:01 -0500 Subject: bcachefs: more project quota fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 26 ++++++-------------------- fs/bcachefs/fs.h | 13 +++++++++++++ fs/bcachefs/quota.h | 2 +- fs/bcachefs/xattr.c | 7 +++++++ 4 files changed, 27 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 92939befe507..4925a127a335 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -104,19 +104,6 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, return copy_to_user(arg, &fa, sizeof(fa)); } -static int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) -{ - struct bch_qid qid = inode->ei_qid; - - qid.q[QTYP_PRJ] = projid; - - return bch2_fs_quota_transfer(c, inode, qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); -} - static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) @@ -124,11 +111,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, struct flags_set *s = p; if (s->projid != bi->bi_project) { - if (s->projid) - bi->bi_fields_set |= 1U << Inode_opt_project; - else - bi->bi_fields_set &= ~(1U << Inode_opt_project); - + bi->bi_fields_set |= 1U << Inode_opt_project; bi->bi_project = s->projid; } @@ -151,7 +134,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_xflags) return -EOPNOTSUPP; - s.projid = fa.fsx_projid; + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + + s.projid = fa.fsx_projid + 1; ret = mnt_want_write_file(file); if (ret) @@ -164,7 +150,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, fa.fsx_projid); + ret = bch2_set_projid(c, inode, s.projid); if (ret) goto err_unlock; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4c584d3a27c3..f949cd0d2a68 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -121,6 +121,19 @@ int bch2_fs_quota_transfer(struct bch_fs *, unsigned, enum quota_acct_mode); +static inline int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + + qid.q[QTYP_PRJ] = projid; + + 
return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); +} + struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); /* returns 0 if we want to do the update, or error is passed up */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 72b5ea0d77c5..51e4f9713ef0 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -20,7 +20,7 @@ static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) return (struct bch_qid) { .q[QTYP_USR] = u->bi_uid, .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project, + .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, }; } diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index dfb5c385e8c3..f31eec2f1fce 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -515,7 +515,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); + if (inode_opt_id == Inode_opt_project) { + ret = bch2_set_projid(c, inode, s.v); + if (ret) + goto err; + } + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); +err: mutex_unlock(&inode->ei_update_lock); if (value && -- cgit From 0b847a19d96b66baeb651317d5e22f8bd4368975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Dec 2018 12:58:56 -0500 Subject: bcachefs: Lots of option handling improvements Add helptext to option definitions - so we can unify the option handling with the format command Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/opts.c | 47 ++++-- fs/bcachefs/opts.h | 386 +++++++++++++++++++++++++++++++----------------- fs/bcachefs/rebalance.c | 10 +- fs/bcachefs/sysfs.c | 9 +- fs/bcachefs/tests.c | 4 +- fs/bcachefs/util.c | 27 ++-- fs/bcachefs/util.h | 3 +- 9 files changed, 310 insertions(+), 182 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 92a0ecd8fbc3..244b808688b3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -230,13 +230,13 @@ #define bch_verbose(c, fmt, ...) \ do { \ - if ((c)->opts.verbose_recovery) \ + if ((c)->opts.verbose) \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) #define pr_verbose_init(opts, fmt, ...) \ do { \ - if (opt_get(opts, verbose_init)) \ + if (opt_get(opts, verbose)) \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d23a82d94c5e..02c7543e40c8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1690,7 +1690,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (opt->mode < OPT_MOUNT) + if (!(opt->mode & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 17245e0b4a73..13a9a2fcd575 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -75,22 +75,22 @@ const char * const bch2_dev_state[] = { void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ if (opt_defined(src, _name)) \ opt_set(*dst, _name, src._name); BCH_OPTS() -#undef BCH_OPT +#undef x } bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) 
\ case Opt_##_name: \ return opt_defined(*opts, _name); BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -99,11 +99,11 @@ bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ case Opt_##_name: \ return opts->_name; BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -112,12 +112,12 @@ u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ case Opt_##_name: \ opt_set(*opts, _name, v); \ break; BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -131,11 +131,11 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) { struct bch_opts opts = bch2_opts_empty(); -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ if (_sb_opt != NO_SB_OPT) \ opt_set(opts, _name, _sb_opt(sb)); BCH_OPTS() -#undef BCH_OPT +#undef x return opts; } @@ -143,24 +143,27 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max +#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ .to_text = _fn##_to_text -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ - .mode = _mode == OPT_RUNTIME ? 0644 : 0444, \ + .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ }, \ .mode = _mode, \ + .hint = _hint, \ + .help = _help, \ .set_sb = SET_##_sb_opt, \ _type \ }, BCH_OPTS() -#undef BCH_OPT +#undef x }; int bch2_opt_lookup(const char *name) @@ -216,6 +219,19 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, if (ret < 0) return ret; + if (*res < opt->min || *res >= opt->max) + return -ERANGE; + break; + case BCH_OPT_SECTORS: + ret = bch2_strtou64_h(val, res); + if (ret < 0) + return ret; + + if (*res & 511) + return -EINVAL; + + *res >>= 9; + if (*res < opt->min || *res >= opt->max) return -ERANGE; break; @@ -256,6 +272,9 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, case BCH_OPT_UINT: pr_buf(out, "%lli", v); break; + case BCH_OPT_SECTORS: + bch2_hprint(out, v); + break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) bch2_string_opt_to_text(out, opt->choices, v); @@ -345,7 +364,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) goto no_val; } - if (bch2_opt_table[id].mode < OPT_MOUNT) + if (!(bch2_opt_table[id].mode & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index faa2a72c8c3b..f4cb0625c3cc 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -32,22 +32,25 @@ extern const char * const bch2_dev_state[]; /* dummy option, for options that aren't stored in the superblock */ LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); +/* When can be set: */ enum opt_mode { - OPT_INTERNAL, - OPT_FORMAT, - OPT_MOUNT, - OPT_RUNTIME, + OPT_FORMAT = (1 << 0), + OPT_MOUNT = (1 << 1), + OPT_RUNTIME = (1 << 2), + OPT_INODE = (1 << 3), + OPT_DEVICE = (1 << 4), }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, + BCH_OPT_SECTORS, BCH_OPT_STR, BCH_OPT_FN, }; /** - * BCH_OPT(name, type, in mem type, mode, sb_opt) + * x(name, shortopt, type, in mem type, mode, sb_opt) * * @name - name of mount option, sysfs attribute, and struct bch_opts * member @@ -66,150 +69,252 @@ enum opt_type { */ #define BCH_OPTS() \ - BCH_OPT(block_size, u16, OPT_FORMAT, \ - OPT_UINT(1, 128), \ - BCH_SB_BLOCK_SIZE, 8) \ - BCH_OPT(btree_node_size, u16, OPT_FORMAT, \ - OPT_UINT(1, 128), \ - BCH_SB_BTREE_NODE_SIZE, 512) \ - BCH_OPT(errors, u8, OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \ - BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_WANT, 1) \ - BCH_OPT(data_replicas, u8, OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_WANT, 1) \ - BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_REQ, 1) \ - BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_REQ, 1) \ - BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ - BCH_OPT(data_checksum, u8, OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ - BCH_OPT(compression, u8, OPT_RUNTIME, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\ - BCH_OPT(background_compression, u8, OPT_RUNTIME, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\ - BCH_OPT(str_hash, u8, OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \ - BCH_OPT(foreground_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_FOREGROUND_TARGET, 0) \ - 
BCH_OPT(background_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_BACKGROUND_TARGET, 0) \ - BCH_OPT(promote_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_PROMOTE_TARGET, 0) \ - BCH_OPT(erasure_code, u16, OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_ERASURE_CODE, false) \ - BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, false) \ - BCH_OPT(gc_reserve_percent, u8, OPT_RUNTIME, \ - OPT_UINT(5, 21), \ - BCH_SB_GC_RESERVE, 8) \ - BCH_OPT(gc_reserve_bytes, u64, OPT_RUNTIME, \ - OPT_UINT(0, U64_MAX), \ - BCH_SB_GC_RESERVE_BYTES, 0) \ - BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \ - OPT_UINT(0, 100), \ - BCH_SB_ROOT_RESERVE, 0) \ - BCH_OPT(wide_macs, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_128_BIT_MACS, false) \ - BCH_OPT(acl, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_POSIX_ACL, true) \ - BCH_OPT(usrquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_USRQUOTA, false) \ - BCH_OPT(grpquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false) \ - BCH_OPT(prjquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false) \ - BCH_OPT(degraded, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(discard, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(verbose_init, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(fsck, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, true) \ - BCH_OPT(fix_errors, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(nochanges, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(noreplay, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(norecovery, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(noexcl, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(sb, u64, OPT_MOUNT, \ - OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, BCH_SB_SECTOR) \ - BCH_OPT(read_only, u8, OPT_INTERNAL, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(nostart, u8, OPT_INTERNAL, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(no_data_io, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(version_upgrade, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(project, u8, OPT_INTERNAL, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ + x(block_size, u16, \ + OPT_FORMAT, \ + OPT_SECTORS(1, 128), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ + x(btree_node_size, u16, \ + OPT_FORMAT, \ + OPT_SECTORS(1, 128), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(metadata_checksum, u8, \ + 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_types), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH, \ + NULL, "Hash function for directory entries and xattrs")\ + x(foreground_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or disk group for foreground writes") \ + x(background_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or disk group to move data to in the background")\ + x(promote_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or disk group to promote data to on read")\ + x(erasure_code, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, false, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(gc_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_SECTORS(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(acl, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(discard, u8, \ + OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_disabled, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Disable 
journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ + x(fsck, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, true, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(nochanges, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(noreplay, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't replay the journal (only for internal tools)")\ + x(norecovery, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ + x(noexcl, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ + NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(version_upgrade, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Skip submit_bio() for data reads and writes, " \ + "for performance testing purposes") \ + x(fs_size, u64, \ + OPT_DEVICE, \ + OPT_SECTORS(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ + OPT_SECTORS(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ + NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + struct bch_opts { -#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1; +#define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() -#undef BCH_OPT +#undef x -#define BCH_OPT(_name, _bits, ...) _bits _name; +#define x(_name, _bits, ...) _bits _name; BCH_OPTS() -#undef BCH_OPT +#undef x }; static const struct bch_opts bch2_opts_default = { -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ BCH_OPTS() -#undef BCH_OPT +#undef x }; #define opt_defined(_opts, _name) ((_opts)._name##_defined) @@ -231,9 +336,9 @@ static inline struct bch_opts bch2_opts_empty(void) void bch2_opts_apply(struct bch_opts *, struct bch_opts); enum bch_opt_id { -#define BCH_OPT(_name, ...) Opt_##_name, +#define x(_name, ...) 
Opt_##_name, BCH_OPTS() -#undef BCH_OPT +#undef x bch2_opts_nr }; @@ -259,6 +364,9 @@ struct bch_option { }; }; + const char *hint; + const char *help; + }; extern const struct bch_option bch2_opt_table[]; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index eec74d4a5712..cc1a7deb90bc 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -263,13 +263,13 @@ ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; - bch2_hprint(h1, w.dev_most_full_work << 9); - bch2_hprint(h2, w.dev_most_full_capacity << 9); + bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); + bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", w.dev_most_full_idx, h1, h2); - bch2_hprint(h1, w.total_work << 9); - bch2_hprint(h2, c->capacity << 9); + bch2_hprint(&PBUF(h1), w.total_work << 9); + bch2_hprint(&PBUF(h2), c->capacity << 9); pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); @@ -279,7 +279,7 @@ ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) pr_buf(&out, "waiting\n"); break; case REBALANCE_THROTTLED: - bch2_hprint(h1, + bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); pr_buf(&out, "throttled for %lu sec or %s io\n", diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a423159b6ed5..b59b7a5a4cbb 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -73,9 +73,10 @@ do { \ #define sysfs_hprint(file, val) \ do { \ if (attr == &sysfs_ ## file) { \ - ssize_t ret = bch2_hprint(buf, val); \ - strcat(buf, "\n"); \ - return ret + 1; \ + struct printbuf out = _PBUF(buf, PAGE_SIZE); \ + bch2_hprint(&out, val); \ + pr_buf(&out, "\n"); \ + return out.pos - buf; \ } \ } while (0) @@ -658,7 +659,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (i->mode == OPT_INTERNAL) + if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) continue; ret = sysfs_create_file(kobj, &i->attr); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index dc8abce94ff0..bcbe782260f0 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -619,8 +619,8 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, time = j.finish - j.start; scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - bch2_hprint(nr_buf, nr); - bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time); + bch2_hprint(&PBUF(nr_buf), nr); + bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", name_buf, nr_buf, nr_threads, time / NSEC_PER_SEC, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index bb6b4383d33f..8931aa6a1e2a 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -100,10 +100,10 @@ STRTO_H(strtoint, int) STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) -ssize_t bch2_hprint(char *buf, s64 v) +void bch2_hprint(struct printbuf *buf, s64 v) { - char dec[4] = ""; int u, t = 0; for (u = 0; v >= 1024 || v <= -1024; u++) { @@ -111,17 +111,16 @@ ssize_t bch2_hprint(char *buf, s64 v) v >>= 10; } - if (!u) - return sprintf(buf, "%lli", v); + pr_buf(buf, "%lli", v); /* * 103 is magic: t is in the range [-1023, 1023] and we want * to turn it into [-9, 9] */ - if (v < 100 && v > -100) - scnprintf(dec, sizeof(dec), ".%i", t / 103); - - 
return sprintf(buf, "%lli%s%c", v, dec, si_units[u]); + if (u && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) + pr_buf(buf, "%c", si_units[u]); } void bch2_string_opt_to_text(struct printbuf *out, @@ -483,12 +482,12 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) char change[21]; s64 next_io; - bch2_hprint(rate, pd->rate.rate); - bch2_hprint(actual, pd->last_actual); - bch2_hprint(target, pd->last_target); - bch2_hprint(proportional, pd->last_proportional); - bch2_hprint(derivative, pd->last_derivative); - bch2_hprint(change, pd->last_change); + bch2_hprint(&PBUF(rate), pd->rate.rate); + bch2_hprint(&PBUF(actual), pd->last_actual); + bch2_hprint(&PBUF(target), pd->last_target); + bch2_hprint(&PBUF(proportional), pd->last_proportional); + bch2_hprint(&PBUF(derivative), pd->last_derivative); + bch2_hprint(&PBUF(change), pd->last_change); next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 47afd3955c7a..7d1e6cc6afda 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -265,6 +265,7 @@ int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); int bch2_strtoull_h(const char *, unsigned long long *); +int bch2_strtou64_h(const char *, u64 *); static inline int bch2_strtol_h(const char *cp, long *res) { @@ -333,7 +334,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res) : type_is(var, char *) ? "%s\n" \ : "%i\n", var) -ssize_t bch2_hprint(char *buf, s64 v); +void bch2_hprint(struct printbuf *, s64); bool bch2_is_zero(const void *, size_t); -- cgit From e19e57f8a1e5ebe1fe78dc0dff5120b4752ee8b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Dec 2018 16:01:38 -0500 Subject: bcachefs: fix new reinherit_attrs ioctl Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4925a127a335..b00d25b18ed4 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -184,7 +184,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, if (unlikely(ret < 0)) goto err1; - qstr.hash_len = ret; + qstr.len = ret; qstr.name = kname; ret = -ENOENT; -- cgit From e47c017144fe302dfba09ba9f6629ed5dca6d77f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 08:29:44 -0500 Subject: bcachefs: Minor replicas.c refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index a694b0fcd6a1..b63da1bef760 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -4,7 +4,7 @@ #include "replicas.h" #include "super-io.h" -struct bch_replicas_entry_padded { +struct bch_replicas_padded { struct bch_replicas_entry e; u8 pad[BCH_SB_MEMBERS_MAX]; }; @@ -270,7 +270,7 @@ int bch2_mark_replicas(struct bch_fs *c, enum bch_data_type data_type, struct bch_devs_list devs) { - struct bch_replicas_entry_padded search; + struct bch_replicas_padded search; if (!devs.nr) return 0; @@ -286,7 +286,7 @@ int bch2_mark_replicas(struct bch_fs *c, int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { - struct bch_replicas_entry_padded search; + struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; int ret; @@ -682,7 +682,7 @@ bool 
bch2_replicas_marked(struct bch_fs *c, struct bch_devs_list devs, bool check_gc_replicas) { - struct bch_replicas_entry_padded search; + struct bch_replicas_padded search; if (!devs.nr) return true; @@ -698,7 +698,7 @@ bool bch2_bkey_replicas_marked(struct bch_fs *c, struct bkey_s_c k, bool check_gc_replicas) { - struct bch_replicas_entry_padded search; + struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; -- cgit From 23f80d2b3bf7898579c841786c49842789f32ff5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 08:44:56 -0500 Subject: bcachefs: Factor out acc_u64s() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 107 ++++++++++++++++++++----------------------- fs/bcachefs/buckets.c | 122 ++++++------------------------------------------- fs/bcachefs/buckets.h | 2 - fs/bcachefs/util.h | 17 +++++++ 4 files changed, 79 insertions(+), 169 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c353fbbed975..466469a0d852 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -482,29 +482,35 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } -static void fs_usage_reset(struct bch_fs_usage *fs_usage) +/* + * Accumulate percpu counters onto one cpu's copy - only valid when access + * against any percpu counter is guarded against + */ +static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr) { - unsigned offset = offsetof(typeof(*fs_usage), s.gc_start); + u64 *ret; + int cpu; - memset((void *) fs_usage + offset, 0, - sizeof(*fs_usage) - offset); -} + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); -static void fs_usage_cpy(struct bch_fs_usage *dst, - struct bch_fs_usage *src) -{ - unsigned offset = offsetof(typeof(*dst), s.gc_start); + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); - memcpy((void *) dst + offset, - (void *) src + offset, - sizeof(*dst) - offset); + if (i != ret) { + acc_u64s(ret, i, nr); + memset(i, 0, nr * sizeof(u64)); + } + } + + return ret; } static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; unsigned i; - int cpu; { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); @@ -534,42 +540,39 @@ static void bch2_gc_done_nocheck(struct bch_fs *c) }; for_each_member_device(ca, c, i) { - struct bch_dev_usage *p; - - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(ca->usage[0], cpu); - memset(p, 0, sizeof(*p)); - } + unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); + struct bch_dev_usage *dst = (void *) + acc_percpu_u64s((void *) ca->usage[0], nr); + struct bch_dev_usage *src = (void *) + acc_percpu_u64s((void *) ca->usage[1], nr); - preempt_disable(); - *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1); - preempt_enable(); + *dst = *src; } { - struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); - - for_each_possible_cpu(cpu) - fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); - - preempt_disable(); - fs_usage_cpy(this_cpu_ptr(c->usage[0]), &src); - preempt_enable(); + unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64); + struct bch_fs_usage *dst = (void *) + acc_percpu_u64s((void *) c->usage[0], nr); + struct bch_fs_usage *src = (void *) + acc_percpu_u64s((void *) c->usage[1], nr); + unsigned offset = offsetof(typeof(*dst), s.gc_start); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(*dst) - offset); } - } static void bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_dev *ca; unsigned i; - int cpu; #define 
copy_field(_f, _msg, ...) \ - if (dst._f != src._f) { \ - bch_err(c, _msg ": got %llu, should be %llu, fixing"\ - , ##__VA_ARGS__, dst._f, src._f); \ - dst._f = src._f; \ + if (dst->_f != src->_f) { \ + bch_err(c, _msg ": got %llu, should be %llu, fixing" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -650,9 +653,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) }; for_each_member_device(ca, c, i) { - struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0); - struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1); - struct bch_dev_usage *p; + unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); + struct bch_dev_usage *dst = (void *) + acc_percpu_u64s((void *) ca->usage[0], nr); + struct bch_dev_usage *src = (void *) + acc_percpu_u64s((void *) ca->usage[1], nr); unsigned b; for (b = 0; b < BCH_DATA_NR; b++) @@ -666,21 +671,14 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) "sectors[%s]", bch2_data_types[b]); copy_dev_field(sectors_fragmented, "sectors_fragmented"); - - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(ca->usage[0], cpu); - memset(p, 0, sizeof(*p)); - } - - preempt_disable(); - p = this_cpu_ptr(ca->usage[0]); - *p = dst; - preempt_enable(); } { - struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); - struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64); + struct bch_fs_usage *dst = (void *) + acc_percpu_u64s((void *) c->usage[0], nr); + struct bch_fs_usage *src = (void *) + acc_percpu_u64s((void *) c->usage[1], nr); unsigned r, b; copy_fs_field(s.hidden, "hidden"); @@ -703,13 +701,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) for (b = 0; b < BCH_DATA_NR; b++) copy_fs_field(buckets[b], "buckets[%s]", bch2_data_types[b]); - - for_each_possible_cpu(cpu) - fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); - - preempt_disable(); - fs_usage_cpy(this_cpu_ptr(c->usage[0]), &dst); - preempt_enable(); } out: percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 16aafe8502a0..6501dcf12d59 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -78,77 +78,6 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -#ifdef DEBUG_BUCKETS - -#define lg_local_lock lg_global_lock -#define lg_local_unlock lg_global_unlock - -static void bch2_fs_stats_verify(struct bch_fs *c) -{ - struct bch_fs_usage stats =_bch2_fs_usage_read(c); - unsigned i, j; - - for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { - for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++) - if ((s64) stats.replicas[i].data[j] < 0) - panic("replicas %u %s sectors underflow: %lli\n", - i + 1, bch_data_types[j], - stats.replicas[i].data[j]); - - if ((s64) stats.replicas[i].persistent_reserved < 0) - panic("replicas %u reserved underflow: %lli\n", - i + 1, stats.replicas[i].persistent_reserved); - } - - for (j = 0; j < ARRAY_SIZE(stats.buckets); j++) - if ((s64) stats.replicas[i].data_buckets[j] < 0) - panic("%s buckets underflow: %lli\n", - bch_data_types[j], - stats.buckets[j]); - - if ((s64) stats.s.online_reserved < 0) - panic("sectors_online_reserved underflow: %lli\n", - stats.s.online_reserved); -} - -static void bch2_dev_stats_verify(struct bch_dev *ca) -{ - struct bch_dev_usage stats = - __bch2_dev_usage_read(ca); - u64 n = ca->mi.nbuckets - ca->mi.first_bucket; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(stats.buckets); i++) - 
BUG_ON(stats.buckets[i] > n); - BUG_ON(stats.buckets_alloc > n); - BUG_ON(stats.buckets_unavailable > n); -} - -static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) -{ - if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) { - u64 used = __bch2_fs_sectors_used(c); - u64 cached = 0; - u64 avail = atomic64_read(&c->sectors_available); - int cpu; - - for_each_possible_cpu(cpu) - cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache; - - if (used + avail + cached > c->capacity) - panic("used %llu avail %llu cached %llu capacity %llu\n", - used, avail, cached, c->capacity); - } -} - -#else - -static void bch2_fs_stats_verify(struct bch_fs *c) {} -static void bch2_dev_stats_verify(struct bch_dev *ca) {} -static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} - -#endif - /* * Clear journal_seq_valid for buckets for which it's not needed, to prevent * wraparound: @@ -186,43 +115,23 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) } } -#define bch2_usage_add(_acc, _stats) \ -do { \ - typeof(_acc) _a = (_acc), _s = (_stats); \ - unsigned i; \ - \ - for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \ - ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \ -} while (0) - #define bch2_usage_read_raw(_stats) \ ({ \ typeof(*this_cpu_ptr(_stats)) _acc; \ - int cpu; \ \ memset(&_acc, 0, sizeof(_acc)); \ - \ - for_each_possible_cpu(cpu) \ - bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ + acc_u64s_percpu((u64 *) &_acc, \ + (u64 __percpu *) _stats, \ + sizeof(_acc) / sizeof(u64)); \ \ _acc; \ }) -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc) -{ - return bch2_usage_read_raw(ca->usage[gc]); -} - struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { return bch2_usage_read_raw(ca->usage[0]); } -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc) -{ - return bch2_usage_read_raw(c->usage[gc]); -} - struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) { return bch2_usage_read_raw(c->usage[0]); @@ -326,12 +235,16 @@ void bch2_fs_usage_apply(struct bch_fs *c, } preempt_disable(); - bch2_usage_add(this_cpu_ptr(c->usage[0]), fs_usage); - - if (gc_visited(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage[1]), fs_usage); - - bch2_fs_stats_verify(c); + acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), + (u64 *) fs_usage, + sizeof(*fs_usage) / sizeof(u64)); + + if (gc_visited(c, gc_pos)) { + BUG_ON(!c->usage[1]); + acc_u64s((u64 *) this_cpu_ptr(c->usage[1]), + (u64 *) fs_usage, + sizeof(*fs_usage) / sizeof(u64)); + } preempt_enable(); memset(fs_usage, 0, sizeof(*fs_usage)); @@ -392,8 +305,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); - - bch2_dev_stats_verify(ca); } void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) @@ -1011,8 +922,6 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->mark_lock); this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors); - - bch2_fs_stats_verify(c); percpu_up_read(&c->mark_lock); res->sectors = 0; @@ -1055,8 +964,6 @@ out: this_cpu_add(c->usage[0]->s.online_reserved, sectors); res->sectors += sectors; - bch2_disk_reservations_verify(c, flags); - bch2_fs_stats_verify(c); preempt_enable(); percpu_up_read(&c->mark_lock); return 0; @@ -1089,14 +996,11 @@ recalculate: this_cpu_add(c->usage[0]->s.online_reserved, sectors); res->sectors += sectors; ret = 0; - - bch2_disk_reservations_verify(c, flags); } else { 
atomic64_set(&c->sectors_available, sectors_available); ret = -ENOSPC; } - bch2_fs_stats_verify(c); percpu_up_write(&c->mark_lock); if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3db0e3b8a180..88e083325232 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -180,7 +180,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, /* Device usage: */ -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, @@ -219,7 +218,6 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7d1e6cc6afda..094c35f40209 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -700,4 +700,21 @@ do { \ } \ } while (0) +static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) + acc[i] += src[i]; +} + +static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, + unsigned nr) +{ + int cpu; + + for_each_possible_cpu(cpu) + acc_u64s(acc, per_cpu_ptr(src, cpu), nr); +} + #endif /* _BCACHEFS_UTIL_H */ -- cgit From 04c2c34f00e1dacae2d329764b912e4a560efbb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Jan 2019 05:11:07 -0500 Subject: bcachefs: use crc64 from lib/ Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 1 + fs/bcachefs/checksum.c | 122 ------------------------------------------------- fs/bcachefs/checksum.h | 6 ++- 3 files changed, 6 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 2f8300b60807..eccf643e9081 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -5,6 +5,7 @@ config BCACHEFS_FS select EXPORTFS select CLOSURES select LIBCRC32C + select CRC64 select FS_POSIX_ACL select LZ4_COMPRESS select LZ4_DECOMPRESS diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3733cbfa1c91..98dc39de1e73 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -16,128 +16,6 @@ #include #include -/* - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any - * use permitted, subject to terms of PostgreSQL license; see.) - - * If we have a 64-bit integer type, then a 64-bit CRC looks just like the - * usual sort of implementation. (See Ross Williams' excellent introduction - * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from - * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) - * If we have no working 64-bit type, then fake it with two 32-bit registers. - * - * The present implementation is a normal (not "reflected", in Williams' - * terms) 64-bit CRC, using initial all-ones register contents and a final - * bit inversion. 
The chosen polynomial is borrowed from the DLT1 spec - * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): - * - * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x + 1 -*/ - -static const u64 crc_table[256] = { - 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, - 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, - 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, - 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, - 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, - 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, - 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, - 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, - 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, - 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, - 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, - 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, - 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, - 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, - 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, - 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, - 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, - 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, - 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, - 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, - 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, - 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, - 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, - 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, - 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, - 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, - 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, - 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, - 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, - 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, - 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, - 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, - 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, - 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, - 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, - 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, - 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, - 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, - 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, - 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, - 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, - 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, - 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, - 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, - 
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, - 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, - 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, - 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, - 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, - 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, - 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, - 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, - 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, - 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, - 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, - 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, - 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, - 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, - 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, - 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, - 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, - 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, - 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, - 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, - 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, - 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, - 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, - 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, - 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, - 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, - 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, - 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, - 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, - 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, - 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, - 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, - 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, - 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, - 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, - 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, - 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, - 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, - 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, - 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, - 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, - 0x9AFCE626CE85B507ULL, -}; - -u64 bch2_crc64_update(u64 crc, const void *_data, size_t len) -{ - const unsigned char *data = _data; - - while (len--) { - int i = ((int) (crc >> 56) ^ *data++) & 0xFF; - crc = crc_table[i] ^ (crc << 8); - } - - return crc; -} - static u64 bch2_checksum_init(unsigned type) { switch (type) { diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 42c86466293e..e2f2d797f90c 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -6,9 +6,13 @@ #include "extents_types.h" #include "super-io.h" +#include #include -u64 bch2_crc64_update(u64, const void *, size_t); +static inline u64 
bch2_crc64_update(u64 crc, const void *p, size_t len) +{ + return crc64_be(crc, p, len); +} #define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) #define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -- cgit From b8adb833652909221efde19b1813627382b5bf51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Jan 2019 23:23:27 -0500 Subject: bcachefs: correctly initialize bch_extent_ptr Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 36aa7a5f2806..9eae8c29e3c4 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -289,6 +289,7 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->ptr = (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, .gen = buckets->b[bucket].mark.gen, .offset = bucket_to_sector(ca, bucket), .dev = ca->dev_idx, -- cgit From d0cc3defba58889e38eaa0c275d4728b4ac3b8c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Jan 2019 16:02:22 -0500 Subject: bcachefs: More allocator startup improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 131 ++++++++++++++++++++---------------- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/btree_cache.c | 4 ++ fs/bcachefs/btree_io.c | 12 ++-- fs/bcachefs/btree_io.h | 53 +++++++-------- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.h | 1 - fs/bcachefs/btree_update_interior.c | 13 ++++ fs/bcachefs/buckets.c | 2 +- fs/bcachefs/util.c | 3 - 10 files changed, 120 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9c9464efd333..871a41b923da 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -347,12 +347,14 @@ err: return ret; } -int bch2_alloc_write(struct bch_fs *c) +int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) { struct bch_dev *ca; unsigned i; int ret = 0; + *wrote = false; + for_each_rw_member(ca, c, i) { struct btree_iter iter; struct bucket_array *buckets; @@ -370,9 +372,14 @@ int bch2_alloc_write(struct bch_fs *c) if (!buckets->b[b].mark.dirty) continue; - ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0); + ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, + nowait + ? 
BTREE_INSERT_NOWAIT + : 0); if (ret) break; + + *wrote = true; } up_read(&ca->bucket_lock); bch2_btree_iter_unlock(&iter); @@ -1270,20 +1277,23 @@ static void flush_held_btree_writes(struct bch_fs *c) struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; - bool flush_updates; - size_t i, nr_pending_updates; + bool nodes_blocked; + size_t i; + struct closure cl; + + closure_init_stack(&cl); clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); again: pr_debug("flushing dirty btree nodes"); cond_resched(); + closure_wait(&c->btree_interior_update_wait, &cl); - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + nodes_blocked = false; rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_need_write(b)) { if (btree_node_may_write(b)) { rcu_read_unlock(); btree_node_lock_type(c, b, SIX_LOCK_read); @@ -1291,7 +1301,7 @@ again: six_unlock_read(&b->lock); goto again; } else { - flush_updates = true; + nodes_blocked = true; } } rcu_read_unlock(); @@ -1299,17 +1309,16 @@ again: if (c->btree_roots_dirty) bch2_journal_meta(&c->journal); - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... - */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); + if (nodes_blocked) { + closure_sync(&cl); goto again; } + closure_wake_up(&c->btree_interior_update_wait); + closure_sync(&cl); + + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); } static void allocator_start_issue_discards(struct bch_fs *c) @@ -1331,13 +1340,10 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) unsigned dev_iter; u64 journal_seq = 0; long bu; - bool invalidating_data = false; int ret = 0; - if (test_alloc_startup(c)) { - invalidating_data = true; + if (test_alloc_startup(c)) goto not_enough; - } /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { @@ -1384,21 +1390,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) not_enough: pr_debug("not enough empty buckets; scanning for reclaimable buckets"); - for_each_rw_member(ca, c, dev_iter) { - find_reclaimable_buckets(c, ca); - - while (!fifo_full(&ca->free[RESERVE_BTREE]) && - (bu = next_alloc_bucket(ca)) >= 0) { - invalidating_data |= - bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); - - fifo_push(&ca->free[RESERVE_BTREE], bu); - bucket_set_dirty(ca, bu); - } - } - - pr_debug("done scanning for reclaimable buckets"); - /* * We're moving buckets to freelists _before_ they've been marked as * invalidated on disk - we have to so that we can allocate new btree @@ -1408,38 +1399,59 @@ not_enough: * have cached data in them, which is live until they're marked as * invalidated on disk: */ - if (invalidating_data) { - pr_debug("invalidating existing data"); - set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - } else { - pr_debug("issuing discards"); - allocator_start_issue_discards(c); - } + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - /* - * XXX: it's possible for this to deadlock waiting on journal reclaim, - * since we're holding btree writes. What then? 
- */ - ret = bch2_alloc_write(c); - if (ret) - return ret; + while (1) { + bool wrote = false; - if (invalidating_data) { - pr_debug("flushing journal"); + for_each_rw_member(ca, c, dev_iter) { + find_reclaimable_buckets(c, ca); - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) - return ret; + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + bch2_invalidate_one_bucket(c, ca, bu, + &journal_seq); + + fifo_push(&ca->free[RESERVE_BTREE], bu); + bucket_set_dirty(ca, bu); + } + } + + pr_debug("done scanning for reclaimable buckets"); + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? + */ + ret = bch2_alloc_write(c, true, &wrote); - pr_debug("issuing discards"); - allocator_start_issue_discards(c); + /* + * If bch2_alloc_write() did anything, it may have used some + * buckets, and we need the RESERVE_BTREE freelist full - so we + * need to loop and scan again. + * And if it errored, it may have been because there weren't + * enough buckets, so just scan and loop again as long as it + * made some progress: + */ + if (!wrote && ret) + return ret; + if (!wrote && !ret) + break; } + pr_debug("flushing journal"); + + ret = bch2_journal_flush(&c->journal); + if (ret) + return ret; + + pr_debug("issuing discards"); + allocator_start_issue_discards(c); + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); /* now flush dirty btree nodes: */ - if (invalidating_data) - flush_held_btree_writes(c); + flush_held_btree_writes(c); return 0; } @@ -1448,6 +1460,7 @@ int bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; + bool wrote; int ret; down_read(&c->gc_lock); @@ -1465,7 +1478,7 @@ int bch2_fs_allocator_start(struct bch_fs *c) } } - return bch2_alloc_write(c); + return bch2_alloc_write(c, false, &wrote); } void bch2_fs_allocator_background_init(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8ced4e845281..ef5ec659b05d 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -55,7 +55,7 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *); +int bch2_alloc_write(struct bch_fs *, bool, bool *); int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index b748afc778f4..65fc82fba071 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -171,6 +171,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!btree_node_may_write(b)) goto out_unlock; + if (btree_node_dirty(b) && + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + goto out_unlock; + if (btree_node_dirty(b) || btree_node_write_in_flight(b) || btree_node_read_in_flight(b)) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f205bddd814d..6f1b1e4317a0 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1330,8 +1330,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!(old & (1 << BTREE_NODE_dirty))) return; - if (b->written && - !btree_node_may_write(b)) + if (!btree_node_may_write(b)) return; if (old & (1 << BTREE_NODE_write_in_flight)) { @@ -1347,7 +1346,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, } while (cmpxchg_acquire(&b->flags, 
old, new) != old); BUG_ON(btree_node_fake(b)); - BUG_ON(!list_empty(&b->write_blocked)); BUG_ON((b->will_make_reachable != 0) != !b->written); BUG_ON(b->written >= c->opts.btree_node_size); @@ -1685,15 +1683,13 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) unsigned long flags = READ_ONCE(b->flags); unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; - if (//!(flags & (1 << BTREE_NODE_dirty)) && - !b->writes[0].wait.list.first && - !b->writes[1].wait.list.first && - !(b->will_make_reachable & 1)) + if (!(flags & (1 << BTREE_NODE_dirty))) continue; - pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n", + pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", b, (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, b->level, b->written, !list_empty_careful(&b->write_blocked), diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 9c5a6f9471bd..c817aeed878a 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_IO_H #include "bset.h" +#include "btree_locking.h" #include "extents.h" #include "io_types.h" @@ -48,7 +49,7 @@ static inline void btree_node_wait_on_io(struct btree *b) static inline bool btree_node_may_write(struct btree *b) { return list_empty_careful(&b->write_blocked) && - !b->will_make_reachable; + (!b->written || !b->will_make_reachable); } enum compact_mode { @@ -100,42 +101,36 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type); -/* - * btree_node_dirty() can be cleared with only a read lock, - * and for bch2_btree_node_write_cond() we want to set need_write iff it's - * still dirty: - */ -static inline void set_btree_node_need_write_if_dirty(struct btree *b) +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) { - unsigned long old, new, v = READ_ONCE(b->flags); - - do { - old = new = v; - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - new |= (1 << BTREE_NODE_need_write); - } while ((v = cmpxchg(&b->flags, old, new)) != old); + while (b->written && + btree_node_need_write(b) && + btree_node_may_write(b)) { + if (!btree_node_write_in_flight(b)) { + bch2_btree_node_write(c, b, SIX_LOCK_read); + break; + } + + six_unlock_read(&b->lock); + btree_node_wait_on_io(b); + btree_node_lock_type(c, b, SIX_LOCK_read); + } } #define bch2_btree_node_write_cond(_c, _b, cond) \ do { \ - while ((_b)->written && btree_node_dirty(_b) && (cond)) { \ - if (!btree_node_may_write(_b)) { \ - set_btree_node_need_write_if_dirty(_b); \ - break; \ - } \ + unsigned long old, new, v = READ_ONCE((_b)->flags); \ + \ + do { \ + old = new = v; \ \ - if (!btree_node_write_in_flight(_b)) { \ - bch2_btree_node_write(_c, _b, SIX_LOCK_read); \ + if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ break; \ - } \ \ - six_unlock_read(&(_b)->lock); \ - btree_node_wait_on_io(_b); \ - btree_node_lock_type(c, b, SIX_LOCK_read); \ - } \ + new |= (1 << BTREE_NODE_need_write); \ + } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ + \ + btree_node_write_if_need(_c, _b); \ } while (0) void bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 912292dad6e5..52e0e003153b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_ITER_H #define _BCACHEFS_BTREE_ITER_H +#include "bset.h" #include "btree_types.h" static inline void 
btree_iter_set_dirty(struct btree_iter *iter, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3871e14e480d..48b50e066186 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -11,7 +11,6 @@ */ #include "btree_iter.h" -#include "btree_io.h" #include "six.h" /* matches six lock types */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a314bda544dd..2efe191cdc30 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -367,6 +367,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev set_btree_node_accessed(b); set_btree_node_dirty(b); + set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); memset(&b->nr, 0, sizeof(b->nr)); @@ -655,6 +656,12 @@ retry: closure_wait(&btree_current_write(b)->wait, cl); list_del(&as->write_blocked_list); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); mutex_unlock(&c->btree_interior_update_lock); /* @@ -958,6 +965,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { list_del(&p->write_blocked_list); btree_update_reparent(as, p); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); } clear_btree_node_dirty(b); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6501dcf12d59..34e5f81b2b5e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1038,7 +1038,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), - btree_reserve); + btree_reserve * 2); bool resize = ca->buckets[0] != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 8931aa6a1e2a..d998e51dbc30 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -25,9 +25,6 @@ #include "eytzinger.h" #include "util.h" -#define simple_strtoint(c, end, base) simple_strtol(c, end, base) -#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) - static const char si_units[] = "?kMGTPEZY"; static int __bch2_strtoh(const char *cp, u64 *res, -- cgit From ed4840308c9bd3f70e395bd6e37a410a96dcd883 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Jan 2019 21:36:14 -0500 Subject: bcachefs: Fix a dio bug Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ad06db069fcf..c9a136797aa7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -57,7 +57,7 @@ struct bch_writepage_io { struct dio_write { struct closure cl; struct kiocb *req; - struct task_struct *task; + struct mm_struct *mm; unsigned loop:1, sync:1, free_iov:1; @@ -1775,6 +1775,7 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = dio->iop.inode; @@ -1797,16 +1798,16 @@ static long 
bch2_dio_write_loop(struct dio_write *dio) goto err; while (1) { - if (current != dio->task) - kthread_use_mm(dio->task->mm); + if (kthread) + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); current->faults_disabled_mapping = NULL; - if (current != dio->task) - kthread_unuse_mm(dio->task->mm); + if (kthread) + kthread_unuse_mm(dio->mm); if (unlikely(ret < 0)) goto err; @@ -1906,7 +1907,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio = container_of(bio, struct dio_write, iop.op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; - dio->task = current; + dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || offset + iter->count > inode->v.i_size; @@ -1914,7 +1915,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->quota_res.sectors = 0; dio->iter = *iter; bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); - dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); + dio->iop.op.write_point = writepoint_hashed((unsigned long) current); dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; if ((req->ki_flags & IOCB_DSYNC) && -- cgit From 000de45996c4b0f9a7b4f2830c11ed584f473257 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jan 2019 00:38:47 -0500 Subject: bcachefs: fixes for getting stuck flushing journal pins Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 43 +++++++++++++++++++------------------------ fs/bcachefs/util.h | 1 + 2 files changed, 20 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 770a6e0c7d97..eaf77c77b2c4 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -65,6 +65,9 @@ static inline void __journal_pin_drop(struct journal *j, if (atomic_dec_and_test(&pin_list->count) && pin_list == &fifo_peek_front(&j->pin)) bch2_journal_reclaim_fast(j); + else if (fifo_used(&j->pin) == 1 && + atomic_read(&pin_list->count) == 1) + journal_wake(j); } void bch2_journal_pin_drop(struct journal *j, @@ -337,56 +340,48 @@ void bch2_journal_reclaim_work(struct work_struct *work) msecs_to_jiffies(j->reclaim_delay_ms)); } -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - struct journal_entry_pin **pin, - u64 *pin_seq) +static int journal_flush_done(struct journal *j, u64 seq_to_flush) { + struct journal_entry_pin *pin; + u64 pin_seq; int ret; - *pin = NULL; - ret = bch2_journal_error(j); if (ret) return ret; + mutex_lock(&j->reclaim_lock); spin_lock(&j->lock); + + while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) { + journal_pin_mark_flushing(j, pin, pin_seq); + spin_unlock(&j->lock); + + journal_pin_flush(j, pin, pin_seq); + + spin_lock(&j->lock); + } /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = (*pin = journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || - !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || (fifo_used(&j->pin) == 1 && atomic_read(&fifo_peek_front(&j->pin).count) == 1); - if (*pin) - journal_pin_mark_flushing(j, *pin, *pin_seq); spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); return ret; } void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { - struct journal_entry_pin *pin; - u64 pin_seq; - if 
(!test_bit(JOURNAL_STARTED, &j->flags)) return; - mutex_lock(&j->reclaim_lock); - - while (1) { - wait_event(j->wait, journal_flush_done(j, seq_to_flush, - &pin, &pin_seq)); - if (!pin) - break; - - journal_pin_flush(j, pin, pin_seq); - } - - mutex_unlock(&j->reclaim_lock); + closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush)); } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 094c35f40209..8bbb0e30d07f 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit From b0cbf659a5349ec73c5745bdf1975d5915f89071 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Jan 2019 18:58:51 -0500 Subject: bcachefs: Fix an allocator error path Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 9eae8c29e3c4..f37110497b51 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -838,15 +838,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, { struct write_point *wp; struct open_bucket *ob; - unsigned nr_effective = 0; - struct open_buckets ptrs = { .nr = 0 }; - bool have_cache = false; - unsigned write_points_nr; - int ret = 0, i; + struct open_buckets ptrs; + unsigned nr_effective, write_points_nr; + bool have_cache; + int ret, i; BUG_ON(!nr_replicas || !nr_replicas_required); retry: + ptrs.nr = 0; + nr_effective = 0; write_points_nr = c->write_points_nr; + have_cache = false; wp = writepoint_find(c, write_point.v); -- cgit From 0519b72dd24754c560b6ac47b0224aaf462978c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Jan 2019 12:20:00 -0500 Subject: bcachefs: Add a workqueue for journal reclaim journal reclaim writes btree nodes, which can end up waiting for in flight btree writes to complete, and btree write completions run out of workqueues - so we can't run out of the same workqueue or we risk deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/super.c | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 244b808688b3..64836a8c69d8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -600,6 +600,7 @@ struct bch_fs { struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; + struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index eaf77c77b2c4..f24546dbf3ed 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -336,7 +336,7 @@ void bch2_journal_reclaim_work(struct work_struct *work) mutex_unlock(&j->reclaim_lock); if (!test_bit(BCH_FS_RO, &c->flags)) - queue_delayed_work(system_freezable_wq, &j->reclaim_work, + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4e811ffdec8e..198f78dbb6d9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -388,6 +388,8 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); + if (c->journal_reclaim_wq) + destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -602,6 +604,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, sizeof(struct btree_reserve)) || -- cgit From b030f691da68835e46a50f221c36b59898c91bf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Jan 2019 13:13:29 -0500 Subject: bcachefs: Fix some reserve calculations Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 ++++-- fs/bcachefs/alloc_foreground.h | 1 + fs/bcachefs/alloc_types.h | 5 +++-- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/sysfs.c | 19 ++++++++++++++++--- 5 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f37110497b51..f40fca9328f9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -107,6 +107,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false, gc_pos_alloc(c, ob), 0); ob->valid = false; + ob->type = 0; spin_unlock(&ob->lock); percpu_up_read(&c->mark_lock); @@ -142,6 +143,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ob = c->open_buckets + c->open_buckets_freelist; c->open_buckets_freelist = ob->freelist; atomic_set(&ob->pin, 1); + ob->type = 0; c->open_buckets_nr_free--; return ob; @@ -210,9 +212,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) case RESERVE_ALLOC: return 0; case RESERVE_BTREE: - return BTREE_NODE_RESERVE / 2; + return BTREE_NODE_OPEN_BUCKET_RESERVE; default: - return BTREE_NODE_RESERVE; + return BTREE_NODE_OPEN_BUCKET_RESERVE * 2; } } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 94389052fa94..6d8ffb0cd06d 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -86,6 +86,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { + ob->type = wp->type; atomic_inc(&ob->pin); ob_push(c, ptrs, ob); } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 
ef3e400c7d3d..832568dc9551 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -56,9 +56,10 @@ struct open_bucket { spinlock_t lock; atomic_t pin; u8 freelist; - bool valid; - bool on_partial_list; u8 ec_idx; + u8 type; + unsigned valid:1; + unsigned on_partial_list:1; unsigned sectors_free; struct bch_extent_ptr ptr; struct ec_stripe_new *ec; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 64836a8c69d8..a5203fbc089e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -333,6 +333,8 @@ enum bch_time_stats { /* Size of the freelist we allocate btree nodes from: */ #define BTREE_NODE_RESERVE BTREE_RESERVE_MAX +#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) + struct btree; enum gc_phase { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b59b7a5a4cbb..27fd6dfe83f5 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -797,6 +797,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) { struct bch_fs *c = ca->fs; struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + unsigned i, nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].type]++; return scnprintf(buf, PAGE_SIZE, "free_inc: %zu/%zu\n" @@ -823,7 +829,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " copygc threshold: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n", + "open_buckets_wait: %s\n" + "open_buckets_btree: %u\n" + "open_buckets_user: %u\n" + "btree reserve cache: %u\n", fifo_used(&ca->free_inc), ca->free_inc.size, fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, @@ -845,8 +854,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.sectors_fragmented, ca->copygc_threshold, c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, - c->open_buckets_wait.list.first ? "waiting" : "empty"); + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? 
"waiting" : "empty", + nr[BCH_DATA_BTREE], + nr[BCH_DATA_USER], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { -- cgit From 919dbbd18b590bc3235b96e498a67cc66e4fbb1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Jan 2019 18:12:24 -0500 Subject: bcachefs: dio arithmetic improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c9a136797aa7..a59fedcaed07 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1782,6 +1782,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio *bio = &dio->iop.op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; + loff_t offset; bool sync; long ret; @@ -1792,12 +1793,16 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_pagecache_block_get(&inode->ei_pagecache_lock); /* Write and invalidate pagecache range that we're writing to: */ - ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, - req->ki_pos + iov_iter_count(&dio->iter) - 1); + offset = req->ki_pos + (dio->iop.op.written << 9); + ret = write_invalidate_inode_pages_range(mapping, + offset, + offset + iov_iter_count(&dio->iter) - 1); if (unlikely(ret)) goto err; while (1) { + offset = req->ki_pos + (dio->iop.op.written << 9); + if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); @@ -1814,13 +1819,12 @@ static long bch2_dio_write_loop(struct dio_write *dio) /* gup might have faulted pages back in: */ ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos + (dio->iop.op.written << 9), - req->ki_pos + iov_iter_count(&dio->iter) - 1); + offset, + offset + bio->bi_iter.bi_size - 1); if (unlikely(ret)) goto err; - dio->iop.op.pos = POS(inode->v.i_ino, - (req->ki_pos >> 9) + dio->iop.op.written); + dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9); task_io_account_write(bio->bi_iter.bi_size); @@ -1888,7 +1892,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct dio_write *dio; struct bio *bio; - loff_t offset = req->ki_pos; ssize_t ret; lockdep_assert_held(&inode->v.i_rwsem); @@ -1896,7 +1899,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(!iter->count)) return 0; - if (unlikely((offset|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) return -EINVAL; bio = bio_alloc_bioset(NULL, @@ -1910,7 +1913,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || - offset + iter->count > inode->v.i_size; + req->ki_pos + iter->count > inode->v.i_size; dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; @@ -1931,7 +1934,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->iop.op.opts.data_replicas, 0); if (unlikely(ret)) { if (bch2_check_range_allocated(c, POS(inode->v.i_ino, - offset >> 9), + req->ki_pos >> 9), iter->count >> 9)) goto err; -- cgit From dbaee468461bfa82e6453ca0e009e9661cc570da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Jan 2019 21:16:25 -0500 Subject: bcachefs: fix error message in device remove path Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 4 ++-- fs/bcachefs/util.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c 
b/fs/bcachefs/super.c index 198f78dbb6d9..55069f40d04b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1258,8 +1258,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (data) { char data_has_str[100]; - bch2_string_opt_to_text(&PBUF(data_has_str), - bch2_data_types, data); + bch2_flags_to_text(&PBUF(data_has_str), + bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ret = -EBUSY; goto err; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index d998e51dbc30..9f3eafb3e0d4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -134,6 +134,7 @@ void bch2_flags_to_text(struct printbuf *out, const char * const list[], u64 flags) { unsigned bit, nr = 0; + bool first = true; if (out->pos != out->end) *out->pos = '\0'; @@ -142,7 +143,10 @@ void bch2_flags_to_text(struct printbuf *out, nr++; while (flags && (bit = __ffs(flags)) < nr) { - pr_buf(out, "%s,", list[bit]); + pr_buf(out, "%s", list[bit]); + if (!first) + pr_buf(out, ","); + first = false; flags ^= 1 << bit; } } -- cgit From 7ef2a73a5881323d53453cc3be7261fe1a49af1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Jan 2019 15:32:13 -0500 Subject: bcachefs: Fix check for if extent update is allocating Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 +- fs/bcachefs/btree_gc.c | 82 +++++------ fs/bcachefs/btree_update_interior.c | 29 ++-- fs/bcachefs/buckets.c | 228 +++++++++++++++++++---------- fs/bcachefs/buckets.h | 17 ++- fs/bcachefs/buckets_types.h | 14 +- fs/bcachefs/chardev.c | 20 ++- fs/bcachefs/ec_types.h | 7 + fs/bcachefs/extents.c | 32 ++++- fs/bcachefs/extents.h | 3 +- fs/bcachefs/eytzinger.h | 26 ++-- fs/bcachefs/fs-io.c | 19 +-- fs/bcachefs/journal_io.c | 20 ++- fs/bcachefs/journal_reclaim.c | 8 +- fs/bcachefs/migrate.c | 11 ++ fs/bcachefs/move.c | 11 ++ fs/bcachefs/replicas.c | 280 ++++++++++++++++++++++++------------ fs/bcachefs/replicas.h | 35 ++++- fs/bcachefs/super.c | 9 +- fs/bcachefs/sysfs.c | 38 +++-- fs/bcachefs/util.c | 25 ++++ fs/bcachefs/util.h | 2 + 22 files changed, 622 insertions(+), 299 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a5203fbc089e..17eb0dd657a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -624,10 +624,11 @@ struct bch_fs { struct bch_fs_pcpu __percpu *pcpu; - struct bch_fs_usage __percpu *usage[2]; - struct percpu_rw_semaphore mark_lock; + struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage_scratch; + /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 466469a0d852..a725a106f6dc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -478,33 +478,12 @@ static void bch2_gc_free(struct bch_fs *c) ca->usage[1] = NULL; } + percpu_down_write(&c->mark_lock); + free_percpu(c->usage[1]); c->usage[1] = NULL; -} - -/* - * Accumulate percpu counters onto one cpu's copy - only valid when access - * against any percpu counter is guarded against - */ -static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr) -{ - u64 *ret; - int cpu; - - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); - - for_each_possible_cpu(cpu) { - u64 *i = per_cpu_ptr(p, cpu); - if (i != ret) { - acc_u64s(ret, i, nr); - memset(i, 0, nr * sizeof(u64)); - } - } - - return ret; + percpu_up_write(&c->mark_lock); } static void bch2_gc_done_nocheck(struct 
bch_fs *c) @@ -542,24 +521,25 @@ static void bch2_gc_done_nocheck(struct bch_fs *c) for_each_member_device(ca, c, i) { unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); struct bch_dev_usage *dst = (void *) - acc_percpu_u64s((void *) ca->usage[0], nr); + bch2_acc_percpu_u64s((void *) ca->usage[0], nr); struct bch_dev_usage *src = (void *) - acc_percpu_u64s((void *) ca->usage[1], nr); + bch2_acc_percpu_u64s((void *) ca->usage[1], nr); *dst = *src; } { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64); + unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + + c->replicas.nr; struct bch_fs_usage *dst = (void *) - acc_percpu_u64s((void *) c->usage[0], nr); + bch2_acc_percpu_u64s((void *) c->usage[0], nr); struct bch_fs_usage *src = (void *) - acc_percpu_u64s((void *) c->usage[1], nr); + bch2_acc_percpu_u64s((void *) c->usage[1], nr); unsigned offset = offsetof(typeof(*dst), s.gc_start); memcpy((void *) dst + offset, (void *) src + offset, - sizeof(*dst) - offset); + nr * sizeof(u64) - offset); } } @@ -655,9 +635,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) for_each_member_device(ca, c, i) { unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); struct bch_dev_usage *dst = (void *) - acc_percpu_u64s((void *) ca->usage[0], nr); + bch2_acc_percpu_u64s((void *) ca->usage[0], nr); struct bch_dev_usage *src = (void *) - acc_percpu_u64s((void *) ca->usage[1], nr); + bch2_acc_percpu_u64s((void *) ca->usage[1], nr); unsigned b; for (b = 0; b < BCH_DATA_NR; b++) @@ -674,12 +654,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) } { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64); + unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + + c->replicas.nr; struct bch_fs_usage *dst = (void *) - acc_percpu_u64s((void *) c->usage[0], nr); + bch2_acc_percpu_u64s((void *) c->usage[0], nr); struct bch_fs_usage *src = (void *) - acc_percpu_u64s((void *) c->usage[1], nr); - unsigned r, b; + bch2_acc_percpu_u64s((void *) c->usage[1], nr); copy_fs_field(s.hidden, "hidden"); copy_fs_field(s.data, "data"); @@ -687,20 +667,16 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) copy_fs_field(s.reserved, "reserved"); copy_fs_field(s.nr_inodes, "nr_inodes"); - for (r = 0; r < BCH_REPLICAS_MAX; r++) { - for (b = 0; b < BCH_DATA_NR; b++) - copy_fs_field(replicas[r].data[b], - "replicas[%i].data[%s]", - r, bch2_data_types[b]); - copy_fs_field(replicas[r].ec_data, - "replicas[%i].ec_data", r); - copy_fs_field(replicas[r].persistent_reserved, - "replicas[%i].persistent_reserved", r); - } + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); - for (b = 0; b < BCH_DATA_NR; b++) - copy_fs_field(buckets[b], - "buckets[%s]", bch2_data_types[b]); + for (i = 0; i < c->replicas.nr; i++) { + /* + * XXX: print out replicas entry + */ + copy_fs_field(data[i], "data[%i]", i); + } } out: percpu_up_write(&c->mark_lock); @@ -723,9 +699,15 @@ static int bch2_gc_start(struct bch_fs *c) */ gc_pos_set(c, gc_phase(GC_PHASE_START)); + percpu_down_write(&c->mark_lock); BUG_ON(c->usage[1]); - c->usage[1] = alloc_percpu(struct bch_fs_usage); + c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) + + sizeof(u64) * c->replicas.nr, + sizeof(u64), + GFP_KERNEL); + percpu_up_write(&c->mark_lock); + if (!c->usage[1]) return -ENOMEM; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2efe191cdc30..d55778696bcd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ 
b/fs/bcachefs/btree_update_interior.c @@ -1070,25 +1070,28 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; struct btree *old = btree_node_root(c, b); - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; __bch2_btree_set_root_inmem(c, b); mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); + preempt_disable(); + fs_usage = bch2_fs_usage_get_scratch(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, gc_pos_btree_root(b->btree_id), - &stats, 0, 0); + fs_usage, 0, 0); if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), - &stats); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + fs_usage); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); + preempt_enable(); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); } @@ -1161,7 +1164,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct btree_node_iter *node_iter) { struct bch_fs *c = as->c; - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; struct bkey_packed *k; struct bkey tmp; @@ -1169,10 +1172,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_get_scratch(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), true, 0, - gc_pos_btree_node(b), &stats, 0, 0); + gc_pos_btree_node(b), fs_usage, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1185,9 +1189,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (k && !bkey_cmp_packed(b, k, &insert->k)) bch2_btree_node_free_index(as, b, bkey_disassemble(b, k, &tmp), - &stats); + fs_usage); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, gc_pos_btree_node(b)); percpu_up_read(&c->mark_lock); @@ -1971,7 +1975,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bkey_copy(&b->key, &new_key->k_i); } } else { - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; BUG_ON(btree_node_root(c, b) != b); @@ -1979,15 +1983,16 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_get_scratch(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, gc_pos_btree_root(b->btree_id), - &stats, 0, 0); + fs_usage, 0, 0); bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), - &stats); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + fs_usage); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 34e5f81b2b5e..cbebc712a1da 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -72,12 +72,11 @@ #include "ec.h" #include "error.h" #include "movinggc.h" +#include "replicas.h" #include "trace.h" #include -static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); - /* * Clear journal_seq_valid for buckets for which it's not needed, to prevent * wraparound: @@ -132,9 +131,29 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) return bch2_usage_read_raw(ca->usage[0]); } -struct bch_fs_usage 
bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) { - return bch2_usage_read_raw(c->usage[0]); + struct bch_fs_usage *ret; + unsigned nr = READ_ONCE(c->replicas.nr); +retry: + ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS); + if (unlikely(!ret)) + return NULL; + + percpu_down_read(&c->mark_lock); + + if (unlikely(nr < c->replicas.nr)) { + nr = c->replicas.nr; + percpu_up_read(&c->mark_lock); + kfree(ret); + goto retry; + } + + acc_u64s_percpu((u64 *) ret, + (u64 __percpu *) c->usage[0], + sizeof(*ret) / sizeof(u64) + nr); + + return ret; } #define RESERVE_FACTOR 6 @@ -149,17 +168,13 @@ static u64 avail_factor(u64 r) return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) -{ - return fs_usage.s.hidden + - fs_usage.s.data + - reserve_factor(fs_usage.s.reserved + - fs_usage.s.online_reserved); -} - u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) { - return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage)); + return min(fs_usage.s.hidden + + fs_usage.s.data + + reserve_factor(fs_usage.s.reserved + + fs_usage.s.online_reserved), + c->capacity); } struct bch_fs_usage_short @@ -208,13 +223,14 @@ static bool bucket_became_unavailable(struct bucket_mark old, !is_available_bucket(new); } -void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) +int bch2_fs_usage_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct disk_reservation *disk_res, + struct gc_pos gc_pos) { s64 added = fs_usage->s.data + fs_usage->s.reserved; s64 should_not_have_added; + int ret = 0; percpu_rwsem_assert_held(&c->mark_lock); @@ -227,6 +243,7 @@ void bch2_fs_usage_apply(struct bch_fs *c, "disk usage increased without a reservation")) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; + ret = -1; } if (added > 0) { @@ -237,17 +254,17 @@ void bch2_fs_usage_apply(struct bch_fs *c, preempt_disable(); acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64)); + sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); if (gc_visited(c, gc_pos)) { BUG_ON(!c->usage[1]); acc_u64s((u64 *) this_cpu_ptr(c->usage[1]), (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64)); + sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); } preempt_enable(); - memset(fs_usage, 0, sizeof(*fs_usage)); + return ret; } static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -258,7 +275,6 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) fs_usage->s.hidden += size; - fs_usage->buckets[type] += size; dev_usage->buckets[type] += nr; } @@ -332,6 +348,34 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) _old; \ }) +static inline void update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) +{ + int idx = bch2_replicas_entry_idx(c, r); + + BUG_ON(idx < 0); + BUG_ON(!sectors); + + if (r->data_type == BCH_DATA_CACHED) + fs_usage->s.cached += sectors; + else + fs_usage->s.data += sectors; + fs_usage->data[idx] += sectors; +} + +static inline void update_cached_sectors(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas(c, fs_usage, &r.e, 
sectors); +} + static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old, bool gc) @@ -350,8 +394,9 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; - fs_usage->s.cached -= old->cached_sectors; + if (old->cached_sectors) + update_cached_sectors(c, fs_usage, ca->dev_idx, + -old->cached_sectors); } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -418,11 +463,6 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, new.data_type = type; checked_add(new.dirty_sectors, sectors); })); - - if (type == BCH_DATA_BTREE || - type == BCH_DATA_USER) - fs_usage->s.data += sectors; - fs_usage->replicas[0].data[type] += sectors; } void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -550,9 +590,9 @@ static void bch2_mark_pointer(struct bch_fs *c, static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, s64 sectors, unsigned flags, - s64 *adjusted_disk_sectors, - unsigned *redundancy, bool gc) { struct stripe *m; @@ -568,16 +608,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, return -1; } + BUG_ON(m->r.e.data_type != data_type); + nr_data = m->nr_blocks - m->nr_redundant; parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); if (sectors < 0) parity_sectors = -parity_sectors; - - *adjusted_disk_sectors += parity_sectors; - - *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1); + sectors += parity_sectors; new = atomic_add_return(sectors, &m->block_sectors[p.block]); old = new - sectors; @@ -593,6 +632,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, if (!gc) bch2_stripes_heap_update(c, m, p.idx); + update_replicas(c, fs_usage, &m->r.e, sectors); + return 0; } @@ -605,58 +646,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - s64 cached_sectors = 0; - s64 dirty_sectors = 0; - s64 ec_sectors = 0; - unsigned replicas = 0; - unsigned ec_redundancy = 0; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; unsigned i; int ret; + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? 
sectors : ptr_disk_sectors_delta(p, sectors); - s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, p, disk_sectors, data_type, fs_usage, journal_seq, flags, gc); - if (!p.ptr.cached) + if (p.ptr.cached) { + update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors); + } else if (!p.ec_nr) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { for (i = 0; i < p.ec_nr; i++) { ret = bch2_mark_stripe_ptr(c, p.ec[i], - disk_sectors, flags, - &adjusted_disk_sectors, - &ec_redundancy, gc); + data_type, fs_usage, + disk_sectors, flags, gc); if (ret) return ret; } - if (!p.ptr.cached) - replicas++; - if (p.ptr.cached) - cached_sectors += adjusted_disk_sectors; - else if (!p.ec_nr) - dirty_sectors += adjusted_disk_sectors; - else - ec_sectors += adjusted_disk_sectors; + r.e.nr_required = 0; + } } - replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(fs_usage->replicas)); - ec_redundancy = clamp_t(unsigned, ec_redundancy, - 1, ARRAY_SIZE(fs_usage->replicas)); - - fs_usage->s.cached += cached_sectors; - fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; - - fs_usage->s.data += dirty_sectors; - fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors; - - fs_usage->s.data += ec_sectors; - fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors; + if (dirty_sectors) + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } @@ -724,8 +753,24 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, m->algorithm = s.v->algorithm; m->nr_blocks = s.v->nr_blocks; m->nr_redundant = s.v->nr_redundant; + + memset(&m->r, 0, sizeof(m->r)); + + m->r.e.data_type = BCH_DATA_USER; + m->r.e.nr_devs = s.v->nr_blocks; + m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (i = 0; i < s.v->nr_blocks; i++) + m->r.e.devs[i] = s.v->ptrs[i].dev; } + /* + * XXX: account for stripes somehow here + */ +#if 0 + update_replicas(c, fs_usage, &m->r.e, stripe_sectors); +#endif + if (!gc) { if (inserting) bch2_stripes_heap_insert(c, m, idx); @@ -773,11 +818,11 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; sectors *= replicas; - replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(fs_usage->replicas)); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->s.reserved += sectors; - fs_usage->replicas[replicas - 1].persistent_reserved += sectors; + fs_usage->s.reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; break; } default: @@ -839,20 +884,24 @@ void bch2_mark_update(struct btree_insert *trans, struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct bch_fs_usage fs_usage = { 0 }; + struct bch_fs_usage *fs_usage; struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; + u64 disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; + static int warned_disk_usage = 0; if (!btree_node_type_needs_gc(iter->btree_id)) return; percpu_down_read(&c->mark_lock); + preempt_disable(); + fs_usage = bch2_fs_usage_get_scratch(c); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - pos, &fs_usage, trans->journal_res.seq, 0); + pos, fs_usage, trans->journal_res.seq, 0); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { @@ -885,7 +934,7 @@ void bch2_mark_update(struct btree_insert *trans, BUG_ON(sectors <= 0); bch2_mark_key_locked(c, k, true, sectors, - pos, &fs_usage, trans->journal_res.seq, 0); + pos, fs_usage, trans->journal_res.seq, 0); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -896,13 +945,44 @@ void bch2_mark_update(struct btree_insert *trans, } bch2_mark_key_locked(c, k, false, sectors, - pos, &fs_usage, trans->journal_res.seq, 0); + pos, fs_usage, trans->journal_res.seq, 0); bch2_btree_node_iter_advance(&node_iter, b); } - bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos); + if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) && + !warned_disk_usage && + !xchg(&warned_disk_usage, 1)) { + char buf[200]; + + pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + + pr_err("while inserting"); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k)); + pr_err("%s", buf); + pr_err("overlapping with"); + + node_iter = iter->l[0].iter; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k; + + k = bkey_disassemble(b, _k, &unpacked); + if (btree_node_is_extents(b) + ? 
bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k->k.p, k.k->p)) + break; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); + + bch2_btree_node_iter_advance(&node_iter, b); + } + } + + preempt_enable(); percpu_up_read(&c->mark_lock); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 88e083325232..107cb48e3929 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -218,7 +218,18 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); +static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) +{ + struct bch_fs_usage *ret; + + ret = this_cpu_ptr(c->usage_scratch); + + memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64)); + + return ret; +} + +struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); @@ -254,8 +265,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); -void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); +int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, struct gc_pos); /* disk reservations: */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 196f07f41728..65b4bb39f88e 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -75,16 +75,18 @@ struct bch_fs_usage { u64 cached; u64 reserved; u64 nr_inodes; + + /* XXX: add stats for compression ratio */ +#if 0 + u64 uncompressed; + u64 compressed; +#endif } s; /* broken out: */ - struct { - u64 data[BCH_DATA_NR]; - u64 ec_data; - u64 persistent_reserved; - } replicas[BCH_REPLICAS_MAX]; - u64 buckets[BCH_DATA_NR]; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + u64 data[]; }; struct bch_fs_usage_short { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 7f79f020d904..f090b61f23f1 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -394,21 +394,31 @@ static long bch2_ioctl_usage(struct bch_fs *c, } { - struct bch_fs_usage src = bch2_fs_usage_read(c); + struct bch_fs_usage *src; struct bch_ioctl_fs_usage dst = { .capacity = c->capacity, - .used = bch2_fs_sectors_used(c, src), - .online_reserved = src.s.online_reserved, }; + src = bch2_fs_usage_read(c); + if (!src) + return -ENOMEM; + + percpu_up_read(&c->mark_lock); + + dst.used = bch2_fs_sectors_used(c, *src); + dst.online_reserved = src->s.online_reserved; + for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = - src.replicas[i].persistent_reserved; - + src->persistent_reserved[i]; +#if 0 for (j = 0; j < BCH_DATA_NR; j++) dst.sectors[j][i] = src.replicas[i].data[j]; +#endif } + kfree(src); + ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); if (ret) return ret; diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index a3216ca01913..e416dac7ee19 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -6,6 +6,11 @@ #define EC_STRIPE_MAX 16 +struct bch_replicas_padded { + struct bch_replicas_entry e; + u8 pad[EC_STRIPE_MAX]; +}; + struct stripe { size_t heap_idx; @@ -18,6 +23,8 @@ struct stripe { u8 alive; atomic_t blocks_nonempty; atomic_t block_sectors[EC_STRIPE_MAX]; + + struct bch_replicas_padded r; }; struct ec_stripe_heap_entry { diff --git a/fs/bcachefs/extents.c 
b/fs/bcachefs/extents.c index 67f6250ef91a..1d96a1773f74 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1669,12 +1669,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, return ret == BCH_MERGE_MERGE; } -int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) { struct btree_iter iter; struct bpos end = pos; struct bkey_s_c k; - int ret = 0; + bool ret = true; end.offset += size; @@ -1683,8 +1684,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; + if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { + ret = false; break; } } @@ -1693,6 +1694,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) return ret; } +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + break; + } + case KEY_TYPE_reservation: + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return ret; +} + /* KEY_TYPE_reservation: */ const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 57eb35699545..17cae891bccb 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -572,6 +572,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst, BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 7cb4942cacf7..26d5cad7e6a5 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -263,18 +263,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } } -static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) -{ - size_t i = 0; - int res; - - while (i < nr && - (res = cmp(search, base + i * size, size))) - i = eytzinger0_child(i, res > 0); - - return i; -} +#define eytzinger0_find(base, nr, size, _cmp, search) \ +({ \ + void *_base = (base); \ + void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ + int _res; \ + \ + while (_i < _nr && \ + (_res = _cmp(_search, _base + _i * _size, _size))) \ + _i = eytzinger0_child(_i, _res > 0); \ + _i; \ +}) void eytzinger0_sort(void *, size_t, size_t, int (*cmp_func)(const void *, const void *, size_t), diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a59fedcaed07..7681cfbc6bed 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, BUG_ON(btree_iter_err(old)); if (allocating && - !bch2_extent_is_fully_allocated(old)) + !*allocating && + bch2_bkey_nr_ptrs_allocated(old) < + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) *allocating = true; delta += (min(new->k.p.offset, @@ -812,9 +814,7 @@ static 
void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = !bch2_extent_is_compressed(k) - ? bch2_bkey_nr_dirty_ptrs(k) - : 0; + unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); bio_for_each_segment(bv, bio, iter) { /* brand new pages, don't need to be locked: */ @@ -1930,19 +1930,20 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(ret)) goto err; + dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas; + ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, dio->iop.op.opts.data_replicas, 0); if (unlikely(ret)) { - if (bch2_check_range_allocated(c, POS(inode->v.i_ino, - req->ki_pos >> 9), - iter->count >> 9)) + if (!bch2_check_range_allocated(c, POS(inode->v.i_ino, + req->ki_pos >> 9), + iter->count >> 9, + dio->iop.op.opts.data_replicas)) goto err; dio->iop.unalloc = true; } - dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; - return bch2_dio_write_loop(dio); err: bch2_disk_reservation_put(c, &dio->iop.op.res); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 67ff2633ba16..9c794c9a1924 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) } list_for_each_entry(i, list, list) { + struct bch_replicas_padded replicas; + char buf[80]; + + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs, false), c, - "superblock not marked as containing replicas (type %u)", - BCH_DATA_JOURNAL))) { - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + "superblock not marked as containing replicas %s", + (bch2_replicas_entry_to_text(&PBUF(buf), + &replicas.e), buf)))) { + ret = bch2_mark_replicas(c, &replicas.e); if (ret) return ret; } @@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl) struct journal_buf *w = journal_prev_buf(j); struct bch_devs_list devs = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); @@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl) goto err; } - if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + + if (bch2_mark_replicas(c, &replicas.e)) goto err; spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f24546dbf3ed..98345dcd1e67 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -388,7 +388,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; - struct bch_devs_list devs; u64 iter, seq = 0; int ret = 0; @@ -413,12 +412,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) spin_lock(&j->lock); while (!ret && seq < j->pin.back) { + struct bch_replicas_padded replicas; + seq = max(seq, journal_last_seq(j)); - devs = journal_seq_pin(j, seq)->devs; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, + journal_seq_pin(j, seq)->devs); seq++; 
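	/*
	 * A minimal, self-contained sketch (simplified stand-in types, not the
	 * exact bcachefs structures) of what bch2_devlist_to_replicas() above
	 * is doing: a replicas entry is just a data type plus a sorted list of
	 * device indices, kept sorted so entries can be compared with memcmp()
	 * and looked up in the eytzinger-ordered replicas table.
	 *
	 *	typedef unsigned char u8;
	 *
	 *	struct replicas_entry_sketch {
	 *		u8 data_type;
	 *		u8 nr_required;
	 *		u8 nr_devs;
	 *		u8 devs[16];	// stands in for bch_replicas_padded
	 *	};
	 *
	 *	static void devlist_to_entry(struct replicas_entry_sketch *e,
	 *				     u8 data_type, const u8 *devs, u8 nr)
	 *	{
	 *		u8 i, j, tmp;
	 *
	 *		e->data_type	= data_type;
	 *		e->nr_required	= 1;
	 *		e->nr_devs	= nr;
	 *		for (i = 0; i < nr; i++)
	 *			e->devs[i] = devs[i];
	 *
	 *		// keep devs[] sorted; the kernel code does this with
	 *		// bubble_sort() and u8_cmp()
	 *		for (i = 1; i < nr; i++)
	 *			for (j = i; j && e->devs[j - 1] > e->devs[j]; j--) {
	 *				tmp = e->devs[j];
	 *				e->devs[j] = e->devs[j - 1];
	 *				e->devs[j - 1] = tmp;
	 *			}
	 *	}
	 */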
spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); + ret = bch2_mark_replicas(c, &replicas.e); spin_lock(&j->lock); } spin_unlock(&j->lock); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 63fe8cbb0564..b97a5a8f3910 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -5,6 +5,7 @@ #include "bcachefs.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "extents.h" #include "io.h" @@ -153,6 +154,16 @@ retry: bch2_btree_iter_unlock(&iter); } + /* flush relevant btree updates */ + while (1) { + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c) || + c->btree_roots_dirty); + if (!bch2_btree_interior_updates_nr_pending(c)) + break; + bch2_journal_meta(&c->journal); + } + ret = 0; out: ret = bch2_replicas_gc_end(c, ret); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9081952316b0..5a35f76006cf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "disk_groups.h" #include "inode.h" @@ -762,6 +763,16 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + + while (1) { + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c) || + c->btree_roots_dirty); + if (!bch2_btree_interior_updates_nr_pending(c)) + break; + bch2_journal_meta(&c->journal); + } + ret = bch2_gc_btree_replicas(c) ?: ret; ret = bch2_move_data(c, NULL, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index b63da1bef760..34a5475cfaba 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -4,11 +4,6 @@ #include "replicas.h" #include "super-io.h" -struct bch_replicas_padded { - struct bch_replicas_entry e; - u8 pad[BCH_SB_MEMBERS_MAX]; -}; - static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); @@ -19,6 +14,16 @@ static inline int u8_cmp(u8 l, u8 r) return (l > r) - (l < r); } +static void verify_replicas_entry_sorted(struct bch_replicas_entry *e) +{ +#ifdef CONFIG_BCACHES_DEBUG + unsigned i; + + for (i = 0; i + 1 < e->nr_devs; i++) + BUG_ON(e->devs[i] >= e->devs[i + 1]); +#endif +} + static void replicas_entry_sort(struct bch_replicas_entry *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); @@ -29,19 +34,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e) (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ _i = (void *) (_i) + (_r)->entry_size) -static inline struct bch_replicas_entry * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -static void replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry *e) +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) { unsigned i; @@ -66,7 +65,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, pr_buf(out, " "); first = false; - replicas_entry_to_text(out, e); + bch2_replicas_entry_to_text(out, e); } } @@ -106,8 +105,8 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -static void bkey_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *e) +static 
void bkey_to_replicas(struct bch_replicas_entry *e, + struct bkey_s_c k) { e->nr_devs = 0; @@ -129,9 +128,9 @@ static void bkey_to_replicas(struct bkey_s_c k, replicas_entry_sort(e); } -static inline void devlist_to_replicas(struct bch_devs_list devs, - enum bch_data_type data_type, - struct bch_replicas_entry *e) +void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + enum bch_data_type data_type, + struct bch_devs_list devs) { unsigned i; @@ -160,6 +159,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, replicas_entry_bytes(new_entry)), }; + BUG_ON(!new_entry->data_type); + verify_replicas_entry_sorted(new_entry); + new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); if (!new.entries) return new; @@ -177,21 +179,49 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, return new; } +static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + int idx, entry_size = replicas_entry_bytes(search); + + if (unlikely(entry_size > r->entry_size)) + return -1; + + verify_replicas_entry_sorted(search); + +#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) + idx = eytzinger0_find(r->entries, r->nr, r->entry_size, + entry_cmp, search); +#undef entry_cmp + + return idx < r->nr ? idx : -1; +} + +int bch2_replicas_entry_idx(struct bch_fs *c, + struct bch_replicas_entry *search) +{ + replicas_entry_sort(search); + + return __replicas_entry_idx(&c->replicas, search); +} + static bool __replicas_has_entry(struct bch_replicas_cpu *r, struct bch_replicas_entry *search) { - return replicas_entry_bytes(search) <= r->entry_size && - eytzinger0_find(r->entries, r->nr, - r->entry_size, - memcmp, search) < r->nr; + return __replicas_entry_idx(r, search) >= 0; } -static bool replicas_has_entry(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search, + bool check_gc_replicas) { bool marked; + if (!search->nr_devs) + return true; + + verify_replicas_entry_sorted(search); + percpu_down_read(&c->mark_lock); marked = __replicas_has_entry(&c->replicas, search) && (!check_gc_replicas || @@ -202,6 +232,76 @@ static bool replicas_has_entry(struct bch_fs *c, return marked; } +static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage __percpu *src_p, + struct bch_replicas_cpu *src_r) +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) + bch2_acc_percpu_u64s((void *) src_p, src_nr); + int src_idx, dst_idx; + + preempt_disable(); + dst = this_cpu_ptr(dst_p); + preempt_enable(); + + *dst = *src; + + for (src_idx = 0; src_idx < src_r->nr; src_idx++) { + if (!src->data[src_idx]) + continue; + + dst_idx = __replicas_entry_idx(dst_r, + cpu_replicas_entry(src_r, src_idx)); + BUG_ON(dst_idx < 0); + + dst->data[dst_idx] = src->data[src_idx]; + } +} + +/* + * Resize filesystem accounting: + */ +static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) +{ + struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL }; + unsigned bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + unsigned i; + int ret = -ENOMEM; + + for (i = 0; i < 3; i++) { + if (i < 2 && !c->usage[i]) + continue; + + new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO); + if (!new_usage[i]) + goto err; + } + + for (i = 0; i < 2; i++) { + if (!c->usage[i]) + continue; + + 
__replicas_table_update(new_usage[i], new_r, + c->usage[i], &c->replicas); + + swap(c->usage[i], new_usage[i]); + } + + swap(c->usage_scratch, new_usage[2]); + + swap(c->replicas, *new_r); + ret = 0; +err: + for (i = 0; i < 3; i++) + free_percpu(new_usage[i]); + return ret; +} + noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) @@ -243,7 +343,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, /* don't update in memory replicas until changes are persistent */ percpu_down_write(&c->mark_lock); if (new_r.entries) - swap(new_r, c->replicas); + ret = replicas_table_update(c, &new_r); if (new_gc.entries) swap(new_gc, c->replicas_gc); percpu_up_write(&c->mark_lock); @@ -258,30 +358,32 @@ err: return ret; } -static int __bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *devs) +int bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *r) { - return likely(replicas_has_entry(c, devs, true)) + return likely(bch2_replicas_marked(c, r, true)) ? 0 - : bch2_mark_replicas_slowpath(c, devs); + : bch2_mark_replicas_slowpath(c, r); } -int bch2_mark_replicas(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs) +bool bch2_bkey_replicas_marked(struct bch_fs *c, + struct bkey_s_c k, + bool check_gc_replicas) { struct bch_replicas_padded search; + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; - if (!devs.nr) - return 0; - - memset(&search, 0, sizeof(search)); + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); - BUG_ON(devs.nr >= BCH_REPLICAS_MAX); + if (!bch2_replicas_marked(c, &search.e, check_gc_replicas)) + return false; + } - devlist_to_replicas(devs, data_type, &search.e); + bkey_to_replicas(&search.e, k); - return __bch2_mark_replicas(c, &search.e); + return bch2_replicas_marked(c, &search.e, check_gc_replicas); } int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) @@ -291,22 +393,23 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) unsigned i; int ret; - memset(&search, 0, sizeof(search)); + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) + ret = bch2_mark_replicas(c, &search.e); + if (ret) return ret; + } - bkey_to_replicas(k, &search.e); + bkey_to_replicas(&search.e, k); - return search.e.nr_devs - ? 
__bch2_mark_replicas(c, &search.e) - : 0; + return bch2_mark_replicas(c, &search.e); } int bch2_replicas_gc_end(struct bch_fs *c, int ret) { + unsigned i; + lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); @@ -314,6 +417,39 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) if (ret) goto err; + /* + * this is kind of crappy; the replicas gc mechanism needs to be ripped + * out + */ + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct bch_replicas_cpu n; + u64 v = 0; + int cpu; + + if (__replicas_has_entry(&c->replicas_gc, e)) + continue; + + for_each_possible_cpu(cpu) + v += *per_cpu_ptr(&c->usage[0]->data[i], cpu); + if (!v) + continue; + + n = cpu_replicas_add_entry(&c->replicas_gc, e); + if (!n.entries) { + ret = -ENOSPC; + goto err; + } + + percpu_down_write(&c->mark_lock); + swap(n, c->replicas_gc); + percpu_up_write(&c->mark_lock); + + kfree(n.entries); + } + if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ret = -ENOSPC; goto err; @@ -325,7 +461,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) err: percpu_down_write(&c->mark_lock); if (!ret) - swap(c->replicas, c->replicas_gc); + ret = replicas_table_update(c, &c->replicas_gc); kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -461,7 +597,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); - swap(c->replicas, new_r); + ret = replicas_table_update(c, &new_r); percpu_up_write(&c->mark_lock); kfree(new_r.entries); @@ -628,7 +764,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, pr_buf(out, " "); first = false; - replicas_entry_to_text(out, e); + bch2_replicas_entry_to_text(out, e); } } @@ -677,46 +813,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { /* Query replicas: */ -bool bch2_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs, - bool check_gc_replicas) -{ - struct bch_replicas_padded search; - - if (!devs.nr) - return true; - - memset(&search, 0, sizeof(search)); - - devlist_to_replicas(devs, data_type, &search.e); - - return replicas_has_entry(c, &search.e, check_gc_replicas); -} - -bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) -{ - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - - memset(&search, 0, sizeof(search)); - - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]), - check_gc_replicas)) - return false; - - bkey_to_replicas(k, &search.e); - - return search.e.nr_devs - ? 
replicas_has_entry(c, &search.e, check_gc_replicas) - : true; -} - struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 03aaafdc7c17..923bddb21ec3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -2,17 +2,42 @@ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H +#include "eytzinger.h" #include "replicas_types.h" -bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bch_devs_list, bool); +void bch2_replicas_entry_to_text(struct printbuf *, + struct bch_replicas_entry *); +void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); + +static inline struct bch_replicas_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +int bch2_replicas_entry_idx(struct bch_fs *, + struct bch_replicas_entry *); + +void bch2_devlist_to_replicas(struct bch_replicas_entry *, + enum bch_data_type, + struct bch_devs_list); +bool bch2_replicas_marked(struct bch_fs *, + struct bch_replicas_entry *, bool); +int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool); -int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); -void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) +{ + e->data_type = BCH_DATA_CACHED; + e->nr_devs = 1; + e->nr_required = 1; + e->devs[0] = dev; +} struct replicas_status { struct { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 55069f40d04b..9a862b19ce22 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -375,6 +375,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->mark_lock); + free_percpu(c->usage_scratch); free_percpu(c->usage[0]); free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); @@ -506,7 +507,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; - unsigned i, iter_size; + unsigned i, iter_size, fs_usage_size; const char *err; pr_verbose_init(opts, ""); @@ -600,6 +601,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct btree_node_iter_set); + fs_usage_size = sizeof(struct bch_fs_usage) + + sizeof(u64) * c->replicas.nr; + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", @@ -616,7 +620,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || + !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) || + !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 27fd6dfe83f5..424636310bbf 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -234,33 +234,45 @@ static size_t bch2_btree_cache_size(struct bch_fs 
*c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); - struct bch_fs_usage stats = bch2_fs_usage_read(c); - unsigned replicas, type; + struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); + unsigned i; - pr_buf(&out, "capacity:\t\t%llu\n", c->capacity); + if (!fs_usage) + return -ENOMEM; - for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) { - pr_buf(&out, "%u replicas:\n", replicas + 1); + pr_buf(&out, "capacity:\t\t%llu\n", c->capacity); + for (i = 0; + i < ARRAY_SIZE(fs_usage->persistent_reserved); + i++) { + pr_buf(&out, "%u replicas:\n", i + 1); +#if 0 for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) pr_buf(&out, "\t%s:\t\t%llu\n", bch2_data_types[type], stats.replicas[replicas].data[type]); pr_buf(&out, "\terasure coded:\t%llu\n", stats.replicas[replicas].ec_data); +#endif pr_buf(&out, "\treserved:\t%llu\n", - stats.replicas[replicas].persistent_reserved); + fs_usage->persistent_reserved[i]); } - pr_buf(&out, "bucket usage\n"); + pr_buf(&out, "online reserved:\t%llu\n", + fs_usage->s.online_reserved); - for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) - pr_buf(&out, "\t%s:\t\t%llu\n", - bch2_data_types[type], - stats.buckets[type]); + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); - pr_buf(&out, "online reserved:\t%llu\n", - stats.s.online_reserved); + pr_buf(&out, "\t"); + bch2_replicas_entry_to_text(&out, e); + pr_buf(&out, ":\t%llu\n", fs_usage->data[i]); + } + + percpu_up_read(&c->mark_lock); + + kfree(fs_usage); return out.pos - buf; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 9f3eafb3e0d4..295f4577e9c1 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -904,3 +904,28 @@ void eytzinger0_find_test(void) kfree(test_array); } #endif + +/* + * Accumulate percpu counters onto one cpu's copy - only valid when access + * against any percpu counter is guarded against + */ +u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) +{ + u64 *ret; + int cpu; + + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + + if (i != ret) { + acc_u64s(ret, i, nr); + memset(i, 0, nr * sizeof(u64)); + } + } + + return ret; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 8bbb0e30d07f..fa1a3adc87df 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -718,4 +718,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, acc_u64s(acc, per_cpu_ptr(src, cpu), nr); } +u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); + #endif /* _BCACHEFS_UTIL_H */ -- cgit From 5e5d9bdbb89c51603653360ecfbfa90c0bceb108 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Jan 2019 20:04:06 -0500 Subject: bcachefs: Fix fifo overflow in allocator startup Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 871a41b923da..b79d5b059621 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1334,6 +1334,24 @@ static void allocator_start_issue_discards(struct bch_fs *c) ca->mi.bucket_size, GFP_NOIO); } +static int resize_free_inc(struct bch_dev *ca) +{ + alloc_fifo free_inc; + + if (!fifo_full(&ca->free_inc)) + return 0; + + if (!init_fifo(&free_inc, + ca->free_inc.size * 2, + GFP_KERNEL)) + return -ENOMEM; + + 
fifo_move(&free_inc, &ca->free_inc); + swap(free_inc, ca->free_inc); + free_fifo(&free_inc); + return 0; +} + static int __bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; @@ -1409,6 +1427,12 @@ not_enough: while (!fifo_full(&ca->free[RESERVE_BTREE]) && (bu = next_alloc_bucket(ca)) >= 0) { + ret = resize_free_inc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); -- cgit From 430735cd1a0304195a080f8ee239016444a02715 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Nov 2018 01:31:41 -0500 Subject: bcachefs: Persist alloc info on clean shutdown - Does not persist alloc info for stripes yet - Also does not yet include filesystem block/sector counts yet, from struct fs_usage - Not made use of just yet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 45 ++++++++++++++++++++++++++++++++----- fs/bcachefs/alloc_background.h | 1 + fs/bcachefs/bcachefs.h | 6 +++++ fs/bcachefs/btree_update.h | 3 +++ fs/bcachefs/btree_update_interior.c | 2 ++ fs/bcachefs/btree_update_leaf.c | 6 +++-- fs/bcachefs/buckets.c | 14 ++++++++---- fs/bcachefs/buckets.h | 2 ++ fs/bcachefs/super.c | 39 ++++++++++++++++++++++---------- 9 files changed, 94 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b79d5b059621..686287d12d14 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -250,6 +250,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } + for_each_member_device(ca, c, i) + bch2_dev_usage_from_buckets(c, ca); + mutex_lock(&c->bucket_clock[READ].lock); for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); @@ -281,35 +284,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, #endif struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); struct bucket *g; - struct bucket_mark m; + struct bucket_mark m, new; int ret; BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); a->k.p = POS(ca->dev_idx, b); + bch2_btree_iter_set_pos(iter, a->k.p); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + percpu_down_read(&c->mark_lock); g = bucket(ca, b); - m = bucket_cmpxchg(g, m, m.dirty = false); + m = READ_ONCE(g->mark); + + if (!m.dirty) { + percpu_up_read(&c->mark_lock); + return 0; + } __alloc_write_key(a, g, m); percpu_up_read(&c->mark_lock); bch2_btree_iter_cond_resched(iter); - bch2_btree_iter_set_pos(iter, a->k.p); - ret = bch2_btree_insert_at(c, NULL, journal_seq, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); + if (ret) + return ret; - if (!ret && ca->buckets_written) + new = m; + new.dirty = false; + atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); + + if (ca->buckets_written) set_bit(b, ca->buckets_written); - return ret; + return 0; } int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) @@ -899,10 +918,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t for (i = 0; i < RESERVE_NR; i++) if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&c->freelist_wait); + ca->allocator_blocked_full = false; + spin_unlock(&c->freelist_lock); goto out; } + + if (!ca->allocator_blocked_full) { + ca->allocator_blocked_full = true; + closure_wake_up(&c->freelist_wait); + } + spin_unlock(&c->freelist_lock); if 
((current->flags & PF_KTHREAD) && @@ -1227,6 +1255,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } +void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) +{ + closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full); +} + /* stop allocator thread: */ void bch2_dev_allocator_stop(struct bch_dev *ca) { diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ef5ec659b05d..04f1e9152494 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -52,6 +52,7 @@ void bch2_recalc_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 17eb0dd657a8..2d67c9911fbb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -431,7 +431,13 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; + + /* + * XXX: this should be an enum for allocator state, so as to include + * error state + */ bool allocator_blocked; + bool allocator_blocked_full; alloc_heap alloc_heap; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 824fb0d1b7f0..9bcab29bd033 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -78,6 +78,7 @@ enum { __BTREE_INSERT_ATOMIC, __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, @@ -101,6 +102,8 @@ enum { /* Don't check for -ENOSPC: */ #define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) +#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) + /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d55778696bcd..4bc7be9b5298 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1172,6 +1172,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); + preempt_disable(); fs_usage = bch2_fs_usage_get_scratch(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), @@ -1194,6 +1195,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, gc_pos_btree_node(b)); + preempt_enable(); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 12fd7fba3e9a..e052a3debadb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -629,7 +629,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans) trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + !percpu_ref_tryget(&c->writes))) return -EROFS; retry: trans_for_each_iter(trans, i) { @@ -659,7 +660,8 @@ retry: trans_for_each_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: - 
percpu_ref_put(&c->writes); + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&c->writes); /* make sure we didn't drop or screw up locks: */ trans_for_each_iter(trans, i) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index cbebc712a1da..3e92a1f6d7fc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -387,7 +387,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ BUG_ON(!is_available_bucket(new)); - new.owned_by_allocator = 1; + new.owned_by_allocator = true; + new.dirty = true; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; @@ -460,6 +461,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, type != BCH_DATA_JOURNAL); bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.dirty = true; new.data_type = type; checked_add(new.dirty_sectors, sectors); })); @@ -487,13 +489,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, true); } else { struct bucket *g; - struct bucket_mark old, new; + struct bucket_mark new; rcu_read_lock(); g = bucket(ca, b); - old = bucket_cmpxchg(g, new, ({ - new.data_type = type; + bucket_cmpxchg(g, new, ({ + new.dirty = true; + new.data_type = type; checked_add(new.dirty_sectors, sectors); })); @@ -546,6 +549,8 @@ static void bch2_mark_pointer(struct bch_fs *c, do { new.v.counter = old.v.counter = v; + new.dirty = true; + /* * Check this after reading bucket mark to guard against * the allocator invalidating a bucket after we've already @@ -709,6 +714,7 @@ static void bucket_set_stripe(struct bch_fs *c, BUG_ON(ptr_stale(ca, ptr)); old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.dirty = true; new.stripe = enabled; if (journal_seq) { new.journal_seq_valid = 1; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 107cb48e3929..ee8c9e9a1f23 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -182,6 +182,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); +void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *); + static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9a862b19ce22..0ad624294052 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -174,7 +174,9 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; + bool wrote; unsigned i; + int ret; bch2_rebalance_stop(c); @@ -189,23 +191,36 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); + do { + ret = bch2_alloc_write(c, false, &wrote); + if (ret) { + bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + break; + } - bch2_journal_flush_all_pins(&c->journal); + for_each_member_device(ca, c, i) + bch2_dev_allocator_quiesce(c, ca); - /* - * We need to explicitly wait on btree interior updates to complete - * before stopping the journal, flushing all journal pins isn't - * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree - * interior updates have to drop their journal pin before they're - * fully complete: - */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_journal_flush_all_pins(&c->journal); + + /* 
+ * We need to explicitly wait on btree interior updates to complete + * before stopping the journal, flushing all journal pins isn't + * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree + * interior updates have to drop their journal pin before they're + * fully complete: + */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + } while (wrote); + + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); bch2_fs_journal_stop(&c->journal); + /* XXX: mark super that alloc info is persistent */ + /* * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: -- cgit From ed1646ca7412151dbdd81f4e4d05437cc99e9c8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Jan 2019 13:46:14 -0500 Subject: bcachefs: Improve c version of __bkey_cmp_bits Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index d35cdde299c4..8a3295ff9631 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1014,11 +1014,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, nr_key_bits -= 64; } - if (l_v != r_v) - return l_v < r_v ? -1 : 1; - - if (!nr_key_bits) - return 0; + if (!nr_key_bits || l_v != r_v) + break; l = next_word(l); r = next_word(r); @@ -1026,6 +1023,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, l_v = *l; r_v = *r; } + + return (l_v > r_v) - (l_v < r_v); } #endif -- cgit From 61c8d7c8eb0e0169e04df64fab45c575aaad0739 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Nov 2018 20:53:51 -0500 Subject: bcachefs: Persist stripe blocks_used Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 +- fs/bcachefs/buckets.c | 59 +++++++++------- fs/bcachefs/ec.c | 179 ++++++++++++++++++++++++++++++++----------------- fs/bcachefs/ec.h | 52 ++++++++++++++ fs/bcachefs/ec_types.h | 7 +- fs/bcachefs/recovery.c | 8 +-- fs/bcachefs/super.c | 6 ++ 7 files changed, 222 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a725a106f6dc..b63dcbdb95c0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -561,6 +561,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) dst_iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ + dst->dirty = true; \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -591,16 +592,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { + BUG_ON(src_iter.pos != dst_iter.pos); + copy_stripe_field(alive, "alive"); copy_stripe_field(sectors, "sectors"); copy_stripe_field(algorithm, "algorithm"); copy_stripe_field(nr_blocks, "nr_blocks"); copy_stripe_field(nr_redundant, "nr_redundant"); - copy_stripe_field(blocks_nonempty.counter, + copy_stripe_field(blocks_nonempty, "blocks_nonempty"); for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) - copy_stripe_field(block_sectors[i].counter, + copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); if (dst->alive) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3e92a1f6d7fc..a1c243f622a8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -605,9 +605,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, int blocks_nonempty_delta; s64 parity_sectors; + 
BUG_ON(!sectors); + m = genradix_ptr(&c->stripes[gc], p.idx); + spin_lock(&c->ec_stripes_heap_lock); + if (!m || !m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); return -1; @@ -623,19 +628,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, parity_sectors = -parity_sectors; sectors += parity_sectors; - new = atomic_add_return(sectors, &m->block_sectors[p.block]); - old = new - sectors; + old = m->block_sectors[p.block]; + m->block_sectors[p.block] += sectors; + new = m->block_sectors[p.block]; blocks_nonempty_delta = (int) !!new - (int) !!old; - if (!blocks_nonempty_delta) - return 0; + if (blocks_nonempty_delta) { + m->blocks_nonempty += blocks_nonempty_delta; - atomic_add(blocks_nonempty_delta, &m->blocks_nonempty); + if (!gc) + bch2_stripes_heap_update(c, m, p.idx); + } - BUG_ON(atomic_read(&m->blocks_nonempty) < 0); + m->dirty = true; - if (!gc) - bch2_stripes_heap_update(c, m, p.idx); + spin_unlock(&c->ec_stripes_heap_lock); update_replicas(c, fs_usage, &m->r.e, sectors); @@ -721,8 +728,6 @@ static void bucket_set_stripe(struct bch_fs *c, new.journal_seq = journal_seq; } })); - - BUG_ON(old.stripe == enabled); } } @@ -737,22 +742,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; + spin_lock(&c->ec_stripes_heap_lock); + if (!m || (!inserting && !m->alive)) { + spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (inserting && m->alive) { - bch_err_ratelimited(c, "error marking stripe %zu: already exists", - idx); - return -1; - } + if (m->alive) + bch2_stripes_heap_del(c, m, idx); - BUG_ON(atomic_read(&m->blocks_nonempty)); - - for (i = 0; i < EC_STRIPE_MAX; i++) - BUG_ON(atomic_read(&m->block_sectors[i])); + memset(m, 0, sizeof(*m)); if (inserting) { m->sectors = le16_to_cpu(s.v->sectors); @@ -768,7 +770,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, for (i = 0; i < s.v->nr_blocks; i++) m->r.e.devs[i] = s.v->ptrs[i].dev; - } /* * XXX: account for stripes somehow here @@ -777,15 +778,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, update_replicas(c, fs_usage, &m->r.e, stripe_sectors); #endif - if (!gc) { - if (inserting) + /* gc recalculates these fields: */ + if (!(flags & BCH_BUCKET_MARK_GC)) { + for (i = 0; i < s.v->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(s.v, i); + m->blocks_nonempty += !!m->block_sectors[i]; + } + } + + if (!gc) bch2_stripes_heap_insert(c, m, idx); else - bch2_stripes_heap_del(c, m, idx); - } else { - m->alive = inserting; + m->alive = true; } + spin_unlock(&c->ec_stripes_heap_lock); + bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5a5baba8d79b..fc73823f6358 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -12,6 +12,7 @@ #include "ec.h" #include "error.h" #include "io.h" +#include "journal_io.h" #include "keylist.h" #include "super-io.h" #include "util.h" @@ -99,40 +100,6 @@ struct ec_bio { /* Stripes btree keys: */ -static unsigned stripe_csums_per_device(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(le16_to_cpu(s->sectors), - 1 << s->csum_granularity_bits); -} - -static unsigned stripe_csum_offset(const struct bch_stripe *s, - unsigned dev, unsigned csum_idx) -{ - unsigned csum_bytes = bch_crc_bytes[s->csum_type]; - - return sizeof(struct bch_stripe) + - 
sizeof(struct bch_extent_ptr) * s->nr_blocks + - (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -} - -static unsigned stripe_blockcount_offset(const struct bch_stripe *s, - unsigned idx) -{ - return stripe_csum_offset(s, s->nr_blocks, 0) + - sizeof(16) * idx; -} - -static unsigned stripe_val_u64s(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), - sizeof(u64)); -} - -static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) -{ - return (void *) s + stripe_csum_offset(s, dev, csum_idx); -} - const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -165,8 +132,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, 1U << s->csum_granularity_bits); for (i = 0; i < s->nr_blocks; i++) - pr_buf(out, " %u:%llu", s->ptrs[i].dev, - (u64) s->ptrs[i].offset); + pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, + (u64) s->ptrs[i].offset, + stripe_blockcount_get(s, i)); } static int ptr_matches_stripe(struct bch_fs *c, @@ -610,29 +578,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) BUG_ON(h->data[m->heap_idx].idx != idx); } -static inline unsigned stripe_entry_blocks(struct stripe *m) -{ - return atomic_read(&m->blocks_nonempty); -} - void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; - bool queue_delete; size_t i; - spin_lock(&c->ec_stripes_heap_lock); - - if (!m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); - return; - } - heap_verify_backpointer(c, idx); - h->data[m->heap_idx].blocks_nonempty = - stripe_entry_blocks(m); + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; i = m->heap_idx; heap_sift_up(h, i, ec_stripes_heap_cmp, @@ -642,44 +596,35 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - queue_delete = stripe_idx_to_delete(c) >= 0; - spin_unlock(&c->ec_stripes_heap_lock); - - if (queue_delete) + if (stripe_idx_to_delete(c) >= 0) schedule_work(&c->ec_stripe_delete_work); } void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { - spin_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); m->alive = false; heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); - spin_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - spin_lock(&c->ec_stripes_heap_lock); - BUG_ON(heap_full(&c->ec_stripes_heap)); heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, - .blocks_nonempty = stripe_entry_blocks(m), + .blocks_nonempty = m->blocks_nonempty, }), ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); m->alive = true; heap_verify_backpointer(c, idx); - - spin_unlock(&c->ec_stripes_heap_lock); } /* stripe deletion */ @@ -1218,6 +1163,116 @@ unlock: mutex_unlock(&c->ec_new_stripe_lock); } +static int __bch2_stripe_write_key(struct bch_fs *c, + struct btree_iter *iter, + struct stripe *m, + size_t idx, + struct bkey_i_stripe *new_key, + unsigned flags) +{ + struct bkey_s_c k; + unsigned i; + int ret; + + bch2_btree_iter_set_pos(iter, POS(0, idx)); + + k = bch2_btree_iter_peek_slot(iter); + ret = btree_iter_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_stripe) + return -EIO; + + bkey_reassemble(&new_key->k_i, k); + + spin_lock(&c->ec_stripes_heap_lock); + + for (i = 0; i < new_key->v.nr_blocks; i++) + 
stripe_blockcount_set(&new_key->v, i, + m->block_sectors[i]); + m->dirty = false; + + spin_unlock(&c->ec_stripes_heap_lock); + + return bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL|flags, + BTREE_INSERT_ENTRY(iter, &new_key->k_i)); +} + +int bch2_stripes_write(struct bch_fs *c, bool *wrote) +{ + struct btree_iter iter; + struct genradix_iter giter; + struct bkey_i_stripe *new_key; + struct stripe *m; + int ret = 0; + + new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); + BUG_ON(!new_key); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + genradix_for_each(&c->stripes[0], giter, m) { + if (!m->dirty) + continue; + + ret = __bch2_stripe_write_key(c, &iter, m, giter.pos, + new_key, BTREE_INSERT_NOCHECK_RW); + if (ret) + break; + + *wrote = true; + } + + bch2_btree_iter_unlock(&iter); + + kfree(new_key); + + return ret; +} + +static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + + struct gc_pos pos = { 0 }; + + bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0); +} + +int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) +{ + struct journal_replay *r; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_fs_ec_start(c); + if (ret) + return ret; + + for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) { + bch2_stripe_read_key(c, k); + bch2_btree_iter_cond_resched(&iter); + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + list_for_each_entry(r, journal_replay_list, list) { + struct bkey_i *k, *n; + struct jset_entry *entry; + + for_each_jset_key(k, n, entry, &r->j) + if (entry->btree_id == BTREE_ID_EC) + bch2_stripe_read_key(c, bkey_i_to_s_c(k)); + } + + return 0; +} + int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) { struct btree_iter iter; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 4a8cade37c7a..7bcbb7e11377 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -14,6 +14,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_stripe_to_text, \ } +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static inline unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) +{ + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(u16) * idx; +} + +static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, + unsigned idx) +{ + return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); +} + +static inline void stripe_blockcount_set(struct bch_stripe *s, + unsigned idx, unsigned v) +{ + __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); + + *p = cpu_to_le16(v); +} + +static inline unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} + +static inline void *stripe_csum(struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + return (void *) s + stripe_csum_offset(s, dev, csum_idx); +} + struct bch_read_bio; struct ec_stripe_buf { @@ -101,6 +150,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct 
bch_fs *); +int bch2_stripes_read(struct bch_fs *, struct list_head *); +int bch2_stripes_write(struct bch_fs *, bool *); + int bch2_ec_mem_alloc(struct bch_fs *, bool); int bch2_fs_ec_start(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e416dac7ee19..5c3f77c8aac7 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -20,9 +20,10 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; - u8 alive; - atomic_t blocks_nonempty; - atomic_t block_sectors[EC_STRIPE_MAX]; + unsigned alive:1; + unsigned dirty:1; + u8 blocks_nonempty; + u16 block_sectors[EC_STRIPE_MAX]; struct bch_replicas_padded r; }; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cfdf9b563637..2eaff9a6fee5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -215,12 +215,12 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - err = "cannot allocate memory"; - ret = bch2_fs_ec_start(c); + ret = bch2_stripes_read(c, &journal); if (ret) goto err; + pr_info("stripes_read done"); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0ad624294052..f1e60c377fa2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -198,6 +198,12 @@ static void __bch2_fs_read_only(struct bch_fs *c) break; } + ret = bch2_stripes_write(c, &wrote); + if (ret) { + bch2_fs_inconsistent(c, "error writing out stripes"); + break; + } + for_each_member_device(ca, c, i) bch2_dev_allocator_quiesce(c, ca); -- cgit From bdba6c29ffc9041257eeeb5916bc3ae6f8828a0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 20:00:57 -0500 Subject: bcachefs: fix inode counting Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a1c243f622a8..d40aa69532cc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -823,7 +823,7 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ret = bch2_mark_stripe(c, k, inserting, fs_usage, journal_seq, flags, gc); break; - case KEY_TYPE_alloc: + case KEY_TYPE_inode: if (inserting) fs_usage->s.nr_inodes++; else -- cgit From 6e1b07183a32583cbe4d781ea0e14e0b06bc44af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 19:50:00 -0500 Subject: bcachefs: improve alloc_debug Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 424636310bbf..a7b4f76d0e61 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -240,27 +240,29 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) if (!fs_usage) return -ENOMEM; - pr_buf(&out, "capacity:\t\t%llu\n", c->capacity); + pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity); + + pr_buf(&out, "hidden:\t\t\t\t%llu\n", + fs_usage->s.hidden); + pr_buf(&out, "data:\t\t\t\t%llu\n", + fs_usage->s.data); + pr_buf(&out, "cached:\t\t\t\t%llu\n", + fs_usage->s.cached); + pr_buf(&out, "reserved:\t\t\t%llu\n", + fs_usage->s.reserved); + pr_buf(&out, "nr_inodes:\t\t\t%llu\n", + fs_usage->s.nr_inodes); + pr_buf(&out, "online reserved:\t\t%llu\n", + fs_usage->s.online_reserved); for (i = 0; i < ARRAY_SIZE(fs_usage->persistent_reserved); i++) { pr_buf(&out, "%u replicas:\n", i + 1); -#if 0 
- for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) - pr_buf(&out, "\t%s:\t\t%llu\n", - bch2_data_types[type], - stats.replicas[replicas].data[type]); - pr_buf(&out, "\terasure coded:\t%llu\n", - stats.replicas[replicas].ec_data); -#endif - pr_buf(&out, "\treserved:\t%llu\n", + pr_buf(&out, "\treserved:\t\t%llu\n", fs_usage->persistent_reserved[i]); } - pr_buf(&out, "online reserved:\t%llu\n", - fs_usage->s.online_reserved); - for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); -- cgit From eac3ca0f49737ba3120ccaa990877b2a05bc88cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 16:50:48 -0500 Subject: bcachefs: New journal_entry_res mechanism Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 57 ++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/journal.h | 4 ++++ fs/bcachefs/journal_types.h | 11 +++++++++ 3 files changed, 61 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 310553bd5323..dd10f1c993e5 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -64,11 +64,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->data->u64s = 0; } -static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) -{ - return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); -} - static inline bool journal_entry_empty(struct jset *j) { struct jset_entry *i; @@ -130,7 +125,7 @@ static enum { j->prev_buf_sectors = vstruct_blocks_plus(buf->data, c->block_bits, - journal_entry_u64s_reserve(buf)) * + buf->u64s_reserved) * c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); @@ -225,6 +220,7 @@ static int journal_entry_open(struct journal *j) return sectors; buf->disk_sectors = sectors; + buf->u64s_reserved = j->entry_u64s_reserved; sectors = min_t(unsigned, sectors, buf->size >> 9); j->cur_buf_sectors = sectors; @@ -233,11 +229,7 @@ static int journal_entry_open(struct journal *j) /* Subtract the journal header */ u64s -= sizeof(struct jset) / sizeof(u64); - /* - * Btree roots, prio pointers don't get added until right before we do - * the write: - */ - u64s -= journal_entry_u64s_reserve(buf); + u64s -= buf->u64s_reserved; u64s = max_t(ssize_t, 0L, u64s); BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); @@ -437,6 +429,45 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *j, + struct journal_entry_res *res, + unsigned new_u64s) +{ + union journal_res_state state; + int d = new_u64s - res->u64s; + + spin_lock(&j->lock); + + j->entry_u64s_reserved += d; + if (d <= 0) + goto out_unlock; + + j->cur_entry_u64s -= d; + smp_mb(); + state = READ_ONCE(j->reservations); + + if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && + state.cur_entry_offset > j->cur_entry_u64s) { + j->cur_entry_u64s += d; + /* + * Not enough room in current journal entry, have to flush it: + */ + __journal_entry_close(j); + goto out; + } + + journal_cur_buf(j)->u64s_reserved += d; +out_unlock: + spin_unlock(&j->lock); +out: + res->u64s += d; + return; +} + +/* journal flushing: */ + u64 bch2_journal_last_unwritten_seq(struct journal *j) { u64 seq; @@ -1024,6 +1055,10 @@ int bch2_fs_journal_init(struct journal *j) j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; + /* Btree roots: */ + j->entry_u64s_reserved += + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); + atomic64_set(&j->reservations.counter, ((union 
journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index d9c094ba2ca0..6ef34bdae628 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -333,6 +333,10 @@ out: return 0; } +void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + u64 bch2_journal_last_unwritten_seq(struct journal *); int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 51e453652d67..5f6d2320c5cd 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -24,6 +24,7 @@ struct journal_buf { unsigned size; unsigned disk_sectors; + unsigned u64s_reserved; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -155,6 +156,9 @@ struct journal { u64 seq_ondisk; u64 last_seq_ondisk; + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + /* * FIFO of journal entries whose btree updates have not yet been * written out. @@ -243,4 +247,11 @@ struct journal_device { struct closure read; }; +/* + * journal_entry_res - reserve space in every journal entry: + */ +struct journal_entry_res { + unsigned u64s; +}; + #endif /* _BCACHEFS_JOURNAL_TYPES_H */ -- cgit From fe112812ae41bfed0aa61cdfbe8233e4122e5cb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 18:32:13 -0500 Subject: bcachefs: sysfs trigger for bch2_alloc_write Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a7b4f76d0e61..f33a533ee6b8 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -133,6 +133,7 @@ do { \ write_attribute(trigger_journal_flush); write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); +write_attribute(trigger_alloc_write); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -495,6 +496,12 @@ STORE(__bch2_fs) if (attr == &sysfs_trigger_gc) bch2_gc(c, NULL, false); + if (attr == &sysfs_trigger_alloc_write) { + bool wrote; + + bch2_alloc_write(c, false, &wrote); + } + if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -587,6 +594,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, + &sysfs_trigger_alloc_write, &sysfs_prune_cache, &sysfs_copy_gc_enabled, -- cgit From 4c97e04aa8818da266a690543aca28e2e7c26820 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2019 11:42:13 -0500 Subject: bcachefs: percpu utility code Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 5 +---- fs/bcachefs/replicas.c | 6 ++---- fs/bcachefs/sysfs.c | 13 ++++--------- fs/bcachefs/util.h | 23 +++++++++++++++++++++++ 4 files changed, 30 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d40aa69532cc..d919c1cacee5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1005,10 +1005,7 @@ void bch2_mark_update(struct btree_insert *trans, static u64 bch2_recalc_sectors_available(struct bch_fs *c) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0; + percpu_u64_set(&c->pcpu->sectors_available, 0); return avail_factor(bch2_fs_sectors_free(c)); } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 34a5475cfaba..5663441fd7ce 100644 --- 
a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -426,14 +426,12 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct bch_replicas_cpu n; - u64 v = 0; - int cpu; + u64 v; if (__replicas_has_entry(&c->replicas_gc, e)) continue; - for_each_possible_cpu(cpu) - v += *per_cpu_ptr(&c->usage[0]->data[i], cpu); + v = percpu_u64_get(&c->usage[0]->data[i]); if (!v) continue; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f33a533ee6b8..8ad7b6026d1b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -893,20 +893,15 @@ static const char * const bch2_rw[] = { static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); - int rw, i, cpu; + int rw, i; for (rw = 0; rw < 2; rw++) { pr_buf(&out, "%s:\n", bch2_rw[rw]); - for (i = 1; i < BCH_DATA_NR; i++) { - u64 n = 0; - - for_each_possible_cpu(cpu) - n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i]; - + for (i = 1; i < BCH_DATA_NR; i++) pr_buf(&out, "%-12s:%12llu\n", - bch2_data_types[i], n << 9); - } + bch2_data_types[i], + percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } return out.pos - buf; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fa1a3adc87df..dc40a52ac8c7 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -701,6 +702,28 @@ do { \ } \ } while (0) +static inline u64 percpu_u64_get(u64 __percpu *src) +{ + u64 ret = 0; + int cpu; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(src, cpu); + return ret; +} + +static inline void percpu_u64_set(u64 __percpu *dst, u64 src) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(dst, cpu) = 0; + + preempt_disable(); + *this_cpu_ptr(dst) = src; + preempt_enable(); +} + static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) { unsigned i; -- cgit From b935a8a67ab744d20002aefd7f8053381b38532f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Feb 2019 16:15:29 -0500 Subject: bcachefs: Fix a bug when shutting down before allocator started Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/super.c | 9 ++++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 686287d12d14..83b3445cc76f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1535,6 +1535,8 @@ int bch2_fs_allocator_start(struct bch_fs *c) } } + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + return bch2_alloc_write(c, false, &wrote); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2d67c9911fbb..4a0e2f63e645 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -477,6 +477,7 @@ enum { /* startup: */ BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOCATOR_STARTED, + BCH_FS_ALLOCATOR_RUNNING, BCH_FS_INITIAL_GC_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f1e60c377fa2..9ec05410aeb4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -191,6 +191,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); + if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) + goto allocator_not_running; + do { ret = bch2_alloc_write(c, false, &wrote); if (ret) { @@ -219,10 +222,12 @@ static void __bch2_fs_read_only(struct bch_fs *c) 
closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); } while (wrote); - +allocator_not_running: for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); + clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + bch2_fs_journal_stop(&c->journal); /* XXX: mark super that alloc info is persistent */ @@ -349,6 +354,8 @@ const char *bch2_fs_read_write(struct bch_fs *c) goto err; } + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + err = "error starting btree GC thread"; if (bch2_gc_thread_start(c)) goto err; -- cgit From 736affa8bb3eeff49014dff9fd728f7ba3ffa550 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Feb 2019 14:43:53 -0500 Subject: bcachefs: fix for unmount hang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 83b3445cc76f..c44e0417af87 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1257,7 +1257,8 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) { - closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full); + if (ca->alloc_thread) + closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full); } /* stop allocator thread: */ -- cgit From 94cd106f8db3e454fa900acc1ffcbc5dcc0eddd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Feb 2019 16:50:53 -0500 Subject: bcachefs: delete a debug printk Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2eaff9a6fee5..2ff86262d41c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -215,10 +215,11 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + bch_verbose(c, "starting stripes_read"); ret = bch2_stripes_read(c, &journal); if (ret) goto err; - pr_info("stripes_read done"); + bch_verbose(c, "stripes_read done"); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -- cgit From 24547d097a520312cf9d727d3af8da1b0c985f98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Feb 2019 18:24:20 -0500 Subject: bcachefs: fix bch2_sb_field_resize() Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index dafdc45b442c..2ad1266e167d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -188,6 +188,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, } } + f = bch2_sb_field_get(sb->sb, type); f = __bch2_sb_field_resize(sb, f, u64s); if (f) f->type = cpu_to_le32(type); -- cgit From 2c5af169f72c1018e83b79ac82ffe387534910e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 17:12:00 -0500 Subject: bcachefs: reserve space in journal for fs usage entries Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bcachefs_format.h | 17 ++++++++++++++++- fs/bcachefs/journal_io.c | 39 +++++++++++++++++++++++++++------------ fs/bcachefs/recovery.c | 3 ++- fs/bcachefs/replicas.c | 34 ++++++++++++++++++++++++++++++++++ fs/bcachefs/replicas.h | 2 ++ fs/bcachefs/super.c | 1 + 7 files changed, 84 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4a0e2f63e645..90c44ef0fbe0 100644 --- 
a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -545,6 +545,8 @@ struct bch_fs { struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + struct journal_entry_res replicas_journal_res; + struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f6cf4ccedcb1..bd41628f2995 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1362,7 +1362,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(btree_root, 1) \ x(prio_ptrs, 2) \ x(blacklist, 3) \ - x(blacklist_v2, 4) + x(blacklist_v2, 4) \ + x(usage, 5) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1392,6 +1393,20 @@ struct jset_entry_blacklist_v2 { __le64 end; }; +enum { + FS_USAGE_REPLICAS = 0, + FS_USAGE_INODES = 1, + FS_USAGE_KEY_VERSION = 2, + FS_USAGE_NR = 3 +}; + +struct jset_entry_usage { + struct jset_entry entry; + __le64 sectors; + __u8 type; + struct bch_replicas_entry r; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9c794c9a1924..173aecfaebc2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -284,6 +284,7 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); + goto out; } bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); @@ -293,6 +294,28 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } +out: +fsck_err: + return ret; +} + +static int journal_entry_validate_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } fsck_err: return ret; @@ -315,18 +338,10 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { static int journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, int write) { - int ret = 0; - - if (entry->type >= BCH_JSET_ENTRY_NR) { - journal_entry_err(c, "invalid journal entry type %u", - entry->type); - journal_entry_null_range(entry, vstruct_next(entry)); - return 0; - } - - ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); -fsck_err: - return ret; + return entry->type < BCH_JSET_ENTRY_NR + ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, + entry, write) + : 0; } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2ff86262d41c..cb9601dfcd37 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -108,7 +108,8 @@ static bool journal_empty(struct list_head *journal) list_for_each_entry(i, journal, list) { vstruct_for_each(&i->j, entry) { - if (entry->type == BCH_JSET_ENTRY_btree_root) + if (entry->type == BCH_JSET_ENTRY_btree_root || + entry->type == BCH_JSET_ENTRY_usage) continue; if (entry->type == BCH_JSET_ENTRY_btree_keys && diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 5663441fd7ce..991d409b6a86 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "journal.h" #include "replicas.h" #include "super-io.h" @@ -302,6 +303,27 @@ err: return ret; } +static unsigned reserve_journal_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + unsigned journal_res_u64s = 0; + + /* nr_inodes: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* key_version: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + for_each_cpu_replicas_entry(r, e) + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage) + + e->nr_devs, sizeof(u64)); + return journal_res_u64s; +} + noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) @@ -329,6 +351,10 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; + + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &new_r)); } if (!new_r.entries && @@ -595,6 +621,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); + ret = replicas_table_update(c, &new_r); percpu_up_write(&c->mark_lock); @@ -915,3 +942,10 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) return ret; } + +int bch2_fs_replicas_init(struct bch_fs *c) +{ + c->journal.entry_u64s_reserved += + reserve_journal_replicas(c, &c->replicas); + return 0; +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 923bddb21ec3..4fabe0372ec3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -80,4 +80,6 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; +int bch2_fs_replicas_init(struct bch_fs *); + #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9ec05410aeb4..8a5ee2835bbd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -658,6 +658,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || + bch2_fs_replicas_init(c) || bch2_fs_btree_cache_init(c) || bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || -- cgit From 3ccc5c50f29fb6934fb71e12983f1f2beabe29b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 17:54:51 -0500 Subject: bcachefs: Write out fs usage Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 33 ++++-------- 
fs/bcachefs/replicas.c | 5 -- fs/bcachefs/replicas.h | 5 ++ fs/bcachefs/super-io.c | 131 ++++++++++++++++++++++++++++++++++++----------- fs/bcachefs/super-io.h | 4 ++ 5 files changed, 120 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 173aecfaebc2..2f04f0074ec4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -863,19 +863,6 @@ err: /* journal write: */ -static void bch2_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - struct jset_entry *entry; - - entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); - entry->type = BCH_JSET_ENTRY_btree_root; - entry->btree_id = id; - entry->level = level; - memcpy_u64s(entry->_data, k, k->k.u64s); -} - static unsigned journal_dev_buckets_available(struct journal *j, struct journal_device *ja) { @@ -1206,25 +1193,27 @@ void bch2_journal_write(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_prev_buf(j); + struct jset_entry *start, *end; struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; bool validate_before_checksum = false; - unsigned i, sectors, bytes; + unsigned i, sectors, bytes, u64s; journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - if (r->alive) - bch2_journal_add_btree_root(w, i, &r->key, r->level); - } - c->btree_roots_dirty = false; - mutex_unlock(&c->btree_root_lock); + start = vstruct_last(w->data); + end = bch2_journal_super_entries_add_common(c, start, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + + le32_add_cpu(&w->data->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > + w->disk_sectors); journal_write_compact(jset); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 991d409b6a86..8495cac29a14 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -30,11 +30,6 @@ static void replicas_entry_sort(struct bch_replicas_entry *e) bubble_sort(e->devs, e->nr_devs, u8_cmp); } -#define for_each_cpu_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 4fabe0372ec3..35164887dffb 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -57,6 +57,11 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) + /* iterate over superblock replicas - used by userspace tools: */ #define replicas_entry_bytes(_i) \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2ad1266e167d..9e991be3d90d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -885,29 +885,112 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); } -void 
bch2_fs_mark_clean(struct bch_fs *c, bool clean) +static void bch2_fs_mark_dirty(struct bch_fs *c) { - struct bch_sb_field_clean *sb_clean; - unsigned u64s = sizeof(*sb_clean) / sizeof(u64); - struct jset_entry *entry; - struct btree_root *r; - mutex_lock(&c->sb_lock); - if (clean == BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, clean); + if (BCH_SB_CLEAN(c->disk_sb.sb)) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} - if (!clean) - goto write_super; +struct jset_entry * +bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry *entry, + u64 journal_seq) +{ + struct jset_entry_usage *u; + struct btree_root *r; + unsigned i; mutex_lock(&c->btree_root_lock); for (r = c->btree_roots; r < c->btree_roots + BTREE_ID_NR; r++) - if (r->alive) - u64s += jset_u64s(r->key.u64s); + if (r->alive) { + entry->u64s = r->key.u64s; + entry->btree_id = r - c->btree_roots; + entry->level = r->level; + entry->type = BCH_JSET_ENTRY_btree_root; + bkey_copy(&entry->start[0], &r->key); + + entry = vstruct_next(entry); + } + c->btree_roots_dirty = false; + + mutex_unlock(&c->btree_root_lock); + + if (journal_seq) + return entry; + + percpu_down_write(&c->mark_lock); + + { + u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes); + + u = container_of(entry, struct jset_entry_usage, entry); + memset(u, 0, sizeof(*u)); + u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + u->entry.type = BCH_JSET_ENTRY_usage; + u->sectors = cpu_to_le64(nr_inodes); + u->type = FS_USAGE_INODES; + + entry = vstruct_next(entry); + } + + { + u = container_of(entry, struct jset_entry_usage, entry); + memset(u, 0, sizeof(*u)); + u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + u->entry.type = BCH_JSET_ENTRY_usage; + u->sectors = cpu_to_le64(atomic64_read(&c->key_version)); + u->type = FS_USAGE_KEY_VERSION; + + entry = vstruct_next(entry); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + u64 sectors = percpu_u64_get(&c->usage[0]->data[i]); + + u = container_of(entry, struct jset_entry_usage, entry); + u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, + sizeof(u64)) - 1; + u->entry.type = BCH_JSET_ENTRY_usage; + u->sectors = cpu_to_le64(sectors); + u->type = FS_USAGE_REPLICAS; + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + + entry = vstruct_next(entry); + } + + percpu_up_write(&c->mark_lock); + + return entry; +} + +void bch2_fs_mark_clean(struct bch_fs *c, bool clean) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + + if (!clean) { + bch2_fs_mark_dirty(c); + return; + } + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); if (!sb_clean) { @@ -921,30 +1004,16 @@ void bch2_fs_mark_clean(struct bch_fs *c, bool clean) sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1; entry = sb_clean->start; + entry = bch2_journal_super_entries_add_common(c, entry, 0); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + memset(entry, 0, vstruct_end(&sb_clean->field) - (void *) entry); - for (r = c->btree_roots; - r < c->btree_roots + BTREE_ID_NR; - r++) - if (r->alive) { - entry->u64s = r->key.u64s; - entry->btree_id = r - c->btree_roots; - entry->level = r->level; - 
entry->type = BCH_JSET_ENTRY_btree_root; - bkey_copy(&entry->start[0], &r->key); - entry = vstruct_next(entry); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - } - - BUG_ON(entry != vstruct_end(&sb_clean->field)); - if (le16_to_cpu(c->disk_sb.sb->version) < bcachefs_metadata_version_bkey_renumber) bch2_sb_clean_renumber(sb_clean, WRITE); - mutex_unlock(&c->btree_root_lock); -write_super: bch2_write_super(c); out: mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index ac3b704f0540..498a9e887d4e 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -135,6 +135,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_clean: */ +struct jset_entry * +bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry *, u64); + void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); void bch2_fs_mark_clean(struct bch_fs *, bool); -- cgit From 42b72e0ba226fa15dda0fb9e9f1646a1ae5d03fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 19:09:49 -0500 Subject: bcachefs: journal_replay_early() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 16 ------- fs/bcachefs/btree_gc.h | 1 - fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 1 - fs/bcachefs/recovery.c | 94 ++++++++++++++++++++++++++++++++----- fs/bcachefs/replicas.c | 28 +++++++++++ fs/bcachefs/replicas.h | 4 ++ 7 files changed, 115 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b63dcbdb95c0..ac3fa1efb649 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1249,19 +1249,3 @@ int bch2_gc_thread_start(struct bch_fs *c) wake_up_process(p); return 0; } - -/* Initial GC computes bucket marks during startup */ - -int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) -{ - int ret = bch2_gc(c, journal, true); - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type) - atomic64_add(1 << 16, &c->key_version); - - return ret; -} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 89ee72ac49f6..9eb2b0527a92 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -8,7 +8,6 @@ void bch2_coalesce(struct bch_fs *); int bch2_gc(struct bch_fs *, struct list_head *, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); -int bch2_initial_gc(struct bch_fs *, struct list_head *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ce5127301cb2..b5a4853451a7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -475,6 +475,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; + s8 error; }; /* diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4bc7be9b5298..451b293c44a6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2122,7 +2122,6 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) BUG_ON(btree_node_root(c, b)); __bch2_btree_set_root_inmem(c, b); - bch2_btree_set_root_ondisk(c, b, READ); } void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cb9601dfcd37..6349c394be45 100644 --- a/fs/bcachefs/recovery.c +++ 
b/fs/bcachefs/recovery.c @@ -13,16 +13,17 @@ #include "journal_io.h" #include "quota.h" #include "recovery.h" +#include "replicas.h" #include "super-io.h" #include #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) { struct bkey_i *k; struct jset_entry *entry, *start, *end; @@ -50,6 +51,51 @@ found: return k; } +static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry *entry) +{ + int ret = 0; + + switch (entry->type) { + case BCH_JSET_ENTRY_btree_root: { + struct btree_root *r = &c->btree_roots[entry->btree_id]; + + if (entry->u64s) { + r->level = entry->level; + bkey_copy(&r->key, &entry->start[0]); + r->error = 0; + } else { + r->error = -EIO; + } + r->alive = true; + break; + } + case BCH_JSET_ENTRY_usage: { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + switch (u->type) { + case FS_USAGE_REPLICAS: + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->sectors)); + break; + case FS_USAGE_INODES: + percpu_u64_set(&c->usage[0]->s.nr_inodes, + le64_to_cpu(u->sectors)); + break; + case FS_USAGE_KEY_VERSION: + atomic64_set(&c->key_version, + le64_to_cpu(u->sectors)); + break; + } + + break; + } + } + + return ret; +} + static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean, struct jset *j) @@ -126,6 +172,7 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL; + struct jset_entry *entry; LIST_HEAD(journal); struct jset *j = NULL; unsigned i; @@ -178,28 +225,44 @@ int bch2_fs_recovery(struct bch_fs *c) fsck_err_on(clean && !journal_empty(&journal), c, "filesystem marked clean but journal not empty"); + err = "insufficient memory"; if (clean) { c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = journal_replay_entry_early(c, entry); + if (ret) + goto err; + } } else { + struct journal_replay *i; + c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); + + list_for_each_entry(i, &journal, list) + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + goto err; + } } for (i = 0; i < BTREE_ID_NR; i++) { - unsigned level; - struct bkey_i *k; + struct btree_root *r = &c->btree_roots[i]; - k = btree_root_find(c, clean, j, i, &level); - if (!k) + if (!r->alive) continue; err = "invalid btree root pointer"; - if (IS_ERR(k)) + if (r->error) goto err; err = "error reading btree root"; - if (bch2_btree_root_read(c, i, k, level)) { + if (bch2_btree_root_read(c, i, &r->key, r->level)) { if (i != BTREE_ID_ALLOC) goto err; @@ -226,13 +289,20 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; - ret = bch2_initial_gc(c, &journal); + ret = bch2_gc(c, &journal, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if 
(c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + if (c->opts.noreplay) goto out; @@ -319,7 +389,7 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - ret = bch2_initial_gc(c, &journal); + ret = bch2_gc(c, &journal, true); if (ret) goto err; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 8495cac29a14..52a422ac5ace 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -530,6 +530,34 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } +int bch2_replicas_set_usage(struct bch_fs *c, + struct bch_replicas_entry *r, + u64 sectors) +{ + int ret, idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) { + struct bch_replicas_cpu n; + + n = cpu_replicas_add_entry(&c->replicas, r); + if (!n.entries) + return -ENOMEM; + + ret = replicas_table_update(c, &n); + if (ret) + return ret; + + kfree(n.entries); + + idx = bch2_replicas_entry_idx(c, r); + BUG_ON(ret < 0); + } + + percpu_u64_set(&c->usage[0]->data[idx], sectors); + + return 0; +} + /* Replicas tracking - superblock: */ static int diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 35164887dffb..d1457c786bb5 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -57,6 +57,10 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); +int bch2_replicas_set_usage(struct bch_fs *, + struct bch_replicas_entry *, + u64); + #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ -- cgit From 3e0745e28363c1675a05775425312c049d5857b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jan 2019 20:25:40 -0500 Subject: bcachefs: initialize fs usage summary in recovery Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 27 +++++++++++++++++++++++++++ fs/bcachefs/buckets.h | 1 + fs/bcachefs/recovery.c | 8 ++++++++ fs/bcachefs/super.c | 9 ++++++--- 4 files changed, 42 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d919c1cacee5..f65132a0ebf4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -114,6 +114,33 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) } } +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; + unsigned i, nr; + + percpu_down_write(&c->mark_lock); + nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr; + usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + switch (e->data_type) { + case BCH_DATA_BTREE: + case BCH_DATA_USER: + usage->s.data += usage->data[i]; + break; + case BCH_DATA_CACHED: + usage->s.cached += usage->data[i]; + break; + } + } + + percpu_up_write(&c->mark_lock); +} + #define bch2_usage_read_raw(_stats) \ ({ \ typeof(*this_cpu_ptr(_stats)) _acc; \ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ee8c9e9a1f23..e5b9d5cb1215 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -248,6 +248,7 @@ static inline u64 bch2_fs_sectors_free(struct bch_fs *c) /* key/bucket marking: */ void bch2_bucket_seq_cleanup(struct bch_fs *); +void bch2_fs_usage_initialize(struct bch_fs *); void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); diff 
--git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6349c394be45..a9b8d565c82f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -6,6 +6,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "buckets.h" #include "dirent.h" #include "ec.h" #include "error.h" @@ -251,6 +252,8 @@ int bch2_fs_recovery(struct bch_fs *c) } } + bch2_fs_usage_initialize(c); + for (i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = &c->btree_roots[i]; @@ -384,6 +387,11 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); + mutex_lock(&c->sb_lock); + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8a5ee2835bbd..29cb12d841e7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1071,9 +1071,12 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, 0); - mutex_unlock(&c->sb_lock); + if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && + !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, 0); + mutex_unlock(&c->sb_lock); + } bch2_dev_sysfs_online(c, ca); -- cgit From 3577df5f7f25f6669c4b53e76cf159d550a0fd83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Feb 2019 19:20:57 -0500 Subject: bcachefs: serialize persistent_reserved Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 13 +++++++++---- fs/bcachefs/buckets.c | 3 +++ fs/bcachefs/journal_io.c | 21 +++++++++++++++++++++ fs/bcachefs/recovery.c | 24 +++++++++++++++++------- fs/bcachefs/replicas.c | 7 ++++++- fs/bcachefs/super-io.c | 43 ++++++++++++++++++++++++++++++++----------- 6 files changed, 88 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bd41628f2995..71ba708c3e2b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1363,7 +1363,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(prio_ptrs, 2) \ x(blacklist, 3) \ x(blacklist_v2, 4) \ - x(usage, 5) + x(usage, 5) \ + x(data_usage, 6) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1394,7 +1395,7 @@ struct jset_entry_blacklist_v2 { }; enum { - FS_USAGE_REPLICAS = 0, + FS_USAGE_RESERVED = 0, FS_USAGE_INODES = 1, FS_USAGE_KEY_VERSION = 2, FS_USAGE_NR = 3 @@ -1402,8 +1403,12 @@ enum { struct jset_entry_usage { struct jset_entry entry; - __le64 sectors; - __u8 type; + __le64 v; +} __attribute__((packed)); + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; struct bch_replicas_entry r; } __attribute__((packed)); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f65132a0ebf4..d2e047ee29cf 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -123,6 +123,9 @@ void bch2_fs_usage_initialize(struct bch_fs *c) nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr; usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); + for (i = 0; i < BCH_REPLICAS_MAX; i++) + usage->s.reserved += usage->persistent_reserved[i]; + for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2f04f0074ec4..bfa1045b0eb5 100644 --- 
a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -309,6 +309,27 @@ static int journal_entry_validate_usage(struct bch_fs *c, unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); int ret = 0; + if (journal_entry_err_on(bytes < sizeof(*u), + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_data_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, c, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a9b8d565c82f..31d2bce7bb57 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -75,23 +75,32 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - switch (u->type) { - case FS_USAGE_REPLICAS: - ret = bch2_replicas_set_usage(c, &u->r, - le64_to_cpu(u->sectors)); + switch (entry->btree_id) { + case FS_USAGE_RESERVED: + if (entry->level < BCH_REPLICAS_MAX) + percpu_u64_set(&c->usage[0]-> + persistent_reserved[entry->level], + le64_to_cpu(u->v)); break; case FS_USAGE_INODES: percpu_u64_set(&c->usage[0]->s.nr_inodes, - le64_to_cpu(u->sectors)); + le64_to_cpu(u->v)); break; case FS_USAGE_KEY_VERSION: atomic64_set(&c->key_version, - le64_to_cpu(u->sectors)); + le64_to_cpu(u->v)); break; } break; } + case BCH_JSET_ENTRY_data_usage: { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->v)); + break; + } } return ret; @@ -156,7 +165,8 @@ static bool journal_empty(struct list_head *journal) list_for_each_entry(i, journal, list) { vstruct_for_each(&i->j, entry) { if (entry->type == BCH_JSET_ENTRY_btree_root || - entry->type == BCH_JSET_ENTRY_usage) + entry->type == BCH_JSET_ENTRY_usage || + entry->type == BCH_JSET_ENTRY_data_usage) continue; if (entry->type == BCH_JSET_ENTRY_btree_keys && diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 52a422ac5ace..6fee8fe37688 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -312,9 +312,14 @@ static unsigned reserve_journal_replicas(struct bch_fs *c, journal_res_u64s += DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + /* persistent_reserved: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * + BCH_REPLICAS_MAX; + for_each_cpu_replicas_entry(r, e) journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage) + + DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + e->nr_devs, sizeof(u64)); return journal_res_u64s; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9e991be3d90d..0cc8565b070f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -900,7 +900,6 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry *entry, u64 journal_seq) { - struct jset_entry_usage *u; struct btree_root *r; unsigned i; @@ -929,24 +928,45 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, { u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes); + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); - u = container_of(entry, 
struct jset_entry_usage, entry); memset(u, 0, sizeof(*u)); u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; - u->sectors = cpu_to_le64(nr_inodes); - u->type = FS_USAGE_INODES; + u->entry.btree_id = FS_USAGE_INODES; + u->v = cpu_to_le64(nr_inodes); entry = vstruct_next(entry); } { - u = container_of(entry, struct jset_entry_usage, entry); + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + memset(u, 0, sizeof(*u)); u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; - u->sectors = cpu_to_le64(atomic64_read(&c->key_version)); - u->type = FS_USAGE_KEY_VERSION; + u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + + entry = vstruct_next(entry); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + u64 sectors = percpu_u64_get(&c->usage[0]->persistent_reserved[i]); + + if (!sectors) + continue; + + memset(u, 0, sizeof(*u)); + u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.level = i; + u->v = sectors; entry = vstruct_next(entry); } @@ -955,13 +975,14 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); u64 sectors = percpu_u64_get(&c->usage[0]->data[i]); + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); - u = container_of(entry, struct jset_entry_usage, entry); + memset(u, 0, sizeof(*u)); u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, sizeof(u64)) - 1; - u->entry.type = BCH_JSET_ENTRY_usage; - u->sectors = cpu_to_le64(sectors); - u->type = FS_USAGE_REPLICAS; + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(sectors); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), "embedded variable length struct"); -- cgit From 1df42b571535ed3fd8d9d94f674c626746dc0275 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2019 11:56:51 -0500 Subject: bcachefs: don't do initial gc if have alloc info feature Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 4 ++++ fs/bcachefs/btree_gc.c | 6 ++---- fs/bcachefs/recovery.c | 18 ++++++++++++------ fs/bcachefs/super-io.c | 7 ++++++- 5 files changed, 25 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 90c44ef0fbe0..81597383dc20 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -568,6 +568,7 @@ struct bch_fs { u32 time_base_hi; u32 time_precision; u64 features; + u64 compat; } sb; struct bch_sb_handle disk_sb; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 71ba708c3e2b..a663f9d3fb51 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1279,6 +1279,10 @@ enum bch_sb_features { BCH_FEATURE_NR, }; +enum bch_sb_compat { + BCH_COMPAT_FEAT_ALLOC_INFO = 0, +}; + /* options: */ #define BCH_REPLICAS_MAX 4U diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ac3fa1efb649..899bdfa4d6d3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -577,7 +577,8 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) percpu_down_write(&c->mark_lock); - if (initial) { + if (initial && + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) { 
bch2_gc_done_nocheck(c); goto out; } @@ -819,9 +820,6 @@ out: bch2_gc_free(c); up_write(&c->gc_lock); - if (!ret && initial) - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 31d2bce7bb57..1c09ae4f5f2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -300,14 +300,18 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - bch_verbose(c, "starting mark and sweep:"); - err = "error in recovery"; - ret = bch2_gc(c, &journal, true); - if (ret) - goto err; - bch_verbose(c, "mark and sweep done"); + if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + c->opts.fsck) { + bch_verbose(c, "starting mark and sweep:"); + err = "error in recovery"; + ret = bch2_gc(c, &journal, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + } clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); /* * Skip past versions that might have possibly been used (as nonces), @@ -411,6 +415,8 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) if (bch2_dev_journal_alloc(ca)) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0cc8565b070f..ff9728b62b6e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -371,6 +371,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); c->sb.time_precision = le32_to_cpu(src->time_precision); c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); for_each_member_device(ca, c, i) ca->mi = bch2_mi_to_cpu(mi->members + i); @@ -888,8 +889,10 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) static void bch2_fs_mark_dirty(struct bch_fs *c) { mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) { + if (BCH_SB_CLEAN(c->disk_sb.sb) || + (c->disk_sb.sb->compat[0] & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) { SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); bch2_write_super(c); } mutex_unlock(&c->sb_lock); @@ -1011,6 +1014,8 @@ void bch2_fs_mark_clean(struct bch_fs *c, bool clean) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -- cgit From 6d033aa404e71710de217d63999fade2f2b0491f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Feb 2019 19:45:36 -0500 Subject: bcachefs: Don't need to walk inodes on clean shutdown Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 810e1c3f4c49..e79846a96f9c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1187,6 +1187,11 @@ static int check_inode(struct bch_fs *c, } if (u.bi_flags & BCH_INODE_UNLINKED) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu unlinked", + u.bi_inum); + bch_verbose(c, "deleting inode %llu", u.bi_inum); ret = bch2_inode_rm(c, u.bi_inum); @@ -1389,16 +1394,13 @@ static int check_inodes_fast(struct bch_fs *c) (BCH_INODE_I_SIZE_DIRTY| BCH_INODE_I_SECTORS_DIRTY| 
BCH_INODE_UNLINKED)) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean but found inode %llu with flags %x", - inode.k->p.inode, inode.v->bi_flags); ret = check_inode(c, NULL, &iter, inode, NULL); BUG_ON(ret == -EINTR); if (ret) break; } } -fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; } @@ -1460,9 +1462,10 @@ int bch2_fsck(struct bch_fs *c) if (c->opts.fsck) return bch2_fsck_full(c); - if (!c->sb.clean && - !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) - return bch2_fsck_inode_nlink(c); + if (c->sb.clean) + return 0; - return bch2_fsck_walk_inodes_only(c); + return c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK) + ? bch2_fsck_walk_inodes_only(c) + : bch2_fsck_inode_nlink(c); } -- cgit From f7e76361c4c77941ffe0b4bb25db04642714a283 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Feb 2019 19:16:55 -0500 Subject: bcachefs: no need to run gc when initializing new fs Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1c09ae4f5f2f..e28917cf2cec 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -407,16 +407,11 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - ret = bch2_gc(c, &journal, true); - if (ret) - goto err; - - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) if (bch2_dev_journal_alloc(ca)) { -- cgit From 66393392533048b7e45a202f349974d16c4c9ea3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Feb 2019 15:33:14 -0500 Subject: bcachefs: Fix a lockdep splat Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ff9728b62b6e..0b3a761fe93e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -143,7 +143,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); if (!new_sb) return -ENOMEM; -- cgit From 053dbb377dd2874942a1ca8517d454f35916a782 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Feb 2019 19:04:40 -0500 Subject: bcachefs: Fix a locking bug Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + fs/bcachefs/btree_update_leaf.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c44e0417af87..34c1de63e43b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -883,6 +883,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ret = __bch2_alloc_write_key(c, ca, b, &iter, must_flush ? &journal_seq : NULL, + BTREE_INSERT_GC_LOCK_HELD| !fifo_empty(&ca->free_inc) ? 
BTREE_INSERT_NOWAIT : 0); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e052a3debadb..d1a2ac48ed29 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -624,6 +624,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans) /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); trans_for_each_entry(trans, i) -- cgit From 76f4c7b0c33d86bfa3973655ea6ed6182039ca99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Feb 2019 19:27:33 -0500 Subject: bcachefs: Fix oldest_gen handling Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 + fs/bcachefs/bcachefs.h | 2 - fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_gc.c | 119 ++++++++++++----------------------------- fs/bcachefs/buckets.c | 10 ---- fs/bcachefs/buckets.h | 4 +- fs/bcachefs/buckets_types.h | 1 + 7 files changed, 43 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 34c1de63e43b..9d2e21d99e6e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -183,6 +183,7 @@ static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) g->_mark.data_type = get_alloc_field(a, &d, idx++); g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++); g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); + g->oldest_gen = get_alloc_field(a, &d, idx++); } static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, @@ -200,6 +201,7 @@ static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, put_alloc_field(a, &d, idx++, m.data_type); put_alloc_field(a, &d, idx++, m.dirty_sectors); put_alloc_field(a, &d, idx++, m.cached_sectors); + put_alloc_field(a, &d, idx++, g->oldest_gen); set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 81597383dc20..0b495dd32f67 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -399,8 +399,6 @@ struct bch_dev { struct bucket_array __rcu *buckets[2]; unsigned long *buckets_nouse; unsigned long *buckets_written; - /* most out of date gen in the btree */ - u8 *oldest_gens; struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage[2]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a663f9d3fb51..c7971e5c7c36 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -830,7 +830,8 @@ struct bch_alloc { x(write_time, 2) \ x(data_type, 1) \ x(dirty_sectors, 2) \ - x(cached_sectors, 2) + x(cached_sectors, 2) \ + x(oldest_gen, 1) enum { #define x(name, bytes) BCH_ALLOC_FIELD_##name, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 899bdfa4d6d3..391389d431c8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -167,9 +167,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = __bucket(ca, b, true); - if (gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; + if (gen_after(g->oldest_gen, ptr->gen)) + g->oldest_gen = ptr->gen; *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } @@ -486,89 +487,38 @@ static void bch2_gc_free(struct bch_fs *c) 
percpu_up_write(&c->mark_lock); } -static void bch2_gc_done_nocheck(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); - struct stripe *dst, *src; - - c->ec_stripes_heap.used = 0; - - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - *dst = *src; - - if (dst->alive) - bch2_stripes_heap_insert(c, dst, dst_iter.pos); - - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); - } - } - - for_each_member_device(ca, c, i) { - struct bucket_array *src = __bucket_array(ca, 1); - - memcpy(__bucket_array(ca, 0), src, - sizeof(struct bucket_array) + - sizeof(struct bucket) * src->nbuckets); - }; - - for_each_member_device(ca, c, i) { - unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); - struct bch_dev_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[0], nr); - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[1], nr); - - *dst = *src; - } - - { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + - c->replicas.nr; - struct bch_fs_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) c->usage[0], nr); - struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage[1], nr); - unsigned offset = offsetof(typeof(*dst), s.gc_start); - - memcpy((void *) dst + offset, - (void *) src + offset, - nr * sizeof(u64) - offset); - } -} - static void bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_dev *ca; + bool verify = !initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); unsigned i; #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ - bch_err(c, _msg ": got %llu, should be %llu, fixing" \ - , ##__VA_ARGS__, dst->_f, src->_f); \ + if (verify) \ + bch_err(c, _msg ": got %llu, should be %llu, fixing"\ + , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ - bch_err_ratelimited(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u, fixing", \ - dst_iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f); \ + if (verify) \ + bch_err_ratelimited(c, "stripe %zu has wrong "_msg\ + ": got %u, should be %u, fixing", \ + dst_iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ - bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ - ": got %u, should be %u, fixing", \ - i, b, dst->b[b].mark._f, src->b[b].mark._f); \ + if (verify) \ + bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ + ": got %u, should be %u, fixing", i, b, \ + dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ + dst->b[b]._mark.dirty = true; \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -577,12 +527,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) percpu_down_write(&c->mark_lock); - if (initial && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) { - bch2_gc_done_nocheck(c); - goto out; - } - { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); @@ -633,6 +577,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); + + if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { + dst->b[b].oldest_gen = src->b[b].oldest_gen; + dst->b[b]._mark.dirty = true; + } } }; @@ -645,16 +594,16 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) unsigned b; for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(buckets[b], - "buckets[%s]", bch2_data_types[b]); - copy_dev_field(buckets_alloc, "buckets_alloc"); - copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets[b], "buckets[%s]", + bch2_data_types[b]); + copy_dev_field(buckets_alloc, "buckets_alloc"); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(sectors[b], - "sectors[%s]", bch2_data_types[b]); - copy_dev_field(sectors_fragmented, - "sectors_fragmented"); + copy_dev_field(sectors[b], "sectors[%s]", + bch2_data_types[b]); + copy_dev_field(sectors_fragmented, "sectors_fragmented"); } { @@ -682,7 +631,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) copy_fs_field(data[i], "data[%i]", i); } } -out: + percpu_up_write(&c->mark_lock); #undef copy_fs_field @@ -745,7 +694,9 @@ static int bch2_gc_start(struct bch_fs *c) dst->nbuckets = src->nbuckets; for (b = 0; b < src->nbuckets; b++) - dst->b[b]._mark.gen = src->b[b].mark.gen; + dst->b[b]._mark.gen = + dst->b[b].oldest_gen = + src->b[b].mark.gen; }; percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d2e047ee29cf..5a3ecbcd5ad4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1148,7 +1148,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_nouse = NULL; unsigned long *buckets_written = NULL; - u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; @@ -1174,8 +1173,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || - !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), - GFP_KERNEL|__GFP_ZERO)) || !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || @@ -1210,9 +1207,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); - memcpy(oldest_gens, - ca->oldest_gens, - n * sizeof(u8)); memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); @@ -1224,7 +1218,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; - swap(ca->oldest_gens, oldest_gens); swap(ca->buckets_nouse, buckets_nouse); swap(ca->buckets_written, buckets_written); @@ -1268,8 +1261,6 @@ err: 
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(buckets_written, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); - kvpfree(oldest_gens, - nbuckets * sizeof(u8)); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); @@ -1289,7 +1280,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e5b9d5cb1215..885280899dc6 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -87,7 +87,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) { - return bucket(ca, b)->mark.gen - ca->oldest_gens[b]; + struct bucket *g = bucket(ca, b); + + return g->mark.gen - g->oldest_gen; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 65b4bb39f88e..6eaee889f1e1 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -39,6 +39,7 @@ struct bucket { }; u16 io_time[2]; + u8 oldest_gen; unsigned gen_valid:1; }; -- cgit From 39fbc5a49f3377d21980cdc34c5fb55332bff3b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Feb 2019 22:08:09 -0500 Subject: bcachefs: gc lock no longer needed for disk reservations Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 - fs/bcachefs/btree_gc.c | 21 ++++++++++----------- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_interior.c | 5 ++--- fs/bcachefs/btree_update_leaf.c | 12 ------------ fs/bcachefs/buckets.c | 37 ++++++++++++------------------------- fs/bcachefs/buckets.h | 17 ++--------------- fs/bcachefs/extents.c | 8 ++------ 8 files changed, 28 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9d2e21d99e6e..7c57de5390b4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1474,7 +1474,6 @@ not_enough: &journal_seq); fifo_push(&ca->free[RESERVE_BTREE], bu); - bucket_set_dirty(ca, bu); } } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 391389d431c8..315f2d76947a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -141,24 +141,23 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = PTR_BUCKET(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr, true); if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" "type %u gen %u", k.k->type, ptr->gen)) { - g->_mark.gen = ptr->gen; - g->gen_valid = 1; - bucket_set_dirty(ca, b); + g->_mark.gen = ptr->gen; + g->_mark.dirty = true; + g->gen_valid = 1; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->gen_valid = 1; - bucket_set_dirty(ca, b); + g->_mark.gen = ptr->gen; + g->_mark.dirty = true; + g->gen_valid = 1; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -166,8 +165,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, 
bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = __bucket(ca, b, true); + struct bucket *g = PTR_BUCKET(ca, ptr, true); if (gen_after(g->oldest_gen, ptr->gen)) g->oldest_gen = ptr->gen; @@ -646,13 +644,14 @@ static int bch2_gc_start(struct bch_fs *c) struct bch_dev *ca; unsigned i; + percpu_down_write(&c->mark_lock); + /* * indicate to stripe code that we need to allocate for the gc stripes * radix tree, too */ gc_pos_set(c, gc_phase(GC_PHASE_START)); - percpu_down_write(&c->mark_lock); BUG_ON(c->usage[1]); c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) + diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b5a4853451a7..5f0e0009ec5d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -490,7 +490,6 @@ enum btree_insert_ret { /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_ENOSPC, - BTREE_INSERT_NEED_GC_LOCK, BTREE_INSERT_NEED_MARK_REPLICAS, }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 451b293c44a6..6dff960e095d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -484,7 +484,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, struct btree *b; struct disk_reservation disk_res = { 0, 0 }; unsigned sectors = nr_nodes * c->opts.btree_node_size; - int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD; + int ret, disk_res_flags = 0; if (flags & BTREE_INSERT_NOFAIL) disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; @@ -1947,8 +1947,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, c->opts.btree_node_size * bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), - BCH_DISK_RESERVATION_NOFAIL| - BCH_DISK_RESERVATION_GC_LOCK_HELD); + BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); parent = btree_node_parent(iter, b); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d1a2ac48ed29..5555c6e1c7cf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -719,18 +719,6 @@ err: ret = -EINTR; } break; - case BTREE_INSERT_NEED_GC_LOCK: - ret = -EINTR; - - if (!down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) - goto out; - - bch2_btree_iter_unlock(trans->entries[0].iter); - down_read(&c->gc_lock); - } - up_read(&c->gc_lock); - break; case BTREE_INSERT_ENOSPC: ret = -ENOSPC; break; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5a3ecbcd5ad4..9aa369c6f28e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -407,14 +407,14 @@ static inline void update_cached_sectors(struct bch_fs *c, } static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old, + size_t b, struct bucket_mark *ret, bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark new; + struct bucket_mark old, new; - *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = true; @@ -425,9 +425,12 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - if (old->cached_sectors) + if (old.cached_sectors) update_cached_sectors(c, fs_usage, ca->dev_idx, - -old->cached_sectors); + -old.cached_sectors); + + if (ret) + 
*ret = old; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -437,6 +440,9 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, __bch2_invalidate_bucket(c, ca, b, old, false); + if (gc_visited(c, gc_phase(GC_PHASE_START))) + __bch2_invalidate_bucket(c, ca, b, NULL, true); + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); @@ -1091,24 +1097,8 @@ out: return 0; recalculate: - /* - * GC recalculates sectors_available when it starts, so that hopefully - * we don't normally end up blocking here: - */ - - /* - * Piss fuck, we can be called from extent_insert_fixup() with btree - * locks held: - */ - - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { - if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) - down_read(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) - return -EINTR; - } - percpu_down_write(&c->mark_lock); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || @@ -1125,9 +1115,6 @@ recalculate: percpu_up_write(&c->mark_lock); - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) - up_read(&c->gc_lock); - return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 885280899dc6..ecc4ae22f736 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -57,18 +57,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline void bucket_set_dirty(struct bch_dev *ca, size_t b) -{ - struct bucket *g; - struct bucket_mark m; - - rcu_read_lock(); - g = bucket(ca, b); - bucket_cmpxchg(g, m, m.dirty = true); - rcu_read_unlock(); - -} - static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -99,7 +87,8 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, } static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) + const struct bch_extent_ptr *ptr, + bool gc) { return bucket(ca, PTR_BUCKET_NR(ca, ptr)); } @@ -285,8 +274,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1) -#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2) int bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1d96a1773f74..41194462be30 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -979,10 +979,8 @@ bch2_extent_can_insert(struct btree_insert *trans, if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; switch (bch2_disk_reservation_add(trans->c, trans->disk_res, @@ -991,8 +989,6 @@ bch2_extent_can_insert(struct btree_insert *trans, break; case -ENOSPC: return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; default: BUG(); } -- cgit From 2ecc6171a3267fe24d7ee28059631a630344c310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Feb 2019 14:58:55 -0500 Subject: bcachefs: Fix double counting when gc is running Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 9 +++------ fs/bcachefs/buckets.c | 12 ++---------- fs/bcachefs/buckets.h | 2 +- 3 files changed, 6 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6dff960e095d..60aa28e3f5f8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1088,8 +1088,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); preempt_enable(); percpu_up_read(&c->mark_lock); @@ -1192,8 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_disassemble(b, k, &tmp), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_node(b)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); preempt_enable(); percpu_up_read(&c->mark_lock); @@ -1993,8 +1991,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9aa369c6f28e..58074f791c62 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -255,8 +255,7 @@ static bool bucket_became_unavailable(struct bucket_mark old, int bch2_fs_usage_apply(struct bch_fs *c, struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) + struct disk_reservation *disk_res) { s64 added = fs_usage->s.data + fs_usage->s.reserved; s64 should_not_have_added; @@ -285,13 +284,6 @@ int bch2_fs_usage_apply(struct bch_fs *c, acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), (u64 *) fs_usage, sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); - - if (gc_visited(c, gc_pos)) { - BUG_ON(!c->usage[1]); - acc_u64s((u64 *) this_cpu_ptr(c->usage[1]), - (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); - } preempt_enable(); return ret; @@ -1001,7 +993,7 @@ void bch2_mark_update(struct btree_insert *trans, bch2_btree_node_iter_advance(&node_iter, b); } - if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) && + if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) && !warned_disk_usage && !xchg(&warned_disk_usage, 1)) { char buf[200]; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ecc4ae22f736..4d4a10203e5c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -260,7 +260,7 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); int 
bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); + struct disk_reservation *); /* disk reservations: */ -- cgit From 8777210b92c661a50fb8147574cb0a366566ae07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Feb 2019 15:03:47 -0500 Subject: bcachefs: refactor key marking code a bit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 15 +++--- fs/bcachefs/buckets.c | 122 +++++++++++++++++++++---------------------------- fs/bcachefs/buckets.h | 4 +- 3 files changed, 62 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 315f2d76947a..922d34abc675 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -232,12 +232,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_verify_btree_nr_keys(b); + gc_pos_set(c, gc_pos_btree_node(b)); + ret = btree_gc_mark_node(c, b, &max_stale, initial); if (ret) break; - gc_pos_set(c, gc_pos_btree_node(b)); - if (!initial) { if (max_stale > 64) bch2_btree_node_rewrite(c, &iter, @@ -623,10 +623,13 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { - /* - * XXX: print out replicas entry - */ - copy_fs_field(data[i], "data[%i]", i); + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + char buf[80]; + + bch2_replicas_entry_to_text(&PBUF(buf), e); + + copy_fs_field(data[i], "%s", buf); } } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 58074f791c62..16d82832277c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -398,9 +398,22 @@ static inline void update_cached_sectors(struct bch_fs *c, update_replicas(c, fs_usage, &r.e, sectors); } -static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *ret, - bool gc) +#define do_mark_fn(fn, c, pos, flags, ...) 
\ +({ \ + int gc, ret = 0; \ + \ + percpu_rwsem_assert_held(&c->mark_lock); \ + \ + for (gc = 0; gc < 2 && !ret; gc++) \ + if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ + (gc && gc_visited(c, pos))) \ + ret = fn(c, __VA_ARGS__, gc); \ + ret; \ +}) + +static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *ret, + bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); @@ -421,28 +434,25 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); - if (ret) + if (!gc) *ret = old; + return 0; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { - percpu_rwsem_assert_held(&c->mark_lock); - - __bch2_invalidate_bucket(c, ca, b, old, false); - - if (gc_visited(c, gc_phase(GC_PHASE_START))) - __bch2_invalidate_bucket(c, ca, b, NULL, true); + do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, + ca, b, old); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); } -static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - bool gc) +static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); @@ -454,20 +464,16 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); + + return 0; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { - percpu_rwsem_assert_held(&c->mark_lock); - - if (!(flags & BCH_BUCKET_MARK_GC)) - __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); - - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) - __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true); + do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, + ca, b, owned_by_allocator); } #define checked_add(a, b) \ @@ -477,9 +483,9 @@ do { \ BUG_ON((a) != _res); \ } while (0) -static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type type, - unsigned sectors, bool gc) +static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); @@ -493,6 +499,8 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, new.data_type = type; checked_add(new.dirty_sectors, sectors); })); + + return 0; } void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -506,15 +514,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); if (likely(c)) { - percpu_rwsem_assert_held(&c->mark_lock); - - if (!(flags & BCH_BUCKET_MARK_GC)) - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, - false); - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, - true); + do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, + ca, b, type, sectors); } else { struct bucket *g; struct bucket_mark new; @@ -833,30 +834,28 @@ static int __bch2_mark_key(struct bch_fs *c, struct 
bkey_s_c k, unsigned journal_seq, unsigned flags, bool gc) { - int ret = 0; + if (!fs_usage || gc) + fs_usage = this_cpu_ptr(c->usage[gc]); switch (k.k->type) { case KEY_TYPE_btree_ptr: - ret = bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_extent(c, k, inserting + ? c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_extent: - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, inserting, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_stripe(c, k, inserting, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_inode: if (inserting) fs_usage->s.nr_inodes++; else fs_usage->s.nr_inodes--; - break; + return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -866,13 +865,11 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, fs_usage->s.reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; - break; + return 0; } default: - break; + return 0; } - - return ret; } int bch2_mark_key_locked(struct bch_fs *c, @@ -882,26 +879,9 @@ int bch2_mark_key_locked(struct bch_fs *c, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - int ret; - - if (!(flags & BCH_BUCKET_MARK_GC)) { - ret = __bch2_mark_key(c, k, inserting, sectors, - fs_usage ?: this_cpu_ptr(c->usage[0]), - journal_seq, flags, false); - if (ret) - return ret; - } - - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) { - ret = __bch2_mark_key(c, k, inserting, sectors, - this_cpu_ptr(c->usage[1]), - journal_seq, flags, true); - if (ret) - return ret; - } - - return 0; + return do_mark_fn(__bch2_mark_key, c, pos, flags, + k, inserting, sectors, fs_usage, + journal_seq, flags); } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4d4a10203e5c..ffdf176d7ed2 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -249,8 +249,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_GC (1 << 1) +#define BCH_BUCKET_MARK_GC (1 << 0) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 1) int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, -- cgit From 8c96cfccf045efff12d8287a41f2b8f4ef3094c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Feb 2019 15:17:23 -0500 Subject: bcachefs: fix more locking bugs Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/btree_update_interior.c | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7c57de5390b4..5b9d6c77d037 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -886,7 +886,8 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ret = __bch2_alloc_write_key(c, ca, b, &iter, must_flush ? &journal_seq : NULL, BTREE_INSERT_GC_LOCK_HELD| - !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); + (!fifo_empty(&ca->free_inc) + ? 
BTREE_INSERT_NOWAIT : 0)); } bch2_btree_iter_unlock(&iter); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 60aa28e3f5f8..7ccf2f935701 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1567,7 +1567,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!down_read_trylock(&c->gc_lock)) { + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) { if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; @@ -1610,7 +1611,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, */ __bch2_btree_iter_downgrade(iter, 1); out: - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); closure_sync(&cl); return ret; } @@ -1688,7 +1690,8 @@ retry: } /* We're changing btree topology, doesn't mix with gc: */ - if (!down_read_trylock(&c->gc_lock)) + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) goto err_cycle_gc_lock; if (!bch2_btree_iter_upgrade(iter, U8_MAX, @@ -1748,7 +1751,8 @@ retry: bch2_btree_update_done(as); - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); out: bch2_btree_iter_verify_locks(iter); @@ -1779,7 +1783,8 @@ err_cycle_gc_lock: err_unlock: six_unlock_intent(&m->lock); - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); err: BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); -- cgit From 73c27c60956ed55d165d41658745dead49d689c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Feb 2019 15:42:41 -0500 Subject: bcachefs: fixes for cached data accounting Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 16d82832277c..949541f15e7d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -432,7 +432,7 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, if (old.cached_sectors) update_cached_sectors(c, fs_usage, ca->dev_idx, - -old.cached_sectors); + -((s64) old.cached_sectors)); if (!gc) *ret = old; @@ -561,7 +561,7 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, * loop, to avoid racing with the start of gc clearing all the marks - GC does * that with the gc pos seqlock held. */ -static void bch2_mark_pointer(struct bch_fs *c, +static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, @@ -589,7 +589,7 @@ static void bch2_mark_pointer(struct bch_fs *c, BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); EBUG_ON(!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - return; + return true; } if (!p.ptr.cached) @@ -620,6 +620,8 @@ static void bch2_mark_pointer(struct bch_fs *c, bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); + + return false; } static int bch2_mark_stripe_ptr(struct bch_fs *c, @@ -702,13 +704,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 disk_sectors = data_type == BCH_DATA_BTREE ? 
sectors : ptr_disk_sectors_delta(p, sectors); - - bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags, gc); + bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, + fs_usage, journal_seq, flags, gc); if (p.ptr.cached) { - update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors); + if (disk_sectors && !stale) + update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors); } else if (!p.ec_nr) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; -- cgit From 8fe826f90aad4ea314d0acdf7425a9bf2324e17f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Feb 2019 14:46:32 -0500 Subject: bcachefs: Convert bucket invalidation to key marking path Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 211 +++++++++++++++++++++++++++++++++++------ fs/bcachefs/alloc_background.h | 9 ++ fs/bcachefs/bcachefs_format.h | 16 ++-- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 9 +- fs/bcachefs/buckets.c | 71 ++++++++++++-- fs/bcachefs/buckets.h | 3 +- fs/bcachefs/fifo.h | 2 +- fs/bcachefs/journal_io.c | 3 +- 9 files changed, 276 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5b9d6c77d037..04b75367fcde 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -129,6 +129,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } +struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a) +{ + struct bkey_alloc_unpacked ret = { .gen = a->gen }; + const void *d = a->data; + unsigned idx = 0; + +#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); + BCH_ALLOC_FIELDS() +#undef x + return ret; +} + +static void bch2_alloc_pack(struct bkey_i_alloc *dst, + const struct bkey_alloc_unpacked src) +{ + unsigned idx = 0; + void *d = dst->v.data; + + dst->v.fields = 0; + dst->v.gen = src.gen; + +#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); + BCH_ALLOC_FIELDS() +#undef x + + set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v); +} + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -174,16 +202,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) { const void *d = a->data; - unsigned idx = 0; + unsigned idx = 0, data_type, dirty_sectors, cached_sectors; + struct bucket_mark m; - g->_mark.gen = a->gen; - g->gen_valid = 1; g->io_time[READ] = get_alloc_field(a, &d, idx++); g->io_time[WRITE] = get_alloc_field(a, &d, idx++); - g->_mark.data_type = get_alloc_field(a, &d, idx++); - g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++); - g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); + data_type = get_alloc_field(a, &d, idx++); + dirty_sectors = get_alloc_field(a, &d, idx++); + cached_sectors = get_alloc_field(a, &d, idx++); g->oldest_gen = get_alloc_field(a, &d, idx++); + + bucket_cmpxchg(g, m, ({ + m.gen = a->gen; + m.data_type = data_type; + m.dirty_sectors = dirty_sectors; + m.cached_sectors = cached_sectors; + })); + + g->gen_valid = 1; } static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, @@ -318,6 +354,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOMARK| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); if (ret) @@ -361,7 +398,8 
@@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ? 0 : bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY, + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK, BTREE_INSERT_ENTRY(&iter, k)); err: bch2_btree_iter_unlock(&iter); @@ -827,6 +865,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca) return -1; } +/* + * returns sequence number of most recent journal entry that updated this + * bucket: + */ +static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) +{ + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + return bucket_seq; + } else { + return 0; + } +} + +static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca, + struct btree_iter *iter, + u64 *journal_seq, unsigned flags) +{ +#if 0 + __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; +#else + /* hack: */ + __BKEY_PADDED(k, 8) alloc_key; +#endif + struct bkey_i_alloc *a; + struct bkey_alloc_unpacked u; + struct bucket_mark m; + struct bkey_s_c k; + bool invalidating_cached_data; + size_t b; + int ret; + + BUG_ON(!ca->alloc_heap.used || + !ca->alloc_heap.data[0].nr); + b = ca->alloc_heap.data[0].bucket; + + /* first, put on free_inc and mark as owned by allocator: */ + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + + verify_not_on_freelist(c, ca, b); + + BUG_ON(!fifo_push(&ca->free_inc, b)); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + m = bucket(ca, b)->mark; + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + + bch2_btree_iter_cond_resched(iter); + + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = btree_iter_err(k); + if (ret) + return ret; + + if (k.k && k.k->type == KEY_TYPE_alloc) + u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + else + memset(&u, 0, sizeof(u)); + + invalidating_cached_data = u.cached_sectors != 0; + + //BUG_ON(u.dirty_sectors); + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = c->bucket_clock[READ].hand; + u.write_time = c->bucket_clock[WRITE].hand; + u.gen++; + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + + ret = bch2_btree_insert_at(c, NULL, + invalidating_cached_data ? journal_seq : NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + if (ret == -EINTR) + goto retry; + + if (!ret) { + /* remove from alloc_heap: */ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + top->bucket++; + top->nr--; + + if (!top->nr) + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + + /* + * Make sure we flush the last journal entry that updated this + * bucket (i.e. 
deleting the last reference) before writing to + * this bucket again: + */ + *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + } else { + size_t b2; + + /* remove from free_inc: */ + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + + bch2_mark_alloc_bucket(c, ca, b, false, + gc_pos_alloc(c, NULL), 0); + + BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); + BUG_ON(b != b2); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + } + + return ret; +} + static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket, u64 *flush_seq) { @@ -847,18 +1021,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, percpu_up_read(&c->mark_lock); - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - *flush_seq = max(*flush_seq, bucket_seq); - } + *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); return m.cached_sectors != 0; } @@ -871,7 +1034,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) struct btree_iter iter; u64 journal_seq = 0; int ret = 0; - long b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -879,16 +1041,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && !fifo_full(&ca->free_inc) && - (b = next_alloc_bucket(ca)) >= 0) { - bool must_flush = - bch2_invalidate_one_bucket(c, ca, b, &journal_seq); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, - must_flush ? &journal_seq : NULL, + ca->alloc_heap.used) + ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq, BTREE_INSERT_GC_LOCK_HELD| (!fifo_empty(&ca->free_inc) ? 
BTREE_INSERT_NOWAIT : 0)); - } bch2_btree_iter_unlock(&iter); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 04f1e9152494..ff6eccf904af 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -6,6 +6,15 @@ #include "alloc_types.h" #include "debug.h" +struct bkey_alloc_unpacked { + u8 gen; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS() +#undef x +}; + +struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *); + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c7971e5c7c36..9a3ca6fa30b7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -826,12 +826,12 @@ struct bch_alloc { } __attribute__((packed, aligned(8))); #define BCH_ALLOC_FIELDS() \ - x(read_time, 2) \ - x(write_time, 2) \ - x(data_type, 1) \ - x(dirty_sectors, 2) \ - x(cached_sectors, 2) \ - x(oldest_gen, 1) + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) enum { #define x(name, bytes) BCH_ALLOC_FIELD_##name, @@ -841,12 +841,12 @@ enum { }; static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes, +#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, BCH_ALLOC_FIELDS() #undef x }; -#define x(name, bytes) + bytes +#define x(name, bits) + (bits / 8) static const unsigned BKEY_ALLOC_VAL_U64s_MAX = DIV_ROUND_UP(offsetof(struct bch_alloc, data) BCH_ALLOC_FIELDS(), sizeof(u64)); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5f0e0009ec5d..7e58e82daec1 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -456,6 +456,7 @@ static inline bool btree_node_is_extents(struct btree *b) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { switch (type) { + case BKEY_TYPE_ALLOC: case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9bcab29bd033..1fd01fb40482 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -82,6 +82,7 @@ enum { __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_NOMARK, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -108,12 +109,12 @@ enum { #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -/* - * Insert is for journal replay: don't get journal reservations, or mark extents - * (bch_mark_key) - */ +/* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +/* Don't call bch2_mark_key: */ +#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 949541f15e7d..3286ee26f7e2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -476,6 +476,60 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ca, b, owned_by_allocator); } +static int bch2_mark_alloc(struct bch_fs *c, struct 
bkey_s_c k, + bool inserting, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, + bool gc) +{ + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark old, m; + + if (!inserting) + return 0; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ + if (flags & BCH_BUCKET_MARK_GC) + return 0; + + u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, gc); + + /* + * this should currently only be getting called from the bucket + * invalidate path: + */ + BUG_ON(u.dirty_sectors); + BUG_ON(u.cached_sectors); + BUG_ON(!g->mark.owned_by_allocator); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ + m.gen = u.gen; + m.data_type = u.data_type; + m.dirty_sectors = u.dirty_sectors; + m.cached_sectors = u.cached_sectors; + })); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; + + if (old.cached_sectors) { + update_cached_sectors(c, fs_usage, ca->dev_idx, + -old.cached_sectors); + trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), + old.cached_sectors); + } + + return 0; +} + #define checked_add(a, b) \ do { \ unsigned _res = (unsigned) (a) + (b); \ @@ -840,18 +894,21 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, fs_usage = this_cpu_ptr(c->usage[gc]); switch (k.k->type) { + case KEY_TYPE_alloc: + return bch2_mark_alloc(c, k, inserting, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_btree_ptr: return bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - fs_usage, journal_seq, flags, gc); + ? c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_extent: return bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); case KEY_TYPE_stripe: return bch2_mark_stripe(c, k, inserting, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); case KEY_TYPE_inode: if (inserting) fs_usage->s.nr_inodes++; @@ -922,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans, preempt_disable(); fs_usage = bch2_fs_usage_get_scratch(c); - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!(trans->flags & BTREE_INSERT_NOMARK)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ffdf176d7ed2..973bf605cbd9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -17,13 +17,14 @@ #define bucket_cmpxchg(g, new, expr) \ ({ \ + struct bucket *_g = g; \ u64 _v = atomic64_read(&(g)->_mark.v); \ struct bucket_mark _old; \ \ do { \ (new).v.counter = _old.v.counter = _v; \ expr; \ - } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \ + } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ _old.v.counter, \ (new).v.counter)) != _old.v.counter);\ _old; \ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 0cd5f1931aac..cdb272708a4b 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -101,7 +101,7 @@ do { \ ({ \ bool _r = !fifo_empty((fifo)); \ if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ _r; \ }) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bfa1045b0eb5..17eba4269719 100644 --- a/fs/bcachefs/journal_io.c +++ 
b/fs/bcachefs/journal_io.c @@ -854,7 +854,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) ret = bch2_btree_insert(c, entry->btree_id, k, &disk_res, NULL, BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); } if (ret) { -- cgit From 768ac63924775d9fe2e76fbb254704d5ee3bcb85 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Feb 2019 18:38:52 -0500 Subject: bcachefs: Add a mechanism for blocking the journal Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/btree_gc.c | 12 +++--- fs/bcachefs/buckets.c | 94 ++++++++++++++++++++++++------------------ fs/bcachefs/buckets.h | 9 +--- fs/bcachefs/buckets_types.h | 28 ++++++------- fs/bcachefs/chardev.c | 6 +-- fs/bcachefs/journal.c | 44 ++++++++++++++++++-- fs/bcachefs/journal.h | 3 ++ fs/bcachefs/journal_types.h | 3 ++ fs/bcachefs/recovery.c | 2 +- fs/bcachefs/replicas.c | 8 ++-- fs/bcachefs/super-io.c | 4 +- fs/bcachefs/sysfs.c | 14 +++---- 13 files changed, 138 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f40fca9328f9..ba0640e3f981 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -724,7 +724,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head, static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) { u64 stranded = c->write_points_nr * c->bucket_size_max; - u64 free = bch2_fs_sectors_free(c); + u64 free = bch2_fs_usage_read_short(c).free; return stranded * factor > free; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 922d34abc675..5091966b7b54 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -612,11 +612,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) struct bch_fs_usage *src = (void *) bch2_acc_percpu_u64s((void *) c->usage[1], nr); - copy_fs_field(s.hidden, "hidden"); - copy_fs_field(s.data, "data"); - copy_fs_field(s.cached, "cached"); - copy_fs_field(s.reserved, "reserved"); - copy_fs_field(s.nr_inodes, "nr_inodes"); + copy_fs_field(hidden, "hidden"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes, "nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(persistent_reserved[i], @@ -629,7 +629,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) bch2_replicas_entry_to_text(&PBUF(buf), e); - copy_fs_field(data[i], "%s", buf); + copy_fs_field(replicas[i], "%s", buf); } } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3286ee26f7e2..ac54d82f9e11 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -124,7 +124,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); for (i = 0; i < BCH_REPLICAS_MAX; i++) - usage->s.reserved += usage->persistent_reserved[i]; + usage->reserved += usage->persistent_reserved[i]; for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = @@ -133,10 +133,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c) switch (e->data_type) { case BCH_DATA_BTREE: case BCH_DATA_USER: - usage->s.data += usage->data[i]; + usage->data += usage->replicas[i]; break; case BCH_DATA_CACHED: - usage->s.cached += usage->data[i]; + usage->cached += usage->replicas[i]; break; } } @@ -144,21 +144,16 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -#define 
bch2_usage_read_raw(_stats) \ -({ \ - typeof(*this_cpu_ptr(_stats)) _acc; \ - \ - memset(&_acc, 0, sizeof(_acc)); \ - acc_u64s_percpu((u64 *) &_acc, \ - (u64 __percpu *) _stats, \ - sizeof(_acc) / sizeof(u64)); \ - \ - _acc; \ -}) - struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_raw(ca->usage[0]); + struct bch_dev_usage ret; + + memset(&ret, 0, sizeof(ret)); + acc_u64s_percpu((u64 *) &ret, + (u64 __percpu *) ca->usage[0], + sizeof(ret) / sizeof(u64)); + + return ret; } struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) @@ -198,27 +193,44 @@ static u64 avail_factor(u64 r) return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) { - return min(fs_usage.s.hidden + - fs_usage.s.data + - reserve_factor(fs_usage.s.reserved + - fs_usage.s.online_reserved), + return min(fs_usage->hidden + + fs_usage->data + + reserve_factor(fs_usage->reserved + + fs_usage->online_reserved), c->capacity); } +static struct bch_fs_usage_short +__bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + u64 data, reserved; + + ret.capacity = c->capacity - + percpu_u64_get(&c->usage[0]->hidden); + + data = percpu_u64_get(&c->usage[0]->data); + reserved = percpu_u64_get(&c->usage[0]->reserved) + + percpu_u64_get(&c->usage[0]->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; + + ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); + + return ret; +} + struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *c) { - struct bch_fs_usage_summarized usage = - bch2_usage_read_raw(&c->usage[0]->s); struct bch_fs_usage_short ret; - ret.capacity = READ_ONCE(c->capacity) - usage.hidden; - ret.used = min(ret.capacity, usage.data + - reserve_factor(usage.reserved + - usage.online_reserved)); - ret.nr_inodes = usage.nr_inodes; + percpu_down_read(&c->mark_lock); + ret = __bch2_fs_usage_read_short(c); + percpu_up_read(&c->mark_lock); return ret; } @@ -257,7 +269,7 @@ int bch2_fs_usage_apply(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct disk_reservation *disk_res) { - s64 added = fs_usage->s.data + fs_usage->s.reserved; + s64 added = fs_usage->data + fs_usage->reserved; s64 should_not_have_added; int ret = 0; @@ -277,7 +289,7 @@ int bch2_fs_usage_apply(struct bch_fs *c, if (added > 0) { disk_res->sectors -= added; - fs_usage->s.online_reserved -= added; + fs_usage->online_reserved -= added; } preempt_disable(); @@ -295,7 +307,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, int nr, s64 size) { if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) - fs_usage->s.hidden += size; + fs_usage->hidden += size; dev_usage->buckets[type] += nr; } @@ -381,10 +393,10 @@ static inline void update_replicas(struct bch_fs *c, BUG_ON(!sectors); if (r->data_type == BCH_DATA_CACHED) - fs_usage->s.cached += sectors; + fs_usage->cached += sectors; else - fs_usage->s.data += sectors; - fs_usage->data[idx] += sectors; + fs_usage->data += sectors; + fs_usage->replicas[idx] += sectors; } static inline void update_cached_sectors(struct bch_fs *c, @@ -911,9 +923,9 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, fs_usage, journal_seq, flags, gc); case KEY_TYPE_inode: if (inserting) - fs_usage->s.nr_inodes++; + fs_usage->nr_inodes++; else - fs_usage->s.nr_inodes--; + fs_usage->nr_inodes--; 
return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -922,7 +934,7 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->s.reserved += sectors; + fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; return 0; } @@ -1074,13 +1086,13 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) { percpu_u64_set(&c->pcpu->sectors_available, 0); - return avail_factor(bch2_fs_sectors_free(c)); + return avail_factor(__bch2_fs_usage_read_short(c).free); } void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors); + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); percpu_up_read(&c->mark_lock); res->sectors = 0; @@ -1120,7 +1132,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->s.online_reserved, sectors); + this_cpu_add(c->usage[0]->online_reserved, sectors); res->sectors += sectors; preempt_enable(); @@ -1136,7 +1148,7 @@ recalculate: (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->s.online_reserved, sectors); + this_cpu_add(c->usage[0]->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 973bf605cbd9..67a1d17610f3 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -225,18 +225,11 @@ static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); -static inline u64 bch2_fs_sectors_free(struct bch_fs *c) -{ - struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); - - return usage.capacity - usage.used; -} - /* key/bucket marking: */ void bch2_bucket_seq_cleanup(struct bch_fs *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6eaee889f1e1..348d062dd744 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -64,35 +64,33 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* summarized: */ - struct bch_fs_usage_summarized { - u64 online_reserved; + u64 online_reserved; - /* fields after online_reserved are cleared/recalculated by gc: */ - u64 gc_start[0]; + /* fields after online_reserved are cleared/recalculated by gc: */ + u64 gc_start[0]; - u64 hidden; - u64 data; - u64 cached; - u64 reserved; - u64 nr_inodes; + u64 hidden; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; - /* XXX: add stats for compression ratio */ + /* XXX: add stats for compression ratio */ #if 0 - u64 uncompressed; - u64 compressed; + u64 uncompressed; + u64 compressed; #endif - } s; /* broken out: */ u64 persistent_reserved[BCH_REPLICAS_MAX]; - u64 data[]; + u64 replicas[]; }; struct bch_fs_usage_short { u64 capacity; u64 used; + u64 free; u64 nr_inodes; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index f090b61f23f1..5ee38a6a442f 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -403,10 
+403,10 @@ static long bch2_ioctl_usage(struct bch_fs *c, if (!src) return -ENOMEM; - percpu_up_read(&c->mark_lock); + dst.used = bch2_fs_sectors_used(c, src); + dst.online_reserved = src->online_reserved; - dst.used = bch2_fs_sectors_used(c, *src); - dst.online_reserved = src->s.online_reserved; + percpu_up_read(&c->mark_lock); for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dd10f1c993e5..cf4729b7a083 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -212,6 +212,9 @@ static int journal_entry_open(struct journal *j) lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); + if (j->blocked) + return -EAGAIN; + if (!fifo_free(&j->pin)) return 0; @@ -287,7 +290,7 @@ static bool __journal_entry_close(struct journal *j) spin_unlock(&j->lock); fallthrough; case JOURNAL_UNLOCKED: - return true; + return false; } } @@ -297,6 +300,22 @@ static bool journal_entry_close(struct journal *j) return __journal_entry_close(j); } +static bool journal_quiesced(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + ret = !j->reservations.prev_buf_unwritten && + !journal_entry_is_open(j); + __journal_entry_close(j); + return ret; +} + +static void journal_quiesce(struct journal *j) +{ + wait_event(j->wait, journal_quiesced(j)); +} + static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); @@ -722,6 +741,26 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* block/unlock the journal: */ + +void bch2_journal_unblock(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked--; + spin_unlock(&j->lock); + + journal_wake(j); +} + +void bch2_journal_block(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + journal_quiesce(j); +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -931,8 +970,7 @@ void bch2_fs_journal_stop(struct journal *j) c->btree_roots_dirty) bch2_journal_meta(j); - BUG_ON(journal_entry_is_open(j) || - j->reservations.prev_buf_unwritten); + journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_NOT_EMPTY, &j->flags)); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6ef34bdae628..5290cdeab585 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -370,6 +370,9 @@ static inline void bch2_journal_set_replay_done(struct journal *j) set_bit(JOURNAL_REPLAY_DONE, &j->flags); } +void bch2_journal_unblock(struct journal *); +void bch2_journal_block(struct journal *); + ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct journal *, char *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 5f6d2320c5cd..e952eb06eff5 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -142,6 +142,9 @@ struct journal { spinlock_t lock; + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e28917cf2cec..5ceab8c14d72 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -83,7 +83,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v)); break; case FS_USAGE_INODES: - percpu_u64_set(&c->usage[0]->s.nr_inodes, + 
percpu_u64_set(&c->usage[0]->nr_inodes, le64_to_cpu(u->v)); break; case FS_USAGE_KEY_VERSION: diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 6fee8fe37688..03bb6b51d15f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -245,14 +245,14 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, *dst = *src; for (src_idx = 0; src_idx < src_r->nr; src_idx++) { - if (!src->data[src_idx]) + if (!src->replicas[src_idx]) continue; dst_idx = __replicas_entry_idx(dst_r, cpu_replicas_entry(src_r, src_idx)); BUG_ON(dst_idx < 0); - dst->data[dst_idx] = src->data[src_idx]; + dst->replicas[dst_idx] = src->replicas[src_idx]; } } @@ -457,7 +457,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) if (__replicas_has_entry(&c->replicas_gc, e)) continue; - v = percpu_u64_get(&c->usage[0]->data[i]); + v = percpu_u64_get(&c->usage[0]->replicas[i]); if (!v) continue; @@ -558,7 +558,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, BUG_ON(ret < 0); } - percpu_u64_set(&c->usage[0]->data[idx], sectors); + percpu_u64_set(&c->usage[0]->replicas[idx], sectors); return 0; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0b3a761fe93e..66e174d93a9c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -930,7 +930,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_down_write(&c->mark_lock); { - u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes); + u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -977,7 +977,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - u64 sectors = percpu_u64_get(&c->usage[0]->data[i]); + u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]); struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8ad7b6026d1b..361f7b7addcf 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -244,17 +244,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity); pr_buf(&out, "hidden:\t\t\t\t%llu\n", - fs_usage->s.hidden); + fs_usage->hidden); pr_buf(&out, "data:\t\t\t\t%llu\n", - fs_usage->s.data); + fs_usage->data); pr_buf(&out, "cached:\t\t\t\t%llu\n", - fs_usage->s.cached); + fs_usage->cached); pr_buf(&out, "reserved:\t\t\t%llu\n", - fs_usage->s.reserved); + fs_usage->reserved); pr_buf(&out, "nr_inodes:\t\t\t%llu\n", - fs_usage->s.nr_inodes); + fs_usage->nr_inodes); pr_buf(&out, "online reserved:\t\t%llu\n", - fs_usage->s.online_reserved); + fs_usage->online_reserved); for (i = 0; i < ARRAY_SIZE(fs_usage->persistent_reserved); @@ -270,7 +270,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "\t"); bch2_replicas_entry_to_text(&out, e); - pr_buf(&out, ":\t%llu\n", fs_usage->data[i]); + pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]); } percpu_up_read(&c->mark_lock); -- cgit From ecf37a4a80ec029d640b9c18f87880d4ec4a726f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Feb 2019 20:39:17 -0500 Subject: bcachefs: fs_usage_u64s() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++------ fs/bcachefs/buckets.c | 22 ++++++++++----------- fs/bcachefs/buckets.h | 12 ++++++++---- fs/bcachefs/replicas.c | 53 +++++++++++++++++++++++++------------------------- fs/bcachefs/super.c 
| 7 +------ 5 files changed, 48 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5091966b7b54..56402fc64bc2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -605,8 +605,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) } { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + - c->replicas.nr; + unsigned nr = fs_usage_u64s(c); struct bch_fs_usage *dst = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); struct bch_fs_usage *src = (void *) @@ -657,10 +656,8 @@ static int bch2_gc_start(struct bch_fs *c) BUG_ON(c->usage[1]); - c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) + - sizeof(u64) * c->replicas.nr, - sizeof(u64), - GFP_KERNEL); + c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); percpu_up_write(&c->mark_lock); if (!c->usage[1]) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ac54d82f9e11..5011e7af3563 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -117,11 +117,11 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; - unsigned i, nr; + unsigned i; percpu_down_write(&c->mark_lock); - nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr; - usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); + usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], + fs_usage_u64s(c)); for (i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; @@ -159,24 +159,23 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage *ret; - unsigned nr = READ_ONCE(c->replicas.nr); + unsigned v, u64s = fs_usage_u64s(c); retry: - ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS); + ret = kzalloc(u64s * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) return NULL; percpu_down_read(&c->mark_lock); - if (unlikely(nr < c->replicas.nr)) { - nr = c->replicas.nr; + v = fs_usage_u64s(c); + if (unlikely(u64s != v)) { + u64s = v; percpu_up_read(&c->mark_lock); kfree(ret); goto retry; } - acc_u64s_percpu((u64 *) ret, - (u64 __percpu *) c->usage[0], - sizeof(*ret) / sizeof(u64) + nr); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); return ret; } @@ -294,8 +293,7 @@ int bch2_fs_usage_apply(struct bch_fs *c, preempt_disable(); acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), - (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); + (u64 *) fs_usage, fs_usage_u64s(c)); preempt_enable(); return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 67a1d17610f3..5f0b5a6ec9ad 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -212,14 +212,18 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) +static inline unsigned fs_usage_u64s(struct bch_fs *c) { - struct bch_fs_usage *ret; - ret = this_cpu_ptr(c->usage_scratch); + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); +} - memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64)); +static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) +{ + struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch); + memset(ret, 0, fs_usage_u64s(c) * sizeof(u64)); return ret; } diff --git a/fs/bcachefs/replicas.c 
b/fs/bcachefs/replicas.c index 03bb6b51d15f..72592df9afc0 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -262,39 +262,37 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL }; + struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; + struct bch_fs_usage __percpu *new_scratch = NULL; unsigned bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; - unsigned i; int ret = -ENOMEM; - for (i = 0; i < 3; i++) { - if (i < 2 && !c->usage[i]) - continue; - - new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO); - if (!new_usage[i]) - goto err; - } - - for (i = 0; i < 2; i++) { - if (!c->usage[i]) - continue; - - __replicas_table_update(new_usage[i], new_r, - c->usage[i], &c->replicas); - - swap(c->usage[i], new_usage[i]); - } - - swap(c->usage_scratch, new_usage[2]); + if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO)) || + (c->usage[1] && + !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO))) || + !(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO))) + goto err; - swap(c->replicas, *new_r); + if (c->usage[0]) + __replicas_table_update(new_usage[0], new_r, + c->usage[0], &c->replicas); + if (c->usage[1]) + __replicas_table_update(new_usage[1], new_r, + c->usage[1], &c->replicas); + + swap(c->usage[0], new_usage[0]); + swap(c->usage[1], new_usage[1]); + swap(c->usage_scratch, new_scratch); + swap(c->replicas, *new_r); ret = 0; err: - for (i = 0; i < 3; i++) - free_percpu(new_usage[i]); + free_percpu(new_scratch); + free_percpu(new_usage[1]); + free_percpu(new_usage[0]); return ret; } @@ -975,5 +973,6 @@ int bch2_fs_replicas_init(struct bch_fs *c) { c->journal.entry_u64s_reserved += reserve_journal_replicas(c, &c->replicas); - return 0; + + return replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 29cb12d841e7..be8c4a604d8c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -535,7 +535,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; - unsigned i, iter_size, fs_usage_size; + unsigned i, iter_size; const char *err; pr_verbose_init(opts, ""); @@ -629,9 +629,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct btree_node_iter_set); - fs_usage_size = sizeof(struct bch_fs_usage) + - sizeof(u64) * c->replicas.nr; - if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", @@ -648,8 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) || - !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || -- cgit From d16b4a77a5c64fca52ff637c22668b679b47ef22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Feb 2019 17:39:42 -0500 Subject: bcachefs: Assorted journal refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 371 ++++++++++++++++++++------------------------ fs/bcachefs/journal.h | 
24 +-- fs/bcachefs/journal_io.c | 70 +++++---- fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/journal_types.h | 18 ++- 5 files changed, 231 insertions(+), 254 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index cf4729b7a083..91d0e5d443ed 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -17,23 +17,14 @@ #include "super-io.h" #include "trace.h" -static bool journal_entry_is_open(struct journal *j) +static bool __journal_entry_is_open(union journal_res_state state) { - return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } -void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) +static bool journal_entry_is_open(struct journal *j) { - struct journal_buf *w = journal_prev_buf(j); - - atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); - - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); - - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + return __journal_entry_is_open(j->reservations); } static void journal_pin_new_entry(struct journal *j, int count) @@ -77,39 +68,76 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -static enum { - JOURNAL_ENTRY_ERROR, - JOURNAL_ENTRY_INUSE, - JOURNAL_ENTRY_CLOSED, - JOURNAL_UNLOCKED, -} journal_buf_switch(struct journal *j, bool need_write_just_set) +void bch2_journal_halt(struct journal *j) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); + closure_wake_up(&journal_prev_buf(j)->wait); +} + +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +{ + struct journal_buf *w = journal_prev_buf(j); + + atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); + + if (!need_write_just_set && + test_bit(JOURNAL_NEED_WRITE, &j->flags)) + bch2_time_stats_update(j->delay_time, + j->need_write_time); + + clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); +} + +/* + * Returns true if journal entry is now closed: + */ +static bool __journal_entry_close(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); + bool set_need_write = false; + unsigned sectors; lockdep_assert_held(&j->lock); do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return JOURNAL_ENTRY_CLOSED; + return true; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { /* this entry will never be written: */ closure_wake_up(&buf->wait); - return JOURNAL_ENTRY_ERROR; + return true; } - if (new.prev_buf_unwritten) - return JOURNAL_ENTRY_INUSE; + if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { + set_bit(JOURNAL_NEED_WRITE, &j->flags); + j->need_write_time = local_clock(); + set_need_write = true; + } - /* - * avoid race between setting buf->data->u64s and - * journal_res_put starting write: - */ - journal_state_inc(&new); + if (new.prev_buf_unwritten) + return false; 
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; @@ -119,15 +147,12 @@ static enum { } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - j->prev_buf_sectors = - vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) * - c->opts.block_size; - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << c->block_bits; + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; bkey_extent_init(&buf->key); @@ -163,32 +188,22 @@ static enum { bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); - spin_unlock(&j->lock); /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); - bch2_journal_buf_put(j, old.idx, need_write_just_set); - - return JOURNAL_UNLOCKED; + bch2_journal_buf_put(j, old.idx, set_need_write); + return true; } -void bch2_journal_halt(struct journal *j) +static bool journal_entry_close(struct journal *j) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; + bool ret; - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + spin_lock(&j->lock); + ret = __journal_entry_close(j); + spin_unlock(&j->lock); - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); - closure_wake_up(&journal_prev_buf(j)->wait); + return ret; } /* @@ -196,17 +211,16 @@ void bch2_journal_halt(struct journal *j) * journal reservation - journal entry is open means journal is dirty: * * returns: - * 1: success - * 0: journal currently full (must wait) - * -EROFS: insufficient rw devices - * -EIO: journal error + * 0: success + * -ENOSPC: journal currently full, must invoke reclaim + * -EAGAIN: journal blocked, must wait + * -EROFS: insufficient rw devices or journal error */ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - ssize_t u64s; - int sectors; + int u64s, ret; u64 v; lockdep_assert_held(&j->lock); @@ -216,29 +230,22 @@ static int journal_entry_open(struct journal *j) return -EAGAIN; if (!fifo_free(&j->pin)) - return 0; + return -ENOSPC; - sectors = bch2_journal_entry_sectors(j); - if (sectors <= 0) - return sectors; + ret = bch2_journal_space_available(j); + if (ret) + return ret; - buf->disk_sectors = sectors; buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); - sectors = min_t(unsigned, sectors, buf->size >> 9); - j->cur_buf_sectors = sectors; - - u64s = (sectors << 9) / sizeof(u64); - - /* Subtract the journal header */ - u64s -= sizeof(struct jset) / sizeof(u64); - u64s -= buf->u64s_reserved; - u64s = max_t(ssize_t, 0L, u64s); - - BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return 0; + return -ENOSPC; /* * Must be set before marking the journal entry as open: @@ -250,10 +257,11 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EIO; + return -EROFS; /* 
Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -266,48 +274,16 @@ static int journal_entry_open(struct journal *j) &j->write_work, msecs_to_jiffies(j->write_delay_ms)); journal_wake(j); - return 1; -} - -static bool __journal_entry_close(struct journal *j) -{ - bool set_need_write; - - if (!journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return true; - } - - set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (set_need_write) - j->need_write_time = local_clock(); - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_INUSE: - spin_unlock(&j->lock); - return false; - default: - spin_unlock(&j->lock); - fallthrough; - case JOURNAL_UNLOCKED: - return false; - } -} - -static bool journal_entry_close(struct journal *j) -{ - spin_lock(&j->lock); - return __journal_entry_close(j); + return 0; } static bool journal_quiesced(struct journal *j) { - bool ret; + union journal_res_state state = READ_ONCE(j->reservations); + bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten && - !journal_entry_is_open(j); - __journal_entry_close(j); + if (!ret) + journal_entry_close(j); return ret; } @@ -357,7 +333,11 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; + if (bch2_journal_error(j)) + return -EROFS; + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() @@ -375,56 +355,42 @@ retry: */ buf = journal_cur_buf(j); if (journal_entry_is_open(j) && - buf->size >> 9 < buf->disk_sectors && - buf->size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->size << 1); + buf->buf_size >> 9 < buf->disk_sectors && + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - /* - * Close the current journal entry if necessary, then try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: + if (journal_entry_is_open(j) && + !__journal_entry_close(j)) { /* - * The current journal entry is still open, but we failed to get - * a journal reservation because there's not enough space in it, - * and we can't close it and start another because we haven't - * finished writing out the previous entry: + * We failed to get a reservation on the current open journal + * entry because it's full, and we can't close it because + * there's still a previous one in flight: */ - spin_unlock(&j->lock); trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; + ret = -EAGAIN; + } else { + ret = journal_entry_open(j); } - /* We now have a new, closed journal buf - see if we can open it: */ - ret = journal_entry_open(j); + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + spin_unlock(&j->lock); - if (ret < 0) - return ret; - if (ret) + if (!ret) goto retry; + if (ret == -ENOSPC) { + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + trace_journal_full(c); + bch2_journal_reclaim_work(&j->reclaim_work.work); + ret = -EAGAIN; + } - /* Journal's full, we have to wait */ - - /* - * Direct 
reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - bch2_journal_reclaim_work(&j->reclaim_work.work); - - trace_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return -EAGAIN; + return ret; } /* @@ -461,7 +427,7 @@ void bch2_journal_entry_res_resize(struct journal *j, j->entry_u64s_reserved += d; if (d <= 0) - goto out_unlock; + goto out; j->cur_entry_u64s -= d; smp_mb(); @@ -474,15 +440,12 @@ void bch2_journal_entry_res_resize(struct journal *j, * Not enough room in current journal entry, have to flush it: */ __journal_entry_close(j); - goto out; + } else { + journal_cur_buf(j)->u64s_reserved += d; } - - journal_cur_buf(j)->u64s_reserved += d; -out_unlock: - spin_unlock(&j->lock); out: + spin_unlock(&j->lock); res->u64s += d; - return; } /* journal flushing: */ @@ -512,47 +475,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { struct bch_fs *c = container_of(j, struct bch_fs, journal); int ret; -retry: + spin_lock(&j->lock); - if (seq < journal_cur_seq(j) || + /* + * Can't try to open more than one sequence number ahead: + */ + BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); + + if (journal_cur_seq(j) > seq || journal_entry_is_open(j)) { spin_unlock(&j->lock); return 0; } - if (journal_cur_seq(j) < seq) { - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; - } - } - - BUG_ON(journal_cur_seq(j) < seq); + if (journal_cur_seq(j) < seq && + !__journal_entry_close(j)) { + /* haven't finished writing out the previous one: */ + trace_journal_entry_full(c); + ret = -EAGAIN; + } else { + BUG_ON(journal_cur_seq(j) != seq); - ret = journal_entry_open(j); - if (ret) { - spin_unlock(&j->lock); - return ret < 0 ? 
ret : 0; + ret = journal_entry_open(j); } -blocked: - if (!j->res_get_blocked_start) + + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; - closure_wait(&j->async_wait, cl); + if (ret == -EAGAIN || ret == -ENOSPC) + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); - bch2_journal_reclaim_work(&j->reclaim_work.work); - return -EAGAIN; + if (ret == -ENOSPC) { + trace_journal_full(c); + bch2_journal_reclaim_work(&j->reclaim_work.work); + ret = -EAGAIN; + } + + return ret; } static int journal_seq_error(struct journal *j, u64 seq) @@ -635,8 +598,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, if (seq == journal_cur_seq(j)) __journal_entry_close(j); - else - spin_unlock(&j->lock); + spin_unlock(&j->lock); } static int journal_seq_flushed(struct journal *j, u64 seq) @@ -648,8 +610,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq) if (seq == journal_cur_seq(j)) __journal_entry_close(j); - else - spin_unlock(&j->lock); + spin_unlock(&j->lock); return ret; } @@ -783,7 +744,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); + nr + sizeof(*journal_buckets) / sizeof(u64)); if (!journal_buckets) goto err; @@ -846,9 +807,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->nr++; bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); @@ -899,7 +860,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, */ if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + bucket_to_sector(ca, nr - ja->nr), 1, 0)) { mutex_unlock(&c->sb_lock); return -ENOSPC; } @@ -996,7 +957,7 @@ void bch2_fs_journal_start(struct journal *j) journal_pin_new_entry(j, 0); /* - * journal_buf_switch() only inits the next journal entry when it + * __journal_entry_close() only inits the next journal entry when it * closes an open journal entry - the very first journal entry gets * initialized here: */ @@ -1063,8 +1024,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].size); - kvpfree(j->buf[0].data, j->buf[0].size); + kvpfree(j->buf[1].data, j->buf[1].buf_size); + kvpfree(j->buf[0].data, j->buf[0].buf_size); free_fifo(&j->pin); } @@ -1088,8 +1049,8 @@ int bch2_fs_journal_init(struct journal *j) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; @@ -1102,8 +1063,8 @@ int bch2_fs_journal_init(struct journal *j) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) { + !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || + !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ret = -ENOMEM; goto out; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 5290cdeab585..4acb0f59396d 100644 --- a/fs/bcachefs/journal.h 
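/*
 * [Editor's note, not part of the patch] The journal.h hunks below touch
 * helpers that sit on the journal reservation fast path. For orientation, a
 * condensed sketch of that fast path, based on what these patches show:
 * union journal_res_state packs cur_entry_offset, the current buffer index
 * and the per-buffer refcounts into one 64-bit word, so a reservation is a
 * single cmpxchg. The helper name here is invented, the struct journal_res
 * fields (ref/idx/offset) are assumed from context, and the declarations are
 * taken to come from journal.h / journal_types.h:
 */
static inline bool sketch_journal_res_get_fast(struct journal *j,
					       struct journal_res *res,
					       unsigned u64s)
{
	union journal_res_state old, new;
	u64 v = atomic64_read(&j->reservations.counter);

	do {
		old.v = new.v = v;

		/*
		 * JOURNAL_ENTRY_CLOSED_VAL/ERROR_VAL are by design larger than
		 * any real offset, so a closed or errored journal fails this
		 * check and the caller falls back to the locked slow path:
		 */
		if (new.cur_entry_offset + u64s > j->cur_entry_u64s)
			return false;

		/* Claim @u64s of the open entry and take a ref on its buffer: */
		new.cur_entry_offset += u64s;
		journal_state_inc(&new);
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	res->ref	= true;
	res->idx	= old.idx;
	res->offset	= old.cur_entry_offset;
	res->u64s	= u64s;
	return true;
}
/* End of editor's note; the patch resumes below. */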
+++ b/fs/bcachefs/journal.h @@ -179,6 +179,11 @@ static inline unsigned jset_u64s(unsigned u64s) return u64s + sizeof(struct jset_entry) / sizeof(u64); } +static inline int journal_entry_overhead(struct journal *j) +{ + return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; +} + static inline struct jset_entry * bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { @@ -225,7 +230,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res * id, 0, k, k->k.u64s); } -void bch2_journal_buf_put_slowpath(struct journal *, bool); +void __bch2_journal_buf_put(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, bool need_write_just_set) @@ -236,17 +241,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, .buf0_count = idx == 0, .buf1_count = idx == 1, }).v, &j->reservations.counter); - - EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); - - /* - * Do not initiate a journal write if the journal is in an error state - * (previous journal entry write may have failed) - */ - if (s.idx != idx && - !journal_state_count(s, idx) && - s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) - bch2_journal_buf_put_slowpath(j, need_write_just_set); + if (!journal_state_count(s, idx)) { + EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); + __bch2_journal_buf_put(j, need_write_just_set); + } } /* @@ -333,6 +331,8 @@ out: return 0; } +/* journal_entry_res: */ + void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 17eba4269719..e5e50be80126 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -902,13 +902,16 @@ static unsigned journal_dev_buckets_available(struct journal *j, return available; } -/* returns number of sectors available for next journal entry: */ -int bch2_journal_entry_sectors(struct journal *j) +int bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned sectors_available = UINT_MAX; + unsigned sectors_next_entry = UINT_MAX; unsigned i, nr_online = 0, nr_devs = 0; + unsigned unwritten_sectors = j->reservations.prev_buf_unwritten + ? 
journal_prev_buf(j)->sectors + : 0; + int ret = 0; lockdep_assert_held(&j->lock); @@ -921,16 +924,16 @@ int bch2_journal_entry_sectors(struct journal *j) if (!ja->nr) continue; + nr_online++; + buckets_this_device = journal_dev_buckets_available(j, ja); sectors_this_device = ja->sectors_free; - nr_online++; - /* * We that we don't allocate the space for a journal entry * until we write it out - thus, account for it here: */ - if (j->prev_buf_sectors >= sectors_this_device) { + if (unwritten_sectors >= sectors_this_device) { if (!buckets_this_device) continue; @@ -938,7 +941,7 @@ int bch2_journal_entry_sectors(struct journal *j) sectors_this_device = ca->mi.bucket_size; } - sectors_this_device -= j->prev_buf_sectors; + sectors_this_device -= unwritten_sectors; if (buckets_this_device) sectors_this_device = ca->mi.bucket_size; @@ -946,19 +949,26 @@ int bch2_journal_entry_sectors(struct journal *j) if (!sectors_this_device) continue; - sectors_available = min(sectors_available, - sectors_this_device); + sectors_next_entry = min(sectors_next_entry, + sectors_this_device); + nr_devs++; } rcu_read_unlock(); - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + sectors_next_entry = 0; + } else if (!sectors_next_entry || + nr_devs < min_t(unsigned, nr_online, + c->opts.metadata_replicas)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; + WRITE_ONCE(j->cur_entry_sectors, sectors_next_entry); - return sectors_available; + return ret; } static void __journal_write_alloc(struct journal *j, @@ -1059,9 +1069,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); done: - if (replicas >= replicas_want) - j->prev_buf_sectors = 0; - spin_unlock(&j->lock); rcu_read_unlock(); @@ -1117,17 +1124,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; - if (buf->size >= new_size) + if (buf->buf_size >= new_size) return; new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); if (!new_buf) return; - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); + memcpy(new_buf, buf->data, buf->buf_size); + kvpfree(buf->data, buf->buf_size); buf->data = new_buf; - buf->size = new_size; + buf->buf_size = new_size; } static void journal_write_done(struct closure *cl) @@ -1227,15 +1234,14 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); - start = vstruct_last(w->data); + start = vstruct_last(jset); end = bch2_journal_super_entries_add_common(c, start, le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); - le32_add_cpu(&w->data->u64s, u64s); - BUG_ON(vstruct_sectors(jset, c->block_bits) > - w->disk_sectors); + le32_add_cpu(&jset->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); journal_write_compact(jset); @@ -1273,10 +1279,10 @@ void bch2_journal_write(struct closure *cl) goto err; sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); + BUG_ON(sectors > w->sectors); - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); if (journal_write_alloc(j, w, sectors)) { 
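/*
 * [Editor's note, not part of the patch] The bch2_journal_space_available()
 * logic earlier in this hunk boils down to a per-device calculation that must
 * leave room for the previous, still-unwritten journal entry. A condensed
 * model of that calculation (function and parameter names invented for
 * illustration; the real code iterates over member devices under rcu and
 * takes the minimum across them):
 */
static unsigned sketch_device_space(unsigned bucket_size,	/* sectors per journal bucket */
				    unsigned sectors_free,	/* left in the current bucket */
				    unsigned free_buckets,	/* buckets not yet dirty */
				    unsigned unwritten)		/* prev entry, not yet written */
{
	/*
	 * If the in-flight entry won't fit in the current bucket, it will
	 * consume the next free bucket when it is finally written out:
	 */
	if (unwritten >= sectors_free) {
		if (!free_buckets)
			return 0;
		free_buckets--;
		sectors_free = bucket_size;
	}

	sectors_free -= unwritten;

	/*
	 * With at least one whole bucket still free, the next entry may span
	 * a full bucket regardless of what is left in the current one:
	 */
	if (free_buckets)
		sectors_free = bucket_size;

	return sectors_free;
}
/* End of editor's note; the patch resumes below. */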
bch2_journal_halt(j); @@ -1286,6 +1292,12 @@ void bch2_journal_write(struct closure *cl) return; } + /* + * write is allocated, no longer need to account for it in + * bch2_journal_entry_sectors: + */ + w->sectors = 0; + /* * XXX: we really should just disable the entire journal in nochanges * mode @@ -1316,7 +1328,7 @@ void bch2_journal_write(struct closure *cl) trace_journal_write(bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } for_each_rw_member(ca, c, i) diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index e19e549baf8a..d1409039724d 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -40,7 +40,7 @@ int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); -int bch2_journal_entry_sectors(struct journal *); +int bch2_journal_space_available(struct journal *); void bch2_journal_write(struct closure *); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index e952eb06eff5..3372e87be124 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -22,8 +22,10 @@ struct journal_buf { struct closure_waitlist wait; - unsigned size; - unsigned disk_sectors; + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ unsigned u64s_reserved; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; @@ -129,9 +131,14 @@ struct journal { unsigned long flags; union journal_res_state reservations; + + /* Max size of current journal entry */ unsigned cur_entry_u64s; - unsigned prev_buf_sectors; - unsigned cur_buf_sectors; + unsigned cur_entry_sectors; + + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + unsigned buf_size_want; /* @@ -159,9 +166,6 @@ struct journal { u64 seq_ondisk; u64 last_seq_ondisk; - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - /* * FIFO of journal entries whose btree updates have not yet been * written out. -- cgit From 3c5fa33a55ac2a1539c89635c07eee9fc3823bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Feb 2019 14:18:44 -0500 Subject: bcachefs: force str_hash code to be inlined the btree trans iterator stuff relies on a dirty horrible hack that uses a short backtrace to generate tokens, and gcc uninlining these functions breaks that hack. ugh. Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.h | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6f30fbe44eb8..c5bce01bf34c 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -125,7 +125,7 @@ struct bch_hash_desc { bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); }; -static inline struct btree_iter * +static __always_inline struct btree_iter * bch2_hash_lookup(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -159,7 +159,7 @@ bch2_hash_lookup(struct btree_trans *trans, return IS_ERR(k.k) ? 
ERR_CAST(k.k) : ERR_PTR(-ENOENT); } -static inline struct btree_iter * +static __always_inline struct btree_iter * bch2_hash_hole(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -185,10 +185,11 @@ bch2_hash_hole(struct btree_trans *trans, return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC); } -static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *start) +static __always_inline +int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) { struct btree_iter *iter; struct bkey_s_c k; @@ -211,10 +212,11 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, return btree_iter_err(k); } -static inline int __bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) +static __always_inline +int __bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) { struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; @@ -276,10 +278,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc, inode, insert, flags)); } -static inline int bch2_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter) +static __always_inline +int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter) { struct bkey_i *delete; int ret; @@ -300,10 +303,11 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans, return 0; } -static inline int bch2_hash_delete(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - u64 inode, const void *key) +static __always_inline +int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { struct btree_iter *iter; -- cgit From 86a225c42d44ba966504801c6d953745184ea9cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Feb 2019 16:00:03 -0500 Subject: bcachefs: fix a deadlock on startup Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +--- fs/bcachefs/journal_io.c | 9 +++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 04b75367fcde..fd39eeae5740 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1678,7 +1678,6 @@ int bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; - bool wrote; int ret; down_read(&c->gc_lock); @@ -1697,8 +1696,7 @@ int bch2_fs_allocator_start(struct bch_fs *c) } set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - return bch2_alloc_write(c, false, &wrote); + return 0; } void bch2_fs_allocator_background_init(struct bch_fs *c) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e5e50be80126..1f7bd314f61e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -888,9 +888,18 @@ err: static unsigned journal_dev_buckets_available(struct journal *j, struct journal_device *ja) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned next = 
(ja->cur_idx + 1) % ja->nr; unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + /* + * Allocator startup needs some journal space before we can do journal + * replay: + */ + if (available && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + available--; + /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: -- cgit From 28062d320bded23eb7d24633e6ab11ea3c03487b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Feb 2019 17:57:06 -0500 Subject: bcachefs: Fix gc handling of bucket gens Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 19 ++++++++++++------- fs/bcachefs/buckets.h | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 56402fc64bc2..c899a77bf891 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -142,22 +142,23 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, ptr, false); if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" "type %u gen %u", k.k->type, ptr->gen)) { - g->_mark.gen = ptr->gen; - g->_mark.dirty = true; - g->gen_valid = 1; + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->_mark.dirty = true; - g->gen_valid = 1; + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -692,10 +693,12 @@ static int bch2_gc_start(struct bch_fs *c) dst->first_bucket = src->first_bucket; dst->nbuckets = src->nbuckets; - for (b = 0; b < src->nbuckets; b++) + for (b = 0; b < src->nbuckets; b++) { dst->b[b]._mark.gen = dst->b[b].oldest_gen = src->b[b].mark.gen; + dst->b[b].gen_valid = src->b[b].gen_valid; + } }; percpu_up_write(&c->mark_lock); @@ -754,6 +757,8 @@ out: if (iter++ <= 2) { bch_info(c, "Fixed gens, restarting mark and sweep:"); clear_bit(BCH_FS_FIXED_GENS, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + bch2_gc_free(c); goto again; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 5f0b5a6ec9ad..342def8cf603 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -91,7 +91,7 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr, bool gc) { - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); + return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -- cgit From 8db2acde2fca80954b4db12977182aa44b1e85fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Feb 2019 19:14:23 -0500 Subject: bcachefs: fix integer underflow in journal code Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 4 +++- fs/bcachefs/journal.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 91d0e5d443ed..3a6040244064 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -256,6 +256,8 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; + EBUG_ON(journal_state_count(new, 
new.idx)); + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) return -EROFS; @@ -429,7 +431,7 @@ void bch2_journal_entry_res_resize(struct journal *j, if (d <= 0) goto out; - j->cur_entry_u64s -= d; + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); smp_mb(); state = READ_ONCE(j->reservations); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4acb0f59396d..77d59fb0b151 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -292,6 +292,8 @@ static inline int journal_res_get_fast(struct journal *j, if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; + EBUG_ON(!journal_state_count(new, new.idx)); + if (flags & JOURNAL_RES_GET_CHECK) return 1; -- cgit From c8cc5b3e3fa154446eae9aa461aeb97bc5a07c09 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Feb 2019 17:13:46 -0500 Subject: bcachefs: Don't get journal reservation until after we know insert will succeed Checking if we can do the insert after getting the journal reservation means potentially wasting space in the journal, which will break the new pre reservation mechanism Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 88 +++++++++++++++++++++++------------------ fs/bcachefs/journal.c | 3 +- 2 files changed, 52 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5555c6e1c7cf..7043201ac6a3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -416,6 +416,25 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } +static bool btree_trans_relock(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_iter(trans, i) + return bch2_btree_iter_relock(i->iter); + return true; +} + +static void btree_trans_unlock(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_iter(trans, i) { + bch2_btree_iter_unlock(i->iter); + break; + } +} + /* Normal update interface: */ static enum btree_insert_ret @@ -467,49 +486,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans, struct btree_iter *linked; unsigned u64s; int ret; - +retry: trans_for_each_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - /* reserve space for deferred updates */ - __trans_for_each_entry(trans, i, i->deferred) { - - } - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - u64s = 0; - trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s); - - while ((ret = bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { - struct btree_iter *iter = NULL; - - trans_for_each_iter(trans, i) - iter = i->iter; - - if (iter) - bch2_btree_iter_unlock(iter); - - ret = bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - JOURNAL_RES_GET_CHECK); - if (ret) - return ret; - - if (iter && !bch2_btree_iter_relock(iter)) { - trans_restart(" (iter relock after journal res get blocked)"); - return -EINTR; - } - } - - if (ret) - return ret; - } - multi_lock_write(c, trans); if (race_fault()) { @@ -537,6 +519,36 @@ static inline int do_btree_insert_at(struct btree_insert *trans, } } + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s); + + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (likely(!ret)) + 
goto got_journal_res; + if (ret != -EAGAIN) + goto out; + + multi_unlock_write(trans); + btree_trans_unlock(trans); + + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_CHECK); + if (ret) + return ret; + + if (!btree_trans_relock(trans)) { + trans_restart(" (iter relock after journal res get blocked)"); + return -EINTR; + } + + goto retry; + } +got_journal_res: if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) trans_for_each_entry(trans, i) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3a6040244064..21e611cdaa06 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -388,7 +388,8 @@ retry: * freezing: */ trace_journal_full(c); - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (!(flags & JOURNAL_RES_GET_NONBLOCK)) + bch2_journal_reclaim_work(&j->reclaim_work.work); ret = -EAGAIN; } -- cgit From 2d3b581039614a20b064856b8fd899a733a4a1f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Feb 2019 14:28:08 -0500 Subject: bcachefs: Better journal debug Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 21e611cdaa06..c9881fc2df17 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1084,35 +1084,54 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state *s = &j->reservations; + union journal_res_state s; struct bch_dev *ca; unsigned iter; rcu_read_lock(); spin_lock(&j->lock); + s = READ_ONCE(j->reservations); pr_buf(&out, "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", + "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), journal_last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, + j->last_seq_ondisk); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: + pr_buf(&out, "error\n"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: + pr_buf(&out, "closed\n"); + break; + default: + pr_buf(&out, "%u/%u\n", + s.cur_entry_offset, + j->cur_entry_u64s); + break; + } + + pr_buf(&out, + "current entry refs:\t%u\n" + "prev entry unwritten:\t", + journal_state_count(s, s.idx)); + + if (s.prev_buf_unwritten) + pr_buf(&out, "yes, ref %u\n", + journal_state_count(s, !s.idx)); + else + pr_buf(&out, "no\n"); + + pr_buf(&out, + "need write:\t\t%i\n" + "replay done:\t\t%i\n", test_bit(JOURNAL_NEED_WRITE, &j->flags), - journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, -- cgit From e5a66496a0751c1a7ea692ef5874c4cfad85969b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Feb 2019 13:33:21 -0500 Subject: bcachefs: Journal reclaim refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 26 ++- fs/bcachefs/journal_io.c | 132 +++----------- fs/bcachefs/journal_io.h | 1 - fs/bcachefs/journal_reclaim.c | 396 ++++++++++++++++++++++++++---------------- fs/bcachefs/journal_reclaim.h | 7 +- fs/bcachefs/journal_types.h | 6 + 6 files changed, 290 
insertions(+), 278 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c9881fc2df17..9b6f7b4136d8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -84,17 +84,12 @@ void bch2_journal_halt(struct journal *j) journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); - closure_wake_up(&journal_prev_buf(j)->wait); } /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) { - struct journal_buf *w = journal_prev_buf(j); - - atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); - if (!need_write_just_set && test_bit(JOURNAL_NEED_WRITE, &j->flags)) bch2_time_stats_update(j->delay_time, @@ -175,7 +170,6 @@ static bool __journal_entry_close(struct journal *j) * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ - bch2_journal_reclaim_fast(j); buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); if (journal_entry_empty(buf->data)) @@ -189,8 +183,8 @@ static bool __journal_entry_close(struct journal *j) cancel_delayed_work(&j->write_work); - /* ugh - might be called from __journal_res_get() under wait_event() */ - __set_current_state(TASK_RUNNING); + bch2_journal_space_available(j); + bch2_journal_buf_put(j, old.idx, set_need_write); return true; } @@ -220,7 +214,7 @@ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - int u64s, ret; + int u64s; u64 v; lockdep_assert_held(&j->lock); @@ -229,12 +223,10 @@ static int journal_entry_open(struct journal *j) if (j->blocked) return -EAGAIN; - if (!fifo_free(&j->pin)) - return -ENOSPC; + if (j->cur_entry_error) + return j->cur_entry_error; - ret = bch2_journal_space_available(j); - if (ret) - return ret; + BUG_ON(!j->cur_entry_sectors); buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; @@ -411,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; - wait_event(j->wait, + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -EAGAIN || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; @@ -969,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j) c->last_bucket_seq_cleanup = journal_cur_seq(j); + bch2_journal_space_available(j); spin_unlock(&j->lock); /* @@ -1144,9 +1137,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) pr_buf(&out, "dev %u:\n" "\tnr\t\t%u\n" + "\tavailable\t%u:%u\n" "\tcur_idx\t\t%u (seq %llu)\n" "\tlast_idx\t%u (seq %llu)\n", iter, ja->nr, + bch2_journal_dev_buckets_available(j, ja), + ja->sectors_free, ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1f7bd314f61e..e4466816fafa 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -825,7 +825,6 @@ fsck_err: int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; - struct journal_entry_pin_list *pin_list; struct bkey_i *k, *_n; struct jset_entry *entry; struct journal_replay *i, *n; @@ -867,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) cond_resched(); } - pin_list = journal_seq_pin(j, j->replay_journal_seq); - - if (atomic_dec_and_test(&pin_list->count)) - journal_wake(j); + bch2_journal_pin_put(j, j->replay_journal_seq); } j->replay_journal_seq = 0; @@ -885,101 +881,6 @@ err: /* journal write: */ 
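/*
 * [Editor's note, not part of the patch] With this refactoring the journal no
 * longer recomputes free space at the moment an entry is opened; instead,
 * every event that changes how much space is free (closing an entry,
 * completing a write, discarding a bucket, dropping journal pins) calls
 * bch2_journal_space_available() under j->lock, and journal_entry_open() just
 * consumes the cached result. A minimal sketch of the consumer side (helper
 * name invented; the fields are the ones added to journal_types.h in this
 * patch):
 */
static int sketch_entry_open_precheck(struct journal *j)
{
	lockdep_assert_held(&j->lock);

	/* 0, -ENOSPC (reclaim needed) or -EROFS (not enough rw devices): */
	if (j->cur_entry_error)
		return j->cur_entry_error;

	/*
	 * Updated by bch2_journal_space_available(); per that function it is
	 * only zero when cur_entry_error is also set:
	 */
	BUG_ON(!j->cur_entry_sectors);

	return 0;
}
/*
 * The other half of the pattern is that bch2_journal_space_available() ends
 * with journal_wake(j) when space is available, so reservations blocked in
 * __journal_res_get() are woken exactly when the situation changes.
 * End of editor's note; the patch resumes below.
 */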
-static unsigned journal_dev_buckets_available(struct journal *j, - struct journal_device *ja) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Allocator startup needs some journal space before we can do journal - * replay: - */ - if (available && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) - available--; - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (available && - journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) - --available; - - return available; -} - -int bch2_journal_space_available(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned sectors_next_entry = UINT_MAX; - unsigned i, nr_online = 0, nr_devs = 0; - unsigned unwritten_sectors = j->reservations.prev_buf_unwritten - ? journal_prev_buf(j)->sectors - : 0; - int ret = 0; - - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; - - if (!ja->nr) - continue; - - nr_online++; - - buckets_this_device = journal_dev_buckets_available(j, ja); - sectors_this_device = ja->sectors_free; - - /* - * We that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - if (unwritten_sectors >= sectors_this_device) { - if (!buckets_this_device) - continue; - - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; - } - - sectors_this_device -= unwritten_sectors; - - if (buckets_this_device) - sectors_this_device = ca->mi.bucket_size; - - if (!sectors_this_device) - continue; - - sectors_next_entry = min(sectors_next_entry, - sectors_this_device); - - nr_devs++; - } - rcu_read_unlock(); - - if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; - sectors_next_entry = 0; - } else if (!sectors_next_entry || - nr_devs < min_t(unsigned, nr_online, - c->opts.metadata_replicas)) { - ret = -ENOSPC; - sectors_next_entry = 0; - } - - WRITE_ONCE(j->cur_entry_sectors, sectors_next_entry); - - return ret; -} - static void __journal_write_alloc(struct journal *j, struct journal_buf *w, struct dev_alloc_list *devs_sorted, @@ -1053,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &c->rw_devs[BCH_DATA_JOURNAL]); - spin_lock(&j->lock); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -1069,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && - journal_dev_buckets_available(j, ja)) { + bch2_journal_dev_buckets_available(j, ja)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = ca->mi.bucket_size; } @@ -1078,7 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); done: - spin_unlock(&j->lock); rcu_read_unlock(); return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; @@ -1237,6 +1136,9 @@ void bch2_journal_write(struct closure *cl) struct bch_extent_ptr *ptr; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s; + int ret; + + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); jset = w->data; @@ -1293,7 +1195,23 @@ void bch2_journal_write(struct closure *cl) bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - if (journal_write_alloc(j, w, sectors)) { + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + if (ret) { bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); @@ -1301,12 +1219,6 @@ void bch2_journal_write(struct closure *cl) return; } - /* - * write is allocated, no longer need to account for it in - * bch2_journal_entry_sectors: - */ - w->sectors = 0; - /* * XXX: we really should just disable the entire journal in nochanges * mode diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index d1409039724d..a79c396903f0 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -40,7 +40,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); -int bch2_journal_space_available(struct journal *); void bch2_journal_write(struct closure *); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 98345dcd1e67..3b5b646859cb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -2,15 +2,213 @@ #include "bcachefs.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +/* Free space calculations: */ + +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Allocator startup needs some journal space before we can do journal + * replay: + */ + if (available && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + available--; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && + journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + --available; + + return available; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned sectors_next_entry = UINT_MAX; + unsigned sectors_total = UINT_MAX; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs = 0; + unsigned unwritten_sectors = j->reservations.prev_buf_unwritten + ? 
journal_prev_buf(j)->sectors + : 0; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + + if (!ja->nr) + continue; + + nr_online++; + + buckets_this_device = bch2_journal_dev_buckets_available(j, ja); + sectors_this_device = ja->sectors_free; + + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + if (unwritten_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + sectors_this_device -= unwritten_sectors; + + if (sectors_this_device < ca->mi.bucket_size && + buckets_this_device) { + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + if (!sectors_this_device) + continue; + + sectors_next_entry = min(sectors_next_entry, + sectors_this_device); + + sectors_total = min(sectors_total, + buckets_this_device * ca->mi.bucket_size + + sectors_this_device); + + max_entry_size = min_t(unsigned, max_entry_size, + ca->mi.bucket_size); + + nr_devs++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + sectors_next_entry = 0; + } else if (!sectors_next_entry || + nr_devs < min_t(unsigned, nr_online, + c->opts.metadata_replicas)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } else if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } + + j->cur_entry_sectors = sectors_next_entry; + j->cur_entry_error = ret; + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk; + spin_unlock(&j->lock); + + return ret; +} + +/* + * Advance ja->last_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +static void journal_do_discards(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->reclaim_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->reclaim_lock); +} + /* * Journal entry pinning - machinery for holding a reference on a given journal * entry, holding it open to ensure it gets replayed during recovery: */ +static void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + bch2_journal_space_available(j); +} + +void 
bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + static inline void __journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, @@ -25,10 +223,7 @@ static inline void __journal_pin_add(struct journal *j, pin->seq = seq; pin->flush = flush_fn; - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); /* * If the journal is currently full, we might want to call flush_fn @@ -130,86 +325,53 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) * data off of a specific device: */ -/** - * bch2_journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. - */ -void bch2_journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - -static void journal_pin_mark_flushing(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - lockdep_assert_held(&j->reclaim_lock); - - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; -} - -static void journal_pin_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - pin->flush(j, pin, seq); - - BUG_ON(j->flush_in_progress != pin); - j->flush_in_progress = NULL; - wake_up(&j->pin_flush_wait); -} - static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + spin_lock(&j->lock); + + BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count)); fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > seq_to_flush || + if (*seq > max_seq || (ret = list_first_entry_or_null(&pin_list->list, struct journal_entry_pin, list))) break; + if (ret) { + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; + j->last_flushed = jiffies; + } + + spin_unlock(&j->lock); + return ret; } -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static void journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { - bool ret; + struct journal_entry_pin *pin; + u64 seq; - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); + lockdep_assert_held(&j->reclaim_lock); - return ret; + while ((pin = journal_get_next_pin(j, min_nr + ? 
U64_MAX : seq_to_flush, &seq))) { + if (min_nr) + min_nr--; + + pin->flush(j, pin, seq); + + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); + } } /** @@ -236,104 +398,44 @@ void bch2_journal_reclaim_work(struct work_struct *work) struct bch_fs, journal.reclaim_work); struct journal *j = &c->journal; struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; + unsigned iter, bucket_to_flush, min_nr = 0; + u64 seq_to_flush = 0; + + journal_do_discards(j); + + mutex_lock(&j->reclaim_lock); + spin_lock(&j->lock); - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; if (!ja->nr) continue; - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - if (ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); + /* Try to keep the journal at most half full: */ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); } /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); + spin_unlock(&j->lock); /* * If it's been longer than j->reclaim_delay_ms since we last flushed, * make sure to flush at least one journal pin: */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? 
U64_MAX - : seq_to_flush, &seq))) { - if (!reclaim_lock_held) { - spin_unlock(&j->lock); - __set_current_state(TASK_RUNNING); - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - spin_lock(&j->lock); - continue; - } + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - journal_pin_mark_flushing(j, pin, seq); - spin_unlock(&j->lock); - - journal_pin_flush(j, pin, seq); - - need_flush = false; - j->last_flushed = jiffies; + journal_flush_pins(j, seq_to_flush, min_nr); - spin_lock(&j->lock); - } - - spin_unlock(&j->lock); - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); + mutex_unlock(&j->reclaim_lock); if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, @@ -342,8 +444,6 @@ void bch2_journal_reclaim_work(struct work_struct *work) static int journal_flush_done(struct journal *j, u64 seq_to_flush) { - struct journal_entry_pin *pin; - u64 pin_seq; int ret; ret = bch2_journal_error(j); @@ -351,16 +451,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) return ret; mutex_lock(&j->reclaim_lock); - spin_lock(&j->lock); - - while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) { - journal_pin_mark_flushing(j, pin, pin_seq); - spin_unlock(&j->lock); - journal_pin_flush(j, pin, pin_seq); + journal_flush_pins(j, seq_to_flush, 0); - spin_lock(&j->lock); - } + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index e06ac0492960..a9afb229541b 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -4,6 +4,10 @@ #define JOURNAL_PIN (32 * 1024) +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *); +void bch2_journal_space_available(struct journal *); + static inline bool journal_pin_active(struct journal_entry_pin *pin) { return pin->seq != 0; @@ -17,6 +21,8 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } +void bch2_journal_pin_put(struct journal *, u64); + void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, @@ -28,7 +34,6 @@ void bch2_journal_pin_add_if_older(struct journal *, journal_pin_flush_fn); void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -void bch2_journal_reclaim_fast(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); void bch2_journal_flush_pins(struct journal *, u64); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3372e87be124..2f48008820ac 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -136,6 +136,12 @@ struct journal { unsigned cur_entry_u64s; unsigned cur_entry_sectors; + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + int cur_entry_error; + /* Reserved space in journal entry to be used just prior to write */ unsigned entry_u64s_reserved; -- cgit From 6409c6a0aea95c78e353141f1855c11fcff0950c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Feb 2019 22:32:09 -0500 Subject: bcachefs: use correct wq for journal reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git 
a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9b6f7b4136d8..3b3c342b2df2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -971,7 +971,7 @@ void bch2_fs_journal_start(struct journal *j) */ bch2_journal_seq_blacklist_write(j); - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); } /* init/exit: */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e4466816fafa..d4b82344221c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1082,7 +1082,7 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); -- cgit From 1633e492ce07e26af58b11dd26039bf7c0080c96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Feb 2019 22:33:06 -0500 Subject: bcachefs: improved flush_held_btree_writes() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 33 +++++++++++++-------------------- fs/bcachefs/recovery.c | 6 ++++-- 2 files changed, 17 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd39eeae5740..52b0d27dbc68 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1467,24 +1467,16 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } -static void flush_held_btree_writes(struct bch_fs *c) +static bool flush_done(struct bch_fs *c) { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; - bool nodes_blocked; + bool nodes_unwritten; size_t i; - struct closure cl; - - closure_init_stack(&cl); - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); again: - pr_debug("flushing dirty btree nodes"); cond_resched(); - closure_wait(&c->btree_interior_update_wait, &cl); - - nodes_blocked = false; + nodes_unwritten = false; rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) @@ -1496,24 +1488,25 @@ again: six_unlock_read(&b->lock); goto again; } else { - nodes_blocked = true; + nodes_unwritten = true; } } rcu_read_unlock(); - if (c->btree_roots_dirty) + if (c->btree_roots_dirty) { bch2_journal_meta(&c->journal); - - if (nodes_blocked) { - closure_sync(&cl); goto again; } - closure_wake_up(&c->btree_interior_update_wait); - closure_sync(&cl); + return !nodes_unwritten && + !bch2_btree_interior_updates_nr_pending(c); +} + +static void flush_held_btree_writes(struct bch_fs *c) +{ + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + closure_wait_event(&c->btree_interior_update_wait, flush_done(c)); } static void allocator_start_issue_discards(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5ceab8c14d72..fdc64e199f8b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -413,11 +413,13 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_btree_root_alloc(c, i); err = "unable to allocate journal buckets"; - for_each_online_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); + if (ret) { percpu_ref_put(&ca->io_ref); goto err; } + } /* * journal_res_get() will crash if called before this has -- cgit From 
dc9aa17841e83b6d6ca5abe295545ca0764e1580 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Mar 2019 15:51:57 -0500 Subject: bcachefs: Drop a faulty assertion the assertion was meant to check that bch2_journal_reclaim_fast() was always being called, but since the atomic dec can happen outside of j->lock the assertion itself can race Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3b5b646859cb..431afeab42b0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -333,8 +333,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) spin_lock(&j->lock); - BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count)); - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) if (*seq > max_seq || (ret = list_first_entry_or_null(&pin_list->list, -- cgit From fcbf3e509648a94129ae23a6101f5295a3fdced0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Mar 2019 17:21:44 -0500 Subject: bcachefs: Allocator startup fixes/refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 91 ++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 52b0d27dbc68..5ea7abc496de 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1467,7 +1467,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } -static bool flush_done(struct bch_fs *c) +static bool flush_held_btree_writes(struct bch_fs *c) { struct bucket_table *tbl; struct rhash_head *pos; @@ -1502,13 +1502,6 @@ again: !bch2_btree_interior_updates_nr_pending(c); } -static void flush_held_btree_writes(struct bch_fs *c) -{ - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - - closure_wait_event(&c->btree_interior_update_wait, flush_done(c)); -} - static void allocator_start_issue_discards(struct bch_fs *c) { struct bch_dev *ca; @@ -1540,25 +1533,24 @@ static int resize_free_inc(struct bch_dev *ca) return 0; } -static int __bch2_fs_allocator_start(struct bch_fs *c) +static bool bch2_fs_allocator_start_fast(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_iter; - u64 journal_seq = 0; - long bu; - int ret = 0; + bool ret = true; if (test_alloc_startup(c)) - goto not_enough; + return false; + + down_read(&c->gc_lock); /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { struct bucket_array *buckets; struct bucket_mark m; + long bu; down_read(&ca->bucket_lock); - percpu_down_read(&c->mark_lock); - buckets = bucket_array(ca); for (bu = buckets->first_bucket; @@ -1566,13 +1558,16 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) m = READ_ONCE(buckets->b[bu].mark); if (!buckets->b[bu].gen_valid || - !test_bit(bu, ca->buckets_nouse) || !is_available_bucket(m) || - m.cached_sectors) + m.cached_sectors || + (ca->buckets_nouse && + test_bit(bu, ca->buckets_nouse))) continue; + percpu_down_read(&c->mark_lock); bch2_mark_alloc_bucket(c, ca, bu, true, gc_pos_alloc(c, NULL), 0); + percpu_up_read(&c->mark_lock); fifo_push(&ca->free_inc, bu); @@ -1581,19 +1576,28 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) if (fifo_full(&ca->free[RESERVE_BTREE])) break; } - percpu_up_read(&c->mark_lock); up_read(&ca->bucket_lock); } + up_read(&c->gc_lock); + /* did we find enough buckets? 
*/ for_each_rw_member(ca, c, dev_iter) - if (!fifo_full(&ca->free[RESERVE_BTREE])) { - percpu_ref_put(&ca->io_ref); - goto not_enough; - } + if (!fifo_full(&ca->free[RESERVE_BTREE])) + ret = false; + + return ret; +} + +static int __bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + u64 journal_seq = 0; + bool wrote; + long bu; + int ret = 0; - return 0; -not_enough: pr_debug("not enough empty buckets; scanning for reclaimable buckets"); /* @@ -1607,8 +1611,9 @@ not_enough: */ set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - while (1) { - bool wrote = false; + down_read(&c->gc_lock); + do { + wrote = false; for_each_rw_member(ca, c, dev_iter) { find_reclaimable_buckets(c, ca); @@ -1618,7 +1623,8 @@ not_enough: ret = resize_free_inc(ca); if (ret) { percpu_ref_put(&ca->io_ref); - return ret; + up_read(&c->gc_lock); + goto err; } bch2_invalidate_one_bucket(c, ca, bu, @@ -1644,27 +1650,26 @@ not_enough: * enough buckets, so just scan and loop again as long as it * made some progress: */ - if (!wrote && ret) - return ret; - if (!wrote && !ret) - break; - } + } while (wrote); + up_read(&c->gc_lock); + + if (ret) + goto err; pr_debug("flushing journal"); ret = bch2_journal_flush(&c->journal); if (ret) - return ret; + goto err; pr_debug("issuing discards"); allocator_start_issue_discards(c); +err: + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + closure_wait_event(&c->btree_interior_update_wait, + flush_held_btree_writes(c)); - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); - - /* now flush dirty btree nodes: */ - flush_held_btree_writes(c); - - return 0; + return ret; } int bch2_fs_allocator_start(struct bch_fs *c) @@ -1673,13 +1678,13 @@ int bch2_fs_allocator_start(struct bch_fs *c) unsigned i; int ret; - down_read(&c->gc_lock); - ret = __bch2_fs_allocator_start(c); - up_read(&c->gc_lock); - + ret = bch2_fs_allocator_start_fast(c) ? 0 : + __bch2_fs_allocator_start(c); if (ret) return ret; + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { -- cgit From 0ce2dbbe9915af85b2ebafe6dfeca6813ba5e13c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Mar 2019 15:15:55 -0500 Subject: bcachefs: ja->discard_idx, ja->dirty_idx Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 42 ++++++++++++++++++----------- fs/bcachefs/journal_io.c | 12 +++++---- fs/bcachefs/journal_reclaim.c | 63 ++++++++++++++++++++++++++----------------- fs/bcachefs/journal_types.h | 24 ++++++++--------- 4 files changed, 83 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3b3c342b2df2..17add726f2ac 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -760,6 +760,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, while (ja->nr < nr) { struct open_bucket *ob = NULL; + unsigned pos; long bucket; if (new_fs) { @@ -786,20 +787,24 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, preempt_disable(); } - __array_insert_item(ja->buckets, ja->nr, ja->last_idx); - __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); - __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); + pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); + __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; - ja->buckets[ja->last_idx] = bucket; - ja->bucket_seq[ja->last_idx] = 0; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); + ja->buckets[pos] = bucket; + ja->bucket_seq[pos] = 0; + journal_buckets->buckets[pos] = cpu_to_le64(bucket); - if (ja->last_idx < ja->nr) { - if (ja->cur_idx >= ja->last_idx) - ja->cur_idx++; - ja->last_idx++; - } - ja->nr++; + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, @@ -1042,6 +1047,7 @@ int bch2_fs_journal_init(struct journal *j) mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); @@ -1138,13 +1144,17 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "dev %u:\n" "\tnr\t\t%u\n" "\tavailable\t%u:%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", + "\tdiscard_idx\t\t%u\n" + "\tdirty_idx_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t\t%u (seq %llu)\n" + "\tcur_idx\t\t%u (seq %llu)\n", iter, ja->nr, bch2_journal_dev_buckets_available(j, ja), ja->sectors_free, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); + ja->discard_idx, + ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], + ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], + ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d4b82344221c..b6a51dff0978 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -625,11 +625,12 @@ static void bch2_journal_read_device(struct closure *cl) ja->sectors_free = 0; /* - * Set last_idx to indicate the entire journal is full and needs to be + * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't * pinned when it first runs: */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); @@ -1069,12 +1070,13 @@ static void journal_write_done(struct closure *cl) goto err; spin_lock(&j->lock); - j->seq_ondisk = seq; - j->last_seq_ondisk = last_seq; - if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; + j->seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 431afeab42b0..3a85fb8b8526 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -14,22 +14,20 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + unsigned available = (ja->discard_idx + ja->nr - 
next) % ja->nr; /* * Allocator startup needs some journal space before we can do journal * replay: */ - if (available && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) - available--; + if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + --available; /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: */ - if (available && - journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + if (available && ja->dirty_idx_ondisk == ja->dirty_idx) --available; return available; @@ -55,12 +53,34 @@ void bch2_journal_space_available(struct journal *j) for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_JOURNAL]) { struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; if (!ja->nr) continue; + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + nr_online++; + } + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + sectors_next_entry = 0; + goto out; + } + + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + + if (!ja->nr) + continue; buckets_this_device = bch2_journal_dev_buckets_available(j, ja); sectors_this_device = ja->sectors_free; @@ -100,20 +120,17 @@ void bch2_journal_space_available(struct journal *j) nr_devs++; } - rcu_read_unlock(); - if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; - sectors_next_entry = 0; - } else if (!sectors_next_entry || - nr_devs < min_t(unsigned, nr_online, - c->opts.metadata_replicas)) { + if (!sectors_next_entry || + nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) { ret = -ENOSPC; sectors_next_entry = 0; } else if (!fifo_free(&j->pin)) { ret = -ENOSPC; sectors_next_entry = 0; } +out: + rcu_read_unlock(); j->cur_entry_sectors = sectors_next_entry; j->cur_entry_error = ret; @@ -129,25 +146,23 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) bool ret; spin_lock(&j->lock); - ret = ja->nr && - ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk; + ret = ja->discard_idx != ja->dirty_idx_ondisk; spin_unlock(&j->lock); return ret; } /* - * Advance ja->last_idx as long as it points to buckets that are no longer + * Advance ja->discard_idx as long as it points to buckets that are no longer * dirty, issuing discards if necessary: */ -static void journal_do_discards(struct journal *j) +static void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned iter; - mutex_lock(&j->reclaim_lock); + mutex_lock(&j->discard_lock); for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; @@ -157,18 +172,18 @@ static void journal_do_discards(struct journal *j) bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, - ja->buckets[ja->last_idx]), + ja->buckets[ja->discard_idx]), ca->mi.bucket_size, GFP_NOIO); spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; bch2_journal_space_available(j); spin_unlock(&j->lock); } } - mutex_unlock(&j->reclaim_lock); + 
mutex_unlock(&j->discard_lock); } /* @@ -399,7 +414,7 @@ void bch2_journal_reclaim_work(struct work_struct *work) unsigned iter, bucket_to_flush, min_nr = 0; u64 seq_to_flush = 0; - journal_do_discards(j); + bch2_journal_do_discards(j); mutex_lock(&j->reclaim_lock); spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 2f48008820ac..09b2d2223033 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -193,9 +193,6 @@ struct journal { struct journal_entry_pin_list *data; } pin; - struct journal_entry_pin *flush_in_progress; - wait_queue_head_t pin_flush_wait; - u64 replay_journal_seq; struct mutex blacklist_lock; @@ -206,10 +203,13 @@ struct journal { spinlock_t err_lock; struct delayed_work reclaim_work; + struct mutex reclaim_lock; unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; - /* protects advancing ja->last_idx: */ - struct mutex reclaim_lock; + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; unsigned write_delay_ms; unsigned reclaim_delay_ms; @@ -240,17 +240,15 @@ struct journal_device { unsigned sectors_free; - /* Journal bucket we're currently writing to */ - unsigned cur_idx; - - /* Last journal bucket that still contains an open journal entry */ - /* - * j->lock and j->reclaim_lock must both be held to modify, j->lock - * sufficient to read: + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: */ - unsigned last_idx; + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ unsigned nr; + u64 *buckets; /* Bio for journal reads/writes to this device */ -- cgit From 2384db8f32a2df9e71cd3003d213b48f64cbde1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Mar 2019 18:39:07 -0500 Subject: bcachefs: Separate discards from rest of journal reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 13 ++++++++++++- fs/bcachefs/journal_reclaim.c | 8 +++++++- fs/bcachefs/journal_reclaim.h | 1 + fs/bcachefs/journal_types.h | 2 ++ 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 17add726f2ac..80d7980cf5aa 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -322,6 +322,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; + bool can_discard; int ret; retry: if (journal_res_get_fast(j, res, flags)) @@ -370,18 +371,28 @@ retry: !j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; + can_discard = j->can_discard; spin_unlock(&j->lock); if (!ret) goto retry; + if (ret == -ENOSPC) { /* * Journal is full - can't rely on reclaim from work item due to * freezing: */ trace_journal_full(c); - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) + + if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; + } + bch2_journal_reclaim_work(&j->reclaim_work.work); + } + ret = -EAGAIN; } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3a85fb8b8526..ac9e6cb3d4ee 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -45,6 +45,7 @@ void bch2_journal_space_available(struct journal *j) unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ? 
journal_prev_buf(j)->sectors : 0; + bool can_discard = false; int ret = 0; lockdep_assert_held(&j->lock); @@ -65,9 +66,14 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + nr_online++; } + j->can_discard = can_discard; + if (nr_online < c->opts.metadata_replicas_required) { ret = -EROFS; sectors_next_entry = 0; @@ -156,7 +162,7 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) * Advance ja->discard_idx as long as it points to buckets that are no longer * dirty, issuing discards if necessary: */ -static void bch2_journal_do_discards(struct journal *j) +void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index a9afb229541b..183419ea3e25 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -34,6 +34,7 @@ void bch2_journal_pin_add_if_older(struct journal *, journal_pin_flush_fn); void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); +void bch2_journal_do_discards(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); void bch2_journal_flush_pins(struct journal *, u64); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 09b2d2223033..c91a21e07809 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -210,6 +210,8 @@ struct journal { /* protects advancing ja->discard_idx: */ struct mutex discard_lock; + bool can_discard; + unsigned write_delay_ms; unsigned reclaim_delay_ms; -- cgit From 03d5eaed8624fdc7918478bffd05d67e773ac7d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Mar 2019 16:50:40 -0500 Subject: bcachefs: bch2_journal_space_available improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 3 +- fs/bcachefs/journal_reclaim.c | 145 +++++++++++++++++++++++++++--------------- fs/bcachefs/journal_reclaim.h | 9 ++- 4 files changed, 103 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 80d7980cf5aa..5caa01881d00 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1160,7 +1160,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "\tdirty_idx\t\t%u (seq %llu)\n" "\tcur_idx\t\t%u (seq %llu)\n", iter, ja->nr, - bch2_journal_dev_buckets_available(j, ja), + bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, ja->discard_idx, ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b6a51dff0978..07cfbb975c37 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -970,7 +970,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja)) { + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = ca->mi.bucket_size; } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ac9e6cb3d4ee..0884fc823cdf 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -9,12 +9,28 @@ /* Free space 
calculations: */ +static unsigned journal_space_from(struct journal_device *ja, + enum journal_space_from from) +{ + switch (from) { + case journal_space_discarded: + return ja->discard_idx; + case journal_space_clean_ondisk: + return ja->dirty_idx_ondisk; + case journal_space_clean: + return ja->dirty_idx; + default: + BUG(); + } +} + unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja) + struct journal_device *ja, + enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->discard_idx + ja->nr - next) % ja->nr; + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; /* * Allocator startup needs some journal space before we can do journal @@ -33,53 +49,22 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -void bch2_journal_space_available(struct journal *j) +static struct journal_space { + unsigned next_entry; + unsigned remaining; +} __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned sectors_next_entry = UINT_MAX; unsigned sectors_total = UINT_MAX; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); - unsigned i, nr_online = 0, nr_devs = 0; + unsigned i, nr_devs = 0; unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ? journal_prev_buf(j)->sectors : 0; - bool can_discard = false; - int ret = 0; - - lockdep_assert_held(&j->lock); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - - while (ja->dirty_idx_ondisk != ja->dirty_idx && - ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; - - nr_online++; - } - - j->can_discard = can_discard; - - if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; - sectors_next_entry = 0; - goto out; - } - for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_JOURNAL]) { struct journal_device *ja = &ca->journal; @@ -88,7 +73,7 @@ void bch2_journal_space_available(struct journal *j) if (!ja->nr) continue; - buckets_this_device = bch2_journal_dev_buckets_available(j, ja); + buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); sectors_this_device = ja->sectors_free; /* @@ -121,24 +106,78 @@ void bch2_journal_space_available(struct journal *j) buckets_this_device * ca->mi.bucket_size + sectors_this_device); - max_entry_size = min_t(unsigned, max_entry_size, - ca->mi.bucket_size); - nr_devs++; } + rcu_read_unlock(); - if (!sectors_next_entry || - nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) { - ret = -ENOSPC; - sectors_next_entry = 0; - } else if (!fifo_free(&j->pin)) { - ret = -ENOSPC; - sectors_next_entry = 0; + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + + return (struct journal_space) { + .next_entry = sectors_next_entry, + .remaining = max_t(int, 0, sectors_total - sectors_next_entry), + }; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = 
container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_space discarded, clean_ondisk, clean; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; + bool can_discard = false; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; } -out: rcu_read_unlock(); - j->cur_entry_sectors = sectors_next_entry; + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + goto out; + } + + if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + goto out; + } + + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + + discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); + clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); + clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + + if (!discarded.next_entry) + ret = -ENOSPC; +out: + j->cur_entry_sectors = !ret ? discarded.next_entry : 0; j->cur_entry_error = ret; if (!ret) diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 183419ea3e25..71545ad3bd58 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -4,8 +4,15 @@ #define JOURNAL_PIN (32 * 1024) +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, +}; + unsigned bch2_journal_dev_buckets_available(struct journal *, - struct journal_device *); + struct journal_device *, + enum journal_space_from); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) -- cgit From 9ace606e93e9c6dff919ca8f35d461e8462590b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Feb 2019 14:22:52 -0500 Subject: bcachefs: Don't block on reclaim_lock from journal_res_get When we're doing btree updates from journal flush, this becomes a locking inversion Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 ++++- fs/bcachefs/journal_reclaim.c | 23 +++++++++++++++-------- fs/bcachefs/journal_reclaim.h | 1 + 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5caa01881d00..ba6adf11ef42 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -390,7 +390,10 @@ retry: goto retry; } - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } } ret = -EAGAIN; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0884fc823cdf..a3c53b78ad10 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -433,7 +433,7 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, } /** - * bch2_journal_reclaim_work - free up journal buckets + * 
bch2_journal_reclaim - free up journal buckets * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -450,18 +450,17 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -void bch2_journal_reclaim_work(struct work_struct *work) +void bch2_journal_reclaim(struct journal *j) { - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned iter, bucket_to_flush, min_nr = 0; u64 seq_to_flush = 0; + lockdep_assert_held(&j->reclaim_lock); + bch2_journal_do_discards(j); - mutex_lock(&j->reclaim_lock); spin_lock(&j->lock); for_each_rw_member(ca, c, iter) { @@ -493,13 +492,21 @@ void bch2_journal_reclaim_work(struct work_struct *work) journal_flush_pins(j, seq_to_flush, min_nr); - mutex_unlock(&j->reclaim_lock); - if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); } +void bch2_journal_reclaim_work(struct work_struct *work) +{ + struct journal *j = container_of(to_delayed_work(work), + struct journal, reclaim_work); + + mutex_lock(&j->reclaim_lock); + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush) { int ret; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 71545ad3bd58..9bf982a17797 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -42,6 +42,7 @@ void bch2_journal_pin_add_if_older(struct journal *, void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); +void bch2_journal_reclaim(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); void bch2_journal_flush_pins(struct journal *, u64); -- cgit From 68ef94a63caf214ee238434bf0d4c7a6a32c33a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Feb 2019 13:41:36 -0500 Subject: bcachefs: Add a pre-reserve mechanism for the journal Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 52 +++++++++++++++++++++++-- fs/bcachefs/journal.h | 89 +++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/journal_io.c | 6 +++ fs/bcachefs/journal_reclaim.c | 43 ++++++++++++++++++--- fs/bcachefs/journal_types.h | 27 +++++++++++++ 5 files changed, 208 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ba6adf11ef42..0aae8fd74c8a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -343,6 +343,16 @@ retry: return 0; } + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ + ret = -ENOSPC; + goto unlock; + } + /* * If we couldn't get a reservation because the current buf filled up, * and we had room for a bigger entry on disk, signal that we want to @@ -366,7 +376,7 @@ retry: } else { ret = journal_entry_open(j); } - +unlock: if ((ret == -EAGAIN || ret == -ENOSPC) && !j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; @@ -378,6 +388,8 @@ retry: goto retry; if (ret == -ENOSPC) { + BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); + /* * Journal 
is full - can't rely on reclaim from work item due to * freezing: @@ -423,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } +/* journal_preres: */ + +static bool journal_preres_available(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +int __bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int ret; + + closure_wait_event(&j->preres_wait, + (ret = bch2_journal_error(j)) || + journal_preres_available(j, res, new_u64s)); + return ret; +} + /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1110,11 +1148,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" + "current entry sectors:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), journal_last_seq(j), - j->last_seq_ondisk); + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, + j->cur_entry_sectors); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1136,8 +1179,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) journal_state_count(s, s.idx)); if (s.prev_buf_unwritten) - pr_buf(&out, "yes, ref %u\n", - journal_state_count(s, !s.idx)); + pr_buf(&out, "yes, ref %u sectors %u\n", + journal_state_count(s, !s.idx), + journal_prev_buf(j)->sectors); else pr_buf(&out, "no\n"); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 77d59fb0b151..809cf25f5a03 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -119,6 +119,7 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); + closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -274,6 +275,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) +#define JOURNAL_RES_GET_RESERVED (1 << 2) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -294,6 +296,10 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + return 0; + if (flags & JOURNAL_RES_GET_CHECK) return 1; @@ -333,6 +339,89 @@ out: return 0; } +/* journal_preres: */ + +static inline bool journal_check_may_get_unreserved(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); + bool ret = s.reserved <= s.remaining && + fifo_free(&j->pin) > 8; + + lockdep_assert_held(&j->lock); + + if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if (ret) { + set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + journal_wake(j); + } else { + clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + } + } + return ret; +} + +static inline void bch2_journal_preres_put(struct journal *j, + struct journal_preres *res) +{ + union journal_preres_state s = { .reserved = res->u64s }; + + if (!res->u64s) + return; + + s.v = atomic64_sub_return(s.v, &j->prereserved.counter); + res->u64s = 0; + closure_wake_up(&j->preres_wait); + + if (s.reserved <= s.remaining && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + spin_lock(&j->lock); + 
journal_check_may_get_unreserved(j); + spin_unlock(&j->lock); + } +} + +int __bch2_journal_preres_get(struct journal *, + struct journal_preres *, unsigned); + +static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int d = new_u64s - res->u64s; + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + + new.reserved += d; + + if (new.reserved > new.remaining) + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); + + res->u64s += d; + return 1; +} + +static inline int bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + if (new_u64s <= res->u64s) + return 0; + + if (bch2_journal_preres_get_fast(j, res, new_u64s)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) + return -EAGAIN; + + return __bch2_journal_preres_get(j, res, new_u64s); +} + /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 07cfbb975c37..db95257cec11 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -974,6 +974,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, journal_space_discarded)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a3c53b78ad10..053fa4aa4f5f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -49,6 +49,18 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } +static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +{ + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + new.remaining = u64s_remaining; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); +} + static struct journal_space { unsigned next_entry; unsigned remaining; @@ -124,8 +136,9 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_space discarded, clean_ondisk, clean; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); + unsigned overhead, u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; bool can_discard = false; int ret = 0; @@ -176,9 +189,17 @@ void bch2_journal_space_available(struct journal *j) if (!discarded.next_entry) ret = -ENOSPC; + + overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + journal_entry_overhead(j); + u64s_remaining = clean.remaining << 6; + u64s_remaining = max_t(int, 0, u64s_remaining - overhead); + u64s_remaining /= 4; out: j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); + journal_check_may_get_unreserved(j); if (!ret) journal_wake(j); @@ -454,7 +475,7 @@ void bch2_journal_reclaim(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, bucket_to_flush, min_nr = 0; + unsigned iter, min_nr = 0; u64 seq_to_flush = 0; lockdep_assert_held(&j->reclaim_lock); @@ -465,13 +486,22 @@ void bch2_journal_reclaim(struct journal *j) for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; if (!ja->nr) continue; - /* Try to keep the journal at most half full: */ - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + nr_buckets = ja->nr / 2; + + /* And include pre-reservations: */ + nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, + (ca->mi.bucket_size << 6) - + journal_entry_overhead(j)); + + nr_buckets = min(nr_buckets, ja->nr); + + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); } @@ -490,6 +520,9 @@ void bch2_journal_reclaim(struct journal *j) msecs_to_jiffies(j->reclaim_delay_ms))) min_nr = 1; + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + journal_flush_pins(j, seq_to_flush, min_nr); if (!test_bit(BCH_FS_RO, &c->flags)) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index c91a21e07809..85bf5e2706f7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -80,6 +80,14 @@ struct journal_res { u64 seq; }; +/* + * For reserving space in the journal prior to getting a reservation on a + * particular journal entry: + */ +struct journal_preres { + unsigned u64s; +}; + union journal_res_state { struct { atomic64_t counter; @@ -98,6 +106,21 @@ union journal_res_state { }; }; +union journal_preres_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u32 reserved; + u32 remaining; + }; +}; + /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -122,6 +145,7 @@ enum { JOURNAL_STARTED, JOURNAL_NEED_WRITE, JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, }; /* Embedded in struct bch_fs */ @@ -142,6 +166,8 @@ struct journal { */ int cur_entry_error; + union journal_preres_state prereserved; + /* Reserved space in journal entry to be used just prior to write */ unsigned entry_u64s_reserved; @@ -161,6 +187,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; -- cgit From 3e5d6c59bec6f989e103e364d6a044a8a77e3a13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Feb 2019 17:56:21 -0500 Subject: bcachefs: Use journal preres for deferred btree updates Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update.h | 4 ++ fs/bcachefs/btree_update_leaf.c | 84 ++++++++++++++++++++++++++++++++++------- 4 files changed, 77 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5ea7abc496de..bb067e4f627e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -960,6 +960,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + 
BTREE_INSERT_JOURNAL_RESERVED| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); if (ret == -EINTR) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7e58e82daec1..c24d2ce01463 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -246,10 +246,11 @@ struct btree_iter { #define BTREE_ITER_MAX 8 struct deferred_update { + struct journal_preres res; struct journal_entry_pin journal; spinlock_t lock; - unsigned gen; + unsigned dirty:1; u8 allocated_u64s; enum btree_id btree_id; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1fd01fb40482..c4aa5e42b29c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -27,6 +27,7 @@ struct btree_insert { struct bch_fs *c; struct disk_reservation *disk_res; struct journal_res journal_res; + struct journal_preres journal_preres; u64 *journal_seq; unsigned flags; bool did_work; @@ -82,6 +83,7 @@ enum { __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -112,6 +114,8 @@ enum { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) + /* Don't call bch2_mark_key: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7043201ac6a3..7749fccd0eab 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -18,6 +18,9 @@ #include +static bool btree_trans_relock(struct btree_insert *); +static void btree_trans_unlock(struct btree_insert *); + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -240,15 +243,15 @@ btree_insert_key_leaf(struct btree_insert *trans, /* Deferred btree updates: */ static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) + struct journal_entry_pin *pin, + u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct deferred_update *d = container_of(pin, struct deferred_update, journal); + struct journal_preres res = { 0 }; u64 tmp[32]; struct bkey_i *k = (void *) tmp; - unsigned gen; int ret; if (d->allocated_u64s > ARRAY_SIZE(tmp)) { @@ -258,26 +261,32 @@ static void deferred_update_flush(struct journal *j, } spin_lock(&d->lock); - gen = d->gen; + if (d->dirty) { + BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); + + swap(res, d->res); - if (journal_pin_active(&d->journal)) { BUG_ON(d->k.k.u64s > d->allocated_u64s); - bkey_copy(k, &d->k); + bkey_copy(k, &d->k); + d->dirty = false; spin_unlock(&d->lock); ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED); bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); + c, "error flushing deferred btree update: %i", ret); spin_lock(&d->lock); } - if (gen == d->gen) + if (!d->dirty) bch2_journal_pin_drop(j, &d->journal); spin_unlock(&d->lock); + bch2_journal_preres_put(j, &res); if (k != (void *) tmp) kfree(k); } @@ -289,6 +298,7 @@ btree_insert_key_deferred(struct btree_insert *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct deferred_update *d = insert->d; + int difference; 
BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); BUG_ON(insert->k->u64s > d->allocated_u64s); @@ -296,12 +306,21 @@ btree_insert_key_deferred(struct btree_insert *trans, __btree_journal_key(trans, d->btree_id, insert->k); spin_lock(&d->lock); - d->gen++; + BUG_ON(jset_u64s(insert->k->u64s) > + trans->journal_preres.u64s); + + difference = jset_u64s(insert->k->u64s) - d->res.u64s; + if (difference > 0) { + trans->journal_preres.u64s -= difference; + d->res.u64s += difference; + } + bkey_copy(&d->k, insert->k); - spin_unlock(&d->lock); + d->dirty = true; bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, deferred_update_flush); + spin_unlock(&d->lock); return BTREE_INSERT_OK; } @@ -520,13 +539,16 @@ retry: } if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + unsigned flags = (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + ? JOURNAL_RES_GET_RESERVED : 0; + u64s = 0; trans_for_each_entry(trans, i) u64s += jset_u64s(i->k->k.u64s); ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, - JOURNAL_RES_GET_NONBLOCK); + flags|JOURNAL_RES_GET_NONBLOCK); if (likely(!ret)) goto got_journal_res; if (ret != -EAGAIN) @@ -537,7 +559,7 @@ retry: ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, - JOURNAL_RES_GET_CHECK); + flags|JOURNAL_RES_GET_CHECK); if (ret) return ret; @@ -587,6 +609,10 @@ got_journal_res: } } out: + BUG_ON(ret && + (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && + trans->journal_res.ref); + multi_unlock_write(trans); bch2_journal_res_put(&c->journal, &trans->journal_res); @@ -628,7 +654,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans) struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *linked; - unsigned flags; + unsigned flags, u64s = 0; int ret; BUG_ON(!trans->nr); @@ -639,11 +665,39 @@ int __bch2_btree_insert_at(struct btree_insert *trans) if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); + trans_for_each_entry(trans, i) + if (i->deferred) + u64s += jset_u64s(i->k->k.u64s); + + if (u64s) { + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (!ret) + goto got_journal_preres; + if (ret != -EAGAIN) + return ret; + + btree_trans_unlock(trans); + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, 0); + if (ret) + return ret; + + if (!btree_trans_relock(trans)) { + trans_restart(" (iter relock after journal preres get blocked)"); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + return -EINTR; + } + } +got_journal_preres: if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) return -EROFS; @@ -675,6 +729,8 @@ retry: trans_for_each_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); -- cgit From 5154704b29e58a5fd9acd601b831d99298a76a6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jul 2018 22:27:07 -0400 Subject: bcachefs: Use deferred btree updates for inode updates Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.h | 2 -- fs/bcachefs/fs-io.c | 35 +++++++++++++++++++++-------------- fs/bcachefs/fs.c | 23 +++++++++++++++++++++-- 
fs/bcachefs/fs.h | 1 + 5 files changed, 44 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 52e0e003153b..a64ed6d32175 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -104,6 +104,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); +bool bch2_btree_iter_relock(struct btree_iter *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 48b50e066186..c036cd0458a4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -203,8 +203,6 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, level); } -bool bch2_btree_iter_relock(struct btree_iter *); - void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7681cfbc6bed..f8657baf0521 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -287,11 +287,11 @@ static int bch2_extent_update(struct btree_trans *trans, bool direct, s64 *total_delta) { - struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; bool allocating = false; bool extended = false; + bool inode_locked = false; s64 i_sectors_delta; int ret; @@ -314,16 +314,20 @@ static int bch2_extent_update(struct btree_trans *trans, /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + bch2_btree_iter_unlock(extent_iter); + mutex_lock(&inode->ei_update_lock); - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; + if (!bch2_btree_iter_relock(extent_iter)) { + mutex_unlock(&inode->ei_update_lock); + return -EINTR; + } + + inode_locked = true; + + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(trans->c, + BTREE_ID_INODES, 64); inode_u = inode->ei_inode; inode_u.bi_sectors += i_sectors_delta; @@ -337,7 +341,8 @@ static int bch2_extent_update(struct btree_trans *trans, bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p.inode.k_i)); } ret = bch2_trans_commit(trans, disk_res, @@ -371,13 +376,15 @@ static int bch2_extent_update(struct btree_trans *trans, if (total_delta) *total_delta += i_sectors_delta; err: - if (!IS_ERR_OR_NULL(inode_iter)) - bch2_trans_iter_put(trans, inode_iter); + if (inode_locked) + mutex_unlock(&inode->ei_update_lock); + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { + struct bch_fs *c = wop->c; struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); struct quota_res *quota_res = op->is_dio @@ -392,7 +399,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_trans_init(&trans, wop->c); + bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 02c7543e40c8..5f93ea76785f 100644 --- 
a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -156,12 +156,18 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, inode_set_fn set, void *p) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_inode_buf *inode_p; int ret; lockdep_assert_held(&inode->ei_update_lock); + /* XXX: Don't do this with btree locks held */ + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); +#if 0 iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode->v.i_ino, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -172,7 +178,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - +#endif *inode_u = inode->ei_inode; if (set) { @@ -186,7 +192,15 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + + if (!inode->ei_inode_update) + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + else + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p->inode.k_i)); + return 0; } @@ -1431,6 +1445,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); + inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1494,6 +1509,10 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + if (inode->ei_inode_update) + bch2_deferred_update_free(c, inode->ei_inode_update); + inode->ei_inode_update = NULL; + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index f949cd0d2a68..b9a8a9bc3e90 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -34,6 +34,7 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; + struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; -- cgit From 446c562c2c60ec074c841725c410ee5106405956 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Mar 2019 17:54:28 -0500 Subject: bcachefs: Remove direct use of bch2_btree_iter_link() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_iter.h | 2 -- fs/bcachefs/fs-io.c | 35 ++++++++++++++++++++--------------- 3 files changed, 22 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3eb51724f9e1..01f829cc9cc7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1611,7 +1611,7 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, prefetch(c->btree_roots[btree_id].b); } -void bch2_btree_iter_unlink(struct btree_iter *iter) +static void bch2_btree_iter_unlink(struct btree_iter *iter) { struct btree_iter *linked; @@ -1630,7 +1630,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter) BUG(); } -void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) +static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) { BUG_ON(btree_iter_linked(new)); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index a64ed6d32175..fcec373db39a 100644 --- a/fs/bcachefs/btree_iter.h +++ 
b/fs/bcachefs/btree_iter.h @@ -164,8 +164,6 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter, ? BTREE_ITER_IS_EXTENTS : 0)|flags); } -void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); -void bch2_btree_iter_unlink(struct btree_iter *); void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos btree_type_successor(enum btree_id id, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f8657baf0521..eda6d71646e1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -229,20 +229,21 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* normal i_size/i_sectors update machinery: */ -static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, - bool *allocating) +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, bool *allocating, + s64 *delta) { - struct btree_iter iter; + struct btree_iter *iter; struct bkey_s_c old; - s64 delta = 0; - bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); + *delta = 0; - bch2_btree_iter_link(_iter, &iter); - bch2_btree_iter_copy(&iter, _iter); + iter = bch2_trans_copy_iter(trans, extent_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); - old = bch2_btree_iter_peek_slot(&iter); + old = bch2_btree_iter_peek_slot(iter); while (1) { /* @@ -258,7 +259,7 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) *allocating = true; - delta += (min(new->k.p.offset, + *delta += (min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k))) * @@ -268,12 +269,11 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - old = bch2_btree_iter_next_slot(&iter); + old = bch2_btree_iter_next_slot(iter); } - bch2_btree_iter_unlink(&iter); - - return delta; + bch2_trans_iter_free(trans, iter); + return 0; } static int bch2_extent_update(struct btree_trans *trans, @@ -303,7 +303,12 @@ static int bch2_extent_update(struct btree_trans *trans, bch2_extent_trim_atomic(k, extent_iter); - i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating); + ret = sum_sector_overwrites(trans, extent_iter, + k, &allocating, + &i_sectors_delta); + if (ret) + return ret; + if (!may_allocate && allocating) return -ENOSPC; -- cgit From 59928c12206ce8f478113403562947e21a47883a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 16:33:56 -0500 Subject: bcachefs: Don't BUG_ON() on bucket sector count overflow Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5011e7af3563..bddddcb93bc6 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -541,29 +541,40 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, } #define checked_add(a, b) \ -do { \ +({ \ unsigned _res = (unsigned) (a) + (b); \ + bool overflow = _res > U16_MAX; \ + if (overflow) \ + _res = U16_MAX; \ (a) = _res; \ - BUG_ON((a) != _res); \ -} while (0) + overflow; \ +}) static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors, bool gc) { - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = 
__bucket(ca, b, gc); - struct bucket_mark new; + struct bucket_mark old, new; + bool overflow; BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); - bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ new.dirty = true; new.data_type = type; - checked_add(new.dirty_sectors, sectors); + overflow = checked_add(new.dirty_sectors, sectors); })); + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %u > U16_MAX", + old.dirty_sectors, sectors); + + if (c) + bch2_dev_usage_update(c, ca, this_cpu_ptr(c->usage[gc]), + old, new, gc); + return 0; } @@ -581,19 +592,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ca, b, type, sectors); } else { - struct bucket *g; - struct bucket_mark new; - - rcu_read_lock(); - - g = bucket(ca, b); - bucket_cmpxchg(g, new, ({ - new.dirty = true; - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); - - rcu_read_unlock(); + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); } preempt_enable(); @@ -636,6 +635,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); size_t b = PTR_BUCKET_NR(ca, &p.ptr); struct bucket *g = __bucket(ca, b, gc); + bool overflow; u64 v; v = atomic64_read(&g->_mark.v); @@ -657,9 +657,9 @@ static bool bch2_mark_pointer(struct bch_fs *c, } if (!p.ptr.cached) - checked_add(new.dirty_sectors, sectors); + overflow = checked_add(new.dirty_sectors, sectors); else - checked_add(new.cached_sectors, sectors); + overflow = checked_add(new.cached_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -681,6 +681,12 @@ static bool bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? 
old.dirty_sectors + : old.cached_sectors, sectors); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); -- cgit From b5d056358d1da10738500a272a4e7967d55e9341 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 17:19:04 -0500 Subject: bcachefs: minor journal reclaim fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 053fa4aa4f5f..2f67ea2debd2 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -502,8 +502,8 @@ void bch2_journal_reclaim(struct journal *j) nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; - seq_to_flush = max_t(u64, seq_to_flush, - ja->bucket_seq[bucket_to_flush]); + seq_to_flush = max(seq_to_flush, + ja->bucket_seq[bucket_to_flush]); } /* Also flush if the pin fifo is more than half full */ @@ -520,12 +520,14 @@ void bch2_journal_reclaim(struct journal *j) msecs_to_jiffies(j->reclaim_delay_ms))) min_nr = 1; - if (j->prereserved.reserved * 2 > j->prereserved.remaining) + if (j->prereserved.reserved * 2 > j->prereserved.remaining) { + seq_to_flush = max(seq_to_flush, journal_last_seq(j)); min_nr = 1; + } journal_flush_pins(j, seq_to_flush, min_nr); - if (!test_bit(BCH_FS_RO, &c->flags)) + if (!bch2_journal_error(j)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); } -- cgit From 812af308de34f5bc3fc0d30a00f826ad159a724f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 19:45:57 -0500 Subject: bcachefs: assertion to catch outstanding bug Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 66e174d93a9c..c89fe5d630e4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1027,7 +1027,10 @@ void bch2_fs_mark_clean(struct bch_fs *c, bool clean) sb_clean->flags = 0; sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); - sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1; + sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; entry = bch2_journal_super_entries_add_common(c, entry, 0); -- cgit From db6447b383e5299e864826a3adc29b5eca04f4c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 23:14:35 -0500 Subject: bcachefs: fix a faulty assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0aae8fd74c8a..64f9c5740ec8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -248,13 +248,13 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; - EBUG_ON(journal_state_count(new, new.idx)); - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) return -EROFS; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + + EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); -- cgit From 
a8e00bd48a839aa9ee134c9966872523cd8075e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 23:13:39 -0500 Subject: bcachefs: increase BTREE_ITER_MAX Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 93 ++++++++++++++++++++++++++++++---------------- fs/bcachefs/btree_types.h | 19 ++++++---- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/super.c | 4 +- 4 files changed, 77 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 01f829cc9cc7..db7ae19bd1cd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1672,7 +1672,7 @@ static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, ssize_t idx = iter - trans->iters; BUG_ON(idx < 0 || idx >= trans->nr_iters); - BUG_ON(!(trans->iters_live & (1U << idx))); + BUG_ON(!(trans->iters_live & (1ULL << idx))); return idx; } @@ -1682,7 +1682,7 @@ void bch2_trans_iter_put(struct btree_trans *trans, { ssize_t idx = btree_trans_iter_idx(trans, iter); - trans->iters_live &= ~(1U << idx); + trans->iters_live &= ~(1ULL << idx); } void bch2_trans_iter_free(struct btree_trans *trans, @@ -1690,23 +1690,50 @@ void bch2_trans_iter_free(struct btree_trans *trans, { ssize_t idx = btree_trans_iter_idx(trans, iter); - trans->iters_live &= ~(1U << idx); - trans->iters_linked &= ~(1U << idx); + trans->iters_live &= ~(1ULL << idx); + trans->iters_linked &= ~(1ULL << idx); bch2_btree_iter_unlink(iter); } -static int btree_trans_realloc_iters(struct btree_trans *trans) +static int btree_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) { - struct btree_iter *new_iters; + void *new_iters, *new_updates; unsigned i; + BUG_ON(new_size > BTREE_ITER_MAX); + + if (new_size <= trans->size) + return 0; + + BUG_ON(trans->used_mempool); + bch2_trans_unlock(trans); + new_iters = kmalloc(sizeof(struct btree_iter) * new_size + + sizeof(struct btree_insert_entry) * (new_size + 4), + GFP_NOFS); + if (new_iters) + goto success; + new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + new_size = BTREE_ITER_MAX; + + trans->used_mempool = true; +success: + new_updates = new_iters + sizeof(struct btree_iter) * new_size; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); - trans->iters = new_iters; + memcpy(new_updates, trans->updates, + sizeof(struct btree_insert_entry) * trans->nr_updates); + + if (trans->iters != trans->iters_onstack) + kfree(trans->iters); + + trans->iters = new_iters; + trans->updates = new_updates; + trans->size = new_size; for (i = 0; i < trans->nr_iters; i++) trans->iters[i].next = &trans->iters[i]; @@ -1732,8 +1759,7 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) void bch2_trans_preload_iters(struct btree_trans *trans) { - if (trans->iters == trans->iters_onstack) - btree_trans_realloc_iters(trans); + btree_trans_realloc_iters(trans, BTREE_ITER_MAX); } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, @@ -1746,7 +1772,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, BUG_ON(trans->nr_iters > BTREE_ITER_MAX); for (idx = 0; idx < trans->nr_iters; idx++) - if (trans->iter_ids[idx] == iter_id) + if (trans->iters[idx].id == iter_id) goto found; idx = -1; found: @@ -1755,19 +1781,20 @@ found: if (idx < trans->nr_iters) goto got_slot; - BUG_ON(trans->nr_iters == BTREE_ITER_MAX); + BUG_ON(trans->nr_iters > trans->size); - if (trans->iters == trans->iters_onstack && - trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) { 
- int ret = btree_trans_realloc_iters(trans); + if (trans->nr_iters == trans->size) { + int ret = btree_trans_realloc_iters(trans, + trans->size * 2); if (ret) return ERR_PTR(ret); } idx = trans->nr_iters++; + BUG_ON(trans->nr_iters > trans->size); got_slot: - trans->iter_ids[idx] = iter_id; iter = &trans->iters[idx]; + iter->id = iter_id; bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags); } else { @@ -1777,15 +1804,15 @@ got_slot: iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); } - BUG_ON(trans->iters_live & (1 << idx)); - trans->iters_live |= 1 << idx; + BUG_ON(trans->iters_live & (1ULL << idx)); + trans->iters_live |= 1ULL << idx; if (trans->iters_linked && !(trans->iters_linked & (1 << idx))) bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], iter); - trans->iters_linked |= 1 << idx; + trans->iters_linked |= 1ULL << idx; btree_trans_verify(trans); @@ -1869,6 +1896,7 @@ int bch2_trans_unlock(struct btree_trans *trans) void __bch2_trans_begin(struct btree_trans *trans) { + u64 linked_not_live; unsigned idx; btree_trans_verify(trans); @@ -1881,10 +1909,15 @@ void __bch2_trans_begin(struct btree_trans *trans) * further (allocated an iter with a higher idx) than where the iter * was originally allocated: */ - while (trans->iters_linked && - trans->iters_live && - (idx = __fls(trans->iters_linked)) > - __fls(trans->iters_live)) { + while (1) { + linked_not_live = trans->iters_linked & ~trans->iters_live; + if (!linked_not_live) + break; + + idx = __ffs64(linked_not_live); + if (1ULL << idx > trans->iters_live) + break; + trans->iters_linked ^= 1 << idx; bch2_btree_iter_unlink(&trans->iters[idx]); } @@ -1898,16 +1931,12 @@ void __bch2_trans_begin(struct btree_trans *trans) void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) { + memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + trans->c = c; - trans->nr_restarts = 0; - trans->nr_iters = 0; - trans->iters_live = 0; - trans->iters_linked = 0; - trans->nr_updates = 0; - trans->mem_top = 0; - trans->mem_bytes = 0; - trans->mem = NULL; + trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; + trans->updates = trans->updates_onstack; } int bch2_trans_exit(struct btree_trans *trans) @@ -1915,8 +1944,10 @@ int bch2_trans_exit(struct btree_trans *trans) int ret = bch2_trans_unlock(trans); kfree(trans->mem); - if (trans->iters != trans->iters_onstack) + if (trans->used_mempool) mempool_free(trans->iters, &trans->c->btree_iters_pool); + else if (trans->iters != trans->iters_onstack) + kfree(trans->iters); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return ret; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c24d2ce01463..a2a2192865ee 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -232,19 +232,18 @@ struct btree_iter { */ struct bkey k; + u64 id; + /* * Circular linked list of linked iterators: linked iterators share * locks (e.g. two linked iterators may have the same node intent * locked, or read and write locked, at the same time), and insertions * through one iterator won't invalidate the other linked iterators. 
*/ - /* Must come last: */ struct btree_iter *next; }; -#define BTREE_ITER_MAX 8 - struct deferred_update { struct journal_preres res; struct journal_entry_pin journal; @@ -270,25 +269,29 @@ struct btree_insert_entry { bool deferred; }; +#define BTREE_ITER_MAX 64 + struct btree_trans { struct bch_fs *c; size_t nr_restarts; + u64 iters_live; + u64 iters_linked; + u8 nr_iters; - u8 iters_live; - u8 iters_linked; u8 nr_updates; + u8 size; + unsigned used_mempool:1; unsigned mem_top; unsigned mem_bytes; void *mem; struct btree_iter *iters; - u64 iter_ids[BTREE_ITER_MAX]; - - struct btree_insert_entry updates[BTREE_ITER_MAX]; + struct btree_insert_entry *updates; struct btree_iter iters_onstack[2]; + struct btree_insert_entry updates_onstack[6]; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index c4aa5e42b29c..e689841a2cd3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -148,7 +148,7 @@ static inline void bch2_trans_update(struct btree_trans *trans, struct btree_insert_entry entry) { - BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); + BUG_ON(trans->nr_updates >= trans->nr_iters + 4); trans->updates[trans->nr_updates++] = entry; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index be8c4a604d8c..c79aa11f2120 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -649,7 +649,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, - sizeof(struct btree_iter) * BTREE_ITER_MAX) || + sizeof(struct btree_iter) * BTREE_ITER_MAX + + sizeof(struct btree_insert_entry) * + (BTREE_ITER_MAX + 4)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || -- cgit From 844045581e52955525fcc487d59fb3981a412f35 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2019 17:20:33 -0500 Subject: bcachefs: Fix for when compressed extent is split during journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 82 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index db95257cec11..9997a2793e0a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -823,6 +823,63 @@ fsck_err: /* journal replay: */ +static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + BKEY_PADDED(k) split; + struct btree_iter iter; + int ret; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + do { + ret = bch2_btree_iter_traverse(&iter); + if (ret) + break; + + bkey_copy(&split.k, k); + bch2_cut_front(iter.pos, &split.k); + bch2_extent_trim_atomic(&split.k, &iter); + + ret = bch2_disk_reservation_add(c, &disk_res, + split.k.k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + ret = bch2_btree_insert_at(c, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY, + BTREE_INSERT_ENTRY(&iter, &split.k)); + } while ((!ret || ret == -EINTR) && + bkey_cmp(k->k.p, iter.pos)); + + bch2_disk_reservation_put(c, &disk_res); + 
+ /* + * This isn't strictly correct - we should only be relying on the btree + * node lock for synchronization with gc when we've got a write lock + * held. + * + * but - there are other correctness issues if btree gc were to run + * before journal replay finishes + */ + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), + gc_pos_btree_node(iter.l[0].b), + NULL, 0, 0); + bch2_btree_iter_unlock(&iter); + + return ret; +} + int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; @@ -835,27 +892,20 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) j->replay_journal_seq = le64_to_cpu(i->j.seq); for_each_jset_key(k, _n, entry, &i->j) { - - if (entry->btree_id == BTREE_ID_ALLOC) { - /* - * allocation code handles replay for - * BTREE_ID_ALLOC keys: - */ + switch (entry->btree_id) { + case BTREE_ID_ALLOC: ret = bch2_alloc_replay_key(c, k); - } else { - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - + break; + case BTREE_ID_EXTENTS: + ret = bch2_extent_replay_key(c, k); + break; + default: ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, + NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); + break; } if (ret) { -- cgit From 3aea434272884b8dc77c57624a6d3fec55ade898 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Mar 2019 14:53:03 -0500 Subject: bcachefs: Fix for shutting down before fs started marking it clean Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c79aa11f2120..4f627e91f041 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -299,7 +299,8 @@ void bch2_fs_read_only(struct bch_fs *c) if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags)) bch2_fs_mark_clean(c, true); if (c->state != BCH_FS_STOPPING) -- cgit From 61f321fc8bcb844ff0b2520ba71753cb5a511a9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Mar 2019 13:31:02 -0400 Subject: bcachefs: Make deferred inode updates a mount option Journal reclaim may still need performance tuning Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++- fs/bcachefs/fs-io.c | 79 +++++++++++++++++++++++++++++------------- fs/bcachefs/fs.c | 26 +++++++------- fs/bcachefs/opts.h | 7 +++- 4 files changed, 82 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bb067e4f627e..0ea4bebdd0af 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -953,6 +953,13 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); + /* + * XXX: + * when using deferred btree updates, we have journal reclaim doing + * btree updates and thus requiring the allocator to make forward + * progress, and here the allocator is requiring space in the journal - + * so we need a journal pre-reservation: + */ ret = bch2_btree_insert_at(c, NULL, invalidating_cached_data ? 
journal_seq : NULL, BTREE_INSERT_ATOMIC| @@ -960,7 +967,6 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); if (ret == -EINTR) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index eda6d71646e1..251c811abeda 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -287,6 +287,8 @@ static int bch2_extent_update(struct btree_trans *trans, bool direct, s64 *total_delta) { + struct bch_fs *c = trans->c; + struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; bool allocating = false; @@ -319,35 +321,62 @@ static int bch2_extent_update(struct btree_trans *trans, /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - bch2_btree_iter_unlock(extent_iter); - mutex_lock(&inode->ei_update_lock); + if (c->opts.new_inode_updates) { + bch2_btree_iter_unlock(extent_iter); + mutex_lock(&inode->ei_update_lock); - if (!bch2_btree_iter_relock(extent_iter)) { - mutex_unlock(&inode->ei_update_lock); - return -EINTR; - } + if (!bch2_btree_iter_relock(extent_iter)) { + mutex_unlock(&inode->ei_update_lock); + return -EINTR; + } - inode_locked = true; + inode_locked = true; - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(trans->c, - BTREE_ID_INODES, 64); + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, + BTREE_ID_INODES, 64); - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; + } - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p.inode.k_i)); + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p.inode.k_i)); + } else { + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; + + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; + + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; + } + + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + } } ret = bch2_trans_commit(trans, disk_res, @@ -376,11 +405,13 @@ static int bch2_extent_update(struct btree_trans *trans, } if (direct) - i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta); + i_sectors_acct(c, inode, quota_res, i_sectors_delta); if (total_delta) *total_delta += i_sectors_delta; err: + if (!IS_ERR_OR_NULL(inode_iter)) + bch2_trans_iter_put(trans, inode_iter); if (inode_locked) mutex_unlock(&inode->ei_update_lock); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5f93ea76785f..dc55d36ecfd5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -163,22 +163,24 @@ int 
__must_check bch2_write_inode_trans(struct btree_trans *trans, lockdep_assert_held(&inode->ei_update_lock); + if (c->opts.new_inode_updates) { /* XXX: Don't do this with btree locks held */ if (!inode->ei_inode_update) inode->ei_inode_update = bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); -#if 0 - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode->v.i_ino, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - /* The btree node lock is our lock on the inode: */ - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; -#endif + } else { + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + } + *inode_u = inode->ei_inode; if (set) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f4cb0625c3cc..53bf06e70cd5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -295,7 +295,12 @@ enum opt_type { OPT_UINT(0, BCH_REPLICAS_MAX), \ NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") + "to have already been replicated n times") \ + x(new_inode_updates, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable new btree write-cache for inode updates") struct bch_opts { -- cgit From 18c9883e1cddb6a8c3f593b0c2c921bb0179fa50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Mar 2019 16:56:48 -0400 Subject: bcachefs: fix bch2_invalidate_one_bucket2() during journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0ea4bebdd0af..2f1a8e70ad88 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -939,7 +939,7 @@ retry: else memset(&u, 0, sizeof(u)); - invalidating_cached_data = u.cached_sectors != 0; + invalidating_cached_data = m.cached_sectors != 0; //BUG_ON(u.dirty_sectors); u.data_type = 0; @@ -947,7 +947,13 @@ retry: u.cached_sectors = 0; u.read_time = c->bucket_clock[READ].hand; u.write_time = c->bucket_clock[WRITE].hand; - u.gen++; + + /* + * The allocator has to start before journal replay is finished - thus, + * we have to trust the in memory bucket @m, not the version in the + * btree: + */ + u.gen = m.gen + 1; a = bkey_alloc_init(&alloc_key.k); a->k.p = iter->pos; @@ -963,6 +969,7 @@ retry: ret = bch2_btree_insert_at(c, NULL, invalidating_cached_data ? journal_seq : NULL, BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| @@ -982,6 +989,10 @@ retry: if (!top->nr) heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + /* with btree still locked: */ + if (ca->buckets_written) + set_bit(b, ca->buckets_written); + /* * Make sure we flush the last journal entry that updated this * bucket (i.e. 
deleting the last reference) before writing to -- cgit From db8a5f0a6abd1981caf4b87cc65e5a998851f9c6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Mar 2019 19:34:16 -0400 Subject: bcachefs: fix bch2_mark_bkey_replicas() call Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7749fccd0eab..b484cbc3071e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -797,8 +797,13 @@ err: } bch2_btree_iter_unlock(trans->entries[0].iter); - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)) - ?: -EINTR; + ret = -EINTR; + + trans_for_each_iter(trans, i) { + int ret2 = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); + if (ret2) + ret = ret2; + } break; default: BUG_ON(ret >= 0); -- cgit From c93cead058779ec6911047d7084c43e3da3e7eaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Mar 2019 14:27:40 -0400 Subject: bcachefs: Always use bch2_extent_trim_atomic() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 -- fs/bcachefs/btree_update.h | 3 -- fs/bcachefs/btree_update_leaf.c | 80 +++++++++++------------------------------ fs/bcachefs/extents.c | 34 +++++++++--------- fs/bcachefs/extents.h | 14 ++------ fs/bcachefs/io.c | 28 ++++++++++++--- 6 files changed, 63 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a2a2192865ee..07d98caf204e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -490,8 +490,6 @@ struct btree_root { enum btree_insert_ret { BTREE_INSERT_OK, - /* extent spanned multiple leaf nodes: have to traverse to next node: */ - BTREE_INSERT_NEED_TRAVERSE, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_ENOSPC, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e689841a2cd3..df2e24a0688d 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -128,9 +128,6 @@ enum { int bch2_btree_delete_at(struct btree_iter *, unsigned); -int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, - struct disk_reservation *, u64 *, unsigned); - int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b484cbc3071e..4ab4dfcf843d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -185,9 +185,8 @@ void bch2_btree_journal_key(struct btree_insert *trans, set_btree_node_dirty(b); } -static enum btree_insert_ret -bch2_insert_fixup_key(struct btree_insert *trans, - struct btree_insert_entry *insert) +static void bch2_insert_fixup_key(struct btree_insert *trans, + struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; struct btree_iter_level *l = &iter->l[0]; @@ -199,30 +198,27 @@ bch2_insert_fixup_key(struct btree_insert *trans, if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert->k)) bch2_btree_journal_key(trans, iter, insert->k); - - return BTREE_INSERT_OK; } /** * btree_insert_key - insert a key one key into a leaf node */ -static enum btree_insert_ret -btree_insert_key_leaf(struct btree_insert *trans, - struct btree_insert_entry *insert) +static void btree_insert_key_leaf(struct btree_insert *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; struct btree_iter 
*iter = insert->iter; struct btree *b = iter->l[0].b; - enum btree_insert_ret ret; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; bch2_mark_update(trans, insert); - ret = !btree_node_is_extents(b) - ? bch2_insert_fixup_key(trans, insert) - : bch2_insert_fixup_extent(trans, insert); + if (!btree_node_is_extents(b)) + bch2_insert_fixup_key(trans, insert); + else + bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -237,7 +233,6 @@ btree_insert_key_leaf(struct btree_insert *trans, bch2_btree_iter_reinit_node(iter, b); trace_btree_insert_key(c, b, insert->k); - return ret; } /* Deferred btree updates: */ @@ -291,9 +286,8 @@ static void deferred_update_flush(struct journal *j, kfree(k); } -static enum btree_insert_ret -btree_insert_key_deferred(struct btree_insert *trans, - struct btree_insert_entry *insert) +static void btree_insert_key_deferred(struct btree_insert *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; @@ -321,8 +315,6 @@ btree_insert_key_deferred(struct btree_insert *trans, bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, deferred_update_flush); spin_unlock(&d->lock); - - return BTREE_INSERT_OK; } void bch2_deferred_update_free(struct bch_fs *c, @@ -485,13 +477,13 @@ btree_key_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } -static inline enum btree_insert_ret -do_btree_insert_one(struct btree_insert *trans, - struct btree_insert_entry *insert) +static inline void do_btree_insert_one(struct btree_insert *trans, + struct btree_insert_entry *insert) { - return likely(!insert->deferred) - ? 
btree_insert_key_leaf(trans, insert) - : btree_insert_key_deferred(trans, insert); + if (likely(!insert->deferred)) + btree_insert_key_leaf(trans, insert); + else + btree_insert_key_deferred(trans, insert); } /* @@ -595,19 +587,8 @@ got_journal_res: } trans->did_work = true; - trans_for_each_entry(trans, i) { - switch (do_btree_insert_one(trans, i)) { - case BTREE_INSERT_OK: - break; - case BTREE_INSERT_NEED_TRAVERSE: - BUG_ON((trans->flags & - (BTREE_INSERT_ATOMIC|BTREE_INSERT_NOUNLOCK))); - ret = -EINTR; - goto out; - default: - BUG(); - } - } + trans_for_each_entry(trans, i) + do_btree_insert_one(trans, i); out: BUG_ON(ret && (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && @@ -629,6 +610,8 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, if (!i->deferred) { BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !bch2_extent_is_atomic(i->k, i->iter)); bch2_btree_iter_verify_locks(i->iter); } @@ -875,28 +858,6 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) BTREE_INSERT_ENTRY(iter, &k)); } -int bch2_btree_insert_list_at(struct btree_iter *iter, - struct keylist *keys, - struct disk_reservation *disk_res, - u64 *journal_seq, unsigned flags) -{ - BUG_ON(flags & BTREE_INSERT_ATOMIC); - BUG_ON(bch2_keylist_empty(keys)); - bch2_verify_keylist_sorted(keys); - - while (!bch2_keylist_empty(keys)) { - int ret = bch2_btree_insert_at(iter->c, disk_res, - journal_seq, flags, - BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); - if (ret) - return ret; - - bch2_keylist_pop_front(keys); - } - - return 0; -} - /** * bch_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -962,6 +923,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); + bch2_extent_trim_atomic(&delete, &iter); } ret = bch2_btree_insert_at(c, NULL, journal_seq, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 41194462be30..1233b3d0b352 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -928,15 +928,25 @@ static void extent_insert_committed(struct extent_insert_state *s) insert->k.needs_whiteout = false; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +static inline struct bpos +bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter) { struct btree *b = iter->l[0].b; BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); - bch2_cut_back(b->key.k.p, &k->k); + return bpos_min(k->k.p, b->key.k.p); +} - BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); +void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); +} + +bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); } enum btree_insert_ret @@ -952,9 +962,6 @@ bch2_extent_can_insert(struct btree_insert *trans, struct bkey_s_c k; int sectors; - BUG_ON(trans->flags & BTREE_INSERT_ATOMIC && - !bch2_extent_is_atomic(&insert->k->k, insert->iter)); - /* * We avoid creating whiteouts whenever possible when deleting, but * those optimizations mean we may potentially insert two whiteouts @@ -1216,12 +1223,10 @@ next: * If the end of iter->pos is not the same as the end of insert, then * key insertion needs to continue/be retried. 
*/ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) +void bch2_insert_fixup_extent(struct btree_insert *trans, + struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; - struct btree *b = iter->l[0].b; struct extent_insert_state s = { .trans = trans, .insert = insert, @@ -1248,16 +1253,9 @@ bch2_insert_fixup_extent(struct btree_insert *trans, extent_insert_committed(&s); + BUG_ON(insert->k->k.size); EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); - - if (insert->k->k.size) { - /* got to the end of this leaf node */ - BUG_ON(bkey_cmp(iter->pos, b->key.k.p)); - return BTREE_INSERT_NEED_TRAVERSE; - } - - return BTREE_INSERT_OK; } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 17cae891bccb..c3d67cafc0c3 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -407,21 +407,13 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, } void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); - -static inline bool bch2_extent_is_atomic(struct bkey *k, - struct btree_iter *iter) -{ - struct btree *b = iter->l[0].b; - - return bkey_cmp(k->p, b->key.k.p) <= 0 && - bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0; -} +bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, unsigned *); -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); +void bch2_insert_fixup_extent(struct btree_insert *, + struct btree_insert_entry *); void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, unsigned, unsigned); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index e1c7d572fbff..092500591b8f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -293,18 +293,36 @@ static void bch2_write_done(struct closure *cl) int bch2_write_index_default(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct btree_iter iter; int ret; - bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_INTENT); - ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + do { + BKEY_PADDED(k) split; + + bkey_copy(&split.k, bch2_keylist_front(keys)); + + bch2_extent_trim_atomic(&split.k, &iter); + + ret = bch2_btree_insert_at(c, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &split.k)); + if (ret) + break; + + if (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + bch2_btree_iter_unlock(&iter); return ret; -- cgit From 9a12b1b0978837f19a5ccc2312aeae535d8289a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Mar 2019 14:27:40 -0400 Subject: bcachefs: Refactor extent insert path Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 152 ++++++++++++++++++-------------------------------- 1 file changed, 54 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1233b3d0b352..420a9a6c59e7 100644 --- a/fs/bcachefs/extents.c 
+++ b/fs/bcachefs/extents.c @@ -782,18 +782,6 @@ static bool extent_i_save(struct btree *b, struct bkey_packed *dst, return true; } -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - - /* for deleting: */ - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; - static bool bch2_extent_merge_inline(struct bch_fs *, struct btree_iter *, struct bkey_packed *, @@ -880,54 +868,6 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_verify(iter, l->b); } -static void extent_insert_committed(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = s->insert->k; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - bkey_copy(&split.k, insert); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; - - bch2_cut_back(s->committed, &split.k.k); - - if (!bkey_cmp(s->committed, iter->pos)) - return; - - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - - if (s->update_btree) { - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&split.k)); - - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); - - extent_bset_insert(c, iter, &split.k); - } - - if (s->update_journal) { - bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout); - if (s->deleting) - split.k.k.type = KEY_TYPE_discard; - - bch2_cut_back(s->committed, &split.k.k); - - EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); - - bch2_btree_journal_key(s->trans, iter, &split.k); - } - - bch2_cut_front(s->committed, insert); - - insert->k.needs_whiteout = false; -} - static inline struct bpos bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter) { @@ -1005,12 +945,11 @@ bch2_extent_can_insert(struct btree_insert *trans, } static void -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; switch (overlap) { @@ -1096,34 +1035,39 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, } } -static void __bch2_insert_fixup_extent(struct extent_insert_state *s) +struct extent_insert_state { + struct bkey_i whiteout; + bool update_journal; + bool update_btree; + bool deleting; +}; + +static void __bch2_insert_fixup_extent(struct bch_fs *c, + struct btree_iter *iter, + struct bkey_i *insert, + struct extent_insert_state *s) { - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_discard))) { struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); - - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - s->committed = 
bpos_min(s->insert->k->k.p, k.k->p); - if (!bkey_whiteout(k.k)) s->update_journal = true; if (!s->update_journal) { - bch2_cut_front(s->committed, insert); - bch2_cut_front(s->committed, &s->whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &s->whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); goto next; } @@ -1157,19 +1101,16 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s) _k->needs_whiteout = false; } - extent_squash(s, insert, _k, k, overlap); + extent_squash(c, iter, insert, _k, k, overlap); if (!s->update_btree) - bch2_cut_front(s->committed, insert); + bch2_cut_front(cur_end, insert); next: if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - if (bkey_cmp(s->committed, insert->k.p) < 0) - s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p); - /* * may have skipped past some deleted extents greater than the insert * key, before we got to a non deleted extent and knew we could bail out @@ -1179,7 +1120,7 @@ next: struct btree_node_iter node_iter = l->iter; while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &s->committed) > 0) + bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) l->iter = node_iter; } } @@ -1226,36 +1167,51 @@ next: void bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_insert_entry *insert) { + struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = iter->pos, - .whiteout = *insert->k, .update_journal = !bkey_whiteout(&insert->k->k), .update_btree = !bkey_whiteout(&insert->k->k), .deleting = bkey_whiteout(&insert->k->k), }; + BKEY_PADDED(k) tmp; EBUG_ON(iter->level); EBUG_ON(!insert->k->k.size); - - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - __bch2_insert_fixup_extent(&s); + __bch2_insert_fixup_extent(c, iter, insert->k, &s); - extent_insert_committed(&s); + bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); - BUG_ON(insert->k->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); + if (s.update_btree) { + bkey_copy(&tmp.k, insert->k); + + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, + bkey_i_to_s_c(&tmp.k)); + + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + + extent_bset_insert(c, iter, &tmp.k); + } + + if (s.update_journal) { + bkey_copy(&tmp.k, !s.deleting ? 
insert->k : &s.whiteout); + + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + + bch2_btree_journal_key(trans, iter, &tmp.k); + } + + bch2_cut_front(insert->k->k.p, insert->k); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -- cgit From 94d290e40c255ea854ec1050dbf9a8b60340a749 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2019 16:25:42 -0400 Subject: bcachefs: drop btree_insert->did_work Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 1 - fs/bcachefs/btree_update_leaf.c | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index df2e24a0688d..3e6a616b8182 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -30,7 +30,6 @@ struct btree_insert { struct journal_preres journal_preres; u64 *journal_seq; unsigned flags; - bool did_work; unsigned short nr; struct btree_insert_entry *entries; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4ab4dfcf843d..21822bda67fc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -585,7 +585,6 @@ got_journal_res: break; } } - trans->did_work = true; trans_for_each_entry(trans, i) do_btree_insert_one(trans, i); @@ -739,8 +738,7 @@ err: * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree * update; if we haven't done anything yet it doesn't apply */ - if (!trans->did_work) - flags &= ~BTREE_INSERT_NOUNLOCK; + flags &= ~BTREE_INSERT_NOUNLOCK; switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: @@ -756,8 +754,12 @@ err: * XXX: * split -> btree node merging (of parent node) might still drop * locks when we're not passing it BTREE_INSERT_NOUNLOCK + * + * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that + * will inhibit merging - but we don't have a reliable way yet + * (do we?) 
of checking if we dropped locks in this path */ - if (!ret && !trans->did_work) + if (!ret) goto retry; #endif -- cgit From 0564b16782b39d6f59e06f427f32826db69e75a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Mar 2019 20:49:16 -0400 Subject: bcachefs: convert bch2_btree_insert_at() usage to bch2_trans_commit() Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 4 +- fs/bcachefs/alloc_background.c | 141 +++++++++++++++++++++++----------------- fs/bcachefs/btree_update.h | 32 +-------- fs/bcachefs/btree_update_leaf.c | 59 ++++++++++------- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/ec.c | 128 +++++++++++++++++------------------- fs/bcachefs/fsck.c | 104 +++++++++++++++++------------ fs/bcachefs/inode.c | 29 +++++---- fs/bcachefs/io.c | 57 +++++++++------- fs/bcachefs/journal_io.c | 33 +++++----- fs/bcachefs/migrate.c | 30 +++++---- fs/bcachefs/move.c | 44 ++++++------- fs/bcachefs/quota.c | 19 ++++-- fs/bcachefs/str_hash.h | 19 ++---- fs/bcachefs/tests.c | 95 ++++++++++++++++----------- fs/bcachefs/xattr.c | 2 +- 16 files changed, 424 insertions(+), 376 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index bcfc9fdce35e..c7f6bcb87387 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -268,8 +268,8 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inode_u->bi_inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2f1a8e70ad88..c11136506352 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -310,10 +310,53 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) return 0; } -static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bch_dev *ca; + int ret; + + if (k->k.p.inode >= c->sb.nr_devices || + !c->devs[k->k.p.inode]) + return 0; + + ca = bch_dev_bkey_exists(c, k->k.p.inode); + + if (k->k.p.offset >= ca->mi.nbuckets) + return 0; + + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* check buckets_written with btree node locked: */ + if (test_bit(k->k.p.offset, ca->buckets_written)) { + ret = 0; + goto err; + } + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); +err: + bch2_trans_exit(&trans); + return ret; +} + +static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, size_t b, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { + struct bch_fs *c = trans->c; #if 0 __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; #else @@ -349,14 +392,15 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, bch2_btree_iter_cond_resched(iter); - ret = bch2_btree_insert_at(c, NULL, journal_seq, + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + + ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| 
BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_NOMARK| - flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); + flags); if (ret) return ret; @@ -370,42 +414,6 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, return 0; } -int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct bch_dev *ca; - struct btree_iter iter; - int ret; - - if (k->k.p.inode >= c->sb.nr_devices || - !c->devs[k->k.p.inode]) - return 0; - - ca = bch_dev_bkey_exists(c, k->k.p.inode); - - if (k->k.p.offset >= ca->mi.nbuckets) - return 0; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p, - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(&iter); - if (ret) - goto err; - - /* check buckets_written with btree node locked: */ - - ret = test_bit(k->k.p.offset, ca->buckets_written) - ? 0 - : bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK, - BTREE_INSERT_ENTRY(&iter, k)); -err: - bch2_btree_iter_unlock(&iter); - return ret; -} - int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) { struct bch_dev *ca; @@ -415,12 +423,15 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) *wrote = false; for_each_rw_member(ca, c, i) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bucket_array *buckets; size_t b; - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -431,7 +442,7 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) if (!buckets->b[b].mark.dirty) continue; - ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, + ret = __bch2_alloc_write_key(&trans, ca, b, iter, NULL, nowait ? BTREE_INSERT_NOWAIT : 0); @@ -441,7 +452,8 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) *wrote = true; } up_read(&ca->bucket_lock); - bch2_btree_iter_unlock(&iter); + + bch2_trans_exit(&trans); if (ret) { percpu_ref_put(&ca->io_ref); @@ -887,7 +899,8 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) } } -static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca, +static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { @@ -897,6 +910,7 @@ static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca, /* hack: */ __BKEY_PADDED(k, 8) alloc_key; #endif + struct bch_fs *c = trans->c; struct bkey_i_alloc *a; struct bkey_alloc_unpacked u; struct bucket_mark m; @@ -959,6 +973,8 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + /* * XXX: * when using deferred btree updates, we have journal reclaim doing @@ -966,16 +982,15 @@ retry: * progress, and here the allocator is requiring space in the journal - * so we need a journal pre-reservation: */ - ret = bch2_btree_insert_at(c, NULL, - invalidating_cached_data ? journal_seq : NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(trans, NULL, + invalidating_cached_data ? 
journal_seq : NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags); if (ret == -EINTR) goto retry; @@ -1049,23 +1064,27 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, */ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; u64 journal_seq = 0; int ret = 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && !fifo_full(&ca->free_inc) && ca->alloc_heap.used) - ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq, + ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, BTREE_INSERT_GC_LOCK_HELD| (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3e6a616b8182..2bdb8b532aad 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -21,8 +21,6 @@ void bch2_deferred_update_free(struct bch_fs *, struct deferred_update * bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); -/* Normal update interface: */ - struct btree_insert { struct bch_fs *c; struct disk_reservation *disk_res; @@ -35,8 +33,6 @@ struct btree_insert { struct btree_insert_entry *entries; }; -int __bch2_btree_insert_at(struct btree_insert *); - #define BTREE_INSERT_ENTRY(_iter, _k) \ ((struct btree_insert_entry) { \ .iter = (_iter), \ @@ -50,30 +46,6 @@ int __bch2_btree_insert_at(struct btree_insert *); .deferred = true, \ }) -/** - * bch_btree_insert_at - insert one or more keys at iterator positions - * @iter: btree iterator - * @insert_key: key to insert - * @disk_res: disk reservation - * @hook: extent insert callback - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -#define bch2_btree_insert_at(_c, _disk_res, _journal_seq, _flags, ...) 
\ - __bch2_btree_insert_at(&(struct btree_insert) { \ - .c = (_c), \ - .disk_res = (_disk_res), \ - .journal_seq = (_journal_seq), \ - .flags = (_flags), \ - .nr = COUNT_ARGS(__VA_ARGS__), \ - .entries = (struct btree_insert_entry[]) { \ - __VA_ARGS__ \ - }}) - enum { __BTREE_INSERT_ATOMIC, __BTREE_INSERT_NOUNLOCK, @@ -125,7 +97,7 @@ enum { #define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) #define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) -int bch2_btree_delete_at(struct btree_iter *, unsigned); +int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); @@ -138,8 +110,6 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); -/* new transactional interface: */ - static inline void bch2_trans_update(struct btree_trans *trans, struct btree_insert_entry entry) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 21822bda67fc..d2f57b6b924d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -631,7 +631,7 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ -int __bch2_btree_insert_at(struct btree_insert *trans) +static int __bch2_btree_insert_at(struct btree_insert *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -847,17 +847,18 @@ int bch2_trans_commit(struct btree_trans *trans, return __bch2_btree_insert_at(&insert); } -int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { struct bkey_i k; bkey_init(&k.k); k.k.p = iter->pos; - return bch2_btree_insert_at(iter->c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags, - BTREE_INSERT_ENTRY(iter, &k)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); } /** @@ -872,14 +873,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct disk_reservation *disk_res, u64 *journal_seq, int flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; int ret; - bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - ret = bch2_btree_insert_at(c, disk_res, journal_seq, flags, - BTREE_INSERT_ENTRY(&iter, k)); - bch2_btree_iter_unlock(&iter); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + + ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); + bch2_trans_exit(&trans); return ret; } @@ -893,16 +899,18 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - bch2_btree_iter_init(&iter, c, id, start, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); - while ((k = bch2_btree_iter_peek(&iter)).k && + iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && !(ret = btree_iter_err(k)) && - bkey_cmp(iter.pos, end) < 0) { + 
bkey_cmp(iter->pos, end) < 0) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); /* really shouldn't be using a bare, unpadded bkey_i */ struct bkey_i delete; @@ -919,24 +927,25 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, * (bch2_btree_iter_peek() does guarantee that iter.pos >= * bkey_start_pos(k.k)). */ - delete.k.p = iter.pos; + delete.k.p = iter->pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { + if (iter->flags & BTREE_ITER_IS_EXTENTS) { /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_extent_trim_atomic(&delete, &iter); + bch2_extent_trim_atomic(&delete, iter); } - ret = bch2_btree_insert_at(c, NULL, journal_seq, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &delete)); + + ret = bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_NOFAIL); if (ret) break; - bch2_btree_iter_cond_resched(&iter); + bch2_btree_iter_cond_resched(iter); } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 80d37c568272..dc3883204d80 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -151,8 +151,8 @@ int __bch2_dirent_create(struct btree_trans *trans, if (ret) return ret; - return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); } int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index fc73823f6358..a989ba172faa 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -629,36 +629,12 @@ void bch2_stripes_heap_insert(struct bch_fs *c, /* stripe deletion */ -static void ec_stripe_delete(struct bch_fs *c, size_t idx) +static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - struct btree_iter iter; - struct bch_stripe *v = NULL; - struct bkey_s_c k; - struct bkey_i delete; - u64 journal_seq = 0; - - bch2_btree_iter_init(&iter, c, BTREE_ID_EC, - POS(0, idx), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) - goto out; - - v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL); - BUG_ON(!v); - memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k)); - - bkey_init(&delete.k); - delete.k.p = iter.pos; - - bch2_btree_insert_at(c, NULL, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOUNLOCK, - BTREE_INSERT_ENTRY(&iter, &delete)); -out: - bch2_btree_iter_unlock(&iter); - kfree(v); + return bch2_btree_delete_range(c, BTREE_ID_EC, + POS(0, idx), + POS(0, idx + 1), + NULL); } static void ec_stripe_delete_work(struct work_struct *work) @@ -690,39 +666,46 @@ static void ec_stripe_delete_work(struct work_struct *work) static int ec_stripe_bkey_insert(struct bch_fs *c, struct bkey_i_stripe *stripe) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; - /* XXX: start pos hint */ + bch2_trans_init(&trans, c); retry: - for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { - bch2_btree_iter_unlock(&iter); - return -ENOSPC; - } + bch2_trans_begin(&trans); + + /* XXX: start pos hint */ + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + 
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; if (bkey_deleted(k.k)) goto found_slot; } - return bch2_btree_iter_unlock(&iter) ?: -ENOSPC; + ret = -ENOSPC; + goto out; found_slot: - ret = ec_stripe_mem_alloc(c, &iter); + ret = ec_stripe_mem_alloc(c, iter); if (ret == -EINTR) goto retry; if (ret) return ret; - stripe->k.p = iter.pos; + stripe->k.p = iter->pos; - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&iter, &stripe->k_i)); - bch2_btree_iter_unlock(&iter); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); +out: + bch2_trans_exit(&trans); return ret; } @@ -749,23 +732,26 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct ec_stripe_buf *s, struct bkey *pos) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; struct bch_extent_ptr *ptr; BKEY_PADDED(k) tmp; int ret = 0, dev, idx; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(pos), - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); - while ((k = bch2_btree_iter_peek(&iter)).k && - !btree_iter_err(k) && + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = btree_iter_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { idx = extent_matches_stripe(c, &s->key.v, k); if (idx < 0) { - bch2_btree_iter_next(&iter); + bch2_btree_iter_next(iter); continue; } @@ -783,18 +769,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ptr, idx); - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&iter, &tmp.k)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); if (ret == -EINTR) ret = 0; if (ret) break; } - return bch2_btree_iter_unlock(&iter) ?: ret; + bch2_trans_exit(&trans); + + return ret; } /* @@ -1163,13 +1152,14 @@ unlock: mutex_unlock(&c->ec_new_stripe_lock); } -static int __bch2_stripe_write_key(struct bch_fs *c, +static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, size_t idx, struct bkey_i_stripe *new_key, unsigned flags) { + struct bch_fs *c = trans->c; struct bkey_s_c k; unsigned i; int ret; @@ -1195,14 +1185,16 @@ static int __bch2_stripe_write_key(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); - return bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL|flags, - BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); } int bch2_stripes_write(struct bch_fs *c, bool *wrote) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct genradix_iter giter; struct bkey_i_stripe *new_key; struct stripe *m; @@ -1211,14 +1203,16 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); BUG_ON(!new_key); - bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = 
bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { if (!m->dirty) continue; - ret = __bch2_stripe_write_key(c, &iter, m, giter.pos, + ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, new_key, BTREE_INSERT_NOCHECK_RW); if (ret) break; @@ -1226,7 +1220,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) *wrote = true; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); kfree(new_key); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e79846a96f9c..2561773cd6dc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -152,7 +152,7 @@ static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, } static int hash_redo_key(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, + struct btree_trans *trans, struct hash_check *h, struct btree_iter *k_iter, struct bkey_s_c k, u64 hashed) { @@ -165,15 +165,16 @@ static int hash_redo_key(const struct bch_hash_desc desc, bkey_reassemble(tmp, k); - ret = bch2_btree_delete_at(k_iter, 0); + ret = bch2_btree_delete_at(trans, k_iter, 0); if (ret) goto err; bch2_btree_iter_unlock(k_iter); - bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL, tmp, - BTREE_INSERT_NOFAIL| - BCH_HASH_SET_MUST_CREATE); + bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); err: kfree(tmp); return ret; @@ -272,9 +273,10 @@ static bool key_has_correct_hash(const struct bch_hash_desc desc, } static int hash_check_key(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, + struct btree_trans *trans, struct hash_check *h, struct btree_iter *k_iter, struct bkey_s_c k) { + struct bch_fs *c = trans->c; char buf[200]; u64 hashed; int ret = 0; @@ -300,7 +302,7 @@ static int hash_check_key(const struct bch_hash_desc desc, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = hash_redo_key(desc, h, c, k_iter, k, hashed); + ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -313,9 +315,10 @@ fsck_err: return ret; } -static int check_dirent_hash(struct hash_check *h, struct bch_fs *c, +static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, struct btree_iter *iter, struct bkey_s_c *k) { + struct bch_fs *c = trans->c; struct bkey_i_dirent *d = NULL; int ret = -EINVAL; char buf[200]; @@ -360,9 +363,9 @@ static int check_dirent_hash(struct hash_check *h, struct bch_fs *c, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &d->k_i)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); + + ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) goto err; @@ -384,8 +387,8 @@ err_redo: k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - ret = hash_redo_key(bch2_dirent_hash_desc, - h, c, iter, *k, hash); + ret = hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash); if (ret) bch_err(c, "hash_redo_key err %i", ret); else @@ -532,7 +535,7 @@ static int check_dirents(struct bch_fs *c) mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = bch2_btree_delete_at(iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) goto 
err; continue; @@ -541,7 +544,7 @@ static int check_dirents(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = check_dirent_hash(&h, c, iter, &k); + ret = check_dirent_hash(&trans, &h, iter, &k); if (ret > 0) { ret = 0; continue; @@ -623,9 +626,11 @@ static int check_dirents(struct bch_fs *c) bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &n->k_i)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &n->k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); kfree(n); if (ret) goto err; @@ -669,7 +674,7 @@ static int check_xattrs(struct bch_fs *c) if (fsck_err_on(!w.have_inode, c, "xattr for missing inode %llu", k.k->p.inode)) { - ret = bch2_btree_delete_at(iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) goto err; continue; @@ -678,7 +683,7 @@ static int check_xattrs(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k); + ret = hash_check_key(bch2_xattr_hash_desc, &trans, &h, iter, k); if (ret) goto fsck_err; } @@ -1163,12 +1168,13 @@ fsck_err: return ret; } -static int check_inode(struct bch_fs *c, +static int check_inode(struct btree_trans *trans, struct bch_inode_unpacked *lostfound_inode, struct btree_iter *iter, struct bkey_s_c_inode inode, struct nlink *link) { + struct bch_fs *c = trans->c; struct bch_inode_unpacked u; bool do_update = false; int ret = 0; @@ -1259,10 +1265,10 @@ static int check_inode(struct bch_fs *c, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); if (ret && ret != -EINTR) bch_err(c, "error in fs gc: error %i " "updating inode", ret); @@ -1277,25 +1283,29 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, nlink_table *links, u64 range_start, u64 range_end) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct nlink *link, zero_links = { 0, 0 }; struct genradix_iter nlinks_iter; int ret = 0, ret2 = 0; u64 nlinks_pos; - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, + POS(range_start, 0), 0); nlinks_iter = genradix_iter_init(links, 0); - while ((k = bch2_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret2 = btree_iter_err(k))) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - if (!link && (!k.k || iter.pos.inode >= range_end)) + if (!link && (!k.k || iter->pos.inode >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter.pos.inode > nlinks_pos) { + if (iter->pos.inode > nlinks_pos) { /* Should have been caught by dirents pass: */ need_fsck_err_on(link && link->count, c, "missing inode %llu (nlink %u)", @@ -1304,7 +1314,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); goto peek_nlinks; } - if (iter.pos.inode < nlinks_pos || !link) + if (iter->pos.inode < nlinks_pos || !link) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { @@ -1312,9 +1322,9 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); * Avoid 
potential deadlocks with iter for * truncate/rm/etc.: */ - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(iter); - ret = check_inode(c, lostfound_inode, &iter, + ret = check_inode(&trans, lostfound_inode, iter, bkey_s_c_to_inode(k), link); BUG_ON(ret == -EINTR); if (ret) @@ -1326,14 +1336,15 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); nlinks_pos, link->count); } - if (nlinks_pos == iter.pos.inode) + if (nlinks_pos == iter->pos.inode) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_next(&iter); - bch2_btree_iter_cond_resched(&iter); + bch2_btree_iter_next(iter); + bch2_btree_iter_cond_resched(iter); } fsck_err: - ret2 = bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); + if (ret2) bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); @@ -1379,12 +1390,18 @@ static int check_inode_nlinks(struct bch_fs *c, noinline_for_stack static int check_inodes_fast(struct bch_fs *c) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_inode inode; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, + POS_MIN, 0); + + for_each_btree_key_continue(iter, 0, k) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1394,14 +1411,19 @@ static int check_inodes_fast(struct bch_fs *c) (BCH_INODE_I_SIZE_DIRTY| BCH_INODE_I_SECTORS_DIRTY| BCH_INODE_UNLINKED)) { - ret = check_inode(c, NULL, &iter, inode, NULL); + ret = check_inode(&trans, NULL, iter, inode, NULL); BUG_ON(ret == -EINTR); if (ret) break; } } - return bch2_btree_iter_unlock(&iter) ?: ret; + if (!ret) + ret = bch2_btree_iter_unlock(iter); + + bch2_trans_exit(&trans); + + return ret; } /* diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 6acb487312a8..811c917cba84 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -367,7 +367,8 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); @@ -390,17 +391,17 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) if (ret) return ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); u32 bi_generation = 0; ret = btree_iter_err(k); - if (ret) { - bch2_btree_iter_unlock(&iter); - return ret; - } + if (ret) + break; bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, "inode %llu not found when deleting", @@ -431,13 +432,15 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete.k_i)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &delete.k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); } while (ret == -EINTR); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 
092500591b8f..f4c49bf82456 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -294,36 +294,43 @@ static void bch2_write_done(struct closure *cl) int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter *iter; struct keylist *keys = &op->insert_keys; - struct btree_iter iter; int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); + BUG_ON(bch2_keylist_empty(keys)); + bch2_verify_keylist_sorted(keys); + + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); do { BKEY_PADDED(k) split; bkey_copy(&split.k, bch2_keylist_front(keys)); - bch2_extent_trim_atomic(&split.k, &iter); + bch2_extent_trim_atomic(&split.k, iter); - ret = bch2_btree_insert_at(c, &op->res, - op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&iter, &split.k)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &split.k)); + + ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); if (ret) break; - if (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); else bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } @@ -1403,7 +1410,8 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_extent *e; BKEY_PADDED(k) new; @@ -1414,10 +1422,13 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) if (rbio->pick.crc.compression_type) return; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); retry: - k = bch2_btree_iter_peek(&iter); + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(iter); if (IS_ERR_OR_NULL(k.k)) goto out; @@ -1453,15 +1464,15 @@ retry: if (!bch2_extent_narrow_crcs(e, new_crc)) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT, - BTREE_INSERT_ENTRY(&iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT); if (ret == -EINTR) goto retry; out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static bool should_narrow_crcs(struct bkey_s_c k, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9997a2793e0a..d20672a37fd3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -825,6 +825,8 @@ fsck_err: static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) { + struct btree_trans trans; + struct btree_iter *iter; /* * We might cause compressed extents to be * split, so we need to pass in a @@ -833,20 +835,21 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); BKEY_PADDED(k) split; 
- struct btree_iter iter; int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); do { - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(iter); if (ret) break; bkey_copy(&split.k, k); - bch2_cut_front(iter.pos, &split.k); - bch2_extent_trim_atomic(&split.k, &iter); + bch2_cut_front(iter->pos, &split.k); + bch2_extent_trim_atomic(&split.k, iter); ret = bch2_disk_reservation_add(c, &disk_res, split.k.k.size * @@ -854,13 +857,13 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - ret = bch2_btree_insert_at(c, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY, - BTREE_INSERT_ENTRY(&iter, &split.k)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); } while ((!ret || ret == -EINTR) && - bkey_cmp(k->k.p, iter.pos)); + bkey_cmp(k->k.p, iter->pos)); bch2_disk_reservation_put(c, &disk_res); @@ -873,9 +876,9 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) * before journal replay finishes */ bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - gc_pos_btree_node(iter.l[0].b), + gc_pos_btree_node(iter->l[0].b), NULL, 0, 0); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index b97a5a8f3910..955831a50824 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -36,25 +36,29 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(key) tmp; - struct btree_iter iter; int ret = 0; + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_PREFETCH); + mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(&iter)).k && + while ((k = bch2_btree_iter_peek(iter)).k && !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; - bch2_btree_iter_next(&iter); + bch2_btree_iter_next(iter); continue; } @@ -72,12 +76,14 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); - iter.pos = bkey_start_pos(&tmp.key.k); + /* XXX not sketchy at all */ + iter->pos = bkey_start_pos(&tmp.key.k); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key)); - ret = bch2_btree_insert_at(c, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &tmp.key)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); /* * don't want to leave ret == -EINTR, since if we raced and @@ -90,11 +96,11 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } - bch2_btree_iter_unlock(&iter); - bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); + 
bch2_trans_exit(&trans); + return ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 5a35f76006cf..8c453ae31525 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -54,18 +54,21 @@ struct moving_context { static int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter *iter; struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; - struct btree_iter iter; int ret = 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); struct bkey_i_extent *insert, *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; @@ -74,10 +77,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bool did_work = false; int nr; - if (btree_iter_err(k)) { - ret = bch2_btree_iter_unlock(&iter); + ret = btree_iter_err(k); + if (ret) break; - } if (bversion_cmp(k.k->version, new->k.version) || !bkey_extent_is_data(k.k) || @@ -96,7 +98,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter.pos, &insert->k_i); + bch2_cut_front(iter->pos, &insert->k_i); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); @@ -138,12 +140,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) if (insert->k.size < k.k->size && bch2_extent_is_compressed(k) && nr > 0) { - /* - * can't call bch2_disk_reservation_add() with btree - * locks held, at least not without a song and dance - */ - bch2_btree_iter_unlock(&iter); - ret = bch2_disk_reservation_add(c, &op->res, keylist_sectors(keys) * nr, 0); if (ret) @@ -153,13 +149,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - ret = bch2_btree_insert_at(c, &op->res, + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &insert->k_i)); + + ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| - m->data_opts.btree_insert_flags, - BTREE_INSERT_ENTRY(&iter, &insert->k_i)); + m->data_opts.btree_insert_flags); if (!ret) atomic_long_inc(&c->extent_migrate_done); if (ret == -EINTR) @@ -167,25 +165,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) if (ret) break; next: - while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; } - bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); continue; nomatch: if (m->ctxt) - atomic64_add(k.k->p.offset - iter.pos.offset, + atomic64_add(k.k->p.offset - iter->pos.offset, &m->ctxt->stats->sectors_raced); atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); goto next; } out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 113a2ca88ffc..492ab73c39e7 100644 --- 
a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -708,7 +708,8 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct qc_dqblk *qdq) { struct bch_fs *c = sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_quota new_quota; int ret; @@ -719,9 +720,11 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); ret = btree_iter_err(k); if (unlikely(ret)) @@ -743,9 +746,11 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); - bch2_btree_iter_unlock(&iter); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index c5bce01bf34c..ffa7af0820ea 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -213,10 +213,10 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int __bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) { struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; @@ -267,17 +267,6 @@ found: return 0; } -static inline int bch2_hash_set(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, - struct bkey_i *insert, int flags) -{ - return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC, - __bch2_hash_set(&trans, desc, info, - inode, insert, flags)); -} - static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bcbe782260f0..652e22125dcf 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -28,57 +28,63 @@ static void delete_test_keys(struct bch_fs *c) static void test_delete(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_i_cookie k; int ret; bkey_cookie_init(&k.k_i); - bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &k.k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); pr_info("deleting once"); - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); BUG_ON(ret); pr_info("deleting 
twice"); - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); BUG_ON(ret); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void test_delete_written(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_i_cookie k; int ret; bkey_cookie_init(&k.k_i); - bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &k.k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); bch2_journal_flush_all_pins(&c->journal); - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); BUG_ON(ret); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void test_iterate(struct bch_fs *c, u64 nr) @@ -415,26 +421,29 @@ static void rand_mixed(struct bch_fs *c, u64 nr) u64 i; for (i = 0; i < nr; i++) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, - POS(0, test_rand()), 0); + bch2_trans_init(&trans, c); - k = bch2_btree_iter_peek(&iter); + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(iter); if (!(i & 3) && k.k) { struct bkey_i_cookie k; bkey_cookie_init(&k.k_i); - k.k.p = iter.pos; + k.k.p = iter->pos; - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &k.k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } } @@ -457,7 +466,8 @@ static void rand_delete(struct bch_fs *c, u64 nr) static void seq_insert(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_cookie insert; int ret; @@ -465,18 +475,22 @@ static void seq_insert(struct bch_fs *c, u64 nr) bkey_cookie_init(&insert.k_i); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { - insert.k.p = iter.pos; + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + insert.k.p = iter->pos; - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &insert.k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); if (++i == nr) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void seq_lookup(struct bch_fs *c, u64 nr) @@ -491,21 +505,26 @@ static void seq_lookup(struct bch_fs *c, u64 nr) static void seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_INTENT, k) { + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT); + + for_each_btree_key_continue(iter, 0, k) { struct bkey_i_cookie u; 
bkey_reassemble(&u.k_i, k); - ret = bch2_btree_insert_at(c, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &u.k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &u.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void seq_delete(struct bch_fs *c, u64 nr) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f31eec2f1fce..545e743972fb 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -180,7 +180,7 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum, memcpy(xattr->v.x_name, name, namelen); memcpy(xattr_val(&xattr->v), value, size); - ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, inum, &xattr->k_i, (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); -- cgit From 0dc17247f1964a1884de5b57cb9c712534ba1011 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Mar 2019 22:44:04 -0400 Subject: bcachefs: kill struct btree_insert Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 7 +++++ fs/bcachefs/btree_update.h | 15 +--------- fs/bcachefs/btree_update_leaf.c | 65 ++++++++++++++++++++--------------------- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/extents.c | 4 +-- fs/bcachefs/extents.h | 6 ++-- 7 files changed, 47 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 07d98caf204e..674a617a8018 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -289,6 +289,13 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + struct disk_reservation *disk_res; + + /* update path: */ + struct journal_res journal_res; + struct journal_preres journal_preres; + u64 *journal_seq; + unsigned flags; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 2bdb8b532aad..876b0e78f982 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -7,13 +7,12 @@ struct bch_fs; struct btree; -struct btree_insert; void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); -void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, +void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); void bch2_deferred_update_free(struct bch_fs *, @@ -21,18 +20,6 @@ void bch2_deferred_update_free(struct bch_fs *, struct deferred_update * bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); -struct btree_insert { - struct bch_fs *c; - struct disk_reservation *disk_res; - struct journal_res journal_res; - struct journal_preres journal_preres; - u64 *journal_seq; - unsigned flags; - - unsigned short nr; - struct btree_insert_entry *entries; -}; - #define BTREE_INSERT_ENTRY(_iter, _k) \ ((struct btree_insert_entry) { \ .iter = (_iter), \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d2f57b6b924d..d239aff7c13c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -18,8 +18,8 @@ #include -static bool btree_trans_relock(struct btree_insert *); -static void btree_trans_unlock(struct btree_insert *); +static bool 
btree_trans_relock(struct btree_trans *); +static void btree_trans_unlock(struct btree_trans *); /* Inserting into a given leaf node (last stage of insert): */ @@ -130,7 +130,7 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } -static inline void __btree_journal_key(struct btree_insert *trans, +static inline void __btree_journal_key(struct btree_trans *trans, enum btree_id btree_id, struct bkey_i *insert) { @@ -151,7 +151,7 @@ static inline void __btree_journal_key(struct btree_insert *trans, *trans->journal_seq = seq; } -void bch2_btree_journal_key(struct btree_insert *trans, +void bch2_btree_journal_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { @@ -185,7 +185,7 @@ void bch2_btree_journal_key(struct btree_insert *trans, set_btree_node_dirty(b); } -static void bch2_insert_fixup_key(struct btree_insert *trans, +static void bch2_insert_fixup_key(struct btree_trans *trans, struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; @@ -203,7 +203,7 @@ static void bch2_insert_fixup_key(struct btree_insert *trans, /** * btree_insert_key - insert a key one key into a leaf node */ -static void btree_insert_key_leaf(struct btree_insert *trans, +static void btree_insert_key_leaf(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; @@ -286,7 +286,7 @@ static void deferred_update_flush(struct journal *j, kfree(k); } -static void btree_insert_key_deferred(struct btree_insert *trans, +static void btree_insert_key_deferred(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; @@ -356,24 +356,24 @@ bch2_deferred_update_alloc(struct bch_fs *c, * We sort transaction entries so that if multiple iterators point to the same * leaf node they'll be adjacent: */ -static bool same_leaf_as_prev(struct btree_insert *trans, +static bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->entries && + return i != trans->updates && !i->deferred && i[0].iter->l[0].b == i[-1].iter->l[0].b; } #define __trans_next_entry(_trans, _i, _filter) \ ({ \ - while ((_i) < (_trans)->entries + (_trans->nr) && !(_filter)) \ + while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ (_i)++; \ \ - (_i) < (_trans)->entries + (_trans->nr); \ + (_i) < (_trans)->updates + (_trans->nr_updates); \ }) #define __trans_for_each_entry(_trans, _i, _filter) \ - for ((_i) = (_trans)->entries; \ + for ((_i) = (_trans)->updates; \ __trans_next_entry(_trans, _i, _filter); \ (_i)++) @@ -404,7 +404,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) +static void multi_lock_write(struct bch_fs *c, struct btree_trans *trans) { struct btree_insert_entry *i; @@ -412,7 +412,7 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); } -static void multi_unlock_write(struct btree_insert *trans) +static void multi_unlock_write(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -427,7 +427,7 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } -static bool btree_trans_relock(struct btree_insert *trans) +static bool btree_trans_relock(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -436,7 +436,7 @@ static 
bool btree_trans_relock(struct btree_insert *trans) return true; } -static void btree_trans_unlock(struct btree_insert *trans) +static void btree_trans_unlock(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -449,7 +449,7 @@ static void btree_trans_unlock(struct btree_insert *trans) /* Normal update interface: */ static enum btree_insert_ret -btree_key_can_insert(struct btree_insert *trans, +btree_key_can_insert(struct btree_trans *trans, struct btree_insert_entry *insert, unsigned *u64s) { @@ -477,7 +477,7 @@ btree_key_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } -static inline void do_btree_insert_one(struct btree_insert *trans, +static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_insert_entry *insert) { if (likely(!insert->deferred)) @@ -489,7 +489,7 @@ static inline void do_btree_insert_one(struct btree_insert *trans, /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_btree_insert_at(struct btree_insert *trans, +static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; @@ -631,7 +631,7 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ -static int __bch2_btree_insert_at(struct btree_insert *trans) +static int __bch2_btree_insert_at(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -639,17 +639,17 @@ static int __bch2_btree_insert_at(struct btree_insert *trans) unsigned flags, u64s = 0; int ret; - BUG_ON(!trans->nr); + BUG_ON(!trans->nr_updates); /* for the sake of sanity: */ - BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + BUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - bubble_sort(trans->entries, trans->nr, btree_trans_cmp); + bubble_sort(trans->updates, trans->nr_updates, btree_trans_cmp); trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); @@ -781,7 +781,7 @@ err: goto out; } - bch2_btree_iter_unlock(trans->entries[0].iter); + bch2_trans_unlock(trans); ret = -EINTR; trans_for_each_iter(trans, i) { @@ -830,21 +830,20 @@ int bch2_trans_commit(struct btree_trans *trans, u64 *journal_seq, unsigned flags) { - struct btree_insert insert = { - .c = trans->c, - .disk_res = disk_res, - .journal_seq = journal_seq, - .flags = flags, - .nr = trans->nr_updates, - .entries = trans->updates, - }; + int ret; if (!trans->nr_updates) return 0; + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + trans->flags = flags; + + ret = __bch2_btree_insert_at(trans); + trans->nr_updates = 0; - return __bch2_btree_insert_at(&insert); + return ret; } int bch2_btree_delete_at(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bddddcb93bc6..3744d55b8495 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -975,7 +975,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_mark_update(struct btree_insert *trans, +void bch2_mark_update(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 342def8cf603..fc2c212392b6 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -256,7 +256,7 @@ int 
bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); -void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); +void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 420a9a6c59e7..80531017b237 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -890,7 +890,7 @@ bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) } enum btree_insert_ret -bch2_extent_can_insert(struct btree_insert *trans, +bch2_extent_can_insert(struct btree_trans *trans, struct btree_insert_entry *insert, unsigned *u64s) { @@ -1164,7 +1164,7 @@ next: * If the end of iter->pos is not the same as the end of insert, then * key insertion needs to continue/be retried. */ -void bch2_insert_fixup_extent(struct btree_insert *trans, +void bch2_insert_fixup_extent(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c3d67cafc0c3..57a84971637a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -7,7 +7,7 @@ #include "extents_types.h" struct bch_fs; -struct btree_insert; +struct btree_trans; struct btree_insert_entry; /* extent entries: */ @@ -410,9 +410,9 @@ void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret -bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, +bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, unsigned *); -void bch2_insert_fixup_extent(struct btree_insert *, +void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, -- cgit From 9623ab27ab27e211e66aa1dd9a5ec90307160c8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Mar 2019 17:11:58 -0400 Subject: bcachefs: Btree update path cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update.h | 35 ++++ fs/bcachefs/btree_update_leaf.c | 424 ++++++++++++++++++++-------------------- 3 files changed, 246 insertions(+), 216 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 674a617a8018..2904239b7947 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -289,12 +289,12 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; - struct disk_reservation *disk_res; /* update path: */ struct journal_res journal_res; struct journal_preres journal_preres; u64 *journal_seq; + struct disk_reservation *disk_res; unsigned flags; struct btree_iter iters_onstack[2]; @@ -501,6 +501,7 @@ enum btree_insert_ret { BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, + BTREE_INSERT_NEED_JOURNAL_RES, }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 876b0e78f982..4d7cef75a017 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -128,4 +128,39 @@ int bch2_trans_commit(struct btree_trans *, _ret; \ }) +/* + * We sort transaction entries so that if multiple iterators point to the same + * leaf node they'll be adjacent: + */ 
+static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + !i->deferred && + i[0].iter->l[0].b == i[-1].iter->l[0].b; +} + +#define __trans_next_update(_trans, _i, _filter) \ +({ \ + while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ + (_i)++; \ + \ + (_i) < (_trans)->updates + (_trans->nr_updates); \ +}) + +#define __trans_for_each_update(_trans, _i, _filter) \ + for ((_i) = (_trans)->updates; \ + __trans_next_update(_trans, _i, _filter); \ + (_i)++) + +#define trans_for_each_update(trans, i) \ + __trans_for_each_update(trans, i, true) + +#define trans_for_each_update_iter(trans, i) \ + __trans_for_each_update(trans, i, !(i)->deferred) + +#define trans_for_each_update_leaf(trans, i) \ + __trans_for_each_update(trans, i, !(i)->deferred && \ + !same_leaf_as_prev(trans, i)) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d239aff7c13c..42fdb6c2963a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -18,8 +18,64 @@ #include -static bool btree_trans_relock(struct btree_trans *); -static void btree_trans_unlock(struct btree_trans *); +inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + bch2_btree_node_lock_write(b, iter); + + if (btree_node_just_written(b) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(c, b, iter); +} + +static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_leaf(trans, i) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); +} + +static void btree_trans_unlock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_leaf(trans, i) + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); +} + +static bool btree_trans_relock(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_iter(trans, i) + return bch2_btree_iter_relock(i->iter); + return true; +} + +static void btree_trans_unlock(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_iter(trans, i) { + bch2_btree_iter_unlock(i->iter); + break; + } +} + +static inline int btree_trans_cmp(struct btree_insert_entry l, + struct btree_insert_entry r) +{ + return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + btree_iter_cmp(l.iter, r.iter); +} /* Inserting into a given leaf node (last stage of insert): */ @@ -350,103 +406,86 @@ bch2_deferred_update_alloc(struct bch_fs *c, return d; } -/* struct btree_insert operations: */ +/* Normal update interface: */ -/* - * We sort transaction entries so that if multiple iterators point to the same - * leaf node they'll be adjacent: - */ -static bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) +static inline void btree_insert_entry_checks(struct bch_fs *c, + struct btree_insert_entry *i) { - return i != trans->updates && - !i->deferred && - i[0].iter->l[0].b == i[-1].iter->l[0].b; -} - -#define __trans_next_entry(_trans, _i, _filter) \ -({ \ - while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ - (_i)++; \ - \ - (_i) < (_trans)->updates + (_trans->nr_updates); \ -}) - 
-#define __trans_for_each_entry(_trans, _i, _filter) \ - for ((_i) = (_trans)->updates; \ - __trans_next_entry(_trans, _i, _filter); \ - (_i)++) + enum btree_id btree_id = !i->deferred + ? i->iter->btree_id + : i->d->btree_id; -#define trans_for_each_entry(trans, i) \ - __trans_for_each_entry(trans, i, true) + if (!i->deferred) { + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !bch2_extent_is_atomic(i->k, i->iter)); -#define trans_for_each_iter(trans, i) \ - __trans_for_each_entry(trans, i, !(i)->deferred) + bch2_btree_iter_verify_locks(i->iter); + } -#define trans_for_each_leaf(trans, i) \ - __trans_for_each_entry(trans, i, !(i)->deferred && \ - !same_leaf_as_prev(trans, i)) + BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); +} -inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static int bch2_trans_journal_preres_get(struct btree_trans *trans) { - bch2_btree_node_lock_write(b, iter); + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + unsigned u64s = 0; + int ret; - if (btree_node_just_written(b) && - bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); + trans_for_each_update(trans, i) + if (i->deferred) + u64s += jset_u64s(i->k->k.u64s); - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(c, b, iter); -} + if (!u64s) + return 0; -static void multi_lock_write(struct bch_fs *c, struct btree_trans *trans) -{ - struct btree_insert_entry *i; + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (ret != -EAGAIN) + return ret; - trans_for_each_leaf(trans, i) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); -} + btree_trans_unlock(trans); -static void multi_unlock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, 0); + if (ret) + return ret; - trans_for_each_leaf(trans, i) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); -} + if (!btree_trans_relock(trans)) { + trans_restart(" (iter relock after journal preres get blocked)"); + return -EINTR; + } -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) -{ - return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: - btree_iter_cmp(l.iter, r.iter); + return 0; } -static bool btree_trans_relock(struct btree_trans *trans) +static int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned u64s = 0; + int ret; - trans_for_each_iter(trans, i) - return bch2_btree_iter_relock(i->iter); - return true; -} + if (unlikely(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + return 0; -static void btree_trans_unlock(struct btree_trans *trans) -{ - struct btree_insert_entry *i; + if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + flags |= JOURNAL_RES_GET_RESERVED; - trans_for_each_iter(trans, i) { - bch2_btree_iter_unlock(i->iter); - break; - } -} + trans_for_each_update(trans, i) + u64s += jset_u64s(i->k->k.u64s); -/* Normal update interface: */ + ret = bch2_journal_res_get(&c->journal, &trans->journal_res, + u64s, flags); + + return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; +} static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, @@ -477,6 +516,29 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } +static int btree_trans_check_can_insert(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct btree_insert_entry *i; + unsigned u64s = 0; + int ret; + + trans_for_each_update_iter(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; + return ret; + } + } + + return 0; +} + static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_insert_entry *insert) { @@ -495,15 +557,12 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *linked; - unsigned u64s; int ret; -retry: - trans_for_each_iter(trans, i) - BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans_for_each_update_iter(trans, i) + BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - multi_lock_write(c, trans); + btree_trans_lock_write(c, trans); if (race_fault()) { ret = -EINTR; @@ -516,59 +575,24 @@ retry: * held, otherwise another thread could write the node changing the * amount of space available: */ - u64s = 0; - trans_for_each_iter(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); - if (ret) { - *stopped_at = i; - goto out; - } - } - - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - unsigned flags = (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - ? 
JOURNAL_RES_GET_RESERVED : 0; - - u64s = 0; - trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s); - - ret = bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - flags|JOURNAL_RES_GET_NONBLOCK); - if (likely(!ret)) - goto got_journal_res; - if (ret != -EAGAIN) - goto out; - - multi_unlock_write(trans); - btree_trans_unlock(trans); - - ret = bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - flags|JOURNAL_RES_GET_CHECK); - if (ret) - return ret; + ret = btree_trans_check_can_insert(trans, stopped_at); + if (ret) + goto out; - if (!btree_trans_relock(trans)) { - trans_restart(" (iter relock after journal res get blocked)"); - return -EINTR; - } + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + if (ret) + goto out; - goto retry; - } -got_journal_res: if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) - trans_for_each_entry(trans, i) + trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (inject_invalid_keys(c)) - trans_for_each_entry(trans, i) + trans_for_each_update(trans, i) i->k->k.version = MAX_VERSION; } @@ -578,7 +602,7 @@ got_journal_res: * have been traversed/locked, depending on what the caller was * doing: */ - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { for_each_btree_iter(i->iter, linked) if (linked->uptodate < BTREE_ITER_NEED_RELOCK) linked->flags |= BTREE_ITER_NOUNLOCK; @@ -586,40 +610,19 @@ got_journal_res: } } - trans_for_each_entry(trans, i) + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); out: BUG_ON(ret && (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); - multi_unlock_write(trans); + btree_trans_unlock_write(trans); bch2_journal_res_put(&c->journal, &trans->journal_res); return ret; } -static inline void btree_insert_entry_checks(struct bch_fs *c, - struct btree_insert_entry *i) -{ - enum btree_id btree_id = !i->deferred - ? 
i->iter->btree_id - : i->d->btree_id; - - if (!i->deferred) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !bch2_extent_is_atomic(i->k, i->iter)); - - bch2_btree_iter_verify_locks(i->iter); - } - - BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); -} - /** * __bch_btree_insert_at - insert keys at given iterator positions * @@ -631,60 +634,15 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ -static int __bch2_btree_insert_at(struct btree_trans *trans) +static int __bch2_trans_commit(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *linked; - unsigned flags, u64s = 0; + unsigned flags; int ret; - - BUG_ON(!trans->nr_updates); - - /* for the sake of sanity: */ - BUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - - bubble_sort(trans->updates, trans->nr_updates, btree_trans_cmp); - - trans_for_each_entry(trans, i) - btree_insert_entry_checks(c, i); - - trans_for_each_entry(trans, i) - if (i->deferred) - u64s += jset_u64s(i->k->k.u64s); - - if (u64s) { - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, - JOURNAL_RES_GET_NONBLOCK); - if (!ret) - goto got_journal_preres; - if (ret != -EAGAIN) - return ret; - - btree_trans_unlock(trans); - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, 0); - if (ret) - return ret; - - if (!btree_trans_relock(trans)) { - trans_restart(" (iter relock after journal preres get blocked)"); - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - return -EINTR; - } - } -got_journal_preres: - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) - return -EROFS; retry: - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { unsigned old_locks_want = i->iter->locks_want; unsigned old_uptodate = i->iter->uptodate; @@ -705,24 +663,19 @@ retry: if (unlikely(ret)) goto err; - trans_for_each_leaf(trans, i) + trans_for_each_update_leaf(trans, i) bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); - trans_for_each_iter(trans, i) + trans_for_each_update_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); - /* make sure we didn't drop or screw up locks: */ - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { bch2_btree_iter_verify_locks(i->iter); break; } - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { for_each_btree_iter(i->iter, linked) linked->flags &= ~BTREE_ITER_NOUNLOCK; break; @@ -784,12 +737,25 @@ err: bch2_trans_unlock(trans); ret = -EINTR; - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { int ret2 = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); if (ret2) ret = ret2; } break; + case BTREE_INSERT_NEED_JOURNAL_RES: + btree_trans_unlock(trans); + + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); + if (ret) + goto out; + + if (btree_trans_relock(trans)) + goto retry; + + trans_restart(" (iter relock after journal res get 
blocked)"); + ret = -EINTR; + break; default: BUG_ON(ret >= 0); break; @@ -801,7 +767,7 @@ err: goto out; } - trans_for_each_iter(trans, i) { + trans_for_each_update_iter(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; @@ -830,16 +796,44 @@ int bch2_trans_commit(struct btree_trans *trans, u64 *journal_seq, unsigned flags) { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; int ret; if (!trans->nr_updates) return 0; + /* for the sake of sanity: */ + BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->disk_res = disk_res; trans->journal_seq = journal_seq; trans->flags = flags; - ret = __bch2_btree_insert_at(trans); + bubble_sort(trans->updates, trans->nr_updates, btree_trans_cmp); + + trans_for_each_update(trans, i) + btree_insert_entry_checks(c, i); + + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + !percpu_ref_tryget(&c->writes))) + return -EROFS; + + ret = bch2_trans_journal_preres_get(trans); + if (ret) + goto err; + + ret = __bch2_trans_commit(trans); +err: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&c->writes); trans->nr_updates = 0; @@ -861,7 +855,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, } /** - * bch_btree_insert - insert keys into the extent btree + * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs * @id: btree to insert into * @insert_keys: list of keys to insert -- cgit From 4d8100daa9bb6c243cd39be0956005a76eec36ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Mar 2019 18:20:46 -0400 Subject: bcachefs: Allocate fs_usage in do_btree_insert_at() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 +- fs/bcachefs/btree_update_interior.c | 13 +++-- fs/bcachefs/btree_update_leaf.c | 44 ++++++++++++---- fs/bcachefs/buckets.c | 102 ++++++++++++++++++++++++++---------- fs/bcachefs/buckets.h | 14 +++-- fs/bcachefs/extents.c | 5 +- fs/bcachefs/replicas.c | 48 ++++++++++++----- fs/bcachefs/replicas.h | 2 + fs/bcachefs/super.c | 4 +- 9 files changed, 166 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0b495dd32f67..27ffecb912a3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -635,7 +635,10 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; struct bch_fs_usage __percpu *usage[2]; - struct bch_fs_usage __percpu *usage_scratch; + + /* single element mempool: */ + struct mutex usage_scratch_lock; + struct bch_fs_usage *usage_scratch; /* * When we invalidate buckets, we use both the priority and the amount diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7ccf2f935701..31c1474cd494 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1076,8 +1076,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); - preempt_disable(); - fs_usage = bch2_fs_usage_get_scratch(c); + fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, @@ -1090,7 +1089,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree 
*b) fs_usage); bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); - preempt_enable(); + bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); } @@ -1171,8 +1170,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); - preempt_disable(); - fs_usage = bch2_fs_usage_get_scratch(c); + fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), true, 0, @@ -1193,7 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); - preempt_enable(); + bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); @@ -1987,7 +1985,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_get_scratch(c); + fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, @@ -1998,6 +1996,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage); bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 42fdb6c2963a..5349790547f4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -269,8 +269,6 @@ static void btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - bch2_mark_update(trans, insert); - if (!btree_node_is_extents(b)) bch2_insert_fixup_key(trans, insert); else @@ -499,11 +497,6 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; - if (!bch2_bkey_replicas_marked(c, - bkey_i_to_s_c(insert->k), - true)) - return BTREE_INSERT_NEED_MARK_REPLICAS; - ret = !btree_node_is_extents(b) ? 
BTREE_INSERT_OK : bch2_extent_can_insert(trans, insert, u64s); @@ -555,6 +548,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; struct btree_iter *linked; int ret; @@ -562,12 +556,29 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + trans_for_each_update_iter(trans, i) { + if (i->deferred || + !btree_node_type_needs_gc(i->iter->btree_id)) + continue; + + if (!fs_usage) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + } + + if (!bch2_bkey_replicas_marked_locked(c, + bkey_i_to_s_c(i->k), true)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto out; + } + } + btree_trans_lock_write(c, trans); if (race_fault()) { ret = -EINTR; trans_restart(" (race)"); - goto out; + goto out_unlock; } /* @@ -577,7 +588,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ ret = btree_trans_check_can_insert(trans, stopped_at); if (ret) - goto out; + goto out_unlock; /* * Don't get journal reservation until after we know insert will @@ -585,7 +596,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); if (ret) - goto out; + goto out_unlock; if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) @@ -610,14 +621,25 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } } + trans_for_each_update_iter(trans, i) + bch2_mark_update(trans, i, fs_usage); + if (fs_usage) + bch2_trans_fs_usage_apply(trans, fs_usage); + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -out: +out_unlock: BUG_ON(ret && (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); btree_trans_unlock_write(trans); +out: + if (fs_usage) { + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + } + bch2_journal_res_put(&c->journal, &trans->journal_res); return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3744d55b8495..2fbcd85d9e75 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -144,6 +144,37 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } +void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) +{ + if (fs_usage == c->usage_scratch) + mutex_unlock(&c->usage_scratch_lock); + else + kfree(fs_usage); +} + +struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) +{ + struct bch_fs_usage *ret; + unsigned bytes = fs_usage_u64s(c) * sizeof(u64); + + ret = kzalloc(bytes, GFP_NOWAIT); + if (ret) + return ret; + + if (mutex_trylock(&c->usage_scratch_lock)) + goto out_pool; + + ret = kzalloc(bytes, GFP_NOFS); + if (ret) + return ret; + + mutex_lock(&c->usage_scratch_lock); +out_pool: + ret = c->usage_scratch; + memset(ret, 0, bytes); + return ret; +} + struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { struct bch_dev_usage ret; @@ -906,31 +937,39 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, unsigned journal_seq, unsigned flags, bool gc) { + int ret = 0; + + preempt_disable(); + if (!fs_usage || gc) fs_usage = this_cpu_ptr(c->usage[gc]); switch (k.k->type) { case KEY_TYPE_alloc: - return bch2_mark_alloc(c, k, inserting, + ret = bch2_mark_alloc(c, k, inserting, fs_usage, journal_seq, flags, gc); + break; case 
KEY_TYPE_btree_ptr: - return bch2_mark_extent(c, k, inserting + ret = bch2_mark_extent(c, k, inserting ? c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, fs_usage, journal_seq, flags, gc); + break; case KEY_TYPE_extent: - return bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags, gc); + break; case KEY_TYPE_stripe: - return bch2_mark_stripe(c, k, inserting, + ret = bch2_mark_stripe(c, k, inserting, fs_usage, journal_seq, flags, gc); + break; case KEY_TYPE_inode: if (inserting) fs_usage->nr_inodes++; else fs_usage->nr_inodes--; - return 0; + break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -940,11 +979,13 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; - return 0; + break; } - default: - return 0; } + + preempt_enable(); + + return ret; } int bch2_mark_key_locked(struct bch_fs *c, @@ -976,25 +1017,19 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, } void bch2_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_insert_entry *insert, + struct bch_fs_usage *fs_usage) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct bch_fs_usage *fs_usage; struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - static int warned_disk_usage = 0; if (!btree_node_type_needs_gc(iter->btree_id)) return; - percpu_down_read(&c->mark_lock); - preempt_disable(); - fs_usage = bch2_fs_usage_get_scratch(c); - if (!(trans->flags & BTREE_INSERT_NOMARK)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - @@ -1047,16 +1082,32 @@ void bch2_mark_update(struct btree_trans *trans, bch2_btree_node_iter_advance(&node_iter, b); } +} - if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) && - !warned_disk_usage && - !xchg(&warned_disk_usage, 1)) { - char buf[200]; +void bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct bch_fs_usage *fs_usage) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + static int warned_disk_usage = 0; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + char buf[200]; + + if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res) || + warned_disk_usage || + xchg(&warned_disk_usage, 1)) + return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + + trans_for_each_update_iter(trans, i) { + struct btree_iter *iter = i->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; pr_err("while inserting"); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k)); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); pr_err("%s", buf); pr_err("overlapping with"); @@ -1069,8 +1120,8 @@ void bch2_mark_update(struct btree_trans *trans, k = bkey_disassemble(b, _k, &unpacked); if (btree_node_is_extents(b) - ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k->k.p, k.k->p)) + ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(i->k->k.p, k.k->p)) break; bch2_bkey_val_to_text(&PBUF(buf), c, k); @@ -1079,9 +1130,6 @@ void bch2_mark_update(struct btree_trans *trans, bch2_btree_node_iter_advance(&node_iter, b); } } - - preempt_enable(); - percpu_up_read(&c->mark_lock); } /* Disk reservations: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index fc2c212392b6..e34c9d24dc38 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -219,13 +219,8 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } -static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) -{ - struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch); - - memset(ret, 0, fs_usage_u64s(c) * sizeof(u64)); - return ret; -} +void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); +struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); @@ -256,10 +251,13 @@ int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); -void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); +void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, + struct bch_fs_usage *); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); + /* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 80531017b237..194b8d6da1bb 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1190,11 +1190,12 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (s.deleting) tmp.k.k.type = KEY_TYPE_discard; - +#if 0 + /* disabled due to lock recursion - mark_lock: */ if (debug_check_bkeys(c)) bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&tmp.k)); - +#endif EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); extent_bset_insert(c, iter, &tmp.k); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 72592df9afc0..b66217989b71 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -207,22 +207,29 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -bool bch2_replicas_marked(struct bch_fs *c, +static bool bch2_replicas_marked_locked(struct bch_fs *c, struct bch_replicas_entry *search, bool check_gc_replicas) { - bool marked; - if (!search->nr_devs) return true; verify_replicas_entry_sorted(search); - percpu_down_read(&c->mark_lock); - marked = __replicas_has_entry(&c->replicas, search) && + return __replicas_has_entry(&c->replicas, search) && (!check_gc_replicas || likely((!c->replicas_gc.entries)) || __replicas_has_entry(&c->replicas_gc, search)); +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search, + bool check_gc_replicas) +{ + bool marked; + + percpu_down_read(&c->mark_lock); + marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); percpu_up_read(&c->mark_lock); return marked; @@ -263,7 +270,7 @@ static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; - struct bch_fs_usage __percpu *new_scratch = NULL; + struct bch_fs_usage *new_scratch = NULL; unsigned bytes = sizeof(struct 
bch_fs_usage) + sizeof(u64) * new_r->nr; int ret = -ENOMEM; @@ -273,8 +280,7 @@ static int replicas_table_update(struct bch_fs *c, (c->usage[1] && !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))) || - !(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO))) + !(new_scratch = kmalloc(bytes, GFP_NOIO))) goto err; if (c->usage[0]) @@ -290,7 +296,7 @@ static int replicas_table_update(struct bch_fs *c, swap(c->replicas, *new_r); ret = 0; err: - free_percpu(new_scratch); + kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); return ret; @@ -390,9 +396,9 @@ int bch2_mark_replicas(struct bch_fs *c, : bch2_mark_replicas_slowpath(c, r); } -bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) +bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, + struct bkey_s_c k, + bool check_gc_replicas) { struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); @@ -401,13 +407,27 @@ bool bch2_bkey_replicas_marked(struct bch_fs *c, for (i = 0; i < cached.nr; i++) { bch2_replicas_entry_cached(&search.e, cached.devs[i]); - if (!bch2_replicas_marked(c, &search.e, check_gc_replicas)) + if (!bch2_replicas_marked_locked(c, &search.e, + check_gc_replicas)) return false; } bkey_to_replicas(&search.e, k); - return bch2_replicas_marked(c, &search.e, check_gc_replicas); + return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); +} + +bool bch2_bkey_replicas_marked(struct bch_fs *c, + struct bkey_s_c k, + bool check_gc_replicas) +{ + bool marked; + + percpu_down_read(&c->mark_lock); + marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); + percpu_up_read(&c->mark_lock); + + return marked; } int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index d1457c786bb5..0777e7056d55 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -26,6 +26,8 @@ bool bch2_replicas_marked(struct bch_fs *, int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); +bool bch2_bkey_replicas_marked_locked(struct bch_fs *, + struct bkey_s_c, bool); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4f627e91f041..b1eb70556f75 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -404,7 +404,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->mark_lock); - free_percpu(c->usage_scratch); + kfree(c->usage_scratch); free_percpu(c->usage[0]); free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); @@ -572,6 +572,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_reserve_cache_lock); mutex_init(&c->btree_interior_update_lock); + mutex_init(&c->usage_scratch_lock); + mutex_init(&c->bio_bounce_pages_lock); bio_list_init(&c->btree_write_error_list); -- cgit From 7c648fe8991a267886b262e8988546ec87ad926e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Mar 2019 16:49:59 -0400 Subject: bcachefs: Fix a deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 955831a50824..38bf75b6bc2d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -96,11 +96,11 @@ 
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } + bch2_trans_exit(&trans); + bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); - bch2_trans_exit(&trans); - return ret; } -- cgit From 49a67206e4e481a097a3586fbd88ce0deb646307 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Mar 2019 13:42:10 -0400 Subject: bcachefs: Add more time stats for being blocked on allocator Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 21 +++++++++++++++++++++ fs/bcachefs/bcachefs.h | 8 ++++++-- fs/bcachefs/super.c | 2 +- 3 files changed, 28 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ba0640e3f981..334bc6576b3a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -246,6 +246,10 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { if (cl) closure_wait(&c->open_buckets_wait, cl); + + if (!c->blocked_allocate_open_bucket) + c->blocked_allocate_open_bucket = local_clock(); + spin_unlock(&c->freelist_lock); trace_open_bucket_alloc_fail(ca, reserve); return ERR_PTR(-OPEN_BUCKETS_EMPTY); @@ -276,6 +280,9 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, if (cl) closure_wait(&c->freelist_wait, cl); + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + spin_unlock(&c->freelist_lock); trace_bucket_alloc_fail(ca, reserve); @@ -301,6 +308,20 @@ out: bucket_io_clock_reset(c, ca, bucket, WRITE); spin_unlock(&ob->lock); + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], + c->blocked_allocate_open_bucket); + c->blocked_allocate_open_bucket = 0; + } + + if (c->blocked_allocate) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate], + c->blocked_allocate); + c->blocked_allocate = 0; + } + spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 27ffecb912a3..8bf1bfb7ec39 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -302,8 +302,10 @@ do { \ x(data_promote) \ x(journal_write) \ x(journal_delay) \ - x(journal_blocked) \ - x(journal_flush_seq) + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ + x(blocked_allocate_open_bucket) enum bch_time_stats { #define x(name) BCH_TIME_##name, @@ -653,6 +655,8 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b1eb70556f75..e8242bb70b93 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -596,7 +596,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.write_time = &c->times[BCH_TIME_journal_write]; c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; - c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); -- cgit From 1a470560c9ab8d6dd13363d5a6bae38d2c3d5261 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Mar 2019 16:18:39 -0400 Subject: bcachefs: 
BTREE_INSERT_ATOMIC must be used for extents now Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5349790547f4..d940fdf01478 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -406,9 +406,10 @@ bch2_deferred_update_alloc(struct bch_fs *c, /* Normal update interface: */ -static inline void btree_insert_entry_checks(struct bch_fs *c, +static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { + struct bch_fs *c = trans->c; enum btree_id btree_id = !i->deferred ? i->iter->btree_id : i->d->btree_id; @@ -419,6 +420,9 @@ static inline void btree_insert_entry_checks(struct bch_fs *c, EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && !bch2_extent_is_atomic(i->k, i->iter)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); + bch2_btree_iter_verify_locks(i->iter); } @@ -840,7 +844,7 @@ int bch2_trans_commit(struct btree_trans *trans, bubble_sort(trans->updates, trans->nr_updates, btree_trans_cmp); trans_for_each_update(trans, i) - btree_insert_entry_checks(c, i); + btree_insert_entry_checks(trans, i); if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) @@ -954,7 +958,10 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &delete)); ret = bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + ret = 0; if (ret) break; -- cgit From dc3b63dc33e1b651441275f15ea4a21c9078d583 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 16:28:57 -0400 Subject: bcachefs: Add time stats for btree updates Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 7 ++++--- fs/bcachefs/btree_io.c | 8 +++++--- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 3 ++- fs/bcachefs/btree_update_leaf.c | 13 +++++++++++-- 5 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8bf1bfb7ec39..ea648753aec0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -290,10 +290,11 @@ do { \ #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ + x(btree_node_split) \ + x(btree_node_sort) \ + x(btree_node_read) \ x(btree_gc) \ - x(btree_split) \ - x(btree_sort) \ - x(btree_read) \ + x(btree_update) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ x(btree_lock_contended_write) \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6f1b1e4317a0..d785e6ac22f7 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); /* Make sure we preserve bset journal_seq: */ @@ -403,7 +403,8 @@ void bch2_btree_sort_into(struct bch_fs *c, &dst->format, true); - bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); set_btree_bset_end(dst, dst->set); @@ -988,7 +989,8 @@ start: } } - 
bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); bio_put(&rb->bio); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 2904239b7947..bcc14e40cf5e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -274,6 +274,7 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; size_t nr_restarts; + u64 commit_start; u64 iters_live; u64 iters_linked; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 31c1474cd494..8b96faf107f8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1440,7 +1440,8 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_iter_verify_locks(iter); - bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + start_time); } static void diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d940fdf01478..14e6a8d19df5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -824,10 +824,10 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - int ret; + int ret = 0; if (!trans->nr_updates) - return 0; + goto out; /* for the sake of sanity: */ BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); @@ -850,6 +850,9 @@ int bch2_trans_commit(struct btree_trans *trans, !percpu_ref_tryget(&c->writes))) return -EROFS; + if (!trans->commit_start) + trans->commit_start = local_clock(); + ret = bch2_trans_journal_preres_get(trans); if (ret) goto err; @@ -860,6 +863,12 @@ err: if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); +out: + if (!ret && trans->commit_start) { + bch2_time_stats_update(&c->times[BCH_TIME_btree_update], + trans->commit_start); + trans->commit_start = 0; + } trans->nr_updates = 0; -- cgit From 082f0801e0c96a4dbea67998b63c29c18da2419f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 17:04:50 -0400 Subject: bcachefs: Fix error handling in bch2_fs_recovery() Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fdc64e199f8b..9610b488fbdf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -271,11 +271,13 @@ int bch2_fs_recovery(struct bch_fs *c) continue; err = "invalid btree root pointer"; + ret = -1; if (r->error) goto err; err = "error reading btree root"; - if (bch2_btree_root_read(c, i, &r->key, r->level)) { + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { if (i != BTREE_ID_ALLOC) goto err; -- cgit From 05235e99281533a70abee7f86181d3963f317913 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 19:02:48 -0400 Subject: bcachefs: Run gc if failed to read alloc btree Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9610b488fbdf..24101023a5b9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -187,6 +187,8 @@ int bch2_fs_recovery(struct bch_fs *c) LIST_HEAD(journal); struct jset *j = NULL; unsigned i; + 
bool run_gc = c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); int ret; mutex_lock(&c->sb_lock); @@ -282,6 +284,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; mustfix_fsck_err(c, "error reading btree root"); + run_gc = true; } } @@ -302,8 +305,7 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || - c->opts.fsck) { + if (run_gc) { bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_gc(c, &journal, true); -- cgit From 6122ab639c8e3d6afe9a3c8e3e49173cd2a064a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 19:03:57 -0400 Subject: bcachefs: More debug params for testing of recovery paths Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 +++++- fs/bcachefs/btree_gc.c | 4 +++- fs/bcachefs/recovery.c | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ea648753aec0..5a9b776558f6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -278,7 +278,11 @@ do { \ "cached data") \ BCH_DEBUG_PARAM(force_reconstruct_read, \ "Force reads to use the reconstruct path, when reading" \ - "from erasure coded extents") + "from erasure coded extents") \ + BCH_DEBUG_PARAM(test_restart_gc, \ + "Test restarting mark and sweep gc when bucket gens change")\ + BCH_DEBUG_PARAM(test_reconstruct_alloc, \ + "Test reconstructing the alloc btree") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c899a77bf891..0069d6cb6e5d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -750,7 +750,9 @@ again: c->gc_count++; out: - if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) { + if (!ret && + (test_bit(BCH_FS_FIXED_GENS, &c->flags) || + (!iter && test_restart_gc(c)))) { /* * XXX: make sure gens we fixed got saved */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 24101023a5b9..d7be535f3cc1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -277,6 +277,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (r->error) goto err; + if (i == BTREE_ID_ALLOC && + test_reconstruct_alloc(c)) + continue; + err = "error reading btree root"; ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { -- cgit From 8b2b9d11b9d0aea6401546780e84adcf51e27ba4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 19:12:52 -0400 Subject: bcachefs: Fix error handling in gc Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0069d6cb6e5d..302793d84b92 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -261,15 +261,14 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; mutex_lock(&c->btree_root_lock); - b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), - &max_stale, initial); + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); - mutex_unlock(&c->btree_root_lock); - return 0; + + return ret; } static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -- cgit From 11e6f19a30f65b854ba2fd3e142b3247150efe0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 
2019 21:12:01 -0400 Subject: bcachefs: Rework error handling in btree update path Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 187 +++++++++++++++++++++------------------- 1 file changed, 98 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 14e6a8d19df5..503cbc5ae309 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -649,69 +649,13 @@ out: return ret; } -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static int __bch2_trans_commit(struct btree_trans *trans) +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, + struct btree_insert_entry *i, + int ret) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_iter *linked; - unsigned flags; - int ret; -retry: - trans_for_each_update_iter(trans, i) { - unsigned old_locks_want = i->iter->locks_want; - unsigned old_uptodate = i->iter->uptodate; - - if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { - trans_restart(" (failed upgrade, locks_want %u uptodate %u)", - old_locks_want, old_uptodate); - ret = -EINTR; - goto err; - } - - if (i->iter->flags & BTREE_ITER_ERROR) { - ret = -EIO; - goto err; - } - } - - ret = do_btree_insert_at(trans, &i); - if (unlikely(ret)) - goto err; - - trans_for_each_update_leaf(trans, i) - bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); - - trans_for_each_update_iter(trans, i) - bch2_btree_iter_downgrade(i->iter); -out: - /* make sure we didn't drop or screw up locks: */ - trans_for_each_update_iter(trans, i) { - bch2_btree_iter_verify_locks(i->iter); - break; - } - - trans_for_each_update_iter(trans, i) { - for_each_btree_iter(i->iter, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; - break; - } - - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - - return ret; -err: - flags = trans->flags; + unsigned flags = trans->flags; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -755,29 +699,29 @@ err: ret = -ENOSPC; break; case BTREE_INSERT_NEED_MARK_REPLICAS: - if (flags & BTREE_INSERT_NOUNLOCK) { - ret = -EINTR; - goto out; - } - bch2_trans_unlock(trans); - ret = -EINTR; trans_for_each_update_iter(trans, i) { - int ret2 = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); - if (ret2) - ret = ret2; + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); + if (ret) + return ret; } + + if (btree_trans_relock(trans)) + return 0; + + trans_restart(" (iter relock after marking replicas)"); + ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: btree_trans_unlock(trans); ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) - goto out; + return ret; if (btree_trans_relock(trans)) - goto retry; + return 0; trans_restart(" (iter relock after journal res get blocked)"); ret = -EINTR; @@ -788,17 +732,11 @@ err: } if (ret == -EINTR) { - if (flags & BTREE_INSERT_NOUNLOCK) { - trans_restart(" (can't unlock)"); - goto out; - } - trans_for_each_update_iter(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { - ret = ret2; trans_restart(" (traverse)"); - goto out; + return ret2; } BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); @@ -809,12 +747,73 @@ err: * dropped locks: */ 
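As an aside on the hunk above: the new bch2_trans_commit_error() path keeps reusing one shape — try to take a resource non-blocking while the btree node locks are held, and if that would block, drop the locks, acquire the resource blocking, then relock and retry, restarting the whole transaction with -EINTR when the relock fails. Below is a minimal standalone C sketch of that shape only, not bcachefs code; every name in it (commit_with_retry, acquire_nonblock, acquire_blocking, relock, node_lock, resource_available) is hypothetical.

/*
 * Standalone sketch: "nonblocking attempt under lock, else unlock,
 * block, relock, retry" — the shape used by the commit error path.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static int resource_available;		/* stands in for e.g. journal space */

static int acquire_nonblock(void)	/* -EAGAIN means "would block" */
{
	return resource_available ? 0 : -EAGAIN;
}

static int acquire_blocking(void)	/* pretend this may sleep */
{
	resource_available = 1;
	return 0;
}

static bool relock(void)		/* false would force a restart */
{
	pthread_mutex_lock(&node_lock);
	return true;
}

static int commit_with_retry(void)
{
	int ret;

	pthread_mutex_lock(&node_lock);
retry:
	ret = acquire_nonblock();
	if (ret == -EAGAIN) {
		/* can't sleep while holding node_lock: drop it first */
		pthread_mutex_unlock(&node_lock);

		ret = acquire_blocking();
		if (ret)
			return ret;

		/* if relocking fails, the caller restarts the transaction */
		if (!relock())
			return -EINTR;
		goto retry;
	}

	/* ... the actual insert would happen here ... */
	pthread_mutex_unlock(&node_lock);
	return ret;
}

int main(void)
{
	printf("commit: %d\n", commit_with_retry());
	return 0;
}

Presumably the non-blocking first attempt exists so the code never sleeps on journal space or replica marking while holding btree node locks; once those locks are dropped the iterators may no longer be valid, which is why a failed relock has to bubble up as -EINTR and restart the transaction (matching the " (iter relock after journal res get blocked)" trans_restart message in the patch).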
if (!(flags & BTREE_INSERT_ATOMIC)) - goto retry; + return 0; trans_restart(" (atomic)"); } - goto out; + return ret; +} + +/** + * __bch_btree_insert_at - insert keys at given iterator positions + * + * This is main entry point for btree updates. + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static int __bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_iter *linked; + int ret; + + trans_for_each_update_iter(trans, i) { + unsigned old_locks_want = i->iter->locks_want; + unsigned old_uptodate = i->iter->uptodate; + + if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { + trans_restart(" (failed upgrade, locks_want %u uptodate %u)", + old_locks_want, old_uptodate); + ret = -EINTR; + goto err; + } + + if (i->iter->flags & BTREE_ITER_ERROR) { + ret = -EIO; + goto err; + } + } + + ret = do_btree_insert_at(trans, stopped_at); + if (unlikely(ret)) + goto err; + + trans_for_each_update_leaf(trans, i) + bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + + trans_for_each_update_iter(trans, i) + bch2_btree_iter_downgrade(i->iter); +err: + /* make sure we didn't drop or screw up locks: */ + trans_for_each_update_iter(trans, i) { + bch2_btree_iter_verify_locks(i->iter); + break; + } + + trans_for_each_update_iter(trans, i) { + for_each_btree_iter(i->iter, linked) + linked->flags &= ~BTREE_ITER_NOUNLOCK; + break; + } + + return ret; } int bch2_trans_commit(struct btree_trans *trans, @@ -827,7 +826,7 @@ int bch2_trans_commit(struct btree_trans *trans, int ret = 0; if (!trans->nr_updates) - goto out; + goto out_noupdates; /* for the sake of sanity: */ BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); @@ -835,6 +834,9 @@ int bch2_trans_commit(struct btree_trans *trans, if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); + if (!trans->commit_start) + trans->commit_start = local_clock(); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->disk_res = disk_res; @@ -849,21 +851,20 @@ int bch2_trans_commit(struct btree_trans *trans, if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) return -EROFS; - - if (!trans->commit_start) - trans->commit_start = local_clock(); - +retry: ret = bch2_trans_journal_preres_get(trans); if (ret) goto err; - ret = __bch2_trans_commit(trans); -err: + ret = __bch2_trans_commit(trans, &i); + if (ret) + goto err; +out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); -out: +out_noupdates: if (!ret && trans->commit_start) { bch2_time_stats_update(&c->times[BCH_TIME_btree_update], trans->commit_start); @@ -872,7 +873,15 @@ out: trans->nr_updates = 0; + BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + return ret; +err: + ret = bch2_trans_commit_error(trans, i, ret); + if (!ret) + goto retry; + + goto out; } int bch2_btree_delete_at(struct btree_trans *trans, -- cgit From 65e7ab8f2aa7d45190f1e90b1332c71ef83587a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2019 04:32:29 -0400 Subject: bcachefs: Fix a deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 503cbc5ae309..a05fd7104a72 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -560,6 +560,8 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + btree_trans_lock_write(c, trans); + trans_for_each_update_iter(trans, i) { if (i->deferred || !btree_node_type_needs_gc(i->iter->btree_id)) @@ -577,12 +579,10 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } } - btree_trans_lock_write(c, trans); - if (race_fault()) { ret = -EINTR; trans_restart(" (race)"); - goto out_unlock; + goto out; } /* @@ -592,7 +592,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ ret = btree_trans_check_can_insert(trans, stopped_at); if (ret) - goto out_unlock; + goto out; /* * Don't get journal reservation until after we know insert will @@ -600,7 +600,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); if (ret) - goto out_unlock; + goto out; if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) @@ -632,13 +632,13 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -out_unlock: +out: BUG_ON(ret && (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); btree_trans_unlock_write(trans); -out: + if (fs_usage) { bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); -- cgit From 134915f3d38d830374603b84a9fe2e280f4814ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 22:19:57 -0400 Subject: bcachefs: Go rw lazily Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 32 ++--------- fs/bcachefs/bcachefs.h | 14 +---- fs/bcachefs/btree_update.h | 2 + fs/bcachefs/btree_update_leaf.c | 19 +++++- fs/bcachefs/fs.c | 10 ++-- fs/bcachefs/fsck.c | 16 ++++-- fs/bcachefs/journal.c | 2 - fs/bcachefs/journal_io.c | 2 + fs/bcachefs/recovery.c | 25 +++----- fs/bcachefs/super-io.c | 11 ++-- fs/bcachefs/super-io.h | 3 +- fs/bcachefs/super.c | 124 +++++++++++++++++++++++++++++----------- fs/bcachefs/super.h | 4 +- fs/bcachefs/sysfs.c | 4 +- 14 files changed, 153 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c11136506352..da25a1ed5206 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -345,6 +345,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); err: @@ -1626,7 +1627,7 @@ static bool bch2_fs_allocator_start_fast(struct bch_fs *c) return ret; } -static int __bch2_fs_allocator_start(struct bch_fs *c) +int bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_iter; @@ -1635,6 +1636,10 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) long bu; int ret = 0; + if (!test_alloc_startup(c) && + bch2_fs_allocator_start_fast(c)) + return 0; + pr_debug("not enough empty buckets; scanning for reclaimable buckets"); /* @@ -1709,31 +1714,6 @@ err: return ret; } -int bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - int ret; - - ret = 
bch2_fs_allocator_start_fast(c) ? 0 : - __bch2_fs_allocator_start(c); - if (ret) - return ret; - - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); - - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5a9b776558f6..5eae18e92bd5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -486,6 +486,7 @@ enum { BCH_FS_INITIAL_GC_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, + BCH_FS_RW, /* shutdown: */ BCH_FS_EMERGENCY_RO, @@ -510,13 +511,6 @@ struct btree_debug { struct dentry *failed; }; -enum bch_fs_state { - BCH_FS_STARTING = 0, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RW, -}; - struct bch_fs_pcpu { u64 sectors_available; }; @@ -538,7 +532,6 @@ struct bch_fs { /* ro/rw, add/remove devices: */ struct mutex state_lock; - enum bch_fs_state state; /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; @@ -800,11 +793,6 @@ static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) #endif } -static inline bool bch2_fs_running(struct bch_fs *c) -{ - return c->state == BCH_FS_RO || c->state == BCH_FS_RW; -} - static inline unsigned bucket_bytes(const struct bch_dev *ca) { return ca->mi.bucket_size << 9; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4d7cef75a017..879e7ae39586 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -38,6 +38,7 @@ enum { __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, @@ -64,6 +65,7 @@ enum { #define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) #define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a05fd7104a72..9c1ca9ad3ead 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -849,8 +849,23 @@ int bch2_trans_commit(struct btree_trans *trans, btree_insert_entry_checks(trans, i); if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) - return -EROFS; + !percpu_ref_tryget(&c->writes))) { + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; + + btree_trans_unlock(trans); + + ret = bch2_fs_read_write_early(c); + if (ret) + return ret; + + percpu_ref_get(&c->writes); + + if (!btree_trans_relock(trans)) { + ret = -EINTR; + goto err; + } + } retry: ret = bch2_trans_journal_preres_get(trans); if (ret) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index dc55d36ecfd5..2f01d97470b1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1616,7 +1616,7 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons mutex_lock(&c->state_lock); - if (!bch2_fs_running(c)) { + if (!test_bit(BCH_FS_STARTED, &c->flags)) { mutex_unlock(&c->state_lock); closure_put(&c->cl); pr_err("err mounting %s: incomplete filesystem", dev_name); @@ -1672,8 +1672,6 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) return ret; if (opts.read_only 
!= c->opts.read_only) { - const char *err = NULL; - mutex_lock(&c->state_lock); if (opts.read_only) { @@ -1681,9 +1679,9 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) sb->s_flags |= SB_RDONLY; } else { - err = bch2_fs_read_write(c); - if (err) { - bch_err(c, "error going rw: %s", err); + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); return -EINVAL; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2561773cd6dc..439f758d8178 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -174,7 +174,8 @@ static int hash_redo_key(const struct bch_hash_desc desc, bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, tmp, BCH_HASH_SET_MUST_CREATE); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); err: kfree(tmp); return ret; @@ -204,7 +205,8 @@ retry: ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); err: if (ret == -EINTR) goto retry; @@ -365,7 +367,9 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, buf, strlen(buf), d->v.d_name, len)) { bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); - ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) goto err; @@ -630,7 +634,8 @@ static int check_dirents(struct bch_fs *c) BTREE_INSERT_ENTRY(iter, &n->k_i)); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); kfree(n); if (ret) goto err; @@ -1268,7 +1273,8 @@ static int check_inode(struct btree_trans *trans, bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret && ret != -EINTR) bch_err(c, "error in fs gc: error %i " "updating inode", ret); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 64f9c5740ec8..c0dcc0ff65ce 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1027,8 +1027,6 @@ void bch2_fs_journal_start(struct journal *j) * only have to go down with the next journal entry we write: */ bch2_journal_seq_blacklist_write(j); - - queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); } /* init/exit: */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d20672a37fd3..1bb627c05188 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -861,6 +861,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) ret = bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY); } while ((!ret || ret == -EINTR) && bkey_cmp(k->k.p, iter->pos)); @@ -906,6 +907,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) ret = bch2_btree_insert(c, entry->btree_id, k, NULL, NULL, BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); break; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d7be535f3cc1..f7e3060428cf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -119,8 +119,13 @@ static int verify_superblock_clean(struct bch_fs *c, if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal 
(%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) - bch2_fs_mark_clean(c, false); + le64_to_cpu(j->seq))) { + ret = bch2_fs_mark_dirty(c); + if (ret) { + bch_err(c, "error going rw"); + return ret; + } + } mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, "superblock read clock doesn't match journal after clean shutdown"); @@ -331,13 +336,6 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.noreplay) goto out; - /* - * Mark dirty before journal replay, fsck: - * XXX: after a clean shutdown, this could be done lazily only when fsck - * finds an error - */ - bch2_fs_mark_clean(c, false); - /* * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - @@ -345,11 +343,6 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch2_fs_journal_start(&c->journal); - err = "error starting allocator"; - ret = bch2_fs_allocator_start(c); - if (ret) - goto err; - bch_verbose(c, "starting journal replay:"); err = "journal replay failed"; ret = bch2_journal_replay(c, &journal); @@ -436,8 +429,8 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal); bch2_journal_set_replay_done(&c->journal); - err = "error starting allocator"; - ret = bch2_fs_allocator_start(c); + err = "error going read write"; + ret = bch2_fs_read_write_early(c); if (ret) goto err; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c89fe5d630e4..dec6a737f44f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -886,7 +886,7 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); } -static void bch2_fs_mark_dirty(struct bch_fs *c) +int bch2_fs_mark_dirty(struct bch_fs *c) { mutex_lock(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb) || @@ -896,6 +896,8 @@ static void bch2_fs_mark_dirty(struct bch_fs *c) bch2_write_super(c); } mutex_unlock(&c->sb_lock); + + return 0; } struct jset_entry * @@ -997,17 +999,12 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, return entry; } -void bch2_fs_mark_clean(struct bch_fs *c, bool clean) +void bch2_fs_mark_clean(struct bch_fs *c) { struct bch_sb_field_clean *sb_clean; struct jset_entry *entry; unsigned u64s; - if (!clean) { - bch2_fs_mark_dirty(c); - return; - } - mutex_lock(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) goto out; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 498a9e887d4e..afc92d14c254 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -141,7 +141,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *, void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); -void bch2_fs_mark_clean(struct bch_fs *, bool); +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e8242bb70b93..5364b95cfec9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -258,8 +258,10 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { - if (c->state == BCH_FS_RO) + if (!test_bit(BCH_FS_RW, &c->flags)) { + cancel_delayed_work_sync(&c->journal.reclaim_work); return; + } BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); @@ -301,10 +303,9 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && 
test_bit(BCH_FS_STARTED, &c->flags)) - bch2_fs_mark_clean(c, true); + bch2_fs_mark_clean(c); - if (c->state != BCH_FS_STOPPING) - c->state = BCH_FS_RO; + clear_bit(BCH_FS_RW, &c->flags); } static void bch2_fs_read_only_work(struct work_struct *work) @@ -333,55 +334,106 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } -const char *bch2_fs_read_write(struct bch_fs *c) +static int bch2_fs_read_write_late(struct bch_fs *c) { struct bch_dev *ca; - const char *err = NULL; unsigned i; + int ret; - if (c->state == BCH_FS_RW) - return NULL; + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; + } + + for_each_rw_member(ca, c, i) { + ret = bch2_copygc_start(c, ca); + if (ret) { + bch_err(c, "error starting copygc threads"); + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + + schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + + return 0; +} + +static int __bch2_fs_read_write(struct bch_fs *c, bool early) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; - bch2_fs_mark_clean(c, false); + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); + if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + ret = bch2_fs_allocator_start(c); + if (ret) { + bch_err(c, "error initializing allocator"); goto err; } - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - err = "error starting btree GC thread"; - if (bch2_gc_thread_start(c)) - goto err; + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + } - err = "error starting copygc thread"; - for_each_rw_member(ca, c, i) - if (bch2_copygc_start(c, ca)) { + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { + bch_err(c, "error starting allocator threads"); percpu_ref_put(&ca->io_ref); goto err; } + } - err = "error starting rebalance thread"; - if (bch2_rebalance_start(c)) - goto err; + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) + goto err; + } - if (c->state != BCH_FS_STARTING) - percpu_ref_reinit(&c->writes); + percpu_ref_reinit(&c->writes); + set_bit(BCH_FS_RW, &c->flags); - c->state = BCH_FS_RW; - return NULL; + queue_delayed_work(c->journal_reclaim_wq, + &c->journal.reclaim_work, 0); + return 0; err: __bch2_fs_read_only(c); - return err; + return ret; +} + +int bch2_fs_read_write(struct bch_fs *c) +{ + return __bch2_fs_read_write(c, false); +} + +int bch2_fs_read_write_early(struct bch_fs *c) +{ + lockdep_assert_held(&c->state_lock); + + if (c->opts.read_only) + return -EROFS; + + return __bch2_fs_read_write(c, true); } /* Filesystem startup/shutdown: */ @@ -638,7 +690,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, sizeof(struct btree_reserve)) || 
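As a minimal sketch of the calling convention after the hunks above, assuming the bcachefs types shown in this series: bch2_fs_read_write() now returns an errno-style int instead of a const char * error string, and the full transition is split into an early phase (allocator, journal reclaim) and a late phase (gc, copygc, rebalance threads). The wrapper example_go_rw() below is hypothetical and not part of the patch; it only illustrates how a caller is expected to handle the new return value.

	/* Hedged sketch: hypothetical caller of the int-returning rw API. */
	static int example_go_rw(struct bch_fs *c)
	{
		int ret;

		mutex_lock(&c->state_lock);
		ret = bch2_fs_read_write(c);	/* runs both the early and late rw phases */
		mutex_unlock(&c->state_lock);

		if (ret)
			bch_err(c, "error going rw: %i", ret);
		return ret;
	}
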
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, @@ -712,7 +765,7 @@ const char *bch2_fs_start(struct bch_fs *c) mutex_lock(&c->state_lock); - BUG_ON(c->state != BCH_FS_STARTING); + BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); mutex_lock(&c->sb_lock); @@ -746,9 +799,12 @@ const char *bch2_fs_start(struct bch_fs *c) if (c->opts.read_only) { bch2_fs_read_only(c); } else { - err = bch2_fs_read_write(c); - if (err) + if (!test_bit(BCH_FS_RW, &c->flags) + ? bch2_fs_read_write(c) + : bch2_fs_read_write_late(c)) { + err = "error going read write"; goto err; + } } set_bit(BCH_FS_STARTED, &c->flags); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 3f730164ca69..91df0d729322 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -217,7 +217,9 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); -const char *bch2_fs_read_write(struct bch_fs *); + +int bch2_fs_read_write(struct bch_fs *); +int bch2_fs_read_write_early(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 361f7b7addcf..f1e269671374 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -289,7 +289,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; - if (!bch2_fs_running(c)) + if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) @@ -482,7 +482,7 @@ STORE(__bch2_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!bch2_fs_running(c)) + if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; /* Debugging: */ -- cgit From 03e183cb5d429a3bb53816d70da7c19f0745909e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2019 23:13:46 -0400 Subject: bcachefs: Verify fs hasn't been modified before going rw Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 1 + fs/bcachefs/recovery.c | 15 +++++---- fs/bcachefs/super-io.c | 77 ++++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/super-io.h | 2 +- fs/bcachefs/super.c | 4 ++- fs/bcachefs/super.h | 1 + fs/bcachefs/super_types.h | 1 + 8 files changed, 78 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5eae18e92bd5..a815d7a488a6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -390,6 +390,7 @@ struct bch_dev { char name[BDEVNAME_SIZE]; struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; int sb_write_error; struct bch_devs_mask self; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2f01d97470b1..2a5a90b2a781 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1682,6 +1682,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) ret = bch2_fs_read_write(c); if (ret) { bch_err(c, "error going rw: %i", ret); + mutex_unlock(&c->state_lock); return -EINVAL; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f7e3060428cf..93c4d5887e8b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -107,10 +107,11 @@ static int journal_replay_entry_early(struct bch_fs *c, } static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean *clean, + struct bch_sb_field_clean **cleanp, struct jset *j) { unsigned i; + struct bch_sb_field_clean *clean = *cleanp; int ret = 0; if (!clean || !j) @@ -120,11 +121,9 @@ static int 
verify_superblock_clean(struct bch_fs *c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), le64_to_cpu(j->seq))) { - ret = bch2_fs_mark_dirty(c); - if (ret) { - bch_err(c, "error going rw"); - return ret; - } + kfree(clean); + *cleanp = NULL; + return 0; } mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, @@ -236,7 +235,7 @@ int bch2_fs_recovery(struct bch_fs *c) BUG_ON(ret); } - ret = verify_superblock_clean(c, clean, j); + ret = verify_superblock_clean(c, &clean, j); if (ret) goto err; @@ -430,7 +429,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_journal_set_replay_done(&c->journal); err = "error going read write"; - ret = bch2_fs_read_write_early(c); + ret = __bch2_fs_read_write(c, true); if (ret) goto err; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index dec6a737f44f..f504743fff4d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -509,6 +509,8 @@ reread: if (bch2_crc_cmp(csum, sb->sb->csum)) return "bad checksum reading superblock"; + sb->seq = le64_to_cpu(sb->sb->seq); + return NULL; } @@ -642,6 +644,25 @@ static void write_super_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void read_back_super(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); + bio->bi_iter.bi_size = 4096; + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, ca->sb_read_scratch); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) { struct bch_sb *sb = ca->disk_sb.sb; @@ -669,7 +690,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) closure_bio_submit(bio, &c->sb_write); } -void bch2_write_super(struct bch_fs *c) +int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; @@ -677,6 +698,7 @@ void bch2_write_super(struct bch_fs *c) const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; + int ret = 0; lockdep_assert_held(&c->sb_lock); @@ -692,6 +714,7 @@ void bch2_write_super(struct bch_fs *c) err = bch2_sb_validate(&ca->disk_sb); if (err) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err); + ret = -1; goto out; } } @@ -705,10 +728,27 @@ void bch2_write_super(struct bch_fs *c) ca->sb_write_error = 0; } + for_each_online_member(ca, c, i) + read_back_super(c, ca); + closure_sync(cl); + + for_each_online_member(ca, c, i) { + if (!ca->sb_write_error && + ca->disk_sb.seq != + le64_to_cpu(ca->sb_read_scratch->seq)) { + bch2_fs_fatal_error(c, + "Superblock modified by another process"); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + } + do { wrote = false; for_each_online_member(ca, c, i) - if (sb < ca->disk_sb.sb->layout.nr_superblocks) { + if (!ca->sb_write_error && + sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); wrote = true; } @@ -716,9 +756,12 @@ void bch2_write_super(struct bch_fs *c) sb++; } while (wrote); - for_each_online_member(ca, c, i) + for_each_online_member(ca, c, i) { if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); + else + ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); 
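A condensed sketch of the modification check wired into bch2_write_super() above, assuming the fields this commit introduces (ca->sb_read_scratch and the seq cached in bch_sb_handle). example_sb_seq_check() is a hypothetical helper; the real code performs this comparison inline for every online member and also skips devices with sb_write_error set.

	/*
	 * Hedged sketch: before writing, each device's superblock has been
	 * read back into sb_read_scratch.  If its sequence number no longer
	 * matches the seq we last read or wrote, another process modified
	 * the filesystem and the write is refused.
	 */
	static int example_sb_seq_check(struct bch_fs *c, struct bch_dev *ca)
	{
		if (ca->disk_sb.seq != le64_to_cpu(ca->sb_read_scratch->seq)) {
			bch2_fs_fatal_error(c, "Superblock modified by another process");
			return -EROFS;
		}
		return 0;
	}
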
+ } nr_wrote = dev_mask_nr(&sb_written); @@ -741,13 +784,15 @@ void bch2_write_super(struct bch_fs *c) * written anything (new filesystem), we continue if we'd be able to * mount with the devices we did successfully write to: */ - bch2_fs_fatal_err_on(!nr_wrote || - (can_mount_without_written && - !can_mount_with_written), c, - "Unable to write superblock to sufficient devices"); + if (bch2_fs_fatal_err_on(!nr_wrote || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices")) + ret = -1; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + return ret; } /* BCH_SB_FIELD_journal: */ @@ -888,16 +933,20 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) int bch2_fs_mark_dirty(struct bch_fs *c) { + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb) || - (c->disk_sb.sb->compat[0] & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); - bch2_write_super(c); - } + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); - return 0; + return ret; } struct jset_entry * diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index afc92d14c254..31b8b8307ac3 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -89,7 +89,7 @@ int bch2_sb_realloc(struct bch_sb_handle *, unsigned); const char *bch2_sb_validate(struct bch_sb_handle *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -void bch2_write_super(struct bch_fs *); +int bch2_write_super(struct bch_fs *); /* BCH_SB_FIELD_journal: */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5364b95cfec9..dd1496af9a06 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -366,7 +366,7 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return 0; } -static int __bch2_fs_read_write(struct bch_fs *c, bool early) +int __bch2_fs_read_write(struct bch_fs *c, bool early) { struct bch_dev *ca; unsigned i; @@ -907,6 +907,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); + kfree(ca->sb_read_scratch); bch2_time_stats_exit(&ca->io_latency[WRITE]); bch2_time_stats_exit(&ca->io_latency[READ]); @@ -1017,6 +1018,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !(ca->sb_read_scratch = kmalloc(4096, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio), 0) || diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 91df0d729322..92ef3e7c8dc2 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -218,6 +218,7 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); +int __bch2_fs_read_write(struct bch_fs *, bool); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 04a15729a244..6d0168a73ee4 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -12,6 +12,7 @@ struct 
bch_sb_handle { unsigned have_layout:1; unsigned have_bio:1; unsigned fs_sb:1; + u64 seq; }; struct bch_devs_mask { -- cgit From 5df4be3f62c9bde73db801504b3db2693b28328c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2019 15:34:48 -0400 Subject: bcachefs: Btree iter improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 94 +++++++++++++++++++++++++++++------------ fs/bcachefs/btree_iter.h | 3 ++ fs/bcachefs/btree_types.h | 5 ++- fs/bcachefs/btree_update_leaf.c | 5 ++- 4 files changed, 76 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index db7ae19bd1cd..3d613e8cd55b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1671,8 +1671,8 @@ static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, { ssize_t idx = iter - trans->iters; - BUG_ON(idx < 0 || idx >= trans->nr_iters); - BUG_ON(!(trans->iters_live & (1ULL << idx))); + EBUG_ON(idx < 0 || idx >= trans->nr_iters); + EBUG_ON(!(trans->iters_linked & (1ULL << idx))); return idx; } @@ -1685,14 +1685,28 @@ void bch2_trans_iter_put(struct btree_trans *trans, trans->iters_live &= ~(1ULL << idx); } +static inline void __bch2_trans_iter_free(struct btree_trans *trans, + unsigned idx) +{ + trans->iters_linked &= ~(1ULL << idx); + trans->iters_live &= ~(1ULL << idx); + trans->iters_touched &= ~(1ULL << idx); + trans->iters_unlink_on_restart &= ~(1ULL << idx); + trans->iters_unlink_on_commit &= ~(1ULL << idx); + bch2_btree_iter_unlink(&trans->iters[idx]); +} + void bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { - ssize_t idx = btree_trans_iter_idx(trans, iter); + __bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter)); +} - trans->iters_live &= ~(1ULL << idx); - trans->iters_linked &= ~(1ULL << idx); - bch2_btree_iter_unlink(iter); +void bch2_trans_iter_free_on_commit(struct btree_trans *trans, + struct btree_iter *iter) +{ + trans->iters_unlink_on_commit |= + 1ULL << btree_trans_iter_idx(trans, iter); } static int btree_trans_realloc_iters(struct btree_trans *trans, @@ -1728,6 +1742,11 @@ success: memcpy(new_updates, trans->updates, sizeof(struct btree_insert_entry) * trans->nr_updates); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + memset(trans->iters, POISON_FREE, + sizeof(struct btree_iter) * trans->nr_iters + + sizeof(struct btree_insert_entry) * trans->nr_iters); + if (trans->iters != trans->iters_onstack) kfree(trans->iters); @@ -1763,7 +1782,7 @@ void bch2_trans_preload_iters(struct btree_trans *trans) } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, - unsigned btree_id, + unsigned btree_id, struct bpos pos, unsigned flags, u64 iter_id) { struct btree_iter *iter; @@ -1771,9 +1790,14 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - for (idx = 0; idx < trans->nr_iters; idx++) - if (trans->iters[idx].id == iter_id) + for (idx = 0; idx < trans->nr_iters; idx++) { + iter = &trans->iters[idx]; + if (iter_id + ? 
iter->id == iter_id + : (iter->btree_id == btree_id && + !bkey_cmp(iter->pos, pos))) goto found; + } idx = -1; found: if (idx < 0) { @@ -1804,8 +1828,10 @@ got_slot: iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); } + BUG_ON(iter->btree_id != btree_id); BUG_ON(trans->iters_live & (1ULL << idx)); - trans->iters_live |= 1ULL << idx; + trans->iters_live |= 1ULL << idx; + trans->iters_touched |= 1ULL << idx; if (trans->iters_linked && !(trans->iters_linked & (1 << idx))) @@ -1828,7 +1854,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, u64 iter_id) { struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, flags, iter_id); + __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); if (!IS_ERR(iter)) bch2_btree_iter_set_pos(iter, pos); @@ -1841,10 +1867,13 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, { struct btree_iter *iter = __btree_trans_get_iter(trans, src->btree_id, - src->flags, iter_id); + POS_MIN, src->flags, iter_id); - if (!IS_ERR(iter)) + if (!IS_ERR(iter)) { + trans->iters_unlink_on_restart |= + 1ULL << btree_trans_iter_idx(trans, iter); bch2_btree_iter_copy(iter, src); + } return iter; } @@ -1894,10 +1923,21 @@ int bch2_trans_unlock(struct btree_trans *trans) return ret; } +inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) +{ + iters &= trans->iters_linked; + + while (iters) { + unsigned idx = __ffs64(iters); + + iters &= ~(1ULL << idx); + __bch2_trans_iter_free(trans, idx); + } +} + void __bch2_trans_begin(struct btree_trans *trans) { - u64 linked_not_live; - unsigned idx; + u64 iters_to_unlink; btree_trans_verify(trans); @@ -1909,22 +1949,20 @@ void __bch2_trans_begin(struct btree_trans *trans) * further (allocated an iter with a higher idx) than where the iter * was originally allocated: */ - while (1) { - linked_not_live = trans->iters_linked & ~trans->iters_live; - if (!linked_not_live) - break; + iters_to_unlink = ~trans->iters_live & + ((1ULL << fls64(trans->iters_live)) - 1); - idx = __ffs64(linked_not_live); - if (1ULL << idx > trans->iters_live) - break; + iters_to_unlink |= trans->iters_unlink_on_restart; + iters_to_unlink |= trans->iters_unlink_on_commit; - trans->iters_linked ^= 1 << idx; - bch2_btree_iter_unlink(&trans->iters[idx]); - } + bch2_trans_unlink_iters(trans, iters_to_unlink); - trans->iters_live = 0; - trans->nr_updates = 0; - trans->mem_top = 0; + trans->iters_live = 0; + trans->iters_touched = 0; + trans->iters_unlink_on_restart = 0; + trans->iters_unlink_on_commit = 0; + trans->nr_updates = 0; + trans->mem_top = 0; btree_trans_verify(trans); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index fcec373db39a..04f747180bd8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -273,6 +273,9 @@ static inline int btree_iter_err(struct bkey_s_c k) void bch2_trans_preload_iters(struct btree_trans *); void bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); + +void bch2_trans_unlink_iters(struct btree_trans *, u64); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, u64); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bcc14e40cf5e..18c906ca78be 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -276,8 +276,11 @@ struct btree_trans { size_t nr_restarts; u64 
commit_start; - u64 iters_live; u64 iters_linked; + u64 iters_live; + u64 iters_touched; + u64 iters_unlink_on_restart; + u64 iters_unlink_on_commit; u8 nr_iters; u8 nr_updates; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9c1ca9ad3ead..1c9bfec922c5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -886,10 +886,11 @@ out_noupdates: trans->commit_start = 0; } - trans->nr_updates = 0; - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); + trans->nr_updates = 0; + return ret; err: ret = bch2_trans_commit_error(trans, i, ret); -- cgit From 424eb881300467a21a108d04c9dd08a6f8c007dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2019 15:10:15 -0400 Subject: bcachefs: Only get btree iters from btree transactions Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 ++-- fs/bcachefs/btree_gc.c | 34 +++++---- fs/bcachefs/btree_io.c | 18 +++-- fs/bcachefs/btree_iter.c | 63 ++++++++++++---- fs/bcachefs/btree_iter.h | 53 ++++++------- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/chardev.c | 4 +- fs/bcachefs/debug.c | 41 ++++++---- fs/bcachefs/dirent.c | 19 +++-- fs/bcachefs/ec.c | 39 ++++++---- fs/bcachefs/extents.c | 11 ++- fs/bcachefs/fs-io.c | 59 +++++++++------ fs/bcachefs/fs.c | 11 ++- fs/bcachefs/fsck.c | 146 ++++++++++++++++++------------------ fs/bcachefs/fsck.h | 1 - fs/bcachefs/inode.c | 12 +-- fs/bcachefs/io.c | 57 ++++++++------ fs/bcachefs/journal_seq_blacklist.c | 17 +++-- fs/bcachefs/migrate.c | 22 +++--- fs/bcachefs/move.c | 72 ++++++++++++------ fs/bcachefs/move_types.h | 3 +- fs/bcachefs/quota.c | 18 +++-- fs/bcachefs/rebalance.c | 4 +- fs/bcachefs/str_hash.h | 46 ++++++++---- fs/bcachefs/sysfs.c | 9 ++- fs/bcachefs/tests.c | 121 ++++++++++++++++++------------ fs/bcachefs/xattr.c | 10 ++- 27 files changed, 550 insertions(+), 358 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index da25a1ed5206..436eb1e1ab07 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -264,18 +264,21 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) { struct journal_replay *r; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bch_dev *ca; unsigned i; int ret; - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k) { bch2_alloc_read_key(c, k); - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) return ret; @@ -391,8 +394,6 @@ static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, __alloc_write_key(a, g, m); percpu_up_read(&c->mark_lock); - bch2_btree_iter_cond_resched(iter); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); ret = bch2_trans_commit(trans, NULL, journal_seq, @@ -450,6 +451,7 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) if (ret) break; + bch2_trans_cond_resched(&trans); *wrote = true; } up_read(&ca->bucket_lock); @@ -938,8 +940,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); - 
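A minimal sketch of the iteration pattern this commit converts callers to, assuming the transaction helpers shown in the hunks (bch2_trans_init, for_each_btree_key taking a trans, bch2_trans_cond_resched, bch2_trans_exit). example_walk_extents() is a hypothetical function; the btree id and flags are placeholders.

	/* Hedged sketch: iterators are obtained from and owned by a btree_trans. */
	static int example_walk_extents(struct bch_fs *c)
	{
		struct btree_trans trans;
		struct btree_iter *iter;
		struct bkey_s_c k;

		bch2_trans_init(&trans, c);

		for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k) {
			/* examine k; nothing in this sketch modifies the btree */
			bch2_trans_cond_resched(&trans);
		}

		return bch2_trans_exit(&trans);
	}

Because the transaction owns every iterator the loop allocates, a single bch2_trans_exit() call unlocks and frees them all, which is the shape the conversions below repeat.
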
bch2_btree_iter_cond_resched(iter); - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 302793d84b92..aa8ac7d661ee 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -207,13 +207,16 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bool initial) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; struct range_checks r; unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1; u8 max_stale; int ret = 0; + bch2_trans_init(&trans, c); + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); /* @@ -227,7 +230,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, btree_node_range_checks_init(&r, depth); - __for_each_btree_node(&iter, c, btree_id, POS_MIN, + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b) { btree_node_range_checks(c, b, &r); @@ -241,22 +244,22 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!initial) { if (max_stale > 64) - bch2_btree_node_rewrite(c, &iter, + bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!btree_gc_rewrite_disabled(c) && (btree_gc_always_rewrite(c) || max_stale > 16)) - bch2_btree_node_rewrite(c, &iter, + bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); } - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter) ?: ret; + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -1030,7 +1033,8 @@ next: static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; bool kthread = (current->flags & PF_KTHREAD) != 0; unsigned i; @@ -1039,6 +1043,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *merge[GC_MERGE_NODES]; u32 lock_seq[GC_MERGE_NODES]; + bch2_trans_init(&trans, c); + /* * XXX: We don't have a good way of positively matching on sibling nodes * that have the same parent - this code works by handling the cases @@ -1048,7 +1054,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) */ memset(merge, 0, sizeof(merge)); - __for_each_btree_node(&iter, c, btree_id, POS_MIN, + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, BTREE_MAX_DEPTH, 0, BTREE_ITER_PREFETCH, b) { memmove(merge + 1, merge, @@ -1070,7 +1076,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) } memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - bch2_coalesce_nodes(c, &iter, merge); + bch2_coalesce_nodes(c, iter, merge); for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { lock_seq[i] = merge[i]->lock.state.seq; @@ -1080,23 +1086,23 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) lock_seq[0] = merge[0]->lock.state.seq; if (kthread && kthread_should_stop()) { - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return -ESHUTDOWN; } - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); /* * If the parent node wasn't relocked, it might have been split * and the nodes in our sliding window might not have the same * parent anymore - blow away the sliding window: */ - if (btree_iter_node(&iter, 
iter.level + 1) && - !btree_node_intent_locked(&iter, iter.level + 1)) + if (btree_iter_node(iter, iter->level + 1) && + !btree_node_intent_locked(iter, iter->level + 1)) memset(merge + 1, 0, (GC_MERGE_NODES - 1) * sizeof(merge[0])); } - return bch2_btree_iter_unlock(&iter); + return bch2_trans_exit(&trans); } /** diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d785e6ac22f7..10b3d53b6ebb 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1153,19 +1153,21 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct bkey_i_btree_ptr *new_key; struct bkey_s_btree_ptr bp; struct bch_extent_ptr *ptr; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; int ret; - __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p, - BTREE_MAX_DEPTH, - b->level, BTREE_ITER_NODES); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->level, 0); retry: - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(iter); if (ret) goto err; /* has node been freed? */ - if (iter.l[b->level].b != b) { + if (iter->l[b->level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -1184,13 +1186,13 @@ retry: if (!bch2_bkey_nr_ptrs(bp.s_c)) goto err; - ret = bch2_btree_node_update_key(c, &iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, new_key); if (ret == -EINTR) goto retry; if (ret) goto err; out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3d613e8cd55b..b2446b14bf33 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1582,15 +1582,15 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return __bch2_btree_iter_peek_slot(iter); } -void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned depth, - unsigned flags) +static inline void bch2_btree_iter_init(struct btree_iter *iter, + struct bch_fs *c, enum btree_id btree_id, + struct bpos pos, unsigned flags) { unsigned i; - EBUG_ON(depth >= BTREE_MAX_DEPTH); - EBUG_ON(locks_want > BTREE_MAX_DEPTH); + if (btree_id == BTREE_ID_EXTENTS && + !(flags & BTREE_ITER_NODES)) + flags |= BTREE_ITER_IS_EXTENTS; iter->c = c; iter->pos = pos; @@ -1599,8 +1599,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, iter->flags = flags; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; - iter->level = depth; - iter->locks_want = locks_want; + iter->level = 0; + iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) @@ -1677,12 +1677,14 @@ static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, return idx; } -void bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) { ssize_t idx = btree_trans_iter_idx(trans, iter); + int ret = (iter->flags & BTREE_ITER_ERROR) ? 
-EIO : 0; trans->iters_live &= ~(1ULL << idx); + return ret; } static inline void __bch2_trans_iter_free(struct btree_trans *trans, @@ -1696,17 +1698,23 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, bch2_btree_iter_unlink(&trans->iters[idx]); } -void bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) { + int ret = (iter->flags & BTREE_ITER_ERROR) ? -EIO : 0; + __bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter)); + return ret; } -void bch2_trans_iter_free_on_commit(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_free_on_commit(struct btree_trans *trans, + struct btree_iter *iter) { + int ret = (iter->flags & BTREE_ITER_ERROR) ? -EIO : 0; + trans->iters_unlink_on_commit |= 1ULL << btree_trans_iter_idx(trans, iter); + return ret; } static int btree_trans_realloc_iters(struct btree_trans *trans, @@ -1820,7 +1828,7 @@ got_slot: iter = &trans->iters[idx]; iter->id = iter_id; - bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags); + bch2_btree_iter_init(iter, trans->c, btree_id, pos, flags); } else { iter = &trans->iters[idx]; @@ -1861,6 +1869,31 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, return iter; } +struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, pos, + flags|BTREE_ITER_NODES, 0); + unsigned i; + + BUG_ON(IS_ERR(iter)); + BUG_ON(bkey_cmp(iter->pos, pos)); + + iter->locks_want = locks_want; + iter->level = depth; + + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NOT_END; + + return iter; +} + struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src, u64 iter_id) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 04f747180bd8..267cecd05d84 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -150,20 +150,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *, - enum btree_id, struct bpos, - unsigned , unsigned, unsigned); - -static inline void bch2_btree_iter_init(struct btree_iter *iter, - struct bch_fs *c, enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - __bch2_btree_iter_init(iter, c, btree_id, pos, - flags & BTREE_ITER_INTENT ? 1 : 0, 0, - (btree_id == BTREE_ID_EXTENTS - ? 
BTREE_ITER_IS_EXTENTS : 0)|flags); -} - void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos btree_type_successor(enum btree_id id, @@ -221,17 +207,18 @@ static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) } } -#define __for_each_btree_node(_iter, _c, _btree_id, _start, \ +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ _locks_want, _depth, _flags, _b) \ - for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ - _locks_want, _depth, \ - _flags|BTREE_ITER_NODES), \ + for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ + _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ (_b) = bch2_btree_iter_next_node(_iter, _depth)) -#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \ - __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b) +#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b) static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) @@ -251,9 +238,9 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, : bch2_btree_iter_next(iter); } -#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ - for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - (_start), (_flags)), \ +#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k)\ + for (iter = bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags)), \ (_k) = __bch2_btree_iter_peek(_iter, _flags); \ !IS_ERR_OR_NULL((_k).k); \ (_k) = __bch2_btree_iter_next(_iter, _flags)) @@ -271,9 +258,9 @@ static inline int btree_iter_err(struct bkey_s_c k) /* new multiple iterator interface: */ void bch2_trans_preload_iters(struct btree_trans *); -void bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -void bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); +int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); +int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); +int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *, u64); @@ -308,6 +295,10 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) return __bch2_trans_copy_iter(trans, src, __btree_iter_id()); } +struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); + void __bch2_trans_begin(struct btree_trans *); static inline void bch2_trans_begin_updates(struct btree_trans *trans) @@ -320,6 +311,16 @@ int bch2_trans_unlock(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *); int bch2_trans_exit(struct btree_trans *); +static inline void bch2_trans_cond_resched(struct btree_trans *trans) +{ + if (need_resched()) { + bch2_trans_unlock(trans); + schedule(); + } else if (race_fault()) { + bch2_trans_unlock(trans); + } +} + #ifdef TRACE_TRANSACTION_RESTARTS #define bch2_trans_begin(_trans) \ do { \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1c9bfec922c5..45838db7b991 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -999,7 +999,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, if (ret) break; - 
bch2_btree_iter_cond_resched(iter); + bch2_trans_cond_resched(&trans); } bch2_trans_exit(&trans); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5ee38a6a442f..f7cfec9f00f9 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -303,8 +303,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.iter.btree_id, - .p.pos = ctx->stats.iter.pos, + .p.btree_id = ctx->stats.btree_id, + .p.pos = ctx->stats.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), .p.sectors_total = bch2_fs_usage_read_short(c).used, }; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index f15c29878a9e..64e079280a9a 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -205,7 +205,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int err; @@ -220,8 +221,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); - k = bch2_btree_iter_peek(&iter); + bch2_trans_init(&trans, i->c); + + iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + k = bch2_btree_iter_peek(iter); while (k.k && !(err = btree_iter_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); @@ -230,8 +233,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->buf[i->bytes] = '\n'; i->bytes++; - k = bch2_btree_iter_next(&iter); - i->from = iter.pos; + k = bch2_btree_iter_next(iter); + i->from = iter->pos; err = flush_buf(i); if (err) @@ -240,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; } @@ -256,7 +259,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; int err; @@ -271,7 +275,9 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size || !bkey_cmp(POS_MAX, i->from)) return i->ret; - for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { + bch2_trans_init(&trans, i->c); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); i->bytes = strlen(i->buf); err = flush_buf(i); @@ -289,7 +295,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? 
err : i->ret; } @@ -305,7 +311,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct btree *prev_node = NULL; int err; @@ -321,11 +328,13 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_init(&trans, i->c); + + iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(&iter)).k && + while ((k = bch2_btree_iter_peek(iter)).k && !(err = btree_iter_err(k))) { - struct btree_iter_level *l = &iter.l[0]; + struct btree_iter_level *l = &iter->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); @@ -344,8 +353,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_next(&iter); - i->from = iter.pos; + bch2_btree_iter_next(iter); + i->from = iter->pos; err = flush_buf(i); if (err) @@ -354,7 +363,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index dc3883204d80..672a94936179 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -331,11 +331,15 @@ out: int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0, k) { if (k.k->p.inode > dir_inum) break; @@ -344,7 +348,7 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } @@ -353,7 +357,8 @@ int bch2_readdir(struct bch_fs *c, struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; unsigned len; @@ -361,7 +366,9 @@ int bch2_readdir(struct bch_fs *c, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(inode->v.i_ino, ctx->pos), 0, k) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -387,7 +394,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, ctx->pos = k.k->p.offset + 1; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a989ba172faa..c33bcffa7871 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -398,7 +398,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, /* recovery read path: */ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct ec_stripe_buf *buf; struct closure cl; struct bkey_s_c k; @@ -419,19 +420,21 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) if (!buf) return -ENOMEM; - bch2_btree_iter_init(&iter, c, BTREE_ID_EC, - POS(0, 
stripe_idx), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, + POS(0, stripe_idx), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) { __bcache_io_error(c, "error doing reconstruct read: stripe not found"); kfree(buf); - return bch2_btree_iter_unlock(&iter) ?: -EIO; + return bch2_trans_exit(&trans) ?: -EIO; } bkey_reassemble(&buf->key.k_i, k); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); v = &buf->key.v; @@ -1238,7 +1241,8 @@ static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) { struct journal_replay *r; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; @@ -1246,12 +1250,14 @@ int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) if (ret) return ret; - for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) { bch2_stripe_read_key(c, k); - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) return ret; @@ -1269,17 +1275,20 @@ int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; size_t i, idx = 0; int ret = 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); - k = bch2_btree_iter_prev(&iter); + k = bch2_btree_iter_prev(iter); if (!IS_ERR_OR_NULL(k.k)) idx = k.k->p.offset + 1; - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 194b8d6da1bb..ce46417b07a0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1623,15 +1623,18 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end = pos; struct bkey_s_c k; bool ret = true; end.offset += size; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -1640,7 +1643,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 251c811abeda..efc189c02db7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -997,7 +997,8 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, inode); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct page *page; struct readpages_iter readpages_iter; int ret; @@ -1005,8 +1006,10 @@ void bch2_readahead(struct readahead_control 
*ractl) ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -1027,26 +1030,33 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); - bchfs_read(c, &iter, rbio, inode->v.i_ino, &readpages_iter); + bchfs_read(c, iter, rbio, inode->v.i_ino, &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + bch2_trans_exit(&trans); kfree(readpages_iter.pages); } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, u64 inum, struct page *page) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; page_state_init_for_read(page); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; bio_add_page_contig(&rbio->bio, page); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); - bchfs_read(c, &iter, rbio, inum, NULL); + bch2_trans_init(&trans, c); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); + + bchfs_read(c, iter, rbio, inum, NULL); + + bch2_trans_exit(&trans); } static void bch2_read_single_page_end_io(struct bio *bio) @@ -2111,7 +2121,7 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, if (ret) break; - bch2_btree_iter_cond_resched(iter); + bch2_trans_cond_resched(&trans); } bch2_trans_exit(&trans); @@ -2123,13 +2133,14 @@ static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) { - - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - start, 0, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2139,7 +2150,7 @@ static inline int range_has_data(struct bch_fs *c, } } - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } static int __bch2_truncate_page(struct bch_inode_info *inode, @@ -2464,7 +2475,7 @@ btree_iter_err: * pointers... which isn't a _super_ serious problem... 
*/ - bch2_btree_iter_cond_resched(src); + bch2_trans_cond_resched(&trans); } bch2_trans_unlock(&trans); @@ -2709,7 +2720,8 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 isize, next_data = MAX_LFS_FILESIZE; int ret; @@ -2718,7 +2730,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), 0, k) { if (k.k->p.inode != inode->v.i_ino) { break; @@ -2729,7 +2743,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) return ret; @@ -2779,7 +2793,8 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 isize, next_hole = MAX_LFS_FILESIZE; int ret; @@ -2788,7 +2803,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k) { if (k.k->p.inode != inode->v.i_ino) { @@ -2807,7 +2824,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2a5a90b2a781..6e377a0e176f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,7 +157,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, void *p) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_inode_buf *inode_p; int ret; @@ -1193,7 +1193,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) tmp; bool have_extent = false; @@ -1206,7 +1207,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(ei->v.i_ino, start >> 9), 0, k) if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { @@ -1227,7 +1230,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (have_extent) ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 439f758d8178..41284d38db2f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -16,6 +16,23 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 sectors = 0; + + for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { + if (k.k->p.inode != inum) + break; + + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + } + + return bch2_trans_iter_free(trans, iter) ?: sectors; +} + static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, struct bkey_s_c_dirent dirent) { @@ -181,44 +198,32 @@ err: return ret; } -/* fsck hasn't been converted to new transactions yet: */ -static int fsck_hash_delete_at(const struct bch_hash_desc desc, +static int fsck_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, struct bch_hash_info *info, - struct btree_iter *orig_iter) + struct btree_iter *iter) { - struct btree_trans trans; - struct btree_iter *iter; int ret; - - bch2_btree_iter_unlock(orig_iter); - - bch2_trans_init(&trans, orig_iter->c); retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_copy_iter(&trans, orig_iter); - if (IS_ERR(iter)) { - ret = PTR_ERR(iter); - goto err; - } - - ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); -err: - if (ret == -EINTR) - goto retry; + if (ret == -EINTR) { + ret = bch2_btree_iter_traverse(iter); + if (!ret) + goto retry; + } - bch2_trans_exit(&trans); return ret; } -static int hash_check_duplicates(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, - struct btree_iter *k_iter, struct bkey_s_c k) +static int hash_check_duplicates(struct btree_trans *trans, + const struct bch_hash_desc desc, struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k2; char buf[200]; @@ -239,7 +244,7 @@ static int hash_check_duplicates(const struct bch_hash_desc desc, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = fsck_hash_delete_at(desc, &h->info, k_iter); + ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); if (ret) return ret; ret = 1; @@ -274,9 +279,9 @@ static bool key_has_correct_hash(const struct bch_hash_desc desc, hash <= k.k->p.offset; } -static int hash_check_key(const struct bch_hash_desc desc, - struct btree_trans *trans, struct hash_check *h, - struct btree_iter *k_iter, struct bkey_s_c k) +static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; char buf[200]; @@ -312,7 +317,7 @@ static int hash_check_key(const struct bch_hash_desc desc, return 1; } - ret = hash_check_duplicates(desc, h, c, k_iter, k); + ret = hash_check_duplicates(trans, desc, h, k_iter, k); fsck_err: return ret; } @@ -417,14 +422,17 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i_sectors; int ret = 0; + bch2_trans_init(&trans, c); + bch_verbose(c, "checking extents"); - 
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(BCACHEFS_ROOT_INO, 0), 0, k) { ret = walk_inode(c, &w, k.k->p.inode); if (ret) @@ -437,7 +445,7 @@ static int check_extents(struct bch_fs *c) !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, "extent type %u for non regular file, inode %llu mode %o", k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); ret = bch2_inode_truncate(c, k.k->p.inode, 0); if (ret) @@ -449,14 +457,14 @@ static int check_extents(struct bch_fs *c) w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != - (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), + (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), c, "i_sectors wrong: got %llu, should be %llu", w.inode.bi_sectors, i_sectors)) { struct bkey_inode_buf p; w.inode.bi_sectors = i_sectors; - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bch2_inode_pack(&p, &w.inode); @@ -470,7 +478,7 @@ static int check_extents(struct bch_fs *c) } /* revalidate iterator: */ - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); } if (fsck_err_on(w.have_inode && @@ -479,7 +487,7 @@ static int check_extents(struct bch_fs *c) k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); ret = bch2_inode_truncate(c, k.k->p.inode, w.inode.bi_size); @@ -490,7 +498,7 @@ static int check_extents(struct bch_fs *c) } err: fsck_err: - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* @@ -688,7 +696,8 @@ static int check_xattrs(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_xattr_hash_desc, &trans, &h, iter, k); + ret = hash_check_key(&trans, bch2_xattr_hash_desc, + &h, iter, k); if (ret) goto fsck_err; } @@ -863,13 +872,16 @@ static int check_directory_structure(struct bch_fs *c, struct inode_bitmap dirs_done = { NULL, 0 }; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; bool had_unreachable; u64 d_inum; int ret = 0; + bch2_trans_init(&trans, c); + bch_verbose(c, "checking directory structure"); /* DFS: */ @@ -894,7 +906,7 @@ next: if (e->offset == U64_MAX) goto up; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(e->inum, e->offset + 1), 0, k) { if (k.k->p.inode != e->inum) break; @@ -914,7 +926,7 @@ next: if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, "directory %llu has multiple hardlinks", d_inum)) { - ret = remove_dirent(c, &iter, dirent); + ret = remove_dirent(c, iter, dirent); if (ret) goto err; continue; @@ -931,10 +943,14 @@ next: goto err; } - bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter); + if (ret) { + bch_err(c, "btree error %i in fsck", ret); + goto err; + } goto next; } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter); if (ret) { bch_err(c, "btree error %i in fsck", ret); goto err; @@ -943,7 +959,7 @@ up: path.nr--; } - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k) { if (k.k->type != 
KEY_TYPE_inode) continue; @@ -956,7 +972,7 @@ up: if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, "unreachable directory found (inum %llu)", k.k->p.inode)) { - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(iter); ret = reattach_inode(c, lostfound_inode, k.k->p.inode); if (ret) { @@ -966,7 +982,7 @@ up: had_unreachable = true; } } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter); if (ret) goto err; @@ -985,7 +1001,7 @@ out: return ret; err: fsck_err: - ret = bch2_btree_iter_unlock(&iter) ?: ret; + ret = bch2_trans_exit(&trans) ?: ret; goto out; } @@ -1022,15 +1038,18 @@ noinline_for_stack static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, u64 range_start, u64 *range_end) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent d; u64 d_inum; int ret; + bch2_trans_init(&trans, c); + inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1046,32 +1065,15 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, break; } - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans); if (ret) bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); return ret; } -s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { - if (k.k->p.inode != inum) - break; - - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; - } - - return bch2_btree_iter_unlock(&iter) ?: sectors; -} - static int check_inode_nlink(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, struct bch_inode_unpacked *u, @@ -1253,7 +1255,7 @@ static int check_inode(struct btree_trans *trans, bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); - sectors = bch2_count_inode_sectors(c, u.bi_inum); + sectors = bch2_count_inode_sectors(trans, u.bi_inum); if (sectors < 0) { bch_err(c, "error in fs gc: error %i " "recounting inode sectors", @@ -1346,7 +1348,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); genradix_iter_advance(&nlinks_iter, links); bch2_btree_iter_next(iter); - bch2_btree_iter_cond_resched(iter); + bch2_trans_cond_resched(&trans); } fsck_err: bch2_trans_exit(&trans); diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 88da06762d7d..97460452e842 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,7 +2,6 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -s64 bch2_count_inode_sectors(struct bch_fs *, u64); int bch2_fsck(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 811c917cba84..c6336e7a2a23 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -447,13 +447,15 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = -ENOENT; - for_each_btree_key(&iter, c, BTREE_ID_INODES, - POS(inode_nr, 0), - BTREE_ITER_SLOTS, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, 
iter, BTREE_ID_INODES, + POS(inode_nr, 0), BTREE_ITER_SLOTS, k) { switch (k.k->type) { case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); @@ -466,7 +468,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, break; } - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f4c49bf82456..62ee09121036 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1263,27 +1263,28 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio struct bch_io_failures *failed, unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; BKEY_PADDED(k) tmp; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); + k = bch2_btree_iter_peek_slot(iter); + if (btree_iter_err(k)) goto err; - } bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); if (!bkey_extent_is_data(k.k) || !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), @@ -1300,25 +1301,30 @@ retry: goto retry; if (ret) goto err; - goto out; -err: - rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_rbio_done(rbio); + bch2_trans_exit(&trans); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; } static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, struct bch_io_failures *failed, unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; + bch2_trans_init(&trans, c); + flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; retry: - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; @@ -1326,7 +1332,7 @@ retry: bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(iter); bytes = min_t(unsigned, bvec_iter.bi_size, (k.k->p.offset - bvec_iter.bi_sector) << 9); @@ -1351,12 +1357,12 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); - BUG_ON(!ret); - __bcache_io_error(c, "btree IO error %i", ret); + BUG_ON(!(iter->flags & BTREE_ITER_ERROR)); + __bcache_io_error(c, "btree IO error"); err: rbio->bio.bi_status = BLK_STS_IOERR; out: + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } @@ -1859,12 +1865,14 @@ out_read_done: void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; - int ret; + + bch2_trans_init(&trans, c); BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); @@ -1873,7 +1881,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, 
rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; @@ -1885,7 +1893,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(iter); bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); @@ -1907,9 +1915,10 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + BUG_ON(!(iter->flags & BTREE_ITER_ERROR)); + bcache_io_error(c, &rbio->bio, "btree IO error"); + + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index c26f36d58633..45c8d38d12de 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -62,9 +62,12 @@ static void journal_seq_blacklist_flush(struct journal *j, closure_init_stack(&cl); for (i = 0;; i++) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; + bch2_trans_init(&trans, c); + mutex_lock(&j->blacklist_lock); if (i >= bl->nr_entries) { mutex_unlock(&j->blacklist_lock); @@ -73,17 +76,17 @@ static void journal_seq_blacklist_flush(struct journal *j, n = bl->entries[i]; mutex_unlock(&j->blacklist_lock); - __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, - 0, 0, BTREE_ITER_NODES); + iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos, + 0, 0, 0); - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(iter); /* The node might have already been rewritten: */ if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); + ret = bch2_btree_node_rewrite(c, iter, n.seq, 0); if (ret) { - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); bch2_fs_fatal_error(c, "error %i rewriting btree node with blacklisted journal seq", ret); @@ -92,7 +95,7 @@ static void journal_seq_blacklist_flush(struct journal *j, } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } for (i = 0;; i++) { diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 38bf75b6bc2d..2b63b07db2bc 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -106,7 +106,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct closure cl; struct btree *b; unsigned id; @@ -116,13 +117,15 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + bch2_trans_init(&trans, c); closure_init_stack(&cl); mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + for_each_btree_node(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH, b) { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct bkey_i_btree_ptr *new_key; retry: @@ -134,7 +137,7 @@ retry: * but got -EINTR after upgrading the iter, but * then raced and the node is now gone: */ - bch2_btree_iter_downgrade(&iter); + bch2_btree_iter_downgrade(iter); ret = bch2_mark_bkey_replicas(c, 
bkey_i_to_s_c(&b->key)); if (ret) @@ -148,16 +151,16 @@ retry: if (ret) goto err; - ret = bch2_btree_node_update_key(c, &iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, new_key); if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(iter); goto retry; } if (ret) goto err; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_free(&trans, iter); } /* flush relevant btree updates */ @@ -171,14 +174,13 @@ retry: } ret = 0; -out: +err: + bch2_trans_exit(&trans); + ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; -err: - bch2_btree_iter_unlock(&iter); - goto out; } int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8c453ae31525..3f3e34e07f35 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -485,6 +485,8 @@ int bch2_move_data(struct bch_fs *c, struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; @@ -495,9 +497,14 @@ int bch2_move_data(struct bch_fs *c, INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); + bch2_trans_init(&trans, c); + stats->data_type = BCH_DATA_USER; - bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start, - BTREE_ITER_PREFETCH); + stats->btree_id = BTREE_ID_EXTENTS; + stats->pos = POS_MIN; + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + BTREE_ITER_PREFETCH); if (rate) bch2_ratelimit_reset(rate); @@ -507,7 +514,7 @@ int bch2_move_data(struct bch_fs *c, delay = rate ? bch2_ratelimit_delay(rate) : 0; if (delay) { - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); set_current_state(TASK_INTERRUPTIBLE); } @@ -520,13 +527,16 @@ int bch2_move_data(struct bch_fs *c, schedule_timeout(delay); if (unlikely(freezing(current))) { - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); try_to_freeze(); } } while (delay); peek: - k = bch2_btree_iter_peek(&stats->iter); + k = bch2_btree_iter_peek(iter); + + stats->pos = iter->pos; + if (!k.k) break; ret = btree_iter_err(k); @@ -542,7 +552,7 @@ peek: struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); io_opts = bch2_opts_to_inode_opts(c->opts); if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) @@ -567,7 +577,7 @@ peek: /* unlock before doing IO: */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, bkey_s_c_to_extent(k), @@ -589,11 +599,11 @@ next: atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_next(&stats->iter); - bch2_btree_iter_cond_resched(&stats->iter); + bch2_btree_iter_next(iter); + bch2_trans_cond_resched(&trans); } out: - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_exit(&trans); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -609,20 +619,23 @@ out: static int bch2_gc_data_replicas(struct bch_fs *c) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; + bch2_trans_init(&trans, c); + mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, (1 << 
BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; } - ret = bch2_btree_iter_unlock(&iter) ?: ret; + ret = bch2_trans_exit(&trans) ?: ret; bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); @@ -632,24 +645,30 @@ static int bch2_gc_data_replicas(struct bch_fs *c) static int bch2_gc_btree_replicas(struct bch_fs *c) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; unsigned id; int ret = 0; + bch2_trans_init(&trans, c); + mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + for_each_btree_node(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH, b) { ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter) ?: ret; + ret = bch2_trans_iter_free(&trans, iter) ?: ret; } + bch2_trans_exit(&trans); + bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); @@ -662,16 +681,25 @@ static int bch2_move_btree(struct bch_fs *c, struct bch_move_stats *stats) { struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; unsigned id; struct data_opts data_opts; enum data_cmd cmd; int ret = 0; + bch2_trans_init(&trans, c); + stats->data_type = BCH_DATA_BTREE; for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + stats->btree_id = id; + + for_each_btree_node(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH, b) { + stats->pos = iter->pos; + switch ((cmd = pred(c, arg, bkey_i_to_s_c(&b->key), &io_opts, &data_opts))) { @@ -686,15 +714,17 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(c, &stats->iter, + ret = bch2_btree_node_rewrite(c, iter, b->data->keys.seq, 0) ?: ret; next: - bch2_btree_iter_cond_resched(&stats->iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&stats->iter) ?: ret; + ret = bch2_trans_iter_free(&trans, iter) ?: ret; } + bch2_trans_exit(&trans); + return ret; } diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 8dbeb6ef727c..6788170d3f95 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -4,7 +4,8 @@ struct bch_move_stats { enum bch_data_type data_type; - struct btree_iter iter; + enum btree_id btree_id; + struct bpos pos; atomic64_t keys_moved; atomic64_t sectors_moved; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 492ab73c39e7..f5dd13e92200 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -356,11 +356,14 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0), + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), BTREE_ITER_PREFETCH, k) { if (k.k->p.inode != type) break; @@ -370,7 +373,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) break; } - return bch2_btree_iter_unlock(&iter) ?: ret; 
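/*
 * [Editor's illustration, not part of the patch] The conversion pattern this
 * commit applies throughout: every btree_iter is now obtained from a
 * btree_trans, iteration goes through for_each_btree_key(&trans, iter, ...),
 * and bch2_trans_exit() tears the iterators down and reports any btree error.
 * A minimal sketch under those assumptions; example_count_dirents() is a
 * hypothetical helper, the bch2_trans_*() calls follow the usage visible in
 * the surrounding hunks (e.g. bch2_count_inode_sectors(), bch2_quota_init_type()).
 */
static s64 example_count_dirents(struct bch_fs *c, u64 dir_inum)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	s64 nr = 0;

	bch2_trans_init(&trans, c);		/* iterators now live inside a transaction */

	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
			   POS(dir_inum, 0), 0, k) {
		if (k.k->p.inode != dir_inum)	/* stop at the end of this directory */
			break;
		nr++;
	}

	/* unlocks and frees every iterator; returns a btree error if one occurred */
	return bch2_trans_exit(&trans) ?: nr;
}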
+ return bch2_trans_exit(&trans) ?: ret; } void bch2_fs_quota_exit(struct bch_fs *c) @@ -414,7 +417,8 @@ int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bch_inode_unpacked u; struct bkey_s_c k; int ret; @@ -429,7 +433,9 @@ int bch2_fs_quota_read(struct bch_fs *c) return ret; } - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, BTREE_ITER_PREFETCH, k) { switch (k.k->type) { case KEY_TYPE_inode: @@ -443,7 +449,7 @@ int bch2_fs_quota_read(struct bch_fs *c) KEY_TYPE_QUOTA_NOCHECK); } } - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* Enable/disable/delete quotas for an entire filesystem: */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cc1a7deb90bc..fe4a9af92a76 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -289,8 +289,8 @@ ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) case REBALANCE_RUNNING: pr_buf(&out, "running\n"); pr_buf(&out, "pos %llu:%llu\n", - r->move_stats.iter.pos.inode, - r->move_stats.iter.pos.offset); + r->move_stats.pos.inode, + r->move_stats.pos.offset); break; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ffa7af0820ea..0ed28d7f074d 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -203,13 +203,16 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_whiteout) - return false; + break; if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) - return true; + desc.hash_bkey(info, k) <= start->pos.offset) { + bch2_trans_iter_free_on_commit(trans, iter); + return 1; + } } - return btree_iter_err(k); + + return bch2_trans_iter_free(trans, iter); } static __always_inline @@ -220,6 +223,8 @@ int bch2_hash_set(struct btree_trans *trans, { struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; + bool found = false; + int ret = 0; iter = bch2_trans_get_iter(trans, desc.btree_id, POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), @@ -250,21 +255,30 @@ int bch2_hash_set(struct btree_trans *trans, goto not_found; } - return btree_iter_err(k) ?: -ENOSPC; -not_found: - if (flags & BCH_HASH_SET_MUST_REPLACE) - return -ENOENT; + if (slot) + bch2_trans_iter_free(trans, iter); - insert->k.p = slot->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(slot, insert)); - return 0; + return bch2_trans_iter_free(trans, iter) ?: -ENOSPC; found: - if (flags & BCH_HASH_SET_MUST_CREATE) - return -EEXIST; + found = true; +not_found: - insert->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); - return 0; + if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + ret = -ENOENT; + } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + ret = -EEXIST; + } else { + if (!found && slot) { + bch2_trans_iter_free(trans, iter); + iter = slot; + } + + insert->k.p = iter->pos; + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_iter_free_on_commit(trans, iter); + } + + return ret; } static __always_inline diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f1e269671374..1354dd33874c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -282,7 +282,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char 
*buf) static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, nr_compressed_extents = 0, @@ -292,7 +293,9 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k) if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -314,7 +317,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return scnprintf(buf, PAGE_SIZE, "uncompressed data:\n" diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 652e22125dcf..c8682fe674f6 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -89,11 +89,14 @@ static void test_delete_written(struct bch_fs *c, u64 nr) static void test_iterate(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i; int ret; + bch2_trans_init(&trans, c); + delete_test_keys(c); pr_info("inserting test keys"); @@ -113,28 +116,31 @@ static void test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) BUG_ON(k.k->p.offset != i++); - bch2_btree_iter_unlock(&iter); BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) BUG_ON(k.k->p.offset != --i); - bch2_btree_iter_unlock(&iter); BUG_ON(i); + + bch2_trans_exit(&trans); } static void test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i; int ret; + bch2_trans_init(&trans, c); + delete_test_keys(c); pr_info("inserting test extents"); @@ -155,32 +161,35 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } - bch2_btree_iter_unlock(&iter); BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); } - bch2_btree_iter_unlock(&iter); BUG_ON(i); + + bch2_trans_exit(&trans); } static void test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i; int ret; + bch2_trans_init(&trans, c); + delete_test_keys(c); pr_info("inserting test keys"); @@ -200,11 +209,11 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) { BUG_ON(k.k->p.offset != i); i += 2; } - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_free(&trans, iter); BUG_ON(i != nr * 2); @@ -212,7 +221,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - 
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), BTREE_ITER_SLOTS, k) { BUG_ON(bkey_deleted(k.k) != (i & 1)); BUG_ON(k.k->p.offset != i++); @@ -220,16 +229,20 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - bch2_btree_iter_unlock(&iter); + + bch2_trans_exit(&trans); } static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i; int ret; + bch2_trans_init(&trans, c); + delete_test_keys(c); pr_info("inserting test keys"); @@ -250,12 +263,12 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; } - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_free(&trans, iter); BUG_ON(i != nr); @@ -263,7 +276,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), BTREE_ITER_SLOTS, k) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -274,7 +287,8 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - bch2_btree_iter_unlock(&iter); + + bch2_trans_exit(&trans); } /* @@ -283,34 +297,40 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) */ static void test_peek_end(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); BUG_ON(k.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void test_peek_end_extents(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); + bch2_trans_init(&trans, c); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); BUG_ON(k.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } /* extent unit tests */ @@ -401,32 +421,35 @@ static void rand_insert(struct bch_fs *c, u64 nr) static void rand_lookup(struct bch_fs *c, u64 nr) { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; u64 i; - for (i = 0; i < nr; i++) { - struct btree_iter iter; - struct bkey_s_c k; + bch2_trans_init(&trans, c); - bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, - POS(0, test_rand()), 0); + for (i = 0; i < nr; i++) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); - k = bch2_btree_iter_peek(&iter); - bch2_btree_iter_unlock(&iter); + k = bch2_btree_iter_peek(iter); + bch2_trans_iter_free(&trans, iter); } + + bch2_trans_exit(&trans); } static void rand_mixed(struct bch_fs *c, u64 nr) { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; int ret; u64 i; - for (i = 0; i < nr; 
i++) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c); + for (i = 0; i < nr; i++) { iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(0, test_rand()), 0); @@ -443,9 +466,10 @@ static void rand_mixed(struct bch_fs *c, u64 nr) BUG_ON(ret); } - bch2_trans_exit(&trans); + bch2_trans_iter_free(&trans, iter); } + bch2_trans_exit(&trans); } static void rand_delete(struct bch_fs *c, u64 nr) @@ -495,12 +519,15 @@ static void seq_insert(struct bch_fs *c, u64 nr) static void seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) ; - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static void seq_overwrite(struct bch_fs *c, u64 nr) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 545e743972fb..68ece7c0ee7a 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -271,12 +271,16 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 inum = dentry->d_inode->i_ino; ssize_t ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) { + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + POS(inum, 0), 0, k) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -290,7 +294,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret < 0) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); if (ret < 0) return ret; -- cgit From 9e5e5b9e7175ddd66f8c212ce1d460a9e7db3fe7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2019 17:06:42 -0400 Subject: bcachefs: Btree iterators now always have a btree_trans Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 ++++++++++++++++++------------------ fs/bcachefs/btree_iter.h | 21 ++++++--------------- fs/bcachefs/btree_types.h | 3 ++- 3 files changed, 26 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b2446b14bf33..4baa4ab3aa4b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -78,7 +78,6 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { - struct bch_fs *c = iter->c; struct btree_iter *linked; unsigned readers = 0; @@ -97,7 +96,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) */ atomic64_sub(__SIX_VAL(read_lock, readers), &b->lock.state.counter); - btree_node_lock_type(c, b, SIX_LOCK_write); + btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); atomic64_add(__SIX_VAL(read_lock, readers), &b->lock.state.counter); } @@ -199,7 +198,6 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, enum six_lock_type type, bool may_drop_locks) { - struct bch_fs *c = iter->c; struct btree_iter *linked; bool ret = true; @@ -254,7 +252,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } if (ret) - __btree_node_lock_type(c, b, type); + __btree_node_lock_type(iter->trans->c, b, type); else 
trans_restart(); @@ -644,8 +642,8 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); + if (debug_check_bkeys(iter->trans->c)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } @@ -834,7 +832,7 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) static inline int btree_iter_lock_root(struct btree_iter *iter, unsigned depth_want) { - struct bch_fs *c = iter->c; + struct bch_fs *c = iter->trans->c; struct btree *b; enum six_lock_type lock_type; unsigned i; @@ -882,11 +880,12 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, noinline static void btree_iter_prefetch(struct btree_iter *iter) { + struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags) + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(iter, iter->level); @@ -901,8 +900,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(iter->c, iter, &tmp.k, - iter->level - 1); + bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); } if (!was_locked) @@ -911,6 +909,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) static inline int btree_iter_down(struct btree_iter *iter) { + struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; unsigned level = iter->level - 1; @@ -922,7 +921,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type, true); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -946,7 +945,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *); static int btree_iter_traverse_error(struct btree_iter *iter, int ret) { - struct bch_fs *c = iter->c; + struct bch_fs *c = iter->trans->c; struct btree_iter *linked, *sorted_iters, **i; retry_all: bch2_btree_iter_unlock(iter); @@ -1275,9 +1274,9 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) __bch2_btree_node_iter_peek_all(&l->iter, l->b)); } - if (debug_check_bkeys(iter->c) && + if (debug_check_bkeys(iter->trans->c) && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->c, l->b, ret); + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } @@ -1582,17 +1581,18 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return __bch2_btree_iter_peek_slot(iter); } -static inline void bch2_btree_iter_init(struct btree_iter *iter, - struct bch_fs *c, enum btree_id btree_id, +static inline void bch2_btree_iter_init(struct btree_trans *trans, + struct btree_iter *iter, enum btree_id btree_id, struct bpos pos, unsigned flags) { + struct bch_fs *c = trans->c; unsigned i; if (btree_id == BTREE_ID_EXTENTS && !(flags & BTREE_ITER_NODES)) flags |= BTREE_ITER_IS_EXTENTS; - iter->c = c; + iter->trans = trans; iter->pos = pos; bkey_init(&iter->k); iter->k.p = pos; @@ -1828,7 +1828,7 @@ got_slot: iter = &trans->iters[idx]; iter->id = iter_id; - bch2_btree_iter_init(iter, trans->c, btree_id, pos, flags); + 
bch2_btree_iter_init(trans, iter, btree_id, pos, flags); } else { iter = &trans->iters[idx]; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 267cecd05d84..285490697a96 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -193,17 +193,19 @@ static inline int btree_iter_cmp(const struct btree_iter *l, return __btree_iter_cmp(l->btree_id, l->pos, r); } +int bch2_trans_unlock(struct btree_trans *); + /* * Unlocks before scheduling * Note: does not revalidate iterator */ -static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) +static inline void bch2_trans_cond_resched(struct btree_trans *trans) { if (need_resched()) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); schedule(); } else if (race_fault()) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); } } @@ -231,7 +233,7 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, unsigned flags) { - bch2_btree_iter_cond_resched(iter); + bch2_trans_cond_resched(iter->trans); return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_next_slot(iter) @@ -307,20 +309,9 @@ static inline void bch2_trans_begin_updates(struct btree_trans *trans) } void *bch2_trans_kmalloc(struct btree_trans *, size_t); -int bch2_trans_unlock(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *); int bch2_trans_exit(struct btree_trans *); -static inline void bch2_trans_cond_resched(struct btree_trans *trans) -{ - if (need_resched()) { - bch2_trans_unlock(trans); - schedule(); - } else if (race_fault()) { - bch2_trans_unlock(trans); - } -} - #ifdef TRACE_TRANSACTION_RESTARTS #define bch2_trans_begin(_trans) \ do { \ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 18c906ca78be..fc79631ea480 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -11,6 +11,7 @@ struct open_bucket; struct btree_update; +struct btree_trans; #define MAX_BSETS 3U @@ -209,7 +210,7 @@ enum btree_iter_uptodate { * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - struct bch_fs *c; + struct btree_trans *trans; struct bpos pos; u8 flags; -- cgit From 7c26ecae326aee84bb53cfb163108a20fb3094d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2019 22:43:26 -0400 Subject: bcachefs: Better bch2_trans_copy_iter() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 94 +++++++++++++++++++++++++++++------------------- fs/bcachefs/btree_iter.h | 11 ++---- 2 files changed, 59 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4baa4ab3aa4b..5280e77f548f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1606,7 +1606,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; iter->l[iter->level].b = BTREE_ITER_NOT_END; - iter->next = iter; prefetch(c->btree_roots[btree_id].b); } @@ -1638,11 +1637,11 @@ static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new iter->next = new; } -void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +static void __bch2_btree_iter_copy(struct btree_iter *dst, + struct btree_iter *src) { unsigned i; - __bch2_btree_iter_unlock(dst); memcpy(dst, src, offsetof(struct btree_iter, next)); for (i = 0; i < BTREE_MAX_DEPTH; i++) @@ -1651,6 +1650,12 @@ void 
bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) __btree_lock_want(dst, i)); } +void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +{ + __bch2_btree_iter_unlock(dst); + __bch2_btree_iter_copy(dst, src); +} + /* new transactional stuff: */ static void btree_trans_verify(struct btree_trans *trans) @@ -1789,6 +1794,35 @@ void bch2_trans_preload_iters(struct btree_trans *trans) btree_trans_realloc_iters(trans, BTREE_ITER_MAX); } +static int btree_trans_iter_alloc(struct btree_trans *trans) +{ + struct btree_iter *iter; + unsigned idx = ffz(trans->iters_linked); + + if (idx < trans->nr_iters) + goto got_slot; + + if (trans->nr_iters == trans->size) { + int ret = btree_trans_realloc_iters(trans, trans->size * 2); + if (ret) + return ret; + } + + idx = trans->nr_iters++; + BUG_ON(trans->nr_iters > trans->size); +got_slot: + iter = &trans->iters[idx]; + iter->next = iter; + + BUG_ON(trans->iters_linked & (1ULL << idx)); + + if (trans->iters_linked) + bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], + &trans->iters[idx]); + trans->iters_linked |= 1ULL << idx; + return idx; +} + static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, u64 iter_id) @@ -1799,6 +1833,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, BUG_ON(trans->nr_iters > BTREE_ITER_MAX); for (idx = 0; idx < trans->nr_iters; idx++) { + if (!(trans->iters_linked & (1ULL << idx))) + continue; + iter = &trans->iters[idx]; if (iter_id ? iter->id == iter_id @@ -1809,22 +1846,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, idx = -1; found: if (idx < 0) { - idx = ffz(trans->iters_linked); - if (idx < trans->nr_iters) - goto got_slot; + idx = btree_trans_iter_alloc(trans); + if (idx < 0) + return ERR_PTR(idx); - BUG_ON(trans->nr_iters > trans->size); - - if (trans->nr_iters == trans->size) { - int ret = btree_trans_realloc_iters(trans, - trans->size * 2); - if (ret) - return ERR_PTR(ret); - } - - idx = trans->nr_iters++; - BUG_ON(trans->nr_iters > trans->size); -got_slot: iter = &trans->iters[idx]; iter->id = iter_id; @@ -1841,13 +1866,6 @@ got_slot: trans->iters_live |= 1ULL << idx; trans->iters_touched |= 1ULL << idx; - if (trans->iters_linked && - !(trans->iters_linked & (1 << idx))) - bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], - iter); - - trans->iters_linked |= 1ULL << idx; - btree_trans_verify(trans); BUG_ON(iter->btree_id != btree_id); @@ -1894,20 +1912,22 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, return iter; } -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *src, - u64 iter_id) +struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *src) { - struct btree_iter *iter = - __btree_trans_get_iter(trans, src->btree_id, - POS_MIN, src->flags, iter_id); + int idx; - if (!IS_ERR(iter)) { - trans->iters_unlink_on_restart |= - 1ULL << btree_trans_iter_idx(trans, iter); - bch2_btree_iter_copy(iter, src); - } - return iter; + idx = btree_trans_iter_alloc(trans); + if (idx < 0) + return ERR_PTR(idx); + + trans->iters_live |= 1ULL << idx; + trans->iters_touched |= 1ULL << idx; + trans->iters_unlink_on_restart |= 1ULL << idx; + + __bch2_btree_iter_copy(&trans->iters[idx], src); + + return &trans->iters[idx]; } void *bch2_trans_kmalloc(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 
285490697a96..4be472e45310 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -268,8 +268,8 @@ void bch2_trans_unlink_iters(struct btree_trans *, u64); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, u64); -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, - struct btree_iter *, u64); +struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, + struct btree_iter *); static __always_inline u64 __btree_iter_id(void) { @@ -290,13 +290,6 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, __btree_iter_id()); } -static __always_inline struct btree_iter * -bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -{ - - return __bch2_trans_copy_iter(trans, src, __btree_iter_id()); -} - struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -- cgit From 0f2383677172176691bbc760c5af4d87f67f78fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2019 22:03:30 -0400 Subject: bcachefs: trans_for_each_iter() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_cache.c | 6 +- fs/bcachefs/btree_iter.c | 98 +++++++++++++++------------- fs/bcachefs/btree_iter.h | 103 +++++++++++++++++------------- fs/bcachefs/btree_locking.h | 5 +- fs/bcachefs/btree_update_interior.c | 30 ++++----- fs/bcachefs/btree_update_leaf.c | 64 +++++-------------- fs/bcachefs/debug.c | 4 +- fs/bcachefs/ec.c | 8 +-- fs/bcachefs/fs-io.c | 38 +++++------ fs/bcachefs/fsck.c | 123 +++++++++++++++++++----------------- fs/bcachefs/inode.c | 4 +- fs/bcachefs/io.c | 10 +-- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 4 +- fs/bcachefs/quota.c | 2 +- 16 files changed, 255 insertions(+), 248 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 436eb1e1ab07..3549f0f54624 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -945,7 +945,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) return ret; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 65fc82fba071..55aaa3e4aa84 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -814,7 +814,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, * We might have got -EINTR because trylock failed, and we're * holding other locks that would cause us to deadlock: */ - for_each_linked_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (btree_iter_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); @@ -839,13 +839,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, } } - bch2_btree_iter_relock(iter); + bch2_btree_trans_relock(iter->trans); } out: if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, level + 1); - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); BUG_ON((!may_drop_locks || !IS_ERR(ret)) && (iter->uptodate >= BTREE_ITER_NEED_RELOCK || diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5280e77f548f..f018ca7999f7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -70,7 +70,7 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) EBUG_ON(iter->l[b->level].b != b); 
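/*
 * [Editor's illustration, not part of the patch] With this commit, walking the
 * iterators that belong to a transaction is done with trans_for_each_iter()
 * and trans_for_each_iter_with_node(), which scan the trans->iters_linked
 * bitmask, instead of following the old per-iterator linked list
 * (for_each_btree_iter()/for_each_linked_btree_iter()). A hedged usage
 * sketch; example_count_read_locks() is a made-up helper whose loop body
 * mirrors the reader-counting loop converted in __bch2_btree_node_lock_write()
 * in this same file.
 */
static unsigned example_count_read_locks(struct btree_trans *trans, struct btree *b)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	/* visits every iterator whose bit is set in trans->iters_linked */
	trans_for_each_iter(trans, linked)
		if (linked->l[b->level].b == b &&
		    btree_node_read_locked(linked, b->level))
			readers++;

	return readers;
}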
EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); - for_each_btree_iter_with_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) linked->l[b->level].lock_seq += 2; six_unlock_write(&b->lock); @@ -83,7 +83,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) EBUG_ON(btree_node_read_locked(iter, b->level)); - for_each_linked_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (linked->l[b->level].b == b && btree_node_read_locked(linked, b->level)) readers++; @@ -187,7 +187,8 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, if (iter->uptodate == BTREE_ITER_NEED_RELOCK) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); + return iter->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -202,7 +203,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, bool ret = true; /* Check if it's safe to block: */ - for_each_btree_iter(iter, linked) { + trans_for_each_iter(iter->trans, linked) { if (!linked->nodes_locked) continue; @@ -262,7 +263,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -void __bch2_btree_iter_verify_locks(struct btree_iter *iter) +void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; @@ -279,35 +280,23 @@ void __bch2_btree_iter_verify_locks(struct btree_iter *iter) } } -void bch2_btree_iter_verify_locks(struct btree_iter *iter) +void bch2_btree_trans_verify_locks(struct btree_trans *trans) { - struct btree_iter *linked; - - for_each_btree_iter(iter, linked) - __bch2_btree_iter_verify_locks(linked); + struct btree_iter *iter; + trans_for_each_iter(trans, iter) + bch2_btree_iter_verify_locks(iter); } #endif __flatten -static bool __bch2_btree_iter_relock(struct btree_iter *iter) +static bool bch2_btree_iter_relock(struct btree_iter *iter) { return iter->uptodate >= BTREE_ITER_NEED_RELOCK ? btree_iter_get_locks(iter, false) : true; } -bool bch2_btree_iter_relock(struct btree_iter *iter) -{ - struct btree_iter *linked; - bool ret = true; - - for_each_btree_iter(iter, linked) - ret &= __bch2_btree_iter_relock(linked); - - return ret; -} - bool __bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) { @@ -325,8 +314,9 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * on iterators that might lock ancestors before us to avoid getting * -EINTR later: */ - for_each_linked_btree_iter(iter, linked) - if (linked->btree_id == iter->btree_id && + trans_for_each_iter(iter->trans, linked) + if (linked != iter && + linked->btree_id == iter->btree_id && btree_iter_cmp(linked, iter) <= 0 && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; @@ -371,7 +361,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, * might have had to modify locks_want on linked iterators due to lock * ordering: */ - for_each_btree_iter(iter, linked) { + trans_for_each_iter(iter->trans, linked) { unsigned new_locks_want = downgrade_to ?: (linked->flags & BTREE_ITER_INTENT ? 1 : 0); @@ -394,19 +384,40 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, } } - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); } int bch2_btree_iter_unlock(struct btree_iter *iter) { struct btree_iter *linked; - for_each_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) __bch2_btree_iter_unlock(linked); - return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; + return btree_iter_err(iter); } +bool bch2_btree_trans_relock(struct btree_trans *trans) +{ + struct btree_iter *iter; + bool ret = true; + + trans_for_each_iter(trans, iter) + ret &= bch2_btree_iter_relock(iter); + + return ret; +} + +void bch2_btree_trans_unlock(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + __bch2_btree_iter_unlock(iter); +} + +/* Btree transaction locking: */ + /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -464,7 +475,7 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - for_each_btree_iter_with_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) __bch2_btree_iter_verify(linked, b); } @@ -618,7 +629,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - for_each_btree_iter_with_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); @@ -776,7 +787,7 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) enum btree_node_locked_type t; struct btree_iter *linked; - for_each_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (btree_iter_pos_in_node(linked, b)) { /* * bch2_btree_iter_node_drop() has already been called - @@ -810,7 +821,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) iter->l[level].b = BTREE_ITER_NOT_END; mark_btree_node_unlocked(iter, level); - for_each_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { __btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NOT_END; @@ -825,7 +836,7 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - for_each_btree_iter_with_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) __btree_iter_init(linked, b->level); } @@ -1005,7 +1016,7 @@ retry: iter = iter->next; } while (iter != sorted_iters); - ret = btree_iter_linked(iter) ? -EINTR : 0; + ret = btree_trans_has_multiple_iters(iter->trans) ? 
-EINTR : 0; out: bch2_btree_cache_cannibalize_unlock(c); return ret; @@ -1051,7 +1062,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (__bch2_btree_iter_relock(iter)) + if (bch2_btree_iter_relock(iter)) return 0; /* @@ -1091,7 +1102,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } @@ -1104,7 +1115,7 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(ret)) ret = btree_iter_traverse_error(iter, ret); - BUG_ON(ret == -EINTR && !btree_iter_linked(iter)); + BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans)); return ret; } @@ -1117,7 +1128,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, (iter->btree_id == BTREE_ID_EXTENTS && type != BTREE_ITER_NODES)); - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); } /* Iterate across nodes (leaf and interior nodes) */ @@ -1619,7 +1630,7 @@ static void bch2_btree_iter_unlink(struct btree_iter *iter) if (!btree_iter_linked(iter)) return; - for_each_linked_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (linked->next == iter) { linked->next = iter->next; iter->next = iter; @@ -1686,7 +1697,7 @@ int bch2_trans_iter_put(struct btree_trans *trans, struct btree_iter *iter) { ssize_t idx = btree_trans_iter_idx(trans, iter); - int ret = (iter->flags & BTREE_ITER_ERROR) ? -EIO : 0; + int ret = btree_iter_err(iter); trans->iters_live &= ~(1ULL << idx); return ret; @@ -1706,7 +1717,7 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, int bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { - int ret = (iter->flags & BTREE_ITER_ERROR) ? -EIO : 0; + int ret = btree_iter_err(iter); __bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter)); return ret; @@ -1715,7 +1726,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, int bch2_trans_iter_free_on_commit(struct btree_trans *trans, struct btree_iter *iter) { - int ret = (iter->flags & BTREE_ITER_ERROR) ? -EIO : 0; + int ret = btree_iter_err(iter); trans->iters_unlink_on_commit |= 1ULL << btree_trans_iter_idx(trans, iter); @@ -1966,8 +1977,7 @@ int bch2_trans_unlock(struct btree_trans *trans) unsigned idx = __ffs(iters); struct btree_iter *iter = &trans->iters[idx]; - if (iter->flags & BTREE_ITER_ERROR) - ret = -EIO; + ret = ret ?: btree_iter_err(iter); __bch2_btree_iter_unlock(iter); iters ^= 1 << idx; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4be472e45310..70b5cc6ee5ab 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -23,11 +23,44 @@ static inline struct btree *btree_node_parent(struct btree_iter *iter, return btree_iter_node(iter, b->level + 1); } +static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) +{ + return hweight64(trans->iters_linked) > 1; +} + static inline bool btree_iter_linked(const struct btree_iter *iter) { return iter->next != iter; } +static inline int btree_iter_err(const struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; +} + +/* Iterate over iters within a transaction: */ + +static inline struct btree_iter * +__trans_next_iter(struct btree_trans *trans, struct btree_iter *iter) +{ + unsigned idx; + + /* XXX expensive pointer subtraction: */ + + for (idx = iter - trans->iters; + idx < trans->nr_iters; + idx++) + if (trans->iters_linked & (1ULL << idx)) + return &trans->iters[idx]; + + return NULL; +} + +#define trans_for_each_iter(_trans, _iter) \ + for (_iter = (_trans)->iters; \ + (_iter = __trans_next_iter((_trans), _iter)); \ + _iter++) + static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) { @@ -44,59 +77,39 @@ static inline bool __iter_has_node(const struct btree_iter *iter, } static inline struct btree_iter * -__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked) +__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, + struct btree_iter *iter) { - return linked->next != iter ? linked->next : NULL; -} + unsigned idx; -static inline struct btree_iter * -__next_iter_with_node(struct btree_iter *iter, struct btree *b, - struct btree_iter *linked) -{ - while (linked && !__iter_has_node(linked, b)) - linked = __next_linked_iter(iter, linked); + /* XXX expensive pointer subtraction: */ + + for (idx = iter - trans->iters; + idx < trans->nr_iters; + idx++) { + if (!(trans->iters_linked & (1ULL << idx))) + continue; - return linked; + iter = &trans->iters[idx]; + if (__iter_has_node(iter, b)) + return iter; + } + + return NULL; } -/** - * for_each_btree_iter - iterate over all iterators linked with @_iter, - * including @_iter - */ -#define for_each_btree_iter(_iter, _linked) \ - for ((_linked) = (_iter); (_linked); \ - (_linked) = __next_linked_iter(_iter, _linked)) - -/** - * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter - * that also point to @_b - * - * @_b is assumed to be locked by @_iter - * - * Filters out iterators that don't have a valid btree_node iterator for @_b - - * i.e. iterators for which bch2_btree_node_relock() would not succeed. 
- */ -#define for_each_btree_iter_with_node(_iter, _b, _linked) \ - for ((_linked) = (_iter); \ - ((_linked) = __next_iter_with_node(_iter, _b, _linked)); \ - (_linked) = __next_linked_iter(_iter, _linked)) - -/** - * for_each_linked_btree_iter - iterate over all iterators linked with @_iter, - * _not_ including @_iter - */ -#define for_each_linked_btree_iter(_iter, _linked) \ - for ((_linked) = (_iter)->next; \ - (_linked) != (_iter); \ - (_linked) = (_linked)->next) +#define trans_for_each_iter_with_node(_trans, _b, _iter) \ + for (_iter = (_trans)->iters; \ + (_iter = __trans_next_iter_with_node((_trans), (_b), _iter));\ + _iter++) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_iter_verify(struct btree_iter *, struct btree *); -void bch2_btree_iter_verify_locks(struct btree_iter *); +void bch2_btree_trans_verify_locks(struct btree_trans *); #else static inline void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) {} -static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, @@ -104,7 +117,9 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); -bool bch2_btree_iter_relock(struct btree_iter *); + +bool bch2_btree_trans_relock(struct btree_trans *); +void bch2_btree_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); @@ -252,7 +267,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, !IS_ERR_OR_NULL((_k).k); \ (_k) = __bch2_btree_iter_next(_iter, _flags)) -static inline int btree_iter_err(struct bkey_s_c k) +static inline int bkey_err(struct bkey_s_c k) { return PTR_ERR_OR_ZERO(k.k); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index c036cd0458a4..37e09474fde4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -163,8 +163,9 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, { struct btree_iter *linked; - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && + trans_for_each_iter(iter->trans, linked) + if (linked != iter && + linked->l[level].b == b && btree_node_locked_type(linked, level) >= want) { six_lock_increment(&b->lock, want); return true; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8b96faf107f8..6e9a87c6b1be 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -246,7 +246,7 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, { struct btree_iter *linked; - for_each_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) BUG_ON(linked->l[b->level].b == b); /* @@ -1438,7 +1438,7 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], start_time); @@ -1474,7 +1474,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, btree_update_updated_node(as, b); - for_each_btree_iter_with_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); bch2_btree_iter_verify(iter, 
b); @@ -1559,7 +1559,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - for_each_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (linked->btree_id == BTREE_ID_EXTENTS) flags |= BTREE_INSERT_USE_RESERVE; @@ -1571,10 +1571,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(iter->trans); down_read(&c->gc_lock); - if (btree_iter_linked(iter)) + if (!bch2_btree_trans_relock(iter->trans)) ret = -EINTR; } @@ -1753,7 +1753,7 @@ retry: if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); out: - bch2_btree_iter_verify_locks(iter); + bch2_btree_trans_verify_locks(iter->trans); /* * Don't downgrade locks here: we're called after successful insert, @@ -2036,10 +2036,10 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, return -EINTR; if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(iter->trans); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) { + if (!bch2_btree_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -2050,15 +2050,15 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - ret = -EINTR; - - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) + if (!bch2_btree_trans_relock(iter->trans)) { + ret = -EINTR; goto err; + } } new_hash = bch2_btree_node_mem_alloc(c); @@ -2079,12 +2079,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret != -EINTR) goto err; - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) + if (!bch2_btree_trans_relock(iter->trans)) goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 45838db7b991..8b043d3c19ad 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -51,25 +51,6 @@ static void btree_trans_unlock_write(struct btree_trans *trans) bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); } -static bool btree_trans_relock(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update_iter(trans, i) - return bch2_btree_iter_relock(i->iter); - return true; -} - -static void btree_trans_unlock(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update_iter(trans, i) { - bch2_btree_iter_unlock(i->iter); - break; - } -} - static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { @@ -422,8 +403,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && !(trans->flags & BTREE_INSERT_ATOMIC)); - - bch2_btree_iter_verify_locks(i->iter); } BUG_ON(debug_check_bkeys(c) && @@ -451,14 +430,14 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) if (ret != -EAGAIN) return ret; - btree_trans_unlock(trans); + bch2_btree_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, u64s, 0); if (ret) return ret; - if 
(!btree_trans_relock(trans)) { + if (!bch2_btree_trans_relock(trans)) { trans_restart(" (iter relock after journal preres get blocked)"); return -EINTR; } @@ -617,12 +596,9 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * have been traversed/locked, depending on what the caller was * doing: */ - trans_for_each_update_iter(trans, i) { - for_each_btree_iter(i->iter, linked) - if (linked->uptodate < BTREE_ITER_NEED_RELOCK) - linked->flags |= BTREE_ITER_NOUNLOCK; - break; - } + trans_for_each_iter(trans, linked) + if (linked->uptodate < BTREE_ITER_NEED_RELOCK) + linked->flags |= BTREE_ITER_NOUNLOCK; } trans_for_each_update_iter(trans, i) @@ -707,20 +683,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } - if (btree_trans_relock(trans)) + if (bch2_btree_trans_relock(trans)) return 0; trans_restart(" (iter relock after marking replicas)"); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: - btree_trans_unlock(trans); + bch2_btree_trans_unlock(trans); ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; - if (btree_trans_relock(trans)) + if (bch2_btree_trans_relock(trans)) return 0; trans_restart(" (iter relock after journal res get blocked)"); @@ -785,10 +761,9 @@ static int __bch2_trans_commit(struct btree_trans *trans, goto err; } - if (i->iter->flags & BTREE_ITER_ERROR) { - ret = -EIO; + ret = btree_iter_err(i->iter); + if (ret) goto err; - } } ret = do_btree_insert_at(trans, stopped_at); @@ -802,16 +777,10 @@ static int __bch2_trans_commit(struct btree_trans *trans, bch2_btree_iter_downgrade(i->iter); err: /* make sure we didn't drop or screw up locks: */ - trans_for_each_update_iter(trans, i) { - bch2_btree_iter_verify_locks(i->iter); - break; - } + bch2_btree_trans_verify_locks(trans); - trans_for_each_update_iter(trans, i) { - for_each_btree_iter(i->iter, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; - break; - } + trans_for_each_iter(trans, linked) + linked->flags &= ~BTREE_ITER_NOUNLOCK; return ret; } @@ -847,13 +816,14 @@ int bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) { if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) return -EROFS; - btree_trans_unlock(trans); + bch2_btree_trans_unlock(trans); ret = bch2_fs_read_write_early(c); if (ret) @@ -861,7 +831,7 @@ int bch2_trans_commit(struct btree_trans *trans, percpu_ref_get(&c->writes); - if (!btree_trans_relock(trans)) { + if (!bch2_btree_trans_relock(trans)) { ret = -EINTR; goto err; } @@ -962,7 +932,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = btree_iter_err(k)) && + !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); /* really shouldn't be using a bare, unpadded bkey_i */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 64e079280a9a..bb69a2acd8dd 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -226,7 +226,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); k = bch2_btree_iter_peek(iter); - while (k.k && !(err = btree_iter_err(k))) { + while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); i->bytes 
= strlen(i->buf); BUG_ON(i->bytes >= PAGE_SIZE); @@ -333,7 +333,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && - !(err = btree_iter_err(k))) { + !(err = bkey_err(k))) { struct btree_iter_level *l = &iter->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c33bcffa7871..5fc0025e66bf 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -426,7 +426,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) POS(0, stripe_idx), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); - if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) { + if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { __bcache_io_error(c, "error doing reconstruct read: stripe not found"); kfree(buf); @@ -541,7 +541,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) return 0; - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(iter->trans); if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) return -EINTR; @@ -750,7 +750,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = btree_iter_err(k)) && + !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { idx = extent_matches_stripe(c, &s->key.v, k); if (idx < 0) { @@ -1170,7 +1170,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, POS(0, idx)); k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) return ret; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index efc189c02db7..d865081d4a21 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -251,7 +251,7 @@ static int sum_sector_overwrites(struct btree_trans *trans, * carefully not advancing past @new and thus whatever leaf node * @_iter currently points to: */ - BUG_ON(btree_iter_err(old)); + BUG_ON(bkey_err(old)); if (allocating && !*allocating && @@ -322,10 +322,10 @@ static int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { if (c->opts.new_inode_updates) { - bch2_btree_iter_unlock(extent_iter); + bch2_btree_trans_unlock(trans); mutex_lock(&inode->ei_update_lock); - if (!bch2_btree_iter_relock(extent_iter)) { + if (!bch2_btree_trans_relock(trans)) { mutex_unlock(&inode->ei_update_lock); return -EINTR; } @@ -921,10 +921,11 @@ static void readpage_bio_extend(struct readpages_iter *iter, } } -static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, +static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct bch_read_bio *rbio, u64 inum, struct readpages_iter *readpages_iter) { + struct bch_fs *c = trans->c; struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; @@ -943,7 +944,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, BUG_ON(!k.k); if (IS_ERR(k.k)) { - int ret = bch2_btree_iter_unlock(iter); + int ret = btree_iter_err(iter); BUG_ON(!ret); bcache_io_error(c, bio, "btree IO error %i", ret); bio_endio(bio); @@ -951,7 +952,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, } bkey_reassemble(&tmp.k, k); - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); if (readpages_iter) { @@ -1030,7 +1031,8 @@ void 
bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); - bchfs_read(c, iter, rbio, inode->v.i_ino, &readpages_iter); + bchfs_read(&trans, iter, rbio, inode->v.i_ino, + &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -1054,7 +1056,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); - bchfs_read(c, iter, rbio, inum, NULL); + bchfs_read(&trans, iter, rbio, inum, NULL); bch2_trans_exit(&trans); } @@ -2098,7 +2100,7 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = btree_iter_err(k)) && + !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); @@ -2437,14 +2439,14 @@ static long bch2_fcollapse(struct bch_inode_info *inode, ret = bch2_btree_iter_traverse(dst); if (ret) - goto btree_iter_err; + goto bkey_err; bch2_btree_iter_set_pos(src, POS(dst->pos.inode, dst->pos.offset + (len >> 9))); k = bch2_btree_iter_peek_slot(src); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; + if ((ret = bkey_err(k))) + goto bkey_err; bkey_reassemble(&copy.k, k); @@ -2465,7 +2467,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, dst, &copy.k, 0, true, true, NULL); bch2_disk_reservation_put(c, &disk_res); -btree_iter_err: +bkey_err: if (ret == -EINTR) ret = 0; if (ret) @@ -2559,8 +2561,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, struct bkey_s_c k; k = bch2_btree_iter_peek_slot(iter); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; + if ((ret = bkey_err(k))) + goto bkey_err; /* already reserved */ if (k.k->type == KEY_TYPE_reservation && @@ -2591,7 +2593,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, &quota_res, sectors, true); if (unlikely(ret)) - goto btree_iter_err; + goto bkey_err; } if (reservation.v.nr_replicas < replicas || @@ -2599,7 +2601,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, ret = bch2_disk_reservation_get(c, &disk_res, sectors, replicas, 0); if (unlikely(ret)) - goto btree_iter_err; + goto bkey_err; reservation.v.nr_replicas = disk_res.nr_replicas; } @@ -2608,7 +2610,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, &disk_res, &quota_res, iter, &reservation.k_i, 0, true, true, NULL); -btree_iter_err: +bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); bch2_disk_reservation_put(c, &disk_res); if (ret == -EINTR) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 41284d38db2f..c4d9d2761cdc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -33,9 +33,10 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return bch2_trans_iter_free(trans, iter) ?: sectors; } -static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, +static int remove_dirent(struct btree_trans *trans, struct bkey_s_c_dirent dirent) { + struct bch_fs *c = trans->c; struct qstr name; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; @@ -52,8 +53,8 @@ static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, buf[name.len] = '\0'; name.name = buf; - /* Unlock iter so we don't deadlock, after copying name: */ - bch2_btree_iter_unlock(iter); + /* Unlock so we don't deadlock, after copying name: */ + bch2_btree_trans_unlock(trans); ret = bch2_inode_find_by_inum(c,
dir_inum, &dir_inode); if (ret) { @@ -143,29 +144,33 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) struct hash_check { struct bch_hash_info info; - struct btree_trans *trans; /* start of current chain of hash collisions: */ struct btree_iter *chain; /* next offset in current chain of hash collisions: */ - u64 next; + u64 chain_end; }; -static void hash_check_init(const struct bch_hash_desc desc, - struct btree_trans *trans, +static void hash_check_init(struct hash_check *h) +{ + h->chain = NULL; +} + +static void hash_stop_chain(struct btree_trans *trans, struct hash_check *h) { - h->trans = trans; - h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0); - h->next = -1; + if (h->chain) + bch2_trans_iter_free(trans, h->chain); + h->chain = NULL; } -static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, +static void hash_check_set_inode(struct btree_trans *trans, + struct hash_check *h, const struct bch_inode_unpacked *bi) { - h->info = bch2_hash_info_init(c, bi); - h->next = -1; + h->info = bch2_hash_info_init(trans->c, bi); + hash_stop_chain(trans, h); } static int hash_redo_key(const struct bch_hash_desc desc, @@ -186,8 +191,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, if (ret) goto err; - bch2_btree_iter_unlock(k_iter); - bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, tmp, BCH_HASH_SET_MUST_CREATE); ret = bch2_trans_commit(trans, NULL, NULL, @@ -232,7 +235,7 @@ static int hash_check_duplicates(struct btree_trans *trans, if (!bkey_cmp(h->chain->pos, k_iter->pos)) return 0; - iter = bch2_trans_copy_iter(h->trans, h->chain); + iter = bch2_trans_copy_iter(trans, h->chain); BUG_ON(IS_ERR(iter)); for_each_btree_key_continue(iter, 0, k2) { @@ -252,23 +255,39 @@ static int hash_check_duplicates(struct btree_trans *trans, } } fsck_err: - bch2_trans_iter_free(h->trans, iter); + bch2_trans_iter_free(trans, iter); return ret; } -static bool key_has_correct_hash(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, - struct btree_iter *k_iter, struct bkey_s_c k) +static void hash_set_chain_start(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) { - u64 hash; + bool hole = (k.k->type != KEY_TYPE_whiteout && + k.k->type != desc.key_type); - if (k.k->type != KEY_TYPE_whiteout && - k.k->type != desc.key_type) - return true; + if (hole || k.k->p.offset > h->chain_end + 1) + hash_stop_chain(trans, h); + + if (!hole) { + if (!h->chain) { + h->chain = bch2_trans_copy_iter(trans, k_iter); + BUG_ON(IS_ERR(h->chain)); + } + + h->chain_end = k.k->p.offset; + } +} + +static bool key_has_correct_hash(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + u64 hash; - if (k.k->p.offset != h->next) - bch2_btree_iter_copy(h->chain, k_iter); - h->next = k.k->p.offset + 1; + hash_set_chain_start(trans, desc, h, k_iter, k); if (k.k->type != desc.key_type) return true; @@ -288,13 +307,7 @@ static int hash_check_key(struct btree_trans *trans, u64 hashed; int ret = 0; - if (k.k->type != KEY_TYPE_whiteout && - k.k->type != desc.key_type) - return 0; - - if (k.k->p.offset != h->next) - bch2_btree_iter_copy(h->chain, k_iter); - h->next = k.k->p.offset + 1; + hash_set_chain_start(trans, desc, h, k_iter, k); if (k.k->type != desc.key_type) return 0; @@ -332,7 +345,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, unsigned 
len; u64 hash; - if (key_has_correct_hash(bch2_dirent_hash_desc, h, c, iter, *k)) + if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) return 0; len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); @@ -526,7 +539,7 @@ static int check_dirents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(BCACHEFS_ROOT_INO, 0), 0); - hash_check_init(bch2_dirent_hash_desc, &trans, &h); + hash_check_init(&h); for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; @@ -554,7 +567,7 @@ static int check_dirents(struct bch_fs *c) } if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&h, c, &w.inode); + hash_check_set_inode(&trans, &h, &w.inode); ret = check_dirent_hash(&trans, &h, iter, &k); if (ret > 0) { @@ -587,7 +600,7 @@ static int check_dirents(struct bch_fs *c) ".. dirent") || fsck_err_on(memchr(d.v->d_name, '/', name_len), c, "dirent name has invalid chars")) { - ret = remove_dirent(c, iter, d); + ret = remove_dirent(&trans, d); if (ret) goto err; continue; @@ -597,7 +610,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to own directory:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(c, iter, d); + ret = remove_dirent(&trans, d); if (ret) goto err; continue; @@ -614,7 +627,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(c, iter, d); + ret = remove_dirent(&trans, d); if (ret) goto err; continue; @@ -650,6 +663,8 @@ static int check_dirents(struct bch_fs *c) } } + + hash_stop_chain(&trans, &h); err: fsck_err: return bch2_trans_exit(&trans) ?: ret; @@ -677,7 +692,7 @@ static int check_xattrs(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); - hash_check_init(bch2_xattr_hash_desc, &trans, &h); + hash_check_init(&h); for_each_btree_key_continue(iter, 0, k) { ret = walk_inode(c, &w, k.k->p.inode); @@ -694,7 +709,7 @@ static int check_xattrs(struct bch_fs *c) } if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&h, c, &w.inode); + hash_check_set_inode(&trans, &h, &w.inode); ret = hash_check_key(&trans, bch2_xattr_hash_desc, &h, iter, k); @@ -926,7 +941,7 @@ next: if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, "directory %llu has multiple hardlinks", d_inum)) { - ret = remove_dirent(c, iter, dirent); + ret = remove_dirent(&trans, dirent); if (ret) goto err; continue; @@ -972,7 +987,7 @@ up: if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, "unreachable directory found (inum %llu)", k.k->p.inode)) { - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(&trans); ret = reattach_inode(c, lostfound_inode, k.k->p.inode); if (ret) { @@ -1187,6 +1202,9 @@ static int check_inode(struct btree_trans *trans, int ret = 0; ret = bch2_inode_unpack(inode, &u); + + bch2_btree_trans_unlock(trans); + if (bch2_fs_inconsistent_on(ret, c, "error unpacking inode %llu in fsck", inode.k->p.inode)) @@ -1306,7 +1324,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret2 = btree_iter_err(k))) { + !(ret2 = bkey_err(k))) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (!link && (!k.k || iter->pos.inode >= range_end)) @@ -1326,12 +1344,6 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { - /* - * Avoid potential deadlocks with iter for - * 
truncate/rm/etc.: - */ - bch2_btree_iter_unlock(iter); - ret = check_inode(&trans, lostfound_inode, iter, bkey_s_c_to_inode(k), link); BUG_ON(ret == -EINTR); @@ -1402,7 +1414,7 @@ static int check_inodes_fast(struct bch_fs *c) struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_inode inode; - int ret = 0; + int ret = 0, ret2; bch2_trans_init(&trans, c); @@ -1426,12 +1438,9 @@ static int check_inodes_fast(struct bch_fs *c) } } - if (!ret) - ret = bch2_btree_iter_unlock(iter); + ret2 = bch2_trans_exit(&trans); - bch2_trans_exit(&trans); - - return ret; + return ret ?: ret2; } /* diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c6336e7a2a23..7be24865cc3f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -323,7 +323,7 @@ again: while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) return ret; @@ -399,7 +399,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); u32 bi_generation = 0; - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 62ee09121036..71481b9728f5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1279,7 +1279,7 @@ retry: rbio->bio.bi_status = 0; k = bch2_btree_iter_peek_slot(iter); - if (btree_iter_err(k)) + if (bkey_err(k)) goto err; bkey_reassemble(&tmp.k, k); @@ -1332,7 +1332,7 @@ retry: bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(&trans); bytes = min_t(unsigned, bvec_iter.bi_size, (k.k->p.offset - bvec_iter.bi_sector) << 9); @@ -1357,7 +1357,7 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!(iter->flags & BTREE_ITER_ERROR)); + BUG_ON(!btree_iter_err(iter)); __bcache_io_error(c, "btree IO error"); err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1893,7 +1893,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(&trans); bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); @@ -1915,7 +1915,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!(iter->flags & BTREE_ITER_ERROR)); + BUG_ON(!btree_iter_err(iter)); bcache_io_error(c, &rbio->bio, "btree IO error"); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 2b63b07db2bc..98202fbabfaf 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -52,7 +52,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = btree_iter_err(k))) { + !(ret = bkey_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { ret = bch2_mark_bkey_replicas(c, k); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3f3e34e07f35..9793896bee77 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -77,7 +77,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bool did_work = false; int nr; - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) break; @@ -539,7 +539,7 @@ peek: if (!k.k) break; - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) break; if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) 
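/*
 * [Editor's sketch - not part of the patch series above.]
 *
 * The hunks above rename the old btree_iter_err(k), which took a struct
 * bkey_s_c, to bkey_err() (PTR_ERR_OR_ZERO on k.k), and reuse the name
 * btree_iter_err() for iterator-level errors (BTREE_ITER_ERROR maps to
 * -EIO).  A minimal caller in the style of bch2_btree_delete_range() or
 * bch2_move_data() might look like this; example_scan() and its
 * parameters are hypothetical, and the bch2_* helpers are assumed from
 * the bcachefs iterator API as it stands after these patches.
 */
static int example_scan(struct bch_fs *c, enum btree_id id,
			struct bpos start, struct bpos end)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_init(&trans, c);

	iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_PREFETCH);

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&	/* error carried in the returned key */
	       bkey_cmp(iter->pos, end) < 0) {
		/* ... inspect k here ... */

		bch2_btree_iter_next(iter);
	}

	/* bch2_trans_exit() drops locks and frees the transaction's iterators */
	return bch2_trans_exit(&trans) ?: ret;
}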
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index f5dd13e92200..a4f75d53b42c 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -732,7 +732,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); + ret = bkey_err(k); if (unlikely(ret)) return ret; -- cgit From e542029eebffbc2c696e3df8d7efe448cbb5e54e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2019 22:54:42 -0400 Subject: bcachefs: Change btree_iter_traverse_error() to not use iter->next Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 65 ++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f018ca7999f7..0d3e99370a98 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -956,10 +956,22 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *); static int btree_iter_traverse_error(struct btree_iter *iter, int ret) { - struct bch_fs *c = iter->trans->c; - struct btree_iter *linked, *sorted_iters, **i; + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + u8 sorted[BTREE_ITER_MAX]; + unsigned i, nr_sorted = 0; + + trans_for_each_iter(trans, iter) + sorted[nr_sorted++] = iter - trans->iters; + +#define btree_iter_cmp_by_idx(_l, _r) \ + btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); +#undef btree_iter_cmp_by_idx + retry_all: - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); if (ret != -ENOMEM && ret != -EINTR) goto io_error; @@ -975,48 +987,19 @@ retry_all: } while (ret); } - /* - * Linked iters are normally a circular singly linked list - break cycle - * while we sort them: - */ - linked = iter->next; - iter->next = NULL; - sorted_iters = NULL; - - while (linked) { - iter = linked; - linked = linked->next; - - i = &sorted_iters; - while (*i && btree_iter_cmp(iter, *i) > 0) - i = &(*i)->next; - - iter->next = *i; - *i = iter; - } - - /* Make list circular again: */ - iter = sorted_iters; - while (iter->next) - iter = iter->next; - iter->next = sorted_iters; - /* Now, redo traversals in correct order: */ + for (i = 0; i < nr_sorted; i++) { + iter = &trans->iters[sorted[i]]; - iter = sorted_iters; - do { -retry: - ret = __bch2_btree_iter_traverse(iter); - if (unlikely(ret)) { - if (ret == -EINTR) - goto retry; - goto retry_all; - } + do { + ret = __bch2_btree_iter_traverse(iter); + } while (ret == -EINTR); - iter = iter->next; - } while (iter != sorted_iters); + if (ret) + goto retry_all; + } - ret = btree_trans_has_multiple_iters(iter->trans) ? -EINTR : 0; + ret = btree_trans_has_multiple_iters(trans) ? 
-EINTR : 0; out: bch2_btree_cache_cannibalize_unlock(c); return ret; -- cgit From ecc892e40b52213ceb9eee8dfb972d32911e7509 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2019 22:46:52 -0400 Subject: bcachefs: Kill btree_iter->next Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 101 +++++----------------------------------------- fs/bcachefs/btree_iter.h | 7 ---- fs/bcachefs/btree_types.h | 9 ----- 3 files changed, 10 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0d3e99370a98..34d4ce32c2a2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1604,67 +1604,8 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, prefetch(c->btree_roots[btree_id].b); } -static void bch2_btree_iter_unlink(struct btree_iter *iter) -{ - struct btree_iter *linked; - - __bch2_btree_iter_unlock(iter); - - if (!btree_iter_linked(iter)) - return; - - trans_for_each_iter(iter->trans, linked) - if (linked->next == iter) { - linked->next = iter->next; - iter->next = iter; - return; - } - - BUG(); -} - -static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) -{ - BUG_ON(btree_iter_linked(new)); - - new->next = iter->next; - iter->next = new; -} - -static void __bch2_btree_iter_copy(struct btree_iter *dst, - struct btree_iter *src) -{ - unsigned i; - - memcpy(dst, src, offsetof(struct btree_iter, next)); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->lock, - __btree_lock_want(dst, i)); -} - -void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) -{ - __bch2_btree_iter_unlock(dst); - __bch2_btree_iter_copy(dst, src); -} - /* new transactional stuff: */ -static void btree_trans_verify(struct btree_trans *trans) -{ - unsigned i; - - for (i = 0; i < trans->nr_iters; i++) { - struct btree_iter *iter = &trans->iters[i]; - - BUG_ON(btree_iter_linked(iter) != - ((trans->iters_linked & (1 << i)) && - !is_power_of_2(trans->iters_linked))); - } -} - static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, struct btree_iter *iter) { @@ -1689,12 +1630,12 @@ int bch2_trans_iter_put(struct btree_trans *trans, static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { + __bch2_btree_iter_unlock(&trans->iters[idx]); trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); trans->iters_touched &= ~(1ULL << idx); trans->iters_unlink_on_restart &= ~(1ULL << idx); trans->iters_unlink_on_commit &= ~(1ULL << idx); - bch2_btree_iter_unlink(&trans->iters[idx]); } int bch2_trans_iter_free(struct btree_trans *trans, @@ -1720,7 +1661,6 @@ static int btree_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { void *new_iters, *new_updates; - unsigned i; BUG_ON(new_size > BTREE_ITER_MAX); @@ -1761,20 +1701,6 @@ success: trans->updates = new_updates; trans->size = new_size; - for (i = 0; i < trans->nr_iters; i++) - trans->iters[i].next = &trans->iters[i]; - - if (trans->iters_linked) { - unsigned first_linked = __ffs(trans->iters_linked); - - for (i = first_linked + 1; i < trans->nr_iters; i++) - if (trans->iters_linked & (1 << i)) - bch2_btree_iter_link(&trans->iters[first_linked], - &trans->iters[i]); - } - - btree_trans_verify(trans); - if (trans->iters_live) { trans_restart(); return -EINTR; @@ -1790,7 +1716,6 @@ void bch2_trans_preload_iters(struct btree_trans *trans) static int btree_trans_iter_alloc(struct btree_trans *trans) { - 
struct btree_iter *iter; unsigned idx = ffz(trans->iters_linked); if (idx < trans->nr_iters) @@ -1805,14 +1730,7 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) idx = trans->nr_iters++; BUG_ON(trans->nr_iters > trans->size); got_slot: - iter = &trans->iters[idx]; - iter->next = iter; - BUG_ON(trans->iters_linked & (1ULL << idx)); - - if (trans->iters_linked) - bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], - &trans->iters[idx]); trans->iters_linked |= 1ULL << idx; return idx; } @@ -1860,8 +1778,6 @@ found: trans->iters_live |= 1ULL << idx; trans->iters_touched |= 1ULL << idx; - btree_trans_verify(trans); - BUG_ON(iter->btree_id != btree_id); BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); @@ -1909,7 +1825,8 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { - int idx; + struct btree_iter *iter; + int i, idx; idx = btree_trans_iter_alloc(trans); if (idx < 0) @@ -1919,7 +1836,13 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, trans->iters_touched |= 1ULL << idx; trans->iters_unlink_on_restart |= 1ULL << idx; - __bch2_btree_iter_copy(&trans->iters[idx], src); + iter = &trans->iters[idx]; + *iter = *src; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(iter, i)) + six_lock_increment(&iter->l[i].b->lock, + __btree_lock_want(iter, i)); return &trans->iters[idx]; } @@ -1985,8 +1908,6 @@ void __bch2_trans_begin(struct btree_trans *trans) { u64 iters_to_unlink; - btree_trans_verify(trans); - /* * On transaction restart, the transaction isn't required to allocate * all the same iterators it on the last iteration: @@ -2009,8 +1930,6 @@ void __bch2_trans_begin(struct btree_trans *trans) trans->iters_unlink_on_commit = 0; trans->nr_updates = 0; trans->mem_top = 0; - - btree_trans_verify(trans); } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 70b5cc6ee5ab..74eb5ed12ca0 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -28,11 +28,6 @@ static inline bool btree_trans_has_multiple_iters(const struct btree_trans *tran return hweight64(trans->iters_linked) > 1; } -static inline bool btree_iter_linked(const struct btree_iter *iter) -{ - return iter->next != iter; -} - static inline int btree_iter_err(const struct btree_iter *iter) { return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; @@ -165,8 +160,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); - static inline struct bpos btree_type_successor(enum btree_id id, struct bpos pos) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index fc79631ea480..7ef1feba2817 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -234,15 +234,6 @@ struct btree_iter { struct bkey k; u64 id; - - /* - * Circular linked list of linked iterators: linked iterators share - * locks (e.g. two linked iterators may have the same node intent - * locked, or read and write locked, at the same time), and insertions - * through one iterator won't invalidate the other linked iterators. 
- */ - /* Must come last: */ - struct btree_iter *next; }; struct deferred_update { -- cgit From e1120a4c8dd4c8839265a052d03d5604c30166b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2019 23:14:38 -0400 Subject: bcachefs: Add iter->idx Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 27 ++++++++++----------------- fs/bcachefs/btree_iter.h | 43 ++++++++++++++++--------------------------- fs/bcachefs/btree_types.h | 2 ++ 3 files changed, 28 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 34d4ce32c2a2..fef5b04440b3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1606,24 +1606,12 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ -static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, - struct btree_iter *iter) -{ - ssize_t idx = iter - trans->iters; - - EBUG_ON(idx < 0 || idx >= trans->nr_iters); - EBUG_ON(!(trans->iters_linked & (1ULL << idx))); - - return idx; -} - int bch2_trans_iter_put(struct btree_trans *trans, struct btree_iter *iter) { - ssize_t idx = btree_trans_iter_idx(trans, iter); int ret = btree_iter_err(iter); - trans->iters_live &= ~(1ULL << idx); + trans->iters_live &= ~(1ULL << iter->idx); return ret; } @@ -1643,7 +1631,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, { int ret = btree_iter_err(iter); - __bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter)); + __bch2_trans_iter_free(trans, iter->idx); return ret; } @@ -1652,8 +1640,7 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, { int ret = btree_iter_err(iter); - trans->iters_unlink_on_commit |= - 1ULL << btree_trans_iter_idx(trans, iter); + trans->iters_unlink_on_commit |= 1ULL << iter->idx; return ret; } @@ -1729,6 +1716,8 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) idx = trans->nr_iters++; BUG_ON(trans->nr_iters > trans->size); + + trans->iters[idx].idx = idx; got_slot: BUG_ON(trans->iters_linked & (1ULL << idx)); trans->iters_linked |= 1ULL << idx; @@ -1826,6 +1815,7 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; + unsigned offset = offsetof(struct btree_iter, trans); int i, idx; idx = btree_trans_iter_alloc(trans); @@ -1837,7 +1827,10 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, trans->iters_unlink_on_restart |= 1ULL << idx; iter = &trans->iters[idx]; - *iter = *src; + + memcpy((void *) iter + offset, + (void *) src + offset, + sizeof(*iter) - offset); for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(iter, i)) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 74eb5ed12ca0..800320966ff1 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -36,15 +36,11 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ static inline struct btree_iter * -__trans_next_iter(struct btree_trans *trans, struct btree_iter *iter) +__trans_next_iter(struct btree_trans *trans, unsigned idx) { - unsigned idx; + EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); - /* XXX expensive pointer subtraction: */ - - for (idx = iter - trans->iters; - idx < trans->nr_iters; - idx++) + for (; idx < trans->nr_iters; idx++) if (trans->iters_linked & (1ULL << idx)) return &trans->iters[idx]; @@ -52,9 +48,9 @@ __trans_next_iter(struct btree_trans *trans, struct btree_iter *iter) } #define 
trans_for_each_iter(_trans, _iter) \ - for (_iter = (_trans)->iters; \ - (_iter = __trans_next_iter((_trans), _iter)); \ - _iter++) + for (_iter = __trans_next_iter((_trans), 0); \ + (_iter); \ + _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) @@ -73,30 +69,23 @@ static inline bool __iter_has_node(const struct btree_iter *iter, static inline struct btree_iter * __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, - struct btree_iter *iter) + unsigned idx) { - unsigned idx; - - /* XXX expensive pointer subtraction: */ - - for (idx = iter - trans->iters; - idx < trans->nr_iters; - idx++) { - if (!(trans->iters_linked & (1ULL << idx))) - continue; + EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); - iter = &trans->iters[idx]; - if (__iter_has_node(iter, b)) - return iter; - } + for (; idx < trans->nr_iters; idx++) + if ((trans->iters_linked & (1ULL << idx)) && + __iter_has_node(&trans->iters[idx], b)) + return &trans->iters[idx]; return NULL; } #define trans_for_each_iter_with_node(_trans, _b, _iter) \ - for (_iter = (_trans)->iters; \ - (_iter = __trans_next_iter_with_node((_trans), (_b), _iter));\ - _iter++) + for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ + (_iter); \ + _iter = __trans_next_iter_with_node((_trans), (_b), \ + (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_iter_verify(struct btree_iter *, struct btree *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7ef1feba2817..bd6852d951ea 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -210,6 +210,8 @@ enum btree_iter_uptodate { * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { + u8 idx; + struct btree_trans *trans; struct bpos pos; -- cgit From bf7b87a4a92fac3e97228ce94c35d4f78c85417e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 00:07:24 -0400 Subject: bcachefs: traverse all iterators on transaction restart Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 30 ++++++++++++++++++------------ fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_update_leaf.c | 11 ++++------- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fef5b04440b3..c8122be21029 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -954,9 +954,9 @@ static void btree_iter_up(struct btree_iter *iter) int __must_check __bch2_btree_iter_traverse(struct btree_iter *); -static int btree_iter_traverse_error(struct btree_iter *iter, int ret) +static int __btree_iter_traverse_all(struct btree_trans *trans, + struct btree_iter *iter, int ret) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; u8 sorted[BTREE_ITER_MAX]; unsigned i, nr_sorted = 0; @@ -973,10 +973,7 @@ static int btree_iter_traverse_error(struct btree_iter *iter, int ret) retry_all: bch2_btree_trans_unlock(trans); - if (ret != -ENOMEM && ret != -EINTR) - goto io_error; - - if (ret == -ENOMEM) { + if (unlikely(ret == -ENOMEM)) { struct closure cl; closure_init_stack(&cl); @@ -987,6 +984,14 @@ retry_all: } while (ret); } + if (unlikely(ret == -EIO)) { + iter->flags |= BTREE_ITER_ERROR; + iter->l[iter->level].b = BTREE_ITER_NOT_END; + goto out; + } + + BUG_ON(ret && ret != -EINTR); + /* Now, redo traversals in correct order: */ for (i = 0; i < nr_sorted; i++) { iter = &trans->iters[sorted[i]]; @@ 
-1003,12 +1008,11 @@ retry_all: out: bch2_btree_cache_cannibalize_unlock(c); return ret; -io_error: - BUG_ON(ret != -EIO); +} - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = BTREE_ITER_NOT_END; - goto out; +int bch2_btree_iter_traverse_all(struct btree_trans *trans) +{ + return __btree_iter_traverse_all(trans, NULL, 0); } static unsigned btree_iter_up_until_locked(struct btree_iter *iter, @@ -1096,7 +1100,7 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) ret = __bch2_btree_iter_traverse(iter); if (unlikely(ret)) - ret = btree_iter_traverse_error(iter, ret); + ret = __btree_iter_traverse_all(iter->trans, iter, ret); BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans)); @@ -1923,6 +1927,8 @@ void __bch2_trans_begin(struct btree_trans *trans) trans->iters_unlink_on_commit = 0; trans->nr_updates = 0; trans->mem_top = 0; + + bch2_btree_iter_traverse_all(trans); } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 800320966ff1..291c805e3cc5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -135,6 +135,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8b043d3c19ad..a8ac68b94e25 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -708,14 +708,11 @@ int bch2_trans_commit_error(struct btree_trans *trans, } if (ret == -EINTR) { - trans_for_each_update_iter(trans, i) { - int ret2 = bch2_btree_iter_traverse(i->iter); - if (ret2) { - trans_restart(" (traverse)"); - return ret2; - } + int ret2 = bch2_btree_iter_traverse_all(trans); - BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + if (ret2) { + trans_restart(" (traverse)"); + return ret2; } /* -- cgit From 4afe700060799d0ccf42c9881f3ebfab96953fd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 00:32:38 -0400 Subject: bcachefs: Unlink not-touched iters on successful transaction commit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- fs/bcachefs/btree_update_leaf.c | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c8122be21029..ad7858d77a58 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1892,6 +1892,7 @@ int bch2_trans_unlock(struct btree_trans *trans) inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) { iters &= trans->iters_linked; + iters &= ~trans->iters_live; while (iters) { unsigned idx = __ffs64(iters); @@ -1919,9 +1920,10 @@ void __bch2_trans_begin(struct btree_trans *trans) iters_to_unlink |= trans->iters_unlink_on_restart; iters_to_unlink |= trans->iters_unlink_on_commit; + trans->iters_live = 0; + bch2_trans_unlink_iters(trans, iters_to_unlink); - trans->iters_live = 0; trans->iters_touched = 0; trans->iters_unlink_on_restart = 0; trans->iters_unlink_on_commit = 0; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a8ac68b94e25..d345f5a14fde 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ 
b/fs/bcachefs/btree_update_leaf.c @@ -856,6 +856,10 @@ out_noupdates: BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); + if (!ret) { + bch2_trans_unlink_iters(trans, ~trans->iters_touched); + trans->iters_touched = 0; + } trans->nr_updates = 0; return ret; -- cgit From 76a0537bf1286f56266fb899014505cba1e332f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 00:34:25 -0400 Subject: bcachefs: Sort updates in bch2_trans_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 12 +++--------- fs/bcachefs/btree_update_leaf.c | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 879e7ae39586..75ed02874767 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -99,19 +99,13 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); -static inline void -bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) -{ - BUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - trans->updates[trans->nr_updates++] = entry; -} - int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, u64 *, unsigned); +struct btree_insert_entry *bch2_trans_update(struct btree_trans *, + struct btree_insert_entry); + #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d345f5a14fde..d4d4329767da 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -809,8 +809,6 @@ int bch2_trans_commit(struct btree_trans *trans, trans->journal_seq = journal_seq; trans->flags = flags; - bubble_sort(trans->updates, trans->nr_updates, btree_trans_cmp); - trans_for_each_update(trans, i) btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); @@ -871,6 +869,26 @@ err: goto out; } +struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + struct btree_insert_entry *i; + + BUG_ON(trans->nr_updates >= trans->nr_iters + 4); + + for (i = trans->updates; + i < trans->updates + trans->nr_updates; + i++) + if (btree_trans_cmp(entry, *i) < 0) + break; + + memmove(&i[1], &i[0], + (void *) &trans->updates[trans->nr_updates] - (void *) i); + trans->nr_updates++; + *i = entry; + return i; +} + int bch2_btree_delete_at(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) { -- cgit From f13f5a8c836f55c6b7b0a58cb26245282f67527c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 01:51:47 -0400 Subject: bcachefs: move some checks to expensive_debug_checks Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bset.c | 13 +++++++------ fs/bcachefs/btree_iter.c | 6 ++++++ fs/bcachefs/extents.c | 8 ++++++-- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a815d7a488a6..a2d8e37e7eb6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -257,6 +257,8 @@ do { \ BCH_DEBUG_PARAM(expensive_debug_checks, \ "Enables various runtime debugging checks that " \ "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ 
BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index ac84aac4a263..68442a26756f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1023,7 +1023,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { BUG_ON(ret >= orig_k); for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); @@ -1644,10 +1644,11 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - bch2_btree_node_iter_verify(iter, b); - bch2_btree_node_iter_next_check(iter, b); -#endif + if (btree_keys_expensive_checks(b)) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } + __bch2_btree_node_iter_advance(iter, b); } @@ -1710,7 +1711,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; out: - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { struct btree_node_iter iter2 = *iter; if (prev) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ad7858d77a58..bc9d8444e220 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -429,6 +429,9 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree_node_iter tmp = l->iter; struct bkey_packed *k; + if (!debug_check_iterators(iter->trans->c)) + return; + if (iter->uptodate > BTREE_ITER_NEED_PEEK) return; @@ -475,6 +478,9 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; + if (!debug_check_iterators(iter->trans->c)) + return; + trans_for_each_iter_with_node(iter->trans, b, linked) __bch2_btree_iter_verify(linked, b); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ce46417b07a0..2e7c3e82f03b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -788,7 +788,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *, struct bkey_packed *, bool); -static void verify_extent_nonoverlapping(struct btree *b, +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, struct btree_node_iter *_iter, struct bkey_i *insert) { @@ -797,6 +798,9 @@ static void verify_extent_nonoverlapping(struct btree *b, struct bkey_packed *k; struct bkey uk; + if (!expensive_debug_checks(c)) + return; + iter = *_iter; k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); BUG_ON(k && @@ -847,7 +851,7 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(l->b, &l->iter, insert); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); node_iter = l->iter; k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); -- cgit From 05b3d5ac1f4ea75defa4e133ca23a953b4a80c23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 03:08:40 -0400 Subject: bcachefs: simplify gc locking a bit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index aa8ac7d661ee..5abd7ac5bb78 100644 --- a/fs/bcachefs/btree_gc.c 
+++ b/fs/bcachefs/btree_gc.c @@ -480,12 +480,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->usage[1] = NULL; } - percpu_down_write(&c->mark_lock); - free_percpu(c->usage[1]); c->usage[1] = NULL; - - percpu_up_write(&c->mark_lock); } static void bch2_gc_done(struct bch_fs *c, bool initial) @@ -526,8 +522,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - percpu_down_write(&c->mark_lock); - { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); @@ -635,8 +629,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) } } - percpu_up_write(&c->mark_lock); - #undef copy_fs_field #undef copy_dev_field #undef copy_bucket_field @@ -649,8 +641,6 @@ static int bch2_gc_start(struct bch_fs *c) struct bch_dev *ca; unsigned i; - percpu_down_write(&c->mark_lock); - /* * indicate to stripe code that we need to allocate for the gc stripes * radix tree, too @@ -661,8 +651,6 @@ static int bch2_gc_start(struct bch_fs *c) c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), sizeof(u64), GFP_KERNEL); - percpu_up_write(&c->mark_lock); - if (!c->usage[1]) return -ENOMEM; @@ -685,8 +673,6 @@ static int bch2_gc_start(struct bch_fs *c) } } - percpu_down_write(&c->mark_lock); - for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -703,8 +689,6 @@ static int bch2_gc_start(struct bch_fs *c) } }; - percpu_up_write(&c->mark_lock); - return bch2_ec_mem_alloc(c, true); } @@ -737,7 +721,10 @@ int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) down_write(&c->gc_lock); again: + percpu_down_write(&c->mark_lock); ret = bch2_gc_start(c); + percpu_up_write(&c->mark_lock); + if (ret) goto out; @@ -762,7 +749,11 @@ out: bch_info(c, "Fixed gens, restarting mark and sweep:"); clear_bit(BCH_FS_FIXED_GENS, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + percpu_down_write(&c->mark_lock); bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + goto again; } @@ -770,6 +761,8 @@ out: ret = -EINVAL; } + percpu_down_write(&c->mark_lock); + if (!ret) bch2_gc_done(c, initial); @@ -777,6 +770,8 @@ out: __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + up_write(&c->gc_lock); trace_gc_end(c); -- cgit From 6543f5620d81cfa7b52d00c7ade0f037beb7a71e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 03:28:59 -0400 Subject: bcachefs: Handle fsck errors at runtime better Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 08e79166dae4..f0a44101b7e7 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -67,10 +67,20 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, bool fix = false, print = true, suppressing = false; char _buf[sizeof(s->buf)], *buf = _buf; - mutex_lock(&c->fsck_error_lock); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) - goto print; + if (c->opts.errors == BCH_ON_ERROR_CONTINUE && + flags & FSCK_CAN_FIX) + return FSCK_ERR_FIX; + + bch2_inconsistent_error(c); + return FSCK_ERR_EXIT; + } + + mutex_lock(&c->fsck_error_lock); list_for_each_entry(s, 
&c->fsck_errors, list) if (s->fmt == fmt) -- cgit From ccaa61c9f6a9db28f2e0b480927f6f2c97ff72af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 04:49:05 -0400 Subject: bcachefs: fix initial gc Buckets weren't being marked as dirty Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5abd7ac5bb78..b5a4ac9a4176 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -559,12 +559,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) struct bucket_array *src = __bucket_array(ca, 1); size_t b; - if (initial) { - memcpy(dst, src, - sizeof(struct bucket_array) + - sizeof(struct bucket) * dst->nbuckets); - } - for (b = 0; b < src->nbuckets; b++) { copy_bucket_field(gen); copy_bucket_field(data_type); -- cgit From d5f70c1f2750f0917025ea6b1ee0591cd65a6097 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 03:40:39 -0400 Subject: bcachefs: Write out alloc info more carefully In flight btree updates could update alloc info until they're flushed - so we have to try writing again after they've been flushed. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index dd1496af9a06..369c533e677b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -175,7 +175,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; bool wrote; - unsigned i; + unsigned i, clean_passes = 0; int ret; bch2_rebalance_stop(c); @@ -195,15 +195,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) goto allocator_not_running; do { - ret = bch2_alloc_write(c, false, &wrote); + ret = bch2_stripes_write(c, &wrote); if (ret) { - bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + bch2_fs_inconsistent(c, "error writing out stripes"); break; } - ret = bch2_stripes_write(c, &wrote); + ret = bch2_alloc_write(c, false, &wrote); if (ret) { - bch2_fs_inconsistent(c, "error writing out stripes"); + bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); break; } @@ -221,7 +221,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); - } while (wrote); + + clean_passes = wrote ? 
0 : clean_passes + 1; + } while (clean_passes < 2); allocator_not_running: for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); -- cgit From 3a0e06db71f65ae0e7a98a1db170339d40abacdc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Dec 2022 22:44:56 -0500 Subject: bcachefs: Assorted preemption fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ fs/bcachefs/btree_gc.c | 7 +------ fs/bcachefs/buckets.c | 13 +++++++++++-- fs/bcachefs/journal.c | 4 ---- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3549f0f54624..b5f5c223e008 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -291,8 +291,10 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } + percpu_down_write(&c->mark_lock); for_each_member_device(ca, c, i) bch2_dev_usage_from_buckets(c, ca); + percpu_up_write(&c->mark_lock); mutex_lock(&c->bucket_clock[READ].lock); for_each_member_device(ca, c, i) { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b5a4ac9a4176..6ae03254c281 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -354,8 +354,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (c) { lockdep_assert_held(&c->sb_lock); percpu_down_read(&c->mark_lock); - } else { - preempt_disable(); } for (i = 0; i < layout->nr_superblocks; i++) { @@ -377,11 +375,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, gc_phase(GC_PHASE_SB), flags); } - if (c) { + if (c) percpu_up_read(&c->mark_lock); - } else { - preempt_enable(); - } } static void bch2_mark_superblocks(struct bch_fs *c) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2fbcd85d9e75..ecb0ca3f3a8f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -393,14 +393,19 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) struct bucket_array *buckets; struct bucket *g; - percpu_down_read(&c->mark_lock); + /* + * This is only called during startup, before there's any multithreaded + * access to c->usage: + */ + preempt_disable(); fs_usage = this_cpu_ptr(c->usage[0]); + preempt_enable(); + buckets = bucket_array(ca); for_each_bucket(g, buckets) if (g->mark.data_type) bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); - percpu_up_read(&c->mark_lock); } #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ @@ -513,8 +518,12 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { + preempt_disable(); + do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, ca, b, owned_by_allocator); + + preempt_enable(); } static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c0dcc0ff65ce..dbecb4072af0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -835,8 +835,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { percpu_down_read(&c->mark_lock); spin_lock(&c->journal.lock); - } else { - preempt_disable(); } pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; @@ -866,8 +864,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); - } else { - preempt_enable(); } if (!new_fs) -- cgit From a6d90385e6915429a891408824b8c72219a139f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Dec 2022 22:45:11 -0500 Subject: bcachefs: (invalidate|release)_folio fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d865081d4a21..c8f6104553aa 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -736,9 +736,6 @@ out: void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - EBUG_ON(!PageLocked(&folio->page)); - EBUG_ON(folio_test_writeback(folio)); - if (offset || length < folio_size(folio)) return; @@ -751,7 +748,7 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) EBUG_ON(!PageLocked(&folio->page)); EBUG_ON(folio_test_writeback(folio)); - if (folio_test_dirty(folio)) + if (folio_test_dirty(folio) || folio_test_writeback(folio)) return false; bch2_clear_page_bits(&folio->page); -- cgit From 0bc166ff564f9e2b0bfc7a0c1a92472a600f901d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 09:34:55 -0400 Subject: bcachefs: Track whether filesystem has errors in superblock Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 11 +---------- fs/bcachefs/bcachefs_format.h | 4 +++- fs/bcachefs/error.c | 32 ++++++++++++++------------------ fs/bcachefs/recovery.c | 5 ++++- fs/bcachefs/super-io.c | 6 ++++-- fs/bcachefs/super.c | 1 - 6 files changed, 26 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a2d8e37e7eb6..d8a9d4962d70 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -473,14 +473,6 @@ struct bch_dev { struct io_count __percpu *io_done; }; -/* - * Flag bits for what phase of startup/shutdown the cache set is at, how we're - * shutting down, etc.: - * - * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching - * all the backing devices first (their cached data gets invalidated, and they - * won't automatically reattach). 
- */ enum { /* startup: */ BCH_FS_ALLOC_READ_DONE, @@ -497,11 +489,10 @@ enum { /* errors: */ BCH_FS_ERROR, + BCH_FS_ERRORS_FIXED, /* misc: */ BCH_FS_BDEV_MOUNTED, - BCH_FS_FSCK_FIXED_ERRORS, - BCH_FS_FSCK_UNFIXED_ERRORS, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9a3ca6fa30b7..646910a6a4bb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1241,7 +1241,9 @@ LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -/* 60-64 unused */ +LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); + +/* 61-64 unused */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index f0a44101b7e7..1aaff44e18cf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -72,12 +72,9 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, vprintk(fmt, args); va_end(args); - if (c->opts.errors == BCH_ON_ERROR_CONTINUE && - flags & FSCK_CAN_FIX) - return FSCK_ERR_FIX; - - bch2_inconsistent_error(c); - return FSCK_ERR_EXIT; + return bch2_inconsistent_error(c) + ? FSCK_ERR_EXIT + : FSCK_ERR_FIX; } mutex_lock(&c->fsck_error_lock); @@ -110,11 +107,7 @@ print: if (c->opts.fix_errors == FSCK_OPT_EXIT) { bch_err(c, "%s, exiting", buf); - mutex_unlock(&c->fsck_error_lock); - return FSCK_ERR_EXIT; - } - - if (flags & FSCK_CAN_FIX) { + } else if (flags & FSCK_CAN_FIX) { if (c->opts.fix_errors == FSCK_OPT_ASK) { printk(KERN_ERR "%s: fix?", buf); fix = ask_yn(); @@ -142,13 +135,16 @@ print: mutex_unlock(&c->fsck_error_lock); - set_bit(fix - ? BCH_FS_FSCK_FIXED_ERRORS - : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags); - - return fix ? FSCK_ERR_FIX - : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE - : FSCK_ERR_EXIT; + if (fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + return FSCK_ERR_FIX; + } else { + set_bit(BCH_FS_ERROR, &c->flags); + return c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE) + ? 
FSCK_ERR_EXIT + : FSCK_ERR_IGNORE; + } } void bch2_flush_fsck_errs(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 93c4d5887e8b..68415df8565b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -365,8 +365,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); } - if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) + if (c->opts.fsck && + !test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + } mutex_unlock(&c->sb_lock); if (enabled_qtypes(c)) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f504743fff4d..9fd77e57cafe 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -707,6 +707,9 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -719,8 +722,7 @@ int bch2_write_super(struct bch_fs *c) } } - if (c->opts.nochanges || - test_bit(BCH_FS_ERROR, &c->flags)) + if (c->opts.nochanges) goto out; for_each_online_member(ca, c, i) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 369c533e677b..f8e921b3fb8d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -843,7 +843,6 @@ err: } BUG_ON(!err); - set_bit(BCH_FS_ERROR, &c->flags); goto out; } -- cgit From 9d455b24be5239df23757042703419de9351e461 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 14:34:10 -0400 Subject: bcachefs: make sure to use BTREE_INSERT_LAZY_RW in fsck Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c4d9d2761cdc..79e4b1b6a556 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -91,7 +91,9 @@ static int reattach_inode(struct bch_fs *c, bch2_inode_pack(&packed, lostfound_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error %i reattaching inode %llu while updating lost+found", ret, inum); @@ -101,7 +103,8 @@ static int reattach_inode(struct bch_fs *c, ret = bch2_dirent_create(c, lostfound_inode->bi_inum, &lostfound_hash_info, DT_DIR, &name, inum, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error %i reattaching inode %llu while creating new dirent", ret, inum); @@ -483,7 +486,8 @@ static int check_extents(struct bch_fs *c) ret = bch2_btree_insert(c, BTREE_ID_INODES, &p.inode.k_i, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error in fs gc: error %i " "updating inode", ret); @@ -751,7 +755,9 @@ create_root: bch2_inode_pack(&packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } /* Get lost+found, create if it doesn't exist: */ @@ -795,7 +801,9 @@ create_lostfound: bch2_inode_pack(&packed, root_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) return ret; @@ -809,7 +817,8 @@ create_lostfound: ret = 
bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, &lostfound, lostfound_inode->bi_inum, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) return ret; -- cgit From 58a46dc5a2d4073f48a9110a9c343bc2d68a6e88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 14:29:47 -0400 Subject: bcachefs: allow journal replay on ro mount Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f8e921b3fb8d..8c31a9a67eee 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -432,9 +432,6 @@ int bch2_fs_read_write_early(struct bch_fs *c) { lockdep_assert_held(&c->state_lock); - if (c->opts.read_only) - return -EROFS; - return __bch2_fs_read_write(c, true); } -- cgit From a2b6b0729e8b4d06d1cf1baf1a7976b54c872aeb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 14:42:34 -0400 Subject: bcachefs: add missing bch2_btree_iter_node_drop() call Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 -------- fs/bcachefs/btree_update_interior.c | 5 +++++ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bc9d8444e220..bb898911bdc6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -819,14 +819,6 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; unsigned level = b->level; - /* caller now responsible for unlocking @b */ - - BUG_ON(iter->l[level].b != b); - BUG_ON(!btree_node_intent_locked(iter, level)); - - iter->l[level].b = BTREE_ITER_NOT_END; - mark_btree_node_unlocked(iter, level); trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { __btree_node_unlock(linked, level); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6e9a87c6b1be..4931089e2c6d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1429,6 +1429,7 @@ static void btree_split(struct btree_update *as, struct btree *b, /* Successful split, update the iterator to point to the new nodes: */ + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); if (n3) bch2_btree_iter_node_replace(iter, n3); @@ -1740,7 +1741,10 @@ retry: bch2_open_buckets_put(c, &n->ob); + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(iter, m); + bch2_btree_iter_node_replace(iter, n); bch2_btree_iter_verify(iter, n); @@ -1838,6 +1842,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_open_buckets_put(c, &n->ob); + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); -- cgit From 4c1c1e395373a8b47dee91a78f708176794d04f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2019 18:40:01 -0400 Subject: bcachefs: fix bch2_trans_unlock() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bb898911bdc6..fbbb7428c592 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1705,7 +1705,7 @@ void bch2_trans_preload_iters(struct btree_trans *trans) static int btree_trans_iter_alloc(struct btree_trans
*trans) { - unsigned idx = ffz(trans->iters_linked); + unsigned idx = __ffs64(~trans->iters_linked); if (idx < trans->nr_iters) goto got_slot; @@ -1871,17 +1871,17 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, int bch2_trans_unlock(struct btree_trans *trans) { - unsigned iters = trans->iters_linked; + u64 iters = trans->iters_linked; int ret = 0; while (iters) { - unsigned idx = __ffs(iters); + unsigned idx = __ffs64(iters); struct btree_iter *iter = &trans->iters[idx]; ret = ret ?: btree_iter_err(iter); __bch2_btree_iter_unlock(iter); - iters ^= 1 << idx; + iters ^= 1ULL << idx; } return ret; -- cgit From 7b512638e03a1d302f27997c011cc10a9906f04e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 19:13:54 -0400 Subject: bcachefs: Refactor bch2_fs_recovery() Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 314 ++++++++++++++++++++++++++++--------------------- 1 file changed, 179 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 68415df8565b..0fa952fa1053 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,6 +52,84 @@ found: return k; } +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + + if (!c->sb.clean) + return NULL; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + if (!sb_clean) { + mutex_unlock(&c->sb_lock); + bch_err(c, "superblock marked clean but clean section not present"); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-ENOMEM); + } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); + + mutex_unlock(&c->sb_lock); + + return clean; +} + static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry *entry) { @@ -106,49 +184,85 @@ static int journal_replay_entry_early(struct bch_fs *c, return ret; } -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) +static int load_journal_metadata(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct list_head *journal) { - unsigned i; - struct 
bch_sb_field_clean *clean = *cleanp; - int ret = 0; + struct jset_entry *entry; + int ret; - if (!clean || !j) - return 0; + if (clean) { + c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } else { + struct journal_replay *i = + list_last_entry(journal, struct journal_replay, list); + + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + + list_for_each_entry(i, journal, list) + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); + bch2_fs_usage_initialize(c); + + return 0; +} + +static int read_btree_roots(struct bch_fs *c) +{ + unsigned i; + int ret = 0; for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; + struct btree_root *r = &c->btree_roots[i]; - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); + if (!r->alive) + continue; - if (!k1 && !k2) + if (i == BTREE_ID_ALLOC && + test_reconstruct_alloc(c)) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); continue; + } - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || - l1 != l2, c, - "superblock btree root doesn't match journal after clean shutdown"); + + if (r->error) { + __fsck_err(c, i == BTREE_ID_ALLOC + ? FSCK_CAN_IGNORE : 0, + "invalid btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { + __fsck_err(c, i == BTREE_ID_ALLOC + ? 
FSCK_CAN_IGNORE : 0, + "error reading btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + } } + + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); fsck_err: return ret; } @@ -186,38 +300,11 @@ static bool journal_empty(struct list_head *journal) int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; - struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL; - struct jset_entry *entry; + struct bch_sb_field_clean *clean; LIST_HEAD(journal); - struct jset *j = NULL; - unsigned i; - bool run_gc = c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); int ret; - mutex_lock(&c->sb_lock); - if (!c->replicas.entries) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - - if (c->sb.clean) - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - if (sb_clean) { - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - ret = -ENOMEM; - mutex_unlock(&c->sb_lock); - goto err; - } - - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); - } - mutex_unlock(&c->sb_lock); - + clean = read_superblock_clean(c); if (clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); @@ -227,78 +314,29 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - j = &list_entry(journal.prev, struct journal_replay, list)->j; + ret = verify_superblock_clean(c, &clean, + &list_last_entry(&journal, struct journal_replay, + list)->j); + if (ret) + goto err; } else { ret = bch2_journal_set_seq(c, le64_to_cpu(clean->journal_seq), le64_to_cpu(clean->journal_seq)); - BUG_ON(ret); + if (ret) + goto err; } - ret = verify_superblock_clean(c, &clean, j); - if (ret) - goto err; - fsck_err_on(clean && !journal_empty(&journal), c, "filesystem marked clean but journal not empty"); - err = "insufficient memory"; - if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - ret = journal_replay_entry_early(c, entry); - if (ret) - goto err; - } - } else { - struct journal_replay *i; - - c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - list_for_each_entry(i, &journal, list) - vstruct_for_each(&i->j, entry) { - ret = journal_replay_entry_early(c, entry); - if (ret) - goto err; - } - } - - bch2_fs_usage_initialize(c); - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - - if (!r->alive) - continue; - - err = "invalid btree root pointer"; - ret = -1; - if (r->error) - goto err; - - if (i == BTREE_ID_ALLOC && - test_reconstruct_alloc(c)) - continue; - - err = "error reading btree root"; - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - if (i != BTREE_ID_ALLOC) - goto err; - - mustfix_fsck_err(c, "error reading btree root"); - run_gc = true; - } - } + ret = load_journal_metadata(c, clean, &journal); + if (ret) + goto err; - for (i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b) - bch2_btree_root_alloc(c, i); + ret = read_btree_roots(c); + if (ret) + goto err; err = "error reading allocation information"; ret = bch2_alloc_read(c, &journal); @@ -313,7 +351,14 @@ int 
bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if (run_gc) { + if (!c->replicas.entries) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_gc(c, &journal, true); @@ -332,9 +377,6 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.encryption_type && !c->sb.clean) atomic64_add(1 << 16, &c->key_version); - if (c->opts.noreplay) - goto out; - /* * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - @@ -342,6 +384,9 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch2_fs_journal_start(&c->journal); + if (c->opts.noreplay) + goto out; + bch_verbose(c, "starting journal replay:"); err = "journal replay failed"; ret = bch2_journal_replay(c, &journal); @@ -357,6 +402,14 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas:"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } + mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { if (c->sb.version < bcachefs_metadata_version_new_versioning) @@ -371,15 +424,6 @@ int bch2_fs_recovery(struct bch_fs *c) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); } mutex_unlock(&c->sb_lock); - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas:"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - out: bch2_journal_entries_free(&journal); kfree(clean); -- cgit From 36e916e13b694e18d2928d0bda54fb5805051129 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 19:49:17 -0400 Subject: bcachefs: Caller now responsible for calling mark_key for gc Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 ++----- fs/bcachefs/btree_update_interior.c | 40 ++++++++++++++++++++----------------- fs/bcachefs/btree_update_leaf.c | 10 +++++++++- fs/bcachefs/buckets.c | 37 ++++++++++++---------------------- fs/bcachefs/buckets.h | 10 +++++----- fs/bcachefs/ec.c | 5 +---- fs/bcachefs/journal_io.c | 3 ++- 7 files changed, 54 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6ae03254c281..84ed3377f86e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -117,7 +117,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - struct gc_pos pos = { 0 }; unsigned flags = BCH_BUCKET_MARK_GC| (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); @@ -174,7 +173,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags); + bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -395,7 +394,6 @@ static void bch2_mark_superblocks(struct bch_fs *c) /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { - struct gc_pos pos = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -405,8 +403,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - true, 0, - pos, NULL, 0, + true, 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4931089e2c6d..35472cf5e9e0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -162,7 +162,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - struct gc_pos pos = { 0 }; for (d = as->pending; d < as->pending + as->nr_pending; d++) if (!bkey_cmp(k.k->p, d->key.k.p) && @@ -190,18 +189,12 @@ found: * to cancel out one of mark and sweep's markings if necessary: */ - /* - * bch2_mark_key() compares the current gc pos to the pos we're - * moving this reference from, hence one comparison here: - */ if (gc_pos_cmp(c->gc_pos, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) - bch2_mark_key_locked(c, - bkey_i_to_s_c(&d->key), - false, 0, pos, - NULL, 0, BCH_BUCKET_MARK_GC); + bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), + false, 0, NULL, 0, BCH_BUCKET_MARK_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -273,8 +266,11 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, bch2_mark_key(c, bkey_i_to_s_c(&pending->key), false, 0, - gc_phase(GC_PHASE_PENDING_DELETE), NULL, 0, 0); + + if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + false, 0, NULL, 0, BCH_BUCKET_MARK_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1079,9 +1075,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, - gc_pos_btree_root(b->btree_id), - fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + true, 0, NULL, 0, + BCH_BUCKET_MARK_GC); if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, @@ -1173,8 +1171,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, - gc_pos_btree_node(b), fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + + if (gc_visited(c, gc_pos_btree_node(b))) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + true, 0, NULL, 0, BCH_BUCKET_MARK_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1994,9 +1995,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = 
bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, - gc_pos_btree_root(b->btree_id), - fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + true, 0, NULL, 0, + BCH_BUCKET_MARK_GC); + bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), fs_usage); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d4d4329767da..6fa9fa5768aa 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" @@ -602,10 +603,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } trans_for_each_update_iter(trans, i) - bch2_mark_update(trans, i, fs_usage); + bch2_mark_update(trans, i, fs_usage, 0); if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); + if (unlikely(c->gc_pos.phase)) { + trans_for_each_update_iter(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + BCH_BUCKET_MARK_GC); + } + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); out: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ecb0ca3f3a8f..495ef4732602 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -940,12 +940,13 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - bool inserting, s64 sectors, - struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) +int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; int ret = 0; preempt_disable(); @@ -997,21 +998,8 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) -{ - return do_mark_fn(__bch2_mark_key, c, pos, flags, - k, inserting, sectors, fs_usage, - journal_seq, flags); -} - int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, - struct gc_pos pos, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -1019,7 +1007,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, percpu_down_read(&c->mark_lock); ret = bch2_mark_key_locked(c, k, inserting, sectors, - pos, fs_usage, journal_seq, flags); + fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); return ret; @@ -1027,13 +1015,13 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, void bch2_mark_update(struct btree_trans *trans, struct btree_insert_entry *insert, - struct bch_fs_usage *fs_usage) + struct bch_fs_usage *fs_usage, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; if (!btree_node_type_needs_gc(iter->btree_id)) @@ -1043,7 +1031,7 @@ void bch2_mark_update(struct btree_trans *trans, bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - 
bkey_start_offset(&insert->k->k), - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, flags); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { @@ -1076,7 +1064,8 @@ void bch2_mark_update(struct btree_trans *trans, BUG_ON(sectors <= 0); bch2_mark_key_locked(c, k, true, sectors, - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, + flags); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -1087,7 +1076,7 @@ void bch2_mark_update(struct btree_trans *trans, } bch2_mark_key_locked(c, k, false, sectors, - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, flags); bch2_btree_node_iter_advance(&node_iter, b); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e34c9d24dc38..6af8b418b1e3 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -246,16 +246,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_NOATOMIC (1 << 1) int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, - bool, s64, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); + bool, s64, struct bch_fs_usage *, + u64, unsigned); int bch2_mark_key(struct bch_fs *, struct bkey_s_c, - bool, s64, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); + bool, s64, struct bch_fs_usage *, + u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *); + struct bch_fs_usage *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5fc0025e66bf..75fe0c28fa22 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1232,10 +1232,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) { - - struct gc_pos pos = { 0 }; - - bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0); + bch2_mark_key(c, k, true, 0, NULL, 0, 0); } int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1bb627c05188..1293bb66e62c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -876,8 +876,9 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) * but - there are other correctness issues if btree gc were to run * before journal replay finishes */ + BUG_ON(c->gc_pos.phase); + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - gc_pos_btree_node(iter->l[0].b), NULL, 0, 0); bch2_trans_exit(&trans); -- cgit From 6bd1305735bc4346e0ca6cc0ff27517e8bab8f0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2019 17:37:30 -0400 Subject: bcachefs: Fsck locking improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 19 +++++++++---- fs/bcachefs/dirent.h | 1 + fs/bcachefs/fsck.c | 80 +++++++++++++++++++++++++++++++--------------------- fs/bcachefs/inode.c | 35 ++++++++++++----------- fs/bcachefs/inode.h | 5 ++-- 5 files changed, 83 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 672a94936179..4479a9f55ddf 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -329,17 +329,18 @@ out: return inum; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { 
- struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); + iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0); + if (IS_ERR(iter)) + return PTR_ERR(iter); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(dir_inum, 0), 0, k) { + for_each_btree_key_continue(iter, 0, k) { if (k.k->p.inode > dir_inum) break; @@ -348,11 +349,17 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) break; } } - bch2_trans_exit(&trans); + bch2_trans_iter_put(trans, iter); return ret; } +int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +{ + return bch2_trans_do(c, NULL, 0, + bch2_empty_dir_trans(&trans, dir_inum)); +} + int bch2_readdir(struct bch_fs *c, struct file *file, struct dir_context *ctx) { diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 7b47573dcc46..bc64718a7832 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -55,6 +55,7 @@ int bch2_dirent_rename(struct btree_trans *, u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); +int bch2_empty_dir_trans(struct btree_trans *, u64); int bch2_empty_dir(struct bch_fs *, u64); int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 79e4b1b6a556..661131d5a114 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -128,18 +128,21 @@ static struct inode_walker inode_walker_init(void) }; } -static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { - w->first_this_inode = inum != w->cur_inum; - w->cur_inum = inum; - - if (w->first_this_inode) { - int ret = bch2_inode_find_by_inum(c, inum, &w->inode); + if (inum != w->cur_inum) { + int ret = bch2_inode_find_by_inum_trans(trans, inum, + &w->inode); if (ret && ret != -ENOENT) return ret; - w->have_inode = !ret; + w->have_inode = !ret; + w->cur_inum = inum; + w->first_this_inode = true; + } else { + w->first_this_inode = false; } return 0; @@ -445,12 +448,15 @@ static int check_extents(struct bch_fs *c) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); bch_verbose(c, "checking extents"); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { - ret = walk_inode(c, &w, k.k->p.inode); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: + for_each_btree_key_continue(iter, 0, k) { + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -515,6 +521,8 @@ static int check_extents(struct bch_fs *c) } err: fsck_err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -537,21 +545,20 @@ static int check_dirents(struct bch_fs *c) bch_verbose(c, "checking dirents"); bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, - POS(BCACHEFS_ROOT_INO, 0), 0); - hash_check_init(&h); + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; u64 d_inum; - ret = walk_inode(c, &w, k.k->p.inode); + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -620,7 +627,7 @@ static int check_dirents(struct bch_fs *c) continue; } - ret = bch2_inode_find_by_inum(c, d_inum, &target); + ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); 
if (ret && ret != -ENOENT) break; @@ -671,6 +678,9 @@ static int check_dirents(struct bch_fs *c) hash_stop_chain(&trans, &h); err: fsck_err: + if (ret == -EINTR) + goto retry; + return bch2_trans_exit(&trans) ?: ret; } @@ -689,17 +699,16 @@ static int check_xattrs(struct bch_fs *c) bch_verbose(c, "checking xattrs"); - bch2_trans_init(&trans, c); + hash_check_init(&h); + bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); - - hash_check_init(&h); - +retry: for_each_btree_key_continue(iter, 0, k) { - ret = walk_inode(c, &w, k.k->p.inode); + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -722,6 +731,8 @@ static int check_xattrs(struct bch_fs *c) } err: fsck_err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -905,6 +916,7 @@ static int check_directory_structure(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); bch_verbose(c, "checking directory structure"); @@ -919,9 +931,8 @@ restart_dfs: } ret = path_down(&path, BCACHEFS_ROOT_INO); - if (ret) { - return ret; - } + if (ret) + goto err; while (path.nr) { next: @@ -983,14 +994,19 @@ up: path.nr--; } - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); +retry: + for_each_btree_key_continue(iter, 0, k) { if (k.k->type != KEY_TYPE_inode) continue; if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) continue; - if (!bch2_empty_dir(c, k.k->p.inode)) + ret = bch2_empty_dir_trans(&trans, k.k->p.inode); + if (ret == -EINTR) + goto retry; + if (!ret) continue; if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, @@ -1018,15 +1034,12 @@ up: memset(&path, 0, sizeof(path)); goto restart_dfs; } - -out: - kfree(dirs_done.bits); - kfree(path.entries); - return ret; err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - goto out; + kfree(dirs_done.bits); + kfree(path.entries); + return ret; } struct nlink { @@ -1070,6 +1083,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, int ret; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); @@ -1327,6 +1341,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, u64 nlinks_pos; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(range_start, 0), 0); @@ -1426,6 +1441,7 @@ static int check_inodes_fast(struct bch_fs *c) int ret = 0, ret2; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7be24865cc3f..8e7bec8ce542 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -444,31 +444,32 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) { - struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; int ret = -ENOENT; - bch2_trans_init(&trans, c); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode_nr, 0), BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, - POS(inode_nr, 0), BTREE_ITER_SLOTS, k) { - switch (k.k->type) 
{ - case KEY_TYPE_inode: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - default: - /* hole, not found */ - break; - } + k = bch2_btree_iter_peek_slot(iter); + if (k.k->type == KEY_TYPE_inode) + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - } + bch2_trans_iter_put(trans, iter); - return bch2_trans_exit(&trans) ?: ret; + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 07d7020f230d..ada639c06619 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -60,8 +60,9 @@ int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_rm(struct bch_fs *, u64); -int bch2_inode_find_by_inum(struct bch_fs *, u64, - struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { -- cgit From dcf77129749fea4f6608d310161be1650dc2a4dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2019 19:23:34 -0400 Subject: bcachefs: minor fsck fix Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 661131d5a114..9db01437315b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1240,12 +1240,10 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_UNLINKED) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu unlinked", - u.bi_inum); - + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { bch_verbose(c, "deleting inode %llu", u.bi_inum); ret = bch2_inode_rm(c, u.bi_inum); @@ -1255,12 +1253,10 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_size dirty", - u.bi_inum); - + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); /* @@ -1285,14 +1281,12 @@ static int check_inode(struct btree_trans *trans, do_update = true; } - if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) { + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { s64 sectors; - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_sectors dirty", - u.bi_inum); - bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); -- cgit From ac7f0d77c24a20998c150b58caff321a4bbcc828 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2019 20:38:37 -0400 Subject: bcachefs: ratelimit copygc warning Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/movinggc.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d8a9d4962d70..d8c487e33592 
100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -223,6 +223,8 @@ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4bf4cc33dbb1..78d9ca8bfc5e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -209,7 +209,8 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) - bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + bch_warn_ratelimited(c, + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", sectors_not_moved, sectors_to_move, buckets_not_moved, buckets_to_move); -- cgit From cccf4e6df36ffb4752b4c83efd0723281e629693 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2019 03:13:27 -0400 Subject: bcachefs: Convert gc errors to fsck errors Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 84ed3377f86e..4119f48281fb 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -476,25 +476,26 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } -static void bch2_gc_done(struct bch_fs *c, bool initial) +static int bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_dev *ca; bool verify = !initial || (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); unsigned i; + int ret = 0; #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ - bch_err(c, _msg ": got %llu, should be %llu, fixing"\ + fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ } #define copy_stripe_field(_f, _msg, ...) 
\ if (dst->_f != src->_f) { \ if (verify) \ - bch_err_ratelimited(c, "stripe %zu has wrong "_msg\ - ": got %u, should be %u, fixing", \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ dst_iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ @@ -503,8 +504,8 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (verify) \ - bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ - ": got %u, should be %u, fixing", i, b, \ + fsck_err(c, "dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u", i, b, \ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ dst->b[b]._mark.dirty = true; \ @@ -620,6 +621,8 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #undef copy_bucket_field #undef copy_stripe_field #undef copy_field +fsck_err: + return ret; } static int bch2_gc_start(struct bch_fs *c) @@ -750,7 +753,7 @@ out: percpu_down_write(&c->mark_lock); if (!ret) - bch2_gc_done(c, initial); + ret = bch2_gc_done(c, initial); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -- cgit From a1d58243f943f5933e65e18e504333ac9eccb679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2019 22:22:45 -0400 Subject: bcachefs: add ability to run gc on metadata only Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/btree_gc.c | 97 ++++++++++++++++++++++-------------------- fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/buckets.c | 40 +++++++++++++---- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/buckets_types.h | 1 + fs/bcachefs/recovery.c | 2 +- fs/bcachefs/sysfs.c | 2 +- 8 files changed, 88 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b5f5c223e008..c6a909bdfc02 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -292,8 +292,7 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) } percpu_down_write(&c->mark_lock); - for_each_member_device(ca, c, i) - bch2_dev_usage_from_buckets(c, ca); + bch2_dev_usage_from_buckets(c); percpu_up_write(&c->mark_lock); mutex_lock(&c->bucket_clock[READ].lock); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4119f48281fb..c572391c4dad 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -204,7 +204,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial) + bool initial, bool metadata_only) { struct btree_trans trans; struct btree_iter *iter; @@ -224,7 +224,9 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, * and on startup, we have to read every btree node (XXX: only if it was * an unclean shutdown) */ - if (initial || expensive_debug_checks(c)) + if (metadata_only) + depth = 1; + else if (initial || expensive_debug_checks(c)) depth = 0; btree_node_range_checks_init(&r, depth); @@ -280,7 +282,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) } static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, - bool initial) + bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; u8 max_stale; @@ -294,11 +296,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, enum btree_id id = ids[i]; enum btree_node_type type = __btree_node_type(0, id); - int ret = 
bch2_gc_btree(c, id, initial); + int ret = bch2_gc_btree(c, id, initial, metadata_only); if (ret) return ret; - if (journal && btree_node_type_needs_gc(type)) { + if (journal && !metadata_only && + btree_node_type_needs_gc(type)) { struct bkey_i *k, *n; struct jset_entry *j; struct journal_replay *r; @@ -476,11 +479,13 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } -static int bch2_gc_done(struct bch_fs *c, bool initial) +static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) { struct bch_dev *ca; - bool verify = !initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); + bool verify = !metadata_only && + (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); unsigned i; int ret = 0; @@ -515,7 +520,7 @@ static int bch2_gc_done(struct bch_fs *c, bool initial) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - { + if (!metadata_only) { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; @@ -567,26 +572,7 @@ static int bch2_gc_done(struct bch_fs *c, bool initial) } }; - for_each_member_device(ca, c, i) { - unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); - struct bch_dev_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[0], nr); - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[1], nr); - unsigned b; - - for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(buckets[b], "buckets[%s]", - bch2_data_types[b]); - copy_dev_field(buckets_alloc, "buckets_alloc"); - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(sectors[b], "sectors[%s]", - bch2_data_types[b]); - copy_dev_field(sectors_fragmented, "sectors_fragmented"); - } + bch2_dev_usage_from_buckets(c); { unsigned nr = fs_usage_u64s(c); @@ -596,20 +582,29 @@ static int bch2_gc_done(struct bch_fs *c, bool initial) bch2_acc_percpu_u64s((void *) c->usage[1], nr); copy_fs_field(hidden, "hidden"); - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes, "nr_inodes"); + copy_fs_field(btree, "btree"); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; + if (metadata_only && + (e->data_type == BCH_DATA_USER || + e->data_type == BCH_DATA_CACHED)) + continue; + bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -625,7 +620,8 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c) +static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -671,10 +667,18 @@ static int bch2_gc_start(struct bch_fs *c) dst->nbuckets = src->nbuckets; for (b = 0; b < src->nbuckets; b++) { - dst->b[b]._mark.gen = - dst->b[b].oldest_gen = - src->b[b].mark.gen; - dst->b[b].gen_valid = src->b[b].gen_valid; + struct bucket *d = &dst->b[b]; + struct bucket *s = 
&src->b[b]; + + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; + + if (metadata_only && + (s->mark.data_type == BCH_DATA_USER || + s->mark.data_type == BCH_DATA_CACHED)) { + d->_mark = s->mark; + d->_mark.owned_by_allocator = 0; + } } }; @@ -699,7 +703,8 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) +int bch2_gc(struct bch_fs *c, struct list_head *journal, + bool initial, bool metadata_only) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -711,7 +716,7 @@ int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) down_write(&c->gc_lock); again: percpu_down_write(&c->mark_lock); - ret = bch2_gc_start(c); + ret = bch2_gc_start(c, metadata_only); percpu_up_write(&c->mark_lock); if (ret) @@ -719,7 +724,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal, initial); + ret = bch2_gc_btrees(c, journal, initial, metadata_only); if (ret) goto out; @@ -753,7 +758,7 @@ out: percpu_down_write(&c->mark_lock); if (!ret) - ret = bch2_gc_done(c, initial); + ret = bch2_gc_done(c, initial, metadata_only); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); @@ -1155,7 +1160,7 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - ret = bch2_gc(c, NULL, false); + ret = bch2_gc(c, NULL, false, false); if (ret) bch_err(c, "btree gc failed: %i", ret); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9eb2b0527a92..b7982e64b235 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -5,7 +5,7 @@ #include "btree_types.h" void bch2_coalesce(struct bch_fs *); -int bch2_gc(struct bch_fs *, struct list_head *, bool); +int bch2_gc(struct bch_fs *, struct list_head *, bool, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 495ef4732602..4fe66ee1f745 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -132,6 +132,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c) switch (e->data_type) { case BCH_DATA_BTREE: + usage->btree += usage->replicas[i]; + break; case BCH_DATA_USER: usage->data += usage->replicas[i]; break; @@ -226,6 +228,7 @@ static u64 avail_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) { return min(fs_usage->hidden + + fs_usage->btree + fs_usage->data + reserve_factor(fs_usage->reserved + fs_usage->online_reserved), @@ -241,7 +244,8 @@ __bch2_fs_usage_read_short(struct bch_fs *c) ret.capacity = c->capacity - percpu_u64_get(&c->usage[0]->hidden); - data = percpu_u64_get(&c->usage[0]->data); + data = percpu_u64_get(&c->usage[0]->data) + + percpu_u64_get(&c->usage[0]->btree); reserved = percpu_u64_get(&c->usage[0]->reserved) + percpu_u64_get(&c->usage[0]->online_reserved); @@ -386,12 +390,17 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) +void bch2_dev_usage_from_buckets(struct bch_fs *c) { + struct bch_dev *ca; struct bucket_mark old = { .v.counter = 0 }; struct bch_fs_usage *fs_usage; struct bucket_array *buckets; struct bucket *g; + unsigned i; + int cpu; + + 
percpu_u64_set(&c->usage[0]->hidden, 0); /* * This is only called during startup, before there's any multithreaded @@ -401,11 +410,17 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) fs_usage = this_cpu_ptr(c->usage[0]); preempt_enable(); - buckets = bucket_array(ca); + for_each_member_device(ca, c, i) { + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(ca->usage[0], cpu), 0, + sizeof(*ca->usage[0])); + + buckets = bucket_array(ca); - for_each_bucket(g, buckets) - if (g->mark.data_type) - bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); + for_each_bucket(g, buckets) + bch2_dev_usage_update(c, ca, fs_usage, + old, g->mark, false); + } } #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ @@ -426,10 +441,17 @@ static inline void update_replicas(struct bch_fs *c, BUG_ON(idx < 0); BUG_ON(!sectors); - if (r->data_type == BCH_DATA_CACHED) - fs_usage->cached += sectors; - else + switch (r->data_type) { + case BCH_DATA_BTREE: + fs_usage->btree += sectors; + break; + case BCH_DATA_USER: fs_usage->data += sectors; + break; + case BCH_DATA_CACHED: + fs_usage->cached += sectors; + break; + } fs_usage->replicas[idx] += sectors; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6af8b418b1e3..095015f17f76 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -174,7 +174,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *); +void bch2_dev_usage_from_buckets(struct bch_fs *); static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 348d062dd744..a98493dd2ba8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -70,6 +70,7 @@ struct bch_fs_usage { u64 gc_start[0]; u64 hidden; + u64 btree; u64 data; u64 cached; u64 reserved; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0fa952fa1053..67b4dda9cfeb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -361,7 +361,7 @@ int bch2_fs_recovery(struct bch_fs *c) test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; - ret = bch2_gc(c, &journal, true); + ret = bch2_gc(c, &journal, true, false); if (ret) goto err; bch_verbose(c, "mark and sweep done"); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1354dd33874c..59503ad0006c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -497,7 +497,7 @@ STORE(__bch2_fs) bch2_coalesce(c); if (attr == &sysfs_trigger_gc) - bch2_gc(c, NULL, false); + bch2_gc(c, NULL, false, false); if (attr == &sysfs_trigger_alloc_write) { bool wrote; -- cgit From 2a039f1ee4077050c57c51f0463335d262740430 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2019 17:11:15 -0400 Subject: bcachefs: free trans->mem on commit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6fa9fa5768aa..8e686dc42f9d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -861,12 +861,15 @@ out_noupdates: BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); if (!ret) { - 
bch2_trans_unlink_iters(trans, ~trans->iters_touched); + bch2_trans_unlink_iters(trans, ~trans->iters_touched| + trans->iters_unlink_on_commit); trans->iters_touched = 0; + } else { + bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); } - trans->nr_updates = 0; + trans->nr_updates = 0; + trans->mem_top = 0; return ret; err: -- cgit From ece254b258980cfd5a0fa11adce8e178c8d34181 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2019 21:28:16 -0400 Subject: bcachefs: don't lose errors from iterators that have been freed Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++-- fs/bcachefs/btree_types.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fbbb7428c592..02eb28bfe9b9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -983,6 +983,7 @@ retry_all: } if (unlikely(ret == -EIO)) { + trans->error = true; iter->flags |= BTREE_ITER_ERROR; iter->l[iter->level].b = BTREE_ITER_NOT_END; goto out; @@ -1943,7 +1944,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) int bch2_trans_exit(struct btree_trans *trans) { - int ret = bch2_trans_unlock(trans); + bch2_trans_unlock(trans); kfree(trans->mem); if (trans->used_mempool) @@ -1952,5 +1953,6 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->iters); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; - return ret; + + return trans->error ? -EIO : 0; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bd6852d951ea..8c6f5fe6998e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -280,6 +280,7 @@ struct btree_trans { u8 nr_updates; u8 size; unsigned used_mempool:1; + unsigned error:1; unsigned mem_top; unsigned mem_bytes; -- cgit From 1dd7f9d98de0740b42f1ac3f0b1d8af9c76801de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2019 21:53:12 -0400 Subject: bcachefs: Rewrite journal_seq_blacklist machinery Now, we store blacklisted journal sequence numbers in the superblock, not the journal: this helps to greatly simplify the code, and more importantly it's now implemented in a way that doesn't require all btree nodes to be visited before starting the journal - instead, we unconditionally blacklist the next 4 journal sequence numbers after an unclean shutdown. 
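A minimal standalone sketch of the range lookup this message describes, with illustrative names only (the patch itself keeps the table in struct bch_sb_field_journal_seq_blacklist in the superblock and searches an eytzinger-ordered copy via bch2_journal_seq_is_blacklisted()). Blacklisted journal sequence numbers are tracked as non-overlapping half-open [start, end) ranges; a bset whose journal_seq falls inside such a range is ignored on read:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct seq_range {
		uint64_t start;	/* first blacklisted seq */
		uint64_t end;	/* one past the last blacklisted seq */
	};

	/* table sorted by start, ranges non-overlapping */
	static bool seq_is_blacklisted(const struct seq_range *tbl,
				       size_t nr, uint64_t seq)
	{
		size_t lo = 0, hi = nr;

		/* binary search for the last entry with start <= seq */
		while (lo < hi) {
			size_t mid = lo + (hi - lo) / 2;

			if (tbl[mid].start <= seq)
				lo = mid + 1;
			else
				hi = mid;
		}

		return lo && seq < tbl[lo - 1].end;
	}

After an unclean shutdown, recovery would append a range covering the next few sequence numbers (the equivalent of { journal_seq, journal_seq + 4 } here), which is what lets startup proceed without first visiting every btree node.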
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 16 ++ fs/bcachefs/bcachefs_format.h | 18 +- fs/bcachefs/btree_io.c | 24 +- fs/bcachefs/btree_iter.c | 2 + fs/bcachefs/inode.h | 2 - fs/bcachefs/journal.c | 65 +++-- fs/bcachefs/journal.h | 4 +- fs/bcachefs/journal_io.c | 108 +------- fs/bcachefs/journal_io.h | 1 - fs/bcachefs/journal_seq_blacklist.c | 491 ++++++++++++++++-------------------- fs/bcachefs/journal_seq_blacklist.h | 15 +- fs/bcachefs/journal_types.h | 22 -- fs/bcachefs/recovery.c | 154 ++++++++--- fs/bcachefs/super-io.c | 1 + fs/bcachefs/super.c | 9 + 15 files changed, 460 insertions(+), 472 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d8c487e33592..8acdc7ffeca3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -185,6 +185,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,7 @@ enum { BCH_FS_RW, /* shutdown: */ + BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, @@ -511,6 +513,15 @@ struct bch_fs_pcpu { u64 sectors_available; }; +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[0]; +}; + struct bch_fs { struct closure cl; @@ -646,6 +657,11 @@ struct bch_fs { struct io_clock io_clock[2]; + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 646910a6a4bb..7edc410c5391 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -909,7 +909,8 @@ struct bch_sb_field { x(quota, 4) \ x(disk_groups, 5) \ x(clean, 6) \ - x(replicas, 7) + x(replicas, 7) \ + x(journal_seq_blacklist, 8) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1124,6 +1125,20 @@ struct bch_sb_field_clean { }; }; +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + + union { + struct journal_seq_blacklist_entry start[0]; + __u64 _data[0]; + }; +}; + /* Superblock: */ /* @@ -1279,6 +1294,7 @@ enum bch_sb_features { BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, + BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_NR, }; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 10b3d53b6ebb..fa261a175f5e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -770,7 +770,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct btree_node *sorted; struct bkey_packed *k; struct bset *i; - bool used_mempool; + bool used_mempool, blacklisted; unsigned u64s; int ret, retry_read = 0, write = READ; @@ -844,20 +844,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry b->written += sectors; - ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) { - btree_err(BTREE_ERR_FATAL, c, b, i, - "insufficient memory"); - goto err; - } + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); - if (ret) { - btree_err_on(first, - BTREE_ERR_FIXABLE, c, b, i, - "first btree node bset has blacklisted journal seq"); - if (!first) - continue; - } + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, 
b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; bch2_btree_node_iter_large_push(iter, b, i->start, @@ -930,7 +925,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry out: mempool_free(iter, &c->fill_iter); return retry_read; -err: fsck_err: if (ret == BTREE_RETRY_READ) { retry_read = 1; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 02eb28bfe9b9..6b9af53a3e77 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1156,6 +1156,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) if (!btree_iter_node(iter, iter->level)) return NULL; + bch2_trans_cond_resched(iter->trans); + btree_iter_up(iter); if (!bch2_btree_node_relock(iter, iter->level)) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index ada639c06619..af0c355f2f04 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -4,8 +4,6 @@ #include "opts.h" -#include - extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dbecb4072af0..2e84af8a044c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -988,27 +988,57 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->reclaim_work); } -void bch2_fs_journal_start(struct journal *j) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + struct list_head *journal_entries) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - u64 blacklist = 0; + struct journal_entry_pin_list *p; + struct journal_replay *i; + u64 last_seq = cur_seq, nr, seq; + + if (!list_empty(journal_entries)) + last_seq = le64_to_cpu(list_last_entry(journal_entries, + struct journal_replay, + list)->j.last_seq); + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + j->last_seq_ondisk = last_seq; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + list_for_each_entry(i, journal_entries, list) { + seq = le64_to_cpu(i->j.seq); + + BUG_ON(seq < last_seq || seq >= cur_seq); - list_for_each_entry(bl, &j->seq_blacklist, list) - blacklist = max(blacklist, bl->end); + p = journal_seq_pin(j, seq); + + atomic_set(&p->count, 1); + p->devs = i->devs; + } spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); - while (journal_cur_seq(j) < blacklist) - journal_pin_new_entry(j, 0); - - /* - * __journal_entry_close() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -1017,12 +1047,7 @@ void bch2_fs_journal_start(struct journal *j) bch2_journal_space_available(j); spin_unlock(&j->lock); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - bch2_journal_seq_blacklist_write(j); + return 0; } /* init/exit: */ @@ -1090,8 +1115,6 @@ int 
bch2_fs_journal_init(struct journal *j) INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 809cf25f5a03..3447b4ad462d 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -472,8 +472,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + void bch2_fs_journal_stop(struct journal *); -void bch2_fs_journal_start(struct journal *); +int bch2_fs_journal_start(struct journal *, u64, struct list_head *); + void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1293bb66e62c..8010b38114ac 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -10,7 +10,6 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" -#include "journal_seq_blacklist.h" #include "replicas.h" #include "trace.h" @@ -655,45 +654,11 @@ void bch2_journal_entries_free(struct list_head *list) } } -int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) -{ - struct journal *j = &c->journal; - struct journal_entry_pin_list *p; - u64 seq, nr = end_seq - last_seq + 1; - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, end_seq); - j->last_seq_ondisk = last_seq; - - j->pin.front = last_seq; - j->pin.back = end_seq + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - return 0; -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { - struct journal *j = &c->journal; struct journal_list jlist; struct journal_replay *i; - struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; @@ -725,17 +690,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); - ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -745,6 +705,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, @@ -755,68 +717,18 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (ret) return ret; } - } - - i = list_last_entry(list, struct journal_replay, list); - - ret = bch2_journal_set_seq(c, - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq)); - if (ret) - return ret; - - 
mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (bch2_journal_seq_blacklist_read(j, i)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = bch2_journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! (replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; for_each_jset_key(k, _n, entry, &i->j) keys++; entries++; } - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, journal_cur_seq(j)); + if (!list_empty(list)) { + i = list_last_entry(list, struct journal_replay, list); + + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, le64_to_cpu(i->j.seq)); + } fsck_err: return ret; } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a79c396903f0..4bb174839956 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -35,7 +35,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 45c8d38d12de..0df8dfccd5b5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,13 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" +#include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" +#include "super-io.h" /* * journal_seq_blacklist machinery: @@ -37,327 +34,285 @@ * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. - * - * Blacklisted journal sequence numbers are themselves recorded as entries in - * the journal. 
*/ -/* - * Called when journal needs to evict a blacklist entry to reclaim space: find - * any btree nodes that refer to the blacklist journal sequence numbers, and - * rewrite them: - */ -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) +static unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} - closure_init_stack(&cl); +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; - for (i = 0;; i++) { - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} - bch2_trans_init(&trans, c); +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); + return bl; +} - iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos, - 0, 0, 0); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; - b = bch2_btree_iter_peek_node(iter); + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); - /* The node might have already been rewritten: */ + if (bl) { + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, iter, n.seq, 0); - if (ret) { - bch2_trans_exit(&trans); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; + if (start == le64_to_cpu(e->start) && + end == le64_to_cpu(e->end)) + goto out; + + if (start <= le64_to_cpu(e->start) && + end >= le64_to_cpu(e->end)) { + e->start = cpu_to_le64(start); + e->end = cpu_to_le64(end); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } - - bch2_trans_exit(&trans); } - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? 
If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -ENOMEM; + goto out; } - mutex_lock(&j->blacklist_lock); + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= + 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); - mutex_unlock(&j->blacklist_lock); + return ret; } -/* - * Determine if a particular sequence number is blacklisted - if so, return - * blacklist entry: - */ -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) { - struct journal_seq_blacklist *bl; + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq >= bl->start && seq <= bl->end) - return bl; - - return NULL; + return (l->start > r->start) - (l->start < r->start); } -/* - * Allocate a new, in memory blacklist entry: - */ -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) { - struct journal_seq_blacklist *bl; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; - lockdep_assert_held(&j->blacklist_lock); + if (!t) + return false; - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; + BUG_ON(t->entries[idx].start > seq); - bl->start = start; - bl->end = end; + if (seq >= t->entries[idx].end) + return false; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; + if (dirty) + t->entries[idx].dirty = true; + return true; } -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +int bch2_blacklist_table_initialize(struct bch_fs *c) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq; - int ret = 0; - - if (!seq) - return 0; + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); + BUG_ON(c->journal_seq_blacklist_table); - /* Interier updates aren't 
journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + if (!bl) + return 0; - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - fsck_err_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); + t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -ENOMEM; - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; + t->nr = nr; - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = bch2_journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - if (!j->new_blacklist) { - j->new_blacklist = bch2_journal_seq_blacklisted_new(j, - journal_seq + 1, - journal_seq + 1); - if (!j->new_blacklist) { - ret = -ENOMEM; - goto out; - } - } - bl = j->new_blacklist; - bl->end = max(bl->end, seq); + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); } - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: -fsck_err: - mutex_unlock(&j->blacklist_lock); - return ret; + c->journal_seq_blacklist_table = t; + return 0; } -static int __bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - u64 start, u64 end) +static const char * +bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - - bch_verbose(c, "blacklisting existing journal seq %llu-%llu", - start, end); + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (le64_to_cpu(i->start) >= + le64_to_cpu(i->end)) + return "entry start >= end"; + + if (i + 1 < bl->start + nr && + le64_to_cpu(i[0].end) > + le64_to_cpu(i[1].start)) + return "entries out of order"; + } - bl = bch2_journal_seq_blacklisted_new(j, start, end); - if (!bl) - return -ENOMEM; + return NULL; +} - bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, - journal_seq_blacklist_flush); - return 0; +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + pr_buf(out, " "); + + pr_buf(out, "%llu-%llu", + 
le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } } -/* - * After reading the journal, find existing journal seq blacklist entries and - * read them into memory: - */ -int bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i) +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) { - struct jset_entry *entry; - int ret = 0; + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; - vstruct_for_each(&i->j, entry) { - switch (entry->type) { - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); + bch2_trans_init(&trans, c); - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq)); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end)); - break; - } - } + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter *iter; + struct btree *b; - if (ret) - break; + for_each_btree_node(&trans, iter, i, POS_MIN, + BTREE_ITER_PREFETCH, b) + if (test_bit(BCH_FS_STOPPING, &c->flags)) { + bch2_trans_exit(&trans); + return; + } + bch2_trans_iter_free(&trans, iter); } - return ret; -} - -/* - * After reading the journal and walking the btree, we might have new journal - * sequence numbers to blacklist - add entries to the next journal entry to be - * written: - */ -void bch2_journal_seq_blacklist_write(struct journal *j) -{ - struct journal_seq_blacklist *bl = j->new_blacklist; - struct jset_entry_blacklist_v2 *bl_entry; - struct jset_entry *entry; + ret = bch2_trans_exit(&trans); + if (ret) + return; + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); if (!bl) - return; + goto out; - entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), - (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + nr = blacklist_nr_entries(bl); + dst = bl->start; - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; - bl_entry->start = cpu_to_le64(bl->start); - bl_entry->end = cpu_to_le64(bl->end); + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } - bch2_journal_pin_add(j, - journal_cur_seq(j), - &bl->pin, - journal_seq_blacklist_flush); + new_nr = dst - bl->start; - j->new_blacklist = NULL; + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= + ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index b4a3b270e9d2..03f4b97247fd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -2,13 +2,12 @@ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -struct journal_replay; - -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *, u64); -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); -int bch2_journal_seq_blacklist_read(struct journal *, - struct journal_replay *); -void bch2_journal_seq_blacklist_write(struct journal *); +bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); +int bch2_blacklist_table_initialize(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +void bch2_blacklist_entries_gc(struct work_struct *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 85bf5e2706f7..7349b50bc5e7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -54,24 +54,6 @@ struct journal_entry_pin { u64 seq; }; -/* corresponds to a btree node with a blacklisted bset: */ -struct blacklisted_node { - __le64 seq; - enum btree_id btree_id; - struct bpos pos; -}; - -struct journal_seq_blacklist { - struct list_head list; - u64 start; - u64 end; - - struct journal_entry_pin pin; - - struct blacklisted_node *entries; - size_t nr_entries; -}; - struct journal_res { bool ref; u8 idx; @@ -222,10 +204,6 @@ struct journal { u64 replay_journal_seq; - struct mutex blacklist_lock; - struct list_head seq_blacklist; - struct journal_seq_blacklist *new_blacklist; - struct write_point wp; spinlock_t err_lock; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 67b4dda9cfeb..9411a1f550f3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -12,6 +12,7 @@ #include "error.h" #include "fsck.h" #include "journal_io.h" +#include "journal_seq_blacklist.h" #include "quota.h" #include "recovery.h" #include "replicas.h" @@ -99,18 +100,49 @@ fsck_err: return ret; } +static int +verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, + struct list_head *journal) +{ + struct journal_replay *i = + list_last_entry(journal, struct journal_replay, list); + u64 start_seq = le64_to_cpu(i->j.last_seq); + u64 end_seq = le64_to_cpu(i->j.seq); + u64 seq = start_seq; + int ret = 0; + + list_for_each_entry(i, journal, list) { + fsck_err_on(seq != le64_to_cpu(i->j.seq), c, + "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + seq, le64_to_cpu(i->j.seq) - 1, + start_seq, end_seq); + + seq = le64_to_cpu(i->j.seq); + + fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, + "found blacklisted journal entry %llu", seq); + + do { + seq++; + } while (bch2_journal_seq_is_blacklisted(c, seq, false)); + } +fsck_err: + return ret; +} + static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) { struct bch_sb_field_clean *clean, *sb_clean; - - if (!c->sb.clean) - return NULL; + int ret; mutex_lock(&c->sb_lock); sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - if (!sb_clean) { + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; mutex_unlock(&c->sb_lock); - bch_err(c, "superblock marked clean but clean section not present"); return NULL; } @@ -128,6 +160,9 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) mutex_unlock(&c->sb_lock); return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); } static int journal_replay_entry_early(struct bch_fs *c, @@ -179,14 +214,32 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq) + 1); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end) + 1); + break; + } } return ret; } -static int load_journal_metadata(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct list_head *journal) +static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct list_head *journal) { struct jset_entry *entry; int ret; @@ -300,37 +353,76 @@ static bool journal_empty(struct list_head *journal) int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; - struct bch_sb_field_clean *clean; + struct bch_sb_field_clean *clean = NULL; + u64 journal_seq; LIST_HEAD(journal); int ret; - clean = read_superblock_clean(c); - if (clean) + if (c->sb.clean) + clean = read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; + + if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!clean || c->opts.fsck) { + if (!c->replicas.entries) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (!c->sb.clean || c->opts.fsck) { + struct jset *j; + ret = bch2_journal_read(c, &journal); if (ret) goto err; - ret = verify_superblock_clean(c, &clean, - &list_last_entry(&journal, struct journal_replay, - list)->j); + fsck_err_on(c->sb.clean && !journal_empty(&journal), c, + "filesystem marked clean but journal not empty"); + + if (!c->sb.clean && list_empty(&journal)){ + bch_err(c, "no journal entries found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; + } + + j = &list_last_entry(&journal, struct journal_replay, list)->j; + + ret = verify_superblock_clean(c, &clean, j); if (ret) goto err; + + journal_seq = le64_to_cpu(j->seq) + 1; } else { - ret = bch2_journal_set_seq(c, - le64_to_cpu(clean->journal_seq), - le64_to_cpu(clean->journal_seq)); - if (ret) + 
journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + ret = journal_replay_early(c, clean, &journal); + if (ret) + goto err; + + if (!c->sb.clean) { + ret = bch2_journal_seq_blacklist_add(c, + journal_seq, + journal_seq + 4); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); goto err; + } + + journal_seq += 4; } - fsck_err_on(clean && !journal_empty(&journal), c, - "filesystem marked clean but journal not empty"); + ret = bch2_blacklist_table_initialize(c); + + ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal); + if (ret) + goto err; - ret = load_journal_metadata(c, clean, &journal); + ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal); if (ret) goto err; @@ -351,11 +443,6 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if (!c->replicas.entries) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { @@ -377,13 +464,6 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.encryption_type && !c->sb.clean) atomic64_add(1 << 16, &c->key_version); - /* - * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() - * will give spurious errors about oldest_gen > bucket_gen - - * this is a hack but oh well. - */ - bch2_fs_journal_start(&c->journal); - if (c->opts.noreplay) goto out; @@ -424,6 +504,10 @@ int bch2_fs_recovery(struct bch_fs *c) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); } mutex_unlock(&c->sb_lock); + + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); out: bch2_journal_entries_free(&journal); kfree(clean); @@ -472,7 +556,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal); + bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); err = "error going read write"; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9fd77e57cafe..7aaa8b785d57 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -7,6 +7,7 @@ #include "error.h" #include "io.h" #include "journal.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" #include "super-io.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8c31a9a67eee..27eacb1cd144 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -30,6 +30,7 @@ #include "io.h" #include "journal.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" @@ -468,6 +469,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas.entries); kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); if (c->journal_reclaim_wq) destroy_workqueue(c->journal_reclaim_wq); @@ -496,6 +498,10 @@ void bch2_fs_stop(struct bch_fs *c) bch_verbose(c, "shutting down"); + set_bit(BCH_FS_STOPPING, &c->flags); + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -631,6 +637,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); INIT_WORK(&c->btree_write_error_work, 
bch2_btree_write_error_work); + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); -- cgit From 4881fdb7566dcc52aaf05f9b8f044a5ecfeff81b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2019 23:01:54 -0400 Subject: bcachefs: initial gc no longer needs to touch every node Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c572391c4dad..84a0bb9202c4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -210,7 +210,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct btree *b; struct range_checks r; - unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1; + unsigned depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; u8 max_stale; int ret = 0; @@ -218,17 +221,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - /* - * if expensive_debug_checks is on, run range_checks on all leaf nodes: - * - * and on startup, we have to read every btree node (XXX: only if it was - * an unclean shutdown) - */ - if (metadata_only) - depth = 1; - else if (initial || expensive_debug_checks(c)) - depth = 0; - btree_node_range_checks_init(&r, depth); __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -- cgit From 330581f16f3041e7cd4e4d8c3b7a569d663a8035 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2019 00:18:52 -0400 Subject: bcachefs: disallow ever going rw if nochanges or noreplay Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 27eacb1cd144..85a9400a47fc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -378,6 +378,10 @@ int __bch2_fs_read_write(struct bch_fs *c, bool early) if (test_bit(BCH_FS_RW, &c->flags)) return 0; + if (c->opts.nochanges || + c->opts.noreplay) + return -EINVAL; + ret = bch2_fs_mark_dirty(c); if (ret) goto err; @@ -684,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->opts.nochanges |= c->opts.noreplay; c->opts.read_only |= c->opts.nochanges; + c->opts.read_only |= c->opts.noreplay; if (bch2_fs_init_fault("fs_alloc")) goto err; -- cgit From 478259b749d442baaccbe18b02a7352ca1e012ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2019 00:20:02 -0400 Subject: bcachefs: delete duplicated code Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 13 ------------- fs/bcachefs/journal.h | 13 +++++++++++++ fs/bcachefs/recovery.c | 29 +++-------------------------- 3 files changed, 16 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2e84af8a044c..969612e612e0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -55,19 +55,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->data->u64s = 0; } -static inline bool journal_entry_empty(struct jset *j) -{ - struct jset_entry *i; - - if (j->seq != j->last_seq) - return false; - - vstruct_for_each(j, i) - if (i->type || i->u64s) - return false; - return true; -} - void bch2_journal_halt(struct journal *j) { union journal_res_state old, new; diff --git a/fs/bcachefs/journal.h 
b/fs/bcachefs/journal.h index 3447b4ad462d..36066ea7de7a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -231,6 +231,19 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res * id, 0, k, k->k.u64s); } +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) + return false; + return true; +} + void __bch2_journal_buf_put(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9411a1f550f3..b1fcc105cffd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -322,32 +322,9 @@ fsck_err: static bool journal_empty(struct list_head *journal) { - struct journal_replay *i; - struct jset_entry *entry; - - if (list_empty(journal)) - return true; - - i = list_last_entry(journal, struct journal_replay, list); - - if (i->j.last_seq != i->j.seq) - return false; - - list_for_each_entry(i, journal, list) { - vstruct_for_each(&i->j, entry) { - if (entry->type == BCH_JSET_ENTRY_btree_root || - entry->type == BCH_JSET_ENTRY_usage || - entry->type == BCH_JSET_ENTRY_data_usage) - continue; - - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - return false; - } - } - - return true; + return list_empty(journal) || + journal_entry_empty(&list_last_entry(journal, + struct journal_replay, list)->j); } int bch2_fs_recovery(struct bch_fs *c) -- cgit From d1170ce53c5b332caf647f658c6f2a483c3608a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Apr 2019 14:32:06 -0400 Subject: bcachefs: allocate sb_read_scratch with __get_free_page kmalloc allocations aren't guranteed alignment for io Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7aaa8b785d57..0fe8ea22c6a1 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -652,7 +652,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_iter.bi_size = 4096; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = write_super_endio; bio->bi_private = ca; bch2_bio_map(bio, ca->sb_read_scratch); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 85a9400a47fc..18576538613c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -919,7 +919,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); - kfree(ca->sb_read_scratch); + free_page((unsigned long) ca->sb_read_scratch); bch2_time_stats_exit(&ca->io_latency[WRITE]); bch2_time_stats_exit(&ca->io_latency[READ]); @@ -1030,7 +1030,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = kmalloc(4096, GFP_KERNEL)) || + !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio), 0) || -- cgit From a0e0bda117d80b107c137e4c6cd0fb9814bd5214 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Apr 2019 
15:12:21 -0400 Subject: bcachefs: Pass flags arg to bch2_alloc_write() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 ++++++++++++---------------- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/ec.h | 2 +- fs/bcachefs/super.c | 10 +++++++--- fs/bcachefs/sysfs.c | 2 +- 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c6a909bdfc02..acd7be90fc47 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -359,7 +359,7 @@ err: static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, size_t b, struct btree_iter *iter, - u64 *journal_seq, unsigned flags) + unsigned flags) { struct bch_fs *c = trans->c; #if 0 @@ -397,13 +397,10 @@ static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); - ret = bch2_trans_commit(trans, NULL, journal_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOMARK| - flags); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOMARK| + flags); if (ret) return ret; @@ -417,14 +414,12 @@ static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, return 0; } -int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) +int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct bch_dev *ca; unsigned i; int ret = 0; - *wrote = false; - for_each_rw_member(ca, c, i) { struct btree_trans trans; struct btree_iter *iter; @@ -445,10 +440,8 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) if (!buckets->b[b].mark.dirty) continue; - ret = __bch2_alloc_write_key(&trans, ca, b, iter, NULL, - nowait - ? BTREE_INSERT_NOWAIT - : 0); + ret = __bch2_alloc_write_key(&trans, ca, b, + iter, flags); if (ret) break; @@ -1683,7 +1676,10 @@ int bch2_fs_allocator_start(struct bch_fs *c) * XXX: it's possible for this to deadlock waiting on journal reclaim, * since we're holding btree writes. What then? 
*/ - ret = bch2_alloc_write(c, true, &wrote); + ret = bch2_alloc_write(c, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOWAIT, &wrote); /* * If bch2_alloc_write() did anything, it may have used some diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ff6eccf904af..25d7426613da 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -65,7 +65,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, bool, bool *); +int bch2_alloc_write(struct bch_fs *, unsigned, bool *); int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 75fe0c28fa22..ea009f0ff829 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1194,7 +1194,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, BTREE_INSERT_NOFAIL|flags); } -int bch2_stripes_write(struct bch_fs *c, bool *wrote) +int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct btree_trans trans; struct btree_iter *iter; @@ -1216,7 +1216,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) continue; ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, - new_key, BTREE_INSERT_NOCHECK_RW); + new_key, flags); if (ret) break; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 7bcbb7e11377..b048244a4a45 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -151,7 +151,7 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); int bch2_stripes_read(struct bch_fs *, struct list_head *); -int bch2_stripes_write(struct bch_fs *, bool *); +int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 18576538613c..e0d4898ad0f5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -196,13 +196,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) goto allocator_not_running; do { - ret = bch2_stripes_write(c, &wrote); + wrote = false; + + ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); if (ret) { bch2_fs_inconsistent(c, "error writing out stripes"); break; } - ret = bch2_alloc_write(c, false, &wrote); + ret = bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); if (ret) { bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); break; @@ -305,7 +307,9 @@ void bch2_fs_read_only(struct bch_fs *c) if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags)) + test_bit(BCH_FS_STARTED, &c->flags) && + !c->opts.noreplay && + !c->opts.norecovery) bch2_fs_mark_clean(c); clear_bit(BCH_FS_RW, &c->flags); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 59503ad0006c..db87a63b97cc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -502,7 +502,7 @@ STORE(__bch2_fs) if (attr == &sysfs_trigger_alloc_write) { bool wrote; - bch2_alloc_write(c, false, &wrote); + bch2_alloc_write(c, 0, &wrote); } if (attr == &sysfs_prune_cache) { -- cgit From 3ea2b1e12898154d6fae49b22a3509521ba49d38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2019 04:54:12 -0400 Subject: bcachefs: cmp_int() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++---- fs/bcachefs/bkey.c 
| 2 +- fs/bcachefs/bkey.h | 4 ++-- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/fs.h | 2 +- fs/bcachefs/journal_seq_blacklist.c | 2 +- fs/bcachefs/movinggc.c | 4 ++-- fs/bcachefs/replicas.c | 2 +- fs/bcachefs/sysfs.c | 6 +++--- fs/bcachefs/util.h | 2 ++ 12 files changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index acd7be90fc47..b3a8ff0b1daa 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -687,16 +687,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h, struct alloc_heap_entry l, struct alloc_heap_entry r) { - return (l.key > r.key) - (l.key < r.key) ?: - (l.nr < r.nr) - (l.nr > r.nr) ?: - (l.bucket > r.bucket) - (l.bucket < r.bucket); + return cmp_int(l.key, r.key) ?: + cmp_int(r.nr, l.nr) ?: + cmp_int(l.bucket, r.bucket); } static inline int bucket_idx_cmp(const void *_l, const void *_r) { const struct alloc_heap_entry *l = _l, *r = _r; - return (l->bucket > r->bucket) - (l->bucket < r->bucket); + return cmp_int(l->bucket, r->bucket); } static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 8a3295ff9631..8b3c9ae8d266 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1024,7 +1024,7 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, r_v = *r; } - return (l_v > r_v) - (l_v < r_v); + return cmp_int(l_v, r_v); } #endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 44044fcd6f9f..45de61d492a4 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -217,8 +217,8 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { - return (l.hi > r.hi) - (l.hi < r.hi) ?: - (l.lo > r.lo) - (l.lo < r.lo); + return cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 329ffb0b6b3d..da3e41cc9757 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -465,7 +465,7 @@ static inline int bkey_iter_cmp(struct btree *b, { return bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: (l > r) - (l < r); + ?: cmp_int(l, r); } static inline int btree_node_iter_cmp(struct btree *b, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8e686dc42f9d..48d3be517471 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -55,7 +55,7 @@ static void btree_trans_unlock_write(struct btree_trans *trans) static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { - return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + return cmp_int(l.deferred, r.deferred) ?: btree_iter_cmp(l.iter, r.iter); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ea009f0ff829..6a357e5b652e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -951,7 +951,7 @@ static int unsigned_cmp(const void *_l, const void *_r) unsigned l = *((const unsigned *) _l); unsigned r = *((const unsigned *) _r); - return (l > r) - (l < r); + return cmp_int(l, r); } /* pick most common bucket size: */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b9a8a9bc3e90..e72d6a58b322 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,7 +54,7 @@ struct bch_inode_info { static inline int ptrcmp(void *l, void *r) { - 
return (l > r) - (l < r); + return cmp_int(l, r); } #define __bch2_lock_inodes(_lock, ...) \ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0df8dfccd5b5..ae64bf3248ef 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -136,7 +136,7 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; - return (l->start > r->start) - (l->start < r->start); + return cmp_int(l->start, r->start); } bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 78d9ca8bfc5e..aba13e6ea4ff 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -54,7 +54,7 @@ static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { - return (l.sectors > r.sectors) - (l.sectors < r.sectors); + return cmp_int(l.sectors, r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) @@ -62,7 +62,7 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return (l->offset > r->offset) - (l->offset < r->offset); + return cmp_int(l->offset, r->offset); } static bool __copygc_pred(struct bch_dev *ca, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index b66217989b71..b1df2c1ce4a4 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -12,7 +12,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, static inline int u8_cmp(u8 l, u8 r) { - return (l > r) - (l < r); + return cmp_int(l, r); } static void verify_replicas_entry_sorted(struct bch_replicas_entry *e) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index db87a63b97cc..f4b70f66d0ac 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -751,10 +751,10 @@ static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, static int unsigned_cmp(const void *_l, const void *_r) { - unsigned l = *((unsigned *) _l); - unsigned r = *((unsigned *) _r); + const unsigned *l = _l; + const unsigned *r = _r; - return (l > r) - (l < r); + return cmp_int(*l, *r); } static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index dc40a52ac8c7..59c8a1dac7be 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -743,4 +743,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); +#define cmp_int(l, r) ((l > r) - (l < r)) + #endif /* _BCACHEFS_UTIL_H */ -- cgit From 644d180b055fa47be7e6ca8b684f45e2350dfafd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2019 22:39:39 -0400 Subject: bcachefs: Journal replay refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 15 +- fs/bcachefs/journal_io.c | 130 ----------------- fs/bcachefs/journal_io.h | 2 - fs/bcachefs/journal_types.h | 1 + fs/bcachefs/recovery.c | 343 +++++++++++++++++++++++++++++++------------- 5 files changed, 251 insertions(+), 240 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 969612e612e0..25d0631c43dd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, u64 last_seq = cur_seq, nr, seq; if 
(!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, - list)->j.last_seq); + last_seq = le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); nr = cur_seq - last_seq; @@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } } + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, fifo_for_each_entry_ptr(p, &j->pin, seq) { INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); + atomic_set(&p->count, 1); p->devs.nr = 0; } @@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, BUG_ON(seq < last_seq || seq >= cur_seq); - p = journal_seq_pin(j, seq); - - atomic_set(&p->count, 1); - p->devs = i->devs; + journal_seq_pin(j, seq)->devs = i->devs; } spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8010b38114ac..4fd7b048050b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc_background.h" #include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -642,18 +639,6 @@ err: goto out; } -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal_list jlist; @@ -733,121 +718,6 @@ fsck_err: return ret; } -/* journal replay: */ - -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter; - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) split; - int ret; - - bch2_trans_init(&trans, c); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - bkey_copy(&split.k, k); - bch2_cut_front(iter->pos, &split.k); - bch2_extent_trim_atomic(&split.k, iter); - - ret = bch2_disk_reservation_add(c, &disk_res, - split.k.k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); - } while ((!ret || ret == -EINTR) && - bkey_cmp(k->k.p, iter->pos)); - - bch2_disk_reservation_put(c, &disk_res); - - /* - * This isn't strictly correct - we should only be relying on the btree - * node lock for synchronization with gc when we've got a write lock - * held. 
- * - * but - there are other correctness issues if btree gc were to run - * before journal replay finishes - */ - BUG_ON(c->gc_pos.phase); - - bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - NULL, 0, 0); - bch2_trans_exit(&trans); - - return ret; -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - j->replay_journal_seq = le64_to_cpu(i->j.seq); - - for_each_jset_key(k, _n, entry, &i->j) { - switch (entry->btree_id) { - case BTREE_ID_ALLOC: - ret = bch2_alloc_replay_key(c, k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, k); - break; - default: - ret = bch2_btree_insert(c, entry->btree_id, k, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); - break; - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - bch2_journal_pin_put(j, j->replay_journal_seq); - } - - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); -err: - bch2_journal_entries_free(list); - return ret; -} - /* journal write: */ static void __journal_write_alloc(struct journal *j, diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4bb174839956..72e575f360af 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, vstruct_for_each_safe(entry, k, _n) int bch2_journal_read(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 7349b50bc5e7..0585e9b6e230 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -203,6 +203,7 @@ struct journal { } pin; u64 replay_journal_seq; + u64 replay_journal_seq_end; struct write_point wp; spinlock_t err_lock; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b1fcc105cffd..2e849135195d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -12,94 +12,162 @@ #include "error.h" #include "fsck.h" #include "journal_io.h" +#include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "quota.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" +#include #include #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) +/* journal replay: */ + +static void bch2_journal_entries_free(struct list_head *list) { - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); } +} - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; +static int 
bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + BKEY_PADDED(k) split; + int ret; - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); + bch2_trans_init(&trans, c); - k = entry->start; - *level = entry->level; - return k; -} + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + break; -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - int ret = 0; + bkey_copy(&split.k, k); + bch2_cut_front(iter->pos, &split.k); + bch2_extent_trim_atomic(&split.k, iter); - if (!clean || !j) - return 0; + ret = bch2_disk_reservation_add(c, &disk_res, + split.k.k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); + } while ((!ret || ret == -EINTR) && + bkey_cmp(k->k.p, iter->pos)); + + bch2_disk_reservation_put(c, &disk_res); + + /* + * This isn't strictly correct - we should only be relying on the btree + * node lock for synchronization with gc when we've got a write lock + * held. 
+ * + * but - there are other correctness issues if btree gc were to run + * before journal replay finishes + */ + BUG_ON(c->gc_pos.phase); + + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), + NULL, 0, 0); + bch2_trans_exit(&trans); + + return ret; +} + +static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) +{ + switch (btree_id) { + case BTREE_ID_ALLOC: + return bch2_alloc_replay_key(c, k); + case BTREE_ID_EXTENTS: + return bch2_extent_replay_key(c, k); + default: + return bch2_btree_insert(c, btree_id, k, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); } +} - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + BUG_ON(seq > j->replay_journal_seq_end); - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); +static int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + int ret = 0; - if (!k1 && !k2) - continue; + list_for_each_entry_safe(i, n, list, list) { + replay_now_at(j, le64_to_cpu(i->j.seq)); - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || - l1 != l2, c, - "superblock btree root doesn't match journal after clean shutdown"); + for_each_jset_key(k, _n, entry, &i->j) { + ret = bch2_journal_replay_key(c, entry->btree_id, k); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + goto err; + } + + cond_resched(); + } } -fsck_err: + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); +err: + bch2_journal_entries_free(list); return ret; } +static bool journal_empty(struct list_head *journal) +{ + return list_empty(journal) || + journal_entry_empty(&list_last_entry(journal, + struct journal_replay, list)->j); +} + static int verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, struct list_head *journal) @@ -130,40 +198,7 @@ fsck_err: return ret; } -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-ENOMEM); - } - - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} +/* journal 
replay early: */ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry *entry) @@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c, return 0; } +/* sb clean section: */ + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-ENOMEM); + } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -320,13 +470,6 @@ fsck_err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; -- cgit From d07343561e263fcbbdb8042f35ca29a602190e18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2019 22:39:39 -0400 Subject: bcachefs: Deduplicate keys in the journal before replay Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 17 +-- 
fs/bcachefs/alloc_background.h | 3 +- fs/bcachefs/btree_gc.c | 30 ++--- fs/bcachefs/btree_gc.h | 4 +- fs/bcachefs/ec.c | 17 +-- fs/bcachefs/ec.h | 3 +- fs/bcachefs/recovery.c | 280 ++++++++++++++++++++++++++++++++--------- fs/bcachefs/recovery.h | 16 +++ 8 files changed, 267 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b3a8ff0b1daa..5c8cebc443d1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -12,7 +12,7 @@ #include "debug.h" #include "ec.h" #include "error.h" -#include "journal_io.h" +#include "recovery.h" #include "trace.h" #include @@ -261,13 +261,13 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) percpu_up_read(&c->mark_lock); } -int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) +int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct journal_replay *r; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bch_dev *ca; + struct journal_key *j; unsigned i; int ret; @@ -282,14 +282,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) if (ret) return ret; - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_ALLOC) - bch2_alloc_read_key(c, bkey_i_to_s_c(k)); - } + for_each_journal_key(*journal_keys, j) + if (j->btree_id == BTREE_ID_ALLOC) + bch2_alloc_read_key(c, bkey_i_to_s_c(j->k)); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 25d7426613da..b75c56a5dae0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -25,7 +25,8 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } -int bch2_alloc_read(struct bch_fs *, struct list_head *); +struct journal_keys; +int bch2_alloc_read(struct bch_fs *, struct journal_keys *); int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); static inline void bch2_wake_allocator(struct bch_dev *ca) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 84a0bb9202c4..cf0a2f4b22af 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -19,9 +19,9 @@ #include "error.h" #include "extents.h" #include "journal.h" -#include "journal_io.h" #include "keylist.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -273,7 +273,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, +static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; @@ -292,22 +292,18 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, if (ret) return ret; - if (journal && !metadata_only && + if (journal_keys && !metadata_only && btree_node_type_needs_gc(type)) { - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; + struct journal_key *j; int ret; - list_for_each_entry(r, journal, list) - for_each_jset_key(k, n, j, &r->j) { - if (type == __btree_node_type(j->level, j->btree_id)) { - ret = bch2_gc_mark_key(c, - bkey_i_to_s_c(k), - &max_stale, initial); - if (ret) - return ret; - } + 
for_each_journal_key(*journal_keys, j) + if (j->btree_id == id) { + ret = bch2_gc_mark_key(c, + bkey_i_to_s_c(j->k), + &max_stale, initial); + if (ret) + return ret; } } } @@ -695,7 +691,7 @@ static int bch2_gc_start(struct bch_fs *c, * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct list_head *journal, +int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct bch_dev *ca; @@ -716,7 +712,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal, initial, metadata_only); + ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); if (ret) goto out; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index b7982e64b235..bd5f2752954f 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -5,7 +5,9 @@ #include "btree_types.h" void bch2_coalesce(struct bch_fs *); -int bch2_gc(struct bch_fs *, struct list_head *, bool, bool); + +struct journal_keys; +int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6a357e5b652e..47d197ed5c99 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -12,8 +12,8 @@ #include "ec.h" #include "error.h" #include "io.h" -#include "journal_io.h" #include "keylist.h" +#include "recovery.h" #include "super-io.h" #include "util.h" @@ -1235,9 +1235,9 @@ static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) bch2_mark_key(c, k, true, 0, NULL, 0, 0); } -int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) +int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct journal_replay *r; + struct journal_key *i; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -1258,14 +1258,9 @@ int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) if (ret) return ret; - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_EC) - bch2_stripe_read_key(c, bkey_i_to_s_c(k)); - } + for_each_journal_key(*journal_keys, i) + if (i->btree_id == BTREE_ID_EC) + bch2_stripe_read_key(c, bkey_i_to_s_c(i->k)); return 0; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index b048244a4a45..8d9fbfd19f66 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -150,7 +150,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); -int bch2_stripes_read(struct bch_fs *, struct list_head *); +struct journal_keys; +int bch2_stripes_read(struct bch_fs *, struct journal_keys *); int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2e849135195d..5bfb38c4290f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -24,9 +24,9 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -/* journal replay: */ +/* sort and dedup all keys in the journal: */ -static void bch2_journal_entries_free(struct list_head *list) +static void journal_entries_free(struct list_head *list) { while (!list_empty(list)) { @@ -38,6 +38,168 @@ static void bch2_journal_entries_free(struct list_head *list) } } +static int 
journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos); +} + +static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) +{ + while (i + 1 < keys->d + keys->nr && + journal_sort_key_cmp(i, i + 1) > 0) { + swap(i[0], i[1]); + i++; + } +} + +static void journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + for_each_journal_key(*keys, i) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); + keys->d = NULL; + keys->nr = 0; +} + +static struct journal_keys journal_keys_sort(struct list_head *journal_entries) +{ + struct journal_replay *p; + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct journal_keys keys = { NULL }, keys_deduped = { NULL }; + struct journal_key *i; + size_t nr_keys = 0; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + nr_keys++; + + keys.journal_seq_base = keys_deduped.journal_seq_base = + le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); + + keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + if (!keys.d) + goto err; + + keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); + if (!keys_deduped.d) + goto err; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .pos = bkey_start_pos(&k->k), + .k = k, + .journal_seq = le64_to_cpu(p->j.seq) - + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; + + sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + + i = keys.d; + while (i < keys.d + keys.nr) { + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + !bkey_cmp(i[0].pos, i[1].pos)) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + i++; + } else { + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + } + continue; + } + + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { + if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: + cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + } else { + struct bkey_i *split = + kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); + + if (!split) + goto err; + + bkey_copy(split, i[0].k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { + .btree_id = i[0].btree_id, + .allocated = true, + .pos = bkey_start_pos(&split->k), + .k = split, + .journal_seq = i[0].journal_seq, + .journal_offset = i[0].journal_offset, + }; + + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + continue; + } + } else { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { + i[1] = i[0]; + i++; + continue; + } else { + bch2_cut_front(i[0].k->k.p, i[1].k); + i[1].pos = i[0].k->k.p; + journal_keys_sift(&keys, i + 1); 
+ continue; + } + } + } + + keys_deduped.d[keys_deduped.nr++] = *i++; + } + + kvfree(keys.d); + return keys_deduped; +err: + journal_keys_free(&keys_deduped); + kvfree(keys.d); + return (struct journal_keys) { NULL }; +} + +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + BUG_ON(seq > j->replay_journal_seq_end); + + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) { struct btree_trans trans; @@ -100,54 +262,42 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id, - struct bkey_i *k) -{ - switch (btree_id) { - case BTREE_ID_ALLOC: - return bch2_alloc_replay_key(c, k); - case BTREE_ID_EXTENTS: - return bch2_extent_replay_key(c, k); - default: - return bch2_btree_insert(c, btree_id, k, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); - } -} - -static void replay_now_at(struct journal *j, u64 seq) -{ - BUG_ON(seq < j->replay_journal_seq); - BUG_ON(seq > j->replay_journal_seq_end); - - while (j->replay_journal_seq < seq) - bch2_journal_pin_put(j, j->replay_journal_seq++); -} - -static int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) { struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; + struct journal_key *i; + int ret; - list_for_each_entry_safe(i, n, list, list) { - replay_now_at(j, le64_to_cpu(i->j.seq)); + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); - for_each_jset_key(k, _n, entry, &i->j) { - ret = bch2_journal_replay_key(c, entry->btree_id, k); - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } + for_each_journal_key(keys, i) { + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + + switch (i->btree_id) { + case BTREE_ID_ALLOC: + ret = bch2_alloc_replay_key(c, i->k); + break; + case BTREE_ID_EXTENTS: + ret = bch2_extent_replay_key(c, i->k); + break; + default: + ret = bch2_btree_insert(c, i->btree_id, i->k, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + break; + } - cond_resched(); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + return ret; } + + cond_resched(); } replay_now_at(j, j->replay_journal_seq_end); @@ -155,10 +305,7 @@ static int bch2_journal_replay(struct bch_fs *c, struct list_head *list) bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); -err: - bch2_journal_entries_free(list); - return ret; + return bch2_journal_error(j); } static bool journal_empty(struct list_head *journal) @@ -475,7 +622,8 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - LIST_HEAD(journal); + LIST_HEAD(journal_entries); + struct journal_keys journal_keys = { NULL }; int ret; if (c->sb.clean) @@ -496,20 +644,27 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck) { struct jset *j; - ret = bch2_journal_read(c, &journal); + ret = bch2_journal_read(c, &journal_entries); if (ret) goto err; - fsck_err_on(c->sb.clean && 
!journal_empty(&journal), c, + fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, "filesystem marked clean but journal not empty"); - if (!c->sb.clean && list_empty(&journal)){ + if (!c->sb.clean && list_empty(&journal_entries)) { bch_err(c, "no journal entries found"); ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; } - j = &list_last_entry(&journal, struct journal_replay, list)->j; + journal_keys = journal_keys_sort(&journal_entries); + if (!journal_keys.d) { + ret = -ENOMEM; + goto err; + } + + j = &list_last_entry(&journal_entries, + struct journal_replay, list)->j; ret = verify_superblock_clean(c, &clean, j); if (ret) @@ -520,7 +675,7 @@ int bch2_fs_recovery(struct bch_fs *c) journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - ret = journal_replay_early(c, clean, &journal); + ret = journal_replay_early(c, clean, &journal_entries); if (ret) goto err; @@ -538,11 +693,13 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_blacklist_table_initialize(c); - ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal); + ret = verify_journal_entries_not_blacklisted_or_missing(c, + &journal_entries); if (ret) goto err; - ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal); + ret = bch2_fs_journal_start(&c->journal, journal_seq, + &journal_entries); if (ret) goto err; @@ -551,12 +708,12 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; err = "error reading allocation information"; - ret = bch2_alloc_read(c, &journal); + ret = bch2_alloc_read(c, &journal_keys); if (ret) goto err; bch_verbose(c, "starting stripes_read"); - ret = bch2_stripes_read(c, &journal); + ret = bch2_stripes_read(c, &journal_keys); if (ret) goto err; bch_verbose(c, "stripes_read done"); @@ -568,7 +725,7 @@ int bch2_fs_recovery(struct bch_fs *c) test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; - ret = bch2_gc(c, &journal, true, false); + ret = bch2_gc(c, &journal_keys, true, false); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -589,7 +746,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting journal replay:"); err = "journal replay failed"; - ret = bch2_journal_replay(c, &journal); + ret = bch2_journal_replay(c, journal_keys); if (ret) goto err; bch_verbose(c, "journal replay done"); @@ -629,7 +786,8 @@ int bch2_fs_recovery(struct bch_fs *c) c->journal_seq_blacklist_table->nr > 128) queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); out: - bch2_journal_entries_free(&journal); + journal_keys_free(&journal_keys); + journal_entries_free(&journal_entries); kfree(clean); return ret; err: diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 912929117c37..a69260d6165a 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,6 +2,22 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H +struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; + unsigned allocated:1; + struct bpos pos; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; + } *d; + size_t nr; + u64 journal_seq_base; +}; + +#define for_each_journal_key(keys, i) \ + for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) + int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); -- cgit From c6dd04f8f5644d92361bb2d6e47fa9b4d5af6d79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Apr 2019 14:58:00 -0400 Subject: bcachefs: Mark overwrites from journal replay in initial gc Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 35 
+++++++++++-- fs/bcachefs/btree_update.h | 4 ++ fs/bcachefs/btree_update_leaf.c | 46 +++++++++-------- fs/bcachefs/buckets.c | 104 +++++++++++++++++++++----------------- fs/bcachefs/buckets.h | 3 ++ fs/bcachefs/recovery.c | 107 ++++++++++++++++++++++++++-------------- 6 files changed, 192 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index cf0a2f4b22af..2650f60b7cd7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -273,11 +273,40 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } +static int mark_journal_key(struct bch_fs *c, enum btree_id id, + struct bkey_i *insert) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u8 max_stale; + int ret = 0; + + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); + if (ret) + return ret; + + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), + BTREE_ITER_SLOTS, k) { + percpu_down_read(&c->mark_lock); + ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, + BCH_BUCKET_MARK_GC| + BCH_BUCKET_MARK_NOATOMIC); + percpu_up_read(&c->mark_lock); + + if (!ret) + break; + } + + return bch2_trans_exit(&trans); +} + static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; - u8 max_stale; unsigned i; for (i = 0; i < BTREE_ID_NR; i++) @@ -299,9 +328,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for_each_journal_key(*journal_keys, j) if (j->btree_id == id) { - ret = bch2_gc_mark_key(c, - bkey_i_to_s_c(j->k), - &max_stale, initial); + ret = mark_journal_key(c, id, j->k); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 75ed02874767..7a638a76634f 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -43,6 +43,7 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -76,6 +77,9 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) +/* Don't mark overwrites, just new key: */ +#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) + /* Don't call bch2_mark_key: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 48d3be517471..2633a5452b13 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -542,20 +542,22 @@ static inline int do_btree_insert_at(struct btree_trans *trans, btree_trans_lock_write(c, trans); - trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) - continue; + if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) { + trans_for_each_update_iter(trans, i) { + if (i->deferred || + !btree_node_type_needs_gc(i->iter->btree_id)) + continue; - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } + if (!fs_usage) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + } - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; + if (!bch2_bkey_replicas_marked_locked(c, + bkey_i_to_s_c(i->k), true)) { + 
ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto out; + } } } @@ -602,16 +604,18 @@ static inline int do_btree_insert_at(struct btree_trans *trans, linked->flags |= BTREE_ITER_NOUNLOCK; } - trans_for_each_update_iter(trans, i) - bch2_mark_update(trans, i, fs_usage, 0); - if (fs_usage) - bch2_trans_fs_usage_apply(trans, fs_usage); - - if (unlikely(c->gc_pos.phase)) { + if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) { trans_for_each_update_iter(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - BCH_BUCKET_MARK_GC); + bch2_mark_update(trans, i, fs_usage, 0); + if (fs_usage) + bch2_trans_fs_usage_apply(trans, fs_usage); + + if (unlikely(c->gc_pos.phase)) { + trans_for_each_update_iter(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + BCH_BUCKET_MARK_GC); + } } trans_for_each_update(trans, i) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4fe66ee1f745..7a05ba5fd589 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1035,6 +1035,56 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } +inline bool bch2_mark_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c old, + struct bkey_i *new, + struct bch_fs_usage *fs_usage, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = iter->l[0].b; + s64 sectors = 0; + + if (btree_node_is_extents(b) + ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 + : bkey_cmp(new->k.p, old.k->p)) + return false; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = old.k->p.offset - new->k.p.offset; + BUG_ON(sectors <= 0); + + bch2_mark_key_locked(c, old, true, sectors, + fs_usage, trans->journal_res.seq, + flags); + + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + } + + BUG_ON(sectors >= 0); + } + + bch2_mark_key_locked(c, old, false, sectors, + fs_usage, trans->journal_res.seq, flags); + return true; +} + void bch2_mark_update(struct btree_trans *trans, struct btree_insert_entry *insert, struct bch_fs_usage *fs_usage, @@ -1049,57 +1099,23 @@ void bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return; - if (!(trans->flags & BTREE_INSERT_NOMARK)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), - fs_usage, trans->journal_res.seq, flags); + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), + fs_usage, trans->journal_res.seq, flags); + + if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + return; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; - struct bkey_s_c k; - s64 sectors = 0; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - k = bkey_disassemble(b, _k, &unpacked); - - if (btree_node_is_extents(b) - ? 
bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k->k.p, k.k->p)) + if (!bch2_mark_overwrite(trans, iter, k, insert->k, + fs_usage, flags)) break; - if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k->k, k.k)) { - case BCH_EXTENT_OVERLAP_ALL: - sectors = -((s64) k.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - sectors = bkey_start_offset(&insert->k->k) - - k.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - sectors = bkey_start_offset(k.k) - - insert->k->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k->k.p.offset; - BUG_ON(sectors <= 0); - - bch2_mark_key_locked(c, k, true, sectors, - fs_usage, trans->journal_res.seq, - flags); - - sectors = bkey_start_offset(&insert->k->k) - - k.k->p.offset; - break; - } - - BUG_ON(sectors >= 0); - } - - bch2_mark_key_locked(c, k, false, sectors, - fs_usage, trans->journal_res.seq, flags); - bch2_btree_node_iter_advance(&node_iter, b); } } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 095015f17f76..90fffee1c289 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -254,6 +254,9 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); +bool bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, + struct bch_fs_usage *, unsigned); void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5bfb38c4290f..d207ff7b98f4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -203,63 +203,94 @@ static void replay_now_at(struct journal *j, u64 seq) static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter, *split_iter; /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: + * We might cause compressed extents to be split, so we need to pass in + * a disk_reservation: */ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) split; + struct bkey_i *split; + bool split_compressed = false; + unsigned flags = BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK; int ret; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&k->k), BTREE_ITER_INTENT); + do { ret = bch2_btree_iter_traverse(iter); if (ret) - break; + goto err; - bkey_copy(&split.k, k); - bch2_cut_front(iter->pos, &split.k); - bch2_extent_trim_atomic(&split.k, iter); - - ret = bch2_disk_reservation_add(c, &disk_res, - split.k.k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); - } while ((!ret || ret == -EINTR) && - bkey_cmp(k->k.p, iter->pos)); + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; - bch2_disk_reservation_put(c, &disk_res); + split = 
bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); + ret = PTR_ERR_OR_ZERO(split); + if (ret) + goto err; - /* - * This isn't strictly correct - we should only be relying on the btree - * node lock for synchronization with gc when we've got a write lock - * held. - * - * but - there are other correctness issues if btree gc were to run - * before journal replay finishes - */ - BUG_ON(c->gc_pos.phase); + if (!split_compressed && + bch2_extent_is_compressed(bkey_i_to_s_c(k)) && + !bch2_extent_is_atomic(k, split_iter)) { + ret = bch2_disk_reservation_add(c, &disk_res, + k->k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + flags &= ~BTREE_INSERT_JOURNAL_REPLAY; + flags &= ~BTREE_INSERT_NOMARK; + flags |= BTREE_INSERT_NOMARK_OVERWRITES; + split_compressed = true; + } - bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - NULL, 0, 0); - bch2_trans_exit(&trans); + bkey_copy(split, k); + bch2_cut_front(split_iter->pos, split); + bch2_extent_trim_atomic(split, split_iter); - return ret; + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_btree_iter_set_pos(iter, split->k.p); + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + ret = bch2_trans_commit(&trans, &disk_res, NULL, flags); + if (ret) + goto err; + + if (split_compressed) { + /* + * This isn't strictly correct - we should only be relying on + * the btree node lock for synchronization with gc when we've + * got a write lock held. + * + * but - there are other correctness issues if btree gc were to + * run before journal replay finishes + */ + BUG_ON(c->gc_pos.phase); + + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), + NULL, 0, 0); + } +err: + if (ret == -EINTR) + goto retry; + + bch2_disk_reservation_put(c, &disk_res); + + return bch2_trans_exit(&trans) ?: ret; } static int bch2_journal_replay(struct bch_fs *c, -- cgit From 53beb841623bcdb1fe619efe5f2c34ca3af08c78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Apr 2019 14:42:05 -0400 Subject: bcachefs: lockdep fix when going rw from bch2_alloc_write() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5c8cebc443d1..c254c08af9d1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -411,21 +411,21 @@ static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { + struct btree_trans trans; + struct btree_iter *iter; + struct bucket_array *buckets; struct bch_dev *ca; unsigned i; + size_t b; int ret = 0; - for_each_rw_member(ca, c, i) { - struct btree_trans trans; - struct btree_iter *iter; - struct bucket_array *buckets; - size_t b; - - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + for_each_rw_member(ca, c, i) { +relock: down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -435,6 +435,17 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) if (!buckets->b[b].mark.dirty) continue; + if ((flags & BTREE_INSERT_LAZY_RW) && + percpu_ref_is_zero(&c->writes)) { + up_read(&ca->bucket_lock); + bch2_trans_unlock(&trans); + + ret = 
bch2_fs_read_write_early(c); + if (ret) + goto out; + goto relock; + } + ret = __bch2_alloc_write_key(&trans, ca, b, iter, flags); if (ret) @@ -444,15 +455,15 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) *wrote = true; } up_read(&ca->bucket_lock); - - bch2_trans_exit(&trans); - +out: if (ret) { percpu_ref_put(&ca->io_ref); break; } } + bch2_trans_exit(&trans); + return ret; } -- cgit From f80b4e64a4d79e78053a0e2ed4607f6af9dd2c89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Apr 2019 15:13:16 -0400 Subject: bcachefs: Fix hang while shutting down If the allocator thread exited before bch2_dev_allocator_stop() was called (because of an error), bch2_dev_allocator_quiesce() could hang. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 22 ++++++++++++++-------- fs/bcachefs/bcachefs.h | 8 ++++++-- fs/bcachefs/movinggc.c | 2 +- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c254c08af9d1..4a8f6fa3db1e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -598,6 +598,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) unsigned long gc_count = c->gc_count; int ret = 0; + ca->allocator_state = ALLOCATOR_BLOCKED; + closure_wake_up(&c->freelist_wait); + while (1) { set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { @@ -620,6 +623,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) } __set_current_state(TASK_RUNNING); + ca->allocator_state = ALLOCATOR_RUNNING; + closure_wake_up(&c->freelist_wait); + return ret; } @@ -1119,14 +1125,14 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t fifo_pop(&ca->free_inc, bucket); closure_wake_up(&c->freelist_wait); - ca->allocator_blocked_full = false; + ca->allocator_state = ALLOCATOR_RUNNING; spin_unlock(&c->freelist_lock); goto out; } - if (!ca->allocator_blocked_full) { - ca->allocator_blocked_full = true; + if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { + ca->allocator_state = ALLOCATOR_BLOCKED_FULL; closure_wake_up(&c->freelist_wait); } @@ -1184,6 +1190,7 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); + ca->allocator_state = ALLOCATOR_RUNNING; while (1) { cond_resched(); @@ -1242,9 +1249,6 @@ static int bch2_allocator_thread(void *arg) if (!nr || (nr < ALLOC_SCAN_BATCH(ca) && !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - ret = wait_buckets_available(c, ca); if (ret) { up_read(&c->gc_lock); @@ -1253,7 +1257,6 @@ static int bch2_allocator_thread(void *arg) } } while (!nr); - ca->allocator_blocked = false; up_read(&c->gc_lock); pr_debug("%zu buckets to invalidate", nr); @@ -1266,6 +1269,8 @@ static int bch2_allocator_thread(void *arg) stop: pr_debug("alloc thread stopping (ret %i)", ret); + ca->allocator_state = ALLOCATOR_STOPPED; + closure_wake_up(&c->freelist_wait); return 0; } @@ -1457,7 +1462,8 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) { if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full); + closure_wait_event(&c->freelist_wait, + ca->allocator_state != ALLOCATOR_RUNNING); } /* stop allocator thread: */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8acdc7ffeca3..72f9f5f9abe9 100644 --- a/fs/bcachefs/bcachefs.h +++ 
b/fs/bcachefs/bcachefs.h @@ -447,8 +447,12 @@ struct bch_dev { * XXX: this should be an enum for allocator state, so as to include * error state */ - bool allocator_blocked; - bool allocator_blocked_full; + enum { + ALLOCATOR_STOPPED, + ALLOCATOR_RUNNING, + ALLOCATOR_BLOCKED, + ALLOCATOR_BLOCKED_FULL, + } allocator_state; alloc_heap alloc_heap; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index aba13e6ea4ff..d97be76da58f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -116,7 +116,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) spin_lock(&ca->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_blocked; + ca->allocator_state != ALLOCATOR_RUNNING; spin_unlock(&ca->freelist_lock); return ret; -- cgit From ea4160234487ac7baefb919747691a21d2face4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Apr 2019 16:03:31 -0400 Subject: bcachefs: use same timesource as current_time() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 72f9f5f9abe9..6f33121736bb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -838,7 +838,7 @@ static inline s64 bch2_current_time(struct bch_fs *c) { struct timespec64 now; - ktime_get_real_ts64(&now); + ktime_get_coarse_real_ts64(&now); return timespec_to_bch2_time(c, now); } -- cgit From 201a4d4cbed532c73d83ea9ea8166f40e9faa1e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 15:49:45 -0400 Subject: bcachefs: fix triggers for stripes btree Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7a05ba5fd589..e9c5889b2c0f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1107,6 +1107,15 @@ void bch2_mark_update(struct btree_trans *trans, if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return; + /* + * For non extents, we only mark the new key, not the key being + * overwritten - unless we're actually deleting: + */ + if ((iter->btree_id == BTREE_ID_ALLOC || + iter->btree_id == BTREE_ID_EC) && + !bkey_deleted(&insert->k->k)) + return; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; -- cgit From 94f651e2c7e2808e82673b46776f951a67da4a2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 15:49:28 -0400 Subject: bcachefs: Return errors from for_each_btree_key() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++--- fs/bcachefs/btree_gc.c | 4 +- fs/bcachefs/btree_iter.h | 16 +++++--- fs/bcachefs/buckets.c | 41 ++++++++++--------- fs/bcachefs/buckets.h | 10 ++--- fs/bcachefs/dirent.c | 17 ++++---- fs/bcachefs/ec.c | 19 ++++----- fs/bcachefs/extents.c | 3 +- fs/bcachefs/fs-io.c | 10 ++--- fs/bcachefs/fs.c | 10 ++--- fs/bcachefs/fsck.c | 28 ++++++------- fs/bcachefs/io.c | 13 +++--- fs/bcachefs/move.c | 2 +- fs/bcachefs/quota.c | 4 +- fs/bcachefs/str_hash.h | 43 ++++++++----------- fs/bcachefs/sysfs.c | 8 +++- fs/bcachefs/tests.c | 35 ++++++++-------- fs/bcachefs/xattr.c | 93 ++++++++++++++++++++---------------------- 18 files changed, 182 insertions(+), 184 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4a8f6fa3db1e..a6d3417ac262 100644 --- a/fs/bcachefs/alloc_background.c +++ 
b/fs/bcachefs/alloc_background.c @@ -273,14 +273,14 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) bch2_alloc_read_key(c, k); - bch2_trans_cond_resched(&trans); - } - ret = bch2_trans_exit(&trans); - if (ret) + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); return ret; + } for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2650f60b7cd7..3ba0910c2a47 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -289,7 +289,7 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { percpu_down_read(&c->mark_lock); ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, BCH_BUCKET_MARK_GC| @@ -300,7 +300,7 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, break; } - return bch2_trans_exit(&trans); + return bch2_trans_exit(&trans) ?: ret; } static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 291c805e3cc5..0a4c6c76e43b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -238,12 +238,16 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, : bch2_btree_iter_next(iter); } -#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k)\ - for (iter = bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !IS_ERR_OR_NULL((_k).k); \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ + bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags))) ?: \ + PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_peek(_iter, _flags)).k); \ + !ret && (_k).k; \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_next(_iter, _flags)).k)) #define for_each_btree_key_continue(_iter, _flags, _k) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e9c5889b2c0f..ff4c61371830 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1035,12 +1035,12 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -inline bool bch2_mark_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c old, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags) +inline int bch2_mark_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c old, + struct bkey_i *new, + struct bch_fs_usage *fs_usage, + unsigned flags) { struct bch_fs *c = trans->c; struct btree *b = iter->l[0].b; @@ -1049,7 +1049,7 @@ inline bool bch2_mark_overwrite(struct btree_trans *trans, if (btree_node_is_extents(b) ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) - return false; + return 0; if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&new->k, old.k)) { @@ -1080,24 +1080,24 @@ inline bool bch2_mark_overwrite(struct btree_trans *trans, BUG_ON(sectors >= 0); } - bch2_mark_key_locked(c, old, false, sectors, - fs_usage, trans->journal_res.seq, flags); - return true; + return bch2_mark_key_locked(c, old, false, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; } -void bch2_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, - struct bch_fs_usage *fs_usage, - unsigned flags) +int bch2_mark_update(struct btree_trans *trans, + struct btree_insert_entry *insert, + struct bch_fs_usage *fs_usage, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; + int ret = 0; if (!btree_node_type_needs_gc(iter->btree_id)) - return; + return 0; bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - @@ -1105,7 +1105,7 @@ void bch2_mark_update(struct btree_trans *trans, fs_usage, trans->journal_res.seq, flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) - return; + return 0; /* * For non extents, we only mark the new key, not the key being @@ -1114,19 +1114,22 @@ void bch2_mark_update(struct btree_trans *trans, if ((iter->btree_id == BTREE_ID_ALLOC || iter->btree_id == BTREE_ID_EC) && !bkey_deleted(&insert->k->k)) - return; + return 0; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - if (!bch2_mark_overwrite(trans, iter, k, insert->k, - fs_usage, flags)) + ret = bch2_mark_overwrite(trans, iter, k, insert->k, + fs_usage, flags); + if (ret <= 0) break; bch2_btree_node_iter_advance(&node_iter, b); } + + return ret; } void bch2_trans_fs_usage_apply(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 90fffee1c289..c51192fae503 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -254,11 +254,11 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); -bool bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned); -void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *, unsigned); +int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, + struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, + struct bch_fs_usage *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4479a9f55ddf..71971b3cc851 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -333,14 +333,10 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, - POS(dir_inum, 0), 0); - if (IS_ERR(iter)) - return PTR_ERR(iter); + int ret; - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0, k, ret) { if 
(k.k->p.inode > dir_inum) break; @@ -369,6 +365,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, struct bkey_s_c k; struct bkey_s_c_dirent dirent; unsigned len; + int ret; if (!dir_emit_dots(file, ctx)) return 0; @@ -376,7 +373,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k) { + POS(inode->v.i_ino, ctx->pos), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -401,7 +398,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, ctx->pos = k.k->p.offset + 1; } - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - return 0; + return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 47d197ed5c99..063f91fc1b09 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -679,10 +679,8 @@ retry: bch2_trans_begin(&trans); /* XXX: start pos hint */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) break; @@ -690,7 +688,8 @@ retry: goto found_slot; } - ret = -ENOSPC; + if (!ret) + ret = -ENOSPC; goto out; found_slot: ret = ec_stripe_mem_alloc(c, iter); @@ -1249,14 +1248,14 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) bch2_stripe_read_key(c, k); - bch2_trans_cond_resched(&trans); - } - ret = bch2_trans_exit(&trans); - if (ret) + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading stripes: %i", ret); return ret; + } for_each_journal_key(*journal_keys, i) if (i->btree_id == BTREE_ID_EC) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2e7c3e82f03b..257c862c9856 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1632,13 +1632,14 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, struct bpos end = pos; struct bkey_s_c k; bool ret = true; + int err; end.offset += size; bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c8f6104553aa..f76dd4d89f25 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2139,7 +2139,7 @@ static inline int range_has_data(struct bch_fs *c, bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2732,7 +2732,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), 0, k) { + POS(inode->v.i_ino, offset >> 9), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -2742,7 +2742,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -2806,7 +2806,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) 
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_next_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); @@ -2823,7 +2823,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 6e377a0e176f..ba4b4e942f0c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1210,7 +1210,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k) + POS(ei->v.i_ino, start >> 9), 0, k, ret) if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (bkey_cmp(bkey_start_pos(k.k), @@ -1220,17 +1220,17 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (have_extent) { ret = bch2_fill_extent(info, &tmp.k, 0); if (ret) - goto out; + break; } bkey_reassemble(&tmp.k, k); have_extent = true; } - if (have_extent) + if (!ret && have_extent) ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); -out: - bch2_trans_exit(&trans); + + ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9db01437315b..ade3446d8dc3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -21,8 +21,10 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) struct btree_iter *iter; struct bkey_s_c k; u64 sectors = 0; + int ret; - for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { + for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, + POS(inum, 0), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -30,7 +32,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) sectors += k.k->size; } - return bch2_trans_iter_free(trans, iter) ?: sectors; + bch2_trans_iter_free(trans, iter); + + return ret ?: sectors; } static int remove_dirent(struct btree_trans *trans, @@ -942,7 +946,7 @@ next: goto up; for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(e->inum, e->offset + 1), 0, k) { + POS(e->inum, e->offset + 1), 0, k, ret) { if (k.k->p.inode != e->inum) break; @@ -985,7 +989,7 @@ next: } goto next; } - ret = bch2_trans_iter_free(&trans, iter); + ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (ret) { bch_err(c, "btree error %i in fsck", ret); goto err; @@ -1087,7 +1091,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1105,7 +1109,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, bch2_trans_cond_resched(&trans); } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); @@ -1432,15 +1436,12 @@ static int check_inodes_fast(struct bch_fs *c) struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_inode inode; - int ret = 0, ret2; + int ret; bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, - POS_MIN, 0); - - 
for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1456,10 +1457,9 @@ static int check_inodes_fast(struct bch_fs *c) break; } } + BUG_ON(ret == -EINTR); - ret2 = bch2_trans_exit(&trans); - - return ret ?: ret2; + return bch2_trans_exit(&trans) ?: ret; } /* diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 71481b9728f5..b07b0f92d4f9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1326,7 +1326,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, retry: for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1357,8 +1357,8 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!btree_iter_err(iter)); - __bcache_io_error(c, "btree IO error"); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error: %i", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: @@ -1871,6 +1871,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; + int ret; bch2_trans_init(&trans, c); @@ -1883,7 +1884,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1915,8 +1916,8 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!btree_iter_err(iter)); - bcache_io_error(c, &rbio->bio, "btree IO error"); + BUG_ON(!ret); + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_trans_exit(&trans); bch2_rbio_done(rbio); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9793896bee77..1ad585ee27ca 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -630,7 +630,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c) bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a4f75d53b42c..b78df735d94c 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -364,7 +364,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.inode != type) break; @@ -436,7 +436,7 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0ed28d7f074d..c47af32ce983 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -134,14 +134,11 @@ bch2_hash_lookup(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - 
BTREE_ITER_SLOTS|flags); - if (IS_ERR(iter)) - return iter; - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|flags, k, ret) { if (iter->pos.inode != inode) break; @@ -156,7 +153,7 @@ bch2_hash_lookup(struct btree_trans *trans, } } - return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT); + return ERR_PTR(ret ?: -ENOENT); } static __always_inline struct btree_iter * @@ -167,14 +164,11 @@ bch2_hash_hole(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return iter; - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; @@ -182,7 +176,7 @@ bch2_hash_hole(struct btree_trans *trans, return iter; } - return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC); + return ERR_PTR(ret ?: -ENOSPC); } static __always_inline @@ -224,15 +218,11 @@ int bch2_hash_set(struct btree_trans *trans, struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; bool found = false; - int ret = 0; - - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); + int ret; - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; @@ -256,9 +246,10 @@ int bch2_hash_set(struct btree_trans *trans, } if (slot) - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_free(trans, slot); + bch2_trans_iter_free(trans, iter); - return bch2_trans_iter_free(trans, iter) ?: -ENOSPC; + return ret ?: -ENOSPC; found: found = true; not_found: diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f4b70f66d0ac..ee4c0764d4ad 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -289,13 +289,14 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) nr_compressed_extents = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; + int ret; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -317,7 +318,10 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) break; } } - bch2_trans_exit(&trans); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + return ret; return scnprintf(buf, PAGE_SIZE, "uncompressed data:\n" diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c8682fe674f6..0f5a3ed13f3e 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -116,7 +116,8 @@ static void test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS_MIN, 0, k, ret) BUG_ON(k.k->p.offset != i++); BUG_ON(i != nr); @@ -161,7 +162,8 @@ static void test_iterate_extents(struct bch_fs *c, 
u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } @@ -209,7 +211,8 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + 0, k, ret) { BUG_ON(k.k->p.offset != i); i += 2; } @@ -221,8 +224,8 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), - BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != (i & 1)); BUG_ON(k.k->p.offset != i++); @@ -263,7 +266,8 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; @@ -276,8 +280,8 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), - BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); BUG_ON(bkey_start_offset(k.k) != i); @@ -501,10 +505,8 @@ static void seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); @@ -522,10 +524,11 @@ static void seq_lookup(struct bch_fs *c, u64 nr) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + int ret; bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) ; bch2_trans_exit(&trans); } @@ -539,10 +542,8 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 68ece7c0ee7a..99fb42225508 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -198,55 +198,54 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum, return ret; } -static void __bch2_xattr_emit(const char *prefix, - const char *name, size_t name_len, - char **buffer, size_t *buffer_size, - ssize_t *ret) +struct xattr_buf { + char *buf; + size_t len; + size_t used; +}; + +static int __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + struct xattr_buf *buf) { const size_t prefix_len = strlen(prefix); const size_t total_len = prefix_len + name_len + 1; - if (*buffer) { - if (total_len > *buffer_size) { - *ret = -ERANGE; - return; - } + if (buf->buf) { + if (buf->used + total_len > buf->len) + return -ERANGE; - memcpy(*buffer, prefix, prefix_len); - memcpy(*buffer + 
prefix_len, + memcpy(buf->buf + buf->used, prefix, prefix_len); + memcpy(buf->buf + buf->used + prefix_len, name, name_len); - (*buffer)[prefix_len + name_len] = '\0'; - - *buffer += total_len; - *buffer_size -= total_len; + buf->buf[buf->used + prefix_len + name_len] = '\0'; } - *ret += total_len; + buf->used += total_len; + return 0; } -static void bch2_xattr_emit(struct dentry *dentry, +static int bch2_xattr_emit(struct dentry *dentry, const struct bch_xattr *xattr, - char **buffer, size_t *buffer_size, - ssize_t *ret) + struct xattr_buf *buf) { const struct xattr_handler *handler = bch2_xattr_type_to_handler(xattr->x_type); - if (handler && (!handler->list || handler->list(dentry))) - __bch2_xattr_emit(handler->prefix ?: handler->name, - xattr->x_name, xattr->x_name_len, - buffer, buffer_size, ret); + return handler && (!handler->list || handler->list(dentry)) + ? __bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, buf) + : 0; } -static void bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_info *inode, - char **buffer, - size_t *buffer_size, - ssize_t *ret, - bool all) +static int bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_info *inode, + struct xattr_buf *buf, + bool all) { const char *prefix = all ? "bcachefs_effective." : "bcachefs."; unsigned id; + int ret = 0; u64 v; for (id = 0; id < Inode_opt_nr; id++) { @@ -258,13 +257,13 @@ static void bch2_xattr_list_bcachefs(struct bch_fs *c, !(inode->ei_inode.bi_fields_set & (1 << id))) continue; - __bch2_xattr_emit(prefix, - bch2_inode_opts[id], - strlen(bch2_inode_opts[id]), - buffer, buffer_size, ret); - if (*ret < 0) + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), buf); + if (ret) break; } + + return ret; } ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) @@ -274,13 +273,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 inum = dentry->d_inode->i_ino; - ssize_t ret = 0; + int ret; bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, - POS(inum, 0), 0, k) { + POS(inum, 0), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -289,27 +289,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (k.k->type != KEY_TYPE_xattr) continue; - bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, - &buffer, &buffer_size, &ret); - if (ret < 0) + ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + if (ret) break; } - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - if (ret < 0) + if (ret) return ret; - bch2_xattr_list_bcachefs(c, inode, &buffer, - &buffer_size, &ret, false); - if (ret < 0) + ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + if (ret) return ret; - bch2_xattr_list_bcachefs(c, inode, &buffer, - &buffer_size, &ret, true); - if (ret < 0) + ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + if (ret) return ret; - return ret; + return buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler, -- cgit From 69eb5390afd287e73f781c26526796b45a77f9d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 20:10:43 -0400 Subject: bcachefs: copy correct journal_seq to dir in create Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited 
to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ba4b4e942f0c..7ae1b7520351 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -454,7 +454,7 @@ retry: if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(dir, inode->ei_journal_seq); + journal_seq_copy(dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } -- cgit From ba5c65576b9d46a05ce2c709c88886f11432a204 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Apr 2019 00:10:08 -0400 Subject: bcachefs: Add actual tracepoints for transaction restarts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 ++ fs/bcachefs/btree_iter.c | 14 +++++--- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 7 ++++ fs/bcachefs/trace.h | 72 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 92 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 55aaa3e4aa84..bb88ce1415c8 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -734,6 +734,8 @@ retry: goto retry; trans_restart(); + trace_trans_restart_btree_node_reused(c, + iter->trans->ip); return ERR_PTR(-EINTR); } } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6b9af53a3e77..4bdbdd22b437 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -252,12 +252,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - if (ret) - __btree_node_lock_type(iter->trans->c, b, type); - else + if (unlikely(!ret)) { trans_restart(); + trace_trans_restart_would_deadlock(iter->trans->c, + iter->trans->ip); + return false; + } - return ret; + __btree_node_lock_type(iter->trans->c, b, type); + return true; } /* Btree iterator locking: */ @@ -1695,6 +1698,7 @@ success: if (trans->iters_live) { trans_restart(); + trace_trans_restart_iters_realloced(trans->c, trans->ip); return -EINTR; } @@ -1863,6 +1867,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, if (old_bytes) { trans_restart(); + trace_trans_restart_mem_realloced(trans->c, trans->ip); return ERR_PTR(-EINTR); } } @@ -1939,6 +1944,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); trans->c = c; + trans->ip = _RET_IP_; trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 8c6f5fe6998e..dd4fa2f595ec 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -267,6 +267,7 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; + unsigned long ip; size_t nr_restarts; u64 commit_start; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2633a5452b13..a9d7905f3373 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -440,6 +440,7 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) if (!bch2_btree_trans_relock(trans)) { trans_restart(" (iter relock after journal preres get blocked)"); + trace_trans_restart_journal_preres_get(c, trans->ip); return -EINTR; } @@ -564,6 +565,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (race_fault()) { ret = -EINTR; trans_restart(" (race)"); + trace_trans_restart_fault_inject(c, trans->ip); goto out; } @@ -680,6 +682,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, */ if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) 
{ trans_restart(" (split)"); + trace_trans_restart_btree_node_split(c, trans->ip); ret = -EINTR; } break; @@ -699,6 +702,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (iter relock after marking replicas)"); + trace_trans_restart_mark_replicas(c, trans->ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: @@ -712,6 +716,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (iter relock after journal res get blocked)"); + trace_trans_restart_journal_res_get(c, trans->ip); ret = -EINTR; break; default: @@ -724,6 +729,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret2) { trans_restart(" (traverse)"); + trace_trans_restart_traverse(c, trans->ip); return ret2; } @@ -735,6 +741,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (atomic)"); + trace_trans_restart_atomic(c, trans->ip); } return ret; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 1aa6ac05d50e..2864a72938ce 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -499,6 +499,78 @@ TRACE_EVENT(copygc, __entry->buckets_moved, __entry->buckets_not_moved) ); +DECLARE_EVENT_CLASS(transaction_restart, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip), + + TP_STRUCT__entry( + __array(char, name, 16) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + memcpy(__entry->name, c->name, 16); + __entry->ip = ip; + ), + + TP_printk("%pS", (void *) __entry->ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_iters_realloced, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mem_realloced, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_traverse, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_atomic, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From a7451c42926c894956fee3b455619c3e0387219f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Apr 2019 15:16:18 -0400 Subject: bcachefs: fix bch2_flags_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 295f4577e9c1..0ca1fb59f54d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -143,10 +143,10 @@ void 
bch2_flags_to_text(struct printbuf *out, nr++; while (flags && (bit = __ffs(flags)) < nr) { - pr_buf(out, "%s", list[bit]); if (!first) pr_buf(out, ","); first = false; + pr_buf(out, "%s", list[bit]); flags ^= 1 << bit; } } -- cgit From ab5c63f5dd3dcdb7943056c6e31ee62325010dc3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2019 17:29:20 -0400 Subject: bcachefs: Don't hardcode BTREE_ID_EXTENTS Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_iter.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4bdbdd22b437..ac3c3769e126 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1114,7 +1114,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, { EBUG_ON(iter->btree_id >= BTREE_ID_NR); EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS && + (btree_node_type_is_extents(iter->btree_id) && type != BTREE_ITER_NODES)); bch2_btree_trans_verify_locks(iter->trans); @@ -1590,7 +1590,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, struct bch_fs *c = trans->c; unsigned i; - if (btree_id == BTREE_ID_EXTENTS && + if (btree_node_type_is_extents(btree_id) && !(flags & BTREE_ITER_NODES)) flags |= BTREE_ITER_IS_EXTENTS; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0a4c6c76e43b..dc15d1b831a8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -156,7 +156,7 @@ static inline struct bpos btree_type_successor(enum btree_id id, if (id == BTREE_ID_INODES) { pos.inode++; pos.offset = 0; - } else if (id != BTREE_ID_EXTENTS) { + } else if (!btree_node_type_is_extents(id)) { pos = bkey_successor(pos); } @@ -169,7 +169,7 @@ static inline struct bpos btree_type_predecessor(enum btree_id id, if (id == BTREE_ID_INODES) { --pos.inode; pos.offset = 0; - } else /* if (id != BTREE_ID_EXTENTS) */ { + } else { pos = bkey_predecessor(pos); } -- cgit From 3811aa6d4d3efbcf26dbe47bf73a78b17e8e71a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2019 17:32:07 -0400 Subject: bcachefs: bch2_bkey_ptrs_invalid() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 ++ fs/bcachefs/ec.c | 4 +- fs/bcachefs/extents.c | 207 ++++++++++++++++++++++--------------------------- fs/bcachefs/extents.h | 4 + 4 files changed, 103 insertions(+), 117 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6f33121736bb..807291c33f5c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -842,4 +842,9 @@ static inline s64 bch2_current_time(struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 063f91fc1b09..1eacd9665c7d 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -114,7 +114,7 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) bkey_val_u64s(k.k) < stripe_val_u64s(s)) return "incorrect value size"; - return NULL; + return bch2_bkey_ptrs_invalid(c, k); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -135,6 +135,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); + + bch2_bkey_ptrs_to_text(out, c, k); } 
static int ptr_matches_stripe(struct bch_fs *c, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 257c862c9856..a975f8f72da4 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -500,43 +500,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; - - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; -} - -static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -590,37 +555,109 @@ static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -/* Btree ptrs */ +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + struct bch_dev *ca; -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (!bch2_dev_exists2(c, ptr->dev)) + return "pointer to invalid device"; + + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; + + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; + + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; + + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; + + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; + + return NULL; +} + +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; const char *reason; + unsigned nonce = UINT_MAX; - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (!extent_entry_is_ptr(entry)) + if (k.k->type == KEY_TYPE_btree_ptr && + !extent_entry_is_ptr(entry)) return "has non ptr field"; - } - bkey_for_each_ptr(ptrs, ptr) { - reason = extent_ptr_invalid(c, k, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; + switch (extent_entry_type(entry)) 
{ + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(c, k, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + if (crc.offset + crc.live_size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; + + size_ondisk = crc.compressed_size; + + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; + + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } return NULL; } +/* Btree ptrs */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + return bch2_bkey_ptrs_invalid(c, k); +} + void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { @@ -665,13 +702,7 @@ err: void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; - - bkey_ptrs_to_text(out, c, k); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); + bch2_bkey_ptrs_to_text(out, c, k); } /* Extents */ @@ -1221,60 +1252,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) + if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) return "value too big"; - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } - - return NULL; + return bch2_bkey_ptrs_invalid(c, k); } void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, @@ -1335,13 +1316,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; - - bkey_ptrs_to_text(out, c, k); - - invalid = bch2_extent_invalid(c, k); 
- if (invalid) - pr_buf(out, " invalid: %s", invalid); + bch2_bkey_ptrs_to_text(out, c, k); } static void bch2_extent_crc_init(union bch_extent_crc *crc, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 57a84971637a..4cd16e8a2af6 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -359,6 +359,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + /* bch_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -- cgit From 75812e70d9341e7a1a55f6f2a0e367186c859623 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 20:34:24 -0400 Subject: bcachefs: Fix fsync error reporting Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f76dd4d89f25..619daf65e1ec 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1131,9 +1131,10 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio_vec *bvec; if (io->op.op.error) { - bio_for_each_segment_all(bvec, bio, iter) + bio_for_each_segment_all(bvec, bio, iter) { SetPageError(bvec->bv_page); - set_bit(AS_EIO, &io->op.inode->v.i_mapping->flags); + mapping_set_error(bvec->bv_page->mapping, -EIO); + } } /* @@ -2068,10 +2069,9 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; out: - if (c->opts.journal_flush_disabled) - return 0; - - ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + if (!c->opts.journal_flush_disabled) + ret = bch2_journal_flush_seq(&c->journal, + inode->ei_journal_seq); ret2 = file_check_and_advance_wb_err(file); return ret ?: ret2; -- cgit From 1f7d45beb71f460b9ea138788340926c1b3a4c4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 May 2019 11:58:00 -0400 Subject: bcachefs: Fix journal shutdown path Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 25d0631c43dd..0261c6bbfa92 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -959,6 +959,8 @@ void bch2_fs_journal_stop(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bch2_journal_flush_all_pins(j); + wait_event(j->wait, journal_entry_close(j)); /* do we need to write another journal entry? 
*/ -- cgit From 1fe93f88c36dcceee9f92055d0a1febbd90338d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 12:24:39 -0400 Subject: bcachefs: fix bch2_rbio_narrow_crcs() Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b07b0f92d4f9..9bb4b1fe5b8a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1422,7 +1422,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; - unsigned offset; + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; if (rbio->pick.crc.compression_type) @@ -1445,24 +1445,19 @@ retry: e = bkey_i_to_extent(&new.k); if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || + rbio->pick.ptr, data_offset) || bversion_cmp(e->k.version, rbio->version)) goto out; /* Extent was merged? */ - if (bkey_start_offset(&e->k) < rbio->pos.offset || - e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(&e->k) < data_offset || + e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; - /* The extent might have been partially overwritten since we read it: */ - offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - offset, e->k.size, - rbio->pick.crc.csum_type)) { + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(&e->k) - data_offset, e->k.size, + rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } -- cgit From 33eb63e5753ad6229d4027340153817b92840760 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 May 2019 13:25:25 -0400 Subject: bcachefs: Fix a bug with multiple iterators being traversed If upgrade fails on one iterator, but it was copied from another iterator and will be freed before transaction restart, then the original iterator will get traversed first, so we need to make required btree nodes on the original iterator will be traversed too. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ac3c3769e126..eeb9a59283a7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -320,7 +320,6 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, trans_for_each_iter(iter->trans, linked) if (linked != iter && linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0 && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; btree_iter_get_locks(linked, true); -- cgit From fca1223ccfac2a461d7d3e29fb09a1b2142bdd7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 14:17:33 -0500 Subject: bcachefs: Avoid write lock on mark_lock mark_lock is a frequently taken lock, and there's also potential for deadlocks since currently bch2_clear_page_bits which is called from memory reclaim has to take it to drop disk reservations. The disk reservation get path takes it when it recalculates the number of sectors known to be available, but it's not really needed for consistency. 
We just want to make sure we only have one thread updating the sectors_available count, which we can do with a dedicated mutex. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 16 +++++----------- fs/bcachefs/super.c | 2 ++ 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 807291c33f5c..877ce788d413 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -640,6 +640,7 @@ struct bch_fs { unsigned bucket_size_max; atomic64_t sectors_available; + struct mutex sectors_available_lock; struct bch_fs_pcpu __percpu *pcpu; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ff4c61371830..2488a2227bd9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1182,13 +1182,6 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* Disk reservations: */ -static u64 bch2_recalc_sectors_available(struct bch_fs *c) -{ - percpu_u64_set(&c->pcpu->sectors_available, 0); - - return avail_factor(__bch2_fs_usage_read_short(c).free); -} - void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->mark_lock); @@ -1222,7 +1215,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (get < sectors) { preempt_enable(); - percpu_up_read(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, @@ -1240,9 +1232,10 @@ out: return 0; recalculate: - percpu_down_write(&c->mark_lock); + mutex_lock(&c->sectors_available_lock); - sectors_available = bch2_recalc_sectors_available(c); + percpu_u64_set(&c->pcpu->sectors_available, 0); + sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { @@ -1256,7 +1249,8 @@ recalculate: ret = -ENOSPC; } - percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sectors_available_lock); + percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e0d4898ad0f5..b954a4e47e15 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -669,6 +669,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init_early(&c->btree_cache); + mutex_init(&c->sectors_available_lock); + if (percpu_init_rwsem(&c->mark_lock)) goto err; -- cgit From 5e82a9a1f4f82e273530b90d107638a5969d1de0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Feb 2019 19:34:47 -0500 Subject: bcachefs: Write out fs usage consistently Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 +- fs/bcachefs/btree_gc.c | 28 +++-- fs/bcachefs/btree_update_interior.c | 24 ++-- fs/bcachefs/btree_update_leaf.c | 4 +- fs/bcachefs/buckets.c | 217 ++++++++++++++++++++++++------------ fs/bcachefs/buckets.h | 27 +++-- fs/bcachefs/buckets_types.h | 12 +- fs/bcachefs/chardev.c | 4 +- fs/bcachefs/recovery.c | 8 +- fs/bcachefs/replicas.c | 128 ++++++++++++--------- fs/bcachefs/super-io.c | 25 ++--- fs/bcachefs/super.c | 13 ++- fs/bcachefs/sysfs.c | 37 +----- fs/bcachefs/util.h | 8 ++ 14 files changed, 310 insertions(+), 231 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 877ce788d413..68e2d3b1a9a6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -646,11 +646,15 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; struct bch_fs_usage __percpu *usage[2]; + 
struct bch_fs_usage __percpu *usage_gc; + u64 __percpu *online_reserved; /* single element mempool: */ struct mutex usage_scratch_lock; - struct bch_fs_usage *usage_scratch; + struct bch_fs_usage_online *usage_scratch; /* * When we invalidate buckets, we use both the priority and the amount diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3ba0910c2a47..5ad933ba049b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -490,8 +490,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->usage[1] = NULL; } - free_percpu(c->usage[1]); - c->usage[1] = NULL; + free_percpu(c->usage_gc); + c->usage_gc = NULL; } static int bch2_gc_done(struct bch_fs *c, @@ -587,14 +587,16 @@ static int bch2_gc_done(struct bch_fs *c, } }; + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + bch2_dev_usage_from_buckets(c); { unsigned nr = fs_usage_u64s(c); - struct bch_fs_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) c->usage[0], nr); + struct bch_fs_usage *dst = c->usage_base; struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage[1], nr); + bch2_acc_percpu_u64s((void *) c->usage_gc, nr); copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); @@ -647,11 +649,11 @@ static int bch2_gc_start(struct bch_fs *c, */ gc_pos_set(c, gc_phase(GC_PHASE_START)); - BUG_ON(c->usage[1]); + BUG_ON(c->usage_gc); - c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), sizeof(u64), GFP_KERNEL); - if (!c->usage[1]) + if (!c->usage_gc) return -ENOMEM; for_each_member_device(ca, c, i) { @@ -770,11 +772,17 @@ out: ret = -EINVAL; } - percpu_down_write(&c->mark_lock); + if (!ret) { + bch2_journal_block(&c->journal); - if (!ret) + percpu_down_write(&c->mark_lock); ret = bch2_gc_done(c, initial, metadata_only); + bch2_journal_unblock(&c->journal); + } else { + percpu_down_write(&c->mark_lock); + } + /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 35472cf5e9e0..cc0cd465b863 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1066,7 +1066,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; struct btree *old = btree_node_root(c, b); - struct bch_fs_usage *fs_usage; + struct bch_fs_usage_online *fs_usage; __bch2_btree_set_root_inmem(c, b); @@ -1075,7 +1075,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, fs_usage, 0, 0); + true, 0, &fs_usage->u, 0, 0); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, NULL, 0, @@ -1084,8 +1084,8 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), - fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + &fs_usage->u); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); @@ -1160,7 +1160,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct btree_node_iter *node_iter) { struct bch_fs *c = as->c; - struct bch_fs_usage *fs_usage; + struct 
bch_fs_usage_online *fs_usage; struct bkey_packed *k; struct bkey tmp; @@ -1171,7 +1171,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, fs_usage, 0, 0); + true, 0, &fs_usage->u, 0, 0); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), @@ -1188,9 +1188,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (k && !bkey_cmp_packed(b, k, &insert->k)) bch2_btree_node_free_index(as, b, bkey_disassemble(b, k, &tmp), - fs_usage); + &fs_usage->u); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); @@ -1984,7 +1984,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bkey_copy(&b->key, &new_key->k_i); } } else { - struct bch_fs_usage *fs_usage; + struct bch_fs_usage_online *fs_usage; BUG_ON(btree_node_root(c, b) != b); @@ -1995,7 +1995,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, fs_usage, 0, 0); + true, 0, &fs_usage->u, 0, 0); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, NULL, 0, @@ -2003,8 +2003,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), - fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + &fs_usage->u); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a9d7905f3373..3425ad6f68b2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -533,7 +533,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; + struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; struct btree_iter *linked; int ret; @@ -608,7 +608,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) { trans_for_each_update_iter(trans, i) - bch2_mark_update(trans, i, fs_usage, 0); + bch2_mark_update(trans, i, &fs_usage->u, 0); if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2488a2227bd9..fb5461df3bbf 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -120,8 +120,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c) unsigned i; percpu_down_write(&c->mark_lock); - usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], - fs_usage_u64s(c)); + usage = c->usage_base; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); for (i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; @@ -146,7 +148,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) +void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { if (fs_usage == c->usage_scratch) mutex_unlock(&c->usage_scratch_lock); @@ -154,12 +156,12 @@ void 
bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) kfree(fs_usage); } -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) +struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *c) { - struct bch_fs_usage *ret; - unsigned bytes = fs_usage_u64s(c) * sizeof(u64); - - ret = kzalloc(bytes, GFP_NOWAIT); + struct bch_fs_usage_online *ret; + unsigned bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * + READ_ONCE(c->replicas.nr); + ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); if (ret) return ret; @@ -189,30 +191,117 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) return ret; } -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) { - struct bch_fs_usage *ret; - unsigned v, u64s = fs_usage_u64s(c); -retry: - ret = kzalloc(u64s * sizeof(u64), GFP_NOFS); - if (unlikely(!ret)) - return NULL; + return this_cpu_ptr(gc + ? c->usage_gc + : c->usage[journal_seq & 1]); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) +{ + ssize_t offset = v - (u64 *) c->usage_base; + unsigned seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); + percpu_rwsem_assert_held(&c->mark_lock); + + do { + seq = read_seqcount_begin(&c->usage_lock); + ret = *v + + percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + + percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) +{ + struct bch_fs_usage_online *ret; + unsigned seq, i, u64s; percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c); - if (unlikely(u64s != v)) { - u64s = v; + ret = kmalloc(sizeof(struct bch_fs_usage_online) + + sizeof(u64) + c->replicas.nr, GFP_NOFS); + if (unlikely(!ret)) { percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; + return NULL; } - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); + ret->online_reserved = percpu_u64_get(c->online_reserved); + + u64s = fs_usage_u64s(c); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } +void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) +{ + unsigned u64s = fs_usage_u64s(c); + + BUG_ON(idx >= ARRAY_SIZE(c->usage)); + + preempt_disable(); + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + write_seqcount_end(&c->usage_lock); + preempt_enable(); +} + +void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_fs_usage_online *fs_usage) +{ + unsigned i; + + pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + + pr_buf(out, "hidden:\t\t\t\t%llu\n", + fs_usage->u.hidden); + pr_buf(out, "data:\t\t\t\t%llu\n", + fs_usage->u.data); + pr_buf(out, "cached:\t\t\t\t%llu\n", + fs_usage->u.cached); + pr_buf(out, "reserved:\t\t\t%llu\n", + fs_usage->u.reserved); + pr_buf(out, "nr_inodes:\t\t\t%llu\n", + fs_usage->u.nr_inodes); + pr_buf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; + i < ARRAY_SIZE(fs_usage->u.persistent_reserved); + i++) { + pr_buf(out, "%u replicas:\n", i + 1); + pr_buf(out, 
"\treserved:\t\t%llu\n", + fs_usage->u.persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + pr_buf(out, "\t"); + bch2_replicas_entry_to_text(out, e); + pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + } +} + #define RESERVE_FACTOR 6 static u64 reserve_factor(u64 r) @@ -225,12 +314,12 @@ static u64 avail_factor(u64 r) return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->hidden + - fs_usage->btree + - fs_usage->data + - reserve_factor(fs_usage->reserved + + return min(fs_usage->u.hidden + + fs_usage->u.btree + + fs_usage->u.data + + reserve_factor(fs_usage->u.reserved + fs_usage->online_reserved), c->capacity); } @@ -242,17 +331,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - percpu_u64_get(&c->usage[0]->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->hidden); - data = percpu_u64_get(&c->usage[0]->data) + - percpu_u64_get(&c->usage[0]->btree); - reserved = percpu_u64_get(&c->usage[0]->reserved) + - percpu_u64_get(&c->usage[0]->online_reserved); + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); return ret; } @@ -300,10 +389,12 @@ static bool bucket_became_unavailable(struct bucket_mark old, } int bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res) + struct bch_fs_usage_online *src, + struct disk_reservation *disk_res, + unsigned journal_seq) { - s64 added = fs_usage->data + fs_usage->reserved; + struct bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); + s64 added = src->u.data + src->u.reserved; s64 should_not_have_added; int ret = 0; @@ -315,20 +406,22 @@ int bch2_fs_usage_apply(struct bch_fs *c, */ should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased without a reservation")) { + "disk usage increased by %lli more than reservation of %llu", + added, disk_res ? 
disk_res->sectors : 0)) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; ret = -1; } if (added > 0) { - disk_res->sectors -= added; - fs_usage->online_reserved -= added; + disk_res->sectors -= added; + src->online_reserved -= added; } + this_cpu_add(*c->online_reserved, src->online_reserved); + preempt_disable(); - acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), - (u64 *) fs_usage, fs_usage_u64s(c)); + acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); preempt_enable(); return ret; @@ -371,10 +464,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, account_bucket(fs_usage, dev_usage, bucket_type(new), 1, ca->mi.bucket_size); - dev_usage->buckets_alloc += - (int) new.owned_by_allocator - (int) old.owned_by_allocator; - dev_usage->buckets_ec += - (int) new.stripe - (int) old.stripe; + dev_usage->buckets_ec += (int) new.stripe - (int) old.stripe; dev_usage->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -394,21 +484,12 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) { struct bch_dev *ca; struct bucket_mark old = { .v.counter = 0 }; - struct bch_fs_usage *fs_usage; struct bucket_array *buckets; struct bucket *g; unsigned i; int cpu; - percpu_u64_set(&c->usage[0]->hidden, 0); - - /* - * This is only called during startup, before there's any multithreaded - * access to c->usage: - */ - preempt_disable(); - fs_usage = this_cpu_ptr(c->usage[0]); - preempt_enable(); + c->usage_base->hidden = 0; for_each_member_device(ca, c, i) { for_each_possible_cpu(cpu) @@ -418,7 +499,7 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) buckets = bucket_array(ca); for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, fs_usage, + bch2_dev_usage_update(c, ca, c->usage_base, old, g->mark, false); } } @@ -483,7 +564,7 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *ret, bool gc) { - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -522,7 +603,7 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -634,7 +715,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, old.dirty_sectors, sectors); if (c) - bch2_dev_usage_update(c, ca, this_cpu_ptr(c->usage[gc]), + bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), old, new, gc); return 0; @@ -974,7 +1055,7 @@ int bch2_mark_key_locked(struct bch_fs *c, preempt_disable(); if (!fs_usage || gc) - fs_usage = this_cpu_ptr(c->usage[gc]); + fs_usage = fs_usage_ptr(c, journal_seq, gc); switch (k.k->type) { case KEY_TYPE_alloc: @@ -1133,7 +1214,7 @@ int bch2_mark_update(struct btree_trans *trans, } void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct bch_fs_usage *fs_usage) + struct bch_fs_usage_online *fs_usage) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -1141,7 +1222,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, u64 disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; char buf[200]; - if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res) || + if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, + trans->journal_res.seq) || warned_disk_usage || xchg(&warned_disk_usage, 1)) return; @@ -1182,15 +1264,6 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* Disk reservations: */ -void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -{ - percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->online_reserved, res->sectors); - percpu_up_read(&c->mark_lock); - - res->sectors = 0; -} - #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, @@ -1224,7 +1297,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; preempt_enable(); @@ -1241,7 +1314,7 @@ recalculate: (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c51192fae503..86431cffb660 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -219,12 +219,19 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } -void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); +void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *); +struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *); -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); +u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); + +void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + +void bch2_fs_usage_to_text(struct printbuf *, + struct bch_fs *, struct bch_fs_usage_online *); + +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); @@ -251,25 +258,23 @@ int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *); +int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, + struct disk_reservation *, unsigned); int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, struct bch_fs_usage *, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); /* disk reservations: */ -void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); - static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) - __bch2_disk_reservation_put(c, res); + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; } #define BCH_DISK_RESERVATION_NOFAIL 
(1 << 0) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a98493dd2ba8..8e47b273360c 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -52,7 +52,6 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; - u64 buckets_alloc; u64 buckets_ec; u64 buckets_unavailable; @@ -63,12 +62,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - - u64 online_reserved; - - /* fields after online_reserved are cleared/recalculated by gc: */ - u64 gc_start[0]; - u64 hidden; u64 btree; u64 data; @@ -88,6 +81,11 @@ struct bch_fs_usage { u64 replicas[]; }; +struct bch_fs_usage_online { + u64 online_reserved; + struct bch_fs_usage u; +}; + struct bch_fs_usage_short { u64 capacity; u64 used; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index f7cfec9f00f9..2573376290bb 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -394,7 +394,7 @@ static long bch2_ioctl_usage(struct bch_fs *c, } { - struct bch_fs_usage *src; + struct bch_fs_usage_online *src; struct bch_ioctl_fs_usage dst = { .capacity = c->capacity, }; @@ -410,7 +410,7 @@ static long bch2_ioctl_usage(struct bch_fs *c, for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = - src->persistent_reserved[i]; + src->u.persistent_reserved[i]; #if 0 for (j = 0; j < BCH_DATA_NR; j++) dst.sectors[j][i] = src.replicas[i].data[j]; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d207ff7b98f4..a3f07565efb0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -404,13 +404,11 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->btree_id) { case FS_USAGE_RESERVED: if (entry->level < BCH_REPLICAS_MAX) - percpu_u64_set(&c->usage[0]-> - persistent_reserved[entry->level], - le64_to_cpu(u->v)); + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); break; case FS_USAGE_INODES: - percpu_u64_set(&c->usage[0]->nr_inodes, - le64_to_cpu(u->v)); + c->usage_base->nr_inodes = le64_to_cpu(u->v); break; case FS_USAGE_KEY_VERSION: atomic64_set(&c->key_version, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index b1df2c1ce4a4..cf13a628682f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "buckets.h" #include "journal.h" #include "replicas.h" #include "super-io.h" @@ -235,20 +236,13 @@ bool bch2_replicas_marked(struct bch_fs *c, return marked; } -static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, +static void __replicas_table_update(struct bch_fs_usage *dst, struct bch_replicas_cpu *dst_r, - struct bch_fs_usage __percpu *src_p, + struct bch_fs_usage *src, struct bch_replicas_cpu *src_r) { - unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; - struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((void *) src_p, src_nr); int src_idx, dst_idx; - preempt_disable(); - dst = this_cpu_ptr(dst_p); - preempt_enable(); - *dst = *src; for (src_idx = 0; src_idx < src_r->nr; src_idx++) { @@ -263,42 +257,75 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, } } +static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage __percpu *src_p, + struct bch_replicas_cpu *src_r) +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) + 
bch2_acc_percpu_u64s((void *) src_p, src_nr); + + preempt_disable(); + dst = this_cpu_ptr(dst_p); + preempt_enable(); + + __replicas_table_update(dst, dst_r, src, src_r); +} + /* * Resize filesystem accounting: */ static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; - struct bch_fs_usage *new_scratch = NULL; - unsigned bytes = sizeof(struct bch_fs_usage) + + struct bch_fs_usage __percpu *new_usage[2]; + struct bch_fs_usage_online *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; + unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * new_r->nr; int ret = -ENOMEM; - if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO)) || - (c->usage[1] && - !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO))) || - !(new_scratch = kmalloc(bytes, GFP_NOIO))) - goto err; + memset(new_usage, 0, sizeof(new_usage)); - if (c->usage[0]) - __replicas_table_update(new_usage[0], new_r, - c->usage[0], &c->replicas); - if (c->usage[1]) - __replicas_table_update(new_usage[1], new_r, - c->usage[1], &c->replicas); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_NOIO))) + goto err; - swap(c->usage[0], new_usage[0]); - swap(c->usage[1], new_usage[1]); + if (!(new_base = kzalloc(bytes, GFP_NOIO)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_NOIO)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + goto err; + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (c->usage[i]) + __replicas_table_update_pcpu(new_usage[i], new_r, + c->usage[i], &c->replicas); + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + swap(c->usage[i], new_usage[i]); + swap(c->usage_base, new_base); swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); ret = 0; err: + free_percpu(new_gc); kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); + kfree(new_base); return ret; } @@ -457,9 +484,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - - if (ret) - goto err; + percpu_down_write(&c->mark_lock); /* * this is kind of crappy; the replicas gc mechanism needs to be ripped @@ -470,26 +495,20 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct bch_replicas_cpu n; - u64 v; - if (__replicas_has_entry(&c->replicas_gc, e)) - continue; - - v = percpu_u64_get(&c->usage[0]->replicas[i]); - if (!v) - continue; + if (!__replicas_has_entry(&c->replicas_gc, e) && + (c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]))) { + n = cpu_replicas_add_entry(&c->replicas_gc, e); + if (!n.entries) { + ret = -ENOSPC; + goto err; + } - n = cpu_replicas_add_entry(&c->replicas_gc, e); - if (!n.entries) { - ret = -ENOSPC; - goto err; + swap(n, c->replicas_gc); + kfree(n.entries); } - - percpu_down_write(&c->mark_lock); - swap(n, c->replicas_gc); - percpu_up_write(&c->mark_lock); - - kfree(n.entries); } if 
(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { @@ -497,19 +516,18 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) goto err; } - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ + ret = replicas_table_update(c, &c->replicas_gc); err: - percpu_down_write(&c->mark_lock); - if (!ret) - ret = replicas_table_update(c, &c->replicas_gc); - kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; + percpu_up_write(&c->mark_lock); + if (!ret) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + return ret; } @@ -576,7 +594,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, BUG_ON(ret < 0); } - percpu_u64_set(&c->usage[0]->replicas[idx], sectors); + c->usage_base->replicas[idx] = sectors; return 0; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0fe8ea22c6a1..6e69a4f74ca0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "buckets.h" #include "checksum.h" #include "disk_groups.h" #include "ec.h" @@ -978,13 +979,16 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, mutex_unlock(&c->btree_root_lock); - if (journal_seq) - return entry; + percpu_down_read(&c->mark_lock); - percpu_down_write(&c->mark_lock); + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & 1); + } { - u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -992,7 +996,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; - u->v = cpu_to_le64(nr_inodes); + u->v = cpu_to_le64(c->usage_base->nr_inodes); entry = vstruct_next(entry); } @@ -1013,17 +1017,13 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - u64 sectors = percpu_u64_get(&c->usage[0]->persistent_reserved[i]); - - if (!sectors) - continue; memset(u, 0, sizeof(*u)); u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; - u->v = sectors; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); entry = vstruct_next(entry); } @@ -1031,7 +1031,6 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]); struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -1039,14 +1038,14 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(sectors); + u->v = cpu_to_le64(c->usage_base->replicas[i]); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), "embedded variable length struct"); entry = vstruct_next(entry); } - percpu_up_write(&c->mark_lock); + percpu_up_read(&c->mark_lock); return entry; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b954a4e47e15..959638c986a0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -464,8 +464,11 @@ static void bch2_fs_free(struct bch_fs *c) 
bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->mark_lock); + free_percpu(c->online_reserved); kfree(c->usage_scratch); + free_percpu(c->usage[1]); free_percpu(c->usage[0]); + kfree(c->usage_base); free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); @@ -658,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); + seqcount_init(&c->usage_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -721,6 +726,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, @@ -1433,13 +1439,8 @@ err: static void dev_usage_clear(struct bch_dev *ca) { struct bucket_array *buckets; - int cpu; - for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage[0], cpu); - memset(p, 0, sizeof(*p)); - } + percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); down_read(&ca->bucket_lock); buckets = bucket_array(ca); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ee4c0764d4ad..3139161fbe88 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -235,43 +235,12 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); - struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); - unsigned i; + struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); if (!fs_usage) return -ENOMEM; - pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity); - - pr_buf(&out, "hidden:\t\t\t\t%llu\n", - fs_usage->hidden); - pr_buf(&out, "data:\t\t\t\t%llu\n", - fs_usage->data); - pr_buf(&out, "cached:\t\t\t\t%llu\n", - fs_usage->cached); - pr_buf(&out, "reserved:\t\t\t%llu\n", - fs_usage->reserved); - pr_buf(&out, "nr_inodes:\t\t\t%llu\n", - fs_usage->nr_inodes); - pr_buf(&out, "online reserved:\t\t%llu\n", - fs_usage->online_reserved); - - for (i = 0; - i < ARRAY_SIZE(fs_usage->persistent_reserved); - i++) { - pr_buf(&out, "%u replicas:\n", i + 1); - pr_buf(&out, "\treserved:\t\t%llu\n", - fs_usage->persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - - pr_buf(&out, "\t"); - bch2_replicas_entry_to_text(&out, e); - pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]); - } + bch2_fs_usage_to_text(&out, c, fs_usage); percpu_up_read(&c->mark_lock); @@ -840,7 +809,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) "free[RESERVE_NONE]: %zu/%zu\n" "buckets:\n" " capacity: %llu\n" - " alloc: %llu\n" " sb: %llu\n" " journal: %llu\n" " meta: %llu\n" @@ -867,7 +835,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_alloc, stats.buckets[BCH_DATA_SB], stats.buckets[BCH_DATA_JOURNAL], stats.buckets[BCH_DATA_BTREE], diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 59c8a1dac7be..c0910f230caf 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -741,6 +741,14 @@ static inline void 
acc_u64s_percpu(u64 *acc, const u64 __percpu *src, acc_u64s(acc, per_cpu_ptr(src, cpu), nr); } +static inline void percpu_memset(void __percpu *p, int c, size_t bytes) +{ + int cpu; + + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(p, cpu), c, bytes); +} + u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); #define cmp_int(l, r) ((l > r) - (l < r)) -- cgit From c43a6ef9a0747ef1094ff14e173513070ed91600 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jun 2020 12:28:01 -0400 Subject: bcachefs: btree_bkey_cached_common This is prep work for the btree key cache: btree iterators will point to either struct btree, or a new struct bkey_cached. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_cache.c | 62 +++++++++---------- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_gc.c | 34 +++++------ fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/btree_io.c | 36 +++++------ fs/bcachefs/btree_io.h | 2 +- fs/bcachefs/btree_iter.c | 82 ++++++++++++------------- fs/bcachefs/btree_iter.h | 27 +++++---- fs/bcachefs/btree_locking.h | 16 ++--- fs/bcachefs/btree_types.h | 15 +++-- fs/bcachefs/btree_update_interior.c | 115 ++++++++++++++++++------------------ fs/bcachefs/btree_update_interior.h | 6 +- fs/bcachefs/btree_update_leaf.c | 4 +- fs/bcachefs/debug.c | 4 +- fs/bcachefs/trace.h | 6 +- 16 files changed, 212 insertions(+), 203 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a6d3417ac262..5988971521eb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1529,7 +1529,7 @@ again: rcu_read_unlock(); btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); goto again; } else { nodes_unwritten = true; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index bb88ce1415c8..2e932ee7ad0c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -27,7 +27,7 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) reserve += min_t(unsigned, 1, - c->btree_roots[i].b->level) * 8; + c->btree_roots[i].b->c.level) * 8; c->btree_cache.reserve = reserve; } @@ -98,8 +98,8 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - six_lock_init(&b->lock); - lockdep_set_novalidate_class(&b->lock); + six_lock_init(&b->c.lock); + lockdep_set_novalidate_class(&b->c.lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); @@ -128,8 +128,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, { int ret; - b->level = level; - b->btree_id = id; + b->c.level = level; + b->c.btree_id = id; mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); @@ -159,10 +159,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) lockdep_assert_held(&bc->lock); - if (!six_trylock_intent(&b->lock)) + if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; - if (!six_trylock_write(&b->lock)) + if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; if (btree_node_noevict(b)) @@ -203,9 +203,9 @@ out: trace_btree_node_reap(c, b); return ret; out_unlock: - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); out_unlock_intent: - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); ret = -ENOMEM; goto out; } @@ -263,8 +263,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker 
*shrink, if (++i > 3 && !btree_node_reclaim(c, b)) { btree_node_data_free(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); freed++; } } @@ -290,8 +290,8 @@ restart: mutex_unlock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); if (freed >= nr) goto out; @@ -530,8 +530,8 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) if (b->data) goto out_unlock; - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); goto err; } @@ -539,8 +539,8 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) if (!b) goto err; - BUG_ON(!six_trylock_intent(&b->lock)); - BUG_ON(!six_trylock_write(&b->lock)); + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); out_unlock: BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -611,8 +611,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, list_add(&b->list, &bc->freeable); mutex_unlock(&bc->lock); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return NULL; } @@ -630,15 +630,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, bch2_btree_node_read(c, b, sync); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); if (!sync) { - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); return NULL; } if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->lock); + six_lock_downgrade(&b->c.lock); return b; } @@ -727,9 +727,9 @@ retry: return ERR_PTR(-EINTR); if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || - b->level != level || + b->c.level != level || race_fault())) { - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); if (bch2_btree_node_relock(iter, level + 1)) goto retry; @@ -758,11 +758,11 @@ retry: set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); return ERR_PTR(-EIO); } - EBUG_ON(b->btree_id != iter->btree_id || + EBUG_ON(b->c.btree_id != iter->btree_id || BTREE_NODE_LEVEL(b->data) != level || bkey_cmp(b->data->max_key, k->k.p)); @@ -780,7 +780,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct bkey_packed *k; BKEY_PADDED(k) tmp; struct btree *ret = NULL; - unsigned level = b->level; + unsigned level = b->c.level; parent = btree_iter_node(iter, level + 1); if (!parent) @@ -789,7 +789,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!bch2_btree_node_relock(iter, level + 1)) goto out_upgrade; - node_iter = iter->l[parent->level].iter; + node_iter = iter->l[parent->c.level].iter; k = bch2_btree_node_iter_peek_all(&node_iter, parent); BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); @@ -836,7 +836,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); if (!IS_ERR(ret)) { - six_unlock_intent(&ret->lock); + six_unlock_intent(&ret->c.lock); ret = ERR_PTR(-EINTR); } } @@ -859,7 +859,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, + BUG_ON(bkey_cmp(btree_type_successor(n1->c.btree_id, n1->key.k.p), n2->data->min_key)); } @@ -904,7 +904,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs 
*c, pr_buf(out, "l %u %llu:%llu - %llu:%llu:\n" " ptrs: ", - b->level, + b->c.level, b->data->min_key.inode, b->data->min_key.offset, b->data->max_key.inode, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 7bd2bc84160d..e0f233583796 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -83,7 +83,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) -#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) +#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5ad933ba049b..3dc073e5e5b6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -71,10 +71,10 @@ static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) static void btree_node_range_checks(struct bch_fs *c, struct btree *b, struct range_checks *r) { - struct range_level *l = &r->l[b->level]; + struct range_level *l = &r->l[b->c.level]; struct bpos expected_min = bkey_cmp(l->min, l->max) - ? btree_type_successor(b->btree_id, l->max) + ? btree_type_successor(b->c.btree_id, l->max) : l->max; bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, @@ -86,8 +86,8 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, l->max = b->data->max_key; - if (b->level > r->depth) { - l = &r->l[b->level - 1]; + if (b->c.level > r->depth) { + l = &r->l[b->c.level - 1]; bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", @@ -105,7 +105,7 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, if (bkey_cmp(b->data->max_key, POS_MAX)) l->min = l->max = - btree_type_successor(b->btree_id, + btree_type_successor(b->c.btree_id, b->data->max_key); } } @@ -261,7 +261,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!btree_node_fake(b)) ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), &max_stale, initial); - gc_pos_set(c, gc_pos_btree_root(b->btree_id)); + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); return ret; @@ -932,9 +932,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, set_btree_bset_end(n1, n1->set); - six_unlock_write(&n2->lock); + six_unlock_write(&n2->c.lock); bch2_btree_node_free_never_inserted(c, n2); - six_unlock_intent(&n2->lock); + six_unlock_intent(&n2->c.lock); memmove(new_nodes + i - 1, new_nodes + i, @@ -970,7 +970,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, btree_node_reset_sib_u64s(n); bch2_btree_build_aux_trees(n); - six_unlock_write(&n->lock); + six_unlock_write(&n->c.lock); bch2_btree_node_write(c, n, SIX_LOCK_intent); } @@ -1013,7 +1013,7 @@ next: BUG_ON(!bch2_keylist_empty(&keylist)); - BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); + BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); bch2_btree_iter_node_replace(iter, new_nodes[0]); @@ -1035,7 +1035,7 @@ next: } else { old_nodes[i] = NULL; if (new_nodes[i]) - six_unlock_intent(&new_nodes[i]->lock); + six_unlock_intent(&new_nodes[i]->c.lock); } } @@ -1078,11 +1078,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) for (i = 1; i < GC_MERGE_NODES; i++) { if (!merge[i] || - !six_relock_intent(&merge[i]->lock, lock_seq[i])) + 
!six_relock_intent(&merge[i]->c.lock, lock_seq[i])) break; - if (merge[i]->level != merge[0]->level) { - six_unlock_intent(&merge[i]->lock); + if (merge[i]->c.level != merge[0]->c.level) { + six_unlock_intent(&merge[i]->c.lock); break; } } @@ -1091,11 +1091,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) bch2_coalesce_nodes(c, iter, merge); for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { - lock_seq[i] = merge[i]->lock.state.seq; - six_unlock_intent(&merge[i]->lock); + lock_seq[i] = merge[i]->c.lock.state.seq; + six_unlock_intent(&merge[i]->c.lock); } - lock_seq[0] = merge[0]->lock.state.seq; + lock_seq[0] = merge[0]->c.lock.state.seq; if (kthread && kthread_should_stop()) { bch2_trans_exit(&trans); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index bd5f2752954f..3966d5e54cfd 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -81,7 +81,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id, */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return gc_pos_btree(b->btree_id, b->key.k.p, b->level); + return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); } /* diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fa261a175f5e..baffb58fd10b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -473,8 +473,8 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, struct btree_node_entry *bne; bool did_sort; - EBUG_ON(!(b->lock.state.seq & 1)); - EBUG_ON(iter && iter->l[b->level].b != b); + EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(iter && iter->l[b->c.level].b != b); did_sort = btree_node_compact(c, b, iter); @@ -524,8 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, "at btree %u level %u/%u\n" "pos %llu:%llu node offset %u", write ? "before write " : "", - b->btree_id, b->level, - c->btree_roots[b->btree_id].level, + b->c.btree_id, b->c.level, + c->btree_roots[b->c.btree_id].level, b->key.k.p.inode, b->key.k.p.offset, b->written); if (i) @@ -610,11 +610,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (i == &b->data->keys) { /* These indicate that we read the wrong btree node: */ - btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, + btree_err_on(BTREE_NODE_ID(b->data) != b->c.btree_id, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect btree id"); - btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level, + btree_err_on(BTREE_NODE_LEVEL(b->data) != b->c.level, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect level"); @@ -1105,8 +1105,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, bch2_btree_set_root_for_read(c, b); err: - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return ret; } @@ -1153,15 +1153,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c, bch2_trans_init(&trans, c); - iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->level, 0); + iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, 0); retry: ret = bch2_btree_iter_traverse(iter); if (ret) goto err; /* has node been freed? 
*/ - if (iter->l[b->level].b != b) { + if (iter->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -1359,9 +1359,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * doing btree writes: */ if (lock_type_held == SIX_LOCK_intent && - six_trylock_write(&b->lock)) { + six_trylock_write(&b->c.lock)) { __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); } else { __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); } @@ -1606,18 +1606,18 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(lock_type_held == SIX_LOCK_write); if (lock_type_held == SIX_LOCK_intent || - six_lock_tryupgrade(&b->lock)) { + six_lock_tryupgrade(&b->c.lock)) { __bch2_btree_node_write(c, b, SIX_LOCK_intent); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && - six_trylock_write(&b->lock)) { + six_trylock_write(&b->c.lock)) { bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); } if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->lock); + six_lock_downgrade(&b->c.lock); } else { __bch2_btree_node_write(c, b, SIX_LOCK_read); } @@ -1688,7 +1688,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) b, (flags & (1 << BTREE_NODE_dirty)) != 0, (flags & (1 << BTREE_NODE_need_write)) != 0, - b->level, + b->c.level, b->written, !list_empty_careful(&b->write_blocked), b->will_make_reachable != 0, diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index c817aeed878a..3fb0aa20b340 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -111,7 +111,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) break; } - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); btree_node_wait_on_io(b); btree_node_lock_type(c, b, SIX_LOCK_read); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eeb9a59283a7..3fdf5ab25578 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -54,7 +54,7 @@ static inline int btree_iter_pos_cmp(struct btree_iter *iter, const struct btree *b, const struct bkey_packed *k) { - return __btree_iter_pos_cmp(iter, b, k, b->level != 0); + return __btree_iter_pos_cmp(iter, b, k, b->c.level != 0); } /* Btree node locking: */ @@ -67,13 +67,13 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) { struct btree_iter *linked; - EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); + EBUG_ON(iter->l[b->c.level].b != b); + EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); trans_for_each_iter_with_node(iter->trans, b, linked) - linked->l[b->level].lock_seq += 2; + linked->l[b->c.level].lock_seq += 2; - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); } void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) @@ -81,11 +81,11 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(btree_node_read_locked(iter, b->level)); + EBUG_ON(btree_node_read_locked(iter, b->c.level)); trans_for_each_iter(iter->trans, linked) - if (linked->l[b->level].b == b && - btree_node_read_locked(linked, b->level)) + if (linked->l[b->c.level].b == b && + btree_node_read_locked(linked, b->c.level)) readers++; /* @@ -95,10 +95,10 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) * locked: */ 
atomic64_sub(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); + &b->c.lock.state.counter); btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); atomic64_add(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); + &b->c.lock.state.counter); } bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) @@ -112,8 +112,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (race_fault()) return false; - if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) && - !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && + if (!six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) && + !(iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, want))) return false; @@ -137,11 +137,11 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) return false; if (btree_node_locked(iter, level) - ? six_lock_tryupgrade(&b->lock) - : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) + ? six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && + if (iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; @@ -378,7 +378,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, btree_node_unlock(linked, l); } else { if (btree_node_intent_locked(linked, l)) { - six_lock_downgrade(&linked->l[l].b->lock); + six_lock_downgrade(&linked->l[l].b->c.lock); linked->nodes_intent_locked ^= 1 << l; } break; @@ -427,7 +427,7 @@ void bch2_btree_trans_unlock(struct btree_trans *trans) static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { - struct btree_iter_level *l = &iter->l[b->level]; + struct btree_iter_level *l = &iter->l[b->c.level]; struct btree_node_iter tmp = l->iter; struct bkey_packed *k; @@ -446,7 +446,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS + k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && btree_iter_pos_cmp(iter, b, k) > 0) { @@ -519,7 +519,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bch2_btree_node_iter_push(node_iter, b, where, end); - if (!b->level && + if (!b->c.level && node_iter == &iter->l[0].iter) bkey_disassemble(b, bch2_btree_node_iter_peek_all(node_iter, b), @@ -548,7 +548,7 @@ found: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) { + if (!b->c.level && node_iter == &iter->l[0].iter) { /* * not legal to call bkey_debugcheck() here, because we're * called midway through the update path after update has been @@ -590,7 +590,7 @@ iter_current_key_not_modified: * always point to the key for the child node the btree iterator points * to. 
*/ - if (b->level && new_u64s && + if (b->c.level && new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where); struct bkey_packed *k; @@ -633,13 +633,13 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_iter *linked; - if (node_iter != &iter->l[b->level].iter) + if (node_iter != &iter->l[b->c.level].iter) __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); trans_for_each_iter_with_node(iter->trans, b, linked) __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->level].iter, t, + &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); } @@ -715,7 +715,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; - plevel = b->level + 1; + plevel = b->c.level + 1; if (!btree_iter_node(iter, plevel)) return; @@ -738,7 +738,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) } if (!parent_locked) - btree_node_unlock(iter, b->level + 1); + btree_node_unlock(iter, b->c.level + 1); } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, @@ -751,7 +751,7 @@ static inline bool btree_iter_pos_after_node(struct btree_iter *iter, static inline bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b) { - return iter->btree_id == b->btree_id && + return iter->btree_id == b->c.btree_id && bkey_cmp(iter->pos, b->data->min_key) >= 0 && !btree_iter_pos_after_node(iter, b); } @@ -779,11 +779,11 @@ static inline void btree_iter_node_set(struct btree_iter *iter, btree_iter_verify_new_node(iter, b); EBUG_ON(!btree_iter_pos_in_node(iter, b)); - EBUG_ON(b->lock.state.seq & 1); + EBUG_ON(b->c.lock.state.seq & 1); - iter->l[b->level].lock_seq = b->lock.state.seq; - iter->l[b->level].b = b; - __btree_iter_init(iter, b->level); + iter->l[b->c.level].lock_seq = b->c.lock.state.seq; + iter->l[b->c.level].b = b; + __btree_iter_init(iter, b->c.level); } /* @@ -802,24 +802,24 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) * the old node we're replacing has already been * unlocked and the pointer invalidated */ - BUG_ON(btree_node_locked(linked, b->level)); + BUG_ON(btree_node_locked(linked, b->c.level)); - t = btree_lock_want(linked, b->level); + t = btree_lock_want(linked, b->c.level); if (t != BTREE_NODE_UNLOCKED) { - six_lock_increment(&b->lock, (enum six_lock_type) t); - mark_btree_node_locked(linked, b->level, (enum six_lock_type) t); + six_lock_increment(&b->c.lock, (enum six_lock_type) t); + mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t); } btree_iter_node_set(linked, b); } - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - unsigned level = b->level; + unsigned level = b->c.level; trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { @@ -837,7 +837,7 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; trans_for_each_iter_with_node(iter->trans, b, linked) - __btree_iter_init(linked, b->level); + __btree_iter_init(linked, b->c.level); } static inline int btree_iter_lock_root(struct btree_iter *iter, @@ -852,7 +852,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, while (1) { b = READ_ONCE(c->btree_roots[iter->btree_id].b); - iter->level = 
READ_ONCE(b->level); + iter->level = READ_ONCE(b->c.level); if (unlikely(iter->level < depth_want)) { /* @@ -872,7 +872,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, return -EINTR; if (likely(b == c->btree_roots[iter->btree_id].b && - b->level == iter->level && + b->c.level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) iter->l[i].b = BTREE_ITER_NOT_END; @@ -884,7 +884,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, } - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); } } @@ -1842,7 +1842,7 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(iter, i)) - six_lock_increment(&iter->l[i].b->lock, + six_lock_increment(&iter->l[i].b->c.lock, __btree_lock_want(iter, i)); return &trans->iters[idx]; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dc15d1b831a8..171e729ed3ea 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -17,10 +17,23 @@ static inline struct btree *btree_iter_node(struct btree_iter *iter, return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; } +static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, + const struct btree *b, unsigned level) +{ + /* + * We don't compare the low bits of the lock sequence numbers because + * @iter might have taken a write lock on @b, and we don't want to skip + * the linked iterator if the sequence numbers were equal before taking + * that write lock. The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ + return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; +} + static inline struct btree *btree_node_parent(struct btree_iter *iter, struct btree *b) { - return btree_iter_node(iter, b->level + 1); + return btree_iter_node(iter, b->c.level + 1); } static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) @@ -55,16 +68,8 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @iter might have taken a write lock on @b, and we don't want to skip - * the linked iterator if the sequence numbers were equal before taking - * that write lock. 
The lock sequence number is incremented by taking - * and releasing write locks and is even when unlocked: - */ - - return iter->l[b->level].b == b && - iter->l[b->level].lock_seq >> 1 == b->lock.state.seq >> 1; + return iter->l[b->c.level].b == b && + btree_node_lock_seq_matches(iter, b, b->c.level); } static inline struct btree_iter * diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 37e09474fde4..e75e56c34f5f 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -101,7 +101,7 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->l[level].b->lock, lock_type); + six_unlock_type(&iter->l[level].b->c.lock, lock_type); mark_btree_node_unlocked(iter, level); } @@ -142,14 +142,14 @@ static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, { u64 start_time = local_clock(); - six_lock_type(&b->lock, type, NULL, NULL); + six_lock_type(&b->c.lock, type, NULL, NULL); bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); } static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, enum six_lock_type type) { - if (!six_trylock_type(&b->lock, type)) + if (!six_trylock_type(&b->c.lock, type)) __btree_node_lock_type(c, b, type); } @@ -167,7 +167,7 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, if (linked != iter && linked->l[level].b == b && btree_node_locked_type(linked, level) >= want) { - six_lock_increment(&b->lock, want); + six_lock_increment(&b->c.lock, want); return true; } @@ -185,7 +185,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, { EBUG_ON(level >= BTREE_MAX_DEPTH); - return likely(six_trylock_type(&b->lock, type)) || + return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(iter, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, may_drop_locks); @@ -210,10 +210,10 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { - EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(iter->l[b->c.level].b != b); + EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); - if (!six_trylock_write(&b->lock)) + if (!six_trylock_write(&b->c.lock)) __bch2_btree_node_lock_write(b, iter); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index dd4fa2f595ec..7bd3adcd4b52 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -60,19 +60,22 @@ struct btree_alloc { BKEY_PADDED(k); }; +struct btree_bkey_cached_common { + struct six_lock lock; + u8 level; + u8 btree_id; +}; + struct btree { - /* Hottest entries first */ + struct btree_bkey_cached_common c; + struct rhash_head hash; /* Key/pointer for this btree node */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - struct six_lock lock; - unsigned long flags; u16 written; - u8 level; - u8 btree_id; u8 nsets; u8 nr_key_bits; @@ -451,7 +454,7 @@ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_ /* Type of keys @b contains: */ static inline enum btree_node_type btree_node_type(struct btree *b) { - return __btree_node_type(b->level, b->btree_id); + return __btree_node_type(b->c.level, b->c.btree_id); } static inline bool btree_node_type_is_extents(enum btree_node_type type) diff --git 
a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cc0cd465b863..73675af8743a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -33,7 +33,7 @@ static void btree_node_interior_verify(struct btree *b) struct btree_node_iter iter; struct bkey_packed *k; - BUG_ON(!b->level); + BUG_ON(!b->c.level); bch2_btree_node_iter_init(&iter, b, &b->key.k.p); #if 1 @@ -229,7 +229,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); bch2_open_buckets_put(c, &ob); } @@ -240,7 +240,7 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, struct btree_iter *linked; trans_for_each_iter(iter->trans, linked) - BUG_ON(linked->l[b->level].b == b); + BUG_ON(linked->l[b->c.level].b == b); /* * Is this a node that isn't reachable on disk yet? @@ -253,10 +253,10 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, */ btree_update_drop_new_node(c, b); - six_lock_write(&b->lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); __btree_node_free(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); } static void bch2_btree_node_free_ondisk(struct bch_fs *c, @@ -387,7 +387,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, { struct btree *n; - n = bch2_btree_node_alloc(as, b->level); + n = bch2_btree_node_alloc(as, b->c.level); n->data->min_key = b->data->min_key; n->data->max_key = b->data->max_key; @@ -431,7 +431,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); return b; } @@ -445,7 +445,7 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser while (reserve->nr) { struct btree *b = reserve->b[--reserve->nr]; - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -461,9 +461,9 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); } mutex_unlock(&c->btree_reserve_cache_lock); @@ -586,7 +586,7 @@ static void btree_update_nodes_reachable(struct closure *cl) */ btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); mutex_lock(&c->btree_interior_update_lock); } @@ -641,10 +641,10 @@ retry: /* The usual case: */ b = READ_ONCE(as->b); - if (!six_trylock_read(&b->lock)) { + if (!six_trylock_read(&b->c.lock)) { mutex_unlock(&c->btree_interior_update_lock); btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); goto retry; } @@ -665,7 +665,7 @@ retry: * write it now if it needs to be written: */ bch2_btree_node_write_cond(c, b, true); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); break; case BTREE_INTERIOR_UPDATING_AS: @@ -688,15 +688,15 @@ retry: /* b is the new btree root: */ b = READ_ONCE(as->b); - if (!six_trylock_read(&b->lock)) { + if (!six_trylock_read(&b->c.lock)) { 
mutex_unlock(&c->btree_interior_update_lock); btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); goto retry; } - BUG_ON(c->btree_roots[b->btree_id].as != as); - c->btree_roots[b->btree_id].as = NULL; + BUG_ON(c->btree_roots[b->c.btree_id].as != as); + c->btree_roots[b->c.btree_id].as = NULL; bch2_btree_set_root_ondisk(c, b, WRITE); @@ -707,7 +707,7 @@ retry: * have the pointer to the new root, and before the allocator * can reuse the old nodes it'll have to do a journal commit: */ - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); mutex_unlock(&c->btree_interior_update_lock); /* @@ -908,8 +908,8 @@ static void btree_interior_update_add_node_reference(struct btree_update *as, d = &as->pending[as->nr_pending++]; d->index_update_done = false; d->seq = b->data->keys.seq; - d->btree_id = b->btree_id; - d->level = b->level; + d->btree_id = b->c.btree_id; + d->level = b->c.level; bkey_copy(&d->key, &b->key); mutex_unlock(&c->btree_interior_update_lock); @@ -1053,7 +1053,7 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && - (b->level < btree_node_root(c, b)->level || + (b->c.level < btree_node_root(c, b)->c.level || !btree_node_dying(btree_node_root(c, b)))); btree_node_root(c, b) = b; @@ -1076,7 +1076,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, &fs_usage->u, 0, 0); - if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, NULL, 0, BCH_BUCKET_MARK_GC); @@ -1094,13 +1094,13 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) { - struct btree_root *r = &c->btree_roots[b->btree_id]; + struct btree_root *r = &c->btree_roots[b->c.btree_id]; mutex_lock(&c->btree_root_lock); BUG_ON(b != r->b); bkey_copy(&r->key, &b->key); - r->level = b->level; + r->level = b->c.level; r->alive = true; if (rw == WRITE) c->btree_roots_dirty = true; @@ -1214,7 +1214,7 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bset *set1, *set2; struct bkey_packed *k, *prev = NULL; - n2 = bch2_btree_node_alloc(as, n1->level); + n2 = bch2_btree_node_alloc(as, n1->c.level); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as, n1->key.k.p = bkey_unpack_pos(n1, prev); n1->data->max_key = n1->key.k.p; n2->data->min_key = - btree_type_successor(n1->btree_id, n1->key.k.p); + btree_type_successor(n1->c.btree_id, n1->key.k.p); set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); @@ -1282,7 +1282,7 @@ static struct btree *__btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n1); bch2_verify_btree_nr_keys(n2); - if (n1->level) { + if (n1->c.level) { btree_node_interior_verify(n1); btree_node_interior_verify(n2); } @@ -1359,7 +1359,7 @@ static void btree_split(struct btree_update *as, struct btree *b, u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); 
bch2_btree_interior_update_will_free_node(as, b); @@ -1375,8 +1375,8 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); - six_unlock_write(&n2->lock); - six_unlock_write(&n1->lock); + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); bch2_btree_node_write(c, n2, SIX_LOCK_intent); @@ -1390,7 +1390,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (!parent) { /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, b->level + 1); + n3 = __btree_root_alloc(as, b->c.level + 1); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1403,7 +1403,7 @@ static void btree_split(struct btree_update *as, struct btree *b, trace_btree_compact(c, b); bch2_btree_build_aux_trees(n1); - six_unlock_write(&n1->lock); + six_unlock_write(&n1->c.lock); bch2_keylist_add(&as->parent_keys, &n1->key); } @@ -1430,7 +1430,7 @@ static void btree_split(struct btree_update *as, struct btree *b, /* Successful split, update the iterator to point to the new nodes: */ - six_lock_increment(&b->lock, SIX_LOCK_intent); + six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); if (n3) bch2_btree_iter_node_replace(iter, n3); @@ -1456,7 +1456,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, struct bkey_packed *k; /* Don't screw up @iter's position: */ - node_iter = iter->l[b->level].iter; + node_iter = iter->l[b->c.level].iter; /* * btree_split(), btree_gc_coalesce() will insert keys before @@ -1477,7 +1477,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, btree_update_updated_node(as, b); trans_for_each_iter_with_node(iter->trans, b, linked) - bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); bch2_btree_iter_verify(iter, b); } @@ -1503,8 +1503,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - BUG_ON(!b->level); + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); @@ -1541,7 +1541,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, * the btree iterator yet, so the merge path's unlock/wait/relock dance * won't work: */ - bch2_foreground_maybe_merge(c, iter, b->level, + bch2_foreground_maybe_merge(c, iter, b->c.level, flags|BTREE_INSERT_NOUNLOCK); return; split: @@ -1686,7 +1686,7 @@ retry: b->sib_u64s[sib] = sib_u64s; if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->lock); + six_unlock_intent(&m->c.lock); goto out; } @@ -1716,7 +1716,7 @@ retry: bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, b->level); + n = bch2_btree_node_alloc(as, b->c.level); n->data->min_key = prev->data->min_key; n->data->max_key = next->data->max_key; @@ -1729,7 +1729,7 @@ retry: bch2_btree_sort_into(c, n, next); bch2_btree_build_aux_trees(n); - six_unlock_write(&n->lock); + six_unlock_write(&n->c.lock); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1742,7 +1742,7 @@ retry: bch2_open_buckets_put(c, &n->ob); - six_lock_increment(&b->lock, SIX_LOCK_intent); + six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); 
bch2_btree_iter_node_drop(iter, m); @@ -1773,7 +1773,7 @@ out: return; err_cycle_gc_lock: - six_unlock_intent(&m->lock); + six_unlock_intent(&m->c.lock); if (flags & BTREE_INSERT_NOUNLOCK) goto out; @@ -1786,7 +1786,7 @@ err_cycle_gc_lock: goto err; err_unlock: - six_unlock_intent(&m->lock); + six_unlock_intent(&m->c.lock); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); err: @@ -1828,7 +1828,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, n = bch2_btree_node_alloc_replacement(as, b); bch2_btree_build_aux_trees(n); - six_unlock_write(&n->lock); + six_unlock_write(&n->c.lock); trace_btree_gc_rewrite_node(c, b); @@ -1843,7 +1843,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_open_buckets_put(c, &n->ob); - six_lock_increment(&b->lock, SIX_LOCK_intent); + six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); @@ -1963,7 +1963,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, if (new_hash) { bkey_copy(&new_hash->key, &new_key->k_i); ret = bch2_btree_node_hash_insert(&c->btree_cache, - new_hash, b->level, b->btree_id); + new_hash, b->c.level, b->c.btree_id); BUG_ON(ret); } @@ -1996,7 +1996,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, &fs_usage->u, 0, 0); - if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, NULL, 0, BCH_BUCKET_MARK_GC); @@ -2110,8 +2110,8 @@ err: list_move(&new_hash->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - six_unlock_write(&new_hash->lock); - six_unlock_intent(&new_hash->lock); + six_unlock_write(&new_hash->c.lock); + six_unlock_intent(&new_hash->c.lock); } up_read(&c->gc_lock); closure_sync(&cl); @@ -2151,8 +2151,8 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); - b->level = 0; - b->btree_id = id; + b->c.level = 0; + b->c.btree_id = id; bkey_btree_ptr_init(&b->key); b->key.k.p = POS_MAX; @@ -2166,13 +2166,14 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, + b->c.level, b->c.btree_id); BUG_ON(ret); __bch2_btree_set_root_inmem(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); } ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e5156e908110..f9e092bf69aa 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -190,7 +190,7 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) { - unsigned depth = btree_node_root(c, b)->level + 1; + unsigned depth = btree_node_root(c, b)->c.level + 1; /* * Number of nodes we might have to allocate in a worst case btree @@ -198,9 +198,9 @@ static inline unsigned btree_update_reserve_required(struct bch_fs *c, * a new root, unless we're already at max depth: */ if (depth < 
BTREE_MAX_DEPTH) - return (depth - b->level) * 2 + 1; + return (depth - b->c.level) * 2 + 1; else - return (depth - b->level) * 2 - 1; + return (depth - b->c.level) * 2 - 1; } static inline void btree_node_reset_sib_u64s(struct btree *b) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3425ad6f68b2..5e13ad34ec42 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -155,7 +155,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); - six_unlock_read(&b->lock); + six_unlock_read(&b->c.lock); } static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) @@ -198,7 +198,7 @@ void bch2_btree_journal_key(struct btree_trans *trans, struct btree *b = iter->l[0].b; struct btree_write *w = btree_current_write(b); - EBUG_ON(iter->level || b->level); + EBUG_ON(iter->level || b->c.level); EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index bb69a2acd8dd..a11d7923ea5a 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -52,8 +52,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) bkey_copy(&v->key, &b->key); v->written = 0; - v->level = b->level; - v->btree_id = b->btree_id; + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2864a72938ce..22a378d5f64f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -144,8 +144,8 @@ DECLARE_EVENT_CLASS(btree_node, TP_fast_assign( memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->level = b->level; - __entry->id = b->btree_id; + __entry->level = b->c.level; + __entry->id = b->c.btree_id; __entry->inode = b->key.k.p.inode; __entry->offset = b->key.k.p.offset; ), @@ -262,7 +262,7 @@ TRACE_EVENT(btree_insert_key, ), TP_fast_assign( - __entry->id = b->btree_id; + __entry->id = b->c.btree_id; __entry->inode = k->k.p.inode; __entry->offset = k->k.p.offset; __entry->size = k->k.size; -- cgit From 932aa837453ada12342d89ea5e063993a928d4c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2019 14:59:58 -0400 Subject: bcachefs: bch2_trans_mark_update() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +- fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/btree_iter.c | 4 +- fs/bcachefs/btree_types.h | 4 + fs/bcachefs/btree_update.h | 13 +- fs/bcachefs/btree_update_leaf.c | 119 +++++++--- fs/bcachefs/buckets.c | 486 +++++++++++++++++++++++++++++++++++----- fs/bcachefs/buckets.h | 11 +- fs/bcachefs/buckets_types.h | 13 ++ fs/bcachefs/ec.c | 23 +- fs/bcachefs/extents.c | 45 +++- fs/bcachefs/migrate.c | 3 + fs/bcachefs/move.c | 2 + fs/bcachefs/recovery.c | 100 ++++++--- fs/bcachefs/replicas.c | 8 +- fs/bcachefs/replicas.h | 1 + fs/bcachefs/super-io.c | 3 +- 18 files changed, 702 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5988971521eb..82a68fabdc5f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -141,8 +141,8 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a) return ret; } -static void bch2_alloc_pack(struct 
bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +void bch2_alloc_pack(struct bkey_i_alloc *dst, + const struct bkey_alloc_unpacked src) { unsigned idx = 0; void *d = dst->v.data; @@ -962,7 +962,6 @@ retry: invalidating_cached_data = m.cached_sectors != 0; - //BUG_ON(u.dirty_sectors); u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; @@ -974,6 +973,7 @@ retry: * we have to trust the in memory bucket @m, not the version in the * btree: */ + //BUG_ON(u.dirty_sectors); u.gen = m.gen + 1; a = bkey_alloc_init(&alloc_key.k); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b75c56a5dae0..02354c80a102 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -14,6 +14,8 @@ struct bkey_alloc_unpacked { }; struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *); +void bch2_alloc_pack(struct bkey_i_alloc *, + const struct bkey_alloc_unpacked); #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7edc410c5391..8715a444f6d5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1300,6 +1300,7 @@ enum bch_sb_features { enum bch_sb_compat { BCH_COMPAT_FEAT_ALLOC_INFO = 0, + BCH_COMPAT_FEAT_ALLOC_METADATA = 1, }; /* options: */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3fdf5ab25578..afede9651024 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1005,7 +1005,7 @@ retry_all: goto retry_all; } - ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0; + ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0; out: bch2_btree_cache_cannibalize_unlock(c); return ret; @@ -1103,8 +1103,6 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); - BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans)); - return ret; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7bd3adcd4b52..ece4f30b3f85 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -6,6 +6,7 @@ #include #include "bkey_methods.h" +#include "buckets_types.h" #include "journal_types.h" #include "six.h" @@ -264,6 +265,7 @@ struct btree_insert_entry { }; bool deferred; + bool triggered; }; #define BTREE_ITER_MAX 64 @@ -302,6 +304,8 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; + + struct replicas_delta_list fs_usage_deltas; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7a638a76634f..4438a9992442 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -43,8 +43,11 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_NOMARK_INSERT, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, + __BTREE_INSERT_MARK_INMEM, + __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -77,12 +80,20 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) +/* Don't mark new key, just overwrites: */ +#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) + /* Don't mark overwrites, just new key: */ #define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) -/* Don't call bch2_mark_key: */ +/* Don't call mark new key at all: */ 
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) +/* Don't mark transactionally: */ +#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) + +#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5e13ad34ec42..b9b9accfb38c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -526,6 +526,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans, btree_insert_key_deferred(trans, insert); } +static inline bool update_triggers_transactional(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && + (i->iter->btree_id == BTREE_ID_EXTENTS || + i->iter->btree_id == BTREE_ID_INODES); +} + +static inline bool update_has_triggers(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + !i->deferred && + btree_node_type_needs_gc(i->iter->btree_id); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -538,29 +554,25 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_iter *linked; int ret; + if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) { + memset(&trans->fs_usage_deltas.fs_usage, 0, + sizeof(trans->fs_usage_deltas.fs_usage)); + trans->fs_usage_deltas.top = trans->fs_usage_deltas.d; + } + trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - btree_trans_lock_write(c, trans); - - if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) { - trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) - continue; - - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } - - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; - } + trans_for_each_update_iter(trans, i) + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i, + &trans->fs_usage_deltas); + if (ret) + return ret; } - } + + btree_trans_lock_write(c, trans); if (race_fault()) { ret = -EINTR; @@ -578,6 +590,23 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (ret) goto out; + trans_for_each_update_iter(trans, i) { + if (i->deferred || + !btree_node_type_needs_gc(i->iter->btree_id)) + continue; + + if (!fs_usage) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + } + + if (!bch2_bkey_replicas_marked_locked(c, + bkey_i_to_s_c(i->k), true)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto out; + } + } + /* * Don't get journal reservation until after we know insert will * succeed: @@ -606,20 +635,24 @@ static inline int do_btree_insert_at(struct btree_trans *trans, linked->flags |= BTREE_ITER_NOUNLOCK; } - if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) { - trans_for_each_update_iter(trans, i) + trans_for_each_update_iter(trans, i) + if (update_has_triggers(trans, i) && + !update_triggers_transactional(trans, i)) bch2_mark_update(trans, i, &fs_usage->u, 0); - if (fs_usage) - bch2_trans_fs_usage_apply(trans, fs_usage); - - if (unlikely(c->gc_pos.phase)) { - 
trans_for_each_update_iter(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - BCH_BUCKET_MARK_GC); - } + + if (fs_usage) { + bch2_replicas_delta_list_apply(c, &fs_usage->u, + &trans->fs_usage_deltas); + bch2_trans_fs_usage_apply(trans, fs_usage); } + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + unlikely(c->gc_pos.phase)) + trans_for_each_update_iter(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + BCH_BUCKET_MARK_GC); + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); out: @@ -646,6 +679,19 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; + struct btree_insert_entry *src, *dst; + + src = dst = trans->updates; + + while (src < trans->updates + trans->nr_updates) { + if (!src->triggered) { + *dst = *src; + dst++; + } + src++; + } + + trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -808,6 +854,7 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) @@ -885,8 +932,16 @@ out_noupdates: return ret; err: ret = bch2_trans_commit_error(trans, i, ret); - if (!ret) + + /* can't loop if it was passed in and we changed it: */ + if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) + ret = -EINTR; + + if (!ret) { + /* free memory used by triggers, they'll be reexecuted: */ + trans->mem_top = orig_mem_top; goto retry; + } goto out; } @@ -969,6 +1024,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); @@ -1014,5 +1070,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, } bch2_trans_exit(&trans); + BUG_ON(ret == -EINTR); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fb5461df3bbf..6d04474f0e3a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -653,19 +653,16 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, ca = bch_dev_bkey_exists(c, k.k->p.inode); g = __bucket(ca, k.k->p.offset, gc); - /* - * this should currently only be getting called from the bucket - * invalidate path: - */ - BUG_ON(u.dirty_sectors); - BUG_ON(u.cached_sectors); - BUG_ON(!g->mark.owned_by_allocator); - old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + + if (!(flags & BCH_BUCKET_MARK_GC)) { + m.journal_seq_valid = 1; + m.journal_seq = journal_seq; + } })); g->io_time[READ] = u.read_time; @@ -673,6 +670,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + if (old.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); @@ -762,11 +764,34 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, } } -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. 
- */ +static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + bool enabled, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + bool gc) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; + + BUG_ON(ptr_stale(ca, ptr)); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.dirty = true; + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + } +} + static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, @@ -776,8 +801,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - size_t b = PTR_BUCKET_NR(ca, &p.ptr); - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); bool overflow; u64 v; @@ -946,35 +970,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void bucket_set_stripe(struct bch_fs *c, - const struct bch_stripe *v, - bool enabled, - struct bch_fs_usage *fs_usage, - u64 journal_seq, - bool gc) -{ - unsigned i; - - for (i = 0; i < v->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark new, old; - - BUG_ON(ptr_stale(ca, ptr)); - - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ - new.dirty = true; - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); - } -} - static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, bool inserting, struct bch_fs_usage *fs_usage, @@ -1006,14 +1001,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, m->nr_blocks = s.v->nr_blocks; m->nr_redundant = s.v->nr_redundant; - memset(&m->r, 0, sizeof(m->r)); - - m->r.e.data_type = BCH_DATA_USER; - m->r.e.nr_devs = s.v->nr_blocks; - m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant; - - for (i = 0; i < s.v->nr_blocks; i++) - m->r.e.devs[i] = s.v->ptrs[i].dev; + bch2_bkey_to_replicas(&m->r.e, k); /* * XXX: account for stripes somehow here @@ -1180,10 +1168,11 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), - fs_usage, trans->journal_res.seq, flags); + if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), + fs_usage, trans->journal_res.seq, flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; @@ -1262,6 +1251,391 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } } +/* trans_mark: */ + +static inline void update_replicas_list(struct replicas_delta_list *d, + struct bch_replicas_entry *r, + s64 sectors) +{ + d->top->delta = sectors; + memcpy(&d->top->r, r, replicas_entry_bytes(r)); + + d->top = (void *) d->top + replicas_entry_bytes(r) + 8; + + BUG_ON((void *) d->top > (void *) d->d + sizeof(d->pad)); +} + +static inline void update_cached_sectors_list(struct 
replicas_delta_list *d, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas_list(d, &r.e, sectors); +} + +void bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + + acc_u64s((u64 *) fs_usage, + (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + + while (d != r->top) { + BUG_ON((void *) d > (void *) r->top); + + update_replicas(c, fs_usage, &d->r, d->delta); + + d = (void *) d + replicas_entry_bytes(&d->r) + 8; + } +} + +static int trans_get_key(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + struct btree_insert_entry **insert, + struct btree_iter **iter, + struct bkey_s_c *k) +{ + unsigned i; + int ret; + + *insert = NULL; + + for (i = 0; i < trans->nr_updates; i++) + if (!trans->updates[i].deferred && + trans->updates[i].iter->btree_id == btree_id && + !bkey_cmp(pos, trans->updates[i].iter->pos)) { + *insert = &trans->updates[i]; + *iter = (*insert)->iter; + *k = bkey_i_to_s_c((*insert)->k); + return 0; + } + + *iter = __bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0); + if (IS_ERR(*iter)) + return PTR_ERR(*iter); + + *k = bch2_btree_iter_peek_slot(*iter); + ret = bkey_err(*k); + if (ret) + bch2_trans_iter_put(trans, *iter); + return ret; +} + +static int trans_update_key(struct btree_trans *trans, + struct btree_insert_entry **insert, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned extra_u64s) +{ + struct bkey_i *new_k; + + if (*insert) + return 0; + + new_k = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + extra_u64s * sizeof(u64)); + if (IS_ERR(new_k)) + return PTR_ERR(new_k); + + *insert = bch2_trans_update(trans, ((struct btree_insert_entry) { + .iter = iter, + .k = new_k, + .triggered = true, + })); + + bkey_reassemble((*insert)->k, k); + return 0; +} + +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct btree_insert_entry *insert; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + bool overflow; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, + POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), + &insert, &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_alloc) { + bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", + p.ptr.dev, + PTR_BUCKET_NR(ca, &p.ptr)); + ret = -1; + goto out; + } + + u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + + if (gen_after(u.gen, p.ptr.gen)) { + ret = 1; + goto out; + } + + if (!p.ptr.cached) + overflow = checked_add(u.dirty_sectors, sectors); + else + overflow = checked_add(u.cached_sectors, sectors); + + u.data_type = u.dirty_sectors || u.cached_sectors + ? data_type : 0; + + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? 
u.dirty_sectors + : u.cached_sectors, sectors); + + ret = trans_update_key(trans, &insert, iter, k, 1); + if (ret) + goto out; + + a = bkey_alloc_init(insert->k); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_extent_stripe_ptr p, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bch_replicas_padded r; + struct btree_insert_entry *insert; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_stripe s; + unsigned nr_data; + s64 parity_sectors; + int ret = 0; + + BUG_ON(!sectors); + + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), + &insert, &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_stripe) { + bch_err_ratelimited(trans->c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -1; + goto out; + } + + ret = trans_update_key(trans, &insert, iter, k, 1); + if (ret) + goto out; + + s = bkey_i_to_s_stripe(insert->k); + + nr_data = s.v->nr_blocks - s.v->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + + stripe_blockcount_set(s.v, p.block, + stripe_blockcount_get(s.v, p.block) + + sectors + parity_sectors); + + bch2_bkey_to_replicas(&r.e, s.s_c); + + update_replicas_list(d, &r.e, sectors); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_extent(struct btree_trans *trans, + struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; + bool stale; + unsigned i; + int ret; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? sectors + : ptr_disk_sectors_delta(p, sectors); + + ret = bch2_trans_mark_pointer(trans, p, disk_sectors, + data_type, d); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (disk_sectors && !stale) + update_cached_sectors_list(d, p.ptr.dev, + disk_sectors); + } else if (!p.ec_nr) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], + disk_sectors, data_type, d); + if (ret) + return ret; + } + + r.e.nr_required = 0; + } + } + + if (dirty_sectors) + update_replicas_list(d, &r.e, dirty_sectors); + + return 0; +} + +int bch2_trans_mark_key(struct btree_trans *trans, + struct bkey_s_c k, + bool inserting, s64 sectors, + struct replicas_delta_list *d) +{ + struct bch_fs *c = trans->c; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + return bch2_trans_mark_extent(trans, k, inserting + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, d); + case KEY_TYPE_extent: + return bch2_trans_mark_extent(trans, k, + sectors, BCH_DATA_USER, d); + case KEY_TYPE_inode: + if (inserting) + d->fs_usage.nr_inodes++; + else + d->fs_usage.nr_inodes--; + return 0; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->fs_usage.persistent_reserved)); + + d->fs_usage.reserved += sectors; + d->fs_usage.persistent_reserved[replicas - 1] += sectors; + return 0; + } + default: + return 0; + } +} + +int bch2_trans_mark_update(struct btree_trans *trans, + struct btree_insert_entry *insert, + struct replicas_delta_list *d) +{ + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + int ret; + + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(insert->k), true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), d); + if (ret) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k; + s64 sectors = 0; + + k = bkey_disassemble(b, _k, &unpacked); + + if (btree_node_is_extents(b) + ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k->k.p, k.k->p)) + break; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&insert->k->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) k.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(k.k) - + insert->k->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = k.k->p.offset - insert->k->k.p.offset; + BUG_ON(sectors <= 0); + + ret = bch2_trans_mark_key(trans, k, true, + sectors, d); + if (ret) + return ret; + + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + } + + BUG_ON(sectors >= 0); + } + + ret = bch2_trans_mark_key(trans, k, false, sectors, d); + if (ret) + return ret; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return 0; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 86431cffb660..578019089a91 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -100,7 +100,7 @@ static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, struct bucket_mark m; rcu_read_lock(); - m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark); + m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); rcu_read_unlock(); return m; @@ -266,6 +266,15 @@ int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bch_fs_usage *, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); + +void bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + bool, s64, struct replicas_delta_list *); +int bch2_trans_mark_update(struct btree_trans *, + struct btree_insert_entry *, + struct replicas_delta_list *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); /* disk reservations: */ diff --git a/fs/bcachefs/buckets_types.h 
b/fs/bcachefs/buckets_types.h index 8e47b273360c..ec4294d41518 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -93,6 +93,19 @@ struct bch_fs_usage_short { u64 nr_inodes; }; +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + struct bch_fs_usage fs_usage; + + struct replicas_delta *top; + struct replicas_delta d[0]; + u8 pad[256]; +}; + /* * A reservation for space on disk: */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1eacd9665c7d..6761b5c24a12 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -539,14 +539,17 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, struct btree_iter *iter) { size_t idx = iter->pos.offset; + int ret = 0; if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) - return 0; + return ret; bch2_btree_trans_unlock(iter->trans); + ret = -EINTR; if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) - return -EINTR; + return ret; + return -ENOMEM; } @@ -692,23 +695,22 @@ retry: if (!ret) ret = -ENOSPC; - goto out; + goto err; found_slot: ret = ec_stripe_mem_alloc(c, iter); - - if (ret == -EINTR) - goto retry; if (ret) - return ret; + goto err; stripe->k.p = iter->pos; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); -out: + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -745,6 +747,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, int ret = 0, dev, idx; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a975f8f72da4..2ebde20c74f8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -903,15 +903,54 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_verify(iter, l->b); } +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + return ret; +} + static inline struct bpos -bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter) +bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) { struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + unsigned nr_alloc_ptrs = + bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + break; + + nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); + + if (nr_alloc_ptrs > 20) { + BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); + return bpos_min(insert->k.p, k.k->p); + } + + bch2_btree_node_iter_advance(&node_iter, b); + } - return bpos_min(k->k.p, b->key.k.p); + return bpos_min(insert->k.p, b->key.k.p); } void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter 
*iter) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 98202fbabfaf..f9e6c9d9ef04 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -43,6 +43,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); @@ -96,6 +97,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } + BUG_ON(ret == -EINTR); + bch2_trans_exit(&trans); bch2_replicas_gc_end(c, ret); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1ad585ee27ca..ff426a2c8e7a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -62,6 +62,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -184,6 +185,7 @@ nomatch: } out: bch2_trans_exit(&trans); + BUG_ON(ret == -EINTR); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a3f07565efb0..a80de5d814d6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -212,11 +212,6 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) bch2_disk_reservation_init(c, 0); struct bkey_i *split; bool split_compressed = false; - unsigned flags = BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK; int ret; bch2_trans_init(&trans, c); @@ -252,9 +247,6 @@ retry: BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - flags &= ~BTREE_INSERT_JOURNAL_REPLAY; - flags &= ~BTREE_INSERT_NOMARK; - flags |= BTREE_INSERT_NOMARK_OVERWRITES; split_compressed = true; } @@ -266,24 +258,31 @@ retry: bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); - ret = bch2_trans_commit(&trans, &disk_res, NULL, flags); - if (ret) - goto err; - if (split_compressed) { - /* - * This isn't strictly correct - we should only be relying on - * the btree node lock for synchronization with gc when we've - * got a write lock held. 
- * - * but - there are other correctness issues if btree gc were to - * run before journal replay finishes - */ - BUG_ON(c->gc_pos.phase); - - bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - NULL, 0, 0); + memset(&trans.fs_usage_deltas.fs_usage, 0, + sizeof(trans.fs_usage_deltas.fs_usage)); + trans.fs_usage_deltas.top = trans.fs_usage_deltas.d; + + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false, + -((s64) k->k.size), + &trans.fs_usage_deltas) ?: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOMARK_OVERWRITES| + BTREE_INSERT_NO_CLEAR_REPLICAS); + } else { + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); } + + if (ret) + goto err; err: if (ret == -EINTR) goto retry; @@ -527,7 +526,7 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!clean || !j) + if (!c->sb.clean || !j) return 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -653,6 +652,7 @@ int bch2_fs_recovery(struct bch_fs *c) u64 journal_seq; LIST_HEAD(journal_entries); struct journal_keys journal_keys = { NULL }; + bool wrote = false, write_sb = false; int ret; if (c->sb.clean) @@ -677,8 +677,12 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, - "filesystem marked clean but journal not empty"); + if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } if (!c->sb.clean && list_empty(&journal_entries)) { bch_err(c, "no journal entries found"); @@ -736,12 +740,15 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; ret = bch2_alloc_read(c, &journal_keys); if (ret) goto err; + bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); + err = "error reading stripes"; ret = bch2_stripes_read(c, &journal_keys); if (ret) goto err; @@ -749,11 +756,26 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { + /* + * interior btree node updates aren't consistent with the + * journal; after an unclean shutdown we have to walk all + * pointers to metadata: + */ + bch_verbose(c, "starting metadata mark and sweep:"); + err = "error in mark and sweep"; + ret = bch2_gc(c, NULL, true, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + } + if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_verbose(c, "starting mark and sweep:"); - err = "error in recovery"; + err = "error in mark and sweep"; ret = bch2_gc(c, &journal_keys, true, false); if (ret) goto err; @@ -780,6 +802,16 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "journal replay done"); + bch_verbose(c, "writing allocation info:"); + err = "error writing out alloc info"; + ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + if (ret) { + bch_err(c, "error writing 
alloc info"); + goto err; + } + bch_verbose(c, "alloc write done"); + if (c->opts.norecovery) goto out; @@ -802,13 +834,23 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + write_sb = true; + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + write_sb = true; } if (c->opts.fsck && !test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + write_sb = true; } + + if (write_sb) + bch2_write_super(c); mutex_unlock(&c->sb_lock); if (c->journal_seq_blacklist_table && @@ -821,7 +863,7 @@ out: return ret; err: fsck_err: - pr_err("Error in recovery: %s (%i)", err, ret); + bch_err(c, "Error in recovery: %s (%i)", err, ret); goto out; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cf13a628682f..2482dbbbad38 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -102,8 +102,8 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -static void bkey_to_replicas(struct bch_replicas_entry *e, - struct bkey_s_c k) +void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + struct bkey_s_c k) { e->nr_devs = 0; @@ -439,7 +439,7 @@ bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, return false; } - bkey_to_replicas(&search.e, k); + bch2_bkey_to_replicas(&search.e, k); return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); } @@ -472,7 +472,7 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return ret; } - bkey_to_replicas(&search.e, k); + bch2_bkey_to_replicas(&search.e, k); return bch2_mark_replicas(c, &search.e); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 0777e7056d55..1ceedb6231fd 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -28,6 +28,7 @@ int bch2_mark_replicas(struct bch_fs *, bool bch2_bkey_replicas_marked_locked(struct bch_fs *, struct bkey_s_c, bool); +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6e69a4f74ca0..b991238c5bd2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -946,7 +946,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1063,6 +1063,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -- cgit From 460651ee86b2d2e0cf1a70a15a07031213e27181 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 18:14:46 -0400 Subject: bcachefs: Various improvements to bch2_alloc_write() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 236 ++++++++++++++++------------------------- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/buckets.c | 8 +- fs/bcachefs/ec.c | 10 +- fs/bcachefs/journal_io.c | 2 - 
fs/bcachefs/super.c | 13 +-- 6 files changed, 109 insertions(+), 162 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 82a68fabdc5f..25c18b8cd3a6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -129,15 +129,21 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a) +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = a->gen }; - const void *d = a->data; - unsigned idx = 0; + struct bkey_alloc_unpacked ret = { .gen = 0 }; + + if (k.k->type == KEY_TYPE_alloc) { + const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; + const void *d = a->data; + unsigned idx = 0; + + ret.gen = a->gen; #define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS() #undef x + } return ret; } @@ -199,66 +205,18 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, get_alloc_field(a.v, &d, i)); } -static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) -{ - const void *d = a->data; - unsigned idx = 0, data_type, dirty_sectors, cached_sectors; - struct bucket_mark m; - - g->io_time[READ] = get_alloc_field(a, &d, idx++); - g->io_time[WRITE] = get_alloc_field(a, &d, idx++); - data_type = get_alloc_field(a, &d, idx++); - dirty_sectors = get_alloc_field(a, &d, idx++); - cached_sectors = get_alloc_field(a, &d, idx++); - g->oldest_gen = get_alloc_field(a, &d, idx++); - - bucket_cmpxchg(g, m, ({ - m.gen = a->gen; - m.data_type = data_type; - m.dirty_sectors = dirty_sectors; - m.cached_sectors = cached_sectors; - })); - - g->gen_valid = 1; -} - -static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, - struct bucket_mark m) +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bucket *g, struct bucket_mark m) { - unsigned idx = 0; - void *d = a->v.data; - - a->v.fields = 0; - a->v.gen = m.gen; - - d = a->v.data; - put_alloc_field(a, &d, idx++, g->io_time[READ]); - put_alloc_field(a, &d, idx++, g->io_time[WRITE]); - put_alloc_field(a, &d, idx++, m.data_type); - put_alloc_field(a, &d, idx++, m.dirty_sectors); - put_alloc_field(a, &d, idx++, m.cached_sectors); - put_alloc_field(a, &d, idx++, g->oldest_gen); - - set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); -} - -static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_dev *ca; - struct bkey_s_c_alloc a; - - if (k.k->type != KEY_TYPE_alloc) - return; - - a = bkey_s_c_to_alloc(k); - ca = bch_dev_bkey_exists(c, a.k->p.inode); - - if (a.k->p.offset >= ca->mi.nbuckets) - return; - - percpu_down_read(&c->mark_lock); - __alloc_read_key(bucket(ca, a.k->p.offset), a.v); - percpu_up_read(&c->mark_lock); + return (struct bkey_alloc_unpacked) { + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, + .dirty_sectors = m.dirty_sectors, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + }; } int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) @@ -274,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) - bch2_alloc_read_key(c, k); + bch2_mark_key(c, k, true, 0, NULL, 0, 0); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -284,7 +242,8 @@ int bch2_alloc_read(struct 
bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) - bch2_alloc_read_key(c, bkey_i_to_s_c(j->k)); + bch2_mark_key(c, bkey_i_to_s_c(j->k), + true, 0, NULL, 0, 0); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); @@ -352,81 +311,32 @@ err: return ret; } -static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, - size_t b, struct btree_iter *iter, - unsigned flags) -{ - struct bch_fs *c = trans->c; -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif - struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); - struct bucket *g; - struct bucket_mark m, new; - int ret; - - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - - a->k.p = POS(ca->dev_idx, b); - - bch2_btree_iter_set_pos(iter, a->k.p); - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - - if (!m.dirty) { - percpu_up_read(&c->mark_lock); - return 0; - } - - __alloc_write_key(a, g, m); - percpu_up_read(&c->mark_lock); - - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); - - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOMARK| - flags); - if (ret) - return ret; - - new = m; - new.dirty = false; - atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); - - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - - return 0; -} - int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct btree_trans trans; struct btree_iter *iter; struct bucket_array *buckets; struct bch_dev *ca; + struct bucket *g; + struct bucket_mark m, new; + struct bkey_alloc_unpacked old_u, new_u; + __BKEY_PADDED(k, 8) alloc_key; /* hack: */ + struct bkey_i_alloc *a; + struct bkey_s_c k; unsigned i; size_t b; int ret = 0; + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + bch2_trans_init(&trans, c); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_rw_member(ca, c, i) { -relock: down_read(&ca->bucket_lock); +restart: buckets = bucket_array(ca); for (b = buckets->first_bucket; @@ -435,27 +345,70 @@ relock: if (!buckets->b[b].mark.dirty) continue; + bch2_btree_iter_set_pos(iter, POS(i, b)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + old_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + + if (!m.dirty) + continue; + if ((flags & BTREE_INSERT_LAZY_RW) && percpu_ref_is_zero(&c->writes)) { up_read(&ca->bucket_lock); bch2_trans_unlock(&trans); ret = bch2_fs_read_write_early(c); + down_read(&ca->bucket_lock); + if (ret) - goto out; - goto relock; + goto err; + goto restart; } - ret = __bch2_alloc_write_key(&trans, ca, b, - iter, flags); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOMARK| + flags); +err: + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { + bch_err(c, "error %i writing alloc info", ret); + printk(KERN_CONT "dev %llu bucket %llu\n", + iter->pos.inode, iter->pos.offset); + printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen); +#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, 
new_u._name); + BCH_ALLOC_FIELDS() +#undef x + } if (ret) break; + new = m; + new.dirty = false; + atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); + + if (ca->buckets_written) + set_bit(b, ca->buckets_written); + bch2_trans_cond_resched(&trans); *wrote = true; } up_read(&ca->bucket_lock); -out: + if (ret) { percpu_ref_put(&ca->io_ref); break; @@ -922,6 +875,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_i_alloc *a; struct bkey_alloc_unpacked u; + struct bucket *g; struct bucket_mark m; struct bkey_s_c k; bool invalidating_cached_data; @@ -941,7 +895,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, BUG_ON(!fifo_push(&ca->free_inc, b)); bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - m = bucket(ca, b)->mark; spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); @@ -955,27 +908,26 @@ retry: if (ret) return ret; - if (k.k && k.k->type == KEY_TYPE_alloc) - u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); - else - memset(&u, 0, sizeof(u)); + /* + * The allocator has to start before journal replay is finished - thus, + * we have to trust the in memory bucket @m, not the version in the + * btree: + */ + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); invalidating_cached_data = m.cached_sectors != 0; + u.gen++; u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; u.read_time = c->bucket_clock[READ].hand; u.write_time = c->bucket_clock[WRITE].hand; - /* - * The allocator has to start before journal replay is finished - thus, - * we have to trust the in memory bucket @m, not the version in the - * btree: - */ - //BUG_ON(u.dirty_sectors); - u.gen = m.gen + 1; - a = bkey_alloc_init(&alloc_key.k); a->k.p = iter->pos; bch2_alloc_pack(a, u); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 02354c80a102..0c1a0f0dd2ab 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,7 +13,7 @@ struct bkey_alloc_unpacked { #undef x }; -struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *); +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6d04474f0e3a..2479ad37775b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -649,9 +649,13 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, if (flags & BCH_BUCKET_MARK_GC) return 0; - u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset >= ca->mi.nbuckets) + return 0; + g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ m.gen = u.gen; @@ -1381,7 +1385,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } - u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + u = bch2_alloc_unpack(k); if (gen_after(u.gen, p.ptr.gen)) { ret = 1; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6761b5c24a12..07245717ca4e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1234,11 +1234,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) return ret; } -static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) -{ - bch2_mark_key(c, k, true, 0, NULL, 0, 0); -} - int bch2_stripes_read(struct bch_fs *c, struct journal_keys 
*journal_keys) { struct journal_key *i; @@ -1254,7 +1249,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) - bch2_stripe_read_key(c, k); + bch2_mark_key(c, k, true, 0, NULL, 0, 0); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -1264,7 +1259,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, i) if (i->btree_id == BTREE_ID_EC) - bch2_stripe_read_key(c, bkey_i_to_s_c(i->k)); + bch2_mark_key(c, bkey_i_to_s_c(i->k), + true, 0, NULL, 0, 0); return 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4fd7b048050b..4e0c63f0076f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -947,7 +947,6 @@ out: return; err: bch2_fatal_error(c); - bch2_journal_halt(j); spin_lock(&j->lock); goto out; } @@ -1059,7 +1058,6 @@ void bch2_journal_write(struct closure *cl) spin_unlock(&j->lock); if (ret) { - bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 959638c986a0..8f25c1d9b8cb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -198,17 +198,14 @@ static void __bch2_fs_read_only(struct bch_fs *c) do { wrote = false; - ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - if (ret) { - bch2_fs_inconsistent(c, "error writing out stripes"); - break; - } + ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - ret = bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - if (ret) { + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + + if (ret) break; - } for_each_member_device(ca, c, i) bch2_dev_allocator_quiesce(c, ca); -- cgit From 619f5bee86b558e0dad91c3759b90652cd5f55d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 18:21:19 -0400 Subject: bcachefs: some improvements to startup messages and options Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/fs-ioctl.c | 3 +- fs/bcachefs/fs.c | 3 ++ fs/bcachefs/fsck.c | 115 +++++++++++++++---------------------------------- fs/bcachefs/fsck.h | 4 +- fs/bcachefs/opts.h | 7 +-- fs/bcachefs/recovery.c | 81 +++++++++++++++++++++++----------- fs/bcachefs/super.c | 85 +++++++++++++++++++++++++----------- fs/bcachefs/super.h | 2 +- 9 files changed, 163 insertions(+), 139 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2573376290bb..4d8331022648 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -158,7 +158,7 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) if (arg.flags || arg.pad) return -EINVAL; - return bch2_fs_start(c) ? 
-EIO : 0; + return bch2_fs_start(c); } static long bch2_ioctl_stop(struct bch_fs *c) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index b00d25b18ed4..4dca716217a6 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -267,7 +267,8 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) down_write(&sb->s_umount); sb->s_flags |= SB_RDONLY; - bch2_fs_emergency_read_only(c); + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only due to ioctl"); up_write(&sb->s_umount); return 0; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7ae1b7520351..aac59b8a15eb 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1834,12 +1834,15 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); if (IS_ERR(vinode)) { + bch_err(c, "error mounting: error getting root inode %i", + (int) PTR_ERR(vinode)); ret = PTR_ERR(vinode); goto err_put_super; } sb->s_root = d_make_root(vinode); if (!sb->s_root) { + bch_err(c, "error mounting: error allocating root dentry"); ret = -ENOMEM; goto err_put_super; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ade3446d8dc3..61569e4e1c77 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -499,8 +499,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret) { - bch_err(c, "error in fs gc: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error %i updating inode", ret); goto err; } @@ -1064,7 +1063,7 @@ static void inc_link(struct bch_fs *c, nlink_table *links, link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { - bch_verbose(c, "allocation failed during fs gc - will need another pass"); + bch_verbose(c, "allocation failed during fsck - will need another pass"); *range_end = inum; return; } @@ -1111,7 +1110,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, } ret = bch2_trans_exit(&trans) ?: ret; if (ret) - bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); return ret; } @@ -1252,8 +1251,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_rm(c, u.bi_inum); if (ret) - bch_err(c, "error in fs gc: error %i " - "while deleting inode", ret); + bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; } @@ -1270,8 +1268,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); if (ret) { - bch_err(c, "error in fs gc: error %i " - "truncating inode", ret); + bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; } @@ -1296,8 +1293,7 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum); if (sectors < 0) { - bch_err(c, "error in fs gc: error %i " - "recounting inode sectors", + bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); return sectors; } @@ -1317,7 +1313,7 @@ static int check_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret && ret != -EINTR) - bch_err(c, "error in fs gc: error %i " + bch_err(c, "error in fsck: error %i " "updating inode", ret); } fsck_err: @@ -1388,7 +1384,7 @@ fsck_err: bch2_trans_exit(&trans); if (ret2) - bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); + bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); return ret ?: ret2; } @@ -1429,101 +1425,60 @@ static int 
check_inode_nlinks(struct bch_fs *c, return ret; } -noinline_for_stack -static int check_inodes_fast(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_s_c_inode inode; - int ret; - - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_inode) - continue; - - inode = bkey_s_c_to_inode(k); - - if (inode.v->bi_flags & - (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED)) { - ret = check_inode(&trans, NULL, iter, inode, NULL); - BUG_ON(ret == -EINTR); - if (ret) - break; - } - } - BUG_ON(ret == -EINTR); - - return bch2_trans_exit(&trans) ?: ret; -} - /* * Checks for inconsistencies that shouldn't happen, unless we have a bug. * Doesn't fix them yet, mainly because they haven't yet been observed: */ -static int bch2_fsck_full(struct bch_fs *c) +int bch2_fsck_full(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - bch_verbose(c, "starting fsck:"); - ret = check_extents(c) ?: + return check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: check_root(c, &root_inode) ?: check_lostfound(c, &root_inode, &lostfound_inode) ?: check_directory_structure(c, &lostfound_inode) ?: check_inode_nlinks(c, &lostfound_inode); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "fsck done"); - - return ret; } -static int bch2_fsck_inode_nlink(struct bch_fs *c) +int bch2_fsck_inode_nlink(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - bch_verbose(c, "checking inode link counts:"); - ret = check_root(c, &root_inode) ?: + return check_root(c, &root_inode) ?: check_lostfound(c, &root_inode, &lostfound_inode) ?: check_inode_nlinks(c, &lostfound_inode); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "done"); - - return ret; } -static int bch2_fsck_walk_inodes_only(struct bch_fs *c) +int bch2_fsck_walk_inodes_only(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; int ret; - bch_verbose(c, "walking inodes:"); - ret = check_inodes_fast(c); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "done"); + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); - return ret; -} + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; -int bch2_fsck(struct bch_fs *c) -{ - if (c->opts.fsck) - return bch2_fsck_full(c); + inode = bkey_s_c_to_inode(k); - if (c->sb.clean) - return 0; + if (inode.v->bi_flags & + (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED)) { + ret = check_inode(&trans, NULL, iter, inode, NULL); + BUG_ON(ret == -EINTR); + if (ret) + break; + } + } + BUG_ON(ret == -EINTR); - return c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK) - ? 
bch2_fsck_walk_inodes_only(c) - : bch2_fsck_inode_nlink(c); + return bch2_trans_exit(&trans) ?: ret; } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 97460452e842..9e4af02bde1e 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -int bch2_fsck(struct bch_fs *); +int bch2_fsck_full(struct bch_fs *); +int bch2_fsck_inode_nlink(struct bch_fs *); +int bch2_fsck_walk_inodes_only(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 53bf06e70cd5..a69bd3718ac4 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -233,16 +233,11 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ - x(noreplay, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Don't replay the journal (only for internal tools)")\ x(norecovery, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ - NULL, NULL) \ + NULL, "Don't replay the journal") \ x(noexcl, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a80de5d814d6..3f0eda9f5d0c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -714,8 +714,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 4); + journal_seq, + journal_seq + 4); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; @@ -763,7 +763,7 @@ int bch2_fs_recovery(struct bch_fs *c) * journal; after an unclean shutdown we have to walk all * pointers to metadata: */ - bch_verbose(c, "starting metadata mark and sweep:"); + bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, NULL, true, true); if (ret) @@ -774,7 +774,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { - bch_verbose(c, "starting mark and sweep:"); + bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &journal_keys, true, false); if (ret) @@ -792,36 +792,63 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.encryption_type && !c->sb.clean) atomic64_add(1 << 16, &c->key_version); - if (c->opts.noreplay) + if (c->opts.norecovery) goto out; - bch_verbose(c, "starting journal replay:"); + bch_verbose(c, "starting journal replay"); err = "journal replay failed"; ret = bch2_journal_replay(c, journal_keys); if (ret) goto err; bch_verbose(c, "journal replay done"); - bch_verbose(c, "writing allocation info:"); - err = "error writing out alloc info"; - ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); - if (ret) { - bch_err(c, "error writing alloc info"); - goto err; + if (!c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated + * oldest_gen: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; + ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; + } + bch_verbose(c, "alloc write done"); } - bch_verbose(c, "alloc write done"); - if (c->opts.norecovery) - goto out; + if (!c->sb.clean) { + if (!(c->sb.features & 
(1 << BCH_FEATURE_ATOMIC_NLINK))) { + bch_info(c, "checking inode link counts"); + err = "error in recovery"; + ret = bch2_fsck_inode_nlink(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); - err = "error in fsck"; - ret = bch2_fsck(c); - if (ret) - goto err; + } else { + bch_verbose(c, "checking for deleted inodes"); + err = "error in recovery"; + ret = bch2_fsck_walk_inodes_only(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); + } + } + + if (c->opts.fsck) { + bch_info(c, "starting fsck"); + err = "error in fsck"; + ret = bch2_fsck_full(c); + if (ret) + goto err; + bch_verbose(c, "fsck done"); + } if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas:"); + bch_verbose(c, "reading quotas"); ret = bch2_fs_quota_read(c); if (ret) goto err; @@ -857,14 +884,18 @@ int bch2_fs_recovery(struct bch_fs *c) c->journal_seq_blacklist_table->nr > 128) queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); out: + ret = 0; +err: +fsck_err: + bch2_flush_fsck_errs(c); journal_keys_free(&journal_keys); journal_entries_free(&journal_entries); kfree(clean); + if (ret) + bch_err(c, "Error in recovery: %s (%i)", err, ret); + else + bch_verbose(c, "ret %i", ret); return ret; -err: -fsck_err: - bch_err(c, "Error in recovery: %s (%i)", err, ret); - goto out; } int bch2_fs_initialize(struct bch_fs *c) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8f25c1d9b8cb..654ccc611099 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -305,7 +305,6 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && - !c->opts.noreplay && !c->opts.norecovery) bch2_fs_mark_clean(c); @@ -379,9 +378,14 @@ int __bch2_fs_read_write(struct bch_fs *c, bool early) if (test_bit(BCH_FS_RW, &c->flags)) return 0; - if (c->opts.nochanges || - c->opts.noreplay) - return -EINVAL; + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ + if (c->opts.norecovery || + (c->opts.nochanges && + (!early || c->opts.read_only))) + return -EROFS; ret = bch2_fs_mark_dirty(c); if (ret) @@ -694,10 +698,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->block_bits = ilog2(c->opts.block_size); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - c->opts.nochanges |= c->opts.noreplay; - c->opts.read_only |= c->opts.nochanges; - c->opts.read_only |= c->opts.noreplay; - if (bch2_fs_init_fault("fs_alloc")) goto err; @@ -776,7 +776,41 @@ err: goto out; } -const char *bch2_fs_start(struct bch_fs *c) +noinline_for_stack +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; + char buf[512]; + struct printbuf p = PBUF(buf); + bool first = true; + + strcpy(buf, "(null)"); + + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; + } + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->mode & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + pr_buf(&p, ","); + first = false; + bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + bch_info(c, "mounted with opts: %s", buf); +} + +int bch2_fs_start(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -815,26 +849,27 @@ const char *bch2_fs_start(struct bch_fs *c) goto err; err = "dynamic fault"; + ret = 
-EINVAL; if (bch2_fs_init_fault("fs_start")) goto err; - if (c->opts.read_only) { + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - if (!test_bit(BCH_FS_RW, &c->flags) - ? bch2_fs_read_write(c) - : bch2_fs_read_write_late(c)) { - err = "error going read write"; + err = "error going read write"; + ret = !test_bit(BCH_FS_RW, &c->flags) + ? bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); + if (ret) goto err; - } } set_bit(BCH_FS_STARTED, &c->flags); - - err = NULL; + print_mount_opts(c); + ret = 0; out: mutex_unlock(&c->state_lock); - return err; + return ret; err: switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: @@ -862,7 +897,7 @@ err: break; } - BUG_ON(!err); + BUG_ON(!ret); goto out; } @@ -1789,9 +1824,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; if (!c->opts.nostart) { - err = bch2_fs_start(c); - if (err) - goto err_print; + ret = bch2_fs_start(c); + if (ret) + goto err; } out: kfree(sb); @@ -1818,6 +1853,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, const char *err; struct bch_fs *c; bool allocated_fs = false; + int ret; err = bch2_sb_validate(sb); if (err) @@ -1850,8 +1886,9 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, mutex_unlock(&c->sb_lock); if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = bch2_fs_start(c); - if (err) + err = "error starting filesystem"; + ret = bch2_fs_start(c); + if (ret) goto err; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 92ef3e7c8dc2..1b97c6115535 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -224,7 +224,7 @@ int bch2_fs_read_write_early(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); -const char *bch2_fs_start(struct bch_fs *); +int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); const char *bch2_fs_open_incremental(const char *path); -- cgit From 44e63bcaaaa2100677d1d95c3a0ebe370493f2f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Apr 2019 20:16:43 -0400 Subject: bcachefs: Don't run fsck by default at mount time Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index a69bd3718ac4..d2493d4111c6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -220,7 +220,7 @@ enum opt_type { x(fsck, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_MOUNT, \ -- cgit From 9516950c0690b6a8bef4cc1c7ae269da996973d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Apr 2019 17:47:49 -0400 Subject: bcachefs: Fix return code from bch2_fs_start() Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 654ccc611099..4c54ac64b0af 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -897,7 +897,8 @@ err: break; } - BUG_ON(!ret); + if (ret >= 0) + ret = -EIO; goto out; } -- cgit From ae0ff7b8b0ccf0643ea4d3968faabb2d5ee98c1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Apr 2019 17:15:39 -0400 Subject: bcachefs: Redo replicas gc mechanism Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 22 +++++------------ fs/bcachefs/move.c | 66 +++----------------------------------------------- fs/bcachefs/replicas.c | 58 
++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/replicas.h | 1 + 4 files changed, 69 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index f9e6c9d9ef04..6b17d7918aa4 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -48,10 +48,6 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); - - while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (!bkey_extent_is_data(k.k) || @@ -97,12 +93,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } - BUG_ON(ret == -EINTR); - - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); + BUG_ON(ret == -EINTR); return ret; } @@ -123,9 +116,6 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c); closure_init_stack(&cl); - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { @@ -178,10 +168,9 @@ retry: ret = 0; err: - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - ret = bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); + BUG_ON(ret == -EINTR); return ret; } @@ -189,5 +178,6 @@ err: int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + bch2_dev_metadata_drop(c, dev_idx, flags) ?: + bch2_replicas_gc2(c); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ff426a2c8e7a..64ac8244e1e0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -619,64 +619,6 @@ out: return ret; } -static int bch2_gc_data_replicas(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - int ret; - - bch2_trans_init(&trans, c); - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - ret = bch2_mark_bkey_replicas(c, k); - if (ret) - break; - } - ret = bch2_trans_exit(&trans) ?: ret; - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - -static int bch2_gc_btree_replicas(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; - unsigned id; - int ret = 0; - - bch2_trans_init(&trans, c); - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - - for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH, b) { - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - - bch2_trans_cond_resched(&trans); - } - - ret = bch2_trans_iter_free(&trans, iter) ?: ret; - } - - bch2_trans_exit(&trans); - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - static int bch2_move_btree(struct bch_fs *c, move_pred_fn pred, void *arg, @@ -803,14 +745,14 @@ int bch2_data_job(struct bch_fs *c, bch2_journal_meta(&c->journal); } - ret = bch2_gc_btree_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; ret = 
bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, rereplicate_pred, c, stats) ?: ret; - ret = bch2_gc_data_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: if (op.migrate.dev >= c->sb.nr_devices) @@ -820,14 +762,14 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; - ret = bch2_gc_btree_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, migrate_pred, &op, stats) ?: ret; - ret = bch2_gc_data_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 2482dbbbad38..7a9a7ec26c93 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -571,6 +571,64 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } +int bch2_replicas_gc2(struct bch_fs *c) +{ + struct bch_replicas_cpu new = { 0 }; + unsigned i, nr; + int ret = 0; + + bch2_journal_meta(&c->journal); +retry: + nr = READ_ONCE(c->replicas.nr); + new.entry_size = READ_ONCE(c->replicas.entry_size); + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); + if (!new.entries) + return -ENOMEM; + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + if (nr != c->replicas.nr || + new.entry_size != c->replicas.entry_size) { + percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sb_lock); + kfree(new.entries); + goto retry; + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_JOURNAL || + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } + + bch2_cpu_replicas_sort(&new); + + if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { + ret = -ENOSPC; + goto err; + } + + ret = replicas_table_update(c, &new); +err: + kfree(new.entries); + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + + return ret; +} + int bch2_replicas_set_usage(struct bch_fs *c, struct bch_replicas_entry *r, u64 sectors) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 1ceedb6231fd..0d6e19126021 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -59,6 +59,7 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); +int bch2_replicas_gc2(struct bch_fs *); int bch2_replicas_set_usage(struct bch_fs *, struct bch_replicas_entry *, -- cgit From 7cfac5f5064331b99928000907e9b4d646cdfc71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 May 2019 17:56:05 -0400 Subject: bcachefs: Fix for the stripes mark path and gc Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2479ad37775b..d0a288dddb45 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -994,7 +994,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, return -1; } - if (m->alive) + if (!gc && m->alive) bch2_stripes_heap_del(c, m, idx); memset(m, 0, sizeof(*m)); -- cgit From 
b7607ce98f286ed3d7181875f4b1f4a02663395c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 May 2019 16:09:17 -0400 Subject: bcachefs: Kill remaining bch2_btree_iter_unlock() uses Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 +----------- fs/bcachefs/btree_iter.h | 2 -- fs/bcachefs/btree_update_interior.c | 19 +++++++++++-------- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index afede9651024..fbf5f809e6ac 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -389,16 +389,6 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, bch2_btree_trans_verify_locks(iter->trans); } -int bch2_btree_iter_unlock(struct btree_iter *iter) -{ - struct btree_iter *linked; - - trans_for_each_iter(iter->trans, linked) - __bch2_btree_iter_unlock(linked); - - return btree_iter_err(iter); -} - bool bch2_btree_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; @@ -1041,7 +1031,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, * Returns 0 on success, -EIO on error (error reading in a btree node). * * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch2_btree_iter_unlock(). + * stashed in the iterator and returned from bch2_trans_exit(). */ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 171e729ed3ea..9b7dfee2da82 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -105,8 +105,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -int bch2_btree_iter_unlock(struct btree_iter *); - bool bch2_btree_trans_relock(struct btree_trans *); void bch2_btree_trans_unlock(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 73675af8743a..6d6b10502188 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1551,6 +1551,7 @@ split: int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_update *as; struct closure cl; @@ -1561,7 +1562,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (linked->btree_id == BTREE_ID_EXTENTS) flags |= BTREE_INSERT_USE_RESERVE; @@ -1573,10 +1574,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; - bch2_btree_trans_unlock(iter->trans); + bch2_btree_trans_unlock(trans); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) + if (!bch2_btree_trans_relock(trans)) ret = -EINTR; } @@ -1597,7 +1598,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ret = PTR_ERR(as); if (ret == -EAGAIN) { BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); ret = -EINTR; } goto out; @@ -1624,6 +1625,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, unsigned flags, enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree_update *as; struct bkey_format_state 
new_s; struct bkey_format new_f; @@ -1778,7 +1780,7 @@ err_cycle_gc_lock: if (flags & BTREE_INSERT_NOUNLOCK) goto out; - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); down_read(&c->gc_lock); up_read(&c->gc_lock); @@ -1794,7 +1796,7 @@ err: if ((ret == -EAGAIN || ret == -EINTR) && !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -1861,6 +1863,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, __le64 seq, unsigned flags) { + struct btree_trans *trans = iter->trans; struct closure cl; struct btree *b; int ret; @@ -1873,7 +1876,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); down_read(&c->gc_lock); } } @@ -1892,7 +1895,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ret != -EINTR) break; - bch2_btree_iter_unlock(iter); + bch2_btree_trans_unlock(trans); closure_sync(&cl); } -- cgit From b03b81dfd268938c6ed3b0ec79b4983523efb303 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 May 2019 17:09:42 -0400 Subject: bcachefs: Don't pass around may_drop_locks Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 41 +++++++++++++++++-------------------- fs/bcachefs/btree_cache.h | 5 ++--- fs/bcachefs/btree_iter.c | 11 +++++----- fs/bcachefs/btree_iter.h | 5 ++--- fs/bcachefs/btree_locking.h | 8 +++----- fs/bcachefs/btree_update_interior.c | 15 ++++++-------- fs/bcachefs/btree_update_leaf.c | 2 +- 7 files changed, 38 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2e932ee7ad0c..422186e67f3f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -654,8 +654,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - bool may_drop_locks) + enum six_lock_type lock_type) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -722,8 +721,7 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, - lock_type, may_drop_locks)) + if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) return ERR_PTR(-EINTR); if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || @@ -772,9 +770,9 @@ retry: struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree_iter *iter, struct btree *b, - bool may_drop_locks, enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; @@ -786,8 +784,10 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!parent) return NULL; - if (!bch2_btree_node_relock(iter, level + 1)) - goto out_upgrade; + if (!bch2_btree_node_relock(iter, level + 1)) { + ret = ERR_PTR(-EINTR); + goto out; + } node_iter = iter->l[parent->c.level].iter; @@ -804,19 +804,20 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent, may_drop_locks); + SIX_LOCK_intent); - if (PTR_ERR_OR_ZERO(ret) == 
-EINTR && may_drop_locks) { + if (PTR_ERR_OR_ZERO(ret) == -EINTR && + !(iter->flags & BTREE_ITER_NOUNLOCK)) { struct btree_iter *linked; if (!bch2_btree_node_relock(iter, level + 1)) - goto out_upgrade; + goto out; /* * We might have got -EINTR because trylock failed, and we're * holding other locks that would cause us to deadlock: */ - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (btree_iter_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); @@ -824,7 +825,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent, may_drop_locks); + SIX_LOCK_intent); /* * before btree_iter_relock() calls btree_iter_verify_locks(): @@ -841,17 +842,16 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, } } - bch2_btree_trans_relock(iter->trans); + bch2_btree_trans_relock(trans); } out: if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, level + 1); - bch2_btree_trans_verify_locks(iter->trans); + if (PTR_ERR_OR_ZERO(ret) == -EINTR) + bch2_btree_iter_upgrade(iter, level + 2); - BUG_ON((!may_drop_locks || !IS_ERR(ret)) && - (iter->uptodate >= BTREE_ITER_NEED_RELOCK || - !btree_node_locked(iter, level))); + BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); if (!IS_ERR_OR_NULL(ret)) { struct btree *n1 = ret, *n2 = b; @@ -864,12 +864,9 @@ out: n2->data->min_key)); } + bch2_btree_trans_verify_locks(trans); + return ret; -out_upgrade: - if (may_drop_locks) - bch2_btree_iter_upgrade(iter, level + 2, true); - ret = ERR_PTR(-EINTR); - goto out; } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index e0f233583796..4598a4c57aa1 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -23,11 +23,10 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type, bool); + enum six_lock_type); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, bool, - enum btree_node_sibling); + struct btree *, enum btree_node_sibling); void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fbf5f809e6ac..f6c2be149645 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -196,8 +196,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type, - bool may_drop_locks) + enum six_lock_type type) { struct btree_iter *linked; bool ret = true; @@ -225,7 +224,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (may_drop_locks) { + if (!(iter->flags & BTREE_ITER_NOUNLOCK)) { linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); @@ -241,7 +240,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - if (may_drop_locks) { + if (!(iter->flags & BTREE_ITER_NOUNLOCK)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, @@ -858,7 +857,7 @@ static inline int 
btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type, true))) + iter, lock_type))) return -EINTR; if (likely(b == c->btree_roots[iter->btree_id].b && @@ -922,7 +921,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9b7dfee2da82..b2c0b6816d1c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -112,13 +112,12 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, - unsigned new_locks_want, - bool may_drop_locks) + unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (may_drop_locks + ? (!(iter->flags & BTREE_ITER_NOUNLOCK) ? __bch2_btree_iter_upgrade(iter, new_locks_want) : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) : iter->uptodate <= BTREE_ITER_NEED_PEEK; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index e75e56c34f5f..33bc94a714c6 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -175,20 +175,18 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, } bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type, bool); + struct btree_iter *, enum six_lock_type); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type, - bool may_drop_locks) + enum six_lock_type type) { EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(iter, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, - type, may_drop_locks); + __bch2_btree_node_lock(b, pos, level, iter, type); } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6d6b10502188..e3595a8dce6a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1585,8 +1585,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX, - !(flags & BTREE_INSERT_NOUNLOCK))) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; goto out; } @@ -1649,8 +1648,7 @@ retry: goto out; /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, - !(flags & BTREE_INSERT_NOUNLOCK), sib); + m = bch2_btree_node_get_sibling(c, iter, b, sib); if (IS_ERR(m)) { ret = PTR_ERR(m); goto err; @@ -1697,8 +1695,7 @@ retry: !down_read_trylock(&c->gc_lock)) goto err_cycle_gc_lock; - if (!bch2_btree_iter_upgrade(iter, U8_MAX, - !(flags & BTREE_INSERT_NOUNLOCK))) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; goto err_unlock; } @@ -1760,7 +1757,7 @@ retry: if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); out: - bch2_btree_trans_verify_locks(iter->trans); + 
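A recurring shape in the interior-node update paths above: rather than block on the gc lock while btree node locks are held, the code drops the transaction's locks, takes the gc lock, then re-takes the btree locks and restarts with -EINTR if relocking fails. A stripped-down, self-contained sketch of the idea, with placeholder primitives standing in for bch2_btree_trans_unlock()/bch2_btree_trans_relock() and the gc lock:

#include <errno.h>
#include <stdbool.h>

/* placeholders, not the real bcachefs locking primitives */
static void trans_unlock(void) { }
static bool trans_relock(void) { return true; }
static void take_gc_lock(void) { }

static int lock_gc_then_btree(void)
{
	/*
	 * Don't block on the gc lock while still holding btree node locks;
	 * drop the transaction's locks first...
	 */
	trans_unlock();
	take_gc_lock();

	/*
	 * ...then re-take them; if they can't be re-acquired in the same
	 * state, return -EINTR so the caller restarts the whole operation.
	 */
	if (!trans_relock())
		return -EINTR;
	return 0;
}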
bch2_btree_trans_verify_locks(trans); /* * Don't downgrade locks here: we're called after successful insert, @@ -1872,7 +1869,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - bch2_btree_iter_upgrade(iter, U8_MAX, true); + bch2_btree_iter_upgrade(iter, U8_MAX); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { @@ -2044,7 +2041,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX, true)) + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) return -EINTR; if (!down_read_trylock(&c->gc_lock)) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b9b9accfb38c..a449f983a343 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -816,7 +816,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, unsigned old_locks_want = i->iter->locks_want; unsigned old_uptodate = i->iter->uptodate; - if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { + if (!bch2_btree_iter_upgrade(i->iter, 1)) { trans_restart(" (failed upgrade, locks_want %u uptodate %u)", old_locks_want, old_uptodate); ret = -EINTR; -- cgit From 60755344c6b18753c93b353f147c2e23b8b18de1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 May 2019 17:15:30 -0400 Subject: bcachefs: kill BTREE_ITER_NOUNLOCK Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +-- fs/bcachefs/btree_iter.c | 7 ++----- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_leaf.c | 21 +++++---------------- 6 files changed, 11 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 422186e67f3f..ece2c4d2ebd8 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -806,8 +806,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); - if (PTR_ERR_OR_ZERO(ret) == -EINTR && - !(iter->flags & BTREE_ITER_NOUNLOCK)) { + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { struct btree_iter *linked; if (!bch2_btree_node_relock(iter, level + 1)) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f6c2be149645..e18a88cd972e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -224,7 +224,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (!(iter->flags & BTREE_ITER_NOUNLOCK)) { + if (!(iter->trans->nounlock)) { linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); @@ -240,7 +240,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - if (!(iter->flags & BTREE_ITER_NOUNLOCK)) { + if (!(iter->trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, @@ -269,9 +269,6 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) && - !btree_node_locked(iter, 0)); - for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b2c0b6816d1c..ee5437813604 100644 --- a/fs/bcachefs/btree_iter.h +++ 
b/fs/bcachefs/btree_iter.h @@ -117,7 +117,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (!(iter->flags & BTREE_ITER_NOUNLOCK) + ? (!iter->trans->nounlock ? __bch2_btree_iter_upgrade(iter, new_locks_want) : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) : iter->uptodate <= BTREE_ITER_NEED_PEEK; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 33bc94a714c6..6591da0a52b3 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -107,7 +107,7 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { - BUG_ON(!level && iter->flags & BTREE_ITER_NOUNLOCK); + EBUG_ON(!level && iter->trans->nounlock); __btree_node_unlock(iter, level); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ece4f30b3f85..d88f1c911d04 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -197,7 +197,6 @@ enum btree_iter_type { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_ERROR (1 << 5) -#define BTREE_ITER_NOUNLOCK (1 << 6) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -287,6 +286,7 @@ struct btree_trans { u8 size; unsigned used_mempool:1; unsigned error:1; + unsigned nounlock:1; unsigned mem_top; unsigned mem_bytes; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a449f983a343..b5749b19c3b9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -551,7 +551,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; - struct btree_iter *linked; int ret; if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) { @@ -624,17 +623,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - if (trans->flags & BTREE_INSERT_NOUNLOCK) { - /* - * linked iterators that weren't being updated may or may not - * have been traversed/locked, depending on what the caller was - * doing: - */ - trans_for_each_iter(trans, linked) - if (linked->uptodate < BTREE_ITER_NEED_RELOCK) - linked->flags |= BTREE_ITER_NOUNLOCK; - } - trans_for_each_update_iter(trans, i) if (update_has_triggers(trans, i) && !update_triggers_transactional(trans, i)) @@ -809,7 +797,6 @@ static int __bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *linked; int ret; trans_for_each_update_iter(trans, i) { @@ -832,18 +819,20 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) goto err; + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + trans_for_each_update_leaf(trans, i) bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + trans->nounlock = false; + trans_for_each_update_iter(trans, i) bch2_btree_iter_downgrade(i->iter); err: /* make sure we didn't drop or screw up locks: */ bch2_btree_trans_verify_locks(trans); - trans_for_each_iter(trans, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; - return ret; } -- cgit From 6009b4e5086783619172900e4090781491664517 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 May 2019 22:23:30 -0400 Subject: bcachefs: Merge extents with checksums Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 19 +++-- 
fs/bcachefs/checksum.c | 18 +--- fs/bcachefs/checksum.h | 16 ++++ fs/bcachefs/extents.c | 203 +++++++++++++++++++++++++++++---------------- 4 files changed, 160 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 37c44f087a0b..571013a0d1a0 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -202,15 +202,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, struct bkey_i *l, struct bkey_i *r) { const struct bkey_ops *ops = &bch2_bkey_ops[l->k.type]; + enum merge_result ret; - if (!key_merging_disabled(c) && - ops->key_merge && - l->k.type == r->k.type && - !bversion_cmp(l->k.version, r->k.version) && - !bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return ops->key_merge(c, l, r); + if (key_merging_disabled(c) || + !ops->key_merge || + l->k.type != r->k.type || + bversion_cmp(l->k.version, r->k.version) || + bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + return BCH_MERGE_NOMERGE; - return BCH_MERGE_NOMERGE; + ret = ops->key_merge(c, l, r); + + if (ret != BCH_MERGE_NOMERGE) + l->k.needs_whiteout |= r->k.needs_whiteout; + return ret; } static const struct old_bkey_type { diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 98dc39de1e73..664e1bc2b139 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -281,22 +281,8 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: - return true; - default: - return false; - } -} - -static struct bch_csum bch2_checksum_merge(unsigned type, - struct bch_csum a, - struct bch_csum b, size_t b_len) +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) { BUG_ON(!bch2_checksum_mergeable(type)); diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index e2f2d797f90c..afdbbf702970 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -9,6 +9,22 @@ #include #include +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, + struct bch_csum, size_t); + static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) { return crc64_be(crc, p, len); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2ebde20c74f8..33c00db899e0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1358,53 +1358,63 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; + +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src) +{ +#define set_common_fields(_dst, _src) \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (extent_entry_type(to_entry(dst))) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case 
BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields +} + static void bch2_extent_crc_init(union bch_extent_crc *crc, struct bch_extent_crc_unpacked new) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } - - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + else + BUG(); - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; - } -#undef common_fields - BUG(); + bch2_extent_crc_pack(crc, new); } void bch2_extent_crc_append(struct bkey_i_extent *e, @@ -1515,46 +1525,98 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, { struct bkey_s_extent el = bkey_i_to_s_extent(l); struct bkey_s_extent er = bkey_i_to_s_extent(r); - union bch_extent_entry *en_l, *en_r; + union bch_extent_entry *en_l = el.v->start; + union bch_extent_entry *en_r = er.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) - return BCH_MERGE_NOMERGE; + crc_l = bch2_extent_crc_unpack(el.k, NULL); extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - !extent_entry_is_ptr(en_l)) + if (extent_entry_type(en_l) != extent_entry_type(en_r)) return BCH_MERGE_NOMERGE; - lp = &en_l->ptr; - rp = &en_r->ptr; + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; - /* We don't allow 
extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(er.k, entry_to_crc(en_r)); + + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + + break; + default: return BCH_MERGE_NOMERGE; + } } - l->k.needs_whiteout |= r->k.needs_whiteout; + extent_for_each_entry(el, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; + + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; + if (!extent_entry_is_crc(en_l)) + continue; + + crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(er.k, entry_to_crc(en_r)); + + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); } bch2_key_resize(&l->k, l->k.size + r->k.size); @@ -1725,11 +1787,6 @@ enum merge_result bch2_reservation_merge(struct bch_fs *c, li->v.nr_replicas != ri->v.nr_replicas) return BCH_MERGE_NOMERGE; - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { bch2_key_resize(&l->k, KEY_SIZE_MAX); bch2_cut_front(l->k.p, r); -- cgit From 42c7d748e4983be1b7fdf3ff58920eab92c8833d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 May 2019 00:30:02 -0400 Subject: bcachefs: stripe creation fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 07245717ca4e..56939428a21a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -180,6 +180,25 @@ static int extent_matches_stripe(struct bch_fs *c, return -1; } +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +{ + struct bkey_s_c_extent e; + const union bch_extent_entry *entry; + + if (!bkey_extent_is_data(k.k)) + return false; + + e = bkey_s_c_to_extent(k); + + 
extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; + + return false; +} + static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i_stripe *s, struct open_buckets *blocks, @@ -756,12 +775,19 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { + bch2_btree_iter_next(iter); + continue; + } + idx = extent_matches_stripe(c, &s->key.v, k); if (idx < 0) { bch2_btree_iter_next(iter); continue; } + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + dev = s->key.v.ptrs[idx].dev; bkey_reassemble(&tmp.k, k); -- cgit From 17758a6c2367f8613490883d541c767c361194a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2019 17:36:31 -0400 Subject: bcachefs: bch2_btree_delete_at_range() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 + fs/bcachefs/btree_update_leaf.c | 94 ++++++++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4438a9992442..a967f196c87a 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -106,6 +106,8 @@ int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); +int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b5749b19c3b9..5366d2a6c268 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -955,20 +955,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, return i; } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); -} - /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -998,30 +984,17 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, return ret; } -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at_range(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - +retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - /* really shouldn't be using a bare, unpadded bkey_i */ struct bkey_i delete; bkey_init(&delete.k); @@ -1039,26 +1012,69 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, delete.k.p 
= iter->pos; if (iter->flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); bch2_extent_trim_atomic(&delete, iter); } - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &delete)); - - ret = bch2_trans_commit(&trans, NULL, journal_seq, + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - ret = 0; if (ret) break; - bch2_trans_cond_resched(&trans); + bch2_trans_cond_resched(trans); } - bch2_trans_exit(&trans); + if (ret == -EINTR) { + ret = 0; + goto retry; + } + + return ret; + +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); + + iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); + + ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); + ret = bch2_trans_exit(&trans) ?: ret; + BUG_ON(ret == -EINTR); return ret; } -- cgit From ed8413fdab6567985995051869ad211fb8f15d5f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 May 2019 14:08:23 -0400 Subject: bcachefs: improved btree locking tracepoints Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 75 ++++++++++++++++++++++++------------- fs/bcachefs/btree_iter.h | 10 ++--- fs/bcachefs/btree_update_interior.c | 1 + fs/bcachefs/btree_update_leaf.c | 7 +++- fs/bcachefs/trace.h | 48 ++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e18a88cd972e..4d107d890438 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -15,13 +15,18 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, struct btree_iter_level *, struct bkey *); -#define BTREE_ITER_NOT_END ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) +#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && - iter->l[l].b && - iter->l[l].b != BTREE_ITER_NOT_END; + (unsigned long) iter->l[l].b >= 128; } /* Returns < 0 if @k is before iter pos, > 0 if @k is after */ @@ -106,19 +111,20 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) struct btree *b = btree_iter_node(iter, level); int want = __btree_lock_want(iter, level); - if (!b || b == BTREE_ITER_NOT_END) + if (!is_btree_node(iter, level)) return false; if (race_fault()) return 
false; - if (!six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) && - !(iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 && - btree_node_lock_increment(iter, b, level, want))) + if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || + (btree_node_lock_seq_matches(iter, b, level) && + btree_node_lock_increment(iter, b, level, want))) { + mark_btree_node_locked(iter, level, want); + return true; + } else { return false; - - mark_btree_node_locked(iter, level, want); - return true; + } } static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) @@ -141,7 +147,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - if (iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 && + if (btree_node_lock_seq_matches(iter, b, level) && btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; @@ -166,6 +172,23 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, if (!(upgrade ? bch2_btree_node_upgrade(iter, l) : bch2_btree_node_relock(iter, l))) { + if (upgrade) + trace_node_upgrade_fail(l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? iter->l[l].b->c.lock.state.seq + : 0); + else + trace_node_relock_fail(l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? iter->l[l].b->c.lock.state.seq + : 0); + fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } @@ -180,7 +203,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, */ while (fail_idx >= 0) { btree_node_unlock(iter, fail_idx); - iter->l[fail_idx].b = BTREE_ITER_NOT_END; + iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; --fail_idx; } @@ -810,7 +833,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { __btree_node_unlock(linked, level); - linked->l[level].b = BTREE_ITER_NOT_END; + linked->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -848,7 +871,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, * that depth */ iter->level = depth_want; - iter->l[iter->level].b = NULL; + for (i = iter->level; i < BTREE_MAX_DEPTH; i++) + iter->l[i].b = NULL; return 1; } @@ -861,13 +885,14 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, b->c.level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NOT_END; + iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; iter->l[iter->level].b = b; + for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) + iter->l[i].b = NULL; mark_btree_node_locked(iter, iter->level, lock_type); btree_iter_node_set(iter, b); return 0; - } six_unlock_type(&b->c.lock, lock_type); @@ -973,7 +998,7 @@ retry_all: if (unlikely(ret == -EIO)) { trans->error = true; iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_ERROR; goto out; } @@ -1008,12 +1033,12 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, unsigned l = iter->level; while (btree_iter_node(iter, l) && - !(is_btree_node(iter, l) && - bch2_btree_node_relock(iter, l) && - (!check_pos || - btree_iter_pos_in_node(iter, iter->l[l].b)))) { + (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l) || + (check_pos && + 
!btree_iter_pos_in_node(iter, iter->l[l].b)))) { btree_node_unlock(iter, l); - iter->l[l].b = BTREE_ITER_NOT_END; + iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } @@ -1069,7 +1094,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return 0; iter->level = depth_want; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; return ret; } } @@ -1590,7 +1615,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; prefetch(c->btree_roots[btree_id].b); } @@ -1798,7 +1823,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ee5437813604..4dff3bc70fbc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -76,14 +76,12 @@ static inline struct btree_iter * __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, unsigned idx) { - EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); + struct btree_iter *iter = __trans_next_iter(trans, idx); - for (; idx < trans->nr_iters; idx++) - if ((trans->iters_linked & (1ULL << idx)) && - __iter_has_node(&trans->iters[idx], b)) - return &trans->iters[idx]; + while (iter && !__iter_has_node(iter, b)) + iter = __trans_next_iter(trans, iter->idx + 1); - return NULL; + return iter; } #define trans_for_each_iter_with_node(_trans, _b, _iter) \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e3595a8dce6a..9ec91c5c1bd8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1586,6 +1586,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * instead of locking/reserving all the way to the root: */ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(c, iter->trans->ip); ret = -EINTR; goto out; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5366d2a6c268..d21f6f035182 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -567,6 +567,8 @@ static inline int do_btree_insert_at(struct btree_trans *trans, update_triggers_transactional(trans, i)) { ret = bch2_trans_mark_update(trans, i, &trans->fs_usage_deltas); + if (ret == -EINTR) + trace_trans_restart_mark(c, trans->ip); if (ret) return ret; } @@ -714,7 +716,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, * don't care if we got ENOSPC because we told split it * couldn't block: */ - if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) { + if (!ret || + ret == -EINTR || + (flags & BTREE_INSERT_NOUNLOCK)) { trans_restart(" (split)"); trace_trans_restart_btree_node_split(c, trans->ip); ret = -EINTR; @@ -806,6 +810,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (!bch2_btree_iter_upgrade(i->iter, 1)) { trans_restart(" (failed upgrade, locks_want %u uptodate %u)", old_locks_want, old_uptodate); + trace_trans_restart_upgrade(c, trans->ip); ret = -EINTR; goto err; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 22a378d5f64f..86f58206365d 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -561,6 +561,21 @@ 
DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, TP_ARGS(c, ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_mark, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_upgrade, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_PROTO(struct bch_fs *c, unsigned long ip), TP_ARGS(c, ip) @@ -571,6 +586,39 @@ DEFINE_EVENT(transaction_restart, trans_restart_atomic, TP_ARGS(c, ip) ); +DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), + + TP_STRUCT__entry( + __field(u32, level) + __field(u32, iter_seq) + __field(u32, node) + __field(u32, node_seq) + ), + + TP_fast_assign( + __entry->level = level; + __entry->iter_seq = iter_seq; + __entry->node = node; + __entry->node_seq = node_seq; + ), + + TP_printk("level %u iter seq %u node %u node seq %u", + __entry->level, __entry->iter_seq, + __entry->node, __entry->node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_upgrade_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_relock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From 6fb076e60d65e574756cd1a4262c1c05b750ec21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 May 2019 14:37:11 -0400 Subject: bcachefs: Fix spurious inconsistency in recovery Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++++-- fs/bcachefs/buckets.c | 5 ++++- fs/bcachefs/buckets.h | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 25c18b8cd3a6..61991d898d99 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -232,7 +232,9 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, true, 0, NULL, 0, 0); + bch2_mark_key(c, k, true, 0, NULL, 0, + BCH_BUCKET_MARK_NOATOMIC| + BCH_BUCKET_MARK_ALLOC_READ); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -243,7 +245,9 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) bch2_mark_key(c, bkey_i_to_s_c(j->k), - true, 0, NULL, 0, 0); + true, 0, NULL, 0, + BCH_BUCKET_MARK_NOATOMIC| + BCH_BUCKET_MARK_ALLOC_READ); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d0a288dddb45..78b4c93a7170 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -657,7 +657,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, g = __bucket(ca, k.k->p.offset, gc); u = bch2_alloc_unpack(k); - old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ + old = bucket_cmpxchg(g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; @@ -669,6 +669,9 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, } })); + 
if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) + bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 578019089a91..9f53fe6280f3 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -251,6 +251,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_GC (1 << 0) #define BCH_BUCKET_MARK_NOATOMIC (1 << 1) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 2) int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, bool, s64, struct bch_fs_usage *, -- cgit From 58fbf80834c9fc624a269047a07e94a188350f20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 09:47:40 -0400 Subject: bcachefs: Delete duplicate code Also rename for consistency Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 28 +++++----------------------- fs/bcachefs/btree_iter.h | 6 ++---- fs/bcachefs/btree_update_interior.c | 26 +++++++++++++------------- fs/bcachefs/btree_update_leaf.c | 14 +++++++------- fs/bcachefs/ec.c | 2 +- fs/bcachefs/fs-io.c | 6 +++--- fs/bcachefs/fsck.c | 6 +++--- fs/bcachefs/io.c | 4 ++-- 9 files changed, 37 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ece2c4d2ebd8..34a6d67a5bf1 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -841,7 +841,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, } } - bch2_btree_trans_relock(trans); + bch2_trans_relock(trans); } out: if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4d107d890438..5995e8e6c0d7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -408,7 +408,9 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, bch2_btree_trans_verify_locks(iter->trans); } -bool bch2_btree_trans_relock(struct btree_trans *trans) +/* Btree transaction locking: */ + +bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; bool ret = true; @@ -419,7 +421,7 @@ bool bch2_btree_trans_relock(struct btree_trans *trans) return ret; } -void bch2_btree_trans_unlock(struct btree_trans *trans) +void bch2_trans_unlock(struct btree_trans *trans) { struct btree_iter *iter; @@ -427,8 +429,6 @@ void bch2_btree_trans_unlock(struct btree_trans *trans) __bch2_btree_iter_unlock(iter); } -/* Btree transaction locking: */ - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -982,7 +982,7 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, #undef btree_iter_cmp_by_idx retry_all: - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -1885,24 +1885,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, return ret; } -int bch2_trans_unlock(struct btree_trans *trans) -{ - u64 iters = trans->iters_linked; - int ret = 0; - - while (iters) { - unsigned idx = __ffs64(iters); - struct btree_iter *iter = &trans->iters[idx]; - - ret = ret ?: btree_iter_err(iter); - - __bch2_btree_iter_unlock(iter); - iters ^= 1ULL << idx; - } - - return ret; -} - inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) { iters &= trans->iters_linked; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4dff3bc70fbc..18100722ccfd 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -103,8 +103,8 
@@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_trans_relock(struct btree_trans *); -void bch2_btree_trans_unlock(struct btree_trans *); +bool bch2_trans_relock(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); @@ -191,8 +191,6 @@ static inline int btree_iter_cmp(const struct btree_iter *l, return __btree_iter_cmp(l->btree_id, l->pos, r); } -int bch2_trans_unlock(struct btree_trans *); - /* * Unlocks before scheduling * Note: does not revalidate iterator diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9ec91c5c1bd8..d0ca08a323a1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1574,10 +1574,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(trans)) + if (!bch2_trans_relock(trans)) ret = -EINTR; } @@ -1598,7 +1598,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ret = PTR_ERR(as); if (ret == -EAGAIN) { BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = -EINTR; } goto out; @@ -1778,7 +1778,7 @@ err_cycle_gc_lock: if (flags & BTREE_INSERT_NOUNLOCK) goto out; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); down_read(&c->gc_lock); up_read(&c->gc_lock); @@ -1794,7 +1794,7 @@ err: if ((ret == -EAGAIN || ret == -EINTR) && !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -1874,7 +1874,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); down_read(&c->gc_lock); } } @@ -1893,7 +1893,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ret != -EINTR) break; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); closure_sync(&cl); } @@ -2046,10 +2046,10 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, return -EINTR; if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) { + if (!bch2_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -2060,12 +2060,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) { + if (!bch2_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -2089,12 +2089,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret != -EINTR) goto err; - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) + if 
(!bch2_trans_relock(iter->trans)) goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d21f6f035182..7286a5b45481 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -431,14 +431,14 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) if (ret != -EAGAIN) return ret; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, u64s, 0); if (ret) return ret; - if (!bch2_btree_trans_relock(trans)) { + if (!bch2_trans_relock(trans)) { trans_restart(" (iter relock after journal preres get blocked)"); trace_trans_restart_journal_preres_get(c, trans->ip); return -EINTR; @@ -736,7 +736,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } - if (bch2_btree_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; trans_restart(" (iter relock after marking replicas)"); @@ -744,13 +744,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; - if (bch2_btree_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; trans_restart(" (iter relock after journal res get blocked)"); @@ -878,7 +878,7 @@ int bch2_trans_commit(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) return -EROFS; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_fs_read_write_early(c); if (ret) @@ -886,7 +886,7 @@ int bch2_trans_commit(struct btree_trans *trans, percpu_ref_get(&c->writes); - if (!bch2_btree_trans_relock(trans)) { + if (!bch2_trans_relock(trans)) { ret = -EINTR; goto err; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 56939428a21a..49cbc0bcd522 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -563,7 +563,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); ret = -EINTR; if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 619daf65e1ec..bce45c87c901 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -322,10 +322,10 @@ static int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { if (c->opts.new_inode_updates) { - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); mutex_lock(&inode->ei_update_lock); - if (!bch2_btree_trans_relock(trans)) { + if (!bch2_trans_relock(trans)) { mutex_unlock(&inode->ei_update_lock); return -EINTR; } @@ -949,7 +949,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, } bkey_reassemble(&tmp.k, k); - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); if (readpages_iter) { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 61569e4e1c77..2dfa87edb116 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -58,7 +58,7 @@ static int remove_dirent(struct btree_trans *trans, name.name = buf; /* Unlock so we don't deadlock, after copying name: */ - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); if (ret) { @@ -1015,7 +1015,7 @@ retry: if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, "unreachable directory 
found (inum %llu)", k.k->p.inode)) { - bch2_btree_trans_unlock(&trans); + bch2_trans_unlock(&trans); ret = reattach_inode(c, lostfound_inode, k.k->p.inode); if (ret) { @@ -1229,7 +1229,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_unpack(inode, &u); - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); if (bch2_fs_inconsistent_on(ret, c, "error unpacking inode %llu in fsck", diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9bb4b1fe5b8a..a676cc1e390d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1332,7 +1332,7 @@ retry: bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_trans_unlock(&trans); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, bvec_iter.bi_size, (k.k->p.offset - bvec_iter.bi_sector) << 9); @@ -1889,7 +1889,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_trans_unlock(&trans); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); -- cgit From 87c3beb4a589312f2c150129309a48b5518385e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 09:49:46 -0400 Subject: bcachefs: Fix a bug with spinning on the journal Transactional triggers meant that when we failed to get a journal reservation, then bailed out into the error path to block on a journal reservation, the second blocking call into the journal code was asking for less space, which is not what we want. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d88f1c911d04..9bab213fd65b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -301,6 +301,7 @@ struct btree_trans { u64 *journal_seq; struct disk_reservation *disk_res; unsigned flags; + unsigned journal_u64s; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7286a5b45481..88e038c1ccef 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -451,21 +451,13 @@ static int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; int ret; - if (unlikely(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - return 0; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) flags |= JOURNAL_RES_GET_RESERVED; - trans_for_each_update(trans, i) - u64s += jset_u64s(i->k->k.u64s); - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - u64s, flags); + trans->journal_u64s, flags); return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; } @@ -612,9 +604,16 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * Don't get journal reservation until after we know insert will * succeed: */ - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); - if (ret) - goto out; + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + trans->journal_u64s = 0; + + trans_for_each_update(trans, i) + trans->journal_u64s += jset_u64s(i->k->k.u64s); + + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + if (ret) + goto out; + } if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) -- cgit From 0e6dd8fba04499c1f1a20dab2d463b12b03b2770 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 09:53:27 -0400 Subject: bcachefs: Ensure bch2_btree_iter_next() always advances Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 32 ++++++++++++++++++++++---------- fs/bcachefs/btree_iter.h | 11 +++++------ 2 files changed, 27 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5995e8e6c0d7..cbf9281e195b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1110,7 +1110,8 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - ret = __bch2_btree_iter_traverse(iter); + ret = bch2_trans_cond_resched(iter->trans) ?: + __bch2_btree_iter_traverse(iter); if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); @@ -1302,9 +1303,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return btree_iter_peek_uptodate(iter); while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } k = __btree_iter_peek(iter, l); if (likely(k.k)) @@ -1356,10 +1359,17 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR_OR_NULL(k.k)) - return k; + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + + return bch2_btree_iter_peek(iter); } do { @@ -1559,9 +1569,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } return __bch2_btree_iter_peek_slot(iter); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 18100722ccfd..ee2cea2b0b44 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -195,13 +195,14 @@ static inline int btree_iter_cmp(const struct btree_iter *l, * Unlocks before scheduling * Note: does not revalidate iterator */ -static inline void bch2_trans_cond_resched(struct btree_trans *trans) +static inline int bch2_trans_cond_resched(struct btree_trans *trans) { - if (need_resched()) { + if (need_resched() || race_fault()) { bch2_trans_unlock(trans); schedule(); - } else 
if (race_fault()) { - bch2_trans_unlock(trans); + return bch2_trans_relock(trans) ? 0 : -EINTR; + } else { + return 0; } } @@ -229,8 +230,6 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, unsigned flags) { - bch2_trans_cond_resched(iter->trans); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_next_slot(iter) : bch2_btree_iter_next(iter); -- cgit From 7d825866604b34ba02b4c286c6fd6d232fd06cd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 10:08:55 -0400 Subject: bcachefs: Avoid spurious transaction restarts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 43 ++++++++++++++++++++----------------------- fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/fs.c | 1 + 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cbf9281e195b..b058b6f3b89d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -160,7 +160,7 @@ success: } static inline bool btree_iter_get_locks(struct btree_iter *iter, - bool upgrade) + bool upgrade, bool trace) { unsigned l = iter->level; int fail_idx = -1; @@ -172,16 +172,10 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, if (!(upgrade ? bch2_btree_node_upgrade(iter, l) : bch2_btree_node_relock(iter, l))) { - if (upgrade) - trace_node_upgrade_fail(l, iter->l[l].lock_seq, - is_btree_node(iter, l) - ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? iter->l[l].b->c.lock.state.seq - : 0); - else - trace_node_relock_fail(l, iter->l[l].lock_seq, + if (trace) + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(l, iter->l[l].lock_seq, is_btree_node(iter, l) ? 0 : (unsigned long) iter->l[l].b, @@ -251,7 +245,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } ret = false; } @@ -268,7 +262,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } ret = false; } @@ -312,10 +306,10 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) #endif __flatten -static bool bch2_btree_iter_relock(struct btree_iter *iter) +static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? btree_iter_get_locks(iter, false) + ? 
btree_iter_get_locks(iter, false, trace) : true; } @@ -328,7 +322,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, iter->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true)) + if (btree_iter_get_locks(iter, true, true)) return true; /* @@ -341,7 +335,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, linked->btree_id == iter->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } return false; @@ -416,7 +410,8 @@ bool bch2_trans_relock(struct btree_trans *trans) bool ret = true; trans_for_each_iter(trans, iter) - ret &= bch2_btree_iter_relock(iter); + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + ret &= bch2_btree_iter_relock(iter, true); return ret; } @@ -1061,7 +1056,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (bch2_btree_iter_relock(iter)) + if (bch2_btree_iter_relock(iter, false)) return 0; /* @@ -1672,11 +1667,13 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, return ret; } -static int btree_trans_realloc_iters(struct btree_trans *trans, - unsigned new_size) +int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) { void *new_iters, *new_updates; + new_size = roundup_pow_of_two(new_size); + BUG_ON(new_size > BTREE_ITER_MAX); if (new_size <= trans->size) @@ -1727,7 +1724,7 @@ success: void bch2_trans_preload_iters(struct btree_trans *trans) { - btree_trans_realloc_iters(trans, BTREE_ITER_MAX); + bch2_trans_realloc_iters(trans, BTREE_ITER_MAX); } static int btree_trans_iter_alloc(struct btree_trans *trans) @@ -1738,7 +1735,7 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret = btree_trans_realloc_iters(trans, trans->size * 2); + int ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ret; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ee2cea2b0b44..3089aa7cf8e9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -258,7 +258,9 @@ static inline int bkey_err(struct bkey_s_c k) /* new multiple iterator interface: */ +int bch2_trans_realloc_iters(struct btree_trans *, unsigned); void bch2_trans_preload_iters(struct btree_trans *); + int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index aac59b8a15eb..b5a025939f51 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -415,6 +415,7 @@ __bch2_create(struct mnt_idmap *idmap, mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c); + bch2_trans_realloc_iters(&trans, 8); retry: bch2_trans_begin(&trans); -- cgit From 20bceecb3159bbe06a26fc6747457d9de02ec227 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 10:54:43 -0400 Subject: bcachefs: More work to avoid transaction restarts Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 4 +- fs/bcachefs/alloc_background.c | 8 ++-- fs/bcachefs/btree_cache.c | 3 +- fs/bcachefs/btree_gc.c | 6 +-- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 53 +++++++++++++--------- fs/bcachefs/btree_iter.h | 5 +-- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 31 ++++++++----- 
fs/bcachefs/debug.c | 6 +-- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/ec.c | 13 +++--- fs/bcachefs/extents.c | 2 +- fs/bcachefs/fs-io.c | 22 ++++----- fs/bcachefs/fs.c | 17 ++++--- fs/bcachefs/fsck.c | 21 +++------ fs/bcachefs/inode.c | 2 +- fs/bcachefs/io.c | 10 ++--- fs/bcachefs/journal_seq_blacklist.c | 2 +- fs/bcachefs/migrate.c | 5 +-- fs/bcachefs/move.c | 7 ++- fs/bcachefs/quota.c | 6 +-- fs/bcachefs/recovery.c | 3 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/tests.c | 26 +++++------ fs/bcachefs/trace.h | 90 +++++++++++++++++++++++-------------- fs/bcachefs/xattr.c | 4 +- 28 files changed, 189 insertions(+), 169 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index c7f6bcb87387..1c3343252129 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -222,7 +222,7 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -305,7 +305,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); if (type == ACL_TYPE_ACCESS && acl) { ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 61991d898d99..23b81f6615ca 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -229,7 +229,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) unsigned i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) bch2_mark_key(c, k, true, 0, NULL, 0, @@ -288,7 +288,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) if (k->k.p.offset >= ca->mi.nbuckets) return 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, BTREE_ITER_INTENT); @@ -333,7 +333,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -1032,7 +1032,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) u64 journal_seq = 0; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 34a6d67a5bf1..60a7acd18603 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -732,8 +732,7 @@ retry: goto retry; trans_restart(); - trace_trans_restart_btree_node_reused(c, - iter->trans->ip); + trace_trans_restart_btree_node_reused(iter->trans->ip); return ERR_PTR(-EINTR); } } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3dc073e5e5b6..047f30efdd7a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -217,7 +217,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, u8 max_stale; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -286,7 +286,7 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, if (ret) return ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); 
for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), BTREE_ITER_SLOTS, k, ret) { @@ -1055,7 +1055,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *merge[GC_MERGE_NODES]; u32 lock_seq[GC_MERGE_NODES]; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); /* * XXX: We don't have a good way of positively matching on sibling nodes diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index baffb58fd10b..d4806809fc0d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1151,7 +1151,7 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct btree_iter *iter; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, 0); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b058b6f3b89d..a906eb1c5f5a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -270,8 +270,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (unlikely(!ret)) { trans_restart(); - trace_trans_restart_would_deadlock(iter->trans->c, - iter->trans->ip); + trace_trans_restart_would_deadlock(iter->trans->ip); return false; } @@ -1667,7 +1666,7 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, return ret; } -int bch2_trans_realloc_iters(struct btree_trans *trans, +static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { void *new_iters, *new_updates; @@ -1715,18 +1714,13 @@ success: if (trans->iters_live) { trans_restart(); - trace_trans_restart_iters_realloced(trans->c, trans->ip); + trace_trans_restart_iters_realloced(trans->ip, trans->size); return -EINTR; } return 0; } -void bch2_trans_preload_iters(struct btree_trans *trans) -{ - bch2_trans_realloc_iters(trans, BTREE_ITER_MAX); -} - static int btree_trans_iter_alloc(struct btree_trans *trans) { unsigned idx = __ffs64(~trans->iters_linked); @@ -1866,32 +1860,41 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, return &trans->iters[idx]; } -void *bch2_trans_kmalloc(struct btree_trans *trans, - size_t size) +static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) { - void *ret; - - if (trans->mem_top + size > trans->mem_bytes) { + if (size > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(trans->mem_top + size); + size_t new_bytes = roundup_pow_of_two(size); void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); if (!new_mem) - return ERR_PTR(-ENOMEM); + return -ENOMEM; trans->mem = new_mem; trans->mem_bytes = new_bytes; if (old_bytes) { trans_restart(); - trace_trans_restart_mem_realloced(trans->c, trans->ip); - return ERR_PTR(-EINTR); + trace_trans_restart_mem_realloced(trans->ip, new_bytes); + return -EINTR; } } - ret = trans->mem + trans->mem_top; + return 0; +} + +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + void *p; + int ret; + + ret = bch2_trans_preload_mem(trans, trans->mem_top + size); + if (ret) + return ERR_PTR(ret); + + p = trans->mem + trans->mem_top; trans->mem_top += size; - return ret; + return p; } inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) @@ -1938,7 +1941,9 @@ void __bch2_trans_begin(struct btree_trans *trans) bch2_btree_iter_traverse_all(trans); } -void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + 
size_t expected_mem_bytes) { memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); @@ -1947,6 +1952,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + + if (expected_nr_iters > trans->size) + bch2_trans_realloc_iters(trans, expected_nr_iters); + + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); } int bch2_trans_exit(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3089aa7cf8e9..e8c31852d5fd 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -258,9 +258,6 @@ static inline int bkey_err(struct bkey_s_c k) /* new multiple iterator interface: */ -int bch2_trans_realloc_iters(struct btree_trans *, unsigned); -void bch2_trans_preload_iters(struct btree_trans *); - int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); @@ -303,7 +300,7 @@ static inline void bch2_trans_begin_updates(struct btree_trans *trans) } void *bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_init(struct btree_trans *, struct bch_fs *); +void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); #ifdef TRACE_TRANSACTION_RESTARTS diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a967f196c87a..c25e7a752cc9 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -128,7 +128,7 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *, struct btree_trans trans; \ int _ret; \ \ - bch2_trans_init(&trans, (_c)); \ + bch2_trans_init(&trans, (_c), 0, 0); \ \ do { \ bch2_trans_begin(&trans); \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d0ca08a323a1..dcfcfe97b6f4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1586,7 +1586,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * instead of locking/reserving all the way to the root: */ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(c, iter->trans->ip); + trace_trans_restart_iter_upgrade(trans->ip); ret = -EINTR; goto out; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 88e038c1ccef..0aca109dac06 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -440,7 +440,7 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) if (!bch2_trans_relock(trans)) { trans_restart(" (iter relock after journal preres get blocked)"); - trace_trans_restart_journal_preres_get(c, trans->ip); + trace_trans_restart_journal_preres_get(trans->ip); return -EINTR; } @@ -560,7 +560,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, ret = bch2_trans_mark_update(trans, i, &trans->fs_usage_deltas); if (ret == -EINTR) - trace_trans_restart_mark(c, trans->ip); + trace_trans_restart_mark(trans->ip); if (ret) return ret; } @@ -570,7 +570,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (race_fault()) { ret = -EINTR; trans_restart(" (race)"); - trace_trans_restart_fault_inject(c, trans->ip); + trace_trans_restart_fault_inject(trans->ip); goto out; } @@ -719,7 +719,7 @@ int bch2_trans_commit_error(struct 
btree_trans *trans, ret == -EINTR || (flags & BTREE_INSERT_NOUNLOCK)) { trans_restart(" (split)"); - trace_trans_restart_btree_node_split(c, trans->ip); + trace_trans_restart_btree_node_split(trans->ip); ret = -EINTR; } break; @@ -739,7 +739,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (iter relock after marking replicas)"); - trace_trans_restart_mark_replicas(c, trans->ip); + trace_trans_restart_mark_replicas(trans->ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: @@ -753,7 +753,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (iter relock after journal res get blocked)"); - trace_trans_restart_journal_res_get(c, trans->ip); + trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; default: @@ -766,7 +766,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret2) { trans_restart(" (traverse)"); - trace_trans_restart_traverse(c, trans->ip); + trace_trans_restart_traverse(trans->ip); return ret2; } @@ -778,7 +778,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; trans_restart(" (atomic)"); - trace_trans_restart_atomic(c, trans->ip); + trace_trans_restart_atomic(trans->ip); } return ret; @@ -809,7 +809,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (!bch2_btree_iter_upgrade(i->iter, 1)) { trans_restart(" (failed upgrade, locks_want %u uptodate %u)", old_locks_want, old_uptodate); - trace_trans_restart_upgrade(c, trans->ip); + trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto err; } @@ -975,7 +975,9 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct btree_iter *iter; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -983,6 +985,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -1071,8 +1075,11 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct btree_iter *iter; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + /* + * XXX: whether we need mem/more iters depends on whether this btree id + * has triggers + */ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index a11d7923ea5a..4c6fcb6f918e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -221,7 +221,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); k = bch2_btree_iter_peek(iter); @@ -275,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size || !bkey_cmp(POS_MAX, i->from)) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); @@ -328,7 +328,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); iter = bch2_trans_get_iter(&trans, i->id, 
i->from, BTREE_ITER_PREFETCH); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 71971b3cc851..1442dacef0de 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -313,7 +313,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, struct bkey_s_c k; u64 inum = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, hash_info, dir_inum, name, 0); @@ -370,7 +370,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(inode->v.i_ino, ctx->pos), 0, k, ret) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 49cbc0bcd522..4a8aa7491fb5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -441,7 +441,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) if (!buf) return -ENOMEM; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, stripe_idx), @@ -698,7 +698,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -765,8 +765,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, BKEY_PADDED(k) tmp; int ret = 0, dev, idx; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), @@ -1236,7 +1235,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); BUG_ON(!new_key); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -1272,7 +1271,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) if (ret) return ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) bch2_mark_key(c, k, true, 0, NULL, 0, 0); @@ -1299,7 +1298,7 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) size_t i, idx = 0; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 33c00db899e0..d8d128cae5b4 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1712,7 +1712,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, end.offset += size; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, BTREE_ITER_SLOTS, k, err) { diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index bce45c87c901..9d0cca0bdfa3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -435,8 +435,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1004,7 +1003,7 @@ void bch2_readahead(struct readahead_control *ractl) ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = 
bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); @@ -1049,7 +1048,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; bio_add_page_contig(&rbio->bio, page); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); @@ -2090,8 +2089,7 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, BTREE_ITER_INTENT); @@ -2137,7 +2135,7 @@ static inline int range_has_data(struct bch_fs *c, struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -2394,8 +2392,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2510,8 +2507,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, inode).data_replicas; int ret; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2729,7 +2725,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), 0, k, ret) { @@ -2802,7 +2798,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b5a025939f51..defd35d04750 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -215,7 +215,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -414,8 +414,7 @@ __bch2_create(struct mnt_idmap *idmap, if (!tmpfile) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c); - bch2_trans_realloc_iters(&trans, 8); + bch2_trans_init(&trans, c, 8, 1024); retry: bch2_trans_begin(&trans); @@ -572,7 +571,7 @@ static int __bch2_link(struct bch_fs *c, int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 4, 1024); retry: bch2_trans_begin(&trans); @@ -659,7 +658,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) int ret; bch2_lock_inodes(dir, inode); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 4, 1024); retry: bch2_trans_begin(&trans); @@ -870,13 +869,13 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } + bch2_trans_init(&trans, c, 8, 2048); + bch2_lock_inodes(i.src_dir, i.dst_dir, i.src_inode, i.dst_inode); - bch2_trans_init(&trans, c); - if (S_ISDIR(i.src_inode->v.i_mode) && inode_attrs_changing(i.dst_dir, i.src_inode)) { ret = 
-EXDEV; @@ -1045,7 +1044,7 @@ static int bch2_setattr_nonsize(struct mnt_idmap *idmap, if (ret) goto err; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); kfree(acl); @@ -1208,7 +1207,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(ei->v.i_ino, start >> 9), 0, k, ret) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2dfa87edb116..e3738757b6a0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -451,8 +451,7 @@ static int check_extents(struct bch_fs *c) u64 i_sectors; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); @@ -547,8 +546,7 @@ static int check_dirents(struct bch_fs *c) bch_verbose(c, "checking dirents"); - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); hash_check_init(&h); @@ -704,8 +702,7 @@ static int check_xattrs(struct bch_fs *c) hash_check_init(&h); - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); @@ -918,8 +915,7 @@ static int check_directory_structure(struct bch_fs *c, u64 d_inum; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking directory structure"); @@ -1085,8 +1081,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, u64 d_inum; int ret; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); @@ -1334,8 +1329,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, int ret = 0, ret2 = 0; u64 nlinks_pos; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(range_start, 0), 0); @@ -1459,8 +1453,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) struct bkey_s_c_inode inode; int ret; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8e7bec8ce542..e6ad0ad51cb2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -391,7 +391,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) if (ret) return ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a676cc1e390d..11cdaddb1551 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -302,7 +302,7 @@ int bch2_write_index_default(struct bch_write_op *op) BUG_ON(bch2_keylist_empty(keys)); bch2_verify_keylist_sorted(keys); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -1271,7 +1271,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, 
struct bch_read_bio *rbio flags &= ~BCH_READ_LAST_FRAGMENT; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, BTREE_ITER_SLOTS); @@ -1319,7 +1319,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; @@ -1428,7 +1428,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) if (rbio->pick.crc.compression_type) return; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1868,7 +1868,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) BCH_READ_USER_MAPPED; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index ae64bf3248ef..787d9f7638d0 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -258,7 +258,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) unsigned i, nr, new_nr; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < BTREE_ID_NR; i++) { struct btree_iter *iter; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 6b17d7918aa4..ad41f5e36a7c 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -42,8 +42,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) BKEY_PADDED(key) tmp; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); @@ -113,7 +112,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 64ac8244e1e0..96f9f5950438 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -61,8 +61,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct keylist *keys = &op->insert_keys; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -499,7 +498,7 @@ int bch2_move_data(struct bch_fs *c, INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; stats->btree_id = BTREE_ID_EXTENTS; @@ -633,7 +632,7 @@ static int bch2_move_btree(struct bch_fs *c, enum data_cmd cmd; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_BTREE; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index b78df735d94c..f0da0fac09bf 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -361,7 +361,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), BTREE_ITER_PREFETCH, k, ret) { @@ -433,7 +433,7 @@ 
int bch2_fs_quota_read(struct bch_fs *c) return ret; } - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { @@ -726,7 +726,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3f0eda9f5d0c..59f678596a64 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -214,8 +214,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) bool split_compressed = false; int ret; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); retry: bch2_trans_begin(&trans); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3139161fbe88..b0f09a31c41e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -263,7 +263,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) if (k.k->type == KEY_TYPE_extent) { diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 0f5a3ed13f3e..92843bd09b04 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -35,7 +35,7 @@ static void test_delete(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, BTREE_ITER_INTENT); @@ -67,7 +67,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, BTREE_ITER_INTENT); @@ -95,7 +95,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) u64 i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); delete_test_keys(c); @@ -140,7 +140,7 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) u64 i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); delete_test_keys(c); @@ -190,7 +190,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) u64 i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); delete_test_keys(c); @@ -244,7 +244,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) u64 i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); delete_test_keys(c); @@ -305,7 +305,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr) struct btree_iter *iter; struct bkey_s_c k; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); @@ -324,7 +324,7 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) struct btree_iter *iter; struct bkey_s_c k; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); @@ -430,7 +430,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr) struct bkey_s_c k; u64 i; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { iter = bch2_trans_get_iter(&trans, 
BTREE_ID_DIRENTS, @@ -451,7 +451,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) int ret; u64 i; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, @@ -503,7 +503,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) bkey_cookie_init(&insert.k_i); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { @@ -526,7 +526,7 @@ static void seq_lookup(struct bch_fs *c, u64 nr) struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) ; @@ -540,7 +540,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, BTREE_ITER_INTENT, k, ret) { diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 86f58206365d..a9fcb5442186 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -500,16 +500,14 @@ TRACE_EVENT(copygc, ); DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip), + TP_PROTO(unsigned long ip), + TP_ARGS(ip), TP_STRUCT__entry( - __array(char, name, 16) __field(unsigned long, ip ) ), TP_fast_assign( - memcpy(__entry->name, c->name, 16); __entry->ip = ip; ), @@ -517,73 +515,97 @@ DECLARE_EVENT_CLASS(transaction_restart, ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_iters_realloced, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) +TRACE_EVENT(trans_restart_iters_realloced, + TP_PROTO(unsigned long ip, unsigned nr), + TP_ARGS(ip, nr), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned, nr ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->nr = nr; + ), + + TP_printk("%pS nr %u", (void *) __entry->ip, __entry->nr) ); -DEFINE_EVENT(transaction_restart, trans_restart_mem_realloced, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(unsigned long ip, unsigned long bytes), + TP_ARGS(ip, bytes), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->bytes = bytes; + ), + + TP_printk("%pS bytes %lu", (void *) __entry->ip, __entry->bytes) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + 
TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_mark, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_upgrade, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_traverse, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DECLARE_EVENT_CLASS(node_lock_fail, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 99fb42225508..2ccf64db8147 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -126,7 +126,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, struct bkey_s_c_xattr xattr; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &inode->ei_str_hash, inode->v.i_ino, @@ -277,7 +277,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) u64 inum = dentry->d_inode->i_ino; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS(inum, 0), 0, k, ret) { -- cgit From 61011ea237852ef7d29b7d6b3608a6538560fc76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 12:37:11 -0400 Subject: bcachefs: Rip out old hacky transaction restart tracing Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 - fs/bcachefs/btree_iter.c | 5 +---- fs/bcachefs/btree_iter.h | 22 +--------------------- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_leaf.c | 12 ------------ 5 files changed, 2 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 60a7acd18603..d80ba1d71826 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -731,7 +731,6 @@ retry: if (bch2_btree_node_relock(iter, level + 1)) goto retry; - trans_restart(); trace_trans_restart_btree_node_reused(iter->trans->ip); return ERR_PTR(-EINTR); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a906eb1c5f5a..10c8350618bc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -269,7 +269,6 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } if (unlikely(!ret)) { - trans_restart(); trace_trans_restart_would_deadlock(iter->trans->ip); return false; } @@ -1713,7 +1712,6 @@ success: trans->size = new_size; if (trans->iters_live) { - trans_restart(); trace_trans_restart_iters_realloced(trans->ip, trans->size); return -EINTR; } @@ -1874,7 +1872,6 @@ static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trans_restart(); trace_trans_restart_mem_realloced(trans->ip, new_bytes); return -EINTR; } @@ -1910,7 +1907,7 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) } } -void 
__bch2_trans_begin(struct btree_trans *trans) +void bch2_trans_begin(struct btree_trans *trans) { u64 iters_to_unlink; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e8c31852d5fd..710ed70ec807 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -292,7 +292,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -void __bch2_trans_begin(struct btree_trans *); +void bch2_trans_begin(struct btree_trans *); static inline void bch2_trans_begin_updates(struct btree_trans *trans) { @@ -303,24 +303,4 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); -#ifdef TRACE_TRANSACTION_RESTARTS -#define bch2_trans_begin(_trans) \ -do { \ - if (is_power_of_2((_trans)->nr_restarts) && \ - (_trans)->nr_restarts >= 8) \ - pr_info("nr restarts: %zu", (_trans)->nr_restarts); \ - \ - (_trans)->nr_restarts++; \ - __bch2_trans_begin(_trans); \ -} while (0) -#else -#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans) -#endif - -#ifdef TRACE_TRANSACTION_RESTARTS_ALL -#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__) -#else -#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__) -#endif - #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9bab213fd65b..d27d33a5666d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -272,7 +272,6 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; unsigned long ip; - size_t nr_restarts; u64 commit_start; u64 iters_linked; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0aca109dac06..2266c7da58d3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -439,7 +439,6 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) return ret; if (!bch2_trans_relock(trans)) { - trans_restart(" (iter relock after journal preres get blocked)"); trace_trans_restart_journal_preres_get(trans->ip); return -EINTR; } @@ -569,7 +568,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (race_fault()) { ret = -EINTR; - trans_restart(" (race)"); trace_trans_restart_fault_inject(trans->ip); goto out; } @@ -718,7 +716,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (!ret || ret == -EINTR || (flags & BTREE_INSERT_NOUNLOCK)) { - trans_restart(" (split)"); trace_trans_restart_btree_node_split(trans->ip); ret = -EINTR; } @@ -738,7 +735,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trans_restart(" (iter relock after marking replicas)"); trace_trans_restart_mark_replicas(trans->ip); ret = -EINTR; break; @@ -752,7 +748,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trans_restart(" (iter relock after journal res get blocked)"); trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; @@ -765,7 +760,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, int ret2 = bch2_btree_iter_traverse_all(trans); if (ret2) { - trans_restart(" (traverse)"); trace_trans_restart_traverse(trans->ip); return ret2; } @@ -777,7 +771,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (!(flags & BTREE_INSERT_ATOMIC)) return 0; - trans_restart(" (atomic)"); trace_trans_restart_atomic(trans->ip); } @@ -803,12 
+796,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, int ret; trans_for_each_update_iter(trans, i) { - unsigned old_locks_want = i->iter->locks_want; - unsigned old_uptodate = i->iter->uptodate; - if (!bch2_btree_iter_upgrade(i->iter, 1)) { - trans_restart(" (failed upgrade, locks_want %u uptodate %u)", - old_locks_want, old_uptodate); trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto err; -- cgit From 3838be78410cfe52a067b8e88dfcff922084e627 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2019 15:47:43 -0400 Subject: bcachefs: Don't use a fixed size buffer for fs_usage_deltas Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 + fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_leaf.c | 16 ++++---- fs/bcachefs/buckets.c | 82 ++++++++++++++++++++++++++--------------- fs/bcachefs/buckets.h | 6 +-- fs/bcachefs/buckets_types.h | 5 +-- fs/bcachefs/recovery.c | 7 +--- 7 files changed, 69 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 10c8350618bc..2579944bb8c1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1949,6 +1949,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) bch2_trans_realloc_iters(trans, expected_nr_iters); @@ -1961,6 +1962,7 @@ int bch2_trans_exit(struct btree_trans *trans) { bch2_trans_unlock(trans); + kfree(trans->fs_usage_deltas); kfree(trans->mem); if (trans->used_mempool) mempool_free(trans->iters, &trans->c->btree_iters_pool); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d27d33a5666d..bdcf9288d749 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -305,7 +305,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; - struct replicas_delta_list fs_usage_deltas; + struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2266c7da58d3..e6fbe8a7413a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -544,10 +544,11 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) { - memset(&trans->fs_usage_deltas.fs_usage, 0, - sizeof(trans->fs_usage_deltas.fs_usage)); - trans->fs_usage_deltas.top = trans->fs_usage_deltas.d; + if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS)) && + trans->fs_usage_deltas) { + memset(&trans->fs_usage_deltas->fs_usage, 0, + sizeof(trans->fs_usage_deltas->fs_usage)); + trans->fs_usage_deltas->used = 0; } trans_for_each_update_iter(trans, i) @@ -556,8 +557,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) if (update_has_triggers(trans, i) && update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i, - &trans->fs_usage_deltas); + ret = bch2_trans_mark_update(trans, i); if (ret == -EINTR) trace_trans_restart_mark(trans->ip); if (ret) @@ -627,9 +627,9 @@ static inline int do_btree_insert_at(struct btree_trans *trans, !update_triggers_transactional(trans, i)) bch2_mark_update(trans, i, &fs_usage->u, 0); - if (fs_usage) { + if (fs_usage && 
trans->fs_usage_deltas) { bch2_replicas_delta_list_apply(c, &fs_usage->u, - &trans->fs_usage_deltas); + trans->fs_usage_deltas); bch2_trans_fs_usage_apply(trans, fs_usage); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 78b4c93a7170..5c18cebeb180 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1260,26 +1260,45 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static inline void update_replicas_list(struct replicas_delta_list *d, +static void replicas_deltas_realloc(struct btree_trans *trans) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? d->size * 2 : 128; + + d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); + BUG_ON(!d); + + d->size = new_size; + trans->fs_usage_deltas = d; +} + +static inline void update_replicas_list(struct btree_trans *trans, struct bch_replicas_entry *r, s64 sectors) { - d->top->delta = sectors; - memcpy(&d->top->r, r, replicas_entry_bytes(r)); + struct replicas_delta_list *d = trans->fs_usage_deltas; + struct replicas_delta *n; + unsigned b = replicas_entry_bytes(r) + 8; - d->top = (void *) d->top + replicas_entry_bytes(r) + 8; + if (!d || d->used + b > d->size) { + replicas_deltas_realloc(trans); + d = trans->fs_usage_deltas; + } - BUG_ON((void *) d->top > (void *) d->d + sizeof(d->pad)); + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy(&n->r, r, replicas_entry_bytes(r)); + d->used += b; } -static inline void update_cached_sectors_list(struct replicas_delta_list *d, +static inline void update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - update_replicas_list(d, &r.e, sectors); + update_replicas_list(trans, &r.e, sectors); } void bch2_replicas_delta_list_apply(struct bch_fs *c, @@ -1287,12 +1306,13 @@ void bch2_replicas_delta_list_apply(struct bch_fs *c, struct replicas_delta_list *r) { struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; acc_u64s((u64 *) fs_usage, (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); - while (d != r->top) { - BUG_ON((void *) d > (void *) r->top); + while (d != top) { + BUG_ON((void *) d > (void *) top); update_replicas(c, fs_usage, &d->r, d->delta); @@ -1361,8 +1381,7 @@ static int trans_update_key(struct btree_trans *trans, static int bch2_trans_mark_pointer(struct btree_trans *trans, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); @@ -1423,8 +1442,7 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bch_replicas_padded r; struct btree_insert_entry *insert; @@ -1469,7 +1487,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, s.s_c); - update_replicas_list(d, &r.e, sectors); + update_replicas_list(trans, &r.e, sectors); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1477,8 +1495,7 @@ out: static int bch2_trans_mark_extent(struct btree_trans *trans, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const 
union bch_extent_entry *entry; @@ -1501,7 +1518,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, : ptr_disk_sectors_delta(p, sectors); ret = bch2_trans_mark_pointer(trans, p, disk_sectors, - data_type, d); + data_type); if (ret < 0) return ret; @@ -1509,7 +1526,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (p.ptr.cached) { if (disk_sectors && !stale) - update_cached_sectors_list(d, p.ptr.dev, + update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { dirty_sectors += disk_sectors; @@ -1517,7 +1534,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } else { for (i = 0; i < p.ec_nr; i++) { ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], - disk_sectors, data_type, d); + disk_sectors, data_type); if (ret) return ret; } @@ -1527,16 +1544,16 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } if (dirty_sectors) - update_replicas_list(d, &r.e, dirty_sectors); + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, - bool inserting, s64 sectors, - struct replicas_delta_list *d) + bool inserting, s64 sectors) { + struct replicas_delta_list *d; struct bch_fs *c = trans->c; switch (k.k->type) { @@ -1544,11 +1561,15 @@ int bch2_trans_mark_key(struct btree_trans *trans, return bch2_trans_mark_extent(trans, k, inserting ? c->opts.btree_node_size : -c->opts.btree_node_size, - BCH_DATA_BTREE, d); + BCH_DATA_BTREE); case KEY_TYPE_extent: return bch2_trans_mark_extent(trans, k, - sectors, BCH_DATA_USER, d); + sectors, BCH_DATA_USER); case KEY_TYPE_inode: + if (!trans->fs_usage_deltas) + replicas_deltas_realloc(trans); + d = trans->fs_usage_deltas; + if (inserting) d->fs_usage.nr_inodes++; else @@ -1557,6 +1578,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + if (!trans->fs_usage_deltas) + replicas_deltas_realloc(trans); + d = trans->fs_usage_deltas; + sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(d->fs_usage.persistent_reserved)); @@ -1571,8 +1596,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, } int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, - struct replicas_delta_list *d) + struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; @@ -1586,7 +1610,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), d); + bkey_start_offset(&insert->k->k)); if (ret) return ret; @@ -1621,7 +1645,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, BUG_ON(sectors <= 0); ret = bch2_trans_mark_key(trans, k, true, - sectors, d); + sectors); if (ret) return ret; @@ -1633,7 +1657,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, false, sectors, d); + ret = bch2_trans_mark_key(trans, k, false, sectors); if (ret) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 9f53fe6280f3..9b264514bfcb 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -271,11 +271,9 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct 
btree_trans *, struct bkey_s_c, - bool, s64, struct replicas_delta_list *); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, bool, s64); int bch2_trans_mark_update(struct btree_trans *, - struct btree_insert_entry *, - struct replicas_delta_list *); + struct btree_insert_entry *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); /* disk reservations: */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index ec4294d41518..efed658abc6a 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -99,11 +99,10 @@ struct replicas_delta { } __packed; struct replicas_delta_list { + unsigned size; + unsigned used; struct bch_fs_usage fs_usage; - - struct replicas_delta *top; struct replicas_delta d[0]; - u8 pad[256]; }; /* diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 59f678596a64..8c656308826b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -258,13 +258,8 @@ retry: } while (bkey_cmp(iter->pos, k->k.p) < 0); if (split_compressed) { - memset(&trans.fs_usage_deltas.fs_usage, 0, - sizeof(trans.fs_usage_deltas.fs_usage)); - trans.fs_usage_deltas.top = trans.fs_usage_deltas.d; - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false, - -((s64) k->k.size), - &trans.fs_usage_deltas) ?: + -((s64) k->k.size)) ?: bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| -- cgit From a62c78a794c2cf6ed08a1a7b25887a1935e87835 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 May 2019 10:14:54 -0400 Subject: bcachefs: fix bch2_extent_merge() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d8d128cae5b4..c5a0d6c8e63a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1529,6 +1529,9 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, union bch_extent_entry *en_r = er.v->start; struct bch_extent_crc_unpacked crc_l, crc_r; + if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) + return BCH_MERGE_NOMERGE; + crc_l = bch2_extent_crc_unpack(el.k, NULL); extent_for_each_entry(el, en_l) { -- cgit From cdeeb75ea9e329b6e02e7956f741de7c9ddfbb3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 May 2019 13:42:02 -0400 Subject: bcachefs: fix a mount error path Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index defd35d04750..afe930532224 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1597,7 +1597,7 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons */ c1 = bch2_path_to_fs(devs[0]); - if (!c1) + if (IS_ERR(c1)) return c; for (i = 1; i < nr_devs; i++) { -- cgit From 4ee202e2b70fc8f6a7abd8fc3b3a8024c437fe24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 May 2019 15:49:56 -0400 Subject: bcachefs: better BTREE_INSERT_NO_CLEAR_REPLICAS Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 15 +++++++-------- fs/bcachefs/buckets.c | 31 ++++++++++++++----------------- 2 files changed, 21 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e6fbe8a7413a..7475d5c4420b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -544,13 +544,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, 
struct btree_insert_entry *i; int ret; - if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS)) && - trans->fs_usage_deltas) { - memset(&trans->fs_usage_deltas->fs_usage, 0, - sizeof(trans->fs_usage_deltas->fs_usage)); - trans->fs_usage_deltas->used = 0; - } - trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); @@ -561,7 +554,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (ret == -EINTR) trace_trans_restart_mark(trans->ip); if (ret) - return ret; + goto out_clear_replicas; } btree_trans_lock_write(c, trans); @@ -655,6 +648,12 @@ out: } bch2_journal_res_put(&c->journal, &trans->journal_res); +out_clear_replicas: + if (trans->fs_usage_deltas) { + memset(&trans->fs_usage_deltas->fs_usage, 0, + sizeof(trans->fs_usage_deltas->fs_usage)); + trans->fs_usage_deltas->used = 0; + } return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5c18cebeb180..f38cda70617b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1260,30 +1260,31 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static void replicas_deltas_realloc(struct btree_trans *trans) +static struct replicas_delta_list * +replicas_deltas_realloc(struct btree_trans *trans, unsigned more) { struct replicas_delta_list *d = trans->fs_usage_deltas; - unsigned new_size = d ? d->size * 2 : 128; + unsigned new_size = d ? (d->size + more) * 2 : 128; - d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); - BUG_ON(!d); + if (!d || d->used + more > d->size) { + d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); + BUG_ON(!d); - d->size = new_size; - trans->fs_usage_deltas = d; + d->size = new_size; + trans->fs_usage_deltas = d; + } + return d; } static inline void update_replicas_list(struct btree_trans *trans, struct bch_replicas_entry *r, s64 sectors) { - struct replicas_delta_list *d = trans->fs_usage_deltas; + struct replicas_delta_list *d; struct replicas_delta *n; unsigned b = replicas_entry_bytes(r) + 8; - if (!d || d->used + b > d->size) { - replicas_deltas_realloc(trans); - d = trans->fs_usage_deltas; - } + d = replicas_deltas_realloc(trans, b); n = (void *) d->d + d->used; n->delta = sectors; @@ -1566,9 +1567,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, return bch2_trans_mark_extent(trans, k, sectors, BCH_DATA_USER); case KEY_TYPE_inode: - if (!trans->fs_usage_deltas) - replicas_deltas_realloc(trans); - d = trans->fs_usage_deltas; + d = replicas_deltas_realloc(trans, 0); if (inserting) d->fs_usage.nr_inodes++; @@ -1578,9 +1577,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - if (!trans->fs_usage_deltas) - replicas_deltas_realloc(trans); - d = trans->fs_usage_deltas; + d = replicas_deltas_realloc(trans, 0); sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, -- cgit From 572ad769f59519fad80ed7241c44d7330bc379b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2019 11:56:01 -0400 Subject: bcachefs: Fix cached sectors not being updated on invalidate Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7475d5c4420b..feaefb00ef71 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -620,11 +620,12 @@ static inline int do_btree_insert_at(struct btree_trans 
*trans, !update_triggers_transactional(trans, i)) bch2_mark_update(trans, i, &fs_usage->u, 0); - if (fs_usage && trans->fs_usage_deltas) { + if (fs_usage && trans->fs_usage_deltas) bch2_replicas_delta_list_apply(c, &fs_usage->u, trans->fs_usage_deltas); + + if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); - } if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && unlikely(c->gc_pos.phase)) -- cgit From 6e738539cd8fedb3657b97feec07bebffe20d8b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2019 11:56:20 -0400 Subject: bcachefs: Improve key marking interface Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +- fs/bcachefs/btree_gc.c | 5 +- fs/bcachefs/btree_update.h | 3 + fs/bcachefs/btree_update_interior.c | 33 +++-- fs/bcachefs/btree_update_leaf.c | 6 +- fs/bcachefs/buckets.c | 273 ++++++++++++++++++------------------ fs/bcachefs/buckets.h | 23 +-- fs/bcachefs/ec.c | 28 ++-- fs/bcachefs/recovery.c | 5 +- 9 files changed, 207 insertions(+), 183 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 23b81f6615ca..8b995dbc5018 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -232,9 +232,9 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, true, 0, NULL, 0, - BCH_BUCKET_MARK_NOATOMIC| - BCH_BUCKET_MARK_ALLOC_READ); + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -244,10 +244,9 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) - bch2_mark_key(c, bkey_i_to_s_c(j->k), - true, 0, NULL, 0, - BCH_BUCKET_MARK_NOATOMIC| - BCH_BUCKET_MARK_ALLOC_READ); + bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); @@ -953,6 +952,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_BUCKET_INVALIDATE| flags); if (ret == -EINTR) goto retry; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 047f30efdd7a..f93e1d769113 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -173,7 +173,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -420,8 +420,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - true, 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index c25e7a752cc9..616c103c05ec 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -48,6 +48,7 @@ enum { __BTREE_INSERT_NOMARK, __BTREE_INSERT_MARK_INMEM, __BTREE_INSERT_NO_CLEAR_REPLICAS, + __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -94,6 +95,8 @@ enum { #define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << 
__BTREE_INSERT_NO_CLEAR_REPLICAS) +#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index dcfcfe97b6f4..0b80dca5656a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -194,7 +194,9 @@ found: : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - false, 0, NULL, 0, BCH_BUCKET_MARK_GC); + 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -264,13 +266,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - false, 0, - NULL, 0, 0); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - false, 0, NULL, 0, BCH_BUCKET_MARK_GC); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1075,10 +1077,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, &fs_usage->u, 0, 0); + 0, &fs_usage->u, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, NULL, 0, + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); if (old && !btree_node_fake(old)) @@ -1171,11 +1175,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, &fs_usage->u, 0, 0); + 0, &fs_usage->u, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, NULL, 0, BCH_BUCKET_MARK_GC); + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| + BCH_BUCKET_MARK_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1996,10 +2003,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, &fs_usage->u, 0, 0); + 0, &fs_usage->u, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, NULL, 0, + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); bch2_btree_node_free_index(as, NULL, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index feaefb00ef71..6e63c916986e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -542,6 +542,9 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ?
BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; int ret; trans_for_each_update_iter(trans, i) @@ -618,7 +621,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) if (update_has_triggers(trans, i) && !update_triggers_transactional(trans, i)) - bch2_mark_update(trans, i, &fs_usage->u, 0); + bch2_mark_update(trans, i, &fs_usage->u, mark_flags); if (fs_usage && trans->fs_usage_deltas) bch2_replicas_delta_list_apply(c, &fs_usage->u, @@ -632,6 +635,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i, NULL, + mark_flags| BCH_BUCKET_MARK_GC); trans_for_each_update(trans, i) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f38cda70617b..0d96ea572bd0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -547,6 +547,68 @@ static inline void update_cached_sectors(struct bch_fs *c, update_replicas(c, fs_usage, &r.e, sectors); } +static struct replicas_delta_list * +replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? (d->size + more) * 2 : 128; + + if (!d || d->used + more > d->size) { + d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); + BUG_ON(!d); + + d->size = new_size; + trans->fs_usage_deltas = d; + } + return d; +} + +static inline void update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry *r, + s64 sectors) +{ + struct replicas_delta_list *d; + struct replicas_delta *n; + unsigned b = replicas_entry_bytes(r) + 8; + + d = replicas_deltas_realloc(trans, b); + + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy((void *) n + offsetof(struct replicas_delta, r), + r, replicas_entry_bytes(r)); + d->used += b; +} + +static inline void update_cached_sectors_list(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas_list(trans, &r.e, sectors); +} + +void bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + acc_u64s((u64 *) fs_usage, + (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + + while (d != top) { + BUG_ON((void *) d > (void *) top); + + update_replicas(c, fs_usage, &d->r, d->delta); + + d = (void *) d + replicas_entry_bytes(&d->r) + 8; + } +} + #define do_mark_fn(fn, c, pos, flags, ...) 
\ ({ \ int gc, ret = 0; \ @@ -630,23 +692,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, } static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; struct bucket_mark old, m; - if (!inserting) - return 0; - /* * alloc btree is read in by bch2_alloc_read, not gc: */ - if (flags & BCH_BUCKET_MARK_GC) + if ((flags & BCH_BUCKET_MARK_GC) && + !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -663,7 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; - if (!(flags & BCH_BUCKET_MARK_GC)) { + if (journal_seq) { m.journal_seq_valid = 1; m.journal_seq = journal_seq; } @@ -682,7 +741,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, * not: */ - if (old.cached_sectors) { + if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && + old.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), @@ -773,11 +833,12 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, - bool enabled, struct bch_fs_usage *fs_usage, u64 journal_seq, - bool gc) + unsigned flags) { + bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); + bool gc = flags & BCH_BUCKET_MARK_GC; unsigned i; for (i = 0; i < v->nr_blocks; i++) { @@ -803,9 +864,9 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -872,9 +933,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags, - bool gc) + s64 sectors, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct stripe *m; unsigned old, new, nr_data; int blocks_nonempty_delta; @@ -927,8 +988,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + unsigned journal_seq, unsigned flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -949,7 +1009,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ? 
sectors : ptr_disk_sectors_delta(p, sectors); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags); if (p.ptr.cached) { if (disk_sectors && !stale) @@ -962,7 +1022,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, for (i = 0; i < p.ec_nr; i++) { ret = bch2_mark_stripe_ptr(c, p.ec[i], data_type, fs_usage, - disk_sectors, flags, gc); + disk_sectors, flags); if (ret) return ret; } @@ -978,11 +1038,10 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); size_t idx = s.k->p.offset; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); @@ -990,19 +1049,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, spin_lock(&c->ec_stripes_heap_lock); - if (!m || (!inserting && !m->alive)) { + if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!gc && m->alive) - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - - if (inserting) { + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { m->sectors = le16_to_cpu(s.v->sectors); m->algorithm = s.v->algorithm; m->nr_blocks = s.v->nr_blocks; @@ -1010,11 +1064,11 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_to_replicas(&m->r.e, k); - /* - * XXX: account for stripes somehow here - */ + /* + * XXX: account for stripes somehow here + */ #if 0 - update_replicas(c, fs_usage, &m->r.e, stripe_sectors); + update_replicas(c, fs_usage, &m->r.e, stripe_sectors); #endif /* gc recalculates these fields: */ @@ -1027,53 +1081,54 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, } if (!gc) - bch2_stripes_heap_insert(c, m, idx); - else - m->alive = true; + bch2_stripes_heap_update(c, m, idx); + m->alive = true; + } else { + if (!gc) + bch2_stripes_heap_del(c, m, idx); + memset(m, 0, sizeof(*m)); } spin_unlock(&c->ec_stripes_heap_lock); - bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); + bucket_set_stripe(c, s.v, fs_usage, 0, flags); return 0; } int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, - bool inserting, s64 sectors, + struct bkey_s_c k, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; int ret = 0; preempt_disable(); - if (!fs_usage || gc) - fs_usage = fs_usage_ptr(c, journal_seq, gc); + if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) + fs_usage = fs_usage_ptr(c, journal_seq, + flags & BCH_BUCKET_MARK_GC); switch (k.k->type) { case KEY_TYPE_alloc: - ret = bch2_mark_alloc(c, k, inserting, - fs_usage, journal_seq, flags, gc); + ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: - ret = bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - fs_usage, journal_seq, flags, gc); + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, inserting, - fs_usage, journal_seq, flags, gc); + ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (inserting) + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) fs_usage->nr_inodes++; else fs_usage->nr_inodes--; @@ -1097,14 +1152,14 @@ int bch2_mark_key_locked(struct bch_fs *c, } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - bool inserting, s64 sectors, + s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, inserting, sectors, + ret = bch2_mark_key_locked(c, k, sectors, fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); @@ -1144,9 +1199,9 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, sectors = old.k->p.offset - new->k.p.offset; BUG_ON(sectors <= 0); - bch2_mark_key_locked(c, old, true, sectors, + bch2_mark_key_locked(c, old, sectors, fs_usage, trans->journal_res.seq, - flags); + BCH_BUCKET_MARK_INSERT|flags); sectors = bkey_start_offset(&new->k) - old.k->p.offset; @@ -1156,8 +1211,9 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, BUG_ON(sectors >= 0); } - return bch2_mark_key_locked(c, old, false, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; + return bch2_mark_key_locked(c, old, sectors, fs_usage, + trans->journal_res.seq, + BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; } int bch2_mark_update(struct btree_trans *trans, @@ -1176,10 +1232,11 @@ int bch2_mark_update(struct btree_trans *trans, return 0; if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - fs_usage, trans->journal_res.seq, flags); + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; @@ -1260,67 +1317,6 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static struct replicas_delta_list * -replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -{ - struct replicas_delta_list *d = trans->fs_usage_deltas; - unsigned new_size = d ? 
(d->size + more) * 2 : 128; - - if (!d || d->used + more > d->size) { - d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); - BUG_ON(!d); - - d->size = new_size; - trans->fs_usage_deltas = d; - } - return d; -} - -static inline void update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry *r, - s64 sectors) -{ - struct replicas_delta_list *d; - struct replicas_delta *n; - unsigned b = replicas_entry_bytes(r) + 8; - - d = replicas_deltas_realloc(trans, b); - - n = (void *) d->d + d->used; - n->delta = sectors; - memcpy(&n->r, r, replicas_entry_bytes(r)); - d->used += b; -} - -static inline void update_cached_sectors_list(struct btree_trans *trans, - unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - update_replicas_list(trans, &r.e, sectors); -} - -void bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - - acc_u64s((u64 *) fs_usage, - (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); - - while (d != top) { - BUG_ON((void *) d > (void *) top); - - update_replicas(c, fs_usage, &d->r, d->delta); - - d = (void *) d + replicas_entry_bytes(&d->r) + 8; - } -} - static int trans_get_key(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, struct btree_insert_entry **insert, @@ -1550,26 +1546,27 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } -int bch2_trans_mark_key(struct btree_trans *trans, - struct bkey_s_c k, - bool inserting, s64 sectors) +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + s64 sectors, unsigned flags) { struct replicas_delta_list *d; struct bch_fs *c = trans->c; switch (k.k->type) { case KEY_TYPE_btree_ptr: - return bch2_trans_mark_extent(trans, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE); + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_BTREE); case KEY_TYPE_extent: - return bch2_trans_mark_extent(trans, k, - sectors, BCH_DATA_USER); + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_USER); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); - if (inserting) + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) d->fs_usage.nr_inodes++; else d->fs_usage.nr_inodes--; @@ -1605,9 +1602,10 @@ int bch2_trans_mark_update(struct btree_trans *trans, return 0; ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert->k), true, + bkey_i_to_s_c(insert->k), bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k)); + bkey_start_offset(&insert->k->k), + BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1641,8 +1639,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, sectors = k.k->p.offset - insert->k->k.p.offset; BUG_ON(sectors <= 0); - ret = bch2_trans_mark_key(trans, k, true, - sectors); + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1654,7 +1652,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, false, sectors); + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_OVERWRITE); if (ret) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 9b264514bfcb..793bb8cb2527 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -249,16 +249,17 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_GC (1 << 0) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 1) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 2) - -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, - bool, s64, struct bch_fs_usage *, - u64, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, - bool, s64, struct bch_fs_usage *, - u64, unsigned); +#define BCH_BUCKET_MARK_INSERT (1 << 0) +#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) +#define BCH_BUCKET_MARK_GC (1 << 3) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) + +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, struct disk_reservation *, unsigned); @@ -271,7 +272,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, bool, s64); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_insert_entry *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4a8aa7491fb5..01e85fae72d3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -611,17 +611,21 @@ void bch2_stripes_heap_update(struct bch_fs *c, ec_stripes_heap *h = &c->ec_stripes_heap; size_t i; - heap_verify_backpointer(c, idx); + if (m->alive) { + heap_verify_backpointer(c, idx); - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + 
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); - heap_verify_backpointer(c, idx); + heap_verify_backpointer(c, idx); + } else { + bch2_stripes_heap_insert(c, m, idx); + } if (stripe_idx_to_delete(c) >= 0) schedule_work(&c->ec_stripe_delete_work); @@ -1274,7 +1278,9 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, true, 0, NULL, 0, 0); + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -1285,7 +1291,9 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, i) if (i->btree_id == BTREE_ID_EC) bch2_mark_key(c, bkey_i_to_s_c(i->k), - true, 0, NULL, 0, 0); + 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); return 0; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8c656308826b..dc9222b1a196 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -258,8 +258,9 @@ retry: } while (bkey_cmp(iter->pos, k->k.p) < 0); if (split_compressed) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false, - -((s64) k->k.size)) ?: + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), + -((s64) k->k.size), + BCH_BUCKET_MARK_OVERWRITE) ?: bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| -- cgit From 37dd783474d07f8aa210ecf5cc82fd900417eb8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2019 14:33:16 -0400 Subject: bcachefs: Fix an error path in bch2_btree_iter_traverse() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2579944bb8c1..9e6faf7e2830 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -959,9 +959,10 @@ static void btree_iter_up(struct btree_iter *iter) int __must_check __bch2_btree_iter_traverse(struct btree_iter *); static int __btree_iter_traverse_all(struct btree_trans *trans, - struct btree_iter *iter, int ret) + struct btree_iter *orig_iter, int ret) { struct bch_fs *c = trans->c; + struct btree_iter *iter; u8 sorted[BTREE_ITER_MAX]; unsigned i, nr_sorted = 0; @@ -990,8 +991,8 @@ retry_all: if (unlikely(ret == -EIO)) { trans->error = true; - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_ERROR; + orig_iter->flags |= BTREE_ITER_ERROR; + orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR; goto out; } -- cgit From 5884fddfe732183aa2be4d11444543dfae6e044b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2019 14:45:33 -0400 Subject: bcachefs: Fix starting copygc when already started We can sometimes call bch2_dev_read_write() when the device is already RW (in error paths). 
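A sketch of the scenario (the caller below is inferred from the sentence above, not shown in this patch, so treat it as illustrative):

	ret = bch2_dev_read_write(c, ca);	/* device is already RW (error path) */
	/* ... which presumably reaches bch2_copygc_start(c, ca) a second time;
	 * with this change that call returns 0 when ca->copygc_thread already
	 * exists, instead of hitting the old BUG_ON(ca->copygc_thread). */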
Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d97be76da58f..44e235ef3de0 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -283,7 +283,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) { struct task_struct *t; - BUG_ON(ca->copygc_thread); + if (ca->copygc_thread) + return 0; if (c->opts.nochanges) return 0; -- cgit From 1ae973345626fcd9cdb81fe856cabf7808a33140 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2019 20:06:06 -0400 Subject: bcachefs: Don't overflow stack in bch2_extent_merge_inline() Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 5 +---- fs/bcachefs/extents.c | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 45de61d492a4..ba08d95aae6f 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -41,10 +41,7 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline unsigned bkey_val_u64s(const struct bkey *k) -{ - return k->u64s - BKEY_U64s; -} +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c5a0d6c8e63a..a5582a6f6ef6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1651,6 +1651,10 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, EBUG_ON(bkey_written(b, m)); + if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX || + bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX) + return BCH_MERGE_NOMERGE; + /* * We need to save copies of both l and r, because we might get a * partial merge (which modifies both) and then fails to repack -- cgit From 436c656d468fe04cd8e313a4a55151aa946de102 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jun 2019 16:56:16 -0400 Subject: bcachefs: bkey_merge() now takes bkey_s Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 12 +++++------ fs/bcachefs/bkey_methods.h | 4 ++-- fs/bcachefs/bkey_sort.c | 6 ++++-- fs/bcachefs/extents.c | 54 ++++++++++++++++++++++++---------------------- fs/bcachefs/extents.h | 4 ++-- 5 files changed, 42 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 571013a0d1a0..547f5b301ad4 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -199,22 +199,22 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) } enum merge_result bch2_bkey_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) + struct bkey_s l, struct bkey_s r) { - const struct bkey_ops *ops = &bch2_bkey_ops[l->k.type]; + const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; enum merge_result ret; if (key_merging_disabled(c) || !ops->key_merge || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + l.k->type != r.k->type || + bversion_cmp(l.k->version, r.k->version) || + bkey_cmp(l.k->p, bkey_start_pos(r.k))) return BCH_MERGE_NOMERGE; ret = ops->key_merge(c, l, r); if (ret != BCH_MERGE_NOMERGE) - l->k.needs_whiteout |= r->k.needs_whiteout; + l.k->needs_whiteout |= r.k->needs_whiteout; return ret; } diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index a4bfd2aef5bf..08b976633360 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -33,7 +33,7 @@ struct bkey_ops { void (*swab)(const 
struct bkey_format *, struct bkey_packed *); bool (*key_normalize)(struct bch_fs *, struct bkey_s); enum merge_result (*key_merge)(struct bch_fs *, - struct bkey_i *, struct bkey_i *); + struct bkey_s, struct bkey_s); }; const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); @@ -57,7 +57,7 @@ void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_bkey_merge(struct bch_fs *, - struct bkey_i *, struct bkey_i *); + struct bkey_s, struct bkey_s); void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 12825c1b292f..d4fbb694ee52 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -257,7 +257,7 @@ static void extent_sort_append(struct bch_fs *c, bch2_bkey_unpack(b, &tmp.k, k); if (*prev && - bch2_bkey_merge(c, (void *) *prev, &tmp.k)) + bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), bkey_i_to_s(&tmp.k))) return; if (*prev) { @@ -436,7 +436,9 @@ bch2_sort_repack_merge(struct bch_fs *c, /* prev is always unpacked, for key merging: */ if (prev && - bch2_bkey_merge(c, (void *) prev, &tmp.k) == + bch2_bkey_merge(c, + bkey_i_to_s((void *) prev), + bkey_i_to_s(&tmp.k)) == BCH_MERGE_MERGE) continue; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a5582a6f6ef6..b55d52ec43a0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1521,21 +1521,21 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) + struct bkey_s _l, struct bkey_s _r) { - struct bkey_s_extent el = bkey_i_to_s_extent(l); - struct bkey_s_extent er = bkey_i_to_s_extent(r); - union bch_extent_entry *en_l = el.v->start; - union bch_extent_entry *en_r = er.v->start; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; struct bch_extent_crc_unpacked crc_l, crc_r; - if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) return BCH_MERGE_NOMERGE; - crc_l = bch2_extent_crc_unpack(el.k, NULL); + crc_l = bch2_extent_crc_unpack(l.k, NULL); - extent_for_each_entry(el, en_l) { - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); if (extent_entry_type(en_l) != extent_entry_type(en_r)) return BCH_MERGE_NOMERGE; @@ -1567,8 +1567,8 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(er.k, entry_to_crc(en_r)); + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); if (crc_l.csum_type != crc_r.csum_type || crc_l.compression_type != crc_r.compression_type || @@ -1600,16 +1600,16 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, } } - extent_for_each_entry(el, en_l) { + extent_for_each_entry(l, en_l) { struct bch_extent_crc_unpacked crc_l, crc_r; - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); if (!extent_entry_is_crc(en_l)) continue; - crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(er.k, 
entry_to_crc(en_r)); + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); crc_l.csum = bch2_checksum_merge(crc_l.csum_type, crc_l.csum, @@ -1622,7 +1622,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); } - bch2_key_resize(&l->k, l->k.size + r->k.size); + bch2_key_resize(l.k, l.k->size + r.k->size); return BCH_MERGE_MERGE; } @@ -1662,7 +1662,9 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - ret = bch2_bkey_merge(c, &li.k, &ri.k); + ret = bch2_bkey_merge(c, + bkey_i_to_s(&li.k), + bkey_i_to_s(&ri.k)); if (ret == BCH_MERGE_NOMERGE) return false; @@ -1785,22 +1787,22 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, } enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_i *l, struct bkey_i *r) + struct bkey_s _l, struct bkey_s _r) { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) return BCH_MERGE_NOMERGE; - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, r.s); return BCH_MERGE_PARTIAL; } - bch2_key_resize(&l->k, l->k.size + r->k.size); + bch2_key_resize(l.k, l.k->size + r.k->size); return BCH_MERGE_MERGE; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 4cd16e8a2af6..fe92737354bd 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -386,7 +386,7 @@ void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, - struct bkey_i *, struct bkey_i *); + struct bkey_s, struct bkey_s); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ @@ -402,7 +402,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); enum merge_result bch2_reservation_merge(struct bch_fs *, - struct bkey_i *, struct bkey_i *); + struct bkey_s, struct bkey_s); #define bch2_bkey_ops_reservation (struct bkey_ops) { \ .key_invalid = bch2_reservation_invalid, \ -- cgit From 9146b8ee03fb8a716e6a88d68b8e9074594966bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jun 2019 16:56:29 -0400 Subject: bcachefs: Reduce BKEY_PADDED usage Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 120 ++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index d4fbb694ee52..9f5d9b4bf1c9 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -241,35 +241,44 @@ static inline void extent_sort_next(struct btree_node_iter_large *iter, heap_sift_down(iter, i - iter->data, extent_sort_cmp, 
NULL); } +static void extent_sort_advance_prev(struct bkey_format *f, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev) +{ + if (*prev) { + bch2_bkey_pack(*prev, (void *) *prev, f); + + btree_keys_account_key_add(nr, 0, *prev); + *prev = bkey_next(*prev); + } else { + *prev = start; + } +} + static void extent_sort_append(struct bch_fs *c, - struct btree *b, + struct bkey_format *f, struct btree_nr_keys *nr, struct bkey_packed *start, struct bkey_packed **prev, - struct bkey_packed *k) + struct bkey_s k) { - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) + if (bkey_whiteout(k.k)) return; - bch2_bkey_unpack(b, &tmp.k, k); + /* + * prev is always unpacked, for key merging - until right before we + * advance it: + */ if (*prev && - bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), bkey_i_to_s(&tmp.k))) + bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == + BCH_MERGE_MERGE) return; - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; - } + extent_sort_advance_prev(f, nr, start, prev); - bkey_copy(*prev, &tmp.k); + bkey_reassemble((void *) *prev, k.s_c); } struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, @@ -279,7 +288,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, { struct bkey_format *f = &b->format; struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; + struct bkey_packed *prev = NULL, *lk, *rk; struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; @@ -290,9 +299,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, while (!bch2_btree_node_iter_large_end(iter)) { lk = __btree_node_offset_to_key(b, _l->k); + l = __bkey_disassemble(b, lk, &l_unpacked); if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_append(c, f, &nr, dst->start, &prev, l); extent_sort_next(iter, b, _l); continue; } @@ -303,13 +313,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, _r++; rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); r = __bkey_disassemble(b, rk, &r_unpacked); /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_append(c, f, &nr, dst->start, &prev, l); extent_sort_next(iter, b, _l); continue; } @@ -354,23 +362,17 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_sift(iter, b, 0); - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(&tmp.k)); } else { bch2_cut_back(bkey_start_pos(r.k), l.k); extent_save(b, lk, l.k); } } - if (prev) { - bch2_bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } + extent_sort_advance_prev(f, &nr, dst->start, &prev); - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); return nr; } @@ -413,60 +415,36 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_format *out_f, bool filter_whiteouts) { - struct bkey_packed *k, *prev = NULL, *out; + struct bkey_packed *prev = NULL, *k_packed, *next; + struct bkey k_unpacked; + struct bkey_s k; struct 
btree_nr_keys nr; - BKEY_PADDED(k) tmp; memset(&nr, 0, sizeof(nr)); - while ((k = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k)) - continue; - + next = bch2_btree_node_iter_next_all(iter, src); + while ((k_packed = next)) { /* - * The filter might modify pointers, so we have to unpack the - * key and values to &tmp.k: + * The filter might modify the size of @k's value, so advance + * the iterator first: */ - bch2_bkey_unpack(src, &tmp.k, k); + next = bch2_btree_node_iter_next_all(iter, src); - if (filter_whiteouts && - bch2_bkey_normalize(c, bkey_i_to_s(&tmp.k))) + if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - /* prev is always unpacked, for key merging: */ + k = __bkey_disassemble(src, k_packed, &k_unpacked); - if (prev && - bch2_bkey_merge(c, - bkey_i_to_s((void *) prev), - bkey_i_to_s(&tmp.k)) == - BCH_MERGE_MERGE) + if (filter_whiteouts && + bch2_bkey_normalize(c, k)) continue; - /* - * the current key becomes the new prev: advance prev, then - * copy the current key - but first pack prev (in place): - */ - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - - btree_keys_account_key_add(&nr, 0, prev); - prev = bkey_next(prev); - } else { - prev = vstruct_last(dst); - } - - bkey_copy(prev, &tmp.k); + extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); } - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = vstruct_last(dst); - } + extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); return nr; } -- cgit From 66b095b008ea6526c660fbecfacbd970416f971e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jun 2019 20:32:54 -0400 Subject: bcachefs: Don't allow bkey vals that are too big in extents btree Make sure we don't overflow BKEY_PADDED keys Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 5 +++++ fs/bcachefs/extents.c | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 547f5b301ad4..09ee958c5568 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -82,6 +82,11 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; + if ((btree_node_type_is_extents(type) || + type == BKEY_TYPE_BTREE) && + bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; + if (btree_node_type_is_extents(type)) { if ((k.k->size == 0) != bkey_deleted(k.k)) return "bad size field"; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b55d52ec43a0..4d3722cb7e33 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1291,9 +1291,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - return bch2_bkey_ptrs_invalid(c, k); } -- cgit From 3e669816904d45d48ef2183a2ad675c24d9c941a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jun 2019 11:31:07 -0400 Subject: bcachefs: Fix promoting to cache devices (durability = 0) Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 78 ++++++++++++++++++++++++++---------------- fs/bcachefs/io.c | 14 ++++---- 2 files changed, 55 insertions(+), 37 deletions(-) (limited 
to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 334bc6576b3a..e3e9383c94ee 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -377,6 +377,25 @@ void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, #define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) #define BUCKET_ALLOC_USE_DURABILITY (1 << 1) +static void add_new_bucket(struct bch_fs *c, + struct open_buckets *ptrs, + struct bch_devs_mask *devs_may_alloc, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct open_bucket *ob) +{ + unsigned durability = + bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; + + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + ? durability : 1; + *have_cache |= !durability; + + ob_push(c, ptrs, ob); +} + static int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, struct dev_stripe_state *stripe, @@ -392,7 +411,7 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; bool alloc_failure = false; - unsigned i, durability; + unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -422,14 +441,8 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, continue; } - durability = (flags & BUCKET_ALLOC_USE_DURABILITY) - ? ca->mi.durability : 1; - - __clear_bit(ca->dev_idx, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; - - ob_push(c, ptrs, ob); + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); bch2_dev_stripe_increment(c, ca, stripe); @@ -524,7 +537,8 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, - bool *have_cache) + bool *have_cache, + unsigned flags) { struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; @@ -564,11 +578,8 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; - __clear_bit(ob->ptr.dev, devs_may_alloc->d); - *nr_effective += ca->mi.durability; - *have_cache |= !ca->mi.durability; - - ob_push(c, ptrs, ob); + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(h); @@ -583,6 +594,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, + unsigned flags, bool need_ec) { struct open_buckets ptrs_skip = { .nr = 0 }; @@ -597,11 +609,9 @@ static void get_buckets_from_writepoint(struct bch_fs *c, (ca->mi.durability || (wp->type == BCH_DATA_USER && !*have_cache)) && (ob->ec || !need_ec)) { - __clear_bit(ob->ptr.dev, devs_may_alloc->d); - *nr_effective += ca->mi.durability; - *have_cache |= !ca->mi.durability; - - ob_push(c, ptrs, ob); + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, + flags, ob); } else { ob_push(c, &ptrs_skip, ob); } @@ -619,17 +629,15 @@ static int open_bucket_add_buckets(struct bch_fs *c, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, + unsigned flags, struct closure *_cl) { struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY; + unsigned i; int ret; - if (wp->type == BCH_DATA_USER) - flags |= BUCKET_MAY_ALLOC_PARTIAL; - rcu_read_lock(); devs = target_rw_devs(c, wp->type, target); rcu_read_unlock(); @@ -644,21 +652,21 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (erasure_code) { get_buckets_from_writepoint(c, ptrs, wp, 
&devs, nr_replicas, nr_effective, - have_cache, true); + have_cache, flags, true); if (*nr_effective >= nr_replicas) return 0; bucket_alloc_from_stripe(c, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, - have_cache); + have_cache, flags); if (*nr_effective >= nr_replicas) return 0; } get_buckets_from_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, false); + have_cache, flags, false); if (*nr_effective >= nr_replicas) return 0; @@ -863,9 +871,13 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, struct open_bucket *ob; struct open_buckets ptrs; unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; bool have_cache; int ret, i; + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ob_flags |= BUCKET_ALLOC_USE_DURABILITY; + BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -875,6 +887,9 @@ retry: wp = writepoint_find(c, write_point.v); + if (wp->type == BCH_DATA_USER) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + /* metadata may not allocate on cache devices: */ if (wp->type != BCH_DATA_USER) have_cache = true; @@ -883,19 +898,22 @@ retry: ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, cl); + &have_cache, reserve, + ob_flags, cl); } else { ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, NULL); + &have_cache, reserve, + ob_flags, NULL); if (!ret) goto alloc_done; ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, cl); + &have_cache, reserve, + ob_flags, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 11cdaddb1551..9cd9bbc5cce4 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1013,23 +1013,23 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!opts.promote_target) + if (!bkey_extent_is_data(k.k)) return false; if (!(flags & BCH_READ_MAY_PROMOTE)) return false; - if (percpu_ref_is_dying(&c->writes)) - return false; - - if (!bkey_extent_is_data(k.k)) + if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), + opts.promote_target)) return false; - if (bch2_target_congested(c, opts.promote_target)) + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ return false; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) -- cgit From e0dfc08bc2f509de9fda0371b46988247f711a12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jun 2019 21:03:23 -0400 Subject: bcachefs: use memalloc_nofs_save() for vmalloc allocation Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d80ba1d71826..ea775d91de67 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -9,6 +9,7 @@ #include "trace.h" #include +#include const char * const bch2_btree_ids[] = { #define x(kwd, val, name) name, @@ -509,7 +510,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; u64 start_time = local_clock(); + unsigned flags; + flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -547,6 
+550,7 @@ out_unlock: list_del_init(&b->list); mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); out: b->flags = 0; b->written = 0; -- cgit From 2a488aaac1d4a7f5b48bce687adf430d24e0beb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jun 2019 11:01:14 -0400 Subject: bcachefs: fix __bch2_xattr_bcachefs_get() We were returning -ERANGE when the size of the buffer passed in was exactly the size of the xattr val Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 2ccf64db8147..5aeff1012f8b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -387,6 +387,9 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; int id, inode_opt_id; + char buf[512]; + struct printbuf out = PBUF(buf); + unsigned val_len; u64 v; id = bch2_opt_lookup(name); @@ -407,23 +410,16 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, return -ENODATA; v = bch2_opt_get_by_id(&opts, id); + bch2_opt_to_text(&out, c, opt, v, 0); - if (!buffer) { - char buf[512]; - struct printbuf out = PBUF(buf); + val_len = out.pos - buf; - bch2_opt_to_text(&out, c, opt, v, 0); + if (buffer && val_len > size) + return -ERANGE; - return out.pos - buf; - } else { - struct printbuf out = _PBUF(buffer, size); - - bch2_opt_to_text(&out, c, opt, v, 0); - - return printbuf_remaining(&out) - ? (void *) out.pos - buffer - : -ERANGE; - } + if (buffer) + memcpy(buffer, buf, val_len); + return val_len; } static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -- cgit From 09bf409b4689d0297da2cc5c95106ef89fa7cfb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Jun 2019 11:37:46 -0400 Subject: bcachefs: Delete a spurious assertion bch_write_op->written used to be a u16, but it's not so the assertion isn't needed anymore - and 5.1 can send larger bios. 
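For scale, assuming the usual 512-byte sectors: U16_MAX sectors is 65535 * 512 bytes, i.e. just under 32 MiB, so a single large bio on 5.1+ could spuriously trip BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX).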
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9cd9bbc5cce4..8a090b0d9b03 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -966,7 +966,6 @@ void bch2_write(struct closure *cl) BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); - BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); op->start_time = local_clock(); -- cgit From f707e3d8f41e77aff94941168ab30da2314a5984 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Jun 2019 19:37:39 -0400 Subject: bcachefs: fix kasan splat Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dc9222b1a196..2b7133e376e3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -721,10 +721,12 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_blacklist_table_initialize(c); - ret = verify_journal_entries_not_blacklisted_or_missing(c, - &journal_entries); - if (ret) - goto err; + if (!list_empty(&journal_entries)) { + ret = verify_journal_entries_not_blacklisted_or_missing(c, + &journal_entries); + if (ret) + goto err; + } ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal_entries); -- cgit From 2ded276b7e20084842225b47100f3391e00f540f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 18:11:35 -0400 Subject: bcachefs: Fix array overrun with unknown btree roots Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2b7133e376e3..e0df2c0a4fdf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -379,7 +379,15 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->type) { case BCH_JSET_ENTRY_btree_root: { - struct btree_root *r = &c->btree_roots[entry->btree_id]; + struct btree_root *r; + + if (entry->btree_id >= BTREE_ID_NR) { + bch_err(c, "filesystem has unknown btree type %u", + entry->btree_id); + return -EINVAL; + } + + r = &c->btree_roots[entry->btree_id]; if (entry->u64s) { r->level = entry->level; -- cgit From 44da9767bb32467ac660ce6bacf75162f5abf9a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 17:55:15 -0400 Subject: bcachefs: add inode_generation_to_text method Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e6ad0ad51cb2..0fb08a396d62 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -246,6 +246,9 @@ const char *bch2_inode_generation_invalid(const struct bch_fs *c, void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { + struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); + + pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -- cgit From 88767d65d84257f9b5dfed1aa89404f1b6ddf142 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 17:50:52 -0400 Subject: bcachefs: Update path now handles triggers that generate more triggers Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 29 ++++++++++++++++++++++------- fs/bcachefs/buckets.c | 24 
++++++++++++------------ fs/bcachefs/buckets.h | 3 ++- 4 files changed, 37 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bdcf9288d749..ec14e2deecb7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -265,6 +265,7 @@ struct btree_insert_entry { bool deferred; bool triggered; + bool marked; }; #define BTREE_ITER_MAX 64 diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e63c916986e..4461e42f2367 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -542,6 +542,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; + bool saw_non_marked; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -551,14 +552,28 @@ static inline int do_btree_insert_at(struct btree_trans *trans, BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); trans_for_each_update_iter(trans, i) - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; + i->marked = false; + + do { + saw_non_marked = false; + + trans_for_each_update_iter(trans, i) { + if (i->marked) + continue; + + saw_non_marked = true; + i->marked = true; + + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + if (ret) + goto out_clear_replicas; + } } + } while (saw_non_marked); btree_trans_lock_write(c, trans); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0d96ea572bd0..911c39c4872e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1590,9 +1590,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, } int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { - struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; @@ -1602,9 +1602,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, return 0; ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert->k), - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), + bkey_i_to_s_c(insert), + bpos_min(insert->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k), BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1618,25 +1618,25 @@ int bch2_trans_mark_update(struct btree_trans *trans, k = bkey_disassemble(b, _k, &unpacked); if (btree_node_is_extents(b) - ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k->k.p, k.k->p)) + ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k.p, k.k->p)) break; if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k->k, k.k)) { + switch (bch2_extent_overlap(&insert->k, k.k)) { case BCH_EXTENT_OVERLAP_ALL: sectors = -((s64) k.k->size); break; case BCH_EXTENT_OVERLAP_BACK: - sectors = bkey_start_offset(&insert->k->k) - + sectors = bkey_start_offset(&insert->k) - k.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: sectors = bkey_start_offset(k.k) - - insert->k->k.p.offset; + insert->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k->k.p.offset; + sectors = k.k->p.offset - insert->k.p.offset; BUG_ON(sectors <= 0); ret = bch2_trans_mark_key(trans, k, sectors, @@ -1644,7 +1644,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - sectors = bkey_start_offset(&insert->k->k) - + sectors = bkey_start_offset(&insert->k) - k.k->p.offset; break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 793bb8cb2527..46eb493b42ca 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -274,7 +274,8 @@ void bch2_replicas_delta_list_apply(struct bch_fs *, struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, - struct btree_insert_entry *); + struct btree_iter *iter, + struct bkey_i *insert); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); /* disk reservations: */ -- cgit From 91052b9de806c4dd1ac0447ba7928e49b107eec7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 17:58:40 -0400 Subject: bcachefs: Refactor trans_(get|update)_key these are still pretty ugly... Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/buckets.c | 65 ++++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8715a444f6d5..c2b5c11a87c5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -852,6 +852,8 @@ static const unsigned BKEY_ALLOC_VAL_U64s_MAX = BCH_ALLOC_FIELDS(), sizeof(u64)); #undef x +static const unsigned BKEY_ALLOC_U64s_MAX = BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX; + /* Quotas: */ enum quota_types { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 911c39c4872e..02f8b7f5a0c3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1319,22 +1319,18 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, static int trans_get_key(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, - struct btree_insert_entry **insert, struct btree_iter **iter, struct bkey_s_c *k) { unsigned i; int ret; - *insert = NULL; - for (i = 0; i < trans->nr_updates; i++) if (!trans->updates[i].deferred && trans->updates[i].iter->btree_id == btree_id && !bkey_cmp(pos, trans->updates[i].iter->pos)) { - *insert = &trans->updates[i]; - *iter = (*insert)->iter; - *k = bkey_i_to_s_c((*insert)->k); + *iter = trans->updates[i].iter; + *k = bkey_i_to_s_c(trans->updates[i].k); return 0; } @@ -1350,30 +1346,34 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int trans_update_key(struct btree_trans *trans, - struct btree_insert_entry **insert, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned extra_u64s) +static void *trans_update_key(struct btree_trans *trans, + struct btree_iter *iter, + unsigned 
u64s) { struct bkey_i *new_k; + unsigned i; - if (*insert) - return 0; - - new_k = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - extra_u64s * sizeof(u64)); + new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(new_k)) - return PTR_ERR(new_k); + return new_k; + + bkey_init(&new_k->k); + new_k->k.p = iter->pos; - *insert = bch2_trans_update(trans, ((struct btree_insert_entry) { - .iter = iter, - .k = new_k, - .triggered = true, + for (i = 0; i < trans->nr_updates; i++) + if (!trans->updates[i].deferred && + trans->updates[i].iter == iter) { + trans->updates[i].k = new_k; + return new_k; + } + + bch2_trans_update(trans, ((struct btree_insert_entry) { + .iter = iter, + .k = new_k, + .triggered = true, })); - bkey_reassemble((*insert)->k, k); - return 0; + return new_k; } static int bch2_trans_mark_pointer(struct btree_trans *trans, @@ -1382,7 +1382,6 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct btree_insert_entry *insert; struct btree_iter *iter; struct bkey_s_c k; struct bkey_alloc_unpacked u; @@ -1392,7 +1391,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, ret = trans_get_key(trans, BTREE_ID_ALLOC, POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), - &insert, &iter, &k); + &iter, &k); if (ret) return ret; @@ -1425,11 +1424,12 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, ? u.dirty_sectors : u.cached_sectors, sectors); - ret = trans_update_key(trans, &insert, iter, k, 1); + a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); + ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; - a = bkey_alloc_init(insert->k); + bkey_alloc_init(&a->k_i); a->k.p = iter->pos; bch2_alloc_pack(a, u); out: @@ -1442,8 +1442,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_replicas_padded r; - struct btree_insert_entry *insert; struct btree_iter *iter; + struct bkey_i *new_k; struct bkey_s_c k; struct bkey_s_stripe s; unsigned nr_data; @@ -1452,8 +1452,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, BUG_ON(!sectors); - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), - &insert, &iter, &k); + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); if (ret) return ret; @@ -1465,11 +1464,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; } - ret = trans_update_key(trans, &insert, iter, k, 1); + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); if (ret) goto out; - s = bkey_i_to_s_stripe(insert->k); + bkey_reassemble(new_k, k); + s = bkey_i_to_s_stripe(new_k); nr_data = s.v->nr_blocks - s.v->nr_redundant; -- cgit From e812cf38c558f4d3a6bef8a077478a6632811f0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2019 18:03:40 -0400 Subject: bcachefs: Check for key size > offset Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 09ee958c5568..27f196ef0b18 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -90,6 +90,9 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (btree_node_type_is_extents(type)) { if ((k.k->size == 0) != bkey_deleted(k.k)) return "bad size field"; + + if (k.k->size > k.k->p.offset) + return "size greater than offset"; } else { if (k.k->size) return "nonzero size field"; 
-- cgit From 168f4c5fb375131bd0f5996b549c5e13cc2c2bb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 18:24:38 -0400 Subject: bcachefs: Improve bch2_lock_inodes() Can now be used for the two different types of locks we have so far Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 4 ++-- fs/bcachefs/fs.c | 10 ++++++---- fs/bcachefs/fs.h | 34 ++++++++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4dca716217a6..0cf2621ec4fc 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -205,7 +205,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, if (ret) goto err2; - bch2_lock_inodes(src, dst); + bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); if (inode_attr_changing(src, dst, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, dst, @@ -218,7 +218,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); err3: - bch2_unlock_inodes(src, dst); + bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); /* return true if we did work */ if (ret >= 0) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index afe930532224..c806ebad9cde 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -657,7 +657,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct btree_trans trans; int ret; - bch2_lock_inodes(dir, inode); + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); retry: bch2_trans_begin(&trans); @@ -690,7 +690,7 @@ retry: ATTR_MTIME); err: bch2_trans_exit(&trans); - bch2_unlock_inodes(dir, inode); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } @@ -871,7 +871,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c, 8, 2048); - bch2_lock_inodes(i.src_dir, + bch2_lock_inodes(INODE_UPDATE_LOCK, + i.src_dir, i.dst_dir, i.src_inode, i.dst_inode); @@ -969,7 +970,8 @@ err: 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); - bch2_unlock_inodes(i.src_dir, + bch2_unlock_inodes(INODE_UPDATE_LOCK, + i.src_dir, i.dst_dir, i.src_inode, i.dst_inode); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e72d6a58b322..de07f0f1dd51 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -57,24 +57,42 @@ static inline int ptrcmp(void *l, void *r) return cmp_int(l, r); } -#define __bch2_lock_inodes(_lock, ...) \ +enum bch_inode_lock_op { + INODE_LOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), +}; + +#define bch2_lock_inodes(_locks, ...) \ do { \ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ unsigned i; \ \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ \ - for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_lock) \ + if (_locks & INODE_LOCK) \ + down_write_nested(&a[i]->v.i_rwsem, i); \ + if (_locks & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ - else \ - mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) -#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) -#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) +#define bch2_unlock_inodes(_locks, ...) 
\ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if (_locks & INODE_LOCK) \ + up_write(&a[i]->v.i_rwsem); \ + if (_locks & INODE_UPDATE_LOCK) \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) static inline struct bch_inode_info *file_bch_inode(struct file *file) { -- cgit From d74dfe02881bdf8df99a320857f38540315989cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jul 2019 14:12:48 -0400 Subject: bcachefs: Fix for building with old gcc Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c2b5c11a87c5..b8aafd2e283a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -852,7 +852,7 @@ static const unsigned BKEY_ALLOC_VAL_U64s_MAX = BCH_ALLOC_FIELDS(), sizeof(u64)); #undef x -static const unsigned BKEY_ALLOC_U64s_MAX = BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX; +#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) /* Quotas: */ -- cgit From 738540f7fcdd619fe3adb8a27116f0f4371bf711 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2019 19:36:39 -0400 Subject: bcachefs: kill bch2_crc64_update Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/checksum.h | 5 ----- fs/bcachefs/str_hash.h | 4 ++-- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 664e1bc2b139..0ab6ce1b9e75 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -62,7 +62,7 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t return crc32c(crc, data, len); case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC64: - return bch2_crc64_update(crc, data, len); + return crc64_be(crc, data, len); default: BUG(); } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index afdbbf702970..b84e81bac8ff 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -25,11 +25,6 @@ static inline bool bch2_checksum_mergeable(unsigned type) struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, struct bch_csum, size_t); -static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) -{ - return crc64_be(crc, p, len); -} - #define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) #define BCH_NONCE_BTREE cpu_to_le32(2 << 28) #define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index c47af32ce983..df3f19055d1e 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -71,7 +71,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); break; case BCH_STR_HASH_CRC64: - ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key)); + ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); break; case BCH_STR_HASH_SIPHASH: SipHash24_Init(&ctx->siphash, &info->siphash_key); @@ -90,7 +90,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ctx->crc32c = crc32c(ctx->crc32c, data, len); break; case BCH_STR_HASH_CRC64: - ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len); + ctx->crc64 = crc64_be(ctx->crc64, data, len); break; case BCH_STR_HASH_SIPHASH: SipHash24_Update(&ctx->siphash, data, len); -- cgit From 
885678f68dd1bf9638087ae8b22051b464f5ec05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2019 19:27:42 -0400 Subject: bcachefs: Kill direct access to bi_io_vec Switch to always using bio_add_page(), which merges contiguous pages now that we have multipage bvecs. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 6 +-- fs/bcachefs/compress.c | 11 +++++- fs/bcachefs/debug.c | 3 +- fs/bcachefs/ec.c | 3 +- fs/bcachefs/fs-io.c | 6 +-- fs/bcachefs/io.c | 96 ++++++++++++------------------------------------ fs/bcachefs/io.h | 1 - fs/bcachefs/journal_io.c | 8 ++-- fs/bcachefs/super-io.c | 16 +++----- fs/bcachefs/util.c | 31 ++++------------ fs/bcachefs/util.h | 2 +- 11 files changed, 58 insertions(+), 125 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d4806809fc0d..c1d3e685a5f2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1037,10 +1037,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); bio->bi_end_io = btree_node_read_endio; bio->bi_private = b; - bch2_bio_map(bio, b->data); + bch2_bio_map(bio, b->data, btree_bytes(c)); set_btree_node_read_in_flight(b); @@ -1502,11 +1501,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->data = data; wbio->wbio.order = order; wbio->wbio.used_mempool = used_mempool; - wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; - bch2_bio_map(&wbio->wbio.bio, data); + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* * If we're appending to a leaf node, we don't technically need FUA - diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6b5b61f10fcb..3e91fa53985a 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -244,7 +244,16 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, * might have to free existing pages and retry allocation from mempool - * do this _after_ decompressing: */ - bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9); + if (bio->bi_iter.bi_size < crc->live_size << 9) { + if (bch2_bio_alloc_pages(bio, (crc->live_size << 9) - + bio->bi_iter.bi_size, + GFP_NOFS)) { + bch2_bio_free_pages_pool(c, bio); + bio->bi_iter.bi_size = 0; + bio->bi_vcnt = 0; + bch2_bio_alloc_pages_pool(c, bio, crc->live_size << 9); + } + } memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4c6fcb6f918e..7adc5ae20b9f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -70,8 +70,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) GFP_NOIO, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); - bch2_bio_map(bio, n_sorted); + bch2_bio_map(bio, n_sorted, btree_bytes(c)); submit_bio_wait(bio); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 01e85fae72d3..40acd1ec4645 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -399,11 +399,10 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->idx = idx; ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); - ec_bio->bio.bi_iter.bi_size = b; ec_bio->bio.bi_end_io = ec_block_endio; ec_bio->bio.bi_private = cl; - bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); 
closure_get(cl); percpu_ref_get(&ca->io_ref); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9d0cca0bdfa3..54b071b9ca2c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -775,7 +775,7 @@ static int bio_add_page_contig(struct bio *bio, struct page *page) else if (!bio_can_add_page_contig(bio, page)) return -1; - __bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); return 0; } @@ -913,7 +913,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, put_page(page); } - __bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); } } @@ -1025,7 +1025,7 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; - __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bchfs_read(&trans, iter, rbio, inode->v.i_ino, &readpages_iter); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8a090b0d9b03..8c43791bfbb1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -141,14 +141,13 @@ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) bio->bi_vcnt = 0; } -static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, - bool *using_mempool) +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; + struct page *page; if (likely(!*using_mempool)) { - bv->bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bv->bv_page)) { + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; goto pool_alloc; @@ -156,57 +155,29 @@ static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, } } else { pool_alloc: - bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); } - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; + return page; } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) + size_t size) { bool using_mempool = false; - BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min(PAGE_SIZE, size); - bio->bi_iter.bi_size = bytes; - - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) - bch2_bio_alloc_page_pool(c, bio, &using_mempool); + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } if (using_mempool) mutex_unlock(&c->bio_bounce_pages_lock); } -void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) -{ - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); - - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) { - /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: - */ - bch2_bio_free_pages_pool(c, bio); - bch2_bio_alloc_pages_pool(c, bio, bytes); - return; - } - - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; - } - - bio->bi_iter.bi_size = bytes; -} - /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -491,8 +462,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, wbio->bio.bi_opf = src->bi_opf; if (buf) { - bio->bi_iter.bi_size = output_available; - 
bch2_bio_map(bio, buf); + bch2_bio_map(bio, buf, output_available); return bio; } @@ -502,31 +472,17 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: */ - while (bio->bi_iter.bi_size < output_available) { - unsigned len = min_t(unsigned, PAGE_SIZE, - output_available - bio->bi_iter.bi_size); - struct page *p; - - p = alloc_page(GFP_NOIO); - if (!p) { - unsigned pool_max = - min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9); - - if (bio_sectors(bio) < pool_max) - bch2_bio_alloc_pages_pool(c, bio, pool_max); - break; - } + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9)); - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = p, - .bv_len = len, - .bv_offset = 0, - }; - bio->bi_iter.bi_size += len; - } + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; - *page_alloc_failed = bio->bi_vcnt < pages; return bio; } @@ -830,12 +786,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) } dst->bi_iter.bi_size = total_output; - - /* Free unneeded pages after compressing: */ - if (to_wbio(dst)->bounce) - while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, - &c->bio_bounce_pages); do_write: /* might have done a realloc... */ diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 84070b674187..61c8b8b3a459 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -13,7 +13,6 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void bch2_latency_acct(struct bch_dev *, u64, int); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4e0c63f0076f..2531379e67c6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -494,9 +494,8 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bio->bi_iter.bi_size = sectors_read << 9; - bch2_bio_map(bio, buf->data); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, buf->data, sectors_read << 9); ret = submit_bio_wait(bio); kfree(bio); @@ -1086,10 +1085,9 @@ void bch2_journal_write(struct closure *cl) bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bio->bi_iter.bi_sector = ptr->offset; - bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; - bch2_bio_map(bio, jset); + bch2_bio_map(bio, jset, sectors << 9); trace_journal_write(bio); closure_bio_submit(bio, cl); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b991238c5bd2..af6fb90413e9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -476,8 +476,7 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) reread: bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = offset; - sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; - bch2_bio_map(sb->bio, sb->sb); + bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); if (submit_bio_wait(sb->bio)) return "IO error"; @@ -582,12 
+581,11 @@ int bch2_read_super(const char *path, struct bch_opts *opts, */ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); /* * use sb buffer to read layout, since sb buffer is page aligned but * layout won't be: */ - bch2_bio_map(sb->bio, sb->sb); + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); err = "IO error"; if (submit_bio_wait(sb->bio)) @@ -653,10 +651,9 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch); + bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], bio_sectors(bio)); @@ -678,12 +675,11 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_iter.bi_size = - roundup((size_t) vstruct_bytes(sb), - bdev_logical_block_size(ca->disk_sb.bdev)); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, sb); + bch2_bio_map(bio, sb, + roundup((size_t) vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev))); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], bio_sectors(bio)); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 0ca1fb59f54d..fc2ca798fbc3 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -506,33 +506,18 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) /* misc: */ -void bch2_bio_map(struct bio *bio, void *base) +void bch2_bio_map(struct bio *bio, void *base, size_t size) { - size_t size = bio->bi_iter.bi_size; - struct bio_vec *bv = bio->bi_io_vec; - - BUG_ON(!bio->bi_iter.bi_size); - BUG_ON(bio->bi_vcnt); - BUG_ON(!bio->bi_max_vecs); - - bv->bv_offset = base ? offset_in_page(base) : 0; - goto start; - - for (; size; bio->bi_vcnt++, bv++) { - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); - - bv->bv_offset = 0; -start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, - size); - if (base) { - bv->bv_page = is_vmalloc_addr(base) + while (size) { + struct page *page = is_vmalloc_addr(base) ? 
vmalloc_to_page(base) : virt_to_page(base); + unsigned offset = offset_in_page(base); + unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - base += bv->bv_len; - } - - size -= bv->bv_len; + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; } } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c0910f230caf..baa236b4247c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -503,7 +503,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) return x; } -void bch2_bio_map(struct bio *bio, void *base); +void bch2_bio_map(struct bio *bio, void *base, size_t); int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); static inline sector_t bdev_sectors(struct block_device *bdev) -- cgit From 0fd7263e2e55d43317e85e9f7a4e67de9fa0ec90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Jul 2019 03:48:25 -0400 Subject: bcachefs: kill bio_for_each_contig_segment() With multipage bvecs it's not needed anymore Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 4 ++-- fs/bcachefs/compress.c | 2 +- fs/bcachefs/util.h | 29 ----------------------------- 3 files changed, 3 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 0ab6ce1b9e75..2e1dfdc68e15 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -200,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -225,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 3e91fa53985a..1a51a8c3e95c 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - __bio_for_each_contig_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) return (struct bbuf) { .b = page_address(bv.bv_page) + bv.bv_offset, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index baa236b4247c..5f0a3de91ae3 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -630,35 +630,6 @@ static inline void memmove_u64s(void *dst, const void *src, __memmove_u64s_up(dst, src, u64s); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, 
(bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); -- cgit From a6f4d5bb249cf6ac44d1a83c72584402b7e5f6ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2019 11:16:33 -0400 Subject: bcachefs: Fix moving compressed data bio_uncompress_inplace() used to potentially need to extend the bio to be big enough for the uncompressed data, which has become problematic with multipage bvecs - but, the move extent path actually already allocated the bios to be big enough for the uncompressed data. The promote path needed to be fixed, though. Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 15 +++---------- fs/bcachefs/io.c | 58 +++++++++++++++++++++++++++----------------------- 2 files changed, 34 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1a51a8c3e95c..d350d917a8d4 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -241,19 +241,10 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, } /* - * might have to free existing pages and retry allocation from mempool - - * do this _after_ decompressing: + * XXX: don't have a good way to assert that the bio was allocated with + * enough space, we depend on bch2_move_extent doing the right thing */ - if (bio->bi_iter.bi_size < crc->live_size << 9) { - if (bch2_bio_alloc_pages(bio, (crc->live_size << 9) - - bio->bi_iter.bi_size, - GFP_NOFS)) { - bch2_bio_free_pages_pool(c, bio); - bio->bi_iter.bi_size = 0; - bio->bi_vcnt = 0; - bch2_bio_alloc_pages_pool(c, bio, crc->live_size << 9); - } - } + bio->bi_iter.bi_size = crc->live_size << 9; memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8c43791bfbb1..42071d0028ad 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1039,22 +1039,18 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, - unsigned rbio_sectors, + unsigned sectors, struct bch_read_bio **rbio) { struct promote_op *op = NULL; struct bio *bio; - unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); - /* data might have to be decompressed in the write path: */ - unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size, - PAGE_SECTORS); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; if (!percpu_ref_tryget(&c->writes)) return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, - GFP_NOIO); + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); if (!op) goto err; @@ -1062,34 +1058,32 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, op->pos = pos; /* - * promotes require bouncing, but if the extent isn't - * checksummed/compressed it might be too big for the mempool: + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: */ - if (rbio_sectors > c->sb.encoded_extent_max) { - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * rbio_pages, - GFP_NOIO); - if (!*rbio) - goto err; + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOIO); + if (!*rbio) + goto err; - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, rbio_pages, 0); + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, 
(*rbio)->bio.bi_inline_vecs, pages, 0); - if (bch2_bio_alloc_pages(&(*rbio)->bio, rbio_sectors << 9, - GFP_NOIO)) - goto err; + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOIO)) + goto err; - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - } + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) goto err; bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, wbio_pages, 0); + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); ret = bch2_migrate_write_init(c, &op->write, writepoint_hashed((unsigned long) current), @@ -1123,8 +1117,9 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, bool *read_full) { bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full - ? pick->crc.compressed_size + ? max(pick->crc.compressed_size, pick->crc.live_size) : bvec_iter_sectors(iter); struct bpos pos = promote_full ? bkey_start_pos(k.k) @@ -1659,7 +1654,16 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, } if (rbio) { - /* promote already allocated bounce rbio */ + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + BUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; } else if (bounce) { unsigned sectors = pick.crc.compressed_size; -- cgit From e1036a2a718f2cbd082568c881f677cf5fd9f442 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jul 2019 14:59:15 -0400 Subject: bcachefs: Always touch page state with page locked This will mean we don't have to use cmpxchg for modifying page state, which will simplify a fair amount of code Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 93 ++++++++++++++++++++++++++++++----------------------- fs/bcachefs/fs-io.h | 2 -- fs/bcachefs/fs.c | 2 +- 3 files changed, 54 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 54b071b9ca2c..bf03048252ec 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -500,11 +500,6 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info /* stored in page->private: */ -/* - * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could - * almost protected it with the page lock, except that bch2_writepage_io_done has - * to update the sector counts (and from interrupt/bottom half context). - */ struct bch_page_state { union { struct { /* existing data: */ @@ -550,6 +545,7 @@ static inline struct bch_page_state *page_state(struct page *page) { struct bch_page_state *s = (void *) &page->private; + EBUG_ON(!PageLocked(page)); BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); if (!PagePrivate(page)) @@ -589,15 +585,20 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i __bch2_put_page_reservation(c, inode, s); } +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? 
inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { struct bch_page_state *s = page_state(page), new; - /* XXX: this should not be open coded */ - unsigned nr_replicas = inode->ei_inode.bi_data_replicas - ? inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; + unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res; struct quota_res quota_res = { 0 }; int ret; @@ -655,7 +656,7 @@ static void bch2_clear_page_bits(struct page *page) __bch2_put_page_reservation(c, inode, s); } -bool bch2_dirty_folio(struct address_space *mapping, struct folio *folio) +static void __bch2_set_page_dirty(struct address_space *mapping, struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -673,8 +674,14 @@ bool bch2_dirty_folio(struct address_space *mapping, struct folio *folio) i_sectors_acct(c, inode, "a_res, new.dirty_sectors - old.dirty_sectors); bch2_quota_reservation_put(c, inode, "a_res); +} + +static void bch2_set_page_dirty(struct address_space *mapping, struct page *page) +{ + struct folio *folio = page_folio(page); - return filemap_dirty_folio(mapping, folio); + __bch2_set_page_dirty(mapping, folio); + filemap_dirty_folio(mapping, folio); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -725,7 +732,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } if (!PageDirty(page)) - set_page_dirty(page); + bch2_set_page_dirty(mapping, page); wait_for_stable_page(page); out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -1210,10 +1217,12 @@ static int __bch2_writepage(struct folio *folio, struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state new, old; + struct bch_page_state *s; unsigned offset, nr_replicas_this_write; + unsigned dirty_sectors, replicas_reserved; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; + int ret; EBUG_ON(!PageUptodate(page)); @@ -1237,33 +1246,37 @@ static int __bch2_writepage(struct folio *folio, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: - EBUG_ON(!PageLocked(page)); + s = page_state(page); - /* Before unlocking the page, transfer reservation to w->io: */ - old = page_state_cmpxchg(page_state(page), new, { - /* - * If we didn't get a reservation, we can only write out the - * number of (fully allocated) replicas that currently exist, - * and only if the entire page has been written: - */ - nr_replicas_this_write = - max_t(unsigned, - new.replicas_reserved, - (new.sectors == PAGE_SECTORS - ? new.nr_replicas : 0)); + ret = bch2_get_page_reservation(c, inode, page, true); + if (ret) { + SetPageError(page); + mapping_set_error(page->mapping, ret); + unlock_page(page); + return 0; + } - BUG_ON(!nr_replicas_this_write); + __bch2_set_page_dirty(page->mapping, page_folio(page)); - new.nr_replicas = w->opts.compression - ? 0 - : nr_replicas_this_write; + nr_replicas_this_write = + max_t(unsigned, + s->replicas_reserved, + (s->sectors == PAGE_SECTORS + ? s->nr_replicas : 0)); - new.replicas_reserved = 0; + s->nr_replicas = w->opts.compression + ? 
0 + : nr_replicas_this_write; - new.sectors += new.dirty_sectors; - BUG_ON(new.sectors != PAGE_SECTORS); - new.dirty_sectors = 0; - }); + /* Before unlocking the page, transfer reservation to w->io: */ + replicas_reserved = s->replicas_reserved; + s->replicas_reserved = 0; + + dirty_sectors = s->dirty_sectors; + s->dirty_sectors = 0; + + s->sectors += dirty_sectors; + BUG_ON(s->sectors != PAGE_SECTORS); BUG_ON(PageWriteback(page)); set_page_writeback(page); @@ -1278,12 +1291,12 @@ do_io: bch2_writepage_io_alloc(c, w, inode, page, nr_replicas_this_write); - w->io->new_sectors += new.sectors - old.sectors; + w->io->new_sectors += dirty_sectors; BUG_ON(inode != w->io->op.inode); BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); - w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS; + w->io->op.op.res.sectors += replicas_reserved * PAGE_SECTORS; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) @@ -1421,7 +1434,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) - set_page_dirty(page); + bch2_set_page_dirty(mapping, page); inode->ei_last_dirtied = (unsigned long) current; } else { @@ -1538,7 +1551,7 @@ out: if (!PageUptodate(pages[i])) SetPageUptodate(pages[i]); if (!PageDirty(pages[i])) - set_page_dirty(pages[i]); + bch2_set_page_dirty(mapping, pages[i]); unlock_page(pages[i]); put_page(pages[i]); } @@ -2212,7 +2225,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, zero_user_segment(page, 0, end_offset); if (!PageDirty(page)) - set_page_dirty(page); + bch2_set_page_dirty(mapping, page); unlock: unlock_page(page); put_page(page); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 2e4bfee877d9..e263b515e901 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -9,8 +9,6 @@ #include -bool bch2_dirty_folio(struct address_space *, struct folio *); - int bch2_writepage(struct page *, struct writeback_control *); int bch2_read_folio(struct file *, struct folio *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c806ebad9cde..f69b535b1b82 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1349,7 +1349,7 @@ static const struct address_space_operations bch_address_space_operations = { .read_folio = bch2_read_folio, .writepages = bch2_writepages, .readahead = bch2_readahead, - .dirty_folio = bch2_dirty_folio, + .dirty_folio = filemap_dirty_folio, .write_begin = bch2_write_begin, .write_end = bch2_write_end, .invalidate_folio = bch2_invalidate_folio, -- cgit From adfcfaf0686a7b96416e6074127dc8580af28154 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jul 2019 17:25:05 -0400 Subject: bcachefs: Kill page_state_cmpxchg Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 103 +++++++++++++++------------------------------------- 1 file changed, 30 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index bf03048252ec..be4184debd7a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -501,7 +501,6 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info /* stored in page->private: */ struct bch_page_state { -union { struct { /* existing data: */ unsigned sectors:PAGE_SECTOR_SHIFT + 1; @@ -519,28 +518,8 @@ union { struct { * Uncompressed size, not compressed size: */ unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1; -}; - /* for cmpxchg: */ - unsigned long v; -}; }; -#define page_state_cmpxchg(_ptr, _new, _expr) \ -({ \ - unsigned long _v 
= READ_ONCE((_ptr)->v); \ - struct bch_page_state _old; \ - \ - do { \ - _old.v = _new.v = _v; \ - _expr; \ - \ - EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\ - } while (_old.v != _new.v && \ - (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \ - \ - _old; \ -}) - static inline struct bch_page_state *page_state(struct page *page) { struct bch_page_state *s = (void *) &page->private; @@ -554,35 +533,22 @@ static inline struct bch_page_state *page_state(struct page *page) return s; } -static inline unsigned page_res_sectors(struct bch_page_state s) -{ - - return s.replicas_reserved * PAGE_SECTORS; -} - -static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct bch_page_state s) -{ - struct disk_reservation res = { .sectors = page_res_sectors(s) }; - struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 }; - - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &res); -} - static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page) { - struct bch_page_state s; - - EBUG_ON(!PageLocked(page)); + struct bch_page_state *s = page_state(page); + struct disk_reservation disk_res = { + .sectors = s->replicas_reserved * PAGE_SECTORS + }; + struct quota_res quota_res = { + .sectors = s->quota_reserved ? PAGE_SECTORS : 0 + }; - s = page_state_cmpxchg(page_state(page), s, { - s.replicas_reserved = 0; - s.quota_reserved = 0; - }); + s->replicas_reserved = 0; + s->quota_reserved = 0; - __bch2_put_page_reservation(c, inode, s); + bch2_quota_reservation_put(c, inode, "a_res); + bch2_disk_reservation_put(c, &disk_res); } static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) @@ -596,8 +562,7 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { - struct bch_page_state *s = page_state(page), new; - + struct bch_page_state *s = page_state(page); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res; struct quota_res quota_res = { 0 }; @@ -612,11 +577,7 @@ static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *in if (unlikely(ret)) return ret; - page_state_cmpxchg(s, new, ({ - BUG_ON(new.replicas_reserved + - disk_res.nr_replicas != nr_replicas); - new.replicas_reserved += disk_res.nr_replicas; - })); + s->replicas_reserved += disk_res.nr_replicas; } if (!s->quota_reserved && @@ -627,52 +588,48 @@ static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *in if (unlikely(ret)) return ret; - page_state_cmpxchg(s, new, ({ - BUG_ON(new.quota_reserved); - new.quota_reserved = 1; - })); + s->quota_reserved = 1; } - return ret; + return 0; } static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state s; + struct bch_page_state *s; EBUG_ON(!PageLocked(page)); if (!PagePrivate(page)) return; - s.v = xchg(&page_state(page)->v, 0); - ClearPagePrivate(page); + s = page_state(page); - if (s.dirty_sectors) - i_sectors_acct(c, inode, NULL, -s.dirty_sectors); + if (s->dirty_sectors) + i_sectors_acct(c, inode, NULL, -((int) s->dirty_sectors)); + bch2_put_page_reservation(c, inode, page); - __bch2_put_page_reservation(c, inode, s); + ClearPagePrivate(page); + set_page_private(page, 0); } static 
void __bch2_set_page_dirty(struct address_space *mapping, struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct quota_res quota_res = { 0 }; - struct bch_page_state old, new; - - old = page_state_cmpxchg(page_state(&folio->page), new, - new.dirty_sectors = PAGE_SECTORS - new.sectors; - new.quota_reserved = 0; - ); + struct bch_page_state *s = page_state(&folio->page); + struct quota_res quota_res = { s->quota_reserved * PAGE_SECTORS }; + unsigned dirty_sectors = PAGE_SECTORS - s->sectors; - quota_res.sectors += old.quota_reserved * PAGE_SECTORS; + s->quota_reserved = 0; - if (old.dirty_sectors != new.dirty_sectors) + if (s->dirty_sectors != dirty_sectors) i_sectors_acct(c, inode, "a_res, - new.dirty_sectors - old.dirty_sectors); + dirty_sectors - s->dirty_sectors); + s->dirty_sectors = dirty_sectors; + bch2_quota_reservation_put(c, inode, "a_res); } -- cgit From f57a6a5d41d66c527f8683b5cc6a069fe59e1fdf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jul 2019 21:41:35 -0400 Subject: bcachefs: Track dirtyness at sector level, not page Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 344 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 233 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index be4184debd7a..8858352eb42a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -500,52 +500,113 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info /* stored in page->private: */ -struct bch_page_state { - /* existing data: */ - unsigned sectors:PAGE_SECTOR_SHIFT + 1; - +struct bch_page_sector { /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:4; + unsigned nr_replicas:3; /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:4; - - /* Owns PAGE_SECTORS sized quota reservation: */ - unsigned quota_reserved:1; + unsigned replicas_reserved:3; + + /* i_sectors: */ + enum { + SECTOR_UNALLOCATED, + SECTOR_QUOTA_RESERVED, + SECTOR_DIRTY, + SECTOR_ALLOCATED, + } state:2; +}; - /* - * Number of sectors on disk - for i_blocks - * Uncompressed size, not compressed size: - */ - unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1; +struct bch_page_state { + struct bch_page_sector s[PAGE_SECTORS]; }; -static inline struct bch_page_state *page_state(struct page *page) +static inline struct bch_page_state *__bch2_page_state(struct page *page) { - struct bch_page_state *s = (void *) &page->private; + return page_has_private(page) + ? 
(struct bch_page_state *) page_private(page) + : NULL; +} +static inline struct bch_page_state *bch2_page_state(struct page *page) +{ EBUG_ON(!PageLocked(page)); - BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); - if (!PagePrivate(page)) - SetPagePrivate(page); + return __bch2_page_state(page); +} + +/* for newly allocated pages: */ +static void __bch2_page_state_release(struct page *page) +{ + struct bch_page_state *s = __bch2_page_state(page); + + if (!s) + return; + + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + kfree(s); +} + +static void bch2_page_state_release(struct page *page) +{ + struct bch_page_state *s = bch2_page_state(page); + + if (!s) + return; + + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + kfree(s); +} + +/* for newly allocated pages: */ +static struct bch_page_state *__bch2_page_state_create(struct page *page, + gfp_t gfp) +{ + struct bch_page_state *s; + + s = kzalloc(sizeof(*s), GFP_NOFS|gfp); + if (!s) + return NULL; + /* + * migrate_page_move_mapping() assumes that pages with private data + * have their count elevated by 1. + */ + get_page(page); + set_page_private(page, (unsigned long) s); + SetPagePrivate(page); return s; } +static struct bch_page_state *bch2_page_state_create(struct page *page, + gfp_t gfp) +{ + return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); +} + static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page) { - struct bch_page_state *s = page_state(page); - struct disk_reservation disk_res = { - .sectors = s->replicas_reserved * PAGE_SECTORS - }; - struct quota_res quota_res = { - .sectors = s->quota_reserved ? PAGE_SECTORS : 0 - }; + struct bch_page_state *s = bch2_page_state(page); + struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; + unsigned i; - s->replicas_reserved = 0; - s->quota_reserved = 0; + if (!s) + return; + + for (i = 0; i < ARRAY_SIZE(s->s); i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + if (s->s[i].state == SECTOR_QUOTA_RESERVED) { + quota_res.sectors++; + s->s[i].state = SECTOR_UNALLOCATED; + } + } bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); @@ -559,77 +620,133 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info : c->opts.data_replicas; } -static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) +static inline unsigned sectors_to_reserve(struct bch_page_sector *s, + unsigned nr_replicas) { - struct bch_page_state *s = page_state(page); + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +static int bch2_get_page_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct page *page, bool check_enospc) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res; + struct disk_reservation disk_res = { 0 }; + unsigned i, disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(s->s); i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ARRAY_SIZE(s->s); i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +static int bch2_get_page_quota_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct page *page, bool check_enospc) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); struct quota_res quota_res = { 0 }; + unsigned i, quota_res_sectors = 0; int ret; - EBUG_ON(!PageLocked(page)); + if (!s) + return -ENOMEM; - if (s->replicas_reserved < nr_replicas) { - ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS, - nr_replicas - s->replicas_reserved, - !check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (unlikely(ret)) - return ret; + for (i = 0; i < ARRAY_SIZE(s->s); i++) + quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED; - s->replicas_reserved += disk_res.nr_replicas; - } + if (!quota_res_sectors) + return 0; - if (!s->quota_reserved && - s->sectors + s->dirty_sectors < PAGE_SECTORS) { - ret = bch2_quota_reservation_add(c, inode, "a_res, - PAGE_SECTORS, - check_enospc); - if (unlikely(ret)) - return ret; + ret = bch2_quota_reservation_add(c, inode, "a_res, + quota_res_sectors, + check_enospc); + if (unlikely(ret)) + return ret; - s->quota_reserved = 1; - } + for (i = 0; i < ARRAY_SIZE(s->s); i++) + if (s->s[i].state == SECTOR_UNALLOCATED) + s->s[i].state = SECTOR_QUOTA_RESERVED; return 0; } +static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, + struct page *page, bool check_enospc) +{ + return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?: + bch2_get_page_quota_reservation(c, inode, page, check_enospc); +} + static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s; - - EBUG_ON(!PageLocked(page)); + struct bch_page_state *s = bch2_page_state(page); + int i, dirty_sectors = 0; - if (!PagePrivate(page)) + if (!s) return; - s = page_state(page); + for (i = 0; i < ARRAY_SIZE(s->s); i++) { + if (s->s[i].state == SECTOR_DIRTY) { + dirty_sectors++; + s->s[i].state = SECTOR_UNALLOCATED; + } + } - if (s->dirty_sectors) - i_sectors_acct(c, inode, NULL, -((int) s->dirty_sectors)); + if (dirty_sectors) + i_sectors_acct(c, inode, NULL, -dirty_sectors); bch2_put_page_reservation(c, inode, page); - ClearPagePrivate(page); - set_page_private(page, 0); + bch2_page_state_release(page); } static void __bch2_set_page_dirty(struct address_space *mapping, struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s = page_state(&folio->page); - struct quota_res quota_res = { s->quota_reserved * PAGE_SECTORS }; - unsigned dirty_sectors = PAGE_SECTORS - s->sectors; + struct bch_page_state *s = bch2_page_state(&folio->page); + struct quota_res quota_res = { 0 }; + unsigned i, dirty_sectors = 0; - s->quota_reserved = 0; + BUG_ON(!s); - if (s->dirty_sectors != dirty_sectors) - i_sectors_acct(c, inode, "a_res, - dirty_sectors - s->dirty_sectors); - s->dirty_sectors = dirty_sectors; + for (i = 0; i < ARRAY_SIZE(s->s); i++) { + if (s->s[i].state == SECTOR_QUOTA_RESERVED) + quota_res.sectors++; + if (s->s[i].state == SECTOR_UNALLOCATED || + s->s[i].state == SECTOR_QUOTA_RESERVED) { + s->s[i].state = SECTOR_DIRTY; + dirty_sectors++; + } + } + + if (dirty_sectors) + i_sectors_acct(c, inode, "a_res, 
dirty_sectors); bch2_quota_reservation_put(c, inode, "a_res); } @@ -796,6 +913,7 @@ static int readpages_iter_init(struct readpages_iter *iter, __readahead_batch(ractl, iter->pages, nr_pages); for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); put_page(iter->pages[i]); } @@ -809,7 +927,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - page_state_init_for_read(iter->pages[iter->idx]); return iter->pages[iter->idx]; } @@ -819,21 +936,20 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bio_vec bv; unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); - bio_for_each_segment(bv, bio, iter) { - /* brand new pages, don't need to be locked: */ - - struct bch_page_state *s = page_state(bv.bv_page); - - /* sectors in @k from the start of this page: */ - unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset); + BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); + BUG_ON(bio_end_sector(bio) > k.k->p.offset); - unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - s->nr_replicas = page_sectors == PAGE_SECTORS - ? nr_ptrs : 0; - - BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); - s->sectors += page_sectors; + bio_for_each_segment(bv, bio, iter) { + struct bch_page_state *s = bch2_page_state(bv.bv_page); + unsigned i; + + for (i = bv.bv_offset >> 9; + i < (bv.bv_offset + bv.bv_len) >> 9; + i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = SECTOR_ALLOCATED; + } } } @@ -864,12 +980,15 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!page) break; - page_state_init_for_read(page); + if (!__bch2_page_state_create(page, 0)) { + put_page(page); + break; + } ret = add_to_page_cache_lru(page, iter->mapping, page_offset, GFP_NOFS); if (ret) { - ClearPagePrivate(page); + __bch2_page_state_release(page); put_page(page); break; } @@ -1007,7 +1126,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_trans trans; struct btree_iter *iter; - page_state_init_for_read(page); + bch2_page_state_create(page, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; bio_add_page_contig(&rbio->bio, page); @@ -1175,10 +1294,11 @@ static int __bch2_writepage(struct folio *folio, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_page_state *s; - unsigned offset, nr_replicas_this_write; - unsigned dirty_sectors, replicas_reserved; + unsigned offset, nr_replicas_this_write = U32_MAX; + unsigned dirty_sectors = 0, reserved_sectors = 0; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; + unsigned i; int ret; EBUG_ON(!PageUptodate(page)); @@ -1203,9 +1323,9 @@ static int __bch2_writepage(struct folio *folio, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: - s = page_state(page); + s = bch2_page_state_create(page, __GFP_NOFAIL); - ret = bch2_get_page_reservation(c, inode, page, true); + ret = bch2_get_page_disk_reservation(c, inode, page, true); if (ret) { SetPageError(page); mapping_set_error(page->mapping, ret); @@ -1213,27 +1333,24 @@ do_io: return 0; } - __bch2_set_page_dirty(page->mapping, page_folio(page)); - - nr_replicas_this_write = - max_t(unsigned, - s->replicas_reserved, - (s->sectors == PAGE_SECTORS - ? s->nr_replicas : 0)); - - s->nr_replicas = w->opts.compression - ? 
0 - : nr_replicas_this_write; + for (i = 0; i < PAGE_SECTORS; i++) + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); /* Before unlocking the page, transfer reservation to w->io: */ - replicas_reserved = s->replicas_reserved; - s->replicas_reserved = 0; - dirty_sectors = s->dirty_sectors; - s->dirty_sectors = 0; + for (i = 0; i < PAGE_SECTORS; i++) { + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; - s->sectors += dirty_sectors; - BUG_ON(s->sectors != PAGE_SECTORS); + reserved_sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors += s->s[i].state == SECTOR_DIRTY; + s->s[i].state = SECTOR_ALLOCATED; + } BUG_ON(PageWriteback(page)); set_page_writeback(page); @@ -1253,7 +1370,7 @@ do_io: BUG_ON(inode != w->io->op.inode); BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); - w->io->op.op.res.sectors += replicas_reserved * PAGE_SECTORS; + w->io->op.op.res.sectors += reserved_sectors; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) @@ -2637,12 +2754,17 @@ long bch2_fallocate_dispatch(struct file *file, int mode, static bool folio_is_data(struct folio *folio) { - EBUG_ON(!PageLocked(&folio->page)); + struct bch_page_state *s = bch2_page_state(&folio->page); + unsigned i; + + if (!s) + return false; + + for (i = 0; i < PAGE_SECTORS; i++) + if (s->s[i].state >= SECTOR_DIRTY) + return true; - /* XXX: should only have to check PageDirty */ - return folio_test_private(folio) && - (page_state(&folio->page)->sectors || - page_state(&folio->page)->dirty_sectors); + return false; } static loff_t bch2_next_pagecache_data(struct inode *vinode, -- cgit From 97fd13ad769c24260ce0e05f05a51df132b980bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2019 16:04:58 -0400 Subject: bcachefs: Don't try to delete stripes when RO We weren't checking for errors when trying to delete stripes, which meant ec_stripe_delete_work() would spin trying to delete the same stripe over and over.
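With the fix, the delete loop gives up on the first failed delete instead of retrying the same stripe forever - roughly (simplified from the patch below; locking and the work-struct boilerplate are omitted):

	while (1) {
		ssize_t idx = stripe_idx_to_delete(c);

		if (idx < 0)
			break;			/* nothing left to delete */

		if (ec_stripe_delete(c, idx))
			break;			/* error: stop instead of spinning */
	}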
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 6 ++++-- fs/bcachefs/super.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 40acd1ec4645..6d59631b8259 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -626,7 +626,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, bch2_stripes_heap_insert(c, m, idx); } - if (stripe_idx_to_delete(c) >= 0) + if (stripe_idx_to_delete(c) >= 0 && + !percpu_ref_is_dying(&c->writes)) schedule_work(&c->ec_stripe_delete_work); } @@ -684,7 +685,8 @@ static void ec_stripe_delete_work(struct work_struct *work) if (idx < 0) break; - ec_stripe_delete(c, idx); + if (ec_stripe_delete(c, idx)) + break; } mutex_unlock(&c->ec_stripe_create_lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4c54ac64b0af..91562b95bd97 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -366,6 +366,8 @@ static int bch2_fs_read_write_late(struct bch_fs *c) schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + schedule_work(&c->ec_stripe_delete_work); + return 0; } -- cgit From f516c87272c98186f12768e9589664ce7a910bf4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2019 12:45:59 -0400 Subject: bcachefs: Fix stripe_idx_to_delete() There was a null ptr deref when there wasn't a stripes heap allocated Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 ++- fs/bcachefs/super.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6d59631b8259..42bca2b413e3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -575,7 +575,8 @@ static ssize_t stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; - return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; + return h->used && h->data[0].blocks_nonempty == 0 + ? h->data[0].idx : -1; } static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 91562b95bd97..202c0b443ef4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -277,6 +277,7 @@ void bch2_fs_read_only(struct bch_fs *c) */ percpu_ref_kill(&c->writes); + cancel_work_sync(&c->ec_stripe_delete_work); cancel_delayed_work(&c->pd_controllers_update); /* -- cgit From e222d206f298dc2c689d8f8787c929451d4f909b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Jul 2019 17:08:32 -0400 Subject: bcachefs: Fix ec_stripes_read() Change it to not mark keys that will be overwritten by keys in the journal - this fixes a bug where we pop an assertion in bucket_set_stripe() because of a stale pointer - because the stripe that has the stale pointer has been deleted. This code could be factored out and used elsewhere, at some point. 
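The walk is the usual merge of two sorted streams, with the journal key winning when both streams have a key at the same position - a sketch only (mark() is shorthand for the bch2_mark_key() call; the real loop is in the patch below):

	while (btree_k.k || journal_k.k) {
		int cmp = !btree_k.k ?  1
			: !journal_k.k ? -1
			: bkey_cmp(btree_k.k->p, journal_k.k->p);

		if (cmp < 0) {
			/* key only exists in the btree */
			mark(btree_k);
			btree_k = bch2_btree_iter_next(btree_iter);
		} else {
			if (!cmp)
				/* overwritten in the journal: skip the btree key */
				btree_k = bch2_btree_iter_next(btree_iter);
			mark(journal_k);
			journal_k = bch2_journal_iter_next(&journal_iter);
		}
	}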
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 46 +++++++++++++++++++++++++++++++++++----------- fs/bcachefs/recovery.c | 36 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/recovery.h | 11 +++++++++++ 3 files changed, 82 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 42bca2b413e3..de31ea6c20de 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1267,10 +1267,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct journal_key *i; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter *btree_iter; + struct journal_iter journal_iter; + struct bkey_s_c btree_k, journal_k, k; int ret; ret = bch2_fs_ec_start(c); @@ -1279,10 +1279,41 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) + btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); + journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); + + btree_k = bch2_btree_iter_peek(btree_iter); + journal_k = bch2_journal_iter_peek(&journal_iter); + + while (1) { + if (btree_k.k && journal_k.k) { + int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + + if (cmp < 0) { + k = btree_k; + btree_k = bch2_btree_iter_next(btree_iter); + } else if (cmp == 0) { + btree_k = bch2_btree_iter_next(btree_iter); + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } else { + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } + } else if (btree_k.k) { + k = btree_k; + btree_k = bch2_btree_iter_next(btree_iter); + } else if (journal_k.k) { + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } else { + break; + } + bch2_mark_key(c, k, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); + } ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -1290,13 +1321,6 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - for_each_journal_key(*journal_keys, i) - if (i->btree_id == BTREE_ID_EC) - bch2_mark_key(c, bkey_i_to_s_c(i->k), - 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); - return 0; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e0df2c0a4fdf..92867b5c078f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -24,6 +24,42 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* iterate over keys read from the journal: */ + +struct journal_iter bch2_journal_iter_init(struct journal_keys *keys, + enum btree_id id) +{ + return (struct journal_iter) { + .keys = keys, + .k = keys->d, + .btree_id = id, + }; +} + +struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + while (1) { + if (iter->k == iter->keys->d + iter->keys->nr) + return bkey_s_c_null; + + if (iter->k->btree_id == iter->btree_id) + return bkey_i_to_s_c(iter->k->k); + + iter->k++; + } + + return bkey_s_c_null; +} + +struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) +{ + if (iter->k == iter->keys->d + iter->keys->nr) + return bkey_s_c_null; + + iter->k++; + return bch2_journal_iter_peek(iter); +} + /* sort and dedup all keys in the journal: */ static void journal_entries_free(struct list_head *list) diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index a69260d6165a..479ea46f8dcb 100644 --- a/fs/bcachefs/recovery.h +++ 
b/fs/bcachefs/recovery.h @@ -18,6 +18,17 @@ struct journal_keys { #define for_each_journal_key(keys, i) \ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) +struct journal_iter { + struct journal_keys *keys; + struct journal_key *k; + enum btree_id btree_id; +}; + +struct journal_iter bch2_journal_iter_init(struct journal_keys *, + enum btree_id); +struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *); +struct bkey_s_c bch2_journal_iter_next(struct journal_iter *); + int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); -- cgit From 8d591d5da42d10de6dda7c87e5f4b1447f67bb86 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Jul 2019 12:59:02 -0400 Subject: bcachefs: Convert some assertions to fsck errors Actual repair code will come later, but this is a start Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 02f8b7f5a0c3..d71de5c776bb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -847,8 +847,6 @@ static void bucket_set_stripe(struct bch_fs *c, struct bucket *g = PTR_BUCKET(ca, ptr, gc); struct bucket_mark new, old; - BUG_ON(ptr_stale(ca, ptr)); - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ new.dirty = true; new.stripe = enabled; @@ -857,6 +855,26 @@ static void bucket_set_stripe(struct bch_fs *c, new.journal_seq = journal_seq; } })); + + /* + * XXX write repair code for these, flag stripe as possibly bad + */ + if (old.gen != ptr->gen) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stripe with stale pointer"); +#if 0 + /* + * We'd like to check for these, but these checks don't work + * yet: + */ + if (old.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "multiple stripes using same bucket"); + + if (!old.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "deleting stripe but bucket not marked as stripe bucket"); +#endif } } @@ -885,9 +903,11 @@ static bool bch2_mark_pointer(struct bch_fs *c, * checked the gen */ if (gen_after(new.gen, p.ptr.gen)) { - BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); - EBUG_ON(!p.ptr.cached && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + /* XXX write repair code for this */ + if (!p.ptr.cached && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stale dirty pointer"); return true; } -- cgit From 0c04f5eb0d49c92e5223dd98bb8e2577b9ce6f49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jul 2019 12:06:34 -0400 Subject: bcachefs: Don't overflow trans with iters from triggers Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d71de5c776bb..b1914fba1f7f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1359,6 +1359,8 @@ static int trans_get_key(struct btree_trans *trans, if (IS_ERR(*iter)) return PTR_ERR(*iter); + bch2_trans_iter_free_on_commit(trans, *iter); + *k = bch2_btree_iter_peek_slot(*iter); ret = bkey_err(*k); if (ret) -- cgit From a4461c8a7fc33aa663b0b1b2b7144d5890b6887f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jul 2019 12:23:04 -0400 Subject: bcachefs: Print out name of bkey type Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 7 +++---- fs/bcachefs/bkey_methods.h | 2 +- 2 files changed, 4 
insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 27f196ef0b18..8af16ca994e0 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -12,7 +12,7 @@ #include "quota.h" #include "xattr.h" -const char * const bch_bkey_types[] = { +const char * const bch2_bkey_types[] = { #define x(name, nr) #name, BCH_BKEY_TYPES() #undef x @@ -159,7 +159,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { - pr_buf(out, "u64s %u type %u ", k->u64s, k->type); + pr_buf(out, "u64s %u type %s ", k->u64s, + bch2_bkey_types[k->type]); bch2_bpos_to_text(out, k->p); @@ -174,8 +175,6 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); - else - pr_buf(out, " %s", bch_bkey_types[k.k->type]); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 08b976633360..e6e97cda4f50 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -9,7 +9,7 @@ struct btree; struct bkey; enum btree_node_type; -extern const char * const bch_bkey_types[]; +extern const char * const bch2_bkey_types[]; enum merge_result { BCH_MERGE_NOMERGE, -- cgit From 06ed855862853dcdd365a12f7cf182961bec61de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2019 12:56:43 -0400 Subject: bcachefs: Add offset_into_extent param to bch2_read_extent() With reflink, we'll no longer be able to calculate the offset of the data we want into the extent we're reading from from the extent pos and the iter pos - we'll have to pass it in separately. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 31 +++++++++++++++++-------------- fs/bcachefs/io.c | 41 +++++++++++++++++++++++++---------------- fs/bcachefs/io.h | 14 +++++++++----- fs/bcachefs/io_types.h | 2 ++ fs/bcachefs/move.c | 2 +- 5 files changed, 54 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8858352eb42a..def470b5b959 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1005,7 +1005,6 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; @@ -1015,9 +1014,10 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes; + unsigned bytes, offset_into_extent; - bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); + bch2_btree_iter_set_pos(iter, + POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); BUG_ON(!k.k); @@ -1025,8 +1025,8 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, if (IS_ERR(k.k)) { int ret = btree_iter_err(iter); BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); return; } @@ -1034,6 +1034,9 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + if (readpages_iter) { bool want_full_extent = false; @@ -1048,27 +1051,27 @@ static void bchfs_read(struct btree_trans *trans, 
struct btree_iter *iter, } readpage_bio_extend(readpages_iter, - bio, k.k->p.offset, + &rbio->bio, k.k->p.offset, want_full_extent); } - bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - - bio->bi_iter.bi_sector) << 9; - swap(bio->bi_iter.bi_size, bytes); + bytes = min_t(unsigned, bio_sectors(&rbio->bio), + (k.k->size - offset_into_extent)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); - if (bytes == bio->bi_iter.bi_size) + if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(bio, k); + bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) return; - swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, bytes); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 42071d0028ad..8f16e252d2f1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1240,7 +1240,7 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1272,17 +1272,22 @@ retry: POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, bvec_iter.bi_size, - (k.k->p.offset - bvec_iter.bi_sector) << 9); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + + bytes = min_t(unsigned, bvec_iter_sectors(bvec_iter), + (k.k->size - offset_into_extent)) << 9; swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, + offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1463,7 +1468,7 @@ static void __bch2_read_endio(struct work_struct *work) goto nodecode; /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc.compression_type != BCH_COMPRESSION_NONE) { @@ -1574,6 +1579,7 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, + unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { struct extent_ptr_decoded pick; @@ -1606,7 +1612,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) goto hole; - iter.bi_sector = pos.offset; iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } @@ -1620,8 +1625,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); + BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1642,15 +1646,16 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || 
pick.crc.offset || - iter.bi_sector != pos.offset)); + offset_into_extent)); + pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + - (iter.bi_sector - pos.offset); + offset_into_extent; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); - pos.offset = iter.bi_sector; + offset_into_extent = 0; } if (rbio) { @@ -1707,6 +1712,7 @@ noclone: else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; @@ -1834,7 +1840,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, offset_into_extent; /* * Unlock the iterator while the btree node's lock is still in @@ -1844,14 +1850,17 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, - (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + + bytes = min_t(unsigned, bio_sectors(&rbio->bio), + (k.k->size - offset_into_extent)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) return; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 61c8b8b3a459..aa437cb05fe7 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,10 +99,6 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_io_failures *, unsigned); -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); - enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, @@ -116,14 +112,22 @@ enum bch_read_flags { BCH_READ_IN_RETRY = 1 << 7, }; +int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, + offset_into_extent, NULL, flags); } +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); + static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) { diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index c697191172b0..50f2a5e57960 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -38,6 +38,8 @@ struct bch_read_bio { */ struct bvec_iter bvec_iter; + unsigned offset_into_extent; + u16 flags; union { struct { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 96f9f5950438..27835e4f13fd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -461,7 +461,7 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e.s_c, + bch2_read_extent(c, &io->rbio, e.s_c, 0, BCH_READ_NODECODE| 
BCH_READ_LAST_FRAGMENT); return 0; -- cgit From a9058a223fd2168d96195df3b918e395be8869e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Jul 2019 18:56:28 -0400 Subject: bcachefs: add missing bch2_trans_begin() call for_each_btree_key() calls bch2_trans_get_iter() - we have to reset the transaction state before getting the iterator again, in the retry path Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8f16e252d2f1..a539719661b8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1263,11 +1263,13 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + + bch2_trans_init(&trans, c, 0, 0); retry: + bch2_trans_begin(&trans); + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { -- cgit From 8627f674bc98d6eeb91f885ca1259d29a0e7aa34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jul 2019 17:32:20 -0400 Subject: bcachefs: Don't unlink iters on unsuccessful commit Where unlink_on_commit is used, on unsuccessful commit we're likely retrying the whole update and we're going to be using the same iterators again. The management of multiple iterators needs to be gone over a fair bit more at some point... Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4461e42f2367..b878f9a9882c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -923,8 +923,6 @@ out_noupdates: bch2_trans_unlink_iters(trans, ~trans->iters_touched| trans->iters_unlink_on_commit); trans->iters_touched = 0; - } else { - bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); } trans->nr_updates = 0; trans->mem_top = 0; -- cgit From b17657d0cf321af98fcb9d52748ca4d201284702 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jul 2019 17:11:35 -0400 Subject: bcachefs: Don't call bch2_trans_begin_updates() in bch2_extent_update() Prep work for reflink - for reflink, we're going to be using bch2_extent_update() with other updates in the same transaction.
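After this change every call site resets the update state explicitly and then does the extent update - a sketch of a caller (variable names are illustrative, taken from the write-index path below):

	bch2_trans_begin_updates(&trans);

	ret = bch2_extent_update(&trans, inode,
				 &disk_res, quota_res,
				 iter, &tmp.k,
				 new_i_size,
				 may_allocate, direct,
				 &i_sectors_delta);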
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index def470b5b959..55fee053337f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -276,16 +276,16 @@ static int sum_sector_overwrites(struct btree_trans *trans, return 0; } -static int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) +int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) { struct bch_fs *c = trans->c; struct btree_iter *inode_iter = NULL; @@ -297,8 +297,6 @@ static int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta; int ret; - bch2_trans_begin_updates(trans); - ret = bch2_btree_iter_traverse(extent_iter); if (ret) return ret; @@ -447,6 +445,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) bkey_copy(&tmp.k, bch2_keylist_front(keys)); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &wop->res, quota_res, iter, &tmp.k, @@ -2198,6 +2198,8 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, NULL, iter, &delete, 0, true, true, NULL); @@ -2546,6 +2548,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, NULL, dst, ©.k, @@ -2689,6 +2693,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, "a_res, iter, &reservation.k_i, -- cgit From b1c9358a25eb0ed94c5bfc18ba5f9b00d51d8863 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jul 2019 17:21:21 -0400 Subject: bcachefs: Refactor __bch2_cut_front() Minor cleanup - prep work for new key types for reflink Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 50 +++++++++++++++++++++++++++----------------------- fs/bcachefs/extents.h | 6 +++--- 2 files changed, 30 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4d3722cb7e33..60fe50368d21 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -707,44 +707,47 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, /* Extents */ -bool __bch2_cut_front(struct bpos where, struct bkey_s k) +void __bch2_cut_front(struct bpos where, struct bkey_s k) { - u64 len = 0; + u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; + return; EBUG_ON(bkey_cmp(where, k.k->p) > 0); - len = k.k->p.offset - where.offset; + sub = where.offset - bkey_start_offset(k.k); - BUG_ON(len > k.k->size); + k.k->size -= sub; - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) + if (!k.k->size) k.k->type = 
KEY_TYPE_deleted; - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); + + switch (k.k->type) { + case KEY_TYPE_deleted: + case KEY_TYPE_discard: + case KEY_TYPE_error: + case KEY_TYPE_cookie: + break; + case KEY_TYPE_extent: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; - extent_for_each_entry(e, entry) { + bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc) - entry->ptr.offset += e.k->size - len; + entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; + entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; + entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; + entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: break; @@ -753,11 +756,14 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k) if (extent_entry_is_crc(entry)) seen_crc = true; } - } - k.k->size = len; - - return true; + break; + } + case KEY_TYPE_reservation: + break; + default: + BUG(); + } } bool bch2_cut_back(struct bpos where, struct bkey *k) @@ -771,8 +777,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) len = where.offset - bkey_start_offset(k); - BUG_ON(len > k->size); - k->p = where; k->size = len; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fe92737354bd..549188c864ae 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -540,11 +540,11 @@ do { \ } \ } while (0) -bool __bch2_cut_front(struct bpos, struct bkey_s); +void __bch2_cut_front(struct bpos, struct bkey_s); -static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k) +static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - return __bch2_cut_front(where, bkey_i_to_s(k)); + __bch2_cut_front(where, bkey_i_to_s(k)); } bool bch2_cut_back(struct bpos, struct bkey *); -- cgit From 99aaf57000b4091d2471ed30387d96e15f2fc38b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jul 2019 13:52:14 -0400 Subject: bcachefs: Refactor various code to not be extent specific With reflink, various code now has to handle both KEY_TYPE_extent or KEY_TYPE_reflink_v - so, convert it to be generic across all keys with pointers. 
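Concretely, code that previously had to go through bkey_s_c_to_extent() and the extent_for_each_*() macros can now use the generic bkey_ptrs helpers on any key type that carries pointers - the common shape is (a sketch; the loop body depends on the caller):

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		/* works for extents, btree pointers and, later, reflink keys */
	}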
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 2 +- fs/bcachefs/ec.c | 46 ++++++++++---------- fs/bcachefs/extents.c | 109 ++++++++++++++++++++++++++++-------------------- fs/bcachefs/extents.h | 95 ++++++++++++++++------------------------- fs/bcachefs/fs-io.c | 4 +- fs/bcachefs/fs.c | 44 ++++++++++--------- fs/bcachefs/io.c | 50 +++++++--------------- fs/bcachefs/migrate.c | 3 +- fs/bcachefs/move.c | 34 +++++++-------- fs/bcachefs/movinggc.c | 27 +++++------- fs/bcachefs/rebalance.c | 6 +-- 11 files changed, 195 insertions(+), 225 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index ba08d95aae6f..b3a08e52e6b3 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -58,7 +58,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k)) +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index de31ea6c20de..77a5c3613ff7 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -162,19 +162,20 @@ static int extent_matches_stripe(struct bch_fs *c, struct bch_stripe *v, struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - int idx; - if (!bkey_extent_is_data(k.k)) - return -1; - - e = bkey_s_c_to_extent(k); + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + int idx; - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, ptr); - if (idx >= 0) - return idx; + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + break; + } } return -1; @@ -182,19 +183,20 @@ static int extent_matches_stripe(struct bch_fs *c, static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) { - struct bkey_s_c_extent e; - const union bch_extent_entry *entry; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; - if (!bkey_extent_is_data(k.k)) - return false; + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; - e = bkey_s_c_to_extent(k); - - extent_for_each_entry(e, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; + break; + } + } return false; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 60fe50368d21..9f17780b8bc0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -249,6 +249,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + /* extent specific utility code */ 
const struct bch_extent_ptr * @@ -279,20 +306,6 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; - - return NULL; -} - unsigned bch2_extent_is_compressed(struct bkey_s_c k) { unsigned ret = 0; @@ -313,16 +326,17 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) return ret; } -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == m.dev && p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == (s64) m.offset - offset) return true; @@ -389,16 +403,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -414,9 +429,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; @@ -424,7 +439,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { - extent_for_each_crc(extent_i_to_s(e), u, i) + bkey_for_each_crc(&k->k, ptrs, u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { @@ -436,15 +451,15 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, found: BUG_ON(n.compression_type); BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: - extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } @@ -1397,9 +1412,12 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, #undef set_common_fields } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +void bch2_extent_crc_append(struct bkey_i *k, + struct 
bch_extent_crc_unpacked new) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size - 1 <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) @@ -1416,54 +1434,53 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc, BUG(); bch2_extent_crc_pack(crc, new); -} -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) -{ - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -static inline void __extent_entry_insert(struct bkey_i_extent *e, +static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union bch_extent_entry *new) { - union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), dst, (u64 *) end - (u64 *) dst); - e->k.u64s += extent_entry_u64s(new); + k->k.u64s += extent_entry_u64s(new); memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, +void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { - struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = e->v.start; + pos = ptrs.start; goto found; } - extent_for_each_crc(extent_i_to_s(e), crc, pos) + bkey_for_each_crc(&k->k, ptrs, crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); goto found; } - bch2_extent_crc_append(e, p->crc); - pos = extent_entry_last(extent_i_to_s(e)); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ptr)); + __extent_entry_insert(k, pos, to_entry(&p->ptr)); for (i = 0; i < p->ec_nr; i++) { p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + __extent_entry_insert(k, pos, to_entry(&p->ec[i])); } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 549188c864ae..035d15bbca39 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -12,7 +12,8 @@ struct btree_insert_entry; /* extent entries: */ -#define extent_entry_last(_e) bkey_val_end(_e) +#define extent_entry_last(_e) \ + ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) #define entry_to_ptr(_entry) \ ({ \ @@ -258,6 +259,27 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) +#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +({ \ + __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_iter)); \ + break; \ + } \ + \ + (_iter) < (_end); \ +}) + +#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ + (_iter) = (_start); \ + bkey_crc_next(_k, _start, _end, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + /* utility code common to all keys with 
pointers: */ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) @@ -267,7 +289,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), - to_entry(bkey_val_end(e)) + to_entry(extent_entry_last(e)) }; } case KEY_TYPE_extent: { @@ -337,18 +359,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -359,6 +369,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); @@ -424,15 +439,11 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); unsigned bch2_extent_is_compressed(struct bkey_s_c); -bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, - struct bch_extent_ptr, u64); +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); static inline bool bkey_extent_is_data(const struct bkey *k) { @@ -456,15 +467,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) } } -static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) -{ - return bkey_extent_is_allocation(k.k) && - !bch2_extent_is_compressed(k); -} - -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - /* Extent entry iteration: */ #define extent_for_each_entry_from(_e, _entry, _start) \ @@ -480,45 +482,18 @@ void bch2_bkey_drop_device(struct bkey_s, unsigned); #define extent_for_each_ptr(_e, _ptr) \ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -#define extent_crc_next(_e, _crc, _iter) \ -({ \ - extent_for_each_entry_from(_e, _iter, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ - break; \ - } \ - \ - (_iter) < extent_entry_last(_e); \ -}) - -#define extent_for_each_crc(_e, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_iter) = (_e).v->start; \ - extent_crc_next(_e, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ extent_entry_last(_e), _ptr, _entry) -void bch2_extent_crc_append(struct bkey_i_extent *, +void bch2_extent_crc_append(struct bkey_i *, struct 
bch_extent_crc_unpacked); -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, +void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -static inline void __extent_entry_push(struct bkey_i_extent *e) -{ - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, struct bch_extent_crc_unpacked); -bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 55fee053337f..474535aa3fc2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1041,11 +1041,11 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, bool want_full_extent = false; if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *i; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, i) + bkey_for_each_ptr_decode(k.k, ptrs, p, i) want_full_extent |= ((p.crc.csum_type != 0) | (p.crc.compression_type != 0)); } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f69b535b1b82..1b3898eae8b8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1148,15 +1148,15 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, } static int bch2_fill_extent(struct fiemap_extent_info *info, - const struct bkey_i *k, unsigned flags) + struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(&k->k)) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + if (bkey_extent_is_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1166,22 +1166,22 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, offset += p.crc.offset; if ((offset & (PAGE_SECTORS - 1)) || - (e.k->size & (PAGE_SECTORS - 1))) + (k.k->size & (PAGE_SECTORS - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, + bkey_start_offset(k.k) << 9, offset << 9, - e.k->size << 9, flags|flags2); + k.k->size << 9, flags|flags2); if (ret) return ret; } return 0; - } else if (k->k.type == KEY_TYPE_reservation) { + } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, - bkey_start_offset(&k->k) << 9, - 0, k->k.size << 9, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, flags| FIEMAP_EXTENT_DELALLOC| FIEMAP_EXTENT_UNWRITTEN); @@ -1198,7 +1198,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) tmp; + BKEY_PADDED(k) cur, prev; bool have_extent = false; int ret = 0; @@ -1212,25 +1212,31 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) + POS(ei->v.i_ino, start >> 9), 0, k, ret) { + if (bkey_cmp(bkey_start_pos(k.k), + 
POS(ei->v.i_ino, (start + len) >> 9)) >= 0) + break; + + bkey_reassemble(&cur.k, k); + k = bkey_i_to_s_c(&cur.k); + if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; - if (have_extent) { - ret = bch2_fill_extent(info, &tmp.k, 0); + ret = bch2_fill_extent(info, + bkey_i_to_s_c(&prev.k), 0); if (ret) break; } - bkey_reassemble(&tmp.k, k); + bkey_copy(&prev.k, &cur.k); have_extent = true; } + } if (!ret && have_extent) - ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); + ret = bch2_fill_extent(info, bkey_i_to_s_c(&prev.k), + FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a539719661b8..fd1aceea3553 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -431,7 +431,7 @@ static void init_append_extent(struct bch_write_op *op, if (crc.csum_type || crc.compression_type || crc.nonce) - bch2_extent_crc_append(e, crc); + bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size); @@ -962,17 +962,13 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!bkey_extent_is_data(k.k)) - return false; - if (!(flags & BCH_READ_MAY_PROMOTE)) return false; if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), - opts.promote_target)) + if (bch2_bkey_has_target(c, k, opts.promote_target)) return false; if (bch2_target_congested(c, opts.promote_target)) { @@ -1230,11 +1226,10 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset)) { + if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; @@ -1370,7 +1365,6 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; @@ -1389,34 +1383,30 @@ retry: if (IS_ERR_OR_NULL(k.k)) goto out; - if (!bkey_extent_is_data(k.k)) - goto out; - bkey_reassemble(&new.k, k); - e = bkey_i_to_extent(&new.k); + k = bkey_i_to_s_c(&new.k); - if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, data_offset) || - bversion_cmp(e->k.version, rbio->version)) + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; /* Extent was merged? 
*/ - if (bkey_start_offset(&e->k) < data_offset || - e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(&e->k) - data_offset, e->k.size, + bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } - if (!bch2_extent_narrow_crcs(e, new_crc)) + if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1427,15 +1417,6 @@ out: bch2_trans_exit(&trans); } -static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags) -{ - return !(flags & BCH_READ_IN_RETRY) && - bkey_extent_is_data(k.k) && - bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); -} - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -1622,7 +1603,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_MUST_CLONE; - narrow_crcs = should_narrow_crcs(k, &pick, flags); + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ad41f5e36a7c..301cb72bd3e4 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,8 +49,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + if (!bch2_bkey_has_device(k, dev_idx)) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 27835e4f13fd..ffa0c2bbe290 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -82,9 +82,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) break; if (bversion_cmp(k.k->version, new->k.version) || - !bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), - m->ptr, m->offset)) + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; if (m->data_cmd == DATA_REWRITE && @@ -116,14 +114,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_ptr_decoded_append(insert, &p); + bch2_extent_ptr_decoded_append(&insert->k_i, &p); did_work = true; } if (!did_work) goto nomatch; - bch2_extent_narrow_crcs(insert, + bch2_bkey_narrow_crcs(&insert->k_i, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, extent_i_to_s(insert).s); bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), @@ -393,14 +391,15 @@ static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, - struct bkey_s_c_extent e, + struct bkey_s_c k, enum data_cmd data_cmd, struct data_opts data_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned sectors 
= e.k->size, pages; + unsigned sectors = k.k->size, pages; int ret = -ENOMEM; move_ctxt_wait_event(ctxt, @@ -412,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c, SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); @@ -422,8 +421,8 @@ static int bch2_move_extent(struct bch_fs *c, goto err; io->write.ctxt = ctxt; - io->read_sectors = e.k->size; - io->write_sectors = e.k->size; + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); bio_set_prio(&io->write.op.wbio.bio, @@ -440,18 +439,18 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_iter.bi_size = sectors << 9; io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, e.s_c); + data_cmd, data_opts, k); if (ret) goto err_free_pages; atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(e.k->size, &ctxt->stats->sectors_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - trace_move_extent(e.k); + trace_move_extent(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); @@ -461,7 +460,7 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e.s_c, 0, + bch2_read_extent(c, &io->rbio, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -470,7 +469,7 @@ err_free_pages: err_free: kfree(io); err: - trace_move_alloc_fail(e.k); + trace_move_alloc_fail(k.k); return ret; } @@ -580,8 +579,7 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, - bkey_s_c_to_extent(k), + ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 44e235ef3de0..84971fbfc722 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -69,26 +69,19 @@ static bool __copygc_pred(struct bch_dev *ca, struct bkey_s_c k) { copygc_heap *h = &ca->copygc_heap; + const struct bch_extent_ptr *ptr = + bch2_bkey_has_device(k, ca->dev_idx); - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr = - bch2_extent_has_device(e, ca->dev_idx); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); - - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); - } - break; - } + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); } return false; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index fe4a9af92a76..0997c0621b7c 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -38,9 +38,9 @@ void bch2_rebalance_add_key(struct 
bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bkey_s_c_extent e; if (!bkey_extent_is_data(k.k)) return; @@ -49,9 +49,7 @@ void bch2_rebalance_add_key(struct bch_fs *c, !io_opts->background_compression) return; - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (rebalance_ptr_pred(c, p, io_opts)) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -- cgit From e10d309471c54dbcb0dfc2c780672c07d4805495 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2019 12:46:53 -0400 Subject: bcachefs: Fix bch2_seek_data() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 474535aa3fc2..de0159dde1bd 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2798,7 +2798,7 @@ static loff_t bch2_next_pagecache_data(struct inode *vinode, end_offset = min(end_offset, max(start_offset, - ((loff_t) index) << PAGE_SHIFT)); + ((loff_t) folio->index) << PAGE_SHIFT)); folio_unlock(folio); folio_batch_release(&fbatch); return end_offset; @@ -2847,7 +2847,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) next_data = bch2_next_pagecache_data(&inode->v, offset, next_data); - if (next_data > isize) + if (next_data >= isize) return -ENXIO; return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -- cgit From 7f5e31e1a4c8821b346c9b2cc108ffbdd87778a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Jul 2019 12:24:36 -0400 Subject: bcachefs: Change __bch2_writepage() to not write to holes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 142 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index de0159dde1bd..51b18ec1b1f8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -31,6 +31,15 @@ #include +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + struct quota_res { u64 sectors; }; @@ -517,6 +526,7 @@ struct bch_page_sector { }; struct bch_page_state { + atomic_t write_count; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -835,31 +845,6 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) return true; } -/* readpages/writepages: */ - -static bool bio_can_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - return bio->bi_vcnt < bio->bi_max_vecs && - bio_end_sector(bio) == offset; -} - -static int bio_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - EBUG_ON(!bio->bi_max_vecs); - - if (!bio->bi_vcnt) - bio->bi_iter.bi_sector = offset; - else if (!bio_can_add_page_contig(bio, page)) - return -1; - - BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); - return 0; -} - /* readpage(s): */ static void bch2_readpages_end_io(struct bio *bio) @@ -1132,7 +1117,9 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bch2_page_state_create(page, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - bio_add_page_contig(&rbio->bio, page); + rbio->bio.bi_iter.bi_sector = + (sector_t) 
page->index << PAGE_SECTOR_SHIFT; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, @@ -1243,8 +1230,12 @@ static void bch2_writepage_io_done(struct closure *cl) i_sectors_acct(c, io->op.inode, NULL, io->op.sectors_added - (s64) io->new_sectors); - bio_for_each_segment_all(bvec, bio, iter) - end_page_writeback(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(bvec->bv_page); + } closure_return_with_destructor(&io->cl, bch2_writepage_io_free); } @@ -1265,11 +1256,10 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) static void bch2_writepage_io_alloc(struct bch_fs *c, struct bch_writepage_state *w, struct bch_inode_info *inode, - struct page *page, + u64 sector, unsigned nr_replicas) { struct bch_write_op *op; - u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT; w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, REQ_OP_WRITE, @@ -1284,8 +1274,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->pos = POS(inode->v.i_ino, offset); - op->wbio.bio.bi_iter.bi_sector = offset; + op->pos = POS(inode->v.i_ino, sector); + op->wbio.bio.bi_iter.bi_sector = sector; } static int __bch2_writepage(struct folio *folio, @@ -1296,12 +1286,10 @@ static int __bch2_writepage(struct folio *folio, struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state *s; - unsigned offset, nr_replicas_this_write = U32_MAX; - unsigned dirty_sectors = 0, reserved_sectors = 0; + struct bch_page_state *s, orig; + unsigned i, offset, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned i; int ret; EBUG_ON(!PageUptodate(page)); @@ -1336,48 +1324,90 @@ do_io: return 0; } - for (i = 0; i < PAGE_SECTORS; i++) + /* Before unlocking the page, get copy of reservations: */ + orig = *s; + + for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state == SECTOR_UNALLOCATED) + continue; + nr_replicas_this_write = min_t(unsigned, nr_replicas_this_write, s->s[i].nr_replicas + s->s[i].replicas_reserved); - - /* Before unlocking the page, transfer reservation to w->io: */ + } for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state == SECTOR_UNALLOCATED) + continue; + s->s[i].nr_replicas = w->opts.compression ? 
0 : nr_replicas_this_write; - reserved_sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - - dirty_sectors += s->s[i].state == SECTOR_DIRTY; s->s[i].state = SECTOR_ALLOCATED; } + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + BUG_ON(PageWriteback(page)); set_page_writeback(page); + unlock_page(page); - if (w->io && - (w->io->op.op.res.nr_replicas != nr_replicas_this_write || - !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) - bch2_writepage_do_io(w); + offset = 0; + while (1) { + unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < PAGE_SECTORS && + orig.s[offset].state == SECTOR_UNALLOCATED) + offset++; + + if (offset == PAGE_SECTORS) + break; + + sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; + + while (offset + sectors < PAGE_SECTORS && + orig.s[offset + sectors].state != SECTOR_UNALLOCATED) + sectors++; + + for (i = offset; i < offset + sectors; i++) { + reserved_sectors += orig.s[i].replicas_reserved; + dirty_sectors += orig.s[i].state == SECTOR_DIRTY; + } + + if (w->io && + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.op.wbio.bio, PAGE_SIZE) || + bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, w, inode, sector, + nr_replicas_this_write); - if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, - nr_replicas_this_write); + w->io->new_sectors += dirty_sectors; - w->io->new_sectors += dirty_sectors; + atomic_inc(&s->write_count); - BUG_ON(inode != w->io->op.inode); - BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); + BUG_ON(inode != w->io->op.inode); + BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + sectors << 9, offset << 9)); - w->io->op.op.res.sectors += reserved_sectors; - w->io->op.new_i_size = i_size; + w->io->op.op.res.sectors += reserved_sectors; + w->io->op.new_i_size = i_size; + + if (wbc->sync_mode == WB_SYNC_ALL) + w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + + offset += sectors; + } - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(page); return 0; } -- cgit From d1542e0362de069f677dfb0e9336438afb8fae74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Jul 2019 13:38:38 -0400 Subject: bcachefs: Change buffered write path to write to partial pages Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 262 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 152 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 51b18ec1b1f8..4efe985da96b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -519,7 +519,6 @@ struct bch_page_sector { /* i_sectors: */ enum { SECTOR_UNALLOCATED, - SECTOR_QUOTA_RESERVED, SECTOR_DIRTY, SECTOR_ALLOCATED, } state:2; @@ -597,31 +596,6 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } -static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page) -{ - struct bch_page_state *s = bch2_page_state(page); - struct disk_reservation disk_res = { 0 }; - struct quota_res quota_res = { 0 }; - unsigned i; - - if (!s) - return; - - for (i = 0; i < ARRAY_SIZE(s->s); i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - if (s->s[i].state == SECTOR_QUOTA_RESERVED) { 
- quota_res.sectors++; - s->s[i].state = SECTOR_UNALLOCATED; - } - } - - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &disk_res); -} - static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -672,100 +646,134 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, return 0; } -static int bch2_get_page_quota_reservation(struct bch_fs *c, +struct bch2_page_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static void bch2_page_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_page_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +static void bch2_page_reservation_put(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) + struct bch2_page_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +static int bch2_page_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len, bool check_enospc) { struct bch_page_state *s = bch2_page_state_create(page, 0); - struct quota_res quota_res = { 0 }; - unsigned i, quota_res_sectors = 0; + unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; if (!s) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(s->s); i++) - quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED; - - if (!quota_res_sectors) - return 0; + for (i = offset / 512; + i < DIV_ROUND_UP(offset + len, 512); + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; + } - ret = bch2_quota_reservation_add(c, inode, "a_res, - quota_res_sectors, - check_enospc); - if (unlikely(ret)) - return ret; + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, + disk_sectors, + !check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + } - for (i = 0; i < ARRAY_SIZE(s->s); i++) - if (s->s[i].state == SECTOR_UNALLOCATED) - s->s[i].state = SECTOR_QUOTA_RESERVED; + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, + check_enospc); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } return 0; } -static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) -{ - return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?: - bch2_get_page_quota_reservation(c, inode, page, check_enospc); -} - static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state *s = bch2_page_state(page); + struct disk_reservation disk_res = { 0 }; int i, dirty_sectors = 0; if (!s) return; for (i = 0; i < ARRAY_SIZE(s->s); i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + if (s->s[i].state == SECTOR_DIRTY) { dirty_sectors++; s->s[i].state = SECTOR_UNALLOCATED; } } + bch2_disk_reservation_put(c, &disk_res); + if (dirty_sectors) i_sectors_acct(c, inode, NULL, -dirty_sectors); - bch2_put_page_reservation(c, inode, page); bch2_page_state_release(page); } -static void __bch2_set_page_dirty(struct address_space *mapping, struct folio *folio) +static void bch2_set_page_dirty(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s = bch2_page_state(&folio->page); - struct quota_res quota_res = { 0 }; + struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; - BUG_ON(!s); + for (i = offset / 512; + i < DIV_ROUND_UP(offset + len, 512); + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); - for (i = 0; i < ARRAY_SIZE(s->s); i++) { - if (s->s[i].state == SECTOR_QUOTA_RESERVED) - quota_res.sectors++; + BUG_ON(sectors > res->disk.sectors); + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED || - s->s[i].state == SECTOR_QUOTA_RESERVED) { + if (s->s[i].state == SECTOR_UNALLOCATED) { s->s[i].state = SECTOR_DIRTY; dirty_sectors++; } } if (dirty_sectors) - i_sectors_acct(c, inode, "a_res, dirty_sectors); - bch2_quota_reservation_put(c, inode, "a_res); -} - -static void bch2_set_page_dirty(struct address_space *mapping, struct page *page) -{ - struct folio *folio = page_folio(page); + i_sectors_acct(c, inode, &res->quota, dirty_sectors); - __bch2_set_page_dirty(mapping, folio); - filemap_dirty_folio(mapping, folio); + if (!PageDirty(page)) + filemap_dirty_folio(inode->v.i_mapping, page_folio(page)); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -788,8 +796,11 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation res; int ret = VM_FAULT_LOCKED; + bch2_page_reservation_init(c, inode, &res); + sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -809,18 +820,21 @@ 
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) goto out; } - if (bch2_get_page_reservation(c, inode, page, true)) { + if (bch2_page_reservation_get(c, inode, page, &res, + 0, PAGE_SIZE, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; } - if (!PageDirty(page)) - bch2_set_page_dirty(mapping, page); + bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE); wait_for_stable_page(page); out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); + + bch2_page_reservation_put(c, inode, &res); + return ret; } @@ -1450,12 +1464,18 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res; pgoff_t index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); struct page *page; int ret = -ENOMEM; - BUG_ON(inode_unhashed(&inode->v)); + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_page_reservation_init(c, inode, res); + *fsdata = res; bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -1486,7 +1506,8 @@ readpage: if (ret) goto err; out: - ret = bch2_get_page_reservation(c, inode, page, true); + ret = bch2_page_reservation_get(c, inode, page, res, + offset, len, true); if (ret) { if (!PageUptodate(page)) { /* @@ -1509,6 +1530,8 @@ err: *pagep = NULL; err_unlock: bch2_pagecache_add_put(&inode->ei_pagecache_lock); + kfree(res); + *fsdata = NULL; return ret; } @@ -1518,6 +1541,8 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res = fsdata; + unsigned offset = pos & (PAGE_SIZE - 1); lockdep_assert_held(&inode->v.i_rwsem); @@ -1540,18 +1565,19 @@ int bch2_write_end(struct file *file, struct address_space *mapping, if (copied) { if (!PageUptodate(page)) SetPageUptodate(page); - if (!PageDirty(page)) - bch2_set_page_dirty(mapping, page); + + bch2_set_page_dirty(c, inode, page, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; - } else { - bch2_put_page_reservation(c, inode, page); } unlock_page(page); put_page(page); bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_page_reservation_put(c, inode, res); + kfree(res); + return copied; } @@ -1564,15 +1590,19 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct page *pages[WRITE_BATCH_PAGES]; + struct bch2_page_reservation res; unsigned long index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, copied = 0, nr_pages_copied = 0; + unsigned i, reserved = 0, set_dirty = 0; + unsigned copied = 0, nr_pages_copied = 0; int ret = 0; BUG_ON(!len); BUG_ON(nr_pages > ARRAY_SIZE(pages)); + bch2_page_reservation_init(c, inode, &res); + for (i = 0; i < nr_pages; i++) { pages[i] = grab_cache_page_write_begin(mapping, index + i); if (!pages[i]) { @@ -1599,19 +1629,25 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } - for (i = 0; i < nr_pages; i++) { - ret = bch2_get_page_reservation(c, inode, pages[i], true); - - if (ret && !PageUptodate(pages[i])) { - ret = bch2_read_single_page(pages[i], mapping); - if (ret) - goto out; - - ret = bch2_get_page_reservation(c, inode, pages[i], true); + while (reserved < len) { + struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + 
unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, len - reserved, + PAGE_SIZE - pg_offset); +retry_reservation: + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); + + if (ret && !PageUptodate(page)) { + ret = bch2_read_single_page(page, mapping); + if (!ret) + goto retry_reservation; } if (ret) goto out; + + reserved += pg_len; } if (mapping_writably_mapped(mapping)) @@ -1621,16 +1657,16 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, while (copied < len) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_bytes = min_t(unsigned, len - copied, - PAGE_SIZE - pg_offset); + unsigned pg_len = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); unsigned pg_copied = copy_page_from_iter_atomic(page, - pg_offset, pg_bytes, iter); + pg_offset, pg_len, iter); + + if (!pg_copied) + break; flush_dcache_page(page); copied += pg_copied; - - if (pg_copied != pg_bytes) - break; } if (!copied) @@ -1653,23 +1689,30 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, copied -= (offset + copied) & (PAGE_SIZE - 1); } } -out: - for (i = 0; i < nr_pages_copied; i++) { - if (!PageUptodate(pages[i])) - SetPageUptodate(pages[i]); - if (!PageDirty(pages[i])) - bch2_set_page_dirty(mapping, pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); - } + while (set_dirty < copied) { + struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, copied - set_dirty, + PAGE_SIZE - pg_offset); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); + unlock_page(page); + put_page(page); + + set_dirty += pg_len; + } +out: for (i = nr_pages_copied; i < nr_pages; i++) { - if (!PageDirty(pages[i])) - bch2_put_page_reservation(c, inode, pages[i]); unlock_page(pages[i]); put_page(pages[i]); } + bch2_page_reservation_put(c, inode, &res); + return copied ?: ret; } @@ -2322,7 +2365,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. 
*/ - ret = bch2_get_page_reservation(c, inode, page, false); + ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); if (index == start >> PAGE_SHIFT && @@ -2333,8 +2376,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, else if (index == end >> PAGE_SHIFT) zero_user_segment(page, 0, end_offset); - if (!PageDirty(page)) - bch2_set_page_dirty(mapping, page); + filemap_dirty_folio(mapping, page_folio(page)); unlock: unlock_page(page); put_page(page); -- cgit From 543ef2ebcd90686d999f18b0a874690b7976b239 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2019 13:49:17 -0400 Subject: bcachefs: Handle partial pages in seek data/hole Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 85 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4efe985da96b..f59c6321f530 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2833,22 +2833,20 @@ long bch2_fallocate_dispatch(struct file *file, int mode, /* fseek: */ -static bool folio_is_data(struct folio *folio) +static int folio_data_offset(struct folio *folio, unsigned offset) { struct bch_page_state *s = bch2_page_state(&folio->page); unsigned i; - if (!s) - return false; - - for (i = 0; i < PAGE_SECTORS; i++) - if (s->s[i].state >= SECTOR_DIRTY) - return true; + if (s) + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state >= SECTOR_DIRTY) + return i << 9; - return false; + return -1; } -static loff_t bch2_next_pagecache_data(struct inode *vinode, +static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, loff_t end_offset) { @@ -2857,6 +2855,8 @@ static loff_t bch2_next_pagecache_data(struct inode *vinode, pgoff_t end_index = end_offset >> PAGE_SHIFT; pgoff_t index = start_index; unsigned i; + loff_t ret; + int offset; folio_batch_init(&fbatch); @@ -2866,14 +2866,17 @@ static loff_t bch2_next_pagecache_data(struct inode *vinode, struct folio *folio = fbatch.folios[i]; folio_lock(folio); - if (folio_is_data(folio)) { - end_offset = - min(end_offset, - max(start_offset, - ((loff_t) folio->index) << PAGE_SHIFT)); + offset = folio_data_offset(folio, + folio->index == start_index + ? 
start_offset & (PAGE_SIZE - 1) + : 0); + if (offset >= 0) { + ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + + offset, + start_offset, end_offset); folio_unlock(folio); folio_batch_release(&fbatch); - return end_offset; + return ret; } folio_unlock(folio); } @@ -2916,7 +2919,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return ret; if (next_data > offset) - next_data = bch2_next_pagecache_data(&inode->v, + next_data = bch2_seek_pagecache_data(&inode->v, offset, next_data); if (next_data >= isize) @@ -2925,34 +2928,56 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) +static int __page_hole_offset(struct page *page, unsigned offset) { + struct bch_page_state *s = bch2_page_state(page); + unsigned i; + + if (!s) + return 0; + + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state < SECTOR_DIRTY) + return i << 9; + + return -1; +} + +static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) +{ + pgoff_t index = offset >> PAGE_SHIFT; struct page *page; - bool ret; + int pg_offset; + loff_t ret = -1; page = find_lock_page(mapping, index); if (!page) - return false; + return offset; + + pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); + if (pg_offset >= 0) + ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; - ret = folio_is_data(page_folio(page)); unlock_page(page); return ret; } -static loff_t bch2_next_pagecache_hole(struct inode *vinode, +static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; - pgoff_t index; + loff_t offset = start_offset, hole; - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) - if (!page_slot_is_data(mapping, index)) - end_offset = max(start_offset, - ((loff_t) index) << PAGE_SHIFT); + while (offset < end_offset) { + hole = page_hole_offset(mapping, offset); + if (hole >= 0 && hole <= end_offset) + return max(start_offset, hole); + + offset += PAGE_SIZE; + offset &= PAGE_MASK; + } return end_offset; } @@ -2977,11 +3002,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); break; } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), k.k->p.offset << 9); -- cgit From 2ba5d38b50964bc25075318a2a6fb4a886d492b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2019 14:18:29 -0400 Subject: bcachefs: Count reserved extents as holes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f59c6321f530..f1eb9b902db9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -519,6 +519,7 @@ struct bch_page_sector { /* i_sectors: */ enum { SECTOR_UNALLOCATED, + SECTOR_RESERVED, SECTOR_DIRTY, SECTOR_ALLOCATED, } state:2; @@ -763,10 +764,10 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED) { - 
s->s[i].state = SECTOR_DIRTY; + if (s->s[i].state == SECTOR_UNALLOCATED) dirty_sectors++; - } + + s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); } if (dirty_sectors) @@ -934,6 +935,9 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bvec_iter iter; struct bio_vec bv; unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); + unsigned state = k.k->type == KEY_TYPE_reservation + ? SECTOR_RESERVED + : SECTOR_ALLOCATED; BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); BUG_ON(bio_end_sector(bio) > k.k->p.offset); @@ -947,7 +951,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) i < (bv.bv_offset + bv.bv_len) >> 9; i++) { s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = SECTOR_ALLOCATED; + s->s[i].state = state; } } } @@ -1342,7 +1346,7 @@ do_io: orig = *s; for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state == SECTOR_UNALLOCATED) + if (s->s[i].state < SECTOR_DIRTY) continue; nr_replicas_this_write = @@ -1352,7 +1356,7 @@ do_io: } for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state == SECTOR_UNALLOCATED) + if (s->s[i].state < SECTOR_DIRTY) continue; s->s[i].nr_replicas = w->opts.compression @@ -1376,7 +1380,7 @@ do_io: u64 sector; while (offset < PAGE_SECTORS && - orig.s[offset].state == SECTOR_UNALLOCATED) + orig.s[offset].state < SECTOR_DIRTY) offset++; if (offset == PAGE_SECTORS) @@ -1385,7 +1389,7 @@ do_io: sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state != SECTOR_UNALLOCATED) + orig.s[offset + sectors].state >= SECTOR_DIRTY) sectors++; for (i = offset; i < offset + sectors; i++) { -- cgit From a99b1caf473461d9269b693286533cc1b7c50d46 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2019 11:19:58 -0400 Subject: bcachefs: Truncate/fpunch now works on block boundaries, not page Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 55 +++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f1eb9b902db9..913e26487efb 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2324,8 +2324,10 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_page_state *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned i; struct page *page; int ret = 0; @@ -2357,12 +2359,32 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, } } + s = bch2_page_state_create(page, 0); + if (!s) { + ret = -ENOMEM; + goto unlock; + } + if (!PageUptodate(page)) { ret = bch2_read_single_page(page, mapping); if (ret) goto unlock; } + if (index != start >> PAGE_SHIFT) + start_offset = 0; + if (index != end >> PAGE_SHIFT) + end_offset = PAGE_SIZE; + + for (i = round_up(start_offset, block_bytes(c)) >> 9; + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; + s->s[i].state = SECTOR_UNALLOCATED; + } + + zero_user_segment(page, start_offset, end_offset); + /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 
* @@ -2372,14 +2394,6 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); - if (index == start >> PAGE_SHIFT && - index == end >> PAGE_SHIFT) - zero_user_segment(page, start_offset, end_offset); - else if (index == start >> PAGE_SHIFT) - zero_user_segment(page, start_offset, PAGE_SIZE); - else if (index == end >> PAGE_SHIFT) - zero_user_segment(page, 0, end_offset); - filemap_dirty_folio(mapping, page_folio(page)); unlock: unlock_page(page); @@ -2391,7 +2405,7 @@ out: static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) { return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); + from, round_up(from, PAGE_SIZE)); } static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) @@ -2483,12 +2497,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - /* - * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() - * here: - */ ret = __bch2_fpunch(c, inode, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, + round_up(iattr->ia_size, block_bytes(c)) >> 9, U64_MAX, &inode->ei_journal_seq); if (unlikely(ret)) goto err; @@ -2510,8 +2520,8 @@ err: static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; - u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + u64 discard_start = round_up(offset, block_bytes(c)) >> 9; + u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; int ret = 0; inode_lock(&inode->v); @@ -2596,7 +2606,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, while (bkey_cmp(dst->pos, POS(inode->v.i_ino, - round_up(new_size, PAGE_SIZE) >> 9)) < 0) { + round_up(new_size, block_bytes(c)) >> 9)) < 0) { struct disk_reservation disk_res; ret = bch2_btree_iter_traverse(dst); @@ -2671,8 +2681,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, struct btree_trans trans; struct btree_iter *iter; struct bpos end_pos; - loff_t block_start, block_end; - loff_t end = offset + len; + loff_t end = offset + len; + loff_t block_start = round_down(offset, block_bytes(c)); + loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; unsigned replicas = io_opts(c, inode).data_replicas; int ret; @@ -2704,12 +2715,6 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, goto err; truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(end, PAGE_SIZE); - } else { - block_start = round_down(offset, PAGE_SIZE); - block_end = round_up(end, PAGE_SIZE); } iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -- cgit From 5b6d40e2b670efc2feff4da9dd065053f5be31a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2019 17:59:21 -0400 Subject: bcachefs: Export correct blocksize to vfs Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 1b3898eae8b8..54e555fb4d5d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1147,7 +1147,8 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, return finish_open_simple(file, 0); } -static int bch2_fill_extent(struct fiemap_extent_info *info, +static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, struct bkey_s_c k, 
unsigned flags) { if (bkey_extent_is_data(k.k)) { @@ -1165,8 +1166,8 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, else offset += p.crc.offset; - if ((offset & (PAGE_SECTORS - 1)) || - (k.k->size & (PAGE_SECTORS - 1))) + if ((offset & (c->opts.block_size - 1)) || + (k.k->size & (c->opts.block_size - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, @@ -1223,7 +1224,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (have_extent) { - ret = bch2_fill_extent(info, + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), 0); if (ret) break; @@ -1235,7 +1236,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, } if (!ret && have_extent) - ret = bch2_fill_extent(info, bkey_i_to_s_c(&prev.k), + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; @@ -1803,9 +1804,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, goto out; } - /* XXX: blocksize */ - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_blocksize = block_bytes(c); + sb->s_blocksize_bits = ilog2(block_bytes(c)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; -- cgit From e3d3a9d91a850445e33037d8fabd3930fd9aa208 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2019 12:07:13 -0400 Subject: bcachefs: trans_get_key() now works correctly for extents More prep work for reflink: for extents, we're not looking for an exact mach on pos, rather that the pos is within the range of the key the iterator points to. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b1914fba1f7f..4e7c58c518d3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1342,15 +1342,20 @@ static int trans_get_key(struct btree_trans *trans, struct btree_iter **iter, struct bkey_s_c *k) { - unsigned i; + struct btree_insert_entry *i; int ret; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter->btree_id == btree_id && - !bkey_cmp(pos, trans->updates[i].iter->pos)) { - *iter = trans->updates[i].iter; - *k = bkey_i_to_s_c(trans->updates[i].k); + for (i = trans->updates; + i < trans->updates + trans->nr_updates; + i++) + if (!i->deferred && + i->iter->btree_id == btree_id && + (btree_node_type_is_extents(btree_id) + ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && + bkey_cmp(pos, i->k->k.p) < 0 + : !bkey_cmp(pos, i->iter->pos))) { + *iter = i->iter; + *k = bkey_i_to_s_c(i->k); return 0; } -- cgit From c8b18c37b2e52a3743be9d235c7f6136fa7940b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Aug 2019 17:12:37 -0400 Subject: bcachefs: fix for_each_btree_key() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 710ed70ec807..b54351073231 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -242,7 +242,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, (_start), (_flags))) ?: \ PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_peek(_iter, _flags)).k); \ - !ret && (_k).k; \ + !_ret && (_k).k; \ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_next(_iter, _flags)).k)) -- cgit From 1904a65a315d78e6357aed36d3e7e9a5c2563370 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2019 14:35:34 -0400 Subject: bcachefs: Ensure bch2_trans_get_iter() returns iters with correct locks Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9e6faf7e2830..a91d655035ef 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -86,7 +86,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(btree_node_read_locked(iter, b->c.level)); + EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); trans_for_each_iter(iter->trans, linked) if (linked->l[b->c.level].b == b && @@ -1779,6 +1779,12 @@ found: iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + + if ((iter->flags & BTREE_ITER_INTENT) && + !bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } } BUG_ON(iter->btree_id != btree_id); -- cgit From b3fce09cd33271d8dcdd3b18527d106558e30728 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2019 03:16:52 -0400 Subject: bcachefs: Mark space as unallocated on write failure Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 913e26487efb..91a34ca5e4e1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1219,11 +1219,20 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio *bio = &io->op.op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bvec; + unsigned i; if (io->op.op.error) { bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s; + SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); + + lock_page(bvec->bv_page); + s = bch2_page_state(bvec->bv_page); + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; + unlock_page(bvec->bv_page); } } -- cgit From 2cbe5cfe2719c7ffe7a7dd90565be26f7a2adcbb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Aug 2019 13:01:10 -0400 Subject: bcachefs: Rework calling convention for marking overwrites Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 +- fs/bcachefs/btree_gc.c | 5 +- fs/bcachefs/btree_update_interior.c | 21 +++--- fs/bcachefs/buckets.c | 128 
+++++++++++++++++++----------------- fs/bcachefs/buckets.h | 16 +++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/recovery.c | 2 +- 7 files changed, 95 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8b995dbc5018..5619dccdc011 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -232,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, 0, NULL, 0, + bch2_mark_key(c, k, 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); @@ -244,7 +244,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) - bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(j->k), + 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f93e1d769113..24cf28bf665b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -173,7 +173,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -420,7 +420,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0b80dca5656a..946254c51a69 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -194,7 +194,7 @@ found: : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -266,11 +266,12 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -1077,11 +1078,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, &fs_usage->u, 0, + 0, 0, &fs_usage->u, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); @@ -1175,12 +1176,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, &fs_usage->u, 0, + 0, 0, &fs_usage->u, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) 
bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); @@ -2003,11 +2004,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, &fs_usage->u, 0, + 0, 0, &fs_usage->u, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT|| BCH_BUCKET_MARK_GC); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4e7c58c518d3..569c9e4a55aa 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -811,23 +811,24 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, } static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - s64 delta) + unsigned offset, s64 delta, + unsigned flags) { - if (delta > 0) { - /* - * marking a new extent, which _will have size_ @delta - * - * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE - * case, we haven't actually created the key we'll be inserting - * yet (for the split) - so we don't want to be using - * k->size/crc.live_size here: - */ - return __ptr_disk_sectors(p, delta); + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + BUG_ON(offset + -delta > p.crc.live_size); + + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, offset) + + __ptr_disk_sectors(p, p.crc.live_size - + offset + delta); + } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + BUG_ON(offset + -delta > p.crc.live_size); + + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, p.crc.live_size + + delta); } else { - BUG_ON(-delta > p.crc.live_size); - - return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - - (s64) ptr_disk_sectors(p); + return ptr_disk_sectors(p); } } @@ -1006,7 +1007,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, } static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, + unsigned offset, s64 sectors, + enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { @@ -1027,7 +1029,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, fs_usage, journal_seq, flags); @@ -1116,7 +1118,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, } int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, s64 sectors, + struct bkey_s_c k, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -1137,11 +1140,11 @@ int bch2_mark_key_locked(struct bch_fs *c, ? 
c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: @@ -1172,14 +1175,14 @@ int bch2_mark_key_locked(struct bch_fs *c, } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, sectors, + ret = bch2_mark_key_locked(c, k, offset, sectors, fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); @@ -1195,8 +1198,11 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b = iter->l[0].b; + unsigned offset = 0; s64 sectors = 0; + flags |= BCH_BUCKET_MARK_OVERWRITE; + if (btree_node_is_extents(b) ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) @@ -1205,35 +1211,33 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); sectors = bkey_start_offset(&new->k) - old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(old.k) - new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = old.k->p.offset - new->k.p.offset; - BUG_ON(sectors <= 0); - - bch2_mark_key_locked(c, old, sectors, - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); - - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - return bch2_mark_key_locked(c, old, sectors, fs_usage, - trans->journal_res.seq, - BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; + return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; } int bch2_mark_update(struct btree_trans *trans, @@ -1251,10 +1255,12 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; + EBUG_ON(btree_node_is_extents(b) && + !bch2_extent_is_atomic(insert->k, insert->iter)); + if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), + 0, insert->k->k.size, fs_usage, trans->journal_res.seq, BCH_BUCKET_MARK_INSERT|flags); @@ -1519,8 +1525,9 @@ out: } static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type) + struct bkey_s_c k, unsigned offset, + s64 sectors, unsigned flags, + enum bch_data_type data_type) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1540,7 +1547,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? 
sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); ret = bch2_trans_mark_pointer(trans, p, disk_sectors, data_type); @@ -1575,7 +1582,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, - s64 sectors, unsigned flags) + unsigned offset, s64 sectors, unsigned flags) { struct replicas_delta_list *d; struct bch_fs *c = trans->c; @@ -1586,11 +1593,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ? c->opts.btree_node_size : -c->opts.btree_node_size; - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_BTREE); + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_BTREE); case KEY_TYPE_extent: - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_USER); + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_USER); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); @@ -1629,11 +1636,11 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert), - bpos_min(insert->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k), - BCH_BUCKET_MARK_INSERT); + EBUG_ON(btree_node_is_extents(b) && + !bch2_extent_is_atomic(insert, iter)); + + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), + 0, insert->k.size, BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1641,7 +1648,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; + unsigned offset = 0; s64 sectors = 0; + unsigned flags = BCH_BUCKET_MARK_OVERWRITE; k = bkey_disassemble(b, _k, &unpacked); @@ -1653,35 +1662,32 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&insert->k, k.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) k.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); sectors = bkey_start_offset(&insert->k) - k.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(k.k) - insert->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k.p.offset; - BUG_ON(sectors <= 0); - - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_INSERT); - if (ret) - return ret; - - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + sectors = -((s64) insert->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_OVERWRITE); + ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); if (ret) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 46eb493b42ca..296d250e58dd 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -251,14 +251,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_INSERT (1 << 0) #define BCH_BUCKET_MARK_OVERWRITE (1 << 1) -#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) -#define BCH_BUCKET_MARK_GC (1 << 3) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) +#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 4) +#define 
BCH_BUCKET_MARK_ALLOC_READ (1 << 5) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, struct disk_reservation *, unsigned); @@ -272,7 +273,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 77a5c3613ff7..2eb33a8460c9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1312,7 +1312,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) break; } - bch2_mark_key(c, k, 0, NULL, 0, + bch2_mark_key(c, k, 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 92867b5c078f..a7fc3fe4284a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -295,7 +295,7 @@ retry: if (split_compressed) { ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), - -((s64) k->k.size), + 0, -((s64) k->k.size), BCH_BUCKET_MARK_OVERWRITE) ?: bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_ATOMIC| -- cgit From 63f1a59847522b5218d1116c12afd9c9ce6c6645 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2019 15:17:09 -0400 Subject: bcachefs: Improved debug checks Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 43 ++++++++++++++++++++++++++++++------------- fs/bcachefs/btree_iter.c | 21 ++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 68442a26756f..78e6fd3f1306 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -21,6 +21,16 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ + unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { return bch2_bkey_to_bset_inlined(b, k); @@ -98,7 +108,8 @@ void bch2_dump_btree_node_iter(struct btree *b, { struct btree_node_iter_set *set; - printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); btree_node_iter_for_each(iter, set) { struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); @@ -107,8 +118,8 @@ void bch2_dump_btree_node_iter(struct btree *b, char buf[100]; bch2_bkey_to_text(&PBUF(buf), &uk); - printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, - k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf); } } @@ -170,8 +181,12 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) { struct btree_node_iter_set 
*set, *s2; + struct bkey_packed *k, *p; struct bset_tree *t; + if (bch2_btree_node_iter_end(iter)) + return; + /* Verify no duplicates: */ btree_node_iter_for_each(iter, set) btree_node_iter_for_each(iter, s2) @@ -192,6 +207,18 @@ found: btree_node_iter_for_each(iter, set) BUG_ON(set != iter->data && btree_node_iter_cmp(b, set[-1], set[0]) > 0); + + k = bch2_btree_node_iter_peek_all(iter, b); + + for_each_bset(b, t) { + if (iter->data[0].end == t->end_offset) + continue; + + p = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + + BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); + } } void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, @@ -1652,16 +1679,6 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, __bch2_btree_node_iter_advance(iter, b); } -static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -{ - unsigned n = ARRAY_SIZE(iter->data); - - while (n && __btree_node_iter_set_end(iter, n - 1)) - --n; - - return n; -} - /* * Expensive: */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a91d655035ef..a702eb3bbefb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1477,6 +1477,8 @@ recheck: EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); EBUG_ON(bkey_deleted(k.k)); iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return k; } @@ -1507,6 +1509,8 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1539,19 +1543,18 @@ recheck: goto recheck; } - if (k.k && - !bkey_deleted(k.k) && - !bkey_cmp(iter->pos, k.k->p)) { - iter->uptodate = BTREE_ITER_UPTODATE; - return k; - } else { + if (!k.k || + bkey_deleted(k.k) || + bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); iter->k.p = iter->pos; - - iter->uptodate = BTREE_ITER_UPTODATE; - return (struct bkey_s_c) { &iter->k, NULL }; + k = (struct bkey_s_c) { &iter->k, NULL }; } + + iter->uptodate = BTREE_ITER_UPTODATE; + __bch2_btree_iter_verify(iter, l->b); + return k; } struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -- cgit From 9df279407a2daaf8e6586be483632fe9aaca6ef3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2019 15:54:48 -0400 Subject: bcachefs: Fix __bch2_btree_iter_peek_slot_extents() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a702eb3bbefb..52932bbdb832 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1436,8 +1436,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_deleted(k.k) && - bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) + bkey_cmp(k.k->p, iter->pos) <= 0) bch2_btree_node_iter_advance(&l->iter, l->b); /* -- cgit From e67ab0450cca7dc1673e4cd00eecf9d896b15889 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2019 13:43:01 -0400 Subject: bcachefs: Fix bch2_btree_node_iter_prev_filter() bch2_btree_node_iter_prev_filter() tried to be smart about iterating backwards when skipping over whiteouts/discards - but unfortunately, doing so can leave the node iterator in an inconsistent state; the sane solution is to just always iterate backwards one key at a time. 
But we compact btree nodes when more than a quarter of the keys are whiteouts/discards, so the optimization wasn't buying us that much anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 35 +++++++++++++++++------------------ fs/bcachefs/bset.h | 8 ++------ 2 files changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 78e6fd3f1306..1dd2bcc69c35 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1682,12 +1682,10 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, /* * Expensive: */ -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) { struct bkey_packed *k, *prev = NULL; - struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; unsigned end = 0; @@ -1695,9 +1693,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { - k = bch2_bkey_prev_filter(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t), - min_key_type); + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); if (k && (!prev || bkey_iter_cmp(b, k, prev) > 0)) { prev = k; @@ -1706,7 +1703,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite } if (!prev) - goto out; + return NULL; /* * We're manually memmoving instead of just calling sort() to ensure the @@ -1727,18 +1724,20 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; -out: - if (btree_keys_expensive_checks(b)) { - struct btree_node_iter iter2 = *iter; - if (prev) - __bch2_btree_node_iter_advance(&iter2, b); + bch2_btree_node_iter_verify(iter, b); + return prev; +} - while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { - BUG_ON(k->type >= min_key_type); - __bch2_btree_node_iter_advance(&iter2, b); - } - } +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); + } while (prev && prev->type < min_key_type); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index da3e41cc9757..209d2ed5db3a 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -543,15 +543,11 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) return ret; } +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, struct btree *, unsigned); -static inline struct bkey_packed * -bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) -{ - return bch2_btree_node_iter_prev_filter(iter, b, 0); -} - static inline struct bkey_packed * bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) { -- cgit From 23bbd2bb8f2cbe2679b404f730cbfd6bf42bd878 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2019 17:43:47 -0400 Subject: bcachefs: Fix bch2_btree_node_iter_fix() bch2_btree_node_iter_prev_all() depends on an invariant that wasn't being maintained for extent leaf nodes - specifically, the node iterator may not have advanced past any keys that compare after the key the node iterator points to. 
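The invariant is easier to see with a minimal standalone sketch (toy C, illustrative only - the names and layout below are assumptions for the example, not the kernel's actual node iterator): model the iterator as a position into each of two sorted sets, where peek() returns the smallest key at or after any set's position and prev() returns the largest key before any set's position. Once one set's position has advanced past a key that sorts after the key the iterator currently points to, prev() hands back a key that comes after the current key, which is how bch2_btree_node_iter_prev_all() breaks:

/* Toy model of a node iterator over two sorted "bsets" - illustrative
 * assumptions only, not the kernel's actual data structures.
 */
#include <stdio.h>

struct toy_set { const int *keys; int nr; int pos; };

/* peek: smallest key at or after each set's position */
static const int *toy_peek(struct toy_set *s, int nr_sets)
{
	const int *best = NULL;
	for (int i = 0; i < nr_sets; i++)
		if (s[i].pos < s[i].nr &&
		    (!best || s[i].keys[s[i].pos] < *best))
			best = &s[i].keys[s[i].pos];
	return best;
}

/* prev: largest key strictly before each set's position */
static const int *toy_prev(struct toy_set *s, int nr_sets)
{
	const int *best = NULL;
	for (int i = 0; i < nr_sets; i++)
		if (s[i].pos > 0 &&
		    (!best || s[i].keys[s[i].pos - 1] > *best))
			best = &s[i].keys[s[i].pos - 1];
	return best;
}

int main(void)
{
	const int a[] = { 10, 30, 50 };
	const int b[] = { 20, 40, 60 };
	struct toy_set sets[2] = { { a, 3, 0 }, { b, 3, 0 } };

	/* Invariant maintained: only the set holding the minimum advances */
	sets[0].pos = 1;	/* consumed 10 */
	printf("peek %d prev %d\n", *toy_peek(sets, 2), *toy_prev(sets, 2));
	/* prints "peek 20 prev 10": prev is a key before the current one */

	/* Invariant violated: set b's position skipped past 20 and 40,
	 * both of which sort after the key the iterator points to (30)
	 */
	sets[1].pos = 2;
	printf("peek %d prev %d\n", *toy_peek(sets, 2), *toy_prev(sets, 2));
	/* prints "peek 30 prev 40": "prev" now comes after the current key */
	return 0;
}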
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 92 ++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 52932bbdb832..a278921d3e6f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -496,6 +496,23 @@ static inline void __bch2_btree_iter_verify(struct btree_iter *iter, #endif +static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(iter, b); + return; + } + + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, @@ -527,7 +544,8 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bch2_btree_node_iter_peek_all(node_iter, b), &iter->k); } - return; + + goto iter_current_key_not_modified; found: set->end = t->end_offset; @@ -569,60 +587,42 @@ found: bkey_disassemble(l->b, k, &iter->k); } iter_current_key_not_modified: - /* - * Interior nodes are special because iterators for interior nodes don't - * obey the usual invariants regarding the iterator position: - * - * We may have whiteouts that compare greater than the iterator - * position, and logically should be in the iterator, but that we - * skipped past to find the first live key greater than the iterator - * position. This becomes an issue when we insert a new key that is - * greater than the current iterator position, but smaller than the - * whiteouts we've already skipped past - this happens in the course of - * a btree split. - * - * We have to rewind the iterator past to before those whiteouts here, - * else bkey_node_iter_prev() is not going to work and who knows what - * else would happen. And we have to do it manually, because here we've - * already done the insert and the iterator is currently inconsistent: - * - * We've got multiple competing invariants, here - we have to be careful - * about rewinding iterators for interior nodes, because they should - * always point to the key for the child node the btree iterator points - * to. + * When a new key is added, and the node iterator now points to that + * key, the iterator might have skipped past deleted keys that should + * come after the key the iterator now points to. 
We have to rewind to + * before those deleted keys - otherwise bch2_btree_node_iter_prev_all() + * breaks: */ - if (b->c.level && new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { - struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where); - struct bkey_packed *k; + if (!bch2_btree_node_iter_end(node_iter) && + (b->c.level || + (iter->flags & BTREE_ITER_IS_EXTENTS))) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; + + k = bch2_btree_node_iter_peek_all(node_iter, b); for_each_bset(b, t) { - if (where_set == t) + bool set_pos = false; + + if (node_iter->data[0].end == t->end_offset) continue; - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(node_iter, b, t)); - if (k && - bkey_iter_cmp(b, k, where) > 0) { - struct btree_node_iter_set *set; - unsigned offset = - __btree_node_key_to_offset(b, bkey_next(k)); - - btree_node_iter_for_each(node_iter, set) - if (set->k == offset) { - set->k = __btree_node_key_to_offset(b, k); - bch2_btree_node_iter_sort(node_iter, b); - goto next_bset; - } - - bch2_btree_node_iter_push(node_iter, b, k, - btree_bkey_last(b, t)); + k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + while ((p = bch2_bkey_prev_all(b, t, k2)) && + bkey_iter_cmp(b, k, p) < 0) { + k2 = p; + set_pos = true; } -next_bset: - t = t; + + if (set_pos) + btree_node_iter_set_set_pos(node_iter, + b, t, k2); } } + + bch2_btree_node_iter_verify(node_iter, b); } void bch2_btree_node_iter_fix(struct btree_iter *iter, -- cgit From 63069bb6bf03ac7a55c53886a1380899df3a176d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2019 17:46:22 -0400 Subject: bcachefs: Move node iterator fixup to extent_bset_insert() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9f17780b8bc0..0e43e81065b6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -915,6 +915,16 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + node_iter = l->iter; + while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) + l->iter = node_iter; + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); bch2_bset_insert(l->b, &l->iter, k, insert, 0); @@ -1203,19 +1213,6 @@ next: overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - { - struct btree_node_iter node_iter = l->iter; - - while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) - l->iter = node_iter; - } } /** -- cgit From 3c7f3b7aeb73f2155aec9d00567b70ef55ede465 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2019 09:58:07 -0400 Subject: bcachefs: Refactor bch2_extent_trim_atomic() for reflink Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 8 ++-- fs/bcachefs/buckets.c | 6 --- fs/bcachefs/extents.c | 87 ++++++++++++++++++++++++++++++++++------- fs/bcachefs/extents.h | 6 ++- 
fs/bcachefs/fs-io.c | 8 +++- fs/bcachefs/io.c | 11 +++++- fs/bcachefs/recovery.c | 10 ++++- 7 files changed, 105 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b878f9a9882c..5f94b6e9cf28 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -400,8 +400,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !bch2_extent_is_atomic(i->k, i->iter)); - + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && !(trans->flags & BTREE_INSERT_ATOMIC)); } @@ -1031,7 +1030,10 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_extent_trim_atomic(&delete, iter); + + ret = bch2_extent_trim_atomic(&delete, iter); + if (ret) + break; } bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 569c9e4a55aa..baf9642d21ca 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1255,9 +1255,6 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - EBUG_ON(btree_node_is_extents(b) && - !bch2_extent_is_atomic(insert->k, insert->iter)); - if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), 0, insert->k->k.size, @@ -1636,9 +1633,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - EBUG_ON(btree_node_is_extents(b) && - !bch2_extent_is_atomic(insert, iter)); - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), 0, insert->k.size, BCH_BUCKET_MARK_INSERT); if (ret) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e43e81065b6..11defa3d99a5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -949,47 +949,104 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return ret; } -static inline struct bpos -bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) +static int __bch2_extent_atomic_end(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + return 0; + } + + break; + } + + return ret; +} + +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) { struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; - unsigned nr_alloc_ptrs = + unsigned nr_iters = bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); + int ret = 0; BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), + 0, end, &nr_iters, 10); + if (ret) + return ret; + + while (nr_iters < 20 && + (_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; - if 
(bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) break; - nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); - if (nr_alloc_ptrs > 20) { - BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); - return bpos_min(insert->k.p, k.k->p); - } + ret = __bch2_extent_atomic_end(trans, k, offset, + end, &nr_iters, 20); + if (ret) + return ret; + + if (nr_iters >= 20) + break; bch2_btree_node_iter_advance(&node_iter, b); } - return bpos_min(insert->k.p, b->key.k.p); + return 0; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, &k->k); + return 0; } -bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); } enum btree_insert_ret diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 035d15bbca39..156d8e37045a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -425,8 +425,10 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 91a34ca5e4e1..ef94aecaa7cb 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -310,7 +310,9 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - bch2_extent_trim_atomic(k, extent_iter); + ret = bch2_extent_trim_atomic(k, extent_iter); + if (ret) + return ret; ret = sum_sector_overwrites(trans, extent_iter, k, &allocating, @@ -2634,7 +2636,9 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; - bch2_extent_trim_atomic(©.k, dst); + ret = bch2_extent_trim_atomic(©.k, dst); + if (ret) + goto bkey_err; BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index fd1aceea3553..ed84572a9e67 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -274,6 +274,8 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_verify_keylist_sorted(keys); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -284,7 +286,9 @@ int bch2_write_index_default(struct bch_write_op *op) bkey_copy(&split.k, bch2_keylist_front(keys)); - bch2_extent_trim_atomic(&split.k, iter); + ret = bch2_extent_trim_atomic(&split.k, iter); + if (ret) + break; 
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); @@ -301,6 +305,11 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); + if (ret == -EINTR) { + ret = 0; + goto retry; + } + bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a7fc3fe4284a..3742b241807c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -247,6 +247,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i *split; + struct bpos atomic_end; bool split_compressed = false; int ret; @@ -273,9 +274,14 @@ retry: if (ret) goto err; + ret = bch2_extent_atomic_end(&trans, split_iter, + k, &atomic_end); + if (ret) + goto err; + if (!split_compressed && bch2_extent_is_compressed(bkey_i_to_s_c(k)) && - !bch2_extent_is_atomic(k, split_iter)) { + bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), @@ -287,7 +293,7 @@ retry: bkey_copy(split, k); bch2_cut_front(split_iter->pos, split); - bch2_extent_trim_atomic(split, split_iter); + bch2_cut_back(atomic_end, &split->k); bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); bch2_btree_iter_set_pos(iter, split->k.p); -- cgit From 76426098e419c1732efc3f88166f3f3592c215c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2019 09:59:56 -0400 Subject: bcachefs: Reflink Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 26 +++- fs/bcachefs/bkey.h | 2 + fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/btree_types.h | 9 +- fs/bcachefs/btree_update_leaf.c | 3 +- fs/bcachefs/buckets.c | 100 +++++++++++++- fs/bcachefs/extents.c | 50 +++++-- fs/bcachefs/extents.h | 19 ++- fs/bcachefs/fs-io.c | 218 ++++++++++++++++++++++------- fs/bcachefs/fs-io.h | 19 +++ fs/bcachefs/fs.c | 42 +++++- fs/bcachefs/fs.h | 15 +- fs/bcachefs/io.c | 127 +++++++++++++---- fs/bcachefs/io.h | 3 + fs/bcachefs/migrate.c | 13 +- fs/bcachefs/move.c | 98 ++++++++----- fs/bcachefs/move.h | 3 +- fs/bcachefs/recovery.c | 18 +-- fs/bcachefs/reflink.c | 300 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/reflink.h | 32 +++++ fs/bcachefs/replicas.c | 1 + 23 files changed, 945 insertions(+), 159 deletions(-) create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index c29ccdb45965..4c2608409144 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -44,6 +44,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + reflink.o \ replicas.o \ siphash.o \ six.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 68e2d3b1a9a6..410fce3ed8d4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -361,6 +361,7 @@ enum gc_phase { GC_PHASE_BTREE_XATTRS, GC_PHASE_BTREE_ALLOC, GC_PHASE_BTREE_QUOTAS, + GC_PHASE_BTREE_REFLINK, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -750,6 +751,9 @@ struct bch_fs { struct work_struct ec_stripe_delete_work; struct llist_head ec_stripe_delete_list; + /* REFLINK */ + u64 reflink_hint; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b8aafd2e283a..62afea1e7ec3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ 
b/fs/bcachefs/bcachefs_format.h @@ -340,7 +340,9 @@ static inline void bkey_init(struct bkey *k) x(xattr, 11) \ x(alloc, 12) \ x(quota, 13) \ - x(stripe, 14) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -895,6 +897,24 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + + __le32 reservation_generation; + __u8 nr_replicas; + __u8 pad[3]; +}; + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1297,6 +1317,7 @@ enum bch_sb_features { BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NR, }; @@ -1487,7 +1508,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") + x(EC, 6, "erasure_coding") \ + x(REFLINK, 7, "reflink") enum btree_id { #define x(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index b3a08e52e6b3..321fe6fe0b55 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -560,6 +560,8 @@ BKEY_VAL_ACCESSORS(xattr); BKEY_VAL_ACCESSORS(alloc); BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); +BKEY_VAL_ACCESSORS(reflink_p); +BKEY_VAL_ACCESSORS(reflink_v); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8af16ca994e0..6fa6ac1fadc1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -10,6 +10,7 @@ #include "extents.h" #include "inode.h" #include "quota.h" +#include "reflink.h" #include "xattr.h" const char * const bch2_bkey_types[] = { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ec14e2deecb7..621cbfa22fc9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -464,7 +464,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return type == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_REFLINK: + return true; + default: + return false; + } } static inline bool btree_node_is_extents(struct btree *b) @@ -480,6 +486,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: case BKEY_TYPE_EC: + case BKEY_TYPE_REFLINK: return true; default: return false; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5f94b6e9cf28..443ffb5c709d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -521,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans, { return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES); + i->iter->btree_id == BTREE_ID_INODES || + i->iter->btree_id == BTREE_ID_REFLINK); } static inline bool update_has_triggers(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index baf9642d21ca..3d243f2d1095 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -972,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, 
spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); - return -1; + return -EIO; } BUG_ON(m->r.e.data_type != data_type); @@ -1144,6 +1144,7 @@ int bch2_mark_key_locked(struct bch_fs *c, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; @@ -1304,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, xchg(&warned_disk_usage, 1)) return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + bch_err(c, "disk usage increased more than %llu sectors reserved", + disk_res_sectors); trans_for_each_update_iter(trans, i) { struct btree_iter *iter = i->iter; @@ -1319,7 +1321,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, node_iter = iter->l[0].iter; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; @@ -1471,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, s64 sectors, enum bch_data_type data_type) { + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; @@ -1487,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, return ret; if (k.k->type != KEY_TYPE_stripe) { - bch_err_ratelimited(trans->c, - "pointer to nonexistent stripe %llu", - (u64) p.idx); - ret = -1; + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -EIO; goto out; } @@ -1578,6 +1581,84 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_REFLINK, + POS(0, idx), &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_v) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto err; + + bkey_reassemble(new_k, k); + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 
1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } +out: + ret = k.k->p.offset - idx; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, unsigned offset, + s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + break; + + idx += ret; + sectors = max_t(s64, 0LL, sectors - ret); + ret = 0; + } + + return ret; +} + int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, s64 sectors, unsigned flags) { @@ -1593,6 +1674,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_BTREE); case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_USER); case KEY_TYPE_inode: @@ -1616,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d->fs_usage.persistent_reserved[replicas - 1] += sectors; return 0; } + case KEY_TYPE_reflink_p: + return bch2_trans_mark_reflink_p(trans, + bkey_s_c_to_reflink_p(k), + offset, sectors, flags); default: return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 11defa3d99a5..81ec55526ce9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -744,7 +744,8 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) case KEY_TYPE_error: case KEY_TYPE_cookie: break; - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; @@ -774,6 +775,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) break; } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } case KEY_TYPE_reservation: break; default: @@ -968,6 +975,33 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, } break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = end->offset - bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } } return ret; @@ -1561,17 +1595,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return false; } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - 
extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1582,7 +1616,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 156d8e37045a..cef93af25858 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -306,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -436,8 +444,8 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); @@ -452,17 +460,24 @@ static inline bool bkey_extent_is_data(const struct bkey *k) switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } +/* + * Should extent be counted under inode->i_sectors? + */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ef94aecaa7cb..771fb111550d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -16,6 +16,7 @@ #include "io.h" #include "keylist.h" #include "quota.h" +#include "reflink.h" #include "trace.h" #include @@ -201,9 +202,9 @@ static int inode_set_size(struct bch_inode_info *inode, return 0; } -static int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, @@ -936,15 +937,12 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? 
SECTOR_RESERVED : SECTOR_ALLOCATED; - BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); - BUG_ON(bio_end_sector(bio) > k.k->p.offset); - - bio_for_each_segment(bv, bio, iter) { struct bch_page_state *s = bch2_page_state(bv.bv_page); unsigned i; @@ -959,10 +957,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, u64 offset, + struct bio *bio, + unsigned sectors_this_extent, bool get_more) { - while (bio_end_sector(bio) < offset && + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; struct page *page = readpage_iter_next(iter); @@ -1012,35 +1011,39 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + int ret = 0; rbio->c = c; rbio->start_time = local_clock(); - +retry: while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); - BUG_ON(!k.k); - - if (IS_ERR(k.k)) { - int ret = btree_iter_err(iter); - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); - return; - } + ret = bkey_err(k); + if (ret) + break; bkey_reassemble(&tmp.k, k); - bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); if (readpages_iter) { bool want_full_extent = false; @@ -1055,13 +1058,11 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, (p.crc.compression_type != 0)); } - readpage_bio_extend(readpages_iter, - &rbio->bio, k.k->p.offset, - want_full_extent); + readpage_bio_extend(readpages_iter, &rbio->bio, + sectors, want_full_extent); } - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1078,6 +1079,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } + + if (ret == -EINTR) + goto retry; + + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } void bch2_readahead(struct readahead_control *ractl) @@ -2256,29 +2263,25 @@ out: /* truncate: */ -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset, u64 *journal_seq) +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bch_inode_info *inode, + u64 new_i_size) { - struct bpos start = POS(inode->v.i_ino, start_offset); - struct bpos end = POS(inode->v.i_ino, end_offset); + struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, - BTREE_ITER_INTENT); + int ret = 0, 
ret2 = 0; while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto btree_err; + bkey_init(&delete.k); delete.k.p = iter->pos; @@ -2286,23 +2289,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_trans_begin_updates(&trans); + bch2_trans_begin_updates(trans); - ret = bch2_extent_update(&trans, inode, + ret = bch2_extent_update(trans, inode, &disk_res, NULL, iter, &delete, - 0, true, true, NULL); + new_i_size, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); - - if (ret == -EINTR) +btree_err: + if (ret == -EINTR) { + ret2 = ret; ret = 0; + } if (ret) break; + } - bch2_trans_cond_resched(&trans); + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); } + return ret ?: ret2; +} + +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, start_offset), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, + POS(inode->v.i_ino, end_offset), + inode, 0); + bch2_trans_exit(&trans); + if (ret == -EINTR) + ret = 0; + return ret; } @@ -2510,7 +2541,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ret = __bch2_fpunch(c, inode, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (unlikely(ret)) goto err; @@ -2557,8 +2588,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, discard_start, discard_end); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); @@ -2670,7 +2700,7 @@ bkey_err: ret = __bch2_fpunch(c, inode, round_up(new_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (ret) goto err; @@ -2853,6 +2883,94 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return -EOPNOTSUPP; } +static void mark_range_unallocated(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + struct bch_page_state *s; + + folio_lock(folio); + s = bch2_page_state(&folio->page); + + if (s) + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + loff_t ret = 0; + loff_t aligned_len; + + if 
(remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + aligned_len = round_up(len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + aligned_len); + if (ret) + goto out_unlock; + + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, dst, + POS(dst->v.i_ino, pos_dst >> 9), + POS(src->v.i_ino, pos_src >> 9), + aligned_len >> 9, + pos_dst + len); + if (ret > 0) + ret = min(ret << 9, len); + +out_unlock: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return ret; +} + /* fseek: */ static int folio_data_offset(struct folio *folio, unsigned offset) diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index e263b515e901..861ec25ab9ef 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -9,6 +9,22 @@ #include +struct quota_res; + +int bch2_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_inode_info *, u64); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + int bch2_writepage(struct page *, struct writeback_control *); int bch2_read_folio(struct file *, struct folio *); @@ -28,6 +44,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_fault(struct vm_fault *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 54e555fb4d5d..fad019d3c3f5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1157,6 +1157,9 @@ static int bch2_fill_extent(struct bch_fs *c, struct extent_ptr_decoded p; int ret; + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1200,6 +1203,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) cur, prev; + unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1212,15 +1216,36 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), + BTREE_ITER_SLOTS); + + while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) { + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; 
bkey_reassemble(&cur.k, k); k = bkey_i_to_s_c(&cur.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &cur.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + offset_into_extent), + &cur.k); + bch2_key_resize(&cur.k.k, sectors); + cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; + if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (have_extent) { @@ -1233,12 +1258,16 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(&prev.k, &cur.k); have_extent = true; } + + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, + iter->pos.offset + sectors)); } if (!ret && have_extent) ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), FIEMAP_EXTENT_LAST); - +err: ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } @@ -1286,6 +1315,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index de07f0f1dd51..6edf5dd803f0 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -59,7 +59,8 @@ static inline int ptrcmp(void *l, void *r) enum bch_inode_lock_op { INODE_LOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), }; #define bch2_lock_inodes(_locks, ...) \ @@ -71,9 +72,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ } while (0) @@ -87,9 +90,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ed84572a9e67..4d359931edb3 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1041,6 +1041,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1097,6 +1098,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, + btree_id, bkey_s_c_null); BUG_ON(ret); @@ -1134,7 +1136,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? 
BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, + pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1278,18 +1284,25 @@ retry: POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; - bytes = min_t(unsigned, bvec_iter_sectors(bvec_iter), - (k.k->size - offset_into_extent)) << 9; + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(&trans); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); ret = __bch2_read_extent(c, rbio, bvec_iter, k, @@ -1569,6 +1582,48 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } +int bch2_read_indirect_extent(struct btree_trans *trans, + struct btree_iter *extent_iter, + unsigned *offset_into_extent, + struct bkey_i *orig_k) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + if (orig_k->k.type != KEY_TYPE_reflink_p) + return 0; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + *offset_into_extent; + + iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS, 1); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect extent"); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bkey_reassemble(orig_k, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, unsigned offset_into_extent, @@ -1644,6 +1699,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; + offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; @@ -1829,25 +1885,47 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + + while (1) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, + POS(inode, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + goto err; + + /* + * With 
indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1856,21 +1934,18 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - +out: bch2_trans_exit(&trans); + return; +err: + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); + goto out; } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index aa437cb05fe7..a768ccc90f1f 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,6 +99,9 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; +int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *, + unsigned *, struct bkey_i *); + enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 301cb72bd3e4..dc3b03d6e627 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, + enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; @@ -44,8 +45,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -98,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); +} + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ffa0c2bbe290..05bb74a36230 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -63,13 +63,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - struct bkey_i_extent *insert, *new = + struct bkey_i *insert; + struct bkey_i_extent *new = 
bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; @@ -86,26 +87,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto nomatch; if (m->data_cmd == DATA_REWRITE && - !bch2_extent_has_device(bkey_s_c_to_extent(k), - m->data_opts.rewrite_dev)) + !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) goto nomatch; bkey_reassemble(&_insert.k, k); - insert = bkey_i_to_extent(&_insert.k); + insert = &_insert.k; bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter->pos, &insert->k_i); + bch2_cut_front(iter->pos, insert); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(extent_i_to_s(insert).s, + bch2_bkey_drop_device(bkey_i_to_s(insert), m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -114,25 +114,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_ptr_decoded_append(&insert->k_i, &p); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } if (!did_work) goto nomatch; - bch2_bkey_narrow_crcs(&insert->k_i, + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, extent_i_to_s(insert).s); - bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); /* * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && @@ -148,7 +148,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) } bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &insert->k_i)); + BTREE_INSERT_ENTRY(iter, insert)); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), @@ -213,10 +213,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, struct bch_io_opts io_opts, enum data_cmd data_cmd, struct data_opts data_opts, + enum btree_id btree_id, struct bkey_s_c k) { int ret; + m->btree_id = btree_id; m->data_cmd = data_cmd; m->data_opts = data_opts; m->nr_ptrs_reserved = 0; @@ -264,11 +266,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned compressed_sectors = 0; - extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && p.crc.compression_type != BCH_COMPRESSION_NONE && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) @@ -391,6 +394,7 @@ static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, + enum btree_id btree_id, struct bkey_s_c k, enum data_cmd data_cmd, struct data_opts data_opts) @@ -443,7 +447,7 @@ static int bch2_move_extent(struct bch_fs *c, 
io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, k); + data_cmd, data_opts, btree_id, k); if (ret) goto err_free_pages; @@ -473,16 +477,17 @@ err: return ret; } -int bch2_move_data(struct bch_fs *c, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats) +static int __bch2_move_data(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats, + enum btree_id btree_id) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct btree_trans trans; @@ -493,17 +498,13 @@ int bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; - stats->btree_id = BTREE_ID_EXTENTS; + stats->btree_id = btree_id; stats->pos = POS_MIN; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); if (rate) @@ -528,7 +529,7 @@ int bch2_move_data(struct bch_fs *c, if (unlikely(freezing(current))) { bch2_trans_unlock(&trans); - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -579,12 +580,12 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, k, + ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); + bch2_move_ctxt_wait_for_io(ctxt); continue; } @@ -602,7 +603,32 @@ next_nondata: bch2_trans_cond_resched(&trans); } out: - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats) +{ + struct moving_context ctxt = { .stats = stats }; + int ret; + + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_USER; + + ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_EXTENTS) ?: + __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_REFLINK); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 71b3d2b2ddb6..0acd1720d4f8 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -25,6 +25,7 @@ struct data_opts { }; struct migrate_write { + enum btree_id btree_id; enum data_cmd data_cmd; struct data_opts data_opts; @@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct write_point_specifier, struct bch_io_opts, enum data_cmd, struct data_opts, - struct bkey_s_c); + enum btree_id, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3742b241807c..f2899ba9ad43 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) { struct btree_trans trans; struct btree_iter *iter, *split_iter; @@ -255,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, btree_id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -341,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { replay_now_at(j, keys.journal_seq_base + i->journal_seq); - switch (i->btree_id) { - case BTREE_ID_ALLOC: + if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, i->k); - break; - default: + else if (btree_node_type_is_extents(i->btree_id)) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else ret = bch2_btree_insert(c, i->btree_id, i->k, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); - break; - } if (ret) { bch_err(c, "journal replay: error %d while replaying key", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000000..dcca9c1d0f47 --- /dev/null +++ b/fs/bcachefs/reflink.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "reflink.h" + +#include + +/* reflink pointers */ + +const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); +} + +enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* indirect extents */ + +const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (bkey_val_bytes(r.k) < sizeof(*r.v)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +/* + * bch2_remap_range() depends on bch2_extent_update(), which depends on various + * things tied to the linux vfs for inode 
updates, for now: + */ +#ifndef NO_BCACHEFS_FS + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i_extent *e) +{ + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + struct bkey_i_reflink_p *r_p; + int ret; + + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { + bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + continue; + } + + if (bkey_deleted(k.k) && e->k.size <= k.k->size) + break; + } + + if (ret) + goto err; + + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + + r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_reflink_v_init(&r_v->k_i); + r_v->k.p = reflink_iter->pos; + bch2_key_resize(&r_v->k, e->k.size); + r_v->k.version = e->k.version; + + set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + + bkey_val_u64s(&e->k)); + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + + e->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(&e->k_i); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); +err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = reflink_iter->pos.offset; + bch2_trans_iter_put(trans, reflink_iter); + } + + return ret; +} + +static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k = bch2_btree_iter_peek(iter); + + while (1) { + if (bkey_err(k)) + return k; + + if (bkey_cmp(iter->pos, end) >= 0) + return bkey_s_c_null; + + if (k.k->type == KEY_TYPE_extent || + k.k->type == KEY_TYPE_reflink_p) + return k; + + k = bch2_btree_iter_next(iter); + } +} + +s64 bch2_remap_range(struct bch_fs *c, + struct bch_inode_info *dst_inode, + struct bpos dst_start, struct bpos src_start, + u64 remap_sectors, u64 new_i_size) +{ + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; + BKEY_PADDED(k) new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; + int ret = 0; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT, 1); + dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT, 2); + + while (1) { + bch2_trans_begin_updates(&trans); + trans.mem_top = 0; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto err; + } + + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + goto btree_err; + + src_done = bpos_min(src_iter->pos, src_end).offset - + src_start.offset; + dst_want = POS(dst_start.inode, 
dst_start.offset + src_done); + + if (bkey_cmp(dst_iter->pos, dst_want) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, dst_want, + dst_inode, new_i_size); + if (ret) + goto btree_err; + continue; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); + + if (!bkey_cmp(dst_iter->pos, dst_end)) + break; + + if (src_k.k->type == KEY_TYPE_extent) { + bkey_reassemble(&new_src.k, src_k); + src_k = bkey_i_to_s_c(&new_src.k); + + bch2_cut_front(src_iter->pos, &new_src.k); + bch2_cut_back(src_end, &new_src.k.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, + bkey_i_to_extent(&new_src.k)); + if (ret) + goto btree_err; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(&new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_iter->pos.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k.k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k.k, + min(src_k.k->p.offset - src_iter->pos.offset, + dst_end.offset - dst_iter->pos.offset)); + + ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, + dst_iter, &new_dst.k, + new_i_size, false, true, NULL); + if (ret) + goto btree_err; + + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); +btree_err: + if (ret == -EINTR) + ret = 0; + if (ret) + goto err; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); +err: + BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + + dst_done = dst_iter->pos.offset - dst_start.offset; + new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + + ret = bch2_trans_exit(&trans) ?: ret; + + mutex_lock(&dst_inode->ei_update_lock); + if (dst_inode->v.i_size < new_i_size) { + i_size_write(&dst_inode->v, new_i_size); + ret = bch2_write_inode_size(c, dst_inode, new_i_size, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&dst_inode->ei_update_lock); + + return dst_done ?: ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 index 000000000000..327618c36d33 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +enum merge_result bch2_reflink_p_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ +} + +#ifndef NO_BCACHEFS_FS +s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, + struct bpos, struct bpos, u64, u64); +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 7a9a7ec26c93..4fb142f3d39c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -113,6 +113,7 @@ void 
bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; -- cgit From e42951b0aa50bb67b29da1af1099013c1aeb1d9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2019 18:35:15 -0400 Subject: bcachefs: Fix bch2_sort_repack_merge() bch2_bkey_normalize() modifies the value, and we were modifying the original value in the src btree node - but, we're called without a write lock held on the src node. Oops... Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 19 ++++++++----------- fs/bcachefs/extents.c | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 9f5d9b4bf1c9..e32fad5a91ac 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -415,25 +415,22 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_format *out_f, bool filter_whiteouts) { - struct bkey_packed *prev = NULL, *k_packed, *next; - struct bkey k_unpacked; + struct bkey_packed *prev = NULL, *k_packed; struct bkey_s k; struct btree_nr_keys nr; + BKEY_PADDED(k) tmp; memset(&nr, 0, sizeof(nr)); - next = bch2_btree_node_iter_next_all(iter, src); - while ((k_packed = next)) { - /* - * The filter might modify the size of @k's value, so advance - * the iterator first: - */ - next = bch2_btree_node_iter_next_all(iter, src); - + while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - k = __bkey_disassemble(src, k_packed, &k_unpacked); + EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > + BKEY_EXTENT_VAL_U64s_MAX); + + bch2_bkey_unpack(src, &tmp.k, k_packed); + k = bkey_i_to_s(&tmp.k); if (filter_whiteouts && bch2_bkey_normalize(c, k)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 81ec55526ce9..779ee42350ad 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1590,9 +1590,9 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) /* will only happen if all pointers were cached: */ if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_deleted; + k.k->type = KEY_TYPE_discard; - return false; + return bkey_whiteout(k.k); } void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, -- cgit From f698a957979bfff266a65a2080a224cca7ccc962 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2019 18:55:07 -0400 Subject: bcachefs: Fix bch2_bkey_narrow_crcs() We have to reinitialize ptrs whenever we do something that changes them. Regression from when the code was converted to be generic across all keys with pointers. 
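The underlying pitfall is generic: a pointer-list view computed from a key's value goes stale as soon as a pointer is dropped and the value is repacked, so the view has to be recomputed before the next pass (which is what the restart_narrow_pointers hunk below does). A minimal standalone sketch of the same stale-view rule, with every name invented for illustration rather than taken from bcachefs:

/*
 * Illustrative only: a "view" struct caches begin/end pointers into a
 * buffer.  Removing an element shifts the tail and shrinks the buffer,
 * so any previously computed view is stale and must be rebuilt before
 * the next scan -- the same rule as recomputing ptrs after dropping a
 * pointer from a bkey.  All names below are made up for this sketch.
 */
#include <stdio.h>
#include <string.h>

struct span { int *begin, *end; };

static struct span get_span(int *buf, size_t nr)
{
	return (struct span) { buf, buf + nr };
}

static size_t drop_entry(int *buf, size_t nr, int *entry)
{
	/* shift the tail down over *entry; buffer is now one shorter */
	memmove(entry, entry + 1, (buf + nr - (entry + 1)) * sizeof(*buf));
	return nr - 1;
}

int main(void)
{
	int buf[] = { 1, -2, 3, -4, 5 };
	size_t nr = 5;
	struct span s;
restart:
	s = get_span(buf, nr);		/* recompute the view after every mutation */
	for (int *i = s.begin; i != s.end; i++)
		if (*i < 0) {
			nr = drop_entry(buf, nr, i);
			goto restart;	/* the old s.end is stale now */
		}

	for (size_t i = 0; i < nr; i++)
		printf("%d ", buf[i]);
	printf("\n");
	return 0;
}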
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 779ee42350ad..f097bed684e5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -454,6 +454,8 @@ found: BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); -- cgit From 41fcd6215093b0c40fb4072e04a2b34f09eb4235 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2019 23:52:10 -0400 Subject: bcachefs: Fix faulty assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3d243f2d1095..78d43830d0a7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1483,8 +1483,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 parity_sectors; int ret = 0; - BUG_ON(!sectors); - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); if (ret) return ret; @@ -1549,6 +1547,12 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); + /* + * can happen due to rounding with compressed extents: + */ + if (!disk_sectors) + continue; + ret = bch2_trans_mark_pointer(trans, p, disk_sectors, data_type); if (ret < 0) -- cgit From 4b0a66d508d7bfcd2dd2513d4f41a0f3a20f64d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2019 20:16:42 -0400 Subject: bcachefs: Check alignment in write path Also - fix alignment in bch2_set_page_dirty() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 8 ++++---- fs/bcachefs/io.c | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 771fb111550d..1873bbb9afda 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -684,8 +684,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, if (!s) return -ENOMEM; - for (i = offset / 512; - i < DIV_ROUND_UP(offset + len, 512); + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; i++) { disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); @@ -757,8 +757,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; - for (i = offset / 512; - i < DIV_ROUND_UP(offset + len, 512); + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; i++) { unsigned sectors = sectors_to_reserve(&s->s[i], res->disk.nr_replicas); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4d359931edb3..5db83374403b 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -920,30 +920,39 @@ flush_io: void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + if (bio_sectors(bio) & (c->opts.block_size - 1)) { + __bcache_io_error(c, "misaligned write"); + op->error = -EIO; + goto err; + } + op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(&op->wbio.bio)->put_bio = false; + 
wbio_init(bio)->put_bio = false; if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); op->error = -EROFS; - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); - closure_return(cl); - return; + goto err; } - bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); + bch2_increment_clock(c, bio_sectors(bio), WRITE); continue_at_nobarrier(cl, __bch2_write, NULL); + return; +err: + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); + closure_return(cl); } /* Cache promotion on read */ -- cgit From e14f4ab4fee5377f65e7d3bad646c1418782ead0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 11:17:04 -0400 Subject: bcachefs: Re-enable bkey_debugcheck() in the extent update path Also, move other update path checks to where they actually check all the updates (after triggers have run) Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 8 ++++---- fs/bcachefs/extents.c | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 443ffb5c709d..1e631dc8ff7c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -575,6 +575,10 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } } while (saw_non_marked); + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + btree_trans_lock_write(c, trans); if (race_fault()) { @@ -875,10 +879,6 @@ int bch2_trans_commit(struct btree_trans *trans, trans->journal_seq = journal_seq; trans->flags = flags; - trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); - bch2_btree_trans_verify_locks(trans); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) { if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index f097bed684e5..8257a1ae384b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1373,12 +1373,11 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (s.deleting) tmp.k.k.type = KEY_TYPE_discard; -#if 0 - /* disabled due to lock recursion - mark_lock: */ + if (debug_check_bkeys(c)) bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&tmp.k)); -#endif + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); extent_bset_insert(c, iter, &tmp.k); @@ -1420,11 +1419,13 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } /* * If journal replay hasn't finished, we might be seeing keys * that will be overwritten by the time journal replay is done: -- cgit From 21629f536808c7a0e7791a2711944b03d820749f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:23:10 -0400 Subject: bcachefs: Update more code for KEY_TYPE_reflink_v Signed-off-by: Kent 
Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8257a1ae384b..61b5e22f66c8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -45,7 +45,8 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) switch (k.k->type) { case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -308,20 +309,15 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group unsigned bch2_extent_is_compressed(struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned ret = 0; - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; - } - } + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; return ret; } @@ -969,6 +965,7 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, switch (k.k->type) { case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); if (*nr_iters >= max_iters) { -- cgit From 16e671037a4ca9d695134151d9bc58069d24b094 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:41:50 -0400 Subject: bcachefs: Handle ec_buf not being page aligned when allocating bio Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5db83374403b..117d1faa99f2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -461,7 +461,10 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct bio *bio; unsigned output_available = min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOIO, &c->bio_write); -- cgit From d9b022fee69e7f1eea45317c882c65582481a263 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:34:59 -0400 Subject: bcachefs: Fix a spurious gcc warning *i is used as an output parameter, but gcc isn't noticing that. Oh well. 
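The fix is simply to initialize the pointer: it is only ever written through an output parameter, and gcc's -Wmaybe-uninitialized can lose track of that in more involved control flow. A small self-contained reproduction of the pattern, not the bcachefs code, with invented names:

/*
 * find_first() only writes to *out when it finds a match, so on paths
 * gcc cannot fully analyze it may warn that "it" could be used
 * uninitialized.  Initializing the pointer to NULL keeps the logic
 * identical and quiets the false positive.  Names are invented here.
 */
#include <stddef.h>
#include <stdio.h>

static int find_first(int *v, size_t nr, int want, int **out)
{
	for (size_t i = 0; i < nr; i++)
		if (v[i] == want) {
			*out = &v[i];	/* output parameter */
			return 1;
		}
	return 0;
}

int main(void)
{
	int v[] = { 3, 7, 9 };
	int *it = NULL;		/* init purely to placate the analysis */

	if (find_first(v, 3, 7, &it))
		printf("found %d\n", *it);
	return 0;
}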
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1e631dc8ff7c..66b12e55d946 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -857,7 +857,7 @@ int bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; + struct btree_insert_entry *i = NULL; unsigned orig_mem_top = trans->mem_top; int ret = 0; -- cgit From 416f6852523d8599713b756b2d2027d2e9f90b3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:30:55 -0400 Subject: bcachefs: Don't flush journal from bch2_vfs_write_inode() It's only updating timestamps, so this doubly doesn't make sense. fsync will flush the journal, if necessary. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index fad019d3c3f5..c4ef172400e1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1533,12 +1533,6 @@ static int bch2_vfs_write_inode(struct inode *vinode, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - if (c->opts.journal_flush_disabled) - return ret; - - if (!ret && wbc->sync_mode == WB_SYNC_ALL) - ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); - return ret; } @@ -1595,6 +1589,9 @@ static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; + if (c->opts.journal_flush_disabled) + return 0; + if (!wait) { bch2_journal_flush_async(&c->journal, NULL); return 0; -- cgit From 3fb5ebcdd4b0599ba8d20a322d322f3a1aaea381 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:07:37 -0400 Subject: bcachefs: Inline some fast paths Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 13 ------------- fs/bcachefs/extents.h | 13 ++++++++++++- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/io.c | 14 +++++--------- fs/bcachefs/io.h | 13 +++++++++++-- 6 files changed, 30 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 61b5e22f66c8..63afbf24a101 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -806,19 +806,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) return true; } -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch2_key_resize(struct bkey *k, - unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - static bool extent_i_save(struct btree *b, struct bkey_packed *dst, struct bkey_i *src) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index cef93af25858..4c4a7945a751 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -540,7 +540,18 @@ static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) } bool bch2_cut_back(struct bpos, struct bkey *); -void bch2_key_resize(struct bkey *, unsigned); + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +static inline void bch2_key_resize(struct bkey *k, unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} /* * In extent_sort_fix_overlapping(), insert_fixup_extent(), diff --git a/fs/bcachefs/fs-io.c 
b/fs/bcachefs/fs-io.c index 1873bbb9afda..0dfe822cecbf 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1036,7 +1036,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - ret = bch2_read_indirect_extent(trans, iter, + ret = bch2_read_indirect_extent(trans, &offset_into_extent, &tmp.k); if (ret) break; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c4ef172400e1..dcaf1da656d1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1233,7 +1233,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - ret = bch2_read_indirect_extent(&trans, iter, + ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur.k); if (ret) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 117d1faa99f2..844ae46cd7eb 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1305,7 +1305,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - ret = bch2_read_indirect_extent(&trans, iter, + ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &tmp.k); if (ret) break; @@ -1594,19 +1594,15 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int bch2_read_indirect_extent(struct btree_trans *trans, - struct btree_iter *extent_iter, - unsigned *offset_into_extent, - struct bkey_i *orig_k) +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_i *orig_k) { struct btree_iter *iter; struct bkey_s_c k; u64 reflink_offset; int ret; - if (orig_k->k.type != KEY_TYPE_reflink_p) - return 0; - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + *offset_into_extent; @@ -1920,7 +1916,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - ret = bch2_read_indirect_extent(&trans, iter, + ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &tmp.k); if (ret) goto err; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index a768ccc90f1f..c6f5ae717cf3 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,8 +99,17 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *, - unsigned *, struct bkey_i *); +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_i *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_reflink_p + ? __bch2_read_indirect_extent(trans, offset_into_extent, k) + : 0; +} enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, -- cgit From 4e1510c3e9a053edc1fdfe56fc96009919ceebd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 17:09:16 -0400 Subject: bcachefs: Add a hint for allocating new stripes This way we aren't doing a full linear scan every time we create a new stripe. 
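The mechanism is a simple allocation cursor: remember the offset where the last stripe slot was handed out, start the next search there, and fall back to scanning from the beginning only once before reporting no space. A rough standalone sketch of that hint/wraparound pattern, with all names invented for this example:

/*
 * Hint-based slot search (illustrative, not the bcachefs API): the next
 * search starts where the previous allocation ended, wrapping around to
 * the start at most once.  Amortized this avoids rescanning the densely
 * used prefix on every allocation.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 16

static bool used[NR_SLOTS];
static unsigned hint;			/* where the next search starts */

static int alloc_slot(void)
{
	unsigned i = hint;
	bool wrapped = false;

	while (used[i]) {
		if (++i == NR_SLOTS) {
			if (wrapped)
				return -1;	/* truly full */
			wrapped = true;
			i = 0;			/* retry from the start once */
		}
		if (wrapped && i == hint)
			return -1;		/* came all the way around */
	}

	used[i] = true;
	hint = (i + 1) % NR_SLOTS;	/* next search starts past this slot */
	return i;
}

int main(void)
{
	for (int n = 0; n < 5; n++)
		printf("allocated slot %d\n", alloc_slot());
	return 0;
}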
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/ec.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 410fce3ed8d4..9bee837dedcf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -745,6 +745,7 @@ struct bch_fs { /* ERASURE CODING */ struct list_head ec_new_stripe_list; struct mutex ec_new_stripe_lock; + u64 ec_stripe_hint; struct bio_set ec_bioset; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2eb33a8460c9..a9759c2ed7ab 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -704,26 +704,34 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct bpos start_pos = POS(0, c->ec_stripe_hint); int ret; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - /* XXX: start pos hint */ - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { + start_pos = POS_MIN; + bch2_btree_iter_set_pos(iter, start_pos); + continue; + } + + ret = -ENOSPC; break; + } if (bkey_deleted(k.k)) goto found_slot; } - if (!ret) - ret = -ENOSPC; goto err; found_slot: + start_pos = iter->pos; + ret = ec_stripe_mem_alloc(c, iter); if (ret) goto err; @@ -738,6 +746,8 @@ found_slot: err: if (ret == -EINTR) goto retry; + + c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; bch2_trans_exit(&trans); return ret; -- cgit From 7d5224fcdc057a42fcd2d19bbc4d9f1c4808a83b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 16:12:28 -0400 Subject: bcachefs: Optimize fiemap Reflink caused fiemap performance to regress badly - this gets us back to where we were. 
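One detail worth noting in the loop below is the one-behind emission: each extent is buffered in prev and only reported when the next one is found, so the final extent can be flagged FIEMAP_EXTENT_LAST once the scan ends. A stripped-down sketch of that pattern, using invented types rather than the kernel's fiemap interfaces:

/*
 * One-behind emission: hold each item in `prev` and report it only when
 * the next item (or end of input) is seen, so the last item can carry a
 * LAST flag.  The ext type and emit() callback are invented for this
 * sketch.
 */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_LAST 1u

struct ext { unsigned start, len; };

static void emit(const struct ext *e, unsigned flags)
{
	printf("extent %u+%u%s\n", e->start, e->len,
	       (flags & FLAG_LAST) ? " (last)" : "");
}

static void walk(const struct ext *v, unsigned nr)
{
	struct ext prev;
	bool have_prev = false;

	for (unsigned i = 0; i < nr; i++) {
		if (!v[i].len)
			continue;	/* skip holes instead of visiting every slot */
		if (have_prev)
			emit(&prev, 0);
		prev = v[i];
		have_prev = true;
	}
	if (have_prev)
		emit(&prev, FLAG_LAST);
}

int main(void)
{
	struct ext v[] = { {0, 8}, {8, 0}, {16, 4}, {32, 12} };
	walk(v, 4);
	return 0;
}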
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 58 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index dcaf1da656d1..ffd9b386a14d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1203,6 +1203,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1217,14 +1218,16 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), - BTREE_ITER_SLOTS); - - while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) { - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; + POS(ei->v.i_ino, start >> 9), 0); +retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + bch2_btree_iter_next(iter); + continue; + } bkey_reassemble(&cur.k, k); k = bkey_i_to_s_c(&cur.k); @@ -1240,34 +1243,37 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, sectors = min(sectors, k.k->size - offset_into_extent); - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + offset_into_extent), - &cur.k); + if (offset_into_extent) + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + &cur.k); bch2_key_resize(&cur.k.k, sectors); cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; - if (bkey_extent_is_data(k.k) || - k.k->type == KEY_TYPE_reservation) { - if (have_extent) { - ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(&prev.k), 0); - if (ret) - break; - } - - bkey_copy(&prev.k, &cur.k); - have_extent = true; + if (have_extent) { + ret = bch2_fill_extent(c, info, + bkey_i_to_s_c(&prev.k), 0); + if (ret) + break; } - bch2_btree_iter_set_pos(iter, - POS(iter->pos.inode, - iter->pos.offset + sectors)); + bkey_copy(&prev.k, &cur.k); + have_extent = true; + + if (k.k->type == KEY_TYPE_reflink_v) + bch2_btree_iter_set_pos(iter, k.k->p); + else + bch2_btree_iter_next(iter); } + if (ret == -EINTR) + goto retry; + if (!ret && have_extent) ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), FIEMAP_EXTENT_LAST); -err: + ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } -- cgit From 67163cded330c1d952ae440b21752f4b609585fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Aug 2019 17:34:03 -0400 Subject: bcachefs: Trust in memory bucket mark This fixes a bug in the journal replay -> extent_replay_key -> split_compressed path, when we do an update that changes alloc info but the alloc info in the btree isn't up to date yet. 
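Roughly, the change makes the lookup prefer state that is known to be current: a key already queued for update in the transaction wins, and otherwise the in-memory bucket mark is trusted over the possibly stale btree copy. A loose sketch of that precedence with invented structures; the final fallback to the btree value is only there to keep the example self-contained:

/*
 * Lookup precedence sketch (names invented, not the bcachefs types):
 * 1) a value already queued for update in this transaction,
 * 2) the in-memory mark, authoritative while replay hasn't caught the
 *    btree copy up yet,
 * 3) the btree copy, as a last resort in this example.
 */
#include <stdbool.h>
#include <stdio.h>

struct alloc_info { unsigned gen, dirty_sectors; };

struct source {
	bool has_pending;		/* update queued in this transaction */
	struct alloc_info pending;
	bool mem_valid;			/* in-memory mark populated */
	struct alloc_info mem;
	struct alloc_info btree;	/* possibly stale during replay */
};

static struct alloc_info resolve(const struct source *s)
{
	if (s->has_pending)
		return s->pending;
	if (s->mem_valid)
		return s->mem;
	return s->btree;
}

int main(void)
{
	struct source s = {
		.mem_valid = true,
		.mem   = { .gen = 4, .dirty_sectors = 128 },
		.btree = { .gen = 3, .dirty_sectors = 0 },	/* stale copy */
	};

	struct alloc_info a = resolve(&s);
	printf("gen %u, dirty_sectors %u\n", a.gen, a.dirty_sectors);
	return 0;
}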
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 -------------- fs/bcachefs/alloc_background.h | 14 ++++++++++++++ fs/bcachefs/buckets.c | 43 ++++++++++++++++++++++++++++++------------ 3 files changed, 45 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5619dccdc011..c1158ce154c5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -205,20 +205,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, get_alloc_field(a.v, &d, i)); } -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bucket *g, struct bucket_mark m) -{ - return (struct bkey_alloc_unpacked) { - .gen = m.gen, - .oldest_gen = g->oldest_gen, - .data_type = m.data_type, - .dirty_sectors = m.dirty_sectors, - .cached_sectors = m.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - }; -} - int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { struct btree_trans trans; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 0c1a0f0dd2ab..134c6d81397c 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -17,6 +17,20 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +{ + return (struct bkey_alloc_unpacked) { + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, + .dirty_sectors = m.dirty_sectors, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + }; +} + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 78d43830d0a7..4ab3b834948b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1361,7 +1361,7 @@ static int trans_get_key(struct btree_trans *trans, : !bkey_cmp(pos, i->iter->pos))) { *iter = i->iter; *k = bkey_i_to_s_c(i->k); - return 0; + return 1; } *iter = __bch2_trans_get_iter(trans, btree_id, pos, @@ -1424,18 +1424,37 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, ret = trans_get_key(trans, BTREE_ID_ALLOC, POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), &iter, &k); - if (ret) + if (ret < 0) return ret; - if (k.k->type != KEY_TYPE_alloc) { - bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", - p.ptr.dev, - PTR_BUCKET_NR(ca, &p.ptr)); - ret = -1; - goto out; - } + if (!ret) { + /* + * During journal replay, and if gc repairs alloc info at + * runtime, the alloc info in the btree might not be up to date + * yet - so, trust the in memory mark: + */ + struct bucket *g; + struct bucket_mark m; - u = bch2_alloc_unpack(k); + percpu_down_read(&c->mark_lock); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + } else { + /* + * Unless we're already updating that key: + */ + if (k.k->type != KEY_TYPE_alloc) { + bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", + p.ptr.dev, + PTR_BUCKET_NR(ca, &p.ptr)); + ret = -1; + goto out; + } + + u = bch2_alloc_unpack(k); + } if (gen_after(u.gen, p.ptr.gen)) { ret = 1; @@ -1484,7 +1503,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, int ret = 0; ret = 
trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); - if (ret) + if (ret < 0) return ret; if (k.k->type != KEY_TYPE_stripe) { @@ -1599,7 +1618,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ret = trans_get_key(trans, BTREE_ID_REFLINK, POS(0, idx), &iter, &k); - if (ret) + if (ret < 0) return ret; if (k.k->type != KEY_TYPE_reflink_v) { -- cgit From 6671a7089fdcdd8f25f6b4729fdc066f7c42edfd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Aug 2019 17:45:42 -0400 Subject: bcachefs: Refactor bch2_alloc_write() Major simplification - gets rid of the need for marking buckets as dirty, instead we write buckets if the in memory mark is different from what's in the btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 191 ++++++++++++++++++----------------------- fs/bcachefs/alloc_background.h | 11 +++ fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 8 +- fs/bcachefs/buckets.c | 17 ---- fs/bcachefs/buckets_types.h | 1 - 6 files changed, 94 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c1158ce154c5..54051161eba7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -258,46 +258,68 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return 0; } -int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +enum alloc_write_ret { + ALLOC_WROTE, + ALLOC_NOWROTE, + ALLOC_END, +}; + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + unsigned flags) { - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; + struct bkey_s_c k; struct bch_dev *ca; + struct bucket_array *ba; + struct bucket *g; + struct bucket_mark m; + struct bkey_alloc_unpacked old_u, new_u; + __BKEY_PADDED(k, 8) alloc_key; /* hack: */ + struct bkey_i_alloc *a; int ret; +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; - if (k->k.p.inode >= c->sb.nr_devices || - !c->devs[k->k.p.inode]) - return 0; - - ca = bch_dev_bkey_exists(c, k->k.p.inode); + old_u = bch2_alloc_unpack(k); - if (k->k.p.offset >= ca->mi.nbuckets) - return 0; + if (iter->pos.inode >= c->sb.nr_devices || + !c->devs[iter->pos.inode]) + return ALLOC_END; - bch2_trans_init(&trans, c, 0, 0); + percpu_down_read(&c->mark_lock); + ca = bch_dev_bkey_exists(c, iter->pos.inode); + ba = bucket_array(ca); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, - BTREE_ITER_INTENT); + if (iter->pos.offset >= ba->nbuckets) { + percpu_up_read(&c->mark_lock); + return ALLOC_END; + } - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; + g = &ba->b[iter->pos.offset]; + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); - /* check buckets_written with btree node locked: */ - if (test_bit(k->k.p.offset, ca->buckets_written)) { - ret = 0; - goto err; - } + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return ALLOC_NOWROTE; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); - ret = bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_INSERT_NOMARK| + flags); err: - bch2_trans_exit(&trans); + if (ret == -EINTR) + 
goto retry; return ret; } @@ -305,16 +327,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct btree_trans trans; struct btree_iter *iter; - struct bucket_array *buckets; struct bch_dev *ca; - struct bucket *g; - struct bucket_mark m, new; - struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; - struct bkey_s_c k; unsigned i; - size_t b; int ret = 0; BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); @@ -325,81 +339,24 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_rw_member(ca, c, i) { - down_read(&ca->bucket_lock); -restart: - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; - b < buckets->nbuckets; - b++) { - if (!buckets->b[b].mark.dirty) - continue; - - bch2_btree_iter_set_pos(iter, POS(i, b)); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - - percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); - percpu_up_read(&c->mark_lock); + unsigned first_bucket; - if (!m.dirty) - continue; - - if ((flags & BTREE_INSERT_LAZY_RW) && - percpu_ref_is_zero(&c->writes)) { - up_read(&ca->bucket_lock); - bch2_trans_unlock(&trans); - - ret = bch2_fs_read_write_early(c); - down_read(&ca->bucket_lock); - - if (ret) - goto err; - goto restart; - } + percpu_down_read(&c->mark_lock); + first_bucket = bucket_array(ca)->first_bucket; + percpu_up_read(&c->mark_lock); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); + bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOMARK| - flags); -err: - if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - bch_err(c, "error %i writing alloc info", ret); - printk(KERN_CONT "dev %llu bucket %llu\n", - iter->pos.inode, iter->pos.offset); - printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen); -#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name); - BCH_ALLOC_FIELDS() -#undef x - } - if (ret) + while (1) { + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret < 0 || ret == ALLOC_END) break; - - new = m; - new.dirty = false; - atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); - - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - - bch2_trans_cond_resched(&trans); - *wrote = true; + if (ret == ALLOC_WROTE) + *wrote = true; + bch2_btree_iter_next_slot(iter); } - up_read(&ca->bucket_lock); - if (ret) { + if (ret < 0) { percpu_ref_put(&ca->io_ref); break; } @@ -407,7 +364,27 @@ err: bch2_trans_exit(&trans); - return ret; + return ret < 0 ? ret : 0; +} + +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_alloc_write_key(&trans, iter, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; } /* Bucket IO clocks: */ @@ -954,10 +931,6 @@ retry: if (!top->nr) heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - /* with btree still locked: */ - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - /* * Make sure we flush the last journal entry that updated this * bucket (i.e. deleting the last reference) before writing to diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 134c6d81397c..501c444353fb 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,6 +13,17 @@ struct bkey_alloc_unpacked { #undef x }; +/* returns true if not equal */ +static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + struct bkey_alloc_unpacked r) +{ + return l.gen != r.gen +#define x(_name, _bits) || l._name != r._name + BCH_ALLOC_FIELDS() +#undef x + ; +} + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9bee837dedcf..eb4079e57178 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -412,7 +412,6 @@ struct bch_dev { */ struct bucket_array __rcu *buckets[2]; unsigned long *buckets_nouse; - unsigned long *buckets_written; struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage[2]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 24cf28bf665b..68ae08f86f33 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -148,7 +148,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, "type %u gen %u", k.k->type, ptr->gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; } @@ -156,7 +155,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } @@ -528,7 +526,6 @@ static int bch2_gc_done(struct bch_fs *c, ": got %u, should be %u", i, b, \ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - dst->b[b]._mark.dirty = true; \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -580,10 +577,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); - if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { - dst->b[b].oldest_gen = src->b[b].oldest_gen; - dst->b[b]._mark.dirty = true; - } + dst->b[b].oldest_gen = src->b[b].oldest_gen; } }; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4ab3b834948b..625c6c5f7963 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -634,7 +634,6 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = true; - new.dirty = true; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; @@ -774,7 +773,6 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, type != BCH_DATA_JOURNAL); old = bucket_cmpxchg(g, new, ({ - new.dirty = true; new.data_type = type; overflow = checked_add(new.dirty_sectors, sectors); })); @@ -849,7 +847,6 @@ static void bucket_set_stripe(struct bch_fs *c, struct bucket_mark new, old; old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ - new.dirty = true; new.stripe = enabled; if (journal_seq) { new.journal_seq_valid = 1; @@ -896,8 +893,6 @@ static bool bch2_mark_pointer(struct bch_fs *c, do { new.v.counter = old.v.counter = v; - new.dirty = true; - /* * Check this after reading bucket mark to guard against * the allocator invalidating a bucket after we've already @@ -1882,7 +1877,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_nouse = NULL; - unsigned long *buckets_written = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; @@ -1911,9 +1905,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)) || !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || @@ -1945,16 +1936,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); - memcpy(buckets_written, - ca->buckets_written, - BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; swap(ca->buckets_nouse, buckets_nouse); - swap(ca->buckets_written, buckets_written); if (resize) percpu_up_write(&c->mark_lock); @@ -1994,8 +1981,6 @@ err: free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); - kvpfree(buckets_written, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); @@ -2011,8 +1996,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - kvpfree(ca->buckets_written, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index efed658abc6a..7ab9aa641c95 100644 --- a/fs/bcachefs/buckets_types.h 
+++ b/fs/bcachefs/buckets_types.h @@ -15,7 +15,6 @@ struct bucket_mark { u8 gen; u8 data_type:3, owned_by_allocator:1, - dirty:1, journal_seq_valid:1, stripe:1; u16 dirty_sectors; -- cgit From df5d4dae0b667f93616d6e47da09ad916dcb7102 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2019 13:20:38 -0400 Subject: bcachefs: Fixes for replicas tracking The continue statement in bch2_trans_mark_extent() was wrong - by bailing out early, we'd be constructing the wrong replicas list to update. Also, the assertion in update_replicas() was wrong - due to rounding with compressed extents, it is possible for sectors to be 0 sometimes. Also, change extent_to_replicas() in replicas.c to match the replicas list we construct in buckets.c. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 23 +++++++++-------------- fs/bcachefs/replicas.c | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 625c6c5f7963..189c475c973a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -520,7 +520,6 @@ static inline void update_replicas(struct bch_fs *c, int idx = bch2_replicas_entry_idx(c, r); BUG_ON(idx < 0); - BUG_ON(!sectors); switch (r->data_type) { case BCH_DATA_BTREE: @@ -569,8 +568,12 @@ static inline void update_replicas_list(struct btree_trans *trans, { struct replicas_delta_list *d; struct replicas_delta *n; - unsigned b = replicas_entry_bytes(r) + 8; + unsigned b; + + if (!sectors) + return; + b = replicas_entry_bytes(r) + 8; d = replicas_deltas_realloc(trans, b); n = (void *) d->d + d->used; @@ -1029,7 +1032,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, fs_usage, journal_seq, flags); if (p.ptr.cached) { - if (disk_sectors && !stale) + if (!stale) update_cached_sectors(c, fs_usage, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { @@ -1048,8 +1051,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } - if (dirty_sectors) - update_replicas(c, fs_usage, &r.e, dirty_sectors); + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } @@ -1561,12 +1563,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); - /* - * can happen due to rounding with compressed extents: - */ - if (!disk_sectors) - continue; - ret = bch2_trans_mark_pointer(trans, p, disk_sectors, data_type); if (ret < 0) @@ -1575,7 +1571,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (disk_sectors && !stale) + if (!stale) update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { @@ -1593,8 +1589,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } } - if (dirty_sectors) - update_replicas_list(trans, &r.e, dirty_sectors); + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 4fb142f3d39c..64024ce01665 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -80,7 +80,7 @@ static void extent_to_replicas(struct bkey_s_c k, continue; if (p.ec_nr) { - r->nr_devs = 0; + r->nr_required = 0; break; } -- cgit From 06f6c3ec0b5368ddd83a8314cf89ee221ec47263 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Aug 2019 12:05:17 -0400 Subject: bcachefs: Reflink pointers also have to be remarked if split in journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f2899ba9ad43..2aa63cc75f50 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -249,7 +249,13 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, bch2_disk_reservation_init(c, 0); struct bkey_i *split; struct bpos atomic_end; - bool split_compressed = false; + /* + * Some extents aren't equivalent - w.r.t. 
what the triggers do + * - if they're split: + */ + bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || + k->k.type == KEY_TYPE_reflink_p; + bool remark = false; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -280,8 +286,8 @@ retry: if (ret) goto err; - if (!split_compressed && - bch2_extent_is_compressed(bkey_i_to_s_c(k)) && + if (!remark && + remark_if_split && bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * @@ -289,7 +295,7 @@ retry: BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - split_compressed = true; + remark = true; } bkey_copy(split, k); @@ -300,7 +306,7 @@ retry: bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); - if (split_compressed) { + if (remark) { ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), 0, -((s64) k->k.size), BCH_BUCKET_MARK_OVERWRITE) ?: -- cgit From 9940a791ea24876d09116ac503045fb3390aebd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Aug 2019 12:11:39 -0400 Subject: bcachefs: Fix error message on bucket overflow Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 189c475c973a..61df32cf9f5b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1415,6 +1415,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; + unsigned old; bool overflow; int ret; @@ -1443,9 +1444,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, * Unless we're already updating that key: */ if (k.k->type != KEY_TYPE_alloc) { - bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", - p.ptr.dev, - PTR_BUCKET_NR(ca, &p.ptr)); + bch_err_ratelimited(c, "pointer to nonexistent bucket %llu:%llu", + iter->pos.inode, + iter->pos.offset); ret = -1; goto out; } @@ -1458,19 +1459,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } - if (!p.ptr.cached) + if (!p.ptr.cached) { + old = u.dirty_sectors; overflow = checked_add(u.dirty_sectors, sectors); - else + } else { + old = u.cached_sectors; overflow = checked_add(u.cached_sectors, sectors); + } u.data_type = u.dirty_sectors || u.cached_sectors ? data_type : 0; bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", - !p.ptr.cached - ? 
u.dirty_sectors - : u.cached_sectors, sectors); + old, sectors); a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); -- cgit From 05cf02b5a10ae9b60aad4b1fe4049eb4e7603b4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Aug 2019 12:41:45 -0400 Subject: bcachefs: Fix fiemap (again) when iterating over reflink pointers, we use the key we just emitted to set the iterator position - which means we have to be setting the key's inode field as well Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ffd9b386a14d..0ba498505b07 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1249,7 +1249,8 @@ retry: offset_into_extent), &cur.k); bch2_key_resize(&cur.k.k, sectors); - cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; + cur.k.k.p = iter->pos; + cur.k.k.p.offset += cur.k.k.size; if (have_extent) { ret = bch2_fill_extent(c, info, -- cgit From ad7e137ebc3dcadbaa37d2f464728c915e039e1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Aug 2019 13:20:31 -0400 Subject: bcachefs: Switch reconstruct_alloc to a mount option Right now this is the only way of repairing bucket gens in the future Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +--- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/recovery.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index eb4079e57178..c5c98aae8bdb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -285,9 +285,7 @@ do { \ "Force reads to use the reconstruct path, when reading" \ "from erasure coded extents") \ BCH_DEBUG_PARAM(test_restart_gc, \ - "Test restarting mark and sweep gc when bucket gens change")\ - BCH_DEBUG_PARAM(test_reconstruct_alloc, \ - "Test reconstructing the alloc btree") + "Test restarting mark and sweep gc when bucket gens change") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d2493d4111c6..d44bfe90c0d5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -258,6 +258,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2aa63cc75f50..c9558ccb9a26 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -659,7 +659,7 @@ static int read_btree_roots(struct bch_fs *c) continue; if (i == BTREE_ID_ALLOC && - test_reconstruct_alloc(c)) { + c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); continue; } -- cgit From 06ab329c150f9eebc68cdb156a9591084b16ec55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Aug 2019 11:34:01 -0400 Subject: bcachefs: Improve pointer marking checks and error messages Importantly, we don't want to use bch2_fs_inconsistent_on() for errors that fsck can repair, becuase that will just put us in RO mode and prevent fsck from actually fixing stuff. Probably want to get rid of it in the future. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 16 +++++++++---- fs/bcachefs/buckets.c | 65 ++++++++++++++++++++++++++++++++++---------------- fs/bcachefs/buckets.h | 9 +++++++ 3 files changed, 64 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 68ae08f86f33..c4a7ff5f8a08 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -144,18 +144,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bucket *g2 = PTR_BUCKET(ca, ptr, false); if (mustfix_fsck_err_on(!g->gen_valid, c, - "found ptr with missing gen in alloc btree,\n" - "type %u gen %u", - k.k->type, ptr->gen)) { + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%u ptr gen in the future: %u > %u", - k.k->type, ptr->gen, g->mark.gen)) { + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen, g->mark.gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 61df32cf9f5b..d732ec77e281 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -447,12 +447,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, percpu_rwsem_assert_held(&c->mark_lock); - bch2_fs_inconsistent_on(old.data_type && new.data_type && - old.data_type != new.data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[new.data_type]); - preempt_disable(); dev_usage = this_cpu_ptr(ca->usage[gc]); @@ -504,14 +498,6 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) } } -#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ -({ \ - struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ - \ - bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \ - _old; \ -}) - static inline void update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -633,7 +619,7 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = true; @@ -643,6 +629,8 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + if (old.cached_sectors) update_cached_sectors(c, fs_usage, ca->dev_idx, -((s64) old.cached_sectors)); @@ -671,10 +659,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = owned_by_allocator; })); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -780,6 +770,12 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, overflow = 
checked_add(new.dirty_sectors, sectors); })); + bch2_fs_inconsistent_on(old.data_type && + old.data_type != type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[type]); + bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %u > U16_MAX", old.dirty_sectors, sectors); @@ -849,7 +845,7 @@ static void bucket_set_stripe(struct bch_fs *c, struct bucket *g = PTR_BUCKET(ca, ptr, gc); struct bucket_mark new, old; - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ new.stripe = enabled; if (journal_seq) { new.journal_seq_valid = 1; @@ -857,6 +853,8 @@ static void bucket_set_stripe(struct bch_fs *c, } })); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + /* * XXX write repair code for these, flag stripe as possibly bad */ @@ -901,7 +899,13 @@ static bool bch2_mark_pointer(struct bch_fs *c, * the allocator invalidating a bucket after we've already * checked the gen */ - if (gen_after(new.gen, p.ptr.gen)) { + if (gen_after(p.ptr.gen, new.gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "pointer gen in the future"); + return true; + } + + if (new.gen != p.ptr.gen) { /* XXX write repair code for this */ if (!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) @@ -935,6 +939,14 @@ static bool bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); + if (old.data_type && old.data_type != data_type) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + new.gen, + bch2_data_types[old.data_type], + bch2_data_types[data_type]); + bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", !p.ptr.cached @@ -1444,9 +1456,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, * Unless we're already updating that key: */ if (k.k->type != KEY_TYPE_alloc) { - bch_err_ratelimited(c, "pointer to nonexistent bucket %llu:%llu", - iter->pos.inode, - iter->pos.offset); + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "pointer to nonexistent bucket %llu:%llu", + iter->pos.inode, iter->pos.offset); ret = -1; goto out; } @@ -1459,6 +1471,17 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } + if (u.data_type && u.data_type != data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s", + iter->pos.inode, iter->pos.offset, + u.gen, + bch2_data_types[u.data_type], + bch2_data_types[data_type]); + ret = -1; + goto out; + } + if (!p.ptr.cached) { old = u.dirty_sectors; overflow = checked_add(u.dirty_sectors, sectors); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 296d250e58dd..e93cda51d705 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -94,6 +94,15 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); } +static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) +{ + if (k->type == KEY_TYPE_btree_ptr) + return BCH_DATA_BTREE; + + return ptr->cached ? 
BCH_DATA_CACHED : BCH_DATA_USER; +} + static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { -- cgit From 78854fca28e4e789c99a812acda2ac04c20d2dac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Aug 2019 13:29:31 -0400 Subject: bcachefs: Fix BTREE_INSERT_NOMARK_OVERWRITES bch2_mark_update() was correct, but bch2_trans_mark_update() wasn't respecting BTREE_INSERT_NOMARK_OVERWRITES - key marking/triggers really need to be cleaned up. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d732ec77e281..774201a1c0c5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1762,6 +1762,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; + if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + return 0; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; -- cgit From 4430ea7046fcb2112c5888705a6a674d53d9db03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Sep 2019 13:37:50 -0400 Subject: bcachefs: Kill BTREE_INSERT_NOMARK_INSERT Was dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 4 ---- fs/bcachefs/buckets.c | 9 ++++----- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 616c103c05ec..08c17477e76c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -43,7 +43,6 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, - __BTREE_INSERT_NOMARK_INSERT, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, __BTREE_INSERT_MARK_INMEM, @@ -81,9 +80,6 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't mark new key, just overwrites: */ -#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) - /* Don't mark overwrites, just new key: */ #define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 774201a1c0c5..e432a4507081 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1265,11 +1265,10 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - 0, insert->k->k.size, - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), + 0, insert->k->k.size, + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; -- cgit From 5055b50939843c027e1cf4602316035d9f21f032 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 12:39:59 -0400 Subject: bcachefs: Rebalance now adds replicas if needed Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/rebalance.c | 45 ++++++++++++++++++------------------------- 2 files changed, 20 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 62afea1e7ec3..4bc3f8d3e7f4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -661,7 +661,7 @@ struct bch_reservation { /* Maximum possible size of an 
entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 0997c0621b7c..9eb3ac856eed 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -42,9 +42,6 @@ void bch2_rebalance_add_key(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; - if (!bkey_extent_is_data(k.k)) - return; - if (!io_opts->background_target && !io_opts->background_compression) return; @@ -72,30 +69,26 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return DATA_SKIP; - - extent_for_each_ptr_decode(e, p, entry) - if (rebalance_ptr_pred(c, p, io_opts)) - goto found; - - return DATA_SKIP; -found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; - } - default: - return DATA_SKIP; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned nr_replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + nr_replicas += !p.ptr.cached; + + if (rebalance_ptr_pred(c, p, io_opts)) + goto found; } + + if (nr_replicas < io_opts->data_replicas) + goto found; + + return DATA_SKIP; +found: + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; } struct rebalance_work { -- cgit From 89b0511826c1bde65f57a2e051ee9fcb274bff69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 12:42:27 -0400 Subject: bcachefs: Flush fsck errors when looping in btree gc Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 ++ fs/bcachefs/error.c | 9 +++++---- fs/bcachefs/recovery.c | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c4a7ff5f8a08..03a3d24d7451 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -764,6 +764,8 @@ out: percpu_down_write(&c->mark_lock); bch2_gc_free(c); percpu_up_write(&c->mark_lock); + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); goto again; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 1aaff44e18cf..304ff92500be 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -4,6 +4,8 @@ #include "io.h" #include "super.h" +#define FSCK_ERR_RATELIMIT_NR 10 + bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_ERROR, &c->flags); @@ -97,8 +99,8 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == 10; - print = s->nr <= 10; + suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; + print = s->nr <= FSCK_ERR_RATELIMIT_NR; buf = s->buf; print: va_start(args, fmt); @@ -152,10 +154,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c) struct fsck_err_state *s, *n; mutex_lock(&c->fsck_error_lock); - set_bit(BCH_FS_FSCK_DONE, &c->flags); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > 
10) + if (s->nr > FSCK_ERR_RATELIMIT_NR) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c9558ccb9a26..5be34231a0c3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -936,7 +936,9 @@ out: ret = 0; err: fsck_err: + set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); + journal_keys_free(&journal_keys); journal_entries_free(&journal_entries); kfree(clean); -- cgit From b50dd7920d1cd7b37016929faa175578de12dd27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 13:16:41 -0400 Subject: bcachefs: Fix a null ptr deref rbio->c wasn't being initialized in the move path Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 9 +++++++-- fs/bcachefs/move.c | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 844ae46cd7eb..b0bff54a18e2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1218,10 +1218,15 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) return rbio; } +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ static void bch2_rbio_done(struct bch_read_bio *rbio) { - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 05bb74a36230..d01c96ff00d7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -436,7 +436,8 @@ static int bch2_move_extent(struct bch_fs *c, GFP_KERNEL)) goto err_free; - io->rbio.opts = io_opts; + io->rbio.c = c; + io->rbio.opts = io_opts; bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -- cgit From 8d84260ec1b11c20a7f01797b34fcbc12b33fd70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 16:13:20 -0400 Subject: bcachefs: data move path should not be trying to move reflink_p keys This was spotted when the move_extent() path tried to allocate a bio for a reflink_p extent, but adding pages to the bio failed because we overflowed bi_max_vecs. Oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 9 +++++++-- fs/bcachefs/move.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 4c4a7945a751..766584939304 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -455,12 +455,11 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c); bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); -static inline bool bkey_extent_is_data(const struct bkey *k) +static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: - case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: return true; default: @@ -468,6 +467,12 @@ static inline bool bkey_extent_is_data(const struct bkey *k) } } +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_reflink_p; +} + /* * Should extent be counted under inode->i_sectors? 
*/ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d01c96ff00d7..8855dd19f7f2 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -547,7 +547,7 @@ peek: if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (!bkey_extent_is_data(k.k)) + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; if (cur_inum != k.k->p.inode) { -- cgit From f9c5519336731174cc79ef23543909f9f4e11f64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 18:03:56 -0400 Subject: bcachefs: Drop trans arg to bch2_extent_atomic_end() Just for consistency Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++---- fs/bcachefs/extents.h | 4 ++-- fs/bcachefs/recovery.c | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 63afbf24a101..1d400808d842 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -993,11 +993,11 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, return ret; } -int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, +int bch2_extent_atomic_end(struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; @@ -1049,7 +1049,7 @@ int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + ret = bch2_extent_atomic_end(iter, k, &end); if (ret) return ret; @@ -1062,7 +1062,7 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + ret = bch2_extent_atomic_end(iter, k, &end); if (ret) return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 766584939304..c10388aee634 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -433,8 +433,8 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bpos *); +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5be34231a0c3..98d9a1432e50 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -281,8 +281,7 @@ retry: if (ret) goto err; - ret = bch2_extent_atomic_end(&trans, split_iter, - k, &atomic_end); + ret = bch2_extent_atomic_end(split_iter, k, &atomic_end); if (ret) goto err; -- cgit From 36e9d69854752bdad5c5b63f72e6c4901512c9a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 14:16:00 -0400 Subject: bcachefs: Do updates in order they were queued up in Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 41 ++++++++-- fs/bcachefs/btree_iter.h | 3 + fs/bcachefs/btree_types.h | 4 +- fs/bcachefs/btree_update.h | 25 ++---- fs/bcachefs/btree_update_leaf.c | 169 +++++++++++++++++++++------------------- fs/bcachefs/buckets.c | 23 ++---- fs/bcachefs/super.c | 7 +- 7 files changed, 144 insertions(+), 128 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a278921d3e6f..17596aee23cc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ 
-1671,7 +1671,10 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *new_iters, *new_updates, *new_sorted; + size_t iters_bytes; + size_t updates_bytes; + size_t sorted_bytes; new_size = roundup_pow_of_two(new_size); @@ -1684,9 +1687,13 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, bch2_trans_unlock(trans); - new_iters = kmalloc(sizeof(struct btree_iter) * new_size + - sizeof(struct btree_insert_entry) * (new_size + 4), - GFP_NOFS); + iters_bytes = sizeof(struct btree_iter) * new_size; + updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); + sorted_bytes = sizeof(u8) * (new_size + 4); + + new_iters = kmalloc(iters_bytes + + updates_bytes + + sorted_bytes, GFP_NOFS); if (new_iters) goto success; @@ -1695,7 +1702,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, trans->used_mempool = true; success: - new_updates = new_iters + sizeof(struct btree_iter) * new_size; + new_updates = new_iters + iters_bytes; + new_sorted = new_updates + updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1710,9 +1718,10 @@ success: if (trans->iters != trans->iters_onstack) kfree(trans->iters); - trans->iters = new_iters; - trans->updates = new_updates; - trans->size = new_size; + trans->iters = new_iters; + trans->updates = new_updates; + trans->updates_sorted = new_sorted; + trans->size = new_size; if (trans->iters_live) { trace_trans_restart_iters_realloced(trans->ip, trans->size); @@ -1958,6 +1967,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates_sorted = trans->updates_sorted_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -1982,3 +1992,18 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? 
-EIO : 0; } + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + mempool_exit(&c->btree_iters_pool); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + unsigned nr = BTREE_ITER_MAX; + + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * (nr + 4) + + sizeof(u8) * (nr + 4)); +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b54351073231..b52d8bff0115 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -303,4 +303,7 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); +void bch2_fs_btree_iter_exit(struct bch_fs *); +int bch2_fs_btree_iter_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 621cbfa22fc9..88e048fa0fba 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -264,8 +264,6 @@ struct btree_insert_entry { }; bool deferred; - bool triggered; - bool marked; }; #define BTREE_ITER_MAX 64 @@ -294,6 +292,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + u8 *updates_sorted; /* update path: */ struct journal_res journal_res; @@ -305,6 +304,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; + u8 updates_sorted_onstack[6]; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 08c17477e76c..36e34b3d9213 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -119,8 +119,13 @@ int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, u64 *, unsigned); -struct btree_insert_entry *bch2_trans_update(struct btree_trans *, - struct btree_insert_entry); +static inline void bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); + + trans->updates[trans->nr_updates++] = entry; +} #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ({ \ @@ -140,18 +145,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *, _ret; \ }) -/* - * We sort transaction entries so that if multiple iterators point to the same - * leaf node they'll be adjacent: - */ -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - !i->deferred && - i[0].iter->l[0].b == i[-1].iter->l[0].b; -} - #define __trans_next_update(_trans, _i, _filter) \ ({ \ while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ @@ -171,8 +164,4 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, #define trans_for_each_update_iter(trans, i) \ __trans_for_each_update(trans, i, !(i)->deferred) -#define trans_for_each_update_leaf(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred && \ - !same_leaf_as_prev(trans, i)) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 66b12e55d946..657359059a08 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -19,6 +19,26 @@ #include +static inline bool same_leaf_as_prev(struct btree_trans *trans, + unsigned sorted_idx) +{ + struct btree_insert_entry *i = trans->updates + + trans->updates_sorted[sorted_idx]; + struct btree_insert_entry *prev = sorted_idx + ? 
trans->updates + trans->updates_sorted[sorted_idx - 1] + : NULL; + + return !i->deferred && + prev && + i->iter->l[0].b == prev->iter->l[0].b; +} + +#define trans_for_each_update_sorted(_trans, _i, _iter) \ + for (iter = 0; \ + _iter < _trans->nr_updates && \ + (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ + _iter++) + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -36,20 +56,21 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +static void btree_trans_lock_write(struct btree_trans *trans, bool lock) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); -} - -static void btree_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; + trans_for_each_update_sorted(trans, i, iter) { + if (same_leaf_as_prev(trans, iter)) + continue; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + if (lock) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); + else + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + } } static inline int btree_trans_cmp(struct btree_insert_entry l, @@ -59,6 +80,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } +static inline void btree_trans_sort_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *l, *r; + unsigned nr = 0, pos; + + trans_for_each_update(trans, l) { + for (pos = 0; pos < nr; pos++) { + r = trans->updates + trans->updates_sorted[pos]; + + if (btree_trans_cmp(*l, *r) <= 0) + break; + } + + memmove(&trans->updates_sorted[pos + 1], + &trans->updates_sorted[pos], + (nr - pos) * sizeof(trans->updates_sorted[0])); + + trans->updates_sorted[pos] = l - trans->updates; + nr++; + } + + BUG_ON(nr != trans->nr_updates); +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -488,12 +533,12 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct btree_insert_entry *i; - unsigned u64s = 0; + unsigned iter, u64s = 0; int ret; - trans_for_each_update_iter(trans, i) { + trans_for_each_update_sorted(trans, i, iter) { /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) + if (!same_leaf_as_prev(trans, iter)) u64s = 0; u64s += i->k->k.u64s; @@ -542,7 +587,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; - bool saw_non_marked; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -551,35 +595,31 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ trans_for_each_update_iter(trans, i) - i->marked = false; - - do { - saw_non_marked = false; - - trans_for_each_update_iter(trans, i) { - if (i->marked) - continue; - - saw_non_marked = true; - i->marked = true; - - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + if (ret) + goto out_clear_replicas; } - } while (saw_non_marked); trans_for_each_update(trans, i) btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); - btree_trans_lock_write(c, trans); + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); + + btree_trans_lock_write(trans, true); if (race_fault()) { ret = -EINTR; @@ -597,8 +637,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out; trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) + if (!btree_node_type_needs_gc(i->iter->btree_id)) continue; if (!fs_usage) { @@ -664,7 +703,7 @@ out: (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); - btree_trans_unlock_write(trans); + btree_trans_lock_write(trans, false); if (fs_usage) { bch2_fs_usage_scratch_put(c, fs_usage); @@ -689,19 +728,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; - struct btree_insert_entry *src, *dst; - - src = dst = trans->updates; - - while (src < trans->updates + trans->nr_updates) { - if (!src->triggered) { - *dst = *src; - dst++; - } - src++; - } - - trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -816,6 +842,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; int ret; trans_for_each_update_iter(trans, i) { @@ -837,8 +864,10 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update_leaf(trans, i) - bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + trans_for_each_update_sorted(trans, i, iter) + if (!same_leaf_as_prev(trans, iter)) + bch2_foreground_maybe_merge(c, i->iter, + 0, trans->flags); trans->nounlock = false; @@ -858,7 +887,8 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; - unsigned orig_mem_top = trans->mem_top; + unsigned orig_nr_updates = trans->nr_updates; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) @@ -931,39 +961,20 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + /* can't loop if it was 
passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; - if (!ret) { - /* free memory used by triggers, they'll be reexecuted: */ - trans->mem_top = orig_mem_top; + if (!ret) goto retry; - } goto out; } -struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (btree_trans_cmp(entry, *i) < 0) - break; - - memmove(&i[1], &i[0], - (void *) &trans->updates[trans->nr_updates] - (void *) i); - trans->nr_updates++; - *i = entry; - return i; -} - /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e432a4507081..b6340a2f6deb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1358,11 +1358,8 @@ static int trans_get_key(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (!i->deferred && - i->iter->btree_id == btree_id && + trans_for_each_update_iter(trans, i) + if (i->iter->btree_id == btree_id && (btree_node_type_is_extents(btree_id) ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && bkey_cmp(pos, i->k->k.p) < 0 @@ -1390,8 +1387,8 @@ static void *trans_update_key(struct btree_trans *trans, struct btree_iter *iter, unsigned u64s) { + struct btree_insert_entry *i; struct bkey_i *new_k; - unsigned i; new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(new_k)) @@ -1400,19 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter == iter) { - trans->updates[i].k = new_k; + trans_for_each_update_iter(trans, i) + if (i->iter == iter) { + i->k = new_k; return new_k; } - bch2_trans_update(trans, ((struct btree_insert_entry) { - .iter = iter, - .k = new_k, - .triggered = true, - })); - + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); return new_k; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 202c0b443ef4..14e2f6828cc6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -462,6 +462,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); + bch2_fs_btree_iter_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); @@ -474,7 +475,6 @@ static void bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[0]); kfree(c->usage_base); free_percpu(c->pcpu); - mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); @@ -729,15 +729,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || - mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, - sizeof(struct btree_iter) * BTREE_ITER_MAX + - sizeof(struct btree_insert_entry) * - (BTREE_ITER_MAX + 4)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || bch2_fs_replicas_init(c) || bch2_fs_btree_cache_init(c) || + bch2_fs_btree_iter_init(c) || bch2_fs_io_init(c) || 
bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || -- cgit From c0fc30dad5820b9e7d27355ec8a507f61d27a299 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 19:17:40 -0400 Subject: bcachefs: __bch2_btree_node_iter_fix() improvements Being more rigorous about noting when the key the iterator currently poins to has changed - which should also give us a nice performance improvement due to not having to check if we have to skip other bsets backwards as much. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 67 ++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 17596aee23cc..44aa5231edd4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -526,6 +526,10 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; unsigned old_end = t->end_offset - shift; + unsigned orig_iter_pos = node_iter->data[0].k; + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -534,24 +538,18 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_push(node_iter, b, where, end); - - if (!b->c.level && - node_iter == &iter->l[0].iter) - bkey_disassemble(b, - bch2_btree_node_iter_peek_all(node_iter, b), - &iter->k); + goto fixup_done; + } else { + /* Iterator is after key that changed */ + goto out_verify; } - - goto iter_current_key_not_modified; found: set->end = t->end_offset; /* Iterator hasn't gotten to the key that changed yet: */ if (set->k < offset) - return; + goto out_verify; if (new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { @@ -561,40 +559,25 @@ found: if (set->k == set->end) bch2_btree_node_iter_set_drop(node_iter, set); } else { + /* Iterator is after key that changed */ set->k = (int) set->k + shift; - goto iter_current_key_not_modified; + goto out_verify; } - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_sort(node_iter, b); - if (!b->c.level && node_iter == &iter->l[0].iter) { - /* - * not legal to call bkey_debugcheck() here, because we're - * called midway through the update path after update has been - * marked but before deletes have actually happened: - */ -#if 0 - __btree_iter_peek_all(iter, &iter->l[0], &iter->k); -#endif - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); +fixup_done: + if (node_iter->data[0].k != orig_iter_pos) + iter_current_key_modified = true; - if (unlikely(!k)) - iter->k.type = KEY_TYPE_deleted; - else - bkey_disassemble(l->b, k, &iter->k); - } -iter_current_key_not_modified: /* * When a new key is added, and the node iterator now points to that * key, the iterator might have skipped past deleted keys that should * come after the key the iterator now points to. 
We have to rewind to - * before those deleted keys - otherwise bch2_btree_node_iter_prev_all() - * breaks: + * before those deleted keys - otherwise + * bch2_btree_node_iter_prev_all() breaks: */ if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && (b->c.level || (iter->flags & BTREE_ITER_IS_EXTENTS))) { struct bset_tree *t; @@ -622,6 +605,22 @@ iter_current_key_not_modified: } } + if (!b->c.level && + node_iter == &iter->l[0].iter && + iter_current_key_modified) { + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(node_iter, b); + + if (likely(k)) { + bkey_disassemble(b, k, &iter->k); + } else { + /* XXX: for extents, calculate size of hole? */ + iter->k.type = KEY_TYPE_deleted; + } + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } +out_verify: bch2_btree_node_iter_verify(node_iter, b); } -- cgit From 63095894686cb4e16ad6a8329e95681cee63d615 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jul 2019 13:37:02 -0400 Subject: bcachefs: Improved bch2_fcollapse() Move extents instead of copying them - this way, we can iterate over only live extents, not the entire keyspace. Also, this means we can mostly skip running triggers. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 1 + fs/bcachefs/fs-io.c | 126 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 88 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b6340a2f6deb..637a9e909f82 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1486,6 +1486,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", old, sectors); + BUG_ON(overflow); a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0dfe822cecbf..4a016c19dcbd 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2602,9 +2602,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct btree_trans trans; - struct btree_iter *src, *dst; - BKEY_PADDED(k) copy; - struct bkey_s_c k; + struct btree_iter *src, *dst, *del = NULL; loff_t new_size; int ret; @@ -2636,74 +2634,124 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; + ret = __bch2_fpunch(c, inode, offset >> 9, + (offset + len) >> 9); + if (ret) + goto err; + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_INTENT); BUG_ON(IS_ERR_OR_NULL(dst)); src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_SLOTS); + POS(inode->v.i_ino, (offset + len) >> 9), + BTREE_ITER_INTENT); BUG_ON(IS_ERR_OR_NULL(src)); - while (bkey_cmp(dst->pos, - POS(inode->v.i_ino, - round_up(new_size, block_bytes(c)) >> 9)) < 0) { - struct disk_reservation disk_res; + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + BKEY_PADDED(k) copy; + struct bkey_i delete; + struct bkey_s_c k; + struct bpos next_pos; + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE; - ret = bch2_btree_iter_traverse(dst); - if (ret) + k = bch2_btree_iter_peek(src); + if ((ret = bkey_err(k))) goto bkey_err; - bch2_btree_iter_set_pos(src, - POS(dst->pos.inode, dst->pos.offset + (len >> 9))); + if (!k.k || k.k->p.inode != 
inode->v.i_ino) + break; - k = bch2_btree_iter_peek_slot(src); - if ((ret = bkey_err(k))) - goto bkey_err; + BUG_ON(src->pos.offset != bkey_start_offset(k.k)); - bkey_reassemble(©.k, k); + bch2_btree_iter_set_pos(dst, + POS(inode->v.i_ino, src->pos.offset - (len >> 9))); - bch2_cut_front(src->pos, ©.k); - copy.k.k.p.offset -= len >> 9; + ret = bch2_btree_iter_traverse(dst); + if (ret) + goto bkey_err; + bkey_reassemble(©.k, k); + copy.k.k.p = dst->pos; + copy.k.k.p.offset += copy.k.k.size; ret = bch2_extent_trim_atomic(©.k, dst); if (ret) goto bkey_err; - BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); + bkey_init(&delete.k); + delete.k.p = src->pos; + bch2_key_resize(&delete.k, copy.k.k.size); - ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); + next_pos = delete.k.p; - bch2_trans_begin_updates(&trans); + /* + * If the new and old keys overlap (because we're moving an + * extent that's bigger than the amount we're collapsing by), + * we need to trim the delete key here so they don't overlap + * because overlaps on insertions aren't handled before + * triggers are run, so the overwrite will get double counted + * by the triggers machinery: + */ + if (bkey_cmp(copy.k.k.p, bkey_start_pos(&delete.k)) > 0) { + bch2_cut_front(copy.k.k.p, &delete); - ret = bch2_extent_update(&trans, inode, - &disk_res, NULL, - dst, ©.k, - 0, true, true, NULL); + del = bch2_trans_copy_iter(&trans, src); + BUG_ON(IS_ERR_OR_NULL(del)); + + bch2_btree_iter_set_pos(del, + bkey_start_pos(&delete.k)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(del, &delete)); + } else { + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(src, &delete)); + } + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, ©.k)); + + if (copy.k.k.size == k.k->size) { + /* + * If we're moving the entire extent, we can skip + * running triggers: + */ + commit_flags |= BTREE_INSERT_NOMARK; + } else { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, + copy.k.k.size, nr_ptrs, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + } + + ret = bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, + commit_flags); bch2_disk_reservation_put(c, &disk_res); bkey_err: + if (del) + bch2_trans_iter_free(&trans, del); + del = NULL; + + if (!ret) + bch2_btree_iter_set_pos(src, next_pos); + if (ret == -EINTR) ret = 0; if (ret) goto err; - /* - * XXX: if we error here we've left data with multiple - * pointers... which isn't a _super_ serious problem... 
- */ bch2_trans_cond_resched(&trans); } bch2_trans_unlock(&trans); - ret = __bch2_fpunch(c, inode, - round_up(new_size, block_bytes(c)) >> 9, - U64_MAX); - if (ret) - goto err; - i_size_write(&inode->v, new_size); mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, new_size, -- cgit From 5a8a52d6107328d49b1fb9850ed53607cf41e583 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Sep 2019 14:43:34 -0400 Subject: bcachefs: Fix a typo _iter, not iter Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 657359059a08..007d772d6e8f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -34,7 +34,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, } #define trans_for_each_update_sorted(_trans, _i, _iter) \ - for (iter = 0; \ + for (_iter = 0; \ _iter < _trans->nr_updates && \ (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ _iter++) -- cgit From 9b02d1c49a261d711b8d9587afa55cdf3cad97f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Sep 2019 14:00:12 -0400 Subject: bcachefs: Optimize calls to bch2_btree_iter_traverse() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 ++++++++++++---------------- fs/bcachefs/btree_iter.h | 11 ++++++++++- 2 files changed, 22 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 44aa5231edd4..547a07865ac6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -955,10 +955,10 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -int __must_check __bch2_btree_iter_traverse(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *); static int __btree_iter_traverse_all(struct btree_trans *trans, - struct btree_iter *orig_iter, int ret) + struct btree_iter *orig_iter, int ret) { struct bch_fs *c = trans->c; struct btree_iter *iter; @@ -1002,7 +1002,7 @@ retry_all: iter = &trans->iters[sorted[i]]; do { - ret = __bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse_one(iter); } while (ret == -EINTR); if (ret) @@ -1047,7 +1047,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). 
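[Illustrative aside on the bch2_fcollapse() rework above - not bcachefs code; the struct and function names here are invented for the example. Collapsing a range amounts to punching the hole and then visiting only the live extents above it, shifting each one down by the length of the hole; extents straddling the hole are ignored here for brevity, where the real patch trims them with bch2_cut_front()/bch2_cut_back().]

/* Toy model of FALLOC_FL_COLLAPSE_RANGE over an in-memory extent list. */
#include <stdio.h>

struct toy_extent { unsigned long long start, len; };

/*
 * Collapse [offset, offset+len): drop extents inside the hole and shift
 * everything above it down by len - only live extents are visited, never
 * the empty keyspace in between.
 */
static int toy_collapse(struct toy_extent *e, int nr,
                        unsigned long long offset, unsigned long long len)
{
        int src, dst = 0;

        for (src = 0; src < nr; src++) {
                if (e[src].start + e[src].len <= offset) {
                        e[dst++] = e[src];              /* wholly below the hole */
                } else if (e[src].start >= offset + len) {
                        e[src].start -= len;            /* shift down */
                        e[dst++] = e[src];
                }
                /* extents inside [offset, offset+len) are simply dropped */
        }
        return dst;                                     /* new extent count */
}

int main(void)
{
        struct toy_extent e[] = { { 0, 8 }, { 16, 8 }, { 32, 8 } };
        int i, nr = toy_collapse(e, 3, 8, 16);          /* collapse [8, 24) */

        for (i = 0; i < nr; i++)
                printf("extent %d: start=%llu len=%llu\n", i, e[i].start, e[i].len);
        return 0;
}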
*/ -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter) { unsigned depth_want = iter->level; @@ -1099,12 +1099,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return 0; } -int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; ret = bch2_trans_cond_resched(iter->trans) ?: - __bch2_btree_iter_traverse(iter); + btree_iter_traverse_one(iter); if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); @@ -1296,11 +1296,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return btree_iter_peek_uptodate(iter); while (1) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); if (likely(k.k)) @@ -1564,11 +1562,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); return __bch2_btree_iter_peek_slot(iter); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b52d8bff0115..34c08428a048 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -134,7 +134,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *); + +static inline int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? 
__bch2_btree_iter_traverse(iter) + : 0; +} + int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -- cgit From 4cac0bf2c25d9056c4cc24c27948774fa2591c5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Sep 2019 10:45:46 -0400 Subject: bcachefs: Add missing bch2_btree_node_iter_fix() calls With multiple iterators, if another iterator points to the key being modified, we need to call bch2_btree_node_iter_fix() to re-unpack the key into the iter->k Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1d400808d842..2cf97df6b85b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1138,6 +1138,8 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); verify_modified_extent(iter, _k); break; @@ -1207,6 +1209,8 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); verify_modified_extent(iter, _k); extent_bset_insert(c, iter, &split.k); -- cgit From a9d1f9101832fa1f495ab4b5b083d60eef55bd55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Sep 2019 10:47:14 -0400 Subject: bcachefs: Debug code improvements .key_debugcheck no longer needs to take a pointer to the btree node Also, try to make sure wherever we're inserting or modifying keys in the btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/bkey_methods.h | 3 +-- fs/bcachefs/extents.c | 19 +++++++++++-------- fs/bcachefs/extents.h | 5 ++--- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 6fa6ac1fadc1..f01405dd502b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -145,7 +145,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) } if (ops->key_debugcheck) - ops->key_debugcheck(c, b, k); + ops->key_debugcheck(c, k); } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index e6e97cda4f50..8568b65c1ed2 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -26,8 +26,7 @@ struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct btree *, - struct bkey_s_c); + void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2cf97df6b85b..d7ce87bd4374 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -671,8 +671,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr 
*ptr; @@ -895,6 +894,9 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + node_iter = l->iter; k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); if (k && !bkey_written(l->b, k) && @@ -1362,10 +1364,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (s.deleting) tmp.k.k.type = KEY_TYPE_discard; - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&tmp.k)); - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); extent_bset_insert(c, iter, &tmp.k); @@ -1390,8 +1388,7 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -1765,6 +1762,12 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, if (ret == BCH_MERGE_NOMERGE) return false; + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); + if (debug_check_bkeys(c) && + ret == BCH_MERGE_PARTIAL) + bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); + /* * check if we overlap with deleted extents - would break the sort * order: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c10388aee634..7253cd01db6a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -389,8 +389,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); /* bch_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, - struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); @@ -405,7 +404,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); /* bch_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, -- cgit From 3745efd618b24d15443f9d1bf75744bea9c3a73f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Sep 2019 14:50:02 -0400 Subject: bcachefs: Improve btree_iter_pos_in_node() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 547a07865ac6..5b2a5aa05403 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -742,18 +742,29 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->c.level + 1); } +static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(iter->pos, b->data->min_key) < 0; +} + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, 
struct btree *b) { - return __btree_iter_pos_cmp(iter, NULL, - bkey_to_packed(&b->key), true) < 0; + int cmp = bkey_cmp(b->key.k.p, iter->pos); + + if (!cmp && + (iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(b->key.k.p, POS_MAX)) + cmp = -1; + return cmp < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b) { return iter->btree_id == b->c.btree_id && - bkey_cmp(iter->pos, b->data->min_key) >= 0 && + !btree_iter_pos_before_node(iter, b) && !btree_iter_pos_after_node(iter, b); } -- cgit From f4b613410ccf9e3129d1466c23b28c23a77dd69a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 19:19:57 -0400 Subject: bcachefs: More btree iter improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 156 +++++++++++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5b2a5aa05403..b5dace40534c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1031,16 +1031,27 @@ int bch2_btree_iter_traverse_all(struct btree_trans *trans) return __btree_iter_traverse_all(trans, NULL, 0); } -static unsigned btree_iter_up_until_locked(struct btree_iter *iter, - bool check_pos) +static inline bool btree_iter_good_node(struct btree_iter *iter, + unsigned l, int check_pos) +{ + if (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l)) + return false; + + if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + return false; + if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + return false; + return true; +} + +static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + int check_pos) { unsigned l = iter->level; while (btree_iter_node(iter, l) && - (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l) || - (check_pos && - !btree_iter_pos_in_node(iter, iter->l[l].b)))) { + !btree_iter_good_node(iter, l, check_pos)) { btree_node_unlock(iter, l); iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; @@ -1072,7 +1083,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary */ - iter->level = btree_iter_up_until_locked(iter, true); + iter->level = btree_iter_up_until_good_node(iter, 0); /* * If we've got a btree node locked (i.e. 
we aren't about to relock the @@ -1080,8 +1091,11 @@ static int btree_iter_traverse_one(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) + if (btree_iter_node(iter, iter->level)) { + BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); + } /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that @@ -1244,19 +1258,11 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - int cmp = bkey_cmp(new_pos, iter->pos); - unsigned level; - - if (!cmp) - return; - - iter->pos = new_pos; - - level = btree_iter_up_until_locked(iter, true); + unsigned l = btree_iter_up_until_good_node(iter, cmp); - if (btree_iter_node(iter, level)) { + if (btree_iter_node(iter, l)) { /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1264,20 +1270,71 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) * is expensive). */ if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[level], 8)) - __btree_iter_init(iter, level); + !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) + __btree_iter_init(iter, l); /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level); + if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, l); } - if (level != iter->level) + return l; +} + +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned l; + + if (!cmp) + return; + + iter->pos = new_pos; + + l = btree_iter_pos_changed(iter, cmp); + + if (l != iter->level) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); else btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } +static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) { + bkey_init(&iter->k); + iter->k.p = POS_MAX; + return false; + } + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, 1); + return true; +} + +static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->data->min_key; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MIN)) { + bkey_init(&iter->k); + iter->k.p = POS_MIN; + return false; + } + + iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, -1); + return true; +} + static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1295,6 +1352,10 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) return ret; } +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1315,14 +1376,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter 
*iter) if (likely(k.k)) break; - /* got to the end of the leaf, iterator needs to be traversed: */ - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) + if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); } /* @@ -1337,22 +1392,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return k; } -static noinline -struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) - return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); - - return bch2_btree_iter_peek(iter); -} - +/** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position + */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1361,15 +1404,19 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + /* * XXX: when we just need to relock we should be able to avoid * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK * for that to work */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); return bch2_btree_iter_peek(iter); } @@ -1377,9 +1424,12 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (unlikely(!p)) - return bch2_btree_iter_peek_next_leaf(iter); - } while (bkey_whiteout(p)); + } while (likely(p) && bkey_whiteout(p)); + + if (unlikely(!p)) + return btree_iter_set_pos_to_next_leaf(iter) + ? bch2_btree_iter_peek(iter) + : bkey_s_c_null; k = __btree_iter_unpack(iter, l, &iter->k, p); -- cgit From 4d13e818f5c00ae064aefba8349383cc2d79bf6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Sep 2019 19:33:12 -0400 Subject: bcachefs: Avoid deadlocking on the allocator The allocator needs to make sure there's buckets available on the RESERVE_NONE freelist if at all possible - otherwise foreground IO will get stuck. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54051161eba7..85795b580892 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1164,7 +1164,7 @@ static int bch2_allocator_thread(void *arg) */ if (!nr || (nr < ALLOC_SCAN_BATCH(ca) && - !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + !fifo_empty(&ca->free[RESERVE_NONE]))) { ret = wait_buckets_available(c, ca); if (ret) { up_read(&c->gc_lock); -- cgit From 554d219ebb3420d21b395e1ca018ed74524b5480 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Sep 2019 16:01:32 -0400 Subject: bcachefs: Add missing bch2_btree_node_iter_fix() call Any time we're modifying what's in the btree, iterators potentially have to be updated - this one was exposed by the reflink code. 
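[The invariant behind these missing bch2_btree_node_iter_fix() calls can be shown with a small standalone model - again, not the bcachefs data structures, just two cursors over one shared array: whenever one cursor mutates the container, every other cursor positioned at or after the change must be re-adjusted, or it silently points at the wrong element.]

#include <stdio.h>

/* Two cursors over one shared array of keys. */
struct cursor { int pos; };

static int keys[8] = { 10, 20, 30, 40 };
static int nr_keys = 4;

/*
 * Insert a key at index idx, then fix up every cursor that was at or after
 * the insertion point - the moral equivalent of calling
 * bch2_btree_node_iter_fix() on all iterators sharing the node.
 */
static void insert_key(int idx, int val, struct cursor **cursors, int nr)
{
        int i;

        for (i = nr_keys; i > idx; i--)
                keys[i] = keys[i - 1];
        keys[idx] = val;
        nr_keys++;

        for (i = 0; i < nr; i++)
                if (cursors[i]->pos >= idx)
                        cursors[i]->pos++;      /* shift past the new key */
}

int main(void)
{
        struct cursor a = { .pos = 1 }, b = { .pos = 2 };
        struct cursor *all[] = { &a, &b };

        insert_key(1, 15, all, 2);
        /* Both cursors still name the keys they named before the insert: */
        printf("a -> %d, b -> %d\n", keys[a.pos], keys[b.pos]);
        return 0;
}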
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 ++--- fs/bcachefs/extents.c | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 007d772d6e8f..d97d80859d1b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -183,9 +183,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); bch2_btree_iter_verify(iter, b); return true; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d7ce87bd4374..00d77ed01234 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1270,6 +1270,8 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, btree_account_key_drop(l->b, _k); _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); } break; } -- cgit From 059e4134d26a3ada46dc29a849c8cfa5e0b7fd42 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Sep 2019 16:07:41 -0400 Subject: bcachefs: Debug assertion improvements Call bch2_btree_iter_verify from bch2_btree_node_iter_fix(); also verify in btree_iter_peek_uptodate() that iter->k matches what's in the btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 40 ++++++++++++++++++++++++---------------- fs/bcachefs/btree_update_leaf.c | 3 --- fs/bcachefs/extents.c | 18 ++---------------- 3 files changed, 26 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b5dace40534c..f64cf78d68fa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -542,14 +542,14 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, goto fixup_done; } else { /* Iterator is after key that changed */ - goto out_verify; + return; } found: set->end = t->end_offset; /* Iterator hasn't gotten to the key that changed yet: */ if (set->k < offset) - goto out_verify; + return; if (new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { @@ -561,7 +561,7 @@ found: } else { /* Iterator is after key that changed */ set->k = (int) set->k + shift; - goto out_verify; + return; } bch2_btree_node_iter_sort(node_iter, b); @@ -620,8 +620,6 @@ fixup_done: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -out_verify: - bch2_btree_node_iter_verify(node_iter, b); } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -634,14 +632,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_iter *linked; - if (node_iter != &iter->l[b->c.level].iter) + if (node_iter != &iter->l[b->c.level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, - where, clobber_u64s, new_u64s); + where, clobber_u64s, new_u64s); + bch2_btree_node_iter_verify(node_iter, b); + } - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->c.level].iter, t, - where, clobber_u64s, new_u64s); + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); + __bch2_btree_iter_verify(linked, b); + } } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter 
*iter, @@ -1341,14 +1343,20 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) struct bkey_s_c ret = { .k = &iter->k }; if (!bkey_deleted(&iter->k)) { - EBUG_ON(bch2_btree_node_iter_end(&l->iter)); - ret.v = bkeyp_val(&l->b->format, - __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + struct bkey_packed *_k = + __bch2_btree_node_iter_peek_all(&l->iter, l->b); + + ret.v = bkeyp_val(&l->b->format, _k); + + if (debug_check_iterators(iter->trans->c)) { + struct bkey k = bkey_unpack_key(l->b, _k); + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + + if (debug_check_bkeys(iter->trans->c)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } - if (debug_check_bkeys(iter->trans->c) && - !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d97d80859d1b..0b3eed506c2c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -151,7 +151,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, 0); - bch2_btree_iter_verify(iter, b); return true; } @@ -161,7 +160,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); - bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { reserve_whiteout(b, k); @@ -185,7 +183,6 @@ overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, k->u64s); - bch2_btree_iter_verify(iter, b); return true; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 00d77ed01234..859b1e8206bd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -875,13 +875,6 @@ static void verify_extent_nonoverlapping(struct bch_fs *c, #endif } -static void verify_modified_extent(struct btree_iter *iter, - struct bkey_packed *k) -{ - bch2_btree_iter_verify(iter, iter->l[0].b); - bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); -} - static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { @@ -923,7 +916,6 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_bset_insert(l->b, &l->iter, k, insert, 0); bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); - bch2_btree_iter_verify(iter, l->b); } static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) @@ -1138,17 +1130,16 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_back(bkey_start_pos(&insert->k), k.k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); /* @@ -1159,7 +1150,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bch2_bset_fix_invalidated_key(l->b, _k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { @@ -1176,12 +1166,10 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bch2_bset_delete(l->b, _k, _k->u64s); 
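[The new debug_check_iterators check in btree_iter_peek_uptodate() boils down to: whenever we trust a cached, already-unpacked copy of the current key, re-unpack it from the node in debug builds and verify the two agree. A minimal standalone analogue follows - invented names, not the bcachefs API.]

#include <assert.h>
#include <stdio.h>
#include <string.h>

struct packed_key { unsigned short inode, offset; };            /* "on disk" form */
struct key        { unsigned long long inode, offset; };        /* unpacked form  */

static void unpack(struct key *dst, const struct packed_key *src)
{
        dst->inode  = src->inode;
        dst->offset = src->offset;
}

struct iter {
        const struct packed_key *pos;   /* position in the node */
        struct key k;                   /* cached unpacked copy of *pos */
};

/* Debug-only: the cached copy must always match a fresh unpack. */
static void iter_verify(const struct iter *iter)
{
        struct key fresh;

        unpack(&fresh, iter->pos);
        assert(!memcmp(&fresh, &iter->k, sizeof(fresh)));
}

int main(void)
{
        struct packed_key node[] = { { 1, 100 } };
        struct iter iter = { .pos = &node[0] };

        unpack(&iter.k, iter.pos);
        iter_verify(&iter);             /* passes: cache is in sync */

        node[0].offset = 200;           /* someone modified the node ...       */
        /* ... without fixing up the iterator; the check would now trip:
         * iter_verify(&iter);
         */
        printf("cached offset %llu, node offset %d\n",
               iter.k.offset, node[0].offset);
        return 0;
}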
bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - bch2_btree_iter_verify(iter, l->b); } else { extent_save(l->b, _k, k.k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); } break; @@ -1213,7 +1201,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, extent_save(l->b, _k, k.k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); extent_bset_insert(c, iter, &split.k); break; @@ -1806,7 +1793,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bset_fix_invalidated_key(b, m); bch2_btree_node_iter_fix(iter, b, node_iter, m, m->u64s, m->u64s); - verify_modified_extent(iter, m); return ret == BCH_MERGE_MERGE; } -- cgit From 9c37b63207e4257cf5a14a412c6b586ae47680be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Sep 2019 16:20:38 -0400 Subject: bcachefs: Check for extents past eof correctly bcachefs used to work mostly in terms of PAGE_SIZE, not block size at the vfs level - but that has since been fixed. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e3738757b6a0..50a7d8c1faba 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -509,7 +509,7 @@ retry: if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, + k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { bch2_trans_unlock(&trans); -- cgit From 6cc3535dcbb8bca4df496b9beac8f4c664958fce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Sep 2019 18:05:04 -0400 Subject: bcachefs: Don't write past eof When converting from PAGE_SIZE to block_size, the .mkwrite path was missed Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4a016c19dcbd..f05950da6957 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -801,6 +801,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_page_reservation res; + unsigned len; + loff_t isize; int ret = VM_FAULT_LOCKED; bch2_page_reservation_init(c, inode, &res); @@ -817,21 +819,27 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_pagecache_add_get(&inode->ei_pagecache_lock); lock_page(page); - if (page->mapping != mapping || - page_offset(page) > i_size_read(&inode->v)) { + isize = i_size_read(&inode->v); + + if (page->mapping != mapping || page_offset(page) >= isize) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; } - if (bch2_page_reservation_get(c, inode, page, &res, - 0, PAGE_SIZE, true)) { + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) <= isize) + len = PAGE_SIZE; + else + len = offset_in_page(isize); + + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; } - bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE); + bch2_set_page_dirty(c, inode, page, &res, 0, len); wait_for_stable_page(page); out: 
bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -1433,6 +1441,10 @@ do_io: BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, sectors << 9, offset << 9)); + /* Check for writing past i_size: */ + BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); + w->io->op.op.res.sectors += reserved_sectors; w->io->op.new_i_size = i_size; @@ -2518,6 +2530,16 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (unlikely(ret)) goto err; + /* + * When extending, we're going to write the new i_size to disk + * immediately so we need to flush anything above the current on disk + * i_size first: + * + * Also, when extending we need to flush the page that i_size currently + * straddles - if it's mapped to userspace, we need to ensure that + * userspace has to redirty it and call .mkwrite -> set_page_dirty + * again to allocate the part of the page that was extended. + */ if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, inode->ei_inode.bi_size, -- cgit From ccf5a1095892633bdb4bd1ac6f7f60aa9c4f327b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 17:17:21 -0400 Subject: bcachefs: bch2_btree_iter_peek_prev() Last of the basic operations for iterating forwards and backwards over the btree: we now have - peek(), returns key >= iter->pos - next(), returns key > iter->pos - peek_prev(), returns key <= iter->pos - prev(), returns key < iter->pos Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 81 ++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/btree_iter.h | 2 ++ 2 files changed, 60 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f64cf78d68fa..d65edc460b07 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -686,6 +686,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); } +static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, + struct btree_iter_level *l) +{ + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_prev(&l->iter, l->b)); +} + static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) @@ -1446,51 +1453,79 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return k; } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +/** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position + */ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; struct bkey_s_c k; int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR(k.k)) - return k; - } + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); while (1) { - p = bch2_btree_node_iter_prev(&l->iter, l->b); - if (likely(p)) - break; - - iter->pos = l->b->data->min_key; - if (!bkey_cmp(iter->pos, POS_MIN)) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(iter, - btree_type_predecessor(iter->btree_id, iter->pos)); - ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - p = bch2_btree_node_iter_peek(&l->iter, l->b); - if (p) + k = __btree_iter_peek(iter, l); + if (!k.k || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + k = __btree_iter_prev(iter, 
l); + + if (likely(k.k)) break; - } - k = __btree_iter_unpack(iter, l, &iter->k, p); + if (!btree_iter_set_pos_to_prev_leaf(iter)) + return bkey_s_c_null; + } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); - iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; } +/** + * bch2_btree_iter_prev: returns first key less than iterator's current + * position + */ +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + iter->pos = btree_type_predecessor(iter->btree_id, + iter->pos); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + return bch2_btree_iter_peek_prev(iter); + } + + k = __btree_iter_prev(iter, l); + if (unlikely(!k.k)) + return btree_iter_set_pos_to_prev_leaf(iter) + ? bch2_btree_iter_peek(iter) + : bkey_s_c_null; + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); + iter->pos = bkey_start_pos(k.k); + return k; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 34c08428a048..7f76db5bb8bc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -151,6 +151,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -- cgit From 5f786787adf57c7597925a9df9897238cb3bc60e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Sep 2019 18:04:23 -0400 Subject: bcachefs: Add support for FALLOC_FL_INSERT_RANGE Somewhat tricky and ugly, because iterating over extents backwards is a pain. 
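[The four lookup operations listed in the bch2_btree_iter_peek_prev() patch above differ only in how a key is compared against the iterator position. A self-contained model over a sorted array - not the real iterator code - makes the >=, >, <= and < semantics explicit.]

#include <stdio.h>

static const int keys[] = { 10, 20, 30 };
#define NR_KEYS 3

/*
 * Scan forward for the first key for which match(key, pos) is true, or
 * backward for the last one; returns -1 if no such key exists.
 */
static int scan(int pos, int forward, int (*match)(int key, int pos))
{
        if (forward) {
                for (int i = 0; i < NR_KEYS; i++)
                        if (match(keys[i], pos))
                                return keys[i];
        } else {
                for (int i = NR_KEYS - 1; i >= 0; i--)
                        if (match(keys[i], pos))
                                return keys[i];
        }
        return -1;
}

static int ge(int k, int p) { return k >= p; }
static int gt(int k, int p) { return k >  p; }
static int le(int k, int p) { return k <= p; }
static int lt(int k, int p) { return k <  p; }

static int peek(int pos)      { return scan(pos, 1, ge); }      /* first key >= pos */
static int next(int pos)      { return scan(pos, 1, gt); }      /* first key >  pos */
static int peek_prev(int pos) { return scan(pos, 0, le); }      /* last  key <= pos */
static int prev(int pos)      { return scan(pos, 0, lt); }      /* last  key <  pos */

int main(void)
{
        int pos = 20;

        printf("pos=%d peek=%d next=%d peek_prev=%d prev=%d\n",
               pos, peek(pos), next(pos), peek_prev(pos), prev(pos));
        /* prints: pos=20 peek=20 next=30 peek_prev=20 prev=10 */
        return 0;
}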
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 131 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f05950da6957..142eb0c3cbbc 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2618,14 +2618,16 @@ err: return ret; } -static long bch2_fcollapse(struct bch_inode_info *inode, - loff_t offset, loff_t len) +static long bch2_fcollapse_finsert(struct bch_inode_info *inode, + loff_t offset, loff_t len, + bool insert) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct btree_trans trans; struct btree_iter *src, *dst, *del = NULL; - loff_t new_size; + loff_t shift, new_size; + u64 src_start; int ret; if ((offset | len) & (block_bytes(c) - 1)) @@ -2643,34 +2645,53 @@ static long bch2_fcollapse(struct bch_inode_info *inode, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - ret = -EINVAL; - if (offset + len >= inode->v.i_size) - goto err; + if (insert) { + ret = -EFBIG; + if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) + goto err; - if (inode->v.i_size < len) - goto err; + ret = -EINVAL; + if (offset >= inode->v.i_size) + goto err; - new_size = inode->v.i_size - len; + src_start = U64_MAX; + shift = len; + } else { + ret = -EINVAL; + if (offset + len >= inode->v.i_size) + goto err; - ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); - if (ret) - goto err; + src_start = offset + len; + shift = -len; + } + + new_size = inode->v.i_size + shift; - ret = __bch2_fpunch(c, inode, offset >> 9, - (offset + len) >> 9); + ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) goto err; - dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_INTENT); - BUG_ON(IS_ERR_OR_NULL(dst)); + if (insert) { + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + } else { + ret = __bch2_fpunch(c, inode, offset >> 9, + (offset + len) >> 9); + if (ret) + goto err; + } src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, (offset + len) >> 9), + POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); BUG_ON(IS_ERR_OR_NULL(src)); + dst = bch2_trans_copy_iter(&trans, src); + BUG_ON(IS_ERR_OR_NULL(dst)); + while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); @@ -2678,38 +2699,61 @@ static long bch2_fcollapse(struct bch_inode_info *inode, struct bkey_i delete; struct bkey_s_c k; struct bpos next_pos; + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; unsigned commit_flags = BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE; - k = bch2_btree_iter_peek(src); + k = insert + ? 
bch2_btree_iter_peek_prev(src) + : bch2_btree_iter_peek(src); if ((ret = bkey_err(k))) goto bkey_err; if (!k.k || k.k->p.inode != inode->v.i_ino) break; - BUG_ON(src->pos.offset != bkey_start_offset(k.k)); + BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); + + if (insert && + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; +reassemble: + bkey_reassemble(©.k, k); + + if (insert && + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { + bch2_cut_front(move_pos, ©.k); + bch2_btree_iter_set_pos(src, bkey_start_pos(©.k.k)); + } - bch2_btree_iter_set_pos(dst, - POS(inode->v.i_ino, src->pos.offset - (len >> 9))); + copy.k.k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); ret = bch2_btree_iter_traverse(dst); if (ret) goto bkey_err; - bkey_reassemble(©.k, k); - copy.k.k.p = dst->pos; - copy.k.k.p.offset += copy.k.k.size; - ret = bch2_extent_trim_atomic(©.k, dst); + ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); if (ret) goto bkey_err; + if (bkey_cmp(atomic_end, copy.k.k.p)) { + if (insert) { + move_pos = atomic_end; + move_pos.offset -= shift >> 9; + goto reassemble; + } else { + bch2_cut_back(atomic_end, ©.k.k); + } + } + bkey_init(&delete.k); delete.k.p = src->pos; bch2_key_resize(&delete.k, copy.k.k.size); - next_pos = delete.k.p; + next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; /* * If the new and old keys overlap (because we're moving an @@ -2719,7 +2763,12 @@ static long bch2_fcollapse(struct bch_inode_info *inode, * triggers are run, so the overwrite will get double counted * by the triggers machinery: */ - if (bkey_cmp(copy.k.k.p, bkey_start_pos(&delete.k)) > 0) { + if (insert && + bkey_cmp(bkey_start_pos(©.k.k), delete.k.p) < 0) { + bch2_cut_back(bkey_start_pos(©.k.k), &delete.k); + } else if (!insert && + bkey_cmp(copy.k.k.p, + bkey_start_pos(&delete.k)) > 0) { bch2_cut_front(copy.k.k.p, &delete); del = bch2_trans_copy_iter(&trans, src); @@ -2727,14 +2776,11 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(del, &delete)); - } else { - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(src, &delete)); } bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, ©.k)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(del ?: src, &delete)); if (copy.k.k.size == k.k->size) { /* @@ -2774,11 +2820,13 @@ bkey_err: } bch2_trans_unlock(&trans); - i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + if (!insert) { + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + } err: bch2_trans_exit(&trans); bch2_pagecache_block_put(&inode->ei_pagecache_lock); @@ -2947,8 +2995,11 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) return bch2_fpunch(inode, offset, len); + if (mode == FALLOC_FL_INSERT_RANGE) + return bch2_fcollapse_finsert(inode, offset, len, true); + if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch2_fcollapse(inode, offset, len); + return bch2_fcollapse_finsert(inode, offset, len, false); return -EOPNOTSUPP; } -- cgit From eab32c8e4e8a3addcea0884acdb5bbe4cdb66c5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Sep 2019 14:28:35 -0400 Subject: bcachefs: Fix validation of 
replicas entries When an extent is erasure coded, we need to record a replicas entry to indicate that data is present on the devices that extent has pointers to - but nr_required should be 0, because it's erasure coded. Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 64024ce01665..afd226f3c8e7 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -16,11 +16,16 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } -static void verify_replicas_entry_sorted(struct bch_replicas_entry *e) +static void verify_replicas_entry(struct bch_replicas_entry *e) { -#ifdef CONFIG_BCACHES_DEBUG +#ifdef CONFIG_BCACHEFS_DEBUG unsigned i; + BUG_ON(e->data_type >= BCH_DATA_NR); + BUG_ON(!e->nr_devs); + BUG_ON(e->nr_required > 1 && + e->nr_required >= e->nr_devs); + for (i = 0; i + 1 < e->nr_devs; i++) BUG_ON(e->devs[i] >= e->devs[i + 1]); #endif @@ -158,7 +163,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, }; BUG_ON(!new_entry->data_type); - verify_replicas_entry_sorted(new_entry); + verify_replicas_entry(new_entry); new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); if (!new.entries) @@ -185,7 +190,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, if (unlikely(entry_size > r->entry_size)) return -1; - verify_replicas_entry_sorted(search); + verify_replicas_entry(search); #define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, @@ -216,7 +221,7 @@ static bool bch2_replicas_marked_locked(struct bch_fs *c, if (!search->nr_devs) return true; - verify_replicas_entry_sorted(search); + verify_replicas_entry(search); return __replicas_has_entry(&c->replicas, search) && (!check_gc_replicas || @@ -363,6 +368,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_cpu new_r, new_gc; int ret = -ENOMEM; + verify_replicas_entry(new_entry); + memset(&new_r, 0, sizeof(new_r)); memset(&new_gc, 0, sizeof(new_gc)); @@ -878,9 +885,8 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi goto err; err = "invalid replicas entry: bad nr_required"; - if (!e->nr_required || - (e->nr_required > 1 && - e->nr_required >= e->nr_devs)) + if (e->nr_required > 1 && + e->nr_required >= e->nr_devs) goto err; err = "invalid replicas entry: invalid device"; -- cgit From ec350b90d73b7b8ead9f377c58efe04b12f6ba6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Sep 2019 16:17:46 -0400 Subject: bcachefs: Drop unused arg to bch2_open_buckets_stop_dev() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 5 ++--- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/ec.c | 8 ++------ 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e3e9383c94ee..333aa140af54 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -693,8 +693,7 @@ retry_blocking: } void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs, - enum bch_data_type data_type) + struct open_buckets *obs) { struct open_buckets ptrs = { .nr = 0 }; struct open_bucket *ob, *ob2; @@ -725,7 +724,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, 
wp->type); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); mutex_unlock(&wp->lock); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6d8ffb0cd06d..687f973e4b3a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -106,7 +106,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *, enum bch_data_type); + struct open_buckets *); void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a9759c2ed7ab..5b61e9cb1ac3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1173,12 +1173,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct ec_stripe_new *s = NULL; mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, - &h->blocks, - BCH_DATA_USER); - bch2_open_buckets_stop_dev(c, ca, - &h->parity, - BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, &h->blocks); + bch2_open_buckets_stop_dev(c, ca, &h->parity); if (!h->s) goto unlock; -- cgit From 0a426c323927d647f6c31d063ee2f1abbe53db80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 15:02:05 -0400 Subject: bcachefs: Handle bio_iov_iter_get_pages() returning unaligned bio If the user buffer isn't aligned to the filesystem block size, on a large enough IO - where it won't fit into a single bio - bio_iov_iter_get_pages() won't necessarily return a bio with the proper alignment. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 142eb0c3cbbc..2d7bab51b320 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2025,12 +2025,14 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = dio->iop.op.c; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = dio->iop.inode; struct bio *bio = &dio->iop.op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; + unsigned unaligned; loff_t offset; bool sync; long ret; @@ -2066,6 +2068,21 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (unlikely(ret < 0)) goto err; + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + ret = -EFAULT; + goto err; + } + /* gup might have faulted pages back in: */ ret = write_invalidate_inode_pages_range(mapping, offset, @@ -2105,8 +2122,8 @@ loop: ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res); - bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res); + bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) kfree(dio->iter.__iov); -- cgit From fdfab313b65080bc938b79998e61af7399e2ba58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2019 15:29:34 
-0400 Subject: bcachefs: Update path microoptimizations Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_update_leaf.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 6591da0a52b3..aa5882cc4852 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -211,7 +211,7 @@ static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter EBUG_ON(iter->l[b->c.level].b != b); EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); - if (!six_trylock_write(&b->c.lock)) + if (unlikely(!six_trylock_write(&b->c.lock))) __bch2_btree_node_lock_write(b, iter); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0b3eed506c2c..a0a59cd496a3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -44,7 +44,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, { bch2_btree_node_lock_write(b, iter); - if (btree_node_just_written(b) && + if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_btree_iter_reinit_node(iter, b); @@ -605,8 +605,9 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out_clear_replicas; } - trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); /* -- cgit From fb975d14b7737c4381e539677b2e2b99ca9f8d62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2019 16:30:15 -0400 Subject: bcachefs: Drop unnecessary rcu_read_lock() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ea775d91de67..eb38fa50e054 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -675,10 +675,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); retry: - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (unlikely(!b)) { /* * We must have the parent locked to call bch2_btree_node_fill(), @@ -879,10 +876,7 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, BUG_ON(!btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (b) return; -- cgit From fe9cdf61cc7cb791c40d076503d8910fc0727310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 17:48:25 -0400 Subject: bcachefs: Count iterators for reflink_p overwrites correctly In order to avoid trying to allocate too many btree iterators, bch2_extent_atomic_end() needs to count how many iterators are going to be needed for insertions and overwrites - but we weren't counting the iterators for deleting a reflink_v when the refcount goes to 0. 
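[A sketch of the accounting problem the reflink_p fix above addresses - illustrative only, the names and numbers are made up: each overwritten indirect extent normally costs one iterator, but if the overwrite drops its refcount to zero we also need iterators to delete the entry and free its pointers in the reflink btree, so the budget has to include that case up front.]

#include <stdio.h>

struct overwritten_extent {
        int refcount;   /* refcount of the indirect extent being overwritten */
        int nr_ptrs;    /* pointers that would need freeing if it dies */
};

/*
 * Count how many iterators a transaction will need for a set of overwrites,
 * stopping early once we exceed the allowed maximum.
 */
static int iters_needed(const struct overwritten_extent *e, int nr, int max)
{
        int i, iters = 0;

        for (i = 0; i < nr; i++) {
                iters += 1;                     /* the overwrite itself */
                if (e[i].refcount == 1)         /* refcount drops to 0: */
                        iters += e[i].nr_ptrs;  /* deleting the reflink entry */
                if (iters >= max)
                        break;                  /* split the transaction here */
        }
        return iters;
}

int main(void)
{
        struct overwritten_extent e[] = { { 3, 2 }, { 1, 2 }, { 1, 3 } };

        printf("iterators needed: %d\n", iters_needed(e, 3, 20));
        return 0;
}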
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 859b1e8206bd..35abcde4bca2 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -935,12 +935,13 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return ret; } -static int __bch2_extent_atomic_end(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) { int ret = 0; @@ -970,6 +971,20 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, break; *nr_iters += 1; + + if (overwrite && + k.k->type == KEY_TYPE_reflink_v) { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (le64_to_cpu(r.v->refcount) == 1) + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + } + + /* + * if we're going to be deleting an entry from + * the reflink btree, need more iters... + */ + if (*nr_iters >= max_iters) { struct bpos pos = bkey_start_pos(k.k); pos.offset += r_k.k->p.offset - idx; @@ -1004,8 +1019,8 @@ int bch2_extent_atomic_end(struct btree_iter *iter, *end = bpos_min(insert->k.p, b->key.k.p); - ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), - 0, end, &nr_iters, 10); + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), + 0, end, &nr_iters, 10, false); if (ret) return ret; @@ -1024,8 +1039,8 @@ int bch2_extent_atomic_end(struct btree_iter *iter, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); - ret = __bch2_extent_atomic_end(trans, k, offset, - end, &nr_iters, 20); + ret = count_iters_for_insert(trans, k, offset, + end, &nr_iters, 20, true); if (ret) return ret; -- cgit From fb472ac52811e8a50e035807b94e1c29ec1f3395 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2019 13:33:11 -0400 Subject: bcachefs: Convert a BUG_ON() to a warning We shouldn't ever be writing past i_size - but, apparently there's still a bug to track down. 
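[The invariant the WARN_ON conversion above enforces - never issuing writeback past i_size rounded up to the filesystem block size - is easy to restate in isolation. A sketch, not the kernel helpers:]

#include <stdio.h>

/* round_up() as used in the check, for power-of-two block sizes */
static unsigned long long round_up_pow2(unsigned long long v, unsigned long long bs)
{
        return (v + bs - 1) & ~(bs - 1);
}

/*
 * True if a write of 'len' bytes at 'offset' would go past the last block
 * that i_size allows us to touch.
 */
static int write_past_eof(unsigned long long offset, unsigned long long len,
                          unsigned long long i_size, unsigned long long block_size)
{
        return offset + len > round_up_pow2(i_size, block_size);
}

int main(void)
{
        /* i_size = 5000, 4096-byte blocks: writes may extend to 8192 */
        printf("%d\n", write_past_eof(4096, 4096, 5000, 4096));        /* 0: ok  */
        printf("%d\n", write_past_eof(8192,  512, 5000, 4096));        /* 1: bad */
        return 0;
}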
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2d7bab51b320..e3f1d1dd06ae 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -757,6 +757,9 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; + WARN_ON(page_offset(page) + offset + len > + round_up(i_size_read(&inode->v), block_bytes(c))); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -1442,8 +1445,8 @@ do_io: sectors << 9, offset << 9)); /* Check for writing past i_size: */ - BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + WARN_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); w->io->op.op.res.sectors += reserved_sectors; w->io->op.new_i_size = i_size; -- cgit From d55460bb099592ccec816afb5e662896ec5fffaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2019 15:26:14 -0400 Subject: bcachefs: Trivial cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e3f1d1dd06ae..42c5719155e6 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -830,11 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) goto out; } - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) <= isize) - len = PAGE_SIZE; - else - len = offset_in_page(isize); + len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); -- cgit From b43a0f60a61e8e0adea6b1b9adc9a97600fc2f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2019 16:19:52 -0400 Subject: bcachefs: Cleanup i_nlink handling Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 22 ++++------------------ fs/bcachefs/fs.h | 5 ----- fs/bcachefs/fsck.c | 13 ++----------- fs/bcachefs/inode.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0ba498505b07..b9a20bb19b58 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -131,9 +131,7 @@ void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_unpacked *bi, unsigned fields) { - set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED - ? 
0 - : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; @@ -552,12 +550,7 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_ctime = bch2_current_time(c); - - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; - else - bi->bi_nlink++; - + bch2_inode_nlink_inc(bi); return 0; } @@ -640,11 +633,7 @@ static int inode_update_for_unlink_fn(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_ctime = bch2_current_time(c); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; - + bch2_inode_nlink_dec(bi); return 0; } @@ -815,10 +804,7 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, BUG_ON(bi->bi_nlink && S_ISDIR(info->dst_inode->v.i_mode)); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; + bch2_inode_nlink_dec(bi); } if (inode == info->src_dir || diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6edf5dd803f0..04ac5b4129a4 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -109,11 +109,6 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 2 : 1; -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 50a7d8c1faba..162563b809fb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1116,9 +1116,7 @@ static int check_inode_nlink(struct bch_fs *c, struct nlink *link, bool *do_update) { - u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED - ? 0 - : u->bi_nlink + nlink_bias(u->bi_mode); + u32 i_nlink = bch2_inode_nlink_get(u); u32 real_i_nlink = link->count * nlink_bias(u->bi_mode) + link->dir_count; @@ -1197,14 +1195,7 @@ static int check_inode_nlink(struct bch_fs *c, u->bi_inum, i_nlink, real_i_nlink); set_i_nlink: if (i_nlink != real_i_nlink) { - if (real_i_nlink) { - u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); - u->bi_flags &= ~BCH_INODE_UNLINKED; - } else { - u->bi_nlink = 0; - u->bi_flags |= BCH_INODE_UNLINKED; - } - + bch2_inode_nlink_set(u, real_i_nlink); *do_update = true; } fsck_err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index af0c355f2f04..e88ec78071bd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -103,6 +103,49 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +/* i_nlink: */ + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; +} + +static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) +{ + BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} + +static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ + return bi->bi_flags & BCH_INODE_UNLINKED + ? 
0 + : bi->bi_nlink + nlink_bias(bi->bi_mode); +} + +static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + unsigned nlink) +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); + bi->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + bi->bi_nlink = 0; + bi->bi_flags |= BCH_INODE_UNLINKED; + } +} + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_inode_pack_test(void); #else -- cgit From ef9f95ba41b7685fc27ca73753bbfa4467555b79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2019 15:57:56 -0400 Subject: bcachefs: Improve error handling for for_each_btree_key_continue() Change it to match for_each_btree_key() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 14 +++++++------- fs/bcachefs/fsck.c | 12 ++++++------ fs/bcachefs/str_hash.h | 6 ++++-- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7f76db5bb8bc..9b5e3de68487 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -246,6 +246,11 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, : bch2_btree_iter_next(iter); } +static inline int bkey_err(struct bkey_s_c k) +{ + return PTR_ERR_OR_ZERO(k.k); +} + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ @@ -257,16 +262,11 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_next(_iter, _flags)).k)) -#define for_each_btree_key_continue(_iter, _flags, _k) \ +#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !IS_ERR_OR_NULL((_k).k); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ (_k) = __bch2_btree_iter_next(_iter, _flags)) -static inline int bkey_err(struct bkey_s_c k) -{ - return PTR_ERR_OR_ZERO(k.k); -} - /* new multiple iterator interface: */ int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 162563b809fb..b806284c0517 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -248,7 +248,7 @@ static int hash_check_duplicates(struct btree_trans *trans, iter = bch2_trans_copy_iter(trans, h->chain); BUG_ON(IS_ERR(iter)); - for_each_btree_key_continue(iter, 0, k2) { + for_each_btree_key_continue(iter, 0, k2, ret) { if (bkey_cmp(k2.k->p, k.k->p) >= 0) break; @@ -458,7 +458,7 @@ static int check_extents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -553,7 +553,7 @@ static int check_dirents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -707,7 +707,7 @@ static int check_xattrs(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -995,7 +995,7 @@ up: iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); retry: - for_each_btree_key_continue(iter, 0, k) { + 
for_each_btree_key_continue(iter, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1021,7 +1021,7 @@ retry: had_unreachable = true; } } - ret = bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_free(&trans, iter); if (ret) goto err; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index df3f19055d1e..31b278e71051 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -187,6 +187,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; iter = bch2_trans_copy_iter(trans, start); if (IS_ERR(iter)) @@ -194,7 +195,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_next_slot(iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_whiteout) break; @@ -206,7 +207,8 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } } - return bch2_trans_iter_free(trans, iter); + bch2_trans_iter_free(trans, iter); + return ret; } static __always_inline -- cgit From bbd8d2038b129437a6744190d8ca00c2597ee8fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 19:35:12 -0400 Subject: bcachefs: BTREE_ITER_SLOTS isn't a type of btree iter Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 66 ++++++++++++++++++++--------------------------- fs/bcachefs/btree_types.h | 15 +++++++---- fs/bcachefs/fs-io.c | 7 +++-- fs/bcachefs/io.c | 4 +-- 4 files changed, 43 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d65edc460b07..0439c5b07f61 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -473,7 +473,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, } BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS && + btree_iter_type(iter) == BTREE_ITER_KEYS && !bkey_whiteout(&iter->k) && bch2_btree_node_iter_end(&l->iter)); } @@ -1152,6 +1152,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (btree_node_type_is_extents(iter->btree_id) && type != BTREE_ITER_NODES)); + EBUG_ON(btree_iter_type(iter) != type); bch2_btree_trans_verify_locks(iter->trans); } @@ -1661,7 +1662,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); @@ -1675,7 +1676,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); iter->pos = btree_type_successor(iter->btree_id, iter->k.p); @@ -1830,7 +1831,7 @@ success: return 0; } -static int btree_trans_iter_alloc(struct btree_trans *trans) +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { unsigned idx = __ffs64(~trans->iters_linked); @@ -1840,7 +1841,7 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) if (trans->nr_iters == trans->size) { int ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) - return ret; + return ERR_PTR(ret); } idx = trans->nr_iters++; @@ -1850,7 +1851,7 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) got_slot: 
BUG_ON(trans->iters_linked & (1ULL << idx)); trans->iters_linked |= 1ULL << idx; - return idx; + return &trans->iters[idx]; } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, @@ -1858,37 +1859,29 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, unsigned flags, u64 iter_id) { struct btree_iter *iter; - int idx; BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - for (idx = 0; idx < trans->nr_iters; idx++) { - if (!(trans->iters_linked & (1ULL << idx))) - continue; - - iter = &trans->iters[idx]; + trans_for_each_iter(trans, iter) if (iter_id ? iter->id == iter_id : (iter->btree_id == btree_id && !bkey_cmp(iter->pos, pos))) goto found; - } - idx = -1; + + iter = NULL; found: - if (idx < 0) { - idx = btree_trans_iter_alloc(trans); - if (idx < 0) - return ERR_PTR(idx); + if (!iter) { + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; - iter = &trans->iters[idx]; iter->id = iter_id; bch2_btree_iter_init(trans, iter, btree_id, pos, flags); } else { - iter = &trans->iters[idx]; - - iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); - iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); if ((iter->flags & BTREE_ITER_INTENT) && !bch2_btree_iter_upgrade(iter, 1)) { @@ -1898,9 +1891,9 @@ found: } BUG_ON(iter->btree_id != btree_id); - BUG_ON(trans->iters_live & (1ULL << idx)); - trans->iters_live |= 1ULL << idx; - trans->iters_touched |= 1ULL << idx; + BUG_ON(trans->iters_live & (1ULL << iter->idx)); + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; BUG_ON(iter->btree_id != btree_id); BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); @@ -1950,29 +1943,26 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; - unsigned offset = offsetof(struct btree_iter, trans); - int i, idx; + int idx, i; - idx = btree_trans_iter_alloc(trans); - if (idx < 0) - return ERR_PTR(idx); + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; + + idx = iter->idx; + *iter = *src; + iter->idx = idx; trans->iters_live |= 1ULL << idx; trans->iters_touched |= 1ULL << idx; trans->iters_unlink_on_restart |= 1ULL << idx; - iter = &trans->iters[idx]; - - memcpy((void *) iter + offset, - (void *) src + offset, - sizeof(*iter) - offset); - for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(iter, i)) six_lock_increment(&iter->l[i].b->c.lock, __btree_lock_want(iter, i)); - return &trans->iters[idx]; + return iter; } static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 88e048fa0fba..299d1173df62 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -183,20 +183,20 @@ struct btree_node_iter { enum btree_iter_type { BTREE_ITER_KEYS, - BTREE_ITER_SLOTS, BTREE_ITER_NODES, }; #define BTREE_ITER_TYPE ((1 << 2) - 1) -#define BTREE_ITER_INTENT (1 << 2) -#define BTREE_ITER_PREFETCH (1 << 3) +#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_PREFETCH (1 << 4) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 4) -#define BTREE_ITER_ERROR (1 << 5) +#define BTREE_ITER_IS_EXTENTS (1 << 5) +#define 
BTREE_ITER_ERROR (1 << 6) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -241,6 +241,11 @@ struct btree_iter { u64 id; }; +static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_TYPE; +} + struct deferred_update { struct journal_preres res; struct journal_entry_pin journal; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 42c5719155e6..791e72ce1c29 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -447,10 +447,9 @@ static int bchfs_write_index_update(struct bch_write_op *wop) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, - BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { BKEY_PADDED(k) tmp; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b0bff54a18e2..07fe6b5cd517 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1417,8 +1417,8 @@ retry: bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(iter); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); if (IS_ERR_OR_NULL(k.k)) goto out; -- cgit From 877dfb348d90abc3d7464ee37240f21d9bdff630 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2019 19:09:08 -0400 Subject: bcachefs: Fix for partial buffered writes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 791e72ce1c29..18356cbe0794 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -756,8 +756,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; - WARN_ON(page_offset(page) + offset + len > - round_up(i_size_read(&inode->v), block_bytes(c))); + WARN_ON((u64) page_offset(page) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; @@ -1704,14 +1704,6 @@ retry_reservation: if (!copied) goto out; - nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); - inode->ei_last_dirtied = (unsigned long) current; - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - if (copied < len && ((offset + copied) & (PAGE_SIZE - 1))) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; @@ -1722,6 +1714,11 @@ retry_reservation: } } + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + while (set_dirty < copied) { struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); @@ -1737,6 +1734,9 @@ retry_reservation: set_dirty += pg_len; } + + nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + inode->ei_last_dirtied = (unsigned long) current; out: for (i = nr_pages_copied; i < nr_pages; i++) { unlock_page(pages[i]); -- cgit From a7199432c3cbcd42141cfd5c047bf8828c2390d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 18:49:16 -0400 Subject: bcachefs: Kill deferred btree updates Will be replaced by cached btree iterators Signed-off-by: Kent Overstreet --- 
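After this conversion every update is queued directly against a btree iterator; the deferred update machinery and the BTREE_INSERT_ENTRY()/BTREE_INSERT_DEFERRED() wrappers go away. A minimal sketch of the resulting caller pattern, assuming the helpers shown in the diff below (error handling and retry on -EINTR omitted):

static int example_insert_one_key(struct bch_fs *c, struct bkey_i *k)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(&k->k),
				   BTREE_ITER_INTENT);

	/* was: bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); */
	bch2_trans_update(&trans, iter, k);

	ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);

	bch2_trans_exit(&trans);
	return ret;
}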
fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc_background.c | 4 +- fs/bcachefs/btree_types.h | 20 ----- fs/bcachefs/btree_update.h | 43 ++-------- fs/bcachefs/btree_update_leaf.c | 178 ++++++---------------------------------- fs/bcachefs/buckets.c | 8 +- fs/bcachefs/dirent.c | 9 +- fs/bcachefs/ec.c | 6 +- fs/bcachefs/fs-io.c | 78 +++++------------- fs/bcachefs/fs.c | 42 +++------- fs/bcachefs/fs.h | 1 - fs/bcachefs/fsck.c | 7 +- fs/bcachefs/inode.c | 6 +- fs/bcachefs/io.c | 5 +- fs/bcachefs/migrate.c | 5 +- fs/bcachefs/move.c | 3 +- fs/bcachefs/opts.h | 8 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/str_hash.h | 4 +- fs/bcachefs/tests.c | 10 +-- 22 files changed, 99 insertions(+), 348 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 1c3343252129..5a4263806610 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -378,7 +378,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); + bch2_trans_update(trans, iter, &new->k_i); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 85795b580892..81418d534d70 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -311,7 +311,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, new_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -899,7 +899,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); /* * XXX: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 299d1173df62..c128ff393f0c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -246,29 +246,9 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } -struct deferred_update { - struct journal_preres res; - struct journal_entry_pin journal; - - spinlock_t lock; - unsigned dirty:1; - - u8 allocated_u64s; - enum btree_id btree_id; - - /* must be last: */ - struct bkey_i k; -}; - struct btree_insert_entry { struct bkey_i *k; - - union { struct btree_iter *iter; - struct deferred_update *d; - }; - - bool deferred; }; #define BTREE_ITER_MAX 64 diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 36e34b3d9213..0e985c1f0100 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,24 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -void bch2_deferred_update_free(struct bch_fs *, - struct deferred_update *); -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); - -#define BTREE_INSERT_ENTRY(_iter, _k) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - }) - -#define BTREE_INSERT_DEFERRED(_d, _k) \ - ((struct btree_insert_entry) { \ - .k = (_k), \ - .d = (_d), \ - .deferred = true, \ - }) - enum { __BTREE_INSERT_ATOMIC, __BTREE_INSERT_NOUNLOCK, @@ -120,11 +102,14 @@ int bch2_trans_commit(struct btree_trans *, u64 *, unsigned); static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) + struct btree_iter *iter, + struct 
bkey_i *k) { EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); - trans->updates[trans->nr_updates++] = entry; + trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { + .iter = iter, .k = k + }; } #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ @@ -145,23 +130,9 @@ static inline void bch2_trans_update(struct btree_trans *trans, _ret; \ }) -#define __trans_next_update(_trans, _i, _filter) \ -({ \ - while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ - (_i)++; \ - \ - (_i) < (_trans)->updates + (_trans->nr_updates); \ -}) - -#define __trans_for_each_update(_trans, _i, _filter) \ +#define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ - __trans_next_update(_trans, _i, _filter); \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update(trans, i) \ - __trans_for_each_update(trans, i, true) - -#define trans_for_each_update_iter(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a0a59cd496a3..2e9271759447 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -28,8 +28,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, ? trans->updates + trans->updates_sorted[sorted_idx - 1] : NULL; - return !i->deferred && - prev && + return prev && i->iter->l[0].b == prev->iter->l[0].b; } @@ -73,13 +72,6 @@ static void btree_trans_lock_write(struct btree_trans *trans, bool lock) } } -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) -{ - return cmp_int(l.deferred, r.deferred) ?: - btree_iter_cmp(l.iter, r.iter); -} - static inline void btree_trans_sort_updates(struct btree_trans *trans) { struct btree_insert_entry *l, *r; @@ -89,7 +81,7 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) for (pos = 0; pos < nr; pos++) { r = trans->updates + trans->updates_sorted[pos]; - if (btree_trans_cmp(*l, *r) <= 0) + if (btree_iter_cmp(l->iter, r->iter) <= 0) break; } @@ -312,143 +304,23 @@ static void btree_insert_key_leaf(struct btree_trans *trans, trace_btree_insert_key(c, b, insert->k); } -/* Deferred btree updates: */ - -static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct deferred_update *d = - container_of(pin, struct deferred_update, journal); - struct journal_preres res = { 0 }; - u64 tmp[32]; - struct bkey_i *k = (void *) tmp; - int ret; - - if (d->allocated_u64s > ARRAY_SIZE(tmp)) { - k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); - - BUG_ON(!k); /* XXX */ - } - - spin_lock(&d->lock); - if (d->dirty) { - BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); - - swap(res, d->res); - - BUG_ON(d->k.k.u64s > d->allocated_u64s); - - bkey_copy(k, &d->k); - d->dirty = false; - spin_unlock(&d->lock); - - ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED); - bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); - - spin_lock(&d->lock); - } - - if (!d->dirty) - bch2_journal_pin_drop(j, &d->journal); - spin_unlock(&d->lock); - - bch2_journal_preres_put(j, &res); - if (k != (void *) tmp) - kfree(k); -} - -static void btree_insert_key_deferred(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs 
*c = trans->c; - struct journal *j = &c->journal; - struct deferred_update *d = insert->d; - int difference; - - BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); - BUG_ON(insert->k->u64s > d->allocated_u64s); - - __btree_journal_key(trans, d->btree_id, insert->k); - - spin_lock(&d->lock); - BUG_ON(jset_u64s(insert->k->u64s) > - trans->journal_preres.u64s); - - difference = jset_u64s(insert->k->u64s) - d->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - d->res.u64s += difference; - } - - bkey_copy(&d->k, insert->k); - d->dirty = true; - - bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, - deferred_update_flush); - spin_unlock(&d->lock); -} - -void bch2_deferred_update_free(struct bch_fs *c, - struct deferred_update *d) -{ - deferred_update_flush(&c->journal, &d->journal, 0); - - BUG_ON(journal_pin_active(&d->journal)); - - bch2_journal_pin_flush(&c->journal, &d->journal); - kfree(d); -} - -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *c, - enum btree_id btree_id, - unsigned u64s) -{ - struct deferred_update *d; - - BUG_ON(u64s > U8_MAX); - - d = kmalloc(offsetof(struct deferred_update, k) + - u64s * sizeof(u64), GFP_NOFS); - BUG_ON(!d); - - memset(d, 0, offsetof(struct deferred_update, k)); - - spin_lock_init(&d->lock); - d->allocated_u64s = u64s; - d->btree_id = btree_id; - - return d; -} - /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - enum btree_id btree_id = !i->deferred - ? i->iter->btree_id - : i->d->btree_id; - - if (!i->deferred) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); - } + + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } static int bch2_trans_journal_preres_get(struct btree_trans *trans) @@ -459,7 +331,7 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) int ret; trans_for_each_update(trans, i) - if (i->deferred) + if (0) u64s += jset_u64s(i->k->k.u64s); if (!u64s) @@ -551,10 +423,7 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_insert_entry *insert) { - if (likely(!insert->deferred)) - btree_insert_key_leaf(trans, insert); - else - btree_insert_key_deferred(trans, insert); + btree_insert_key_leaf(trans, insert); } static inline bool update_triggers_transactional(struct btree_trans *trans, @@ -570,7 +439,6 @@ static inline bool update_has_triggers(struct btree_trans *trans, struct btree_insert_entry *i) { return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - !i->deferred && btree_node_type_needs_gc(i->iter->btree_id); } @@ -588,14 +456,14 @@ static inline int do_btree_insert_at(struct btree_trans *trans, : 0; int ret; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) 
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); /* * note: running triggers will append more updates to the list of * updates as we're walking it: */ - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (update_has_triggers(trans, i) && update_triggers_transactional(trans, i)) { ret = bch2_trans_mark_update(trans, i->iter, i->k); @@ -633,7 +501,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (ret) goto out; - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { if (!btree_node_type_needs_gc(i->iter->btree_id)) continue; @@ -673,7 +541,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (update_has_triggers(trans, i) && !update_triggers_transactional(trans, i)) bch2_mark_update(trans, i, &fs_usage->u, mark_flags); @@ -687,7 +555,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && unlikely(c->gc_pos.phase)) - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i, NULL, mark_flags| @@ -772,7 +640,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); if (ret) return ret; @@ -842,7 +710,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, unsigned iter; int ret; - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { if (!bch2_btree_iter_upgrade(i->iter, 1)) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; @@ -868,7 +736,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, trans->nounlock = false; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) bch2_btree_iter_downgrade(i->iter); err: /* make sure we didn't drop or screw up locks: */ @@ -995,7 +863,7 @@ retry: iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + bch2_trans_update(&trans, iter, k); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); if (ret == -EINTR) @@ -1045,7 +913,7 @@ retry: break; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + bch2_trans_update(trans, iter, &delete); ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); @@ -1072,7 +940,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + bch2_trans_update(trans, iter, &k); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 637a9e909f82..9c97a1522d9d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1316,7 +1316,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, bch_err(c, "disk usage increased more than %llu sectors reserved", disk_res_sectors); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { struct btree_iter *iter = i->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; @@ -1358,7 +1358,7 @@ static int trans_get_key(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - 
trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter->btree_id == btree_id && (btree_node_type_is_extents(btree_id) ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && @@ -1397,13 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter == iter) { i->k = new_k; return new_k; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); + bch2_trans_update(trans, iter, new_k); return new_k; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1442dacef0de..38dd96808e90 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -255,9 +255,8 @@ int bch2_dirent_rename(struct btree_trans *trans, * new_dst at the src position: */ new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(src_iter, - &new_dst->k_i)); + bch2_trans_update(trans, src_iter, + &new_dst->k_i); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -280,8 +279,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); + bch2_trans_update(trans, src_iter, &new_src->k_i); + bch2_trans_update(trans, dst_iter, &new_dst->k_i); return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5b61e9cb1ac3..155e7c9bd89f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -738,7 +738,7 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); + bch2_trans_update(&trans, iter, &stripe->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -819,7 +819,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ptr, idx); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k)); + bch2_trans_update(&trans, iter, &tmp.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -1231,7 +1231,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + bch2_trans_update(trans, iter, &new_key->k_i); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 18356cbe0794..da4976344d49 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -324,69 +324,36 @@ int bch2_extent_update(struct btree_trans *trans, if (!may_allocate && allocating) return -ENOSPC; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); + bch2_trans_update(trans, extent_iter, k); new_i_size = min(k->k.p.offset << 9, new_i_size); /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - if (c->opts.new_inode_updates) { - bch2_trans_unlock(trans); - mutex_lock(&inode->ei_update_lock); - - if (!bch2_trans_relock(trans)) { - mutex_unlock(&inode->ei_update_lock); - return -EINTR; - } - - inode_locked = true; - - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, - BTREE_ID_INODES, 64); - - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; - - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } - - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - 
BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p.inode.k_i)); - } else { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); - - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; + ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; } + + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i); } ret = bch2_trans_commit(trans, disk_res, @@ -2793,9 +2760,8 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, ©.k)); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(del ?: src, &delete)); + bch2_trans_update(&trans, dst, ©.k); + bch2_trans_update(&trans, del ?: src, &delete); if (copy.k.k.size == k.k->size) { /* diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b9a20bb19b58..166d94e5e59d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -154,30 +154,22 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, inode_set_fn set, void *p) { - struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_inode_buf *inode_p; int ret; lockdep_assert_held(&inode->ei_update_lock); - if (c->opts.new_inode_updates) { - /* XXX: Don't do this with btree locks held */ - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode->v.i_ino, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - /* The btree node lock is our lock on the inode: */ - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - } + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; *inode_u = inode->ei_inode; @@ -192,14 +184,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - - if (!inode->ei_inode_update) - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); - else - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p->inode.k_i)); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); return 0; } @@ -1482,7 +1467,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_inode_update = NULL; 
inode->ei_journal_seq = 0; return &inode->v; @@ -1540,10 +1524,6 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (inode->ei_inode_update) - bch2_deferred_update_free(c, inode->ei_inode_update); - inode->ei_inode_update = NULL; - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 04ac5b4129a4..c3ee9c17064f 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -34,7 +34,6 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; - struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b806284c0517..c5540536f47c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -393,7 +393,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); + bch2_trans_update(trans, iter, &d->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -663,8 +663,7 @@ retry: bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &n->k_i)); + bch2_trans_update(&trans, iter, &n->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -1293,7 +1292,7 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + bch2_trans_update(trans, iter, &p.inode.k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0fb08a396d62..f192536558c1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -345,8 +345,7 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); return 0; } } @@ -435,8 +434,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &delete.k_i)); + bch2_trans_update(&trans, iter, &delete.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 07fe6b5cd517..690f9b2dbb98 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -290,8 +290,7 @@ retry: if (ret) break; - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &split.k)); + bch2_trans_update(&trans, iter, &split.k); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| @@ -1445,7 +1444,7 @@ retry: if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); + bch2_trans_update(&trans, iter, &new.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index dc3b03d6e627..de8522f754e2 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -72,10 +72,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); - /* 
XXX not sketchy at all */ - iter->pos = bkey_start_pos(&tmp.key.k); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key)); + bch2_trans_update(&trans, iter, &tmp.key); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8855dd19f7f2..2f0bdfbfcd61 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -147,8 +147,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_update(&trans, iter, insert); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d44bfe90c0d5..d9325d4bc024 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -295,13 +295,7 @@ enum opt_type { OPT_UINT(0, BCH_REPLICAS_MAX), \ NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(new_inode_updates, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Enable new btree write-cache for inode updates") - + "to have already been replicated n times") struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index f0da0fac09bf..0fa6f33c049b 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i)); + bch2_trans_update(&trans, iter, &new_quota.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 98d9a1432e50..2e880955a07c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -301,7 +301,7 @@ retry: bch2_cut_front(split_iter->pos, split); bch2_cut_back(atomic_end, &split->k); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_trans_update(&trans, split_iter, split); bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index dcca9c1d0f47..c08b57634abd 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -120,7 +120,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->v.refcount = 0; memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + bch2_trans_update(trans, reflink_iter, &r_v->k_i); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) @@ -131,7 +131,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); + bch2_trans_update(trans, extent_iter, &r_p->k_i); err: if (!IS_ERR(reflink_iter)) { c->reflink_hint = reflink_iter->pos.offset; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 31b278e71051..886f1bc8aa14 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -267,7 +267,7 @@ not_found: } insert->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_update(trans, iter, insert); bch2_trans_iter_free_on_commit(trans, iter); } @@ -295,7 
+295,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); + bch2_trans_update(trans, iter, delete); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 92843bd09b04..a2092bb99095 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } @@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); + bch2_trans_update(&trans, iter, &insert.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &u.k_i)); + bch2_trans_update(&trans, iter, &u.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } -- cgit From 64bc00115335450c4178fea04c5b664cf73a9729 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2019 22:21:39 -0400 Subject: bcachefs: Rework btree iterator lifetimes The btree_trans struct needs to memoize/cache btree iterators, so that on transaction restart we don't have to completely redo btree lookups, and so that we can do them all at once in the correct order when the transaction had to restart to avoid a deadlock. This switches the btree iterator lookups to work based on iterator position, instead of trying to match them up based on the stack trace. 
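A condensed sketch of the position-based matching rule, using the names from the diff below; only the selection loop is shown, not the live/KEEP_UNTIL_COMMIT handling that decides whether the match is reused in place or copied:

	struct btree_iter *iter, *best = NULL;

	/*
	 * Instead of keying cached iterators on the caller's stack trace,
	 * scan the transaction's iterators and keep the one of the right
	 * type and btree whose current position is closest to the
	 * requested pos (bpos_diff() measures that distance).
	 */
	trans_for_each_iter(trans, iter) {
		if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE) ||
		    iter->btree_id != btree_id)
			continue;

		if (!best ||
		    bkey_cmp(bpos_diff(iter->pos, pos),
			     bpos_diff(best->pos, pos)) < 0)
			best = iter;
	}
	/* a live or pinned best match gets copied; otherwise it's reused */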
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 202 +++++++++++++++++++++++----------------- fs/bcachefs/btree_iter.h | 37 +++----- fs/bcachefs/btree_types.h | 9 +- fs/bcachefs/btree_update.h | 2 + fs/bcachefs/btree_update_leaf.c | 7 +- fs/bcachefs/buckets.c | 6 +- fs/bcachefs/fs-io.c | 4 +- fs/bcachefs/io.c | 19 ++-- fs/bcachefs/reflink.c | 8 +- fs/bcachefs/str_hash.h | 25 ++--- 10 files changed, 173 insertions(+), 146 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0439c5b07f61..8e9164aee409 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1730,15 +1730,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ -int bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) -{ - int ret = btree_iter_err(iter); - - trans->iters_live &= ~(1ULL << iter->idx); - return ret; -} - static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { @@ -1746,26 +1737,27 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); trans->iters_touched &= ~(1ULL << idx); - trans->iters_unlink_on_restart &= ~(1ULL << idx); - trans->iters_unlink_on_commit &= ~(1ULL << idx); } -int bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) { int ret = btree_iter_err(iter); - __bch2_trans_iter_free(trans, iter->idx); + if (!(trans->iters_touched & (1ULL << iter->idx)) && + !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) + __bch2_trans_iter_free(trans, iter->idx); + + trans->iters_live &= ~(1ULL << iter->idx); return ret; } -int bch2_trans_iter_free_on_commit(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) { - int ret = btree_iter_err(iter); + trans->iters_touched &= ~(1ULL << iter->idx); - trans->iters_unlink_on_commit |= 1ULL << iter->idx; - return ret; + return bch2_trans_iter_put(trans, iter); } static int bch2_trans_realloc_iters(struct btree_trans *trans, @@ -1839,7 +1831,25 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret = bch2_trans_realloc_iters(trans, trans->size * 2); + int ret; + + if (trans->nr_iters >= BTREE_ITER_MAX) { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) { + pr_err("iter: btree %s pos %llu:%llu%s%s%s", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : ""); + } + + panic("trans iter oveflow\n"); + } + + ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ERR_PTR(ret); } @@ -1854,60 +1864,94 @@ got_slot: return &trans->iters[idx]; } +static inline void btree_iter_copy(struct btree_iter *dst, + struct btree_iter *src) +{ + unsigned i, idx = dst->idx; + + *dst = *src; + dst->idx = idx; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); +} + +static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +{ + if (bkey_cmp(l, r) > 0) + swap(l, r); + + return POS(r.inode - l.inode, r.offset - l.offset); +} + static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, unsigned btree_id, struct bpos pos, - unsigned flags, u64 iter_id) + unsigned flags) { - struct btree_iter *iter; + struct btree_iter *iter, *best = NULL; BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - trans_for_each_iter(trans, iter) - if (iter_id - ? iter->id == iter_id - : (iter->btree_id == btree_id && - !bkey_cmp(iter->pos, pos))) - goto found; + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + continue; - iter = NULL; -found: - if (!iter) { + if (iter->btree_id != btree_id) + continue; + + if (best && + bkey_cmp(bpos_diff(best->pos, pos), + bpos_diff(iter->pos, pos)) < 0) + continue; + + best = iter; + } + + if (!best) { iter = btree_trans_iter_alloc(trans); if (IS_ERR(iter)) return iter; - iter->id = iter_id; - bch2_btree_iter_init(trans, iter, btree_id, pos, flags); - } else { - iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); - iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + } else if ((trans->iters_live & (1ULL << best->idx)) || + (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; - if ((iter->flags & BTREE_ITER_INTENT) && - !bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - return ERR_PTR(-EINTR); - } + btree_iter_copy(iter, best); + } else { + iter = best; } + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + + if (iter->flags & BTREE_ITER_INTENT) + bch2_btree_iter_upgrade(iter, 1); + else + bch2_btree_iter_downgrade(iter); + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); BUG_ON(trans->iters_live & (1ULL << iter->idx)); + trans->iters_live |= 1ULL << iter->idx; trans->iters_touched |= 1ULL << iter->idx; - BUG_ON(iter->btree_id != btree_id); - BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); - return iter; } -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags, - u64 iter_id) +struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags) { struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); + __btree_trans_get_iter(trans, btree_id, pos, flags); if (!IS_ERR(iter)) bch2_btree_iter_set_pos(iter, pos); @@ -1923,7 +1967,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, { struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_NODES, 0); + 
flags|BTREE_ITER_NODES); unsigned i; BUG_ON(IS_ERR(iter)); @@ -1943,24 +1987,20 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; - int idx, i; iter = btree_trans_iter_alloc(trans); if (IS_ERR(iter)) return iter; - idx = iter->idx; - *iter = *src; - iter->idx = idx; + btree_iter_copy(iter, src); - trans->iters_live |= 1ULL << idx; - trans->iters_touched |= 1ULL << idx; - trans->iters_unlink_on_restart |= 1ULL << idx; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(iter, i)) - six_lock_increment(&iter->l[i].b->c.lock, - __btree_lock_want(iter, i)); + trans->iters_live |= 1ULL << iter->idx; + /* + * Don't mark it as touched, we don't need to preserve this iter since + * it's cheap to copy it again: + */ + trans->iters_touched &= ~(1ULL << iter->idx); + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; return iter; } @@ -2001,10 +2041,11 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) +inline void bch2_trans_unlink_iters(struct btree_trans *trans) { - iters &= trans->iters_linked; - iters &= ~trans->iters_live; + u64 iters = trans->iters_linked & + ~trans->iters_touched & + ~trans->iters_live; while (iters) { unsigned idx = __ffs64(iters); @@ -2014,33 +2055,24 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) } } -void bch2_trans_begin(struct btree_trans *trans) +void bch2_trans_reset(struct btree_trans *trans, unsigned flags) { - u64 iters_to_unlink; + struct btree_iter *iter; - /* - * On transaction restart, the transaction isn't required to allocate - * all the same iterators it on the last iteration: - * - * Unlink any iterators it didn't use this iteration, assuming it got - * further (allocated an iter with a higher idx) than where the iter - * was originally allocated: - */ - iters_to_unlink = ~trans->iters_live & - ((1ULL << fls64(trans->iters_live)) - 1); + trans_for_each_iter(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - iters_to_unlink |= trans->iters_unlink_on_restart; - iters_to_unlink |= trans->iters_unlink_on_commit; + bch2_trans_unlink_iters(trans); - trans->iters_live = 0; + if (flags & TRANS_RESET_ITERS) + trans->iters_live = 0; - bch2_trans_unlink_iters(trans, iters_to_unlink); + trans->iters_touched &= trans->iters_live; - trans->iters_touched = 0; - trans->iters_unlink_on_restart = 0; - trans->iters_unlink_on_commit = 0; trans->nr_updates = 0; - trans->mem_top = 0; + + if (flags & TRANS_RESET_MEM) + trans->mem_top = 0; bch2_btree_iter_traverse_all(trans); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9b5e3de68487..6f81be26e674 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -271,43 +271,30 @@ static inline int bkey_err(struct bkey_s_c k) int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); -void bch2_trans_unlink_iters(struct btree_trans *, u64); +void bch2_trans_unlink_iters(struct btree_trans *); -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned, u64); +struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned); struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *); +struct btree_iter 
*bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); -static __always_inline u64 __btree_iter_id(void) -{ - u64 ret = 0; +#define TRANS_RESET_ITERS (1 << 0) +#define TRANS_RESET_MEM (1 << 1) - ret <<= 32; - ret |= _RET_IP_ & U32_MAX; - ret <<= 32; - ret |= _THIS_IP_ & U32_MAX; - return ret; -} +void bch2_trans_reset(struct btree_trans *, unsigned); -static __always_inline struct btree_iter * -bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned flags) +static inline void bch2_trans_begin(struct btree_trans *trans) { - return __bch2_trans_get_iter(trans, btree_id, pos, flags, - __btree_iter_id()); + return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); } -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); - -void bch2_trans_begin(struct btree_trans *); - static inline void bch2_trans_begin_updates(struct btree_trans *trans) { - trans->nr_updates = 0; + return bch2_trans_reset(trans, TRANS_RESET_MEM); } void *bch2_trans_kmalloc(struct btree_trans *, size_t); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c128ff393f0c..7d3c6670e30f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -191,12 +191,13 @@ enum btree_iter_type { #define BTREE_ITER_SLOTS (1 << 2) #define BTREE_ITER_INTENT (1 << 3) #define BTREE_ITER_PREFETCH (1 << 4) +#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_IS_EXTENTS (1 << 6) +#define BTREE_ITER_ERROR (1 << 7) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -237,8 +238,6 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. 
*/ struct bkey k; - - u64 id; }; static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) @@ -261,8 +260,6 @@ struct btree_trans { u64 iters_linked; u64 iters_live; u64 iters_touched; - u64 iters_unlink_on_restart; - u64 iters_unlink_on_commit; u8 nr_iters; u8 nr_updates; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 0e985c1f0100..49f4d24d56ff 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -107,6 +107,8 @@ static inline void bch2_trans_update(struct btree_trans *trans, { EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { .iter = iter, .k = k }; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2e9271759447..05b9c0d2e893 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -752,6 +752,7 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; + struct btree_iter *iter; unsigned orig_nr_updates = trans->nr_updates; unsigned orig_mem_top = trans->mem_top; int ret = 0; @@ -814,9 +815,11 @@ out_noupdates: BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + trans_for_each_iter(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + if (!ret) { - bch2_trans_unlink_iters(trans, ~trans->iters_touched| - trans->iters_unlink_on_commit); + bch2_trans_unlink_iters(trans); trans->iters_touched = 0; } trans->nr_updates = 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9c97a1522d9d..5e0e699c679a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1369,13 +1369,11 @@ static int trans_get_key(struct btree_trans *trans, return 1; } - *iter = __bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0); + *iter = bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if (IS_ERR(*iter)) return PTR_ERR(*iter); - bch2_trans_iter_free_on_commit(trans, *iter); - *k = bch2_btree_iter_peek_slot(*iter); ret = bkey_err(*k); if (ret) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index da4976344d49..13e7b7842367 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -282,7 +282,7 @@ static int sum_sector_overwrites(struct btree_trans *trans, old = bch2_btree_iter_next_slot(iter); } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_put(trans, iter); return 0; } @@ -2786,7 +2786,7 @@ reassemble: bch2_disk_reservation_put(c, &disk_res); bkey_err: if (del) - bch2_trans_iter_free(&trans, del); + bch2_trans_iter_put(&trans, del); del = NULL; if (!ret) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 690f9b2dbb98..b893db7f7dcc 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -355,6 +355,7 @@ static void __bch2_write_index(struct bch_write_op *op) u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); + BUG_ON(ret == -EINTR); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); @@ -1337,6 +1338,8 @@ retry: bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } + if (ret == -EINTR) + goto retry; /* * If we get here, it better have been because there was an error * reading a btree node @@ -1610,9 +1613,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + *offset_into_extent; - iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, - POS(0, 
reflink_offset), - BTREE_ITER_SLOTS, 1); + iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret; @@ -1888,8 +1891,6 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) BCH_READ_USER_MAPPED; int ret; - bch2_trans_init(&trans, c, 0, 0); - BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); BUG_ON(flags & BCH_READ_IN_RETRY); @@ -1897,10 +1898,13 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS); - while (1) { BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; @@ -1955,6 +1959,9 @@ out: bch2_trans_exit(&trans); return; err: + if (ret == -EINTR) + goto retry; + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); goto out; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c08b57634abd..ad526d280a14 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -190,10 +190,10 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, - BTREE_ITER_INTENT, 1); - dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, - BTREE_ITER_INTENT, 2); + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT); + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT); while (1) { bch2_trans_begin_updates(&trans); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 886f1bc8aa14..31e55acbbead 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -202,12 +202,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) { - bch2_trans_iter_free_on_commit(trans, iter); - return 1; + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + ret = 1; + break; } } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_put(trans, iter); return ret; } @@ -247,11 +248,14 @@ int bch2_hash_set(struct btree_trans *trans, goto not_found; } + if (!ret) + ret = -ENOSPC; +out: if (slot) - bch2_trans_iter_free(trans, slot); - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_put(trans, slot); + bch2_trans_iter_put(trans, iter); - return ret ?: -ENOSPC; + return ret; found: found = true; not_found: @@ -261,17 +265,14 @@ not_found: } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { - if (!found && slot) { - bch2_trans_iter_free(trans, iter); - iter = slot; - } + if (!found && slot) + swap(iter, slot); insert->k.p = iter->pos; bch2_trans_update(trans, iter, insert); - bch2_trans_iter_free_on_commit(trans, iter); } - return ret; + goto out; } static __always_inline -- cgit From 21ad9ddee8d1ffc1e12ab7e6b2ec18f0897fe567 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2019 16:29:17 -0400 Subject: bcachefs: Fix counting iterators for reflink pointers Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 35abcde4bca2..db32d9eaa3dc 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -952,7 
+952,7 @@ static int count_iters_for_insert(struct btree_trans *trans, if (*nr_iters >= max_iters) { *end = bpos_min(*end, k.k->p); - return 0; + ret = 1; } break; @@ -973,11 +973,11 @@ static int count_iters_for_insert(struct btree_trans *trans, *nr_iters += 1; if (overwrite && - k.k->type == KEY_TYPE_reflink_v) { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + r_k.k->type == KEY_TYPE_reflink_v) { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(r_k); if (le64_to_cpu(r.v->refcount) == 1) - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + *nr_iters += bch2_bkey_nr_alloc_ptrs(r_k); } /* @@ -990,6 +990,7 @@ static int count_iters_for_insert(struct btree_trans *trans, pos.offset += r_k.k->p.offset - idx; *end = bpos_min(*end, pos); + ret = 1; break; } } @@ -1002,6 +1003,8 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret; } +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + int bch2_extent_atomic_end(struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) @@ -1010,22 +1013,20 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; - unsigned nr_iters = - bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); - int ret = 0; + unsigned nr_iters = 0; + int ret; BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); *end = bpos_min(insert->k.p, b->key.k.p); - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), - 0, end, &nr_iters, 10, false); - if (ret) + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2, false); + if (ret < 0) return ret; - while (nr_iters < 20 && - (_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); @@ -1039,18 +1040,15 @@ int bch2_extent_atomic_end(struct btree_iter *iter, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); - ret = count_iters_for_insert(trans, k, offset, - end, &nr_iters, 20, true); + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX, true); if (ret) - return ret; - - if (nr_iters >= 20) break; bch2_btree_node_iter_advance(&node_iter, b); } - return 0; + return ret < 0 ? ret : 0; } int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -- cgit From 6988e85be525b874745824622bae4209c265dc5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2019 23:11:41 -0400 Subject: bcachefs: Trust inode in btree over bch_inode_info This is the start of some refactoring work to make less code depend on the linux VFS - here the inode cache - to make e.g. the fuse port easier. 
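In rough terms, the lookup this patch switches to (a sketch assembled from the fs.c hunk below, with error handling abbreviated) reads the unpacked inode back out of the inodes btree inside the transaction, instead of trusting the copy cached in bch_inode_info:

    iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
                               POS(inode->v.i_ino, 0),
                               BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
    if (IS_ERR(iter))
            return PTR_ERR(iter);

    k = bch2_btree_iter_peek_slot(iter);
    ret = bkey_err(k);
    if (ret)
            return ret;

    /* anything other than an inode key here means something is badly wrong */
    if (k.k->type != KEY_TYPE_inode)
            return -EIO;

    ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);

This is the same pattern that a following patch in this series factors out into bch2_inode_peek().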
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d94e5e59d..0a83d5f61a6b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -156,9 +156,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, { struct btree_iter *iter = NULL; struct bkey_inode_buf *inode_p; - int ret; - - lockdep_assert_held(&inode->ei_update_lock); + struct bkey_s_c k; + int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode->v.i_ino, 0), @@ -166,12 +165,17 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, if (IS_ERR(iter)) return PTR_ERR(iter); - /* The btree node lock is our lock on the inode: */ - ret = bch2_btree_iter_traverse(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); if (ret) return ret; - *inode_u = inode->ei_inode; + if (k.k->type != KEY_TYPE_inode) + return -EIO; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); + if (ret) + return ret; if (set) { ret = set(inode, inode_u, p); @@ -185,7 +189,6 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, bch2_inode_pack(inode_p, inode_u); bch2_trans_update(trans, iter, &inode_p->inode.k_i); - return 0; } -- cgit From 8de819f83446dcdedae572ccc7449e3fa90b2c20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2019 18:51:10 -0400 Subject: bcachefs: Fix __bch2_buffered_write() returning -ENOMEM When grab_cache_page_write_begin() fails but we did pin some pages, we shouldn't return -ENOMEM, we should do a partial write. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 13e7b7842367..be121b755fc7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1606,8 +1606,13 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, pages[i] = grab_cache_page_write_begin(mapping, index + i); if (!pages[i]) { nr_pages = i; - ret = -ENOMEM; - goto out; + if (!i) { + ret = -ENOMEM; + goto out; + } + len = min_t(unsigned, len, + nr_pages * PAGE_SIZE - offset); + break; } } -- cgit From ab9ff73322aad5cf6ea774047e47ef724077399b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2019 00:29:37 -0400 Subject: bcachefs: Fix an error path It's possible to get -EIO in __btree_iter_traverse_all() after looping, with orig_iter NULL. 
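A minimal sketch of the guarded error path (mirroring the btree_iter.c hunk below), with the reason for the check spelled out in comments:

    if (unlikely(ret == -EIO)) {
            trans->error = true;
            /*
             * After looping in the traverse-all retry path, orig_iter can be
             * NULL, so only flag the originating iterator when there is one:
             */
            if (orig_iter) {
                    orig_iter->flags |= BTREE_ITER_ERROR;
                    orig_iter->l[orig_iter->level].b =
                            BTREE_ITER_NO_NODE_ERROR;
            }
            goto out;
    }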
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8e9164aee409..78bc82c7b9c3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1010,8 +1010,11 @@ retry_all: if (unlikely(ret == -EIO)) { trans->error = true; - orig_iter->flags |= BTREE_ITER_ERROR; - orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR; + if (orig_iter) { + orig_iter->flags |= BTREE_ITER_ERROR; + orig_iter->l[orig_iter->level].b = + BTREE_ITER_NO_NODE_ERROR; + } goto out; } -- cgit From 618b9e575b40c862a62764043c961646f3ebc6dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2019 09:14:32 -0400 Subject: bcachefs: Fix undefined behaviour roundup_pow_of_two(0) is undefined Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 155e7c9bd89f..424d5cf48893 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1351,6 +1351,9 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) if (ret) return ret; + if (!idx) + return 0; + if (!gc && !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), GFP_KERNEL)) -- cgit From 58677a1d40df8fe3375e9badd7387cf1a2946a3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2019 16:51:57 -0400 Subject: bcachefs: bch2_inode_peek()/bch2_inode_write() Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 51 +++++++------- fs/bcachefs/fs-io.c | 87 +++++++++-------------- fs/bcachefs/fs.c | 200 +++++++++++++++++++++++----------------------------- fs/bcachefs/inode.c | 47 ++++++++++++ fs/bcachefs/inode.h | 5 ++ 5 files changed, 198 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 5a4263806610..4e631e04cf0c 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -281,51 +281,54 @@ int bch2_set_acl_trans(struct btree_trans *trans, return ret == -ENOENT ? 
0 : ret; } -static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - umode_t mode = (unsigned long) p; - - bi->bi_ctime = bch2_current_time(c); - bi->bi_mode = mode; - return 0; -} - int bch2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, - struct posix_acl *acl, int type) + struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; + struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; - umode_t mode = inode->v.i_mode; + struct posix_acl *acl; + umode_t mode; int ret; mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + acl = _acl; - if (type == ACL_TYPE_ACCESS && acl) { + inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto btree_err; + + mode = inode_u.bi_mode; + + if (type == ACL_TYPE_ACCESS) { ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); if (ret) goto err; } -retry: - bch2_trans_begin(&trans); - ret = bch2_set_acl_trans(&trans, - &inode->ei_inode, - &inode->ei_str_hash, - acl, type) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_set_acl_fn, - (void *)(unsigned long) mode) ?: + ret = bch2_set_acl_trans(&trans, &inode_u, + &inode->ei_str_hash, + acl, type); + if (ret) + goto btree_err; + + inode_u.bi_ctime = bch2_current_time(c); + inode_u.bi_mode = mode; + + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); +btree_err: if (ret == -EINTR) goto retry; if (unlikely(ret)) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index be121b755fc7..49c0343da462 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -241,11 +241,13 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i *new, bool *allocating, + struct bkey_i *new, + bool may_allocate, s64 *delta) { struct btree_iter *iter; struct bkey_s_c old; + int ret = 0; *delta = 0; @@ -253,21 +255,13 @@ static int sum_sector_overwrites(struct btree_trans *trans, if (IS_ERR(iter)) return PTR_ERR(iter); - old = bch2_btree_iter_peek_slot(iter); - - while (1) { - /* - * should not be possible to get an error here, since we're - * carefully not advancing past @new and thus whatever leaf node - * @_iter currently points to: - */ - BUG_ON(bkey_err(old)); - - if (allocating && - !*allocating && + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) - *allocating = true; + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { + ret = -ENOSPC; + break; + } *delta += (min(new->k.p.offset, old.k->p.offset) - @@ -278,12 +272,10 @@ static int sum_sector_overwrites(struct btree_trans *trans, if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - - old = bch2_btree_iter_next_slot(iter); } bch2_trans_iter_put(trans, iter); - return 0; + return ret; } int bch2_extent_update(struct btree_trans *trans, @@ -301,9 +293,7 @@ int bch2_extent_update(struct btree_trans *trans, struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; - bool allocating = false; bool 
extended = false; - bool inode_locked = false; s64 i_sectors_delta; int ret; @@ -315,15 +305,11 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - ret = sum_sector_overwrites(trans, extent_iter, - k, &allocating, - &i_sectors_delta); + ret = sum_sector_overwrites(trans, extent_iter, k, + may_allocate, &i_sectors_delta); if (ret) return ret; - if (!may_allocate && allocating) - return -ENOSPC; - bch2_trans_update(trans, extent_iter, k); new_i_size = min(k->k.p.offset << 9, new_i_size); @@ -331,29 +317,28 @@ int bch2_extent_update(struct btree_trans *trans, /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); if (IS_ERR(inode_iter)) return PTR_ERR(inode_iter); - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; - - inode_u = inode->ei_inode; inode_u.bi_sectors += i_sectors_delta; - /* XXX: this is slightly suspect */ + /* + * XXX: can BCH_INODE_I_SIZE_DIRTY be true here? i.e. can we + * race with truncate? + */ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && new_i_size > inode_u.bi_size) { inode_u.bi_size = new_i_size; extended = true; } - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i); + if (i_sectors_delta || extended) { + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i); + } } ret = bch2_trans_commit(trans, disk_res, @@ -365,33 +350,25 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) goto err; - inode->ei_inode.bi_sectors += i_sectors_delta; - - EBUG_ON(i_sectors_delta && - inode->ei_inode.bi_sectors != inode_u.bi_sectors); - - if (extended) { - inode->ei_inode.bi_size = new_i_size; - - if (direct) { - spin_lock(&inode->v.i_lock); - if (new_i_size > inode->v.i_size) - i_size_write(&inode->v, new_i_size); - spin_unlock(&inode->v.i_lock); - } + if (i_sectors_delta || extended) { + inode->ei_inode.bi_sectors = inode_u.bi_sectors; + inode->ei_inode.bi_size = inode_u.bi_size; } if (direct) i_sectors_acct(c, inode, quota_res, i_sectors_delta); + if (direct && extended) { + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + } if (total_delta) *total_delta += i_sectors_delta; err: if (!IS_ERR_OR_NULL(inode_iter)) bch2_trans_iter_put(trans, inode_iter); - if (inode_locked) - mutex_unlock(&inode->ei_update_lock); - return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0a83d5f61a6b..cbe1b90e80c2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -155,41 +155,19 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, void *p) { struct btree_iter *iter = NULL; - struct bkey_inode_buf *inode_p; - struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode->v.i_ino, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + iter = bch2_inode_peek(trans, inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret; - if (k.k->type != KEY_TYPE_inode) - return -EIO; - - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); + ret = set ? 
set(inode, inode_u, p) : 0; if (ret) return ret; - if (set) { - ret = set(inode, inode_u, p); - if (ret) - return ret; - } - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i); - return 0; + return bch2_inode_write(trans, iter, inode_u); } int __must_check bch2_write_inode(struct bch_fs *c, @@ -531,23 +509,13 @@ static int bch2_create(struct mnt_idmap *idmap, return 0; } -static int inode_update_for_link_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_ctime = bch2_current_time(c); - bch2_inode_nlink_inc(bi); - return 0; -} - static int __bch2_link(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_info *dir, struct dentry *dentry) { struct btree_trans trans; + struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; int ret; @@ -555,21 +523,30 @@ static int __bch2_link(struct bch_fs *c, bch2_trans_init(&trans, c, 4, 1024); retry: bch2_trans_begin(&trans); - ret = __bch2_dirent_create(&trans, dir->v.i_ino, &dir->ei_str_hash, mode_to_type(inode->v.i_mode), &dentry->d_name, inode->v.i_ino, - BCH_HASH_SET_MUST_CREATE) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_link_fn, - NULL) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK); + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; + + inode_u.bi_ctime = bch2_current_time(c); + bch2_inode_nlink_inc(&inode_u); + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); +err: if (ret == -EINTR) goto retry; @@ -600,36 +577,12 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_inode_info *unlink_inode = p; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - - bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); - - return 0; -} - -static int inode_update_for_unlink_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_ctime = bch2_current_time(c); - bch2_inode_nlink_dec(bi); - return 0; -} - static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct btree_iter *dir_iter, *inode_iter; struct bch_inode_unpacked dir_u, inode_u; struct btree_trans trans; int ret; @@ -641,25 +594,42 @@ retry: ret = __bch2_dirent_delete(&trans, dir->v.i_ino, &dir->ei_str_hash, - &dentry->d_name) ?: - bch2_write_inode_trans(&trans, dir, &dir_u, - inode_update_dir_for_unlink_fn, - inode) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_unlink_fn, - NULL) ?: + &dentry->d_name); + if (ret) + goto btree_err; + + dir_iter = bch2_inode_peek(&trans, &dir_u, dir->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto btree_err; + + inode_iter = bch2_inode_peek(&trans, &inode_u, 
inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto btree_err; + + dir_u.bi_mtime = dir_u.bi_ctime = inode_u.bi_ctime = + bch2_current_time(c); + + dir_u.bi_nlink -= S_ISDIR(inode_u.bi_mode); + bch2_inode_nlink_dec(&inode_u); + + ret = bch2_inode_write(&trans, dir_iter, &dir_u) ?: + bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); +btree_err: if (ret == -EINTR) goto retry; if (ret) goto err; - if (dir->ei_journal_seq > inode->ei_journal_seq) - inode->ei_journal_seq = dir->ei_journal_seq; + journal_seq_copy(inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); @@ -953,67 +923,60 @@ err: return ret; } -struct inode_write_setattr { - struct iattr *attr; - struct mnt_idmap *idmap; -}; - -static int inode_update_for_setattr_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) +static void bch2_setattr_copy(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct inode_write_setattr *s = p; - unsigned int ia_valid = s->attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), s->attr->ia_uid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), s->attr->ia_gid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); if (ia_valid & ATTR_ATIME) - bi->bi_atime = timespec_to_bch2_time(c, s->attr->ia_atime); + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); if (ia_valid & ATTR_MTIME) - bi->bi_mtime = timespec_to_bch2_time(c, s->attr->ia_mtime); + bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); if (ia_valid & ATTR_CTIME) - bi->bi_ctime = timespec_to_bch2_time(c, s->attr->ia_ctime); + bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); if (ia_valid & ATTR_MODE) { - umode_t mode = s->attr->ia_mode; + umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? s->attr->ia_gid + ? 
attr->ia_gid : inode->v.i_gid; if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(s->idmap, &inode->v, CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) mode &= ~S_ISGID; bi->bi_mode = mode; } - - return 0; } static int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_inode_info *inode, - struct iattr *iattr) + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans trans; + struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; - struct inode_write_setattr s = { iattr, idmap }; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (iattr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid); + if (attr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (iattr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), iattr->ia_gid); + if (attr->ia_valid & ATTR_GID) + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -1026,22 +989,33 @@ retry: kfree(acl); acl = NULL; - ret = bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_setattr_fn, &s) ?: - (iattr->ia_valid & ATTR_MODE - ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) - : 0) ?: + inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto btree_err; + + bch2_setattr_copy(idmap, inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { + ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); +btree_err: if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid); + bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f192536558c1..fc38cfb9e939 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -181,6 +181,53 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, return 0; } +struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u64 inum, unsigned flags) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + BTREE_ITER_SLOTS|flags); + if (IS_ERR(iter)) + return iter; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + if (ret) + goto err; + + return iter; +err: + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); +} + +int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf *inode_p; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); + return 0; +} + const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index e88ec78071bd..c5626c668639 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -46,6 +46,11 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +struct btree_iter *bch2_inode_peek(struct btree_trans *, + struct bch_inode_unpacked *, u64, unsigned); +int bch2_inode_write(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *); + void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -- cgit From d65a8551c822030e1214bcffbcb181d4878414b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 14:38:41 -0400 Subject: bcachefs: Fix deref of error pointer Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 31e55acbbead..a81fc3596fc1 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -251,9 +251,10 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - if (slot) + if (!IS_ERR_OR_NULL(slot)) bch2_trans_iter_put(trans, slot); - bch2_trans_iter_put(trans, iter); + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); return ret; found: -- cgit From bd09d268978e1206d81a169039cf86f639817854 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 14:39:38 -0400 Subject: bcachefs: Only look up inode io opts in extents btree We currently don't have a way to propagate inode io opts to indirect extents. This is a problem... Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2f0bdfbfcd61..c5d3375882d7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -549,7 +549,8 @@ peek: if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (cur_inum != k.k->p.inode) { + if (btree_id == BTREE_ID_EXTENTS && + cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ -- cgit From 73501ab82c44b1249916ded8dcb883f7705b1549 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 15:58:43 -0400 Subject: bcachefs: Don't use sha256 for siphash str hash key With the refactoring that's coming to add fuse support, we want bch2_hash_info_init() to be cheaper so we don't have to rely on anything cached besides the inode in the btree. 
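Sketching the effect (from the str_hash.h hunks below): bch2_hash_info_init() now takes its key straight from the inode's bi_hash_seed, and only the legacy BCH_STR_HASH_SIPHASH_OLD type still derives it with sha256, so the common case needs nothing beyond the unpacked inode:

    struct bch_hash_info info = {
            .type    = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
                       ~(~0U << INODE_STR_HASH_BITS),
            .crc_key = bi->bi_hash_seed,    /* used directly as the hash key
                                             * for the non-legacy types */
    };

    if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
            /* legacy on-disk format: sha256(bi_hash_seed) -> siphash key */
    }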
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 15 ++++++++++++--- fs/bcachefs/inode.c | 7 ++++--- fs/bcachefs/opts.h | 2 +- fs/bcachefs/str_hash.h | 37 ++++++++++++++++++++++++------------- 4 files changed, 41 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4bc3f8d3e7f4..eb6d712e7844 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1318,6 +1318,7 @@ enum bch_sb_features { BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_REFLINK = 6, + BCH_FEATURE_NEW_SIPHASH = 7, BCH_FEATURE_NR, }; @@ -1344,11 +1345,19 @@ enum bch_csum_opts { BCH_CSUM_OPT_NR = 3, }; -enum bch_str_hash_opts { +enum bch_str_hash_type { BCH_STR_HASH_CRC32C = 0, BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH = 2, - BCH_STR_HASH_NR = 3, + BCH_STR_HASH_SIPHASH_OLD = 2, + BCH_STR_HASH_SIPHASH = 3, + BCH_STR_HASH_NR = 4, +}; + +enum bch_str_hash_opts { + BCH_STR_HASH_OPT_CRC32C = 0, + BCH_STR_HASH_OPT_CRC64 = 1, + BCH_STR_HASH_OPT_SIPHASH = 2, + BCH_STR_HASH_OPT_NR = 3, }; #define BCH_COMPRESSION_TYPES() \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fc38cfb9e939..3dc46faaebbc 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,8 +6,7 @@ #include "error.h" #include "extents.h" #include "inode.h" -#include "io.h" -#include "keylist.h" +#include "str_hash.h" #include @@ -303,11 +302,13 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { s64 now = bch2_current_time(c); + enum bch_str_hash_type str_hash = + bch2_str_hash_opt_to_type(c, c->opts.str_hash); memset(inode_u, 0, sizeof(*inode_u)); /* ick */ - inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; + inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d9325d4bc024..a6f1d3ec7b90 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -127,7 +127,7 @@ enum opt_type { x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH, \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ NULL, "Hash function for directory entries and xattrs")\ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index a81fc3596fc1..7be4a8e50eaa 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -14,6 +14,23 @@ #include #include +static inline enum bch_str_hash_type +bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) +{ + switch (opt) { + case BCH_STR_HASH_OPT_CRC32C: + return BCH_STR_HASH_CRC32C; + case BCH_STR_HASH_OPT_CRC64: + return BCH_STR_HASH_CRC64; + case BCH_STR_HASH_OPT_SIPHASH: + return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH) + ? 
BCH_STR_HASH_SIPHASH + : BCH_STR_HASH_SIPHASH_OLD; + default: + BUG(); + } +} + struct bch_hash_info { u8 type; union { @@ -23,21 +40,16 @@ struct bch_hash_info { }; static inline struct bch_hash_info -bch2_hash_info_init(struct bch_fs *c, - const struct bch_inode_unpacked *bi) +bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { /* XXX ick */ struct bch_hash_info info = { .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & - ~(~0U << INODE_STR_HASH_BITS) + ~(~0U << INODE_STR_HASH_BITS), + .crc_key = bi->bi_hash_seed, }; - switch (info.type) { - case BCH_STR_HASH_CRC32C: - case BCH_STR_HASH_CRC64: - info.crc_key = bi->bi_hash_seed; - break; - case BCH_STR_HASH_SIPHASH: { + if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; @@ -46,10 +58,6 @@ bch2_hash_info_init(struct bch_fs *c, crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - break; - } - default: - BUG(); } return info; @@ -73,6 +81,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, case BCH_STR_HASH_CRC64: ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); break; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; @@ -92,6 +101,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, case BCH_STR_HASH_CRC64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: SipHash24_Update(&ctx->siphash, data, len); break; @@ -108,6 +118,7 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, return ctx->crc32c; case BCH_STR_HASH_CRC64: return ctx->crc64 >> 1; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: return SipHash24_End(&ctx->siphash) >> 1; default: -- cgit From 9638574229e3ae0175a46a63431149746c777b3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2019 18:35:36 -0400 Subject: bcachefs: Factor out fs-common.c This refactoring makes the code easier to understand by separating the bcachefs btree transactional code from the linux VFS code - but more importantly, it's also to share code with the fuse port. 
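The shape of the split, roughly (a sketch mirroring the __bch2_link() rewrite in the fs.c hunks further down): each VFS entry point collapses into a begin/commit retry loop around a shared *_trans helper in fs-common.c, which is the part a fuse port could reuse:

    do {
            bch2_trans_begin(&trans);

            ret = bch2_link_trans(&trans,
                                  dir->v.i_ino,
                                  inode->v.i_ino, &inode_u,
                                  &dentry->d_name) ?:
                  bch2_trans_commit(&trans, NULL,
                                    &inode->ei_journal_seq,
                                    BTREE_INSERT_ATOMIC|
                                    BTREE_INSERT_NOUNLOCK);
    } while (ret == -EINTR);

bch2_create_trans(), bch2_unlink_trans() and bch2_rename_trans() below follow the same pattern.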
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/dirent.c | 97 ++++------ fs/bcachefs/dirent.h | 29 +-- fs/bcachefs/fs-common.c | 280 +++++++++++++++++++++++++++ fs/bcachefs/fs-common.h | 37 ++++ fs/bcachefs/fs-ioctl.c | 10 + fs/bcachefs/fs.c | 495 +++++++++++++----------------------------------- fs/bcachefs/fs.h | 13 -- fs/bcachefs/fsck.c | 76 +++----- fs/bcachefs/inode.c | 39 ++-- fs/bcachefs/inode.h | 16 +- fs/bcachefs/recovery.c | 26 +-- 12 files changed, 586 insertions(+), 533 deletions(-) create mode 100644 fs/bcachefs/fs-common.c create mode 100644 fs/bcachefs/fs-common.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 4c2608409144..9d120936703a 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,6 +27,7 @@ bcachefs-y := \ error.o \ extents.o \ fs.o \ + fs-common.o \ fs-ioctl.o \ fs-io.o \ fsck.o \ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 38dd96808e90..304128d7251f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -138,10 +138,10 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int __bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - int flags) +int bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + int flags) { struct bkey_i_dirent *dirent; int ret; @@ -155,16 +155,6 @@ int __bch2_dirent_create(struct btree_trans *trans, dir_inum, &dirent->k_i, flags); } -int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, journal_seq, flags, - __bch2_dirent_create(&trans, dir_inum, hash_info, - type, name, dst_inum, flags)); -} - static void dirent_copy_target(struct bkey_i_dirent *dst, struct bkey_s_c_dirent src) { @@ -172,23 +162,22 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, - const struct qstr *name) -{ - return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); -} - int bch2_dirent_rename(struct btree_trans *trans, - struct bch_inode_info *src_dir, const struct qstr *src_name, - struct bch_inode_info *dst_dir, const struct qstr *dst_name, - enum bch_rename_mode mode) + u64 src_dir, struct bch_hash_info *src_hash, + u64 dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, u64 *src_inum, + const struct qstr *dst_name, u64 *dst_inum, + enum bch_rename_mode mode) { struct btree_iter *src_iter, *dst_iter; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); + struct bpos dst_pos = + POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); int ret; + *src_inum = *dst_inum = 0; + /* * Lookup dst: * @@ -198,24 +187,25 @@ int bch2_dirent_rename(struct btree_trans *trans, */ dst_iter = mode == BCH_RENAME ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name) + dst_hash, dst_dir, dst_name) : bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name, + dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); if (IS_ERR(dst_iter)) return PTR_ERR(dst_iter); old_dst = bch2_btree_iter_peek_slot(dst_iter); + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + /* Lookup src: */ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_dir->v.i_ino, src_name, + src_hash, src_dir, src_name, BTREE_ITER_INTENT); if (IS_ERR(src_iter)) return PTR_ERR(src_iter); old_src = bch2_btree_iter_peek_slot(src_iter); + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -269,8 +259,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_iter); + src_hash, src_iter); if (ret < 0) return ret; @@ -284,12 +273,12 @@ int bch2_dirent_rename(struct btree_trans *trans, return 0; } -int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name) +int bch2_dirent_delete_at(struct btree_trans *trans, + const struct bch_hash_info *hash_info, + struct btree_iter *iter) { - return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, name); + return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, iter); } int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, @@ -300,7 +289,17 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, return bch2_trans_do(c, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); + bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, + dir_inum, name)); +} + +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + return bch2_hash_lookup(trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, 0); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -314,8 +313,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, bch2_trans_init(&trans, c, 0, 0); - iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, 0); + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, hash_info, name); if (IS_ERR(iter)) { BUG_ON(PTR_ERR(iter) == -EINTR); goto out; @@ -349,16 +347,8 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) { - return bch2_trans_do(c, NULL, 0, - bch2_empty_dir_trans(&trans, dir_inum)); -} - -int bch2_readdir(struct bch_fs *c, struct file *file, - struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -366,22 +356,19 @@ int bch2_readdir(struct bch_fs *c, struct file *file, unsigned len; int ret; - if (!dir_emit_dots(file, ctx)) - return 0; - bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k, ret) { + POS(inum, ctx->pos), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; 
dirent = bkey_s_c_to_dirent(k); - if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0) + if (bkey_cmp(k.k->p, POS(inum, ctx->pos)) < 0) continue; - if (k.k->p.inode > inode->v.i_ino) + if (k.k->p.inode > inum) break; len = bch2_dirent_name_bytes(dirent); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bc64718a7832..9a57ad005468 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,15 +29,13 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int __bch2_dirent_create(struct btree_trans *, u64, - const struct bch_hash_info *, u8, - const struct qstr *, u64, int); -int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, - u8, const struct qstr *, u64, u64 *, int); - -int __bch2_dirent_delete(struct btree_trans *, u64, - const struct bch_hash_info *, - const struct qstr *); +int bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); + +int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, + struct btree_iter *); int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *, u64 *); @@ -48,15 +46,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - struct bch_inode_info *, const struct qstr *, - struct bch_inode_info *, const struct qstr *, + u64, struct bch_hash_info *, + u64, struct bch_hash_info *, + const struct qstr *, u64 *, + const struct qstr *, u64 *, enum bch_rename_mode); +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_empty_dir(struct bch_fs *, u64); -int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); +int bch2_readdir(struct bch_fs *, u64, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 index 000000000000..fdd2b9b6716f --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "dirent.h" +#include "fs-common.h" +#include "inode.h" +#include "xattr.h" + +#include + +int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl) +{ + struct bch_fs *c = trans->c; + struct btree_iter *dir_iter; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(trans->c); + int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, + name ? 
BTREE_ITER_INTENT : 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, new_inode, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) + return ret; + + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + return ret; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + acl, ACL_TYPE_ACCESS); + if (ret) + return ret; + } + + if (name) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + dir_u->bi_mtime = dir_u->bi_ctime = now; + + if (S_ISDIR(new_inode->bi_mode)) + dir_u->bi_nlink++; + + ret = bch2_inode_write(trans, dir_iter, dir_u); + if (ret) + return ret; + + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(new_inode->bi_mode), + name, new_inode->bi_inum, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + } + + return 0; +} + +int bch2_link_trans(struct btree_trans *trans, + u64 dir_inum, + u64 inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *inode_iter; + struct bch_inode_unpacked dir_u; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(trans->c); + + dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + dir_hash = bch2_hash_info_init(trans->c, &dir_u); + + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + + return bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +int bch2_unlink_trans(struct btree_trans *trans, + u64 dir_inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *dirent_iter, *inode_iter; + struct bch_hash_info dir_hash; + u64 inum, now = bch2_current_time(trans->c); + struct bkey_s_c k; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + dir_hash = bch2_hash_info_init(trans->c, dir_u); + + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, + &dir_hash, name); + if (IS_ERR(dirent_iter)) + return PTR_ERR(dirent_iter); + + k = bch2_btree_iter_peek_slot(dirent_iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); + + return (S_ISDIR(inode_u->bi_mode) + ? 
bch2_empty_dir_trans(trans, inum) + : 0) ?: + bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + struct bch_inode_unpacked *src_u) +{ + u64 src, dst; + unsigned id; + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { + if (dst_u->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(src_u, id); + dst = bch2_inode_opt_get(dst_u, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(dst_u, id, src); + ret = true; + } + + return ret; +} + +int bch2_rename_trans(struct btree_trans *trans, + u64 src_dir, struct bch_inode_unpacked *src_dir_u, + u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, + const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; + struct bch_hash_info src_hash, dst_hash; + u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + int ret; + + src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, + BTREE_ITER_INTENT); + if (IS_ERR(src_dir_iter)) + return PTR_ERR(src_dir_iter); + + src_hash = bch2_hash_info_init(trans->c, src_dir_u); + + if (dst_dir != src_dir) { + dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (IS_ERR(dst_dir_iter)) + return PTR_ERR(dst_dir_iter); + + dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; + } + + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, + src_name, &src_inode, + dst_name, &dst_inode, + mode); + if (ret) + return ret; + + src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, + BTREE_ITER_INTENT); + if (IS_ERR(src_inode_iter)) + return PTR_ERR(src_inode_iter); + + if (dst_inode) { + dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, + BTREE_ITER_INTENT); + if (IS_ERR(dst_inode_iter)) + return PTR_ERR(dst_inode_iter); + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) + return -ENOTDIR; + + if (S_ISDIR(dst_inode_u->bi_mode) && + bch2_empty_dir_trans(trans, dst_inode)) + return -ENOTEMPTY; + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) + return -EXDEV; + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) + return -EXDEV; + + if (S_ISDIR(src_inode_u->bi_mode)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } + + if (mode == BCH_RENAME_OVERWRITE) + bch2_inode_nlink_dec(dst_inode_u); + + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + + if (src_dir != dst_dir) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + + if (dst_inode) + dst_inode_u->bi_ctime = now; + + return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + (src_dir != dst_dir + ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + : 0 ) ?: + bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: + (dst_inode + ? 
bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + : 0 ); +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 index 000000000000..7adcfcf92aec --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_COMMON_H +#define _BCACHEFS_FS_COMMON_H + +struct posix_acl; + +int bch2_create_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *); + +int bch2_link_trans(struct btree_trans *, + u64, + u64, struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_unlink_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_rename_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + const struct qstr *, + enum bch_rename_mode); + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 0cf2621ec4fc..acc0a230ff0c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,6 +5,7 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" +#include "fs-common.h" #include "fs-ioctl.h" #include "quota.h" @@ -164,6 +165,15 @@ err: return ret; } +static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + + return !bch2_reinherit_attrs(bi, &dir->ei_inode); +} + static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct file *file, struct bch_inode_info *src, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cbe1b90e80c2..b19a2deed5c1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -9,6 +9,7 @@ #include "dirent.h" #include "extents.h" #include "fs.h" +#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fsck.h" @@ -148,34 +149,13 @@ void bch2_inode_update_after_write(struct bch_fs *c, bch2_inode_flags_to_vfs(inode); } -int __must_check bch2_write_inode_trans(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *inode_u, - inode_set_fn set, - void *p) -{ - struct btree_iter *iter = NULL; - int ret = 0; - - iter = bch2_inode_peek(trans, inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - - ret = set ? set(inode, inode_u, p) : 0; - if (ret) - return ret; - - return bch2_inode_write(trans, iter, inode_u); -} - int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_info *inode, inode_set_fn set, void *p, unsigned fields) { struct btree_trans trans; + struct btree_iter *iter; struct bch_inode_unpacked inode_u; int ret; @@ -183,7 +163,11 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| @@ -238,32 +222,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - u64 src, dst; - unsigned id; - int ret = 1; - - for (id = 0; id < Inode_opt_nr; id++) { - if (bi->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(&dir->ei_inode, id); - dst = bch2_inode_opt_get(bi, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(bi, id, src); - ret = 0; - } - - return ret; -} - struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -291,39 +249,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } -static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, - const struct inode *dir, umode_t mode) -{ - kuid_t uid = current_fsuid(); - kgid_t gid; - - if (dir && dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - gid = current_fsgid(); - - inode_u->bi_uid = from_kuid(i_user_ns(dir), uid); - inode_u->bi_gid = from_kgid(i_user_ns(dir), gid); - inode_u->bi_mode = mode; -} - -static int inode_update_for_create_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_inode_unpacked *new_inode = p; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - - if (S_ISDIR(new_inode->bi_mode)) - bi->bi_nlink++; - - return 0; -} - static int inum_test(struct inode *inode, void *p) { unsigned long *ino = p; @@ -341,40 +266,27 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *default_acl = NULL, *acl = NULL; u64 journal_seq = 0; int ret; - bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); - bch2_inode_init_owner(&inode_u, &dir->v, mode); - - hash_info = bch2_hash_info_init(c, &inode_u); - - if (tmpfile) - inode_u.bi_flags |= BCH_INODE_UNLINKED; - - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - return ERR_PTR(ret); - + /* + * preallocate acls + vfs inode before btree transaction, so that + * nothing can fail after the transaction succeeds: + */ #ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); + ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); if (ret) - goto err; + return ERR_PTR(ret); #endif - - /* - * preallocate vfs inode before btree transaction, so that nothing can - * fail after the transaction succeeds: - */ inode = to_bch_ei(new_inode(c->vfs_sb)); if (unlikely(!inode)) { - ret = -ENOMEM; + inode = ERR_PTR(-ENOMEM); goto err; } + bch2_inode_init_early(c, &inode_u); + if (!tmpfile) mutex_lock(&dir->ei_update_lock); @@ -382,38 +294,28 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(&trans); - ret = __bch2_inode_create(&trans, &inode_u, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint) ?: - (default_acl - ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, - default_acl, ACL_TYPE_DEFAULT) - : 0) ?: - (acl - ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, - acl, ACL_TYPE_ACCESS) - : 0) ?: - (!tmpfile - ? 
__bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(mode), - &dentry->d_name, - inode_u.bi_inum, - BCH_HASH_SET_MUST_CREATE) - : 0) ?: - (!tmpfile - ? bch2_write_inode_trans(&trans, dir, &dir_u, - inode_update_for_create_fn, - &inode_u) - : 0) ?: - bch2_trans_commit(&trans, NULL, - &journal_seq, + ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, + !tmpfile ? &dentry->d_name : NULL, + from_kuid(i_user_ns(&dir->v), current_fsuid()), + from_kgid(i_user_ns(&dir->v), current_fsgid()), + mode, rdev, + default_acl, acl) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_before_quota; + + ret = bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); - if (ret == -EINTR) - goto retry; - if (unlikely(ret)) + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: + if (ret == -EINTR) + goto retry; goto err_trans; + } if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, @@ -444,7 +346,7 @@ retry: * We raced, another process pulled the new inode into cache * before us: */ - old->ei_journal_seq = inode->ei_journal_seq; + journal_seq_copy(old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -458,7 +360,7 @@ retry: } bch2_trans_exit(&trans); -out: +err: posix_acl_release(default_acl); posix_acl_release(acl); return inode; @@ -469,10 +371,8 @@ err_trans: bch2_trans_exit(&trans); make_bad_inode(&inode->v); iput(&inode->v); -err: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); inode = ERR_PTR(ret); - goto out; + goto err; } /* methods */ @@ -515,40 +415,23 @@ static int __bch2_link(struct bch_fs *c, struct dentry *dentry) { struct btree_trans trans; - struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; int ret; mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - ret = __bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(inode->v.i_mode), - &dentry->d_name, - inode->v.i_ino, - BCH_HASH_SET_MUST_CREATE); - if (ret) - goto err; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - goto err; - - inode_u.bi_ctime = bch2_current_time(c); - bch2_inode_nlink_inc(&inode_u); - - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK); -err: - if (ret == -EINTR) - goto retry; + do { + bch2_trans_begin(&trans); + ret = bch2_link_trans(&trans, + dir->v.i_ino, + inode->v.i_ino, &inode_u, + &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + } while (ret == -EINTR); if (likely(!ret)) bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -582,60 +465,36 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_iter *dir_iter, *inode_iter; struct bch_inode_unpacked dir_u, inode_u; struct btree_trans trans; int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - - ret = __bch2_dirent_delete(&trans, dir->v.i_ino, - &dir->ei_str_hash, - &dentry->d_name); - if (ret) - 
goto btree_err; - - dir_iter = bch2_inode_peek(&trans, &dir_u, dir->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); - if (ret) - goto btree_err; - - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - goto btree_err; - - dir_u.bi_mtime = dir_u.bi_ctime = inode_u.bi_ctime = - bch2_current_time(c); - - dir_u.bi_nlink -= S_ISDIR(inode_u.bi_mode); - bch2_inode_nlink_dec(&inode_u); - ret = bch2_inode_write(&trans, dir_iter, &dir_u) ?: - bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &dir->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); -btree_err: - if (ret == -EINTR) - goto retry; - if (ret) - goto err; - - journal_seq_copy(inode, dir->ei_journal_seq); + do { + bch2_trans_begin(&trans); + + ret = bch2_unlink_trans(&trans, + dir->v.i_ino, &dir_u, + &inode_u, &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + } while (ret == -EINTR); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + + journal_seq_copy(inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); + } - bch2_inode_update_after_write(c, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, - ATTR_MTIME); -err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -693,11 +552,6 @@ static int bch2_mkdir(struct mnt_idmap *idmap, static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - - if (bch2_empty_dir(c, dentry->d_inode->i_ino)) - return -ENOTEMPTY; - return bch2_unlink(vdir, dentry); } @@ -715,99 +569,31 @@ static int bch2_mknod(struct mnt_idmap *idmap, return 0; } -struct rename_info { - u64 now; - struct bch_inode_info *src_dir; - struct bch_inode_info *dst_dir; - struct bch_inode_info *src_inode; - struct bch_inode_info *dst_inode; - enum bch_rename_mode mode; -}; - -static int inode_update_for_rename_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct rename_info *info = p; - int ret; - - if (inode == info->src_dir) { - bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink += info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode) && - info->mode == BCH_RENAME_EXCHANGE; - } - - if (inode == info->dst_dir) { - bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink -= info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode); - } - - if (inode == info->src_inode) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); - - BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_EXCHANGE) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); - - BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_OVERWRITE) { - BUG_ON(bi->bi_nlink && - S_ISDIR(info->dst_inode->v.i_mode)); - - bch2_inode_nlink_dec(bi); - } - - if (inode == info->src_dir || - inode == info->dst_dir) - bi->bi_mtime = info->now; - bi->bi_ctime = info->now; - - return 0; -} - static int bch2_rename2(struct mnt_idmap *idmap, struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { struct 
bch_fs *c = src_vdir->i_sb->s_fs_info; - struct rename_info i = { - .src_dir = to_bch_ei(src_vdir), - .dst_dir = to_bch_ei(dst_vdir), - .src_inode = to_bch_ei(src_dentry->d_inode), - .dst_inode = to_bch_ei(dst_dentry->d_inode), - .mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME, - }; - struct btree_trans trans; + struct bch_inode_info *src_dir = to_bch_ei(src_vdir); + struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); + struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct btree_trans trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? BCH_RENAME_OVERWRITE : BCH_RENAME; u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) return -EINVAL; - if (i.mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(i.src_inode->v.i_mode) != - S_ISDIR(i.dst_inode->v.i_mode)) - return -ENOTDIR; - - if (S_ISDIR(i.src_inode->v.i_mode) && - bch2_empty_dir(c, i.dst_inode->v.i_ino)) - return -ENOTEMPTY; - - ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + if (mode == BCH_RENAME_OVERWRITE) { + ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 0, LLONG_MAX); if (ret) return ret; @@ -816,37 +602,24 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c, 8, 2048); bch2_lock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); - - if (S_ISDIR(i.src_inode->v.i_mode) && - inode_attrs_changing(i.dst_dir, i.src_inode)) { - ret = -EXDEV; - goto err; - } - - if (i.mode == BCH_RENAME_EXCHANGE && - S_ISDIR(i.dst_inode->v.i_mode) && - inode_attrs_changing(i.src_dir, i.dst_inode)) { - ret = -EXDEV; - goto err; - } - - if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.src_inode, - i.dst_dir->ei_qid, + src_dir, + dst_dir, + src_inode, + dst_inode); + + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, src_inode, + dst_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } - if (i.mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.dst_inode, - i.src_dir->ei_qid, + if (mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst_inode, + src_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) @@ -855,24 +628,14 @@ static int bch2_rename2(struct mnt_idmap *idmap, retry: bch2_trans_begin(&trans); - i.now = bch2_current_time(c); - - ret = bch2_dirent_rename(&trans, - i.src_dir, &src_dentry->d_name, - i.dst_dir, &dst_dentry->d_name, - i.mode) ?: - bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, - inode_update_for_rename_fn, &i) ?: - (i.src_dir != i.dst_dir - ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: - bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, - inode_update_for_rename_fn, &i) ?: - (i.dst_inode - ? 
bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: + ret = bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode) ?: bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| @@ -882,43 +645,47 @@ retry: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.src_dir, journal_seq); + journal_seq_copy(src_dir, journal_seq); - if (i.src_dir != i.dst_dir) { - bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + if (src_dir != dst_dir) { + bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.dst_dir, journal_seq); + journal_seq_copy(dst_dir, journal_seq); } - journal_seq_copy(i.src_inode, journal_seq); - if (i.dst_inode) - journal_seq_copy(i.dst_inode, journal_seq); - - bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - if (i.dst_inode) - bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + journal_seq_copy(src_inode, journal_seq); + + if (dst_inode) { + bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); + journal_seq_copy(dst_inode, journal_seq); + } err: bch2_trans_exit(&trans); - bch2_fs_quota_transfer(c, i.src_inode, - bch_qid(&i.src_inode->ei_inode), + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); - if (i.dst_inode) - bch2_fs_quota_transfer(c, i.dst_inode, - bch_qid(&i.dst_inode->ei_inode), + if (dst_inode) + bch2_fs_quota_transfer(c, dst_inode, + bch_qid(&dst_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); bch2_unlock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); + src_dir, + dst_dir, + src_inode, + dst_inode); return ret; } @@ -1251,9 +1018,13 @@ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { - struct bch_fs *c = file_inode(file)->i_sb->s_fs_info; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + if (!dir_emit_dots(file, ctx)) + return 0; - return bch2_readdir(c, file, ctx); + return bch2_readdir(c, inode->v.i_ino, ctx); } static const struct file_operations bch_file_operations = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3ee9c17064f..b3a2993dd9bc 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -103,11 +103,6 @@ static inline struct bch_inode_info *file_bch_inode(struct file *file) return to_bch_ei(file_inode(file)); } -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) @@ -162,17 +157,9 @@ void bch2_inode_update_after_write(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); -int __must_check bch2_write_inode_trans(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - inode_set_fn, void *); int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); -int 
bch2_reinherit_attrs_fn(struct bch_inode_info *, - struct bch_inode_unpacked *, - void *); - void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c5540536f47c..5acf1fb64543 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -4,7 +4,7 @@ #include "btree_update.h" #include "dirent.h" #include "error.h" -#include "fs.h" +#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" @@ -80,9 +80,7 @@ static int reattach_inode(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, u64 inum) { - struct bch_hash_info lostfound_hash_info = - bch2_hash_info_init(c, lostfound_inode); - struct bkey_inode_buf packed; + struct bch_inode_unpacked inode_u; char name_buf[20]; struct qstr name; int ret; @@ -90,30 +88,14 @@ static int reattach_inode(struct bch_fs *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - lostfound_inode->bi_nlink++; - - bch2_inode_pack(&packed, lostfound_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while updating lost+found", - ret, inum); - return ret; - } + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, + inum, &inode_u, &name)); + if (ret) + bch_err(c, "error %i reattaching inode %llu", ret, inum); - ret = bch2_dirent_create(c, lostfound_inode->bi_inum, - &lostfound_hash_info, - DT_DIR, &name, inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while creating new dirent", - ret, inum); - return ret; - } return ret; } @@ -758,7 +740,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) fsck_err: return ret; create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; @@ -778,7 +760,6 @@ static int check_lostfound(struct bch_fs *c, struct qstr lostfound = QSTR("lost+found"); struct bch_hash_info root_hash_info = bch2_hash_info_init(c, root_inode); - struct bkey_inode_buf packed; u64 inum; int ret; @@ -806,33 +787,20 @@ static int check_lostfound(struct bch_fs *c, fsck_err: return ret; create_lostfound: - root_inode->bi_nlink++; - - bch2_inode_pack(&packed, root_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) - return ret; - - bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, - 0, root_inode); - - ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + bch2_inode_init_early(c, lostfound_inode); + + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(&trans, + BCACHEFS_ROOT_INO, root_inode, + lostfound_inode, &lostfound, + 0, 0, S_IFDIR|0755, 0, NULL, NULL)); if (ret) - return ret; - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode->bi_inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) - return ret; + bch_err(c, "error creating lost+found: %i", ret); - return 0; + return ret; } struct inode_bitmap { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3dc46faaebbc..aeae536b39f1 100644 --- 
a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -297,11 +297,9 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) +void bch2_inode_init_early(struct bch_fs *c, + struct bch_inode_unpacked *inode_u) { - s64 now = bch2_current_time(c); enum bch_str_hash_type str_hash = bch2_str_hash_opt_to_type(c, c->opts.str_hash); @@ -311,7 +309,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); +} +void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ inode_u->bi_mode = mode; inode_u->bi_uid = uid; inode_u->bi_gid = gid; @@ -321,6 +324,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_ctime = now; inode_u->bi_otime = now; + if (parent && parent->bi_mode & S_ISGID) { + inode_u->bi_gid = parent->bi_gid; + if (S_ISDIR(mode)) + inode_u->bi_mode |= S_ISGID; + } + if (parent) { #define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; BCH_INODE_OPTS() @@ -328,6 +337,15 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, } } +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + bch2_inode_init_early(c, inode_u); + bch2_inode_init_late(inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); +} + static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { @@ -340,9 +358,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int __bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) { struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; @@ -408,13 +426,6 @@ out: return -ENOSPC; } -int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) -{ - return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, - __bch2_inode_create(&trans, inode_u, min, max, hint)); -} - int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { struct btree_trans trans; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index c5626c668639..b32c0a47c25d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -51,14 +51,17 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *, int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +void bch2_inode_init_late(struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int __bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); -int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, +int bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, u64, u64, u64 *); int bch2_inode_rm(struct 
bch_fs *, u64); @@ -108,6 +111,11 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2e880955a07c..e6015bc13e9b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "dirent.h" #include "ec.h" #include "error.h" +#include "fs-common.h" #include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -952,7 +953,6 @@ int bch2_fs_initialize(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; - struct bch_hash_info root_hash_info; struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct bch_dev *ca; @@ -997,7 +997,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_nlink++; /* lost+found */ bch2_inode_pack(&packed_inode, &root_inode); err = "error creating root directory"; @@ -1007,24 +1006,15 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - bch2_inode_init(c, &lostfound_inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, - &root_inode); - lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; - bch2_inode_pack(&packed_inode, &lostfound_inode); + bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, 0); - if (ret) - goto err; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode.bi_inum, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, 0755, 0, + NULL, NULL)); if (ret) goto err; -- cgit From 137b0ed907f1c0a5036288fa340685f55fb37754 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 17:07:20 -0400 Subject: bcachefs: bch2_extent_atomic_end() now traverses iter This fixes a bug in io.c bch2_write_index_default() - it was missing the traverse call, but bch2_extent_atomic_end returns an error now and can just call it itself. 
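As a rough sketch of the new calling convention (the iterator and key
variables here are illustrative, not taken from this patch), a caller can
now simply do:

	struct bpos atomic_end;
	int ret;

	/*
	 * No explicit bch2_btree_iter_traverse() beforehand:
	 * bch2_extent_atomic_end() traverses the iterator itself and
	 * returns any error from that traverse.
	 */
	ret = bch2_extent_atomic_end(iter, k, &atomic_end);
	if (ret)
		return ret;

The traverse call dropped from bch2_extent_update() in fs-io.c follows the
same pattern, presumably because bch2_extent_trim_atomic() reaches
bch2_extent_atomic_end() internally.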
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 12 +++++++++--- fs/bcachefs/fs-io.c | 8 -------- fs/bcachefs/recovery.c | 6 +----- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index db32d9eaa3dc..41a2b36f1d2f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1010,13 +1010,19 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct bpos *end) { struct btree_trans *trans = iter->trans; - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; + struct btree *b; + struct btree_node_iter node_iter; struct bkey_packed *_k; unsigned nr_iters = 0; int ret; - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + b = iter->l[0].b; + node_iter = iter->l[0].iter; + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); *end = bpos_min(insert->k.p, b->key.k.p); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 49c0343da462..90587a556cee 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -297,10 +297,6 @@ int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta; int ret; - ret = bch2_btree_iter_traverse(extent_iter); - if (ret) - return ret; - ret = bch2_extent_trim_atomic(k, extent_iter); if (ret) return ret; @@ -2695,10 +2691,6 @@ reassemble: copy.k.k.p.offset += shift >> 9; bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); - ret = bch2_btree_iter_traverse(dst); - if (ret) - goto bkey_err; - ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); if (ret) goto bkey_err; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e6015bc13e9b..97c0d7d1fe77 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -268,7 +268,7 @@ retry: BTREE_ITER_INTENT); do { - ret = bch2_btree_iter_traverse(iter); + ret = bch2_extent_atomic_end(iter, k, &atomic_end); if (ret) goto err; @@ -282,10 +282,6 @@ retry: if (ret) goto err; - ret = bch2_extent_atomic_end(split_iter, k, &atomic_end); - if (ret) - goto err; - if (!remark && remark_if_split && bkey_cmp(atomic_end, k->k.p) < 0) { -- cgit From 0741d378216054145c9fd2c316924e1d403c8266 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 19:14:43 -0400 Subject: bcachefs: Don't allocate memory under mark_lock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 03a3d24d7451..f8485fba55e2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -642,12 +642,7 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca; unsigned i; - - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); + int ret; BUG_ON(c->usage_gc); @@ -675,6 +670,18 @@ static int bch2_gc_start(struct bch_fs *c, } } + ret = bch2_ec_mem_alloc(c, true); + if (ret) + return ret; + + percpu_down_write(&c->mark_lock); + + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -699,7 +706,9 @@ static int bch2_gc_start(struct bch_fs *c, } }; - return bch2_ec_mem_alloc(c, true); + percpu_up_write(&c->mark_lock); + + return 
0; } /** @@ -732,10 +741,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, down_write(&c->gc_lock); again: - percpu_down_write(&c->mark_lock); ret = bch2_gc_start(c, metadata_only); - percpu_up_write(&c->mark_lock); - if (ret) goto out; -- cgit From 63fbf458cb7d7df6b58f982df7496f79ee9e6863 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Oct 2019 20:40:47 -0400 Subject: bcachefs: Can't be holding read locks while taking write locks Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 9 +++++++++ fs/bcachefs/dirent.c | 7 ++++--- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fs-common.c | 25 +++++++++++++------------ fs/bcachefs/fs-common.h | 3 +-- fs/bcachefs/reflink.c | 12 +++++------- 6 files changed, 33 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 05b9c0d2e893..fd50f51943c3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -451,6 +451,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; + struct btree_iter *iter; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -473,6 +474,14 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out_clear_replicas; } + trans_for_each_iter(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + BUG_ON(trans->iters_live & (1ULL << iter->idx)); + __bch2_btree_iter_unlock(iter); + } + } + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update(trans, i) btree_insert_entry_checks(trans, i); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 304128d7251f..2a3e830ebf50 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -296,10 +296,10 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, unsigned flags) { return bch2_hash_lookup(trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, 0); + hash_info, dir_inum, name, flags); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -313,7 +313,8 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, bch2_trans_init(&trans, c, 0, 0); - iter = __bch2_dirent_lookup_trans(&trans, dir_inum, hash_info, name); + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, + hash_info, name, 0); if (IS_ERR(iter)) { BUG_ON(PTR_ERR(iter) == -EINTR); goto out; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 9a57ad005468..e6184dc796d3 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -55,7 +55,7 @@ int bch2_dirent_rename(struct btree_trans *, struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans *, u64, const struct bch_hash_info *, - const struct qstr *); + const struct qstr *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index fdd2b9b6716f..a4497eeb1f1b 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -24,8 +24,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, u64 now = bch2_current_time(trans->c); int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, - name ? 
BTREE_ITER_INTENT : 0); + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); if (IS_ERR(dir_iter)) return PTR_ERR(dir_iter); @@ -76,8 +75,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, return 0; } -int bch2_link_trans(struct btree_trans *trans, - u64 dir_inum, +int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *inode_u, const struct qstr *name) { @@ -86,19 +84,22 @@ int bch2_link_trans(struct btree_trans *trans, struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); - dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); if (IS_ERR(inode_iter)) return PTR_ERR(inode_iter); - dir_hash = bch2_hash_info_init(trans->c, &dir_u); - inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); + dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + /* XXX: shouldn't we be updating mtime/ctime on the directory? */ + + dir_hash = bch2_hash_info_init(trans->c, &dir_u); + bch2_trans_iter_put(trans, dir_iter); + return bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: @@ -121,8 +122,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(trans->c, dir_u); - dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, - &dir_hash, name); + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); if (IS_ERR(dirent_iter)) return PTR_ERR(dirent_iter); diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 7adcfcf92aec..c1621485a526 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -12,8 +12,7 @@ int bch2_create_trans(struct btree_trans *, u64, struct posix_acl *, struct posix_acl *); -int bch2_link_trans(struct btree_trans *, - u64, +int bch2_link_trans(struct btree_trans *, u64, u64, struct bch_inode_unpacked *, const struct qstr *); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ad526d280a14..f1b0e7fc8487 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -144,20 +144,18 @@ err: static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) { struct bkey_s_c k = bch2_btree_iter_peek(iter); + int ret; - while (1) { - if (bkey_err(k)) - return k; - + for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) return bkey_s_c_null; if (k.k->type == KEY_TYPE_extent || k.k->type == KEY_TYPE_reflink_p) - return k; - - k = bch2_btree_iter_next(iter); + break; } + + return k; } s64 bch2_remap_range(struct bch_fs *c, -- cgit From a40d97a771387362dd272608ed2df0a1fd39343e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Oct 2019 15:09:30 -0400 Subject: bcachefs: Fix incorrect use of bch2_extent_atomic_end() bch2_extent_atomic_end counts the number of iterators requried for marking overwrites - but journal replay never marks overwrites, so that part was incorrect. And counting iterators for the key being inserted should be unnecessary because we did that prior to the key being inserted before it was first journalled. This should fix an iterator overflow bug - the iterators for walking overwrites were totally unneeded. 
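Concretely, since replay never marks overwrites it only has to split keys at
btree leaf boundaries, so the call can be replaced by a plain traverse plus a
clamp to the end of the current leaf node (this mirrors the recovery.c hunk
below; error handling is elided):

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		goto err;

	/*
	 * Only split at the end of the current leaf node - no iterators
	 * need to be counted for overwrites, because replay never marks
	 * them:
	 */
	atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);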
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 97c0d7d1fe77..095eef3828ce 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -268,10 +268,12 @@ retry: BTREE_ITER_INTENT); do { - ret = bch2_extent_atomic_end(iter, k, &atomic_end); + ret = bch2_btree_iter_traverse(iter); if (ret) goto err; + atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); + split_iter = bch2_trans_copy_iter(&trans, iter); ret = PTR_ERR_OR_ZERO(split_iter); if (ret) -- cgit From f7c0fcdd396ff4bd3175000eb3911f75edbc85c5 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Tue, 8 Oct 2019 19:16:28 -0700 Subject: bcachefs: Fix uninitialized data in bch2_gc_btree() Running the filesystem under valgrind exposed a path where the max_stale variable in bch2_gc_btree() might not be initialized before use in a rare case when there are no btree nodes in a transaction. Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f8485fba55e2..f7d9abfdb3de 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -218,7 +218,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; - u8 max_stale; + u8 max_stale = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); -- cgit From bf974f92039778d338e265278cb7aeaabf82ec2d Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Tue, 8 Oct 2019 19:17:06 -0700 Subject: bcachefs: Initialize journal pad data in bch_replica_entry objects. Running the filesystem under valgrind exposed some garbage data being written to disk in bch2_journal_super_entries_add_common(), in the portion which encodes bch_replica_entry objects. Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index af6fb90413e9..14ff191ad702 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1030,9 +1030,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, - sizeof(u64)) - 1; + int u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, + sizeof(u64)) - 1; + memset(u, 0, u64s * sizeof(u64)); + u->entry.u64s = u64s; u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), -- cgit From 332c6e5370717fe28776b7427806043c22a81f69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Oct 2019 21:33:56 -0400 Subject: bcachefs: Fix bch2_mark_extent() If an extent only contained cached or erasure coded pointers, there won't be any devices in the normal dirty replicas list or an entry to update. 
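In that case the accumulated replicas entry is empty and must not be passed
to update_replicas(); the fix amounts to guarding the update, as in the hunk
below:

	/*
	 * r.e only lists devices for dirty, non-erasure-coded pointers;
	 * if the extent had none of those, there is no replicas entry to
	 * update:
	 */
	if (r.e.nr_devs)
		update_replicas(c, fs_usage, &r.e, dirty_sectors);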
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5e0e699c679a..9beb4d4cf85d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1063,7 +1063,8 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } - update_replicas(c, fs_usage, &r.e, dirty_sectors); + if (r.e.nr_devs) + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } -- cgit From 9ef6068c4dce124235b335847570ae8f2bf2911d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Oct 2019 16:22:35 -0400 Subject: bcachefs: Fix bch2_extent_ptr_durability() We were looking up the wrong entry in the stripes radix tree. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 41a2b36f1d2f..a8c2081cdd25 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -79,7 +79,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, for (i = 0; i < p.ec_nr; i++) { struct stripe *s = - genradix_ptr(&c->stripes[0], p.idx); + genradix_ptr(&c->stripes[0], p.ec[i].idx); if (WARN_ON(!s)) continue; -- cgit From 37954a275fce612f60406bc79f5bc0b07b4b6558 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Oct 2019 18:45:29 -0400 Subject: bcachefs: Limit pointers to being in only one stripe This make the disk accounting code saner, and it's not clear why we'd ever want the same data to be in multiple stripes simultaneously. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 28 +++++++++++----------------- fs/bcachefs/ec.c | 5 ++--- fs/bcachefs/extents.c | 21 ++++++++++----------- fs/bcachefs/extents.h | 5 +++-- fs/bcachefs/extents_types.h | 4 ++-- fs/bcachefs/replicas.c | 4 +--- 6 files changed, 29 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9beb4d4cf85d..dc184de053ee 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1027,7 +1027,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; struct bch_replicas_padded r; s64 dirty_sectors = 0; - unsigned i; int ret; r.e.data_type = data_type; @@ -1047,17 +1046,15 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, if (!stale) update_cached_sectors(c, fs_usage, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_mark_stripe_ptr(c, p.ec[i], - data_type, fs_usage, - disk_sectors, flags); - if (ret) - return ret; - } + ret = bch2_mark_stripe_ptr(c, p.ec, + data_type, fs_usage, + disk_sectors, flags); + if (ret) + return ret; r.e.nr_required = 0; } @@ -1564,7 +1561,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, struct bch_replicas_padded r; s64 dirty_sectors = 0; bool stale; - unsigned i; int ret; r.e.data_type = data_type; @@ -1589,16 +1585,14 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (!stale) update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], - disk_sectors, data_type); - if (ret) - return ret; - } + ret = 
bch2_trans_mark_stripe_ptr(trans, p.ec, + disk_sectors, data_type); + if (ret) + return ret; r.e.nr_required = 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 424d5cf48893..316dd82809ff 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -433,10 +433,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) closure_init_stack(&cl); - BUG_ON(!rbio->pick.idx || - rbio->pick.idx - 1 >= rbio->pick.ec_nr); + BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + stripe_idx = rbio->pick.ec.idx; buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a8c2081cdd25..50cad6725c1b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -66,7 +66,7 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { - unsigned i, durability = 0; + unsigned durability = 0; struct bch_dev *ca; if (p.ptr.cached) @@ -77,16 +77,16 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - for (i = 0; i < p.ec_nr; i++) { + if (p.has_ec) { struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec[i].idx); + genradix_ptr(&c->stripes[0], p.ec.idx); if (WARN_ON(!s)) - continue; + goto out; durability = max_t(unsigned, durability, s->nr_redundant); } - +out: return durability; } @@ -205,10 +205,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.idx++; if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) + !p.idx && p.has_ec) p.idx++; - if (p.idx >= p.ec_nr + 1) + if (p.idx >= (unsigned) p.has_ec + 1) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -1543,7 +1543,6 @@ void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; - unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = ptrs.start; @@ -1562,9 +1561,9 @@ found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(k, pos, to_entry(&p->ptr)); - for (i = 0; i < p->ec_nr; i++) { - p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec[i])); + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 7253cd01db6a..cc7ee9067b50 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -228,7 +228,7 @@ struct bkey_ptrs { __label__ out; \ \ (_ptr).idx = 0; \ - (_ptr).ec_nr = 0; \ + (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (extent_entry_type(_entry)) { \ @@ -242,7 +242,8 @@ struct bkey_ptrs { entry_to_crc(_entry)); \ break; \ case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + (_ptr).ec = _entry->stripe_ptr; \ + (_ptr).has_ec = true; \ break; \ } \ out: \ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index a8dd6952d989..43d6c341ecca 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -21,10 +21,10 @@ struct bch_extent_crc_unpacked { struct extent_ptr_decoded { unsigned idx; - unsigned ec_nr; + bool has_ec; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec[4]; + struct bch_extent_stripe_ptr ec; }; struct bch_io_failures { diff --git 
a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index afd226f3c8e7..eef9f54808fb 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,10 +84,8 @@ static void extent_to_replicas(struct bkey_s_c k, if (p.ptr.cached) continue; - if (p.ec_nr) { + if (p.has_ec) r->nr_required = 0; - break; - } r->devs[r->nr_devs++] = p.ptr.dev; } -- cgit From 9ec211b0ff4019407d029c49099f24dfedbc4db1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Oct 2019 22:56:33 -0400 Subject: bcachefs: Fix ec_stripes_read() The bkey_s_c returned by btree_iter_(peek|next) points into the btree iter type, so advancing the iterator and then using the one previously returned is a bug... Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/ec.c | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index eb6d712e7844..6ba830583846 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1510,14 +1510,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); /* Btree: */ -#define BCH_BTREE_IDS() \ +#define BCH_BTREE_IDS() \ x(EXTENTS, 0, "extents") \ x(INODES, 1, "inodes") \ x(DIRENTS, 2, "dirents") \ x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") \ + x(EC, 6, "stripes") \ x(REFLINK, 7, "reflink") enum btree_id { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 316dd82809ff..f32b8e6bf2ce 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -135,8 +135,6 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); - - bch2_bkey_ptrs_to_text(out, c, k); } static int ptr_matches_stripe(struct bch_fs *c, @@ -1277,7 +1275,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) struct btree_trans trans; struct btree_iter *btree_iter; struct journal_iter journal_iter; - struct bkey_s_c btree_k, journal_k, k; + struct bkey_s_c btree_k, journal_k; int ret; ret = bch2_fs_ec_start(c); @@ -1293,33 +1291,31 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) journal_k = bch2_journal_iter_peek(&journal_iter); while (1) { + bool btree; + if (btree_k.k && journal_k.k) { int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); - if (cmp < 0) { - k = btree_k; - btree_k = bch2_btree_iter_next(btree_iter); - } else if (cmp == 0) { + if (!cmp) btree_k = bch2_btree_iter_next(btree_iter); - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } else { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } + btree = cmp < 0; } else if (btree_k.k) { - k = btree_k; - btree_k = bch2_btree_iter_next(btree_iter); + btree = true; } else if (journal_k.k) { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); + btree = false; } else { break; } - bch2_mark_key(c, k, 0, 0, NULL, 0, + bch2_mark_key(c, btree ? 
btree_k : journal_k, + 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); + + if (btree) + btree_k = bch2_btree_iter_next(btree_iter); + else + journal_k = bch2_journal_iter_next(&journal_iter); } ret = bch2_trans_exit(&trans) ?: ret; -- cgit From 43de7376f36c236255b7daf88e8286720426568b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Oct 2019 15:57:47 -0400 Subject: bcachefs: Fix erasure coding disk space accounting Disk space accounting for erasure coding + compression was completely broken - we need to calculate the parity sectors delta the same way we calculate disk_sectors, by calculating the old and new usage and subtracting to get the difference. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 132 ++++++++++++++++++++++++++++++++------------------ fs/bcachefs/buckets.h | 6 +-- 2 files changed, 87 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index dc184de053ee..d4d66d78d2a3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -807,28 +807,44 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - unsigned offset, s64 delta, - unsigned flags) +static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) +{ + return DIV_ROUND_UP(sectors * n, d); +} + +static s64 __ptr_disk_sectors_delta(unsigned old_size, + unsigned offset, s64 delta, + unsigned flags, + unsigned n, unsigned d) { + BUG_ON(!n || !d); + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { - BUG_ON(offset + -delta > p.crc.live_size); + BUG_ON(offset + -delta > old_size); - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, offset) + - __ptr_disk_sectors(p, p.crc.live_size - - offset + delta); + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, offset) + + disk_sectors_scaled(n, d, old_size - offset + delta); } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { - BUG_ON(offset + -delta > p.crc.live_size); + BUG_ON(offset + -delta > old_size); - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, p.crc.live_size + - delta); + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, old_size + delta); } else { - return ptr_disk_sectors(p); + return disk_sectors_scaled(n, d, delta); } } +static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + unsigned offset, s64 delta, + unsigned flags) +{ + return __ptr_disk_sectors_delta(p.crc.live_size, + offset, delta, flags, + p.crc.compressed_size, + p.crc.uncompressed_size); +} + static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, struct bch_fs_usage *fs_usage, @@ -964,15 +980,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags) + s64 sectors, unsigned flags, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { bool gc = flags & BCH_BUCKET_MARK_GC; struct stripe *m; - unsigned old, new, nr_data; + unsigned old, new; int blocks_nonempty_delta; - s64 parity_sectors; - - BUG_ON(!sectors); m = genradix_ptr(&c->stripes[gc], p.idx); @@ -987,13 +1003,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, BUG_ON(m->r.e.data_type != data_type); - nr_data = m->nr_blocks - m->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - sectors += 
parity_sectors; + *nr_data = m->nr_blocks - m->nr_redundant; + *nr_parity = m->nr_redundant; + *r = m->r; old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; @@ -1011,8 +1023,6 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); - update_replicas(c, fs_usage, &m->r.e, sectors); - return 0; } @@ -1040,7 +1050,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + fs_usage, journal_seq, flags); if (p.ptr.cached) { if (!stale) @@ -1050,12 +1060,30 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(c, p.ec, - data_type, fs_usage, - disk_sectors, flags); + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 parity_sectors; + + ret = bch2_mark_stripe_ptr(c, p.ec, data_type, + fs_usage, disk_sectors, flags, + &ec_r, &nr_data, &nr_parity); if (ret) return ret; + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas(c, fs_usage, &ec_r.e, + disk_sectors + parity_sectors); + + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } @@ -1499,16 +1527,16 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type) + s64 sectors, enum bch_data_type data_type, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { struct bch_fs *c = trans->c; - struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; struct bkey_s_c k; struct bkey_s_stripe s; - unsigned nr_data; - s64 parity_sectors; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1531,20 +1559,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bkey_reassemble(new_k, k); s = bkey_i_to_s_stripe(new_k); - nr_data = s.v->nr_blocks - s.v->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - stripe_blockcount_set(s.v, p.block, stripe_blockcount_get(s.v, p.block) + - sectors + parity_sectors); + sectors); - bch2_bkey_to_replicas(&r.e, s.s_c); - - update_replicas_list(trans, &r.e, sectors); + *nr_data = s.v->nr_blocks - s.v->nr_redundant; + *nr_parity = s.v->nr_redundant; + bch2_bkey_to_replicas(&r->e, s.s_c); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1589,16 +1610,31 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 parity_sectors; + ret = bch2_trans_mark_stripe_ptr(trans, p.ec, - disk_sectors, data_type); + disk_sectors, data_type, + &ec_r, &nr_data, &nr_parity); if (ret) return ret; + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas_list(trans, &ec_r.e, + disk_sectors + parity_sectors); + r.e.nr_required = 0; } } - update_replicas_list(trans, &r.e, 
dirty_sectors); + if (r.e.nr_devs) + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e93cda51d705..5f91a57abc70 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -137,8 +137,8 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, - unsigned live_size) +static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) { return live_size && p.crc.compression_type ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, @@ -146,7 +146,7 @@ static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, : live_size; } -static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) { return __ptr_disk_sectors(p, p.crc.live_size); } -- cgit From 3826ee0b17557e85813378be5a5a695baf607c1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 09:19:06 -0400 Subject: bcachefs: Add a lock to bch_page_state We can't use the page lock to protect it, because on writeback IO error we need to access the page state before calling end_page_writeback() and the page lock semantics are completely insane so that deadlocks. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 90587a556cee..0756b11ae3e4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -468,6 +468,7 @@ struct bch_page_sector { }; struct bch_page_state { + spinlock_t lock; atomic_t write_count; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -523,6 +524,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, if (!s) return NULL; + spin_lock_init(&s->lock); /* * migrate_page_move_mapping() assumes that pages with private data * have their count elevated by 1. 
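/*
 * [Editor's note - not part of the patch above or below: a minimal,
 * illustrative sketch of the locking pattern this commit introduces.
 * The struct layout and helper names here are simplified stand-ins;
 * the real definitions live in fs/bcachefs/fs-io.c.]
 */
struct example_page_state {
	spinlock_t	lock;		/* protects the per-sector array below */
	atomic_t	write_count;
	struct {
		unsigned	nr_replicas;
		unsigned	state;
	}		s[8];		/* stands in for s[PAGE_SECTORS] */
};

static void example_page_state_init(struct example_page_state *s)
{
	memset(s, 0, sizeof(*s));
	spin_lock_init(&s->lock);
}

/*
 * The writeback error path has to clear per-sector state before calling
 * end_page_writeback(), i.e. without being able to rely on the page lock -
 * hence the dedicated spinlock:
 */
static void example_clear_replicas(struct example_page_state *s)
{
	unsigned i;

	spin_lock(&s->lock);
	for (i = 0; i < ARRAY_SIZE(s->s); i++)
		s->s[i].nr_replicas = 0;
	spin_unlock(&s->lock);
}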
@@ -670,6 +672,9 @@ static void bch2_clear_page_bits(struct page *page) if (!s) return; + EBUG_ON(!PageLocked(page)); + EBUG_ON(PageWriteback(page)); + for (i = 0; i < ARRAY_SIZE(s->s); i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; @@ -699,6 +704,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, WARN_ON((u64) page_offset(page) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); + spin_lock(&s->lock); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -715,6 +722,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); } + spin_unlock(&s->lock); + if (dirty_sectors) i_sectors_acct(c, inode, &res->quota, dirty_sectors); @@ -798,10 +807,6 @@ void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) { - /* XXX: this can't take locks that are held while we allocate memory */ - EBUG_ON(!PageLocked(&folio->page)); - EBUG_ON(folio_test_writeback(folio)); - if (folio_test_dirty(folio) || folio_test_writeback(folio)) return false; @@ -1183,11 +1188,11 @@ static void bch2_writepage_io_done(struct closure *cl) SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); - lock_page(bvec->bv_page); - s = bch2_page_state(bvec->bv_page); + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; - unlock_page(bvec->bv_page); + spin_unlock(&s->lock); } } @@ -2978,9 +2983,12 @@ static void mark_range_unallocated(struct bch_inode_info *inode, folio_lock(folio); s = bch2_page_state(&folio->page); - if (s) + if (s) { + spin_lock(&s->lock); for (j = 0; j < PAGE_SECTORS; j++) s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } folio_unlock(folio); } -- cgit From 495fa1a2ec78a94bc1857531925b212edb670d2c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 09:23:30 -0400 Subject: bcachefs: Refactor bch2_readdir() a bit The tweaks to ctx->pos handling are also to help the fuse port Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 2a3e830ebf50..38017699c04a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -354,36 +354,31 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; - unsigned len; int ret; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(inum, ctx->pos), 0, k, ret) { + if (k.k->p.inode > inum) + break; + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); - if (bkey_cmp(k.k->p, POS(inum, ctx->pos)) < 0) - continue; - - if (k.k->p.inode > inum) - break; - - len = bch2_dirent_name_bytes(dirent); - /* * XXX: dir_emit() can fault and block, while we're holding * locks */ - if (!dir_emit(ctx, dirent.v->d_name, len, + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), le64_to_cpu(dirent.v->d_inum), dirent.v->d_type)) break; - - ctx->pos = k.k->p.offset + 1; + ctx->pos = dirent.k->p.offset + 1; } ret = bch2_trans_exit(&trans) ?: ret; -- cgit From 14989d547ee308033e080792239e640e076a4460 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 10:25:32 -0400 
Subject: bcachefs: Fix bch2_btree_iter_next() after peek_slot() this deserves a unit test Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 78bc82c7b9c3..8c6d3193c3fe 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1440,6 +1440,14 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } + if (unlikely(bkey_deleted(&iter->k))) { + /* + * we're currently pointed at a hole, because previously we were + * iterating over slots: + */ + return bch2_btree_iter_peek(iter); + } + do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); -- cgit From daf3fe502a4c20be99579097cc351e91d27fc789 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 09:44:36 -0400 Subject: bcachefs: Check if extending inode differently In bch2_extent_update(), we have to update the inode if i_size is changing (the file is being extended) or if i_sectors is changing, but we want to avoid touching the inode if it's not necessary. Change sum_sector_overwrites() to also check if there's already data above where we're writing to - this means we're definitely not extending the file. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 70 +++++++++++++++++++++++++++++++++++---------------- fs/bcachefs/fs-io.h | 2 +- fs/bcachefs/reflink.c | 2 +- 3 files changed, 50 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0756b11ae3e4..de3c6f8c4b04 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -243,12 +243,14 @@ static int sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, bool may_allocate, + bool *maybe_extending, s64 *delta) { struct btree_iter *iter; struct bkey_s_c old; int ret = 0; + *maybe_extending = true; *delta = 0; iter = bch2_trans_copy_iter(trans, extent_iter); @@ -270,8 +272,27 @@ static int sum_sector_overwrites(struct btree_trans *trans, (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); - if (bkey_cmp(old.k->p, new->k.p) >= 0) + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're + * going to be writing to - this means we're definitely + * not extending the file: + * + * Note that it's not sufficient to check if there's + * data up to the sector offset we're going to be + * writing to, because i_size could be up to one block + * less: + */ + if (!bkey_cmp(old.k->p, new->k.p)) + old = bch2_btree_iter_next(iter); + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && + bkey_extent_is_data(old.k)) + *maybe_extending = false; + break; + } } bch2_trans_iter_put(trans, iter); @@ -293,7 +314,7 @@ int bch2_extent_update(struct btree_trans *trans, struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; - bool extended = false; + bool extending = false; s64 i_sectors_delta; int ret; @@ -301,8 +322,8 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - ret = sum_sector_overwrites(trans, extent_iter, k, - may_allocate, &i_sectors_delta); + ret = sum_sector_overwrites(trans, extent_iter, k, may_allocate, + &extending, &i_sectors_delta); if (ret) return ret; @@ -310,27 +331,34 @@ int bch2_extent_update(struct btree_trans *trans, new_i_size = min(k->k.p.offset << 9, new_i_size); -
/* XXX: inode->i_size locking */ - if (i_sectors_delta || - new_i_size > inode->ei_inode.bi_size) { + if (i_sectors_delta || extending) { inode_iter = bch2_inode_peek(trans, &inode_u, k->k.p.inode, BTREE_ITER_INTENT); if (IS_ERR(inode_iter)) return PTR_ERR(inode_iter); - inode_u.bi_sectors += i_sectors_delta; - /* - * XXX: can BCH_INODE_I_SIZE_DIRTY be true here? i.e. can we - * race with truncate? + * XXX: + * writeback can race a bit with truncate, because truncate + * first updates the inode then truncates the pagecache. This is + * ugly, but lets us preserve the invariant that the in memory + * i_size is always >= the on disk i_size. + * + BUG_ON(new_i_size > inode_u.bi_size && + (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); */ + BUG_ON(new_i_size > inode_u.bi_size && !extending && + !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { + new_i_size > inode_u.bi_size) inode_u.bi_size = new_i_size; - extended = true; - } + else + extending = false; + + inode_u.bi_sectors += i_sectors_delta; - if (i_sectors_delta || extended) { + if (i_sectors_delta || extending) { bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i); @@ -346,14 +374,14 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) goto err; - if (i_sectors_delta || extended) { + if (i_sectors_delta || extending) { inode->ei_inode.bi_sectors = inode_u.bi_sectors; inode->ei_inode.bi_size = inode_u.bi_size; } if (direct) i_sectors_acct(c, inode, quota_res, i_sectors_delta); - if (direct && extended) { + if (direct && extending) { spin_lock(&inode->v.i_lock); if (new_i_size > inode->v.i_size) i_size_write(&inode->v, new_i_size); @@ -2241,8 +2269,7 @@ out: /* truncate: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bch_inode_info *inode, - u64 new_i_size) + struct bpos end, struct bch_inode_info *inode) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); @@ -2270,7 +2297,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ret = bch2_extent_update(trans, inode, &disk_res, NULL, iter, &delete, - new_i_size, false, true, NULL); + 0, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); btree_err: if (ret == -EINTR) { @@ -2303,8 +2330,7 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, BTREE_ITER_INTENT); ret = bch2_fpunch_at(&trans, iter, - POS(inode->v.i_ino, end_offset), - inode, 0); + POS(inode->v.i_ino, end_offset), inode); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 861ec25ab9ef..5e48d21bd2e4 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,7 +19,7 @@ int bch2_extent_update(struct btree_trans *, struct bkey_i *, u64, bool, bool, s64 *); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_inode_info *, u64); + struct bpos, struct bch_inode_info *); int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index f1b0e7fc8487..de4c8b075a65 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -213,7 +213,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - dst_inode, new_i_size); + dst_inode); if (ret) goto btree_err; continue; -- cgit From e0541a9346951c94dce4d65d88541a329adf0b76 Mon Sep 
17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 11:12:48 -0400 Subject: bcachefs: Kill some dependencies on ei_inode Moving bch2_extent_update() to io.c will be greatly simplified if we no longer have to keep ei_inode.bi_size/bi_sectors up to date. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 92 ++++++++++++++++++++++++++++++++++++----------------- fs/bcachefs/fs.c | 28 ---------------- 2 files changed, 62 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index de3c6f8c4b04..9ecefd95df6e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2453,14 +2453,18 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +static int bch2_extend(struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + /* + * sync appends: + */ + ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); if (ret) return ret; @@ -2501,19 +2505,31 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; + struct btree_trans trans; + struct btree_iter *iter; u64 new_i_size = iattr->ia_size; - bool shrink; int ret = 0; inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + /* + * fetch current on disk i_size: inode is locked, i_size can only + * increase underneath us: + */ + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_exit(&trans); + + if (ret) + goto err; - shrink = iattr->ia_size <= inode->v.i_size; + BUG_ON(inode->v.i_size < inode_u.bi_size); - if (!shrink) { - ret = bch2_extend(inode, iattr); + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(inode, &inode_u, iattr); goto err; } @@ -2531,9 +2547,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) * userspace has to redirty it and call .mkwrite -> set_page_dirty * again to allocate the part of the page that was extended. */ - if (iattr->ia_size > inode->ei_inode.bi_size) + if (iattr->ia_size > inode_u.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, + inode_u.bi_size, iattr->ia_size - 1); else if (iattr->ia_size & (PAGE_SIZE - 1)) ret = filemap_write_and_wait_range(mapping, @@ -2935,33 +2951,49 @@ bkey_err: if (ret) goto err; } - bch2_trans_unlock(&trans); - if (!(mode & FALLOC_FL_KEEP_SIZE) && - end > inode->v.i_size) { - i_size_write(&inode->v, end); + /* + * Do we need to extend the file? 
+ * + * If we zeroed up to the end of the file, we dropped whatever writes + * were going to write out the current i_size, so we have to extend + * manually even if FL_KEEP_SIZE was set: + */ + if (end >= inode->v.i_size && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_ZERO_RANGE))) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + + do { + bch2_trans_begin(&trans); + inode_iter = bch2_inode_peek(&trans, &inode_u, + inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(inode_iter); + } while (ret == -EINTR); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + bch2_trans_unlock(&trans); + + if (ret) + goto err; - /* blech */ - if ((mode & FALLOC_FL_KEEP_SIZE) && - (mode & FALLOC_FL_ZERO_RANGE) && - inode->ei_inode.bi_size != inode->v.i_size) { - /* sync appends.. */ + /* + * Sync existing appends before extending i_size, + * as in bch2_extend(): + */ ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + inode_u.bi_size, S64_MAX); if (ret) goto err; - if (inode->ei_inode.bi_size != inode->v.i_size) { - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, - inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + if (mode & FALLOC_FL_KEEP_SIZE) + end = inode->v.i_size; + else + i_size_write(&inode->v, end); + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); } err: bch2_trans_exit(&trans); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b19a2deed5c1..0042a825a698 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -99,34 +99,6 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -/* - * I_SIZE_DIRTY requires special handling: - * - * To the recovery code, the flag means that there is stale data past i_size - * that needs to be deleted; it's used for implementing atomic appends and - * truncates. - * - * On append, we set I_SIZE_DIRTY before doing the write, then after the write - * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size - * that exposes the data we just wrote. - * - * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting - * i_size to the new smaller size, then we delete the data that we just made - * invisible, and then we clear I_SIZE_DIRTY. - * - * Because there can be multiple appends in flight at a time, we need a refcount - * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero - * refcount means I_SIZE_DIRTY is set, zero means it's cleared. - * - * Because write_inode() can be called at any time, i_size_dirty_count means - * something different to the runtime code - it means to write_inode() "don't - * update i_size yet". - * - * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when - * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must - * be set explicitly. 
- */ - void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, -- cgit From 2925fc49b3303ee7733cf9f6cba6a59a5b8a5e4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 12:11:00 -0400 Subject: bcachefs: Split out bchfs_extent_update() The next few patches are going to be more moving the logic around i_size/i_sectors updates to io.c, and better separating the Linux VFS specific code from core bcachefs code, to better support the fuse port. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 169 +++++++------------------------------------------- fs/bcachefs/fs-io.h | 14 ++--- fs/bcachefs/io.c | 141 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/io.h | 3 + fs/bcachefs/reflink.c | 6 +- 5 files changed, 178 insertions(+), 155 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9ecefd95df6e..92cab285698c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -237,151 +237,31 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -/* normal i_size/i_sectors update machinery: */ - -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool may_allocate, - bool *maybe_extending, - s64 *delta) -{ - struct btree_iter *iter; - struct bkey_s_c old; - int ret = 0; - - *maybe_extending = true; - *delta = 0; - - iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { - if (!may_allocate && - bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { - ret = -ENOSPC; - break; - } - - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - if (bkey_cmp(old.k->p, new->k.p) >= 0) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) - old = bch2_btree_iter_next(iter); - - if (old.k && !bkey_err(old) && - old.k->p.inode == extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; - - break; - } - } - - bch2_trans_iter_put(trans, iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter *inode_iter = NULL; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - bool extending = false; - s64 i_sectors_delta; +int bchfs_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) +{ + s64 i_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, extent_iter); - if (ret) - return ret; - - ret = sum_sector_overwrites(trans, extent_iter, k, 
may_allocate, - &extending, &i_sectors_delta); + ret = bch2_extent_update(trans, extent_iter, k, + disk_res, &inode->ei_journal_seq, + new_i_size, &i_sectors_delta); if (ret) return ret; - bch2_trans_update(trans, extent_iter, k); - - new_i_size = min(k->k.p.offset << 9, new_i_size); - - if (i_sectors_delta || extending) { - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); - - /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. - * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - */ - BUG_ON(new_i_size > inode_u.bi_size && !extending && - !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; - else - extending = false; - - inode_u.bi_sectors += i_sectors_delta; - - if (i_sectors_delta || extending) { - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i); - } - } - - ret = bch2_trans_commit(trans, disk_res, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_USE_RESERVE); - if (ret) - goto err; - - if (i_sectors_delta || extending) { - inode->ei_inode.bi_sectors = inode_u.bi_sectors; - inode->ei_inode.bi_size = inode_u.bi_size; - } + new_i_size = min(new_i_size, extent_iter->pos.offset << 9); if (direct) - i_sectors_acct(c, inode, quota_res, i_sectors_delta); - if (direct && extending) { + i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta); + if (direct && new_i_size) { spin_lock(&inode->v.i_lock); if (new_i_size > inode->v.i_size) i_size_write(&inode->v, new_i_size); @@ -390,10 +270,7 @@ int bch2_extent_update(struct btree_trans *trans, if (total_delta) *total_delta += i_sectors_delta; -err: - if (!IS_ERR_OR_NULL(inode_iter)) - bch2_trans_iter_put(trans, inode_iter); - return ret; + return 0; } static int bchfs_write_index_update(struct bch_write_op *wop) @@ -426,7 +303,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, inode, + ret = bchfs_extent_update(&trans, inode, &wop->res, quota_res, iter, &tmp.k, op->new_i_size, @@ -2295,7 +2172,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_begin_updates(trans); - ret = bch2_extent_update(trans, inode, + ret = bchfs_extent_update(trans, inode, &disk_res, NULL, iter, &delete, 0, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); @@ -2463,6 +2340,8 @@ static int bch2_extend(struct bch_inode_info *inode, /* * sync appends: + * + * this has to be done _before_ extending i_size: */ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); if (ret) @@ -2939,7 +2818,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, inode, + ret = bchfs_extent_update(&trans, inode, &disk_res, "a_res, iter, &reservation.k_i, 0, true, true, NULL); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 5e48d21bd2e4..090d1c86de37 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -11,13 +11,13 @@ struct quota_res; -int bch2_extent_update(struct btree_trans *, - struct 
bch_inode_info *, - struct disk_reservation *, - struct quota_res *, - struct btree_iter *, - struct bkey_i *, - u64, bool, bool, s64 *); +int bchfs_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_inode_info *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b893db7f7dcc..a9b1c21dd9a7 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -19,6 +19,7 @@ #include "ec.h" #include "error.h" #include "extents.h" +#include "inode.h" #include "io.h" #include "journal.h" #include "keylist.h" @@ -178,6 +179,146 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, mutex_unlock(&c->bio_bounce_pages_lock); } +/* Extent update path: */ + +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool may_allocate, + bool *maybe_extending, + s64 *delta) +{ + struct btree_iter *iter; + struct bkey_s_c old; + int ret = 0; + + *maybe_extending = true; + *delta = 0; + + iter = bch2_trans_copy_iter(trans, extent_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && + bch2_bkey_nr_ptrs_allocated(old) < + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { + ret = -ENOSPC; + break; + } + + *delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're + * going to be writing to - this means we're definitely + * not extending the file: + * + * Note that it's not sufficient to check if there's + * data up to the sector offset we're going to be + * writing to, because i_size could be up to one block + * less: + */ + if (!bkey_cmp(old.k->p, new->k.p)) + old = bch2_btree_iter_next(iter); + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && + bkey_extent_is_data(old.k)) + *maybe_extending = false; + + break; + } + } + + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 new_i_size, + s64 *i_sectors_delta) +{ + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; + bool extending = false; + s64 delta = 0; + int ret; + + ret = bch2_extent_trim_atomic(k, iter); + if (ret) + return ret; + + ret = sum_sector_overwrites(trans, iter, k, + disk_res && disk_res->sectors != 0, + &extending, &delta); + if (ret) + return ret; + + new_i_size = extending + ? min(k->k.p.offset << 9, new_i_size) + : 0; + + if (delta || new_i_size) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + /* + * XXX: + * writeback can race a bit with truncate, because truncate + * first updates the inode then truncates the pagecache. This is + * ugly, but lets us preserve the invariant that the in memory + * i_size is always >= the on disk i_size. 
+ * + BUG_ON(new_i_size > inode_u.bi_size && + (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + */ + BUG_ON(new_i_size > inode_u.bi_size && !extending); + + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; + else + new_i_size = 0; + + inode_u.bi_sectors += delta; + + if (delta || new_i_size) { + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i); + } + + bch2_trans_iter_put(trans, inode_iter); + } + + bch2_trans_update(trans, iter, k); + + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE); + if (!ret && i_sectors_delta) + *i_sectors_delta += delta; + + return ret; +} + /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index c6f5ae717cf3..e53f9ecc082d 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -58,6 +58,9 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_extent_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct disk_reservation *, + u64 *, u64, s64 *); int bch2_write_index_default(struct bch_write_op *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index de4c8b075a65..c9ff467cc0d9 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -259,9 +259,9 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, - dst_iter, &new_dst.k, - new_i_size, false, true, NULL); + ret = bchfs_extent_update(&trans, dst_inode, NULL, NULL, + dst_iter, &new_dst.k, + new_i_size, false, true, NULL); if (ret) goto btree_err; -- cgit From 2e87eae1fb7a61e72d98af9c0ef8cbaad1a6d497 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Oct 2019 12:47:22 -0400 Subject: bcachefs: Convert bch2_fpunch to bch2_extent_update() As before - we're moving non Linux specific code out of fs-io.c. 
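[Editor's note: the following is an illustrative sketch, not part of this commit. It shows the calling convention the series moves to, as used by the callers updated in the diff below: the core helper bch2_fpunch() takes only (fs, inum, range, journal seq) and reports the change in allocated sectors through *i_sectors_delta, and the VFS layer then does its own accounting with i_sectors_acct(). The wrapper function name here is made up for illustration.]

	static int example_punch_tail(struct bch_fs *c, struct bch_inode_info *inode,
				      u64 start_sector)
	{
		s64 i_sectors_delta = 0;
		int ret;

		/* core helper: no VFS inode, just fs + inode number + sector range */
		ret = bch2_fpunch(c, inode->v.i_ino, start_sector, U64_MAX,
				  &inode->ei_journal_seq, &i_sectors_delta);

		/* VFS-side sector accounting stays in fs-io.c: */
		i_sectors_acct(c, inode, NULL, i_sectors_delta);
		return ret;
	}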
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 156 +++++++++++++++++++------------------------------- fs/bcachefs/fs-io.h | 2 - fs/bcachefs/io.c | 71 +++++++++++++++++++++++ fs/bcachefs/io.h | 4 ++ fs/bcachefs/reflink.c | 53 ++++++++--------- fs/bcachefs/reflink.h | 6 +- 6 files changed, 163 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 92cab285698c..19793745edf9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2145,78 +2145,6 @@ out: /* truncate: */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bch_inode_info *inode) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - - while ((k = bch2_btree_iter_peek(iter)).k && - bkey_cmp(iter->pos, end) < 0) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto btree_err; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); - - bch2_trans_begin_updates(trans); - - ret = bchfs_extent_update(trans, inode, - &disk_res, NULL, iter, &delete, - 0, false, true, NULL); - bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } - - return ret ?: ret2; -} - -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset) -{ - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, start_offset), - BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, iter, - POS(inode->v.i_ino, end_offset), inode); - - bch2_trans_exit(&trans); - - if (ret == -EINTR) - ret = 0; - - return ret; -} - static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) @@ -2388,6 +2316,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) struct btree_trans trans; struct btree_iter *iter; u64 new_i_size = iattr->ia_size; + s64 i_sectors_delta = 0; int ret = 0; inode_dio_wait(&inode->v); @@ -2447,9 +2376,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - ret = __bch2_fpunch(c, inode, + ret = bch2_fpunch(c, inode->v.i_ino, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX); + U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (unlikely(ret)) goto err; @@ -2467,7 +2398,7 @@ err: /* fallocate: */ -static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 discard_start = round_up(offset, block_bytes(c)) >> 9; @@ -2495,8 +2426,15 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end); + if (discard_start < discard_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, 
inode->v.i_ino, + discard_start, discard_end, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); @@ -2504,7 +2442,7 @@ err: return ret; } -static long bch2_fcollapse_finsert(struct bch_inode_info *inode, +static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { @@ -2564,8 +2502,14 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); } else { - ret = __bch2_fpunch(c, inode, offset >> 9, - (offset + len) >> 9); + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode->v.i_ino, + offset >> 9, (offset + len) >> 9, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (ret) goto err; } @@ -2715,8 +2659,8 @@ err: return ret; } -static long bch2_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -2765,6 +2709,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, end_pos = POS(inode->v.i_ino, block_end >> 9); while (bkey_cmp(iter->pos, end_pos) < 0) { + s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; @@ -2818,10 +2763,10 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin_updates(&trans); - ret = bchfs_extent_update(&trans, inode, - &disk_res, "a_res, - iter, &reservation.k_i, - 0, true, true, NULL); + ret = bch2_extent_update(&trans, iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); @@ -2887,16 +2832,16 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_inode_info *inode = file_bch_inode(file); if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bch2_fallocate(inode, mode, offset, len); + return bchfs_fallocate(inode, mode, offset, len); if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bch2_fpunch(inode, offset, len); + return bchfs_fpunch(inode, offset, len); if (mode == FALLOC_FL_INSERT_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, true); + return bchfs_fcollapse_finsert(inode, offset, len, true); if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, false); + return bchfs_fcollapse_finsert(inode, offset, len, false); return -EOPNOTSUPP; } @@ -2941,6 +2886,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *src = file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; + s64 i_sectors_delta = 0; loff_t ret = 0; loff_t aligned_len; @@ -2960,6 +2906,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + file_update_time(file_dst); + inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -2967,26 +2915,40 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, file_dst, pos_dst, &len, remap_flags); if (ret < 0 || len == 0) - goto 
out_unlock; + goto err; aligned_len = round_up(len, block_bytes(c)); ret = write_invalidate_inode_pages_range(dst->v.i_mapping, pos_dst, pos_dst + aligned_len); if (ret) - goto out_unlock; + goto err; mark_range_unallocated(src, pos_src, pos_src + aligned_len); - ret = bch2_remap_range(c, dst, + ret = bch2_remap_range(c, POS(dst->v.i_ino, pos_dst >> 9), POS(src->v.i_ino, pos_src >> 9), aligned_len >> 9, - pos_dst + len); - if (ret > 0) - ret = min(ret << 9, len); + &dst->ei_journal_seq, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; -out_unlock: + ret <<= 9; + /* + * due to alignment, we might have remapped slightly more than requsted + */ + ret = min(ret, len); + + /* XXX get a quota reservation */ + i_sectors_acct(c, dst, NULL, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + len > dst->v.i_size) + i_size_write(&dst->v, pos_dst + len); + spin_unlock(&dst->v.i_lock); +err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); return ret; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 090d1c86de37..f823810d4971 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -18,8 +18,6 @@ int bchfs_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, u64, bool, bool, s64 *); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_inode_info *); int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a9b1c21dd9a7..c60e52fbf4fe 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -319,6 +319,77 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u64 *journal_seq, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + + while ((k = bch2_btree_iter_peek(iter)).k && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto btree_err; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + + bch2_trans_begin_updates(trans); + + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta); + bch2_disk_reservation_put(c, &disk_res); +btree_err: + if (ret == -EINTR) { + ret2 = ret; + ret = 0; + } + if (ret) + break; + } + + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + journal_seq, i_sectors_delta); + bch2_trans_exit(&trans); + + if (ret == -EINTR) + ret = 0; + + return ret; +} + /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index e53f9ecc082d..97cc661420c6 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -61,6 +61,10 @@ static inline struct workqueue_struct 
*index_update_wq(struct bch_write_op *op) int bch2_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64 *, u64, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *, s64 *); +int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + int bch2_write_index_default(struct bch_write_op *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c9ff467cc0d9..4a4b17f93a2e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -2,8 +2,8 @@ #include "bcachefs.h" #include "btree_update.h" #include "extents.h" -#include "fs.h" -#include "fs-io.h" +#include "inode.h" +#include "io.h" #include "reflink.h" #include @@ -70,12 +70,6 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -/* - * bch2_remap_range() depends on bch2_extent_update(), which depends on various - * things tied to the linux vfs for inode updates, for now: - */ -#ifndef NO_BCACHEFS_FS - static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i_extent *e) @@ -159,9 +153,9 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } s64 bch2_remap_range(struct bch_fs *c, - struct bch_inode_info *dst_inode, struct bpos dst_start, struct bpos src_start, - u64 remap_sectors, u64 new_i_size) + u64 remap_sectors, u64 *journal_seq, + u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; @@ -170,7 +164,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; - int ret = 0; + int ret = 0, ret2 = 0; if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { mutex_lock(&c->sb_lock); @@ -213,7 +207,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - dst_inode); + journal_seq, i_sectors_delta); if (ret) goto btree_err; continue; @@ -259,9 +253,9 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bchfs_extent_update(&trans, dst_inode, NULL, NULL, - dst_iter, &new_dst.k, - new_i_size, false, true, NULL); + ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + NULL, journal_seq, + new_i_size, i_sectors_delta); if (ret) goto btree_err; @@ -282,17 +276,24 @@ err: dst_done = dst_iter->pos.offset - dst_start.offset; new_i_size = min(dst_iter->pos.offset << 9, new_i_size); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_begin(&trans); - mutex_lock(&dst_inode->ei_update_lock); - if (dst_inode->v.i_size < new_i_size) { - i_size_write(&dst_inode->v, new_i_size); - ret = bch2_write_inode_size(c, dst_inode, new_i_size, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&dst_inode->ei_update_lock); + do { + struct bch_inode_unpacked inode_u; + struct btree_iter *inode_iter; - return dst_done ?: ret; -} + inode_iter = bch2_inode_peek(&trans, &inode_u, + dst_start.inode, BTREE_ITER_INTENT); + ret2 = PTR_ERR_OR_ZERO(inode_iter); -#endif /* NO_BCACHEFS_FS */ + if (!ret2 && + inode_u.bi_size < new_i_size) + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC); + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; + + return dst_done ?: 
ret ?: ret2; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 327618c36d33..ac23b855858c 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -24,9 +24,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_reflink_v_to_text, \ } -#ifndef NO_BCACHEFS_FS -s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, - struct bpos, struct bpos, u64, u64); -#endif /* NO_BCACHEFS_FS */ +s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, + u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ -- cgit From 9a3df993e15e0d44974d6ac7c5749c7028aa9e3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2019 12:50:39 -0400 Subject: bcachefs: Kill bchfs_extent_update() The generic IO path now handles inode updates for i_size and i_sectors - this means we can drop a fair amount of code from fs-io.c. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 267 +++++++++++++++---------------------------------- fs/bcachefs/fs-io.h | 8 -- fs/bcachefs/inode.h | 9 ++ fs/bcachefs/io.c | 93 ++++++++--------- fs/bcachefs/io.h | 2 + fs/bcachefs/io_types.h | 2 + 6 files changed, 132 insertions(+), 249 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 19793745edf9..9914f6ee2aa2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -45,23 +45,12 @@ struct quota_res { u64 sectors; }; -struct bchfs_write_op { - struct bch_inode_info *inode; - s64 sectors_added; - bool is_dio; - bool unalloc; - u64 new_i_size; - - /* must be last: */ - struct bch_write_op op; -}; - struct bch_writepage_io { struct closure cl; - u64 new_sectors; + struct bch_inode_info *inode; /* must be last: */ - struct bchfs_write_op op; + struct bch_write_op op; }; struct dio_write { @@ -77,7 +66,7 @@ struct dio_write { struct iovec inline_vecs[2]; /* must be last: */ - struct bchfs_write_op iop; + struct bch_write_op op; }; struct dio_read { @@ -237,121 +226,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -int bchfs_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) -{ - s64 i_sectors_delta = 0; - int ret; - - ret = bch2_extent_update(trans, extent_iter, k, - disk_res, &inode->ei_journal_seq, - new_i_size, &i_sectors_delta); - if (ret) - return ret; - - new_i_size = min(new_i_size, extent_iter->pos.offset << 9); - - if (direct) - i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta); - if (direct && new_i_size) { - spin_lock(&inode->v.i_lock); - if (new_i_size > inode->v.i_size) - i_size_write(&inode->v, new_i_size); - spin_unlock(&inode->v.i_lock); - } - - if (total_delta) - *total_delta += i_sectors_delta; - return 0; -} - -static int bchfs_write_index_update(struct bch_write_op *wop) -{ - struct bch_fs *c = wop->c; - struct bchfs_write_op *op = container_of(wop, - struct bchfs_write_op, op); - struct quota_res *quota_res = op->is_dio - ? 
&container_of(op, struct dio_write, iop)->quota_res - : NULL; - struct bch_inode_info *inode = op->inode; - struct keylist *keys = &op->op.insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - BUG_ON(k->k.p.inode != inode->v.i_ino); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) tmp; - - bkey_copy(&tmp.k, bch2_keylist_front(keys)); - - bch2_trans_begin_updates(&trans); - - ret = bchfs_extent_update(&trans, inode, - &wop->res, quota_res, - iter, &tmp.k, - op->new_i_size, - !op->unalloc, - op->is_dio, - &op->sectors_added); - if (ret == -EINTR) - continue; - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_exit(&trans); - - return ret; -} - -static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, - struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_io_opts opts, - bool is_dio) -{ - op->inode = inode; - op->sectors_added = 0; - op->is_dio = is_dio; - op->unalloc = false; - op->new_i_size = U64_MAX; - - bch2_write_op_init(&op->op, c, opts); - op->op.target = opts.foreground_target; - op->op.index_update_fn = bchfs_write_index_update; - op_journal_seq_set(&op->op, &inode->ei_journal_seq); -} - -static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode) -{ - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - - bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode)); - return opts; -} - /* page state: */ /* stored in page->private: */ @@ -947,7 +821,7 @@ void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct page *page; @@ -1028,7 +902,7 @@ static int bch2_read_single_page(struct page *page, DECLARE_COMPLETION_ONSTACK(done); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), - io_opts(c, inode)); + io_opts(c, &inode->ei_inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1065,7 +939,9 @@ struct bch_writepage_state { static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { - return (struct bch_writepage_state) { .opts = io_opts(c, inode) }; + return (struct bch_writepage_state) { + .opts = io_opts(c, &inode->ei_inode) + }; } static void bch2_writepage_io_free(struct closure *cl) @@ -1073,20 +949,20 @@ static void bch2_writepage_io_free(struct closure *cl) struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - bio_put(&io->op.op.wbio.bio); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_io_done(struct closure *cl) { struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - struct bch_fs *c = io->op.op.c; - struct bio *bio = &io->op.op.wbio.bio; + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bvec; unsigned i; - if (io->op.op.error) { + if (io->op.error) { bio_for_each_segment_all(bvec, bio, iter) { struct 
bch_page_state *s; @@ -1105,22 +981,20 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - BUG_ON(io->op.sectors_added > (s64) io->new_sectors); + BUG_ON(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up * slightly) * XXX wtf? - BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS); + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); */ /* * PageWriteback is effectively our ref on the inode - fixup i_blocks * before calling end_page_writeback: */ - if (io->op.sectors_added != io->new_sectors) - i_sectors_acct(c, io->op.inode, NULL, - io->op.sectors_added - (s64) io->new_sectors); + i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); @@ -1137,7 +1011,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) struct bch_writepage_io *io = w->io; w->io = NULL; - closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); + closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); } @@ -1157,12 +1031,15 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, REQ_OP_WRITE, GFP_NOFS, &c->writepage_bioset), - struct bch_writepage_io, op.op.wbio.bio); + struct bch_writepage_io, op.wbio.bio); closure_init(&w->io->cl, NULL); - w->io->new_sectors = 0; - bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false); - op = &w->io->op.op; + w->io->inode = inode; + + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); @@ -1272,32 +1149,31 @@ do_io: } if (w->io && - (w->io->op.op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.op.wbio.bio, PAGE_SIZE) || - bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); if (!w->io) bch2_writepage_io_alloc(c, w, inode, sector, nr_replicas_this_write); - w->io->new_sectors += dirty_sectors; - atomic_inc(&s->write_count); - BUG_ON(inode != w->io->op.inode); - BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c))); - w->io->op.op.res.sectors += reserved_sectors; + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + w->io->op.wbio.bio.bi_opf |= REQ_SYNC; offset += sectors; } @@ -1705,7 +1581,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_read *dio; struct bio *bio; loff_t offset = req->ki_pos; @@ 
-1878,14 +1754,15 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct bch_fs *c = dio->iop.op.c; + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = dio->iop.inode; - struct bio *bio = &dio->iop.op.wbio.bio; + struct bch_inode_info *inode = file_bch_inode(req->ki_filp); + struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; unsigned unaligned; + u64 new_i_size; loff_t offset; bool sync; long ret; @@ -1897,7 +1774,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_pagecache_block_get(&inode->ei_pagecache_lock); /* Write and invalidate pagecache range that we're writing to: */ - offset = req->ki_pos + (dio->iop.op.written << 9); + offset = req->ki_pos + (dio->op.written << 9); ret = write_invalidate_inode_pages_range(mapping, offset, offset + iov_iter_count(&dio->iter) - 1); @@ -1905,7 +1782,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; while (1) { - offset = req->ki_pos + (dio->iop.op.written << 9); + offset = req->ki_pos + (dio->op.written << 9); if (kthread) kthread_use_mm(dio->mm); @@ -1943,15 +1820,15 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (unlikely(ret)) goto err; - dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9); + dio->op.pos = POS(inode->v.i_ino, offset >> 9); task_io_account_write(bio->bi_iter.bi_size); - closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + closure_call(&dio->op.cl, bch2_write, NULL, &dio->cl); if (!dio->sync && !dio->loop && dio->iter.count) { if (bch2_dio_write_copy_iov(dio)) { - dio->iop.op.error = -ENOMEM; + dio->op.error = -ENOMEM; goto err_wait_io; } } @@ -1965,17 +1842,28 @@ err_wait_io: closure_sync(&dio->cl); loop: + i_sectors_acct(c, inode, &dio->quota_res, + dio->op.i_sectors_delta); + dio->op.i_sectors_delta = 0; + + new_i_size = req->ki_pos + ((u64) dio->op.written << 9); + + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->iop.op.error) + if (!dio->iter.count || dio->op.error) break; bio_reset(bio, NULL, REQ_OP_WRITE); } - ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); + ret = dio->op.error ?: ((long) dio->op.written << 9); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) @@ -2009,6 +1897,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; ssize_t ret; @@ -2026,7 +1915,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) REQ_OP_WRITE, GFP_KERNEL, &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, iop.op.wbio.bio); + dio = container_of(bio, struct dio_write, op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; dio->mm = current->mm; @@ -2036,36 +1925,36 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->free_iov = false; dio->quota_res.sectors = 0; 
dio->iter = *iter; - bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); - dio->iop.op.write_point = writepoint_hashed((unsigned long) current); - dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; + + bch2_write_op_init(&dio->op, c, opts); + dio->op.target = opts.foreground_target; + op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) - dio->iop.op.flags |= BCH_WRITE_FLUSH; + dio->op.flags |= BCH_WRITE_FLUSH; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) goto err; - dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas; + dio->op.nr_replicas = dio->op.opts.data_replicas; - ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, - dio->iop.op.opts.data_replicas, 0); - if (unlikely(ret)) { - if (!bch2_check_range_allocated(c, POS(inode->v.i_ino, - req->ki_pos >> 9), - iter->count >> 9, - dio->iop.op.opts.data_replicas)) - goto err; - - dio->iop.unalloc = true; - } + ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_check_range_allocated(c, POS(inode->v.i_ino, + req->ki_pos >> 9), + iter->count >> 9, + dio->op.opts.data_replicas)) + goto err; return bch2_dio_write_loop(dio); err: - bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); closure_debug_destroy(&dio->cl); bio_put(bio); @@ -2671,7 +2560,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t block_start = round_down(offset, block_bytes(c)); loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; - unsigned replicas = io_opts(c, inode).data_replicas; + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -3180,13 +3069,13 @@ int bch2_fs_fsio_init(struct bch_fs *c) pr_verbose_init(c->opts, ""); if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.op.wbio.bio), + 4, offsetof(struct bch_writepage_io, op.wbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_read_bioset, 4, offsetof(struct dio_read, rbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, iop.op.wbio.bio), + 4, offsetof(struct dio_write, op.wbio.bio), BIOSET_NEED_BVECS)) ret = -ENOMEM; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index f823810d4971..2a2df58a46bb 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -11,14 +11,6 @@ struct quota_res; -int bchfs_extent_update(struct btree_trans *, - struct bch_inode_info *, - struct disk_reservation *, - struct quota_res *, - struct btree_iter *, - struct bkey_i *, - u64, bool, bool, s64 *); - int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, loff_t, unsigned); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b32c0a47c25d..bb759a46dc41 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -111,6 +111,15 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +static inline struct bch_io_opts +io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + + bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); + return opts; +} + static inline u8 
mode_to_type(umode_t mode) { return (mode >> 12) & 15; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c60e52fbf4fe..d6d777bb53da 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -390,6 +390,47 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, return ret; } +int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { + BKEY_PADDED(k) tmp; + + bkey_copy(&tmp.k, bch2_keylist_front(keys)); + + bch2_trans_begin_updates(&trans); + + ret = bch2_extent_update(&trans, iter, &tmp.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta); + if (ret == -EINTR) + continue; + if (ret) + break; + + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); + + return ret; +} + /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -474,58 +515,6 @@ static void bch2_write_done(struct closure *cl) closure_return(cl); } -int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter *iter; - struct keylist *keys = &op->insert_keys; - int ret; - - BUG_ON(bch2_keylist_empty(keys)); - bch2_verify_keylist_sorted(keys); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) split; - - bkey_copy(&split.k, bch2_keylist_front(keys)); - - ret = bch2_extent_trim_atomic(&split.k, iter); - if (ret) - break; - - bch2_trans_update(&trans, iter, &split.k); - - ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - if (ret == -EINTR) { - ret = 0; - goto retry; - } - - bch2_trans_exit(&trans); - - return ret; -} - /** * bch_write_index - after a write, update index to point to new data */ diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 97cc661420c6..3d4e8ee0553b 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -89,6 +89,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; op->journal_seq = 0; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; op->index_update_fn = bch2_write_index_default; } diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 50f2a5e57960..36b40c3fb894 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -134,6 +134,8 @@ struct bch_write_op { u64 *journal_seq_p; u64 journal_seq; }; + u64 new_i_size; + s64 i_sectors_delta; int (*index_update_fn)(struct bch_write_op *); -- cgit From ea3532cbf7fdbb9fa4e45114532d55d1fc3ac7c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2019 14:45:22 
-0400 Subject: bcachefs: Fix a subtle race in the btree split path We have to free the old (in memory) btree node _before_ unlocking the new nodes - else, some other thread with a read lock on the old node could see stale data after another thread has already updated the new node. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +++-- fs/bcachefs/btree_iter.c | 2 -- fs/bcachefs/btree_update_interior.c | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f7d9abfdb3de..4a66c44764f6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1042,11 +1042,12 @@ next: old_nodes[i] = new_nodes[i]; } else { old_nodes[i] = NULL; - if (new_nodes[i]) - six_unlock_intent(&new_nodes[i]->c.lock); } } + for (i = 0; i < nr_new_nodes; i++) + six_unlock_intent(&new_nodes[i]->c.lock); + bch2_btree_update_done(as); bch2_keylist_free(&keylist, NULL); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8c6d3193c3fe..a91cee797703 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -833,8 +833,6 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) btree_iter_node_set(linked, b); } - - six_unlock_intent(&b->c.lock); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 946254c51a69..3b134d3a9984 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1446,8 +1446,20 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_iter_node_replace(iter, n2); bch2_btree_iter_node_replace(iter, n1); + /* + * The old node must be freed (in memory) _before_ unlocking the new + * nodes - else another thread could re-acquire a read lock on the old + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ bch2_btree_node_free_inmem(c, b, iter); + if (n3) + six_unlock_intent(&n3->c.lock); + if (n2) + six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); + bch2_btree_trans_verify_locks(iter->trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], @@ -1761,6 +1773,8 @@ retry: bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); + six_unlock_intent(&n->c.lock); + bch2_btree_update_done(as); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) @@ -1855,6 +1869,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); + six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); return 0; -- cgit From 05240ba6b897995d4d4086f7f4accc7858ee0a40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2019 15:03:32 -0400 Subject: bcachefs: Fix creation of lost+found Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 095eef3828ce..23f3ed54fadd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1011,7 +1011,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_create_trans(&trans, BCACHEFS_ROOT_INO, &root_inode, &lostfound_inode, &lostfound, - 0, 0, 0755, 0, + 0, 0, S_IFDIR|0755, 0, NULL, NULL)); if (ret) goto err; -- cgit From 821a99b7ba6802d43f980a8312cd25694b7ea076 Mon Sep 17 00:00:00 2001 From: 
Kent Overstreet Date: Fri, 11 Oct 2019 15:14:36 -0400 Subject: bcachefs: Switch to .iterate_shared for readdir We definitely don't need an exclusive inode lock for readdir. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0042a825a698..65556993bbb9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -367,12 +367,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, return d_splice_alias(vinode, dentry); } -static int bch2_create(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, bool excl) +static int bch2_mknod(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false); + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -381,6 +381,13 @@ static int bch2_create(struct mnt_idmap *idmap, return 0; } +static int bch2_create(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); +} + static int __bch2_link(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_info *dir, @@ -512,33 +519,7 @@ err: static int bch2_mkdir(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false); - - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, &inode->v); - return 0; -} - -static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) -{ - return bch2_unlink(vdir, dentry); -} - -static int bch2_mknod(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false); - - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, &inode->v); - return 0; + return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); } static int bch2_rename2(struct mnt_idmap *idmap, @@ -1034,7 +1015,7 @@ static const struct inode_operations bch_dir_inode_operations = { .unlink = bch2_unlink, .symlink = bch2_symlink, .mkdir = bch2_mkdir, - .rmdir = bch2_rmdir, + .rmdir = bch2_unlink, .mknod = bch2_mknod, .rename = bch2_rename2, .getattr = bch2_getattr, -- cgit From 538abcb8a1dfcd8473a90d931b8603e4a03812ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Oct 2019 14:13:45 -0400 Subject: bcachefs: Fix a debug assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a91cee797703..8aaaa6615eff 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1359,6 +1359,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) if (debug_check_iterators(iter->trans->c)) { struct bkey k = bkey_unpack_key(l->b, _k); + + /* + * this flag is internal to the btree code, + * we don't care if it doesn't match - if it's now set + * it just means the key has been written out to disk: + */ + k.needs_whiteout = iter->k.needs_whiteout; BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } -- cgit From 
f38fe2dc5dbde967c96ce579f974a406a049b122 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Oct 2019 14:44:09 -0400 Subject: bcachefs: Fix iterator counting for reflink pointers (again) Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 50cad6725c1b..8b7d2b8759b0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -959,7 +959,8 @@ static int count_iters_for_insert(struct btree_trans *trans, case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = end->offset - bkey_start_offset(p.k); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); struct btree_iter *iter; struct bkey_s_c r_k; -- cgit From ae93a628956c8eab915baa83be11d2a0300eebca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Oct 2019 16:44:44 -0400 Subject: bcachefs: Fix flushing held btree writes when there's a fs error Previously, we'd go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 81418d534d70..7d5d94dbc64f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1438,6 +1438,9 @@ again: cond_resched(); nodes_unwritten = false; + if (bch2_journal_error(&c->journal)) + return true; + rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (btree_node_need_write(b)) { -- cgit From 71603f1ffe03d2409d591e4647377a415376a6b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Oct 2019 13:48:12 -0400 Subject: bcachefs: Fix an iterator counting bug The iterator counting assumed we're doing an obvious optimization when only updating the refcount on indirect extents - but we're not doing it yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8b7d2b8759b0..16a328a20fb5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -971,20 +971,7 @@ static int count_iters_for_insert(struct btree_trans *trans, POS(0, idx + sectors)) >= 0) break; - *nr_iters += 1; - - if (overwrite && - r_k.k->type == KEY_TYPE_reflink_v) { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(r_k); - - if (le64_to_cpu(r.v->refcount) == 1) - *nr_iters += bch2_bkey_nr_alloc_ptrs(r_k); - } - - /* - * if we're going to be deleting an entry from - * the reflink btree, need more iters... - */ + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); if (*nr_iters >= max_iters) { struct bpos pos = bkey_start_pos(k.k); -- cgit From a94407434b7a3b577a7605f0bd999d98acad6d23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2019 00:22:03 -0400 Subject: bcachefs: Limit bios in writepages path to 256M This works around a bug where bio_full() doesn't check for bio->bi_iter.bi_size overflowing - and, we don't really want to build bios that are that big anyways. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9914f6ee2aa2..a3fb60383725 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1151,6 +1151,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); -- cgit From 2d78737d9686faa363dde0b8fdef224ae29cff55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Oct 2019 20:40:53 -0400 Subject: bcachefs: Drop bch_write_op->io_wq This is dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/io.h | 1 - fs/bcachefs/io_types.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 3d4e8ee0553b..8a5d45f48045 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -71,7 +71,6 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { op->c = c; - op->io_wq = index_update_wq(op); op->flags = 0; op->written = 0; op->error = 0; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 36b40c3fb894..0f7fad041205 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -95,7 +95,6 @@ struct bch_write_bio { struct bch_write_op { struct closure cl; struct bch_fs *c; - struct workqueue_struct *io_wq; u64 start_time; unsigned written; /* sectors */ @@ -111,7 +110,6 @@ struct bch_write_op { struct bch_devs_list devs_have; u16 target; u16 nonce; - struct bch_io_opts opts; struct bpos pos; -- cgit From fbc519ab2e3e8cfb7d9a660fa319d9c44bdd937e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Oct 2019 19:58:06 -0400 Subject: bcachefs: Don't submit bio in write path under lock Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d6d777bb53da..6d416f71f055 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1008,6 +1008,8 @@ do_write: bkey_start_pos(&key_to_write->k), total_input >> 9); + bch2_alloc_sectors_done(c, wp); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; dst->bi_opf = REQ_OP_WRITE; @@ -1022,6 +1024,8 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: + bch2_alloc_sectors_done(c, wp); + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); if (to_wbio(dst)->put_bio) @@ -1072,10 +1076,9 @@ again: goto flush_io; } - ret = bch2_write_extent(op, wp); - bch2_open_bucket_get(c, wp, &op->open_buckets); - bch2_alloc_sectors_done(c, wp); + + ret = bch2_write_extent(op, wp); if (ret < 0) goto err; -- cgit From 77d63522f04897b025b6172074ededf5eab07b6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2019 22:22:29 -0400 Subject: bcachefs: Make replicas_delta_list smaller Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 6 ++++-- fs/bcachefs/buckets.c | 18 +++++++++++------- fs/bcachefs/buckets_types.h | 6 +++++- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fd50f51943c3..139e8e8711f0 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -587,9 +587,11 @@ out: bch2_journal_res_put(&c->journal, &trans->journal_res); out_clear_replicas: if 
(trans->fs_usage_deltas) { - memset(&trans->fs_usage_deltas->fs_usage, 0, - sizeof(trans->fs_usage_deltas->fs_usage)); trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); } return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d4d66d78d2a3..34d3b117085b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -585,9 +585,14 @@ void bch2_replicas_delta_list_apply(struct bch_fs *c, { struct replicas_delta *d = r->d; struct replicas_delta *top = (void *) r->d + r->used; + unsigned i; + + fs_usage->nr_inodes += r->nr_inodes; - acc_u64s((u64 *) fs_usage, - (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + fs_usage->reserved += r->persistent_reserved[i]; + fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; + } while (d != top) { BUG_ON((void *) d > (void *) top); @@ -1739,9 +1744,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d = replicas_deltas_realloc(trans, 0); if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) - d->fs_usage.nr_inodes++; + d->nr_inodes++; else - d->fs_usage.nr_inodes--; + d->nr_inodes--; return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1750,10 +1755,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->fs_usage.persistent_reserved)); + ARRAY_SIZE(d->persistent_reserved)); - d->fs_usage.reserved += sectors; - d->fs_usage.persistent_reserved[replicas - 1] += sectors; + d->persistent_reserved[replicas - 1] += sectors; return 0; } case KEY_TYPE_reflink_p: diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7ab9aa641c95..070e10dfa7bb 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -100,7 +100,11 @@ struct replicas_delta { struct replicas_delta_list { unsigned size; unsigned used; - struct bch_fs_usage fs_usage; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; struct replicas_delta d[0]; }; -- cgit From 8f1965391cc421ad4e50b4dfe5e06aae661f8870 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2019 00:24:51 -0400 Subject: bcachefs: Make btree_node_type_needs_gc() cheaper Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 25 ++++++++++++++----------- fs/bcachefs/btree_update.h | 4 ---- fs/bcachefs/btree_update_leaf.c | 24 ++++++++++-------------- 3 files changed, 24 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7d3c6670e30f..48ebc886aaa2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -460,19 +460,22 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_ALLOC)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)| \ + (1U << BKEY_TYPE_EC)| \ + (1U << BKEY_TYPE_BTREE)) + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - switch (type) { - case BKEY_TYPE_ALLOC: - case 
BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_INODES: - case BKEY_TYPE_EC: - case BKEY_TYPE_REFLINK: - return true; - default: - return false; - } + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); } struct btree_root { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 49f4d24d56ff..db18527a239f 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -27,7 +27,6 @@ enum { __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, - __BTREE_INSERT_MARK_INMEM, __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, @@ -68,9 +67,6 @@ enum { /* Don't call mark new key at all: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) -/* Don't mark transactionally: */ -#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) - #define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) #define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 139e8e8711f0..0b9b573a0d72 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -426,20 +426,16 @@ static inline void do_btree_insert_one(struct btree_trans *trans, btree_insert_key_leaf(trans, insert); } -static inline bool update_triggers_transactional(struct btree_trans *trans, - struct btree_insert_entry *i) +static inline bool update_has_trans_triggers(struct btree_insert_entry *i) { - return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && - (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES || - i->iter->btree_id == BTREE_ID_REFLINK); + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); } -static inline bool update_has_triggers(struct btree_trans *trans, - struct btree_insert_entry *i) +static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) { - return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - btree_node_type_needs_gc(i->iter->btree_id); + return (BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + (1U << i->iter->btree_id); } /* @@ -465,8 +461,8 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * updates as we're walking it: */ trans_for_each_update(trans, i) - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { ret = bch2_trans_mark_update(trans, i->iter, i->k); if (ret == -EINTR) trace_trans_restart_mark(trans->ip); @@ -551,8 +547,8 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } trans_for_each_update(trans, i) - if (update_has_triggers(trans, i) && - !update_triggers_transactional(trans, i)) + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_nontrans_triggers(i)) bch2_mark_update(trans, i, &fs_usage->u, mark_flags); if (fs_usage && trans->fs_usage_deltas) -- cgit From 2a9101a9898920a04e77f70f7bbee84d2c76c527 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2019 19:03:23 -0400 Subject: bcachefs: Refactor bch2_trans_commit() path Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_io.h | 6 +- fs/bcachefs/btree_iter.h | 5 + fs/bcachefs/btree_types.h | 10 +- fs/bcachefs/btree_update.h | 27 ++- fs/bcachefs/btree_update_leaf.c | 402 ++++++++++++++++++++-------------------- fs/bcachefs/fs-io.c | 28 +-- fs/bcachefs/io.c | 1 + 
fs/bcachefs/journal.h | 2 +- fs/bcachefs/reflink.c | 5 + 10 files changed, 259 insertions(+), 228 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c5c98aae8bdb..093dc906353d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -301,7 +301,6 @@ do { \ x(btree_node_sort) \ x(btree_node_read) \ x(btree_gc) \ - x(btree_update) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ x(btree_lock_contended_write) \ diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 3fb0aa20b340..69516ec34b89 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -62,10 +62,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) { - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + unsigned total_u64s = bset_u64s(t); + unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; - return dead_u64s > 128 && dead_u64s * 3 > bset_u64s; + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6f81be26e674..1b7262d7e284 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -48,6 +48,11 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ +#define trans_for_each_iter_all(_trans, _iter) \ + for (_iter = (_trans)->iters; \ + _iter < (_trans)->iters + (_trans)->nr_iters; \ + _iter++) + static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 48ebc886aaa2..3a26a8802e86 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -255,7 +255,6 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; unsigned long ip; - u64 commit_start; u64 iters_linked; u64 iters_live; @@ -283,12 +282,11 @@ struct btree_trans { struct disk_reservation *disk_res; unsigned flags; unsigned journal_u64s; + struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; u8 updates_sorted_onstack[6]; - - struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ @@ -420,6 +418,12 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) +static inline unsigned bset_u64s(struct bset_tree *t) +{ + return t->end_offset - t->data_offset - + sizeof(struct bset) / sizeof(u64); +} + static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index db18527a239f..ad8cbf3fb778 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -93,9 +93,30 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); -int bch2_trans_commit(struct btree_trans *, - struct disk_reservation *, - u64 *, unsigned); +int __bch2_trans_commit(struct btree_trans *); + +/** + * bch2_trans_commit - insert keys at given iterator positions + * + * This is main entry point for btree updates. 
+ * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static inline int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + unsigned flags) +{ + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + trans->flags = flags; + + return __bch2_trans_commit(trans); +} static inline void bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0b9b573a0d72..38a27d3a3b40 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -20,16 +20,11 @@ #include static inline bool same_leaf_as_prev(struct btree_trans *trans, - unsigned sorted_idx) + unsigned idx) { - struct btree_insert_entry *i = trans->updates + - trans->updates_sorted[sorted_idx]; - struct btree_insert_entry *prev = sorted_idx - ? trans->updates + trans->updates_sorted[sorted_idx - 1] - : NULL; - - return prev && - i->iter->l[0].b == prev->iter->l[0].b; + return idx && + trans->updates[trans->updates_sorted[idx]].iter->l[0].b == + trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; } #define trans_for_each_update_sorted(_trans, _i, _iter) \ @@ -92,8 +87,6 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) trans->updates_sorted[pos] = l - trans->updates; nr++; } - - BUG_ON(nr != trans->nr_updates); } /* Inserting into a given leaf node (last stage of insert): */ @@ -266,8 +259,8 @@ static void bch2_insert_fixup_key(struct btree_trans *trans, EBUG_ON(insert->k->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k)) + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, + insert->k))) bch2_btree_journal_key(trans, iter, insert->k); } @@ -280,7 +273,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -290,7 +284,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); @@ -323,26 +317,12 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } -static int bch2_trans_journal_preres_get(struct btree_trans *trans) +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; int ret; - trans_for_each_update(trans, i) - if (0) - u64s += jset_u64s(i->k->k.u64s); - - if (!u64s) - return 0; - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, - JOURNAL_RES_GET_NONBLOCK); - if (ret != -EAGAIN) - return ret; - bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, @@ -358,8 +338,8 @@ static int 
bch2_trans_journal_preres_get(struct btree_trans *trans) return 0; } -static int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) +static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) { struct bch_fs *c = trans->c; int ret; @@ -438,63 +418,43 @@ static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) (1U << i->iter->btree_id); } -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_btree_insert_at(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) +{ + __bch2_btree_iter_unlock(iter); +} + +static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; - struct btree_iter *iter; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; - int ret; - trans_for_each_update(trans, i) - BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) + return; - /* - * note: running triggers will append more updates to the list of - * updates as we're walking it: - */ trans_for_each_update(trans, i) - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_trans_triggers(i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } - - trans_for_each_iter(trans, iter) { - if (iter->nodes_locked != iter->nodes_intent_locked) { - BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - BUG_ON(trans->iters_live & (1ULL << iter->idx)); - __bch2_btree_iter_unlock(iter); - } - } - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); - bch2_btree_trans_verify_locks(trans); - - /* - * No more updates can be added - sort updates so we can take write - * locks in the correct order: - */ - btree_trans_sort_updates(trans); + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + mark_flags|BCH_BUCKET_MARK_GC); +} - btree_trans_lock_write(trans, true); +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage_online *fs_usage = NULL; + struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; + int ret; if (race_fault()) { - ret = -EINTR; trace_trans_restart_fault_inject(trans->ip); - goto out; + return -EINTR; } /* @@ -504,7 +464,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ ret = btree_trans_check_can_insert(trans, stopped_at); if (ret) - goto out; + return ret; trans_for_each_update(trans, i) { if (!btree_node_type_needs_gc(i->iter->btree_id)) @@ -515,10 +475,11 @@ static inline int do_btree_insert_at(struct btree_trans *trans, fs_usage = bch2_fs_usage_scratch_get(c); } + /* Must be called under mark_lock: */ if (!bch2_bkey_replicas_marked_locked(c, bkey_i_to_s_c(i->k), true)) { ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; + goto err; } } @@ -527,16 +488,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * succeed: */ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans->journal_u64s = 0; - - trans_for_each_update(trans, i) - trans->journal_u64s += jset_u64s(i->k->k.u64s); - - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); if (ret) - goto out; + goto err; } + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) trans_for_each_update(trans, i) @@ -558,39 +520,122 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - unlikely(c->gc_pos.phase)) - trans_for_each_update(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - mark_flags| - BCH_BUCKET_MARK_GC); + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -out: - BUG_ON(ret && - (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && - trans->journal_res.ref); - - btree_trans_lock_write(trans, false); - +err: if (fs_usage) { bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } - bch2_journal_res_put(&c->journal, &trans->journal_res); -out_clear_replicas: - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); + return ret; +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct btree_insert_entry *i; + struct btree_iter *iter; + unsigned idx, u64s, journal_preres_u64s = 0; + int ret; + + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? 
__bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + return ret; + } + } + + u64s = jset_u64s(i->k->k.u64s); + if (0) + journal_preres_u64s += u64s; + trans->journal_u64s += u64s; } - return ret; + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + journal_preres_u64s); + if (unlikely(ret)) + return ret; + + /* + * Can't be holding any read locks when we go to take write locks: + * + * note - this must be done after bch2_trans_journal_preres_get_cold() + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ + trans_for_each_iter_all(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + EBUG_ON(trans->iters_live & (1ULL << iter->idx)); + bch2_btree_iter_unlock_noinline(iter); + } + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); + + btree_trans_lock_write(trans, true); + ret = bch2_trans_commit_write_locked(trans, stopped_at); + btree_trans_lock_write(trans, false); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + + trans->nounlock = false; + + trans_for_each_update(trans, i) + bch2_btree_iter_downgrade(i->iter); + + return 0; } static noinline @@ -698,66 +743,27 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. 
- * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static int __bch2_trans_commit(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; int ret; - trans_for_each_update(trans, i) { - if (!bch2_btree_iter_upgrade(i->iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto err; - } - - ret = btree_iter_err(i->iter); - if (ret) - goto err; - } + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; - ret = do_btree_insert_at(trans, stopped_at); - if (unlikely(ret)) - goto err; - - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; - - trans_for_each_update_sorted(trans, i, iter) - if (!same_leaf_as_prev(trans, iter)) - bch2_foreground_maybe_merge(c, i->iter, - 0, trans->flags); - - trans->nounlock = false; + bch2_trans_unlock(trans); - trans_for_each_update(trans, i) - bch2_btree_iter_downgrade(i->iter); -err: - /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + ret = bch2_fs_read_write_early(c); + if (ret) + return ret; - return ret; + percpu_ref_get(&c->writes); + return 0; } -int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) +int __bch2_trans_commit(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct btree_iter *iter; unsigned orig_nr_updates = trans->nr_updates; @@ -768,61 +774,47 @@ int bch2_trans_commit(struct btree_trans *trans, goto out_noupdates; /* for the sake of sanity: */ - BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); - - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (!trans->commit_start) - trans->commit_start = local_clock(); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - trans->flags = flags; - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) { - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) - return -EROFS; - - bch2_trans_unlock(trans); - - ret = bch2_fs_read_write_early(c); + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; + } +retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->journal_u64s = 0; - percpu_ref_get(&c->writes); + ret = do_bch2_trans_commit(trans, &i); - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); } -retry: - ret = bch2_trans_journal_preres_get(trans); - if (ret) - goto err; - ret = __bch2_trans_commit(trans, &i); + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); + if (ret) goto err; out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); + 
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); out_noupdates: - if (!ret && trans->commit_start) { - bch2_time_stats_update(&c->times[BCH_TIME_btree_update], - trans->commit_start); - trans->commit_start = 0; - } - - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - trans_for_each_iter(trans, iter) + trans_for_each_iter_all(trans, iter) iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; if (!ret) { @@ -836,18 +828,16 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); - /* free updates and memory used by triggers, they'll be reexecuted: */ - trans->nr_updates = orig_nr_updates; - trans->mem_top = orig_mem_top; - /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; + if (ret) + goto out; - if (!ret) - goto retry; - - goto out; + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + goto retry; } /** diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a3fb60383725..c539ed3aa48d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2720,20 +2720,26 @@ long bch2_fallocate_dispatch(struct file *file, int mode, loff_t offset, loff_t len) { struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bchfs_fallocate(inode, mode, offset, len); - - if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bchfs_fpunch(inode, offset, len); - - if (mode == FALLOC_FL_INSERT_RANGE) - return bchfs_fcollapse_finsert(inode, offset, len, true); + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; - if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bchfs_fcollapse_finsert(inode, offset, len, false); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + ret = bchfs_fpunch(inode, offset, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; + + percpu_ref_put(&c->writes); - return -EOPNOTSUPP; + return ret; } static void mark_range_unallocated(struct bch_inode_info *inode, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6d416f71f055..a343393115d8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -310,6 +310,7 @@ int bch2_extent_update(struct btree_trans *trans, bch2_trans_update(trans, iter, k); ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 36066ea7de7a..f8867f86318a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -271,7 +271,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, _RET_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c 
index 4a4b17f93a2e..6d45ae24479d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -166,6 +166,9 @@ s64 bch2_remap_range(struct bch_fs *c, u64 src_done, dst_done; int ret = 0, ret2 = 0; + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { mutex_lock(&c->sb_lock); if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { @@ -295,5 +298,7 @@ err: ret = bch2_trans_exit(&trans) ?: ret; + percpu_ref_put(&c->writes); + return dst_done ?: ret ?: ret2; } -- cgit From cdd775e6d7fee5dbfb17671d1427c0ca630b7f64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Oct 2019 19:38:08 -0400 Subject: bcachefs: Don't use FUA unnecessarily Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c1d3e685a5f2..c4f85b962b65 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1493,7 +1493,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), - REQ_OP_WRITE|REQ_META|REQ_FUA, + REQ_OP_WRITE|REQ_META, GFP_NOIO, &c->btree_bio), struct btree_write_bio, wbio.bio); @@ -1504,6 +1504,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; + if (b->c.level || !b->written) + wbio->wbio.bio.bi_opf |= REQ_FUA; + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* -- cgit From 2e050d96b0c410646b313d711e57b6968732c37c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Oct 2019 21:27:10 -0400 Subject: bcachefs: kill bch2_extent_merge_inline() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 433 +++++++++++++++++--------------------------------- 1 file changed, 146 insertions(+), 287 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 16a328a20fb5..02db6c759622 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -805,119 +805,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) return true; } -static bool extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - struct bkey_packed tmp; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = src->k; - else if (bch2_bkey_pack_key(&tmp, &src->k, f)) - memcpy_u64s(dst, &tmp, f->key_u64s); - else - return false; - - memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); - return true; -} - -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, 
bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; - struct bkey_packed *k; - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) - return; - - node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) - return; - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - node_iter = l->iter; - while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) - l->iter = node_iter; - - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1125,6 +1012,83 @@ bch2_extent_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; + struct bkey_packed *k; + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + /* + * may have skipped past some deleted extents greater than the insert + * 
key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + node_iter = l->iter; + while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) + l->iter = node_iter; + + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + static void extent_squash(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert, @@ -1215,21 +1179,63 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, } } -struct extent_insert_state { - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; - -static void __bch2_insert_fixup_extent(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_i *insert, - struct extent_insert_state *s) +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. 
+ */ +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert_entry) { + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; struct btree_iter_level *l = &iter->l[0]; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; struct bkey_packed *_k; struct bkey unpacked; + BKEY_PADDED(k) tmp; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_discard))) { @@ -1242,11 +1248,11 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, break; if (!bkey_whiteout(k.k)) - s->update_journal = true; + update_journal = true; - if (!s->update_journal) { + if (!update_journal) { bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &s->whiteout); + bch2_cut_front(cur_end, &whiteout); bch2_btree_iter_set_pos_same_leaf(iter, cur_end); goto next; } @@ -1256,8 +1262,8 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, * of the key we're deleting, instead of creating and inserting * a new whiteout: */ - if (s->deleting && - !s->update_btree && + if (deleting && + !update_btree && !bkey_cmp(insert->k.p, k.k->p) && !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { if (!bkey_whiteout(k.k)) { @@ -1272,10 +1278,10 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, if (k.k->needs_whiteout || bkey_written(l->b, _k)) { insert->k.needs_whiteout = true; - s->update_btree = true; + update_btree = true; } - if (s->update_btree && + if (update_btree && overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { @@ -1285,79 +1291,18 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, extent_squash(c, iter, insert, _k, k, overlap); - if (!s->update_btree) + if (!update_btree) bch2_cut_front(cur_end, insert); next: if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } -} -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. 
- * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct extent_insert_state s = { - .whiteout = *insert->k, - .update_journal = !bkey_whiteout(&insert->k->k), - .update_btree = !bkey_whiteout(&insert->k->k), - .deleting = bkey_whiteout(&insert->k->k), - }; - BKEY_PADDED(k) tmp; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - - __bch2_insert_fixup_extent(c, iter, insert->k, &s); + if (update_btree) { + bkey_copy(&tmp.k, insert); - bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); - - if (s.update_btree) { - bkey_copy(&tmp.k, insert->k); - - if (s.deleting) + if (deleting) tmp.k.k.type = KEY_TYPE_discard; EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); @@ -1365,10 +1310,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, extent_bset_insert(c, iter, &tmp.k); } - if (s.update_journal) { - bkey_copy(&tmp.k, !s.deleting ? insert->k : &s.whiteout); + if (update_journal) { + bkey_copy(&tmp.k, !deleting ? insert : &whiteout); - if (s.deleting) + if (deleting) tmp.k.k.type = KEY_TYPE_discard; EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); @@ -1376,7 +1321,8 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, bch2_btree_journal_key(trans, iter, &tmp.k); } - bch2_cut_front(insert->k->k.p, insert->k); + bch2_cut_front(insert->k.p, insert); + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -1716,93 +1662,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, return BCH_MERGE_MERGE; } -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - BKEY_PADDED(k) li, ri; - struct bkey_packed *m = back_merge ? l : r; - struct bkey_i *mi = back_merge ? 
&li.k : &ri.k; - struct bset_tree *t = bch2_bkey_to_bset(b, m); - enum merge_result ret; - - EBUG_ON(bkey_written(b, m)); - - if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX || - bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX) - return BCH_MERGE_NOMERGE; - - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); - - ret = bch2_bkey_merge(c, - bkey_i_to_s(&li.k), - bkey_i_to_s(&ri.k)); - if (ret == BCH_MERGE_NOMERGE) - return false; - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); - if (debug_check_bkeys(c) && - ret == BCH_MERGE_PARTIAL) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); - - /* - * check if we overlap with deleted extents - would break the sort - * order: - */ - if (back_merge) { - struct bkey_packed *n = bkey_next(m); - - if (n != btree_bkey_last(b, t) && - bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && - bkey_deleted(n)) - return false; - } else if (ret == BCH_MERGE_MERGE) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - if (prev && - bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(&li.k.k)) > 0) - return false; - } - - if (ret == BCH_MERGE_PARTIAL) { - if (!extent_i_save(b, m, mi)) - return false; - - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - } else { - if (!extent_i_save(b, m, &li.k)) - return false; - } - - bch2_bset_fix_invalidated_key(b, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - m, m->u64s, m->u64s); - - return ret == BCH_MERGE_MERGE; -} - bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { -- cgit From 7f9473d17151408ae0d5fbdbd6089df2f214c2e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Oct 2019 21:15:08 -0400 Subject: bcachefs: Avoid calling iter_prev() in extent update path Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 02db6c759622..20a2eeed98cd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1062,8 +1062,8 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; - struct bkey_packed *k; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); @@ -1073,18 +1073,6 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, if (debug_check_bkeys(c)) bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - node_iter = l->iter; - while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) - l->iter = node_iter; - - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - bch2_bset_insert(l->b, &l->iter, k, insert, 0); bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); } @@ -1225,6 +1213,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, struct btree_iter *iter = insert_entry->iter; struct bkey_i *insert = 
insert_entry->k; struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; bool deleting = bkey_whiteout(&insert->k); bool update_journal = !deleting; bool update_btree = !deleting; @@ -1294,11 +1283,16 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (!update_btree) bch2_cut_front(cur_end, insert); next: + node_iter = l->iter; + if (overlap == BCH_EXTENT_OVERLAP_FRONT || overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + if (update_btree) { bkey_copy(&tmp.k, insert); @@ -1322,7 +1316,6 @@ next: } bch2_cut_front(insert->k.p, insert); - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -- cgit From b8098f36dd98566790bae019815583363eb877c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Oct 2019 17:35:35 -0400 Subject: bcachefs: Don't use rep movsq for small memcopies Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 4 ++-- fs/bcachefs/extents.c | 4 ++-- fs/bcachefs/util.h | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 321fe6fe0b55..cb2702707c2a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -95,8 +95,8 @@ do { \ (u64 *) (_dst) < (u64 *) (_src) + \ ((struct bkey *) (_src))->u64s); \ \ - __memmove_u64s_down((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ + memcpy_u64s_small((_dst), (_src), \ + ((struct bkey *) (_src))->u64s); \ } while (0) struct btree; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 20a2eeed98cd..b427bc1f0f9c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1457,8 +1457,8 @@ static inline void __extent_entry_insert(struct bkey_i *k, { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); k->k.u64s += extent_entry_u64s(new); memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5f0a3de91ae3..965b6dd73bfa 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -593,6 +593,24 @@ static inline void memmove_u64s_down(void *dst, const void *src, __memmove_u64s_down(dst, src, u64s); } +static inline void __memmove_u64s_up_small(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s; + u64 *src = (u64 *) _src + u64s; + + while (u64s--) + *--dst = *--src; +} + +static inline void memmove_u64s_up_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up_small(dst, src, u64s); +} + static inline void __memmove_u64s_up(void *_dst, const void *_src, unsigned u64s) { -- cgit From 92384391c8d04642bb2be1e0c34f744675f50abc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2019 14:22:29 -0400 Subject: bcachefs: Don't reuse bio in retry path We can't reuse bios without reinitializing them, and in the retry path it's safer to just make sure we don't reuse them at all. 
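For reference, reusing a bio legitimately would mean reinitializing it first; the patch below sidesteps that by forcing the retry to clone instead. A minimal sketch of the reinit pattern, purely illustrative (the function and its parameters are made up, not bcachefs code):

#include <linux/bio.h>

/*
 * Illustrative only: a bio keeps iterator and completion state from its
 * previous submission, so reuse means resetting it and re-attaching the
 * data pages and bi_end_io before submitting again.
 */
static void reuse_bio_for_read(struct bio *bio, struct block_device *bdev,
			       sector_t sector)
{
	bio_reset(bio, bdev, REQ_OP_READ);	/* wipe state, keep the allocation */
	bio->bi_iter.bi_sector = sector;
	/* ...re-add pages with bio_add_page() and set bi_end_io here... */
}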
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a343393115d8..3f075dcfafb2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1447,6 +1447,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio int ret; flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; bch2_trans_init(&trans, c, 0, 0); -- cgit From 406d6d5a0733f3eacbcd4c33905bf63c5b2f4c07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2019 18:54:58 -0400 Subject: bcachefs: Fix an error path race On IO error, bch2_writepages_io_done() will set the page state to indicate nothing's already reserved (since the write didn't happen, we don't know what's already reserved). This can race with the buffered IO path, in between getting a disk reservation and calling bch2_set_page_dirty(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c539ed3aa48d..93f6cdbbf7c7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -491,7 +491,12 @@ static void bch2_set_page_dirty(struct bch_fs *c, unsigned sectors = sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - BUG_ON(sectors > res->disk.sectors); + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; -- cgit From e219965586b0e18a12e12fdf37f26eb74bb17bcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2019 19:06:26 -0400 Subject: bcachefs: Add missing error checking in bch2_find_by_inum_trans() Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index aeae536b39f1..b0e670cc9d0f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -509,7 +509,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, { struct btree_iter *iter; struct bkey_s_c k; - int ret = -ENOENT; + int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode_nr, 0), BTREE_ITER_SLOTS); @@ -517,8 +517,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, return PTR_ERR(iter); k = bch2_btree_iter_peek_slot(iter); - if (k.k->type == KEY_TYPE_inode) - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bkey_err(k); + if (ret) + return ret; + + ret = k.k->type == KEY_TYPE_inode + ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; bch2_trans_iter_put(trans, iter); -- cgit From e3728b50034504e5e64604d72896973374cb1fa5 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Fri, 11 Oct 2019 17:20:30 -0700 Subject: bcachefs: Initialize padding space after alloc bkey Packed bkeys are padded up to 64 bit alignment, but the alloc bkey type was not clearing the pad bytes after the last data byte. This left the key possibly containing some random garbage at the end. This problem was found using valgrind. This patch also changes a path with the inode bkey to clear in the same way. 
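The patch below adds a small helper for this; as a rough standalone illustration of the idea (userspace C, names invented here), zeroing from the end of the used bytes up to the next 64-bit boundary looks like:

#include <stdint.h>
#include <string.h>

/* Zero the pad bytes between `bytes` of real data and the next 8-byte
 * boundary, so stale memory never ends up in the packed key. */
static void zero_tail_to_u64(void *buf, size_t bytes)
{
	size_t padded = (bytes + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1);

	memset((char *) buf + bytes, 0, padded - bytes);
}

With bytes = 13, for example, padded is 16 and the last three pad bytes are cleared.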
Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- fs/bcachefs/inode.c | 8 ++++---- fs/bcachefs/util.h | 8 ++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7d5d94dbc64f..7b9079a740ef 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -152,6 +152,7 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst, { unsigned idx = 0; void *d = dst->v.data; + unsigned bytes; dst->v.fields = 0; dst->v.gen = src.gen; @@ -160,7 +161,9 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst, BCH_ALLOC_FIELDS() #undef x - set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v); + bytes = (void *) d - (void *) &dst->v; + set_bkey_val_bytes(&dst->k, bytes); + memset_u64s_tail(&dst->v, 0, bytes); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b0e670cc9d0f..67555db01dc4 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -95,6 +95,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; bkey_inode_init(&packed->inode.k_i); packed->inode.k.p.inode = inode->bi_inum; @@ -117,10 +118,9 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, out = last_nonzero_field; nr_fields = last_nonzero_fieldnr; - set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); - memset(out, 0, - (u8 *) &packed->inode.v + - bkey_val_bytes(&packed->inode.k) - out); + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 965b6dd73bfa..33589362f5df 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -648,6 +648,14 @@ static inline void memmove_u64s(void *dst, const void *src, __memmove_u64s_up(dst, src, u64s); } +/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ +static inline void memset_u64s_tail(void *s, int c, unsigned bytes) +{ + unsigned rem = round_up(bytes, sizeof(u64)) - bytes; + + memset(s + bytes, c, rem); +} + void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); -- cgit From 43cfbad6e431568d14e5d1407203c39a140f0148 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Fri, 11 Oct 2019 17:05:11 -0700 Subject: bcachefs: Further padding fixes in bch2_journal_super_entries_add_common() The previous patch 128cb1a to fix uninitialized data was incorrect and did not initialize the padding space correctly. Furthermore, several other cases in this function do not initialize their padding space correctly. Move initialization into some helper functions in a more robust way. 
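The helpers introduced below size a jset entry in 64-bit words; a rough standalone illustration of that arithmetic (plain C, values and names invented here) — as the patch notes, the stored u64s count excludes the shared header word:

#include <stdio.h>
#include <stdint.h>

/* Number of u64s needed to hold `size` bytes, rounded up. */
static unsigned bytes_to_u64s(size_t size)
{
	return (size + sizeof(uint64_t) - 1) / sizeof(uint64_t);
}

int main(void)
{
	size_t size = 20;			/* example payload size in bytes */
	unsigned u64s = bytes_to_u64s(size);	/* 3 */

	/* the on-disk u64s field counts from the start of the payload,
	 * not the shared header word, hence the - 1: */
	printf("clear %u u64s, store u64s = %u\n", u64s, u64s - 1);
	return 0;
}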
Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 14ff191ad702..6544bbf18e70 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -949,6 +949,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) return ret; } +static void +entry_init_u64s(struct jset_entry *entry, unsigned u64s) +{ + memset(entry, 0, u64s * sizeof(u64)); + + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = u64s - 1; +} + +static void +entry_init_size(struct jset_entry *entry, size_t size) +{ + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + entry_init_u64s(entry, u64s); +} + struct jset_entry * bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry *entry, @@ -963,7 +982,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, r < c->btree_roots + BTREE_ID_NR; r++) if (r->alive) { - entry->u64s = r->key.u64s; + entry_init_u64s(entry, r->key.u64s + 1); entry->btree_id = r - c->btree_roots; entry->level = r->level; entry->type = BCH_JSET_ENTRY_btree_root; @@ -988,8 +1007,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); @@ -1001,8 +1019,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); @@ -1014,8 +1031,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; @@ -1030,10 +1046,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); - int u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, - sizeof(u64)) - 1; - memset(u, 0, u64s * sizeof(u64)); - u->entry.u64s = u64s; + entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), -- cgit From 928c839cc949f7cb0b7cf09e1151e22681a4c338 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Fri, 11 Oct 2019 17:56:27 -0700 Subject: bcachefs: Initialize btree_node flags field in bch2_btree_root_alloc. Valgrind data indicated that the flags field was only partially initialized when written to disk. 
Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3b134d3a9984..0956957216f9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2187,6 +2187,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); + b->data->flags = 0; b->data->min_key = POS_MIN; b->data->max_key = POS_MAX; b->data->format = bch2_btree_calc_format(b); -- cgit From c4e065c23c4ad464be2fb1c4dfa70090cc3d0c66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Oct 2019 19:50:01 -0400 Subject: bcachefs: More bset.c microoptimization Improve a few paper cuts that've shown up during profiling. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bset.c | 18 +++++++++++++----- fs/bcachefs/btree_iter.c | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 8b3c9ae8d266..dd551cc3a162 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -329,7 +329,7 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, const struct bkey_packed *src) { - dst->k = bkey_unpack_key(b, src); + __bkey_unpack_key(b, &dst->k, src); memcpy_u64s(&dst->v, bkeyp_val(&b->format, src), diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 1dd2bcc69c35..6b3b7bd4002b 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1548,11 +1548,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * So we've got to search for start_of_range, then after the lookup iterate * past any extents that compare equal to the position we searched for. 
*/ +__flatten void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); @@ -1571,11 +1573,17 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, return; } - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, - packed_search, &p), - btree_bkey_last(b, t)); + for_each_bset(b, t) { + struct bkey_packed *k = bch2_bset_search(b, t, search, + packed_search, &p); + struct bkey_packed *end = btree_bkey_last(b, t); + + if (k != end) + *pos++ = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } bch2_btree_node_iter_sort(iter, b); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8aaaa6615eff..25ad6b69b6bd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -939,7 +939,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) btree_node_unlock(iter, iter->level); } -static inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -948,7 +948,7 @@ static inline int btree_iter_down(struct btree_iter *iter) enum six_lock_type lock_type = __btree_lock_want(iter, level); BKEY_PADDED(k) tmp; - BUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(iter, iter->level)); bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); -- cgit From ff929515cc52ed693ff2116be3af9f32122e9b54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2019 19:33:59 -0400 Subject: bcachefs: Trust btree alloc info at runtime This lets us avoid a cache miss in the write path. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 2 +- fs/bcachefs/recovery.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 093dc906353d..f8a040115fd1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -499,6 +499,7 @@ enum { /* misc: */ BCH_FS_BDEV_MOUNTED, BCH_FS_FIXED_GENS, + BCH_FS_ALLOC_WRITTEN, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 34d3b117085b..0c2ca7601fde 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1456,7 +1456,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret < 0) return ret; - if (!ret) { + if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { /* * During journal replay, and if gc repairs alloc info at * runtime, the alloc info in the btree might not be up to date diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 23f3ed54fadd..2c441a278044 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -864,6 +864,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } bch_verbose(c, "alloc write done"); + + set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { -- cgit From b7ba66c8450a58649393b47bc8975926b1e80814 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2019 19:35:13 -0400 Subject: bcachefs: Inline more of bch2_trans_commit hot path The main optimization here is that if we let bch2_replicas_delta_list_apply() fail, we can completely skip calling bch2_bkey_replicas_marked_locked(). And assorted other small optimizations. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 +---- fs/bcachefs/btree_locking.h | 18 +++++++ fs/bcachefs/btree_update_interior.h | 6 +-- fs/bcachefs/btree_update_leaf.c | 104 ++++++++++++++---------------------- fs/bcachefs/buckets.c | 48 +++++++++++------ fs/bcachefs/buckets.h | 6 +-- 6 files changed, 99 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 25ad6b69b6bd..c264b927f992 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -64,21 +64,9 @@ static inline int btree_iter_pos_cmp(struct btree_iter *iter, /* Btree node locking: */ -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) { - struct btree_iter *linked; - - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - - trans_for_each_iter_with_node(iter->trans, b, linked) - linked->l[b->c.level].lock_seq += 2; - - six_unlock_write(&b->c.lock); + bch2_btree_node_unlock_write_inlined(b, iter); } void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index aa5882cc4852..a164924ca656 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -202,6 +202,24 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, level); } +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + + EBUG_ON(iter->l[b->c.level].b != b); + 
EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + + trans_for_each_iter_with_node(iter->trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); +} + void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f9e092bf69aa..85f1320fa7b1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -284,17 +284,17 @@ static inline unsigned btree_write_set_buffer(struct btree *b) static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { - struct bset *i = btree_bset_last(b); + struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); - if (unlikely(bset_written(b, i))) { + if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) return bne; } else { - if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) return bne; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 38a27d3a3b40..85580e63b5ca 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -17,6 +17,7 @@ #include "replicas.h" #include "trace.h" +#include #include static inline bool same_leaf_as_prev(struct btree_trans *trans, @@ -50,23 +51,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct btree_trans *trans, bool lock) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; - - trans_for_each_update_sorted(trans, i, iter) { - if (same_leaf_as_prev(trans, iter)) - continue; - - if (lock) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); - else - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); - } -} - static inline void btree_trans_sort_updates(struct btree_trans *trans) { struct btree_insert_entry *l, *r; @@ -377,29 +361,6 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } -static int btree_trans_check_can_insert(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) -{ - struct btree_insert_entry *i; - unsigned iter, u64s = 0; - int ret; - - trans_for_each_update_sorted(trans, i, iter) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, iter)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); - if (ret) { - *stopped_at = i; - return ret; - } - } - - return 0; -} - static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_insert_entry *insert) { @@ -450,6 +411,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; + unsigned iter, u64s = 0; + bool marking = false; int ret; if (race_fault()) { @@ -462,25 +425,28 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, * held, otherwise another thread could write the node changing the * amount of space available: */ - ret = btree_trans_check_can_insert(trans, stopped_at); - if (ret) - return ret; - trans_for_each_update(trans, i) { - if (!btree_node_type_needs_gc(i->iter->btree_id)) - continue; + prefetch(&trans->c->journal.flags); - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } + trans_for_each_update_sorted(trans, i, iter) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, iter)) + u64s = 0; - /* Must be called under mark_lock: */ - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; + u64s += i->k->k.u64s; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; + return ret; } + + if (btree_node_type_needs_gc(i->iter->btree_id)) + marking = true; + } + + if (marking) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); } /* @@ -508,16 +474,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + bch2_replicas_delta_list_apply(c, &fs_usage->u, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } + trans_for_each_update(trans, i) if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && update_has_nontrans_triggers(i)) bch2_mark_update(trans, i, &fs_usage->u, mark_flags); - if (fs_usage && trans->fs_usage_deltas) - bch2_replicas_delta_list_apply(c, &fs_usage->u, - trans->fs_usage_deltas); - - if (fs_usage) + if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); if (unlikely(c->gc_pos.phase)) @@ -526,7 +496,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) do_btree_insert_one(trans, i); err: - if (fs_usage) { + if (marking) { bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } @@ -609,9 +579,17 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ btree_trans_sort_updates(trans); - btree_trans_lock_write(trans, true); + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); + ret = bch2_trans_commit_write_locked(trans, stopped_at); - btree_trans_lock_write(trans, false); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); /* * Drop journal reservation after dropping write locks, since dropping diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0c2ca7601fde..c90c2d1b7706 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -498,14 +498,18 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) } } -static inline void update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline int update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - BUG_ON(idx < 0); + if (idx < 0) + return -1; + + if (!fs_usage) + return 0; switch (r->data_type) { case BCH_DATA_BTREE: @@ -519,6 +523,7 @@ static 
inline void update_replicas(struct bch_fs *c, break; } fs_usage->replicas[idx] += sectors; + return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -579,14 +584,29 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +int bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) { struct replicas_delta *d = r->d; struct replicas_delta *top = (void *) r->d + r->used; unsigned i; + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (update_replicas(c, fs_usage, &d->r, d->delta)) { + top = d; + goto unwind; + } + + if (!fs_usage) + return 0; + fs_usage->nr_inodes += r->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { @@ -594,13 +614,11 @@ void bch2_replicas_delta_list_apply(struct bch_fs *c, fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; } - while (d != top) { - BUG_ON((void *) d > (void *) top); - - update_replicas(c, fs_usage, &d->r, d->delta); - - d = (void *) d + replicas_entry_bytes(&d->r) + 8; - } + return 0; +unwind: + for (d = r->d; d != top; d = replicas_delta_next(d)) + update_replicas(c, fs_usage, &d->r, -d->delta); + return -1; } #define do_mark_fn(fn, c, pos, flags, ...) \ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 5f91a57abc70..336729f763e1 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -279,9 +279,9 @@ int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); -void bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); +int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, -- cgit From 887c2a4ee5480d725d39a0d611a426040287188f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2019 09:56:39 -0400 Subject: bcachefs: bch2_btree_iter_fix_key_modified() This is considerably cheaper than bch2_btree_node_iter_fix(), for cases where the key was only modified and key ordering isn't changing. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 24 ++++++++++++++++++++++++ fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/extents.c | 13 +++++-------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c264b927f992..f849120fcbce 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -501,6 +501,30 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } +static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_node_iter *node_iter = &iter->l[0].iter; + + if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { + bkey_disassemble(b, where, &iter->k); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } +} + +void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_iter *linked; + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); + __bch2_btree_iter_verify(linked, b); + } +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1b7262d7e284..a05e542b3792 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -104,6 +104,8 @@ static inline void bch2_btree_iter_verify(struct btree_iter *iter, static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif +void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, + struct bkey_packed *); void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b427bc1f0f9c..201f4953acac 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1091,8 +1091,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); break; case BCH_EXTENT_OVERLAP_BACK: @@ -1127,8 +1126,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, _k, u64s, 0); } else { extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); } break; @@ -1158,8 +1156,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); extent_bset_insert(c, iter, &split.k); break; @@ -1259,8 +1256,8 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, btree_account_key_drop(l->b, _k); _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); } break; } -- cgit From f7f63211a4e7ecc4c56e3bc1062a4f9711dfd0aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Oct 2019 03:57:58 -0400 Subject: bcachefs: Don't use extent_ptr_decoded_append() in write path (fixup 
patch) bch2_extent_ptr_decoded_append() is more general than we need here; we know we're initializing a new extent so e.g. we're going to need the crc entry. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 3f075dcfafb2..567a553112d1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -631,10 +631,12 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { - struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct bkey_i_extent *e; struct bch_extent_ptr *ptr; op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; e->k.version = version; -- cgit From f8f30863382c0d905196ea7606c14524d1f21fd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Nov 2019 21:16:51 -0400 Subject: bcachefs: Avoid atomics in write fast path This adds some horrible hacks, but the atomic ops for closures were getting to be a pretty expensive part of the write path. We don't want to rip out closures entirely from the write path, because they're used for e.g. waiting on the allocator, or waiting on the journal flush, and that stuff would get really ugly without closures. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 38 ++++++++++++-------------- fs/bcachefs/io.c | 74 ++++++++++++++++++++++++++++++-------------------- fs/bcachefs/io.h | 2 ++ fs/bcachefs/io_types.h | 1 + 4 files changed, 65 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 93f6cdbbf7c7..436676f4fa2a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -54,7 +54,7 @@ struct bch_writepage_io { }; struct dio_write { - struct closure cl; + struct completion done; struct kiocb *req; struct mm_struct *mm; unsigned loop:1, @@ -1755,8 +1755,6 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } -static void bch2_dio_write_loop_async(struct closure *); - static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; @@ -1830,23 +1828,20 @@ static long bch2_dio_write_loop(struct dio_write *dio) task_io_account_write(bio->bi_iter.bi_size); - closure_call(&dio->op.cl, bch2_write, NULL, &dio->cl); - if (!dio->sync && !dio->loop && dio->iter.count) { if (bch2_dio_write_copy_iov(dio)) { - dio->op.error = -ENOMEM; - goto err_wait_io; + dio->sync = true; + goto do_io; } } -err_wait_io: +do_io: dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (!dio->sync) { - continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); + if (dio->sync) + wait_for_completion(&dio->done); + else return -EIOCBQUEUED; - } - - closure_sync(&dio->cl); loop: i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); @@ -1863,7 +1858,9 @@ loop: put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; + bio_reset(bio, NULL, REQ_OP_WRITE); + reinit_completion(&dio->done); } ret = dio->op.error ?: ((long) dio->op.written << 9); @@ -1875,8 +1872,6 @@ err: if (dio->free_iov) kfree(dio->iter.__iov); - closure_debug_destroy(&dio->cl); - sync = dio->sync; bio_put(bio); @@ -1890,11 +1885,14 @@ err: return ret; } -static void bch2_dio_write_loop_async(struct closure *cl) +static void bch2_dio_write_loop_async(struct bch_write_op *op) { - struct 
dio_write *dio = container_of(cl, struct dio_write, cl); + struct dio_write *dio = container_of(op, struct dio_write, op); - bch2_dio_write_loop(dio); + if (dio->sync) + complete(&dio->done); + else + bch2_dio_write_loop(dio); } static noinline @@ -1922,7 +1920,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); - closure_init(&dio->cl, NULL); + init_completion(&dio->done); dio->req = req; dio->mm = current->mm; dio->loop = false; @@ -1933,6 +1931,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->iter = *iter; bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = opts.foreground_target; op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); @@ -1962,7 +1961,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) err: bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); - closure_debug_destroy(&dio->cl); bio_put(bio); return ret; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 567a553112d1..7a2368407a80 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -513,7 +513,12 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - closure_return(cl); + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /** @@ -622,8 +627,10 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else + else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) closure_put(cl); + else + continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); } static void init_append_extent(struct bch_write_op *op, @@ -828,15 +835,14 @@ static enum prep_encoded_ret { return PREP_ENCODED_OK; } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) { struct bch_fs *c = op->c; struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; - struct bkey_i *key_to_write; void *ec_buf; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; + struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -855,6 +861,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ if (ec_buf) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, @@ -1004,31 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter.bi_size = total_output; do_write: /* might have done a realloc... 
*/ + bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - bch2_ec_add_backpointer(c, wp, - bkey_start_pos(&key_to_write->k), - total_input >> 9); - - bch2_alloc_sectors_done(c, wp); - - dst->bi_end_io = bch2_write_endio; - dst->bi_private = &op->cl; - dst->bi_opf = REQ_OP_WRITE; - - closure_get(dst->bi_private); - - bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, - key_to_write); + *_dst = dst; return more; csum_err: bch_err(c, "error verifying existing checksum while " "rewriting existing data (memory corruption?)"); ret = -EIO; err: - bch2_alloc_sectors_done(c, wp); - if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); if (to_wbio(dst)->put_bio) @@ -1042,11 +1033,17 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; + struct bio *bio; + bool skip_put = true; int ret; again: memset(&op->failed, 0, sizeof(op->failed)); do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) @@ -1080,21 +1077,38 @@ again: } bch2_open_bucket_get(c, wp, &op->open_buckets); - - ret = bch2_write_extent(op, wp); + ret = bch2_write_extent(op, wp, &bio); + bch2_alloc_sectors_done(c, wp); if (ret < 0) goto err; + + if (ret) + skip_put = false; + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf = REQ_OP_WRITE; + + if (!skip_put) + closure_get(bio->bi_private); + else + op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + key_to_write); } while (ret); - continue_at(cl, bch2_write_index, index_update_wq(op)); + if (!skip_put) + continue_at(cl, bch2_write_index, index_update_wq(op)); return; err: op->error = ret; - continue_at(cl, !bch2_keylist_empty(&op->insert_keys) - ? bch2_write_index - : bch2_write_done, index_update_wq(op)); + continue_at(cl, bch2_write_index, index_update_wq(op)); return; flush_io: closure_sync(cl); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 8a5d45f48045..81fc549a0c97 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -37,6 +37,7 @@ enum bch_write_flags { /* Internal: */ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -71,6 +72,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { op->c = c; + op->end_io = NULL; op->flags = 0; op->written = 0; op->error = 0; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 0f7fad041205..85dfcb0fdf51 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -95,6 +95,7 @@ struct bch_write_bio { struct bch_write_op { struct closure cl; struct bch_fs *c; + void (*end_io)(struct bch_write_op *); u64 start_time; unsigned written; /* sectors */ -- cgit From 7edcfbfefe5c18ea5df6bfdaca405003a0a87c51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Nov 2019 21:35:25 -0400 Subject: bcachefs: Don't hold inode lock longer than necessary in dio write path In theory we should be able to do (non appending/extending) dio writes without taking the inode lock at all - but this gets us most of the way there. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 89 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 436676f4fa2a..37c81a664430 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1774,9 +1774,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (dio->loop) goto loop; - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - /* Write and invalidate pagecache range that we're writing to: */ offset = req->ki_pos + (dio->op.written << 9); ret = write_invalidate_inode_pages_range(mapping, @@ -1904,15 +1901,39 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; + bool locked = true, extending; ssize_t ret; - lockdep_assert_held(&inode->v.i_rwsem); + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); - if (unlikely(!iter->count)) - return 0; + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - return -EINVAL; + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } bio = bio_alloc_bioset(NULL, iov_iter_npages(iter, BIO_MAX_VECS), @@ -1924,8 +1945,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->req = req; dio->mm = current->mm; dio->loop = false; - dio->sync = is_sync_kiocb(req) || - req->ki_pos + iter->count > inode->v.i_size; + dio->sync = is_sync_kiocb(req) || extending; dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; @@ -1944,7 +1964,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) - goto err; + goto err_put_bio; dio->op.nr_replicas = dio->op.opts.data_replicas; @@ -1955,55 +1975,54 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) req->ki_pos >> 9), iter->count >> 9, dio->op.opts.data_replicas)) - goto err; + goto err_put_bio; - return bch2_dio_write_loop(dio); + ret = bch2_dio_write_loop(dio); err: + if (locked) + inode_unlock(&inode->v); + if (ret > 0) + req->ki_pos += ret; + return ret; +err_put_bio: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); - return ret; + inode_dio_end(&inode->v); + goto err; } -static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); ssize_t ret; if (iocb->ki_flags & IOCB_DIRECT) return bch2_direct_write(iocb, from); + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); if (ret) - return ret; + goto unlock; ret = file_update_time(file); if (ret) - return ret; - - ret = iocb->ki_flags & IOCB_DIRECT - 
? bch2_direct_write(iocb, from) - : bch2_buffered_write(iocb, from); + goto unlock; + ret = bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; - - return ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp); - bool direct = iocb->ki_flags & IOCB_DIRECT; - ssize_t ret; - - inode_lock(&inode->v); - ret = generic_write_checks(iocb, from); - if (ret > 0) - ret = __bch2_write_iter(iocb, from); +unlock: inode_unlock(&inode->v); - if (ret > 0 && !direct) + if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; -- cgit From 6d01598ecd151b22feff9d8bc7f0611adacc5f28 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Sun, 3 Nov 2019 21:50:32 -0800 Subject: bcachefs: Fix uninitialized field in hash_check_init() The chain_end field was not initialized before use in hash_set_chain_start. Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5acf1fb64543..3cced2b99f3f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -147,6 +147,7 @@ struct hash_check { static void hash_check_init(struct hash_check *h) { h->chain = NULL; + h->chain_end = 0; } static void hash_stop_chain(struct btree_trans *trans, -- cgit From 1b783a690dd51a64f58091d060468f7c32f13f20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Oct 2019 18:24:26 -0400 Subject: bcachefs: Add pagecache_add lock to buffered IO path, fault path Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 37c81a664430..3475b388d2fa 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -571,13 +571,13 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } bch2_set_page_dirty(c, inode, page, &res, 0, len); + bch2_page_reservation_put(c, inode, &res); + wait_for_stable_page(page); out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); - bch2_page_reservation_put(c, inode, &res); - return ret; } -- cgit From 54847d253ab3a4980ed8322a618a9521a2d6cd45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Nov 2019 14:11:53 -0500 Subject: bcachefs: DIO write path only needs to shoot down pagecache once, not twice Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 3475b388d2fa..185a37a6705e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1767,24 +1767,13 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio_vec *bv; unsigned unaligned; u64 new_i_size; - loff_t offset; bool sync; long ret; if (dio->loop) goto loop; - /* Write and invalidate pagecache range that we're writing to: */ - offset = req->ki_pos + (dio->op.written << 9); - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + iov_iter_count(&dio->iter) - 1); - if (unlikely(ret)) - goto err; - while (1) { - offset = req->ki_pos + (dio->op.written << 9); - if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); @@ -1814,14 +1803,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; } - /* gup might have faulted pages back in: */ - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + 
bio->bi_iter.bi_size - 1); - if (unlikely(ret)) - goto err; - - dio->op.pos = POS(inode->v.i_ino, offset >> 9); + dio->op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->op.written); task_io_account_write(bio->bi_iter.bi_size); @@ -1896,6 +1879,7 @@ static noinline ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) { struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); @@ -1977,6 +1961,12 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->op.opts.data_replicas)) goto err_put_bio; + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + ret = bch2_dio_write_loop(dio); err: if (locked) -- cgit From a023127a28d4b8d651d3ccb41dd7da2ba534390a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Nov 2022 16:45:28 -0400 Subject: bcachefs: Eliminate function calls in DIO fastpaths We can assume that usually buffered and O_DIRECT IO won't be mixed, and the calls to flush the page cache won't be needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 185a37a6705e..a1767ee85591 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -77,7 +77,7 @@ struct dio_read { }; /* pagecache_block must be held */ -static int write_invalidate_inode_pages_range(struct address_space *mapping, +static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, loff_t start, loff_t end) { int ret; @@ -1693,11 +1693,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct blk_plug plug; - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - return ret; + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + } file_accessed(file); @@ -1961,11 +1963,13 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->op.opts.data_replicas)) goto err_put_bio; - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; + if (unlikely(mapping->nrpages)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } ret = bch2_dio_write_loop(dio); err: -- cgit From 4a1d8d3efcdedd0911941f236b2e3a6347f518c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2019 14:29:30 -0500 Subject: bcachefs: Fix setting of attributes mask in getattr Discovered by xfstests generic/553 Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 65556993bbb9..b241164f6f7e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -775,10 +775,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) stat->attributes 
|= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; return 0; } -- cgit From 677fc0562a237f6cfc1551e37673707096905ca7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Nov 2019 22:22:13 -0500 Subject: bcachefs: Some reflink fixes len might fit into a loff_t when aligned_len does not - make sure we use a u64 for aligned_len. Also, we weren't always extending the inode correctly. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 ++++++------- fs/bcachefs/reflink.c | 4 +++- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a1767ee85591..d17621b0713e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2799,8 +2799,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; s64 i_sectors_delta = 0; + u64 aligned_len; loff_t ret = 0; - loff_t aligned_len; if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; @@ -2829,10 +2829,10 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret < 0 || len == 0) goto err; - aligned_len = round_up(len, block_bytes(c)); + aligned_len = round_up((u64) len, block_bytes(c)); ret = write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + aligned_len); + pos_dst, pos_dst + len - 1); if (ret) goto err; @@ -2847,18 +2847,17 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret < 0) goto err; - ret <<= 9; /* * due to alignment, we might have remapped slightly more than requsted */ - ret = min(ret, len); + ret = min((u64) ret << 9, (u64) len); /* XXX get a quota reservation */ i_sectors_acct(c, dst, NULL, i_sectors_delta); spin_lock(&dst->v.i_lock); - if (pos_dst + len > dst->v.i_size) - i_size_write(&dst->v, pos_dst + len); + if (pos_dst + ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6d45ae24479d..6e71c5e8f9a2 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -290,10 +290,12 @@ err: ret2 = PTR_ERR_OR_ZERO(inode_iter); if (!ret2 && - inode_u.bi_size < new_i_size) + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, journal_seq, BTREE_INSERT_ATOMIC); + } } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; -- cgit From ef496cd268f45351820c3d268d01bd46c8b80b04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2019 14:58:36 -0400 Subject: bcachefs: Don't BUG_ON() sector count overflow Return an error instead (still work in progress...) 
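In outline, the fix replaces the BUG_ON() with a checked, saturating add on the u16 bucket sector counts, reporting overflow as a filesystem inconsistency and returning -EIO so the caller can schedule a full fsck instead of crashing. A kernel-style sketch of that pattern (the helper name checked_add_u16() is illustrative only; the actual patch uses its existing checked_add() helper, visible in the hunk below):

static inline bool checked_add_u16(u16 *res, s64 n)
{
	s64 sum = (s64) *res + n;

	/* saturate instead of wrapping, and tell the caller it happened */
	*res = clamp_t(s64, sum, 0, U16_MAX);
	return sum != *res;
}

/* at the call site in bch2_trans_mark_pointer(): */
	if (checked_add_u16(dst_sectors, sectors)) {
		bch2_fs_inconsistent(c,
			"bucket sector count overflow: %u + %lli > U16_MAX",
			*dst_sectors, sectors);
		ret = -EIO;	/* ask for a full fsck rather than BUG() */
		goto out;
	}
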
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c90c2d1b7706..f837cdda9433 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1464,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - unsigned old; + u16 *dst_sectors; bool overflow; int ret; @@ -1519,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } - if (!p.ptr.cached) { - old = u.dirty_sectors; - overflow = checked_add(u.dirty_sectors, sectors); - } else { - old = u.cached_sectors; - overflow = checked_add(u.cached_sectors, sectors); + dst_sectors = !p.ptr.cached + ? &u.dirty_sectors + : &u.cached_sectors; + + overflow = checked_add(*dst_sectors, sectors); + + if (overflow) { + bch2_fs_inconsistent(c, + "bucket sector count overflow: %u + %lli > U16_MAX", + *dst_sectors, sectors); + /* return an error indicating that we need full fsck */ + ret = -EIO; + goto out; } u.data_type = u.dirty_sectors || u.cached_sectors ? data_type : 0; - bch2_fs_inconsistent_on(overflow, c, - "bucket sector count overflow: %u + %lli > U16_MAX", - old, sectors); - BUG_ON(overflow); - a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); if (ret) -- cgit From e2ee3eaab72a059b29f079290b5773509df9524f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2019 15:32:11 -0500 Subject: bcachefs: Add an option for fsck error ratelimiting Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 13 +++++++++---- fs/bcachefs/error.h | 1 + fs/bcachefs/opts.h | 11 +++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 304ff92500be..5a5cfee623e2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca) enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
{ - struct fsck_err_state *s; + struct fsck_err_state *s = NULL; va_list args; bool fix = false, print = true, suppressing = false; char _buf[sizeof(s->buf)], *buf = _buf; @@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; - print = s->nr <= FSCK_ERR_RATELIMIT_NR; + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } buf = s->buf; print: va_start(args, fmt); @@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_lock(&c->fsck_error_lock); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > FSCK_ERR_RATELIMIT_NR) + if (s->ratelimited) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 2591e12305b7..7dcb0f6552fc 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -114,6 +114,7 @@ struct fsck_err_state { struct list_head list; const char *fmt; u64 nr; + bool ratelimited; char buf[512]; }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index a6f1d3ec7b90..2bd8bce43269 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -68,6 +68,12 @@ enum opt_type { * - helptext */ +#ifdef __KERNEL__ +#define RATELIMIT_ERRORS true +#else +#define RATELIMIT_ERRORS false +#endif + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FORMAT, \ @@ -227,6 +233,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, RATELIMIT_ERRORS, \ + NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From f58c22e76febdfeff9c92fe7f3355bd1eea6645b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Nov 2019 15:56:04 -0500 Subject: bcachefs: Avoid calling bch2_btree_iter_relock() in bch2_btree_iter_traverse() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f849120fcbce..f05a5e718181 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -294,9 +294,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) __flatten static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? btree_iter_get_locks(iter, false, trace) - : true; + return btree_iter_get_locks(iter, false, trace); } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, @@ -1098,7 +1096,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (bch2_btree_iter_relock(iter, false)) + /* + * if we need interior nodes locked, call btree_iter_relock() to make + * sure we walk back up enough that we lock them: + */ + if (iter->uptodate == BTREE_ITER_NEED_RELOCK || + iter->locks_want > 1) + bch2_btree_iter_relock(iter, false); + + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) return 0; /* -- cgit From 6baf2730ccaf0bbbe87f10fb34692441942b59ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2019 15:00:08 -0500 Subject: bcachefs: Inline fast path of bch2_increment_clock() Shaving more cycles. 
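In essence the fast path moves into a static inline in clock.h, so the common case costs one this_cpu_add_return() plus a compare, and only a full per-cpu buffer pays for the out-of-line call. A condensed, commented rendering of the change (same names as the hunk below):

/* clock.h: slow path stays out of line */
void __bch2_increment_clock(struct io_clock *);

static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
					int rw)
{
	struct io_clock *clock = &c->io_clock[rw];

	/* buffer up ~1MB worth of IO in the per-cpu counter before doing real work */
	if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
		     IO_CLOCK_PCPU_SECTORS))
		__bch2_increment_clock(clock);
}
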
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 7 +++---- fs/bcachefs/clock.h | 13 ++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index e4486fcbea19..e227753563ab 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, return ret; } -void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) +void __bch2_increment_clock(struct io_clock *clock) { - struct io_clock *clock = &c->io_clock[rw]; struct io_timer *timer; unsigned long now; + unsigned sectors; /* Buffer up one megabyte worth of IO in the percpu counter */ preempt_disable(); - if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < - IO_CLOCK_PCPU_SECTORS)) { + if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { preempt_enable(); return; } diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 5cb043c579d8..bfbbca8a207b 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, unsigned long); -void bch2_increment_clock(struct bch_fs *, unsigned, int); + +void __bch2_increment_clock(struct io_clock *); + +static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) + __bch2_increment_clock(clock); +} void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); -- cgit From fab4f8c6538810e31d7d853333143621091f5dd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2019 15:03:09 -0500 Subject: bcachefs: Make __bch2_bkey_cmp_packed() smaller We can probably get rid of the version that dispatches based on type checking too. 
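For readability, the new body of __bch2_bkey_cmp_packed(), condensed from the hunk below: rather than a four-way switch on which side is packed, unpack whichever side still is and finish with a single bkey_cmp() on the positions:

int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
			   const struct bkey_packed *r,
			   const struct btree *b)
{
	struct bkey unpacked;

	/* both packed: compare in the packed format directly */
	if (likely(bkey_packed(l) && bkey_packed(r)))
		return __bch2_bkey_cmp_packed_format_checked(l, r, b);

	/* otherwise unpack whichever side still is ... */
	if (bkey_packed(l)) {
		__bkey_unpack_key_format_checked(b, &unpacked, l);
		l = (void *) &unpacked;
	} else if (bkey_packed(r)) {
		__bkey_unpack_key_format_checked(b, &unpacked, r);
		r = (void *) &unpacked;
	}

	/* ... and compare the unpacked positions */
	return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
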
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index dd551cc3a162..32e4917dc004 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1061,26 +1061,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l, const struct bkey_packed *r, const struct btree *b) { - int packed = bkey_lr_packed(l, r); + struct bkey unpacked; - if (likely(packed == BKEY_PACKED_BOTH)) + if (likely(bkey_packed(l) && bkey_packed(r))) return __bch2_bkey_cmp_packed_format_checked(l, r, b); - switch (packed) { - case BKEY_PACKED_NONE: - return bkey_cmp(((struct bkey *) l)->p, - ((struct bkey *) r)->p); - case BKEY_PACKED_LEFT: - return __bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) l, - &((struct bkey *) r)->p); - case BKEY_PACKED_RIGHT: - return -__bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) r, - &((struct bkey *) l)->p); - default: - unreachable(); + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void*) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void*) &unpacked; } + + return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten -- cgit From c45376866aa1db911dfae2703ff919519757e780 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2019 15:14:10 -0500 Subject: bcachefs: Pipeline binary searches and linear searches This makes prefetching for the linear search at the end of the lookup much more effective, and is a couple percent speedup. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 114 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 6b3b7bd4002b..3e69b48cb67f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1326,6 +1326,25 @@ static int bset_search_tree_slowpath(const struct btree *b, packed_search, search) < 0; } +static inline void prefetch_four_cachelines(void *p) +{ +#ifdef CONFIG_X86_64 + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" + "prefetcht0 [%0 - 127 + 64 * 2];" + "prefetcht0 [%0 - 127 + 64 * 3];" + ".att_syntax prefix;" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif +} + __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, struct bset_tree *t, @@ -1333,34 +1352,12 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f = bkey_float_get(base, 1); - void *p; + struct bkey_float *f; unsigned inorder, n = 1; - while (1) { - if (likely(n << 4 < t->size)) { - p = bkey_float_get(base, n << 4); - prefetch(p); - } else if (n << 3 < t->size) { - inorder = __eytzinger1_to_inorder(n, t->size, t->extra); - p = bset_cacheline(b, t, inorder); -#ifdef CONFIG_X86_64 - asm(".intel_syntax noprefix;" - "prefetcht0 [%0 - 127 + 64 * 0];" - "prefetcht0 [%0 - 127 + 64 * 1];" - "prefetcht0 [%0 - 127 + 64 * 2];" - "prefetcht0 [%0 - 127 + 64 * 3];" - ".att_syntax prefix;" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 
0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif - } else if (n >= t->size) - break; + do { + if (likely(n << 4 < t->size)) + prefetch(bkey_float_get(base, n << 4)); f = bkey_float_get(base, n); @@ -1391,17 +1388,12 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, } } -/* - * Returns the first key greater than or equal to @search - */ -__always_inline __flatten -static struct bkey_packed *bch2_bset_search(struct btree *b, +static __always_inline __flatten +struct bkey_packed *__bch2_bset_search(struct btree *b, struct bset_tree *t, struct bpos *search, - struct bkey_packed *packed_search, const struct bkey_packed *lossy_packed_search) { - struct bkey_packed *m; /* * First, we search for a cacheline, then lastly we do a linear search @@ -1420,11 +1412,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: - m = btree_bkey_first(b, t); - break; + return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - m = bset_search_write_set(b, t, search, lossy_packed_search); - break; + return bset_search_write_set(b, t, search, lossy_packed_search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1436,10 +1426,20 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, if (bkey_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); - m = bset_search_tree(b, t, search, lossy_packed_search); - break; + return bset_search_tree(b, t, search, lossy_packed_search); + default: + unreachable(); } +} +static __always_inline __flatten +struct bkey_packed *bch2_bset_search_linear(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + struct bkey_packed *m) +{ if (lossy_packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, @@ -1462,6 +1462,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, return m; } +/* + * Returns the first key greater than or equal to @search + */ +static __always_inline __flatten +struct bkey_packed *bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search) +{ + struct bkey_packed *m = __bch2_bset_search(b, t, search, + lossy_packed_search); + + return bch2_bset_search_linear(b, t, search, + packed_search, lossy_packed_search, m); +} + /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1552,9 +1569,10 @@ __flatten void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; struct btree_node_iter_set *pos = iter->data; + struct bkey_packed *k[MAX_BSETS]; + unsigned i; EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); @@ -1573,14 +1591,20 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, return; } - for_each_bset(b, t) { - struct bkey_packed *k = bch2_bset_search(b, t, search, - packed_search, &p); + for (i = 0; i < b->nsets; i++) { + k[i] = __bch2_bset_search(b, b->set + i, search, &p); + prefetch_four_cachelines(k[i]); + } + + for (i = 0; i < b->nsets; i++) { + struct bset_tree *t = b->set + i; struct bkey_packed *end = btree_bkey_last(b, t); - if (k != end) + k[i] = bch2_bset_search_linear(b, t, search, + 
packed_search, &p, k[i]); + if (k[i] != end) *pos++ = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, k[i]), __btree_node_key_to_offset(b, end) }; } -- cgit From 70438dc3f0a2125ddaaa6ace99fc43d060b1c2e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2019 15:04:13 -0500 Subject: bcachefs: bch2_read_extent() microoptimizations Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 7a2368407a80..272477fb558f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1272,7 +1272,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) closure_return_with_destructor(cl, promote_done); } -noinline static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, struct bpos pos, @@ -1346,7 +1345,8 @@ err: return NULL; } -static inline struct promote_op *promote_alloc(struct bch_fs *c, +noinline +static struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, @@ -1910,7 +1910,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1922,8 +1922,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + if (orig->opts.promote_target) + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { EBUG_ON(pick.crc.compression_type); @@ -1951,7 +1952,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * data in the write path, but we're not going to use it all * here: */ - BUG_ON(rbio->bio.bi_iter.bi_size < + EBUG_ON(rbio->bio.bi_iter.bi_size < pick.crc.compressed_size << 9); rbio->bio.bi_iter.bi_size = pick.crc.compressed_size << 9; @@ -1986,10 +1987,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, noclone: rbio = orig; rbio->bio.bi_iter = iter; - BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; rbio->submit_time = local_clock(); @@ -2005,6 +2006,7 @@ noclone: rbio->hole = 0; rbio->retry = 0; rbio->context = 0; + /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->pos = pos; @@ -2021,11 +2023,11 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - percpu_down_read(&c->mark_lock); + rcu_read_lock(); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); - if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); trace_read_split(&orig->bio); } -- cgit From 1bdb67e8cb42c156954dfe2bfb1fa6ca5eee3633 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2019 16:37:29 -0500 
Subject: bcachefs: kill BFLOAT_FAILED_PREV The assumption underlying BFLOAT_FAILED_PREV was wrong; the comparison we're doing in bset_search_tree() doesn't have to tell the pivot apart from the previous key, it just has to tell if search is definitely greater than or equal to the pivot. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 59 ++++------------------------------------------- fs/bcachefs/bset.h | 1 - fs/bcachefs/btree_cache.c | 2 -- 3 files changed, 4 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3e69b48cb67f..16bcc2ef163a 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -283,9 +283,8 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, /* Auxiliary search trees */ #define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) -#define BFLOAT_FAILED_PREV (U8_MAX - 1) -#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) -#define BFLOAT_FAILED (U8_MAX - 2) +#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 1) +#define BFLOAT_FAILED (U8_MAX - 1) #define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) @@ -698,14 +697,11 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *p = tree_to_prev_bkey(b, t, j); struct bkey_packed *l, *r; unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; unsigned mantissa; int shift, exponent, high_bit; - EBUG_ON(bkey_next(p) != m); - if (is_power_of_2(j)) { l = min_key; @@ -747,8 +743,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * the original key. */ - if (!bkey_packed(l) || !bkey_packed(r) || - !bkey_packed(p) || !bkey_packed(m) || + if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || !b->nr_key_bits) { f->exponent = BFLOAT_FAILED_UNPACKED; return; @@ -798,19 +793,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, bfloat_mantissa_set(f, j, mantissa); - /* - * The bfloat must be able to tell its key apart from the previous key - - * if its key and the previous key don't differ in the required bits, - * flag as failed - unless the keys are actually equal, in which case - * we aren't required to return a specific one: - */ - if (exponent > 0 && - bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && - bkey_cmp_packed(b, p, m)) { - f->exponent = BFLOAT_FAILED_PREV; - return; - } - /* * f->mantissa must compare >= the original key - for transitivity with * the comparison in bset_search_tree. If we're dropping set bits, @@ -1805,9 +1787,6 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) case BFLOAT_FAILED_UNPACKED: stats->failed_unpacked++; break; - case BFLOAT_FAILED_PREV: - stats->failed_prev++; - break; case BFLOAT_FAILED_OVERFLOW: stats->failed_overflow++; break; @@ -1820,9 +1799,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey_packed *k) { struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey_packed *l, *r, *p; - struct bkey uk, up; - char buf1[200], buf2[200]; + struct bkey uk; unsigned j, inorder; if (out->pos != out->end) @@ -1848,34 +1825,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ilog2(j), uk.p.inode, uk.p.offset); break; - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - pr_buf(out, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - break; case BFLOAT_FAILED_OVERFLOW: uk = bkey_unpack_key(b, k); pr_buf(out, diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 209d2ed5db3a..0e4f27dbb8ef 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -598,7 +598,6 @@ struct bset_stats { size_t floats; size_t failed_unpacked; - size_t failed_prev; size_t failed_overflow; }; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index eb38fa50e054..86ec1da42892 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -911,7 +911,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n" - " failed prev %zu\n" " failed overflow %zu\n", f->key_u64s, f->bits_per_field[0], @@ -930,6 +929,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.unpacked_keys, stats.floats, stats.failed_unpacked, - stats.failed_prev, stats.failed_overflow); } -- cgit From 58404bb2362d8198c8b6618669ff949f84743ff6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Oct 2019 14:56:20 -0400 Subject: bcachefs: Fall back to slowpath on exact comparison This is basically equivalent to the original strategy of falling back to checking against the original key when the original key and previous key didn't differ in the required bits - except, now we only fall back when the search key doesn't differ in the required bits, which ends up being a bit faster. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 100 +++++++++++++++++++++------------------------- fs/bcachefs/bset.h | 3 +- fs/bcachefs/btree_cache.c | 6 +-- 3 files changed, 48 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 16bcc2ef163a..5b6e29b65f5b 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -282,9 +282,8 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, /* Auxiliary search trees */ -#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) -#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 1) -#define BFLOAT_FAILED (U8_MAX - 1) +#define BFLOAT_FAILED_UNPACKED U8_MAX +#define BFLOAT_FAILED U8_MAX #define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) @@ -792,23 +791,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, mantissa |= ~(~0U << -exponent); bfloat_mantissa_set(f, j, mantissa); - - /* - * f->mantissa must compare >= the original key - for transitivity with - * the comparison in bset_search_tree. If we're dropping set bits, - * increment it: - */ - if (exponent > (int) bch2_bkey_ffs(b, m)) { - if (j < BFLOAT_32BIT_NR - ? 
f->mantissa32 == U32_MAX - : f->mantissa16 == U16_MAX) - f->exponent = BFLOAT_FAILED_OVERFLOW; - - if (j < BFLOAT_32BIT_NR) - f->mantissa32++; - else - f->mantissa16++; - } } /* bytes remaining - only valid for last bset: */ @@ -1298,16 +1280,6 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, return rw_aux_to_bkey(b, t, l); } -noinline -static int bset_search_tree_slowpath(const struct btree *b, - struct bset_tree *t, struct bpos *search, - const struct bkey_packed *packed_search, - unsigned n) -{ - return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), - packed_search, search) < 0; -} - static inline void prefetch_four_cachelines(void *p) { #ifdef CONFIG_X86_64 @@ -1327,6 +1299,22 @@ static inline void prefetch_four_cachelines(void *p) #endif } +static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + const struct bkey_float *f, + unsigned idx) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; + + return f->exponent > key_bits_start; +#else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; + unsigned mantissa_bits = n < BFLOAT_32BIT_NR ? 32 : 16; + + return f->exponent + mantissa_bits < key_bits_end; +#endif +} + __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, struct bset_tree *t, @@ -1335,7 +1323,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, { struct ro_aux_tree *base = ro_aux_tree_base(b, t); struct bkey_float *f; - unsigned inorder, n = 1; + struct bkey_packed *k; + unsigned inorder, n = 1, l, r; + int cmp; do { if (likely(n << 4 < t->size)) @@ -1343,13 +1333,26 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, f = bkey_float_get(base, n); - if (packed_search && - likely(f->exponent < BFLOAT_FAILED)) - n = n * 2 + (bfloat_mantissa(f, n) < - bkey_mantissa(packed_search, f, n)); - else - n = n * 2 + bset_search_tree_slowpath(b, t, - search, packed_search, n); + if (!unlikely(packed_search)) + goto slowpath; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + + l = bfloat_mantissa(f, n); + r = bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + goto slowpath; + + n = n * 2 + (l < r); + continue; +slowpath: + k = tree_to_bkey(b, t, n); + cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); + if (!cmp) + return k; + + n = n * 2 + (cmp < 0); } while (n < t->size); inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); @@ -1783,14 +1786,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) stats->floats += t->size - 1; for (j = 1; j < t->size; j++) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - stats->failed_unpacked++; - break; - case BFLOAT_FAILED_OVERFLOW: - stats->failed_overflow++; - break; - } + stats->failed += + bkey_float(b, t, j)->exponent == + BFLOAT_FAILED; } } } @@ -1817,7 +1815,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, return; switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: + case BFLOAT_FAILED: uk = bkey_unpack_key(b, k); pr_buf(out, " failed unpacked at depth %u\n" @@ -1825,13 +1823,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ilog2(j), uk.p.inode, uk.p.offset); break; - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - pr_buf(out, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - break; } } diff --git a/fs/bcachefs/bset.h 
b/fs/bcachefs/bset.h index 0e4f27dbb8ef..3f5b7378a0a9 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -597,8 +597,7 @@ struct bset_stats { } sets[BSET_TREE_NR_TYPES]; size_t floats; - size_t failed_unpacked; - size_t failed_overflow; + size_t failed; }; void bch2_btree_keys_stats(struct btree *, struct bset_stats *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 86ec1da42892..035da548737b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -910,8 +910,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" - " failed unpacked %zu\n" - " failed overflow %zu\n", + " failed unpacked %zu\n", f->key_u64s, f->bits_per_field[0], f->bits_per_field[1], @@ -928,6 +927,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, - stats.failed_unpacked, - stats.failed_overflow); + stats.failed); } -- cgit From b904a7991802d2fba1f8c59e6f0790021342c0ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2019 19:40:09 -0500 Subject: bcachefs: Go back to 16 bit mantissa bkey floats The previous optimizations means using 32 bit mantissas are now a net loss - having bkey_float be only 4 bytes is good for prefetching. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 99 +++++++++++++++--------------------------------------- 1 file changed, 28 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 5b6e29b65f5b..af20b9803608 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -285,33 +285,20 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, #define BFLOAT_FAILED_UNPACKED U8_MAX #define BFLOAT_FAILED U8_MAX -#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) - struct bkey_float { u8 exponent; u8 key_offset; - union { - u32 mantissa32; - struct { - u16 mantissa16; - u16 _pad; - }; - }; -} __packed; - -#define BFLOAT_32BIT_NR 32U + u16 mantissa; +}; +#define BKEY_MANTISSA_BITS 16 static unsigned bkey_float_byte_offset(unsigned idx) { - int d = (idx - BFLOAT_32BIT_NR) << 1; - - d &= ~(d >> 31); - - return idx * 6 - d; + return idx * sizeof(struct bkey_float); } struct ro_aux_tree { - struct bkey_float _d[0]; + struct bkey_float f[0]; }; struct rw_aux_tree { @@ -366,8 +353,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(bkey_float_byte_offset(t->size) + - sizeof(u8) * t->size, 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + + t->size * sizeof(u8), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -406,17 +393,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b, return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); } -static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, - unsigned idx) -{ - return (void *) b + bkey_float_byte_offset(idx); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) { - return bkey_float_get(ro_aux_tree_base(b, t), idx); + return ro_aux_tree_base(b, t)->f + idx; } static void bset_aux_tree_verify(struct btree *b) @@ -650,21 +631,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, return idx; } -static inline unsigned bfloat_mantissa(const struct bkey_float *f, - unsigned 
idx) -{ - return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; -} - -static inline void bfloat_mantissa_set(struct bkey_float *f, - unsigned idx, unsigned mantissa) -{ - if (idx < BFLOAT_32BIT_NR) - f->mantissa32 = mantissa; - else - f->mantissa16 = mantissa; -} - static inline unsigned bkey_mantissa(const struct bkey_packed *k, const struct bkey_float *f, unsigned idx) @@ -684,9 +650,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ v >>= f->exponent & 7; #else - v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); + v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; #endif - return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; + return (u16) v; } static void make_bfloat(struct btree *b, struct bset_tree *t, @@ -697,7 +663,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); struct bkey_packed *l, *r; - unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; unsigned mantissa; int shift, exponent, high_bit; @@ -759,8 +724,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * of the key: we handle this later: */ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, bits, b->nr_key_bits) - 1); - exponent = high_bit - (bits - 1); + min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); + exponent = high_bit - (BKEY_MANTISSA_BITS - 1); /* * Then we calculate the actual shift value, from the start of the key @@ -769,12 +734,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - EBUG_ON(shift + bits > b->format.key_u64s * 64); + EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); #else shift = high_bit_offset + b->nr_key_bits - exponent - - bits; + BKEY_MANTISSA_BITS; EBUG_ON(shift < KEY_PACKED_BITS_START); #endif @@ -790,7 +755,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (exponent < 0) mantissa |= ~(~0U << -exponent); - bfloat_mantissa_set(f, j, mantissa); + f->mantissa = mantissa; } /* bytes remaining - only valid for last bset: */ @@ -803,14 +768,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) { - unsigned bytes = __bset_tree_capacity(b, t); - - if (bytes < 7 * BFLOAT_32BIT_NR) - return bytes / 7; - - bytes -= 7 * BFLOAT_32BIT_NR; - - return BFLOAT_32BIT_NR + bytes / 5; + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); } static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) @@ -1309,9 +1268,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, return f->exponent > key_bits_start; #else unsigned key_bits_end = high_bit_offset + b->nr_key_bits; - unsigned mantissa_bits = n < BFLOAT_32BIT_NR ? 
32 : 16; - return f->exponent + mantissa_bits < key_bits_end; + return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; #endif } @@ -1329,16 +1287,16 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, do { if (likely(n << 4 < t->size)) - prefetch(bkey_float_get(base, n << 4)); + prefetch(&base->f[n << 4]); - f = bkey_float_get(base, n); + f = &base->f[n]; if (!unlikely(packed_search)) goto slowpath; if (unlikely(f->exponent >= BFLOAT_FAILED)) goto slowpath; - l = bfloat_mantissa(f, n); + l = f->mantissa; r = bkey_mantissa(packed_search, f, n); if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) @@ -1361,16 +1319,15 @@ slowpath: * n would have been the node we recursed to - the low bit tells us if * we recursed left or recursed right. */ - if (n & 1) { - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else { - if (--inorder) { - n = eytzinger1_prev(n >> 1, t->size); - f = bkey_float_get(base, n); - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else + if (likely(!(n & 1))) { + --inorder; + if (unlikely(!inorder)) return btree_bkey_first(b, t); + + f = &base->f[eytzinger1_prev(n >> 1, t->size)]; } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); } static __always_inline __flatten -- cgit From f7f21ed382ea84f2dd4c448c7d937242a4dbbf97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Nov 2019 15:09:36 -0500 Subject: bcachefs: Remove some BKEY_PADDED uses Prep work for extents with inline data Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 8 ++------ fs/bcachefs/extents.c | 17 +++++++---------- 2 files changed, 9 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index e32fad5a91ac..2cac269b386f 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -418,7 +418,7 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_packed *prev = NULL, *k_packed; struct bkey_s k; struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; + struct bkey unpacked; memset(&nr, 0, sizeof(nr)); @@ -426,11 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c, if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > - BKEY_EXTENT_VAL_U64s_MAX); - - bch2_bkey_unpack(src, &tmp.k, k_packed); - k = bkey_i_to_s(&tmp.k); + k = __bkey_disassemble(src, k_packed, &unpacked); if (filter_whiteouts && bch2_bkey_normalize(c, k)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 201f4953acac..3cd0a79f8fe6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1217,7 +1217,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, struct bkey_i whiteout = *insert; struct bkey_packed *_k; struct bkey unpacked; - BKEY_PADDED(k) tmp; EBUG_ON(iter->level); EBUG_ON(!insert->k.size); @@ -1291,25 +1290,23 @@ next: bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); if (update_btree) { - bkey_copy(&tmp.k, insert); - if (deleting) - tmp.k.k.type = KEY_TYPE_discard; + insert->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - extent_bset_insert(c, iter, &tmp.k); + extent_bset_insert(c, iter, insert); } if (update_journal) { - bkey_copy(&tmp.k, !deleting ? insert : &whiteout); + struct bkey_i *k = !deleting ? 
insert : &whiteout; if (deleting) - tmp.k.k.type = KEY_TYPE_discard; + k->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - bch2_btree_journal_key(trans, iter, &tmp.k); + bch2_btree_journal_key(trans, iter, k); } bch2_cut_front(insert->k.p, insert); -- cgit From 0897705163b5c7eb9ecacad53c252adbbf72454a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 14:58:09 -0500 Subject: bcachefs: Be slightly less tricky with union usage This is to fix a valgrind complaint - the code was correct, but too tricky for valgrind to know that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3cd0a79f8fe6..b12798103763 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1386,16 +1386,18 @@ static unsigned bch2_crc_field_size_max[] = { }; static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src) + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { #define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ _dst.csum_type = _src.csum_type, \ _dst.compression_type = _src.compression_type, \ _dst._compressed_size = _src.compressed_size - 1, \ _dst._uncompressed_size = _src.uncompressed_size - 1, \ _dst.offset = _src.offset - switch (extent_entry_type(to_entry(dst))) { + switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); dst->crc32.csum = *((__le32 *) &src.csum.lo); @@ -1422,23 +1424,24 @@ void bch2_extent_crc_append(struct bkey_i *k, { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size - 1 <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + type = BCH_EXTENT_ENTRY_crc32; else if (bch_crc_bytes[new.csum_type] <= 10 && new.uncompressed_size - 1 <= CRC64_SIZE_MAX && new.nonce <= CRC64_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + type = BCH_EXTENT_ENTRY_crc64; else if (bch_crc_bytes[new.csum_type] <= 16 && new.uncompressed_size - 1 <= CRC128_SIZE_MAX && new.nonce <= CRC128_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + type = BCH_EXTENT_ENTRY_crc128; else BUG(); - bch2_extent_crc_pack(crc, new); + bch2_extent_crc_pack(crc, new, type); k->k.u64s += extent_entry_u64s(ptrs.end); @@ -1641,7 +1644,8 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, crc_l.uncompressed_size += crc_r.uncompressed_size; crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); } bch2_key_resize(l.k, l.k->size + r.k->size); -- cgit From b627c7d8f46ca74e78f6df63e8e2f15af124d1f7 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Sat, 9 Nov 2019 19:15:40 -0800 Subject: bcachefs: Set lost+found mode to 0700 For security and conformance with other filesystems, the lost+found directory should not be world or group accessible. 
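In permission-bit terms the change is only the mode passed to bch2_create_trans(); an illustrative userspace-style snippet (not part of the patch):

#include <sys/stat.h>

/* before: S_IFDIR|0755 -> drwxr-xr-x, group and other may list/traverse */
static const mode_t lostfound_before = S_IFDIR | 0755;

/* after:  S_IFDIR|0700 -> drwx------, owner (root) only */
static const mode_t lostfound_after = S_IFDIR | 0700;
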
Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3cced2b99f3f..0f2308e53d65 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -797,7 +797,7 @@ create_lostfound: bch2_create_trans(&trans, BCACHEFS_ROOT_INO, root_inode, lostfound_inode, &lostfound, - 0, 0, S_IFDIR|0755, 0, NULL, NULL)); + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); if (ret) bch_err(c, "error creating lost+found: %i", ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2c441a278044..d1184bf62cae 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1013,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_create_trans(&trans, BCACHEFS_ROOT_INO, &root_inode, &lostfound_inode, &lostfound, - 0, 0, S_IFDIR|0755, 0, + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); if (ret) goto err; -- cgit From c32bd3ad1fe595bb96c3c781f29c7002f73450a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2019 13:42:10 -0500 Subject: bcachefs: Fix erorr path in bch2_write() The error path in bch2_write wasn't updated when the end_io callback was added to bch_write_op. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 272477fb558f..f53eee7accc8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1176,7 +1176,12 @@ void bch2_write(struct closure *cl) err: if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); - closure_return(cl); + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /* Cache promotion on read */ -- cgit From 50fe5bd69c6d4a0cba58ee5dc8b9c72c1abc1d60 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Nov 2019 19:45:48 -0500 Subject: bcachefs: Use wbc_to_write_flags() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d17621b0713e..657559c2db14 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1025,6 +1025,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) * possible, else allocating a new one: */ static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, struct bch_writepage_state *w, struct bch_inode_info *inode, u64 sector, @@ -1050,6 +1051,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } static int __bch2_writepage(struct folio *folio, @@ -1161,7 +1163,7 @@ do_io: bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, sector, + bch2_writepage_io_alloc(c, wbc, w, inode, sector, nr_replicas_this_write); atomic_inc(&s->write_count); @@ -1178,9 +1180,6 @@ do_io: w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.wbio.bio.bi_opf |= REQ_SYNC; - offset += sectors; } -- cgit From 03c8c747a0f2ee5f2b45ad692d51f6e2bdce21cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Nov 2019 19:46:11 -0500 Subject: 
bcachefs: Make memcpy_to_bio() param const Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- fs/bcachefs/util.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index fc2ca798fbc3..a05ebe475c5a 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -556,7 +556,7 @@ size_t bch2_rand_range(size_t max) return rand; } -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) { struct bio_vec bv; struct bvec_iter iter; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 33589362f5df..1780a6831136 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -539,7 +539,7 @@ do { \ size_t bch2_rand_range(size_t); -void memcpy_to_bio(struct bio *, struct bvec_iter, void *); +void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); static inline void memcpy_u64s_small(void *dst, const void *src, -- cgit From 35189e09ab46785746df7007ed2a57ee78b56191 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 16:01:15 -0500 Subject: bcachefs: bkey_on_stack This implements code for storing small bkeys on the stack and allocating out of a mempool if they're too big. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 + fs/bcachefs/bkey_on_stack.h | 35 +++++++++++++++++ fs/bcachefs/bkey_sort.c | 13 +++++-- fs/bcachefs/ec.c | 12 ++++-- fs/bcachefs/extents.c | 18 ++++++--- fs/bcachefs/fs-io.c | 92 +++++++++++++++++++++++++-------------------- fs/bcachefs/fs.c | 29 ++++++++------ fs/bcachefs/io.c | 63 +++++++++++++++++++------------ fs/bcachefs/migrate.c | 16 +++++--- fs/bcachefs/move.c | 10 +++-- fs/bcachefs/reflink.c | 17 ++++++--- fs/bcachefs/super.c | 2 + 12 files changed, 205 insertions(+), 104 deletions(-) create mode 100644 fs/bcachefs/bkey_on_stack.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f8a040115fd1..344cf982124f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -729,6 +729,8 @@ struct bch_fs { atomic64_t key_version; + mempool_t large_bkey_pool; + /* REBALANCE */ struct bch_fs_rebalance rebalance; diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h new file mode 100644 index 000000000000..d4739038323f --- /dev/null +++ b/fs/bcachefs/bkey_on_stack.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_ON_STACK_H +#define _BCACHEFS_BKEY_ON_STACK_H + +#include "bcachefs.h" + +struct bkey_on_stack { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bkey_on_stack_init(struct bkey_on_stack *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bkey_on_stack_exit(struct bkey_on_stack *s, + struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 2cac269b386f..f5c0507ad79d 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -1,5 +1,6 @@ // 
SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -292,8 +293,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; + struct bkey_on_stack split; memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); heap_resort(iter, extent_sort_cmp, NULL); @@ -349,13 +352,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; + bkey_on_stack_realloc(&split, c, l.k->u64s); /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + bkey_reassemble(split.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), &split.k->k); __bch2_cut_front(r.k->p, l); extent_save(b, lk, l.k); @@ -363,7 +366,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_sift(iter, b, 0); extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(&tmp.k)); + &prev, bkey_i_to_s(split.k)); } else { bch2_cut_back(bkey_start_pos(r.k), l.k); extent_save(b, lk, l.k); @@ -373,6 +376,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_advance_prev(f, &nr, dst->start, &prev); dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + + bkey_on_stack_exit(&split, c); return nr; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f32b8e6bf2ce..b24f867520c3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -777,9 +778,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey_s_c k; struct bkey_s_extent e; struct bch_extent_ptr *ptr; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; int ret = 0, dev, idx; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -804,8 +806,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, dev = s->key.v.ptrs[idx].dev; - bkey_reassemble(&tmp.k, k); - e = bkey_i_to_s_extent(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + e = bkey_i_to_s_extent(sk.k); extent_for_each_ptr(e, ptr) if (ptr->dev != dev) @@ -816,7 +819,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ptr, idx); - bch2_trans_update(&trans, iter, &tmp.k); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -829,6 +832,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, } bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b12798103763..46eeaa574e86 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_on_stack.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -1132,7 +1133,11 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, break; } case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; + struct bkey_on_stack split; + + bkey_on_stack_init(&split); + bkey_on_stack_realloc(&split, c, k.k->u64s); + /* * The insert key falls 'in the middle' of k * The insert key splits k in 3: 
@@ -1147,18 +1152,19 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, * modify k _before_ doing the insert (which will move * what k points to) */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(l->b, _k); + bkey_reassemble(split.k, k.s_c); + split.k->k.needs_whiteout |= bkey_written(l->b, _k); - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); + bch2_cut_back(bkey_start_pos(&insert->k), &split.k->k); + BUG_ON(bkey_deleted(&split.k->k)); __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); bch2_btree_iter_fix_key_modified(iter, l->b, _k); - extent_bset_insert(c, iter, &split.k); + extent_bset_insert(c, iter, split.k); + bkey_on_stack_exit(&split, c); break; } } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 657559c2db14..478630fdf643 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -691,6 +692,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } } +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + static void readpage_bio_extend(struct readpages_iter *iter, struct bio *bio, unsigned sectors_this_extent, @@ -744,15 +757,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct bkey_on_stack sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + + bkey_on_stack_init(&sk); retry: while (1) { - BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -764,15 +779,16 @@ retry: if (ret) break; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -780,22 +796,9 @@ retry: bch2_trans_unlock(trans); - if (readpages_iter) { - bool want_full_extent = false; - - if (bkey_extent_is_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *i; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, i) - want_full_extent |= ((p.crc.csum_type != 0) | - (p.crc.compression_type != 0)); - } - - readpage_bio_extend(readpages_iter, &rbio->bio, - sectors, want_full_extent); - } + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -809,7 +812,7 @@ retry: bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); @@ -818,8 +821,12 @@ retry: if (ret == -EINTR) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + if (ret) { + 
bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); + } + + bkey_on_stack_exit(&sk, c); } void bch2_readahead(struct readahead_control *ractl) @@ -2353,6 +2360,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bkey_on_stack copy; struct btree_trans trans; struct btree_iter *src, *dst, *del = NULL; loff_t shift, new_size; @@ -2362,6 +2370,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; + bkey_on_stack_init(&copy); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* @@ -2430,7 +2439,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) copy; struct bkey_i delete; struct bkey_s_c k; struct bpos next_pos; @@ -2455,34 +2463,35 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_reassemble(&copy.k, k); + bkey_on_stack_realloc(&copy, c, k.k->u64s); + bkey_reassemble(copy.k, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { - bch2_cut_front(move_pos, &copy.k); - bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k)); + bch2_cut_front(move_pos, copy.k); + bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k->k)); } - copy.k.k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k)); + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k)); - ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end); + ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); if (ret) goto bkey_err; - if (bkey_cmp(atomic_end, copy.k.k.p)) { + if (bkey_cmp(atomic_end, copy.k->k.p)) { if (insert) { move_pos = atomic_end; move_pos.offset -= shift >> 9; goto reassemble; } else { - bch2_cut_back(atomic_end, &copy.k.k); + bch2_cut_back(atomic_end, &copy.k->k); } } bkey_init(&delete.k); delete.k.p = src->pos; - bch2_key_resize(&delete.k, copy.k.k.size); + bch2_key_resize(&delete.k, copy.k->k.size); next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; @@ -2495,12 +2504,12 @@ reassemble: * by the triggers machinery: */ if (insert && - bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k); + bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) { + bch2_cut_back(bkey_start_pos(&copy.k->k), &delete.k); } else if (!insert && - bkey_cmp(copy.k.k.p, + bkey_cmp(copy.k->k.p, bkey_start_pos(&delete.k)) > 0) { - bch2_cut_front(copy.k.k.p, &delete); + bch2_cut_front(copy.k->k.p, &delete); del = bch2_trans_copy_iter(&trans, src); BUG_ON(IS_ERR_OR_NULL(del)); @@ -2509,10 +2518,10 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, dst, &copy.k); + bch2_trans_update(&trans, dst, copy.k); bch2_trans_update(&trans, del ?: src, &delete); - if (copy.k.k.size == k.k->size) { + if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip * running triggers: @@ -2521,10 +2530,10 @@ reassemble: } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)); + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(copy.k)); ret = bch2_disk_reservation_get(c, &disk_res, - copy.k.k.size, nr_ptrs, + copy.k->k.size, nr_ptrs, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); } @@ -2559,6 +2568,7 @@ bkey_err: } err: bch2_trans_exit(&trans); + bkey_on_stack_exit(&copy, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b241164f6f7e..e8cdae3c114b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "acl.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" @@ -875,7 +876,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) cur, prev; + struct bkey_on_stack cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; @@ -888,6 +889,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + bkey_on_stack_init(&cur); + bkey_on_stack_init(&prev); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -902,15 +905,17 @@ retry: continue; } - bkey_reassemble(&cur.k, k); - k = bkey_i_to_s_c(&cur.k); + bkey_on_stack_realloc(&cur, c, k.k->u64s); + bkey_on_stack_realloc(&prev, c, k.k->u64s); + bkey_reassemble(cur.k, k); + k = bkey_i_to_s_c(cur.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &cur.k); + &offset_into_extent, cur.k); if (ret) break; @@ -920,19 +925,19 @@ retry: bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + offset_into_extent), - &cur.k); - bch2_key_resize(&cur.k.k, sectors); - cur.k.k.p = iter->pos; - cur.k.k.p.offset += cur.k.k.size; + cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter->pos; + cur.k->k.p.offset += cur.k->k.size; if (have_extent) { ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(&prev.k), 0); + bkey_i_to_s_c(prev.k), 0); if (ret) break; } - bkey_copy(&prev.k, &cur.k); + bkey_copy(prev.k, cur.k); have_extent = true; if (k.k->type == KEY_TYPE_reflink_v) @@ -945,10 +950,12 @@ retry: goto retry; if (!ret && have_extent) - ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), + 
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&cur, c); + bkey_on_stack_exit(&prev, c); return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f53eee7accc8..4fe61705ae75 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -394,12 +395,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; + struct bkey_on_stack sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter *iter; int ret; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -407,13 +410,14 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - BKEY_PADDED(k) tmp; + k = bch2_keylist_front(keys); - bkey_copy(&tmp.k, bch2_keylist_front(keys)); + bkey_on_stack_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, &tmp.k, + ret = bch2_extent_update(&trans, iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta); if (ret == -EINTR) @@ -421,13 +425,14 @@ int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + if (bkey_cmp(iter->pos, k->k.p) < 0) + bch2_cut_front(iter->pos, k); else bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return ret; } @@ -1463,13 +1468,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct btree_iter *iter; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1481,11 +1487,12 @@ retry: if (bkey_err(k)) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, rbio->pos.offset - rbio->pick.crc.offset)) { @@ -1502,6 +1509,7 @@ retry: out: bch2_rbio_done(rbio); bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1514,12 +1522,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1527,18 +1537,18 @@ retry: for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + 
bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -1577,6 +1587,7 @@ err: rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); bch2_rbio_done(rbio); } @@ -1633,7 +1644,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) new; + struct bkey_on_stack new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; @@ -1641,6 +1652,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) if (rbio->pick.crc.compression_type) return; + bkey_on_stack_init(&new); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1651,8 +1663,9 @@ retry: if (IS_ERR_OR_NULL(k.k)) goto out; - bkey_reassemble(&new.k, k); - k = bkey_i_to_s_c(&new.k); + bkey_on_stack_realloc(&new, c, k.k->u64s); + bkey_reassemble(new.k, k); + k = bkey_i_to_s_c(new.k); if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) @@ -1671,10 +1684,10 @@ retry: goto out; } - if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) + if (!bch2_bkey_narrow_crcs(new.k, new_crc)) goto out; - bch2_trans_update(&trans, iter, &new.k); + bch2_trans_update(&trans, iter, new.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1683,6 +1696,7 @@ retry: goto retry; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&new, c); } /* Inner part that may run in process context */ @@ -2114,6 +2128,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| @@ -2127,6 +2142,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -2135,7 +2151,6 @@ retry: POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS); while (1) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, @@ -2146,15 +2161,16 @@ retry: if (ret) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); + ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) goto err; @@ -2186,6 +2202,7 @@ retry: } out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: if (ret == -EINTR) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index de8522f754e2..4dacbd637d02 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,6 +4,7 @@ */ #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(key) tmp; + struct bkey_on_stack sk; int ret = 0; + 
bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -58,9 +60,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_reassemble(&tmp.key, k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), + ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); if (ret) break; @@ -70,11 +73,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, &tmp.key); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -92,6 +95,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c5d3375882d7..dbe35d16e7dd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -489,7 +490,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -498,6 +499,7 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; @@ -577,8 +579,9 @@ peek: } /* unlock before doing IO: */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, @@ -605,6 +608,7 @@ next_nondata: } out: ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6e71c5e8f9a2..6d21086c3254 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "extents.h" #include "inode.h" @@ -160,7 +161,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst, new_src; + BKEY_PADDED(k) new_dst; + struct bkey_on_stack new_src; struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; @@ -183,6 +185,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; + bkey_on_stack_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, @@ -222,14 +225,15 @@ s64 bch2_remap_range(struct bch_fs *c, break; if 
(src_k.k->type == KEY_TYPE_extent) { - bkey_reassemble(&new_src.k, src_k); - src_k = bkey_i_to_s_c(&new_src.k); + bkey_on_stack_realloc(&new_src, c, src_k.k->u64s); + bkey_reassemble(new_src.k, src_k); + src_k = bkey_i_to_s_c(new_src.k); - bch2_cut_front(src_iter->pos, &new_src.k); - bch2_cut_back(src_end, &new_src.k.k); + bch2_cut_front(src_iter->pos, new_src.k); + bch2_cut_back(src_end, &new_src.k->k); ret = bch2_make_extent_indirect(&trans, src_iter, - bkey_i_to_extent(&new_src.k)); + bkey_i_to_extent(new_src.k)); if (ret) goto btree_err; @@ -299,6 +303,7 @@ err: } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&new_src, c); percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 14e2f6828cc6..8c7b56a95f4b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -475,6 +475,7 @@ static void bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[0]); kfree(c->usage_base); free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); @@ -729,6 +730,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || -- cgit From aef90ce085123c3d0c3f110b4c50b77d007b2d5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2019 20:40:15 -0500 Subject: bcachefs: kill bch2_extent_has_device() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 15 ++++++++------- fs/bcachefs/extents.h | 3 --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b24f867520c3..654823a6da74 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -777,7 +777,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - struct bch_extent_ptr *ptr; struct bkey_on_stack sk; int ret = 0, dev, idx; @@ -791,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { bch2_btree_iter_next(iter); continue; @@ -810,14 +811,14 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_reassemble(sk.k, k); e = bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) - if (ptr->dev != dev) + extent_for_each_ptr(e, ptr) { + if (ptr->dev == dev) + ec_ptr = ptr; + else ptr->cached = true; + } - ptr = (void *) bch2_extent_has_device(e.c, dev); - BUG_ON(!ptr); - - extent_stripe_ptr_add(e, s, ptr, idx); + extent_stripe_ptr_add(e, s, ec_ptr, idx); bch2_trans_update(&trans, iter, sk.k); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index cc7ee9067b50..f334b6f763e3 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -447,9 +447,6 @@ void bch2_insert_fixup_extent(struct btree_trans *, void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); - unsigned 
bch2_extent_is_compressed(struct bkey_s_c); bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0261c6bbfa92..26a8ff38991d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -944,7 +944,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) w = j->buf + !state.idx; ret = state.prev_buf_unwritten && - bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); spin_unlock(&j->lock); return ret; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2531379e67c6..244e3faa6b16 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1097,7 +1097,7 @@ void bch2_journal_write(struct closure *cl) for_each_rw_member(ca, c, i) if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; -- cgit From ad44bdc351faeacb9b7294f1689ac76babf379ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 23:50:52 -0500 Subject: bcachefs: bkey noops For upcoming inline data extents, we're going to need to be able to shorten the value of existing bkeys in the btree - and to make that work we're going to be able to need to pad out the space the value previously took up with something. This patch changes the various code that iterates over bkeys to handle k->u64s == 0 as meaning "skip the next 8 bytes". Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 10 +++++++ fs/bcachefs/bkey_sort.c | 6 ++++- fs/bcachefs/bset.c | 40 +++++++++++++++------------- fs/bcachefs/bset.h | 7 ++++- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_io.c | 53 +++++++++++++++---------------------- fs/bcachefs/btree_update_interior.c | 34 +++++++++++++----------- 7 files changed, 83 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cb2702707c2a..ba4d6329e37a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -41,6 +41,16 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) +static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, + struct bkey_packed *end) +{ + k = bkey_next(k); + + while (k != end && !k->u64s) + k = (void *) ((u64 *) k + 1); + return k; +} + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index f5c0507ad79d..5f9f3d2e6906 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -75,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter, { i->k += __btree_node_offset_to_key(b, i->k)->u64s; + while (i->k != i->end && + !__btree_node_offset_to_key(b, i->k)->u64s) + i->k++; + if (i->k == i->end) *i = iter->data[--iter->used]; } @@ -119,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) { - iter->data->k = bkey_next(iter->data->k); + iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); BUG_ON(iter->data->k > iter->data->end); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index af20b9803608..189a187bc080 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -64,7 +64,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) for (_k = 
i->start, k = bkey_unpack_key(b, _k); _k < vstruct_last(i); _k = _n, k = n) { - _n = bkey_next(_k); + _n = bkey_next_skip_noops(_k, vstruct_last(i)); bch2_bkey_to_text(&PBUF(buf), &k); printk(KERN_ERR "block %u key %5u: %s\n", set, @@ -132,9 +132,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) struct btree_nr_keys nr = { 0 }; for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) btree_keys_account_key_add(&nr, t - b->set, k); @@ -595,7 +593,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next(k); + k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -786,9 +784,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) rw_aux_tree(b, t)[0].offset = __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) { + bset_tree_for_each_key(b, t, k) { if (t->size == bset_rw_tree_capacity(b, t)) break; @@ -821,7 +817,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -837,10 +833,10 @@ retry: EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (bkey_next(k) != btree_bkey_last(b, t)) - k = bkey_next(k); + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); - t->max_key = bkey_unpack_pos(b, k); + t->max_key = bkey_unpack_pos(b, prev); /* Then we build the tree */ eytzinger1_for_each(j, t->size) @@ -966,7 +962,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next(i)) + for (i = p; i != k; i = bkey_next_skip_noops(i, k)) if (i->type >= min_key_type) ret = i; @@ -976,9 +972,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, if (btree_keys_expensive_checks(b)) { BUG_ON(ret >= orig_k); - for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); + for (i = ret + ? 
bkey_next_skip_noops(ret, orig_k) + : btree_bkey_first(b, t); i != orig_k; - i = bkey_next(i)) + i = bkey_next_skip_noops(i, orig_k)) BUG_ON(i->type >= min_key_type); } @@ -1013,7 +1011,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next(k) == btree_bkey_last(b, t)) { + if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1137,7 +1135,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next(k); + k = bkey_next_skip_noops(k, end); if (k == end) break; @@ -1386,12 +1384,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1625,6 +1623,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); + while (!__btree_node_iter_set_end(iter, 0) && + !__bch2_btree_node_iter_peek_all(iter, b)->u64s) + iter->data->k++; + if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 3f5b7378a0a9..0e9bd8022d35 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; } -#define for_each_bset(_b, _t) \ +#define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) +#define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ + _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4a66c44764f6..2eaf6a55c06c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -924,7 +924,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next(k)) { + k = bkey_next_skip_noops(k, vstruct_last(s2))) { last = k; u64s += k->u64s; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c4f85b962b65..8532087f2754 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -25,34 +25,33 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k; + struct bkey_packed *k, *p; + + if (start == end) + return; - for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { - struct bkey l = bkey_unpack_key(b, k); - struct bkey r = bkey_unpack_key(b, bkey_next(k)); + for (p = start, k = bkey_next_skip_noops(start, end); + k != end; + p = k, k = bkey_next_skip_noops(k, end)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); 
BUG_ON(btree_node_is_extents(b) ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } -static void clear_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = false; -} - -static void set_needs_whiteout(struct bset *i) +static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = true; + for (k = i->start; + k != vstruct_last(i); + k = bkey_next_skip_noops(k, vstruct_last(i))) + k->needs_whiteout = v; } static void btree_bounce_free(struct bch_fs *c, unsigned order, @@ -167,7 +166,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (bkey_deleted(k) && btree_node_is_extents(b)) continue; @@ -260,7 +259,7 @@ static bool bch2_drop_whiteouts(struct btree *b) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (!bkey_whiteout(k)) { bkey_copy(out, k); @@ -679,14 +678,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bkey tmp; const char *invalid; - if (btree_err_on(!k->u64s, - BTREE_ERR_FIXABLE, c, b, i, - "KEY_U64s 0: %zu bytes of metadata lost", - vstruct_end(i) - (void *) k)) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - if (btree_err_on(bkey_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, b, i, "key extends past end of bset")) { @@ -755,7 +746,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, prev_pos = u.k->p; prev = k; - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); @@ -914,12 +905,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry continue; } - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } bch2_bset_build_aux_tree(b, b->set, false); - set_needs_whiteout(btree_bset_first(b)); + set_needs_whiteout(btree_bset_first(b), true); btree_node_reset_sib_u64s(b); out: @@ -1424,7 +1415,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); - clear_needs_whiteout(i); + set_needs_whiteout(i, false); /* do we have data to write? 
*/ if (b->written && !i->u64s) @@ -1579,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) } for_each_bset(b, t) - set_needs_whiteout(bset(b, t)); + set_needs_whiteout(bset(b, t), true); bch2_btree_verify(c, b); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0956957216f9..9e2d72bf06b2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) bch2_bkey_format_add_pos(s, b->data->min_key); for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) { uk = bkey_unpack_key(b, k); bch2_bkey_format_add_key(s, &uk); @@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - if (bkey_next(k) == vstruct_last(set1)) + struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + + if (n == vstruct_last(set1)) break; if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) break; @@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as, nr_unpacked++; prev = k; - k = bkey_next(k); + k = n; } BUG_ON(!prev); @@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, { struct btree_node_iter node_iter; struct bkey_i *k = bch2_keylist_front(keys); - struct bkey_packed *p; + struct bkey_packed *src, *dst, *n; struct bset *i; BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); @@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, * for the pivot: */ i = btree_bset_first(b); - p = i->start; - while (p != vstruct_last(i)) - if (bkey_deleted(p)) { - le16_add_cpu(&i->u64s, -p->u64s); - set_btree_bset_end(b, b->set); - memmove_u64s_down(p, bkey_next(p), - (u64 *) vstruct_last(i) - - (u64 *) p); - } else - p = bkey_next(p); + src = dst = i->start; + while (src != vstruct_last(i)) { + n = bkey_next_skip_noops(src, vstruct_last(i)); + if (!bkey_deleted(src)) { + memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); + } + src = n; + } + + i->u64s = cpu_to_le16((u64 *) dst - i->_data); + set_btree_bset_end(b, b->set); BUG_ON(b->nsets != 1 || b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); -- cgit From 085ab69357e091613625f1505d667b6a5a3ec881 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 19:02:48 -0500 Subject: bcachefs: Rework of cut_front & cut_back This changes bch2_cut_front and bch2_cut_back so that they're able to shorten the size of the value, and it also changes the extent update path to update the accounting in the btree node when this happens. When the size of the value is shortened, they zero out the space that's no longer used, so it's interpreted as noops (as implemented in the last patch). 
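As a rough illustration of the mechanism, here is a minimal standalone sketch in C (not bcachefs code: the simplified one-word-header key layout and the names skey, HDR_U64S, shrink_val() and next_skip_noops() are invented for illustration, standing in for what bch2_cut_back_s() and bkey_next_skip_noops() do in the real patches):

/* Illustration only: shrink a value in place, pad with noops, skip them. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct skey {
	uint8_t  u64s;		/* total size, header + value, in 64-bit words */
	uint8_t  pad[7];	/* header is exactly one 64-bit word here */
	uint64_t val[];		/* value payload */
};

#define HDR_U64S 1

/* Shrink the value in place: zero the freed tail so later iteration sees
 * u64s == 0 words there and treats each one as an 8-byte noop. */
static void shrink_val(struct skey *k, unsigned new_val_u64s)
{
	unsigned old_val_u64s = k->u64s - HDR_U64S;

	if (new_val_u64s >= old_val_u64s)
		return;

	memset(&k->val[new_val_u64s], 0,
	       (old_val_u64s - new_val_u64s) * sizeof(uint64_t));
	k->u64s = HDR_U64S + new_val_u64s;
}

/* Advance to the next key, stepping over 8-byte noops (u64s == 0). */
static struct skey *next_skip_noops(struct skey *k, uint64_t *end)
{
	uint64_t *p = (uint64_t *) k + k->u64s;

	while (p != end && !((struct skey *) p)->u64s)
		p++;
	return (struct skey *) p;
}

int main(void)
{
	uint64_t buf[8] = { 0 };
	struct skey *k1 = (struct skey *) &buf[0];	/* header + 4 value words */
	struct skey *k2 = (struct skey *) &buf[5];	/* header + 2 value words */

	k1->u64s = 5;
	k2->u64s = 3;

	shrink_val(k1, 2);	/* buf[3] and buf[4] become noop words */

	/* iteration still lands on k2, skipping the zeroed words */
	printf("next key u64s = %u\n", (unsigned) next_skip_noops(k1, &buf[8])->u64s);
	return 0;
}

The zeroed tail is what lets the shrink happen in place: the keys that follow in the bset don't have to be shifted, they are simply reached by skipping the noop words.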
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 8 ++--- fs/bcachefs/bset.h | 10 +++++++ fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/extents.c | 65 +++++++++++++++++++++++++++++------------ fs/bcachefs/extents.h | 10 +++++-- fs/bcachefs/fs-io.c | 8 ++--- fs/bcachefs/io.c | 7 ++--- fs/bcachefs/move.c | 9 +++--- fs/bcachefs/recovery.c | 6 ++-- fs/bcachefs/reflink.c | 4 +-- 10 files changed, 84 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 5f9f3d2e6906..daef8e5c599f 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -350,7 +350,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, if (bkey_cmp(l.k->p, r.k->p) >= 0) { sort_key_next(iter, b, _r); } else { - __bch2_cut_front(l.k->p, r); + bch2_cut_front_s(l.k->p, r); extent_save(b, rk, r.k); } @@ -362,9 +362,9 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, * r wins, but it overlaps in the middle of l - split l: */ bkey_reassemble(split.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &split.k->k); + bch2_cut_back(bkey_start_pos(r.k), split.k); - __bch2_cut_front(r.k->p, l); + bch2_cut_front_s(r.k->p, l); extent_save(b, lk, l.k); extent_sort_sift(iter, b, 0); @@ -372,7 +372,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_append(c, f, &nr, dst->start, &prev, bkey_i_to_s(split.k)); } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); + bch2_cut_back_s(bkey_start_pos(r.k), l); extent_save(b, lk, l.k); } } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 0e9bd8022d35..b93c4f287480 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -584,6 +584,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, n->unpacked_keys += sign; } +static inline void btree_keys_account_val_delta(struct btree *b, + struct bkey_packed *k, + int delta) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + + b->nr.live_u64s += delta; + b->nr.bset_u64s[t - b->set] += delta; +} + #define btree_keys_account_key_add(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, 1) #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 85580e63b5ca..a774fce027c2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -884,7 +884,7 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); ret = bch2_extent_trim_atomic(&delete, iter); if (ret) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 46eeaa574e86..6c1cc90ab320 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -720,12 +720,14 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, /* Extents */ -void __bch2_cut_front(struct bpos where, struct bkey_s k) +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return; + return 0; EBUG_ON(bkey_cmp(where, k.k->p) > 0); @@ -733,8 +735,10 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) k.k->size -= sub; - if (!k.k->size) + if (!k.k->size) { k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } switch (k.k->type) { case KEY_TYPE_deleted: @@ -784,26 +788,42 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) default: 
BUG(); } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -bool bch2_cut_back(struct bpos where, struct bkey *k) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k->p) >= 0) - return false; + if (bkey_cmp(where, k.k->p) >= 0) + return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); + EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); - len = where.offset - bkey_start_offset(k); + len = where.offset - bkey_start_offset(k.k); - k->p = where; - k->size = len; + k.k->p = where; + k.k->size = len; - if (!len) - k->type = KEY_TYPE_deleted; + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - return true; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) @@ -942,7 +962,7 @@ int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) if (ret) return ret; - bch2_cut_back(end, &k->k); + bch2_cut_back(end, k); return 0; } @@ -1085,11 +1105,14 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, enum bch_extent_overlap overlap) { struct btree_iter_level *l = &iter->l[0]; + int u64s_delta; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ - __bch2_cut_front(insert->k.p, k); + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); bch2_btree_iter_fix_key_modified(iter, l->b, _k); @@ -1097,7 +1120,9 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ - bch2_cut_back(bkey_start_pos(&insert->k), k.k); + u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); @@ -1155,10 +1180,12 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bkey_reassemble(split.k, k.s_c); split.k->k.needs_whiteout |= bkey_written(l->b, _k); - bch2_cut_back(bkey_start_pos(&insert->k), &split.k->k); + bch2_cut_back(bkey_start_pos(&insert->k), split.k); BUG_ON(bkey_deleted(&split.k->k)); - __bch2_cut_front(insert->k.p, k); + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); bch2_btree_iter_fix_key_modified(iter, l->b, _k); @@ -1748,7 +1775,7 @@ enum merge_result bch2_reservation_merge(struct bch_fs *c, if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, r.s); + bch2_cut_front_s(l.k->p, r.s); return BCH_MERGE_PARTIAL; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index f334b6f763e3..6e893c37c287 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -534,14 +534,18 @@ do { \ } \ } while (0) -void __bch2_cut_front(struct bpos, struct bkey_s); +int bch2_cut_front_s(struct bpos, struct bkey_s); +int bch2_cut_back_s(struct bpos, struct bkey_s); static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - __bch2_cut_front(where, bkey_i_to_s(k)); + 
bch2_cut_front_s(where, bkey_i_to_s(k)); } -bool bch2_cut_back(struct bpos, struct bkey *); +static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) +{ + bch2_cut_back_s(where, bkey_i_to_s(k)); +} /** * bch_key_resize - adjust size of @k diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 478630fdf643..8b8442f9a81c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2485,7 +2485,7 @@ reassemble: move_pos.offset -= shift >> 9; goto reassemble; } else { - bch2_cut_back(atomic_end, &copy.k->k); + bch2_cut_back(atomic_end, copy.k); } } @@ -2505,7 +2505,7 @@ reassemble: */ if (insert && bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(&copy.k->k), &delete.k); + bch2_cut_back(bkey_start_pos(&copy.k->k), &delete); } else if (!insert && bkey_cmp(copy.k->k.p, bkey_start_pos(&delete.k)) > 0) { @@ -2652,8 +2652,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); - bch2_cut_back(end_pos, &reservation.k); + bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4fe61705ae75..0f1be5c5543d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -345,7 +345,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); bch2_trans_begin_updates(trans); @@ -414,6 +414,7 @@ int bch2_write_index_default(struct bch_write_op *op) bkey_on_stack_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); bch2_trans_begin_updates(&trans); @@ -425,9 +426,7 @@ int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) < 0) - bch2_cut_front(iter->pos, k); - else + if (bkey_cmp(iter->pos, k->k.p) >= 0) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dbe35d16e7dd..5fd44dbe2722 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -96,10 +96,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); + bch2_cut_front(iter->pos, &new->k_i); - bch2_cut_front(iter->pos, insert); - bch2_cut_back(new->k.p, &insert->k); - bch2_cut_back(insert->k.p, &new->k); + bch2_cut_front(iter->pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); if (m->data_cmd == DATA_REWRITE) bch2_bkey_drop_device(bkey_i_to_s(insert), @@ -168,8 +169,6 @@ next: if (bch2_keylist_empty(keys)) goto out; } - - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); continue; nomatch: if (m->ctxt) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d1184bf62cae..2efe023b2f0d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -177,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); } else { struct bkey_i *split = kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); @@ 
-186,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) goto err; bkey_copy(split, i[0].k); - bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), split); keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { .btree_id = i[0].btree_id, .allocated = true, @@ -298,7 +298,7 @@ retry: bkey_copy(split, k); bch2_cut_front(split_iter->pos, split); - bch2_cut_back(atomic_end, &split->k); + bch2_cut_back(atomic_end, split); bch2_trans_update(&trans, split_iter, split); bch2_btree_iter_set_pos(iter, split->k.p); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6d21086c3254..4de65bf70362 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -40,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c, if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, _r); + bch2_cut_front_s(l.k->p, _r); return BCH_MERGE_PARTIAL; } @@ -230,7 +230,7 @@ s64 bch2_remap_range(struct bch_fs *c, src_k = bkey_i_to_s_c(new_src.k); bch2_cut_front(src_iter->pos, new_src.k); - bch2_cut_back(src_end, &new_src.k->k); + bch2_cut_back(src_end, new_src.k); ret = bch2_make_extent_indirect(&trans, src_iter, bkey_i_to_extent(new_src.k)); -- cgit From 08c07fea7b437f0a841f85cc9b670b60dacd85cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2019 15:52:28 -0500 Subject: bcachefs: Split out extent_update.c Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/extent_update.c | 532 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/extent_update.h | 18 ++ fs/bcachefs/extents.c | 525 +-------------------------------------- fs/bcachefs/extents.h | 11 - fs/bcachefs/fs-io.c | 1 + fs/bcachefs/io.c | 2 +- 8 files changed, 555 insertions(+), 537 deletions(-) create mode 100644 fs/bcachefs/extent_update.c create mode 100644 fs/bcachefs/extent_update.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 9d120936703a..6d5ad877fb07 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -26,6 +26,7 @@ bcachefs-y := \ ec.o \ error.o \ extents.o \ + extent_update.o \ fs.o \ fs-common.o \ fs-ioctl.o \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a774fce027c2..6e5405f0b372 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "debug.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 index 000000000000..21426e01c395 --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,532 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_on_stack.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "debug.h" +#include "extents.h" +#include "extent_update.h" + +/* + * This counts the number of iterators to the alloc & ec btrees we'll need + * inserting/removing this extent: + */ +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case 
BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + return ret; +} + +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } + } + + return ret; +} + +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +int bch2_extent_atomic_end(struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) +{ + struct btree_trans *trans = iter->trans; + struct btree *b; + struct btree_node_iter node_iter; + struct bkey_packed *_k; + unsigned nr_iters = 0; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + b = iter->l[0].b; + node_iter = iter->l[0].iter; + + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2, false); + if (ret < 0) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; + + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) + break; + + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX, true); + if (ret) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return ret < 0 ? 
ret : 0; +} + +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; +} + +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); +} + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; + + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; + + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; + + k = bkey_disassemble(l->b, _k, &unpacked); + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_extent_is_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); + } + } + + return BTREE_INSERT_OK; +} + +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + +static void +extent_squash(struct bch_fs *c, struct 
btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct btree_iter_level *l = &iter->l[0]; + int u64s_delta; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + break; + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ + u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + + /* + * As the auxiliary tree is indexed by the end of the + * key and we've just changed the end, update the + * auxiliary tree. + */ + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + break; + + case BCH_EXTENT_OVERLAP_ALL: { + /* The insert key completely covers k, invalidate k */ + if (!bkey_whiteout(k.k)) + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + } else { + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } + + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { + struct bkey_on_stack split; + + bkey_on_stack_init(&split); + bkey_on_stack_realloc(&split, c, k.k->u64s); + + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: + * - start only in k, preserve + * - middle common section, invalidate in k + * - end only in k, preserve + * + * We update the old key to preserve the start, + * insert will be the new common section, + * we manually insert the end that we are preserving. + * + * modify k _before_ doing the insert (which will move + * what k points to) + */ + bkey_reassemble(split.k, k.s_c); + split.k->k.needs_whiteout |= bkey_written(l->b, _k); + + bch2_cut_back(bkey_start_pos(&insert->k), split.k); + BUG_ON(bkey_deleted(&split.k->k)); + + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + BUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + + extent_bset_insert(c, iter, split.k); + bkey_on_stack_exit(&split, c); + break; + } + } +} + +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. 
full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. + */ +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert_entry) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; + struct bkey_packed *_k; + struct bkey unpacked; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + if (!bkey_whiteout(k.k)) + update_journal = true; + + if (!update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + goto next; + } + + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (deleting && + !update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); + } + break; + } + + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { + insert->k.needs_whiteout = true; + update_btree = true; + } + + if (update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(l->b, _k); + _k->needs_whiteout = false; + } + + extent_squash(c, iter, insert, _k, k, overlap); + + if (!update_btree) + bch2_cut_front(cur_end, insert); +next: + node_iter = l->iter; + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + } + + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + + if (update_btree) { + if (deleting) + insert->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + extent_bset_insert(c, iter, insert); + } + + if (update_journal) { + struct bkey_i *k = !deleting ? 
insert : &whiteout; + + if (deleting) + k->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&k->k) || !k->k.size); + + bch2_btree_journal_key(trans, iter, k); + } + + bch2_cut_front(insert->k.p, insert); +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 index 000000000000..89d18e4b6758 --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENT_UPDATE_H +#define _BCACHEFS_EXTENT_UPDATE_H + +#include "bcachefs.h" + +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, + unsigned *); +void bch2_insert_fixup_extent(struct btree_trans *, + struct btree_insert_entry *); + +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6c1cc90ab320..8f511760102a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,14 +8,11 @@ #include "bcachefs.h" #include "bkey_methods.h" -#include "bkey_on_stack.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" #include "disk_groups.h" #include "error.h" #include "extents.h" @@ -26,7 +23,6 @@ #include "super-io.h" #include "trace.h" #include "util.h" -#include "xattr.h" unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { @@ -826,525 +822,6 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) return -val_u64s_delta; } -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - return ret; -} - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters, - bool overwrite) -{ - int ret = 0; - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= max_iters) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter *iter; - struct bkey_s_c r_k; - - for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) - break; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - - if (*nr_iters >= max_iters) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += r_k.k->p.offset - idx; - - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } - - bch2_trans_iter_put(trans, iter); - break; - } - } - - return ret; -} - -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) - -int bch2_extent_atomic_end(struct btree_iter *iter, - struct bkey_i *insert, - struct bpos *end) -{ - struct btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned 
nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2, false); - if (ret < 0) - return ret; - - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - unsigned offset = 0; - - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) - break; - - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX, true); - if (ret) - break; - - bch2_btree_node_iter_advance(&node_iter, b); - } - - return ret < 0 ? ret : 0; -} - -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - bch2_cut_back(end, k); - return 0; -} - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, - unsigned *u64s) -{ - struct btree_iter_level *l = &insert->iter->l[0]; - struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_s_c k; - int sectors; - - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - int u64s_delta; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); - - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - break; - - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); - - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. 
- */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - struct bkey_on_stack split; - - bkey_on_stack_init(&split); - bkey_on_stack_realloc(&split, c, k.k->u64s); - - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(split.k, k.s_c); - split.k->k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), split.k); - BUG_ON(bkey_deleted(&split.k->k)); - - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); - - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - - extent_bset_insert(c, iter, split.k); - bkey_on_stack_exit(&split, c); - break; - } - } -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. 
- */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert_entry) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert_entry->iter; - struct bkey_i *insert = insert_entry->k; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool deleting = bkey_whiteout(&insert->k); - bool update_journal = !deleting; - bool update_btree = !deleting; - struct bkey_i whiteout = *insert; - struct bkey_packed *_k; - struct bkey unpacked; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (!bkey_whiteout(k.k)) - update_journal = true; - - if (!update_journal) { - bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (deleting && - !update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_iter_fix_key_modified(iter, - l->b, _k); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - update_btree = true; - } - - if (update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(c, iter, insert, _k, k, overlap); - - if (!update_btree) - bch2_cut_front(cur_end, insert); -next: - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - } - - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (update_btree) { - if (deleting) - insert->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - extent_bset_insert(c, iter, insert); - } - - if (update_journal) { - struct bkey_i *k = !deleting ? 
insert : &whiteout; - - if (deleting) - k->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - - bch2_btree_journal_key(trans, iter, k); - } - - bch2_cut_front(insert->k.p, insert); -} - const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { return bch2_bkey_ptrs_invalid(c, k); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6e893c37c287..e360e1989812 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -433,17 +433,6 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, - unsigned *); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_insert_entry *); - void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8b8442f9a81c..fab952856e36 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -9,6 +9,7 @@ #include "clock.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f1be5c5543d..ef953499e66c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -19,7 +19,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "inode.h" #include "io.h" #include "journal.h" -- cgit From 4be1a412ea34923370cd6163232d7928ae9a0e4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 16:43:16 -0500 Subject: bcachefs: Inline data extents This implements extents that have their data inline, in the value, instead of the bkey value being pointers to the data - and the read and write paths are updated to read from these new extent types and write them out, when the write size is small enough. 
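
A minimal userspace sketch of the size check applied before inlining a write (the helper names and the 4096-byte block size are assumptions for illustration; the threshold itself mirrors the bch2_write() hunk below, and the inlined payload is stored in the bch_inline_data value, zero-padded to a multiple of 8 bytes):

	/* A write is inlined only if it fits in half a block, capped at 1024 bytes. */
	#include <stdio.h>

	static unsigned min_u(unsigned a, unsigned b)
	{
		return a < b ? a : b;
	}

	static int should_inline(unsigned block_bytes, unsigned data_len)
	{
		return data_len <= min_u(block_bytes / 2, 1024U);
	}

	int main(void)
	{
		printf("512B write:  %s\n", should_inline(4096, 512)  ? "inline" : "extent");
		printf("3000B write: %s\n", should_inline(4096, 3000) ? "inline" : "extent");
		return 0;
	}

With a 4096-byte block size the cap works out to 1024 bytes, so the 512-byte write is inlined and the 3000-byte write goes through the normal extent path.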
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 11 +++++- fs/bcachefs/bkey.h | 1 + fs/bcachefs/bkey_methods.c | 22 +++++++++-- fs/bcachefs/extents.c | 25 ++++++++---- fs/bcachefs/extents.h | 2 + fs/bcachefs/fs-io.c | 12 ++++++ fs/bcachefs/io.c | 89 +++++++++++++++++++++++++++++++++++++------ fs/bcachefs/io.h | 5 ++- fs/bcachefs/recovery.c | 6 +++ 9 files changed, 147 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6ba830583846..e3004593874c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -342,7 +342,8 @@ static inline void bkey_init(struct bkey *k) x(quota, 13) \ x(stripe, 14) \ x(reflink_p, 15) \ - x(reflink_v, 16) + x(reflink_v, 16) \ + x(inline_data, 17) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -915,6 +916,13 @@ struct bch_reflink_v { __u64 _data[0]; }; +/* Inline data */ + +struct bch_inline_data { + struct bch_val v; + u8 data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1319,6 +1327,7 @@ enum bch_sb_features { BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NEW_SIPHASH = 7, + BCH_FEATURE_INLINE_DATA = 8, BCH_FEATURE_NR, }; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index ba4d6329e37a..36e6ecc04514 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -572,6 +572,7 @@ BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); +BKEY_VAL_ACCESSORS(inline_data); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f01405dd502b..5312184c37f7 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = empty_val_key_invalid, \ } +static const char *key_type_inline_data_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + return NULL; +} + +static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); +} + +static const struct bkey_ops bch2_bkey_ops_inline_data = { + .key_invalid = key_type_inline_data_invalid, + .val_to_text = key_type_inline_data_to_text, +}; + static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() @@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if ((btree_node_type_is_extents(type) || - type == BKEY_TYPE_BTREE) && - bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + if (type == BKEY_TYPE_BTREE && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; if (btree_node_type_is_extents(type)) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8f511760102a..0e25fbe65b95 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -737,11 +737,6 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) } switch (k.k->type) { - case KEY_TYPE_deleted: - case KEY_TYPE_discard: - case KEY_TYPE_error: - case KEY_TYPE_cookie: - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); @@ -779,10 +774,18 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_reservation: + case KEY_TYPE_inline_data: { + struct 
bkey_s_inline_data d = bkey_s_to_inline_data(k); + + sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + + memmove(d.v->data, + d.v->data + sub, + bkey_val_bytes(d.k) - sub); + + new_val_u64s -= sub >> 3; break; - default: - BUG(); + } } val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; @@ -814,6 +817,12 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) new_val_u64s = 0; } + switch (k.k->type) { + case KEY_TYPE_inline_data: + new_val_u64s = min(new_val_u64s, k.k->size << 6); + break; + } + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; BUG_ON(val_u64s_delta < 0); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index e360e1989812..35a66d4f4ea2 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -456,6 +456,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) static inline bool bkey_extent_is_data(const struct bkey *k) { return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_inline_data || k->type == KEY_TYPE_reflink_p; } @@ -469,6 +470,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reservation: case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: return true; default: return false; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index fab952856e36..7abe53be7dd3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -990,6 +990,18 @@ static void bch2_writepage_io_done(struct closure *cl) } } + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s; + + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + /* * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ef953499e66c..8f558347ca7f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -539,16 +539,19 @@ static void __bch2_write_index(struct bch_write_op *op) for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); - bkey_copy(dst, src); - bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, - test_bit(ptr->dev, op->failed.d)); + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { - ret = -EIO; - goto err; + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { + ret = -EIO; + goto err; + } } + if (dst != src) + memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); } @@ -1092,7 +1095,7 @@ again: bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; - bio->bi_opf = REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_WRITE; if (!skip_put) closure_get(bio->bi_private); @@ -1129,6 +1132,47 @@ flush_io: goto again; } +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + 
memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + continue_at_nobarrier(cl, bch2_write_index, NULL); + return; +err: + bch2_write_done(&op->cl); +} + /** * bch_write - handle a write to a cache device or flash only volume * @@ -1150,22 +1194,22 @@ void bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; + unsigned data_len; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + if (bio_sectors(bio) & (c->opts.block_size - 1)) { __bcache_io_error(c, "misaligned write"); op->error = -EIO; goto err; } - op->start_time = local_clock(); - - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); @@ -1175,6 +1219,14 @@ void bch2_write(struct closure *cl) bch2_increment_clock(c, bio_sectors(bio), WRITE); + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + continue_at_nobarrier(cl, __bch2_write, NULL); return; err: @@ -1892,6 +1944,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; + if (k.k->type == KEY_TYPE_inline_data) { + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_val_bytes(d.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, d.v->data); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 81fc549a0c97..fa5841a86fcb 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -34,10 +34,11 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2efe023b2f0d..9102a1ce1ec4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c) write_sb = true; } + if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); + write_sb = true; + } + if (!test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; write_sb = true; -- cgit From 4de774952b12963a2970c77eeb6ebbd48c1d20a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2019 16:25:58 -0500 Subject: bcachefs: Reorganize extents.c Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- 
fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/extents.c | 1556 +++++++++++++++++++++---------------------- fs/bcachefs/extents.h | 260 ++++---- fs/bcachefs/fs-io.c | 8 +- fs/bcachefs/io.c | 4 +- fs/bcachefs/move.c | 8 +- fs/bcachefs/recovery.c | 4 +- 7 files changed, 898 insertions(+), 944 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 21426e01c395..91ceb5d53f92 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -200,7 +200,7 @@ bch2_extent_can_insert(struct btree_trans *trans, *u64s += _k->u64s; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { + (sectors = bch2_bkey_sectors_compressed(k))) { int flags = trans->flags & BTREE_INSERT_NOFAIL ? BCH_DISK_RESERVATION_NOFAIL : 0; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e25fbe65b95..b85056440ac3 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -24,81 +24,15 @@ #include "trace.h" #include "util.h" -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - bkey_for_each_ptr(p, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) -{ - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - nr_ptrs += !ptr->cached; - BUG_ON(!nr_ptrs); - break; - } - case KEY_TYPE_reservation: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) -{ - unsigned durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) - durability = max_t(unsigned, durability, ca->mi.durability); - - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; - - durability = max_t(unsigned, durability, s->nr_redundant); - } -out: - return durability; -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - return durability; -} +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, unsigned dev) @@ -218,172 +152,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) -{ - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - 
BUG(); - } -} +/* KEY_TYPE_btree_ptr: */ -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bch_extent_ptr *ptr; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); + return bch2_bkey_ptrs_invalid(c, k); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; + const char *err; + char buf[160]; + struct bucket_mark mark; + struct bch_dev *ca; - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; - return false; -} + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); -/* extent specific utility code */ + mark = ptr_bucket_mark(ca, ptr); -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) -{ - const struct bch_extent_ptr *ptr; + err = "stale"; + if (gen_after(mark.gen, ptr->gen)) + goto err; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + err = "inconsistent"; + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) + goto err; + } - return NULL; + return; +err: + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + bch2_bkey_ptrs_to_text(out, c, k); +} - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; - } +/* KEY_TYPE_extent: */ - return NULL; +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + return bch2_bkey_ptrs_invalid(c, k); } -unsigned bch2_extent_is_compressed(struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned ret = 0; + char buf[160]; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an 
extent that's + * going to get overwritten during replay) + */ - return ret; -} + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? mark.cached_sectors + : mark.dirty_sectors; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); - return false; + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); + } } -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - union bch_extent_entry *i = ptrs.start; - - if (i == entry) - return NULL; - - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; + bch2_bkey_ptrs_to_text(out, c, k); } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +enum merge_result bch2_extent_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; - bool drop_crc = true; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) + return BCH_MERGE_NOMERGE; - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; + crc_l = bch2_extent_crc_unpack(l.k, NULL); - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; + + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct 
bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; + + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; break; + default: + return BCH_MERGE_NOMERGE; } + } - dst = prev; + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; + + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (!extent_entry_is_crc(en_l)) + continue; + + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + bch2_key_resize(l.k, l.k->size + r.k->size); - return dst; + return BCH_MERGE_MERGE; +} + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + bch2_cut_front_s(l.k->p, r.s); + return BCH_MERGE_PARTIAL; + } + + 
bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -462,509 +523,237 @@ restart_narrow_pointers: return ret; } -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - 
entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; - bool first = true; - - bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - pr_buf(out, " "); - - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; - - pr_buf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); - break; - default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; - } +/* Generic code for keys with pointers: */ - first = false; - } +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, ptr->dev)) - return "pointer to invalid device"; - - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; + return k.k->type == KEY_TYPE_reservation + ? 
bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; } -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - unsigned size_ondisk = k.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - if (k.k->type == KEY_TYPE_btree_ptr) - size_ondisk = c->opts.btree_node_size; - - bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (k.k->type == KEY_TYPE_btree_ptr && - !extent_entry_is_ptr(entry)) - return "has non ptr field"; - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - reason = extent_ptr_invalid(c, k, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - if (crc.offset + crc.live_size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + unsigned ret = 0; - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; } - return NULL; -} - -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; - - return bch2_bkey_ptrs_invalid(c, k); + return ret; } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - mark = ptr_bucket_mark(ca, ptr); - - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; - - err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; - } + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; - return; -err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); -} + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if 
(!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); + return ret; } -/* Extents */ - -int bch2_cut_front_s(struct bpos where, struct bkey_s k) +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) { - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 sub; - - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return 0; - - EBUG_ON(bkey_cmp(where, k.k->p) > 0); - - sub = where.offset - bkey_start_offset(k.k); - - k.k->size -= sub; + struct btree_trans trans; + struct btree_iter *iter; + struct bpos end = pos; + struct bkey_s_c k; + bool ret = true; + int err; - if (!k.k->size) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } + end.offset += size; - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - bool seen_crc = false; + bch2_trans_init(&trans, c, 0, 0); - bkey_extent_entry_for_each(ptrs, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += sub; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; - if (extent_entry_is_crc(entry)) - seen_crc = true; + if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + ret = false; + break; } - - break; - } - case KEY_TYPE_reflink_p: { - struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - - le64_add_cpu(&p.v->idx, sub); - break; } - case KEY_TYPE_inline_data: { - struct bkey_s_inline_data d = bkey_s_to_inline_data(k); - - sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); - - memmove(d.v->data, - d.v->data + sub, - bkey_val_bytes(d.k) - sub); - - new_val_u64s -= sub >> 3; - break; - } - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); + bch2_trans_exit(&trans); - set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; + return ret; } -int bch2_cut_back_s(struct bpos where, struct bkey_s k) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 len = 0; + unsigned durability = 0; + struct bch_dev *ca; - if (bkey_cmp(where, k.k->p) >= 0) + if (p.ptr.cached) return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - len = where.offset - bkey_start_offset(k.k); + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); - k.k->p = where; - k.k->size = len; + if (p.has_ec) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.ec.idx); - if (!len) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } + if (WARN_ON(!s)) + goto out; - switch (k.k->type) { - case KEY_TYPE_inline_data: - new_val_u64s = min(new_val_u64s, k.k->size << 6); - break; + durability = max_t(unsigned, durability, s->nr_redundant); } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - 
BUG_ON(val_u64s_delta < 0); - - set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} - -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); +out: + return durability; } -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - char buf[160]; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? 
mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); + unsigned durability = 0; - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } + return durability; } -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { - bch2_bkey_ptrs_to_text(out, c, k); -} + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; + if (target && extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset + if (n && n <= extra && + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; + extra -= n; + } + } - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); - break; - case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); - break; - case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; - break; - default: - BUG(); - } -#undef set_common_fields -} + if (extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); -void bch2_extent_crc_append(struct bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; + if (n && n <= extra) { + entry->ptr.cached = true; + extra -= n; + } + } +} - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - 
bch2_extent_crc_pack(crc, new, type); + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - k->k.u64s += extent_entry_u64s(ptrs.end); + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } } static inline void __extent_entry_insert(struct bkey_i *k, @@ -1010,6 +799,107 @@ found: } } +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; + + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; + } + + dst = prev; + } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. 
* @@ -1027,245 +917,307 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); /* will only happen if all pointers were cached: */ - if (!bkey_val_u64s(k.k)) + if (!bch2_bkey_nr_ptrs(k.s_c)) k.k->type = KEY_TYPE_discard; return bkey_whiteout(k.k); } -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; + struct bch_dev *ca; + bool first = true; - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + pr_buf(out, " "); - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; + + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; } + + first = false; + } } -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + struct bch_dev *ca; - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; + if (!bch2_dev_exists2(c, ptr->dev)) + return "pointer to invalid device"; - crc_l = bch2_extent_crc_unpack(l.k, NULL); + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; - if 
(extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + return NULL; +} - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + const char *reason; + unsigned nonce = UINT_MAX; + + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; + if (k.k->type == KEY_TYPE_btree_ptr && + !extent_entry_is_ptr(entry)) + return "has non ptr field"; + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(c, k, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; + if (crc.offset + crc.live_size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; - if (crc_l.compression_type) - return BCH_MERGE_NOMERGE; + size_ondisk = crc.compressed_size; - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: break; - default: - return BCH_MERGE_NOMERGE; } } - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = 
vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + return NULL; +} - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +{ + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 sub; - end.offset += size; + if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + return 0; - bch2_trans_init(&trans, c, 0, 0); + EBUG_ON(bkey_cmp(where, k.k->p) > 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; + sub = where.offset - bkey_start_offset(k.k); - if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { - ret = false; - break; - } + k.k->size -= sub; + + if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; } - bch2_trans_exit(&trans); - return ret; -} + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bool seen_crc = false; -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; + bkey_extent_entry_for_each(ptrs, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += sub; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + if (extent_entry_is_crc(entry)) + seen_crc = true; + } - extent_for_each_ptr_decode(e, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == 
BCH_COMPRESSION_NONE; break; } - case KEY_TYPE_reservation: - ret = bkey_s_c_to_reservation(k).v->nr_replicas; + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); break; } + case KEY_TYPE_inline_data: { + struct bkey_s_inline_data d = bkey_s_to_inline_data(k); - return ret; -} - -/* KEY_TYPE_reservation: */ + sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + memmove(d.v->data, + d.v->data + sub, + bkey_val_bytes(d.k) - sub); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; + new_val_u64s -= sub >> 3; + break; + } + } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - return NULL; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; - pr_buf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} + if (bkey_cmp(where, k.k->p) >= 0) + return 0; -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; + len = where.offset - bkey_start_offset(k.k); - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, r.s); - return BCH_MERGE_PARTIAL; + k.k->p = where; + k.k->size = len; + + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; } - bch2_key_resize(l.k, l.k->size + r.k->size); + switch (k.k->type) { + case KEY_TYPE_inline_data: + new_val_u64s = min(new_val_u64s, k.k->size << 6); + break; + } - return BCH_MERGE_MERGE; + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 35a66d4f4ea2..1140d01a42ab 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -40,6 +40,9 @@ struct btree_insert_entry; (union bch_extent_entry *) (_entry)); \ }) +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -185,10 +188,52 @@ struct bkey_ptrs { union bch_extent_entry *end; }; -/* iterate over bkey ptrs */ +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct 
bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +} + +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +{ + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; +} #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ @@ -281,96 +326,26 @@ out: \ #define bkey_for_each_crc(_k, _p, _crc, _iter) \ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -/* utility code common to all keys with pointers: */ - -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } - default: - return (struct bkey_ptrs_c) { NULL, NULL }; - } -} - -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); +/* Iterate over pointers in KEY_TYPE_extent: */ - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; -} - -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - ret.devs[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - if (!ptr->cached) - ret.devs[ret.nr++] = ptr->dev; +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e),_entry) - return ret; -} +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) - bkey_for_each_ptr(p, ptr) - if (ptr->cached) - ret.devs[ret.nr++] = ptr->dev; +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - return ret; -} +#define 
extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); -unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); +/* utility code common to all keys with pointers: */ void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); @@ -378,22 +353,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); - -/* bch_btree_ptr: */ +/* KEY_TYPE_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ @@ -402,12 +367,11 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); .swab = bch2_ptr_swab, \ } -/* bch_extent: */ +/* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s); @@ -420,7 +384,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, .key_merge = bch2_extent_merge, \ } -/* bch_reservation: */ +/* KEY_TYPE_reservation: */ const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -433,13 +397,15 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, - unsigned, unsigned); +/* Extent checksum entries: */ -unsigned bch2_extent_is_compressed(struct bkey_s_c); +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, + struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); +/* Generic code for keys with pointers: */ static inline bool bkey_extent_is_direct_data(const struct bkey *k) { @@ -477,34 +443,57 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) } } -/* Extent entry iteration: */ +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -#define extent_for_each_entry_from(_e, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, \ - extent_entry_last(_e),_entry) + bkey_for_each_ptr(p, 
ptr) + ret.devs[ret.nr++] = ptr->dev; -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) + return ret; +} -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + bkey_for_each_ptr(p, ptr) + if (!ptr->cached) + ret.devs[ret.nr++] = ptr->dev; -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) + return ret; +} -void bch2_extent_crc_append(struct bkey_i *, - struct bch_extent_crc_unpacked); -void bch2_extent_ptr_decoded_append(struct bkey_i *, - struct extent_ptr_decoded *); +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); + bkey_for_each_ptr(p, ptr) + if (ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + return ret; +} + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); + +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -525,6 +514,22 @@ do { \ } \ } while (0) +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +/* Generic extent code: */ + int bch2_cut_front_s(struct bpos, struct bkey_s); int bch2_cut_back_s(struct bpos, struct bkey_s); @@ -568,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst, BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); - #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7abe53be7dd3..e61f5e2fb695 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -675,7 +675,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bvec_iter iter; struct bio_vec bv; unsigned nr_ptrs = 
k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_allocated(k); + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? SECTOR_RESERVED : SECTOR_ALLOCATED; @@ -2543,7 +2543,7 @@ reassemble: } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(copy.k)); + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ret = bch2_disk_reservation_get(c, &disk_res, copy.k->k.size, nr_ptrs, @@ -2669,7 +2669,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); + reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, @@ -2680,7 +2680,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (reservation.v.nr_replicas < replicas || - bch2_extent_is_compressed(k)) { + bch2_bkey_sectors_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, replicas, 0); if (unlikely(ret)) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8f558347ca7f..6ceb8951c221 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -202,8 +202,8 @@ static int sum_sector_overwrites(struct btree_trans *trans, for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { if (!may_allocate && - bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { + bch2_bkey_nr_ptrs_fully_allocated(old) < + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { ret = -ENOSPC; break; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 5fd44dbe2722..17f0a89a7637 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -134,11 +134,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - + nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && - bch2_extent_is_compressed(k) && + bch2_bkey_sectors_compressed(k) && nr > 0) { ret = bch2_disk_reservation_add(c, &op->res, keylist_sectors(keys) * nr, 0); @@ -250,7 +250,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, */ #if 0 int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_dirty_ptrs(k); + bch2_bkey_nr_ptrs_allocated(k); #endif int nr = (int) io_opts.data_replicas; @@ -599,7 +599,7 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), + atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: bch2_btree_iter_next(iter); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9102a1ce1ec4..d4002b7fc917 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -254,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, * Some extents aren't equivalent - w.r.t. 
what the triggers do * - if they're split: */ - bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || + bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || k->k.type == KEY_TYPE_reflink_p; bool remark = false; int ret; @@ -289,7 +289,7 @@ retry: bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); -- cgit From bd7e82ee2ad45540797d3e36dba24f9824e431ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Nov 2019 16:16:57 -0500 Subject: bcachefs: kill ca->freelist_lock All uses were supposed to be switched over to c->freelist_lock Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/movinggc.c | 4 ++-- fs/bcachefs/super.c | 1 - fs/bcachefs/sysfs.c | 4 ++-- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 344cf982124f..3fa053531344 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -427,7 +427,6 @@ struct bch_dev { */ alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; - spinlock_t freelist_lock; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 84971fbfc722..c6159a34e509 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -107,10 +107,10 @@ static bool have_copygc_reserve(struct bch_dev *ca) { bool ret; - spin_lock(&ca->freelist_lock); + spin_lock(&ca->fs->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ca->allocator_state != ALLOCATOR_RUNNING; - spin_unlock(&ca->freelist_lock); + spin_unlock(&ca->fs->freelist_lock); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8c7b56a95f4b..3ad4c0ecbe12 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1059,7 +1059,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); - spin_lock_init(&ca->freelist_lock); bch2_dev_copygc_init(ca); INIT_WORK(&ca->io_error_work, bch2_io_error_work); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b0f09a31c41e..8d68331f8b63 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -775,7 +775,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) struct printbuf out = _PBUF(buf, PAGE_SIZE); enum alloc_reserve i; - spin_lock(&ca->freelist_lock); + spin_lock(&ca->fs->freelist_lock); pr_buf(&out, "free_inc:\t%zu\t%zu\n", fifo_used(&ca->free_inc), @@ -786,7 +786,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) fifo_used(&ca->free[i]), ca->free[i].size); - spin_unlock(&ca->freelist_lock); + spin_unlock(&ca->fs->freelist_lock); return out.pos - buf; } -- cgit From 5934a0caf2b5dc25a17ee652a95f25c86fffbcd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Nov 2019 14:51:30 -0500 Subject: bcachefs: bkey_on_stack_reassemble() Small helper function. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_on_stack.h | 8 ++++++++ fs/bcachefs/bkey_sort.c | 3 +-- fs/bcachefs/ec.c | 3 +-- fs/bcachefs/extent_update.c | 3 +-- fs/bcachefs/fs-io.c | 6 ++---- fs/bcachefs/io.c | 12 ++++-------- fs/bcachefs/migrate.c | 3 +-- fs/bcachefs/move.c | 3 +-- fs/bcachefs/reflink.c | 3 +-- 9 files changed, 20 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h index d4739038323f..f607a0cb37ed 100644 --- a/fs/bcachefs/bkey_on_stack.h +++ b/fs/bcachefs/bkey_on_stack.h @@ -19,6 +19,14 @@ static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, } } +static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bkey_on_stack_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + static inline void bkey_on_stack_init(struct bkey_on_stack *s) { s->k = (void *) s->onstack; diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index daef8e5c599f..2e205db5433d 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -356,12 +356,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - bkey_on_stack_realloc(&split, c, l.k->u64s); /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_reassemble(split.k, l.s_c); + bkey_on_stack_reassemble(&split, c, l.s_c); bch2_cut_back(bkey_start_pos(r.k), split.k); bch2_cut_front_s(r.k->p, l); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 654823a6da74..3b3b931dc6c9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -807,8 +807,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, dev = s->key.v.ptrs[idx].dev; - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); extent_for_each_ptr(e, ptr) { diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 91ceb5d53f92..742b4d78cb3a 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -347,7 +347,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, struct bkey_on_stack split; bkey_on_stack_init(&split); - bkey_on_stack_realloc(&split, c, k.k->u64s); + bkey_on_stack_reassemble(&split, c, k.s_c); /* * The insert key falls 'in the middle' of k @@ -363,7 +363,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, * modify k _before_ doing the insert (which will move * what k points to) */ - bkey_reassemble(split.k, k.s_c); split.k->k.needs_whiteout |= bkey_written(l->b, _k); bch2_cut_back(bkey_start_pos(&insert->k), split.k); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e61f5e2fb695..5beb47805a68 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -780,8 +780,7 @@ retry: if (ret) break; - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - @@ -2476,8 +2475,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_on_stack_realloc(©, c, k.k->u64s); - bkey_reassemble(copy.k, k); + bkey_on_stack_reassemble(©, c, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6ceb8951c221..5de554a883ef 100644 --- a/fs/bcachefs/io.c +++ 
b/fs/bcachefs/io.c @@ -1538,8 +1538,7 @@ retry: if (bkey_err(k)) goto err; - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); @@ -1590,8 +1589,7 @@ retry: BTREE_ITER_SLOTS, k, ret) { unsigned bytes, sectors, offset_into_extent; - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - @@ -1714,8 +1712,7 @@ retry: if (IS_ERR_OR_NULL(k.k)) goto out; - bkey_on_stack_realloc(&new, c, k.k->u64s); - bkey_reassemble(new.k, k); + bkey_on_stack_reassemble(&new, c, k); k = bkey_i_to_s_c(new.k); if (bversion_cmp(k.k->version, rbio->version) || @@ -2229,8 +2226,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); ret = bch2_read_indirect_extent(&trans, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 4dacbd637d02..4b59dcd04cce 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -60,8 +60,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 17f0a89a7637..0aebae33d299 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -578,8 +578,7 @@ peek: } /* unlock before doing IO: */ - bkey_on_stack_realloc(&sk, c, k.k->u64s); - bkey_reassemble(sk.k, k); + bkey_on_stack_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 4de65bf70362..2812fa305c0e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -225,8 +225,7 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type == KEY_TYPE_extent) { - bkey_on_stack_realloc(&new_src, c, src_k.k->u64s); - bkey_reassemble(new_src.k, src_k); + bkey_on_stack_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); bch2_cut_front(src_iter->pos, new_src.k); -- cgit From 9ba68f6cdc79b0eab707bf8b50f418da05b6ff5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2019 17:09:32 -0500 Subject: bcachefs: Switch to macro for bkey_ops Older versions of gcc refuse to compile it the other way Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5312184c37f7..ed448fad83c5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -75,10 +75,10 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); } -static const struct bkey_ops bch2_bkey_ops_inline_data = { - .key_invalid = key_type_inline_data_invalid, - .val_to_text = key_type_inline_data_to_text, -}; +#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ +} static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -- cgit From ba239c954e840875a912230439f28b4e4fbea8ff Mon Sep 
17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2019 13:47:42 -0500 Subject: bcachefs: bch2_check_set_feature() New helper function for setting incompatible feature bits Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 2 ++ fs/bcachefs/opts.c | 11 ++--------- fs/bcachefs/recovery.c | 6 ------ fs/bcachefs/reflink.c | 11 +---------- fs/bcachefs/super-io.c | 11 +++++++++++ fs/bcachefs/super-io.h | 27 +++++++-------------------- 6 files changed, 23 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5de554a883ef..6934a0339eb0 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1141,6 +1141,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; + bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_U64s + DIV_ROUND_UP(data_len, 8)); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 13a9a2fcd575..cbacd2f36799 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -299,15 +299,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ret = bch2_check_set_has_compressed_data(c, v); break; case Opt_erasure_code: - if (v && - !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->features[0] |= - cpu_to_le64(1ULL << BCH_FEATURE_EC); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } + if (v) + bch2_check_set_feature(c, BCH_FEATURE_EC); break; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d4002b7fc917..e6b51131cff2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -913,12 +913,6 @@ int bch2_fs_recovery(struct bch_fs *c) write_sb = true; } - if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) { - c->disk_sb.sb->features[0] |= - cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); - write_sb = true; - } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; write_sb = true; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 2812fa305c0e..53bd0e0ea058 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -171,16 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (!percpu_ref_tryget(&c->writes)) return -EROFS; - if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { - mutex_lock(&c->sb_lock); - if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { - c->disk_sb.sb->features[0] |= - cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); - - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); - } + bch2_check_set_feature(c, BCH_FEATURE_REFLINK); dst_end.offset += remap_sectors; src_end.offset += remap_sectors; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6544bbf18e70..cd1aa3891c2e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -795,6 +795,17 @@ out: return ret; } +void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << feat))) { + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + /* BCH_SB_FIELD_journal: */ static int u64_cmp(const void *_l, const void *_r) diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 31b8b8307ac3..402ae563b3c7 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -43,26 +43,6 @@ struct bch_sb_field_ops { struct bch_sb_field *); }; -static 
inline bool bch2_sb_test_feature(struct bch_sb *sb, - enum bch_sb_features f) -{ - unsigned w = f / 64; - unsigned b = f % 64; - - return le64_to_cpu(sb->features[w]) & (1ULL << b); -} - -static inline void bch2_sb_set_feature(struct bch_sb *sb, - enum bch_sb_features f) -{ - if (!bch2_sb_test_feature(sb, f)) { - unsigned w = f / 64; - unsigned b = f % 64; - - le64_add_cpu(&sb->features[w], 1ULL << b); - } -} - static inline __le64 bch2_sb_magic(struct bch_fs *c) { __le64 ret; @@ -90,6 +70,13 @@ const char *bch2_sb_validate(struct bch_sb_handle *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); +void __bch2_check_set_feature(struct bch_fs *, unsigned); + +static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + if (!(c->sb.features & (1ULL << feat))) + __bch2_check_set_feature(c, feat); +} /* BCH_SB_FIELD_journal: */ -- cgit From 07358a82bb36ff77d6a1e2e0e7fc9920b96a64d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2019 13:48:09 -0500 Subject: bcachefs: Put inline data behind a mount option for now Inline data extents + reflink is still broken Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 ++- fs/bcachefs/opts.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6934a0339eb0..e98ab738cd10 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1224,7 +1224,8 @@ void bch2_write(struct closure *cl) data_len = min_t(u64, bio->bi_iter.bi_size, op->new_i_size - (op->pos.offset << 9)); - if (data_len <= min(block_bytes(c) / 2, 1024U)) { + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { bch2_write_data_inline(op, data_len); return; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 2bd8bce43269..92a9b7e0f47f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -181,6 +181,11 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_128_BIT_MACS, false, \ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From c201e2d97615e7dc6f2f99dcdb8bf8d64657b761 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2019 17:18:23 -0500 Subject: bcachefs: Fix bch2_verify_insert_pos() We were calling __btree_node_key_to_offset() on a key that wasn't in the btree node. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 189a187bc080..a0bd6af67190 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -241,10 +241,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bch2_bkey_to_text(&PBUF(buf2), &k2); panic("prev > insert:\n" - "prev key %5u %s\n" - "insert key %5u %s\n", - __btree_node_key_to_offset(b, prev), buf1, - __btree_node_key_to_offset(b, insert), buf2); + "prev key %s\n" + "insert key %s\n", + buf1, buf2); } #endif #if 0 @@ -263,10 +262,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bch2_bkey_to_text(&PBUF(buf2), &k2); panic("insert > next:\n" - "insert key %5u %s\n" - "next key %5u %s\n", - __btree_node_key_to_offset(b, insert), buf1, - __btree_node_key_to_offset(b, next), buf2); + "insert key %s\n" + "next key %s\n", + buf1, buf2); } #endif } -- cgit From 183797e31d43ce2fbfc596ff3f4d034f1ba144d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Nov 2019 16:22:49 -0500 Subject: bcachefs: Always emit new extents on partial overwrite This is prep work for snapshots: the algorithm in bch2_extent_sort_fix_overlapping() will break when we have multiple overlapping extents in unrelated snapshots - but, we'll be able to make extents work like regular keys and use bch2_key_sort_fix_overlapping() for extent btree nodes if we make a couple changes - the main one being to always emit new extents when we partially overwrite an existing (written) extent. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 293 +++++++++++++++++++------------------------- 1 file changed, 125 insertions(+), 168 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 742b4d78cb3a..e021e1623a91 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -171,49 +171,51 @@ bch2_extent_can_insert(struct btree_trans *trans, { struct btree_iter_level *l = &insert->iter->l[0]; struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_s_c k; int sectors; - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k->k, k.k); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + if (bkey_written(l->b, _k) && + overlap != BCH_EXTENT_OVERLAP_ALL) + *u64s += _k->u64s; + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); + } } + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); } return BTREE_INSERT_OK; @@ -285,101 +287,106 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, } static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) +extent_drop(struct bch_fs *c, struct btree_iter *iter, + struct bkey_packed *_k, struct bkey_s k) { struct btree_iter_level *l = &iter->l[0]; - int u64s_delta; - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); + if (!bkey_whiteout(k.k)) + btree_account_key_drop(l->b, _k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - break; + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + k.k->needs_whiteout = false; - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; - EBUG_ON(bkey_deleted(k.k)); + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); + } else { extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } +} - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. 
- */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); +static void +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_on_stack tmp, split; - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; + bkey_on_stack_init(&tmp); + bkey_on_stack_init(&split); - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); } else { + btree_keys_account_val_delta(l->b, _k, + bch2_cut_front_s(insert->k.p, k)); + extent_save(l->b, _k, k.k); + /* + * No need to call bset_fix_invalidated_key, start of + * extent changed but extents are indexed by where they + * end + */ bch2_btree_iter_fix_key_modified(iter, l->b, _k); } - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - struct bkey_on_stack split; - - bkey_on_stack_init(&split); - bkey_on_stack_reassemble(&split, c, k.s_c); + case BCH_EXTENT_OVERLAP_BACK: + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. 
- * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - split.k->k.needs_whiteout |= bkey_written(l->b, _k); + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { + btree_keys_account_val_delta(l->b, _k, + bch2_cut_back_s(bkey_start_pos(&insert->k), k)); + extent_save(l->b, _k, k.k); + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + } + break; + case BCH_EXTENT_OVERLAP_ALL: + extent_drop(c, iter, _k, k); + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + bkey_on_stack_reassemble(&split, c, k.s_c); bch2_cut_back(bkey_start_pos(&insert->k), split.k); - BUG_ON(bkey_deleted(&split.k->k)); - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { + btree_keys_account_val_delta(l->b, _k, + bch2_cut_front_s(insert->k.p, k)); + + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } extent_bset_insert(c, iter, split.k); - bkey_on_stack_exit(&split, c); break; } - } + + bkey_on_stack_exit(&split, c); + bkey_on_stack_exit(&tmp, c); } /** @@ -429,10 +436,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, struct bkey_i *insert = insert_entry->k; struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; - bool deleting = bkey_whiteout(&insert->k); - bool update_journal = !deleting; - bool update_btree = !deleting; - struct bkey_i whiteout = *insert; + bool do_update = !bkey_whiteout(&insert->k); struct bkey_packed *_k; struct bkey unpacked; @@ -443,7 +447,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_discard))) { struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); @@ -451,52 +454,18 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, break; if (!bkey_whiteout(k.k)) - update_journal = true; + do_update = true; + + if (!do_update) { + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - if (!update_journal) { bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &whiteout); bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (deleting && - !update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_iter_fix_key_modified(iter, - l->b, _k); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - update_btree = true; - } - - if (update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; + } else { + insert->k.needs_whiteout |= k.k->needs_whiteout; + extent_squash(c, iter, insert, _k, k, overlap); } - 
extent_squash(c, iter, insert, _k, k, overlap); - - if (!update_btree) - bch2_cut_front(cur_end, insert); -next: node_iter = l->iter; if (overlap == BCH_EXTENT_OVERLAP_FRONT || @@ -507,24 +476,12 @@ next: l->iter = node_iter; bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - if (update_btree) { - if (deleting) + if (do_update) { + if (insert->k.type == KEY_TYPE_deleted) insert->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - extent_bset_insert(c, iter, insert); - } - - if (update_journal) { - struct bkey_i *k = !deleting ? insert : &whiteout; - - if (deleting) - k->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - - bch2_btree_journal_key(trans, iter, k); + bch2_btree_journal_key(trans, iter, insert); } bch2_cut_front(insert->k.p, insert); -- cgit From c9bebae65eade6529f9d3068a6da42fc56664bfe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2019 14:08:51 -0500 Subject: bcachefs: Whiteout changes More prep work for snapshots: extents will soon be using KEY_TYPE_deleted for whiteouts, with 0 size. But we won't be able to keep these whiteouts with the rest of the extents in the btree node, due to sorting invariants breaking. We can deal with this by immediately moving the new whiteouts to the unwritten whiteouts area - this just means those whiteouts won't be sorted, so we need new code to sort them prior to merging them with the rest of the keys to be written. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 - fs/bcachefs/btree_io.c | 99 ++++++++++++++++++++++++++++++------- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_interior.h | 29 +++++------ fs/bcachefs/btree_update_leaf.c | 45 +++++++++-------- 5 files changed, 119 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 035da548737b..8eed82ac41f1 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -558,7 +558,6 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - b->uncompacted_whiteout_u64s = 0; bch2_btree_keys_init(b, &c->expensive_debug_checks); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8532087f2754..9acf59c0710d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -79,6 +79,81 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); } +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) +{ + unsigned n = nr, a = nr / 2, b, c, d; + + if (!a) + return; + + /* Heap sort: see lib/sort.c: */ + while (1) { + if (a) + a--; + else if (--n) + swap(ptrs[0], ptrs[n]); + else + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) + b = bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && + bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; + c = b; + while (b != a) { + b = (b - 1) / 2; + swap(ptrs[b], ptrs[c]); + } + } +} + +static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bkey_packed *new_whiteouts, **whiteout_ptrs, *k; + bool used_mempool1 = false, used_mempool2 = false; + unsigned order, i, nr = 0; + + if (!b->whiteout_u64s) + return; + + order = get_order(b->whiteout_u64s * sizeof(u64)); + + new_whiteouts = btree_bounce_alloc(c, order, &used_mempool1); + whiteout_ptrs = btree_bounce_alloc(c, order, &used_mempool2); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_next(k)) + whiteout_ptrs[nr++] = k; + + sort_bkey_ptrs(b, whiteout_ptrs, nr); + + k = new_whiteouts; + + for (i = 0; i < nr; i++) { + bkey_copy(k, whiteout_ptrs[i]); + k = bkey_next(k); + } + + verify_no_dups(b, new_whiteouts, + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + + btree_bounce_free(c, order, used_mempool2, whiteout_ptrs); + btree_bounce_free(c, order, used_mempool1, new_whiteouts); +} + static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, bool compacting, enum compact_mode mode) @@ -116,6 +191,8 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, if (!whiteout_u64s) return false; + bch2_sort_whiteouts(c, b); + sort_iter_init(&sort_iter, b); whiteout_u64s += b->whiteout_u64s; @@ -171,11 +248,14 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, if (bkey_deleted(k) && btree_node_is_extents(b)) continue; + BUG_ON(bkey_whiteout(k) && + k->needs_whiteout && + bkey_written(b, k)); + if (bkey_whiteout(k) && !k->needs_whiteout) continue; if (bkey_whiteout(k)) { - unreserve_whiteout(b, k); memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); @@ -1342,21 +1422,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - /* - * We can't block on six_lock_write() here; another thread might be - * trying to get a journal reservation with read locks held, and getting - * a journal reservation might be blocked on flushing the journal and - * doing btree writes: - */ - if (lock_type_held == SIX_LOCK_intent && - six_trylock_write(&b->c.lock)) { - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); - six_unlock_write(&b->c.lock); - } else { - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); - } - - BUG_ON(b->uncompacted_whiteout_u64s); + bch2_sort_whiteouts(c, b); sort_iter_init(&sort_iter, b); @@ -1545,7 +1611,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) return false; BUG_ON(b->whiteout_u64s); - BUG_ON(b->uncompacted_whiteout_u64s); clear_btree_node_just_written(b); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3a26a8802e86..e370474fd8c2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -97,7 +97,6 @@ struct btree { struct btree_nr_keys nr; u16 sib_u64s[2]; u16 whiteout_u64s; - u16 uncompacted_whiteout_u64s; u8 page_order; u8 unpack_fn_len; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 85f1320fa7b1..8f9d4a0b68ea 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -251,8 
+251,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + - b->whiteout_u64s + - b->uncompacted_whiteout_u64s; + b->whiteout_u64s; ssize_t total = c->opts.btree_node_size << 6; return total - used; @@ -302,23 +301,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) +static inline void push_whiteout(struct bch_fs *c, struct btree *b, + struct bkey_packed *k) { - if (bkey_written(b, k)) { - EBUG_ON(b->uncompacted_whiteout_u64s < - bkeyp_key_u64s(&b->format, k)); - b->uncompacted_whiteout_u64s -= - bkeyp_key_u64s(&b->format, k); - } -} + unsigned u64s = bkeyp_key_u64s(&b->format, k); + struct bkey_packed *dst; -static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) -{ - if (bkey_written(b, k)) { - BUG_ON(!k->needs_whiteout); - b->uncompacted_whiteout_u64s += - bkeyp_key_u64s(&b->format, k); - } + BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + + b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); + dst = unwritten_whiteouts_start(c, b); + memcpy_u64s(dst, k, u64s); + dst->u64s = u64s; + dst->type = KEY_TYPE_deleted; } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e5405f0b372..d13f1fc75bdf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -104,38 +104,43 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, return true; } - insert->k.needs_whiteout = k->needs_whiteout; - btree_account_key_drop(b, k); - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; + if (bkey_whiteout(&insert->k)) { + unsigned clobber_u64s = k->u64s, new_u64s = k->u64s; + + k->type = KEY_TYPE_deleted; - /* - * If we're deleting, and the key we're deleting doesn't - * need a whiteout (it wasn't overwriting a key that had - * been written to disk) - just delete it: - */ - if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { + if (k->needs_whiteout) { + push_whiteout(iter->trans->c, b, k); + k->needs_whiteout = false; + } + + if (k >= btree_bset_last(b)->start) { bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, - k, clobber_u64s, 0); - return true; + new_u64s = 0; } + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, new_u64s); + return true; + + } + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; goto overwrite; } + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; k->type = KEY_TYPE_deleted; + /* + * XXX: we should be able to do this without two calls to + * bch2_btree_node_iter_fix: + */ bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); - - if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, k); - return true; - } else { - k->needs_whiteout = false; - } } else { /* * Deleting, but the key to delete wasn't found - nothing to do: -- cgit From c297a763e2dcf34fe94f74c633957306d28fe138 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2019 13:08:37 -0500 Subject: bcachefs: Refactor whiteouts compaction The whiteout compaction path - as opposed to just dropping whiteouts - is now only needed for extents, and soon will only be needed for extent btree nodes in the old format. 
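After this refactoring the entry point reduces to a dispatch on node type, with COMPACT_WRITTEN and COMPACT_WRITTEN_NO_WRITE_LOCK collapsed into COMPACT_ALL (condensed from the btree_io.c and btree_io.h hunks below):

    bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                                enum compact_mode mode)
    {
        return !btree_node_is_extents(b)
            ? bch2_drop_whiteouts(b, mode)
            : bch2_compact_extent_whiteouts(c, b, mode);
    }
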
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 22 --------- fs/bcachefs/bkey_sort.h | 2 - fs/bcachefs/btree_io.c | 112 ++++++++++++++++++++++++++++------------------ fs/bcachefs/btree_io.h | 13 +++--- fs/bcachefs/btree_types.h | 5 +++ 5 files changed, 80 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 2e205db5433d..4f614cde3267 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -530,28 +530,6 @@ unsigned bch2_sort_extents(struct bkey_packed *dst, return (u64 *) out - (u64 *) dst; } -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r); -} - -unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_key_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static inline int sort_extent_whiteouts_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 397009181eae..47a808670341 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -61,8 +61,6 @@ unsigned bch2_sort_keys(struct bkey_packed *, unsigned bch2_sort_extents(struct bkey_packed *, struct sort_iter *, bool); -unsigned bch2_sort_key_whiteouts(struct bkey_packed *, - struct sort_iter *); unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, struct sort_iter *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9acf59c0710d..6a658f2c6328 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -154,27 +154,26 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) btree_bounce_free(c, order, used_mempool1, new_whiteouts); } -static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, - enum compact_mode mode) +static bool should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, enum compact_mode mode) { - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + if (!bset_dead_u64s(b, t)) + return false; - if (mode == COMPACT_LAZY) { - if (should_compact_bset_lazy(b, t) || - (compacting && !bset_written(b, bset(b, t)))) - return dead_u64s; - } else { - if (bset_written(b, bset(b, t))) - return dead_u64s; + switch (mode) { + case COMPACT_LAZY: + return should_compact_bset_lazy(b, t) || + (compacting && !bset_written(b, bset(b, t))); + case COMPACT_ALL: + return true; + default: + BUG(); } - - return 0; } -bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) +static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + struct btree *b, + enum compact_mode mode) { const struct bkey_format *f = &b->format; struct bset_tree *t; @@ -184,9 +183,11 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, unsigned order, whiteout_u64s = 0, u64s; bool used_mempool, compacting = false; + BUG_ON(!btree_node_is_extents(b)); + for_each_bset(b, t) - whiteout_u64s += should_compact_bset(b, t, - whiteout_u64s != 0, mode); + if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) + whiteout_u64s += bset_dead_u64s(b, t); if (!whiteout_u64s) return false; @@ -215,9 +216,12 @@ bool __bch2_compact_whiteouts(struct bch_fs 
*c, struct btree *b, if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), - (void *) btree_bkey_last(b, t -1)); + (void *) btree_bkey_last(b, t - 1)); } + if (src != dst) + compacting = true; + if (!should_compact_bset(b, t, compacting, mode)) { if (src != dst) { memmove(dst, src, sizeof(*src) + @@ -245,7 +249,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, for (k = start; k != end; k = n) { n = bkey_next_skip_noops(k, end); - if (bkey_deleted(k) && btree_node_is_extents(b)) + if (bkey_deleted(k)) continue; BUG_ON(bkey_whiteout(k) && @@ -259,7 +263,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); - } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + } else { bkey_copy(out, k); out = bkey_next(out); } @@ -267,11 +271,9 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, sort_iter_add(&sort_iter, u_start, u_pos); - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - } + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); } b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; @@ -279,13 +281,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, BUG_ON((void *) unwritten_whiteouts_start(c, b) < (void *) btree_bkey_last(b, bset_tree_last(b))); - u64s = (btree_node_is_extents(b) - ? bch2_sort_extent_whiteouts - : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), - &sort_iter); + u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter); BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); BUG_ON(u_pos != whiteouts && !u64s); if (u64s != b->whiteout_u64s) { @@ -301,8 +300,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, btree_bounce_free(c, order, used_mempool, whiteouts); - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) - bch2_btree_build_aux_trees(b); + bch2_btree_build_aux_trees(b); bch_btree_keys_u64s_remaining(c, b); bch2_verify_btree_nr_keys(b); @@ -310,7 +308,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, return true; } -static bool bch2_drop_whiteouts(struct btree *b) +static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { struct bset_tree *t; bool ret = false; @@ -318,21 +316,34 @@ static bool bch2_drop_whiteouts(struct btree *b) for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && !bset_written(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t - 1)); + } + + if (src != dst) + ret = true; - if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) + if (!should_compact_bset(b, t, ret, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } continue; + } start = btree_bkey_first(b, t); end = btree_bkey_last(b, t); - if (!bset_written(b, i) && - t != b->set) { - struct bset *dst = - max_t(struct bset *, write_block(b), - (void *) btree_bkey_last(b, t -1)); - - memmove(dst, i, sizeof(struct bset)); - i = dst; + if (src != dst) { + memmove(dst, src, 
sizeof(*src)); + i = &dst->keys; set_btree_bset(b, t, i); } @@ -344,19 +355,32 @@ static bool bch2_drop_whiteouts(struct btree *b) if (!bkey_whiteout(k)) { bkey_copy(out, k); out = bkey_next(out); + } else { + BUG_ON(k->needs_whiteout); } } i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); bch2_bset_set_no_aux_tree(b, t); ret = true; } bch2_verify_btree_nr_keys(b); + bch2_btree_build_aux_trees(b); + return ret; } +bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + return !btree_node_is_extents(b) + ? bch2_drop_whiteouts(b, mode) + : bch2_compact_extent_whiteouts(c, b, mode); +} + static void btree_node_sort(struct bch_fs *c, struct btree *b, struct btree_iter *iter, unsigned start_idx, @@ -1631,7 +1655,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) btree_node_sort(c, b, NULL, 0, b->nsets, true); invalidated_iter = true; } else { - invalidated_iter = bch2_drop_whiteouts(b); + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); } for_each_bset(b, t) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 69516ec34b89..43fa8a6dbee5 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -54,16 +54,17 @@ static inline bool btree_node_may_write(struct btree *b) enum compact_mode { COMPACT_LAZY, - COMPACT_WRITTEN, - COMPACT_WRITTEN_NO_WRITE_LOCK, + COMPACT_ALL, }; -bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); +bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, + enum compact_mode); -static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) +static inline bool should_compact_bset_lazy(struct btree *b, + struct bset_tree *t) { unsigned total_u64s = bset_u64s(t); - unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; + unsigned dead_u64s = bset_dead_u64s(b, t); return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } @@ -74,7 +75,7 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) - return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); + return bch2_compact_whiteouts(c, b, COMPACT_LAZY); return false; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e370474fd8c2..5f0b55c98f86 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -423,6 +423,11 @@ static inline unsigned bset_u64s(struct bset_tree *t) sizeof(struct bset) / sizeof(u64); } +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) +{ + return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; +} + static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; -- cgit From 8f82280ea3871781e638d920d6dead58717bc13a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Dec 2019 15:55:29 -0500 Subject: bcachefs: Use one buffer for sorting whiteouts We're not really supposed to allocate from the same mempool more than once. 
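The fix is to carve both the scratch pointer array and the sorted result out of a single bounce buffer: pointers to the unsorted keys are pushed downward from the end of the allocation, sorted, and the sorted copies are then written out from the front. A minimal sketch of that layout, with fixed-size keys and qsort() standing in for the kernel-side pointer sort (illustrative only, not the bcachefs types or API):

    #include <stdlib.h>
    #include <string.h>

    struct key { unsigned long v; };

    static int key_ptr_cmp(const void *l, const void *r)
    {
        const struct key *a = *(const struct key * const *) l;
        const struct key *b = *(const struct key * const *) r;

        return (a->v > b->v) - (a->v < b->v);
    }

    /* Sort 'keys' using one allocation that holds the pointer array at its
     * tail and the sorted copies at its front, then copy the result back. */
    static void sort_keys_one_buffer(struct key *keys, size_t nr)
    {
        size_t bytes = nr * sizeof(struct key) + nr * sizeof(struct key *);
        void *buf = malloc(bytes);
        const struct key **ptrs;
        struct key *out = buf;
        size_t i;

        if (!buf)
            return;

        ptrs = (const struct key **) ((char *) buf + bytes);
        for (i = 0; i < nr; i++)
            *--ptrs = &keys[i];         /* pointer array grows down from the end */

        qsort(ptrs, nr, sizeof(*ptrs), key_ptr_cmp);

        for (i = 0; i < nr; i++)        /* sorted copies written from the front */
            out[i] = *ptrs[i];

        memcpy(keys, out, nr * sizeof(struct key));
        free(buf);
    }

The patch below does the same with variable-size packed bkeys: sort_bkey_ptrs(), added earlier in this series, orders the pointers with bkey_cmp_packed(), and the sorted whiteouts are then copied back over the unwritten whiteouts area.
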
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6a658f2c6328..90fc31037bbf 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -118,30 +118,32 @@ static void sort_bkey_ptrs(const struct btree *bt, static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) { - struct bkey_packed *new_whiteouts, **whiteout_ptrs, *k; - bool used_mempool1 = false, used_mempool2 = false; - unsigned order, i, nr = 0; + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; + unsigned order; if (!b->whiteout_u64s) return; order = get_order(b->whiteout_u64s * sizeof(u64)); - new_whiteouts = btree_bounce_alloc(c, order, &used_mempool1); - whiteout_ptrs = btree_bounce_alloc(c, order, &used_mempool2); + new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); + + ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); k = bkey_next(k)) - whiteout_ptrs[nr++] = k; + *--ptrs = k; - sort_bkey_ptrs(b, whiteout_ptrs, nr); + sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); k = new_whiteouts; - for (i = 0; i < nr; i++) { - bkey_copy(k, whiteout_ptrs[i]); + while (ptrs != ptrs_end) { + bkey_copy(k, *ptrs); k = bkey_next(k); + ptrs++; } verify_no_dups(b, new_whiteouts, @@ -150,8 +152,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); - btree_bounce_free(c, order, used_mempool2, whiteout_ptrs); - btree_bounce_free(c, order, used_mempool1, new_whiteouts); + btree_bounce_free(c, order, used_mempool, new_whiteouts); } static bool should_compact_bset(struct btree *b, struct bset_tree *t, -- cgit From ae2f17d5ad02bc85a31d09c4396e177581abbb1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Dec 2019 16:20:33 -0500 Subject: bcachefs: Kill btree_node_iter_large Long overdue cleanup - this converts btree_node_iter_large uses to sort_iter. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 256 +++++++++++++++--------------------------------- fs/bcachefs/bkey_sort.h | 26 ++--- fs/bcachefs/btree_io.c | 22 ++--- fs/bcachefs/super.c | 4 +- 4 files changed, 99 insertions(+), 209 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 4f614cde3267..23b51ef57303 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -5,90 +5,15 @@ #include "bset.h" #include "extents.h" -/* too many iterators, need to clean this up */ - -/* btree_node_iter_large: */ - -#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); -static inline bool -bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) +static inline bool sort_iter_end(struct sort_iter *iter) { return !iter->used; } -static inline struct bkey_packed * -bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - return bch2_btree_node_iter_large_end(iter) - ? 
NULL - : __btree_node_offset_to_key(b, iter->data->k); -} - -static void -bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, - struct btree *b) -{ - iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; - - EBUG_ON(!iter->used); - EBUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); - else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_large_advance(iter, b); - - return ret; -} - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set n = - ((struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }); - - __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); - } -} - -static void sort_key_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - i->k += __btree_node_offset_to_key(b, i->k)->u64s; - - while (i->k != i->end && - !__btree_node_offset_to_key(b, i->k)->u64s) - i->k++; - - if (i->k == i->end) - *i = iter->data[--iter->used]; -} - -/* regular sort_iters */ - -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - static inline void __sort_iter_sift(struct sort_iter *iter, unsigned from, sort_cmp_fn cmp) @@ -118,19 +43,29 @@ static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) { - return iter->used ? iter->data->k : NULL; + return !sort_iter_end(iter) ? iter->data->k : NULL; } -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +static inline void __sort_iter_advance(struct sort_iter *iter, + unsigned idx, sort_cmp_fn cmp) { - iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); + struct sort_iter_set *i = iter->data + idx; + + BUG_ON(idx >= iter->used); + + i->k = bkey_next_skip_noops(i->k, i->end); - BUG_ON(iter->data->k > iter->data->end); + BUG_ON(i->k > i->end); - if (iter->data->k == iter->data->end) - array_remove_item(iter->data, iter->used, 0); + if (i->k == i->end) + array_remove_item(iter->data, iter->used, idx); else - sort_iter_sift(iter, cmp); + __sort_iter_sift(iter, idx, cmp); +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + __sort_iter_advance(iter, 0, cmp); } static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, @@ -145,70 +80,50 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, } /* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. 
+ * If keys compare equal, compare by pointer order: */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter_large *iter, - struct btree *b) +static inline int key_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) { - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; - - if (iter->used < 2) - return false; - - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; + return bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); +} +static inline bool should_drop_next_key(struct sort_iter *iter) +{ /* * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. + * comes first; so if l->k compares equal to r->k then l->k is older + * and should be dropped. */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); + return iter->used >= 2 && + !bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) { struct bkey_packed *out = dst->start; + struct bkey_packed *k; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); - heap_resort(iter, key_sort_cmp, NULL); - - while (!bch2_btree_node_iter_large_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); + sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + while ((k = sort_iter_peek(iter))) { + if (!bkey_whiteout(k) && + !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); out = bkey_next(out); } - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp, NULL); + sort_iter_advance(iter, key_sort_fix_overlapping_cmp); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -221,29 +136,16 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, * Necessary for sort_fix_overlapping() - if there are multiple keys that * compare equal in different sets, we have to process them newest to oldest. 
*/ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter_large *iter, - struct btree *b, size_t i) +static inline int extent_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) { - heap_sift_down(iter, i, extent_sort_cmp, NULL); -} + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); -static inline void extent_sort_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); + return bkey_cmp(bkey_start_pos(&ul), + bkey_start_pos(&ur)) ?: + cmp_int((unsigned long) r, (unsigned long) l); } static void extent_sort_advance_prev(struct bkey_format *f, @@ -286,14 +188,14 @@ static void extent_sort_append(struct bch_fs *c, bkey_reassemble((void *) *prev, k.s_c); } -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +struct btree_nr_keys +bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) { + struct btree *b = iter->b; struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *lk, *rk; + struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; + struct bkey_packed *prev = NULL; struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; @@ -302,36 +204,32 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, memset(&nr, 0, sizeof(nr)); bkey_on_stack_init(&split); - heap_resort(iter, extent_sort_cmp, NULL); + sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); - while (!bch2_btree_node_iter_large_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - l = __bkey_disassemble(b, lk, &l_unpacked); + while (!sort_iter_end(iter)) { + l = __bkey_disassemble(b, _l->k, &l_unpacked); if (iter->used == 1) { extent_sort_append(c, f, &nr, dst->start, &prev, l); - extent_sort_next(iter, b, _l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); continue; } - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - r = __bkey_disassemble(b, rk, &r_unpacked); + r = __bkey_disassemble(b, _r->k, &r_unpacked); /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { extent_sort_append(c, f, &nr, dst->start, &prev, l); - extent_sort_next(iter, b, _l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); continue; } /* Skip 0 size keys */ if (!r.k->size) { - extent_sort_next(iter, b, _r); + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); continue; } @@ -348,13 +246,14 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, if (_l->k > _r->k) { /* l wins, trim r */ if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); } else { bch2_cut_front_s(l.k->p, r); - extent_save(b, rk, r.k); + extent_save(b, _r->k, r.k); + __sort_iter_sift(iter, 1, + extent_sort_fix_overlapping_cmp); } - - 
extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { /* @@ -364,15 +263,16 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, bch2_cut_back(bkey_start_pos(r.k), split.k); bch2_cut_front_s(r.k->p, l); - extent_save(b, lk, l.k); + extent_save(b, _l->k, l.k); - extent_sort_sift(iter, b, 0); + __sort_iter_sift(iter, 0, + extent_sort_fix_overlapping_cmp); extent_sort_append(c, f, &nr, dst->start, &prev, bkey_i_to_s(split.k)); } else { bch2_cut_back_s(bkey_start_pos(r.k), l); - extent_save(b, lk, l.k); + extent_save(b, _l->k, l.k); } } diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 47a808670341..458a051fdac5 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -2,20 +2,10 @@ #ifndef _BCACHEFS_BKEY_SORT_H #define _BCACHEFS_BKEY_SORT_H -struct btree_node_iter_large { - u16 used; - - struct btree_node_iter_set data[MAX_BSETS]; -}; - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, - struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - struct sort_iter { - struct btree *b; + struct btree *b; unsigned used; + unsigned size; struct sort_iter_set { struct bkey_packed *k, *end; @@ -24,27 +14,27 @@ struct sort_iter { static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) { - memset(iter, 0, sizeof(*iter)); iter->b = b; + iter->used = 0; + iter->size = ARRAY_SIZE(iter->data); } static inline void sort_iter_add(struct sort_iter *iter, struct bkey_packed *k, struct bkey_packed *end) { - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + BUG_ON(iter->used >= iter->size); if (k != end) iter->data[iter->used++] = (struct sort_iter_set) { k, end }; } struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bset *, struct btree *, - struct btree_node_iter_large *); +bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct sort_iter *); struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct btree *, - struct btree_node_iter_large *); + struct sort_iter *); struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 90fc31037bbf..209e20fbcd70 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -862,7 +862,7 @@ fsck_err: int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) { struct btree_node_entry *bne; - struct btree_node_iter_large *iter; + struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; struct bset *i; @@ -871,7 +871,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry int ret, retry_read = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - iter->used = 0; + sort_iter_init(iter, b); + iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, @@ -950,13 +951,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry if (blacklisted && !first) continue; - bch2_btree_node_iter_large_push(iter, b, - i->start, - vstruct_idx(i, whiteout_u64s)); + sort_iter_add(iter, i->start, + vstruct_idx(i, whiteout_u64s)); - bch2_btree_node_iter_large_push(iter, b, - vstruct_idx(i, whiteout_u64s), - vstruct_last(i)); + sort_iter_add(iter, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); } for (bne = write_block(b); @@ -971,9 +971,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_bset(b, 
b->set, &b->data->keys); - b->nr = btree_node_is_extents(b) - ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) - : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); + b->nr = (btree_node_is_extents(b) + ? bch2_extent_sort_fix_overlapping + : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3ad4c0ecbe12..2e23cc3dbf2f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -705,9 +705,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (bch2_fs_init_fault("fs_alloc")) goto err; - iter_size = sizeof(struct btree_node_iter_large) + + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * - sizeof(struct btree_node_iter_set); + sizeof(struct sort_iter_set); if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || -- cgit From b5a5c4c1033af72b94c1ba7a71f61e4231f27832 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2019 15:37:47 -0500 Subject: bcachefs: Fix a null ptr deref in btree_iter_traverse_one() When traversing nodes and we've reached the end of the btree, the current btree node will be NULL. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f05a5e718181..34443d18bc42 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1148,7 +1148,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; bch2_btree_trans_verify_locks(iter->trans); - __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + if (btree_iter_node(iter, iter->level)) + __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } -- cgit From c45d473df77f41a104ecf78275be49b67f6d6295 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Dec 2019 13:18:33 -0500 Subject: bcachefs: Fix for an assertion on filesystem error Normally the in memory i_size is always greater than or equal to i_size on disk; this doesn't hold on filesystem error. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 5beb47805a68..c30c028c869d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2255,6 +2255,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ret = PTR_ERR_OR_ZERO(iter); bch2_trans_exit(&trans); + if (ret) + goto err; + + /* + * check this before next assertion; on filesystem error our normal + * invariants are a bit broken (truncate has to truncate the page cache + * before the inode). + */ + ret = bch2_journal_error(&c->journal); if (ret) goto err; -- cgit From 184b1dc1a6bf4bc53a1c71bf14120498aad67ff5 Mon Sep 17 00:00:00 2001 From: Justin Husted Date: Mon, 11 Nov 2019 20:14:30 -0800 Subject: bcachefs: Update directory timestamps during link Timestamp updates on the directory during a link operation were cached. This is inconsistent with other metadata operations such as rename, as well as being less efficient. 
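With this change bch2_link_trans() takes the directory inode as an in/out parameter (dir_u), bumps its mtime/ctime inside the transaction, and writes it back, so __bch2_link() can update the cached VFS inode after a successful commit. Condensed from the fs.c hunk below:

    ret = bch2_link_trans(&trans, dir->v.i_ino,
                          inode->v.i_ino, &dir_u, &inode_u,
                          &dentry->d_name) ?:
          bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq,
                            /* commit flags unchanged */
                            BTREE_INSERT_NOUNLOCK);

    if (likely(!ret)) {
        journal_seq_copy(inode, dir->ei_journal_seq);
        bch2_inode_update_after_write(c, dir, &dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
        bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
    }
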
Signed-off-by: Justin Husted Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 12 ++++++------ fs/bcachefs/fs-common.h | 1 + fs/bcachefs/fs.c | 12 +++++++++--- fs/bcachefs/fsck.c | 4 ++-- 4 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index a4497eeb1f1b..96f7bbe0a3ed 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -76,11 +76,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, } int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + u64 inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, const struct qstr *name) { struct btree_iter *dir_iter, *inode_iter; - struct bch_inode_unpacked dir_u; struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); @@ -91,18 +90,19 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); if (IS_ERR(dir_iter)) return PTR_ERR(dir_iter); - /* XXX: shouldn't we be updating mtime/ctime on the directory? */ + dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_hash = bch2_hash_info_init(trans->c, &dir_u); + dir_hash = bch2_hash_info_init(trans->c, dir_u); bch2_trans_iter_put(trans, dir_iter); return bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); } diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index c1621485a526..2273b7961c9b 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -14,6 +14,7 @@ int bch2_create_trans(struct btree_trans *, u64, int bch2_link_trans(struct btree_trans *, u64, u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, const struct qstr *); int bch2_unlink_trans(struct btree_trans *, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e8cdae3c114b..c20eaa7418c2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -395,7 +395,7 @@ static int __bch2_link(struct bch_fs *c, struct dentry *dentry) { struct btree_trans trans; - struct bch_inode_unpacked inode_u; + struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); @@ -405,7 +405,7 @@ static int __bch2_link(struct bch_fs *c, bch2_trans_begin(&trans); ret = bch2_link_trans(&trans, dir->v.i_ino, - inode->v.i_ino, &inode_u, + inode->v.i_ino, &dir_u, &inode_u, &dentry->d_name) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, @@ -413,8 +413,14 @@ static int __bch2_link(struct bch_fs *c, BTREE_INSERT_NOUNLOCK); } while (ret == -EINTR); - if (likely(!ret)) + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + + journal_seq_copy(inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + } bch2_trans_exit(&trans); mutex_unlock(&inode->ei_update_lock); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0f2308e53d65..3ae545b31c7a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -80,7 +80,7 @@ static int reattach_inode(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, u64 inum) { - struct bch_inode_unpacked inode_u; + struct bch_inode_unpacked dir_u, inode_u; char name_buf[20]; struct 
qstr name; int ret; @@ -92,7 +92,7 @@ static int reattach_inode(struct bch_fs *c, BTREE_INSERT_ATOMIC| BTREE_INSERT_LAZY_RW, bch2_link_trans(&trans, lostfound_inode->bi_inum, - inum, &inode_u, &name)); + inum, &dir_u, &inode_u, &name)); if (ret) bch_err(c, "error %i reattaching inode %llu", ret, inum); -- cgit From 22502ac23a2eaa3714b77d4a9242df352a9cd0c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2019 17:53:59 -0500 Subject: bcachefs: Redo filesystem usage ioctls When disk space accounting was changed to be tracked by replicas entry, the ioctl interface was never updated: this patch finally does that. Additionally, the BCH_IOCTL_USAGE ioctl is now broken out into separate ioctls for filesystem and device usage. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 + fs/bcachefs/bcachefs_ioctl.h | 70 +++++++++++-------- fs/bcachefs/chardev.c | 152 +++++++++++++++++++++++++----------------- fs/bcachefs/replicas.h | 3 - 4 files changed, 136 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e3004593874c..9b8fc265a5c0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1084,6 +1084,9 @@ struct bch_replicas_entry { __u8 devs[]; } __attribute__((packed)); +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + struct bch_sb_field_replicas { struct bch_sb_field field; struct bch_replicas_entry entries[]; diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 8c0599618404..d7f25e52dc71 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -68,7 +68,8 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) #define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) #define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) +#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) +#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) @@ -224,46 +225,59 @@ struct bch_ioctl_data_event { }; } __attribute__((packed, aligned(8))); -struct bch_ioctl_dev_usage { - __u8 state; - __u8 alive; - __u8 pad[6]; - __u32 dev; +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; +} __attribute__((packed)); - __u32 bucket_size; - __u64 nr_buckets; - - __u64 buckets[BCH_DATA_NR]; - __u64 sectors[BCH_DATA_NR]; -}; +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) +{ + return (void *) u + replicas_entry_bytes(&u->r) + 8; +} +/* + * BCH_IOCTL_FS_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. 
+ * + * Returns -ERANGE if @replica_entries_bytes was too small + */ struct bch_ioctl_fs_usage { __u64 capacity; __u64 used; __u64 online_reserved; __u64 persistent_reserved[BCH_REPLICAS_MAX]; - __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; + + __u32 replica_entries_bytes; + __u32 pad; + + struct bch_replicas_usage replicas[0]; }; /* - * BCH_IOCTL_USAGE: query filesystem disk space usage - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device + * BCH_IOCTL_DEV_USAGE: query device disk space usage * - * @nr_devices - number of devices userspace allocated space for in @devs - * - * On success, @fs and @devs will be filled out appropriately and devs[i].alive - * will indicate if a device was present in that slot - * - * Returns -ERANGE if @nr_devices was too small + * Returns disk space usage broken out by data type - both by buckets and + * sectors. */ -struct bch_ioctl_usage { - __u16 nr_devices; - __u16 pad[3]; +struct bch_ioctl_dev_usage { + __u64 dev; + __u32 flags; + __u8 state; + __u8 pad[7]; + + __u32 bucket_size; + __u64 nr_buckets; - struct bch_ioctl_fs_usage fs; - struct bch_ioctl_dev_usage devs[0]; + __u64 buckets[BCH_DATA_NR]; + __u64 sectors[BCH_DATA_NR]; }; /* diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4d8331022648..084bef5e7997 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "chardev.h" #include "move.h" +#include "replicas.h" #include "super.h" #include "super-io.h" @@ -371,89 +372,116 @@ err: return ret; } -static long bch2_ioctl_usage(struct bch_fs *c, - struct bch_ioctl_usage __user *user_arg) +static long bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_usage arg; - struct bch_dev *ca; - unsigned i, j; - int ret; + struct bch_ioctl_fs_usage *arg = NULL; + struct bch_replicas_usage *dst_e, *dst_end; + struct bch_fs_usage_online *src; + u32 replica_entries_bytes; + unsigned i; + int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EINVAL; - if (copy_from_user(&arg, user_arg, sizeof(arg))) + if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - for (i = 0; i < arg.nr_devices; i++) { - struct bch_ioctl_dev_usage dst = { .alive = 0 }; + arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + if (!arg) + return -ENOMEM; - ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); - if (ret) - return ret; + src = bch2_fs_usage_read(c); + if (!src) { + ret = -ENOMEM; + goto err; } - { - struct bch_fs_usage_online *src; - struct bch_ioctl_fs_usage dst = { - .capacity = c->capacity, - }; + arg->capacity = c->capacity; + arg->used = bch2_fs_sectors_used(c, src); + arg->online_reserved = src->online_reserved; - src = bch2_fs_usage_read(c); - if (!src) - return -ENOMEM; + for (i = 0; i < BCH_REPLICAS_MAX; i++) + arg->persistent_reserved[i] = src->u.persistent_reserved[i]; - dst.used = bch2_fs_sectors_used(c, src); - dst.online_reserved = src->online_reserved; + dst_e = arg->replicas; + dst_end = (void *) arg->replicas + replica_entries_bytes; - percpu_up_read(&c->mark_lock); + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *src_e = + cpu_replicas_entry(&c->replicas, i); - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - dst.persistent_reserved[i] = - src->u.persistent_reserved[i]; -#if 0 - for (j = 0; j < BCH_DATA_NR; j++) - dst.sectors[j][i] = src.replicas[i].data[j]; -#endif + if (replicas_usage_next(dst_e) > dst_end) { + ret = 
-ERANGE; break; } - kfree(src); + dst_e->sectors = src->u.replicas[i]; + dst_e->r = *src_e; + + /* recheck after setting nr_devs: */ + if (replicas_usage_next(dst_e) > dst_end) { + ret = -ERANGE; + break; + } - ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); - if (ret) - return ret; + memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + + dst_e = replicas_usage_next(dst_e); } - for_each_member_device(ca, c, i) { - struct bch_dev_usage src = bch2_dev_usage_read(c, ca); - struct bch_ioctl_dev_usage dst = { - .alive = 1, - .state = ca->mi.state, - .bucket_size = ca->mi.bucket_size, - .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket, - }; - - if (ca->dev_idx >= arg.nr_devices) { - percpu_ref_put(&ca->ref); - return -ERANGE; - } + arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; - if (percpu_ref_tryget(&ca->io_ref)) { - dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev); - percpu_ref_put(&ca->io_ref); - } + percpu_up_read(&c->mark_lock); + kfree(src); - for (j = 0; j < BCH_DATA_NR; j++) { - dst.buckets[j] = src.buckets[j]; - dst.sectors[j] = src.sectors[j]; - } + if (!ret) + ret = copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); +err: + kfree(arg); + return ret; +} + +static long bch2_ioctl_dev_usage(struct bch_fs *c, + struct bch_ioctl_dev_usage __user *user_arg) +{ + struct bch_ioctl_dev_usage arg; + struct bch_dev_usage src; + struct bch_dev *ca; + unsigned i; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; - ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); - if (ret) - return ret; + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(c, ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + + for (i = 0; i < BCH_DATA_NR; i++) { + arg.buckets[i] = src.buckets[i]; + arg.sectors[i] = src.sectors[i]; } - return 0; + percpu_ref_put(&ca->ref); + + return copy_to_user(user_arg, &arg, sizeof(arg)); } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -547,8 +575,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); - case BCH_IOCTL_USAGE: - return bch2_ioctl_usage(c, arg); + case BCH_IOCTL_FS_USAGE: + return bch2_ioctl_fs_usage(c, arg); + case BCH_IOCTL_DEV_USAGE: + return bch2_ioctl_dev_usage(c, arg); } if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 0d6e19126021..8527d82841bb 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -72,9 +72,6 @@ int bch2_replicas_set_usage(struct bch_fs *, /* iterate over superblock replicas - used by userspace tools: */ -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - #define replicas_entry_next(_i) \ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) -- cgit From 187c71f6ab439582c80433ef9e04f615b8c0f576 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Dec 2019 17:34:36 -0500 Subject: bcachefs: Fix a memory splat In __bch2_sb_field_resize, when a field's old and new size was 0, we were doing an invalid write just past the end of the superblock. 
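As an illustration (a simplified sketch of the old code shown in the diff below, not new behaviour): resizing a field that doesn't exist yet to zero u64s still wrote the field header:

	if (!f) {
		f = vstruct_last(sb->sb);		/* may already sit at the very end of the buffer */
		memset(f, 0, sizeof(u64) * u64s);	/* writes nothing when u64s == 0... */
		f->u64s = cpu_to_le32(u64s);		/* ...but this header store lands past the end */
	}

The fix adds an explicit "nothing to do" branch for the !f && !u64s case.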
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index cd1aa3891c2e..4c1e8571d872 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -54,7 +54,9 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > sb->page_order); - if (!f) { + if (!f && !u64s) { + /* nothing to do: */ + } else if (!f) { f = vstruct_last(sb->sb); memset(f, 0, sizeof(u64) * u64s); f->u64s = cpu_to_le32(u64s); -- cgit From 5873efbfd9c3f53312aaa6c3024592a2a344f615 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Dec 2019 15:07:51 -0500 Subject: bcachefs: Make io timers less buggy Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 43 +++++++++++++++++++++++++++++-------------- fs/bcachefs/clock.h | 6 ++++-- fs/bcachefs/clock_types.h | 1 + fs/bcachefs/sysfs.c | 12 ++++++++++++ 4 files changed, 46 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index e227753563ab..51286520c5c7 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -18,6 +18,14 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) size_t i; spin_lock(&clock->timer_lock); + + if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + timer->expire)) { + spin_unlock(&clock->timer_lock); + timer->fn(timer); + return; + } + for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) goto out; @@ -135,26 +143,31 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, return ret; } -void __bch2_increment_clock(struct io_clock *clock) +void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now; - unsigned sectors; + unsigned long now = atomic_long_add_return(sectors, &clock->now); - /* Buffer up one megabyte worth of IO in the percpu counter */ - preempt_disable(); + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} - if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { - preempt_enable(); - return; - } +ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) +{ + struct printbuf out = _PBUF(buf, PAGE_SIZE); + unsigned long now; + unsigned i; - sectors = this_cpu_xchg(*clock->pcpu_buf, 0); - preempt_enable(); - now = atomic_long_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); + now = atomic_long_read(&clock->now); - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); + for (i = 0; i < clock->timers.used; i++) + pr_buf(&out, "%pf:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); + + return out.pos - buf; } void bch2_io_clock_exit(struct io_clock *clock) @@ -168,6 +181,8 @@ int bch2_io_clock_init(struct io_clock *clock) atomic_long_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); + clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) return -ENOMEM; diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index bfbbca8a207b..da50afe206cc 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -7,7 +7,7 @@ void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, unsigned long); 
-void __bch2_increment_clock(struct io_clock *); +void __bch2_increment_clock(struct io_clock *, unsigned); static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) @@ -16,7 +16,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= IO_CLOCK_PCPU_SECTORS)) - __bch2_increment_clock(clock); + __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); @@ -30,6 +30,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); __ret; \ }) +ssize_t bch2_io_timers_show(struct io_clock *, char *); + void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 2b5e499e12b4..92c740a47565 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -28,6 +28,7 @@ typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { atomic_long_t now; u16 __percpu *pcpu_buf; + unsigned max_slop; spinlock_t timer_lock; io_timer_heap timers; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8d68331f8b63..767fd7bed2d0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -18,6 +18,7 @@ #include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" +#include "clock.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -198,6 +199,9 @@ rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); +read_attribute(io_timers_read); +read_attribute(io_timers_write); + #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ @@ -404,6 +408,11 @@ SHOW(bch2_fs) if (attr == &sysfs_new_stripes) return bch2_new_stripes(c, buf); + if (attr == &sysfs_io_timers_read) + return bch2_io_timers_show(&c->io_clock[READ], buf); + if (attr == &sysfs_io_timers_write) + return bch2_io_timers_show(&c->io_clock[WRITE], buf); + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM @@ -581,6 +590,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, -- cgit From 309c54c3f4151be97ff27b38e08de51a71ea0377 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2019 16:19:46 -0500 Subject: bcachefs: Redo copygc throttling The code that checked the current free space and waited if it was too big was causing issues - btree node allocations do not increment the write IO clock (perhaps they should); but more broadly the check wouldn't run copygc at all until the device was mostly full, at which point it might have to do a bunch of work. This redoes that logic so that copygc starts to run earlier, smoothly running more and more often as the device becomes closer to full. 
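As a rough sketch (variable names simplified; see bch2_copygc_wait_amount() in the diff below for the real code), the new throttle is:

	/* fragmented space we tolerate before running copygc: */
	fragmented_allowed = copygc_reserve + (free_sectors / 2);
	wait = max(0, fragmented_allowed - fragmented_sectors);

so the allowance shrinks towards just the copygc reserve as the device fills up, and the copygc thread only sleeps on the write IO clock when the computed wait exceeds the clock's max_slop.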
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 52 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index c6159a34e509..7e08a7940a35 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -212,14 +212,36 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) buckets_to_move, buckets_not_moved); } +/* + * Copygc runs when the amount of fragmented data is above some arbitrary + * threshold: + * + * The threshold at the limit - when the device is full - is the amount of space + * we reserved in bch2_recalc_capacity; we can't have more than that amount of + * disk space stranded due to fragmentation and store everything we have + * promised to store. + * + * But we don't want to be running copygc unnecessarily when the device still + * has plenty of free space - rather, we want copygc to smoothly run every so + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. + */ +unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); + u64 fragmented_allowed = ca->copygc_threshold + + ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); + + return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); +} + static int bch2_copygc_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; struct io_clock *clock = &c->io_clock[WRITE]; - struct bch_dev_usage usage; - unsigned long last; - u64 available, fragmented, reserve, next; + unsigned long last, wait; set_freezable(); @@ -228,28 +250,10 @@ static int bch2_copygc_thread(void *arg) break; last = atomic_long_read(&clock->now); + wait = bch2_copygc_wait_amount(ca); - reserve = ca->copygc_threshold; - - usage = bch2_dev_usage_read(c, ca); - - available = __dev_buckets_available(ca, usage) * - ca->mi.bucket_size; - if (available > reserve) { - next = last + available - reserve; - bch2_kthread_io_clock_wait(clock, next, - MAX_SCHEDULE_TIMEOUT); - continue; - } - - /* - * don't start copygc until there's more than half the copygc - * reserve of fragmented space: - */ - fragmented = usage.sectors_fragmented; - if (fragmented < reserve) { - next = last + reserve - fragmented; - bch2_kthread_io_clock_wait(clock, next, + if (wait > clock->max_slop) { + bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; } -- cgit From 780c4e43f8f8986bb8d97d654cb17edd0dfca4b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2019 16:26:27 -0500 Subject: bcachefs: Drop a faulty assertion This assertion was wrong for interior nodes (and wasn't terribly useful to begin with) Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 34443d18bc42..06a087b91e33 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -457,11 +457,6 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, "cur key %s\n", iter->pos.inode, iter->pos.offset, buf); } - - BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - btree_iter_type(iter) == BTREE_ITER_KEYS && - !bkey_whiteout(&iter->k) && - bch2_btree_node_iter_end(&l->iter)); } void bch2_btree_iter_verify(struct 
btree_iter *iter, struct btree *b) -- cgit From a8abd3a7f63efe4a366ae5aba10b2466feba39d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2019 16:35:24 -0500 Subject: bcachefs: bch2_trans_reset() calls should be at the tops of loops It needs to be called when we get -EINTR due to e.g. lock restart - this fixes a transaction iterators overflow bug. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 5 ----- fs/bcachefs/btree_update_leaf.c | 2 ++ fs/bcachefs/fs-io.c | 4 ++-- fs/bcachefs/io.c | 8 ++++---- fs/bcachefs/reflink.c | 3 ++- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index a05e542b3792..2bbf714c9698 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -299,11 +299,6 @@ static inline void bch2_trans_begin(struct btree_trans *trans) return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); } -static inline void bch2_trans_begin_updates(struct btree_trans *trans) -{ - return bch2_trans_reset(trans, TRANS_RESET_MEM); -} - void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d13f1fc75bdf..55f785dadaac 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -869,6 +869,8 @@ retry: bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; + bch2_trans_reset(trans, TRANS_RESET_MEM); + bkey_init(&delete.k); /* diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c30c028c869d..5656e26540fa 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2650,6 +2650,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; + bch2_trans_reset(&trans, TRANS_RESET_MEM); + k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -2696,8 +2698,6 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, &reservation.k_i, &disk_res, &inode->ei_journal_seq, 0, &i_sectors_delta); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index e98ab738cd10..6e0444f3c4f9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -336,6 +336,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_disk_reservation_init(c, 0); struct bkey_i delete; + bch2_trans_reset(trans, TRANS_RESET_MEM); + ret = bkey_err(k); if (ret) goto btree_err; @@ -347,8 +349,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - bch2_trans_begin_updates(trans); - ret = bch2_extent_update(trans, iter, &delete, &disk_res, journal_seq, 0, i_sectors_delta); @@ -410,14 +410,14 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { + bch2_trans_reset(&trans, TRANS_RESET_MEM); + k = bch2_keylist_front(keys); bkey_on_stack_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); - bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 53bd0e0ea058..a65ada691ba1 
100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,7 +185,8 @@ s64 bch2_remap_range(struct bch_fs *c, BTREE_ITER_INTENT); while (1) { - bch2_trans_begin_updates(&trans); + bch2_trans_reset(&trans, TRANS_RESET_MEM); + trans.mem_top = 0; if (fatal_signal_pending(current)) { -- cgit From b1fd23df1deda45a408d007aa0b105569d12b907 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Dec 2019 23:04:30 -0500 Subject: bcachefs: Convert all bch2_trans_commit() users to BTREE_INSERT_ATOMIC BTREE_INSERT_ATOMIC should really be the default mode, and there's not that much code that doesn't need it - so this is prep work for getting rid of the flag. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 24 +++++--- fs/bcachefs/btree_update_leaf.c | 41 ++++++------- fs/bcachefs/dirent.c | 12 ---- fs/bcachefs/dirent.h | 2 - fs/bcachefs/ec.c | 10 +++- fs/bcachefs/fs-io.c | 6 +- fs/bcachefs/fsck.c | 127 +++++++++++++++++++++++----------------- fs/bcachefs/inode.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/xattr.c | 3 +- 10 files changed, 124 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index ad8cbf3fb778..d72da179f866 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -131,24 +131,34 @@ static inline void bch2_trans_update(struct btree_trans *trans, }; } -#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ +#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ + _flags, _reset_flags, _do) \ ({ \ - struct btree_trans trans; \ int _ret; \ \ - bch2_trans_init(&trans, (_c), 0, 0); \ - \ do { \ - bch2_trans_begin(&trans); \ + bch2_trans_reset(_trans, _reset_flags); \ \ - _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ + _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ (_journal_seq), (_flags)); \ } while (_ret == -EINTR); \ \ - bch2_trans_exit(&trans); \ _ret; \ }) +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret, _ret2; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ + TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ + _ret2 = bch2_trans_exit(&trans); \ + \ + _ret ?: _ret2; \ +}) + #define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 55f785dadaac..1112bdb689dc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -823,6 +823,20 @@ err: goto retry; } +static int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) +{ + struct btree_iter *iter; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + bch2_trans_update(trans, iter, k); + return 0; +} + /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -831,29 +845,12 @@ err: * @hook: insert callback */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, int flags) { - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, id, 
bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - bch2_trans_update(&trans, iter, k); - - ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); - if (ret == -EINTR) - goto retry; - bch2_trans_exit(&trans); - - return ret; + return bch2_trans_do(c, disk_res, journal_seq, flags, + __bch2_btree_insert(&trans, id, k)); } int bch2_btree_delete_at_range(struct btree_trans *trans, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 38017699c04a..1bf53c55912d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -281,18 +281,6 @@ int bch2_dirent_delete_at(struct btree_trans *trans, hash_info, iter); } -int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, - u64 *journal_seq) -{ - return bch2_trans_do(c, journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, - dir_inum, name)); -} - struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e6184dc796d3..34769371dd13 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -36,8 +36,6 @@ int bch2_dirent_create(struct btree_trans *, u64, int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, struct btree_iter *); -int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *, u64 *); enum bch_rename_mode { BCH_RENAME, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 3b3b931dc6c9..ae07af49af02 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1235,6 +1235,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, bch2_trans_update(trans, iter, &new_key->k_i); return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL|flags); } @@ -1259,8 +1260,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (!m->dirty) continue; - ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, - new_key, flags); + do { + bch2_trans_reset(&trans, TRANS_RESET_MEM); + + ret = __bch2_stripe_write_key(&trans, iter, m, + giter.pos, new_key, flags); + } while (ret == -EINTR); + if (ret) break; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 5656e26540fa..f766bbc35cee 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2465,9 +2465,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos next_pos; struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE; + unsigned commit_flags = 0; k = insert ? 
bch2_btree_iter_peek_prev(src) @@ -2560,6 +2558,8 @@ reassemble: ret = bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| commit_flags); bch2_disk_reservation_put(c, &disk_res); bkey_err: diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3ae545b31c7a..a0fdd2ba92f6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -37,8 +37,8 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } -static int remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int __remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) { struct bch_fs *c = trans->c; struct qstr name; @@ -49,31 +49,41 @@ static int remove_dirent(struct btree_trans *trans, char *buf; name.len = bch2_dirent_name_bytes(dirent); - buf = kmalloc(name.len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + buf = bch2_trans_kmalloc(trans, name.len + 1); + if (IS_ERR(buf)) + return PTR_ERR(buf); memcpy(buf, dirent.v->d_name, name.len); buf[name.len] = '\0'; name.name = buf; - /* Unlock so we don't deadlock, after copying name: */ - bch2_trans_unlock(trans); - - ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); - if (ret) { + ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); + if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); - goto err; - } + if (ret) + return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); - if (ret) + ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, + &dir_hash_info, dir_inum, &name); + if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i deleting dirent", ret); -err: - kfree(buf); - return ret; + if (ret) + return ret; + + return 0; +} + +static int remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) +{ + return __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + __remove_dirent(trans, dirent)); } static int reattach_inode(struct bch_fs *c, @@ -88,7 +98,7 @@ static int reattach_inode(struct bch_fs *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_trans_do(c, NULL, + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_LAZY_RW, bch2_link_trans(&trans, lostfound_inode->bi_inum, @@ -171,27 +181,27 @@ static int hash_redo_key(const struct bch_hash_desc desc, struct btree_iter *k_iter, struct bkey_s_c k, u64 hashed) { + struct bkey_i delete; struct bkey_i *tmp; - int ret = 0; - tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!tmp) - return -ENOMEM; + bch2_trans_reset(trans, TRANS_RESET_MEM); + + tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); bkey_reassemble(tmp, k); - ret = bch2_btree_delete_at(trans, k_iter, 0); - if (ret) - goto err; + bkey_init(&delete.k); + delete.k.p = k_iter->pos; + bch2_trans_update(trans, k_iter, &delete); - bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE); - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); -err: - kfree(tmp); - return ret; + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } static int fsck_hash_delete_at(struct btree_trans 
*trans, @@ -313,9 +323,11 @@ static int hash_check_key(struct btree_trans *trans, "hashed to %llu chain starts at %llu\n%s", desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { + do { + ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); + } while (ret == -EINTR); + if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -376,11 +388,12 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - bch2_trans_update(trans, iter, &d->k_i); - - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &d->k_i), 0)); if (ret) goto err; @@ -402,8 +415,11 @@ err_redo: k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - ret = hash_redo_key(bch2_dirent_hash_desc, trans, - h, iter, *k, hash); + do { + ret = hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash); + } while (ret == -EINTR); + if (ret) bch_err(c, "hash_redo_key err %i", ret); else @@ -646,11 +662,12 @@ retry: bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - bch2_trans_update(&trans, iter, &n->k_i); - - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(&trans, iter, &n->k_i), 0)); kfree(n); if (ret) goto err; @@ -790,7 +807,7 @@ fsck_err: create_lostfound: bch2_inode_init_early(c, lostfound_inode); - ret = bch2_trans_do(c, NULL, + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, @@ -1261,12 +1278,14 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); - bch2_trans_update(trans, iter, &p.inode.k_i); - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret && ret != -EINTR) + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &p.inode.k_i), 0)); + if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 67555db01dc4..e2407dcbcb35 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -533,7 +533,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { - return bch2_trans_do(c, NULL, 0, + return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e6b51131cff2..a3ee2f474952 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1009,7 +1009,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC, 
bch2_create_trans(&trans, BCACHEFS_ROOT_INO, &root_inode, &lostfound_inode, &lostfound, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5aeff1012f8b..6cef6c14fc89 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -328,7 +328,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, + return bch2_trans_do(c, NULL, &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC, bch2_xattr_set(&trans, inode->v.i_ino, &inode->ei_str_hash, name, value, size, -- cgit From 58e2388f9e11eb2dfb12d7d11a9a3559cd0e8945 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Dec 2019 23:39:28 -0500 Subject: bcachefs: Kill BTREE_INSERT_ATOMIC Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 1 - fs/bcachefs/alloc_background.c | 2 -- fs/bcachefs/btree_update.h | 10 +--------- fs/bcachefs/btree_update_leaf.c | 19 ++----------------- fs/bcachefs/ec.c | 3 --- fs/bcachefs/fs-io.c | 1 - fs/bcachefs/fs.c | 6 ------ fs/bcachefs/fsck.c | 8 -------- fs/bcachefs/inode.c | 1 - fs/bcachefs/io.c | 2 -- fs/bcachefs/migrate.c | 1 - fs/bcachefs/move.c | 1 - fs/bcachefs/recovery.c | 4 +--- fs/bcachefs/reflink.c | 3 +-- fs/bcachefs/xattr.c | 3 +-- 15 files changed, 6 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 4e631e04cf0c..5a8d8311c08d 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -326,7 +326,6 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); btree_err: if (ret == -EINTR) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7b9079a740ef..bd3e46d066bd 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -316,7 +316,6 @@ retry: bch2_trans_update(trans, iter, &a->k_i); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOMARK| flags); @@ -913,7 +912,6 @@ retry: */ ret = bch2_trans_commit(trans, NULL, invalidating_cached_data ? journal_seq : NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d72da179f866..aa87477b51e1 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -16,7 +16,6 @@ void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); enum { - __BTREE_INSERT_ATOMIC, __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, @@ -35,12 +34,6 @@ enum { __BCH_HASH_SET_MUST_REPLACE, }; -/* - * Don't drop/retake locks before doing btree update, instead return -EINTR if - * we had to drop locks for any reason - */ -#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) - /* * Don't drop locks _after_ successfully updating btree: */ @@ -101,8 +94,7 @@ int __bch2_trans_commit(struct btree_trans *); * This is main entry point for btree updates. * * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. + * -EINTR: locking changed, this function should be called again. 
* -EROFS: filesystem read only * -EIO: journal or btree node IO error */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1112bdb689dc..94c1e1e2118a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -298,8 +298,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && @@ -641,8 +639,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, /* * if the split succeeded without dropping locks the insert will - * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the - * caller peeked() and is overwriting won't have changed) + * still be atomic (what the caller peeked() and is overwriting + * won't have changed) */ #if 0 /* @@ -713,13 +711,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret2; } - /* - * BTREE_ITER_ATOMIC means we have to return -EINTR if we - * dropped locks: - */ - if (!(flags & BTREE_INSERT_ATOMIC)) - return 0; - trace_trans_restart_atomic(trans->ip); } @@ -756,9 +747,6 @@ int __bch2_trans_commit(struct btree_trans *trans) if (!trans->nr_updates) goto out_noupdates; - /* for the sake of sanity: */ - EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); @@ -795,8 +783,6 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_noupdates: - EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - trans_for_each_iter_all(trans, iter) iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; @@ -897,7 +883,6 @@ retry: bch2_trans_update(trans, iter, &delete); ret = bch2_trans_commit(trans, NULL, journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); if (ret) break; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ae07af49af02..a6bc9355c750 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -739,7 +739,6 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); err: if (ret == -EINTR) @@ -822,7 +821,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); if (ret == -EINTR) @@ -1235,7 +1233,6 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, bch2_trans_update(trans, iter, &new_key->k_i); return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL|flags); } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f766bbc35cee..15b0d20b2f81 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2558,7 +2558,6 @@ reassemble: ret = bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| commit_flags); bch2_disk_reservation_put(c, &disk_res); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c20eaa7418c2..3cada7cc354a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -143,7 +143,6 @@ retry: bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| 
BTREE_INSERT_NOFAIL); if (ret == -EINTR) @@ -279,7 +278,6 @@ retry: goto err_before_quota; ret = bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -409,7 +407,6 @@ static int __bch2_link(struct bch_fs *c, &dentry->d_name) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); } while (ret == -EINTR); @@ -466,7 +463,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) &inode_u, &dentry->d_name) ?: bch2_trans_commit(&trans, NULL, &dir->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); } while (ret == -EINTR); @@ -598,7 +594,6 @@ retry: mode) ?: bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (ret == -EINTR) goto retry; @@ -733,7 +728,6 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); btree_err: diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a0fdd2ba92f6..cd230dc10984 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -79,7 +79,6 @@ static int remove_dirent(struct btree_trans *trans, struct bkey_s_c_dirent dirent) { return __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, @@ -99,7 +98,6 @@ static int reattach_inode(struct bch_fs *c, name = (struct qstr) QSTR(name_buf); ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_LAZY_RW, bch2_link_trans(&trans, lostfound_inode->bi_inum, inum, &dir_u, &inode_u, &name)); @@ -199,7 +197,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, tmp, BCH_HASH_SET_MUST_CREATE) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); } @@ -213,7 +210,6 @@ static int fsck_hash_delete_at(struct btree_trans *trans, retry: ret = bch2_hash_delete_at(trans, desc, info, iter) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret == -EINTR) { @@ -389,7 +385,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, @@ -663,7 +658,6 @@ retry: n->v.d_type = mode_to_type(target.bi_mode); ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, @@ -808,7 +802,6 @@ create_lostfound: bch2_inode_init_early(c, lostfound_inode); ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_create_trans(&trans, @@ -1280,7 +1273,6 @@ static int check_inode(struct btree_trans *trans, bch2_inode_pack(&p, &u); ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e2407dcbcb35..77ac9ab7fc57 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -496,7 +496,6 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) bch2_trans_update(&trans, iter, &delete.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, - 
BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); } while (ret == -EINTR); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6e0444f3c4f9..4b54506b517c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -313,7 +313,6 @@ int bch2_extent_update(struct btree_trans *trans, ret = bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE); if (!ret && i_sectors_delta) *i_sectors_delta += delta; @@ -1740,7 +1739,6 @@ retry: bch2_trans_update(&trans, iter, new.k); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOWAIT); if (ret == -EINTR) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 4b59dcd04cce..db86420bd647 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -79,7 +79,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); /* diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0aebae33d299..261e465341cd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -153,7 +153,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| m->data_opts.btree_insert_flags); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a3ee2f474952..44a1dcdb135d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -309,14 +309,12 @@ retry: 0, -((s64) k->k.size), BCH_BUCKET_MARK_OVERWRITE) ?: bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOMARK_OVERWRITES| BTREE_INSERT_NO_CLEAR_REPLICAS); } else { ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| @@ -1009,7 +1007,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, BCACHEFS_ROOT_INO, &root_inode, &lostfound_inode, &lostfound, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a65ada691ba1..5cad39fe031f 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -288,8 +288,7 @@ err: inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, - BTREE_INSERT_ATOMIC); + bch2_trans_commit(&trans, NULL, journal_seq, 0); } } while (ret2 == -EINTR); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 6cef6c14fc89..806a638508a6 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -328,8 +328,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_trans_do(c, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC, + return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, bch2_xattr_set(&trans, inode->v.i_ino, &inode->ei_str_hash, name, value, size, -- cgit From 8b3bbe2c34759aad307ced27373405e346960f13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2019 18:03:53 -0500 Subject: bcachefs: Don't reexecute triggers when retrying 
transaction commit This was causing a bug with transaction iterators overflowing; now, if triggers have to be reexecuted we always return -EINTR and retry from the start of the transaction. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 3 -- fs/bcachefs/btree_update_leaf.c | 96 ++++++++++++++++++++--------------------- fs/bcachefs/recovery.c | 3 +- 4 files changed, 49 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5f0b55c98f86..98451b3dd1a5 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -281,6 +281,7 @@ struct btree_trans { struct disk_reservation *disk_res; unsigned flags; unsigned journal_u64s; + unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index aa87477b51e1..1534e937a95d 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -26,7 +26,6 @@ enum { __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, - __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -60,8 +59,6 @@ enum { /* Don't call mark new key at all: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) -#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) - #define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) /* Don't block on allocation failure (for new btree nodes: */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 94c1e1e2118a..09f5cd6493f4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -515,44 +515,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct btree_insert_entry *i; struct btree_iter *iter; - unsigned idx, u64s, journal_preres_u64s = 0; + unsigned idx; int ret; - /* - * note: running triggers will append more updates to the list of - * updates as we're walking it: - */ - trans_for_each_update(trans, i) { - /* we know trans->nounlock won't be set here: */ - if (unlikely(!(i->iter->locks_want < 1 - ? 
__bch2_btree_iter_upgrade(i->iter, 1) - : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { - trace_trans_restart_upgrade(trans->ip); - return -EINTR; - } - - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_trans_triggers(i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - return ret; - } - } - - u64s = jset_u64s(i->k->k.u64s); - if (0) - journal_preres_u64s += u64s; - trans->journal_u64s += u64s; - } + trans_for_each_update(trans, i) + BUG_ON(!btree_node_intent_locked(i->iter, 0)); ret = bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, journal_preres_u64s, + &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, - journal_preres_u64s); + trans->journal_preres_u64s); if (unlikely(ret)) return ret; @@ -740,8 +714,7 @@ int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; struct btree_iter *iter; - unsigned orig_nr_updates = trans->nr_updates; - unsigned orig_mem_top = trans->mem_top; + unsigned u64s; int ret = 0; if (!trans->nr_updates) @@ -752,26 +725,50 @@ int __bch2_trans_commit(struct btree_trans *trans) memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + trans->journal_u64s = 0; + trans->journal_preres_u64s = 0; + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; } + + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? 
__bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + goto out; + } + } + + u64s = jset_u64s(i->k->k.u64s); + if (0) + trans->journal_preres_u64s += u64s; + trans->journal_u64s += u64s; + } retry: memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - trans->journal_u64s = 0; ret = do_bch2_trans_commit(trans, &i); - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - /* make sure we didn't drop or screw up locks: */ bch2_btree_trans_verify_locks(trans); @@ -793,19 +790,20 @@ out_noupdates: trans->nr_updates = 0; trans->mem_top = 0; + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } + return ret; err: ret = bch2_trans_commit_error(trans, i, ret); - - /* can't loop if it was passed in and we changed it: */ - if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) - ret = -EINTR; if (ret) goto out; - /* free updates and memory used by triggers, they'll be reexecuted: */ - trans->nr_updates = orig_nr_updates; - trans->mem_top = orig_mem_top; goto retry; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 44a1dcdb135d..c366050d572c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -311,8 +311,7 @@ retry: bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOMARK_OVERWRITES| - BTREE_INSERT_NO_CLEAR_REPLICAS); + BTREE_INSERT_NOMARK_OVERWRITES); } else { ret = bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL| -- cgit From e731d466d2ba0276badf79e4d960bd0938d0dc89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Dec 2019 14:54:43 -0500 Subject: bcachefs: Don't export __bch2_fs_read_write BTREE_INSERT_LAZY_RW was added for this since this code was written; use it instead. 
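As an illustration, the caller in bch2_fs_initialize() changes roughly as follows (sketch of the before/after, see the diff below):

	/* before: flip the whole filesystem read-write up front */
	ret = __bch2_fs_read_write(c, true);
	...
	ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i,
				NULL, NULL, 0);

	/* after: let the insert itself go read-write lazily, only if needed */
	ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i,
				NULL, NULL, BTREE_INSERT_LAZY_RW);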
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 7 +------ fs/bcachefs/super.c | 2 +- fs/bcachefs/super.h | 1 - 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c366050d572c..9c90d2bbb7cc 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -986,11 +986,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); - err = "error going read write"; - ret = __bch2_fs_read_write(c, true); - if (ret) - goto err; - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; @@ -999,7 +994,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, 0); + NULL, NULL, BTREE_INSERT_LAZY_RW); if (ret) goto err; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2e23cc3dbf2f..cd02e5a5f305 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -372,7 +372,7 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return 0; } -int __bch2_fs_read_write(struct bch_fs *c, bool early) +static int __bch2_fs_read_write(struct bch_fs *c, bool early) { struct bch_dev *ca; unsigned i; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 1b97c6115535..b948cb0428c7 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -218,7 +218,6 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); -int __bch2_fs_read_write(struct bch_fs *, bool); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); -- cgit From 46e4bb1c3782484bda814858c34f2cff942f1271 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Dec 2019 13:44:03 -0500 Subject: bcachefs: Fix a use after free op->end_io may free the op struct Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4b54506b517c..dd8f356f3ef0 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -516,12 +516,13 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /** @@ -1234,12 +1235,14 @@ void bch2_write(struct closure *cl) err: if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); - if (op->end_io) + + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /* Cache promotion on read */ -- cgit From 27b3e52388e1e6b2babb71ec17efa029139e1511 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Dec 2019 20:42:06 -0500 Subject: bcachefs: Add an assertion to track down a heisenbug Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 
9e2d72bf06b2..c9be0d110c64 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1657,6 +1657,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, size_t sib_u64s; int ret = 0; + BUG_ON(!btree_node_locked(iter, level)); + closure_init_stack(&cl); retry: BUG_ON(!btree_node_locked(iter, level)); -- cgit From 1c3ff72c0fa94651a226d3351d9df89d5eafd2d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2019 20:17:06 -0500 Subject: bcachefs: Convert some enums to x-macros Helps for preventing things from getting out of sync. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/bcachefs_format.h | 147 ++++++++++++++++++++---------------- fs/bcachefs/checksum.h | 4 +- fs/bcachefs/compress.c | 38 +++++----- fs/bcachefs/extents.c | 6 +- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/io.c | 6 +- fs/bcachefs/journal_seq_blacklist.c | 4 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/opts.c | 20 +++-- fs/bcachefs/opts.h | 17 +++-- fs/bcachefs/recovery.c | 8 +- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/str_hash.h | 2 +- fs/bcachefs/sysfs.c | 2 +- 15 files changed, 145 insertions(+), 119 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3fa053531344..0d4a8b75ff42 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -718,7 +718,7 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_NR]; + mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; mempool_t decompress_workspace; ZSTD_parameters zstd_params; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9b8fc265a5c0..535ba2788315 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -436,47 +436,6 @@ struct bch_csum { __le64 hi; } __attribute__((packed, aligned(8))); -enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_NR = 7, -}; - -static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -}; - -static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -{ - switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: - return true; - default: - return false; - } -} - -enum bch_compression_type { - BCH_COMPRESSION_NONE = 0, - BCH_COMPRESSION_LZ4_OLD = 1, - BCH_COMPRESSION_GZIP = 2, - BCH_COMPRESSION_LZ4 = 3, - BCH_COMPRESSION_ZSTD = 4, - BCH_COMPRESSION_NR = 5, -}; - #define BCH_EXTENT_ENTRY_TYPES() \ x(ptr, 0) \ x(crc32, 1) \ @@ -1320,17 +1279,29 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -/* Features: */ -enum bch_sb_features { - BCH_FEATURE_LZ4 = 0, - BCH_FEATURE_GZIP = 1, - BCH_FEATURE_ZSTD = 2, - BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ - BCH_FEATURE_EC = 4, - BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, - BCH_FEATURE_REFLINK = 6, - BCH_FEATURE_NEW_SIPHASH = 7, - BCH_FEATURE_INLINE_DATA = 8, +/* + * Features: + * + * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist + * reflink: gates KEY_TYPE_reflink 
+ * inline_data: gates KEY_TYPE_inline_data + * new_siphash: gates BCH_STR_HASH_SIPHASH + */ +#define BCH_SB_FEATURES() \ + x(lz4, 0) \ + x(gzip, 1) \ + x(zstd, 2) \ + x(atomic_nlink, 3) \ + x(ec, 4) \ + x(journal_seq_blacklist_v3, 5) \ + x(reflink, 6) \ + x(new_siphash, 7) \ + x(inline_data, 8) + +enum bch_sb_feature { +#define x(f, n) BCH_FEATURE_##f, + BCH_SB_FEATURES() +#undef x BCH_FEATURE_NR, }; @@ -1350,13 +1321,6 @@ enum bch_error_actions { BCH_NR_ERROR_ACTIONS = 3, }; -enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, -}; - enum bch_str_hash_type { BCH_STR_HASH_CRC32C = 0, BCH_STR_HASH_CRC64 = 1, @@ -1372,15 +1336,68 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR = 3, }; +enum bch_csum_type { + BCH_CSUM_NONE = 0, + BCH_CSUM_CRC32C_NONZERO = 1, + BCH_CSUM_CRC64_NONZERO = 2, + BCH_CSUM_CHACHA20_POLY1305_80 = 3, + BCH_CSUM_CHACHA20_POLY1305_128 = 4, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, + BCH_CSUM_NR = 7, +}; + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C_NONZERO] = 4, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64_NONZERO] = 8, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + +static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +{ + switch (type) { + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: + return true; + default: + return false; + } +} + +enum bch_csum_opts { + BCH_CSUM_OPT_NONE = 0, + BCH_CSUM_OPT_CRC32C = 1, + BCH_CSUM_OPT_CRC64 = 2, + BCH_CSUM_OPT_NR = 3, +}; + #define BCH_COMPRESSION_TYPES() \ - x(NONE) \ - x(LZ4) \ - x(GZIP) \ - x(ZSTD) + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) -enum bch_compression_opts { -#define x(t) BCH_COMPRESSION_OPT_##t, +enum bch_compression_type { +#define x(t, n) BCH_COMPRESSION_TYPE_##t, BCH_COMPRESSION_TYPES() +#undef x + BCH_COMPRESSION_TYPE_NR +}; + +#define BCH_COMPRESSION_OPTS() \ + x(none, 0) \ + x(lz4, 1) \ + x(gzip, 2) \ + x(zstd, 3) + +enum bch_compression_opts { +#define x(t, n) BCH_COMPRESSION_OPT_##t, + BCH_COMPRESSION_OPTS() #undef x BCH_COMPRESSION_OPT_NR }; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index b84e81bac8ff..ca9e45906dc8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -108,8 +108,8 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) } static const unsigned bch2_compression_opt_to_type[] = { -#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t, - BCH_COMPRESSION_TYPES() +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() #undef x }; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index d350d917a8d4..091958d1ea04 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -158,14 +158,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { - case BCH_COMPRESSION_LZ4_OLD: - case BCH_COMPRESSION_LZ4: + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: ret = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret != dst_len) goto err; break; - case BCH_COMPRESSION_GZIP: { + case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src_data.b, .avail_in = src_len, @@ -185,7 +185,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, goto err; break; } - case 
BCH_COMPRESSION_ZSTD: { + case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t len; @@ -290,10 +290,10 @@ static int attempt_compress(struct bch_fs *c, void *workspace, void *dst, size_t dst_len, void *src, size_t src_len, - unsigned compression_type) + enum bch_compression_type compression_type) { switch (compression_type) { - case BCH_COMPRESSION_LZ4: { + case BCH_COMPRESSION_TYPE_lz4: { int len = src_len; int ret = LZ4_compress_destSize( src, dst, @@ -305,7 +305,7 @@ static int attempt_compress(struct bch_fs *c, return ret; } - case BCH_COMPRESSION_GZIP: { + case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src, .avail_in = src_len, @@ -326,7 +326,7 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } - case BCH_COMPRESSION_ZSTD: { + case BCH_COMPRESSION_TYPE_zstd: { ZSTD_CCtx *ctx = zstd_init_cctx(workspace, zstd_cctx_workspace_bound(&c->zstd_params.cParams)); @@ -348,14 +348,14 @@ static int attempt_compress(struct bch_fs *c, static unsigned __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - unsigned compression_type) + enum bch_compression_type compression_type) { struct bbuf src_data = { NULL }, dst_data = { NULL }; void *workspace; unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_NR); + BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); /* If it's only one block, don't bother trying to compress: */ @@ -452,8 +452,8 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - if (compression_type == BCH_COMPRESSION_LZ4_OLD) - compression_type = BCH_COMPRESSION_LZ4; + if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) + compression_type = BCH_COMPRESSION_TYPE_lz4; compression_type = __bio_compress(c, dst, dst_len, src, src_len, compression_type); @@ -465,15 +465,15 @@ unsigned bch2_bio_compress(struct bch_fs *c, static int __bch2_fs_compress_init(struct bch_fs *, u64); -#define BCH_FEATURE_NONE 0 +#define BCH_FEATURE_none 0 static const unsigned bch2_compression_opt_to_feature[] = { -#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, - BCH_COMPRESSION_TYPES() +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, + BCH_COMPRESSION_OPTS() #undef x }; -#undef BCH_FEATURE_NONE +#undef BCH_FEATURE_none static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) { @@ -537,11 +537,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t compress_workspace; size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 }, - { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP, + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, - { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD, + { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, zstd_cctx_workspace_bound(¶ms.cParams), zstd_dctx_workspace_bound() }, }, *i; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b85056440ac3..8322b043bdff 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -613,7 +613,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_NONE; + 
p.crc.compression_type == BCH_COMPRESSION_TYPE_none; } return ret; @@ -628,7 +628,7 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) + p.crc.compression_type != BCH_COMPRESSION_TYPE_none) ret += p.crc.compressed_size; return ret; @@ -1053,7 +1053,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (!bch2_checksum_type_valid(c, crc.csum_type)) return "invalid checksum type"; - if (crc.compression_type >= BCH_COMPRESSION_NR) + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) return "invalid compression type"; if (bch2_csum_type_is_encryption(crc.csum_type)) { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cd230dc10984..e25f064706ad 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1124,7 +1124,7 @@ static int check_inode_nlink(struct bch_fs *c, if (!link->count && !(u->bi_flags & BCH_INODE_UNLINKED) && - (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", u->bi_inum, mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) @@ -1159,7 +1159,7 @@ static int check_inode_nlink(struct bch_fs *c, } if (i_nlink != real_i_nlink && - (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "inode %llu has wrong i_nlink " "(type %u i_nlink %u, should be %u)", u->bi_inum, mode_to_type(u->bi_mode), diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index dd8f356f3ef0..ba79b35a130f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1141,7 +1141,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; - bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), @@ -1788,7 +1788,7 @@ static void __bch2_read_endio(struct work_struct *work) crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_NONE) { + if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1996,7 +1996,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || (pick.crc.csum_type != BCH_CSUM_NONE && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 787d9f7638d0..a21de0088753 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -121,7 +121,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl->start[nr].end = cpu_to_le64(end); out_write_sb: c->disk_sb.sb->features[0] |= - 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; + 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ret = bch2_write_super(c); out: @@ -309,7 +309,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) if (!new_nr) c->disk_sb.sb->features[0] &= - ~(1ULL << 
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); bch2_write_super(c); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 261e465341cd..cb7bb751b7b5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -271,7 +271,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE && + p.crc.compression_type != BCH_COMPRESSION_TYPE_none && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) compressed_sectors += p.crc.compressed_size; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index cbacd2f36799..94d6c044a27d 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -16,18 +16,24 @@ const char * const bch2_error_actions[] = { NULL }; -const char * const bch2_csum_types[] = { +const char * const bch2_sb_features[] = { +#define x(f, n) #f, + BCH_SB_FEATURES() +#undef x + NULL +}; + +const char * const bch2_csum_opts[] = { "none", "crc32c", "crc64", NULL }; -const char * const bch2_compression_types[] = { - "none", - "lz4", - "gzip", - "zstd", +const char * const bch2_compression_opts[] = { +#define x(t, n) #t, + BCH_COMPRESSION_OPTS() +#undef x NULL }; @@ -300,7 +306,7 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) break; case Opt_erasure_code: if (v) - bch2_check_set_feature(c, BCH_FEATURE_EC); + bch2_check_set_feature(c, BCH_FEATURE_ec); break; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 92a9b7e0f47f..59c7b3685745 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -9,8 +9,9 @@ #include "bcachefs_format.h" extern const char * const bch2_error_actions[]; -extern const char * const bch2_csum_types[]; -extern const char * const bch2_compression_types[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; @@ -112,23 +113,23 @@ enum opt_type { "#", NULL) \ x(metadata_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ + OPT_STR(bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_csum_types), \ + OPT_STR(bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ NULL, NULL) \ x(compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9c90d2bbb7cc..97b367252e82 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -866,7 +866,7 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->sb.clean) { - if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { bch_info(c, "checking inode link counts"); err = 
"error in recovery"; ret = bch2_fsck_inode_nlink(c); @@ -907,6 +907,7 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; write_sb = true; } @@ -917,7 +918,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && !test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); write_sb = true; } @@ -1024,7 +1025,8 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_lock(&c->sb_lock); c->disk_sb.sb->version = c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 5cad39fe031f..2bf003ba3bd8 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -171,7 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (!percpu_ref_tryget(&c->writes)) return -EROFS; - bch2_check_set_feature(c, BCH_FEATURE_REFLINK); + bch2_check_set_feature(c, BCH_FEATURE_reflink); dst_end.offset += remap_sectors; src_end.offset += remap_sectors; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 7be4a8e50eaa..3870df2d58ce 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -23,7 +23,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) case BCH_STR_HASH_OPT_CRC64: return BCH_STR_HASH_CRC64; case BCH_STR_HASH_OPT_SIPHASH: - return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH) + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ? BCH_STR_HASH_SIPHASH : BCH_STR_HASH_SIPHASH_OLD; default: diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 767fd7bed2d0..d3713db317ce 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) struct extent_ptr_decoded p; extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_NONE) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { -- cgit From bcd6f3e06fe4f039e1526a0ff5bc3ebbc2405e10 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2019 17:26:04 -0500 Subject: bcachefs: Use KEY_TYPE_deleted whitouts for extents Previously, partial overwrites of existing extents were handled implicitly by the btree code; when reading in a btree node, we'd do a mergesort of the different bsets and detect and fix partially overlapping extents during that mergesort. That approach won't work with snapshots: this changes extents to work like regular keys as far as the btree code is concerned, where a 0 size KEY_TYPE_deleted whiteout will completely overwrite an existing extent. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 8 +- fs/bcachefs/bkey_sort.c | 232 ++++++++++++++++++------------------ fs/bcachefs/btree_io.c | 36 ++++-- fs/bcachefs/btree_types.h | 2 + fs/bcachefs/btree_update_interior.c | 7 ++ fs/bcachefs/btree_update_leaf.c | 2 + fs/bcachefs/extent_update.c | 88 +++++++++++++- fs/bcachefs/recovery.c | 2 + 8 files changed, 244 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 535ba2788315..0a623ed3caa6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1286,6 +1286,7 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); * reflink: gates KEY_TYPE_reflink * inline_data: gates KEY_TYPE_inline_data * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE */ #define BCH_SB_FEATURES() \ x(lz4, 0) \ @@ -1296,7 +1297,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(journal_seq_blacklist_v3, 5) \ x(reflink, 6) \ x(new_siphash, 7) \ - x(inline_data, 8) + x(inline_data, 8) \ + x(new_extent_overwrite, 9) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1620,7 +1622,9 @@ struct btree_node { LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -/* 8-32 unused */ +LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, + struct btree_node, flags, 8, 9); +/* 9-32 unused */ LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); struct btree_node_entry { diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 23b51ef57303..18f842012f05 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -130,24 +130,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. 
- */ -static inline int extent_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), - bkey_start_pos(&ur)) ?: - cmp_int((unsigned long) r, (unsigned long) l); -} - static void extent_sort_advance_prev(struct bkey_format *f, struct btree_nr_keys *nr, struct bkey_packed *start, @@ -188,102 +170,6 @@ static void extent_sort_append(struct bch_fs *c, bkey_reassemble((void *) *prev, k.s_c); } -struct btree_nr_keys -bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct btree *b = iter->b; - struct bkey_format *f = &b->format; - struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; - struct bkey_packed *prev = NULL; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - struct bkey_on_stack split; - - memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&split); - - sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); - - while (!sort_iter_end(iter)) { - l = __bkey_disassemble(b, _l->k, &l_unpacked); - - if (iter->used == 1) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); - continue; - } - - r = __bkey_disassemble(b, _r->k, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. 
- */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); - } else { - bch2_cut_front_s(l.k->p, r); - extent_save(b, _r->k, r.k); - __sort_iter_sift(iter, 1, - extent_sort_fix_overlapping_cmp); - } - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_on_stack_reassemble(&split, c, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), split.k); - - bch2_cut_front_s(r.k->p, l); - extent_save(b, _l->k, l.k); - - __sort_iter_sift(iter, 0, - extent_sort_fix_overlapping_cmp); - - extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(split.k)); - } else { - bch2_cut_back_s(bkey_start_pos(r.k), l); - extent_save(b, _l->k, l.k); - } - } - - extent_sort_advance_prev(f, &nr, dst->start, &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); - - bkey_on_stack_exit(&split, c); - return nr; -} - /* Sort + repack in a new format: */ struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, @@ -354,7 +240,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *r) { return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -399,6 +285,122 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, return (u64 *) out - (u64 *) dst; } +/* Compat code for btree_node_old_extent_overwrite: */ + +/* + * If keys compare equal, compare by pointer order: + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +static inline int extent_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), + bkey_start_pos(&ur)) ?: + cmp_int((unsigned long) r, (unsigned long) l); +} + +struct btree_nr_keys +bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) +{ + struct btree *b = iter->b; + struct bkey_format *f = &b->format; + struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; + struct bkey_packed *prev = NULL; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + struct bkey_on_stack split; + + memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); + + sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + + while (!sort_iter_end(iter)) { + l = __bkey_disassemble(b, _l->k, &l_unpacked); + + if (iter->used == 1) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); + continue; + } + + r = __bkey_disassemble(b, _r->k, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so they + * don't overlap. 
comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. + */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); + } else { + bch2_cut_front_s(l.k->p, r); + extent_save(b, _r->k, r.k); + __sort_iter_sift(iter, 1, + extent_sort_fix_overlapping_cmp); + } + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); + + bch2_cut_front_s(r.k->p, l); + extent_save(b, _l->k, l.k); + + __sort_iter_sift(iter, 0, + extent_sort_fix_overlapping_cmp); + + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(split.k)); + } else { + bch2_cut_back_s(bkey_start_pos(r.k), l); + extent_save(b, _l->k, l.k); + } + } + + extent_sort_advance_prev(f, &nr, dst->start, &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + + bkey_on_stack_exit(&split, c); + return nr; +} + static inline int sort_extents_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 209e20fbcd70..c5b5143ada05 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -22,7 +22,8 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *start, - struct bkey_packed *end) + struct bkey_packed *end, + bool extents) { #ifdef CONFIG_BCACHEFS_DEBUG struct bkey_packed *k, *p; @@ -36,7 +37,7 @@ static void verify_no_dups(struct btree *b, struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(btree_node_is_extents(b) + BUG_ON(extents ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); @@ -147,7 +148,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) } verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), + btree_node_old_extent_overwrite(b)); memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); @@ -297,7 +299,8 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, verify_no_dups(b, unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_end(c, b), + true); btree_bounce_free(c, order, used_mempool, whiteouts); @@ -377,7 +380,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, enum compact_mode mode) { - return !btree_node_is_extents(b) + return !btree_node_old_extent_overwrite(b) ? bch2_drop_whiteouts(b, mode) : bch2_compact_extent_whiteouts(c, b, mode); } @@ -417,10 +420,10 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - if (btree_node_is_extents(b)) + if (btree_node_old_extent_overwrite(b)) filter_whiteouts = bset_written(b, start_bset); - u64s = (btree_node_is_extents(b) + u64s = (btree_node_old_extent_overwrite(b) ? 
bch2_sort_extents : bch2_sort_keys)(out->keys.start, &sort_iter, @@ -706,7 +709,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, bool have_retry) { struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; + struct bpos prev_pos = POS_MIN; + struct bpos prev_data = POS_MIN; bool seen_non_whiteout = false; unsigned version; const char *err; @@ -839,7 +843,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { + } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || + bkey_cmp(prev_pos, u.k->p) > 0) { btree_err(BTREE_ERR_FATAL, c, b, i, "keys out of order: %llu:%llu > %llu:%llu", prev_pos.inode, @@ -849,7 +854,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX: repair this */ } + if (!bkey_deleted(u.k)) + prev_data = u.k->p; prev_pos = u.k->p; + prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } @@ -908,6 +916,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_encrypt(c, i, b->written << 9); + if (btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + set_btree_node_old_extent_overwrite(b); + sectors = vstruct_sectors(b->data, c->block_bits); btree_node_set_format(b, b->data->format); @@ -971,7 +983,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_bset(b, b->set, &b->data->keys); - b->nr = (btree_node_is_extents(b) + b->nr = (btree_node_old_extent_overwrite(b) ? bch2_extent_sort_fix_overlapping : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); @@ -1486,7 +1498,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - if (!btree_node_is_extents(b)) { + if (!btree_node_old_extent_overwrite(b)) { sort_iter_add(&sort_iter, unwritten_whiteouts_start(c, b), unwritten_whiteouts_end(c, b)); @@ -1501,7 +1513,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->whiteout_u64s = 0; - u64s = btree_node_is_extents(b) + u64s = btree_node_old_extent_overwrite(b) ? 
bch2_sort_extents(vstruct_last(i), &sort_iter, false) : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 98451b3dd1a5..cc04cdbaf432 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -311,6 +311,7 @@ enum btree_flags { BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, + BTREE_NODE_old_extent_overwrite, }; BTREE_FLAG(read_in_flight); @@ -324,6 +325,7 @@ BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); +BTREE_FLAG(old_extent_overwrite); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c9be0d110c64..870eb0938c22 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -374,6 +374,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev SET_BTREE_NODE_LEVEL(b->data, level); b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; + if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + if (btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + set_btree_node_old_extent_overwrite(b); + bch2_btree_build_aux_trees(b); btree_node_will_make_reachable(as, b); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 09f5cd6493f4..78f5674394dc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -267,6 +267,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + insert->k->k.needs_whiteout = false; + if (!btree_node_is_extents(b)) bch2_insert_fixup_key(trans, insert); else diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index e021e1623a91..d2f1414f28e2 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -186,11 +186,26 @@ bch2_extent_can_insert(struct btree_trans *trans, overlap = bch2_extent_overlap(&insert->k->k, k.k); + /* + * If we're overwriting an existing extent, we may need to emit + * a whiteout - unless we're inserting a new extent at the same + * position: + */ + if (k.k->needs_whiteout && + (!bkey_whiteout(&insert->k->k) || + bkey_cmp(k.k->p, insert->k->k.p))) + *u64s += BKEY_U64s; + + /* + * If we're partially overwriting an existing extent which has + * been written out to disk, we'll need to emit a new version of + * that extent: + */ if (bkey_written(l->b, _k) && overlap != BCH_EXTENT_OVERLAP_ALL) *u64s += _k->u64s; - /* account for having to split existing extent: */ + /* And we may be splitting an existing extent: */ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) *u64s += _k->u64s; @@ -286,6 +301,23 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); } +static void pack_push_whiteout(struct bch_fs *c, struct btree *b, + struct bpos pos) +{ + struct bkey_packed k; + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = pos; + bkey_copy(&k, &tmp); + } + + k.needs_whiteout = true; + push_whiteout(c, b, &k); +} + static void extent_drop(struct bch_fs *c, struct btree_iter *iter, struct bkey_packed *_k, struct bkey_s k) @@ -297,7 +329,12 @@ extent_drop(struct bch_fs *c, struct btree_iter *iter, k.k->size = 0; k.k->type = KEY_TYPE_deleted; - 
k.k->needs_whiteout = false; + + if (!btree_node_old_extent_overwrite(l->b) && + k.k->needs_whiteout) { + pack_push_whiteout(c, l->b, k.k->p); + k.k->needs_whiteout = false; + } if (_k >= btree_bset_last(l->b)->start) { unsigned u64s = _k->u64s; @@ -322,12 +359,29 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bkey_on_stack_init(&tmp); bkey_on_stack_init(&split); + if (!btree_node_old_extent_overwrite(l->b)) { + if (!bkey_whiteout(&insert->k) && + !bkey_cmp(k.k->p, insert->k.p)) { + insert->k.needs_whiteout = k.k->needs_whiteout; + k.k->needs_whiteout = false; + } + } else { + insert->k.needs_whiteout |= k.k->needs_whiteout; + } + switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: if (bkey_written(l->b, _k)) { bkey_on_stack_reassemble(&tmp, c, k.s_c); bch2_cut_front(insert->k.p, tmp.k); + /* + * needs_whiteout was propagated to new version of @k, + * @tmp: + */ + if (!btree_node_old_extent_overwrite(l->b)) + k.k->needs_whiteout = false; + extent_drop(c, iter, _k, k); extent_bset_insert(c, iter, tmp.k); } else { @@ -348,9 +402,26 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bkey_on_stack_reassemble(&tmp, c, k.s_c); bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); + /* + * @tmp has different position than @k, needs_whiteout + * should not be propagated: + */ + if (!btree_node_old_extent_overwrite(l->b)) + tmp.k->k.needs_whiteout = false; + extent_drop(c, iter, _k, k); extent_bset_insert(c, iter, tmp.k); } else { + /* + * position of @k is changing, emit a whiteout if + * needs_whiteout is set: + */ + if (!btree_node_old_extent_overwrite(l->b) && + k.k->needs_whiteout) { + pack_push_whiteout(c, l->b, k.k->p); + k.k->needs_whiteout = false; + } + btree_keys_account_val_delta(l->b, _k, bch2_cut_back_s(bkey_start_pos(&insert->k), k)); extent_save(l->b, _k, k.k); @@ -367,10 +438,17 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bkey_on_stack_reassemble(&split, c, k.s_c); bch2_cut_back(bkey_start_pos(&insert->k), split.k); + if (!btree_node_old_extent_overwrite(l->b)) + split.k->k.needs_whiteout = false; + + /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ if (bkey_written(l->b, _k)) { bkey_on_stack_reassemble(&tmp, c, k.s_c); bch2_cut_front(insert->k.p, tmp.k); + if (!btree_node_old_extent_overwrite(l->b)) + k.k->needs_whiteout = false; + extent_drop(c, iter, _k, k); extent_bset_insert(c, iter, tmp.k); } else { @@ -462,7 +540,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, bch2_cut_front(cur_end, insert); bch2_btree_iter_set_pos_same_leaf(iter, cur_end); } else { - insert->k.needs_whiteout |= k.k->needs_whiteout; extent_squash(c, iter, insert, _k, k, overlap); } @@ -480,7 +557,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (insert->k.type == KEY_TYPE_deleted) insert->k.type = KEY_TYPE_discard; - extent_bset_insert(c, iter, insert); + if (!bkey_whiteout(&insert->k) || + btree_node_old_extent_overwrite(l->b)) + extent_bset_insert(c, iter, insert); + bch2_btree_journal_key(trans, iter, insert); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 97b367252e82..c7367a679b22 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -908,6 +908,7 @@ int bch2_fs_recovery(struct bch_fs *c) le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; write_sb = true; } @@ -1027,6 +1028,7 @@ int 
bch2_fs_initialize(struct bch_fs *c) le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -- cgit From f21539a56d9753e702d4aa8b71ac1dd6de85f570 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2019 12:43:19 -0500 Subject: bcachefs: Use bch2_trans_reset in bch2_trans_commit() Clean up a bit of duplicated code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++++- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_update_leaf.c | 19 +------------------ 3 files changed, 12 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 06a087b91e33..c002ac958899 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2104,7 +2104,16 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) if (flags & TRANS_RESET_MEM) trans->mem_top = 0; - bch2_btree_iter_traverse_all(trans); + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } + + if (!(flags & TRANS_RESET_NOTRAVERSE)) + bch2_btree_iter_traverse_all(trans); } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2bbf714c9698..86347bae9795 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -291,6 +291,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, #define TRANS_RESET_ITERS (1 << 0) #define TRANS_RESET_MEM (1 << 1) +#define TRANS_RESET_NOTRAVERSE (1 << 2) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 78f5674394dc..8d70e22decc7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -715,7 +715,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; - struct btree_iter *iter; unsigned u64s; int ret = 0; @@ -782,23 +781,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_noupdates: - trans_for_each_iter_all(trans, iter) - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - - if (!ret) { - bch2_trans_unlink_iters(trans); - trans->iters_touched = 0; - } - trans->nr_updates = 0; - trans->mem_top = 0; - - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } + bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); return ret; err: -- cgit From 54e86b581306295444ba149bf20106ce518b425f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2019 13:08:26 -0500 Subject: bcachefs: Make btree_insert_entry more private to update path This should be private to btree_update_leaf.c, and we might end up removing it. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++--- fs/bcachefs/btree_types.h | 4 +-- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_leaf.c | 73 +++++++++++++++++++++-------------------- fs/bcachefs/buckets.c | 12 +++---- fs/bcachefs/buckets.h | 4 +-- fs/bcachefs/extent_update.c | 20 +++++------ fs/bcachefs/extent_update.h | 7 ++-- fs/bcachefs/extents.h | 1 - 9 files changed, 67 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c002ac958899..0cd1b84c02ba 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1810,8 +1810,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, bch2_trans_unlock(trans); iters_bytes = sizeof(struct btree_iter) * new_size; - updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); - sorted_bytes = sizeof(u8) * (new_size + 4); + updates_bytes = sizeof(struct btree_insert_entry) * new_size; + sorted_bytes = sizeof(u8) * new_size; new_iters = kmalloc(iters_bytes + updates_bytes + @@ -2164,6 +2164,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + - sizeof(struct btree_insert_entry) * (nr + 4) + - sizeof(u8) * (nr + 4)); + sizeof(struct btree_insert_entry) * nr + + sizeof(u8) * nr); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index cc04cdbaf432..6d95a5674e97 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -285,8 +285,8 @@ struct btree_trans { struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[6]; - u8 updates_sorted_onstack[6]; + struct btree_insert_entry updates_onstack[2]; + u8 updates_sorted_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1534e937a95d..7f61351aed71 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -111,7 +111,7 @@ static inline void bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k) { - EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); + EBUG_ON(trans->nr_updates >= trans->nr_iters); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8d70e22decc7..abcfd42fc24f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -239,40 +239,39 @@ void bch2_btree_journal_key(struct btree_trans *trans, } static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { - struct btree_iter *iter = insert->iter; struct btree_iter_level *l = &iter->l[0]; EBUG_ON(iter->level); - EBUG_ON(insert->k->k.u64s > + EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k))) - bch2_btree_journal_key(trans, iter, insert->k); + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) + bch2_btree_journal_key(trans, iter, insert); } /** * btree_insert_key - insert a key one key into a leaf node */ static void btree_insert_key_leaf(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct bset_tree *t = 
bset_tree_last(b); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - insert->k->k.needs_whiteout = false; + insert->k.needs_whiteout = false; if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, insert); + bch2_insert_fixup_key(trans, iter, insert); else - bch2_insert_fixup_extent(trans, insert); + bch2_insert_fixup_extent(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -286,24 +285,25 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_maybe_compact_whiteouts(c, b)) bch2_btree_iter_reinit_node(iter, b); - trace_btree_insert_key(c, b, insert->k); + trace_btree_insert_key(c, b, insert); } /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); + BUG_ON(iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); + EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); + !bkey_deleted(&insert->k) && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); } static noinline int @@ -344,11 +344,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, unsigned *u64s) { struct bch_fs *c = trans->c; - struct btree *b = insert->iter->l[0].b; + struct btree *b = iter->l[0].b; static enum btree_insert_ret ret; if (unlikely(btree_node_fake(b))) @@ -356,7 +357,7 @@ btree_key_can_insert(struct btree_trans *trans, ret = !btree_node_is_extents(b) ? 
BTREE_INSERT_OK - : bch2_extent_can_insert(trans, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert, u64s); if (ret) return ret; @@ -367,21 +368,22 @@ btree_key_can_insert(struct btree_trans *trans, } static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { - btree_insert_key_leaf(trans, insert); + btree_insert_key_leaf(trans, iter, insert); } -static inline bool update_has_trans_triggers(struct btree_insert_entry *i) +static inline bool iter_has_trans_triggers(struct btree_iter *iter) { - return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); } -static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) +static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) { return (BTREE_NODE_TYPE_HAS_TRIGGERS & ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & - (1U << i->iter->btree_id); + (1U << iter->btree_id); } static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) @@ -402,7 +404,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) trans_for_each_update(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, + bch2_mark_update(trans, i->iter, i->k, NULL, mark_flags|BCH_BUCKET_MARK_GC); } @@ -439,7 +441,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s = 0; u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); + ret = btree_key_can_insert(trans, i->iter, i->k, &u64s); if (ret) { *stopped_at = i; return ret; @@ -489,8 +491,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_nontrans_triggers(i)) - bch2_mark_update(trans, i, &fs_usage->u, mark_flags); + iter_has_nontrans_triggers(i->iter)) + bch2_mark_update(trans, i->iter, i->k, + &fs_usage->u, mark_flags); if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); @@ -499,7 +502,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) - do_btree_insert_one(trans, i); + do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { bch2_fs_usage_scratch_put(c, fs_usage); @@ -549,7 +552,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); + btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); /* @@ -751,7 +754,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_trans_triggers(i)) { + iter_has_trans_triggers(i->iter)) { ret = bch2_trans_mark_update(trans, i->iter, i->k); if (unlikely(ret)) { if (ret == -EINTR) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f837cdda9433..b516b636fbf2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1300,12 +1300,12 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, } int bch2_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, struct bch_fs_usage *fs_usage, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; @@ -1314,8 
+1314,8 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - 0, insert->k->k.size, + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, insert->k.size, fs_usage, trans->journal_res.seq, BCH_BUCKET_MARK_INSERT|flags); @@ -1328,7 +1328,7 @@ int bch2_mark_update(struct btree_trans *trans, */ if ((iter->btree_id == BTREE_ID_ALLOC || iter->btree_id == BTREE_ID_EC) && - !bkey_deleted(&insert->k->k)) + !bkey_deleted(&insert->k)) return 0; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, @@ -1336,7 +1336,7 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - ret = bch2_mark_overwrite(trans, iter, k, insert->k, + ret = bch2_mark_overwrite(trans, iter, k, insert, fs_usage, flags); if (ret <= 0) break; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 336729f763e1..7b1bbe7c9316 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -276,8 +276,8 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, struct bch_fs_usage *, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); int bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index d2f1414f28e2..846d77dc2530 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -166,10 +166,11 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, unsigned *u64s) { - struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *_k; struct bkey unpacked; @@ -179,12 +180,12 @@ bch2_extent_can_insert(struct btree_trans *trans, KEY_TYPE_discard))) { struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k->k, k.k); + bch2_extent_overlap(&insert->k, k.k); - if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - overlap = bch2_extent_overlap(&insert->k->k, k.k); + overlap = bch2_extent_overlap(&insert->k, k.k); /* * If we're overwriting an existing extent, we may need to emit @@ -192,8 +193,8 @@ bch2_extent_can_insert(struct btree_trans *trans, * position: */ if (k.k->needs_whiteout && - (!bkey_whiteout(&insert->k->k) || - bkey_cmp(k.k->p, insert->k->k.p))) + (!bkey_whiteout(&insert->k) || + bkey_cmp(k.k->p, insert->k.p))) *u64s += BKEY_U64s; /* @@ -507,11 +508,10 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, * key insertion needs to continue/be retried. 
*/ void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert_entry) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert_entry->iter; - struct bkey_i *insert = insert_entry->k; struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; bool do_update = !bkey_whiteout(&insert->k); diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 89d18e4b6758..e9dc8091ba3f 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -10,9 +10,10 @@ int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, - unsigned *); +bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, + struct bkey_i *, unsigned *); void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_insert_entry *); + struct btree_iter *, + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1140d01a42ab..7c5a41e6d79d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -struct btree_insert_entry; /* extent entries: */ -- cgit From 2d594dfb5357ee133bd4cb04512c2dea65ec3104 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2019 16:17:42 -0500 Subject: bcachefs: Split out btree_trigger_flags The trigger flags really belong with individual btree_insert_entries, not the transaction commit flags - this splits out those flags and unifies them with the BCH_BUCKET_MARK flags. Todo - split out btree_trigger.c from buckets.c Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc_background.c | 22 +++++------ fs/bcachefs/btree_gc.c | 18 ++++----- fs/bcachefs/btree_types.h | 27 +++++++++++++ fs/bcachefs/btree_update.h | 19 ++------- fs/bcachefs/btree_update_interior.c | 28 +++++++------- fs/bcachefs/btree_update_leaf.c | 28 +++++--------- fs/bcachefs/buckets.c | 77 ++++++++++++++++++++----------------- fs/bcachefs/buckets.h | 13 +------ fs/bcachefs/dirent.c | 6 +-- fs/bcachefs/ec.c | 10 ++--- fs/bcachefs/fs-io.c | 13 +++---- fs/bcachefs/fsck.c | 8 ++-- fs/bcachefs/inode.c | 6 +-- fs/bcachefs/io.c | 6 +-- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/recovery.c | 55 ++++++++++++++++---------- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/str_hash.h | 4 +- fs/bcachefs/tests.c | 10 ++--- 22 files changed, 189 insertions(+), 173 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 5a8d8311c08d..acc1d03c79e4 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -380,7 +380,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, iter, &new->k_i); + bch2_trans_update(trans, iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bd3e46d066bd..5bc8e7531403 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -222,8 +222,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) bch2_mark_key(c, k, 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + 
BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -235,8 +235,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) if (j->btree_id == BTREE_ID_ALLOC) bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); @@ -314,11 +314,10 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, new_u); - bch2_trans_update(trans, iter, &a->k_i); + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOMARK| - flags); + BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) goto retry; @@ -383,8 +382,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ret = bch2_alloc_write_key(&trans, iter, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_INSERT_JOURNAL_REPLAY); bch2_trans_exit(&trans); return ret < 0 ? ret : 0; } @@ -901,7 +899,8 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i); + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE); /* * XXX: @@ -917,7 +916,6 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_BUCKET_INVALIDATE| flags); if (ret == -EINTR) goto retry; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2eaf6a55c06c..a0b65267cf76 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -118,8 +118,8 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; unsigned flags = - BCH_BUCKET_MARK_GC| - (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; if (initial) { @@ -296,8 +296,8 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, BTREE_ITER_SLOTS, k, ret) { percpu_down_read(&c->mark_lock); ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BCH_BUCKET_MARK_GC| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_GC| + BTREE_TRIGGER_NOATOMIC); percpu_up_read(&c->mark_lock); if (!ret) @@ -409,7 +409,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); + bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -426,7 +426,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -447,7 +447,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); @@ -455,7 +455,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&c->freelist_lock); @@ -469,7 +469,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6d95a5674e97..63d04873addb 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -245,6 +245,7 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) } struct btree_insert_entry { + unsigned trigger_flags; struct bkey_i *k; struct btree_iter *iter; }; @@ -484,6 +485,32 @@ static inline bool btree_node_is_extents(struct btree *b) (1U << BKEY_TYPE_INODES)| \ (1U << BKEY_TYPE_REFLINK)) +enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + __BTREE_TRIGGER_OVERWRITE_SPLIT, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_ALLOC_READ, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) +#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7f61351aed71..add7217598ed 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,7 +15,7 @@ bool bch2_btree_bset_insert_key(struct 
btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -enum { +enum btree_insert_flags { __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, @@ -24,9 +24,6 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, - __BTREE_INSERT_NOMARK_OVERWRITES, - __BTREE_INSERT_NOMARK, - __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -53,14 +50,6 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't mark overwrites, just new key: */ -#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) - -/* Don't call mark new key at all: */ -#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) - -#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) - /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) @@ -108,15 +97,15 @@ static inline int bch2_trans_commit(struct btree_trans *trans, } static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) + struct btree_iter *iter, struct bkey_i *k, + enum btree_trigger_flags flags) { EBUG_ON(trans->nr_updates >= trans->nr_iters); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { - .iter = iter, .k = k + .trigger_flags = flags, .iter = iter, .k = k }; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 870eb0938c22..c8fbee82cc56 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -193,8 +193,8 @@ found: gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_OVERWRITE| + BTREE_TRIGGER_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -265,13 +265,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); + 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_OVERWRITE| + BTREE_TRIGGER_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1084,12 +1084,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), 0, 0, &fs_usage->u, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_INSERT| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC); if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, @@ -1182,13 +1182,13 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_mark_key_locked(c, bkey_i_to_s_c(insert), 0, 0, &fs_usage->u, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), 0, 0, NULL, 0, - 
BCH_BUCKET_MARK_INSERT| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -2031,12 +2031,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), 0, 0, &fs_usage->u, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), 0, 0, NULL, 0, - BCH_BUCKET_MARK_INSERT|| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT|| + BTREE_TRIGGER_GC); bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index abcfd42fc24f..07a6213914f7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -395,17 +395,11 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE - ? BCH_BUCKET_MARK_BUCKET_INVALIDATE - : 0; - - if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) - return; trans_for_each_update(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i->iter, i->k, NULL, - mark_flags|BCH_BUCKET_MARK_GC); + i->trigger_flags|BTREE_TRIGGER_GC); } static inline int @@ -415,9 +409,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; - unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE - ? BCH_BUCKET_MARK_BUCKET_INVALIDATE - : 0; unsigned iter, u64s = 0; bool marking = false; int ret; @@ -490,10 +481,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, } trans_for_each_update(trans, i) - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - iter_has_nontrans_triggers(i->iter)) + if (iter_has_nontrans_triggers(i->iter)) bch2_mark_update(trans, i->iter, i->k, - &fs_usage->u, mark_flags); + &fs_usage->u, i->trigger_flags); if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); @@ -753,9 +743,9 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out; } - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - iter_has_trans_triggers(i->iter)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (iter_has_trans_triggers(i->iter)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k, + i->trigger_flags); if (unlikely(ret)) { if (ret == -EINTR) trace_trans_restart_mark(trans->ip); @@ -805,7 +795,7 @@ static int __bch2_btree_insert(struct btree_trans *trans, if (IS_ERR(iter)) return PTR_ERR(iter); - bch2_trans_update(trans, iter, k); + bch2_trans_update(trans, iter, k, 0); return 0; } @@ -867,7 +857,7 @@ retry: break; } - bch2_trans_update(trans, iter, &delete); + bch2_trans_update(trans, iter, &delete, 0); ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_NOFAIL); if (ret) @@ -893,7 +883,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, iter, &k); + bch2_trans_update(trans, iter, &k, 0); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b516b636fbf2..d90bcbc0a005 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -628,7 +628,7 @@ unwind: percpu_rwsem_assert_held(&c->mark_lock); \ \ for (gc 
= 0; gc < 2 && !ret; gc++) \ - if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ + if (!gc == !(flags & BTREE_TRIGGER_GC) || \ (gc && gc_visited(c, pos))) \ ret = fn(c, __VA_ARGS__, gc); \ ret; \ @@ -710,7 +710,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; @@ -719,8 +719,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, /* * alloc btree is read in by bch2_alloc_read, not gc: */ - if ((flags & BCH_BUCKET_MARK_GC) && - !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) + if ((flags & BTREE_TRIGGER_GC) && + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -743,7 +743,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, } })); - if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) + if (!(flags & BTREE_TRIGGER_ALLOC_READ)) bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); g->io_time[READ] = u.read_time; @@ -756,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, * not: */ - if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); @@ -842,13 +842,13 @@ static s64 __ptr_disk_sectors_delta(unsigned old_size, { BUG_ON(!n || !d); - if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { BUG_ON(offset + -delta > old_size); return -disk_sectors_scaled(n, d, old_size) + disk_sectors_scaled(n, d, offset) + disk_sectors_scaled(n, d, old_size - offset + delta); - } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + } else if (flags & BTREE_TRIGGER_OVERWRITE) { BUG_ON(offset + -delta > old_size); return -disk_sectors_scaled(n, d, old_size) + @@ -874,8 +874,8 @@ static void bucket_set_stripe(struct bch_fs *c, u64 journal_seq, unsigned flags) { - bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); - bool gc = flags & BCH_BUCKET_MARK_GC; + bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); + bool gc = flags & BTREE_TRIGGER_GC; unsigned i; for (i = 0; i < v->nr_blocks; i++) { @@ -922,7 +922,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -970,7 +970,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, new.data_type = data_type; } - if (flags & BCH_BUCKET_MARK_NOATOMIC) { + if (flags & BTREE_TRIGGER_NOATOMIC) { g->_mark = new; break; } @@ -1008,7 +1008,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, unsigned *nr_data, unsigned *nr_parity) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct stripe *m; unsigned old, new; int blocks_nonempty_delta; @@ -1121,7 +1121,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); size_t idx = s.k->p.offset; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); @@ -1129,14 +1129,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, 
spin_lock(&c->ec_stripes_heap_lock); - if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { + if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { m->sectors = le16_to_cpu(s.v->sectors); m->algorithm = s.v->algorithm; m->nr_blocks = s.v->nr_blocks; @@ -1152,7 +1152,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, #endif /* gc recalculates these fields: */ - if (!(flags & BCH_BUCKET_MARK_GC)) { + if (!(flags & BTREE_TRIGGER_GC)) { for (i = 0; i < s.v->nr_blocks; i++) { m->block_sectors[i] = stripe_blockcount_get(s.v, i); @@ -1185,16 +1185,16 @@ int bch2_mark_key_locked(struct bch_fs *c, preempt_disable(); - if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) + if (!fs_usage || (flags & BTREE_TRIGGER_GC)) fs_usage = fs_usage_ptr(c, journal_seq, - flags & BCH_BUCKET_MARK_GC); + flags & BTREE_TRIGGER_GC); switch (k.k->type) { case KEY_TYPE_alloc: ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: - sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1210,7 +1210,7 @@ int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + if (!(flags & BTREE_TRIGGER_OVERWRITE)) fs_usage->nr_inodes++; else fs_usage->nr_inodes--; @@ -1260,7 +1260,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, unsigned offset = 0; s64 sectors = 0; - flags |= BCH_BUCKET_MARK_OVERWRITE; + flags |= BTREE_TRIGGER_OVERWRITE; if (btree_node_is_extents(b) ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 @@ -1288,7 +1288,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, offset = bkey_start_offset(&new->k) - bkey_start_offset(old.k); sectors = -((s64) new->k.size); - flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } @@ -1311,15 +1311,18 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey_packed *_k; int ret = 0; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + if (!btree_node_type_needs_gc(iter->btree_id)) return 0; bch2_mark_key_locked(c, bkey_i_to_s_c(insert), 0, insert->k.size, fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); + BTREE_TRIGGER_INSERT|flags); - if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; /* @@ -1450,7 +1453,7 @@ static void *trans_update_key(struct btree_trans *trans, return new_k; } - bch2_trans_update(trans, iter, new_k); + bch2_trans_update(trans, iter, new_k, 0); return new_k; } @@ -1689,7 +1692,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, goto err; } - if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + if ((flags & BTREE_TRIGGER_OVERWRITE) && (bkey_start_offset(k.k) < idx || k.k->p.offset > idx + sectors)) goto out; @@ -1706,7 +1709,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, r_v = bkey_i_to_reflink_v(new_k); le64_add_cpu(&r_v->v.refcount, - !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1); + !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1); if (!r_v->v.refcount) { r_v->k.type = KEY_TYPE_deleted; @@ -1750,7 +1753,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, switch (k.k->type) { case KEY_TYPE_btree_ptr: - sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1763,7 +1766,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + if (!(flags & BTREE_TRIGGER_OVERWRITE)) d->nr_inodes++; else d->nr_inodes--; @@ -1791,22 +1794,26 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, int bch2_trans_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert) + struct bkey_i *insert, + unsigned flags) { struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; int ret; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + if (!btree_node_type_needs_gc(iter->btree_id)) return 0; ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), - 0, insert->k.size, BCH_BUCKET_MARK_INSERT); + 0, insert->k.size, BTREE_TRIGGER_INSERT); if (ret) return ret; - if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, @@ -1815,7 +1822,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, struct bkey_s_c k; unsigned offset = 0; s64 sectors = 0; - unsigned flags = BCH_BUCKET_MARK_OVERWRITE; + unsigned flags = BTREE_TRIGGER_OVERWRITE; k = bkey_disassemble(b, _k, &unpacked); @@ -1845,7 +1852,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); sectors = -((s64) insert->k.size); - flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7b1bbe7c9316..2e49f2a8ccd9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -258,14 +258,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_INSERT (1 << 0) -#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) -#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) -#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) -#define BCH_BUCKET_MARK_GC (1 << 4) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) - int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, @@ -284,9 +276,8 @@ int bch2_replicas_delta_list_apply(struct bch_fs *, struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, unsigned, s64, unsigned); -int bch2_trans_mark_update(struct btree_trans *, - struct btree_iter *iter, - struct bkey_i *insert); +int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); /* disk reservations: */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1bf53c55912d..4b4aeaf81d21 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -246,7 +246,7 @@ int bch2_dirent_rename(struct btree_trans *trans, */ new_dst->k.p = 
src_iter->pos; bch2_trans_update(trans, src_iter, - &new_dst->k_i); + &new_dst->k_i, 0); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -268,8 +268,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i); - bch2_trans_update(trans, dst_iter, &new_dst->k_i); + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a6bc9355c750..0e2acd4f5712 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -736,7 +736,7 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, iter, &stripe->k_i); + bch2_trans_update(&trans, iter, &stripe->k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -818,7 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, idx); - bch2_trans_update(&trans, iter, sk.k); + bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -1230,7 +1230,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); - bch2_trans_update(trans, iter, &new_key->k_i); + bch2_trans_update(trans, iter, &new_key->k_i, 0); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -1316,8 +1316,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_mark_key(c, btree ? btree_k : journal_k, 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); if (btree) btree_k = bch2_btree_iter_next(btree_iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 15b0d20b2f81..25fe9ab0f068 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2465,7 +2465,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos next_pos; struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; - unsigned commit_flags = 0; + unsigned trigger_flags = 0; k = insert ? 
bch2_btree_iter_peek_prev(src) @@ -2536,15 +2536,12 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, dst, copy.k); - bch2_trans_update(&trans, del ?: src, &delete); - if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip * running triggers: */ - commit_flags |= BTREE_INSERT_NOMARK; + trigger_flags |= BTREE_TRIGGER_NORUN; } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = @@ -2556,10 +2553,12 @@ reassemble: BUG_ON(ret); } + bch2_trans_update(&trans, dst, copy.k, trigger_flags); + bch2_trans_update(&trans, del ?: src, &delete, trigger_flags); + ret = bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL| - commit_flags); + BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); bkey_err: if (del) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e25f064706ad..9ef532d875e8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -192,7 +192,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, bkey_init(&delete.k); delete.k.p = k_iter->pos; - bch2_trans_update(trans, k_iter, &delete); + bch2_trans_update(trans, k_iter, &delete, 0); return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, tmp, BCH_HASH_SET_MUST_CREATE) ?: @@ -388,7 +388,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, - (bch2_trans_update(trans, iter, &d->k_i), 0)); + (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); if (ret) goto err; @@ -661,7 +661,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, - (bch2_trans_update(&trans, iter, &n->k_i), 0)); + (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); kfree(n); if (ret) goto err; @@ -1276,7 +1276,7 @@ static int check_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, TRANS_RESET_MEM, - (bch2_trans_update(trans, iter, &p.inode.k_i), 0)); + (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 77ac9ab7fc57..bd44ef3842cb 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -223,7 +223,7 @@ int bch2_inode_write(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode); - bch2_trans_update(trans, iter, &inode_p->inode.k_i); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -411,7 +411,7 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } } @@ -493,7 +493,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - bch2_trans_update(&trans, iter, &delete.k_i); + bch2_trans_update(&trans, iter, &delete.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ba79b35a130f..1fab0bdcf9ae 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -302,13 +302,13 @@ int bch2_extent_update(struct btree_trans *trans, if (delta || new_i_size) { bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i); + &inode_p.inode.k_i, 0); } bch2_trans_iter_put(trans, inode_iter); } - bch2_trans_update(trans, iter, k); + bch2_trans_update(trans, iter, k, 0); ret = bch2_trans_commit(trans, disk_res, journal_seq, 
BTREE_INSERT_NOCHECK_RW| @@ -1740,7 +1740,7 @@ retry: if (!bch2_bkey_narrow_crcs(new.k, new_crc)) goto out; - bch2_trans_update(&trans, iter, new.k); + bch2_trans_update(&trans, iter, new.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOWAIT); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index db86420bd647..0e3f63c1d65c 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -76,7 +76,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, sk.k); + bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index cb7bb751b7b5..7ed90b0576c0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -149,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - bch2_trans_update(&trans, iter, insert); + bch2_trans_update(&trans, iter, insert, 0); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 0fa6f33c049b..e7787c5063ce 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - bch2_trans_update(&trans, iter, &new_quota.k_i); + bch2_trans_update(&trans, iter, &new_quota.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c7367a679b22..8ecd4abc8eeb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -300,28 +300,24 @@ retry: bch2_cut_front(split_iter->pos, split); bch2_cut_back(atomic_end, split); - bch2_trans_update(&trans, split_iter, split); + bch2_trans_update(&trans, split_iter, split, !remark + ? 
BTREE_TRIGGER_NORUN + : BTREE_TRIGGER_NOOVERWRITES); bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), 0, -((s64) k->k.size), - BCH_BUCKET_MARK_OVERWRITE) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOMARK_OVERWRITES); - } else { - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_TRIGGER_OVERWRITE); + if (ret) + goto err; } - if (ret) - goto err; + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); err: if (ret == -EINTR) goto retry; @@ -331,6 +327,30 @@ err: return bch2_trans_exit(&trans) ?: ret; } +static int __bch2_journal_replay_key(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) +{ + struct btree_iter *iter; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + return 0; +} + +static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, + struct bkey_i *k) +{ + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY, + __bch2_journal_replay_key(&trans, id, k)); +} + static int bch2_journal_replay(struct bch_fs *c, struct journal_keys keys) { @@ -348,12 +368,7 @@ static int bch2_journal_replay(struct bch_fs *c, else if (btree_node_type_is_extents(i->btree_id)) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else - ret = bch2_btree_insert(c, i->btree_id, i->k, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + ret = bch2_journal_replay_key(c, i->btree_id, i->k); if (ret) { bch_err(c, "journal replay: error %d while replaying key", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 2bf003ba3bd8..3b8c74ca3725 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -115,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->v.refcount = 0; memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); - bch2_trans_update(trans, reflink_iter, &r_v->k_i); + bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) @@ -126,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, extent_iter, &r_p->k_i); + bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: if (!IS_ERR(reflink_iter)) { c->reflink_hint = reflink_iter->pos.offset; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 3870df2d58ce..35f4232d0755 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -281,7 +281,7 @@ not_found: swap(iter, slot); insert->k.p = iter->pos; - bch2_trans_update(trans, iter, insert); + bch2_trans_update(trans, iter, insert, 0); } goto out; @@ -308,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, iter, delete); + bch2_trans_update(trans, iter, delete, 0); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index a2092bb99095..8ef7bc8098d7 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } @@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; - bch2_trans_update(&trans, iter, &insert.k_i); + bch2_trans_update(&trans, iter, &insert.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, iter, &u.k_i); + bch2_trans_update(&trans, iter, &u.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); }
-- cgit

From 24326cd12aa03ea2c6808d09d285af2cecfa4789 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 31 Dec 2019 19:37:10 -0500
Subject: bcachefs: Sort & deduplicate updates in bch2_trans_update()

Previously, when doing multiple updates in the same transaction commit that overwrote each other, we relied on doing the updates in the same order as the bch2_trans_update() calls in order to get the correct result. But that wasn't correct for triggers: when marking overwrites, bch2_trans_mark_update() would do the wrong thing because it hadn't seen the update that was being overwritten.
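To make the new behaviour concrete, here is a minimal, self-contained sketch of the idea (illustrative only - this is not bcachefs code, and plain integer intervals stand in for bkeys and iterators): pending updates are kept sorted by start position, and any earlier update that the new one overlaps is trimmed or dropped at update time, so trigger code that walks the list later never sees a stale, overwritten update.

	#include <stdio.h>
	#include <string.h>

	struct update { int start, end; };	/* half-open interval [start, end) */

	static struct update updates[8];
	static int nr_updates;

	/* Insert a new update, keeping the list sorted and overlap-free: */
	static void trans_update(struct update n)
	{
		int i;

		/* find the insert position (list is sorted by start) */
		for (i = 0; i < nr_updates; i++)
			if (n.start <= updates[i].start)
				break;

		/* the previous update loses whatever the new one overwrites: */
		if (i && updates[i - 1].end > n.start)
			updates[i - 1].end = n.start;

		/* drop following updates the new one overwrites completely: */
		while (i < nr_updates && n.end >= updates[i].end) {
			memmove(&updates[i], &updates[i + 1],
				(nr_updates - i - 1) * sizeof(updates[0]));
			nr_updates--;
		}

		/* trim the front of a partially overwritten following update: */
		if (i < nr_updates && n.end > updates[i].start)
			updates[i].start = n.end;

		memmove(&updates[i + 1], &updates[i],
			(nr_updates - i) * sizeof(updates[0]));
		updates[i] = n;
		nr_updates++;
	}

	int main(void)
	{
		trans_update((struct update){ 0, 10 });
		trans_update((struct update){ 5, 20 });	/* overwrites the tail of [0, 10) */

		for (int i = 0; i < nr_updates; i++)
			printf("[%d, %d)\n", updates[i].start, updates[i].end);
		/* prints [0, 5) then [5, 20): the overlap was resolved at update time */
		return 0;
	}

The real bch2_trans_update() in the btree_update_leaf.c hunk below does the same thing on bkeys with bch2_cut_back()/bch2_cut_front(), and additionally clones an iterator when trimming would move a position that other code is still using.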
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 22 +++-- fs/bcachefs/btree_types.h | 12 +-- fs/bcachefs/btree_update.h | 15 +--- fs/bcachefs/btree_update_leaf.c | 176 ++++++++++++++++++++++++++-------------- fs/bcachefs/buckets.c | 58 ++++--------- fs/bcachefs/fs-io.c | 40 ++------- 6 files changed, 159 insertions(+), 164 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0cd1b84c02ba..d1218d34232d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1793,10 +1793,9 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates, *new_sorted; + void *new_iters, *new_updates; size_t iters_bytes; size_t updates_bytes; - size_t sorted_bytes; new_size = roundup_pow_of_two(new_size); @@ -1811,11 +1810,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, iters_bytes = sizeof(struct btree_iter) * new_size; updates_bytes = sizeof(struct btree_insert_entry) * new_size; - sorted_bytes = sizeof(u8) * new_size; - new_iters = kmalloc(iters_bytes + - updates_bytes + - sorted_bytes, GFP_NOFS); + new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); if (new_iters) goto success; @@ -1825,7 +1821,6 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, trans->used_mempool = true; success: new_updates = new_iters + iters_bytes; - new_sorted = new_updates + updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1842,7 +1837,6 @@ success: trans->iters = new_iters; trans->updates = new_updates; - trans->updates_sorted = new_sorted; trans->size = new_size; if (trans->iters_live) { @@ -1891,6 +1885,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) got_slot: BUG_ON(trans->iters_linked & (1ULL << idx)); trans->iters_linked |= 1ULL << idx; + trans->iters[idx].flags = 0; return &trans->iters[idx]; } @@ -1906,6 +1901,9 @@ static inline void btree_iter_copy(struct btree_iter *dst, if (btree_node_locked(dst, i)) six_lock_increment(&dst->l[i].b->c.lock, __btree_lock_want(dst, i)); + + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; } static inline struct bpos bpos_diff(struct bpos l, struct bpos r) @@ -1956,7 +1954,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter = best; } - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); @@ -1968,6 +1965,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, BUG_ON(iter->btree_id != btree_id); BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); BUG_ON(trans->iters_live & (1ULL << iter->idx)); trans->iters_live |= 1ULL << iter->idx; @@ -2030,7 +2028,6 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, * it's cheap to copy it again: */ trans->iters_touched &= ~(1ULL << iter->idx); - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; return iter; } @@ -2090,7 +2087,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) struct btree_iter *iter; trans_for_each_iter(trans, iter) - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| + 
BTREE_ITER_SET_POS_AFTER_COMMIT); bch2_trans_unlink_iters(trans); @@ -2099,6 +2097,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; + trans->need_reset = 0; trans->nr_updates = 0; if (flags & TRANS_RESET_MEM) @@ -2127,7 +2126,6 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; - trans->updates_sorted = trans->updates_sorted_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 63d04873addb..3951933db5d6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -197,6 +197,7 @@ enum btree_iter_type { */ #define BTREE_ITER_IS_EXTENTS (1 << 6) #define BTREE_ITER_ERROR (1 << 7) +#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -213,12 +214,13 @@ enum btree_iter_uptodate { * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - u8 idx; - struct btree_trans *trans; struct bpos pos; + struct bpos pos_after_commit; + + u16 flags; + u8 idx; - u8 flags; enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; unsigned level:4, @@ -246,6 +248,7 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) struct btree_insert_entry { unsigned trigger_flags; + unsigned trans_triggers_run:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -266,6 +269,7 @@ struct btree_trans { unsigned used_mempool:1; unsigned error:1; unsigned nounlock:1; + unsigned need_reset:1; unsigned mem_top; unsigned mem_bytes; @@ -273,7 +277,6 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; - u8 *updates_sorted; /* update path: */ struct journal_res journal_res; @@ -287,7 +290,6 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[2]; - u8 updates_sorted_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index add7217598ed..2c34bae64281 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -72,6 +72,8 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); +int bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_trigger_flags); int __bch2_trans_commit(struct btree_trans *); /** @@ -96,19 +98,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_i *k, - enum btree_trigger_flags flags) -{ - EBUG_ON(trans->nr_updates >= trans->nr_iters); - - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { - .trigger_flags = flags, .iter = iter, .k = k - }; -} - #define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ _flags, _reset_flags, _do) \ ({ \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 07a6213914f7..2242b2061ee2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -21,18 +21,12 @@ #include static inline bool same_leaf_as_prev(struct btree_trans *trans, - unsigned idx) + struct 
btree_insert_entry *i) { - return idx && - trans->updates[trans->updates_sorted[idx]].iter->l[0].b == - trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; + return i != trans->updates && + i[0].iter->l[0].b == i[-1].iter->l[0].b; } -#define trans_for_each_update_sorted(_trans, _i, _iter) \ - for (_iter = 0; \ - _iter < _trans->nr_updates && \ - (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ - _iter++) inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) @@ -51,28 +45,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static inline void btree_trans_sort_updates(struct btree_trans *trans) -{ - struct btree_insert_entry *l, *r; - unsigned nr = 0, pos; - - trans_for_each_update(trans, l) { - for (pos = 0; pos < nr; pos++) { - r = trans->updates + trans->updates_sorted[pos]; - - if (btree_iter_cmp(l->iter, r->iter) <= 0) - break; - } - - memmove(&trans->updates_sorted[pos + 1], - &trans->updates_sorted[pos], - (nr - pos) * sizeof(trans->updates_sorted[0])); - - trans->updates_sorted[pos] = l - trans->updates; - nr++; - } -} - /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -409,7 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; - unsigned iter, u64s = 0; + unsigned u64s = 0; bool marking = false; int ret; @@ -426,9 +398,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update_sorted(trans, i, iter) { + trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, iter)) + if (!same_leaf_as_prev(trans, i)) u64s = 0; u64s += i->k->k.u64s; @@ -510,7 +482,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct btree_insert_entry *i; struct btree_iter *iter; - unsigned idx; int ret; trans_for_each_update(trans, i) @@ -545,21 +516,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - /* - * No more updates can be added - sort updates so we can take write - * locks in the correct order: - */ - btree_trans_sort_updates(trans); - - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, i->iter->l[0].b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, i->iter); @@ -575,8 +540,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); @@ -708,9 +673,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; + struct btree_iter *iter; + bool trans_trigger_run; unsigned u64s; int ret = 0; 
+ BUG_ON(trans->need_reset); + if (!trans->nr_updates) goto out_noupdates; @@ -730,9 +699,29 @@ int __bch2_trans_commit(struct btree_trans *trans) } /* - * note: running triggers will append more updates to the list of - * updates as we're walking it: + * Running triggers will append more updates to the list of updates as + * we're walking it: */ + do { + trans_trigger_run = false; + + trans_for_each_update(trans, i) { + if (iter_has_trans_triggers(i->iter) && + !i->trans_triggers_run) { + i->trans_triggers_run = true; + trans_trigger_run = true; + + ret = bch2_trans_mark_update(trans, i->iter, i->k, + i->trigger_flags); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + goto out; + } + } + } + } while (trans_trigger_run); + trans_for_each_update(trans, i) { /* we know trans->nounlock won't be set here: */ if (unlikely(!(i->iter->locks_want < 1 @@ -743,16 +732,6 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out; } - if (iter_has_trans_triggers(i->iter)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k, - i->trigger_flags); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - goto out; - } - } - u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; @@ -768,6 +747,15 @@ retry: if (ret) goto err; + + trans_for_each_iter(trans, iter) + if ((trans->iters_live & (1ULL << iter->idx)) && + (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { + if (trans->flags & BTREE_INSERT_NOUNLOCK) + bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); + else + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); + } out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); @@ -785,6 +773,76 @@ err: goto retry; } +int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_trigger_flags flags) +{ + struct btree_insert_entry *i, n = (struct btree_insert_entry) { + .trigger_flags = flags, .iter = iter, .k = k + }; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + iter->pos_after_commit = k->k.p; + iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + } + + /* + * Pending updates are kept sorted: first, find position of new update: + */ + trans_for_each_update(trans, i) + if (btree_iter_cmp(iter, i->iter) <= 0) + break; + + /* + * Now delete/trim any updates the new update overwrites: + */ + if (i > trans->updates && + i[-1].iter->btree_id == iter->btree_id && + bkey_cmp(iter->pos, i[-1].k->k.p) < 0) + bch2_cut_back(n.iter->pos, i[-1].k); + + while (i < trans->updates + trans->nr_updates && + iter->btree_id == i->iter->btree_id && + bkey_cmp(n.k->k.p, i->k->k.p) >= 0) + array_remove_item(trans->updates, trans->nr_updates, + i - trans->updates); + + if (i < trans->updates + trans->nr_updates && + iter->btree_id == i->iter->btree_id && + bkey_cmp(n.k->k.p, i->iter->pos) > 0) { + /* + * When we have an extent that overwrites the start of another + * update, trimming that extent will mean the iterator's + * position has to change since the iterator position has to + * match the extent's start pos - but we don't want to change + * the iterator pos if some other code is using it, so we may + * need to clone it: + */ + if (trans->iters_live & (1ULL << i->iter->idx)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); + if (IS_ERR(i->iter)) { + trans->need_reset = true; + return PTR_ERR(i->iter); + } + + i->iter->flags |= 
BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, i->iter); + } + + bch2_cut_front(n.k->k.p, i->k); + bch2_btree_iter_set_pos(i->iter, n.k->k.p); + } + + EBUG_ON(trans->nr_updates >= trans->nr_iters); + + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + return 0; +} + static int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d90bcbc0a005..60ad443bb509 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1433,30 +1433,6 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static void *trans_update_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned u64s) -{ - struct btree_insert_entry *i; - struct bkey_i *new_k; - - new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(new_k)) - return new_k; - - bkey_init(&new_k->k); - new_k->k.p = iter->pos; - - trans_for_each_update(trans, i) - if (i->iter == iter) { - i->k = new_k; - return new_k; - } - - bch2_trans_update(trans, iter, new_k, 0); - return new_k; -} - static int bch2_trans_mark_pointer(struct btree_trans *trans, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) @@ -1540,7 +1516,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, u.data_type = u.dirty_sectors || u.cached_sectors ? data_type : 0; - a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1548,6 +1524,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, bkey_alloc_init(&a->k_i); a->k.p = iter->pos; bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1562,9 +1539,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter *iter; - struct bkey_i *new_k; struct bkey_s_c k; - struct bkey_s_stripe s; + struct bkey_i_stripe *s; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1579,21 +1555,21 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; } - new_k = trans_update_key(trans, iter, k.k->u64s); - ret = PTR_ERR_OR_ZERO(new_k); + s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(s); if (ret) goto out; - bkey_reassemble(new_k, k); - s = bkey_i_to_s_stripe(new_k); + bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(s.v, p.block, - stripe_blockcount_get(s.v, p.block) + + stripe_blockcount_set(&s->v, p.block, + stripe_blockcount_get(&s->v, p.block) + sectors); - *nr_data = s.v->nr_blocks - s.v->nr_redundant; - *nr_parity = s.v->nr_redundant; - bch2_bkey_to_replicas(&r->e, s.s_c); + *nr_data = s->v.nr_blocks - s->v.nr_redundant; + *nr_parity = s->v.nr_redundant; + bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); + bch2_trans_update(trans, iter, &s->k_i, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1674,7 +1650,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter *iter; - struct bkey_i *new_k; struct bkey_s_c k; struct bkey_i_reflink_v *r_v; s64 ret; @@ -1700,13 +1675,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - new_k = trans_update_key(trans, iter, k.k->u64s); - ret = PTR_ERR_OR_ZERO(new_k); + r_v = 
bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; - bkey_reassemble(new_k, k); - r_v = bkey_i_to_reflink_v(new_k); + bkey_reassemble(&r_v->k_i, k); le64_add_cpu(&r_v->v.refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); @@ -1715,6 +1689,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, r_v->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&r_v->k, 0); } + + bch2_trans_update(trans, iter, &r_v->k_i, 0); out: ret = k.k->p.offset - idx; err: diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 25fe9ab0f068..f6a597f54d16 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2383,7 +2383,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_on_stack copy; struct btree_trans trans; - struct btree_iter *src, *dst, *del = NULL; + struct btree_iter *src, *dst; loff_t shift, new_size; u64 src_start; int ret; @@ -2513,29 +2513,6 @@ reassemble: next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; - /* - * If the new and old keys overlap (because we're moving an - * extent that's bigger than the amount we're collapsing by), - * we need to trim the delete key here so they don't overlap - * because overlaps on insertions aren't handled before - * triggers are run, so the overwrite will get double counted - * by the triggers machinery: - */ - if (insert && - bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(©.k->k), &delete); - } else if (!insert && - bkey_cmp(copy.k->k.p, - bkey_start_pos(&delete.k)) > 0) { - bch2_cut_front(copy.k->k.p, &delete); - - del = bch2_trans_copy_iter(&trans, src); - BUG_ON(IS_ERR_OR_NULL(del)); - - bch2_btree_iter_set_pos(del, - bkey_start_pos(&delete.k)); - } - if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip @@ -2553,18 +2530,13 @@ reassemble: BUG_ON(ret); } - bch2_trans_update(&trans, dst, copy.k, trigger_flags); - bch2_trans_update(&trans, del ?: src, &delete, trigger_flags); - - ret = bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL); + ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); bkey_err: - if (del) - bch2_trans_iter_put(&trans, del); - del = NULL; - if (!ret) bch2_btree_iter_set_pos(src, next_pos); -- cgit From 9ad26b2b568f75ee6872a3afa70fa8005994efe8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jan 2020 16:03:10 -0500 Subject: bcachefs: Make sure bch2_read_extent obeys BCH_READ_MUST_CLONE This fixes the bch2_read_retry_nodecode() path, we were resubmitting a bio without properly reinitializing it. 
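In outline (a simplified sketch of the control-flow change in the diff below, not the complete function): the nodecode/retry path used to jump straight to a noclone: label and reuse the original bio even when BCH_READ_MUST_CLONE was set; it now jumps to a get_bio: label placed before the promote/bounce/clone decisions, so a retried read re-derives its rbio and bio iterator:

    if (flags & BCH_READ_NODECODE) {
        /* ... */
        iter.bi_size = pick.crc.compressed_size << 9;
        goto get_bio;            /* was: goto noclone */
    }
    /* ... */
get_bio:
    /* promote buffer, bounce buffer, clone, or reuse orig - in every case
     * rbio->bio.bi_iter is set from the current iter before submission */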
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1fab0bdcf9ae..a419024ce039 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1981,7 +1981,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, goto hole; iter.bi_size = pick.crc.compressed_size << 9; - goto noclone; + goto get_bio; } if (!(flags & BCH_READ_LAST_FRAGMENT) || @@ -2028,7 +2028,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pick.crc.live_size = bvec_iter_sectors(iter); offset_into_extent = 0; } - +get_bio: if (rbio) { /* * promote already allocated bounce rbio: */ @@ -2068,7 +2068,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_iter = iter; rbio->split = true; } else { -noclone: rbio = orig; rbio->bio.bi_iter = iter; EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); -- cgit From d5cdf033cd371eb55f7337ac08bb4ac48135ce8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jan 2020 18:04:19 -0500 Subject: bcachefs: Fix an iterator error path On transaction restart (-EINTR), we need to traverse all iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d1218d34232d..f37109150e42 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1027,10 +1027,7 @@ retry_all: for (i = 0; i < nr_sorted; i++) { iter = &trans->iters[sorted[i]]; - do { - ret = btree_iter_traverse_one(iter); - } while (ret == -EINTR); - + ret = btree_iter_traverse_one(iter); if (ret) goto retry_all; } -- cgit From 3e548da8f57ef41523f6f7fe72f812116af48ba1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jan 2020 18:57:32 -0500 Subject: bcachefs: Don't print anything when device doesn't have a label Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index d3713db317ce..13b48a0fc87d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -914,8 +914,6 @@ SHOW(bch2_dev) bch2_disk_path_to_text(&out, &c->disk_sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); - } else { - pr_buf(&out, "none"); } pr_buf(&out, "\n"); -- cgit From 31ba2cd33037e1011947b7abbfd70921c735841d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jan 2020 22:38:14 -0500 Subject: bcachefs: Hacky fixes for device removal The device remove test was sporadically failing, because we hadn't finished dropping btree sector counts for the device when bch2_replicas_gc2() was called - mainly due to in flight journal writes. We don't yet have a good mechanism for flushing the counts that correspond to open journal entries.
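The ordering this settles on in bch2_dev_remove() is roughly the following (a condensed sketch of the super.c changes below; error handling and the earlier data-drop steps are omitted):

    /* flush journal pins referencing the device being removed: */
    bch2_journal_flush_all_pins(&c->journal);

    /* hack: force a journal meta write so the open journal entry closes
     * and bch2_replicas_gc2() can clear out entries for this device: */
    bch2_journal_meta(&c->journal);

    ret = bch2_replicas_gc2(c);

    /* only after replicas gc is the "still has data?" check meaningful: */
    if (!ret && bch2_dev_has_data(c, ca))
        ret = -EBUSY;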
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 58 +++++++++++++++++++++------------------------------ fs/bcachefs/super.c | 43 +++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 0e3f63c1d65c..1ef62a189e33 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -53,9 +53,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - ret = bch2_mark_bkey_replicas(c, k); - if (ret) - break; bch2_btree_iter_next(iter); continue; } @@ -129,34 +126,27 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) struct bkey_i_btree_ptr *new_key; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), - dev_idx)) { - /* - * we might have found a btree node key we - * needed to update, and then tried to update it - * but got -EINTR after upgrading the iter, but - * then raced and the node is now gone: - */ - bch2_btree_iter_downgrade(iter); - - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - if (ret) - goto err; - } else { - bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - - ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), - dev_idx, flags, true); - if (ret) - goto err; - - ret = bch2_btree_node_update_key(c, iter, b, new_key); - if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(iter); - goto retry; - } - if (ret) - goto err; + dev_idx)) + continue; + + bkey_copy(&tmp.k, &b->key); + new_key = bkey_i_to_btree_ptr(&tmp.k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + goto err; + } + + ret = bch2_btree_node_update_key(c, iter, b, new_key); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + goto retry; + } + if (ret) { + bch_err(c, "Error updating btree node key: %i", ret); + goto err; } } bch2_trans_iter_free(&trans, iter); @@ -167,9 +157,10 @@ retry: closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c) || c->btree_roots_dirty); + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); if (!bch2_btree_interior_updates_nr_pending(c)) break; - bch2_journal_meta(&c->journal); } ret = 0; @@ -184,6 +175,5 @@ err: int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags) ?: - bch2_replicas_gc2(c); + bch2_dev_metadata_drop(c, dev_idx, flags); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cd02e5a5f305..586636a4c204 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1381,7 +1381,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_lock(&c->state_lock); - percpu_ref_put(&ca->ref); /* XXX */ + /* + * We consume a reference to ca->ref, regardless of whether we succeed + * or fail: + */ + percpu_ref_put(&ca->ref); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1390,11 +1394,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - /* - * XXX: verify that dev_idx is really not in use anymore, anywhere - * - * flag_data_bad() does not check btree pointers - */ ret = bch2_dev_data_drop(c, ca->dev_idx, 
flags); if (ret) { bch_err(ca, "Remove failed: error %i dropping data", ret); @@ -1407,17 +1406,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - data = bch2_dev_has_data(c, ca); - if (data) { - char data_has_str[100]; - - bch2_flags_to_text(&PBUF(data_has_str), - bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has_str); - ret = -EBUSY; - goto err; - } - ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), @@ -1432,12 +1420,33 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * (overwritten) keys that point to the device we're removing: */ bch2_journal_flush_all_pins(&c->journal); + /* + * hack to ensure bch2_replicas_gc2() clears out entries to this device + */ + bch2_journal_meta(&c->journal); ret = bch2_journal_error(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; } + ret = bch2_replicas_gc2(c); + if (ret) { + bch_err(ca, "Remove failed: error %i from replicas gc", ret); + goto err; + } + + data = bch2_dev_has_data(c, ca); + if (data) { + char data_has_str[100]; + + bch2_flags_to_text(&PBUF(data_has_str), + bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + ret = -EBUSY; + goto err; + } + __bch2_dev_offline(c, ca); mutex_lock(&c->sb_lock); -- cgit From e7808eef95213678a5c4d009aef636e9037588fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2020 16:09:52 -0500 Subject: bcachefs: Kill bch2_fs_bug() These have all been converted to fsck/inconsistent errors Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/error.h | 20 ----------- fs/bcachefs/extents.c | 87 ++++++++++++++++++++++------------------------ 3 files changed, 43 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index ed448fad83c5..320e17d108d2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -156,7 +156,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); + bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); return; } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7dcb0f6552fc..de319794ccd1 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -16,26 +16,6 @@ struct work_struct; /* Error messages: */ -/* - * Very fatal logic/inconsistency errors: these indicate that we've majorly - * screwed up at runtime, i.e. it's not likely that it was just caused by the - * data on disk being inconsistent. These BUG(): - * - * XXX: audit and convert to inconsistent() checks - */ - -#define bch2_fs_bug(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - BUG(); \ -} while (0) - -#define bch2_fs_bug_on(cond, c, ...) \ -do { \ - if (cond) \ - bch2_fs_bug(c, __VA_ARGS__); \ -} while (0) - /* * Inconsistency errors: The on disk data is inconsistent. 
If these occur during * initial recovery, they don't indicate a bug in the running code - we walk all diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8322b043bdff..ce94e38c0277 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -171,14 +171,17 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) struct bucket_mark mark; struct bch_dev *ca; - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + + bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); @@ -193,13 +196,15 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) mark.dirty_sectors < c->opts.btree_node_size) goto err; } - +out: + percpu_up_read(&c->mark_lock); return; err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); + bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); + goto out; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -222,29 +227,18 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; char buf[160]; - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + + bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + extent_for_each_ptr_decode(e, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); @@ -254,21 +248,24 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ? 
mark.cached_sectors : mark.dirty_sectors; - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); - - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); + bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_inconsistent_on(stale > 96, c, + "key too stale: %i", stale); + + bch2_fs_inconsistent_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } + + percpu_up_read(&c->mark_lock); } void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From b2ca9903cd7a3fd2193aaf123718bcbc9e9e66e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2020 16:33:32 -0500 Subject: bcachefs: Fix extent_to_replicas() This needs to match bch2_mark_extent()/bch2_trans_mark_extent() in buckets.c Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index eef9f54808fb..66787d0c5c63 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,10 +84,10 @@ static void extent_to_replicas(struct bkey_s_c k, if (p.ptr.cached) continue; - if (p.has_ec) + if (!p.has_ec) + r->devs[r->nr_devs++] = p.ptr.dev; + else r->nr_required = 0; - - r->devs[r->nr_devs++] = p.ptr.dev; } } -- cgit From 0abb250125bfb114fa1f471bc5c77f1dc72b9e4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2020 16:46:23 -0500 Subject: bcachefs: Ensure iterators are valid before calling trans_mark_key() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2242b2061ee2..9ad2e3e90d5b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -706,6 +706,15 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? __bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + if (iter_has_trans_triggers(i->iter) && !i->trans_triggers_run) { i->trans_triggers_run = true; @@ -723,15 +732,6 @@ int __bch2_trans_commit(struct btree_trans *trans) } while (trans_trigger_run); trans_for_each_update(trans, i) { - /* we know trans->nounlock won't be set here: */ - if (unlikely(!(i->iter->locks_want < 1 - ? 
__bch2_btree_iter_upgrade(i->iter, 1) - : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; -- cgit From a7b96ab020a76982c47616dccc2b3efaa5b755fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2020 19:04:47 -0500 Subject: bcachefs: Don't call trans_iter_put() on error pointer Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4b4aeaf81d21..623b6c3eda95 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -331,7 +331,9 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - bch2_trans_iter_put(trans, iter); + + if (!IS_ERR(iter)) + bch2_trans_iter_put(trans, iter); return ret; } -- cgit From f2e8c69fcb63d280d1013b84973889e3aecd6603 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Jan 2020 18:20:23 -0500 Subject: bcachefs: Don't lose needs_whiteout in overwrite path Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9ad2e3e90d5b..3dc6b35b2e6a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -99,13 +99,14 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, } + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; goto overwrite; } - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; k->type = KEY_TYPE_deleted; /* * XXX: we should be able to do this without two calls to -- cgit From 9626aeb167144db2ba235bde5f9f1863c3ef354b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jan 2020 22:25:09 -0500 Subject: bcachefs: Rework iter->pos handling - Rework some of the helper comparison functions for consistency - Currently trying to refactor all the logic that's different for extents in the btree iterator code. The main difference is that for non extents we search for a key greater than or equal to the search key, while for extents we search for a key strictly greater than the search key (iter->pos). So that logic is now handled by btree_iter_search_key(), which computes the real search key based on iter->pos and whether or not we're searching for a key >= or > iter->pos. 
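For reference, the helper referred to above is added in the diff below as (comment added here for clarity):

    static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
    {
        struct bpos pos = iter->pos;

        /* for extents, search for the first key strictly after iter->pos */
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
            bkey_cmp(pos, POS_MAX))
            pos = bkey_successor(pos);
        return pos;
    }

i.e. the >= vs > distinction is folded into the search position itself rather than being special-cased at every comparison site.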
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 10 +-- fs/bcachefs/bset.h | 30 ++++---- fs/bcachefs/btree_iter.c | 136 ++++++++++++++---------------------- fs/bcachefs/btree_update_interior.c | 2 +- 4 files changed, 72 insertions(+), 106 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index a0bd6af67190..cff664ab75fa 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1380,21 +1380,21 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, { if (lossy_packed_search) while (m != btree_bkey_last(b, t) && - bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, - m) > 0) + bkey_iter_cmp_p_or_unp(b, m, + lossy_packed_search, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (!packed_search) while (m != btree_bkey_last(b, t) && - bkey_iter_pos_cmp(b, search, m) > 0) + bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && - bkey_iter_cmp_p_or_unp(b, search, packed_search, - prev) <= 0); + bkey_iter_cmp_p_or_unp(b, prev, + packed_search, search) >= 0); } return m; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index b93c4f287480..5c3c5fbea4b7 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -360,7 +360,7 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); static inline int bkey_cmp_p_or_unp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r_packed, - struct bpos *r) + const struct bpos *r) { EBUG_ON(r_packed && !bkey_packed(r_packed)); @@ -464,7 +464,7 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) * XXX: only need to compare pointers for keys that are both within a * btree_node_iterator - we need to break ties for prev() to work correctly */ -static inline int bkey_iter_cmp(struct btree *b, +static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { @@ -473,7 +473,7 @@ static inline int bkey_iter_cmp(struct btree *b, ?: cmp_int(l, r); } -static inline int btree_node_iter_cmp(struct btree *b, +static inline int btree_node_iter_cmp(const struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { @@ -482,22 +482,22 @@ static inline int btree_node_iter_cmp(struct btree *b, __btree_node_offset_to_key(b, r.k)); } -/* These assume l (the search key) is not a deleted key: */ -static inline int bkey_iter_pos_cmp(struct btree *b, - struct bpos *l, - const struct bkey_packed *r) +/* These assume r (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) { - return -bkey_cmp_left_packed(b, r, l) - ?: (int) bkey_deleted(r); + return bkey_cmp_left_packed(b, l, r) + ?: -((int) bkey_deleted(l)); } -static inline int bkey_iter_cmp_p_or_unp(struct btree *b, - struct bpos *l, - const struct bkey_packed *l_packed, - const struct bkey_packed *r) +static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) { - return -bkey_cmp_p_or_unp(b, r, l_packed, l) - ?: (int) bkey_deleted(r); + return bkey_cmp_p_or_unp(b, l, r_packed, r) + ?: -((int) bkey_deleted(l)); } static inline struct bkey_packed * diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f37109150e42..d1e83cfba47f 100644 --- 
a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -11,10 +11,6 @@ #include -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, - struct btree_iter_level *, - struct bkey *); - #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) #define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) #define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) @@ -29,37 +25,14 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l) (unsigned long) iter->l[l].b >= 128; } -/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ -static inline int __btree_iter_pos_cmp(struct btree_iter *iter, - const struct btree *b, - const struct bkey_packed *k, - bool interior_node) +static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { - int cmp = bkey_cmp_left_packed(b, k, &iter->pos); - - if (cmp) - return cmp; - if (bkey_deleted(k)) - return -1; - - /* - * Normally, for extents we want the first key strictly greater than - * the iterator position - with the exception that for interior nodes, - * we don't want to advance past the last key if the iterator position - * is POS_MAX: - */ - if (iter->flags & BTREE_ITER_IS_EXTENTS && - (!interior_node || - bkey_cmp_left_packed_byval(b, k, POS_MAX))) - return -1; - return 1; -} + struct bpos pos = iter->pos; -static inline int btree_iter_pos_cmp(struct btree_iter *iter, - const struct btree *b, - const struct bkey_packed *k) -{ - return __btree_iter_pos_cmp(iter, b, k, b->c.level != 0); + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(pos, POS_MAX)) + pos = bkey_successor(pos); + return pos; } /* Btree node locking: */ @@ -415,6 +388,7 @@ void bch2_trans_unlock(struct btree_trans *trans) static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { + struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[b->c.level]; struct btree_node_iter tmp = l->iter; struct bkey_packed *k; @@ -437,17 +411,17 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && btree_iter_pos_cmp(iter, b, k) > 0) { + if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(&PBUF(buf), &uk); - panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", + panic("iterator should be before prev key:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && btree_iter_pos_cmp(iter, b, k) < 0) { + if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -495,15 +469,19 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, } static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, - struct btree *b, - struct bkey_packed *where) + struct btree *b, + struct bkey_packed *where) { - struct btree_node_iter *node_iter = &iter->l[0].iter; + struct btree_iter_level *l = &iter->l[b->c.level]; + struct bpos pos = btree_iter_search_key(iter); - if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { - bkey_disassemble(b, where, &iter->k); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - } + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + + if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, @@ -535,6 +513,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bool iter_current_key_modified = orig_iter_pos >= offset && orig_iter_pos <= offset + clobber_u64s; + struct bpos iter_pos = btree_iter_search_key(iter); btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -542,7 +521,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -557,7 +536,7 @@ found: return; if (new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -702,11 +681,12 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) { + struct bpos pos = btree_iter_search_key(iter); struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - btree_iter_pos_cmp(iter, l->b, k) < 0) { + bkey_iter_pos_cmp(l->b, k, &pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -765,13 +745,7 @@ static inline bool btree_iter_pos_before_node(struct btree_iter *iter, static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - int cmp = bkey_cmp(b->key.k.p, iter->pos); - - if (!cmp && - (iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(b->key.k.p, POS_MAX)) - cmp = -1; - return cmp < 0; + return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -785,16 +759,10 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { + struct bpos pos = btree_iter_search_key(iter); 
struct btree_iter_level *l = &iter->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - btree_iter_advance_to_pos(iter, l, -1); - - /* Skip to first non whiteout: */ - if (level) - bch2_btree_node_iter_peek(&l->iter, l->b); + bch2_btree_node_iter_init(&l->iter, l->b, &pos); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } @@ -1564,9 +1532,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) int ret; recheck: - while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_cmp(k.k->p, iter->pos) <= 0) - bch2_btree_node_iter_advance(&l->iter, l->b); + btree_iter_advance_to_pos(iter, l, -1); /* * iterator is now at the correct position for inserting at iter->pos, @@ -1575,9 +1541,27 @@ recheck: */ node_iter = l->iter; - if (k.k && bkey_whiteout(k.k)) - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* + * If there wasn't actually a hole, want the iterator to be + * pointed at the key we found: + * + * XXX: actually, we shouldn't be changing the iterator here: + * the iterator needs to be correct for inserting at iter->pos, + * and there may be whiteouts between iter->pos and what this + * iterator points at: + */ + l->iter = node_iter; + + EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); + iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); + return k; + } /* * If we got to the end of the node, check if we need to traverse to the @@ -1592,24 +1576,6 @@ recheck: goto recheck; } - if (k.k && - !bkey_whiteout(k.k) && - bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - /* - * if we skipped forward to find the first non whiteout and - * there _wasn't_ actually a hole, we want the iterator to be - * pointed at the key we found: - */ - l->iter = node_iter; - - EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); - EBUG_ON(bkey_deleted(k.k)); - iter->uptodate = BTREE_ITER_UPTODATE; - - __bch2_btree_iter_verify(iter, l->b); - return k; - } - /* hole */ /* holes can't span inode numbers: */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c8fbee82cc56..cb7566bbc1fc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1191,7 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BTREE_TRIGGER_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); /* -- cgit From ae54c4539b3cf8d2c7f96b3731b4d0c171416087 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jan 2020 23:43:04 -0500 Subject: bcachefs: Refactor bch2_btree_bset_insert_key() The main thing going on is to separate out the different cases deletion, overwriting, and inserting a new key. 
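After the refactoring the function reads roughly as follows (a sketch only, condensed from the diff below; the bodies of the three branches are elided):

    k = bch2_btree_node_iter_peek_all(node_iter, b);
    if (k && bkey_cmp_packed(b, k, &insert->k))
        k = NULL;
    /* @k is now the key being overwritten/deleted, if any */

    if (bkey_whiteout(&insert->k)) {
        /* deleting: if @k wasn't found, nothing to do */
    } else if (k) {
        /* overwriting: reuse @k in place if possible, otherwise mark it
         * deleted (carrying needs_whiteout) and fall through to insert */
    } else {
        /* inserting a key that wasn't there before */
    }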
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 79 ++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3dc6b35b2e6a..0c6c2b5b15b3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -64,64 +64,63 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bkey_cmp(insert->k.p, b->data->max_key) > 0); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && !bkey_cmp_packed(b, k, &insert->k)) { - BUG_ON(bkey_whiteout(k)); + if (k && bkey_cmp_packed(b, k, &insert->k)) + k = NULL; - if (!bkey_written(b, k) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && - !bkey_whiteout(&insert->k)) { - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; - } + /* @k is the key being overwritten/deleted, if any: */ - btree_account_key_drop(b, k); + EBUG_ON(k && bkey_whiteout(k)); - if (bkey_whiteout(&insert->k)) { - unsigned clobber_u64s = k->u64s, new_u64s = k->u64s; + if (bkey_whiteout(&insert->k)) { + /* Deleting: */ - k->type = KEY_TYPE_deleted; + /* Not found? Nothing to do: */ + if (!k) + return false; - if (k->needs_whiteout) { - push_whiteout(iter->trans->c, b, k); - k->needs_whiteout = false; - } + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; - if (k >= btree_bset_last(b)->start) { - bch2_bset_delete(b, k, clobber_u64s); - new_u64s = 0; - } + if (k->needs_whiteout) { + push_whiteout(iter->trans->c, b, k); + k->needs_whiteout = false; + } + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + + bch2_bset_delete(b, k, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, new_u64s); - return true; + clobber_u64s, 0); + } else { + bch2_btree_iter_fix_key_modified(iter, b, k); + } + return true; + } + + if (k) { + /* Overwriting: */ + if (!bkey_written(b, k) && + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { + k->type = insert->k.type; + memcpy_u64s(bkeyp_val(f, k), &insert->v, + bkey_val_u64s(&insert->k)); + return true; } + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + insert->k.needs_whiteout = k->needs_whiteout; k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; goto overwrite; + } else { + bch2_btree_iter_fix_key_modified(iter, b, k); } - - k->type = KEY_TYPE_deleted; - /* - * XXX: we should be able to do this without two calls to - * bch2_btree_node_iter_fix: - */ - bch2_btree_node_iter_fix(iter, b, node_iter, k, - k->u64s, k->u64s); - } else { - /* - * Deleting, but the key to delete wasn't found - nothing to do: - */ - if (bkey_whiteout(&insert->k)) - return false; - - insert->k.needs_whiteout = false; } k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -- cgit From d98a5e39452e08d4d51c2240b0711ea86a62de6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Jan 2020 20:43:58 -0500 Subject: bcachefs: Add some comments for btree iterator flags Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3951933db5d6..85d4a6d2f7e9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -187,9 +187,25 @@ enum btree_iter_type { #define BTREE_ITER_TYPE ((1 << 2) - 1) +/* + * 
Iterate over all possible positions, synthesizing deleted keys for holes: + */ #define BTREE_ITER_SLOTS (1 << 2) +/* + * Indicates that intent locks should be taken on leaf nodes, because we expect + * to be doing updates: + */ #define BTREE_ITER_INTENT (1 << 3) +/* + * Causes the btree iterator code to prefetch additional btree nodes from disk: + */ #define BTREE_ITER_PREFETCH (1 << 4) +/* + * Indicates that this iterator should not be reused until transaction commit, + * either because a pending update references it or because the update depends + * on that particular key being locked (e.g. by the str_hash code, for hash + * table consistency) + */ #define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for -- cgit From 5525f632dc123ed32f17c649a54d07794a873822 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2020 15:11:22 -0500 Subject: bcachefs: Change btree split threshold to be in u64s This fixes a bug with very small btree nodes where splitting would end up with one of the new nodes empty. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4598a4c57aa1..adacb0a06703 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -75,7 +75,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) return c->opts.btree_node_size >> c->block_bits; } -#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cb7566bbc1fc..713f2d41e6c9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1385,7 +1385,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (keys) btree_split_insert_keys(as, n1, iter, keys); - if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { + if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); n2 = __btree_split_node(as, n1, iter); -- cgit From a965ef4986243bb0490d5af0ae202e81871554e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2020 22:53:49 -0500 Subject: bcachefs: Fix bch2_sort_keys() to not modify src keys Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 18 +++++++----------- fs/bcachefs/bset.h | 6 ------ fs/bcachefs/btree_iter.c | 6 ------ 3 files changed, 7 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 18f842012f05..1c8e5a80e32a 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -254,23 +254,18 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, sort_iter_sort(iter, sort_keys_cmp); while ((in = sort_iter_next(iter, sort_keys_cmp))) { + bool needs_whiteout = false; + if (bkey_whiteout(in) && (filter_whiteouts || !in->needs_whiteout)) continue; - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + while ((next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); - /* - * XXX racy, called with read lock 
from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; + needs_whiteout |= in->needs_whiteout; + in = sort_iter_next(iter, sort_keys_cmp); } if (bkey_whiteout(in)) { @@ -279,6 +274,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, } else { bkey_copy(out, in); } + out->needs_whiteout |= needs_whiteout; out = bkey_next(out); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5c3c5fbea4b7..50d0ce7d1afa 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -199,12 +199,6 @@ __bkey_unpack_key_format_checked(const struct btree *b, if (btree_keys_expensive_checks(b)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - /* - * hack around a harmless race when compacting whiteouts - * for a write: - */ - dst2.needs_whiteout = dst->needs_whiteout; - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); } } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d1e83cfba47f..5e220284b0b3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1339,12 +1339,6 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) if (debug_check_iterators(iter->trans->c)) { struct bkey k = bkey_unpack_key(l->b, _k); - /* - * this flag is internal to the btree code, - * we don't care if it doesn't match - if it's now set - * it just means the key has been written out to disk: - */ - k.needs_whiteout = iter->k.needs_whiteout; BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } -- cgit From 952070908f9f47acb9e9ebd642570dd75f672edc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Jan 2020 16:14:56 -0500 Subject: bcachefs: Don't modify existing key in place in sort_repack_merge() This fixes a nasty memory corruption with other threads that are still reading the btree node being compacted. 
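The fix, in outline (nearly verbatim from the diff below), is to unpack each key into a private stack copy before normalizing it, since only a read lock is held on the source node:

    bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
    bch2_bkey_unpack(src, k.k, k_packed);           /* copy of the whole key */

    if (filter_whiteouts &&
        bch2_bkey_normalize(c, bkey_i_to_s(k.k)))   /* may drop stale pointers */
        continue;

so bch2_bkey_normalize() only ever modifies the copy, never the key still visible to concurrent readers of the node.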
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 1c8e5a80e32a..7cbb57042af1 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -210,28 +210,38 @@ bch2_sort_repack_merge(struct bch_fs *c, bool filter_whiteouts) { struct bkey_packed *prev = NULL, *k_packed; - struct bkey_s k; + struct bkey_on_stack k; struct btree_nr_keys nr; - struct bkey unpacked; memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - k = __bkey_disassemble(src, k_packed, &unpacked); + /* + * NOTE: + * bch2_bkey_normalize may modify the key we pass it (dropping + * stale pointers) and we don't have a write lock on the src + * node; we have to make a copy of the entire key before calling + * normalize + */ + bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_unpack(src, k.k, k_packed); if (filter_whiteouts && - bch2_bkey_normalize(c, k)) + bch2_bkey_normalize(c, bkey_i_to_s(k.k))) continue; - extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); + extent_sort_append(c, out_f, &nr, vstruct_last(dst), + &prev, bkey_i_to_s(k.k)); } extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + bkey_on_stack_exit(&k, c); return nr; } -- cgit From 6876d2ab784645291089dd6fb325d6793cfd1c22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Jan 2020 16:20:53 -0500 Subject: bcachefs: Add a cond_resched() to rebalance loop Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 9eb3ac856eed..d17e3c0b7d12 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -183,6 +183,8 @@ static int bch2_rebalance_thread(void *arg) prev_cputime = curr_cputime(); while (!kthread_wait_freezable(r->enabled)) { + cond_resched(); + start = jiffies; cputime = curr_cputime(); -- cgit From 65d9f536fa3e8ad302798194b85d18632ed329b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Jan 2020 18:30:05 -0500 Subject: bcachefs: Improve tracepoints slightly in commit path Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0c6c2b5b15b3..baf97d785774 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -706,10 +706,18 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - /* we know trans->nounlock won't be set here: */ - if (unlikely(!(i->iter->locks_want < 1 - ? 
__bch2_btree_iter_upgrade(i->iter, 1) - : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) { + trace_trans_restart_traverse(trans->ip); + ret = -EINTR; + goto out; + } + + /* + * We're not using bch2_btree_iter_upgrade here because + * we know trans->nounlock can't be set: + */ + if (unlikely(i->iter->locks_want < 1 && + !__bch2_btree_iter_upgrade(i->iter, 1))) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto out; -- cgit From 182084e3dc5f55de12f0184ddd6243f64b6cd87b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jan 2020 19:42:38 -0500 Subject: bcachefs: Refactor rebalance_pred function Before, the logic for if we should move an extent was duplicated somewhat, in both rebalance_add_key() and rebalance_pred(); this centralizes that in __rebalance_pred() This is prep work for a patch that enables marking data as incompressible. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 93 +++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d17e3c0b7d12..51defd636c72 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -17,50 +17,51 @@ #include #include -static inline bool rebalance_ptr_pred(struct bch_fs *c, - struct extent_ptr_decoded p, - struct bch_io_opts *io_opts) +/* + * Check if an extent should be moved: + * returns -1 if it should not be moved, or + * device of pointer that should be moved, if known, or INT_MAX if unknown + */ +static int __bch2_rebalance_pred(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) { - if (io_opts->background_target && - !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && - !p.ptr.cached) - return true; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + if (io_opts->background_compression) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != + bch2_compression_opt_to_type[io_opts->background_compression]) + return p.ptr.dev; - if (io_opts->background_compression && - p.crc.compression_type != - bch2_compression_opt_to_type[io_opts->background_compression]) - return true; + if (io_opts->background_target) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) + return p.ptr.dev; - return false; + return -1; } void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + atomic64_t *counter; + int dev; - if (!io_opts->background_target && - !io_opts->background_compression) + dev = __bch2_rebalance_pred(c, k, io_opts); + if (dev < 0) return; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (rebalance_ptr_pred(c, p, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + counter = dev < INT_MAX + ? 
&bch_dev_bkey_exists(c, dev)->rebalance_work + : &c->rebalance.work_unknown_dev; - if (atomic64_add_return(p.crc.compressed_size, - &ca->rebalance_work) == - p.crc.compressed_size) - rebalance_wakeup(c); - } -} - -void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -{ - if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == - sectors) + if (atomic64_add_return(k.k->size, counter) == k.k->size) rebalance_wakeup(c); } @@ -69,26 +70,20 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned nr_replicas = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - nr_replicas += !p.ptr.cached; - - if (rebalance_ptr_pred(c, p, io_opts)) - goto found; + if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } else { + return DATA_SKIP; } +} - if (nr_replicas < io_opts->data_replicas) - goto found; - - return DATA_SKIP; -found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == + sectors) + rebalance_wakeup(c); } struct rebalance_work { -- cgit From ab05de4ce4a1b806773e59b97a59bcfabba57d8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Feb 2018 16:26:10 -0500 Subject: bcachefs: Track incompressible data This fixes the background_compression option: without some way of marking data as incompressible, rebalance will keep rewriting incompressible data over and over.
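The mechanism is a new incompressible compression type, recorded when compression is attempted and does not help, plus a helper added in the diff below that treats both "none" and "incompressible" as uncompressed:

    static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
    {
        return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
                crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
    }

so the background_compression checks can tell "never compressed" apart from "already tried, doesn't compress" and stop rewriting the latter.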
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 ++++++++------ fs/bcachefs/checksum.c | 3 ++- fs/bcachefs/checksum.h | 7 +++++-- fs/bcachefs/compress.c | 2 +- fs/bcachefs/extents.c | 26 ++++++++++++++++++-------- fs/bcachefs/extents.h | 7 +++++++ fs/bcachefs/io.c | 42 +++++++++++++++++++++++++----------------- fs/bcachefs/io_types.h | 1 + fs/bcachefs/move.c | 19 ++++++++++++------- fs/bcachefs/rebalance.c | 3 ++- fs/bcachefs/sysfs.c | 2 +- 11 files changed, 82 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0a623ed3caa6..dbc9c15514bd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1298,7 +1298,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(reflink, 6) \ x(new_siphash, 7) \ x(inline_data, 8) \ - x(new_extent_overwrite, 9) + x(new_extent_overwrite, 9) \ + x(incompressible, 10) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1378,11 +1379,12 @@ enum bch_csum_opts { }; #define BCH_COMPRESSION_TYPES() \ - x(none, 0) \ - x(lz4_old, 1) \ - x(gzip, 2) \ - x(lz4, 3) \ - x(zstd, 4) + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) \ + x(incompressible, 5) enum bch_compression_type { #define x(t, n) BCH_COMPRESSION_TYPE_##t, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 2e1dfdc68e15..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -326,7 +326,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, BUG_ON(len_a + len_b > bio_sectors(bio)); BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); - BUG_ON(crc_old.compression_type); + BUG_ON(crc_is_compressed(crc_old)); BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != bch2_csum_type_is_encryption(new_csum_type)); @@ -355,6 +355,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, if (i->crc) *i->crc = (struct bch_extent_crc_unpacked) { .csum_type = i->csum_type, + .compression_type = crc_old.compression_type, .compressed_size = i->len, .uncompressed_size = i->len, .offset = 0, diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index ca9e45906dc8..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -155,13 +155,16 @@ static inline struct nonce null_nonce(void) static inline struct nonce extent_nonce(struct bversion version, struct bch_extent_crc_unpacked crc) { - unsigned size = crc.compression_type ? crc.uncompressed_size : 0; + unsigned compression_type = crc_is_compressed(crc) + ? crc.compression_type + : 0; + unsigned size = compression_type ? 
crc.uncompressed_size : 0; struct nonce nonce = (struct nonce) {{ [0] = cpu_to_le32(size << 22), [1] = cpu_to_le32(version.lo), [2] = cpu_to_le32(version.lo >> 32), [3] = cpu_to_le32(version.hi| - (crc.compression_type << 24))^BCH_NONCE_EXTENT, + (compression_type << 24))^BCH_NONCE_EXTENT, }}; return nonce_add(nonce, crc.nonce << 9); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 091958d1ea04..117afac3db1a 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -434,7 +434,7 @@ out: bio_unmap_or_unbounce(c, dst_data); return compression_type; err: - compression_type = 0; + compression_type = BCH_COMPRESSION_TYPE_incompressible; goto out; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ce94e38c0277..f97fa9341c9f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -336,7 +336,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, if (!bch2_checksum_mergeable(crc_l.csum_type)) return BCH_MERGE_NOMERGE; - if (crc_l.compression_type) + if (crc_is_compressed(crc_l)) return BCH_MERGE_NOMERGE; if (crc_l.csum_type && @@ -447,7 +447,7 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { - return !u.compression_type && + return !crc_is_compressed(u) && u.csum_type && u.uncompressed_size > u.live_size && bch2_csum_type_is_encryption(u.csum_type) == @@ -491,7 +491,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { bkey_for_each_crc(&k->k, ptrs, u, i) - if (!u.compression_type && + if (!crc_is_compressed(u) && u.csum_type && u.live_size == u.uncompressed_size) { n = u; @@ -500,7 +500,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) return false; } found: - BUG_ON(n.compression_type); + BUG_ON(crc_is_compressed(n)); BUG_ON(n.offset); BUG_ON(n.live_size != k->k.size); @@ -609,8 +609,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_TYPE_none; + ret += !p.ptr.cached && !crc_is_compressed(p.crc); } return ret; @@ -624,13 +623,24 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) unsigned ret = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_TYPE_none) + if (!p.ptr.cached && crc_is_compressed(p.crc)) ret += p.crc.compressed_size; return ret; } +bool bch2_bkey_is_incompressible(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + return true; + return false; +} + bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 7c5a41e6d79d..0d8554172263 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -175,6 +175,12 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #undef common_fields } +static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) +{ + return (crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); 
+} + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { @@ -483,6 +489,7 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a419024ce039..679ad54d4c4e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -562,9 +562,14 @@ static void __bch2_write_index(struct bch_write_op *op) * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) + for_each_keylist_key(keys, k) { bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) + bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); + + } + if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); @@ -786,8 +791,9 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? */ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - op->crc.compression_type == op->compression_type) { - if (!op->crc.compression_type && + (op->crc.compression_type == op->compression_type || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && bch2_write_rechecksum(c, op, op->csum_type)) return PREP_ENCODED_CHECKSUM_ERR; @@ -799,7 +805,7 @@ static enum prep_encoded_ret { * If the data is compressed and we couldn't write the entire extent as * is, we have to decompress it: */ - if (op->crc.compression_type) { + if (crc_is_compressed(op->crc)) { struct bch_csum csum; if (bch2_write_decrypt(op)) @@ -910,11 +916,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_type && !bounce); - crc.compression_type = op->compression_type - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_type + ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) : 0; - if (!crc.compression_type) { + if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); @@ -943,7 +951,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc.compression_type && + !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { /* @@ -1340,6 +1348,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, + struct bkey_s_c k, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1396,8 +1405,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, - btree_id, - bkey_s_c_null); + btree_id, k); BUG_ON(ret); return op; @@ -1439,7 +1447,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_REFLINK : BTREE_ID_EXTENTS, - pos, pick, opts, sectors, rbio); + k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1703,7 +1711,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; - if (rbio->pick.crc.compression_type) + if (crc_is_compressed(rbio->pick.crc)) return; bkey_on_stack_init(&new); @@ -1788,7 +1796,7 @@ static void __bch2_read_endio(struct work_struct *work) crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { + if (crc_is_compressed(crc)) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1885,7 +1893,7 @@ static void bch2_read_endio(struct bio *bio) } if (rbio->narrow_crcs || - rbio->pick.crc.compression_type || + crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) @@ -1996,7 +2004,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || + if (crc_is_compressed(pick.crc) || (pick.crc.csum_type != BCH_CSUM_NONE && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && @@ -2011,7 +2019,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick.crc.compression_type); + EBUG_ON(crc_is_compressed(pick.crc)); EBUG_ON(pick.crc.csum_type && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 85dfcb0fdf51..a9a336c04269 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -107,6 +107,7 @@ struct bch_write_op { unsigned nr_replicas:4; unsigned nr_replicas_required:4; unsigned alloc_reserve:4; + unsigned incompressible:1; struct bch_devs_list devs_have; u16 target; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7ed90b0576c0..dbcda8374692 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -214,6 +214,9 @@ int 
bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, enum btree_id btree_id, struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; int ret; m->btree_id = btree_id; @@ -222,9 +225,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->nr_ptrs_reserved = 0; bch2_write_op_init(&m->op, c, io_opts); - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; + + if (!bch2_bkey_is_incompressible(k)) + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + else + m->op.incompressible = true; + m->op.target = data_opts.target, m->op.write_point = wp; @@ -264,14 +272,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc_is_compressed(p.crc) && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) compressed_sectors += p.crc.compressed_size; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 51defd636c72..a0a75cfa41cb 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -30,7 +30,8 @@ static int __bch2_rebalance_pred(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; - if (io_opts->background_compression) + if (io_opts->background_compression && + !bch2_bkey_is_incompressible(k)) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && p.crc.compression_type != diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 13b48a0fc87d..662c84b91323 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) struct extent_ptr_decoded p; extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { + if (!crc_is_compressed(p.crc)) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { -- cgit From 4dba7c301782de9cba75b7895a5068c0bf07a551 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Jan 2020 17:47:07 -0500 Subject: bcachefs: Fix an iterator leak This should fix a transaction iterator overflow bug during fsck. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 35f4232d0755..cf6ecd963a7b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -319,13 +319,16 @@ int bch2_hash_delete(struct btree_trans *trans, u64 inode, const void *key) { struct btree_iter *iter; + int ret; iter = bch2_hash_lookup(trans, desc, info, inode, key, BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - return bch2_hash_delete_at(trans, desc, info, iter); + ret = bch2_hash_delete_at(trans, desc, info, iter); + bch2_trans_iter_put(trans, iter); + return ret; } #endif /* _BCACHEFS_STR_HASH_H */ -- cgit From 7d4aed1ea408f45aef8ddf5f28d8c10680b4f214 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Jan 2020 13:05:04 -0500 Subject: bcachefs: Fix an uninitialized field in bch_write_op Regression from "bcachefs: Track incompressible data" Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.h | 1 + fs/bcachefs/io_types.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index fa5841a86fcb..bc9f9fec2fd7 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -82,6 +82,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->alloc_reserve = RESERVE_NONE; + op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; op->target = 0; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index a9a336c04269..692af6dd6031 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -106,7 +106,7 @@ struct bch_write_op { unsigned compression_type:4; unsigned nr_replicas:4; unsigned nr_replicas_required:4; - unsigned alloc_reserve:4; + unsigned alloc_reserve:3; unsigned incompressible:1; struct bch_devs_list devs_have; -- cgit From fdf224003354fb14e20f638e479273c4728dfc0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Jan 2020 20:26:08 -0500 Subject: bcachefs: Improve an insert path optimization The insert path had an optimization to short circuit lookup table/iterator fixups when overwriting an existing key with the same size value - but it was incorrect when other key fields (size/version) were changing. This is important for the upcoming rework to have extent updates use the same insert path as regular keys. 
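For reference, a sketch of the short circuit being removed (adapted from the btree_update_leaf.c hunk below): it copied only the key type and the value, so when insert->k.size or insert->k.version differed from the key already in the node, those fields were silently left stale:

	if (!bkey_written(b, k) &&
	    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
		/* same-size value: overwrite in place and skip the lookup
		 * table/iterator fixups - but nothing here updates the
		 * existing key's size or version fields */
		k->type = insert->k.type;
		memcpy_u64s(bkeyp_val(f, k), &insert->v,
			    bkey_val_u64s(&insert->k));
		return true;
	}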
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 3 ++- fs/bcachefs/btree_update_leaf.c | 38 +++++++++++++------------------------- 2 files changed, 15 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index cff664ab75fa..b8fe361978ef 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1189,7 +1189,8 @@ void bch2_bset_insert(struct btree *b, memcpy_u64s(bkeyp_val(f, where), &insert->v, bkeyp_val_u64s(f, src)); - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + if (src->u64s != clobber_u64s) + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index baf97d785774..9cf7075dd265 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -53,9 +53,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, struct btree_node_iter *node_iter, struct bkey_i *insert) { - const struct bkey_format *f = &b->format; struct bkey_packed *k; - unsigned clobber_u64s; + unsigned clobber_u64s = 0, new_u64s = 0; EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); @@ -68,30 +67,25 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k = NULL; /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_whiteout(k)); + /* Deleting, but not found? nothing to do: */ + if (bkey_whiteout(&insert->k) && !k) + return false; + if (bkey_whiteout(&insert->k)) { /* Deleting: */ - - /* Not found? Nothing to do: */ - if (!k) - return false; - btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; - if (k->needs_whiteout) { + if (k->needs_whiteout) push_whiteout(iter->trans->c, b, k); - k->needs_whiteout = false; - } + k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, 0); + goto fix_iter; } else { bch2_btree_iter_fix_key_modified(iter, b, k); } @@ -101,14 +95,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (k) { /* Overwriting: */ - if (!bkey_written(b, k) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; - } - btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; @@ -124,11 +110,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, } k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); - clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, new_u64s); return true; } -- cgit From 8b53852d0a80ec4e438205bf8eb3a5a73ee8238c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 14:27:10 -0500 Subject: bcachefs: Make sure we're releasing btree iterators This wasn't originally required, but this is the model we're moving towards. 
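The pattern the conversions below follow, as a minimal sketch (illustrative only; it relies on bch2_trans_iter_put() now accepting NULL and error pointers, which the btree_iter.c hunk adds, and the bch2_inode_peek() arguments are stand-ins): iterators start out NULL and a single error label releases whatever was obtained:

	struct btree_iter *iter = NULL;
	int ret;

	iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
	ret = PTR_ERR_OR_ZERO(iter);
	if (ret)
		goto err;
	/* ... use iter ... */
err:
	bch2_trans_iter_put(trans, iter);	/* safe on NULL/ERR_PTR now */
	return ret;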
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++-- fs/bcachefs/fs-common.c | 126 ++++++++++++++++++++++++++++++----------------- fs/bcachefs/inode.c | 60 +++++++++------------- fs/bcachefs/reflink.c | 5 +- fs/bcachefs/str_hash.h | 6 +-- 5 files changed, 120 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5e220284b0b3..cc5d6389899c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1729,7 +1729,12 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, int bch2_trans_iter_put(struct btree_trans *trans, struct btree_iter *iter) { - int ret = btree_iter_err(iter); + int ret; + + if (IS_ERR_OR_NULL(iter)) + return 0; + + ret = btree_iter_err(iter); if (!(trans->iters_touched & (1ULL << iter->idx)) && !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) @@ -1742,6 +1747,9 @@ int bch2_trans_iter_put(struct btree_trans *trans, int bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { + if (IS_ERR_OR_NULL(iter)) + return 0; + trans->iters_touched &= ~(1ULL << iter->idx); return bch2_trans_iter_put(trans, iter); @@ -1981,8 +1989,8 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, trans->iters_live |= 1ULL << iter->idx; /* - * Don't mark it as touched, we don't need to preserve this iter since - * it's cheap to copy it again: + * We don't need to preserve this iter since it's cheap to copy it + * again - this will cause trans_iter_put() to free it right away: */ trans->iters_touched &= ~(1ULL << iter->idx); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 96f7bbe0a3ed..878419d40992 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -19,14 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, struct posix_acl *acl) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter; + struct btree_iter *dir_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); u64 now = bch2_current_time(trans->c); int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); @@ -37,20 +38,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (ret) - return ret; + goto err; if (default_acl) { ret = bch2_set_acl_trans(trans, new_inode, &hash, default_acl, ACL_TYPE_DEFAULT); if (ret) - return ret; + goto err; } if (acl) { ret = bch2_set_acl_trans(trans, new_inode, &hash, acl, ACL_TYPE_ACCESS); if (ret) - return ret; + goto err; } if (name) { @@ -62,48 +63,55 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ret = bch2_inode_write(trans, dir_iter, dir_u); if (ret) - return ret; + goto err; ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(new_inode->bi_mode), name, new_inode->bi_inum, BCH_HASH_SET_MUST_CREATE); if (ret) - return ret; + goto err; } - - return 0; +err: + bch2_trans_iter_put(trans, dir_iter); + return ret; } int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { - struct btree_iter *dir_iter, *inode_iter; + struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); + int ret; 
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; dir_u->bi_mtime = dir_u->bi_ctime = now; dir_hash = bch2_hash_info_init(trans->c, dir_u); - bch2_trans_iter_put(trans, dir_iter); - return bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); +err: + bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_put(trans, inode_iter); + return ret; } int bch2_unlink_trans(struct btree_trans *trans, @@ -111,39 +119,49 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, const struct qstr *name) { - struct btree_iter *dir_iter, *dirent_iter, *inode_iter; + struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, + *inode_iter = NULL; struct bch_hash_info dir_hash; u64 inum, now = bch2_current_time(trans->c); struct bkey_s_c k; + int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; dir_hash = bch2_hash_info_init(trans->c, dir_u); dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, name, BTREE_ITER_INTENT); - if (IS_ERR(dirent_iter)) - return PTR_ERR(dirent_iter); + ret = PTR_ERR_OR_ZERO(dirent_iter); + if (ret) + goto err; k = bch2_btree_iter_peek_slot(dirent_iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); bch2_inode_nlink_dec(inode_u); - return (S_ISDIR(inode_u->bi_mode) + ret = (S_ISDIR(inode_u->bi_mode) ? 
bch2_empty_dir_trans(trans, inum) : 0) ?: bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); +err: + bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_put(trans, dirent_iter); + bch2_trans_iter_put(trans, dir_iter); + return ret; } bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, @@ -179,24 +197,26 @@ int bch2_rename_trans(struct btree_trans *trans, const struct qstr *dst_name, enum bch_rename_mode mode) { - struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; - struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; + struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; struct bch_hash_info src_hash, dst_hash; u64 src_inode, dst_inode, now = bch2_current_time(trans->c); int ret; src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, BTREE_ITER_INTENT); - if (IS_ERR(src_dir_iter)) - return PTR_ERR(src_dir_iter); + ret = PTR_ERR_OR_ZERO(src_dir_iter); + if (ret) + goto err; src_hash = bch2_hash_info_init(trans->c, src_dir_u); if (dst_dir != src_dir) { dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, BTREE_ITER_INTENT); - if (IS_ERR(dst_dir_iter)) - return PTR_ERR(dst_dir_iter); + ret = PTR_ERR_OR_ZERO(dst_dir_iter); + if (ret) + goto err; dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); } else { @@ -211,38 +231,48 @@ int bch2_rename_trans(struct btree_trans *trans, dst_name, &dst_inode, mode); if (ret) - return ret; + goto err; src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, BTREE_ITER_INTENT); - if (IS_ERR(src_inode_iter)) - return PTR_ERR(src_inode_iter); + ret = PTR_ERR_OR_ZERO(src_inode_iter); + if (ret) + goto err; if (dst_inode) { dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, BTREE_ITER_INTENT); - if (IS_ERR(dst_inode_iter)) - return PTR_ERR(dst_inode_iter); + ret = PTR_ERR_OR_ZERO(dst_inode_iter); + if (ret) + goto err; } if (mode == BCH_RENAME_OVERWRITE) { if (S_ISDIR(src_inode_u->bi_mode) != - S_ISDIR(dst_inode_u->bi_mode)) - return -ENOTDIR; + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -ENOTDIR; + goto err; + } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) - return -ENOTEMPTY; + bch2_empty_dir_trans(trans, dst_inode)) { + ret = -ENOTEMPTY; + goto err; + } } if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) - return -EXDEV; + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } if (mode == BCH_RENAME_EXCHANGE && bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) - return -EXDEV; + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } if (S_ISDIR(src_inode_u->bi_mode)) { src_dir_u->bi_nlink--; @@ -270,7 +300,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_inode) dst_inode_u->bi_ctime = now; - return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: (src_dir != dst_dir ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) : 0 ) ?: @@ -278,4 +308,10 @@ int bch2_rename_trans(struct btree_trans *trans, (dst_inode ? 
bch2_inode_write(trans, dst_inode_iter, dst_inode_u) : 0 ); +err: + bch2_trans_iter_put(trans, dst_inode_iter); + bch2_trans_iter_put(trans, src_inode_iter); + bch2_trans_iter_put(trans, dst_dir_iter); + bch2_trans_iter_put(trans, src_dir_iter); + return ret; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index bd44ef3842cb..c40ff6fc7ae2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -362,16 +362,16 @@ int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, u64 min, u64 max, u64 *hint) { - struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; - struct btree_iter *iter; + struct btree_iter *iter = NULL; + struct bkey_s_c k; u64 start; int ret; if (!max) max = ULLONG_MAX; - if (c->opts.inodes_32bit) + if (trans->c->opts.inodes_32bit) max = min_t(u64, max, U32_MAX); start = READ_ONCE(*hint); @@ -382,48 +382,37 @@ int bch2_inode_create(struct btree_trans *trans, inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - - iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, POS(start, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); again: - while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - - ret = bkey_err(k); - if (ret) - return ret; + for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(start, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (iter->pos.inode > max) + break; - switch (k.k->type) { - case KEY_TYPE_inode: - /* slot used */ - if (iter->pos.inode >= max) - goto out; + if (k.k->type != KEY_TYPE_inode) + goto found_slot; + } - bch2_btree_iter_next_slot(iter); - break; + bch2_trans_iter_put(trans, iter); - default: - *hint = k.k->p.inode; - inode_u->bi_inum = k.k->p.inode; - inode_u->bi_generation = bkey_generation(k); + if (ret) + return ret; - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - return 0; - } - } -out: if (start != min) { /* Retry from start */ start = min; - bch2_btree_iter_set_pos(iter, POS(start, 0)); goto again; } return -ENOSPC; +found_slot: + *hint = k.k->p.inode; + inode_u->bi_inum = k.k->p.inode; + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_iter_put(trans, iter); + return 0; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) @@ -518,14 +507,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - return ret; + goto err; ret = k.k->type == KEY_TYPE_inode ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; - +err: bch2_trans_iter_put(trans, iter); - return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3b8c74ca3725..d78a3d5f7246 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -128,10 +128,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: - if (!IS_ERR(reflink_iter)) { + if (!IS_ERR(reflink_iter)) c->reflink_hint = reflink_iter->pos.offset; - bch2_trans_iter_put(trans, reflink_iter); - } + bch2_trans_iter_put(trans, reflink_iter); return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index cf6ecd963a7b..0710d0bbe36d 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -262,10 +262,8 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - if (!IS_ERR_OR_NULL(slot)) - bch2_trans_iter_put(trans, slot); - if (!IS_ERR_OR_NULL(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, slot); + bch2_trans_iter_put(trans, iter); return ret; found: -- cgit From 5c4a5cd5b379567f648c0f5d0f9ec7550bc8b67e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Dec 2019 20:51:35 -0500 Subject: bcachefs: btree_and_journal_iter Introduce a new iterator that iterates over keys in the btree with keys from the journal overlaid on top. This factors out what the erasure coding init code was doing manually. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 20 ++++--- fs/bcachefs/ec.c | 37 +++---------- fs/bcachefs/recovery.c | 116 +++++++++++++++++++++++++++++++++++------ fs/bcachefs/recovery.h | 26 +++++++-- 4 files changed, 139 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5bc8e7531403..9ce53164d9ac 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -211,33 +211,31 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_and_journal_iter iter; struct bkey_s_c k; struct bch_dev *ca; - struct journal_key *j; unsigned i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) + bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, + BTREE_ID_ALLOC, POS_MIN); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); + bch2_btree_and_journal_iter_advance(&iter); + } + ret = bch2_trans_exit(&trans) ?: ret; if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; } - for_each_journal_key(*journal_keys, j) - if (j->btree_id == BTREE_ID_ALLOC) - bch2_mark_key(c, bkey_i_to_s_c(j->k), - 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); - percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0e2acd4f5712..59985227385b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1280,9 +1280,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { struct btree_trans trans; - struct btree_iter *btree_iter; - struct journal_iter journal_iter; - struct bkey_s_c btree_k, 
journal_k; + struct btree_and_journal_iter iter; + struct bkey_s_c k; int ret; ret = bch2_fs_ec_start(c); @@ -1291,38 +1290,16 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); - btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); - journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); + bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, + BTREE_ID_EC, POS_MIN); - btree_k = bch2_btree_iter_peek(btree_iter); - journal_k = bch2_journal_iter_peek(&journal_iter); - while (1) { - bool btree; - - if (btree_k.k && journal_k.k) { - int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); - - if (!cmp) - btree_k = bch2_btree_iter_next(btree_iter); - btree = cmp < 0; - } else if (btree_k.k) { - btree = true; - } else if (journal_k.k) { - btree = false; - } else { - break; - } - - bch2_mark_key(c, btree ? btree_k : journal_k, - 0, 0, NULL, 0, + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); - if (btree) - btree_k = bch2_btree_iter_next(btree_iter); - else - journal_k = bch2_journal_iter_next(&journal_iter); + bch2_btree_and_journal_iter_advance(&iter); } ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8ecd4abc8eeb..29e6f9f00bad 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -27,26 +27,15 @@ /* iterate over keys read from the journal: */ -struct journal_iter bch2_journal_iter_init(struct journal_keys *keys, - enum btree_id id) -{ - return (struct journal_iter) { - .keys = keys, - .k = keys->d, - .btree_id = id, - }; -} - struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - while (1) { - if (iter->k == iter->keys->d + iter->keys->nr) - return bkey_s_c_null; - + while (iter->k) { if (iter->k->btree_id == iter->btree_id) return bkey_i_to_s_c(iter->k->k); iter->k++; + if (iter->k == iter->keys->d + iter->keys->nr) + iter->k = NULL; } return bkey_s_c_null; @@ -54,13 +43,110 @@ struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) { - if (iter->k == iter->keys->d + iter->keys->nr) + if (!iter->k) return bkey_s_c_null; iter->k++; + if (iter->k == iter->keys->d + iter->keys->nr) + iter->k = NULL; + return bch2_journal_iter_peek(iter); } +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + switch (iter->last) { + case none: + break; + case btree: + bch2_btree_iter_next(iter->btree); + break; + case journal: + bch2_journal_iter_next(&iter->journal); + break; + } + + iter->last = none; +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c ret; + + while (1) { + struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree); + struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal); + + if (btree_k.k && journal_k.k) { + int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + + if (!cmp) + bch2_btree_iter_next(iter->btree); + + iter->last = cmp < 0 ? btree : journal; + } else if (btree_k.k) { + iter->last = btree; + } else if (journal_k.k) { + iter->last = journal; + } else { + iter->last = none; + return bkey_s_c_null; + } + + ret = iter->last == journal ? 
journal_k : btree_k; + if (!bkey_deleted(ret.k)) + break; + + bch2_btree_and_journal_iter_advance(iter); + } + + return ret; +} + +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) +{ + bch2_btree_and_journal_iter_advance(iter); + + return bch2_btree_and_journal_iter_peek(iter); +} + +struct journal_key *journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + size_t l = 0, r = journal_keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if ((cmp_int(id, journal_keys->d[m].btree_id) ?: + bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < journal_keys->nr && + (cmp_int(id, journal_keys->d[l].btree_id) ?: + bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + + BUG_ON(l && + (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: + bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + + return l < journal_keys->nr ? journal_keys->d + l : NULL; +} + +void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, + struct btree_trans *trans, + struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + iter->journal.keys = journal_keys; + iter->journal.k = journal_key_search(journal_keys, id, pos); + iter->journal.btree_id = id; + + iter->btree = bch2_trans_get_iter(trans, id, pos, 0); +} + /* sort and dedup all keys in the journal: */ static void journal_entries_free(struct list_head *list) diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 479ea46f8dcb..ccd84a8fe60d 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -24,10 +24,28 @@ struct journal_iter { enum btree_id btree_id; }; -struct journal_iter bch2_journal_iter_init(struct journal_keys *, - enum btree_id); -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *); -struct bkey_s_c bch2_journal_iter_next(struct journal_iter *); +struct btree_and_journal_iter { + enum btree_id btree_id; + + struct btree_iter *btree; + struct journal_iter journal; + + enum last_key_returned { + none, + btree, + journal, + } last; +}; + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); +struct journal_key *journal_key_search(struct journal_keys *, + enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, + struct btree_trans *, + struct journal_keys *, + enum btree_id, struct bpos); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); -- cgit From 6a9ec8282647cde71bb34e2d78b3d3789206ffed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Jan 2020 13:23:18 -0500 Subject: bcachefs: __bch2_btree_iter_set_pos() This one takes an additional argument for whether we're searching for >= or > the search key. 
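Put differently (an editorial illustration of the new call based on the signature added below; iter and new_pos are stand-ins):

	/* next lookup returns keys >= new_pos: */
	__bch2_btree_iter_set_pos(iter, new_pos, false);

	/* next lookup returns keys strictly > new_pos - this is what
	 * bch2_trans_get_iter() below passes for extent btrees via
	 * btree_node_type_is_extents(): */
	__bch2_btree_iter_set_pos(iter, new_pos, true);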
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 26 +++++++++++++++++++++++++- fs/bcachefs/btree_iter.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cc5d6389899c..f7623a71b768 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1271,6 +1271,29 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) return l; } +void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, + bool strictly_greater) +{ + struct bpos old = btree_iter_search_key(iter); + unsigned l; + int cmp; + + iter->flags &= ~BTREE_ITER_IS_EXTENTS; + iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; + iter->pos = new_pos; + + cmp = bkey_cmp(btree_iter_search_key(iter), old); + if (!cmp) + return; + + l = btree_iter_pos_changed(iter, cmp); + + if (l != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +} + void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { int cmp = bkey_cmp(new_pos, iter->pos); @@ -1947,7 +1970,8 @@ struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, __btree_trans_get_iter(trans, btree_id, pos, flags); if (!IS_ERR(iter)) - bch2_btree_iter_set_pos(iter, pos); + __bch2_btree_iter_set_pos(iter, pos, + btree_node_type_is_extents(btree_id)); return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 86347bae9795..f9bf01e26aa1 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -166,6 +166,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); +void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); static inline struct bpos btree_type_successor(enum btree_id id, -- cgit From c4a94ae3da7c666d5d5230897a3f6d1e9d8d8b55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Jan 2020 13:26:05 -0500 Subject: bcachefs: Make BTREE_ITER_IS_EXTENTS private to iter code Prep work for changing the core btree update path to handle extents like regular keys; we need to reduce the scope of what BTREE_ITER_IS_EXTENTS means Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f7623a71b768..f002ddc18cbb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -408,7 +408,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS + k = b->c.level || btree_node_type_is_extents(iter->btree_id) ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { @@ -563,7 +563,7 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && (b->c.level || - (iter->flags & BTREE_ITER_IS_EXTENTS))) { + btree_node_type_is_extents(iter->btree_id))) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9cf7075dd265..11bd0558ebd5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -780,7 +780,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (btree_node_type_is_extents(iter->btree_id)) { iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; } @@ -898,7 +898,7 @@ retry: */ delete.k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (btree_node_type_is_extents(iter->btree_id)) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); -- cgit From 1f49dafcd3191de1db9b6105983696b5bc7aedad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Feb 2020 20:15:15 -0500 Subject: bcachefs: Fix bch2_ptr_swab for indirect extents bch2_ptr_swab was never updated when the code for generic keys with pointers was added - it assumed the entire val was only used for pointers. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 9 +++------ fs/bcachefs/bkey_methods.h | 4 ++-- fs/bcachefs/btree_io.c | 17 ++++++++++------- fs/bcachefs/ec.h | 1 + fs/bcachefs/extents.c | 16 +++++++++------- fs/bcachefs/extents.h | 2 +- fs/bcachefs/journal_io.c | 6 ++++-- fs/bcachefs/reflink.h | 1 + 8 files changed, 31 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 320e17d108d2..c064cf468a9b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -202,15 +202,12 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, bch2_val_to_text(out, c, k); } -void bch2_bkey_swab(const struct bkey_format *f, - struct bkey_packed *k) +void bch2_bkey_swab_val(struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; - - bch2_bkey_swab_key(f, k); + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; if (ops->swab) - ops->swab(f, k); + ops->swab(k); } bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 8568b65c1ed2..d36468b75223 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -29,7 +29,7 @@ struct bkey_ops { void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); - void (*swab)(const struct bkey_format *, struct bkey_packed *); + void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); enum merge_result (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s); @@ -51,7 +51,7 @@ void bch2_val_to_text(struct printbuf *, struct bch_fs *, void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); +void bch2_bkey_swab_val(struct bkey_s); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 
c5b5143ada05..83f61443c8bb 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -783,7 +783,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, for (k = i->start; k != vstruct_last(i);) { - struct bkey_s_c u; + struct bkey_s u; struct bkey tmp; const char *invalid; @@ -804,21 +804,24 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab(&b->format, k); + bch2_bkey_swab_key(&b->format, k); if (!write && version < bcachefs_metadata_version_bkey_renumber) bch2_bkey_renumber(btree_node_type(b), k, write); - u = bkey_disassemble(b, k, &tmp); + u = __bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, u) ?: - (write ? bch2_bkey_val_invalid(c, u) : NULL); + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + + invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, u.s_c) ?: + (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey:\n%s\n%s", invalid, buf); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8d9fbfd19f66..cf67abd48490 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,6 +12,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, #define bch2_bkey_ops_stripe (struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ } static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index f97fa9341c9f..10ca544317ba 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1078,17 +1078,19 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_ptr_swab(struct bkey_s k) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + u64 *d; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + for (entry = ptrs.start; + entry < ptrs.end; entry = extent_entry_next(entry)) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 0d8554172263..6e8119a8ad30 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -532,7 +532,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); +void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 244e3faa6b16..1866ed30ce89 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -170,8 +170,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, return 0; } - if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(NULL, bkey_to_packed(k)); + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(NULL, bkey_to_packed(k)); + 
bch2_bkey_swab_val(bkey_i_to_s(k)); + } if (!write && version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index ac23b855858c..5445c1cf0797 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -22,6 +22,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ } s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, -- cgit From a9bc0a5173bb9c30d0073ccf39b10e26d2f96ecf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 20:02:41 -0500 Subject: bcachefs: Check for bad key version number Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 +++++- fs/bcachefs/io.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a0b65267cf76..a5fe3b316e06 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -126,7 +126,11 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, BUG_ON(journal_seq_verify(c) && k.k->version.lo > journal_cur_seq(&c->journal)); - if (k.k->version.lo > atomic64_read(&c->key_version)) + /* XXX change to fsck check */ + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k.k->version.lo, + atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 679ad54d4c4e..33603624b42a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -943,7 +943,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (bch2_csum_type_is_encryption(op->csum_type)) { if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version) + 1; + version.lo = atomic64_inc_return(&c->key_version); } else { crc.nonce = op->nonce; op->nonce += src_len >> 9; -- cgit From b606c8aa568e9f4f10f1b7504f8b96628d933a0e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 19:29:33 -0500 Subject: bcachefs: Fix traversing to interior nodes NULL is used to mean "reach end of traversal" - we were only initializing the leaf node in the iterator to the right sentinel value. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f002ddc18cbb..bf68ab7257ce 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1732,8 +1732,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->l[i].b = BTREE_ITER_NO_NODE_INIT; prefetch(c->btree_roots[btree_id].b); } -- cgit From 237e80483a6466f3c1968c2a8bb115b3e24d951b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 17:15:32 -0500 Subject: bcachefs: introduce b->hash_val This is partly prep work for introducing bch_btree_ptr_v2, but it'll also be a bit of a performance boost by moving the full key out of the hot part of struct btree. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 24 ++++++++++++++---------- fs/bcachefs/btree_cache.h | 13 ++++++++++--- fs/bcachefs/btree_io.c | 9 ++------- fs/bcachefs/btree_types.h | 7 ++++--- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_interior.c | 35 +++++++++++++++++++---------------- fs/bcachefs/migrate.c | 6 ++---- 7 files changed, 52 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8eed82ac41f1..ee3c1f40b500 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -62,13 +62,13 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct btree *b = obj; const u64 *v = arg->key; - return PTR_HASH(&b->key) == *v ? 0 : 1; + return b->hash_val == *v ? 0 : 1; } static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, key.v), - .key_len = sizeof(struct bch_extent_ptr), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), .obj_cmpfn = bch2_btree_cache_cmp_fn, }; @@ -115,11 +115,14 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); } @@ -145,8 +148,9 @@ __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) { - return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), - bch_btree_cache_params); + u64 v = btree_ptr_hash_val(k); + + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } /* @@ -200,7 +204,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) btree_node_wait_on_io(b); } out: - if (PTR_HASH(&b->key) && !ret) + if (b->hash_val && !ret) trace_btree_node_reap(c, b); return ret; out_unlock: @@ -608,7 +612,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, /* raced with another fill: */ /* mark as unhashed... */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); @@ -711,7 +715,7 @@ retry: * free it: * * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which we + * when they're freed - and b->hash_val is zeroed out, which we * check for after we lock the node. 
* * Then, bch2_btree_node_relock() on the parent will fail - because @@ -724,7 +728,7 @@ retry: if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) return ERR_PTR(-EINTR); - if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index adacb0a06703..270f7f8fb140 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -35,13 +35,20 @@ void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); void bch2_fs_btree_cache_init_early(struct btree_cache *); -#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) +static inline u64 btree_ptr_hash_val(const struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + default: + return 0; + } +} /* is btree node in hash table? */ static inline bool btree_node_hashed(struct btree *b) { - return b->key.k.type == KEY_TYPE_btree_ptr && - PTR_HASH(&b->key); + return b->hash_val != 0; } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 83f61443c8bb..9df8d4f785bf 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1254,8 +1254,6 @@ static void bch2_btree_node_write_error(struct bch_fs *c, { struct btree *b = wbio->wbio.bio.bi_private; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_btree_ptr *new_key; - struct bkey_s_btree_ptr bp; struct bch_extent_ptr *ptr; struct btree_trans trans; struct btree_iter *iter; @@ -1281,16 +1279,13 @@ retry: bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - bp = btree_ptr_i_to_s(new_key); - bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bp.s_c)) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); if (ret == -EINTR) goto retry; if (ret) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 85d4a6d2f7e9..4636b4fd1222 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -71,9 +71,7 @@ struct btree { struct btree_bkey_cached_common c; struct rhash_head hash; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u64 hash_val; unsigned long flags; u16 written; @@ -136,6 +134,9 @@ struct btree { #ifdef CONFIG_BCACHEFS_DEBUG bool *expensive_debug_checks; #endif + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 2c34bae64281..be4fe818eac8 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -70,7 +70,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, - struct btree *, struct bkey_i_btree_ptr *); + struct btree *, struct bkey_i *); int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 713f2d41e6c9..ff8cb37ed9a1 100644 --- 
a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1944,7 +1944,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree_update *as, struct btree_iter *iter, struct btree *b, struct btree *new_hash, - struct bkey_i_btree_ptr *new_key) + struct bkey_i *new_key) { struct btree *parent; int ret; @@ -1989,20 +1989,20 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, */ ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, c->opts.btree_node_size * - bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), + bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { - bkey_copy(&new_hash->key, &new_key->k_i); + bkey_copy(&new_hash->key, new_key); ret = bch2_btree_node_hash_insert(&c->btree_cache, new_hash, b->c.level, b->c.btree_id); BUG_ON(ret); } - bch2_keylist_add(&as->parent_keys, &new_key->k_i); + bch2_keylist_add(&as->parent_keys, new_key); bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); if (new_hash) { @@ -2011,12 +2011,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_hash_remove(&c->btree_cache, b); - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); mutex_unlock(&c->btree_cache.lock); } else { - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); } } else { struct bch_fs_usage_online *fs_usage; @@ -2029,11 +2029,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, percpu_down_read(&c->mark_lock); fs_usage = bch2_fs_usage_scratch_get(c); - bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), 0, 0, &fs_usage->u, 0, BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) - bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), 0, 0, NULL, 0, BTREE_TRIGGER_INSERT|| BTREE_TRIGGER_GC); @@ -2047,16 +2047,16 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); - if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + if (btree_ptr_hash_val(new_key) != b->hash_val) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); mutex_unlock(&c->btree_cache.lock); } else { - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); } btree_update_updated_root(as); @@ -2068,7 +2068,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, struct btree *b, - struct bkey_i_btree_ptr *new_key) + struct bkey_i *new_key) { struct btree *parent = btree_node_parent(iter, b); struct btree_update *as = NULL; @@ -2091,8 +2091,11 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, } } - /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ - if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + /* + * check btree_ptr_hash_val() after @b is locked by + * btree_iter_traverse(): + */ + if (btree_ptr_hash_val(new_key) != b->hash_val) { /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { @@ -2134,7 +2137,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter 
*iter, goto err; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); if (ret) goto err_free_update; @@ -2193,7 +2196,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bkey_btree_ptr_init(&b->key); b->key.k.p = POS_MAX; - PTR_HASH(&b->key) = U64_MAX - id; + *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 1ef62a189e33..e26fa1608f39 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -123,23 +123,21 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_btree_ptr *new_key; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) continue; bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), + ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); goto err; } - ret = bch2_btree_node_update_key(c, iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); goto retry; -- cgit From 548b3d209fa5c6aaa9db58a69d9f6cf4ce8978b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2020 13:38:02 -0500 Subject: bcachefs: btree_ptr_v2 Add a new btree ptr type which contains the sequence number (random 64 bit cookie, actually) for that btree node - this lets us verify that when we read in a btree node it really is the btree node we wanted. 
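The verification works by stashing the node's random 64-bit sequence cookie in the parent's pointer and comparing it against the on-disk node header at read time; a mismatch means the pointer is stale or the wrong location was read, so the read must be retried. Below is a minimal standalone sketch of that idea with simplified, made-up types and names - it is not the actual bcachefs structures or API, just an illustration of the check the patch adds.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the on-disk structures (illustrative only): */
struct node_header {
	uint64_t seq;		/* random cookie written when the node is created */
};

struct btree_ptr_v2 {
	uint64_t seq;		/* copy of the cookie, carried in the parent's pointer */
	uint16_t sectors_written;
	uint16_t sectors;
};

/*
 * After reading a node from disk, verify that the cookie in the node header
 * matches the cookie recorded in the pointer we followed; if they differ we
 * did not get the node we wanted and the read has to be retried.
 */
static bool node_matches_ptr(const struct node_header *hdr,
			     const struct btree_ptr_v2 *ptr)
{
	return hdr->seq == ptr->seq;
}

int main(void)
{
	struct node_header hdr = { .seq = 0xdeadbeefcafef00dULL };
	struct btree_ptr_v2 good  = { .seq = hdr.seq };
	struct btree_ptr_v2 stale = { .seq = 0x1234 };

	printf("good ptr:  %s\n", node_matches_ptr(&hdr, &good)  ? "ok" : "wrong node");
	printf("stale ptr: %s\n", node_matches_ptr(&hdr, &stale) ? "ok" : "wrong node");
	return 0;
}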
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 22 +++++++++++-- fs/bcachefs/bkey.h | 1 + fs/bcachefs/btree_cache.h | 2 ++ fs/bcachefs/btree_io.c | 30 ++++++++++++++++-- fs/bcachefs/btree_update_interior.c | 63 ++++++++++++++++++++++++++----------- fs/bcachefs/buckets.c | 2 ++ fs/bcachefs/buckets.h | 3 +- fs/bcachefs/extents.c | 3 ++ fs/bcachefs/extents.h | 15 +++++++++ fs/bcachefs/recovery.c | 1 + fs/bcachefs/replicas.c | 1 + 11 files changed, 117 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index dbc9c15514bd..575fb7143cc0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -343,7 +343,8 @@ static inline void bkey_init(struct bkey *k) x(stripe, 14) \ x(reflink_p, 15) \ x(reflink_v, 16) \ - x(inline_data, 17) + x(inline_data, 17) \ + x(btree_ptr_v2, 18) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -599,6 +600,19 @@ struct bch_btree_ptr { struct bch_extent_ptr start[]; } __attribute__((packed, aligned(8))); +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + /* In case we ever decide to do variable size btree nodes: */ + __le16 sectors; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __attribute__((packed, aligned(8))); + struct bch_extent { struct bch_val v; @@ -630,7 +644,8 @@ struct bch_reservation { /* Btree pointers don't carry around checksums: */ #define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) @@ -1299,7 +1314,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(new_siphash, 7) \ x(inline_data, 8) \ x(new_extent_overwrite, 9) \ - x(incompressible, 10) + x(incompressible, 10) \ + x(btree_ptr_v2, 11) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 36e6ecc04514..aa729347e448 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -573,6 +573,7 @@ BKEY_VAL_ACCESSORS(stripe); BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); +BKEY_VAL_ACCESSORS(btree_ptr_v2); /* byte order helpers */ diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 270f7f8fb140..6e7edcaf6675 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -40,6 +40,8 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) switch (k->k.type) { case KEY_TYPE_btree_ptr: return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + case KEY_TYPE_btree_ptr_v2: + return bkey_i_to_btree_ptr_v2_c(k)->v.seq; default: return 0; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9df8d4f785bf..5fa31698ed67 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -734,6 +734,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, bch2_bpos_swab(&b->data->max_key); } + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, b, NULL, + "incorrect min_key"); + } + btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, b, i, "incorrect max key"); @@ -897,6 +906,15 @@ 
int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BTREE_ERR_MUST_RETRY, c, b, NULL, "bad btree header"); + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, + BTREE_ERR_MUST_RETRY, c, b, NULL, + "got wrong btree node"); + } + while (b->written < c->opts.btree_node_size) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; @@ -1004,15 +1022,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; - struct bkey_s_c u = bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, u); + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || (inject_invalid_keys(c) && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey %s: %s", buf, invalid); @@ -1025,6 +1043,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry continue; } + if (u.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); + + bp.v->mem_ptr = 0; + } + k = bkey_next_skip_noops(k, vstruct_last(i)); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ff8cb37ed9a1..3d8b6218c983 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -332,7 +332,11 @@ retry: goto retry; } - bkey_btree_ptr_init(&tmp.k); + if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) + bkey_btree_ptr_v2_init(&tmp.k); + else + bkey_btree_ptr_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); bch2_open_bucket_get(c, wp, &ob); @@ -354,14 +358,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev { struct bch_fs *c = as->c; struct btree *b; + int ret; BUG_ON(level >= BTREE_MAX_DEPTH); BUG_ON(!as->reserve->nr); b = as->reserve->b[--as->reserve->nr]; - BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); - set_btree_node_accessed(b); set_btree_node_dirty(b); set_btree_node_need_write(b); @@ -372,7 +375,16 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; + b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); + + bp->v.mem_ptr = 0; + bp->v.seq = b->data->keys.seq; + bp->v.sectors_written = 0; + bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); + } if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); @@ -385,10 +397,26 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev btree_node_will_make_reachable(as, b); + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + trace_btree_node_alloc(c, b); return b; } +static void btree_set_min(struct btree *b, struct bpos pos) +{ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; + 
b->data->min_key = pos; +} + +static void btree_set_max(struct btree *b, struct bpos pos) +{ + b->key.k.p = pos; + b->data->max_key = pos; +} + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, struct btree *b, struct bkey_format format) @@ -397,11 +425,12 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, n = bch2_btree_node_alloc(as, b->c.level); - n->data->min_key = b->data->min_key; - n->data->max_key = b->data->max_key; - n->data->format = format; SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + btree_set_min(n, b->data->min_key); + btree_set_max(n, b->data->max_key); + + n->data->format = format; btree_node_set_format(n, format); bch2_btree_sort_into(as->c, n, b); @@ -431,10 +460,9 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) { struct btree *b = bch2_btree_node_alloc(as, level); - b->data->min_key = POS_MIN; - b->data->max_key = POS_MAX; + btree_set_min(b, POS_MIN); + btree_set_max(b, POS_MAX); b->data->format = bch2_btree_calc_format(b); - b->key.k.p = POS_MAX; btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); @@ -1263,10 +1291,8 @@ static struct btree *__btree_split_node(struct btree_update *as, BUG_ON(!prev); - n1->key.k.p = bkey_unpack_pos(n1, prev); - n1->data->max_key = n1->key.k.p; - n2->data->min_key = - btree_type_successor(n1->c.btree_id, n1->key.k.p); + btree_set_max(n1, bkey_unpack_pos(n1, prev)); + btree_set_min(n2, btree_type_successor(n1->c.btree_id, n1->key.k.p)); set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); @@ -1749,10 +1775,9 @@ retry: n = bch2_btree_node_alloc(as, b->c.level); - n->data->min_key = prev->data->min_key; - n->data->max_key = next->data->max_key; + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); n->data->format = new_f; - n->key.k.p = next->key.k.p; btree_node_set_format(n, new_f); @@ -2202,8 +2227,8 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_btree_build_aux_trees(b); b->data->flags = 0; - b->data->min_key = POS_MIN; - b->data->max_key = POS_MAX; + btree_set_min(b, POS_MIN); + btree_set_max(b, POS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 60ad443bb509..9fae7d9fb495 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1194,6 +1194,7 @@ int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1729,6 +1730,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, switch (k.k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? c->opts.btree_node_size : -c->opts.btree_node_size; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2e49f2a8ccd9..4c84787575f5 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -97,7 +97,8 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, static inline enum bch_data_type ptr_data_type(const struct bkey *k, const struct bch_extent_ptr *ptr) { - if (k->type == KEY_TYPE_btree_ptr) + if (k->type == KEY_TYPE_btree_ptr || + k->type == KEY_TYPE_btree_ptr_v2) return BCH_DATA_BTREE; return ptr->cached ? 
BCH_DATA_CACHED : BCH_DATA_USER; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 10ca544317ba..cff4955d203b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -748,6 +748,7 @@ void bch2_bkey_append_ptr(struct bkey_i *k, switch (k->k.type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); @@ -1030,6 +1031,8 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_btree_ptr) size_ondisk = c->opts.btree_node_size; + if (k.k->type == KEY_TYPE_btree_ptr_v2) + size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6e8119a8ad30..70b7d70269dc 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -225,6 +225,13 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) bkey_val_end(r), }; } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -372,6 +379,13 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, .swab = bch2_ptr_swab, \ } +#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); @@ -416,6 +430,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return true; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 29e6f9f00bad..c9d12f7c180e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1010,6 +1010,7 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_ptr_v2; write_sb = true; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 66787d0c5c63..f4851c8b8f88 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -112,6 +112,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, switch (k.k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: e->data_type = BCH_DATA_BTREE; extent_to_replicas(k, e); break; -- cgit From ac7c51b2180e757feaaabcb84794bcc9912a4edf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Feb 2020 16:39:37 -0500 Subject: bcachefs: Seralize btree_update operations at btree_update_nodes_written() Prep work for journalling updates to interior nodes - enforcing ordering will greatly simplify those changes. 
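The ordering this prep work enforces can be summarized as: completed updates are only processed from the head of an "unwritten" FIFO, so no update finishes ahead of one that was queued before it. The sketch below is a toy, single-threaded model of that discipline (made-up types, locking and the real btree_update machinery omitted), included only to illustrate the ordering rule.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy model of an interior-node update, kept on a FIFO of unwritten updates: */
struct update {
	struct update	*next;
	int		id;
	bool		nodes_written;	/* set once this update's new nodes hit disk */
};

static struct update *unwritten_head, **unwritten_tail = &unwritten_head;

static void update_queue(struct update *as)
{
	as->next = NULL;
	*unwritten_tail = as;
	unwritten_tail = &as->next;
}

/*
 * Write completion: mark this update's nodes as written, then complete
 * updates strictly from the head of the list - an update is never
 * completed before every update queued ahead of it.
 */
static void update_nodes_written(struct update *as)
{
	as->nodes_written = true;

	while (unwritten_head && unwritten_head->nodes_written) {
		struct update *head = unwritten_head;

		unwritten_head = head->next;
		if (!unwritten_head)
			unwritten_tail = &unwritten_head;

		printf("completing update %d\n", head->id);
	}
}

int main(void)
{
	struct update a = { .id = 1 }, b = { .id = 2 };

	update_queue(&a);
	update_queue(&b);

	update_nodes_written(&b);	/* completes nothing: update 1 still pending */
	update_nodes_written(&a);	/* completes 1, then 2, in queue order */
	return 0;
}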
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_update_interior.c | 32 ++++++++++++++++++++++++++------ fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/super.c | 1 + 4 files changed, 29 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0d4a8b75ff42..32cdf87ee55d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -610,6 +610,7 @@ struct bch_fs { mempool_t btree_interior_update_pool; struct list_head btree_interior_update_list; + struct list_head btree_interior_updates_unwritten; struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3d8b6218c983..677cb76731c1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -666,9 +666,15 @@ static void btree_update_nodes_written(struct closure *cl) * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node. */ -retry: mutex_lock(&c->btree_interior_update_lock); as->nodes_written = true; +retry: + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (!as || !as->nodes_written) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } switch (as->mode) { case BTREE_INTERIOR_NO_UPDATE: @@ -681,11 +687,12 @@ retry: mutex_unlock(&c->btree_interior_update_lock); btree_node_lock_type(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); + mutex_lock(&c->btree_interior_update_lock); goto retry; } BUG_ON(!btree_node_dirty(b)); - closure_wait(&btree_current_write(b)->wait, cl); + closure_wait(&btree_current_write(b)->wait, &as->cl); list_del(&as->write_blocked_list); @@ -694,6 +701,8 @@ retry: * nodes to be writeable: */ closure_wake_up(&c->btree_interior_update_wait); + + list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); /* @@ -702,6 +711,7 @@ retry: */ bch2_btree_node_write_cond(c, b, true); six_unlock_read(&b->c.lock); + continue_at(&as->cl, btree_update_nodes_reachable, system_wq); break; case BTREE_INTERIOR_UPDATING_AS: @@ -716,8 +726,12 @@ retry: /* * and then we have to wait on that btree_update to finish: */ - closure_wait(&as->parent_as->wait, cl); + closure_wait(&as->parent_as->wait, &as->cl); + + list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); + + continue_at(&as->cl, btree_update_nodes_reachable, system_wq); break; case BTREE_INTERIOR_UPDATING_ROOT: @@ -728,6 +742,7 @@ retry: mutex_unlock(&c->btree_interior_update_lock); btree_node_lock_type(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); + mutex_lock(&c->btree_interior_update_lock); goto retry; } @@ -744,6 +759,8 @@ retry: * can reuse the old nodes it'll have to do a journal commit: */ six_unlock_read(&b->c.lock); + + list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); /* @@ -762,11 +779,12 @@ retry: as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); - btree_update_wait_on_journal(cl); - return; + btree_update_wait_on_journal(&as->cl); + break; } - continue_at(cl, btree_update_nodes_reachable, system_wq); + mutex_lock(&c->btree_interior_update_lock); + goto retry; } /* @@ -778,6 +796,7 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); + 
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(!btree_node_dirty(b)); @@ -858,6 +877,7 @@ static void btree_update_updated_root(struct btree_update *as) struct btree_root *r = &c->btree_roots[as->btree_id]; mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 8f9d4a0b68ea..e3204f32cc68 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -55,6 +55,7 @@ struct btree_update { struct bch_fs *c; struct list_head list; + struct list_head unwritten_list; /* What kind of update are we doing? */ enum { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 586636a4c204..2ba04b08529d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -642,6 +642,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); mutex_init(&c->btree_reserve_cache_lock); mutex_init(&c->btree_interior_update_lock); -- cgit From 163e885a0aee99657b3819ead6c3390271ed0980 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 15:39:46 -0500 Subject: bcachefs: Kill TRANS_RESET_MEM|TRANS_RESET_ITERS All iterators should be released now with bch2_trans_iter_put(), so TRANS_RESET_ITERS shouldn't be needed anymore, and TRANS_RESET_MEM is always used. Also convert more code to __bch2_trans_do(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 15 +++++---- fs/bcachefs/btree_iter.h | 6 ++-- fs/bcachefs/btree_update.h | 15 +++++---- fs/bcachefs/btree_update_leaf.c | 15 +++++---- fs/bcachefs/dirent.c | 45 +++++++++++++++---------- fs/bcachefs/ec.c | 19 +++++------ fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/fsck.c | 29 +++++----------- fs/bcachefs/io.c | 4 +-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/str_hash.h | 4 +++ fs/bcachefs/tests.c | 73 +++++++++++++++++++++++++++++++---------- 12 files changed, 136 insertions(+), 93 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bf68ab7257ce..37c60842c670 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1756,6 +1756,8 @@ int bch2_trans_iter_put(struct btree_trans *trans, if (IS_ERR_OR_NULL(iter)) return 0; + BUG_ON(trans->iters + iter->idx != iter); + ret = btree_iter_err(iter); if (!(trans->iters_touched & (1ULL << iter->idx)) && @@ -2080,16 +2082,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) bch2_trans_unlink_iters(trans); - if (flags & TRANS_RESET_ITERS) - trans->iters_live = 0; - trans->iters_touched &= trans->iters_live; trans->need_reset = 0; trans->nr_updates = 0; - - if (flags & TRANS_RESET_MEM) - trans->mem_top = 0; + trans->mem_top = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; @@ -2109,6 +2106,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, { memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + /* + * reallocating iterators currently completely breaks + * bch2_trans_iter_put(): + */ + expected_nr_iters = BTREE_ITER_MAX; + trans->c = c; trans->ip = _RET_IP_; trans->size = ARRAY_SIZE(trans->iters_onstack); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 
f9bf01e26aa1..dd7a5e513dc8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -290,15 +290,13 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -#define TRANS_RESET_ITERS (1 << 0) -#define TRANS_RESET_MEM (1 << 1) -#define TRANS_RESET_NOTRAVERSE (1 << 2) +#define TRANS_RESET_NOTRAVERSE (1 << 0) void bch2_trans_reset(struct btree_trans *, unsigned); static inline void bch2_trans_begin(struct btree_trans *trans) { - return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); + return bch2_trans_reset(trans, 0); } void *bch2_trans_kmalloc(struct btree_trans *, size_t); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index be4fe818eac8..d1cd839ac08f 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -59,6 +59,7 @@ enum btree_insert_flags { int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); @@ -98,17 +99,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ - _flags, _reset_flags, _do) \ +#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ({ \ int _ret; \ \ - do { \ - bch2_trans_reset(_trans, _reset_flags); \ - \ + while (1) { \ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ (_journal_seq), (_flags)); \ - } while (_ret == -EINTR); \ + if (_ret != -EINTR) \ + break; \ + bch2_trans_reset(_trans, 0); \ + } \ \ _ret; \ }) @@ -120,7 +121,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, \ bch2_trans_init(&trans, (_c), 0, 0); \ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ - TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ + _do); \ _ret2 = bch2_trans_exit(&trans); \ \ _ret ?: _ret2; \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 11bd0558ebd5..8e4a47acd667 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -758,7 +758,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_noupdates: - bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); + bch2_trans_reset(trans, !ret ? 
TRANS_RESET_NOTRAVERSE : 0); return ret; err: @@ -839,18 +839,21 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, return 0; } -static int __bch2_btree_insert(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) +int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) { struct btree_iter *iter; + int ret; iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - bch2_trans_update(trans, iter, k, 0); - return 0; + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, 0); + bch2_trans_iter_put(trans, iter); + return ret; } /** @@ -882,7 +885,7 @@ retry: bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; - bch2_trans_reset(trans, TRANS_RESET_MEM); + bch2_trans_begin(trans); bkey_init(&delete.k); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 623b6c3eda95..ae5c9fd8d9f7 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -169,12 +169,12 @@ int bch2_dirent_rename(struct btree_trans *trans, const struct qstr *dst_name, u64 *dst_inum, enum bch_rename_mode mode) { - struct btree_iter *src_iter, *dst_iter; + struct btree_iter *src_iter = NULL, *dst_iter = NULL; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); - int ret; + int ret = 0; *src_inum = *dst_inum = 0; @@ -191,8 +191,10 @@ int bch2_dirent_rename(struct btree_trans *trans, : bch2_hash_lookup(trans, bch2_dirent_hash_desc, dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); - if (IS_ERR(dst_iter)) - return PTR_ERR(dst_iter); + ret = PTR_ERR_OR_ZERO(dst_iter); + if (ret) + goto out; + old_dst = bch2_btree_iter_peek_slot(dst_iter); if (mode != BCH_RENAME) @@ -202,15 +204,18 @@ int bch2_dirent_rename(struct btree_trans *trans, src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, src_hash, src_dir, src_name, BTREE_ITER_INTENT); - if (IS_ERR(src_iter)) - return PTR_ERR(src_iter); + ret = PTR_ERR_OR_ZERO(src_iter); + if (ret) + goto out; + old_src = bch2_btree_iter_peek_slot(src_iter); *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); - if (IS_ERR(new_dst)) - return PTR_ERR(new_dst); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); new_dst->k.p = dst_iter->pos; @@ -218,15 +223,18 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { new_src = dirent_create_key(trans, 0, src_name, 0); - if (IS_ERR(new_src)) - return PTR_ERR(new_src); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); new_src->k.p = src_iter->pos; } else { new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - if (IS_ERR(new_src)) - return PTR_ERR(new_src); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + bkey_init(&new_src->k); new_src->k.p = src_iter->pos; @@ -247,7 +255,7 @@ int bch2_dirent_rename(struct btree_trans *trans, new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); - return 0; + goto out; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -261,7 +269,7 @@ int bch2_dirent_rename(struct btree_trans *trans, ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, src_hash, src_iter); if (ret < 0) 
- return ret; + goto out; if (ret) new_src->k.type = KEY_TYPE_whiteout; @@ -270,7 +278,10 @@ int bch2_dirent_rename(struct btree_trans *trans, bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); - return 0; +out: + bch2_trans_iter_put(trans, src_iter); + bch2_trans_iter_put(trans, dst_iter); + return ret; } int bch2_dirent_delete_at(struct btree_trans *trans, @@ -331,9 +342,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - - if (!IS_ERR(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, iter); return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 59985227385b..bc11f7e056eb 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -741,6 +741,8 @@ found_slot: ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: + bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) goto retry; @@ -1201,8 +1203,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, size_t idx, - struct bkey_i_stripe *new_key, - unsigned flags) + struct bkey_i_stripe *new_key) { struct bch_fs *c = trans->c; struct bkey_s_c k; @@ -1231,9 +1232,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); - - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + return 0; } int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) @@ -1257,12 +1256,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (!m->dirty) continue; - do { - bch2_trans_reset(&trans, TRANS_RESET_MEM); - - ret = __bch2_stripe_write_key(&trans, iter, m, - giter.pos, new_key, flags); - } while (ret == -EINTR); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags, + __bch2_stripe_write_key(&trans, iter, m, + giter.pos, new_key)); if (ret) break; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f6a597f54d16..4a20bb11151c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2620,7 +2620,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9ef532d875e8..eca723121a2c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -81,7 +81,6 @@ static int remove_dirent(struct btree_trans *trans, return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, __remove_dirent(trans, dirent)); } @@ -182,8 +181,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, struct bkey_i delete; struct bkey_i *tmp; - bch2_trans_reset(trans, TRANS_RESET_MEM); - tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if (IS_ERR(tmp)) return PTR_ERR(tmp); @@ -194,11 +191,8 @@ static int hash_redo_key(const struct bch_hash_desc desc, delete.k.p = k_iter->pos; bch2_trans_update(trans, k_iter, &delete, 0); - return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -320,10 +314,9 @@ static int 
hash_check_key(struct btree_trans *trans, desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - do { - ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); - } while (ret == -EINTR); - + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + hash_redo_key(desc, trans, h, k_iter, k, hashed)); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -387,7 +380,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); if (ret) goto err; @@ -410,11 +402,10 @@ err_redo: k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - do { - ret = hash_redo_key(bch2_dirent_hash_desc, trans, - h, iter, *k, hash); - } while (ret == -EINTR); - + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash)); if (ret) bch_err(c, "hash_redo_key err %i", ret); else @@ -660,7 +651,6 @@ retry: ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); kfree(n); if (ret) @@ -1275,7 +1265,6 @@ static int check_inode(struct btree_trans *trans, ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); if (ret) bch_err(c, "error in fsck: error %i " diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 33603624b42a..5f296de282b6 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -335,7 +335,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_disk_reservation_init(c, 0); struct bkey_i delete; - bch2_trans_reset(trans, TRANS_RESET_MEM); + bch2_trans_begin(trans); ret = bkey_err(k); if (ret) @@ -409,7 +409,7 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); k = bch2_keylist_front(keys); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d78a3d5f7246..2f223be74926 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -184,7 +184,7 @@ s64 bch2_remap_range(struct bch_fs *c, BTREE_ITER_INTENT); while (1) { - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); trans.mem_top = 0; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0710d0bbe36d..9c9549d0a8f6 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -163,6 +163,7 @@ bch2_hash_lookup(struct btree_trans *trans, break; } } + bch2_trans_iter_put(trans, iter); return ERR_PTR(ret ?: -ENOENT); } @@ -187,6 +188,9 @@ bch2_hash_hole(struct btree_trans *trans, return iter; } + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret ?: -ENOSPC); } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 8ef7bc8098d7..876d64bfca20 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,8 +43,8 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); BUG_ON(ret); 
pr_info("deleting once"); @@ -75,8 +75,8 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); BUG_ON(ret); bch2_journal_flush_all_pins(&c->journal); @@ -409,18 +409,24 @@ static u64 test_rand(void) static void rand_insert(struct bch_fs *c, u64 nr) { + struct btree_trans trans; struct bkey_i_cookie k; int ret; u64 i; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_DIRENTS, &k.k_i)); + BUG_ON(ret); } + + bch2_trans_exit(&trans); } static void rand_lookup(struct bch_fs *c, u64 nr) @@ -465,8 +471,9 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); + BUG_ON(ret); } @@ -476,20 +483,50 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bch2_trans_exit(&trans); } +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter *iter; + struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, pos, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + goto err; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + + bch2_trans_update(trans, iter, &delete, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void rand_delete(struct bch_fs *c, u64 nr) { - struct bkey_i k; + struct btree_trans trans; int ret; u64 i; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < nr; i++) { - bkey_init(&k.k); - k.k.p.offset = test_rand(); + struct bpos pos = POS(0, test_rand()); - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, - NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); BUG_ON(ret); } + + bch2_trans_exit(&trans); } static void seq_insert(struct bch_fs *c, u64 nr) @@ -509,8 +546,9 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; - bch2_trans_update(&trans, iter, &insert.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &insert.k_i, 0)); + BUG_ON(ret); if (++i == nr) @@ -548,8 +586,9 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, iter, &u.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &u.k_i, 0)); + BUG_ON(ret); } bch2_trans_exit(&trans); -- cgit From c18dade6589ae3cec9b457849e93358bb8780fe5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 15:58:36 -0500 Subject: bcachefs: Issue discards when needed to allocate journal write Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c 
index 1866ed30ce89..fc36385c7830 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1042,9 +1042,16 @@ void bch2_journal_write(struct closure *cl) bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); +retry_alloc: spin_lock(&j->lock); ret = journal_write_alloc(j, w, sectors); + if (ret && j->can_discard) { + spin_unlock(&j->lock); + bch2_journal_do_discards(j); + goto retry_alloc; + } + /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): -- cgit From 00aad62aaf56fe589eb79e31b73af9fed98a40c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 17:11:00 -0500 Subject: bcachefs: Fix incorrect initialization of btree_node_old_extent_overwrite() b->level and b->btree_id weren't set when the code was checking btree_node_is_extents() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 677cb76731c1..b159ce9b3b7a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -370,6 +370,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); + b->c.level = level; + b->c.btree_id = as->btree_id; + memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); b->data->flags = 0; -- cgit From 72141e1f4fa4f389f64d4ed7c6a63689e67921ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Feb 2020 15:25:00 -0500 Subject: bcachefs: Use btree_ptr_v2.mem_ptr to avoid hash table lookup Nice performance optimization Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 28 +++++++++++----------------- fs/bcachefs/btree_cache.h | 7 +++++++ fs/bcachefs/btree_io.c | 1 + fs/bcachefs/btree_iter.c | 25 +++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ee3c1f40b500..40281a9acbbc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -596,12 +596,13 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_cache *bc = &c->btree_cache; struct btree *b; + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* * Parent node must be locked, else we could read in a btree node that's * been freed: */ - BUG_ON(!btree_node_locked(iter, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); + if (!bch2_btree_node_relock(iter, level + 1)) + return ERR_PTR(-EINTR); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) @@ -624,13 +625,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } /* - * If the btree node wasn't cached, we can't drop our lock on - * the parent until after it's added to the cache - because - * otherwise we could race with a btree_split() freeing the node - * we're trying to lock. 
+ * Unlock before doing IO: * - * But the deadlock described below doesn't exist in this case, - * so it's safe to not drop the parent lock until here: + * XXX: ideally should be dropping all btree node locks here */ if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); @@ -667,16 +664,11 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, struct btree *b; struct bset_tree *t; - /* - * XXX: locking optimization - * - * we can make the locking looser here - caller can drop lock on parent - * node before locking child node (and potentially blocking): we just - * have to have bch2_btree_node_fill() call relock on the parent and - * return -EINTR if that fails - */ - EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -694,6 +686,7 @@ retry: if (IS_ERR(b)) return b; } else { +lock_node: /* * There's a potential deadlock with splits and insertions into * interior nodes we have to avoid: @@ -740,6 +733,7 @@ retry: } } + /* XXX: waiting on IO with btree locks held: */ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6e7edcaf6675..5d85987457bf 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -47,6 +47,13 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) } } +static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr + : NULL; +} + /* is btree node in hash table? */ static inline bool btree_node_hashed(struct btree *b) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5fa31698ed67..00d796cb418b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1647,6 +1647,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; + /* XXX: submitting IO with btree locks held: */ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); return; err: diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 37c60842c670..3817dcb5fa1f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -912,6 +912,27 @@ static void btree_iter_prefetch(struct btree_iter *iter) btree_node_unlock(iter, iter->level); } +static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + unsigned plevel, struct btree *b) +{ + struct btree_iter_level *l = &iter->l[plevel]; + bool locked = btree_node_locked(iter, plevel); + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + + if (!bch2_btree_node_relock(iter, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); + + bp = (void *) bkeyp_val(&l->b->format, k); + bp->mem_ptr = (unsigned long)b; + + if (!locked) + btree_node_unlock(iter, plevel); +} + static __always_inline int btree_iter_down(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; @@ -933,6 +954,10 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); + if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(&tmp.k))) + btree_node_mem_ptr_set(iter, level + 1, b); + if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); -- cgit From 
7f81d4cf69c857881cd5139cbe064512a60dae78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 17:25:13 -0500 Subject: bcachefs: fix setting btree_node_accessed() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 40281a9acbbc..cb843a362cb4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -748,7 +748,7 @@ lock_node: } /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) + if (!btree_node_accessed(b)) set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { -- cgit From b807a0c8baf64a9eeae9449dcc8e82b8952db394 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 17:34:27 -0500 Subject: bcachefs: BCH_SB_FEATURES_ALL BCH_FEATURE_btree_ptr_v2 wasn't getting set on new filesystems, oops Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +++++ fs/bcachefs/recovery.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 575fb7143cc0..d1c0a5d5580e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1317,6 +1317,11 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(incompressible, 10) \ x(btree_ptr_v2, 11) +#define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)) + enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, BCH_SB_FEATURES() diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c9d12f7c180e..1871485c079d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1008,9 +1008,7 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_ptr_v2; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; write_sb = true; } @@ -1129,8 +1127,7 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->version = c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -- cgit From f1f5f114cdd1af3ff68a0da972454d9956c26eb0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 20:39:06 -0500 Subject: bcachefs: Improve an error message Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 00d796cb418b..a4732bf13a11 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -708,15 +708,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, unsigned *whiteout_u64s, int write, bool have_retry) { - 
struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; + struct bkey_packed *k; + struct bkey prev = KEY(0, 0, 0); struct bpos prev_data = POS_MIN; bool seen_non_whiteout = false; unsigned version; const char *err; int ret = 0; - if (i == &b->data->keys) { + if (!b->written) { /* These indicate that we read the wrong btree node: */ btree_err_on(BTREE_NODE_ID(b->data) != b->c.btree_id, BTREE_ERR_MUST_RETRY, c, b, i, @@ -852,25 +852,28 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { + (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || - bkey_cmp(prev_pos, u.k->p) > 0) { + bkey_cmp(prev.p, u.k->p) > 0) { + char buf1[80]; + char buf2[80]; + + bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(b, i, 0); btree_err(BTREE_ERR_FATAL, c, b, i, - "keys out of order: %llu:%llu > %llu:%llu", - prev_pos.inode, - prev_pos.offset, - u.k->p.inode, - bkey_start_offset(u.k)); + "keys out of order: %s > %s", + buf1, buf2); /* XXX: repair this */ } if (!bkey_deleted(u.k)) prev_data = u.k->p; - prev_pos = u.k->p; + prev = *u.k; - prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } -- cgit From 38f0664a5f596faf7d0a247697d6f21a85133cb9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2020 22:29:52 -0500 Subject: bcachefs: Fix error message on bucket sector count overflow Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9fae7d9fb495..7e0412dac5ff 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1444,8 +1444,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - u16 *dst_sectors; - bool overflow; + u16 *dst_sectors, orig_sectors; int ret; ret = trans_get_key(trans, BTREE_ID_ALLOC, @@ -1502,13 +1501,12 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, dst_sectors = !p.ptr.cached ? 
&u.dirty_sectors : &u.cached_sectors; + orig_sectors = *dst_sectors; - overflow = checked_add(*dst_sectors, sectors); - - if (overflow) { + if (checked_add(*dst_sectors, sectors)) { bch2_fs_inconsistent(c, "bucket sector count overflow: %u + %lli > U16_MAX", - *dst_sectors, sectors); + orig_sectors, sectors); /* return an error indicating that we need full fsck */ ret = -EIO; goto out; -- cgit From 883f1a7ce05f5f723ce5b47b9d9b796b53b3692a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Feb 2020 15:03:53 -0500 Subject: bcachefs: Dont't del sysfs dir until after we go RO This will help for debugging hangs during unmount Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2ba04b08529d..8670be394239 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -518,6 +518,10 @@ void bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&c->journal_seq_blacklist_gc_work); + mutex_lock(&c->state_lock); + bch2_fs_read_only(c); + mutex_unlock(&c->state_lock); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -540,10 +544,6 @@ void bch2_fs_stop(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - mutex_lock(&c->state_lock); - bch2_fs_read_only(c); - mutex_unlock(&c->state_lock); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); -- cgit From 3f58a19763d9c4c09ec27152865bc892aa53410a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Feb 2020 15:03:44 -0500 Subject: bcachefs: Journal pin cleanups Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 12 +++--- fs/bcachefs/btree_update_leaf.c | 17 +++----- fs/bcachefs/journal_reclaim.c | 86 +++++++++++++------------------------ fs/bcachefs/journal_reclaim.h | 26 +++++++---- 4 files changed, 59 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b159ce9b3b7a..12ff2aea0d05 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -867,8 +867,8 @@ static void btree_update_reparent(struct btree_update *as, * just transfer the journal pin to the new interior update so * btree_update_nodes_written() can drop it. */ - bch2_journal_pin_add_if_older(&c->journal, &child->journal, - &as->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, + &child->journal, interior_update_flush); bch2_journal_pin_drop(&c->journal, &child->journal); as->journal_seq = max(as->journal_seq, child->journal_seq); @@ -1049,13 +1049,13 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, * oldest pin of any of the nodes we're freeing. 
We'll release the pin * when the new nodes are persistent and reachable on disk: */ - bch2_journal_pin_add_if_older(&c->journal, &w->journal, - &as->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, + &w->journal, interior_update_flush); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_add_if_older(&c->journal, &w->journal, - &as->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, + &w->journal, interior_update_flush); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8e4a47acd667..94418c9b42e8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -172,6 +172,9 @@ void bch2_btree_journal_key(struct btree_trans *trans, struct journal *j = &c->journal; struct btree *b = iter->l[0].b; struct btree_write *w = btree_current_write(b); + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? trans->journal_res.seq + : j->replay_journal_seq; EBUG_ON(iter->level || b->c.level); EBUG_ON(trans->journal_res.ref != @@ -183,16 +186,10 @@ void bch2_btree_journal_key(struct btree_trans *trans, cpu_to_le64(trans->journal_res.seq); } - if (unlikely(!journal_pin_active(&w->journal))) { - u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - ? trans->journal_res.seq - : j->replay_journal_seq; - - bch2_journal_pin_add(j, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); - } + bch2_journal_pin_add(j, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 2f67ea2debd2..5233cb82d422 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -290,38 +290,6 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) } } -static inline void __journal_pin_add(struct journal *j, - u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - BUG_ON(journal_pin_active(pin)); - BUG_ON(!atomic_read(&pin_list->count)); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -void bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - __journal_pin_add(j, seq, pin, flush_fn); - spin_unlock(&j->lock); -} - static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { @@ -354,42 +322,46 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void __bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + spin_lock(&j->lock); - if (pin->seq != seq) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, seq, pin, flush_fn); - } else { - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, seq); + __journal_pin_drop(j, pin); + + BUG_ON(!atomic_read(&pin_list->count)); - list_move(&pin->list, &pin_list->list); - } + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); spin_unlock(&j->lock); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); } -void bch2_journal_pin_add_if_older(struct journal *j, - struct journal_entry_pin *src_pin, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - spin_lock(&j->lock); - - if (journal_pin_active(src_pin) && - (!journal_pin_active(pin) || - src_pin->seq < pin->seq)) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->seq, pin, flush_fn); - } - - spin_unlock(&j->lock); + if (journal_pin_active(src) && + (!journal_pin_active(dst) || src->seq < dst->seq)) + __bch2_journal_pin_add(j, src->seq, dst, flush_fn); } +/** + * bch2_journal_pin_flush: ensure journal pin callback is no longer running + */ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) { BUG_ON(journal_pin_active(pin)); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 9bf982a17797..883a0a5680af 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -29,16 +29,24 @@ journal_seq_pin(struct journal *j, u64 seq) } void bch2_journal_pin_put(struct journal *, u64); - -void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); -void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void bch2_journal_pin_add_if_older(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); + +void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); + +static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin))) + __bch2_journal_pin_add(j, seq, pin, flush_fn); +} + +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct 
journal_entry_pin *, + journal_pin_flush_fn); + void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); -- cgit From e3ecf4f56811ec538ed93fe8dbeb68c81ba74cc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Mar 2020 13:38:19 -0500 Subject: bcachefs: Some btree iterator improvements Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 83 ++++++++++++++++++++++-------------------------- fs/bcachefs/tests.c | 46 ++++++++++++++++----------- 2 files changed, 65 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3817dcb5fa1f..f745d228d21c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -35,6 +35,26 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) return pos; } +static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(iter->pos, b->data->min_key) < 0; +} + +static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; +} + +static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + struct btree *b) +{ + return iter->btree_id == b->c.btree_id && + !btree_iter_pos_before_node(iter, b) && + !btree_iter_pos_after_node(iter, b); +} + /* Btree node locking: */ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) @@ -399,6 +419,8 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, if (iter->uptodate > BTREE_ITER_NEED_PEEK) return; + BUG_ON(!btree_iter_pos_in_node(iter, b)); + bch2_btree_node_iter_verify(&l->iter, b); /* @@ -736,26 +758,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->c.level + 1); } -static inline bool btree_iter_pos_before_node(struct btree_iter *iter, - struct btree *b) -{ - return bkey_cmp(iter->pos, b->data->min_key) < 0; -} - -static inline bool btree_iter_pos_after_node(struct btree_iter *iter, - struct btree *b) -{ - return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; -} - -static inline bool btree_iter_pos_in_node(struct btree_iter *iter, - struct btree *b) -{ - return iter->btree_id == b->c.btree_id && - !btree_iter_pos_before_node(iter, b) && - !btree_iter_pos_after_node(iter, b); -} - static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { @@ -1373,6 +1375,10 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return true; } +/** + * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key + * it currently points to + */ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1409,7 +1415,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_UPTODATE) + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) return btree_iter_peek_uptodate(iter); while (1) { @@ -1503,7 +1510,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_UPTODATE) + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) return btree_iter_peek_uptodate(iter); while (1) { @@ -1655,33 +1663,15 @@ __bch2_btree_iter_peek_slot(struct btree_iter 
*iter) { struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; - int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); -recheck: - while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_deleted(k.k) && - bkey_cmp(k.k->p, iter->pos) == 0) - bch2_btree_node_iter_advance(&l->iter, l->b); + k = __btree_iter_peek_all(iter, l, &iter->k); - /* - * If we got to the end of the node, check if we need to traverse to the - * next node: - */ - if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); - goto recheck; - } - - if (!k.k || - bkey_deleted(k.k) || - bkey_cmp(iter->pos, k.k->p)) { + if (!k.k || bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); iter->k.p = iter->pos; @@ -1713,8 +1703,12 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + /* XXX directly setting iter->pos is wrong */ iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + if (unlikely(btree_iter_pos_after_node(iter, iter->l[0].b))) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { /* * XXX: when we just need to relock we should be able to avoid @@ -1726,8 +1720,7 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } - if (!bkey_deleted(&iter->k)) - bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); + btree_iter_advance_to_pos(iter, &iter->l[0], -1); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 876d64bfca20..6aa31369ecc9 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -18,7 +18,7 @@ static void delete_test_keys(struct bch_fs *c) NULL); BUG_ON(ret); - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); @@ -37,7 +37,7 @@ static void test_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); @@ -69,7 +69,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); @@ -107,7 +107,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i; - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); BUG_ON(ret); } @@ -116,9 +116,13 @@ static void test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + POS_MIN, 0, k, ret) { + if (k.k->p.inode) + break; + BUG_ON(k.k->p.offset != i++); + } BUG_ON(i != nr); @@ -202,7 +206,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, 
NULL, 0); BUG_ON(ret); } @@ -211,8 +215,11 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) { + if (k.k->p.inode) + break; + BUG_ON(k.k->p.offset != i); i += 2; } @@ -224,11 +231,12 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); - BUG_ON(k.k->p.offset != i++); + i++; if (i == nr * 2) break; } @@ -307,7 +315,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -421,7 +429,7 @@ static void rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_DIRENTS, &k.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); BUG_ON(ret); } @@ -439,7 +447,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(0, test_rand()), 0); k = bch2_btree_iter_peek(iter); @@ -460,7 +468,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(0, test_rand()), 0); k = bch2_btree_iter_peek(iter); @@ -490,7 +498,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, BTREE_ITER_INTENT); ret = PTR_ERR_OR_ZERO(iter); if (ret) @@ -542,7 +550,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; @@ -566,7 +574,7 @@ static void seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ; bch2_trans_exit(&trans); } @@ -580,7 +588,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; @@ -598,7 +606,7 @@ static void seq_delete(struct bch_fs *c, u64 nr) { int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); -- cgit From 7699cdd58727cdb1960e6f2ccd8aeea510a0589b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Mar 2020 17:08:19 -0500 Subject: bcachefs: Fix extent_sort_fix_overlapping() Recently the extent update path started emitting 0 size whiteouts on extent overwrite, as part of transitioning to moving extent handling out of
the core btree code. Unfortunately, this broke the old code path that handles overlapping extents when reading in btree nodes - it relies on sorting incoming extents by start position, but the 0 size whiteouts broke that ordering. Skipping over them before the main algorithm sees them fixes this. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 7cbb57042af1..68965a0f973a 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -311,6 +311,25 @@ static inline int extent_sort_fix_overlapping_cmp(struct btree *b, cmp_int((unsigned long) r, (unsigned long) l); } +/* + * The algorithm in extent_sort_fix_overlapping() relies on keys in the same + * bset being ordered by start offset - but 0 size whiteouts (which are always + * KEY_TYPE_deleted) break this ordering, so we need to skip over them: + */ +static void extent_iter_advance(struct sort_iter *iter, unsigned idx) +{ + struct sort_iter_set *i = iter->data + idx; + + do { + i->k = bkey_next_skip_noops(i->k, i->end); + } while (i->k != i->end && bkey_deleted(i->k)); + + if (i->k == i->end) + array_remove_item(iter->data, iter->used, idx); + else + __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); +} + struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct sort_iter *iter) @@ -323,19 +342,26 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct bkey_s l, r; struct btree_nr_keys nr; struct bkey_on_stack split; + unsigned i; memset(&nr, 0, sizeof(nr)); bkey_on_stack_init(&split); sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + for (i = 0; i < iter->used;) { + if (bkey_deleted(iter->data[i].k)) + __sort_iter_advance(iter, i, + extent_sort_fix_overlapping_cmp); + else + i++; + } while (!sort_iter_end(iter)) { l = __bkey_disassemble(b, _l->k, &l_unpacked); if (iter->used == 1) { extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 0); continue; } @@ -344,15 +370,13 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 0); continue; } /* Skip 0 size keys */ if (!r.k->size) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 1); continue; } @@ -369,8 +393,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, if (_l->k > _r->k) { /* l wins, trim r */ if (bkey_cmp(l.k->p, r.k->p) >= 0) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 1); } else { bch2_cut_front_s(l.k->p, r); extent_save(b, _r->k, r.k); -- cgit From e65fcb4362d5480b6a62be32014c8643f70a8bfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Mar 2020 17:06:15 -0500 Subject: bcachefs: Fix off by one error in bch2_extent_crc_append() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index cff4955d203b..1ac4f0522043 100644 ---
a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -344,7 +344,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, crc_r.uncompressed_size > c->sb.encoded_extent_max) return BCH_MERGE_NOMERGE; - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return BCH_MERGE_NOMERGE; @@ -562,15 +562,15 @@ void bch2_extent_crc_append(struct bkey_i *k, enum bch_extent_entry_type type; if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.uncompressed_size <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc32; else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.uncompressed_size <= CRC64_SIZE_MAX && new.nonce <= CRC64_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc64; else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.uncompressed_size <= CRC128_SIZE_MAX && new.nonce <= CRC128_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc128; else -- cgit From 27beb810235615820584e15afc31e130e90793f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Mar 2020 13:30:55 -0500 Subject: bcachefs: Fix another iterator leak Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1871485c079d..3b9c20cf389a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -424,6 +424,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, return PTR_ERR(iter); bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); return 0; } -- cgit From 24e0c3f8da000a30ab9397af5f977bd4cd0422e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Mar 2020 17:20:39 -0500 Subject: bcachefs: Fix bch2_dump_bset() It's used in the write path when the bset isn't in the btree node buffer. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index b8fe361978ef..fca713fe50fc 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -67,8 +67,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _n = bkey_next_skip_noops(_k, vstruct_last(i)); bch2_bkey_to_text(&PBUF(buf), &k); - printk(KERN_ERR "block %u key %5u: %s\n", set, - __btree_node_key_to_offset(b, _k), buf); + printk(KERN_ERR "block %u key %5zu: %s\n", set, + _k->_data - i->_data, buf); if (_n == vstruct_last(i)) continue; -- cgit From a7b46a3db0931e88f837ae741afe9770c6fede49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Mar 2020 14:19:58 -0400 Subject: bcachefs: Don't log errors that are expected during shutdown Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 ++- fs/bcachefs/io.h | 5 +++-- fs/bcachefs/move.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5f296de282b6..717332072d87 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1222,7 +1222,8 @@ void bch2_write(struct closure *cl) if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - __bcache_io_error(c, "read only"); + if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) + __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index bc9f9fec2fd7..c250bceb77ea 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -35,10 +35,11 @@ enum bch_write_flags { BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), + BCH_WRITE_FROM_INTERNAL = (1 << 9), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dbcda8374692..a9a72963e1b7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -242,7 +242,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED; + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL; m->op.nr_replicas = 1; m->op.nr_replicas_required = 1; -- cgit From f6d0368e06be2c889a4ce1097e20606cededd775 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Mar 2020 16:15:54 -0400 Subject: bcachefs: Traverse iterator in journal replay This fixes a bug where we end up spinning in journal replay - in theory this shouldn't be necessary though, transaction reset should be re-traversing all iterators. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3b9c20cf389a..712a6b1fd968 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -417,15 +417,17 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { struct btree_iter *iter; + int ret; iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); - return 0; + return ret; } static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, -- cgit From 3186c80fe93fd2bae4af68b94bb63403e1180ceb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Mar 2020 18:43:31 -0500 Subject: bcachefs: Skip 0 size deleted extents in journal replay These are created by the new extent update path, but not used yet by the recovery code and they break the existing recovery code, so we can just skip them. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 712a6b1fd968..bd0edda7abf9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -230,7 +230,11 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) goto err; list_for_each_entry(p, journal_entries, list) - for_each_jset_key(k, _n, entry, &p->j) + for_each_jset_key(k, _n, entry, &p->j) { + if (bkey_deleted(&k->k) && + btree_node_type_is_extents(entry->btree_id)) + continue; + keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, .pos = bkey_start_pos(&k->k), @@ -239,8 +243,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) keys.journal_seq_base, .journal_offset = k->_data - p->j._data, }; + } - sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); i = keys.d; while (i < keys.d + keys.nr) { -- cgit From 2dac0eae78f4e3419320cafb3bd0de2a6a4b5dba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 16:17:55 -0500 Subject: bcachefs: Iterator debug code improvements More aggressively checking iterator invariants, and fixing the resulting bugs. Also greatly simplifying iter_next() and iter_next_slot() - they were hyper optimized before, but the optimizations were getting too brittle. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 6 +- fs/bcachefs/btree_iter.c | 216 ++++++++++++++++++------------------ fs/bcachefs/btree_iter.h | 10 +- fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update_interior.c | 4 +- 5 files changed, 120 insertions(+), 119 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index fca713fe50fc..09711352094c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1665,7 +1665,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1700,7 +1701,8 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f745d228d21c..b3f13ed7be00 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -405,23 +405,43 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG -static void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) +static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) { struct bpos pos = btree_iter_search_key(iter); - struct btree_iter_level *l = &iter->l[b->c.level]; + struct btree_iter_level *l = &iter->l[level]; struct btree_node_iter tmp = l->iter; - struct bkey_packed *k; + bool locked = btree_node_locked(iter, level); + struct bkey_packed *p, *k; + char buf1[100], buf2[100]; + const char *msg; if (!debug_check_iterators(iter->trans->c)) return; - if (iter->uptodate > BTREE_ITER_NEED_PEEK) + BUG_ON(iter->level < iter->min_depth); + + if (!btree_iter_node(iter, level)) + return; + + if (!bch2_btree_node_relock(iter, level)) return; - BUG_ON(!btree_iter_pos_in_node(iter, b)); + /* + * Ideally this invariant would always be true, and hopefully in the + * future it will be, but for now set_pos_same_leaf() breaks it: + */ + BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && + !btree_iter_pos_in_node(iter, l->b)); + + /* + * node iterators don't use leaf node iterator: + */ + if (btree_iter_type(iter) == BTREE_ITER_NODES && + level <= iter->min_depth) + goto unlock; - bch2_btree_node_iter_verify(&l->iter, b); + bch2_btree_node_iter_verify(&l->iter, l->b); /* * For interior nodes, the iterator will have skipped past @@ -430,46 +450,72 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - k = b->c.level || btree_node_type_is_extents(iter->btree_id) - ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) - : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); + p = level || btree_node_type_is_extents(iter->btree_id) + ? 
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iterator should be before prev key:\n%s\n%llu:%llu\n", - buf, iter->pos.inode, iter->pos.offset); + if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + msg = "before"; + goto err; } - k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); + if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + msg = "after"; + goto err; + } +unlock: + if (!locked) + btree_node_unlock(iter, level); + return; +err: + strcpy(buf1, "(none)"); + strcpy(buf2, "(none)"); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + bch2_bkey_to_text(&PBUF(buf1), &uk); + } - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iter should be after current key:\n" - "iter pos %llu:%llu\n" - "cur key %s\n", - iter->pos.inode, iter->pos.offset, buf); + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + bch2_bkey_to_text(&PBUF(buf2), &uk); } + + panic("iterator should be %s key at level %u:\n" + "iter pos %s %llu:%llu\n" + "prev key %s\n" + "cur key %s\n", + msg, level, + iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", + iter->pos.inode, iter->pos.offset, + buf1, buf2); } -void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +static void bch2_btree_iter_verify(struct btree_iter *iter) { - struct btree_iter *linked; + unsigned i; - if (!debug_check_iterators(iter->trans->c)) + bch2_btree_trans_verify_locks(iter->trans); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + bch2_btree_iter_verify_level(iter, i); +} + +void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +{ + struct btree_iter *iter; + + if (!debug_check_iterators(trans->c)) return; - trans_for_each_iter_with_node(iter->trans, b, linked) - __bch2_btree_iter_verify(linked, b); + trans_for_each_iter_with_node(trans, b, iter) + bch2_btree_iter_verify_level(iter, b->c.level); } #else -static inline void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned) {} #endif @@ -514,7 +560,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_iter_fix_key_modified(linked, b, where); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->c.level); } } @@ -641,14 +687,16 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, if (node_iter != &iter->l[b->c.level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - bch2_btree_node_iter_verify(node_iter, b); + + if (debug_check_iterators(iter->trans->c)) + bch2_btree_node_iter_verify(node_iter, b); } trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->c.level); } } @@ -1134,9 +1182,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_trans_verify_locks(iter->trans); - if (btree_iter_node(iter, iter->level)) - __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + bch2_btree_iter_verify(iter); return 0; } @@ -1156,12 +1202,10 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, 
enum btree_iter_type type) { EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (btree_node_type_is_extents(iter->btree_id) && - type != BTREE_ITER_NODES)); EBUG_ON(btree_iter_type(iter) != type); - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_level(iter, iter->level); } /* Iterate across nodes (leaf and interior nodes) */ @@ -1189,10 +1233,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree *b; int ret; @@ -1238,7 +1284,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) iter->pos = iter->btree_id == BTREE_ID_INODES ? btree_type_successor(iter->btree_id, iter->pos) : bkey_successor(iter->pos); - iter->level = depth; + iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); @@ -1251,6 +1297,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } @@ -1441,6 +1489,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1450,52 +1500,16 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; - struct bkey_s_c k; + struct bpos next = iter->k.p; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) - return bkey_s_c_null; - - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (bkey_cmp(next, POS_MAX)) + next = btree_type_successor(iter->btree_id, next); - bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + bch2_btree_iter_set_pos(iter, next); - return bch2_btree_iter_peek(iter); - } - - if (unlikely(bkey_deleted(&iter->k))) { - /* - * we're currently pointed at a hole, because previously we were - * iterating over slots: - */ - return bch2_btree_iter_peek(iter); - } - - do { - bch2_btree_node_iter_advance(&l->iter, l->b); - p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - } while (likely(p) && bkey_whiteout(p)); - - if (unlikely(!p)) - return btree_iter_set_pos_to_next_leaf(iter) - ? 
bch2_btree_iter_peek(iter) - : bkey_s_c_null; - - k = __btree_iter_unpack(iter, l, &iter->k, p); - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); - iter->pos = bkey_start_pos(k.k); - return k; + return bch2_btree_iter_peek(iter); } /** @@ -1609,7 +1623,7 @@ recheck: EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1654,7 +1668,7 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1679,7 +1693,7 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1703,28 +1717,10 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - /* XXX directly setting iter->pos is wrong */ - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - - if (unlikely(btree_iter_pos_after_node(iter, iter->l[0].b))) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - return bch2_btree_iter_peek_slot(iter); - } - - btree_iter_advance_to_pos(iter, &iter->l[0], -1); + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - return __bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(iter); } static inline void bch2_btree_iter_init(struct btree_trans *trans, @@ -1746,6 +1742,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; + iter->min_depth = 0; iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; @@ -2011,6 +2008,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, iter->locks_want = locks_want; iter->level = depth; + iter->min_depth = depth; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dd7a5e513dc8..475ea84d8f3d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -96,11 +96,11 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); void bch2_btree_trans_verify_locks(struct btree_trans *); #else -static inline void bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, + struct btree *b) {} static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif @@ -154,7 +154,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); @@ -231,7 +231,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ - (_b) = bch2_btree_iter_next_node(_iter, _depth)) + (_b) = bch2_btree_iter_next_node(_iter)) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ _flags, _b) \ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4636b4fd1222..d1d5385d1eb7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -238,9 +238,10 @@ struct btree_iter { u16 flags; u8 idx; - enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; + enum btree_iter_uptodate uptodate:4; unsigned level:4, + min_depth:4, locks_want:4, nodes_locked:4, nodes_intent_locked:4; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 12ff2aea0d05..c1a4d6559d01 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1557,7 +1557,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, trans_for_each_iter_with_node(iter->trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_iter_verify(iter, b); + bch2_btree_trans_verify_iters(iter->trans, b); } /** @@ -1827,7 +1827,7 @@ retry: bch2_btree_iter_node_replace(iter, n); - bch2_btree_iter_verify(iter, n); + bch2_btree_trans_verify_iters(trans, n); bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); -- cgit From c380123988265fa02c62709c39c702f734d63a7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Mar 2020 21:41:22 -0400 Subject: bcachefs: Simplify bch2_btree_iter_peek_slot() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 76 +++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b3f13ed7be00..1d4611b264f4 100644 --- a/fs/bcachefs/btree_iter.c +++ 
b/fs/bcachefs/btree_iter.c @@ -1595,8 +1595,17 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) struct bkey n; int ret; -recheck: - btree_iter_advance_to_pos(iter, l, -1); + /* keys & holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } /* * iterator is now at the correct position for inserting at iter->pos, @@ -1610,47 +1619,17 @@ recheck: if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { /* - * If there wasn't actually a hole, want the iterator to be - * pointed at the key we found: - * - * XXX: actually, we shouldn't be changing the iterator here: - * the iterator needs to be correct for inserting at iter->pos, - * and there may be whiteouts between iter->pos and what this - * iterator points at: + * We're not setting iter->uptodate because the node iterator + * doesn't necessarily point at the key we're returning: */ - l->iter = node_iter; EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); return k; } - /* - * If we got to the end of the node, check if we need to traverse to the - * next node: - */ - if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - goto recheck; - } - /* hole */ - /* holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - if (!k.k) k.k = &l->b->key.k; @@ -1672,11 +1651,20 @@ recheck: return (struct bkey_s_c) { &iter->k, NULL }; } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); @@ -1697,22 +1685,6 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) return k; } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -{ - int ret; - - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - - if (iter->uptodate == BTREE_ITER_UPTODATE) - return btree_iter_peek_uptodate(iter); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - return __bch2_btree_iter_peek_slot(iter); -} - struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); -- cgit From 2e70ce563432810b5638450cf6bee271a0f248b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 Feb 2020 16:17:55 -0500 Subject: bcachefs: More btree iter invariants Ensure that iter->pos always lies between the start and end of iter->k (the last key returned). Also, bch2_btree_iter_set_pos() now invalidates the key that peek() or next() returned. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 142 ++++++++++++++++++----------------------------- fs/bcachefs/buckets.c | 8 ++- 2 files changed, 59 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1d4611b264f4..6b62ff80ff90 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -659,19 +659,8 @@ fixup_done: if (!b->c.level && node_iter == &iter->l[0].iter && - iter_current_key_modified) { - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(node_iter, b); - - if (likely(k)) { - bkey_disassemble(b, k, &iter->k); - } else { - /* XXX: for extents, calculate size of hole? */ - iter->k.type = KEY_TYPE_deleted; - } - + iter_current_key_modified) btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - } } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -1204,6 +1193,10 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, EBUG_ON(iter->btree_id >= BTREE_ID_NR); EBUG_ON(btree_iter_type(iter) != type); + BUG_ON(type == BTREE_ITER_KEYS && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); + bch2_btree_iter_verify_locks(iter); bch2_btree_iter_verify_level(iter, iter->level); } @@ -1313,7 +1306,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ EBUG_ON(!btree_node_locked(iter, 0)); EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - iter->pos = new_pos; + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); btree_iter_advance_to_pos(iter, l, -1); @@ -1323,9 +1317,14 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) +static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - unsigned l = btree_iter_up_until_good_node(iter, cmp); + unsigned l = iter->level; + + if (!cmp) + goto out; + + l = btree_iter_up_until_good_node(iter, cmp); if (btree_iter_node(iter, l)) { /* @@ -1342,85 +1341,71 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, l); } - - return l; +out: + if (l != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, bool strictly_greater) { struct bpos old = btree_iter_search_key(iter); - unsigned l; int cmp; iter->flags &= ~BTREE_ITER_IS_EXTENTS; iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; - iter->pos = new_pos; - cmp = bkey_cmp(btree_iter_search_key(iter), old); - if (!cmp) - return; + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; - l = btree_iter_pos_changed(iter, cmp); + cmp = bkey_cmp(btree_iter_search_key(iter), old); - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + btree_iter_pos_changed(iter, cmp); } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { int cmp = bkey_cmp(new_pos, iter->pos); - unsigned l; - - if (!cmp) - return; - iter->pos = new_pos; - - l = btree_iter_pos_changed(iter, cmp); + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + btree_iter_pos_changed(iter, cmp); } static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - bkey_init(&iter->k); - iter->k.p = POS_MAX; - return false; - } + ret = bkey_cmp(iter->pos, POS_MAX) != 0; + if (ret) + iter->k.p = iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->pos = btree_type_successor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, 1); - return true; + return ret; } static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->data->min_key; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->data->min_key; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bkey_cmp(iter->pos, POS_MIN)) { - bkey_init(&iter->k); - iter->k.p = POS_MIN; - return false; - } + ret = bkey_cmp(iter->pos, POS_MIN) != 0; + if (ret) + iter->k.p = iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); - iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, -1); - return true; + return ret; } /** @@ -1500,14 +1485,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - struct bpos next = iter->k.p; - - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - - if (bkey_cmp(next, POS_MAX)) - next = btree_type_successor(iter->btree_id, next); + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, next); + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); return bch2_btree_iter_peek(iter); } @@ -1518,6 +1500,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { + struct bpos pos = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; @@ -1534,8 +1517,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); - if (!k.k || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) k = __btree_iter_prev(iter, l); if (likely(k.k)) @@ -1545,7 +1527,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_null; } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); iter->pos = 
bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; @@ -1557,33 +1539,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k; + struct bpos pos = bkey_start_pos(&iter->k); bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->pos = btree_type_predecessor(iter->btree_id, - iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - return bch2_btree_iter_peek_prev(iter); - } + if (unlikely(!bkey_cmp(pos, POS_MIN))) + return bkey_s_c_null; - k = __btree_iter_prev(iter, l); - if (unlikely(!k.k)) - return btree_iter_set_pos_to_prev_leaf(iter) - ? bch2_btree_iter_peek(iter) - : bkey_s_c_null; + bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); - iter->pos = bkey_start_pos(k.k); - return k; + return bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c @@ -1687,7 +1652,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; bch2_btree_iter_set_pos(iter, btree_type_successor(iter->btree_id, iter->k.p)); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7e0412dac5ff..7b0f0583b1a5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1671,8 +1671,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, k.k->p.offset > idx + sectors)) goto out; - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + sectors = k.k->p.offset - idx; r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -1689,9 +1688,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&r_v->k, 0); } + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + bch2_trans_update(trans, iter, &r_v->k_i, 0); out: - ret = k.k->p.offset - idx; + ret = sectors; err: bch2_trans_iter_put(trans, iter); return ret; -- cgit From 7d6f9b6409ef8c587d7395ba6c5674cae878c886 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Mar 2020 16:15:08 -0400 Subject: bcachefs: Fix build when CONFIG_BCACHEFS_DEBUG=n Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6b62ff80ff90..6daca5afb486 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -261,7 +261,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify_locks(struct btree_iter *iter) +static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; @@ -282,6 +282,8 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } +#else +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten @@ -515,7 +517,8 @@ void 
bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) #else -static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned) {} +static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} #endif -- cgit From 57b0b3db475de6b724e4db3b827c00484cdde642 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Mar 2020 18:44:59 -0500 Subject: bcachefs: btree_iter_peek_with_updates() Introduce a new iterator method that provides a consistent view of the btree plus uncommitted updates. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_iter.h | 7 ++-- 2 files changed, 87 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6daca5afb486..347477c62779 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update.h" #include "debug.h" #include "extents.h" #include "trace.h" @@ -1497,6 +1498,88 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } +static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) +{ + struct bpos pos = btree_iter_search_key(iter); + struct btree_trans *trans = iter->trans; + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) + break; + + return i < trans->updates + trans->nr_updates && + iter->btree_id == i->iter->btree_id + ? bkey_i_to_s_c(i->k) + : bkey_s_c_null; +} + +static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k = __btree_iter_peek(iter, l); + struct bkey_s_c u = __btree_trans_updates_peek(iter); + + if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) + return k; + if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { + iter->k = *u.k; + return u; + } + return bkey_s_c_null; +} + +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + k = __bch2_btree_iter_peek_with_updates(iter); + + if (k.k && bkey_deleted(k.k)) { + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); + continue; + } + + if (likely(k.k)) + break; + + if (!btree_iter_set_pos_to_next_leaf(iter)) + return bkey_s_c_null; + } + + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) +{ + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); + + return bch2_btree_iter_peek_with_updates(iter); +} + /** * bch2_btree_iter_peek_prev: returns first key less than or equal to * iterator's current position diff --git a/fs/bcachefs/btree_iter.h 
b/fs/bcachefs/btree_iter.h index 475ea84d8f3d..1177bf118dbc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -159,6 +159,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -199,9 +202,7 @@ static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) { - if (id != r->btree_id) - return id < r->btree_id ? -1 : 1; - return bkey_cmp(pos, r->pos); + return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); } static inline int btree_iter_cmp(const struct btree_iter *l, -- cgit From e3e464ac6d09269b19cea3dc32b626db44d0e6ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2019 14:37:25 -0500 Subject: bcachefs: Move extent overwrite handling out of core btree code Ever since the btree code was first written, overwriting existing extents - including partially overwriting and splitting existing extents - was handled as part of the core btree insert path. The modern transaction and iterator infrastructure didn't exist then, so that was the only way for it to be done. This patch moves that outside of the core btree code to a pass that runs at transaction commit time. This is a significant simplification to the btree code and overall reduction in code size, but more importantly it gets us much closer to the core btree code being completely independent of extents and is important prep work for snapshots. This introduces a new feature bit; the old and new extent update models are incompatible when the filesystem needs journal replay.
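In rough outline, each extent update is now flattened into plain key updates just before the transaction is committed: any existing extents overlapping the update are first trimmed or whited out, and the update itself then goes in as an ordinary non-overlapping key. A simplified sketch of that commit-time pass (error handling and the batching of adjacent updates omitted; the helpers named here are the ones added by this patch):

	trans_for_each_update(trans, i)
		if (i->iter->flags & BTREE_ITER_IS_EXTENTS)
			/* emit trimmed copies/whiteouts of overlapping extents: */
			extent_handle_overwrites(trans, i->iter->btree_id,
						 bkey_start_pos(&i->k->k),
						 i->k->k.p);

	trans_for_each_update(trans, i)
		if (i->iter->flags & BTREE_ITER_IS_EXTENTS)
			/* the insert itself becomes a plain key update: */
			extent_update_to_keys(trans, i->iter, i->k);
		else
			bch2_trans_update2(trans, i->iter, i->k);
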
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 6 +- fs/bcachefs/btree_gc.c | 57 ++--- fs/bcachefs/btree_io.c | 17 +- fs/bcachefs/btree_iter.c | 25 ++- fs/bcachefs/btree_types.h | 3 + fs/bcachefs/btree_update.h | 5 + fs/bcachefs/btree_update_interior.h | 23 +- fs/bcachefs/btree_update_leaf.c | 228 ++++++++++++++++---- fs/bcachefs/buckets.c | 13 +- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/extent_update.c | 410 +++--------------------------------- fs/bcachefs/extent_update.h | 5 +- fs/bcachefs/fsck.c | 56 +++++ fs/bcachefs/recovery.c | 154 +++++--------- fs/bcachefs/recovery.h | 2 - 15 files changed, 404 insertions(+), 602 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d1c0a5d5580e..1ad5ff449a5b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(inline_data, 8) \ x(new_extent_overwrite, 9) \ x(incompressible, 10) \ - x(btree_ptr_v2, 11) + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)) + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a5fe3b316e06..f85fbc057fb3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -186,8 +186,16 @@ fsck_err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - u8 *max_stale, bool initial) +static bool pos_in_journal_keys(struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + struct journal_key *k = journal_key_search(journal_keys, id, pos); + + return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); +} + +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + struct journal_keys *journal_keys, bool initial) { struct btree_node_iter iter; struct bkey unpacked; @@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + if (!b->c.level && journal_keys && + pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p)) + continue; + bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); @@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct btree_trans trans; @@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, initial); + ret = btree_gc_mark_node(c, b, &max_stale, + journal_keys, initial); if (ret) break; @@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int mark_journal_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *insert) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u8 max_stale; - int ret = 0; - - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); - if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k, 
ret) { - percpu_down_read(&c->mark_lock); - ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BTREE_TRIGGER_GC| - BTREE_TRIGGER_NOATOMIC); - percpu_up_read(&c->mark_lock); - - if (!ret) - break; - } - - return bch2_trans_exit(&trans) ?: ret; -} - static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { @@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, enum btree_id id = ids[i]; enum btree_node_type type = __btree_node_type(0, id); - int ret = bch2_gc_btree(c, id, initial, metadata_only); + int ret = bch2_gc_btree(c, id, journal_keys, + initial, metadata_only); if (ret) return ret; if (journal_keys && !metadata_only && btree_node_type_needs_gc(type)) { struct journal_key *j; + u8 max_stale; int ret; for_each_journal_key(*journal_keys, j) if (j->btree_id == id) { - ret = mark_journal_key(c, id, j->k); + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), + &max_stale, initial); if (ret) return ret; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a4732bf13a11..d0b761417903 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, unsigned *whiteout_u64s, int write, bool have_retry) { - struct bkey_packed *k; - struct bkey prev = KEY(0, 0, 0); - struct bpos prev_data = POS_MIN; + struct bkey_packed *k, *prev = NULL; bool seen_non_whiteout = false; unsigned version; const char *err; @@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { + (prev && bkey_iter_cmp(b, prev, k) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || - bkey_cmp(prev.p, u.k->p) > 0) { + } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf1), &up); bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(b, i, 0); @@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX: repair this */ } - if (!bkey_deleted(u.k)) - prev_data = u.k->p; - prev = *u.k; - + prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 347477c62779..5f918c6c3efb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_insert_entry *i; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: bkey_cmp(pos, i->k->k.p)) <= 0) break; - return i < trans->updates + trans->nr_updates && + return i < trans->updates2 + trans->nr_updates2 && iter->btree_id == i->iter->btree_id ? 
bkey_i_to_s_c(i->k) : bkey_s_c_null; @@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *p, *new_iters, *new_updates, *new_updates2; size_t iters_bytes; size_t updates_bytes; @@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, iters_bytes = sizeof(struct btree_iter) * new_size; updates_bytes = sizeof(struct btree_insert_entry) * new_size; - new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); - if (new_iters) + p = kmalloc(iters_bytes + + updates_bytes + + updates_bytes, GFP_NOFS); + if (p) goto success; - new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); new_size = BTREE_ITER_MAX; trans->used_mempool = true; success: - new_updates = new_iters + iters_bytes; + new_iters = p; p += iters_bytes; + new_updates = p; p += updates_bytes; + new_updates2 = p; p += updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); memcpy(new_updates, trans->updates, sizeof(struct btree_insert_entry) * trans->nr_updates); + memcpy(new_updates2, trans->updates2, + sizeof(struct btree_insert_entry) * trans->nr_updates2); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) memset(trans->iters, POISON_FREE, @@ -1865,6 +1871,7 @@ success: trans->iters = new_iters; trans->updates = new_updates; + trans->updates2 = new_updates2; trans->size = new_size; if (trans->iters_live) { @@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->need_reset = 0; trans->nr_updates = 0; + trans->nr_updates2 = 0; trans->mem_top = 0; if (trans->fs_usage_deltas) { @@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates2 = trans->updates2_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(u8) * nr); + sizeof(struct btree_insert_entry) * nr); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d1d5385d1eb7..fdfa7a265850 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -283,6 +283,7 @@ struct btree_trans { u8 nr_iters; u8 nr_updates; + u8 nr_updates2; u8 size; unsigned used_mempool:1; unsigned error:1; @@ -295,6 +296,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + struct btree_insert_entry *updates2; /* update path: */ struct journal_res journal_res; @@ -308,6 +310,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[2]; + struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d1cd839ac08f..12127a33906b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_update2(_trans, _i) \ + for ((_i) = (_trans)->updates2; \ + (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ + (_i)++) + #endif /* 
_BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e3204f32cc68..f6aceed89427 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, } static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bkey_packed *k) + struct bpos pos) { - unsigned u64s = bkeyp_key_u64s(&b->format, k); - struct bkey_packed *dst; + struct bkey_packed k; - BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); - b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); - dst = unwritten_whiteouts_start(c, b); - memcpy_u64s(dst, k, u64s); - dst->u64s = u64s; - dst->type = KEY_TYPE_deleted; + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + bkey_init(u); + u->p = pos; + } + + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; + bkey_copy(unwritten_whiteouts_start(c, b), &k); } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 94418c9b42e8..f0efc52c7590 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -23,11 +23,10 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates && + return i != trans->updates2 && i[0].iter->l[0].b == i[-1].iter->l[0].b; } - inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_packed(b, k, &insert->k)) @@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, k); + push_whiteout(iter->trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans, set_btree_node_dirty(b); } -static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, l->b)); - - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) - bch2_btree_journal_key(trans, iter, insert); -} - /** * btree_insert_key - insert a key one key into a leaf node */ @@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + EBUG_ON(iter->level); + insert->k.needs_whiteout = false; - if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, iter, insert); - else - bch2_insert_fixup_extent(trans, iter, insert); + if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) + bch2_btree_journal_key(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans 
*trans, struct bch_fs *c = trans->c; BUG_ON(iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); - EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); - + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&insert->k) && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); } @@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; + /* + * old bch2_extent_sort_fix_overlapping() algorithm won't work with new + * style extent updates: + */ + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + ret = !btree_node_is_extents(b) ? BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert); if (ret) return ret; @@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update(trans, i) { + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (inject_invalid_keys(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { @@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) BUG_ON(!btree_node_intent_locked(i->iter, 0)); ret = bch2_journal_preres_get(&trans->c->journal, @@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, i->iter->l[0].b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, i->iter); @@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); trans->nounlock = false; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) bch2_btree_iter_downgrade(i->iter); return 0; @@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_insert_entry *i, n = (struct 
btree_insert_entry) { + .iter = iter, .k = insert + }; + + btree_insert_entry_checks(trans, n.iter, n.k); + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { + if (btree_iter_cmp(n.iter, i->iter) == 0) { + *i = n; + return; + } + + if (btree_iter_cmp(n.iter, i->iter) <= 0) + break; + } + + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +} + +static int extent_update_to_keys(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert) +{ + struct btree_iter *iter; + + if (bkey_deleted(&insert->k)) + return 0; + + iter = bch2_trans_copy_iter(trans, orig_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); + bch2_trans_update2(trans, iter, insert); + bch2_trans_iter_put(trans, iter); + return 0; +} + +static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos start, struct bpos end) +{ + struct btree_iter *iter = NULL, *update_iter; + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { + if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_back(start, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + if (bkey_cmp(k.k->p, end) > 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(end, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } else { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + update->k = *k.k; + set_bkey_val_u64s(&update->k, 0); + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + k = bch2_btree_iter_next_with_updates(iter); + } +err: + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; @@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while (trans_trigger_run); + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + struct bpos start = bkey_start_pos(&i->k->k); + + while (i + 1 < trans->updates + trans->nr_updates && + 
i[0].iter->btree_id == i[1].iter->btree_id && + !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) + i++; + + ret = extent_handle_overwrites(trans, i->iter->btree_id, + start, i->k->k.p); + if (ret) + goto out; + } + trans_for_each_update(trans, i) { + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + ret = extent_update_to_keys(trans, i->iter, i->k); + if (ret) + goto out; + } else { + bch2_trans_update2(trans, i->iter, i->k); + } + } + + trans_for_each_update2(trans, i) { + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(i->iter->locks_want < 1); + u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; @@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + EBUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7b0f0583b1a5..cd54c2b1eff2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, struct bch_fs_usage *fs_usage, - unsigned flags) + unsigned flags, + bool is_extents) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; unsigned offset = 0; - s64 sectors = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; - if (btree_node_is_extents(b) + if (is_extents ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) return 0; - if (btree_node_is_extents(b)) { + if (is_extents) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; @@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags); + fs_usage, flags, + btree_node_type_is_extents(iter->btree_id)); if (ret <= 0) break; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4c84787575f5..29ebc07a2497 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned); + struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 846d77dc2530..fa6c0698f385 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans, { int ret = 0; + /* + * The extent update path requires an _additional_ iterator for each + * extent we're inserting and overwriting: + */ + *nr_iters += 1; + switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: @@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *_k; + struct bkey_s_c k; struct bkey unpacked; int sectors; - 
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - /* - * If we're overwriting an existing extent, we may need to emit - * a whiteout - unless we're inserting a new extent at the same - * position: - */ - if (k.k->needs_whiteout && - (!bkey_whiteout(&insert->k) || - bkey_cmp(k.k->p, insert->k.p))) - *u64s += BKEY_U64s; - - /* - * If we're partially overwriting an existing extent which has - * been written out to disk, we'll need to emit a new version of - * that extent: - */ - if (bkey_written(l->b, _k) && - overlap != BCH_EXTENT_OVERLAP_ALL) - *u64s += _k->u64s; - - /* And we may be splitting an existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void pack_push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) -{ - struct bkey_packed k; - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = pos; - bkey_copy(&k, &tmp); - } - - k.needs_whiteout = true; - push_whiteout(c, b, &k); -} - -static void -extent_drop(struct bch_fs *c, struct btree_iter 
*iter, - struct bkey_packed *_k, struct bkey_s k) -{ - struct btree_iter_level *l = &iter->l[0]; - - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; - - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } -} - -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_on_stack tmp, split; - - bkey_on_stack_init(&tmp); - bkey_on_stack_init(&split); - - if (!btree_node_old_extent_overwrite(l->b)) { - if (!bkey_whiteout(&insert->k) && - !bkey_cmp(k.k->p, insert->k.p)) { - insert->k.needs_whiteout = k.k->needs_whiteout; - k.k->needs_whiteout = false; - } - } else { - insert->k.needs_whiteout |= k.k->needs_whiteout; - } - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - /* - * needs_whiteout was propagated to new version of @k, - * @tmp: - */ - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, k)); - - extent_save(l->b, _k, k.k); - /* - * No need to call bset_fix_invalidated_key, start of - * extent changed but extents are indexed by where they - * end - */ - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - break; - case BCH_EXTENT_OVERLAP_BACK: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); - - /* - * @tmp has different position than @k, needs_whiteout - * should not be propagated: - */ - if (!btree_node_old_extent_overwrite(l->b)) - tmp.k->k.needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - /* - * position of @k is changing, emit a whiteout if - * needs_whiteout is set: - */ - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - btree_keys_account_val_delta(l->b, _k, - bch2_cut_back_s(bkey_start_pos(&insert->k), k)); - extent_save(l->b, _k, k.k); - - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - case BCH_EXTENT_OVERLAP_ALL: - extent_drop(c, iter, _k, k); - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - bkey_on_stack_reassemble(&split, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), split.k); - - if (!btree_node_old_extent_overwrite(l->b)) - split.k->k.needs_whiteout = false; - - /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, 
k)); - - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - extent_bset_insert(c, iter, split.k); - break; - } - - bkey_on_stack_exit(&split, c); - bkey_on_stack_exit(&tmp, c); -} + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool do_update = !bkey_whiteout(&insert->k); - struct bkey_packed *_k; - struct bkey unpacked; + k = bkey_disassemble(l->b, _k, &unpacked); - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + /* Check if we're splitting a compressed extent: */ - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); + if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && + bkey_cmp(insert->k.p, k.k->p) < 0 && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + switch (bch2_disk_reservation_add(trans->c, trans->disk_res, + sectors, flags)) { + case 0: break; - - if (!bkey_whiteout(k.k)) - do_update = true; - - if (!do_update) { - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - - bch2_cut_front(cur_end, insert); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - } else { - extent_squash(c, iter, insert, _k, k, overlap); + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); } - - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; } - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (do_update) { - if (insert->k.type == KEY_TYPE_deleted) - insert->k.type = KEY_TYPE_discard; - - if (!bkey_whiteout(&insert->k) || - btree_node_old_extent_overwrite(l->b)) - extent_bset_insert(c, iter, insert); - - bch2_btree_journal_key(trans, iter, insert); - } - - bch2_cut_front(insert->k.p, insert); + return BTREE_INSERT_OK; } diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index e9dc8091ba3f..38dc084627d2 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, - struct bkey_i *, unsigned *); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_iter *, - struct bkey_i *); + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eca723121a2c..902c8da9dc15 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) POS(inode_nr + 1, 0), NULL); } +static int bch2_fix_overlapping_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, struct bpos cut_at) +{ + struct btree_iter *u_iter; + struct bkey_i *u; + int ret; + + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(u, k); + bch2_cut_front(cut_at, u); + + u_iter = bch2_trans_copy_iter(trans, iter); + ret = PTR_ERR_OR_ZERO(u_iter); + if (ret) + return ret; + + /* + * We don't want to go through the + * extent_handle_overwrites path: + */ + __bch2_btree_iter_set_pos(u_iter, u->k.p, false); + + /* + * XXX: this is going to leave disk space + * accounting slightly wrong + */ + ret = bch2_trans_update(trans, u_iter, u, 0); + bch2_trans_iter_put(trans, u_iter); + return ret; +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct bkey prev = KEY(0, 0, 0); u64 i_sectors; int ret = 0; @@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c) POS(BCACHEFS_ROOT_INO, 0), 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { + if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf2), k.k); + + if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_fix_overlapping_extent(&trans, + iter, k, prev.p)); + if (ret) + goto err; + } + 
} + prev = *k.k; + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bd0edda7abf9..27378cc9cdd5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list) } } +/* + * When keys compare equal, oldest compares first: + */ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; return cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos) ?: + bkey_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos); -} - -static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) -{ - while (i + 1 < keys->d + keys->nr && - journal_sort_key_cmp(i, i + 1) > 0) { - swap(i[0], i[1]); - i++; - } + bkey_cmp(l->k->k.p, r->k->k.p); } static void journal_keys_free(struct journal_keys *keys) { - struct journal_key *i; - - for_each_journal_key(*keys, i) - if (i->allocated) - kfree(i->k); kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) struct journal_replay *p; struct jset_entry *entry; struct bkey_i *k, *_n; - struct journal_keys keys = { NULL }, keys_deduped = { NULL }; - struct journal_key *i; + struct journal_keys keys = { NULL }; + struct journal_key *src, *dst; size_t nr_keys = 0; list_for_each_entry(p, journal_entries, list) for_each_jset_key(k, _n, entry, &p->j) nr_keys++; - keys.journal_seq_base = keys_deduped.journal_seq_base = + keys.journal_seq_base = le64_to_cpu(list_first_entry(journal_entries, struct journal_replay, list)->j.seq); @@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (!keys.d) goto err; - keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); - if (!keys_deduped.d) - goto err; - list_for_each_entry(p, journal_entries, list) - for_each_jset_key(k, _n, entry, &p->j) { - if (bkey_deleted(&k->k) && - btree_node_type_is_extents(entry->btree_id)) - continue; - + for_each_jset_key(k, _n, entry, &p->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, - .pos = bkey_start_pos(&k->k), .k = k, .journal_seq = le64_to_cpu(p->j.seq) - keys.journal_seq_base, .journal_offset = k->_data - p->j._data, }; - } sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); - i = keys.d; - while (i < keys.d + keys.nr) { - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - !bkey_cmp(i[0].pos, i[1].pos)) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - i++; - } else { - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - } - continue; - } - - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { - if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: - cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); - } else { - struct bkey_i *split = - kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); - - if (!split) - goto err; - - bkey_copy(split, i[0].k); - 
bch2_cut_back(bkey_start_pos(&i[1].k->k), split); - keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { - .btree_id = i[0].btree_id, - .allocated = true, - .pos = bkey_start_pos(&split->k), - .k = split, - .journal_seq = i[0].journal_seq, - .journal_offset = i[0].journal_offset, - }; - - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - continue; - } - } else { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { - i[1] = i[0]; - i++; - continue; - } else { - bch2_cut_front(i[0].k->k.p, i[1].k); - i[1].pos = i[0].k->k.p; - journal_keys_sift(&keys, i + 1); - continue; - } - } - } + src = dst = keys.d; + while (src < keys.d + keys.nr) { + while (src + 1 < keys.d + keys.nr && + src[0].btree_id == src[1].btree_id && + !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + src++; - keys_deduped.d[keys_deduped.nr++] = *i++; + *dst++ = *src++; } - kvfree(keys.d); - return keys_deduped; + keys.nr = dst - keys.d; err: - journal_keys_free(&keys_deduped); - kvfree(keys.d); - return (struct journal_keys) { NULL }; + return keys; } /* journal replay: */ @@ -365,11 +289,6 @@ retry: atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ret = PTR_ERR_OR_ZERO(split); if (ret) @@ -388,12 +307,25 @@ retry: } bkey_copy(split, k); - bch2_cut_front(split_iter->pos, split); + bch2_cut_front(iter->pos, split); bch2_cut_back(atomic_end, split); + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; + + /* + * It's important that we don't go through the + * extent_handle_overwrites() and extent_update_to_keys() path + * here: journal replay is supposed to treat extents like + * regular keys + */ + __bch2_btree_iter_set_pos(split_iter, split->k.p, false); bch2_trans_update(&trans, split_iter, split, !remark ? 
BTREE_TRIGGER_NORUN : BTREE_TRIGGER_NOOVERWRITES); + bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); @@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, struct btree_iter *iter; int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); + /* + * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run + * extent_handle_overwrites() and extent_update_to_keys() - but we don't + * want that here, journal replay is supposed to treat extents like + * regular keys: + */ + __bch2_btree_iter_set_pos(iter, k->k.p, false); + ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); @@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - else if (btree_node_type_is_extents(i->btree_id)) + else if (i->k->k.size) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else ret = bch2_journal_replay_key(c, i->btree_id, i->k); @@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c) journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + ret = journal_replay_early(c, clean, &journal_entries); if (ret) goto err; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index ccd84a8fe60d..c91309301563 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -5,8 +5,6 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; - unsigned allocated:1; - struct bpos pos; struct bkey_i *k; u32 journal_seq; u32 journal_offset; -- cgit From 511ed5bf7626ecbba679d7a4c19d3f26685fd431 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Mar 2020 22:41:10 -0400 Subject: bcachefs: Drop unused export Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 -- fs/bcachefs/btree_update_leaf.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 12127a33906b..9f58d47ef5d6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -12,8 +12,6 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); -void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, - struct bkey_i *); enum btree_insert_flags { __BTREE_INSERT_NOUNLOCK, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f0efc52c7590..a5362ecb4f5d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -166,9 +166,9 @@ static inline void __btree_journal_key(struct btree_trans *trans, *trans->journal_seq = seq; } -void bch2_btree_journal_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) +static void bch2_btree_journal_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; struct 
journal *j = &c->journal; -- cgit From 286d8ad040ddb9a496ac4a8551d72b827e604243 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Mar 2020 14:49:52 -0400 Subject: bcachefs: Fix a use after free in dio write path Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4a20bb11151c..726c55072b7b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1787,7 +1787,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio_vec *bv; unsigned unaligned; u64 new_i_size; - bool sync; + bool sync = dio->sync; long ret; if (dio->loop) @@ -1830,7 +1830,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (!dio->sync && !dio->loop && dio->iter.count) { if (bch2_dio_write_copy_iov(dio)) { - dio->sync = true; + dio->sync = sync = true; goto do_io; } } @@ -1838,7 +1838,7 @@ do_io: dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (dio->sync) + if (sync) wait_for_completion(&dio->done); else return -EIOCBQUEUED; @@ -1872,7 +1872,6 @@ err: if (dio->free_iov) kfree(dio->iter.__iov); - sync = dio->sync; bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ -- cgit From 19f24758ef17f4a73c84a507e6433777c0726c3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Mar 2020 15:49:23 -0400 Subject: bcachefs: Don't use peek_filter() unnecessarily Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 9 +++------ fs/bcachefs/extent_update.c | 6 ++---- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index cd54c2b1eff2..22e30ed716c4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1335,8 +1335,7 @@ int bch2_mark_update(struct btree_trans *trans, !bkey_deleted(&insert->k)) return 0; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); @@ -1382,8 +1381,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, pr_err("overlapping with"); node_iter = iter->l[0].iter; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k; @@ -1795,8 +1793,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k; unsigned offset = 0; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index fa6c0698f385..beb3b694e33c 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -120,8 +120,7 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); unsigned offset = 0; @@ -182,8 +181,7 @@ bch2_extent_can_insert(struct btree_trans *trans, struct bkey unpacked; int sectors; - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, 
- KEY_TYPE_discard); + _k = bch2_btree_node_iter_peek(&node_iter, l->b); if (!_k) return BTREE_INSERT_OK; -- cgit From 716254b8a1af9b687547f60b9dc8f925237654d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Mar 2020 15:48:58 -0400 Subject: bcachefs: Fix another iterator leak This updates bch2_rbio_narrow_crcs() to the current style for transactional btree code, and fixes a rare panic on iterator overflow. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 60 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 717332072d87..2ec7203e5824 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1701,33 +1701,39 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } -static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_on_stack new; - struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; - int ret; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter *iter = NULL; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; if (crc_is_compressed(rbio->pick.crc)) - return; - - bkey_on_stack_init(&new); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); + return 0; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = PTR_ERR_OR_ZERO(iter))) + goto out; + k = bch2_btree_iter_peek_slot(iter); - if (IS_ERR_OR_NULL(k.k)) + if ((ret = bkey_err(k))) + goto out; + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + BKEY_EXTENT_U64s_MAX * 8); + if ((ret = PTR_ERR_OR_ZERO(new))) goto out; - bkey_on_stack_reassemble(&new, c, k); - k = bkey_i_to_s_c(new.k); + bkey_reassemble(new, k); + k = bkey_i_to_s_c(new); if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) @@ -1743,21 +1749,23 @@ retry: bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; goto out; } - if (!bch2_bkey_narrow_crcs(new.k, new_crc)) + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - bch2_trans_update(&trans, iter, new.k, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT); - if (ret == -EINTR) - goto retry; + bch2_trans_update(trans, iter, new, 0); out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&new, c); + bch2_trans_iter_put(trans, iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); } /* Inner part that may run in process context */ -- cgit From 5d548743bddf7353cfedde0b78cfc9330031a697 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Mar 2020 17:23:37 -0400 Subject: bcachefs: Clear BCH_FEATURE_extents_above_btree_updates on clean shutdown This is needed so that users can roll back to before "d9bb516b2d bcachefs: Move extent overwrite handling out of core btree code", 
which it appears may still be buggy. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 -- fs/bcachefs/super-io.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 27378cc9cdd5..02b381cb567b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -805,8 +805,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; - ret = journal_replay_early(c, clean, &journal_entries); if (ret) goto err; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c1e8571d872..2db88afd6bda 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -956,6 +956,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1087,6 +1088,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -- cgit From 97328a1a3c95ec90bed0221719ce1ba64544f6a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Mar 2020 11:40:07 -0400 Subject: bcachefs: BCH_FEATURE_new_extent_overwrite is now required The patch "bcachefs: Move extent overwrite handling out of core btree code" should have been flipping on this feature bit; extent btree nodes in the old format have to be rewritten before we can insert into them with the new extent update path. Not turning on this feature bit was causing us to go into an infinite loop where we keep rewriting btree nodes over and over. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2db88afd6bda..b50f85d1b057 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -956,6 +956,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From 6d61724b2ba1836e4e1f5f8755cb2278d4eae1a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Mar 2020 11:46:46 -0400 Subject: bcachefs: Shut down quicker Internal writes (i.e. copygc/rebalance operations) shouldn't be blocking on the allocator when we're going RO. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 2ec7203e5824..3dcb166afa23 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1070,6 +1070,12 @@ again: BKEY_EXTENT_U64s_MAX)) goto flush_io; + if ((op->flags & BCH_WRITE_FROM_INTERNAL) && + percpu_ref_is_dying(&c->writes)) { + ret = -EROFS; + goto err; + } + wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, -- cgit From 8666a9ad6facc153d143728c8b47aae1e8111cd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Mar 2020 13:40:28 -0400 Subject: bcachefs: Fix an iterator bug We were incorrectly not restarting the transaction when re-traversing iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5f918c6c3efb..8b1395ef4d0e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1068,7 +1068,14 @@ retry_all: goto retry_all; } - ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0; + if (hweight64(trans->iters_live) > 1) + ret = -EINTR; + else + trans_for_each_iter(trans, iter) + if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { + ret = -EINTR; + break; + } out: bch2_btree_cache_cannibalize_unlock(c); return ret; -- cgit From fa4dc3987b8e75ec1bfd327bb05755d153c276d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Mar 2020 14:08:01 -0400 Subject: bcachefs: Fix count_iters_for_insert() This fixes a transaction iterator overflow. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index beb3b694e33c..8e5070d5a39b 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans, * extent we're inserting and overwriting: */ *nr_iters += 1; + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } switch (k.k->type) { case KEY_TYPE_extent: -- cgit From 0728eed7b6ec673d2b6e0f86b6daf240a2948292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Mar 2020 14:47:00 -0400 Subject: bcachefs: Fix a locking bug in fsck This works around a btree locking issue - we can't be holding read locks while taking write locks, which currently means we can't have live iterators holding read locks at commit time. 
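A minimal sketch of the rule the one-line change below enforces, using only identifiers that already appear in this series (the comment is editorial):

        /*
         * An iterator that is still live when the transaction commits must
         * hold an intent lock rather than a read lock: commit takes write
         * locks on the nodes it updates, and a write lock cannot be taken
         * while the same transaction still holds a read lock on that node.
         */
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   POS(BCACHEFS_ROOT_INO, 0),
                                   BTREE_ITER_INTENT);   /* was: flags of 0 */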
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 902c8da9dc15..936e6366cb04 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c) bch_verbose(c, "checking extents"); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(BCACHEFS_ROOT_INO, 0), 0); + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT); retry: for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { -- cgit From 47143a75e01354ee0daef6667cbe7b08bd89ed84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Mar 2020 17:00:48 -0400 Subject: bcachefs: Disable extent merging Extent merging is currently broken, and will be reimplemented differently soon - right now it only happens when btree nodes are being compacted, which makes it difficult to test. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 3 +-- fs/bcachefs/bkey_sort.c | 64 ++++++++++++++----------------------------------- 2 files changed, 19 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index aa729347e448..98733363c161 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -408,9 +408,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format, static inline void bkey_reassemble(struct bkey_i *dst, struct bkey_s_c src) { - BUG_ON(bkey_packed(src.k)); dst->k = *src.k; - memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); + memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } #define bkey_s_null ((struct bkey_s) { .k = NULL }) diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 68965a0f973a..839e78d1dc35 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -static void extent_sort_advance_prev(struct bkey_format *f, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev) -{ - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; - } -} - static void extent_sort_append(struct bch_fs *c, struct bkey_format *f, struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, + struct bkey_packed **out, struct bkey_s k) { - if (bkey_whiteout(k.k)) - return; - - /* - * prev is always unpacked, for key merging - until right before we - * advance it: - */ + if (!bkey_whiteout(k.k)) { + if (!bch2_bkey_pack_key(*out, k.k, f)) + memcpy_u64s_small(*out, k.k, BKEY_U64s); - if (*prev && - bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == - BCH_MERGE_MERGE) - return; + memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); - extent_sort_advance_prev(f, nr, start, prev); - - bkey_reassemble((void *) *prev, k.s_c); + btree_keys_account_key_add(nr, 0, *out); + *out = bkey_next(*out); + } } /* Sort + repack in a new format: */ @@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -/* Sort, repack, and merge: */ +/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ struct btree_nr_keys bch2_sort_repack_merge(struct bch_fs *c, struct bset *dst, struct btree *src, @@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_format 
*out_f, bool filter_whiteouts) { - struct bkey_packed *prev = NULL, *k_packed; + struct bkey_packed *out = vstruct_last(dst), *k_packed; struct bkey_on_stack k; struct btree_nr_keys nr; @@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c, bch2_bkey_normalize(c, bkey_i_to_s(k.k))) continue; - extent_sort_append(c, out_f, &nr, vstruct_last(dst), - &prev, bkey_i_to_s(k.k)); + extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); } - extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); bkey_on_stack_exit(&k, c); return nr; } @@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct btree *b = iter->b; struct bkey_format *f = &b->format; struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; - struct bkey_packed *prev = NULL; + struct bkey_packed *out = dst->start; struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; @@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, l = __bkey_disassemble(b, _l->k, &l_unpacked); if (iter->used == 1) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); + extent_sort_append(c, f, &nr, &out, l); extent_iter_advance(iter, 0); continue; } @@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); + extent_sort_append(c, f, &nr, &out, l); extent_iter_advance(iter, 0); continue; } @@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, __sort_iter_sift(iter, 0, extent_sort_fix_overlapping_cmp); - extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(split.k)); + extent_sort_append(c, f, &nr, &out, + bkey_i_to_s(split.k)); } else { bch2_cut_back_s(bkey_start_pos(r.k), l); extent_save(b, _l->k, l.k); } } - extent_sort_advance_prev(f, &nr, dst->start, &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); bkey_on_stack_exit(&split, c); return nr; -- cgit From e62d65f2fbc3cb89ffd273ec0931ff32b778ef8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Mar 2020 23:29:43 -0400 Subject: bcachefs: trans_commit() path can now insert to interior nodes This will be needed for the upcoming patches to journal updates to interior btree nodes. 
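Before the diff, a short editorial sketch of the merged-iteration idea this patch introduces for initial GC: keys from the btree node itself and keys destined for it from the journal are walked together in key order, and where both have a key at the same position the journal's copy is the one returned (see bch2_btree_and_journal_iter_peek() below). Only process_key() is a hypothetical stand-in here:

        struct btree_and_journal_iter iter;
        struct bkey_s_c k;

        bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);

        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                /* k is the newest version visible at this position */
                process_key(k);         /* hypothetical: mark the key, recurse, etc. */
                bch2_btree_and_journal_iter_advance(&iter);
        }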
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/btree_cache.c | 81 ++++++++++++++++-- fs/bcachefs/btree_cache.h | 3 + fs/bcachefs/btree_gc.c | 113 +++++++++++++++++-------- fs/bcachefs/btree_types.h | 5 ++ fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 22 ++--- fs/bcachefs/recovery.c | 161 ++++++++++++++++++++++++------------ fs/bcachefs/recovery.h | 21 +++-- 9 files changed, 296 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c064cf468a9b..0aa3d3b9a281 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) + if (bkey_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; if (bkey_cmp(k.k->p, b->data->max_key) > 0) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index cb843a362cb4..0711bde8d68c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -589,6 +589,7 @@ err: static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, + enum btree_id btree_id, unsigned level, enum six_lock_type lock_type, bool sync) @@ -601,7 +602,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (!bch2_btree_node_relock(iter, level + 1)) + if (iter && !bch2_btree_node_relock(iter, level + 1)) return ERR_PTR(-EINTR); b = bch2_btree_node_mem_alloc(c); @@ -609,7 +610,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return b; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... 
*/ @@ -629,7 +630,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * * XXX: ideally should be dropping all btree node locks here */ - if (btree_node_read_locked(iter, level + 1)) + if (iter && btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); bch2_btree_node_read(c, b, sync); @@ -677,7 +678,8 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); + b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -763,6 +765,74 @@ lock_node: return b; } +struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { +lock_node: + six_lock_read(&b->c.lock, NULL, NULL); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.btree_id != btree_id || + b->c.level != level)) { + six_unlock_read(&b->c.lock); + goto retry; + } + } + + /* XXX: waiting on IO with btree locks held: */ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + + return b; +} + struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree_iter *iter, struct btree *b, @@ -877,7 +947,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, if (b) return; - bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 5d85987457bf..abde6c2658c6 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); +struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned); + struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f85fbc057fb3..ee5eafdb1222 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -186,16 +186,8 @@ fsck_err: return ret; } -static bool pos_in_journal_keys(struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) -{ - struct journal_key *k = journal_key_search(journal_keys, id, pos); - - 
return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); -} - static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, - struct journal_keys *journal_keys, bool initial) + bool initial) { struct btree_node_iter iter; struct bkey unpacked; @@ -209,10 +201,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - if (!b->c.level && journal_keys && - pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p)) - continue; - bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); @@ -224,7 +212,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct btree_trans trans; @@ -252,8 +239,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, - journal_keys, initial); + ret = btree_gc_mark_node(c, b, &max_stale, initial); if (ret) break; @@ -289,6 +275,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; } +static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + struct journal_keys *journal_keys, + unsigned target_depth) +{ + struct btree_and_journal_iter iter; + struct bkey_s_c k; + u8 max_stale = 0; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, k, &max_stale, true); + if (ret) + break; + + if (b->c.level > target_depth) { + struct btree *child; + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + + child = bch2_btree_node_get_noiter(c, &tmp.k, + b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + + bch2_gc_btree_init_recurse(c, child, + journal_keys, target_depth); + six_unlock_read(&child->c.lock); + } + + bch2_btree_and_journal_iter_advance(&iter); + } + + return ret; +} + +static int bch2_gc_btree_init(struct bch_fs *c, + struct journal_keys *journal_keys, + enum btree_id btree_id, + bool metadata_only) +{ + struct btree *b; + unsigned target_depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; + int ret = 0; + + b = c->btree_roots[btree_id].b; + + if (btree_node_fake(b)) + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); + if (b->c.level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, + journal_keys, target_depth); + + if (!ret) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, true); + six_unlock_read(&b->c.lock); + + return ret; +} + static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) { return (int) btree_id_to_gc_phase(l) - @@ -307,27 +365,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; - enum btree_node_type type = __btree_node_type(0, id); - - int ret = bch2_gc_btree(c, id, journal_keys, - initial, metadata_only); + int ret = initial + ? 
bch2_gc_btree_init(c, journal_keys, + id, metadata_only) + : bch2_gc_btree(c, id, initial, metadata_only); if (ret) return ret; - - if (journal_keys && !metadata_only && - btree_node_type_needs_gc(type)) { - struct journal_key *j; - u8 max_stale; - int ret; - - for_each_journal_key(*journal_keys, j) - if (j->btree_id == id) { - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), - &max_stale, initial); - if (ret) - return ret; - } - } } return 0; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index fdfa7a265850..885cc9500f36 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -264,6 +264,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline struct btree_iter_level *iter_l(struct btree_iter *iter) +{ + return iter->l + iter->level; +} + struct btree_insert_entry { unsigned trigger_flags; unsigned trans_triggers_run:1; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c1a4d6559d01..fa9c7f5e0bb9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1630,7 +1630,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { struct btree_trans *trans = iter->trans; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; struct btree_update *as; struct closure cl; int ret = 0; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a5362ecb4f5d..a8487f8275b6 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates2 && - i[0].iter->l[0].b == i[-1].iter->l[0].b; + iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, @@ -172,13 +172,12 @@ static void bch2_btree_journal_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; struct btree_write *w = btree_current_write(b); u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) ? 
trans->journal_res.seq : j->replay_journal_seq; - EBUG_ON(iter->level || b->c.level); EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); @@ -205,17 +204,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; struct bset_tree *t = bset_tree_last(b); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(iter->level); - insert->k.needs_whiteout = false; - if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) + if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) bch2_btree_journal_key(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; @@ -241,7 +238,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs *c = trans->c; - BUG_ON(iter->level); BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); @@ -290,7 +286,7 @@ btree_key_can_insert(struct btree_trans *trans, unsigned *u64s) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; static enum btree_insert_ret ret; if (unlikely(btree_node_fake(b))) @@ -345,7 +341,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) struct btree_insert_entry *i; trans_for_each_update(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b))) bch2_mark_update(trans, i->iter, i->k, NULL, i->trigger_flags|BTREE_TRIGGER_GC); } @@ -461,7 +457,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, int ret; trans_for_each_update2(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, 0)); + BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ret = bch2_journal_preres_get(&trans->c->journal, &trans->journal_preres, trans->journal_preres_u64s, @@ -495,13 +491,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, - i->iter->l[0].b, i->iter); + iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, i->iter); /* diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 02b381cb567b..0d4abaa3ba10 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -27,30 +27,78 @@ /* iterate over keys read from the journal: */ -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +static struct journal_key *journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { - while (iter->k) { - if (iter->k->btree_id == iter->btree_id) - return bkey_i_to_s_c(iter->k->k); + size_t l = 0, r = journal_keys->nr, m; - iter->k++; - if (iter->k == iter->keys->d + iter->keys->nr) - iter->k = NULL; + while (l < r) { + m = l + ((r - l) >> 1); + if ((cmp_int(id, journal_keys->d[m].btree_id) ?: + cmp_int(level, journal_keys->d[m].level) ?: + bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + l = m + 1; + else + r = m; } - return bkey_s_c_null; + BUG_ON(l < journal_keys->nr && + (cmp_int(id, journal_keys->d[l].btree_id) ?: + 
cmp_int(level, journal_keys->d[l].level) ?: + bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + + BUG_ON(l && + (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: + cmp_int(level, journal_keys->d[l - 1].level) ?: + bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + + return l < journal_keys->nr ? journal_keys->d + l : NULL; +} + +static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +{ + if (iter->k && + iter->k < iter->keys->d + iter->keys->nr && + iter->k->btree_id == iter->btree_id && + iter->k->level == iter->level) + return iter->k->k; + + iter->k = NULL; + return NULL; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->k) + iter->k++; } -struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) +static void bch2_journal_iter_init(struct journal_iter *iter, + struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { - if (!iter->k) - return bkey_s_c_null; + iter->btree_id = id; + iter->level = level; + iter->keys = journal_keys; + iter->k = journal_key_search(journal_keys, id, level, pos); +} - iter->k++; - if (iter->k == iter->keys->d + iter->keys->nr) - iter->k = NULL; +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return iter->btree + ? bch2_btree_iter_peek(iter->btree) + : bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} - return bch2_journal_iter_peek(iter); +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + if (iter->btree) + bch2_btree_iter_next(iter->btree); + else + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); } void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) @@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) case none: break; case btree: - bch2_btree_iter_next(iter->btree); + bch2_journal_iter_advance_btree(iter); break; case journal: - bch2_journal_iter_next(&iter->journal); + bch2_journal_iter_advance(&iter->journal); break; } @@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * struct bkey_s_c ret; while (1) { - struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree); - struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal); + struct bkey_s_c btree_k = + bch2_journal_iter_peek_btree(iter); + struct bkey_s_c journal_k = + bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); if (btree_k.k && journal_k.k) { int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); if (!cmp) - bch2_btree_iter_next(iter->btree); + bch2_journal_iter_advance_btree(iter); iter->last = cmp < 0 ? btree : journal; } else if (btree_k.k) { @@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * } ret = iter->last == journal ? 
journal_k : btree_k; + + if (iter->b && + bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { + iter->journal.k = NULL; + iter->last = none; + return bkey_s_c_null; + } + if (!bkey_deleted(ret.k)) break; @@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * return bch2_btree_and_journal_iter_peek(iter); } -struct journal_key *journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) -{ - size_t l = 0, r = journal_keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if ((cmp_int(id, journal_keys->d[m].btree_id) ?: - bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < journal_keys->nr && - (cmp_int(id, journal_keys->d[l].btree_id) ?: - bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); - - BUG_ON(l && - (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: - bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); - - return l < journal_keys->nr ? journal_keys->d + l : NULL; -} - void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, struct btree_trans *trans, struct journal_keys *journal_keys, enum btree_id id, struct bpos pos) { - iter->journal.keys = journal_keys; - iter->journal.k = journal_key_search(journal_keys, id, pos); - iter->journal.btree_id = id; + memset(iter, 0, sizeof(*iter)); iter->btree = bch2_trans_get_iter(trans, id, pos, 0); + bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); +} + +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct journal_keys *journal_keys, + struct btree *b) +{ + struct bpos start = b->data->min_key; + + if (btree_node_type_is_extents(b->c.btree_id)) + start = bkey_successor(start); + + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); + bch2_journal_iter_init(&iter->journal, journal_keys, + b->c.btree_id, b->c.level, start); } /* sort and dedup all keys in the journal: */ @@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: bkey_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); @@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + return cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p); } static void journal_keys_free(struct journal_keys *keys) @@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) for_each_jset_key(k, _n, entry, &p->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, + .level = entry->level, .k = k, .journal_seq = le64_to_cpu(p->j.seq) - keys.journal_seq_base, @@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) src = dst = keys.d; while (src < keys.d + keys.nr) { while (src + 1 < keys.d + keys.nr && - src[0].btree_id == src[1].btree_id && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) 
src++; @@ -864,7 +917,7 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, NULL, true, true); + ret = bch2_gc(c, &journal_keys, true, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index c91309301563..fa1f2818817d 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -5,6 +5,7 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; + unsigned level:8; struct bkey_i *k; u32 journal_seq; u32 journal_offset; @@ -17,15 +18,23 @@ struct journal_keys { for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + enum btree_id btree_id; + unsigned level; struct journal_keys *keys; struct journal_key *k; - enum btree_id btree_id; }; -struct btree_and_journal_iter { - enum btree_id btree_id; +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ +struct btree_and_journal_iter { struct btree_iter *btree; + + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + struct journal_iter journal; enum last_key_returned { @@ -38,12 +47,14 @@ struct btree_and_journal_iter { void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -struct journal_key *journal_key_search(struct journal_keys *, - enum btree_id, struct bpos); + void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, struct btree_trans *, struct journal_keys *, enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct journal_keys *, + struct btree *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); -- cgit From f44a6a7134371d8b1e14055a2705d0f4da4c46d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Mar 2020 22:32:03 -0400 Subject: bcachefs: Replay interior node keys This slightly modifies the journal replay code so that it can replay updates to interior nodes. 
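One piece of context for the hunk below: bch2_journal_replay_key() delegates to bch2_trans_do(), which packages the begin/commit/retry-on-EINTR/exit sequence that the bch2_rbio_narrow_crcs() patch earlier in this series stopped open-coding. As an editorial sketch (not the literal macro body), the wrapper behaves roughly like:

        bch2_trans_init(&trans, c, 0, 0);
        do {
                bch2_trans_begin(&trans);
                ret =   __bch2_journal_replay_key(&trans, id, level, k) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                          BTREE_INSERT_NOFAIL|
                                          BTREE_INSERT_LAZY_RW|
                                          BTREE_INSERT_JOURNAL_REPLAY);
        } while (ret == -EINTR);
        bch2_trans_exit(&trans);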
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0d4abaa3ba10..b4d9e1f98059 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -230,9 +230,9 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->journal_seq, r->journal_seq) ?: + return cmp_int(r->level, l->level) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: bkey_cmp(l->k->k.p, r->k->k.p); } @@ -404,12 +404,15 @@ err: } static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) + enum btree_id id, unsigned level, + struct bkey_i *k) { struct btree_iter *iter; int ret; - iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT); + iter = bch2_trans_get_node_iter(trans, id, k->k.p, + BTREE_MAX_DEPTH, level, + BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); @@ -428,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, } static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *k) + unsigned level, struct bkey_i *k) { return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, k)); + __bch2_journal_replay_key(&trans, id, level, k)); } static int bch2_journal_replay(struct bch_fs *c, @@ -446,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c, sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + replay_now_at(j, keys.journal_seq_base); + for_each_journal_key(keys, i) { - replay_now_at(j, keys.journal_seq_base + i->journal_seq); + if (!i->level) + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + if (i->level) + ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); else if (i->k->k.size) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else - ret = bch2_journal_replay_key(c, i->btree_id, i->k); + ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); if (ret) { bch_err(c, "journal replay: error %d while replaying key", -- cgit From 6357d6071fccb5ccedbe32c1e0db4443d83d28dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Feb 2020 19:06:31 -0500 Subject: bcachefs: Journal updates to interior nodes Previously, the btree has always been self contained and internally consistent on disk without anything from the journal - the journal just contained pointers to the btree roots. However, this meant that btree node split or compact operations - i.e. anything that changes btree node topology and involves updates to interior nodes - would require that interior btree node to be written immediately, which means emitting a btree node write that's mostly empty (using 4k of space on disk if the filesystemm blocksize is 4k to only write perhaps ~100 bytes of new keys). More importantly, this meant most btree node writes had to be FUA, and consumer drives have a history of slow and/or buggy FUA support - other filesystes have been bit by this. This patch changes the interior btree update path to journal updates to interior nodes, after the writes for the new btree nodes have completed. 
Best of all, it turns out to simplify the interior node update path somewhat. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_io.c | 11 +- fs/bcachefs/btree_io.h | 9 +- fs/bcachefs/btree_types.h | 3 - fs/bcachefs/btree_update.h | 1 + fs/bcachefs/btree_update_interior.c | 352 ++++++++++++------------------------ fs/bcachefs/btree_update_interior.h | 16 +- fs/bcachefs/btree_update_leaf.c | 23 ++- fs/bcachefs/super-io.c | 2 + 9 files changed, 146 insertions(+), 274 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1ad5ff449a5b..6f74fda1f21d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(new_extent_overwrite, 9) \ x(incompressible, 10) \ x(btree_ptr_v2, 11) \ - x(extents_above_btree_updates, 12) + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d0b761417903..e43d1b2ce5c7 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, closure_put(&((struct btree_update *) new)->cl); bch2_journal_pin_drop(&c->journal, &w->journal); - closure_wake_up(&w->wait); } static void btree_node_write_done(struct bch_fs *c, struct btree *b) @@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; - if (b->c.level || !b->written) - wbio->wbio.bio.bi_opf |= REQ_FUA; - bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* @@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) { unsigned long flags = READ_ONCE(b->flags); - unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; if (!(flags & (1 << BTREE_NODE_dirty))) continue; - pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", + pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", b, (flags & (1 << BTREE_NODE_dirty)) != 0, (flags & (1 << BTREE_NODE_need_write)) != 0, @@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) b->written, !list_empty_careful(&b->write_blocked), b->will_make_reachable != 0, - b->will_make_reachable & 1, - b->writes[ idx].wait.list.first != NULL, - b->writes[!idx].wait.list.first != NULL); + b->will_make_reachable & 1); } rcu_read_unlock(); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 43fa8a6dbee5..a02e261c2eb2 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type); -static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) { while (b->written && btree_node_need_write(b) && btree_node_may_write(b)) { if (!btree_node_write_in_flight(b)) { - bch2_btree_node_write(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, lock_held); break; } six_unlock_read(&b->c.lock); btree_node_wait_on_io(b); - btree_node_lock_type(c, b, SIX_LOCK_read); + 
btree_node_lock_type(c, b, lock_held); } } @@ -131,7 +132,7 @@ do { \ new |= (1 << BTREE_NODE_need_write); \ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ \ - btree_node_write_if_need(_c, _b); \ + btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ } while (0) void bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 885cc9500f36..a794f9fe4fce 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -53,7 +53,6 @@ struct bset_tree { struct btree_write { struct journal_entry_pin journal; - struct closure_waitlist wait; }; struct btree_alloc { @@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) struct btree_root { struct btree *b; - struct btree_update *as; - /* On disk root - see async splits: */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9f58d47ef5d6..11f7d02de622 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { __BTREE_INSERT_NOUNLOCK, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fa9c7f5e0bb9..68deb4eb31a6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -24,7 +24,6 @@ static void btree_node_will_make_reachable(struct btree_update *, struct btree *); static void btree_update_drop_new_node(struct bch_fs *, struct btree *); -static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int); /* Debug code: */ @@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, } static void bch2_btree_node_free_ondisk(struct bch_fs *c, - struct pending_btree_node_free *pending) + struct pending_btree_node_free *pending, + u64 journal_seq) { BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); + 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, + 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE| BTREE_TRIGGER_GC); } @@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); - BUG_ON(as->nr_new_nodes); - BUG_ON(as->nr_pending); + BUG_ON((as->nr_new_nodes || as->nr_pending) && + !bch2_journal_error(&c->journal));; if (as->reserve) bch2_btree_reserve_put(c, as->reserve); @@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_nodes_reachable(struct closure *cl) +static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) { - struct btree_update *as = container_of(cl, struct btree_update, cl); struct bch_fs *c = as->c; - bch2_journal_pin_drop(&c->journal, &as->journal); - mutex_lock(&c->btree_interior_update_lock); while (as->nr_new_nodes) { @@ -630,39 +630,22 @@ static void 
btree_update_nodes_reachable(struct closure *cl) } while (as->nr_pending) - bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); + bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], + seq); mutex_unlock(&c->btree_interior_update_lock); - - closure_wake_up(&as->wait); - - bch2_btree_update_free(as); -} - -static void btree_update_wait_on_journal(struct closure *cl) -{ - struct btree_update *as = container_of(cl, struct btree_update, cl); - struct bch_fs *c = as->c; - int ret; - - ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); - if (ret == -EAGAIN) { - continue_at(cl, btree_update_wait_on_journal, system_wq); - return; - } - if (ret < 0) - goto err; - - bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); -err: - continue_at(cl, btree_update_nodes_reachable, system_wq); } static void btree_update_nodes_written(struct closure *cl) { struct btree_update *as = container_of(cl, struct btree_update, cl); + struct journal_res res = { 0 }; struct bch_fs *c = as->c; struct btree *b; + struct bset *i; + struct bkey_i *k; + unsigned journal_u64s = 0; + int ret; /* * We did an update to a parent node where the pointers we added pointed @@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl) */ mutex_lock(&c->btree_interior_update_lock); as->nodes_written = true; -retry: +again: as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, struct btree_update, unwritten_list); if (!as || !as->nodes_written) { @@ -679,31 +662,53 @@ retry: return; } + b = as->b; + if (b && !six_trylock_intent(&b->c.lock)) { + mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->c.lock); + goto out; + } + + journal_u64s = 0; + + if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) + for_each_keylist_key(&as->parent_keys, k) + journal_u64s += jset_u64s(k->k.u64s); + + ret = bch2_journal_res_get(&c->journal, &res, journal_u64s, + JOURNAL_RES_GET_RESERVED); + if (ret) { + BUG_ON(!bch2_journal_error(&c->journal)); + /* can't unblock btree writes */ + goto free_update; + } + + if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) + for_each_keylist_key(&as->parent_keys, k) + bch2_journal_add_entry(&c->journal, &res, + BCH_JSET_ENTRY_btree_keys, + as->btree_id, + as->level, + k, k->k.u64s); + switch (as->mode) { case BTREE_INTERIOR_NO_UPDATE: BUG(); case BTREE_INTERIOR_UPDATING_NODE: - /* The usual case: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->c.lock)) { - mutex_unlock(&c->btree_interior_update_lock); - btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - mutex_lock(&c->btree_interior_update_lock); - goto retry; - } - - BUG_ON(!btree_node_dirty(b)); - closure_wait(&btree_current_write(b)->wait, &as->cl); + /* @b is the node we did the final insert into: */ + BUG_ON(!res.ref); + six_lock_write(&b->c.lock, NULL, NULL); list_del(&as->write_blocked_list); - /* - * for flush_held_btree_writes() waiting on updates to flush or - * nodes to be writeable: - */ - closure_wake_up(&c->btree_interior_update_wait); + i = btree_bset_last(b); + i->journal_seq = cpu_to_le64( + max(res.seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, res.seq); + six_unlock_write(&b->c.lock); list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); @@ -712,82 +717,51 @@ retry: * b->write_blocked prevented it from being written, so * write it now if it needs to be written: */ - bch2_btree_node_write_cond(c, b, true); - six_unlock_read(&b->c.lock); - 
continue_at(&as->cl, btree_update_nodes_reachable, system_wq); + btree_node_write_if_need(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->c.lock); break; case BTREE_INTERIOR_UPDATING_AS: - /* - * The btree node we originally updated has been freed and is - * being rewritten - so we need to write anything here, we just - * need to signal to that btree_update that it's ok to make the - * new replacement node visible: - */ - closure_put(&as->parent_as->cl); - - /* - * and then we have to wait on that btree_update to finish: - */ - closure_wait(&as->parent_as->wait, &as->cl); + BUG_ON(b); list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); - - continue_at(&as->cl, btree_update_nodes_reachable, system_wq); break; - case BTREE_INTERIOR_UPDATING_ROOT: - /* b is the new btree root: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->c.lock)) { - mutex_unlock(&c->btree_interior_update_lock); - btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - mutex_lock(&c->btree_interior_update_lock); - goto retry; - } - - BUG_ON(c->btree_roots[b->c.btree_id].as != as); - c->btree_roots[b->c.btree_id].as = NULL; + case BTREE_INTERIOR_UPDATING_ROOT: { + struct btree_root *r = &c->btree_roots[as->btree_id]; - bch2_btree_set_root_ondisk(c, b, WRITE); + BUG_ON(b); - /* - * We don't have to wait anything anything here (before - * btree_update_nodes_reachable frees the old nodes - * ondisk) - we've ensured that the very next journal write will - * have the pointer to the new root, and before the allocator - * can reuse the old nodes it'll have to do a journal commit: - */ - six_unlock_read(&b->c.lock); + mutex_lock(&c->btree_root_lock); + bkey_copy(&r->key, as->parent_keys.keys); + r->level = as->level; + r->alive = true; + c->btree_roots_dirty = true; + mutex_unlock(&c->btree_root_lock); list_del(&as->unwritten_list); mutex_unlock(&c->btree_interior_update_lock); - - /* - * Bit of funny circularity going on here we have to break: - * - * We have to drop our journal pin before writing the journal - * entry that points to the new btree root: else, we could - * deadlock if the journal currently happens to be full. 
- * - * This mean we're dropping the journal pin _before_ the new - * nodes are technically reachable - but this is safe, because - * after the bch2_btree_set_root_ondisk() call above they will - * be reachable as of the very next journal write: - */ - bch2_journal_pin_drop(&c->journal, &as->journal); - - as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); - - btree_update_wait_on_journal(&as->cl); break; } + } + bch2_journal_pin_drop(&c->journal, &as->journal); + + bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + btree_update_nodes_reachable(as, res.seq); +free_update: + bch2_btree_update_free(as); + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); +out: mutex_lock(&c->btree_interior_update_lock); - goto retry; + goto again; } /* @@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(!btree_node_dirty(b)); - as->mode = BTREE_INTERIOR_UPDATING_NODE; - as->b = b; + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + as->level = b->c.level; list_add(&as->write_blocked_list, &b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); - - /* - * In general, when you're staging things in a journal that will later - * be written elsewhere, and you also want to guarantee ordering: that - * is, if you have updates a, b, c, after a crash you should never see c - * and not a or b - there's a problem: - * - * If the final destination of the update(s) (i.e. btree node) can be - * written/flushed _before_ the relevant journal entry - oops, that - * breaks ordering, since the various leaf nodes can be written in any - * order. - * - * Normally we use bset->journal_seq to deal with this - if during - * recovery we find a btree node write that's newer than the newest - * journal entry, we just ignore it - we don't need it, anything we're - * supposed to have (that we reported as completed via fsync()) will - * still be in the journal, and as far as the state of the journal is - * concerned that btree node write never happened. - * - * That breaks when we're rewriting/splitting/merging nodes, since we're - * mixing btree node writes that haven't happened yet with previously - * written data that has been reported as completed to the journal. - * - * Thus, before making the new nodes reachable, we have to wait the - * newest journal sequence number we have data for to be written (if it - * hasn't been yet). - */ - bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); -} - -static void interior_update_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct btree_update *as = - container_of(pin, struct btree_update, journal); - - bch2_journal_flush_seq_async(j, as->journal_seq, NULL); } static void btree_update_reparent(struct btree_update *as, @@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as, { struct bch_fs *c = as->c; + lockdep_assert_held(&c->btree_interior_update_lock); + child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - child->parent_as = as; - closure_get(&as->cl); /* * When we write a new btree root, we have to drop our journal pin @@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as, * just transfer the journal pin to the new interior update so * btree_update_nodes_written() can drop it. 
*/ - bch2_journal_pin_copy(&c->journal, &as->journal, - &child->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); bch2_journal_pin_drop(&c->journal, &child->journal); - - as->journal_seq = max(as->journal_seq, child->journal_seq); } -static void btree_update_updated_root(struct btree_update *as) +static void btree_update_updated_root(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - struct btree_root *r = &c->btree_roots[as->btree_id]; - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!bch2_keylist_empty(&as->parent_keys)); - /* - * Old root might not be persistent yet - if so, redirect its - * btree_update operation to point to us: - */ - if (r->as) - btree_update_reparent(as, r->as); - - as->mode = BTREE_INTERIOR_UPDATING_ROOT; - as->b = r->b; - r->as = as; + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->level = b->c.level; + bch2_keylist_add(&as->parent_keys, &b->key); mutex_unlock(&c->btree_interior_update_lock); - - /* - * When we're rewriting nodes and updating interior nodes, there's an - * issue with updates that haven't been written in the journal getting - * mixed together with older data - see btree_update_updated_node() - * for the explanation. - * - * However, this doesn't affect us when we're writing a new btree root - - * because to make that new root reachable we have to write out a new - * journal entry, which must necessarily be newer than as->journal_seq. - */ } static void btree_node_will_make_reachable(struct btree_update *as, @@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - struct closure *cl, *cl_n; struct btree_update *p, *n; struct btree_write *w; - struct bset_tree *t; set_btree_node_dying(b); @@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_interior_update_add_node_reference(as, b); - /* - * Does this node have data that hasn't been written in the journal? - * - * If so, we have to wait for the corresponding journal entry to be - * written before making the new nodes reachable - we can't just carry - * over the bset->journal_seq tracking, since we'll be mixing those keys - * in with keys that aren't in the journal anymore: - */ - for_each_bset(b, t) - as->journal_seq = max(as->journal_seq, - le64_to_cpu(bset(b, t)->journal_seq)); - mutex_lock(&c->btree_interior_update_lock); /* @@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, clear_btree_node_dirty(b); clear_btree_node_need_write(b); - w = btree_current_write(b); - - /* - * Does this node have any btree_update operations waiting on this node - * to be written? - * - * If so, wake them up when this btree_update operation is reachable: - */ - llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list) - llist_add(&cl->list, &as->wait.list); /* * Does this node have unwritten data that has a pin on the journal? @@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, * oldest pin of any of the nodes we're freeing. 
We'll release the pin * when the new nodes are persistent and reachable on disk: */ - bch2_journal_pin_copy(&c->journal, &as->journal, - &w->journal, interior_update_flush); + w = btree_current_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, - &w->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, { struct btree_reserve *reserve; struct btree_update *as; + int ret; reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); if (IS_ERR(reserve)) @@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, bch2_keylist_init(&as->parent_keys, as->inline_keys); + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0); + if (ret) { + bch2_btree_reserve_put(c, reserve); + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + return ERR_PTR(ret); + } + mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); @@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } -static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) -{ - struct btree_root *r = &c->btree_roots[b->c.btree_id]; - - mutex_lock(&c->btree_root_lock); - - BUG_ON(b != r->b); - bkey_copy(&r->key, &b->key); - r->level = b->c.level; - r->alive = true; - if (rw == WRITE) - c->btree_roots_dirty = true; - - mutex_unlock(&c->btree_root_lock); -} - /** * bch_btree_set_root - update the root in memory and on disk * @@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, bch2_btree_set_root_inmem(as, b); - btree_update_updated_root(as); + btree_update_updated_root(as, b); /* * Unlock old root after new root is visible: @@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); - bch2_keylist_add(&as->parent_keys, &n1->key); + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); } bch2_btree_node_write(c, n1, SIX_LOCK_intent); @@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, (bkey_cmp_packed(b, k, &insert->k) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); - + for_each_keylist_key(keys, insert) bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); - bch2_keylist_pop_front(keys); - } btree_update_updated_node(as, b); @@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bkey_copy(&b->key, new_key); } - btree_update_updated_root(as); + btree_update_updated_root(as, b); bch2_btree_node_unlock_write(b, iter); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f6aceed89427..4a2ea69f6a2c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -69,8 +69,10 @@ struct btree_update { unsigned nodes_written:1; enum btree_id btree_id; + u8 level; struct btree_reserve *reserve; + struct journal_preres journal_preres; /* * 
BTREE_INTERIOR_UPDATING_NODE: @@ -83,18 +85,6 @@ struct btree_update { struct btree *b; struct list_head write_blocked_list; - /* - * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now - * we're now blocking another btree_update - * @parent_as - btree_update that's waiting on our nodes to finish - * writing, before it can make new nodes visible on disk - * @wait - list of child btree_updates that are waiting on this - * btree_update to make all the new nodes visible before they can free - * their old btree nodes - */ - struct btree_update *parent_as; - struct closure_waitlist wait; - /* * We may be freeing nodes that were dirty, and thus had journal entries * pinned: we need to transfer the oldest of those pins to the @@ -103,8 +93,6 @@ struct btree_update { */ struct journal_entry_pin journal; - u64 journal_seq; - /* * Nodes being freed: * Protected by c->btree_node_pending_free_lock diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a8487f8275b6..06e735fc69ec 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); +} + static inline void __btree_journal_key(struct btree_trans *trans, enum btree_id btree_id, struct bkey_i *insert) @@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree *b = iter_l(iter)->b; - struct btree_write *w = btree_current_write(b); - u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - ? trans->journal_res.seq - : j->replay_journal_seq; EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); @@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans, cpu_to_le64(trans->journal_res.seq); } - bch2_journal_pin_add(j, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); + bch2_btree_add_journal_pin(c, b, + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? 
trans->journal_res.seq + : j->replay_journal_seq); if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b50f85d1b057..c9d2a01fec29 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -- cgit From 2f194e1697f733a2eae1d040eabe71b05c049e0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Mar 2020 17:57:29 -0400 Subject: bcachefs: Fix an assertion when nothing to replay Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b4d9e1f98059..27c9ba3382f9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -449,7 +449,8 @@ static int bch2_journal_replay(struct bch_fs *c, sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); - replay_now_at(j, keys.journal_seq_base); + if (keys.nr) + replay_now_at(j, keys.journal_seq_base); for_each_journal_key(keys, i) { if (!i->level) -- cgit From f1d786a0dbc4483d176985ebe8f82fb7cdeec429 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Mar 2020 16:12:33 -0400 Subject: bcachefs: Add an option for keeping journal entries after startup This will be used by the userspace debug tools. 
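As an illustration only (not part of this patch): with keep_journal set, the sorted keys stay in c->journal_keys after recovery finishes, so debug code could walk them with the for_each_journal_key() helper and the struct journal_key fields introduced here, roughly:

	struct journal_key *i;

	for_each_journal_key(c->journal_keys, i)
		pr_info("btree %u level %u seq %llu\n",
			(unsigned) i->btree_id, (unsigned) i->level,
			c->journal_keys.journal_seq_base + i->journal_seq);

The helper and field names are the ones added in this series; the loop itself is just a hypothetical consumer.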
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 14 ++++++++++++++ fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/recovery.c | 42 +++++++++++++++++++++--------------------- fs/bcachefs/recovery.h | 15 +++------------ fs/bcachefs/super.c | 4 ++++ 5 files changed, 47 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 32cdf87ee55d..5304b6762179 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -523,6 +523,18 @@ struct journal_seq_blacklist_table { } entries[0]; }; +struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; + unsigned level:8; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; + } *d; + size_t nr; + u64 journal_seq_base; +}; + struct bch_fs { struct closure cl; @@ -791,6 +803,8 @@ struct bch_fs { mempool_t btree_bounce_pool; struct journal journal; + struct list_head journal_entries; + struct journal_keys journal_keys; u64 last_bucket_seq_cleanup; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 59c7b3685745..1e579f67346a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -255,6 +255,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ + x(keep_journal, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ x(noexcl, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 27c9ba3382f9..0c8444b5278f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -198,7 +198,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i /* sort and dedup all keys in the journal: */ -static void journal_entries_free(struct list_head *list) +void bch2_journal_entries_free(struct list_head *list) { while (!list_empty(list)) { @@ -236,7 +236,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) bkey_cmp(l->k->k.p, r->k->k.p); } -static void journal_keys_free(struct journal_keys *keys) +void bch2_journal_keys_free(struct journal_keys *keys) { kvfree(keys->d); keys->d = NULL; @@ -802,8 +802,6 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - LIST_HEAD(journal_entries); - struct journal_keys journal_keys = { NULL }; bool wrote = false, write_sb = false; int ret; @@ -825,30 +823,30 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck) { struct jset *j; - ret = bch2_journal_read(c, &journal_entries); + ret = bch2_journal_read(c, &c->journal_entries); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, + if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&journal_entries)) { + if (!c->sb.clean && list_empty(&c->journal_entries)) { bch_err(c, "no journal entries found"); ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; } - journal_keys = journal_keys_sort(&journal_entries); - if (!journal_keys.d) { + c->journal_keys = journal_keys_sort(&c->journal_entries); + if (!c->journal_keys.d) { ret = -ENOMEM; goto err; } - j = &list_last_entry(&journal_entries, + j = &list_last_entry(&c->journal_entries, struct journal_replay, list)->j; ret = verify_superblock_clean(c, 
&clean, j); @@ -867,7 +865,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - ret = journal_replay_early(c, clean, &journal_entries); + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; @@ -885,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_blacklist_table_initialize(c); - if (!list_empty(&journal_entries)) { + if (!list_empty(&c->journal_entries)) { ret = verify_journal_entries_not_blacklisted_or_missing(c, - &journal_entries); + &c->journal_entries); if (ret) goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq, - &journal_entries); + &c->journal_entries); if (ret) goto err; @@ -903,14 +901,14 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c, &journal_keys); + ret = bch2_alloc_read(c, &c->journal_keys); if (ret) goto err; bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); err = "error reading stripes"; - ret = bch2_stripes_read(c, &journal_keys); + ret = bch2_stripes_read(c, &c->journal_keys); if (ret) goto err; bch_verbose(c, "stripes_read done"); @@ -926,7 +924,7 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &journal_keys, true, true); + ret = bch2_gc(c, &c->journal_keys, true, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -937,7 +935,7 @@ int bch2_fs_recovery(struct bch_fs *c) test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &journal_keys, true, false); + ret = bch2_gc(c, &c->journal_keys, true, false); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -958,7 +956,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting journal replay"); err = "journal replay failed"; - ret = bch2_journal_replay(c, journal_keys); + ret = bch2_journal_replay(c, c->journal_keys); if (ret) goto err; bch_verbose(c, "journal replay done"); @@ -1054,8 +1052,10 @@ fsck_err: set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); - journal_keys_free(&journal_keys); - journal_entries_free(&journal_entries); + if (!c->opts.keep_journal) { + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(&c->journal_entries); + } kfree(clean); if (ret) bch_err(c, "Error in recovery: %s (%i)", err, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index fa1f2818817d..19f2f172a26b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,18 +2,6 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -struct journal_keys { - struct journal_key { - enum btree_id btree_id:8; - unsigned level:8; - struct bkey_i *k; - u32 journal_seq; - u32 journal_offset; - } *d; - size_t nr; - u64 journal_seq_base; -}; - #define for_each_journal_key(keys, i) \ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) @@ -56,6 +44,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct journal_keys *, struct btree *); +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct list_head *); + int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8670be394239..bbb0780bc4ca 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -468,6 +468,8 @@ static void bch2_fs_free(struct bch_fs *c) 
bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); kfree(c->usage_scratch); @@ -657,6 +659,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); -- cgit From f7005e0175ed14bc5e2cc60add40d5eaf9075c2c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Mar 2020 16:13:00 -0400 Subject: bcachefs: Improve error message in fsck Seeing the extents that were overlapping is highly useful for figuring out what went wrong. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 936e6366cb04..822541e6adfc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "dirent.h" #include "error.h" @@ -469,10 +470,12 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey prev = KEY(0, 0, 0); + struct bkey_on_stack prev; u64 i_sectors; int ret = 0; + bkey_on_stack_init(&prev); + prev.k->k = KEY(0, 0, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); @@ -482,24 +485,24 @@ static int check_extents(struct bch_fs *c) BTREE_ITER_INTENT); retry: for_each_btree_key_continue(iter, 0, k, ret) { - if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { - char buf1[100]; - char buf2[100]; + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; - bch2_bkey_to_text(&PBUF(buf1), &prev); - bch2_bkey_to_text(&PBUF(buf2), k.k); + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) { + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_fix_overlapping_extent(&trans, - iter, k, prev.p)); + iter, k, prev.k->k.p)); if (ret) goto err; } } - prev = *k.k; + bkey_on_stack_reassemble(&prev, c, k); ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) @@ -525,7 +528,8 @@ retry: !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), - c, "i_sectors wrong: got %llu, should be %llu", + c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", + w.inode.bi_inum, w.inode.bi_sectors, i_sectors)) { struct bkey_inode_buf p; @@ -567,6 +571,7 @@ err: fsck_err: if (ret == -EINTR) goto retry; + bkey_on_stack_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } -- cgit From 4e4758c6cbbbc31aad9ec733c6f49a7221fd7b70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Mar 2020 17:38:51 -0400 Subject: bcachefs: Use memalloc_nofs_save() vmalloc allocations don't always obey GFP_NOFS - memalloc_nofs_save() is the prefered approach for the future. 
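As a minimal sketch of the pattern (not the exact btree_io.c hunk): memalloc_nofs_save()/memalloc_nofs_restore(), from linux/sched/mm.h, mark the current task so that every allocation in the bracketed region implicitly behaves as GFP_NOFS, which also covers allocations (such as vmalloc page allocations) that don't honour a GFP_NOFS argument passed directly. Here p and size are placeholders:

	unsigned flags = memalloc_nofs_save();

	/* allocations in this region cannot recurse into filesystem reclaim: */
	void *p = kvmalloc(size, GFP_KERNEL);

	memalloc_nofs_restore(flags);

This scoped-flag approach scales better than threading GFP_NOFS down every allocation path.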
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e43d1b2ce5c7..85a17225a68e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -20,6 +20,8 @@ #include "super-io.h" #include "trace.h" +#include + static void verify_no_dups(struct btree *b, struct bkey_packed *start, struct bkey_packed *end, @@ -67,17 +69,19 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order, static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, bool *used_mempool) { + unsigned flags = memalloc_nofs_save(); void *p; BUG_ON(order > btree_page_order(c)); *used_mempool = false; p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); - if (p) - return p; - - *used_mempool = true; - return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + } + memalloc_nofs_restore(flags); + return p; } static void sort_bkey_ptrs(const struct btree *bt, -- cgit From 56a40fbc4e398c00e8b667f9c30b40b7695065f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Mar 2020 19:17:23 -0400 Subject: bcachefs: Various fixes for interior update path The locking was wrong, and we could get a use after free in the error path where we weren't taking the entrie being freed off the unwritten list. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 68deb4eb31a6..3f9605f2f1f4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -581,7 +581,7 @@ err_free: /* Asynchronous interior node update machinery */ -static void bch2_btree_update_free(struct btree_update *as) +static void __bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; @@ -596,28 +596,32 @@ static void bch2_btree_update_free(struct btree_update *as) if (as->reserve) bch2_btree_reserve_put(c, as->reserve); - mutex_lock(&c->btree_interior_update_lock); list_del(&as->list); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); closure_wake_up(&c->btree_interior_update_wait); - mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) +static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); + __bch2_btree_update_free(as); + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) +{ + struct bch_fs *c = as->c; while (as->nr_new_nodes) { struct btree *b = as->new_nodes[--as->nr_new_nodes]; BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; - mutex_unlock(&c->btree_interior_update_lock); /* * b->will_make_reachable prevented it from being written, so @@ -626,14 +630,11 @@ static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); six_unlock_read(&b->c.lock); - mutex_lock(&c->btree_interior_update_lock); } while (as->nr_pending) bch2_btree_node_free_ondisk(c, 
&as->pending[--as->nr_pending], seq); - - mutex_unlock(&c->btree_interior_update_lock); } static void btree_update_nodes_written(struct closure *cl) @@ -667,9 +668,12 @@ again: mutex_unlock(&c->btree_interior_update_lock); btree_node_lock_type(c, b, SIX_LOCK_intent); six_unlock_intent(&b->c.lock); - goto out; + mutex_lock(&c->btree_interior_update_lock); + goto again; } + list_del(&as->unwritten_list); + journal_u64s = 0; if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) @@ -710,9 +714,6 @@ again: bch2_btree_add_journal_pin(c, b, res.seq); six_unlock_write(&b->c.lock); - list_del(&as->unwritten_list); - mutex_unlock(&c->btree_interior_update_lock); - /* * b->write_blocked prevented it from being written, so * write it now if it needs to be written: @@ -723,9 +724,6 @@ again: case BTREE_INTERIOR_UPDATING_AS: BUG_ON(b); - - list_del(&as->unwritten_list); - mutex_unlock(&c->btree_interior_update_lock); break; case BTREE_INTERIOR_UPDATING_ROOT: { @@ -739,9 +737,6 @@ again: r->alive = true; c->btree_roots_dirty = true; mutex_unlock(&c->btree_root_lock); - - list_del(&as->unwritten_list); - mutex_unlock(&c->btree_interior_update_lock); break; } } @@ -753,14 +748,12 @@ again: btree_update_nodes_reachable(as, res.seq); free_update: - bch2_btree_update_free(as); + __bch2_btree_update_free(as); /* * for flush_held_btree_writes() waiting on updates to flush or * nodes to be writeable: */ closure_wake_up(&c->btree_interior_update_wait); -out: - mutex_lock(&c->btree_interior_update_lock); goto again; } -- cgit From 5a655f06c94f541fa9223a9b7ef2ab8a909f1fea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Mar 2020 18:26:01 -0400 Subject: bcachefs: Read journal when keep_journal on Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0c8444b5278f..c0e6cfa36c89 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -820,7 +820,7 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (!c->sb.clean || c->opts.fsck) { + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { struct jset *j; ret = bch2_journal_read(c, &c->journal_entries); -- cgit From 22f776985f34334b3bbba75b71ecca711f34e3f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Mar 2020 12:33:41 -0400 Subject: bcachefs: Use kvpmalloc mempools for compression bounce This fixes an issue where mounting would fail because of memory fragmentation - previously the compression bounce buffers were using get_free_pages(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 117afac3db1a..89eb03a988f1 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -17,7 +17,6 @@ struct bbuf { BB_NONE, BB_VMAP, BB_KMALLOC, - BB_VMALLOC, BB_MEMPOOL, } type; int rw; @@ -33,17 +32,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) if (b) return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); - b = b ? 
page_address(b) : NULL; - if (b) - return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - - b = vmalloc(size); - if (b) - return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); - b = b ? page_address(b) : NULL; if (b) return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; @@ -129,12 +118,8 @@ static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) case BB_KMALLOC: kfree(buf.b); break; - case BB_VMALLOC: - vfree(buf.b); - break; case BB_MEMPOOL: - mempool_free(virt_to_page(buf.b), - &c->compression_bounce[buf.rw]); + mempool_free(buf.b, &c->compression_bounce[buf.rw]); break; } } @@ -561,15 +546,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { - ret = mempool_init_page_pool(&c->compression_bounce[READ], - 1, order); + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, order); if (ret) goto out; } if (!mempool_initialized(&c->compression_bounce[WRITE])) { - ret = mempool_init_page_pool(&c->compression_bounce[WRITE], - 1, order); + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, order); if (ret) goto out; } -- cgit From b72633aed07b0b870680a4de0d40fc79d2edfd03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Mar 2020 14:21:44 -0400 Subject: bcachefs: Switch a BUG_ON() to a warning This has popped and thus needs to be debugged, but the assertion firing isn't necessarily fatal so switch it to a warning. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 26a8ff38991d..220daf88f7b9 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -375,7 +375,8 @@ unlock: goto retry; if (ret == -ENOSPC) { - BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); + WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), + "JOURNAL_RES_GET_RESERVED set but journal full"); /* * Journal is full - can't rely on reclaim from work item due to -- cgit From 39fb2983c5862933798cdd1b59da180bc9642910 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Jan 2020 13:29:32 -0500 Subject: bcachefs: Kill bkey_type_successor Previously, BTREE_ID_INODES was special - inodes were indexed by the inode field, which meant the offset field of struct bpos wasn't used, which led to special cases in e.g. the btree iterator code. Now, inodes in the inodes btree are indexed by the offset field. Also: prevously min_key was special for extents btrees, min_key for extents would equal max_key for the previous node. Now, min_key = bkey_successor() of the previous node, same as non extent btrees. This means we can completely get rid of btree_type_sucessor/predecessor. Also make some improvements to the metadata IO validate/compat code. 
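For reference, and quoted from memory rather than from this patch, bkey_successor() is just the generic "advance one position" helper: it increments the offset field of the bpos and carries into the inode field on overflow, roughly

	static inline struct bpos bkey_successor(struct bpos p)
	{
		struct bpos ret = p;

		if (!++ret.offset)
			BUG_ON(!++ret.inode);

		return ret;
	}

so once inodes are keyed by the offset field, this generic helper (and bkey_predecessor()) is correct for every btree and the per-btree special case can go away.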
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/bkey_methods.c | 56 ++++++++++++++++ fs/bcachefs/bkey_methods.h | 19 ++++++ fs/bcachefs/btree_cache.c | 3 +- fs/bcachefs/btree_gc.c | 9 +-- fs/bcachefs/btree_io.c | 129 ++++++++++++++++++++++-------------- fs/bcachefs/btree_io.h | 47 +++++++++++++ fs/bcachefs/btree_iter.c | 35 ++++++---- fs/bcachefs/btree_iter.h | 26 -------- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 7 +- fs/bcachefs/extent_update.c | 4 +- fs/bcachefs/extents.c | 17 +++++ fs/bcachefs/extents.h | 3 + fs/bcachefs/fsck.c | 6 +- fs/bcachefs/inode.c | 32 ++++----- fs/bcachefs/journal_io.c | 39 ++++++----- 17 files changed, 295 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6f74fda1f21d..f0f8964a98b1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1160,7 +1160,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, - bcachefs_metadata_version_max = 11, + bcachefs_metadata_version_inode_btree_change = 11, + bcachefs_metadata_version_max = 12, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 0aa3d3b9a281..c97e1e9002cb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -273,3 +273,59 @@ void bch2_bkey_renumber(enum btree_node_type btree_node_type, break; } } + +void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + const struct bkey_ops *ops; + struct bkey uk; + struct bkey_s u; + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); + + if (version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { + struct bkey_format tmp = *f, *in = f, *out = &tmp; + + swap(tmp.bits_per_field[BKEY_FIELD_INODE], + tmp.bits_per_field[BKEY_FIELD_OFFSET]); + swap(tmp.field_offset[BKEY_FIELD_INODE], + tmp.field_offset[BKEY_FIELD_OFFSET]); + + if (!write) + swap(in, out); + + uk = __bch2_bkey_unpack_key(in, k); + swap(uk.p.inode, uk.p.offset); + BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + } + } + + if (!bkey_packed(k)) { + u = bkey_i_to_s(packed_to_bkey(k)); + } else { + uk = __bch2_bkey_unpack_key(f, k); + u.k = &uk; + u.v = bkeyp_val(f, k); + } + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + + ops = &bch2_bkey_ops[k->type]; + + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index d36468b75223..0bca725ae3b8 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -33,6 +33,9 @@ struct bkey_ops { bool (*key_normalize)(struct bch_fs *, struct bkey_s); enum merge_result (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); }; const char 
*bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); @@ -60,4 +63,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *, void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, + int, struct bkey_format *, struct bkey_packed *); + +static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + if (version < bcachefs_metadata_version_current || + big_endian != CPU_BIG_ENDIAN) + __bch2_bkey_compat(level, btree_id, version, + big_endian, write, f, k); + +} + #endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 0711bde8d68c..4ff57925fb2c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -924,8 +924,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(btree_type_successor(n1->c.btree_id, - n1->key.k.p), + BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), n2->data->min_key)); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ee5eafdb1222..8a832e92b6a2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -74,7 +74,7 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, struct range_level *l = &r->l[b->c.level]; struct bpos expected_min = bkey_cmp(l->min, l->max) - ? btree_type_successor(b->c.btree_id, l->max) + ? bkey_successor(l->max) : l->max; bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, @@ -105,8 +105,7 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, if (bkey_cmp(b->data->max_key, POS_MAX)) l->min = l->max = - btree_type_successor(b->c.btree_id, - b->data->max_key); + bkey_successor(b->data->max_key); } } @@ -987,9 +986,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = - btree_type_successor(iter->btree_id, - n1->data->max_key); + n2->data->min_key = bkey_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 85a17225a68e..04537eb06e4a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -709,83 +709,107 @@ out: \ static int validate_bset(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors, - unsigned *whiteout_u64s, int write, - bool have_retry) + int write, bool have_retry) { - struct bkey_packed *k, *prev = NULL; - bool seen_non_whiteout = false; - unsigned version; + unsigned version = le16_to_cpu(i->version); const char *err; int ret = 0; + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + BTREE_ERR_FATAL, c, b, i, + "unsupported bset version"); + + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + + btree_err_on(b->written && !i->u64s, + BTREE_ERR_FIXABLE, c, b, i, + "empty bset"); + if (!b->written) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); /* These indicate that we read the wrong btree node: */ - btree_err_on(BTREE_NODE_ID(b->data) != b->c.btree_id, + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect btree id"); - btree_err_on(BTREE_NODE_LEVEL(b->data) != b->c.level, + 
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect level"); if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &b->data->ptr; + u64 *p = (u64 *) &bn->ptr; *p = swab64(*p); - bch2_bpos_swab(&b->data->min_key); - bch2_bpos_swab(&b->data->max_key); } + if (!write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, b, NULL, - "incorrect min_key"); + "incorrect min_key: got %llu:%llu should be %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + bp->min_key.inode, + bp->min_key.offset); } - btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), + btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, b, i, "incorrect max key"); + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + /* XXX: ideally we would be validating min_key too */ #if 0 /* * not correct anymore, due to btree node write error * handling * - * need to add b->data->seq to btree keys and verify + * need to add bn->seq to btree keys and verify * against that */ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - b->data->ptr), + bn->ptr), BTREE_ERR_FATAL, c, b, i, "incorrect backpointer"); #endif - err = bch2_bkey_format_validate(&b->data->format); + err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, BTREE_ERR_FATAL, c, b, i, "invalid bkey format: %s", err); - } - - version = le16_to_cpu(i->version); - btree_err_on((version != BCH_BSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, b, i, - "unsupported bset version"); - if (btree_err_on(b->written + sectors > c->opts.btree_node_size, - BTREE_ERR_FIXABLE, c, b, i, - "bset past end of btree node")) { - i->u64s = 0; - return 0; + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); } +fsck_err: + return ret; +} - btree_err_on(b->written && !i->u64s, - BTREE_ERR_FIXABLE, c, b, i, - "empty bset"); +static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned *whiteout_u64s, + int write, bool have_retry) +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; + bool seen_non_whiteout = false; + int ret = 0; if (!BSET_SEPARATE_WHITEOUTS(i)) { seen_non_whiteout = true; @@ -814,18 +838,14 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab_key(&b->format, k); - - if (!write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(btree_node_type(b), k, write); + /* XXX: validate k->u64s */ + if (!write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); u = __bkey_disassemble(b, k, &tmp); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab_val(u); - invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, u.s_c) ?: (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); @@ -842,9 +862,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } - if (write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(btree_node_type(b), k, write); + if (write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); /* * with the separate whiteouts thing (used for extents), the @@ -875,8 +896,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); fsck_err: return ret; } @@ -944,8 +963,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_node_old_extent_overwrite(b); sectors = vstruct_sectors(b->data, c->block_bits); - - btree_node_set_format(b, b->data->format); } else { bne = write_block(b); i = &bne->keys; @@ -969,11 +986,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, i, sectors, &whiteout_u64s, + ret = validate_bset(c, b, i, sectors, READ, have_retry); if (ret) goto fsck_err; + if (!b->written) + btree_node_set_format(b, b->data->format); + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, + READ, have_retry); + if (ret) + goto fsck_err; + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + b->written += sectors; blacklisted = bch2_journal_seq_is_blacklisted(c, @@ -1416,7 +1443,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) return -1; - ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); + ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); if (ret) bch2_inconsistent_error(c); @@ -1566,8 +1594,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < - bcachefs_metadata_version_bkey_renumber) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index a02e261c2eb2..fa996fdc5474 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_IO_H #define _BCACHEFS_BTREE_IO_H +#include "bkey_methods.h" #include "bset.h" #include "btree_locking.h" #include "extents.h" @@ -140,4 +141,50 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) { + swap(f->bits_per_field[BKEY_FIELD_INODE], + f->bits_per_field[BKEY_FIELD_OFFSET]); + swap(f->field_offset[BKEY_FIELD_INODE], + f->field_offset[BKEY_FIELD_OFFSET]); + } +} + +static inline void compat_bpos(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bpos *p) +{ + if (big_endian != CPU_BIG_ENDIAN) + bch2_bpos_swab(p); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) + swap(p->inode, 
p->offset); +} + +static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct btree_node *bn) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bn->min_key, POS_MIN) && + write) + bn->min_key = bkey_predecessor(bn->min_key); + + compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); + compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bn->min_key, POS_MIN) && + !write) + bn->min_key = bkey_successor(bn->min_key); +} + #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8b1395ef4d0e..4ce6a66edcd5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -39,7 +39,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(iter->pos, b->data->min_key) < 0; + return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, @@ -1284,10 +1284,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (btree_node_read_locked(iter, iter->level)) btree_node_unlock(iter, iter->level); - /* ick: */ - iter->pos = iter->btree_id == BTREE_ID_INODES - ? btree_type_successor(iter->btree_id, iter->pos) - : bkey_successor(iter->pos); + iter->pos = bkey_successor(iter->pos); iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -1395,8 +1392,8 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) iter->k.p = iter->pos = l->b->key.k.p; ret = bkey_cmp(iter->pos, POS_MAX) != 0; - if (ret) - iter->k.p = iter->pos = btree_type_successor(iter->btree_id, iter->pos); + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter->k.p = iter->pos = bkey_successor(iter->pos); btree_iter_pos_changed(iter, 1); return ret; @@ -1412,8 +1409,12 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ret = bkey_cmp(iter->pos, POS_MIN) != 0; - if (ret) - iter->k.p = iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + if (ret) { + iter->k.p = iter->pos = bkey_predecessor(iter->pos); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + iter->k.p = iter->pos = bkey_predecessor(iter->pos); + } btree_iter_pos_changed(iter, -1); return ret; @@ -1500,7 +1501,9 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); return bch2_btree_iter_peek(iter); } @@ -1553,7 +1556,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) if (k.k && bkey_deleted(k.k)) { bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); continue; } @@ -1582,7 +1587,9 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
iter->k.p + : bkey_successor(iter->k.p)); return bch2_btree_iter_peek_with_updates(iter); } @@ -1749,7 +1756,9 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); return bch2_btree_iter_peek_slot(iter); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1177bf118dbc..60baca62a596 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -172,32 +172,6 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline struct bpos btree_type_successor(enum btree_id id, - struct bpos pos) -{ - if (id == BTREE_ID_INODES) { - pos.inode++; - pos.offset = 0; - } else if (!btree_node_type_is_extents(id)) { - pos = bkey_successor(pos); - } - - return pos; -} - -static inline struct bpos btree_type_predecessor(enum btree_id id, - struct bpos pos) -{ - if (id == BTREE_ID_INODES) { - --pos.inode; - pos.offset = 0; - } else { - pos = bkey_predecessor(pos); - } - - return pos; -} - static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3f9605f2f1f4..f09423c83c4a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1193,7 +1193,7 @@ static struct btree *__btree_split_node(struct btree_update *as, BUG_ON(!prev); btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, btree_type_successor(n1->c.btree_id, n1->key.k.p)); + btree_set_min(n2, bkey_successor(n1->key.k.p)); set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 06e735fc69ec..1e6675f68b4a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -58,8 +58,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || - bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && + bkey_cmp(bkey_start_pos(&insert->k), + bkey_predecessor(b->data->min_key)) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->trans->c, b)); EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 8e5070d5a39b..2a7d913bdda3 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -115,7 +115,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, b = iter->l[0].b; node_iter = iter->l[0].iter; - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && + bkey_cmp(bkey_start_pos(&insert->k), + bkey_predecessor(b->data->min_key)) < 0); *end = bpos_min(insert->k.p, b->key.k.p); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1ac4f0522043..3c28f3aa9df7 100644 --- 
a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_iter.h" #include "buckets.h" #include "checksum.h" @@ -213,6 +214,22 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) +{ + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); + + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? bkey_predecessor(bp.v->min_key) + : bkey_successor(bp.v->min_key); +} + /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 70b7d70269dc..8ff2eac3ee2b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -371,6 +371,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ @@ -384,6 +386,7 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, .key_debugcheck = bch2_btree_ptr_debugcheck, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ } /* KEY_TYPE_extent: */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 822541e6adfc..c7508e81188c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1038,12 +1038,12 @@ retry: if (!ret) continue; - if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, + if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, "unreachable directory found (inum %llu)", - k.k->p.inode)) { + k.k->p.offset)) { bch2_trans_unlock(&trans); - ret = reattach_inode(c, lostfound_inode, k.k->p.inode); + ret = reattach_inode(c, lostfound_inode, k.k->p.offset); if (ret) { goto err; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c40ff6fc7ae2..758eda526674 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -98,7 +98,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, unsigned bytes; bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.inode = inode->bi_inum; + packed->inode.k.p.offset = inode->bi_inum; packed->inode.v.bi_hash_seed = inode->bi_hash_seed; packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); @@ -149,7 +149,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.inode; + unpacked->bi_inum = inode.k->p.offset; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); @@ -188,7 +188,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), BTREE_ITER_SLOTS|flags); if (IS_ERR(iter)) return iter; 
@@ -232,13 +232,13 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - if (k.k->p.offset) - return "nonzero offset"; + if (k.k->p.inode) + return "nonzero k.p.inode"; if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) return "incorrect value size"; - if (k.k->p.inode < BLOCKDEV_INODE_MAX) + if (k.k->p.offset < BLOCKDEV_INODE_MAX) return "fs inode in blockdev range"; if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) @@ -280,8 +280,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, const char *bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (k.k->p.offset) - return "nonzero offset"; + if (k.k->p.inode) + return "nonzero k.p.inode"; if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) return "incorrect value size"; @@ -383,9 +383,9 @@ int bch2_inode_create(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); again: - for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(start, 0), + for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode > max) + if (bkey_cmp(iter->pos, POS(0, max)) > 0) break; if (k.k->type != KEY_TYPE_inode) @@ -405,8 +405,8 @@ again: return -ENOSPC; found_slot: - *hint = k.k->p.inode; - inode_u->bi_inum = k.k->p.inode; + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); @@ -443,7 +443,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -475,10 +475,10 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) if (!bi_generation) { bkey_init(&delete.k); - delete.k.p.inode = inode_nr; + delete.k.p.offset = inode_nr; } else { bkey_inode_generation_init(&delete.k_i); - delete.k.p.inode = inode_nr; + delete.k.p.offset = inode_nr; delete.v.bi_generation = cpu_to_le32(bi_generation); } @@ -500,7 +500,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode_nr, 0), BTREE_ITER_SLOTS); + POS(0, inode_nr), BTREE_ITER_SLOTS); if (IS_ERR(iter)) return PTR_ERR(iter); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fc36385c7830..421fde39ac0e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "alloc_foreground.h" +#include "btree_io.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -137,7 +138,8 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - struct bkey_i *k, enum btree_node_type key_type, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, const char *type, int write) { void *next = vstruct_next(entry); @@ -170,16 +172,13 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, return 0; } - if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) { - bch2_bkey_swab_key(NULL, bkey_to_packed(k)); - bch2_bkey_swab_val(bkey_i_to_s(k)); - } - - if (!write && - version < bcachefs_metadata_version_bkey_renumber) - 
bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + if (!write) + bch2_bkey_compat(level, btree_id, version, + JSET_BIG_ENDIAN(jset), write, + NULL, bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); if (invalid) { char buf[160]; @@ -193,9 +192,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, return 0; } - if (write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + if (write) + bch2_bkey_compat(level, btree_id, version, + JSET_BIG_ENDIAN(jset), write, + NULL, bkey_to_packed(k)); fsck_err: return ret; } @@ -208,10 +208,10 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, struct bkey_i *k; vstruct_for_each(entry, k) { - int ret = journal_validate_key(c, jset, entry, k, - __btree_node_type(entry->level, - entry->btree_id), - "key", write); + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, "key", write); if (ret) return ret; } @@ -241,7 +241,7 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, return 0; } - return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, "btree root", write); fsck_err: return ret; @@ -1017,8 +1017,7 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) < - bcachefs_metadata_version_bkey_renumber) + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) validate_before_checksum = true; if (validate_before_checksum && -- cgit From 2c31e6572ec6f3fede0d4bb54f342bafba90fe70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Mar 2020 17:01:05 -0400 Subject: bcachefs: Reduce max nr of btree iters when lockdep is on This is so we don't overflow MAX_LOCK_DEPTH. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a794f9fe4fce..71b6b36e513d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -275,7 +275,11 @@ struct btree_insert_entry { struct btree_iter *iter; }; +#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 +#else +#define BTREE_ITER_MAX 32 +#endif struct btree_trans { struct bch_fs *c; -- cgit From a0e491c099a25d06759bf8e6e9bcc6fd7c4229a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Mar 2020 12:33:30 -0400 Subject: bcachefs: Don't allocate memory while holding journal reservation This fixes a lockdep splat - allocating memory can call bch2_clear_page_bits() which takes mark_lock. 
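In outline (a sketch of the shape of the change, not the literal hunk), btree_update_nodes_written() moves the allocation-prone btree node write out from under the journal reservation:

	/* before: with the journal reservation still held */
	btree_node_write_if_need(c, b, SIX_LOCK_intent);	/* may allocate */
	bch2_journal_res_put(&c->journal, &res);

	/* after: drop the reservation first, then write the node */
	bch2_journal_res_put(&c->journal, &res);
	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	if (b) {
		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		six_unlock_intent(&b->c.lock);
	}

so any memory allocation done by the node write can no longer recurse into reclaim while the journal reservation is held.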
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f09423c83c4a..e8dd19cae7ca 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -713,13 +713,6 @@ again: bch2_btree_add_journal_pin(c, b, res.seq); six_unlock_write(&b->c.lock); - - /* - * b->write_blocked prevented it from being written, so - * write it now if it needs to be written: - */ - btree_node_write_if_need(c, b, SIX_LOCK_intent); - six_unlock_intent(&b->c.lock); break; case BTREE_INTERIOR_UPDATING_AS: @@ -746,6 +739,16 @@ again: bch2_journal_res_put(&c->journal, &res); bch2_journal_preres_put(&c->journal, &as->journal_preres); + /* Do btree write after dropping journal res: */ + if (b) { + /* + * b->write_blocked prevented it from being written, so + * write it now if it needs to be written: + */ + btree_node_write_if_need(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->c.lock); + } + btree_update_nodes_reachable(as, res.seq); free_update: __bch2_btree_update_free(as); -- cgit From d06c1a0cbce929c0bafd7f37dcb0fbf9e652abb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Mar 2020 16:48:53 -0400 Subject: bcachefs: Check btree topology at startup When initial btree gc was changed to overlay journal keys as it walks the btree, it also stopped checking btree topology. Previously, checking btree topology was a fairly complicated affair - but it's much easier now that btree_ptr_v2 has min_key in the pointer. This rewrites the old range_checks code and uses it in both runtime and initial gc. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 162 +++++++++++++++++++++++++++---------------------- 1 file changed, 91 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8a832e92b6a2..6220ec9b540b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -49,64 +49,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* range_checks - for validating min/max pos of each btree node: */ - -struct range_checks { - struct range_level { - struct bpos min; - struct bpos max; - } l[BTREE_MAX_DEPTH]; - unsigned depth; -}; - -static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) +static int bch2_gc_check_topology(struct bch_fs *c, + struct bkey_s_c k, + struct bpos *expected_start, + struct bpos expected_end, + bool is_last) { - unsigned i; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - r->l[i].min = r->l[i].max = POS_MIN; - r->depth = depth; -} - -static void btree_node_range_checks(struct bch_fs *c, struct btree *b, - struct range_checks *r) -{ - struct range_level *l = &r->l[b->c.level]; - - struct bpos expected_min = bkey_cmp(l->min, l->max) - ? 
bkey_successor(l->max) - : l->max; - - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, - "btree node has incorrect min key: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - expected_min.inode, - expected_min.offset); - - l->max = b->data->max_key; - - if (b->c.level > r->depth) { - l = &r->l[b->c.level - 1]; + int ret = 0; - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, - "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - l->min.inode, - l->min.offset); + if (k.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, - "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset, - l->max.inode, - l->max.offset); + if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, + "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", + bp.v->min_key.inode, + bp.v->min_key.offset, + expected_start->inode, + expected_start->offset)) { + BUG(); + } + } - if (bkey_cmp(b->data->max_key, POS_MAX)) - l->min = l->max = - bkey_successor(b->data->max_key); + *expected_start = bkey_cmp(k.k->p, POS_MAX) + ? bkey_successor(k.k->p) + : k.k->p; + + if (fsck_err_on(is_last && + bkey_cmp(k.k->p, expected_end), c, + "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", + k.k->p.inode, + k.k->p.offset, + expected_end.inode, + expected_end.offset)) { + BUG(); } +fsck_err: + return ret; } /* marking of btree keys/nodes: */ @@ -188,6 +166,7 @@ fsck_err: static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { + struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -198,13 +177,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; - for_each_btree_node_key_unpack(b, k, &iter, - &unpacked) { + bch2_btree_node_iter_init_from_start(&iter, b); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); if (ret) break; + + bch2_btree_node_iter_advance(&iter, b); + + if (b->c.level) { + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } } return ret; @@ -216,7 +207,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_trans trans; struct btree_iter *iter; struct btree *b; - struct range_checks r; unsigned depth = metadata_only ? 1 : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 @@ -228,12 +218,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - btree_node_range_checks_init(&r, depth); - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); @@ -275,11 +261,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, - unsigned target_depth) + struct journal_keys *journal_keys, + unsigned target_depth) { struct btree_and_journal_iter iter; struct bkey_s_c k; + struct bpos next_node_start = b->data->min_key; u8 max_stale = 0; int ret = 0; @@ -288,28 +275,46 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_debugcheck(c, b, k); + BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + ret = bch2_gc_mark_key(c, k, &max_stale, true); if (ret) break; - if (b->c.level > target_depth) { + if (b->c.level) { struct btree *child; BKEY_PADDED(k) tmp; bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; - bch2_gc_btree_init_recurse(c, child, - journal_keys, target_depth); - six_unlock_read(&child->c.lock); - } + if (b->c.level > target_depth) { + child = bch2_btree_node_get_noiter(c, &tmp.k, + b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; - bch2_btree_and_journal_iter_advance(&iter); + ret = bch2_gc_btree_init_recurse(c, child, + journal_keys, target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; + } + } else { + bch2_btree_and_journal_iter_advance(&iter); + } } return ret; @@ -334,6 +339,20 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); + if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset)) { + BUG(); + } + + if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->max_key.inode, + b->data->max_key.offset)) { + BUG(); + } + if (b->c.level >= target_depth) ret = bch2_gc_btree_init_recurse(c, b, journal_keys, target_depth); @@ -341,6 +360,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, if (!ret) ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), &max_stale, true); +fsck_err: six_unlock_read(&b->c.lock); return ret; -- cgit From e5e6aaa7979ab588868f686c8bd8997264492610 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Mar 2020 14:05:05 -0400 Subject: bcachefs: Fix ec_stripe_update_ptrs() bch2_btree_iter_set_pos() invalidates the key returned by peek(). 
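A minimal standalone sketch of the pattern the fix follows (hypothetical types, not the real iterator API): the value returned by peek points into storage the iterator reuses, so it must be copied before the iterator is repositioned, and the update built from the copy.

  #include <string.h>

  struct key { unsigned long long offset; unsigned size; };

  struct iter {
      struct key cur;                 /* storage that peek hands out pointers into */
      unsigned long long pos;
  };

  static const struct key *iter_peek(struct iter *it)
  {
      it->cur.offset = it->pos;
      it->cur.size   = 8;
      return &it->cur;
  }

  static void iter_set_pos(struct iter *it, unsigned long long pos)
  {
      it->pos = pos;
      memset(&it->cur, 0, sizeof(it->cur));    /* old peek result is now stale */
  }

  static void update_one(struct iter *it)
  {
      struct key copy = *iter_peek(it);        /* take a copy first ... */

      iter_set_pos(it, copy.offset);           /* ... then moving the iterator is safe */
      /* ... build and emit the update from 'copy', not from the peek pointer ... */
  }
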
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index bc11f7e056eb..909a4a5036ab 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -804,8 +804,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, continue; } - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - dev = s->key.v.ptrs[idx].dev; bkey_on_stack_reassemble(&sk, c, k); @@ -820,6 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, idx); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, -- cgit From 1d60b99999bc2abd4020b758794ced8af43394ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Mar 2020 14:29:06 -0400 Subject: bcachefs: Fix inodes pass in fsck It wasn't updated for the patch that switched inodes to using the offset field of struct bkey. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c7508e81188c..3ab621c62c43 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1353,18 +1353,18 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, - POS(range_start, 0), 0); + POS(0, range_start), 0); nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && !(ret2 = bkey_err(k))) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - if (!link && (!k.k || iter->pos.inode >= range_end)) + if (!link && (!k.k || iter->pos.offset >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter->pos.inode > nlinks_pos) { + if (iter->pos.offset > nlinks_pos) { /* Should have been caught by dirents pass: */ need_fsck_err_on(link && link->count, c, "missing inode %llu (nlink %u)", @@ -1373,7 +1373,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); goto peek_nlinks; } - if (iter->pos.inode < nlinks_pos || !link) + if (iter->pos.offset < nlinks_pos || !link) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { @@ -1389,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); nlinks_pos, link->count); } - if (nlinks_pos == iter->pos.inode) + if (nlinks_pos == iter->pos.offset) genradix_iter_advance(&nlinks_iter, links); bch2_btree_iter_next(iter); -- cgit From 11f6ed36b959131a0d990253f07e5105fc4d8901 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Mar 2020 17:43:21 -0400 Subject: bcachefs: Fix a locking bug Dropping the wrong kind of lock can't lead to anything good... 
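A simplified sketch of the rule (toy lock, hypothetical names): a helper that temporarily drops the caller's lock must release and retake the lock type the caller actually holds, rather than assuming it was a read lock.

  #include <assert.h>

  enum lock_type { LOCK_read, LOCK_intent };

  struct lock { int readers; int intent; };

  static void lock_acquire_t(struct lock *l, enum lock_type t)
  {
      if (t == LOCK_read) l->readers++; else l->intent++;
  }

  static void lock_release_t(struct lock *l, enum lock_type t)
  {
      if (t == LOCK_read) { assert(l->readers > 0); l->readers--; }
      else                { assert(l->intent  > 0); l->intent--;  }
  }

  static void wait_on_io(struct lock *l, enum lock_type held)
  {
      lock_release_t(l, held);    /* not lock_release_t(l, LOCK_read) */
      /* ... wait for the write to complete ... */
      lock_acquire_t(l, held);
  }
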
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index fa996fdc5474..f3d7ec749b61 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -114,7 +114,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, break; } - six_unlock_read(&b->c.lock); + six_unlock_type(&b->c.lock, lock_held); btree_node_wait_on_io(b); btree_node_lock_type(c, b, lock_held); } -- cgit From b58a181d5c4a145730f202ac0375fa463c88f710 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Mar 2020 18:11:13 -0400 Subject: bcachefs: Fix iterating of journal keys within a btree node Extent btrees no longer have weird special behaviour for min_key. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c0e6cfa36c89..8e9d412a6000 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -183,17 +183,12 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i struct journal_keys *journal_keys, struct btree *b) { - struct bpos start = b->data->min_key; - - if (btree_node_type_is_extents(b->c.btree_id)) - start = bkey_successor(start); - memset(iter, 0, sizeof(*iter)); iter->b = b; bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); bch2_journal_iter_init(&iter->journal, journal_keys, - b->c.btree_id, b->c.level, start); + b->c.btree_id, b->c.level, b->data->min_key); } /* sort and dedup all keys in the journal: */ -- cgit From 501e1bda3e58db75eaf938fde70b03639dd6282a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Mar 2020 16:23:43 -0400 Subject: bcachefs: Fix journalling of interior node updates We weren't journalling updates done while splitting/compacting nodes - oops. 
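In outline, the change makes every interior-node insert append its key to a preallocated per-update scratch buffer, which is later copied into the journal in one memcpy. A much-simplified sketch of that bookkeeping (hypothetical layout, not the real jset_entry format):

  #include <string.h>
  #include <assert.h>

  #define SCRATCH_U64S 64

  struct entry_hdr { unsigned short u64s; unsigned char level; unsigned char btree_id; };

  struct pending_update {
      unsigned long long buf[SCRATCH_U64S];   /* entries to journal at commit time */
      unsigned nr_u64s;
  };

  static void append_key(struct pending_update *as,
                         unsigned char btree_id, unsigned char level,
                         const unsigned long long *key, unsigned key_u64s)
  {
      unsigned total = 1 + key_u64s;          /* header fits in one u64 slot here */
      struct entry_hdr *h;

      assert(as->nr_u64s + total <= SCRATCH_U64S);

      memset(&as->buf[as->nr_u64s], 0, sizeof(as->buf[0]));
      h = (struct entry_hdr *) &as->buf[as->nr_u64s];
      h->u64s     = (unsigned short) key_u64s;
      h->level    = level;
      h->btree_id = btree_id;
      memcpy(&as->buf[as->nr_u64s + 1], key, key_u64s * sizeof(*key));

      as->nr_u64s += total;
  }
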
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 99 +++++++++++++++++++------------------ fs/bcachefs/btree_update_interior.h | 4 ++ 2 files changed, 54 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e8dd19cae7ca..d5bea4f8ea02 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -27,43 +27,37 @@ static void btree_update_drop_new_node(struct bch_fs *, struct btree *); /* Debug code: */ +/* + * Verify that child nodes correctly span parent node's range: + */ static void btree_node_interior_verify(struct btree *b) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos next_node = b->data->min_key; struct btree_node_iter iter; - struct bkey_packed *k; + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; BUG_ON(!b->c.level); - bch2_btree_node_iter_init(&iter, b, &b->key.k.p); -#if 1 - BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || - bkey_cmp_left_packed(b, k, &b->key.k.p)); + bch2_btree_node_iter_init_from_start(&iter, b); - BUG_ON((bch2_btree_node_iter_advance(&iter, b), - !bch2_btree_node_iter_end(&iter))); -#else - const char *msg; + while (1) { + k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + bp = bkey_s_c_to_btree_ptr_v2(k); - msg = "not found"; - k = bch2_btree_node_iter_peek(&iter, b); - if (!k) - goto err; + BUG_ON(bkey_cmp(next_node, bp.v->min_key)); - msg = "isn't what it should be"; - if (bkey_cmp_left_packed(b, k, &b->key.k.p)) - goto err; + bch2_btree_node_iter_advance(&iter, b); - bch2_btree_node_iter_advance(&iter, b); + if (bch2_btree_node_iter_end(&iter)) { + BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + break; + } - msg = "isn't last key"; - if (!bch2_btree_node_iter_end(&iter)) - goto err; - return; -err: - bch2_dump_btree_node(b); - printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, - b->key.k.p.offset, msg); - BUG(); + next_node = bkey_successor(k.k->p); + } #endif } @@ -644,8 +638,6 @@ static void btree_update_nodes_written(struct closure *cl) struct bch_fs *c = as->c; struct btree *b; struct bset *i; - struct bkey_i *k; - unsigned journal_u64s = 0; int ret; /* @@ -674,13 +666,7 @@ again: list_del(&as->unwritten_list); - journal_u64s = 0; - - if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) - for_each_keylist_key(&as->parent_keys, k) - journal_u64s += jset_u64s(k->k.u64s); - - ret = bch2_journal_res_get(&c->journal, &res, journal_u64s, + ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, JOURNAL_RES_GET_RESERVED); if (ret) { BUG_ON(!bch2_journal_error(&c->journal)); @@ -688,13 +674,14 @@ again: goto free_update; } - if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) - for_each_keylist_key(&as->parent_keys, k) - bch2_journal_add_entry(&c->journal, &res, - BCH_JSET_ENTRY_btree_keys, - as->btree_id, - as->level, - k, k->k.u64s); + { + struct journal_buf *buf = &c->journal.buf[res.idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res.offset); + + res.offset += as->journal_u64s; + res.u64s -= as->journal_u64s; + memcpy_u64s(entry, as->journal_entries, as->journal_u64s); + } switch (as->mode) { case BTREE_INTERIOR_NO_UPDATE: @@ -983,7 +970,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, bch2_keylist_init(&as->parent_keys, as->inline_keys); ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0); + ARRAY_SIZE(as->journal_entries), 0); if (ret) { bch2_btree_reserve_put(c, reserve); 
closure_debug_destroy(&as->cl); @@ -1103,10 +1090,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b { struct bch_fs *c = as->c; struct bch_fs_usage_online *fs_usage; + struct jset_entry *entry; struct bkey_packed *k; struct bkey tmp; - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + entry = (void *) &as->journal_entries[as->journal_u64s]; + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(insert->k.u64s); + entry->type = BCH_JSET_ENTRY_btree_keys; + entry->btree_id = b->c.btree_id; + entry->level = b->c.level; + memcpy_u64s_small(entry->_data, insert, insert->k.u64s); + as->journal_u64s += jset_u64s(insert->k.u64s); mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); @@ -1255,6 +1253,14 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; + /* + * XXX + * + * these updates must be journalled + * + * oops + */ + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); bch2_btree_node_iter_init(&node_iter, b, &k->k.p); @@ -1262,11 +1268,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); - BUG_ON(bch_keylist_u64s(keys) > - bch_btree_keys_u64s_remaining(as->c, b)); - BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); - bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); bch2_keylist_pop_front(keys); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4a2ea69f6a2c..d3498aed145b 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -104,6 +104,10 @@ struct btree_update { struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; unsigned nr_new_nodes; + unsigned journal_u64s; + u64 journal_entries[ + (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2]; + /* Only here to reduce stack usage on recursive splits: */ struct keylist parent_keys; /* -- cgit From 59a38a384443cd3c7343d25bd4b7cc2ccf2a8aef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Mar 2020 16:25:30 -0400 Subject: bcachefs: Add print method for bch2_btree_ptr_v2 Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 15 +++++++++++++++ fs/bcachefs/extents.h | 5 ++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3c28f3aa9df7..1189c6107c88 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -214,6 +214,21 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + pr_buf(out, "seq %llu sectors %u written %u min_key ", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors), + le16_to_cpu(bp.v->sectors_written)); + + bch2_bpos_to_text(out, bp.v->min_key); + pr_buf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); +} + void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 8ff2eac3ee2b..29b15365d19c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -371,6 
+371,9 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -384,7 +387,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ .key_debugcheck = bch2_btree_ptr_debugcheck, \ - .val_to_text = bch2_btree_ptr_to_text, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ } -- cgit From 283eda5798bc5f4f593817e057c14f02790e5bdb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Apr 2020 16:07:57 -0400 Subject: bcachefs: Fix fallocate FL_INSERT_RANGE This was another bug because of bch2_btree_iter_set_pos() invalidating iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 726c55072b7b..e713c83530cc 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2484,10 +2484,8 @@ reassemble: bkey_on_stack_reassemble(©, c, k); if (insert && - bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) bch2_cut_front(move_pos, copy.k); - bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); - } copy.k->k.p.offset += shift >> 9; bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); @@ -2507,8 +2505,9 @@ reassemble: } bkey_init(&delete.k); - delete.k.p = src->pos; - bch2_key_resize(&delete.k, copy.k->k.size); + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift >> 9; next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2529,6 +2528,8 @@ reassemble: BUG_ON(ret); } + bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); + ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, -- cgit From 0329b1507d37c8a7f87dace9ad888cc5abb7c8a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Apr 2020 17:14:14 -0400 Subject: bcachefs: Trace where btree iterators are allocated This will help with iterator overflow bugs. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 13 +++++++------ fs/bcachefs/btree_iter.h | 31 ++++++++++++++++++++++++++++--- fs/bcachefs/btree_types.h | 1 + 3 files changed, 36 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4ce6a66edcd5..7b12bd163df7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1912,13 +1912,14 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s", + pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf", bch2_btree_ids[iter->btree_id], iter->pos.inode, iter->pos.offset, (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : ""); + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); } panic("trans iter oveflow\n"); @@ -2025,9 +2026,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, return iter; } -struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags) +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags) { struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, flags); @@ -2064,7 +2065,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, return iter; } -struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 60baca62a596..928170afe3b5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -257,10 +257,35 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *); -struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned); -struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned); + +static inline struct btree_iter * +bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + struct btree_iter *iter = + __bch2_trans_get_iter(trans, btree_id, pos, flags); + + if (!IS_ERR(iter)) + iter->ip_allocated = _THIS_IP_; + return iter; +} + +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *); +static inline struct btree_iter * +bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +{ + struct btree_iter *iter = + __bch2_trans_copy_iter(trans, src); + + if (!IS_ERR(iter)) + iter->ip_allocated = _THIS_IP_; + return iter; + +} + struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 71b6b36e513d..57796340fb36 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -256,6 +256,7 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; + unsigned long ip_allocated; }; static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) -- cgit From 47c46c953163909944cd8ebf7e12107635fdb604 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Apr 2020 17:28:39 -0400 Subject: bcachefs: Add another mssing bch2_trans_iter_put() call Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3cada7cc354a..4458a98b78ee 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -145,8 +145,6 @@ retry: &inode->ei_journal_seq, BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - goto retry; /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -155,6 +153,11 @@ retry: if (!ret) bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_trans_iter_put(&trans, iter); + + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); return ret < 0 ? 
ret : 0; } -- cgit From 75923ba7ad56f7236ae3979577011f5220d07d50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Apr 2020 13:54:19 -0400 Subject: bcachefs: Fix a null ptr deref during journal replay We were calling bch2_extent_can_insert() incorrectly; it should only be called when the extents-to-keys pass is running because that's when we could be splitting a compressed extent. Calling bch2_extent_can_insert() without passing in a disk reservation was causing a null ptr deref. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1e6675f68b4a..b9283ced4cae 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -309,7 +309,7 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_old_extent_overwrite(b))) return BTREE_INSERT_BTREE_NODE_FULL; - ret = !btree_node_is_extents(b) + ret = !(iter->flags & BTREE_ITER_IS_EXTENTS) ? BTREE_INSERT_OK : bch2_extent_can_insert(trans, iter, insert); if (ret) -- cgit From 8707ab0df25955f4e11bf60643e08c018a72e6fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Apr 2020 15:45:06 -0400 Subject: bcachefs: Fix another error path locking bug btree_update_nodes_written() was leaking a btree node lock on failure to get a journal reservation. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d5bea4f8ea02..8b31f4d63094 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -725,7 +725,7 @@ again: bch2_journal_res_put(&c->journal, &res); bch2_journal_preres_put(&c->journal, &as->journal_preres); - +free_update: /* Do btree write after dropping journal res: */ if (b) { /* @@ -736,8 +736,9 @@ again: six_unlock_intent(&b->c.lock); } - btree_update_nodes_reachable(as, res.seq); -free_update: + if (!ret) + btree_update_nodes_reachable(as, res.seq); + __bch2_btree_update_free(as); /* * for flush_held_btree_writes() waiting on updates to flush or -- cgit From 2aec5955bb7c800a4eec685dcd58976013da1275 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Apr 2020 15:49:42 -0400 Subject: bcachefs: Fix a debug assertion This assertion was passing the wrong btree node type when inserting into interior nodes. 
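The underlying point is that interior nodes of every btree hold btree pointers, so the key type used for validation has to be derived from both the level and the btree ID. A simplified sketch (hypothetical names):

  enum btree_id { BTREE_EXTENTS, BTREE_INODES, BTREE_DIRENTS };
  enum key_type { KEY_EXTENTS, KEY_INODES, KEY_DIRENTS, KEY_BTREE_PTRS };

  /* Leaves hold the btree's own key type; interior nodes hold btree pointers: */
  static enum key_type node_key_type(unsigned level, enum btree_id id)
  {
      return level ? KEY_BTREE_PTRS : (enum key_type) id;
  }
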
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b9283ced4cae..a93bc1890263 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -250,7 +250,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + __btree_node_type(iter->level, iter->btree_id))); } static noinline int -- cgit From 1e3b1f9a229df5cd8f21e2f6306fbcf25374f42a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Apr 2020 16:47:59 -0400 Subject: bcachefs: Fix a debug mode assertion Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8b31f4d63094..677be20f7965 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -45,6 +45,8 @@ static void btree_node_interior_verify(struct btree *b) while (1) { k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + if (k.k->type != KEY_TYPE_btree_ptr_v2) + break; bp = bkey_s_c_to_btree_ptr_v2(k); BUG_ON(bkey_cmp(next_node, bp.v->min_key)); -- cgit From 0f9dda478fb50bb7c55e4760158a6ef7cca7cb04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Apr 2020 21:49:17 -0400 Subject: bcachefs: Fix a deadlock on starting an interior btree update Not legal to block on a journal prereservation with btree locks held. 
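The fix follows the usual trylock-first shape: attempt the prereservation nonblocking, and only block on it after dropping the transaction's btree locks, restarting if they cannot be retaken. A standalone sketch of that shape (pthread stand-ins, not the bcachefs API); in the patch itself the BTREE_INSERT_NOUNLOCK case simply returns an error instead of dropping locks.

  #include <pthread.h>
  #include <errno.h>
  #include <stdbool.h>

  static pthread_mutex_t prereservation = PTHREAD_MUTEX_INITIALIZER;  /* stand-in */

  static void trans_unlock(void) { /* drop all btree node locks */ }
  static bool trans_relock(void) { /* try to retake them */ return true; }

  static int get_prereservation(void)
  {
      if (pthread_mutex_trylock(&prereservation) == 0)
          return 0;                          /* fast path, no blocking needed */

      trans_unlock();                        /* never block while holding node locks */
      pthread_mutex_lock(&prereservation);

      if (!trans_relock()) {
          pthread_mutex_unlock(&prereservation);
          return -EINTR;                     /* caller restarts the transaction */
      }
      return 0;
  }
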
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_update_interior.c | 40 ++++++++++++++++++++++++------------- fs/bcachefs/btree_update_interior.h | 8 +++++--- 3 files changed, 32 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6220ec9b540b..e4c1b90f3cb5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -935,7 +935,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, return; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 677be20f7965..daa4c0716c05 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -949,14 +949,34 @@ void bch2_btree_update_done(struct btree_update *as) } struct btree_update * -bch2_btree_update_start(struct bch_fs *c, enum btree_id id, +bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, unsigned nr_nodes, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; + struct journal_preres journal_preres = { 0 }; struct btree_reserve *reserve; struct btree_update *as; int ret; + ret = bch2_journal_preres_get(&c->journal, &journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { + bch2_trans_unlock(trans); + + ret = bch2_journal_preres_get(&c->journal, &journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ERR_PTR(ret); + + if (!bch2_trans_relock(trans)) { + bch2_journal_preres_put(&c->journal, &journal_preres); + return ERR_PTR(-EINTR); + } + } + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); if (IS_ERR(reserve)) return ERR_CAST(reserve); @@ -969,18 +989,10 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, as->btree_id = id; as->reserve = reserve; INIT_LIST_HEAD(&as->write_blocked_list); + as->journal_preres = journal_preres; bch2_keylist_init(&as->parent_keys, as->inline_keys); - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - ARRAY_SIZE(as->journal_entries), 0); - if (ret) { - bch2_btree_reserve_put(c, reserve); - closure_debug_destroy(&as->cl); - mempool_free(as, &c->btree_interior_update_pool); - return ERR_PTR(ret); - } - mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); @@ -1551,7 +1563,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, goto out; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(trans, iter->btree_id, btree_update_reserve_required(c, b), flags, !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); if (IS_ERR(as)) { @@ -1663,7 +1675,7 @@ retry: goto err_unlock; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(trans, iter->btree_id, btree_update_reserve_required(c, parent) + 1, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, @@ -1776,7 +1788,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, struct btree *n, *parent = btree_node_parent(iter, b); struct btree_update *as; - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, (parent ? 
btree_update_reserve_required(c, parent) : 0) + 1, @@ -2043,7 +2055,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, new_hash = bch2_btree_node_mem_alloc(c); } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? btree_update_reserve_required(c, parent) : 0, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index d3498aed145b..fb35be00f1bb 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -32,6 +32,9 @@ struct pending_btree_node_free { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; +#define BTREE_UPDATE_JOURNAL_RES \ + ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2) + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -105,8 +108,7 @@ struct btree_update { unsigned nr_new_nodes; unsigned journal_u64s; - u64 journal_entries[ - (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2]; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; /* Only here to reduce stack usage on recursive splits: */ struct keylist parent_keys; @@ -132,7 +134,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, void bch2_btree_update_done(struct btree_update *); struct btree_update * -bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, +bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, unsigned, struct closure *); void bch2_btree_interior_update_will_free_node(struct btree_update *, -- cgit From e77e4efce31f3739fa85a84c5197b7cd18747a64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Apr 2020 13:49:14 -0400 Subject: bcachefs: Account for ioclock slop when throttling rebalance thread This should fix an issue where the rebalance thread was spinning Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index a0a75cfa41cb..eb3f7d02c791 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -204,17 +204,21 @@ static int bch2_rebalance_thread(void *arg) prev_run_time; if (w.dev_most_full_percent < 20 && throttle > 0) { - r->state = REBALANCE_THROTTLED; r->throttled_until_iotime = io_start + div_u64(w.dev_most_full_capacity * (20 - w.dev_most_full_percent), 50); - r->throttled_until_cputime = start + throttle; - bch2_kthread_io_clock_wait(clock, - r->throttled_until_iotime, - throttle); - continue; + if (atomic_long_read(&clock->now) + clock->max_slop < + r->throttled_until_iotime) { + r->throttled_until_cputime = start + throttle; + r->state = REBALANCE_THROTTLED; + + bch2_kthread_io_clock_wait(clock, + r->throttled_until_iotime, + throttle); + continue; + } } /* minimum 1 mb/sec: */ -- cgit From 1eba942d1c48a9d3dadbb04f65be7705f506e40b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Apr 2020 17:31:38 -0400 Subject: bcachefs: Fix a locking bug in bch2_btree_ptr_debugcheck() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1189c6107c88..52beaab227ef 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -179,7 +179,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct 
bkey_s_c k) return; bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, + !bch2_bkey_replicas_marked_locked(c, k, false), c, "btree key bad (replicas not marked in superblock):\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -- cgit From 58fb3e519a7612a01d9fc969aa4eca56b30d898e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Apr 2020 17:27:12 -0400 Subject: bcachefs: Fix another deadlock in the btree interior update path Can't take read locks on btree nodes while holding btree_interior_update_lock. Also, fix a bug where we were leaking journal prereservations. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 71 ++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index daa4c0716c05..772595b3da9f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -609,33 +609,11 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) -{ - struct bch_fs *c = as->c; - - while (as->nr_new_nodes) { - struct btree *b = as->new_nodes[--as->nr_new_nodes]; - - BUG_ON(b->will_make_reachable != (unsigned long) as); - b->will_make_reachable = 0; - - /* - * b->will_make_reachable prevented it from being written, so - * write it now if it needs to be written: - */ - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); - six_unlock_read(&b->c.lock); - } - - while (as->nr_pending) - bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], - seq); -} - static void btree_update_nodes_written(struct closure *cl) { struct btree_update *as = container_of(cl, struct btree_update, cl); + struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + unsigned nr_new_nodes; struct journal_res res = { 0 }; struct bch_fs *c = as->c; struct btree *b; @@ -650,6 +628,7 @@ static void btree_update_nodes_written(struct closure *cl) mutex_lock(&c->btree_interior_update_lock); as->nodes_written = true; again: + nr_new_nodes = 0; as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, struct btree_update, unwritten_list); if (!as || !as->nodes_written) { @@ -738,8 +717,23 @@ free_update: six_unlock_intent(&b->c.lock); } - if (!ret) - btree_update_nodes_reachable(as, res.seq); + if (!ret) { + nr_new_nodes = as->nr_new_nodes; + memcpy(new_nodes, + as->new_nodes, + as->nr_new_nodes * sizeof(struct btree *)); + + while (as->nr_new_nodes) { + struct btree *b = as->new_nodes[--as->nr_new_nodes]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; + } + + while (as->nr_pending) + bch2_btree_node_free_ondisk(c, + &as->pending[--as->nr_pending], res.seq); + } __bch2_btree_update_free(as); /* @@ -747,6 +741,20 @@ free_update: * nodes to be writeable: */ closure_wake_up(&c->btree_interior_update_wait); + + /* + * Can't take btree node locks while holding btree_interior_update_lock: + * */ + mutex_unlock(&c->btree_interior_update_lock); + + while (nr_new_nodes) { + struct btree *b = new_nodes[--nr_new_nodes]; + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); + six_unlock_read(&b->c.lock); + } + + mutex_lock(&c->btree_interior_update_lock); goto 
again; } @@ -963,11 +971,13 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, BTREE_UPDATE_JOURNAL_RES, JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); + bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, &journal_preres, - BTREE_UPDATE_JOURNAL_RES, - JOURNAL_RES_GET_NONBLOCK); + BTREE_UPDATE_JOURNAL_RES, 0); if (ret) return ERR_PTR(ret); @@ -978,8 +988,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, } reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); - if (IS_ERR(reserve)) + if (IS_ERR(reserve)) { + bch2_journal_preres_put(&c->journal, &journal_preres); return ERR_CAST(reserve); + } as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); @@ -1677,6 +1689,7 @@ retry: as = bch2_btree_update_start(trans, iter->btree_id, btree_update_reserve_required(c, parent) + 1, + flags| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); -- cgit From 94035eed52f58a321fa28e938898535973bec847 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Apr 2020 12:29:32 -0400 Subject: bcachefs: Fix a locking bug in bch2_journal_pin_copy() There was a race where the src pin would be flushed - releasing the last pin on that sequence number - before adding the new journal pin. Oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5233cb82d422..341106ab4a77 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -322,14 +322,12 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -void __bch2_journal_pin_add(struct journal *j, u64 seq, +static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - spin_lock(&j->lock); - __journal_pin_drop(j, pin); BUG_ON(!atomic_read(&pin_list->count)); @@ -339,7 +337,14 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq, pin->flush = flush_fn; list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); +} +void __bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + bch2_journal_pin_add_locked(j, seq, pin, flush_fn); spin_unlock(&j->lock); /* @@ -354,9 +359,13 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { + spin_lock(&j->lock); + if (journal_pin_active(src) && (!journal_pin_active(dst) || src->seq < dst->seq)) - __bch2_journal_pin_add(j, src->seq, dst, flush_fn); + bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); + + spin_unlock(&j->lock); } /** -- cgit From 15a07f2eae6852dc4f3c1172601d592be0f5756f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Apr 2020 12:30:30 -0400 Subject: bcachefs: Improve lockdep annotation in journalling code bch2_journal_res_get() in nonblocking mode is equivalent to a trylock. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index f8867f86318a..81e26ba43fa1 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -346,7 +346,9 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re return ret; out: if (!(flags & JOURNAL_RES_GET_CHECK)) { - lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + lock_acquire_shared(&j->res_map, 0, + (flags & JOURNAL_RES_GET_NONBLOCK) != 0, + NULL, _THIS_IP_); EBUG_ON(!res->ref); } return 0; -- cgit From f270667a7fc020f1711953ad3b0d6e6b38eba834 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Apr 2020 12:31:16 -0400 Subject: bcachefs: Slightly reduce btree split threshold 2/3rds performs a lot better than 3/4ths on the tested workload, leading to significantly fewer btree node compactions. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index abde6c2658c6..2160012c734f 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -94,7 +94,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) return c->opts.btree_node_size >> c->block_bits; } -#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4) +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ -- cgit From 297604c92337cd546f41a38f53d420093f7ce963 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Apr 2020 12:32:27 -0400 Subject: bcachefs: Add a few tracepoints Transaction restart tracing should probably be overhauled at some point.
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 +++++- fs/bcachefs/trace.h | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 772595b3da9f..153d13b9c96e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1555,8 +1555,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, /* Hack, because gc and splitting nodes doesn't mix yet: */ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && !down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) + if (flags & BTREE_INSERT_NOUNLOCK) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); return -EINTR; + } bch2_trans_unlock(trans); down_read(&c->gc_lock); @@ -1584,6 +1586,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, BUG_ON(flags & BTREE_INSERT_NOUNLOCK); bch2_trans_unlock(trans); ret = -EINTR; + + trace_transaction_restart_ip(trans->ip, _THIS_IP_); } goto out; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index a9fcb5442186..d109ef174fd0 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -499,6 +499,23 @@ TRACE_EVENT(copygc, __entry->buckets_moved, __entry->buckets_not_moved) ); +TRACE_EVENT(transaction_restart_ip, + TP_PROTO(unsigned long caller, unsigned long ip), + TP_ARGS(caller, ip), + + TP_STRUCT__entry( + __field(unsigned long, caller ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->ip = ip; + ), + + TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) +); + DECLARE_EVENT_CLASS(transaction_restart, TP_PROTO(unsigned long ip), TP_ARGS(ip), -- cgit From 41697f382cffc0e396dd832d19c78e69cdd10aa3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2020 14:08:18 -0400 Subject: bcachefs: Fix for the bkey compat path In the write path, we were calling bch2_bkey_ops.compat() in the wrong place. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 95 +++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c97e1e9002cb..55ef4032b37c 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -283,49 +283,64 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, const struct bkey_ops *ops; struct bkey uk; struct bkey_s u; - - if (big_endian != CPU_BIG_ENDIAN) - bch2_bkey_swab_key(f, k); - - if (version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) { + int i; + + /* + * Do these operations in reverse order in the write path: + */ + + for (i = 0; i < 4; i++) + switch (!write ? 
i : 3 - i) { + case 0: + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); + break; + case 1: + if (version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); + break; + case 2: + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { + struct bkey_format tmp = *f, *in = f, *out = &tmp; + + swap(tmp.bits_per_field[BKEY_FIELD_INODE], + tmp.bits_per_field[BKEY_FIELD_OFFSET]); + swap(tmp.field_offset[BKEY_FIELD_INODE], + tmp.field_offset[BKEY_FIELD_OFFSET]); + + if (!write) + swap(in, out); + + uk = __bch2_bkey_unpack_key(in, k); + swap(uk.p.inode, uk.p.offset); + BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + } + } + break; + case 3: if (!bkey_packed(k)) { - struct bkey_i *u = packed_to_bkey(k); - swap(u->k.p.inode, u->k.p.offset); - } else if (f->bits_per_field[BKEY_FIELD_INODE] && - f->bits_per_field[BKEY_FIELD_OFFSET]) { - struct bkey_format tmp = *f, *in = f, *out = &tmp; - - swap(tmp.bits_per_field[BKEY_FIELD_INODE], - tmp.bits_per_field[BKEY_FIELD_OFFSET]); - swap(tmp.field_offset[BKEY_FIELD_INODE], - tmp.field_offset[BKEY_FIELD_OFFSET]); - - if (!write) - swap(in, out); - - uk = __bch2_bkey_unpack_key(in, k); - swap(uk.p.inode, uk.p.offset); - BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + u = bkey_i_to_s(packed_to_bkey(k)); + } else { + uk = __bch2_bkey_unpack_key(f, k); + u.k = &uk; + u.v = bkeyp_val(f, k); } - } - if (!bkey_packed(k)) { - u = bkey_i_to_s(packed_to_bkey(k)); - } else { - uk = __bch2_bkey_unpack_key(f, k); - u.k = &uk; - u.v = bkeyp_val(f, k); - } + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); - if (big_endian != CPU_BIG_ENDIAN) - bch2_bkey_swab_val(u); + ops = &bch2_bkey_ops[k->type]; - ops = &bch2_bkey_ops[k->type]; - - if (ops->compat) - ops->compat(btree_id, version, big_endian, write, u); + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); + break; + default: + BUG(); + } } -- cgit From 2c480a7102f20c22315b45fcc79d63078e51b13d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2020 14:08:56 -0400 Subject: bcachefs: Handle -EINTR bch2_migrate_index_update() peek_slot() shouldn't return -EINTR when there's only a single live iterator, but that's tricky to guarantee - we seem to be returning -EINTR when we shouldn't, but it's easy enough to handle in the caller. 
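A minimal sketch of the retry structure used here (stand-in names): each pass starts from a reset transaction, a lookup failing with -EINTR is simply retried, and any other error is returned to the caller.

  #include <errno.h>

  struct lookup { long err; /* plus the key that was found */ };

  static void          trans_reset(void) { /* begin a fresh attempt */ }
  static struct lookup lookup_next(void) { struct lookup l = { 0 }; return l; }

  static int index_update_one(void)
  {
      while (1) {
          struct lookup l;

          trans_reset();

          l = lookup_next();
          if (l.err == -EINTR)
              continue;                /* transient restart: just retry */
          if (l.err)
              return (int) l.err;      /* real error: give up */

          /* ... apply the update for this key ... */
          return 0;
      }
  }
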
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a9a72963e1b7..882e86e70db7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -69,19 +69,26 @@ static int bch2_migrate_index_update(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k; struct bkey_i *insert; - struct bkey_i_extent *new = - bkey_i_to_extent(bch2_keylist_front(keys)); + struct bkey_i_extent *new; BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; struct extent_ptr_decoded p; bool did_work = false; int nr; + bch2_trans_reset(&trans, 0); + + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) + if (ret) { + if (ret == -EINTR) + continue; break; + } + + new = bkey_i_to_extent(bch2_keylist_front(keys)); if (bversion_cmp(k.k->version, new->k.version) || !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) -- cgit From bbe65614b7bffaeff5213cb782743e7de3be48d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2020 18:25:11 -0400 Subject: bcachefs: Fix a deadlock btree_node_lock_increment() was incorrectly skipping over the current iter when checking if we should increment a node we already have locked. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index a164924ca656..3b199e2e1e9e 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -164,8 +164,7 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, struct btree_iter *linked; trans_for_each_iter(iter->trans, linked) - if (linked != iter && - linked->l[level].b == b && + if (linked->l[level].b == b && btree_node_locked_type(linked, level) >= want) { six_lock_increment(&b->c.lock, want); return true; -- cgit From 6f2b9074d7511e23fec4e6a3e3f47184480b0a58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2020 17:57:59 -0400 Subject: bcachefs: More fixes for counting extent update iterators This is unfortunately really fragile - hopefully we'll be able to think of a new approach at some point. 
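A rough model of the bookkeeping, using the per-overlap counts added by this patch (hypothetical enum, not the real API): each existing extent the insert overlaps costs one or two additional iterators, on top of one for the insert itself and extras for any reflink indirections, and the insert is clipped once the budget would be exceeded.

  enum overlap { OVERLAP_ALL, OVERLAP_FRONT, OVERLAP_BACK, OVERLAP_MIDDLE };

  static unsigned iters_for_overwrite(enum overlap o)
  {
      switch (o) {
      case OVERLAP_ALL:
      case OVERLAP_FRONT:
          return 1;        /* one extra update for the overwritten extent */
      case OVERLAP_BACK:
      case OVERLAP_MIDDLE:
          return 2;        /* these cases can generate two */
      }
      return 0;
  }
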
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 2a7d913bdda3..d0af1bc17018 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -34,16 +34,10 @@ static int count_iters_for_insert(struct btree_trans *trans, unsigned offset, struct bpos *end, unsigned *nr_iters, - unsigned max_iters, - bool overwrite) + unsigned max_iters) { - int ret = 0; + int ret = 0, ret2 = 0; - /* - * The extent update path requires an _additional_ iterator for each - * extent we're inserting and overwriting: - */ - *nr_iters += 1; if (*nr_iters >= max_iters) { *end = bpos_min(*end, k.k->p); ret = 1; @@ -70,11 +64,14 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key(trans, iter, BTREE_ID_REFLINK, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret) { + BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), POS(0, idx + sectors)) >= 0) break; + /* extent_update_to_keys(), for the reflink_v update */ + *nr_iters += 1; + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); if (*nr_iters >= max_iters) { @@ -92,7 +89,7 @@ static int count_iters_for_insert(struct btree_trans *trans, } } - return ret; + return ret2 ?: ret; } #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) @@ -121,8 +118,11 @@ int bch2_extent_atomic_end(struct btree_iter *iter, *end = bpos_min(insert->k.p, b->key.k.p); + /* extent_update_to_keys(): */ + nr_iters += 1; + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2, false); + &nr_iters, EXTENT_ITERS_MAX / 2); if (ret < 0) return ret; @@ -139,8 +139,20 @@ int bch2_extent_atomic_end(struct btree_iter *iter, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); + /* extent_handle_overwrites(): */ + switch (bch2_extent_overlap(&insert->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + case BCH_EXTENT_OVERLAP_FRONT: + nr_iters += 1; + break; + case BCH_EXTENT_OVERLAP_BACK: + case BCH_EXTENT_OVERLAP_MIDDLE: + nr_iters += 2; + break; + } + ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX, true); + &nr_iters, EXTENT_ITERS_MAX); if (ret) break; -- cgit From f59b346477a4ed536583be2df077c5aa684a1df8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Apr 2020 15:28:25 -0400 Subject: bcachefs: Don't issue writes that are more than 1 MB the bcachefs io path in io.c can't bounce writes larger than that. 
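A standalone sketch of the trick used in the dio loop (hypothetical names, with an assumed 1 MB cap): the tail of the iterator is temporarily hidden so the page-mapping call cannot consume more than one write's worth, then restored for the next iteration.

  #include <stddef.h>

  struct iter { size_t count; };            /* bytes remaining in the request */

  /* Callee consumes as much of the iterator as it can in one go: */
  static void get_pages(struct iter *it)
  {
      /* ... maps it->count bytes into the bio and advances the iterator ... */
      it->count = 0;
  }

  #define MAX_PER_WRITE ((size_t) 1 << 20)  /* assumed per-write cap (1 MB) */

  static void issue_one_write(struct iter *it)
  {
      size_t extra = it->count > MAX_PER_WRITE ? it->count - MAX_PER_WRITE : 0;

      it->count -= extra;                   /* hide the tail from the callee */
      get_pages(it);
      it->count += extra;                   /* give it back for the next write */
  }
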
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e713c83530cc..9644d4624f80 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1177,7 +1177,8 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || - w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || + w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= + (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1794,12 +1795,22 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto loop; while (1) { + size_t extra = dio->iter.count - + min(BIO_MAX_VECS * PAGE_SIZE, dio->iter.count); + if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; + /* + * Don't issue more than 2MB at once, the bcachefs io path in + * io.c can't bounce more than that: + */ + + dio->iter.count -= extra; ret = bio_iov_iter_get_pages(bio, &dio->iter); + dio->iter.count += extra; current->faults_disabled_mapping = NULL; if (kthread) -- cgit From 1e1a31c4b0017dd241438bfbba3186266cc0ed11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Apr 2020 12:57:04 -0400 Subject: bcachefs: Add some printks for error paths Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 10 ++++++++-- fs/bcachefs/replicas.c | 17 ++++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e4c1b90f3cb5..59af44f7eab6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -701,8 +701,10 @@ static int bch2_gc_start(struct bch_fs *c, c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), sizeof(u64), GFP_KERNEL); - if (!c->usage_gc) + if (!c->usage_gc) { + bch_err(c, "error allocating c->usage_gc"); return -ENOMEM; + } for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); @@ -713,19 +715,23 @@ static int bch2_gc_start(struct bch_fs *c, GFP_KERNEL|__GFP_ZERO); if (!ca->buckets[1]) { percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); return -ENOMEM; } ca->usage[1] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[1]) { + bch_err(c, "error allocating ca->usage[gc]"); percpu_ref_put(&ca->ref); return -ENOMEM; } } ret = bch2_ec_mem_alloc(c, true); - if (ret) + if (ret) { + bch_err(c, "error allocating ec gc mem"); return ret; + } percpu_down_write(&c->mark_lock); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index f4851c8b8f88..3e7c389f06ce 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -304,8 +304,10 @@ static int replicas_table_update(struct bch_fs *c, if (!(new_base = kzalloc(bytes, GFP_NOIO)) || !(new_scratch = kmalloc(scratch_bytes, GFP_NOIO)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { + bch_err(c, "error updating replicas table: memory allocation failure"); goto err; + } for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (c->usage[i]) @@ -365,7 +367,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) { struct bch_replicas_cpu new_r, new_gc; - int ret = -ENOMEM; + int ret = 0; verify_replicas_entry(new_entry); @@ -412,14 +414,16 @@ 
static int bch2_mark_replicas_slowpath(struct bch_fs *c, swap(new_gc, c->replicas_gc); percpu_up_write(&c->mark_lock); out: - ret = 0; -err: mutex_unlock(&c->sb_lock); kfree(new_r.entries); kfree(new_gc.entries); return ret; +err: + bch_err(c, "error adding replicas entry: memory allocation failure"); + ret = -ENOMEM; + goto out; } int bch2_mark_replicas(struct bch_fs *c, @@ -564,6 +568,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) GFP_NOIO); if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); + bch_err(c, "error allocating c->replicas_gc"); return -ENOMEM; } @@ -589,8 +594,10 @@ retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); - if (!new.entries) + if (!new.entries) { + bch_err(c, "error allocating c->replicas_gc"); return -ENOMEM; + } mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); -- cgit From 5b6d505a77f87a4e7d13430920499c6e58928f27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2020 19:56:31 -0400 Subject: bcachefs: Fix another deadlock in btree_update_nodes_written() We also can't be blocking on btree node write locks while holding btree_interior_update_lock. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 ++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 153d13b9c96e..1f0d95558858 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -609,6 +609,19 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } +static inline bool six_trylock_intentwrite(struct six_lock *lock) +{ + if (!six_trylock_intent(lock)) + return false; + + if (!six_trylock_write(lock)) { + six_unlock_intent(lock); + return false; + } + + return true; +} + static void btree_update_nodes_written(struct closure *cl) { struct btree_update *as = container_of(cl, struct btree_update, cl); @@ -637,10 +650,15 @@ again: } b = as->b; - if (b && !six_trylock_intent(&b->c.lock)) { + if (b && !six_trylock_intentwrite(&b->c.lock)) { mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_intent); + six_lock_write(&b->c.lock, NULL, NULL); + + six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); + mutex_lock(&c->btree_interior_update_lock); goto again; } @@ -648,7 +666,25 @@ again: list_del(&as->unwritten_list); ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, + JOURNAL_RES_GET_NONBLOCK| JOURNAL_RES_GET_RESERVED); + if (ret == -EAGAIN) { + unsigned u64s = as->journal_u64s; + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + mutex_unlock(&c->btree_interior_update_lock); + + ret = bch2_journal_res_get(&c->journal, &res, u64s, + JOURNAL_RES_GET_CHECK| + JOURNAL_RES_GET_RESERVED); + if (!ret) { + mutex_lock(&c->btree_interior_update_lock); + goto again; + } + } + if (ret) { BUG_ON(!bch2_journal_error(&c->journal)); /* can't unblock btree writes */ @@ -671,7 +707,6 @@ again: /* @b is the node we did the final insert into: */ BUG_ON(!res.ref); - six_lock_write(&b->c.lock, NULL, NULL); list_del(&as->write_blocked_list); i = btree_bset_last(b); @@ -680,7 +715,6 @@ again: le64_to_cpu(i->journal_seq))); bch2_btree_add_journal_pin(c, b, res.seq); - six_unlock_write(&b->c.lock); break; case BTREE_INTERIOR_UPDATING_AS: @@ 
-709,6 +743,7 @@ again: free_update: /* Do btree write after dropping journal res: */ if (b) { + six_unlock_write(&b->c.lock); /* * b->write_blocked prevented it from being written, so * write it now if it needs to be written: -- cgit From bc970cecd86d5a77f58e3be017fd80276e71677e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 May 2020 16:21:35 -0400 Subject: bcachefs: Fix two more deadlocks Deadlock on shutdown: btree_update_nodes_written() unblocks btree nodes from being written; after doing so, it has to check if they were marked as needing to be written and if so kick off those writes - if that doesn't happen, we'll never release journal pins and shutdown will get stuck when flushing the journal. There was an error path where this didn't happen, because in the error path we don't actually want those btree nodes write to happen; however, we still have to kick off the write path so the journal pins get released. The btree write path checks if we're in a journal error state and doesn't do the actual write if we are. Also - there was another deadlock because btree_update_nodes_written() was taking the btree update off of the unwritten_list too soon - before getting a journal reservation, which could fail and have to be retried. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 ++ fs/bcachefs/btree_update_interior.c | 112 +++++++++++++++++++----------------- 2 files changed, 64 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 04537eb06e4a..32bd193a85c5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1626,6 +1626,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * reflect that those writes were done and the data flushed from the * journal: * + * Also on journal error, the pending write may have updates that were + * never journalled (interior nodes, see btree_update_nodes_written()) - + * it's critical that we don't do the write in that case otherwise we + * will have updates visible that weren't in the journal: + * * Make sure to update b->written so bch2_btree_init_next() doesn't * break: */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1f0d95558858..28568db5834a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -586,12 +586,12 @@ static void __bch2_btree_update_free(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); - BUG_ON((as->nr_new_nodes || as->nr_pending) && - !bch2_journal_error(&c->journal));; + BUG_ON(as->nr_new_nodes || as->nr_pending); if (as->reserve) bch2_btree_reserve_put(c, as->reserve); + list_del(&as->unwritten_list); list_del(&as->list); closure_debug_destroy(&as->cl); @@ -625,12 +625,12 @@ static inline bool six_trylock_intentwrite(struct six_lock *lock) static void btree_update_nodes_written(struct closure *cl) { struct btree_update *as = container_of(cl, struct btree_update, cl); - struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; - unsigned nr_new_nodes; + struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1]; + unsigned nr_nodes_need_write; struct journal_res res = { 0 }; struct bch_fs *c = as->c; + struct btree_root *r; struct btree *b; - struct bset *i; int ret; /* @@ -641,7 +641,7 @@ static void btree_update_nodes_written(struct closure *cl) mutex_lock(&c->btree_interior_update_lock); as->nodes_written = 
true; again: - nr_new_nodes = 0; + nr_nodes_need_write = 0; as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, struct btree_update, unwritten_list); if (!as || !as->nodes_written) { @@ -663,16 +663,16 @@ again: goto again; } - list_del(&as->unwritten_list); - ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, JOURNAL_RES_GET_NONBLOCK| JOURNAL_RES_GET_RESERVED); if (ret == -EAGAIN) { unsigned u64s = as->journal_u64s; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + if (b) { + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } mutex_unlock(&c->btree_interior_update_lock); @@ -685,19 +685,22 @@ again: } } - if (ret) { - BUG_ON(!bch2_journal_error(&c->journal)); - /* can't unblock btree writes */ - goto free_update; - } - - { + if (!ret) { struct journal_buf *buf = &c->journal.buf[res.idx]; struct jset_entry *entry = vstruct_idx(buf->data, res.offset); res.offset += as->journal_u64s; res.u64s -= as->journal_u64s; memcpy_u64s(entry, as->journal_entries, as->journal_u64s); + } else { + /* + * On journal error we have to run most of the normal path so + * that shutdown works - unblocking btree node writes in + * particular and writing them if needed - except for + * journalling the update: + */ + + BUG_ON(!bch2_journal_error(&c->journal)); } switch (as->mode) { @@ -705,24 +708,41 @@ again: BUG(); case BTREE_INTERIOR_UPDATING_NODE: /* @b is the node we did the final insert into: */ - BUG_ON(!res.ref); + + /* + * On failure to get a journal reservation, we still have to + * unblock the write and allow most of the write path to happen + * so that shutdown works, but the i->journal_seq mechanism + * won't work to prevent the btree write from being visible (we + * didn't get a journal sequence number) - instead + * __bch2_btree_node_write() doesn't do the actual write if + * we're in journal error state: + */ list_del(&as->write_blocked_list); - i = btree_bset_last(b); - i->journal_seq = cpu_to_le64( - max(res.seq, - le64_to_cpu(i->journal_seq))); + if (!ret) { + struct bset *i = btree_bset_last(b); + + i->journal_seq = cpu_to_le64( + max(res.seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, res.seq); + } + + nodes_need_write[nr_nodes_need_write++] = b; - bch2_btree_add_journal_pin(c, b, res.seq); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); break; case BTREE_INTERIOR_UPDATING_AS: BUG_ON(b); break; - case BTREE_INTERIOR_UPDATING_ROOT: { - struct btree_root *r = &c->btree_roots[as->btree_id]; + case BTREE_INTERIOR_UPDATING_ROOT: + r = &c->btree_roots[as->btree_id]; BUG_ON(b); @@ -734,42 +754,25 @@ again: mutex_unlock(&c->btree_root_lock); break; } - } bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_res_put(&c->journal, &res); bch2_journal_preres_put(&c->journal, &as->journal_preres); -free_update: - /* Do btree write after dropping journal res: */ - if (b) { - six_unlock_write(&b->c.lock); - /* - * b->write_blocked prevented it from being written, so - * write it now if it needs to be written: - */ - btree_node_write_if_need(c, b, SIX_LOCK_intent); - six_unlock_intent(&b->c.lock); - } - if (!ret) { - nr_new_nodes = as->nr_new_nodes; - memcpy(new_nodes, - as->new_nodes, - as->nr_new_nodes * sizeof(struct btree *)); + while (as->nr_new_nodes) { + b = as->new_nodes[--as->nr_new_nodes]; - while (as->nr_new_nodes) { - struct btree *b = as->new_nodes[--as->nr_new_nodes]; + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; - BUG_ON(b->will_make_reachable 
!= (unsigned long) as); - b->will_make_reachable = 0; - } - - while (as->nr_pending) - bch2_btree_node_free_ondisk(c, - &as->pending[--as->nr_pending], res.seq); + nodes_need_write[nr_nodes_need_write++] = b; } + while (as->nr_pending) + bch2_btree_node_free_ondisk(c, + &as->pending[--as->nr_pending], res.seq); + __bch2_btree_update_free(as); /* * for flush_held_btree_writes() waiting on updates to flush or @@ -782,8 +785,10 @@ free_update: * */ mutex_unlock(&c->btree_interior_update_lock); - while (nr_new_nodes) { - struct btree *b = new_nodes[--nr_new_nodes]; + /* Do btree writes after dropping journal res/locks: */ + while (nr_nodes_need_write) { + b = nodes_need_write[--nr_nodes_need_write]; + btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); six_unlock_read(&b->c.lock); @@ -1036,6 +1041,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, as->btree_id = id; as->reserve = reserve; INIT_LIST_HEAD(&as->write_blocked_list); + INIT_LIST_HEAD(&as->unwritten_list); as->journal_preres = journal_preres; bch2_keylist_init(&as->parent_keys, as->inline_keys); -- cgit From c4dd7871ef71a7327b6b3834e2cdf6777e03eb0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 May 2020 15:37:04 -0400 Subject: bcachefs: Some compression improvements In __bio_map_or_bounce(), the check for if the bio is physically contiguous is improved; it's now more readable and handles multi page but contiguous bios. Also when decompressing, we were doing a redundant memcpy in the case where we were able to use vmap to map a bio contigiously. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 53 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 89eb03a988f1..41d0b49d354f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -39,6 +39,24 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) BUG(); } +static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) +{ + struct bio_vec bv; + struct bvec_iter iter; + void *expected_start = NULL; + + __bio_for_each_bvec(bv, bio, iter, start) { + if (expected_start && + expected_start != page_address(bv.bv_page) + bv.bv_offset) + return false; + + expected_start = page_address(bv.bv_page) + + bv.bv_offset + bv.bv_len; + } + + return true; +} + static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bvec_iter start, int rw) { @@ -48,27 +66,28 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, unsigned nr_pages = 0; struct page *stack_pages[16]; struct page **pages = NULL; - bool first = true; - unsigned prev_end = PAGE_SIZE; void *data; BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); -#ifndef CONFIG_HIGHMEM - __bio_for_each_bvec(bv, bio, iter, start) { - if (bv.bv_len == start.bi_size) - return (struct bbuf) { - .b = page_address(bv.bv_page) + bv.bv_offset, - .type = BB_NONE, .rw = rw - }; - } -#endif + if (!IS_ENABLED(CONFIG_HIGHMEM) && + bio_phys_contig(bio, start)) + return (struct bbuf) { + .b = page_address(bio_iter_page(bio, start)) + + bio_iter_offset(bio, start), + .type = BB_NONE, .rw = rw + }; + + /* check if we can map the pages contiguously: */ __bio_for_each_segment(bv, bio, iter, start) { - if ((!first && bv.bv_offset) || - prev_end != PAGE_SIZE) + if (iter.bi_size != start.bi_size && + bv.bv_offset) + goto 
bounce; + + if (bv.bv_len < iter.bi_size && + bv.bv_offset + bv.bv_len < PAGE_SIZE) goto bounce; - prev_end = bv.bv_offset + bv.bv_len; nr_pages++; } @@ -264,7 +283,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (ret) goto err; - if (dst_data.type != BB_NONE) + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); err: bio_unmap_or_unbounce(c, dst_data); @@ -407,7 +427,8 @@ static unsigned __bio_compress(struct bch_fs *c, memset(dst_data.b + *dst_len, 0, pad); *dst_len += pad; - if (dst_data.type != BB_NONE) + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) memcpy_to_bio(dst, dst->bi_iter, dst_data.b); BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); -- cgit From d9b59a57cc81b1a73ff094401e9d65326cf0156b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 May 2020 23:15:42 -0400 Subject: bcachefs: Fix initialization of bounce mempools When they were converted to kvpmalloc pools they weren't converted to pass the actual size of the allocation. Oops. Also, validate the real length in the zstd decompression path. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 41d0b49d354f..20bde73a17a8 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -191,20 +191,21 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; - size_t len; + size_t real_src_len = le32_to_cpup(src_data.b); + + if (real_src_len > src_len - 4) + goto err; workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - src_len = le32_to_cpup(src_data.b); - - len = zstd_decompress_dctx(ctx, + ret = zstd_decompress_dctx(ctx, dst_data, dst_len, - src_data.b + 4, src_len); + src_data.b + 4, real_src_len); mempool_free(workspace, &c->decompress_workspace); - if (len != dst_len) + if (ret != dst_len) goto err; break; } @@ -533,7 +534,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t max_extent = c->sb.encoded_extent_max << 9; - size_t order = get_order(max_extent); size_t decompress_workspace_size = 0; bool decompress_workspace_needed; ZSTD_parameters params = zstd_get_params(0, max_extent); @@ -568,14 +568,14 @@ have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, order); + 1, max_extent); if (ret) goto out; } if (!mempool_initialized(&c->compression_bounce[WRITE])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, order); + 1, max_extent); if (ret) goto out; } -- cgit From a9310ab06c0b4a0c199e569dd34579b74b9142d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 May 2020 20:01:07 -0400 Subject: bcachefs: Fixes for startup on very full filesystems - Always pass BTREE_INSERT_USE_RESERVE when writing alloc btree keys - Don't strand buckest on the copygc freelist until after recovery is done and we're starting copygc. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 ++++++++++++++-- fs/bcachefs/super.c | 3 ++- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9ce53164d9ac..559b9be50952 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -315,7 +315,9 @@ retry: bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + flags); err: if (ret == -EINTR) goto retry; @@ -1033,7 +1035,16 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t set_current_state(TASK_INTERRUPTIBLE); spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) + for (i = 0; i < RESERVE_NR; i++) { + + /* + * Don't strand buckets on the copygc freelist until + * after recovery is finished: + */ + if (!test_bit(BCH_FS_STARTED, &c->flags) && + i == RESERVE_MOVINGGC) + continue; + if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); @@ -1043,6 +1054,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t spin_unlock(&c->freelist_lock); goto out; } + } if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ca->allocator_state = ALLOCATOR_BLOCKED_FULL; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bbb0780bc4ca..84fb2f51e48a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -860,6 +860,8 @@ int bch2_fs_start(struct bch_fs *c) if (bch2_fs_init_fault("fs_start")) goto err; + set_bit(BCH_FS_STARTED, &c->flags); + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -871,7 +873,6 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - set_bit(BCH_FS_STARTED, &c->flags); print_mount_opts(c); ret = 0; out: -- cgit From f36dff2885ee70990e529389a08988d5c218eed0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 May 2020 18:34:16 -0400 Subject: bcachefs: Validate that we read the correct btree node Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 32bd193a85c5..a5888de327fc 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -736,6 +736,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct btree_node *bn = container_of(i, struct btree_node, keys); /* These indicate that we read the wrong btree node: */ + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, + BTREE_ERR_MUST_RETRY, c, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect btree id"); -- cgit From 4167b4cdba30fb8db190a3439324f413dc08a0c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 May 2020 00:15:28 -0400 Subject: bcachefs: Fix a workqueue deadlock writes running out of a workqueue (via dio path) could block and prevent other writes from calling bch2_write_index() and completing. 
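The rule behind this fix, sketched standalone below: code that may be running on a bounded workqueue must not block waiting for progress that only other work items on the same queue can make; it has to reschedule itself and return instead. in_worker_context, requeue_continuation() and block_until_done() are illustrative stand-ins for PF_WQ_WORKER, continue_at() and closure_sync() — this is only the control-flow shape, not the kernel code.

/*
 * Sketch: block synchronously for backpressure only when we are not a
 * worker thread; a worker defers itself so other work items can complete.
 */
#include <stdbool.h>
#include <stdio.h>

static bool in_worker_context;          /* stand-in for current->flags & PF_WQ_WORKER */

static void requeue_continuation(void)  /* stand-in for continue_at(cl, ..., wq) */
{
	puts("deferred: will retry from the index update workqueue");
}

static void block_until_done(void)      /* stand-in for closure_sync(cl) */
{
	puts("blocked synchronously (safe only outside a worker)");
}

static void flush_io(void)
{
	/*
	 * Blocking is how backpressure is normally signalled, but a worker
	 * thread that blocks can stall every write whose completion work is
	 * queued behind it - so defer instead of blocking.
	 */
	if (in_worker_context) {
		requeue_continuation();
		return;
	}
	block_until_done();
}

int main(void)
{
	in_worker_context = false; flush_io();
	in_worker_context = true;  flush_io();
	return 0;
}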
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 29 +++++++++++++++++++++++++++-- fs/bcachefs/io.h | 1 + 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 3dcb166afa23..7df2b6c3f168 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -604,7 +604,9 @@ static void bch2_write_index(struct closure *cl) __bch2_write_index(op); - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + if (!(op->flags & BCH_WRITE_DONE)) { + continue_at(cl, __bch2_write, index_update_wq(op)); + } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { bch2_journal_flush_seq_async(&c->journal, *op_journal_seq(op), cl); @@ -1104,8 +1106,15 @@ again: if (ret < 0) goto err; - if (ret) + if (ret) { skip_put = false; + } else { + /* + * for the skip_put optimization this has to be set + * before we submit the bio: + */ + op->flags |= BCH_WRITE_DONE; + } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; @@ -1128,16 +1137,30 @@ again: return; err: op->error = ret; + op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); return; flush_io: + /* + * If the write can't all be submitted at once, we generally want to + * block synchronously as that signals backpressure to the caller. + * + * However, if we're running out of a workqueue, we can't block here + * because we'll be blocking other work items from completing: + */ + if (current->flags & PF_WQ_WORKER) { + continue_at(cl, bch2_write_index, index_update_wq(op)); + return; + } + closure_sync(cl); if (!bch2_keylist_empty(&op->insert_keys)) { __bch2_write_index(op); if (op->error) { + op->flags |= BCH_WRITE_DONE; continue_at_nobarrier(cl, bch2_write_done, NULL); return; } @@ -1183,6 +1206,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) bch2_keylist_push(&op->insert_keys); op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + continue_at_nobarrier(cl, bch2_write_index, NULL); return; err: diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index c250bceb77ea..0a049cc14e42 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -40,6 +40,7 @@ enum bch_write_flags { /* Internal: */ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), + BCH_WRITE_DONE = (1 << 12), }; static inline u64 *op_journal_seq(struct bch_write_op *op) -- cgit From d3baf289dce56f78700ca9dc64153263652c9a0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 May 2020 17:53:33 -0400 Subject: bcachefs: Fix setquota We were returning -EINTR because we were failing to retry the btree transaction. 
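A minimal sketch of the retry discipline this restores, using invented names: -EINTR from a transaction commit means the transaction raced and must be restarted from the top, so the quota update gets wrapped in a retry loop (bch2_trans_do in the actual patch) rather than leaking -EINTR to the caller. The failing-twice behaviour below is made up purely to exercise the loop.

/*
 * Sketch: restart the whole update on -EINTR instead of returning it.
 */
#include <errno.h>
#include <stdio.h>

static int attempts;

static int quota_update_once(void)
{
	/* Pretend the first two attempts race with another update. */
	return ++attempts < 3 ? -EINTR : 0;
}

static int quota_update(void)
{
	int ret;

	do {
		ret = quota_update_once();
	} while (ret == -EINTR);        /* restart rather than fail with -EINTR */

	return ret;
}

int main(void)
{
	printf("quota_update() = %d after %d attempts\n", quota_update(), attempts);
	return 0;
}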
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 59 +++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index e7787c5063ce..d3032a46e7f3 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -710,25 +710,15 @@ found: return ret; } -static int bch2_set_quota(struct super_block *sb, struct kqid qid, - struct qc_dqblk *qdq) +static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_i_quota *new_quota, + struct qc_dqblk *qdq) { - struct bch_fs *c = sb->s_fs_info; - struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_quota new_quota; int ret; - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - bkey_quota_init(&new_quota.k_i); - new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, + iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); @@ -736,32 +726,43 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (unlikely(ret)) return ret; - switch (k.k->type) { - case KEY_TYPE_quota: - new_quota.v = *bkey_s_c_to_quota(k).v; - break; - } + if (k.k->type == KEY_TYPE_quota) + new_quota->v = *bkey_s_c_to_quota(k).v; if (qdq->d_fieldmask & QC_SPC_SOFT) - new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); + new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); if (qdq->d_fieldmask & QC_SPC_HARD) - new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); + new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); if (qdq->d_fieldmask & QC_INO_SOFT) - new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); + new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); if (qdq->d_fieldmask & QC_INO_HARD) - new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + + return bch2_trans_update(trans, iter, &new_quota->k_i, 0); +} - bch2_trans_update(&trans, iter, &new_quota.k_i, 0); +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct btree_trans trans; + struct bkey_i_quota new_quota; + int ret; - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + if (sb->s_flags & SB_RDONLY) + return -EROFS; - bch2_trans_exit(&trans); + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - if (ret) - return ret; + bch2_trans_init(&trans, c, 0, 0); - ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + + bch2_trans_exit(&trans); return ret; } -- cgit From 0d0e77f45fe806a80af76501d5422437692e158b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 May 2020 21:45:08 -0400 Subject: bcachefs: Fix another iterator counting bug We were marking the end of where we could insert incorrectly for indirect extents. 
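With made-up numbers, the arithmetic being corrected below: the insertion end may only advance by the size of the reflink pointer key itself, even when the indirect extent it references extends much further past idx. The values in this sketch are illustrative, not taken from a real trace.

/*
 * Worked example: clamping the advance by the key's own size.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t k_size       = 8;    /* size of the reflink_p key being inserted */
	uint64_t idx          = 100;  /* start of the range it references */
	uint64_t r_k_end      = 200;  /* end of the indirect extent found at idx */
	uint64_t start_offset = 1000; /* bkey_start_pos(k.k).offset */

	printf("old end: %llu (overshoots the key)\n",
	       (unsigned long long)(start_offset + (r_k_end - idx)));
	printf("new end: %llu\n",
	       (unsigned long long)(start_offset + min_u64(k_size, r_k_end - idx)));
	return 0;
}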
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index d0af1bc17018..fd011df3cb99 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -76,7 +76,8 @@ static int count_iters_for_insert(struct btree_trans *trans, if (*nr_iters >= max_iters) { struct bpos pos = bkey_start_pos(k.k); - pos.offset += r_k.k->p.offset - idx; + pos.offset += min_t(u64, k.k->size, + r_k.k->p.offset - idx); *end = bpos_min(*end, pos); ret = 1; -- cgit From a1b0da4555c16bf274f705dc76c9b297f54bf6c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 May 2020 17:23:40 -0400 Subject: bcachefs: Wrap vmap() in memalloc_nofs_save()/restore() vmalloc() and vmap() don't take GFP_NOFS - this should be pushed further up the IO path, but for now just doing the simple fix. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 20bde73a17a8..6115e0294e4d 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -7,6 +7,7 @@ #include "super-io.h" #include +#include #include #include @@ -63,7 +64,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bbuf ret; struct bio_vec bv; struct bvec_iter iter; - unsigned nr_pages = 0; + unsigned nr_pages = 0, flags; struct page *stack_pages[16]; struct page **pages = NULL; void *data; @@ -103,7 +104,10 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, __bio_for_each_segment(bv, bio, iter, start) pages[nr_pages++] = bv.bv_page; + flags = memalloc_nofs_save(); data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + memalloc_nofs_restore(flags); + if (pages != stack_pages) kfree(pages); -- cgit From 22d8a33d30d7a28d0dd972f36cc48b80f585d67b Mon Sep 17 00:00:00 2001 From: Yuxuan Shui Date: Fri, 22 May 2020 15:50:05 +0100 Subject: bcachefs: fix stack corruption When a bkey_on_stack is passed to bch_read_indirect_extent, there is no guarantee that it will be big enough to hold the bkey. And bch_read_indirect_extent is not aware of bkey_on_stack to call realloc on it. This cause a stack corruption. This commit makes bch_read_indirect_extent aware of bkey_on_stack so it can call realloc when appropriate. 
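A simplified model of why the interface change matters: a bkey_on_stack-style holder keeps small keys in an inline buffer and must reallocate for anything larger, so the callee has to receive the whole wrapper rather than a raw key pointer it cannot grow. The types and helpers below are toy stand-ins, not the kernel structures.

/*
 * Sketch of a small-buffer key holder: grow before copying, never copy
 * through a bare pointer of unknown capacity.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct key_on_stack {
	char  *k;           /* points at onstack[] or at a heap allocation */
	size_t capacity;
	char   onstack[16];
};

static void kos_init(struct key_on_stack *s)
{
	s->k = s->onstack;
	s->capacity = sizeof(s->onstack);
}

static void kos_realloc(struct key_on_stack *s, size_t size)
{
	if (size <= s->capacity)
		return;
	s->k = (s->k == s->onstack) ? malloc(size) : realloc(s->k, size);
	s->capacity = size;
}

/* Correct interface: take the wrapper, so it can be grown first. */
static void reassemble(struct key_on_stack *s, const char *key, size_t size)
{
	kos_realloc(s, size);
	memcpy(s->k, key, size);   /* copying via a raw char * would smash the stack */
}

int main(void)
{
	struct key_on_stack s;
	char big[64];

	memset(big, 'x', sizeof(big));
	kos_init(&s);
	reassemble(&s, big, sizeof(big));
	printf("copied %zu bytes into a %zu byte buffer\n", sizeof(big), s.capacity);
	if (s.k != s.onstack)
		free(s.k);
	return 0;
}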
Tested-by: Yuxuan Shui Signed-off-by: Yuxuan Shui Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/io.c | 10 +++++----- fs/bcachefs/io.h | 7 ++++--- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9644d4624f80..7ce6d71aca29 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -788,7 +788,7 @@ retry: sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(trans, - &offset_into_extent, sk.k); + &offset_into_extent, &sk); if (ret) break; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4458a98b78ee..6aff3203b4e1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -918,7 +918,7 @@ retry: sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, cur.k); + &offset_into_extent, &cur); if (ret) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 7df2b6c3f168..39a23c6570eb 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1642,7 +1642,7 @@ retry: sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, sk.k); + &offset_into_extent, &sk); if (ret) break; @@ -1944,14 +1944,14 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_i *orig_k) + struct bkey_on_stack *orig_k) { struct btree_iter *iter; struct bkey_s_c k; u64 reflink_offset; int ret; - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, @@ -1974,7 +1974,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, } *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - bkey_reassemble(orig_k, k); + bkey_on_stack_reassemble(orig_k, trans->c, k); err: bch2_trans_iter_put(trans, iter); return ret; @@ -2281,7 +2281,7 @@ retry: k = bkey_i_to_s_c(sk.k); ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, sk.k); + &offset_into_extent, &sk); if (ret) goto err; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 0a049cc14e42..f0fe0bf906d3 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_H #include "checksum.h" +#include "bkey_on_stack.h" #include "io_types.h" #define to_wbio(_bio) \ @@ -114,13 +115,13 @@ struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_i *); + struct bkey_on_stack *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_i *k) + struct bkey_on_stack *k) { - return k->k.type == KEY_TYPE_reflink_p + return k->k->k.type == KEY_TYPE_reflink_p ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) : 0; } -- cgit From e5c15444d6aca00610c5ae9b571e307c65e744e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 May 2020 11:44:12 -0400 Subject: bcachefs: Print out d_type in dirent_to_text() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index ae5c9fd8d9f7..f34bfda8ab0d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -104,7 +104,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu", d.v->d_inum); + pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -- cgit From 2628cfe3d3037c20015bcc8bc59143f8b913db6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 May 2020 18:47:21 -0400 Subject: bcachefs: Add vmalloc fallback for decompress workspace Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6115e0294e4d..920460a182b4 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -607,7 +607,7 @@ have_compressed: } if (!mempool_initialized(&c->decompress_workspace)) { - ret = mempool_init_kmalloc_pool( + ret = mempool_init_kvpmalloc_pool( &c->decompress_workspace, 1, decompress_workspace_size); if (ret) -- cgit From dc744b51f97ce043d07ebfdd0397b323396c7683 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 May 2020 21:25:31 -0400 Subject: bcachefs: Handle printing of null bkeys This fixes a null ptr deref. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 55ef4032b37c..36e0c5152b47 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -176,13 +176,17 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { - pr_buf(out, "u64s %u type %s ", k->u64s, - bch2_bkey_types[k->type]); + if (k) { + pr_buf(out, "u64s %u type %s ", k->u64s, + bch2_bkey_types[k->type]); - bch2_bpos_to_text(out, k->p); + bch2_bpos_to_text(out, k->p); - pr_buf(out, " snap %u len %u ver %llu", - k->p.snapshot, k->size, k->version.lo); + pr_buf(out, " snap %u len %u ver %llu", + k->p.snapshot, k->size, k->version.lo); + } else { + pr_buf(out, "(null)"); + } } void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -198,8 +202,11 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { bch2_bkey_to_text(out, k.k); - pr_buf(out, ": "); - bch2_val_to_text(out, c, k); + + if (k.k) { + pr_buf(out, ": "); + bch2_val_to_text(out, c, k); + } } void bch2_bkey_swab_val(struct bkey_s k) -- cgit From 2340fd9d27c48072f4409ad194a8838acd789b8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 May 2020 13:37:44 -0400 Subject: bcachefs: Be more rigorous about marking the filesystem clean Previously, there was at least one error path where we could mark the filesystem clean when we hadn't sucessfully written out alloc info. 
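The shape of this fix, with invented names: a single flag is set only on the success path of the alloc-info write-out, and marking the filesystem clean is gated on that flag, so no error path can skip the write yet still record a clean shutdown. This sketch is only the control-flow idea, not the bcachefs read-only path.

/*
 * Sketch: mark clean only if the write-out actually completed.
 */
#include <stdbool.h>
#include <stdio.h>

static bool alloc_info_written;

static int write_alloc_info(bool fail)
{
	if (fail)
		return -1;              /* any error path leaves the flag clear */
	alloc_info_written = true;
	return 0;
}

static void shutdown(bool fail_write)
{
	alloc_info_written = false;
	write_alloc_info(fail_write);

	if (alloc_info_written)
		puts("marking filesystem clean");
	else
		puts("NOT marking clean: alloc info was not written");
}

int main(void)
{
	shutdown(false);
	shutdown(true);
	return 0;
}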
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/super.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5304b6762179..069a3c416bc5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -479,6 +479,7 @@ struct bch_dev { enum { /* startup: */ BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_STARTED, BCH_FS_ALLOCATOR_RUNNING, BCH_FS_INITIAL_GC_DONE, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 84fb2f51e48a..6b5ab579a25c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -192,8 +192,12 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); + /* + * If the allocator threads didn't all start up, the btree updates to + * write out alloc info aren't going to work: + */ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto allocator_not_running; + goto nowrote_alloc; do { wrote = false; @@ -205,7 +209,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); if (ret) - break; + goto nowrote_alloc; for_each_member_device(ca, c, i) bch2_dev_allocator_quiesce(c, ca); @@ -224,7 +228,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes = wrote ? 0 : clean_passes + 1; } while (clean_passes < 2); -allocator_not_running: + + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); +nowrote_alloc: for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); @@ -306,6 +312,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && + test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && !c->opts.norecovery) bch2_fs_mark_clean(c); @@ -394,6 +401,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); -- cgit From aafcf9bc12f479b47f4bc1f008f4002dd1af91b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 May 2020 14:20:00 -0400 Subject: bcachefs: Better error messages on bucket sector count overflows Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 +++- fs/bcachefs/buckets.c | 39 +++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 59af44f7eab6..1a97a74b36c8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -581,8 +581,10 @@ static int bch2_gc_done(struct bch_fs *c, #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (verify) \ - fsck_err(c, "dev %u bucket %zu has wrong " #_f \ + fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", i, b, \ + dst->b[b].mark.gen, \ + bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 22e30ed716c4..43095ae4731d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -778,29 +778,31 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, }) static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type type, + size_t b, enum bch_data_type 
data_type, unsigned sectors, bool gc) { struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; bool overflow; - BUG_ON(type != BCH_DATA_SB && - type != BCH_DATA_JOURNAL); + BUG_ON(data_type != BCH_DATA_SB && + data_type != BCH_DATA_JOURNAL); old = bucket_cmpxchg(g, new, ({ - new.data_type = type; + new.data_type = data_type; overflow = checked_add(new.dirty_sectors, sectors); })); bch2_fs_inconsistent_on(old.data_type && - old.data_type != type, c, + old.data_type != data_type, c, "different types of data in same bucket: %s, %s", bch2_data_types[old.data_type], - bch2_data_types[type]); + bch2_data_types[data_type]); bch2_fs_inconsistent_on(overflow, c, - "bucket sector count overflow: %u + %u > U16_MAX", + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", + ca->dev_idx, b, new.gen, + bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); if (c) @@ -926,6 +928,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); + u16 *dst_sectors, orig_sectors; bool overflow; u64 v; @@ -953,10 +956,12 @@ static bool bch2_mark_pointer(struct bch_fs *c, return true; } - if (!p.ptr.cached) - overflow = checked_add(new.dirty_sectors, sectors); - else - overflow = checked_add(new.cached_sectors, sectors); + dst_sectors = !p.ptr.cached + ? &new.dirty_sectors + : &new.cached_sectors; + orig_sectors = *dst_sectors; + + overflow = checked_add(*dst_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -987,10 +992,10 @@ static bool bch2_mark_pointer(struct bch_fs *c, bch2_data_types[data_type]); bch2_fs_inconsistent_on(overflow, c, - "bucket sector count overflow: %u + %lli > U16_MAX", - !p.ptr.cached - ? 
old.dirty_sectors - : old.cached_sectors, sectors); + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), new.gen, + bch2_data_types[old.data_type ?: data_type], + orig_sectors, sectors); bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); @@ -1504,7 +1509,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (checked_add(*dst_sectors, sectors)) { bch2_fs_inconsistent(c, - "bucket sector count overflow: %u + %lli > U16_MAX", + "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: data_type], orig_sectors, sectors); /* return an error indicating that we need full fsck */ ret = -EIO; -- cgit From 692c3f0601bd1b04b914a40907a36e4c36dc8edd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 May 2020 14:10:27 -0400 Subject: bcachefs: fix memalloc_nofs_restore() usage Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4ff57925fb2c..ef7ca552a0a3 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -554,7 +554,6 @@ out_unlock: list_del_init(&b->list); mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); out: b->flags = 0; b->written = 0; @@ -567,6 +566,7 @@ out: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); + memalloc_nofs_restore(flags); return b; err: /* Try to cannibalize another cached btree node: */ @@ -582,6 +582,7 @@ err: } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); return ERR_PTR(-ENOMEM); } -- cgit From b29303966b9e07dda5f21c667909eb87849453f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 May 2020 14:06:10 -0400 Subject: bcachefs: Fix reading of alloc info after unclean shutdown When updates to interior nodes started being journalled, that meant that after an unclean shutdown, until journal replay is done we can't walk the btree without overlaying the updates from the journal. The initial btree gc was changed to walk the btree overlaying keys from the journal - but bch2_alloc_read() and bch2_stripes_read() were missed. Major whoops... 
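A toy model of "walk the btree overlaying keys from the journal": two sorted key streams are merged and, where both contain a key at the same position, the journal's newer copy wins. Plain integers stand in for bkeys here; this is only the idea behind btree_and_journal_iter, not its implementation.

/*
 * Sketch: merge on-disk btree keys with not-yet-replayed journal keys,
 * letting the journal override the btree at equal positions.
 */
#include <stdio.h>

struct kv { int pos, val; };

int main(void)
{
	struct kv btree[]   = { {1, 10}, {2, 20}, {4, 40} };   /* on-disk btree */
	struct kv journal[] = { {2, 99}, {3, 30} };             /* not yet replayed */
	size_t nb = 3, nj = 2, b = 0, j = 0;

	while (b < nb || j < nj) {
		if (j >= nj || (b < nb && btree[b].pos < journal[j].pos)) {
			printf("pos %d -> %d (btree)\n", btree[b].pos, btree[b].val);
			b++;
		} else {
			printf("pos %d -> %d (journal)\n", journal[j].pos, journal[j].val);
			if (b < nb && btree[b].pos == journal[j].pos)
				b++;                    /* journal overrides the btree key */
			j++;
		}
	}
	return 0;
}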
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 30 ++++++++---------- fs/bcachefs/ec.c | 47 ++++++++++----------------- fs/bcachefs/ec.h | 2 -- fs/bcachefs/recovery.c | 72 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/recovery.h | 7 ++++ fs/bcachefs/super.c | 7 +++- 6 files changed, 114 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 559b9be50952..a08ae42cc073 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -208,29 +208,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, get_alloc_field(a.v, &d, i)); } -int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_s_c k) { - struct btree_trans trans; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - - bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, - BTREE_ID_ALLOC, POS_MIN); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + if (!level) bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); - bch2_btree_and_journal_iter_advance(&iter); - } + return 0; +} + +int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; - ret = bch2_trans_exit(&trans) ?: ret; + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, + NULL, bch2_alloc_read_fn); if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 909a4a5036ab..074b811e9043 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1273,38 +1273,28 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) return ret; } -int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) +static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_s_c k) { - struct btree_trans trans; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_fs_ec_start(c); - if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); - - bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, - BTREE_ID_EC, POS_MIN); - + int ret = 0; - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); + if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); - bch2_btree_and_journal_iter_advance(&iter); - } + return ret; +} - ret = bch2_trans_exit(&trans) ?: ret; - if (ret) { +int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) +{ + int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, + NULL, bch2_stripes_read_fn); + if (ret) bch_err(c, "error reading stripes: %i", ret); - return ret; - } - return 0; + return ret; } int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) @@ -1343,11 +1333,6 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) return 0; } -int bch2_fs_ec_start(struct bch_fs *c) -{ - return bch2_ec_mem_alloc(c, false); -} - void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 
cf67abd48490..4dfaac034886 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -157,8 +157,6 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); -int bch2_fs_ec_start(struct bch_fs *); - void bch2_fs_ec_exit(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8e9d412a6000..95265f1c2b21 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -191,6 +191,78 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i b->c.btree_id, b->c.level, b->data->min_key); } +/* Walk btree, overlaying keys from the journal: */ + +static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, + struct journal_keys *journal_keys, + enum btree_id btree_id, + btree_walk_node_fn node_fn, + btree_walk_key_fn key_fn) +{ + struct btree_and_journal_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + ret = key_fn(c, btree_id, b->c.level, k); + if (ret) + break; + + if (b->c.level) { + struct btree *child; + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + + if (b->c.level > 0) { + child = bch2_btree_node_get_noiter(c, &tmp.k, + b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + + ret = (node_fn ? node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); + six_unlock_read(&child->c.lock); + + if (ret) + break; + } + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } + + return ret; +} + +int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, + enum btree_id btree_id, + btree_walk_node_fn node_fn, + btree_walk_key_fn key_fn) +{ + struct btree *b = c->btree_roots[btree_id].b; + int ret = 0; + + if (btree_node_fake(b)) + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); + ret = (node_fn ? 
node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, + node_fn, key_fn) ?: + key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); + six_unlock_read(&b->c.lock); + + return ret; +} + /* sort and dedup all keys in the journal: */ void bch2_journal_entries_free(struct list_head *list) diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 19f2f172a26b..a66827c9addf 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -44,6 +44,13 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct journal_keys *, struct btree *); +typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); +typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_s_c k); + +int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, + btree_walk_node_fn, btree_walk_key_fn); + void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b5ab579a25c..165163f3896e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -199,6 +199,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) goto nowrote_alloc; + bch_verbose(c, "writing alloc info"); + do { wrote = false; @@ -229,6 +231,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes = wrote ? 0 : clean_passes + 1; } while (clean_passes < 2); + bch_verbose(c, "writing alloc info complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); nowrote_alloc: for_each_member_device(ca, c, i) @@ -313,8 +316,10 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && - !c->opts.norecovery) + !c->opts.norecovery) { + bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); + } clear_bit(BCH_FS_RW, &c->flags); } -- cgit From 96e2aa1be5eebd81ed572baf69f8cb82d56e39bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 May 2020 19:29:48 -0400 Subject: bcachefs: Add a mechanism for passing extra journal entries to bch2_trans_commit() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ fs/bcachefs/btree_types.h | 3 +++ fs/bcachefs/btree_update_leaf.c | 12 +++++++++++- fs/bcachefs/journal.h | 11 ++++++++--- 4 files changed, 25 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7b12bd163df7..ca775e63b4c6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2153,6 +2153,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->nr_updates2 = 0; trans->mem_top = 0; + trans->extra_journal_entries = NULL; + trans->extra_journal_entry_u64s = 0; + if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; memset((void *) trans->fs_usage_deltas + diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 57796340fb36..769c05c8d938 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -308,6 +308,9 @@ struct btree_trans { struct btree_insert_entry *updates2; /* update path: */ + struct jset_entry *extra_journal_entries; + unsigned extra_journal_entry_u64s; + struct journal_res journal_res; struct journal_preres journal_preres; u64 *journal_seq; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 
a93bc1890263..98b60d230dce 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -413,6 +413,16 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, goto err; } + if (unlikely(trans->extra_journal_entry_u64s)) { + memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal, + &trans->journal_res), + trans->extra_journal_entries, + trans->extra_journal_entry_u64s); + + trans->journal_res.offset += trans->extra_journal_entry_u64s; + trans->journal_res.u64s -= trans->extra_journal_entry_u64s; + } + /* * Not allowed to fail after we've gotten our journal reservation - we * have to use it: @@ -800,7 +810,7 @@ int __bch2_trans_commit(struct btree_trans *trans) memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = 0; + trans->journal_u64s = trans->extra_journal_entry_u64s; trans->journal_preres_u64s = 0; if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 81e26ba43fa1..6630db6ecc14 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -199,13 +199,18 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) return entry; } +static inline struct jset_entry * +bch2_journal_reservation_entry(struct journal *j, struct journal_res *res) +{ + return vstruct_idx(j->buf[res->idx].data, res->offset); +} + static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, unsigned type, enum btree_id id, unsigned level, const void *data, unsigned u64s) { - struct journal_buf *buf = &j->buf[res->idx]; - struct jset_entry *entry = vstruct_idx(buf->data, res->offset); + struct jset_entry *entry = bch2_journal_reservation_entry(j, res); unsigned actual = jset_u64s(u64s); EBUG_ON(!res->ref); @@ -221,7 +226,7 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res entry->pad[0] = 0; entry->pad[1] = 0; entry->pad[2] = 0; - memcpy_u64s(entry->_data, data, u64s); + memcpy_u64s_small(entry->_data, data, u64s); } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, -- cgit From c823c3390bd2f325f78bab493f84ea8a84f5ddc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 May 2020 20:35:53 -0400 Subject: bcachefs: Factor out bch2_fs_btree_interior_update_init() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 19 +++++++++++++++++++ fs/bcachefs/btree_update_interior.h | 3 +++ fs/bcachefs/super.c | 13 ++----------- 3 files changed, 24 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 28568db5834a..1867d732afd4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2247,3 +2247,22 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) return ret; } + +void bch2_fs_btree_interior_update_exit(struct bch_fs *c) +{ + mempool_exit(&c->btree_interior_update_pool); + mempool_exit(&c->btree_reserve_pool); +} + +int bch2_fs_btree_interior_update_init(struct bch_fs *c) +{ + mutex_init(&c->btree_reserve_cache_lock); + INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); + mutex_init(&c->btree_interior_update_lock); + + return mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, + sizeof(struct btree_reserve)) ?: + mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update)); +} diff --git 
a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index fb35be00f1bb..5cec87951dc7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -333,4 +333,7 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); +void bch2_fs_btree_interior_update_exit(struct bch_fs *); +int bch2_fs_btree_interior_update_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 165163f3896e..4335e0a11c2e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -476,6 +476,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); + bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); @@ -494,8 +495,6 @@ static void bch2_fs_free(struct bch_fs *c) mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); - mempool_exit(&c->btree_interior_update_pool); - mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); kfree(c->replicas.entries); @@ -657,11 +656,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); - INIT_LIST_HEAD(&c->btree_interior_update_list); - INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); - mutex_init(&c->btree_reserve_cache_lock); - mutex_init(&c->btree_interior_update_lock); - mutex_init(&c->usage_scratch_lock); mutex_init(&c->bio_bounce_pages_lock); @@ -736,10 +730,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, - sizeof(struct btree_reserve)) || - mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), @@ -756,6 +746,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_replicas_init(c) || bch2_fs_btree_cache_init(c) || bch2_fs_btree_iter_init(c) || + bch2_fs_btree_interior_update_init(c) || bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || -- cgit From 00b8ccf7074fddb5607a26673f331ceac2ecd319 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 May 2020 14:57:06 -0400 Subject: bcachefs: Interior btree updates are now fully transactional We now update the alloc info (bucket sector counts) atomically with journalling the update to the interior btree nodes, and we also set new btree roots atomically with the journalled part of the btree update. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 - fs/bcachefs/bcachefs.h | 6 +- fs/bcachefs/btree_gc.c | 12 +- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 816 +++++++++++++----------------------- fs/bcachefs/btree_update_interior.h | 64 ++- fs/bcachefs/btree_update_leaf.c | 7 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/buckets.h | 2 - fs/bcachefs/journal.c | 5 +- fs/bcachefs/journal.h | 31 +- fs/bcachefs/journal_io.c | 20 +- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/journal_reclaim.h | 2 +- fs/bcachefs/keylist.c | 4 +- fs/bcachefs/keylist.h | 4 +- fs/bcachefs/migrate.c | 11 +- fs/bcachefs/move.c | 10 +- fs/bcachefs/recovery.c | 7 +- fs/bcachefs/super-io.c | 22 +- fs/bcachefs/super.c | 5 + 21 files changed, 412 insertions(+), 626 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a08ae42cc073..b3c5d82c15de 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1461,11 +1461,6 @@ again: } rcu_read_unlock(); - if (c->btree_roots_dirty) { - bch2_journal_meta(&c->journal); - goto again; - } - return !nodes_unwritten && !bch2_btree_interior_updates_nr_pending(c); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 069a3c416bc5..e12946d686dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -603,13 +603,10 @@ struct bch_fs { struct bio_set btree_bio; struct btree_root btree_roots[BTREE_ID_NR]; - bool btree_roots_dirty; struct mutex btree_root_lock; struct btree_cache btree_cache; - mempool_t btree_reserve_pool; - /* * Cache of allocated btree nodes - if we allocate a btree node and * don't use it, if we free it that space can't be reused until going @@ -627,6 +624,9 @@ struct bch_fs { struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + mempool_t btree_iters_pool; struct workqueue_struct *wq; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1a97a74b36c8..6589fe0bad6c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -466,6 +466,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) mutex_unlock(&c->sb_lock); } +#if 0 /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { @@ -483,6 +484,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } +#endif static void bch2_mark_allocator_buckets(struct bch_fs *c) { @@ -801,6 +803,10 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, trace_gc_start(c); down_write(&c->gc_lock); + + /* flush interior btree updates: */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); again: ret = bch2_gc_start(c, metadata_only); if (ret) @@ -812,7 +818,9 @@ again: if (ret) goto out; +#if 0 bch2_mark_pending_btree_node_frees(c); +#endif bch2_mark_allocator_buckets(c); c->gc_count++; @@ -1037,6 +1045,8 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, btree_node_reset_sib_u64s(n); bch2_btree_build_aux_trees(n); + + bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); bch2_btree_node_write(c, n, SIX_LOCK_intent); @@ -1085,7 +1095,7 @@ next: bch2_btree_iter_node_replace(iter, new_nodes[0]); for (i = 0; i < nr_new_nodes; i++) - 
bch2_open_buckets_put(c, &new_nodes[i]->ob); + bch2_btree_update_get_open_buckets(as, new_nodes[i]); /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 769c05c8d938..0ecd00475712 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -310,6 +310,7 @@ struct btree_trans { /* update path: */ struct jset_entry *extra_journal_entries; unsigned extra_journal_entry_u64s; + struct journal_entry_pin *journal_pin; struct journal_res journal_res; struct journal_preres journal_preres; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1867d732afd4..7d63c457a3bf 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -21,10 +21,6 @@ #include -static void btree_node_will_make_reachable(struct btree_update *, - struct btree *); -static void btree_update_drop_new_node(struct bch_fs *, struct btree *); - /* Debug code: */ /* @@ -124,74 +120,6 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ -static bool btree_key_matches(struct bch_fs *c, - struct bkey_s_c l, - struct bkey_s_c r) -{ - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r); - const struct bch_extent_ptr *ptr1, *ptr2; - - bkey_for_each_ptr(ptrs1, ptr1) - bkey_for_each_ptr(ptrs2, ptr2) - if (ptr1->dev == ptr2->dev && - ptr1->gen == ptr2->gen && - ptr1->offset == ptr2->offset) - return true; - - return false; -} - -/* - * We're doing the index update that makes @b unreachable, update stuff to - * reflect that: - * - * Must be called _before_ btree_update_updated_root() or - * btree_update_updated_node: - */ -static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, - struct bkey_s_c k, - struct bch_fs_usage *stats) -{ - struct bch_fs *c = as->c; - struct pending_btree_node_free *d; - - for (d = as->pending; d < as->pending + as->nr_pending; d++) - if (!bkey_cmp(k.k->p, d->key.k.p) && - btree_key_matches(c, k, bkey_i_to_s_c(&d->key))) - goto found; - BUG(); -found: - BUG_ON(d->index_update_done); - d->index_update_done = true; - - /* - * We're dropping @k from the btree, but it's still live until the - * index update is persistent so we need to keep a reference around for - * mark and sweep to find - that's primarily what the - * btree_node_pending_free list is for. - * - * So here (when we set index_update_done = true), we're moving an - * existing reference to a different part of the larger "gc keyspace" - - * and the new position comes after the old position, since GC marks - * the pending free list after it walks the btree. - * - * If we move the reference while mark and sweep is _between_ the old - * and the new position, mark and sweep will see the reference twice - * and it'll get double accounted - so check for that here and subtract - * to cancel out one of mark and sweep's markings if necessary: - */ - - if (gc_pos_cmp(c->gc_pos, b - ? 
gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id)) >= 0 && - gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) - bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - 0, 0, NULL, 0, - BTREE_TRIGGER_OVERWRITE| - BTREE_TRIGGER_GC); -} - static void __btree_node_free(struct bch_fs *c, struct btree *b) { trace_btree_node_free(c, b); @@ -216,8 +144,6 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { struct open_buckets ob = b->ob; - btree_update_drop_new_node(c, b); - b->ob.nr = 0; clear_btree_node_dirty(b); @@ -237,39 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, trans_for_each_iter(iter->trans, linked) BUG_ON(linked->l[b->c.level].b == b); - /* - * Is this a node that isn't reachable on disk yet? - * - * Nodes that aren't reachable yet have writes blocked until they're - * reachable - now that we've cancelled any pending writes and moved - * things waiting on that write to wait on this update, we can drop this - * node from the list of nodes that the other update is making - * reachable, prior to freeing it: - */ - btree_update_drop_new_node(c, b); - six_lock_write(&b->c.lock, NULL, NULL); __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } -static void bch2_btree_node_free_ondisk(struct bch_fs *c, - struct pending_btree_node_free *pending, - u64 journal_seq) -{ - BUG_ON(!pending->index_update_done); - - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE); - - if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, journal_seq, - BTREE_TRIGGER_OVERWRITE| - BTREE_TRIGGER_GC); -} - static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct disk_reservation *res, struct closure *cl, @@ -357,9 +256,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev int ret; BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!as->reserve->nr); + BUG_ON(!as->nr_prealloc_nodes); - b = as->reserve->b[--as->reserve->nr]; + b = as->prealloc_nodes[--as->nr_prealloc_nodes]; set_btree_node_accessed(b); set_btree_node_dirty(b); @@ -394,8 +293,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bch2_btree_build_aux_trees(b); - btree_node_will_make_reachable(as, b); - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); @@ -466,19 +363,20 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); + bch2_btree_update_add_new_node(as, b); six_unlock_write(&b->c.lock); return b; } -static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) +static void bch2_btree_reserve_put(struct btree_update *as) { - bch2_disk_reservation_put(c, &reserve->disk_res); + struct bch_fs *c = as->c; mutex_lock(&c->btree_reserve_cache_lock); - while (reserve->nr) { - struct btree *b = reserve->b[--reserve->nr]; + while (as->nr_prealloc_nodes) { + struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; six_unlock_write(&b->c.lock); @@ -502,36 +400,14 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser } mutex_unlock(&c->btree_reserve_cache_lock); - - mempool_free(reserve, &c->btree_reserve_pool); } -static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, - unsigned nr_nodes, - unsigned flags, - struct closure *cl) +static int 
bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, + unsigned flags, struct closure *cl) { - struct btree_reserve *reserve; + struct bch_fs *c = as->c; struct btree *b; - struct disk_reservation disk_res = { 0, 0 }; - unsigned sectors = nr_nodes * c->opts.btree_node_size; - int ret, disk_res_flags = 0; - - if (flags & BTREE_INSERT_NOFAIL) - disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; - - /* - * This check isn't necessary for correctness - it's just to potentially - * prevent us from doing a lot of work that'll end up being wasted: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - return ERR_PTR(ret); - - if (bch2_disk_reservation_get(c, &disk_res, sectors, - c->opts.metadata_replicas, - disk_res_flags)) - return ERR_PTR(-ENOSPC); + int ret; BUG_ON(nr_nodes > BTREE_RESERVE_MAX); @@ -540,18 +416,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, * open bucket reserve: */ ret = bch2_btree_cache_cannibalize_lock(c, cl); - if (ret) { - bch2_disk_reservation_put(c, &disk_res); - return ERR_PTR(ret); - } - - reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); - - reserve->disk_res = disk_res; - reserve->nr = 0; + if (ret) + return ret; - while (reserve->nr < nr_nodes) { - b = __bch2_btree_node_alloc(c, &disk_res, + while (as->nr_prealloc_nodes < nr_nodes) { + b = __bch2_btree_node_alloc(c, &as->disk_res, flags & BTREE_INSERT_NOWAIT ? NULL : cl, flags); if (IS_ERR(b)) { @@ -563,21 +432,20 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, if (ret) goto err_free; - reserve->b[reserve->nr++] = b; + as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } bch2_btree_cache_cannibalize_unlock(c); - return reserve; + return 0; err_free: - bch2_btree_reserve_put(c, reserve); bch2_btree_cache_cannibalize_unlock(c); trace_btree_reserve_get_fail(c, nr_nodes, cl); - return ERR_PTR(ret); + return ret; } /* Asynchronous interior node update machinery */ -static void __bch2_btree_update_free(struct btree_update *as) +static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; @@ -585,14 +453,13 @@ static void __bch2_btree_update_free(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as); - BUG_ON(as->nr_new_nodes || as->nr_pending); - - if (as->reserve) - bch2_btree_reserve_put(c, as->reserve); - + mutex_lock(&c->btree_interior_update_lock); list_del(&as->unwritten_list); list_del(&as->list); + mutex_unlock(&c->btree_interior_update_lock); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); @@ -600,37 +467,59 @@ static void __bch2_btree_update_free(struct btree_update *as) closure_wake_up(&c->btree_interior_update_wait); } -static void bch2_btree_update_free(struct btree_update *as) +static void btree_update_will_delete_key(struct btree_update *as, + struct bkey_i *k) { - struct bch_fs *c = as->c; + BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > + ARRAY_SIZE(as->_old_keys)); + bch2_keylist_add(&as->old_keys, k); +} - mutex_lock(&c->btree_interior_update_lock); - __bch2_btree_update_free(as); - mutex_unlock(&c->btree_interior_update_lock); +static void btree_update_will_add_key(struct btree_update *as, + struct bkey_i *k) +{ + BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > + ARRAY_SIZE(as->_new_keys)); + bch2_keylist_add(&as->new_keys, k); } -static inline bool six_trylock_intentwrite(struct six_lock *lock) +/* + * The 
transactional part of an interior btree node update, where we journal the + * update we did to the interior node and update alloc info: + */ +static int btree_update_nodes_written_trans(struct btree_trans *trans, + struct btree_update *as) { - if (!six_trylock_intent(lock)) - return false; + struct bkey_i *k; + int ret; + + trans->extra_journal_entries = (void *) &as->journal_entries[0]; + trans->extra_journal_entry_u64s = as->journal_u64s; + trans->journal_pin = &as->journal; - if (!six_trylock_write(lock)) { - six_unlock_intent(lock); - return false; + for_each_keylist_key(&as->new_keys, k) { + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + 0, 0, BTREE_TRIGGER_INSERT); + if (ret) + return ret; } - return true; + for_each_keylist_key(&as->old_keys, k) { + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + 0, 0, BTREE_TRIGGER_OVERWRITE); + if (ret) + return ret; + } + + return 0; } -static void btree_update_nodes_written(struct closure *cl) +static void btree_update_nodes_written(struct btree_update *as) { - struct btree_update *as = container_of(cl, struct btree_update, cl); - struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1]; - unsigned nr_nodes_need_write; - struct journal_res res = { 0 }; struct bch_fs *c = as->c; - struct btree_root *r; - struct btree *b; + struct btree *b = as->b; + u64 journal_seq = 0; + unsigned i; int ret; /* @@ -638,78 +527,17 @@ static void btree_update_nodes_written(struct closure *cl) * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node. */ - mutex_lock(&c->btree_interior_update_lock); - as->nodes_written = true; -again: - nr_nodes_need_write = 0; - as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, - struct btree_update, unwritten_list); - if (!as || !as->nodes_written) { - mutex_unlock(&c->btree_interior_update_lock); - return; - } - - b = as->b; - if (b && !six_trylock_intentwrite(&b->c.lock)) { - mutex_unlock(&c->btree_interior_update_lock); - - btree_node_lock_type(c, b, SIX_LOCK_intent); - six_lock_write(&b->c.lock, NULL, NULL); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - mutex_lock(&c->btree_interior_update_lock); - goto again; - } - - ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, - JOURNAL_RES_GET_NONBLOCK| - JOURNAL_RES_GET_RESERVED); - if (ret == -EAGAIN) { - unsigned u64s = as->journal_u64s; - - if (b) { - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - } - - mutex_unlock(&c->btree_interior_update_lock); - - ret = bch2_journal_res_get(&c->journal, &res, u64s, - JOURNAL_RES_GET_CHECK| - JOURNAL_RES_GET_RESERVED); - if (!ret) { - mutex_lock(&c->btree_interior_update_lock); - goto again; - } - } - - if (!ret) { - struct journal_buf *buf = &c->journal.buf[res.idx]; - struct jset_entry *entry = vstruct_idx(buf->data, res.offset); - - res.offset += as->journal_u64s; - res.u64s -= as->journal_u64s; - memcpy_u64s(entry, as->journal_entries, as->journal_u64s); - } else { - /* - * On journal error we have to run most of the normal path so - * that shutdown works - unblocking btree node writes in - * particular and writing them if needed - except for - * journalling the update: - */ - - BUG_ON(!bch2_journal_error(&c->journal)); - } - - switch (as->mode) { - case BTREE_INTERIOR_NO_UPDATE: - BUG(); - case BTREE_INTERIOR_UPDATING_NODE: - /* @b is the node we did the final insert into: */ - + ret = bch2_trans_do(c, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + 
BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + BUG_ON(ret && !bch2_journal_error(&c->journal)); + + if (b) { /* + * @b is the node we did the final insert into: + * * On failure to get a journal reservation, we still have to * unblock the write and allow most of the write path to happen * so that shutdown works, but the i->journal_seq mechanism @@ -719,83 +547,90 @@ again: * we're in journal error state: */ + btree_node_lock_type(c, b, SIX_LOCK_intent); + btree_node_lock_type(c, b, SIX_LOCK_write); + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->write_blocked_list); - if (!ret) { + if (!ret && as->b == b) { struct bset *i = btree_bset_last(b); + BUG_ON(!b->c.level); + BUG_ON(!btree_node_dirty(b)); + i->journal_seq = cpu_to_le64( - max(res.seq, + max(journal_seq, le64_to_cpu(i->journal_seq))); - bch2_btree_add_journal_pin(c, b, res.seq); + bch2_btree_add_journal_pin(c, b, journal_seq); } - nodes_need_write[nr_nodes_need_write++] = b; - + mutex_unlock(&c->btree_interior_update_lock); six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - break; - - case BTREE_INTERIOR_UPDATING_AS: - BUG_ON(b); - break; - - case BTREE_INTERIOR_UPDATING_ROOT: - r = &c->btree_roots[as->btree_id]; - BUG_ON(b); - - mutex_lock(&c->btree_root_lock); - bkey_copy(&r->key, as->parent_keys.keys); - r->level = as->level; - r->alive = true; - c->btree_roots_dirty = true; - mutex_unlock(&c->btree_root_lock); - break; + btree_node_write_if_need(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->c.lock); } bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_res_put(&c->journal, &res); bch2_journal_preres_put(&c->journal, &as->journal_preres); - while (as->nr_new_nodes) { - b = as->new_nodes[--as->nr_new_nodes]; + mutex_lock(&c->btree_interior_update_lock); + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; + } + mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; - nodes_need_write[nr_nodes_need_write++] = b; + btree_node_lock_type(c, b, SIX_LOCK_read); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); } - while (as->nr_pending) - bch2_btree_node_free_ondisk(c, - &as->pending[--as->nr_pending], res.seq); + for (i = 0; i < as->nr_open_buckets; i++) + bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - __bch2_btree_update_free(as); - /* - * for flush_held_btree_writes() waiting on updates to flush or - * nodes to be writeable: - */ - closure_wake_up(&c->btree_interior_update_wait); + bch2_btree_update_free(as); +} - /* - * Can't take btree node locks while holding btree_interior_update_lock: - * */ - mutex_unlock(&c->btree_interior_update_lock); +static void btree_interior_update_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, btree_interior_update_work); + struct btree_update *as; - /* Do btree writes after dropping journal res/locks: */ - while (nr_nodes_need_write) { - b = nodes_need_write[--nr_nodes_need_write]; + while (1) { + mutex_lock(&c->btree_interior_update_lock); + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (as && !as->nodes_written) + as = NULL; + mutex_unlock(&c->btree_interior_update_lock); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); - 
six_unlock_read(&b->c.lock); + if (!as) + break; + + btree_update_nodes_written(as); } +} + +static void btree_update_set_nodes_written(struct closure *cl) +{ + struct btree_update *as = container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); - goto again; + as->nodes_written = true; + mutex_unlock(&c->btree_interior_update_lock); + + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); } /* @@ -814,7 +649,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; - as->level = b->c.level; list_add(&as->write_blocked_list, &b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); @@ -845,25 +679,45 @@ static void btree_update_reparent(struct btree_update *as, static void btree_update_updated_root(struct btree_update *as, struct btree *b) { + struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); - BUG_ON(!bch2_keylist_empty(&as->parent_keys)); + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); as->mode = BTREE_INTERIOR_UPDATING_ROOT; - as->level = b->c.level; - bch2_keylist_add(&as->parent_keys, &b->key); mutex_unlock(&c->btree_interior_update_lock); } -static void btree_node_will_make_reachable(struct btree_update *as, - struct btree *b) +/* + * bch2_btree_update_add_new_node: + * + * This causes @as to wait on @b to be written, before it gets to + * bch2_btree_update_nodes_written + * + * Additionally, it sets b->will_make_reachable to prevent any additional writes + * to @b from happening besides the first until @b is reachable on disk + * + * And it adds @b to the list of @as's new nodes, so that we can update sector + * counts in bch2_btree_update_nodes_written: + */ +void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; + closure_get(&as->cl); + mutex_lock(&c->btree_interior_update_lock); BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); BUG_ON(b->will_make_reachable); @@ -871,10 +725,14 @@ static void btree_node_will_make_reachable(struct btree_update *as, as->new_nodes[as->nr_new_nodes++] = b; b->will_make_reachable = 1UL|(unsigned long) as; - closure_get(&as->cl); mutex_unlock(&c->btree_interior_update_lock); + + btree_update_will_add_key(as, &b->key); } +/* + * returns true if @b was a new node + */ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) { struct btree_update *as; @@ -882,6 +740,11 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) unsigned i; mutex_lock(&c->btree_interior_update_lock); + /* + * When b->will_make_reachable != 0, it owns a ref on as->cl that's + * dropped when it gets written by bch2_btree_complete_write - the + * xchg() is for synchronization with bch2_btree_complete_write: + */ v = xchg(&b->will_make_reachable, 0); as = (struct btree_update *) (v & ~1UL); @@ -903,25 +766,11 @@ found: closure_put(&as->cl); } -static void btree_interior_update_add_node_reference(struct btree_update *as, - struct btree *b) +void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) { - struct 
bch_fs *c = as->c; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - - /* Add this node to the list of nodes being freed: */ - BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); - - d = &as->pending[as->nr_pending++]; - d->index_update_done = false; - d->seq = b->data->keys.seq; - d->btree_id = b->c.btree_id; - d->level = b->c.level; - bkey_copy(&d->key, &b->key); - - mutex_unlock(&c->btree_interior_update_lock); + while (b->ob.nr) + as->open_buckets[as->nr_open_buckets++] = + b->ob.v[--b->ob.nr]; } /* @@ -941,8 +790,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, if (btree_node_fake(b)) return; - btree_interior_update_add_node_reference(as, b); - mutex_lock(&c->btree_interior_update_lock); /* @@ -984,16 +831,28 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); + + /* + * Is this a node that isn't reachable on disk yet? + * + * Nodes that aren't reachable yet have writes blocked until they're + * reachable - now that we've cancelled any pending writes and moved + * things waiting on that write to wait on this update, we can drop this + * node from the list of nodes that the other update is making + * reachable, prior to freeing it: + */ + btree_update_drop_new_node(c, b); + + btree_update_will_delete_key(as, &b->key); } void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); - bch2_btree_reserve_put(as->c, as->reserve); - as->reserve = NULL; + bch2_btree_reserve_put(as); - continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq); + continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); } struct btree_update * @@ -1002,12 +861,32 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, struct closure *cl) { struct bch_fs *c = trans->c; - struct journal_preres journal_preres = { 0 }; - struct btree_reserve *reserve; struct btree_update *as; - int ret; + int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; + + /* + * This check isn't necessary for correctness - it's just to potentially + * prevent us from doing a lot of work that'll end up being wasted: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + return ERR_PTR(ret); + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->btree_id = id; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); + bch2_keylist_init(&as->old_keys, as->_old_keys); + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); - ret = bch2_journal_preres_get(&c->journal, &journal_preres, + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { @@ -1016,46 +895,41 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, bch2_trans_unlock(trans); - ret = bch2_journal_preres_get(&c->journal, &journal_preres, + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, 0); if (ret) return ERR_PTR(ret); if (!bch2_trans_relock(trans)) { - bch2_journal_preres_put(&c->journal, &journal_preres); - return ERR_PTR(-EINTR); + ret = -EINTR; + goto err; } } - reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); - if (IS_ERR(reserve)) { - bch2_journal_preres_put(&c->journal, &journal_preres); - return ERR_CAST(reserve); - } - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, NULL); - as->c = c; - as->mode = BTREE_INTERIOR_NO_UPDATE; - as->btree_id = id; - as->reserve = reserve; - INIT_LIST_HEAD(&as->write_blocked_list); - INIT_LIST_HEAD(&as->unwritten_list); - as->journal_preres = journal_preres; + ret = bch2_disk_reservation_get(c, &as->disk_res, + nr_nodes * c->opts.btree_node_size, + c->opts.metadata_replicas, + disk_res_flags); + if (ret) + goto err; - bch2_keylist_init(&as->parent_keys, as->inline_keys); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); + if (ret) + goto err; mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); return as; +err: + bch2_btree_update_free(as); + return ERR_PTR(ret); } /* Btree root updates: */ -static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) +static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) { /* Root nodes cannot be reaped */ mutex_lock(&c->btree_cache.lock); @@ -1073,38 +947,6 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - struct btree *old = btree_node_root(c, b); - struct bch_fs_usage_online *fs_usage; - - __bch2_btree_set_root_inmem(c, b); - - mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - - bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, 0, &fs_usage->u, 0, - BTREE_TRIGGER_INSERT); - if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) - bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, 0, NULL, 0, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_GC); - - if (old && !btree_node_fake(old)) - bch2_btree_node_free_index(as, NULL, - bkey_i_to_s_c(&old->key), - &fs_usage->u); 
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); - - bch2_fs_usage_scratch_put(c, fs_usage); - percpu_up_read(&c->mark_lock); - mutex_unlock(&c->btree_interior_update_lock); -} - /** * bch_btree_set_root - update the root in memory and on disk * @@ -1135,7 +977,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, */ bch2_btree_node_lock_write(old, iter); - bch2_btree_set_root_inmem(as, b); + bch2_btree_set_root_inmem(c, b); btree_update_updated_root(as, b); @@ -1156,57 +998,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_i *insert, struct btree_node_iter *node_iter) { - struct bch_fs *c = as->c; - struct bch_fs_usage_online *fs_usage; - struct jset_entry *entry; struct bkey_packed *k; - struct bkey tmp; BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); - entry = (void *) &as->journal_entries[as->journal_u64s]; - memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(insert->k.u64s); - entry->type = BCH_JSET_ENTRY_btree_keys; - entry->btree_id = b->c.btree_id; - entry->level = b->c.level; - memcpy_u64s_small(entry->_data, insert, insert->k.u64s); - as->journal_u64s += jset_u64s(insert->k.u64s); - - mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - - bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, 0, &fs_usage->u, 0, - BTREE_TRIGGER_INSERT); - - if (gc_visited(c, gc_pos_btree_node(b))) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, 0, NULL, 0, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_GC); + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_keys, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); - /* - * If we're overwriting, look up pending delete and mark so that gc - * marks it on the pending delete list: - */ - if (k && !bkey_cmp_packed(b, k, &insert->k)) - bch2_btree_node_free_index(as, b, - bkey_disassemble(b, k, &tmp), - &fs_usage->u); - - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); - - bch2_fs_usage_scratch_put(c, fs_usage); - percpu_up_read(&c->mark_lock); - mutex_unlock(&c->btree_interior_update_lock); - bch2_btree_bset_insert_key(iter, b, node_iter, insert); set_btree_node_dirty(b); set_btree_node_need_write(b); @@ -1226,6 +1032,7 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bkey_packed *k, *prev = NULL; n2 = bch2_btree_node_alloc(as, n1->c.level); + bch2_btree_update_add_new_node(as, n2); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1321,14 +1128,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; - /* - * XXX - * - * these updates must be journalled - * - * oops - */ - BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); bch2_btree_node_iter_init(&node_iter, b, &k->k.p); @@ -1380,6 +1179,7 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_interior_update_will_free_node(as, b); n1 = bch2_btree_node_alloc_replacement(as, b); + bch2_btree_update_add_new_node(as, n1); if (keys) btree_split_insert_keys(as, n1, iter, keys); @@ -1439,11 +1239,11 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_set_root(as, n1, iter); } - bch2_open_buckets_put(c, 
&n1->ob); + bch2_btree_update_get_open_buckets(as, n1); if (n2) - bch2_open_buckets_put(c, &n2->ob); + bch2_btree_update_get_open_buckets(as, n2); if (n3) - bch2_open_buckets_put(c, &n3->ob); + bch2_btree_update_get_open_buckets(as, n3); /* Successful split, update the iterator to point to the new nodes: */ @@ -1538,7 +1338,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_lock_for_insert(c, b, iter); - if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(b, iter); goto split; } @@ -1749,6 +1549,7 @@ retry: bch2_btree_interior_update_will_free_node(as, m); n = bch2_btree_node_alloc(as, b->c.level); + bch2_btree_update_add_new_node(as, n); btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); @@ -1771,7 +1572,7 @@ retry: bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); - bch2_open_buckets_put(c, &n->ob); + bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); @@ -1859,6 +1660,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_interior_update_will_free_node(as, b); n = bch2_btree_node_alloc_replacement(as, b); + bch2_btree_update_add_new_node(as, n); bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); @@ -1874,7 +1676,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_set_root(as, n, iter); } - bch2_open_buckets_put(c, &n->ob); + bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); @@ -1949,49 +1751,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree *parent; int ret; - /* - * Two corner cases that need to be thought about here: - * - * @b may not be reachable yet - there might be another interior update - * operation waiting on @b to be written, and we're gonna deliver the - * write completion to that interior update operation _before_ - * persisting the new_key update - * - * That ends up working without us having to do anything special here: - * the reason is, we do kick off (and do the in memory updates) for the - * update for @new_key before we return, creating a new interior_update - * operation here. - * - * The new interior update operation here will in effect override the - * previous one. The previous one was going to terminate - make @b - * reachable - in one of two ways: - * - updating the btree root pointer - * In that case, - * no, this doesn't work. argh. - */ - - if (b->will_make_reachable) - as->must_rewrite = true; - - btree_interior_update_add_node_reference(as, b); - - /* - * XXX: the rest of the update path treats this like we're actually - * inserting a new node and deleting the existing node, so the - * reservation needs to include enough space for @b - * - * that is actually sketch as fuck though and I am surprised the code - * seems to work like that, definitely need to go back and rework it - * into something saner. 
- * - * (I think @b is just getting double counted until the btree update - * finishes and "deletes" @b on disk) - */ - ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, - c->opts.btree_node_size * - bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); + btree_update_will_delete_key(as, &b->key); + btree_update_will_add_key(as, new_key); parent = btree_node_parent(iter, b); if (parent) { @@ -2019,44 +1780,18 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bkey_copy(&b->key, new_key); } } else { - struct bch_fs_usage_online *fs_usage; - BUG_ON(btree_node_root(c, b) != b); bch2_btree_node_lock_write(b, iter); + bkey_copy(&b->key, new_key); - mutex_lock(&c->btree_interior_update_lock); - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - - bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), - 0, 0, &fs_usage->u, 0, - BTREE_TRIGGER_INSERT); - if (gc_visited(c, gc_pos_btree_root(b->c.btree_id))) - bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), - 0, 0, NULL, 0, - BTREE_TRIGGER_INSERT|| - BTREE_TRIGGER_GC); - - bch2_btree_node_free_index(as, NULL, - bkey_i_to_s_c(&b->key), - &fs_usage->u); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); - - bch2_fs_usage_scratch_put(c, fs_usage); - percpu_up_read(&c->mark_lock); - mutex_unlock(&c->btree_interior_update_lock); - - if (btree_ptr_hash_val(new_key) != b->hash_val) { + if (btree_ptr_hash_val(&b->key) != b->hash_val) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); - bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); mutex_unlock(&c->btree_cache.lock); - } else { - bkey_copy(&b->key, new_key); } btree_update_updated_root(as, b); @@ -2171,7 +1906,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) { BUG_ON(btree_node_root(c, b)); - __bch2_btree_set_root_inmem(c, b); + bch2_btree_set_root_inmem(c, b); } void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) @@ -2211,7 +1946,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->c.level, b->c.btree_id); BUG_ON(ret); - __bch2_btree_set_root_inmem(c, b); + bch2_btree_set_root_inmem(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -2248,10 +1983,59 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) return ret; } +void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) +{ + struct btree_root *r; + struct jset_entry *entry; + + mutex_lock(&c->btree_root_lock); + + vstruct_for_each(jset, entry) + if (entry->type == BCH_JSET_ENTRY_btree_root) { + r = &c->btree_roots[entry->btree_id]; + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, &entry->start[0]); + } + + mutex_unlock(&c->btree_root_lock); +} + +struct jset_entry * +bch2_btree_roots_to_journal_entries(struct bch_fs *c, + struct jset_entry *start, + struct jset_entry *end) +{ + struct jset_entry *entry; + unsigned long have = 0; + unsigned i; + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root) + __set_bit(entry->btree_id, &have); + + mutex_lock(&c->btree_root_lock); + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].alive && !test_bit(i, &have)) { + journal_entry_set(end, + BCH_JSET_ENTRY_btree_root, + i, c->btree_roots[i].level, + &c->btree_roots[i].key, + c->btree_roots[i].key.u64s); + end = vstruct_next(end); + } + + mutex_unlock(&c->btree_root_lock); + + return end; +} + void 
bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + if (c->btree_interior_update_worker) + destroy_workqueue(c->btree_interior_update_worker); mempool_exit(&c->btree_interior_update_pool); - mempool_exit(&c->btree_reserve_pool); } int bch2_fs_btree_interior_update_init(struct bch_fs *c) @@ -2260,9 +2044,13 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_interior_update_list); INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); mutex_init(&c->btree_interior_update_lock); + INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + + c->btree_interior_update_worker = + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!c->btree_interior_update_worker) + return -ENOMEM; - return mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, - sizeof(struct btree_reserve)) ?: - mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update)); + return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update)); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 5cec87951dc7..17bd1ca1fb78 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -6,34 +6,13 @@ #include "btree_locking.h" #include "btree_update.h" -struct btree_reserve { - struct disk_reservation disk_res; - unsigned nr; - struct btree *b[BTREE_RESERVE_MAX]; -}; - void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, struct bkey_format *); -/* Btree node freeing/allocation: */ - -/* - * Tracks a btree node that has been (or is about to be) freed in memory, but - * has _not_ yet been freed on disk (because the write that makes the new - * node(s) visible and frees the old hasn't completed yet) - */ -struct pending_btree_node_free { - bool index_update_done; - - __le64 seq; - enum btree_id btree_id; - unsigned level; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; +#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -#define BTREE_UPDATE_JOURNAL_RES \ - ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2) +#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) /* * Tracks an in progress split/rewrite of a btree node and the update to the @@ -72,9 +51,8 @@ struct btree_update { unsigned nodes_written:1; enum btree_id btree_id; - u8 level; - struct btree_reserve *reserve; + struct disk_reservation disk_res; struct journal_preres journal_preres; /* @@ -96,17 +74,28 @@ struct btree_update { */ struct journal_entry_pin journal; - /* - * Nodes being freed: - * Protected by c->btree_node_pending_free_lock - */ - struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; - unsigned nr_pending; + /* Preallocated nodes we reserve when we start the update: */ + struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_prealloc_nodes; + + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_VAL_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_VAL_U64s_MAX]; /* New nodes, that will be made reachable by this update: */ - struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; unsigned nr_new_nodes; + u8 open_buckets[BTREE_UPDATE_NODES_MAX * + BCH_REPLICAS_MAX]; + u8 
nr_open_buckets; + unsigned journal_u64s; u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; @@ -120,14 +109,12 @@ struct btree_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -#define for_each_pending_btree_node_free(c, as, p) \ - list_for_each_entry(as, &c->btree_interior_update_list, list) \ - for (p = as->pending; p < as->pending + as->nr_pending; p++) - void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, struct btree_iter *); void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); +void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); @@ -139,6 +126,7 @@ bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); +void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); void bch2_btree_insert_node(struct btree_update *, struct btree *, struct btree_iter *, struct keylist *, @@ -333,6 +321,10 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); +void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); +struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, + struct jset_entry *, struct jset_entry *); + void bch2_fs_btree_interior_update_exit(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 98b60d230dce..ffcaecc8a64f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -414,8 +414,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, } if (unlikely(trans->extra_journal_entry_u64s)) { - memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal, - &trans->journal_res), + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), trans->extra_journal_entries, trans->extra_journal_entry_u64s); @@ -521,6 +520,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, i->iter); + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 43095ae4731d..5b827698c3e5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1180,7 +1180,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, return 0; } -int bch2_mark_key_locked(struct bch_fs *c, +static int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c k, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 29ebc07a2497..cea66c76850d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -259,8 +259,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, - struct bch_fs_usage *, u64, unsigned); int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, diff --git 
a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 220daf88f7b9..5c84569c3404 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -958,15 +958,12 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bch2_journal_flush_all_pins(j); wait_event(j->wait, journal_entry_close(j)); /* do we need to write another journal entry? */ - if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || - c->btree_roots_dirty) + if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) bch2_journal_meta(j); journal_quiesce(j); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6630db6ecc14..2c55f74522e2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -200,33 +200,40 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) } static inline struct jset_entry * -bch2_journal_reservation_entry(struct journal *j, struct journal_res *res) +journal_res_entry(struct journal *j, struct journal_res *res) { return vstruct_idx(j->buf[res->idx].data, res->offset); } +static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, + const void *data, unsigned u64s) +{ + entry->u64s = cpu_to_le16(u64s); + entry->btree_id = id; + entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; + memcpy_u64s_small(entry->_data, data, u64s); + + return jset_u64s(u64s); +} + static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, unsigned type, enum btree_id id, unsigned level, const void *data, unsigned u64s) { - struct jset_entry *entry = bch2_journal_reservation_entry(j, res); - unsigned actual = jset_u64s(u64s); + unsigned actual = journal_entry_set(journal_res_entry(j, res), + type, id, level, data, u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); res->offset += actual; res->u64s -= actual; - - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; - entry->pad[0] = 0; - entry->pad[1] = 0; - entry->pad[2] = 0; - memcpy_u64s_small(entry->_data, data, u64s); } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 421fde39ac0e..1724c80b323c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" #include "btree_io.h" +#include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -992,8 +993,23 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); - start = vstruct_last(jset); - end = bch2_journal_super_entries_add_common(c, start, + /* + * New btree roots are set by journalling them; when the journal entry + * gets written we have to propagate them to c->btree_roots + * + * But, every journal entry we write has to contain all the btree roots + * (at least for now); so after we copy btree roots to c->btree_roots we + * have to get any missing btree roots and add them to this journal + * entry: + */ + + bch2_journal_entries_to_btree_roots(c, jset); + + start = end = vstruct_last(jset); + + end = bch2_btree_roots_to_journal_entries(c, jset->start, end); + + end = bch2_journal_super_entries_add_common(c, end, le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); diff --git 
a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 341106ab4a77..6cb37045cf68 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -330,7 +330,7 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, __journal_pin_drop(j, pin); - BUG_ON(!atomic_read(&pin_list->count)); + BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); atomic_inc(&pin_list->count); pin->seq = seq; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 883a0a5680af..3ef641f7ce30 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -38,7 +38,7 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - if (unlikely(!journal_pin_active(pin))) + if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) __bch2_journal_pin_add(j, seq, pin, flush_fn); } diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 5da54ced9cad..864dfaa67b7a 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -6,7 +6,7 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, size_t nr_inline_u64s, size_t new_u64s) { - size_t oldsize = bch_keylist_u64s(l); + size_t oldsize = bch2_keylist_u64s(l); size_t newsize = oldsize + new_u64s; u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; u64 *new_keys; @@ -52,7 +52,7 @@ void bch2_keylist_pop_front(struct keylist *l) memmove_u64s_down(l->keys, bkey_next(l->keys), - bch_keylist_u64s(l)); + bch2_keylist_u64s(l)); } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index a7ff86b08abc..195799bb20bc 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -36,14 +36,14 @@ static inline bool bch2_keylist_empty(struct keylist *l) return l->top == l->keys; } -static inline size_t bch_keylist_u64s(struct keylist *l) +static inline size_t bch2_keylist_u64s(struct keylist *l) { return l->top_p - l->keys_p; } static inline size_t bch2_keylist_bytes(struct keylist *l) { - return bch_keylist_u64s(l) * sizeof(u64); + return bch2_keylist_u64s(l) * sizeof(u64); } static inline struct bkey_i *bch2_keylist_front(struct keylist *l) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index e26fa1608f39..96c8690adc5b 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -151,15 +151,8 @@ retry: } /* flush relevant btree updates */ - while (1) { - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c) || - c->btree_roots_dirty); - if (c->btree_roots_dirty) - bch2_journal_meta(&c->journal); - if (!bch2_btree_interior_updates_nr_pending(c)) - break; - } + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); ret = 0; err: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 882e86e70db7..02cc5089a163 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -774,14 +774,8 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; - while (1) { - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c) || - c->btree_roots_dirty); - if (!bch2_btree_interior_updates_nr_pending(c)) - break; - bch2_journal_meta(&c->journal); - } + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_replicas_gc2(c) ?: ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 95265f1c2b21..b386c7e15e97 
100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -763,6 +763,7 @@ static int verify_superblock_clean(struct bch_fs *c, "superblock read clock doesn't match journal after clean shutdown"); for (i = 0; i < BTREE_ID_NR; i++) { + char buf1[200], buf2[200]; struct bkey_i *k1, *k2; unsigned l1 = 0, l2 = 0; @@ -778,7 +779,11 @@ static int verify_superblock_clean(struct bch_fs *c, k1->k.u64s != k2->k.u64s || memcmp(k1, k2, bkey_bytes(k1)) || l1 != l2, c, - "superblock btree root doesn't match journal after clean shutdown"); + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), + l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); } fsck_err: return ret; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c9d2a01fec29..eb5a91d232e0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -955,7 +956,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; @@ -989,27 +989,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry *entry, u64 journal_seq) { - struct btree_root *r; unsigned i; - mutex_lock(&c->btree_root_lock); - - for (r = c->btree_roots; - r < c->btree_roots + BTREE_ID_NR; - r++) - if (r->alive) { - entry_init_u64s(entry, r->key.u64s + 1); - entry->btree_id = r - c->btree_roots; - entry->level = r->level; - entry->type = BCH_JSET_ENTRY_btree_root; - bkey_copy(&entry->start[0], &r->key); - - entry = vstruct_next(entry); - } - c->btree_roots_dirty = false; - - mutex_unlock(&c->btree_root_lock); - percpu_down_read(&c->mark_lock); if (!journal_seq) { @@ -1111,6 +1092,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) entry = sb_clean->start; entry = bch2_journal_super_entries_add_common(c, entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); memset(entry, 0, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4335e0a11c2e..3cf75ac1b804 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -227,6 +227,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); + flush_work(&c->btree_interior_update_work); clean_passes = wrote ? 
0 : clean_passes + 1; } while (clean_passes < 2); @@ -234,6 +235,10 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "writing alloc info complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + flush_work(&c->btree_interior_update_work); + for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); -- cgit From beb6db68a555d14b4b3a7b894d6d635774bcdafd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 May 2020 17:15:41 -0400 Subject: bcachefs: fsck_error_lock requires GFP_NOFS this fixes a lockdep splat Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 5a5cfee623e2..1662a36244cd 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -85,7 +85,7 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, if (s->fmt == fmt) goto found; - s = kzalloc(sizeof(*s), GFP_KERNEL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { if (!c->fsck_alloc_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -- cgit From baeed3c3c028359c625fbe92bc5924edaea91ec4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 May 2020 15:51:50 -0400 Subject: bcachefs: Don't require alloc btree to be updated before buckets are used This is to break a circular dependency in the shutdown path. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 11 ++++++++-- fs/bcachefs/buckets.c | 47 +++++++++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b3c5d82c15de..38173f662d1e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -843,7 +843,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct bkey_s_c k; bool invalidating_cached_data; size_t b; - int ret; + int ret = 0; BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); @@ -857,11 +857,18 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, BUG_ON(!fifo_push(&ca->free_inc, b)); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); + invalidating_cached_data = m.cached_sectors != 0; + if (!invalidating_cached_data) + goto out; + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); @@ -915,7 +922,7 @@ retry: flags); if (ret == -EINTR) goto retry; - +out: if (!ret) { /* remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5b827698c3e5..ebdbdd049f50 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1457,11 +1457,13 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret < 0) return ret; - if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { + if (k.k->type != KEY_TYPE_alloc || + (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { /* * During journal replay, and if gc repairs alloc info at * runtime, the alloc info in the btree might not be up to date - * yet - so, trust the in memory mark: + * yet - so, trust the in memory mark - unless we're already + * updating that key: */ 
struct bucket *g; struct bucket_mark m; @@ -1472,22 +1474,39 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); } else { - /* - * Unless we're already updating that key: - */ - if (k.k->type != KEY_TYPE_alloc) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "pointer to nonexistent bucket %llu:%llu", - iter->pos.inode, iter->pos.offset); - ret = -1; - goto out; - } - u = bch2_alloc_unpack(k); } - if (gen_after(u.gen, p.ptr.gen)) { + if (u.gen != p.ptr.gen) { ret = 1; + + if (gen_after(p.ptr.gen, u.gen)) { + bch2_fs_inconsistent(c, + "bucket %llu:%llu gen %u data type %s: ptr gen %u newer than bucket gen", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: data_type], + p.ptr.gen); + ret = -EIO; + } + + if (gen_cmp(u.gen, p.ptr.gen) >= 96U) { + bch2_fs_inconsistent(c, + "bucket %llu:%llu gen %u data type %s: ptr gen %u too stale", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: data_type], + p.ptr.gen); + ret = -EIO; + } + + if (!p.ptr.cached) { + bch2_fs_inconsistent(c, + "bucket %llu:%llu gen %u data type %s: stale dirty ptr (gen %u)", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: data_type], + p.ptr.gen); + ret = -EIO; + } + goto out; } -- cgit From 039fc4c5221f7433d8383e25a7c70b30793b4916 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 May 2020 16:06:13 -0400 Subject: bcachefs: Fixes for going RO Now that interior btree updates are fully transactional, we don't need to write out alloc info in a loop. However, interior btree updates do put more things in the journal, so we still need a loop in the RO sequence. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 11 ++++++++- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal_reclaim.c | 23 ++++++++++++++----- fs/bcachefs/journal_reclaim.h | 6 ++--- fs/bcachefs/super.c | 52 +++++++++++++++++++++++------------------- 5 files changed, 60 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 38173f662d1e..09a719b256b3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -869,6 +869,15 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, if (!invalidating_cached_data) goto out; + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { + ret = 1; + goto out; + } + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); @@ -956,7 +965,7 @@ out: percpu_up_read(&c->mark_lock); } - return ret; + return ret < 0 ? 
ret : 0; } static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e12946d686dd..a90072508819 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -482,6 +482,7 @@ enum { BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_STARTED, BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6cb37045cf68..556f12602fcf 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -413,10 +413,12 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) return ret; } -static void journal_flush_pins(struct journal *j, u64 seq_to_flush, +/* returns true if we did work */ +static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, unsigned min_nr) { struct journal_entry_pin *pin; + bool ret = false; u64 seq; lockdep_assert_held(&j->reclaim_lock); @@ -431,7 +433,10 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, BUG_ON(j->flush_in_progress != pin); j->flush_in_progress = NULL; wake_up(&j->pin_flush_wait); + ret = true; } + + return ret; } /** @@ -523,7 +528,8 @@ void bch2_journal_reclaim_work(struct work_struct *work) mutex_unlock(&j->reclaim_lock); } -static int journal_flush_done(struct journal *j, u64 seq_to_flush) +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + bool *did_work) { int ret; @@ -533,7 +539,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) mutex_lock(&j->reclaim_lock); - journal_flush_pins(j, seq_to_flush, 0); + *did_work = journal_flush_pins(j, seq_to_flush, 0); spin_lock(&j->lock); /* @@ -551,12 +557,17 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) return ret; } -void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { + bool did_work = false; + if (!test_bit(JOURNAL_STARTED, &j->flags)) - return; + return false; + + closure_wait_event(&j->async_wait, + journal_flush_done(j, seq_to_flush, &did_work)); - closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush)); + return did_work; } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 3ef641f7ce30..272ba8a37967 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -53,11 +53,11 @@ void bch2_journal_do_discards(struct journal *); void bch2_journal_reclaim(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); -void bch2_journal_flush_pins(struct journal *, u64); +bool bch2_journal_flush_pins(struct journal *, u64); -static inline void bch2_journal_flush_all_pins(struct journal *j) +static inline bool bch2_journal_flush_all_pins(struct journal *j) { - bch2_journal_flush_pins(j, U64_MAX); + return bch2_journal_flush_pins(j, U64_MAX); } int bch2_journal_flush_device_pins(struct journal *, int); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3cf75ac1b804..9da64d9d52e5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -175,7 +175,7 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; - bool wrote; + bool wrote = false; unsigned i, clean_passes = 0; int ret; @@ -200,39 +200,46 @@ static void __bch2_fs_read_only(struct bch_fs *c) goto nowrote_alloc; bch_verbose(c, "writing 
alloc info"); + /* + * This should normally just be writing the bucket read/write clocks: + */ + ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); + bch_verbose(c, "writing alloc info complete"); - do { - wrote = false; + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); - ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); + if (ret) + goto nowrote_alloc; - if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + bch_verbose(c, "flushing journal and stopping allocators"); - if (ret) - goto nowrote_alloc; + bch2_journal_flush_all_pins(&c->journal); + set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - for_each_member_device(ca, c, i) - bch2_dev_allocator_quiesce(c, ca); + do { + clean_passes++; - bch2_journal_flush_all_pins(&c->journal); + if (bch2_journal_flush_all_pins(&c->journal)) + clean_passes = 0; /* - * We need to explicitly wait on btree interior updates to complete - * before stopping the journal, flushing all journal pins isn't - * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree - * interior updates have to drop their journal pin before they're - * fully complete: + * In flight interior btree updates will generate more journal + * updates and btree updates (alloc btree): */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + if (bch2_btree_interior_updates_nr_pending(c)) { + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + clean_passes = 0; + } flush_work(&c->btree_interior_update_work); - clean_passes = wrote ? 0 : clean_passes + 1; + if (bch2_journal_flush_all_pins(&c->journal)) + clean_passes = 0; } while (clean_passes < 2); + bch_verbose(c, "flushing journal and stopping allocators complete"); - bch_verbose(c, "writing alloc info complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); nowrote_alloc: closure_wait_event(&c->btree_interior_update_wait, @@ -243,11 +250,10 @@ nowrote_alloc: bch2_dev_allocator_stop(ca); clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); bch2_fs_journal_stop(&c->journal); - /* XXX: mark super that alloc info is persistent */ - /* * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: -- cgit From 36b8372b595748d37dcdcd915c176bada978fbe5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jun 2020 16:30:54 -0400 Subject: bcachefs: Add an option to disable reflink support Reflink might be buggy, so we're adding an option so users can help bisect what's going on. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/fs-io.c | 3 +++ fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/reflink.c | 3 +++ 4 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f0f8964a98b1..14eca567a10d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1266,6 +1266,8 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); +LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); + /* 61-64 unused */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7ce6d71aca29..98fe1ec7867d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2812,6 +2812,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, u64 aligned_len; loff_t ret = 0; + if (!c->opts.reflink) + return -EOPNOTSUPP; + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1e579f67346a..fe457117bf89 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -207,6 +207,11 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ + x(reflink, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REFLINK, true, \ + NULL, "Enable reflink support") \ x(degraded, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 2f223be74926..3c473f1380a6 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -167,6 +167,9 @@ s64 bch2_remap_range(struct bch_fs *c, u64 src_done, dst_done; int ret = 0, ret2 = 0; + if (!c->opts.reflink) + return -EOPNOTSUPP; + if (!percpu_ref_tryget(&c->writes)) return -EROFS; -- cgit From 61fc3c9610e4728c22e5be67a45d0520b1a388cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jun 2020 16:20:22 -0400 Subject: bcachefs: Set filesystem features earlier in fs init path Before we were setting features after allocating btree nodes, which meant we were using the old btree pointer format. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b386c7e15e97..384dfb2279c1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1154,6 +1154,15 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); @@ -1212,11 +1221,6 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = c->disk_sb.sb->version_min = - le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -- cgit From 495aabede3ff594c5eda98cb9f4463502cb48cad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jun 2020 16:36:11 -0400 Subject: bcachefs: Add debug code to print btree transactions Intended to help debug deadlocks, since we can't use lockdep to check btree node lock ordering. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/btree_iter.c | 62 ++++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/btree_locking.h | 12 ++++++++- fs/bcachefs/btree_types.h | 4 +++ fs/bcachefs/clock.c | 2 +- fs/bcachefs/journal.c | 4 +-- fs/bcachefs/sysfs.c | 8 ++++++ 8 files changed, 92 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a90072508819..dd34f30f01e5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -628,6 +628,9 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + /* btree_iter.c: */ + struct mutex btree_trans_lock; + struct list_head btree_trans_list; mempool_t btree_iters_pool; struct workqueue_struct *wq; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ca775e63b4c6..f7de2def58df 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1912,7 +1912,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf", + pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", bch2_btree_ids[iter->btree_id], iter->pos.inode, iter->pos.offset, @@ -2193,12 +2193,24 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, if (expected_mem_bytes) bch2_trans_preload_mem(trans, expected_mem_bytes); + +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); +#endif } int bch2_trans_exit(struct btree_trans *trans) { bch2_trans_unlock(trans); +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&trans->c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&trans->c->btree_trans_lock); +#endif + kfree(trans->fs_usage_deltas); kfree(trans->mem); if (trans->used_mempool) @@ -2211,6
+2223,51 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? -EIO : 0; } +void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_trans *trans; + struct btree_iter *iter; + struct btree *b; + unsigned l; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + pr_buf(out, "%ps\n", (void *) trans->ip); + + trans_for_each_iter(trans, iter) { + if (!iter->nodes_locked) + continue; + + pr_buf(out, " iter %s:", bch2_btree_ids[iter->btree_id]); + bch2_bpos_to_text(out, iter->pos); + pr_buf(out, "\n"); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(iter, l)) { + b = iter->l[l].b; + + pr_buf(out, " %p l=%u %s ", + b, l, btree_node_intent_locked(iter, l) ? "i" : "r"); + bch2_bpos_to_text(out, b->key.k.p); + pr_buf(out, "\n"); + } + } + } + + b = READ_ONCE(trans->locking); + if (b) { + pr_buf(out, " locking %px l=%u %s:", + b, b->c.level, + bch2_btree_ids[b->c.btree_id]); + bch2_bpos_to_text(out, b->key.k.p); + pr_buf(out, "\n"); + } + } + mutex_unlock(&c->btree_trans_lock); +#endif +} + void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_iters_pool); @@ -2220,6 +2277,9 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) { unsigned nr = BTREE_ITER_MAX; + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 928170afe3b5..80577853a04e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -303,6 +303,8 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); +void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_btree_iter_exit(struct bch_fs *); int bch2_fs_btree_iter_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3b199e2e1e9e..cf1801ee14a2 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -181,11 +181,21 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { + bool ret; + EBUG_ON(level >= BTREE_MAX_DEPTH); +#ifdef CONFIG_BCACHEFS_DEBUG + iter->trans->locking = b; +#endif - return likely(six_trylock_type(&b->c.lock, type)) || + ret = likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(iter, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type); + +#ifdef CONFIG_BCACHEFS_DEBUG + iter->trans->locking = NULL; +#endif + return ret; } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0ecd00475712..316461e2e016 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -284,6 +284,10 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; +#ifdef CONFIG_BCACHEFS_DEBUG + struct list_head list; + struct btree *locking; +#endif unsigned long ip; u64 iters_linked; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 51286520c5c7..163058173252 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -162,7 +162,7 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) now = atomic_long_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - 
pr_buf(&out, "%pf:\t%li\n", + pr_buf(&out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5c84569c3404..48607e01bd22 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1237,14 +1237,14 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - pr_buf(&out, "\t%p %pf\n", + pr_buf(&out, "\t%px %ps\n", pin, pin->flush); if (!list_empty(&pin_list->flushed)) pr_buf(&out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(&out, "\t%p %pf\n", + pr_buf(&out, "\t%px %ps\n", pin, pin->flush); } spin_unlock(&j->lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 662c84b91323..06b59e991312 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -166,6 +166,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_transactions); read_attribute(internal_uuid); @@ -401,6 +402,12 @@ SHOW(bch2_fs) if (attr == &sysfs_dirty_btree_nodes) return bch2_dirty_btree_nodes_print(c, buf); + if (attr == &sysfs_btree_transactions) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); + + bch2_btree_trans_to_text(&out, c); + return out.pos - buf; + } if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); @@ -571,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_transactions, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, -- cgit From f96c0df4dbb0ed845bbc51f341d00bc90368c93c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Jun 2020 19:41:47 -0400 Subject: bcachefs: Fix a deadlock in bch2_btree_node_get_sibling() There was a bad interaction with bch2_btree_iter_set_pos_same_leaf(), which can leave a btree node locked that is just outside iter->pos, breaking the lock ordering checks in __bch2_btree_node_lock(). Ideally we should get rid of this corner case, but for now fix it locally with verbose comments. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 12 ++++++++++++ fs/bcachefs/btree_iter.c | 18 +++++++++++++++--- fs/bcachefs/btree_iter.h | 9 +-------- fs/bcachefs/btree_types.h | 1 + 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ef7ca552a0a3..fa55bab5944e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -851,6 +851,18 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!parent) return NULL; + /* + * There's a corner case where a btree_iter might have a node locked + * that is just outside its current pos - when + * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
+ * + * But the lock ordering checks in __bch2_btree_node_lock() go off of + * iter->pos, not the node's key: so if the iterator is marked as + * needing to be traversed, we risk deadlock if we don't bail out here: + */ + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return ERR_PTR(-EINTR); + if (!bch2_btree_node_relock(iter, level + 1)) { ret = ERR_PTR(-EINTR); goto out; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f7de2def58df..43ea3ceafcf2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -205,8 +205,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (!linked->nodes_locked) continue; - /* * Must lock btree nodes in key order: */ - if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + /* Must lock btree nodes in key order: */ + if ((cmp_int(iter->btree_id, linked->btree_id) ?: + bkey_cmp(pos, linked->pos)) < 0) ret = false; /* @@ -1320,6 +1321,16 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_advance_to_pos(iter, l, -1); + /* + * XXX: + * keeping a node locked that's outside (even just outside) iter->pos + * breaks __bch2_btree_node_lock(). This seems to only affect + * bch2_btree_node_get_sibling so for now it's fixed there, but we + * should try to get rid of this corner case. + * + * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) + */ + if (bch2_btree_node_iter_end(&l->iter) && btree_iter_pos_after_node(iter, l->b)) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -2195,6 +2206,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, bch2_trans_preload_mem(trans, expected_mem_bytes); #ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); list_add(&trans->list, &c->btree_trans_list); mutex_unlock(&c->btree_trans_lock); @@ -2233,7 +2245,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - pr_buf(out, "%ps\n", (void *) trans->ip); + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); trans_for_each_iter(trans, iter) { if (!iter->nodes_locked) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 80577853a04e..f9dcbdc9ab52 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -172,17 +172,10 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline int __btree_iter_cmp(enum btree_id id, - struct bpos pos, - const struct btree_iter *r) -{ - return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); -} - static inline int btree_iter_cmp(const struct btree_iter *l, const struct btree_iter *r) { - return __btree_iter_cmp(l->btree_id, l->pos, r); + return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos); } /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 316461e2e016..78fbf922341e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -287,6 +287,7 @@ struct btree_trans { #ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; + pid_t pid; #endif unsigned long ip; -- cgit From 9ef846a7a13bc6daa3fc431acab5c13d7fb4aa84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jun 2020 18:27:07 -0400 Subject: bcachefs: Improve assorted error messages This also consolidates the various checks in 
bch2_mark_pointer() and bch2_trans_mark_pointer(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 15 +-- fs/bcachefs/buckets.c | 243 +++++++++++++++++++++++-------------------------- fs/bcachefs/error.h | 1 + fs/bcachefs/extents.c | 2 +- fs/bcachefs/fsck.c | 2 +- 5 files changed, 127 insertions(+), 136 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a5888de327fc..5325c24548f9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -631,14 +631,14 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node %s" - "at btree %u level %u/%u\n" - "pos %llu:%llu node offset %u", + pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" + "pos ", write ? "before write " : "", b->c.btree_id, b->c.level, - c->btree_roots[b->c.btree_id].level, - b->key.k.p.inode, b->key.k.p.offset, - b->written); + c->btree_roots[b->c.btree_id].level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + + pr_buf(out, " node offset %u", b->written); if (i) pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } @@ -944,7 +944,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry btree_err_on(b->data->keys.seq != bp->seq, BTREE_ERR_MUST_RETRY, c, b, NULL, - "got wrong btree node"); + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); } while (b->written < c->opts.btree_node_size) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ebdbdd049f50..4074bc073cfe 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -918,61 +918,117 @@ static void bucket_set_stripe(struct bch_fs *c, } } -static bool bch2_mark_pointer(struct bch_fs *c, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u16 *dirty_sectors, u16 *cached_sectors) +{ + u16 *dst_sectors = !p.ptr.cached + ? 
dirty_sectors + : cached_sectors; + u16 orig_sectors = *dst_sectors; + char buf[200]; + + if (gen_after(p.ptr.gen, bucket_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), + bucket_gen, + bch2_data_types[*bucket_data_type ?: ptr_data_type], + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + + if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), + bucket_gen, + bch2_data_types[*bucket_data_type ?: ptr_data_type], + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + + if (bucket_gen != p.ptr.gen && !p.ptr.cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), + bucket_gen, + bch2_data_types[*bucket_data_type ?: ptr_data_type], + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + + if (bucket_gen != p.ptr.gen) + return 1; + + if (*bucket_data_type && *bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), + bucket_gen, + bch2_data_types[*bucket_data_type], + bch2_data_types[ptr_data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + + if (checked_add(*dst_sectors, sectors)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), + bucket_gen, + bch2_data_types[*bucket_data_type ?: ptr_data_type], + orig_sectors, sectors, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + + *bucket_data_type = *dirty_sectors || *cached_sectors + ? ptr_data_type : 0; + return 0; +} + +static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); - u16 *dst_sectors, orig_sectors; - bool overflow; + u8 bucket_data_type; u64 v; + int ret; v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; + bucket_data_type = new.data_type; - /* - * Check this after reading bucket mark to guard against - * the allocator invalidating a bucket after we've already - * checked the gen - */ - if (gen_after(p.ptr.gen, new.gen)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "pointer gen in the future"); - return true; - } - - if (new.gen != p.ptr.gen) { - /* XXX write repair code for this */ - if (!p.ptr.cached && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stale dirty pointer"); - return true; - } - - dst_sectors = !p.ptr.cached - ? 
&new.dirty_sectors - : &new.cached_sectors; - orig_sectors = *dst_sectors; - - overflow = checked_add(*dst_sectors, sectors); + ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, + &bucket_data_type, + &new.dirty_sectors, + &new.cached_sectors); + if (ret) + return ret; - if (!new.dirty_sectors && - !new.cached_sectors) { - new.data_type = 0; + new.data_type = bucket_data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - } else { - new.data_type = data_type; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; } if (flags & BTREE_TRIGGER_NOATOMIC) { @@ -983,25 +1039,11 @@ static bool bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - if (old.data_type && old.data_type != data_type) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - new.gen, - bch2_data_types[old.data_type], - bch2_data_types[data_type]); - - bch2_fs_inconsistent_on(overflow, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), new.gen, - bch2_data_types[old.data_type ?: data_type], - orig_sectors, sectors); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); - return false; + return 0; } static int bch2_mark_stripe_ptr(struct bch_fs *c, @@ -1065,6 +1107,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; struct bch_replicas_padded r; s64 dirty_sectors = 0; + bool stale; int ret; r.e.data_type = data_type; @@ -1077,8 +1120,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 disk_sectors = data_type == BCH_DATA_BTREE ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); - bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + + ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, + fs_usage, journal_seq, flags); + if (ret < 0) + return ret; + + stale = ret > 0; if (p.ptr.cached) { if (!stale) @@ -1439,25 +1487,24 @@ static int trans_get_key(struct btree_trans *trans, } static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct extent_ptr_decoded p, + struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct btree_iter *iter; - struct bkey_s_c k; + struct bkey_s_c k_a; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - u16 *dst_sectors, orig_sectors; int ret; ret = trans_get_key(trans, BTREE_ID_ALLOC, POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), - &iter, &k); + &iter, &k_a); if (ret < 0) return ret; - if (k.k->type != KEY_TYPE_alloc || + if (k_a.k->type != KEY_TYPE_alloc || (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { /* * During journal replay, and if gc repairs alloc info at @@ -1474,71 +1521,13 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); } else { - u = bch2_alloc_unpack(k); - } - - if (u.gen != p.ptr.gen) { - ret = 1; - - if (gen_after(p.ptr.gen, u.gen)) { - bch2_fs_inconsistent(c, - "bucket %llu:%llu gen %u data type %s: ptr gen %u newer than bucket gen", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: data_type], - p.ptr.gen); - ret = -EIO; - } - - if (gen_cmp(u.gen, p.ptr.gen) >= 96U) { - bch2_fs_inconsistent(c, - "bucket %llu:%llu gen %u data type %s: ptr gen %u too stale", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: data_type], - p.ptr.gen); - ret = -EIO; - } - - if (!p.ptr.cached) { - bch2_fs_inconsistent(c, - "bucket %llu:%llu gen %u data type %s: stale dirty ptr (gen %u)", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: data_type], - p.ptr.gen); - ret = -EIO; - } - - goto out; - } - - if (u.data_type && u.data_type != data_type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s", - iter->pos.inode, iter->pos.offset, - u.gen, - bch2_data_types[u.data_type], - bch2_data_types[data_type]); - ret = -1; - goto out; + u = bch2_alloc_unpack(k_a); } - dst_sectors = !p.ptr.cached - ? &u.dirty_sectors - : &u.cached_sectors; - orig_sectors = *dst_sectors; - - if (checked_add(*dst_sectors, sectors)) { - bch2_fs_inconsistent(c, - "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: data_type], - orig_sectors, sectors); - /* return an error indicating that we need full fsck */ - ret = -EIO; + ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); + if (ret) goto out; - } - - u.data_type = u.dirty_sectors || u.cached_sectors - ? data_type : 0; a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ret = PTR_ERR_OR_ZERO(a); @@ -1623,7 +1612,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); - ret = bch2_trans_mark_pointer(trans, p, disk_sectors, + ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, data_type); if (ret < 0) return ret; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index de319794ccd1..94b53312fbbd 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -102,6 +102,7 @@ struct fsck_err_state { #define FSCK_CAN_IGNORE (1 << 1) #define FSCK_NEED_FSCK (1 << 2) +__printf(3, 4) __cold enum fsck_err_ret bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); void bch2_flush_fsck_errs(struct bch_fs *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 52beaab227ef..62eb3b1e2cbf 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -219,7 +219,7 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llu sectors %u written %u min_key ", + pr_buf(out, "seq %llx sectors %u written %u min_key ", le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors), le16_to_cpu(bp.v->sectors_written)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3ab621c62c43..c6ca5968a2e0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1169,7 +1169,7 @@ static int check_inode_nlink(struct bch_fs *c, } if (!S_ISDIR(u->bi_mode) && link->dir_count) { - need_fsck_err(c, "non directory with subdirectories", + need_fsck_err(c, "non directory with subdirectories (inum %llu)", u->bi_inum); return 0; } -- cgit From a27443bc7652a37db1ac99f2c77b20ac15947cc5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jun 2020 22:11:10 -0400 Subject: bcachefs: Kill old allocator startup code It's not needed anymore since we can now write to buckets before updating the alloc btree. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 235 ----------------------------------------- fs/bcachefs/alloc_background.h | 4 +- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/journal_reclaim.c | 8 -- fs/bcachefs/super.c | 10 -- 5 files changed, 1 insertion(+), 257 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 09a719b256b3..30541c8fe3b0 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -968,31 +968,6 @@ out: return ret < 0 ? 
ret : 0; } -static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket, u64 *flush_seq) -{ - struct bucket_mark m; - - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - bch2_invalidate_bucket(c, ca, bucket, &m); - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - percpu_up_read(&c->mark_lock); - - *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); - - return m.cached_sectors != 0; -} - /* * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: */ @@ -1448,216 +1423,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } -static bool flush_held_btree_writes(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool nodes_unwritten; - size_t i; -again: - cond_resched(); - nodes_unwritten = false; - - if (bch2_journal_error(&c->journal)) - return true; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_need_write(b)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - goto again; - } else { - nodes_unwritten = true; - } - } - rcu_read_unlock(); - - return !nodes_unwritten && - !bch2_btree_interior_updates_nr_pending(c); -} - -static void allocator_start_issue_discards(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - size_t bu; - - for_each_rw_member(ca, c, dev_iter) - while (fifo_pop(&ca->free_inc, bu)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bu), - ca->mi.bucket_size, GFP_NOIO); -} - -static int resize_free_inc(struct bch_dev *ca) -{ - alloc_fifo free_inc; - - if (!fifo_full(&ca->free_inc)) - return 0; - - if (!init_fifo(&free_inc, - ca->free_inc.size * 2, - GFP_KERNEL)) - return -ENOMEM; - - fifo_move(&free_inc, &ca->free_inc); - swap(free_inc, ca->free_inc); - free_fifo(&free_inc); - return 0; -} - -static bool bch2_fs_allocator_start_fast(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - bool ret = true; - - if (test_alloc_startup(c)) - return false; - - down_read(&c->gc_lock); - - /* Scan for buckets that are already invalidated: */ - for_each_rw_member(ca, c, dev_iter) { - struct bucket_array *buckets; - struct bucket_mark m; - long bu; - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (bu = buckets->first_bucket; - bu < buckets->nbuckets; bu++) { - m = READ_ONCE(buckets->b[bu].mark); - - if (!buckets->b[bu].gen_valid || - !is_available_bucket(m) || - m.cached_sectors || - (ca->buckets_nouse && - test_bit(bu, ca->buckets_nouse))) - continue; - - percpu_down_read(&c->mark_lock); - bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), 0); - percpu_up_read(&c->mark_lock); - - fifo_push(&ca->free_inc, bu); - - discard_invalidated_buckets(c, ca); - - if (fifo_full(&ca->free[RESERVE_BTREE])) - break; - } - up_read(&ca->bucket_lock); - } - - up_read(&c->gc_lock); - - /* did we find enough buckets? 
*/ - for_each_rw_member(ca, c, dev_iter) - if (!fifo_full(&ca->free[RESERVE_BTREE])) - ret = false; - - return ret; -} - -int bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - u64 journal_seq = 0; - bool wrote; - long bu; - int ret = 0; - - if (!test_alloc_startup(c) && - bch2_fs_allocator_start_fast(c)) - return 0; - - pr_debug("not enough empty buckets; scanning for reclaimable buckets"); - - /* - * We're moving buckets to freelists _before_ they've been marked as - * invalidated on disk - we have to so that we can allocate new btree - * nodes to mark them as invalidated on disk. - * - * However, we can't _write_ to any of these buckets yet - they might - * have cached data in them, which is live until they're marked as - * invalidated on disk: - */ - set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - - down_read(&c->gc_lock); - do { - wrote = false; - - for_each_rw_member(ca, c, dev_iter) { - find_reclaimable_buckets(c, ca); - - while (!fifo_full(&ca->free[RESERVE_BTREE]) && - (bu = next_alloc_bucket(ca)) >= 0) { - ret = resize_free_inc(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - up_read(&c->gc_lock); - goto err; - } - - bch2_invalidate_one_bucket(c, ca, bu, - &journal_seq); - - fifo_push(&ca->free[RESERVE_BTREE], bu); - } - } - - pr_debug("done scanning for reclaimable buckets"); - - /* - * XXX: it's possible for this to deadlock waiting on journal reclaim, - * since we're holding btree writes. What then? - */ - ret = bch2_alloc_write(c, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOWAIT, &wrote); - - /* - * If bch2_alloc_write() did anything, it may have used some - * buckets, and we need the RESERVE_BTREE freelist full - so we - * need to loop and scan again. - * And if it errored, it may have been because there weren't - * enough buckets, so just scan and loop again as long as it - * made some progress: - */ - } while (wrote); - up_read(&c->gc_lock); - - if (ret) - goto err; - - pr_debug("flushing journal"); - - ret = bch2_journal_flush(&c->journal); - if (ret) - goto err; - - pr_debug("issuing discards"); - allocator_start_issue_discards(c); -err: - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - closure_wait_event(&c->btree_interior_update_wait, - flush_held_btree_writes(c)); - - return ret; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 501c444353fb..b53a27450889 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -70,8 +70,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c) && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + if (expensive_debug_checks(c)) { size_t iter; long i; unsigned j; @@ -94,7 +93,6 @@ void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *, unsigned, bool *); -int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dd34f30f01e5..31c4bac6322f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -480,7 +480,6 @@ enum { /* startup: */ BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, - BCH_FS_ALLOCATOR_STARTED, BCH_FS_ALLOCATOR_RUNNING, 
BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 556f12602fcf..5ca84c9761e9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -28,17 +28,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; - /* - * Allocator startup needs some journal space before we can do journal - * replay: - */ - if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) - --available; - /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9da64d9d52e5..819d4392d529 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -423,16 +423,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { - ret = bch2_fs_allocator_start(c); - if (ret) { - bch_err(c, "error initializing allocator"); - goto err; - } - - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); - } - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { -- cgit From 255adc515aeab4bd870e548bb4154c2682871c05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jun 2020 23:46:15 -0400 Subject: bcachefs: Always increment bucket gen on bucket reuse Not doing so confuses copygc Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 38 ++++++++++++++++++++++++++++---------- fs/bcachefs/buckets.c | 30 +++++++++++++++++++----------- 2 files changed, 47 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 30541c8fe3b0..44ad9821c807 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -860,12 +860,22 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, g = bucket(ca, b); m = READ_ONCE(g->mark); - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + invalidating_cached_data = m.cached_sectors != 0; + + /* + * If we're not invalidating cached data, we only increment the bucket + * gen in memory here, the incremented gen will be updated in the btree + * by bch2_trans_mark_pointer(): + */ + + if (!invalidating_cached_data) + bch2_invalidate_bucket(c, ca, b, &m); + else + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); - invalidating_cached_data = m.cached_sectors != 0; if (!invalidating_cached_data) goto out; @@ -887,18 +897,26 @@ retry: if (ret) return ret; - /* - * The allocator has to start before journal replay is finished - thus, - * we have to trust the in memory bucket @m, not the version in the - * btree: - */ percpu_down_read(&c->mark_lock); - g = bucket(ca, b); + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + + if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { + /* + * During journal replay, and if gc repairs alloc info at + * runtime, the alloc info in the btree might not be up to date + * yet - so, trust the in memory mark: + */ + u = alloc_mem_to_key(g, m); + } else { + u = bch2_alloc_unpack(k); + u.read_time = g->io_time[READ]; + u.write_time = 
g->io_time[WRITE]; + } + percpu_up_read(&c->mark_lock); - invalidating_cached_data = m.cached_sectors != 0; + invalidating_cached_data = u.cached_sectors != 0; u.gen++; u.data_type = 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4074bc073cfe..08e8b578fff5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1496,6 +1496,8 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k_a; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; + struct bucket *g; + struct bucket_mark m; int ret; ret = trans_get_key(trans, BTREE_ID_ALLOC, @@ -1504,26 +1506,32 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret < 0) return ret; - if (k_a.k->type != KEY_TYPE_alloc || - (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { + percpu_down_read(&c->mark_lock); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); + + if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags) && !ret)) { /* * During journal replay, and if gc repairs alloc info at * runtime, the alloc info in the btree might not be up to date * yet - so, trust the in memory mark - unless we're already * updating that key: */ - struct bucket *g; - struct bucket_mark m; - - percpu_down_read(&c->mark_lock); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); - percpu_up_read(&c->mark_lock); + u = alloc_mem_to_key(g, m); } else { - u = bch2_alloc_unpack(k_a); + u = bch2_alloc_unpack(k_a); + u.read_time = g->io_time[READ]; + u.write_time = g->io_time[WRITE]; } + percpu_up_read(&c->mark_lock); + + /* + * Incrementing the bucket gen can be done lazily: + */ + if (gen_after(m.gen, u.gen) && !u.data_type) + u.gen = m.gen; + ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) -- cgit From 784d8d173d7aa68a32ea04c45bc8a038e4cfbc4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jun 2020 23:47:50 -0400 Subject: bcachefs: Improve warning for copygc failing to move data This will help narrow down which code is at fault when this happens. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 5 ++++- fs/bcachefs/move_types.h | 1 + fs/bcachefs/movinggc.c | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 02cc5089a163..a061e60e3d7a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -177,9 +177,12 @@ next: } continue; nomatch: - if (m->ctxt) + if (m->ctxt) { + BUG_ON(k.k->p.offset <= iter->pos.offset); + atomic64_inc(&m->ctxt->stats->keys_raced); atomic64_add(k.k->p.offset - iter->pos.offset, &m->ctxt->stats->sectors_raced); + } atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); bch2_btree_iter_next_slot(iter); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 6788170d3f95..fc0de165af9f 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -8,6 +8,7 @@ struct bch_move_stats { struct bpos pos; atomic64_t keys_moved; + atomic64_t keys_raced; atomic64_t sectors_moved; atomic64_t sectors_seen; atomic64_t sectors_raced; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 7e08a7940a35..d22f26c02b67 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -78,7 +78,17 @@ static bool __copygc_pred(struct bch_dev *ca, ssize_t i = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, &search); +#if 0 + /* eytzinger search verify code: */ + ssize_t j = -1, k; + for (k = 0; k < h->used; k++) + if (h->data[k].offset <= ptr->offset && + (j < 0 || h->data[k].offset > h->data[j].offset)) + j = k; + + BUG_ON(i != j); +#endif return (i >= 0 && ptr->offset < h->data[i].offset + ca->mi.bucket_size && ptr->gen == h->data[i].gen); @@ -203,9 +213,12 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) if (sectors_not_moved && !ret) bch_warn_ratelimited(c, - "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move); + buckets_not_moved, buckets_to_move, + atomic64_read(&move_stats.sectors_moved), + atomic64_read(&move_stats.keys_raced), + atomic64_read(&move_stats.sectors_raced)); trace_copygc(ca, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, -- cgit From 72545b5e76b05407e13f369590479ef1a5fd0f52 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Jun 2020 13:26:48 -0400 Subject: bcachefs: bch2_trans_downgrade() bch2_btree_iter_downgrade() was looping over all iterators in a transaction; bch2_trans_downgrade() should be doing that. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 41 +++++++++++++++++++---------------------- fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 3 +-- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 43ea3ceafcf2..bed0bb67a85d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -349,31 +349,20 @@ bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, void __bch2_btree_iter_downgrade(struct btree_iter *iter, unsigned downgrade_to) { - struct btree_iter *linked; - unsigned l; - - /* - * We downgrade linked iterators as well because btree_iter_upgrade - * might have had to modify locks_want on linked iterators due to lock - * ordering: - */ - trans_for_each_iter(iter->trans, linked) { - unsigned new_locks_want = downgrade_to ?: - (linked->flags & BTREE_ITER_INTENT ? 1 : 0); - - if (linked->locks_want <= new_locks_want) - continue; + unsigned l, new_locks_want = downgrade_to ?: + (iter->flags & BTREE_ITER_INTENT ? 1 : 0); - linked->locks_want = new_locks_want; + if (iter->locks_want < downgrade_to) { + iter->locks_want = new_locks_want; - while (linked->nodes_locked && - (l = __fls(linked->nodes_locked)) >= linked->locks_want) { - if (l > linked->level) { - btree_node_unlock(linked, l); + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) >= iter->locks_want) { + if (l > iter->level) { + btree_node_unlock(iter, l); } else { - if (btree_node_intent_locked(linked, l)) { - six_lock_downgrade(&linked->l[l].b->c.lock); - linked->nodes_intent_locked ^= 1 << l; + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->l[l].b->c.lock); + iter->nodes_intent_locked ^= 1 << l; } break; } @@ -383,6 +372,14 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, bch2_btree_trans_verify_locks(iter->trans); } +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + bch2_btree_iter_downgrade(iter); +} + /* Btree transaction locking: */ bool bch2_trans_relock(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index f9dcbdc9ab52..bc408f1272e7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -136,6 +136,8 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) __bch2_btree_iter_downgrade(iter, 0); } +void bch2_trans_downgrade(struct btree_trans *); + void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ffcaecc8a64f..fc4d4b0d3770 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -543,8 +543,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans->nounlock = false; - trans_for_each_update2(trans, i) - bch2_btree_iter_downgrade(i->iter); + bch2_trans_downgrade(trans); return 0; } -- cgit From 8804ef1f28def994562801f68d271ad4f0cf1c36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Jun 2020 14:28:16 -0400 Subject: bcachefs: Call bch2_btree_iter_traverse() if necessary in commit path Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c 
index fc4d4b0d3770..7b4b12442db8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -830,9 +830,9 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) { + if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && + (ret = bch2_btree_iter_traverse(i->iter)))) { trace_trans_restart_traverse(trans->ip); - ret = -EINTR; goto out; } -- cgit From 8c9eef95cde9e2a11f7c84181dc1710e594dffb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Jun 2020 09:01:23 -0400 Subject: bcachefs: Check gfp_flags correctly in bch2_btree_cache_scan() bch2_btree_node_mem_alloc() uses memalloc_nofs_save()/GFP_NOFS, but GFP_NOFS does include __GFP_IO - oops. We used to use GFP_NOIO, but as we're a filesystem now GFP_NOFS makes more sense now and is looser. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index fa55bab5944e..541a02f87b8d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -242,7 +242,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, return SHRINK_STOP; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) return -1; -- cgit From 40ca39b56448ecf05e7b4b6c921968e8648f8f0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 15:59:03 -0400 Subject: bcachefs: btree_update_nodes_written() requires alloc reserve Also, in the btree_update_start() path, if we already have a journal pre-reservation we don't want to take another - that's a deadlock. 
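A minimal sketch of that reservation rule, with hypothetical names (update_start and preres_get are illustrative): when the caller already holds a journal pre-reservation, blocking to take a second one can end up waiting on journal reclaim, which needs the first reservation to make progress, so the second reservation is only taken when the flag is absent.

#include <stdbool.h>

#define INSERT_JOURNAL_RESERVED	(1u << 0)

struct preres { unsigned u64s; };

/* hypothetical stand-in for the real, possibly blocking, reservation call */
static int preres_get(struct preres *res, unsigned u64s)
{
	res->u64s = u64s;
	return 0;
}

static int update_start(unsigned flags, struct preres *res, unsigned u64s)
{
	if (flags & INSERT_JOURNAL_RESERVED)
		return 0;	/* caller already holds one: don't block for another */

	return preres_get(res, u64s);
}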
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7d63c457a3bf..4f244f983f35 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -529,6 +529,8 @@ static void btree_update_nodes_written(struct btree_update *as) */ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RESERVED, btree_update_nodes_written_trans(&trans, as)); @@ -886,9 +888,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, as->inline_keys); - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - JOURNAL_RES_GET_NONBLOCK); + if (!(flags & BTREE_INSERT_JOURNAL_RESERVED)) + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { if (flags & BTREE_INSERT_NOUNLOCK) return ERR_PTR(-EINTR); -- cgit From 6b5f9b29e65335baa6291ee2ce39d5ef6bd30c41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 15:46:22 -0400 Subject: bcachefs: Make open bucket reserves more conservative Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 333aa140af54..4c1c264ce206 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -212,9 +212,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) case RESERVE_ALLOC: return 0; case RESERVE_BTREE: - return BTREE_NODE_OPEN_BUCKET_RESERVE; + return OPEN_BUCKETS_COUNT / 4; default: - return BTREE_NODE_OPEN_BUCKET_RESERVE * 2; + return OPEN_BUCKETS_COUNT / 2; } } -- cgit From 966885ee409d128e2561e4c3546f4187ee173389 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 16:25:07 -0400 Subject: bcachefs: Fix a linked list bug Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4f244f983f35..8f4087db7b67 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -803,7 +803,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, * operations complete */ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { - list_del(&p->write_blocked_list); + list_del_init(&p->write_blocked_list); btree_update_reparent(as, p); /* -- cgit From e38821f322d97ca1289bea704eed9d276f47c521 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 17:49:24 -0400 Subject: bcachefs: Don't allocate memory under the btree cache lock The btree cache lock is needed for reclaiming from the btree node cache, and memory allocation can potentially spin and sleep (for 100 ms at a time), so.. don't do that. 
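A minimal pthread sketch of the pattern the rework below adopts (names are illustrative): try to reuse an existing node under the lock, but drop the lock before calling an allocator that may sleep, because reclaim into this cache needs the same lock.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *alloc_node(size_t bytes)
{
	void *p;

	pthread_mutex_lock(&cache_lock);
	/* ...try to reuse an already-freed node while holding the lock... */
	pthread_mutex_unlock(&cache_lock);

	p = malloc(bytes);	/* may sleep: done with the lock dropped */
	if (!p)
		return NULL;

	pthread_mutex_lock(&cache_lock);
	/* ...account the new node and add it to the cache's lists... */
	pthread_mutex_unlock(&cache_lock);
	return p;
}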
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 87 +++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 541a02f87b8d..d31017de53aa 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -72,24 +72,33 @@ static const struct rhashtable_params bch_btree_cache_params = { .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { - struct btree_cache *bc = &c->btree_cache; + BUG_ON(b->data || b->aux_data); b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) - goto err; + return -ENOMEM; - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) - goto err; + if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -ENOMEM; + } - bc->used++; - list_move(&b->list, &bc->freeable); - return; -err: - kvpfree(b->data, btree_bytes(c)); - b->data = NULL; - list_move(&b->list, &bc->freed); + return 0; +} + +static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + struct btree_cache *bc = &c->btree_cache; + + if (!__btree_node_data_alloc(c, b, gfp)) { + bc->used++; + list_move(&b->list, &bc->freeable); + } else { + list_move(&b->list, &bc->freed); + } } static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) @@ -525,35 +534,47 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) */ list_for_each_entry(b, &bc->freeable, list) if (!btree_node_reclaim(c, b)) - goto out_unlock; + goto got_node; /* * We never free struct btree itself, just the memory that holds the on * disk node. 
Check the freed list before allocating a new one: */ list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) { - btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); - if (b->data) - goto out_unlock; + if (!btree_node_reclaim(c, b)) + goto got_node; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + b = NULL; +got_node: + if (b) + list_del_init(&b->list); + mutex_unlock(&bc->lock); + + if (!b) { + b = kzalloc(sizeof(struct btree), GFP_KERNEL); + if (!b) goto err; - } - b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); - if (!b) - goto err; + bkey_btree_ptr_init(&b->key); + six_lock_init(&b->c.lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); + } + + if (!b->data) { + if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; + + mutex_lock(&bc->lock); + bc->used++; + mutex_unlock(&bc->lock); + } - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); -out_unlock: BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_write_in_flight(b)); - - list_del_init(&b->list); - mutex_unlock(&bc->lock); out: b->flags = 0; b->written = 0; @@ -569,6 +590,14 @@ out: memalloc_nofs_restore(flags); return b; err: + mutex_lock(&bc->lock); + + if (b) { + list_add(&b->list, &bc->freed); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b = btree_node_cannibalize(c); -- cgit From 374153c2a958f33805e68a20770e4f0b503be48e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 15:44:03 -0400 Subject: bcachefs: More open buckets We need a larger open bucket reserve now that the btree interior update path holds onto open bucket references; filesystems with many high through devices may need more open buckets now. 
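A minimal sketch of why the index type widens along with the constant (assuming only what the diff below shows): with OPEN_BUCKETS_COUNT raised to 1024, an 8-bit freelist index can address at most 256 buckets, so a 16-bit open_bucket_idx_t is needed.

#include <stdint.h>

#define OPEN_BUCKETS_COUNT	1024

typedef uint16_t open_bucket_idx_t;	/* a uint8_t index tops out at 255 */

struct open_buckets_sketch {
	open_bucket_idx_t nr;
	open_bucket_idx_t v[15];
};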
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_types.h | 16 +++++++++++----- fs/bcachefs/bcachefs.h | 8 ++++---- fs/bcachefs/btree_update_interior.h | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 832568dc9551..4f1465077994 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -46,16 +46,22 @@ enum alloc_reserve { typedef FIFO(long) alloc_fifo; -/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ -#define OPEN_BUCKETS_COUNT 256 +#define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 #define WRITE_POINT_MAX 32 +typedef u16 open_bucket_idx_t; + struct open_bucket { spinlock_t lock; atomic_t pin; - u8 freelist; + open_bucket_idx_t freelist; + + /* + * When an open bucket has an ec_stripe attached, this is the index of + * the block in the stripe this open_bucket corresponds to: + */ u8 ec_idx; u8 type; unsigned valid:1; @@ -68,8 +74,8 @@ struct open_bucket { #define OPEN_BUCKET_LIST_MAX 15 struct open_buckets { - u8 nr; - u8 v[OPEN_BUCKET_LIST_MAX]; + open_bucket_idx_t nr; + open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; }; struct dev_stripe_state { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 31c4bac6322f..a219969357bc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -428,8 +428,8 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; - u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; - unsigned open_buckets_partial_nr; + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; size_t fifo_last_bucket; @@ -690,8 +690,8 @@ struct bch_fs { struct closure_waitlist freelist_wait; u64 blocked_allocate; u64 blocked_allocate_open_bucket; - u8 open_buckets_freelist; - u8 open_buckets_nr_free; + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; struct closure_waitlist open_buckets_wait; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 17bd1ca1fb78..4a5b9dcfbdd0 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -92,9 +92,9 @@ struct btree_update { struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; unsigned nr_new_nodes; - u8 open_buckets[BTREE_UPDATE_NODES_MAX * + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * BCH_REPLICAS_MAX]; - u8 nr_open_buckets; + open_bucket_idx_t nr_open_buckets; unsigned journal_u64s; u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; -- cgit From 4efe71a646c5add87d4082380f1663150cd462af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 20:54:36 -0400 Subject: bcachefs: Always give out journal pre-res if we already have one This is better than skipping the journal pre-reservation if we already have one - we should still acount for the journal reservation we're going to have to get. 
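A minimal sketch of the admission rule this introduces (simplified; the flag name follows the diff below): a pre-reservation request coming from journal reclaim is always granted, because refusing it would recurse back into reclaim and deadlock, while every other caller is still held to the reserved-versus-remaining limit.

#include <stdbool.h>

#define JOURNAL_RES_GET_RECLAIM	(1u << 3)

static bool preres_admit(unsigned reserved, unsigned remaining, unsigned flags)
{
	if (flags & JOURNAL_RES_GET_RECLAIM)
		return true;		/* reclaim path: grant unconditionally */

	return reserved <= remaining;
}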
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 15 +++++++++------ fs/bcachefs/journal.c | 10 ++++++---- fs/bcachefs/journal.h | 20 +++++++++++++++----- 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8f4087db7b67..bb921852a093 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -864,8 +864,11 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, { struct bch_fs *c = trans->c; struct btree_update *as; - int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; + int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) + ? JOURNAL_RES_GET_RECLAIM : 0; + int ret = 0; /* * This check isn't necessary for correctness - it's just to potentially @@ -888,10 +891,9 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, as->inline_keys); - if (!(flags & BTREE_INSERT_JOURNAL_RESERVED)) - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - JOURNAL_RES_GET_NONBLOCK); + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { if (flags & BTREE_INSERT_NOUNLOCK) return ERR_PTR(-EINTR); @@ -899,7 +901,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, 0); + BTREE_UPDATE_JOURNAL_RES, + journal_flags); if (ret) return ERR_PTR(ret); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 48607e01bd22..767cb6f809e7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -427,9 +427,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, static bool journal_preres_available(struct journal *j, struct journal_preres *res, - unsigned new_u64s) + unsigned new_u64s, + unsigned flags) { - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); if (!ret) bch2_journal_reclaim_work(&j->reclaim_work.work); @@ -439,13 +440,14 @@ static bool journal_preres_available(struct journal *j, int __bch2_journal_preres_get(struct journal *j, struct journal_preres *res, - unsigned new_u64s) + unsigned new_u64s, + unsigned flags) { int ret; closure_wait_event(&j->preres_wait, (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s)); + journal_preres_available(j, res, new_u64s, flags)); return ret; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2c55f74522e2..f14dfa59e702 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -301,6 +301,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) #define JOURNAL_RES_GET_RESERVED (1 << 2) +#define JOURNAL_RES_GET_RECLAIM (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -408,11 +409,12 @@ static inline void bch2_journal_preres_put(struct journal *j, } int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned); + struct journal_preres *, unsigned, unsigned); 
static inline int bch2_journal_preres_get_fast(struct journal *j, struct journal_preres *res, - unsigned new_u64s) + unsigned new_u64s, + unsigned flags) { int d = new_u64s - res->u64s; union journal_preres_state old, new; @@ -423,7 +425,15 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, new.reserved += d; - if (new.reserved > new.remaining) + /* + * If we're being called from the journal reclaim path, we have + * to unconditionally give out the pre-reservation, there's + * nothing else sensible we can do - otherwise we'd recurse back + * into the reclaim path and deadlock: + */ + + if (!(flags & JOURNAL_RES_GET_RECLAIM) && + new.reserved > new.remaining) return 0; } while ((v = atomic64_cmpxchg(&j->prereserved.counter, old.v, new.v)) != old.v); @@ -440,13 +450,13 @@ static inline int bch2_journal_preres_get(struct journal *j, if (new_u64s <= res->u64s) return 0; - if (bch2_journal_preres_get_fast(j, res, new_u64s)) + if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) return -EAGAIN; - return __bch2_journal_preres_get(j, res, new_u64s); + return __bch2_journal_preres_get(j, res, new_u64s, flags); } /* journal_entry_res: */ -- cgit From 4e8224ed8ab3c671bb96b6b98c8f8de14637440d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jun 2020 21:00:29 -0400 Subject: bcachefs: Refactor btree insert path This splits out the journalling code from the btree update code; prep work for the btree key cache. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 90 +++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7b4b12442db8..028aa9bbeced 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -159,71 +159,32 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, : btree_node_flush1); } -static inline void __btree_journal_key(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_i *insert) -{ - struct journal *j = &trans->c->journal; - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* ick */ - insert->k.needs_whiteout = false; - bch2_journal_add_keys(j, &trans->journal_res, - btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); - - if (trans->journal_seq) - *trans->journal_seq = seq; -} - -static void bch2_btree_journal_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree *b = iter_l(iter)->b; - - EBUG_ON(trans->journal_res.ref != - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - __btree_journal_key(trans, iter->btree_id, insert); - btree_bset_last(b)->journal_seq = - cpu_to_le64(trans->journal_res.seq); - } - - bch2_btree_add_journal_pin(c, b, - likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - ? 
trans->journal_res.seq - : j->replay_journal_seq); - - if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(b); -} - /** * btree_insert_key - insert a key one key into a leaf node */ -static void btree_insert_key_leaf(struct btree_trans *trans, +static bool btree_insert_key_leaf(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - insert->k.needs_whiteout = false; + if (unlikely(!bch2_btree_bset_insert_key(iter, b, + &iter_l(iter)->iter, insert))) + return false; + + i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, + le64_to_cpu(i->journal_seq))); - if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) - bch2_btree_journal_key(trans, iter, insert); + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if (unlikely(!btree_node_dirty(b))) + set_btree_node_dirty(b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -238,6 +199,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_iter_reinit_node(iter, b); trace_btree_insert_key(c, b, insert); + return true; } /* Normal update interface: */ @@ -326,7 +288,29 @@ static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { - btree_insert_key_leaf(trans, iter, insert); + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + bool did_work; + + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + insert->k.needs_whiteout = false; + + did_work = btree_insert_key_leaf(trans, iter, insert); + if (!did_work) + return; + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + bch2_journal_add_keys(j, &trans->journal_res, + iter->btree_id, insert); + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } } static inline bool iter_has_trans_triggers(struct btree_iter *iter) @@ -411,6 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, JOURNAL_RES_GET_NONBLOCK); if (ret) goto err; + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; } if (unlikely(trans->extra_journal_entry_u64s)) { -- cgit From 515282ac7d847d567dd3ba802edf34316368bb14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Jun 2020 14:58:07 -0400 Subject: bcachefs: Fix a deadlock __bch2_btree_node_lock() was incorrectly using iter->pos as a proxy for btree node lock ordering, this caused an off by one error that was triggered by bch2_btree_node_get_sibling() getting the previous node. This refactors the code to compare against btree node keys directly. 
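A minimal sketch of the ordering test the fix moves to (simplified position type; helper names are illustrative): instead of comparing against another iterator's position, compare against the maximum key of the node that iterator actually holds locked, and refuse to block when the lock we want sorts at or before it.

#include <stdbool.h>
#include <stdint.h>

struct pos { uint64_t inode, offset; };

static int pos_cmp(struct pos a, struct pos b)
{
	if (a.inode != b.inode)
		return a.inode < b.inode ? -1 : 1;
	if (a.offset != b.offset)
		return a.offset < b.offset ? -1 : 1;
	return 0;
}

/*
 * Blocking is only safe when the position we want sorts strictly after
 * the key of a node already locked at this level:
 */
static bool may_block_on(struct pos want, struct pos locked_node_key)
{
	return pos_cmp(want, locked_node_key) > 0;
}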
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 69 ++++++++++++++++++++++++++++------------- fs/bcachefs/btree_locking.h | 24 ++++++++------ fs/bcachefs/btree_types.h | 4 +++ fs/bcachefs/btree_update_leaf.c | 2 +- 4 files changed, 67 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bed0bb67a85d..35fe7db50fb5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -101,7 +101,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter, b, level, want))) { + btree_node_lock_increment(iter->trans, b, level, want))) { mark_btree_node_locked(iter, level, want); return true; } else { @@ -130,7 +130,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) goto success; if (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; } @@ -193,23 +193,18 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, /* Slowpath: */ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, - struct btree_iter *iter, - enum six_lock_type type) + unsigned level, struct btree_iter *iter, + enum six_lock_type type) { + struct btree_trans *trans = iter->trans; struct btree_iter *linked; bool ret = true; /* Check if it's safe to block: */ - trans_for_each_iter(iter->trans, linked) { + trans_for_each_iter(trans, linked) { if (!linked->nodes_locked) continue; - /* Must lock btree nodes in key order: */ - if ((cmp_int(iter->btree_id, linked->btree_id) ?: - bkey_cmp(pos, linked->pos)) < 0) - ret = false; - /* * Can't block taking an intent lock if we have _any_ nodes read * locked: @@ -224,13 +219,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (!(iter->trans->nounlock)) { + if (!(trans->nounlock)) { linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - btree_iter_get_locks(linked, true, false); + if (!btree_iter_get_locks(linked, true, false)) + ret = false; + } else { + ret = false; } - ret = false; } /* @@ -240,14 +237,36 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - if (!(iter->trans->nounlock)) { + if (!(trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - btree_iter_get_locks(linked, true, false); + if (!btree_iter_get_locks(linked, true, false)) + ret = false; + } else { + ret = false; } + } + + /* Must lock btree nodes in key order: */ + if (iter->btree_id < linked->btree_id) + ret = false; + + if (iter->btree_id == linked->btree_id && + btree_node_locked(linked, level) && + bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0) ret = false; + + /* + * Recheck if this is a node we already have locked - since one + * of the get_locks() calls might've successfully + * upgraded/relocked it: + */ + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= type) { + six_lock_increment(&b->c.lock, type); + return true; } } @@ -2242,13 +2261,15 @@ void bch2_btree_trans_to_text(struct printbuf 
*out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); trans_for_each_iter(trans, iter) { if (!iter->nodes_locked) continue; - pr_buf(out, " iter %s:", bch2_btree_ids[iter->btree_id]); + pr_buf(out, " iter %u %s:", + iter->idx, + bch2_btree_ids[iter->btree_id]); bch2_bpos_to_text(out, iter->pos); pr_buf(out, "\n"); @@ -2256,8 +2277,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) if (btree_node_locked(iter, l)) { b = iter->l[l].b; - pr_buf(out, " %p l=%u %s ", - b, l, btree_node_intent_locked(iter, l) ? "i" : "r"); + pr_buf(out, " %px %s l=%u ", + b, btree_node_intent_locked(iter, l) ? "i" : "r", l); bch2_bpos_to_text(out, b->key.k.p); pr_buf(out, "\n"); } @@ -2266,7 +2287,13 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - pr_buf(out, " locking %px l=%u %s:", + pr_buf(out, " locking iter %u l=%u %s:", + trans->locking_iter_idx, + trans->locking_level, + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + + pr_buf(out, " node %px l=%u %s:", b, b->c.level, bch2_btree_ids[b->c.btree_id]); bch2_bpos_to_text(out, b->key.k.p); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cf1801ee14a2..4c80ab368e69 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -157,15 +157,15 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, * Lock a btree node if we already have it locked on one of our linked * iterators: */ -static inline bool btree_node_lock_increment(struct btree_iter *iter, +static inline bool btree_node_lock_increment(struct btree_trans *trans, struct btree *b, unsigned level, enum btree_node_locked_type want) { - struct btree_iter *linked; + struct btree_iter *iter; - trans_for_each_iter(iter->trans, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) >= want) { + trans_for_each_iter(trans, iter) + if (iter->l[level].b == b && + btree_node_locked_type(iter, level) >= want) { six_lock_increment(&b->c.lock, want); return true; } @@ -181,19 +181,23 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { + struct btree_trans *trans = iter->trans; bool ret; EBUG_ON(level >= BTREE_MAX_DEPTH); #ifdef CONFIG_BCACHEFS_DEBUG - iter->trans->locking = b; + trans->locking = b; + trans->locking_iter_idx = iter->idx; + trans->locking_pos = pos; + trans->locking_btree_id = iter->btree_id; + trans->locking_level = level; #endif - - ret = likely(six_trylock_type(&b->c.lock, type)) || - btree_node_lock_increment(iter, b, level, type) || + ret = likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type); #ifdef CONFIG_BCACHEFS_DEBUG - iter->trans->locking = NULL; + trans->locking = NULL; #endif return ret; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 78fbf922341e..58d54a4ac218 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -287,6 +287,10 @@ struct btree_trans { #ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; + unsigned locking_iter_idx; + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; pid_t pid; #endif unsigned long ip; diff --git 
a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 028aa9bbeced..9fbbd2a72e14 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -481,7 +481,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, * or anything else that might call bch2_trans_relock(), since that * would just retake the read locks: */ - trans_for_each_iter_all(trans, iter) { + trans_for_each_iter(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); EBUG_ON(trans->iters_live & (1ULL << iter->idx)); -- cgit From bd2bb273a09b93e2a7d79d30458ab5f6f0b3757a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Jun 2020 22:29:48 -0400 Subject: bcachefs: Don't deadlock when btree node reuse changes lock ordering Btree node lock ordering is based on the logical key. However, 'struct btree' may be reused for a different btree node under memory pressure. This patch uses the new six lock callback to check if a btree node is no longer the node we wanted to lock before blocking. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 19 +++++++++++++++++-- fs/bcachefs/btree_iter.c | 38 ++++++++++++++++++++++++++++++------- fs/bcachefs/btree_locking.h | 19 ++++++++++++------- fs/bcachefs/btree_update_interior.c | 2 ++ 4 files changed, 62 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d31017de53aa..9423cff1539f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -678,6 +678,14 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return b; } +static int lock_node_check_fn(struct six_lock *lock, void *p) +{ + struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + + return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; +} + /** * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. 
@@ -750,8 +758,12 @@ lock_node: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + lock_node_check_fn, (void *) k)) { + if (b->hash_val != btree_ptr_hash_val(k)) + goto retry; return ERR_PTR(-EINTR); + } if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || @@ -803,6 +815,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -823,7 +836,9 @@ retry: return b; } else { lock_node: - six_lock_read(&b->c.lock, NULL, NULL); + ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); + if (ret) + goto retry; if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 35fe7db50fb5..b11c8e2a8d6b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -194,10 +194,13 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, /* Slowpath: */ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type) + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, + void *p) { struct btree_trans *trans = iter->trans; struct btree_iter *linked; + u64 start_time = local_clock(); bool ret = true; /* Check if it's safe to block: */ @@ -275,7 +278,14 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, return false; } - __btree_node_lock_type(iter->trans->c, b, type); + if (six_trylock_type(&b->c.lock, type)) + return true; + + if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) + return false; + + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], + start_time); return true; } @@ -286,6 +296,11 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; + if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { + BUG_ON(iter->nodes_locked); + return; + } + for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) @@ -300,7 +315,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) { struct btree_iter *iter; - trans_for_each_iter(trans, iter) + trans_for_each_iter_all(trans, iter) bch2_btree_iter_verify_locks(iter); } #else @@ -892,18 +907,26 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) __btree_iter_init(linked, b->c.level); } +static int lock_root_check_fn(struct six_lock *lock, void *p) +{ + struct btree *b = container_of(lock, struct btree, c.lock); + struct btree **rootp = p; + + return b == *rootp ? 
0 : -1; +} + static inline int btree_iter_lock_root(struct btree_iter *iter, unsigned depth_want) { struct bch_fs *c = iter->trans->c; - struct btree *b; + struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; enum six_lock_type lock_type; unsigned i; EBUG_ON(iter->nodes_locked); while (1) { - b = READ_ONCE(c->btree_roots[iter->btree_id].b); + b = READ_ONCE(*rootp); iter->level = READ_ONCE(b->c.level); if (unlikely(iter->level < depth_want)) { @@ -921,10 +944,11 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type))) + iter, lock_type, + lock_root_check_fn, rootp))) return -EINTR; - if (likely(b == c->btree_roots[iter->btree_id].b && + if (likely(b == READ_ONCE(*rootp) && b->c.level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4c80ab368e69..ffee6f2d7d4b 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -174,17 +174,21 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, } bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type); - -static inline bool btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, - struct btree_iter *iter, - enum six_lock_type type) + struct btree_iter *, enum six_lock_type, + six_lock_should_sleep_fn, void *); + +static inline bool btree_node_lock(struct btree *b, + struct bpos pos, unsigned level, + struct btree_iter *iter, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) { struct btree_trans *trans = iter->trans; bool ret; EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + #ifdef CONFIG_BCACHEFS_DEBUG trans->locking = b; trans->locking_iter_idx = iter->idx; @@ -194,7 +198,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, #endif ret = likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, type); + __bch2_btree_node_lock(b, pos, level, iter, type, + should_sleep_fn, p); #ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bb921852a093..2d68f4eaca34 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -135,6 +135,8 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) bch2_btree_node_hash_remove(&c->btree_cache, b); + six_lock_wakeup_all(&b->c.lock); + mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); -- cgit From 7fffc85baf1fa176560a546a0625efc549969ce4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Jun 2020 18:43:14 -0400 Subject: bcachefs: Add an internal option for reading entire journal To be used the debug tool that dumps the contents of the journal. 
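A minimal sketch of what the option changes while reading the journal (keep_entry is an illustrative name): without it, entries older than the newest last_seq are dropped; with it, every entry found is kept so a debug tool can dump the whole journal.

#include <stdbool.h>
#include <stdint.h>

static bool keep_entry(uint64_t seq, uint64_t last_seq,
		       bool read_entire_journal)
{
	return read_entire_journal || seq >= last_seq;
}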
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 9 +++++---- fs/bcachefs/journal_io.c | 26 ++++++++++++++------------ fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/recovery.c | 26 ++++++++++++++++++++------ 4 files changed, 44 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 767cb6f809e7..cbfaec5143d8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -986,9 +986,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, u64 last_seq = cur_seq, nr, seq; if (!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_first_entry(journal_entries, - struct journal_replay, - list)->j.seq); + last_seq = le64_to_cpu(list_last_entry(journal_entries, + struct journal_replay, list)->j.last_seq); nr = cur_seq - last_seq; @@ -1017,8 +1016,10 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, list_for_each_entry(i, journal_entries, list) { seq = le64_to_cpu(i->j.seq); + BUG_ON(seq >= cur_seq); - BUG_ON(seq < last_seq || seq >= cur_seq); + if (seq < last_seq) + continue; journal_seq_pin(j, seq)->devs = i->devs; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1724c80b323c..a1bae99aeaab 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -40,19 +40,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, list)->j.last_seq : 0; - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } + if (!c->opts.read_entire_journal) { + /* Is this entry older than the range we need? */ + if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } - /* Drop entries we don't need anymore */ - list_for_each_entry_safe(i, pos, jlist->head, list) { - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) - break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + /* Drop entries we don't need anymore */ + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } } list_for_each_entry_reverse(i, jlist->head, list) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index fe457117bf89..bc274918e18c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -265,6 +265,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ x(noexcl, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 384dfb2279c1..26e5767aa5de 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -319,20 +319,30 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) struct journal_key *src, *dst; size_t nr_keys = 0; - list_for_each_entry(p, journal_entries, list) + if (list_empty(journal_entries)) + return keys; + + keys.journal_seq_base = + le64_to_cpu(list_last_entry(journal_entries, + struct journal_replay, list)->j.last_seq); + + list_for_each_entry(p, journal_entries, list) { + if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + continue; + for_each_jset_key(k, _n, entry, &p->j) nr_keys++; + } - keys.journal_seq_base = - 
le64_to_cpu(list_first_entry(journal_entries, - struct journal_replay, - list)->j.seq); keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); if (!keys.d) goto err; - list_for_each_entry(p, journal_entries, list) + list_for_each_entry(p, journal_entries, list) { + if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + continue; + for_each_jset_key(k, _n, entry, &p->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, @@ -342,6 +352,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) keys.journal_seq_base, .journal_offset = k->_data - p->j._data, }; + } sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); @@ -568,6 +579,9 @@ verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, int ret = 0; list_for_each_entry(i, journal, list) { + if (le64_to_cpu(i->j.seq) < start_seq) + continue; + fsck_err_on(seq != le64_to_cpu(i->j.seq), c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", seq, le64_to_cpu(i->j.seq) - 1, -- cgit From 1ada160618d66bc57beacb4c35f13e9a4c269afa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 14:58:47 -0400 Subject: bcachefs: Turn c->state_lock into an rwsem Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/btree_gc.c | 1 + fs/bcachefs/buckets.c | 7 +++--- fs/bcachefs/error.c | 4 ++-- fs/bcachefs/fs.c | 12 +++++----- fs/bcachefs/super.c | 60 +++++++++++++++++++++++++------------------------- fs/bcachefs/sysfs.c | 19 +++++----------- 7 files changed, 50 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a219969357bc..8d9cc7eb6ad7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -551,8 +551,8 @@ struct bch_fs { struct super_block *vfs_sb; char name[40]; - /* ro/rw, add/remove devices: */ - struct mutex state_lock; + /* ro/rw, add/remove/resize devices: */ + struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6589fe0bad6c..22aa845ea630 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -800,6 +800,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, unsigned i, iter = 0; int ret; + lockdep_assert_held(&c->state_lock); trace_gc_start(c); down_write(&c->gc_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 08e8b578fff5..5ee978c94568 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1967,6 +1967,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) int ret = -ENOMEM; unsigned i; + lockdep_assert_held(&c->state_lock); + memset(&free, 0, sizeof(free)); memset(&free_inc, 0, sizeof(free_inc)); memset(&alloc_heap, 0, sizeof(alloc_heap)); @@ -1993,7 +1995,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bch2_copygc_stop(ca); if (resize) { - down_write(&c->gc_lock); down_write(&ca->bucket_lock); percpu_down_write(&c->mark_lock); } @@ -2036,10 +2037,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - if (resize) { + if (resize) up_write(&ca->bucket_lock); - up_write(&c->gc_lock); - } if (start_copygc && bch2_copygc_start(c, ca)) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 1662a36244cd..cd46706fb6f5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -37,7 +37,7 @@ 
void bch2_io_error_work(struct work_struct *work) struct bch_fs *c = ca->fs; bool dev; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, BCH_FORCE_IF_DEGRADED); if (dev @@ -47,7 +47,7 @@ void bch2_io_error_work(struct work_struct *work) bch_err(ca, "too many IO errors, setting %s RO", dev ? "device" : "filesystem"); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 6aff3203b4e1..4538551ccca3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1342,16 +1342,16 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons if (IS_ERR(c)) return c; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); if (!test_bit(BCH_FS_STARTED, &c->flags)) { - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); closure_put(&c->cl); pr_err("err mounting %s: incomplete filesystem", dev_name); return ERR_PTR(-EINVAL); } - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); return c; @@ -1400,7 +1400,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) return ret; if (opts.read_only != c->opts.read_only) { - mutex_lock(&c->state_lock); + down_write(&c->state_lock); if (opts.read_only) { bch2_fs_read_only(c); @@ -1410,7 +1410,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) ret = bch2_fs_read_write(c); if (ret) { bch_err(c, "error going rw: %i", ret); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return -EINVAL; } @@ -1419,7 +1419,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) c->opts.read_only = opts.read_only; - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); } if (opts.errors >= 0) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 819d4392d529..3a7c48def9e8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -340,9 +340,9 @@ static void bch2_fs_read_only_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, read_only_work); - mutex_lock(&c->state_lock); + down_write(&c->state_lock); bch2_fs_read_only(c); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); } static void bch2_fs_read_only_async(struct bch_fs *c) @@ -534,9 +534,9 @@ void bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&c->journal_seq_blacklist_gc_work); - mutex_lock(&c->state_lock); + down_write(&c->state_lock); bch2_fs_read_only(c); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && @@ -607,7 +607,7 @@ static const char *bch2_fs_online(struct bch_fs *c) bch2_opts_create_sysfs_files(&c->opts_dir)) return "error creating sysfs objects"; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); err = "error creating sysfs objects"; __for_each_member_device(ca, c, i, NULL) @@ -617,7 +617,7 @@ static const char *bch2_fs_online(struct bch_fs *c) list_add(&c->list, &bch_fs_list); err = NULL; err: - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return err; } @@ -639,7 +639,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->minor = -1; c->disk_sb.fs_sb = true; - mutex_init(&c->state_lock); + init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); @@ -832,7 +832,7 @@ int bch2_fs_start(struct bch_fs *c) unsigned i; int ret = -EINVAL; - 
mutex_lock(&c->state_lock); + down_write(&c->state_lock); BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); @@ -882,7 +882,7 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); ret = 0; out: - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return ret; err: switch (ret) { @@ -1376,9 +1376,9 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, { int ret; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); ret = __bch2_dev_set_state(c, ca, new_state, flags); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return ret; } @@ -1391,7 +1391,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) unsigned dev_idx = ca->dev_idx, data; int ret = -EINVAL; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); /* * We consume a reference to ca->ref, regardless of whether we succeed @@ -1481,13 +1481,13 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_write_super(c); mutex_unlock(&c->sb_lock); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_RW && !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return ret; } @@ -1563,7 +1563,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) dev_usage_clear(ca); - mutex_lock(&c->state_lock); + down_write(&c->state_lock); mutex_lock(&c->sb_lock); err = "insufficient space in new superblock"; @@ -1624,12 +1624,12 @@ have_slot: goto err_late; } - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return 0; err_unlock: mutex_unlock(&c->sb_lock); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); err: if (ca) bch2_dev_free(ca); @@ -1652,11 +1652,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path) const char *err; int ret; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); ret = bch2_read_super(path, &opts, &sb); if (ret) { - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return ret; } @@ -1687,10 +1687,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return 0; err: - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); bch2_free_super(&sb); bch_err(c, "error bringing %s online: %s", path, err); return -EINVAL; @@ -1698,23 +1698,23 @@ err: int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { - mutex_lock(&c->state_lock); + down_write(&c->state_lock); if (!bch2_dev_is_online(ca)) { bch_err(ca, "Already offline"); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return 0; } if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot offline required disk"); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return -EINVAL; } __bch2_dev_offline(c, ca); - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return 0; } @@ -1723,7 +1723,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bch_member *mi; int ret = 0; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); @@ -1754,7 +1754,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bch2_recalc_capacity(c); err: - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); return ret; } @@ -1834,13 +1834,13 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; err = "bch2_dev_online() 
error"; - mutex_lock(&c->state_lock); + down_write(&c->state_lock); for (i = 0; i < nr_devices; i++) if (bch2_dev_attach_bdev(c, &sb[i])) { - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); goto err_print; } - mutex_unlock(&c->state_lock); + up_write(&c->state_lock); err = "insufficient devices"; if (!bch2_fs_may_start(c)) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 06b59e991312..663b59e78824 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -427,7 +427,7 @@ SHOW(bch2_fs) return 0; } -STORE(__bch2_fs) +STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -485,8 +485,11 @@ STORE(__bch2_fs) if (attr == &sysfs_trigger_btree_coalesce) bch2_coalesce(c); - if (attr == &sysfs_trigger_gc) + if (attr == &sysfs_trigger_gc) { + down_read(&c->state_lock); bch2_gc(c, NULL, false, false); + up_read(&c->state_lock); + } if (attr == &sysfs_trigger_alloc_write) { bool wrote; @@ -501,6 +504,7 @@ STORE(__bch2_fs) sc.nr_to_scan = strtoul_or_return(buf); c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -522,17 +526,6 @@ STORE(__bch2_fs) #endif return size; } - -STORE(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - mutex_lock(&c->state_lock); - size = __bch2_fs_store(kobj, attr, buf, size); - mutex_unlock(&c->state_lock); - - return size; -} SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { -- cgit From 451570a5bc5f72c4c6442631d158f0c11cb3daa8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 15:10:54 -0400 Subject: bcachefs: Implement a new gc that only recalcs oldest gen Full mark and sweep gc doesn't (yet?) work with the new btree key cache code, but it also blocks updates to interior btree nodes for the duration and isn't really necessary in practice; we aren't currently attempting to repair errors in allocation info at runtime. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 83 +++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_gc.h | 1 + fs/bcachefs/buckets_types.h | 1 + fs/bcachefs/sysfs.c | 7 ++++ 4 files changed, 92 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 22aa845ea630..3775b65a89a6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -887,6 +887,82 @@ out: return ret; } +/* + * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree + * node pointers currently never have cached pointers that can become stale: + */ +static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, false); + + if (gen_after(g->gc_gen, ptr->gen)) + g->gc_gen = ptr->gen; + + if (gen_after(g->mark.gen, ptr->gen) > 32) { + /* rewrite btree node */ + + } + } + percpu_up_read(&c->mark_lock); + } + + bch2_trans_exit(&trans); + return ret; +} + +int bch2_gc_gens(struct bch_fs *c) +{ + struct bch_dev *ca; + struct bucket_array *buckets; + struct bucket *g; + unsigned i; + int ret; + + down_read(&c->state_lock); + + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + g->gc_gen = g->mark.gen; + up_read(&ca->bucket_lock); + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (btree_node_type_needs_gc(i)) { + ret = bch2_gc_btree_gens(c, i); + if (ret) + goto err; + } + + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + g->oldest_gen = g->gc_gen; + up_read(&ca->bucket_lock); + } +err: + up_read(&c->state_lock); + return ret; +} + /* Btree coalescing */ static void recalc_packed_keys(struct btree *b) @@ -1262,7 +1338,14 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 ret = bch2_gc(c, NULL, false, false); +#else + ret = bch2_gc_gens(c); +#endif if (ret) bch_err(c, "btree gc failed: %i", ret); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 3966d5e54cfd..3694a3df62a8 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -8,6 +8,7 @@ void bch2_coalesce(struct bch_fs *); struct journal_keys; int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); +int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 070e10dfa7bb..172b0ccf2b4f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -39,6 +39,7 @@ struct bucket { u16 io_time[2]; u8 oldest_gen; + u8 gc_gen; unsigned gen_valid:1; }; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 663b59e78824..1800e0f7f81e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -486,9 +486,16 @@ STORE(bch2_fs) 
bch2_coalesce(c); if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 down_read(&c->state_lock); bch2_gc(c, NULL, false, false); up_read(&c->state_lock); +#else + bch2_gc_gens(c); +#endif } if (attr == &sysfs_trigger_alloc_write) { -- cgit From 2ca88e5ad9b29624ea1467ef7fcc583c928fd783 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 19:46:10 -0500 Subject: bcachefs: Btree key cache This introduces a new kind of btree iterator, cached iterators, which point to keys cached in a hash table. The cache also acts as a write cache - in the update path, we journal the update but defer updating the btree until the cached entry is flushed by journal reclaim. Cache coherency is for now up to the users to handle, which isn't ideal but should be good enough for now. These new iterators will be used for updating inodes and alloc info (the alloc and stripes btrees). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_iter.c | 115 ++++++--- fs/bcachefs/btree_iter.h | 16 +- fs/bcachefs/btree_key_cache.c | 494 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_key_cache.h | 23 ++ fs/bcachefs/btree_types.h | 53 +++- fs/bcachefs/btree_update.h | 5 + fs/bcachefs/btree_update_interior.c | 9 + fs/bcachefs/btree_update_leaf.c | 67 ++++- fs/bcachefs/buckets.c | 7 + fs/bcachefs/journal_reclaim.c | 31 +++ fs/bcachefs/journal_reclaim.h | 4 + fs/bcachefs/super.c | 4 + 14 files changed, 787 insertions(+), 45 deletions(-) create mode 100644 fs/bcachefs/btree_key_cache.c create mode 100644 fs/bcachefs/btree_key_cache.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 6d5ad877fb07..ffe4db45e1c9 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -13,6 +13,7 @@ bcachefs-y := \ btree_gc.o \ btree_io.o \ btree_iter.o \ + btree_key_cache.o \ btree_update_interior.o \ btree_update_leaf.o \ buckets.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8d9cc7eb6ad7..d293afcda75a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -483,6 +483,7 @@ enum { BCH_FS_ALLOCATOR_RUNNING, BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, + BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_RW, @@ -632,6 +633,8 @@ struct bch_fs { struct list_head btree_trans_list; mempool_t btree_iters_pool; + struct btree_key_cache btree_key_cache; + struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b11c8e2a8d6b..592663a00182 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -4,22 +4,16 @@ #include "bkey_methods.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" #include "debug.h" #include "extents.h" +#include "journal.h" #include "trace.h" #include -#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) - static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -253,7 +247,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } /* Must lock btree nodes in key order: */ - if (iter->btree_id < linked->btree_id) + if ((cmp_int(iter->btree_id, linked->btree_id) ?: + -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) ret = false; if (iter->btree_id == linked->btree_id && @@ -301,7 +296,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) return; } - for (l = 0; btree_iter_node(iter, l); l++) { + for (l = 0; is_btree_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) continue; @@ -323,7 +318,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten -static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) +bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { return btree_iter_get_locks(iter, false, trace); } @@ -845,6 +840,8 @@ static inline void __btree_iter_init(struct btree_iter *iter, static inline void btree_iter_node_set(struct btree_iter *iter, struct btree *b) { + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + btree_iter_verify_new_node(iter, b); EBUG_ON(!btree_iter_pos_in_node(iter, b)); @@ -865,7 +862,8 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; trans_for_each_iter(iter->trans, linked) - if (btree_iter_pos_in_node(linked, b)) { + if (btree_iter_type(linked) != BTREE_ITER_CACHED && + btree_iter_pos_in_node(linked, b)) { /* * bch2_btree_iter_node_drop() has already been called - * the old node we're replacing has already been @@ -1057,24 +1055,28 @@ static void btree_iter_up(struct btree_iter *iter) static int btree_iter_traverse_one(struct btree_iter *); -static int __btree_iter_traverse_all(struct btree_trans *trans, - struct btree_iter *orig_iter, int ret) +static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) { struct bch_fs *c = trans->c; struct btree_iter *iter; u8 sorted[BTREE_ITER_MAX]; unsigned i, nr_sorted = 0; + if (trans->in_traverse_all) + return -EINTR; + + trans->in_traverse_all = true; +retry_all: + nr_sorted = 0; + trans_for_each_iter(trans, iter) - sorted[nr_sorted++] = iter - trans->iters; + sorted[nr_sorted++] = iter->idx; #define btree_iter_cmp_by_idx(_l, _r) \ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx - -retry_all: bch2_trans_unlock(trans); if (unlikely(ret == -ENOMEM)) { @@ -1090,11 +1092,6 @@ retry_all: if (unlikely(ret == -EIO)) { 
trans->error = true; - if (orig_iter) { - orig_iter->flags |= BTREE_ITER_ERROR; - orig_iter->l[orig_iter->level].b = - BTREE_ITER_NO_NODE_ERROR; - } goto out; } @@ -1102,9 +1099,16 @@ retry_all: /* Now, redo traversals in correct order: */ for (i = 0; i < nr_sorted; i++) { - iter = &trans->iters[sorted[i]]; + unsigned idx = sorted[i]; + + /* + * sucessfully traversing one iterator can cause another to be + * unlinked, in btree_key_cache_fill() + */ + if (!(trans->iters_linked & (1ULL << idx))) + continue; - ret = btree_iter_traverse_one(iter); + ret = btree_iter_traverse_one(&trans->iters[idx]); if (ret) goto retry_all; } @@ -1119,12 +1123,14 @@ retry_all: } out: bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; return ret; } int bch2_btree_iter_traverse_all(struct btree_trans *trans) { - return __btree_iter_traverse_all(trans, NULL, 0); + return __btree_iter_traverse_all(trans, 0); } static inline bool btree_iter_good_node(struct btree_iter *iter, @@ -1169,9 +1175,6 @@ static int btree_iter_traverse_one(struct btree_iter *iter) { unsigned depth_want = iter->level; - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) - return 0; - /* * if we need interior nodes locked, call btree_iter_relock() to make * sure we walk back up enough that we lock them: @@ -1180,9 +1183,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->locks_want > 1) bch2_btree_iter_relock(iter, false); + if (btree_iter_type(iter) == BTREE_ITER_CACHED) + return bch2_btree_iter_traverse_cached(iter); + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) return 0; + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + return 0; + /* * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary @@ -1216,7 +1225,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) return 0; iter->level = depth_want; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; + + if (ret == -EIO) { + iter->flags |= BTREE_ITER_ERROR; + iter->l[iter->level].b = + BTREE_ITER_NO_NODE_ERROR; + } else { + iter->l[iter->level].b = + BTREE_ITER_NO_NODE_DOWN; + } return ret; } } @@ -1229,12 +1246,13 @@ static int btree_iter_traverse_one(struct btree_iter *iter) int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; int ret; - ret = bch2_trans_cond_resched(iter->trans) ?: + ret = bch2_trans_cond_resched(trans) ?: btree_iter_traverse_one(iter); if (unlikely(ret)) - ret = __btree_iter_traverse_all(iter->trans, iter, ret); + ret = __btree_iter_traverse_all(trans, ret); return ret; } @@ -1383,6 +1401,13 @@ static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) if (!cmp) goto out; + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); + iter->l[0].b = BTREE_ITER_NO_NODE_UP; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + return; + } + l = btree_iter_up_until_good_node(iter, cmp); if (btree_iter_node(iter, l)) { @@ -1814,6 +1839,26 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) +{ + struct bkey_cached *ck; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_CACHED); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + ck = (void *) iter->l[0].b; + + EBUG_ON(iter->btree_id != ck->key.btree_id || + bkey_cmp(iter->pos, ck->key.pos)); + BUG_ON(!ck->valid); + + return bkey_i_to_s_c(ck->k); +} + static inline 
void bch2_btree_iter_init(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree_id, struct bpos pos, unsigned flags) @@ -1999,6 +2044,7 @@ static inline void btree_iter_copy(struct btree_iter *dst, *dst = *src; dst->idx = idx; + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(dst, i)) @@ -2057,8 +2103,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter = best; } - iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); - iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~BTREE_ITER_USER_FLAGS; + iter->flags |= flags & BTREE_ITER_USER_FLAGS; if (iter->flags & BTREE_ITER_INTENT) bch2_btree_iter_upgrade(iter, 1); @@ -2263,6 +2310,8 @@ int bch2_trans_exit(struct btree_trans *trans) mutex_unlock(&trans->c->btree_trans_lock); #endif + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + kfree(trans->fs_usage_deltas); kfree(trans->mem); if (trans->used_mempool) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bc408f1272e7..bd9ec3ec9a92 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -110,6 +110,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); +bool bch2_btree_iter_relock(struct btree_iter *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); @@ -170,6 +171,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); @@ -177,7 +180,9 @@ void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); static inline int btree_iter_cmp(const struct btree_iter *l, const struct btree_iter *r) { - return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos); + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: + bkey_cmp(l->pos, r->pos); } /* @@ -211,9 +216,12 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) + return bch2_btree_iter_peek_cached(iter); + else + return flags & BTREE_ITER_SLOTS + ? 
bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); } static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 index 000000000000..2feff59e755a --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c @@ -0,0 +1,494 @@ + +#include "bcachefs.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "trace.h" + +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct bkey_cached *ck = obj; + const struct bkey_cached_key *key = arg->key; + + return cmp_int(ck->key.btree_id, key->btree_id) ?: + bkey_cmp(ck->key.pos, key->pos); +} + +static const struct rhashtable_params bch2_btree_key_cache_params = { + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, +}; + +__flatten +static inline struct bkey_cached * +btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +{ + struct bkey_cached_key key = { + .btree_id = btree_id, + .pos = pos, + }; + + return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, + bch2_btree_key_cache_params); +} + +static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) +{ + if (!six_trylock_intent(&ck->c.lock)) + return false; + + if (!six_trylock_write(&ck->c.lock)) { + six_unlock_intent(&ck->c.lock); + return false; + } + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + return false; + } + + return true; +} + +static void bkey_cached_evict(struct btree_key_cache *c, + struct bkey_cached *ck) +{ + BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); +} + +static void bkey_cached_free(struct btree_key_cache *c, + struct bkey_cached *ck) +{ + list_move(&ck->list, &c->freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +static struct bkey_cached * +bkey_cached_alloc(struct btree_key_cache *c) +{ + struct bkey_cached *ck; + + list_for_each_entry(ck, &c->freed, list) + if (bkey_cached_lock_for_evict(ck)) + return ck; + + list_for_each_entry(ck, &c->clean, list) + if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + return ck; + } + + ck = kzalloc(sizeof(*ck), GFP_NOFS); + if (!ck) + return NULL; + + INIT_LIST_HEAD(&ck->list); + six_lock_init(&ck->c.lock); + lockdep_set_novalidate_class(&ck->c.lock); + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + + return ck; +} + +static struct bkey_cached * +btree_key_cache_create(struct btree_key_cache *c, + enum btree_id btree_id, + struct bpos pos) +{ + struct bkey_cached *ck; + + ck = bkey_cached_alloc(c); + if (!ck) + return ERR_PTR(-ENOMEM); + + ck->c.level = 0; + ck->c.btree_id = btree_id; + ck->key.btree_id = btree_id; + ck->key.pos = pos; + ck->valid = false; + + BUG_ON(ck->flags); + + if (rhashtable_lookup_insert_fast(&c->table, + &ck->hash, + bch2_btree_key_cache_params)) { + /* We raced with another fill: */ + bkey_cached_free(c, ck); + return NULL; + } + + list_move(&ck->list, &c->clean); + six_unlock_write(&ck->c.lock); + + return ck; +} + +static int 
btree_key_cache_fill(struct btree_trans *trans, + struct btree_iter *ck_iter, + struct bkey_cached *ck) +{ + struct btree_iter *iter; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; + int ret; + + iter = bch2_trans_get_iter(trans, ck->key.btree_id, + ck->key.pos, BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_put(trans, iter); + return ret; + } + + if (!bch2_btree_node_relock(ck_iter, 0)) { + bch2_trans_iter_put(trans, iter); + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + return -EINTR; + } + + if (k.k->u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(k.k->u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + bch2_trans_iter_put(trans, iter); + return -ENOMEM; + } + } + + bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); + if (new_k) { + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + } + + bkey_reassemble(ck->k, k); + ck->valid = true; + bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); + + /* We're not likely to need this iterator again: */ + bch2_trans_iter_free(trans, iter); + + return 0; +} + +static int bkey_cached_check_fn(struct six_lock *lock, void *p) +{ + struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); + const struct btree_iter *iter = p; + + return ck->key.btree_id == iter->btree_id && + !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; +} + +int bch2_btree_iter_traverse_cached(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + BUG_ON(iter->level); + + if (btree_node_locked(iter, 0)) { + ck = (void *) iter->l[0].b; + goto fill; + } +retry: + ck = btree_key_cache_find(c, iter->btree_id, iter->pos); + if (!ck) { + if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { + iter->l[0].b = NULL; + return 0; + } + + mutex_lock(&c->btree_key_cache.lock); + ck = btree_key_cache_create(&c->btree_key_cache, + iter->btree_id, iter->pos); + mutex_unlock(&c->btree_key_cache.lock); + + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + + mark_btree_node_locked(iter, 0, SIX_LOCK_intent); + iter->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(iter, 0); + + if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, + bkey_cached_check_fn, iter)) { + if (ck->key.btree_id != iter->btree_id || + bkey_cmp(ck->key.pos, iter->pos)) { + goto retry; + } + + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = -EINTR; + goto err; + } + + if (ck->key.btree_id != iter->btree_id || + bkey_cmp(ck->key.pos, iter->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(iter, 0, lock_want); + } + + iter->l[0].lock_seq = ck->c.lock.state.seq; + iter->l[0].b = (void *) ck; +fill: + if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { + if (!btree_node_intent_locked(iter, 0)) + bch2_btree_iter_upgrade(iter, 1); + if (!btree_node_intent_locked(iter, 0)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = -EINTR; + goto err; + } + + ret = btree_key_cache_fill(trans, iter, ck); + if (ret) + goto err; + } + + iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_downgrade(iter); + return ret; +err: + if (ret != -EINTR) { + btree_node_unlock(iter, 0); + iter->flags |= BTREE_ITER_ERROR; + iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; + } + return ret; +} + +static int btree_key_cache_flush_pos(struct 
btree_trans *trans, + struct bkey_cached_key key, + u64 journal_seq, + bool evict) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_iter *c_iter = NULL, *b_iter = NULL; + struct bkey_cached *ck; + int ret; + + b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(b_iter); + if (ret) + goto out; + + c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(c_iter); + if (ret) + goto out; +retry: + ret = bch2_btree_iter_traverse(c_iter); + if (ret) + goto err; + + ck = (void *) c_iter->l[0].b; + if (!ck || + (journal_seq && ck->journal.seq != journal_seq)) + goto out; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + if (!evict) + goto out; + goto evict; + } + + ret = bch2_btree_iter_traverse(b_iter) ?: + bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| + BTREE_INSERT_JOURNAL_RECLAIM); +err: + if (ret == -EINTR) + goto retry; + + BUG_ON(ret && !bch2_journal_error(j)); + + if (ret) + goto out; + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + + if (!evict) { + mutex_lock(&c->btree_key_cache.lock); + list_move_tail(&ck->list, &c->btree_key_cache.clean); + mutex_unlock(&c->btree_key_cache.lock); + } else { +evict: + BUG_ON(!btree_node_intent_locked(c_iter, 0)); + + mark_btree_node_unlocked(c_iter, 0); + c_iter->l[0].b = NULL; + + six_lock_write(&ck->c.lock, NULL, NULL); + + mutex_lock(&c->btree_key_cache.lock); + bkey_cached_evict(&c->btree_key_cache, ck); + bkey_cached_free(&c->btree_key_cache, ck); + mutex_unlock(&c->btree_key_cache.lock); + } +out: + bch2_trans_iter_put(trans, b_iter); + bch2_trans_iter_put(trans, c_iter); + return ret; +} + +static void btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; + struct btree_trans trans; + + six_lock_read(&ck->c.lock, NULL, NULL); + key = ck->key; + + if (ck->journal.seq != seq || + !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_read(&ck->c.lock); + return; + } + six_unlock_read(&ck->c.lock); + + bch2_trans_init(&trans, c, 0, 0); + btree_key_cache_flush_pos(&trans, key, seq, false); + bch2_trans_exit(&trans); +} + +/* + * Flush and evict a key from the key cache: + */ +int bch2_btree_key_cache_flush(struct btree_trans *trans, + enum btree_id id, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct bkey_cached_key key = { id, pos }; + + /* Fastpath - assume it won't be found: */ + if (!btree_key_cache_find(c, id, pos)) + return 0; + + return btree_key_cache_flush_pos(trans, key, 0, true); +} + +bool bch2_btree_insert_key_cached(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) iter->l[0].b; + + BUG_ON(insert->u64s > ck->u64s); + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + int difference; + + BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); + + difference = 
jset_u64s(insert->u64s) - ck->res.u64s; + if (difference > 0) { + trans->journal_preres.u64s -= difference; + ck->res.u64s += difference; + } + } + + bkey_copy(ck->k, insert); + ck->valid = true; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + mutex_lock(&c->btree_key_cache.lock); + list_del_init(&ck->list); + + set_bit(BKEY_CACHED_DIRTY, &ck->flags); + mutex_unlock(&c->btree_key_cache.lock); + } + + bch2_journal_pin_update(&c->journal, trans->journal_res.seq, + &ck->journal, btree_key_cache_journal_flush); + return true; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + enum btree_id id, struct bpos pos) +{ + BUG_ON(btree_key_cache_find(trans->c, id, pos)); +} +#endif + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) +{ + struct bkey_cached *ck, *n; + + mutex_lock(&c->lock); + list_for_each_entry_safe(ck, n, &c->clean, list) { + kfree(ck->k); + kfree(ck); + } + list_for_each_entry_safe(ck, n, &c->freed, list) + kfree(ck); + mutex_unlock(&c->lock); + + rhashtable_destroy(&c->table); +} + +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) +{ + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->clean); +} + +int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +{ + return rhashtable_init(&c->table, &bch2_btree_key_cache_params); +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 index 000000000000..fbc29336091f --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h @@ -0,0 +1,23 @@ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + +int bch2_btree_iter_traverse_cached(struct btree_iter *); + +bool bch2_btree_insert_key_cached(struct btree_trans *, + struct btree_iter *, struct bkey_i *); +int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_key_cache_verify_clean(struct btree_trans *, + enum btree_id, struct bpos); +#else +static inline void +bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + enum btree_id id, struct bpos pos) {} +#endif + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); +int bch2_fs_btree_key_cache_init(struct btree_key_cache *); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 58d54a4ac218..345a06bac0fe 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -183,6 +183,7 @@ struct btree_node_iter { enum btree_iter_type { BTREE_ITER_KEYS, BTREE_ITER_NODES, + BTREE_ITER_CACHED, }; #define BTREE_ITER_TYPE ((1 << 2) - 1) @@ -214,6 +215,15 @@ enum btree_iter_type { #define BTREE_ITER_IS_EXTENTS (1 << 6) #define BTREE_ITER_ERROR (1 << 7) #define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) +#define BTREE_ITER_CACHED_NOFILL (1 << 9) +#define BTREE_ITER_CACHED_NOCREATE (1 << 10) + +#define BTREE_ITER_USER_FLAGS \ + (BTREE_ITER_SLOTS \ + |BTREE_ITER_INTENT \ + |BTREE_ITER_PREFETCH \ + |BTREE_ITER_CACHED_NOFILL \ + |BTREE_ITER_CACHED_NOCREATE) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -222,6 +232,14 @@ enum btree_iter_uptodate { BTREE_ITER_NEED_TRAVERSE = 3, }; +#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) 
+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) + /* * @pos - iterator's current position * @level - current btree depth @@ -259,7 +277,8 @@ struct btree_iter { unsigned long ip_allocated; }; -static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) +static inline enum btree_iter_type +btree_iter_type(const struct btree_iter *iter) { return iter->flags & BTREE_ITER_TYPE; } @@ -269,6 +288,37 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) return iter->l + iter->level; } +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + struct list_head freed; + struct list_head clean; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed; + +#define BKEY_CACHED_DIRTY 0 + +struct bkey_cached { + struct btree_bkey_cached_common c; + + unsigned long flags; + u8 u64s; + bool valid; + struct bkey_cached_key key; + + struct rhash_head hash; + struct list_head list; + + struct journal_preres res; + struct journal_entry_pin journal; + + struct bkey_i *k; +}; + struct btree_insert_entry { unsigned trigger_flags; unsigned trans_triggers_run:1; @@ -307,6 +357,7 @@ struct btree_trans { unsigned error:1; unsigned nounlock:1; unsigned need_reset:1; + unsigned in_traverse_all:1; unsigned mem_top; unsigned mem_bytes; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 11f7d02de622..e0b1bde37484 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -23,6 +23,7 @@ enum btree_insert_flags { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -47,8 +48,12 @@ enum btree_insert_flags { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +/* Indicates that we have pre-reserved space in the journal: */ #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) +/* Insert is being called from journal reclaim path: */ +#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2d68f4eaca34..d12d5e46a007 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -529,11 +529,20 @@ static void btree_update_nodes_written(struct btree_update *as) * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node. */ + + /* + * We can't call into journal reclaim here: we'd block on the journal + * reclaim lock, but we may need to release the open buckets we have + * pinned in order for other btree updates to make forward progress, and + * journal reclaim does btree updates when flushing bkey_cached entries, + * which may require allocations as well. 
+ */ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_JOURNAL_RESERVED, btree_update_nodes_written_trans(&trans, as)); BUG_ON(ret && !bch2_journal_error(&c->journal)); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9fbbd2a72e14..2d0f101a6303 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -6,6 +6,7 @@ #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "buckets.h" #include "debug.h" @@ -32,6 +33,9 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, { bch2_btree_node_lock_write(b, iter); + if (btree_iter_type(iter) == BTREE_ITER_CACHED) + return; + if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_btree_iter_reinit_node(iter, b); @@ -202,6 +206,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, return true; } +/* Cached btree updates: */ + /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, @@ -284,6 +290,31 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } +static enum btree_insert_ret +btree_key_can_insert_cached(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + unsigned *u64s) +{ + struct bkey_cached *ck = (void *) iter->l[0].b; + unsigned new_u64s; + struct bkey_i *new_k; + + BUG_ON(iter->level); + + if (*u64s <= ck->u64s) + return BTREE_INSERT_OK; + + new_u64s = roundup_pow_of_two(*u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) + return -ENOMEM; + + ck->u64s = new_u64s; + ck->k = new_k; + return BTREE_INSERT_OK; +} + static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -297,7 +328,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans, insert->k.needs_whiteout = false; - did_work = btree_insert_key_leaf(trans, iter, insert); + did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) + ? btree_insert_key_leaf(trans, iter, insert) + : bch2_btree_insert_key_cached(trans, iter, insert); if (!did_work) return; @@ -335,10 +368,16 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_insert_entry *i; - trans_for_each_update(trans, i) - if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b))) + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + */ + BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); + + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i->iter, i->k, NULL, i->trigger_flags|BTREE_TRIGGER_GC); + } } static inline int @@ -371,7 +410,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s = 0; u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i->iter, i->k, &u64s); + ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED + ? 
btree_key_can_insert(trans, i->iter, i->k, &u64s) + : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s); if (ret) { *stopped_at = i; return ret; @@ -467,7 +508,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&trans->c->journal, &trans->journal_preres, trans->journal_preres_u64s, - JOURNAL_RES_GET_NONBLOCK); + JOURNAL_RES_GET_NONBLOCK| + ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) + ? JOURNAL_RES_GET_RECLAIM : 0)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s); @@ -523,7 +566,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans->nounlock = true; trans_for_each_update2(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && + !same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); @@ -808,6 +852,14 @@ int __bch2_trans_commit(struct btree_trans *trans) return ret; } +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && + !(i->trigger_flags & BTREE_TRIGGER_NORUN)) + bch2_btree_key_cache_verify_clean(trans, + i->iter->btree_id, i->iter->pos); +#endif + /* * Running triggers will append more updates to the list of updates as * we're walking it: @@ -880,7 +932,8 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(i->iter->locks_want < 1); u64s = jset_u64s(i->k->k.u64s); - if (0) + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; trans->journal_u64s += u64s; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5ee978c94568..36c62888f80a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1816,6 +1816,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + struct bkey_cached *ck = (void *) iter->l[0].b; + + return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), + 0, 0, BTREE_TRIGGER_OVERWRITE); + } + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5ca84c9761e9..62e322e959d3 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -346,6 +346,37 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq, journal_wake(j); } +void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (journal_pin_active(pin) && pin->seq < seq) + return; + + spin_lock(&j->lock); + + if (pin->seq != seq) { + bch2_journal_pin_add_locked(j, seq, pin, flush_fn); + } else { + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, seq); + + /* + * If the pin is already pinning the right sequence number, it + * still might've already been flushed: + */ + list_move(&pin->list, &pin_list->list); + } + + spin_unlock(&j->lock); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *dst, struct journal_entry_pin *src, diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 272ba8a37967..8128907a7623 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -42,6 +42,10 @@ 
static inline void bch2_journal_pin_add(struct journal *j, u64 seq, __bch2_journal_pin_add(j, seq, pin, flush_fn); } +void bch2_journal_pin_update(struct journal *, u64, + struct journal_entry_pin *, + journal_pin_flush_fn); + void bch2_journal_pin_copy(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3a7c48def9e8..a680bf8d95f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -13,6 +13,7 @@ #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" #include "chardev.h" @@ -479,6 +480,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_io_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); @@ -650,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); @@ -746,6 +749,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_journal_init(&c->journal) || bch2_fs_replicas_init(c) || bch2_fs_btree_cache_init(c) || + bch2_fs_btree_key_cache_init(&c->btree_key_cache) || bch2_fs_btree_iter_init(c) || bch2_fs_btree_interior_update_init(c) || bch2_fs_io_init(c) || -- cgit From 5d20ba48f00050d8e6498cfbbb93b2914bd97114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Oct 2019 12:54:53 -0400 Subject: bcachefs: Use cached iterators for alloc btree Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 51 +++++------------- fs/bcachefs/alloc_background.h | 1 - fs/bcachefs/btree_update_leaf.c | 3 ++ fs/bcachefs/buckets.c | 85 +++++++++++++++-------------- fs/bcachefs/journal_reclaim.c | 3 ++ fs/bcachefs/journal_types.h | 1 + fs/bcachefs/recovery.c | 117 +++++++++++++++++++++++++++++++--------- fs/bcachefs/super.c | 30 +++++++++-- fs/bcachefs/sysfs.c | 8 --- 9 files changed, 184 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 44ad9821c807..678218ca0feb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_cache.h" #include "btree_io.h" +#include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" @@ -276,6 +277,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bkey_i_alloc *a; int ret; retry: + bch2_trans_begin(trans); + + ret = bch2_btree_key_cache_flush(trans, + BTREE_ID_ALLOC, iter->pos); + if (ret) + goto err; + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -330,7 +338,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c, 0, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -364,25 +372,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) return ret < 0 ? 
ret : 0; } -int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = bch2_alloc_write_key(&trans, iter, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); - bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; -} - /* Bucket IO clocks: */ static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) @@ -840,7 +829,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - struct bkey_s_c k; bool invalidating_cached_data; size_t b; int ret = 0; @@ -892,27 +880,14 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; percpu_down_read(&c->mark_lock); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - - if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { - /* - * During journal replay, and if gc repairs alloc info at - * runtime, the alloc info in the btree might not be up to date - * yet - so, trust the in memory mark: - */ - u = alloc_mem_to_key(g, m); - } else { - u = bch2_alloc_unpack(k); - u.read_time = g->io_time[READ]; - u.write_time = g->io_time[WRITE]; - } + u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); @@ -1000,7 +975,9 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b53a27450889..f6b9f27f0713 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -54,7 +54,6 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); struct journal_keys; int bch2_alloc_read(struct bch_fs *, struct journal_keys *); -int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); static inline void bch2_wake_allocator(struct bch_dev *ca) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2d0f101a6303..30839ccbf517 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -178,6 +178,9 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + EBUG_ON(!iter->level && + !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); + if (unlikely(!bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) return false; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 36c62888f80a..1683833568a7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1455,13 +1455,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static int trans_get_key(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct btree_iter **iter, - struct bkey_s_c *k) +static struct btree_iter *trans_get_update(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + struct bkey_s_c *k) { struct btree_insert_entry 
*i; - int ret; trans_for_each_update(trans, i) if (i->iter->btree_id == btree_id && @@ -1469,17 +1467,33 @@ static int trans_get_key(struct btree_trans *trans, ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && bkey_cmp(pos, i->k->k.p) < 0 : !bkey_cmp(pos, i->iter->pos))) { - *iter = i->iter; - *k = bkey_i_to_s_c(i->k); - return 1; + *k = bkey_i_to_s_c(i->k); + return i->iter; } + return NULL; +} + +static int trans_get_key(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + struct btree_iter **iter, + struct bkey_s_c *k) +{ + unsigned flags = btree_id != BTREE_ID_ALLOC + ? BTREE_ITER_SLOTS + : BTREE_ITER_CACHED; + int ret; + + *iter = trans_get_update(trans, btree_id, pos, k); + if (*iter) + return 1; + *iter = bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + flags|BTREE_ITER_INTENT); if (IS_ERR(*iter)) return PTR_ERR(*iter); - *k = bch2_btree_iter_peek_slot(*iter); + *k = __bch2_btree_iter_peek(*iter, flags); ret = bkey_err(*k); if (ret) bch2_trans_iter_put(trans, *iter); @@ -1492,45 +1506,34 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); struct btree_iter *iter; struct bkey_s_c k_a; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; struct bucket *g; - struct bucket_mark m; int ret; - ret = trans_get_key(trans, BTREE_ID_ALLOC, - POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), - &iter, &k_a); - if (ret < 0) - return ret; - - percpu_down_read(&c->mark_lock); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - - if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags) && !ret)) { - /* - * During journal replay, and if gc repairs alloc info at - * runtime, the alloc info in the btree might not be up to date - * yet - so, trust the in memory mark - unless we're already - * updating that key: - */ - u = alloc_mem_to_key(g, m); + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); + if (iter) { + u = bch2_alloc_unpack(k_a); } else { - u = bch2_alloc_unpack(k_a); - u.read_time = g->io_time[READ]; - u.write_time = g->io_time[WRITE]; - } - - percpu_up_read(&c->mark_lock); + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; - /* - * Incrementing the bucket gen can be done lazily: - */ - if (gen_after(m.gen, u.gen) && !u.data_type) - u.gen = m.gen; + percpu_down_read(&c->mark_lock); + g = bucket(ca, pos.offset); + u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + } ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); @@ -1543,7 +1546,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; + a->k.p = pos; bch2_alloc_pack(a, u); bch2_trans_update(trans, iter, &a->k_i, 0); out: diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 62e322e959d3..1162acffdf45 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -416,6 +416,9 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; + if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) + return NULL; + spin_lock(&j->lock); 
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 0585e9b6e230..6d0ee8e42da1 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -125,6 +125,7 @@ union journal_preres_state { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, + JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_NOT_EMPTY, JOURNAL_MAY_GET_UNRESERVED, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 26e5767aa5de..41b864dcdc39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -292,17 +292,6 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) cmp_int(l->journal_offset, r->journal_offset); } -static int journal_sort_seq_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - - return cmp_int(r->level, l->level) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->k->k.p, r->k->k.p); -} - void bch2_journal_keys_free(struct journal_keys *keys) { kvfree(keys->d); @@ -518,11 +507,48 @@ static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, __bch2_journal_replay_key(&trans, id, level, k)); } +static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) +{ + struct btree_iter *iter; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY, + __bch2_alloc_replay_key(&trans, k)); +} + +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(r->level, l->level) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->k->k.p, r->k->k.p); +} + static int bch2_journal_replay(struct bch_fs *c, struct journal_keys keys) { struct journal *j = &c->journal; struct journal_key *i; + u64 seq; int ret; sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); @@ -530,26 +556,63 @@ static int bch2_journal_replay(struct bch_fs *c, if (keys.nr) replay_now_at(j, keys.journal_seq_base); + seq = j->replay_journal_seq; + + /* + * First replay updates to the alloc btree - these will only update the + * btree key cache: + */ for_each_journal_key(keys, i) { - if (!i->level) - replay_now_at(j, keys.journal_seq_base + i->journal_seq); + cond_resched(); - if (i->level) - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); - if (i->btree_id == BTREE_ID_ALLOC) + if (!i->level && i->btree_id == BTREE_ID_ALLOC) { + j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ret = bch2_alloc_replay_key(c, i->k); - else if (i->k->k.size) - ret = bch2_extent_replay_key(c, i->btree_id, i->k); - else - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + if (ret) + goto err; + } + } - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - return ret; + /* + * Next replay updates to interior btree nodes: + */ + for_each_journal_key(keys, i) { + cond_resched(); + + if (i->level) { + j->replay_journal_seq = keys.journal_seq_base + 
i->journal_seq; + ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + if (ret) + goto err; } + } + + /* + * Now that the btree is in a consistent state, we can start journal + * reclaim (which will be flushing entries from the btree key cache back + * to the btree: + */ + set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); + set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); + + j->replay_journal_seq = seq; + /* + * Now replay leaf node updates: + */ + for_each_journal_key(keys, i) { cond_resched(); + + if (i->level || i->btree_id == BTREE_ID_ALLOC) + continue; + + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + + ret = i->k->k.size + ? bch2_extent_replay_key(c, i->btree_id, i->k) + : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + if (ret) + goto err; } replay_now_at(j, j->replay_journal_seq_end); @@ -558,6 +621,9 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); return bch2_journal_error(j); +err: + bch_err(c, "journal replay: error %d while replaying key", ret); + return ret; } static bool journal_empty(struct list_head *journal) @@ -1183,6 +1249,9 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); + set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); + set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a680bf8d95f1..9bc470e68cc9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1389,6 +1389,31 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ +int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct btree_trans trans; + size_t i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < ca->mi.nbuckets; i++) { + ret = bch2_btree_key_cache_flush(&trans, + BTREE_ID_ALLOC, POS(ca->dev_idx, i)); + if (ret) + break; + } + bch2_trans_exit(&trans); + + if (ret) + return ret; + + return bch2_btree_delete_range(c, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), + NULL); +} + int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_sb_field_members *mi; @@ -1422,10 +1447,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, - POS(ca->dev_idx, 0), - POS(ca->dev_idx + 1, 0), - NULL); + ret = bch2_dev_remove_alloc(c, ca); if (ret) { bch_err(ca, "Remove failed, error deleting alloc info"); goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1800e0f7f81e..bda9eb1598b8 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -134,7 +134,6 @@ do { \ write_attribute(trigger_journal_flush); write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); -write_attribute(trigger_alloc_write); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -498,12 +497,6 @@ STORE(bch2_fs) #endif } - if (attr == &sysfs_trigger_alloc_write) { - bool wrote; - - bch2_alloc_write(c, 0, &wrote); - } - if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -587,7 +580,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, - &sysfs_trigger_alloc_write, &sysfs_prune_cache, &sysfs_copy_gc_enabled, -- cgit From e27b03b35b09f9cbee9a69bd99951d7adaf02fa7 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 16:59:36 -0400 Subject: bcachefs: Give bkey_cached_key same attributes as bpos Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 345a06bac0fe..f741bb79d49b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -298,7 +298,7 @@ struct btree_key_cache { struct bkey_cached_key { u32 btree_id; struct bpos pos; -} __packed; +} __attribute__((packed, aligned(4))); #define BKEY_CACHED_DIRTY 0 -- cgit From 7dd1ebfa1ec156d54238126c3ce7f0fb31ea7a45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 17:38:26 -0400 Subject: bcachefs: Increase size of btree node reserve Also tweak the allocator to be more aggressive about keeping it full. The recent changes to make updates to interior nodes transactional (and thus generate updates to the alloc btree) all put more stress on the btree node reserves. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 9 ++++++--- fs/bcachefs/bcachefs.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 678218ca0feb..93ee5cdfbe35 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -501,6 +501,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; + u64 available; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -516,9 +517,11 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - if ((ssize_t) (dev_buckets_available(c, ca) - - ca->inc_gen_really_needs_gc) >= - (ssize_t) fifo_free(&ca->free_inc)) + available = max_t(s64, 0, dev_buckets_available(c, ca) - + ca->inc_gen_really_needs_gc); + + if (available > fifo_free(&ca->free_inc) || + (available && !fifo_full(&ca->free[RESERVE_BTREE]))) break; up_read(&c->gc_lock); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d293afcda75a..42e3395884c1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -340,7 +340,7 @@ enum bch_time_stats { #define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) /* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) #define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) -- cgit From 1d1867896210e0ed6d137cebea83abe01bbb8ffc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 17:59:09 -0400 Subject: bcachefs: delete a slightly faulty assertion state lock isn't held at startup Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1683833568a7..2c9ba18357fd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1977,8 +1977,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) int ret = -ENOMEM; unsigned i; - lockdep_assert_held(&c->state_lock); - memset(&free, 0, sizeof(free)); memset(&free_inc, 0, sizeof(free_inc)); memset(&alloc_heap, 0, 
sizeof(alloc_heap)); -- cgit From d211b408abbcec99531bc262a8e9973c86a6d856 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 19:53:46 -0400 Subject: bcachefs: Fix lock ordering with new btree cache code The code that checks lock ordering was recently changed to go off of the pos of the btree node, rather than the iterator, but the btree cache code didn't update to handle iterators that point to cached bkeys. Oops Also, update various debug code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 94 +++++++++++++++++++++++++++++++++---------- fs/bcachefs/btree_key_cache.c | 25 ++++++++++++ fs/bcachefs/btree_key_cache.h | 2 + fs/bcachefs/sysfs.c | 11 +++++ 4 files changed, 110 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 592663a00182..2bd02e804b76 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -185,6 +185,14 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, return iter->uptodate < BTREE_ITER_NEED_RELOCK; } +static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, + enum btree_iter_type type) +{ + return type != BTREE_ITER_CACHED + ? container_of(_b, struct btree, c)->key.k.p + : container_of(_b, struct bkey_cached, c)->key.pos; +} + /* Slowpath: */ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, @@ -253,7 +261,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (iter->btree_id == linked->btree_id && btree_node_locked(linked, level) && - bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0) + bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, + btree_iter_type(linked))) <= 0) ret = false; /* @@ -435,6 +444,22 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG +static void bch2_btree_iter_verify_cached(struct btree_iter *iter) +{ + struct bkey_cached *ck; + bool locked = btree_node_locked(iter, 0); + + if (!bch2_btree_node_relock(iter, 0)) + return; + + ck = (void *) iter->l[0].b; + BUG_ON(ck->key.btree_id != iter->btree_id || + bkey_cmp(ck->key.pos, iter->pos)); + + if (!locked) + btree_node_unlock(iter, 0); +} + static void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned level) { @@ -449,6 +474,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!debug_check_iterators(iter->trans->c)) return; + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (!level) + bch2_btree_iter_verify_cached(iter); + return; + } + BUG_ON(iter->level < iter->min_depth); if (!btree_iter_node(iter, level)) @@ -1257,13 +1288,14 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return ret; } -static inline void bch2_btree_iter_checks(struct btree_iter *iter, - enum btree_iter_type type) +static inline void bch2_btree_iter_checks(struct btree_iter *iter) { + enum btree_iter_type type = btree_iter_type(iter); + EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON(btree_iter_type(iter) != type); - BUG_ON(type == BTREE_ITER_KEYS && + BUG_ON((type == BTREE_ITER_KEYS || + type == BTREE_ITER_CACHED) && (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || bkey_cmp(iter->pos, iter->k.p) > 0)); @@ -1278,7 +1310,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + bch2_btree_iter_checks(iter); if (iter->uptodate == 
BTREE_ITER_UPTODATE) return iter->l[iter->level].b; @@ -1306,7 +1339,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree *b; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + bch2_btree_iter_checks(iter); /* already got to end? */ if (!btree_iter_node(iter, iter->level)) @@ -1534,7 +1568,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_checks(iter); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1621,7 +1656,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_checks(iter); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1681,7 +1717,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_checks(iter); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1717,7 +1754,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_checks(iter); if (unlikely(!bkey_cmp(pos, POS_MIN))) return bkey_s_c_null; @@ -1798,7 +1836,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_checks(iter); if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); @@ -1844,7 +1883,8 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) struct bkey_cached *ck; int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_CACHED); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); + bch2_btree_iter_checks(iter); ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ -2324,6 +2364,15 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? -EIO : 0; } +static void bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) +{ + pr_buf(out, " %px l=%u %s:", + _b, _b->level, bch2_btree_ids[_b->btree_id]); + bch2_bpos_to_text(out, btree_node_pos(_b, type)); +} + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -2348,11 +2397,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(iter, l)) { - b = iter->l[l].b; - - pr_buf(out, " %px %s l=%u ", - b, btree_node_intent_locked(iter, l) ? "i" : "r", l); - bch2_bpos_to_text(out, b->key.k.p); + pr_buf(out, " %s l=%u ", + btree_node_intent_locked(iter, l) ? 
"i" : "r", l); + bch2_btree_iter_node_to_text(out, + (void *) iter->l[l].b, + btree_iter_type(iter)); pr_buf(out, "\n"); } } @@ -2366,10 +2415,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); - pr_buf(out, " node %px l=%u %s:", - b, b->c.level, - bch2_btree_ids[b->c.btree_id]); - bch2_bpos_to_text(out, b->key.k.p); + + pr_buf(out, " node "); + bch2_btree_iter_node_to_text(out, + (void *) b, + btree_iter_type(&trans->iters[trans->locking_iter_idx])); pr_buf(out, "\n"); } } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 2feff59e755a..1be01035869f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -1,5 +1,6 @@ #include "bcachefs.h" +#include "btree_cache.h" #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" @@ -492,3 +493,27 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) { return rhashtable_init(&c->table, &bch2_btree_key_cache_params); } + +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct bkey_cached *ck; + struct rhash_head *pos; + size_t i; + + mutex_lock(&c->lock); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + pr_buf(out, "%s:", + bch2_btree_ids[ck->key.btree_id]); + bch2_bpos_to_text(out, ck->key.pos); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + pr_buf(out, " journal seq %llu", ck->journal.seq); + pr_buf(out, "\n"); + } + } + mutex_unlock(&c->lock); +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index fbc29336091f..b1756c6c622c 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -20,4 +20,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); int bch2_fs_btree_key_cache_init(struct btree_key_cache *); +void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index bda9eb1598b8..67c0f6d2b219 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -14,6 +14,7 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" @@ -165,6 +166,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(internal_uuid); @@ -401,6 +403,14 @@ SHOW(bch2_fs) if (attr == &sysfs_dirty_btree_nodes) return bch2_dirty_btree_nodes_print(c, buf); + + if (attr == &sysfs_btree_key_cache) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); + + bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); + return out.pos - buf; + } + if (attr == &sysfs_btree_transactions) { struct printbuf out = _PBUF(buf, PAGE_SIZE); @@ -571,6 +581,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_read_realloc_races, -- cgit From 47a5649a0a934b3186f34d90ddfe31bb4bcf4fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jun 2020 20:18:02 -0400 Subject: bcachefs: Fix 
incorrect gfp check Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9423cff1539f..dedb2790445d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -310,7 +310,7 @@ restart: if (freed >= nr) goto out; - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) goto out; -- cgit From b9c3d13978120344d8b999d1c8bc4e3096d5a18f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jun 2020 17:30:38 -0400 Subject: bcachefs: Fix a deadlock in the RO path Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++++++-- fs/bcachefs/buckets.c | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3775b65a89a6..cdd4bc334530 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -932,7 +932,12 @@ int bch2_gc_gens(struct bch_fs *c) unsigned i; int ret; - down_read(&c->state_lock); + /* + * Ideally we would be using state_lock and not gc_lock here, but that + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ + down_read(&c->gc_lock); for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); @@ -959,7 +964,7 @@ int bch2_gc_gens(struct bch_fs *c) up_read(&ca->bucket_lock); } err: - up_read(&c->state_lock); + up_read(&c->gc_lock); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2c9ba18357fd..1ae9403847ca 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2003,6 +2003,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bch2_copygc_stop(ca); if (resize) { + down_write(&c->gc_lock); down_write(&ca->bucket_lock); percpu_down_write(&c->mark_lock); } @@ -2025,8 +2026,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) swap(ca->buckets_nouse, buckets_nouse); - if (resize) + if (resize) { percpu_up_write(&c->mark_lock); + up_write(&c->gc_lock); + } spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) { -- cgit From a34782a0663cc3a5d8c4e4657480fa0e6ddc8a16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jun 2020 17:33:53 -0400 Subject: bcachefs: Change bch2_dump_bset() to also print key values Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 41 ++++++++++++++++++++--------------------- fs/bcachefs/bset.h | 4 ++-- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/debug.c | 6 +++--- 4 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 09711352094c..797deaf0ad2e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -52,21 +52,27 @@ struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) * by the time we actually do the insert will all be deleted. 
*/ -void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) +void bch2_dump_bset(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned set) { struct bkey_packed *_k, *_n; - struct bkey k, n; - char buf[120]; + struct bkey uk, n; + struct bkey_s_c k; + char buf[200]; if (!i->u64s) return; - for (_k = i->start, k = bkey_unpack_key(b, _k); + for (_k = i->start; _k < vstruct_last(i); - _k = _n, k = n) { + _k = _n) { _n = bkey_next_skip_noops(_k, vstruct_last(i)); - bch2_bkey_to_text(&PBUF(buf), &k); + k = bkey_disassemble(b, _k, &uk); + if (c) + bch2_bkey_val_to_text(&PBUF(buf), c, k); + else + bch2_bkey_to_text(&PBUF(buf), k.k); printk(KERN_ERR "block %u key %5zu: %s\n", set, _k->_data - i->_data, buf); @@ -75,31 +81,24 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) n = bkey_unpack_key(b, _n); - if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { + if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { printk(KERN_ERR "Key skipped backwards\n"); continue; } - /* - * Weird check for duplicate non extent keys: extents are - * deleted iff they have 0 size, so if it has zero size and it's - * not deleted these aren't extents: - */ - if (((!k.size && !bkey_deleted(&k)) || - (!n.size && !bkey_deleted(&n))) && - !bkey_deleted(&k) && - !bkey_cmp(n.p, k.p)) + if (!bkey_deleted(k.k) && + !bkey_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } } -void bch2_dump_btree_node(struct btree *b) +void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) { struct bset_tree *t; console_lock(); for_each_bset(b, t) - bch2_dump_bset(b, bset(b, t), t - b->set); + bch2_dump_bset(c, b, bset(b, t), t - b->set); console_unlock(); } @@ -158,7 +157,7 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct bkey nu = bkey_unpack_key(b, n); char buf1[80], buf2[80]; - bch2_dump_btree_node(b); + bch2_dump_btree_node(NULL, b); bch2_bkey_to_text(&PBUF(buf1), &ku); bch2_bkey_to_text(&PBUF(buf2), &nu); printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", @@ -236,7 +235,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, char buf1[100]; char buf2[100]; - bch2_dump_btree_node(b); + bch2_dump_btree_node(NULL, b); bch2_bkey_to_text(&PBUF(buf1), &k1); bch2_bkey_to_text(&PBUF(buf2), &k2); @@ -257,7 +256,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, char buf1[100]; char buf2[100]; - bch2_dump_btree_node(b); + bch2_dump_btree_node(NULL, b); bch2_bkey_to_text(&PBUF(buf1), &k1); bch2_bkey_to_text(&PBUF(buf2), &k2); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 50d0ce7d1afa..a2e5e3ee68db 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -615,8 +615,8 @@ void bch2_bfloat_to_text(struct printbuf *, struct btree *, /* Debug stuff */ -void bch2_dump_bset(struct btree *, struct bset *, unsigned); -void bch2_dump_btree_node(struct btree *); +void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); +void bch2_dump_btree_node(struct bch_fs *, struct btree *); void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5325c24548f9..2a253380fef9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -897,7 +897,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf1), &up); bch2_bkey_to_text(&PBUF(buf2), u.k); - bch2_dump_bset(b, i, 0); + bch2_dump_bset(c, b, i, 0); btree_err(BTREE_ERR_FATAL, c, b, i, "keys out of order: 
%s > %s", buf1, buf2); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7adc5ae20b9f..be97cbba12e7 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -97,10 +97,10 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) console_lock(); printk(KERN_ERR "*** in memory:\n"); - bch2_dump_bset(b, inmemory, 0); + bch2_dump_bset(c, b, inmemory, 0); printk(KERN_ERR "*** read back in:\n"); - bch2_dump_bset(v, sorted, 0); + bch2_dump_bset(c, v, sorted, 0); while (offset < b->written) { if (!offset ) { @@ -117,7 +117,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) } printk(KERN_ERR "*** on disk block %u:\n", offset); - bch2_dump_bset(b, i, offset); + bch2_dump_bset(c, b, i, offset); offset += sectors; } -- cgit From eff508b459fb90682c7a5c4e6d2de0b1d18217e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jun 2020 18:20:26 -0400 Subject: bcachefs: Add a kthread_should_stop() check to allocator thread Turns out it's possible during shutdown for the allocator to get stuck spinning on bch2_invalidate_buckets() without hitting any of the other checks. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 93ee5cdfbe35..98dd4995e528 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1104,6 +1104,8 @@ static int bch2_allocator_thread(void *arg) while (1) { cond_resched(); + if (kthread_should_stop()) + break; pr_debug("discarding %zu invalidated buckets", fifo_used(&ca->free_inc)); -- cgit From 937f503605695d2b564394afdfa59b866accd915 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jun 2020 17:16:29 -0400 Subject: bcachefs: Use btree reserve when appropriate Whenever we're doing an update that has pointers, that generally means we need to do the update in order to release open bucket references - so we should be using the btree open bucket reserve. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d12d5e46a007..552c1ab2ce28 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1398,14 +1398,14 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as; struct closure cl; int ret = 0; - struct btree_iter *linked; + struct btree_insert_entry *i; /* * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - trans_for_each_iter(trans, linked) - if (linked->btree_id == BTREE_ID_EXTENTS) + trans_for_each_update(trans, i) + if (btree_node_type_needs_gc(i->iter->btree_id)) flags |= BTREE_INSERT_USE_RESERVE; closure_init_stack(&cl); -- cgit From 649a9b68ac126fc7c8735892d3f833b620c9cbde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jun 2020 21:06:42 -0400 Subject: bcachefs: Track sectors of erasure coded data Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 4 ++++ fs/bcachefs/buckets.c | 28 ++++++++++++++++++---------- fs/bcachefs/buckets_types.h | 4 +++- fs/bcachefs/chardev.c | 9 ++++++--- fs/bcachefs/sysfs.c | 2 ++ 5 files changed, 33 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index d7f25e52dc71..923001188a88 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -275,9 +275,13 @@ struct bch_ioctl_dev_usage { __u32 bucket_size; __u64 nr_buckets; + __u64 available_buckets; __u64 buckets[BCH_DATA_NR]; __u64 sectors[BCH_DATA_NR]; + + __u64 ec_buckets; + __u64 ec_sectors; }; /* diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1ae9403847ca..1198c7bbeab9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -374,6 +374,11 @@ static inline int is_fragmented_bucket(struct bucket_mark m, return 0; } +static inline int bucket_stripe_sectors(struct bucket_mark m) +{ + return m.stripe ? 
m.dirty_sectors : 0; +} + static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -443,30 +448,33 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bucket_mark old, struct bucket_mark new, bool gc) { - struct bch_dev_usage *dev_usage; + struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - dev_usage = this_cpu_ptr(ca->usage[gc]); + u = this_cpu_ptr(ca->usage[gc]); if (bucket_type(old)) - account_bucket(fs_usage, dev_usage, bucket_type(old), + account_bucket(fs_usage, u, bucket_type(old), -1, -ca->mi.bucket_size); if (bucket_type(new)) - account_bucket(fs_usage, dev_usage, bucket_type(new), + account_bucket(fs_usage, u, bucket_type(new), 1, ca->mi.bucket_size); - dev_usage->buckets_ec += (int) new.stripe - (int) old.stripe; - dev_usage->buckets_unavailable += + u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); - dev_usage->sectors[old.data_type] -= old.dirty_sectors; - dev_usage->sectors[new.data_type] += new.dirty_sectors; - dev_usage->sectors[BCH_DATA_CACHED] += + u->buckets_ec += (int) new.stripe - (int) old.stripe; + u->sectors_ec += bucket_stripe_sectors(new) - + bucket_stripe_sectors(old); + + u->sectors[old.data_type] -= old.dirty_sectors; + u->sectors[new.data_type] += new.dirty_sectors; + u->sectors[BCH_DATA_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; - dev_usage->sectors_fragmented += + u->sectors_fragmented += is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); preempt_enable(); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 172b0ccf2b4f..b64b2fc9a896 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -52,12 +52,14 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; - u64 buckets_ec; u64 buckets_unavailable; /* _compressed_ sectors: */ u64 sectors[BCH_DATA_NR]; u64 sectors_fragmented; + + u64 buckets_ec; + u64 sectors_ec; }; struct bch_fs_usage { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 084bef5e7997..b46d32db4b58 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -470,9 +470,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, src = bch2_dev_usage_read(c, ca); - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; + arg.ec_buckets = src.buckets_ec; + arg.ec_sectors = src.sectors_ec; for (i = 0; i < BCH_DATA_NR; i++) { arg.buckets[i] = src.buckets[i]; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 67c0f6d2b219..30be49eb5da6 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -845,6 +845,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " erasure coded: %llu\n" " fragmented: %llu\n" " copygc threshold: %llu\n" "freelist_wait: %s\n" @@ -870,6 +871,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.sectors[BCH_DATA_BTREE], stats.sectors[BCH_DATA_USER], stats.sectors[BCH_DATA_CACHED], + stats.sectors_ec, stats.sectors_fragmented, ca->copygc_threshold, c->freelist_wait.list.first ? 
"waiting" : "empty", -- cgit From c61b7e21ecfff2096cdb84d86bd18f1ceab7de72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jun 2020 13:56:21 -0400 Subject: bcachefs: Fix a null ptr deref in bch2_btree_iter_traverse_one() We use sentinal values that aren't NULL to indicate there's a btree node at a higher level; occasionally, this may result in btree_iter_up_until_good_node() stopping at one of those sentinal values. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2bd02e804b76..7501556c0988 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1235,7 +1235,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) { + if (is_btree_node(iter, iter->level)) { BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); -- cgit From 64f2a8803ec8d3702a4b5225726f9c1dc685f43a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Jun 2020 18:11:12 -0400 Subject: bcachefs: Fix bch2_extent_can_insert() not being called It's supposed to check whether we're splitting a compressed extent and if so get a bigger disk reservation - hence this fixes a "disk usage increased by x without a reservaiton" bug. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 10 +++++++++ fs/bcachefs/btree_update_leaf.c | 26 +++++++++++----------- fs/bcachefs/buckets.c | 48 +++++++++++++++++++++++------------------ 3 files changed, 49 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f741bb79d49b..40cb4758a065 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -568,6 +568,16 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } +static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) +{ + return __btree_node_type(iter->level, iter->btree_id); +} + +static inline bool btree_iter_is_extents(struct btree_iter *iter) +{ + return btree_node_type_is_extents(btree_iter_key_type(iter)); +} + #define BTREE_NODE_TYPE_HAS_TRIGGERS \ ((1U << BKEY_TYPE_EXTENTS)| \ (1U << BKEY_TYPE_ALLOC)| \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 30839ccbf517..0609fc61ff39 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -265,11 +265,10 @@ static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert, - unsigned *u64s) + unsigned u64s) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; - static enum btree_insert_ret ret; if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; @@ -281,13 +280,7 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_old_extent_overwrite(b))) return BTREE_INSERT_BTREE_NODE_FULL; - ret = !(iter->flags & BTREE_ITER_IS_EXTENTS) - ? 
BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert); - if (ret) - return ret; - - if (*u64s > bch_btree_keys_u64s_remaining(c, b)) + if (unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) return BTREE_INSERT_BTREE_NODE_FULL; return BTREE_INSERT_OK; @@ -297,7 +290,7 @@ static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert, - unsigned *u64s) + unsigned u64s) { struct bkey_cached *ck = (void *) iter->l[0].b; unsigned new_u64s; @@ -305,10 +298,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); - if (*u64s <= ck->u64s) + if (u64s <= ck->u64s) return BTREE_INSERT_OK; - new_u64s = roundup_pow_of_two(*u64s); + new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) return -ENOMEM; @@ -414,8 +407,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, i->k, &u64s) - : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s); + ? btree_key_can_insert(trans, i->iter, i->k, u64s) + : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); if (ret) { *stopped_at = i; return ret; @@ -733,6 +726,11 @@ static int extent_update_to_keys(struct btree_trans *trans, struct bkey_i *insert) { struct btree_iter *iter; + int ret; + + ret = bch2_extent_can_insert(trans, orig_iter, insert); + if (ret) + return ret; if (bkey_deleted(&insert->k)) return 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1198c7bbeab9..4ea84cbac5d3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1368,8 +1368,8 @@ int bch2_mark_update(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; + struct btree *b = iter_l(iter)->b; + struct btree_node_iter node_iter = iter_l(iter)->iter; struct bkey_packed *_k; int ret = 0; @@ -1431,32 +1431,38 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, disk_res_sectors); trans_for_each_update(trans, i) { - struct btree_iter *iter = i->iter; - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; - struct bkey_packed *_k; - pr_err("while inserting"); bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); pr_err("%s", buf); pr_err("overlapping with"); - node_iter = iter->l[0].iter; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k; + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { + struct btree *b = iter_l(i->iter)->b; + struct btree_node_iter node_iter = iter_l(i->iter)->iter; + struct bkey_packed *_k; - k = bkey_disassemble(b, _k, &unpacked); + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k; - if (btree_node_is_extents(b) - ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(i->k->k.p, k.k->p)) - break; + pr_info("_k %px format %u", _k, _k->format); + k = bkey_disassemble(b, _k, &unpacked); - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); + if (btree_node_is_extents(b) + ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(i->k->k.p, k.k->p)) + break; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); - bch2_btree_node_iter_advance(&node_iter, b); + bch2_btree_node_iter_advance(&node_iter, b); + } + } else { + struct bkey_cached *ck = (void *) i->iter->l[0].b; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); + pr_err("%s", buf); } } } @@ -1808,8 +1814,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, struct bkey_i *insert, unsigned flags) { - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; + struct btree *b = iter_l(iter)->b; + struct btree_node_iter node_iter = iter_l(iter)->iter; struct bkey_packed *_k; int ret; -- cgit From 042a1f268e82678ea202390b8a69457aafacd4a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Jun 2020 18:22:06 -0400 Subject: bcachefs: Refactor dio write code to reinit bch_write_op This fixes a bug where the BCH_WRITE_SKIP_CLOSURE_PUT was set incorrectly, causing the completion to be delivered multiple times. oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 65 +++++++++++++++++++++++------------------------------ fs/bcachefs/io.c | 6 ++--- fs/bcachefs/io.h | 11 +++++---- 3 files changed, 35 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 98fe1ec7867d..f893bef34217 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -63,6 +63,7 @@ struct dio_write { sync:1, free_iov:1; struct quota_res quota_res; + u64 written; struct iov_iter iter; struct iovec inline_vecs[2]; @@ -1776,18 +1777,19 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } +static void bch2_dio_write_loop_async(struct bch_write_op *); + static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = file_bch_inode(req->ki_filp); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; unsigned unaligned; - u64 new_i_size; bool sync = dio->sync; long ret; @@ -1834,8 +1836,24 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; } - dio->op.pos = POS(inode->v.i_ino, - (req->ki_pos >> 9) + dio->op.written); + bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); + dio->op.end_io = bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + + if ((req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->op.flags |= BCH_WRITE_FLUSH; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_check_range_allocated(c, dio->op.pos, + bio_sectors(bio), dio->op.opts.data_replicas)) + goto err; task_io_account_write(bio->bi_iter.bi_size); @@ -1856,13 +1874,12 @@ do_io: loop: i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - dio->op.i_sectors_delta = 0; - - new_i_size = req->ki_pos + ((u64) dio->op.written << 9); + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; 
spin_lock(&inode->v.i_lock); - if (new_i_size > inode->v.i_size) - i_size_write(&inode->v, new_i_size); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); bio_for_each_segment_all(bv, bio, iter) @@ -1874,10 +1891,9 @@ loop: reinit_completion(&dio->done); } - ret = dio->op.error ?: ((long) dio->op.written << 9); + ret = dio->op.error ?: ((long) dio->written << 9); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) @@ -1912,7 +1928,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; bool locked = true, extending; @@ -1962,35 +1977,14 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->sync = is_sync_kiocb(req) || extending; dio->free_iov = false; dio->quota_res.sectors = 0; + dio->written = 0; dio->iter = *iter; - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = bch2_dio_write_loop_async; - dio->op.target = opts.foreground_target; - op_journal_seq_set(&dio->op, &inode->ei_journal_seq); - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; - - if ((req->ki_flags & IOCB_DSYNC) && - !c->opts.journal_flush_disabled) - dio->op.flags |= BCH_WRITE_FLUSH; - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) goto err_put_bio; - dio->op.nr_replicas = dio->op.opts.data_replicas; - - ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_check_range_allocated(c, POS(inode->v.i_ino, - req->ki_pos >> 9), - iter->count >> 9, - dio->op.opts.data_replicas)) - goto err_put_bio; - if (unlikely(mapping->nrpages)) { ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, @@ -2003,12 +1997,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) err: if (locked) inode_unlock(&inode->v); - if (ret > 0) - req->ki_pos += ret; return ret; err_put_bio: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 39a23c6570eb..ca27e7dff5e0 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -509,8 +509,7 @@ static void bch2_write_done(struct closure *cl) if (!op->error && (op->flags & BCH_WRITE_FLUSH)) op->error = bch2_journal_error(&c->journal); - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); + bch2_disk_reservation_put(c, &op->res); percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -1273,8 +1272,7 @@ void bch2_write(struct closure *cl) continue_at_nobarrier(cl, __bch2_write, NULL); return; err: - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); + bch2_disk_reservation_put(c, &op->res); if (op->end_io) { EBUG_ON(cl->parent); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index f0fe0bf906d3..b8fbfef29176 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -34,14 +34,13 @@ enum bch_write_flags { BCH_WRITE_PAGES_STABLE = (1 
<< 4), BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), - BCH_WRITE_NOPUT_RESERVATION = (1 << 7), - BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), - BCH_WRITE_FROM_INTERNAL = (1 << 9), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), + BCH_WRITE_FROM_INTERNAL = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), - BCH_WRITE_DONE = (1 << 12), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), + BCH_WRITE_DONE = (1 << 11), }; static inline u64 *op_journal_seq(struct bch_write_op *op) -- cgit From 52fbb7c859788bb10dc0f1527eedf80fd11ec9ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jun 2020 10:12:45 -0400 Subject: bcachefs: Don't cap ios in dio write path at 2 MB It appears this was erronious, a different bug was responsible Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f893bef34217..d379581c2517 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1797,22 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto loop; while (1) { - size_t extra = dio->iter.count - - min(BIO_MAX_VECS * PAGE_SIZE, dio->iter.count); - if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; - /* - * Don't issue more than 2MB at once, the bcachefs io path in - * io.c can't bounce more than that: - */ - - dio->iter.count -= extra; ret = bio_iov_iter_get_pages(bio, &dio->iter); - dio->iter.count += extra; current->faults_disabled_mapping = NULL; if (kthread) -- cgit From 306d40df7d27c99a8ec63fc730747a77959c4358 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Jul 2020 13:43:58 -0400 Subject: bcachefs: Use blk_status_to_str() Improved error messages are always a good thing Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 ++++--- fs/bcachefs/ec.c | 4 +++- fs/bcachefs/io.c | 6 ++++-- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/super-io.c | 3 ++- 5 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2a253380fef9..cb9abca07059 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1138,7 +1138,8 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", + blk_status_to_str(bio->bi_status)); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1423,8 +1424,8 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bio->bi_status == BLK_STS_REMOVED || - bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", + blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 074b811e9043..d35fa016cf0a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -360,7 +360,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, 
ca, "erasure coding")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + bio_data_dir(bio) ? "write" : "read", + blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); bio_put(&ec_bio->bio); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ca27e7dff5e0..5763654db310 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -624,7 +624,8 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); if (wbio->have_ioref) { @@ -1913,7 +1914,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", + blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a1bae99aeaab..4d20762f55d3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -960,7 +960,8 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { struct journal_buf *w = journal_prev_buf(j); unsigned long flags; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index eb5a91d232e0..8541db5e5e48 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -640,7 +640,8 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; closure_put(&ca->fs->sb_write); -- cgit From fff899b1d90089a3c77a20dbe48bd44a00161a6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jul 2020 16:32:00 -0400 Subject: bcachefs: Mark btree nodes as needing rewrite when not all replicas are RW This fixes a bug where recovery fails when one of the devices is read only. Also - consolidate the "must rewrite this node to insert it" behind a new btree node flag. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 12 +++++++++++- fs/bcachefs/btree_types.h | 2 ++ fs/bcachefs/btree_update_interior.c | 5 ++++- fs/bcachefs/btree_update_leaf.c | 13 ++----------- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cb9abca07059..d5240598e7d3 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -917,6 +917,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; + struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; unsigned u64s; @@ -971,8 +972,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_encrypt(c, i, b->written << 9); if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { set_btree_node_old_extent_overwrite(b); + set_btree_node_need_rewrite(b); + } sectors = vstruct_sectors(b->data, c->block_bits); } else { @@ -1098,6 +1101,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_needs_whiteout(btree_bset_first(b), true); btree_node_reset_sib_u64s(b); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_RW) + set_btree_node_need_rewrite(b); + } out: mempool_free(iter, &c->fill_iter); return retry_read; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 40cb4758a065..b9edf863e895 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -409,6 +409,7 @@ enum btree_flags { BTREE_NODE_dying, BTREE_NODE_fake, BTREE_NODE_old_extent_overwrite, + BTREE_NODE_need_rewrite, }; BTREE_FLAG(read_in_flight); @@ -423,6 +424,7 @@ BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); BTREE_FLAG(old_extent_overwrite); +BTREE_FLAG(need_rewrite); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 552c1ab2ce28..05d20a6f5efd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -290,8 +290,10 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { set_btree_node_old_extent_overwrite(b); + set_btree_node_need_rewrite(b); + } bch2_btree_build_aux_trees(b); @@ -1943,6 +1945,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); + set_btree_node_need_rewrite(b); b->c.level = 0; b->c.btree_id = id; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0609fc61ff39..262b4f3d9469 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -270,17 +270,8 @@ btree_key_can_insert(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; - if (unlikely(btree_node_fake(b))) - return BTREE_INSERT_BTREE_NODE_FULL; - - /* - * old bch2_extent_sort_fix_overlapping() algorithm won't work with new - * style extent updates: - */ - if (unlikely(btree_node_old_extent_overwrite(b))) - return 
BTREE_INSERT_BTREE_NODE_FULL; - - if (unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) + if (unlikely(btree_node_need_rewrite(b)) || + unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) return BTREE_INSERT_BTREE_NODE_FULL; return BTREE_INSERT_OK; -- cgit From 697e45b230d5523c5c119b1e6a868632def24451 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jul 2020 17:02:37 -0400 Subject: bcachefs: Kill BTREE_TRIGGER_NOOVERWRITES This is prep work for reworking the triggers machinery - we have triggers that need to know both the old and the new key. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 -- fs/bcachefs/buckets.c | 8 +------- fs/bcachefs/buckets.h | 3 --- fs/bcachefs/recovery.c | 13 ++++++++++--- 4 files changed, 11 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b9edf863e895..dd272318fba1 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -595,7 +595,6 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, @@ -608,7 +607,6 @@ enum btree_trigger_flags { }; #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4ea84cbac5d3..8044cf26fd22 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1310,7 +1310,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -inline int bch2_mark_overwrite(struct btree_trans *trans, +static int bch2_mark_overwrite(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c old, struct bkey_i *new, @@ -1384,9 +1384,6 @@ int bch2_mark_update(struct btree_trans *trans, fs_usage, trans->journal_res.seq, BTREE_TRIGGER_INSERT|flags); - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; - /* * For non extents, we only mark the new key, not the key being * overwritten - unless we're actually deleting: @@ -1830,9 +1827,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { struct bkey_cached *ck = (void *) iter->l[0].b; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index cea66c76850d..b897162c5e13 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -264,9 +264,6 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, struct disk_reservation *, unsigned); -int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 41b864dcdc39..1695a609ecd9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -442,11 +442,18 @@ retry: * regular keys */ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, !remark - ? 
BTREE_TRIGGER_NORUN - : BTREE_TRIGGER_NOOVERWRITES); + bch2_trans_update(&trans, split_iter, split, + BTREE_TRIGGER_NORUN); bch2_btree_iter_set_pos(iter, split->k.p); + + if (remark) { + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), + 0, split->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + goto err; + } } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { -- cgit From e63534a20117e937b3712acaedb98f208ff6b862 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jul 2020 19:16:25 -0400 Subject: bcachefs: Rework triggers interface The trigger for stripe keys is shortly going to need both the old and the new key passed to the trigger - this patch does that rework. For now, this just changes the in memory triggers, and this doesn't change how extent triggers work. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 297 ++++++++++++++++++++++++++++---------------------- fs/bcachefs/buckets.h | 4 +- 2 files changed, 169 insertions(+), 132 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8044cf26fd22..c02dee3e3164 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -714,7 +714,8 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_alloc(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -722,7 +723,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; - struct bucket_mark old, m; + struct bucket_mark old_m, m; + + /* We don't do anything for deletions - do we?: */ + if (new.k->type != KEY_TYPE_alloc) + return 0; /* * alloc btree is read in by bch2_alloc_read, not gc: @@ -731,15 +736,15 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + ca = bch_dev_bkey_exists(c, new.k->p.inode); - if (k.k->p.offset >= ca->mi.nbuckets) + if (new.k->p.offset >= ca->mi.nbuckets) return 0; - g = __bucket(ca, k.k->p.offset, gc); - u = bch2_alloc_unpack(k); + g = __bucket(ca, new.k->p.offset, gc); + u = bch2_alloc_unpack(new); - old = bucket_cmpxchg(g, m, ({ + old_m = bucket_cmpxchg(g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; @@ -752,7 +757,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, })); if (!(flags & BTREE_TRIGGER_ALLOC_READ)) - bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -765,11 +770,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old.cached_sectors) { + old_m.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, - -old.cached_sectors); - trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), - old.cached_sectors); + -old_m.cached_sectors); + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), + old_m.cached_sectors); } return 0; @@ -882,9 +887,9 @@ static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, struct bch_fs_usage *fs_usage, u64 journal_seq, - unsigned flags) + unsigned flags, + bool enabled) { - bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); bool gc = flags & 
BTREE_TRIGGER_GC; unsigned i; @@ -1104,12 +1109,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, return 0; } -static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_extent(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, unsigned offset, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1178,72 +1185,88 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_stripe(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - size_t idx = s.k->p.offset; + size_t idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; - spin_lock(&c->ec_stripes_heap_lock); - - if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { - spin_unlock(&c->ec_stripes_heap_lock); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) { - m->sectors = le16_to_cpu(s.v->sectors); - m->algorithm = s.v->algorithm; - m->nr_blocks = s.v->nr_blocks; - m->nr_redundant = s.v->nr_redundant; + if (!new_s) { + /* Deleting: */ + bucket_set_stripe(c, old_s, fs_usage, + journal_seq, flags, false); - bch2_bkey_to_replicas(&m->r.e, k); + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } - /* - * XXX: account for stripes somehow here - */ -#if 0 - update_replicas(c, fs_usage, &m->r.e, stripe_sectors); -#endif + memset(m, 0, sizeof(*m)); + } else { + BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); + BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); + + if (!old_s) + bucket_set_stripe(c, new_s, fs_usage, + journal_seq, flags, true); + + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + bch2_bkey_to_replicas(&m->r.e, new); /* gc recalculates these fields: */ if (!(flags & BTREE_TRIGGER_GC)) { - for (i = 0; i < s.v->nr_blocks; i++) { + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { m->block_sectors[i] = - stripe_blockcount_get(s.v, i); + stripe_blockcount_get(new_s, i); m->blocks_nonempty += !!m->block_sectors[i]; } } - if (!gc) + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } + m->alive = true; - } else { - if (!gc) - bch2_stripes_heap_del(c, m, idx); - memset(m, 0, sizeof(*m)); } - spin_unlock(&c->ec_stripes_heap_lock); - - bucket_set_stripe(c, s.v, fs_usage, 0, flags); return 0; } static int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey_s_c k = flags & 
BTREE_TRIGGER_INSERT ? new : old; int ret = 0; + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + preempt_disable(); if (!fs_usage || (flags & BTREE_TRIGGER_GC)) @@ -1252,7 +1275,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: - ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: @@ -1260,16 +1283,16 @@ static int bch2_mark_key_locked(struct bch_fs *c, ? c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_BTREE, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_USER, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: if (!(flags & BTREE_TRIGGER_OVERWRITE)) @@ -1295,82 +1318,38 @@ static int bch2_mark_key_locked(struct bch_fs *c, return ret; } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey deleted; + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; + bkey_init(&deleted); + percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, offset, sectors, - fs_usage, journal_seq, flags); + ret = bch2_mark_key_locked(c, old, new, offset, sectors, + fs_usage, journal_seq, + BTREE_TRIGGER_INSERT|flags); percpu_up_read(&c->mark_lock); return ret; } -static int bch2_mark_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c old, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags, - bool is_extents) -{ - struct bch_fs *c = trans->c; - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (is_extents - ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 - : bkey_cmp(new->k.p, old.k->p)) - return 0; - - if (is_extents) { - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - } - - return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; -} - int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, struct bch_fs_usage *fs_usage, unsigned flags) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_packed *_old; + struct bkey_s_c old; + struct bkey unpacked; int ret = 0; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1379,31 +1358,87 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, insert->k.size, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); + bkey_init(&unpacked); + old = (struct bkey_s_c) { &unpacked, NULL }; - /* - * For non extents, we only mark the new key, not the key being - * overwritten - unless we're actually deleting: - */ - if ((iter->btree_id == BTREE_ID_ALLOC || - iter->btree_id == BTREE_ID_EC) && - !bkey_deleted(&insert->k)) - return 0; + if (!btree_node_type_is_extents(iter->btree_id)) { + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + _old = bch2_btree_node_iter_peek(&node_iter, b); + if (_old) + old = bkey_disassemble(b, _old, &unpacked); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + if (ck->valid) + old = bkey_i_to_s_c(ck->k); + } - ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags, - btree_node_type_is_extents(iter->btree_id)); - if (ret <= 0) - break; + if (old.k->type == new->k.type) { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - bch2_btree_node_iter_advance(&node_iter, b); + } else { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + 0, new->k.size, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + + while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { + unsigned offset = 0; + s64 sectors; + + old = bkey_disassemble(b, _old, &unpacked); + sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + return 0; + + switch (bch2_extent_overlap(&new->k, old.k)) { + case 
BCH_EXTENT_OVERLAP_ALL: + offset = 0; + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + + BUG_ON(sectors >= 0); + + ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; + if (ret <= 0) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } } return ret; @@ -1458,8 +1493,10 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); + if (ck->valid) { + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); + pr_err("%s", buf); + } } } } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index b897162c5e13..5ba13b99bd65 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -259,8 +259,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, - struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, + s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, struct disk_reservation *, unsigned); -- cgit From ba6dd1dd493f4e621350fa963e3a95686aaf8a4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jul 2020 20:18:13 -0400 Subject: bcachefs: Improve stripe triggers/heap code Soon we'll be able to modify existing stripes - replacing empty blocks with new blocks and new p/q blocks. This patch updates the trigger code to handle pointers changing in an existing stripe; also, it significantly improves how the stripes heap works, which means we can get rid of the stripe creation/deletion lock. 
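To make the shape of the reworked trigger concrete, here is a rough standalone C sketch (simplified, made-up types standing in for bch_stripe and bch_extent_ptr, not the kernel code itself): the trigger walks the old and new stripe keys block by block and only releases/claims buckets whose pointer actually changed, which is what lets an existing stripe be edited in place.

/* illustrative sketch only - toy types, toy "bucket flag" */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct ptr  { unsigned dev, offset; };
struct strp { unsigned nr_blocks; struct ptr ptrs[16]; };

/* stand-in for bucket_set_stripe(): flip the "bucket belongs to a stripe"
 * flag for one pointer */
static void set_stripe_flag(const struct ptr *p, bool enabled)
{
	printf("bucket %u:%u stripe=%d\n", p->dev, p->offset, enabled);
}

static void mark_stripe(const struct strp *old_s, const struct strp *new_s)
{
	unsigned i;

	if (!new_s) {			/* deletion: release every old bucket */
		for (i = 0; i < old_s->nr_blocks; i++)
			set_stripe_flag(&old_s->ptrs[i], false);
		return;
	}

	for (i = 0; i < new_s->nr_blocks; i++) {
		/* unchanged pointers are left alone */
		if (old_s && !memcmp(&old_s->ptrs[i], &new_s->ptrs[i],
				     sizeof(struct ptr)))
			continue;

		if (old_s)		/* block replaced: release the old bucket */
			set_stripe_flag(&old_s->ptrs[i], false);
		set_stripe_flag(&new_s->ptrs[i], true);
	}
}

int main(void)
{
	struct strp old_s = { 2, { { 0, 100 }, { 1, 200 } } };
	struct strp new_s = old_s;

	new_s.ptrs[1] = (struct ptr) { 2, 300 };	/* replace block 1 only */
	mark_stripe(&old_s, &new_s);
	return 0;
}

In the patch itself the flag flip is bucket_set_stripe(), i.e. a bucket_cmpxchg() plus bch2_dev_usage_update(), but the per-block compare-and-toggle structure is the same.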
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 5 +- fs/bcachefs/buckets.c | 103 ++++++++++++++++++++--------------------- fs/bcachefs/ec.c | 122 +++++++++++++++++++++++++++++++------------------ fs/bcachefs/ec.h | 2 + fs/bcachefs/ec_types.h | 1 + fs/bcachefs/super.c | 1 - fs/bcachefs/sysfs.c | 9 ++++ 8 files changed, 146 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 42e3395884c1..27c5d9da70bf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -755,7 +755,6 @@ struct bch_fs { /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; - struct mutex ec_stripe_create_lock; ec_stripes_heap ec_stripes_heap; spinlock_t ec_stripes_heap_lock; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index cdd4bc334530..f32e8009e444 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -619,8 +619,11 @@ static int bch2_gc_done(struct bch_fs *c, copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) + if (dst->alive) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_insert(c, dst, dst_iter.pos); + spin_unlock(&c->ec_stripes_heap_lock); + } genradix_iter_advance(&dst_iter, &c->stripes[0]); genradix_iter_advance(&src_iter, &c->stripes[1]); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c02dee3e3164..aff1ace3778f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -884,51 +884,46 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, } static void bucket_set_stripe(struct bch_fs *c, - const struct bch_stripe *v, + const struct bch_extent_ptr *ptr, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags, bool enabled) { bool gc = flags & BTREE_TRIGGER_GC; - unsigned i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; - for (i = 0; i < v->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); - struct bucket_mark new, old; - - old = bucket_cmpxchg(g, new, ({ - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - /* - * XXX write repair code for these, flag stripe as possibly bad - */ - if (old.gen != ptr->gen) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stripe with stale pointer"); + /* + * XXX write repair code for these, flag stripe as possibly bad + */ + if (old.gen != ptr->gen) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stripe with stale pointer"); #if 0 - /* - * We'd like to check for these, but these checks don't work - * yet: - */ - if (old.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "multiple stripes using same bucket"); - - if (!old.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "deleting stripe but bucket not marked as stripe bucket"); + /* + * We'd like to check for these, but these checks don't work + * yet: + */ + if (old.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "multiple stripes using same bucket"); + + if (!old.stripe 
&& !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "deleting stripe but bucket not marked as stripe bucket"); #endif - } } static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, @@ -1070,8 +1065,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, { bool gc = flags & BTREE_TRIGGER_GC; struct stripe *m; - unsigned old, new; - int blocks_nonempty_delta; + unsigned i, blocks_nonempty = 0; m = genradix_ptr(&c->stripes[gc], p.idx); @@ -1090,20 +1084,17 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, *nr_parity = m->nr_redundant; *r = m->r; - old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; - new = m->block_sectors[p.block]; - blocks_nonempty_delta = (int) !!new - (int) !!old; - if (blocks_nonempty_delta) { - m->blocks_nonempty += blocks_nonempty_delta; + for (i = 0; i < m->nr_blocks; i++) + blocks_nonempty += m->block_sectors[i] != 0; + if (m->blocks_nonempty != blocks_nonempty) { + m->blocks_nonempty = blocks_nonempty; if (!gc) bch2_stripes_heap_update(c, m, p.idx); } - m->dirty = true; - spin_unlock(&c->ec_stripes_heap_lock); return 0; @@ -1207,10 +1198,11 @@ static int bch2_mark_stripe(struct bch_fs *c, if (!new_s) { /* Deleting: */ - bucket_set_stripe(c, old_s, fs_usage, - journal_seq, flags, false); + for (i = 0; i < old_s->nr_blocks; i++) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); - if (!gc) { + if (!gc && m->on_heap) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_del(c, m, idx); spin_unlock(&c->ec_stripes_heap_lock); @@ -1221,10 +1213,21 @@ static int bch2_mark_stripe(struct bch_fs *c, BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); - if (!old_s) - bucket_set_stripe(c, new_s, fs_usage, - journal_seq, flags, true); + for (i = 0; i < new_s->nr_blocks; i++) { + if (!old_s || + memcmp(new_s->ptrs + i, + old_s->ptrs + i, + sizeof(struct bch_extent_ptr))) { + + if (old_s) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); + bucket_set_stripe(c, new_s->ptrs + i, fs_usage, + journal_seq, flags, true); + } + } + m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; @@ -1248,8 +1251,6 @@ static int bch2_mark_stripe(struct bch_fs *c, bch2_stripes_heap_update(c, m, idx); spin_unlock(&c->ec_stripes_heap_lock); } - - m->alive = true; } return 0; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d35fa016cf0a..516a5268f462 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -607,39 +607,16 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) BUG_ON(h->data[m->heap_idx].idx != idx); } -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - size_t i; - - if (m->alive) { - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - - heap_verify_backpointer(c, idx); - } else { - bch2_stripes_heap_insert(c, m, idx); - } - - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); -} - void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { + if (!m->on_heap) + return; + + m->on_heap = false; + heap_verify_backpointer(c, idx); - m->alive = false; 
heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); @@ -648,19 +625,49 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { + if (m->on_heap) + return; + BUG_ON(heap_full(&c->ec_stripes_heap)); + m->on_heap = true; + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, }), ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); - m->alive = true; heap_verify_backpointer(c, idx); } +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + size_t i; + + if (!m->on_heap) + return; + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + if (stripe_idx_to_delete(c) >= 0 && + !percpu_ref_is_dying(&c->writes)) + schedule_work(&c->ec_stripe_delete_work); +} + /* stripe deletion */ static int ec_stripe_delete(struct bch_fs *c, size_t idx) @@ -677,23 +684,20 @@ static void ec_stripe_delete_work(struct work_struct *work) container_of(work, struct bch_fs, ec_stripe_delete_work); ssize_t idx; - down_read(&c->gc_lock); - mutex_lock(&c->ec_stripe_create_lock); - while (1) { spin_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - spin_unlock(&c->ec_stripes_heap_lock); - - if (idx < 0) + if (idx < 0) { + spin_unlock(&c->ec_stripes_heap_lock); break; + } + + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) break; } - - mutex_unlock(&c->ec_stripe_create_lock); - up_read(&c->gc_lock); } /* stripe creation: */ @@ -846,6 +850,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct bch_fs *c = s->c; struct open_bucket *ob; struct bkey_i *k; + struct stripe *m; struct bch_stripe *v = &s->stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; struct closure cl; @@ -882,12 +887,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - mutex_lock(&c->ec_stripe_create_lock); - ret = ec_stripe_bkey_insert(c, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_unlock; + goto err_put_writes; } for_each_keylist_key(&s->keys, k) { @@ -896,8 +899,11 @@ static void ec_stripe_create(struct ec_stripe_new *s) break; } -err_unlock: - mutex_unlock(&c->ec_stripe_create_lock); + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); err: @@ -1280,11 +1286,21 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, { int ret = 0; - if (k.k->type == KEY_TYPE_stripe) + if (k.k->type == KEY_TYPE_stripe) { + struct stripe *m; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); + if (ret) + return ret; + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], k.k->p.offset); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } return ret; } @@ -1335,6 
+1351,24 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) return 0; } +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min(h->used, 20UL); i++) { + m = genradix_ptr(&c->stripes[0], h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, + m->nr_blocks - m->nr_redundant, + m->nr_redundant); + } + spin_unlock(&c->ec_stripes_heap_lock); +} + void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 4dfaac034886..36444cb14190 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -157,6 +157,8 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_ec_exit(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 5c3f77c8aac7..e4d633fca5bf 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -22,6 +22,7 @@ struct stripe { unsigned alive:1; unsigned dirty:1; + unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[EC_STRIPE_MAX]; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9bc470e68cc9..4b21db5811bd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -678,7 +678,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_new_stripe_list); mutex_init(&c->ec_new_stripe_lock); - mutex_init(&c->ec_stripe_create_lock); spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 30be49eb5da6..d7ac26b8f9f3 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -168,6 +168,7 @@ read_attribute(btree_updates); read_attribute(dirty_btree_nodes); read_attribute(btree_key_cache); read_attribute(btree_transactions); +read_attribute(stripes_heap); read_attribute(internal_uuid); @@ -418,6 +419,13 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_stripes_heap) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); + + bch2_stripes_heap_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); @@ -583,6 +591,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_dirty_btree_nodes, &sysfs_btree_key_cache, &sysfs_btree_transactions, + &sysfs_stripes_heap, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, -- cgit From 703e2a43bf30c1d5610fa7d1a823911d96487dac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jul 2020 20:59:46 -0400 Subject: bcachefs: Move stripe creation to workqueue This is mainly to solve a lock ordering issue, and also simplifies the code a bit. 
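For illustration only, a minimal userspace sketch of the handoff pattern this patch switches to (simplified types, and the workqueue is simulated by a direct call): dropping the last pin on a new stripe no longer creates it inline, it just kicks a worker, so ec_stripe_create() never runs with the caller's locks held.

/* illustrative sketch only - the real code queues ec_stripe_create_work
 * on system_long_wq, and the work item walks ec_stripe_new_list creating
 * every entry whose pin has reached zero */
#include <stdatomic.h>
#include <stdio.h>

struct stripe_new {
	atomic_int pin;
};

static void stripe_create(struct stripe_new *s)
{
	/* the expensive part: writing the stripe key, updating extents */
	printf("creating stripe %p\n", (void *) s);
}

/* stand-in for queue_work() */
static void kick_create_worker(struct stripe_new *s)
{
	stripe_create(s);
}

static void stripe_new_put(struct stripe_new *s)
{
	/* last pin dropped: defer creation instead of running it inline */
	if (atomic_fetch_sub(&s->pin, 1) == 1)
		kick_create_worker(s);
}

int main(void)
{
	struct stripe_new s;

	atomic_init(&s.pin, 2);
	stripe_new_put(&s);	/* still pinned, nothing happens */
	stripe_new_put(&s);	/* last put hands off to the worker */
	return 0;
}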
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 9 +++- fs/bcachefs/ec.c | 93 ++++++++++++++++++++++++------------------ fs/bcachefs/ec.h | 5 +-- fs/bcachefs/super.c | 8 +++- fs/bcachefs/sysfs.c | 26 ++++++------ 6 files changed, 82 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4c1c264ce206..04c1c1b592bc 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -582,7 +582,7 @@ got_bucket: nr_effective, have_cache, flags, ob); atomic_inc(&h->s->pin); out_put_head: - bch2_ec_stripe_head_put(h); + bch2_ec_stripe_head_put(c, h); } /* Sector allocator */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 27c5d9da70bf..7fdcae5fa225 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -760,8 +760,13 @@ struct bch_fs { spinlock_t ec_stripes_heap_lock; /* ERASURE CODING */ - struct list_head ec_new_stripe_list; - struct mutex ec_new_stripe_lock; + struct list_head ec_stripe_head_list; + struct mutex ec_stripe_head_lock; + + struct list_head ec_stripe_new_list; + struct mutex ec_stripe_new_lock; + + struct work_struct ec_stripe_create_work; u64 ec_stripe_hint; struct bio_set ec_bioset; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 516a5268f462..b1084b74778a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -861,7 +861,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_init_stack(&cl); if (s->err) { - bch_err(c, "error creating stripe: error writing data buckets"); + if (s->err != -EROFS) + bch_err(c, "error creating stripe: error writing data buckets"); goto err; } @@ -916,30 +917,50 @@ err: bch2_keylist_free(&s->keys, s->inline_keys); - mutex_lock(&s->h->lock); - list_del(&s->list); - mutex_unlock(&s->h->lock); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) kvpfree(s->stripe.data[i], s->stripe.size << 9); kfree(s); } -static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +static void ec_stripe_create_work(struct work_struct *work) { - struct ec_stripe_new *s = h->s; + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s, *n; +restart: + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->pin)) { + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + ec_stripe_create(s); + goto restart; + } + mutex_unlock(&c->ec_stripe_new_lock); +} - list_add(&s->list, &h->stripes); - h->s = NULL; +static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); - return s; + if (atomic_dec_and_test(&s->pin)) { + BUG_ON(!s->pending); + queue_work(system_long_wq, &c->ec_stripe_create_work); + } } -static void ec_stripe_new_put(struct ec_stripe_new *s) +static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { - BUG_ON(atomic_read(&s->pin) <= 0); - if (atomic_dec_and_test(&s->pin)) - ec_stripe_create(s); + struct ec_stripe_new *s = h->s; + + h->s = NULL; + s->pending = true; + + mutex_lock(&c->ec_stripe_new_lock); + list_add(&s->list, &c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + ec_stripe_new_put(c, s); } /* have a full bucket - hand it off to be erasure coded: */ @@ -950,7 +971,7 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) if (ob->sectors_free) s->err = -1; - ec_stripe_new_put(s); + 
ec_stripe_new_put(c, s); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -1106,7 +1127,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, mutex_init(&h->lock); mutex_lock(&h->lock); - INIT_LIST_HEAD(&h->stripes); h->target = target; h->algo = algo; @@ -1126,23 +1146,18 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); - list_add(&h->list, &c->ec_new_stripe_list); + list_add(&h->list, &c->ec_stripe_head_list); return h; } -void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_new *s = NULL; - if (h->s && bitmap_weight(h->s->blocks_allocated, h->s->blocks.nr) == h->s->blocks.nr) - s = ec_stripe_set_pending(h); + ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, @@ -1155,8 +1170,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, if (!redundancy) return NULL; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && h->redundancy == redundancy) { @@ -1166,7 +1181,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, h = ec_new_stripe_head_alloc(c, target, algo, redundancy); found: - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1176,9 +1191,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct open_bucket *ob; unsigned i; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) { - struct ec_stripe_new *s = NULL; + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { mutex_lock(&h->lock); bch2_open_buckets_stop_dev(c, ca, &h->blocks); @@ -1195,15 +1209,12 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) goto found; goto unlock; found: - h->s->err = -1; - s = ec_stripe_set_pending(h); + h->s->err = -EROFS; + ec_stripe_set_pending(c, h); unlock: mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); } static int __bch2_stripe_write_key(struct btree_trans *trans, @@ -1374,20 +1385,21 @@ void bch2_fs_ec_exit(struct bch_fs *c) struct ec_stripe_head *h; while (1) { - mutex_lock(&c->ec_new_stripe_lock); - h = list_first_entry_or_null(&c->ec_new_stripe_list, + mutex_lock(&c->ec_stripe_head_lock); + h = list_first_entry_or_null(&c->ec_stripe_head_list, struct ec_stripe_head, list); if (h) list_del(&h->list); - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); if (!h) break; BUG_ON(h->s); - BUG_ON(!list_empty(&h->stripes)); kfree(h); } + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + free_heap(&c->ec_stripes_heap); genradix_free(&c->stripes[0]); bioset_exit(&c->ec_bioset); @@ -1395,6 +1407,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) int bch2_fs_ec_init(struct bch_fs *c) { + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 36444cb14190..6f9354f82656 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,6 +92,7 @@ struct ec_stripe_new { 
atomic_t pin; int err; + bool pending; unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; @@ -108,8 +109,6 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - struct list_head stripes; - unsigned target; unsigned algo; unsigned redundancy; @@ -139,7 +138,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -void bch2_ec_stripe_head_put(struct ec_stripe_head *); +void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, unsigned, unsigned); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4b21db5811bd..6cfcae724650 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -676,8 +676,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_new_stripe_list); - mutex_init(&c->ec_new_stripe_lock); + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index d7ac26b8f9f3..a1057532a9f3 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -320,8 +320,8 @@ static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) struct ec_stripe_head *h; struct ec_stripe_new *s; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) { + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { out += scnprintf(out, end - out, "target %u algo %u redundancy %u:\n", h->target, h->algo, h->redundancy); @@ -332,19 +332,19 @@ static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) h->s->blocks.nr, bitmap_weight(h->s->blocks_allocated, h->s->blocks.nr)); + } + mutex_unlock(&c->ec_stripe_head_lock); - mutex_lock(&h->lock); - list_for_each_entry(s, &h->stripes, list) - out += scnprintf(out, end - out, - "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), - atomic_read(&s->pin)); - mutex_unlock(&h->lock); - + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(h, &c->ec_stripe_new_list, list) { + out += scnprintf(out, end - out, + "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); } - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_new_lock); return out - buf; } -- cgit From f6b94a3baa956ff10a52a545a9ad60f35e88e683 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jul 2020 22:33:54 -0400 Subject: bcachefs: Refactor stripe creation Prep work for the patch to update existing stripes with new data blocks. This moves allocating new stripes into ec.c, and also sets up the data structures so that we can handly only allocating some of the blocks in a stripe. 
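A rough userspace sketch of the data_block_idx bookkeeping this patch introduces (minimal made-up types, not the kernel code): newly allocated buckets are slotted into whichever data blocks of the stripe are not already populated, which is the piece that later allows reusing existing, partially empty stripes.

/* illustrative sketch only - the kernel code does this scan with
 * find_next_zero_bit() over blocks_allocated */
#include <stdbool.h>
#include <stdio.h>

#define MAX_BLOCKS 16

struct new_stripe {
	unsigned nr_data;
	bool	 allocated[MAX_BLOCKS];		/* blocks_allocated bitmap */
	int	 bucket[MAX_BLOCKS];		/* data block -> bucket */
	unsigned data_block_idx[MAX_BLOCKS];	/* i'th new bucket -> data block */
};

static int place_new_buckets(struct new_stripe *s,
			     const int *new_buckets, unsigned nr_new)
{
	unsigned i, slot = 0;

	for (i = 0; i < nr_new; i++) {
		/* skip data blocks the stripe already has */
		while (slot < s->nr_data && s->allocated[slot])
			slot++;
		if (slot >= s->nr_data)
			return -1;		/* more buckets than empty slots */

		s->bucket[slot]      = new_buckets[i];
		s->data_block_idx[i] = slot;
		s->allocated[slot]   = true;
		slot++;
	}
	return 0;
}

int main(void)
{
	struct new_stripe s = { .nr_data = 4 };
	int fresh[] = { 42, 43, 44 };

	s.allocated[1] = true;			/* block 1 already has data */
	place_new_buckets(&s, fresh, 3);	/* fills blocks 0, 2, 3 */
	printf("bucket for data block 2: %d\n", s.bucket[2]);
	return 0;
}

The allocator's i'th new bucket is then always referred to through data_block_idx[i] rather than by its position in the open-buckets list, so blocks that already exist in the stripe are never overwritten.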
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 103 +++------------------ fs/bcachefs/alloc_foreground.h | 5 + fs/bcachefs/ec.c | 205 +++++++++++++++++++++++++++++++---------- fs/bcachefs/ec.h | 6 +- 4 files changed, 180 insertions(+), 139 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 04c1c1b592bc..1675f0dfca8a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -344,10 +344,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, struct bch_devs_mask *devs) { struct dev_alloc_list ret = { .nr = 0 }; - struct bch_dev *ca; unsigned i; - for_each_member_device_rcu(ca, c, i, devs) + for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) ret.devs[ret.nr++] = i; bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); @@ -396,16 +395,16 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -static int bch2_bucket_alloc_set(struct bch_fs *c, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) +int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); @@ -455,74 +454,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, /* Allocate from stripes: */ -/* - * XXX: use a higher watermark for allocating open buckets here: - */ -static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct bch_devs_mask devs; - struct open_bucket *ob; - unsigned i, nr_have = 0, nr_data = - min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; - bool have_cache = true; - int ret = 0; - - BUG_ON(h->blocks.nr > nr_data); - BUG_ON(h->parity.nr > h->redundancy); - - devs = h->devs; - - open_bucket_for_each(c, &h->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->blocks, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - - if (h->parity.nr < h->redundancy) { - nr_have = h->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->parity, - &h->parity_stripe, - &devs, - h->redundancy, - &nr_have, - &have_cache, - RESERVE_NONE, - 0, - NULL); - if (ret) - goto err; - } - - if (h->blocks.nr < nr_data) { - nr_have = h->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->blocks, - &h->block_stripe, - &devs, - nr_data, - &nr_have, - &have_cache, - RESERVE_NONE, - 0, - NULL); - if (ret) - goto err; - } - - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - - return bch2_ec_stripe_new_alloc(c, h); -err: - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return -1; -} - /* * if we can't allocate a new stripe because there are already too many * partially filled stripes, force allocating from an existing stripe even when @@ -555,27 +486,23 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, if (ec_open_bucket(c, ptrs)) return; - h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); if (!h) return; - if (!h->s && ec_stripe_alloc(c, h)) - goto out_put_head; - - rcu_read_lock(); 
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - rcu_read_unlock(); for (i = 0; i < devs_sorted.nr; i++) open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + !test_and_set_bit(h->s->data_block_idx[ec_idx], + h->s->blocks_allocated)) goto got_bucket; goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = ec_idx; + ob->ec_idx = h->s->data_block_idx[ec_idx]; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 687f973e4b3a..17a6869bb8cd 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -92,6 +92,11 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } +int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, unsigned, struct write_point_specifier, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b1084b74778a..8d8683f8b2df 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -200,40 +200,6 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } -static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i_stripe *s, - struct open_buckets *blocks, - struct open_buckets *parity, - unsigned stripe_size) -{ - struct open_bucket *ob; - unsigned i, u64s; - - bkey_stripe_init(&s->k_i); - s->v.sectors = cpu_to_le16(stripe_size); - s->v.algorithm = 0; - s->v.nr_blocks = parity->nr + blocks->nr; - s->v.nr_redundant = parity->nr; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; - s->v.pad = 0; - - open_bucket_for_each(c, blocks, ob, i) - s->v.ptrs[i] = ob->ptr; - - open_bucket_for_each(c, parity, ob, i) - s->v.ptrs[blocks->nr + i] = ob->ptr; - - while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { - BUG_ON(1 << s->v.csum_granularity_bits >= - le16_to_cpu(s->v.sectors) || - s->v.csum_granularity_bits == U8_MAX); - s->v.csum_granularity_bits++; - } - - set_bkey_val_u64s(&s->k, u64s); -} - /* Checksumming: */ static void ec_generate_checksums(struct ec_stripe_buf *buf) @@ -866,6 +832,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } + BUG_ON(!s->allocated); + if (!percpu_ref_tryget(&c->writes)) goto err; @@ -953,6 +921,8 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; + BUG_ON(!s->allocated && !s->err); + h->s = NULL; s->pending = true; @@ -1063,14 +1033,38 @@ static unsigned pick_blocksize(struct bch_fs *c, return best.size; } -int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ + unsigned u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + 
s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; unsigned i; - BUG_ON(h->parity.nr != h->redundancy); - BUG_ON(!h->blocks.nr); - BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); lockdep_assert_held(&h->lock); s = kzalloc(sizeof(*s), GFP_KERNEL); @@ -1081,11 +1075,9 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) atomic_set(&s->pin, 1); s->c = c; s->h = h; - s->blocks = h->blocks; - s->parity = h->parity; - - memset(&h->blocks, 0, sizeof(h->blocks)); - memset(&h->parity, 0, sizeof(h->parity)); + s->nr_data = min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); @@ -1093,9 +1085,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->stripe.size = h->blocksize; memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); - ec_stripe_key_init(c, &s->stripe.key, - &s->blocks, &s->parity, - h->blocksize); + ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + s->nr_parity, h->blocksize); for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); @@ -1153,6 +1144,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) { if (h->s && + h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->blocks.nr) == h->s->blocks.nr) ec_stripe_set_pending(c, h); @@ -1160,7 +1152,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) mutex_unlock(&h->lock); } -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy) @@ -1185,6 +1177,122 @@ found: return h; } +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + devs = h->devs; + + for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { + __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); + --nr_data; + } + + BUG_ON(h->s->blocks.nr > nr_data); + BUG_ON(h->s->parity.nr > h->redundancy); + + open_bucket_for_each(c, &h->s->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->s->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + + if (h->s->parity.nr < h->redundancy) { + nr_have = h->s->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->s->blocks.nr < nr_data) { + nr_have = h->s->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } +err: + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + return ret; +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct closure cl; + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, data_idx = 0; 
+ + closure_init_stack(&cl); + + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); + if (!h) + return NULL; + + if (!h->s && ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); + return NULL; + } + + if (!h->s->allocated) { + if (new_stripe_alloc_buckets(c, h)) { + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; + } + + open_bucket_for_each(c, &h->s->blocks, ob, i) { + data_idx = find_next_zero_bit(h->s->blocks_allocated, + h->s->nr_data, data_idx); + BUG_ON(data_idx >= h->s->nr_data); + + h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; + h->s->data_block_idx[i] = data_idx; + data_idx++; + } + + open_bucket_for_each(c, &h->s->parity, ob, i) + h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + + h->s->allocated = true; + } +out: + closure_sync(&cl); + return h; +} + void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; @@ -1195,9 +1303,6 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) list_for_each_entry(h, &c->ec_stripe_head_list, list) { mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, &h->blocks); - bch2_open_buckets_stop_dev(c, ca, &h->parity); - if (!h->s) goto unlock; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 6f9354f82656..d7396885792e 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,11 +92,15 @@ struct ec_stripe_new { atomic_t pin; int err; - bool pending; + u8 nr_data; + u8 nr_parity; + bool allocated; + bool pending; unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; struct open_buckets blocks; + u8 data_block_idx[EC_STRIPE_MAX]; struct open_buckets parity; struct keylist keys; -- cgit From 0ba95acc4499c84156144316dfb08d68930cc1a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jun 2020 14:44:19 -0400 Subject: bcachefs: Allow existing stripes to be updated with new data buckets This solves internal fragmentation within stripes. We already have copygc, which evacuates buckets that are partially or mostly empty, but it's up to the ec code that manages stripes to deal with stripes that have empty buckets in them. This patch changes the path for creating new stripes to check if there's existing stripes with empty buckets - and if so, update them with new data buckets instead of creating new stripes. TODO: improve the disk space accounting so that we can only use this (more expensive path) when we have too much fragmentation in existing stripes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/ec.h | 6 ++-- 2 files changed, 98 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8d8683f8b2df..1ff1509558d9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -638,6 +638,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { + //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -756,6 +757,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + /* XXX this doesn't support the reflink btree */ + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), BTREE_ITER_INTENT); @@ -856,7 +859,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - ret = ec_stripe_bkey_insert(c, &s->stripe.key); + ret = s->existing_stripe + ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, + NULL, NULL, BTREE_INSERT_NOFAIL) + : ec_stripe_bkey_insert(c, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -864,12 +870,19 @@ static void ec_stripe_create(struct ec_stripe_new *s) for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); - if (ret) + if (ret) { + bch_err(c, "error creating stripe: error updating pointers"); break; + } } spin_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); +#if 0 + pr_info("created a %s stripe %llu", + s->existing_stripe ? "existing" : "new", + s->stripe.key.k.p.offset); +#endif BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); @@ -975,6 +988,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; + //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); + ec = ob->ec; mutex_lock(&ec->lock); @@ -1033,6 +1048,11 @@ static unsigned pick_blocksize(struct bch_fs *c, return best.size; } +static bool may_create_new_stripe(struct bch_fs *c) +{ + return false; +} + static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i_stripe *s, unsigned nr_data, @@ -1245,6 +1265,59 @@ err: return ret; } +/* XXX: doesn't obey target: */ +static s64 get_existing_stripe(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; + + if (may_create_new_stripe(c)) + return -1; + + spin_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + if (!h->data[heap_idx].blocks_nonempty) + continue; + + stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes[0], stripe_idx); + + if (m->algorithm == algo && + m->nr_redundant == redundancy && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + bch2_stripes_heap_del(c, m, stripe_idx); + spin_unlock(&c->ec_stripes_heap_lock); + return stripe_idx; + } + } + + spin_unlock(&c->ec_stripes_heap_lock); + return -1; +} + +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (!ret) + bkey_reassemble(&stripe->key.k_i, k); + bch2_trans_exit(&trans); + + return ret; +} + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, @@ -1254,6 +1327,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct ec_stripe_head *h; struct open_bucket *ob; unsigned i, data_idx = 0; + s64 idx; closure_init_stack(&cl); @@ -1267,6 +1341,24 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, } if (!h->s->allocated) { + if (!h->s->existing_stripe && + (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { + //pr_info("got existing stripe %llu", idx); + + h->s->existing_stripe = true; + h->s->existing_stripe_idx = idx; + if (get_stripe_key(c, idx, &h->s->stripe)) { + /* btree error */ + BUG(); + } + + for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) + if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { + __set_bit(i, h->s->blocks_allocated); + ec_block_io(c, &h->s->stripe, READ, i, &cl); + } + } + if 
(new_stripe_alloc_buckets(c, h)) { bch2_ec_stripe_head_put(c, h); h = NULL; @@ -1286,6 +1378,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, open_bucket_for_each(c, &h->s->parity, ob, i) h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } out: diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index d7396885792e..ad9078fdb045 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -97,6 +97,9 @@ struct ec_stripe_new { u8 nr_parity; bool allocated; bool pending; + bool existing_stripe; + u64 existing_stripe_idx; + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; struct open_buckets blocks; @@ -125,9 +128,6 @@ struct ec_stripe_head { struct dev_stripe_state block_stripe; struct dev_stripe_state parity_stripe; - struct open_buckets blocks; - struct open_buckets parity; - struct ec_stripe_new *s; }; -- cgit From 912bdf17a849990f7241e294e48629987553b94c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Jul 2020 13:54:58 -0400 Subject: bcachefs: Fix short buffered writes In the buffered write path, we have to check for short writes that write to the full page, where the page wasn't UpToDate; when this happens, the page is partly garbage, so we have to zero it out and revert that part of the write. This check was wrong - we reverted total from copied, but didn't revert the iov_iter, probably also leading to corrupted writes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d379581c2517..c0995723ddd2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1454,23 +1454,23 @@ retry_reservation: if (!pg_copied) break; + if (!PageUptodate(page) && + pg_copied != PAGE_SIZE && + pos + copied + pg_copied < inode->v.i_size) { + zero_user(page, 0, PAGE_SIZE); + break; + } + flush_dcache_page(page); copied += pg_copied; + + if (pg_copied != pg_len) + break; } if (!copied) goto out; - if (copied < len && - ((offset + copied) & (PAGE_SIZE - 1))) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - - if (!PageUptodate(page)) { - zero_user(page, 0, PAGE_SIZE); - copied -= (offset + copied) & (PAGE_SIZE - 1); - } - } - spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); @@ -1567,6 +1567,7 @@ again: } pos += ret; written += ret; + ret = 0; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); -- cgit From 89fd25be70b4e3fc540d4cf591a02898470f1ef0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Jul 2020 18:28:11 -0400 Subject: bcachefs: Use x-macros for data types Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++---- fs/bcachefs/alloc_foreground.c | 18 +++++++++--------- fs/bcachefs/bcachefs_format.h | 19 ++++++++++++------- fs/bcachefs/btree_gc.c | 14 +++++++------- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/buckets.c | 40 ++++++++++++++++++++-------------------- fs/bcachefs/buckets.h | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extents.c | 4 ++-- fs/bcachefs/io.c | 6 +++--- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 10 +++++----- fs/bcachefs/journal_reclaim.c | 8 ++++---- fs/bcachefs/move.c | 10 +++++----- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/opts.c | 9 +++------ fs/bcachefs/replicas.c | 22 +++++++++++----------- 
fs/bcachefs/replicas.h | 2 +- fs/bcachefs/super-io.c | 4 ++-- fs/bcachefs/super.c | 4 ++-- fs/bcachefs/sysfs.c | 24 ++++++++++++------------ 21 files changed, 110 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 98dd4995e528..b1a8192f2751 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -53,10 +53,10 @@ static void pd_controllers_update(struct work_struct *work) * reclaimed by copy GC */ s64 fragmented = (bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; + stats.buckets[BCH_DATA_user] + + stats.buckets[BCH_DATA_cached]) - + (stats.sectors[BCH_DATA_user] + + stats.sectors[BCH_DATA_cached])) << 9; fragmented = max(0LL, fragmented); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1675f0dfca8a..32f7e38c086e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -534,7 +534,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, if (*nr_effective < nr_replicas && test_bit(ob->ptr.dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_USER && !*have_cache)) && + (wp->type == BCH_DATA_user && !*have_cache)) && (ob->ec || !need_ec)) { add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, @@ -813,11 +813,11 @@ retry: wp = writepoint_find(c, write_point.v); - if (wp->type == BCH_DATA_USER) + if (wp->type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; /* metadata may not allocate on cache devices: */ - if (wp->type != BCH_DATA_USER) + if (wp->type != BCH_DATA_user) have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -856,7 +856,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); + open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user); wp->ptrs = ptrs; @@ -876,7 +876,7 @@ err: ob_push(c, &ptrs, ob); else open_bucket_free_unused(c, ob, - wp->type == BCH_DATA_USER); + wp->type == BCH_DATA_user); wp->ptrs = ptrs; mutex_unlock(&wp->lock); @@ -907,7 +907,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bch_extent_ptr tmp = ob->ptr; tmp.cached = !ca->mi.durability && - wp->type == BCH_DATA_USER; + wp->type == BCH_DATA_user; tmp.offset += ca->mi.bucket_size - ob->sectors_free; bch2_bkey_append_ptr(k, tmp); @@ -956,12 +956,12 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) c->open_buckets_freelist = ob - c->open_buckets; } - writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + writepoint_init(&c->btree_write_point, BCH_DATA_btree); + writepoint_init(&c->rebalance_write_point, BCH_DATA_user); for (wp = c->write_points; wp < c->write_points + c->write_points_nr; wp++) { - writepoint_init(wp, BCH_DATA_USER); + writepoint_init(wp, BCH_DATA_user); wp->last_used = sched_clock(); wp->write_point = (unsigned long) wp; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 14eca567a10d..a5b0c308fc46 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1030,14 +1030,19 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); /* BCH_SB_FIELD_replicas: */ +#define BCH_DATA_TYPES() \ + x(none, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) 
\ + x(cached, 5) + enum bch_data_type { - BCH_DATA_NONE = 0, - BCH_DATA_SB = 1, - BCH_DATA_JOURNAL = 2, - BCH_DATA_BTREE = 3, - BCH_DATA_USER = 4, - BCH_DATA_CACHED = 5, - BCH_DATA_NR = 6, +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR }; struct bch_replicas_entry_v0 { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f32e8009e444..36fa4853e8a1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -435,16 +435,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (offset == BCH_SB_SECTOR) mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); mark_metadata_sectors(c, ca, offset, offset + (1 << layout->sb_max_size_bits), - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); } for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } @@ -678,8 +678,8 @@ static int bch2_gc_done(struct bch_fs *c, char buf[80]; if (metadata_only && - (e->data_type == BCH_DATA_USER || - e->data_type == BCH_DATA_CACHED)) + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) continue; bch2_replicas_entry_to_text(&PBUF(buf), e); @@ -764,8 +764,8 @@ static int bch2_gc_start(struct bch_fs *c, d->gen_valid = s->gen_valid; if (metadata_only && - (s->mark.data_type == BCH_DATA_USER || - s->mark.data_type == BCH_DATA_CACHED)) { + (s->mark.data_type == BCH_DATA_user || + s->mark.data_type == BCH_DATA_cached)) { d->_mark = s->mark; d->_mark.owned_by_allocator = 0; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d5240598e7d3..a7d150811f7a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1231,7 +1231,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, set_btree_node_read_in_flight(b); if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); bio_set_dev(bio, ca->disk_sb.bdev); @@ -1701,7 +1701,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index aff1ace3778f..fde5fba2841e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -133,13 +133,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c) cpu_replicas_entry(&c->replicas, i); switch (e->data_type) { - case BCH_DATA_BTREE: + case BCH_DATA_btree: usage->btree += usage->replicas[i]; break; - case BCH_DATA_USER: + case BCH_DATA_user: usage->data += usage->replicas[i]; break; - case BCH_DATA_CACHED: + case BCH_DATA_cached: usage->cached += usage->replicas[i]; break; } @@ -367,7 +367,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m, struct bch_dev *ca) { if (!m.owned_by_allocator && - m.data_type == BCH_DATA_USER && + m.data_type == BCH_DATA_user && bucket_sectors_used(m)) return max_t(int, 0, (int) ca->mi.bucket_size - bucket_sectors_used(m)); @@ -382,7 +382,7 @@ static inline int bucket_stripe_sectors(struct bucket_mark m) static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors - ? BCH_DATA_CACHED + ? 
BCH_DATA_cached : m.data_type; } @@ -437,7 +437,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, enum bch_data_type type, int nr, s64 size) { - if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) + if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; dev_usage->buckets[type] += nr; @@ -472,7 +472,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->sectors[old.data_type] -= old.dirty_sectors; u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_CACHED] += + u->sectors[BCH_DATA_cached] += (int) new.cached_sectors - (int) old.cached_sectors; u->sectors_fragmented += is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); @@ -520,13 +520,13 @@ static inline int update_replicas(struct bch_fs *c, return 0; switch (r->data_type) { - case BCH_DATA_BTREE: + case BCH_DATA_btree: fs_usage->btree += sectors; break; - case BCH_DATA_USER: + case BCH_DATA_user: fs_usage->data += sectors; break; - case BCH_DATA_CACHED: + case BCH_DATA_cached: fs_usage->cached += sectors; break; } @@ -798,8 +798,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket_mark old, new; bool overflow; - BUG_ON(data_type != BCH_DATA_SB && - data_type != BCH_DATA_JOURNAL); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; @@ -830,8 +830,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, unsigned sectors, struct gc_pos pos, unsigned flags) { - BUG_ON(type != BCH_DATA_SB && - type != BCH_DATA_JOURNAL); + BUG_ON(type != BCH_DATA_sb && + type != BCH_DATA_journal); preempt_disable(); @@ -1123,7 +1123,7 @@ static int bch2_mark_extent(struct bch_fs *c, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1285,12 +1285,12 @@ static int bch2_mark_key_locked(struct bch_fs *c, : -c->opts.btree_node_size; ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_BTREE, fs_usage, journal_seq, flags); + BCH_DATA_btree, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_USER, fs_usage, journal_seq, flags); + BCH_DATA_user, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); @@ -1668,7 +1668,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1810,11 +1810,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, : -c->opts.btree_node_size; return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_BTREE); + flags, BCH_DATA_btree); case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_USER); + flags, BCH_DATA_user); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 5ba13b99bd65..44a5b6df8c8b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -99,9 +99,9 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, { if (k->type == KEY_TYPE_btree_ptr || k->type == KEY_TYPE_btree_ptr_v2) - return BCH_DATA_BTREE; + return BCH_DATA_btree; - return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1ff1509558d9..43f4b2f0d1bf 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1144,7 +1144,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->redundancy = redundancy; rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_USER, target); + h->devs = target_rw_devs(c, BCH_DATA_user, target); for_each_member_device_rcu(ca, c, i, &h->devs) if (!ca->mi.durability) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 62eb3b1e2cbf..5c1329360d9b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -193,7 +193,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) goto err; err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || + if (mark.data_type != BCH_DATA_btree || mark.dirty_sectors < c->opts.btree_node_size) goto err; } @@ -288,7 +288,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) "key too stale: %i", stale); bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_USER || + (mark.data_type != BCH_DATA_user || mark_sectors < disk_sectors), c, "extent pointer not marked: %s:\n" "type %u sectors %u < %u", diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5763654db310..be59b615b2db 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -486,7 +486,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bio_set_dev(&n->bio, ca->disk_sb.bdev); - if (type != BCH_DATA_BTREE && unlikely(c->opts.no_data_io)) { + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { bio_endio(&n->bio); continue; } @@ -1128,7 +1128,7 @@ again: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write); } while (ret); @@ -2170,7 +2170,7 @@ get_bio: goto out; } - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index cbfaec5143d8..127787cd3e03 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -846,7 +846,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, 
bucket, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); @@ -1198,7 +1198,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4d20762f55d3..a4c2b80e8aa5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -660,7 +660,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) for_each_member_device(ca, c, iter) { if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || @@ -694,7 +694,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || @@ -795,7 +795,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, rcu_read_lock(); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_JOURNAL]); + &c->rw_devs[BCH_DATA_journal]); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -913,7 +913,7 @@ static void journal_write_done(struct closure *cl) goto err; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); if (bch2_mark_replicas(c, &replicas.e)) goto err; @@ -1105,7 +1105,7 @@ retry_alloc: continue; } - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); bio = ca->journal.bio; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1162acffdf45..49ff26cb246c 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -70,7 +70,7 @@ static struct journal_space { rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; unsigned buckets_this_device, sectors_this_device; @@ -139,7 +139,7 @@ void bch2_journal_space_available(struct journal *j) rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -618,7 +618,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) return ret; mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); seq = 0; @@ -627,7 +627,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a061e60e3d7a..62626cc13ced 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -516,7 +516,7 @@ static int __bch2_move_data(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_USER; + 
stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; stats->pos = POS_MIN; @@ -641,7 +641,7 @@ int bch2_move_data(struct bch_fs *c, INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); - stats->data_type = BCH_DATA_USER; + stats->data_type = BCH_DATA_user; ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, pred, arg, stats, BTREE_ID_EXTENTS) ?: @@ -676,7 +676,7 @@ static int bch2_move_btree(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_BTREE; + stats->data_type = BCH_DATA_btree; for (id = 0; id < BTREE_ID_NR; id++) { stats->btree_id = id; @@ -772,7 +772,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; @@ -793,7 +793,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d22f26c02b67..135bbc102b53 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -160,7 +160,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) struct copygc_heap_entry e; if (m.owned_by_allocator || - m.data_type != BCH_DATA_USER || + m.data_type != BCH_DATA_user || !bucket_sectors_used(m) || bucket_sectors_used(m) >= ca->mi.bucket_size) continue; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 94d6c044a27d..afe25cd26c06 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -45,12 +45,9 @@ const char * const bch2_str_hash_types[] = { }; const char * const bch2_data_types[] = { - "none", - "sb", - "journal", - "btree", - "data", - "cached", +#define x(t, n) #t, + BCH_DATA_TYPES() +#undef x NULL }; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 3e7c389f06ce..91e050732aaf 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -113,16 +113,16 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_BTREE; + e->data_type = BCH_DATA_btree; extent_to_replicas(k, e); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_USER; + e->data_type = BCH_DATA_user; extent_to_replicas(k, e); break; case KEY_TYPE_stripe: - e->data_type = BCH_DATA_USER; + e->data_type = BCH_DATA_user; stripe_to_replicas(k, e); break; } @@ -137,7 +137,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, unsigned i; BUG_ON(!data_type || - data_type == BCH_DATA_SB || + data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); e->data_type = data_type; @@ -614,7 +614,7 @@ retry: struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_JOURNAL || + if (e->data_type == BCH_DATA_journal || c->usage_base->replicas[i] || percpu_u64_get(&c->usage[0]->replicas[i]) || percpu_u64_get(&c->usage[1]->replicas[i])) @@ -1040,13 +1040,13 @@ static bool have_enough_devs(struct replicas_status s, bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) { - return (have_enough_devs(s, BCH_DATA_JOURNAL, + return (have_enough_devs(s, BCH_DATA_journal, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & 
BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_BTREE, + have_enough_devs(s, BCH_DATA_btree, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_USER, + have_enough_devs(s, BCH_DATA_user, flags & BCH_FORCE_IF_DATA_DEGRADED, flags & BCH_FORCE_IF_DATA_LOST)); } @@ -1056,9 +1056,9 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) struct replicas_status s = bch2_replicas_status(c); return (meta - ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, - s.replicas[BCH_DATA_BTREE].redundancy) - : s.replicas[BCH_DATA_USER].redundancy) + 1; + ? min(s.replicas[BCH_DATA_journal].redundancy, + s.replicas[BCH_DATA_btree].redundancy) + : s.replicas[BCH_DATA_user].redundancy) + 1; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 8527d82841bb..deda5f5c6e20 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -36,7 +36,7 @@ int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev) { - e->data_type = BCH_DATA_CACHED; + e->data_type = BCH_DATA_cached; e->nr_devs = 1; e->nr_required = 1; e->devs[0] = dev; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8541db5e5e48..0913ffd23776 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -659,7 +659,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio->bi_private = ca; bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); @@ -685,7 +685,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(ca->disk_sb.bdev))); - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6cfcae724650..cd1033228b9c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1076,7 +1076,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_rwsem(&ca->bucket_lock); - writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); + writepoint_init(&ca->copygc_write_point, BCH_DATA_user); bch2_dev_copygc_init(ca); @@ -1207,7 +1207,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { + !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a1057532a9f3..ac8cf6dcec3d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -868,18 +868,18 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[BCH_DATA_SB], - stats.buckets[BCH_DATA_JOURNAL], - stats.buckets[BCH_DATA_BTREE], - stats.buckets[BCH_DATA_USER], - stats.buckets[BCH_DATA_CACHED], + stats.buckets[BCH_DATA_sb], + stats.buckets[BCH_DATA_journal], + stats.buckets[BCH_DATA_btree], + 
stats.buckets[BCH_DATA_user], + stats.buckets[BCH_DATA_cached], stats.buckets_ec, ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, - stats.sectors[BCH_DATA_SB], - stats.sectors[BCH_DATA_JOURNAL], - stats.sectors[BCH_DATA_BTREE], - stats.sectors[BCH_DATA_USER], - stats.sectors[BCH_DATA_CACHED], + stats.sectors[BCH_DATA_sb], + stats.sectors[BCH_DATA_journal], + stats.sectors[BCH_DATA_btree], + stats.sectors[BCH_DATA_user], + stats.sectors[BCH_DATA_cached], stats.sectors_ec, stats.sectors_fragmented, ca->copygc_threshold, @@ -887,8 +887,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_OPEN_BUCKET_RESERVE, c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_BTREE], - nr[BCH_DATA_USER], + nr[BCH_DATA_btree], + nr[BCH_DATA_user], c->btree_reserve_cache_nr); } -- cgit From 1d2ff0a63049f03bddcc98b195213bbd37e3ab53 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Jul 2020 15:35:04 -0400 Subject: bcachefs: Fix extent_ptr_durability() calculation for erasure coded data Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 5c1329360d9b..0fae8d76365e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -723,7 +723,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (WARN_ON(!s)) goto out; - durability = max_t(unsigned, durability, s->nr_redundant); + durability += s->nr_redundant; } out: return durability; -- cgit From f793bc15491c04481b3f12a10ff22a53cd126842 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Jul 2020 13:23:17 -0400 Subject: bcachefs: Drop extra pointers when marking data as in a stripe We ideally want the buckets used for the extra initial replicas to be reused right away. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 43f4b2f0d1bf..54a95edc9901 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -784,12 +784,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) { - if (ptr->dev == dev) - ec_ptr = ptr; - else - ptr->cached = true; - } + bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); + ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); + BUG_ON(!ec_ptr); extent_stripe_ptr_add(e, s, ec_ptr, idx); -- cgit From e6d1161530bcd632ad10b6aa0ad511abb146dbcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Jul 2020 16:28:54 -0400 Subject: bcachefs: Make copygc thread global Per device copygc threads don't move data to different devices and they make fragmentation worse - they don't make much sense anymore.
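As a reading aid, here is a minimal sketch that simply pulls the two relevant hunks below (buckets_types.h and movinggc.c) together; names are from the patch, the layout is simplified and not the literal diff. With a single filesystem-wide copygc thread, a heap entry must record which device its bucket lives on, and the post-move lookup orders entries by device first, bucket offset second:

	struct copygc_heap_entry {
		u8	dev;		/* new: device this bucket belongs to */
		u8	gen;
		u32	sectors;
		u64	offset;
	};

	static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
	{
		const struct copygc_heap_entry *l = _l;
		const struct copygc_heap_entry *r = _r;

		/* group entries by device, then order by bucket offset */
		return cmp_int(l->dev, r->dev) ?:
			cmp_int(l->offset, r->offset);
	}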
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 20 ++-- fs/bcachefs/alloc_foreground.c | 5 +- fs/bcachefs/bcachefs.h | 14 +-- fs/bcachefs/buckets.c | 19 +--- fs/bcachefs/buckets_types.h | 1 + fs/bcachefs/movinggc.c | 213 ++++++++++++++++++++++------------------- fs/bcachefs/movinggc.h | 6 +- fs/bcachefs/super.c | 29 ++---- fs/bcachefs/sysfs.c | 18 ++-- fs/bcachefs/trace.h | 6 +- 10 files changed, 159 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b1a8192f2751..d80e1edf8c44 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -41,29 +41,26 @@ static void pd_controllers_update(struct work_struct *work) struct bch_fs, pd_controllers_update); struct bch_dev *ca; + s64 free = 0, fragmented = 0; unsigned i; for_each_member_device(ca, c, i) { struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); - u64 free = bucket_to_sector(ca, + free += bucket_to_sector(ca, __dev_buckets_free(ca, stats)) << 9; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC */ - s64 fragmented = (bucket_to_sector(ca, + fragmented += max_t(s64, 0, (bucket_to_sector(ca, stats.buckets[BCH_DATA_user] + stats.buckets[BCH_DATA_cached]) - (stats.sectors[BCH_DATA_user] + - stats.sectors[BCH_DATA_cached])) << 9; - - fragmented = max(0LL, fragmented); - - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); + stats.sectors[BCH_DATA_cached])) << 9); } + bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } @@ -1191,7 +1188,7 @@ stop: void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve; + u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1234,7 +1231,7 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - ca->copygc_threshold = dev_reserve; + copygc_threshold += dev_reserve; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); @@ -1253,6 +1250,7 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); + c->copygc_threshold = copygc_threshold; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; @@ -1312,7 +1310,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_writepoint_stop(c, ca, &c->write_points[i]); - bch2_writepoint_stop(c, ca, &ca->copygc_write_point); + bch2_writepoint_stop(c, ca, &c->copygc_write_point); bch2_writepoint_stop(c, ca, &c->rebalance_write_point); bch2_writepoint_stop(c, ca, &c->btree_write_point); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 32f7e38c086e..3ea28a79b8c9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -956,8 +956,9 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) c->open_buckets_freelist = ob - c->open_buckets; } - writepoint_init(&c->btree_write_point, BCH_DATA_btree); - writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + writepoint_init(&c->btree_write_point, BCH_DATA_btree); + writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + writepoint_init(&c->copygc_write_point, BCH_DATA_user); for (wp = c->write_points; wp < c->write_points + 
c->write_points_nr; wp++) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7fdcae5fa225..baa8801c5412 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -452,13 +452,6 @@ struct bch_dev { alloc_heap alloc_heap; - /* Copying GC: */ - struct task_struct *copygc_thread; - copygc_heap copygc_heap; - struct bch_pd_controller copygc_pd; - struct write_point copygc_write_point; - u64 copygc_threshold; - atomic64_t rebalance_work; struct journal_device journal; @@ -753,6 +746,13 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* COPYGC */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; + u64 copygc_threshold; + /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fde5fba2841e..c8a57b512b77 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2009,7 +2009,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; - copygc_heap copygc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); @@ -2018,15 +2017,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); - bool resize = ca->buckets[0] != NULL, - start_copygc = ca->copygc_thread != NULL; + bool resize = ca->buckets[0] != NULL; int ret = -ENOMEM; unsigned i; memset(&free, 0, sizeof(free)); memset(&free_inc, 0, sizeof(free_inc)); memset(&alloc_heap, 0, sizeof(alloc_heap)); - memset(©gc_heap, 0, sizeof(copygc_heap)); if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), @@ -2039,14 +2036,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || - !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) goto err; buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; - bch2_copygc_stop(ca); + bch2_copygc_stop(c); if (resize) { down_write(&c->gc_lock); @@ -2089,21 +2085,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* with gc lock held, alloc_heap can't be in use: */ swap(ca->alloc_heap, alloc_heap); - /* and we shut down copygc: */ - swap(ca->copygc_heap, copygc_heap); - nbuckets = ca->mi.nbuckets; if (resize) up_write(&ca->bucket_lock); - if (start_copygc && - bch2_copygc_start(c, ca)) - bch_err(ca, "error restarting copygc thread"); - ret = 0; err: - free_heap(©gc_heap); free_heap(&alloc_heap); free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -2120,7 +2108,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->copygc_heap); free_heap(&ca->alloc_heap); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index b64b2fc9a896..0f7fcfe29e0e 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -121,6 +121,7 @@ struct disk_reservation { }; struct copygc_heap_entry { + u8 dev; u8 gen; u32 
sectors; u64 offset; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 135bbc102b53..c33b58dc5c50 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -43,13 +43,6 @@ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -62,18 +55,22 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return cmp_int(l->offset, r->offset); + return cmp_int(l->dev, r->dev) ?: + cmp_int(l->offset, r->offset); } -static bool __copygc_pred(struct bch_dev *ca, - struct bkey_s_c k) +static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) { - copygc_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr = - bch2_bkey_has_device(k, ca->dev_idx); - - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + copygc_heap *h = &c->copygc_heap; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct copygc_heap_entry search = { + .dev = ptr->dev, + .offset = ptr->offset + }; ssize_t i = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), @@ -89,12 +86,13 @@ static bool __copygc_pred(struct bch_dev *ca, BUG_ON(i != j); #endif - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); + if (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen) + return ptr->dev; } - return false; + return -1; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, @@ -102,14 +100,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - struct bch_dev *ca = arg; - - if (!__copygc_pred(ca, k)) + int dev_idx = __copygc_pred(c, k); + if (dev_idx < 0) return DATA_SKIP; - data_opts->target = dev_to_target(ca->dev_idx); + /* XXX: use io_opts for this inode */ + data_opts->target = dev_to_target(dev_idx); data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; - data_opts->rewrite_dev = ca->dev_idx; + data_opts->rewrite_dev = dev_idx; return DATA_REWRITE; } @@ -125,20 +123,21 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) +static void bch2_copygc(struct bch_fs *c) { - copygc_heap *h = &ca->copygc_heap; + copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; - size_t b; + struct bch_dev *ca; + unsigned dev_idx; + size_t b, heap_size = 0; int ret; memset(&move_stats, 0, sizeof(move_stats)); - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); - /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, @@ -147,38 +146,51 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) */ h->used = 0; - /* - * We need bucket marks to be up 
to date - gc can't be recalculating - * them: - */ - down_read(&c->gc_lock); - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_user || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; + for_each_rw_member(ca, c, dev_idx) + heap_size += ca->mi.nbuckets >> 7; - e = (struct copygc_heap_entry) { - .gen = m.gen, - .sectors = bucket_sectors_used(m), - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + if (h->size < heap_size) { + free_heap(&c->copygc_heap); + if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { + bch_err(c, "error allocating copygc heap"); + return; + } + } + + for_each_rw_member(ca, c, dev_idx) { + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + + spin_lock(&ca->fs->freelist_lock); + sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; + spin_unlock(&ca->fs->freelist_lock); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct copygc_heap_entry e; + + if (m.owned_by_allocator || + m.data_type != BCH_DATA_user || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + + e = (struct copygc_heap_entry) { + .gen = m.gen, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), + }; + heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + } + up_read(&ca->bucket_lock); } - up_read(&ca->bucket_lock); - up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors; - while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { + while (sectors_to_move > sectors_reserved) { BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } @@ -192,24 +204,26 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &ca->copygc_pd.rate, - writepoint_ptr(&ca->copygc_write_point), + ret = bch2_move_data(c, &c->copygc_pd.rate, + writepoint_ptr(&c->copygc_write_point), POS_MIN, POS_MAX, - copygc_pred, ca, + copygc_pred, NULL, &move_stats); - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - size_t b = sector_to_bucket(ca, i->offset); - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - - if (i->gen == m.gen && bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; + for_each_rw_member(ca, c, dev_idx) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for (i = h->data; i < h->data + h->used; i++) { + size_t b = sector_to_bucket(ca, i->offset); + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } } + up_read(&ca->bucket_lock); } - up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) bch_warn_ratelimited(c, @@ -220,7 +234,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) atomic64_read(&move_stats.keys_raced), atomic64_read(&move_stats.sectors_raced)); - trace_copygc(ca, + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, 
buckets_not_moved); } @@ -239,20 +253,27 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. */ -unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) +unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_fs *c = ca->fs; - struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); - u64 fragmented_allowed = ca->copygc_threshold + - ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); + struct bch_dev *ca; + unsigned dev_idx; + u64 fragmented_allowed = c->copygc_threshold; + u64 fragmented = 0; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); + + fragmented_allowed += ((__dev_buckets_available(ca, usage) * + ca->mi.bucket_size) >> 1); + fragmented += usage.sectors_fragmented; + } - return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); + return max_t(s64, 0, fragmented_allowed - fragmented); } static int bch2_copygc_thread(void *arg) { - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; + struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last, wait; @@ -263,7 +284,7 @@ static int bch2_copygc_thread(void *arg) break; last = atomic_long_read(&clock->now); - wait = bch2_copygc_wait_amount(ca); + wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { bch2_kthread_io_clock_wait(clock, last + wait, @@ -271,29 +292,29 @@ static int bch2_copygc_thread(void *arg) continue; } - bch2_copygc(c, ca); + bch2_copygc(c); } return 0; } -void bch2_copygc_stop(struct bch_dev *ca) +void bch2_copygc_stop(struct bch_fs *c) { - ca->copygc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&ca->copygc_pd.rate); + c->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->copygc_pd.rate); - if (ca->copygc_thread) { - kthread_stop(ca->copygc_thread); - put_task_struct(ca->copygc_thread); + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); } - ca->copygc_thread = NULL; + c->copygc_thread = NULL; } -int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) +int bch2_copygc_start(struct bch_fs *c) { struct task_struct *t; - if (ca->copygc_thread) + if (c->copygc_thread) return 0; if (c->opts.nochanges) @@ -302,21 +323,21 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, ca, - "bch_copygc[%s]", ca->name); + t = kthread_create(bch2_copygc_thread, c, + "bch_copygc[%s]", c->name); if (IS_ERR(t)) return PTR_ERR(t); get_task_struct(t); - ca->copygc_thread = t; - wake_up_process(ca->copygc_thread); + c->copygc_thread = t; + wake_up_process(c->copygc_thread); return 0; } -void bch2_dev_copygc_init(struct bch_dev *ca) +void bch2_fs_copygc_init(struct bch_fs *c) { - bch2_pd_controller_init(&ca->copygc_pd); - ca->copygc_pd.d_term = 0; + bch2_pd_controller_init(&c->copygc_pd); + c->copygc_pd.d_term = 0; } diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index dcd479632cf1..922738247d03 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,8 +2,8 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -void bch2_copygc_stop(struct bch_dev *); -int bch2_copygc_start(struct bch_fs *, struct bch_dev *); -void bch2_dev_copygc_init(struct bch_dev *); +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void 
bch2_fs_copygc_init(struct bch_fs *); #endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cd1033228b9c..6dc899be5bd2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -181,10 +181,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) int ret; bch2_rebalance_stop(c); - - for_each_member_device(ca, c, i) - bch2_copygc_stop(ca); - + bch2_copygc_stop(c); bch2_gc_thread_stop(c); /* @@ -364,8 +361,6 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) static int bch2_fs_read_write_late(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret; ret = bch2_gc_thread_start(c); @@ -374,13 +369,10 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return ret; } - for_each_rw_member(ca, c, i) { - ret = bch2_copygc_start(c, ca); - if (ret) { - bch_err(c, "error starting copygc threads"); - percpu_ref_put(&ca->io_ref); - return ret; - } + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; } ret = bch2_rebalance_start(c); @@ -504,6 +496,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) destroy_workqueue(c->journal_reclaim_wq); @@ -652,6 +645,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); @@ -1076,10 +1070,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_rwsem(&ca->bucket_lock); - writepoint_init(&ca->copygc_write_point, BCH_DATA_user); - - bch2_dev_copygc_init(ca); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_init(&ca->io_latency[READ]); @@ -1318,8 +1308,6 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - bch2_copygc_stop(ca); - /* * The allocator thread itself allocates btree nodes, so stop it first: */ @@ -1340,9 +1328,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) if (bch2_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch2_copygc_start(c, ca)) - return "error starting copygc thread"; - return NULL; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ac8cf6dcec3d..058e2137f0c9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -379,6 +379,7 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + sysfs_pd_controller_show(copy_gc, &c->copygc_pd); if (attr == &sysfs_rebalance_work) return bch2_rebalance_work_show(c, buf); @@ -460,14 +461,11 @@ STORE(bch2_fs) } if (attr == &sysfs_copy_gc_enabled) { - struct bch_dev *ca; - unsigned i; ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; - for_each_member_device(ca, c, i) - if (ca->copygc_thread) - wake_up_process(ca->copygc_thread); + if (c->copygc_thread) + wake_up_process(c->copygc_thread); return ret; } @@ -482,6 +480,7 @@ STORE(bch2_fs) sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + sysfs_pd_controller_store(copy_gc, &c->copygc_pd); sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); @@ 
-607,6 +606,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_enabled, &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + sysfs_pd_controller_files(copy_gc), &sysfs_new_stripes, @@ -882,7 +882,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.sectors[BCH_DATA_cached], stats.sectors_ec, stats.sectors_fragmented, - ca->copygc_threshold, + c->copygc_threshold, c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_OPEN_BUCKET_RESERVE, @@ -949,8 +949,6 @@ SHOW(bch2_dev) return out.pos - buf; } - sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_cache_replacement_policy) { bch2_string_opt_to_text(&out, bch2_cache_replacement_policies, @@ -1004,8 +1002,6 @@ STORE(bch2_dev) struct bch_fs *c = ca->fs; struct bch_member *mi; - sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1090,8 +1086,6 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_wake_allocator, - - sysfs_pd_controller_files(copy_gc), NULL }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d109ef174fd0..5c57b6efaaf3 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -470,10 +470,10 @@ TRACE_EVENT(move_data, ); TRACE_EVENT(copygc, - TP_PROTO(struct bch_dev *ca, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 sectors_not_moved, u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(ca, + TP_ARGS(c, sectors_moved, sectors_not_moved, buckets_moved, buckets_not_moved), @@ -486,7 +486,7 @@ TRACE_EVENT(copygc, ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->sectors_not_moved = sectors_not_moved; __entry->buckets_moved = buckets_moved; -- cgit From f621e1521c3ff5ea295a97d7d71cdbe84f496467 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jul 2020 15:51:05 -0400 Subject: bcachefs: Add an option for rebuilding the replicas section There is a bug where we can end up clearing the data_has field in the superblock members section, which causes us to skip reading the journal and thus journal replay fails. This option tells the recovery path to not trust those fields.
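A rough paraphrase of how the pieces fit together, stitched from the recovery.c hunk below and the existing journal-read check; this is a sketch, not a verbatim quote of either file. The mount option simply forces the BCH_FS_REBUILD_REPLICAS flag, which in turn stops journal read from skipping devices whose superblock member entry claims they hold no journal data:

	/* recovery: distrust the member section's data_has bits when asked to */
	if (!c->replicas.entries || c->opts.rebuild_replicas)
		set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);

	/* journal read: a device is only skipped when the flag is not set */
	if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
	    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
		continue;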
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/recovery.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index bc274918e18c..2f93238891b0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -260,6 +260,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1695a609ecd9..28972f30e198 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -974,7 +974,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!c->replicas.entries) { + if (!c->replicas.entries || + c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } -- cgit From e8306e3b4ca2d6cb325136afe5631247466ad176 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jul 2020 13:00:15 -0400 Subject: bcachefs: Wrap write path in memalloc_nofs_save() This fixes a lockdep splat where we're allocating memory with vmalloc in the compression bounce path, which doesn't always obey GFP_NOFS. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 6 +----- fs/bcachefs/io.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 920460a182b4..595d76aa3956 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -7,7 +7,6 @@ #include "super-io.h" #include -#include #include #include @@ -64,7 +63,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bbuf ret; struct bio_vec bv; struct bvec_iter iter; - unsigned nr_pages = 0, flags; + unsigned nr_pages = 0; struct page *stack_pages[16]; struct page **pages = NULL; void *data; @@ -104,10 +103,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, __bio_for_each_segment(bv, bio, iter, start) pages[nr_pages++] = bv.bv_page; - flags = memalloc_nofs_save(); data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - memalloc_nofs_restore(flags); - if (pages != stack_pages) kfree(pages); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index be59b615b2db..ae2688d3aee6 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -32,6 +32,7 @@ #include #include +#include #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -1052,7 +1053,10 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; struct bio *bio; bool skip_put = true; + unsigned nofs_flags; int ret; + + nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); @@ -1134,13 +1138,15 @@ again: if (!skip_put) continue_at(cl, bch2_write_index, index_update_wq(op)); +out: + memalloc_nofs_restore(nofs_flags); return; err: op->error = ret; op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); - return; + goto out; flush_io: /* * If the write can't all be submitted at once, we generally want to @@ -1151,7 +1157,7 @@ flush_io: */ if (current->flags & PF_WQ_WORKER) { continue_at(cl, bch2_write_index, index_update_wq(op)); - return; + goto out; } closure_sync(cl); @@ -1162,7 +1168,7 @@ flush_io: if (op->error) { 
op->flags |= BCH_WRITE_DONE; continue_at_nobarrier(cl, bch2_write_done, NULL); - return; + goto out; } } -- cgit From a2b5313a39dfb0c027ef3f9d79efa531e1c0a736 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jul 2020 11:51:17 -0400 Subject: bcachefs: Fix a faulty assertion Now that updates to interior nodes are journalled, we shouldn't be checking topology of interior nodes until we've finished replaying updates to that node. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 05d20a6f5efd..81386b26f369 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,7 +26,7 @@ /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct btree *b) +static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG struct bpos next_node = b->data->min_key; @@ -37,6 +37,9 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->c.level); + if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + return; + bch2_btree_node_iter_init_from_start(&iter, b); while (1) { @@ -1120,8 +1123,8 @@ static struct btree *__btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n2); if (n1->c.level) { - btree_node_interior_verify(n1); - btree_node_interior_verify(n2); + btree_node_interior_verify(as->c, n1); + btree_node_interior_verify(as->c, n2); } return n2; @@ -1180,7 +1183,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(b->nsets != 1 || b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); - btree_node_interior_verify(b); + btree_node_interior_verify(as->c, b); } static void btree_split(struct btree_update *as, struct btree *b, @@ -1378,7 +1381,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_unlock_write(b, iter); - btree_node_interior_verify(b); + btree_node_interior_verify(c, b); /* * when called from the btree_split path the new nodes aren't added to -- cgit From 63b214e75b1c941d3fc81da5b7fc4aa997e40873 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jul 2020 13:34:22 -0400 Subject: bcachefs: Add bch2_blk_status_to_str() We define our own BLK_STS_REMOVED, so we need our own to_str helper too. 
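For context, a compact restatement of the two pieces this patch pairs up, taken from io.h and io.c in the diff below; nothing here goes beyond the diff. bcachefs reserves a private status value that the block layer's table knows nothing about, so it needs its own stringifier and falls back to the generic helper for everything else:

	/* io.h: bcachefs-private status code */
	#define BLK_STS_REMOVED		((__force blk_status_t)128)

	/* io.c: translate the private code, defer the rest to the block layer */
	const char *bch2_blk_status_to_str(blk_status_t status)
	{
		if (status == BLK_STS_REMOVED)
			return "device removed";
		return blk_status_to_str(status);
	}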
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/io.c | 11 +++++++++-- fs/bcachefs/io.h | 2 ++ fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/super-io.c | 2 +- 6 files changed, 17 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a7d150811f7a..c8870a15a44f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1149,7 +1149,7 @@ static void btree_node_read_work(struct work_struct *work) } start: bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", - blk_status_to_str(bio->bi_status)); + bch2_blk_status_to_str(bio->bi_status)); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1435,7 +1435,7 @@ static void btree_node_write_endio(struct bio *bio) bch2_latency_acct(ca, wbio->submit_time, WRITE); if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", - blk_status_to_str(bio->bi_status)) || + bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 54a95edc9901..8c04e7ced88b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -328,7 +328,7 @@ static void ec_block_endio(struct bio *bio) if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", bio_data_dir(bio) ? "write" : "read", - blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); bio_put(&ec_bio->bio); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ae2688d3aee6..7e57ca2e1071 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -34,6 +34,13 @@ #include #include +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -626,7 +633,7 @@ static void bch2_write_endio(struct bio *bio) struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", - blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); if (wbio->have_ioref) { @@ -1921,7 +1928,7 @@ static void bch2_read_endio(struct bio *bio) rbio->bio.bi_end_io = rbio->end_io; if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", - blk_status_to_str(bio->bi_status))) { + bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index b8fbfef29176..be4aa3875360 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -26,6 +26,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, #define BLK_STS_REMOVED ((__force blk_status_t)128) +const char *bch2_blk_status_to_str(blk_status_t); + enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a4c2b80e8aa5..1e505f294095 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "checksum.h" #include "error.h" +#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -961,7 +962,7 @@ static void journal_write_endio(struct bio *bio) struct journal *j = 
&ca->fs->journal; if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", - blk_status_to_str(bio->bi_status)) || + bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { struct journal_buf *w = journal_prev_buf(j); unsigned long flags; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0913ffd23776..5406315340e1 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -641,7 +641,7 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", - blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; closure_put(&ca->fs->sb_write); -- cgit From 8f3b41ab4f39f87712ed57e0443642d7bcabd1ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Jul 2020 18:52:14 -0400 Subject: bcachefs: Don't restrict copygc writes to the same device This no longer makes any sense, since copygc is now one thread per filesystem, not per device, with a single write point. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 92 +++++++++++++++++++++--------------------- fs/bcachefs/alloc_foreground.h | 16 ++++++-- fs/bcachefs/move.c | 9 +++-- fs/bcachefs/movinggc.c | 2 +- 4 files changed, 66 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3ea28a79b8c9..747e86d5cd97 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -70,12 +70,6 @@ #include #include -enum bucket_alloc_ret { - ALLOC_SUCCESS, - OPEN_BUCKETS_EMPTY, - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - /* * Open buckets represent a bucket that's currently being allocated from. They * serve two purposes: @@ -395,21 +389,22 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -int bch2_bucket_alloc_set(struct bch_fs *c, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) +enum bucket_alloc_ret +bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; - bool alloc_failure = false; + enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -427,16 +422,10 @@ int bch2_bucket_alloc_set(struct bch_fs *c, ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { - enum bucket_alloc_ret ret = -PTR_ERR(ob); - - WARN_ON(reserve == RESERVE_MOVINGGC && - ret != OPEN_BUCKETS_EMPTY); + ret = -PTR_ERR(ob); if (cl) - return -EAGAIN; - if (ret == OPEN_BUCKETS_EMPTY) - return -ENOSPC; - alloc_failure = true; + return ret; continue; } @@ -446,10 +435,10 @@ int bch2_bucket_alloc_set(struct bch_fs *c, bch2_dev_stripe_increment(c, ca, stripe); if (*nr_effective >= nr_replicas) - return 0; + return ALLOC_SUCCESS; } - return alloc_failure ? 
-ENOSPC : -EROFS; + return ret; } /* Allocate from stripes: */ @@ -546,24 +535,25 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static int open_bucket_add_buckets(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *_cl) +static enum bucket_alloc_ret +open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *_cl) { struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; + enum bucket_alloc_ret ret; unsigned i; - int ret; rcu_read_lock(); devs = target_rw_devs(c, wp->type, target); @@ -608,7 +598,7 @@ retry_blocking: ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != -EROFS && !cl && _cl) { + if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { cl = _cl; goto retry_blocking; } @@ -799,7 +789,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective, write_points_nr; unsigned ob_flags = 0; bool have_cache; - int ret, i; + enum bucket_alloc_ret ret; + int i; if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ob_flags |= BUCKET_ALLOC_USE_DURABILITY; @@ -844,10 +835,13 @@ retry: alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); + WARN_ON(reserve == RESERVE_MOVINGGC && + ret == FREELIST_EMPTY); + if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == -EROFS && + if (ret == INSUFFICIENT_DEVICES && nr_effective >= nr_replicas_required) ret = 0; @@ -881,11 +875,19 @@ err: mutex_unlock(&wp->lock); - if (ret == -ENOSPC && + if (ret == FREELIST_EMPTY && try_decrease_writepoints(c, write_points_nr)) goto retry; - return ERR_PTR(ret); + switch (ret) { + case OPEN_BUCKETS_EMPTY: + case FREELIST_EMPTY: + return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); + case INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: + BUG(); + } } /* diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 17a6869bb8cd..e8357ec0b333 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -12,6 +12,13 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +enum bucket_alloc_ret { + ALLOC_SUCCESS, + OPEN_BUCKETS_EMPTY, + FREELIST_EMPTY, /* Allocator thread not keeping up */ + INSUFFICIENT_DEVICES, +}; + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -92,10 +99,11 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } -int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum alloc_reserve, - unsigned, struct closure *); +enum bucket_alloc_ret +bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, unsigned, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 62626cc13ced..6a43a89e0fdd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -246,11 +246,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + } - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| - BCH_WRITE_PAGES_STABLE| + m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index c33b58dc5c50..3c87e4b8da33 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -105,7 +105,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, return DATA_SKIP; /* XXX: use io_opts for this inode */ - data_opts->target = dev_to_target(dev_idx); + data_opts->target = io_opts->background_target; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = dev_idx; return DATA_REWRITE; -- cgit From 988e98cfce26ecad20595cb52056759e798cd8de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Jul 2020 16:13:52 -0400 Subject: bcachefs: Refactor replicas code Awhile back the mechanism for garbage collecting unused replicas entries was significantly improved, but some cleanup was missed - this patch does that now. This is also prep work for a patch to account for erasure coded parity blocks separately - we need to consolidate the logic for checking/marking the various replicas entries from one bkey into a single function. 
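
[Editor's note: a rough standalone sketch of the consolidation this commit message describes. All names and types below are invented for illustration and are not the actual bcachefs API; the real code is in the replicas.c hunks that follow. The idea is that the check-only path and the mark path walk the same set of replicas entries derived from a key, so both can be expressed as one walker parameterized by a check flag.]

#include <stdbool.h>
#include <stddef.h>

/* Invented stand-ins for replicas entries derived from a key. */
struct repl_entry { bool marked; };
struct key { struct repl_entry entries[4]; size_t nr; };

static int mark_entry_slowpath(struct repl_entry *e)
{
	e->marked = true;	/* the real code updates the superblock here */
	return 0;
}

/*
 * One walker for both callers: with check=true it only reports whether
 * every entry is already marked, with check=false it marks whatever is
 * missing.
 */
static int __mark_key_replicas(struct key *k, bool check)
{
	size_t i;
	int ret;

	for (i = 0; i < k->nr; i++) {
		struct repl_entry *e = &k->entries[i];

		if (e->marked)
			continue;
		if (check)
			return -1;	/* caller only wanted a yes/no answer */
		ret = mark_entry_slowpath(e);
		if (ret)
			return ret;
	}
	return 0;
}

static bool key_replicas_marked(struct key *k)
{
	return __mark_key_replicas(k, true) == 0;
}

static int mark_key_replicas(struct key *k)
{
	return __mark_key_replicas(k, false);
}
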
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/extents.c | 10 ------ fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/replicas.c | 79 +++++++++++++++++------------------------------- fs/bcachefs/replicas.h | 8 ++--- 5 files changed, 31 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 36fa4853e8a1..cebba06f3a96 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -111,7 +111,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0fae8d76365e..02618b9c918c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -178,11 +178,6 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); @@ -266,11 +261,6 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - extent_for_each_ptr_decode(e, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1e505f294095..b43f69c19b0f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -699,7 +699,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, "superblock not marked as containing replicas %s", (bch2_replicas_entry_to_text(&PBUF(buf), &replicas.e), buf)))) { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 91e050732aaf..db0665abd60b 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -213,29 +213,20 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -static bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search) { + bool marked; + if (!search->nr_devs) return true; verify_replicas_entry(search); - return __replicas_has_entry(&c->replicas, search) && - (!check_gc_replicas || - likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) -{ - bool marked; - percpu_down_read(&c->mark_lock); - marked = 
bch2_replicas_marked_locked(c, search, check_gc_replicas); + marked = __replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); percpu_up_read(&c->mark_lock); return marked; @@ -426,66 +417,50 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r) +static int __bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *r, + bool check) { - return likely(bch2_replicas_marked(c, r, true)) - ? 0 + return likely(bch2_replicas_marked(c, r)) ? 0 + : check ? -1 : bch2_mark_replicas_slowpath(c, r); } -bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ + return __bch2_mark_replicas(c, r, false); +} + +static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + bool check) { struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; + int ret; for (i = 0; i < cached.nr; i++) { bch2_replicas_entry_cached(&search.e, cached.devs[i]); - if (!bch2_replicas_marked_locked(c, &search.e, - check_gc_replicas)) - return false; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; } bch2_bkey_to_replicas(&search.e, k); - return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); + return __bch2_mark_replicas(c, &search.e, check); } bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) + struct bkey_s_c k) { - bool marked; - - percpu_down_read(&c->mark_lock); - marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); - percpu_up_read(&c->mark_lock); - - return marked; + return __bch2_mark_bkey_replicas(c, k, true) == 0; } int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - for (i = 0; i < cached.nr; i++) { - bch2_replicas_entry_cached(&search.e, cached.devs[i]); - - ret = bch2_mark_replicas(c, &search.e); - if (ret) - return ret; - } - - bch2_bkey_to_replicas(&search.e, k); - - return bch2_mark_replicas(c, &search.e); + return __bch2_mark_bkey_replicas(c, k, false); } int bch2_replicas_gc_end(struct bch_fs *c, int ret) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index deda5f5c6e20..8b95164fbb56 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -21,16 +21,12 @@ int bch2_replicas_entry_idx(struct bch_fs *, void bch2_devlist_to_replicas(struct bch_replicas_entry *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, - struct bch_replicas_entry *, bool); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -bool bch2_bkey_replicas_marked_locked(struct bch_fs *, - struct bkey_s_c, bool); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -bool bch2_bkey_replicas_marked(struct bch_fs *, - struct bkey_s_c, bool); +bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, -- cgit From 4fe7efa17713e3d0eecbd106adff3d1b039cc554 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jul 2020 18:26:04 -0400 Subject: bcachefs: Fix an error path We were missing a 'goto 
retry' and continuing on with an error pointer. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 81386b26f369..5317f29c2776 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1869,7 +1869,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, new_hash = bch2_btree_node_mem_alloc(c); } - +retry: as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? btree_update_reserve_required(c, parent) : 0, BTREE_INSERT_NOFAIL| @@ -1882,16 +1882,17 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret == -EAGAIN) ret = -EINTR; - if (ret != -EINTR) - goto err; + if (ret == -EINTR) { + bch2_trans_unlock(iter->trans); + up_read(&c->gc_lock); + closure_sync(&cl); + down_read(&c->gc_lock); - bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); - closure_sync(&cl); - down_read(&c->gc_lock); + if (bch2_trans_relock(iter->trans)) + goto retry; + } - if (!bch2_trans_relock(iter->trans)) - goto err; + goto err; } ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); -- cgit From 3d080aa52f6c1bf10734ec4464a2204cbbd80671 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jul 2020 13:27:00 -0400 Subject: bcachefs: Delete unused arguments Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/alloc_foreground.c | 6 +++--- fs/bcachefs/alloc_foreground.h | 3 +-- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/buckets.h | 10 +++++----- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/sysfs.c | 4 ++-- 9 files changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d80e1edf8c44..54455f77ad2a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -45,7 +45,7 @@ static void pd_controllers_update(struct work_struct *work) unsigned i; for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + struct bch_dev_usage stats = bch2_dev_usage_read(ca); free += bucket_to_sector(ca, __dev_buckets_free(ca, stats)) << 9; @@ -514,7 +514,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(c, ca) - + available = max_t(s64, 0, dev_buckets_available(ca) - ca->inc_gen_really_needs_gc); if (available > fifo_free(&ca->free_inc) || diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 747e86d5cd97..2dc8a8ff0569 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -347,11 +347,11 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } -void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, +void bch2_dev_stripe_increment(struct bch_dev *ca, struct dev_stripe_state *stripe) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_free(c, ca); + u64 free_space = dev_buckets_free(ca); u64 free_space_inv = free_space ? 
div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -432,7 +432,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(c, ca, stripe); + bch2_dev_stripe_increment(ca, stripe); if (*nr_effective >= nr_replicas) return ALLOC_SUCCESS; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index e8357ec0b333..dc8574a1a76a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -27,8 +27,7 @@ struct dev_alloc_list { struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); -void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, - struct dev_stripe_state *); +void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c8a57b512b77..7aba1907f91d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -179,7 +179,7 @@ out_pool: return ret; } -struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { struct bch_dev_usage ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 44a5b6df8c8b..c85015071c6d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -182,7 +182,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, /* Device usage: */ -struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); void bch2_dev_usage_from_buckets(struct bch_fs *); @@ -202,9 +202,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, /* * Number of reclaimable buckets - only for use by the allocator thread: */ -static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca) { - return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); } static inline u64 __dev_buckets_free(struct bch_dev *ca, @@ -215,9 +215,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca, fifo_used(&ca->free_inc); } -static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); } /* Filesystem usage: */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index b46d32db4b58..0b1eca63f78e 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -468,7 +468,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(c, ca); + src = bch2_dev_usage_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b43f69c19b0f..9df8dd75f4ec 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -759,7 +759,7 @@ static void __journal_write_alloc(struct journal *j, sectors > ja->sectors_free) continue; - bch2_dev_stripe_increment(c, ca, &j->wp.stripe); + bch2_dev_stripe_increment(ca, &j->wp.stripe); bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) { diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 3c87e4b8da33..44360bf03d29 100644 --- a/fs/bcachefs/movinggc.c +++ 
b/fs/bcachefs/movinggc.c @@ -261,7 +261,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) u64 fragmented = 0; for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); + struct bch_dev_usage usage = bch2_dev_usage_read(ca); fragmented_allowed += ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 058e2137f0c9..911c305d372c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -826,7 +826,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + struct bch_dev_usage stats = bch2_dev_usage_read(ca); unsigned i, nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); @@ -874,7 +874,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_user], stats.buckets[BCH_DATA_cached], stats.buckets_ec, - ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, + __dev_buckets_available(ca, stats), stats.sectors[BCH_DATA_sb], stats.sectors[BCH_DATA_journal], stats.sectors[BCH_DATA_btree], -- cgit From 74ed7e560b794369adf87e0d310453bc78f4b273 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jul 2020 17:12:39 -0400 Subject: bcachefs: Don't let copygc buckets be stolen by other threads And assorted other copygc fixes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- fs/bcachefs/alloc_foreground.c | 46 +++++++++++++++++++++++++++--------------- fs/bcachefs/alloc_foreground.h | 7 ------- fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/btree_gc.c | 6 +++++- fs/bcachefs/move.c | 1 + fs/bcachefs/movinggc.c | 34 ++++++++++++++++++++++--------- fs/bcachefs/super.c | 8 ++++++++ 8 files changed, 72 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54455f77ad2a..ba7620999a8d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -518,7 +518,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_really_needs_gc); if (available > fifo_free(&ca->free_inc) || - (available && !fifo_full(&ca->free[RESERVE_BTREE]))) + (available && + (!fifo_full(&ca->free[RESERVE_BTREE]) || + !fifo_full(&ca->free[RESERVE_MOVINGGC])))) break; up_read(&c->gc_lock); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2dc8a8ff0569..926c67e87043 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -144,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) } static void open_bucket_free_unused(struct bch_fs *c, - struct open_bucket *ob, - bool may_realloc) + struct write_point *wp, + struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + bool may_realloc = wp->type == BCH_DATA_user; - BUG_ON(ca->open_buckets_partial_nr >= + BUG_ON(ca->open_buckets_partial_nr > ARRAY_SIZE(ca->open_buckets_partial)); if (ca->open_buckets_partial_nr < @@ -228,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, spin_lock(&c->freelist_lock); - if (may_alloc_partial && - ca->open_buckets_partial_nr) { - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - return ob; + if 
(may_alloc_partial) { + int i; + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } } if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { @@ -291,6 +301,7 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, .gen = buckets->b[bucket].mark.gen, @@ -835,9 +846,6 @@ retry: alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); - WARN_ON(reserve == RESERVE_MOVINGGC && - ret == FREELIST_EMPTY); - if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); @@ -850,7 +858,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user); + open_bucket_free_unused(c, wp, ob); wp->ptrs = ptrs; @@ -869,8 +877,7 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, ob, - wp->type == BCH_DATA_user); + open_bucket_free_unused(c, wp, ob); wp->ptrs = ptrs; mutex_unlock(&wp->lock); @@ -938,6 +945,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) bch2_open_buckets_put(c, &ptrs); } +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->type = type; +} + void bch2_fs_allocator_foreground_init(struct bch_fs *c) { struct open_bucket *ob; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index dc8574a1a76a..c658295cb8e0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -133,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp return (struct write_point_specifier) { .v = (unsigned long) wp }; } -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->type = type; -} - void bch2_fs_allocator_foreground_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4f1465077994..20705460bb0a 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -66,6 +66,7 @@ struct open_bucket { u8 type; unsigned valid:1; unsigned on_partial_list:1; + int alloc_reserve:3; unsigned sectors_free; struct bch_extent_ptr ptr; struct ec_stripe_new *ec; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index cebba06f3a96..4b20817402f6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -954,8 +954,10 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (btree_node_type_needs_gc(i)) { ret = bch2_gc_btree_gens(c, i); - if (ret) + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; + } } for_each_member_device(ca, c, i) { @@ -966,6 +968,8 @@ int bch2_gc_gens(struct bch_fs *c) g->oldest_gen = g->gc_gen; up_read(&ca->bucket_lock); } + + c->gc_count++; err: up_read(&c->gc_lock); return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6a43a89e0fdd..b5970f09609a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -248,6 +248,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, if 
(m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; } else { /* XXX: this should probably be passed in */ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 44360bf03d29..25ae4e195c15 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "error.h" #include "extents.h" #include "eytzinger.h" #include "io.h" @@ -104,7 +105,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, if (dev_idx < 0) return DATA_SKIP; - /* XXX: use io_opts for this inode */ data_opts->target = io_opts->background_target; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = dev_idx; @@ -123,7 +123,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static void bch2_copygc(struct bch_fs *c) +static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; @@ -153,7 +153,7 @@ static void bch2_copygc(struct bch_fs *c) free_heap(&c->copygc_heap); if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { bch_err(c, "error allocating copygc heap"); - return; + return 0; } } @@ -178,6 +178,7 @@ static void bch2_copygc(struct bch_fs *c) continue; e = (struct copygc_heap_entry) { + .dev = dev_idx, .gen = m.gen, .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), @@ -187,6 +188,11 @@ static void bch2_copygc(struct bch_fs *c) up_read(&ca->bucket_lock); } + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; + } + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors; @@ -198,7 +204,7 @@ static void bch2_copygc(struct bch_fs *c) buckets_to_move = h->used; if (!buckets_to_move) - return; + return 0; eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), @@ -214,10 +220,17 @@ static void bch2_copygc(struct bch_fs *c) down_read(&ca->bucket_lock); buckets = bucket_array(ca); for (i = h->data; i < h->data + h->used; i++) { - size_t b = sector_to_bucket(ca, i->offset); - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct bucket_mark m; + size_t b; - if (i->gen == m.gen && bucket_sectors_used(m)) { + if (i->dev != dev_idx) + continue; + + b = sector_to_bucket(ca, i->offset); + m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && + bucket_sectors_used(m)) { sectors_not_moved += bucket_sectors_used(m); buckets_not_moved++; } @@ -237,6 +250,7 @@ static void bch2_copygc(struct bch_fs *c) trace_copygc(c, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); + return 0; } /* @@ -292,7 +306,8 @@ static int bch2_copygc_thread(void *arg) continue; } - bch2_copygc(c); + if (bch2_copygc(c)) + break; } return 0; @@ -323,8 +338,7 @@ int bch2_copygc_start(struct bch_fs *c) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, c, - "bch_copygc[%s]", c->name); + t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6dc899be5bd2..084976c9ac74 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1308,12 +1308,20 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { + /* + * Device going read only means the copygc reserve get 
smaller, so we + * don't want that happening while copygc is in progress: + */ + bch2_copygc_stop(c); + /* * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); + + bch2_copygc_start(c); } static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -- cgit From 33e339619f7fda8c428daa5cb8fde7d68dad2edb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jul 2020 22:40:32 -0400 Subject: bcachefs: Fix a race with BCH_WRITE_SKIP_CLOSURE_PUT Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 7e57ca2e1071..d9e35329f707 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1110,6 +1110,16 @@ again: goto flush_io; } + /* + * It's possible for the allocator to fail, put us on the + * freelist waitlist, and then succeed in one of various retry + * paths: if that happens, we need to disable the skip_put + * optimization because otherwise there won't necessarily be a + * barrier before we free the bch_write_op: + */ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + skip_put = false; + bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); -- cgit From d3a2b5d809c1312559c1272d4a56cb31d19d3133 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jul 2020 11:31:01 -0400 Subject: bcachefs: Ensure we only allocate one EC bucket per writepoint Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 926c67e87043..169ddfad7ea0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -578,18 +578,22 @@ open_bucket_add_buckets(struct bch_fs *c, __clear_bit(ob->ptr.dev, devs.d); if (erasure_code) { - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, true); - if (*nr_effective >= nr_replicas) - return 0; + if (!ec_open_bucket(c, ptrs)) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, flags, true); + if (*nr_effective >= nr_replicas) + return 0; + } - bucket_alloc_from_stripe(c, ptrs, wp, &devs, - target, erasure_code, - nr_replicas, nr_effective, - have_cache, flags); - if (*nr_effective >= nr_replicas) - return 0; + if (!ec_open_bucket(c, ptrs)) { + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags); + if (*nr_effective >= nr_replicas) + return 0; + } } get_buckets_from_writepoint(c, ptrs, wp, &devs, -- cgit From f8058242205d59b8969990188f5590c054f90b5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jul 2020 14:19:37 -0400 Subject: bcachefs: Fix bch2_btree_node_insert_fits() It should be checking for the recently added flag btree_node_needs_rewrite. 
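
[Editor's note: a minimal sketch of the shape of this fix, with invented names rather than the real bcachefs helpers. When the same "can this node take an insert?" condition is open-coded in two places, only one copy tends to be updated when a new flag such as need_rewrite is introduced; routing both callers through a single predicate, as the hunks below do, keeps them in sync.]

#include <stdbool.h>

/* Invented stand-in for a btree node and the relevant flag. */
struct node {
	bool need_rewrite;		/* node must not take new inserts */
	unsigned u64s_remaining;	/* space left for keys, in u64s */
};

/* Single predicate: every "does this insert fit?" question goes through here. */
static bool node_insert_fits(const struct node *b, unsigned u64s)
{
	if (b->need_rewrite)
		return false;
	return u64s <= b->u64s_remaining;
}

/* Caller reuses the predicate instead of open-coding the flag check. */
static int key_can_insert(const struct node *b, unsigned u64s)
{
	return node_insert_fits(b, u64s) ? 0 : -1;	/* -1: node full */
}
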
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/btree_update_leaf.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4a5b9dcfbdd0..812bafdc2d04 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -311,7 +311,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, struct btree *b, unsigned u64s) { - if (unlikely(btree_node_fake(b))) + if (unlikely(btree_node_need_rewrite(b))) return false; return u64s <= bch_btree_keys_u64s_remaining(c, b); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 262b4f3d9469..4a0e248f6f82 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -264,14 +264,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, unsigned u64s) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; - if (unlikely(btree_node_need_rewrite(b)) || - unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) + if (!bch2_btree_node_insert_fits(c, b, u64s)) return BTREE_INSERT_BTREE_NODE_FULL; return BTREE_INSERT_OK; @@ -280,7 +278,6 @@ btree_key_can_insert(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, unsigned u64s) { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -398,8 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, i->k, u64s) - : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); + ? 
btree_key_can_insert(trans, i->iter, u64s) + : btree_key_can_insert_cached(trans, i->iter, u64s); if (ret) { *stopped_at = i; return ret; -- cgit From 760992aac852397d3d0e4b15fffc6ebc01d50e0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jul 2020 15:37:14 -0400 Subject: bcachefs: Ensure we wake up threads locking node when reusing it Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 ++ fs/bcachefs/btree_update_interior.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index dedb2790445d..6280110ba32b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -125,6 +125,8 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 5317f29c2776..d81aa039d27f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -138,8 +138,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) bch2_btree_node_hash_remove(&c->btree_cache, b); - six_lock_wakeup_all(&b->c.lock); - mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); -- cgit From 4580baec7fbee2fdceb9b5b2b337ea3734a6d2b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jul 2020 15:07:37 -0400 Subject: bcachefs: Remove some uses of PAGE_SIZE in the btree code For portability to userspace, we should try to avoid working in kernel pages. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 56 -------------------------------------------- fs/bcachefs/bset.h | 34 +++++++++++++++++++++++++-- fs/bcachefs/btree_cache.c | 57 +++++++++++++++++++++++---------------------- fs/bcachefs/btree_cache.h | 7 +----- fs/bcachefs/btree_io.c | 59 ++++++++++++++++++++++------------------------- fs/bcachefs/btree_io.h | 3 ++- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/io_types.h | 1 - 8 files changed, 93 insertions(+), 126 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 797deaf0ad2e..2894666bb77e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -301,44 +301,6 @@ struct rw_aux_tree { struct bpos k; }; -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. 
- */ - -#define BSET_CACHELINE 128 - -/* Space required for the btree node keys */ -static inline size_t btree_keys_bytes(struct btree *b) -{ - return PAGE_SIZE << b->page_order; -} - -static inline size_t btree_keys_cachelines(struct btree *b) -{ - return btree_keys_bytes(b) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) { BUG_ON(t->aux_data_offset == U16_MAX); @@ -414,24 +376,6 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -/* Memory allocation */ - -void bch2_btree_keys_free(struct btree *b) -{ - kvfree(b->aux_data); - b->aux_data = NULL; -} - -int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) -{ - b->page_order = page_order; - b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); - if (!b->aux_data) - return -ENOMEM; - - return 0; -} - void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) { unsigned i; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index a2e5e3ee68db..88f242191408 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -184,6 +184,38 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree } } +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. 
+ */ + +#define BSET_CACHELINE 128 + +static inline size_t btree_keys_cachelines(struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); static inline void @@ -334,8 +366,6 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_free(struct btree *); -int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); void bch2_btree_keys_init(struct btree *, bool *); void bch2_bset_init_first(struct btree *, struct bset *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6280110ba32b..829bff37df8d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -44,7 +44,8 @@ static void __btree_node_data_free(struct bch_fs *c, struct btree *b) kvpfree(b->data, btree_bytes(c)); b->data = NULL; - bch2_btree_keys_free(b); + kvfree(b->aux_data); + b->aux_data = NULL; } static void btree_node_data_free(struct bch_fs *c, struct btree *b) @@ -72,7 +73,7 @@ static const struct rhashtable_params bch_btree_cache_params = { .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); @@ -80,7 +81,8 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); + if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; return -ENOMEM; @@ -89,21 +91,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c) { - struct btree_cache *bc = &c->btree_cache; - - if (!__btree_node_data_alloc(c, b, gfp)) { - bc->used++; - list_move(&b->list, &bc->freeable); - } else { - list_move(&b->list, &bc->freed); - } -} - -static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b = kzalloc(sizeof(struct btree), gfp); + struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); if (!b) return NULL; @@ -112,9 +102,25 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) lockdep_set_novalidate_class(&b->c.lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} - btree_node_data_alloc(c, b, gfp); - return b->data ? 
b : NULL; +static struct btree *btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b = __btree_node_mem_alloc(c); + if (!b) + return NULL; + + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; } /* Btree in memory cache - hash table */ @@ -405,7 +411,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c, GFP_KERNEL)) { + if (!btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } @@ -421,7 +427,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) goto out; } - c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); + c->verify_data = btree_node_mem_alloc(c); if (!c->verify_data) { ret = -ENOMEM; goto out; @@ -553,21 +559,16 @@ got_node: mutex_unlock(&bc->lock); if (!b) { - b = kzalloc(sizeof(struct btree), GFP_KERNEL); + b = __btree_node_mem_alloc(c); if (!b) goto err; - bkey_btree_ptr_init(&b->key); - six_lock_init(&b->c.lock); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); } if (!b->data) { - if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) goto err; mutex_lock(&bc->lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 2160012c734f..d0d3a85bb8be 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -79,14 +79,9 @@ static inline size_t btree_max_u64s(struct bch_fs *c) return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_page_order(struct bch_fs *c) -{ - return get_order(btree_bytes(c)); -} - static inline size_t btree_pages(struct bch_fs *c) { - return 1 << btree_page_order(c); + return btree_bytes(c) / PAGE_SIZE; } static inline unsigned btree_blocks(struct bch_fs *c) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c8870a15a44f..f80b93a54c08 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -57,25 +57,25 @@ static void set_needs_whiteout(struct bset *i, int v) k->needs_whiteout = v; } -static void btree_bounce_free(struct bch_fs *c, unsigned order, +static void btree_bounce_free(struct bch_fs *c, size_t size, bool used_mempool, void *p) { if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, PAGE_SIZE << order); + vpfree(p, size); } -static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, bool *used_mempool) { unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(order > btree_page_order(c)); + BUG_ON(size > btree_bytes(c)); *used_mempool = false; - p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); @@ -125,16 +125,14 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) { struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; bool used_mempool = false; - unsigned order; + size_t bytes = b->whiteout_u64s * sizeof(u64); if (!b->whiteout_u64s) return; - order = get_order(b->whiteout_u64s * sizeof(u64)); + new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); - - ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); + ptrs = 
ptrs_end = ((void *) new_whiteouts + bytes); for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); @@ -158,7 +156,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); - btree_bounce_free(c, order, used_mempool, new_whiteouts); + btree_bounce_free(c, bytes, used_mempool, new_whiteouts); } static bool should_compact_bset(struct btree *b, struct bset_tree *t, @@ -187,7 +185,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, struct bkey_packed *whiteouts = NULL; struct bkey_packed *u_start, *u_pos; struct sort_iter sort_iter; - unsigned order, whiteout_u64s = 0, u64s; + unsigned bytes, whiteout_u64s = 0, u64s; bool used_mempool, compacting = false; BUG_ON(!btree_node_is_extents(b)); @@ -204,9 +202,9 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, sort_iter_init(&sort_iter, b); whiteout_u64s += b->whiteout_u64s; - order = get_order(whiteout_u64s * sizeof(u64)); + bytes = whiteout_u64s * sizeof(u64); - whiteouts = btree_bounce_alloc(c, order, &used_mempool); + whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); u_start = u_pos = whiteouts; memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), @@ -306,7 +304,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, unwritten_whiteouts_end(c, b), true); - btree_bounce_free(c, order, used_mempool, whiteouts); + btree_bounce_free(c, bytes, used_mempool, whiteouts); bch2_btree_build_aux_trees(b); @@ -401,7 +399,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; u64 start_time, seq = 0; - unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; + unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; @@ -416,11 +414,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, btree_bkey_last(b, t)); } - order = sorting_entire_node - ? btree_page_order(c) - : get_order(__vstruct_bytes(struct btree_node, u64s)); + bytes = sorting_entire_node + ? 
btree_bytes(c) + : __vstruct_bytes(struct btree_node, u64s); - out = btree_bounce_alloc(c, order, &used_mempool); + out = btree_bounce_alloc(c, bytes, &used_mempool); start_time = local_clock(); @@ -435,7 +433,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, out->keys.u64s = cpu_to_le16(u64s); - BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); + BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], @@ -449,7 +447,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { unsigned u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(order != btree_page_order(c)); + BUG_ON(bytes != btree_bytes(c)); /* * Our temporary buffer is the same size as the btree node's @@ -484,7 +482,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, set_btree_bset_end(b, &b->set[start_idx]); bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - btree_bounce_free(c, order, used_mempool, out); + btree_bounce_free(c, bytes, used_mempool, out); bch2_verify_btree_nr_keys(b); } @@ -1043,7 +1041,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BTREE_ERR_WANT_RETRY, c, b, NULL, "found bset signature after last bset"); - sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1061,7 +1059,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { @@ -1403,7 +1401,7 @@ static void btree_node_write_work(struct work_struct *work) struct btree *b = wbio->wbio.bio.bi_private; btree_bounce_free(c, - wbio->wbio.order, + wbio->bytes, wbio->wbio.used_mempool, wbio->data); @@ -1486,7 +1484,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; bool used_mempool; unsigned long old, new; @@ -1556,8 +1554,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } - order = get_order(bytes); - data = btree_bounce_alloc(c, order, &used_mempool); + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { bn = data; @@ -1671,7 +1668,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); wbio->data = data; - wbio->wbio.order = order; + wbio->bytes = bytes; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; @@ -1707,7 +1704,7 @@ err: set_btree_node_noevict(b); b->written += sectors_to_write; nowrite: - btree_bounce_free(c, order, used_mempool, data); + btree_bounce_free(c, bytes, used_mempool, data); btree_node_write_done(c, b); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index f3d7ec749b61..db013dc28eec 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -23,8 +23,9 @@ struct btree_read_bio { }; struct btree_write_bio { - void *data; struct work_struct work; + void *data; + 
unsigned bytes; struct bch_write_bio wbio; }; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index dd272318fba1..297cf26ca13e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -94,7 +94,7 @@ struct btree { struct btree_nr_keys nr; u16 sib_u64s[2]; u16 whiteout_u64s; - u8 page_order; + u8 byte_order; u8 unpack_fn_len; /* diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 692af6dd6031..65969eeac253 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -79,7 +79,6 @@ struct bch_write_bio { u64 submit_time; struct bch_devs_list failed; - u8 order; u8 dev; unsigned split:1, -- cgit From 7807e143849e0f86fce6ce7d4907412915d29918 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jul 2020 17:06:11 -0400 Subject: bcachefs: Convert various code to printbuf printbufs know how big the buffer is that was allocated, so we can get rid of the random PAGE_SIZEs all over the place. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 +- fs/bcachefs/btree_io.h | 2 +- fs/bcachefs/btree_update_interior.c | 7 +- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/clock.c | 7 +- fs/bcachefs/clock.h | 2 +- fs/bcachefs/ec.c | 29 +++++ fs/bcachefs/ec.h | 1 + fs/bcachefs/journal.c | 36 +++--- fs/bcachefs/journal.h | 4 +- fs/bcachefs/rebalance.c | 19 ++-- fs/bcachefs/rebalance.h | 2 +- fs/bcachefs/sysfs.c | 220 ++++++++++++++++-------------------- fs/bcachefs/util.c | 25 ++-- fs/bcachefs/util.h | 2 +- 15 files changed, 177 insertions(+), 188 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f80b93a54c08..d3ea43dd9fe6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1835,9 +1835,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c) rcu_read_unlock(); } -ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) +void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; @@ -1850,7 +1849,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) if (!(flags & (1 << BTREE_NODE_dirty))) continue; - pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", + pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", b, (flags & (1 << BTREE_NODE_dirty)) != 0, (flags & (1 << BTREE_NODE_need_write)) != 0, @@ -1861,6 +1860,4 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) b->will_make_reachable & 1); } rcu_read_unlock(); - - return out.pos - buf; } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index db013dc28eec..66ebdd39f5b3 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -140,7 +140,7 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); -ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); +void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d81aa039d27f..963213e78f31 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1974,22 +1974,19 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) six_unlock_intent(&b->c.lock); } -ssize_t bch2_btree_updates_print(struct bch_fs *c, 
char *buf) +void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - pr_buf(&out, "%p m %u w %u r %u j %llu\n", + pr_buf(out, "%p m %u w %u r %u j %llu\n", as, as->mode, as->nodes_written, atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); - - return out.pos - buf; } size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 812bafdc2d04..7668225e72c6 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -317,7 +317,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, return u64s <= bch_btree_keys_u64s_remaining(c, b); } -ssize_t bch2_btree_updates_print(struct bch_fs *, char *); +void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 163058173252..869ba1887757 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -152,9 +152,8 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) timer->fn(timer); } -ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) +void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); unsigned long now; unsigned i; @@ -162,12 +161,10 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) now = atomic_long_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - pr_buf(&out, "%ps:\t%li\n", + pr_buf(out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); - - return out.pos - buf; } void bch2_io_clock_exit(struct io_clock *clock) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index da50afe206cc..70a0f7436c84 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -30,7 +30,7 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); __ret; \ }) -ssize_t bch2_io_timers_show(struct io_clock *, char *); +void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8c04e7ced88b..61bc34225bf1 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1575,6 +1575,35 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) spin_unlock(&c->ec_stripes_heap_lock); } +void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + pr_buf(out, "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + if (h->s) + pr_buf(out, "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(h, &c->ec_stripe_new_list, list) { + pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); + } + mutex_unlock(&c->ec_stripe_new_lock); 
+} + void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index ad9078fdb045..f8fc3d616cd7 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -161,6 +161,7 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 127787cd3e03..8b0746e092de 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1137,9 +1137,8 @@ out: /* debug: */ -ssize_t bch2_journal_print_debug(struct journal *j, char *buf) +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; @@ -1149,7 +1148,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_lock(&j->lock); s = READ_ONCE(j->reservations); - pr_buf(&out, + pr_buf(out, "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" @@ -1167,31 +1166,31 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(&out, "error\n"); + pr_buf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(&out, "closed\n"); + pr_buf(out, "closed\n"); break; default: - pr_buf(&out, "%u/%u\n", + pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(&out, + pr_buf(out, "current entry refs:\t%u\n" "prev entry unwritten:\t", journal_state_count(s, s.idx)); if (s.prev_buf_unwritten) - pr_buf(&out, "yes, ref %u sectors %u\n", + pr_buf(out, "yes, ref %u sectors %u\n", journal_state_count(s, !s.idx), journal_prev_buf(j)->sectors); else - pr_buf(&out, "no\n"); + pr_buf(out, "no\n"); - pr_buf(&out, + pr_buf(out, "need write:\t\t%i\n" "replay done:\t\t%i\n", test_bit(JOURNAL_NEED_WRITE, &j->flags), @@ -1204,7 +1203,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) if (!ja->nr) continue; - pr_buf(&out, + pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" "\tavailable\t%u:%u\n" @@ -1223,34 +1222,29 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_unlock(&j->lock); rcu_read_unlock(); - - return out.pos - buf; } -ssize_t bch2_journal_print_pins(struct journal *j, char *buf) +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(&out, "%llu: count %u\n", + pr_buf(out, "%llu: count %u\n", i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); if (!list_empty(&pin_list->flushed)) - pr_buf(&out, "flushed:\n"); + pr_buf(out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); } spin_unlock(&j->lock); - - return out.pos - buf; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index f14dfa59e702..26654b9cf0ea 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -501,8 +501,8 @@ static inline void 
bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -ssize_t bch2_journal_print_debug(struct journal *, char *); -ssize_t bch2_journal_print_pins(struct journal *, char *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index eb3f7d02c791..a0bbddeac623 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -249,45 +249,42 @@ static int bch2_rebalance_thread(void *arg) return 0; } -ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) +void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", + pr_buf(out, "fullest_dev (%i):\t%s/%s\n", w.dev_most_full_idx, h1, h2); bch2_hprint(&PBUF(h1), w.total_work << 9); bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); + pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); - pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); + pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); switch (r->state) { case REBALANCE_WAITING: - pr_buf(&out, "waiting\n"); + pr_buf(out, "waiting\n"); break; case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(&out, "throttled for %lu sec or %s io\n", + pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); break; case REBALANCE_RUNNING: - pr_buf(&out, "running\n"); - pr_buf(&out, "pos %llu:%llu\n", + pr_buf(out, "running\n"); + pr_buf(out, "pos %llu:%llu\n", r->move_stats.pos.inode, r->move_stats.pos.offset); break; } - - return out.pos - buf; } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 99e2a1fb6084..7ade0bb81cce 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -19,7 +19,7 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, struct bch_io_opts *); void bch2_rebalance_add_work(struct bch_fs *, u64); -ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); +void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 911c305d372c..deaafeecba64 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -75,7 +75,6 @@ do { \ #define sysfs_hprint(file, val) \ do { \ if (attr == &sysfs_ ## file) { \ - struct printbuf out = _PBUF(buf, PAGE_SIZE); \ bch2_hprint(&out, val); \ pr_buf(&out, "\n"); \ return out.pos - buf; \ @@ -239,24 +238,22 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) +static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); if (!fs_usage) return -ENOMEM; - bch2_fs_usage_to_text(&out, c, fs_usage); + bch2_fs_usage_to_text(out, c, 
fs_usage); percpu_up_read(&c->mark_lock); kfree(fs_usage); - - return out.pos - buf; + return 0; } -static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; struct btree_iter *iter; @@ -299,59 +296,26 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (ret) return ret; - return scnprintf(buf, PAGE_SIZE, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); -} - -static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) -{ - char *out = buf, *end = buf + PAGE_SIZE; - struct ec_stripe_head *h; - struct ec_stripe_new *s; - - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - out += scnprintf(out, end - out, - "target %u algo %u redundancy %u:\n", - h->target, h->algo, h->redundancy); - - if (h->s) - out += scnprintf(out, end - out, - "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, - bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); - } - mutex_unlock(&c->ec_stripe_head_lock); - - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(h, &c->ec_stripe_new_list, list) { - out += scnprintf(out, end - out, - "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), - atomic_read(&s->pin)); - } - mutex_unlock(&c->ec_stripe_new_lock); - - return out - buf; + pr_buf(out, + "uncompressed data:\n" + " nr extents: %llu\n" + " size (bytes): %llu\n" + "compressed data:\n" + " nr extents: %llu\n" + " compressed size (bytes): %llu\n" + " uncompressed size (bytes): %llu\n", + nr_uncompressed_extents, + uncompressed_sectors << 9, + nr_compressed_extents, + compressed_sectors_compressed << 9, + compressed_sectors_uncompressed << 9); + return 0; } SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -381,8 +345,10 @@ SHOW(bch2_fs) sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); - if (attr == &sysfs_rebalance_work) - return bch2_rebalance_work_show(c, buf); + if (attr == &sysfs_rebalance_work) { + bch2_rebalance_work_to_text(&out, c); + return out.pos - buf; + } sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -392,51 +358,61 @@ SHOW(bch2_fs) /* Debugging: */ if (attr == &sysfs_alloc_debug) - return show_fs_alloc_debug(c, buf); + return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) - return bch2_journal_print_debug(&c->journal, buf); + if (attr == &sysfs_journal_debug) { + bch2_journal_debug_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == &sysfs_journal_pins) - return bch2_journal_print_pins(&c->journal, buf); + if (attr == &sysfs_journal_pins) { + bch2_journal_pins_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == &sysfs_btree_updates) - return bch2_btree_updates_print(c, buf); + if (attr == &sysfs_btree_updates) { + bch2_btree_updates_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_dirty_btree_nodes) - return 
bch2_dirty_btree_nodes_print(c, buf); + if (attr == &sysfs_dirty_btree_nodes) { + bch2_dirty_btree_nodes_to_text(&out, c); + return out.pos - buf; + } if (attr == &sysfs_btree_key_cache) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; } if (attr == &sysfs_btree_transactions) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_trans_to_text(&out, c); return out.pos - buf; } if (attr == &sysfs_stripes_heap) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_stripes_heap_to_text(&out, c); return out.pos - buf; } - if (attr == &sysfs_compression_stats) - return bch2_compression_stats(c, buf); + if (attr == &sysfs_compression_stats) { + bch2_compression_stats_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_new_stripes) - return bch2_new_stripes(c, buf); + if (attr == &sysfs_new_stripes) { + bch2_new_stripes_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_io_timers_read) - return bch2_io_timers_show(&c->io_clock[READ], buf); - if (attr == &sysfs_io_timers_write) - return bch2_io_timers_show(&c->io_clock[WRITE], buf); + if (attr == &sysfs_io_timers_read) { + bch2_io_timers_to_text(&out, &c->io_clock[READ]); + return out.pos - buf; + } + if (attr == &sysfs_io_timers_write) { + bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); + return out.pos - buf; + } #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() @@ -705,11 +681,13 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + struct printbuf out = _PBUF(buf, PAGE_SIZE); -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ - buf, PAGE_SIZE); +#define x(name) \ + if (attr == &sysfs_time_stat_##name) { \ + bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ + return out.pos - buf; \ + } BCH_TIME_STATS() #undef x @@ -762,13 +740,13 @@ static int unsigned_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, - char *buf, bucket_map_fn *fn, void *private) +static int quantiles_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_dev *ca, + bucket_map_fn *fn, void *private) { size_t i, n; /* Compute 31 quantiles */ unsigned q[31], *p; - ssize_t ret = 0; down_read(&ca->bucket_lock); n = ca->mi.nbuckets; @@ -795,35 +773,30 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, vfree(p); for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); - buf[ret - 1] = '\n'; - - return ret; + pr_buf(out, "%u ", q[i]); + pr_buf(out, "\n"); + return 0; } -static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) +static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); enum alloc_reserve i; spin_lock(&ca->fs->freelist_lock); - pr_buf(&out, "free_inc:\t%zu\t%zu\n", + pr_buf(out, "free_inc:\t%zu\t%zu\n", fifo_used(&ca->free_inc), ca->free_inc.size); for (i = 0; i < RESERVE_NR; i++) - pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, + pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, fifo_used(&ca->free[i]), ca->free[i].size); spin_unlock(&ca->fs->freelist_lock); - - return out.pos - buf; } -static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) +static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev 
*ca) { struct bch_fs *c = ca->fs; struct bch_dev_usage stats = bch2_dev_usage_read(ca); @@ -834,7 +807,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].type]++; - return scnprintf(buf, PAGE_SIZE, + pr_buf(out, "free_inc: %zu/%zu\n" "free[RESERVE_BTREE]: %zu/%zu\n" "free[RESERVE_MOVINGGC]: %zu/%zu\n" @@ -898,21 +871,18 @@ static const char * const bch2_rw[] = { NULL }; -static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); int rw, i; for (rw = 0; rw < 2; rw++) { - pr_buf(&out, "%s:\n", bch2_rw[rw]); + pr_buf(out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(&out, "%-12s:%12llu\n", + pr_buf(out, "%-12s:%12llu\n", bch2_data_types[i], percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } - - return out.pos - buf; } SHOW(bch2_dev) @@ -964,34 +934,44 @@ SHOW(bch2_dev) return out.pos - buf; } - if (attr == &sysfs_iodone) - return show_dev_iodone(ca, buf); + if (attr == &sysfs_iodone) { + dev_iodone_to_text(&out, ca); + return out.pos - buf; + } sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) - return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); - if (attr == &sysfs_io_latency_stats_write) - return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_read) { + bch2_time_stats_to_text(&out, &ca->io_latency[READ]); + return out.pos - buf; + } + if (attr == &sysfs_io_latency_stats_write) { + bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); + return out.pos - buf; + } sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); if (attr == &sysfs_bucket_quantiles_last_read) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_last_write) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_fragmentation) - return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_oldest_gen) - return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) - return show_reserve_stats(ca, buf); - if (attr == &sysfs_alloc_debug) - return show_dev_alloc_debug(ca, buf); + if (attr == &sysfs_reserve_stats) { + reserve_stats_to_text(&out, ca); + return out.pos - buf; + } + if (attr == &sysfs_alloc_debug) { + dev_alloc_debug_to_text(&out, ca); + return out.pos - buf; + } return 0; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a05ebe475c5a..6e665f7f25a3 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -320,43 +320,40 @@ static void pr_time_units(struct printbuf *out, u64 ns) pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -size_t bch2_time_stats_print(struct bch2_time_stats *stats, char *buf, size_t len) +void bch2_time_stats_to_text(struct printbuf *out, struct 
bch2_time_stats *stats) { - struct printbuf out = _PBUF(buf, len); const struct time_unit *u; u64 freq = READ_ONCE(stats->average_frequency); u64 q, last_q = 0; int i; - pr_buf(&out, "count:\t\t%llu\n", + pr_buf(out, "count:\t\t%llu\n", stats->count); - pr_buf(&out, "rate:\t\t%llu/sec\n", + pr_buf(out, "rate:\t\t%llu/sec\n", freq ? div64_u64(NSEC_PER_SEC, freq) : 0); - pr_buf(&out, "frequency:\t"); - pr_time_units(&out, freq); + pr_buf(out, "frequency:\t"); + pr_time_units(out, freq); - pr_buf(&out, "\navg duration:\t"); - pr_time_units(&out, stats->average_duration); + pr_buf(out, "\navg duration:\t"); + pr_time_units(out, stats->average_duration); - pr_buf(&out, "\nmax duration:\t"); - pr_time_units(&out, stats->max_duration); + pr_buf(out, "\nmax duration:\t"); + pr_time_units(out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - pr_buf(&out, "\nquantiles (%s):\t", u->name); + pr_buf(out, "\nquantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - pr_buf(&out, "%llu%s", + pr_buf(out, "%llu%s", div_u64(q, u->nsecs), is_last ? "\n" : " "); last_q = q; } - - return out.pos - buf; } void bch2_time_stats_exit(struct bch2_time_stats *stats) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 1780a6831136..7b7c638d8904 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -390,7 +390,7 @@ static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 sta __bch2_time_stats_update(stats, start, local_clock()); } -size_t bch2_time_stats_print(struct bch2_time_stats *, char *, size_t); +void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); -- cgit From 79e72a90936e9d19caf0ced4113e8d7659ebb2af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Aug 2020 13:37:11 -0400 Subject: bcachefs: Fix maximum btree node size Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 2f93238891b0..6aaabb24d3ed 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -83,7 +83,7 @@ enum opt_type { "size", NULL) \ x(btree_node_size, u16, \ OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_SECTORS(1, 512), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ -- cgit From 768b42a7eb91b556cfe7c467c0e7337602eb6f29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Aug 2020 13:58:36 -0400 Subject: bcachefs: Don't disallow btree writes to RO devices There's an inherent race with setting devices RO when they have dirty btree nodes on them. We already check if a btree node is on an RO device before we dirty it, so this patch just allows those writes so that we don't have errors forcing the entire filesystem read only when trying to remove a device. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d9e35329f707..0f4c5c3c408c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -481,7 +481,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->have_ioref = bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; -- cgit From 00c24f53b563dafb2de8c5f642d24ac775b4479c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Aug 2020 23:12:49 -0400 Subject: bcachefs: Fix bch2_new_stripes_to_text() painful looking typo, fortunately difficult to hit. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 61bc34225bf1..2120f0a9b424 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1594,7 +1594,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(h, &c->ec_stripe_new_list, list) { + list_for_each_entry(s, &c->ec_stripe_new_list, list) { pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", s->blocks.nr, bitmap_weight(s->blocks_allocated, -- cgit From 9f115ce9e9b57f0e55a37b657feac5663590b85e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Aug 2020 23:10:08 -0400 Subject: bcachefs: Fix a bug with the journal_seq_blacklist mechanism Previously, we would start doing btree updates before writing the first journal entry; if this was after an unclean shutdown, this could cause those btree updates to not be blacklisted. Also, move some code to headers for userspace debug tools. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 28 ---------------------------- fs/bcachefs/btree_io.h | 29 +++++++++++++++++++++++++++++ fs/bcachefs/journal_seq_blacklist.c | 9 --------- fs/bcachefs/journal_seq_blacklist.h | 9 +++++++++ fs/bcachefs/recovery.c | 5 +++++ fs/bcachefs/super.c | 9 ++++++++- 6 files changed, 51 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d3ea43dd9fe6..996fc0c34b3c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -597,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_iter_reinit_node(iter, b); } -static struct nonce btree_nonce(struct bset *i, unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -{ - struct nonce nonce = btree_nonce(i, offset); - - if (!offset) { - struct btree_node *bn = container_of(i, struct btree_node, keys); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, - bytes); - - nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); - } - - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 66ebdd39f5b3..626d0f071b70 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -5,6 +5,7 @@ #include "bkey_methods.h" #include "bset.h" #include "btree_locking.h" +#include "checksum.h" #include "extents.h" #include "io_types.h" @@ -82,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * return false; } +static inline struct nonce btree_nonce(struct bset *i, unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, + bytes); + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_build_aux_trees(struct btree *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index a21de0088753..d0f1bbf8f6a7 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -36,15 +36,6 @@ * that bset, until that btree node is rewritten. */ -static unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - static unsigned sb_blacklist_u64s(unsigned nr) { struct bch_sb_field_journal_seq_blacklist *bl; diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index 03f4b97247fd..afb886ec8e25 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -2,6 +2,15 @@ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +static inline unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +{ + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 28972f30e198..6e829bf0a31f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1039,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c) } journal_seq += 4; + + /* + * The superblock needs to be written before we do any btree + * node writes: it will be in the read_write() path + */ } ret = bch2_blacklist_table_initialize(c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 084976c9ac74..7377f44f15df 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -352,8 +352,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch2_fs_read_only_async(c); bch2_journal_halt(&c->journal); + bch2_fs_read_only_async(c); wake_up(&bch_read_only_wait); return ret; @@ -410,6 +410,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; + /* + * We need to write out a journal entry before we start doing btree + * updates, to ensure that on unclean shutdown new journal blacklist + * entries are created: + */ + bch2_journal_meta(&c->journal); + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) -- cgit From 1421bea38ace65f167a73ae3f544205766c1778c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Aug 2020 13:48:02 -0400 Subject: bcachefs: Don't block on allocations when only writing to specific device Since the copygc thread is now global and not per device, we're not freeing up space on any one device in bounded time - and indeed we never really were, since rebalance wasn't moving data around between devices with that objective. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f4c5c3c408c..525017149855 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1090,6 +1090,11 @@ again: goto err; } + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, @@ -1099,7 +1104,8 @@ again: op->nr_replicas_required, op->alloc_reserve, op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); EBUG_ON(!wp); if (unlikely(IS_ERR(wp))) { -- cgit From 142cbdff9b3d8e7fe8619d906f9eefd50b078f5f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Aug 2020 13:49:09 -0400 Subject: bcachefs: Change copygc to consider bucket fragmentation When devices have different sized buckets this is more correct. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_types.h | 1 + fs/bcachefs/movinggc.c | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 0f7fcfe29e0e..26779e94a189 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -123,6 +123,7 @@ struct disk_reservation { struct copygc_heap_entry { u8 dev; u8 gen; + u16 fragmentation; u32 sectors; u64 offset; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 25ae4e195c15..5f96f619bee0 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -44,13 +44,6 @@ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) -static inline int sectors_used_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) -{ - return cmp_int(l.sectors, r.sectors); -} - static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; @@ -123,6 +116,13 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) +{ + return cmp_int(l.fragmentation, r.fragmentation); +} + static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; @@ -180,10 +180,12 @@ static int bch2_copygc(struct bch_fs *c) e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, + .fragmentation = bucket_sectors_used(m) * (1U << 15) + / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; - heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); } up_read(&ca->bucket_lock); } @@ -197,7 +199,7 @@ static int bch2_copygc(struct bch_fs *c) sectors_to_move += i->sectors; while (sectors_to_move > sectors_reserved) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); sectors_to_move -= e.sectors; } -- cgit From 01566db2fb1fca313d75b8b849ee95bb9ec5bcd7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Aug 2020 15:00:08 -0400 Subject: bcachefs: Fix disk groups not being updated when set via sysfs Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 7 ++++++- fs/bcachefs/disk_groups.h | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index ee10308131e9..1c30065833c2 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; int v = -1; + int ret = 0; mutex_lock(&c->sb_lock); @@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) return v; } + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + goto unlock; write_sb: mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return ret; 
} int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index c8e0c37a5e1a..3d84f23c34ed 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); + +/* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, unsigned); -- cgit From 2d8c0da1a7c2601b21d3fb63b8cb4f3610cac196 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Aug 2020 15:22:24 -0400 Subject: bcachefs: Fix a couple null ptr derefs when no disk groups exist Normally successfully parsing a target means disk groups should exist, but we don't want a BUG() or null ptr deref if we end up with an invalid target. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 4 ++-- fs/bcachefs/io.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1c30065833c2..c47fa0a0f450 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - return t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; } @@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) rcu_read_lock(); g = rcu_dereference(c->disk_groups); - m = t.group < g->nr && !g->entries[t.group].deleted + m = g && t.group < g->nr && !g->entries[t.group].deleted ? 
&g->entries[t.group].devs : NULL; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 525017149855..78adccbee9d9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -55,7 +55,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) return false; rcu_read_lock(); - devs = bch2_target_to_mask(c, target); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ca = rcu_dereference(c->devs[d]); if (!ca) -- cgit From f9adbb7d5d19dfa9425fad844dce32853d077e58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Aug 2020 15:08:17 -0400 Subject: bcachefs: Add a cond_resched() to bch2_alloc_write() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ba7620999a8d..fd8b57c806cc 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -350,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); while (1) { + bch2_trans_cond_resched(&trans); + ret = bch2_alloc_write_key(&trans, iter, flags); if (ret < 0 || ret == ALLOC_END) break; -- cgit From ac7eef0318c34c87e7ef9d574175917de1817ae6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Aug 2020 22:41:35 -0400 Subject: bcachefs: Don't report inodes to statfs We don't have a limit on the number of inodes in a filesystem, so this is apparently the right way to report that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4538551ccca3..562a7a833436 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1262,8 +1262,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = usage.nr_inodes; - buf->f_ffree = U64_MAX; + buf->f_files = 0; + buf->f_ffree = 0; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -- cgit From 7af0cec3076886d16114f4ca9794dfba3674794e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Aug 2020 14:57:48 -0400 Subject: bcachefs: Some project id fixes Inode options that are accessible via the xattr interface are stored with a +1 bias, so that a value of 0 means unset. We weren't handling this consistently. 
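As a rough illustration of the convention described above (helper names here are hypothetical, not part of the patch), the +1 bias amounts to:

	/* illustrative sketch only: project IDs as seen through the
	 * xattr/fsxattr interface are stored with a +1 bias so that a
	 * stored value of 0 means "unset" */
	static inline u32 projid_to_xattr(u32 projid)
	{
		return projid + 1;
	}

	static inline u32 xattr_to_projid(u32 v)
	{
		return v ? v - 1 : 0;
	}

The fix below keeps the bias only for the value stored in the inode options, and passes the unbiased project ID to bch2_set_projid().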
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 6 +++++- fs/bcachefs/xattr.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index acc0a230ff0c..f6773783b958 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_projid >= U32_MAX) return -EINVAL; + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ s.projid = fa.fsx_projid + 1; ret = mnt_want_write_file(file); @@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, s.projid); + ret = bch2_set_projid(c, inode, fa.fsx_projid); if (ret) goto err_unlock; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 806a638508a6..c7840bb949a1 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -513,7 +513,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, mutex_lock(&inode->ei_update_lock); if (inode_opt_id == Inode_opt_project) { - ret = bch2_set_projid(c, inode, s.v); + /* + * inode fields accessible via the xattr interface are stored + * with a +1 bias, so that 0 means unset: + */ + ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); if (ret) goto err; } -- cgit From a672fb8f5deaa577197e604ad7c6e0380f153211 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Aug 2020 15:16:32 -0400 Subject: bcachefs: Make sure to go rw if lazy in fsck The paths where we delete or truncate inodes don't pass commit flags for BTREE_INSERT_LAZY_RW, so just go rw if necessary in the fsck code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++++ fs/bcachefs/super.h | 9 +++++++++ 2 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c6ca5968a2e0..5a6df3d1973a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "deleting inode %llu", u.bi_inum); + bch2_fs_lazy_rw(c); + ret = bch2_inode_rm(c, u.bi_inum); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); @@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); + bch2_fs_lazy_rw(c); + /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index b948cb0428c7..fab4bee9c90e 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -221,6 +221,15 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +/* + * Only for use in the recovery/fsck path: + */ +static inline void bch2_fs_lazy_rw(struct bch_fs *c) +{ + if (percpu_ref_is_zero(&c->writes)) + bch2_fs_read_write_early(c); +} + void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); -- cgit From ca73852a1341534e2bf00cfbdc853bb3b1095af8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Aug 2020 15:58:26 -0400 Subject: bcachefs: Improvements to the journal read error paths - Print out more information in error messages - On checksum error, keep the journal entry but mark it bad so that we can prefer entries from other devices that 
don't have bad checksums Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 84 +++++++++++++++++++++++++++++++++--------------- fs/bcachefs/journal_io.h | 2 ++ 2 files changed, 60 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9df8dd75f4ec..80c833f1390b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -28,9 +28,11 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_list *jlist, struct jset *j) + struct journal_list *jlist, struct jset *j, + bool bad) { struct journal_replay *i, *pos; + struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); __le64 last_seq; @@ -59,8 +61,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = where->next != jlist->head + ? container_of(where->next, struct journal_replay, list) + : NULL; + + /* + * Duplicate journal entries? If so we want the one that didn't have a + * checksum error: + */ + if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (i->bad) { + devs = i->devs; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } else if (bad) { + goto found; + } else { fsck_err_on(bytes != vstruct_bytes(&i->j) || memcmp(j, &i->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", @@ -68,14 +93,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, goto found; } - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } } - where = jlist->head; -add: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) { ret = -ENOMEM; @@ -83,7 +102,8 @@ add: } list_add(&i->list, where); - i->devs.nr = 0; + i->devs = devs; + i->bad = bad; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -390,6 +410,7 @@ fsck_err: } static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, struct jset *jset, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read, @@ -404,16 +425,19 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if ((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max) { - bch_err(c, "unknown journal entry version %u", jset->version); - return BCH_FSCK_UNKNOWN_VERSION; + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, c, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca->name, sector, le64_to_cpu(jset->seq), + version)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; } if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca->name, sector, le64_to_cpu(jset->seq), bytes)) { /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; } @@ -422,13 +446,15 @@ static 
int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(jset), sector)) + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca->name, sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) return JOURNAL_ENTRY_BAD; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "journal checksum bad, sector %llu", sector)) { + "%s sector %llu seq %llu: journal checksum bad", + ca->name, sector, le64_to_cpu(jset->seq))) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; @@ -439,8 +465,10 @@ static int jset_validate(struct bch_fs *c, vstruct_end(jset) - (void *) jset->encrypted_start); if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) + "invalid journal entry: last_seq > seq")) { jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } return 0; fsck_err: @@ -515,11 +543,12 @@ reread: j = buf->data; } - ret = jset_validate(c, j, offset, + ret = jset_validate(c, ca, j, offset, end - offset, sectors_read, READ); switch (ret) { case BCH_FSCK_OK: + sectors = vstruct_sectors(j, c->block_bits); break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { @@ -536,8 +565,13 @@ reread: goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ sectors = c->opts.block_size; - goto next_block; + break; default: return ret; } @@ -554,7 +588,7 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j); + ret = journal_entry_add(c, ca, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -565,8 +599,6 @@ reread: default: return ret; } - - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); offset += sectors; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 72e575f360af..6958ee0f8cf2 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,8 @@ struct journal_replay { struct list_head list; struct bch_devs_list devs; + /* checksum error, but we may want to try using it anyways: */ + bool bad; /* must be last: */ struct jset j; }; -- cgit From 625104ea21386361b60d20ae696b9df6111236f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Sep 2020 22:58:28 -0400 Subject: bcachefs: Don't fail mount if device has been removed Also - make sure to show the devices we actually have open in /proc Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 20 +++++++++++++++++++- fs/bcachefs/super.c | 16 ++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 562a7a833436..a4a3085e5185 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1428,6 +1428,24 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) return ret; } +static int bch2_show_devname(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + struct bch_dev *ca; + unsigned i; + bool first = true; + + for_each_online_member(ca, c, i) { + if (!first) + seq_putc(seq, 
':'); + first = false; + seq_puts(seq, "/dev/"); + seq_puts(seq, ca->name); + } + + return 0; +} + static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -1451,7 +1469,6 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) } return 0; - } static const struct super_operations bch_super_operations = { @@ -1461,6 +1478,7 @@ static const struct super_operations bch_super_operations = { .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, + .show_devname = bch2_show_devname, .show_options = bch2_show_options, .remount_fs = bch2_remount, #if 0 diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7377f44f15df..cb2b719165ce 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1790,7 +1790,6 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) { - struct bch_dev *ca; dev_t dev; unsigned i; @@ -1816,6 +1815,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, { struct bch_sb_handle *sb = NULL; struct bch_fs *c = NULL; + struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; int ret = -ENOMEM; @@ -1851,10 +1851,22 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, le64_to_cpu(sb[best_sb].sb->seq)) best_sb = i; - for (i = 0; i < nr_devices; i++) { + mi = bch2_sb_get_members(sb[best_sb].sb); + + i = 0; + while (i < nr_devices) { + if (i != best_sb && + !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb[i].bdev); + bch2_free_super(&sb[i]); + array_remove_item(sb, nr_devices, i); + continue; + } + err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); if (err) goto err_print; + i++; } ret = -ENOMEM; -- cgit From d5e4dcc29cce41b4bb51bf83c54940018d57e598 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Sep 2020 18:30:32 -0400 Subject: bcachefs: Fix unmount path There was a long standing race in the mount/unmount code - the VFS intends for mount/unmount synchronizatino to be handled by the list of superblocks, but we were still holding devices open after tearing down our superblock in the unmount path. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/fs.c | 161 ++++++++++++++++++++++--------------------------- fs/bcachefs/super.c | 42 +++++++++---- fs/bcachefs/super.h | 2 + 4 files changed, 104 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index baa8801c5412..f60d530313dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -491,7 +491,6 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_BDEV_MOUNTED, BCH_FS_FIXED_GENS, BCH_FS_ALLOC_WRITTEN, BCH_FS_REBUILD_REPLICAS, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a4a3085e5185..3239c4717cc6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1300,91 +1300,36 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return ERR_PTR(ret); c = bch2_dev_to_fs(dev); - return c ?: ERR_PTR(-ENOENT); -} - -static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, - unsigned nr_devs, struct bch_opts opts) -{ - struct bch_fs *c, *c1, *c2; - size_t i; - - if (!nr_devs) - return ERR_PTR(-EINVAL); - - c = bch2_fs_open(devs, nr_devs, opts); - - if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { - /* - * Already open? 
- * Look up each block device, make sure they all belong to a - * filesystem and they all belong to the _same_ filesystem - */ - - c1 = bch2_path_to_fs(devs[0]); - if (IS_ERR(c1)) - return c; - - for (i = 1; i < nr_devs; i++) { - c2 = bch2_path_to_fs(devs[i]); - if (!IS_ERR(c2)) - closure_put(&c2->cl); - - if (c1 != c2) { - closure_put(&c1->cl); - return c; - } - } - - c = c1; - } - - if (IS_ERR(c)) - return c; - - down_write(&c->state_lock); - - if (!test_bit(BCH_FS_STARTED, &c->flags)) { - up_write(&c->state_lock); + if (c) closure_put(&c->cl); - pr_err("err mounting %s: incomplete filesystem", dev_name); - return ERR_PTR(-EINVAL); - } - - up_write(&c->state_lock); - - set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); - return c; + return c ?: ERR_PTR(-ENOENT); } -static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, - struct bch_opts opts) +static char **split_devs(const char *_dev_name, unsigned *nr) { char *dev_name = NULL, **devs = NULL, *s; - struct bch_fs *c = ERR_PTR(-ENOMEM); size_t i, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) - goto err; + return NULL; for (s = dev_name; s; s = strchr(s + 1, ':')) nr_devs++; - devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); - if (!devs) - goto err; + devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); + if (!devs) { + kfree(dev_name); + return NULL; + } for (i = 0, s = dev_name; s; (s = strchr(s, ':')) && (*s++ = '\0')) devs[i++] = s; - c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); -err: - kfree(devs); - kfree(dev_name); - return c; + *nr = nr_devs; + return devs; } static int bch2_remount(struct super_block *sb, int *flags, char *data) @@ -1471,6 +1416,13 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void bch2_put_super(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + __bch2_fs_stop(c); +} + static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, @@ -1481,24 +1433,39 @@ static const struct super_operations bch_super_operations = { .show_devname = bch2_show_devname, .show_options = bch2_show_options, .remount_fs = bch2_remount, -#if 0 .put_super = bch2_put_super, +#if 0 .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, #endif }; -static int bch2_test_super(struct super_block *s, void *data) -{ - return s->s_fs_info == data; -} - static int bch2_set_super(struct super_block *s, void *data) { s->s_fs_info = data; return 0; } +static int bch2_noset_super(struct super_block *s, void *data) +{ + return -EBUSY; +} + +static int bch2_test_super(struct super_block *s, void *data) +{ + struct bch_fs *c = s->s_fs_info; + struct bch_fs **devs = data; + unsigned i; + + if (!c) + return false; + + for (i = 0; devs[i]; i++) + if (c != devs[i]) + return false; + return true; +} + static struct dentry *bch2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -1507,7 +1474,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, struct super_block *sb; struct inode *vinode; struct bch_opts opts = bch2_opts_empty(); - unsigned i; + char **devs; + struct bch_fs **devs_to_fs = NULL; + unsigned i, nr_devs; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); @@ -1516,21 +1485,41 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (ret) return ERR_PTR(ret); - c = bch2_open_as_blockdevs(dev_name, opts); - if (IS_ERR(c)) - return ERR_CAST(c); + devs = split_devs(dev_name, 
&nr_devs); + if (!devs) + return ERR_PTR(-ENOMEM); - sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); - if (IS_ERR(sb)) { - closure_put(&c->cl); - return ERR_CAST(sb); + devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); + if (!devs_to_fs) { + sb = ERR_PTR(-ENOMEM); + goto got_sb; } - BUG_ON(sb->s_fs_info != c); + for (i = 0; i < nr_devs; i++) + devs_to_fs[i] = bch2_path_to_fs(devs[i]); - if (sb->s_root) { - closure_put(&c->cl); + sb = sget(fs_type, bch2_test_super, bch2_noset_super, + flags|SB_NOSEC, devs_to_fs); + if (!IS_ERR(sb)) + goto got_sb; + + c = bch2_fs_open(devs, nr_devs, opts); + + if (!IS_ERR(c)) + sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + else + sb = ERR_CAST(c); +got_sb: + kfree(devs_to_fs); + kfree(devs[0]); + kfree(devs); + + if (IS_ERR(sb)) + return ERR_CAST(sb); + + c = sb->s_fs_info; + if (sb->s_root) { if ((flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; @@ -1603,11 +1592,7 @@ static void bch2_kill_sb(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; generic_shutdown_super(sb); - - if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch2_fs_stop(c); - else - closure_put(&c->cl); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cb2b719165ce..d0d46023163c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -465,7 +465,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) /* Filesystem startup/shutdown: */ -static void bch2_fs_free(struct bch_fs *c) +static void __bch2_fs_free(struct bch_fs *c) { unsigned i; @@ -522,10 +522,10 @@ static void bch2_fs_release(struct kobject *kobj) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - bch2_fs_free(c); + __bch2_fs_free(c); } -void bch2_fs_stop(struct bch_fs *c) +void __bch2_fs_stop(struct bch_fs *c) { struct bch_dev *ca; unsigned i; @@ -555,13 +555,6 @@ void bch2_fs_stop(struct bch_fs *c) kobject_put(&c->opts_dir); kobject_put(&c->internal); - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); @@ -571,16 +564,39 @@ void bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&c->btree_write_error_work); cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); +} - for (i = 0; i < c->sb.nr_devices; i++) - if (c->devs[i]) - bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + + for (i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) { + bch2_free_super(&ca->disk_sb); + bch2_dev_free(ca); + } + } bch_verbose(c, "shutdown complete"); kobject_put(&c->kobj); } +void bch2_fs_stop(struct bch_fs *c) +{ + __bch2_fs_stop(c); + bch2_fs_free(c); +} + static const char *bch2_fs_online(struct bch_fs *c) { struct bch_dev *ca; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index fab4bee9c90e..795229e2d6a1 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -230,6 +230,8 @@ static inline void bch2_fs_lazy_rw(struct bch_fs *c) bch2_fs_read_write_early(c); } +void __bch2_fs_stop(struct bch_fs *); +void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct 
bch_fs *); int bch2_fs_start(struct bch_fs *); -- cgit From 61ce38b862c17acccd0df0004d69710d8b438e99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Oct 2020 22:18:21 -0400 Subject: bcachefs: Fix journal_seq_copy() We also need to update the journal's bloom filter of inode numbers that each journal write has upudates for - in case the inode gets evicted before it gets fsynced. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 25 ++++++++++++++----------- fs/bcachefs/journal.c | 15 +++++++++++++++ fs/bcachefs/journal.h | 1 + 3 files changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3239c4717cc6..a488dcebc11a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -40,7 +40,8 @@ static void bch2_vfs_inode_init(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *); -static void journal_seq_copy(struct bch_inode_info *dst, +static void journal_seq_copy(struct bch_fs *c, + struct bch_inode_info *dst, u64 journal_seq) { u64 old, v = READ_ONCE(dst->ei_journal_seq); @@ -51,6 +52,8 @@ static void journal_seq_copy(struct bch_inode_info *dst, if (old >= journal_seq) break; } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); + + bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } static void __pagecache_lock_put(struct pagecache_lock *lock, long i) @@ -294,12 +297,12 @@ err_before_quota: if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(dir, journal_seq); + journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } bch2_vfs_inode_init(c, inode, &inode_u); - journal_seq_copy(inode, journal_seq); + journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -320,7 +323,7 @@ err_before_quota: * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(old, journal_seq); + journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -416,7 +419,7 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(inode, dir->ei_journal_seq); + journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -473,7 +476,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(inode, dir->ei_journal_seq); + journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, @@ -509,7 +512,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, if (unlikely(ret)) goto err; - journal_seq_copy(dir, inode->ei_journal_seq); + journal_seq_copy(c, dir, inode->ei_journal_seq); ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) @@ -609,22 +612,22 @@ retry: bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(src_dir, journal_seq); + journal_seq_copy(c, src_dir, journal_seq); if (src_dir != dst_dir) { bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(dst_dir, journal_seq); + journal_seq_copy(c, dst_dir, journal_seq); } bch2_inode_update_after_write(c, src_inode, 
&src_inode_u, ATTR_CTIME); - journal_seq_copy(src_inode, journal_seq); + journal_seq_copy(c, src_inode, journal_seq); if (dst_inode) { bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(dst_inode, journal_seq); + journal_seq_copy(c, dst_inode, journal_seq); } err: bch2_trans_exit(&trans); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8b0746e092de..d1e4a8162ddd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -17,6 +17,8 @@ #include "super-io.h" #include "trace.h" +static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); + static bool __journal_entry_is_open(union journal_res_state state) { return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; @@ -304,6 +306,19 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) return seq; } +void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + struct journal_buf *buf; + + spin_lock(&j->lock); + + if ((buf = journal_seq_to_buf(j, seq))) + set_bit(h, buf->has_inode); + + spin_unlock(&j->lock); +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 26654b9cf0ea..b8e74c483a23 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -147,6 +147,7 @@ static inline u64 journal_cur_seq(struct journal *j) } u64 bch2_inode_journal_seq(struct journal *, u64); +void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) { -- cgit From 9ba2eb25f017800c3d00eac2bbc3c99451c3bae2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Oct 2020 00:09:20 -0400 Subject: bcachefs: Fix __bch2_truncate_page() __bch2_truncate_page() will mark some of the blocks in a page as unallocated. But, if the page is mmapped (and writable), every block in the page needs to be marked dirty, else those blocks won't be written by __bch2_writepage(). The solution is to change those userspace mappings to RO, so that we force bch2_page_mkwrite() to be called again. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c0995723ddd2..0290f7410a5c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -2160,6 +2161,12 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); + /* + * This removes any writeable userspace mappings; we need to force + * .page_mkwrite to be called again before any mmapped writes, to + * redirty the full page: + */ + page_mkclean(page); filemap_dirty_folio(mapping, page_folio(page)); unlock: unlock_page(page); -- cgit From 5d0b7f906a7e047768f86539b10df602fcbd44b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Oct 2020 16:33:49 -0400 Subject: bcachefs: Fix a lockdep splat We can't allocate memory with GFP_FS while holding the btree cache lock, and vfree() can allocate memory. 
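The general shape of the fix is the kernel's scoped-NOFS API: bracket the locked region so that any allocation inside it is implicitly GFP_NOFS, and reclaim cannot recurse back into the filesystem while the lock is held. Below is a minimal sketch of that pattern; the lock and buffer are hypothetical stand-ins, and only memalloc_nofs_save()/memalloc_nofs_restore(), mutex_lock()/mutex_unlock() and vfree() are the real kernel interfaces.

    #include <linux/sched/mm.h>   /* memalloc_nofs_save(), memalloc_nofs_restore() */
    #include <linux/mutex.h>
    #include <linux/vmalloc.h>

    static void example_free_under_lock(struct mutex *lock, void *buf)
    {
            /*
             * vfree() may allocate, and an allocation made while 'lock' is
             * held that enters reclaim could re-enter the filesystem and try
             * to take 'lock' again; the NOFS scope makes every allocation in
             * this region implicitly GFP_NOFS so that cycle can't happen:
             */
            unsigned int flags = memalloc_nofs_save();

            mutex_lock(lock);
            vfree(buf);
            mutex_unlock(lock);

            memalloc_nofs_restore(flags);
    }
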
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 829bff37df8d..5d1a7f138a54 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -349,11 +349,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - unsigned i; + unsigned i, flags; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); + /* vfree() can allocate memory: */ + flags = memalloc_nofs_save(); mutex_lock(&bc->lock); #ifdef CONFIG_BCACHEFS_DEBUG @@ -389,6 +391,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); if (bc->table_init_done) rhashtable_destroy(&bc->table); -- cgit From 9ee38f62da22625d9cd8919d9bbe1e4f0e36b9c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Oct 2020 00:06:36 -0400 Subject: bcachefs: Fix off-by-one error in ptr gen check Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7aba1907f91d..a34a9fe5a21c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -950,7 +950,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, return -EIO; } - if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { + if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", -- cgit From c47c50f8564a5bacff1afaa2adc8f46ef49935b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Oct 2020 03:58:50 -0400 Subject: bcachefs: Fix gc of stale ptr gens Awhile back, gcing of stale pointers was split out from full mark-and-sweep gc - but, the bit to actually drop those stale pointers wasn't implemnted. Whoops. 
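Bucket generations are small wrapping counters, so "stale" and "too stale" are decided with wrap-safe signed differences rather than plain comparisons. A rough userspace sketch of the idea follows; it is illustrative only, and bcachefs's own gen_cmp()/gen_after() helpers and BUCKET_GC_GEN_MAX are the authoritative versions.

    #include <stdbool.h>
    #include <stdint.h>

    /* Wrap-safe signed distance between two 8-bit generation counters: */
    static inline int gen_delta(uint8_t bucket_gen, uint8_t ptr_gen)
    {
            return (int8_t) (bucket_gen - ptr_gen);
    }

    /* A pointer is stale once its bucket has been reused (gen moved on): */
    static inline bool ptr_is_stale(uint8_t bucket_gen, uint8_t ptr_gen)
    {
            return gen_delta(bucket_gen, ptr_gen) > 0;
    }

    /*
     * Pointers that fall too far behind have to be dropped or their keys
     * rewritten before the 8-bit counter wraps and they would look valid
     * again; that is what the gens GC pass is responsible for.
     */
    static inline bool ptr_too_stale(uint8_t bucket_gen, uint8_t ptr_gen, int max)
    {
            return gen_delta(bucket_gen, ptr_gen) > max;
    }
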
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 66 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4b20817402f6..2774f10054a9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -8,6 +8,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_methods.h" +#include "bkey_on_stack.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -890,40 +891,77 @@ out: return ret; } +static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, false); + + if (gen_after(g->mark.gen, ptr->gen) > 16) { + percpu_up_read(&c->mark_lock); + return true; + } + } + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, false); + + if (gen_after(g->gc_gen, ptr->gen)) + g->gc_gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + + return false; +} + /* * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) +static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + struct bkey_on_stack sk; + int ret = 0; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (gc_btree_gens_key(c, k)) { + bkey_on_stack_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - if (gen_after(g->mark.gen, ptr->gen) > 32) { - /* rewrite btree node */ + bch2_trans_update(&trans, iter, sk.k, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + continue; + if (ret) { + break; } } - percpu_up_read(&c->mark_lock); + + bch2_btree_iter_next(iter); } bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); + return ret; } -- cgit From 922ae9f45585500398e9a88b50d5d29a2fc721e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Jul 2020 19:49:34 -0400 Subject: bcachefs: Copy ptr->cached when migrating data Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b5970f09609a..9967dd422e4d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -94,10 +94,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) !bch2_bkey_matches_ptr(c, 
k, m->ptr, m->offset)) goto nomatch; - if (m->data_cmd == DATA_REWRITE && - !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) - goto nomatch; - bkey_reassemble(&_insert.k, k); insert = &_insert.k; @@ -109,9 +105,19 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); - if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(bkey_i_to_s(insert), - m->data_opts.rewrite_dev); + if (m->data_cmd == DATA_REWRITE) { + struct bch_extent_ptr *new_ptr, *old_ptr = (void *) + bch2_bkey_has_device(bkey_i_to_s_c(insert), + m->data_opts.rewrite_dev); + if (!old_ptr) + goto nomatch; + + if (old_ptr->cached) + extent_for_each_ptr(extent_i_to_s(new), new_ptr) + new_ptr->cached = true; + + bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + } extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { -- cgit From 505b7a4c2844d8ee0004ab905874513776bc14da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Oct 2020 15:58:36 -0400 Subject: bcachefs: Fix errors early in the fs init process At some point bch2_fs_alloc() was changed to always call bch2_fs_free() in the error path, which means we need c->cl to always be initialized. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d0d46023163c..988c678de9fc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -654,6 +654,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + c->minor = -1; c->disk_sb.fs_sb = true; @@ -785,18 +793,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_dev_alloc(c, i)) goto err; - /* - * Now that all allocations have succeeded, init various refcounty - * things that let us shutdown: - */ - closure_init(&c->cl, NULL); - - c->kobj.kset = bcachefs_kset; - kobject_init(&c->kobj, &bch2_fs_ktype); - kobject_init(&c->internal, &bch2_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); - mutex_lock(&bch_fs_list_lock); err = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); -- cgit From 97c0e19502549c0501581f9efff45022a10be2aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Oct 2020 21:48:58 -0400 Subject: bcachefs: Fix another lockdep splat vfree() can allocate memory, so we need to call memalloc_nofs_save(). 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5d1a7f138a54..90d884b18b70 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -253,7 +253,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long can_free; unsigned long touched = 0; unsigned long freed = 0; - unsigned i; + unsigned i, flags; if (btree_shrinker_disabled(c)) return SHRINK_STOP; @@ -264,6 +264,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, else if (!mutex_trylock(&bc->lock)) return -1; + flags = memalloc_nofs_save(); + /* * It's _really_ critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we @@ -327,6 +329,7 @@ restart: clear_btree_node_accessed(b); } + memalloc_nofs_restore(flags); mutex_unlock(&bc->lock); out: return (unsigned long) freed * btree_pages(c); -- cgit From 6ea873d1727af15ae429882737e8848492b63595 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Oct 2020 22:23:02 -0400 Subject: bcachefs: Fix copygc of compressed data The check for when we need to get a disk reservation was wrong. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9967dd422e4d..8aa13b41d20d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -296,14 +296,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - crc_is_compressed(p.crc) && - bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + if (p.ptr.dev == data_opts.rewrite_dev && + !p.ptr.cached && + crc_is_compressed(p.crc)) compressed_sectors += p.crc.compressed_size; if (compressed_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, - compressed_sectors, + k.k->size * m->op.nr_replicas, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; -- cgit From 9f20ed157d521c7a1af0fe01e80d9e0ee880f9f7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Oct 2020 22:50:48 -0400 Subject: bcachefs: Fix copygc dying on startup The copygc threads errors out and makes the filesystem go RO if it ever tries to run and discovers it has no reserve allocated - which is a problem if it races with the allocator thread and its reserve hasn't been filled yet. The allocator thread doesn't start filling the copygc reserve until after BCH_FS_STARTED has been set, so make sure to wake up the allocator threads after setting that and before starting copygc. 
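The ordering is the usual publish-then-wake handshake: set the flag first, then wake the thread, and have the woken thread re-check the flag before acting, so an early or spurious wakeup is harmless. A generic sketch of that handshake is below; the flag bit and thread are hypothetical, while set_bit(), test_bit() and wake_up_process() are the real kernel primitives.

    #include <linux/bitops.h>
    #include <linux/sched.h>

    #define EXAMPLE_FS_STARTED 0    /* hypothetical flag bit */

    /* Startup path: publish the flag, then kick the worker thread: */
    static void example_mark_started(unsigned long *flags,
                                     struct task_struct *worker)
    {
            set_bit(EXAMPLE_FS_STARTED, flags);
            wake_up_process(worker);
    }

    /*
     * Worker side: re-check the flag after every wakeup before touching the
     * reserve; a worker that ran before the flag was published would
     * otherwise error out, which is exactly the race being fixed here.
     */
    static bool example_worker_may_run(unsigned long *flags)
    {
            return test_bit(EXAMPLE_FS_STARTED, flags);
    }
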
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 +++- fs/bcachefs/super.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f6b9f27f0713..4f462696b747 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -61,8 +61,10 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_lock(); p = rcu_dereference(ca->alloc_thread); - if (p) + if (p) { wake_up_process(p); + ca->allocator_state = ALLOCATOR_RUNNING; + } rcu_read_unlock(); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 988c678de9fc..85ba96cb2292 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -888,6 +888,13 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); + /* + * Allocator threads don't start filling copygc reserve until after we + * set BCH_FS_STARTED - wake them now: + */ + for_each_online_member(ca, c, i) + bch2_wake_allocator(ca); + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { -- cgit From f3721e12d07ab3c3e400a1a635e999ef72780de4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Oct 2020 21:32:02 -0400 Subject: bcachefs: Perf improvements for bch_alloc_read() On large filesystems reading in the alloc info takes a significant amount of time. But we don't need to be calling into the fully general bch2_mark_key() path, just open code what we need in bch2_alloc_read_fn(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 26 ++++++++++++++++++++++---- fs/bcachefs/btree_types.h | 2 -- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/ec.c | 1 - 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd8b57c806cc..9fa7184188c2 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -209,10 +209,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, unsigned level, struct bkey_s_c k) { - if (!level) - bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked u; + + if (level || k.k->type != KEY_TYPE_alloc) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, 0); + u = bch2_alloc_unpack(k); + + g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; return 0; } @@ -223,8 +238,11 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) unsigned i; int ret = 0; + down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, NULL, bch2_alloc_read_fn); + up_read(&c->gc_lock); + if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 297cf26ca13e..b295e46de059 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -602,7 +602,6 @@ enum btree_trigger_flags { __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_ALLOC_READ, __BTREE_TRIGGER_NOATOMIC, }; @@ -614,7 +613,6 @@ enum 
btree_trigger_flags { #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) static inline bool btree_node_type_needs_gc(enum btree_node_type type) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a34a9fe5a21c..7bc51f397c7b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -482,6 +482,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } +__flatten void bch2_dev_usage_from_buckets(struct bch_fs *c) { struct bch_dev *ca; @@ -756,8 +757,7 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - if (!(flags & BTREE_TRIGGER_ALLOC_READ)) - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2120f0a9b424..0b1d0d2c323b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1497,7 +1497,6 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); if (ret) return ret; -- cgit From aa8889c07abecd7db7b2c0beb61db921fbafe04f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Dec 2020 13:35:16 -0500 Subject: bcachefs: Fix assertion popping in transaction commit path We can't be holding read locks on btree nodes when we go to take write locks: this would deadlock if another thread is holding an intent lock on the node we have a read lock on, and it tries to commit and upgrade to a write lock. But instead of triggering an assertion, if this happens we can just upgrade the read lock to an intent lock. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4a0e248f6f82..a4a5e084aad3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -503,6 +503,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, /* * Can't be holding any read locks when we go to take write locks: + * another thread could be holding an intent lock on the same node we + * have a read lock on, and it'll block trying to take a write lock + * (because we hold a read lock) and it could be blocking us by holding + * its own read lock (while we're trying to to take write locks). 
* * note - this must be done after bch2_trans_journal_preres_get_cold() * or anything else that might call bch2_trans_relock(), since that @@ -510,9 +514,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ trans_for_each_iter(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { - EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - EBUG_ON(trans->iters_live & (1ULL << iter->idx)); - bch2_btree_iter_unlock_noinline(iter); + if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || + (trans->iters_live & (1ULL << iter->idx))) { + if (!bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + } else { + bch2_btree_iter_unlock_noinline(iter); + } } } -- cgit From 8d6b6222bf168e7a0613c0baf3da30f2c7338488 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Oct 2020 21:36:26 -0400 Subject: bcachefs: Improvements to writing alloc info Now that we've got transactional alloc info updates (and have for awhile), we don't need to write it out on shutdown, and we don't need to write it out on startup except when GC found errors - this is a big improvement to mount/unmount performance. This patch also fixes a few bugs where we weren't writing out alloc info (on new filesystems, and new devices) and should have been. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 68 ++++++++++++++++++------------------------ fs/bcachefs/alloc_background.h | 3 +- fs/bcachefs/btree_gc.c | 5 +++- fs/bcachefs/ec.c | 4 +-- fs/bcachefs/ec.h | 2 +- fs/bcachefs/recovery.c | 38 +++++++++++++++++------ fs/bcachefs/super.c | 21 ++++--------- 7 files changed, 71 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9fa7184188c2..459da00457ef 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -271,12 +271,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return 0; } -enum alloc_write_ret { - ALLOC_WROTE, - ALLOC_NOWROTE, - ALLOC_END, -}; - static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) @@ -306,26 +300,17 @@ retry: old_u = bch2_alloc_unpack(k); - if (iter->pos.inode >= c->sb.nr_devices || - !c->devs[iter->pos.inode]) - return ALLOC_END; - percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); ba = bucket_array(ca); - if (iter->pos.offset >= ba->nbuckets) { - percpu_up_read(&c->mark_lock); - return ALLOC_END; - } - g = &ba->b[iter->pos.offset]; m = READ_ONCE(g->mark); new_u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return ALLOC_NOWROTE; + return 0; a = bkey_alloc_init(&alloc_key.k); a->k.p = iter->pos; @@ -343,50 +328,55 @@ err: return ret; } -int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) +int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - struct bch_dev *ca; - unsigned i; + u64 first_bucket, nbuckets; int ret = 0; + percpu_down_read(&c->mark_lock); + first_bucket = bucket_array(ca)->first_bucket; + nbuckets = bucket_array(ca)->nbuckets; + percpu_up_read(&c->mark_lock); + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + POS(ca->dev_idx, first_bucket), 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - for_each_rw_member(ca, c, i) { - unsigned first_bucket; + while (iter->pos.offset < nbuckets) { + bch2_trans_cond_resched(&trans); - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - percpu_up_read(&c->mark_lock); + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) + break; + bch2_btree_iter_next_slot(iter); + } - bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); + bch2_trans_exit(&trans); - while (1) { - bch2_trans_cond_resched(&trans); + return ret; +} - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret < 0 || ret == ALLOC_END) - break; - if (ret == ALLOC_WROTE) - *wrote = true; - bch2_btree_iter_next_slot(iter); - } +int bch2_alloc_write(struct bch_fs *c, unsigned flags) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; - if (ret < 0) { + for_each_rw_member(ca, c, i) { + bch2_dev_alloc_write(c, ca, flags); + if (ret) { percpu_ref_put(&ca->io_ref); break; } } - bch2_trans_exit(&trans); - - return ret < 0 ? ret : 0; + return ret; } /* Bucket IO clocks: */ diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 4f462696b747..56a846fde8dd 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -93,7 +93,8 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, unsigned, bool *); +int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2774f10054a9..74012bea7126 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -570,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ + ret = 1; \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -580,6 +581,7 @@ static int bch2_gc_done(struct bch_fs *c, dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ + ret = 1; \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -590,6 +592,7 @@ static int bch2_gc_done(struct bch_fs *c, bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ + ret = 1; \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -1396,7 +1399,7 @@ static int bch2_gc_thread(void *arg) #else ret = bch2_gc_gens(c); #endif - if (ret) + if (ret < 0) bch_err(c, "btree gc failed: %i", ret); debug_check_no_locks_held(); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0b1d0d2c323b..c6d6f23d3f24 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1448,7 +1448,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, return 0; } -int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) +int bch2_stripes_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; @@ -1476,8 +1476,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (ret) break; - - *wrote = true; } bch2_trans_exit(&trans); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f8fc3d616cd7..6db16cf768da 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -156,7 +156,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); struct journal_keys; int bch2_stripes_read(struct bch_fs *, struct journal_keys *); -int bch2_stripes_write(struct bch_fs *, unsigned, bool *); +int bch2_stripes_write(struct bch_fs *, unsigned); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6e829bf0a31f..d70fa968db50 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -845,9 +845,11 @@ static int verify_superblock_clean(struct bch_fs *c, } mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); + "superblock read clock %u doesn't match journal %u after clean shutdown", + clean->read_clock, j->read_clock); mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); + "superblock write clock %u doesn't match journal %u after clean shutdown", + clean->write_clock, j->write_clock); for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; @@ -961,7 +963,7 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - bool wrote = false, write_sb = false; + bool write_sb = false, need_write_alloc = false; int ret; if (c->sb.clean) @@ -1090,8 +1092,10 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret) + if (ret < 0) goto err; + if (ret) + need_write_alloc = true; bch_verbose(c, "mark and sweep done"); } @@ -1101,8 +1105,10 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, false); - if (ret) + if (ret < 0) goto err; + if (ret) + need_write_alloc = true; bch_verbose(c, "mark and sweep done"); } @@ -1126,7 +1132,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "journal replay done"); - if (!c->opts.nochanges) { + if (need_write_alloc && !c->opts.nochanges) { /* * note that even when filesystem was clean there might be work * to do here, if we ran gc (because of fsck) which recalculated @@ -1134,8 +1140,8 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch_verbose(c, "writing allocation info"); err = "error writing out alloc info"; - ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + ret = bch2_stripes_write(c, 
BTREE_INSERT_LAZY_RW) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error writing alloc info"); goto err; @@ -1281,6 +1287,20 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); + err = "error going read-write"; + ret = bch2_fs_read_write_early(c); + if (ret) + goto err; + + /* + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ + err = "error writing alloc info"; + ret = bch2_alloc_write(c, 0); + if (ret) + goto err; + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; @@ -1289,7 +1309,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, BTREE_INSERT_LAZY_RW); + NULL, NULL, 0); if (ret) goto err; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 85ba96cb2292..7656bf632d79 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -176,9 +176,7 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; - bool wrote = false; unsigned i, clean_passes = 0; - int ret; bch2_rebalance_stop(c); bch2_copygc_stop(c); @@ -197,20 +195,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) goto nowrote_alloc; - bch_verbose(c, "writing alloc info"); - /* - * This should normally just be writing the bucket read/write clocks: - */ - ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - bch_verbose(c, "writing alloc info complete"); - - if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); - - if (ret) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); @@ -1666,6 +1650,11 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); + err = "alloc write failed"; + ret = bch2_dev_alloc_write(c, ca, 0); + if (ret) + goto err; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) -- cgit From 289980195ffaa949ecd4216337a70a8e23cf8e86 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Oct 2020 16:44:27 -0400 Subject: bcachefs: Start/stop io clock hands in read/write paths This fixes a bug where the clock hands in the journal and superblock didn't match, because we were still incrementing the read clock hand while read-only. 
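The underlying rule is symmetry between the read-write and read-only transitions: anything that keeps advancing persisted state, like the clock rescale timers here, has to be started when going read-write and stopped when going read-only, so nothing moves while the filesystem cannot write it back. A tiny self-contained sketch of that pairing follows; it is a hypothetical miniature, not the bcachefs io_clock API (the real calls are the bch2_io_timer_add()/bch2_io_timer_del() pairs in the diff below).

    #include <stdbool.h>

    struct example_clock {
            unsigned long   hand;            /* value persisted while read-write */
            bool            rescale_active;  /* "timer" that advances the hand   */
    };

    static void example_go_read_write(struct example_clock *clock)
    {
            /* hand may advance again: we are able to write it back */
            clock->rescale_active = true;
    }

    static void example_go_read_only(struct example_clock *clock)
    {
            /* hand frozen: journal and superblock copies stay in sync */
            clock->rescale_active = false;
    }
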
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ------------ fs/bcachefs/super.c | 6 ++++++ 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 459da00457ef..b0448d2f1916 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1267,18 +1267,6 @@ void bch2_recalc_capacity(struct bch_fs *c) c->bucket_size_max = bucket_size_max; - if (c->capacity) { - bch2_io_timer_add(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } else { - bch2_io_timer_del(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } - /* Wake up case someone was waiting for buckets */ closure_wake_up(&c->freelist_wait); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7656bf632d79..15e760d8dd4d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -182,6 +182,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); + bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); + bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); + /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -407,6 +410,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); + bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); + for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { -- cgit From 39283c712e6df927c7c49e8b738ca110551bb399 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Oct 2020 22:36:24 -0400 Subject: bcachefs: Fix for bad stripe pointers The allocator usually doesn't increment bucket gens right away on buckets that it's about to hand out (for reasons that need to be documented), instead deferring that to whatever extent update first references that bucket. But stripe pointers reference buckets without changing bucket sector counts, meaning we could end up with a pointer in a stripe with a gen newer than the bucket it points to. Fix this by adding a transactional trigger for KEY_TYPE_stripe that just writes out the keys in the alloc btree for the buckets it points to. Also - consolidate the code that checks pointer validity. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 - fs/bcachefs/alloc_background.h | 3 + fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 5 +- fs/bcachefs/buckets.c | 283 ++++++++++++++++++++++++++-------------- 5 files changed, 190 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b0448d2f1916..8f0c1f378b77 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -497,8 +497,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) * commands to the newly free buckets, then puts them on the various freelists. 
*/ -#define BUCKET_GC_GEN_MAX 96U - /** * wait_buckets_available - wait on reclaimable buckets * diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 56a846fde8dd..66ce54724e93 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,6 +13,9 @@ struct bkey_alloc_unpacked { #undef x }; +/* How out of date a pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + /* returns true if not equal */ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b295e46de059..f02518f9d9ec 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -591,6 +591,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_EXTENTS)| \ (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_EC)| \ (1U << BKEY_TYPE_REFLINK)) enum btree_trigger_flags { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a4a5e084aad3..9c33a8be2c58 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter) static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) { - return (BTREE_NODE_TYPE_HAS_TRIGGERS & - ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + return (((BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | + (1U << BTREE_ID_EC)) & (1U << iter->btree_id); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7bc51f397c7b..80d11decb71e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -883,124 +883,140 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, p.crc.uncompressed_size); } -static void bucket_set_stripe(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_fs_usage *fs_usage, - u64 journal_seq, - unsigned flags, - bool enabled) -{ - bool gc = flags & BTREE_TRIGGER_GC; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); - struct bucket_mark new, old; - - old = bucket_cmpxchg(g, new, ({ - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - /* - * XXX write repair code for these, flag stripe as possibly bad - */ - if (old.gen != ptr->gen) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stripe with stale pointer"); -#if 0 - /* - * We'd like to check for these, but these checks don't work - * yet: - */ - if (old.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "multiple stripes using same bucket"); - - if (!old.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "deleting stripe but bucket not marked as stripe bucket"); -#endif -} - -static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) -{ - u16 *dst_sectors = !p.ptr.cached +static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 bucket_data_type, + u16 dirty_sectors, u16 cached_sectors) +{ + size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); + u16 
bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; - u16 orig_sectors = *dst_sectors; char buf[200]; - if (gen_after(p.ptr.gen, bucket_gen)) { + if (gen_after(ptr->gen, bucket_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) { + if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != p.ptr.gen && !p.ptr.cached) { + if (bucket_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != p.ptr.gen) + if (bucket_gen != ptr->gen) return 1; - if (*bucket_data_type && *bucket_data_type != ptr_data_type) { + if (bucket_data_type && ptr_data_type && + bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type], + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (checked_add(*dst_sectors, sectors)) { + if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - orig_sectors, sectors, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } + return 0; +} + +static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + unsigned flags, + bool enabled) +{ + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; + char buf[200]; + int ret; + + old = bucket_cmpxchg(g, new, ({ + ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + new.dirty_sectors, 
new.cached_sectors); + if (ret) + return ret; + + if (new.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!new.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + return 0; +} + +static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u16 *dirty_sectors, u16 *cached_sectors) +{ + u16 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, + *dirty_sectors, *cached_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; *bucket_data_type = *dirty_sectors || *cached_sectors ? ptr_data_type : 0; return 0; @@ -1025,7 +1041,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, new.v.counter = old.v.counter = v; bucket_data_type = new.data_type; - ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, + ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, &bucket_data_type, &new.dirty_sectors, &new.cached_sectors); @@ -1189,6 +1205,7 @@ static int bch2_mark_stripe(struct bch_fs *c, ? bkey_s_c_to_stripe(new).v : NULL; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; + int ret; if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", @@ -1198,9 +1215,12 @@ static int bch2_mark_stripe(struct bch_fs *c, if (!new_s) { /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) - bucket_set_stripe(c, old_s->ptrs + i, fs_usage, - journal_seq, flags, false); + for (i = 0; i < old_s->nr_blocks; i++) { + ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); + if (ret) + return ret; + } if (!gc && m->on_heap) { spin_lock(&c->ec_stripes_heap_lock); @@ -1219,11 +1239,16 @@ static int bch2_mark_stripe(struct bch_fs *c, old_s->ptrs + i, sizeof(struct bch_extent_ptr))) { - if (old_s) - bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + if (old_s) { + bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, journal_seq, flags, false); - bucket_set_stripe(c, new_s->ptrs + i, fs_usage, - journal_seq, flags, true); + if (ret) + return ret; + } + ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, + journal_seq, flags, true); + if (ret) + return ret; } } @@ -1549,23 +1574,21 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); - struct btree_iter *iter; - struct bkey_s_c k_a; - struct bkey_alloc_unpacked u; - struct bkey_i_alloc 
*a; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); struct bucket *g; + struct btree_iter *iter; + struct bkey_s_c k; int ret; - iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); if (iter) { - u = bch2_alloc_unpack(k_a); + *u = bch2_alloc_unpack(k); } else { iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, BTREE_ITER_CACHED| @@ -1575,16 +1598,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, return PTR_ERR(iter); ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; + if (ret) { + bch2_trans_iter_put(trans, iter); + return ret; + } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } - ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, + *_iter = iter; + return 0; +} + +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + int ret; + + ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (ret) + return ret; + + ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; @@ -1595,7 +1638,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; bkey_alloc_init(&a->k_i); - a->k.p = pos; + a->k.p = iter->pos; bch2_alloc_pack(a, u); bch2_trans_update(trans, iter, &a->k_i, 0); out: @@ -1716,6 +1759,44 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c k) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + struct btree_iter *iter; + unsigned i; + int ret = 0; + + /* + * The allocator code doesn't necessarily update bucket gens in the + * btree when incrementing them, right before handing out new buckets - + * we just need to persist those updates here along with the new stripe: + */ + + for (i = 0; i < s->nr_blocks && !ret; i++) { + ret = bch2_trans_start_alloc_update(trans, &iter, + &s->ptrs[i], &u); + if (ret) + break; + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto put_iter; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); +put_iter: + bch2_trans_iter_put(trans, iter); + } + + return ret; +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1815,6 +1896,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); + case KEY_TYPE_stripe: + return bch2_trans_mark_stripe(trans, k); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); -- cgit From af4d05c46b1ef2b2b43e9df1924e204efe205ec6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Jul 2020 18:31:51 -0400 Subject: bcachefs: Account for stripe parity sectors separately Instead of trying to charge EC parity to the data within the stripe (which is subject to rounding errors), let's charge it to 
the stripe itself. It should also make -ENOSPC issues easier to deal with if we charge for parity blocks up front, and means we can also make more fine grained accounting available to the user. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/buckets.c | 172 ++++++++++++++++++++++-------------------- fs/bcachefs/ec.c | 31 +++++++- fs/bcachefs/ec.h | 2 + fs/bcachefs/replicas.c | 20 ++++- 5 files changed, 142 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a5b0c308fc46..5465acd9cbe8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1036,7 +1036,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); x(journal, 2) \ x(btree, 3) \ x(user, 4) \ - x(cached, 5) + x(cached, 5) \ + x(parity, 6) enum bch_data_type { #define x(t, n) BCH_DATA_##t, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 80d11decb71e..2277143b1890 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -77,6 +77,26 @@ #include +static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + /* * Clear journal_seq_valid for buckets for which it's not needed, to prevent * wraparound: @@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - switch (e->data_type) { - case BCH_DATA_btree: - usage->btree += usage->replicas[i]; - break; - case BCH_DATA_user: - usage->data += usage->replicas[i]; - break; - case BCH_DATA_cached: - usage->cached += usage->replicas[i]; - break; - } + fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } percpu_up_write(&c->mark_lock); @@ -374,9 +384,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m, return 0; } +static inline int is_stripe_data_bucket(struct bucket_mark m) +{ + return m.stripe && m.data_type != BCH_DATA_parity; +} + static inline int bucket_stripe_sectors(struct bucket_mark m) { - return m.stripe ? m.dirty_sectors : 0; + return is_stripe_data_bucket(m) ? 
m.dirty_sectors : 0; } static inline enum bch_data_type bucket_type(struct bucket_mark m) @@ -520,17 +535,7 @@ static inline int update_replicas(struct bch_fs *c, if (!fs_usage) return 0; - switch (r->data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - } + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } @@ -958,12 +963,15 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, } static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, - const struct bch_extent_ptr *ptr, + unsigned ptr_idx, struct bch_fs_usage *fs_usage, - u64 journal_seq, - unsigned flags, + u64 journal_seq, unsigned flags, bool enabled) { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr, gc); @@ -990,6 +998,12 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); new.stripe = enabled; + + if ((flags & BTREE_TRIGGER_GC) && parity) { + new.data_type = enabled ? BCH_DATA_parity : 0; + new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; + } + if (journal_seq) { new.journal_seq_valid = 1; new.journal_seq = journal_seq; @@ -1074,12 +1088,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags, - struct bch_replicas_padded *r, - unsigned *nr_data, - unsigned *nr_parity) + s64 sectors, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + struct bch_replicas_padded r; struct stripe *m; unsigned i, blocks_nonempty = 0; @@ -1094,14 +1106,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, return -EIO; } - BUG_ON(m->r.e.data_type != data_type); - - *nr_data = m->nr_blocks - m->nr_redundant; - *nr_parity = m->nr_redundant; - *r = m->r; - m->block_sectors[p.block] += sectors; + r = m->r; + for (i = 0; i < m->nr_blocks; i++) blocks_nonempty += m->block_sectors[i] != 0; @@ -1113,6 +1121,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); + r.e.data_type = data_type; + update_replicas(c, fs_usage, &r.e, sectors); + return 0; } @@ -1158,25 +1169,11 @@ static int bch2_mark_extent(struct bch_fs *c, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - struct bch_replicas_padded ec_r; - unsigned nr_data, nr_parity; - s64 parity_sectors; - ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - fs_usage, disk_sectors, flags, - &ec_r, &nr_data, &nr_parity); + fs_usage, disk_sectors, flags); if (ret) return ret; - parity_sectors = - __ptr_disk_sectors_delta(p.crc.live_size, - offset, sectors, flags, - p.crc.compressed_size * nr_parity, - p.crc.uncompressed_size * nr_data); - - update_replicas(c, fs_usage, &ec_r.e, - disk_sectors + parity_sectors); - /* * There may be other dirty pointers in this extent, but * if so they're not required for mounting if we have an @@ -1216,7 +1213,7 @@ static int bch2_mark_stripe(struct bch_fs *c, if (!new_s) { /* Deleting: */ for (i = 0; i < old_s->nr_blocks; i++) { - ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, + ret = bucket_set_stripe(c, 
old, i, fs_usage, journal_seq, flags, false); if (ret) return ret; @@ -1228,6 +1225,10 @@ static int bch2_mark_stripe(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); } + if (gc) + update_replicas(c, fs_usage, &m->r.e, + -((s64) m->sectors * m->nr_redundant)); + memset(m, 0, sizeof(*m)); } else { BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); @@ -1240,12 +1241,12 @@ static int bch2_mark_stripe(struct bch_fs *c, sizeof(struct bch_extent_ptr))) { if (old_s) { - bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, + bucket_set_stripe(c, old, i, fs_usage, journal_seq, flags, false); if (ret) return ret; } - ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, + ret = bucket_set_stripe(c, new, i, fs_usage, journal_seq, flags, true); if (ret) return ret; @@ -1258,8 +1259,16 @@ static int bch2_mark_stripe(struct bch_fs *c, m->nr_blocks = new_s->nr_blocks; m->nr_redundant = new_s->nr_redundant; + if (gc && old_s) + update_replicas(c, fs_usage, &m->r.e, + -((s64) m->sectors * m->nr_redundant)); + bch2_bkey_to_replicas(&m->r.e, new); + if (gc) + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); + /* gc recalculates these fields: */ if (!(flags & BTREE_TRIGGER_GC)) { m->blocks_nonempty = 0; @@ -1648,15 +1657,13 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type, - struct bch_replicas_padded *r, - unsigned *nr_data, - unsigned *nr_parity) + s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_stripe *s; + struct bch_replicas_padded r; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1677,15 +1684,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(&s->v, p.block, stripe_blockcount_get(&s->v, p.block) + sectors); - - *nr_data = s->v.nr_blocks - s->v.nr_redundant; - *nr_parity = s->v.nr_redundant; - bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); bch2_trans_update(trans, iter, &s->k_i, 0); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + update_replicas_list(trans, &r.e, sectors); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1730,25 +1736,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - struct bch_replicas_padded ec_r; - unsigned nr_data, nr_parity; - s64 parity_sectors; - ret = bch2_trans_mark_stripe_ptr(trans, p.ec, - disk_sectors, data_type, - &ec_r, &nr_data, &nr_parity); + disk_sectors, data_type); if (ret) return ret; - parity_sectors = - __ptr_disk_sectors_delta(p.crc.live_size, - offset, sectors, flags, - p.crc.compressed_size * nr_parity, - p.crc.uncompressed_size * nr_data); - - update_replicas_list(trans, &ec_r.e, - disk_sectors + parity_sectors); - r.e.nr_required = 0; } } @@ -1760,15 +1752,26 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c k) + struct bkey_s_c k, + unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + struct bch_replicas_padded r; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; struct btree_iter *iter; + bool deleting = flags & BTREE_TRIGGER_OVERWRITE; + s64 sectors = le16_to_cpu(s->sectors); unsigned i; int ret = 
0; + if (deleting) + sectors = -sectors; + + bch2_bkey_to_replicas(&r.e, k); + update_replicas_list(trans, &r.e, sectors * s->nr_redundant); + /* * The allocator code doesn't necessarily update bucket gens in the * btree when incrementing them, right before handing out new buckets - @@ -1776,11 +1779,20 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, */ for (i = 0; i < s->nr_blocks && !ret; i++) { + bool parity = i >= nr_data; + ret = bch2_trans_start_alloc_update(trans, &iter, &s->ptrs[i], &u); if (ret) break; + if (parity) { + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? BCH_DATA_parity + : 0; + } + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ret = PTR_ERR_OR_ZERO(a); if (ret) @@ -1897,7 +1909,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, k); + return bch2_trans_mark_stripe(trans, k, flags); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c6d6f23d3f24..e5033b392432 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; if (!bch2_dev_get_ioref(ca, rw)) { clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); + while (offset < bytes) { unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, DIV_ROUND_UP(bytes, PAGE_SIZE)); @@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, + struct ec_stripe_new *s, struct bkey_i_stripe *stripe) { struct btree_trans trans; @@ -711,7 +717,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_commit(&trans, &s->res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = s->existing_stripe ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, - NULL, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, &s->stripe.key); + &s->res, NULL, BTREE_INSERT_NOFAIL) + : ec_stripe_bkey_insert(c, s, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) err_put_writes: percpu_ref_put(&c->writes); err: + bch2_disk_reservation_put(c, &s->res); + open_bucket_for_each(c, &s->blocks, ob, i) { ob->ec = NULL; __bch2_open_bucket_put(c, ob); @@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct open_bucket *ob; unsigned i, data_idx = 0; s64 idx; + int ret; closure_init_stack(&cl); @@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, } } + if (!h->s->existing_stripe && + !h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + if (ret) { + /* What should we do here? 
*/ + bch_err(c, "unable to create new stripe: %i", ret); + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; + + } + + } + if (new_stripe_alloc_buckets(c, h)) { bch2_ec_stripe_head_put(c, h); h = NULL; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 6db16cf768da..15f751fc2a35 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -3,6 +3,7 @@ #define _BCACHEFS_EC_H #include "ec_types.h" +#include "buckets_types.h" #include "keylist_types.h" const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); @@ -105,6 +106,7 @@ struct ec_stripe_new { struct open_buckets blocks; u8 data_block_idx[EC_STRIPE_MAX]; struct open_buckets parity; + struct disk_reservation res; struct keylist keys; u64 inline_keys[BKEY_U64s * 8]; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index db0665abd60b..f46aa1d70e35 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_stripe: - e->data_type = BCH_DATA_user; + e->data_type = BCH_DATA_parity; stripe_to_replicas(k, e); break; } @@ -449,7 +449,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_to_replicas(&search.e, k); - return __bch2_mark_replicas(c, &search.e, check); + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + + if (search.e.data_type == BCH_DATA_parity) { + search.e.data_type = BCH_DATA_cached; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + + search.e.data_type = BCH_DATA_user; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + } + + return 0; } bool bch2_bkey_replicas_marked(struct bch_fs *c, -- cgit From b88e971e45fe61fba435c65cc2f66fb3a5136461 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jul 2020 23:11:48 -0400 Subject: bcachefs: Don't drop replicas when copygcing ec data Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++++ fs/bcachefs/buckets_types.h | 2 ++ fs/bcachefs/io.c | 3 ++- fs/bcachefs/move.c | 6 +++-- fs/bcachefs/move.h | 3 ++- fs/bcachefs/movinggc.c | 61 ++++++++++++++++++++++++--------------------- fs/bcachefs/rebalance.c | 1 + 7 files changed, 50 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2277143b1890..7558e2bffbdd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -979,6 +979,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, char buf[200]; int ret; + if (enabled) + g->ec_redundancy = s->nr_redundant; + old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, new.dirty_sectors, new.cached_sectors); @@ -1010,6 +1013,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, } })); + if (!enabled) + g->ec_redundancy = 0; + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); return 0; } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 26779e94a189..9364addf8441 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -41,6 +41,7 @@ struct bucket { u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; + u8 ec_redundancy; }; struct bucket_array { @@ -123,6 +124,7 @@ struct disk_reservation { struct copygc_heap_entry { u8 dev; u8 gen; + u8 replicas; u16 fragmentation; u32 sectors; u64 offset; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 78adccbee9d9..5c12bfed3a7b 100644 --- 
a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1466,7 +1466,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, opts, DATA_PROMOTE, (struct data_opts) { - .target = opts.promote_target + .target = opts.promote_target, + .nr_replicas = 1, }, btree_id, k); BUG_ON(ret); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8aa13b41d20d..9d190ae4f391 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -265,8 +265,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL; - m->op.nr_replicas = 1; - m->op.nr_replicas_required = 1; + m->op.nr_replicas = data_opts.nr_replicas; + m->op.nr_replicas_required = data_opts.nr_replicas; m->op.index_update_fn = bch2_migrate_index_update; switch (data_cmd) { @@ -754,6 +754,7 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, return DATA_SKIP; data_opts->target = 0; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } @@ -769,6 +770,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, return DATA_SKIP; data_opts->target = 0; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; data_opts->rewrite_dev = op->migrate.dev; return DATA_REWRITE; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 0acd1720d4f8..b04bc669226d 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -20,7 +20,8 @@ enum data_cmd { struct data_opts { u16 target; - unsigned rewrite_dev; + u8 rewrite_dev; + u8 nr_replicas; int btree_insert_flags; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 5f96f619bee0..e858e2a35f8d 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -53,17 +53,21 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) cmp_int(l->offset, r->offset); } -static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) +static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct copygc_heap_entry search = { - .dev = ptr->dev, - .offset = ptr->offset + .dev = p.ptr.dev, + .offset = p.ptr.offset, }; ssize_t i = eytzinger0_find_le(h->data, h->used, @@ -81,27 +85,24 @@ static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) BUG_ON(i != j); #endif if (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen) - return ptr->dev; - } + p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && + p.ptr.gen == h->data[i].gen) { + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = p.ptr.dev; - return -1; -} + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - int dev_idx = __copygc_pred(c, k); - if (dev_idx < 0) - return DATA_SKIP; - - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; - 
data_opts->rewrite_dev = dev_idx; - return DATA_REWRITE; + data_opts->nr_replicas += m->nr_redundant; + } + + return DATA_REWRITE; + } + } + + return DATA_SKIP; } static bool have_copygc_reserve(struct bch_dev *ca) @@ -168,7 +169,8 @@ static int bch2_copygc(struct bch_fs *c) buckets = bucket_array(ca); for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct bucket *g = buckets->b + b; + struct bucket_mark m = READ_ONCE(g->mark); struct copygc_heap_entry e; if (m.owned_by_allocator || @@ -177,9 +179,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; + WARN_ON(m.stripe && !g->ec_redundancy); + e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, + .replicas = 1 + g->ec_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), @@ -196,11 +201,11 @@ static int bch2_copygc(struct bch_fs *c) } for (i = h->data; i < h->data + h->used; i++) - sectors_to_move += i->sectors; + sectors_to_move += i->sectors * i->replicas; while (sectors_to_move > sectors_reserved) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_move -= e.sectors; + sectors_to_move -= e.sectors * e.replicas; } buckets_to_move = h->used; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index a0bbddeac623..cce6f58fe609 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -73,6 +73,7 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, { if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } else { -- cgit From 5b088c1dd005ec0fbddfa3664d3095caef6ae52e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Oct 2020 18:40:30 -0400 Subject: bcachefs: Fix bch2_mark_stripe() There's no reason not to always recalculate these fields Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7558e2bffbdd..aacc20f71729 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1264,6 +1264,13 @@ static int bch2_mark_stripe(struct bch_fs *c, m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; + } if (gc && old_s) update_replicas(c, fs_usage, &m->r.e, @@ -1275,17 +1282,6 @@ static int bch2_mark_stripe(struct bch_fs *c, update_replicas(c, fs_usage, &m->r.e, ((s64) m->sectors * m->nr_redundant)); - /* gc recalculates these fields: */ - if (!(flags & BTREE_TRIGGER_GC)) { - m->blocks_nonempty = 0; - - for (i = 0; i < new_s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(new_s, i); - m->blocks_nonempty += !!m->block_sectors[i]; - } - } - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); -- cgit From a10e677a1555e070f1a7b3c1dc3e3189d462ab9e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Oct 2020 21:07:17 -0400 Subject: bcachefs: Fix for passing target= opts as mount opts Some options can't be parsed until the filesystem initialized; previously, passing these options to mount or 
remount would cause mount to fail. This changes the mount path so that we parse the options passed in twice, and just ignore any options that can't be parsed the first time. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 25 +++++++++++++++++++------ fs/bcachefs/opts.c | 7 ++++--- fs/bcachefs/opts.h | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a488dcebc11a..b214d58e94e9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1343,7 +1343,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(&opts, data); + ret = bch2_parse_mount_opts(c, &opts, data); if (ret) return ret; @@ -1484,7 +1484,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(&opts, data); + ret = bch2_parse_mount_opts(NULL, &opts, data); if (ret) return ERR_PTR(ret); @@ -1507,11 +1507,24 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, goto got_sb; c = bch2_fs_open(devs, nr_devs, opts); - - if (!IS_ERR(c)) - sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); - else + if (IS_ERR(c)) { sb = ERR_CAST(c); + goto got_sb; + } + + /* Some options can't be parsed until after the fs is started: */ + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) { + bch2_fs_stop(c); + sb = ERR_PTR(ret); + goto got_sb; + } + + bch2_opts_apply(&c->opts, opts); + + sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + if (IS_ERR(sb)) + bch2_fs_stop(c); got_sb: kfree(devs_to_fs); kfree(devs[0]); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index afe25cd26c06..97a36ac0beea 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -247,7 +247,7 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, break; case BCH_OPT_FN: if (!c) - return -EINVAL; + return 0; return opt->parse(c, val, res); } @@ -325,7 +325,8 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } -int bch2_parse_mount_opts(struct bch_opts *opts, char *options) +int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *options) { char *opt, *name, *val; int ret, id; @@ -340,7 +341,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (id < 0) goto bad_opt; - ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6aaabb24d3ed..1ddb9c57b3a5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -424,7 +424,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); -int bch2_parse_mount_opts(struct bch_opts *, char *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); /* inode opts: */ -- cgit From e00711d2cae7b8e178bb615c757260107b2d4872 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Oct 2020 16:37:17 -0400 Subject: bcachefs: Improve some error messages Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 996fc0c34b3c..eebab3d08c0d 100644 --- a/fs/bcachefs/btree_io.c +++ 
b/fs/bcachefs/btree_io.c @@ -750,7 +750,9 @@ static int validate_bset(struct bch_fs *c, struct btree *b, btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, b, i, - "incorrect max key"); + "incorrect max key %llu:%llu", + bn->max_key.inode, + bn->max_key.offset); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, @@ -930,7 +932,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), BTREE_ERR_WANT_RETRY, c, b, i, - "unknown checksum type"); + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); @@ -957,7 +960,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), BTREE_ERR_WANT_RETRY, c, b, i, - "unknown checksum type"); + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -- cgit From 13dcd4abcd8d4e177f4f75ea3f5c8838a8a8c3c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Oct 2020 20:56:47 -0400 Subject: bcachefs: Fix rare use after free in read path If the bkey_on_stack_reassemble() call in __bch2_read_indirect_extent() reallocates the buffer, k in bch2_read - which we pointed at the bkey_on_stack buffer - will now point to a stale buffer. Whoops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 ++++--- fs/bcachefs/fs.c | 11 ++++++----- fs/bcachefs/io.c | 6 ++++-- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0290f7410a5c..edc3d73d26ba 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -782,18 +782,19 @@ retry: if (ret) break; - bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_reassemble(&sk, c, k); + ret = bch2_read_indirect_extent(trans, &offset_into_extent, &sk); if (ret) break; + k = bkey_i_to_s_c(sk.k); + sectors = min(sectors, k.k->size - offset_into_extent); bch2_trans_unlock(trans); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b214d58e94e9..a61d5f8aecd6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -911,20 +911,21 @@ retry: continue; } - bkey_on_stack_realloc(&cur, c, k.k->u64s); - bkey_on_stack_realloc(&prev, c, k.k->u64s); - bkey_reassemble(cur.k, k); - k = bkey_i_to_s_c(cur.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_realloc(&cur, c, k.k->u64s); + bkey_on_stack_realloc(&prev, c, k.k->u64s); + bkey_reassemble(cur.k, k); + ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); if (ret) break; + k = bkey_i_to_s_c(cur.k); + sectors = min(sectors, k.k->size - offset_into_extent); if (offset_into_extent) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5c12bfed3a7b..03f5b9034aa7 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1667,7 +1667,6 @@ retry: unsigned bytes, sectors, offset_into_extent; bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); @@ -1678,6 +1677,8 @@ retry: if (ret) break; + k = bkey_i_to_s_c(sk.k); + sectors = min(sectors, k.k->size - offset_into_extent); 
bch2_trans_unlock(&trans); @@ -2311,13 +2312,14 @@ retry: sectors = k.k->size - offset_into_extent; bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &sk); if (ret) goto err; + k = bkey_i_to_s_c(sk.k); + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: -- cgit From 801a3de6427924d87ecc7e218a99ad3245ee8290 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Oct 2020 19:51:34 -0400 Subject: bcachefs: Indirect inline data extents When inline data extents were added, reflink was forgotten about - we need indirect inline data extents for reflink + inline data to work correctly. This patch adds them, and a new feature bit that's flipped when they're used. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 12 +++++-- fs/bcachefs/bkey.h | 1 + fs/bcachefs/bkey_methods.c | 6 +++- fs/bcachefs/buckets.c | 49 +++++++++++++++++----------- fs/bcachefs/extents.c | 16 ++++++---- fs/bcachefs/extents.h | 30 ++++++++++++++++-- fs/bcachefs/io.c | 10 +++--- fs/bcachefs/opts.h | 2 +- fs/bcachefs/reflink.c | 74 ++++++++++++++++++++++++++++++++----------- fs/bcachefs/reflink.h | 11 ++++++- 10 files changed, 156 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5465acd9cbe8..0d79bb7764a7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -344,7 +344,8 @@ static inline void bkey_init(struct bkey *k) x(reflink_p, 15) \ x(reflink_v, 16) \ x(inline_data, 17) \ - x(btree_ptr_v2, 18) + x(btree_ptr_v2, 18) \ + x(indirect_inline_data, 19) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -890,6 +891,12 @@ struct bch_reflink_v { __u64 _data[0]; }; +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[0]; +}; + /* Inline data */ struct bch_inline_data { @@ -1326,7 +1333,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(incompressible, 10) \ x(btree_ptr_v2, 11) \ x(extents_above_btree_updates, 12) \ - x(btree_updates_journalled, 13) + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 98733363c161..dea7dfe4b079 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -573,6 +573,7 @@ BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); BKEY_VAL_ACCESSORS(btree_ptr_v2); +BKEY_VAL_ACCESSORS(indirect_inline_data); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 36e0c5152b47..32849229801d 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -72,7 +72,11 @@ static const char *key_type_inline_data_invalid(const struct bch_fs *c, static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + pr_buf(out, "datalen %u: %*phN", + datalen, min(datalen, 32U), d.v->data); } #define bch2_bkey_ops_inline_data (struct bkey_ops) { \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index aacc20f71729..0dc01386d1cd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1811,6 +1811,18 @@ 
put_iter: return ret; } +static __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1819,7 +1831,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_reflink_v *r_v; + struct bkey_i *n; + __le64 *refcount; s64 ret; ret = trans_get_key(trans, BTREE_ID_REFLINK, @@ -1827,14 +1840,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret < 0) return ret; - if (k.k->type != KEY_TYPE_reflink_v) { - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - ret = -EIO; - goto err; - } - if ((flags & BTREE_TRIGGER_OVERWRITE) && (bkey_start_offset(k.k) < idx || k.k->p.offset > idx + sectors)) @@ -1842,25 +1847,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, sectors = k.k->p.offset - idx; - r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(r_v); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - bkey_reassemble(&r_v->k_i, k); + bkey_reassemble(n, k); + + refcount = bkey_refcount(n); + if (!refcount) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } - le64_add_cpu(&r_v->v.refcount, - !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1); - if (!r_v->v.refcount) { - r_v->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&r_v->k, 0); + if (!*refcount) { + n->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&n->k, 0); } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - bch2_trans_update(trans, iter, &r_v->k_i, 0); + bch2_trans_update(trans, iter, n, 0); out: ret = sectors; err: diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 02618b9c918c..15e7c49e1a9b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1199,14 +1199,14 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_inline_data: { - struct bkey_s_inline_data d = bkey_s_to_inline_data(k); + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); - sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + sub = min_t(u64, sub << 9, bytes); - memmove(d.v->data, - d.v->data + sub, - bkey_val_bytes(d.k) - sub); + memmove(p, p + sub, bytes - sub); new_val_u64s -= sub >> 3; break; @@ -1244,7 +1244,9 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) switch (k.k->type) { case KEY_TYPE_inline_data: - new_val_u64s = min(new_val_u64s, k.k->size << 6); + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 29b15365d19c..74c7bb8f9104 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -445,10 +445,35 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) } } +static inline bool bkey_extent_is_inline_data(const struct bkey *k) +{ + return k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_indirect_inline_data; +} + +static inline unsigned bkey_inline_data_offset(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_inline_data: + return sizeof(struct bch_inline_data); + case KEY_TYPE_indirect_inline_data: + return sizeof(struct bch_indirect_inline_data); + default: + BUG(); + } +} + +static inline unsigned bkey_inline_data_bytes(const struct bkey *k) +{ + return bkey_val_bytes(k) - bkey_inline_data_offset(k); +} + +#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) + static inline bool bkey_extent_is_data(const struct bkey *k) { - return bkey_extent_is_direct_data(k) || - k->type == KEY_TYPE_inline_data || + return bkey_extent_is_direct_data(k) || + bkey_extent_is_inline_data(k) || k->type == KEY_TYPE_reflink_p; } @@ -463,6 +488,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: return true; default: return false; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 03f5b9034aa7..346d77d68ade 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2000,7 +2000,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != KEY_TYPE_reflink_v) { + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { __bcache_io_error(trans->c, "pointer to nonexistent indirect extent"); ret = -EIO; @@ -2027,13 +2028,12 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - if (k.k->type == KEY_TYPE_inline_data) { - struct bkey_s_c_inline_data d = 
bkey_s_c_to_inline_data(k); + if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_val_bytes(d.k)); + bkey_inline_data_bytes(k.k)); swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, d.v->data); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1ddb9c57b3a5..e1a46f97f299 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -185,7 +185,7 @@ enum opt_type { x(inline_data, u8, \ OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FORMAT|OPT_MOUNT, \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3c473f1380a6..8abcbfb3bd64 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -9,6 +9,18 @@ #include +static inline unsigned bkey_type_to_indirect(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + return KEY_TYPE_reflink_v; + case KEY_TYPE_inline_data: + return KEY_TYPE_indirect_inline_data; + default: + return 0; + } +} + /* reflink pointers */ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -71,17 +83,42 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +/* indirect inline data */ + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) + return "incorrect value size"; + return NULL; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + pr_buf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i_extent *e) + struct bkey_i *orig) { struct bch_fs *c = trans->c; struct btree_iter *reflink_iter; struct bkey_s_c k; - struct bkey_i_reflink_v *r_v; + struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; + __le64 *refcount; int ret; + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { @@ -90,7 +127,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, continue; } - if (bkey_deleted(k.k) && e->k.size <= k.k->size) + if (bkey_deleted(k.k) && orig->k.size <= k.k->size) break; } @@ -100,29 +137,31 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, /* rewind iter to start of hole, if necessary: */ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); - r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; - bkey_reflink_v_init(&r_v->k_i); + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.p = reflink_iter->pos; - bch2_key_resize(&r_v->k, e->k.size); - r_v->k.version = e->k.version; + bch2_key_resize(&r_v->k, orig->k.size); + r_v->k.version = orig->k.version; + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + 
bkey_val_bytes(&orig->k)); - set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + - bkey_val_u64s(&e->k)); - r_v->v.refcount = 0; - memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + refcount = (void *) &r_v->v; + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); + bch2_trans_update(trans, reflink_iter, r_v, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) return PTR_ERR(r_p); - e->k.type = KEY_TYPE_reflink_p; - r_p = bkey_i_to_reflink_p(&e->k_i); + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); @@ -144,8 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) if (bkey_cmp(iter->pos, end) >= 0) return bkey_s_c_null; - if (k.k->type == KEY_TYPE_extent || - k.k->type == KEY_TYPE_reflink_p) + if (bkey_extent_is_data(k.k)) break; } @@ -218,7 +256,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (!bkey_cmp(dst_iter->pos, dst_end)) break; - if (src_k.k->type == KEY_TYPE_extent) { + if (src_k.k->type != KEY_TYPE_reflink_p) { bkey_on_stack_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); @@ -226,7 +264,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_cut_back(src_end, new_src.k); ret = bch2_make_extent_indirect(&trans, src_iter, - bkey_i_to_extent(new_src.k)); + new_src.k); if (ret) goto btree_err; diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 5445c1cf0797..9d5e7dc58f2b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -18,13 +18,22 @@ const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ } +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ +} + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, u64, u64 *, u64, s64 *); -- cgit From 33114c2d897405ec338df979d1bf1d3319f92938 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Oct 2020 21:20:16 -0400 Subject: bcachefs: Drop alloc keys from journal when -o reconstruct_alloc This fixes a bug where we'd pop an assertion due to replaying a key for an interior btree node when that node no longer exists. 
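The repair path simply compacts the in-memory journal key list before replay, dropping every key that targets the alloc btree. A minimal sketch of that in-place compaction idiom, using toy types rather than the kernel's journal_keys structures:

#include <stddef.h>

/* Toy stand-ins for the journal key list (not the kernel types). */
struct toy_key {
        int                     btree_id;
        unsigned long long      pos;
};

struct toy_keys {
        struct toy_key          *d;
        size_t                  nr;
};

/* Drop every key belonging to @id, preserving the order of the rest. */
static void toy_drop_keys(struct toy_keys *keys, int id)
{
        size_t src, dst = 0;

        for (src = 0; src < keys->nr; src++)
                if (keys->d[src].btree_id != id)
                        keys->d[dst++] = keys->d[src];

        keys->nr = dst;
}
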
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d70fa968db50..32fed6b81a52 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -25,6 +25,18 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* for -o reconstruct_alloc: */ +static void drop_alloc_keys(struct journal_keys *keys) +{ + size_t src, dst; + + for (src = 0, dst = 0; src < keys->nr; src++) + if (keys->d[src].btree_id != BTREE_ID_ALLOC) + keys->d[dst++] = keys->d[src]; + + keys->nr = dst; +} + /* iterate over keys read from the journal: */ static struct journal_key *journal_key_search(struct journal_keys *journal_keys, @@ -930,7 +942,6 @@ static int read_btree_roots(struct bch_fs *c) continue; } - if (r->error) { __fsck_err(c, i == BTREE_ID_ALLOC ? FSCK_CAN_IGNORE : 0, @@ -1027,6 +1038,11 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + drop_alloc_keys(&c->journal_keys); + } + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; -- cgit From 8be901d5d4a266e6838cdb6781084a02d2b37ace Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Oct 2020 01:08:28 -0400 Subject: bcachefs: Always write a journal entry when stopping journal This is to fix a (harmless) bug where the read clock hand in the superblock doesn't match the journal. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d1e4a8162ddd..1f7f3b96bd87 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -979,9 +979,11 @@ void bch2_fs_journal_stop(struct journal *j) wait_event(j->wait, journal_entry_close(j)); - /* do we need to write another journal entry? */ - if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) - bch2_journal_meta(j); + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); journal_quiesce(j); -- cgit From eb4609796de52a8e86eb69f5cf9eabd492242a1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Oct 2020 14:54:55 -0400 Subject: bcachefs: Add mode to bch2_inode_to_text Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 758eda526674..71670f415d66 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -271,6 +271,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } + pr_buf(out, "mode: %o ", unpacked.bi_mode); + #define x(_name, _bits) \ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() -- cgit From 645d72aa366e51259296cfc02e37c802d7b78493 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Oct 2020 14:45:20 -0400 Subject: bcachefs: Fix btree updates when mixing cached and non cached iterators There was a bug where bch2_trans_update() would incorrectly delete a pending update where the new update did not actually overwrite the existing update, because we were incorrectly using BTREE_ITER_TYPE when sorting pending btree updates. This affects the pending patch to use cached iterators for inode updates. 
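The underlying distinction is that two different comparators are needed: lock ordering legitimately cares about the iterator type, while deduplicating pending updates must compare by (btree, position) only, or an update made through a cached iterator never matches one made at the same position through a plain btree iterator. A hedged sketch of the two comparators with toy types (not the bcachefs iterator structures):

/* Toy iterator: real iterators also carry a type (btree vs. cached). */
struct toy_iter {
        int                     btree_id;
        int                     type;
        unsigned long long      pos;
};

/* Three-way compare, mirroring the kernel's cmp_int() helper. */
#define toy_cmp(l, r)   (((l) > (r)) - ((l) < (r)))

/* Lock-ordering comparison: iterator type participates, so iterators of
 * different types at the same position sort as distinct. */
static int toy_iter_lock_cmp(const struct toy_iter *l, const struct toy_iter *r)
{
        return  toy_cmp(l->btree_id, r->btree_id) ?:
               -toy_cmp(l->type, r->type) ?:
                toy_cmp(l->pos, r->pos);
}

/* Update-deduplication comparison: position only, so a new update at the
 * same position always matches and replaces the pending one. */
static int toy_iter_pos_cmp(const struct toy_iter *l, const struct toy_iter *r)
{
        return  toy_cmp(l->btree_id, r->btree_id) ?:
                toy_cmp(l->pos, r->pos);
}
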
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_iter.h | 5 +++-- fs/bcachefs/btree_update_leaf.c | 13 ++++++++++--- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 90d884b18b70..c503d76bab3b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -949,7 +949,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, * holding other locks that would cause us to deadlock: */ trans_for_each_iter(trans, linked) - if (btree_iter_cmp(iter, linked) < 0) + if (btree_iter_lock_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); if (sib == btree_prev_sib) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7501556c0988..b561d0353d77 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1104,7 +1104,7 @@ retry_all: sorted[nr_sorted++] = iter->idx; #define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bd9ec3ec9a92..f80e09255f68 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -177,8 +177,9 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +/* Sort order for locking btree iterators: */ +static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) { return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9c33a8be2c58..839dba099cac 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -690,6 +690,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static inline int btree_iter_pos_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos); +} + static void bch2_trans_update2(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -707,12 +714,12 @@ static void bch2_trans_update2(struct btree_trans *trans, iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans_for_each_update2(trans, i) { - if (btree_iter_cmp(n.iter, i->iter) == 0) { + if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { *i = n; return; } - if (btree_iter_cmp(n.iter, i->iter) <= 0) + if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) break; } @@ -996,7 +1003,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, * Pending updates are kept sorted: first, find position of new update: */ trans_for_each_update(trans, i) - if (btree_iter_cmp(iter, i->iter) <= 0) + if (btree_iter_pos_cmp(iter, i->iter) <= 0) break; /* -- cgit From e7b854b1f76d34eeea6baa3a1b5eaa1f85ae6340 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Oct 2020 17:03:28 -0400 Subject: bcachefs: fiemap fixes - fiemap didn't know about inline extents, fixed - advancing to the next extent after we'd chased a pointer to the reflink btree was wrong, fixed Signed-off-by: 
Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a61d5f8aecd6..5119266a8493 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -831,7 +831,7 @@ static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -862,6 +862,12 @@ static int bch2_fill_extent(struct bch_fs *c, } return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, @@ -928,11 +934,10 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - if (offset_into_extent) - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + - offset_into_extent), - cur.k); + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + cur.k); bch2_key_resize(&cur.k->k, sectors); cur.k->k.p = iter->pos; cur.k->k.p.offset += cur.k->k.size; @@ -947,10 +952,8 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - if (k.k->type == KEY_TYPE_reflink_v) - bch2_btree_iter_set_pos(iter, k.k->p); - else - bch2_btree_iter_next(iter); + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, iter->pos.offset + sectors)); } if (ret == -EINTR) -- cgit From 8cad3e2f73f5c6ad39e9da5564382a2a737a201c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 19:10:21 -0400 Subject: bcachefs: Use cached iterators for inode updates This switches inode updates to use cached btree iterators - which should be a nice performance boost, since lock contention on the inodes btree can be a bottleneck on multithreaded workloads. 
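One subtlety with a write-back key cache in front of the btree is coherency for code that still scans the btree directly: a slot that looks empty in the btree may have a pending create sitting only in the cache. A generic sketch of that check-both-places pattern, with toy structures rather than the bcachefs key cache API:

#include <stdbool.h>
#include <stddef.h>

struct toy_entry {
        unsigned long long      key;
        bool                    present;
};

/* Backing structure plus a small write-back cache of pending updates. */
struct toy_store {
        const struct toy_entry  *btree;
        size_t                  btree_nr;
        const struct toy_entry  *cache;
        size_t                  cache_nr;
};

static bool toy_cache_has(const struct toy_store *s, unsigned long long key)
{
        size_t i;

        for (i = 0; i < s->cache_nr; i++)
                if (s->cache[i].key == key && s->cache[i].present)
                        return true;
        return false;
}

/* A slot is only safe to reuse if it is empty in the btree *and* has no
 * pending update in the cache that would fill it on the next flush. */
static bool toy_slot_free(const struct toy_store *s, size_t idx)
{
        return !s->btree[idx].present &&
               !toy_cache_has(s, s->btree[idx].key);
}
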
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++-- fs/bcachefs/btree_key_cache.h | 3 ++ fs/bcachefs/inode.c | 104 ++++++++++++++++++++++++++---------------- 3 files changed, 72 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1be01035869f..52b657030755 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -28,8 +28,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { }; __flatten -static inline struct bkey_cached * -btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { struct bkey_cached_key key = { .btree_id = btree_id, @@ -218,7 +218,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) goto fill; } retry: - ck = btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); if (!ck) { if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { iter->l[0].b = NULL; @@ -415,7 +415,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, struct bkey_cached_key key = { id, pos }; /* Fastpath - assume it won't be found: */ - if (!btree_key_cache_find(c, id, pos)) + if (!bch2_btree_key_cache_find(c, id, pos)) return 0; return btree_key_cache_flush_pos(trans, key, 0, true); @@ -462,7 +462,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - BUG_ON(btree_key_cache_find(trans->c, id, pos)); + BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); } #endif diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index b1756c6c622c..d448264abcc8 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,6 +1,9 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + int bch2_btree_iter_traverse_cached(struct btree_iter *); bool bch2_btree_insert_key_cached(struct btree_trans *, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 71670f415d66..631c60bb2fac 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" #include "error.h" @@ -189,11 +190,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), - BTREE_ITER_SLOTS|flags); + BTREE_ITER_CACHED|flags); if (IS_ERR(iter)) return iter; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -390,7 +391,17 @@ again: if (bkey_cmp(iter->pos, POS(0, max)) > 0) break; - if (k.k->type != KEY_TYPE_inode) + /* + * There's a potential cache coherency issue with the btree key + * cache code here - we're iterating over the btree, skipping + * that cache. 
We should never see an empty slot that isn't + * actually empty due to a pending update in the key cache + * because the update that creates the inode isn't done with a + * cached iterator, but - better safe than sorry, check the + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(trans->c, BTREE_ID_INODES, iter->pos)) goto found_slot; } @@ -424,6 +435,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); + struct bkey_s_c k; + u64 bi_generation; int ret; /* @@ -444,51 +457,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + bi_generation = 0; + + ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); + if (ret) { + if (ret != -EINTR) + bch_err(c, "error flushing btree key cache: %i", ret); + goto err; + } iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - u32 bi_generation = 0; + k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - break; + ret = bkey_err(k); + if (ret) + goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, - "inode %llu not found when deleting", - inode_nr); + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, + "inode %llu not found when deleting", + inode_nr); - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bch_inode_unpacked inode_u; + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bch_inode_unpacked inode_u; - if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = inode_u.bi_generation + 1; - break; - } - case KEY_TYPE_inode_generation: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - break; - } - } + if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) + bi_generation = inode_u.bi_generation + 1; + break; + } + case KEY_TYPE_inode_generation: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + bi_generation = le32_to_cpu(g.v->bi_generation); + break; + } + } - if (!bi_generation) { - bkey_init(&delete.k); - delete.k.p.offset = inode_nr; - } else { - bkey_inode_generation_init(&delete.k_i); - delete.k.p.offset = inode_nr; - delete.v.bi_generation = cpu_to_le32(bi_generation); - } + if (!bi_generation) { + bkey_init(&delete.k); + delete.k.p.offset = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); + delete.k.p.offset = inode_nr; + delete.v.bi_generation = cpu_to_le32(bi_generation); + } - bch2_trans_update(&trans, iter, &delete.k_i, 0); + bch2_trans_update(&trans, iter, &delete.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -502,11 +526,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_SLOTS); + POS(0, inode_nr), BTREE_ITER_CACHED); if (IS_ERR(iter)) return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; -- cgit From 527087c741dc1199fbf4a635a80bf4839a9a8288 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Oct 2020 14:10:52 -0400 Subject: bcachefs: Fix stack corruption A bkey_on_stack_realloc() call was in the wrong place, and broken for indirect extents Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5119266a8493..2ed80ef41d1a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -921,9 +921,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_realloc(&cur, c, k.k->u64s); - bkey_on_stack_realloc(&prev, c, k.k->u64s); - bkey_reassemble(cur.k, k); + bkey_on_stack_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); @@ -931,6 +929,7 @@ retry: break; k = bkey_i_to_s_c(cur.k); + bkey_on_stack_realloc(&prev, c, k.k->u64s); sectors = min(sectors, k.k->size - offset_into_extent); -- cgit From a301dc38efa178e900a59ce7f03c1e81123c0919 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Oct 2020 14:17:46 -0400 Subject: bcachefs: Improve tracing for transaction restarts We have a bug where we can get stuck with a process spinning in transaction restarts - need more information. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 +++--- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_iter.c | 72 +++++++++++++++++++++++++++---------------- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.h | 8 +++-- fs/bcachefs/trace.h | 43 ++++++++++++++++++++++++-- 6 files changed, 98 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c503d76bab3b..02a2f558cf4d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -706,7 +706,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + unsigned long trace_ip) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -768,7 +769,7 @@ lock_node: btree_node_unlock(iter, level + 1); if (!btree_node_lock(b, k->k.p, level, iter, lock_type, - lock_node_check_fn, (void *) k)) { + lock_node_check_fn, (void *) k, trace_ip)) { if (b->hash_val != btree_ptr_hash_val(k)) goto retry; return ERR_PTR(-EINTR); @@ -936,7 +937,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { struct btree_iter *linked; @@ -956,7 +957,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); /* * before btree_iter_relock() calls btree_iter_verify_locks(): diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index d0d3a85bb8be..8a19e60e9258 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -23,7 +23,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type); + enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, 
const struct bkey_i *, enum btree_id, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b561d0353d77..a76e13000d11 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, - void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; - struct btree_iter *linked; + struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); - bool ret = true; + unsigned reason = 9; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -228,10 +228,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 1; + } } else { - ret = false; + deadlock_iter = linked; + reason = 2; } } @@ -247,23 +250,30 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 3; + } } else { - ret = false; + deadlock_iter = linked; + reason = 4; } } /* Must lock btree nodes in key order: */ if ((cmp_int(iter->btree_id, linked->btree_id) ?: - -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) - ret = false; + -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) { + deadlock_iter = linked; + reason = 5; + } if (iter->btree_id == linked->btree_id && btree_node_locked(linked, level) && bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) - ret = false; + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 6; + } /* * Recheck if this is a node we already have locked - since one @@ -277,8 +287,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - if (unlikely(!ret)) { - trace_trans_restart_would_deadlock(iter->trans->ip); + if (unlikely(deadlock_iter)) { + trace_trans_restart_would_deadlock(iter->trans->ip, ip, + reason, + deadlock_iter->btree_id, + btree_iter_type(deadlock_iter), + iter->btree_id, + btree_iter_type(iter)); return false; } @@ -945,7 +960,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) } static inline int btree_iter_lock_root(struct btree_iter *iter, - unsigned depth_want) + unsigned depth_want, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; @@ -974,7 +990,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type, - lock_root_check_fn, rootp))) + lock_root_check_fn, rootp, + trace_ip))) return -EINTR; if (likely(b == READ_ONCE(*rootp) && @@ -1046,7 +1063,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, btree_node_unlock(iter, plevel); } -static __always_inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter, + unsigned 
long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -1060,7 +1078,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -1084,7 +1102,7 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -static int btree_iter_traverse_one(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) { @@ -1109,6 +1127,7 @@ retry_all: bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx bch2_trans_unlock(trans); + cond_resched(); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -1139,7 +1158,7 @@ retry_all: if (!(trans->iters_linked & (1ULL << idx))) continue; - ret = btree_iter_traverse_one(&trans->iters[idx]); + ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); if (ret) goto retry_all; } @@ -1202,7 +1221,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) { unsigned depth_want = iter->level; @@ -1249,8 +1269,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) */ while (iter->level > depth_want) { int ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(iter) - : btree_iter_lock_root(iter, depth_want); + ? 
btree_iter_down(iter, trace_ip) + : btree_iter_lock_root(iter, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) return 0; @@ -1281,7 +1301,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) int ret; ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter); + btree_iter_traverse_one(iter, _RET_IP_); if (unlikely(ret)) ret = __btree_iter_traverse_all(trans, ret); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 52b657030755..9a93b6d26878 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -242,7 +242,7 @@ retry: enum six_lock_type lock_want = __btree_lock_want(iter, 0); if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter)) { + bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || bkey_cmp(ck->key.pos, iter->pos)) { goto retry; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ffee6f2d7d4b..cc07ef2938ae 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -175,13 +175,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, - six_lock_should_sleep_fn, void *); + six_lock_should_sleep_fn, void *, + unsigned long); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; bool ret; @@ -199,7 +201,7 @@ static inline bool btree_node_lock(struct btree *b, ret = likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, - should_sleep_fn, p); + should_sleep_fn, p, ip); #ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5c57b6efaaf3..c30fb4d74a95 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -536,9 +536,46 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + unsigned reason, + enum btree_id have_btree_id, + unsigned have_iter_type, + enum btree_id want_btree_id, + unsigned want_iter_type), + TP_ARGS(trans_ip, caller_ip, reason, + have_btree_id, have_iter_type, + want_btree_id, want_iter_type), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, reason ) + __field(u8, have_btree_id ) + __field(u8, have_iter_type ) + __field(u8, want_btree_id ) + __field(u8, want_iter_type ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->reason = reason; + __entry->have_btree_id = have_btree_id; + __entry->have_iter_type = have_iter_type; + __entry->want_btree_id = want_btree_id; + __entry->want_iter_type = want_iter_type; + ), + + TP_printk("%pF %pF because %u have %u:%u want %u:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->reason, + __entry->have_btree_id, + __entry->have_iter_type, + __entry->want_btree_id, + __entry->want_iter_type) ); 
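/*
 * Note (annotation, not part of the patch): the numeric "reason" codes passed
 * to trace_trans_restart_would_deadlock() by __bch2_btree_node_lock() in the
 * btree_iter.c hunk above can be read off the diff as follows:
 *
 *   1, 3 - a linked iterator held conflicting locks and upgrading it via
 *          btree_iter_get_locks() failed
 *   2, 4 - a linked iterator held conflicting locks and could not be touched
 *          because trans->nounlock was set
 *   5    - taking this lock would violate the btree-id/iterator-type lock
 *          ordering
 *   6    - taking this lock would violate key-order locking within the same
 *          btree
 *   9    - initial/default value: no specific cause was identified
 */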
TRACE_EVENT(trans_restart_iters_realloced, -- cgit From dcf141b9e13d261629806aa37e0fa7769d38b789 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Oct 2020 14:18:18 -0400 Subject: bcachefs: Fix spurious transaction restarts The check for whether locking a btree node would deadlock was wrong - we have to check that interior nodes are locked before descendants, but this check didn't account for cached vs. non-cached iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_types.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a76e13000d11..d310b2389e38 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -244,6 +244,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * we're about to lock, it must have the ancestors locked too: */ if (linked->btree_id == iter->btree_id && + btree_iter_is_cached(linked) == btree_iter_is_cached(iter) && level > __fls(linked->nodes_locked)) { if (!(trans->nounlock)) { linked->locks_want = diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f02518f9d9ec..d4f0db1fe457 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -283,6 +283,11 @@ btree_iter_type(const struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline bool btree_iter_is_cached(const struct btree_iter *iter) +{ + return btree_iter_type(iter) == BTREE_ITER_CACHED; +} + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) { return iter->l + iter->level; -- cgit From b16fa0bae5766748bd682b0829136ca02d6ea3ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Oct 2020 17:29:38 -0400 Subject: bcachefs: Improve check for when bios are physically contiguous Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 595d76aa3956..27bbc265d550 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); - if (!IS_ENABLED(CONFIG_HIGHMEM) && + if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) return (struct bbuf) { .b = page_address(bio_iter_page(bio, start)) + -- cgit From 45e4dcba79401dd17e0c32ff26f83e240c27ca5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Oct 2020 18:56:21 -0400 Subject: bcachefs: Inode create optimization On workloads that do a lot of multithreaded creates all at once, lock contention on the inodes btree turns out to still be an issue. This patch adds a small buffer of inode numbers that are known to be free, so that we can avoid touching the btree on every create. Also, this changes inode creates to update via the btree key cache for the initial create.
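The shape of the change is a small cache of free inode numbers sitting in front of the inodes btree. A minimal sketch of that idea follows; field names mirror the diff below (unused_inodes[64], unused_inodes_gens[64], inode_create_lock), while scan_free_slots() is a hypothetical stand-in for the scan_free_inums() helper the patch adds - treat this as an illustration, not the exact kernel code.

#include <linux/mutex.h>

struct free_inum_buf {
	struct mutex	lock;
	unsigned	nr;
	u64		inums[64];	/* inode numbers known to be free */
	u32		gens[64];	/* their generation numbers */
};

static int alloc_inum(struct free_inum_buf *buf, u64 *inum, u32 *gen)
{
	int ret = 0;

	mutex_lock(&buf->lock);
	if (!buf->nr)
		ret = scan_free_slots(buf);	/* scan the inodes btree: expensive, but now rare */
	if (!ret && !buf->nr)
		ret = -ENOSPC;
	if (!ret) {
		buf->nr--;
		*inum	= buf->inums[buf->nr];
		*gen	= buf->gens[buf->nr];
	}
	mutex_unlock(&buf->lock);
	return ret;
}

The create path still rechecks the chosen slot with an intent lock held before inserting, so the buffer only has to be a hint.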
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++ fs/bcachefs/fs-common.c | 4 +- fs/bcachefs/inode.c | 137 ++++++++++++++++++++++++++++++++---------------- fs/bcachefs/inode.h | 4 +- fs/bcachefs/super.c | 2 + 5 files changed, 101 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f60d530313dc..b6f93da37ba0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -802,6 +802,10 @@ struct bch_fs { struct mutex verify_lock; #endif + struct mutex inode_create_lock; + unsigned unused_inodes_nr; + u64 unused_inodes[64]; + u32 unused_inodes_gens[64]; u64 unused_inode_hint; /* diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 878419d40992..503ce1920f39 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + ret = bch2_inode_create(trans, new_inode); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 631c60bb2fac..c55c164be882 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -361,71 +361,120 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +static int scan_free_inums(struct btree_trans *trans) { - struct bkey_inode_buf *inode_p; + struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 start; - int ret; - - if (!max) - max = ULLONG_MAX; - - if (trans->c->opts.inodes_32bit) - max = min_t(u64, max, U32_MAX); + u64 min = BLOCKDEV_INODE_MAX; + u64 max = c->opts.inodes_32bit + ? 
S32_MAX : S64_MAX; + u64 start = max(min, READ_ONCE(c->unused_inode_hint)); + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start), + BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); +again: + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; - start = READ_ONCE(*hint); + /* + * This doesn't check the btree key cache, but we don't care: + * we have to recheck with an intent lock held on the slot we're + * inserting to anyways: + */ + if (k.k->type != KEY_TYPE_inode) { + if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) { + c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset; + c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k); + c->unused_inodes_nr++; + } + + if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes)) + goto out; + } + } - if (start >= max || start < min) + if (!ret && start != min) { + max = start; start = min; + bch2_btree_iter_set_pos(iter, POS(0, start)); + goto again; + } +out: + c->unused_inode_hint = iter->pos.offset; + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u) +{ + struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; + struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 inum; + u32 generation; + int ret = 0; inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) return PTR_ERR(inode_p); -again: - for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; - /* - * There's a potential cache coherency issue with the btree key - * cache code here - we're iterating over the btree, skipping - * that cache. 
We should never see an empty slot that isn't - * actually empty due to a pending update in the key cache - * because the update that creates the inode isn't done with a - * cached iterator, but - better safe than sorry, check the - * cache before using a slot: - */ - if (k.k->type != KEY_TYPE_inode && - !bch2_btree_key_cache_find(trans->c, BTREE_ID_INODES, iter->pos)) - goto found_slot; + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); +retry: + if (!mutex_trylock(&c->inode_create_lock)) { + bch2_trans_unlock(trans); + mutex_lock(&c->inode_create_lock); + if (!bch2_trans_relock(trans)) { + mutex_unlock(&c->inode_create_lock); + ret = -EINTR; + goto err; + } } - bch2_trans_iter_put(trans, iter); + if (!c->unused_inodes_nr) + ret = scan_free_inums(trans); + if (!ret && !c->unused_inodes_nr) + ret = -ENOSPC; + if (!ret) { + --c->unused_inodes_nr; + inum = c->unused_inodes[c->unused_inodes_nr]; + generation = c->unused_inodes_gens[c->unused_inodes_nr]; + } + + mutex_unlock(&c->inode_create_lock); if (ret) - return ret; + goto err; - if (start != min) { - /* Retry from start */ - start = min; - goto again; - } + bch2_btree_iter_set_pos(iter, POS(0, inum)); + + /* Recheck that the slot is free with an intent lock held: */ + k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_inode) + goto retry; - return -ENOSPC; -found_slot: - *hint = k.k->p.offset; - inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_inum = inum; + inode_u->bi_generation = generation; bch2_inode_pack(inode_p, inode_u); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +err: bch2_trans_iter_put(trans, iter); - return 0; + return ret; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bb759a46dc41..5743be2307f3 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -60,9 +60,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); +int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 15e760d8dd4d..b8736a822630 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -703,6 +703,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); + mutex_init(&c->inode_create_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; -- cgit From 2f33ece9b47741ba53b467b7599145ed7595a2d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 17:51:38 -0500 Subject: bcachefs: Minor journal reclaim improvement With the btree key cache code, journal reclaim now has a lot more work to do. It could be the case that after journal reclaim has finished one iteration there's already more work to do, so put it in a loop to check for that. 
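In other words, reclaim goes from a single pass to a loop that repeats as long as the previous pass flushed something - roughly (sketch, using the names from the diff below):

	do {
		bch2_journal_do_discards(j);
		seq_to_flush = journal_seq_to_flush(j);
		min_nr = 0;
		/* recompute whether at least one pin must be flushed this pass */
	} while (journal_flush_pins(j, seq_to_flush, min_nr));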
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 82 ++++++++++++++++++++++++------------------- fs/bcachefs/super.c | 8 ++--- 2 files changed, 50 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 49ff26cb246c..3f57f498ce0b 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, return ret; } -/** - * bch2_journal_reclaim - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. - */ -void bch2_journal_reclaim(struct journal *j) +static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, min_nr = 0; u64 seq_to_flush = 0; - - lockdep_assert_held(&j->reclaim_lock); - - bch2_journal_do_discards(j); + unsigned iter; spin_lock(&j->lock); @@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j) (j->pin.size >> 1)); spin_unlock(&j->lock); - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) - min_nr = 1; + return seq_to_flush; +} - if (j->prereserved.reserved * 2 > j->prereserved.remaining) { - seq_to_flush = max(seq_to_flush, journal_last_seq(j)); - min_nr = 1; - } +/** + * bch2_journal_reclaim - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
+ */ +void bch2_journal_reclaim(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned min_nr = 0; + u64 seq_to_flush = 0; + + lockdep_assert_held(&j->reclaim_lock); + + do { + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - journal_flush_pins(j, seq_to_flush, min_nr); + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + } while (journal_flush_pins(j, seq_to_flush, min_nr)); if (!bch2_journal_error(j)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b8736a822630..fd78ab205865 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -747,10 +747,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) sizeof(struct sort_iter_set); if (!(c->wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - !(c->copygc_wq = alloc_workqueue("bcache_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -- cgit From 29364f34530d30ca0f34dfe5d1ea73c8f1e77ff3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 18:20:44 -0500 Subject: bcachefs: Drop sysfs interface to debug parameters It's not used much anymore; the module parameter interface is better.
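For reference, the module-parameter route mentioned here is the standard kernel one. A sketch of how a debug knob like this can be exposed is below; the exact wiring bcachefs uses (via its BCH_DEBUG_PARAM() macros) may differ, so take this as an illustration of the mechanism rather than the actual code:

#include <linux/module.h>
#include <linux/moduleparam.h>

bool bch2_expensive_debug_checks;

/* visible as /sys/module/bcachefs/parameters/expensive_debug_checks and
 * settable on the module command line at load time */
module_param_named(expensive_debug_checks, bch2_expensive_debug_checks, bool, 0644);
MODULE_PARM_DESC(expensive_debug_checks, "Enable expensive consistency checks");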
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/bcachefs.h | 16 +++++++++++----- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/bset.c | 18 ++++++++---------- fs/bcachefs/bset.h | 19 ++++--------------- fs/bcachefs/btree_cache.c | 8 ++++---- fs/bcachefs/btree_gc.c | 12 ++++++------ fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 12 ++++++------ fs/bcachefs/btree_types.h | 4 ---- fs/bcachefs/btree_update_leaf.c | 6 +++--- fs/bcachefs/debug.c | 2 +- fs/bcachefs/debug.h | 33 ++------------------------------- fs/bcachefs/extents.c | 4 ++-- fs/bcachefs/sysfs.c | 19 ------------------- 15 files changed, 50 insertions(+), 109 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 66ce54724e93..8e3abb89dfb7 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -74,7 +74,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (bch2_expensive_debug_checks) { size_t iter; long i; unsigned j; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b6f93da37ba0..85b8b7c4c9e1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -295,6 +295,16 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -726,7 +736,7 @@ struct bch_fs { struct bio_set bio_read_split; struct bio_set bio_write; struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; +mempool_t bio_bounce_pages; struct rhashtable promote_table; mempool_t compression_bounce[2]; @@ -831,10 +841,6 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; -#define BCH_DEBUG_PARAM(name, description) bool name; - BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - struct bch2_time_stats times[BCH_TIME_STAT_NR]; }; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 32849229801d..99b7fce2bfd3 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -236,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; enum merge_result ret; - if (key_merging_disabled(c) || + if (bch2_key_merging_disabled || !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 2894666bb77e..f483bcbe801e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -376,15 +376,13 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +void bch2_btree_keys_init(struct btree *b) { unsigned i; b->nsets = 0; memset(&b->nr, 0, sizeof(b->nr)); -#ifdef CONFIG_BCACHEFS_DEBUG - b->expensive_debug_checks = expensive_debug_checks; -#endif + for (i = 0; i < MAX_BSETS; i++) b->set[i].data_offset = U16_MAX; @@ -510,7 +508,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!btree_keys_expensive_checks(b)) + if 
(!bch2_expensive_debug_checks) return; BUG_ON(bset_has_ro_aux_tree(t)); @@ -910,7 +908,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { BUG_ON(ret >= orig_k); for (i = ret @@ -1333,7 +1331,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1589,7 +1587,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { bch2_btree_node_iter_verify(iter, b); bch2_btree_node_iter_next_check(iter, b); } @@ -1608,7 +1606,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { @@ -1644,7 +1642,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 88f242191408..c9fe83ded267 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -5,7 +5,7 @@ #include #include -#include "bcachefs_format.h" +#include "bcachefs.h" #include "bkey.h" #include "bkey_methods.h" #include "btree_types.h" @@ -147,17 +147,6 @@ * first key in that range of bytes again. 
*/ -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(const struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - enum bset_aux_tree_type { BSET_NO_AUX_TREE, BSET_RO_AUX_TREE, @@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); @@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_init(struct btree *, bool *); +void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, @@ -669,7 +658,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 02a2f558cf4d..229841c2ef0c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -212,7 +212,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else __bch2_btree_node_write(c, b, SIX_LOCK_read); @@ -255,7 +255,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long freed = 0; unsigned i, flags; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -342,7 +342,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, btree_cache.shrink); struct btree_cache *bc = &c->btree_cache; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return 0; return btree_cache_can_free(bc) * btree_pages(c); @@ -591,7 +591,7 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - bch2_btree_keys_init(b, &c->expensive_debug_checks); + bch2_btree_keys_init(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 74012bea7126..da0ad8f50775 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -101,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (initial) { - BUG_ON(journal_seq_verify(c) && + BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); /* XXX change to fsck check */ @@ -209,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct btree *b; unsigned depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; u8 max_stale = 0; @@ -236,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| @@ -328,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, { struct btree *b; unsigned target_depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -835,7 +835,7 @@ again: out: if (!ret && (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && test_restart_gc(c)))) { + (!iter && bch2_test_restart_gc))) { /* * XXX: make sure gens we fixed got saved */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index eebab3d08c0d..d4f61ee5ed72 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1044,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || - (inject_invalid_keys(c) && + (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d310b2389e38..0b69cdccccdb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -487,7 +487,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, char buf1[100], buf2[100]; const char *msg; - if (!debug_check_iterators(iter->trans->c)) + if (!bch2_debug_check_iterators) return; if (btree_iter_type(iter) == BTREE_ITER_CACHED) { @@ -583,7 +583,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; - if (!debug_check_iterators(trans->c)) + if (!bch2_debug_check_iterators) return; trans_for_each_iter_with_node(trans, b, iter) @@ -755,7 +755,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (debug_check_iterators(iter->trans->c)) + if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } @@ -785,7 +785,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; @@ -1566,13 +1566,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ret.v = bkeyp_val(&l->b->format, _k); - if (debug_check_iterators(iter->trans->c)) { + if (bch2_debug_check_iterators) { struct bkey k = bkey_unpack_key(l->b, _k); BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d4f0db1fe457..3b1dcbf5e625 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -130,10 +130,6 @@ struct btree { struct btree_write writes[2]; -#ifdef CONFIG_BCACHEFS_DEBUG - bool *expensive_debug_checks; -#endif - /* Key/pointer for this btree node */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 839dba099cac..51ff6a16d249 
100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct bch_fs *c = trans->c; BUG_ON(bkey_cmp(insert->k.p, iter->pos)); - BUG_ON(debug_check_bkeys(c) && + BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(iter->level, iter->btree_id))); } @@ -440,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (journal_seq_verify(c)) + if (bch2_journal_seq_verify) trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; - else if (inject_invalid_keys(c)) + else if (bch2_inject_invalid_keys) trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index be97cbba12e7..0d5ec39e44e0 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->written = 0; v->c.level = b->c.level; v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v, &c->expensive_debug_checks); + bch2_btree_keys_init(v); if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 56c2d1ab5f63..7ac1615e9447 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -8,44 +8,15 @@ struct bio; struct btree; struct bch_fs; -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_ALWAYS() -#undef BCH_DEBUG_PARAM - #ifdef CONFIG_BCACHEFS_DEBUG - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - void __bch2_btree_verify(struct bch_fs *, struct btree *); - -#define bypass_torture_test(d) ((d)->bypass_torture_test) - -#else /* DEBUG */ - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) { return false; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - +#else static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} - -#define bypass_torture_test(d) 0 - #endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) __bch2_btree_verify(c, b); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 15e7c49e1a9b..f9838c1f36db 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -88,7 +88,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -136,7 +136,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && + if (bch2_force_reconstruct_read && !p.idx && p.has_ec) p.idx++; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index deaafeecba64..598ad6bdd61b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -208,12 +208,6 @@ read_attribute(io_timers_write); write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ -#define BCH_DEBUG_PARAM(name, description) \ - rw_attribute(name); - - BCH_DEBUG_PARAMS() -#undef 
BCH_DEBUG_PARAM - #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ { .name = #_name, .mode = S_IRUGO }; @@ -414,10 +408,6 @@ SHOW(bch2_fs) return out.pos - buf; } -#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - return 0; } @@ -462,10 +452,6 @@ STORE(bch2_fs) /* Debugging: */ -#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; @@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_internal_uuid, - -#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - NULL }; -- cgit From 692d4031a458092bc602840739f97c4acf155dcb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 18:36:08 -0500 Subject: bcachefs: Split out debug_check_btree_accounting This check is very expensive. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bset.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 85b8b7c4c9e1..d56057f27e8f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -265,6 +265,8 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index c9fe83ded267..21e4ed4eacc3 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -658,7 +658,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (bch2_expensive_debug_checks) + if (bch2_debug_check_btree_accounting) __bch2_verify_btree_nr_keys(b); } -- cgit From ae1ede5893bd9b46f40cc9d1148321206369a9f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 18:54:33 -0500 Subject: bcachefs: Don't embed btree iters in btree_trans These haven't been in use since reallocing iterators was disabled, and getting rid of them saves us a lot of stack.
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 13 +++---------- fs/bcachefs/btree_types.h | 4 ---- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0b69cdccccdb..a4141a5b569e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2039,8 +2039,7 @@ success: sizeof(struct btree_iter) * trans->nr_iters + sizeof(struct btree_insert_entry) * trans->nr_iters); - if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + kfree(trans->iters); trans->iters = new_iters; trans->updates = new_updates; @@ -2331,21 +2330,15 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) { - memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); - /* * reallocating iterators currently completely breaks * bch2_trans_iter_put(): */ expected_nr_iters = BTREE_ITER_MAX; + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->ip = _RET_IP_; - trans->size = ARRAY_SIZE(trans->iters_onstack); - trans->iters = trans->iters_onstack; - trans->updates = trans->updates_onstack; - trans->updates2 = trans->updates2_onstack; - trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) bch2_trans_realloc_iters(trans, expected_nr_iters); @@ -2377,7 +2370,7 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->mem); if (trans->used_mempool) mempool_free(trans->iters, &trans->c->btree_iters_pool); - else if (trans->iters != trans->iters_onstack) + else kfree(trans->iters); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3b1dcbf5e625..55ea028d242e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -381,10 +381,6 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; - - struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[2]; - struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ -- cgit From d108efc2541590a0a086f27b1b703e59a84fafb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 19:15:18 -0500 Subject: bcachefs: add const annotations to bset.c perhaps a bit silly, but some debug assertions we want to add need const propagated a bit more. 
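The "const propagation" issue is the usual C one: a checker that takes a const pointer can only call helpers that also take const, so the helpers have to be constified first. A minimal, generic illustration (not bcachefs code):

struct thing { unsigned n; };

static unsigned thing_capacity(const struct thing *t)
{
	return t->n * 2;
}

/* a debug assertion naturally takes a const pointer... */
static void thing_verify(const struct thing *t)
{
	/*
	 * ...but if thing_capacity() took a plain 'struct thing *', passing
	 * 't' here would discard the const qualifier and trigger a compiler
	 * warning (an error with -Werror), so const has to be propagated into
	 * the helpers before the new assertions can be added.
	 */
	BUG_ON(!thing_capacity(t));
}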
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 14 +++++++------- fs/bcachefs/bset.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index f483bcbe801e..fc8a7cc2f4bd 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -357,10 +357,10 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void bset_aux_tree_verify(const struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) @@ -696,20 +696,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -1213,8 +1213,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, - struct bset_tree *t, - struct bpos *search, + const struct bset_tree *t, + const struct bpos *search, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 21e4ed4eacc3..5389e4f4f350 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -190,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree #define BSET_CACHELINE 128 -static inline size_t btree_keys_cachelines(struct btree *b) +static inline size_t btree_keys_cachelines(const struct btree *b) { return (1U << b->byte_order) / BSET_CACHELINE; } -static inline size_t btree_aux_data_bytes(struct btree *b) +static inline size_t btree_aux_data_bytes(const struct btree *b) { return btree_keys_cachelines(b) * 8; } -static inline size_t btree_aux_data_u64s(struct btree *b) +static inline size_t btree_aux_data_u64s(const struct btree *b) { return btree_aux_data_bytes(b) / sizeof(u64); } -- cgit From df082b3a50e02bb8dfc583cea29ab94ab1a04692 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 19:49:23 -0500 Subject: bcachefs: Report inode counts via statfs Took awhile to figure out exactly what statfs wanted... 
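The estimate in the diff below works in 512-byte sectors: usage.capacity - usage.used is the free space in sectors, and shifting it left by 3 treats each sector as room for eight 64-byte inodes. A sketch of the arithmetic, mirroring the patch:

	u64 free_sectors = usage.capacity - usage.used;	/* 512-byte sectors */
	u64 avail_inodes = free_sectors << 3;		/* 512 / 64 = 8 inodes per sector */

	buf->f_files = usage.nr_inodes + avail_inodes;	/* existing + estimated creatable */
	buf->f_ffree = avail_inodes;			/* estimated creatable */

Since bcachefs inodes are dynamically allocated, f_files/f_ffree can only ever be an estimate; 64 bytes per inode is the assumed average the comment in the patch refers to.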
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2ed80ef41d1a..917a08ddc148 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1261,6 +1261,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; @@ -1268,8 +1273,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = 0; - buf->f_ffree = 0; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -- cgit From b5e8a6992fb1195cb58cb79461ef50f474c27608 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Nov 2020 23:51:33 -0500 Subject: bcachefs: Improved inode create optimization This shards new inodes into different btree nodes by using the processor ID for the high bits of the new inode number. Much faster than the previous inode create optimization - this also helps with sharding in the other btrees that index by inode number. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 7 +-- fs/bcachefs/inode.c | 139 +++++++++++++++++-------------------------------- fs/bcachefs/super.c | 7 ++- 3 files changed, 54 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d56057f27e8f..c14117227dd7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -814,11 +814,8 @@ mempool_t bio_bounce_pages; struct mutex verify_lock; #endif - struct mutex inode_create_lock; - unsigned unused_inodes_nr; - u64 unused_inodes[64]; - u32 unused_inodes_gens[64]; - u64 unused_inode_hint; + u64 *unused_inode_hints; + unsigned inode_shard_bits; /* * A btree node on disk could have too many bsets for an iterator to fit diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c55c164be882..b49c382f5452 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -361,55 +361,6 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -static int scan_free_inums(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; - struct bkey_s_c k; - u64 min = BLOCKDEV_INODE_MAX; - u64 max = c->opts.inodes_32bit - ? 
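Concretely, each CPU gets its own slice of the inode number space, so concurrent creates land in different btree nodes. A worked example of the carving used in the diff below, with illustrative numbers (8 possible CPUs, 64-bit inode numbers, so inode_shard_bits = 3):

	unsigned bits = 63 - c->inode_shard_bits;	/* 60 usable bits per shard */
	u64 min = (u64) cpu << bits;			/* cpu 2 -> range starts at 2 << 60 */
	u64 max = min | ~(ULLONG_MAX << bits);		/* ...and ends at (2 << 60) + 2^60 - 1 */

Each CPU then keeps its own allocation hint (c->unused_inode_hints[cpu]) within its range, instead of all CPUs contending on a single hint.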
S32_MAX : S64_MAX; - u64 start = max(min, READ_ONCE(c->unused_inode_hint)); - int ret = 0; - - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start), - BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); -again: - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; - - /* - * This doesn't check the btree key cache, but we don't care: - * we have to recheck with an intent lock held on the slot we're - * inserting to anyways: - */ - if (k.k->type != KEY_TYPE_inode) { - if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) { - c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset; - c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k); - c->unused_inodes_nr++; - } - - if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes)) - goto out; - } - } - - if (!ret && start != min) { - max = start; - start = min; - bch2_btree_iter_set_pos(iter, POS(0, start)); - goto again; - } -out: - c->unused_inode_hint = iter->pos.offset; - bch2_trans_iter_put(trans, iter); - return ret; -} - int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u) { @@ -417,64 +368,68 @@ int bch2_inode_create(struct btree_trans *trans, struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 inum; - u32 generation; - int ret = 0; + u64 min, max, start, *hint; + int ret; + + unsigned cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 31 : 63) - c->inode_shard_bits; + + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + + start = READ_ONCE(*hint); + + if (start >= max || start < min) + start = min; inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) return PTR_ERR(inode_p); +again: + for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); -retry: - if (!mutex_trylock(&c->inode_create_lock)) { - bch2_trans_unlock(trans); - mutex_lock(&c->inode_create_lock); - if (!bch2_trans_relock(trans)) { - mutex_unlock(&c->inode_create_lock); - ret = -EINTR; - goto err; - } - } - - if (!c->unused_inodes_nr) - ret = scan_free_inums(trans); - if (!ret && !c->unused_inodes_nr) - ret = -ENOSPC; - if (!ret) { - --c->unused_inodes_nr; - inum = c->unused_inodes[c->unused_inodes_nr]; - generation = c->unused_inodes_gens[c->unused_inodes_nr]; + /* + * There's a potential cache coherency issue with the btree key + * cache code here - we're iterating over the btree, skipping + * that cache. 
We should never see an empty slot that isn't + * actually empty due to a pending update in the key cache + * because the update that creates the inode isn't done with a + * cached iterator, but - better safe than sorry, check the + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) + goto found_slot; } - mutex_unlock(&c->inode_create_lock); - - if (ret) - goto err; - - bch2_btree_iter_set_pos(iter, POS(0, inum)); + bch2_trans_iter_put(trans, iter); - /* Recheck that the slot is free with an intent lock held: */ - k = bch2_btree_iter_peek_cached(iter); - ret = bkey_err(k); if (ret) - goto err; + return ret; - if (k.k->type == KEY_TYPE_inode) - goto retry; + if (start != min) { + /* Retry from start */ + start = min; + goto again; + } - inode_u->bi_inum = inum; - inode_u->bi_generation = generation; + return -ENOSPC; +found_slot: + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -err: bch2_trans_iter_put(trans, iter); - return ret; + return 0; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fd78ab205865..a2ade0df62b5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -493,6 +493,7 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) @@ -703,8 +704,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); - mutex_init(&c->inode_create_lock); - c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -746,6 +745,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", @@ -764,6 +765,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || -- cgit From b735d73a00d5d9f5652a299146d518b7eea47b7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Nov 2020 12:16:05 -0500 Subject: bcachefs: Build fixes for 32bit x86 PAGE_SIZE and size_t are not unsigned longs on 32 bit, annoying... also switch to atomic64_cmpxchg instead of cmpxchg() for journal_seq_copy, as atomic64_cmpxchg has a fallback that uses spinlocks for when it's not supported. 
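The min()/min_t() part of this is the kernel's strict type checking: min() refuses to compare values of different types, and on 32-bit x86 PAGE_SIZE is an unsigned long while size_t is unsigned int. A sketch of the pattern used in the io.c and ec.c hunks below:

	size_t size = len_from_caller;

	/* fails to build on 32-bit: PAGE_SIZE (unsigned long) vs. size (size_t) */
	/* unsigned len = min(PAGE_SIZE, size); */

	/* min_t() casts both operands to the named type before comparing: */
	unsigned len = min_t(size_t, PAGE_SIZE, size);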
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 2 +- fs/bcachefs/fs.c | 7 ++++++- fs/bcachefs/io.c | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e5033b392432..42331f0e54e7 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1586,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min(h->used, 20UL); i++) { + for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes[0], h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 917a08ddc148..3e3ab4e53f33 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -44,6 +44,11 @@ static void journal_seq_copy(struct bch_fs *c, struct bch_inode_info *dst, u64 journal_seq) { + /* + * atomic64_cmpxchg has a fallback for archs that don't support it, + * cmpxchg does not: + */ + atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; u64 old, v = READ_ONCE(dst->ei_journal_seq); do { @@ -51,7 +56,7 @@ static void journal_seq_copy(struct bch_fs *c, if (old >= journal_seq) break; - } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); + } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 346d77d68ade..6df99ac013a1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -180,7 +180,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; -- cgit From 96fee47e44939c087d1a0f9ed69555374e751843 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Nov 2020 15:58:37 +0000 Subject: bcachefs: Remove page_state_init_for_read This is dead code; delete the function. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index edc3d73d26ba..1fae450df8a6 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -624,12 +624,6 @@ static void bch2_readpages_end_io(struct bio *bio) bio_put(bio); } -static inline void page_state_init_for_read(struct page *page) -{ - SetPagePrivate(page); - page->private = 0; -} - struct readpages_iter { struct address_space *mapping; struct page **pages; -- cgit From 00276f9f34c29c59a848e22eb491d76c268f0dad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Nov 2020 15:58:38 +0000 Subject: bcachefs: Use attach_page_private and detach_page_private These recently added helpers simplify the code. 
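For context, the two helpers bundle up the page-private idiom this code used to open-code: stash a pointer in page->private, set PagePrivate, and hold a page reference while the private data is attached. Roughly (a sketch of their semantics; see the mm headers for the real definitions):

static inline void attach_page_private(struct page *page, void *data)
{
	get_page(page);			/* private data pins the page */
	set_page_private(page, (unsigned long) data);
	SetPagePrivate(page);
}

static inline void *detach_page_private(struct page *page)
{
	void *data;

	if (!PagePrivate(page))
		return NULL;
	data = (void *) page_private(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);			/* drop the reference taken on attach */
	return data;
}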
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1fae450df8a6..658d19c04b99 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -273,28 +273,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -308,13 +293,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } -- cgit From 1a21bf9866700f29ad552cca8bbddfd248bb751c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Nov 2020 20:02:01 -0500 Subject: bcachefs: Add a single slot percpu buf for btree iters Allocating our array of btree iters is a big enough allocation that it hits the buddy allocator, and we're seeing lots of lock contention. Sticking a single element buffer in front of it should help. 
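The single-element buffer is a per-CPU slot exchanged with this_cpu_xchg(), which is a single preemption-safe operation, so no extra locking is needed. The get/put halves of the pattern, pulled out of bch2_trans_init()/bch2_trans_exit() below into illustrative helpers:

/* take: empty this CPU's slot, fall back to the mempool only if it was empty */
static void *btree_iters_buf_get(struct bch_fs *c)
{
        return this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
                mempool_alloc(&c->btree_iters_pool, GFP_NOFS);
}

/* put: park the buffer back in the slot; free whatever was parked there before */
static void btree_iters_buf_put(struct bch_fs *c, struct btree_iter *iters)
{
        iters = this_cpu_xchg(c->btree_iters_bufs->iter, iters);
        if (iters)
                mempool_free(iters, &c->btree_iters_pool);
}

A buffer freed on one CPU is then reused by the next transaction on that CPU without touching the mempool at all, which is where the lock contention was.
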
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 ++++ fs/bcachefs/btree_iter.c | 73 ++++++++++++++++++++++++++++++------------------ fs/bcachefs/super.c | 8 ++++++ 3 files changed, 59 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c14117227dd7..8ac96384fddf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -541,6 +541,10 @@ struct journal_keys { u64 journal_seq_base; }; +struct btree_iter_buf { + struct btree_iter *iter; +}; + struct bch_fs { struct closure cl; @@ -636,6 +640,7 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; struct btree_key_cache btree_key_cache; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a4141a5b569e..f62658f1b1dd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1991,6 +1991,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, return bch2_trans_iter_put(trans, iter); } +#if 0 static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { @@ -2053,6 +2054,7 @@ success: return 0; } +#endif static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { @@ -2062,28 +2064,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret; - - if (trans->nr_iters >= BTREE_ITER_MAX) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - } + struct btree_iter *iter; - panic("trans iter oveflow\n"); + BUG_ON(trans->size < BTREE_ITER_MAX); + + trans_for_each_iter(trans, iter) { + pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); } + panic("trans iter oveflow\n"); +#if 0 ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ERR_PTR(ret); +#endif } idx = trans->nr_iters++; @@ -2326,22 +2327,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) bch2_btree_iter_traverse_all(trans); } +static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +{ + unsigned new_size = BTREE_ITER_MAX; + size_t iters_bytes = sizeof(struct btree_iter) * new_size; + size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; + void *p; + + BUG_ON(trans->used_mempool); + + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: + mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; + trans->updates2 = p; p += updates_bytes; + trans->size = new_size; +} + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) { - /* - * reallocating iterators currently completely breaks - * bch2_trans_iter_put(): - */ - expected_nr_iters = BTREE_ITER_MAX; - memset(trans, 0, sizeof(*trans)); trans->c = c; trans->ip = _RET_IP_; - if (expected_nr_iters > trans->size) - bch2_trans_realloc_iters(trans, expected_nr_iters); + /* + * reallocating iterators currently completely breaks + * bch2_trans_iter_put(), we always allocate the max: + */ + bch2_trans_alloc_iters(trans, c); if (expected_mem_bytes) bch2_trans_preload_mem(trans, expected_mem_bytes); @@ -2356,6 +2372,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, int bch2_trans_exit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; + bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG @@ -2368,10 +2386,11 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->fs_usage_deltas); kfree(trans->mem); - if (trans->used_mempool) + + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); - else - kfree(trans->iters); + trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a2ade0df62b5..e55fcbcbd37f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -458,6 +458,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; + int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -483,6 +484,12 @@ static void __bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[1]); free_percpu(c->usage[0]); kfree(c->usage_base); + + if (c->btree_iters_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + + free_percpu(c->btree_iters_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -762,6 +769,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || + !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -- cgit From 7e7ae6ca57d210dcedc4268323c9471d97194111 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Nov 2020 20:49:08 -0500 Subject: bcachefs: Fix spurious transaction restarts The checks for lock ordering violations 
weren't quite right. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 39 +++++++++++++++++++++++++-------------- fs/bcachefs/btree_iter.h | 2 +- 2 files changed, 26 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f62658f1b1dd..bbb125fb9d43 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -238,14 +238,32 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } + if (linked->btree_id != iter->btree_id) { + if (linked->btree_id > iter->btree_id) { + deadlock_iter = linked; + reason = 3; + } + continue; + } + + /* + * Within the same btree, cached iterators come before non + * cached iterators: + */ + if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { + if (btree_iter_is_cached(iter)) { + deadlock_iter = linked; + reason = 4; + } + continue; + } + /* * Interior nodes must be locked before their descendants: if * another iterator has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ - if (linked->btree_id == iter->btree_id && - btree_iter_is_cached(linked) == btree_iter_is_cached(iter) && - level > __fls(linked->nodes_locked)) { + if (level > __fls(linked->nodes_locked)) { if (!(trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, @@ -253,27 +271,20 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, iter->locks_want)); if (!btree_iter_get_locks(linked, true, false)) { deadlock_iter = linked; - reason = 3; + reason = 5; } } else { deadlock_iter = linked; - reason = 4; + reason = 6; } } /* Must lock btree nodes in key order: */ - if ((cmp_int(iter->btree_id, linked->btree_id) ?: - -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) { - deadlock_iter = linked; - reason = 5; - } - - if (iter->btree_id == linked->btree_id && - btree_node_locked(linked, level) && + if (btree_node_locked(linked, level) && bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, btree_iter_type(linked))) <= 0) { deadlock_iter = linked; - reason = 6; + reason = 7; } /* diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index f80e09255f68..f7a73619c85b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -182,7 +182,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l, const struct btree_iter *r) { return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: bkey_cmp(l->pos, r->pos); } -- cgit From 73e7470b31e439c3e86bb2371b7eb1c0bc852766 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Nov 2020 01:34:41 -0500 Subject: bcachefs: More inlinining in the btree key cache code Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9a93b6d26878..8b43460c9c9b 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) !bkey_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } +__flatten int bch2_btree_iter_traverse_cached(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; -- cgit From 811d2bcd85a82642c2cd328f6734b5c8c35e57de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 12:31:20 -0500 Subject: bcachefs: Drop typechecking from bkey_cmp_packed() This only did anything in two places, and those can just be replaced wiht bkey_cmp_left_packed()). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 8 +++---- fs/bcachefs/bkey.h | 47 +++---------------------------------- fs/bcachefs/bkey_sort.c | 10 ++++---- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_io.c | 6 ++--- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- 7 files changed, 18 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 32e4917dc004..6417307f42b9 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -413,7 +413,7 @@ static bool bkey_packed_successor(struct bkey_packed *out, if ((*p & mask) != mask) { *p += 1ULL << offset; - EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); return true; } @@ -1057,9 +1057,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, } __pure __flatten -int __bch2_bkey_cmp_packed(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { struct bkey unpacked; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index dea7dfe4b079..5ce883ba22dc 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -75,13 +75,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -#define bkey_packed_typecheck(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_packed *)); \ - type_is(_k, struct bkey_packed *); \ -}) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -89,9 +82,6 @@ enum bkey_lr_packed { BKEY_PACKED_NONE, }; -#define bkey_lr_packed_typecheck(_l, _r) \ - (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) - #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) @@ -140,9 +130,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, const struct bpos *); __pure -int __bch2_bkey_cmp_packed(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); __pure int __bch2_bkey_cmp_left_packed(const struct btree *, @@ -168,37 +158,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -/* - * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to - * skip dispatching on k->format: - */ -#define bkey_cmp_packed(_b, _l, _r) \ -({ \ - int _cmp; \ - \ - switch (bkey_lr_packed_typecheck(_l, _r)) { \ - case BKEY_PACKED_NONE: \ - _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ - ((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_LEFT: \ - _cmp = bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_l), \ - &((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_RIGHT: \ - _cmp = -bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_r), \ - &((struct bkey *) (_l))->p); \ - break; \ - case 
BKEY_PACKED_BOTH: \ - _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ - (void *) (_r), (_b)); \ - break; \ - } \ - _cmp; \ -}) - #if 1 static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 839e78d1dc35..99e0a4011fae 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); } @@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter) * and should be dropped. */ return iter->used >= 2 && - !bkey_cmp_packed(iter->b, + !bch2_bkey_cmp_packed(iter->b, iter->data[0].k, iter->data[1].k); } @@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, continue; while ((next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; @@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(l) - (int) bkey_deleted(r); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5389e4f4f350..12d5dc7bdb42 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -481,7 +481,7 @@ static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: cmp_int(l, r); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d4f61ee5ed72..c1293709eb01 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b, BUG_ON(extents ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); + //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } @@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt, break; for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bkey_cmp_packed(bt, + b = bch2_bkey_cmp_packed(bt, ptrs[c], ptrs[d]) >= 0 ? 
c : d; if (d == n) b = c; while (b != a && - bkey_cmp_packed(bt, + bch2_bkey_cmp_packed(bt, ptrs[a], ptrs[b]) >= 0) b = (b - 1) / 2; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 963213e78f31..78b8e2d00fd9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * the node the iterator points to: */ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; for_each_keylist_key(keys, insert) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 51ff6a16d249..3122256cc6ca 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_packed(b, k, &insert->k)) + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) k = NULL; /* @k is the key being overwritten/deleted, if any: */ -- cgit From b3d1e6cab2dfcdfef5fc35659a8f33a75ae5904e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 12:43:48 -0500 Subject: bcachefs: Fix build warning when CONFIG_BCACHEFS_DEBUG=n this function is only used by debug code, but we'd like to always build it so we know that it does build. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bbb125fb9d43..f1d6553890f4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2408,9 +2408,10 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? -EIO : 0; } -static void bch2_btree_iter_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, - enum btree_iter_type type) +static void __maybe_unused +bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) { pr_buf(out, " %px l=%u %s:", _b, _b->level, bch2_btree_ids[_b->btree_id]); -- cgit From a3e7226268b26f0976f64ce8b0644daae28cafff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Nov 2020 23:39:33 -0500 Subject: bcachefs: New varints Previous varint implementation used by the inode code was not nearly as fast as it could have been; partly because it was attempting to encode integers up to 96 bits (for timestamps) but this meant that encoding and decoding the length required a table lookup. Instead, we'll just encode timestamps greater than 64 bits as two separate varints; this will make decoding/encoding of inodes significantly faster overall. 
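With the new format (bch2_varint_encode()/bch2_varint_decode() in varint.c below) the first byte alone determines the length - the count of 1-bits below its first 0 bit equals the number of additional bytes - so decoding needs no table. A worked example:

/*
 * Encoding v = 300 = 0b1_0010_1100 (9 significant bits -> 2 bytes, since
 * each encoded byte carries 7 payload bits):
 *
 *      (300 << 2) | 0b01  =  1201      low bits: a single 1, then a 0
 *      stored little-endian:  0xb1 0x04
 *
 * Decoding loads a u64, finds the first 0 bit of the first byte
 * (0xb1 = 0b1011_0001, first zero at bit 1, so length = 2 bytes),
 * then shifts and masks:  1201 >> 2  =  300.
 *
 * The 96-bit timestamp fields are written as two consecutive varints -
 * the low 64 bits followed by the high part, which is currently always
 * zero since the unpacked in-memory fields are still u64.
 */
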
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs_format.h | 17 ++-- fs/bcachefs/fsck.c | 6 +- fs/bcachefs/inode.c | 187 ++++++++++++++++++++++++++++++------------ fs/bcachefs/inode.h | 17 ++-- fs/bcachefs/io.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/super.c | 1 - fs/bcachefs/varint.c | 43 ++++++++++ fs/bcachefs/varint.h | 8 ++ 10 files changed, 210 insertions(+), 74 deletions(-) create mode 100644 fs/bcachefs/varint.c create mode 100644 fs/bcachefs/varint.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index ffe4db45e1c9..dad2fe2530e5 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -57,4 +57,5 @@ bcachefs-y := \ tests.o \ trace.o \ util.o \ + varint.o \ xattr.o diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0d79bb7764a7..f072e865e43f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -673,10 +673,10 @@ struct bch_inode_generation { } __attribute__((packed, aligned(8))); #define BCH_INODE_FIELDS() \ - x(bi_atime, 64) \ - x(bi_ctime, 64) \ - x(bi_mtime, 64) \ - x(bi_otime, 64) \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ x(bi_size, 64) \ x(bi_sectors, 64) \ x(bi_uid, 32) \ @@ -743,7 +743,8 @@ enum { #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); /* Dirents */ @@ -1334,13 +1335,15 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(btree_ptr_v2, 11) \ x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) + x(reflink_inline_data, 14) \ + x(new_varint, 15) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)) + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_new_varint))\ enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5a6df3d1973a..e3671b66c046 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -537,7 +537,7 @@ retry: bch2_trans_unlock(&trans); - bch2_inode_pack(&p, &w.inode); + bch2_inode_pack(c, &p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &p.inode.k_i, NULL, NULL, @@ -808,7 +808,7 @@ create_root: 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed, root_inode); + bch2_inode_pack(c, &packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, @@ -1326,7 +1326,7 @@ static int check_inode(struct btree_trans *trans, if (do_update) { struct bkey_inode_buf p; - bch2_inode_pack(&p, &u); + bch2_inode_pack(c, &p, &u); ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b49c382f5452..c64197d8fc84 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -8,6 +8,7 @@ #include "extents.h" #include "inode.h" #include "str_hash.h" +#include "varint.h" #include @@ -89,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) 
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - u8 *out = packed->inode.v.fields; + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; unsigned bytes; - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - -#define x(_name, _bits) \ +#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -123,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + + if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { + SET_INODE_NEW_VARINT(&packed->inode.v, true); + bch2_inode_pack_v2(packed, inode); + } else { + bch2_inode_pack_v1(packed, inode); + } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -135,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); BCH_INODE_FIELDS() #undef x } } -int bch2_inode_unpack(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) { const u8 *in = inode.v->fields; - const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + const u8 *end = bkey_val_end(inode); u64 field[2]; unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - 
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - #define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ @@ -177,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, #undef x /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + ret = bch2_varint_decode(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(inode, unpacked); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } return 0; } @@ -223,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(inode_p, inode); + bch2_inode_pack(trans->c, inode_p, inode); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -426,10 +537,7 @@ found_slot: inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - bch2_trans_iter_put(trans, iter); - return 0; + return bch2_inode_write(trans, iter, inode_u); } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) @@ -553,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void) -{ - struct bch_inode_unpacked *u, test_inodes[] = { - { - .bi_atime = U64_MAX, - .bi_ctime = U64_MAX, - .bi_mtime = U64_MAX, - .bi_otime = U64_MAX, - .bi_size = U64_MAX, - .bi_sectors = U64_MAX, - .bi_uid = U32_MAX, - .bi_gid = U32_MAX, - .bi_nlink = U32_MAX, - .bi_generation = U32_MAX, - .bi_dev = U32_MAX, - }, - }; - - for (u = test_inodes; - u < test_inodes + ARRAY_SIZE(test_inodes); - u++) { - struct bkey_inode_buf p; - - bch2_inode_pack(&p, u); - } -} -#endif diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5743be2307f3..ef7e885dce0c 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_inode_generation_to_text, \ } +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; @@ -43,7 +51,8 @@ struct bkey_inode_buf { #undef x } __attribute__((packed, aligned(8))); -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked 
*); +void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_peek(struct btree_trans *, @@ -166,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void); -#else -static inline void bch2_inode_pack_test(void) {} -#endif - #endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6df99ac013a1..62a9a0b32d5b 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -310,7 +310,7 @@ int bch2_extent_update(struct btree_trans *trans, inode_u.bi_sectors += delta; if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); + bch2_inode_pack(trans->c, &inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 32fed6b81a52..1745cfac6b26 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1320,7 +1320,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed_inode, &root_inode); + bch2_inode_pack(c, &packed_inode, &root_inode); err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e55fcbcbd37f..61b7e750037c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -2027,7 +2027,6 @@ static void bcachefs_exit(void) static int __init bcachefs_init(void) { bch2_bkey_pack_test(); - bch2_inode_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || bch2_chardev_init() || diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 index 000000000000..0f3d06a6a685 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "varint.h" + +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(v & 255) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 index 000000000000..8daf813576b7 --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */ -- cgit From fe4584765d831571231de629fc139af5fc9db2d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 13:03:24 -0500 Subject: bcachefs: use a radix tree for inum bitmap in fsck The change to use the cpu nr for the high bits of new inode numbers means that inode numbers are very space - we see -ENOMEM during fsck without this. 
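Because new inode numbers carry the CPU number in their high bits they are very sparse, so a flat bitmap would have to cover the whole range up to the largest inum ever seen; the radix-tree-backed bitmap below only allocates nodes for words that are actually set. A usage sketch built on the helpers the patch adds (the function name is illustrative):

static int mark_dir_seen(inode_bitmap *dirs_done, u64 inum)
{
        if (inode_bitmap_test(dirs_done, inum))
                return 0;       /* already visited */

        /* only fails (-ENOMEM) if a radix node allocation fails */
        return inode_bitmap_set(dirs_done, inum);
}
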
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e3671b66c046..0c5035270846 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -866,36 +866,22 @@ create_lostfound: return ret; } -struct inode_bitmap { - unsigned long *bits; - size_t size; -}; +typedef GENRADIX(unsigned long) inode_bitmap; -static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) { - return nr < b->size ? test_bit(nr, b->bits) : false; + unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); + return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; } -static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) { - if (nr >= b->size) { - size_t new_size = max_t(size_t, max_t(size_t, - PAGE_SIZE * 8, - b->size * 2), - nr + 1); - void *n; - - new_size = roundup_pow_of_two(new_size); - n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); - if (!n) { - return -ENOMEM; - } + unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); - b->bits = n; - b->size = new_size; - } + if (!w) + return -ENOMEM; - __set_bit(nr, b->bits); + *w |= 1UL << (nr & (BITS_PER_LONG - 1)); return 0; } @@ -934,7 +920,7 @@ noinline_for_stack static int check_directory_structure(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode) { - struct inode_bitmap dirs_done = { NULL, 0 }; + inode_bitmap dirs_done; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; struct btree_trans trans; @@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c, /* DFS: */ restart_dfs: + genradix_init(&dirs_done); had_unreachable = false; ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); @@ -1057,7 +1044,7 @@ retry: if (had_unreachable) { bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); memset(&dirs_done, 0, sizeof(dirs_done)); memset(&path, 0, sizeof(path)); @@ -1066,7 +1053,7 @@ retry: err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); return ret; } -- cgit From 9ae82fe6ace1b267005758ccfb2347a4a6aa4398 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 16:16:52 -0500 Subject: bcachefs: Inline make_bfloat() into __build_ro_aux_tree() This is a fast path - also, lift out the checks/init for min/max key. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 94 ++++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index fc8a7cc2f4bd..89d511db2c50 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -592,53 +592,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, return (u16) v; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +__always_inline +static inline void __make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l, *r; + struct bkey_packed *l = is_power_of_2(j) + ? min_key + : tree_to_prev_bkey(b, t, j >> ffs(j)); + struct bkey_packed *r = is_power_of_2(j + 1) + ? max_key + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); unsigned mantissa; int shift, exponent, high_bit; - if (is_power_of_2(j)) { - l = min_key; - - if (!l->u64s) { - if (!bkey_pack_pos(l, b->data->min_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = b->data->min_key; - bkey_copy(l, &tmp); - } - } - } else { - l = tree_to_prev_bkey(b, t, j >> ffs(j)); - - EBUG_ON(m < l); - } - - if (is_power_of_2(j + 1)) { - r = max_key; - - if (!r->u64s) { - if (!bkey_pack_pos(r, t->max_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = t->max_key; - bkey_copy(r, &tmp); - } - } - } else { - r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - EBUG_ON(m > r); - } - /* * for failed bfloats, the lookup code falls back to comparing against * the original key. 
@@ -695,6 +665,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } +static void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_i *k; + + if (is_power_of_2(j) && + !min_key->u64s) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } + + if (is_power_of_2(j + 1) && + !max_key->u64s) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = t->max_key; + } + + __make_bfloat(b, t, j, min_key, max_key); +} + /* bytes remaining - only valid for last bset: */ static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { @@ -714,7 +708,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } -static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k; @@ -733,15 +727,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) } } -static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); - struct bkey_packed min_key, max_key; + struct bkey_i min_key, max_key; unsigned j, cacheline = 1; - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); retry: @@ -777,9 +768,16 @@ retry: t->max_key = bkey_unpack_pos(b, prev); + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + bkey_init(&max_key.k); + max_key.k.p = t->max_key; + /* Then we build the tree */ eytzinger1_for_each(j, t->size) - make_bfloat(b, t, j, &min_key, &max_key); + __make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -- cgit From 01819cfe37e864a0e7d6f208c2e5b4635c66f974 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 16:55:57 -0500 Subject: bcachefs: Fix btree iterator leak this fixes an occasonial btree transaction iterators overflow. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c64197d8fc84..8e52d475b397 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -537,7 +537,9 @@ found_slot: inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - return bch2_inode_write(trans, iter, inode_u); + ret = bch2_inode_write(trans, iter, inode_u); + bch2_trans_iter_put(trans, iter); + return ret; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) -- cgit From 6a747c4683803abb01ce246ac2faf7f171cb3872 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Nov 2020 13:01:52 -0500 Subject: bcachefs: Add accounting for dirty btree nodes/keys This lets us improve journal reclaim, so that it now tries to make sure no more than 3/4s of the btree node cache and btree key cache are dirty - ensuring the shrinkers can free memory. 
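The "no more than 3/4 dirty" policy is plain integer arithmetic in journal reclaim; pulled out into an illustrative helper:

#include <linux/types.h>

/*
 * dirty/total > 3/4, kept as multiplies so there is no division: when this
 * trips for the btree node cache or the btree key cache, reclaim asks
 * journal_flush_pins() for at least one pin, which writes dirty entries
 * back and gives the shrinkers something clean to evict.
 */
static inline bool cache_too_dirty(size_t dirty, size_t total)
{
        return dirty * 4 > total * 3;
}
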
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 +++- fs/bcachefs/btree_io.c | 2 ++ fs/bcachefs/btree_io.h | 17 ++++++++++++++++ fs/bcachefs/btree_key_cache.c | 39 +++++++++++++++++++++++++++++-------- fs/bcachefs/btree_types.h | 6 +++++- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/journal_reclaim.c | 6 ++++++ 8 files changed, 69 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 229841c2ef0c..d130447e3477 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -382,11 +382,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_data_free(c, b); } + BUG_ON(atomic_read(&c->btree_cache.dirty)); + while (!list_empty(&bc->freed)) { b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c1293709eb01..0de703c5b4b7 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1498,6 +1498,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + atomic_dec(&c->btree_cache.dirty); + BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 626d0f071b70..1a4b11e99cc4 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -14,6 +14,23 @@ struct btree_write; struct btree; struct btree_iter; +static inline bool btree_node_dirty(struct btree *b) +{ + return test_bit(BTREE_NODE_dirty, &b->flags); +} + +static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + +static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +} + struct btree_read_bio { struct bch_fs *c; u64 start_time; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 8b43460c9c9b..4c61324f59d4 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -65,6 +65,8 @@ static void bkey_cached_evict(struct btree_key_cache *c, BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); + + c->nr_keys--; } static void bkey_cached_free(struct btree_key_cache *c, @@ -135,6 +137,8 @@ btree_key_cache_create(struct btree_key_cache *c, return NULL; } + c->nr_keys++; + list_move(&ck->list, &c->clean); six_unlock_write(&ck->c.lock); @@ -355,10 +359,14 @@ err: bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); if (!evict) { mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty--; + } + list_move_tail(&ck->list, &c->btree_key_cache.clean); mutex_unlock(&c->btree_key_cache.lock); } else { @@ -371,6 +379,11 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + 
c->btree_key_cache.nr_dirty--; + } + bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); @@ -448,9 +461,10 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { mutex_lock(&c->btree_key_cache.lock); - list_del_init(&ck->list); + list_move(&ck->list, &c->btree_key_cache.dirty); set_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty++; mutex_unlock(&c->btree_key_cache.lock); } @@ -467,20 +481,28 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, } #endif -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bkey_cached *ck, *n; - mutex_lock(&c->lock); - list_for_each_entry_safe(ck, n, &c->clean, list) { + mutex_lock(&bc->lock); + list_splice(&bc->dirty, &bc->clean); + + list_for_each_entry_safe(ck, n, &bc->clean, list) { kfree(ck->k); kfree(ck); + bc->nr_keys--; } - list_for_each_entry_safe(ck, n, &c->freed, list) + + BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); + BUG_ON(bc->nr_keys); + + list_for_each_entry_safe(ck, n, &bc->freed, list) kfree(ck); - mutex_unlock(&c->lock); + mutex_unlock(&bc->lock); - rhashtable_destroy(&c->table); + rhashtable_destroy(&bc->table); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -488,6 +510,7 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); INIT_LIST_HEAD(&c->clean); + INIT_LIST_HEAD(&c->dirty); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 55ea028d242e..de287f91ac28 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -158,6 +158,7 @@ struct btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + atomic_t dirty; struct shrinker shrink; /* @@ -294,6 +295,10 @@ struct btree_key_cache { struct rhashtable table; struct list_head freed; struct list_head clean; + struct list_head dirty; + + size_t nr_keys; + size_t nr_dirty; }; struct bkey_cached_key { @@ -411,7 +416,6 @@ enum btree_flags { BTREE_FLAG(read_in_flight); BTREE_FLAG(read_error); -BTREE_FLAG(dirty); BTREE_FLAG(need_write); BTREE_FLAG(noevict); BTREE_FLAG(write_idx); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 78b8e2d00fd9..c1f822b96c48 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -149,7 +149,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) b->ob.nr = 0; - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); @@ -264,7 +264,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; set_btree_node_accessed(b); - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); @@ -827,7 +827,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); clear_btree_node_need_write(b); /* @@ -1034,7 +1034,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct 
btree *b bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(b); + set_btree_node_dirty(as->c, b); set_btree_node_need_write(b); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3122256cc6ca..4ab12a9db2f4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3f57f498ce0b..da28761e7942 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -547,6 +547,12 @@ void bch2_journal_reclaim(struct journal *j) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; + + if ((atomic_read(&c->btree_cache.dirty) * 4 > + c->btree_cache.used * 3) || + (c->btree_key_cache.nr_dirty * 4 > + c->btree_key_cache.nr_keys)) + min_nr = 1; } while (journal_flush_pins(j, seq_to_flush, min_nr)); if (!bch2_journal_error(j)) -- cgit From f526d26d711a376aa3dd8dd56f55928d5a28d9b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Nov 2020 17:47:39 -0500 Subject: bcachefs: Fix btree key cache shutdown On emergency shutdown, we might still have dirty keys in the btree key cache that need to be cleaned up properly. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 3 +++ fs/bcachefs/journal_reclaim.c | 1 + 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4c61324f59d4..8b5e690a4d83 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -490,6 +490,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_splice(&bc->dirty, &bc->clean); list_for_each_entry_safe(ck, n, &bc->clean, list) { + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + kfree(ck->k); kfree(ck); bc->nr_keys--; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index da28761e7942..f9e0160074db 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -263,6 +263,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); BUG_ON(!fifo_pop(&j->pin, temp)); popped = true; } -- cgit From e648448ca562af0cb11729ce6fad0f860fa42cb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Nov 2020 18:59:41 -0500 Subject: bcachefs: Fix missing memalloc_nofs_restore() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d130447e3477..9d8c73ec57d3 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -329,9 +329,9 @@ restart: clear_btree_node_accessed(b); } - memalloc_nofs_restore(flags); mutex_unlock(&bc->lock); out: + memalloc_nofs_restore(flags); return (unsigned long) freed * btree_pages(c); } -- cgit From 
6d9378f3dcd7b91effdc4ffe1da1a2e8987e9f1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Nov 2020 12:42:54 -0500 Subject: bcachefs: Hack around bch2_varint_decode invalid reads bch2_varint_decode can do reads up to 7 bytes past the end ptr, for the sake of performance - these extra bytes are always masked off. This won't be a problem in practice if we make sure to burn 8 bytes in any buffer that has bkeys in it. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 3 +++ fs/bcachefs/btree_update_interior.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0de703c5b4b7..302ee3851b0d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1532,6 +1532,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7668225e72c6..41854fc345d2 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -237,6 +237,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, b->whiteout_u64s; ssize_t total = c->opts.btree_node_size << 6; + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; + return total - used; } -- cgit From eb8e6e9ccbb4ba37c04a7cff032975b4df7d63c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Nov 2020 12:33:12 -0500 Subject: bcachefs: Deadlock prevention for ei_pagecache_lock In the dio write path, when get_user_pages() invokes the fault handler we have a recursive locking situation - we have to handle the lock ordering ourselves or we have a deadlock: this patch addresses that by checking for locking ordering violations and doing the unlock/relock dance if necessary. 
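The "signal that lock has been dropped" part works by tagging a pointer: struct address_space pointers are word aligned, so bit 0 of current->faults_disabled_mapping is free to carry a flag alongside the pointer. The fdm helpers below are this idiom specialized to that field; generically (names illustrative):

#include <linux/types.h>

static inline void *ptr_set_flag(void *p)   { return (void *) ((unsigned long) p | 1UL); }
static inline void *ptr_clear_flag(void *p) { return (void *) ((unsigned long) p & ~1UL); }
static inline bool ptr_test_flag(void *p)   { return ((unsigned long) p) & 1UL; }

Since the fault handler cannot tell get_user_pages() "retry with different locks", it returns VM_FAULT_SIGBUS after the unlock/relock dance; the dio write loop then checks the flag to distinguish that from a real fault and simply retries.
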
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/fs.c | 5 ++++ fs/bcachefs/fs.h | 1 + 3 files changed, 72 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 658d19c04b99..1afdd775ffb3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -44,6 +44,22 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + struct quota_res { u64 sectors; }; @@ -501,10 +517,35 @@ static void bch2_set_page_dirty(struct bch_fs *c, vm_fault_t bch2_page_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); int ret; + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + goto got_lock; + + bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + bch2_pagecache_add_get(&inode->ei_pagecache_lock); +got_lock: ret = filemap_fault(vmf); bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -1765,14 +1806,16 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; - unsigned unaligned; - bool sync = dio->sync; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; long ret; if (dio->loop) goto loop; while (1) { + iter_count = dio->iter.count; + if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); @@ -1780,13 +1823,34 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bio_iov_iter_get_pages(bio, &dio->iter); + dropped_locks = fdm_dropped_locks(); + current->faults_disabled_mapping = NULL; if (kthread) kthread_unuse_mm(dio->mm); + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + if (unlikely(ret < 0)) goto err; + if (unlikely(dropped_locks)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); bio->bi_iter.bi_size -= unaligned; iov_iter_revert(&dio->iter, unaligned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3e3ab4e53f33..231a5433577f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -93,6 +93,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock) __pagecache_lock_put(lock, 1); } +bool 
bch2_pagecache_add_tryget(struct pagecache_lock *lock) +{ + return __pagecache_lock_tryget(lock, 1); +} + void bch2_pagecache_add_get(struct pagecache_lock *lock) { __pagecache_lock_get(lock, 1); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b3a2993dd9bc..7c095b856b05 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock) } void bch2_pagecache_add_put(struct pagecache_lock *); +bool bch2_pagecache_add_tryget(struct pagecache_lock *); void bch2_pagecache_add_get(struct pagecache_lock *); void bch2_pagecache_block_put(struct pagecache_lock *); void bch2_pagecache_block_get(struct pagecache_lock *); -- cgit From 35ef6df5ca67347c606e418eb3ef71870ea97ba7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 14:39:43 -0500 Subject: bcachefs: Improve journal entry validate code Previously, the journal entry read code was changed so that if we got a journal entry that failed validation, we'd try to use it, preferring to use a good version from another device if available. But this left a bug where if an earlier validation check (say, checksum) failed, the later checks (for last_seq) wouldn't run and we'd end up using a journal entry with a garbage last_seq field. This fixes that so that the later validation checks run and if necessary change those fields to something sensible. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 80c833f1390b..e976aa83d527 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -431,46 +431,45 @@ static int jset_validate(struct bch_fs *c, "%s sector %llu seq %llu: unknown journal entry version %u", ca->name, sector, le64_to_cpu(jset->seq), version)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + /* don't try to continue: */ + return EINVAL; } + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca->name, sector, le64_to_cpu(jset->seq), bytes)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + ret = JOURNAL_ENTRY_BAD; + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - return JOURNAL_ENTRY_BAD; + JSET_CSUM_TYPE(jset))) { + ret = JOURNAL_ENTRY_BAD; + goto bad_csum_type; + } csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "%s sector %llu seq %llu: journal checksum bad", - ca->name, sector, le64_to_cpu(jset->seq))) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } + ca->name, sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - +bad_csum_type: if 
(journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, "invalid journal entry: last_seq > seq")) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } - - return 0; fsck_err: return ret; } -- cgit From 101d471367a4b3a9158c53d3ca0093b0bed60338 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 14:49:57 -0500 Subject: bcachefs: Fix a 64 bit divide this fixes builds on 32 bit. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0dc01386d1cd..65d9b8126609 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -321,7 +321,7 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -- cgit From 6d758368f1265ca9d0b7a077caf1ca9e9859c8c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 18:30:53 -0500 Subject: bcachefs: Fix a btree transaction iter overflow extent_replay_key dates from before putting iterators was required - fixed. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1745cfac6b26..6750063663b5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -456,6 +456,7 @@ retry: __bch2_btree_iter_set_pos(split_iter, split->k.p, false); bch2_trans_update(&trans, split_iter, split, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(&trans, split_iter); bch2_btree_iter_set_pos(iter, split->k.p); @@ -481,6 +482,8 @@ retry: BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY); err: + bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) goto retry; -- cgit From 1640647c04bf0963e51aaec595af026d383352d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 16:51:02 -0500 Subject: bcachefs: Inode delete doesn't need to flush key cache anymore Inode create checks to make sure the slot doesn't exist in the btree key cache. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8e52d475b397..f00778d78271 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -576,16 +576,9 @@ retry: bi_generation = 0; - ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); - if (ret) { - if (ret != -EINTR) - bch_err(c, "error flushing btree key cache: %i", ret); - goto err; - } - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) -- cgit From 61501161817b1dee17231af6a255af836c9b6853 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 15:03:34 -0500 Subject: bcachefs: Be more careful in bch2_bkey_to_text() This is used to print keys that failed bch2_bkey_invalid(), so be more careful with k->type. 
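The change below boils down to defensive printing: never use an unvalidated type as an index into the ops/name tables. A tiny illustrative sketch of that pattern (hypothetical table and names, not the bcachefs ones):

#include <stdio.h>

#define KEY_TYPE_MAX 3
static const char * const type_names[KEY_TYPE_MAX] = {
	"deleted", "error", "inode",
};

static void print_type(unsigned type)
{
	if (type < KEY_TYPE_MAX)
		printf("type %s\n", type_names[type]);
	else
		printf("type %u (unknown)\n", type);	/* fall back to the raw value */
}

int main(void)
{
	print_type(2);	/* prints "type inode" */
	print_type(42);	/* prints "type 42 (unknown)" */
	return 0;
}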
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 99b7fce2bfd3..f5779795a4b2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { - pr_buf(out, "u64s %u type %s ", k->u64s, - bch2_bkey_types[k->type]); + pr_buf(out, "u64s %u type ", k->u64s); + + if (k->type < KEY_TYPE_MAX) + pr_buf(out, "%s ", bch2_bkey_types[k->type]); + else + pr_buf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); @@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + if (k.k->type < KEY_TYPE_MAX) { + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); + } else { + pr_buf(out, "(invalid type %u)", k.k->type); + } } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From ed0d631fa50112c51f302442e3d11a1c5f4d2bb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 16:19:24 -0500 Subject: bcachefs: Improve journal error messages Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e976aa83d527..a251f76fdd39 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -172,7 +172,9 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { + "invalid %s in journal entry %llu offset %zi: k->u64s 0", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data)) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return 0; @@ -180,16 +182,19 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { + "invalid %s in journal entry %llu offset %zi: extends past end of journal entry", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data)) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return 0; } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { + "invalid %s in journal entry %llu offset %zi: bad format %u", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + k->k.format)) { le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); @@ -207,8 +212,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", - type, invalid, buf); + mustfix_fsck_err(c, "invalid %s in journal 
entry %llu offset %zi: %s\n%s", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + invalid, buf); le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -- cgit From 1676a398d37bffa29824f132a29f2836282940f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Nov 2020 13:12:50 -0500 Subject: bcachefs: Delete dead journalling code Usage of the journal has gotten somewhat simpler over time - neat. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 112 -------------------------------------------------- fs/bcachefs/journal.h | 5 --- fs/bcachefs/sysfs.c | 2 +- 3 files changed, 1 insertion(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1f7f3b96bd87..f57ab3884761 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -502,74 +502,6 @@ out: /* journal flushing: */ -u64 bch2_journal_last_unwritten_seq(struct journal *j) -{ - u64 seq; - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - if (j->reservations.prev_buf_unwritten) - seq--; - spin_unlock(&j->lock); - - return seq; -} - -/** - * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't - * open yet, or wait if we cannot - * - * used by the btree interior update machinery, when it needs to write a new - * btree root - every journal entry contains the roots of all the btrees, so it - * doesn't need to bother with getting a journal reservation - */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int ret; - - spin_lock(&j->lock); - - /* - * Can't try to open more than one sequence number ahead: - */ - BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); - - if (journal_cur_seq(j) > seq || - journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return 0; - } - - if (journal_cur_seq(j) < seq && - !__journal_entry_close(j)) { - /* haven't finished writing out the previous one: */ - trace_journal_entry_full(c); - ret = -EAGAIN; - } else { - BUG_ON(journal_cur_seq(j) != seq); - - ret = journal_entry_open(j); - } - - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - - if (ret == -EAGAIN || ret == -ENOSPC) - closure_wait(&j->async_wait, cl); - - spin_unlock(&j->lock); - - if (ret == -ENOSPC) { - trace_journal_full(c); - bch2_journal_reclaim_work(&j->reclaim_work.work); - ret = -EAGAIN; - } - - return ret; -} - static int journal_seq_error(struct journal *j, u64 seq) { union journal_res_state state = READ_ONCE(j->reservations); @@ -601,35 +533,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) return NULL; } -/** - * bch2_journal_wait_on_seq - wait for a journal entry to be written - * - * does _not_ cause @seq to be written immediately - if there is no other - * activity to cause the relevant journal entry to be filled up or flushed it - * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is - * configurable). 
- */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, - struct closure *parent) -{ - struct journal_buf *buf; - - spin_lock(&j->lock); - - if ((buf = journal_seq_to_buf(j, seq))) { - if (!closure_wait(&buf->wait, parent)) - BUG(); - - if (seq == journal_cur_seq(j)) { - smp_mb(); - if (bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - } - - spin_unlock(&j->lock); -} - /** * bch2_journal_flush_seq_async - wait for a journal entry to be written * @@ -679,21 +582,6 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) return ret ?: ret2 < 0 ? ret2 : 0; } -/** - * bch2_journal_meta_async - force a journal entry to be written - */ -void bch2_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - - memset(&res, 0, sizeof(res)); - - bch2_journal_res_get(j, &res, jset_u64s(0), 0); - bch2_journal_res_put(j, &res); - - bch2_journal_flush_seq_async(j, res.seq, parent); -} - int bch2_journal_meta(struct journal *j) { struct journal_res res; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index b8e74c483a23..8931ff3627a8 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -466,13 +466,8 @@ void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); -u64 bch2_journal_last_unwritten_seq(struct journal *); -int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); - -void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -void bch2_journal_meta_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 598ad6bdd61b..89287bfe31a7 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -458,7 +458,7 @@ STORE(bch2_fs) /* Debugging: */ if (attr == &sysfs_trigger_journal_flush) - bch2_journal_meta_async(&c->journal, NULL); + bch2_journal_meta(&c->journal); if (attr == &sysfs_trigger_btree_coalesce) bch2_coalesce(c); -- cgit From 158eecb88ed3100bef01917913a26e9aad152417 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Nov 2020 16:04:30 -0500 Subject: bcachefs: Assorted journal refactoring Improved the way we track various state by adding j->err_seq, which records the first journal sequence number that encountered an error being written, and j->last_empty_seq, which records the most recent journal entry that was completely empty. Also, use the low bits of the journal sequence number to index the corresponding journal_buf. 
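A toy sketch of the buffer-indexing part of this refactoring, assuming (as the patch below does) that at most two journal entries are in flight at once, so the low bit of the sequence number is enough to pick the buffer:

#include <stdint.h>
#include <stdio.h>

struct journal_buf { uint64_t seq; };

int main(void)
{
	struct journal_buf bufs[2] = { { 0 }, { 0 } };
	uint64_t seq;

	for (seq = 100; seq < 104; seq++) {
		struct journal_buf *b = &bufs[seq & 1];	/* low bit picks the buffer */

		b->seq = seq;
		printf("seq %llu -> buf[%u]\n",
		       (unsigned long long) seq, (unsigned) (seq & 1));
	}
	return 0;
}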
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 105 +++++++++++++++++++++----------------------- fs/bcachefs/journal.h | 2 +- fs/bcachefs/journal_io.c | 25 ++++++----- fs/bcachefs/journal_types.h | 3 +- 4 files changed, 67 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f57ab3884761..e7b60876d09a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -17,7 +17,19 @@ #include "super-io.h" #include "trace.h" -static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); +static u64 last_unwritten_seq(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + + lockdep_assert_held(&j->lock); + + return journal_cur_seq(j) - s.prev_buf_unwritten; +} + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq >= last_unwritten_seq(j); +} static bool __journal_entry_is_open(union journal_res_state state) { @@ -29,6 +41,22 @@ static bool journal_entry_is_open(struct journal *j) return __journal_entry_is_open(j->reservations); } +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); + EBUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & 1); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +} + static void journal_pin_new_entry(struct journal *j, int count) { struct journal_entry_pin_list *p; @@ -50,6 +78,8 @@ static void bch2_journal_buf_init(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); + bkey_extent_init(&buf->key); + memset(buf->has_inode, 0, sizeof(buf->has_inode)); memset(buf->data, 0, sizeof(*buf->data)); @@ -71,6 +101,7 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } @@ -138,8 +169,6 @@ static bool __journal_entry_close(struct journal *j) BUG_ON(sectors > buf->sectors); buf->sectors = sectors; - bkey_extent_init(&buf->key); - /* * We have to set last_seq here, _before_ opening a new journal entry: * @@ -161,11 +190,6 @@ static bool __journal_entry_close(struct journal *j) */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); - if (journal_entry_empty(buf->data)) - clear_bit(JOURNAL_NOT_EMPTY, &j->flags); - else - set_bit(JOURNAL_NOT_EMPTY, &j->flags); - journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -502,49 +526,28 @@ out: /* journal flushing: */ -static int journal_seq_error(struct journal *j, u64 seq) -{ - union journal_res_state state = READ_ONCE(j->reservations); - - if (seq == journal_cur_seq(j)) - return bch2_journal_error(j); - - if (seq + 1 == journal_cur_seq(j) && - !state.prev_buf_unwritten && - seq > j->seq_ondisk) - return -EIO; - - return 0; -} - -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - /* seq should be for a journal entry that has been opened: */ - BUG_ON(seq > journal_cur_seq(j)); - BUG_ON(seq == journal_cur_seq(j) && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); - - if (seq == journal_cur_seq(j)) - return journal_cur_buf(j); - if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) - return journal_prev_buf(j); - return NULL; -} - /** * bch2_journal_flush_seq_async - 
wait for a journal entry to be written * * like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, +int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { struct journal_buf *buf; + int ret = 0; spin_lock(&j->lock); + if (seq <= j->err_seq) { + ret = -EIO; + goto out; + } + + if (seq <= j->seq_ondisk) { + ret = 1; + goto out; + } if (parent && (buf = journal_seq_to_buf(j, seq))) @@ -553,20 +556,8 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, if (seq == journal_cur_seq(j)) __journal_entry_close(j); +out: spin_unlock(&j->lock); -} - -static int journal_seq_flushed(struct journal *j, u64 seq) -{ - int ret; - - spin_lock(&j->lock); - ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); - - if (seq == journal_cur_seq(j)) - __journal_entry_close(j); - spin_unlock(&j->lock); - return ret; } @@ -575,7 +566,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; - ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); bch2_time_stats_update(j->flush_seq_time, start_time); @@ -876,7 +867,8 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_NOT_EMPTY, &j->flags)); + (journal_entry_is_open(j) || + j->last_empty_seq + 1 != journal_cur_seq(j))); cancel_delayed_work_sync(&j->write_work); cancel_delayed_work_sync(&j->reclaim_work); @@ -934,6 +926,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); journal_pin_new_entry(j, 1); + + j->reservations.idx = journal_cur_seq(j); + bch2_journal_buf_init(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 8931ff3627a8..7ad2bb576eb0 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -466,7 +466,7 @@ void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); -void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a251f76fdd39..a6fb4fb207a2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -944,24 +944,29 @@ static void journal_write_done(struct closure *cl) struct bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); + int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); - goto err; + err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); - - if (bch2_mark_replicas(c, &replicas.e)) - goto err; + if (err) + bch2_fatal_error(c); spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; j->seq_ondisk = seq; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; j->last_seq_ondisk = last_seq; bch2_journal_space_available(j); @@ -973,7 +978,7 @@ static void journal_write_done(struct 
closure *cl) * bch2_fs_journal_stop(): */ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -out: + /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -987,11 +992,6 @@ out: if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); spin_unlock(&j->lock); - return; -err: - bch2_fatal_error(c); - spin_lock(&j->lock); - goto out; } static void journal_write_endio(struct bio *bio) @@ -1072,6 +1072,9 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + if (journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 6d0ee8e42da1..22ff7f8081c6 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -127,7 +127,6 @@ enum { JOURNAL_STARTED, JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, - JOURNAL_NOT_EMPTY, JOURNAL_MAY_GET_UNRESERVED, }; @@ -181,6 +180,8 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; /* * FIFO of journal entries whose btree updates have not yet been -- cgit From d8b46004648c66dbde5d35eed8e3939987d7a833 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Nov 2020 16:31:58 -0500 Subject: bcachefs: Check for errors from register_shrinker() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9d8c73ec57d3..2c8f67fabc5a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -448,7 +448,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; bc->shrink.batch = btree_pages(c) * 2; - register_shrinker(&bc->shrink, "%s/btree_cache", c->name); + ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; -- cgit From 876c7af3a6620c5698782f18bff8a3ed7e006d78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Nov 2020 16:30:22 -0500 Subject: bcachefs: Take a SRCU lock in btree transactions Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 1 + fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/btree_iter.c | 8 +++++++- fs/bcachefs/btree_types.h | 1 + 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index eccf643e9081..151c4b10d543 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -19,6 +19,7 @@ config BCACHEFS_FS select KEYS select RAID6_PQ select XOR_BLOCKS + select SRCU help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8ac96384fddf..d77d1fc1cfed 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include #include #include +#include #include #include #include @@ -642,6 +643,8 @@ struct bch_fs { mempool_t btree_iters_pool; struct btree_iter_buf __percpu *btree_iters_bufs; + struct srcu_struct btree_trans_barrier; + struct btree_key_cache btree_key_cache; struct workqueue_struct *wq; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f1d6553890f4..007d69656660 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2373,6 +2373,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, if (expected_mem_bytes) bch2_trans_preload_mem(trans, expected_mem_bytes); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + #ifdef CONFIG_BCACHEFS_DEBUG trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); @@ -2393,6 +2395,8 @@ int bch2_trans_exit(struct btree_trans *trans) mutex_unlock(&trans->c->btree_trans_lock); #endif + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); kfree(trans->fs_usage_deltas); @@ -2475,6 +2479,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_iters_pool); + cleanup_srcu_struct(&c->btree_trans_barrier); } int bch2_fs_btree_iter_init(struct bch_fs *c) @@ -2484,7 +2489,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + sizeof(struct btree_insert_entry) * nr); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index de287f91ac28..47564995a0a3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -350,6 +350,7 @@ struct btree_trans { pid_t pid; #endif unsigned long ip; + int srcu_idx; u64 iters_linked; u64 iters_live; -- cgit From 628a3ad2c2dfd8e89ac9ab9fc3682f8e2ce504d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Nov 2020 17:19:47 -0500 Subject: bcachefs: Add a shrinker for the btree key cache Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 93 ++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/btree_types.h | 2 + 2 files changed, 90 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 8b5e690a4d83..71d5bfa4caab 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -10,6 +10,8 @@ #include "journal_reclaim.h" #include "trace.h" +#include + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -69,10 +71,15 @@ static void bkey_cached_evict(struct btree_key_cache *c, c->nr_keys--; } -static void bkey_cached_free(struct btree_key_cache *c, +static void bkey_cached_free(struct btree_key_cache *bc, struct bkey_cached *ck) { - list_move(&ck->list, &c->freed); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_move(&ck->list, &bc->freed); kfree(ck->k); ck->k = NULL; @@ -404,19 +411,23 @@ static 
void btree_key_cache_journal_flush(struct journal *j, struct bkey_cached_key key; struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + six_lock_read(&ck->c.lock, NULL, NULL); key = ck->key; if (ck->journal.seq != seq || !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_read(&ck->c.lock); - return; + goto unlock; } six_unlock_read(&ck->c.lock); bch2_trans_init(&trans, c, 0, 0); btree_key_cache_flush_pos(&trans, key, seq, false); bch2_trans_exit(&trans); +unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); } /* @@ -481,11 +492,77 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, } #endif +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; + unsigned flags; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + return -1; + + flags = memalloc_nofs_save(); + + list_for_each_entry_safe(ck, t, &bc->freed, list) { + scanned++; + + if (poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) { + list_del(&ck->list); + kfree(ck); + freed++; + } + + if (scanned >= nr) + goto out; + } + + list_for_each_entry_safe(ck, t, &bc->clean, list) { + scanned++; + + if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + if (scanned >= nr) { + if (&t->list != &bc->clean) + list_move_tail(&bc->clean, &t->list); + goto out; + } + } +out: + memalloc_nofs_restore(flags); + mutex_unlock(&bc->lock); + + return freed; +} + +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + + return bc->nr_keys; +} + void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bkey_cached *ck, *n; + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + mutex_lock(&bc->lock); list_splice(&bc->dirty, &bc->clean); @@ -516,9 +593,15 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) INIT_LIST_HEAD(&c->dirty); } -int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { - return rhashtable_init(&c->table, &bch2_btree_key_cache_params); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + + return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name) ?: + rhashtable_init(&bc->table, &bch2_btree_key_cache_params); } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 47564995a0a3..c0ee829ead40 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -296,6 +296,7 @@ struct btree_key_cache { struct list_head freed; struct list_head clean; struct list_head dirty; + struct shrinker shrink; size_t nr_keys; size_t nr_dirty; @@ -314,6 +315,7 @@ struct bkey_cached { unsigned long flags; u8 u64s; bool valid; + u32 
btree_trans_barrier_seq; struct bkey_cached_key key; struct rhash_head hash; -- cgit From 4d54337cdbf3baf8115bf8c1ed61bf44b3932a6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 12:22:30 -0500 Subject: bcachefs: Fix journal entry repair code When we detect bad keys in the journal that have to be dropped, the flow control was wrong - we ended up not checking the next key in that entry. Oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a6fb4fb207a2..354d57a3cd59 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -160,6 +160,8 @@ static void journal_entry_null_range(void *start, void *end) #define journal_entry_err_on(cond, c, msg, ...) \ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) +#define FSCK_DELETED_KEY 5 + static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned level, enum btree_id btree_id, @@ -172,33 +174,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in journal entry %llu offset %zi: k->u64s 0", + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data)) { + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in journal entry %llu offset %zi: extends past end of journal entry", + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data)) { + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal entry %llu offset %zi: bad format %u", + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", type, le64_to_cpu(jset->seq), (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), k->k.format)) { - le16_add_cpu(&entry->u64s, -k->k.u64s); + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (!write) @@ -212,15 +223,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal entry %llu offset %zi: %s\n%s", + mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", type, le64_to_cpu(jset->seq), (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), invalid, buf); - le16_add_cpu(&entry->u64s, -k->k.u64s); + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 
memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (write) @@ -236,15 +250,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, struct jset_entry *entry, int write) { - struct bkey_i *k; + struct bkey_i *k = entry->start; - vstruct_for_each(entry, k) { + while (k != vstruct_last(entry)) { int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, k, "key", write); - if (ret) - return ret; + if (ret == FSCK_DELETED_KEY) + continue; + + k = bkey_next(k); } return 0; -- cgit From 3dc5fcfcf51efa7dfd6ef900b06ad1fef0820664 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 13:06:28 -0500 Subject: bcachefs: Convert tracepoints to use %ps, not %pf Symbol decoding was changed from %pf to %ps Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c30fb4d74a95..09653c7ed858 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -513,7 +513,7 @@ TRACE_EVENT(transaction_restart_ip, __entry->ip = ip; ), - TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) + TP_printk("%pS %pS", (void *) __entry->caller, (void *) __entry->ip) ); DECLARE_EVENT_CLASS(transaction_restart, @@ -568,7 +568,7 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_iter_type = want_iter_type; ), - TP_printk("%pF %pF because %u have %u:%u want %u:%u", + TP_printk("%pS %pS because %u have %u:%u want %u:%u", (void *) __entry->trans_ip, (void *) __entry->caller_ip, __entry->reason, -- cgit From 0b5c9f59401e4f339c2b716d2f9210114b7885c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Nov 2020 20:52:55 -0500 Subject: bcachefs: Set preallocated transaction mem to avoid restarts this will reduce transaction restarts, from observation of tracepoints. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++++-- fs/bcachefs/btree_update_interior.c | 19 +++++++++++-------- fs/bcachefs/fs.c | 3 ++- 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 007d69656660..6eebbadcef45 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2370,8 +2370,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, */ bch2_trans_alloc_iters(trans, c); - if (expected_mem_bytes) - bch2_trans_preload_mem(trans, expected_mem_bytes); + if (expected_mem_bytes) { + expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes); + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (trans->mem) + trans->mem_bytes = expected_mem_bytes; + } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c1f822b96c48..0a83d9fdecd1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -523,6 +523,7 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b = as->b; + struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; @@ -540,14 +541,16 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. 
*/ - ret = bch2_trans_do(c, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, - btree_update_nodes_written_trans(&trans, as)); + bch2_trans_init(&trans, c, 0, 512); + ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); BUG_ON(ret && !bch2_journal_error(&c->journal)); if (b) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 231a5433577f..480469784152 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -278,7 +278,8 @@ __bch2_create(struct mnt_idmap *idmap, if (!tmpfile) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, 1024); + bch2_trans_init(&trans, c, 8, + 2048 + (!tmpfile ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); -- cgit From dbd1e8259ad2b35aafe230fdabfa387b3b54ab64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 18:20:50 -0500 Subject: bcachefs: Dont' use percpu btree_iter buf in userspace bcachefs-tools doesn't have a real percpu (per thread) implementation yet Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6eebbadcef45..0e8c8f3400de 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2343,12 +2343,15 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) unsigned new_size = BTREE_ITER_MAX; size_t iters_bytes = sizeof(struct btree_iter) * new_size; size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; - void *p; + void *p = NULL; BUG_ON(trans->used_mempool); - p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: - mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); +#ifdef __KERNEL__ + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); +#endif + if (!p) + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; @@ -2406,7 +2409,12 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->fs_usage_deltas); kfree(trans->mem); +#ifdef __KERNEL__ + /* + * Userspace doesn't have a real percpu implementation: + */ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); +#endif if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); -- cgit From e8bd002b23813f162d83a5c5c3b28832ba88f78e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 18:21:55 -0500 Subject: bcachefs: Dump journal state when the journal deadlocks Currently tracking down one of these bugs. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e7b60876d09a..32555ccffc0e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -414,8 +414,17 @@ unlock: goto retry; if (ret == -ENOSPC) { - WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full"); + if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), + "JOURNAL_RES_GET_RESERVED set but journal full")) { + char *buf; + + buf = kmalloc(4096, GFP_NOFS); + if (buf) { + bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); + } + } /* * Journal is full - can't rely on reclaim from work item due to -- cgit From 1c74cec10cc8dbc595c6de83e2344a44d278dc11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 14:16:42 -0500 Subject: bcachefs: Add more debug checks tracking down a bug where we see a btree node pointer in the wrong node Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++- fs/bcachefs/btree_update_interior.c | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 302ee3851b0d..af3b39b70957 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1442,8 +1442,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ret = validate_bset(c, b, i, sectors, WRITE, false) ?: validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); - if (ret) + if (ret) { bch2_inconsistent_error(c); + dump_stack(); + } return ret; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0a83d9fdecd1..96fa2b5a1d1e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" @@ -1021,7 +1022,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_i *insert, struct btree_node_iter *node_iter) { + struct bch_fs *c = as->c; struct bkey_packed *k; + const char *invalid; + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); + bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + dump_stack(); + } BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -1037,7 +1050,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(as->c, b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); } @@ -1366,6 +1379,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, goto split; } + btree_node_interior_verify(c, b); + bch2_btree_insert_keys_interior(as, b, iter, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -- cgit From e8c851b351d60b5cf9f2cd23c126fc200b8f5e6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Nov 2020 14:23:06 -0500 Subject: bcachefs: Add an ioctl 
for resizing journal on a device Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 14 ++++++++++++++ fs/bcachefs/chardev.c | 23 +++++++++++++++++++++++ fs/bcachefs/journal.c | 10 +++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 923001188a88..1bf834e31775 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -73,6 +73,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -329,4 +330,17 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; +/* + * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize_journal { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 0b1eca63f78e..cd5c850a41ec 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -5,6 +5,7 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "journal.h" #include "move.h" #include "replicas.h" #include "super.h" @@ -563,6 +564,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, return ret; } +static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -619,6 +640,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(data, struct bch_ioctl_data); case BCH_IOCTL_DISK_RESIZE: BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + case BCH_IOCTL_DISK_RESIZE_JOURNAL: + BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); default: return -ENOTTY; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 32555ccffc0e..b2a5e9db404e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -684,7 +684,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); + nr + sizeof(*journal_buckets) / sizeof(u64)); if (!journal_buckets) goto err; @@ -730,6 +730,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, spin_lock(&c->journal.lock); } + /* + * XXX + * For resize at runtime, we should be writing the new + * superblock before inserting into the journal array + */ + pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); @@ -765,6 +771,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ret = 0; err: + bch2_sb_resize_journal(&ca->disk_sb, + ja->nr + sizeof(*journal_buckets) / sizeof(u64)); kfree(new_bucket_seq); kfree(new_buckets); -- cgit From d8ebed7d24cdf3d4596ab5af471f5e7f749d7aab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 20:13:30 -0500 Subject: bcachefs: Add btree cache stats to sysfs Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 ++++++ fs/bcachefs/btree_cache.h | 1 + fs/bcachefs/sysfs.c | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2c8f67fabc5a..04c71f11a555 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1065,3 +1065,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.floats, stats.failed); } + +void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 8a19e60e9258..e766ef552ce7 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 89287bfe31a7..280c28a926dd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -165,6 +165,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); @@ -374,6 +375,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_btree_cache) { + bch2_btree_cache_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_btree_key_cache) { bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; @@ -550,6 +556,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, -- cgit From ed0e24c0992dffe494bdd0ea6ddf3b816c438524 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Nov 2020 13:21:59 -0500 Subject: bcachefs: Be more precise with journal error reporting We were incorrectly detecting a journal deadlock - the journal filling up - when only the journal pin fifo had filled up; if the journal pin fifo is full that just means we need to wait on reclaim. This plumbs through better error reporting so we can better discriminate in the journal_res_get path what's going on. 
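A small sketch of the reporting idea in this patch: track the precise reason as an enum internally (the values below mirror the ones added to journal_types.h in the diff) and only flatten it to an errno at the boundary, so "journal full" and "journal pin FIFO full" are no longer conflated.

#include <errno.h>
#include <stdio.h>

enum cur_entry_state {
	cur_entry_ok,
	cur_entry_blocked,
	cur_entry_journal_full,
	cur_entry_journal_pin_full,
	cur_entry_insufficient_devices,
};

/* only "insufficient devices" is fatal; the other states mean wait/retry */
static int state_to_errno(enum cur_entry_state s)
{
	if (s == cur_entry_ok)
		return 0;
	return s == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}

int main(void)
{
	printf("pin fifo full -> %d\n", state_to_errno(cur_entry_journal_pin_full));
	printf("no devices    -> %d\n", state_to_errno(cur_entry_insufficient_devices));
	return 0;
}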
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 76 ++++++++++++++++++++++--------------------- fs/bcachefs/journal_reclaim.c | 6 ++-- fs/bcachefs/journal_types.h | 8 ++++- 3 files changed, 49 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b2a5e9db404e..bb4353e673e7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -234,7 +234,7 @@ static int journal_entry_open(struct journal *j) BUG_ON(journal_entry_is_open(j)); if (j->blocked) - return -EAGAIN; + return cur_entry_blocked; if (j->cur_entry_error) return j->cur_entry_error; @@ -250,7 +250,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return -ENOSPC; + return cur_entry_journal_full; /* * Must be set before marking the journal entry as open: @@ -262,7 +262,7 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EROFS; + return cur_entry_insufficient_devices; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); @@ -375,7 +375,7 @@ retry: * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = -ENOSPC; + ret = cur_entry_journal_full; goto unlock; } @@ -398,14 +398,16 @@ retry: * there's still a previous one in flight: */ trace_journal_entry_full(c); - ret = -EAGAIN; + ret = cur_entry_blocked; } else { ret = journal_entry_open(j); } unlock: - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) + if ((ret && ret != cur_entry_insufficient_devices) && + !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); + } can_discard = j->can_discard; spin_unlock(&j->lock); @@ -413,41 +415,39 @@ unlock: if (!ret) goto retry; - if (ret == -ENOSPC) { - if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full")) { - char *buf; - - buf = kmalloc(4096, GFP_NOFS); - if (buf) { - bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); - pr_err("\n%s", buf); - kfree(buf); - } + if (WARN_ONCE(ret == cur_entry_journal_full && + !can_discard && + (flags & JOURNAL_RES_GET_RESERVED), + "JOURNAL_RES_GET_RESERVED set but journal full")) { + char *buf; + + buf = kmalloc(4096, GFP_NOFS); + if (buf) { + bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); } + } - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - trace_journal_full(c); - - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; } - ret = -EAGAIN; + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } } - return ret; + return ret == cur_entry_insufficient_devices ? 
-EROFS : -EAGAIN; } /* @@ -1072,6 +1072,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" "current entry sectors:\t%u\n" + "current entry error:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), @@ -1079,7 +1080,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, - j->cur_entry_sectors); + j->cur_entry_sectors, + j->cur_entry_error); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f9e0160074db..1cd9c11a37f0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -164,12 +164,12 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; + ret = cur_entry_insufficient_devices; goto out; } if (!fifo_free(&j->pin)) { - ret = -ENOSPC; + ret = cur_entry_journal_pin_full; goto out; } @@ -180,7 +180,7 @@ void bch2_journal_space_available(struct journal *j) clean = __journal_space_available(j, nr_devs_want, journal_space_clean); if (!discarded.next_entry) - ret = -ENOSPC; + ret = cur_entry_journal_full; overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * journal_entry_overhead(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 22ff7f8081c6..5f20653b8eb5 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -146,7 +146,13 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - int cur_entry_error; + enum { + cur_entry_ok, + cur_entry_blocked, + cur_entry_journal_full, + cur_entry_journal_pin_full, + cur_entry_insufficient_devices, + } cur_entry_error; union journal_preres_state prereserved; -- cgit From 14ba3706b3a8b5d243e0f250e54baeaecfbd8289 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Nov 2020 14:09:33 -0500 Subject: bcachefs: Add a kmem_cache for btree_key_cache objects We allocate a lot of these, and we're seeing sporading OOMs - this will help with tracking those down. 
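What the patch introduces is the standard dedicated-slab pattern; a minimal sketch with a hypothetical object type (the real cache holds struct bkey_cached). Giving a hot allocation its own kmem_cache makes its footprint show up as a separate line in /proc/slabinfo, which is what helps attribute the OOMs:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>

struct foo {				/* hypothetical stand-in object */
	u64 seq;
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
	foo_cache = KMEM_CACHE(foo, 0);	/* slab named after the struct */
	return foo_cache ? 0 : -ENOMEM;
}

static struct foo *foo_alloc(void)
{
	return kmem_cache_alloc(foo_cache, GFP_NOFS | __GFP_ZERO);
}

static void foo_free(struct foo *f)
{
	kmem_cache_free(foo_cache, f);
}

static void foo_cache_exit(void)
{
	kmem_cache_destroy(foo_cache);
}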
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 30 +++++++++++++++++++++++++----- fs/bcachefs/btree_key_cache.h | 3 +++ fs/bcachefs/super.c | 2 ++ 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 71d5bfa4caab..441cdc88b940 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -12,6 +12,8 @@ #include +static struct kmem_cache *bch2_key_cache; + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -104,7 +106,7 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - ck = kzalloc(sizeof(*ck), GFP_NOFS); + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (!ck) return NULL; @@ -516,7 +518,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) { list_del(&ck->list); - kfree(ck); + kmem_cache_free(bch2_key_cache, ck); freed++; } @@ -571,15 +573,18 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) bch2_journal_preres_put(&c->journal, &ck->res); kfree(ck->k); - kfree(ck); + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); bc->nr_keys--; } BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); BUG_ON(bc->nr_keys); - list_for_each_entry_safe(ck, n, &bc->freed, list) - kfree(ck); + list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + } mutex_unlock(&bc->lock); rhashtable_destroy(&bc->table); @@ -627,3 +632,18 @@ void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache * } mutex_unlock(&c->lock); } + +void bch2_btree_key_cache_exit(void) +{ + if (bch2_key_cache) + kmem_cache_destroy(bch2_key_cache); +} + +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + if (!bch2_key_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index d448264abcc8..e64a8e9c726f 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -25,4 +25,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *); void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 61b7e750037c..12ce4a627746 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -2020,6 +2020,7 @@ static void bcachefs_exit(void) bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); + bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } @@ -2029,6 +2030,7 @@ static int __init bcachefs_init(void) bch2_bkey_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) -- cgit From 4e92cbb64287fcacc05f9a4fc1f9f390d5f58574 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 11:53:38 -0500 Subject: bcachefs: More debug code improvements Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c index 96fa2b5a1d1e..3f6ab09d100d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -49,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - BUG_ON(bkey_cmp(next_node, bp.v->min_key)); + if (bkey_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + panic("expected next min_key %llu:%llu got %llu:%llu\n", + next_node.inode, + next_node.offset, + bp.v->min_key.inode, + bp.v->min_key.offset); + } bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + + if (bkey_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + panic("expected end %llu:%llu got %llu:%llu\n", + b->key.k.p.inode, + b->key.k.p.offset, + k.k->p.inode, + k.k->p.offset); + } break; } -- cgit From 125907203cc902d6fd430950f700a44124e208d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 15:38:27 -0500 Subject: bcachefs: Improve btree key cache shrinker The shrinker should start scanning for entries that can be freed oldest to newest - this way, we can avoid scanning a lot of entries that are too new to be freed. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 95 +++++++++++++++++++++---------------------- fs/bcachefs/btree_types.h | 4 +- 2 files changed, 49 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 441cdc88b940..836bb23fe3bc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -78,10 +78,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move(&ck->list, &bc->freed); + list_move_tail(&ck->list, &bc->freed); + bc->nr_freed++; kfree(ck->k); ck->k = NULL; @@ -96,9 +99,21 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) + list_for_each_entry_reverse(ck, &c->freed, list) + if (bkey_cached_lock_for_evict(ck)) { + c->nr_freed--; return ck; + } + + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); + six_lock_init(&ck->c.lock); + lockdep_set_novalidate_class(&ck->c.lock); + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + return ck; + } list_for_each_entry(ck, &c->clean, list) if (bkey_cached_lock_for_evict(ck)) { @@ -106,17 +121,7 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); - lockdep_set_novalidate_class(&ck->c.lock); - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - - return ck; + return NULL; } static struct bkey_cached * @@ -135,8 +140,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->key.btree_id = btree_id; ck->key.pos = pos; ck->valid = false; - - BUG_ON(ck->flags); + ck->flags = 1U << BKEY_CACHED_ACCESSED; if (rhashtable_lookup_insert_fast(&c->table, &ck->hash, @@ -293,6 +297,9 @@ fill: goto err; } + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, 
&ck->flags); + iter->uptodate = BTREE_ITER_NEED_PEEK; bch2_btree_iter_downgrade(iter); return ret; @@ -512,28 +519,34 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, flags = memalloc_nofs_save(); + /* + * Newest freed entries are at the end of the list - once we hit one + * that's too new to be freed, we can bail out: + */ list_for_each_entry_safe(ck, t, &bc->freed, list) { - scanned++; - - if (poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) { - list_del(&ck->list); - kmem_cache_free(bch2_key_cache, ck); - freed++; - } + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; - if (scanned >= nr) - goto out; + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + bc->nr_freed--; + scanned++; + freed++; } - list_for_each_entry_safe(ck, t, &bc->clean, list) { - scanned++; + if (scanned >= nr) + goto out; - if (bkey_cached_lock_for_evict(ck)) { + list_for_each_entry_safe(ck, t, &bc->clean, list) { + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); } + scanned++; if (scanned >= nr) { if (&t->list != &bc->clean) list_move_tail(&bc->clean, &t->list); @@ -602,6 +615,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + bc->shrink.seeks = 1; bc->shrink.count_objects = bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; @@ -611,26 +625,9 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - size_t i; - - mutex_lock(&c->lock); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); - - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - pr_buf(out, "%s:", - bch2_btree_ids[ck->key.btree_id]); - bch2_bpos_to_text(out, ck->key.pos); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - pr_buf(out, " journal seq %llu", ck->journal.seq); - pr_buf(out, "\n"); - } - } - mutex_unlock(&c->lock); + pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); + pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); + pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c0ee829ead40..bf2fc979a2eb 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -298,6 +298,7 @@ struct btree_key_cache { struct list_head dirty; struct shrinker shrink; + size_t nr_freed; size_t nr_keys; size_t nr_dirty; }; @@ -307,7 +308,8 @@ struct bkey_cached_key { struct bpos pos; } __attribute__((packed, aligned(4))); -#define BKEY_CACHED_DIRTY 0 +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 struct bkey_cached { struct btree_bkey_cached_common c; -- cgit From 8a92e545597a3eaca80f2df14eb9a783d96c8445 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 19:54:40 -0500 Subject: bcachefs: Ensure journal reclaim runs when btree key cache is too dirty Ensuring the key cache isn't too dirty is critical for ensuring that the shrinker can reclaim memory. 
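The flush threshold added in btree_key_cache.h is plain arithmetic and can be modelled standalone; a sketch using this patch's constants (everything else hypothetical) of how the number of keys reclaim should flush is derived from the dirty count:

#include <stddef.h>
#include <stdio.h>

/* Allow a fixed slack of 1024 dirty entries plus 3/4 of the total key
 * count; anything dirty beyond that is what journal reclaim must flush
 * so the shrinker can free memory again. */
static size_t keys_need_flush(size_t nr_dirty, size_t nr_keys)
{
	size_t max_dirty = 1024 + (nr_keys * 3) / 4;

	return nr_dirty > max_dirty ? nr_dirty - max_dirty : 0;
}

int main(void)
{
	printf("%zu\n", keys_need_flush(10000, 4096));	/* 5904: kick reclaim */
	printf("%zu\n", keys_need_flush(500, 4096));	/* 0: nothing to do */
	return 0;
}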
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 8 ++++++ fs/bcachefs/btree_key_cache.h | 9 +++++++ fs/bcachefs/journal_reclaim.c | 53 ++++++++++++++++++++++++++------------ fs/bcachefs/trace.h | 59 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 836bb23fe3bc..99e03852b814 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -461,6 +461,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) iter->l[0].b; + bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -485,11 +486,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, set_bit(BKEY_CACHED_DIRTY, &ck->flags); c->btree_key_cache.nr_dirty++; + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; + mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, &ck->journal, btree_key_cache_journal_flush); + + if (kick_reclaim) + mod_delayed_work(c->journal_reclaim_wq, &c->journal.reclaim_work, 0); return true; } diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e64a8e9c726f..7723a2178430 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,6 +1,15 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t max_dirty = 1024 + (nr_keys * 3) / 4; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1cd9c11a37f0..7f8ab13256c8 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +#include "trace.h" /* Free space calculations: */ @@ -432,7 +434,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) list_move(&ret->list, &pin_list->flushed); BUG_ON(j->flush_in_progress); j->flush_in_progress = ret; - j->last_flushed = jiffies; } spin_unlock(&j->lock); @@ -441,17 +442,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) } /* returns true if we did work */ -static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned min_nr) +static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { struct journal_entry_pin *pin; - bool ret = false; - u64 seq; + u64 seq, ret = 0; lockdep_assert_held(&j->reclaim_lock); - while ((pin = journal_get_next_pin(j, min_nr - ? U64_MAX : seq_to_flush, &seq))) { + while (1) { + cond_resched(); + + j->last_flushed = jiffies; + + pin = journal_get_next_pin(j, min_nr + ? 
U64_MAX : seq_to_flush, &seq); + if (!pin) + break; + if (min_nr) min_nr--; @@ -460,7 +468,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, BUG_ON(j->flush_in_progress != pin); j->flush_in_progress = NULL; wake_up(&j->pin_flush_wait); - ret = true; + ret++; } return ret; @@ -527,8 +535,8 @@ static u64 journal_seq_to_flush(struct journal *j) void bch2_journal_reclaim(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned min_nr = 0; - u64 seq_to_flush = 0; + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; lockdep_assert_held(&j->reclaim_lock); @@ -549,12 +557,25 @@ void bch2_journal_reclaim(struct journal *j) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; - if ((atomic_read(&c->btree_cache.dirty) * 4 > - c->btree_cache.used * 3) || - (c->btree_key_cache.nr_dirty * 4 > - c->btree_key_cache.nr_keys)) + if (atomic_read(&c->btree_cache.dirty) * 4 > + c->btree_cache.used * 3) min_nr = 1; - } while (journal_flush_pins(j, seq_to_flush, min_nr)); + + min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + c->btree_key_cache.nr_dirty, + c->btree_key_cache.nr_keys); + + nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); + } while (min_nr); + + trace_journal_reclaim_finish(c, nr_flushed); if (!bch2_journal_error(j)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, @@ -582,7 +603,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0); + *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; spin_lock(&j->lock); /* diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 09653c7ed858..2afc09ad64ea 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write, TP_ARGS(bio) ); +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, u64 min_nr, + u64 prereserved, u64 prereserved_total, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, min_nr, prereserved, prereserved_total, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, min_nr ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->min_nr = min_nr; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; + __entry->btree_cache_dirty = btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + __entry->uuid, + __entry->min_nr, + __entry->prereserved, + __entry->prereserved_total, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + 
__field(u64, nr_flushed ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) +); + /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, -- cgit From b3c2a06b7d89eb06454f31c4b396e37fbe59374c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 13:24:51 -0500 Subject: bcachefs: Simplify transaction commit error path The transaction restart path traverses all iterators, we don't need to do it here. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 11 ----------- fs/bcachefs/trace.h | 5 ----- 2 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4ab12a9db2f4..08d08d2f1ea3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -657,17 +657,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, break; } - if (ret == -EINTR) { - int ret2 = bch2_btree_iter_traverse_all(trans); - - if (ret2) { - trace_trans_restart_traverse(trans->ip); - return ret2; - } - - trace_trans_restart_atomic(trans->ip); - } - return ret; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2afc09ad64ea..9706b6a3b1b2 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -716,11 +716,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq), -- cgit From 9d4582ffdb286d3513ee9ebf7961b1741d8cbc0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 21:15:39 -0500 Subject: bcachefs: Journal reclaim requires memalloc_noreclaim_save() Memory reclaim requires journal reclaim to make forward progress - it's what cleans our caches - thus, while we're in journal reclaim or holding the journal reclaim lock we can't recurse into memory reclaim. 
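The guard being added is the usual PF_MEMALLOC scope; a minimal sketch of the pattern with a hypothetical wrapper (the real call sites are in bch2_journal_reclaim()): any allocation made while holding the lock that reclaim itself depends on must skip direct reclaim, so the whole critical section runs between memalloc_noreclaim_save() and restore():

#include <linux/mutex.h>
#include <linux/sched/mm.h>

static void reclaim_critical_section(struct mutex *reclaim_lock)
{
	unsigned int flags;

	mutex_lock(reclaim_lock);
	flags = memalloc_noreclaim_save();	/* sets PF_MEMALLOC for this task */

	/* ... flush journal pins; GFP allocations here will not recurse
	 * into direct reclaim and so cannot re-enter this path ... */

	memalloc_noreclaim_restore(flags);
	mutex_unlock(reclaim_lock);
}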
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 7f8ab13256c8..9c67597d1ec6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -9,6 +9,8 @@ #include "super.h" #include "trace.h" +#include + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -537,8 +539,16 @@ void bch2_journal_reclaim(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush, nr_flushed = 0; size_t min_nr; + unsigned flags; + /* + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: + */ lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); do { bch2_journal_do_discards(j); @@ -575,6 +585,8 @@ void bch2_journal_reclaim(struct journal *j) nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); } while (min_nr); + memalloc_noreclaim_restore(flags); + trace_journal_reclaim_finish(c, nr_flushed); if (!bch2_journal_error(j)) -- cgit From d5425a3b220a8b94ae2dd3c74af001a6b1216651 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 21:40:03 -0500 Subject: bcachefs: Throttle updates when btree key cache is too dirty This is needed to ensure we don't deadlock because journal reclaim and thus memory reclaim isn't making forward progress. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 11 ++++++++++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 19 +++++++++++++++++++ fs/bcachefs/trace.h | 5 +++++ 4 files changed, 35 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 7723a2178430..d7d31a0662c3 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -5,11 +5,20 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t max_dirty = 1024 + (nr_keys * 3) / 4; + size_t max_dirty = 4096 + nr_keys / 2; return max_t(ssize_t, 0, nr_dirty - max_dirty); } +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +} + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bf2fc979a2eb..d861d94242a4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -649,6 +649,7 @@ enum btree_insert_ret { BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 08d08d2f1ea3..4504d7740a57 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); + if (!test_bit(BKEY_CACHED_DIRTY, 
&ck->flags) && + bch2_btree_key_cache_must_wait(trans->c)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; @@ -652,6 +656,21 @@ int bch2_trans_commit_error(struct btree_trans *trans, trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + + while (bch2_btree_key_cache_must_wait(c)) { + mutex_lock(&c->journal.reclaim_lock); + bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); + } + + if (bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_journal_reclaim(trans->ip); + ret = -EINTR; + break; default: BUG_ON(ret >= 0); break; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9706b6a3b1b2..babb07e3acc4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -681,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_PROTO(unsigned long ip), TP_ARGS(ip) -- cgit From b7a9bbfc1b85730ddf9905289b1a148ea1aa5ade Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Nov 2020 20:55:33 -0500 Subject: bcachefs: Move journal reclaim to a kthread This is to make tracing easier. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/chardev.c | 3 +- fs/bcachefs/journal.c | 16 ++++++-- fs/bcachefs/journal_io.c | 4 +- fs/bcachefs/journal_reclaim.c | 86 ++++++++++++++++++++++++++++++++++++------ fs/bcachefs/journal_reclaim.h | 15 +++++++- fs/bcachefs/journal_types.h | 6 ++- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/super.c | 16 ++++---- 13 files changed, 122 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8f0c1f378b77..078968f30175 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1409,7 +1409,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); + "bch-alloc/%s", ca->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d77d1fc1cfed..4fe3f9257752 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -650,7 +650,6 @@ struct bch_fs { struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; - struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index da0ad8f50775..df018a2e463e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1427,7 +1427,7 @@ int bch2_gc_thread_start(struct bch_fs *c) BUG_ON(c->gc_thread); - p = kthread_create(bch2_gc_thread, c, "bch_gc"); + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 99e03852b814..d1f226e66158 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -497,7 +497,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, &ck->journal, btree_key_cache_journal_flush); if (kick_reclaim) - mod_delayed_work(c->journal_reclaim_wq, &c->journal.reclaim_work, 0); + journal_reclaim_kick(&c->journal); return true; } diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cd5c850a41ec..7c77fd09c834 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -341,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); if (IS_ERR(ctx->thread)) { ret = PTR_ERR(ctx->thread); goto err; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bb4353e673e7..2c6aa36cc025 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -225,11 +225,14 @@ static bool journal_entry_close(struct journal *j) */ static int journal_entry_open(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; int u64s; u64 v; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -480,8 +483,10 @@ static bool journal_preres_available(struct journal *j, { bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } return ret; } @@ -888,7 +893,7 @@ void bch2_fs_journal_stop(struct journal *j) j->last_empty_seq + 1 != journal_cur_seq(j))); cancel_delayed_work_sync(&j->write_work); - cancel_delayed_work_sync(&j->reclaim_work); + bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -1019,7 +1024,6 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); @@ -1071,6 +1075,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" "current entry error:\t%u\n" "current entry:\t\t", @@ -1080,6 +1086,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, + j->nr_direct_reclaim, + j->nr_background_reclaim, j->cur_entry_sectors, 
j->cur_entry_error); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 354d57a3cd59..79d5d892728f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -993,7 +993,7 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); + journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1044,6 +1044,8 @@ void bch2_journal_write(struct closure *cl) unsigned i, sectors, bytes, u64s; int ret; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9c67597d1ec6..9f0d2e6aa4e3 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -9,6 +9,7 @@ #include "super.h" #include "trace.h" +#include #include /* Free space calculations: */ @@ -534,9 +535,10 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -void bch2_journal_reclaim(struct journal *j) +static void __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush, nr_flushed = 0; size_t min_nr; unsigned flags; @@ -551,6 +553,9 @@ void bch2_journal_reclaim(struct journal *j) flags = memalloc_noreclaim_save(); do { + if (kthread && kthread_should_stop()) + break; + bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -582,26 +587,83 @@ void bch2_journal_reclaim(struct journal *j) c->btree_key_cache.nr_dirty, c->btree_key_cache.nr_keys); - nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); + nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); } while (min_nr); memalloc_noreclaim_restore(flags); +} + +void bch2_journal_reclaim(struct journal *j) +{ + __bch2_journal_reclaim(j, true); +} + +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + unsigned long next; + + while (!kthread_should_stop()) { + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + + next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - trace_journal_reclaim_finish(c, nr_flushed); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + if (time_after_eq(jiffies, next)) + break; + schedule_timeout(next - jiffies); - if (!bch2_journal_error(j)) - queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); + } + __set_current_state(TASK_RUNNING); + } + + return 0; } -void bch2_journal_reclaim_work(struct work_struct *work) +void bch2_journal_reclaim_stop(struct journal *j) { - struct journal *j = container_of(to_delayed_work(work), - struct journal, reclaim_work); + struct task_struct *p = j->reclaim_thread; - mutex_lock(&j->reclaim_lock); - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_journal_reclaim_start(struct journal *j) 
+{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; } static int journal_flush_done(struct journal *j, u64 seq_to_flush, diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 8128907a7623..bae2c9210db8 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -10,6 +10,17 @@ enum journal_space_from { journal_space_clean, }; +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + if (p && !j->reclaim_kicked) { + j->reclaim_kicked = true; + if (p) + wake_up_process(p); + } +} + unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); @@ -55,7 +66,9 @@ void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); void bch2_journal_reclaim(struct journal *); -void bch2_journal_reclaim_work(struct work_struct *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); bool bch2_journal_flush_pins(struct journal *, u64); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 5f20653b8eb5..6312a7f06d87 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -216,8 +216,12 @@ struct journal { struct write_point wp; spinlock_t err_lock; - struct delayed_work reclaim_work; struct mutex reclaim_lock; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; wait_queue_head_t pin_flush_wait; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e858e2a35f8d..a9775cc84f66 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cce6f58fe609..f9a12dd797a5 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c) if (c->opts.nochanges) return 0; - p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 12ce4a627746..98a875e08e9a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -266,7 +265,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - cancel_delayed_work_sync(&c->journal.reclaim_work); + BUG_ON(c->journal.reclaim_thread); return; } @@ -424,6 +423,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err(c, "error starting journal reclaim: %i", 
ret); + return ret; + } + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -432,9 +437,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); - - queue_delayed_work(c->journal_reclaim_wq, - &c->journal.reclaim_work, 0); return 0; err: __bch2_fs_read_only(c); @@ -503,8 +505,6 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->journal_reclaim_wq) - destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -758,8 +758,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -- cgit From f3020550777af9a66737334db94d96f8c3253e96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 16:12:39 -0500 Subject: bcachefs: Fix an rcu splat bch2_bucket_alloc() requires rcu_read_lock() to be held. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2c6aa36cc025..27fcb378790b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -682,16 +682,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (nr <= ja->nr) return 0; - ret = -ENOMEM; new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) + if (!new_buckets || !new_bucket_seq) { + ret = -ENOMEM; goto err; + } journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) + if (!journal_buckets) { + ret = -ENOSPC; goto err; + } /* * We may be called from the device add path, before the new device has @@ -720,8 +723,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } } else { + rcu_read_lock(); ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); + rcu_read_unlock(); if (IS_ERR(ob)) { ret = cl ? -EAGAIN : -ENOSPC; goto err; @@ -773,8 +778,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!new_fs) bch2_open_bucket_put(c, ob); } - - ret = 0; err: bch2_sb_resize_journal(&ca->disk_sb, ja->nr + sizeof(*journal_buckets) / sizeof(u64)); -- cgit From 6584e84a978ed710ee295201647b7f05dbbc56ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 21:21:28 -0500 Subject: bcachefs: Don't use bkey cache for inode update in fsck fsck doesn't know about the btree key cache, and non-cached iterators aren't cache coherent (yet?) 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/inode.c | 14 ++++++++++---- fs/bcachefs/inode.h | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 480469784152..a3810493826b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1262,7 +1262,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino); + bch2_inode_rm(c, inode->v.i_ino, true); } } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0c5035270846..09ce6c29b88c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1254,7 +1254,7 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum); + ret = bch2_inode_rm(c, u.bi_inum, false); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f00778d78271..b1f420776d9a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -542,7 +542,7 @@ found_slot: return ret; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) { struct btree_trans trans; struct btree_iter *iter; @@ -576,9 +576,15 @@ retry: bi_generation = 0; - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_cached(iter); + if (cached) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_cached(iter); + } else { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + } ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index ef7e885dce0c..dbdfcf63d079 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); -int bch2_inode_rm(struct bch_fs *, u64); +int bch2_inode_rm(struct bch_fs *, u64, bool); int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, struct bch_inode_unpacked *); -- cgit From 087c201943ff4ec5150b8c3e2e5095b8add01f19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 21:28:55 -0500 Subject: bcachefs: bch2_btree_delete_range_trans() This helps reduce stack usage by avoiding multiple btree_trans on the stack. 
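The shape of the refactor, sketched generically with hypothetical names: expose the core operation as a helper that runs inside a caller-supplied context and keep the old entry point as a thin wrapper that creates one, so a caller composing several deletions (as bch2_inode_rm() now does) keeps a single transaction on its stack:

/* Sketch only; not the bcachefs API. */
struct ctx {
	int dummy;			/* transaction state would live here */
};

static int delete_range_with_ctx(struct ctx *c, int start, int end)
{
	(void)c; (void)start; (void)end;	/* work happens in the caller's context */
	return 0;
}

static int delete_range(int start, int end)
{
	struct ctx c = { 0 };			/* wrapper owns a short-lived context */

	return delete_range_with_ctx(&c, start, end);
}

static int remove_object(void)
{
	struct ctx c = { 0 };

	/* one shared context for all three deletions, not one per call: */
	return delete_range_with_ctx(&c,  0, 10) ?:
	       delete_range_with_ctx(&c, 10, 20) ?:
	       delete_range_with_ctx(&c, 20, 30);
}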
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 4 +-- fs/bcachefs/btree_update_leaf.c | 66 +++++++++++++++++++---------------------- fs/bcachefs/inode.c | 20 ++++++------- 3 files changed, 42 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e0b1bde37484..adb07043cbb3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); -int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *); +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4504d7740a57..44d1d21dd608 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1094,13 +1094,32 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at_range(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) { + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -1111,6 +1130,10 @@ retry: bkey_init(&delete.k); + /* + * This could probably be more efficient for extents: + */ + /* * For extents, iter.pos won't necessarily be the same as * bkey_start_pos(k.k) (for non extents they always will be the @@ -1150,22 +1173,8 @@ retry: goto retry; } + bch2_trans_iter_put(trans, iter); return ret; - -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, iter, &k, 0); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); } /* @@ -1177,21 +1186,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - /* - * XXX: whether we need mem/more iters depends on whether this btree id - * has triggers - */ - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - - ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); - ret = bch2_trans_exit(&trans) ?: ret; - - BUG_ON(ret == -EINTR); - return ret; + return bch2_trans_do(c, NULL, journal_seq, 0, + bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 
b1f420776d9a..358e39361e56 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -553,6 +553,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) u64 bi_generation; int ret; + bch2_trans_init(&trans, c, 0, 0); + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -561,16 +563,14 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_XATTRS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - start, end, NULL); + ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, + start, end, NULL); if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); + goto err; retry: bch2_trans_begin(&trans); @@ -590,7 +590,7 @@ retry: if (ret) goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, "inode %llu not found when deleting", inode_nr); -- cgit From 7b489207703833f705763d14a77d29edf97e5a12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 22:51:04 -0500 Subject: bcachefs: Delete dead code The interior btree node update path has changed, this is no longer needed. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 --- fs/bcachefs/btree_update_interior.h | 1 - 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3f6ab09d100d..240794525a90 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1384,9 +1384,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (as->must_rewrite) - goto split; - bch2_btree_node_lock_for_insert(c, b, iter); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 41854fc345d2..45d212730fd7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -47,7 +47,6 @@ struct btree_update { BTREE_INTERIOR_UPDATING_AS, } mode; - unsigned must_rewrite:1; unsigned nodes_written:1; enum btree_id btree_id; -- cgit From 33b3b1dc0f685b4542a631a0ca053380613829a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2020 19:27:57 -0500 Subject: bcachefs: Optimize bch2_journal_flush_seq_async() Avoid taking the journal lock if we don't have to. 
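The change follows the common unlocked-fast-path / recheck-under-lock pattern; a sketch with hypothetical names (READ_ONCE() added here for clarity; the patch reads the fields directly): answer the already-errored and already-on-disk cases without the lock, then re-test once the lock is actually held:

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct jstate {
	spinlock_t	lock;
	u64		err_seq;
	u64		seq_ondisk;
};

static int flush_seq_fastpath(struct jstate *j, u64 seq)
{
	if (seq <= READ_ONCE(j->err_seq))
		return -EIO;		/* journal already failed */
	if (seq <= READ_ONCE(j->seq_ondisk))
		return 1;		/* already durable, nothing to wait for */

	spin_lock(&j->lock);
	/* Recheck under the lock: state may have advanced while we raced. */
	if (seq <= j->err_seq) {
		spin_unlock(&j->lock);
		return -EIO;
	}
	/* ... slow path: arrange to be woken when seq reaches the device ... */
	spin_unlock(&j->lock);
	return 0;
}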
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 27fcb378790b..759a04346cde 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -547,12 +547,20 @@ out: * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) + struct closure *parent) { struct journal_buf *buf; int ret = 0; + if (seq <= j->err_seq) + return -EIO; + + if (seq <= j->seq_ondisk) + return 1; + spin_lock(&j->lock); + + /* Recheck under lock: */ if (seq <= j->err_seq) { ret = -EIO; goto out; -- cgit From 89931472c20bee93c6bcb0df7b057edda6473381 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Nov 2020 16:00:47 -0500 Subject: bcachefs: Fix for __readahead_batch getting partial batch We were incorrectly ignoring the return value of __readahead_batch, leading to a null ptr deref in __bch2_page_state_create(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1afdd775ffb3..9c3b11f414b5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -667,7 +667,7 @@ static int readpages_iter_init(struct readpages_iter *iter, if (!iter->pages) return -ENOMEM; - __readahead_batch(ractl, iter->pages, nr_pages); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); for (i = 0; i < nr_pages; i++) { __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); put_page(iter->pages[i]); -- cgit From 5731cf01567da4f354bbff4a040b53f3f86328ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Nov 2020 17:09:13 -0500 Subject: bcachefs: Fix journal reclaim spinning in recovery We can't run journal reclaim until we've finished replaying updates to interior btree nodes - the check for this was in the wrong place though, leading to journal reclaim spinning before it was allowed to proceed. 
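The fix amounts to having the reclaim thread sleep on the "started" condition instead of polling it; a rough sketch with hypothetical names (the real thread waits, freezably, for JOURNAL_RECLAIM_STARTED to be set during recovery):

#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/wait.h>

#define RECLAIM_STARTED	0		/* hypothetical flag bit */

struct reclaimer {
	unsigned long		flags;
	wait_queue_head_t	wait;
};

static int reclaim_thread(void *arg)
{
	struct reclaimer *r = arg;

	/* Block until recovery flips the bit - don't spin on it: */
	wait_event_freezable(r->wait, test_bit(RECLAIM_STARTED, &r->flags));

	while (!kthread_should_stop()) {
		/* ... one reclaim pass, then sleep until kicked ... */
	}
	return 0;
}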
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 5 +++++ fs/bcachefs/recovery.c | 1 + 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9f0d2e6aa4e3..c20f6de34730 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -609,6 +609,10 @@ static int bch2_journal_reclaim_thread(void *arg) struct journal *j = arg; unsigned long next; + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + while (!kthread_should_stop()) { j->reclaim_kicked = false; @@ -627,6 +631,7 @@ static int bch2_journal_reclaim_thread(void *arg) if (time_after_eq(jiffies, next)) break; schedule_timeout(next - jiffies); + try_to_freeze(); } __set_current_state(TASK_RUNNING); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6750063663b5..0b3521c9cc19 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -616,6 +616,7 @@ static int bch2_journal_replay(struct bch_fs *c, */ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); + journal_reclaim_kick(j); j->replay_journal_seq = seq; -- cgit From d0022290b81c15db3c81936cdb254663598ef35f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Nov 2020 23:48:20 -0500 Subject: bcachefs: Fix error in filesystem initialization The rhashtable code doesn't like when we destroy an rhashtable that was never initialized Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 11 +++++++++-- fs/bcachefs/btree_types.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d1f226e66158..ae3d5880f84e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -608,7 +608,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } mutex_unlock(&bc->lock); - rhashtable_destroy(&bc->table); + if (bc->table_init_done) + rhashtable_destroy(&bc->table); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -622,13 +623,19 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + int ret; bc->shrink.seeks = 1; bc->shrink.count_objects = bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; - return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name) ?: + ret = register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name) ?: rhashtable_init(&bc->table, &bch2_btree_key_cache_params); + if (ret) + return ret; + + bc->table_init_done = true; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d861d94242a4..28f0d7b85ad6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -293,6 +293,7 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) struct btree_key_cache { struct mutex lock; struct rhashtable table; + bool table_init_done; struct list_head freed; struct list_head clean; struct list_head dirty; -- cgit From d7b04163c2e6fbfb3befc047586c4c85069e8db3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Nov 2020 02:07:38 -0500 Subject: bcachefs: Change a BUG_ON() to a fatal error In the btree key 
cache code, failing to flush a dirty key is a serious error, but it doesn't need to be a BUG_ON(), we can stop the filesystem instead. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ae3d5880f84e..ccb5f3cc7160 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -368,10 +368,11 @@ err: if (ret == -EINTR) goto retry; - BUG_ON(ret && !bch2_journal_error(j)); - - if (ret) + if (ret) { + bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + "error flushing key cache: %i", ret); goto out; + } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); -- cgit From 04e23a566fa9f41228408a7829b4462fb62e42c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Nov 2020 02:08:14 -0500 Subject: bcachefs: Ensure we always have a journal pin in interior update path For the new nodes an interior btree update makes reachable, updates to those nodes may be journalled after the btree update starts but before the transactional part - where we make those nodes reachable. Those updates need to be kept in the journal until after the btree update completes, hence we should always get a journal pin at the start of the interior update. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 240794525a90..ac91006f3c69 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -544,6 +544,8 @@ static void btree_update_nodes_written(struct btree_update *as) unsigned i; int ret; + BUG_ON(!journal_pin_active(&as->journal)); + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have @@ -699,17 +701,7 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - /* - * When we write a new btree root, we have to drop our journal pin - * _before_ the new nodes are technically reachable; see - * btree_update_nodes_written(). - * - * This goes for journal pins that are recursively blocked on us - so, - * just transfer the journal pin to the new interior update so - * btree_update_nodes_written() can drop it. - */ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); - bch2_journal_pin_drop(&c->journal, &child->journal); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -956,6 +948,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) goto err; + bch2_journal_pin_add(&c->journal, + atomic64_read(&c->journal.seq), + &as->journal, NULL); + mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); -- cgit From 2e9f3b88858e4a36657e850edc58db0e2954a141 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2020 11:40:59 -0500 Subject: bcachefs: Use BTREE_ITER_PREFETCH in journal+btree iter Introducing the journal+btree iter introduced a regression where we stopped using BTREE_ITER_PREFETCH - this is a performance regression on rotating disks. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0b3521c9cc19..a837d9eb0f6d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -187,7 +187,7 @@ void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, { memset(iter, 0, sizeof(*iter)); - iter->btree = bch2_trans_get_iter(trans, id, pos, 0); + iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); } -- cgit From 34c1cd6a59bf1a72bb1b672df35f9d819c81bfde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2020 11:42:23 -0500 Subject: bcachefs: Fix for fsck spuriously finding duplicate extents Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 09ce6c29b88c..7449819d8eac 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -485,7 +485,11 @@ static int check_extents(struct bch_fs *c) BTREE_ITER_INTENT); retry: for_each_btree_key_continue(iter, 0, k, ret) { - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + /* + * due to retry errors we might see the same extent twice: + */ + if (bkey_cmp(prev.k->k.p, k.k->p) && + bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; char buf2[200]; -- cgit From 231db03c571ba3065a32999f4f76a5482c6557e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2020 11:48:08 -0500 Subject: bcachefs: Journal pin refactoring This deletes some duplicated code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 64 +++++-------------------------------------- fs/bcachefs/journal_reclaim.h | 28 ++++++++++++------- 2 files changed, 25 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c20f6de34730..1dabad618870 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -320,11 +320,14 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + struct journal_entry_pin_list *pin_list; + + spin_lock(&j->lock); + pin_list = journal_seq_pin(j, seq); __journal_pin_drop(j, pin); @@ -335,45 +338,6 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, pin->flush = flush_fn; list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); -} - -void __bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - spin_unlock(&j->lock); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (journal_pin_active(pin) && pin->seq < seq) - return; - - spin_lock(&j->lock); - - if (pin->seq != seq) { - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - } else { - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, seq); - - /* - * If the pin is already pinning the right sequence number, it - * still might've already been flushed: - */ - list_move(&pin->list, &pin_list->list); - } - spin_unlock(&j->lock); /* @@ -383,20 +347,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq, journal_wake(j); } -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - if (journal_pin_active(src) && - (!journal_pin_active(dst) || src->seq < dst->seq)) - bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); - - spin_unlock(&j->lock); -} - /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running */ diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index bae2c9210db8..e25355042e6e 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -42,25 +42,33 @@ journal_seq_pin(struct journal *j, u64 seq) void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); +void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - __bch2_journal_pin_add(j, seq, pin, flush_fn); + bch2_journal_pin_set(j, seq, pin, flush_fn); } -void bch2_journal_pin_update(struct journal *, u64, - struct journal_entry_pin *, - journal_pin_flush_fn); +static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) +{ + if (journal_pin_active(src)) + bch2_journal_pin_add(j, src->seq, dst, flush_fn); +} -void bch2_journal_pin_copy(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); +static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -- cgit From ec3d21a9f25d495e0b6042a25f27cc711390195b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2020 12:23:55 -0500 Subject: bcachefs: Add error handling to unit & perf tests This way, these tests can be used with tests that inject IO errors and shut down the filesystem. 
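The conversion follows one pattern throughout: tests that used to BUG_ON() on failure now log the error and return it to the caller. A minimal sketch of the shape (do_test_op() is a hypothetical stand-in for the btree operation under test):

    static int test_example(struct bch_fs *c, u64 nr)
    {
        struct btree_trans trans;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        ret = do_test_op(&trans, nr);
        if (ret) {
            bch_err(c, "error in test_example: %i", ret);
            goto err;
        }
    err:
        bch2_trans_exit(&trans);
        return ret;
    }

bch2_btree_perf_test() then collects the per-thread return value, and the sysfs perf_test store path reports it to userspace instead of silently succeeding.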
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 7 +- fs/bcachefs/tests.c | 249 ++++++++++++++++++++++++++++++++++------------------ fs/bcachefs/tests.h | 2 +- 3 files changed, 170 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 280c28a926dd..afe0238d0cc0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -503,10 +503,11 @@ STORE(bch2_fs) if (threads_str && !(ret = kstrtouint(threads_str, 10, &threads)) && !(ret = bch2_strtoull_h(nr_str, &nr))) - bch2_btree_perf_test(c, test, nr, threads); - else - size = ret; + ret = bch2_btree_perf_test(c, test, nr, threads); kfree(tmp); + + if (ret) + size = ret; } #endif return size; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 6aa31369ecc9..2acecbca212e 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c) /* unit tests */ -static void test_delete(struct bch_fs *c, u64 nr) +static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -41,24 +41,37 @@ static void test_delete(struct bch_fs *c, u64 nr) BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in test_delete: %i", ret); + goto err; + } pr_info("deleting once"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; + } pr_info("deleting twice"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; + } +err: bch2_trans_exit(&trans); + return ret; } -static void test_delete_written(struct bch_fs *c, u64 nr) +static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -73,27 +86,37 @@ static void test_delete_written(struct bch_fs *c, u64 nr) BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete_written: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in test_delete_written: %i", ret); + goto err; + } bch2_journal_flush_all_pins(&c->journal); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; + } +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate(struct bch_fs *c, u64 nr) +static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -109,7 +132,10 @@ static void test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -132,17 +158,18 @@ static void test_iterate(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != --i); BUG_ON(i); - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_extents(struct bch_fs *c, u64 nr) +static int 
test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -159,7 +186,10 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -182,17 +212,18 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) } BUG_ON(i); - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots(struct bch_fs *c, u64 nr) +static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -208,7 +239,10 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -240,17 +274,18 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) +static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -267,7 +302,10 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -299,15 +337,16 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - +err: bch2_trans_exit(&trans); + return 0; } /* * XXX: we really want to make sure we've got a btree with depth > 0 for these * tests */ -static void test_peek_end(struct bch_fs *c, u64 nr) +static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -324,9 +363,10 @@ static void test_peek_end(struct bch_fs *c, u64 nr) BUG_ON(k.k); bch2_trans_exit(&trans); + return 0; } -static void test_peek_end_extents(struct bch_fs *c, u64 nr) +static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -343,14 +383,15 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k); bch2_trans_exit(&trans); + return 0; } /* extent unit tests */ u64 test_version; -static void insert_test_extent(struct bch_fs *c, - u64 start, u64 end) +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) { struct bkey_i_cookie k; int ret; @@ -364,42 +405,47 @@ static void insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) + bch_err(c, "insert error in insert_test_extent: %i", ret); + return ret; } -static void __test_extent_overwrite(struct bch_fs *c, +static int __test_extent_overwrite(struct bch_fs *c, u64 e1_start, u64 e1_end, u64 e2_start, u64 e2_end) { - insert_test_extent(c, e1_start, e1_end); - insert_test_extent(c, e2_start, e2_end); + int ret; + + ret = insert_test_extent(c, e1_start, e1_end) ?: + insert_test_extent(c, e2_start, e2_end); 
delete_test_keys(c); + return ret; } -static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 0, 32); - __test_extent_overwrite(c, 8, 64, 0, 32); + return __test_extent_overwrite(c, 0, 64, 0, 32) ?: + __test_extent_overwrite(c, 8, 64, 0, 32); } -static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 64); - __test_extent_overwrite(c, 0, 64, 32, 72); + return __test_extent_overwrite(c, 0, 64, 32, 64) ?: + __test_extent_overwrite(c, 0, 64, 32, 72); } -static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 40); + return __test_extent_overwrite(c, 0, 64, 32, 40); } -static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 32, 64, 0, 64); - __test_extent_overwrite(c, 32, 64, 0, 128); - __test_extent_overwrite(c, 32, 64, 32, 64); - __test_extent_overwrite(c, 32, 64, 32, 128); + return __test_extent_overwrite(c, 32, 64, 0, 64) ?: + __test_extent_overwrite(c, 32, 64, 0, 128) ?: + __test_extent_overwrite(c, 32, 64, 32, 64) ?: + __test_extent_overwrite(c, 32, 64, 32, 128); } /* perf tests */ @@ -415,11 +461,11 @@ static u64 test_rand(void) return v; } -static void rand_insert(struct bch_fs *c, u64 nr) +static int rand_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct bkey_i_cookie k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -430,48 +476,63 @@ static void rand_insert(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in rand_insert: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void rand_lookup(struct bch_fs *c, u64 nr) +static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); - bch2_trans_iter_free(&trans, iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "error in rand_lookup: %i", ret); + break; + } } + bch2_trans_iter_free(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void rand_mixed(struct bch_fs *c, u64 nr) +static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "lookup error in rand_mixed: %i", ret); + break; + } if (!(i & 3) && k.k) { struct bkey_i_cookie k; @@ -481,14 +542,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - - 
BUG_ON(ret); + if (ret) { + bch_err(c, "update error in rand_mixed: %i", ret); + break; + } } - - bch2_trans_iter_free(&trans, iter); } + bch2_trans_iter_free(&trans, iter); bch2_trans_exit(&trans); + return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) @@ -518,10 +581,10 @@ err: return ret; } -static void rand_delete(struct bch_fs *c, u64 nr) +static int rand_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -531,19 +594,23 @@ static void rand_delete(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); - BUG_ON(ret); + if (ret) { + bch_err(c, "error in rand_delete: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void seq_insert(struct bch_fs *c, u64 nr) +static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_cookie insert; - int ret; + int ret = 0; u64 i = 0; bkey_cookie_init(&insert.k_i); @@ -556,35 +623,39 @@ static void seq_insert(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &insert.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_insert: %i", ret); + break; + } if (++i == nr) break; } bch2_trans_exit(&trans); + return ret; } -static void seq_lookup(struct bch_fs *c, u64 nr) +static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ; bch2_trans_exit(&trans); + return ret; } -static void seq_overwrite(struct bch_fs *c, u64 nr) +static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -596,23 +667,28 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &u.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_overwrite: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void seq_delete(struct bch_fs *c, u64 nr) +static int seq_delete(struct bch_fs *c, u64 nr) { int ret; ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, POS(0, 0), POS(0, U64_MAX), NULL); - BUG_ON(ret); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; } -typedef void (*perf_test_fn)(struct bch_fs *, u64); +typedef int (*perf_test_fn)(struct bch_fs *, u64); struct test_job { struct bch_fs *c; @@ -628,11 +704,13 @@ struct test_job { u64 start; u64 finish; + int ret; }; static int btree_perf_test_thread(void *data) { struct test_job *j = data; + int ret; if (atomic_dec_and_test(&j->ready)) { wake_up(&j->ready_wait); @@ -641,7 +719,9 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - j->fn(j->c, j->nr / j->nr_threads); + ret = j->fn(j->c, j->nr / j->nr_threads); + if (ret) + j->ret = ret; if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -651,8 +731,8 @@ static int btree_perf_test_thread(void *data) return 0; } -void bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) +int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; char 
name_buf[20], nr_buf[20], per_sec_buf[20]; @@ -695,7 +775,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, if (!j.fn) { pr_err("unknown test %s", testname); - return; + return -EINVAL; } //pr_info("running test %s:", testname); @@ -720,6 +800,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, time / NSEC_PER_SEC, time * nr_threads / nr, per_sec_buf); + return j.ret; } #endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h index 551d0764225e..c73b18aea7e0 100644 --- a/fs/bcachefs/tests.h +++ b/fs/bcachefs/tests.h @@ -6,7 +6,7 @@ struct bch_fs; #ifdef CONFIG_BCACHEFS_TESTS -void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); +int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); #else -- cgit From 3eb26d0157781c262a85f13c20de92889f1a2a8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2020 23:11:53 -0500 Subject: bcachefs: bch2_trans_get_iter() no longer returns errors Since we now always preallocate the maximum number of iterators when we initialize a btree transaction, getting an iterator never fails - we can delete a fair amount of error path code. This patch also simplifies the iterator allocation code a bit. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 141 ++++++++-------------------------------- fs/bcachefs/btree_iter.h | 32 ++++----- fs/bcachefs/btree_key_cache.c | 10 --- fs/bcachefs/btree_types.h | 11 ++-- fs/bcachefs/btree_update_leaf.c | 27 +------- fs/bcachefs/buckets.c | 6 -- fs/bcachefs/fs-io.c | 3 - fs/bcachefs/fsck.c | 8 +-- fs/bcachefs/inode.c | 6 -- fs/bcachefs/io.c | 9 --- fs/bcachefs/recovery.c | 8 +-- fs/bcachefs/str_hash.h | 7 +- 12 files changed, 46 insertions(+), 222 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0e8c8f3400de..6fa2b13e53bf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -346,7 +346,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) { struct btree_iter *iter; - trans_for_each_iter_all(trans, iter) + trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } #else @@ -2002,110 +2002,37 @@ int bch2_trans_iter_free(struct btree_trans *trans, return bch2_trans_iter_put(trans, iter); } -#if 0 -static int bch2_trans_realloc_iters(struct btree_trans *trans, - unsigned new_size) +noinline __cold +static void btree_trans_iter_alloc_fail(struct btree_trans *trans) { - void *p, *new_iters, *new_updates, *new_updates2; - size_t iters_bytes; - size_t updates_bytes; - - new_size = roundup_pow_of_two(new_size); - - BUG_ON(new_size > BTREE_ITER_MAX); - - if (new_size <= trans->size) - return 0; - - BUG_ON(trans->used_mempool); - - bch2_trans_unlock(trans); - iters_bytes = sizeof(struct btree_iter) * new_size; - updates_bytes = sizeof(struct btree_insert_entry) * new_size; - - p = kmalloc(iters_bytes + - updates_bytes + - updates_bytes, GFP_NOFS); - if (p) - goto success; - - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); - new_size = BTREE_ITER_MAX; - - trans->used_mempool = true; -success: - new_iters = p; p += iters_bytes; - new_updates = p; p += updates_bytes; - new_updates2 = p; p += updates_bytes; - - memcpy(new_iters, trans->iters, - sizeof(struct btree_iter) * trans->nr_iters); - memcpy(new_updates, trans->updates, - sizeof(struct btree_insert_entry) * trans->nr_updates); - memcpy(new_updates2, trans->updates2, - sizeof(struct btree_insert_entry) * trans->nr_updates2); 
- - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - memset(trans->iters, POISON_FREE, - sizeof(struct btree_iter) * trans->nr_iters + - sizeof(struct btree_insert_entry) * trans->nr_iters); - - kfree(trans->iters); - - trans->iters = new_iters; - trans->updates = new_updates; - trans->updates2 = new_updates2; - trans->size = new_size; - - if (trans->iters_live) { - trace_trans_restart_iters_realloced(trans->ip, trans->size); - return -EINTR; - } + struct btree_iter *iter; - return 0; + trans_for_each_iter(trans, iter) + pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", + (void *) iter->ip_allocated); + panic("trans iter oveflow\n"); } -#endif static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { - unsigned idx = __ffs64(~trans->iters_linked); - - if (idx < trans->nr_iters) - goto got_slot; - - if (trans->nr_iters == trans->size) { - struct btree_iter *iter; - - BUG_ON(trans->size < BTREE_ITER_MAX); - - trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - } + unsigned idx; - panic("trans iter oveflow\n"); -#if 0 - ret = bch2_trans_realloc_iters(trans, trans->size * 2); - if (ret) - return ERR_PTR(ret); -#endif - } + if (unlikely(trans->iters_linked == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_trans_iter_alloc_fail(trans); - idx = trans->nr_iters++; - BUG_ON(trans->nr_iters > trans->size); + idx = __ffs64(~trans->iters_linked); - trans->iters[idx].idx = idx; -got_slot: - BUG_ON(trans->iters_linked & (1ULL << idx)); - trans->iters_linked |= 1ULL << idx; - trans->iters[idx].flags = 0; + trans->iters_linked |= 1ULL << idx; + trans->iters[idx].idx = idx; + trans->iters[idx].flags = 0; return &trans->iters[idx]; } @@ -2141,8 +2068,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, { struct btree_iter *iter, *best = NULL; - BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2160,16 +2085,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (!best) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - bch2_btree_iter_init(trans, iter, btree_id, pos, flags); } else if ((trans->iters_live & (1ULL << best->idx)) || (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, best); } else { iter = best; @@ -2203,9 +2122,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, flags); - if (!IS_ERR(iter)) - __bch2_btree_iter_set_pos(iter, pos, - btree_node_type_is_extents(btree_id)); + __bch2_btree_iter_set_pos(iter, pos, + btree_node_type_is_extents(btree_id)); return iter; } @@ -2221,7 +2139,6 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, flags|BTREE_ITER_NODES); unsigned i; - BUG_ON(IS_ERR(iter)); BUG_ON(bkey_cmp(iter->pos, 
pos)); iter->locks_want = locks_want; @@ -2241,9 +2158,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *iter; iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, src); trans->iters_live |= 1ULL << iter->idx; @@ -2318,7 +2232,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; - trans->need_reset = 0; trans->nr_updates = 0; trans->nr_updates2 = 0; trans->mem_top = 0; @@ -2340,9 +2253,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) { - unsigned new_size = BTREE_ITER_MAX; - size_t iters_bytes = sizeof(struct btree_iter) * new_size; - size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; + size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; void *p = NULL; BUG_ON(trans->used_mempool); @@ -2356,7 +2268,6 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; trans->updates2 = p; p += updates_bytes; - trans->size = new_size; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index f7a73619c85b..ee8c4346aadb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -48,21 +48,16 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ -#define trans_for_each_iter_all(_trans, _iter) \ - for (_iter = (_trans)->iters; \ - _iter < (_trans)->iters + (_trans)->nr_iters; \ - _iter++) - static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { - EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); - - for (; idx < trans->nr_iters; idx++) - if (trans->iters_linked & (1ULL << idx)) - return &trans->iters[idx]; + u64 l = trans->iters_linked >> idx; + if (!l) + return NULL; - return NULL; + idx += __ffs64(l); + EBUG_ON(trans->iters[idx].idx != idx); + return &trans->iters[idx]; } #define trans_for_each_iter(_trans, _iter) \ @@ -240,10 +235,9 @@ static inline int bkey_err(struct bkey_s_c k) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ - bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags))) ?: \ - PTR_ERR_OR_ZERO(((_k) = \ + for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags)), \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_peek(_iter, _flags)).k); \ !_ret && (_k).k; \ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ @@ -270,9 +264,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, { struct btree_iter *iter = __bch2_trans_get_iter(trans, btree_id, pos, flags); - - if (!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + iter->ip_allocated = _THIS_IP_; return iter; } @@ -284,10 +276,8 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) struct btree_iter *iter = __bch2_trans_copy_iter(trans, src); - if (!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + iter->ip_allocated = _THIS_IP_; return iter; - } struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ccb5f3cc7160..83156bc45e7b 100644 --- 
a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -170,9 +170,6 @@ static int btree_key_cache_fill(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, ck->key.btree_id, ck->key.pos, BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { @@ -326,18 +323,11 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, BTREE_ITER_SLOTS| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(b_iter); - if (ret) - goto out; - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(c_iter); - if (ret) - goto out; retry: ret = bch2_btree_iter_traverse(c_iter); if (ret) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 28f0d7b85ad6..d30c31f0f11f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -357,20 +357,17 @@ struct btree_trans { unsigned long ip; int srcu_idx; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; - - u8 nr_iters; u8 nr_updates; u8 nr_updates2; - u8 size; unsigned used_mempool:1; unsigned error:1; unsigned nounlock:1; - unsigned need_reset:1; unsigned in_traverse_all:1; + u64 iters_linked; + u64 iters_live; + u64 iters_touched; + unsigned mem_top; unsigned mem_bytes; void *mem; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 44d1d21dd608..1c47d806fa9c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -717,7 +717,7 @@ static void bch2_trans_update2(struct btree_trans *trans, BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; @@ -750,8 +750,6 @@ static int extent_update_to_keys(struct btree_trans *trans, return 0; iter = bch2_trans_copy_iter(trans, orig_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); iter->flags |= BTREE_ITER_INTENT; __bch2_btree_iter_set_pos(iter, insert->k.p, false); @@ -770,10 +768,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, int ret = 0; iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - k = bch2_btree_iter_peek_with_updates(iter); while (k.k && !(ret = bkey_err(k))) { @@ -782,8 +776,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -799,8 +791,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (bkey_cmp(k.k->p, end) > 0) { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -814,8 +804,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_trans_iter_put(trans, update_iter); } else { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -847,8 +835,6 @@ int __bch2_trans_commit(struct btree_trans *trans) unsigned u64s; int ret = 0; - 
BUG_ON(trans->need_reset); - if (!trans->nr_updates) goto out_noupdates; @@ -1041,10 +1027,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, */ if (trans->iters_live & (1ULL << i->iter->idx)) { i->iter = bch2_trans_copy_iter(trans, i->iter); - if (IS_ERR(i->iter)) { - trans->need_reset = true; - return PTR_ERR(i->iter); - } i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; bch2_trans_iter_put(trans, i->iter); @@ -1054,7 +1036,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_pos(i->iter, n.k->k.p); } - EBUG_ON(trans->nr_updates >= trans->nr_iters); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); @@ -1069,8 +1051,6 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, 0); @@ -1117,9 +1097,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 65d9b8126609..7cc31b0e02e4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1575,9 +1575,6 @@ static int trans_get_key(struct btree_trans *trans, *iter = bch2_trans_get_iter(trans, btree_id, pos, flags|BTREE_ITER_INTENT); - if (IS_ERR(*iter)) - return PTR_ERR(*iter); - *k = __bch2_btree_iter_peek(*iter, flags); ret = bkey_err(*k); if (ret) @@ -1605,9 +1602,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_put(trans, iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9c3b11f414b5..4dafe2be0a44 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2489,10 +2489,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); - BUG_ON(IS_ERR_OR_NULL(src)); - dst = bch2_trans_copy_iter(&trans, src); - BUG_ON(IS_ERR_OR_NULL(dst)); while (1) { struct disk_reservation disk_res = diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7449819d8eac..39f872de0c18 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans, return 0; iter = bch2_trans_copy_iter(trans, h->chain); - BUG_ON(IS_ERR(iter)); for_each_btree_key_continue(iter, 0, k2, ret) { if (bkey_cmp(k2.k->p, k.k->p) >= 0) @@ -265,10 +264,8 @@ static void hash_set_chain_start(struct btree_trans *trans, hash_stop_chain(trans, h); if (!hole) { - if (!h->chain) { + if (!h->chain) h->chain = bch2_trans_copy_iter(trans, k_iter); - BUG_ON(IS_ERR(h->chain)); - } h->chain_end = k.k->p.offset; } @@ -440,9 +437,6 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, bch2_cut_front(cut_at, u); u_iter = bch2_trans_copy_iter(trans, iter); - ret = PTR_ERR_OR_ZERO(u_iter); - if (ret) - return ret; /* * We don't want to go through the diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 
358e39361e56..b11aecf2cfab 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -302,9 +302,6 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), BTREE_ITER_CACHED|flags); - if (IS_ERR(iter)) - return iter; - k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) @@ -640,9 +637,6 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_CACHED); - if (IS_ERR(iter)) - return PTR_ERR(iter); - k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 62a9a0b32d5b..8125642aef7c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -207,8 +207,6 @@ static int sum_sector_overwrites(struct btree_trans *trans, *delta = 0; iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { if (!may_allocate && @@ -1781,9 +1779,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = PTR_ERR_OR_ZERO(iter))) - goto out; - k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) goto out; @@ -1991,10 +1986,6 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, POS(0, reflink_offset), BTREE_ITER_SLOTS); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a837d9eb0f6d..d24cef2bf1aa 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -443,9 +443,6 @@ retry: bch2_cut_back(atomic_end, split); split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; /* * It's important that we don't go through the @@ -502,8 +499,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, iter = bch2_trans_get_node_iter(trans, id, k->k.p, BTREE_MAX_DEPTH, level, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); /* * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run @@ -538,8 +533,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 9c9549d0a8f6..f6b694b9346b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -205,8 +205,6 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, int ret; iter = bch2_trans_copy_iter(trans, start); - if (IS_ERR(iter)) - return PTR_ERR(iter); bch2_btree_iter_next_slot(iter); @@ -253,11 +251,8 @@ int bch2_hash_set(struct btree_trans *trans, } if (!slot && - !(flags & BCH_HASH_SET_MUST_REPLACE)) { + !(flags & BCH_HASH_SET_MUST_REPLACE)) slot = bch2_trans_copy_iter(trans, iter); - if (IS_ERR(slot)) - return PTR_ERR(slot); - } if (k.k->type != KEY_TYPE_whiteout) goto not_found; -- cgit From c5bb1690346167888d3b2834b4b11d9cfa682583 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Dec 2020 15:33:12 -0500 Subject: bcachefs: Fix 
journal_flush_seq() The error check was inverted - leading fsyncs to get stuck and hang, oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 759a04346cde..0cd868c8248b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -552,16 +552,13 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct journal_buf *buf; int ret = 0; - if (seq <= j->err_seq) - return -EIO; - if (seq <= j->seq_ondisk) return 1; spin_lock(&j->lock); /* Recheck under lock: */ - if (seq <= j->err_seq) { + if (j->err_seq && seq >= j->err_seq) { ret = -EIO; goto out; } -- cgit From b206df6e15ca85d0bb777a5548834c8685e99bc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 13:09:08 -0500 Subject: bcachefs: Fix some spurious gcc warnings These only come up when building in userspace, for some reason. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/movinggc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 83156bc45e7b..7416e7a49893 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -317,7 +317,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_iter *c_iter = NULL, *b_iter = NULL; - struct bkey_cached *ck; + struct bkey_cached *ck = NULL; int ret; b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a9775cc84f66..f0cfd109a022 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct extent_ptr_decoded p = { 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -- cgit From 7bfbbd88024d70947761e482c856522b43a98d87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Dec 2020 18:30:06 -0500 Subject: bcachefs: Fix spurious alloc errors on forced shutdown Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ac91006f3c69..aed54d5b5251 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -544,6 +544,17 @@ static void btree_update_nodes_written(struct btree_update *as) unsigned i; int ret; + /* + * If we're already in an error state, it might be because a btree node + * was never written, and we might be trying to free that same btree + * node here, but it won't have been marked as allocated and we'll see + * spurious disk usage inconsistencies in the transactional part below + * if we don't skip it: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + BUG_ON(!journal_pin_active(&as->journal)); /* @@ -569,8 +580,10 @@ static void btree_update_nodes_written(struct btree_update *as) BTREE_INSERT_JOURNAL_RESERVED, btree_update_nodes_written_trans(&trans, as)); 
bch2_trans_exit(&trans); - BUG_ON(ret && !bch2_journal_error(&c->journal)); + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, + "error %i in btree_update_nodes_written()", ret); +err: if (b) { /* * @b is the node we did the final insert into: -- cgit From f299d57350b2450c522dc7780400ce811f4847ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 18:36:33 -0500 Subject: bcachefs: Refactor filesystem usage accounting Various filesystem usage counters are kept in percpu counters, with one set per in flight journal buffer. Right now all the code that deals with it assumes that there's only two buffers/sets of counters, but the number of journal bufs is getting increased to 4 in the next patch - so refactor that code to not assume a constant. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/buckets.c | 28 +++++++++++++++++----------- fs/bcachefs/journal_types.h | 4 ++++ fs/bcachefs/replicas.c | 30 +++++++++++++++++------------- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 4 ++-- 7 files changed, 42 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4fe3f9257752..6db04dc9d2d3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -676,7 +676,7 @@ struct bch_fs { seqcount_t usage_lock; struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; struct bch_fs_usage __percpu *usage_gc; u64 __percpu *online_reserved; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index df018a2e463e..5f5686466d7d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c, struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - unsigned i; c->ec_stripes_heap.used = 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7cc31b0e02e4..4762c5465ef0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, { return this_cpu_ptr(gc ? 
c->usage_gc - : c->usage[journal_seq & 1]); + : c->usage[journal_seq & JOURNAL_BUF_MASK]); } u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { ssize_t offset = v - (u64 *) c->usage_base; - unsigned seq; + unsigned i, seq; u64 ret; BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); @@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) do { seq = read_seqcount_begin(&c->usage_lock); - ret = *v + - percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + - percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + ret = *v; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -232,15 +233,20 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, u64s; + unsigned seq, i, v, u64s = fs_usage_u64s(c); +retry: + ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (unlikely(!ret)) + return NULL; percpu_down_read(&c->mark_lock); - ret = kmalloc(sizeof(struct bch_fs_usage_online) + - sizeof(u64) + c->replicas.nr, GFP_NOFS); - if (unlikely(!ret)) { + v = fs_usage_u64s(c); + if (unlikely(u64s != v)) { + u64s = v; percpu_up_read(&c->mark_lock); - return NULL; + kfree(ret); + goto retry; } ret->online_reserved = percpu_u64_get(c->online_reserved); @@ -248,7 +254,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + memcpy(ret, c->usage_base, u64s * sizeof(u64)); for (i = 0; i < ARRAY_SIZE(c->usage); i++) acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 6312a7f06d87..7e328ccc0a8f 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -11,6 +11,10 @@ struct journal_res; +#define JOURNAL_BUF_BITS 1 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) + /* * We put two of these in struct journal; we used them for writes to the * journal that are being staged or in flight. 
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index f46aa1d70e35..85c97f67936a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -275,7 +275,7 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[2]; + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; struct bch_fs_usage_online *new_scratch = NULL; struct bch_fs_usage __percpu *new_gc = NULL; struct bch_fs_usage *new_base = NULL; @@ -283,7 +283,14 @@ static int replicas_table_update(struct bch_fs *c, sizeof(u64) * new_r->nr; unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * new_r->nr; - int ret = -ENOMEM; + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_NOIO))) + goto err; memset(new_usage, 0, sizeof(new_usage)); @@ -295,10 +302,8 @@ static int replicas_table_update(struct bch_fs *c, if (!(new_base = kzalloc(bytes, GFP_NOIO)) || !(new_scratch = kmalloc(scratch_bytes, GFP_NOIO)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { - bch_err(c, "error updating replicas table: memory allocation failure"); + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) goto err; - } for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (c->usage[i]) @@ -317,14 +322,17 @@ static int replicas_table_update(struct bch_fs *c, swap(c->usage_scratch, new_scratch); swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); - ret = 0; -err: +out: free_percpu(new_gc); kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); kfree(new_base); return ret; +err: + bch_err(c, "error updating replicas table: memory allocation failure"); + ret = -ENOMEM; + goto out; } static unsigned reserve_journal_replicas(struct bch_fs *c, @@ -499,9 +507,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_cpu n; if (!__replicas_has_entry(&c->replicas_gc, e) && - (c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]))) { + bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { ret = -ENOSPC; @@ -606,9 +612,7 @@ retry: cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || - c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i])) + bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 5406315340e1..e25ff75b97f3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -998,7 +998,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); } else { - bch2_fs_usage_acc_to_base(c, journal_seq & 1); + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); } { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 98a875e08e9a..f46b4b05b4aa 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -483,8 +483,8 @@ static void __bch2_fs_free(struct bch_fs *c) percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); kfree(c->usage_scratch); - free_percpu(c->usage[1]); - free_percpu(c->usage[0]); + for (i = 0; i < 
ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); kfree(c->usage_base); if (c->btree_iters_bufs) -- cgit From 0fefe8d8ef74029e9f3676ef9613ef022ae6dbd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 13:57:22 -0500 Subject: bcachefs: Improve some IO error messages it's useful to know whether an error was for a read or a write - this also standardizes error messages a bit more. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 9 +++++++-- fs/bcachefs/btree_io.c | 31 +++++++++++++++++++++---------- fs/bcachefs/ec.c | 12 ++++++------ fs/bcachefs/error.h | 29 ++++++++++++++++------------- fs/bcachefs/fs-io.c | 4 +++- fs/bcachefs/io.c | 47 ++++++++++++++++++++++++++++++----------------- fs/bcachefs/journal_io.c | 4 ++-- fs/bcachefs/super-io.c | 2 +- 8 files changed, 86 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6db04dc9d2d3..9645a4edcbe8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -214,9 +214,11 @@ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif #define bch_info(c, fmt, ...) \ @@ -229,8 +231,11 @@ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_err_ratelimited(c, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) \ do { \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index af3b39b70957..520eef531d39 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_iter_reinit_node(iter, b); } +static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + pr_buf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, + c->btree_roots[b->c.btree_id].level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" - "pos ", - write ? "before write " : "", - b->c.btree_id, b->c.level, - c->btree_roots[b->c.btree_id].level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "error validating btree node %sat btree ", + write ? 
"before write " : ""); + btree_pos_to_text(out, c, b); - pr_buf(out, " node offset %u", b->written); + pr_buf(out, "\n node offset %u", b->written); if (i) pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } @@ -1104,6 +1111,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; + struct printbuf out; bool can_retry; goto start; @@ -1122,8 +1131,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", - bch2_blk_status_to_str(bio->bi_status)); + out = PBUF(buf); + btree_pos_to_text(&out, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1408,7 +1419,7 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 42331f0e54e7..6c9259ee6742 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) len << 9); if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { - __bcache_io_error(c, + bch_err_ratelimited(c, "checksum error while doing reconstruct read (%u:%u)", i, j); clear_bit(i, buf->valid); @@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) unsigned bytes = buf->size << 9; if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); return -1; } @@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", bio_data_dir(bio) ? 
"write" : "read", bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: stripe not found"); kfree(buf); return bch2_trans_exit(&trans) ?: -EIO; @@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr_stale(ca, ptr)) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: stale pointer"); clear_bit(i, buf->valid); continue; @@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); ret = -EIO; goto err; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 94b53312fbbd..0e49fd728e44 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *); /* Logs message and handles the error: */ #define bch2_dev_io_error(ca, fmt, ...) \ do { \ - printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ - "IO error on %s for " fmt), \ + printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ (ca)->name, ##__VA_ARGS__); \ bch2_io_error(ca); \ } while (0) +#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ + (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ + bch2_io_error(ca); \ +} while (0) + #define bch2_dev_io_err_on(cond, ca, ...) \ ({ \ bool _ret = (cond); \ @@ -196,16 +202,13 @@ do { \ _ret; \ }) -/* kill? */ - -#define __bcache_io_error(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, \ - "IO error: " fmt), ##__VA_ARGS__) - -#define bcache_io_error(c, bio, fmt, ...) \ -do { \ - __bcache_io_error(c, fmt, ##__VA_ARGS__); \ - (bio)->bi_status = BLK_STS_IOERR; \ -} while (0) +#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) 
\ +({ \ + bool _ret = (cond); \ + \ + if (_ret) \ + bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ + _ret; \ +}) #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4dafe2be0a44..c10192e2a688 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -839,7 +839,9 @@ retry: goto retry; if (ret) { - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8125642aef7c..9603381bb7ce 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -588,7 +588,8 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, op->pos.inode, + "write error %i from btree update", ret); op->error = ret; } } @@ -633,7 +634,10 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + "data write error: %s", bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); @@ -1276,15 +1280,14 @@ void bch2_write(struct closure *cl) wbio_init(bio)->put_bio = false; if (bio_sectors(bio) & (c->opts.block_size - 1)) { - __bcache_io_error(c, "misaligned write"); + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); op->error = -EIO; goto err; } if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) - __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } @@ -1707,7 +1710,8 @@ retry: * reading a btree node */ BUG_ON(!ret); - __bcache_io_error(c, "btree IO error: %i", ret); + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: @@ -1911,17 +1915,15 @@ csum_err: return; } - bch2_dev_io_error(ca, - "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", - rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); return; decompression_err: - __bcache_io_error(c, "decompression error, inode %llu offset %llu", - rbio->pos.inode, - (u64) rbio->bvec_iter.bi_sector); + bch_err_inum_ratelimited(c, rbio->pos.inode, + "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); return; } @@ -1943,7 +1945,14 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", + /* + * XXX: rbio->pos is not what we want here when reading from indirect + * extents + */ + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->pos.inode, + rbio->pos.offset, + "data read error: %s", bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; @@ -1993,7 +2002,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_reflink_v && 
k.k->type != KEY_TYPE_indirect_inline_data) { - __bcache_io_error(trans->c, + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, "pointer to nonexistent indirect extent"); ret = -EIO; goto err; @@ -2038,7 +2047,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); goto err; } @@ -2190,7 +2200,8 @@ get_bio: if (!rbio->pick.idx) { if (!rbio->have_ioref) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -2345,7 +2356,9 @@ err: if (ret == -EINTR) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); goto out; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 79d5d892728f..fc2fdcc2b627 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -557,7 +557,7 @@ reread: kfree(bio); if (bch2_dev_io_err_on(ret, ca, - "journal read from sector %llu", + "journal read error: sector %llu", offset) || bch2_meta_read_fault("journal")) return -EIO; @@ -1015,7 +1015,7 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { struct journal_buf *w = journal_prev_buf(j); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e25ff75b97f3..e1b4e6f02ee3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -640,7 +640,7 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; -- cgit From 33c74e4119a91c3ae87fc207777e34fdbb613c66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 14:27:20 -0500 Subject: bcachefs: Flag inodes that had btree update errors On write error, the vfs inode's i_size may be inconsistent with the btree inode's i_size - flag this so we don't have spurious assertions. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 +++++++++++-- fs/bcachefs/fs.c | 1 + fs/bcachefs/fs.h | 7 +++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c10192e2a688..2d31547446ac 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -994,6 +994,8 @@ static void bch2_writepage_io_done(struct closure *cl) unsigned i; if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; @@ -1916,7 +1918,13 @@ loop: bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->op.error) + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); + break; + } + + if (!dio->iter.count) break; bio_reset(bio, NULL, REQ_OP_WRITE); @@ -2306,7 +2314,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (ret) goto err; - BUG_ON(inode->v.i_size < inode_u.bi_size); + WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(inode, &inode_u, iattr); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a3810493826b..7cd3f243d1ed 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1161,6 +1161,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; + inode->ei_flags = 0; inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 7c095b856b05..8c2796aa7abf 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *); struct bch_inode_info { struct inode v; + unsigned long ei_flags; struct mutex ei_update_lock; u64 ei_journal_seq; @@ -49,6 +50,12 @@ struct bch_inode_info { struct bch_inode_unpacked ei_inode; }; +/* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: + */ +#define EI_INODE_ERROR 0 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) -- cgit From afa7cb0c36bd511362bcb03c6db8af74186176bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 13:23:58 -0500 Subject: bcachefs: Check for errors in bch2_journal_reclaim() If the journal is halted, journal reclaim won't necessarily be able to make any forward progress, and won't accomplish anything anyways - we should bail out so that we don't get stuck looping in reclaim when the caches are too dirty and we should be shutting down. 
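A sketch of how callers are expected to consume the new return value (illustrative only; the loop mirrors the btree_update_leaf.c hunk below, and example_wait_for_reclaim is a hypothetical wrapper, not a function added by the patch):

static int example_wait_for_reclaim(struct bch_fs *c)
{
	int ret;

	do {
		mutex_lock(&c->journal.reclaim_lock);
		ret = bch2_journal_reclaim(&c->journal);	/* now returns -EIO if the journal is halted */
		mutex_unlock(&c->journal.reclaim_lock);
	} while (!ret && bch2_btree_key_cache_must_wait(c));

	return ret;	/* caller bails out instead of looping forever */
}
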
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 8 ++++---- fs/bcachefs/journal_reclaim.c | 19 ++++++++++++++----- fs/bcachefs/journal_reclaim.h | 2 +- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1c47d806fa9c..e27ec0fbee2c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -659,13 +659,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - while (bch2_btree_key_cache_must_wait(c)) { + do { mutex_lock(&c->journal.reclaim_lock); - bch2_journal_reclaim(&c->journal); + ret = bch2_journal_reclaim(&c->journal); mutex_unlock(&c->journal.reclaim_lock); - } + } while (!ret && bch2_btree_key_cache_must_wait(c)); - if (bch2_trans_relock(trans)) + if (!ret && bch2_trans_relock(trans)) return 0; trace_trans_restart_journal_reclaim(trans->ip); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1dabad618870..4fd2b272e04e 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -485,13 +485,14 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -static void __bch2_journal_reclaim(struct journal *j, bool direct) +static int __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush, nr_flushed = 0; size_t min_nr; unsigned flags; + int ret = 0; /* * We can't invoke memory reclaim while holding the reclaim_lock - @@ -506,6 +507,11 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct) if (kthread && kthread_should_stop()) break; + if (bch2_journal_error(j)) { + ret = -EIO; + break; + } + bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -547,27 +553,30 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct) } while (min_nr); memalloc_noreclaim_restore(flags); + + return ret; } -void bch2_journal_reclaim(struct journal *j) +int bch2_journal_reclaim(struct journal *j) { - __bch2_journal_reclaim(j, true); + return __bch2_journal_reclaim(j, true); } static int bch2_journal_reclaim_thread(void *arg) { struct journal *j = arg; unsigned long next; + int ret = 0; set_freezable(); kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); - while (!kthread_should_stop()) { + while (!ret && !kthread_should_stop()) { j->reclaim_kicked = false; mutex_lock(&j->reclaim_lock); - __bch2_journal_reclaim(j, false); + ret = __bch2_journal_reclaim(j, false); mutex_unlock(&j->reclaim_lock); next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index e25355042e6e..3404fef241ea 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -73,7 +73,7 @@ static inline void bch2_journal_pin_update(struct journal *j, u64 seq, void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); -void bch2_journal_reclaim(struct journal *); +int bch2_journal_reclaim(struct journal *); void bch2_journal_reclaim_stop(struct journal *); int bch2_journal_reclaim_start(struct journal *); -- cgit From 5db43418d5097b8aca5c725eb301186dee04c70a Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Thu, 3 Dec 2020 16:20:18 -0500 Subject: bcachefs: Don't issue btree writes that weren't journalled If we have an error in the btree interior update path that prevents us from journalling the update, we can't issue the corresponding btree node write - we didn't get a journal sequence number that would cause it to be ignored in recovery. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 +++++ fs/bcachefs/btree_types.h | 2 ++ fs/bcachefs/btree_update_interior.c | 25 +++++++++++++++++++------ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 520eef531d39..b2ffdff48637 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1499,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!btree_node_may_write(b)) return; + if (old & (1 << BTREE_NODE_never_write)) + return; + if (old & (1 << BTREE_NODE_write_in_flight)) { btree_node_wait_on_io(b); continue; @@ -1545,6 +1548,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } + BUG_ON(b->written && !seq); + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ bytes += 8; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d30c31f0f11f..51ad87abc763 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -415,6 +415,7 @@ enum btree_flags { BTREE_NODE_fake, BTREE_NODE_old_extent_overwrite, BTREE_NODE_need_rewrite, + BTREE_NODE_never_write, }; BTREE_FLAG(read_in_flight); @@ -429,6 +430,7 @@ BTREE_FLAG(dying); BTREE_FLAG(fake); BTREE_FLAG(old_extent_overwrite); BTREE_FLAG(need_rewrite); +BTREE_FLAG(never_write); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index aed54d5b5251..594bcd797516 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -603,17 +603,30 @@ err: list_del(&as->write_blocked_list); - if (!ret && as->b == b) { + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { struct bset *i = btree_bset_last(b); BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); - i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); + if (!ret) { + i->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } } mutex_unlock(&c->btree_interior_update_lock); -- cgit From ebb84d094141eac9ee3e22d95abc9792a1c79eca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Nov 2020 18:36:33 -0500 Subject: bcachefs: Increase journal pipelining This patch increases the maximum journal buffers in flight from 2 to 4 - this will be particularly helpful when in the future we stop requiring flush+fua for every journal write. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 143 ++++++++++++++++++++++++++---------------- fs/bcachefs/journal.h | 47 +++++++++----- fs/bcachefs/journal_io.c | 30 ++++++--- fs/bcachefs/journal_reclaim.c | 46 ++++++++++---- fs/bcachefs/journal_reclaim.h | 1 + fs/bcachefs/journal_types.h | 18 +++--- fs/bcachefs/recovery.c | 4 +- 7 files changed, 188 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0cd868c8248b..ac2dddd90c31 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -23,7 +23,7 @@ static u64 last_unwritten_seq(struct journal *j) lockdep_assert_held(&j->lock); - return journal_cur_seq(j) - s.prev_buf_unwritten; + return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); } static inline bool journal_seq_unwritten(struct journal *j, u64 seq) @@ -51,7 +51,7 @@ journal_seq_to_buf(struct journal *j, u64 seq) j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); if (journal_seq_unwritten(j, seq)) { - buf = j->buf + (seq & 1); + buf = j->buf + (seq & JOURNAL_BUF_MASK); EBUG_ON(le64_to_cpu(buf->data->seq) != seq); } return buf; @@ -108,15 +108,8 @@ void bch2_journal_halt(struct journal *j) /* journal entry close/open: */ -void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +void __bch2_journal_buf_put(struct journal *j) { - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); - - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } @@ -129,7 +122,6 @@ static bool __journal_entry_close(struct journal *j) struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); - bool set_need_write = false; unsigned sectors; lockdep_assert_held(&j->lock); @@ -148,15 +140,13 @@ static bool __journal_entry_close(struct journal *j) if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { set_bit(JOURNAL_NEED_WRITE, &j->flags); j->need_write_time = local_clock(); - set_need_write = true; } - if (new.prev_buf_unwritten) - return false; - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; - new.prev_buf_unwritten = 1; + + if (new.idx == new.unwritten_idx) + return false; BUG_ON(journal_state_count(new, new.idx)); } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -190,24 +180,44 @@ static bool __journal_entry_close(struct journal *j) */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); + clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx, set_need_write); + bch2_journal_buf_put(j, old.idx); return true; } +static bool journal_entry_want_write(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = false; + + /* + * Don't close it yet if we already have a write in flight, but do set + * NEED_WRITE: + */ + if (s.idx != s.unwritten_idx) + set_bit(JOURNAL_NEED_WRITE, &j->flags); + else + ret = __journal_entry_close(j); + + return ret; +} + static bool journal_entry_close(struct journal *j) { bool ret; spin_lock(&j->lock); - ret = __journal_entry_close(j); + ret = journal_entry_want_write(j); spin_unlock(&j->lock); return ret; @@ -289,8 +299,8 @@ static int journal_entry_open(struct 
journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state state = READ_ONCE(j->reservations); - bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); if (!ret) journal_entry_close(j); @@ -317,17 +327,29 @@ static void journal_write_work(struct work_struct *work) u64 bch2_inode_journal_seq(struct journal *j, u64 inode) { size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; + union journal_res_state s; + unsigned i; + u64 seq; - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; spin_lock(&j->lock); - if (test_bit(h, journal_cur_buf(j)->has_inode)) - seq = journal_cur_seq(j); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = journal_cur_seq(j) - 1; + seq = journal_cur_seq(j); + s = READ_ONCE(j->reservations); + i = s.idx; + + while (1) { + if (test_bit(h, j->buf[i].has_inode)) + goto out; + + if (i == s.unwritten_idx) + break; + + i = (i - 1) & JOURNAL_BUF_MASK; + seq--; + } + + seq = 0; +out: spin_unlock(&j->lock); return seq; @@ -574,7 +596,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, BUG(); if (seq == journal_cur_seq(j)) - __journal_entry_close(j); + journal_entry_want_write(j); out: spin_unlock(&j->lock); return ret; @@ -863,15 +885,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { union journal_res_state state; - struct journal_buf *w; - bool ret; + bool ret = false; + unsigned i; spin_lock(&j->lock); state = READ_ONCE(j->reservations); - w = j->buf + !state.idx; + i = state.idx; - ret = state.prev_buf_unwritten && - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); + while (i != state.unwritten_idx) { + i = (i - 1) & JOURNAL_BUF_MASK; + if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + ret = true; + } spin_unlock(&j->lock); return ret; @@ -957,7 +982,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, journal_pin_new_entry(j, 1); - j->reservations.idx = journal_cur_seq(j); + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); bch2_journal_buf_init(j); @@ -1015,8 +1040,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].buf_size); - kvpfree(j->buf[0].data, j->buf[0].buf_size); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) + kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); } @@ -1024,6 +1051,7 @@ int bch2_fs_journal_init(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); static struct lock_class_key res_key; + unsigned i; int ret = 0; pr_verbose_init(c->opts, ""); @@ -1038,8 +1066,6 @@ int bch2_fs_journal_init(struct journal *j) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; @@ -1051,13 +1077,20 @@ int bch2_fs_journal_init(struct journal *j) ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) 
{ ret = -ENOMEM; goto out; } + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + if (!j->buf[i].data) { + ret = -ENOMEM; + goto out; + } + } + j->pin.front = j->pin.back = 1; out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1071,7 +1104,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; - unsigned iter; + unsigned i; rcu_read_lock(); spin_lock(&j->lock); @@ -1114,16 +1147,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } pr_buf(out, - "current entry refs:\t%u\n" - "prev entry unwritten:\t", - journal_state_count(s, s.idx)); - - if (s.prev_buf_unwritten) - pr_buf(out, "yes, ref %u sectors %u\n", - journal_state_count(s, !s.idx), - journal_prev_buf(j)->sectors); - else - pr_buf(out, "no\n"); + "current entry:\tidx %u refcount %u\n", + s.idx, journal_state_count(s, s.idx)); + + i = s.idx; + while (i != s.unwritten_idx) { + i = (i - 1) & JOURNAL_BUF_MASK; + + pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", + i, journal_state_count(s, i), j->buf[i].sectors); + } pr_buf(out, "need write:\t\t%i\n" @@ -1131,7 +1164,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - for_each_member_device_rcu(ca, c, iter, + for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1146,7 +1179,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "\tdirty_idx_ondisk\t%u (seq %llu)\n" "\tdirty_idx\t\t%u (seq %llu)\n" "\tcur_idx\t\t%u (seq %llu)\n", - iter, ja->nr, + i, ja->nr, bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, ja->discard_idx, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 7ad2bb576eb0..1b6175cd6f1b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j) return j->buf + j->reservations.idx; } -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) { - return idx == 0 ? 
s.buf0_count : s.buf1_count; + switch (idx) { + case 0: return s.buf0_count; + case 1: return s.buf1_count; + case 2: return s.buf2_count; + case 3: return s.buf3_count; + } + BUG(); } static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; s->buf1_count += s->idx == 1; + s->buf2_count += s->idx == 2; + s->buf3_count += s->idx == 3; } static inline void bch2_journal_set_has_inode(struct journal *j, @@ -257,21 +260,24 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *, bool); +void __bch2_journal_buf_put(struct journal *); -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, - bool need_write_just_set) +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; s.v = atomic64_sub_return(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, + .buf2_count = idx == 2, + .buf3_count = idx == 3, }).v, &j->reservations.counter); - if (!journal_state_count(s, idx)) { - EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); - __bch2_journal_buf_put(j, need_write_just_set); - } + + EBUG_ON(((s.idx - idx) & 3) > + ((s.idx - s.unwritten_idx) & 3)); + + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); } /* @@ -291,7 +297,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, NULL, 0); - bch2_journal_buf_put(j, res->idx, false); + bch2_journal_buf_put(j, res->idx); res->ref = 0; } @@ -327,11 +333,18 @@ static inline int journal_res_get_fast(struct journal *j, !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) return 0; - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - new.cur_entry_offset += res->u64s; journal_state_inc(&new); + + /* + * If the refcount would overflow, we have to wait: + * XXX - tracepoint this: + */ + if (!journal_state_count(new, new.idx)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fc2fdcc2b627..1aeeb58d3c2a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -950,16 +950,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) buf->buf_size = new_size; } +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ + return j->buf + j->reservations.unwritten_idx; +} + static void journal_write_done(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_devs_list devs = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; + union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); + u64 v; int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); @@ -998,9 +1005,14 @@ static void journal_write_done(struct closure *cl) /* also must come before signalling write completion: */ closure_debug_destroy(cl); - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - &j->reservations.counter); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(new.idx == new.unwritten_idx); + + new.unwritten_idx++; + } 
while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); closure_wake_up(&w->wait); journal_wake(j); @@ -1008,6 +1020,10 @@ static void journal_write_done(struct closure *cl) if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); spin_unlock(&j->lock); + + if (new.unwritten_idx != new.idx && + !journal_state_count(new, new.unwritten_idx)) + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } static void journal_write_endio(struct bio *bio) @@ -1018,7 +1034,7 @@ static void journal_write_endio(struct bio *bio) if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); @@ -1035,7 +1051,7 @@ void bch2_journal_write(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct jset_entry *start, *end; struct jset *jset; struct bio *bio; @@ -1046,8 +1062,6 @@ void bch2_journal_write(struct closure *cl) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); - journal_buf_realloc(j, w); jset = w->data; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 4fd2b272e04e..c50352385a47 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -58,6 +58,19 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } +static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) +{ + unsigned sectors = 0; + + while (!sectors && *idx != j->reservations.idx) { + sectors = j->buf[*idx].sectors; + + *idx = (*idx + 1) & JOURNAL_BUF_MASK; + } + + return sectors; +} + static struct journal_space { unsigned next_entry; unsigned remaining; @@ -69,15 +82,14 @@ static struct journal_space { unsigned sectors_next_entry = UINT_MAX; unsigned sectors_total = UINT_MAX; unsigned i, nr_devs = 0; - unsigned unwritten_sectors = j->reservations.prev_buf_unwritten - ? 
journal_prev_buf(j)->sectors - : 0; + unsigned unwritten_sectors; rcu_read_lock(); for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; unsigned buckets_this_device, sectors_this_device; + unsigned idx = j->reservations.unwritten_idx; if (!ja->nr) continue; @@ -89,16 +101,20 @@ static struct journal_space { * We that we don't allocate the space for a journal entry * until we write it out - thus, account for it here: */ - if (unwritten_sectors >= sectors_this_device) { - if (!buckets_this_device) - continue; - - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; + while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) { + if (unwritten_sectors >= sectors_this_device) { + if (!buckets_this_device) { + sectors_this_device = 0; + break; + } + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + sectors_this_device -= unwritten_sectors; } - sectors_this_device -= unwritten_sectors; - if (sectors_this_device < ca->mi.bucket_size && buckets_this_device) { buckets_this_device--; @@ -277,6 +293,14 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } +void __bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) + bch2_journal_reclaim_fast(j); +} + void bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 3404fef241ea..b0f05839396d 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -39,6 +39,7 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } +void __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 7e328ccc0a8f..ec19f75f8ede 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -11,13 +11,13 @@ struct journal_res; -#define JOURNAL_BUF_BITS 1 +#define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) /* - * We put two of these in struct journal; we used them for writes to the - * journal that are being staged or in flight. + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. */ struct journal_buf { struct jset *data; @@ -85,10 +85,12 @@ union journal_res_state { struct { u64 cur_entry_offset:20, - idx:1, - prev_buf_unwritten:1, - buf0_count:21, - buf1_count:21; + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; }; }; @@ -169,7 +171,7 @@ struct journal { * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. 
*/ - struct journal_buf buf[2]; + struct journal_buf buf[JOURNAL_BUF_NR]; spinlock_t lock; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d24cef2bf1aa..7ad5b8234747 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1048,13 +1048,13 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean) { ret = bch2_journal_seq_blacklist_add(c, journal_seq, - journal_seq + 4); + journal_seq + 8); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; } - journal_seq += 4; + journal_seq += 8; /* * The superblock needs to be written before we do any btree -- cgit From b6df4325cd914d988e5b96016f64b879058d0bc6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Nov 2020 12:29:21 -0500 Subject: bcachefs: Improve journal free space calculations Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 25 ++++++-- fs/bcachefs/journal_reclaim.c | 136 ++++++++++++++++++++++-------------------- fs/bcachefs/journal_reclaim.h | 6 -- fs/bcachefs/journal_types.h | 18 +++++- 4 files changed, 108 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ac2dddd90c31..3bbb23d7739a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1147,7 +1147,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } pr_buf(out, - "current entry:\tidx %u refcount %u\n", + "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); i = s.idx; @@ -1164,6 +1164,20 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + pr_buf(out, "space:\n"); + pr_buf(out, "\tdiscarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + pr_buf(out, "\tclean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + pr_buf(out, "\tclean\t\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + pr_buf(out, "\ttotal\t\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1174,12 +1188,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" + "\tbucket size\t%u\n" "\tavailable\t%u:%u\n" - "\tdiscard_idx\t\t%u\n" - "\tdirty_idx_ondisk\t%u (seq %llu)\n" - "\tdirty_idx\t\t%u (seq %llu)\n" + "\tdiscard_idx\t%u\n" + "\tdirty_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t%u (seq %llu)\n" "\tcur_idx\t\t%u (seq %llu)\n", - i, ja->nr, + i, ja->nr, ca->mi.bucket_size, bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, ja->discard_idx, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c50352385a47..c6267284a028 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -71,84 +71,94 @@ static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) return sectors; } -static struct journal_space { - unsigned next_entry; - unsigned remaining; -} __journal_space_available(struct journal *j, unsigned nr_devs_want, +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { - struct bch_fs *c = container_of(j, struct 
bch_fs, journal); - struct bch_dev *ca; - unsigned sectors_next_entry = UINT_MAX; - unsigned sectors_total = UINT_MAX; - unsigned i, nr_devs = 0; - unsigned unwritten_sectors; + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; - unsigned idx = j->reservations.unwritten_idx; + if (from == journal_space_total) + return (struct journal_space) { + .next_entry = ca->mi.bucket_size, + .total = ca->mi.bucket_size * ja->nr, + }; - if (!ja->nr) - continue; - - buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); - sectors_this_device = ja->sectors_free; + buckets = bch2_journal_dev_buckets_available(j, ja, from); + sectors = ja->sectors_free; - /* - * We that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) { - if (unwritten_sectors >= sectors_this_device) { - if (!buckets_this_device) { - sectors_this_device = 0; - break; - } - - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + while ((unwritten = get_unwritten_sectors(j, &idx))) { + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; + break; } - sectors_this_device -= unwritten_sectors; + buckets--; + sectors = ca->mi.bucket_size; } - if (sectors_this_device < ca->mi.bucket_size && - buckets_this_device) { - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; - } + sectors -= unwritten; + } + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; + sectors = ca->mi.bucket_size; + } + + return (struct journal_space) { + .next_entry = sectors, + .total = sectors + buckets * ca->mi.bucket_size, + }; +} - if (!sectors_this_device) +static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned i, pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr) continue; - sectors_next_entry = min(sectors_next_entry, - sectors_this_device); + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; - sectors_total = min(sectors_total, - buckets_this_device * ca->mi.bucket_size + - sectors_this_device); + for (pos = 0; pos < nr_devs; pos++) + if (space.total > dev_space[pos].total) + break; - nr_devs++; + array_insert_item(dev_space, nr_devs, pos, space); } rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; - return (struct journal_space) { - .next_entry = sectors_next_entry, - .remaining = max_t(int, 0, sectors_total - sectors_next_entry), - }; + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ + return dev_space[nr_devs_want - 1]; } void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_space discarded, clean_ondisk, 
clean; + unsigned clean; unsigned overhead, u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); @@ -189,27 +199,25 @@ void bch2_journal_space_available(struct journal *j) goto out; } - if (!fifo_free(&j->pin)) { - ret = cur_entry_journal_pin_full; - goto out; - } - nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); - clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); - clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); - if (!discarded.next_entry) + clean = j->space[journal_space_clean].total; + + if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; + else if (!fifo_free(&j->pin)) + ret = cur_entry_journal_pin_full; - overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + overhead = DIV_ROUND_UP(clean, max_entry_size) * journal_entry_overhead(j); - u64s_remaining = clean.remaining << 6; + u64s_remaining = clean << 6; u64s_remaining = max_t(int, 0, u64s_remaining - overhead); u64s_remaining /= 4; out: - j->cur_entry_sectors = !ret ? discarded.next_entry : 0; + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); journal_check_may_get_unreserved(j); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index b0f05839396d..f02caa3d49ea 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -4,12 +4,6 @@ #define JOURNAL_PIN (32 * 1024) -enum journal_space_from { - journal_space_discarded, - journal_space_clean_ondisk, - journal_space_clean, -}; - static inline void journal_reclaim_kick(struct journal *j) { struct task_struct *p = READ_ONCE(j->reclaim_thread); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index ec19f75f8ede..6b525dc6ab7c 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,8 +9,6 @@ #include "super_types.h" #include "fifo.h" -struct journal_res; - #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -122,6 +120,20 @@ union journal_preres_state { #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; + /* * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, * either because something's waiting on the write to complete or because it's @@ -216,6 +228,8 @@ struct journal { struct journal_entry_pin_list *data; } pin; + struct journal_space space[journal_space_nr]; + u64 replay_journal_seq; u64 replay_journal_seq_end; -- cgit From adbcada43fa79197224b5a522b1faaf222b43bcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 Nov 2020 09:59:58 -0500 Subject: bcachefs: Don't require flush/fua on every journal write This patch adds a flag to journal entries which, if set, indicates that they weren't done as flush/fua writes. - non flush/fua journal writes don't update last_seq (i.e. 
they don't free up space in the journal), thus the journal free space calculations now check whether nonflush journal writes are currently allowed (i.e. are we low on free space, or would doing a flush write free up a lot of space in the journal) - write_delay_ms, the user configurable option for when open journal entries are automatically written, is now interpreted as the max delay between flush journal writes (default 1 second). - bch2_journal_flush_seq_async is changed to ensure a flush write >= the requested sequence number has happened - journal read/replay must now ignore, and blacklist, any journal entries newer than the most recent flush entry in the journal. Also, the way the read_entire_journal option is handled has been improved; struct journal_replay now has an entry, 'ignore', for entries that were read but should not be used. - assorted refactoring and improvements related to journal read in journal_io.c and recovery.c Previously, we'd have to issue a flush/fua write every time we accumulated a full journal entry - typically the bucket size. Now we need to issue them much less frequently: when an fsync is requested, or it's been more than write_delay_ms since the last flush, or when we need to free up space in the journal. This is a significant performance improvement on many write heavy workloads. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 7 +- fs/bcachefs/journal.c | 54 +++++++++- fs/bcachefs/journal.h | 2 +- fs/bcachefs/journal_io.c | 208 +++++++++++++++++++++++++++++------- fs/bcachefs/journal_io.h | 3 +- fs/bcachefs/journal_reclaim.c | 10 +- fs/bcachefs/journal_seq_blacklist.c | 5 +- fs/bcachefs/journal_types.h | 8 ++ fs/bcachefs/recovery.c | 166 ++++++++++++---------------- 9 files changed, 312 insertions(+), 151 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f072e865e43f..7df2bc7ecd4f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ - x(new_varint, 15) + x(new_varint, 15) \ + x(journal_no_flush, 16) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_new_varint))\ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1582,6 +1584,7 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); #define BCH_JOURNAL_BUCKETS_MIN 8 diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3bbb23d7739a..31168754d6b8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j) struct journal_buf *buf = journal_cur_buf(j); bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); @@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct journal_buf *buf; int ret = 0; - if (seq <= j->seq_ondisk) + if (seq <= j->flushed_seq_ondisk) return 1; spin_lock(&j->lock); @@ -585,16 +587,53 @@ int 
bch2_journal_flush_seq_async(struct journal *j, u64 seq, goto out; } - if (seq <= j->seq_ondisk) { + if (seq <= j->flushed_seq_ondisk) { ret = 1; goto out; } - if (parent && - (buf = journal_seq_to_buf(j, seq))) - if (!closure_wait(&buf->wait, parent)) + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, last_unwritten_seq(j)); + +recheck_need_open: + if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + struct journal_res res = { 0 }; + + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (parent && !closure_wait(&buf->wait, parent)) BUG(); + bch2_journal_res_put(j, &res); + + spin_lock(&j->lock); + goto want_write; + } + + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } + + buf->must_flush = true; + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: if (seq == journal_cur_seq(j)) journal_entry_want_write(j); out: @@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; journal_pin_new_entry(j, 1); @@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" "nr direct reclaim:\t%llu\n" "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" @@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, + j->nr_flush_writes, + j->nr_noflush_writes, j->nr_direct_reclaim, j->nr_background_reclaim, j->cur_entry_sectors, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1b6175cd6f1b..2c0014c3c02f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); return j->pin.back - 1; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1aeeb58d3c2a..26556bb381b2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -10,9 +10,26 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include "trace.h" +static void __journal_replay_free(struct journal_replay *i) +{ + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(i); +} + struct journal_list { struct closure cl; struct mutex lock; @@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); - __le64 last_seq; + u64 last_seq = 0; int ret; - last_seq = !list_empty(jlist->head) - ? 
list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - if (!c->opts.read_entire_journal) { - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; + list_for_each_entry_reverse(i, jlist->head, list) { + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + break; } + } - /* Drop entries we don't need anymore */ + /* Is this entry older than the range we need? */ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < last_seq) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + if (!JSET_NO_FLUSH(j)) { list_for_each_entry_safe(i, pos, jlist->head, list) { if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + journal_replay_free(c, i); } } @@ -80,9 +98,7 @@ add: if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { if (i->bad) { devs = i->devs; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + __journal_replay_free(i); } else if (bad) { goto found; } else { @@ -104,6 +120,7 @@ add: list_add(&i->list, where); i->devs = devs; i->bad = bad; + i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -698,14 +715,16 @@ err: goto out; } -int bch2_journal_read(struct bch_fs *c, struct list_head *list) +int bch2_journal_read(struct bch_fs *c, struct list_head *list, + u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i; + struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; + u64 seq, last_seq = 0; int ret = 0; closure_init_stack(&jlist.cl); @@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; + if (list_empty(list)) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + i = list_last_entry(list, struct journal_replay, list); + *start_seq = le64_to_cpu(i->j.seq) + 1; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + list_for_each_entry_safe_reverse(i, t, list, list) { + if (i->ignore) + continue; + + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + journal_replay_free(c, i); + } + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + return -1; + } + + /* Drop blacklisted entries and entries older than last_seq: */ + list_for_each_entry_safe(i, t, list, list) { + if (i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + "found blacklisted journal entry %llu", seq); + + journal_replay_free(c, i); + } + } + + /* Check for missing entries: */ + seq = last_seq; + list_for_each_entry(i, list, list) { + if (i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + 
missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + last_seq, *blacklist_seq - 1); + } + + seq++; + } + list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; + if (i->ignore) + continue; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) entries++; } - if (!list_empty(list)) { - i = list_last_entry(list, struct journal_replay, list); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, *start_seq); - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, le64_to_cpu(i->j.seq)); - } + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); fsck_err: return ret; } @@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl) j->seq_ondisk = seq; if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; - j->last_seq_ondisk = last_seq; - bch2_journal_space_available(j); + + if (!w->noflush) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + } /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && + !w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = cpu_to_le64(j->last_seq_ondisk); + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + } + spin_unlock(&j->lock); + /* * New btree roots are set by journalling them; when the journal entry * gets written we have to propagate them to c->btree_roots @@ -1183,11 +1307,12 @@ retry_alloc: sectors); bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + if (!JSET_NO_FLUSH(jset)) + bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; bch2_bio_map(bio, jset, sectors << 9); trace_journal_write(bio); @@ -1196,18 +1321,19 @@ retry_alloc: ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - + if (!JSET_NO_FLUSH(jset)) { + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + } no_io: bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_io.h 
b/fs/bcachefs/journal_io.h index 6958ee0f8cf2..6b4c80968f52 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -11,6 +11,7 @@ struct journal_replay { struct bch_devs_list devs; /* checksum error, but we may want to try using it anyways: */ bool bad; + bool ignore; /* must be last: */ struct jset j; }; @@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_read(struct bch_fs *, struct list_head *); +int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c6267284a028..a3d5405991b9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned clean; + unsigned clean, clean_ondisk, total; unsigned overhead, u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); @@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j) for (i = 0; i < journal_space_nr; i++) j->space[i] = __journal_space_available(j, nr_devs_want, i); + clean_ondisk = j->space[journal_space_clean_ondisk].total; clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; else if (!fifo_free(&j->pin)) ret = cur_entry_journal_pin_full; + if ((clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean )) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + overhead = DIV_ROUND_UP(clean, max_entry_size) * journal_entry_overhead(j); u64s_remaining = clean << 6; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index d0f1bbf8f6a7..e1b63f3879f4 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -118,7 +118,7 @@ out_write_sb: out: mutex_unlock(&c->sb_lock); - return ret; + return ret ?: bch2_blacklist_table_initialize(c); } static int journal_seq_blacklist_table_cmp(const void *_l, @@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); - BUG_ON(c->journal_seq_blacklist_table); - if (!bl) return 0; @@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) journal_seq_blacklist_table_cmp, NULL); + kfree(c->journal_seq_blacklist_table); c->journal_seq_blacklist_table = t; return 0; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 6b525dc6ab7c..cf9675310f2b 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -29,6 +29,8 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -146,6 +148,7 @@ enum { JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, }; /* Embedded in struct bch_fs */ @@ -203,6 +206,7 @@ struct journal { /* seq, last_seq from the 
most recent journal entry successfully written */ u64 seq_ondisk; + u64 flushed_seq_ondisk; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; @@ -252,11 +256,15 @@ struct journal { unsigned write_delay_ms; unsigned reclaim_delay_ms; + unsigned long last_flush_write; u64 res_get_blocked_start; u64 need_write_time; u64 write_start_time; + u64 nr_flush_writes; + u64 nr_noflush_writes; + struct bch2_time_stats *write_time; struct bch2_time_stats *delay_time; struct bch2_time_stats *blocked_time; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7ad5b8234747..ecd51d45743a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) static struct journal_keys journal_keys_sort(struct list_head *journal_entries) { - struct journal_replay *p; + struct journal_replay *i; struct jset_entry *entry; struct bkey_i *k, *_n; struct journal_keys keys = { NULL }; @@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (list_empty(journal_entries)) return keys; - keys.journal_seq_base = - le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); - - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + if (!keys.journal_seq_base) + keys.journal_seq_base = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); if (!keys.d) goto err; - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + + for_each_jset_key(k, _n, entry, &i->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(p->j.seq) - + .journal_seq = le64_to_cpu(i->j.seq) - keys.journal_seq_base, - .journal_offset = k->_data - p->j._data, + .journal_offset = k->_data - i->j._data, }; } @@ -643,46 +643,6 @@ err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - -static int -verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, - struct list_head *journal) -{ - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); - u64 start_seq = le64_to_cpu(i->j.last_seq); - u64 end_seq = le64_to_cpu(i->j.seq); - u64 seq = start_seq; - int ret = 0; - - list_for_each_entry(i, journal, list) { - if (le64_to_cpu(i->j.seq) < start_seq) - continue; - - fsck_err_on(seq != le64_to_cpu(i->j.seq), c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - seq, le64_to_cpu(i->j.seq) - 1, - start_seq, end_seq); - - seq = le64_to_cpu(i->j.seq); - - fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, - "found blacklisted journal entry %llu", seq); - - do { - seq++; - } while (bch2_journal_seq_is_blacklisted(c, seq, false)); - } -fsck_err: - return ret; -} - /* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, @@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean, struct list_head *journal) { + struct journal_replay *i; struct jset_entry *entry; int ret; @@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); + list_for_each_entry(i, journal, list) { + if (i->ignore) + continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - list_for_each_entry(i, journal, list) vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) return ret; } + } } bch2_fs_usage_initialize(c); @@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!c->sb.clean || !j) - return 0; - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), @@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; - u64 journal_seq; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; bool write_sb = false, need_write_alloc = false; int ret; @@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct jset *j; + struct journal_replay *i; - ret = bch2_journal_read(c, &c->journal_entries); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, + list_for_each_entry_reverse(i, &c->journal_entries, list) + if (!i->ignore) { + last_journal_entry = &i->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&c->journal_entries)) { - bch_err(c, "no journal entries found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; - goto err; + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + goto use_clean; } c->journal_keys = journal_keys_sort(&c->journal_entries); @@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - j = &list_last_entry(&c->journal_entries, - struct journal_replay, list)->j; - - ret = verify_superblock_clean(c, &clean, j); - if (ret) + if (c->sb.clean && last_journal_entry) { + ret = verify_superblock_clean(c, &clean, + 
last_journal_entry); + if (ret) + goto err; + } + } else { +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; - journal_seq = le64_to_cpu(j->seq) + 1; - } else { - journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } if (!c->sb.clean && @@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (!c->sb.clean) { + /* + * After an unclean shutdown, skip then next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 8); + blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; } - - journal_seq += 8; - - /* - * The superblock needs to be written before we do any btree - * node writes: it will be in the read_write() path - */ - } - - ret = bch2_blacklist_table_initialize(c); - - if (!list_empty(&c->journal_entries)) { - ret = verify_journal_entries_not_blacklisted_or_missing(c, - &c->journal_entries); - if (ret) - goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq, -- cgit From 5d32c5bb076e76232e56fd44c537046abd54d32c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Dec 2020 16:25:05 -0500 Subject: bcachefs: Be more conservation about journal pre-reservations - Try to always keep 1/8th of the journal free, on top of pre-reservations - Move the check for whether the journal is stuck to bch2_journal_space_available, and make it only fire when there aren't any journal writes in flight (that might free up space by updating last_seq) Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 25 ++++++++----------------- fs/bcachefs/journal.h | 3 ++- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/journal_reclaim.c | 35 +++++++++++++++++++++++++++-------- fs/bcachefs/journal_types.h | 1 + 5 files changed, 40 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 31168754d6b8..9c0de18930ac 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -442,20 +442,6 @@ unlock: if (!ret) goto retry; - if (WARN_ONCE(ret == cur_entry_journal_full && - !can_discard && - (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full")) { - char *buf; - - buf = kmalloc(4096, GFP_NOFS); - if (buf) { - bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); - pr_err("\n%s", buf); - kfree(buf); - } - } - /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -1139,7 +1125,7 @@ out: /* debug: */ -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; @@ -1147,7 +1133,6 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned i; rcu_read_lock(); - spin_lock(&j->lock); s = READ_ONCE(j->reservations); pr_buf(out, @@ -1247,10 +1232,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } - 
spin_unlock(&j->lock); rcu_read_unlock(); } +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + spin_lock(&j->lock); + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); +} + void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { struct journal_entry_pin_list *pin_list; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2c0014c3c02f..df353a18011b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -386,7 +386,7 @@ out: static inline bool journal_check_may_get_unreserved(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved <= s.remaining && + bool ret = s.reserved < s.remaining && fifo_free(&j->pin) > 8; lockdep_assert_held(&j->lock); @@ -510,6 +510,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 26556bb381b2..cb2cfbbf50d4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1098,7 +1098,6 @@ static void journal_write_done(struct closure *cl) if (!w->noflush) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = last_seq; - bch2_journal_space_available(j); } /* @@ -1122,6 +1121,8 @@ static void journal_write_done(struct closure *cl) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + bch2_journal_space_available(j); + closure_wake_up(&w->wait); journal_wake(j); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a3d5405991b9..0fba832d3670 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -159,7 +160,7 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - unsigned overhead, u64s_remaining = 0; + s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -208,22 +209,37 @@ void bch2_journal_space_available(struct journal *j) clean = j->space[journal_space_clean].total; total = j->space[journal_space_total].total; - if (!j->space[journal_space_discarded].next_entry) + if (!clean_ondisk && + j->reservations.idx == + j->reservations.unwritten_idx) { + char *buf = kmalloc(4096, GFP_ATOMIC); + + bch_err(c, "journal stuck"); + if (buf) { + __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); + } + + bch2_fatal_error(c); + ret = cur_entry_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; else if (!fifo_free(&j->pin)) ret = cur_entry_journal_pin_full; - if ((clean - clean_ondisk <= total / 8) && + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && (clean_ondisk * 2 > clean )) set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - overhead = DIV_ROUND_UP(clean, max_entry_size) * - 
journal_entry_overhead(j); - u64s_remaining = clean << 6; - u64s_remaining = max_t(int, 0, u64s_remaining - overhead); - u64s_remaining /= 4; + u64s_remaining = (u64) clean << 6; + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); + u64s_remaining /= 2; out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -572,6 +588,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) c->btree_cache.used * 3) min_nr = 1; + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); trace_journal_reclaim_start(c, diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index cf9675310f2b..1b130541f00b 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -172,6 +172,7 @@ struct journal { cur_entry_blocked, cur_entry_journal_full, cur_entry_journal_pin_full, + cur_entry_journal_stuck, cur_entry_insufficient_devices, } cur_entry_error; -- cgit From f51e84fe24d8d170bfbba626e76ee08b1ab7b283 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Dec 2020 21:03:57 -0500 Subject: bcachefs: Fix btree key cache dirty checks Had a typo that meant we were triggering journal reclaim _much_ more aggressively than needed. Also, fix a potential integer overflow. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 6 +++--- fs/bcachefs/journal_reclaim.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index d7d31a0662c3..dad3e344dcf9 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -4,8 +4,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t max_dirty = 4096 + nr_keys / 2; + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; return max_t(ssize_t, 0, nr_dirty - max_dirty); } @@ -13,7 +13,7 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; return nr_dirty > max_dirty; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0fba832d3670..5ab147e94167 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -240,6 +240,7 @@ void bch2_journal_space_available(struct journal *j) u64s_remaining -= (u64) total << 3; u64s_remaining = max(0LL, u64s_remaining); u64s_remaining /= 2; + u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; -- cgit From b18df768ebf71196c3620d1e5f23f064c1ba1485 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Dec 2020 16:29:13 -0500 Subject: bcachefs: Prevent journal reclaim from spinning Without checking if we actually flushed anything, journal reclaim could still go into an infinite loop while trying to shut down.
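Roughly, the reclaim loop needs a forward-progress check in addition to a work target. The following is an illustrative sketch of that pattern only, not the actual __bch2_journal_reclaim() code; compute_flush_target() and flush_some_pins() are hypothetical stand-ins for the real accounting and pin-flushing logic:

	do {
		/* how much do we want to flush this pass? (hypothetical helper) */
		min_nr = compute_flush_target(j);

		/* how much did we actually manage to flush? (hypothetical helper) */
		nr_flushed = flush_some_pins(j, min_nr);

		/*
		 * Stop once a pass makes no progress, even if min_nr is still
		 * nonzero - otherwise shutdown can spin here forever.
		 */
	} while (min_nr && nr_flushed);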
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5ab147e94167..e8fd11abe4c3 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -610,7 +610,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) else j->nr_background_reclaim += nr_flushed; trace_journal_reclaim_finish(c, nr_flushed); - } while (min_nr); + } while (min_nr && nr_flushed); memalloc_noreclaim_restore(flags); -- cgit From a2bfc8412ad8da289b933810232cf95f7739340a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Dec 2020 16:30:02 -0500 Subject: bcachefs: Try to print full btree error message Metadata corruption bugs are hard to debug if we can't see exactly what went wrong - try to allocate a bigger buffer so we can print out everything we have. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 15 +++++++++++---- fs/bcachefs/btree_iter.c | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b2ffdff48637..c100f930bb8f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -635,21 +635,26 @@ enum btree_validate_ret { ({ \ __label__ out; \ char _buf[300]; \ + char *buf2 = _buf; \ struct printbuf out = PBUF(_buf); \ \ + buf2 = kmalloc(4096, GFP_ATOMIC); \ + if (buf2) \ + out = _PBUF(buf2, 4986); \ + \ btree_err_msg(&out, c, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", _buf); \ + mustfix_fsck_err(c, "%s", buf2); \ goto out; \ } \ \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf); \ + bch_err(c, "%s", buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -670,7 +675,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", _buf); \ + bch_err(c, "corrupt metadata before write: %s", buf2); \ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -679,6 +684,8 @@ enum btree_validate_ret { break; \ } \ out: \ + if (buf2 != _buf) \ + kfree(buf2); \ true; \ }) @@ -844,7 +851,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, b, i, - "invalid bkey:\n%s\n%s", invalid, buf); + "invalid bkey: %s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6fa2b13e53bf..ea1735445202 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -875,9 +875,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf[100]; struct bkey uk = bkey_unpack_key(b, k); + bch2_dump_btree_node(iter->trans->c, l->b); bch2_bkey_to_text(&PBUF(buf), &uk); - panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", - buf, b->key.k.p.inode, b->key.k.p.offset); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %llu:%llu\n" + "iter key %s\n" + "new node %llu:%llu-%llu:%llu\n", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + buf, + b->data->min_key.inode, + b->data->min_key.offset, + b->key.k.p.inode, b->key.k.p.offset); } 
if (!parent_locked) -- cgit From d5b98fe2d764170e7a30eda8b94780aa6a0af129 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Dec 2020 11:44:12 -0500 Subject: bcachefs: Fix rand_delete() test When we didn't find a key to delete we were getting a null ptr deref. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 2acecbca212e..6d0f8e233e8b 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -563,15 +563,14 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - goto err; - k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret) goto err; + if (!k.k) + goto err; + bkey_init(&delete.k); delete.k.p = k.k->p; -- cgit From cc578a36f9953c32a8ba866ee1878fcbb99a9746 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Dec 2020 13:34:42 -0500 Subject: bcachefs: Fix __btree_iter_next() when all iters are in use Also, print out more information on btree transaction iterator overflow. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++++- fs/bcachefs/btree_iter.h | 8 +++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ea1735445202..239d7c5deddc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2017,9 +2017,10 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) { struct btree_iter *iter; + struct btree_insert_entry *i; trans_for_each_iter(trans, iter) - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n", bch2_btree_ids[iter->btree_id], iter->pos.inode, iter->pos.offset, @@ -2027,6 +2028,14 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", (void *) iter->ip_allocated); + + trans_for_each_update(trans, i) { + char buf[300]; + + bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); + printk(KERN_ERR "update: btree %s %s\n", + bch2_btree_ids[i->iter->btree_id], buf); + } panic("trans iter oveflow\n"); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ee8c4346aadb..9a7f8d0197ec 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -51,11 +51,17 @@ static inline int btree_iter_err(const struct btree_iter *iter) static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { - u64 l = trans->iters_linked >> idx; + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; + + l = trans->iters_linked >> idx; if (!l) return NULL; idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); EBUG_ON(trans->iters[idx].idx != idx); return &trans->iters[idx]; } -- cgit From 66bddc6c2b389a65708c27e7e7a9969e645ca799 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Dec 2020 13:39:30 -0500 Subject: bcachefs: Only try to get existing stripe once in stripe create path The stripe creation path was too state-machiney: it would always run the full state machine until it had successfully created a new stripe.
But if we tried to get and reuse an existing stripe after we'd already allocated some buckets, the buckets we'd allocated might have conflicted with the blocks in the existing stripe we need to keep - oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6c9259ee6742..db1c652f1ed4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -874,7 +874,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); if (ret) { - bch_err(c, "error creating stripe: error updating pointers"); + bch_err(c, "error creating stripe: error %i updating pointers", ret); break; } } @@ -1341,16 +1341,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, if (!h) return NULL; - if (!h->s && ec_new_stripe_alloc(c, h)) { - bch2_ec_stripe_head_put(c, h); - return NULL; - } - - if (!h->s->allocated) { - if (!h->s->existing_stripe && - (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { - //pr_info("got existing stripe %llu", idx); + if (!h->s) { + if (ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); + return NULL; + } + idx = get_existing_stripe(c, target, algo, redundancy); + if (idx >= 0) { h->s->existing_stripe = true; h->s->existing_stripe_idx = idx; if (get_stripe_key(c, idx, &h->s->stripe)) { @@ -1364,7 +1362,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ec_block_io(c, &h->s->stripe, READ, i, &cl); } } + } + if (!h->s->allocated) { if (!h->s->existing_stripe && !h->s->res.sectors) { ret = bch2_disk_reservation_get(c, &h->s->res, -- cgit From 719fe7fb555ad9a53bb847bfae1cad7170cb2591 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Dec 2020 13:13:56 -0500 Subject: bcachefs: Update transactional triggers interface to pass old & new keys This is needed to fix a bug where we're overflowing iterators within a btree transaction, because we're updating the stripes btree (to update block counts) and the stripes btree trigger is unnecessarily updating the alloc btree - it doesn't need to update the alloc btree when the pointers within a stripe aren't changing. 
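A rough sketch of the resulting calling convention (simplified, not the exact call sites; old_k stands for whatever key was previously at the iterator position and new for the bkey_i being written): the trigger now sees both sides of an update, with bkey_s_c_null standing in for a missing old or new key, so it can compare the two and skip work when nothing it cares about has changed:

	/* pure insert - nothing is being overwritten: */
	bch2_trans_mark_key(trans, bkey_s_c_null, bkey_i_to_s_c(new),
			    0, new->k.size, BTREE_TRIGGER_INSERT);

	/* pure overwrite/delete - nothing new at this position: */
	bch2_trans_mark_key(trans, old_k, bkey_s_c_null,
			    0, -((s64) old_k.k->size), BTREE_TRIGGER_OVERWRITE);

	/* same-position update - the trigger gets both keys and may no-op: */
	bch2_trans_mark_key(trans, old_k, bkey_i_to_s_c(new), 0, 0,
			    BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE);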
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 +- fs/bcachefs/buckets.c | 256 ++++++++++++++++++++++-------------- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/recovery.c | 8 +- 4 files changed, 172 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 594bcd797516..3ae920a223f9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -519,14 +519,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(k), 0, 0, BTREE_TRIGGER_INSERT); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(k), + bkey_s_c_null, 0, 0, BTREE_TRIGGER_OVERWRITE); if (ret) return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4762c5465ef0..44d08434855d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1338,10 +1338,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - fs_usage->nr_inodes++; - else - fs_usage->nr_inodes--; + fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; + fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1405,10 +1403,10 @@ int bch2_mark_update(struct btree_trans *trans, old = (struct bkey_s_c) { &unpacked, NULL }; if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - _old = bch2_btree_node_iter_peek(&node_iter, b); - if (_old) - old = bkey_disassemble(b, _old, &unpacked); + old = bch2_btree_iter_peek_slot(iter); + BUG_ON(bkey_err(old)); } else { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -1753,59 +1751,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + const struct bch_extent_ptr *ptr, + s64 sectors, bool parity) +{ + struct bkey_i_alloc *a; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + int ret; + + ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (ret) + return ret; + + if (parity) { + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? BCH_DATA_parity + : 0; + } + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c k, + struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? 
bkey_s_c_to_stripe(new).v : NULL; struct bch_replicas_padded r; - struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; - struct btree_iter *iter; - bool deleting = flags & BTREE_TRIGGER_OVERWRITE; - s64 sectors = le16_to_cpu(s->sectors); unsigned i; int ret = 0; - if (deleting) - sectors = -sectors; - - bch2_bkey_to_replicas(&r.e, k); - update_replicas_list(trans, &r.e, sectors * s->nr_redundant); - /* - * The allocator code doesn't necessarily update bucket gens in the - * btree when incrementing them, right before handing out new buckets - - * we just need to persist those updates here along with the new stripe: + * If the pointers aren't changing, we don't need to do anything: */ + if (new_s && old_s && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; - for (i = 0; i < s->nr_blocks && !ret; i++) { - bool parity = i >= nr_data; + if (new_s) { + unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; + s64 sectors = le16_to_cpu(new_s->sectors); - ret = bch2_trans_start_alloc_update(trans, &iter, - &s->ptrs[i], &u); - if (ret) - break; + bch2_bkey_to_replicas(&r.e, new); + update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); - if (parity) { - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? BCH_DATA_parity - : 0; + for (i = 0; i < new_s->nr_blocks; i++) { + bool parity = i >= nr_data; + + ret = bch2_trans_mark_stripe_alloc_ref(trans, + &new_s->ptrs[i], sectors, parity); + if (ret) + return ret; } + } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto put_iter; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); -put_iter: - bch2_trans_iter_put(trans, iter); + if (old_s) { + unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); + update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + + for (i = 0; i < old_s->nr_blocks; i++) { + bool parity = i >= nr_data; + + ret = bch2_trans_mark_stripe_alloc_ref(trans, + &old_s->ptrs[i], sectors, parity); + if (ret) + return ret; + } } return ret; @@ -1904,11 +1935,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, +int bch2_trans_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, unsigned flags) { - struct replicas_delta_list *d; struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + struct replicas_delta_list *d; + + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); switch (k.k->type) { case KEY_TYPE_btree_ptr: @@ -1924,15 +1960,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, k, flags); - case KEY_TYPE_inode: - d = replicas_deltas_realloc(trans, 0); + return bch2_trans_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: { + int nr = (new.k->type == KEY_TYPE_inode) - + (old.k->type == KEY_TYPE_inode); + + if (nr) { + d = replicas_deltas_realloc(trans, 0); + d->nr_inodes += nr; + } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - d->nr_inodes++; - else - d->nr_inodes--; return 0; + } case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1956,12 +1995,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, int bch2_trans_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, unsigned flags) { - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_s_c old; int ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1970,68 +2007,93 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), - 0, insert->k.size, BTREE_TRIGGER_INSERT); - if (ret) - return ret; - - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - struct bkey_cached *ck = (void *) iter->l[0].b; + if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); + BUG_ON(bkey_err(old)); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), - 0, 0, BTREE_TRIGGER_OVERWRITE); - } + BUG_ON(!ck->valid); + old = bkey_i_to_s_c(ck->k); + } - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + if (old.k->type == new->k.type) { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + struct btree *b = iter_l(iter)->b; + struct btree_node_iter node_iter = iter_l(iter)->iter; + struct bkey_packed *_old; struct bkey unpacked; - struct bkey_s_c k; - unsigned offset = 0; - s64 sectors = 0; - unsigned flags = BTREE_TRIGGER_OVERWRITE; - k = bkey_disassemble(b, _k, &unpacked); + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - if (btree_node_is_extents(b) - ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k.p, k.k->p)) - break; + bkey_init(&unpacked); + old = (struct bkey_s_c) { &unpacked, NULL }; + + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + 0, new->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + return ret; + + while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { + unsigned flags = BTREE_TRIGGER_OVERWRITE; + unsigned offset = 0; + s64 sectors; + + old = bkey_disassemble(b, _old, &unpacked); + sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + return 0; - if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k, k.k)) { + switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; - sectors = -((s64) k.k->size); + sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: offset = 0; - sectors = bkey_start_offset(k.k) - - insert->k.p.offset; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = -((s64) insert->k.size); + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); - } - ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); - if (ret) - return ret; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + offset, sectors, flags); + if (ret) + return ret; - bch2_btree_node_iter_advance(&node_iter, b); + bch2_btree_node_iter_advance(&node_iter, b); + } } - return 0; + return ret; } /* Disk reservations: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c85015071c6d..7ee63413f83c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -270,7 +270,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *, int bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ecd51d45743a..1883a1faf380 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -458,7 +458,9 @@ retry: bch2_btree_iter_set_pos(iter, split->k.p); if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), + ret = bch2_trans_mark_key(&trans, + bkey_s_c_null, + bkey_i_to_s_c(split), 0, split->k.size, BTREE_TRIGGER_INSERT); if (ret) @@ -467,7 +469,9 @@ retry: } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(&trans, + bkey_i_to_s_c(k), + bkey_s_c_null, 0, -((s64) k->k.size), BTREE_TRIGGER_OVERWRITE); if (ret) -- cgit From 5b9bf43c81e6c0fd9d4f16351d53f26e7e6d19b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Dec 2020 13:38:54 -0500 Subject: bcachefs: Always check if we need disk res in extent update path With erasure 
coding, we now have processes in the background that compact data, causing it to take up less space on disk than when it was written, or potentially when it was read. This means that we can't trust the page cache when it says "we have data on disk taking up x amount of space here" - there's always the potential to race with background compaction. To fix this, just check if we need to add to our disk reservation in the bch2_extent_update() path, in the transaction that will do the btree update. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 59 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9603381bb7ce..2b381fc96009 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -195,34 +195,33 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, static int sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, - bool may_allocate, bool *maybe_extending, - s64 *delta) + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { struct btree_iter *iter; struct bkey_s_c old; int ret = 0; - *maybe_extending = true; - *delta = 0; + *maybe_extending = true; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; iter = bch2_trans_copy_iter(trans, extent_iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { - if (!may_allocate && - bch2_bkey_nr_ptrs_fully_allocated(old) < - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { - ret = -ENOSPC; - break; - } + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * + *i_sectors_delta += sectors * (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); + *disk_sectors_delta += sectors * + (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - + bch2_bkey_nr_ptrs_fully_allocated(old)); + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* * Check if there's already data above where we're @@ -256,12 +255,12 @@ int bch2_extent_update(struct btree_trans *trans, struct disk_reservation *disk_res, u64 *journal_seq, u64 new_i_size, - s64 *i_sectors_delta) + s64 *i_sectors_delta_total) { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; bool extending = false; - s64 delta = 0; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; ret = bch2_extent_trim_atomic(k, iter); @@ -269,16 +268,26 @@ int bch2_extent_update(struct btree_trans *trans, return ret; ret = sum_sector_overwrites(trans, iter, k, - disk_res && disk_res->sectors != 0, - &extending, &delta); + &extending, + &i_sectors_delta, + &disk_sectors_delta); if (ret) return ret; + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + 0); + if (ret) + return ret; + } + new_i_size = extending ? 
min(k->k.p.offset << 9, new_i_size) : 0; - if (delta || new_i_size) { + if (i_sectors_delta || new_i_size) { struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; @@ -305,9 +314,9 @@ int bch2_extent_update(struct btree_trans *trans, else new_i_size = 0; - inode_u.bi_sectors += delta; + inode_u.bi_sectors += i_sectors_delta; - if (delta || new_i_size) { + if (i_sectors_delta || new_i_size) { bch2_inode_pack(trans->c, &inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); @@ -322,10 +331,12 @@ int bch2_extent_update(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); - if (!ret && i_sectors_delta) - *i_sectors_delta += delta; + if (ret) + return ret; - return ret; + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; } int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, -- cgit From 537c49d6afadb4be54be03c9a8cb1f1ade07b104 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Dec 2020 12:02:48 -0500 Subject: bcachefs: Fix btree node merge -> split operations If a btree node merger is followed by a split or compact of the parent node, we could end up with the parent btree node iterator pointing to the whiteout inserted by the btree node merge operation - the fix is to ensure that interior btree node iterators always point to the first non whiteout. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 239d7c5deddc..c0333ee94463 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -902,6 +902,13 @@ static inline void __btree_iter_init(struct btree_iter *iter, bch2_btree_node_iter_init(&l->iter, l->b, &pos); + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -- cgit From 1d8305c11a289a13591d4c51726803cd37d8f646 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Dec 2020 16:12:04 -0500 Subject: bcachefs: Add some cond_rescheds() in shutdown path Particularly on emergency shutdown we can end up having to clean up a lot of dirty cached btree keys here. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7416e7a49893..e6808d7139c6 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -581,6 +581,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_splice(&bc->dirty, &bc->clean); list_for_each_entry_safe(ck, n, &bc->clean, list) { + cond_resched(); + bch2_journal_pin_drop(&c->journal, &ck->journal); bch2_journal_preres_put(&c->journal, &ck->res); @@ -594,6 +596,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) BUG_ON(bc->nr_keys); list_for_each_entry_safe(ck, n, &bc->freed, list) { + cond_resched(); + list_del(&ck->list); kmem_cache_free(bch2_key_cache, ck); } -- cgit From ded54580bdf18ba3a2b38e7910c54b1c53f007c6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Dec 2020 14:18:33 -0500 Subject: bcachefs: Check for duplicate device ptrs in bch2_bkey_ptrs_invalid() This is something we clearly should be checking for, but weren't - oops. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++++++ fs/bcachefs/replicas.c | 5 ----- fs/bcachefs/util.h | 5 +++++ 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index f9838c1f36db..7cdfd09d797e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1045,11 +1045,13 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_devs_list devs; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; const char *reason; unsigned nonce = UINT_MAX; + unsigned i; if (k.k->type == KEY_TYPE_btree_ptr) size_ondisk = c->opts.btree_node_size; @@ -1100,6 +1102,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } + devs = bch2_bkey_devs(k); + bubble_sort(devs.devs, devs.nr, u8_cmp); + for (i = 0; i + 1 < devs.nr; i++) + if (devs.devs[i] == devs.devs[i + 1]) + return "multiple ptrs to same device"; + return NULL; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 85c97f67936a..57c2e66edad1 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* Replicas tracking - in memory: */ -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - static void verify_replicas_entry(struct bch_replicas_entry *e) { #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7b7c638d8904..91aa8c0a0e09 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -750,4 +750,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); #define cmp_int(l, r) ((l > r) - (l < r)) +static inline int u8_cmp(u8 l, u8 r) +{ + return cmp_int(l, r); +} + #endif /* _BCACHEFS_UTIL_H */ -- cgit From ffb7c3d370a104d14ad0658b359cdf04ae679f04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Dec 2020 14:23:27 -0500 Subject: bcachefs: Add BCH_BKEY_PTRS_MAX This now means "the maximum number of pointers within a bkey" - and bch_devs_list is updated to use it instead of BCH_REPLICAS_MAX, since stripes can contain more than BCH_REPLICAS_MAX pointers. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/ec.c | 8 ++++---- fs/bcachefs/ec.h | 8 ++++---- fs/bcachefs/ec_types.h | 6 ++---- fs/bcachefs/super_types.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7df2bc7ecd4f..3f8281b5db41 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1363,6 +1363,8 @@ enum bch_sb_compat { #define BCH_REPLICAS_MAX 4U +#define BCH_BKEY_PTRS_MAX 16U + enum bch_error_actions { BCH_ON_ERROR_CONTINUE = 0, BCH_ON_ERROR_RO = 1, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index db1c652f1ed4..95abc00bd0e0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -300,7 +300,7 @@ static unsigned ec_nr_failed(struct ec_stripe_buf *buf) static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; - unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; @@ -1101,7 +1101,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); @@ -1211,13 +1211,13 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) struct open_bucket *ob; unsigned i, nr_have, nr_data = min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + BCH_BKEY_PTRS_MAX) - h->redundancy; bool have_cache = true; int ret = 0; devs = h->devs; - for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { + for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); --nr_data; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 15f751fc2a35..450bb1a113a3 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -71,9 +71,9 @@ struct ec_stripe_buf { /* might not be buffering the entire stripe: */ unsigned offset; unsigned size; - unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - void *data[EC_STRIPE_MAX]; + void *data[BCH_BKEY_PTRS_MAX]; union { struct bkey_i_stripe key; @@ -101,10 +101,10 @@ struct ec_stripe_new { bool existing_stripe; u64 existing_stripe_idx; - unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; struct open_buckets blocks; - u8 data_block_idx[EC_STRIPE_MAX]; + u8 data_block_idx[BCH_BKEY_PTRS_MAX]; struct open_buckets parity; struct disk_reservation res; diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e4d633fca5bf..5b688b4394f7 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,11 +4,9 @@ #include -#define EC_STRIPE_MAX 16 - struct bch_replicas_padded { struct bch_replicas_entry e; - u8 pad[EC_STRIPE_MAX]; + u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -24,7 +22,7 @@ struct stripe { unsigned dirty:1; unsigned on_heap:1; u8 blocks_nonempty; - u16 block_sectors[EC_STRIPE_MAX]; + u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 6d0168a73ee4..e3a989e3e9d9 100644 --- a/fs/bcachefs/super_types.h +++ 
b/fs/bcachefs/super_types.h @@ -21,7 +21,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_REPLICAS_MAX + 1]; + u8 devs[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { -- cgit From f30dd8601262c74caf148fe834418ad7c931af66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Oct 2020 21:39:16 -0400 Subject: bcachefs: Don't write bucket IO time lazily With the btree key cache code, we don't need to update the alloc btree lazily - and this will mean we can remove the bch2_alloc_write() call in the shutdown path. Future work: we really need to expend the bucket IO clocks from 16 to 64 bits, so that we don't have to rescale them. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 48 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 2 ++ fs/bcachefs/alloc_foreground.c | 2 -- fs/bcachefs/buckets.h | 6 ------ fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/io.c | 16 ++++++++------ fs/bcachefs/io.h | 6 +++--- fs/bcachefs/move.c | 7 +++--- 8 files changed, 67 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 078968f30175..1ef695acc7d6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -489,6 +489,54 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) mutex_init(&clock->lock); } +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter *iter; + struct bucket *g; + struct bkey_i_alloc *a; + struct bkey_alloc_unpacked u; + u16 *time; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); + u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + + time = rw == READ ? 
&u.read_time : &u.write_time; + if (*time == c->bucket_clock[rw].hand) + goto out; + + *time = c->bucket_clock[rw].hand; + + bch2_alloc_pack(a, u); + + ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + /* Background allocator thread: */ /* diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8e3abb89dfb7..d10ff56e4de1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -31,6 +31,8 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + static inline struct bkey_alloc_unpacked alloc_mem_to_key(struct bucket *g, struct bucket_mark m) { diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 169ddfad7ea0..82a49831afb7 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -309,8 +309,6 @@ out: .dev = ca->dev_idx, }; - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); spin_unlock(&ob->lock); if (c->blocked_allocate_open_bucket) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7ee63413f83c..2e9c4e46c61c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,12 +58,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, - size_t b, int rw) -{ - bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; -} - static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) { return c->bucket_clock[rw].hand - g->io_time[rw]; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2d31547446ac..4f270b6cdf66 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -826,7 +826,7 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(c, rbio, k, offset_into_extent, flags); + bch2_read_extent(trans, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 2b381fc96009..a67bd18f6b8c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_on_stack.h" #include "bset.h" @@ -1640,7 +1641,7 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1698,7 +1699,7 @@ retry: bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: @@ -2026,11 +2027,12 @@ err: return ret; } -int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; @@ -2200,9 +2202,9 @@ get_bio: bch2_increment_clock(c, 
bio_sectors(&rbio->bio), READ); - rcu_read_lock(); - bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - rcu_read_unlock(); + if (pick.ptr.cached) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); @@ -2351,7 +2353,7 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(c, rbio, k, offset_into_extent, flags); + bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index be4aa3875360..379263a935fa 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -140,17 +140,17 @@ enum bch_read_flags { BCH_READ_IN_RETRY = 1 << 7, }; -int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned); -static inline void bch2_read_extent(struct bch_fs *c, +static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, offset_into_extent, NULL, flags); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9d190ae4f391..62a6bbd676ae 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -414,7 +414,7 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } -static int bch2_move_extent(struct bch_fs *c, +static int bch2_move_extent(struct btree_trans *trans, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, @@ -423,6 +423,7 @@ static int bch2_move_extent(struct bch_fs *c, enum data_cmd data_cmd, struct data_opts data_opts) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; const union bch_extent_entry *entry; @@ -489,7 +490,7 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, k, 0, + bch2_read_extent(trans, &io->rbio, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -607,7 +608,7 @@ peek: k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { -- cgit From d483dd17e2bfd6858498d39eb92abd232d7b8e97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Dec 2020 15:41:29 -0500 Subject: bcachefs: Fix race between journal_seq_copy() and journal_seq_drop() In bch2_btree_interior_update_will_free_node, we copy the journal pins from outstanding writes on the btree node we're about to free. But, this can race with the writes completing, and dropping their journal pins. To guard against this, just use READ_ONCE() in bch2_journal_pin_copy(). 
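The underlying bug is the familiar check-then-use race on a field another thread can clear concurrently; taking a single snapshot of the field closes the window. A minimal illustration of the idiom, with invented types standing in for the journal pin structures and a userspace stand-in for the kernel's READ_ONCE():

#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))	/* userspace stand-in */

struct pin {
	unsigned long long seq;		/* 0 means "not pinned" */
};

/*
 * Racy: seq can be cleared between the check and the use, so the copy may
 * read a different (or zero) value the second time around.
 */
static void pin_copy_racy(struct pin *dst, struct pin *src)
{
	if (src->seq)
		dst->seq = src->seq;
}

/* Fixed: snapshot once, so the check and the use see the same value */
static void pin_copy_fixed(struct pin *dst, struct pin *src)
{
	unsigned long long seq = READ_ONCE(src->seq);

	if (seq)
		dst->seq = seq;
}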
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 14 ++++++++++++-- fs/bcachefs/journal_reclaim.h | 7 +++++-- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e8fd11abe4c3..1141b7d3a060 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -384,12 +384,22 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin_list *pin_list; spin_lock(&j->lock); + + if (seq < journal_last_seq(j)) { + /* + * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on + * the src pin - with the pin dropped, the entry to pin might no + * longer to exist, but that means there's no longer anything to + * copy and we can bail out here: + */ + spin_unlock(&j->lock); + return; + } + pin_list = journal_seq_pin(j, seq); __journal_pin_drop(j, pin); - BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); - atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index f02caa3d49ea..adf1f5c981cd 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -53,8 +53,11 @@ static inline void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - if (journal_pin_active(src)) - bch2_journal_pin_add(j, src->seq, dst, flush_fn); + /* Guard against racing with journal_pin_drop(src): */ + u64 seq = READ_ONCE(src->seq); + + if (seq) + bch2_journal_pin_add(j, seq, dst, flush_fn); } static inline void bch2_journal_pin_update(struct journal *j, u64 seq, -- cgit From e323edd6d39094ce021dbb20e513b03ae6ebecbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Dec 2020 15:39:10 -0500 Subject: bcachefs: Fix for spinning in journal reclaim on startup We normally avoid having too many dirty keys in the btree key cache, to ensure that we can always shrink our caches to reclaim memory if needed. But this check was causing us to go into an infinite loop on startup, in the btree insert path before journal reclaim was started. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index dad3e344dcf9..2f8b5521718a 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -16,7 +16,8 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty > max_dirty && + test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); } struct bkey_cached * -- cgit From 07bd4c285b79e068d2e6986a4cc60703434f1eed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Dec 2020 21:31:05 -0500 Subject: bcachefs: Fix btree lock being incorrectly dropped __btree_trans_get_iter() was using bch2_btree_iter_upgrade, but it shouldn't have been because on failure bch2_btree_iter_upgrade may drop locks in other iterators, expecting the transaction to be restarted. But __btree_trans_get_iter can't return an error to indicate that we need to restart thet transaction - oops. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++++--- fs/bcachefs/btree_update_leaf.c | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c0333ee94463..15963a657c72 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2124,9 +2124,12 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter->flags &= ~BTREE_ITER_USER_FLAGS; iter->flags |= flags & BTREE_ITER_USER_FLAGS; - if (iter->flags & BTREE_ITER_INTENT) - bch2_btree_iter_upgrade(iter, 1); - else + if (iter->flags & BTREE_ITER_INTENT) { + if (!iter->locks_want) { + __bch2_btree_iter_unlock(iter); + iter->locks_want = 1; + } + } else bch2_btree_iter_downgrade(iter); BUG_ON(iter->btree_id != btree_id); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e27ec0fbee2c..a2ec2e58f9e4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -869,8 +869,8 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && - (ret = bch2_btree_iter_traverse(i->iter)))) { + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { trace_trans_restart_traverse(trans->ip); goto out; } @@ -879,8 +879,8 @@ int __bch2_trans_commit(struct btree_trans *trans) * We're not using bch2_btree_iter_upgrade here because * we know trans->nounlock can't be set: */ - if (unlikely(i->iter->locks_want < 1 && - !__bch2_btree_iter_upgrade(i->iter, 1))) { + if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && + !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto out; -- cgit From f0e70018d14ef94a5f680c977591ccb6cf29e9ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Dec 2020 21:42:19 -0500 Subject: bcachefs: Fix iterator overflow in move path The move path was calling bch2_bucket_io_time_reset() for cached pointers (which it shouldn't have been), and then not calling bch2_trans_reset() when it got -EINTR (indicating transaction restart). Oops. 
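The second half of the fix is the standard transaction-restart discipline: -EINTR means the btree transaction was restarted, so per-transaction state must be reset before retrying the same extent instead of falling through to the generic error paths. A compilable toy sketch of that control flow (all names invented; the real code uses bch2_trans_reset() and bch2_move_extent()):

#include <errno.h>

struct toy_trans {
	int	iters_live;	/* stand-in for per-transaction iterator state */
};

static void toy_trans_reset(struct toy_trans *t)
{
	t->iters_live = 0;
}

/* stand-in for bch2_move_extent(); a real caller might see -EINTR here */
static int toy_move_extent(struct toy_trans *t, unsigned idx)
{
	t->iters_live++;	/* pretend we took a btree iterator */
	(void) idx;
	return 0;
}

static int toy_move_loop(struct toy_trans *t, unsigned nr_extents)
{
	unsigned idx = 0;

	while (idx < nr_extents) {
		int ret = toy_move_extent(t, idx);

		if (ret == -EINTR) {
			toy_trans_reset(t);	/* drop stale iterator state */
			continue;		/* retry the same extent */
		}
		if (ret)
			return ret;		/* real error: bail out */

		idx++;
	}

	return 0;
}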
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 +++++- fs/bcachefs/move.c | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a67bd18f6b8c..ee2ba1b8aff9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2202,7 +2202,11 @@ get_bio: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (pick.ptr.cached) + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 62a6bbd676ae..1b1a14d2fa23 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -611,6 +611,12 @@ peek: ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { + if (ret2 == -EINTR) { + bch2_trans_reset(&trans, 0); + bch2_trans_cond_resched(&trans); + continue; + } + if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); -- cgit From 3187aa8d57025f60f1b8f9e14b6fc33f5e2d2960 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Dec 2020 17:17:18 -0500 Subject: bcachefs: Don't use BTREE_INSERT_USE_RESERVE so much Previously, we were using BTREE_INSERT_RESERVE in a lot of places where it no longer makes sense. - we now have more open_buckets than we used to, and the reserves work better, so we shouldn't need to use BTREE_INSERT_RESERVE just because we're holding open_buckets pinned anymore. - We have the btree key cache for updates to the alloc btree, meaning we no longer need the btree reserve to ensure the allocator can make forward progress. This means that we should only need a reserve for btree updates to ensure that copygc can make forward progress. Since it's now just for copygc, we can also fold RESERVE_BTREE into RESERVE_MOVINGGC (the allocator's freelist reserve). 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 +++------- fs/bcachefs/alloc_foreground.c | 14 +------------- fs/bcachefs/alloc_types.h | 8 +++----- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/btree_key_cache.c | 2 -- fs/bcachefs/btree_update.h | 2 -- fs/bcachefs/btree_update_interior.c | 23 +++-------------------- fs/bcachefs/btree_update_leaf.c | 3 +-- fs/bcachefs/buckets.c | 3 +-- fs/bcachefs/ec.c | 3 +-- fs/bcachefs/io.c | 3 +-- fs/bcachefs/journal.c | 2 +- fs/bcachefs/move.c | 1 - fs/bcachefs/movinggc.c | 5 +++++ fs/bcachefs/sysfs.c | 2 -- 15 files changed, 20 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1ef695acc7d6..9920e902d383 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -319,9 +319,7 @@ retry: bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - flags); + BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) goto retry; @@ -575,8 +573,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (available > fifo_free(&ca->free_inc) || (available && - (!fifo_full(&ca->free[RESERVE_BTREE]) || - !fifo_full(&ca->free[RESERVE_MOVINGGC])))) + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) break; up_read(&c->gc_lock); @@ -977,8 +974,7 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| flags); if (ret == -EINTR) goto retry; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 82a49831afb7..1ea8ee99956b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -204,10 +204,8 @@ success: static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_ALLOC: + case RESERVE_MOVINGGC: return 0; - case RESERVE_BTREE: - return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; } @@ -263,16 +261,6 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, goto out; switch (reserve) { - case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_BTREE: - if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= - ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; case RESERVE_MOVINGGC: if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) goto out; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 20705460bb0a..a510ca9a295b 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -37,11 +37,9 @@ struct bucket_clock { /* There is one reserve for each type of btree, one for prios and gens * and one for moving GC */ enum alloc_reserve { - RESERVE_ALLOC = -1, - RESERVE_BTREE = 0, - RESERVE_MOVINGGC = 1, - RESERVE_NONE = 2, - RESERVE_NR = 3, + RESERVE_MOVINGGC = 0, + RESERVE_NONE = 1, + RESERVE_NR = 2, }; typedef FIFO(long) alloc_fifo; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5f5686466d7d..8ab4c0df0d83 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -233,7 +233,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (max_stale > 64) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if 
(!bch2_btree_gc_rewrite_disabled && diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index e6808d7139c6..6dc13fa3d1f4 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -350,8 +350,6 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_JOURNAL_RESERVED| BTREE_INSERT_JOURNAL_RECLAIM); err: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index adb07043cbb3..a25138080169 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -20,7 +20,6 @@ enum btree_insert_flags { __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, - __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, @@ -43,7 +42,6 @@ enum btree_insert_flags { /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3ae920a223f9..6d69c7cb3665 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -201,12 +201,9 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, unsigned nr_reserve; enum alloc_reserve alloc_reserve; - if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_ALLOC; - } else if (flags & BTREE_INSERT_USE_RESERVE) { - nr_reserve = BTREE_NODE_RESERVE / 2; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_MOVINGGC; } else { nr_reserve = BTREE_NODE_RESERVE; alloc_reserve = RESERVE_NONE; @@ -577,8 +574,6 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_init(&trans, c, 0, 512); ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_JOURNAL_RESERVED, @@ -1457,15 +1452,6 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as; struct closure cl; int ret = 0; - struct btree_insert_entry *i; - - /* - * We already have a disk reservation and open buckets pinned; this - * allocation must not block: - */ - trans_for_each_update(trans, i) - if (btree_node_type_needs_gc(i->iter->btree_id)) - flags |= BTREE_INSERT_USE_RESERVE; closure_init_stack(&cl); @@ -1926,10 +1912,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, retry: as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? 
btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + BTREE_INSERT_NOFAIL, &cl); if (IS_ERR(as)) { ret = PTR_ERR(as); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a2ec2e58f9e4..a25cc3b7db39 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1084,8 +1084,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bch2_trans_update(trans, iter, &k, 0); return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); + BTREE_INSERT_NOFAIL|flags); } int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 44d08434855d..31a2d3dbfe8f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2186,7 +2186,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); bool resize = ca->buckets[0] != NULL; @@ -2203,7 +2203,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 95abc00bd0e0..76509c5970d2 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -800,8 +800,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_NOFAIL); if (ret == -EINTR) ret = 0; if (ret) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ee2ba1b8aff9..20c31176b131 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -330,8 +330,7 @@ int bch2_extent_update(struct btree_trans *trans, ret = bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_NOFAIL); if (ret) return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9c0de18930ac..be2c2d92384e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -776,7 +776,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } } else { rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, false, cl); rcu_read_unlock(); if (IS_ERR(ob)) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1b1a14d2fa23..50b7363fe84b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -167,7 +167,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| m->data_opts.btree_insert_flags); if (!ret) atomic_long_inc(&c->extent_migrate_done); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index f0cfd109a022..659dcfb2cca1 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -200,6 
+200,11 @@ static int bch2_copygc(struct bch_fs *c) return -1; } + /* + * Our btree node allocations also come out of RESERVE_MOVINGGC: + */ + sectors_to_move = (sectors_to_move * 3) / 4; + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors * i->replicas; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index afe0238d0cc0..aa58c595c5cb 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -798,7 +798,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) pr_buf(out, "free_inc: %zu/%zu\n" - "free[RESERVE_BTREE]: %zu/%zu\n" "free[RESERVE_MOVINGGC]: %zu/%zu\n" "free[RESERVE_NONE]: %zu/%zu\n" "buckets:\n" @@ -826,7 +825,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_user: %u\n" "btree reserve cache: %u\n", fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ca->mi.nbuckets - ca->mi.first_bucket, -- cgit From 35a067b42dcfd884fb132128ae94f240c6511fea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Dec 2020 21:59:33 -0500 Subject: bcachefs: Change when we allow overwrites Originally, we'd check for -ENOSPC when getting a disk reservation whenever the new extent took up more space on disk than the old extent. Erasure coding screwed this up, because with erasure coding writes are initially replicated, and then in the background the extra replicas are dropped when the stripe is created. This means that with erasure coding enabled, writes will always take up more space on disk than the data they're overwriting - but, according to posix, overwrites aren't supposed to return ENOSPC. So, in this patch we fudge things: if the new extent has more replicas than the _effective_ replicas of the old extent, or if the old extent is compressed and the new one isn't, we check for ENOSPC when getting the disk reservation - otherwise, we don't. 
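Stated as code, the rule above reduces to a small predicate; this is a simplified sketch, with replica counts and compression state reduced to plain parameters rather than the real bch2_bkey_replicas()/bch2_bkey_sectors_compressed() helpers:

#include <stdbool.h>

/*
 * Only enforce -ENOSPC when the overwrite can genuinely consume more space
 * than the data it replaces: more replicas than the old extent's effective
 * replicas, or replacing compressed data with uncompressed data.
 */
static bool should_check_enospc(unsigned new_replicas, unsigned old_replicas,
				bool new_compressed, bool old_compressed)
{
	return new_replicas > old_replicas ||
	       (!new_compressed && old_compressed);
}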
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 32 ++++++++++++++++++++++++++++++-- fs/bcachefs/extents.h | 4 +++- fs/bcachefs/fs-io.c | 4 +++- fs/bcachefs/io.c | 30 +++++++++++++++++++++--------- fs/bcachefs/io.h | 2 ++ fs/bcachefs/move.c | 35 ++++++++++++++++------------------- 6 files changed, 75 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7cdfd09d797e..a924cc66b4d0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -664,7 +664,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) } bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) + unsigned nr_replicas, bool compressed) { struct btree_trans trans; struct btree_iter *iter; @@ -682,7 +682,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + if (nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { ret = false; break; } @@ -692,6 +693,33 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, return ret; } +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (p.has_ec) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.ec.idx); + + WARN_ON(!s); + if (s) + replicas += s->nr_redundant; + } + + replicas++; + + } + + return replicas; +} + static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 74c7bb8f9104..ebe0a04c7850 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -538,7 +538,9 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); + +unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4f270b6cdf66..c2d024dec5c9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1886,7 +1886,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) dio->op.opts.data_replicas, 0); if (unlikely(ret) && !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), dio->op.opts.data_replicas)) + bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0)) goto err; task_io_account_write(bio->bi_iter.bi_size); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 20c31176b131..b0d017e0b220 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -193,18 +193,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, /* Extent update path: */ -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *maybe_extending, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) 
+int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *maybe_extending, + bool *should_check_enospc, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; *maybe_extending = true; + *should_check_enospc = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; @@ -223,6 +228,11 @@ static int sum_sector_overwrites(struct btree_trans *trans, (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - bch2_bkey_nr_ptrs_fully_allocated(old)); + if (!*should_check_enospc && + (new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *should_check_enospc = true; + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* * Check if there's already data above where we're @@ -260,7 +270,7 @@ int bch2_extent_update(struct btree_trans *trans, { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; - bool extending = false; + bool extending = false, should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -268,8 +278,9 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - ret = sum_sector_overwrites(trans, iter, k, + ret = bch2_sum_sector_overwrites(trans, iter, k, &extending, + &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); if (ret) @@ -279,7 +290,8 @@ int bch2_extent_update(struct btree_trans *trans, disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - 0); + !should_check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 379263a935fa..6721440e8bc7 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -64,6 +64,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64 *, u64, s64 *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 50b7363fe84b..7f0990617b29 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -76,17 +76,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) const union bch_extent_entry *entry; struct extent_ptr_decoded p; bool did_work = false; - int nr; + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; bch2_trans_reset(&trans, 0); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - if (ret == -EINTR) - continue; - break; - } + if (ret) + goto err; new = bkey_i_to_extent(bch2_keylist_front(keys)); @@ -143,23 +141,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - /* - * If we're not fully overwriting @k, and it's compressed, we - * need a reservation for all the pointers in @insert - */ - nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - - m->nr_ptrs_reserved; + ret = bch2_sum_sector_overwrites(&trans, iter, insert, + &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; - if (insert->k.size < k.k->size && - bch2_bkey_sectors_compressed(k) && - nr > 0) { + if (disk_sectors_delta > (s64) op->res.sectors) { ret = bch2_disk_reservation_add(c, &op->res, - keylist_sectors(keys) * nr, 0); + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto out; - - m->nr_ptrs_reserved += nr; - goto next; } bch2_trans_update(&trans, iter, insert, 0); @@ -168,6 +164,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); +err: if (!ret) atomic_long_inc(&c->extent_migrate_done); if (ret == -EINTR) -- cgit From 81d8599e192e85fa7e01d8a6e8e4095177ff46a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Dec 2020 19:41:03 -0500 Subject: bcachefs: Don't read existing stripes synchronously in write path Previously, in the stripe creation path, when reusing an existing stripe we'd read the existing stripe synchronously - ouch. Now, we allocate two stripe bufs if we're using an existing stripe, so that we can do the read asynchronously - and, we read the full stripe so that we can run recovery, if necessary. 
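Once the existing stripe has been read back (and recovered, if blocks were missing), its populated data blocks are carried over into the new stripe by swapping buffer pointers rather than copying. A simplified sketch of that step, with plain arrays standing in for the two ec_stripe_bufs:

/*
 * blockcount[i] != 0 means data block i is live in the existing stripe and
 * must be reused as-is; swap the buffer pointers so no data is copied.
 */
static void adopt_existing_blocks(void **new_blocks, void **existing_blocks,
				  const unsigned *blockcount, unsigned nr_data)
{
	unsigned i;

	for (i = 0; i < nr_data; i++)
		if (blockcount[i]) {
			void *tmp = new_blocks[i];

			new_blocks[i] = existing_blocks[i];
			existing_blocks[i] = tmp;
		}
}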
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/ec.c | 179 ++++++++++++++++++++++++++---------------- fs/bcachefs/ec.h | 7 +- 3 files changed, 117 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 3f8281b5db41..397099514418 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -871,7 +871,7 @@ struct bch_stripe { __u8 csum_type; __u8 pad; - struct bch_extent_ptr ptrs[0]; + struct bch_extent_ptr ptrs[]; } __attribute__((packed, aligned(8))); /* Reflink: */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 76509c5970d2..72ee53dc95d0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -200,6 +200,36 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } +/* Stripe bufs: */ + +static void ec_stripe_buf_free(struct ec_stripe_buf *stripe) +{ + unsigned i; + + for (i = 0; i < stripe->key.v.nr_blocks; i++) { + kvpfree(stripe->data[i], stripe->size << 9); + stripe->data[i] = NULL; + } +} + +static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe) +{ + unsigned i; + + memset(stripe->valid, 0xFF, sizeof(stripe->valid)); + + for (i = 0; i < stripe->key.v.nr_blocks; i++) { + stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL); + if (!stripe->data[i]) + goto err; + } + + return 0; +err: + ec_stripe_buf_free(stripe); + return -ENOMEM; +} + /* Checksumming: */ static void ec_generate_checksums(struct ec_stripe_buf *buf) @@ -287,14 +317,10 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) raid_gen(nr_data, v->nr_redundant, bytes, buf->data); } -static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -{ - return nr - bitmap_weight(buf->valid, nr); -} - static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { - return __ec_nr_failed(buf, buf->key.v.nr_blocks); + return buf->key.v.nr_blocks - + bitmap_weight(buf->valid, buf->key.v.nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) @@ -822,14 +848,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct open_bucket *ob; struct bkey_i *k; struct stripe *m; - struct bch_stripe *v = &s->stripe.key.v; + struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - struct closure cl; int ret; BUG_ON(s->h->s == s); - closure_init_stack(&cl); + closure_sync(&s->iodone); if (s->err) { if (s->err != -EROFS) @@ -837,6 +862,22 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } + if (s->have_existing_stripe) { + ec_validate_checksums(c, &s->existing_stripe); + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); + goto err; + } + + for (i = 0; i < nr_data; i++) + if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + + ec_stripe_buf_free(&s->existing_stripe); + } + BUG_ON(!s->allocated); if (!percpu_ref_tryget(&c->writes)) @@ -845,33 +886,31 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(bitmap_weight(s->blocks_allocated, s->blocks.nr) != s->blocks.nr); - ec_generate_ec(&s->stripe); + ec_generate_ec(&s->new_stripe); - ec_generate_checksums(&s->stripe); + ec_generate_checksums(&s->new_stripe); /* write p/q: */ for (i = nr_data; i < v->nr_blocks; i++) - ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); - - closure_sync(&cl); + ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); + 
closure_sync(&s->iodone); - for (i = nr_data; i < v->nr_blocks; i++) - if (!test_bit(i, s->stripe.valid)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); - goto err_put_writes; - } + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } - ret = s->existing_stripe - ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, + ret = s->have_existing_stripe + ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i, &s->res, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, s, &s->stripe.key); + : ec_stripe_bkey_insert(c, s, &s->new_stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; } for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); if (ret) { bch_err(c, "error creating stripe: error %i updating pointers", ret); break; @@ -879,14 +918,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) } spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); + m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); #if 0 pr_info("created a %s stripe %llu", - s->existing_stripe ? "existing" : "new", + s->have_existing_stripe ? "existing" : "new", s->stripe.key.k.p.offset); #endif BUG_ON(m->on_heap); - bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); @@ -902,8 +941,9 @@ err: bch2_keylist_free(&s->keys, s->inline_keys); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); + ec_stripe_buf_free(&s->existing_stripe); + ec_stripe_buf_free(&s->new_stripe); + closure_debug_destroy(&s->iodone); kfree(s); } @@ -980,7 +1020,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ca = bch_dev_bkey_exists(c, ob->ptr.dev); offset = ca->mi.bucket_size - ob->sectors_free; - return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, @@ -1087,7 +1127,6 @@ static void ec_stripe_key_init(struct bch_fs *c, static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; - unsigned i; lockdep_assert_held(&h->lock); @@ -1096,6 +1135,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) return -ENOMEM; mutex_init(&s->lock); + closure_init(&s->iodone, NULL); atomic_set(&s->pin, 1); s->c = c; s->h = h; @@ -1105,27 +1145,14 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) bch2_keylist_init(&s->keys, s->inline_keys); - s->stripe.offset = 0; - s->stripe.size = h->blocksize; - memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + s->new_stripe.offset = 0; + s->new_stripe.size = h->blocksize; - ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { - s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); - if (!s->stripe.data[i]) - goto err; - } - h->s = s; - return 0; -err: - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); - kfree(s); - return -ENOMEM; } static struct ec_stripe_head * 
@@ -1217,7 +1244,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) devs = h->devs; for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { - __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); --nr_data; } @@ -1327,51 +1354,70 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned algo, unsigned redundancy) { - struct closure cl; struct ec_stripe_head *h; struct open_bucket *ob; unsigned i, data_idx = 0; s64 idx; int ret; - closure_init_stack(&cl); - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); - if (!h) + if (!h) { + bch_err(c, "no stripe head"); return NULL; + } if (!h->s) { if (ec_new_stripe_alloc(c, h)) { bch2_ec_stripe_head_put(c, h); + bch_err(c, "failed to allocate new stripe"); return NULL; } idx = get_existing_stripe(c, target, algo, redundancy); if (idx >= 0) { - h->s->existing_stripe = true; - h->s->existing_stripe_idx = idx; - if (get_stripe_key(c, idx, &h->s->stripe)) { - /* btree error */ + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); + if (ret) { + bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + bch2_ec_stripe_head_put(c, h); + return NULL; + } + + if (ec_stripe_buf_alloc(&h->s->existing_stripe)) { + /* + * this is a problem: we have deleted from the + * stripes heap already + */ BUG(); } - for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) - if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) __set_bit(i, h->s->blocks_allocated); - ec_block_io(c, &h->s->stripe, READ, i, &cl); - } + + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key.k_i, + &h->s->existing_stripe.key.k_i); + } + + if (ec_stripe_buf_alloc(&h->s->new_stripe)) { + BUG(); } } if (!h->s->allocated) { - if (!h->s->existing_stripe && + if (!h->s->have_existing_stripe && !h->s->res.sectors) { ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); + h->blocksize, + h->s->nr_parity, 0); if (ret) { - /* What should we do here? 
*/ - bch_err(c, "unable to create new stripe: %i", ret); + /* + * This means we need to wait for copygc to + * empty out buckets from existing stripes: + */ bch2_ec_stripe_head_put(c, h); h = NULL; goto out; @@ -1391,19 +1437,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, h->s->nr_data, data_idx); BUG_ON(data_idx >= h->s->nr_data); - h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; + h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr; h->s->data_block_idx[i] = data_idx; data_idx++; } open_bucket_for_each(c, &h->s->parity, ob, i) - h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } out: - closure_sync(&cl); return h; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 450bb1a113a3..1d4aad50db4d 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -88,6 +88,7 @@ struct ec_stripe_new { struct ec_stripe_head *h; struct mutex lock; struct list_head list; + struct closure iodone; /* counts in flight writes, stripe is created when pin == 0 */ atomic_t pin; @@ -98,8 +99,7 @@ struct ec_stripe_new { u8 nr_parity; bool allocated; bool pending; - bool existing_stripe; - u64 existing_stripe_idx; + bool have_existing_stripe; unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; @@ -111,7 +111,8 @@ struct ec_stripe_new { struct keylist keys; u64 inline_keys[BKEY_U64s * 8]; - struct ec_stripe_buf stripe; + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; }; struct ec_stripe_head { -- cgit From 2c40a2403e2b25aca38ba728385657dfca560a62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Dec 2020 12:38:17 -0500 Subject: bcachefs: Change allocations for ec stripes to blocking We don't want writes to not get erasure coded just because the allocator temporarily wasn't keeping up. However, it's not guaranteed that these allocations will ever succeed, we can currently get stuck - especially if devices are different sizes - we still have work to do in this area. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 42 +++++++++++++++++++++++++----------------- fs/bcachefs/ec.c | 23 +++++++++++------------ fs/bcachefs/ec.h | 4 ++-- 3 files changed, 38 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1ea8ee99956b..1689e229164f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -446,16 +446,18 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static void bucket_alloc_from_stripe(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags) +static enum bucket_alloc_ret +bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; @@ -464,17 +466,19 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, unsigned i, ec_idx; if (!erasure_code) - return; + return 0; if (nr_replicas < 2) - return; + return 0; if (ec_open_bucket(c, ptrs)) - return; + return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, cl); + if (IS_ERR(h)) + return -PTR_ERR(h); if (!h) - return; + return 0; devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); @@ -496,6 +500,7 @@ got_bucket: atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(c, h); + return 0; } /* Sector allocator */ @@ -573,10 +578,13 @@ open_bucket_add_buckets(struct bch_fs *c, } if (!ec_open_bucket(c, ptrs)) { - bucket_alloc_from_stripe(c, ptrs, wp, &devs, + ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, - have_cache, flags); + have_cache, flags, _cl); + if (ret == FREELIST_EMPTY || + ret == OPEN_BUCKETS_EMPTY) + return ret; if (*nr_effective >= nr_replicas) return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 72ee53dc95d0..f1659474b615 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1228,10 +1228,9 @@ found: return h; } -/* - * XXX: use a higher watermark for allocating open buckets here: - */ -static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +static enum bucket_alloc_ret +new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { struct bch_devs_mask devs; struct open_bucket *ob; @@ -1239,7 +1238,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) min_t(unsigned, h->nr_active_devs, BCH_BKEY_PTRS_MAX) - h->redundancy; bool have_cache = true; - int ret = 0; + enum bucket_alloc_ret ret = ALLOC_SUCCESS; devs = h->devs; @@ -1270,7 +1269,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) &have_cache, RESERVE_NONE, 0, - NULL); + cl); if (ret) goto err; } @@ -1286,7 +1285,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) &have_cache, RESERVE_NONE, 0, - NULL); + cl); if (ret) goto err; } @@ -1352,7 +1351,8 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip struct ec_stripe_head 
*bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, - unsigned redundancy) + unsigned redundancy, + struct closure *cl) { struct ec_stripe_head *h; struct open_bucket *ob; @@ -1421,14 +1421,13 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, bch2_ec_stripe_head_put(c, h); h = NULL; goto out; - } - } - if (new_stripe_alloc_buckets(c, h)) { + ret = new_stripe_alloc_buckets(c, h, cl); + if (ret) { bch2_ec_stripe_head_put(c, h); - h = NULL; + h = ERR_PTR(-ret); goto out; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 1d4aad50db4d..3f1999bae6d4 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -146,8 +146,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, - unsigned, unsigned); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, + unsigned, unsigned, unsigned, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -- cgit From 8deed5f4e547e675cf8c1de88720c23c3c3093ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Dec 2020 12:53:30 -0500 Subject: bcachefs: Use separate new stripes for copygc and non-copygc Allocations for copygc have to be kept separate from everything else, so that copygc doesn't get starved. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 +++- fs/bcachefs/alloc_types.h | 1 - fs/bcachefs/ec.c | 27 ++++++++++++++++++--------- fs/bcachefs/ec.h | 3 ++- 4 files changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1689e229164f..df9f022e6926 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -474,7 +474,9 @@ bucket_alloc_from_stripe(struct bch_fs *c, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, cl); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, + wp == &c->copygc_write_point, + cl); if (IS_ERR(h)) return -PTR_ERR(h); if (!h) diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index a510ca9a295b..0cfb026a02e5 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -87,7 +87,6 @@ struct write_point { u64 last_used; unsigned long write_point; enum bch_data_type type; - bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f1659474b615..09de3270bff0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1157,7 +1157,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy) + unsigned algo, unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1173,6 +1174,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->target = target; h->algo = algo; h->redundancy = redundancy; + h->copygc = copygc; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1204,9 +1206,10 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) } struct ec_stripe_head 
*__bch2_ec_stripe_head_get(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + unsigned target, + unsigned algo, + unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; @@ -1217,12 +1220,13 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && - h->redundancy == redundancy) { + h->redundancy == redundancy && + h->copygc == copygc) { mutex_lock(&h->lock); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); found: mutex_unlock(&c->ec_stripe_head_lock); return h; @@ -1267,7 +1271,9 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, h->redundancy, &nr_have, &have_cache, - RESERVE_NONE, + h->copygc + ? RESERVE_MOVINGGC + : RESERVE_NONE, 0, cl); if (ret) @@ -1283,7 +1289,9 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, nr_data, &nr_have, &have_cache, - RESERVE_NONE, + h->copygc + ? RESERVE_MOVINGGC + : RESERVE_NONE, 0, cl); if (ret) @@ -1352,6 +1360,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy, + bool copygc, struct closure *cl) { struct ec_stripe_head *h; @@ -1360,7 +1369,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, s64 idx; int ret; - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); if (!h) { bch_err(c, "no stripe head"); return NULL; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 3f1999bae6d4..97a263cf9c87 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -122,6 +122,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; + bool copygc; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -147,7 +148,7 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, - unsigned, unsigned, unsigned, struct closure *); + unsigned, unsigned, unsigned, bool, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -- cgit From 07a1006ae81580c6a1b52b80e32fa9dadea1954b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Dec 2020 15:08:58 -0500 Subject: bcachefs: Reduce/kill BKEY_PADDED use With various newer key types - stripe keys, inline data extents - the old approach of calculating the maximum size of the value is becoming more and more error prone. Better to switch to bkey_on_stack, which can dynamically allocate if necessary to handle any size bkey. In particular we also want to get rid of BKEY_EXTENT_VAL_U64s_MAX. 
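Typical usage of the new helper follows an init / reassemble / exit pattern; a sketch, assuming a struct bch_fs *c and a struct bkey_s_c k are already in scope and with the surrounding iteration elided:

	struct bkey_buf sk;

	bch2_bkey_buf_init(&sk);	/* sk.k initially points at sk.onstack */

	/*
	 * Copy a key of arbitrary size: if it doesn't fit in the 12 on-stack
	 * u64s, the helper falls back to c->large_bkey_pool.
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);

	/* ... use sk.k as a mutable copy of k ... */

	bch2_bkey_buf_exit(&sk, c);	/* returns any pool allocation */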
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 -- fs/bcachefs/bkey_buf.h | 60 +++++++++++++++++++++++++++++++++++++ fs/bcachefs/bkey_on_stack.h | 43 -------------------------- fs/bcachefs/bkey_sort.c | 18 +++++------ fs/bcachefs/btree_cache.c | 13 +++++--- fs/bcachefs/btree_gc.c | 22 +++++++------- fs/bcachefs/btree_io.c | 23 ++++++++------ fs/bcachefs/btree_iter.c | 35 ++++++++++++++-------- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/ec.c | 10 +++---- fs/bcachefs/extent_update.c | 1 - fs/bcachefs/fs-io.c | 18 +++++------ fs/bcachefs/fs.c | 16 +++++----- fs/bcachefs/fsck.c | 10 +++---- fs/bcachefs/io.c | 59 ++++++++++++++++++------------------ fs/bcachefs/io.h | 6 ++-- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 ++ fs/bcachefs/journal_types.h | 2 +- fs/bcachefs/migrate.c | 20 +++++++------ fs/bcachefs/move.c | 26 +++++++++------- fs/bcachefs/recovery.c | 34 +++++++++++---------- fs/bcachefs/reflink.c | 21 ++++++------- 24 files changed, 247 insertions(+), 200 deletions(-) create mode 100644 fs/bcachefs/bkey_buf.h delete mode 100644 fs/bcachefs/bkey_on_stack.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 397099514418..b88a9fdf17ad 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -638,8 +638,6 @@ struct bch_reservation { #define BKEY_EXTENT_VAL_U64s_MAX \ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) - /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 index 000000000000..0d7c67a959af --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" + +struct bkey_buf { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_buf_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bch2_bkey_buf_copy(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_i *src) +{ + bch2_bkey_buf_realloc(s, c, src->k.u64s); + bkey_copy(s->k, src); +} + +static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, + struct bch_fs *c, + struct btree *b, + struct bkey_packed *src) +{ + bch2_bkey_buf_realloc(s, c, BKEY_U64s + + bkeyp_val_u64s(&b->format, src)); + bch2_bkey_unpack(b, s->k, src); +} + +static inline void bch2_bkey_buf_init(struct bkey_buf *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h deleted file mode 100644 index f607a0cb37ed..000000000000 --- a/fs/bcachefs/bkey_on_stack.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef 
_BCACHEFS_BKEY_ON_STACK_H -#define _BCACHEFS_BKEY_ON_STACK_H - -#include "bcachefs.h" - -struct bkey_on_stack { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bkey_on_stack_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bkey_on_stack_init(struct bkey_on_stack *s) -{ - s->k = (void *) s->onstack; -} - -static inline void bkey_on_stack_exit(struct bkey_on_stack *s, - struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 99e0a4011fae..2e1d9cd65f43 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -187,11 +187,11 @@ bch2_sort_repack_merge(struct bch_fs *c, bool filter_whiteouts) { struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_on_stack k; + struct bkey_buf k; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&k); + bch2_bkey_buf_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) @@ -204,7 +204,7 @@ bch2_sort_repack_merge(struct bch_fs *c, * node; we have to make a copy of the entire key before calling * normalize */ - bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); bch2_bkey_unpack(src, k.k, k_packed); if (filter_whiteouts && @@ -215,7 +215,7 @@ bch2_sort_repack_merge(struct bch_fs *c, } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bkey_on_stack_exit(&k, c); + bch2_bkey_buf_exit(&k, c); return nr; } @@ -315,11 +315,11 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; - struct bkey_on_stack split; + struct bkey_buf split; unsigned i; memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&split); + bch2_bkey_buf_init(&split); sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); for (i = 0; i < iter->used;) { @@ -379,7 +379,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_bkey_buf_reassemble(&split, c, l.s_c); bch2_cut_back(bkey_start_pos(r.k), split.k); bch2_cut_front_s(r.k->p, l); @@ -398,7 +398,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bkey_on_stack_exit(&split, c); + bch2_bkey_buf_exit(&split, c); return nr; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 04c71f11a555..d859cd26259b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" @@ -899,10 +900,12 @@ struct btree 
*bch2_btree_node_get_sibling(struct bch_fs *c, struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; struct btree *ret = NULL; unsigned level = b->c.level; + bch2_bkey_buf_init(&tmp); + parent = btree_iter_node(iter, level + 1); if (!parent) return NULL; @@ -936,9 +939,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!k) goto out; - bch2_bkey_unpack(parent, &tmp.k, k); + bch2_bkey_buf_unpack(&tmp, c, parent, k); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, + ret = bch2_btree_node_get(c, iter, tmp.k, level, SIX_LOCK_intent, _THIS_IP_); if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { @@ -958,7 +961,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (sib == btree_prev_sib) btree_node_unlock(iter, level); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, + ret = bch2_btree_node_get(c, iter, tmp.k, level, SIX_LOCK_intent, _THIS_IP_); /* @@ -999,6 +1002,8 @@ out: bch2_btree_trans_verify_locks(trans); + bch2_bkey_buf_exit(&tmp, c); + return ret; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8ab4c0df0d83..c390b490433a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -8,7 +8,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_methods.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -267,10 +267,12 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, struct btree_and_journal_iter iter; struct bkey_s_c k; struct bpos next_node_start = b->data->min_key; + struct bkey_buf tmp; u8 max_stale = 0; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_bkey_buf_init(&tmp); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_debugcheck(c, b, k); @@ -284,10 +286,9 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, if (b->c.level) { struct btree *child; - BKEY_PADDED(k) tmp; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_reassemble(&tmp, c, k); + k = bkey_i_to_s_c(tmp.k); bch2_btree_and_journal_iter_advance(&iter); @@ -299,7 +300,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, break; if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, &tmp.k, + child = bch2_btree_node_get_noiter(c, tmp.k, b->c.btree_id, b->c.level - 1); ret = PTR_ERR_OR_ZERO(child); if (ret) @@ -317,6 +318,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, } } + bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -929,10 +931,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -941,7 +943,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (gc_btree_gens_key(c, k)) { - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); @@ -961,7 +963,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) } bch2_trans_exit(&trans); - 
bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -1073,7 +1075,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, } if (bch2_keylist_realloc(&keylist, NULL, 0, - (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); return; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c100f930bb8f..831f387557aa 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1320,12 +1320,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct btree_write_bio *wbio) { struct btree *b = wbio->wbio.bio.bi_private; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct btree_trans trans; struct btree_iter *iter; int ret; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, @@ -1344,21 +1345,22 @@ retry: BUG_ON(!btree_node_hashed(b)); - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) goto retry; if (ret) goto err; out: bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; @@ -1476,7 +1478,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - BKEY_PADDED(key) k; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1487,6 +1489,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool validate_before_checksum = false; void *data; + bch2_bkey_buf_init(&k); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1696,15 +1700,16 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * just make all btree node writes FUA to keep things sane. */ - bkey_copy(&k.key, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) ptr->offset += b->written; b->written += sectors_to_write; /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); + bch2_bkey_buf_exit(&k, c); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 15963a657c72..47d833f5ad56 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" #include "btree_key_cache.h" @@ -1048,27 +1049,31 @@ static void btree_iter_prefetch(struct btree_iter *iter) struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 
1 : 16); bool was_locked = btree_node_locked(iter, iter->level); + bch2_bkey_buf_init(&tmp); + while (nr) { if (!bch2_btree_node_relock(iter, iter->level)) - return; + break; bch2_btree_node_iter_advance(&node_iter, l->b); k = bch2_btree_node_iter_peek(&node_iter, l->b); if (!k) break; - bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); + bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_btree_node_prefetch(c, iter, tmp.k, iter->level - 1); } if (!was_locked) btree_node_unlock(iter, iter->level); + + bch2_bkey_buf_exit(&tmp, c); } static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, @@ -1100,30 +1105,34 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, struct btree *b; unsigned level = iter->level - 1; enum six_lock_type lock_type = __btree_lock_want(iter, level); - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; + int ret; EBUG_ON(!btree_node_locked(iter, iter->level)); - bch2_bkey_unpack(l->b, &tmp.k, + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); - if (unlikely(IS_ERR(b))) - return PTR_ERR(b); + b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); - if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && - unlikely(b != btree_node_mem_ptr(&tmp.k))) + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); iter->level = level; - - return 0; +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; } static void btree_iter_up(struct btree_iter *iter) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 51ad87abc763..e51e3c7868de 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -57,7 +57,7 @@ struct btree_write { struct btree_alloc { struct open_buckets ob; - BKEY_PADDED(k); + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_bkey_cached_common { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6d69c7cb3665..2fa3a9aeb89a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -195,7 +195,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, { struct write_point *wp; struct btree *b; - BKEY_PADDED(k) tmp; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 09de3270bff0..5dc2fc23c134 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -4,7 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -783,10 +783,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0, dev, idx; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); /* XXX this doesn't support the reflink btree */ @@ -813,7 +813,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, dev = s->key.v.ptrs[idx].dev; - 
bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); @@ -834,7 +834,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, } bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index fd011df3cb99..1faca4bc1825 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c2d024dec5c9..d48aa5b31e7b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -774,7 +774,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bkey_on_stack sk; + struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; @@ -782,7 +782,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, rbio->c = c; rbio->start_time = local_clock(); - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); retry: while (1) { struct bkey_s_c k; @@ -800,7 +800,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(trans, &offset_into_extent, &sk); @@ -845,7 +845,7 @@ retry: bio_endio(&rbio->bio); } - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); } void bch2_readahead(struct readahead_control *ractl) @@ -2431,7 +2431,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_on_stack copy; + struct bkey_buf copy; struct btree_trans trans; struct btree_iter *src, *dst; loff_t shift, new_size; @@ -2441,7 +2441,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bkey_on_stack_init(©); + bch2_bkey_buf_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* @@ -2529,7 +2529,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_on_stack_reassemble(©, c, k); + bch2_bkey_buf_reassemble(©, c, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) @@ -2606,7 +2606,7 @@ bkey_err: } err: bch2_trans_exit(&trans); - bkey_on_stack_exit(©, c); + bch2_bkey_buf_exit(©, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7cd3f243d1ed..bcb2f83fe354 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "acl.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" @@ -899,7 +899,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack cur, prev; + struct bkey_buf cur, 
prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; @@ -912,8 +912,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; - bkey_on_stack_init(&cur); - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -932,7 +932,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&cur, c, k); + bch2_bkey_buf_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); @@ -940,7 +940,7 @@ retry: break; k = bkey_i_to_s_c(cur.k); - bkey_on_stack_realloc(&prev, c, k.k->u64s); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); sectors = min(sectors, k.k->size - offset_into_extent); @@ -974,8 +974,8 @@ retry: FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&cur, c); - bkey_on_stack_exit(&prev, c); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 39f872de0c18..df0f00f10bd7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "dirent.h" #include "error.h" @@ -464,11 +464,11 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack prev; + struct bkey_buf prev; u64 i_sectors; int ret = 0; - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -500,7 +500,7 @@ retry: goto err; } } - bkey_on_stack_reassemble(&prev, c, k); + bch2_bkey_buf_reassemble(&prev, c, k); ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) @@ -569,7 +569,7 @@ err: fsck_err: if (ret == -EINTR) goto retry; - bkey_on_stack_exit(&prev, c); + bch2_bkey_buf_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b0d017e0b220..bc1e2dc04850 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -9,7 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -425,14 +425,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct bkey_on_stack sk; + struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter *iter; int ret; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -444,7 +444,7 @@ int bch2_write_index_default(struct bch_write_op *op) k = bch2_keylist_front(keys); - bkey_on_stack_realloc(&sk, c, k->k.u64s); + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); @@ -461,7 +461,7 @@ int bch2_write_index_default(struct bch_write_op *op) } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -1620,14 +1620,14 @@ static void 
bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1639,7 +1639,7 @@ retry: if (bkey_err(k)) goto err; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); @@ -1660,7 +1660,7 @@ retry: out: bch2_rbio_done(rbio); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1673,14 +1673,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1690,7 +1690,7 @@ retry: BTREE_ITER_SLOTS, k, ret) { unsigned bytes, sectors, offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); @@ -1739,7 +1739,7 @@ err: rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); bch2_rbio_done(rbio); } @@ -1810,17 +1810,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if ((ret = bkey_err(k))) goto out; - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - BKEY_EXTENT_U64s_MAX * 8); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - k = bkey_i_to_s_c(new); - if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1839,6 +1828,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; } + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; @@ -2005,7 +2004,7 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_on_stack *orig_k) + struct bkey_buf *orig_k) { struct btree_iter *iter; struct bkey_s_c k; @@ -2032,7 +2031,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, } *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - bkey_on_stack_reassemble(orig_k, trans->c, k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: bch2_trans_iter_put(trans, iter); return ret; @@ -2304,7 +2303,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| @@ -2318,7 +2317,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, 
c, 0, 0); retry: bch2_trans_begin(&trans); @@ -2341,7 +2340,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &sk); @@ -2378,7 +2377,7 @@ retry: } out: bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: if (ret == -EINTR) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 6721440e8bc7..8535e1f631be 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_H #include "checksum.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "io_types.h" #define to_wbio(_bio) \ @@ -118,11 +118,11 @@ struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_on_stack *); + struct bkey_buf *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_on_stack *k) + struct bkey_buf *k) { return k->k->k.type == KEY_TYPE_reflink_p ? __bch2_read_indirect_extent(trans, offset_into_extent, k) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index be2c2d92384e..3ca8137923a6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1097,7 +1097,7 @@ int bch2_fs_journal_init(struct journal *j) /* Btree roots: */ j->entry_u64s_reserved += - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); atomic64_set(&j->reservations.counter, ((union journal_res_state) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cb2cfbbf50d4..25010aa42af6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -989,6 +989,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, done: rcu_read_unlock(); + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1b130541f00b..150e691d5317 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -20,7 +20,7 @@ struct journal_buf { struct jset *data; - BKEY_PADDED(key); + __BKEY_PADDED(key, BCH_REPLICAS_MAX); struct closure_waitlist wait; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 96c8690adc5b..6241ff0c129f 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,7 +4,7 @@ */ #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -57,7 +57,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); @@ -90,7 +90,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -109,6 +109,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) struct btree_iter *iter; struct closure cl; struct btree *b; + struct bkey_buf k; unsigned id; int ret; @@ -116,28 +117,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) continue; - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), + ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); goto err; } - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); goto retry; @@ -157,6 +158,7 @@ retry: ret = 0; err: ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7f0990617b29..28e2125c12ed 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -2,7 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -60,8 +60,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; int ret = 0; + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = 
bch2_trans_get_iter(&trans, m->btree_id, @@ -72,7 +77,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_s_c k; struct bkey_i *insert; struct bkey_i_extent *new; - BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; struct extent_ptr_decoded p; bool did_work = false; @@ -92,11 +96,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; - bkey_reassemble(&_insert.k, k); - insert = &_insert.k; + bkey_reassemble(_insert.k, k); + insert = _insert.k; - bkey_copy(&_new.k, bch2_keylist_front(keys)); - new = bkey_i_to_extent(&_new.k); + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); bch2_cut_front(iter->pos, &new->k_i); bch2_cut_front(iter->pos, insert); @@ -192,6 +196,8 @@ nomatch: } out: bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); BUG_ON(ret == -EINTR); return ret; } @@ -511,7 +517,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct bkey_on_stack sk; + struct bkey_buf sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -520,7 +526,7 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_user; @@ -600,7 +606,7 @@ peek: } /* unlock before doing IO: */ - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); @@ -634,7 +640,7 @@ next_nondata: } out: ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1883a1faf380..c5da1be46444 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" #include "btree_update.h" @@ -224,28 +225,29 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b if (b->c.level) { struct btree *child; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, k); + k = bkey_i_to_s_c(tmp.k); bch2_btree_and_journal_iter_advance(&iter); - if (b->c.level > 0) { - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1); + bch2_bkey_buf_exit(&tmp, c); - ret = (node_fn ? node_fn(c, b) : 0) ?: - bch2_btree_and_journal_walk_recurse(c, child, - journal_keys, btree_id, node_fn, key_fn); - six_unlock_read(&child->c.lock); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; - if (ret) - break; - } + ret = (node_fn ? 
node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); + six_unlock_read(&child->c.lock); + + if (ret) + break; } else { bch2_btree_and_journal_iter_advance(&iter); } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8abcbfb3bd64..930547de3309 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "extents.h" #include "inode.h" @@ -198,8 +198,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst; - struct bkey_on_stack new_src; + struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; @@ -216,7 +215,8 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; - bkey_on_stack_init(&new_src); + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, @@ -257,7 +257,7 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type != KEY_TYPE_reflink_p) { - bkey_on_stack_reassemble(&new_src, c, src_k); + bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); bch2_cut_front(src_iter->pos, new_src.k); @@ -275,7 +275,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_s_c_reflink_p src_p = bkey_s_c_to_reflink_p(src_k); struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(&new_dst.k); + bkey_reflink_p_init(new_dst.k); u64 offset = le64_to_cpu(src_p.v->idx) + (src_iter->pos.offset - @@ -286,12 +286,12 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k.k.p = dst_iter->pos; - bch2_key_resize(&new_dst.k.k, + new_dst.k->k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, NULL, journal_seq, new_i_size, i_sectors_delta); if (ret) @@ -333,7 +333,8 @@ err: } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&new_src, c); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); percpu_ref_put(&c->writes); -- cgit From c859430b1728d59ca6e4d7e9356db82979e2fd5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Jan 2021 15:46:57 -0500 Subject: bcachefs: Fix journal_buf_realloc() It used to be safe to reallocate a buf that the write path owns without holding the journal lock, but now this can trigger an assertion in journal_seq_to_buf(). 
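The heart of the fix, pulled out of the hunk below for readability (the allocation of new_buf/new_size and the early-exit paths are elided): the new buffer is swapped in while holding the journal lock, so journal_seq_to_buf() never observes a half-updated buf, and the old allocation is freed only after the lock is dropped:

    memcpy(new_buf, buf->data, buf->buf_size);

    spin_lock(&j->lock);
    swap(buf->data,     new_buf);
    swap(buf->buf_size, new_size);
    spin_unlock(&j->lock);

    /* new_buf/new_size now hold the old buffer and its size: */
    kvpfree(new_buf, new_size);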
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 25010aa42af6..cba420565248 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1051,9 +1051,13 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) return; memcpy(new_buf, buf->data, buf->buf_size); - kvpfree(buf->data, buf->buf_size); - buf->data = new_buf; - buf->buf_size = new_size; + + spin_lock(&j->lock); + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + spin_unlock(&j->lock); + + kvpfree(new_buf, new_size); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -- cgit From 29d90f61eb341018ab571e7f8ceb8ff39cf5353a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Jan 2021 18:49:35 -0500 Subject: bcachefs: Don't error out of recovery process on journal read error We don't want to fail the recovery/mount because of a single error reading from the journal - the relevant journal entry may still be found on other devices, and missing or no journal entries found is already handled later in the recovery process. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cba420565248..ef4d48081975 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -576,8 +576,15 @@ reread: if (bch2_dev_io_err_on(ret, ca, "journal read error: sector %llu", offset) || - bch2_meta_read_fault("journal")) - return -EIO; + bch2_meta_read_fault("journal")) { + /* + * We don't error out of the recovery process + * here, since the relevant journal entry may be + * found on a different device, and missing or + * no journal entries will be handled later + */ + return 0; + } j = buf->data; } -- cgit From fd54c40e00dc54cf1cd1724e4184502a56b9848f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Jan 2021 17:06:22 -0500 Subject: bcachefs: Work around a zstd bug The zstd compression code seems to have a bug where it will write just past the end of the destination buffer - probably only when the compressed output isn't going to fit in the destination buffer, which will never happen if you're always allocating a bigger buffer than the source buffer which would explain other users not hitting it. But, we size the buffer according to how much contiguous space on disk we have, so... generally, bugs like this don't write more than a word past the end of the buffer, so an easy workaround is to subtract a fudge factor from the buffer size. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 27bbc265d550..78757dcede36 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c, ZSTD_CCtx *ctx = zstd_init_cctx(workspace, zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + /* + * ZSTD requires that when we decompress we pass in the exact + * compressed size - rounding it up to the nearest sector + * doesn't work, so we use the first 4 bytes of the buffer for + * that. 
+ * + * Additionally, the ZSTD code seems to have a bug where it will + * write just past the end of the buffer - so subtract a fudge + * factor (7 bytes) from the dst buffer size to account for + * that. + */ size_t len = zstd_compress_cctx(ctx, - dst + 4, dst_len - 4, + dst + 4, dst_len - 4 - 7, src, src_len, &c->zstd_params); if (zstd_is_error(len)) -- cgit From 890e3f5bf7e8d6035179c5f4668e0d30c19e9541 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Jan 2021 17:18:14 -0500 Subject: bcachefs: Reserve some open buckets for btree allocations This reverts part of the change from "bcachefs: Don't use BTREE_INSERT_USE_RESERVE so much" - it turns out we still should be reserving open buckets for btree node allocations, because otherwise data bucket allocations (especially with erasure coding enabled) can use up all our open buckets and we won't be able to do the metadata update that lets us release those open bucket references. Oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 +++++- fs/bcachefs/alloc_types.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index df9f022e6926..476c46f596cc 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -204,8 +204,11 @@ success: static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_MOVINGGC: + case RESERVE_BTREE: + case RESERVE_BTREE_MOVINGGC: return 0; + case RESERVE_MOVINGGC: + return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; } @@ -261,6 +264,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, goto out; switch (reserve) { + case RESERVE_BTREE_MOVINGGC: case RESERVE_MOVINGGC: if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) goto out; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 0cfb026a02e5..1abfff5290bc 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -34,9 +34,9 @@ struct bucket_clock { struct mutex lock; }; -/* There is one reserve for each type of btree, one for prios and gens - * and one for moving GC */ enum alloc_reserve { + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, RESERVE_MOVINGGC = 0, RESERVE_NONE = 1, RESERVE_NR = 2, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2fa3a9aeb89a..c25ce358f931 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -203,10 +203,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_MOVINGGC; + alloc_reserve = RESERVE_BTREE_MOVINGGC; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_NONE; + alloc_reserve = RESERVE_BTREE; } mutex_lock(&c->btree_reserve_cache_lock); -- cgit From dcf64dfbbc3c3c46af5508afed9f46e906fcd748 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Jan 2021 10:56:39 -0500 Subject: bcachefs: Fix btree node split after merge operations A btree node merge operation deletes a key in the parent node; if when inserting into the parent node we split the parent node, we can end up with a whiteout in the parent node that we don't want. 
The existing code drops them before doing the split, because they can screw up picking the pivot, but we forgot about the unwritten writeouts area - that needs to be cleared out too. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c25ce358f931..3b19c1c7b450 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1227,6 +1227,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, src = n; } + /* Also clear out the unwritten whiteouts area: */ + b->whiteout_u64s = 0; + i->u64s = cpu_to_le16((u64 *) dst - i->_data); set_btree_bset_end(b, b->set); -- cgit From 4291a3317f3724283023f35802c47083a05b938d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Jan 2021 21:20:58 -0500 Subject: bcachefs: bch2_alloc_write() should be writing for all devices Alloc info isn't stored on a particular device, it makes no sense to only be writing it out for rw members - this was causing fsck to not fix alloc info errors, oops. Also, make sure we write out alloc info in other repair paths. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 8 +++++--- fs/bcachefs/recovery.c | 15 +++++---------- 4 files changed, 12 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9920e902d383..d93c7809d821 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -366,7 +366,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) unsigned i; int ret = 0; - for_each_rw_member(ca, c, i) { + for_each_member_device(ca, c, i) { bch2_dev_alloc_write(c, ca, flags); if (ret) { percpu_ref_put(&ca->io_ref); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9645a4edcbe8..799569d1778a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -510,7 +510,7 @@ enum { /* misc: */ BCH_FS_FIXED_GENS, - BCH_FS_ALLOC_WRITTEN, + BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c390b490433a..4a1d800d257e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -132,6 +132,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ptr->gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, @@ -145,6 +146,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, g2->_mark.dirty_sectors = 0; g2->_mark.cached_sectors = 0; set_bit(BCH_FS_FIXED_GENS, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } } } @@ -571,7 +573,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) 
\ if (dst->_f != src->_f) { \ @@ -582,7 +584,7 @@ static int bch2_gc_done(struct bch_fs *c, dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -593,7 +595,7 @@ static int bch2_gc_done(struct bch_fs *c, bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c5da1be46444..5a43682c26ef 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -938,7 +938,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; u64 blacklist_seq, journal_seq; - bool write_sb = false, need_write_alloc = false; + bool write_sb = false; int ret; if (c->sb.clean) @@ -1084,10 +1084,8 @@ use_clean: bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret < 0) - goto err; if (ret) - need_write_alloc = true; + goto err; bch_verbose(c, "mark and sweep done"); } @@ -1097,10 +1095,8 @@ use_clean: bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, false); - if (ret < 0) - goto err; if (ret) - need_write_alloc = true; + goto err; bch_verbose(c, "mark and sweep done"); } @@ -1124,7 +1120,8 @@ use_clean: goto err; bch_verbose(c, "journal replay done"); - if (need_write_alloc && !c->opts.nochanges) { + if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && + !c->opts.nochanges) { /* * note that even when filesystem was clean there might be work * to do here, if we ran gc (because of fsck) which recalculated @@ -1139,8 +1136,6 @@ use_clean: goto err; } bch_verbose(c, "alloc write done"); - - set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { -- cgit From 53ef2c5cc991a9c5aecc7b85754695df5cc5de45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Jan 2021 13:38:09 -0500 Subject: bcachefs: Fix bch2_replicas_gc2 This fixes a regression introduced by "bcachefs: Refactor filesystem usage accounting". We have to include all the replicas entries that have any of the entries for different journal entries nonzero, we can't skip them if they sum to zero. 
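Spelled out, the retention test in the hunk below keeps a replicas entry if any individual copy of its counters is nonzero, rather than testing the summed value as before ('keep' is just a name for the condition guarding the memcpy, not a variable in the patch):

    keep = e->data_type == BCH_DATA_journal ||
           c->usage_base->replicas[i] ||
           percpu_u64_get(&c->usage[0]->replicas[i]) ||
           percpu_u64_get(&c->usage[1]->replicas[i]) ||
           percpu_u64_get(&c->usage[2]->replicas[i]) ||
           percpu_u64_get(&c->usage[3]->replicas[i]);

    /* e.g. a +N delta in one journal entry cancelled by a -N delta in
     * another sums to zero, but the entry is still referenced and has
     * to be kept */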
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 57c2e66edad1..a0840e1c9f88 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -607,7 +607,11 @@ retry: cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || - bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]) || + percpu_u64_get(&c->usage[2]->replicas[i]) || + percpu_u64_get(&c->usage[3]->replicas[i])) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } -- cgit From 032ac32c516403cd0d5ebf30e233746271a7ddcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Apr 2021 14:18:22 -0400 Subject: bcachefs: Fix .splice_write Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d48aa5b31e7b..5dd985e20c7f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1918,8 +1918,7 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); + bio_release_pages(bio, false); if (dio->op.error) { set_bit(EI_INODE_ERROR, &inode->ei_flags); -- cgit From b929bbef6f9284350ad3e23a77a822a5bb2fec3d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Jan 2021 13:37:35 -0500 Subject: bcachefs: Add cannibalize lock to btree_cache_to_text() More debugging info is always a good thing. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d859cd26259b..904440f26d40 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1073,6 +1073,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) { - pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); - pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); } -- cgit From 2a3731e34de9365038b25d76bb6e11cf5c40ac36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Jan 2021 13:51:23 -0500 Subject: bcachefs: Erasure coding fixes & refactoring - Originally bch_extent_stripe_ptr didn't contain the block index, instead we'd have to search through the stripe pointers to figure out which pointer matched. When the block field was added to bch_extent_stripe_ptr, not all of the code was updated to use it. This patch fixes that, and we also now verify that field where it makes sense. - The ec_stripe_buf_init/exit() functions have been improved, and are now used by the bch2_ec_read_extent() (recovery read) path. - get_stripe_key() is now used by bch2_ec_read_extent(). - We now have a getter and setter for checksums within a stripe, like we had previously for block sector counts, and ec_generate_checksums and ec_validate_checksums are now quite a bit smaller and cleaner. 
ec.c still needs a lot of work, but this patch is slowly moving things in the right direction. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 20 +++- fs/bcachefs/ec.c | 311 ++++++++++++++++++++++---------------------------- fs/bcachefs/ec.h | 46 +++++++- 3 files changed, 194 insertions(+), 183 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 31a2d3dbfe8f..1bbd1ee080ec 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1656,7 +1656,7 @@ out: } static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct bch_extent_stripe_ptr p, + struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; @@ -1666,14 +1666,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k); if (ret < 0) return ret; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); + (u64) p.ec.idx); + ret = -EIO; + goto out; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { + bch2_fs_inconsistent(c, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); ret = -EIO; goto out; } @@ -1684,8 +1692,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(&s->v, p.block, - stripe_blockcount_get(&s->v, p.block) + + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + sectors); bch2_trans_update(trans, iter, &s->k_i, 0); @@ -1736,7 +1744,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec, + ret = bch2_trans_mark_stripe_ptr(trans, p, disk_sectors, data_type); if (ret) return ret; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5dc2fc23c134..ce52344c79a6 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -138,44 +138,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, stripe_blockcount_get(s, i)); } -static int ptr_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - const struct bch_extent_ptr *ptr) +/* returns blocknr in stripe that we matched: */ +static int bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k) { - unsigned i; - - for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { - const struct bch_extent_ptr *ptr2 = v->ptrs + i; - - if (ptr->dev == ptr2->dev && - ptr->gen == ptr2->gen && - ptr->offset >= ptr2->offset && - ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) - return i; - } + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - return -1; -} - -static int extent_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - struct bkey_s_c k) -{ - - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - int idx; - - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, ptr); - if (idx >= 0) - return idx; - } - break; - } - } + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(s, ptr, i)) + return i; return -1; } @@ -202,74 +176,93 @@ static bool extent_has_stripe_ptr(struct 
bkey_s_c k, u64 idx) /* Stripe bufs: */ -static void ec_stripe_buf_free(struct ec_stripe_buf *stripe) +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) { unsigned i; - for (i = 0; i < stripe->key.v.nr_blocks; i++) { - kvpfree(stripe->data[i], stripe->size << 9); - stripe->data[i] = NULL; + for (i = 0; i < buf->key.v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; } } -static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe) +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) { + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; unsigned i; - memset(stripe->valid, 0xFF, sizeof(stripe->valid)); + BUG_ON(end > le16_to_cpu(v->sectors)); + + offset = round_down(offset, csum_granularity); + end = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)); - for (i = 0; i < stripe->key.v.nr_blocks; i++) { - stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL); - if (!stripe->data[i]) + buf->offset = offset; + buf->size = end - offset; + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) goto err; } return 0; err: - ec_stripe_buf_free(stripe); + ec_stripe_buf_exit(buf); return -ENOMEM; } /* Checksumming: */ -static void ec_generate_checksums(struct ec_stripe_buf *buf) +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csums_per_device = stripe_csums_per_device(v); - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; - unsigned i, j; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); + + BUG_ON(offset >= end); + BUG_ON(offset < buf->offset); + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + (len & (csum_granularity - 1))); + + return bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[block] + ((offset - buf->offset) << 9), + len << 9); +} + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); - if (!csum_bytes) + if (!v->csum_type) return; BUG_ON(buf->offset); BUG_ON(buf->size != le16_to_cpu(v->sectors)); - for (i = 0; i < v->nr_blocks; i++) { - for (j = 0; j < csums_per_device; j++) { - unsigned offset = j << v->csum_granularity_bits; - unsigned len = min(csum_granularity, buf->size - offset); - - struct bch_csum csum = - bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + (offset << 9), - len << 9); - - memcpy(stripe_csum(v, i, j), &csum, csum_bytes); - } - } + for (i = 0; i < v->nr_blocks; i++) + for (j = 0; j < csums_per_device; j++) + stripe_csum_set(v, i, j, + ec_block_checksum(buf, i, j << v->csum_granularity_bits)); } static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; unsigned i; - if (!csum_bytes) + if (!v->csum_type) return; for (i = 0; i < v->nr_blocks; i++) { @@ -282,21 +275,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) while (offset < end) { unsigned j = offset >> 
v->csum_granularity_bits; unsigned len = min(csum_granularity, end - offset); - struct bch_csum csum; - - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - ((offset + len) & (csum_granularity - 1))); - - csum = bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + ((offset - buf->offset) << 9), - len << 9); + struct bch_csum want = stripe_csum_get(v, i, j); + struct bch_csum got = ec_block_checksum(buf, i, offset); - if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { + if (bch2_crc_cmp(want, got)) { bch_err_ratelimited(c, - "checksum error while doing reconstruct read (%u:%u)", - i, j); + "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx", + i, j, v->csum_type, + want.lo, got.lo); clear_bit(i, buf->valid); break; } @@ -373,6 +359,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? "reading from" : "writing to"); + clear_bit(idx, buf->valid); + return; + } + if (!bch2_dev_get_ioref(ca, rw)) { clear_bit(idx, buf->valid); return; @@ -415,87 +409,77 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -/* recovery read path: */ -int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + if (k.k->type != KEY_TYPE_stripe) { + ret = -ENOENT; + goto err; + } + bkey_reassemble(&stripe->key.k_i, k); +err: + bch2_trans_exit(&trans); + return ret; +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ struct ec_stripe_buf *buf; struct closure cl; - struct bkey_s_c k; struct bch_stripe *v; - unsigned stripe_idx; - unsigned offset, end; - unsigned i, nr_data, csum_granularity; - int ret = 0, idx; + unsigned i, offset; + int ret = 0; closure_init_stack(&cl); BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec.idx; - buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) return -ENOMEM; - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, - POS(0, stripe_idx), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { + ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + if (ret) { bch_err_ratelimited(c, - "error doing reconstruct read: stripe not found"); + "error doing reconstruct read: error %i looking up stripe", ret); kfree(buf); - return bch2_trans_exit(&trans) ?: -EIO; + return -EIO; } - bkey_reassemble(&buf->key.k_i, k); - bch2_trans_exit(&trans); - v = &buf->key.v; - nr_data = v->nr_blocks - v->nr_redundant; - - idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); - BUG_ON(idx < 0); - - csum_granularity = 1U << v->csum_granularity_bits; - - offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; - end = offset + bio_sectors(&rbio->bio); - - BUG_ON(end > le16_to_cpu(v->sectors)); - - buf->offset = round_down(offset, csum_granularity); - buf->size = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)) - buf->offset; - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = 
kmalloc(buf->size << 9, GFP_NOIO); - if (!buf->data[i]) { - ret = -ENOMEM; - goto err; - } + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, + "error doing reconstruct read: pointer doesn't match stripe"); + ret = -EIO; + goto err; } - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; + if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { + bch_err_ratelimited(c, + "error doing reconstruct read: read is bigger than stripe"); + ret = -EIO; + goto err; + } - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error doing reconstruct read: stale pointer"); - clear_bit(i, buf->valid); - continue; - } + ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + if (ret) + goto err; + for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); - } closure_sync(&cl); @@ -513,10 +497,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) goto err; memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, - buf->data[idx] + ((offset - buf->offset) << 9)); + buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); err: - for (i = 0; i < v->nr_blocks; i++) - kfree(buf->data[i]); + ec_stripe_buf_exit(buf); kfree(buf); return ret; } @@ -784,7 +767,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey_s_c k; struct bkey_s_extent e; struct bkey_buf sk; - int ret = 0, dev, idx; + int ret = 0, dev, block; bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -805,13 +788,13 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, continue; } - idx = extent_matches_stripe(c, &s->key.v, k); - if (idx < 0) { + block = bkey_matches_stripe(&s->key.v, k); + if (block < 0) { bch2_btree_iter_next(iter); continue; } - dev = s->key.v.ptrs[idx].dev; + dev = s->key.v.ptrs[block].dev; bch2_bkey_buf_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); @@ -820,7 +803,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(e, s, ec_ptr, idx); + extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); bch2_trans_update(&trans, iter, sk.k, 0); @@ -875,7 +858,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) swap(s->new_stripe.data[i], s->existing_stripe.data[i]); - ec_stripe_buf_free(&s->existing_stripe); + ec_stripe_buf_exit(&s->existing_stripe); } BUG_ON(!s->allocated); @@ -941,8 +924,8 @@ err: bch2_keylist_free(&s->keys, s->inline_keys); - ec_stripe_buf_free(&s->existing_stripe); - ec_stripe_buf_free(&s->new_stripe); + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); kfree(s); } @@ -1145,9 +1128,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) bch2_keylist_init(&s->keys, s->inline_keys); - s->new_stripe.offset = 0; - s->new_stripe.size = h->blocksize; - ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); @@ -1305,9 +1285,7 @@ err: /* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + struct ec_stripe_head *head) { ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m; @@ -1325,8 +1303,9 @@ static s64 get_existing_stripe(struct bch_fs *c, 
stripe_idx = h->data[heap_idx].idx; m = genradix_ptr(&c->stripes[0], stripe_idx); - if (m->algorithm == algo && - m->nr_redundant == redundancy && + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && + m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { bch2_stripes_heap_del(c, m, stripe_idx); spin_unlock(&c->ec_stripes_heap_lock); @@ -1338,24 +1317,6 @@ static s64 get_existing_stripe(struct bch_fs *c, return -1; } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (!ret) - bkey_reassemble(&stripe->key.k_i, k); - bch2_trans_exit(&trans); - - return ret; -} - struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, @@ -1382,7 +1343,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, return NULL; } - idx = get_existing_stripe(c, target, algo, redundancy); + idx = get_existing_stripe(c, h); if (idx >= 0) { h->s->have_existing_stripe = true; ret = get_stripe_key(c, idx, &h->s->existing_stripe); @@ -1392,7 +1353,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, return NULL; } - if (ec_stripe_buf_alloc(&h->s->existing_stripe)) { + if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { /* * this is a problem: we have deleted from the * stripes heap already @@ -1411,7 +1372,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, &h->s->existing_stripe.key.k_i); } - if (ec_stripe_buf_alloc(&h->s->new_stripe)) { + if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) { BUG(); } } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 97a263cf9c87..c3959af46833 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -60,9 +60,51 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s) } static inline void *stripe_csum(struct bch_stripe *s, - unsigned dev, unsigned csum_idx) + unsigned block, unsigned csum_idx) { - return (void *) s + stripe_csum_offset(s, dev, csum_idx); + EBUG_ON(block >= s->nr_blocks); + EBUG_ON(csum_idx >= stripe_csums_per_device(s)); + + return (void *) s + stripe_csum_offset(s, block, csum_idx); +} + +static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + struct bch_csum csum = { 0 }; + + memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); + return csum; +} + +static inline void stripe_csum_set(struct bch_stripe *s, + unsigned block, unsigned csum_idx, + struct bch_csum csum) +{ + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); +} + +static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s, + const struct bch_extent_ptr *ptr, + unsigned block) +{ + unsigned nr_data = s->nr_blocks - s->nr_redundant; + + if (block >= nr_data) + return false; + + return ptr->dev == s->ptrs[block].dev && + ptr->gen == s->ptrs[block].gen && + ptr->offset >= s->ptrs[block].offset && + ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors); +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) +{ + BUG_ON(!p.has_ec); + + return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block); } struct bch_read_bio; -- cgit From 
edfbba58e3e7a94900d24d266e6365b1ab531e3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Jan 2021 16:11:02 -0500 Subject: bcachefs: Add btree node prefetching to bch2_btree_and_journal_walk() bch2_btree_and_journal_walk() walks the btree overlaying keys from the journal; it was introduced so that we could read in the alloc btree prior to journal replay being done, when journalling of updates to interior btree nodes was introduced. But it didn't have btree node prefetching, which introduced a severe regression with mount times, particularly on spinning rust. This patch implements btree node prefetching for the btree + journal walk, hopefully fixing that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 ++++---- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_iter.c | 3 ++- fs/bcachefs/recovery.c | 37 +++++++++++++++++++++++++++++++------ 4 files changed, 38 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 904440f26d40..4b29be7234c7 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1008,20 +1008,20 @@ out: } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, unsigned level) + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(iter && !btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return; - bch2_btree_node_fill(c, iter, k, iter->btree_id, - level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index e766ef552ce7..0eeca0bcc48e 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -32,7 +32,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, - const struct bkey_i *, unsigned); + const struct bkey_i *, enum btree_id, unsigned); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 47d833f5ad56..196f346f0544 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1067,7 +1067,8 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - bch2_btree_node_prefetch(c, iter, tmp.k, iter->level - 1); + bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, + iter->level - 1); } if (!was_locked) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5a43682c26ef..c700b12b2ac0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -206,6 +206,31 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i /* Walk btree, overlaying keys from the journal: */ +static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + struct btree_and_journal_iter iter) +{ + unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; + struct bkey_s_c k; + struct bkey_buf tmp; + + BUG_ON(!b->c.level); + + bch2_bkey_buf_init(&tmp); + + while (i < nr && + (k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&tmp, c, k); + + bch2_btree_node_prefetch(c, NULL, tmp.k, + b->c.btree_id, b->c.level - 1); + + bch2_btree_and_journal_iter_advance(&iter); + i++; + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, struct journal_keys *journal_keys, enum btree_id btree_id, @@ -214,8 +239,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b { struct btree_and_journal_iter iter; struct bkey_s_c k; + struct bkey_buf tmp; + struct btree *child; int ret = 0; + bch2_bkey_buf_init(&tmp); bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { @@ -224,23 +252,19 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b break; if (b->c.level) { - struct btree *child; - struct bkey_buf tmp; - - bch2_bkey_buf_init(&tmp); bch2_bkey_buf_reassemble(&tmp, c, k); - k = bkey_i_to_s_c(tmp.k); bch2_btree_and_journal_iter_advance(&iter); child = bch2_btree_node_get_noiter(c, tmp.k, b->c.btree_id, b->c.level - 1); - bch2_bkey_buf_exit(&tmp, c); ret = PTR_ERR_OR_ZERO(child); if (ret) break; + btree_and_journal_iter_prefetch(c, b, iter); + ret = (node_fn ? node_fn(c, b) : 0) ?: bch2_btree_and_journal_walk_recurse(c, child, journal_keys, btree_id, node_fn, key_fn); @@ -253,6 +277,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b } } + bch2_bkey_buf_exit(&tmp, c); return ret; } -- cgit From ac958006294ab462848bc69b9b5ddb1a8b99e748 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jan 2021 16:19:23 -0500 Subject: bcachefs: Factor out bch2_ec_stripes_heap_start() This fixes a bug where mark and sweep gc incorrectly was clearing out the stripes heap and causing assertions to fire later - simpler to just create the stripes heap after gc has finished. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 -------- fs/bcachefs/ec.c | 17 ++++++++++------- fs/bcachefs/ec.h | 2 ++ fs/bcachefs/recovery.c | 2 ++ 4 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4a1d800d257e..790beac71e9b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -607,8 +607,6 @@ static int bch2_gc_done(struct bch_fs *c, struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - c->ec_stripes_heap.used = 0; - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { BUG_ON(src_iter.pos != dst_iter.pos); @@ -625,12 +623,6 @@ static int bch2_gc_done(struct bch_fs *c, copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_insert(c, dst, dst_iter.pos); - spin_unlock(&c->ec_stripes_heap_lock); - } - genradix_iter_advance(&dst_iter, &c->stripes[0]); genradix_iter_advance(&src_iter, &c->stripes[1]); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ce52344c79a6..1f125ce77e4f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1450,6 +1450,16 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } +void bch2_stripes_heap_start(struct bch_fs *c) +{ + struct genradix_iter iter; + struct stripe *m; + + genradix_for_each(&c->stripes[0], iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); +} + static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, @@ -1529,18 +1539,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, int ret = 0; if (k.k->type == KEY_TYPE_stripe) { - struct stripe *m; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_NOATOMIC); if (ret) return ret; - - spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], k.k->p.offset); - bch2_stripes_heap_insert(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); } return ret; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index c3959af46833..f124582fdc5f 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -200,6 +200,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); +void bch2_stripes_heap_start(struct bch_fs *); + struct journal_keys; int bch2_stripes_read(struct bch_fs *, struct journal_keys *); int bch2_stripes_write(struct bch_fs *, unsigned); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c700b12b2ac0..8c67f1468945 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1125,6 +1125,8 @@ use_clean: bch_verbose(c, "mark and sweep done"); } + bch2_stripes_heap_start(c); + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -- cgit From ed9d58a2b1ddbc38816571638ee114b7efb9f279 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jan 2021 16:21:22 -0500 Subject: bcachefs: Run jset_validate in write path as well This is because we had a bug where we were writing out journal entries with garbage last_seq, and not catching it. Also, completely ignore jset->last_seq when JSET_NO_FLUSH is true, because of aforementioned bug, but change the write path to set last_seq to 0 when JSET_NO_FLUSH is true. Minor other cleanups and comments. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/journal.c | 5 +++++ fs/bcachefs/journal_io.c | 54 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 43 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 831f387557aa..c4d53ea2e920 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1624,7 +1624,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) + if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3ca8137923a6..e90fe042302f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -117,6 +117,9 @@ void __bch2_journal_buf_put(struct journal *j) /* * Returns true if journal entry is now closed: + * + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: */ static bool __journal_entry_close(struct journal *j) { @@ -154,6 +157,7 @@ static bool __journal_entry_close(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); sectors = vstruct_blocks_plus(buf->data, c->block_bits, @@ -184,6 +188,7 @@ static bool __journal_entry_close(struct journal *j) __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + /* Initialize new buffer: */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ef4d48081975..f6c9681badea 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -469,7 +469,8 @@ static int jset_validate(struct bch_fs *c, version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, c, "%s sector %llu seq %llu: unknown journal entry version %u", - ca->name, sector, le64_to_cpu(jset->seq), + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ return EINVAL; @@ -481,32 +482,42 @@ static int jset_validate(struct bch_fs *c, if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca->name, sector, le64_to_cpu(jset->seq), bytes)) { + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) { ret = JOURNAL_ENTRY_BAD; le32_add_cpu(&jset->u64s, -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca->name, sector, le64_to_cpu(jset->seq), + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), JSET_CSUM_TYPE(jset))) { ret = JOURNAL_ENTRY_BAD; - goto bad_csum_type; + goto csum_done; } + if (write) + goto csum_done; + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "%s sector %llu seq %llu: journal checksum bad", - ca->name, sector, le64_to_cpu(jset->seq))) + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq))) ret = JOURNAL_ENTRY_BAD; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); -bad_csum_type: - if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) { +csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } @@ -514,6 +525,14 @@ fsck_err: return ret; } +static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +{ + unsigned sectors = vstruct_sectors(jset, c->block_bits); + + return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: + jset_validate_entries(c, jset, WRITE); +} + struct journal_read_buf { void *data; size_t size; @@ -1081,9 +1100,7 @@ static void journal_write_done(struct closure *cl) bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 seq = le64_to_cpu(w->data->seq); - u64 last_seq = le64_to_cpu(w->data->last_seq); - u64 v; + u64 v, seq, last_seq; int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); @@ -1101,6 +1118,9 @@ static void journal_write_done(struct closure *cl) bch2_fatal_error(c); spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); + last_seq = le64_to_cpu(w->data->last_seq); + if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; @@ -1108,7 +1128,7 @@ static void journal_write_done(struct closure *cl) if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; - if (!w->noflush) { + if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = last_seq; } @@ -1196,7 +1216,7 @@ void bch2_journal_write(struct closure *cl) test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = cpu_to_le64(j->last_seq_ondisk); + jset->last_seq = 0; j->nr_noflush_writes++; } else { @@ -1248,11 +1268,11 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) + if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -1263,7 +1283,7 @@ void bch2_journal_write(struct closure *cl) journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; sectors = vstruct_sectors(jset, c->block_bits); -- cgit From 280249b9d9b9a62562ddeb5429a7d29d2f03ba1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Jan 2021 15:40:33 -0500 Subject: bcachefs: Correctly order flushes and journal writes on multi device filesystems All writes prior to a journal write need to be flushed before the journal write itself happens. On single device filesystems, it suffices to mark the write with REQ_PREFLUSH|REQ_FUA, but on multi device filesystems we need to issue flushes to every device - and wait for them to complete - before issuing the journal writes. 
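In rough outline, the required ordering on a multi device filesystem looks like the sketch below (illustrative pseudocode only; the helper names here are made up, and the real implementation further down uses closures and the per-device journal bios):

	/* Step 1: issue a cache flush to every rw device and wait for all of
	 * them to complete. */
	for_each_rw_device(dev)
		submit_flush_bio(dev);		/* REQ_OP_FLUSH, hypothetical helper */
	wait_for_all_flushes();			/* hypothetical helper */

	/* Step 2: only now submit the journal write itself, marked REQ_FUA but
	 * without REQ_PREFLUSH, since the flushes above already happened. */
	for_each_journal_replica(dev)
		submit_journal_bio(dev);	/* REQ_OP_WRITE|REQ_FUA */

On a single device filesystem the two steps collapse into the one journal bio tagged REQ_PREFLUSH|REQ_FUA, as before.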
Previously, we were issuing flushes to every device, but we weren't waiting for them to complete before issuing the journal writes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 -- fs/bcachefs/journal.c | 1 + fs/bcachefs/journal.h | 5 --- fs/bcachefs/journal_io.c | 99 ++++++++++++++++++++++++++++----------------- fs/bcachefs/journal_types.h | 1 + 5 files changed, 65 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bc1e2dc04850..8a4d05eee381 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -509,9 +509,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e90fe042302f..6f84a5dd06bc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -81,6 +81,7 @@ static void bch2_journal_buf_init(struct journal *j) bkey_extent_init(&buf->key); buf->noflush = false; buf->must_flush = false; + buf->separate_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index df353a18011b..547c735ce3cb 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -496,11 +496,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f6c9681badea..40da18d778a3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1188,6 +1188,51 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); @@ -1197,9 +1242,8 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - 
struct bch_extent_ptr *ptr; bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1329,45 +1373,28 @@ retry_alloc: if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); + for_each_rw_member(ca, c, i) + nr_rw_members++; - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - if (!JSET_NO_FLUSH(jset)) - bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; - bch2_bio_map(bio, jset, sectors << 9); + if (nr_rw_members > 1) + w->separate_flush = true; - trace_journal_write(bio); - closure_bio_submit(bio, cl); + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } } - if (!JSET_NO_FLUSH(jset)) { - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - } + bch2_bucket_seq_cleanup(c); + + continue_at(cl, do_journal_write, system_highpri_wq); + return; no_io: bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 150e691d5317..8ad10e46dd5d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -31,6 +31,7 @@ struct journal_buf { unsigned u64s_reserved; bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ + bool separate_flush; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; -- cgit From cd9f3dfe58709c7b0793a706d0dd0292a66237d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jan 2021 13:19:16 -0500 Subject: bcachefs: Fix integer overflow in bch2_disk_reservation_get() The sectors argument shouldn't have been a u32 - it can be up to U32_MAX (i.e. fallocate creating persistent reservations), and if replication is enabled we'll overflow when we calculate the real number of sectors to reserve. Oops. 
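As a standalone illustration of the arithmetic (made-up numbers, not bcachefs code): with the old 32-bit sectors argument the multiply by the replication factor wraps, while the widened 64-bit argument does not.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t sectors     = UINT32_MAX;	/* e.g. a huge fallocate reservation */
		unsigned nr_replicas = 2;

		/* roughly the old prototype: 32-bit argument, the product wraps */
		uint32_t wrapped = sectors * nr_replicas;		/* 4294967294 */

		/* roughly the new prototype: 64-bit argument, no overflow */
		uint64_t widened = (uint64_t) sectors * nr_replicas;	/* 8589934590 */

		printf("wrapped %u widened %llu\n",
		       wrapped, (unsigned long long) widened);
		return 0;
	}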
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/buckets.h | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1bbd1ee080ec..11907abd9b4c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2109,7 +2109,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, int flags) + u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; u64 old, v, get; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2e9c4e46c61c..a0ef9c041d5c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -282,8 +282,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) int bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - unsigned, int); + struct disk_reservation *, + u64, int); static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -300,8 +300,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) static inline int bch2_disk_reservation_get(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, - unsigned nr_replicas, + u64 sectors, unsigned nr_replicas, int flags) { *res = bch2_disk_reservation_init(c, nr_replicas); -- cgit From 2ef220cba297ef381a980572154fd17aa2a7e3de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jan 2021 15:18:11 -0500 Subject: bcachefs: Fix double counting of stripe block counts by GC Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 11907abd9b4c..95368c9f70c3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1273,9 +1273,15 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(new_s, i); - m->blocks_nonempty += !!m->block_sectors[i]; + unsigned s = stripe_blockcount_get(new_s, i); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + if (!gc) + m->block_sectors[i] = s; + m->blocks_nonempty += !!s; } if (gc && old_s) -- cgit From a39c74be8059be72fcf6c7cc2f827c38076a25db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jan 2021 16:16:37 -0500 Subject: bcachefs: Fix gc updating stripes info The primary stripes radix tree can be sparse, which was causing an assertion to pop because the one use for gc isn't. Fix this by changing the algorithm to copy between the two radix trees. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 790beac71e9b..f2310f7f89c5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -580,7 +580,7 @@ static int bch2_gc_done(struct bch_fs *c, if (verify) \ fsck_err(c, "stripe %zu has wrong "_msg \ ": got %u, should be %u", \ - dst_iter.pos, ##__VA_ARGS__, \ + iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ @@ -603,13 +603,11 @@ static int bch2_gc_done(struct bch_fs *c, copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) if (!metadata_only) { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - BUG_ON(src_iter.pos != dst_iter.pos); + while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { + dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); copy_stripe_field(alive, "alive"); copy_stripe_field(sectors, "sectors"); @@ -623,8 +621,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); + genradix_iter_advance(&iter, &c->stripes[1]); } } -- cgit From 6e53151b7b738fe60b9295c2ff47e6b2092718b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jan 2021 16:45:19 -0500 Subject: bcachefs: Kill stripe->dirty This makes bch2_stripes_write() work more like bch2_alloc_write(). 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 21 +++++++++++++-------- fs/bcachefs/ec.c | 15 ++++++++------- fs/bcachefs/ec_types.h | 3 +-- 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f2310f7f89c5..5608d8a0ed61 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -583,7 +583,6 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - dst->dirty = true; \ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ @@ -609,18 +608,24 @@ static int bch2_gc_done(struct bch_fs *c, while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); - copy_stripe_field(alive, "alive"); - copy_stripe_field(sectors, "sectors"); - copy_stripe_field(algorithm, "algorithm"); - copy_stripe_field(nr_blocks, "nr_blocks"); - copy_stripe_field(nr_redundant, "nr_redundant"); - copy_stripe_field(blocks_nonempty, - "blocks_nonempty"); + if (dst->alive != src->alive || + dst->sectors != src->sectors || + dst->algorithm != src->algorithm || + dst->nr_blocks != src->nr_blocks || + dst->nr_redundant != src->nr_redundant) { + bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); + ret = -EINVAL; + goto fsck_err; + } for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); + dst->blocks_nonempty = 0; + for (i = 0; i < dst->nr_blocks; i++) + dst->blocks_nonempty += dst->block_sectors[i] != 0; + genradix_iter_advance(&iter, &c->stripes[1]); } } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1f125ce77e4f..0d9a27726c05 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1466,7 +1466,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, size_t idx, struct bkey_i_stripe *new_key) { - struct bch_fs *c = trans->c; + const struct bch_stripe *v; struct bkey_s_c k; unsigned i; int ret; @@ -1481,16 +1481,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return -EIO; + v = bkey_s_c_to_stripe(k).v; + for (i = 0; i < v->nr_blocks; i++) + if (m->block_sectors[i] != stripe_blockcount_get(v, i)) + goto write; + return 0; +write: bkey_reassemble(&new_key->k_i, k); - spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < new_key->v.nr_blocks; i++) stripe_blockcount_set(&new_key->v, i, m->block_sectors[i]); - m->dirty = false; - - spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); return 0; @@ -1514,7 +1515,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { - if (!m->dirty) + if (!m->alive) continue; ret = __bch2_trans_do(&trans, NULL, NULL, diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 5b688b4394f7..847770166223 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -18,8 +18,7 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; - unsigned alive:1; - unsigned dirty:1; + unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[BCH_BKEY_PTRS_MAX]; -- cgit From c6e658ee9f7f7d1da410ada1c3174fe46541c454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jan 2021 17:43:49 -0500 Subject: bcachefs: Preserve stripe blockcounts on existing stripes Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 59 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0d9a27726c05..d48df42d41e9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -684,8 +684,8 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, - struct ec_stripe_new *s, - struct bkey_i_stripe *stripe) + struct bkey_i_stripe *stripe, + struct disk_reservation *res) { struct btree_trans trans; struct btree_iter *iter; @@ -726,7 +726,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, &s->res, NULL, + ret = bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -740,6 +740,47 @@ err: return ret; } +static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k; + const struct bch_stripe *existing; + unsigned i; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_EC, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || k.k->type != KEY_TYPE_stripe) { + bch_err(c, "error updating stripe: not found"); + ret = -ENOENT; + goto err; + } + + existing = bkey_s_c_to_stripe(k).v; + + if (existing->nr_blocks != new->v.nr_blocks) { + bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + + bch2_trans_update(trans, iter, &new->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void extent_stripe_ptr_add(struct bkey_s_extent e, struct ec_stripe_buf *s, struct bch_extent_ptr *ptr, @@ -884,9 +925,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = s->have_existing_stripe - ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i, - &s->res, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, s, &s->new_stripe.key); + ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_bkey_update(&trans, &s->new_stripe.key)) + : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -902,11 +943,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) spin_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); -#if 0 - pr_info("created a %s stripe %llu", - s->have_existing_stripe ? 
"existing" : "new", - s->stripe.key.k.p.offset); -#endif + BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); -- cgit From f9ef45ad434ba85363aab6d74fa48499f7ea6499 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Jan 2021 19:59:03 -0500 Subject: bcachefs: Verify transaction updates are sorted A user reported a bug that implies they might not be correctly sorted, this should help track that down. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a25cc3b7db39..ffd8e0b14257 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -981,10 +981,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&k->k) - : k->k.p)); +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); + + trans_for_each_update(trans, i) { + BUG_ON(bkey_cmp(i->iter->pos, + (i->iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&i->k->k) + : i->k->k.p)); + + BUG_ON(i != trans->updates && + btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); + } +#endif iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -- cgit From 6c7585b098c519c157cca4ca1c974321f3903ad4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Jan 2021 23:26:42 -0500 Subject: bcachefs: Rework allocating buckets for stripes Allocating buckets for existing stripes was busted, in part because the data structures were too contorted. This reworks new stripes so that we have an array of open buckets that matches blocks in the stripe, and it's sparse if we're reusing an existing stripe. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 21 ++++-- fs/bcachefs/ec.c | 142 ++++++++++++++++++++++------------------- fs/bcachefs/ec.h | 6 +- 3 files changed, 92 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 476c46f596cc..07aabae379c8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -489,16 +489,20 @@ bucket_alloc_from_stripe(struct bch_fs *c, devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); for (i = 0; i < devs_sorted.nr; i++) - open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(h->s->data_block_idx[ec_idx], - h->s->blocks_allocated)) + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; + } goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = h->s->data_block_idx[ec_idx]; + ob->ec_idx = ec_idx; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, @@ -636,10 +640,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - open_bucket_for_each(c, &ob->ec->blocks, ob2, j) - drop |= ob2->ptr.dev == ca->dev_idx; - open_bucket_for_each(c, &ob->ec->parity, ob2, j) + for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { + if (!ob->ec->blocks[j]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; drop |= ob2->ptr.dev == ca->dev_idx; + } mutex_unlock(&ob->ec->lock); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d48df42d41e9..43296763148c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -907,9 +907,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (!percpu_ref_tryget(&c->writes)) goto err; - BUG_ON(bitmap_weight(s->blocks_allocated, - s->blocks.nr) != s->blocks.nr); - ec_generate_ec(&s->new_stripe); ec_generate_checksums(&s->new_stripe); @@ -952,12 +949,17 @@ err_put_writes: err: bch2_disk_reservation_put(c, &s->res); - open_bucket_for_each(c, &s->blocks, ob, i) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } - - bch2_open_buckets_put(c, &s->parity); + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } bch2_keylist_free(&s->keys, s->inline_keys); @@ -1216,7 +1218,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) if (h->s && h->s->allocated && bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr) == h->s->blocks.nr) + h->s->nr_data) == h->s->nr_data) ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); @@ -1253,64 +1255,82 @@ static enum bucket_alloc_ret new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, struct closure *cl) { - struct bch_devs_mask devs; + struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - unsigned i, nr_have, nr_data = - min_t(unsigned, h->nr_active_devs, - BCH_BKEY_PTRS_MAX) - h->redundancy; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; enum bucket_alloc_ret ret = ALLOC_SUCCESS; - devs = h->devs; - - for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { - __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); - 
--nr_data; + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } } - BUG_ON(h->s->blocks.nr > nr_data); - BUG_ON(h->s->parity.nr > h->redundancy); - - open_bucket_for_each(c, &h->s->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->s->blocks, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); percpu_down_read(&c->mark_lock); rcu_read_lock(); - if (h->s->parity.nr < h->redundancy) { - nr_have = h->s->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->parity, + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->parity_stripe, &devs, - h->redundancy, - &nr_have, + h->s->nr_parity, + &nr_have_parity, &have_cache, h->copygc ? RESERVE_MOVINGGC : RESERVE_NONE, 0, cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } - if (h->s->blocks.nr < nr_data) { - nr_have = h->s->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->blocks, + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->block_stripe, &devs, - nr_data, - &nr_have, + h->s->nr_data, + &nr_have_data, &have_cache, h->copygc ? RESERVE_MOVINGGC : RESERVE_NONE, 0, cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } @@ -1362,8 +1382,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct closure *cl) { struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, data_idx = 0; + unsigned i; s64 idx; int ret; @@ -1398,9 +1417,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, BUG(); } + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { - if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); __set_bit(i, h->s->blocks_allocated); + } ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); } @@ -1438,20 +1462,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, goto out; } - open_bucket_for_each(c, &h->s->blocks, ob, i) { - data_idx = find_next_zero_bit(h->s->blocks_allocated, - h->s->nr_data, data_idx); - BUG_ON(data_idx >= h->s->nr_data); - - h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr; - h->s->data_block_idx[i] = data_idx; - data_idx++; - } - - open_bucket_for_each(c, &h->s->parity, ob, i) - h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; - - //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } out: @@ -1471,12 +1481,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) if (!h->s) goto unlock; - open_bucket_for_each(c, &h->s->blocks, ob, i) - if 
(ob->ptr.dev == ca->dev_idx) - goto found; - open_bucket_for_each(c, &h->s->parity, ob, i) + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; if (ob->ptr.dev == ca->dev_idx) goto found; + } goto unlock; found: h->s->err = -EROFS; @@ -1662,19 +1674,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, + pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); + h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), + pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + s->nr_data, s->nr_parity, atomic_read(&s->pin)); } mutex_unlock(&c->ec_stripe_new_lock); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f124582fdc5f..765baa9d9264 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -143,11 +143,9 @@ struct ec_stripe_new { bool pending; bool have_existing_stripe; + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - - struct open_buckets blocks; - u8 data_block_idx[BCH_BKEY_PTRS_MAX]; - struct open_buckets parity; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; struct keylist keys; -- cgit From 33ccd7188e37ad5d9d662e7450610768bc8cc8a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Jan 2021 20:20:24 -0500 Subject: bcachefs: Don't allocate stripes at POS_MIN In the future, stripe index 0 will be a sentinal value. This patch doesn't disallow stripes at POS_MIN yet, leaving that for when we do the on disk format changes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 5 +++++ fs/bcachefs/ec.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 5ce883ba22dc..f984064f4b5d 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -178,6 +178,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r) return bkey_cmp(l, r) < 0 ? l : r; } +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bkey_cmp(l, r) > 0 ? 
l : r; +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 43296763148c..f280ca20d457 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -690,7 +690,8 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bpos start_pos = POS(0, c->ec_stripe_hint); + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; bch2_trans_init(&trans, c, 0, 0); @@ -701,7 +702,7 @@ retry: BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { - start_pos = POS_MIN; + start_pos = min_pos; bch2_btree_iter_set_pos(iter, start_pos); continue; } -- cgit From a5cd80ea99e544b67acde573fc2a2dd68659fd40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jan 2021 17:31:31 -0500 Subject: bcachefs: Fix an assertion pop There was a race: btree node writes drop their reference on journal pins before clearing the btree_node_write_in_flight flag. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 17 ----------------- fs/bcachefs/btree_io.h | 1 - fs/bcachefs/super.c | 5 +---- 3 files changed, 1 insertion(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c4d53ea2e920..cd2b300043b6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1829,23 +1829,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -void bch2_btree_verify_flushed(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - BUG_ON((flags & (1 << BTREE_NODE_dirty)) || - (flags & (1 << BTREE_NODE_write_in_flight))); - } - rcu_read_unlock(); -} - void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { struct bucket_table *tbl; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 1a4b11e99cc4..3b61555ef906 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -185,7 +185,6 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_btree_verify_flushed(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f46b4b05b4aa..2b3fb07fbc4d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -242,10 +242,7 @@ nowrote_alloc: * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: */ - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_flush_all_writes(c); - else - bch2_btree_verify_flushed(c); + bch2_btree_flush_all_writes(c); /* * After stopping journal: -- cgit From ef470b4817e6592fdb22b566380d289d7c42a17d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jan 2021 19:42:09 -0500 Subject: bcachefs: Clean up bch2_extent_can_insert It was using an internal btree node iterator interface, when bch2_btree_iter_peek_slot() sufficed. We were hitting a null ptr deref that looked like it was from the iterator not being uptodate - this will also fix that. 
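
For reference, the replacement is the ordinary slot-peek pattern (the
same calls as in the hunk below):

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

which leaves the rest of the function working on a plain bkey_s_c
instead of unpacking a key from the node iterator by hand.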
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 1faca4bc1825..5c43678e94a3 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -192,18 +192,13 @@ bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *_k; struct bkey_s_c k; - struct bkey unpacked; - int sectors; + int ret, sectors; - _k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; /* Check if we're splitting a compressed extent: */ -- cgit From b4725cc1a45fa859e6ff0966f5fa988d6402e5c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 14:42:23 -0500 Subject: bcachefs: Fix loopback in dio mode We had a deadlock on page_lock, because buffered reads signal completion by unlocking the page, but the dio read path normally dirties the pages it's reading to with set_page_dirty_lock. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 5dd985e20c7f..79f1f0f37e18 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -93,6 +93,7 @@ struct dio_read { struct closure cl; struct kiocb *req; long ret; + bool should_dirty; struct bch_read_bio rbio; }; @@ -1599,12 +1600,22 @@ again: /* O_DIRECT reads */ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret); - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } static void bch2_direct_IO_read_endio(struct bio *bio) @@ -1619,8 +1630,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio) static void bch2_direct_IO_read_split_endio(struct bio *bio) { + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + bch2_direct_IO_read_endio(bio); - bio_check_pages_dirty(bio); /* transfers ownership */ + bio_check_or_release(bio, should_dirty); } static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) @@ -1676,6 +1690,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) dio->req = req; dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ + dio->should_dirty = iter_is_iovec(iter); goto start; while (iter->count) { @@ -1699,7 +1719,9 @@ start: } offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); + + if (dio->should_dirty) + bio_set_pages_dirty(bio); if (iter->count) closure_get(&dio->cl); @@ -1713,7 +1735,7 @@ start: closure_sync(&dio->cl); closure_debug_destroy(&dio->cl); ret = dio->ret; - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); return ret; } else { return -EIOCBQUEUED; -- cgit From e46b85573434b4e0c9f8eee4ac21d90643a97454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 19:14:37 -0500 Subject: bcachefs: Switch replicas.c allocations to GFP_KERNEL We're transitioning to memalloc_nofs_save/restore instead of GFP flags with the rest of the kernel, and GFP_NOIO was excessively strict and causing unnnecessary allocation failures - these allocations are done with btree locks dropped. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index a0840e1c9f88..979e9c2b8c74 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -159,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -284,20 +284,20 @@ static int replicas_table_update(struct bch_fs *c, for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_NOIO))) + sizeof(u64), GFP_KERNEL))) goto err; memset(new_usage, 0, sizeof(new_usage)); for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_NOIO))) + sizeof(u64), GFP_KERNEL))) goto err; - if (!(new_base = kzalloc(bytes, GFP_NOIO)) || - !(new_scratch = kmalloc(scratch_bytes, GFP_NOIO)) || + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; for (i = 0; i < ARRAY_SIZE(new_usage); i++) @@ -557,7 +557,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, - GFP_NOIO); + GFP_KERNEL); if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); @@ -680,7 +680,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -712,7 +712,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; -- cgit From fdbb88ac019462efcdcf83927e004e74088831f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 19:15:49 -0500 Subject: bcachefs: Fix a faulty assertion If journal replay hasn't finished, the journal can't be 
empty - oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 6f84a5dd06bc..ecc3629bcd4c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -953,6 +953,7 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && (journal_entry_is_open(j) || j->last_empty_seq + 1 != journal_cur_seq(j))); -- cgit From b7cf4bd7fe689534f77455b7d60f5033b9a3bd28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 19:30:35 -0500 Subject: bcachefs: Ensure __bch2_trans_commit() always calls bch2_trans_reset() This was leading to a very strange bug in bch2_bucket_io_time_reset(), where we'd retry without clearing out the list of updates. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ffd8e0b14257..f64e7d37bbbf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -836,7 +836,7 @@ int __bch2_trans_commit(struct btree_trans *trans) int ret = 0; if (!trans->nr_updates) - goto out_noupdates; + goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); @@ -850,7 +850,7 @@ int __bch2_trans_commit(struct btree_trans *trans) unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) - return ret; + goto out_reset; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -962,7 +962,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); -out_noupdates: +out_reset: bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); return ret; -- cgit From 079663d8ed81bfd74a331b819eda17d753719605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 21:51:42 -0500 Subject: bcachefs: Kill metadata only gc This was useful before we had transactional updates to interior btree nodes - but now, it's just extra unneeded complexity. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 68 ++++++++++++++++++-------------------------------- fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/recovery.c | 18 ++----------- 3 files changed, 27 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5608d8a0ed61..6611047dcb0d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -205,13 +205,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial, bool metadata_only) + bool initial) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 + unsigned depth = bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -326,13 +325,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, static int bch2_gc_btree_init(struct bch_fs *c, struct journal_keys *journal_keys, - enum btree_id btree_id, - bool metadata_only) + enum btree_id btree_id) { struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 
0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; int ret = 0; @@ -377,7 +374,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) } static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) + bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -390,8 +387,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, enum btree_id id = ids[i]; int ret = initial ? bch2_gc_btree_init(c, journal_keys, - id, metadata_only) - : bch2_gc_btree(c, id, initial, metadata_only); + id) + : bch2_gc_btree(c, id, initial); if (ret) return ret; } @@ -558,12 +555,11 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; - bool verify = !metadata_only && - (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + bool verify = (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); unsigned i; int ret = 0; @@ -601,7 +597,7 @@ static int bch2_gc_done(struct bch_fs *c, #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { + { struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; @@ -660,28 +656,20 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); - if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); - } + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -697,8 +685,7 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; @@ -762,13 +749,6 @@ static int bch2_gc_start(struct bch_fs *c, d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) { - d->_mark = s->mark; - d->_mark.owned_by_allocator = 0; - } } }; @@ -796,7 +776,7 @@ static int bch2_gc_start(struct bch_fs *c, * uses, GC could skip past them */ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -812,13 +792,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, 
journal_keys, initial, metadata_only); + ret = bch2_gc_btrees(c, journal_keys, initial); if (ret) goto out; @@ -857,7 +837,7 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_done(c, initial); bch2_journal_unblock(&c->journal); } else { diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 3694a3df62a8..f0435a58793b 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -7,7 +7,7 @@ void bch2_coalesce(struct bch_fs *); struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); +int bch2_gc(struct bch_fs *, struct journal_keys *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8c67f1468945..422f2fbe6dfb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1099,27 +1099,13 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { - /* - * interior btree node updates aren't consistent with the - * journal; after an unclean shutdown we have to walk all - * pointers to metadata: - */ - bch_info(c, "starting metadata mark and sweep"); - err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret) - goto err; - bch_verbose(c, "mark and sweep done"); - } - if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, false); + ret = bch2_gc(c, &c->journal_keys, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); -- cgit From 72eab8da47b211f50d0b68548e4cf070efb0c7ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 20:51:51 -0500 Subject: bcachefs: Refactor dev usage This is to make it more amenable for serialization. 
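
For context, the reshaped structure (taken from the buckets_types.h hunk
below) groups each data type's counters together instead of keeping
parallel arrays and one-off fields:

	struct bch_dev_usage {
		u64			buckets_ec;
		u64			buckets_unavailable;

		struct {
			u64		buckets;
			u64		sectors;	/* _compressed_ sectors: */
			u64		fragmented;
		} d[BCH_DATA_NR];
	};

Everything of interest is now indexed by enum bch_data_type, so
reporting code (see the sysfs.c hunk) can loop over BCH_DATA_NR rather
than naming each field, and the struct is closer to something that can
be written out as a flat run of u64s.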
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 42 ++++++++++---------- fs/bcachefs/alloc_foreground.c | 19 +++++---- fs/bcachefs/buckets.c | 38 +++++++----------- fs/bcachefs/buckets.h | 11 +----- fs/bcachefs/buckets_types.h | 13 +++---- fs/bcachefs/chardev.c | 6 +-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 87 +++++++++++++++++------------------------- 9 files changed, 91 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d93c7809d821..e9200f883894 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -54,10 +54,10 @@ static void pd_controllers_update(struct work_struct *work) * reclaimed by copy GC */ fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.buckets[BCH_DATA_user] + - stats.buckets[BCH_DATA_cached]) - - (stats.sectors[BCH_DATA_user] + - stats.sectors[BCH_DATA_cached])) << 9); + stats.d[BCH_DATA_user].buckets + + stats.d[BCH_DATA_cached].buckets) - + (stats.d[BCH_DATA_user].sectors + + stats.d[BCH_DATA_cached].sectors)) << 9); } bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); @@ -217,7 +217,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, 0); + g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); g->_mark.gen = u.gen; @@ -278,7 +278,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *ba; struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; @@ -302,9 +301,7 @@ retry: percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); - ba = bucket_array(ca); - - g = &ba->b[iter->pos.offset]; + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); new_u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); @@ -330,16 +327,10 @@ int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - u64 first_bucket, nbuckets; + u64 first_bucket = ca->mi.first_bucket; + u64 nbuckets = ca->mi.nbuckets; int ret = 0; - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - nbuckets = bucket_array(ca)->nbuckets; - percpu_up_read(&c->mark_lock); - - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, @@ -552,7 +543,8 @@ out: static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; - u64 available; + s64 available; + unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -568,8 +560,15 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(ca) - - ca->inc_gen_really_needs_gc); + available = dev_buckets_available(ca); + available -= ca->inc_gen_really_needs_gc; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + spin_unlock(&c->freelist_lock); + + available = max(available, 0LL); if (available > fifo_free(&ca->free_inc) || (available && @@ -598,6 +597,9 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, if (!is_available_bucket(mark)) return false; + if 
(mark.owned_by_allocator) + return false; + if (ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse)) return false; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 07aabae379c8..97b692bcfe46 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) rcu_read_lock(); buckets = bucket_array(ca); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark) && + !buckets->b[b].mark.owned_by_allocator) goto success; b = -1; success: @@ -224,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { - struct bucket_array *buckets; struct open_bucket *ob; - long bucket = 0; + long b = 0; spin_lock(&c->freelist_lock); @@ -260,13 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) goto out; switch (reserve) { case RESERVE_BTREE_MOVINGGC: case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) goto out; break; default: @@ -284,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, trace_bucket_alloc_fail(ca, reserve); return ERR_PTR(-FREELIST_EMPTY); out: - verify_not_on_freelist(c, ca, bucket); + verify_not_on_freelist(c, ca, b); ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - buckets = bucket_array(ca); ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), + .gen = bucket(ca, b)->mark.gen, + .offset = bucket_to_sector(ca, b), .dev = ca->dev_idx, }; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 95368c9f70c3..327d34b30de0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -379,15 +379,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m) return !is_available_bucket(m); } -static inline int is_fragmented_bucket(struct bucket_mark m, - struct bch_dev *ca) -{ - if (!m.owned_by_allocator && - m.data_type == BCH_DATA_user && - bucket_sectors_used(m)) - return max_t(int, 0, (int) ca->mi.bucket_size - - bucket_sectors_used(m)); - return 0; +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bucket_mark m) +{ + return bucket_sectors_used(m) + ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + : 0; } static inline int is_stripe_data_bucket(struct bucket_mark m) @@ -395,11 +392,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m) return m.stripe && m.data_type != BCH_DATA_parity; } -static inline int bucket_stripe_sectors(struct bucket_mark m) -{ - return is_stripe_data_bucket(m) ? 
m.dirty_sectors : 0; -} - static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -461,7 +453,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; - dev_usage->buckets[type] += nr; + dev_usage->d[type].buckets += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, @@ -487,16 +479,14 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); - u->buckets_ec += (int) new.stripe - (int) old.stripe; - u->sectors_ec += bucket_stripe_sectors(new) - - bucket_stripe_sectors(old); - - u->sectors[old.data_type] -= old.dirty_sectors; - u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_cached] += + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + u->d[BCH_DATA_cached].sectors += (int) new.cached_sectors - (int) old.cached_sectors; - u->sectors_fragmented += - is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a0ef9c041d5c..14f53c92bb7b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -153,18 +153,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark) return mark.dirty_sectors + mark.cached_sectors; } -static inline bool bucket_unused(struct bucket_mark mark) -{ - return !mark.owned_by_allocator && - !mark.data_type && - !bucket_sectors_used(mark); -} - static inline bool is_available_bucket(struct bucket_mark mark) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe); + return !mark.dirty_sectors && !mark.stripe; } static inline bool bucket_needs_journal_commit(struct bucket_mark m, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9364addf8441..98b6c18ca2e8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -52,15 +52,14 @@ struct bucket_array { }; struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; + u64 buckets_ec; u64 buckets_unavailable; - /* _compressed_ sectors: */ - u64 sectors[BCH_DATA_NR]; - u64 sectors_fragmented; - - u64 buckets_ec; - u64 sectors_ec; + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + u64 fragmented; + } d[BCH_DATA_NR]; }; struct bch_fs_usage { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 7c77fd09c834..34085e32a159 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = src.sectors_ec; + arg.ec_sectors = 0; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.buckets[i]; - arg.sectors[i] = src.sectors[i]; + arg.buckets[i] = src.d[i].buckets; + arg.sectors[i] = src.d[i].sectors; } percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 659dcfb2cca1..a867460bc71c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -291,7 +291,7 @@ unsigned long 
bch2_copygc_wait_amount(struct bch_fs *c) fragmented_allowed += ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); - fragmented += usage.sectors_fragmented; + fragmented += usage.d[BCH_DATA_user].fragmented; } return max_t(s64, 0, fragmented_allowed - fragmented); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2b3fb07fbc4d..e04d68ceb55b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1221,7 +1221,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { + !percpu_u64_get(&ca->usage[0]->d[BCH_DATA_sb].buckets)) { mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index aa58c595c5cb..57b1dbe04178 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -797,59 +797,40 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) nr[c->open_buckets[i].type]++; pr_buf(out, - "free_inc: %zu/%zu\n" - "free[RESERVE_MOVINGGC]: %zu/%zu\n" - "free[RESERVE_NONE]: %zu/%zu\n" - "buckets:\n" - " capacity: %llu\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " available: %lli\n" - "sectors:\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " fragmented: %llu\n" - " copygc threshold: %llu\n" - "freelist_wait: %s\n" - "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n" - "open_buckets_btree: %u\n" - "open_buckets_user: %u\n" - "btree reserve cache: %u\n", - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[BCH_DATA_sb], - stats.buckets[BCH_DATA_journal], - stats.buckets[BCH_DATA_btree], - stats.buckets[BCH_DATA_user], - stats.buckets[BCH_DATA_cached], - stats.buckets_ec, - __dev_buckets_available(ca, stats), - stats.sectors[BCH_DATA_sb], - stats.sectors[BCH_DATA_journal], - stats.sectors[BCH_DATA_btree], - stats.sectors[BCH_DATA_user], - stats.sectors[BCH_DATA_cached], - stats.sectors_ec, - stats.sectors_fragmented, - c->copygc_threshold, - c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, - c->open_buckets_wait.list.first ? 
"waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + "\t\t buckets\t sectors fragmented\n" + "capacity%16llu\n", + ca->mi.nbuckets - ca->mi.first_bucket); + + for (i = 1; i < BCH_DATA_NR; i++) + pr_buf(out, "%-8s%16llu%16llu%16llu\n", + bch2_data_types[i], stats.d[i].buckets, + stats.d[i].sectors, stats.d[i].fragmented); + + pr_buf(out, + "ec\t%16llu\n" + "available%15llu\n" + "\n" + "free_inc\t\t%zu/%zu\n" + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" + "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets\t\t%u/%u (reserved %u)\n" + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" + "btree reserve cache\t%u\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? "waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { -- cgit From 9afc6652d14ac83ef9c5ce3544becad22ea50baa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Jan 2021 18:19:15 -0500 Subject: bcachefs: Kill bch2_invalidate_bucket() This patch is working towards eventually getting rid of the in memory struct bucket, and relying only on the btree representation. Since bch2_invalidate_bucket() was only used for incrementing gens, not invalidating cached data, no other counters were being changed as a side effect - meaning it's safe for the allocator code to increment the bucket gen directly. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 30 ++++++++++++++---------------- fs/bcachefs/buckets.c | 40 ---------------------------------------- fs/bcachefs/buckets.h | 2 -- 3 files changed, 14 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e9200f883894..b306eed02a6d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -896,34 +896,32 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - verify_not_on_freelist(c, ca, b); - - BUG_ON(!fifo_push(&ca->free_inc, b)); - g = bucket(ca, b); m = READ_ONCE(g->mark); - invalidating_cached_data = m.cached_sectors != 0; + BUG_ON(m.data_type || m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); + BUG_ON(!fifo_push(&ca->free_inc, b)); + spin_unlock(&c->freelist_lock); /* * If we're not invalidating cached data, we only increment the bucket * gen in memory here, the incremented gen will be updated in the btree * by bch2_trans_mark_pointer(): */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); + goto out; + } - if (!invalidating_cached_data) - bch2_invalidate_bucket(c, ca, b, &m); - else - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - - spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); - if (!invalidating_cached_data) - goto out; - /* * If the read-only path is trying to shut down, we can't be generating * new btree updates: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 327d34b30de0..c3d63a190154 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -644,46 +644,6 @@ unwind: ret; \ }) -static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *ret, - bool gc) -{ - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - BUG_ON(!is_available_bucket(new)); - - new.owned_by_allocator = true; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.gen++; - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - if (old.cached_sectors) - update_cached_sectors(c, fs_usage, ca->dev_idx, - -((s64) old.cached_sectors)); - - if (!gc) - *ret = old; - return 0; -} - -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) -{ - do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, - ca, b, old); - - if (!old->owned_by_allocator && old->cached_sectors) - trace_invalidate(ca, bucket_to_sector(ca, b), - old->cached_sectors); -} - static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 14f53c92bb7b..7eebae7c439d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -236,8 +236,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, - size_t, struct 
bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, -- cgit From bfcf840ddf0697f991f2591b56a9f1969accbd23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Jan 2021 17:56:34 -0500 Subject: bcachefs: Mark superblocks transactionally More work towards getting rid of the in memory struct bucket: this path adds code for marking superblock and journal buckets via the btree, and uses it in the device add and journal resize paths. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 46 +++++------- fs/bcachefs/alloc_background.h | 1 - fs/bcachefs/buckets.c | 162 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/buckets.h | 6 ++ fs/bcachefs/journal.c | 19 ++++- fs/bcachefs/super.c | 24 +++--- 6 files changed, 211 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b306eed02a6d..206134fec320 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -323,48 +323,36 @@ err: return ret; } -int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) +int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - u64 first_bucket = ca->mi.first_bucket; - u64 nbuckets = ca->mi.nbuckets; + struct bch_dev *ca; + unsigned i; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, - POS(ca->dev_idx, first_bucket), + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (iter->pos.offset < nbuckets) { - bch2_trans_cond_resched(&trans); - - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret) - break; - bch2_btree_iter_next_slot(iter); - } - - bch2_trans_exit(&trans); - - return ret; -} + for_each_member_device(ca, c, i) { + bch2_btree_iter_set_pos(iter, + POS(ca->dev_idx, ca->mi.first_bucket)); -int bch2_alloc_write(struct bch_fs *c, unsigned flags) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; + while (iter->pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); - for_each_member_device(ca, c, i) { - bch2_dev_alloc_write(c, ca, flags); - if (ret) { - percpu_ref_put(&ca->io_ref); - break; + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + bch2_btree_iter_next_slot(iter); } } - +err: + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d10ff56e4de1..f60fcebff2ce 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -98,7 +98,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c3d63a190154..1be527ab1416 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2060,6 +2060,168 @@ int bch2_trans_mark_update(struct btree_trans *trans, return ret; } +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ 
+ struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), + }; + int ret = 0; + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (ret) + return ret; + + if (u.data_type && u.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: type], + u.dirty_sectors, sectors, ca->mi.bucket_size, + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + if (u.data_type == type && + u.dirty_sectors == sectors) + goto out; + + u.data_type = type; + u.dirty_sectors = sectors; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + return __bch2_trans_do(trans, res, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, + ca->mi.bucket_size)); + +} + +static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) +{ + int ret; + + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + if (b != *bucket) { + if (*bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + *bucket, type, *bucket_sectors); + if (ret) + return ret; + } + + *bucket = b; + *bucket_sectors = 0; + } + + *bucket_sectors += sectors; + start += sectors; + } while (!ret && start < end); + + return 0; +} + +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + 
+ return 0; +} + +int bch2_trans_mark_dev_sb(struct bch_fs *c, + struct disk_reservation *res, + struct bch_dev *ca) +{ + return bch2_trans_do(c, res, NULL, 0, + __bch2_trans_mark_dev_sb(&trans, res, ca)); +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7eebae7c439d..4103ea7e769a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -259,6 +259,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, + struct bch_dev *); + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ecc3629bcd4c..d4c5c6306928 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" #include "journal.h" #include "journal_io.h" @@ -823,18 +824,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + if (!c || new_fs) + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); } + if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, NULL, ca, + bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (!new_fs) bch2_open_bucket_put(c, ob); + + if (ret) + goto err; } err: bch2_sb_resize_journal(&ca->disk_sb, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e04d68ceb55b..bdaea336be85 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1220,13 +1220,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->d[BCH_DATA_sb].buckets)) { - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, 0); - mutex_unlock(&c->sb_lock); - } - bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1600,7 +1593,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... 
*/ - bch2_mark_dev_superblock(ca->fs, ca, 0); + bch2_mark_dev_superblock(NULL, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); @@ -1659,15 +1652,13 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, 0); - bch2_write_super(c); mutex_unlock(&c->sb_lock); - err = "alloc write failed"; - ret = bch2_dev_alloc_write(c, ca, 0); + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) - goto err; + goto err_late; if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); @@ -1688,6 +1679,7 @@ err: bch_err(c, "Unable to add device: %s", err); return ret; err_late: + up_write(&c->state_lock); bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } @@ -1723,6 +1715,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } ca = bch_dev_locked(c, dev_idx); + + if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + err = "bch2_trans_mark_dev_sb() error"; + goto err; + } + if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) -- cgit From 4529ae09cea2c040180e991ea648588220611497 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Jan 2021 14:04:31 -0500 Subject: bcachefs: Fix an assertion If we're invalidating a bucket that has cached data in it, data_type won't be 0 - oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 206134fec320..68fa6caf022d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -887,7 +887,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, g = bucket(ca, b); m = READ_ONCE(g->mark); - BUG_ON(m.data_type || m.dirty_sectors); + BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); @@ -903,6 +903,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, */ if (!m.cached_sectors && !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); bucket_cmpxchg(g, m, m.gen++); percpu_up_read(&c->mark_lock); goto out; -- cgit From 0093a50f2759f6e58fe44bafb80146fd2ef4d6a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Jan 2021 19:36:09 -0500 Subject: bcachefs: Fix build in userspace The userspace bch_err() macro doesn't use the filesystem argument. Could also be fixed with a better macro. 
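
A sketch of the "better macro" alternative - purely illustrative, the
shim below and its fprintf target are assumptions, not the actual
userspace implementation - would be to have the userspace bch_err()
evaluate its filesystem argument even though it doesn't print it:

	/* hypothetical userspace shim; needs <stdio.h> */
	#define bch_err(_c, _fmt, ...)					\
	do {								\
		(void) (_c);	/* mark the fs argument as used */	\
		fprintf(stderr, _fmt "\n", ##__VA_ARGS__);		\
	} while (0)

That would let callers keep a local struct bch_fs pointer without
(presumably) tripping an unused-variable warning in the userspace
build; this patch instead passes trans->c directly and drops the local.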
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f280ca20d457..78bea3e5fa9a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -744,7 +744,6 @@ err: static int ec_stripe_bkey_update(struct btree_trans *trans, struct bkey_i_stripe *new) { - struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; const struct bch_stripe *existing; @@ -759,7 +758,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, goto err; if (!k.k || k.k->type != KEY_TYPE_stripe) { - bch_err(c, "error updating stripe: not found"); + bch_err(trans->c, "error updating stripe: not found"); ret = -ENOENT; goto err; } @@ -767,7 +766,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, existing = bkey_s_c_to_stripe(k).v; if (existing->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); + bch_err(trans->c, "error updating stripe: nr_blocks does not match"); ret = -EINVAL; goto err; } -- cgit From 522c25f068bd5df5e1bc623e855262afd90e4a05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 21:22:19 -0500 Subject: bcachefs: Fix BCH_REPLICAS_MAX check Ideally, this limit will be going away in the future. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e1b4e6f02ee3..61b947313c88 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -283,19 +283,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return "Bad number of member devices"; if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -- cgit From e4c3f386b62d8f7e917b26089c7dea374f7c1fb6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 16:04:12 -0500 Subject: bcachefs: Improve diagnostics when journal entries are missing There's an outstanding bug with journal entries being missing in journal replay. This patch adds code to print out where the journal entries were physically located that were around the entry(ies) being missing, which should make debugging easier. 
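
With the format strings added below, the resulting fsck error looks
roughly like this (the numbers are invented for illustration, not taken
from a real log):

	journal entries 1000-1002 missing! (replaying 990-1010)
	  prev at 0:17920 (offset 512) size 8
	  next at 1:26112 (offset 0)

i.e. for each entry bordering the gap we now get device:sector for every
copy, plus the offset within its journal bucket and (for the previous
entry) its size - enough to go inspect the journal buckets directly.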
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 8 +++- fs/bcachefs/journal_io.c | 112 ++++++++++++++++++++++++++++++++++++----------- fs/bcachefs/journal_io.h | 4 +- 3 files changed, 96 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d4c5c6306928..ba37c78c01db 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1010,13 +1010,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } list_for_each_entry(i, journal_entries, list) { + unsigned ptr; + seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); if (seq < last_seq) continue; - journal_seq_pin(j, seq)->devs = i->devs; + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 40da18d778a3..e693ebd332d2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -45,15 +45,16 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct bch_extent_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { - struct journal_replay *i, *pos; - struct bch_devs_list devs = { .nr = 0 }; + struct journal_replay *i, *pos, *dup = NULL; + struct bch_extent_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; - int ret; + int ret = JOURNAL_ENTRY_ADD_OK; list_for_each_entry_reverse(i, jlist->head, list) { if (!JSET_NO_FLUSH(&i->j)) { @@ -87,28 +88,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, where = jlist->head; add: - i = where->next != jlist->head + dup = where->next != jlist->head ? container_of(where->next, struct journal_replay, list) : NULL; + if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) + dup = NULL; + /* * Duplicate journal entries? 
If so we want the one that didn't have a * checksum error: */ - if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - if (i->bad) { - devs = i->devs; - __journal_replay_free(i); + if (dup) { + if (dup->bad) { + /* we'll replace @dup: */ } else if (bad) { + i = dup; goto found; } else { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, + fsck_err_on(bytes != vstruct_bytes(&dup->j) || + memcmp(j, &dup->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); + i = dup; goto found; } - } i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -117,17 +121,34 @@ add: goto out; } - list_add(&i->list, where); - i->devs = devs; - i->bad = bad; - i->ignore = false; + i->nr_ptrs = 0; + i->bad = bad; + i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); + + if (dup) { + i->nr_ptrs = dup->nr_ptrs; + memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + __journal_replay_free(dup); + } + + list_add(&i->list, where); found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: return ret; @@ -653,7 +674,10 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j, ret != 0); + ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { + .dev = ca->dev_idx, + .offset = offset, + }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -741,6 +765,23 @@ err: goto out; } +static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + + if (i) + pr_buf(out, " "); + pr_buf(out, "%u:%llu (offset %llu)", + j->ptrs[i].dev, + (u64) j->ptrs[i].offset, + (u64) j->ptrs[i].offset % ca->mi.bucket_size); + } +} + int bch2_journal_read(struct bch_fs *c, struct list_head *list, u64 *blacklist_seq, u64 *start_seq) { @@ -838,6 +879,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; + char buf1[200], buf2[200]; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -852,10 +894,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !bch2_journal_seq_is_blacklisted(c, seq, false)) seq++; + if (i->list.prev != list) { + struct printbuf out = PBUF(buf1); + struct journal_replay *p = list_prev_entry(i, list); + + bch2_journal_ptrs_to_text(&out, c, p); + pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + } else + sprintf(buf1, "(none)"); + bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + missing_end = seq - 1; - fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", missing_start, missing_end, - last_seq, *blacklist_seq - 1); + last_seq, *blacklist_seq - 1, + buf1, buf2); } seq++; @@ -864,7 +919,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; - struct bch_replicas_padded replicas; + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; char buf[80]; if (i->ignore) @@ -874,13 +933,14 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (ret) goto fsck_err; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); - if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 6b4c80968f52..a4931ab93a68 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -8,7 +8,9 @@ */ struct journal_replay { struct list_head list; - struct bch_devs_list devs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + /* checksum error, but we may want to try using it anyways: */ bool bad; bool ignore; -- cgit From a66f7989742a0071ea4d5d0b0674978ac8ce30ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 16:04:38 -0500 Subject: bcachefs: Refactor checking of btree topology Still a lot of work to be done here: we can't yet repair btree topology issues, but this patch refactors things so that we have better access to what we need in the topology checks. Next up will be figuring out a way to do btree updates during gc, before journal replay is done. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 83 +++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6611047dcb0d..6023af960101 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -51,39 +51,46 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) } static int bch2_gc_check_topology(struct bch_fs *c, - struct bkey_s_c k, - struct bpos *expected_start, - struct bpos expected_end, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, bool is_last) { + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? 
node_start + : bkey_successor(prev->k->k.p); + char buf1[200], buf2[200]; int ret = 0; - if (k.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, - "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", - bp.v->min_key.inode, - bp.v->min_key.offset, - expected_start->inode, - expected_start->offset)) { + if (bkey_deleted(&prev->k->k)) + scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", + node_start.inode, + node_start.offset); + else + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + + if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key:\n prev %s\n cur %s", + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { BUG(); } } - *expected_start = bkey_cmp(k.k->p, POS_MAX) - ? bkey_successor(k.k->p) - : k.k->p; - if (fsck_err_on(is_last && - bkey_cmp(k.k->p, expected_end), c, - "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", - k.k->p.inode, - k.k->p.offset, - expected_end.inode, - expected_end.offset)) { + bkey_cmp(cur.k->k.p, node_end), c, + "btree node with incorrect max_key:\n %s\n expected %s", + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { BUG(); } + + bch2_bkey_buf_copy(prev, c, cur.k); fsck_err: return ret; } @@ -169,10 +176,10 @@ fsck_err: static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { - struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; + struct bkey_buf prev, cur; int ret = 0; *max_stale = 0; @@ -181,6 +188,9 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return 0; bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); @@ -192,15 +202,17 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bch2_btree_node_iter_advance(&iter, b); if (b->c.level) { - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, bch2_btree_node_iter_end(&iter)); if (ret) break; } } + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret; } @@ -267,13 +279,14 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bpos next_node_start = b->data->min_key; - struct bkey_buf tmp; + struct bkey_buf cur, prev; u8 max_stale = 0; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); - bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_debugcheck(c, b, k); @@ -288,20 +301,19 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, if (b->c.level) { struct btree *child; - bch2_bkey_buf_reassemble(&tmp, c, k); - k = bkey_i_to_s_c(tmp.k); + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); bch2_btree_and_journal_iter_advance(&iter); - ret 
= bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + ret = bch2_gc_check_topology(c, b, + &prev, cur, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, tmp.k, + child = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, b->c.level - 1); ret = PTR_ERR_OR_ZERO(child); if (ret) @@ -319,7 +331,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, } } - bch2_bkey_buf_exit(&tmp, c); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret; } -- cgit From 51d2dfb82d0553c5764689d30adabbf6d0927be5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 20:13:54 -0500 Subject: bcachefs: Add BTREE_PTR_RANGE_UPDATED This is so that when we discover btree topology issues, we can just update the pointer to a btree node and signal btree read path that the min/max keys in the node header should be updated from the node pointer. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +++-- fs/bcachefs/btree_io.c | 5 +++++ fs/bcachefs/btree_update_interior.c | 1 - fs/bcachefs/extents.c | 8 +++----- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b88a9fdf17ad..a0e445a71c3e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -607,13 +607,14 @@ struct bch_btree_ptr_v2 { __u64 mem_ptr; __le64 seq; __le16 sectors_written; - /* In case we ever decide to do variable size btree nodes: */ - __le16 sectors; + __le16 flags; struct bpos min_key; __u64 _data[0]; struct bch_extent_ptr start[]; } __attribute__((packed, aligned(8))); +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + struct bch_extent { struct bch_val v; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cd2b300043b6..8ac50c9ffcba 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -753,6 +753,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, b, NULL, "incorrect min_key: got %llu:%llu should be %llu:%llu", diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3b19c1c7b450..42015f729da7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -297,7 +297,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bp->v.mem_ptr = 0; bp->v.seq = b->data->keys.seq; bp->v.sectors_written = 0; - bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); } if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a924cc66b4d0..6e388881ebf9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -214,9 +214,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx sectors %u written %u min_key ", + pr_buf(out, "seq %llx written %u min_key ", le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors), le16_to_cpu(bp.v->sectors_written)); bch2_bpos_to_text(out, bp.v->min_key); @@ -1081,10 +1080,9 @@ const char 
*bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) unsigned nonce = UINT_MAX; unsigned i; - if (k.k->type == KEY_TYPE_btree_ptr) + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) size_ondisk = c->opts.btree_node_size; - if (k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) -- cgit From 5b593ee172bd536a2c9fd717de7e4a16d682ef23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 20:15:46 -0500 Subject: bcachefs: Add support for doing btree updates prior to journal replay Some errors may need to be fixed in order for GC to successfully run - walk and mark all metadata. But we can't start the allocators and do normal btree updates until after GC has completed, and allocation information is known to be consistent, so we need a different method of doing btree updates. Fortunately, we already have code for walking the btree while overlaying keys from the journal to be replayed. This patch adds an update path that adds keys to the list of keys to be replayed by journal replay, and also fixes up iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_gc.c | 23 +++--- fs/bcachefs/btree_gc.h | 3 +- fs/bcachefs/recovery.c | 208 ++++++++++++++++++++++++++++++++++++------------- fs/bcachefs/recovery.h | 17 ++-- fs/bcachefs/super.c | 1 + fs/bcachefs/sysfs.c | 2 +- 7 files changed, 176 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 799569d1778a..d5fc5eed73ae 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -539,11 +539,13 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; unsigned level:8; + bool allocated; struct bkey_i *k; u32 journal_seq; u32 journal_offset; } *d; size_t nr; + size_t size; u64 journal_seq_base; }; @@ -840,6 +842,7 @@ mempool_t bio_bounce_pages; struct journal journal; struct list_head journal_entries; struct journal_keys journal_keys; + struct list_head journal_iters; u64 last_bucket_seq_cleanup; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6023af960101..8f347ba5b4e6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -274,7 +274,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, unsigned target_depth) { struct btree_and_journal_iter iter; @@ -283,7 +282,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, u8 max_stale = 0; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -320,7 +319,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, break; ret = bch2_gc_btree_init_recurse(c, child, - journal_keys, target_depth); + target_depth); six_unlock_read(&child->c.lock); if (ret) @@ -333,11 +332,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); return ret; } static int bch2_gc_btree_init(struct bch_fs *c, - struct journal_keys *journal_keys, enum btree_id btree_id) { struct btree *b; @@ -368,8 
+367,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, - journal_keys, target_depth); + ret = bch2_gc_btree_init_recurse(c, b, target_depth); if (!ret) ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), @@ -386,8 +384,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial) +static int bch2_gc_btrees(struct bch_fs *c, bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -399,8 +396,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, journal_keys, - id) + ? bch2_gc_btree_init(c, id) : bch2_gc_btree(c, id, initial); if (ret) return ret; @@ -788,8 +784,7 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial) +int bch2_gc(struct bch_fs *c, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -811,7 +806,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal_keys, initial); + ret = bch2_gc_btrees(c, initial); if (ret) goto out; @@ -1384,7 +1379,7 @@ static int bch2_gc_thread(void *arg) * Full gc is currently incompatible with btree key cache: */ #if 0 - ret = bch2_gc(c, NULL, false, false); + ret = bch2_gc(c, false, false); #else ret = bch2_gc_gens(c); #endif diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index f0435a58793b..fa604efc70cc 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -6,8 +6,7 @@ void bch2_coalesce(struct bch_fs *); -struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool); +int bch2_gc(struct bch_fs *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 422f2fbe6dfb..88a1d47e6e4b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -40,78 +40,169 @@ static void drop_alloc_keys(struct journal_keys *keys) /* iterate over keys read from the journal: */ -static struct journal_key *journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bkey_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +{ + return (cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p)); +} + +static size_t journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if ((cmp_int(id, journal_keys->d[m].btree_id) ?: - cmp_int(level, journal_keys->d[m].level) ?: - bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) l = m + 1; else r = m; } BUG_ON(l < journal_keys->nr && - (cmp_int(id, journal_keys->d[l].btree_id) ?: - cmp_int(level, journal_keys->d[l].level) ?: - bkey_cmp(pos, 
journal_keys->d[l].k->k.p)) > 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); BUG_ON(l && - (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: - cmp_int(level, journal_keys->d[l - 1].level) ?: - bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); - return l < journal_keys->nr ? journal_keys->d + l : NULL; + return l; +} + +static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; + struct btree_and_journal_iter *biter = + container_of(iter, struct btree_and_journal_iter, journal); + + if (iter->idx > idx || + (iter->idx == idx && + biter->last && + bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) + iter->idx++; +} + +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + unsigned idx = journal_key_search(keys, id, level, k->k.p); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = keys->size * 2, + .journal_seq_base = keys->journal_seq_base, + }; + + new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + if (!new_keys.d) + return -ENOMEM; + + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + } + + array_insert_item(keys->d, keys->nr, idx, n); + + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); + + return 0; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i *whiteout = + kmalloc(sizeof(struct bkey), GFP_KERNEL); + int ret; + + if (!whiteout) + return -ENOMEM; + + bkey_init(&whiteout->k); + whiteout->k.p = pos; + + ret = bch2_journal_key_insert(c, id, level, whiteout); + if (ret) + kfree(whiteout); + return ret; } static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - if (iter->k && - iter->k < iter->keys->d + iter->keys->nr && - iter->k->btree_id == iter->btree_id && - iter->k->level == iter->level) - return iter->k->k; + struct journal_key *k = iter->idx - iter->keys->nr + ? 
iter->keys->d + iter->idx : NULL; + + if (k && + k->btree_id == iter->btree_id && + k->level == iter->level) + return k->k; - iter->k = NULL; + iter->idx = iter->keys->nr; return NULL; } static void bch2_journal_iter_advance(struct journal_iter *iter) { - if (iter->k) - iter->k++; + if (iter->idx < iter->keys->nr) + iter->idx++; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); } -static void bch2_journal_iter_init(struct journal_iter *iter, - struct journal_keys *journal_keys, +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, enum btree_id id, unsigned level, struct bpos pos) { iter->btree_id = id; iter->level = level; - iter->keys = journal_keys; - iter->k = journal_key_search(journal_keys, id, level, pos); + iter->keys = &c->journal_keys; + iter->idx = journal_key_search(&c->journal_keys, id, level, pos); + list_add(&iter->list, &c->journal_iters); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) { - return iter->btree - ? bch2_btree_iter_peek(iter->btree) - : bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); } static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) { - if (iter->btree) - bch2_btree_iter_next(iter->btree); - else - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); } void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) @@ -160,7 +251,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * if (iter->b && bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.k = NULL; + iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; } @@ -181,26 +272,20 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * return bch2_btree_and_journal_iter_peek(iter); } -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, - struct btree_trans *trans, - struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { - memset(iter, 0, sizeof(*iter)); - - iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); - bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); + bch2_journal_iter_exit(&iter->journal); } void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct journal_keys *journal_keys, + struct bch_fs *c, struct btree *b) { memset(iter, 0, sizeof(*iter)); iter->b = b; bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(&iter->journal, journal_keys, + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, b->data->min_key); } @@ -244,7 +329,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b int ret = 0; bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ret = key_fn(c, btree_id, b->c.level, k); @@ -277,6 +362,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b } } + bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -333,6 +419,12 @@ static int 
journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_free(struct journal_keys *keys) { + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -361,7 +453,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + keys.size = roundup_pow_of_two(nr_keys); + + keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); if (!keys.d) goto err; @@ -545,14 +639,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, level, k)); + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW; + + if (!k->allocated) + commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + + return bch2_trans_do(c, NULL, NULL, commit_flags, + __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); } static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) @@ -628,7 +724,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->level) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -658,7 +754,7 @@ static int bch2_journal_replay(struct bch_fs *c, ret = i->k->k.size ? bch2_extent_replay_key(c, i->btree_id, i->k) - : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + : bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -1105,7 +1201,7 @@ use_clean: test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true); + ret = bch2_gc(c, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index a66827c9addf..fa91851b9ed7 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -6,10 +6,11 @@ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + struct list_head list; enum btree_id btree_id; unsigned level; + size_t idx; struct journal_keys *keys; - struct journal_key *k; }; /* @@ -17,8 +18,6 @@ struct journal_iter { */ struct btree_and_journal_iter { - struct btree_iter *btree; - struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -32,16 +31,18 @@ struct btree_and_journal_iter { } last; }; +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, - struct btree_trans *, - struct journal_keys *, - enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct 
journal_keys *, + struct bch_fs *, struct btree *); typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bdaea336be85..d451a29b517b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -692,6 +692,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_blacklist_entries_gc); INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 57b1dbe04178..521b6d8d518f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -475,7 +475,7 @@ STORE(bch2_fs) */ #if 0 down_read(&c->state_lock); - bch2_gc(c, NULL, false, false); + bch2_gc(c, false, false); up_read(&c->state_lock); #else bch2_gc_gens(c); -- cgit From a0b73c1c5363f5e2cd9a7a7968a9d6579548050a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jan 2021 20:59:00 -0500 Subject: bcachefs: Add (partial) support for fixing btree topology When we walk the btrees during recovery, part of that is checking that btree topology is correct: for every interior btree node, its child nodes should exactly span the range the parent node covers. Previously, we had checks for this, but not repair code. Now that we have the ability to do btree updates during initial GC, this patch adds that repair code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/btree_cache.c | 43 +++++++++++--- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_gc.c | 146 ++++++++++++++++++++++++++++++++++++---------- fs/bcachefs/recovery.c | 6 +- 5 files changed, 156 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d5fc5eed73ae..19ba23f7d9dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -509,7 +509,8 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_FIXED_GENS, + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4b29be7234c7..443d669e6a30 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -7,6 +7,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "error.h" #include "trace.h" #include @@ -813,9 +814,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -823,7 +827,8 @@ lock_node: struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, const struct bkey_i *k, enum btree_id btree_id, - unsigned level) + unsigned level, + bool nofill) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -838,6 +843,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { + if (nofill) + return NULL; + b = bch2_btree_node_fill(c, NULL, k, btree_id, level, SIX_LOCK_read, true); @@ -884,9 +892,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != btree_id || - BTREE_NODE_LEVEL(b->data) != level || - 
bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -996,8 +1007,22 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), - n2->data->min_key)); + if (bkey_cmp(bkey_successor(n1->key.k.p), + n2->data->min_key)) { + char buf1[200], buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); + + bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" + "prev: %s\n" + "next: %s\n", + bch2_btree_ids[iter->btree_id], level, + buf1, buf2); + + six_unlock_intent(&ret->c.lock); + ret = NULL; + } } bch2_btree_trans_verify_locks(trans); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 0eeca0bcc48e..5fffae92effb 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, - enum btree_id, unsigned); + enum btree_id, unsigned, bool); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8f347ba5b4e6..0dfb1f67225d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -50,6 +50,10 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ static int bch2_gc_check_topology(struct bch_fs *c, struct btree *b, struct bkey_buf *prev, @@ -62,6 +66,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, ? 
node_start : bkey_successor(prev->k->k.p); char buf1[200], buf2[200]; + bool update_min = false; + bool update_max = false; int ret = 0; if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { @@ -75,22 +81,79 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, - "btree node with incorrect min_key:\n prev %s\n cur %s", + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { - BUG(); - } + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) + update_min = true; } if (fsck_err_on(is_last && bkey_cmp(cur.k->k.p, node_end), c, - "btree node with incorrect max_key:\n %s\n expected %s", + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { - BUG(); - } + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) + update_max = true; bch2_bkey_buf_copy(prev, c, cur.k); + + if (update_min || update_max) { + struct bkey_i *new; + struct bkey_i_btree_ptr_v2 *bp = NULL; + struct btree *n; + + if (update_max) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); + if (ret) + return ret; + } + + new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); + if (!new) + return -ENOMEM; + + bkey_copy(new, cur.k); + + if (new->k.type == KEY_TYPE_btree_ptr_v2) + bp = bkey_i_to_btree_ptr_v2(new); + + if (update_min) + bp->v.min_key = expected_start; + if (update_max) + new->k.p = node_end; + if (bp) + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); + + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); + if (ret) { + kfree(new); + return ret; + } + + n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, + b->c.level - 1, true); + if (n) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, n); + + bkey_copy(&n->key, new); + if (update_min) + n->data->min_key = expected_start; + if (update_max) + n->data->max_key = node_end; + + ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&n->c.lock); + } + } fsck_err: return ret; } @@ -147,12 +210,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ptr->dev, PTR_BUCKET_NR(ca, ptr), bch2_data_types[ptr_data_type(k.k, ptr)], ptr->gen, g->mark.gen)) { + /* XXX if it's a cached ptr, drop it */ g2->_mark.gen = g->_mark.gen = ptr->gen; g2->gen_valid = g->gen_valid = true; g2->_mark.data_type = 0; g2->_mark.dirty_sectors = 0; g2->_mark.cached_sectors = 0; - set_bit(BCH_FS_FIXED_GENS, &c->flags); + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } } @@ -298,8 +362,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, break; if (b->c.level) { - struct btree *child; - bch2_bkey_buf_reassemble(&cur, c, k); k = bkey_i_to_s_c(cur.k); @@ -310,26 +372,49 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } - if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, cur.k, - b->c.btree_id, b->c.level - 1); - ret = 
PTR_ERR_OR_ZERO(child); - if (ret) - break; + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - ret = bch2_gc_btree_init_recurse(c, child, - target_depth); - six_unlock_read(&child->c.lock); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; + + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); + child = bch2_btree_node_get_noiter(c, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + + if (fsck_err_on(ret == -EIO, c, + "unreadable btree node")) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); if (ret) - break; + return ret; + + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + continue; } - } else { - bch2_btree_and_journal_iter_advance(&iter); + + if (ret) + break; + + ret = bch2_gc_btree_init_recurse(c, child, + target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; } } - +fsck_err: bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); @@ -816,16 +901,15 @@ again: bch2_mark_allocator_buckets(c); c->gc_count++; -out: - if (!ret && - (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && bch2_test_restart_gc))) { + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { /* * XXX: make sure gens we fixed got saved */ if (iter++ <= 2) { - bch_info(c, "Fixed gens, restarting mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); percpu_down_write(&c->mark_lock); @@ -840,7 +924,7 @@ out: bch_info(c, "Unable to fix bucket gens, looping"); ret = -EINVAL; } - +out: if (!ret) { bch2_journal_block(&c->journal); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 88a1d47e6e4b..f470e0e233ce 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -342,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b bch2_btree_and_journal_iter_advance(&iter); child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1); + b->c.btree_id, b->c.level - 1, + false); ret = PTR_ERR_OR_ZERO(child); if (ret) @@ -766,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_flush_all_pins(j); return bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key", ret); + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[i->btree_id], i->level); return ret; } -- cgit From 5fc70d3a54e4b0e7f7ad1baec564e3987cb6ee0e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Jan 2021 19:08:54 -0500 Subject: bcachefs: Repair bad data pointers Now that we can repair metadata during GC, we can handle bad pointers that would trigger errors being marked, when they need to just be dropped. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 138 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0dfb1f67225d..9e123736a125 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -158,9 +158,101 @@ fsck_err: return ret; } +static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const struct bch_extent_ptr *ptr; + bool do_update = false; + int ret = 0; + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, ptr, false); + + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen)) { + if (!ptr->cached) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen, g->mark.gen)) { + if (!ptr->cached) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen, g->mark.gen)) + do_update = true; + } + + if (do_update) { + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + return -EINVAL; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) + return -ENOMEM; + + bkey_reassemble(new, *k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); + + ret = bch2_journal_key_insert(c, btree_id, level, new); + if (ret) + kfree(new); + else + *k = bkey_i_to_s_c(new); + } +fsck_err: + return ret; +} + /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, +static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c k, u8 *max_stale, bool initial) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -174,7 +266,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); - /* XXX change to fsck check */ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k.k->version.lo, @@ -190,36 +281,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } - 
bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, ptr, false); - - if (mustfix_fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen, g->mark.gen)) { - /* XXX if it's a cached ptr, drop it */ - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - } + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); } bkey_for_each_ptr(ptrs, ptr) { @@ -259,7 +321,8 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, k, max_stale, initial); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, max_stale, initial); if (ret) break; @@ -329,7 +392,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -357,7 +421,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, k, &max_stale, true); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, &max_stale, true); if (ret) break; @@ -455,7 +520,8 @@ static int bch2_gc_btree_init(struct bch_fs *c, ret = bch2_gc_btree_init_recurse(c, b, target_depth); if (!ret) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, true); fsck_err: six_unlock_read(&b->c.lock); -- cgit From d042b0402cff3278d9fa6056cf3d3063bf196716 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Jan 2021 15:37:28 -0500 Subject: bcachefs: Add an option for metadata_target Also, make journal writes obey foreground_target and metadata_target. 
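As a rough illustration of the fallback order this gives journal writes (a minimal userspace sketch only: pick_target() and count_rw_devs_in_target() are hypothetical helpers invented for the example, not bcachefs functions):

/* prefer metadata_target, then foreground_target, then all rw devices */
#include <stdio.h>

struct opts { unsigned metadata_target, foreground_target; };

/* hypothetical: how many rw devices a target can supply (0 == all devices) */
static unsigned count_rw_devs_in_target(unsigned target)
{
	return target ? 1 : 3; /* toy numbers for the example */
}

static unsigned pick_target(const struct opts *o, unsigned replicas_want)
{
	unsigned target = o->metadata_target ?: o->foreground_target;

	if (target && count_rw_devs_in_target(target) < replicas_want)
		target = 0; /* retry from all devices, as the patched allocator does */

	return target;
}

int main(void)
{
	struct opts o = { .metadata_target = 2, .foreground_target = 1 };

	printf("target for 1 replica:  %u\n", pick_target(&o, 1)); /* stays on metadata_target */
	printf("target for 2 replicas: %u\n", pick_target(&o, 2)); /* falls back to all devices */
	return 0;
}
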
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/btree_update_interior.c | 5 ++++- fs/bcachefs/journal_io.c | 15 +++++++++++++-- fs/bcachefs/opts.h | 5 +++++ 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a0e445a71c3e..77af77efdd6d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1309,6 +1309,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); /* * Features: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 42015f729da7..285365ba7012 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, + wp = bch2_alloc_sectors_start(c, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e693ebd332d2..252993ffcb1b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -5,6 +5,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" +#include "disk_groups.h" #include "error.h" #include "io.h" #include "journal.h" @@ -1031,16 +1032,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_journal]); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -1072,6 +1077,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } done: rcu_read_unlock(); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e1a46f97f299..01b93e7eb027 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -136,6 +136,11 @@ enum opt_type { OPT_STR(bch2_str_hash_types), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_FN(bch2_opt_target), \ -- cgit From a28bd48a7fc15c99222059a796b3a3c29184b899 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Jan 2021 13:58:10 -0500 Subject: bcachefs: Add an assertion to check 
for journal writes to same location Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal_io.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 19ba23f7d9dd..bd675b88b354 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -473,6 +473,7 @@ struct bch_dev { atomic64_t rebalance_work; struct journal_device journal; + u64 prev_journal_sector; struct work_struct io_error_work; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 252993ffcb1b..60cefb4ace45 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1286,6 +1286,9 @@ static void do_journal_write(struct closure *cl) bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + if (!JSET_NO_FLUSH(w->data)) bio->bi_opf |= REQ_FUA; if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) -- cgit From 26452d1dcd4b134ecc7aeaae74f78de1c525caf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Feb 2021 15:56:44 -0500 Subject: bcachefs: Add missing call to bch2_replicas_entry_sort() This fixes a bug introduced by "bcachefs: Improve diagnostics when journal entries are missing" - devices in a replicas entry are supposed to be sorted. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 ++ fs/bcachefs/replicas.c | 12 ++++++------ fs/bcachefs/replicas.h | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 60cefb4ace45..7e726db77881 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -937,6 +937,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, for (ptr = 0; ptr < i->nr_ptrs; ptr++) replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + bch2_replicas_entry_sort(&replicas.e); + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 979e9c2b8c74..e5e437deb9ab 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -static void replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, break; } - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } void bch2_devlist_to_replicas(struct bch_replicas_entry *e, @@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, for (i = 0; i < devs.nr; i++) e->devs[e->nr_devs++] = devs.devs[i]; - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } static struct bch_replicas_cpu @@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, int bch2_replicas_entry_idx(struct bch_fs *c, struct bch_replicas_entry *search) { - replicas_entry_sort(search); + bch2_replicas_entry_sort(search); return __replicas_entry_idx(&c->replicas, search); } @@ -690,7 +690,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, for_each_replicas_entry(sb_r, e) { dst = cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 
0; @@ -727,7 +727,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, dst->nr_devs = e->nr_devs; dst->nr_required = 1; memcpy(dst->devs, e->devs, e->nr_devs); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 8b95164fbb56..a16ef23bde8a 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -5,6 +5,7 @@ #include "eytzinger.h" #include "replicas_types.h" +void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -- cgit From 7f4e1d5d0faff0d72e9f6708bf98488d76533846 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Jan 2021 18:01:07 -0500 Subject: bcachefs: KEY_TYPE_alloc_v2 This introduces a new version of KEY_TYPE_alloc, which uses the new varint encoding introduced for inodes. This means we'll eventually be able to support much larger bucket sizes (for SMR devices), and the read/write time fields are expanded to 64 bits - which will be used in the next patch to get rid of the periodic rescaling of those fields. Also, for buckets that are members of erasure coded stripes, this adds persistent fields for the index of the stripe they're members of and the stripe redundancy. This is part of work to get rid of having to scan and read into memory the alloc and stripes btrees at mount time. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 247 ++++++++++++++++++++++++------------ fs/bcachefs/alloc_background.h | 48 +++++-- fs/bcachefs/bcachefs_format.h | 56 +++++---- fs/bcachefs/bkey.h | 1 + fs/bcachefs/buckets.c | 280 +++++++++++++++++++---------------------- fs/bcachefs/buckets_types.h | 3 +- fs/bcachefs/ec.c | 35 ++++-- fs/bcachefs/extents.c | 21 +--- fs/bcachefs/movinggc.c | 11 +- 9 files changed, 403 insertions(+), 299 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 68fa6caf022d..9a670bb2ccfb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -15,6 +15,7 @@ #include "error.h" #include "recovery.h" #include "trace.h" +#include "varint.h" #include #include @@ -24,11 +25,10 @@ #include #include -static const char * const bch2_alloc_field_names[] = { -#define x(name, bytes) #name, - BCH_ALLOC_FIELDS() +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() #undef x - NULL }; static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); @@ -67,10 +67,10 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ -static inline u64 get_alloc_field(const struct bch_alloc *a, - const void **p, unsigned field) +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) @@ -97,10 +97,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, return v; } -static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = 
BCH_ALLOC_V1_FIELD_BYTES[field]; if (!v) return; @@ -127,55 +127,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = 0 }; + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; - if (k.k->type == KEY_TYPE_alloc) { - const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; - const void *d = a->data; - unsigned idx = 0; + out->gen = in->gen; + +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} + +static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); + void *d = a->v.data; + unsigned bytes, idx = 0; - ret.gen = a->gen; + a->k.p = POS(src.dev, src.bucket); + a->v.fields = 0; + a->v.gen = src.gen; -#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() +#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); + BCH_ALLOC_FIELDS_V1() #undef x - } - return ret; + bytes = (void *) d - (void *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); } -void bch2_alloc_pack(struct bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - unsigned idx = 0; - void *d = dst->v.data; + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; + u8 *last_nonzero_field = out; unsigned bytes; - dst->v.fields = 0; - dst->v.gen = src.gen; + a->k.p = POS(src.dev, src.bucket); + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (src._name) { \ + out += bch2_varint_encode(out, src._name); \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + } -#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x + BUG_ON(out > end); + + out = last_nonzero_field; + a->v.nr_fields = last_nonzero_fieldnr; + + bytes = (u8 *) out - (u8 *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, + .bucket = k.k->p.offset, + .gen = 0, + }; - bytes = (void *) d - (void *) &dst->v; - set_bkey_val_bytes(&dst->k, bytes); - memset_u64s_tail(&dst->v, 0, bytes); + if (k.k->type == KEY_TYPE_alloc_v2) + 
bch2_alloc_unpack_v2(&ret, k); + else if (k.k->type == KEY_TYPE_alloc) + bch2_alloc_unpack_v1(&ret, k); + + return ret; +} + +void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) + bch2_alloc_pack_v2(dst, src); + else + bch2_alloc_pack_v1(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) - bytes += BCH_ALLOC_FIELD_BYTES[i]; + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -190,20 +284,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - const void *d = a.v->data; - unsigned i; + struct bkey_alloc_unpacked u; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; - pr_buf(out, "gen %u", a.v->gen); + if (bch2_alloc_unpack_v2(&u, k)) + return "unpack error"; - for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) - if (a.v->fields & (1 << i)) - pr_buf(out, " %s %llu", - bch2_alloc_field_names[i], - get_alloc_field(a.v, &d, i)); + return NULL; +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + pr_buf(out, "gen %u oldest_gen %u data_type %u", + u.gen, u.oldest_gen, u.data_type); +#define x(_name, ...) 
pr_buf(out, #_name " %llu ", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x } static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, @@ -213,7 +317,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, struct bucket *g; struct bkey_alloc_unpacked u; - if (level || k.k->type != KEY_TYPE_alloc) + if (level || + (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -281,8 +387,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; int ret; retry: bch2_trans_begin(trans); @@ -303,17 +408,14 @@ retry: ca = bch_dev_bkey_exists(c, iter->pos.inode); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); + new_u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, new_u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -473,9 +575,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter *iter; struct bucket *g; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u16 *time; + u64 *time; int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), @@ -486,28 +588,24 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - time = rw == READ ? 
&u.read_time : &u.write_time; if (*time == c->bucket_clock[rw].hand) goto out; *time = c->bucket_clock[rw].hand; - bch2_alloc_pack(a, u); - - ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_put(trans, iter); @@ -863,14 +961,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif struct bch_fs *c = trans->c; - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; @@ -920,8 +1012,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: ret = bch2_btree_iter_traverse(iter); @@ -931,7 +1021,7 @@ retry: percpu_down_read(&c->mark_lock); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); @@ -944,11 +1034,8 @@ retry: u.read_time = c->bucket_clock[READ].hand; u.write_time = c->bucket_clock[WRITE].hand; - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_BUCKET_INVALIDATE); /* diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f60fcebff2ce..6fededcd9f86 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -7,12 +7,33 @@ #include "debug.h" struct bkey_alloc_unpacked { + u64 bucket; + u8 dev; u8 gen; + u8 oldest_gen; + u8 data_type; #define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + + union { + struct { +#define x(_name, _bits) + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; +#undef x + } _v1; + struct { +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; +#undef x + } _v2; + }; +} __attribute__((packed, aligned(8))); + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -20,23 +41,28 @@ struct bkey_alloc_unpacked { static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) { - return l.gen != r.gen -#define x(_name, _bits) || l._name != r._name - BCH_ALLOC_FIELDS() + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type +#define x(_name, ...) 
|| l._name != r._name + BCH_ALLOC_FIELDS_V2() #undef x ; } struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bkey_i_alloc *, +void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, const struct bkey_alloc_unpacked); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +alloc_mem_to_key(struct btree_iter *iter, + struct bucket *g, struct bucket_mark m) { return (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, .gen = m.gen, .oldest_gen = g->oldest_gen, .data_type = m.data_type, @@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m) #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 77af77efdd6d..b6c7e57b6bcd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -345,7 +345,8 @@ static inline void bkey_init(struct bkey *k) x(reflink_v, 16) \ x(inline_data, 17) \ x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -555,9 +556,11 @@ struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, block:8, - idx:51; + redundancy:4, + idx:47; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:51, + __u64 idx:47, + redundancy:4, block:8, type:5; #endif @@ -803,35 +806,40 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); -#define BCH_ALLOC_FIELDS() \ +#define BCH_ALLOC_FIELDS_V1() \ x(read_time, 16) \ x(write_time, 16) \ x(data_type, 8) \ x(dirty_sectors, 16) \ x(cached_sectors, 16) \ - x(oldest_gen, 8) + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) enum { -#define x(name, bytes) BCH_ALLOC_FIELD_##name, - BCH_ALLOC_FIELDS() +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() #undef x BCH_ALLOC_FIELD_NR }; -static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, - BCH_ALLOC_FIELDS() -#undef x -}; - -#define x(name, bits) + (bits / 8) -static const unsigned BKEY_ALLOC_VAL_U64s_MAX = - DIV_ROUND_UP(offsetof(struct bch_alloc, data) - BCH_ALLOC_FIELDS(), sizeof(u64)); -#undef x - -#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) - /* Quotas: */ enum quota_types { @@ -1337,7 +1345,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 
28); x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ x(new_varint, 15) \ - x(journal_no_flush, 16) + x(journal_no_flush, 16) \ + x(alloc_v2, 17) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ @@ -1345,7 +1354,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + (1ULL << BCH_FEATURE_journal_no_flush)| \ + (1ULL << BCH_FEATURE_alloc_v2)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index f984064f4b5d..9fd752b5c2f5 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -538,6 +538,7 @@ BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); BKEY_VAL_ACCESSORS(btree_ptr_v2); BKEY_VAL_ACCESSORS(indirect_inline_data); +BKEY_VAL_ACCESSORS(alloc_v2); /* byte order helpers */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1be527ab1416..7b60e988df83 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -688,7 +688,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc) + if (new.k->type != KEY_TYPE_alloc && + new.k->type != KEY_TYPE_alloc_v2) return 0; /* @@ -711,6 +712,7 @@ static int bch2_mark_alloc(struct bch_fs *c, m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + m.stripe = u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -724,6 +726,8 @@ static int bch2_mark_alloc(struct bch_fs *c, g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; /* * need to know if we're getting called from the invalidate path or @@ -918,11 +922,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, +static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, unsigned ptr_idx, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool enabled) + u64 journal_seq, unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; @@ -935,8 +938,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, char buf[200]; int ret; - if (enabled) - g->ec_redundancy = s->nr_redundant; + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EINVAL; + } old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, @@ -944,23 +952,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, if (ret) return ret; - if (new.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!new.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - new.stripe = enabled; - - if 
((flags & BTREE_TRIGGER_GC) && parity) { - new.data_type = enabled ? BCH_DATA_parity : 0; - new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; + if (parity) { + new.data_type = BCH_DATA_parity; + new.dirty_sectors = le16_to_cpu(s->sectors); } if (journal_seq) { @@ -969,8 +963,8 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, } })); - if (!enabled) - g->ec_redundancy = 0; + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); return 0; @@ -1166,6 +1160,8 @@ static int bch2_mark_stripe(struct bch_fs *c, unsigned i; int ret; + BUG_ON(gc && old_s); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); @@ -1173,48 +1169,12 @@ static int bch2_mark_stripe(struct bch_fs *c, } if (!new_s) { - /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) { - ret = bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - - if (!gc && m->on_heap) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - } - - if (gc) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); - BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); - - for (i = 0; i < new_s->nr_blocks; i++) { - if (!old_s || - memcmp(new_s->ptrs + i, - old_s->ptrs + i, - sizeof(struct bch_extent_ptr))) { - - if (old_s) { - bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - ret = bucket_set_stripe(c, new, i, fs_usage, - journal_seq, flags, true); - if (ret) - return ret; - } - } - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; @@ -1223,27 +1183,13 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - unsigned s = stripe_blockcount_get(new_s, i); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - if (!gc) - m->block_sectors[i] = s; - m->blocks_nonempty += !!s; + m->block_sectors[i] = + stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; } - if (gc && old_s) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); - bch2_bkey_to_replicas(&m->r.e, new); - if (gc) - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); @@ -1251,6 +1197,25 @@ static int bch2_mark_stripe(struct bch_fs *c, } } + if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(c, new, i, fs_usage, + journal_seq, flags); + if (ret) + return ret; + } + + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); + } + return 0; } @@ -1274,6 +1239,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: @@ -1542,9 +1508,10 @@ static int trans_get_key(struct btree_trans *trans, return ret; } 
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) +static struct bkey_alloc_buf * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -1552,8 +1519,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree struct bucket *g; struct btree_iter *iter; struct bkey_s_c k; + struct bkey_alloc_buf *a; int ret; + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return a; + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); if (iter) { *u = bch2_alloc_unpack(k); @@ -1565,17 +1537,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_put(trans, iter); - return ret; + return ERR_PTR(ret); } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } *_iter = iter; - return 0; + return a; } static int bch2_trans_mark_pointer(struct btree_trans *trans, @@ -1585,27 +1557,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1716,34 +1681,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - const struct bch_extent_ptr *ptr, - s64 sectors, bool parity) + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { - struct bkey_i_alloc *a; + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; struct btree_iter *iter; struct bkey_alloc_unpacked u; - int ret; + bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + int ret = 0; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); if (parity) { + s64 sectors = le16_to_cpu(s.v->sectors); + + if (deleting) + sectors = -sectors; + u.dirty_sectors += sectors; u.data_type = u.dirty_sectors ? 
BCH_DATA_parity : 0; } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + if (!deleting) { + if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, + "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + iter->pos.inode, iter->pos.offset, u.gen, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { + u.stripe = 0; + u.stripe_redundancy = 0; + } + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); err: bch2_trans_iter_put(trans, iter); return ret; @@ -1753,51 +1735,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; + struct bkey_s_c_stripe old_s = { NULL }; + struct bkey_s_c_stripe new_s = { NULL }; struct bch_replicas_padded r; unsigned i; int ret = 0; + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old); + if (new.k->type == KEY_TYPE_stripe) + new_s = bkey_s_c_to_stripe(new); + /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s && old_s && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s.k && old_s.k && + new_s.v->nr_blocks == old_s.v->nr_blocks && + new_s.v->nr_redundant == old_s.v->nr_redundant && + !memcmp(old_s.v->ptrs, new_s.v->ptrs, + new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - if (new_s) { - unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; - s64 sectors = le16_to_cpu(new_s->sectors); + if (new_s.k) { + s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - for (i = 0; i < new_s->nr_blocks; i++) { - bool parity = i >= nr_data; - - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &new_s->ptrs[i], sectors, parity); + for (i = 0; i < new_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, + i, false); if (ret) return ret; } } - if (old_s) { - unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + if (old_s.k) { + s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); - for (i = 0; i < old_s->nr_blocks; i++) { - bool parity = i >= nr_data; - - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &old_s->ptrs[i], sectors, parity); + for (i = 0; i < old_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, + i, true); if (ret) return ret; } @@ -2068,21 +2049,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { .dev = ca->dev_idx, .offset = bucket_to_sector(ca, b), }; int ret = 0; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - 
ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); if (u.data_type && u.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -2115,10 +2091,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.data_type = type; u.dirty_sectors = sectors; - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 98b6c18ca2e8..99ab9f48ba9d 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -41,7 +41,8 @@ struct bucket { u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; - u8 ec_redundancy; + u8 stripe_redundancy; + u32 stripe; }; struct bucket_array { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 78bea3e5fa9a..a32d399e5b6f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + if (!bkey_cmp(k.k->p, POS_MIN)) + return "stripe at pos 0"; + if (k.k->p.inode) return "invalid stripe key"; @@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); + bch_err_ratelimited(c, - "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx", - i, j, v->csum_type, - want.lo, got.lo); + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, + want.lo, got.lo, buf2); clear_bit(i, buf->valid); break; } @@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; @@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? 
"reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + bio_put(&ec_bio->bio); percpu_ref_put(&ca->io_ref); closure_put(cl); @@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, *dst = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, + .redundancy = s->key.v.nr_redundant, .idx = s->key.k.p.offset, }; } @@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; - //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); - ec = ob->ec; mutex_lock(&ec->lock); @@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c, struct stripe *m; size_t heap_idx; u64 stripe_idx; + s64 ret = -1; if (may_create_new_stripe(c)) return -1; spin_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; @@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c, m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { bch2_stripes_heap_del(c, m, stripe_idx); - spin_unlock(&c->ec_stripes_heap_lock); - return stripe_idx; + ret = stripe_idx; + break; } } - spin_unlock(&c->ec_stripes_heap_lock); - return -1; + return ret; } struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6e388881ebf9..50ab240d89a8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -703,14 +703,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) if (p.ptr.cached) continue; - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - WARN_ON(!s); - if (s) - replicas += s->nr_redundant; - } + if (p.has_ec) + replicas += p.ec.redundancy; replicas++; @@ -733,16 +727,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; + if (p.has_ec) + durability += p.ec.redundancy; - durability += s->nr_redundant; - } -out: return durability; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a867460bc71c..8e6e4cd73886 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = p.ptr.dev; - if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); - - data_opts->nr_replicas += m->nr_redundant; - } + if (p.has_ec) + data_opts->nr_replicas += p.ec.redundancy; return DATA_REWRITE; } @@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - WARN_ON(m.stripe && !g->ec_redundancy); + WARN_ON(m.stripe && !g->stripe_redundancy); e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, - .replicas = 1 + g->ec_redundancy, + .replicas = 1 + g->stripe_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = 
bucket_sectors_used(m),
-- cgit

From 2abe542087d9cb1bc7bb8ac7ae262afccbdb7aa6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 21 Jan 2021 15:28:59 -0500
Subject: bcachefs: Persist 64 bit io clocks

Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket was
read from. But, this required periodically rescaling counters on every
bucket to avoid wraparound. That wasn't an issue in bcache, where we'd
periodically rewrite the per bucket metadata all at once, but in bcachefs
we're trying to avoid having to walk every single bucket.

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.

Signed-off-by: Kent Overstreet
Signed-off-by: Kent Overstreet
---
 fs/bcachefs/alloc_background.c | 225 ++++----------------------------
 fs/bcachefs/alloc_types.h | 24 -----
 fs/bcachefs/bcachefs.h | 11 --
 fs/bcachefs/bcachefs_format.h | 18 +++-
 fs/bcachefs/btree_gc.c | 6 +-
 fs/bcachefs/buckets.h | 9 +-
 fs/bcachefs/buckets_types.h | 2 +-
 fs/bcachefs/clock.c | 8 +-
 fs/bcachefs/clock_types.h | 2 +-
 fs/bcachefs/journal.c | 3 +
 fs/bcachefs/journal_io.c | 33 +++++-
 fs/bcachefs/movinggc.c | 4 +-
 fs/bcachefs/rebalance.c | 10 +-
 fs/bcachefs/rebalance_types.h | 2 +-
 fs/bcachefs/recovery.c | 19 ++--
 fs/bcachefs/super-io.c | 60 +++++------
 fs/bcachefs/super-io.h | 5 +-
 fs/bcachefs/super.c | 6 --
 fs/bcachefs/sysfs.c | 4 +-
 19 files changed, 141 insertions(+), 310 deletions(-)

(limited to 'fs')

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9a670bb2ccfb..bba83011b18b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-	struct bch_dev *ca;
-	unsigned i;
-	int ret = 0;
+	int ret;
 
 	down_read(&c->gc_lock);
 	ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 	bch2_dev_usage_from_buckets(c);
 	percpu_up_write(&c->mark_lock);
 
-	mutex_lock(&c->bucket_clock[READ].lock);
-	for_each_member_device(ca, c, i) {
-		down_read(&ca->bucket_lock);
-		bch2_recalc_oldest_io(c, ca, READ);
-		up_read(&ca->bucket_lock);
-	}
-	mutex_unlock(&c->bucket_clock[READ].lock);
-
-	mutex_lock(&c->bucket_clock[WRITE].lock);
-	for_each_member_device(ca, c, i) {
-		down_read(&ca->bucket_lock);
-		bch2_recalc_oldest_io(c, ca, WRITE);
-		up_read(&ca->bucket_lock);
-	}
-	mutex_unlock(&c->bucket_clock[WRITE].lock);
-
 	return 0;
 }
 
@@ -460,114 +440,6 @@ err:
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-	struct bucket_clock *clock = &c->bucket_clock[rw];
-	struct bucket_array *buckets = bucket_array(ca);
-	struct bucket *g;
-	u16 max_last_io = 0;
-	unsigned i;
-
-	lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-	/* Recalculate max_last_io for this device: */
-	for_each_bucket(g, buckets)
-		max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-	ca->max_last_bucket_io[rw] = max_last_io;
-
-	/* Recalculate global max_last_io: */
-
max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); -} - int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { @@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bucket *g; struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u64 *time; + u64 *time, now; int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), @@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, percpu_up_read(&c->mark_lock); time = rw == READ ? 
&u.read_time : &u.write_time; - if (*time == c->bucket_clock[rw].hand) + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) goto out; - *time = c->bucket_clock[rw].hand; + *time = now; bch2_alloc_pack(c, a, u); ret = bch2_trans_update(trans, iter, &a->k, 0) ?: @@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) return false; - if (mark.owned_by_allocator) + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. 
*/ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; - - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; - - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); + unsigned used = bucket_sectors_used(m); - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. 
*/ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -1031,8 +892,8 @@ retry: u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); bch2_alloc_pack(c, &a, u); bch2_trans_update(trans, iter, &a.k, @@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 1abfff5290bc..be164d6108bb 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,30 +10,6 @@ struct ec_bucket_buf; -/* There's two of these clocks, one for reads and one for writes: */ -struct bucket_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. - * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. - */ - u16 hand; - u16 max_last_io; - - int rw; - - struct io_timer rescale; - struct mutex lock; -}; - enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bd675b88b354..763cac0efa0c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -451,9 +451,6 @@ struct bch_dev { size_t fifo_last_bucket; - /* last calculated minimum prio */ - u16 max_last_bucket_io[2]; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -693,14 +690,6 @@ struct bch_fs { struct mutex usage_scratch_lock; struct bch_fs_usage_online *usage_scratch; - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. 
- */ - struct bucket_clock bucket_clock[2]; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b6c7e57b6bcd..5dab5bfd228a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1143,8 +1143,8 @@ struct bch_sb_field_clean { struct bch_sb_field field; __le32 flags; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; __le64 journal_seq; union { @@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist, 3) \ x(blacklist_v2, 4) \ x(usage, 5) \ - x(data_usage, 6) + x(data_usage, 6) \ + x(clock, 7) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1559,6 +1560,13 @@ struct jset_entry_data_usage { struct bch_replicas_entry r; } __attribute__((packed)); +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1581,8 +1589,8 @@ struct jset { __u8 encrypted_start[0]; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; /* Sequence number of oldest dirty journal entry */ __le64 last_seq; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9e123736a125..5ea9bae09d59 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); + unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); int ret; @@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg) if (c->btree_gc_periodic) { unsigned long next = last + c->capacity / 16; - if (atomic_long_read(&clock->now) >= next) + if (atomic64_read(&clock->now) >= next) break; bch2_io_clock_schedule_timeout(clock, next); @@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg) } __set_current_state(TASK_RUNNING); - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); last_kick = atomic_read(&c->kick_gc); /* diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4103ea7e769a..50989d286190 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -{ - return c->bucket_clock[rw].hand - g->io_time[rw]; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
*/ -static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +static inline u8 bucket_gc_gen(struct bucket *g) { - struct bucket *g = bucket(ca, b); - return g->mark.gen - g->oldest_gen; } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 99ab9f48ba9d..b6ea67506cc2 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -37,7 +37,7 @@ struct bucket { const struct bucket_mark mark; }; - u16 io_time[2]; + u64 io_time[2]; u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 869ba1887757..da91c95e3ffc 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); @@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now = atomic_long_add_return(sectors, &clock->now); + unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); @@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned i; spin_lock(&clock->timer_lock); - now = atomic_long_read(&clock->now); + now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) pr_buf(out, "%ps:\t%li\n", @@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock) { - atomic_long_set(&clock->now, 0); + atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 92c740a47565..5fae0012d808 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -26,7 +26,7 @@ struct io_timer { typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { - atomic_long_t now; + atomic64_t now; u16 __percpu *pcpu_buf; unsigned max_slop; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ba37c78c01db..379b9ad2c0f9 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j) j->entry_u64s_reserved += BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); + j->entry_u64s_reserved += + 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); + atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7e726db77881..a82548983dbd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -426,6 +426,32 @@ fsck_err: return ret; } +static int journal_entry_validate_clock(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, "invalid journal entry clock: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, "invalid journal 
entry clock: bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, int); @@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl) end = bch2_btree_roots_to_journal_entries(c, jset->start, end); - end = bch2_journal_super_entries_add_common(c, end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl) journal_write_compact(jset); - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 8e6e4cd73886..e2472c19beaf 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last, wait; + u64 last, wait; set_freezable(); @@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg) if (kthread_wait_freezable(c->copy_gc_enabled)) break; - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index f9a12dd797a5..2263ee41c444 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; - unsigned long io_start; + u64 io_start; long throttle; set_freezable(); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = rebalance_work(c); prev_start = jiffies; prev_cputime = curr_cputime(); @@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) (20 - w.dev_most_full_percent), 50); - if (atomic_long_read(&clock->now) + clock->max_slop < + if (atomic64_read(&clock->now) + clock->max_slop < r->throttled_until_iotime) { r->throttled_until_cputime = start + throttle; r->state = REBALANCE_THROTTLED; @@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) max(p.dev_most_full_percent, 1U) / max(w.dev_most_full_percent, 1U)); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = w; prev_start = start; prev_cputime = cputime; @@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - - atomic_long_read(&c->io_clock[WRITE].now)) << 9); + atomic64_read(&c->io_clock[WRITE].now)) << 9); pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 192c6be20ced..2f62a643c39f 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -17,7 +17,7 @@ struct bch_fs_rebalance { atomic64_t work_unknown_dev; enum rebalance_state state; - unsigned long throttled_until_iotime; + u64 throttled_until_iotime; unsigned long throttled_until_cputime; struct 
bch_move_stats move_stats; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f470e0e233ce..55f7771e11c8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(bl_entry->end) + 1); break; } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, clock->time); + } } return ret; @@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c, int ret; if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { @@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c, if (i->ignore) continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) @@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c, return 0; } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock %u doesn't match journal %u after clean shutdown", - clean->read_clock, j->read_clock); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock write clock %u doesn't match journal %u after clean shutdown", - clean->write_clock, j->write_clock); - for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; struct bkey_i *k1, *k2; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 61b947313c88..3b082da934fb 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) return ret; } -static void -entry_init_u64s(struct jset_entry *entry, unsigned u64s) +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) { - memset(entry, 0, u64s * sizeof(u64)); + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + memset(entry, 0, u64s * sizeof(u64)); /* * The u64s field counts from the start of data, ignoring the shared * fields. 
*/ entry->u64s = u64s - 1; -} -static void -entry_init_size(struct jset_entry *entry, size_t size) -{ - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - entry_init_u64s(entry, u64s); + *end = vstruct_next(*end); + return entry; } -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry, - u64 journal_seq) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { unsigned i; @@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); - - entry = vstruct_next(entry); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); - - entry = vstruct_next(entry); } for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - - entry = vstruct_next(entry); } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); - entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), "embedded variable length struct"); - - entry = vstruct_next(entry); } percpu_up_read(&c->mark_lock); - return entry; + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = atomic64_read(&c->io_clock[i].now); + } } void bch2_fs_mark_clean(struct bch_fs *c) @@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry, 0); + bch2_journal_super_entries_add_common(c, &entry, 0); entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 402ae563b3c7..dd8d4ba911f0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct 
bch_member *mi) /* BCH_SB_FIELD_clean: */ -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *, u64); +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d451a29b517b..5f5893ab9edf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 521b6d8d518f..8fdbeaf9df32 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, { int rw = (private ? 1 : 0); - return bucket_last_io(c, bucket(ca, b), rw); + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; } static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, @@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, b); + return bucket_gc_gen(bucket(ca, b)); } static int unsigned_cmp(const void *_l, const void *_r) -- cgit From 180fb49dea90dfbac591b9b201a4dfb75159f5f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jan 2021 21:52:06 -0500 Subject: bcachefs: Journal updates to dev usage This eliminates the need to scan every bucket to regenerate dev_usage at mount time. 
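How many jset_entry_dev_usage_type slots a given entry carries is recomputed
from the entry's size on the read side rather than assumed from BCH_DATA_NR.
A minimal sketch of that derivation, using the structs this patch adds and
helpers already in the tree (the function name here is made up; the recovery
code below open-codes the same arithmetic):

	static unsigned dev_usage_entry_nr_types(struct jset_entry *entry)
	{
		unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);

		return (bytes - sizeof(struct jset_entry_dev_usage)) /
			sizeof(struct jset_entry_dev_usage_type);
	}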
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 -- fs/bcachefs/bcachefs.h | 6 ++- fs/bcachefs/bcachefs_format.h | 20 +++++++- fs/bcachefs/btree_gc.c | 38 ++++++++++----- fs/bcachefs/buckets.c | 102 +++++++++++++++++++++++++---------------- fs/bcachefs/buckets.h | 7 ++- fs/bcachefs/journal_io.c | 37 +++++++++++++++ fs/bcachefs/recovery.c | 21 +++++++++ fs/bcachefs/super-io.c | 22 ++++++++- fs/bcachefs/super.c | 37 +++++++++------ 10 files changed, 219 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bba83011b18b..aadd878b357d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - percpu_down_write(&c->mark_lock); - bch2_dev_usage_from_buckets(c); - percpu_up_write(&c->mark_lock); - return 0; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 763cac0efa0c..0c24a5312e49 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -429,7 +429,9 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage[2]; + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -582,6 +584,8 @@ struct bch_fs { struct journal_entry_res replicas_journal_res; + struct journal_entry_res dev_usage_journal_res; + struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5dab5bfd228a..9048441cfa55 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1512,7 +1512,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist_v2, 4) \ x(usage, 5) \ x(data_usage, 6) \ - x(clock, 7) + x(clock, 7) \ + x(dev_usage, 8) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1567,6 +1568,23 @@ struct jset_entry_clock { __le64 time; } __attribute__((packed)); +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __attribute__((packed)); + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 buckets_unavailable; + + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5ea9bae09d59..d44b9c079fde 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->mi.nbuckets * sizeof(struct bucket)); ca->buckets[1] = NULL; - free_percpu(ca->usage[1]); - ca->usage[1] = NULL; + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; } free_percpu(c->usage_gc); @@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c, struct bch_dev *ca; bool verify = (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); - unsigned i; + unsigned i, dev; int ret = 0; #define copy_field(_f, _msg, ...) 
\ @@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c, } } - for_each_member_device(ca, c, i) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { struct bucket_array *dst = __bucket_array(ca, 0); struct bucket_array *src = __bucket_array(ca, 1); size_t b; @@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c, dst->b[b].oldest_gen = src->b[b].oldest_gen; } - }; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); + { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); - bch2_dev_usage_from_buckets(c); + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + } + }; { unsigned nr = fs_usage_u64s(c); @@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c) for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); - BUG_ON(ca->usage[1]); + BUG_ON(ca->usage_gc); ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), @@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c) return -ENOMEM; } - ca->usage[1] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[1]) { - bch_err(c, "error allocating ca->usage[gc]"); + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); return -ENOMEM; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7b60e988df83..65ae89c80590 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; + struct bch_dev *ca; unsigned i; percpu_down_write(&c->mark_lock); @@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c) fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); + + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + percpu_up_write(&c->mark_lock); } @@ -189,14 +198,27 @@ out_pool: return ret; } +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +{ + return this_cpu_ptr(gc + ? 
ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { + struct bch_fs *c = ca->fs; struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); - memset(&ret, 0, sizeof(ret)); - acc_u64s_percpu((u64 *) &ret, - (u64 __percpu *) ca->usage[0], - sizeof(ret) / sizeof(u64)); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } @@ -264,7 +286,8 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - unsigned u64s = fs_usage_u64s(c); + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -275,6 +298,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) (u64 __percpu *) c->usage[idx], u64s); percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + write_seqcount_end(&c->usage_lock); preempt_enable(); } @@ -459,14 +492,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, - bool gc) + u64 journal_seq, bool gc) { struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - u = this_cpu_ptr(ca->usage[gc]); + u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) account_bucket(fs_usage, u, bucket_type(old), @@ -493,31 +526,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -__flatten -void bch2_dev_usage_from_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct bucket_mark old = { .v.counter = 0 }; - struct bucket_array *buckets; - struct bucket *g; - unsigned i; - int cpu; - - c->usage_base->hidden = 0; - - for_each_member_device(ca, c, i) { - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(ca->usage[0], cpu), 0, - sizeof(*ca->usage[0])); - - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, c->usage_base, - old, g->mark, false); - } -} - static inline int update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -656,7 +664,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + /* + * XXX: this is wrong, this means we'll be doing updates to the percpu + * buckets_alloc counter that don't have an open journal buffer and + * we'll race with the machinery that accumulates that to ca->usage_base + */ + bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -720,7 +733,7 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -785,7 +798,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (c) 
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, gc); + old, new, 0, gc); return 0; } @@ -966,7 +979,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); return 0; } @@ -1033,7 +1046,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); @@ -2389,13 +2402,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca) sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage[0]); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -ENOMEM; + } + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 50989d286190..c965c4d48218 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *); - static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { @@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} + void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a82548983dbd..df5b375c367f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -452,6 +452,43 @@ fsck_err: return ret; } +static int journal_entry_validate_dev_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, "invalid journal entry dev usage: bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, "invalid journal entry dev usage: bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, "invalid journal entry dev usage: bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct 
jset *, struct jset_entry *, int); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 55f7771e11c8..7ba098adcab9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_data_usage: { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); + unsigned i; + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); + + for (i = 0; i < nr_types; i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } case BCH_JSET_ENTRY_blacklist: { struct jset_entry_blacklist *bl_entry = container_of(entry, struct jset_entry_blacklist, entry); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 3b082da934fb..0356541c00e2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -986,7 +986,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - unsigned i; + struct bch_dev *ca; + unsigned i, dev; percpu_down_read(&c->mark_lock); @@ -1041,6 +1042,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, "embedded variable length struct"); } + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + percpu_up_read(&c->mark_lock); for (i = 0; i < 2; i++) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5f5893ab9edf..eecabeb08c94 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -155,6 +155,22 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) return c; } +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, nr = 0, u64s = + (sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + nr++; + rcu_read_unlock(); + + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); +} + /* Filesystem RO/RW: */ /* @@ -780,6 +796,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_fsio_init(c)) goto err; + bch2_dev_usage_journal_reserve(c); + mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if 
(bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -1516,6 +1534,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_RW && @@ -1525,19 +1545,6 @@ err: return ret; } -static void dev_usage_clear(struct bch_dev *ca) -{ - struct bucket_array *buckets; - - percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); - up_read(&ca->bucket_lock); -} - /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1595,8 +1602,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - dev_usage_clear(ca); - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1650,6 +1655,8 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); + bch2_dev_usage_journal_reserve(c); + err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) -- cgit From 91f6ad6f947c96545eb7790569b279fdeac06153 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Feb 2021 17:08:54 -0500 Subject: bcachefs: Include device in btree IO error messages Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 76 +++++++++++++++++++++++++++----------------------- fs/bcachefs/btree_io.h | 3 +- fs/bcachefs/debug.c | 2 +- 3 files changed, 44 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8ac50c9ffcba..6e656ed6b32a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -608,11 +608,16 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, } static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node %sat btree ", - write ? "before write " : ""); + pr_buf(out, "error validating btree node "); + if (write) + pr_buf(out, "before write "); + if (ca) + pr_buf(out, "on %s ", ca->name); + pr_buf(out, "at btree "); btree_pos_to_text(out, c, b); pr_buf(out, "\n node offset %u", b->written); @@ -631,7 +636,7 @@ enum btree_validate_ret { BTREE_RETRY_READ = 64, }; -#define btree_err(type, c, b, i, msg, ...) \ +#define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ __label__ out; \ char _buf[300]; \ @@ -642,7 +647,7 @@ enum btree_validate_ret { if (buf2) \ out = _PBUF(buf2, 4986); \ \ - btree_err_msg(&out, c, b, i, b->written, write); \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ @@ -691,9 +696,9 @@ out: \ #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) -static int validate_bset(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors, - int write, bool have_retry) +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned sectors, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -702,18 +707,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b, btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "unsupported bset version"); if (btree_err_on(b->written + sectors > c->opts.btree_node_size, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; return 0; } btree_err_on(b->written && !i->u64s, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "empty bset"); if (!b->written) { @@ -727,16 +732,16 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { @@ -759,7 +764,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %llu:%llu should be %llu:%llu", b->data->min_key.inode, b->data->min_key.offset, @@ -768,7 +773,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %llu:%llu", bn->max_key.inode, bn->max_key.offset); @@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, #endif err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -825,14 +830,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, const char *invalid; if (btree_err_on(bkey_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -855,7 +860,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey: %s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -889,7 +894,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(c, b, i, 0); - btree_err(BTREE_ERR_FATAL, c, b, i, + 
btree_err(BTREE_ERR_FATAL, c, NULL, b, i, "keys out of order: %s > %s", buf1, buf2); /* XXX: repair this */ @@ -902,7 +907,8 @@ fsck_err: return ret; } -int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -919,15 +925,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, + btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad magic"); btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad btree header"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -935,7 +941,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); } @@ -950,7 +956,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -958,7 +964,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); @@ -978,7 +984,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -986,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); @@ -994,7 +1000,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, i, sectors, + ret = validate_bset(c, ca, b, i, sectors, READ, have_retry); if (ret) goto fsck_err; @@ -1016,7 +1022,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "first btree node bset has blacklisted journal seq"); if (blacklisted && !first) continue; @@ -1033,7 +1039,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_byte_offset(b, bne) < btree_bytes(c); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq, - BTREE_ERR_WANT_RETRY, c, b, NULL, + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, "found bset signature after last bset"); sorted = 
btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1068,7 +1074,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey %s: %s", buf, invalid); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1158,7 +1164,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, can_retry)) + !bch2_btree_node_read_done(c, ca, b, can_retry)) break; if (!can_retry) { @@ -1465,7 +1471,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) return -1; - ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); if (ret) { bch2_inconsistent_error(c); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 3b61555ef906..89685bd57fc0 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -134,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct bch_fs *, struct btree *, struct btree_iter *); -int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); +int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 0d5ec39e44e0..3ac700dc72d5 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); - if (bch2_btree_node_read_done(c, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false)) goto out; n_sorted = c->verify_data->data; -- cgit From 4b8f89afd44592d50f7309750e7835fc777dfb08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Feb 2021 13:10:55 -0500 Subject: bcachefs: Fixes/improvements for journal entry reservations This fixes some arithmetic bugs in "bcachefs: Journal updates to dev usage" - additionally, it cleans things up by switching everything that goes in every journal entry to the journal_entry_res mechanism. 
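One of the arithmetic fixes, as a rough worked example: the per-device usage
reservation was computed in bytes, but journal_entry_res reservations count
u64s. Assuming the current struct layouts and BCH_DATA_NR == 7, the numbers
work out as follows (illustrative names and comments, not the exact in-tree
code):

	unsigned u64s_per_dev =
		(sizeof(struct jset_entry_dev_usage) +				/* 32 bytes */
		 sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)	/* + 7 * 24 bytes */
		/ sizeof(u64);							/* = 200 / 8 = 25 u64s */

	bch2_journal_entry_res_resize(&c->journal, &c->dev_usage_journal_res,
				      u64s_per_dev * nr_devs);	/* nr_devs: online member devices */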
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/journal.c | 7 ------- fs/bcachefs/replicas.c | 5 +++-- fs/bcachefs/super.c | 15 +++++++++++---- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0c24a5312e49..cec5c3ddce34 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -582,8 +582,9 @@ struct bch_fs { struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; - + struct journal_entry_res clock_journal_res; struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 379b9ad2c0f9..7c805dd74180 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1119,13 +1119,6 @@ int bch2_fs_journal_init(struct journal *j) j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - /* Btree roots: */ - j->entry_u64s_reserved += - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); - - j->entry_u64s_reserved += - 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index e5e437deb9ab..bf1804c10bfb 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1074,8 +1074,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) int bch2_fs_replicas_init(struct bch_fs *c) { - c->journal.entry_u64s_reserved += - reserve_journal_replicas(c, &c->replicas); + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); return replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index eecabeb08c94..e242b7215548 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -159,8 +159,9 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) { struct bch_dev *ca; unsigned i, nr = 0, u64s = - (sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR); + ((sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / + sizeof(u64); rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) @@ -796,14 +797,20 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_fsio_init(c)) goto err; - bch2_dev_usage_journal_reserve(c); - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && bch2_dev_alloc(c, i)) goto err; + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); + bch2_dev_usage_journal_reserve(c); + bch2_journal_entry_res_resize(&c->journal, + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + mutex_lock(&bch_fs_list_lock); err = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); -- cgit From 5d428c7c6445fc483f77eef82d17a744eeed73be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Feb 2021 15:31:17 -0500 Subject: bcachefs: Run fsck if BCH_FEATURE_alloc_v2 isn't set We're using BCH_FEATURE_alloc_v2 to also gate journalling updates to dev usage - we don't have the code for reconstructing this from buckets anymore, so we need to run fsck 
if it's not set. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7ba098adcab9..8560023b4c7a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1088,6 +1088,13 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; + } + if (!c->replicas.entries || c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); -- cgit From fcb3431be837d06ae9af6eedde6a6509881664de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Feb 2021 23:17:26 -0500 Subject: bcachefs: Redo checks for sufficient devices When the replicas mechanism was added, for tracking data by which drives it's replicated on, the check for whether we have sufficient devices was never updated to make use of it. This patch finally does that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 3 ++ fs/bcachefs/opts.h | 5 +++ fs/bcachefs/replicas.c | 98 ++++++++++++-------------------------------- fs/bcachefs/replicas.h | 16 ++------ fs/bcachefs/super-io.c | 7 ++-- fs/bcachefs/super.c | 23 +++++------ fs/bcachefs/sysfs.c | 9 ---- 7 files changed, 51 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 1bf834e31775..38c6ac96e12f 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -14,6 +14,9 @@ #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) +#define BCH_FORCE_IF_LOST \ + (BCH_FORCE_IF_DATA_LOST| \ + BCH_FORCE_IF_METADATA_LOST) #define BCH_FORCE_IF_DEGRADED \ (BCH_FORCE_IF_DATA_DEGRADED| \ BCH_FORCE_IF_METADATA_DEGRADED) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 01b93e7eb027..01282314bacb 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -222,6 +222,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bf1804c10bfb..8003973b0400 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -967,94 +967,48 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { /* Query replicas: */ -struct replicas_status __bch2_replicas_status(struct bch_fs *c, - struct bch_devs_mask online_devs) +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) { - struct bch_sb_field_members *mi; struct bch_replicas_entry *e; - unsigned i, nr_online, nr_offline; - struct replicas_status ret; - - memset(&ret, 0, sizeof(ret)); - - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].redundancy = INT_MAX; - - mi = bch2_sb_get_members(c->disk_sb.sb); + bool ret = true; percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - if (e->data_type >= ARRAY_SIZE(ret.replicas)) - panic("e %p data_type %u\n", e, e->data_type); + unsigned i, 
nr_online = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; - nr_online = nr_offline = 0; + for (i = 0; i < e->nr_devs; i++) + nr_online += test_bit(e->devs[i], devs.d); - for (i = 0; i < e->nr_devs; i++) { - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, - e->devs[i])); + if (nr_online < e->nr_required) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_LOST + : BCH_FORCE_IF_DATA_LOST; - if (test_bit(e->devs[i], online_devs.d)) - nr_online++; - else - nr_offline++; - } + if (nr_online < e->nr_devs) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_DEGRADED + : BCH_FORCE_IF_DATA_DEGRADED; - ret.replicas[e->data_type].redundancy = - min(ret.replicas[e->data_type].redundancy, - (int) nr_online - (int) e->nr_required); + if (dflags & ~flags) { + if (print) { + char buf[100]; - ret.replicas[e->data_type].nr_offline = - max(ret.replicas[e->data_type].nr_offline, - nr_offline); - } + bch2_replicas_entry_to_text(&PBUF(buf), e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf); + } + ret = false; + break; + } + } percpu_up_read(&c->mark_lock); - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - if (ret.replicas[i].redundancy == INT_MAX) - ret.replicas[i].redundancy = 0; - return ret; } -struct replicas_status bch2_replicas_status(struct bch_fs *c) -{ - return __bch2_replicas_status(c, bch2_online_devs(c)); -} - -static bool have_enough_devs(struct replicas_status s, - enum bch_data_type type, - bool force_if_degraded, - bool force_if_lost) -{ - return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].redundancy >= 0 || force_if_lost); -} - -bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -{ - return (have_enough_devs(s, BCH_DATA_journal, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_btree, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_user, - flags & BCH_FORCE_IF_DATA_DEGRADED, - flags & BCH_FORCE_IF_DATA_LOST)); -} - -int bch2_replicas_online(struct bch_fs *c, bool meta) -{ - struct replicas_status s = bch2_replicas_status(c); - - return (meta - ? 
min(s.replicas[BCH_DATA_journal].redundancy, - s.replicas[BCH_DATA_btree].redundancy) - : s.replicas[BCH_DATA_user].redundancy) + 1; -} - unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { struct bch_replicas_entry *e; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index a16ef23bde8a..9c8fd3d98247 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -39,19 +39,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, e->devs[0] = dev; } -struct replicas_status { - struct { - int redundancy; - unsigned nr_offline; - } replicas[BCH_DATA_NR]; -}; - -struct replicas_status __bch2_replicas_status(struct bch_fs *, - struct bch_devs_mask); -struct replicas_status bch2_replicas_status(struct bch_fs *); -bool bch2_have_enough_devs(struct replicas_status, unsigned); - -int bch2_replicas_online(struct bch_fs *, bool); +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); + unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 0356541c00e2..767baab18807 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -770,15 +770,13 @@ int bch2_write_super(struct bch_fs *c) nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); /* * If we would be able to mount _without_ the devices we successfully @@ -789,6 +787,7 @@ int bch2_write_super(struct bch_fs *c) * mount with the devices we did successfully write to: */ if (bch2_fs_fatal_err_on(!nr_wrote || + !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, "Unable to write superblock to sufficient devices")) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e242b7215548..e10e7e0c0454 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1265,7 +1265,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct replicas_status s; struct bch_dev *ca2; int i, nr_rw = 0, required; @@ -1301,9 +1300,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, new_online_devs = bch2_online_devs(c); __clear_bit(ca->dev_idx, new_online_devs.d); - s = __bch2_replicas_status(c, new_online_devs); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, new_online_devs, flags, false); default: BUG(); } @@ -1311,14 +1308,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { - struct replicas_status s; struct bch_sb_field_members *mi; struct bch_dev *ca; - unsigned i, flags = c->opts.degraded - ? 
BCH_FORCE_IF_DEGRADED - : 0; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - if (!c->opts.degraded) { + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; + + if (!c->opts.degraded && + !c->opts.very_degraded) { mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1338,9 +1339,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_unlock(&c->sb_lock); } - s = bch2_replicas_status(c); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); } static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8fdbeaf9df32..49c19873ad6f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -199,9 +199,6 @@ read_attribute(new_stripes); rw_attribute(pd_controllers_update_seconds); -read_attribute(meta_replicas_have); -read_attribute(data_replicas_have); - read_attribute(io_timers_read); read_attribute(io_timers_write); @@ -347,9 +344,6 @@ SHOW(bch2_fs) sysfs_print(promote_whole_extents, c->promote_whole_extents); - sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); - sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); - /* Debugging: */ if (attr == &sysfs_alloc_debug) @@ -520,9 +514,6 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_node_size, &sysfs_btree_cache_size, - &sysfs_meta_replicas_have, - &sysfs_data_replicas_have, - &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, -- cgit From 6a16ad951ae4cb01dbfe73e8aa63045ee07e4581 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Feb 2021 19:54:04 -0500 Subject: bcachefs: Add flushed_seq_ondisk to journal_debug_to_text() Also, make the wait in bch2_journal_flush_seq() interruptible, not just killable. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7c805dd74180..b33d985fa020 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -639,9 +639,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; - ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); - bch2_time_stats_update(j->flush_seq_time, start_time); + if (!ret) + bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? ret2 : 0; } @@ -1160,6 +1161,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" + "flushed_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" "nr flush writes:\t%llu\n" "nr noflush writes:\t%llu\n" @@ -1172,6 +1174,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) journal_cur_seq(j), journal_last_seq(j), j->last_seq_ondisk, + j->flushed_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, j->nr_flush_writes, -- cgit From eaf798317aa8fe3c8417ae1414c0b0bc58748881 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Feb 2021 19:54:40 -0500 Subject: bcachefs: Fix for hash_redo_key() in fsck It's possible we're calling hash_redo_key() because of a duplicate key - easiest fix for that is to just not use BCH_HASH_SET_MUST_CREATE. 
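For context, a paraphrase of the relevant str_hash.h behaviour (not a verbatim
quote): with BCH_HASH_SET_MUST_CREATE, bch2_hash_set() refuses to insert when
a matching key already exists - exactly the duplicate-key case fsck can hit
here - while a flags value of 0 allows the key to be (re)inserted either way:

	/* roughly what the flag does inside bch2_hash_set(): */
	if (found && (flags & BCH_HASH_SET_MUST_CREATE))
		return -EEXIST;	/* would make hash_redo_key() fail on duplicates */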
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index df0f00f10bd7..c3e6137ffd75 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, bch2_trans_update(trans, k_iter, &delete, 0); return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE); + tmp, 0); } static int fsck_hash_delete_at(struct btree_trans *trans, -- cgit From 2b2c1a89cef6d19fc5a0995e550a67b1701d8bed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Feb 2021 18:52:13 -0500 Subject: bcachefs: Simplify btree_iter_(next|prev)_leaf() There's no good reason for these functions to not be using bch2_btree_iter_set_pos(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 196f346f0544..431b1e8bebc7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1554,38 +1554,29 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - bool ret; - - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->key.k.p; + struct bpos next_pos = iter->l[0].b->key.k.p; + bool ret = bkey_cmp(next_pos, POS_MAX) != 0; - ret = bkey_cmp(iter->pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->k.p = iter->pos = bkey_successor(iter->pos); + next_pos = bkey_successor(next_pos); - btree_iter_pos_changed(iter, 1); + bch2_btree_iter_set_pos(iter, next_pos); return ret; } static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - bool ret; - - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->data->min_key; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + struct bpos next_pos = iter->l[0].b->data->min_key; + bool ret = bkey_cmp(next_pos, POS_MIN) != 0; - ret = bkey_cmp(iter->pos, POS_MIN) != 0; if (ret) { - iter->k.p = iter->pos = bkey_predecessor(iter->pos); + next_pos = bkey_predecessor(next_pos); if (iter->flags & BTREE_ITER_IS_EXTENTS) - iter->k.p = iter->pos = bkey_predecessor(iter->pos); + next_pos = bkey_predecessor(next_pos); } - btree_iter_pos_changed(iter, -1); + bch2_btree_iter_set_pos(iter, next_pos); return ret; } -- cgit From 792e2c4c850c75857e822fa40660a3b4733d5b8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Feb 2021 20:16:21 -0500 Subject: bcachefs: Kill bch2_btree_iter_set_pos_same_leaf() The only reason we were keeping this around was for BTREE_INSERT_NOUNLOCK semantics - if bch2_btree_iter_set_pos() advances to the next leaf node, it'll drop the lock on the node that we just inserted to. But we don't rely on BTREE_INSERT_NOUNLOCK semantics for the extents btree, just the inodes btree, and if we do need it for the extents btree in the future we can do it more cleanly by cloning the iterator - this lets us delete some special cases in the btree iterator code, which is complicated enough as it is. 
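For illustration, a hypothetical sketch of that "clone the iterator" approach,
should extents ever need NOUNLOCK semantics again (this is not code from the
patch, just the shape of the idea using helpers that already exist):

	/*
	 * Keep the iterator that did the insert - and its node locks -
	 * where it is, and advance a copy instead of the original:
	 */
	struct btree_iter *copy = bch2_trans_copy_iter(trans, iter);

	bch2_btree_iter_set_pos(copy, next_pos);
	/* iter still points at the leaf we just inserted into */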
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 37 +------------------------------------ fs/bcachefs/btree_iter.h | 1 - fs/bcachefs/btree_update_leaf.c | 8 ++------ 3 files changed, 3 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 431b1e8bebc7..e4fb1a0451a4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -516,12 +516,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_btree_node_relock(iter, level)) return; - /* - * Ideally this invariant would always be true, and hopefully in the - * future it will be, but for now set_pos_same_leaf() breaks it: - */ - BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && - !btree_iter_pos_in_node(iter, l->b)); + BUG_ON(!btree_iter_pos_in_node(iter, l->b)); /* * node iterators don't use leaf node iterator: @@ -1457,36 +1452,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* Iterate across keys (in leaf nodes only) */ -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level != 0); - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); - EBUG_ON(!btree_node_locked(iter, 0)); - EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - btree_iter_advance_to_pos(iter, l, -1); - - /* - * XXX: - * keeping a node locked that's outside (even just outside) iter->pos - * breaks __bch2_btree_node_lock(). This seems to only affect - * bch2_btree_node_get_sibling so for now it's fixed there, but we - * should try to get rid of this corner case. - * - * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) - */ - - if (bch2_btree_node_iter_end(&l->iter) && - btree_iter_pos_after_node(iter, l->b)) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -} - static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) { unsigned l = iter->level; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9a7f8d0197ec..12c519ae2a60 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -174,7 +174,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f64e7d37bbbf..022ab3d90871 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -951,12 +951,8 @@ retry: trans_for_each_iter(trans, iter) if ((trans->iters_live & (1ULL << iter->idx)) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { - if (trans->flags & BTREE_INSERT_NOUNLOCK) - bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); - else - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); - } + (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -- cgit From 434094bec03e43472c96f8c8acd8e94820d2a7e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Feb 2021 21:28:58 -0500 Subject: bcachefs: bch2_btree_iter_advance_pos() This adds a new common 
helper for advancing past the last key returned by peek(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e4fb1a0451a4..294c591b5047 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1517,6 +1517,18 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) btree_iter_pos_changed(iter, cmp); } +static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) +{ + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return false; + + bch2_btree_iter_set_pos(iter, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); + return true; +} + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->key.k.p; @@ -1623,14 +1635,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek(iter); } @@ -1682,10 +1689,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) k = __bch2_btree_iter_peek_with_updates(iter); if (k.k && bkey_deleted(k.k)) { - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); + bch2_btree_iter_advance_pos(iter); continue; } @@ -1700,8 +1704,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) * iter->pos should always be equal to the key we just * returned - except extents can straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; @@ -1710,14 +1713,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek_with_updates(iter); } @@ -1882,14 +1880,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek_slot(iter); } -- cgit From 3d4955952f05d5d0583bbb1fe4ce56c022f97847 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Feb 2021 21:11:49 -0500 Subject: bcachefs: Fix bch2_btree_iter_peek_prev() This makes bch2_btree_iter_peek_prev() and bch2_btree_iter_prev() consistent with peek() and next(), w.r.t. iter->pos. 
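A short worked illustration of the convention (extent keys are addressed by
their end position, k.k->p; the numbers are made up):

	peek_prev() searching backwards from pos 10:

	  extent [8,16):  start 8 <= 10, so it is returned; it straddles the
	                  search pos, so iter->pos stays at 10
	  extent [2,6):   k.k->p == 6 < 10, so iter->pos is clamped down to 6

	In both cases iter->pos now refers to the key that was just returned,
	mirroring what peek()/next() do going forwards.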
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 48 +++++++++++++++++++++++++++++++----------------- fs/bcachefs/fs-io.c | 10 ++++------ 2 files changed, 35 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 294c591b5047..4012a2c0f008 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1519,13 +1519,27 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + struct bpos pos = iter->k.p; + + if (unlikely(!bkey_cmp(pos, POS_MAX))) + return false; + + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(pos); + bch2_btree_iter_set_pos(iter, pos); + return true; +} + +static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) +{ + struct bpos pos = bkey_start_pos(&iter->k); + + if (unlikely(!bkey_cmp(pos, POS_MIN))) return false; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(pos); + bch2_btree_iter_set_pos(iter, pos); return true; } @@ -1619,8 +1633,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) * iter->pos should always be equal to the key we just * returned - except extents can straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; @@ -1743,7 +1756,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); - if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_cmp(bkey_start_pos(k.k), pos) >= 0 + : bkey_cmp(bkey_start_pos(k.k), pos) > 0)) k = __btree_iter_prev(iter, l); if (likely(k.k)) @@ -1754,8 +1770,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); - iter->pos = bkey_start_pos(k.k); + + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, pos) < 0) + iter->pos = k.k->p; iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1765,16 +1786,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct bpos pos = bkey_start_pos(&iter->k); - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); - - if (unlikely(!bkey_cmp(pos, POS_MIN))) + if (!bch2_btree_iter_rewind_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - return bch2_btree_iter_peek_prev(iter); } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 79f1f0f37e18..80ef9d6df287 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2454,7 +2454,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst; + struct btree_iter *src, *dst, *del; loff_t shift, new_size; u64 src_start; int ret; @@ -2524,6 +2524,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); dst = bch2_trans_copy_iter(&trans, src); + del = bch2_trans_copy_iter(&trans, src); while (1) { struct disk_reservation disk_res = @@ -2544,8 +2545,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if (!k.k || k.k->p.inode != inode->v.i_ino) break; - BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); - if (insert && bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; @@ -2577,6 +2576,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; + bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; @@ -2597,9 +2597,7 @@ reassemble: BUG_ON(ret); } - bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); - - ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, -- cgit From 5ea037d03cabc219f5b2ccd72b7a33fa036c9bfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Feb 2021 13:39:48 -0500 Subject: bcachefs: Assert that we're not trying to flush journal seq in the future Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b33d985fa020..c2b1eef6265a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -574,6 +574,8 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { ret = -EIO; -- cgit From 9d4032617605144717892f0763b617568bd15ac3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Feb 2021 14:49:36 -0500 Subject: bcachefs: Fix a shift greater than type size Found by UBSAN Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b11aecf2cfab..3462e248c954 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -479,7 +479,7 @@ int bch2_inode_create(struct btree_trans *trans, u64 min, max, start, *hint; int ret; - unsigned cpu = raw_smp_processor_id(); + u64 cpu = raw_smp_processor_id(); unsigned bits = (c->opts.inodes_32bit ? 
31 : 63) - c->inode_shard_bits; -- cgit From 2bb748a69596e883cf9ea28321d43f8c6a225cef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Feb 2021 20:53:29 -0500 Subject: bcachefs: Fsck fixes Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c3e6137ffd75..b2d9d55b1951 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1072,6 +1072,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links, if (inum < range_start || inum >= *range_end) return; + if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { + *range_end = inum; + return; + } + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { bch_verbose(c, "allocation failed during fsck - will need another pass"); @@ -1346,23 +1351,25 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret2 = bkey_err(k))) { + !(ret2 = bkey_err(k)) && + iter->pos.offset < range_end) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (!link && (!k.k || iter->pos.offset >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter->pos.offset > nlinks_pos) { + + if (link && nlinks_pos < iter->pos.offset) { /* Should have been caught by dirents pass: */ - need_fsck_err_on(link && link->count, c, + need_fsck_err_on(link->count, c, "missing inode %llu (nlink %u)", nlinks_pos, link->count); genradix_iter_advance(&nlinks_iter, links); goto peek_nlinks; } - if (iter->pos.offset < nlinks_pos || !link) + if (!link || nlinks_pos > iter->pos.offset) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { -- cgit From 0ef837a0cc87d49d9f7d29bdef5a57f07ecc84d3 Mon Sep 17 00:00:00 2001 From: Robbie Litchfield Date: Wed, 10 Feb 2021 13:18:13 +1300 Subject: bcachefs: Fix unnecessary read amplificaiton when allocating ec stripes When allocating an erasure coding stripe, bcachefs will always reuse any partial stripes before reserving a new stripe. This causes unnecessary read amplification when preparing a stripe for writing. This patch changes bcachefs to always reserve new stripes first, only relying on stripe reuse when copygc needs more time to empty buckets from existing stripes. 
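
In code terms the reordering is small; the following is a simplified sketch only. stripe_head_pick_source() is a hypothetical name used for illustration, the __bch2_ec_stripe_head_reserve()/__bch2_ec_stripe_head_reuse() helpers are the ones this patch factors out, and error handling plus the needs_stripe_new bookkeeping are trimmed:

  /* Simplified sketch: decide where the next stripe's space comes from */
  static int stripe_head_pick_source(struct bch_fs *c, struct ec_stripe_head *h)
  {
          int ret = 0;

          /*
           * Prefer a fresh reservation: a new stripe's blocks start out
           * empty, so nothing has to be read back before writing to it.
           */
          if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
                  ret = __bch2_ec_stripe_head_reserve(c, h);

          /*
           * Only if the reservation fails - i.e. copygc hasn't emptied
           * buckets out of old stripes yet - fall back to reusing a
           * partial stripe, which is the path that costs extra reads.
           */
          if (ret)
                  ret = __bch2_ec_stripe_head_reuse(c, h);

          return ret;
  }
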
Signed-off-by: Robbie Litchfield Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 155 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 92 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a32d399e5b6f..a70b859363f0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1389,6 +1389,72 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } +static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, + struct ec_stripe_head *h) +{ + unsigned i; + s64 idx; + int ret; + + idx = get_existing_stripe(c, h); + if (idx < 0) { + bch_err(c, "failed to find an existing stripe"); + return -ENOSPC; + } + + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); + if (ret) { + bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + return ret; + } + + if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { + /* + * this is a problem: we have deleted from the + * stripes heap already + */ + BUG(); + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } + + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key.k_i, + &h->s->existing_stripe.key.k_i); + + return 0; +} + +static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, + struct ec_stripe_head *h) +{ + int ret; + + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + + if (ret) { + /* + * This means we need to wait for copygc to + * empty out buckets from existing stripes: + */ + bch_err(c, "failed to reserve stripe"); + } + + return ret; +} + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, @@ -1397,9 +1463,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct closure *cl) { struct ec_stripe_head *h; - unsigned i; - s64 idx; int ret; + bool needs_stripe_new; h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); if (!h) { @@ -1407,80 +1472,44 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, return NULL; } - if (!h->s) { + needs_stripe_new = !h->s; + if (needs_stripe_new) { if (ec_new_stripe_alloc(c, h)) { - bch2_ec_stripe_head_put(c, h); + ret = -ENOMEM; bch_err(c, "failed to allocate new stripe"); - return NULL; - } - - idx = get_existing_stripe(c, h); - if (idx >= 0) { - h->s->have_existing_stripe = true; - ret = get_stripe_key(c, idx, &h->s->existing_stripe); - if (ret) { - bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); - bch2_ec_stripe_head_put(c, h); - return NULL; - } - - if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { - /* - * this is a problem: we have deleted from the - * stripes heap already - */ - BUG(); - } - - BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); - - for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { - if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); - } - - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); - } - - bkey_copy(&h->s->new_stripe.key.k_i, - &h->s->existing_stripe.key.k_i); + 
goto err; } - if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) { + if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) BUG(); - } } - if (!h->s->allocated) { - if (!h->s->have_existing_stripe && - !h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); - if (ret) { - /* - * This means we need to wait for copygc to - * empty out buckets from existing stripes: - */ - bch2_ec_stripe_head_put(c, h); - h = NULL; - goto out; - } - } + /* + * Try reserve a new stripe before reusing an + * existing stripe. This will prevent unnecessary + * read amplification during write oriented workloads. + */ + ret = 0; + if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) + ret = __bch2_ec_stripe_head_reserve(c, h); + if (ret && needs_stripe_new) + ret = __bch2_ec_stripe_head_reuse(c, h); + if (ret) + goto err; + if (!h->s->allocated) { ret = new_stripe_alloc_buckets(c, h, cl); - if (ret) { - bch2_ec_stripe_head_put(c, h); - h = ERR_PTR(-ret); - goto out; - } + if (ret) + goto err; h->s->allocated = true; } -out: + return h; + +err: + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(-ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -- cgit From 0507962f634bc3bada77bce9f3cd839e48aa5fb0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Feb 2021 13:37:22 -0500 Subject: bcachefs: Drop invalid stripe ptrs in fsck More repair code, now that we can repair extents during initial gc. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 67 ++++++++++++++++++++++++++++++++++---------------- fs/bcachefs/extents.c | 9 +++++++ fs/bcachefs/extents.h | 1 + 3 files changed, 56 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d44b9c079fde..f687cc3bfa94 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -163,22 +163,23 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bkey_s_c *k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; bool do_update = false; int ret = 0; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, ptr, false); + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); if (fsck_err_on(!g->gen_valid, c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k->k, ptr)], - ptr->gen)) { - if (!ptr->cached) { - g2->_mark.gen = g->_mark.gen = ptr->gen; + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen)) { + if (!p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { @@ -186,13 +187,13 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } } - if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k->k, ptr)], - ptr->gen, g->mark.gen)) { - 
if (!ptr->cached) { - g2->_mark.gen = g->_mark.gen = ptr->gen; + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) { + if (!p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; g2->gen_valid = g->gen_valid = true; g2->_mark.data_type = 0; g2->_mark.dirty_sectors = 0; @@ -204,16 +205,27 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } } - if (fsck_err_on(!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0, c, + if (fsck_err_on(!p.ptr.cached && + gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k->k, ptr)], - ptr->gen, g->mark.gen)) + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) do_update = true; + + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx)) + do_update = true; + } } if (do_update) { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; struct bch_extent_ptr *ptr; struct bkey_i *new; @@ -237,6 +249,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (!ptr->cached && gen_cmp(ptr->gen, g->mark.gen) < 0); })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); + + if (!m || !m->alive) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } ret = bch2_journal_key_insert(c, btree_id, level, new); if (ret) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 50ab240d89a8..4007af4a780b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -777,6 +777,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, } } +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} + void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ebe0a04c7850..3988315fc404 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -546,6 +546,7 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); +void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -- cgit From 9f631dc14325df8acfa73c76299dbefb68582ee4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Mar 2021 19:37:40 -0500 Subject: bcachefs: Ensure btree iterators are traversed in bch2_trans_commit() The upcoming patch to allow extents to span btree nodes will require this... and this assertion seems to be popping, and it's not a very good assertion anyways. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 022ab3d90871..2f94b8917a76 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -929,9 +929,14 @@ int __bch2_trans_commit(struct btree_trans *trans) } trans_for_each_update2(trans, i) { - BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(i->iter->locks_want < 1); + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip); + goto out; + } + u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) -- cgit From 7e1a3aa9dfcb9cd8f46085df86f158a1f23085dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Feb 2021 21:57:32 -0500 Subject: bcachefs: iter->real_pos We need to differentiate between the search position of a btree iterator, vs. what it actually points at (what we found). This matters for extents, where iter->pos will typically be the start of the key we found and iter->real_pos will be the end of the key we found (which soon won't necessarily be in the same btree node!) and it will also matter for snapshots. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 208 ++++++++++++++++++++++------------------ fs/bcachefs/btree_types.h | 2 + fs/bcachefs/btree_update_leaf.c | 2 +- 3 files changed, 120 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4012a2c0f008..531732e30950 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -34,13 +34,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; + return bkey_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; + return bkey_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -491,7 +491,6 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) static void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; struct btree_node_iter tmp = l->iter; bool locked = btree_node_locked(iter, level); @@ -539,12 +538,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { msg = "after"; goto err; } @@ -567,12 +566,11 @@ err: } panic("iterator should be %s key at level %u:\n" - "iter pos %s %llu:%llu\n" + "iter pos %llu:%llu\n" "prev key %s\n" "cur key %s\n", msg, level, - iter->flags & BTREE_ITER_IS_EXTENTS ? 
">" : "=>", - iter->pos.inode, iter->pos.offset, + iter->real_pos.inode, iter->real_pos.offset, buf1, buf2); } @@ -580,12 +578,24 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { unsigned i; - bch2_btree_trans_verify_locks(iter->trans); + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) bch2_btree_iter_verify_level(iter, i); } +static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + enum btree_iter_type type = btree_iter_type(iter); + + BUG_ON((type == BTREE_ITER_KEYS || + type == BTREE_ITER_CACHED) && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); +} + void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; @@ -601,6 +611,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} #endif @@ -626,12 +637,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, struct bkey_packed *where) { struct btree_iter_level *l = &iter->l[b->c.level]; - struct bpos pos = btree_iter_search_key(iter); if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); @@ -666,7 +676,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bool iter_current_key_modified = orig_iter_pos >= offset && orig_iter_pos <= offset + clobber_u64s; - struct bpos iter_pos = btree_iter_search_key(iter); btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -674,7 +683,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -689,7 +698,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -825,12 +834,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) { - struct bpos pos = btree_iter_search_key(iter); struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -893,10 +901,9 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &pos); + bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); /* * Iterators to interior nodes should always be pointed at the first non @@ -1342,21 +1349,6 @@ int __must_check __bch2_btree_iter_traverse(struct 
btree_iter *iter) return ret; } -static inline void bch2_btree_iter_checks(struct btree_iter *iter) -{ - enum btree_iter_type type = btree_iter_type(iter); - - EBUG_ON(iter->btree_id >= BTREE_ID_NR); - - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); - - bch2_btree_iter_verify_locks(iter); - bch2_btree_iter_verify_level(iter, iter->level); -} - /* Iterate across nodes (leaf and interior nodes) */ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) @@ -1365,7 +1357,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); if (iter->uptodate == BTREE_ITER_UPTODATE) return iter->l[iter->level].b; @@ -1380,7 +1372,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; + iter->pos = iter->real_pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1394,7 +1386,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); /* already got to end? */ if (!btree_iter_node(iter, iter->level)) @@ -1431,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (btree_node_read_locked(iter, iter->level)) btree_node_unlock(iter, iter->level); - iter->pos = bkey_successor(iter->pos); + iter->pos = iter->real_pos = bkey_successor(iter->pos); iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -1442,7 +1434,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = iter->l[iter->level].b; } - iter->pos = b->key.k.p; + iter->pos = iter->real_pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1490,57 +1482,55 @@ out: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, - bool strictly_greater) +static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { - struct bpos old = btree_iter_search_key(iter); - int cmp; + int cmp = bkey_cmp(new_pos, iter->real_pos); - iter->flags &= ~BTREE_ITER_IS_EXTENTS; - iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; + iter->real_pos = new_pos; + + btree_iter_pos_changed(iter, cmp); + bch2_btree_iter_verify(iter); +} + +void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, + bool strictly_greater) +{ bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; - cmp = bkey_cmp(btree_iter_search_key(iter), old); + iter->flags &= ~BTREE_ITER_IS_EXTENTS; + iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; - btree_iter_pos_changed(iter, cmp); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - int cmp = bkey_cmp(new_pos, iter->pos); - - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - - btree_iter_pos_changed(iter, cmp); + __bch2_btree_iter_set_pos(iter, new_pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) != 0); } static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) { struct bpos pos = iter->k.p; + bool ret = bkey_cmp(pos, POS_MAX) != 0; - if (unlikely(!bkey_cmp(pos, POS_MAX))) - return false; - - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(pos); bch2_btree_iter_set_pos(iter, pos); - return true; + return ret; } static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); + bool ret = bkey_cmp(pos, POS_MIN) != 0; - if (unlikely(!bkey_cmp(pos, POS_MIN))) - return false; - - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(pos); bch2_btree_iter_set_pos(iter, pos); - return true; + return ret; } static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) @@ -1548,10 +1538,16 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) struct bpos next_pos = iter->l[0].b->key.k.p; bool ret = bkey_cmp(next_pos, POS_MAX) != 0; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - next_pos = bkey_successor(next_pos); + /* + * Typically, we don't want to modify iter->pos here, since that + * indicates where we searched from - unless we got to the end of the + * btree, in that case we want iter->pos to reflect that: + */ + if (ret) + btree_iter_set_search_pos(iter, bkey_successor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MAX); - bch2_btree_iter_set_pos(iter, next_pos); return ret; } @@ -1560,14 +1556,11 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) struct bpos next_pos = iter->l[0].b->data->min_key; bool ret = bkey_cmp(next_pos, POS_MIN) != 0; - if (ret) { - next_pos = bkey_predecessor(next_pos); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - next_pos = bkey_predecessor(next_pos); - } + if (ret) + btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MIN); - bch2_btree_iter_set_pos(iter, next_pos); return ret; } @@ -1610,7 +1603,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1636,9 +1632,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + iter->real_pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } @@ -1692,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1714,8 +1713,8 
@@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) } /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: + * iter->pos should be mononotically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: */ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); @@ -1744,7 +1743,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, iter->pos); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1752,8 +1754,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) while (1) { ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto no_key; + } k = __btree_iter_peek(iter, l); if (!k.k || @@ -1765,8 +1769,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (likely(k.k)) break; - if (!btree_iter_set_pos_to_prev_leaf(iter)) - return bkey_s_c_null; + if (!btree_iter_set_pos_to_prev_leaf(iter)) { + k = bkey_s_c_null; + goto no_key; + } } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); @@ -1774,10 +1780,23 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, pos) < 0) iter->pos = k.k->p; - iter->uptodate = BTREE_ITER_UPTODATE; - - bch2_btree_iter_verify_level(iter, 0); + iter->real_pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; +no_key: + /* + * __btree_iter_peek() may have set iter->k to a key we didn't want, and + * then we errored going to the previous leaf - make sure it's + * consistent with iter->pos: + */ + BUG_ON(bkey_cmp(pos, iter->pos) && + bkey_cmp(iter->pos, POS_MIN)); + bkey_init(&iter->k); + iter->k.p = iter->pos; + goto out; } /** @@ -1830,7 +1849,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) */ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify(iter); return k; } @@ -1853,7 +1872,9 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1864,7 +1885,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); @@ -1888,7 +1912,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } @@ -1906,7 +1931,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); - bch2_btree_iter_checks(iter); + 
bch2_btree_iter_verify(iter); ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ -1937,6 +1962,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, bkey_init(&iter->k); iter->k.p = pos; iter->flags = flags; + iter->real_pos = btree_iter_search_key(iter); iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; @@ -2076,7 +2102,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (best && bkey_cmp(bpos_diff(best->pos, pos), - bpos_diff(iter->pos, pos)) < 0) + bpos_diff(iter->real_pos, pos)) < 0) continue; best = iter; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e51e3c7868de..80bb31a53339 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -247,6 +247,8 @@ enum btree_iter_uptodate { struct btree_iter { struct btree_trans *trans; struct bpos pos; + /* what we're searching for/what the iterator actually points to: */ + struct bpos real_pos; struct bpos pos_after_commit; u16 flags; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2f94b8917a76..5e0ce7cde017 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -219,7 +219,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs *c = trans->c; - BUG_ON(bkey_cmp(insert->k.p, iter->pos)); + BUG_ON(bkey_cmp(insert->k.p, iter->real_pos)); BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(iter->level, iter->btree_id))); -- cgit From 8042b5b715e6722fb26e40724b87f93b4b777acf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Feb 2021 16:13:57 -0500 Subject: bcachefs: Extents may now cross btree node boundaries When snapshots arrive, we won't necessarily be able to arbitrarily split existis - when we need to split an existing extent, we'll have to check if the extent was overwritten in child snapshots and if so emit a whiteout for the split in the child snapshot. Because extents couldn't span btree nodes previously, journal replay would sometimes have to split existing extents. That's no good anymore, but fortunately since extent handling has already been lifted above most of the btree code there's no real need for that rule anymore. 
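
One practical consequence shows up in the extents peek_slot path: a synthesized hole no longer has to stop at the containing btree node's max_key. Roughly, the new hole sizing looks like the trimmed excerpt below (taken from the __bch2_btree_iter_peek_slot_extents() hunk in this patch; next_start stands for the start of the next extent found by peek, or POS_MAX if there is none):

  struct bpos next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;

  bkey_init(&iter->k);
  iter->k.p = iter->pos;
  bch2_key_resize(&iter->k,
                  min_t(u64, KEY_SIZE_MAX,
                        (next_start.inode == iter->pos.inode
                         ? next_start.offset      /* hole runs up to the next extent */
                         : KEY_OFFSET_MAX)        /* or to the end of the inode */
                        - iter->pos.offset));
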
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 +++-- fs/bcachefs/btree_iter.c | 58 +++++++-------------- fs/bcachefs/btree_update_leaf.c | 39 ++++++++------ fs/bcachefs/buckets.c | 67 +++++++++--------------- fs/bcachefs/extent_update.c | 29 +++-------- fs/bcachefs/recovery.c | 113 +--------------------------------------- fs/bcachefs/super-io.c | 4 +- 7 files changed, 87 insertions(+), 237 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9048441cfa55..cf092903a6ab 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1346,13 +1346,19 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); x(reflink_inline_data, 14) \ x(new_varint, 15) \ x(journal_no_flush, 16) \ - x(alloc_v2, 17) + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) + +#define BCH_SB_FEATURES_ALWAYS \ + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ - ((1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (BCH_SB_FEATURES_ALWAYS| \ + (1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ (1ULL << BCH_FEATURE_new_varint)| \ (1ULL << BCH_FEATURE_journal_no_flush)| \ (1ULL << BCH_FEATURE_alloc_v2)) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 531732e30950..660e9e827ed4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1814,11 +1814,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; struct bkey_s_c k; - struct bkey n; - int ret; + struct bpos pos, next_start; /* keys & holes can't span inode numbers: */ if (iter->pos.offset == KEY_OFFSET_MAX) { @@ -1826,50 +1823,31 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); } - /* - * iterator is now at the correct position for inserting at iter->pos, - * but we need to keep iterating until we find the first non whiteout so - * we know how big a hole we have, if any: - */ - - node_iter = l->iter; - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); - - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - /* - * We're not setting iter->uptodate because the node iterator - * doesn't necessarily point at the key we're returning: - */ + pos = iter->pos; + k = bch2_btree_iter_peek(iter); + iter->pos = pos; - EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - bch2_btree_iter_verify(iter); + if (bkey_err(k)) return k; - } - /* hole */ + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) + return k; - if (!k.k) - k.k = &l->b->key.k; + next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; - bkey_init(&n); - n.p = iter->pos; - bch2_key_resize(&n, + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) + (next_start.inode == iter->pos.inode + ? 
next_start.offset : KEY_OFFSET_MAX) - - n.p.offset)); + iter->pos.offset)); - EBUG_ON(!n.size); + EBUG_ON(!iter->k.size); - iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify_entry_exit(iter); @@ -1893,13 +1871,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return __bch2_btree_iter_peek_slot_extents(iter); - k = __btree_iter_peek_all(iter, l, &iter->k); EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5e0ce7cde017..d99a78f8950d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > @@ -705,26 +702,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l, bkey_cmp(l->pos, r->pos); } -static void bch2_trans_update2(struct btree_trans *trans, +static int bch2_trans_update2(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { struct btree_insert_entry *i, n = (struct btree_insert_entry) { .iter = iter, .k = insert }; + int ret; btree_insert_entry_checks(trans, n.iter, n.k); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return ret; + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans_for_each_update2(trans, i) { if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { *i = n; - return; + return 0; } if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) @@ -733,6 +735,7 @@ static void bch2_trans_update2(struct btree_trans *trans, array_insert_item(trans->updates2, trans->nr_updates2, i - trans->updates2, n); + return 0; } static int extent_update_to_keys(struct btree_trans *trans, @@ -753,9 +756,9 @@ static int extent_update_to_keys(struct btree_trans *trans, iter->flags |= BTREE_ITER_INTENT; __bch2_btree_iter_set_pos(iter, insert->k.p, false); - bch2_trans_update2(trans, iter, insert); + ret = bch2_trans_update2(trans, iter, insert); bch2_trans_iter_put(trans, iter); - return 0; + return ret; } static int extent_handle_overwrites(struct btree_trans *trans, @@ -785,8 +788,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_cut_back(start, update); __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } if (bkey_cmp(k.k->p, end) > 0) { @@ -800,8 +805,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_cut_front(end, update); __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret 
= bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } else { update_iter = bch2_trans_copy_iter(trans, iter); @@ -815,8 +822,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, update->k.size = 0; __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } k = bch2_btree_iter_next_with_updates(iter); @@ -921,11 +930,11 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update(trans, i) { if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ret = extent_update_to_keys(trans, i->iter, i->k); - if (ret) - goto out; } else { - bch2_trans_update2(trans, i->iter, i->k); + ret = bch2_trans_update2(trans, i->iter, i->k); } + if (ret) + goto out; } trans_for_each_update2(trans, i) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 65ae89c80590..66e50e6b36ea 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1321,9 +1321,6 @@ int bch2_mark_update(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_old; struct bkey_s_c old; struct bkey unpacked; int ret = 0; @@ -1363,23 +1360,24 @@ int bch2_mark_update(struct btree_trans *trans, BTREE_TRIGGER_OVERWRITE|flags); } } else { + struct btree_iter *copy; + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, new->k.size, fs_usage, trans->journal_res.seq, BTREE_TRIGGER_INSERT|flags); - while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { - unsigned offset = 0; - s64 sectors; + copy = bch2_trans_copy_iter(trans, iter); - old = bkey_disassemble(b, _old, &unpacked); - sectors = -((s64) old.k->size); + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - return 0; + break; switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: @@ -1412,9 +1410,8 @@ int bch2_mark_update(struct btree_trans *trans, trans->journal_res.seq, flags) ?: 1; if (ret <= 0) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } return ret; @@ -1445,27 +1442,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, pr_err("overlapping with"); if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - struct btree *b = iter_l(i->iter)->b; - struct btree_node_iter node_iter = iter_l(i->iter)->iter; - struct bkey_packed *_k; - - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k; + struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); + struct bkey_s_c k; + int ret; - pr_info("_k %px format %u", _k, _k->format); - k = bkey_disassemble(b, _k, &unpacked); - - if (btree_node_is_extents(b) + for_each_btree_key_continue(copy, 0, k, ret) { + if (btree_node_type_is_extents(i->iter->btree_id) ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 : bkey_cmp(i->k->k.p, k.k->p)) break; bch2_bkey_val_to_text(&PBUF(buf), c, k); pr_err("%s", buf); - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; @@ -1860,8 +1850,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - bch2_trans_update(trans, iter, n, 0); out: ret = sectors; @@ -1987,15 +1975,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, BTREE_TRIGGER_OVERWRITE|flags); } } else { - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_old; - struct bkey unpacked; + struct btree_iter *copy; + struct bkey _old; EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - bkey_init(&unpacked); - old = (struct bkey_s_c) { &unpacked, NULL }; + bkey_init(&_old); + old = (struct bkey_s_c) { &_old, NULL }; ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, @@ -2003,18 +1989,16 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { - unsigned flags = BTREE_TRIGGER_OVERWRITE; - unsigned offset = 0; - s64 sectors; + copy = bch2_trans_copy_iter(trans, iter); - old = bkey_disassemble(b, _old, &unpacked); - sectors = -((s64) old.k->size); + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - return 0; + break; switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: @@ -2045,10 +2029,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), offset, sectors, flags); if (ret) - return ret; - - bch2_btree_node_iter_advance(&node_iter, b); + break; } + bch2_trans_iter_put(trans, copy); } return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 5c43678e94a3..16d2bca8a662 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -99,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct bpos *end) { struct btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned nr_iters = 0; + struct btree_iter *copy; + struct bkey_s_c k; + unsigned nr_iters = 0; int ret; - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); + *end = insert->k.p; /* extent_update_to_keys(): */ nr_iters += 1; @@ -126,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + copy = bch2_trans_copy_iter(trans, iter); + + for_each_btree_key_continue(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) @@ -155,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, &nr_iters, EXTENT_ITERS_MAX); if (ret) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); return ret 
< 0 ? ret : 0; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8560023b4c7a..54ac9cc470af 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -506,115 +506,6 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, - struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter, *split_iter; - /* - * We might cause compressed extents to be split, so we need to pass in - * a disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i *split; - struct bpos atomic_end; - /* - * Some extents aren't equivalent - w.r.t. what the triggers do - * - if they're split: - */ - bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || - k->k.type == KEY_TYPE_reflink_p; - bool remark = false; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, btree_id, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; - - atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); - ret = PTR_ERR_OR_ZERO(split); - if (ret) - goto err; - - if (!remark && - remark_if_split && - bkey_cmp(atomic_end, k->k.p) < 0) { - ret = bch2_disk_reservation_add(c, &disk_res, - k->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - remark = true; - } - - bkey_copy(split, k); - bch2_cut_front(iter->pos, split); - bch2_cut_back(atomic_end, split); - - split_iter = bch2_trans_copy_iter(&trans, iter); - - /* - * It's important that we don't go through the - * extent_handle_overwrites() and extent_update_to_keys() path - * here: journal replay is supposed to treat extents like - * regular keys - */ - __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, - BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(&trans, split_iter); - - bch2_btree_iter_set_pos(iter, split->k.p); - - if (remark) { - ret = bch2_trans_mark_key(&trans, - bkey_s_c_null, - bkey_i_to_s_c(split), - 0, split->k.size, - BTREE_TRIGGER_INSERT); - if (ret) - goto err; - } - } while (bkey_cmp(iter->pos, k->k.p) < 0); - - if (remark) { - ret = bch2_trans_mark_key(&trans, - bkey_i_to_s_c(k), - bkey_s_c_null, - 0, -((s64) k->k.size), - BTREE_TRIGGER_OVERWRITE); - if (ret) - goto err; - } - - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); -err: - bch2_trans_iter_put(&trans, iter); - - if (ret == -EINTR) - goto retry; - - bch2_disk_reservation_put(c, &disk_res); - - return bch2_trans_exit(&trans) ?: ret; -} - static int __bch2_journal_replay_key(struct btree_trans *trans, enum btree_id id, unsigned level, struct bkey_i *k) @@ -753,9 +644,7 @@ static int bch2_journal_replay(struct bch_fs *c, replay_now_at(j, keys.journal_seq_base + i->journal_seq); - ret = i->k->k.size - ? 
bch2_extent_replay_key(c, i->btree_id, i->k) - : bch2_journal_replay_key(c, i); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 767baab18807..79d03b18b5c8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -956,9 +956,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From dab9ef0d271648c24b867059855439ec48775fc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Feb 2021 15:16:41 -0500 Subject: bcachefs: Add error message for some allocation failures Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- fs/bcachefs/btree_gc.c | 38 ++++++++++++++++++++++++++++++-------- fs/bcachefs/journal_reclaim.c | 4 +++- fs/bcachefs/movinggc.c | 4 +++- fs/bcachefs/rebalance.c | 4 +++- fs/bcachefs/recovery.c | 13 ++++++++++--- 6 files changed, 53 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index aadd878b357d..eac82c9880ba 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1387,8 +1387,11 @@ int bch2_dev_allocator_start(struct bch_dev *ca) p = kthread_create(bch2_allocator_thread, ca, "bch-alloc/%s", ca->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(ca->fs, "error creating allocator thread: %li", + PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(ca->alloc_thread, p); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f687cc3bfa94..426c932098da 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -115,8 +115,10 @@ static int bch2_gc_check_topology(struct bch_fs *c, } new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); - if (!new) + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); return -ENOMEM; + } bkey_copy(new, cur.k); @@ -235,8 +237,10 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!new) + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); return -ENOMEM; + } bkey_reassemble(new, *k); @@ -302,8 +306,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); - if (ret) - return ret; + if (ret) { + bch_err(c, "error marking bkey replicas: %i", ret); + goto err; + } } ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); @@ -321,6 +327,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: +err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -448,8 +457,10 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, k, &max_stale, true); - if (ret) + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); break; + } if (b->c.level) { bch2_bkey_buf_reassemble(&cur, c, k); @@ -493,8 +504,11 @@ static int 
bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, continue; } - if (ret) + if (ret) { + bch_err(c, "%s: error %i getting btree node", + __func__, ret); break; + } ret = bch2_gc_btree_init_recurse(c, child, target_depth); @@ -551,6 +565,8 @@ static int bch2_gc_btree_init(struct bch_fs *c, fsck_err: six_unlock_read(&b->c.lock); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -574,8 +590,10 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial) int ret = initial ? bch2_gc_btree_init(c, id) : bch2_gc_btree(c, id, initial); - if (ret) + if (ret) { + bch_err(c, "%s: ret %i", __func__, ret); return ret; + } } return 0; @@ -881,6 +899,8 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -1601,8 +1621,10 @@ int bch2_gc_thread_start(struct bch_fs *c) BUG_ON(c->gc_thread); p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); c->gc_thread = p; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1141b7d3a060..0a16343fb51a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -691,8 +691,10 @@ int bch2_journal_reclaim_start(struct journal *j) p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); j->reclaim_thread = p; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e2472c19beaf..b61bbc18a0aa 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -348,8 +348,10 @@ int bch2_copygc_start(struct bch_fs *c) return -ENOMEM; t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - if (IS_ERR(t)) + if (IS_ERR(t)) { + bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); return PTR_ERR(t); + } get_task_struct(t); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2263ee41c444..c75411af4622 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -315,8 +315,10 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 54ac9cc470af..0aeaaadbf3f8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -122,8 +122,11 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, }; new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); - if (!new_keys.d) + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); return -ENOMEM; + } memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); kvfree(keys->d); @@ -145,8 +148,10 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, kmalloc(sizeof(struct bkey), GFP_KERNEL); int ret; - if (!whiteout) + if (!whiteout) { + bch_err(c, "%s: error allocating new key", __func__); return -ENOMEM; + } bkey_init(&whiteout->k); whiteout->k.p = pos; @@ -1330,8 +1335,10 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, 
NULL)); - if (ret) + if (ret) { + bch_err(c, "error creating lost+found"); goto err; + } if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); -- cgit From 18a7b97239b6f0bae3fa1475cb276a273e07597a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Feb 2021 21:41:25 -0500 Subject: bcachefs: Fix for bch2_btree_node_get_noiter() returning -ENOMEM bch2_btree_node_get_noiter() isn't used from the btree iterator code, which retries with the btree node cache cannibalize lock held on -ENOMEM, so we should do it ourself if necessary. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 443d669e6a30..2152813554b4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -844,7 +844,7 @@ retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { if (nofill) - return NULL; + goto out; b = bch2_btree_node_fill(c, NULL, k, btree_id, level, SIX_LOCK_read, true); @@ -853,8 +853,12 @@ retry: if (!b) goto retry; + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; + if (IS_ERR(b)) - return b; + goto out; } else { lock_node: ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); @@ -889,7 +893,8 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - return ERR_PTR(-EIO); + b = ERR_PTR(-EIO); + goto out; } EBUG_ON(b->c.btree_id != btree_id); @@ -898,7 +903,8 @@ lock_node: EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && bkey_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); - +out: + bch2_btree_cache_cannibalize_unlock(c); return b; } -- cgit From 59a7405161425df39d33faabf9f97c101fcb75d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Mar 2021 18:00:55 -0500 Subject: bcachefs: Create allocator threads when allocating filesystem We're seeing failures to mount because of a failure to start the allocator threads, which currently happens fairly late in the mount process, after walking all metadata, and kthread_create() fails if something has tried to kill the mount process, which is probably not what we want. This patch avoids this issue by creating, but not starting, the allocator threads when we preallocate all of our other in memory data structures. 
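A note on the mechanism, since the message only describes it at a high level: kthread_create() only creates the thread, and the new allocator_thread_running() gate keeps it idle until the filesystem actually goes read-write. The following is a condensed sketch of the hunks below (error handling and surrounding code trimmed), not new code:

	/* at device-alloc time: create the allocator thread, but let it idle */
	p = kthread_create(bch2_allocator_thread, ca, "bch-alloc/%s", ca->name);
	if (IS_ERR(p))
		return PTR_ERR(p);
	get_task_struct(p);
	rcu_assign_pointer(ca->alloc_thread, p);

	/* in the allocator thread's main loop: park until the fs is read-write */
	if (!allocator_thread_running(ca)) {
		ca->allocator_state = ALLOCATOR_STOPPED;
		if (kthread_wait_freezable(allocator_thread_running(ca)))
			break;
	}
	ca->allocator_state = ALLOCATOR_RUNNING;

	/* in __bch2_fs_read_write(): flag the allocators as runnable and wake them */
	set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
	for_each_rw_member(ca, c, i)
		bch2_wake_allocator(ca);
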
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 ++++++++++++++- fs/bcachefs/super.c | 11 +++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index eac82c9880ba..b9b97cbda177 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1068,6 +1068,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } +static inline bool allocator_thread_running(struct bch_dev *ca) +{ + return ca->mi.state == BCH_MEMBER_STATE_RW && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); +} + /** * bch_allocator_thread - move buckets from free_inc to reserves * @@ -1084,9 +1090,16 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); - ca->allocator_state = ALLOCATOR_RUNNING; while (1) { + if (!allocator_thread_running(ca)) { + ca->allocator_state = ALLOCATOR_STOPPED; + if (kthread_wait_freezable(allocator_thread_running(ca))) + break; + } + + ca->allocator_state = ALLOCATOR_RUNNING; + cond_resched(); if (kthread_should_stop()) break; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e10e7e0c0454..224c21c3f9f7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -431,6 +431,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + for_each_rw_member(ca, c, i) + bch2_wake_allocator(ca); + ret = bch2_journal_reclaim_start(&c->journal); if (ret) { bch_err(c, "error starting journal reclaim: %i", ret); @@ -1008,6 +1011,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + bch2_dev_allocator_stop(ca); + cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1172,6 +1177,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) if (!ca) goto err; + if (ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_allocator_start(ca)) { + bch2_dev_free(ca); + goto err; + } + bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); -- cgit From bcdb4b9732208fb8d3c634661a1b581437dcdd12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Mar 2021 19:04:16 -0500 Subject: bcachefs: Don't call into journal reclaim when we're not supposed to This was causing a deadlock when btree_update_nodes_writtes() invokes journal reclaim because of the btree cache being too dirty. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d99a78f8950d..70cf18bcbcdd 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -284,7 +284,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(trans->c)) + bch2_btree_key_cache_must_wait(trans->c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; if (u64s <= ck->u64s) -- cgit From fe38b720862204595f7b56b8db98ea5074c83f82 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Mar 2021 21:43:21 -0500 Subject: bcachefs: Don't use inode btree key cache in fsck code We had a cache coherency bug with the btree key cache in the fsck code - this fixes fsck to be consistent about not using it. 
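In usage terms the split below looks like this (arguments simplified; the extra parameter is the btree iterator flags, so passing 0 reads the inodes btree directly instead of going through the key cache):

	/* fsck paths: bypass the btree key cache to avoid coherency problems */
	ret = __bch2_inode_find_by_inum_trans(trans, inum, &inode, 0);

	/* normal paths: unchanged, still use the key cache */
	ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
	/* which is now just a wrapper passing BTREE_ITER_CACHED */
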
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 15 +++++++++------ fs/bcachefs/inode.c | 19 +++++++++++++++---- fs/bcachefs/inode.h | 2 ++ 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b2d9d55b1951..66c9dad2ef3e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans, buf[name.len] = '\0'; name.name = buf; - ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); + ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); if (ret) @@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans, struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = bch2_inode_find_by_inum_trans(trans, inum, - &w->inode); + int ret = __bch2_inode_find_by_inum_trans(trans, inum, + &w->inode, 0); if (ret && ret != -ENOENT) return ret; @@ -673,7 +673,7 @@ retry: continue; } - ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); + ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); if (ret && ret != -ENOENT) break; @@ -787,7 +787,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) bch_verbose(c, "checking root directory"); - ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, + root_inode, 0)); if (ret && ret != -ENOENT) return ret; @@ -834,7 +836,8 @@ static int check_lostfound(struct bch_fs *c, goto create_lostfound; } - ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); if (ret && ret != -ENOENT) return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3462e248c954..8377d39ccc4d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -628,16 +628,19 @@ err: return ret; } -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) +int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + unsigned flags) { struct btree_iter *iter; struct bkey_s_c k; int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_cached(iter); + POS(0, inode_nr), flags); + k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED + ? 
bch2_btree_iter_peek_cached(iter) + : bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; @@ -650,6 +653,14 @@ err: return ret; } +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return __bch2_inode_find_by_inum_trans(trans, inode_nr, + inode, BTREE_ITER_CACHED); + +} + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index dbdfcf63d079..1caf036ae928 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -73,6 +73,8 @@ int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64, bool); +int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, unsigned); int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -- cgit From 514852c2b58f06f1643b10264b6bc9089071b05c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 05:05:18 -0500 Subject: bcachefs: Fix a 64 bit divide on 32 bit Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index df5b375c367f..40d452cedffd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -836,13 +836,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, for (i = 0; i < j->nr_ptrs; i++) { struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + u64 offset; + + div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); if (i) pr_buf(out, " "); pr_buf(out, "%u:%llu (offset %llu)", j->ptrs[i].dev, - (u64) j->ptrs[i].offset, - (u64) j->ptrs[i].offset % ca->mi.bucket_size); + (u64) j->ptrs[i].offset, offset); } } -- cgit From 8567415457b25b467933e47ff78dca55a55f7206 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Feb 2021 01:16:49 -0500 Subject: bcachefs: Dump journal state when we get stuck We had a bug reported where the journal is failing to allocate a journal write - this should help figure out what's going on. 
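In other words, the journal state is captured into a scratch buffer at the point of failure and printed along with the fatal error. GFP_ATOMIC is presumably needed because, in the hunk below, the buffer is allocated while the journal lock is still held (that reading is inferred from the surrounding context, not stated in the message). Roughly:

	char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);

	if (journal_debug_buf)
		__bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);

	bch_err(c, "Unable to allocate journal write:\n%s", journal_debug_buf);
	kfree(journal_debug_buf);	/* kfree(NULL) is a no-op */
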
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 40d452cedffd..fdd5a837902c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1383,6 +1383,7 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; + char *journal_debug_buf = NULL; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; @@ -1484,6 +1485,12 @@ retry_alloc: goto retry_alloc; } + if (ret) { + journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + if (journal_debug_buf) + __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + } + /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): @@ -1498,7 +1505,9 @@ retry_alloc: spin_unlock(&j->lock); if (ret) { - bch_err(c, "Unable to allocate journal write"); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf); + kfree(journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); return; -- cgit From 1889ad5a1285ba452f6a8cef3df663087611050a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Mar 2021 19:01:14 -0400 Subject: bcachefs: Add code to scan for/rewite old btree nodes This adds a new data job type to scan for btree nodes in the old extent format, and rewrite them. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 17 +++-- fs/bcachefs/btree_io.c | 5 ++ fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 1 + fs/bcachefs/move.c | 131 +++++++++++++++++++++++++++++------- fs/bcachefs/move.h | 6 +- fs/bcachefs/movinggc.c | 6 +- fs/bcachefs/rebalance.c | 3 +- 8 files changed, 132 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 38c6ac96e12f..1ef9907e07ad 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -171,10 +171,11 @@ struct bch_ioctl_disk_set_state { }; enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_NR = 3, + BCH_DATA_OP_SCRUB = 0, + BCH_DATA_OP_REREPLICATE = 1, + BCH_DATA_OP_MIGRATE = 2, + BCH_DATA_OP_REWRITE_OLD_NODES = 3, + BCH_DATA_OP_NR = 4, }; /* @@ -187,11 +188,13 @@ enum bch_data_ops { * job. The file descriptor is O_CLOEXEC. 
*/ struct bch_ioctl_data { - __u32 op; + __u16 op; + __u8 start_btree; + __u8 end_btree; __u32 flags; - struct bpos start; - struct bpos end; + struct bpos start_pos; + struct bpos end_pos; union { struct { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6e656ed6b32a..eac51c39fc6c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -920,6 +920,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned u64s; int ret, retry_read = 0, write = READ; + b->version_ondisk = U16_MAX; + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); sort_iter_init(iter, b); iter->size = (btree_blocks(c) + 1) * 2; @@ -1000,6 +1002,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sectors = vstruct_sectors(bne, c->block_bits); } + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + ret = validate_bset(c, ca, b, i, sectors, READ, have_retry); if (ret) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 80bb31a53339..55d8d815a04a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -76,6 +76,7 @@ struct btree { u16 written; u8 nsets; u8 nr_key_bits; + u16 version_ondisk; struct bkey_format format; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 285365ba7012..989ba81207c9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bch2_bset_init_first(b, &b->data->keys); b->c.level = level; b->c.btree_id = as->btree_id; + b->version_ondisk = c->sb.version; memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 28e2125c12ed..72958b867014 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -531,7 +531,7 @@ static int __bch2_move_data(struct bch_fs *c, stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; - stats->pos = POS_MIN; + stats->pos = start; iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); @@ -646,14 +646,15 @@ out: } int bch2_move_data(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, struct bch_ratelimit *rate, struct write_point_specifier wp, - struct bpos start, - struct bpos end, move_pred_fn pred, void *arg, struct bch_move_stats *stats) { struct moving_context ctxt = { .stats = stats }; + enum btree_id id; int ret; closure_init_stack(&ctxt.cl); @@ -662,10 +663,23 @@ int bch2_move_data(struct bch_fs *c, stats->data_type = BCH_DATA_user; - ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_EXTENTS) ?: - __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_REFLINK); + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { + stats->btree_id = id; + + if (id != BTREE_ID_EXTENTS && + id != BTREE_ID_REFLINK) + continue; + + ret = __bch2_move_data(c, &ctxt, rate, wp, + id == start_btree_id ? start_pos : POS_MIN, + id == end_btree_id ? 
end_pos : POS_MAX, + pred, arg, stats, id); + if (ret) + break; + } + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -679,16 +693,22 @@ int bch2_move_data(struct bch_fs *c, return ret; } +typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_opts *); + static int bch2_move_btree(struct bch_fs *c, - move_pred_fn pred, - void *arg, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + move_btree_pred pred, void *arg, struct bch_move_stats *stats) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned id; + enum btree_id id; struct data_opts data_opts; enum data_cmd cmd; int ret = 0; @@ -697,16 +717,24 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = 0; id < BTREE_ID_NR; id++) { + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, POS_MIN, + for_each_btree_node(&trans, iter, id, + id == start_btree_id ? start_pos : POS_MIN, BTREE_ITER_PREFETCH, b) { + if (kthread && kthread_should_stop()) + goto out; + + if ((cmp_int(id, end_btree_id) ?: + bkey_cmp(b->key.k.p, end_pos)) > 0) + break; + stats->pos = iter->pos; - switch ((cmd = pred(c, arg, - bkey_i_to_s_c(&b->key), - &io_opts, &data_opts))) { + switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -726,7 +754,7 @@ next: ret = bch2_trans_iter_free(&trans, iter) ?: ret; } - +out: bch2_trans_exit(&trans); return ret; @@ -785,6 +813,38 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, return DATA_REWRITE; } +static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + if (b->version_ondisk != c->sb.version || + btree_node_need_rewrite(b)) { + data_opts->target = 0; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + return DATA_REWRITE; + } + + return DATA_SKIP; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -796,17 +856,20 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), 
rereplicate_pred, c, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; @@ -817,16 +880,32 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + migrate_btree_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), migrate_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rewrite_old_nodes_pred, &op, stats) ?: ret; + + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; default: ret = -EINVAL; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index b04bc669226d..403ca695c875 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -52,9 +52,11 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_opts *); -int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, +int bch2_move_data(struct bch_fs *, + enum btree_id, struct bpos, + enum btree_id, struct bpos, + struct bch_ratelimit *, struct write_point_specifier, - struct bpos, struct bpos, move_pred_fn, void *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b61bbc18a0aa..65a8cd14ee75 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -219,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c) sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &c->copygc_pd.rate, + ret = bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + &c->copygc_pd.rate, writepoint_ptr(&c->copygc_write_point), - POS_MIN, POS_MAX, copygc_pred, NULL, &move_stats); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c75411af4622..c83c12dbb0d2 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg) rebalance_work_reset(c); bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, /* ratelimiting disabled for now */ NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), - POS_MIN, POS_MAX, rebalance_pred, NULL, &r->move_stats); } -- cgit From a4805d6672aac04784af132f0e11ac1dfb208079 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Mar 2021 18:39:16 -0400 Subject: bcachefs: Scan for old btree nodes if necessary on mount We dropped support for !BTREE_NODE_NEW_EXTENT_OVERWRITE but it turned out there were people who still had filesystems with btree nodes in that format in the wild. This adds a new compat feature that indicates we've scanned for and rewritten nodes in the old format, and does that scan at mount time if the option isn't set. 
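The gating works the usual way for a one-time conversion: if the compat bit isn't set, do the scan/rewrite, then set the bit and persist the superblock so later mounts skip it. Condensed from the recovery.c and move.c hunks below (the read-write transition and error paths are omitted here):

	if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE))) {
		struct bch_move_stats stats = { 0 };

		ret = bch2_scan_old_btree_nodes(c, &stats);
		if (ret)
			goto err;
	}

	/* on success, inside bch2_scan_old_btree_nodes(): */
	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE;
	c->disk_sb.sb->version_min = c->disk_sb.sb->version;
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
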
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/btree_gc.c | 3 ++- fs/bcachefs/move.c | 31 ++++++++++++++++++++----------- fs/bcachefs/move.h | 2 ++ fs/bcachefs/rebalance.c | 3 +++ fs/bcachefs/recovery.c | 16 ++++++++++++++++ 6 files changed, 44 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cf092903a6ab..e2df0f7182b4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1373,6 +1373,7 @@ enum bch_sb_feature { enum bch_sb_compat { BCH_COMPAT_FEAT_ALLOC_INFO = 0, BCH_COMPAT_FEAT_ALLOC_METADATA = 1, + BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, }; /* options: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 426c932098da..259a36f41629 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1618,7 +1618,8 @@ int bch2_gc_thread_start(struct bch_fs *c) { struct task_struct *p; - BUG_ON(c->gc_thread); + if (c->gc_thread) + return 0; p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 72958b867014..ed18abf8bf14 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -845,6 +845,25 @@ static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, return DATA_SKIP; } +int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) +{ + int ret; + + ret = bch2_move_btree(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -894,17 +913,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, - rewrite_old_nodes_pred, &op, stats) ?: ret; - - if (!ret) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = c->disk_sb.sb->version; - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } + ret = bch2_scan_old_btree_nodes(c, stats); break; default: ret = -EINVAL; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 403ca695c875..5076153689d1 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -52,6 +52,8 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_opts *); +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + int bch2_move_data(struct bch_fs *, enum btree_id, struct bpos, enum btree_id, struct bpos, diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c83c12dbb0d2..0e1f18d82855 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -312,6 +312,9 @@ int bch2_rebalance_start(struct bch_fs *c) { struct task_struct *p; + if (c->rebalance.thread) + return 0; + if (c->opts.nochanges) return 0; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0aeaaadbf3f8..e322dc35f992 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "move.h" #include "quota.h" #include "recovery.h" #include 
"replicas.h" @@ -1200,6 +1201,20 @@ use_clean: bch_verbose(c, "quotas done"); } + if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE))) { + struct bch_move_stats stats = { 0 }; + + bch_verbose(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_verbose(c, "scanning for old btree nodes done"); + } + mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { if (c->sb.version < bcachefs_metadata_version_new_versioning) @@ -1271,6 +1286,7 @@ int bch2_fs_initialize(struct bch_fs *c) le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From e01dacf76c0c6a5fc6963b7857773b3d58740acb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 23:55:36 -0400 Subject: bcachefs: Fix bkey format generation for 32 bit fields Having a packed format that can represent a field larger than the unpacked type breaks bkey_packed_successor() assertions - we need to fix this to start using the snapshot filed. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +++-- fs/bcachefs/bkey.c | 7 ++++++- fs/bcachefs/move.c | 28 +++++++++++++++++++++++++++- fs/bcachefs/recovery.c | 4 +++- 4 files changed, 39 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e2df0f7182b4..0a615fe6c1c1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1371,9 +1371,10 @@ enum bch_sb_feature { }; enum bch_sb_compat { - BCH_COMPAT_FEAT_ALLOC_INFO = 0, - BCH_COMPAT_FEAT_ALLOC_METADATA = 1, + BCH_COMPAT_FEAT_ALLOC_INFO = 0, + BCH_COMPAT_FEAT_ALLOC_METADATA = 1, BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, + BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE = 3, }; /* options: */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 6417307f42b9..aeac07e2cb32 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -554,7 +554,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, unsigned bits, u64 offset) { - offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + + bits = min(bits, unpacked_bits); + + offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); f->bits_per_field[i] = bits; f->field_offset[i] = cpu_to_le64(offset); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ed18abf8bf14..de3554e3854b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -829,13 +829,38 @@ static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +static bool bformat_needs_redo(struct bkey_format *f) +{ + unsigned i; + + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > unpacked_bits) + return true; + + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return true; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) + return true; + } + + return false; +} + static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, struct btree *b, struct bch_io_opts *io_opts, struct data_opts *data_opts) { if (b->version_ondisk != c->sb.version || - btree_node_need_rewrite(b)) { + btree_node_need_rewrite(b) || + bformat_needs_redo(&b->format)) { data_opts->target = 0; data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; @@ -856,6 +881,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) if (!ret) { mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e322dc35f992..edcf6389d2fd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1201,7 +1201,8 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE))) { + if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { struct bch_move_stats stats = { 0 }; bch_verbose(c, "scanning for old btree nodes"); @@ -1287,6 +1288,7 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From 220d206232ba682002b06feb68969e462867f8f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Mar 2021 21:46:23 -0500 Subject: bcachefs: Fix an allocator startup race Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 224c21c3f9f7..19399447f379 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1177,6 +1177,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) if (!ca) goto err; + ca->fs = c; + if (ca->mi.state == BCH_MEMBER_STATE_RW && bch2_dev_allocator_start(ca)) { bch2_dev_free(ca); -- cgit From 33a391a2551beca12926c77b16404f6aab2e8f58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Mar 2021 
22:11:22 -0400 Subject: bcachefs: Fix some (spurious) warnings about uninitialized vars These are only complained about when building in userspace, for some reason. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/extents.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 259a36f41629..7c03c50e0fa5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -166,7 +166,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct extent_ptr_decoded p = { 0 }; bool do_update = false; int ret = 0; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4007af4a780b..4bf4c27da6cd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -696,7 +696,7 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct extent_ptr_decoded p = { 0 }; unsigned replicas = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- cgit From 19dd3172b0002a3a5f8ead324db03a72c0cac2a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Apr 2021 21:57:35 -0400 Subject: bcachefs: Use x-macros for compat feature bits This is to generate strings for them, so that we can print them out. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 ++++++++++---- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/move.c | 7 +++++-- fs/bcachefs/opts.c | 7 +++++++ fs/bcachefs/opts.h | 1 + fs/bcachefs/recovery.c | 28 ++++++++++++++-------------- fs/bcachefs/super-io.c | 5 ++--- 7 files changed, 40 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0a615fe6c1c1..dff49ab7e93d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1370,11 +1370,17 @@ enum bch_sb_feature { BCH_FEATURE_NR, }; +#define BCH_SB_COMPAT() \ + x(alloc_info, 0) \ + x(alloc_metadata, 1) \ + x(extents_above_btree_updates_done, 2) \ + x(bformat_overflow_done, 3) + enum bch_sb_compat { - BCH_COMPAT_FEAT_ALLOC_INFO = 0, - BCH_COMPAT_FEAT_ALLOC_METADATA = 1, - BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, - BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE = 3, +#define x(f, n) BCH_COMPAT_##f, + BCH_SB_COMPAT() +#undef x + BCH_COMPAT_NR, }; /* options: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7c03c50e0fa5..414642099ea1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -762,7 +762,7 @@ static int bch2_gc_done(struct bch_fs *c, { struct bch_dev *ca; bool verify = (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index de3554e3854b..2343f41715ef 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -757,6 +757,9 @@ next: out: bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + return ret; } @@ -880,8 +883,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << 
BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 97a36ac0beea..d53b6dccd161 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -23,6 +23,13 @@ const char * const bch2_sb_features[] = { NULL }; +const char * const bch2_sb_compat[] = { +#define x(f, n) #f, + BCH_SB_COMPAT() +#undef x + NULL +}; + const char * const bch2_csum_opts[] = { "none", "crc32c", diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 01282314bacb..d2b3549a33af 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -10,6 +10,7 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index edcf6389d2fd..c42919277c72 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -933,7 +933,7 @@ static int read_btree_roots(struct bch_fs *c) if (i == BTREE_ID_ALLOC && c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } @@ -943,7 +943,7 @@ static int read_btree_roots(struct bch_fs *c) "invalid btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_ALLOC) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); @@ -953,7 +953,7 @@ static int read_btree_roots(struct bch_fs *c) "error reading btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_ALLOC) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -1020,7 +1020,7 @@ int bch2_fs_recovery(struct bch_fs *c) last_journal_entry && !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } @@ -1061,7 +1061,7 @@ use_clean: } if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); } @@ -1114,8 +1114,8 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; @@ -1201,11 +1201,11 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { 
struct bch_move_stats stats = { 0 }; - bch_verbose(c, "scanning for old btree nodes"); + bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); if (ret) goto err; @@ -1213,7 +1213,7 @@ use_clean: ret = bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; - bch_verbose(c, "scanning for old btree nodes done"); + bch_info(c, "scanning for old btree nodes done"); } mutex_lock(&c->sb_lock); @@ -1227,7 +1227,7 @@ use_clean: } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; write_sb = true; } @@ -1287,8 +1287,8 @@ int bch2_fs_initialize(struct bch_fs *c) le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 79d03b18b5c8..ce370cf2a72f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -382,7 +382,6 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -/* doesn't copy member info */ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; @@ -1083,8 +1082,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); -- cgit From 1b05778707d04cb367a7ca8dbeff571b6117a191 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Apr 2021 00:53:42 -0400 Subject: bcachefs: Add a cond_seched() to the allocator thread This is just a band-aid fix for now. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b9b97cbda177..f2117084f2fe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1127,6 +1127,7 @@ static int bch2_allocator_thread(void *arg) pr_debug("free_inc now empty"); do { + cond_resched(); /* * Find some buckets that we can invalidate, either * they're completely unused, or only contain clean data -- cgit From ed8269cc1d4107985aa92ade34cd3fe71315dd6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Apr 2021 23:41:10 -0400 Subject: bcachefs: Don't fail mounts due to devices that are marked as failed If a given set of replicas is entirely on failed devices, don't fail the mount: we will still fail the mount if we have some copies on non failed devices. 
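Concretely, the availability check now tallies how many devices in each replicas entry are online and how many are marked failed, and entries whose devices have all failed are skipped rather than counted as missing data. Condensed from the replicas.c hunk that follows:

	/* within the for_each_cpu_replicas_entry() loop: */
	unsigned i, nr_online = 0, nr_failed = 0;

	for (i = 0; i < e->nr_devs; i++) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);

		nr_online += test_bit(e->devs[i], devs.d);
		nr_failed += ca->mi.state == BCH_MEMBER_STATE_FAILED;
	}

	if (nr_failed == e->nr_devs)
		continue;	/* every copy lives on a failed device */
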
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 8003973b0400..15ff0d3c936a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -975,11 +975,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, dflags = 0; + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; - for (i = 0; i < e->nr_devs; i++) + for (i = 0; i < e->nr_devs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + nr_online += test_bit(e->devs[i], devs.d); + nr_failed += ca->mi.state == BCH_MEMBER_STATE_FAILED; + } + + if (nr_failed == e->nr_devs) + continue; if (nr_online < e->nr_required) dflags |= metadata -- cgit From 98f2197de49b8eb038909e709c79c13178022dda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Apr 2021 19:04:57 -0400 Subject: bcachefs: Fix bch2_write_super to obey very_degraded option Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ce370cf2a72f..776c026ac838 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -699,8 +699,12 @@ int bch2_write_super(struct bch_fs *c) const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; int ret = 0; + if (c->opts.very_degraded) + degraded_flags |= BCH_FORCE_IF_LOST; + lockdep_assert_held(&c->sb_lock); closure_init_stack(cl); @@ -769,13 +773,13 @@ int bch2_write_super(struct bch_fs *c) nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = - bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); + bch2_have_enough_devs(c, sb_written, degraded_flags, false); for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = - bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); + bch2_have_enough_devs(c, sb_written, degraded_flags, false); /* * If we would be able to mount _without_ the devices we successfully -- cgit From 41e3778636cab27ef43e6e1b1cf3d8c2952cc77a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 16:54:11 -0400 Subject: bcachefs: Bring back metadata only gc This is useful for the filesystem dump debugging tool - when we're hitting bugs we want to skip as much of the recovery process as possible, and the dump tool only needs to know where metadata lives. 
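With the extra flag, GC can be asked to walk only metadata: the btree walks stop above the leaf level, and user/cached accounting is neither rebuilt nor verified. Usage as in the recovery.c hunk below (the second and third arguments are initial and metadata_only):

	bool metadata_only = c->opts.norecovery;

	ret = bch2_gc(c, true, metadata_only);
	if (ret)
		goto err;
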
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 65 ++++++++++++++++++++++++++++++++------------------ fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/recovery.c | 4 +++- 3 files changed, 46 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 414642099ea1..12f77ff4de62 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -378,12 +378,13 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial) + bool initial, bool metadata_only) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = bch2_expensive_debug_checks ? 0 + unsigned depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -526,11 +527,13 @@ fsck_err: } static int bch2_gc_btree_init(struct bch_fs *c, - enum btree_id btree_id) + enum btree_id btree_id, + bool metadata_only) { struct btree *b; - unsigned target_depth = bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; int ret = 0; @@ -576,7 +579,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial) +static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -588,8 +591,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial) for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, id) - : bch2_gc_btree(c, id, initial); + ? bch2_gc_btree_init(c, id, metadata_only) + : bch2_gc_btree(c, id, initial, metadata_only); if (ret) { bch_err(c, "%s: ret %i", __func__, ret); return ret; @@ -758,10 +761,10 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial) + bool initial, bool metadata_only) { struct bch_dev *ca; - bool verify = (!initial || + bool verify = !metadata_only && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; @@ -800,7 +803,7 @@ static int bch2_gc_done(struct bch_fs *c, #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - { + if (!metadata_only) { struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; @@ -873,20 +876,28 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -904,7 +915,8 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c) +static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -968,6 +980,13 @@ static int bch2_gc_start(struct bch_fs *c) d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; + + if (metadata_only && + (s->mark.data_type == BCH_DATA_user || + s->mark.data_type == BCH_DATA_cached)) { + d->_mark = s->mark; + d->_mark.owned_by_allocator = 0; + } } }; @@ -994,7 +1013,7 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial) +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -1010,13 +1029,13 @@ int bch2_gc(struct bch_fs *c, bool initial) closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c); + ret = bch2_gc_start(c, metadata_only); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, initial); + ret = bch2_gc_btrees(c, initial, metadata_only); if (ret) goto out; @@ -1054,7 +1073,7 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial); + ret = bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); } else { diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index fa604efc70cc..f516faded269 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -6,7 +6,7 @@ void bch2_coalesce(struct bch_fs *); -int bch2_gc(struct bch_fs *, bool); +int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c42919277c72..740fdeafe1a2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1117,9 +1117,11 @@ use_clean: !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bool metadata_only = c->opts.norecovery; + bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, true); + ret = bch2_gc(c, true, 
metadata_only); if (ret) goto err; bch_verbose(c, "mark and sweep done"); -- cgit From d065472c3a7966b5104cce6901f329250f629758 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 18:02:57 -0400 Subject: bcachefs: Fix a use-after-free in bch2_gc_mark_key() bch2_check_fix_ptrs() can update/reallocate k Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 12f77ff4de62..f7a5bd9eca0b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -281,10 +281,10 @@ fsck_err: static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, unsigned level, bool is_root, - struct bkey_s_c k, + struct bkey_s_c *k, u8 *max_stale, bool initial) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; unsigned flags = BTREE_TRIGGER_GC| @@ -293,28 +293,29 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k.k->version.lo > journal_cur_seq(&c->journal)); + k->k->version.lo > journal_cur_seq(&c->journal)); - if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", - k.k->version.lo, + k->k->version.lo, atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k.k->version.lo); + atomic64_set(&c->key_version, k->k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, "superblock not marked as containing replicas (type %u)", - k.k->type)) { - ret = bch2_mark_bkey_replicas(c, k); + k->k->type)) { + ret = bch2_mark_bkey_replicas(c, *k); if (ret) { bch_err(c, "error marking bkey replicas: %i", ret); goto err; } } - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); } + ptrs = bch2_bkey_ptrs_c(*k); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr, true); @@ -325,7 +326,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); + bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags); fsck_err: err: if (ret) @@ -356,7 +357,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, - k, max_stale, initial); + &k, max_stale, initial); if (ret) break; @@ -426,10 +427,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - if (!btree_node_fake(b)) + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - bkey_i_to_s_c(&b->key), - &max_stale, initial); + &k, &max_stale, initial); + } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -457,7 +460,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, 
false, - k, &max_stale, true); + &k, &max_stale, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); break; @@ -561,10 +564,12 @@ static int bch2_gc_btree_init(struct bch_fs *c, if (b->c.level >= target_depth) ret = bch2_gc_btree_init_recurse(c, b, target_depth); - if (!ret) + if (!ret) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - bkey_i_to_s_c(&b->key), - &max_stale, true); + &k, &max_stale, true); + } fsck_err: six_unlock_read(&b->c.lock); -- cgit From 006d69aa2655f1a0ca4e47666939669f27bb740f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 18:59:54 -0400 Subject: bcachefs: Don't drop ptrs to btree nodes If a ptr gen doesn't match the bucket gen, the bucket likely doesn't contain the data we want - but it's still possible the data we want might have been overwritten, and for btree node pointers we can verify whether or not the node is the one we wanted with the node's sequence number, so it's better to keep the pointer and try reading from it. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 51 ++++++++++++++++++++++++++++++++------------------ fs/bcachefs/btree_io.c | 5 ++++- 2 files changed, 37 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f7a5bd9eca0b..7506a3de58ff 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -244,25 +244,40 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_reassemble(new, *k); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0); - })); + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + ptr->gen = g->mark.gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct stripe *m = genradix_ptr(&c->stripes[true], - entry->stripe_ptr.idx); - - if (!m || !m->alive) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); + + if (!m || !m->alive) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } } } } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index eac51c39fc6c..c7c91c1d5b23 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1206,14 +1206,17 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; + char buf[200]; int 
ret; + btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, - "btree node read error: no device to read from")) { + "btree node read error: no device to read from\n" + " at %s", buf)) { set_btree_node_read_error(b); return; } -- cgit From cb66fc5fe4cc806d60d8884cb82b67c357b49640 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 09:49:23 -0400 Subject: bcachefs: Fix copygc threshold A while back the meaning of is_available_bucket() and thus also bch_dev_usage->buckets_unavailable changed to include buckets that are owned by the allocator - this was so that the stat could be persisted like other allocation information, and wouldn't have to be regenerated by walking each bucket at mount time. This broke copygc, which needs to consider buckets that are reclaimable and haven't yet been grabbed by the allocator thread and moved onto a freelist. This patch fixes that by adding dev_buckets_reclaimable() for copygc and the allocator thread, and cleans up some of the callers a bit. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 19 ++++--------------- fs/bcachefs/alloc_foreground.c | 5 ++++- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/buckets.h | 26 ++++++++++++++++---------- fs/bcachefs/movinggc.c | 5 ++--- fs/bcachefs/sysfs.c | 9 ++++++--- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f2117084f2fe..f603fd347d58 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -46,7 +46,7 @@ static void pd_controllers_update(struct work_struct *work) struct bch_dev_usage stats = bch2_dev_usage_read(ca); free += bucket_to_sector(ca, - __dev_buckets_free(ca, stats)) << 9; + __dev_buckets_available(ca, stats)) << 9; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC @@ -499,7 +499,6 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; s64 available; - unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -515,19 +514,12 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = dev_buckets_available(ca); + available = dev_buckets_reclaimable(ca); available -= ca->inc_gen_really_needs_gc; - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - available -= fifo_used(&ca->free[i]); - spin_unlock(&c->freelist_lock); - available = max(available, 0LL); - if (available > fifo_free(&ca->free_inc) || - (available && - !fifo_full(&ca->free[RESERVE_MOVINGGC]))) + if (available) break; up_read(&c->gc_lock); @@ -1189,7 +1181,7 @@ stop: void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1232,8 +1224,6 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - copygc_threshold += dev_reserve; - capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); @@ -1251,7 +1241,6 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); - c->copygc_threshold = copygc_threshold; c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 97b692bcfe46..4834ac798b9e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -109,7 +109,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) spin_lock(&c->freelist_lock); ob->freelist = c->open_buckets_freelist; c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + ca->nr_open_buckets--; spin_unlock(&c->freelist_lock); closure_wake_up(&c->open_buckets_wait); @@ -316,6 +318,7 @@ out: c->blocked_allocate = 0; } + ca->nr_open_buckets++; spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); @@ -351,7 +354,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, struct dev_stripe_state *stripe) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_free(ca); + u64 free_space = dev_buckets_available(ca); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cec5c3ddce34..76b72ed693a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -447,6 +447,7 @@ struct bch_dev { */ alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; + unsigned nr_open_buckets; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; @@ -772,7 +773,6 @@ mempool_t bio_bounce_pages; copygc_heap copygc_heap; struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; - u64 copygc_threshold; /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c965c4d48218..e53cee27a720 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -175,25 +175,31 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, return total - stats.buckets_unavailable; } -/* - * Number of reclaimable buckets - only for use by the allocator thread: - */ static inline u64 dev_buckets_available(struct bch_dev *ca) { return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); } -static inline u64 __dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage stats) +static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, + struct bch_dev_usage stats) { - return __dev_buckets_available(ca, stats) + - fifo_used(&ca->free[RESERVE_NONE]) + - fifo_used(&ca->free_inc); + struct bch_fs *c = ca->fs; + s64 available = __dev_buckets_available(ca, stats); + unsigned i; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + available -= fifo_used(&ca->free_inc); + available -= ca->nr_open_buckets; + spin_unlock(&c->freelist_lock); + + return max(available, 0LL); } -static inline u64 dev_buckets_free(struct bch_dev *ca) +static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); } /* Filesystem usage: */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 65a8cd14ee75..b8da600cdc53 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -282,13 +282,12 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_idx; - u64 fragmented_allowed = c->copygc_threshold; - u64 fragmented = 0; + u64 fragmented_allowed = 0, fragmented = 0; for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed += 
((__dev_buckets_available(ca, usage) * + fragmented_allowed += ((__dev_buckets_reclaimable(ca, usage) * ca->mi.bucket_size) >> 1); fragmented += usage.d[BCH_DATA_user].fragmented; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 49c19873ad6f..9f75f72f7b12 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -805,7 +805,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "free[RESERVE_MOVINGGC]\t%zu/%zu\n" "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" - "open buckets\t\t%u/%u (reserved %u)\n" + "open buckets allocated\t%u\n" + "open buckets this dev\t%u\n" + "open buckets total\t%u\n" "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" @@ -816,8 +818,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, + OPEN_BUCKETS_COUNT, c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], -- cgit From 5bbe4bf95bdd18500c5de52e5d38a91fbb5f6234 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 14:45:55 -0400 Subject: bcachefs: Add copygc wait to sysfs Currently debugging an issue with copygc not running when it's supposed to, and this is an obvious first step. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/movinggc.c | 3 +++ fs/bcachefs/sysfs.c | 5 +++++ 3 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 76b72ed693a8..234918fdd717 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -773,6 +773,7 @@ mempool_t bio_bounce_pages; copygc_heap copygc_heap; struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; + s64 copygc_wait; /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b8da600cdc53..113013d1fc48 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -311,11 +311,14 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; } + c->copygc_wait = 0; + if (bch2_copygc(c)) break; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 9f75f72f7b12..1f31458ffec9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -188,6 +188,7 @@ rw_attribute(cache_replacement_policy); rw_attribute(label); rw_attribute(copy_gc_enabled); +read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(copy_gc); rw_attribute(rebalance_enabled); @@ -336,6 +337,9 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); + sysfs_hprint(copy_gc_wait, + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); if (attr == &sysfs_rebalance_work) { bch2_rebalance_work_to_text(&out, c); @@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_prune_cache, &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, 
&sysfs_rebalance_work, -- cgit From 51c66fedc0ea4a16d5d45f94a619c43897018da8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Apr 2021 20:24:54 -0400 Subject: bcachefs: Rip out copygc pd controller We have a separate mechanism for ratelimiting copygc now - the pd controller has only been causing problems. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 35 ----------------------------------- fs/bcachefs/bcachefs.h | 4 ---- fs/bcachefs/movinggc.c | 7 +------ fs/bcachefs/super.c | 4 ---- fs/bcachefs/sysfs.c | 11 ----------- 5 files changed, 1 insertion(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f603fd347d58..055b6b559666 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -31,38 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -/* Ratelimiting/PD controllers */ - -static void pd_controllers_update(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, - pd_controllers_update); - struct bch_dev *ca; - s64 free = 0, fragmented = 0; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - - free += bucket_to_sector(ca, - __dev_buckets_available(ca, stats)) << 9; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.d[BCH_DATA_user].buckets + - stats.d[BCH_DATA_cached].buckets) - - (stats.d[BCH_DATA_user].sectors + - stats.d[BCH_DATA_cached].sectors)) << 9); - } - - bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); - schedule_delayed_work(&c->pd_controllers_update, - c->pd_controllers_update_seconds * HZ); -} - /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -1405,7 +1373,4 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 234918fdd717..12441f943a6d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -664,9 +664,6 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ - struct delayed_work pd_controllers_update; - unsigned pd_controllers_update_seconds; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; u64 capacity; /* sectors */ @@ -771,7 +768,6 @@ mempool_t bio_bounce_pages; /* COPYGC */ struct task_struct *copygc_thread; copygc_heap copygc_heap; - struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; s64 copygc_wait; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 113013d1fc48..e25fa0a2a4b5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -222,7 +222,7 @@ static int bch2_copygc(struct bch_fs *c) ret = bch2_move_data(c, 0, POS_MIN, BTREE_ID_NR, POS_MAX, - &c->copygc_pd.rate, + NULL, writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); @@ -328,9 +328,6 @@ static int bch2_copygc_thread(void *arg) void bch2_copygc_stop(struct bch_fs *c) { - c->copygc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&c->copygc_pd.rate); - if (c->copygc_thread) { kthread_stop(c->copygc_thread); put_task_struct(c->copygc_thread); @@ -367,6 +364,4 @@ int bch2_copygc_start(struct bch_fs *c) void 
bch2_fs_copygc_init(struct bch_fs *c) { - bch2_pd_controller_init(&c->copygc_pd); - c->copygc_pd.d_term = 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 19399447f379..650a559737fd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -293,7 +293,6 @@ void bch2_fs_read_only(struct bch_fs *c) percpu_ref_kill(&c->writes); cancel_work_sync(&c->ec_stripe_delete_work); - cancel_delayed_work(&c->pd_controllers_update); /* * If we're not doing an emergency shutdown, we want to wait on @@ -378,8 +377,6 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return ret; } - schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - schedule_work(&c->ec_stripe_delete_work); return 0; @@ -571,7 +568,6 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->btree_write_error_work); - cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1f31458ffec9..ff93e5ba9f41 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -189,7 +189,6 @@ rw_attribute(label); rw_attribute(copy_gc_enabled); read_attribute(copy_gc_wait); -sysfs_pd_controller_attribute(copy_gc); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); @@ -198,8 +197,6 @@ rw_attribute(promote_whole_extents); read_attribute(new_stripes); -rw_attribute(pd_controllers_update_seconds); - read_attribute(io_timers_read); read_attribute(io_timers_write); @@ -331,12 +328,8 @@ SHOW(bch2_fs) sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - sysfs_print(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); - sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - sysfs_pd_controller_show(copy_gc, &c->copygc_pd); sysfs_hprint(copy_gc_wait, max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); @@ -447,10 +440,7 @@ STORE(bch2_fs) return ret; } - sysfs_strtoul(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_pd_controller_store(copy_gc, &c->copygc_pd); sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); @@ -572,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_enabled, &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - sysfs_pd_controller_files(copy_gc), &sysfs_new_stripes, -- cgit From bae895a5a3300c2da605dd0c841e175c4c9e5872 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Apr 2021 17:54:56 -0400 Subject: bcachefs: Add allocator thread state to sysfs Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 25 ++++++++++++++++--------- fs/bcachefs/alloc_background.h | 4 +++- fs/bcachefs/alloc_types.h | 12 ++++++++++++ fs/bcachefs/bcachefs.h | 11 +---------- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/sysfs.c | 6 ++++-- 6 files changed, 37 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 055b6b559666..54e58b377e51 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -25,6 +25,13 @@ #include #include +const char * const bch2_allocator_states[] = { +#define x(n) #n, + ALLOC_THREAD_STATES() +#undef x + NULL +}; + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() @@ -469,7 +476,7 
@@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) s64 available; int ret = 0; - ca->allocator_state = ALLOCATOR_BLOCKED; + ca->allocator_state = ALLOCATOR_blocked; closure_wake_up(&c->freelist_wait); while (1) { @@ -497,7 +504,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) } __set_current_state(TASK_RUNNING); - ca->allocator_state = ALLOCATOR_RUNNING; + ca->allocator_state = ALLOCATOR_running; closure_wake_up(&c->freelist_wait); return ret; @@ -978,15 +985,15 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t fifo_pop(&ca->free_inc, bucket); closure_wake_up(&c->freelist_wait); - ca->allocator_state = ALLOCATOR_RUNNING; + ca->allocator_state = ALLOCATOR_running; spin_unlock(&c->freelist_lock); goto out; } } - if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { - ca->allocator_state = ALLOCATOR_BLOCKED_FULL; + if (ca->allocator_state != ALLOCATOR_blocked_full) { + ca->allocator_state = ALLOCATOR_blocked_full; closure_wake_up(&c->freelist_wait); } @@ -1053,12 +1060,12 @@ static int bch2_allocator_thread(void *arg) while (1) { if (!allocator_thread_running(ca)) { - ca->allocator_state = ALLOCATOR_STOPPED; + ca->allocator_state = ALLOCATOR_stopped; if (kthread_wait_freezable(allocator_thread_running(ca))) break; } - ca->allocator_state = ALLOCATOR_RUNNING; + ca->allocator_state = ALLOCATOR_running; cond_resched(); if (kthread_should_stop()) @@ -1139,7 +1146,7 @@ static int bch2_allocator_thread(void *arg) stop: pr_debug("alloc thread stopping (ret %i)", ret); - ca->allocator_state = ALLOCATOR_STOPPED; + ca->allocator_state = ALLOCATOR_stopped; closure_wake_up(&c->freelist_wait); return 0; } @@ -1319,7 +1326,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) { if (ca->alloc_thread) closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_RUNNING); + ca->allocator_state != ALLOCATOR_running); } /* stop allocator thread: */ diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 6fededcd9f86..73e1c27c96e3 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -6,6 +6,8 @@ #include "alloc_types.h" #include "debug.h" +extern const char * const bch2_allocator_states[]; + struct bkey_alloc_unpacked { u64 bucket; u8 dev; @@ -100,7 +102,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) p = rcu_dereference(ca->alloc_thread); if (p) { wake_up_process(p); - ca->allocator_state = ALLOCATOR_RUNNING; + ca->allocator_state = ALLOCATOR_running; } rcu_read_unlock(); } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index be164d6108bb..4a1cd8b73d16 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,6 +10,18 @@ struct ec_bucket_buf; +#define ALLOC_THREAD_STATES() \ + x(stopped) \ + x(running) \ + x(blocked) \ + x(blocked_full) + +enum allocator_states { +#define x(n) ALLOCATOR_##n, + ALLOC_THREAD_STATES() +#undef x +}; + enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 12441f943a6d..9502f393a59f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -457,16 +457,7 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - /* - * XXX: this should be an enum for allocator state, so as to include - * error state - */ - enum { - ALLOCATOR_STOPPED, - ALLOCATOR_RUNNING, - ALLOCATOR_BLOCKED, - ALLOCATOR_BLOCKED_FULL, - } allocator_state; + enum allocator_states 
allocator_state; alloc_heap alloc_heap; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e25fa0a2a4b5..3d57a72e63e4 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -108,7 +108,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) spin_lock(&ca->fs->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_state != ALLOCATOR_RUNNING; + ca->allocator_state != ALLOCATOR_running; spin_unlock(&ca->fs->freelist_lock); return ret; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ff93e5ba9f41..c4d79096c53a 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -805,7 +805,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n", + "btree reserve cache\t%u\n" + "thread state:\t\t%s\n", stats.buckets_ec, __dev_buckets_available(ca, stats), fifo_used(&ca->free_inc), ca->free_inc.size, @@ -818,7 +819,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + c->btree_reserve_cache_nr, + bch2_allocator_states[ca->allocator_state]); } static const char * const bch2_rw[] = { -- cgit From 2ee47eec44f0613d6c51d88f8c820a5e8ed624a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Apr 2021 18:01:49 -0400 Subject: bcachefs: Fix for copygc getting stuck waiting for reserve to be filled This fixes a regression from the patch bcachefs: Fix copygc dying on startup In general only the allocator thread itself should be updating ca->allocator_state, the thread waking up the allocator setting it is an ugly hack only needed to avoid racing with the copygc threads when we're first starting up. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 +--- fs/bcachefs/super.c | 9 ++++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 73e1c27c96e3..ad15a80602c0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -100,10 +100,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_lock(); p = rcu_dereference(ca->alloc_thread); - if (p) { + if (p) wake_up_process(p); - ca->allocator_state = ALLOCATOR_running; - } rcu_read_unlock(); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 650a559737fd..0ff80816a54f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -908,9 +908,16 @@ int bch2_fs_start(struct bch_fs *c) /* * Allocator threads don't start filling copygc reserve until after we * set BCH_FS_STARTED - wake them now: + * + * XXX ugly hack: + * Need to set ca->allocator_state here instead of relying on the + * allocator threads to do it to avoid racing with the copygc threads + * checking it and thinking they have no alloc reserve: */ - for_each_online_member(ca, c, i) + for_each_online_member(ca, c, i) { + ca->allocator_state = ALLOCATOR_running; bch2_wake_allocator(ca); + } if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); -- cgit From 9ae28f824e1bcc922c8f20d6b502ed0388026e3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Jun 2021 16:30:52 -0400 Subject: bcachefs: Start journal reclaim thread earlier Especially in userspace, we sometimes run into resource exhaustion issues with starting up threads after mark and sweep/fsck. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/super.c | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c2b1eef6265a..d5bbbf619359 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1044,7 +1044,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, bch2_journal_space_available(j); spin_unlock(&j->lock); - return 0; + return bch2_journal_reclaim_start(j); } /* init/exit: */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0ff80816a54f..8de04bfae800 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -276,7 +276,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - BUG_ON(c->journal.reclaim_thread); + bch2_journal_reclaim_stop(&c->journal); return; } @@ -431,12 +431,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) for_each_rw_member(ca, c, i) bch2_wake_allocator(ca); - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) { - bch_err(c, "error starting journal reclaim: %i", ret); - return ret; - } - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) -- cgit From e131b6aa0a316f9724e6e8ad2f9be091c07115a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 00:09:06 -0400 Subject: bcachefs: Add a mempool for btree_trans bump allocator This allocation is required for filesystem operations to make forward progress, thus needs a mempool.
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_iter.c | 30 ++++++++++++++++++++++++++---- fs/bcachefs/btree_types.h | 2 ++ 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9502f393a59f..8c279261c98c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -644,6 +644,7 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; + mempool_t btree_trans_mem_pool; struct btree_iter_buf __percpu *btree_iters_bufs; struct srcu_struct btree_trans_barrier; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 660e9e827ed4..96814a244784 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2181,7 +2181,16 @@ static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) if (size > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(size); - void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + void *new_mem; + + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + + new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } if (!new_mem) return -ENOMEM; @@ -2293,8 +2302,13 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, if (expected_mem_bytes) { expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes); trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - if (trans->mem) + + if (!unlikely(trans->mem)) { + trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + trans->mem_bytes = BTREE_TRANS_MEM_MAX; + } else { trans->mem_bytes = expected_mem_bytes; + } } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -2324,7 +2338,11 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); kfree(trans->fs_usage_deltas); - kfree(trans->mem); + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); + else + kfree(trans->mem); #ifdef __KERNEL__ /* @@ -2332,6 +2350,7 @@ int bch2_trans_exit(struct btree_trans *trans) */ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); #endif + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); @@ -2407,6 +2426,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { + mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_iters_pool); cleanup_srcu_struct(&c->btree_trans_barrier); } @@ -2422,5 +2442,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(struct btree_insert_entry) * nr); + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 55d8d815a04a..38414d19e71e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -346,6 +346,8 @@ struct btree_insert_entry { #define BTREE_ITER_MAX 32 #endif +#define BTREE_TRANS_MEM_MAX (1U << 14) + struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG -- cgit From 
9620c3ec2fa83e18f52e99b3dd2e8451446ca17e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 00:24:25 -0400 Subject: bcachefs: Add a mempool for the replicas delta list Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/btree_iter.c | 9 ++++++++- fs/bcachefs/buckets.c | 20 ++++++++++++++++++-- fs/bcachefs/replicas.c | 18 +++++++++++++++++- fs/bcachefs/replicas.h | 1 + fs/bcachefs/super.c | 7 +------ 6 files changed, 48 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8c279261c98c..51ba38f19ca9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -546,6 +546,8 @@ struct btree_iter_buf { struct btree_iter *iter; }; +#define REPLICAS_DELTA_LIST_MAX (1U << 16) + struct bch_fs { struct closure cl; @@ -573,6 +575,7 @@ struct bch_fs { struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + mempool_t replicas_delta_pool; struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 96814a244784..eccc9bb0e4e9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2337,7 +2337,14 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); - kfree(trans->fs_usage_deltas); + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) + mempool_free(trans->fs_usage_deltas, + &trans->c->replicas_delta_pool); + else + kfree(trans->fs_usage_deltas); + } if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 66e50e6b36ea..3dcc77d5242f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -560,10 +560,26 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) { struct replicas_delta_list *d = trans->fs_usage_deltas; unsigned new_size = d ? 
(d->size + more) * 2 : 128; + unsigned alloc_size = sizeof(*d) + new_size; + + WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d || d->used + more > d->size) { - d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); - BUG_ON(!d); + d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); + + BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); + + if (!d) { + d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); + memset(d, 0, REPLICAS_DELTA_LIST_MAX); + + if (trans->fs_usage_deltas) + memcpy(d, trans->fs_usage_deltas, + trans->fs_usage_deltas->size + sizeof(*d)); + + new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); + kfree(trans->fs_usage_deltas); + } d->size = new_size; trans->fs_usage_deltas = d; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 15ff0d3c936a..ddaf833d0bf2 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1033,11 +1033,27 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) return ret; } +void bch2_fs_replicas_exit(struct bch_fs *c) +{ + unsigned i; + + kfree(c->usage_scratch); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); + kfree(c->usage_base); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); + + mempool_exit(&c->replicas_delta_pool); +} + int bch2_fs_replicas_init(struct bch_fs *c) { bch2_journal_entry_res_resize(&c->journal, &c->replicas_journal_res, reserve_journal_replicas(c, &c->replicas)); - return replicas_table_update(c, &c->replicas); + return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, + REPLICAS_DELTA_LIST_MAX) ?: + replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 9c8fd3d98247..8cb1f592f1b6 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -77,6 +77,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; +void bch2_fs_replicas_exit(struct bch_fs *); int bch2_fs_replicas_init(struct bch_fs *); #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8de04bfae800..be6e66e0db71 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -476,6 +476,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); + bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); @@ -484,10 +485,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); - kfree(c->usage_scratch); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - free_percpu(c->usage[i]); - kfree(c->usage_base); if (c->btree_iters_bufs) for_each_possible_cpu(cpu) @@ -500,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c) bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); - kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); -- cgit From c043a3303c11cdf53ee98db67cee11931b626e22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 22:11:54 -0500 Subject: bcachefs: Fix bch2_btree_cache_scan() It was counting nodes on the freed list that it skips - because we want to leave a few so 
that btree splits don't touch the allocator - as nodes that it touched, meaning that if it was called with <= 3 nodes to reclaim, and those nodes were on the freed list, it would never do any work. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2152813554b4..97b4f87a377f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -281,13 +281,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + touched++; if (freed >= nr) break; - if (++i > 3 && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); -- cgit From f2785955bbdf977bbfba4d20a7dc9db4bdffc4f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 00:00:23 -0500 Subject: bcachefs: Kill support for !BTREE_NODE_NEW_EXTENT_OVERWRITE() bcachefs has been aggressively migrating filesystems and btree nodes to the new format for quite some time - this shouldn't affect anyone anymore, and lets us delete a _lot_ of code. Also, it frees up KEY_TYPE_discard for a new whiteout key type for snapshots. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 278 ++---------------------------------- fs/bcachefs/bkey_sort.h | 8 -- fs/bcachefs/btree_io.c | 195 ++----------------------- fs/bcachefs/btree_types.h | 2 - fs/bcachefs/btree_update_interior.c | 9 +- fs/bcachefs/recovery.c | 6 + 6 files changed, 30 insertions(+), 468 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 2e1d9cd65f43..a88670753cb0 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -14,9 +14,8 @@ static inline bool sort_iter_end(struct sort_iter *iter) return !iter->used; } -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) +static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, + sort_cmp_fn cmp) { unsigned i; @@ -27,18 +26,12 @@ static inline void __sort_iter_sift(struct sort_iter *iter, swap(iter->data[i], iter->data[i + 1]); } -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) { unsigned i = iter->used; while (i--) - __sort_iter_sift(iter, i, cmp); + sort_iter_sift(iter, i, cmp); } static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) @@ -46,26 +39,20 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) return !sort_iter_end(iter) ? 
iter->data->k : NULL; } -static inline void __sort_iter_advance(struct sort_iter *iter, - unsigned idx, sort_cmp_fn cmp) +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) { - struct sort_iter_set *i = iter->data + idx; + struct sort_iter_set *i = iter->data; - BUG_ON(idx >= iter->used); + BUG_ON(!iter->used); i->k = bkey_next_skip_noops(i->k, i->end); BUG_ON(i->k > i->end); if (i->k == i->end) - array_remove_item(iter->data, iter->used, idx); + array_remove_item(iter->data, iter->used, 0); else - __sort_iter_sift(iter, idx, cmp); -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - __sort_iter_advance(iter, 0, cmp); + sort_iter_sift(iter, 0, cmp); } static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, @@ -264,252 +251,3 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, return (u64 *) out - (u64 *) dst; } - -/* Compat code for btree_node_old_extent_overwrite: */ - -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. - */ -static inline int extent_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), - bkey_start_pos(&ur)) ?: - cmp_int((unsigned long) r, (unsigned long) l); -} - -/* - * The algorithm in extent_sort_fix_overlapping() relies on keys in the same - * bset being ordered by start offset - but 0 size whiteouts (which are always - * KEY_TYPE_deleted) break this ordering, so we need to skip over them: - */ -static void extent_iter_advance(struct sort_iter *iter, unsigned idx) -{ - struct sort_iter_set *i = iter->data + idx; - - do { - i->k = bkey_next_skip_noops(i->k, i->end); - } while (i->k != i->end && bkey_deleted(i->k)); - - if (i->k == i->end) - array_remove_item(iter->data, iter->used, idx); - else - __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); -} - -struct btree_nr_keys -bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct btree *b = iter->b; - struct bkey_format *f = &b->format; - struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; - struct bkey_packed *out = dst->start; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - struct bkey_buf split; - unsigned i; - - memset(&nr, 0, sizeof(nr)); - bch2_bkey_buf_init(&split); - - sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); - for (i = 0; i < iter->used;) { - if (bkey_deleted(iter->data[i].k)) - __sort_iter_advance(iter, i, - extent_sort_fix_overlapping_cmp); - else - i++; - } - - while (!sort_iter_end(iter)) { - l = __bkey_disassemble(b, _l->k, &l_unpacked); - - if (iter->used == 1) { - extent_sort_append(c, f, &nr, &out, l); - extent_iter_advance(iter, 0); - continue; - } - - r = __bkey_disassemble(b, _r->k, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, &out, l); - extent_iter_advance(iter, 0); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_iter_advance(iter, 1); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. 
comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. - */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - extent_iter_advance(iter, 1); - } else { - bch2_cut_front_s(l.k->p, r); - extent_save(b, _r->k, r.k); - __sort_iter_sift(iter, 1, - extent_sort_fix_overlapping_cmp); - } - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bch2_bkey_buf_reassemble(&split, c, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), split.k); - - bch2_cut_front_s(r.k->p, l); - extent_save(b, _l->k, l.k); - - __sort_iter_sift(iter, 0, - extent_sort_fix_overlapping_cmp); - - extent_sort_append(c, f, &nr, &out, - bkey_i_to_s(split.k)); - } else { - bch2_cut_back_s(bkey_start_pos(r.k), l); - extent_save(b, _l->k, l.k); - } - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - - bch2_bkey_buf_exit(&split, c); - return nr; -} - -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); -} - -unsigned bch2_sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - continue; - - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); -} - -unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - if (bkey_deleted(in)) - continue; - - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_discard); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - new_size = l_packed - ? 
min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; - - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); - - BUG_ON(new_size < l.k.size); - - bch2_key_resize(&l.k, new_size); - - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - bch2_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - l = r; - prev = true; - l_packed = bkey_packed(in); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 458a051fdac5..1059996dac78 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -32,9 +32,6 @@ static inline void sort_iter_add(struct sort_iter *iter, struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, struct sort_iter *); -struct btree_nr_keys -bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct sort_iter *); struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, @@ -48,10 +45,5 @@ bch2_sort_repack_merge(struct bch_fs *, unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); -unsigned bch2_sort_extents(struct bkey_packed *, - struct sort_iter *, bool); - -unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, - struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c7c91c1d5b23..b3743a16973c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -24,8 +24,7 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *start, - struct bkey_packed *end, - bool extents) + struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG struct bkey_packed *k, *p; @@ -39,10 +38,7 @@ static void verify_no_dups(struct btree *b, struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(extents - ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 - : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); + BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } @@ -150,8 +146,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) } verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), - btree_node_old_extent_overwrite(b)); + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); @@ -176,144 +171,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, } } -static bool bch2_compact_extent_whiteouts(struct bch_fs *c, - struct btree *b, - enum compact_mode mode) -{ - const struct bkey_format *f = &b->format; - struct bset_tree *t; - struct bkey_packed *whiteouts = NULL; - struct bkey_packed *u_start, *u_pos; - struct sort_iter sort_iter; - unsigned bytes, whiteout_u64s = 0, u64s; - bool used_mempool, compacting = false; - - BUG_ON(!btree_node_is_extents(b)); - - for_each_bset(b, t) - if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) - whiteout_u64s += bset_dead_u64s(b, t); - - if (!whiteout_u64s) - return false; - - bch2_sort_whiteouts(c, b); - - sort_iter_init(&sort_iter, b); - - whiteout_u64s += b->whiteout_u64s; - bytes = whiteout_u64s * sizeof(u64); - - whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - u_start = u_pos = whiteouts; - - memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); - - sort_iter_add(&sort_iter, u_start, u_pos); - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && !bset_written(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t - 1)); - } - - if (src != dst) - compacting = true; - - if (!should_compact_bset(b, t, compacting, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - compacting = true; - u_start = u_pos; - start = i->start; - end = vstruct_last(i); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); - - if (bkey_deleted(k)) - continue; - - BUG_ON(bkey_whiteout(k) && - k->needs_whiteout && - bkey_written(b, k)); - - if (bkey_whiteout(k) && !k->needs_whiteout) - continue; - - if (bkey_whiteout(k)) { - memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); - set_bkeyp_val_u64s(f, u_pos, 0); - u_pos = bkey_next(u_pos); - } else { - bkey_copy(out, k); - out = bkey_next(out); - } - } - - sort_iter_add(&sort_iter, u_start, u_pos); - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - } - - b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; - - BUG_ON((void *) unwritten_whiteouts_start(c, b) < - (void *) btree_bkey_last(b, bset_tree_last(b))); - - u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); - - BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u_pos != whiteouts && !u64s); - - if (u64s != b->whiteout_u64s) { - void *src = unwritten_whiteouts_start(c, b); - - b->whiteout_u64s = u64s; - memmove_u64s_up(unwritten_whiteouts_start(c, b), 
src, u64s); - } - - verify_no_dups(b, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b), - true); - - btree_bounce_free(c, bytes, used_mempool, whiteouts); - - bch2_btree_build_aux_trees(b); - - bch_btree_keys_u64s_remaining(c, b); - bch2_verify_btree_nr_keys(b); - - return true; -} - static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { struct bset_tree *t; @@ -382,9 +239,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, enum compact_mode mode) { - return !btree_node_old_extent_overwrite(b) - ? bch2_drop_whiteouts(b, mode) - : bch2_compact_extent_whiteouts(c, b, mode); + return bch2_drop_whiteouts(b, mode); } static void btree_node_sort(struct bch_fs *c, struct btree *b, @@ -422,14 +277,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - if (btree_node_old_extent_overwrite(b)) - filter_whiteouts = bset_written(b, start_bset); - - u64s = (btree_node_old_extent_overwrite(b) - ? bch2_sort_extents - : bch2_sort_keys)(out->keys.start, - &sort_iter, - filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -971,11 +819,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bset_encrypt(c, i, b->written << 9); - if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { - set_btree_node_old_extent_overwrite(b); - set_btree_node_need_rewrite(b); - } + btree_err_on(btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); } else { @@ -1052,9 +899,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset(b, b->set, &b->data->keys); - b->nr = (btree_node_old_extent_overwrite(b) - ? bch2_extent_sort_fix_overlapping - : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); + b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1598,24 +1443,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - if (!btree_node_old_extent_overwrite(b)) { - sort_iter_add(&sort_iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - } else { - memcpy_u64s(i->start, - unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - i->u64s = cpu_to_le16(b->whiteout_u64s); - SET_BSET_SEPARATE_WHITEOUTS(i, true); - } + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = btree_node_old_extent_overwrite(b) - ? 
bch2_sort_extents(vstruct_last(i), &sort_iter, false) - : bch2_sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); set_needs_whiteout(i, false); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 38414d19e71e..35511d47ae97 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -418,7 +418,6 @@ enum btree_flags { BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, - BTREE_NODE_old_extent_overwrite, BTREE_NODE_need_rewrite, BTREE_NODE_never_write, }; @@ -433,7 +432,6 @@ BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); -BTREE_FLAG(old_extent_overwrite); BTREE_FLAG(need_rewrite); BTREE_FLAG(never_write); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 989ba81207c9..63cda00bb4ad 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -303,14 +303,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bp->v.sectors_written = 0; } - if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) - SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); - - if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { - set_btree_node_old_extent_overwrite(b); - set_btree_node_need_rewrite(b); - } + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 740fdeafe1a2..4d7badcc568b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -983,6 +983,12 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); + ret = -EINVAL; + goto err; + } + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; -- cgit From c052cf82f3d66ad5b680003cfacf67cbe0e9b1bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Feb 2021 23:41:40 -0500 Subject: bcachefs: KEY_TYPE_discard is no longer used KEY_TYPE_discard used to be used for extent whiteouts, but when handling over overlapping extents was lifted above the core btree code it became unused. This patch updates various code to reflect that. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 12 +++++----- fs/bcachefs/bset.c | 11 +++++----- fs/bcachefs/bset.h | 44 ++++++++++++------------------------- fs/bcachefs/btree_io.c | 6 ++--- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 6 ++--- fs/bcachefs/extents.c | 4 ++-- 8 files changed, 35 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index a88670753cb0..f2507079ed11 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -103,7 +103,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, sort_iter_sort(iter, key_sort_fix_overlapping_cmp); while ((k = sort_iter_peek(iter))) { - if (!bkey_whiteout(k) && + if (!bkey_deleted(k) && !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); @@ -123,7 +123,7 @@ static void extent_sort_append(struct bch_fs *c, struct bkey_packed **out, struct bkey_s k) { - if (!bkey_whiteout(k.k)) { + if (!bkey_deleted(k.k)) { if (!bch2_bkey_pack_key(*out, k.k, f)) memcpy_u64s_small(*out, k.k, BKEY_U64s); @@ -148,7 +148,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, memset(&nr, 0, sizeof(nr)); while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) + if (filter_whiteouts && bkey_deleted(in)) continue; if (bch2_bkey_transform(out_f, out, bkey_packed(in) @@ -181,7 +181,7 @@ bch2_sort_repack_merge(struct bch_fs *c, bch2_bkey_buf_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k_packed)) + if (filter_whiteouts && bkey_deleted(k_packed)) continue; /* @@ -227,7 +227,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, while ((in = sort_iter_next(iter, sort_keys_cmp))) { bool needs_whiteout = false; - if (bkey_whiteout(in) && + if (bkey_deleted(in) && (filter_whiteouts || !in->needs_whiteout)) continue; @@ -239,7 +239,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, in = sort_iter_next(iter, sort_keys_cmp); } - if (bkey_whiteout(in)) { + if (bkey_deleted(in)) { memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 89d511db2c50..ac2fd8242ca4 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -132,7 +132,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) for_each_bset(b, t) bset_tree_for_each_key(b, t, k) - if (!bkey_whiteout(k)) + if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); @@ -1108,7 +1108,7 @@ void bch2_bset_insert(struct btree *b, if (bch2_bkey_pack_key(&packed, &insert->k, f)) src = &packed; - if (!bkey_whiteout(&insert->k)) + if (!bkey_deleted(&insert->k)) btree_keys_account_key_add(&b->nr, t - b->set, src); if (src->u64s != clobber_u64s) { @@ -1645,15 +1645,14 @@ found: return prev; } -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, + struct btree *b) { struct bkey_packed *prev; do { prev = bch2_btree_node_iter_prev_all(iter, b); - } while (prev && prev->type < min_key_type); + } while (prev && bkey_deleted(prev)); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 12d5dc7bdb42..f19cd032cf70 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ 
-415,7 +415,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) static inline struct bkey_packed * bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { - return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); + return bch2_bkey_prev_filter(b, t, k, 1); } enum bch_extent_overlap { @@ -521,33 +521,23 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, } static inline struct bkey_packed * -bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) { - while (!bch2_btree_node_iter_end(iter)) { - struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); - - if (k->type >= min_key_type) - return k; - - bch2_btree_node_iter_advance(iter, b); - } - - return NULL; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return bch2_btree_node_iter_peek_filter(iter, b, 0); + return !bch2_btree_node_iter_end(iter) + ? __btree_node_offset_to_key(b, iter->data->k) + : NULL; } static inline struct bkey_packed * bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); + struct bkey_packed *k; + + while ((k = bch2_btree_node_iter_peek_all(iter, b)) && + bkey_deleted(k)) + bch2_btree_node_iter_advance(iter, b); + + return k; } static inline struct bkey_packed * @@ -563,14 +553,8 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, - struct btree *, unsigned); - -static inline struct bkey_packed * -bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) -{ - return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); -} +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, + struct btree *); struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b3743a16973c..f081233a1ef1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -215,7 +215,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) for (k = start; k != end; k = n) { n = bkey_next_skip_noops(k, end); - if (!bkey_whiteout(k)) { + if (!bkey_deleted(k)) { bkey_copy(out, k); out = bkey_next(out); } else { @@ -725,11 +725,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, /* * with the separate whiteouts thing (used for extents), the * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_whiteout()... + * can't solely go off bkey_deleted()... */ if (!seen_non_whiteout && - (!bkey_whiteout(k) || + (!bkey_deleted(k) || (prev && bkey_iter_cmp(b, prev, k) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eccc9bb0e4e9..1bd7c92d705e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -534,7 +534,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, * whiteouts) */ p = level || btree_node_type_is_extents(iter->btree_id) - ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) + ? 
bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 63cda00bb4ad..961191881b48 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -90,7 +90,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) for_each_bset(b, t) bset_tree_for_each_key(b, t, k) - if (!bkey_whiteout(k)) { + if (!bkey_deleted(k)) { uk = bkey_unpack_key(b, k); bch2_bkey_format_add_key(s, &uk); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 70cf18bcbcdd..6100f164278b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -73,13 +73,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k = NULL; /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_whiteout(k)); + EBUG_ON(k && bkey_deleted(k)); /* Deleting, but not found? nothing to do: */ - if (bkey_whiteout(&insert->k) && !k) + if (bkey_deleted(&insert->k) && !k) return false; - if (bkey_whiteout(&insert->k)) { + if (bkey_deleted(&insert->k)) { /* Deleting: */ btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4bf4c27da6cd..08236ceac4df 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -971,9 +971,9 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) /* will only happen if all pointers were cached: */ if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_discard; + k.k->type = KEY_TYPE_deleted; - return bkey_whiteout(k.k); + return bkey_deleted(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From 79f88eba01b1aafdec1fbf453954f5cd00dd56c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 19:09:53 -0500 Subject: bcachefs: Rename KEY_TYPE_whiteout -> KEY_TYPE_hash_whiteout Snapshots are going to need a different whiteout key type. Also, switch to using BCH_BKEY_TYPES() to define the bkey value accessors. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 18 +++++++++++++++++- fs/bcachefs/bkey.h | 21 +++------------------ fs/bcachefs/bkey_methods.c | 6 +++--- fs/bcachefs/dirent.c | 4 ++-- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/str_hash.h | 8 ++++---- 6 files changed, 30 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index dff49ab7e93d..1df6b7c6e4d7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -330,7 +330,7 @@ static inline void bkey_init(struct bkey *k) x(discard, 1) \ x(error, 2) \ x(cookie, 3) \ - x(whiteout, 4) \ + x(hash_whiteout, 4) \ x(btree_ptr, 5) \ x(extent, 6) \ x(reservation, 7) \ @@ -355,11 +355,27 @@ enum bch_bkey_type { KEY_TYPE_MAX, }; +struct bch_deleted { + struct bch_val v; +}; + +struct bch_discard { + struct bch_val v; +}; + +struct bch_error { + struct bch_val v; +}; + struct bch_cookie { struct bch_val v; __le64 cookie; }; +struct bch_hash_whiteout { + struct bch_val v; +}; + /* Extents */ /* diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 9fd752b5c2f5..25cb5e985109 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -411,7 +411,7 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion * functions. 
*/ -#define BKEY_VAL_ACCESSORS(name) \ +#define x(name, ...) \ struct bkey_i_##name { \ union { \ struct bkey k; \ @@ -522,23 +522,8 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ return k; \ } -BKEY_VAL_ACCESSORS(cookie); -BKEY_VAL_ACCESSORS(btree_ptr); -BKEY_VAL_ACCESSORS(extent); -BKEY_VAL_ACCESSORS(reservation); -BKEY_VAL_ACCESSORS(inode); -BKEY_VAL_ACCESSORS(inode_generation); -BKEY_VAL_ACCESSORS(dirent); -BKEY_VAL_ACCESSORS(xattr); -BKEY_VAL_ACCESSORS(alloc); -BKEY_VAL_ACCESSORS(quota); -BKEY_VAL_ACCESSORS(stripe); -BKEY_VAL_ACCESSORS(reflink_p); -BKEY_VAL_ACCESSORS(reflink_v); -BKEY_VAL_ACCESSORS(inline_data); -BKEY_VAL_ACCESSORS(btree_ptr_v2); -BKEY_VAL_ACCESSORS(indirect_inline_data); -BKEY_VAL_ACCESSORS(alloc_v2); +BCH_BKEY_TYPES(); +#undef x /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f5779795a4b2..756bf5aeee9b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -59,7 +59,7 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = key_type_cookie_invalid, \ } -#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ +#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ } @@ -270,9 +270,9 @@ static const struct old_bkey_type { {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_hash_whiteout }, {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_XATTRS, 129, KEY_TYPE_hash_whiteout }, {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, }; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f34bfda8ab0d..d2ebf1e5819d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -262,7 +262,7 @@ int bch2_dirent_rename(struct btree_trans *trans, * overwrite old_dst - just make sure to use a * whiteout when deleting src: */ - new_src->k.type = KEY_TYPE_whiteout; + new_src->k.type = KEY_TYPE_hash_whiteout; } } else { /* Check if we need a whiteout to delete src: */ @@ -272,7 +272,7 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; if (ret) - new_src->k.type = KEY_TYPE_whiteout; + new_src->k.type = KEY_TYPE_hash_whiteout; } } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 66c9dad2ef3e..7f78edcfe565 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -257,7 +257,7 @@ static void hash_set_chain_start(struct btree_trans *trans, struct hash_check *h, struct btree_iter *k_iter, struct bkey_s_c k) { - bool hole = (k.k->type != KEY_TYPE_whiteout && + bool hole = (k.k->type != KEY_TYPE_hash_whiteout && k.k->type != desc.key_type); if (hole || k.k->p.offset > h->chain_end + 1) diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index f6b694b9346b..952b146af750 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, key)) return iter; - } else if (k.k->type == KEY_TYPE_whiteout) { + } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { /* hole, not found */ @@ -210,7 +210,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_whiteout) + k.k->type != 
KEY_TYPE_hash_whiteout) break; if (k.k->type == desc.key_type && @@ -254,7 +254,7 @@ int bch2_hash_set(struct btree_trans *trans, !(flags & BCH_HASH_SET_MUST_REPLACE)) slot = bch2_trans_copy_iter(trans, iter); - if (k.k->type != KEY_TYPE_whiteout) + if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; } @@ -303,7 +303,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = iter->pos; - delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; + delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; bch2_trans_update(trans, iter, delete, 0); return 0; -- cgit From 41f8b09edc25d8ea1f4cee44a9931deb3cf8b9d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 19:27:37 -0500 Subject: bcachefs: Rename BTREE_ID enums for consistency with other enums Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++++---- fs/bcachefs/bcachefs.h | 18 ++++++------- fs/bcachefs/bcachefs_format.h | 20 +++++++-------- fs/bcachefs/bkey_methods.c | 30 +++++++++++----------- fs/bcachefs/btree_cache.c | 7 ------ fs/bcachefs/btree_cache.h | 2 -- fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_io.h | 4 +-- fs/bcachefs/btree_types.h | 30 +++++++++++----------- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/buckets.c | 10 ++++---- fs/bcachefs/dirent.c | 6 ++--- fs/bcachefs/ec.c | 16 ++++++------ fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/extents.c | 2 +- fs/bcachefs/fs-io.c | 14 +++++------ fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 26 +++++++++---------- fs/bcachefs/inode.c | 18 ++++++------- fs/bcachefs/io.c | 18 ++++++------- fs/bcachefs/migrate.c | 4 +-- fs/bcachefs/move.c | 6 ++--- fs/bcachefs/opts.c | 7 ++++++ fs/bcachefs/opts.h | 1 + fs/bcachefs/quota.c | 12 ++++----- fs/bcachefs/recovery.c | 20 +++++++-------- fs/bcachefs/reflink.c | 6 ++--- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 4 +-- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/tests.c | 50 ++++++++++++++++++------------------- fs/bcachefs/xattr.c | 4 +-- 34 files changed, 180 insertions(+), 181 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54e58b377e51..34590e4b8f5d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -316,7 +316,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) int ret; down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, NULL, bch2_alloc_read_fn); up_read(&c->gc_lock); @@ -344,7 +344,7 @@ retry: bch2_trans_begin(trans); ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_ALLOC, iter->pos); + BTREE_ID_alloc, iter->pos); if (ret) goto err; @@ -386,7 +386,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_member_device(ca, c, i) { @@ -423,7 +423,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, u64 *time, now; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); @@ -927,7 +927,7 @@ static int 
bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS(ca->dev_idx, 0), BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 51ba38f19ca9..9f4e7a3ada36 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -369,14 +369,14 @@ enum gc_phase { GC_PHASE_START, GC_PHASE_SB, - GC_PHASE_BTREE_EC, - GC_PHASE_BTREE_EXTENTS, - GC_PHASE_BTREE_INODES, - GC_PHASE_BTREE_DIRENTS, - GC_PHASE_BTREE_XATTRS, - GC_PHASE_BTREE_ALLOC, - GC_PHASE_BTREE_QUOTAS, - GC_PHASE_BTREE_REFLINK, + GC_PHASE_BTREE_stripes, + GC_PHASE_BTREE_extents, + GC_PHASE_BTREE_inodes, + GC_PHASE_BTREE_dirents, + GC_PHASE_BTREE_xattrs, + GC_PHASE_BTREE_alloc, + GC_PHASE_BTREE_quotas, + GC_PHASE_BTREE_reflink, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -722,7 +722,7 @@ struct bch_fs { * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) + * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1df6b7c6e4d7..e9e501a8c3ec 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1658,18 +1658,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ -#define BCH_BTREE_IDS() \ - x(EXTENTS, 0, "extents") \ - x(INODES, 1, "inodes") \ - x(DIRENTS, 2, "dirents") \ - x(XATTRS, 3, "xattrs") \ - x(ALLOC, 4, "alloc") \ - x(QUOTAS, 5, "quotas") \ - x(EC, 6, "stripes") \ - x(REFLINK, 7, "reflink") +#define BCH_BTREE_IDS() \ + x(extents, 0) \ + x(inodes, 1) \ + x(dirents, 2) \ + x(xattrs, 3) \ + x(alloc, 4) \ + x(quotas, 5) \ + x(stripes, 6) \ + x(reflink, 7) enum btree_id { -#define x(kwd, val, name) BTREE_ID_##kwd = val, +#define x(kwd, val) BTREE_ID_##kwd = val, BCH_BTREE_IDS() #undef x BTREE_ID_NR diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 756bf5aeee9b..79e249f49971 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -104,7 +104,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (type == BKEY_TYPE_BTREE && + if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; @@ -122,7 +122,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->p.snapshot) return "nonzero snapshot"; - if (type != BKEY_TYPE_BTREE && + if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) return "POS_MAX key"; @@ -263,18 +263,18 @@ static const struct old_bkey_type { u8 old; u8 new; } bkey_renumber_table[] = { - {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, - {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, - {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, - {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, - {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, - {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, - {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_XATTRS, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, - {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, + {BKEY_TYPE_btree, 128, 
KEY_TYPE_btree_ptr }, + {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, + {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, }; void bch2_bkey_renumber(enum btree_node_type btree_node_type, @@ -320,7 +320,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, break; case 2: if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) { + btree_id == BTREE_ID_inodes) { if (!bkey_packed(k)) { struct bkey_i *u = packed_to_bkey(k); swap(u->k.p.inode, u->k.p.offset); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 97b4f87a377f..1a6b4618c2ae 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -13,13 +13,6 @@ #include #include -const char * const bch2_btree_ids[] = { -#define x(kwd, val, name) name, - BCH_BTREE_IDS() -#undef x - NULL -}; - void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 5fffae92effb..217988696a77 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -7,8 +7,6 @@ struct btree_iter; -extern const char * const bch2_btree_ids[]; - void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index f516faded269..d5559827ed7f 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -57,7 +57,7 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) { switch (id) { -#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; +#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; BCH_BTREE_IDS() #undef x default: diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f081233a1ef1..b7d931335dd6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1321,7 +1321,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, unsigned whiteout_u64s = 0; int ret; - if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) return -1; ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 89685bd57fc0..16ce6dff6af7 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -193,7 +193,7 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) { + btree_id == BTREE_ID_inodes) { swap(f->bits_per_field[BKEY_FIELD_INODE], f->bits_per_field[BKEY_FIELD_OFFSET]); swap(f->field_offset[BKEY_FIELD_INODE], @@ -209,7 +209,7 @@ static inline void compat_bpos(unsigned level, enum btree_id btree_id, bch2_bpos_swab(p); if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) + btree_id == BTREE_ID_inodes) swap(p->inode, p->offset); } diff --git a/fs/bcachefs/btree_types.h 
b/fs/bcachefs/btree_types.h index 35511d47ae97..fcaa13b9129c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -545,16 +545,16 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, name) BKEY_TYPE_##kwd = val, +#define x(kwd, val) BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x - BKEY_TYPE_BTREE, + BKEY_TYPE_btree, }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; + return level ? BKEY_TYPE_btree : (enum btree_node_type) id; } /* Type of keys @b contains: */ @@ -566,8 +566,8 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { switch (type) { - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_REFLINK: + case BKEY_TYPE_extents: + case BKEY_TYPE_reflink: return true; default: return false; @@ -590,18 +590,18 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) } #define BTREE_NODE_TYPE_HAS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_ALLOC)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_REFLINK)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_BTREE)) + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_REFLINK)) + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)) enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 961191881b48..e1dd21320153 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1196,7 +1196,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; - BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); bch2_btree_node_iter_init(&node_iter, b, &k->k.p); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6100f164278b..c46016961284 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -341,7 +341,7 @@ static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) { return (((BTREE_NODE_TYPE_HAS_TRIGGERS & ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | - (1U << BTREE_ID_EC)) & + (1U << BTREE_ID_stripes)) & (1U << iter->btree_id); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3dcc77d5242f..55b9818a1dc2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1509,7 +1509,7 @@ static int trans_get_key(struct btree_trans *trans, struct btree_iter **iter, struct bkey_s_c *k) { - unsigned flags = btree_id != BTREE_ID_ALLOC + unsigned flags = btree_id != BTREE_ID_alloc ? 
BTREE_ITER_SLOTS : BTREE_ITER_CACHED; int ret; @@ -1545,11 +1545,11 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it if (IS_ERR(a)) return a; - iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); + iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); if (iter) { *u = bch2_alloc_unpack(k); } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); @@ -1606,7 +1606,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k); + ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); if (ret < 0) return ret; @@ -1830,7 +1830,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, __le64 *refcount; s64 ret; - ret = trans_get_key(trans, BTREE_ID_REFLINK, + ret = trans_get_key(trans, BTREE_ID_reflink, POS(0, idx), &iter, &k); if (ret < 0) return ret; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d2ebf1e5819d..b0625176ab35 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -64,7 +64,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) } const struct bch_hash_desc bch2_dirent_hash_desc = { - .btree_id = BTREE_ID_DIRENTS, + .btree_id = BTREE_ID_dirents, .key_type = KEY_TYPE_dirent, .hash_key = dirent_hash_key, .hash_bkey = dirent_hash_bkey, @@ -332,7 +332,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) struct bkey_s_c k; int ret; - for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS(dir_inum, 0), 0, k, ret) { if (k.k->p.inode > dir_inum) break; @@ -357,7 +357,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS(inum, ctx->pos), 0, k, ret) { if (k.k->p.inode > inum) break; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a70b859363f0..ced8ceeef992 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -433,7 +433,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip int ret; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -668,7 +668,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - return bch2_btree_delete_range(c, BTREE_ID_EC, + return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), POS(0, idx + 1), NULL); @@ -713,7 +713,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, retry: bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, + for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { @@ -765,7 +765,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, unsigned i; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_EC, + iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, new->k.p, BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); @@ -831,7 +831,7 @@ static int 
ec_stripe_update_ptrs(struct bch_fs *c, /* XXX this doesn't support the reflink btree */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, bkey_start_pos(pos), BTREE_ITER_INTENT); @@ -1604,7 +1604,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { @@ -1645,7 +1645,7 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, + int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes, NULL, bch2_stripes_read_fn); if (ret) bch_err(c, "error reading stripes: %i", ret); @@ -1663,7 +1663,7 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); k = bch2_btree_iter_prev(iter); if (!IS_ERR_OR_NULL(k.k)) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 16d2bca8a662..bb4b2b4352e0 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -62,7 +62,7 @@ static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c r_k; for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), POS(0, idx + sectors)) >= 0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 08236ceac4df..515840bc3eaa 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -676,7 +676,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 80ef9d6df287..4ccc9318a924 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -865,7 +865,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -911,7 +911,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS); bchfs_read(&trans, iter, rbio, inum, NULL); @@ -2144,7 +2144,7 @@ static inline int range_has_data(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2520,7 +2520,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, goto err; } - src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + src = 
bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); dst = bch2_trans_copy_iter(&trans, src); @@ -2675,7 +2675,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, truncate_pagecache_range(&inode->v, offset, end - 1); } - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, block_start >> 9), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); @@ -3006,7 +3006,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; @@ -3101,7 +3101,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bcb2f83fe354..2d5e00a42b3e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -916,7 +916,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(ei->v.i_ino, start >> 9), 0); retry: while ((k = bch2_btree_iter_peek(iter)).k && diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7f78edcfe565..ebc234b0b6fe 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -24,7 +24,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) u64 sectors = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(trans, iter, BTREE_ID_extents, POS(inum, 0), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -396,7 +396,7 @@ err_redo: if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" "hash table key at wrong offset: btree %u, offset %llu, " "hashed to %llu chain starts at %llu\n%s", - buf, strlen(buf), BTREE_ID_DIRENTS, + buf, strlen(buf), BTREE_ID_dirents, k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { @@ -415,7 +415,7 @@ err_redo: static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) { - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + return bch2_btree_delete_range(c, BTREE_ID_extents, POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), POS(inode_nr + 1, 0), NULL); } @@ -474,7 +474,7 @@ static int check_extents(struct bch_fs *c) bch_verbose(c, "checking extents"); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT); retry: @@ -537,7 +537,7 @@ retry: bch2_inode_pack(c, &p, &w.inode); - ret = bch2_btree_insert(c, BTREE_ID_INODES, + ret = bch2_btree_insert(c, BTREE_ID_inodes, &p.inode.k_i, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); @@ -595,7 +595,7 @@ static int check_dirents(struct bch_fs *c) hash_check_init(&h); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { @@ -747,7 +747,7 @@ static int 
check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { @@ -810,7 +810,7 @@ create_root: bch2_inode_pack(c, &packed, root_inode); - return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); @@ -958,7 +958,7 @@ next: if (e->offset == U64_MAX) goto up; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS(e->inum, e->offset + 1), 0, k, ret) { if (k.k->p.inode != e->inum) break; @@ -1011,7 +1011,7 @@ up: path.nr--; } - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) @@ -1108,7 +1108,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1349,7 +1349,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, range_start), 0); nlinks_iter = genradix_iter_init(links, 0); @@ -1475,7 +1475,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8377d39ccc4d..a3d2bae0a652 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -300,7 +300,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_CACHED|flags); k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); @@ -498,7 +498,7 @@ int bch2_inode_create(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); again: - for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(iter->pos, POS(0, max)) > 0) break; @@ -513,7 +513,7 @@ again: * cache before using a slot: */ if (k.k->type != KEY_TYPE_inode && - !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) + !bch2_btree_key_cache_find(c, BTREE_ID_inodes, iter->pos)) goto found_slot; } @@ -560,11 +560,11 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, + ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, + bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, start, end, NULL) ?: - 
bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, + bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, start, end, NULL); if (ret) goto err; @@ -574,11 +574,11 @@ retry: bi_generation = 0; if (cached) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), BTREE_ITER_CACHED|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_cached(iter); } else { - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); } @@ -636,7 +636,7 @@ int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inode_nr), flags); k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED ? bch2_btree_iter_peek_cached(iter) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8a4d05eee381..de3bd22edb5a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -408,7 +408,7 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inum, start), BTREE_ITER_INTENT); @@ -435,7 +435,7 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, bkey_start_pos(&k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -1530,8 +1530,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c, promote = __promote_alloc(c, k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_REFLINK - : BTREE_ID_EXTENTS, + ? 
BTREE_ID_reflink + : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1627,7 +1627,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, rbio->pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; @@ -1682,7 +1682,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, retry: bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { unsigned bytes, sectors, offset_into_extent; @@ -1801,7 +1801,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_extents, rbio->pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) @@ -2011,7 +2011,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, reflink_offset), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); @@ -2319,7 +2319,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS); while (1) { diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 6241ff0c129f..1db2c2d6b970 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -99,8 +99,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: - __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink); } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2343f41715ef..dfe7f05f39e9 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -578,7 +578,7 @@ peek: if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_EXTENTS && + if (btree_id == BTREE_ID_extents && cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; @@ -668,8 +668,8 @@ int bch2_move_data(struct bch_fs *c, id++) { stats->btree_id = id; - if (id != BTREE_ID_EXTENTS && - id != BTREE_ID_REFLINK) + if (id != BTREE_ID_extents && + id != BTREE_ID_reflink) continue; ret = __bch2_move_data(c, &ctxt, rate, wp, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index d53b6dccd161..a6c734efe328 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -30,6 +30,13 @@ const char * const bch2_sb_compat[] = { NULL }; +const char * const bch2_btree_ids[] = { +#define x(name, ...) 
#name, + BCH_BTREE_IDS() +#undef x + NULL +}; + const char * const bch2_csum_opts[] = { "none", "crc32c", diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d2b3549a33af..38d78ca46c9c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,6 +11,7 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; +extern const char * const bch2_btree_ids[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index d3032a46e7f3..041da982d051 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -363,7 +363,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), + for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.inode != type) break; @@ -435,7 +435,7 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { case KEY_TYPE_inode: @@ -526,7 +526,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.usrquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), NULL); @@ -538,7 +538,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.grpquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), NULL); @@ -550,7 +550,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.prjquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), NULL); @@ -718,7 +718,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, + iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4d7badcc568b..b68fcd1d19e4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,7 +33,7 @@ static void drop_alloc_keys(struct journal_keys *keys) size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (keys->d[src].btree_id != BTREE_ID_ALLOC) + if (keys->d[src].btree_id != BTREE_ID_alloc) keys->d[dst++] = keys->d[src]; keys->nr = dst; @@ -554,7 +554,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) struct btree_iter *iter; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); @@ -606,7 +606,7 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { cond_resched(); - if (!i->level && i->btree_id == BTREE_ID_ALLOC) { + if (!i->level && i->btree_id == BTREE_ID_alloc) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ret 
= bch2_alloc_replay_key(c, i->k); if (ret) @@ -645,7 +645,7 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { cond_resched(); - if (i->level || i->btree_id == BTREE_ID_ALLOC) + if (i->level || i->btree_id == BTREE_ID_alloc) continue; replay_now_at(j, keys.journal_seq_base + i->journal_seq); @@ -931,28 +931,28 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (i == BTREE_ID_ALLOC && + if (i == BTREE_ID_alloc && c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } if (r->error) { - __fsck_err(c, i == BTREE_ID_ALLOC + __fsck_err(c, i == BTREE_ID_alloc ? FSCK_CAN_IGNORE : 0, "invalid btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_ALLOC) + if (i == BTREE_ID_alloc) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, i == BTREE_ID_ALLOC + __fsck_err(c, i == BTREE_ID_alloc ? FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_ALLOC) + if (i == BTREE_ID_alloc) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -1346,7 +1346,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(c, &packed_inode, &root_inode); err = "error creating root directory"; - ret = bch2_btree_insert(c, BTREE_ID_INODES, + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, NULL, 0); if (ret) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 930547de3309..a2cc078597f2 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -119,7 +119,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { if (reflink_iter->pos.inode) { @@ -219,9 +219,9 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); - dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); while (1) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 776c026ac838..f843a3b34ba2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -945,7 +945,7 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); entry = vstruct_next(entry)) - bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); + bch2_bkey_renumber(BKEY_TYPE_btree, bkey_to_packed(entry->start), write); } int bch2_fs_mark_dirty(struct bch_fs *c) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index be6e66e0db71..de8e770ba300 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1436,7 +1436,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ca->mi.nbuckets; i++) { ret = bch2_btree_key_cache_flush(&trans, - BTREE_ID_ALLOC, POS(ca->dev_idx, i)); + BTREE_ID_alloc, POS(ca->dev_idx, i)); if (ret) break; } @@ -1445,7 +1445,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) if (ret) return ret; 
- return bch2_btree_delete_range(c, BTREE_ID_ALLOC, + return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), NULL); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c4d79096c53a..b9078adaa747 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -259,7 +259,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 6d0f8e233e8b..f25a27f26202 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -13,12 +13,12 @@ static void delete_test_keys(struct bch_fs *c) { int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + ret = bch2_btree_delete_range(c, BTREE_ID_extents, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); @@ -37,7 +37,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); @@ -82,7 +82,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); @@ -130,7 +130,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i; - ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { bch_err(c, "insert error in test_iterate: %i", ret); @@ -142,7 +142,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) { if (k.k->p.inode) break; @@ -184,7 +184,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) k.k.p.offset = i + 8; k.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { bch_err(c, "insert error in test_iterate_extents: %i", ret); @@ -196,7 +196,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; @@ -237,7 +237,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; - ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { bch_err(c, "insert error in test_iterate_slots: %i", ret); @@ -249,7 +249,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) { if (k.k->p.inode) break; @@ -265,7 +265,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 
nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_SLOTS, k, ret) { BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -300,7 +300,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) k.k.p.offset = i + 16; k.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); @@ -312,7 +312,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); @@ -326,7 +326,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -354,7 +354,7 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -374,7 +374,7 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -403,7 +403,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) bch_err(c, "insert error in insert_test_extent: %i", ret); @@ -475,7 +475,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); if (ret) { bch_err(c, "error in rand_insert: %i", ret); break; @@ -495,7 +495,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) u64 i; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(iter, POS(0, test_rand())); @@ -522,7 +522,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) u64 i; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(iter, POS(0, test_rand())); @@ -561,7 +561,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); k = bch2_btree_iter_peek(iter); ret = bkey_err(k); @@ -616,7 +616,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; @@ -643,7 +643,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; bch2_trans_exit(&trans); return ret; @@ -658,7 +658,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; @@ -679,7 +679,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) { int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, POS(0, 0), POS(0, U64_MAX), NULL); if (ret) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c7840bb949a1..5555d45df54e 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -61,7 +61,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) } const struct bch_hash_desc bch2_xattr_hash_desc = { - .btree_id = BTREE_ID_XATTRS, + .btree_id = BTREE_ID_xattrs, .key_type = KEY_TYPE_xattr, .hash_key = xattr_hash_key, .hash_bkey = xattr_hash_bkey, @@ -279,7 +279,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS(inum, 0), 0, k, ret) { BUG_ON(k.k->p.inode < inum); -- cgit From 2436cb9fada98d477bb3508a30e520ab3bfaae3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 19:47:58 -0500 Subject: bcachefs: Use x-macros for more enums This patch standardizes all the enums that have associated string tables (probably more enums should have string tables). 
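In rough outline (illustrative names, not the real bcachefs identifiers), the pattern expands one x-macro list twice, once for the enum and once for the matching string table, so the two can never drift apart:

#define EXAMPLE_STATES()		\
	x(active,	0)		\
	x(inactive,	1)		\
	x(failed,	2)

enum example_state {
#define x(t, n)	EXAMPLE_STATE_##t = n,
	EXAMPLE_STATES()
#undef x
	EXAMPLE_STATE_NR
};

/* string table generated from the same list, so it can't get out of sync: */
const char * const example_states[] = {
#define x(t, n)	#t,
	EXAMPLE_STATES()
#undef x
	NULL
};

The lower-case value suffixes in the diff below (BCH_MEMBER_STATE_rw, BCH_ON_ERROR_ro and so on) fall out of this: the stringified list entries double as the user-visible option strings.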
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++--- fs/bcachefs/bcachefs_format.h | 73 ++++++++++++++++++++++++++++-------------- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/checksum.h | 6 ++-- fs/bcachefs/error.c | 10 +++--- fs/bcachefs/extents.c | 2 +- fs/bcachefs/journal_io.c | 6 ++-- fs/bcachefs/opts.c | 45 ++++++-------------------- fs/bcachefs/opts.h | 11 +++---- fs/bcachefs/replicas.c | 2 +- fs/bcachefs/str_hash.h | 6 ++-- fs/bcachefs/super.c | 40 +++++++++++------------ fs/bcachefs/super.h | 10 +++--- fs/bcachefs/sysfs.c | 2 +- 14 files changed, 110 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 34590e4b8f5d..add04dcb849b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -720,13 +720,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: + case BCH_CACHE_REPLACEMENT_lru: find_reclaimable_buckets_lru(c, ca); break; - case CACHE_REPLACEMENT_FIFO: + case BCH_CACHE_REPLACEMENT_fifo: find_reclaimable_buckets_fifo(c, ca); break; - case CACHE_REPLACEMENT_RANDOM: + case BCH_CACHE_REPLACEMENT_random: find_reclaimable_buckets_random(c, ca); break; } @@ -1037,7 +1037,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) static inline bool allocator_thread_running(struct bch_dev *ca) { - return ca->mi.state == BCH_MEMBER_STATE_RW && + return ca->mi.state == BCH_MEMBER_STATE_rw && test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e9e501a8c3ec..17cc6131de0c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -991,19 +991,29 @@ LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); #endif +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + enum bch_member_state { - BCH_MEMBER_STATE_RW = 0, - BCH_MEMBER_STATE_RO = 1, - BCH_MEMBER_STATE_FAILED = 2, - BCH_MEMBER_STATE_SPARE = 3, - BCH_MEMBER_STATE_NR = 4, +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR }; -enum cache_replacement { - CACHE_REPLACEMENT_LRU = 0, - CACHE_REPLACEMENT_FIFO = 1, - CACHE_REPLACEMENT_RANDOM = 2, - CACHE_REPLACEMENT_NR = 3, +#define BCH_CACHE_REPLACEMENT_POLICIES() \ + x(lru, 0) \ + x(fifo, 1) \ + x(random, 2) + +enum bch_cache_replacement_policies { +#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, + BCH_CACHE_REPLACEMENT_POLICIES() +#undef x + BCH_CACHE_REPLACEMENT_NR }; struct bch_sb_field_members { @@ -1405,11 +1415,16 @@ enum bch_sb_compat { #define BCH_BKEY_PTRS_MAX 16U +#define BCH_ERROR_ACTIONS() \ + x(continue, 0) \ + x(ro, 1) \ + x(panic, 2) + enum bch_error_actions { - BCH_ON_ERROR_CONTINUE = 0, - BCH_ON_ERROR_RO = 1, - BCH_ON_ERROR_PANIC = 2, - BCH_NR_ERROR_ACTIONS = 3, +#define x(t, n) BCH_ON_ERROR_##t = n, + BCH_ERROR_ACTIONS() +#undef x + BCH_ON_ERROR_NR }; enum bch_str_hash_type { @@ -1420,11 +1435,16 @@ enum bch_str_hash_type { BCH_STR_HASH_NR = 4, }; +#define BCH_STR_HASH_OPTS() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash, 2) + enum bch_str_hash_opts { - BCH_STR_HASH_OPT_CRC32C = 0, - BCH_STR_HASH_OPT_CRC64 = 1, - BCH_STR_HASH_OPT_SIPHASH = 2, - BCH_STR_HASH_OPT_NR = 3, +#define x(t, n) 
BCH_STR_HASH_OPT_##t = n, + BCH_STR_HASH_OPTS() +#undef x + BCH_STR_HASH_OPT_NR }; enum bch_csum_type { @@ -1459,11 +1479,16 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) } } +#define BCH_CSUM_OPTS() \ + x(none, 0) \ + x(crc32c, 1) \ + x(crc64, 2) + enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, +#define x(t, n) BCH_CSUM_OPT_##t = n, + BCH_CSUM_OPTS() +#undef x + BCH_CSUM_OPT_NR }; #define BCH_COMPRESSION_TYPES() \ @@ -1475,7 +1500,7 @@ enum bch_csum_opts { x(incompressible, 5) enum bch_compression_type { -#define x(t, n) BCH_COMPRESSION_TYPE_##t, +#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, BCH_COMPRESSION_TYPES() #undef x BCH_COMPRESSION_TYPE_NR @@ -1488,7 +1513,7 @@ enum bch_compression_type { x(zstd, 3) enum bch_compression_opts { -#define x(t, n) BCH_COMPRESSION_OPT_##t, +#define x(t, n) BCH_COMPRESSION_OPT_##t = n, BCH_COMPRESSION_OPTS() #undef x BCH_COMPRESSION_OPT_NR diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b7d931335dd6..a0df2c67da65 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -954,7 +954,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_RW) + if (ca->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } out: diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 24dee8039d57..728b7ef1a149 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -77,11 +77,11 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, bool data) { switch (type) { - case BCH_CSUM_OPT_NONE: + case BCH_CSUM_OPT_none: return BCH_CSUM_NONE; - case BCH_CSUM_OPT_CRC32C: + case BCH_CSUM_OPT_crc32c: return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; - case BCH_CSUM_OPT_CRC64: + case BCH_CSUM_OPT_crc64: return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; default: BUG(); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index cd46706fb6f5..a8ee1db8aa39 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -11,13 +11,13 @@ bool bch2_inconsistent_error(struct bch_fs *c) set_bit(BCH_FS_ERROR, &c->flags); switch (c->opts.errors) { - case BCH_ON_ERROR_CONTINUE: + case BCH_ON_ERROR_continue: return false; - case BCH_ON_ERROR_RO: + case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "emergency read only"); return true; - case BCH_ON_ERROR_PANIC: + case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); return true; default: @@ -38,10 +38,10 @@ void bch2_io_error_work(struct work_struct *work) bool dev; down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED); if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, + ? 
__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED) : bch2_fs_emergency_read_only(c)) bch_err(ca, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 515840bc3eaa..3fe9ef50f5c0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -724,7 +724,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + if (ca->mi.state != BCH_MEMBER_STATE_failed) durability = max_t(unsigned, durability, ca->mi.durability); if (p.has_ec) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fdd5a837902c..756154b85526 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -870,8 +870,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; - if ((ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO) && + if ((ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro) && percpu_ref_tryget(&ca->io_ref)) closure_call(&ca->journal.read, bch2_journal_read_device, @@ -1064,7 +1064,7 @@ static void __journal_write_alloc(struct journal *j, * it: */ if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_RW || + ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ca->dev_idx) || diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index a6c734efe328..0cfbb56a57c1 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -9,86 +9,59 @@ #include "super-io.h" #include "util.h" +#define x(t, n) #t, + const char * const bch2_error_actions[] = { - "continue", - "remount-ro", - "panic", + BCH_ERROR_ACTIONS() NULL }; const char * const bch2_sb_features[] = { -#define x(f, n) #f, BCH_SB_FEATURES() -#undef x NULL }; const char * const bch2_sb_compat[] = { -#define x(f, n) #f, BCH_SB_COMPAT() -#undef x NULL }; const char * const bch2_btree_ids[] = { -#define x(name, ...) 
#name, BCH_BTREE_IDS() -#undef x NULL }; const char * const bch2_csum_opts[] = { - "none", - "crc32c", - "crc64", + BCH_CSUM_OPTS() NULL }; const char * const bch2_compression_opts[] = { -#define x(t, n) #t, BCH_COMPRESSION_OPTS() -#undef x NULL }; const char * const bch2_str_hash_types[] = { - "crc32c", - "crc64", - "siphash", + BCH_STR_HASH_OPTS() NULL }; const char * const bch2_data_types[] = { -#define x(t, n) #t, BCH_DATA_TYPES() -#undef x NULL }; const char * const bch2_cache_replacement_policies[] = { - "lru", - "fifo", - "random", + BCH_CACHE_REPLACEMENT_POLICIES() NULL }; -/* Default is -1; we skip past it for struct cached_dev's cache mode */ -const char * const bch2_cache_modes[] = { - "default", - "writethrough", - "writeback", - "writearound", - "none", +const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() NULL }; -const char * const bch2_dev_state[] = { - "readwrite", - "readonly", - "failed", - "spare", - NULL -}; +#undef x void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 38d78ca46c9c..46f91f19dad4 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -17,8 +17,7 @@ extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; -extern const char * const bch2_cache_modes[]; -extern const char * const bch2_dev_state[]; +extern const char * const bch2_member_states[]; /* * Mount options; we also store defaults in the superblock. @@ -91,7 +90,7 @@ enum opt_type { x(errors, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -116,12 +115,12 @@ enum opt_type { x(metadata_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_STR(bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ @@ -136,7 +135,7 @@ enum opt_type { x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index ddaf833d0bf2..0498c8ac82c8 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -982,7 +982,7 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); nr_online += test_bit(e->devs[i], devs.d); - nr_failed += ca->mi.state == BCH_MEMBER_STATE_FAILED; + nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; } if (nr_failed == e->nr_devs) diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 952b146af750..b85f895de346 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -18,11 +18,11 @@ static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts 
opt) { switch (opt) { - case BCH_STR_HASH_OPT_CRC32C: + case BCH_STR_HASH_OPT_crc32c: return BCH_STR_HASH_CRC32C; - case BCH_STR_HASH_OPT_CRC64: + case BCH_STR_HASH_OPT_crc64: return BCH_STR_HASH_CRC64; - case BCH_STR_HASH_OPT_SIPHASH: + case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ? BCH_STR_HASH_SIPHASH : BCH_STR_HASH_SIPHASH_OLD; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index de8e770ba300..7c23cae436bb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1171,7 +1171,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_RW && + if (ca->mi.state == BCH_MEMBER_STATE_rw && bch2_dev_allocator_start(ca)) { bch2_dev_free(ca); goto err; @@ -1276,16 +1276,16 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, lockdep_assert_held(&c->state_lock); switch (new_state) { - case BCH_MEMBER_STATE_RW: + case BCH_MEMBER_STATE_rw: return true; - case BCH_MEMBER_STATE_RO: - if (ca->mi.state != BCH_MEMBER_STATE_RW) + case BCH_MEMBER_STATE_ro: + if (ca->mi.state != BCH_MEMBER_STATE_rw) return true; /* do we have enough devices to write to? */ for_each_member_device(ca2, c, i) if (ca2 != ca) - nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; + nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas @@ -1295,10 +1295,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, : c->opts.data_replicas_required); return nr_rw >= required; - case BCH_MEMBER_STATE_FAILED: - case BCH_MEMBER_STATE_SPARE: - if (ca->mi.state != BCH_MEMBER_STATE_RW && - ca->mi.state != BCH_MEMBER_STATE_RO) + case BCH_MEMBER_STATE_failed: + case BCH_MEMBER_STATE_spare: + if (ca->mi.state != BCH_MEMBER_STATE_rw && + ca->mi.state != BCH_MEMBER_STATE_ro) return true; /* do we have enough devices to read from? 
*/ @@ -1335,8 +1335,8 @@ static bool bch2_fs_may_start(struct bch_fs *c) ca = bch_dev_locked(c, i); if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO)) { + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { mutex_unlock(&c->sb_lock); return false; } @@ -1369,7 +1369,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -1392,10 +1392,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (!bch2_dev_state_allowed(c, ca, new_state, flags)) return -EINVAL; - if (new_state != BCH_MEMBER_STATE_RW) + if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); - bch_notice(ca, "%s", bch2_dev_state[new_state]); + bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1403,7 +1403,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_RW && + if (new_state == BCH_MEMBER_STATE_rw && __bch2_dev_read_write(c, ca)) ret = -ENOMEM; @@ -1465,7 +1465,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ percpu_ref_put(&ca->ref); - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); goto err; } @@ -1549,7 +1549,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_dev_usage_journal_reserve(c); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_RW && + if (ca->mi.state == BCH_MEMBER_STATE_rw && !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); @@ -1673,7 +1673,7 @@ have_slot: if (ret) goto err_late; - if (ca->mi.state == BCH_MEMBER_STATE_RW) { + if (ca->mi.state == BCH_MEMBER_STATE_rw) { err = __bch2_dev_read_write(c, ca); if (err) goto err_late; @@ -1734,7 +1734,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_RW) { + if (ca->mi.state == BCH_MEMBER_STATE_rw) { err = __bch2_dev_read_write(c, ca); if (err) goto err; @@ -1768,7 +1768,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); return -EINVAL; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 795229e2d6a1..28e6d78f9fcd 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -34,7 +34,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) static inline bool bch2_dev_is_readable(struct bch_dev *ca) { return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_FAILED; + ca->mi.state != BCH_MEMBER_STATE_failed; } static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) @@ -42,8 +42,8 @@ static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) if (!percpu_ref_tryget(&ca->io_ref)) return false; - if (ca->mi.state == BCH_MEMBER_STATE_RW || - (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw 
== READ)) return true; percpu_ref_put(&ca->io_ref); @@ -158,11 +158,11 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, __for_each_online_member(ca, c, iter, ~0) #define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) #define for_each_readable_member(ca, c, iter) \ __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) /* * If a key exists that references a device, the device won't be going away and diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b9078adaa747..4b83a98621d7 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -886,7 +886,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(&out, bch2_dev_state, + bch2_string_opt_to_text(&out, bch2_member_states, ca->mi.state); pr_buf(&out, "\n"); return out.pos - buf; -- cgit From 6333bd2f1334595c553278c2580c1b155e319e43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 20:51:57 -0500 Subject: bcachefs: Improve handling of extents in bch2_trans_update() The transaction update/commit path cares about whether it's inserting extents or regular keys; extents require extra passes (handling of overlapping extents) but sometimes we want to skip all that. This clarifies things by adding a new member to btree_insert_entry specifying whether the key being inserted is an extent, instead of overloading BTREE_ITER_IS_EXTENTS. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 21 +-- fs/bcachefs/btree_update_leaf.c | 283 ++++++++++++++++++++-------------------- 2 files changed, 155 insertions(+), 149 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index fcaa13b9129c..ee30ac745ee8 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -335,7 +335,11 @@ struct bkey_cached { struct btree_insert_entry { unsigned trigger_flags; + u8 bkey_type; + u8 btree_id; + u8 level; unsigned trans_triggers_run:1; + unsigned is_extent:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -589,19 +593,20 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) return btree_node_type_is_extents(btree_iter_key_type(iter)); } -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ - (1U << BKEY_TYPE_alloc)| \ (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_reflink)| \ (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ (1U << BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_extents)| \ - (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_stripes)| \ - (1U << BKEY_TYPE_reflink)) +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_stripes)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c46016961284..ad85bc78ea35 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -21,6 +21,14 @@ #include #include +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, 
r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p); +} + static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { @@ -211,15 +219,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - BUG_ON(bkey_cmp(insert->k.p, iter->real_pos)); BUG_ON(bch2_debug_check_bkeys && - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - __btree_node_type(iter->level, iter->btree_id))); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); + BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + BUG_ON(i->btree_id != i->iter->btree_id); } static noinline int @@ -332,19 +340,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static inline bool iter_has_trans_triggers(struct btree_iter *iter) -{ - return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); -} - -static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) -{ - return (((BTREE_NODE_TYPE_HAS_TRIGGERS & - ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | - (1U << BTREE_ID_stripes)) & - (1U << iter->btree_id); -} - static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) { __bch2_btree_iter_unlock(iter); @@ -405,7 +400,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } - if (btree_node_type_needs_gc(i->iter->btree_id)) + if (btree_node_type_needs_gc(i->bkey_type)) marking = true; } @@ -459,7 +454,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, } trans_for_each_update(trans, i) - if (iter_has_nontrans_triggers(i->iter)) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, &fs_usage->u, i->trigger_flags); @@ -531,7 +526,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update2(trans, i) - btree_insert_entry_checks(trans, i->iter, i->k); + btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); trans_for_each_update2(trans, i) @@ -696,69 +691,64 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static inline int btree_iter_pos_cmp(const struct btree_iter *l, - const struct btree_iter *r) -{ - return cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos); -} - -static int bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) +static int __bch2_trans_update2(struct btree_trans *trans, + struct btree_insert_entry n) { - struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .iter = iter, .k = insert - }; - int ret; + struct btree_insert_entry *i; - btree_insert_entry_checks(trans, n.iter, n.k); + btree_insert_entry_checks(trans, &n); EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return ret; - - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans_for_each_update2(trans, i) { - if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { - *i = n; - return 0; - } - - if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) + trans_for_each_update2(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) break; - } - array_insert_item(trans->updates2, trans->nr_updates2, - i - 
trans->updates2, n); + if (i < trans->updates2 + trans->nr_updates2 && + !btree_insert_entry_cmp(&n, i)) + *i = n; + else + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); + return 0; } +static int bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + return __bch2_trans_update2(trans, (struct btree_insert_entry) { + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, + .iter = iter, + .k = insert, + }); +} + static int extent_update_to_keys(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert) + struct btree_insert_entry n) { - struct btree_iter *iter; int ret; - ret = bch2_extent_can_insert(trans, orig_iter, insert); + if (bkey_deleted(&n.k->k)) + return 0; + + ret = bch2_extent_can_insert(trans, n.iter, n.k); if (ret) return ret; - if (bkey_deleted(&insert->k)) - return 0; + n.iter = bch2_trans_copy_iter(trans, n.iter); - iter = bch2_trans_copy_iter(trans, orig_iter); + n.iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(n.iter, n.k->k.p, false); + n.is_extent = false; - iter->flags |= BTREE_ITER_INTENT; - __bch2_btree_iter_set_pos(iter, insert->k.p, false); - ret = bch2_trans_update2(trans, iter, insert); - bch2_trans_iter_put(trans, iter); + ret = __bch2_trans_update2(trans, n); + bch2_trans_iter_put(trans, n.iter); return ret; } @@ -868,7 +858,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && !(i->trigger_flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, - i->iter->btree_id, i->iter->pos); + i->btree_id, i->k->k.p); #endif /* @@ -879,24 +869,7 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - ret = bch2_btree_iter_traverse(i->iter); - if (unlikely(ret)) { - trace_trans_restart_traverse(trans->ip); - goto out; - } - - /* - * We're not using bch2_btree_iter_upgrade here because - * we know trans->nounlock can't be set: - */ - if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && - !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - - if (iter_has_trans_triggers(i->iter) && + if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && !i->trans_triggers_run) { i->trans_triggers_run = true; trans_trigger_run = true; @@ -914,39 +887,46 @@ int __bch2_trans_commit(struct btree_trans *trans) /* Turn extents updates into keys: */ trans_for_each_update(trans, i) - if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + if (i->is_extent) { struct bpos start = bkey_start_pos(&i->k->k); while (i + 1 < trans->updates + trans->nr_updates && - i[0].iter->btree_id == i[1].iter->btree_id && + i[0].btree_id == i[1].btree_id && !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) i++; - ret = extent_handle_overwrites(trans, i->iter->btree_id, + ret = extent_handle_overwrites(trans, i->btree_id, start, i->k->k.p); if (ret) goto out; } trans_for_each_update(trans, i) { - if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { - ret = extent_update_to_keys(trans, i->iter, i->k); - } else { - ret = bch2_trans_update2(trans, i->iter, i->k); - } + ret = i->is_extent + ? 
extent_update_to_keys(trans, *i) + : __bch2_trans_update2(trans, *i); if (ret) goto out; } trans_for_each_update2(trans, i) { - BUG_ON(i->iter->locks_want < 1); - ret = bch2_btree_iter_traverse(i->iter); if (unlikely(ret)) { trace_trans_restart_traverse(trans->ip); goto out; } + /* + * We're not using bch2_btree_iter_upgrade here because + * we know trans->nounlock can't be set: + */ + if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && + !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) @@ -989,80 +969,101 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_trigger_flags flags) { struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .trigger_flags = flags, .iter = iter, .k = k + .trigger_flags = flags, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, + .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, + .iter = iter, + .k = k }; + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + #ifdef CONFIG_BCACHEFS_DEBUG BUG_ON(bkey_cmp(iter->pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&k->k) - : k->k.p)); + n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); trans_for_each_update(trans, i) { BUG_ON(bkey_cmp(i->iter->pos, - (i->iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&i->k->k) - : i->k->k.p)); + i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); BUG_ON(i != trans->updates && - btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); + btree_insert_entry_cmp(i - 1, i) >= 0); } #endif iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (btree_node_type_is_extents(iter->btree_id)) { + if (n.is_extent) { iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; } /* - * Pending updates are kept sorted: first, find position of new update: + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) - if (btree_iter_pos_cmp(iter, i->iter) <= 0) - break; + if (!n.is_extent) { + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) + break; - /* - * Now delete/trim any updates the new update overwrites: - */ - if (i > trans->updates && - i[-1].iter->btree_id == iter->btree_id && - bkey_cmp(iter->pos, i[-1].k->k.p) < 0) - bch2_cut_back(n.iter->pos, i[-1].k); - - while (i < trans->updates + trans->nr_updates && - iter->btree_id == i->iter->btree_id && - bkey_cmp(n.k->k.p, i->k->k.p) >= 0) - array_remove_item(trans->updates, trans->nr_updates, - i - trans->updates); - - if (i < trans->updates + trans->nr_updates && - iter->btree_id == i->iter->btree_id && - bkey_cmp(n.k->k.p, i->iter->pos) > 0) { - /* - * When we have an extent that overwrites the start of another - * update, trimming that extent will mean the iterator's - * position has to change since the iterator position has to - * match the extent's start pos - but we don't want to change - * the iterator pos if some other code is using it, so we may - * need to clone it: - */ - if (trans->iters_live & (1ULL << i->iter->idx)) { - i->iter = bch2_trans_copy_iter(trans, i->iter); + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) + *i = n; + else + array_insert_item(trans->updates, 
trans->nr_updates, + i - trans->updates, n); + } else { + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) < 0) + break; - i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, i->iter); + while (i > trans->updates && + i[-1].btree_id == n.btree_id && + bkey_cmp(bkey_start_pos(&n.k->k), + bkey_start_pos(&i[-1].k->k)) <= 0) { + --i; + array_remove_item(trans->updates, trans->nr_updates, + i - trans->updates); } - bch2_cut_front(n.k->k.p, i->k); - bch2_btree_iter_set_pos(i->iter, n.k->k.p); - } + if (i > trans->updates && + i[-1].btree_id == n.btree_id && + bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) + bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + if (i < trans->updates + trans->nr_updates && + i->btree_id == n.btree_id && + bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { + /* We don't handle splitting extents here: */ + BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k), + bkey_start_pos(&i->k->k)) > 0); + + /* + * When we have an extent that overwrites the start of another + * update, trimming that extent will mean the iterator's + * position has to change since the iterator position has to + * match the extent's start pos - but we don't want to change + * the iterator pos if some other code is using it, so we may + * need to clone it: + */ + if (trans->iters_live & (1ULL << i->iter->idx)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); + + i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, i->iter); + } + + bch2_cut_front(n.k->k.p, i->k); + bch2_btree_iter_set_pos(i->iter, n.k->k.p); + } + + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + } - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); return 0; } -- cgit From 1f7fdc0abd743076dac6bc91b293a4ae1bb70e61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Feb 2021 22:19:34 -0500 Subject: bcachefs: btree_iter_live() New helper to clean things up a bit - also, improve iter->flags handling. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 +++++++++------------------- fs/bcachefs/btree_iter.h | 11 +++++++++++ fs/bcachefs/btree_types.h | 7 ------- fs/bcachefs/btree_update_leaf.c | 7 +++---- 4 files changed, 23 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1bd7c92d705e..9d3333cd76f9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1701,7 +1701,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) k = __bch2_btree_iter_peek_with_updates(iter); if (k.k && bkey_deleted(k.k)) { - bch2_btree_iter_advance_pos(iter); + if (!bch2_btree_iter_advance_pos(iter)) + return bkey_s_c_null; continue; } @@ -2008,7 +2009,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) bch2_btree_ids[iter->btree_id], iter->pos.inode, iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + btree_iter_live(trans, iter) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", (void *) iter->ip_allocated); @@ -2089,31 +2090,20 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (!best) { iter = btree_trans_iter_alloc(trans); bch2_btree_iter_init(trans, iter, btree_id, pos, flags); - } else if ((trans->iters_live & (1ULL << best->idx)) || - (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { + } else if (btree_iter_keep(trans, best)) { iter = btree_trans_iter_alloc(trans); btree_iter_copy(iter, best); } else { iter = best; } - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - iter->flags &= ~BTREE_ITER_USER_FLAGS; - iter->flags |= flags & BTREE_ITER_USER_FLAGS; + flags |= iter->flags & BTREE_ITER_ERROR; + iter->flags = flags; - if (iter->flags & BTREE_ITER_INTENT) { - if (!iter->locks_want) { - __bch2_btree_iter_unlock(iter); - iter->locks_want = 1; - } - } else + if (!(iter->flags & BTREE_ITER_INTENT)) bch2_btree_iter_downgrade(iter); - - BUG_ON(iter->btree_id != btree_id); - BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); - BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); - BUG_ON(trans->iters_live & (1ULL << iter->idx)); + else if (!iter->locks_want) + __bch2_btree_iter_upgrade_nounlock(iter, 1); trans->iters_live |= 1ULL << iter->idx; trans->iters_touched |= 1ULL << iter->idx; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 12c519ae2a60..e2469436f53b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -289,6 +289,17 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); +static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) +{ + return (trans->iters_live & (1ULL << iter->idx)) != 0; +} + +static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) +{ + return btree_iter_live(trans, iter) || + (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); +} + #define TRANS_RESET_NOTRAVERSE (1 << 0) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ee30ac745ee8..b12a4f9dd1d0 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -216,13 +216,6 @@ enum btree_iter_type { #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) -#define BTREE_ITER_USER_FLAGS \ - (BTREE_ITER_SLOTS \ - |BTREE_ITER_INTENT \ - |BTREE_ITER_PREFETCH \ - |BTREE_ITER_CACHED_NOFILL \ - |BTREE_ITER_CACHED_NOCREATE) - enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, BTREE_ITER_NEED_PEEK = 1, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ad85bc78ea35..315e2e1e229d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -512,8 +512,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ trans_for_each_iter(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { - if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || - (trans->iters_live & (1ULL << iter->idx))) { + if (btree_iter_keep(trans, iter)) { if (!bch2_btree_iter_upgrade(iter, 1)) { trace_trans_restart_upgrade(trans->ip); return -EINTR; @@ -945,7 +944,7 @@ retry: goto err; trans_for_each_iter(trans, iter) - if ((trans->iters_live & (1ULL << iter->idx)) && + if (btree_iter_live(trans, iter) && (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: @@ -1049,7 +1048,7 @@ int bch2_trans_update(struct 
btree_trans *trans, struct btree_iter *iter, * the iterator pos if some other code is using it, so we may * need to clone it: */ - if (trans->iters_live & (1ULL << i->iter->idx)) { + if (btree_iter_live(trans, i->iter)) { i->iter = bch2_trans_copy_iter(trans, i->iter); i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -- cgit From b8f0507915319ee8032b3c5b72f65f22812b9f91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Mar 2021 12:10:49 -0500 Subject: bcachefs: Delete some dead code Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 041da982d051..8e272519ce0e 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -746,7 +746,6 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct qc_dqblk *qdq) { struct bch_fs *c = sb->s_fs_info; - struct btree_trans trans; struct bkey_i_quota new_quota; int ret; @@ -756,14 +755,10 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - bch2_trans_init(&trans, c, 0, 0); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); - bch2_trans_exit(&trans); - return ret; } -- cgit From 18fc6ae50312a88c8a109ab0f0e68c21a2a8ab1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Mar 2021 22:45:28 -0500 Subject: bcachefs: btree_iter_prev_slot() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++++++ fs/bcachefs/btree_iter.h | 1 + 2 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9d3333cd76f9..c05bc8ff8b8d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1904,6 +1904,14 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind_pos(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) { struct bkey_cached *ck; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e2469436f53b..3ae6c29c6dad 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -171,6 +171,7 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -- cgit From f020bfcdb058e4542a4682557e046a750dc71660 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 15:20:22 -0500 Subject: bcachefs: Use bch2_bpos_to_text() more consistently Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 7 +++--- fs/bcachefs/btree_cache.c | 15 ++++++------ fs/bcachefs/btree_gc.c | 22 ++++++++--------- fs/bcachefs/btree_io.c | 47 +++++++++++++------------------------ fs/bcachefs/btree_iter.c | 45 ++++++++++++++++++----------------- fs/bcachefs/btree_update_interior.c | 18 ++++++-------- fs/bcachefs/rebalance.c | 8 +++---- 7 files changed, 72 insertions(+), 90 deletions(-) (limited to 'fs') 
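The conversions below all follow roughly this shape; the helper here is purely illustrative (not part of the patch) and assumes the printbuf API as used in the hunks that follow:

/* Illustrative only: print a btree position via a printbuf instead of
 * open-coded "%llu:%llu", pos.inode, pos.offset pairs. */
static void example_print_pos(struct printbuf *out, struct bpos pos)
{
	char buf[100];

	/* format the position into a stack buffer... */
	bch2_bpos_to_text(&PBUF(buf), pos);
	/* ...then emit it as an ordinary string: */
	pr_buf(out, "pos %s\n", buf);

	/* equivalently, inline in the argument list via a comma expression,
	 * as the hunks below do inside panic()/fsck_err() format arguments: */
	pr_buf(out, "pos %s\n", (bch2_bpos_to_text(&PBUF(buf), pos), buf));
}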
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index ac2fd8242ca4..c371f402eaa3 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1717,9 +1717,10 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, uk = bkey_unpack_key(b, k); pr_buf(out, " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); + "\t", + ilog2(j)); + bch2_bpos_to_text(out, uk.p); + pr_buf(out, "\n"); break; } } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1a6b4618c2ae..775b3e8468da 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1064,15 +1064,14 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, bch2_btree_keys_stats(b, &stats); - pr_buf(out, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: ", - b->c.level, - b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset); + pr_buf(out, "l %u ", b->c.level); + bch2_bpos_to_text(out, b->data->min_key); + pr_buf(out, " - "); + bch2_bpos_to_text(out, b->data->max_key); + pr_buf(out, ":\n" + " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7506a3de58ff..808bb9ca8d50 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -73,12 +73,13 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bkey_deleted(&prev->k->k)) - scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", - node_start.inode, - node_start.offset); - else + if (bkey_deleted(&prev->k->k)) { + struct printbuf out = PBUF(buf1); + pr_buf(&out, "start of node: "); + bch2_bpos_to_text(&out, node_start); + } else { bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + } if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" @@ -554,6 +555,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; u8 max_stale = 0; + char buf[100]; int ret = 0; b = c->btree_roots[btree_id].b; @@ -563,16 +565,14 @@ static int bch2_gc_btree_init(struct bch_fs *c, six_lock_read(&b->c.lock, NULL, NULL); if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, - "btree root with incorrect min_key: %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset)) { + "btree root with incorrect min_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { BUG(); } if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, - "btree root with incorrect min_key: %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset)) { + "btree root with incorrect max_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { BUG(); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a0df2c67da65..d547bfabf09f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -488,12 +488,12 @@ enum btree_validate_ret { ({ \ __label__ out; \ char _buf[300]; \ - char *buf2 = _buf; \ + char *_buf2 = _buf; \ struct printbuf out = PBUF(_buf); \ \ - buf2 = kmalloc(4096, GFP_ATOMIC); \ - if (buf2) \ - out = _PBUF(buf2, 4986); \ + _buf2 = kmalloc(4096, GFP_ATOMIC); \ + if (_buf2) \ + out = _PBUF(_buf2, 4986); \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ @@ -501,13 +501,13 @@ enum btree_validate_ret { if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", buf2); \ + mustfix_fsck_err(c, "%s", _buf2); \ goto out; \ } \ \ switch (write) { \ case READ: \ - bch_err(c, "%s", buf2); \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -528,7 +528,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", buf2); \ + bch_err(c, "corrupt metadata before write: %s", _buf2); \ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -537,8 +537,8 @@ enum btree_validate_ret { break; \ } \ out: \ - if (buf2 != _buf) \ - kfree(buf2); \ + if (_buf2 != _buf) \ + kfree(_buf2); \ true; \ }) @@ -550,6 +550,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, { unsigned version = le16_to_cpu(i->version); const char *err; + char buf1[100]; + char buf2[100]; int ret = 0; btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -613,37 +615,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "incorrect min_key: got %llu:%llu should be %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - bp->min_key.inode, - bp->min_key.offset); + "incorrect min_key: got %s should be %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, - "incorrect max key %llu:%llu", - bn->max_key.inode, - bn->max_key.offset); + "incorrect max key %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - /* XXX: ideally we would be validating min_key too */ -#if 0 - /* - * not correct anymore, due to btree node write error - * handling - * - * need to add bn->seq to btree keys and verify - * against that - */ - btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - bn->ptr), - BTREE_ERR_FATAL, c, b, i, - "incorrect 
backpointer"); -#endif err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, BTREE_ERR_FATAL, c, ca, b, i, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c05bc8ff8b8d..69d15bb20c7c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -495,7 +495,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, struct btree_node_iter tmp = l->iter; bool locked = btree_node_locked(iter, level); struct bkey_packed *p, *k; - char buf1[100], buf2[100]; + char buf1[100], buf2[100], buf3[100]; const char *msg; if (!bch2_debug_check_iterators) @@ -552,26 +552,26 @@ unlock: btree_node_unlock(iter, level); return; err: - strcpy(buf1, "(none)"); strcpy(buf2, "(none)"); + strcpy(buf3, "(none)"); + + bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); - bch2_bkey_to_text(&PBUF(buf1), &uk); + bch2_bkey_to_text(&PBUF(buf2), &uk); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); - bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bkey_to_text(&PBUF(buf3), &uk); } panic("iterator should be %s key at level %u:\n" - "iter pos %llu:%llu\n" + "iter pos %s\n" "prev key %s\n" "cur key %s\n", - msg, level, - iter->real_pos.inode, iter->real_pos.offset, - buf1, buf2); + msg, level, buf1, buf2, buf3); } static void bch2_btree_iter_verify(struct btree_iter *iter) @@ -876,22 +876,23 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) if (!k || bkey_deleted(k) || bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - char buf[100]; + char buf1[100]; + char buf2[100]; + char buf3[100]; + char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); bch2_dump_btree_node(iter->trans->c, l->b); - bch2_bkey_to_text(&PBUF(buf), &uk); + bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); + bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); panic("parent iter doesn't point to new node:\n" - "iter pos %s %llu:%llu\n" + "iter pos %s %s\n" "iter key %s\n" - "new node %llu:%llu-%llu:%llu\n", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - buf, - b->data->min_key.inode, - b->data->min_key.offset, - b->key.k.p.inode, b->key.k.p.offset); + "new node %s-%s\n", + bch2_btree_ids[iter->btree_id], buf1, + buf2, buf3, buf4); } if (!parent_locked) @@ -2011,12 +2012,12 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) struct btree_iter *iter; struct btree_insert_entry *i; + char buf[100]; trans_for_each_iter(trans, iter) - printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n", + printk(KERN_ERR "iter: btree %s pos %s%s%s%s %ps\n", bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, + (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), btree_iter_live(trans, iter) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e1dd21320153..4ad8084714f9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -35,6 +35,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) struct bkey_s_c k; struct bkey_s_c_btree_ptr_v2 bp; struct bkey unpacked; + char buf1[100], buf2[100]; BUG_ON(!b->c.level); @@ -51,24 +52,19 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bkey_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); - panic("expected next min_key %llu:%llu got %llu:%llu\n", - next_node.inode, - next_node.offset, - bp.v->min_key.inode, - bp.v->min_key.offset); + panic("expected next min_key %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); } bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bkey_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); - panic("expected end %llu:%llu got %llu:%llu\n", - b->key.k.p.inode, - b->key.k.p.offset, - k.k->p.inode, - k.k->p.offset); + panic("expected end %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), + (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); } break; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 0e1f18d82855..b7e61da0f4d1 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -281,10 +281,10 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) h1); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n"); - pr_buf(out, "pos %llu:%llu\n", - r->move_stats.pos.inode, - r->move_stats.pos.offset); + pr_buf(out, "running\n" + "pos "); + bch2_bpos_to_text(out, r->move_stats.pos); + pr_buf(out, "\n"); break; } } -- cgit From 61a19ce4255abd1133d4e7cd64a6cfa40d1f37fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 16:26:19 -0500 Subject: bcachefs: Fix bpos_diff() Previously, bpos_diff() did not handle borrows correctly. Minor thing considering how it was used, but worth fixing. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 31 +++++++++++++++++++++++++++++++ fs/bcachefs/btree_iter.c | 8 -------- 2 files changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 25cb5e985109..77d9d871adfb 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -183,6 +183,37 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) return bkey_cmp(l, r) > 0 ? 
l : r; } +#define sbb(a, b, borrow) \ +do { \ + typeof(a) d1, d2; \ + \ + d1 = a - borrow; \ + borrow = d1 > a; \ + \ + d2 = d1 - b; \ + borrow += d2 > d1; \ + a = d2; \ +} while (0) + +/* returns a - b: */ +static inline struct bpos bpos_sub(struct bpos a, struct bpos b) +{ + int borrow = 0; + + sbb(a.snapshot, b.snapshot, borrow); + sbb(a.offset, b.offset, borrow); + sbb(a.inode, b.inode, borrow); + return a; +} + +static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +{ + if (bkey_cmp(l, r) > 0) + swap(l, r); + + return bpos_sub(r, l); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 69d15bb20c7c..bf59678b609e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2067,14 +2067,6 @@ static inline void btree_iter_copy(struct btree_iter *dst, dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; } -static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -{ - if (bkey_cmp(l, r) > 0) - swap(l, r); - - return POS(r.inode - l.inode, r.offset - l.offset); -} - static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) -- cgit From 7d6f07edc28c3c34bad7e6a92921e3fbf8c8dd4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 19:06:26 -0500 Subject: bcachefs: Fix compat code for superblock The bkey compat code wasn't being run for btree roots in the superblock clean section - this patch fixes it to use the journal entry validate code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/journal_io.c | 108 +++++++++++++++++++++--------------------- fs/bcachefs/journal_io.h | 3 ++ fs/bcachefs/recovery.c | 8 ++-- fs/bcachefs/super-io.c | 31 +++++++++--- fs/bcachefs/super-io.h | 2 +- 6 files changed, 88 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 17cc6131de0c..5d0e340c4dcb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1314,6 +1314,7 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); +LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); /* 61-64 unused */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 756154b85526..7783a874640a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -201,22 +201,19 @@ static void journal_entry_null_range(void *start, void *end) #define FSCK_DELETED_KEY 5 -static int journal_validate_key(struct bch_fs *c, struct jset *jset, +static int journal_validate_key(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned level, enum btree_id btree_id, - struct bkey_i *k, - const char *type, int write) + struct bkey_i *k, const char *type, + unsigned version, int big_endian, int write) { void *next = vstruct_next(entry); const char *invalid; - unsigned version = le32_to_cpu(jset->version); int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", - type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), + "invalid %s in %s entry offset %zi/%u: k->u64s 0", + type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s))) { entry->u64s = 
cpu_to_le16((u64 *) k - entry->_data); @@ -226,10 +223,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", - type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), + "invalid %s in %s entry offset %zi/%u: extends past end of journal entry", + type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); @@ -238,10 +233,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", - type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), + "invalid %s in %s entry offset %zi/%u: bad format %u", + type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s), k->k.format)) { @@ -252,9 +245,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (!write) - bch2_bkey_compat(level, btree_id, version, - JSET_BIG_ENDIAN(jset), write, - NULL, bkey_to_packed(k)); + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id)); @@ -262,10 +254,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", - type, le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), + mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", + type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s), invalid, buf); @@ -277,25 +267,24 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (write) - bch2_bkey_compat(level, btree_id, version, - JSET_BIG_ENDIAN(jset), write, - NULL, bkey_to_packed(k)); + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); fsck_err: return ret; } static int journal_entry_validate_btree_keys(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, + int ret = journal_validate_key(c, where, entry, entry->level, entry->btree_id, - k, "key", write); + k, "key", version, big_endian, write); if (ret == FSCK_DELETED_KEY) continue; @@ -306,9 +295,9 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, } static int journal_entry_validate_btree_root(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; int ret = 0; @@ -327,25 +316,25 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, return 0; } - return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - "btree root", write); + return journal_validate_key(c, where, entry, 1, entry->btree_id, k, + "btree root", version, big_endian, write); fsck_err: return ret; } static int journal_entry_validate_prio_ptrs(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) 
+ unsigned version, int big_endian, int write) { /* obsolete, don't care: */ return 0; } static int journal_entry_validate_blacklist(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { int ret = 0; @@ -358,9 +347,9 @@ fsck_err: } static int journal_entry_validate_blacklist_v2(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -384,9 +373,9 @@ fsck_err: } static int journal_entry_validate_usage(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -405,9 +394,9 @@ fsck_err: } static int journal_entry_validate_data_usage(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -427,9 +416,9 @@ fsck_err: } static int journal_entry_validate_clock(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -453,9 +442,9 @@ fsck_err: } static int journal_entry_validate_dev_usage(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -490,8 +479,8 @@ fsck_err: } struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, int); + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); }; static const struct jset_entry_ops bch2_jset_entry_ops[] = { @@ -503,22 +492,29 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { #undef x }; -static int journal_entry_validate(struct bch_fs *c, struct jset *jset, - struct jset_entry *entry, int write) +int bch2_journal_entry_validate(struct bch_fs *c, const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) { return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, - entry, write) + ? 
bch2_jset_entry_ops[entry->type].validate(c, where, entry, + version, big_endian, write) : 0; } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { + char buf[100]; struct jset_entry *entry; int ret = 0; vstruct_for_each(jset, entry) { + scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", + le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, "journal entry extends past end of jset")) { @@ -526,7 +522,9 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = journal_entry_validate(c, jset, entry, write); + ret = bch2_journal_entry_validate(c, buf, entry, + le32_to_cpu(jset->version), + JSET_BIG_ENDIAN(jset), write); if (ret) break; } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a4931ab93a68..f34281a28f12 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -40,6 +40,9 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) +int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, + unsigned, int, int); + int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b68fcd1d19e4..11d4894b3d63 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -908,9 +908,11 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) return ERR_PTR(-ENOMEM); } - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); + ret = bch2_sb_clean_validate(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f843a3b34ba2..6e61cf5ab217 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -9,6 +9,7 @@ #include "error.h" #include "io.h" #include "journal.h" +#include "journal_io.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" @@ -715,6 +716,8 @@ int bch2_write_super(struct bch_fs *c) if (test_bit(BCH_FS_ERROR, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -938,14 +941,23 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { /* BCH_SB_FIELD_clean: */ -void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) +int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { struct jset_entry *entry; + int ret; for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) - bch2_bkey_renumber(BKEY_TYPE_btree, bkey_to_packed(entry->start), write); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, "superblock", entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; } int bch2_fs_mark_dirty(struct bch_fs *c) @@ -1079,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) struct bch_sb_field_clean *sb_clean; struct jset_entry *entry; unsigned u64s; + int ret; mutex_lock(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) @@ -1113,9 +1126,15 @@ 
void bch2_fs_mark_clean(struct bch_fs *c) memset(entry, 0, vstruct_end(&sb_clean->field) - (void *) entry); - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(sb_clean, WRITE); + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } bch2_write_super(c); out: diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index dd8d4ba911f0..62d040d571c0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -125,7 +125,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); -void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); +int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); -- cgit From 27ace9cc01ea0ebb4a857c8d91e303fd7ab46b19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 22:11:28 -0500 Subject: bcachefs: Simplify for_each_btree_key() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3ae6c29c6dad..0ac8337eba98 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -243,11 +243,9 @@ static inline int bkey_err(struct bkey_s_c k) _start, _flags, _k, _ret) \ for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ (_start), (_flags)), \ - (_ret) = PTR_ERR_OR_ZERO(((_k) = \ - __bch2_btree_iter_peek(_iter, _flags)).k); \ - !_ret && (_k).k; \ - (_ret) = PTR_ERR_OR_ZERO(((_k) = \ - __bch2_btree_iter_next(_iter, _flags)).k)) + (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) #define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ -- cgit From a045be5a0edb0f53770e6c6465155d3952e5bbde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 22:40:41 -0500 Subject: bcachefs: Simplify bch2_btree_iter_peek_prev() Since we added iter->real_pos, btree_iter_set_pos_to_(next|prev)_leaf no longer modify iter->pos, so we don't have to save it at the start anymore. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bf59678b609e..15bda5c92ad1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1739,7 +1739,6 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { - struct bpos pos = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; @@ -1764,8 +1763,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = __btree_iter_peek(iter, l); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0 - : bkey_cmp(bkey_start_pos(k.k), pos) > 0)) + ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 + : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) k = __btree_iter_prev(iter, l); if (likely(k.k)) @@ -1777,10 +1776,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) } } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); /* Extents can straddle iter->pos: */ - if (bkey_cmp(k.k->p, pos) < 0) + if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; iter->real_pos = k.k->p; iter->uptodate = BTREE_ITER_UPTODATE; @@ -1794,8 +1793,6 @@ no_key: * then we errored going to the previous leaf - make sure it's * consistent with iter->pos: */ - BUG_ON(bkey_cmp(pos, iter->pos) && - bkey_cmp(iter->pos, POS_MIN)); bkey_init(&iter->k); iter->k.p = iter->pos; goto out; -- cgit From c7bb769c81cca29462b39ea4689991b9b4d786fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Feb 2021 20:44:55 -0500 Subject: bcachefs: __bch2_trans_get_iter() refactoring, BTREE_ITER_NOT_EXTENTS Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 70 +++++++++++++++-------------------------- fs/bcachefs/btree_iter.h | 1 - fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 16 +++++----- fs/bcachefs/fsck.c | 3 +- fs/bcachefs/recovery.c | 2 +- 6 files changed, 38 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 15bda5c92ad1..35480ba43621 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1494,24 +1494,14 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p bch2_btree_iter_verify(iter); } -void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, - bool strictly_greater) +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; - iter->flags &= ~BTREE_ITER_IS_EXTENTS; - iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -{ - __bch2_btree_iter_set_pos(iter, new_pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) != 0); -} - static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) { struct bpos pos = iter->k.p; @@ -1932,27 +1922,17 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) } static inline void bch2_btree_iter_init(struct btree_trans *trans, - struct btree_iter *iter, enum btree_id btree_id, - struct bpos pos, unsigned flags) + struct btree_iter *iter, enum btree_id btree_id) { struct bch_fs *c = trans->c; unsigned i; - if (btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NODES)) - flags |= BTREE_ITER_IS_EXTENTS; - iter->trans = trans; - iter->pos = pos; - bkey_init(&iter->k); - iter->k.p = pos; - iter->flags = flags; - iter->real_pos = btree_iter_search_key(iter); iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; iter->min_depth = 0; - iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; + iter->locks_want = 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) @@ -2064,12 +2044,16 @@ static inline void btree_iter_copy(struct btree_iter *dst, dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; } -static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags) +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned flags) { struct btree_iter *iter, *best = NULL; + /* We always want a fresh iterator for node iterators: */ + if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) + goto alloc_iter; + trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2084,10 +2068,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, best = iter; } - +alloc_iter: if (!best) { iter = btree_trans_iter_alloc(trans); - bch2_btree_iter_init(trans, iter, btree_id, pos, flags); + bch2_btree_iter_init(trans, iter, btree_id); } else if (btree_iter_keep(trans, best)) { iter = btree_trans_iter_alloc(trans); btree_iter_copy(iter, best); @@ -2095,7 +2079,14 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter = best; } - flags |= iter->flags & BTREE_ITER_ERROR; + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; + + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_node_type_is_extents(btree_id) && + !(flags & BTREE_ITER_NOT_EXTENTS)) + flags |= BTREE_ITER_IS_EXTENTS; + iter->flags = flags; if (!(iter->flags & BTREE_ITER_INTENT)) @@ -2103,21 +2094,8 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, else if (!iter->locks_want) __bch2_btree_iter_upgrade_nounlock(iter, 1); - trans->iters_live |= 1ULL << iter->idx; - trans->iters_touched |= 1ULL << iter->idx; - - return iter; -} - -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, flags); + bch2_btree_iter_set_pos(iter, pos); - __bch2_btree_iter_set_pos(iter, pos, - btree_node_type_is_extents(btree_id)); return iter; } @@ -2129,8 +2107,10 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_NODES); + __bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| + flags); unsigned i; BUG_ON(bkey_cmp(iter->pos, pos)); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0ac8337eba98..bd0c429bd91a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -175,7 +175,6 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); /* Sort order for locking btree iterators: */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b12a4f9dd1d0..41fa5ff77e91 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -215,6 +215,7 @@ enum btree_iter_type { #define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) +#define BTREE_ITER_NOT_EXTENTS (1 << 
11) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 315e2e1e229d..bf2a2b0695b6 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -740,10 +740,9 @@ static int extent_update_to_keys(struct btree_trans *trans, if (ret) return ret; - n.iter = bch2_trans_copy_iter(trans, n.iter); - - n.iter->flags |= BTREE_ITER_INTENT; - __bch2_btree_iter_set_pos(n.iter, n.k->k.p, false); + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); n.is_extent = false; ret = __bch2_trans_update2(trans, n); @@ -777,7 +776,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_back(start, update); - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); if (ret) @@ -794,7 +794,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(end, update); - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); if (ret) @@ -811,7 +812,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, update->k.type = KEY_TYPE_deleted; update->k.size = 0; - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); if (ret) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ebc234b0b6fe..7f6b4ac48f3d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -442,7 +442,8 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, * We don't want to go through the * extent_handle_overwrites path: */ - __bch2_btree_iter_set_pos(u_iter, u->k.p, false); + u_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(u_iter, u->k.p); /* * XXX: this is going to leave disk space diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 11d4894b3d63..03a25dd5acc6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -529,7 +529,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, * want that here, journal replay is supposed to treat extents like * regular keys: */ - __bch2_btree_iter_set_pos(iter, k->k.p, false); + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -- cgit From 53b3e3c0e2f14f661cd61bbc9b82dc9383f783b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Mar 2021 17:09:13 -0500 Subject: bcachefs: Fix locking in bch2_btree_iter_traverse_cached() bch2_btree_iter_traverse() is supposed to ensure we have the correct type of lock - it was downgrading if necessary, but if we entered with a read lock it wasn't upgrading to an intent lock, oops. 
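For illustration, the failure mode is roughly the following (hypothetical caller sketch, not code from this patch; btree_id, pos and new_key are stand-ins): an iterator created with BTREE_ITER_INTENT can arrive at the key cache fill path holding only a read lock, and traverse must then upgrade instead of blindly downgrading, or return -EINTR so the transaction restarts:

	iter = bch2_trans_get_iter(&trans, btree_id, pos,
				   BTREE_ITER_CACHED|BTREE_ITER_INTENT);

	ret = bch2_btree_iter_traverse(iter);
	/*
	 * before: could return 0 while still only read locked, leaving the
	 * update below with no intent lock to write under
	 * after: either an intent lock is held, or we get -EINTR here and
	 * the transaction is retried
	 */
	if (!ret)
		ret = bch2_trans_update(&trans, iter, new_key, 0);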
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 6dc13fa3d1f4..76f19f86c8ad 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -298,7 +298,14 @@ fill: set_bit(BKEY_CACHED_ACCESSED, &ck->flags); iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_iter_downgrade(iter); + + if (!(iter->flags & BTREE_ITER_INTENT)) + bch2_btree_iter_downgrade(iter); + else if (!iter->locks_want) { + if (!__bch2_btree_iter_upgrade(iter, 1)) + ret = -EINTR; + } + return ret; err: if (ret != -EINTR) { -- cgit From b3b66e30445e42a94fa171fad99e0b4e4e43c1ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Mar 2021 16:55:28 -0500 Subject: bcachefs: Have fsck check for stripe pointers matching stripe Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 21 ++++++++++++++++++++- fs/bcachefs/buckets.c | 2 ++ fs/bcachefs/ec.c | 3 ++- fs/bcachefs/ec.h | 37 ++++++++++++++++++++++++++----------- fs/bcachefs/ec_types.h | 1 + 5 files changed, 51 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 808bb9ca8d50..e8cdc82d3451 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -223,6 +223,11 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, "pointer to nonexistent stripe %llu", (u64) p.ec.idx)) do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu", + (u64) p.ec.idx)) + do_update = true; } } @@ -274,8 +279,22 @@ again: if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { struct stripe *m = genradix_ptr(&c->stripes[true], entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } - if (!m || !m->alive) { + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { bch2_bkey_extent_entry_drop(new, entry); goto again; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 55b9818a1dc2..7bf2fded816f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1215,6 +1215,8 @@ static int bch2_mark_stripe(struct bch_fs *c, m->block_sectors[i] = stripe_blockcount_get(new_s, i); m->blocks_nonempty += !!m->block_sectors[i]; + + m->ptrs[i] = new_s->ptrs[i]; } bch2_bkey_to_replicas(&m->r.e, new); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ced8ceeef992..f61d4c873a82 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -151,7 +151,8 @@ static int bkey_matches_stripe(struct bch_stripe *s, bkey_for_each_ptr(ptrs, ptr) for (i = 0; i < nr_data; i++) - if (__bch2_ptr_matches_stripe(s, ptr, i)) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, + le16_to_cpu(s->sectors))) return i; return -1; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 765baa9d9264..744e51eaf327 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -84,27 +84,42 @@ static inline void stripe_csum_set(struct bch_stripe *s, memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } -static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s, - 
const struct bch_extent_ptr *ptr, - unsigned block) +static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, + const struct bch_extent_ptr *data_ptr, + unsigned sectors) +{ + return data_ptr->dev == stripe_ptr->dev && + data_ptr->gen == stripe_ptr->gen && + data_ptr->offset >= stripe_ptr->offset && + data_ptr->offset < stripe_ptr->offset + sectors; +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) { unsigned nr_data = s->nr_blocks - s->nr_redundant; - if (block >= nr_data) + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) return false; - return ptr->dev == s->ptrs[block].dev && - ptr->gen == s->ptrs[block].gen && - ptr->offset >= s->ptrs[block].offset && - ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors); + return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, + le16_to_cpu(s->sectors)); } -static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, - struct extent_ptr_decoded p) +static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, + struct extent_ptr_decoded p) { + unsigned nr_data = m->nr_blocks - m->nr_redundant; + BUG_ON(!p.has_ec); - return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block); + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, + m->sectors); } struct bch_read_bio; diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 847770166223..3fc31222459a 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -22,6 +22,7 @@ struct stripe { unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; -- cgit From 5f0e4ae1c73efe9e4f74492df08202a5845bd19a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Mar 2021 17:52:42 -0500 Subject: bcachefs: Use __bch2_trans_do() in a few more places Minor cleanup, it was being open coded. 
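For context, the helper is just the usual begin/run/commit loop that retries on transaction restart; roughly this shape (a sketch, not the verbatim in tree definition):

	#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
	({									\
		int _ret;							\
										\
		do {								\
			bch2_trans_begin(_trans);				\
			_ret = (_do) ?:						\
			       bch2_trans_commit(_trans, (_disk_res),		\
						 (_journal_seq), (_flags));	\
		} while (_ret == -EINTR);					\
										\
		_ret;								\
	})

So each conversion below collapses one hand rolled do { } while (ret == -EINTR) loop into a single call.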
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 52 +++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2d5e00a42b3e..a168d09ffd37 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -416,16 +416,12 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - do { - bch2_trans_begin(&trans); - ret = bch2_link_trans(&trans, + ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, + BTREE_INSERT_NOUNLOCK, + bch2_link_trans(&trans, dir->v.i_ino, inode->v.i_ino, &dir_u, &inode_u, - &dentry->d_name) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK); - } while (ret == -EINTR); + &dentry->d_name)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -472,17 +468,12 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - do { - bch2_trans_begin(&trans); - - ret = bch2_unlink_trans(&trans, + ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, dir->v.i_ino, &dir_u, - &inode_u, &dentry->d_name) ?: - bch2_trans_commit(&trans, NULL, - &dir->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + &inode_u, &dentry->d_name)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -599,21 +590,16 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } -retry: - bch2_trans_begin(&trans); - ret = bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode) ?: - bch2_trans_commit(&trans, NULL, - &journal_seq, - BTREE_INSERT_NOUNLOCK); - if (ret == -EINTR) - goto retry; + ret = __bch2_trans_do(&trans, NULL, &journal_seq, + BTREE_INSERT_NOUNLOCK, + bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode)); if (unlikely(ret)) goto err; -- cgit From 07bca3bd1e5423b2d6fe8c7085af3e92b31c461f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Mar 2021 18:35:30 -0500 Subject: bcachefs: Kill ei_str_hash Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 15 +++++++++------ fs/bcachefs/acl.h | 4 ++-- fs/bcachefs/fs-ioctl.c | 4 ++-- fs/bcachefs/fs.c | 7 +++---- fs/bcachefs/fs.h | 2 -- fs/bcachefs/xattr.c | 19 ++++++++++--------- 6 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index acc1d03c79e4..3879815bcede 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -217,6 +217,7 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -227,7 +228,7 @@ retry: bch2_trans_begin(&trans); iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash, inode->v.i_ino, &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (IS_ERR(iter)) { @@ -290,6 +291,7 @@ int bch2_set_acl(struct 
mnt_idmap *idmap, struct btree_trans trans; struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -314,9 +316,9 @@ retry: goto err; } - ret = bch2_set_acl_trans(&trans, &inode_u, - &inode->ei_str_hash, - acl, type); + hash_info = bch2_hash_info_init(c, &inode_u); + + ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); if (ret) goto btree_err; @@ -345,10 +347,11 @@ err: } int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct btree_iter *iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; @@ -356,7 +359,7 @@ int bch2_acl_chmod(struct btree_trans *trans, int ret = 0; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash_info, inode->bi_inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (IS_ERR(iter)) diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 73739e38e2d5..f11eb9d4592c 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -33,7 +33,7 @@ int bch2_set_acl_trans(struct btree_trans *, const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, +int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else @@ -47,7 +47,7 @@ static inline int bch2_set_acl_trans(struct btree_trans *trans, } static inline int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index f6773783b958..09a9567b402c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -183,6 +183,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct bch_inode_info *src, const char __user *name) { + struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); struct bch_inode_info *dst; struct inode *vinode = NULL; char *kname = NULL; @@ -202,8 +203,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.name = kname; ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, - &src->ei_str_hash, + inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, &qstr); if (!inum) goto err1; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a168d09ffd37..ef8505da7391 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -370,11 +370,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; u64 inum; - inum = bch2_dirent_lookup(c, dir->v.i_ino, - &dir->ei_str_hash, + inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, &dentry->d_name); if (inum) @@ -723,7 +723,7 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -1150,7 +1150,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_flags = 0; inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; - 
inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 8c2796aa7abf..f3072780af51 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -44,8 +44,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - struct bch_hash_info ei_str_hash; - /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5555d45df54e..5692b47eb3c9 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -121,6 +121,7 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, const char *name, void *buffer, size_t size, int type) { + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -128,8 +129,8 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_trans_init(&trans, c, 0, 0); - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, + inode->v.i_ino, &X_SEARCH(type, name, strlen(name)), 0); if (IS_ERR(iter)) { @@ -239,7 +240,7 @@ static int bch2_xattr_emit(struct dentry *dentry, } static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, struct xattr_buf *buf, bool all) { @@ -249,12 +250,12 @@ static int bch2_xattr_list_bcachefs(struct bch_fs *c, u64 v; for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(&inode->ei_inode, id); + v = bch2_inode_opt_get(inode, id); if (!v) continue; if (!all && - !(inode->ei_inode.bi_fields_set & (1 << id))) + !(inode->bi_fields_set & (1 << id))) continue; ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], @@ -298,11 +299,11 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) return ret; - ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) return ret; - ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) return ret; @@ -327,10 +328,10 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, - &inode->ei_str_hash, + bch2_xattr_set(&trans, inode->v.i_ino, &hash, name, value, size, handler->flags, flags)); } -- cgit From c8d94403de0534030e43efce5c2f1d6c2d79dcda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Mar 2021 20:29:28 -0500 Subject: bcachefs: Consolidate bch2_read_retry and bch2_read() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 124 +++++++++++++------------------------------------------ fs/bcachefs/io.h | 19 ++++++++- 2 files changed, 46 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index de3bd22edb5a..b402fc2e51d6 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1664,82 +1664,6 @@ err: goto out; } -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct 
bvec_iter bvec_iter, u64 inode, - struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { - unsigned bytes, sectors, offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - offset_into_extent = iter->pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - sectors = min(sectors, k.k->size - offset_into_extent); - - bch2_trans_unlock(&trans); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, - offset_into_extent, failed, flags); - switch (ret) { - case READ_RETRY: - goto retry; - case READ_ERR: - goto err; - }; - - if (bytes == bvec_iter.bi_size) - goto out; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - } - - if (ret == -EINTR) - goto retry; - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - bch_err_inum_ratelimited(c, inode, - "read error %i from btree lookup", ret); -err: - rbio->bio.bi_status = BLK_STS_IOERR; -out: - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - bch2_rbio_done(rbio); -} - static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = @@ -1762,10 +1686,14 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; - if (flags & BCH_READ_NODECODE) + if (flags & BCH_READ_NODECODE) { bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); - else - bch2_read_retry(c, rbio, iter, inode, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inode, &failed, flags); + } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -2270,6 +2198,9 @@ out: ret = READ_RETRY; } + if (!ret) + goto out_read_done; + return ret; } @@ -2296,23 +2227,17 @@ out_read_done: return 0; } -void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; struct bkey_buf sk; struct bkey_s_c k; - unsigned flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED; int ret; - BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); - BUG_ON(flags & BCH_READ_IN_RETRY); - - rbio->c = c; - rbio->start_time = local_clock(); bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); @@ -2320,13 +2245,13 @@ retry: bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode, rbio->bio.bi_iter.bi_sector), + POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, - POS(inode, rbio->bio.bi_iter.bi_sector)); + POS(inode, bvec_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); @@ -2358,19 +2283,26 @@ retry: */ 
bch2_trans_unlock(&trans); - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); - if (rbio->bio.bi_iter.bi_size == bytes) + if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, + offset_into_extent, failed, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; if (flags & BCH_READ_LAST_FRAGMENT) break; - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } out: bch2_trans_exit(&trans); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 8535e1f631be..1c0a444ea325 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -156,7 +156,24 @@ static inline void bch2_read_extent(struct btree_trans *trans, offset_into_extent, NULL, flags); } -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + u64, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inode) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + + __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) -- cgit From 5ff75ccbbc3f262158e5bf02c639539a4da93a43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Mar 2021 21:30:08 -0400 Subject: bcachefs: Fix read retry path for indirect extents In the read path, for retry of indirect extents to work we need to differentiate between the location in the btree the read was for, vs. the location where we found the data. This patch adds that plumbing to bch_read_bio. 
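As a made up example (all positions and numbers here are illustrative, not from a real trace): a read of a reflinked range of inode 42 is issued against the extents btree, but the data pointers are found in an indirect extent in the reflink btree, so after bch2_read_indirect_extent() the rbio carries both locations:

	rbio->read_pos   = POS(42, 8);		/* pos the read was issued at, in
						 * BTREE_ID_extents: used for error
						 * reporting */
	rbio->data_btree = BTREE_ID_reflink;	/* btree the data was found in */
	rbio->data_pos   = POS(0, 1000);	/* start of the indirect extent: what
						 * retry, promote and narrow_crcs use */

The retry path then re-traverses data_btree instead of assuming BTREE_ID_extents, and error messages report read_pos, the position the user actually asked for.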
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 ++++-- fs/bcachefs/fs.c | 4 +++- fs/bcachefs/io.c | 46 +++++++++++++++++++++++++--------------------- fs/bcachefs/io.h | 23 +++++++++++++---------- fs/bcachefs/io_types.h | 14 ++++++++++++-- fs/bcachefs/move.c | 8 +++++--- 6 files changed, 62 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4ccc9318a924..8584b90a3df9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -788,6 +788,7 @@ retry: while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); @@ -803,7 +804,7 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(trans, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) break; @@ -827,7 +828,8 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(trans, rbio, k, offset_into_extent, flags); + bch2_read_extent(trans, rbio, iter->pos, + data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ef8505da7391..1fafd393912c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -908,6 +908,8 @@ retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { + enum btree_id data_btree = BTREE_ID_extents; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_next(iter); @@ -920,7 +922,7 @@ retry: bch2_bkey_buf_reassemble(&cur, c, k); - ret = bch2_read_indirect_extent(&trans, + ret = bch2_read_indirect_extent(&trans, &data_btree, &offset_into_extent, &cur); if (ret) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b402fc2e51d6..425502f7b1b8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1627,8 +1627,8 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - rbio->pos, BTREE_ITER_SLOTS); + iter = bch2_trans_get_iter(&trans, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; @@ -1642,14 +1642,17 @@ retry: if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, - rbio->pos.offset - + rbio->data_pos.offset - rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1671,7 +1674,7 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + u64 inode = rbio->read_pos.inode; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1719,7 +1722,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; struct btree_iter *iter = NULL; struct bkey_i *new; @@ -1729,7 +1732,7 @@ static int 
__bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_extents, rbio->pos, + iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) @@ -1862,14 +1865,14 @@ csum_err: return; } - bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); return; decompression_err: - bch_err_inum_ratelimited(c, rbio->pos.inode, + bch_err_inum_ratelimited(c, rbio->read_pos.inode, "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); return; @@ -1892,13 +1895,9 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - /* - * XXX: rbio->pos is not what we want here when reading from indirect - * extents - */ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - rbio->pos.inode, - rbio->pos.offset, + rbio->read_pos.inode, + rbio->read_pos.offset, "data read error: %s", bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); @@ -1963,7 +1962,8 @@ err: } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { @@ -1973,7 +1973,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_dev *ca; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(k.k); + struct bpos data_pos = bkey_start_pos(k.k); int pick_ret; if (bkey_extent_is_inline_data(k.k)) { @@ -2049,7 +2049,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, pick.crc.offset || offset_into_extent)); - pos.offset += offset_into_extent; + data_pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; offset_into_extent = 0; @@ -2123,7 +2123,9 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; - rbio->pos = pos; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; rbio->version = k.k->version; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); @@ -2249,6 +2251,7 @@ retry: BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; bch2_btree_iter_set_pos(iter, POS(inode, bvec_iter.bi_sector)); @@ -2264,7 +2267,7 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(&trans, + ret = bch2_read_indirect_extent(&trans, &data_btree, &offset_into_extent, &sk); if (ret) goto err; @@ -2289,7 +2292,8 @@ retry: if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + data_btree, k, offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 1c0a444ea325..ccbd8c3e6642 100644 --- 
a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -121,12 +121,15 @@ int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, struct bkey_buf *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, unsigned *offset_into_extent, struct bkey_buf *k) { - return k->k->k.type == KEY_TYPE_reflink_p - ? __bch2_read_indirect_extent(trans, offset_into_extent, k) - : 0; + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); } enum bch_read_flags { @@ -143,17 +146,17 @@ enum bch_read_flags { }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bkey_s_c, unsigned, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned); static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bkey_s_c k, - unsigned offset_into_extent, - unsigned flags) + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, - offset_into_extent, NULL, flags); + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 65969eeac253..99b4b4c4a53b 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -58,8 +58,18 @@ struct bch_read_bio { struct bch_devs_list devs_have; struct extent_ptr_decoded pick; - /* start pos of data we read (may not be pos of data we want) */ - struct bpos pos; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; struct bversion version; struct promote_op *promote; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dfe7f05f39e9..3ff31d25f396 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -208,9 +208,9 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) BUG_ON(!m->op.wbio.bio.bi_vcnt); m->ptr = rbio->pick.ptr; - m->offset = rbio->pos.offset - rbio->pick.crc.offset; + m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; m->op.devs_have = rbio->devs_have; - m->op.pos = rbio->pos; + m->op.pos = rbio->data_pos; m->op.version = rbio->version; m->op.crc = rbio->pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; @@ -492,7 +492,9 @@ static int bch2_move_extent(struct btree_trans *trans, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, k, 0, + bch2_read_extent(trans, &io->rbio, + bkey_start_pos(k.k), + btree_id, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; -- cgit From 87a432f5d7ee2b2baef6d7e115ceafa18e80f3a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Mar 2021 17:26:19 -0400 Subject: bcachefs: Kill reflink option An option was added to control whether reflink support was on or off because for a long time, reflink + inline data extent support was missing - but that's since been fixed, so we can drop the option now. 
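With the option gone reflink is simply always on, and it is reached through the generic VFS clone path that ends up in bch2_remap_file_range(). For illustration, this is roughly how userspace exercises it; the snippet uses only the standard FICLONE ioctl (the mechanism behind cp --reflink), nothing bcachefs-specific, and the helper name reflink_copy() is made up for the example:

/*
 * Share src's extents with dst via the generic FICLONE ioctl instead of
 * copying data. On filesystems without reflink support the ioctl fails
 * with EOPNOTSUPP.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FICLONE */

static int reflink_copy(const char *src_path, const char *dst_path)
{
	int src = open(src_path, O_RDONLY);
	int dst = open(dst_path, O_WRONLY|O_CREAT|O_TRUNC, 0644);
	int ret = -1;

	if (src >= 0 && dst >= 0)
		ret = ioctl(dst, FICLONE, src);	/* clone the whole file */

	if (ret)
		perror("reflink_copy");	/* open or ioctl failure */
	if (src >= 0)
		close(src);
	if (dst >= 0)
		close(dst);
	return ret;
}
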
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/fs-io.c | 3 --- fs/bcachefs/opts.h | 5 ----- fs/bcachefs/reflink.c | 3 --- 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5d0e340c4dcb..df6961805f6f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1313,7 +1313,7 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); +/* bit 61 was reflink option */ LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); /* 61-64 unused */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8584b90a3df9..332795eb9ae8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2870,9 +2870,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, u64 aligned_len; loff_t ret = 0; - if (!c->opts.reflink) - return -EOPNOTSUPP; - if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 46f91f19dad4..42bf38922d46 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -213,11 +213,6 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ - x(reflink, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_REFLINK, true, \ - NULL, "Enable reflink support") \ x(degraded, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a2cc078597f2..e0eb2c66300c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -204,9 +204,6 @@ s64 bch2_remap_range(struct bch_fs *c, u64 src_done, dst_done; int ret = 0, ret2 = 0; - if (!c->opts.reflink) - return -EOPNOTSUPP; - if (!percpu_ref_tryget(&c->writes)) return -EROFS; -- cgit From 57447b7acccac0b7d75846ecfdfcd5a3421de3d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Mar 2021 21:18:50 -0400 Subject: bcachefs: Fix a btree iterator leak Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/reflink.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 35480ba43621..70828d8876f9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1992,7 +1992,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) char buf[100]; trans_for_each_iter(trans, iter) - printk(KERN_ERR "iter: btree %s pos %s%s%s%s %ps\n", + printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", bch2_btree_ids[iter->btree_id], (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), btree_iter_live(trans, iter) ? 
" live" : "", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e0eb2c66300c..e9a6a5f639b4 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -157,8 +157,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_update(trans, reflink_iter, r_v, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); - if (IS_ERR(r_p)) - return PTR_ERR(r_p); + if (IS_ERR(r_p)) { + ret = PTR_ERR(r_p); + goto err; + } orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); -- cgit From f2eaea2fc18cfe8ea4e98a7ff573f2886ae94098 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 01:52:55 -0400 Subject: bcachefs: Kill btree_iter_pos_changed() this is used in only one place now, so just inline it into the caller. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 70828d8876f9..c1081431a846 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1445,13 +1445,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* Iterate across keys (in leaf nodes only) */ -static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) +static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { + int cmp = bkey_cmp(new_pos, iter->real_pos); unsigned l = iter->level; if (!cmp) goto out; + iter->real_pos = new_pos; + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); iter->l[0].b = BTREE_ITER_NO_NODE_UP; @@ -1481,15 +1484,6 @@ out: btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); else btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -} - -static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) -{ - int cmp = bkey_cmp(new_pos, iter->real_pos); - - iter->real_pos = new_pos; - - btree_iter_pos_changed(iter, cmp); bch2_btree_iter_verify(iter); } -- cgit From 2c944fa12dc731bf43532d99a2bfbe8c4b7186e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 13:23:01 -0400 Subject: bcachefs: Add a print statement for when we go read-write Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7c23cae436bb..529d33f4a6d7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -400,6 +400,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) (!early || c->opts.read_only))) return -EROFS; + bch_info(c, "going read-write"); + ret = bch2_fs_mark_dirty(c); if (ret) goto err; -- cgit From ba401eaac380d9598b0a346290a77e43cb4b8211 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 16:30:01 -0400 Subject: bcachefs: Don't list non journal devs in journal_debug_to_text() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d5bbbf619359..22069c277c15 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1236,6 +1236,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; + if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) + continue; + if (!ja->nr) continue; -- cgit From 
dbb93db9721e8d94e3d1a3c2217fc6265aace811 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 16:32:46 -0400 Subject: bcachefs: Fix btree iterator leak in extent_handle_overwrites() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index bf2a2b0695b6..f8b493706c94 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -754,7 +754,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, enum btree_id btree_id, struct bpos start, struct bpos end) { - struct btree_iter *iter = NULL, *update_iter; + struct btree_iter *iter, *update_iter; struct bkey_i *update; struct bkey_s_c k; int ret = 0; @@ -767,8 +767,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, break; if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -776,6 +774,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_back(start, update); + update_iter = bch2_trans_copy_iter(trans, iter); update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); @@ -785,8 +784,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, } if (bkey_cmp(k.k->p, end) > 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -794,6 +791,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(end, update); + update_iter = bch2_trans_copy_iter(trans, iter); update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); @@ -801,8 +799,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (ret) goto err; } else { - update_iter = bch2_trans_copy_iter(trans, iter); - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -812,6 +808,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, update->k.type = KEY_TYPE_deleted; update->k.size = 0; + update_iter = bch2_trans_copy_iter(trans, iter); update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; bch2_btree_iter_set_pos(update_iter, update->k.p); ret = bch2_trans_update2(trans, update_iter, update); @@ -823,8 +820,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, k = bch2_btree_iter_next_with_updates(iter); } err: - if (!IS_ERR_OR_NULL(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, iter); return ret; } -- cgit From abcecb49f5f3d24ec865a6c5830f135e12f9299e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 22:34:54 -0400 Subject: bcachefs: Fsck code refactoring Change fsck code to always put btree iterators - also, make some flow control improvements to deal with lock restarts better, and refactor check_extents() to not walk extents twice for counting/checking i_sectors. 
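The shape these passes converge on is: a single explicitly-advanced iterator per walk, -EINTR treated as "restart the walk" rather than as a failure, and every iterator put before the transaction exits. A stripped-down sketch of that skeleton, using only helpers visible in this series (walk_btree_example() is a made-up name, the bcachefs headers are assumed, and the real passes below carry the actual checks):

/*
 * Illustrative skeleton of the iterator/restart pattern the fsck passes
 * are being converted to: peek, check, advance, and always put the
 * iterator before bch2_trans_exit(); -EINTR means a lock restart.
 */
static int walk_btree_example(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
				   POS(BCACHEFS_ROOT_INO, 0),
				   BTREE_ITER_INTENT);
retry:
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k))) {
		/* per-key checks and repairs go here */

		bch2_btree_iter_advance_pos(iter);
	}

	if (ret == -EINTR)		/* lock restart: resume the walk */
		goto retry;

	bch2_trans_iter_put(&trans, iter);	/* always put iterators */
	return bch2_trans_exit(&trans) ?: ret;
}

A later commit in this series makes leaked iterators a fatal error in debug builds, which is why the put-before-exit half of the pattern matters as much as the restart handling.
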
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +- fs/bcachefs/btree_iter.h | 2 + fs/bcachefs/fsck.c | 198 +++++++++++++++++++++++------------------------ 3 files changed, 102 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c1081431a846..711734f2023b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1496,7 +1496,7 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); } -static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) +inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = bkey_cmp(pos, POS_MAX) != 0; @@ -1507,7 +1507,7 @@ static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) return ret; } -static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) +inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = bkey_cmp(pos, POS_MIN) != 0; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bd0c429bd91a..76f0f8f3c125 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -175,6 +175,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); +bool bch2_btree_iter_advance_pos(struct btree_iter *); +bool bch2_btree_iter_rewind_pos(struct btree_iter *); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); /* Sort order for locking btree iterators: */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7f6b4ac48f3d..033d37891c60 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -319,7 +319,7 @@ static int hash_check_key(struct btree_trans *trans, bch_err(c, "hash_redo_key err %i", ret); return ret; } - return 1; + return -EINTR; } ret = hash_check_duplicates(trans, desc, h, k_iter, k); @@ -413,18 +413,10 @@ err_redo: goto err; } -static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) -{ - return bch2_btree_delete_range(c, BTREE_ID_extents, - POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), - POS(inode_nr + 1, 0), NULL); -} - -static int bch2_fix_overlapping_extent(struct btree_trans *trans, - struct btree_iter *iter, +static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { - struct btree_iter *u_iter; + struct btree_iter *iter; struct bkey_i *u; int ret; @@ -436,22 +428,24 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, bkey_reassemble(u, k); bch2_cut_front(cut_at, u); - u_iter = bch2_trans_copy_iter(trans, iter); /* - * We don't want to go through the - * extent_handle_overwrites path: + * We don't want to go through the extent_handle_overwrites path: + * + * XXX: this is going to screw up disk accounting, extent triggers + * assume things about extent overwrites - we should be running the + * triggers manually here */ - u_iter->flags &= ~BTREE_ITER_IS_EXTENTS; - bch2_btree_iter_set_pos(u_iter, u->k.p); + iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - /* - * XXX: this is going to leave disk space - * accounting slightly wrong - */ - ret = bch2_trans_update(trans, u_iter, u, 0); - bch2_trans_iter_put(trans, u_iter); - return ret; + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch2_trans_update(trans, iter, u, 
BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } /* @@ -466,7 +460,7 @@ static int check_extents(struct bch_fs *c) struct btree_iter *iter; struct bkey_s_c k; struct bkey_buf prev; - u64 i_sectors; + u64 i_sectors = 0; int ret = 0; bch2_bkey_buf_init(&prev); @@ -479,97 +473,86 @@ static int check_extents(struct bch_fs *c) POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT); retry: - for_each_btree_key_continue(iter, 0, k, ret) { - /* - * due to retry errors we might see the same extent twice: - */ - if (bkey_cmp(prev.k->k.p, k.k->p) && - bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (w.have_inode && + w.cur_inum != k.k->p.inode && + !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && + fsck_err_on(w.inode.bi_sectors != i_sectors, c, + "inode %llu has incorrect i_sectors: got %llu, should be %llu", + w.inode.bi_inum, + w.inode.bi_sectors, i_sectors)) { + struct btree_iter *inode_iter = + bch2_trans_get_iter(&trans, BTREE_ID_inodes, + POS(0, w.cur_inum), + BTREE_ITER_INTENT); + + w.inode.bi_sectors = i_sectors; + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_inode_write(&trans, inode_iter, &w.inode)); + bch2_trans_iter_put(&trans, inode_iter); + if (ret) + break; + } + + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; char buf2[200]; bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_fix_overlapping_extent(&trans, - iter, k, prev.k->k.p)); - if (ret) - goto err; - } + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; } - bch2_bkey_buf_reassemble(&prev, c, k); ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; + if (w.first_this_inode) + i_sectors = 0; + if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || + "extent type %u for missing inode %llu", + k.k->type, k.k->p.inode) || fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_trans_unlock(&trans); - - ret = bch2_inode_truncate(c, k.k->p.inode, 0); - if (ret) - goto err; - continue; + !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, + "extent type %u for non regular file, inode %llu mode %o", + k.k->type, k.k->p.inode, w.inode.bi_mode)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + POS(k.k->p.inode, 0), + POS(k.k->p.inode, U64_MAX), + NULL) ?: -EINTR; } - if (fsck_err_on(w.first_this_inode && - w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - w.inode.bi_sectors != - (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), - c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - struct bkey_inode_buf p; - - w.inode.bi_sectors = i_sectors; - - bch2_trans_unlock(&trans); - - bch2_inode_pack(c, &p, &w.inode); - - ret = bch2_btree_insert(c, BTREE_ID_inodes, - &p.inode.k_i, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - 
bch_err(c, "error in fsck: error %i updating inode", ret); - goto err; - } - - /* revalidate iterator: */ - k = bch2_btree_iter_peek(iter); + if (fsck_err_on(w.have_inode && + !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), + POS(k.k->p.inode, U64_MAX), + NULL) ?: -EINTR; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_trans_unlock(&trans); + if (bkey_extent_is_allocation(k.k)) + i_sectors += k.k->size; + bch2_bkey_buf_reassemble(&prev, c, k); - ret = bch2_inode_truncate(c, k.k->p.inode, - w.inode.bi_size); - if (ret) - goto err; - continue; - } + bch2_btree_iter_advance_pos(iter); } -err: fsck_err: if (ret == -EINTR) goto retry; + bch2_trans_iter_put(&trans, iter); bch2_bkey_buf_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } @@ -599,7 +582,8 @@ static int check_dirents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k, ret) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -718,6 +702,8 @@ retry: goto err; } + + bch2_btree_iter_advance_pos(iter); } hash_stop_chain(&trans, &h); @@ -726,6 +712,8 @@ fsck_err: if (ret == -EINTR) goto retry; + bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -751,7 +739,8 @@ static int check_xattrs(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k, ret) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -761,7 +750,7 @@ retry: k.k->p.inode)) { ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) - goto err; + break; continue; } @@ -771,12 +760,16 @@ retry: ret = hash_check_key(&trans, bch2_xattr_hash_desc, &h, iter, k); if (ret) - goto fsck_err; + break; + + bch2_btree_iter_advance_pos(iter); } -err: fsck_err: if (ret == -EINTR) goto retry; + + bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -1127,6 +1120,8 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, bch2_trans_cond_resched(&trans); } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); @@ -1279,8 +1274,10 @@ static int check_inode(struct btree_trans *trans, * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away */ - - ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + POS(u.bi_inum, U64_MAX), + 
NULL); if (ret) { bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; @@ -1392,10 +1389,11 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (nlinks_pos == iter->pos.offset) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_next(iter); + bch2_btree_iter_advance_pos(iter); bch2_trans_cond_resched(&trans); } fsck_err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret2) -- cgit From 8d956c2fb8f7e91370fea1d27d16c6869b8ada78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 22:54:18 -0400 Subject: bcachefs: btree_iter_set_dontneed() This is a bit clearer than using bch2_btree_iter_free(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_iter.h | 5 +++++ fs/bcachefs/btree_key_cache.c | 21 ++++++++++----------- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 711734f2023b..02a486e83881 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1972,7 +1972,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, if (IS_ERR_OR_NULL(iter)) return 0; - trans->iters_touched &= ~(1ULL << iter->idx); + set_btree_iter_dontneed(trans, iter); return bch2_trans_iter_put(trans, iter); } @@ -2133,7 +2133,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, * We don't need to preserve this iter since it's cheap to copy it * again - this will cause trans_iter_put() to free it right away: */ - trans->iters_touched &= ~(1ULL << iter->idx); + set_btree_iter_dontneed(trans, iter); return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 76f0f8f3c125..c839bfe6ffa4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -300,6 +300,11 @@ static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); } +static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) +{ + trans->iters_touched &= ~(1ULL << iter->idx); +} + #define TRANS_RESET_NOTRAVERSE (1 << 0) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 76f19f86c8ad..d7b4df4cff17 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -172,23 +172,21 @@ static int btree_key_cache_fill(struct btree_trans *trans, ck->key.pos, BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - bch2_trans_iter_put(trans, iter); - return ret; - } + if (ret) + goto err; if (!bch2_btree_node_relock(ck_iter, 0)) { - bch2_trans_iter_put(trans, iter); trace_transaction_restart_ip(trans->ip, _THIS_IP_); - return -EINTR; + ret = -EINTR; + goto err; } if (k.k->u64s > ck->u64s) { new_u64s = roundup_pow_of_two(k.k->u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { - bch2_trans_iter_put(trans, iter); - return -ENOMEM; + ret = -ENOMEM; + goto err; } } @@ -204,9 +202,10 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); /* We're not likely to need this iterator again: */ - bch2_trans_iter_free(trans, iter); - - return 0; + set_btree_iter_dontneed(trans, iter); +err: + bch2_trans_iter_put(trans, iter); + return ret; } static int bkey_cached_check_fn(struct six_lock *lock, void *p) -- cgit From 50dc0f692a0dbe3e6a95d3f8e5c7e718bc9f021d Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 20:29:11 -0400 Subject: bcachefs: Require all btree iterators to be freed We keep running into occasional bugs with btree transaction iterators overflowing - this will make those bugs more visible. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 20 ++++++++++------- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/btree_gc.c | 12 ++++++++--- fs/bcachefs/btree_io.c | 1 + fs/bcachefs/btree_iter.c | 17 +++++++++++++++ fs/bcachefs/debug.c | 4 ++++ fs/bcachefs/dirent.c | 3 +++ fs/bcachefs/ec.c | 4 +++- fs/bcachefs/extents.c | 2 ++ fs/bcachefs/fs-io.c | 49 +++++++++++++++++++++++------------------- fs/bcachefs/fs.c | 3 +++ fs/bcachefs/fsck.c | 3 ++- fs/bcachefs/inode.c | 1 + fs/bcachefs/io.c | 38 ++++++++++++++++---------------- fs/bcachefs/migrate.c | 9 ++++++-- fs/bcachefs/move.c | 3 +++ fs/bcachefs/quota.c | 7 +++++- fs/bcachefs/reflink.c | 26 +++++++++------------- fs/bcachefs/tests.c | 28 ++++++++++++++++++------ fs/bcachefs/xattr.c | 18 +++++++++------- 20 files changed, 163 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3879815bcede..afb9562be2b2 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -241,12 +241,12 @@ retry: } xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); - acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); + bch2_trans_iter_put(&trans, iter); out: bch2_trans_exit(&trans); return acl; @@ -313,7 +313,7 @@ retry: if (type == ACL_TYPE_ACCESS) { ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); if (ret) - goto err; + goto btree_err; } hash_info = bch2_hash_info_init(c, &inode_u); @@ -330,6 +330,8 @@ retry: &inode->ei_journal_seq, BTREE_INSERT_NOUNLOCK); btree_err: + bch2_trans_iter_put(&trans, inode_iter); + if (ret == -EINTR) goto retry; if (unlikely(ret)) @@ -356,21 +358,22 @@ int bch2_acl_chmod(struct btree_trans *trans, struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; - int ret = 0; + int ret; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash_info, inode->bi_inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0; + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret == -ENOENT ? 
0 : ret; xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); - acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); + ret = PTR_ERR_OR_ZERO(acl); + if (ret || !acl) + goto err; ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) @@ -387,6 +390,7 @@ int bch2_acl_chmod(struct btree_trans *trans, *new_acl = acl; acl = NULL; err: + bch2_trans_iter_put(trans, iter); kfree(acl); return ret; } diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index add04dcb849b..e2200cedecca 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -385,7 +385,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -405,6 +404,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) } } err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -926,7 +926,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) int ret = 0; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS(ca->dev_idx, 0), BTREE_ITER_CACHED| @@ -942,6 +941,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); /* If we used NOWAIT, don't return the error: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e8cdc82d3451..a303cd376d4b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -456,6 +456,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_trans_cond_resched(&trans); } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -1212,6 +1214,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_btree_iter_next(iter); } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1509,6 +1512,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *b; bool kthread = (current->flags & PF_KTHREAD) != 0; unsigned i; + int ret = 0; /* Sliding window of adjacent btree nodes */ struct btree *merge[GC_MERGE_NODES]; @@ -1557,8 +1561,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) lock_seq[0] = merge[0]->c.lock.state.seq; if (kthread && kthread_should_stop()) { - bch2_trans_exit(&trans); - return -ESHUTDOWN; + ret = -ESHUTDOWN; + break; } bch2_trans_cond_resched(&trans); @@ -1573,7 +1577,9 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) memset(merge + 1, 0, (GC_MERGE_NODES - 1) * sizeof(merge[0])); } - return bch2_trans_exit(&trans); + bch2_trans_iter_put(&trans, iter); + + return bch2_trans_exit(&trans) ?: ret; } /** diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d547bfabf09f..7ec14cd8f02b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1208,6 +1208,7 @@ retry: if (ret) goto err; out: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); bio_put(&wbio->wbio.bio); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 02a486e83881..00140ae50cb9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -9,6 +9,7 @@ #include "btree_locking.h" #include 
"btree_update.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "trace.h" @@ -2116,6 +2117,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->ip_allocated = _RET_IP_; return iter; } @@ -2224,6 +2226,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } + bch2_trans_cond_resched(trans); + if (!(flags & TRANS_RESET_NOTRAVERSE)) bch2_btree_iter_traverse_all(trans); } @@ -2290,6 +2294,19 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG + if (trans->iters_live) { + struct btree_iter *iter; + + bch_err(c, "btree iterators leaked!"); + trans_for_each_iter(trans, iter) + if (btree_iter_live(trans, iter)) + printk(KERN_ERR " btree %s allocated at %pS\n", + bch2_btree_ids[iter->btree_id], + (void *) iter->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); + } + mutex_lock(&trans->c->btree_trans_lock); list_del(&trans->list); mutex_unlock(&trans->c->btree_trans_lock); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 3ac700dc72d5..2c2d58514c68 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -242,6 +242,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; @@ -294,6 +296,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index b0625176ab35..592dd80cf963 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -321,6 +321,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, k = bch2_btree_iter_peek_slot(iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + bch2_trans_iter_put(&trans, iter); out: bch2_trans_exit(&trans); return inum; @@ -379,6 +380,8 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) break; ctx->pos = dirent.k->p.offset + 1; } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; return ret; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f61d4c873a82..bdce37981c5c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, if (ret) break; } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1663,12 +1664,13 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) int ret = 0; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); k = bch2_btree_iter_prev(iter); if (!IS_ERR_OR_NULL(k.k)) idx = k.k->p.offset + 1; + + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3fe9ef50f5c0..8cf45b7b9459 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -687,6 +687,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, break; } } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 332795eb9ae8..8891207c46a9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c 
@@ -866,7 +866,6 @@ void bch2_readahead(struct readahead_control *ractl) BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS); @@ -895,6 +894,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); } @@ -918,6 +918,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bchfs_read(&trans, iter, rbio, inum, NULL); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); } @@ -2155,6 +2156,7 @@ static inline int range_has_data(struct bch_fs *c, break; } } + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -2325,6 +2327,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) bch2_trans_init(&trans, c, 0, 0); iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret) @@ -2459,14 +2462,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct btree_iter *src, *dst, *del; loff_t shift, new_size; u64 src_start; - int ret; + int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); - /* * We need i_mutex to keep the page cache consistent with the extents * btree, and the btree consistent with i_size - we don't need outside @@ -2522,13 +2522,15 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, goto err; } + bch2_bkey_buf_init(©); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); src = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); dst = bch2_trans_copy_iter(&trans, src); del = bch2_trans_copy_iter(&trans, src); - while (1) { + while (ret == 0 || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -2542,7 +2544,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ? 
bch2_btree_iter_peek_prev(src) : bch2_btree_iter_peek(src); if ((ret = bkey_err(k))) - goto bkey_err; + continue; if (!k.k || k.k->p.inode != inode->v.i_ino) break; @@ -2562,7 +2564,7 @@ reassemble: ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); if (ret) - goto bkey_err; + continue; if (bkey_cmp(atomic_end, copy.k->k.p)) { if (insert) { @@ -2605,18 +2607,18 @@ reassemble: &inode->ei_journal_seq, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); -bkey_err: + if (!ret) bch2_btree_iter_set_pos(src, next_pos); - - if (ret == -EINTR) - ret = 0; - if (ret) - goto err; - - bch2_trans_cond_resched(&trans); } - bch2_trans_unlock(&trans); + bch2_trans_iter_put(&trans, del); + bch2_trans_iter_put(&trans, dst); + bch2_trans_iter_put(&trans, src); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(©, c); + + if (ret) + goto err; if (!insert) { i_size_write(&inode->v, new_size); @@ -2626,8 +2628,6 @@ bkey_err: mutex_unlock(&inode->ei_update_lock); } err: - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(©, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; @@ -2682,7 +2682,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); - while (bkey_cmp(iter->pos, end_pos) < 0) { + while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; @@ -2746,9 +2746,11 @@ bkey_err: bch2_disk_reservation_put(c, &disk_res); if (ret == -EINTR) ret = 0; - if (ret) - goto err; } + bch2_trans_iter_put(&trans, iter); + + if (ret) + goto err; /* * Do we need to extend the file? @@ -2770,6 +2772,7 @@ bkey_err: ret = PTR_ERR_OR_ZERO(inode_iter); } while (ret == -EINTR); + bch2_trans_iter_put(&trans, inode_iter); bch2_trans_unlock(&trans); if (ret) @@ -3015,6 +3018,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) } else if (k.k->p.offset >> 9 > isize) break; } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3118,6 +3122,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) offset = max(offset, bkey_start_offset(k.k) << 9); } } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 1fafd393912c..3acda0389da8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -734,6 +734,8 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); btree_err: + bch2_trans_iter_put(&trans, inode_iter); + if (ret == -EINTR) goto retry; if (unlikely(ret)) @@ -961,6 +963,7 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 033d37891c60..f8e0b24d087a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1485,11 +1485,12 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) BCH_INODE_I_SECTORS_DIRTY| BCH_INODE_UNLINKED)) { ret = check_inode(&trans, NULL, iter, inode, NULL); - BUG_ON(ret == -EINTR); if (ret) break; } } + bch2_trans_iter_put(&trans, iter); + BUG_ON(ret == -EINTR); return bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a3d2bae0a652..aec0fc9228a3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -620,6 +620,7 @@ retry: ret = bch2_trans_commit(&trans, NULL, 
NULL, BTREE_INSERT_NOFAIL); + bch2_trans_iter_put(&trans, iter); err: if (ret == -EINTR) goto retry; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 425502f7b1b8..b841b3da2510 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -414,6 +414,8 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ret = bch2_fpunch_at(&trans, iter, POS(inum, end), journal_seq, i_sectors_delta); + + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret == -EINTR) @@ -460,6 +462,7 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1659,6 +1662,7 @@ retry: goto err; out: bch2_rbio_done(rbio); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return; @@ -2259,7 +2263,7 @@ retry: k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - goto err; + break; offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); @@ -2270,7 +2274,7 @@ retry: ret = bch2_read_indirect_extent(&trans, &data_btree, &offset_into_extent, &sk); if (ret) - goto err; + break; k = bkey_i_to_s_c(sk.k); @@ -2295,12 +2299,8 @@ retry: ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, data_btree, k, offset_into_extent, failed, flags); - switch (ret) { - case READ_RETRY: - goto retry; - case READ_ERR: - goto err; - }; + if (ret) + break; if (flags & BCH_READ_LAST_FRAGMENT) break; @@ -2308,19 +2308,19 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } -out: - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - if (ret == -EINTR) + bch2_trans_iter_put(&trans, iter); + + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch_err_inum_ratelimited(c, inode, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - goto out; + if (ret) { + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 1db2c2d6b970..4d8b4169923d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -88,6 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&sk, c); @@ -135,20 +136,24 @@ retry: dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); - goto err; + break; } ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); + ret = 0; goto retry; } if (ret) { bch_err(c, "Error updating btree node key: %i", ret); - goto err; + break; } } bch2_trans_iter_free(&trans, iter); + + if (ret) + goto err; } /* flush relevant btree updates */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3ff31d25f396..f7b0764d9c98 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -195,6 +195,7 @@ nomatch: goto next; } out: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); @@ -641,6 +642,8 @@ next_nondata: bch2_trans_cond_resched(&trans); } out: + + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) 
?: ret; bch2_bkey_buf_exit(&sk, c); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8e272519ce0e..35b409e0f366 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -372,6 +372,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) if (ret) break; } + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -449,6 +450,8 @@ int bch2_fs_quota_read(struct bch_fs *c) KEY_TYPE_QUOTA_NOCHECK); } } + bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; } @@ -739,7 +742,9 @@ static int bch2_set_quota_trans(struct btree_trans *trans, if (qdq->d_fieldmask & QC_INO_HARD) new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - return bch2_trans_update(trans, iter, &new_quota->k_i, 0); + ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); + bch2_trans_iter_put(trans, iter); + return ret; } static int bch2_set_quota(struct super_block *sb, struct kqid qid, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e9a6a5f639b4..0978ad92614c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -223,20 +223,18 @@ s64 bch2_remap_range(struct bch_fs *c, dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while (1) { + while (ret == 0 || ret == -EINTR) { bch2_trans_begin(&trans); - trans.mem_top = 0; - if (fatal_signal_pending(current)) { ret = -EINTR; - goto err; + break; } src_k = get_next_src(src_iter, src_end); ret = bkey_err(src_k); if (ret) - goto btree_err; + continue; src_done = bpos_min(src_iter->pos, src_end).offset - src_start.offset; @@ -245,8 +243,6 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, journal_seq, i_sectors_delta); - if (ret) - goto btree_err; continue; } @@ -265,7 +261,7 @@ s64 bch2_remap_range(struct bch_fs *c, ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k); if (ret) - goto btree_err; + continue; BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); } @@ -294,20 +290,16 @@ s64 bch2_remap_range(struct bch_fs *c, NULL, journal_seq, new_i_size, i_sectors_delta); if (ret) - goto btree_err; + continue; dst_done = dst_iter->pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(src_iter, src_want); -btree_err: - if (ret == -EINTR) - ret = 0; - if (ret) - goto err; } + bch2_trans_iter_put(&trans, dst_iter); + bch2_trans_iter_put(&trans, src_iter); - BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); -err: + BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); dst_done = dst_iter->pos.offset - dst_start.offset; @@ -329,6 +321,8 @@ err: ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, journal_seq, 0); } + + bch2_trans_iter_put(&trans, inode_iter); } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index f25a27f26202..286587a118fe 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr) goto err; } err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) goto err; } err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -113,7 +115,7 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter 
*iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; int ret = 0; @@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(i); err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -166,7 +169,7 @@ err: static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; int ret = 0; @@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) BUG_ON(i); err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i); i += 2; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr * 2); @@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } + bch2_trans_iter_put(&trans, iter); err: bch2_trans_exit(&trans); return ret; @@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr); @@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } + bch2_trans_iter_put(&trans, iter); err: bch2_trans_exit(&trans); return 0; @@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return 0; } @@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return 0; } @@ -508,7 +518,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -549,7 +559,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); return ret; } @@ -630,6 +640,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) if (++i == nr) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } @@ -645,6 +657,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } @@ -671,6 +685,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) break; } } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5692b47eb3c9..f18a795620d8 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -133,12 +133,9 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, inode->v.i_ino, &X_SEARCH(type, name, strlen(name)), 0); - if (IS_ERR(iter)) { - bch2_trans_exit(&trans); - BUG_ON(PTR_ERR(iter) == -EINTR); - - return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); - } + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + goto err; xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ret = le16_to_cpu(xattr.v->x_val_len); @@ -148,9 +145,12 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, else memcpy(buffer, xattr_val(xattr.v), ret); } - + bch2_trans_iter_put(&trans, iter); +err: bch2_trans_exit(&trans); - return ret; + + BUG_ON(ret == -EINTR); + return ret == -ENOENT ? -ENODATA : ret; } int bch2_xattr_set(struct btree_trans *trans, u64 inum, @@ -294,6 +294,8 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) -- cgit From e9895f0ab950c0f37f9bb8ad7117f2abb2590411 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 20:40:31 -0400 Subject: bcachefs: Assert that iterators aren't being double freed Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/buckets.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 00140ae50cb9..6e860d47da4f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1956,6 +1956,7 @@ int bch2_trans_iter_put(struct btree_trans *trans, return 0; BUG_ON(trans->iters + iter->idx != iter); + BUG_ON(!btree_iter_live(trans, iter)); ret = btree_iter_err(iter); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7bf2fded816f..df839021fd3d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1500,6 +1500,10 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, bkey_cmp(pos, i->k->k.p) < 0 : !bkey_cmp(pos, i->iter->pos))) { *k = bkey_i_to_s_c(i->k); + + /* ugly hack.. */ + BUG_ON(btree_iter_live(trans, i->iter)); + trans->iters_live |= 1ULL << i->iter->idx; return i->iter; } -- cgit From 7e6dbac98205ab32a6a924199f42e6496c4149db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 23:19:05 -0400 Subject: bcachefs: Kill bkey ops->debugcheck method This code used to be used for running some assertions on alloc info at runtime, but it long predates fsck and hasn't been good for much in ages - we can delete it now. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 5 --- fs/bcachefs/bkey_methods.h | 1 - fs/bcachefs/btree_gc.c | 4 --- fs/bcachefs/extents.c | 83 ---------------------------------------------- fs/bcachefs/extents.h | 5 --- 5 files changed, 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 79e249f49971..878befb5b9ef 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -149,7 +149,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; const char *invalid; BUG_ON(!k.k->u64s); @@ -161,11 +160,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); - return; } - - if (ops->key_debugcheck) - ops->key_debugcheck(c, k); } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 0bca725ae3b8..bfa6f112aeed 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -26,7 +26,6 @@ struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a303cd376d4b..f75562bf8e21 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -389,8 +389,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, &k, max_stale, initial); if (ret) @@ -491,8 +489,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_debugcheck(c, b, k); - BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8cf45b7b9459..76b2459d8e73 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -163,46 +163,6 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - mark = ptr_bucket_mark(ca, ptr); - - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; - - err = "inconsistent"; - if (mark.data_type != BCH_DATA_btree || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; - } -out: - percpu_up_read(&c->mark_lock); - return; -err: - bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), 
buf), - PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); - goto out; -} - void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -246,49 +206,6 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_inconsistent_on(stale > 96, c, - "key too stale: %i", stale); - - bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_user || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } - - percpu_up_read(&c->mark_lock); -} - void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3988315fc404..2ee50a24501e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -368,7 +368,6 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -379,14 +378,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -395,14 +392,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); enum merge_result bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ - .key_debugcheck = bch2_extent_debugcheck, \ .val_to_text = bch2_extent_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ -- cgit From d361a26d02b97e3f3c0e1563b62a5f7c32ef2f04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Mar 2021 16:37:24 -0400 Subject: bcachefs: Don't overwrite snapshot field in 
bch2_cut_back() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 76b2459d8e73..2be49f443eb0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1183,7 +1183,7 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) len = where.offset - bkey_start_offset(k.k); - k.k->p = where; + k.k->p.offset = where.offset; k.k->size = len; if (!len) { -- cgit From 84cc758d6b19ae8a92389306fe3510e58a08d90c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 16:03:23 -0400 Subject: bcachefs: Validate bset version field against sb version fields The superblock version fields need to be accurate to know whether a filesystem is supported, thus we should be verifying them. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_io.c | 20 ++++++++++++++++++++ fs/bcachefs/super-io.c | 1 + 3 files changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9f4e7a3ada36..51aefecb5cbb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -592,6 +592,7 @@ struct bch_fs { __uuid_t user_uuid; u16 version; + u16 version_min; u16 encoded_extent_max; u8 nr_devices; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7ec14cd8f02b..adeb4f9fb5fd 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -560,6 +560,26 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FATAL, c, ca, b, i, "unsupported bset version"); + if (btree_err_on(version < c->sb.version_min, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + if (btree_err_on(version > c->sb.version, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6e61cf5ab217..e397a2a70c9c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -369,6 +369,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_min = le16_to_cpu(src->version_min); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -- cgit From 73590619ec1b557c2dc64825d8d22f82f79ec8db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 16:20:40 -0400 Subject: bcachefs: Don't unconditially version_upgrade in initialize This is mkfs's job. Also, clean up the handling of feature bits some. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 24 +----------------------- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/recovery.c | 36 +++++++++++++++--------------------- 3 files changed, 18 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e2200cedecca..be86e36e816a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -114,25 +114,6 @@ static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, #undef x } -static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) -{ - struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); - void *d = a->v.data; - unsigned bytes, idx = 0; - - a->k.p = POS(src.dev, src.bucket); - a->v.fields = 0; - a->v.gen = src.gen; - -#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); - BCH_ALLOC_FIELDS_V1() -#undef x - bytes = (void *) d - (void *) &a->v; - set_bkey_val_bytes(&a->k, bytes); - memset_u64s_tail(&a->v, 0, bytes); -} - static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { @@ -225,10 +206,7 @@ void bch2_alloc_pack(struct bch_fs *c, struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) - bch2_alloc_pack_v2(dst, src); - else - bch2_alloc_pack_v1(dst, src); + bch2_alloc_pack_v2(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index df6961805f6f..111f7d3c312e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1380,6 +1380,7 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << BCH_FEATURE_alloc_v2)|\ (1ULL << BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ @@ -1387,8 +1388,7 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); (1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)| \ - (1ULL << BCH_FEATURE_alloc_v2)) + (1ULL << BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 03a25dd5acc6..92f7568175eb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -991,11 +991,17 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; } if (!c->replicas.entries || @@ -1061,13 +1067,6 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; - } - if 
(c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); @@ -1228,9 +1227,6 @@ use_clean: mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { - if (c->sb.version < bcachefs_metadata_version_new_versioning) - c->disk_sb.sb->version_min = - le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; write_sb = true; @@ -1288,19 +1284,17 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); mutex_lock(&c->sb_lock); - for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, 0); - mutex_unlock(&c->sb_lock); - - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = c->disk_sb.sb->version_min = - le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; - bch2_write_super(c); + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + bch2_write_super(c); + } + + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -- cgit From 5c1ec980f9984983c90dd08754ad2c28fec1acf1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 22:05:39 -0400 Subject: bcachefs: Fix iterator picking comparison was wrong Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6e860d47da4f..97d806011bfd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2058,7 +2058,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, continue; if (best && - bkey_cmp(bpos_diff(best->pos, pos), + bkey_cmp(bpos_diff(best->real_pos, pos), bpos_diff(iter->real_pos, pos)) < 0) continue; -- cgit From 4ce41957a7370e398dd1ae960e5184af8315de35 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 22:13:30 -0400 Subject: bcachefs: Optimize bch2_btree_iter_verify_level() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 97d806011bfd..073157f5fbed 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -492,9 +492,9 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) static void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned level) { - struct btree_iter_level *l = &iter->l[level]; - struct btree_node_iter tmp = l->iter; - bool locked = btree_node_locked(iter, level); + struct btree_iter_level *l; + struct btree_node_iter tmp; + bool locked; struct bkey_packed *p, *k; char buf1[100], buf2[100], buf3[100]; const char *msg; @@ -502,6 +502,10 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_debug_check_iterators) return; + l = &iter->l[level]; + tmp = l->iter; + locked = btree_node_locked(iter, level); + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { if (!level) 
bch2_btree_iter_verify_cached(iter); -- cgit From 4cfb722ca16d89ada724c142201fc86872283a90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 21:04:57 -0400 Subject: bcachefs: Switch extent_handle_overwrites() to one key at a time Prep work for snapshots Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 103 ++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f8b493706c94..e76916cffd5b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -690,8 +690,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static int __bch2_trans_update2(struct btree_trans *trans, - struct btree_insert_entry n) +static void __bch2_trans_update2(struct btree_trans *trans, + struct btree_insert_entry n) { struct btree_insert_entry *i; @@ -711,15 +711,13 @@ static int __bch2_trans_update2(struct btree_trans *trans, else array_insert_item(trans->updates2, trans->nr_updates2, i - trans->updates2, n); - - return 0; } -static int bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) { - return __bch2_trans_update2(trans, (struct btree_insert_entry) { + __bch2_trans_update2(trans, (struct btree_insert_entry) { .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, @@ -745,82 +743,81 @@ static int extent_update_to_keys(struct btree_trans *trans, BTREE_ITER_NOT_EXTENTS); n.is_extent = false; - ret = __bch2_trans_update2(trans, n); + __bch2_trans_update2(trans, n); bch2_trans_iter_put(trans, n.iter); - return ret; + return 0; } static int extent_handle_overwrites(struct btree_trans *trans, enum btree_id btree_id, - struct bpos start, struct bpos end) + struct bkey_i *insert) { struct btree_iter *iter, *update_iter; + struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, btree_id, start, + BTREE_ITER_INTENT); k = bch2_btree_iter_peek_with_updates(iter); while (k.k && !(ret = bkey_err(k))) { - if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) break; if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; bkey_reassemble(update, k); + bch2_cut_back(start, update); - update_iter = bch2_trans_copy_iter(trans, iter); - update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; - bch2_btree_iter_set_pos(update_iter, update->k.p); - ret = bch2_trans_update2(trans, update_iter, update); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); - if (ret) - goto err; } - if (bkey_cmp(k.k->p, end) > 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (bkey_cmp(k.k->p, insert->k.p) < 0 || + (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; - bkey_reassemble(update, k); - 
bch2_cut_front(end, update); + bkey_init(&update->k); + update->k.p = k.k->p; - update_iter = bch2_trans_copy_iter(trans, iter); - update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; - bch2_btree_iter_set_pos(update_iter, update->k.p); - ret = bch2_trans_update2(trans, update_iter, update); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); - if (ret) - goto err; - } else { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + } + + if (bkey_cmp(k.k->p, insert->k.p) > 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; - update->k = *k.k; - set_bkey_val_u64s(&update->k, 0); - update->k.type = KEY_TYPE_deleted; - update->k.size = 0; + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); - update_iter = bch2_trans_copy_iter(trans, iter); - update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; - bch2_btree_iter_set_pos(update_iter, update->k.p); - ret = bch2_trans_update2(trans, update_iter, update); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); - if (ret) - goto err; + break; } k = bch2_btree_iter_next_with_updates(iter); } -err: bch2_trans_iter_put(trans, iter); + return ret; } @@ -885,24 +882,16 @@ int __bch2_trans_commit(struct btree_trans *trans) /* Turn extents updates into keys: */ trans_for_each_update(trans, i) if (i->is_extent) { - struct bpos start = bkey_start_pos(&i->k->k); - - while (i + 1 < trans->updates + trans->nr_updates && - i[0].btree_id == i[1].btree_id && - !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) - i++; - - ret = extent_handle_overwrites(trans, i->btree_id, - start, i->k->k.p); - if (ret) + ret = extent_handle_overwrites(trans, i->btree_id, i->k); + if (unlikely(ret)) goto out; } trans_for_each_update(trans, i) { ret = i->is_extent ? extent_update_to_keys(trans, *i) - : __bch2_trans_update2(trans, *i); - if (ret) + : (__bch2_trans_update2(trans, *i), 0); + if (unlikely(ret)) goto out; } -- cgit From cb16bfaa86f09fae33a712510aa51a03ca370d63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 00:03:34 -0400 Subject: bcachefs: Get disk reservation when overwriting data in old snapshot Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b841b3da2510..5ee9a6c2f4fd 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -224,9 +224,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); - *disk_sectors_delta += sectors * - (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - - bch2_bkey_nr_ptrs_fully_allocated(old)); + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; if (!*should_check_enospc && (new_replicas > bch2_bkey_replicas(c, old) || -- cgit From e0ba3b6429a4b5995b06dc46afdf4d3530d156bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 16:55:25 -0400 Subject: bcachefs: Replace bch2_btree_iter_next() calls with bch2_btree_iter_advance The way btree iterators work internally has been changing, particularly with the iter->real_pos changes, and bch2_btree_iter_next() is no longer hyper optimized - it's just advance followed by peek, so it's more efficient to just call advance where we're not using the return value of bch2_btree_iter_next(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_iter.c | 16 ++++++++-------- fs/bcachefs/btree_iter.h | 4 ++-- fs/bcachefs/debug.c | 2 +- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 8 ++++---- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 2 +- 9 files changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f75562bf8e21..483360fbda18 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1208,7 +1208,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) } } - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); } bch2_trans_iter_put(&trans, iter); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 073157f5fbed..d6ef08b27858 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1501,7 +1501,7 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); } -inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) +inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = bkey_cmp(pos, POS_MAX) != 0; @@ -1512,7 +1512,7 @@ inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) return ret; } -inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) +inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = bkey_cmp(pos, POS_MIN) != 0; @@ -1637,7 +1637,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - if (!bch2_btree_iter_advance_pos(iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; return bch2_btree_iter_peek(iter); @@ -1691,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) k = __bch2_btree_iter_peek_with_updates(iter); if (k.k && bkey_deleted(k.k)) { - if (!bch2_btree_iter_advance_pos(iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; continue; } @@ -1716,7 +1716,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) { - if (!bch2_btree_iter_advance_pos(iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; return bch2_btree_iter_peek_with_updates(iter); @@ -1793,7 +1793,7 @@ no_key: */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - if (!bch2_btree_iter_rewind_pos(iter)) + if (!bch2_btree_iter_rewind(iter)) return bkey_s_c_null; return bch2_btree_iter_peek_prev(iter); @@ -1885,7 +1885,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct 
btree_iter *iter) { - if (!bch2_btree_iter_advance_pos(iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; return bch2_btree_iter_peek_slot(iter); @@ -1893,7 +1893,7 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) { - if (!bch2_btree_iter_rewind_pos(iter)) + if (!bch2_btree_iter_rewind(iter)) return bkey_s_c_null; return bch2_btree_iter_peek_slot(iter); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c839bfe6ffa4..1276d8aaf652 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -175,8 +175,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -bool bch2_btree_iter_advance_pos(struct btree_iter *); -bool bch2_btree_iter_rewind_pos(struct btree_iter *); +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); /* Sort order for locking btree iterators: */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 2c2d58514c68..8b837ac69d74 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -356,7 +356,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); i->from = iter->pos; err = flush_buf(i); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index bdce37981c5c..370f9e6916f3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -842,13 +842,13 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } block = bkey_matches_stripe(&s->key.v, k); if (block < 0) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3acda0389da8..77db405e3418 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -914,7 +914,7 @@ retry: if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f8e0b24d087a..ffb30ef7ef00 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -547,7 +547,7 @@ retry: i_sectors += k.k->size; bch2_bkey_buf_reassemble(&prev, c, k); - bch2_btree_iter_advance_pos(iter); + bch2_btree_iter_advance(iter); } fsck_err: if (ret == -EINTR) @@ -703,7 +703,7 @@ retry: } - bch2_btree_iter_advance_pos(iter); + bch2_btree_iter_advance(iter); } hash_stop_chain(&trans, &h); @@ -762,7 +762,7 @@ retry: if (ret) break; - bch2_btree_iter_advance_pos(iter); + bch2_btree_iter_advance(iter); } fsck_err: if (ret == -EINTR) @@ -1389,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (nlinks_pos == iter->pos.offset) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_advance_pos(iter); + bch2_btree_iter_advance(iter); bch2_trans_cond_resched(&trans); } fsck_err: diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 4d8b4169923d..ef69a19f494a 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -53,7 +53,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - 
bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f7b0764d9c98..87307670fd4a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -638,7 +638,7 @@ next: atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); bch2_trans_cond_resched(&trans); } out: -- cgit From 345ca825e7d7e76211a72e13501e6504d22369f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 18:09:02 -0400 Subject: bcachefs: Have btree_iter_next_node() use btree_iter_set_search_pos() btree node iterators need to obey the regular btree node invarionts w.r.t. iter->real_pos; once they do, bch2_btree_iter_traverse will have less that it needs to check. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d6ef08b27858..572e553d55b1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -16,6 +16,8 @@ #include +static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -1144,11 +1146,6 @@ err: return ret; } -static void btree_iter_up(struct btree_iter *iter) -{ - btree_node_unlock(iter, iter->level++); -} - static int btree_iter_traverse_one(struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) @@ -1400,11 +1397,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_cond_resched(iter->trans); - btree_iter_up(iter); - - if (!bch2_btree_node_relock(iter, iter->level)) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; + iter->level++; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; @@ -1419,20 +1416,15 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ + btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); - /* - * We don't really want to be unlocking here except we can't - * directly tell btree_iter_traverse() "traverse to this level" - * except by setting iter->level, so we have to unlock so we - * don't screw up our lock invariants: - */ - if (btree_node_read_locked(iter, iter->level)) - btree_node_unlock(iter, iter->level); - - iter->pos = iter->real_pos = bkey_successor(iter->pos); - iter->level = iter->min_depth; + /* Unlock to avoid screwing up our lock invariants: */ + btree_node_unlock(iter, iter->level); + iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; -- cgit From 5cde51cd480906e2a5b3cfdc7f24369ee5a63e54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 17:09:55 -0400 Subject: bcachefs: Iterators are now always consistent with iter->real_pos This means bch2_btree_iter_traverse_one() can be made more efficient. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 572e553d55b1..c7fa80bf8b9c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1234,9 +1234,9 @@ static inline bool btree_iter_good_node(struct btree_iter *iter, !bch2_btree_node_relock(iter, l)) return false; - if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) return false; - if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) return false; return true; } @@ -1287,24 +1287,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - /* - * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos - * here unnecessary - */ iter->level = btree_iter_up_until_good_node(iter, 0); - /* - * If we've got a btree node locked (i.e. we aren't about to relock the - * root) - advance its node iterator if necessary: - * - * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary - */ - if (is_btree_node(iter, iter->level)) { - BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); - - btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); - } - /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, -- cgit From 07fc72e103a6912fd1c7fe1dd8f6f29efbd07164 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 17:01:34 -0400 Subject: bcachefs: Kill btree_iter_peek_uptodate() Since we're no longer doing next() immediately followed by peek(), this optimization isn't doing anything anymore. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 51 ------------------------------------------------ 1 file changed, 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c7fa80bf8b9c..6c24e499450b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1346,9 +1346,6 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); bch2_btree_iter_verify(iter); - if (iter->uptodate == BTREE_ITER_UPTODATE) - return iter->l[iter->level].b; - ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; @@ -1360,7 +1357,6 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); iter->pos = iter->real_pos = b->key.k.p; - iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1417,7 +1413,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) } iter->pos = iter->real_pos = b->key.k.p; - iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1530,34 +1525,6 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return ret; } -/** - * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key - * it currently points to - */ -static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c ret = { .k = &iter->k }; - - if (!bkey_deleted(&iter->k)) { - struct bkey_packed *_k = - __bch2_btree_node_iter_peek_all(&l->iter, l->b); - - ret.v = bkeyp_val(&l->b->format, _k); - - if (bch2_debug_check_iterators) { - struct bkey k = bkey_unpack_key(l->b, _k); - - BUG_ON(memcmp(&k, &iter->k, sizeof(k))); - } - - if (bch2_debug_check_bkeys) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); - } - - return ret; -} - /** * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position @@ -1574,10 +1541,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - if (iter->uptodate == BTREE_ITER_UPTODATE && - !bkey_deleted(&iter->k)) - return btree_iter_peek_uptodate(iter); - while (1) { ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ -1600,8 +1563,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->real_pos = k.k->p; - iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); return k; @@ -1686,7 +1647,6 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->uptodate = BTREE_ITER_UPTODATE; return k; } @@ -1714,10 +1674,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_set_search_pos(iter, iter->pos); - if (iter->uptodate == BTREE_ITER_UPTODATE && - !bkey_deleted(&iter->k)) - return btree_iter_peek_uptodate(iter); - while (1) { ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) { @@ -1747,7 +1703,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; iter->real_pos = k.k->p; - iter->uptodate = BTREE_ITER_UPTODATE; out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1812,8 +1767,6 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) EBUG_ON(!iter->k.size); - iter->uptodate = 
BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1832,9 +1785,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - if (iter->uptodate == BTREE_ITER_UPTODATE) - return btree_iter_peek_uptodate(iter); - if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); @@ -1853,7 +1803,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = (struct bkey_s_c) { &iter->k, NULL }; } - iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); return k; -- cgit From 3b0baf6f29ffce58e96b1a85568e2aa115a114f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 19:22:58 -0400 Subject: bcachefs: Internal btree iterator renaming This just gives some internal helpers some better names. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6c24e499450b..948afab2a7dc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -815,23 +815,23 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, } /* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, - struct btree_iter_level *l, - struct bkey *u) +static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u) { return __btree_iter_unpack(iter, l, u, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, + struct btree_iter_level *l) { return __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_peek(&l->iter, l->b)); } -static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, + struct btree_iter_level *l) { return __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_prev(&l->iter, l->b)); @@ -1546,7 +1546,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - k = __btree_iter_peek(iter, l); + k = btree_iter_level_peek(iter, l); if (likely(k.k)) break; @@ -1600,7 +1600,7 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k = __btree_iter_peek(iter, l); + struct bkey_s_c k = btree_iter_level_peek(iter, l); struct bkey_s_c u = __btree_trans_updates_peek(iter); if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) @@ -1681,12 +1681,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto no_key; } - k = __btree_iter_peek(iter, l); + k = btree_iter_level_peek(iter, l); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) - k = __btree_iter_prev(iter, l); + k = btree_iter_level_prev(iter, l); if (likely(k.k)) break; @@ -1709,7 +1709,7 @@ out: return k; no_key: /* - * __btree_iter_peek() may have set iter->k to a key we didn't want, and + * btree_iter_level_peek() may have set iter->k to a key we didn't want, and * then we errored going to the previous leaf - make sure it's * consistent with iter->pos: */ @@ -1792,7 +1792,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - k = __btree_iter_peek_all(iter, l, &iter->k); + k = btree_iter_level_peek_all(iter, l, &iter->k); EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); -- cgit From ca58cbd4719f11610ca777c23a285bab11eece03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 19:32:01 -0400 Subject: bcachefs: Improve iter->real_pos handling Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 948afab2a7dc..d036ace70552 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -826,15 +826,21 @@ static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_peek(&l->iter, l->b)); + + iter->real_pos = k.k ? k.k->p : l->b->key.k.p; + return k; } static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_prev(&l->iter, l->b)); + + iter->real_pos = k.k ? 
k.k->p : l->b->data->min_key; + return k; } static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, @@ -1531,7 +1537,6 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; @@ -1546,7 +1551,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - k = btree_iter_level_peek(iter, l); + k = btree_iter_level_peek(iter, &iter->l[0]); if (likely(k.k)) break; @@ -1561,8 +1566,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->real_pos = k.k->p; - bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); return k; @@ -1619,6 +1622,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1647,6 +1653,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } @@ -1702,7 +1710,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; - iter->real_pos = k.k->p; out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); -- cgit From 818664f50571fd04683743600e50731e70fff8f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 19:43:31 -0400 Subject: bcachefs: Consolidate bch2_btree_iter_peek() and peek_with_updates() Ideally we'll be getting rid of peek_with_updates(), but the callers will need to be checked. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 127 ++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d036ace70552..43885f907837 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1531,12 +1531,28 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return ret; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) { + struct btree_insert_entry *i; + + trans_for_each_update2(trans, i) + if ((cmp_int(btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->iter->btree_id) + return i->k; + break; + } + + return NULL; +} + +static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) +{ + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update = with_updates + ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) + : NULL; struct bkey_s_c k; int ret; @@ -1544,7 +1560,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + btree_iter_set_search_pos(iter, search_key); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1552,16 +1568,28 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return bkey_s_c_err(ret); k = btree_iter_level_peek(iter, &iter->l[0]); - if (likely(k.k)) + + if (next_update && + bkey_cmp(next_update->k.p, iter->real_pos) <= 0) + k = bkey_i_to_s_c(next_update); + + if (likely(k.k)) { + if (bkey_deleted(k.k)) { + btree_iter_set_search_pos(iter, + bkey_successor(k.k->p)); + continue; + } + break; + } if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; } /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: + * iter->pos should be mononotically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: */ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); @@ -1571,6 +1599,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return k; } +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return __btree_iter_peek(iter, false); +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -1583,79 +1620,9 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) -{ - struct bpos pos = btree_iter_search_key(iter); - struct btree_trans *trans = iter->trans; - struct btree_insert_entry *i; - - trans_for_each_update2(trans, i) - if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) - break; - - return i < trans->updates2 + trans->nr_updates2 && - iter->btree_id == i->iter->btree_id - ? 
bkey_i_to_s_c(i->k) - : bkey_s_c_null; -} - -static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k = btree_iter_level_peek(iter, l); - struct bkey_s_c u = __btree_trans_updates_peek(iter); - - if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) - return k; - if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { - iter->k = *u.k; - return u; - } - return bkey_s_c_null; -} - struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) { - struct bkey_s_c k; - int ret; - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); - - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - - while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - k = __bch2_btree_iter_peek_with_updates(iter); - - if (k.k && bkey_deleted(k.k)) { - if (!bch2_btree_iter_advance(iter)) - return bkey_s_c_null; - continue; - } - - if (likely(k.k)) - break; - - if (!btree_iter_set_pos_to_next_leaf(iter)) - return bkey_s_c_null; - } - - /* - * iter->pos should be mononotically increasing, and always be equal to - * the key we just returned - except extents can straddle iter->pos: - */ - if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); - - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - return k; + return __btree_iter_peek(iter, true); } struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -- cgit From bcad562259f7030a1f2e5e0a4e4f6f5b53371c74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 21:16:52 -0400 Subject: bcachefs: Update iter->real_pos lazily peek() has to update iter->real_pos - there's no need for bch2_btree_iter_set_pos() to update it as well. 
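(Illustrative sketch only, not part of the patch, using iterator calls shown elsewhere in this series: after this change a position update is pure bookkeeping and the real position is recomputed by the next lookup.)

	bch2_btree_iter_set_pos(iter, new_pos);	/* only records iter->pos / iter->k.p */
	k = bch2_btree_iter_peek(iter);		/* re-traverses and updates iter->real_pos */
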
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +-------- fs/bcachefs/btree_iter.h | 7 ++++++- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 43885f907837..e9b580ee0026 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1470,14 +1470,6 @@ out: bch2_btree_iter_verify(iter); } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -{ - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); -} - inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; @@ -1994,6 +1986,7 @@ alloc_iter: __bch2_btree_iter_upgrade_nounlock(iter, 1); bch2_btree_iter_set_pos(iter, pos); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1276d8aaf652..3ae19e2900a6 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -177,7 +177,12 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); -void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; +} /* Sort order for locking btree iterators: */ static inline int btree_iter_lock_cmp(const struct btree_iter *l, -- cgit From 1fe9b1d33cd2a5e66b3d7bb64fd327d723121a6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Mar 2021 15:50:02 -0400 Subject: bcachefs: Include snapshot field in bch2_bpos_to_text More prep work for snapshots. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 878befb5b9ef..641169ef91b5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -169,8 +169,22 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) pr_buf(out, "POS_MIN"); else if (!bkey_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); - else - pr_buf(out, "%llu:%llu", pos.inode, pos.offset); + else { + if (pos.inode == U64_MAX) + pr_buf(out, "U64_MAX"); + else + pr_buf(out, "%llu", pos.inode); + pr_buf(out, ":"); + if (pos.offset == U64_MAX) + pr_buf(out, "U64_MAX"); + else + pr_buf(out, "%llu", pos.offset); + pr_buf(out, ":"); + if (pos.snapshot == U32_MAX) + pr_buf(out, "U32_MAX"); + else + pr_buf(out, "%u", pos.snapshot); + } } void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) @@ -185,8 +199,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) bch2_bpos_to_text(out, k->p); - pr_buf(out, " snap %u len %u ver %llu", - k->p.snapshot, k->size, k->version.lo); + pr_buf(out, " len %u ver %llu", k->size, k->version.lo); } else { pr_buf(out, "(null)"); } -- cgit From fad7cfed79cc66eb2fe26b422146e1447f1cd25d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Mar 2021 17:23:30 -0400 Subject: bcachefs: Add an .invalid method for bch2_btree_ptr_v2 It was using the method for btree_ptr_v1, but that wasn't checking all the fields. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 18 +++++++++++++++++- fs/bcachefs/extents.h | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2be49f443eb0..7ac3d7587655 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -157,7 +157,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) return "value too big"; return bch2_bkey_ptrs_invalid(c, k); @@ -169,6 +169,22 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) + return "value too small"; + + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + if (bp.v->min_key.snapshot) + return "invalid min_key.snapshot"; + + return bch2_bkey_ptrs_invalid(c, k); +} + void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 2ee50a24501e..c8069dfb90ff 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -371,6 +371,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, @@ -383,7 +384,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_invalid = bch2_btree_ptr_v2_invalid, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ -- cgit From d3e6b9a14d857382086cd4d4619f13cb92afc522 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Mar 2021 22:01:12 -0400 Subject: bcachefs: Improve inode deletion code It had some silly redundancies. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index aec0fc9228a3..f676daf404a2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -542,12 +542,12 @@ found_slot: int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); + struct bch_inode_unpacked inode_u; struct bkey_s_c k; - u64 bi_generation; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -571,8 +571,6 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) retry: bch2_trans_begin(&trans); - bi_generation = 0; - if (cached) { iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), BTREE_ITER_CACHED|BTREE_ITER_INTENT); @@ -587,41 +585,26 @@ retry: if (ret) goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, - "inode %llu not found when deleting", - inode_nr); - - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bch_inode_unpacked inode_u; - - if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = inode_u.bi_generation + 1; - break; - } - case KEY_TYPE_inode_generation: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - break; - } + if (k.k->type != KEY_TYPE_inode) { + bch2_fs_inconsistent(trans.c, + "inode %llu not found when deleting", + inode_nr); + ret = -EIO; + goto err; } - if (!bi_generation) { - bkey_init(&delete.k); - delete.k.p.offset = inode_nr; - } else { - bkey_inode_generation_init(&delete.k_i); - delete.k.p.offset = inode_nr; - delete.v.bi_generation = cpu_to_le32(bi_generation); - } + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter->pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); bch2_trans_update(&trans, iter, &delete.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); - bch2_trans_iter_put(&trans, iter); err: + bch2_trans_iter_put(&trans, iter); if (ret == -EINTR) goto retry; -- cgit From 08070cba4a378ca02fdb954c45cf9b8797907fe9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Mar 2021 21:22:50 -0400 Subject: bcachefs: Split btree_iter_traverse and bch2_btree_iter_traverse() External (to the btree iterator code) users of bch2_btree_iter_traverse expect that on success the iterator will be pointed at iter->pos and have that position locked - but since we split iter->pos and iter->real_pos, that means it has to update iter->real_pos if necessary. Internal users don't expect it to modify iter->real_pos, so we need two separate functions. 
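(Illustrative sketch of the external contract, not taken from the patch; helper names are as used elsewhere in this series, with c and pos standing in for a filesystem pointer and a search position.)

	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, pos,
				   BTREE_ITER_INTENT);

	/*
	 * External interface: on success the iterator is positioned and
	 * locked at iter->pos, because bch2_btree_iter_traverse() now sets
	 * iter->real_pos from iter->pos before traversing:
	 */
	ret = bch2_btree_iter_traverse(iter);
	if (!ret)
		k = bch2_btree_iter_peek_slot(iter);	/* key at iter->pos */

	bch2_trans_iter_put(&trans, iter);
	bch2_trans_exit(&trans);
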
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 40 ++++++++++++++++++++++++++++++++-------- fs/bcachefs/btree_iter.h | 10 +--------- 2 files changed, 33 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e9b580ee0026..5f30626d1852 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1329,7 +1329,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, return 0; } -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; int ret; @@ -1342,6 +1342,30 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return ret; } +/* + * Note: + * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is + * for internal btree iterator users + * + * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, + * btree_iter_traverse() does not: + */ +static inline int __must_check +btree_iter_traverse(struct btree_iter *iter) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? __bch2_btree_iter_traverse(iter) + : 0; +} + +int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + + return btree_iter_traverse(iter); +} + /* Iterate across nodes (leaf and interior nodes) */ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) @@ -1352,7 +1376,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (ret) return NULL; @@ -1388,7 +1412,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->level++; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (ret) return NULL; @@ -1411,7 +1435,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (ret) return NULL; @@ -1555,7 +1579,7 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi btree_iter_set_search_pos(iter, search_key); while (1) { - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1642,7 +1666,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_set_search_pos(iter, iter->pos); while (1) { - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) { k = bkey_s_c_err(ret); goto no_key; @@ -1754,7 +1778,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1798,7 +1822,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 
3ae19e2900a6..8768f4cb96fa 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -145,15 +145,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_iter *); - -static inline int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) -{ - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? __bch2_btree_iter_traverse(iter) - : 0; -} +int __must_check bch2_btree_iter_traverse(struct btree_iter *); int bch2_btree_iter_traverse_all(struct btree_trans *); -- cgit From a9d79c6e8ba18665bed30702be5fb238c50e8c63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Mar 2021 23:52:27 -0400 Subject: bcachefs: Use pcpu mode of six locks for interior nodes Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 ++++++ fs/bcachefs/btree_iter.c | 16 ++++++++++++---- fs/bcachefs/btree_update_interior.c | 5 +++++ 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 775b3e8468da..f32fc45c85d2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -147,6 +147,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; + if (level) + six_lock_pcpu_alloc(&b->c.lock); + else + six_lock_pcpu_free_rcu(&b->c.lock); + mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) @@ -393,6 +398,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) while (!list_empty(&bc->freed)) { b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); kfree(b); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5f30626d1852..cf41ece0d66e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -79,11 +79,19 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) * goes to 0, and it's safe because we have the node intent * locked: */ - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); + if (!b->c.lock.readers) + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_sub(*b->c.lock.readers, readers); + btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); + + if (!b->c.lock.readers) + atomic64_add(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_add(*b->c.lock.readers, readers); } bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4ad8084714f9..2c202dd01766 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -988,6 +988,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) list_del_init(&b->list); mutex_unlock(&c->btree_cache.lock); + if (b->c.level) + six_lock_pcpu_alloc(&b->c.lock); + else + six_lock_pcpu_free(&b->c.lock); + mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && (b->c.level < btree_node_root(c, b)->c.level || -- cgit From 7c8b166e584c85f9920d8f82778967eeee0e1b03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Mar 2021 22:49:05 -0400 Subject: bcachefs: Increase default journal size The default was 1/256th of the device and capped at 512MB, which is 
fairly tiny these days. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 22069c277c15..87623ec8cf47 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -913,14 +913,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; + /* 1/128th of the device by default: */ + nr = ca->mi.nbuckets >> 7; + /* - * clamp journal size to 1024 buckets or 512MB (in sectors), whichever + * clamp journal size to 8192 buckets or 8GB (in sectors), whichever * is smaller: */ - nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, + nr = clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN, - min(1 << 10, - (1 << 20) / ca->mi.bucket_size)); + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } -- cgit From 0390ea8ad8f4079c25d47e8c249a2f621aaec3c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Mar 2021 20:22:51 -0400 Subject: bcachefs: Drop bkey noops Bkey noops were introduced to deal with trimming inline data extents in place in the btree: if the u64s field of a bkey was 0, that u64 was a noop and we'd start looking for the next bkey immediately after it. But extent handling has been lifted above the btree - we no longer modify existing extents in place in the btree, and the compatibilty code for old style extent btree nodes is gone, so we can completely drop this code. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 10 ---------- fs/bcachefs/bkey_sort.c | 2 +- fs/bcachefs/bset.c | 26 +++++++++++--------------- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_io.c | 14 ++++++-------- fs/bcachefs/btree_update_interior.c | 4 ++-- 7 files changed, 22 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 77d9d871adfb..51dc49b9ffba 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -41,16 +41,6 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, - struct bkey_packed *end) -{ - k = bkey_next(k); - - while (k != end && !k->u64s) - k = (void *) ((u64 *) k + 1); - return k; -} - #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index f2507079ed11..537ab7919e88 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(!iter->used); - i->k = bkey_next_skip_noops(i->k, i->end); + i->k = bkey_next(i->k); BUG_ON(i->k > i->end); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index c371f402eaa3..59f613560b65 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -66,7 +66,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for (_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next_skip_noops(_k, vstruct_last(i)); + _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); if (c) @@ -532,7 +532,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + k = bkey_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -747,7 +747,7 @@ retry: /* First we figure out where the first key in each cacheline is */ 
eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -764,7 +764,7 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); t->max_key = bkey_unpack_pos(b, prev); @@ -899,7 +899,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next_skip_noops(i, k)) + for (i = p; i != k; i = bkey_next(i)) if (i->type >= min_key_type) ret = i; @@ -910,10 +910,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next_skip_noops(ret, orig_k) + ? bkey_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next_skip_noops(i, orig_k)) + i = bkey_next(i)) BUG_ON(i->type >= min_key_type); } @@ -948,7 +948,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { + if (bkey_next(k) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1072,7 +1072,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next_skip_noops(k, end); + k = bkey_next(k); if (k == end) break; @@ -1322,12 +1322,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1561,10 +1561,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); - while (!__btree_node_iter_set_end(iter, 0) && - !__bch2_btree_node_iter_peek_all(iter, b)->u64s) - iter->data->k++; - if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index f19cd032cf70..8cf2301e510d 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + _k = bkey_next(_k)) static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 483360fbda18..36ed6df39768 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1373,7 +1373,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next_skip_noops(k, vstruct_last(s2))) { + k = bkey_next(k)) { last = k; u64s += k->u64s; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index adeb4f9fb5fd..71860e1a3100 
100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -32,9 +32,9 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next_skip_noops(start, end); + for (p = start, k = bkey_next(start); k != end; - p = k, k = bkey_next_skip_noops(k, end)) { + p = k, k = bkey_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); @@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = v; } @@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); + n = bkey_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); @@ -754,7 +752,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } prev = k; - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } fsck_err: return ret; @@ -947,7 +945,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } bch2_bset_build_aux_tree(b, b->set, false); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2c202dd01766..c5e0516ff1fb 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1119,7 +1119,7 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + struct bkey_packed *n = bkey_next(k); if (n == vstruct_last(set1)) break; @@ -1216,7 +1216,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, i = btree_bset_first(b); src = dst = i->start; while (src != vstruct_last(i)) { - n = bkey_next_skip_noops(src, vstruct_last(i)); + n = bkey_next(src); if (!bkey_deleted(src)) { memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); -- cgit From 2da5d000b91e0407dedc3baa35cf7c36a0392ff2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Mar 2021 20:08:56 -0400 Subject: bcachefs: Generate better bkey formats when splitting nodes On btree node split, we weren't ensuring the min_key of the new larger node packs in the new format for this node. This triggers some painful slowpaths in the bset.c aux search tree code - this patch fixes that by calculating a new format for the new node with the new min_key. 
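The essential step, pulled out of the diff below as a sketch: seed the format state with the new node's min/max positions before adding the keys that will live in it, so those positions are guaranteed to pack in the resulting format.

  struct bkey_format_state s;
  struct bkey_packed *k;

  bch2_bkey_format_init(&s);
  bch2_bkey_format_add_pos(&s, n2->data->min_key);
  bch2_bkey_format_add_pos(&s, n2->data->max_key);

  for (k = set2_start; k != set2_end; k = bkey_next(k)) {
          struct bkey uk = bkey_unpack_key(n1, k);

          bch2_bkey_format_add_key(&s, &uk);
  }

  n2->data->format = bch2_bkey_format_done(&s);

With min_key packable, the bset.c aux search tree code no longer has to fall back to its unpacked-key slowpaths for the new node.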
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 53 ++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c5e0516ff1fb..3cd431eb3fe7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1095,10 +1095,11 @@ static struct btree *__btree_split_node(struct btree_update *as, struct btree *n1, struct btree_iter *iter) { + struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; - struct bkey_packed *k, *prev = NULL; + struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; n2 = bch2_btree_node_alloc(as, n1->c.level); bch2_btree_update_add_new_node(as, n2); @@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as, SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; - btree_node_set_format(n2, n2->data->format); - set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1136,33 +1135,49 @@ static struct btree *__btree_split_node(struct btree_update *as, } BUG_ON(!prev); + set2_start = k; + set2_end = vstruct_last(set1); - btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, bkey_successor(n1->key.k.p)); - - set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); - set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); - + set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - - n2->nr.live_u64s = le16_to_cpu(set2->u64s); - n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); - n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; - n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; n1->nr.live_u64s = le16_to_cpu(set1->u64s); n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); n1->nr.packed_keys = nr_packed; n1->nr.unpacked_keys = nr_unpacked; + btree_set_max(n1, bkey_unpack_pos(n1, prev)); + btree_set_min(n2, bkey_successor(n1->key.k.p)); + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, n2->data->min_key); + bch2_bkey_format_add_pos(&s, n2->data->max_key); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + struct bkey uk = bkey_unpack_key(n1, k); + bch2_bkey_format_add_key(&s, &uk); + } + + n2->data->format = bch2_bkey_format_done(&s); + btree_node_set_format(n2, n2->data->format); + + out = set2->start; + memset(&n2->nr, 0, sizeof(n2->nr)); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) + ? &n1->format : &bch2_bkey_format_current, k)); + out->format = KEY_FORMAT_LOCAL_BTREE; + btree_keys_account_key_add(&n2->nr, 0, out); + out = bkey_next(out); + } + + set2->u64s = cpu_to_le16((u64 *) out - set2->_data); + set_btree_bset_end(n2, n2->set); + BUG_ON(!set1->u64s); BUG_ON(!set2->u64s); - memcpy_u64s(set2->start, - vstruct_end(set1), - le16_to_cpu(set2->u64s)); - btree_node_reset_sib_u64s(n1); btree_node_reset_sib_u64s(n2); -- cgit From c7e04e22e000d3d9c2c0ed00cd444b3b8a26cf1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Mar 2021 20:10:59 -0400 Subject: bcachefs: Fix building of aux search trees We weren't packing the min/max keys, which was a major oversight and completely disabled generating bkey_floats for adjacent nodes. 
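In outline, the pattern the diff below adopts (sketch only, min_key case shown; the max key is handled the same way): try to pack the node's boundary position, and only fall back to an unpacked bkey when packing fails, so make_bfloat() normally sees packed endpoints and can generate bkey_floats for them.

  if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
          /* packing failed - fall back to an unpacked key */
          bkey_init(&min_key.k);
          min_key.k.p = b->data->min_key;
  }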
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 59f613560b65..34fd2307a320 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -674,16 +674,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (is_power_of_2(j) && !min_key->u64s) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; + if (!bkey_pack_pos(min_key, b->data->min_key, b)) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } } if (is_power_of_2(j + 1) && !max_key->u64s) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = t->max_key; + if (!bkey_pack_pos(max_key, b->data->max_key, b)) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = t->max_key; + } } __make_bfloat(b, t, j, min_key, max_key); @@ -768,10 +772,15 @@ retry: t->max_key = bkey_unpack_pos(b, prev); - bkey_init(&min_key.k); - min_key.k.p = b->data->min_key; - bkey_init(&max_key.k); - max_key.k.p = t->max_key; + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = t->max_key; + } /* Then we build the tree */ eytzinger1_for_each(j, t->size) -- cgit From 3bf57160c23d507a46c1a7a4a453405d21ac0d7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Mar 2021 20:29:04 -0400 Subject: bcachefs: Fix packed bkey format calculation for new btree roots Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 3 +++ fs/bcachefs/btree_update_interior.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 36ed6df39768..9e2f93c2adc2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1311,6 +1311,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, /* Find a format that all keys in @old_nodes can pack into */ bch2_bkey_format_init(&format_state); + /* + * XXX: this won't correctly take it account the new min/max keys: + */ for (i = 0; i < nr_old_nodes; i++) __bch2_btree_calc_format(&format_state, old_nodes[i]); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3cd431eb3fe7..2c5084ac1a03 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) struct bset_tree *t; struct bkey uk; - bch2_bkey_format_add_pos(s, b->data->min_key); - for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) { @@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) struct bkey_format_state s; bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); __bch2_btree_calc_format(&s, b); return bch2_bkey_format_done(&s); @@ -1578,8 +1578,10 @@ retry: } bch2_bkey_format_init(&new_s); - __bch2_btree_calc_format(&new_s, b); - __bch2_btree_calc_format(&new_s, m); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); + __bch2_btree_calc_format(&new_s, next); + bch2_bkey_format_add_pos(&new_s, next->data->max_key); new_f = 
bch2_bkey_format_done(&new_s); sib_u64s = btree_node_u64s_with_format(b, &new_f) + -- cgit From f793fd85dc598616ff903750b2a6d63425c23b2f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Mar 2021 20:58:57 -0400 Subject: bcachefs: Fix for bch2_trans_commit() unlocking when it's not supposed to When we pass BTREE_INSERT_NOUNLOCK bch2_trans_commit isn't supposed to unlock after a successful commit, but it was calling bch2_trans_cond_resched() - oops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_update_leaf.c | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cf41ece0d66e..fb7614367e1c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2155,7 +2155,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } - bch2_trans_cond_resched(trans); + if (!(flags & TRANS_RESET_NOUNLOCK)) + bch2_trans_cond_resched(trans); if (!(flags & TRANS_RESET_NOTRAVERSE)) bch2_btree_iter_traverse_all(trans); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 8768f4cb96fa..176661b3b879 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -303,6 +303,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr } #define TRANS_RESET_NOTRAVERSE (1 << 0) +#define TRANS_RESET_NOUNLOCK (1 << 1) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e76916cffd5b..62fa0d59242a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -826,7 +826,7 @@ int __bch2_trans_commit(struct btree_trans *trans) struct btree_insert_entry *i = NULL; struct btree_iter *iter; bool trans_trigger_run; - unsigned u64s; + unsigned u64s, reset_flags = 0; int ret = 0; if (!trans->nr_updates) @@ -940,7 +940,11 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_reset: - bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); + if (!ret) + reset_flags |= TRANS_RESET_NOTRAVERSE; + if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) + reset_flags |= TRANS_RESET_NOUNLOCK; + bch2_trans_reset(trans, reset_flags); return ret; err: -- cgit From 2649b514b6cad329da0a4c8cafbd48c32bbc1b9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Mar 2021 21:00:26 -0400 Subject: bcachefs: Simplify btree_node_iter_init_pack_failed() Since we now make sure to always generate packed bkey formats that can pack the min_key of a btree node, this path should actually never happen. 
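Because it is now expected to be dead code, the path can be reduced to the simplest thing that is still correct - a sketch matching the diff below: initialize the node iterator from the start of the node and advance linearly until the search position is reached.

  struct bkey_packed *k;

  bch2_btree_node_iter_init_from_start(iter, b);

  while ((k = bch2_btree_node_iter_peek(iter, b)) &&
         bkey_iter_pos_cmp(b, k, search) < 0)
          bch2_btree_node_iter_advance(iter, b);

No per-bset binary search and no sorting of the iterator sets - linear cost is acceptable for a path we expect never to hit.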
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 34fd2307a320..200cd900e041 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1167,8 +1167,7 @@ void bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *packed_search) + struct bpos *search) { unsigned l = 0, r = t->size; @@ -1235,9 +1234,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, prefetch(&base->f[n << 4]); f = &base->f[n]; - - if (!unlikely(packed_search)) - goto slowpath; if (unlikely(f->exponent >= BFLOAT_FAILED)) goto slowpath; @@ -1301,7 +1297,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_NO_AUX_TREE: return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search, lossy_packed_search); + return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1400,16 +1396,15 @@ noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; + struct bkey_packed *k; trace_bkey_pack_pos_fail(search); - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL), - btree_bkey_last(b, t)); + bch2_btree_node_iter_init_from_start(iter, b); - bch2_btree_node_iter_sort(iter, b); + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); } /** -- cgit From 331194a230f5fb266a64880e905c0364aa834964 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Mar 2021 23:37:33 -0400 Subject: bcachefs: btree key cache locking improvements The btree key cache mutex was becoming a significant bottleneck - it was mainly used to protect the lists of dirty, clean and freed cached keys. This patch eliminates the dirty and clean lists - instead, when we need to scan for keys to drop from the cache we iterate over the rhashtable, and thus we're able to remove most uses of that lock. 
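The scanning pattern this moves to, in sketch form (simplified from the diff below; the shrinker's resumable cursor and the remaining uses of the mutex around the freed list are omitted): walk the hash table's buckets under RCU and evict entries that are clean and can be locked, instead of walking a mutex-protected clean list.

  struct bucket_table *tbl;
  struct rhash_head *pos;
  struct bkey_cached *ck;
  unsigned i;

  rcu_read_lock();
  tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
  for (i = 0; i < tbl->size; i++)
          rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
                  if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
                      bkey_cached_lock_for_evict(ck)) {
                          bkey_cached_evict(bc, ck);
                          bkey_cached_free(bc, ck);
                  }
          }
  rcu_read_unlock();

The nr_keys / nr_dirty counters become atomic_long_t so they too can be updated without taking the mutex.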
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 184 +++++++++++++++++++++++++++--------------- fs/bcachefs/btree_key_cache.h | 8 +- fs/bcachefs/btree_types.h | 7 +- fs/bcachefs/journal_reclaim.c | 4 +- 4 files changed, 130 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d7b4df4cff17..7ee64efe6fd9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); - c->nr_keys--; + atomic_long_dec(&c->nr_keys); } static void bkey_cached_free(struct btree_key_cache *bc, @@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry_reverse(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) { - c->nr_freed--; - return ck; - } - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); @@ -115,11 +109,39 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - list_for_each_entry(ck, &c->clean, list) + return NULL; +} + +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + list_for_each_entry_reverse(ck, &c->freed, list) if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); + c->nr_freed--; + list_del(&ck->list); + mutex_unlock(&c->lock); return ck; } + mutex_unlock(&c->lock); + + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + rcu_read_unlock(); + return ck; + } + } + rcu_read_unlock(); return NULL; } @@ -130,10 +152,17 @@ btree_key_cache_create(struct btree_key_cache *c, struct bpos pos) { struct bkey_cached *ck; + bool was_new = true; ck = bkey_cached_alloc(c); - if (!ck) - return ERR_PTR(-ENOMEM); + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(c); + if (unlikely(!ck)) + return ERR_PTR(-ENOMEM); + + was_new = false; + } ck->c.level = 0; ck->c.btree_id = btree_id; @@ -142,17 +171,26 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)) { + bch2_btree_key_cache_params))) { /* We raced with another fill: */ - bkey_cached_free(c, ck); + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + mutex_lock(&c->lock); + bkey_cached_free(c, ck); + mutex_unlock(&c->lock); + } + return NULL; } - c->nr_keys++; + atomic_long_inc(&c->nr_keys); - list_move(&ck->list, &c->clean); six_unlock_write(&ck->c.lock); return ck; @@ -239,11 +277,8 @@ retry: return 0; } - mutex_lock(&c->btree_key_cache.lock); ck = btree_key_cache_create(&c->btree_key_cache, iter->btree_id, iter->pos); - mutex_unlock(&c->btree_key_cache.lock); - ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -371,15 +406,13 @@ err: bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); + BUG_ON(!btree_node_locked(c_iter, 0)); + if (!evict) { - mutex_lock(&c->btree_key_cache.lock); if 
(test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } - - list_move_tail(&ck->list, &c->btree_key_cache.clean); - mutex_unlock(&c->btree_key_cache.lock); } else { evict: BUG_ON(!btree_node_intent_locked(c_iter, 0)); @@ -389,13 +422,14 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); - mutex_lock(&c->btree_key_cache.lock); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } bkey_cached_evict(&c->btree_key_cache, ck); + + mutex_lock(&c->btree_key_cache.lock); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); } @@ -476,16 +510,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - mutex_lock(&c->btree_key_cache.lock); - list_move(&ck->list, &c->btree_key_cache.dirty); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty++; + atomic_long_inc(&c->btree_key_cache.nr_dirty); if (bch2_nr_btree_keys_need_flush(c)) kick_reclaim = true; - - mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, @@ -510,9 +539,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = container_of(shrink, struct bch_fs, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; struct bkey_cached *ck, *t; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned flags; + unsigned start, flags; + int srcu_idx; /* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_FS) @@ -520,6 +551,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, else if (!mutex_trylock(&bc->lock)) return -1; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); /* @@ -541,23 +573,47 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (scanned >= nr) goto out; - list_for_each_entry_safe(ck, t, &bc->clean, list) { - if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - else if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); - } + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; - scanned++; - if (scanned >= nr) { - if (&t->list != &bc->clean) - list_move_tail(&bc->clean, &t->list); - goto out; + do { + struct rhash_head *pos, *next; + + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + + while (!rht_is_a_nulls(pos)) { + next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + ck = container_of(pos, struct bkey_cached, hash); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + goto next; + + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; +next: + pos = next; } - } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); out: memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); mutex_unlock(&bc->lock); return freed; @@ 
-570,41 +626,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; - return bc->nr_keys; + return atomic_long_read(&bc->nr_keys); } void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); - list_splice(&bc->dirty, &bc->clean); - list_for_each_entry_safe(ck, n, &bc->clean, list) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); bch2_journal_preres_put(&c->journal, &ck->res); - kfree(ck->k); list_del(&ck->list); + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); - bc->nr_keys--; } - BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); - BUG_ON(bc->nr_keys); - - list_for_each_entry_safe(ck, n, &bc->freed, list) { - cond_resched(); + BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); + BUG_ON(atomic_long_read(&bc->nr_keys)); - list_del(&ck->list); - kmem_cache_free(bch2_key_cache, ck); - } mutex_unlock(&bc->lock); if (bc->table_init_done) @@ -615,8 +675,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); - INIT_LIST_HEAD(&c->clean); - INIT_LIST_HEAD(&c->dirty); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) @@ -640,8 +698,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); - pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); + pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); + pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 2f8b5521718a..02715cd258ab 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -3,8 +3,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 1024 + nr_keys / 2; return max_t(ssize_t, 0, nr_dirty - max_dirty); @@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; return nr_dirty > max_dirty && diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 41fa5ff77e91..d218d883225b 100644 --- a/fs/bcachefs/btree_types.h +++ 
b/fs/bcachefs/btree_types.h @@ -292,13 +292,12 @@ struct btree_key_cache { struct rhashtable table; bool table_init_done; struct list_head freed; - struct list_head clean; - struct list_head dirty; struct shrinker shrink; + unsigned shrink_iter; size_t nr_freed; - size_t nr_keys; - size_t nr_dirty; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; }; struct bkey_cached_key { diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0a16343fb51a..3957312d12f2 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, - c->btree_key_cache.nr_dirty, - c->btree_key_cache.nr_keys); + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); -- cgit From 43d002432dbb093b2155ebce7f12f79b844817f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Feb 2021 21:51:56 -0500 Subject: bcachefs: Add a mechanism for running callbacks at trans commit time Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_types.h | 9 +++++++++ fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 16 ++++++++++++++++ 4 files changed, 28 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fb7614367e1c..74c4cacb9aa7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2144,6 +2144,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->nr_updates2 = 0; trans->mem_top = 0; + trans->hooks = NULL; trans->extra_journal_entries = NULL; trans->extra_journal_entry_u64s = 0; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d218d883225b..bcd8db34d7ee 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -343,6 +343,14 @@ struct btree_insert_entry { #define BTREE_ITER_MAX 32 #endif +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + #define BTREE_TRANS_MEM_MAX (1U << 14) struct btree_trans { @@ -379,6 +387,7 @@ struct btree_trans { struct btree_insert_entry *updates2; /* update path: */ + struct btree_trans_commit_hook *hooks; struct jset_entry *extra_journal_entries; unsigned extra_journal_entry_u64s; struct journal_entry_pin *journal_pin; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a25138080169..4ce12ae29a55 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); /** diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 62fa0d59242a..178a93698807 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -369,6 +369,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; + struct 
btree_trans_commit_hook *h; unsigned u64s = 0; bool marking = false; int ret; @@ -386,6 +387,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + return ret; + h = h->next; + } + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -1057,6 +1066,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, return 0; } +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { -- cgit From 4cf91b0270dc16a6637db4c200c7fb745b941065 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 16:20:16 -0500 Subject: bcachefs: Split out bpos_cmp() and bkey_cmp() With snapshots, we're going to need to differentiate between comparisons that should and shouldn't include the snapshot field. bpos_cmp is now the comparison function that does include the snapshot field, used by core btree code. Upper level filesystem code generally does _not_ want to compare against the snapshot field - that code wants keys to compare as equal even when one of them is in an ancestor snapshot. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 8 ++++---- fs/bcachefs/bkey.h | 26 ++++++++++++-------------- fs/bcachefs/bkey_methods.c | 10 +++++----- fs/bcachefs/bset.c | 12 ++++++------ fs/bcachefs/bset.h | 20 +------------------- fs/bcachefs/btree_cache.c | 10 +++++----- fs/bcachefs/btree_gc.c | 14 +++++++------- fs/bcachefs/btree_gc.h | 10 +++------- fs/bcachefs/btree_io.c | 6 +++--- fs/bcachefs/btree_io.h | 4 ++-- fs/bcachefs/btree_iter.c | 20 ++++++++++---------- fs/bcachefs/btree_key_cache.c | 8 ++++---- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 8 ++++---- fs/bcachefs/debug.c | 4 ++-- fs/bcachefs/extents.h | 18 ++++++++++++++++++ fs/bcachefs/recovery.c | 16 ++++++++-------- 17 files changed, 96 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index aeac07e2cb32..8b2befac95d4 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1048,7 +1048,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, high_word(f, r), b->nr_key_bits); - EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), bkey_unpack_pos(b, r))); return ret; } @@ -1058,7 +1058,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, const struct bkey_packed *l, const struct bpos *r) { - return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); } __pure __flatten @@ -1079,7 +1079,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, r = (void*) &unpacked; } - return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten @@ -1090,7 +1090,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b, const struct bkey *l_unpacked; return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bkey_cmp(l_unpacked->p, *r) + ? 
bpos_cmp(l_unpacked->p, *r) : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 51dc49b9ffba..df23c5b48969 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -148,29 +148,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -#if 1 +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { - if (l.inode != r.inode) - return l.inode < r.inode ? -1 : 1; - if (l.offset != r.offset) - return l.offset < r.offset ? -1 : 1; - if (l.snapshot != r.snapshot) - return l.snapshot < r.snapshot ? -1 : 1; - return 0; + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); } -#else -int bkey_cmp(struct bpos l, struct bpos r); -#endif static inline struct bpos bpos_min(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) < 0 ? l : r; + return bpos_cmp(l, r) < 0 ? l : r; } static inline struct bpos bpos_max(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) > 0 ? l : r; + return bpos_cmp(l, r) > 0 ? l : r; } #define sbb(a, b, borrow) \ @@ -198,7 +196,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b) static inline struct bpos bpos_diff(struct bpos l, struct bpos r) { - if (bkey_cmp(l, r) > 0) + if (bpos_cmp(l, r) > 0) swap(l, r); return bpos_sub(r, l); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 641169ef91b5..5e7eadeb3b57 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -138,10 +138,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(k.k->p, b->data->min_key) < 0) + if (bpos_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; - if (bkey_cmp(k.k->p, b->data->max_key) > 0) + if (bpos_cmp(k.k->p, b->data->max_key) > 0) return "key past end of btree node"; return NULL; @@ -165,9 +165,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bkey_cmp(pos, POS_MIN)) + if (!bpos_cmp(pos, POS_MIN)) pr_buf(out, "POS_MIN"); - else if (!bkey_cmp(pos, POS_MAX)) + else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); else { if (pos.inode == U64_MAX) @@ -256,7 +256,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || - bkey_cmp(l.k->p, bkey_start_pos(r.k))) + bpos_cmp(l.k->p, bkey_start_pos(r.k))) return BCH_MERGE_NOMERGE; ret = ops->key_merge(c, l, r); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 200cd900e041..5746199dfafb 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -81,13 +81,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { + if (bpos_cmp(n.p, k.k->p) < 0) { printk(KERN_ERR "Key skipped backwards\n"); continue; } if (!bkey_deleted(k.k) && - !bkey_cmp(n.p, k.k->p)) + !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } } @@ -522,7 +522,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, 
bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -1174,7 +1174,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1306,7 +1306,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, * start and end - handle that here: */ - if (bkey_cmp(*search, t->max_key) > 0) + if (bpos_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); return bset_search_tree(b, t, search, lossy_packed_search); @@ -1456,7 +1456,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 8cf2301e510d..e6c8d081f9b6 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, EBUG_ON(r_packed && !bkey_packed(r_packed)); if (unlikely(!bkey_packed(l))) - return bkey_cmp(packed_to_bkey_c(l)->p, *r); + return bpos_cmp(packed_to_bkey_c(l)->p, *r); if (likely(r_packed)) return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); @@ -418,24 +418,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) return bch2_bkey_prev_filter(b, t, k, 1); } -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; - - return (cmp1 << 1) + cmp2; -} - /* Btree key iteration */ void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f32fc45c85d2..63b8423fa87c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -821,9 +821,9 @@ lock_node: EBUG_ON(b->c.btree_id != iter->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; @@ -904,9 +904,9 @@ lock_node: EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1018,7 +1018,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - if (bkey_cmp(bkey_successor(n1->key.k.p), + if (bpos_cmp(bkey_successor(n1->key.k.p), n2->data->min_key)) { char buf1[200], buf2[200]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9e2f93c2adc2..2710e4b35da3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); } - if (fsck_err_on(bkey_cmp(expected_start, 
bp->v.min_key), c, + if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " cur %s", @@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, } if (fsck_err_on(is_last && - bkey_cmp(cur.k->k.p, node_end), c, + bpos_cmp(cur.k->k.p, node_end), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", @@ -489,8 +489,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, &k, &max_stale, true); @@ -581,13 +581,13 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); - if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { BUG(); } - if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, "btree root with incorrect max_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { BUG(); @@ -1448,7 +1448,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, unsigned j; for (j = 0; j < nr_new_nodes; j++) - if (!bkey_cmp(old_nodes[i]->key.k.p, + if (!bpos_cmp(old_nodes[i]->key.k.p, new_nodes[j]->key.k.p)) goto next; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index d5559827ed7f..44b7d121610f 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - if (l.phase != r.phase) - return l.phase < r.phase ? -1 : 1; - if (bkey_cmp(l.pos, r.pos)) - return bkey_cmp(l.pos, r.pos); - if (l.level != r.level) - return l.level < r.level ? 
-1 : 1; - return 0; + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 71860e1a3100..468b1a294ce9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -38,7 +38,7 @@ static void verify_no_dups(struct btree *b, struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } @@ -631,14 +631,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } - btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 16ce6dff6af7..f155a6cc1755 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -220,7 +220,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, { if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && write) bn->min_key = bkey_predecessor(bn->min_key); @@ -229,7 +229,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && !write) bn->min_key = bkey_successor(bn->min_key); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 74c4cacb9aa7..8c923aa01ea1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -37,13 +37,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(iter->real_pos, b->data->min_key) < 0; + return bpos_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, iter->real_pos) < 0; + return bpos_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -293,7 +293,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && - bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, btree_iter_type(linked))) <= 0) { deadlock_iter = linked; reason = 7; @@ -1392,7 +1392,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) return NULL; - BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); iter->pos = iter->real_pos = b->key.k.p; @@ -1429,7 +1429,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!b) return NULL; - if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + if (bpos_cmp(iter->pos, b->key.k.p) < 0) { 
/* * Haven't gotten to the end of the parent node: go back down to * the next child node @@ -1461,7 +1461,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { - int cmp = bkey_cmp(new_pos, iter->real_pos); + int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; if (!cmp) @@ -1505,7 +1505,7 @@ out: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = bkey_cmp(pos, POS_MAX) != 0; + bool ret = bpos_cmp(pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(pos); @@ -1516,7 +1516,7 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = bkey_cmp(pos, POS_MIN) != 0; + bool ret = bpos_cmp(pos, POS_MIN) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(pos); @@ -1527,7 +1527,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bkey_cmp(next_pos, POS_MAX) != 0; + bool ret = bpos_cmp(next_pos, POS_MAX) != 0; /* * Typically, we don't want to modify iter->pos here, since that @@ -1545,7 +1545,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + bool ret = bpos_cmp(next_pos, POS_MIN) != 0; if (ret) btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7ee64efe6fd9..0858f469f7c2 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached_key *key = arg->key; return cmp_int(ck->key.btree_id, key->btree_id) ?: - bkey_cmp(ck->key.pos, key->pos); + bpos_cmp(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -252,7 +252,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) const struct btree_iter *iter = p; return ck->key.btree_id == iter->btree_id && - !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; + !bpos_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } __flatten @@ -293,7 +293,7 @@ retry: if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { goto retry; } @@ -303,7 +303,7 @@ retry: } if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2c5084ac1a03..ddb0d03e268c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - if (bkey_cmp(next_node, bp.v->min_key)) { + if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); panic("expected next min_key %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), @@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bkey_cmp(k.k->p, b->key.k.p)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); panic("expected end %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 178a93698807..a32c8f34039c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, { return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + bpos_cmp(l->k->k.p, r->k->k.p); } static inline bool same_leaf_as_prev(struct btree_trans *trans, @@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); - EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->trans->c, b)); EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); @@ -225,7 +225,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); - BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8b837ac69d74..059972e5a124 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -273,7 +273,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (err) return err; - if (!i->size || !bkey_cmp(POS_MAX, i->from)) + if (!i->size || !bpos_cmp(POS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); @@ -289,7 +289,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * can't easily correctly restart a btree node traversal across * all nodes, meh */ - i->from = bkey_cmp(POS_MAX, b->key.k.p) + i->from = bpos_cmp(POS_MAX, b->key.k.p) ? 
bkey_successor(b->key.k.p) : b->key.k.p; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c8069dfb90ff..ccee43a2019d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_cmp(k->p, m->p) < 0; + int cmp2 = bkey_cmp(bkey_start_pos(k), + bkey_start_pos(m)) > 0; + + return (cmp1 << 1) + cmp2; +} + int bch2_cut_front_s(struct bpos, struct bkey_s); int bch2_cut_back_s(struct bpos, struct bkey_s); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 92f7568175eb..596f7c1e4245 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id l_btree_id, { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: - bkey_cmp(l_pos, r->k->k.p)); + bpos_cmp(l_pos, r->k->k.p)); } static int journal_key_cmp(struct journal_key *l, struct journal_key *r) { return (cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p)); + bpos_cmp(l->k->k.p, r->k->k.p)); } static size_t journal_key_search(struct journal_keys *journal_keys, @@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign if (iter->idx > idx || (iter->idx == idx && biter->last && - bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) + bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) iter->idx++; } @@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); if (btree_k.k && journal_k.k) { - int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); if (!cmp) bch2_journal_iter_advance_btree(iter); @@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * ret = iter->last == journal ? 
journal_k : btree_k; if (iter->b && - bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { + bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; @@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) return cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p) ?: + bpos_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) while (src + 1 < keys.d + keys.nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && - !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) src++; *dst++ = *src++; @@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(r->level, l->level) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + bpos_cmp(l->k->k.p, r->k->k.p); } static int bch2_journal_replay(struct bch_fs *c,
-- cgit From e751c01a8ee1ca934cc0953e2e77ad4ea3e64d5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Mar 2021 18:02:16 -0400 Subject: bcachefs: Start using bpos.snapshot field

This patch starts treating the bpos.snapshot field like part of the key in the btree code:

* bpos_successor() and bpos_predecessor() now include the snapshot field
* Keys in btrees that will be using snapshots (extents, inodes, dirents and xattrs) now always have their snapshot field set to U32_MAX

The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that determines whether we're iterating over keys in all snapshots or not - internally, this controls whether bkey_(successor|predecessor) increment/decrement the snapshot field, or only the higher bits of the key.

We add a new member to struct btree_iter, iter->snapshot: when BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always equal iter->snapshot, which will be 0 for btrees that don't use snapshots, and always U32_MAX for btrees that will use snapshots (until we enable snapshot creation).

This patch also introduces a new metadata version number, and compat code for reading from/writing to older versions - this isn't a forced upgrade (yet).
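To make the new semantics concrete, a small illustrative example (the position values are made up; the helpers are the ones this patch adds below, and iterators without BTREE_ITER_ALL_SNAPSHOTS use the nosnap variants and then restore iter->snapshot):

	/*
	 * Illustration only, with p = SPOS(1, 10, 5), written inode:offset:snapshot:
	 *
	 *   bpos_successor(p)        -> 1:10:6   snapshot (the low bits) bumps first
	 *   bpos_nosnap_successor(p) -> 1:11:0   snapshot cleared, offset carries instead
	 *
	 * So an iterator without BTREE_ITER_ALL_SNAPSHOTS steps over every snapshot
	 * version of a position at once, while one with the flag set visits each
	 * snapshot individually.
	 */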
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 24 +++++------ fs/bcachefs/bkey.c | 17 +++++--- fs/bcachefs/bkey.h | 42 +++++++++++++++----- fs/bcachefs/bkey_methods.c | 36 +++++++++++++++-- fs/bcachefs/bset.c | 2 +- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_gc.c | 8 ++-- fs/bcachefs/btree_io.c | 12 ++---- fs/bcachefs/btree_io.h | 26 ++++++++++-- fs/bcachefs/btree_iter.c | 79 ++++++++++++++++++++++++++++++++----- fs/bcachefs/btree_iter.h | 3 ++ fs/bcachefs/btree_types.h | 16 +++++++- fs/bcachefs/btree_update_interior.c | 12 ++++-- fs/bcachefs/btree_update_leaf.c | 14 +++++-- fs/bcachefs/debug.c | 6 ++- fs/bcachefs/extents.c | 7 ++-- fs/bcachefs/fsck.c | 1 + fs/bcachefs/inode.c | 1 + fs/bcachefs/io.c | 5 +++ fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/recovery.c | 8 ++++ fs/bcachefs/tests.c | 1 + 22 files changed, 251 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 111f7d3c312e..2172d3cf3680 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -142,19 +142,18 @@ struct bpos { #define KEY_SNAPSHOT_MAX ((__u32)~0U) #define KEY_SIZE_MAX ((__u32)~0U) -static inline struct bpos POS(__u64 inode, __u64 offset) +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) { - struct bpos ret; - - ret.inode = inode; - ret.offset = offset; - ret.snapshot = 0; - - return ret; + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; } -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ struct bch_val { @@ -1208,7 +1207,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_max = 12, + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_max = 13, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1749,7 +1749,7 @@ struct btree_node { /* Closed interval: */ struct bpos min_key; struct bpos max_key; - struct bch_extent_ptr ptr; + struct bch_extent_ptr _ptr; /* not used anymore */ struct bkey_format format; union { diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 8b2befac95d4..a0379f980f7e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -617,15 +617,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) return "incorrect number of fields"; for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > 64) + if (f->bits_per_field[i] > unpacked_bits) return "field too large"; - if (field_offset && - (f->bits_per_field[i] == 64 || - (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < - field_offset))) + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return "offset + bits overflow"; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) return "offset + bits overflow"; bits += f->bits_per_field[i]; @@ -1126,11 +1130,12 @@ void bch2_bkey_pack_test(void) struct bkey_packed p; struct 
bkey_format test_format = { - .key_u64s = 2, + .key_u64s = 3, .nr_fields = BKEY_NR_FIELDS, .bits_per_field = { 13, 64, + 32, }, }; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index df23c5b48969..72b4267031d8 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -258,24 +258,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format) format->bits_per_field[BKEY_FIELD_SNAPSHOT]; } -static inline struct bpos bkey_successor(struct bpos p) +static inline struct bpos bpos_successor(struct bpos p) { - struct bpos ret = p; + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); - if (!++ret.offset) - BUG_ON(!++ret.inode); + return p; +} - return ret; +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; } -static inline struct bpos bkey_predecessor(struct bpos p) +static inline struct bpos bpos_nosnap_successor(struct bpos p) { - struct bpos ret = p; + p.snapshot = 0; - if (!ret.offset--) - BUG_ON(!ret.inode--); + if (!++p.offset && + !++p.inode) + BUG(); - return ret; + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; } static inline u64 bkey_start_offset(const struct bkey *k) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e7eadeb3b57..6fe95b802e13 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -119,9 +119,16 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return "nonzero size field"; } - if (k.k->p.snapshot) + if (type != BKEY_TYPE_btree && + !btree_type_has_snapshots(type) && + k.k->p.snapshot) return "nonzero snapshot"; + if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && + k.k->p.snapshot != U32_MAX) + return "invalid snapshot field"; + if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) return "POS_MAX key"; @@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, const struct bkey_ops *ops; struct bkey uk; struct bkey_s u; + unsigned nr_compat = 5; int i; /* * Do these operations in reverse order in the write path: */ - for (i = 0; i < 4; i++) - switch (!write ? i : 3 - i) { + for (i = 0; i < nr_compat; i++) + switch (!write ? i : nr_compat - 1 - i) { case 0: if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_key(f, k); @@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 5746199dfafb..de4dc2fac1d6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1438,7 +1438,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * to the search key is going to have 0 sectors after the search key. 
* * But this does mean that we can't just search for - * bkey_successor(start_of_range) to get the first extent that overlaps with + * bpos_successor(start_of_range) to get the first extent that overlaps with * the range we want - if we're unlucky and there's an extent that ends * exactly where we searched, then there could be a deleted key at the same * position and we'd get that when we search instead of the preceding extent diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 63b8423fa87c..85ac08b9270a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1018,7 +1018,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - if (bpos_cmp(bkey_successor(n1->key.k.p), + if (bpos_cmp(bpos_successor(n1->key.k.p), n2->data->min_key)) { char buf1[200], buf2[200]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2710e4b35da3..842840664562 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos node_end = b->data->max_key; struct bpos expected_start = bkey_deleted(&prev->k->k) ? node_start - : bkey_successor(prev->k->k.p); + : bpos_successor(prev->k->k.p); char buf1[200], buf2[200]; bool update_min = false; bool update_max = false; @@ -1187,7 +1187,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -1405,7 +1407,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = bkey_successor(n1->data->max_key); + n2->data->min_key = bpos_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 468b1a294ce9..bc09f9377425 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -612,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &bn->ptr; - - *p = swab64(*p); - } - if (!write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); @@ -1328,8 +1322,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) return -1; - ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: - validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1482,7 +1476,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index f155a6cc1755..9c14cd30a09e 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -189,8 +189,8 @@ void 
bch2_btree_flush_all_writes(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) { @@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, swap(f->field_offset[BKEY_FIELD_INODE], f->field_offset[BKEY_FIELD_OFFSET]); } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 0 + : U32_MAX - max_packed; + } } static inline void compat_bpos(unsigned level, enum btree_id btree_id, @@ -222,16 +232,24 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, btree_node_type_is_extents(btree_id) && bpos_cmp(bn->min_key, POS_MIN) && write) - bn->min_key = bkey_predecessor(bn->min_key); + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && bpos_cmp(bn->min_key, POS_MIN) && !write) - bn->min_key = bkey_successor(bn->min_key); + bn->min_key = bpos_nosnap_successor(bn->min_key); } #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8c923aa01ea1..972486a1f724 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -18,6 +18,36 @@ static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? 
*/ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -30,7 +60,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_IS_EXTENTS) && bkey_cmp(pos, POS_MAX)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); return pos; } @@ -591,10 +621,24 @@ err: static void bch2_btree_iter_verify(struct btree_iter *iter) { + enum btree_iter_type type = btree_iter_type(iter); unsigned i; EBUG_ON(iter->btree_id >= BTREE_ID_NR); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type == BTREE_ITER_NODES && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type != BTREE_ITER_NODES && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) @@ -605,6 +649,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { enum btree_iter_type type = btree_iter_type(iter); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + BUG_ON((type == BTREE_ITER_KEYS || type == BTREE_ITER_CACHED) && (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || @@ -1434,7 +1481,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); + btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); /* Unlock to avoid screwing up our lock invariants: */ btree_node_unlock(iter, iter->level); @@ -1508,7 +1555,7 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) bool ret = bpos_cmp(pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) bool ret = bpos_cmp(pos, POS_MIN) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_predecessor(pos); + pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1535,7 +1582,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) * btree, in that case we want iter->pos to reflect that: */ if (ret) - btree_iter_set_search_pos(iter, bkey_successor(next_pos)); + btree_iter_set_search_pos(iter, bpos_successor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MAX); @@ -1548,7 +1595,7 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) bool ret = bpos_cmp(next_pos, POS_MIN) != 0; if (ret) - btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); + btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MIN); @@ -1594,13 +1641,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi k = btree_iter_level_peek(iter, &iter->l[0]); if (next_update && - bkey_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) k = bkey_i_to_s_c(next_update); if (likely(k.k)) { if (bkey_deleted(k.k)) { 
btree_iter_set_search_pos(iter, - bkey_successor(k.k->p)); + bkey_successor(iter, k.k->p)); continue; } @@ -1739,7 +1786,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); } pos = iter->pos; @@ -1973,6 +2020,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, { struct btree_iter *iter, *best = NULL; + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) + pos.snapshot = btree_type_has_snapshots(btree_id) + ? U32_MAX : 0; + /* We always want a fresh iterator for node iterators: */ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) goto alloc_iter; @@ -2007,11 +2062,14 @@ alloc_iter: if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS)) + !(flags & BTREE_ITER_NOT_EXTENTS) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) flags |= BTREE_ITER_IS_EXTENTS; iter->flags = flags; + iter->snapshot = pos.snapshot; + if (!(iter->flags & BTREE_ITER_INTENT)) bch2_btree_iter_downgrade(iter); else if (!iter->locks_want) @@ -2034,6 +2092,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, __bch2_trans_get_iter(trans, btree_id, pos, BTREE_ITER_NODES| BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| flags); unsigned i; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 176661b3b879..7585f989ad50 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *); static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bcd8db34d7ee..0bcf17159744 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -216,6 +216,7 @@ enum btree_iter_type { #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) #define BTREE_ITER_NOT_EXTENTS (1 << 11) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -245,6 +246,8 @@ struct btree_iter { /* what we're searching for/what the iterator actually points to: */ struct bpos real_pos; struct bpos pos_after_commit; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; u16 flags; u8 idx; @@ -329,7 +332,7 @@ struct bkey_cached { struct btree_insert_entry { unsigned trigger_flags; u8 bkey_type; - u8 btree_id; + enum btree_id btree_id:8; u8 level; unsigned trans_triggers_run:1; unsigned is_extent:1; @@ -610,6 +613,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} + enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c index ddb0d03e268c..aad262937645 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; } - next_node = bkey_successor(k.k->p); + next_node = bpos_successor(k.k->p); } #endif } @@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); @@ -1100,6 +1099,7 @@ static struct btree *__btree_split_node(struct btree_update *as, struct btree *n2; struct bset *set1, *set2; struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; + struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); bch2_btree_update_add_new_node(as, n2); @@ -1146,8 +1146,12 @@ static struct btree *__btree_split_node(struct btree_update *as, n1->nr.packed_keys = nr_packed; n1->nr.unpacked_keys = nr_unpacked; - btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, bkey_successor(n1->key.k.p)); + n1_pos = bkey_unpack_pos(n1, prev); + if (as->c->sb.version < bcachefs_metadata_version_snapshot) + n1_pos.snapshot = U32_MAX; + + btree_set_max(n1, n1_pos); + btree_set_min(n2, bpos_successor(n1->key.k.p)); bch2_bkey_format_init(&s); bch2_bkey_format_add_pos(&s, n2->data->min_key); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a32c8f34039c..88da89e8b170 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs *c = trans->c; - BUG_ON(bch2_debug_check_bkeys && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); - BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); + if (bch2_debug_check_bkeys) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + panic("invalid bkey %s on insert: %s\n", buf, invalid); + } + } + BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 059972e5a124..111310344cec 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); k = bch2_btree_iter_peek(iter); while (k.k && !(err = bkey_err(k))) { @@ -290,7 +292,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * all nodes, meh */ i->from = bpos_cmp(POS_MAX, b->key.k.p) - ? bkey_successor(b->key.k.p) + ? 
bpos_successor(b->key.k.p) : b->key.k.p; if (!i->size) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7ac3d7587655..1f28dea26ca2 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -179,7 +179,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (bp.v->min_key.snapshot) + if (c->sb.version < bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) return "invalid min_key.snapshot"; return bch2_bkey_ptrs_invalid(c, k); @@ -211,8 +212,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, btree_node_type_is_extents(btree_id) && bkey_cmp(bp.v->min_key, POS_MIN)) bp.v->min_key = write - ? bkey_predecessor(bp.v->min_key) - : bkey_successor(bp.v->min_key); + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } /* KEY_TYPE_extent: */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ffb30ef7ef00..a3acae0ddfa9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1318,6 +1318,7 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f676daf404a2..7044ab73831c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(trans->c, inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5ee9a6c2f4fd..9c46f67c0d8e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -332,6 +332,9 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta || new_i_size) { bch2_inode_pack(trans->c, &inode_p, &inode_u); + + inode_p.inode.k.p.snapshot = iter->snapshot; + bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } @@ -447,6 +450,8 @@ int bch2_write_index_default(struct bch_write_op *op) k = bch2_keylist_front(keys); + k->k.p.snapshot = iter->snapshot; + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7783a874640a..4ab9cebee218 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1449,7 +1449,7 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) validate_before_checksum = true; if (validate_before_checksum && diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 596f7c1e4245..a3a6abb88d6f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); + ret = -EINVAL; + goto err; + + } + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; @@ -1340,6 +1347,7 @@ int bch2_fs_initialize(struct bch_fs *c) 
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; bch2_inode_pack(c, &packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_inodes, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 286587a118fe..3de48c593963 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -483,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; ret = __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); -- cgit From ab2a29ccffd0e9fe62afb8bbd45e1709f9726942 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Mar 2021 18:35:30 -0500 Subject: bcachefs: Inode backpointers This patch adds two new inode fields, bi_dir and bi_dir_offset, that point back to the inode's dirent. Since we're only adding fields for a single backpointer, files that have been hardlinked won't necessarily have valid backpointers: we also add a new inode flag, BCH_INODE_BACKPTR_UNTRUSTED, that's set if an inode has ever had multiple links to it. That's ok, because we only really need this functionality for directories, which can never have multiple hardlinks - when we add subvolumes, we'll need a way to enemurate and print subvolumes, and this will let us reconstruct a path to a subvolume root given a subvolume root inode. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 9 ++++-- fs/bcachefs/dirent.c | 18 ++++++++---- fs/bcachefs/dirent.h | 6 ++-- fs/bcachefs/fs-common.c | 68 +++++++++++++++++++++++++++++++++---------- fs/bcachefs/fsck.c | 43 +++++++++++++++++++++++++++ fs/bcachefs/inode.c | 18 ++++-------- fs/bcachefs/inode.h | 3 +- 7 files changed, 125 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2172d3cf3680..f2b5f5c06ee0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -710,7 +710,9 @@ struct bch_inode_generation { x(bi_foreground_target, 16) \ x(bi_background_target, 16) \ x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -746,6 +748,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; @@ -758,6 +761,7 @@ enum { #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) +#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -1208,7 +1212,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_bkey_renumber = 10, bcachefs_metadata_version_inode_btree_change = 11, bcachefs_metadata_version_snapshot = 12, - bcachefs_metadata_version_max = 13, + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_max = 14, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 592dd80cf963..cf4ce2e7f29c 100644 --- a/fs/bcachefs/dirent.c 
+++ b/fs/bcachefs/dirent.c @@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, int bch2_dirent_create(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - int flags) + u64 *dir_offset, int flags) { struct bkey_i_dirent *dirent; int ret; @@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans, if (ret) return ret; - return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + + return ret; } static void dirent_copy_target(struct bkey_i_dirent *dst, @@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, int bch2_dirent_rename(struct btree_trans *trans, u64 src_dir, struct bch_hash_info *src_hash, u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, - const struct qstr *dst_name, u64 *dst_inum, + const struct qstr *src_name, u64 *src_inum, u64 *src_offset, + const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { struct btree_iter *src_iter = NULL, *dst_iter = NULL; @@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans, new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); - goto out; + goto out_set_offset; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans, bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); +out_set_offset: + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; out: bch2_trans_iter_put(trans, src_iter); bch2_trans_iter_put(trans, dst_iter); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 34769371dd13..e1d8ce377d43 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_create(struct btree_trans *, u64, const struct bch_hash_info *, u8, - const struct qstr *, u64, int); + const struct qstr *, u64, u64 *, int); int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, @@ -46,8 +46,8 @@ enum bch_rename_mode { int bch2_dirent_rename(struct btree_trans *, u64, struct bch_hash_info *, u64, struct bch_hash_info *, - const struct qstr *, u64 *, - const struct qstr *, u64 *, + const struct qstr *, u64 *, u64 *, + const struct qstr *, u64 *, u64 *, enum bch_rename_mode); struct btree_iter * diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 503ce1920f39..83c2168ce480 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, { struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL; + struct btree_iter *inode_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); @@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode); + inode_iter = bch2_inode_create(trans, new_inode); + ret = PTR_ERR_OR_ZERO(inode_iter); if 
(ret) goto err; @@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(new_inode->bi_mode), name, new_inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; } + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, inode_iter, new_inode); err: + bch2_trans_iter_put(trans, inode_iter); bch2_trans_iter_put(trans, dir_iter); return ret; } @@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); @@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); + inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ret = PTR_ERR_OR_ZERO(dir_iter); if (ret) @@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum, BCH_HASH_SET_MUST_CREATE) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + inode_u->bi_dir = dir_inum; + inode_u->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); err: bch2_trans_iter_put(trans, dir_iter); @@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(trans->c); + u64 inum, now = bch2_current_time(c); struct bkey_s_c k; int ret; @@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, name, BTREE_ITER_INTENT); @@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans, const struct qstr *dst_name, enum bch_rename_mode mode) { + struct bch_fs *c = trans->c; struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + u64 src_inode, src_offset, dst_inode, dst_offset; + u64 now = bch2_current_time(c); int ret; src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, @@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - src_hash = bch2_hash_info_init(trans->c, 
src_dir_u); + src_hash = bch2_hash_info_init(c, src_dir_u); if (dst_dir != src_dir) { dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, @@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + dst_hash = bch2_hash_info_init(c, dst_dir_u); } else { dst_dir_u = src_dir_u; dst_hash = src_hash; @@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, - dst_name, &dst_inode, + src_name, &src_inode, &src_offset, + dst_name, &dst_inode, &dst_offset, mode); if (ret) goto err; @@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; + + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } + } + if (mode == BCH_RENAME_OVERWRITE) { if (S_ISDIR(src_inode_u->bi_mode) != S_ISDIR(dst_inode_u->bi_mode)) { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a3acae0ddfa9..d65b3e100f78 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -675,6 +675,39 @@ retry: continue; } + if (!target.bi_nlink && + !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (target.bi_dir != k.k->p.inode || + target.bi_dir_offset != k.k->p.offset) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset) || + c->opts.version_upgrade)) { + struct bkey_inode_buf p; + + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + bch2_trans_unlock(&trans); + + bch2_inode_pack(c, &p, &target); + + ret = bch2_btree_insert(c, BTREE_ID_inodes, + &p.inode.k_i, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error in fsck: error %i updating inode", ret); + goto err; + } + continue; + } + if (fsck_err_on(have_target && d.v->d_type != mode_to_type(target.bi_mode), c, @@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (!S_ISDIR(u.bi_mode) && + u.bi_nlink && + !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || + c->opts.version_upgrade)) { + u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + if (do_update) { struct bkey_inode_buf p; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7044ab73831c..b72b3578bbe2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -470,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u) +struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u) { struct bch_fs *c = trans->c; - struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, *hint; @@ -494,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans, if (start >= max || start < min) start = min; - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); again: for_each_btree_key(trans, iter, 
BTREE_ID_inodes, POS(0, start), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { @@ -521,7 +516,7 @@ again: bch2_trans_iter_put(trans, iter); if (ret) - return ret; + return ERR_PTR(ret); if (start != min) { /* Retry from start */ @@ -529,15 +524,12 @@ again: goto again; } - return -ENOSPC; + return ERR_PTR(-ENOSPC); found_slot: *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - - ret = bch2_inode_write(trans, iter, inode_u); - bch2_trans_iter_put(trans, iter); - return ret; + return iter; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 1caf036ae928..6bad6dfb7989 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); +struct btree_iter *bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64, bool); -- cgit From e6ae27272491afec1994c31eee4744f19ce3b3f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Mar 2021 19:18:30 -0400 Subject: bcachefs: Change inode allocation code for snapshots For snapshots, when we allocate a new inode we want to allocate an inode number that isn't in use in any other subvolume. We won't be able to use ITER_SLOTS for this, inode allocation needs to change to use BTREE_ITER_ALL_SNAPSHOTS. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 6 +++- fs/bcachefs/inode.c | 78 ++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/inode.h | 2 +- 3 files changed, 61 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 83c2168ce480..281a6135e599 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -36,7 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - inode_iter = bch2_inode_create(trans, new_inode); + inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; @@ -80,6 +80,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, new_inode->bi_dir_offset = dir_offset; } + /* XXX use bch2_btree_iter_set_snapshot() */ + inode_iter->snapshot = U32_MAX; + bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + ret = bch2_inode_write(trans, inode_iter, new_inode); err: bch2_trans_iter_put(trans, inode_iter); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b72b3578bbe2..e650c2a0d7d7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -471,12 +471,13 @@ static inline u32 bkey_generation(struct bkey_s_c k) } struct btree_iter *bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u) + struct bch_inode_unpacked *inode_u, + u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 min, max, start, *hint; + u64 min, max, start, pos, *hint; int ret; u64 cpu = raw_smp_processor_id(); @@ -493,39 +494,70 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, if (start >= max || start < min) start = min; + + pos = start; + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: - for_each_btree_key(trans, iter, 
BTREE_ID_inodes, POS(0, start), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(k.k->p, POS(0, max)) < 0) { + while (pos < iter->pos.offset) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) + goto found_slot; + + pos++; + } + + if (k.k->p.snapshot == snapshot && + k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { + bch2_btree_iter_next(iter); + continue; + } /* - * There's a potential cache coherency issue with the btree key - * cache code here - we're iterating over the btree, skipping - * that cache. We should never see an empty slot that isn't - * actually empty due to a pending update in the key cache - * because the update that creates the inode isn't done with a - * cached iterator, but - better safe than sorry, check the - * cache before using a slot: + * We don't need to iterate over keys in every snapshot once + * we've found just one: */ - if (k.k->type != KEY_TYPE_inode && - !bch2_btree_key_cache_find(c, BTREE_ID_inodes, iter->pos)) + pos = iter->pos.offset + 1; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + } + + while (!ret && pos < max) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) goto found_slot; + + pos++; } - bch2_trans_iter_put(trans, iter); + if (!ret && start == min) + ret = -ENOSPC; - if (ret) + if (ret) { + bch2_trans_iter_put(trans, iter); return ERR_PTR(ret); - - if (start != min) { - /* Retry from start */ - start = min; - goto again; } - return ERR_PTR(-ENOSPC); + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + goto again; found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); + } + + /* We may have raced while the iterator wasn't pointing at pos: */ + if (k.k->type == KEY_TYPE_inode || + bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) + goto again; + *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 6bad6dfb7989..23c322d9a85b 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *); + struct bch_inode_unpacked *, u32); int bch2_inode_rm(struct bch_fs *, u64, bool); -- cgit From 883d9701f1589461ae9c9214303a8c175ffb79c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 18:08:10 -0400 Subject: bcachefs: Don't use bch2_inode_find_by_inum() in move.c Since move.c isn't aware of what subvolume we're in, we can't use the standard inode lookup code - fortunately, we're just using it for reading IO options. 
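For reference, a condensed and purely illustrative sketch of what that lookup boils down to (the real helper, lookup_inode(), is in the diff below; the local variable names here are invented):

	/*
	 * Peek the inodes btree at the extent's own snapshot instead of going
	 * through bch2_inode_find_by_inum(), which has no idea which
	 * subvolume/snapshot it should be looking in:
	 */
	struct btree_iter *inode_iter =
		bch2_trans_get_iter(trans, BTREE_ID_inodes,
				    SPOS(0, k.k->p.inode, k.k->p.snapshot),
				    BTREE_ITER_ALL_SNAPSHOTS);
	struct bkey_s_c inode_k = bch2_btree_iter_peek(inode_iter);
	int ret = bkey_err(inode_k);

	if (!ret)
		ret = inode_k.k->type == KEY_TYPE_inode
			? bch2_inode_unpack(bkey_s_c_to_inode(inode_k), &inode_u)
			: -EIO;
	bch2_trans_iter_put(trans, inode_iter);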
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 87307670fd4a..3036db599e7b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -508,6 +508,32 @@ err: return ret; } +static int lookup_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + if (ret) + goto err; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int __bch2_move_data(struct bch_fs *c, struct moving_context *ctxt, struct bch_ratelimit *rate, @@ -565,7 +591,7 @@ static int __bch2_move_data(struct bch_fs *c, try_to_freeze(); } } while (delay); -peek: + k = bch2_btree_iter_peek(iter); stats->pos = iter->pos; @@ -585,14 +611,18 @@ peek: cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; - /* don't hold btree locks while looking up inode: */ - bch2_trans_unlock(&trans); - io_opts = bch2_opts_to_inode_opts(c->opts); - if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) + + ret = lookup_inode(&trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (ret == -EINTR) + continue; + + if (!ret) bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); + cur_inum = k.k->p.inode; - goto peek; } switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { -- cgit From c5f51cdd5f1c0368c73637bea045d6d20c6f87c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Mar 2021 20:57:59 -0400 Subject: bcachefs: Have journal reclaim thread flush more aggressively This adds a new watermark for the journal reclaim when flushing btree key cache entries - it should try and stay ahead of where foreground threads doing transaction commits will enter direct journal reclaim. 
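The idea is that this watermark trips earlier than the existing need-flush one, so the background reclaim thread starts flushing before foreground transaction commits hit the must-wait threshold and drop into direct reclaim themselves. Condensed from the helper added below, background reclaim wants to flush once more than a quarter of the cached btree keys are dirty:

    static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c)
    {
            size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
            size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
            size_t max_dirty = nr_keys / 4;

            return max_t(ssize_t, 0, nr_dirty - max_dirty);
    }

__bch2_journal_reclaim() then uses this count (instead of the need-flush count) as a floor on how many key cache entries to flush per pass.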
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++++ fs/bcachefs/journal_reclaim.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 02715cd258ab..4e1e5a9c7656 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,6 +1,15 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = nr_keys / 4; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3957312d12f2..17af2bbeadee 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -602,7 +602,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; - min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); trace_journal_reclaim_start(c, min_nr, -- cgit From a84b6c50f18e197070e35a04252fcc5c0abf2904 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Mar 2021 21:20:22 -0400 Subject: bcachefs: Free iterator in bch2_btree_delete_range_trans() This is specifically to speed up bch2_inode_rm(), so that we're not traversing iterators we're done with. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 88da89e8b170..8d0f469c808b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1187,7 +1187,7 @@ retry: goto retry; } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_free(trans, iter); return ret; } -- cgit From 5f65d74d791d9bc07de05d4fa4ed5bb075e07873 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Mar 2021 00:19:05 -0400 Subject: bcachefs: Add repair code for out of order keys in a btree node. This just drops the offending key - in the bug report where this was seen, it was clearly a single bit memory error, and fsck will fix the missing key. 
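When validate_bset_keys() finds a key that sorts before its predecessor, the error is now downgraded from fatal to fixable, and the fix is simply to splice the offending key out of the bset and keep scanning. Condensed from the hunk below, with the error-message formatting omitted:

    if (prev && bkey_iter_cmp(b, prev, k) > 0 &&
        btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "keys out of order")) {
            /* drop the bad key: shrink the bset and close the gap */
            i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
            memmove_u64s_down(k, bkey_next(k),
                              (u64 *) vstruct_end(i) - (u64 *) k);
            continue;
    }

The patch also drops the old separate-whiteouts handling here: BSET_SEPARATE_WHITEOUTS bsets are now rejected up front in validate_bset(), which lets validate_bset_keys() lose the seen_non_whiteout bookkeeping entirely.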
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index bc09f9377425..fc94782afb60 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -578,6 +578,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); } + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { @@ -660,14 +664,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; - bool seen_non_whiteout = false; int ret = 0; - if (!BSET_SEPARATE_WHITEOUTS(i)) { - seen_non_whiteout = true; - *whiteout_u64s = 0; - } - for (k = i->start; k != vstruct_last(i);) { struct bkey_s u; @@ -719,18 +717,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - /* - * with the separate whiteouts thing (used for extents), the - * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_deleted()... - */ - - if (!seen_non_whiteout && - (!bkey_deleted(k) || - (prev && bkey_iter_cmp(b, prev, k) > 0))) { - *whiteout_u64s = k->_data - i->_data; - seen_non_whiteout = true; - } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); @@ -739,10 +726,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(c, b, i, 0); - btree_err(BTREE_ERR_FATAL, c, NULL, b, i, - "keys out of order: %s > %s", - buf1, buf2); - /* XXX: repair this */ + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", + buf1, buf2)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } } prev = k; -- cgit From 65bcd6579df322d0c6216a534c65ee41eda2b801 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Mar 2021 20:56:25 -0400 Subject: buckets.c fixups XXX squash Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 89 +++++++++++++++++++++++---------------------------- fs/bcachefs/buckets.h | 2 -- 2 files changed, 40 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index df839021fd3d..88deb48a3a37 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -273,10 +273,9 @@ retry: ret->online_reserved = percpu_u64_get(c->online_reserved); - u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(ret, c->usage_base, u64s * sizeof(u64)); + memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); for (i = 0; i < ARRAY_SIZE(c->usage); i++) acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); @@ -439,45 +438,6 @@ static bool bucket_became_unavailable(struct bucket_mark old, !is_available_bucket(new); } -int bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage_online *src, - struct disk_reservation *disk_res, - unsigned journal_seq) -{ - struct 
bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); - s64 added = src->u.data + src->u.reserved; - s64 should_not_have_added; - int ret = 0; - - percpu_rwsem_assert_held(&c->mark_lock); - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); - if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased by %lli more than reservation of %llu", - added, disk_res ? disk_res->sectors : 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); - added -= should_not_have_added; - ret = -1; - } - - if (added > 0) { - disk_res->sectors -= added; - src->online_reserved -= added; - } - - this_cpu_add(*c->online_reserved, src->online_reserved); - - preempt_disable(); - acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); - preempt_enable(); - - return ret; -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -672,7 +632,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -680,13 +639,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - /* - * XXX: this is wrong, this means we'll be doing updates to the percpu - * buckets_alloc counter that don't have an open journal buffer and - * we'll race with the machinery that accumulates that to ca->usage_base - */ - bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); - BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -1435,6 +1387,45 @@ int bch2_mark_update(struct btree_trans *trans, return ret; } +static int bch2_fs_usage_apply(struct bch_fs *c, + struct bch_fs_usage_online *src, + struct disk_reservation *disk_res, + unsigned journal_seq) +{ + struct bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); + s64 added = src->u.data + src->u.reserved; + s64 should_not_have_added; + int ret = 0; + + percpu_rwsem_assert_held(&c->mark_lock); + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); + if (WARN_ONCE(should_not_have_added > 0, + "disk usage increased by %lli more than reservation of %llu", + added, disk_res ? 
disk_res->sectors : 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + ret = -1; + } + + if (added > 0) { + disk_res->sectors -= added; + src->online_reserved -= added; + } + + this_cpu_add(*c->online_reserved, src->online_reserved); + + preempt_disable(); + acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); + preempt_enable(); + + return ret; +} + void bch2_trans_fs_usage_apply(struct btree_trans *trans, struct bch_fs_usage_online *fs_usage) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e53cee27a720..af8cb74d71e0 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -246,8 +246,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, - struct disk_reservation *, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); -- cgit From 24db24c749913f71cd90355528bad522cf197f62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 17:52:52 -0400 Subject: bcachefs: Don't make foreground writes wait behind journal reclaim too long Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 33 +++++++++++++++++++++++++++------ fs/bcachefs/journal.c | 1 + fs/bcachefs/journal_reclaim.c | 6 ++++++ fs/bcachefs/journal_types.h | 1 + 4 files changed, 35 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8d0f469c808b..a19a4a54a1ff 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -586,6 +586,28 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, return 0; } +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret; + + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; + + ret = !bch2_btree_key_cache_must_wait(c); + if (ret) + return ret; + + if (mutex_trylock(&c->journal.reclaim_lock)) { + ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); + } + + if (!ret) + ret = !bch2_btree_key_cache_must_wait(c); + return ret; +} + static noinline int bch2_trans_commit_error(struct btree_trans *trans, struct btree_insert_entry *i, @@ -668,13 +690,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - do { - mutex_lock(&c->journal.reclaim_lock); - ret = bch2_journal_reclaim(&c->journal); - mutex_unlock(&c->journal.reclaim_lock); - } while (!ret && bch2_btree_key_cache_must_wait(c)); + wait_event(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + return ret; - if (!ret && bch2_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; trace_trans_restart_journal_reclaim(trans->ip); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 87623ec8cf47..edbcbe7fb31f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1116,6 +1116,7 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); + init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); diff --git a/fs/bcachefs/journal_reclaim.c 
b/fs/bcachefs/journal_reclaim.c index 17af2bbeadee..3ef42a47f60d 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -604,6 +604,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); + /* Don't do too many without delivering wakeup: */ + min_nr = min(min_nr, 128UL); + trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, @@ -620,6 +623,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) else j->nr_background_reclaim += nr_flushed; trace_journal_reclaim_finish(c, nr_flushed); + + if (nr_flushed) + wake_up(&j->reclaim_wait); } while (min_nr && nr_flushed); memalloc_noreclaim_restore(flags); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8ad10e46dd5d..3db8c3760cca 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -243,6 +243,7 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; + wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; u64 nr_direct_reclaim; -- cgit From acb3b26e767a809baee5cbbf869166c45ee3bca2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 14:42:36 -0400 Subject: bcachefs: Move btree lock debugging to slowpath fn Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 22 +++++++++++++++++----- fs/bcachefs/btree_locking.h | 15 +-------------- 2 files changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 972486a1f724..42113f86e878 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -246,6 +246,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); unsigned reason = 9; + bool ret; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -354,12 +355,23 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (six_trylock_type(&b->c.lock, type)) return true; - if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) - return false; +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking_iter_idx = iter->idx; + trans->locking_pos = pos; + trans->locking_btree_id = iter->btree_id; + trans->locking_level = level; + trans->locking = b; +#endif - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return true; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +#endif + if (ret) + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], + start_time); + return ret; } /* Btree iterator locking: */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cc07ef2938ae..1146dd37adf5 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -186,27 +186,14 @@ static inline bool btree_node_lock(struct btree *b, unsigned long ip) { struct btree_trans *trans = iter->trans; - bool ret; EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = b; - trans->locking_iter_idx = iter->idx; - trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; - trans->locking_level = level; -#endif - ret = likely(six_trylock_type(&b->c.lock, type)) || + return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || 
__bch2_btree_node_lock(b, pos, level, iter, type, should_sleep_fn, p, ip); - -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = NULL; -#endif - return ret; } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); -- cgit From d5a43661a1e9d9448e9e508470deec973c3d6644 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Mar 2021 20:35:46 -0400 Subject: bcachefs: Improve bch2_trans_relock() We're getting away from relying on iter->uptodate - this changes bch2_trans_relock() to more directly specify which iterators should be relocked. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 42113f86e878..b74d79127df0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -504,13 +504,12 @@ void bch2_trans_downgrade(struct btree_trans *trans) bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; - bool ret = true; trans_for_each_iter(trans, iter) - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - ret &= bch2_btree_iter_relock(iter, true); - - return ret; + if (btree_iter_keep(trans, iter) && + !bch2_btree_iter_relock(iter, true)) + return false; + return true; } void bch2_trans_unlock(struct btree_trans *trans) -- cgit From ba5f03d362a8c7a32fd63c54cd3aeea0c9f3d7cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 21:07:37 -0400 Subject: bcachefs: Add a sysfs var for average btree write size Useful number for performance tuning. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/btree_io.c | 3 +++ fs/bcachefs/sysfs.c | 12 ++++++++++++ 3 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 51aefecb5cbb..c5ff142871c7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -796,6 +796,9 @@ mempool_t bio_bounce_pages; struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; + + atomic64_t btree_writes_nr; + atomic64_t btree_writes_sectors; struct bio_list btree_write_error_list; struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fc94782afb60..3b45389a8e06 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1551,6 +1551,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + /* XXX: submitting IO with btree locks held: */ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); bch2_bkey_buf_exit(&k, c); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4b83a98621d7..dd9b54e0d80b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); +read_attribute(btree_avg_write_size); + read_attribute(bucket_quantiles_last_read); read_attribute(bucket_quantiles_last_write); read_attribute(bucket_quantiles_fragmentation); @@ -228,6 +230,14 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } +static size_t bch2_btree_avg_write_size(struct bch_fs *c) +{ + u64 nr = atomic64_read(&c->btree_writes_nr); + u64 sectors = atomic64_read(&c->btree_writes_sectors); + + return nr ? 
div64_u64(sectors, nr) : 0; +} + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); @@ -316,6 +326,7 @@ SHOW(bch2_fs) sysfs_print(block_size, block_bytes(c)); sysfs_print(btree_node_size, btree_bytes(c)); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); sysfs_print(read_realloc_races, atomic_long_read(&c->read_realloc_races)); @@ -507,6 +518,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_block_size, &sysfs_btree_node_size, &sysfs_btree_cache_size, + &sysfs_btree_avg_write_size, &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, -- cgit From e264b2f62a8fdf571e9ca9a741719a9b567573f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 15:21:37 -0400 Subject: bcachefs: Improve bch2_btree_update_start() bch2_btree_update_start() is now responsible for taking gc_lock and upgrading the iterator to lock parent nodes - greatly simplifying error handling and all of the callers. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +- fs/bcachefs/btree_update_interior.c | 329 ++++++++++++------------------------ fs/bcachefs/btree_update_interior.h | 4 +- 3 files changed, 114 insertions(+), 224 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 842840664562..2f93c9cc757d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1336,11 +1336,10 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, return; } - as = bch2_btree_update_start(iter->trans, iter->btree_id, + as = bch2_btree_update_start(iter, old_nodes[0]->c.level, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - NULL); + BTREE_INSERT_USE_RESERVE); if (IS_ERR(as)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_RESERVE_GET); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index aad262937645..b8e37de19d06 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -458,6 +458,10 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + bch2_journal_preres_put(&c->journal, &as->journal_preres); bch2_journal_pin_drop(&c->journal, &as->journal); @@ -893,24 +897,31 @@ void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + bch2_btree_reserve_put(as); continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); } struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, - unsigned nr_nodes, unsigned flags, - struct closure *cl) +bch2_btree_update_start(struct btree_iter *iter, unsigned level, + unsigned nr_nodes, unsigned flags) { + struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_update *as; + struct closure cl; int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) ? 
JOURNAL_RES_GET_RECLAIM : 0; int ret = 0; + closure_init_stack(&cl); +retry: /* * This check isn't necessary for correctness - it's just to potentially * prevent us from doing a lot of work that'll end up being wasted: @@ -919,12 +930,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) return ERR_PTR(ret); + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); + + bch2_trans_unlock(trans); + down_read(&c->gc_lock); + if (!bch2_trans_relock(trans)) { + up_read(&c->gc_lock); + return ERR_PTR(-EINTR); + } + } + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->btree_id = id; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = iter->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -936,8 +971,14 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { - if (flags & BTREE_INSERT_NOUNLOCK) - return ERR_PTR(-EINTR); + /* + * this would be cleaner if bch2_journal_preres_get() took a + * closure argument + */ + if (flags & BTREE_INSERT_NOUNLOCK) { + ret = -EINTR; + goto err; + } bch2_trans_unlock(trans); @@ -945,7 +986,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) - return ERR_PTR(ret); + goto err; if (!bch2_trans_relock(trans)) { ret = -EINTR; @@ -960,7 +1001,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); if (ret) goto err; @@ -975,6 +1017,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, return as; err: bch2_btree_update_free(as); + + if (ret == -EAGAIN) { + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + + bch2_trans_unlock(trans); + closure_sync(&cl); + ret = -EINTR; + } + + if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; + return ERR_PTR(ret); } @@ -1419,6 +1473,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + lockdep_assert_held(&c->gc_lock); BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); @@ -1466,67 +1521,17 @@ split: int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { - struct btree_trans *trans = iter->trans; struct btree *b = iter_l(iter)->b; struct btree_update *as; - struct closure cl; - int ret = 0; - - closure_init_stack(&cl); - - /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - return -EINTR; - } - - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(trans)) - ret = -EINTR; - } - - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - - as = bch2_btree_update_start(trans, iter->btree_id, - btree_update_reserve_required(c, b), flags, - !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - if (ret == -EAGAIN) { - BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_trans_unlock(trans); - ret = -EINTR; - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - } - goto out; - } + as = bch2_btree_update_start(iter, iter->level, + btree_update_reserve_required(c, b), flags); + if (IS_ERR(as)) + return PTR_ERR(as); btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - - /* - * We haven't successfully inserted yet, so don't downgrade all the way - * back to read locks; - */ - __bch2_btree_iter_downgrade(iter, 1); -out: - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - closure_sync(&cl); - return ret; + return 0; } void __bch2_foreground_maybe_merge(struct bch_fs *c, @@ -1541,13 +1546,10 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, struct bkey_format new_f; struct bkey_i delete; struct btree *b, *m, *n, *prev, *next, *parent; - struct closure cl; size_t sib_u64s; int ret = 0; BUG_ON(!btree_node_locked(iter, level)); - - closure_init_stack(&cl); retry: BUG_ON(!btree_node_locked(iter, level)); @@ -1605,25 +1607,15 @@ retry: goto out; } - /* We're changing btree topology, doesn't mix with gc: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) - goto err_cycle_gc_lock; - - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - ret = -EINTR; - goto err_unlock; - } - - as = bch2_btree_update_start(trans, iter->btree_id, + as = bch2_btree_update_start(iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - goto err_unlock; + BTREE_INSERT_USE_RESERVE); + ret = PTR_ERR_OR_ZERO(as); + if (ret) { + six_unlock_intent(&m->c.lock); + goto err; } trace_btree_merge(c, b); @@ -1671,9 +1663,6 @@ retry: six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); out: bch2_btree_trans_verify_locks(trans); @@ -1686,58 +1675,52 @@ out: * split path, and downgrading to read locks in there is potentially * confusing: */ - closure_sync(&cl); return; - -err_cycle_gc_lock: - six_unlock_intent(&m->c.lock); - - if (flags & BTREE_INSERT_NOUNLOCK) - goto out; - - bch2_trans_unlock(trans); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; - goto err; - -err_unlock: - six_unlock_intent(&m->c.lock); - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); err: BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); - if ((ret == -EAGAIN || ret == -EINTR) && - !(flags & BTREE_INSERT_NOUNLOCK)) { + if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { bch2_trans_unlock(trans); - closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; - - goto retry; + if (!ret) + goto retry; } goto out; } -static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, unsigned flags, - struct closure *cl) +/** + * bch_btree_node_rewrite - Rewrite/move a btree node + */ +int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + __le64 seq, unsigned flags) { - struct btree *n, *parent = btree_node_parent(iter, b); + struct btree *b, *n, *parent; struct btree_update *as; + int ret; + + flags |= BTREE_INSERT_NOFAIL; +retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + b = bch2_btree_iter_peek_node(iter); + if (!b || b->data->keys.seq != seq) + goto out; - as = bch2_btree_update_start(iter->trans, iter->btree_id, + parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, b->c.level, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, - flags, cl); - if (IS_ERR(as)) { + flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret == -EINTR) + goto retry; + if (ret) { trace_btree_gc_rewrite_node_fail(c, b); - return PTR_ERR(as); + goto out; } bch2_btree_interior_update_will_free_node(as, b); @@ -1768,60 +1751,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - return 0; -} - -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - * - * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
- * btree_check_reserve() has to wait) - */ -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - __le64 seq, unsigned flags) -{ - struct btree_trans *trans = iter->trans; - struct closure cl; - struct btree *b; - int ret; - - flags |= BTREE_INSERT_NOFAIL; - - closure_init_stack(&cl); - - bch2_btree_iter_upgrade(iter, U8_MAX); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - } - } - - while (1) { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - b = bch2_btree_iter_peek_node(iter); - if (!b || b->data->keys.seq != seq) - break; - - ret = __btree_node_rewrite(c, iter, b, flags, &cl); - if (ret != -EAGAIN && - ret != -EINTR) - break; - - bch2_trans_unlock(trans); - closure_sync(&cl); - } - +out: bch2_btree_iter_downgrade(iter); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - - closure_sync(&cl); return ret; } @@ -1892,71 +1823,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as = NULL; struct btree *new_hash = NULL; struct closure cl; - int ret; + int ret = 0; closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) - return -EINTR; - - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(iter->trans); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } - } - /* * check btree_ptr_hash_val() after @b is locked by * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); closure_sync(&cl); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } + if (!bch2_trans_relock(iter->trans)) + return -EINTR; } new_hash = bch2_btree_node_mem_alloc(c); } -retry: - as = bch2_btree_update_start(iter->trans, iter->btree_id, - parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL, &cl); + as = bch2_btree_update_start(iter, b->c.level, + parent ? 
btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL); if (IS_ERR(as)) { ret = PTR_ERR(as); - if (ret == -EAGAIN) - ret = -EINTR; - - if (ret == -EINTR) { - bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); - closure_sync(&cl); - down_read(&c->gc_lock); - - if (bch2_trans_relock(iter->trans)) - goto retry; - } - goto err; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); - if (ret) - goto err_free_update; - __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); bch2_btree_iter_downgrade(iter); @@ -1969,12 +1863,9 @@ err: six_unlock_write(&new_hash->c.lock); six_unlock_intent(&new_hash->c.lock); } - up_read(&c->gc_lock); closure_sync(&cl); + bch2_btree_cache_cannibalize_unlock(c); return ret; -err_free_update: - bch2_btree_update_free(as); - goto err; } /* Init code: */ diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 45d212730fd7..2a6b51ece0f8 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -48,6 +48,7 @@ struct btree_update { } mode; unsigned nodes_written:1; + unsigned took_gc_lock:1; enum btree_id btree_id; @@ -120,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, void bch2_btree_update_done(struct btree_update *); struct btree_update * -bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, - unsigned, struct closure *); +bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); -- cgit From 1259cc31b23221feae5e49cc02eaf7cfb7df9f54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 15:39:16 -0400 Subject: bcachefs: Change where merging of interior btree nodes is trigger from Previously, we were doing btree node merging from bch2_btree_insert_node() - but this is called from the split path, when we're in the middle of creating new nodes and deleting new nodes and the iterators are in a weird state. Also, this means we're starting a new btree_update while in the middle of an existing one, and that's asking for deadlocks. Much simpler and saner to trigger btree node merging _after_ the whole btree node split path is finished. 
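Mechanically, the merge call moves out of bch2_btree_insert_node() - where it ran against iterators that didn't yet know about the new nodes - and into bch2_btree_split_leaf(), which now walks back up the parent levels once the whole split has completed. Condensed from the hunk below:

    /* after bch2_btree_update_done(as): */
    for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
            bch2_foreground_maybe_merge(c, iter, l, flags);

By the time this loop runs the new nodes are fully linked into the iterator, so the merge path's unlock/relock dance is no longer racing the split.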
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b8e37de19d06..fddb0c3e7167 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1505,14 +1505,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_unlock_write(b, iter); btree_node_interior_verify(c, b); - - /* - * when called from the btree_split path the new nodes aren't added to - * the btree iterator yet, so the merge path's unlock/wait/relock dance - * won't work: - */ - bch2_foreground_maybe_merge(c, iter, b->c.level, - flags|BTREE_INSERT_NOUNLOCK); return; split: btree_split(as, b, iter, keys, flags); @@ -1523,6 +1515,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, { struct btree *b = iter_l(iter)->b; struct btree_update *as; + unsigned l; + int ret = 0; as = bch2_btree_update_start(iter, iter->level, btree_update_reserve_required(c, b), flags); @@ -1531,7 +1525,11 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - return 0; + + for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) + bch2_foreground_maybe_merge(c, iter, l, flags); + + return ret; } void __bch2_foreground_maybe_merge(struct bch_fs *c, -- cgit From 54ca47e114c0cccc075a722f528de2b50b149b49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Mar 2021 01:13:31 -0400 Subject: bcachefs: Kill bch2_btree_node_get_sibling() This patch reworks the btree node merge path to use a second btree iterator to get the sibling node - which means bch2_btree_iter_get_sibling() can be deleted. Also, it uses bch2_btree_iter_traverse_all() if necessary - which means it should be more reliable. We don't currently even try to make it work when trans->nounlock is set - after a BTREE_INSERT_NOUNLOCK transaction commit, hopefully this will be a worthwhile tradeoff. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 134 +----------------------------------- fs/bcachefs/btree_cache.h | 3 - fs/bcachefs/btree_update_interior.c | 61 ++++++++++------ 3 files changed, 43 insertions(+), 155 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 85ac08b9270a..2ec668c3427e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -913,136 +913,6 @@ out: return b; } -struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, - struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) -{ - struct btree_trans *trans = iter->trans; - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - struct bkey_buf tmp; - struct btree *ret = NULL; - unsigned level = b->c.level; - - bch2_bkey_buf_init(&tmp); - - parent = btree_iter_node(iter, level + 1); - if (!parent) - return NULL; - - /* - * There's a corner case where a btree_iter might have a node locked - * that is just outside its current pos - when - * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
- * - * But the lock ordering checks in __bch2_btree_node_lock() go off of - * iter->pos, not the node's key: so if the iterator is marked as - * needing to be traversed, we risk deadlock if we don't bail out here: - */ - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return ERR_PTR(-EINTR); - - if (!bch2_btree_node_relock(iter, level + 1)) { - ret = ERR_PTR(-EINTR); - goto out; - } - - node_iter = iter->l[parent->c.level].iter; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - k = sib == btree_prev_sib - ? bch2_btree_node_iter_prev(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek(&node_iter, parent)); - if (!k) - goto out; - - bch2_bkey_buf_unpack(&tmp, c, parent, k); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { - struct btree_iter *linked; - - if (!bch2_btree_node_relock(iter, level + 1)) - goto out; - - /* - * We might have got -EINTR because trylock failed, and we're - * holding other locks that would cause us to deadlock: - */ - trans_for_each_iter(trans, linked) - if (btree_iter_lock_cmp(iter, linked) < 0) - __bch2_btree_iter_unlock(linked); - - if (sib == btree_prev_sib) - btree_node_unlock(iter, level); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - /* - * before btree_iter_relock() calls btree_iter_verify_locks(): - */ - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); - - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->c.lock); - ret = ERR_PTR(-EINTR); - } - } - - bch2_trans_relock(trans); - } -out: - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR) - bch2_btree_iter_upgrade(iter, level + 2); - - BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); - - if (!IS_ERR_OR_NULL(ret)) { - struct btree *n1 = ret, *n2 = b; - - if (sib != btree_prev_sib) - swap(n1, n2); - - if (bpos_cmp(bpos_successor(n1->key.k.p), - n2->data->min_key)) { - char buf1[200], buf2[200]; - - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); - - bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" - "prev: %s\n" - "next: %s\n", - bch2_btree_ids[iter->btree_id], level, - buf1, buf2); - - six_unlock_intent(&ret->c.lock); - ret = NULL; - } - } - - bch2_btree_trans_verify_locks(trans); - - bch2_bkey_buf_exit(&tmp, c); - - return ret; -} - void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, enum btree_id btree_id, unsigned level) @@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" + " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), + c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, diff --git 
a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 217988696a77..aa8fe4a1b04b 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, enum btree_node_sibling); - void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const struct bkey_i *, enum btree_id, unsigned); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fddb0c3e7167..af7c2df56692 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, enum btree_node_sibling sib) { struct btree_trans *trans = iter->trans; + struct btree_iter *sib_iter = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; struct bkey_i delete; struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; size_t sib_u64s; int ret = 0; + if (trans->nounlock) + return; + BUG_ON(!btree_node_locked(iter, level)); retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; - parent = btree_node_parent(iter, b); - if (!parent) + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + b->sib_u64s[sib] = U16_MAX; goto out; + } - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - goto out; + sib_pos = sib == btree_prev_sib + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); - /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, sib); - if (IS_ERR(m)) { - ret = PTR_ERR(m); + sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, + sib_pos, U8_MAX, level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(sib_iter); + if (ret) goto err; - } - /* NULL means no sibling: */ - if (!m) { + m = sib_iter->l[level].b; + + if (btree_node_parent(iter, b) != + btree_node_parent(sib_iter, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1581,6 +1595,8 @@ retry: next = m; } + BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + bch2_bkey_format_init(&new_s); bch2_bkey_format_add_pos(&new_s, prev->data->min_key); __bch2_btree_calc_format(&new_s, prev); @@ -1598,23 +1614,21 @@ retry: } sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); b->sib_u64s[sib] = sib_u64s; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->c.lock); + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - } + parent = btree_node_parent(iter, b); as = bch2_btree_update_start(iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); ret = PTR_ERR_OR_ZERO(as); - if (ret) { - six_unlock_intent(&m->c.lock); + if (ret) goto err; - } trace_btree_merge(c, b); @@ -1648,6 +1662,7 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_drop(iter, m); @@ -1663,6 +1678,7 @@ retry: bch2_btree_update_done(as); out: 
bch2_btree_trans_verify_locks(trans); + bch2_trans_iter_free(trans, sib_iter); /* * Don't downgrade locks here: we're called after successful insert, @@ -1675,11 +1691,16 @@ out: */ return; err: - BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + bch2_trans_iter_put(trans, sib_iter); + sib_iter = NULL; + + if (ret == -EINTR && bch2_trans_relock(trans)) { + ret = 0; + goto retry; + } if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_trans_unlock(trans); - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse_all(trans); if (!ret) goto retry; } -- cgit From ecab6be7e5c3e19d25a4ad9d5d97c83e3ac67507 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 16:16:39 -0400 Subject: bcachefs: bch2_foreground_maybe_merge() now correctly reports lock restarts This means that btree node splits don't have to automatically trigger a transaction restart. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 24 ++++++++++-------------- fs/bcachefs/btree_update_interior.h | 24 ++++++++++++------------ 2 files changed, 22 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index af7c2df56692..988922699e8b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1527,16 +1527,16 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, bch2_btree_update_done(as); for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) - bch2_foreground_maybe_merge(c, iter, l, flags); + ret = bch2_foreground_maybe_merge(c, iter, l, flags); return ret; } -void __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - unsigned flags, - enum btree_node_sibling sib) +int __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) { struct btree_trans *trans = iter->trans; struct btree_iter *sib_iter = NULL; @@ -1547,10 +1547,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; - int ret = 0; - - if (trans->nounlock) - return; + int ret = 0, ret2 = 0; BUG_ON(!btree_node_locked(iter, level)); retry: @@ -1689,17 +1686,16 @@ out: * split path, and downgrading to read locks in there is potentially * confusing: */ - return; + return ret ?: ret2; err: bch2_trans_iter_put(trans, sib_iter); sib_iter = NULL; - if (ret == -EINTR && bch2_trans_relock(trans)) { - ret = 0; + if (ret == -EINTR && bch2_trans_relock(trans)) goto retry; - } if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { + ret2 = ret; ret = bch2_btree_iter_traverse_all(trans); if (!ret) goto retry; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 2a6b51ece0f8..f2925b0d7f17 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -132,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *, unsigned); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - unsigned, unsigned, enum btree_node_sibling); +int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); -static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline int 
bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags, enum btree_node_sibling sib) @@ -143,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree *b; if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return; + return 0; if (!bch2_btree_node_relock(iter, level)) - return; + return 0; b = iter->l[level].b; if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - return; + return 0; - __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); } -static inline void bch2_foreground_maybe_merge(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags) { - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_prev_sib); - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_next_sib); + return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib) ?: + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); } void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -- cgit From b182ff609f2519998a6b4ad853a2fc1d3f0f0af5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Mar 2021 01:13:31 -0400 Subject: bcachefs: Move btree node merging to before transaction commit Currently, BTREE_INSERT_NOUNLOCK makes it hard to ensure btree node merging happens reliably - since btree node merging happens after transaction commit, we can't drop btree locks and block when starting the btree update. This patch moves it to before transaction commit - and failure to do a merge that we wanted to do just restarts the transaction. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 76 ++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a19a4a54a1ff..ccfc046220e2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -492,20 +492,75 @@ err: return ret; } +static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree_insert_entry *i; + struct btree *b = iter_l(iter)->b; + struct bkey_s_c old; + int u64s_delta = 0; + int ret; + + /* + * Inserting directly into interior nodes is an uncommon operation with + * various weird edge cases: also, a lot of things about + * BTREE_ITER_NODES iters need to be audited + */ + if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) + return 0; + + BUG_ON(iter->level); + + trans_for_each_update2(trans, i) { + if (iter_l(i->iter)->b != b) + continue; + + old = bch2_btree_iter_peek_slot(i->iter); + ret = bkey_err(old); + if (ret) + return ret; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + } + + return u64s_delta <= 0 + ? 
(bch2_foreground_maybe_merge(trans->c, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) + : 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *iter; int ret; + trans_for_each_update2(trans, i) { + struct btree *b; + + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) + continue; + + b = iter_l(i->iter)->b; + if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || + b->sib_u64s[1] < c->btree_foreground_merge_threshold) { + ret = maybe_do_btree_merge(trans, i->iter); + if (unlikely(ret)) + return ret; + } + } + trans_for_each_update2(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); - ret = bch2_journal_preres_get(&trans->c->journal, + ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) @@ -547,7 +602,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans->c, + bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); @@ -558,29 +613,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, i->iter); if (!ret && trans->journal_pin) - bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, trans->journal_pin, NULL); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + bch2_journal_res_put(&c->journal, &trans->journal_res); if (unlikely(ret)) return ret; - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; - - trans_for_each_update2(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && - !same_leaf_as_prev(trans, i)) - bch2_foreground_maybe_merge(trans->c, i->iter, - 0, trans->flags); - - trans->nounlock = false; - bch2_trans_downgrade(trans); return 0; -- cgit From 5c1d808ad8b80618186227e33f46b7cc5fac5461 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 16:43:50 -0400 Subject: bcachefs: Drop trans->nounlock Since we're no longer doing btree node merging post commit, we can now delete a bunch of code. 
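With the nounlock case gone, the conflict handling in __bch2_btree_node_lock() collapses to a single strategy: bump the conflicting iterator's locks_want and retake its locks, and only declare a deadlock if that fails. Condensed from the hunk below:

    if (type == SIX_LOCK_intent &&
        linked->nodes_locked != linked->nodes_intent_locked) {
            linked->locks_want = max_t(unsigned, linked->locks_want,
                                       __fls(linked->nodes_locked) + 1);
            if (!btree_iter_get_locks(linked, true, false))
                    deadlock_iter = linked;
    }

__bch2_btree_iter_upgrade_nounlock() is deleted outright; its one remaining caller now just sets locks_want and calls btree_iter_get_locks() directly.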
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 94 +++++++++++++++++------------------------ fs/bcachefs/btree_iter.h | 5 +-- fs/bcachefs/btree_locking.h | 9 +--- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update_leaf.c | 9 ++-- 5 files changed, 44 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b74d79127df0..50712c99d2bd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -267,17 +267,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (!(trans->nounlock)) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 1; - } - } else { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + if (!btree_iter_get_locks(linked, true, false)) { deadlock_iter = linked; - reason = 2; + reason = 1; } } @@ -307,18 +302,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - if (!(trans->nounlock)) { - linked->locks_want = - max(level + 1, max_t(unsigned, - linked->locks_want, - iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 5; - } - } else { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); + if (!btree_iter_get_locks(linked, true, false)) { deadlock_iter = linked; - reason = 6; + reason = 5; } } @@ -441,30 +431,6 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, return false; } -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, - unsigned new_locks_want) -{ - unsigned l = iter->level; - - EBUG_ON(iter->locks_want >= new_locks_want); - - iter->locks_want = new_locks_want; - - do { - if (!btree_iter_node(iter, l)) - break; - - if (!bch2_btree_node_upgrade(iter, l)) { - iter->locks_want = l; - return false; - } - - l++; - } while (l < iter->locks_want); - - return true; -} - void __bch2_btree_iter_downgrade(struct btree_iter *iter, unsigned downgrade_to) { @@ -1046,7 +1012,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { - __btree_node_unlock(linked, level); + btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -2083,8 +2049,10 @@ alloc_iter: if (!(iter->flags & BTREE_ITER_INTENT)) bch2_btree_iter_downgrade(iter); - else if (!iter->locks_want) - __bch2_btree_iter_upgrade_nounlock(iter, 1); + else if (!iter->locks_want) { + iter->locks_want = 1; + btree_iter_get_locks(iter, true, false); + } bch2_btree_iter_set_pos(iter, pos); btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); @@ -2352,11 +2320,22 @@ bch2_btree_iter_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, enum btree_iter_type type) { - pr_buf(out, " %px l=%u %s:", - _b, _b->level, bch2_btree_ids[_b->btree_id]); + pr_buf(out, " l=%u %s:", + _b->level, bch2_btree_ids[_b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(_b, type)); } +static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + if (btree_iter_type(iter) != 
BTREE_ITER_CACHED && + iter->nodes_locked) + return true; + return false; +} + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -2367,14 +2346,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); + if (!trans_has_btree_nodes_locked(trans)) + continue; + + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); trans_for_each_iter(trans, iter) { if (!iter->nodes_locked) continue; - pr_buf(out, " iter %u %s:", + pr_buf(out, " iter %u %c %s:", iter->idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', bch2_btree_ids[iter->btree_id]); bch2_bpos_to_text(out, iter->pos); pr_buf(out, "\n"); @@ -2393,17 +2376,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - pr_buf(out, " locking iter %u l=%u %s:", + iter = &trans->iters[trans->locking_iter_idx]; + pr_buf(out, " locking iter %u %c l=%u %s:", trans->locking_iter_idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); - pr_buf(out, " node "); bch2_btree_iter_node_to_text(out, (void *) b, - btree_iter_type(&trans->iters[trans->locking_iter_idx])); + btree_iter_type(iter)); pr_buf(out, "\n"); } } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7585f989ad50..1a11e68911ba 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -116,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) @@ -124,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (!iter->trans->nounlock - ? __bch2_btree_iter_upgrade(iter, new_locks_want) - : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) + ? 
__bch2_btree_iter_upgrade(iter, new_locks_want) : iter->uptodate <= BTREE_ITER_NEED_PEEK; }
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 1146dd37adf5..fda164802154 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -94,7 +94,7 @@ btree_lock_want(struct btree_iter *iter, int level) return BTREE_NODE_UNLOCKED; } -static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); @@ -105,13 +105,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -{ - EBUG_ON(!level && iter->trans->nounlock); - - __btree_node_unlock(iter, level); -} - static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) { btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0bcf17159744..5bee4135ab8f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -374,7 +374,6 @@ struct btree_trans { u8 nr_updates2; unsigned used_mempool:1; unsigned error:1; - unsigned nounlock:1; unsigned in_traverse_all:1; u64 iters_linked;
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ccfc046220e2..592f9516d4e7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -984,17 +984,14 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out; } - /* - * We're not using bch2_btree_iter_upgrade here because - * we know trans->nounlock can't be set: - */ - if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && - !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { + if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto out; } + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) -- cgit
From 2fa81d0b5bfdcd9c90725474ab9443f13152dfd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 16:10:21 -0400 Subject: bcachefs: Fix BTREE_FOREGROUND_MERGE_HYSTERESIS We were multiplying instead of dividing - oops.
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index aa8fe4a1b04b..4791c3b64452 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -89,7 +89,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) -- cgit
From 2d587674bad9cf83db0cc43185eb7e2d913cf41f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Apr 2021 21:29:05 -0400 Subject: bcachefs: Increase commonality between BTREE_ITER_NODES and BTREE_ITER_KEYS Eventually BTREE_ITER_NODES should be going away.
This patch is to fix a transaction iterator overflow in the btree node merge path because BTREE_ITER_NODES iterators couldn't be reused. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 81 ++++++++++++++++++++++++++---------------------- fs/bcachefs/btree_iter.h | 13 +++++--- 2 files changed, 53 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 50712c99d2bd..74094f5e67d2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -432,25 +432,24 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, } void __bch2_btree_iter_downgrade(struct btree_iter *iter, - unsigned downgrade_to) + unsigned new_locks_want) { - unsigned l, new_locks_want = downgrade_to ?: - (iter->flags & BTREE_ITER_INTENT ? 1 : 0); + unsigned l; - if (iter->locks_want < downgrade_to) { - iter->locks_want = new_locks_want; + EBUG_ON(iter->locks_want < new_locks_want); - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); - } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; - } - break; + iter->locks_want = new_locks_want; + + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) >= iter->locks_want) { + if (l > iter->level) { + btree_node_unlock(iter, l); + } else { + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->l[l].b->c.lock); + iter->nodes_intent_locked ^= 1 << l; } + break; } } @@ -1993,6 +1992,8 @@ static inline void btree_iter_copy(struct btree_iter *dst, struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, unsigned flags) { struct btree_iter *iter, *best = NULL; @@ -2005,10 +2006,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, pos.snapshot = btree_type_has_snapshots(btree_id) ? 
U32_MAX : 0; - /* We always want a fresh iterator for node iterators: */ - if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) - goto alloc_iter; - trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2023,7 +2020,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, best = iter; } -alloc_iter: + if (!best) { iter = btree_trans_iter_alloc(trans); bch2_btree_iter_init(trans, iter, btree_id); @@ -2047,13 +2044,26 @@ alloc_iter: iter->snapshot = pos.snapshot; - if (!(iter->flags & BTREE_ITER_INTENT)) - bch2_btree_iter_downgrade(iter); - else if (!iter->locks_want) { - iter->locks_want = 1; + locks_want = min(locks_want, BTREE_MAX_DEPTH); + + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; btree_iter_get_locks(iter, true, false); + } else if (locks_want < iter->locks_want) { + __bch2_btree_iter_downgrade(iter, locks_want); } + while (iter->level < depth) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->level++; + } + + while (iter->level > depth) + iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; + + iter->min_depth = depth; + bch2_btree_iter_set_pos(iter, pos); btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); @@ -2069,21 +2079,16 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, { struct btree_iter *iter = __bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_NODES| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags); - unsigned i; + locks_want, depth, + BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags); BUG_ON(bkey_cmp(iter->pos, pos)); - - iter->locks_want = locks_want; - iter->level = depth; - iter->min_depth = depth; - - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->level != depth); + BUG_ON(iter->min_depth != depth); iter->ip_allocated = _RET_IP_; return iter; @@ -2325,6 +2330,7 @@ bch2_btree_iter_node_to_text(struct printbuf *out, bch2_bpos_to_text(out, btree_node_pos(_b, type)); } +#ifdef CONFIG_BCACHEFS_DEBUG static bool trans_has_btree_nodes_locked(struct btree_trans *trans) { struct btree_iter *iter; @@ -2335,6 +2341,7 @@ static bool trans_has_btree_nodes_locked(struct btree_trans *trans) return true; return false; } +#endif void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1a11e68911ba..455f2fe4929c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -131,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) { - if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) - __bch2_btree_iter_downgrade(iter, 0); + unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); + + if (iter->locks_want > new_locks_want) + __bch2_btree_iter_downgrade(iter, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); @@ -258,14 +260,17 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned); + struct bpos, unsigned, + unsigned, unsigned); static inline struct btree_iter * bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned flags) { struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, flags); + __bch2_trans_get_iter(trans, btree_id, pos, + (flags & BTREE_ITER_INTENT) != 0, 0, + flags); iter->ip_allocated = _THIS_IP_; return iter; } -- cgit
From b753d4b338df70df1ee75db21a216385bb88b90b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 18:37:09 -0400 Subject: bcachefs: Fix this_cpu_ptr() usage
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 88deb48a3a37..4c2485afe43c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1392,7 +1392,7 @@ static int bch2_fs_usage_apply(struct bch_fs *c, struct disk_reservation *disk_res, unsigned journal_seq) { - struct bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); + struct bch_fs_usage *dst; s64 added = src->u.data + src->u.reserved; s64 should_not_have_added; int ret = 0; @@ -1420,6 +1420,7 @@ static int bch2_fs_usage_apply(struct bch_fs *c, this_cpu_add(*c->online_reserved, src->online_reserved); preempt_disable(); + dst = fs_usage_ptr(c, journal_seq, false); acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); preempt_enable(); -- cgit
From 6167f7c8ff5ce564423fe8b416b5f95d1712859b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 19:27:05 -0400 Subject: bcachefs: Fix journal deadlock After we get a journal reservation, we need to use it - if we error out of a transaction commit, we'll be eating into space in the journal and if our transaction needs to make forward progress in order to reclaim space in the journal, we'll deadlock.
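The fix is purely an ordering change in the commit path; schematically (a condensed sketch of the diff below, not the literal code):

	/*
	 * Checks that can fail and abort the commit run before the journal
	 * reservation is taken, so an early exit never consumes journal
	 * space it isn't going to use:
	 */
	if (marking && trans->fs_usage_deltas &&
	    bch2_replicas_delta_list_apply(c, &fs_usage->u,
					   trans->fs_usage_deltas)) {
		ret = BTREE_INSERT_NEED_MARK_REPLICAS;
		goto err;	/* no journal reservation held yet */
	}

	ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK);
	if (ret)
		goto err;

	/* from here on, the reservation will be consumed by this commit */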
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 592f9516d4e7..d3d86aa0ee95 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -426,6 +426,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, fs_usage = bch2_fs_usage_scratch_get(c); } + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + bch2_replicas_delta_list_apply(c, &fs_usage->u, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } + /* * Don't get journal reservation until after we know insert will * succeed: @@ -462,14 +470,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - bch2_replicas_delta_list_apply(c, &fs_usage->u, - trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, -- cgit
From 2940295c97f49ffe0b2f564dea394094581073e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 16:24:13 -0400 Subject: bcachefs: Be more careful about JOURNAL_RES_GET_RESERVED JOURNAL_RES_GET_RESERVED should only be used for updates that need to be done to free up space in the journal. In particular, when we're flushing keys from the key cache, if we're flushing them out of order we shouldn't be using it, since we're using up our remaining space in the journal without dropping a pin that will let us make forward progress. With this patch, BTREE_INSERT_JOURNAL_RECLAIM without BTREE_INSERT_JOURNAL_RESERVED may return -EAGAIN - we can't wait on journal reclaim if we're already in journal reclaim. This means we need to propagate these errors up to journal reclaim, indicating that flushing a journal pin should be retried in the future. This is prep work for a patch to change the way journal reclaim works, to split out flushing key cache keys because the btree key cache is too dirty from journal reclaim because we need space in the journal.
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 24 ++++++++++----- fs/bcachefs/btree_update_interior.c | 9 ++++-- fs/bcachefs/btree_update_leaf.c | 15 ++++++--- fs/bcachefs/journal.c | 24 +++++++++++++++ fs/bcachefs/journal.h | 3 +- fs/bcachefs/journal_reclaim.c | 61 +++++++++++++++++++++++-------------- fs/bcachefs/journal_types.h | 3 +- 7 files changed, 99 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0858f469f7c2..74d982c3402a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -353,6 +353,7 @@ err: static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, + unsigned commit_flags, bool evict) { struct bch_fs *c = trans->c; @@ -391,12 +392,17 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - BTREE_INSERT_JOURNAL_RECLAIM); + (ck->journal.seq == journal_last_seq(j) + ?
BTREE_INSERT_JOURNAL_RESERVED + : 0)| + commit_flags); err: if (ret == -EINTR) goto retry; + if (ret == -EAGAIN) + goto out; + if (ret) { bch2_fs_fatal_err_on(!bch2_journal_error(j), c, "error flushing key cache: %i", ret); @@ -439,15 +445,16 @@ out: return ret; } -static void btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) +static int btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; struct btree_trans trans; + int ret = 0; int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -462,10 +469,13 @@ static void btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); bch2_trans_init(&trans, c, 0, 0); - btree_key_cache_flush_pos(&trans, key, seq, false); + ret = btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false); bch2_trans_exit(&trans); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + + return ret; } /* @@ -481,7 +491,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, if (!bch2_btree_key_cache_find(c, id, pos)) return 0; - return btree_key_cache_flush_pos(trans, key, 0, true); + return btree_key_cache_flush_pos(trans, key, 0, 0, true); } bool bch2_btree_insert_key_cached(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 988922699e8b..7aba0e9d99c1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -916,10 +916,12 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, struct closure cl; int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; - int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) - ? 
JOURNAL_RES_GET_RECLAIM : 0; + int journal_flags = 0; int ret = 0; + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; + closure_init_stack(&cl); retry: /* @@ -982,6 +984,9 @@ retry: bch2_trans_unlock(trans); + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d3d86aa0ee95..ee1c26f2901f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -134,7 +134,7 @@ fix_iter: return true; } -static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, unsigned i, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->c.lock); + return 0; } -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -563,8 +564,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) - ? JOURNAL_RES_GET_RECLAIM : 0)); + ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + ? 
JOURNAL_RES_GET_RESERVED : 0)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s); @@ -721,6 +722,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) + return -EAGAIN; + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index edbcbe7fb31f..bce056cb6841 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -11,6 +11,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -449,6 +450,27 @@ unlock: if (!ret) goto retry; + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !can_discard && + j->reservations.idx == j->reservations.unwritten_idx && + (flags & JOURNAL_RES_GET_RESERVED)) { + char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + + bch_err(c, "Journal stuck!"); + if (journal_debug_buf) { + bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "%s", journal_debug_buf); + + bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "Journal pins:\n%s", journal_debug_buf); + kfree(journal_debug_buf); + } + + bch2_fatal_error(c); + dump_stack(); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -1169,6 +1191,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq_ondisk:\t%llu\n" "flushed_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "each entry reserved:\t%u\n" "nr flush writes:\t%llu\n" "nr noflush writes:\t%llu\n" "nr direct reclaim:\t%llu\n" @@ -1183,6 +1206,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->flushed_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, + j->entry_u64s_reserved, j->nr_flush_writes, j->nr_noflush_writes, j->nr_direct_reclaim, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 547c735ce3cb..a0d19fad3bdd 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -308,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) #define JOURNAL_RES_GET_RESERVED (1 << 2) -#define JOURNAL_RES_GET_RECLAIM (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -446,7 +445,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, * into the reclaim path and deadlock: */ - if (!(flags & JOURNAL_RES_GET_RECLAIM) && + if (!(flags & JOURNAL_RES_GET_RESERVED) && new.reserved > new.remaining) return 0; } while ((v = atomic64_cmpxchg(&j->prereserved.counter, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3ef42a47f60d..42ed7a3525b1 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j) u64s_remaining = (u64) clean << 6; u64s_remaining -= (u64) total << 3; u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 2; + u64s_remaining /= 4; u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); out: j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; @@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j, if (!journal_pin_active(pin)) return; + if (j->flush_in_progress == pin) + j->flush_in_progress_dropped = true; + pin_list = journal_seq_pin(j, pin->seq); pin->seq = 0; list_del_init(&pin->list); @@ -439,34 +442,27 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return NULL; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) if (*seq > max_seq || (ret = list_first_entry_or_null(&pin_list->list, struct journal_entry_pin, list))) break; - if (ret) { - list_move(&ret->list, &pin_list->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = ret; - } - - spin_unlock(&j->lock); - return ret; } /* returns true if we did work */ -static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned min_nr) +static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { struct journal_entry_pin *pin; - u64 seq, ret = 0; + size_t nr_flushed = 0; + journal_pin_flush_fn flush_fn; + u64 seq; + int err; + + if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) + return 0; lockdep_assert_held(&j->reclaim_lock); @@ -475,23 +471,42 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, j->last_flushed = jiffies; + spin_lock(&j->lock); pin = journal_get_next_pin(j, min_nr ? U64_MAX : seq_to_flush, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } + spin_unlock(&j->lock); + if (!pin) break; if (min_nr) min_nr--; - pin->flush(j, pin, seq); + err = flush_fn(j, pin, seq); - BUG_ON(j->flush_in_progress != pin); + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); + wake_up(&j->pin_flush_wait); - ret++; + + if (err) + break; + + nr_flushed++; } - return ret; + return nr_flushed; } static u64 journal_seq_to_flush(struct journal *j) @@ -556,8 +571,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; - u64 seq_to_flush, nr_flushed = 0; - size_t min_nr; + u64 seq_to_flush; + size_t min_nr, nr_flushed; unsigned flags; int ret = 0; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3db8c3760cca..ec3c604cdf22 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -50,7 +50,7 @@ struct journal_entry_pin_list { struct journal; struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, +typedef int (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *, u64); struct journal_entry_pin { @@ -251,6 +251,7 @@ struct journal { unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; wait_queue_head_t pin_flush_wait; /* protects advancing ja->discard_idx: */ -- cgit From 9c2e624290f24c69c835bc82b1abe349810e338f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 19:41:09 -0400 Subject: bcachefs: Fix livelock calling bch2_mark_bkey_replicas() The bug was that we were trying 
to find a replicas entry that wasn't sorted - but, we can also simplify the code by not using bch2_mark_bkey_replicas and instead ensuring the list of replicas entries exists directly. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 8 +++----- fs/bcachefs/buckets.c | 13 +++++++++++++ fs/bcachefs/buckets.h | 2 ++ 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ee1c26f2901f..54e3850df009 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -707,11 +707,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update(trans, i) { - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); - if (ret) - return ret; - } + ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); + if (ret) + return ret; if (bch2_trans_relock(trans)) return 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4c2485afe43c..0b90104ffe7b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -565,6 +565,7 @@ static inline void update_replicas_list(struct btree_trans *trans, n->delta = sectors; memcpy((void *) n + offsetof(struct replicas_delta, r), r, replicas_entry_bytes(r)); + bch2_replicas_entry_sort(&n->r); d->used += b; } @@ -615,6 +616,18 @@ unwind: return -1; } +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + #define do_mark_fn(fn, c, pos, flags, ...) \ ({ \ int gc, ret = 0; \ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index af8cb74d71e0..1b83a768ba06 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -253,6 +253,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *, int bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, + struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, -- cgit From 35d5aff263629caf98305d12c02d8b64d6981625 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 20:29:05 -0400 Subject: bcachefs: Kill bch2_fs_usage_scratch_get() This is an important cleanup, eliminating an unnecessary copy in the transaction commit path. 
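Instead of accumulating into a scratch bch_fs_usage and then copying it, the commit path now walks the transaction's replicas delta list and applies it directly to the percpu usage - roughly (condensed from bch2_trans_fs_usage_apply() in the diff below):

	struct replicas_delta *d   = deltas->d;
	struct replicas_delta *top = (void *) deltas->d + deltas->used;

	/* entries are variable length and packed back to back: */
	for (d = deltas->d; d != top; d = replicas_delta_next(d))
		update_replicas(c, dst, &d->r, d->delta);	/* dst: percpu usage */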
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_update_interior.c | 4 - fs/bcachefs/btree_update_leaf.c | 12 +- fs/bcachefs/buckets.c | 211 ++++++++++++------------------------ fs/bcachefs/buckets.h | 10 +- fs/bcachefs/buckets_types.h | 16 --- fs/bcachefs/replicas.c | 37 +++++++ fs/bcachefs/replicas.h | 25 +++++ 8 files changed, 138 insertions(+), 178 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 74094f5e67d2..203c9adb0623 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "replicas.h" #include "trace.h" #include diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7aba0e9d99c1..e965c8bbddce 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -437,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, goto err_free; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - if (ret) - goto err_free; - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 54e3850df009..13c687fede0b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -376,7 +376,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; - struct bch_fs_usage_online *fs_usage = NULL; struct btree_insert_entry *i; struct btree_trans_commit_hook *h; unsigned u64s = 0; @@ -424,13 +423,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (marking) { percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); } /* Must be called under mark_lock: */ if (marking && trans->fs_usage_deltas && - bch2_replicas_delta_list_apply(c, &fs_usage->u, - trans->fs_usage_deltas)) { + !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { ret = BTREE_INSERT_NEED_MARK_REPLICAS; goto err; } @@ -474,10 +471,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, - &fs_usage->u, i->trigger_flags); + NULL, i->trigger_flags); - if (marking) - bch2_trans_fs_usage_apply(trans, fs_usage); + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); @@ -486,7 +483,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { - bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0b90104ffe7b..47a1b8b12eb9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -167,37 +167,6 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -{ - if (fs_usage == c->usage_scratch) - mutex_unlock(&c->usage_scratch_lock); - else - kfree(fs_usage); -} - -struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *c) -{ - struct bch_fs_usage_online *ret; - unsigned bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * - READ_ONCE(c->replicas.nr); - ret = kzalloc(bytes, 
GFP_NOWAIT|__GFP_NOWARN); - if (ret) - return ret; - - if (mutex_trylock(&c->usage_scratch_lock)) - goto out_pool; - - ret = kzalloc(bytes, GFP_NOFS); - if (ret) - return ret; - - mutex_lock(&c->usage_scratch_lock); -out_pool: - ret = c->usage_scratch; - memset(ret, 0, bytes); - return ret; -} - static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) @@ -459,6 +428,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); + if (!fs_usage) + fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) @@ -486,22 +457,17 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -static inline int update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline void update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - if (idx < 0) - return -1; - - if (!fs_usage) - return 0; + BUG_ON(idx < 0); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; - return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -579,55 +545,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - unsigned i; - - for (d = r->d; d != top; d = replicas_delta_next(d)) - if (update_replicas(c, fs_usage, &d->r, d->delta)) { - top = d; - goto unwind; - } - - if (!fs_usage) - return 0; - - fs_usage->nr_inodes += r->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - fs_usage->reserved += r->persistent_reserved[i]; - fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; - } - - return 0; -unwind: - for (d = r->d; d != top; d = replicas_delta_next(d)) - update_replicas(c, fs_usage, &d->r, -d->delta); - return -1; -} - -int bch2_replicas_delta_list_mark(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - int ret = 0; - - for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) - ret = bch2_mark_replicas(c, &d->r); - return ret; -} - #define do_mark_fn(fn, c, pos, flags, ...) \ ({ \ int gc, ret = 0; \ @@ -1400,62 +1317,15 @@ int bch2_mark_update(struct btree_trans *trans, return ret; } -static int bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage_online *src, - struct disk_reservation *disk_res, - unsigned journal_seq) -{ - struct bch_fs_usage *dst; - s64 added = src->u.data + src->u.reserved; - s64 should_not_have_added; - int ret = 0; - - percpu_rwsem_assert_held(&c->mark_lock); - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); - if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased by %lli more than reservation of %llu", - added, disk_res ? 
disk_res->sectors : 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); - added -= should_not_have_added; - ret = -1; - } - - if (added > 0) { - disk_res->sectors -= added; - src->online_reserved -= added; - } - - this_cpu_add(*c->online_reserved, src->online_reserved); - - preempt_disable(); - dst = fs_usage_ptr(c, journal_seq, false); - acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); - preempt_enable(); - - return ret; -} - -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct bch_fs_usage_online *fs_usage) +static noinline __cold +void fs_usage_apply_warn(struct btree_trans *trans, + unsigned disk_res_sectors) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - static int warned_disk_usage = 0; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; char buf[200]; - if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, - trans->journal_res.seq) || - warned_disk_usage || - xchg(&warned_disk_usage, 1)) - return; - - bch_err(c, "disk usage increased more than %llu sectors reserved", + bch_err(c, "disk usage increased more than %u sectors reserved", disk_res_sectors); trans_for_each_update(trans, i) { @@ -1490,6 +1360,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } } +void bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + struct replicas_delta *d = deltas->d; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + + update_replicas(c, dst, &d->r, d->delta); + } + + dst->nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added += deltas->persistent_reserved[i]; + dst->reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + warn = true; + } + + if (added > 0) { + trans->disk_res->sectors -= added; + this_cpu_sub(*c->online_reserved, added); + } + + preempt_enable(); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors); +} + /* trans_mark: */ static struct btree_iter *trans_get_update(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 1b83a768ba06..cd81e6aba1b0 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -216,9 +216,6 @@ static inline unsigned dev_usage_u64s(void) return sizeof(struct bch_dev_usage) / sizeof(u64); } -void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *); -struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *); - u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); @@ -250,16 +247,11 @@ int 
bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); -int bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); -int bch2_replicas_delta_list_mark(struct bch_fs *, - struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct disk_reservation *, struct bch_dev *, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index b6ea67506cc2..588b1a72adae 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -96,22 +96,6 @@ struct bch_fs_usage_short { u64 nr_inodes; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - /* * A reservation for space on disk: */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 0498c8ac82c8..81aba8caab9e 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -471,6 +471,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, return 0; } +/* replicas delta list: */ + +bool bch2_replicas_delta_list_marked(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + percpu_rwsem_assert_held(&c->mark_lock); + + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (bch2_replicas_entry_idx(c, &d->r) < 0) + return false; + return true; +} + +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + +/* bkey replicas: */ + bool bch2_bkey_replicas_marked(struct bch_fs *c, struct bkey_s_c k) { @@ -482,6 +512,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return __bch2_mark_bkey_replicas(c, k, false); } +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + int bch2_replicas_gc_end(struct bch_fs *c, int ret) { unsigned i; @@ -575,6 +610,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } +/* New much simpler mechanism for clearing out unneeded replicas entries: */ + int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 8cb1f592f1b6..72ac544f16d8 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -26,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + 
+ struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); -- cgit From 08e337618f67abb9be1ff4b022a14e8721c5def2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 21:09:13 -0400 Subject: bcachefs: Drop some memset() calls gcc is emitting rep stos here, which is silly (and slow) for an 8 byte memset. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 455f2fe4929c..07d9b6d36e51 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -174,8 +174,11 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; } /* Sort order for locking btree iterators: */ -- cgit From 671cc8a51b019b49a8538aceaaa5e770c1694c1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 21:31:02 -0400 Subject: bcachefs: Eliminate memory barrier from fast path of journal_preres_put() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal.h | 39 ++++++++++++++++++++++----------------- fs/bcachefs/journal_types.h | 5 +++-- 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bce056cb6841..35a48629b63b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -520,7 +520,7 @@ static bool journal_preres_available(struct journal *j, unsigned new_u64s, unsigned flags) { - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); if (!ret && mutex_trylock(&j->reclaim_lock)) { bch2_journal_reclaim(j); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index a0d19fad3bdd..cc497125889f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -411,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j, s.v = atomic64_sub_return(s.v, &j->prereserved.counter); res->u64s = 0; - closure_wake_up(&j->preres_wait); + + if (unlikely(s.waiting)) { + clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), + (unsigned long *) &j->prereserved.v); + closure_wake_up(&j->preres_wait); + } if (s.reserved <= s.remaining && !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { @@ -427,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *, static inline int bch2_journal_preres_get_fast(struct journal *j, struct journal_preres *res, unsigned new_u64s, - 
unsigned flags) + unsigned flags, + bool set_waiting) { int d = new_u64s - res->u64s; union journal_preres_state old, new; u64 v = atomic64_read(&j->prereserved.counter); + int ret; do { old.v = new.v = v; - - new.reserved += d; - - /* - * If we're being called from the journal reclaim path, we have - * to unconditionally give out the pre-reservation, there's - * nothing else sensible we can do - otherwise we'd recurse back - * into the reclaim path and deadlock: - */ - - if (!(flags & JOURNAL_RES_GET_RESERVED) && - new.reserved > new.remaining) + ret = 0; + + if ((flags & JOURNAL_RES_GET_RESERVED) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; + } else if (set_waiting && !new.waiting) + new.waiting = true; + else return 0; } while ((v = atomic64_cmpxchg(&j->prereserved.counter, old.v, new.v)) != old.v); - res->u64s += d; - return 1; + if (ret) + res->u64s += d; + return ret; } static inline int bch2_journal_preres_get(struct journal *j, @@ -463,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j, if (new_u64s <= res->u64s) return 0; - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) + if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index ec3c604cdf22..97d764370b89 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -105,8 +105,9 @@ union journal_preres_state { }; struct { - u32 reserved; - u32 remaining; + u64 waiting:1, + reserved:31, + remaining:32; }; }; -- cgit From 3ce8b463e3e044cc6765c096e2b755416e6f7e84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Apr 2021 21:54:14 -0400 Subject: bcachefs: kill bset_tree->max_key Since we now ensure a btree node's max key fits in its packed format, this isn't needed for the reasons it used to be - and, it was being used inconsistently. Also reorder struct btree a bit for performance, and kill some dead code. 
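The reordering follows the usual hot/cold split - purely as an illustration of the rule (the struct and field names below are made up, not the actual struct btree):

	struct example_node {
		/* hot: read on every key lookup, kept in the first cachelines */
		u16			data_offset;
		u8			nr_key_bits;
		u8			nsets;

		/* cold: only touched on writeback and eviction, pushed to the end */
		struct list_head	lru_entry;
		u64			write_sequence;
	};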
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 36 +++--------------------------------- fs/bcachefs/btree_types.h | 12 +++++------- 2 files changed, 8 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index de4dc2fac1d6..8c038da3c108 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -686,7 +686,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (!bkey_pack_pos(max_key, b->data->max_key, b)) { k = (void *) max_key; bkey_init(&k->k); - k->k.p = t->max_key; + k->k.p = b->data->max_key; } } @@ -770,8 +770,6 @@ retry: while (k != btree_bkey_last(b, t)) prev = k, k = bkey_next(k); - t->max_key = bkey_unpack_pos(b, prev); - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); min_key.k.p = b->data->min_key; @@ -779,7 +777,7 @@ retry: if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { bkey_init(&max_key.k); - max_key.k.p = t->max_key; + max_key.k.p = b->data->max_key; } /* Then we build the tree */ @@ -958,8 +956,6 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, min_key.u64s = max_key.u64s = 0; if (bkey_next(k) == btree_bkey_last(b, t)) { - t->max_key = bkey_unpack_pos(b, k); - for (j = 1; j < t->size; j = j * 2 + 1) make_bfloat(b, t, j, &min_key, &max_key); } @@ -1299,16 +1295,6 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_RW_AUX_TREE: return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: - /* - * Each node in the auxiliary search tree covers a certain range - * of bits, and keys above and below the set it covers might - * differ outside those bits - so we have to special case the - * start and end - handle that here: - */ - - if (bpos_cmp(*search, t->max_key) > 0) - return btree_bkey_last(b, t); - return bset_search_tree(b, t, search, lossy_packed_search); default: unreachable(); @@ -1345,23 +1331,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, return m; } -/* - * Returns the first key greater than or equal to @search - */ -static __always_inline __flatten -struct bkey_packed *bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search) -{ - struct bkey_packed *m = __bch2_bset_search(b, t, search, - lossy_packed_search); - - return bch2_bset_search_linear(b, t, search, - packed_search, lossy_packed_search, m); -} - /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1457,6 +1426,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, unsigned i; EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5bee4135ab8f..9db22b35b780 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -47,8 +47,6 @@ struct bset_tree { u16 data_offset; u16 aux_data_offset; u16 end_offset; - - struct bpos max_key; }; struct btree_write { @@ -98,6 +96,11 @@ struct btree { u8 byte_order; u8 unpack_fn_len; + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + /* * XXX: add a delete sequence number, so when bch2_btree_node_relock() * fails because the lock sequence number has changed - i.e. 
the @@ -128,11 +131,6 @@ struct btree { /* lru list */ struct list_head list; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { -- cgit From ecc1420944c73205225ed6fe4c1781c09759dd10 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Apr 2021 22:38:07 -0400 Subject: bcachefs: Fix an uninitialized variable Fortunately it was just used in an error message Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 47a1b8b12eb9..7093737a02f3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -852,7 +852,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, if (g->stripe && g->stripe != k.k->p.offset) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EINVAL; } -- cgit From f72b1fd710870547f566d7d02563833eda43e67d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Apr 2021 01:23:55 -0400 Subject: bcachefs: Fix a startup race Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 74d982c3402a..0af46335bd00 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -692,17 +692,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); int ret; - bc->shrink.seeks = 1; - bc->shrink.count_objects = bch2_btree_key_cache_count; - bc->shrink.scan_objects = bch2_btree_key_cache_scan; - - ret = register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name) ?: - rhashtable_init(&bc->table, &bch2_btree_key_cache_params); + ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); if (ret) return ret; bc->table_init_done = true; - return 0; + + bc->shrink.seeks = 1; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name); } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) -- cgit From a0857785001777ff659248e45a2e1688fb43499d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 13:43:31 -0400 Subject: bcachefs: Increase BSET_CACHELINE to 256 bytes Linear searches have gotten cheaper relative to binary searches on modern hardware, due to better branch prediction behaviour. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index e6c8d081f9b6..8acbcb5d86c4 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -188,7 +188,7 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree * gets to the second cacheline. 
*/ -#define BSET_CACHELINE 128 +#define BSET_CACHELINE 256 static inline size_t btree_keys_cachelines(const struct btree *b) { -- cgit From 9d8022db1ccfff6aaf1de6158c2a26b667c70a15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 14:00:56 -0400 Subject: bcachefs: Eliminate more PAGE_SIZE uses In userspace, we don't really have a well defined PAGE_SIZE and shouln't be relying on it. This is some more incremental work to remove references to it. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 4 ++-- fs/bcachefs/super-io.c | 31 +++++++++++++++---------------- fs/bcachefs/super.c | 3 +-- fs/bcachefs/super_types.h | 2 +- fs/bcachefs/util.c | 2 +- 5 files changed, 20 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 111310344cec..eb8c57d253fb 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -150,7 +150,7 @@ struct dump_iter { struct bch_fs *c; enum btree_id id; - char buf[PAGE_SIZE]; + char buf[1 << 12]; size_t bytes; /* what's currently in buf */ char __user *ubuf; /* destination user buffer */ @@ -230,7 +230,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); i->bytes = strlen(i->buf); - BUG_ON(i->bytes >= PAGE_SIZE); + BUG_ON(i->bytes >= sizeof(i->buf)); i->buf[i->bytes] = '\n'; i->bytes++; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e397a2a70c9c..bf36a5743607 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -53,8 +53,7 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; - BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > - sb->page_order); + BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); if (!f && !u64s) { /* nothing to do: */ @@ -105,18 +104,23 @@ void bch2_free_super(struct bch_sb_handle *sb) blkdev_put(sb->bdev, sb->holder); kfree(sb->holder); - free_pages((unsigned long) sb->sb, sb->page_order); + kfree(sb->sb); memset(sb, 0, sizeof(*sb)); } int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) { size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); - unsigned order = get_order(new_bytes); + size_t new_buffer_size; struct bch_sb *new_sb; struct bio *bio; - if (sb->sb && sb->page_order >= order) + if (sb->bdev) + new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); + + new_buffer_size = roundup_pow_of_two(new_bytes); + + if (sb->sb && sb->buffer_size >= new_buffer_size) return 0; if (sb->have_layout) { @@ -129,14 +133,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) } } - if (sb->page_order >= order && sb->sb) + if (sb->buffer_size >= new_buffer_size && sb->sb) return 0; if (dynamic_fault("bcachefs:add:super_realloc")) return -ENOMEM; if (sb->have_bio) { - unsigned nr_bvecs = 1 << order; + unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) @@ -149,17 +153,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); + new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); if (!new_sb) return -ENOMEM; - if (sb->sb) - memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); - - free_pages((unsigned long) sb->sb, sb->page_order); sb->sb = new_sb; 
- - sb->page_order = order; + sb->buffer_size = new_buffer_size; return 0; } @@ -480,7 +479,7 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) reread: bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = offset; - bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); if (submit_bio_wait(sb->bio)) return "IO error"; @@ -498,7 +497,7 @@ reread: if (bytes > 512 << sb->sb->layout.sb_max_size_bits) return "Bad superblock: too big"; - if (get_order(bytes) > sb->page_order) { + if (bytes > sb->buffer_size) { if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) return "cannot allocate memory"; goto reread; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 529d33f4a6d7..385b41f16754 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -509,8 +509,7 @@ static void __bch2_fs_free(struct bch_fs *c) if (c->wq) destroy_workqueue(c->wq); - free_pages((unsigned long) c->disk_sb.sb, - c->disk_sb.page_order); + bch2_free_super(&c->disk_sb); kvpfree(c, sizeof(*c)); module_put(THIS_MODULE); } diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index e3a989e3e9d9..b14b2d82c655 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -7,7 +7,7 @@ struct bch_sb_handle { struct block_device *bdev; struct bio *bio; void *holder; - unsigned page_order; + size_t buffer_size; fmode_t mode; unsigned have_layout:1; unsigned have_bio:1; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 6e665f7f25a3..f183c9d80e2c 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -154,7 +154,7 @@ void bch2_flags_to_text(struct printbuf *out, u64 bch2_read_flag_list(char *opt, const char * const list[]) { u64 ret = 0; - char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); + char *p, *s, *d = kstrdup(opt, GFP_KERNEL); if (!d) return -ENOMEM; -- cgit From 241e26369e1267be376490152ee2c52021b4321a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Mar 2021 21:44:55 -0400 Subject: bcachefs: Don't flush btree writes more aggressively because of btree key cache We need to flush the btree key cache when it's too dirty, because otherwise the shrinker won't be able to reclaim memory - this is done by journal reclaim. But journal reclaim also kicks btree node writes: this meant that btree node writes were getting kicked much too often just because we needed to flush btree key cache keys. This patch splits journal pins into two different lists, and teaches journal reclaim to not flush btree node writes when it only needs to flush key cache keys. 
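(Illustrative sketch, not part of the patch below: a simplified, userspace-compilable model of the two-list idea. The struct and function names here are made up; the real patch adds a key_cache_list next to the existing pin list, and journal reclaim walks only that list when it merely needs to free key cache memory.)

#include <stdbool.h>

struct pin {
	struct pin *next;
};

struct pin_list {
	struct pin *btree_write_pins;	/* flushing these forces btree node writes */
	struct pin *key_cache_pins;	/* flushing these only writes key cache keys */
};

/* Pick the next pin to flush: when we only need to reclaim key cache
 * memory, skip the btree-write list so node writes aren't kicked early. */
static struct pin *next_pin_to_flush(struct pin_list *p, bool key_cache_only)
{
	if (!key_cache_only && p->btree_write_pins)
		return p->btree_write_pins;

	return p->key_cache_pins;
}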
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++- fs/bcachefs/btree_key_cache.c | 7 ++-- fs/bcachefs/btree_key_cache.h | 12 ++----- fs/bcachefs/btree_update_interior.c | 11 ++++-- fs/bcachefs/journal.c | 30 ++++++++-------- fs/bcachefs/journal_reclaim.c | 68 ++++++++++++++++++++++++------------- fs/bcachefs/journal_types.h | 1 + fs/bcachefs/trace.h | 5 +++ 8 files changed, 81 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 203c9adb0623..8f5318a38d9b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -473,8 +473,10 @@ bool bch2_trans_relock(struct btree_trans *trans) trans_for_each_iter(trans, iter) if (btree_iter_keep(trans, iter) && - !bch2_btree_iter_relock(iter, true)) + !bch2_btree_iter_relock(iter, true)) { + trace_trans_restart_relock(trans->ip); return false; + } return true; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0af46335bd00..ac844f47b8dd 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -445,9 +445,8 @@ out: return ret; } -static int btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) +int bch2_btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bkey_cached *ck = @@ -528,7 +527,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, btree_key_cache_journal_flush); + &ck->journal, bch2_btree_key_cache_journal_flush); if (kick_reclaim) journal_reclaim_kick(&c->journal); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 4e1e5a9c7656..7e2b0a08f745 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,15 +1,6 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H -static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = nr_keys / 4; - - return max_t(ssize_t, 0, nr_dirty - max_dirty); -} - static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); @@ -29,6 +20,9 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); } +int bch2_btree_key_cache_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e965c8bbddce..b3137525f9c1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -974,20 +974,25 @@ retry: * closure argument */ if (flags & BTREE_INSERT_NOUNLOCK) { + trace_trans_restart_journal_preres_get(trans->ip); ret = -EINTR; goto err; } bch2_trans_unlock(trans); - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - goto err; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + bch2_btree_update_free(as); + return ERR_PTR(ret); + } ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); - if (ret) + if (ret) { + trace_trans_restart_journal_preres_get(trans->ip); goto err; + 
} if (!bch2_trans_relock(trans)) { ret = -EINTR; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 35a48629b63b..af2f8528ac65 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -59,21 +59,23 @@ journal_seq_to_buf(struct journal *j, u64 seq) return buf; } -static void journal_pin_new_entry(struct journal *j, int count) +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - struct journal_entry_pin_list *p; + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->key_cache_list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, count); + p->devs.nr = 0; +} +static void journal_pin_new_entry(struct journal *j) +{ /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for journal_last_seq() to be calculated correctly */ atomic64_inc(&j->seq); - p = fifo_push_ref(&j->pin); - - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, count); - p->devs.nr = 0; + journal_pin_list_init(fifo_push_ref(&j->pin), 1); } static void bch2_journal_buf_init(struct journal *j) @@ -192,7 +194,7 @@ static bool __journal_entry_close(struct journal *j) __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); /* Initialize new buffer: */ - journal_pin_new_entry(j, 1); + journal_pin_new_entry(j); bch2_journal_buf_init(j); @@ -1030,12 +1032,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 1); - p->devs.nr = 0; - } + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); list_for_each_entry(i, journal_entries, list) { unsigned ptr; @@ -1058,7 +1056,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - journal_pin_new_entry(j, 1); + journal_pin_new_entry(j); j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 42ed7a3525b1..0d7fe1f99dbf 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -407,7 +407,12 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); + if (flush_fn == bch2_btree_key_cache_journal_flush) + list_add(&pin->list, &pin_list->key_cache_list); + else if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + list_add(&pin->list, &pin_list->flushed); spin_unlock(&j->lock); /* @@ -437,23 +442,40 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) */ static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) +journal_get_next_pin(struct journal *j, + bool get_any, + bool get_key_cache, + u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > max_seq || - (ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list))) + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > max_seq && !get_any && !get_key_cache) break; - return ret; + if (*seq <= max_seq || get_any) { + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) + return ret; + } + + if (*seq <= max_seq || get_any || get_key_cache) { + ret = list_first_entry_or_null(&pin_list->key_cache_list, + struct journal_entry_pin, list); + if (ret) + return ret; + } + } + + return NULL; } /* returns true if we did work */ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned min_nr) + unsigned min_any, + unsigned min_key_cache) { struct journal_entry_pin *pin; size_t nr_flushed = 0; @@ -472,8 +494,10 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, min_nr - ? U64_MAX : seq_to_flush, &seq); + pin = journal_get_next_pin(j, + min_any != 0, + min_key_cache != 0, + seq_to_flush, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -485,8 +509,11 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, if (!pin) break; - if (min_nr) - min_nr--; + if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) + min_key_cache--; + + if (min_any) + min_any--; err = flush_fn(j, pin, seq); @@ -610,18 +637,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; - if (atomic_read(&c->btree_cache.dirty) * 4 > - c->btree_cache.used * 3) - min_nr = 1; - if (fifo_free(&j->pin) <= 32) min_nr = 1; - min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); - - /* Don't do too many without delivering wakeup: */ - min_nr = min(min_nr, 128UL); - trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, @@ -631,7 +649,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + nr_flushed = journal_flush_pins(j, seq_to_flush, + min_nr, + min(bch2_nr_btree_keys_need_flush(c), 128UL)); if (direct) j->nr_direct_reclaim += nr_flushed; @@ -641,7 +661,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (nr_flushed) wake_up(&j->reclaim_wait); - } while (min_nr && nr_flushed); + } while (min_nr && nr_flushed && !direct); memalloc_noreclaim_restore(flags); @@ -734,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; + *did_work = journal_flush_pins(j, 
seq_to_flush, 0, 0) != 0; spin_lock(&j->lock); /* diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 97d764370b89..f597eb78e66e 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -43,6 +43,7 @@ struct journal_buf { struct journal_entry_pin_list { struct list_head list; + struct list_head key_cache_list; struct list_head flushed; atomic_t count; struct bch_devs_list devs; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index babb07e3acc4..387c1c49f696 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -716,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_relock, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_PROTO(unsigned long ip), TP_ARGS(ip) -- cgit From 2177147b39098e6f08b3d8d45bbcf7dedd7ebdad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 15:33:19 -0400 Subject: bcachefs: Improve bset compaction The previous patch that fixed btree nodes being written too aggressively now meant that we weren't sorting btree node bsets optimally - this patch fixes that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_io.c | 51 ++++++++++++++++++++++++------------- fs/bcachefs/btree_io.h | 3 +-- fs/bcachefs/btree_update_interior.h | 4 ++- 4 files changed, 39 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2ec668c3427e..8ed8610796fb 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -215,7 +215,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b, SIX_LOCK_read); + __bch2_btree_node_write(c, b); /* wait for any in flight btree write */ btree_node_wait_on_io(b); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 3b45389a8e06..fd90e434c78c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -241,7 +241,6 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, } static void btree_node_sort(struct bch_fs *c, struct btree *b, - struct btree_iter *iter, unsigned start_idx, unsigned end_idx, bool filter_whiteouts) @@ -377,8 +376,7 @@ void bch2_btree_sort_into(struct bch_fs *c, * We're about to add another bset to the btree node, so if there's currently * too many bsets - sort some of them together: */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static bool btree_node_compact(struct bch_fs *c, struct btree *b) { unsigned unwritten_idx; bool ret = false; @@ -390,13 +388,13 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, iter, unwritten_idx, + btree_node_sort(c, b, unwritten_idx, b->nsets, false); ret = true; } if (unwritten_idx > 1) { - btree_node_sort(c, b, iter, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx, false); ret = true; } @@ -426,12 +424,30 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { struct btree_node_entry *bne; - bool did_sort; + bool reinit_iter = false; EBUG_ON(!(b->c.lock.state.seq & 1)); EBUG_ON(iter && iter->l[b->c.level].b != b); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); + + if (b->nsets == MAX_BSETS) 
{ + unsigned log_u64s[] = { + ilog2(bset_u64s(&b->set[0])), + ilog2(bset_u64s(&b->set[1])), + ilog2(bset_u64s(&b->set[2])), + }; + + if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { + bch2_btree_node_write(c, b, SIX_LOCK_write); + reinit_iter = true; + } + } + + if (b->nsets == MAX_BSETS && + btree_node_compact(c, b)) + reinit_iter = true; - did_sort = btree_node_compact(c, b, iter); + BUG_ON(b->nsets >= MAX_BSETS); bne = want_new_bset(c, b); if (bne) @@ -439,7 +455,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_build_aux_trees(b); - if (iter && did_sort) + if (iter && reinit_iter) bch2_btree_iter_reinit_node(iter, b); } @@ -1324,8 +1340,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held) +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1596,7 +1611,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, NULL, 0, b->nsets, true); + btree_node_sort(c, b, 0, b->nsets, true); invalidated_iter = true; } else { invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); @@ -1626,13 +1641,12 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * Use this one if the node is intent locked: */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held) + enum six_lock_type lock_type_held) { - BUG_ON(lock_type_held == SIX_LOCK_write); - if (lock_type_held == SIX_LOCK_intent || - six_lock_tryupgrade(&b->c.lock)) { - __bch2_btree_node_write(c, b, SIX_LOCK_intent); + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -1644,7 +1658,10 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b, SIX_LOCK_read); + __bch2_btree_node_write(c, b); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); } } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 9c14cd30a09e..95c351611045 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -144,8 +144,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); void bch2_btree_write_error_work(struct work_struct *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type); +void __bch2_btree_node_write(struct bch_fs *, struct btree *); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f2925b0d7f17..7eef3dbb6ef1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -256,13 +256,15 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, return remaining; } +#define BTREE_WRITE_SET_U64s_BITS 9 + static inline unsigned btree_write_set_buffer(struct btree *b) { /* * Could buffer up larger amounts of keys for btrees with larger keys, * pending benchmarking: */ - return 4 << 10; + return 8 << BTREE_WRITE_SET_U64s_BITS; } static inline struct btree_node_entry *want_new_bset(struct bch_fs 
*c, -- cgit From b6d4f474e4e785a9090992b0f301e57870f73711 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 20:11:28 -0400 Subject: bcachefs: Move some dirent checks to bch2_dirent_invalid() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 18 +++++++++++++----- fs/bcachefs/fsck.c | 31 ------------------------------- 2 files changed, 13 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index cf4ce2e7f29c..ec4666143f23 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -84,16 +84,24 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (!len) return "empty name"; - /* - * older versions of bcachefs were buggy and creating dirent - * keys that were bigger than necessary: - */ - if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) return "value too big"; if (len > BCH_NAME_MAX) return "dirent name too big"; + if (len == 1 && !memcmp(d.v->d_name, ".", 1)) + return "invalid name"; + + if (len == 2 && !memcmp(d.v->d_name, "..", 2)) + return "invalid name"; + + if (memchr(d.v->d_name, '/', len)) + return "invalid name"; + + if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) + return "dirent points to own directory"; + return NULL; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index d65b3e100f78..36baff8409cd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -569,7 +569,6 @@ static int check_dirents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - unsigned name_len; char buf[200]; int ret = 0; @@ -628,36 +627,6 @@ retry: d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); - name_len = bch2_dirent_name_bytes(d); - - if (fsck_err_on(!name_len, c, "empty dirent") || - fsck_err_on(name_len == 1 && - !memcmp(d.v->d_name, ".", 1), c, - ". dirent") || - fsck_err_on(name_len == 2 && - !memcmp(d.v->d_name, "..", 2), c, - ".. dirent") || - fsck_err_on(name_len == 2 && - !memcmp(d.v->d_name, "..", 2), c, - ".. dirent") || - fsck_err_on(memchr(d.v->d_name, '/', name_len), c, - "dirent name has invalid chars")) { - ret = remove_dirent(&trans, d); - if (ret) - goto err; - continue; - } - - if (fsck_err_on(d_inum == d.k->p.inode, c, - "dirent points to own directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = remove_dirent(&trans, d); - if (ret) - goto err; - continue; - } - ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); if (ret && ret != -ENOENT) break; -- cgit From 3a14d58e7b330f3526509917bb6a38b55a1feef5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 21:19:25 -0400 Subject: bcachefs: Drop bch2_fsck_inode_nlink() We've had BCH_FEATURE_atomic_nlink for quite some time, we can drop this now. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 9 --------- fs/bcachefs/fsck.h | 1 - fs/bcachefs/recovery.c | 33 ++++++++++++++------------------- 3 files changed, 14 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 36baff8409cd..8fa41b36f72d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1468,15 +1468,6 @@ int bch2_fsck_full(struct bch_fs *c) check_inode_nlinks(c, &lostfound_inode); } -int bch2_fsck_inode_nlink(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - - return check_root(c, &root_inode) ?: - check_lostfound(c, &root_inode, &lostfound_inode) ?: - check_inode_nlinks(c, &lostfound_inode); -} - int bch2_fsck_walk_inodes_only(struct bch_fs *c) { struct btree_trans trans; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 9e4af02bde1e..264f2706b12d 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -3,7 +3,6 @@ #define _BCACHEFS_FSCK_H int bch2_fsck_full(struct bch_fs *); -int bch2_fsck_inode_nlink(struct bch_fs *); int bch2_fsck_walk_inodes_only(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a3a6abb88d6f..24c0646913a8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1005,6 +1005,13 @@ int bch2_fs_recovery(struct bch_fs *c) } + if (!c->sb.clean && + !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required"); + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; @@ -1181,25 +1188,6 @@ use_clean: bch_verbose(c, "alloc write done"); } - if (!c->sb.clean) { - if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { - bch_info(c, "checking inode link counts"); - err = "error in recovery"; - ret = bch2_fsck_inode_nlink(c); - if (ret) - goto err; - bch_verbose(c, "check inodes done"); - - } else { - bch_verbose(c, "checking for deleted inodes"); - err = "error in recovery"; - ret = bch2_fsck_walk_inodes_only(c); - if (ret) - goto err; - bch_verbose(c, "check inodes done"); - } - } - if (c->opts.fsck) { bch_info(c, "starting fsck"); err = "error in fsck"; @@ -1207,6 +1195,13 @@ use_clean: if (ret) goto err; bch_verbose(c, "fsck done"); + } else if (!c->sb.clean) { + bch_verbose(c, "checking for deleted inodes"); + err = "error in recovery"; + ret = bch2_fsck_walk_inodes_only(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); } if (enabled_qtypes(c)) { -- cgit From b1bd955ba5693f18a091a5cfe3a21ab3bee74edf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Apr 2021 21:04:04 -0400 Subject: bcachefs: Don't wait for ALLOC_SCAN_BATCH buckets in allocator It used to be necessary for the allocator thread to batch up invalidating buckets when possible - but since we added the btree key cache that hasn't been a concern, and now it's causing the allocator thread to livelock when the filesystem is nearly full. 
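(Illustrative sketch, not part of the patch below: the new loop shape in plain C. find_buckets() and wait_for_buckets() are stand-in stubs, not the real allocator helpers; the point is that the loop now stops as soon as any buckets were found and only sleeps when none were, rather than waiting for a minimum batch.)

#include <stddef.h>

static size_t find_buckets(void)     { return 1; }	/* stub: pretend one bucket was found */
static int    wait_for_buckets(void) { return 0; }	/* stub: waiting succeeded */

static int allocator_loop(void)
{
	while (1) {
		size_t nr = find_buckets();

		if (nr)
			return 0;	/* invalidate what we found right away */

		/* Found nothing: wait for buckets instead of spinning,
		 * but never wait just because we're short of a batch. */
		if (wait_for_buckets())
			return -1;
	}
}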
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index be86e36e816a..a8a59140efbe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1071,7 +1071,7 @@ static int bch2_allocator_thread(void *arg) pr_debug("free_inc now empty"); - do { + while (1) { cond_resched(); /* * Find some buckets that we can invalidate, either @@ -1095,22 +1095,21 @@ static int bch2_allocator_thread(void *arg) wake_up_process(c->gc_thread); } + if (nr) + break; + /* * If we found any buckets, we have to invalidate them * before we scan for more - but if we didn't find very * many we may want to wait on more buckets being * available so we don't spin: */ - if (!nr || - (nr < ALLOC_SCAN_BATCH(ca) && - !fifo_empty(&ca->free[RESERVE_NONE]))) { - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; } - } while (!nr); + } up_read(&c->gc_lock); -- cgit From 6ae0d16d29707ac952cdb8c2ccb0628f074e0e69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Apr 2021 16:15:03 -0400 Subject: bcachefs: Make sure to kick journal reclaim when we're waiting on it Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 13c687fede0b..c5dab99b0cfc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -639,6 +639,8 @@ static int journal_reclaim_wait_done(struct bch_fs *c) if (ret) return ret; + journal_reclaim_kick(&c->journal); + if (mutex_trylock(&c->journal.reclaim_lock)) { ret = bch2_journal_reclaim(&c->journal); mutex_unlock(&c->journal.reclaim_lock); -- cgit From 0e96452eef51f32417d2abdb1806474083da5979 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Apr 2021 15:10:24 -0400 Subject: bcachefs: Fix bch2_gc_btree_gens() Since we're using a NOT_EXTENTS iterator, we shouldn't be setting the iter pos to the start of the extent. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2f93c9cc757d..f810ad410cd1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1197,8 +1197,6 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, -- cgit From d7f35163e61d962132539e38be89330834a8455b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Apr 2021 16:52:30 -0400 Subject: bcachefs: Fix BTREE_ITER_NOT_EXTENTS bch2_btree_iter_peek() wasn't properly checking for BTREE_ITER_IS_EXTENTS when updating iter->pos. 
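(Illustrative sketch, not part of the patch below: the position-update rule the fix enforces, with made-up types. For a plain keys iterator the position should track the returned key exactly; only for an extents iterator does the position stay put when the extent straddles it.)

#include <stdbool.h>

struct pos { unsigned long long inode, offset; };

static int pos_cmp(struct pos a, struct pos b)
{
	if (a.inode != b.inode)
		return a.inode < b.inode ? -1 : 1;
	if (a.offset != b.offset)
		return a.offset < b.offset ? -1 : 1;
	return 0;
}

/* key_pos is the key's position (its end, for an extent); key_start is
 * where an extent begins.  Returns the new iterator position. */
static struct pos update_iter_pos(struct pos iter_pos,
				  struct pos key_start, struct pos key_pos,
				  bool is_extents)
{
	if (!is_extents)
		return key_pos;		/* track the key exactly */

	if (pos_cmp(key_start, iter_pos) > 0)
		return key_start;	/* extent starts past us: advance */

	return iter_pos;		/* extent straddles pos: stay */
}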
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_iter.c | 4 +++- fs/bcachefs/btree_types.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f810ad410cd1..bf40efcaa192 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1243,7 +1243,7 @@ int bch2_gc_gens(struct bch_fs *c) } for (i = 0; i < BTREE_ID_NR; i++) - if (btree_node_type_needs_gc(i)) { + if ((1 << i) & BTREE_ID_HAS_PTRS) { ret = bch2_gc_btree_gens(c, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8f5318a38d9b..ae2e907adc73 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1641,7 +1641,9 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi * iter->pos should be mononotically increasing, and always be equal to * the key we just returned - except extents can straddle iter->pos: */ - if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter->pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); bch2_btree_iter_verify_entry_exit(iter); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9db22b35b780..493d65882222 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -616,6 +616,10 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) (1U << BTREE_ID_dirents)| \ (1U << BTREE_ID_xattrs)) +#define BTREE_ID_HAS_PTRS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)) + static inline bool btree_type_has_snapshots(enum btree_id id) { return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; -- cgit From 5c16add5ad7897248ded54c34f65cb9479ca542a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 21:41:48 -0400 Subject: bcachefs: Check inodes at start of fsck This splits out checking inode nlinks from the rest of the inode checks and moves most of the inode checks to the start of fsck, so that other fsck passes can depend on it. 
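(Illustrative sketch, not part of the patch below: how the fsck passes chain once inode checks run first. The pass functions here are stubs with simplified signatures, not the real check_* functions from fsck.c.)

/* Each pass returns 0 on success or a negative error code. */
static int check_inodes_pass(void)  { return 0; }
static int check_extents_pass(void) { return 0; }
static int check_dirents_pass(void) { return 0; }

static int fsck_full(void)
{
	/*
	 * GCC's a ?: b evaluates b only when a is zero, so the chain stops
	 * at the first failing pass.  Inode checks run first so the extent
	 * and dirent passes can assume inodes have already been validated.
	 */
	return check_inodes_pass()  ?:
	       check_extents_pass() ?:
	       check_dirents_pass();
}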
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 355 ++++++++++++++++++++++++++++------------------------- 1 file changed, 186 insertions(+), 169 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8fa41b36f72d..6e1f9194a671 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -413,6 +413,151 @@ err_redo: goto err; } +static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_inode inode) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + bool do_update = false; + int ret = 0; + + ret = bch2_inode_unpack(inode, &u); + + if (bch2_fs_inconsistent_on(ret, c, + "error unpacking inode %llu in fsck", + inode.k->p.inode)) + return ret; + + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { + bch_verbose(c, "deleting inode %llu", u.bi_inum); + + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + ret = bch2_inode_rm(c, u.bi_inum, false); + if (ret) + bch_err(c, "error in fsck: error %i while deleting inode", ret); + return ret; + } + + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); + + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + POS(u.bi_inum, U64_MAX), + NULL); + if (ret) { + bch_err(c, "error in fsck: error %i truncating inode", ret); + return ret; + } + + /* + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ + u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + + u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { + s64 sectors; + + bch_verbose(c, "recounting sectors for inode %llu", + u.bi_inum); + + sectors = bch2_count_inode_sectors(trans, u.bi_inum); + if (sectors < 0) { + bch_err(c, "error in fsck: error %i recounting inode sectors", + (int) sectors); + return sectors; + } + + u.bi_sectors = sectors; + u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + do_update = true; + } + + if (!S_ISDIR(u.bi_mode) && + u.bi_nlink && + !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || + c->opts.version_upgrade)) { + u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + + if (do_update) { + struct bkey_inode_buf p; + + bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + if (ret) + bch_err(c, "error in fsck: error %i " + "updating inode", ret); + } +fsck_err: + return ret; +} + +noinline_for_stack +static int check_inodes(struct bch_fs *c, bool full) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, 
BTREE_ID_inodes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + + inode = bkey_s_c_to_inode(k); + + if (full || + (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) { + ret = check_inode(&trans, iter, inode); + if (ret) + break; + } + } + bch2_trans_iter_put(&trans, iter); + + BUG_ON(ret == -EINTR); + + return bch2_trans_exit(&trans) ?: ret; +} + static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { @@ -1131,61 +1276,70 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, return ret; } -static int check_inode_nlink(struct bch_fs *c, +static int check_inode_nlink(struct btree_trans *trans, struct bch_inode_unpacked *lostfound_inode, - struct bch_inode_unpacked *u, - struct nlink *link, - bool *do_update) + struct btree_iter *iter, + struct bkey_s_c_inode inode, + struct nlink *link) { - u32 i_nlink = bch2_inode_nlink_get(u); - u32 real_i_nlink = - link->count * nlink_bias(u->bi_mode) + - link->dir_count; + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + u32 i_nlink, real_i_nlink; int ret = 0; + ret = bch2_inode_unpack(inode, &u); + /* Should never happen, checked by bch2_inode_invalid: */ + if (bch2_fs_inconsistent_on(ret, c, + "error unpacking inode %llu in fsck", + inode.k->p.inode)) + return ret; + + i_nlink = bch2_inode_nlink_get(&u); + real_i_nlink = link->count * nlink_bias(u.bi_mode) + link->dir_count; + /* * These should have been caught/fixed by earlier passes, we don't * repair them here: */ - if (S_ISDIR(u->bi_mode) && link->count > 1) { + if (S_ISDIR(u.bi_mode) && link->count > 1) { need_fsck_err(c, "directory %llu with multiple hardlinks: %u", - u->bi_inum, link->count); + u.bi_inum, link->count); return 0; } - if (S_ISDIR(u->bi_mode) && !link->count) { + if (S_ISDIR(u.bi_mode) && !link->count) { need_fsck_err(c, "unreachable directory found (inum %llu)", - u->bi_inum); + u.bi_inum); return 0; } - if (!S_ISDIR(u->bi_mode) && link->dir_count) { + if (!S_ISDIR(u.bi_mode) && link->dir_count) { need_fsck_err(c, "non directory with subdirectories (inum %llu)", - u->bi_inum); + u.bi_inum); return 0; } if (!link->count && - !(u->bi_flags & BCH_INODE_UNLINKED) && + !(u.bi_flags & BCH_INODE_UNLINKED) && (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", - u->bi_inum, mode_to_type(u->bi_mode)) == + u.bi_inum, mode_to_type(u.bi_mode)) == FSCK_ERR_IGNORE) return 0; - ret = reattach_inode(c, lostfound_inode, u->bi_inum); + ret = reattach_inode(c, lostfound_inode, u.bi_inum); if (ret) return ret; link->count = 1; - real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; + real_i_nlink = nlink_bias(u.bi_mode) + link->dir_count; goto set_i_nlink; } if (i_nlink < link->count) { if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", - u->bi_inum, i_nlink, link->count, - mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) + u.bi_inum, i_nlink, link->count, + mode_to_type(u.bi_mode)) == FSCK_ERR_IGNORE) return 0; goto set_i_nlink; } @@ -1195,7 +1349,7 @@ static int check_inode_nlink(struct bch_fs *c, if (fsck_err(c, "filesystem marked clean, " "but inode %llu has wrong i_nlink " "(type %u i_nlink %u, should be %u)", - u->bi_inum, mode_to_type(u->bi_mode), + u.bi_inum, mode_to_type(u.bi_mode), i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) return 0; goto set_i_nlink; @@ -1205,7 +1359,7 @@ static int check_inode_nlink(struct bch_fs *c, (c->sb.features & (1 << 
BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "inode %llu has wrong i_nlink " "(type %u i_nlink %u, should be %u)", - u->bi_inum, mode_to_type(u->bi_mode), + u.bi_inum, mode_to_type(u.bi_mode), i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) return 0; goto set_i_nlink; @@ -1213,122 +1367,12 @@ static int check_inode_nlink(struct bch_fs *c, if (real_i_nlink && i_nlink != real_i_nlink) bch_verbose(c, "setting inode %llu nlink from %u to %u", - u->bi_inum, i_nlink, real_i_nlink); + u.bi_inum, i_nlink, real_i_nlink); set_i_nlink: if (i_nlink != real_i_nlink) { - bch2_inode_nlink_set(u, real_i_nlink); - *do_update = true; - } -fsck_err: - return ret; -} - -static int check_inode(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound_inode, - struct btree_iter *iter, - struct bkey_s_c_inode inode, - struct nlink *link) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; - bool do_update = false; - int ret = 0; - - ret = bch2_inode_unpack(inode, &u); - - bch2_trans_unlock(trans); - - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fsck", - inode.k->p.inode)) - return ret; - - if (link) { - ret = check_inode_nlink(c, lostfound_inode, &u, link, - &do_update); - if (ret) - return ret; - } - - if (u.bi_flags & BCH_INODE_UNLINKED && - (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu unlinked", - u.bi_inum))) { - bch_verbose(c, "deleting inode %llu", u.bi_inum); - - bch2_fs_lazy_rw(c); - - ret = bch2_inode_rm(c, u.bi_inum, false); - if (ret) - bch_err(c, "error in fsck: error %i while deleting inode", ret); - return ret; - } - - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && - (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); - - bch2_fs_lazy_rw(c); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), - POS(u.bi_inum, U64_MAX), - NULL); - if (ret) { - bch_err(c, "error in fsck: error %i truncating inode", ret); - return ret; - } - - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; - - u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - do_update = true; - } - - if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && - (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; - - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); - - sectors = bch2_count_inode_sectors(trans, u.bi_inum); - if (sectors < 0) { - bch_err(c, "error in fsck: error %i recounting inode sectors", - (int) sectors); - return sectors; - } - - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; - do_update = true; - } - - if (!S_ISDIR(u.bi_mode) && - u.bi_nlink && - !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && - (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, - "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || - c->opts.version_upgrade)) { - u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; - do_update = true; - } - - if (do_update) { struct bkey_inode_buf p; + bch2_inode_nlink_set(&u, real_i_nlink); bch2_inode_pack(c, &p, &u); p.inode.k.p = iter->pos; @@ -1337,8 +1381,7 @@ static int check_inode(struct btree_trans *trans, BTREE_INSERT_LAZY_RW, 
(bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); if (ret) - bch_err(c, "error in fsck: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error %i updating inode", ret); } fsck_err: return ret; @@ -1387,8 +1430,8 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { - ret = check_inode(&trans, lostfound_inode, iter, - bkey_s_c_to_inode(k), link); + ret = check_inode_nlink(&trans, lostfound_inode, iter, + bkey_s_c_to_inode(k), link); BUG_ON(ret == -EINTR); if (ret) break; @@ -1416,7 +1459,7 @@ fsck_err: } noinline_for_stack -static int check_inode_nlinks(struct bch_fs *c, +static int check_nlinks(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode) { nlink_table links; @@ -1459,43 +1502,17 @@ int bch2_fsck_full(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; - return check_extents(c) ?: + return check_inodes(c, true) ?: + check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: check_root(c, &root_inode) ?: check_lostfound(c, &root_inode, &lostfound_inode) ?: check_directory_structure(c, &lostfound_inode) ?: - check_inode_nlinks(c, &lostfound_inode); + check_nlinks(c, &lostfound_inode); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_s_c_inode inode; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_inode) - continue; - - inode = bkey_s_c_to_inode(k); - - if (inode.v->bi_flags & - (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED)) { - ret = check_inode(&trans, NULL, iter, inode, NULL); - if (ret) - break; - } - } - bch2_trans_iter_put(&trans, iter); - - BUG_ON(ret == -EINTR); - - return bch2_trans_exit(&trans) ?: ret; + return check_inodes(c, false); } -- cgit From 7ac2c55e4dec9af38bd9447271944296a4a38814 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Apr 2021 01:55:57 -0400 Subject: bcachefs: Simplify hash table checks Very early on there was a period where we were accidentally generating dirents with trailing garbage; we've since dropped support for filesystems that old and the fsck code can be dropped. Also, this patch switches to a simpler algorithm for checking hash tables. It's less efficient on hash collision - but with 64 bit keys, those are very rare. 
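(Illustrative sketch, not part of the patch below: a toy model of the simpler check. The rule is: hash the key, and if it isn't stored at its hashed slot, every slot between the hashed slot and where it actually sits must be occupied; a hole means lookups would stop early and never reach the key, so it has to be rehashed. The flat array and names are made up; the real code walks btree slots.)

#include <stdbool.h>
#include <stddef.h>

#define EMPTY 0ULL	/* 0 marks an unused slot in this toy table */

/* Returns true if the key stored at slot 'pos' can be found by probing
 * forward from slot 'hashed'. */
static bool key_reachable(const unsigned long long *table, size_t nr_slots,
			  size_t hashed, size_t pos)
{
	size_t i;

	if (pos >= nr_slots || pos < hashed)
		return false;		/* stored before its hash: always wrong */

	for (i = hashed; i < pos; i++)
		if (table[i] == EMPTY)
			return false;	/* probe chain is broken */

	return true;
}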
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 296 ++++++++++++----------------------------------------- 1 file changed, 65 insertions(+), 231 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6e1f9194a671..0d27a7a736e0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -142,42 +142,10 @@ static int walk_inode(struct btree_trans *trans, return 0; } -struct hash_check { - struct bch_hash_info info; - - /* start of current chain of hash collisions: */ - struct btree_iter *chain; - - /* next offset in current chain of hash collisions: */ - u64 chain_end; -}; - -static void hash_check_init(struct hash_check *h) -{ - h->chain = NULL; - h->chain_end = 0; -} - -static void hash_stop_chain(struct btree_trans *trans, - struct hash_check *h) -{ - if (h->chain) - bch2_trans_iter_free(trans, h->chain); - h->chain = NULL; -} - -static void hash_check_set_inode(struct btree_trans *trans, - struct hash_check *h, - const struct bch_inode_unpacked *bi) -{ - h->info = bch2_hash_info_init(trans->c, bi); - hash_stop_chain(trans, h); -} - -static int hash_redo_key(const struct bch_hash_desc desc, - struct btree_trans *trans, struct hash_check *h, - struct btree_iter *k_iter, struct bkey_s_c k, - u64 hashed) +static int hash_redo_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k) { struct bkey_i delete; struct bkey_i *tmp; @@ -192,7 +160,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, delete.k.p = k_iter->pos; bch2_trans_update(trans, k_iter, &delete, 0); - return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); } @@ -216,201 +184,72 @@ retry: return ret; } -static int hash_check_duplicates(struct btree_trans *trans, - const struct bch_hash_desc desc, struct hash_check *h, - struct btree_iter *k_iter, struct bkey_s_c k) +static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter *iter; - struct bkey_s_c k2; + struct btree_iter *iter = NULL; char buf[200]; + struct bkey_s_c k; + u64 hash; int ret = 0; - if (!bkey_cmp(h->chain->pos, k_iter->pos)) + if (hash_k.k->type != desc.key_type) return 0; - iter = bch2_trans_copy_iter(trans, h->chain); + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; - for_each_btree_key_continue(iter, 0, k2, ret) { - if (bkey_cmp(k2.k->p, k.k->p) >= 0) + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), + BTREE_ITER_SLOTS, k, ret) { + if (!bkey_cmp(k.k->p, hash_k.k->p)) break; - if (fsck_err_on(k2.k->type == desc.key_type && - !desc.cmp_bkey(k, k2), c, + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); + hash_k), buf))) { + ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); if (ret) return ret; ret = 1; break; } - } -fsck_err: - bch2_trans_iter_free(trans, iter); - return ret; -} - -static void hash_set_chain_start(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct hash_check *h, - struct btree_iter *k_iter, struct 
bkey_s_c k) -{ - bool hole = (k.k->type != KEY_TYPE_hash_whiteout && - k.k->type != desc.key_type); - - if (hole || k.k->p.offset > h->chain_end + 1) - hash_stop_chain(trans, h); - if (!hole) { - if (!h->chain) - h->chain = bch2_trans_copy_iter(trans, k_iter); - - h->chain_end = k.k->p.offset; - } -} - -static bool key_has_correct_hash(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct hash_check *h, - struct btree_iter *k_iter, struct bkey_s_c k) -{ - u64 hash; - - hash_set_chain_start(trans, desc, h, k_iter, k); - - if (k.k->type != desc.key_type) - return true; - - hash = desc.hash_bkey(&h->info, k); - - return hash >= h->chain->pos.offset && - hash <= k.k->p.offset; -} - -static int hash_check_key(struct btree_trans *trans, - const struct bch_hash_desc desc, struct hash_check *h, - struct btree_iter *k_iter, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - char buf[200]; - u64 hashed; - int ret = 0; - - hash_set_chain_start(trans, desc, h, k_iter, k); - - if (k.k->type != desc.key_type) - return 0; - - hashed = desc.hash_bkey(&h->info, k); - - if (fsck_err_on(hashed < h->chain->pos.offset || - hashed > k.k->p.offset, c, - "hash table key at wrong offset: btree %u, %llu, " - "hashed to %llu chain starts at %llu\n%s", - desc.btree_id, k.k->p.offset, - hashed, h->chain->pos.offset, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - hash_redo_key(desc, trans, h, k_iter, k, hashed)); - if (ret) { - bch_err(c, "hash_redo_key err %i", ret); - return ret; + if (bkey_deleted(k.k)) { + bch2_trans_iter_free(trans, iter); + goto bad_hash; } - return -EINTR; - } - ret = hash_check_duplicates(trans, desc, h, k_iter, k); -fsck_err: + } + bch2_trans_iter_free(trans, iter); return ret; -} - -static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, - struct btree_iter *iter, struct bkey_s_c *k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *d = NULL; - int ret = -EINVAL; - char buf[200]; - unsigned len; - u64 hash; - - if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) +bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + "hashed to %llu should be at %llu\n%s", + desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, + hash, iter->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) return 0; - len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); - BUG_ON(!len); - - memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); - buf[len] = '\0'; - - d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!d) { - bch_err(c, "memory allocation failure"); - return -ENOMEM; - } - - bkey_reassemble(&d->k_i, *k); - - do { - --len; - if (!len) - goto err_redo; - - d->k.u64s = BKEY_U64s + dirent_val_u64s(len); - - BUG_ON(bkey_val_bytes(&d->k) < - offsetof(struct bch_dirent, d_name) + len); - - memset(d->v.d_name + len, 0, - bkey_val_bytes(&d->k) - - offsetof(struct bch_dirent, d_name) - len); - - hash = bch2_dirent_hash_desc.hash_bkey(&h->info, - bkey_i_to_s_c(&d->k_i)); - } while (hash < h->chain->pos.offset || - hash > k->k->p.offset); - - if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", - buf, strlen(buf), d->v.d_name, len)) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); - if (ret) - goto err; - - *k = bch2_btree_iter_peek(iter); - - BUG_ON(k->k->type != 
KEY_TYPE_dirent); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); + if (ret) { + bch_err(c, "hash_redo_key err %i", ret); + return ret; } -err: + return -EINTR; fsck_err: - kfree(d); return ret; -err_redo: - hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); - - if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" - "hash table key at wrong offset: btree %u, offset %llu, " - "hashed to %llu chain starts at %llu\n%s", - buf, strlen(buf), BTREE_ID_dirents, - k->k->p.offset, hash, h->chain->pos.offset, - (bch2_bkey_val_to_text(&PBUF(buf), c, - *k), buf))) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - hash_redo_key(bch2_dirent_hash_desc, trans, - h, iter, *k, hash)); - if (ret) - bch_err(c, "hash_redo_key err %i", ret); - else - ret = 1; - } - - goto err; } static int check_inode(struct btree_trans *trans, @@ -710,7 +549,7 @@ noinline_for_stack static int check_dirents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); - struct hash_check h; + struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -721,8 +560,6 @@ static int check_dirents(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - hash_check_init(&h); - iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), 0); retry: @@ -749,25 +586,26 @@ retry: ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) goto err; - continue; + goto next; } - if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&trans, &h, &w.inode); + if (!w.have_inode) + goto next; - ret = check_dirent_hash(&trans, &h, iter, &k); + if (w.first_this_inode) + hash_info = bch2_hash_info_init(c, &w.inode); + + ret = hash_check_key(&trans, bch2_dirent_hash_desc, + &hash_info, iter, k); if (ret > 0) { ret = 0; - continue; + goto next; } if (ret) goto fsck_err; - if (ret) - goto fsck_err; - if (k.k->type != KEY_TYPE_dirent) - continue; + goto next; d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); @@ -786,9 +624,12 @@ retry: ret = remove_dirent(&trans, d); if (ret) goto err; - continue; + goto next; } + if (!have_target) + goto next; + if (!target.bi_nlink && !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && (target.bi_dir != k.k->p.inode || @@ -822,8 +663,7 @@ retry: continue; } - if (fsck_err_on(have_target && - d.v->d_type != + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), @@ -849,17 +689,14 @@ retry: goto err; } - +next: bch2_btree_iter_advance(iter); } - - hash_stop_chain(&trans, &h); err: fsck_err: if (ret == -EINTR) goto retry; - bch2_trans_iter_put(&trans, h.chain); bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -871,7 +708,7 @@ noinline_for_stack static int check_xattrs(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); - struct hash_check h; + struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -879,8 +716,6 @@ static int check_xattrs(struct bch_fs *c) bch_verbose(c, "checking xattrs"); - hash_check_init(&h); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, @@ -902,10 +737,10 @@ retry: } if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&trans, &h, &w.inode); + hash_info = bch2_hash_info_init(c, &w.inode); ret = 
hash_check_key(&trans, bch2_xattr_hash_desc, - &h, iter, k); + &hash_info, iter, k); if (ret) break; @@ -915,7 +750,6 @@ fsck_err: if (ret == -EINTR) goto retry; - bch2_trans_iter_put(&trans, h.chain); bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } -- cgit From 8a85b20cd757d9ebc784adc7d56ea378b9bf30c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 20:15:26 -0400 Subject: bcachefs: Inode backpointers are now required This lets us simplify fsck quite a bit, which we need for making fsck snapshot aware. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 187 +++++++++++++++++++++++++++++++++++++------------ fs/bcachefs/recovery.c | 7 ++ 2 files changed, 151 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0d27a7a736e0..5be86bf60545 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -38,6 +38,49 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, + POS(0, inode_nr), 0); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (snapshot) + *snapshot = iter->pos.snapshot; + ret = k.k->type == KEY_TYPE_inode + ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter *inode_iter = + bch2_trans_get_iter(trans, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_inode_write(trans, inode_iter, inode)); + bch2_trans_iter_put(trans, inode_iter); + if (ret) + bch_err(trans->c, "error in fsck: error %i updating inode", ret); + return ret; +} + static int __remove_dirent(struct btree_trans *trans, struct bkey_s_c_dirent dirent) { @@ -58,7 +101,7 @@ static int __remove_dirent(struct btree_trans *trans, buf[name.len] = '\0'; name.name = buf; - ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); + ret = lookup_inode(trans, dir_inum, &dir_inode, NULL); if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); if (ret) @@ -111,6 +154,7 @@ struct inode_walker { bool first_this_inode; bool have_inode; u64 cur_inum; + u32 snapshot; struct bch_inode_unpacked inode; }; @@ -126,8 +170,7 @@ static int walk_inode(struct btree_trans *trans, struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = __bch2_inode_find_by_inum_trans(trans, inum, - &w->inode, 0); + int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); if (ret && ret != -ENOENT) return ret; @@ -432,6 +475,35 @@ static int fix_overlapping_extent(struct btree_trans *trans, BTREE_INSERT_LAZY_RW); } +static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k->type != KEY_TYPE_dirent) + goto out; + + ret = 
le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; +out: + bch2_trans_iter_free(trans, iter); + return ret; +} + +static bool inode_backpointer_matches(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.k->p.inode == inode->bi_dir && + d.k->p.offset == inode->bi_dir_offset; +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -466,18 +538,9 @@ retry: "inode %llu has incorrect i_sectors: got %llu, should be %llu", w.inode.bi_inum, w.inode.bi_sectors, i_sectors)) { - struct btree_iter *inode_iter = - bch2_trans_get_iter(&trans, BTREE_ID_inodes, - POS(0, w.cur_inum), - BTREE_ITER_INTENT); - w.inode.bi_sectors = i_sectors; - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_inode_write(&trans, inode_iter, &w.inode)); - bch2_trans_iter_put(&trans, inode_iter); + ret = write_inode(&trans, &w.inode, w.snapshot); if (ret) break; } @@ -554,6 +617,7 @@ static int check_dirents(struct bch_fs *c) struct btree_iter *iter; struct bkey_s_c k; char buf[200]; + unsigned nr_subdirs = 0; int ret = 0; bch_verbose(c, "checking dirents"); @@ -567,13 +631,29 @@ retry: !(ret = bkey_err(k))) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; + u32 target_snapshot; bool have_target; + bool backpointer_exists = true; u64 d_inum; + if (w.have_inode && + w.cur_inum != k.k->p.inode && + fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, + "directory %llu with wrong i_nlink: got %u, should be %u", + w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { + w.inode.bi_nlink = nr_subdirs; + ret = write_inode(&trans, &w.inode, w.snapshot); + if (ret) + break; + } + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; + if (w.first_this_inode) + nr_subdirs = 0; + if (fsck_err_on(!w.have_inode, c, "dirent in nonexisting directory:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, @@ -610,7 +690,7 @@ retry: d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); - ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); + ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); if (ret && ret != -ENOENT) break; @@ -630,41 +710,60 @@ retry: if (!have_target) goto next; - if (!target.bi_nlink && - !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && - (target.bi_dir != k.k->p.inode || - target.bi_dir_offset != k.k->p.offset) && - (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset) || - c->opts.version_upgrade)) { - struct bkey_inode_buf p; - - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - bch2_trans_unlock(&trans); + if (!inode_backpointer_matches(d, &target)) { + ret = inode_backpointer_exists(&trans, &target); + if (ret < 0) + goto err; - bch2_inode_pack(c, &p, &target); + backpointer_exists = ret; + ret = 0; + } - ret = bch2_btree_insert(c, BTREE_ID_inodes, - &p.inode.k_i, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error in fsck: error %i updating inode", ret); + if (fsck_err_on(S_ISDIR(target.bi_mode) && + !inode_backpointer_matches(d, &target) && + backpointer_exists, c, + "directory %llu with multiple links", + target.bi_inum)) { + ret = remove_dirent(&trans, d); + if (ret) goto err; - } continue; } - if (fsck_err_on(d.v->d_type != - mode_to_type(target.bi_mode), c, + if 
(!inode_backpointer_matches(d, &target) && + (S_ISDIR(target.bi_mode) || !target.bi_nlink)) { + if (backpointer_exists) { + if (!fsck_err(c, "inode %llu has multiple links but i_nlink 0", + d_inum)) + goto check_type; + + target.bi_nlink++; + target.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + } else { + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers && + !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + !fsck_err(c, "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset)) + goto check_type; + + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + target.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + } + + ret = write_inode(&trans, &target, target_snapshot); + if (ret) + goto err; + continue; + } +check_type: + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, @@ -689,6 +788,8 @@ retry: goto err; } + + nr_subdirs += d.v->d_type == DT_DIR; next: bch2_btree_iter_advance(iter); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 24c0646913a8..012a08574022 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1024,6 +1024,13 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { + bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); -- cgit From b906aaddf2144b9f4ebdb8618e8ab1af00a58644 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Apr 2021 15:25:29 -0400 Subject: bcachefs: Redo check_nlink fsck pass Now that we have inode backpointers the check_nlink pass only is concerned with files that have hardlinks, and can be simplified. 
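In outline, the new pass counts dirents that reference each non-directory inode and compares that count against the inode's i_nlink; directories are left to the backpointer and directory structure checks, since they can't have hardlinks. A minimal standalone sketch of that invariant (plain C, illustration only - the real pass below works on btree keys and a genradix table):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct sketch_dirent { uint64_t d_inum; bool is_dir; };
struct sketch_inode  { uint64_t inum; bool is_dir; uint32_t i_nlink; };

/* Count how many non-directory dirents point at @inum: */
static uint32_t sketch_count_links(const struct sketch_dirent *d, size_t nr,
				   uint64_t inum)
{
	uint32_t count = 0;
	size_t i;

	for (i = 0; i < nr; i++)
		if (!d[i].is_dir && d[i].d_inum == inum)
			count++;
	return count;
}

/* A non-directory inode needs repair when its stored link count disagrees: */
static bool sketch_nlink_wrong(const struct sketch_inode *ino,
			       const struct sketch_dirent *d, size_t nr)
{
	return !ino->is_dir &&
	       ino->i_nlink != sketch_count_links(d, nr, ino->inum);
}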
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 179 ++++++++++++----------------------------------------- 1 file changed, 41 insertions(+), 138 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5be86bf60545..6bc3f2f09e36 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1137,14 +1137,12 @@ fsck_err: struct nlink { u32 count; - u32 dir_count; }; typedef GENRADIX(struct nlink) nlink_table; static void inc_link(struct bch_fs *c, nlink_table *links, - u64 range_start, u64 *range_end, - u64 inum, bool dir) + u64 range_start, u64 *range_end, u64 inum) { struct nlink *link; @@ -1163,10 +1161,7 @@ static void inc_link(struct bch_fs *c, nlink_table *links, return; } - if (dir) - link->dir_count++; - else - link->count++; + link->count++; } noinline_for_stack @@ -1177,26 +1172,18 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent d; - u64 d_inum; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - if (d.v->d_type == DT_DIR) + if (d.v->d_type != DT_DIR) inc_link(c, links, range_start, range_end, - d.k->p.inode, true); - - inc_link(c, links, range_start, range_end, - d_inum, false); - + le64_to_cpu(d.v->d_inum)); break; } @@ -1215,99 +1202,48 @@ static int check_inode_nlink(struct btree_trans *trans, struct bch_inode_unpacked *lostfound_inode, struct btree_iter *iter, struct bkey_s_c_inode inode, - struct nlink *link) + unsigned nlink) { struct bch_fs *c = trans->c; struct bch_inode_unpacked u; - u32 i_nlink, real_i_nlink; int ret = 0; + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + return 0; + ret = bch2_inode_unpack(inode, &u); + /* Should never happen, checked by bch2_inode_invalid: */ if (bch2_fs_inconsistent_on(ret, c, "error unpacking inode %llu in fsck", inode.k->p.inode)) return ret; - i_nlink = bch2_inode_nlink_get(&u); - real_i_nlink = link->count * nlink_bias(u.bi_mode) + link->dir_count; - - /* - * These should have been caught/fixed by earlier passes, we don't - * repair them here: - */ - if (S_ISDIR(u.bi_mode) && link->count > 1) { - need_fsck_err(c, "directory %llu with multiple hardlinks: %u", - u.bi_inum, link->count); - return 0; - } - - if (S_ISDIR(u.bi_mode) && !link->count) { - need_fsck_err(c, "unreachable directory found (inum %llu)", - u.bi_inum); - return 0; - } - - if (!S_ISDIR(u.bi_mode) && link->dir_count) { - need_fsck_err(c, "non directory with subdirectories (inum %llu)", - u.bi_inum); - return 0; - } - - if (!link->count && - !(u.bi_flags & BCH_INODE_UNLINKED) && - (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { - if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", - u.bi_inum, mode_to_type(u.bi_mode)) == - FSCK_ERR_IGNORE) - return 0; - + /* Improved directory structure pass will catch this: */ + if (fsck_err_on(!nlink, c, + "unreachable inode %llu not marked as unlinked (type %u)", + u.bi_inum, mode_to_type(u.bi_mode))) { ret = reattach_inode(c, lostfound_inode, u.bi_inum); if (ret) return ret; - link->count = 1; - real_i_nlink = nlink_bias(u.bi_mode) + link->dir_count; - goto 
set_i_nlink; - } - - if (i_nlink < link->count) { - if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", - u.bi_inum, i_nlink, link->count, - mode_to_type(u.bi_mode)) == FSCK_ERR_IGNORE) - return 0; - goto set_i_nlink; - } - - if (i_nlink != real_i_nlink && - c->sb.clean) { - if (fsck_err(c, "filesystem marked clean, " - "but inode %llu has wrong i_nlink " - "(type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), - i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) - return 0; - goto set_i_nlink; - } - - if (i_nlink != real_i_nlink && - (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { - if (fsck_err(c, "inode %llu has wrong i_nlink " - "(type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), - i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) - return 0; - goto set_i_nlink; + nlink = 1; } - if (real_i_nlink && i_nlink != real_i_nlink) - bch_verbose(c, "setting inode %llu nlink from %u to %u", - u.bi_inum, i_nlink, real_i_nlink); -set_i_nlink: - if (i_nlink != real_i_nlink) { + if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, + "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", + u.bi_inum, mode_to_type(u.bi_mode), + bch2_inode_nlink_get(&u), nlink)) { struct bkey_inode_buf p; - bch2_inode_nlink_set(&u, real_i_nlink); + if (nlink > 1) + u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + + bch2_inode_nlink_set(&u, nlink); bch2_inode_pack(c, &p, &u); p.inode.k.p = iter->pos; @@ -1331,66 +1267,33 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct nlink *link, zero_links = { 0, 0 }; - struct genradix_iter nlinks_iter; - int ret = 0, ret2 = 0; - u64 nlinks_pos; + struct nlink *link; + int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, - POS(0, range_start), 0); - nlinks_iter = genradix_iter_init(links, 0); - - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret2 = bkey_err(k)) && - iter->pos.offset < range_end) { -peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - - if (!link && (!k.k || iter->pos.offset >= range_end)) + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), 0, k, ret) { + if (!k.k || k.k->p.offset >= range_end) break; - nlinks_pos = range_start + nlinks_iter.pos; - - if (link && nlinks_pos < iter->pos.offset) { - /* Should have been caught by dirents pass: */ - need_fsck_err_on(link->count, c, - "missing inode %llu (nlink %u)", - nlinks_pos, link->count); - genradix_iter_advance(&nlinks_iter, links); - goto peek_nlinks; - } - - if (!link || nlinks_pos > iter->pos.offset) - link = &zero_links; - - if (k.k && k.k->type == KEY_TYPE_inode) { - ret = check_inode_nlink(&trans, lostfound_inode, iter, - bkey_s_c_to_inode(k), link); - BUG_ON(ret == -EINTR); - if (ret) - break; - } else { - /* Should have been caught by dirents pass: */ - need_fsck_err_on(link->count, c, - "missing inode %llu (nlink %u)", - nlinks_pos, link->count); - } + if (k.k->type != KEY_TYPE_inode) + continue; - if (nlinks_pos == iter->pos.offset) - genradix_iter_advance(&nlinks_iter, links); + link = genradix_ptr(links, k.k->p.offset - range_start); + ret = check_inode_nlink(&trans, lostfound_inode, iter, + bkey_s_c_to_inode(k), link ? 
link->count : 0); + if (ret) + break; - bch2_btree_iter_advance(iter); - bch2_trans_cond_resched(&trans); } -fsck_err: bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - if (ret2) - bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); + if (ret) + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); - return ret ?: ret2; + return ret; } noinline_for_stack -- cgit From b69ac13cb39176634f1dd924dfabe2e282615d41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Apr 2021 14:00:07 -0400 Subject: bcachefs: Fix bch2_trans_relock() The patch that changed bch2_trans_relock() to not look at iter->uptodate also tried to add an optimization by only having it relock btree_iter_key() iterators (iterators that are live or have been marked as keep). But, this wasn't thought through - this pops internal iterator assertions because on transaction restart, when we're traversing iterators we traverse all iterators marked as linked, and having bch2_trans_relock() skip some of those mean that it can skil the iterator that bch2_btree_iter_traverse_one() is currently traversing. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ae2e907adc73..033a079fb3f3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -472,8 +472,7 @@ bool bch2_trans_relock(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (btree_iter_keep(trans, iter) && - !bch2_btree_iter_relock(iter, true)) { + if (!bch2_btree_iter_relock(iter, true)) { trace_trans_restart_relock(trans->ip); return false; } -- cgit From 176cf4bf59014d03be6cef33cabb677d2117dbb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Apr 2021 03:25:37 -0400 Subject: bcachefs: Fix fsck to not use bch2_link_trans() bch2_link_trans() uses the btree key cache for inode updates, and fsck isn't supposed to - also, it's not really what we want for reattaching unreachable inodes anyways. 
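The fsck-local replacement below (__reattach_inode()) does the work with ordinary btree iterators instead: it looks up the target inode, bumps lost+found's i_nlink when reattaching a directory, creates a dirent in lost+found named after the inode number, and points the inode's bi_dir/bi_dir_offset backpointer at the new dirent; reattach_inode() wraps the sequence in __bch2_trans_do() so it is retried on transaction restart.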
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6bc3f2f09e36..d7d26fb40432 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -128,22 +128,63 @@ static int remove_dirent(struct btree_trans *trans, __remove_dirent(trans, dirent)); } -static int reattach_inode(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - u64 inum) +static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound, + u64 inum) { - struct bch_inode_unpacked dir_u, inode_u; + struct bch_hash_info dir_hash = + bch2_hash_info_init(trans->c, lostfound); + struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct bch_inode_unpacked inode_u; char name_buf[20]; struct qstr name; + u64 dir_offset = 0; int ret; snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bch2_link_trans(&trans, lostfound_inode->bi_inum, - inum, &dir_u, &inode_u, &name)); + inode_iter = bch2_inode_peek(trans, &inode_u, inum, 0); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; + + if (S_ISDIR(inode_u.bi_mode)) { + lostfound->bi_nlink++; + + ret = write_inode(trans, lostfound, U32_MAX); + if (ret) + goto err; + } + + ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, + mode_to_type(inode_u.bi_mode), + &name, inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + inode_u.bi_dir = lostfound->bi_inum; + inode_u.bi_dir_offset = dir_offset; + + ret = write_inode(trans, &inode_u, U32_MAX); + if (ret) + goto err; +err: + bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_put(trans, inode_iter); + return ret; +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound, + u64 inum) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + __reattach_inode(trans, lostfound, inum)); if (ret) bch_err(c, "error %i reattaching inode %llu", ret, inum); @@ -1105,9 +1146,7 @@ retry: if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, "unreachable directory found (inum %llu)", k.k->p.offset)) { - bch2_trans_unlock(&trans); - - ret = reattach_inode(c, lostfound_inode, k.k->p.offset); + ret = reattach_inode(&trans, lostfound_inode, k.k->p.offset); if (ret) { goto err; } @@ -1227,7 +1266,7 @@ static int check_inode_nlink(struct btree_trans *trans, if (fsck_err_on(!nlink, c, "unreachable inode %llu not marked as unlinked (type %u)", u.bi_inum, mode_to_type(u.bi_mode))) { - ret = reattach_inode(c, lostfound_inode, u.bi_inum); + ret = reattach_inode(trans, lostfound_inode, u.bi_inum); if (ret) return ret; -- cgit From d3ff7fec9c604e2cac3d0126f6764c5c0392a271 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Apr 2021 03:11:07 -0400 Subject: bcachefs: Improved check_directory_structure() Now that we have inode backpointers, we can simplify checking directory structure: instead of doing a DFS from the filesystem root and then checking if we found everything, we can iterate over every inode and see if we can go up until we get to the root. This patch also has a number of fixes and simplifications for the inode backpointer checks. Also, it turns out we don't actually need the BCH_INODE_BACKPTR_UNTRUSTED flag. 
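A minimal standalone sketch of the upward walk (plain C, illustration only - the real check_path() added below works on btree iterators, reattaches unreachable inodes to lost+found and breaks loops by removing a dirent):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_ROOT_INO	4096	/* stand-in for BCACHEFS_ROOT_INO */

struct sketch_inode {
	uint64_t inum;
	uint64_t bi_dir;	/* backpointer: parent directory inode */
};

/* Toy inode table: 4097 lives in the root, 4098 in 4097, 4099 in itself (a loop) */
static const struct sketch_inode table[] = {
	{ 4097, SKETCH_ROOT_INO },
	{ 4098, 4097 },
	{ 4099, 4099 },
};

static const struct sketch_inode *sketch_lookup(uint64_t inum)
{
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].inum == inum)
			return &table[i];
	return NULL;
}

/*
 * Walk up via backpointers: true if we reach the root, false if the chain
 * dead-ends (unreachable inode) or revisits an inode (directory loop).
 */
static bool sketch_reachable(uint64_t inum)
{
	uint64_t path[32];
	size_t nr = 0, i;

	while (inum != SKETCH_ROOT_INO) {
		const struct sketch_inode *ino = sketch_lookup(inum);

		if (!ino || nr == sizeof(path) / sizeof(path[0]))
			return false;

		for (i = 0; i < nr; i++)
			if (path[i] == inum)
				return false;	/* directory structure loop */

		path[nr++] = inum;
		inum = ino->bi_dir;
	}
	return true;
}

int main(void)
{
	printf("4098 reachable: %d, 4099 reachable: %d\n",
	       sketch_reachable(4098), sketch_reachable(4099));
	return 0;
}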
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 9 +- fs/bcachefs/fs-common.c | 8 +- fs/bcachefs/fsck.c | 398 ++++++++++++++++++++------------------------- fs/bcachefs/inode.c | 31 +--- fs/bcachefs/inode.h | 4 - 5 files changed, 193 insertions(+), 257 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4ce12ae29a55..0c7caa7e91a0 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -103,13 +103,12 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ +#define lockrestart_do(_trans, _do) \ ({ \ int _ret; \ \ while (1) { \ - _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ - (_journal_seq), (_flags)); \ + _ret = (_do); \ if (_ret != -EINTR) \ break; \ bch2_trans_reset(_trans, 0); \ @@ -118,6 +117,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans, _ret; \ }) +#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 281a6135e599..34d69c3f6680 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -110,8 +110,6 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ret = PTR_ERR_OR_ZERO(dir_iter); if (ret) @@ -175,6 +173,12 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; + if (inode_u->bi_dir == k.k->p.inode && + inode_u->bi_dir_offset == k.k->p.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); bch2_inode_nlink_dec(inode_u); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index d7d26fb40432..fa1922cb5c87 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -59,7 +59,7 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_free(trans, iter); return ret; } @@ -134,27 +134,26 @@ static int __reattach_inode(struct btree_trans *trans, { struct bch_hash_info dir_hash = bch2_hash_info_init(trans->c, lostfound); - struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_inode_unpacked inode_u; char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 snapshot; int ret; snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - inode_iter = bch2_inode_peek(trans, &inode_u, inum, 0); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = lookup_inode(trans, inum, &inode_u, &snapshot); if (ret) - goto err; + return ret; if (S_ISDIR(inode_u.bi_mode)) { lostfound->bi_nlink++; ret = write_inode(trans, lostfound, U32_MAX); if (ret) - goto err; + return ret; } ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, @@ -162,18 +161,12 @@ static int __reattach_inode(struct btree_trans *trans, &name, inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) - goto err; + return ret; inode_u.bi_dir = lostfound->bi_inum; inode_u.bi_dir_offset = dir_offset; - ret = write_inode(trans, &inode_u, U32_MAX); - if (ret) - goto err; -err: - bch2_trans_iter_put(trans, dir_iter); - bch2_trans_iter_put(trans, inode_iter); - return ret; + return write_inode(trans, &inode_u, U32_MAX); } static int reattach_inode(struct btree_trans *trans, @@ -191,6 +184,30 @@ static int reattach_inode(struct btree_trans *trans, return ret; } +static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k->type != KEY_TYPE_dirent) { + ret = -ENOENT; + goto out; + } + + ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + struct inode_walker { bool first_this_inode; bool have_inode; @@ -420,26 +437,18 @@ static int check_inode(struct btree_trans *trans, do_update = true; } - if (!S_ISDIR(u.bi_mode) && - u.bi_nlink && - !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && - (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, - "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || - c->opts.version_upgrade)) { - u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { + u.bi_dir = 0; + u.bi_dir_offset = 0; + u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; do_update = true; } if (do_update) { - struct bkey_inode_buf p; - - bch2_inode_pack(c, &p, &u); - p.inode.k.p = iter->pos; - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + bch2_inode_write(trans, iter, &u)); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -704,7 +713,8 @@ retry: mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = lockrestart_do(&trans, + bch2_btree_delete_at(&trans, iter, 0)); if (ret) goto err; goto next; @@ -751,6 +761,16 @@ retry: if (!have_target) goto next; + if (!target.bi_dir && + !target.bi_dir_offset) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + + ret = write_inode(&trans, &target, target_snapshot); + 
if (ret) + goto err; + } + if (!inode_backpointer_matches(d, &target)) { ret = inode_backpointer_exists(&trans, &target); if (ret < 0) @@ -758,52 +778,47 @@ retry: backpointer_exists = ret; ret = 0; - } - if (fsck_err_on(S_ISDIR(target.bi_mode) && - !inode_backpointer_matches(d, &target) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) { - ret = remove_dirent(&trans, d); - if (ret) - goto err; - continue; - } - - if (!inode_backpointer_matches(d, &target) && - (S_ISDIR(target.bi_mode) || !target.bi_nlink)) { - if (backpointer_exists) { - if (!fsck_err(c, "inode %llu has multiple links but i_nlink 0", - d_inum)) - goto check_type; + if (fsck_err_on(S_ISDIR(target.bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target.bi_inum)) { + ret = remove_dirent(&trans, d); + if (ret) + goto err; + continue; + } + if (fsck_err_on(backpointer_exists && + !target.bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + d_inum)) { target.bi_nlink++; - target.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; - } else { - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers && - !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && - !fsck_err(c, "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) - goto check_type; + target.bi_flags &= ~BCH_INODE_UNLINKED; + + ret = write_inode(&trans, &target, target_snapshot); + if (ret) + goto err; + } + if (fsck_err_on(!backpointer_exists, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset)) { target.bi_dir = k.k->p.inode; target.bi_dir_offset = k.k->p.offset; - target.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; - } - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - continue; + ret = write_inode(&trans, &target, target_snapshot); + if (ret) + goto err; + } } -check_type: + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), @@ -900,13 +915,13 @@ fsck_err: static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) { struct bkey_inode_buf packed; + u32 snapshot; int ret; bch_verbose(c, "checking root directory"); ret = bch2_trans_do(c, NULL, NULL, 0, - __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, - root_inode, 0)); + lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); if (ret && ret != -ENOENT) return ret; @@ -942,6 +957,7 @@ static int check_lostfound(struct bch_fs *c, struct bch_hash_info root_hash_info = bch2_hash_info_init(c, root_inode); u64 inum; + u32 snapshot; int ret; bch_verbose(c, "checking lost+found"); @@ -954,7 +970,7 @@ static int check_lostfound(struct bch_fs *c, } ret = bch2_trans_do(c, NULL, NULL, 0, - __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); + lookup_inode(&trans, inum, lostfound_inode, &snapshot)); if (ret && ret != -ENOENT) return ret; @@ -984,32 +1000,12 @@ create_lostfound: return ret; } -typedef GENRADIX(unsigned long) inode_bitmap; - -static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) -{ - unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); - return w ? 
test_bit(nr & (BITS_PER_LONG - 1), w) : false; -} - -static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) -{ - unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); - - if (!w) - return -ENOMEM; - - *w |= 1UL << (nr & (BITS_PER_LONG - 1)); - return 0; -} - struct pathbuf { size_t nr; size_t size; struct pathbuf_entry { u64 inum; - u64 offset; } *entries; }; @@ -1020,8 +1016,9 @@ static int path_down(struct pathbuf *p, u64 inum) void *n = krealloc(p->entries, new_size * sizeof(p->entries[0]), GFP_KERNEL); - if (!n) + if (!n) { return -ENOMEM; + } p->entries = n; p->size = new_size; @@ -1029,149 +1026,119 @@ static int path_down(struct pathbuf *p, u64 inum) p->entries[p->nr++] = (struct pathbuf_entry) { .inum = inum, - .offset = 0, }; return 0; } -noinline_for_stack -static int check_directory_structure(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode) +static int check_path(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound, + struct pathbuf *p, + struct bch_inode_unpacked *inode) { - inode_bitmap dirs_done; - struct pathbuf path = { 0, 0, NULL }; - struct pathbuf_entry *e; - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_s_c_dirent dirent; - bool had_unreachable; - u64 d_inum; + struct bch_fs *c = trans->c; + u32 snapshot; + size_t i; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + p->nr = 0; - bch_verbose(c, "checking directory structure"); - - /* DFS: */ -restart_dfs: - genradix_init(&dirs_done); - had_unreachable = false; - - ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); - if (ret) { - bch_err(c, "memory allocation failure in inode_bitmap_set()"); - goto err; - } - - ret = path_down(&path, BCACHEFS_ROOT_INO); - if (ret) - goto err; - - while (path.nr) { -next: - e = &path.entries[path.nr - 1]; - - if (e->offset == U64_MAX) - goto up; - - for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(e->inum, e->offset + 1), 0, k, ret) { - if (k.k->p.inode != e->inum) - break; - - e->offset = k.k->p.offset; - - if (k.k->type != KEY_TYPE_dirent) - continue; + while (inode->bi_inum != BCACHEFS_ROOT_INO) { + ret = lockrestart_do(trans, + inode_backpointer_exists(trans, inode)); + if (ret < 0) + break; - dirent = bkey_s_c_to_dirent(k); + if (!ret) { + if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", + inode->bi_inum, + mode_to_type(inode->bi_mode), + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) + ret = reattach_inode(trans, lostfound, inode->bi_inum); + break; + } + ret = 0; - if (dirent.v->d_type != DT_DIR) - continue; + if (!S_ISDIR(inode->bi_mode)) + break; - d_inum = le64_to_cpu(dirent.v->d_inum); + ret = path_down(p, inode->bi_inum); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; + } - if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, - "directory %llu has multiple hardlinks", - d_inum)) { - ret = remove_dirent(&trans, dirent); - if (ret) - goto err; + for (i = 0; i < p->nr; i++) { + if (inode->bi_dir != p->entries[i].inum) continue; - } - ret = inode_bitmap_set(&dirs_done, d_inum); - if (ret) { - bch_err(c, "memory allocation failure in inode_bitmap_set()"); - goto err; - } + /* XXX print path */ + if (!fsck_err(c, "directory structure loop")) + return 0; - ret = path_down(&path, d_inum); + ret = lockrestart_do(trans, + remove_backpointer(trans, inode)); if (ret) { - goto err; + bch_err(c, "error removing dirent: %i", ret); + break; } - ret = bch2_trans_iter_free(&trans, iter); - if (ret) { - 
bch_err(c, "btree error %i in fsck", ret); - goto err; - } - goto next; + ret = reattach_inode(trans, lostfound, inode->bi_inum); + break; } - ret = bch2_trans_iter_free(&trans, iter) ?: ret; + + ret = lockrestart_do(trans, + lookup_inode(trans, inode->bi_dir, inode, &snapshot)); if (ret) { - bch_err(c, "btree error %i in fsck", ret); - goto err; + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; } -up: - path.nr--; } +fsck_err: + if (ret) + bch_err(c, "%s: err %i", __func__, ret); + return ret; +} - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0); -retry: - for_each_btree_key_continue(iter, 0, k, ret) { - if (k.k->type != KEY_TYPE_inode) - continue; +/* + * Check for unreachable inodes, as well as loops in the directory structure: + * After check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ +static int check_directory_structure(struct bch_fs *c, + struct bch_inode_unpacked *lostfound) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + struct pathbuf path = { 0, 0, NULL }; + int ret; - if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) - continue; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = bch2_empty_dir_trans(&trans, k.k->p.inode); - if (ret == -EINTR) - goto retry; - if (!ret) + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) continue; - if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, - "unreachable directory found (inum %llu)", - k.k->p.offset)) { - ret = reattach_inode(&trans, lostfound_inode, k.k->p.offset); - if (ret) { - goto err; - } - - had_unreachable = true; + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + if (ret) { + /* Should have been caught earlier in fsck: */ + bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); + break; } - } - bch2_trans_iter_free(&trans, iter); - if (ret) - goto err; - if (had_unreachable) { - bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - genradix_free(&dirs_done); - kfree(path.entries); - memset(&dirs_done, 0, sizeof(dirs_done)); - memset(&path, 0, sizeof(path)); - goto restart_dfs; + ret = check_path(&trans, lostfound, &path, &u); + if (ret) + break; } -err: -fsck_err: - ret = bch2_trans_exit(&trans) ?: ret; - genradix_free(&dirs_done); - kfree(path.entries); - return ret; + bch2_trans_iter_put(&trans, iter); + + BUG_ON(ret == -EINTR); + + return bch2_trans_exit(&trans) ?: ret; } struct nlink { @@ -1254,6 +1221,11 @@ static int check_inode_nlink(struct btree_trans *trans, if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) return 0; + if (!nlink) { + bch_err(c, "no links found to inode %llu", inode.k->p.offset); + return -EINVAL; + } + ret = bch2_inode_unpack(inode, &u); /* Should never happen, checked by bch2_inode_invalid: */ @@ -1262,34 +1234,16 @@ static int check_inode_nlink(struct btree_trans *trans, inode.k->p.inode)) return ret; - /* Improved directory structure pass will catch this: */ - if (fsck_err_on(!nlink, c, - "unreachable inode %llu not marked as unlinked (type %u)", - u.bi_inum, mode_to_type(u.bi_mode))) { - ret = reattach_inode(trans, lostfound_inode, u.bi_inum); - if (ret) - return ret; - - nlink = 1; - } - if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", u.bi_inum, mode_to_type(u.bi_mode), bch2_inode_nlink_get(&u), nlink)) { - 
struct bkey_inode_buf p; - - if (nlink > 1) - u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; - bch2_inode_nlink_set(&u, nlink); - bch2_inode_pack(c, &p, &u); - p.inode.k.p = iter->pos; ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + bch2_inode_write(trans, iter, &u)); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e650c2a0d7d7..66b0bc01c75e 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -307,7 +307,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, if (ret) goto err; - ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; if (ret) goto err; @@ -637,39 +637,18 @@ err: return ret; } -int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - unsigned flags) +static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) { struct btree_iter *iter; - struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, - POS(0, inode_nr), flags); - k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED - ? bch2_btree_iter_peek_cached(iter) - : bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - ret = k.k->type == KEY_TYPE_inode - ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) - : -ENOENT; -err: + iter = bch2_inode_peek(trans, inode, inode_nr, 0); + ret = PTR_ERR_OR_ZERO(iter); bch2_trans_iter_put(trans, iter); return ret; } -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - return __bch2_inode_find_by_inum_trans(trans, inode_nr, - inode, BTREE_ITER_CACHED); - -} - int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 23c322d9a85b..558d5464095d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -74,10 +74,6 @@ struct btree_iter *bch2_inode_create(struct btree_trans *, int bch2_inode_rm(struct bch_fs *, u64, bool); -int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, - struct bch_inode_unpacked *, unsigned); -int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, - struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -- cgit From 423300e8fe41e7613a8edba59bca85eb7517731d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 10:26:59 -0400 Subject: bcachefs: BCH_BEATURE_atomic_nlink is obsolete Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 012a08574022..9991a4f67163 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1005,13 +1005,6 @@ int bch2_fs_recovery(struct bch_fs *c) } - if (!c->sb.clean && - !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { - bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required"); - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } - if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; @@ -1247,8 
+1240,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + !test_bit(BCH_FS_ERROR, &c->flags) && + BCH_SB_HAS_ERRORS(c->disk_sb.sb)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); write_sb = true; } -- cgit From 319c1305071c3ac813c3ea9eed15d1a0d26f983a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 10:30:58 -0400 Subject: bcachefs: Fix heap overrun in bch2_fs_usage_read() XXX squash oops Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7093737a02f3..4791f4896d6b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -224,7 +224,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, v, u64s = fs_usage_u64s(c); + unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1; retry: ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) @@ -232,7 +232,7 @@ retry: percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c); + v = fs_usage_u64s(c) + 1; if (unlikely(u64s != v)) { u64s = v; percpu_up_read(&c->mark_lock); -- cgit From ac516d0e7db76bc77dea9313570b3924e0605d7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 15:00:40 -0400 Subject: bcachefs: Add the status of bucket gen gc to sysfs Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/btree_gc.c | 7 +++++++ fs/bcachefs/sysfs.c | 14 ++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c5ff142871c7..4dff12fd7cc1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -719,6 +719,9 @@ struct bch_fs { atomic_t kick_gc; unsigned long gc_count; + enum btree_id gc_gens_btree; + struct bpos gc_gens_pos; + /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. 
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bf40efcaa192..9f47db77c66b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1193,6 +1193,8 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { + c->gc_gens_pos = iter->pos; + if (gc_btree_gens_key(c, k)) { bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); @@ -1244,6 +1246,8 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; ret = bch2_gc_btree_gens(c, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); @@ -1260,6 +1264,9 @@ int bch2_gc_gens(struct bch_fs *c) up_read(&ca->bucket_lock); } + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + c->gc_count++; err: up_read(&c->gc_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index dd9b54e0d80b..077f3a8cead7 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -136,6 +136,7 @@ write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); +rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); @@ -312,6 +313,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c return 0; } +void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + bch2_bpos_to_text(out, c->gc_gens_pos); + pr_buf(out, "\n"); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -337,6 +345,11 @@ SHOW(bch2_fs) sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + if (attr == &sysfs_gc_gens_pos) { + bch2_gc_gens_pos_to_text(&out, c); + return out.pos - buf; + } + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); @@ -566,6 +579,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, + &sysfs_gc_gens_pos, &sysfs_prune_cache, &sysfs_copy_gc_enabled, -- cgit From e949fbbba0dce7cd80f7c2e932c289d3717c9759 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Apr 2021 15:10:39 -0400 Subject: bcachefs: Ensure bucket gen gc completes We don't want it to block, if it can't allocate it should just continue instead of possibly deadlocking. 
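Concretely, the hunk below records the commit result in a separate commit_err instead of ret, issues the commit with BTREE_INSERT_NOWAIT so it returns an error rather than waiting on an allocation, clears -EINTR so a transaction restart doesn't permanently disable updates, and skips further updates once a commit has genuinely failed - the gen-updating walk itself always runs to the end of each btree.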
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9f47db77c66b..c14794cf1be8 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1181,7 +1181,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) struct btree_iter *iter; struct bkey_s_c k; struct bkey_buf sk; - int ret = 0; + int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); @@ -1195,18 +1195,18 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) !(ret = bkey_err(k))) { c->gc_gens_pos = iter->pos; - if (gc_btree_gens_key(c, k)) { + if (gc_btree_gens_key(c, k) && !commit_err) { bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_trans_update(&trans, iter, sk.k, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - if (ret == -EINTR) + commit_err = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { + commit_err = 0; continue; - if (ret) { - break; } } -- cgit From 4aac975b6c9100cb08da4645291a262d970c1922 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 12:10:17 -0400 Subject: bcachefs: Add a perf test for multiple updates per commit Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 3de48c593963..bb4756566377 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -497,6 +497,42 @@ static int rand_insert(struct bch_fs *c, u64 nr) return ret; } +static int rand_insert_multi(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct bkey_i_cookie k[8]; + int ret = 0; + unsigned j; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (j = 0; j < ARRAY_SIZE(k); j++) { + bkey_cookie_init(&k[j].k_i); + k[j].k.p.offset = test_rand(); + k[j].k.p.snapshot = U32_MAX; + } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { + bch_err(c, "error in rand_insert_multi: %i", ret); + break; + } + } + + bch2_trans_exit(&trans); + return ret; +} + static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; @@ -765,6 +801,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, if (!strcmp(testname, #_test)) j.fn = _test perf_test(rand_insert); + perf_test(rand_insert_multi); perf_test(rand_lookup); perf_test(rand_mixed); perf_test(rand_delete); -- cgit From d44a6e350ed28c00e00f5d8d5882682275dc0945 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 12:17:41 -0400 Subject: bcachefs: Drop old style btree node coalescing We have foreground btree node merging now, and any future btree node merging improvements are going to be based off of that code. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 340 ------------------------------------------------- fs/bcachefs/btree_gc.h | 2 - fs/bcachefs/sysfs.c | 5 - fs/bcachefs/trace.h | 37 ------ 4 files changed, 384 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c14794cf1be8..b61d27de5cd7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1273,346 +1273,6 @@ err: return ret; } -/* Btree coalescing */ - -static void recalc_packed_keys(struct btree *b) -{ - struct bset *i = btree_bset_first(b); - struct bkey_packed *k; - - memset(&b->nr, 0, sizeof(b->nr)); - - BUG_ON(b->nsets != 1); - - vstruct_for_each(i, k) - btree_keys_account_key_add(&b->nr, 0, k); -} - -static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, - struct btree *old_nodes[GC_MERGE_NODES]) -{ - struct btree *parent = btree_node_parent(iter, old_nodes[0]); - unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; - unsigned blocks = btree_blocks(c) * 2 / 3; - struct btree *new_nodes[GC_MERGE_NODES]; - struct btree_update *as; - struct keylist keylist; - struct bkey_format_state format_state; - struct bkey_format new_format; - - memset(new_nodes, 0, sizeof(new_nodes)); - bch2_keylist_init(&keylist, NULL); - - /* Count keys that are not deleted */ - for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) - u64s += old_nodes[i]->nr.live_u64s; - - nr_old_nodes = nr_new_nodes = i; - - /* Check if all keys in @old_nodes could fit in one fewer node */ - if (nr_old_nodes <= 1 || - __vstruct_blocks(struct btree_node, c->block_bits, - DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) - return; - - /* Find a format that all keys in @old_nodes can pack into */ - bch2_bkey_format_init(&format_state); - - /* - * XXX: this won't correctly take it account the new min/max keys: - */ - for (i = 0; i < nr_old_nodes; i++) - __bch2_btree_calc_format(&format_state, old_nodes[i]); - - new_format = bch2_bkey_format_done(&format_state); - - /* Check if repacking would make any nodes too big to fit */ - for (i = 0; i < nr_old_nodes; i++) - if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS); - return; - } - - if (bch2_keylist_realloc(&keylist, NULL, 0, - BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); - return; - } - - as = bch2_btree_update_start(iter, old_nodes[0]->c.level, - btree_update_reserve_required(c, parent) + nr_old_nodes, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (IS_ERR(as)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_RESERVE_GET); - bch2_keylist_free(&keylist, NULL); - return; - } - - trace_btree_gc_coalesce(c, old_nodes[0]); - - for (i = 0; i < nr_old_nodes; i++) - bch2_btree_interior_update_will_free_node(as, old_nodes[i]); - - /* Repack everything with @new_format and sort down to one bset */ - for (i = 0; i < nr_old_nodes; i++) - new_nodes[i] = - __bch2_btree_node_alloc_replacement(as, old_nodes[i], - new_format); - - /* - * Conceptually we concatenate the nodes together and slice them - * up at different boundaries. 
- */ - for (i = nr_new_nodes - 1; i > 0; --i) { - struct btree *n1 = new_nodes[i]; - struct btree *n2 = new_nodes[i - 1]; - - struct bset *s1 = btree_bset_first(n1); - struct bset *s2 = btree_bset_first(n2); - struct bkey_packed *k, *last = NULL; - - /* Calculate how many keys from @n2 we could fit inside @n1 */ - u64s = 0; - - for (k = s2->start; - k < vstruct_last(s2) && - vstruct_blocks_plus(n1->data, c->block_bits, - u64s + k->u64s) <= blocks; - k = bkey_next(k)) { - last = k; - u64s += k->u64s; - } - - if (u64s == le16_to_cpu(s2->u64s)) { - /* n2 fits entirely in n1 */ - n1->key.k.p = n1->data->max_key = n2->data->max_key; - - memcpy_u64s(vstruct_last(s1), - s2->start, - le16_to_cpu(s2->u64s)); - le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); - - set_btree_bset_end(n1, n1->set); - - six_unlock_write(&n2->c.lock); - bch2_btree_node_free_never_inserted(c, n2); - six_unlock_intent(&n2->c.lock); - - memmove(new_nodes + i - 1, - new_nodes + i, - sizeof(new_nodes[0]) * (nr_new_nodes - i)); - new_nodes[--nr_new_nodes] = NULL; - } else if (u64s) { - /* move part of n2 into n1 */ - n1->key.k.p = n1->data->max_key = - bkey_unpack_pos(n1, last); - - n2->data->min_key = bpos_successor(n1->data->max_key); - - memcpy_u64s(vstruct_last(s1), - s2->start, u64s); - le16_add_cpu(&s1->u64s, u64s); - - memmove(s2->start, - vstruct_idx(s2, u64s), - (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); - s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); - - set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - } - } - - for (i = 0; i < nr_new_nodes; i++) { - struct btree *n = new_nodes[i]; - - recalc_packed_keys(n); - btree_node_reset_sib_u64s(n); - - bch2_btree_build_aux_trees(n); - - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - bch2_btree_node_write(c, n, SIX_LOCK_intent); - } - - /* - * The keys for the old nodes get deleted. We don't want to insert keys - * that compare equal to the keys for the new nodes we'll also be - * inserting - we can't because keys on a keylist must be strictly - * greater than the previous keys, and we also don't need to since the - * key for the new node will serve the same purpose (overwriting the key - * for the old node). 
- */ - for (i = 0; i < nr_old_nodes; i++) { - struct bkey_i delete; - unsigned j; - - for (j = 0; j < nr_new_nodes; j++) - if (!bpos_cmp(old_nodes[i]->key.k.p, - new_nodes[j]->key.k.p)) - goto next; - - bkey_init(&delete.k); - delete.k.p = old_nodes[i]->key.k.p; - bch2_keylist_add_in_order(&keylist, &delete); -next: - i = i; - } - - /* - * Keys for the new nodes get inserted: bch2_btree_insert_keys() only - * does the lookup once and thus expects the keys to be in sorted order - * so we have to make sure the new keys are correctly ordered with - * respect to the deleted keys added in the previous loop - */ - for (i = 0; i < nr_new_nodes; i++) - bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); - - /* Insert the newly coalesced nodes */ - bch2_btree_insert_node(as, parent, iter, &keylist, 0); - - BUG_ON(!bch2_keylist_empty(&keylist)); - - BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); - - bch2_btree_iter_node_replace(iter, new_nodes[0]); - - for (i = 0; i < nr_new_nodes; i++) - bch2_btree_update_get_open_buckets(as, new_nodes[i]); - - /* Free the old nodes and update our sliding window */ - for (i = 0; i < nr_old_nodes; i++) { - bch2_btree_node_free_inmem(c, old_nodes[i], iter); - - /* - * the index update might have triggered a split, in which case - * the nodes we coalesced - the new nodes we just created - - * might not be sibling nodes anymore - don't add them to the - * sliding window (except the first): - */ - if (!i) { - old_nodes[i] = new_nodes[i]; - } else { - old_nodes[i] = NULL; - } - } - - for (i = 0; i < nr_new_nodes; i++) - six_unlock_intent(&new_nodes[i]->c.lock); - - bch2_btree_update_done(as); - bch2_keylist_free(&keylist, NULL); -} - -static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; - bool kthread = (current->flags & PF_KTHREAD) != 0; - unsigned i; - int ret = 0; - - /* Sliding window of adjacent btree nodes */ - struct btree *merge[GC_MERGE_NODES]; - u32 lock_seq[GC_MERGE_NODES]; - - bch2_trans_init(&trans, c, 0, 0); - - /* - * XXX: We don't have a good way of positively matching on sibling nodes - * that have the same parent - this code works by handling the cases - * where they might not have the same parent, and is thus fragile. Ugh. - * - * Perhaps redo this to use multiple linked iterators? 
- */ - memset(merge, 0, sizeof(merge)); - - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, - BTREE_MAX_DEPTH, 0, - BTREE_ITER_PREFETCH, b) { - memmove(merge + 1, merge, - sizeof(merge) - sizeof(merge[0])); - memmove(lock_seq + 1, lock_seq, - sizeof(lock_seq) - sizeof(lock_seq[0])); - - merge[0] = b; - - for (i = 1; i < GC_MERGE_NODES; i++) { - if (!merge[i] || - !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) - break; - - if (merge[i]->c.level != merge[0]->c.level) { - six_unlock_intent(&merge[i]->c.lock); - break; - } - } - memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - - bch2_coalesce_nodes(c, iter, merge); - - for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { - lock_seq[i] = merge[i]->c.lock.state.seq; - six_unlock_intent(&merge[i]->c.lock); - } - - lock_seq[0] = merge[0]->c.lock.state.seq; - - if (kthread && kthread_should_stop()) { - ret = -ESHUTDOWN; - break; - } - - bch2_trans_cond_resched(&trans); - - /* - * If the parent node wasn't relocked, it might have been split - * and the nodes in our sliding window might not have the same - * parent anymore - blow away the sliding window: - */ - if (btree_iter_node(iter, iter->level + 1) && - !btree_node_intent_locked(iter, iter->level + 1)) - memset(merge + 1, 0, - (GC_MERGE_NODES - 1) * sizeof(merge[0])); - } - bch2_trans_iter_put(&trans, iter); - - return bch2_trans_exit(&trans) ?: ret; -} - -/** - * bch_coalesce - coalesce adjacent nodes with low occupancy - */ -void bch2_coalesce(struct bch_fs *c) -{ - enum btree_id id; - - down_read(&c->gc_lock); - trace_gc_coalesce_start(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - int ret = c->btree_roots[id].b - ? bch2_coalesce_btree(c, id) - : 0; - - if (ret) { - if (ret != -ESHUTDOWN) - bch_err(c, "btree coalescing failed: %d", ret); - return; - } - } - - trace_gc_coalesce_end(c); - up_read(&c->gc_lock); -} - static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 44b7d121610f..868723a30b15 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -4,8 +4,6 @@ #include "btree_types.h" -void bch2_coalesce(struct bch_fs *); - int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 077f3a8cead7..21ef7719cf55 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -132,7 +132,6 @@ do { \ } while (0) write_attribute(trigger_journal_flush); -write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -478,9 +477,6 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_journal_flush) bch2_journal_meta(&c->journal); - if (attr == &sysfs_trigger_btree_coalesce) - bch2_coalesce(c); - if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -577,7 +573,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_extent_migrate_raced, &sysfs_trigger_journal_flush, - &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, &sysfs_gc_gens_pos, &sysfs_prune_cache, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 387c1c49f696..493f9223c5bd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -353,28 +353,6 @@ DEFINE_EVENT(btree_node, btree_set_root, /* Garbage collection */ -DEFINE_EVENT(btree_node, btree_gc_coalesce, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -TRACE_EVENT(btree_gc_coalesce_fail, - TP_PROTO(struct bch_fs *c, 
int reason), - TP_ARGS(c, reason), - - TP_STRUCT__entry( - __field(u8, reason ) - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - __entry->reason = reason; - memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); - ), - - TP_printk("%pU: %u", __entry->uuid, __entry->reason) -); - DEFINE_EVENT(btree_node, btree_gc_rewrite_node, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -395,16 +373,6 @@ DEFINE_EVENT(bch_fs, gc_end, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, gc_coalesce_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_coalesce_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) @@ -453,11 +421,6 @@ TRACE_EVENT(invalidate, MINOR(__entry->dev), __entry->offset) ); -DEFINE_EVENT(bch_fs, rescale_prios, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), TP_ARGS(ca, reserve), -- cgit From a0c9cc1727ecf302ab0be07e93d0c023e64be8da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 13:29:34 -0400 Subject: bcachefs: Better iterator picking Avoid cloning iterators if we don't have to. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 033a079fb3f3..7d8b7d765cf7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2017,10 +2017,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, if (iter->btree_id != btree_id) continue; - if (best && - bkey_cmp(bpos_diff(best->real_pos, pos), - bpos_diff(iter->real_pos, pos)) < 0) - continue; + if (best) { + int cmp = bkey_cmp(bpos_diff(best->real_pos, pos), + bpos_diff(iter->real_pos, pos)); + + if (cmp < 0 || + ((cmp == 0 && btree_iter_keep(trans, iter)))) + continue; + } best = iter; } -- cgit From 5e427571c59c26b04f483b305cf0e63498f21601 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 17:45:31 -0400 Subject: bcachefs: Don't call bch2_btree_iter_traverse() unnecessarily If we let bch2_trans_commit() do it, it'll traverse iterators in sorted order which means we'll get fewer lock restarts. 
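As an illustrative sketch only (not part of this patch): the caller-side pattern this change enables is to queue the update and let commit handle traversal. The function names below are the ones used elsewhere in this series; the NULL/0 commit arguments are placeholders.

    iter = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_INTENT);
    ret  = bch2_trans_update(trans, iter, k, 0) ?:
           bch2_trans_commit(trans, NULL, NULL, 0);
    bch2_trans_iter_put(trans, iter);

Because bch2_trans_commit() traverses every iterator in the transaction in sorted order, dropping the explicit bch2_btree_iter_traverse() call removes a redundant traversal and avoids taking locks in whatever order individual callers happen to use.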
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c5dab99b0cfc..9a747b420180 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1153,8 +1153,7 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, 0); + ret = bch2_trans_update(trans, iter, k, 0); bch2_trans_iter_put(trans, iter); return ret; } -- cgit From 1b9374adecea64244b0c67aa27e032feae1f1bca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 20:22:10 -0400 Subject: bcachefs: Fix bch2_gc_done() error messages Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b61d27de5cd7..068cfbb2c489 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -828,7 +828,7 @@ static int bch2_gc_done(struct bch_fs *c, if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (verify) \ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", i, b, \ + ": got %u, should be %u", dev, b, \ dst->b[b].mark.gen, \ bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ @@ -836,7 +836,7 @@ static int bch2_gc_done(struct bch_fs *c, set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ - copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) -- cgit From 0ef107859bc868f783cbbbf055a907c702896661 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 22:15:55 -0400 Subject: bcachefs: Fix journal_reclaim_wait_done() Can't run arbitrary code inside a wait_event() conditional, due to task state being weird... 
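For background, a rough sketch of what wait_event() expands to - heavily simplified, not the exact kernel macro:

    for (;;) {
            prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
            if (condition)      /* evaluated with the task already marked as sleeping */
                    break;
            schedule();
    }
    finish_wait(&wq, &wait);

The condition expression runs after the task state has been set, so doing real work there - taking the reclaim mutex and running journal reclaim, as the old journal_reclaim_wait_done() did - can itself sleep and clobber that state. The fix keeps the condition a cheap, side-effect-free check and only kicks the reclaim thread from it.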
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9a747b420180..21a26987f975 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -629,25 +629,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, static int journal_reclaim_wait_done(struct bch_fs *c) { - int ret; - - ret = bch2_journal_error(&c->journal); - if (ret) - return ret; - - ret = !bch2_btree_key_cache_must_wait(c); - if (ret) - return ret; - - journal_reclaim_kick(&c->journal); - - if (mutex_trylock(&c->journal.reclaim_lock)) { - ret = bch2_journal_reclaim(&c->journal); - mutex_unlock(&c->journal.reclaim_lock); - } + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); if (!ret) - ret = !bch2_btree_key_cache_must_wait(c); + journal_reclaim_kick(&c->journal); return ret; } @@ -735,8 +721,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - wait_event(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); if (ret < 0) return ret; -- cgit From 2527dd91580b1eb5ff1f8df1b47817ac60395830 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 13:26:15 -0400 Subject: bcachefs: Improve bch2_btree_iter_traverse_all() By changing it to upgrade iterators to intent locks to avoid lock restarts we can simplify __bch2_btree_node_lock() quite a bit - this fixes a probable bug where it could potentially drop a lock on an unrelated error but still succeed instead of causing a transaction restart. 
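A condensed sketch of the new __btree_iter_traverse_all() flow, using the identifiers from the hunk below (simplified, not verbatim):

    trans_for_each_iter(trans, iter)
            if (!bch2_btree_iter_relock(iter, true))
                    relock_fail = true;

    if (!relock_fail)
            return 0;           /* everything is still locked */

    /* sort iterators into lock order, then propagate locks_want upward so
     * earlier iterators request at least the intent locks of the ones that
     * follow them: */
    bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);

    for (i = nr_sorted - 2; i >= 0; --i)
            if (iter1->btree_id == iter2->btree_id &&
                iter1->locks_want < iter2->locks_want)
                    __bch2_btree_iter_upgrade(iter1, iter2->locks_want);

    bch2_trans_unlock(trans);
    /* ...then traverse everything again, in sorted order */

Doing the upgrades here, where all iterators are visible and sorted, is what lets __bch2_btree_node_lock() stop trying to upgrade other iterators from inside its deadlock checks.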
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 87 ++++++++++++++++++++---------------------------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/trace.h | 44 ++++++++++++++++++++---- 3 files changed, 76 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7d8b7d765cf7..c7a0ffc2cad5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -268,13 +268,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 1; - } + deadlock_iter = linked; + reason = 1; } if (linked->btree_id != iter->btree_id) { @@ -303,14 +298,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - linked->locks_want = - max(level + 1, max_t(unsigned, - linked->locks_want, - iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 5; - } + deadlock_iter = linked; + reason = 5; } /* Must lock btree nodes in key order: */ @@ -320,26 +309,17 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, deadlock_iter = linked; reason = 7; } - - /* - * Recheck if this is a node we already have locked - since one - * of the get_locks() calls might've successfully - * upgraded/relocked it: - */ - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) >= type) { - six_lock_increment(&b->c.lock, type); - return true; - } } if (unlikely(deadlock_iter)) { trace_trans_restart_would_deadlock(iter->trans->ip, ip, - reason, + trans->in_traverse_all, reason, deadlock_iter->btree_id, btree_iter_type(deadlock_iter), + &deadlock_iter->real_pos, iter->btree_id, - btree_iter_type(iter)); + btree_iter_type(iter), + &pos); return false; } @@ -407,29 +387,11 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) bool __bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) { - struct btree_iter *linked; - EBUG_ON(iter->locks_want >= new_locks_want); iter->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true, true)) - return true; - - /* - * Ancestor nodes must be locked before child nodes, so set locks_want - * on iterators that might lock ancestors before us to avoid getting - * -EINTR later: - */ - trans_for_each_iter(iter->trans, linked) - if (linked != iter && - linked->btree_id == iter->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true, false); - } - - return false; + return btree_iter_get_locks(iter, true, true); } void __bch2_btree_iter_downgrade(struct btree_iter *iter, @@ -1192,7 +1154,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) struct bch_fs *c = trans->c; struct btree_iter *iter; u8 sorted[BTREE_ITER_MAX]; - unsigned i, nr_sorted = 0; + int i, nr_sorted = 0; + bool relock_fail; if (trans->in_traverse_all) return -EINTR; @@ -1200,15 +1163,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) trans->in_traverse_all = true; retry_all: nr_sorted = 0; + relock_fail = false; - trans_for_each_iter(trans, iter) + trans_for_each_iter(trans, iter) { + if (!bch2_btree_iter_relock(iter, true)) + 
relock_fail = true; sorted[nr_sorted++] = iter->idx; + } + + if (!relock_fail) { + trans->in_traverse_all = false; + return 0; + } #define btree_iter_cmp_by_idx(_l, _r) \ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx + + for (i = nr_sorted - 2; i >= 0; --i) { + struct btree_iter *iter1 = trans->iters + sorted[i]; + struct btree_iter *iter2 = trans->iters + sorted[i + 1]; + + if (iter1->btree_id == iter2->btree_id && + iter1->locks_want < iter2->locks_want) + __bch2_btree_iter_upgrade(iter1, iter2->locks_want); + else if (!iter1->locks_want && iter2->locks_want) + __bch2_btree_iter_upgrade(iter1, 1); + } + bch2_trans_unlock(trans); cond_resched(); @@ -1258,6 +1242,8 @@ out: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; + + trace_trans_traverse_all(trans->ip); return ret; } @@ -2210,7 +2196,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) if (!(flags & TRANS_RESET_NOUNLOCK)) bch2_trans_cond_resched(trans); - if (!(flags & TRANS_RESET_NOTRAVERSE)) + if (!(flags & TRANS_RESET_NOTRAVERSE) && + trans->iters_linked) bch2_btree_iter_traverse_all(trans); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 07d9b6d36e51..2f63adb9e420 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l, { return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->pos, r->pos); + bkey_cmp(l->real_pos, r->real_pos); } /* diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 493f9223c5bd..02f2662e7bde 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -561,43 +561,70 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool in_traverse_all, unsigned reason, enum btree_id have_btree_id, unsigned have_iter_type, + struct bpos *have_pos, enum btree_id want_btree_id, - unsigned want_iter_type), - TP_ARGS(trans_ip, caller_ip, reason, - have_btree_id, have_iter_type, - want_btree_id, want_iter_type), + unsigned want_iter_type, + struct bpos *want_pos), + TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, + have_btree_id, have_iter_type, have_pos, + want_btree_id, want_iter_type, want_pos), TP_STRUCT__entry( __field(unsigned long, trans_ip ) __field(unsigned long, caller_ip ) + __field(u8, in_traverse_all ) __field(u8, reason ) __field(u8, have_btree_id ) __field(u8, have_iter_type ) __field(u8, want_btree_id ) __field(u8, want_iter_type ) + + __field(u64, have_pos_inode ) + __field(u64, have_pos_offset ) + __field(u32, have_pos_snapshot) + __field(u32, want_pos_snapshot) + __field(u64, want_pos_inode ) + __field(u64, want_pos_offset ) ), TP_fast_assign( __entry->trans_ip = trans_ip; __entry->caller_ip = caller_ip; + __entry->in_traverse_all = in_traverse_all; __entry->reason = reason; __entry->have_btree_id = have_btree_id; __entry->have_iter_type = have_iter_type; __entry->want_btree_id = want_btree_id; __entry->want_iter_type = want_iter_type; + + __entry->have_pos_inode = have_pos->inode; + __entry->have_pos_offset = have_pos->offset; + __entry->have_pos_snapshot = have_pos->snapshot; + + __entry->want_pos_inode = want_pos->inode; + __entry->want_pos_offset = want_pos->offset; + __entry->want_pos_snapshot = want_pos->snapshot; ), - TP_printk("%pS %pS because %u have 
%u:%u want %u:%u", + TP_printk("%pS %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", (void *) __entry->trans_ip, (void *) __entry->caller_ip, + __entry->in_traverse_all, __entry->reason, __entry->have_btree_id, __entry->have_iter_type, + __entry->have_pos_inode, + __entry->have_pos_offset, + __entry->have_pos_snapshot, __entry->want_btree_id, - __entry->want_iter_type) + __entry->want_iter_type, + __entry->want_pos_inode, + __entry->want_pos_offset, + __entry->want_pos_snapshot) ); TRACE_EVENT(trans_restart_iters_realloced, @@ -689,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_traverse_all, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq), -- cgit From 558509aa0198b9c062ff7b91bf8feb74513e0965 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Apr 2021 12:36:40 -0400 Subject: bcachefs: Don't downgrade iterators in bch2_trans_get_iter() This fixes a livelock with btree node splits. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c7a0ffc2cad5..dabf0e18c1ff 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2038,13 +2038,18 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, iter->snapshot = pos.snapshot; - locks_want = min(locks_want, BTREE_MAX_DEPTH); + /* + * If the iterator has locks_want greater than requested, we explicitly + * do not downgrade it here - on transaction restart because btree node + * split needs to upgrade locks, we might be putting/getting the + * iterator again. Downgrading iterators only happens via an explicit + * bch2_trans_downgrade(). 
+ */ + locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > iter->locks_want) { iter->locks_want = locks_want; btree_iter_get_locks(iter, true, false); - } else if (locks_want < iter->locks_want) { - __bch2_btree_iter_downgrade(iter, locks_want); } while (iter->level < depth) { -- cgit From 73a117d2d8a0d9923648653b6400f534e0038281 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Apr 2021 12:50:09 -0400 Subject: bcachefs: Improve trans_restart_mem_realloced tracepoint Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 27 +++++++++------------------ fs/bcachefs/trace.h | 37 +++++++++++++------------------------ 2 files changed, 22 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index dabf0e18c1ff..f82976aab7d9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2111,11 +2111,14 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, return iter; } -static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - if (size > trans->mem_bytes) { + size_t new_top = trans->mem_top + size; + void *p; + + if (new_top > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(size); + size_t new_bytes = roundup_pow_of_two(new_top); void *new_mem; WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); @@ -2128,29 +2131,17 @@ static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) } if (!new_mem) - return -ENOMEM; + return ERR_PTR(-ENOMEM); trans->mem = new_mem; trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->ip, new_bytes); - return -EINTR; + trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + return ERR_PTR(-EINTR); } } - return 0; -} - -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -{ - void *p; - int ret; - - ret = bch2_trans_preload_mem(trans, trans->mem_top + size); - if (ret) - return ERR_PTR(ret); - p = trans->mem + trans->mem_top; trans->mem_top += size; return p; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 02f2662e7bde..887c0adddf12 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -627,38 +627,27 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot) ); -TRACE_EVENT(trans_restart_iters_realloced, - TP_PROTO(unsigned long ip, unsigned nr), - TP_ARGS(ip, nr), - - TP_STRUCT__entry( - __field(unsigned long, ip ) - __field(unsigned, nr ) - ), - - TP_fast_assign( - __entry->ip = ip; - __entry->nr = nr; - ), - - TP_printk("%pS nr %u", (void *) __entry->ip, __entry->nr) -); - TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(unsigned long ip, unsigned long bytes), - TP_ARGS(ip, bytes), + TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + unsigned long bytes), + TP_ARGS(trans_ip, caller_ip, bytes), TP_STRUCT__entry( - __field(unsigned long, ip ) - __field(unsigned long, bytes ) + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) ), TP_fast_assign( - __entry->ip = ip; - __entry->bytes = bytes; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; ), - TP_printk("%pS bytes %lu", (void *) __entry->ip, __entry->bytes) + TP_printk("%pS %pS bytes %lu", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->bytes) ); DEFINE_EVENT(transaction_restart, 
trans_restart_journal_res_get, -- cgit From d62ab355d7475a0da3267c8376ef436ba92f72c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 20:25:33 -0400 Subject: bcachefs: Fix bch2_trans_mark_dev_sb() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 ++-- fs/bcachefs/buckets.c | 63 ++++++++++++------------------------------ fs/bcachefs/buckets.h | 8 ++---- fs/bcachefs/buckets_types.h | 5 ++++ fs/bcachefs/journal.c | 2 +- fs/bcachefs/recovery.c | 10 ++++--- fs/bcachefs/super.c | 4 +-- 7 files changed, 38 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a8a59140efbe..c115c76b2197 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -254,9 +254,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - pr_buf(out, "gen %u oldest_gen %u data_type %u", - u.gen, u.oldest_gen, u.data_type); -#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); + pr_buf(out, "gen %u oldest_gen %u data_type %s", + u.gen, u.oldest_gen, bch2_data_types[u.data_type]); +#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); BCH_ALLOC_FIELDS_V2() #undef x } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4791f4896d6b..297ff7d3b06e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2024,22 +2024,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, goto out; } - if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" - "while marking %s", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: type], - u.dirty_sectors, sectors, ca->mi.bucket_size, - bch2_data_types[type]); - ret = -EIO; - goto out; - } - - if (u.data_type == type && - u.dirty_sectors == sectors) - goto out; - u.data_type = type; u.dirty_sectors = sectors; @@ -2051,53 +2035,44 @@ out: } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct disk_reservation *res, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors) { - return __bch2_trans_do(trans, res, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, - ca->mi.bucket_size)); - + return __bch2_trans_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct disk_reservation *res, struct bch_dev *ca, u64 start, u64 end, enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors) { - int ret; - do { u64 b = sector_to_bucket(ca, start); unsigned sectors = min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - if (b != *bucket) { - if (*bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, - *bucket, type, *bucket_sectors); - if (ret) - return ret; - } + if (b != *bucket && *bucket_sectors) { + int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, + type, *bucket_sectors); + if (ret) + return ret; - *bucket = b; - *bucket_sectors = 0; + *bucket_sectors = 0; } + *bucket = b; *bucket_sectors += sectors; start += sectors; - } while (!ret && start < end); + } while (start < end); return 0; } static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct disk_reservation *res, - struct bch_dev *ca) + struct 
bch_dev *ca) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 bucket = 0; @@ -2108,14 +2083,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, u64 offset = le64_to_cpu(layout->sb_offset[i]); if (offset == BCH_SB_SECTOR) { - ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, BCH_DATA_sb, &bucket, &bucket_sectors); if (ret) return ret; } - ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << layout->sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors); if (ret) @@ -2123,14 +2098,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, } if (bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ret = bch2_trans_mark_metadata_bucket(trans, ca, bucket, BCH_DATA_sb, bucket_sectors); if (ret) return ret; } for (i = 0; i < ca->journal.nr; i++) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], BCH_DATA_journal, ca->mi.bucket_size); if (ret) @@ -2140,12 +2115,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, - struct disk_reservation *res, - struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_do(c, res, NULL, 0, - __bch2_trans_mark_dev_sb(&trans, res, ca)); + return bch2_trans_do(c, NULL, NULL, 0, + __bch2_trans_mark_dev_sb(&trans, ca)); } /* Disk reservations: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index cd81e6aba1b0..794c426e2198 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -253,11 +253,9 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_mark_metadata_bucket(struct btree_trans *, - struct disk_reservation *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, - struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); /* disk reservations: */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 588b1a72adae..b2de2995c5e7 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -59,6 +59,11 @@ struct bch_dev_usage { struct { u64 buckets; u64 sectors; /* _compressed_ sectors: */ + /* + * XXX + * Why do we have this? Isn't it just buckets * bucket_size - + * sectors? 
+ */ u64 fragmented; } d[BCH_DATA_NR]; }; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index af2f8528ac65..03d52a778074 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -864,7 +864,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c && !new_fs) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_trans_mark_metadata_bucket(&trans, NULL, ca, + bch2_trans_mark_metadata_bucket(&trans, ca, bucket, BCH_DATA_journal, ca->mi.bucket_size)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9991a4f67163..2dc3dee4efc8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1333,10 +1333,12 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ - err = "error writing alloc info"; - ret = bch2_alloc_write(c, 0); - if (ret) - goto err; + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) + goto err; + } bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 385b41f16754..18ad2db9f4bf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1670,7 +1670,7 @@ have_slot: bch2_dev_usage_journal_reserve(c); err = "error marking superblock"; - ret = bch2_trans_mark_dev_sb(c, NULL, ca); + ret = bch2_trans_mark_dev_sb(c, ca); if (ret) goto err_late; @@ -1730,7 +1730,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ca = bch_dev_locked(c, dev_idx); - if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + if (bch2_trans_mark_dev_sb(c, ca)) { err = "bch2_trans_mark_dev_sb() error"; goto err; } -- cgit From 633632ef1be790af24897cfcf165d936ca379b45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Apr 2021 20:23:58 -0400 Subject: bcachefs: Simplify bch2_set_nr_journal_buckets() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 03d52a778074..2724a58ada05 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -786,7 +786,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, * We may be called from the device add path, before the new device has * actually been added to the running filesystem: */ - if (c) + if (!new_fs) spin_lock(&c->journal.lock); memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); @@ -794,17 +794,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (c) + if (!new_fs) spin_unlock(&c->journal.lock); while (ja->nr < nr) { struct open_bucket *ob = NULL; unsigned pos; - long bucket; + long b; if (new_fs) { - bucket = bch2_bucket_alloc_new_fs(ca); - if (bucket < 0) { + b = bch2_bucket_alloc_new_fs(ca); + if (b < 0) { ret = -ENOSPC; goto err; } @@ -818,10 +818,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } - bucket = sector_to_bucket(ca, ob->ptr.offset); - } + b = sector_to_bucket(ca, ob->ptr.offset); - if (c) { percpu_down_read(&c->mark_lock); spin_lock(&c->journal.lock); } @@ -838,9 +836,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; - ja->buckets[pos] = 
bucket; + ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(bucket); + journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -851,28 +849,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - if (!c || new_fs) - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + if (new_fs) { + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); - - if (c) { + } else { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); - } - if (c && !new_fs) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, - bucket, BCH_DATA_journal, + b, BCH_DATA_journal, ca->mi.bucket_size)); - if (!new_fs) bch2_open_bucket_put(c, ob); - if (ret) - goto err; + if (ret) + goto err; + } } err: bch2_sb_resize_journal(&ca->disk_sb, -- cgit From 3e07a7300f0684f7c77485e65a1ae97c7ab2514f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Apr 2021 18:31:58 -0400 Subject: bcachefs: Fix an RCU splat Writepoints are never deallocated so the rcu_read_lock() isn't really needed, but we are doing lockless list traversal. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4834ac798b9e..56b114888e49 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -683,11 +683,14 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; + rcu_read_lock(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - return wp; - - return NULL; + goto out; + wp = NULL; +out: + rcu_read_unlock(); + return wp; } static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -- cgit From 96f399d0eed9a63e706c045407675622f32f5a5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 12:38:14 -0400 Subject: bcachefs: Fix journal reclaim loop When dirty key cache keys were separated from other journal pins, we broke the loop conditional in __bch2_journal_reclaim() - it's supposed to keep looping as long as there's work to do. 
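The simplified shape of the fixed loop, for illustration (not the full hunk):

    do {
            min_nr        = ...;    /* journal pins that must be flushed */
            min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);

            nr_flushed = journal_flush_pins(j, seq_to_flush,
                                            min_nr, min_key_cache);
            ...
    } while ((min_nr || min_key_cache) && !direct);

The old condition, "min_nr && nr_flushed", stopped looping as soon as min_nr reached zero, even while the btree key cache still had dirty keys that needed flushing; the new condition keeps going as long as either source of work remains.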
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0d7fe1f99dbf..e2086c76d104 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -599,7 +599,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; - size_t min_nr, nr_flushed; + size_t min_nr, min_key_cache, nr_flushed; unsigned flags; int ret = 0; @@ -649,9 +649,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); + nr_flushed = journal_flush_pins(j, seq_to_flush, - min_nr, - min(bch2_nr_btree_keys_need_flush(c), 128UL)); + min_nr, min_key_cache); if (direct) j->nr_direct_reclaim += nr_flushed; @@ -661,7 +662,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (nr_flushed) wake_up(&j->reclaim_wait); - } while (min_nr && nr_flushed && !direct); + } while ((min_nr || min_key_cache) && !direct); memalloc_noreclaim_restore(flags); -- cgit From 5e6a668b19614c44819c4b0f7691da92df973384 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 14:29:26 -0400 Subject: bcachefs: Fix transaction restarts due to upgrading of cloned iterators This fixes a regression from 52d86202fd bcachefs: Improve bch2_btree_iter_traverse_all() We want to avoid mucking with other iterators in the btree transaction in operations that are only supposed to be touching individual iterators - that patch was a cleanup to move lock ordering handling to bch2_btree_iter_traverse_all(). But it broke upgrading of cloned iterators. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f82976aab7d9..11f7b47e3e7f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -387,11 +387,44 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) bool __bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) { + struct btree_iter *linked; + EBUG_ON(iter->locks_want >= new_locks_want); iter->locks_want = new_locks_want; - return btree_iter_get_locks(iter, true, true); + if (btree_iter_get_locks(iter, true, true)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_iter_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_iter_traverse_all(). 
+ */ + trans_for_each_iter(iter->trans, linked) + if (linked != iter && + btree_iter_type(linked) == btree_iter_type(iter) && + linked->btree_id == iter->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_iter_get_locks(linked, true, false); + } + + return false; } void __bch2_btree_iter_downgrade(struct btree_iter *iter, -- cgit From ae8bbb9fac2c74ce3132adae7b059d1cb8535039 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 14:48:51 -0400 Subject: bcachefs: Simplify fsck remove_dirent() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 52 +++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fa1922cb5c87..e6036d36e0f9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -81,51 +81,37 @@ static int write_inode(struct btree_trans *trans, return ret; } -static int __remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct qstr name; + struct btree_iter *iter; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; - u64 dir_inum = dirent.k->p.inode; int ret; - char *buf; - - name.len = bch2_dirent_name_bytes(dirent); - buf = bch2_trans_kmalloc(trans, name.len + 1); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - memcpy(buf, dirent.v->d_name, name.len); - buf[name.len] = '\0'; - name.name = buf; - ret = lookup_inode(trans, dir_inum, &dir_inode, NULL); - if (ret && ret != -EINTR) - bch_err(c, "remove_dirent: err %i looking up directory inode", ret); + ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); if (ret) return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, - &dir_hash_info, dir_inum, &name); - if (ret && ret != -EINTR) - bch_err(c, "remove_dirent: err %i deleting dirent", ret); - if (ret) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); - return 0; + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, iter); + bch2_trans_iter_put(trans, iter); + return ret; } -static int remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int remove_dirent(struct btree_trans *trans, struct bpos pos) { - return __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __remove_dirent(trans, dirent)); + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __remove_dirent(trans, pos)); + if (ret) + bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); + return ret; } static int __reattach_inode(struct btree_trans *trans, @@ -202,7 +188,7 @@ static int remove_backpointer(struct btree_trans *trans, goto out; } - ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); + ret = remove_dirent(trans, k.k->p); out: bch2_trans_iter_put(trans, iter); return ret; @@ -752,7 +738,7 @@ retry: "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(&trans, d); + ret = remove_dirent(&trans, d.k->p); if (ret) goto err; goto next; @@ -783,7 +769,7 @@ retry: backpointer_exists, c, "directory %llu with multiple links", target.bi_inum)) { - ret = remove_dirent(&trans, d); + ret = remove_dirent(&trans, d.k->p); if (ret) goto err; continue; -- cgit From 
f24fab9cbad1838fc4356e9e8b861a19cf055c3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 17:26:25 -0400 Subject: bcachefs: Fix some small memory leaks Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 ++ fs/bcachefs/replicas.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e6036d36e0f9..4a48ef5d1bfb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1124,6 +1124,8 @@ static int check_directory_structure(struct bch_fs *c, BUG_ON(ret == -EINTR); + kfree(path.entries); + return bch2_trans_exit(&trans) ?: ret; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 81aba8caab9e..5a8b0a7b7197 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -320,8 +320,8 @@ static int replicas_table_update(struct bch_fs *c, out: free_percpu(new_gc); kfree(new_scratch); - free_percpu(new_usage[1]); - free_percpu(new_usage[0]); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + free_percpu(new_usage[i]); kfree(new_base); return ret; err: -- cgit From f02810a1a4cdcf643f6149396769e0fb38a94a78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 17:34:53 -0400 Subject: bcachefs: Fix an unused var warning in userspace Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4a48ef5d1bfb..cfe606342032 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -159,13 +159,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *lostfound, u64 inum) { - struct bch_fs *c = trans->c; - int ret; - - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, __reattach_inode(trans, lostfound, inum)); if (ret) - bch_err(c, "error %i reattaching inode %llu", ret, inum); + bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); return ret; } -- cgit From 694015c2b1cf114577e4526c0aa370d0ed0a10e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 20:35:20 -0400 Subject: bcachefs: Refactor bchfs_fallocate() to not nest btree_trans on stack Upcoming patch is going to disallow multiple btree_trans on the stack. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 103 ++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8891207c46a9..a7cd85647354 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2633,54 +2633,21 @@ err: return ret; } -static long bchfs_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) +static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + u64 start_sector, u64 end_sector) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; struct btree_iter *iter; - struct bpos end_pos; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = round_up(end, block_bytes(c)); - unsigned sectors; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; - int ret; + int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, end); - if (ret) - goto err; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); - - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); - - if (unlikely(ret)) - goto err; - - truncate_pagecache_range(&inode->v, offset, end - 1); - } - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode->v.i_ino, block_start >> 9), + POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - end_pos = POS(inode->v.i_ino, block_end >> 9); while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { s64 i_sectors_delta = 0; @@ -2688,6 +2655,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; + unsigned sectors; bch2_trans_begin(&trans); @@ -2748,7 +2716,48 @@ bkey_err: ret = 0; } bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; +} +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->v.i_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + loff_t end = offset + len; + loff_t block_start = round_down(offset, block_bytes(c)); + loff_t block_end = round_up(end, block_bytes(c)); + int ret; + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) + goto err; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = __bch2_truncate_page(inode, + offset >> PAGE_SHIFT, + offset, end); + + if (!ret && + offset >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + offset, end); + + if (unlikely(ret)) + goto err; + + truncate_pagecache_range(&inode->v, offset, end - 1); + } + + ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); if (ret) goto err; @@ -2762,28 +2771,13 @@ bkey_err: if (end >= inode->v.i_size && (!(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_ZERO_RANGE))) { - struct 
btree_iter *inode_iter; - struct bch_inode_unpacked inode_u; - - do { - bch2_trans_begin(&trans); - inode_iter = bch2_inode_peek(&trans, &inode_u, - inode->v.i_ino, 0); - ret = PTR_ERR_OR_ZERO(inode_iter); - } while (ret == -EINTR); - - bch2_trans_iter_put(&trans, inode_iter); - bch2_trans_unlock(&trans); - - if (ret) - goto err; /* * Sync existing appends before extending i_size, * as in bch2_extend(): */ ret = filemap_write_and_wait_range(mapping, - inode_u.bi_size, S64_MAX); + inode->ei_inode.bi_size, S64_MAX); if (ret) goto err; @@ -2797,7 +2791,6 @@ bkey_err: mutex_unlock(&inode->ei_update_lock); } err: - bch2_trans_exit(&trans); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; -- cgit From dac1525d9c0d6e69da561dbc2becdcd32230b907 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 21:34:00 -0400 Subject: bcachefs: gc shouldn't care about owned_by_allocator The owned_by_allocator field is a purely in memory thing, even if/when we bring back GC at runtime there's no need for it to be recalculating this field. This is prep work for pulling it out of struct bucket, and eventually getting rid of the bucket array. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++- fs/bcachefs/alloc_foreground.c | 3 +- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 53 +-------------------------- fs/bcachefs/btree_gc.h | 8 ----- fs/bcachefs/buckets.c | 82 +++--------------------------------------- fs/bcachefs/buckets.h | 3 +- 7 files changed, 9 insertions(+), 146 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c115c76b2197..6c1da7873295 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -783,7 +783,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, BUG_ON(m.dirty_sectors); - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + bch2_mark_alloc_bucket(c, ca, b, true); spin_lock(&c->freelist_lock); verify_not_on_freelist(c, ca, b); @@ -880,8 +880,7 @@ out: percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); - bch2_mark_alloc_bucket(c, ca, b, false, - gc_pos_alloc(c, NULL), 0); + bch2_mark_alloc_bucket(c, ca, b, false); BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); BUG_ON(b != b2); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 56b114888e49..f2f392eeb54a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -98,8 +98,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), - false, gc_pos_alloc(c, ob), 0); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); ob->valid = false; ob->type = 0; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4dff12fd7cc1..55e40b3acb90 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -379,7 +379,6 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_PENDING_DELETE, - GC_PHASE_ALLOC, }; struct gc_pos { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 068cfbb2c489..a7ffd566d431 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -730,52 +730,6 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) } #endif -static void bch2_mark_allocator_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct open_bucket *ob; - size_t i, j, iter; 
- unsigned ci; - - percpu_down_read(&c->mark_lock); - - spin_lock(&c->freelist_lock); - gc_pos_set(c, gc_pos_alloc(c, NULL)); - - for_each_member_device(ca, c, ci) { - fifo_for_each_entry(i, &ca->free_inc, iter) - bch2_mark_alloc_bucket(c, ca, i, true, - gc_pos_alloc(c, NULL), - BTREE_TRIGGER_GC); - - - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - bch2_mark_alloc_bucket(c, ca, i, true, - gc_pos_alloc(c, NULL), - BTREE_TRIGGER_GC); - } - - spin_unlock(&c->freelist_lock); - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid) { - gc_pos_set(c, gc_pos_alloc(c, ob)); - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, - gc_pos_alloc(c, ob), - BTREE_TRIGGER_GC); - } - spin_unlock(&ob->lock); - } - - percpu_up_read(&c->mark_lock); -} - static void bch2_gc_free(struct bch_fs *c) { struct bch_dev *ca; @@ -880,7 +834,6 @@ static int bch2_gc_done(struct bch_fs *c, for (b = 0; b < src->nbuckets; b++) { copy_bucket_field(gen); copy_bucket_field(data_type); - copy_bucket_field(owned_by_allocator); copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); @@ -1020,10 +973,8 @@ static int bch2_gc_start(struct bch_fs *c, if (metadata_only && (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) { + s->mark.data_type == BCH_DATA_cached)) d->_mark = s->mark; - d->_mark.owned_by_allocator = 0; - } } }; @@ -1079,8 +1030,6 @@ again: #if 0 bch2_mark_pending_btree_node_frees(c); #endif - bch2_mark_allocator_buckets(c); - c->gc_count++; if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 868723a30b15..e9a87394370a 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -90,14 +90,6 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); } -static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) -{ - return (struct gc_pos) { - .phase = GC_PHASE_ALLOC, - .pos = POS(ob ? ob - c->open_buckets : 0, 0), - }; -} - static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 297ff7d3b06e..3f68a30acd5b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -3,64 +3,6 @@ * Code for manipulating bucket marks for garbage collection. * * Copyright 2014 Datera, Inc. 
- * - * Bucket states: - * - free bucket: mark == 0 - * The bucket contains no data and will not be read - * - * - allocator bucket: owned_by_allocator == 1 - * The bucket is on a free list, or it is an open bucket - * - * - cached bucket: owned_by_allocator == 0 && - * dirty_sectors == 0 && - * cached_sectors > 0 - * The bucket contains data but may be safely discarded as there are - * enough replicas of the data on other cache devices, or it has been - * written back to the backing device - * - * - dirty bucket: owned_by_allocator == 0 && - * dirty_sectors > 0 - * The bucket contains data that we must not discard (either only copy, - * or one of the 'main copies' for data requiring multiple replicas) - * - * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 - * This is a btree node, journal or gen/prio bucket - * - * Lifecycle: - * - * bucket invalidated => bucket on freelist => open bucket => - * [dirty bucket =>] cached bucket => bucket invalidated => ... - * - * Note that cache promotion can skip the dirty bucket step, as data - * is copied from a deeper tier to a shallower tier, onto a cached - * bucket. - * Note also that a cached bucket can spontaneously become dirty -- - * see below. - * - * Only a traversal of the key space can determine whether a bucket is - * truly dirty or cached. - * - * Transitions: - * - * - free => allocator: bucket was invalidated - * - cached => allocator: bucket was invalidated - * - * - allocator => dirty: open bucket was filled up - * - allocator => cached: open bucket was filled up - * - allocator => metadata: metadata was allocated - * - * - dirty => cached: dirty sectors were copied to a deeper tier - * - dirty => free: dirty sectors were overwritten or moved (copy gc) - * - cached => free: cached sectors were overwritten - * - * - metadata => free: metadata was freed - * - * Oddities: - * - cached => dirty: a device was removed so formerly replicated data - * is no longer sufficiently replicated - * - free => cached: cannot happen - * - free => dirty: cannot happen - * - free => metadata: cannot happen */ #include "bcachefs.h" @@ -558,33 +500,17 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, ret; \ }) -static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - bool gc) +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator) { - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g = bucket(ca, b); struct bucket_mark old, new; old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = owned_by_allocator; })); - BUG_ON(!gc && - !owned_by_allocator && !old.owned_by_allocator); - - return 0; -} - -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - struct gc_pos pos, unsigned flags) -{ - preempt_disable(); - - do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, - ca, b, owned_by_allocator); - - preempt_enable(); + BUG_ON(owned_by_allocator == old.owned_by_allocator); } static int bch2_mark_alloc(struct bch_fs *c, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 794c426e2198..7463e6420b14 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -235,8 +235,7 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, - size_t, bool, struct gc_pos, unsigned); +void 
bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -- cgit From 6ad060b0eb45d2eaa5411be042bd3b53900f992e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Apr 2021 21:53:23 -0400 Subject: bcachefs: Allocator thread doesn't need gc_lock anymore Even with runtime gc (which currently isn't supported), runtime gc no longer clears/recalculates the main set of bucket marks - it allocates and calculates another set, updating the primary at the end. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6c1da7873295..84a560659413 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -209,7 +209,7 @@ void bch2_alloc_pack(struct bch_fs *c, bch2_alloc_pack_v2(dst, src); } -static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -229,7 +229,7 @@ const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) return "invalid device"; /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) return "incorrect value size"; return NULL; @@ -293,11 +293,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { int ret; - down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, NULL, bch2_alloc_read_fn); - up_read(&c->gc_lock); - if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; @@ -475,10 +472,8 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (available) break; - up_read(&c->gc_lock); schedule(); try_to_freeze(); - down_read(&c->gc_lock); } __set_current_state(TASK_RUNNING); @@ -914,7 +909,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) !fifo_full(&ca->free_inc) && ca->alloc_heap.used) ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, - BTREE_INSERT_GC_LOCK_HELD| (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); @@ -1055,18 +1049,12 @@ static int bch2_allocator_thread(void *arg) if (ret) goto stop; - down_read(&c->gc_lock); - ret = bch2_invalidate_buckets(c, ca); - if (ret) { - up_read(&c->gc_lock); + if (ret) goto stop; - } - if (!fifo_empty(&ca->free_inc)) { - up_read(&c->gc_lock); + if (!fifo_empty(&ca->free_inc)) continue; - } pr_debug("free_inc now empty"); @@ -1104,14 +1092,10 @@ static int bch2_allocator_thread(void *arg) * available so we don't spin: */ ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); + if (ret) goto stop; - } } - up_read(&c->gc_lock); - pr_debug("%zu buckets to invalidate", nr); /* -- cgit From 04903131db3e09a6a074755071c46febc7e2449f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Apr 2021 17:26:34 -0400 Subject: bcachefs: Handle errors in bch2_trans_mark_update() It's not actually the case that iterators are always checked here - __bch2_trans_commit() checks for that after running triggers. 
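In outline, the trigger path now returns the error from the iterator instead of asserting on it; a condensed sketch of the change, taken from the hunk below with the surrounding code trimmed (names as in the patch):

    if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
            old = bch2_btree_iter_peek_slot(iter);
            ret = bkey_err(old);
            if (ret)
                    return ret;     /* previously BUG_ON(bkey_err(old)) */
    }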
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ fs/bcachefs/buckets.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 84a560659413..ab60bf259b0c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -293,8 +293,10 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { int ret; + down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, NULL, bch2_alloc_read_fn); + up_read(&c->gc_lock); if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3f68a30acd5b..b51f311ad60e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1836,10 +1836,11 @@ int bch2_trans_mark_update(struct btree_trans *trans, return 0; if (!btree_node_type_is_extents(iter->btree_id)) { - /* iterators should be uptodate, shouldn't get errors here: */ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { old = bch2_btree_iter_peek_slot(iter); - BUG_ON(bkey_err(old)); + ret = bkey_err(old); + if (ret) + return ret; } else { struct bkey_cached *ck = (void *) iter->l[0].b; -- cgit From 27cc532ef2d7c3bc4687547f59fe5d4a82affab7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Apr 2021 23:18:17 -0400 Subject: bcachefs: Check that keys are in the correct btrees We've started seeing bug reports of pointers to btree nodes being detected in leaf nodes. This should catch that before it's happened, and it's something we should've been checking anyways. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 6fe95b802e13..cf2e054cca2f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -98,12 +98,51 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ops[k.k->type].key_invalid(c, k); } +static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_extents] = + (1U << KEY_TYPE_error)| + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_extent)| + (1U << KEY_TYPE_reservation)| + (1U << KEY_TYPE_reflink_p)| + (1U << KEY_TYPE_inline_data), + [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_dirent), + [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_xattr), + [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_quota), + [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_stripe), + [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_reflink_v)| + (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_btree_ptr)| + (1U << KEY_TYPE_btree_ptr_v2), +}; + const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { + unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| + bch2_key_types_allowed[type] ; + if (k.k->u64s < BKEY_U64s) return "u64s too small"; + if (!(key_types_allowed & (1U << k.k->type))) + return "invalid key type for this btree"; + if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; -- cgit From 
fa272f33bbfc68856efa7aa0f2e33d9fe5982e17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Apr 2021 17:44:35 -0400 Subject: bcachefs: Always check for invalid bkeys in trans commit path We check for this prior to metadata being written, but we're seeing some strange bugs lately, and this will help catch those closer to where they occur. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 21a26987f975..6e26531eb671 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -222,18 +222,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - struct bch_fs *c = trans->c; - - if (bch2_debug_check_bkeys) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - panic("invalid bkey %s on insert: %s\n", buf, invalid); - } - } BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); @@ -592,9 +580,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update2(trans, i) - btree_insert_entry_checks(trans, i); + trans_for_each_update2(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); + bch2_fatal_error(c); + } + btree_insert_entry_checks(trans, i); + } bch2_btree_trans_verify_locks(trans); trans_for_each_update2(trans, i) -- cgit From 89baec780f8b218f5a8bce777b13b6116e416ff6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Apr 2021 20:37:04 -0400 Subject: bcachefs: Allocator refactoring This uses the kthread_wait_freezable() macro to simplify a lot of the allocator thread code, along with cleaning up bch2_invalidate_bucket2(). Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 382 +++++++++++++++-------------------------- fs/bcachefs/alloc_foreground.c | 47 +---- fs/bcachefs/trace.h | 43 +++-- 3 files changed, 161 insertions(+), 311 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ab60bf259b0c..2d532fe4d30b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -441,50 +441,6 @@ out: * commands to the newly free buckets, then puts them on the various freelists. */ -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. 
- */ -static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned long gc_count = c->gc_count; - s64 available; - int ret = 0; - - ca->allocator_state = ALLOCATOR_blocked; - closure_wake_up(&c->freelist_wait); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = 1; - break; - } - - if (gc_count != c->gc_count) - ca->inc_gen_really_needs_gc = 0; - - available = dev_buckets_reclaimable(ca); - available -= ca->inc_gen_really_needs_gc; - - available = max(available, 0LL); - - if (available) - break; - - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - ca->allocator_state = ALLOCATOR_running; - closure_wake_up(&c->freelist_wait); - - return ret; -} - static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, struct bucket_mark m) { @@ -502,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, gc_gen = bucket_gc_gen(bucket(ca, b)); - if (gc_gen >= BUCKET_GC_GEN_MAX / 2) - ca->inc_gen_needs_gc++; - - if (gc_gen >= BUCKET_GC_GEN_MAX) - ca->inc_gen_really_needs_gc++; + ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; + ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; return gc_gen < BUCKET_GC_GEN_MAX; } @@ -583,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) struct bucket_mark m = READ_ONCE(g->mark); unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + cond_resched(); + if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -599,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) .key = key, }; } - - cond_resched(); } if (e.nr) @@ -693,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) size_t i, nr = 0; ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case BCH_CACHE_REPLACEMENT_lru: @@ -714,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } -static inline long next_alloc_bucket(struct bch_dev *ca) -{ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - while (ca->alloc_heap.used) { - if (top->nr) { - size_t b = top->bucket; - - top->bucket++; - top->nr--; - return b; - } - - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - } - - return -1; -} - /* * returns sequence number of most recent journal entry that updated this * bucket: @@ -755,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) } } -static int bch2_invalidate_one_bucket2(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *iter, - u64 *journal_seq, unsigned flags) +static int bucket_invalidate_btree(struct btree_trans *trans, + struct bch_dev *ca, u64 b) { struct bch_fs *c = trans->c; - struct bkey_alloc_buf a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - bool invalidating_cached_data; + struct btree_iter *iter = + bch2_trans_get_iter(trans, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + int ret; + + a = bch2_trans_kmalloc(trans, sizeof(*a)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(iter, g, m); + percpu_up_read(&c->mark_lock); + + u.gen++; + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + 
u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq, unsigned flags) +{ + struct bucket *g; + struct bucket_mark m; size_t b; int ret = 0; @@ -811,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - percpu_down_read(&c->mark_lock); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - u = alloc_mem_to_key(iter, g, m); - - percpu_up_read(&c->mark_lock); - - invalidating_cached_data = u.cached_sectors != 0; - - u.gen++; - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = atomic64_read(&c->io_clock[READ].now); - u.write_time = atomic64_read(&c->io_clock[WRITE].now); - - bch2_alloc_pack(c, &a, u); - bch2_trans_update(trans, iter, &a.k, - BTREE_TRIGGER_BUCKET_INVALIDATE); - - /* - * XXX: - * when using deferred btree updates, we have journal reclaim doing - * btree updates and thus requiring the allocator to make forward - * progress, and here the allocator is requiring space in the journal - - * so we need a journal pre-reservation: - */ - ret = bch2_trans_commit(trans, NULL, - invalidating_cached_data ? journal_seq : NULL, - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - flags); - if (ret == -EINTR) - goto retry; + ret = bch2_trans_do(c, NULL, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + flags, + bucket_invalidate_btree(&trans, ca, b)); out: if (!ret) { /* remove from alloc_heap: */ @@ -894,28 +832,23 @@ out: */ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - struct btree_iter *iter; u64 journal_seq = 0; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) - ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, + ca->alloc_heap.used) { + ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, (!fifo_empty(&ca->free_inc) ? 
BTREE_INSERT_NOWAIT : 0)); - - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); + /* + * We only want to batch up invalidates when they're going to + * require flushing the journal: + */ + if (!journal_seq) + break; + } /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) @@ -935,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) +{ + if (ca->allocator_state != new_state) { + ca->allocator_state = new_state; + closure_wake_up(&ca->fs->freelist_wait); + } +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { unsigned i; int ret = 0; - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (!test_bit(BCH_FS_STARTED, &c->flags) && - i == RESERVE_MOVINGGC) - continue; - - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - - closure_wake_up(&c->freelist_wait); - ca->allocator_state = ALLOCATOR_running; - - spin_unlock(&c->freelist_lock); - goto out; - } - } - - if (ca->allocator_state != ALLOCATOR_blocked_full) { - ca->allocator_state = ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - } - - spin_unlock(&c->freelist_lock); + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) { + /* + * Don't strand buckets on the copygc freelist until + * after recovery is finished: + */ + if (i == RESERVE_MOVINGGC && + !test_bit(BCH_FS_STARTED, &c->flags)) + continue; - if ((current->flags & PF_KTHREAD) && - kthread_should_stop()) { + if (fifo_push(&ca->free[i], b)) { + fifo_pop(&ca->free_inc, b); ret = 1; break; } - - schedule(); - try_to_freeze(); } -out: - __set_current_state(TASK_RUNNING); + spin_unlock(&c->freelist_lock); + + ca->allocator_state = ret + ? ALLOCATOR_running + : ALLOCATOR_blocked_full; + closure_wake_up(&c->freelist_wait); return ret; } -/* - * Pulls buckets off free_inc, discards them (if enabled), then adds them to - * freelists, waiting until there's room if necessary: - */ -static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - while (!fifo_empty(&ca->free_inc)) { - size_t bucket = fifo_peek(&ca->free_inc); - - if (ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO); - - if (push_invalidated_bucket(c, ca, bucket)) - return 1; - } + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), + ca->mi.bucket_size, GFP_NOFS); +} - return 0; +static bool allocator_thread_running(struct bch_dev *ca) +{ + unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) + ? 
ALLOCATOR_running + : ALLOCATOR_stopped; + alloc_thread_set_state(ca, state); + return state == ALLOCATOR_running; } -static inline bool allocator_thread_running(struct bch_dev *ca) +static int buckets_available(struct bch_dev *ca, unsigned long gc_count) { - return ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); + s64 available = dev_buckets_reclaimable(ca) - + (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); + bool ret = available > 0; + + alloc_thread_set_state(ca, ret + ? ALLOCATOR_running + : ALLOCATOR_blocked); + return ret; } /** @@ -1026,56 +948,29 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; + unsigned long gc_count = c->gc_count; size_t nr; int ret; set_freezable(); while (1) { - if (!allocator_thread_running(ca)) { - ca->allocator_state = ALLOCATOR_stopped; - if (kthread_wait_freezable(allocator_thread_running(ca))) - break; - } - - ca->allocator_state = ALLOCATOR_running; - - cond_resched(); - if (kthread_should_stop()) - break; - - pr_debug("discarding %zu invalidated buckets", - fifo_used(&ca->free_inc)); - - ret = discard_invalidated_buckets(c, ca); + ret = kthread_wait_freezable(allocator_thread_running(ca)); if (ret) goto stop; - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; - - if (!fifo_empty(&ca->free_inc)) - continue; - - pr_debug("free_inc now empty"); - - while (1) { + while (!ca->alloc_heap.used) { cond_resched(); - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - pr_debug("scanning for reclaimable buckets"); + ret = kthread_wait_freezable(buckets_available(ca, gc_count)); + if (ret) + goto stop; + gc_count = c->gc_count; nr = find_reclaimable_buckets(c, ca); - pr_debug("found %zu buckets", nr); - - trace_alloc_batch(ca, nr, ca->alloc_heap.size); + trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, + ca->inc_gen_really_needs_gc); if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ca->inc_gen_really_needs_gc) && @@ -1083,33 +978,24 @@ static int bch2_allocator_thread(void *arg) atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } + } - if (nr) - break; + ret = bch2_invalidate_buckets(c, ca); + if (ret) + goto stop; - /* - * If we found any buckets, we have to invalidate them - * before we scan for more - but if we didn't find very - * many we may want to wait on more buckets being - * available so we don't spin: - */ - ret = wait_buckets_available(c, ca); + while (!fifo_empty(&ca->free_inc)) { + u64 b = fifo_peek(&ca->free_inc); + + discard_one_bucket(c, ca, b); + + ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); if (ret) goto stop; } - - pr_debug("%zu buckets to invalidate", nr); - - /* - * alloc_heap is now full of newly-invalidated buckets: next, - * write out the new bucket gens: - */ } - stop: - pr_debug("alloc thread stopping (ret %i)", ret); - ca->allocator_state = ALLOCATOR_stopped; - closure_wake_up(&c->freelist_wait); + alloc_thread_set_state(ca, ALLOCATOR_stopped); return 0; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f2f392eeb54a..6bf4140477a0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1,57 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Primary bucket allocation code - * * Copyright 2012 Google, Inc. 
* - * Allocation in bcache is done in terms of buckets: - * - * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in - * btree pointers - they must match for the pointer to be considered valid. - * - * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a - * bucket simply by incrementing its gen. - * - * The gens (along with the priorities; it's really the gens are important but - * the code is named as if it's the priorities) are written in an arbitrary list - * of buckets on disk, with a pointer to them in the journal header. - * - * When we invalidate a bucket, we have to write its new gen to disk and wait - * for that write to complete before we use it - otherwise after a crash we - * could have pointers that appeared to be good but pointed to data that had - * been overwritten. - * - * Since the gens and priorities are all stored contiguously on disk, we can - * batch this up: We fill up the free_inc list with freshly invalidated buckets, - * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. - * - * free_inc isn't the only freelist - if it was, we'd often have to sleep while - * priorities and gens were being written before we could allocate. c->free is a - * smaller freelist, and buckets on that list are always ready to be used. - * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * - * It's important to ensure that gens don't wrap around - with respect to - * either the oldest gen in the btree or the gen on disk. This is quite - * difficult to do in practice, but we explicitly guard against it anyways - if - * a bucket is in danger of wrapping around we simply skip invalidating it that - * time around, and we garbage collect or rewrite the priorities sooner than we - * would have otherwise. + * Foreground allocator code: allocate buckets from freelist, and allocate in + * sector granularity from writepoints. * * bch2_bucket_alloc() allocates a single bucket from a specific device. * * bch2_bucket_alloc_set() allocates one or more buckets from different devices * in a given filesystem. - * - * invalidate_buckets() drives all the processes described above. It's called - * from bch2_bucket_alloc() and a few other places that need to make sure free - * buckets are ready. - * - * invalidate_buckets_(lru|fifo)() find buckets that are available to be - * invalidated, and then invalidate them and stick them on the free_inc list - - * in either lru or fifo order. 
*/ #include "bcachefs.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 887c0adddf12..c6d98f4c50e7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -380,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, /* Allocator */ -TRACE_EVENT(alloc_batch, - TP_PROTO(struct bch_dev *ca, size_t free, size_t total), - TP_ARGS(ca, free, total), +TRACE_EVENT(alloc_scan, + TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), + TP_ARGS(ca, found, inc_gen, inc_gen_skipped), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(size_t, free ) - __field(size_t, total ) + __field(dev_t, dev ) + __field(u64, found ) + __field(u64, inc_gen ) + __field(u64, inc_gen_skipped ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->free = free; - __entry->total = total; + __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->found = found; + __entry->inc_gen = inc_gen; + __entry->inc_gen_skipped = inc_gen_skipped; ), - TP_printk("%pU free %zu total %zu", - __entry->uuid, __entry->free, __entry->total) + TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) ); TRACE_EVENT(invalidate, @@ -417,8 +420,10 @@ TRACE_EVENT(invalidate, ), TP_printk("invalidated %u sectors at %d,%d sector=%llu", - __entry->sectors, MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->offset) + __entry->sectors, + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->offset) ); DECLARE_EVENT_CLASS(bucket_alloc, @@ -426,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc, TP_ARGS(ca, reserve), TP_STRUCT__entry( - __array(char, uuid, 16) - __field(enum alloc_reserve, reserve ) + __field(dev_t, dev ) + __field(enum alloc_reserve, reserve ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->reserve = reserve; + __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->reserve = reserve; ), - TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) + TP_printk("%d,%d reserve %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, -- cgit From e95d7edfb709cb05cc128a6f4067b2ef05f6a8ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Apr 2021 00:33:05 -0400 Subject: bcachefs: Preallocate trans mem in bch2_migrate_index_update() This will help avoid transaction restarts. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3036db599e7b..1c4ead7bd0bf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -67,7 +67,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_bkey_buf_init(&_insert); bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), -- cgit From 8ce600d44720a86c069fafefc92167ed85cc72c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Apr 2021 17:07:20 -0400 Subject: bcachefs: Fix for btree_gc repairing interior btree ptrs Using the normal transaction commit path to insert and journal updates to interior nodes hadn't been done before this repair code was written, not surprising that there was a bug. 
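The fix is to thread the btree level through to the journal entry instead of hardcoding level 0; roughly, condensed from the two hunks below (names as in the patch):

    trans_for_each_update2(trans, i)
            do_btree_insert_one(trans, i);          /* the entry carries i->level */

    /* ...which lets the journal record the correct level for interior-node keys: */
    bch2_journal_add_keys(j, &trans->journal_res,
                          i->btree_id,
                          i->level,                 /* was hardcoded to 0 */
                          i->k);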
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 19 ++++++++++--------- fs/bcachefs/journal.h | 5 +++-- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs')
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e26531eb671..96b53effded7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c
@@ -307,8 +307,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, } static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *i) { struct bch_fs *c = trans->c; struct journal *j = &c->journal;
@@ -317,20 +316,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans, EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - insert->k.needs_whiteout = false; + i->k->k.needs_whiteout = false; - did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) - ? btree_insert_key_leaf(trans, iter, insert) - : bch2_btree_insert_key_cached(trans, iter, insert); + did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) + ? btree_insert_key_leaf(trans, i->iter, i->k) + : bch2_btree_insert_key_cached(trans, i->iter, i->k); if (!did_work) return; if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { bch2_journal_add_keys(j, &trans->journal_res, - iter->btree_id, insert); + i->btree_id, + i->level, + i->k); bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); + i->k->k.p.inode); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq;
@@ -468,7 +469,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, bch2_trans_mark_gc(trans); trans_for_each_update2(trans, i) - do_btree_insert_one(trans, i->iter, i->k); + do_btree_insert_one(trans, i); err: if (marking) { percpu_up_read(&c->mark_lock);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index cc497125889f..1d556790b38e 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h
@@ -241,10 +241,11 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, const struct bkey_i *k) + enum btree_id id, unsigned level, + const struct bkey_i *k) { bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - id, 0, k, k->k.u64s); + id, level, k, k->k.u64s); } static inline bool journal_entry_empty(struct jset *j)
-- cgit From 4d47b21c4dac0d27ad02add2c68be0afaa63ef98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Apr 2021 17:17:34 -0400 Subject: bcachefs: Fix a use after free
Turns out, we weren't waiting on in-flight btree writes when freeing existing btree nodes. This led to stray btree writes overwriting newly allocated buckets, but only started showing itself with some of the recent allocator work and another patch to move submitting of btree writes to workqueues.
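The fix (see the hunk below) is to wait for any in-flight write before the node's space can be reused; roughly:

    void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                                   struct btree *b)
    {
            ...
            btree_update_will_delete_key(as, &b->key);

            /* don't free the node while a write to it is still in flight: */
            btree_node_wait_on_io(b);
    }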
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b3137525f9c1..dabd1a3e3aa3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -887,6 +887,14 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_update_drop_new_node(c, b); btree_update_will_delete_key(as, &b->key); + + /* + * XXX: Waiting on io with btree node locks held, we don't want to be + * doing this. We can't have btree writes happening after the space has + * been freed, but we really only need to block before + * btree_update_nodes_written_trans() happens. + */ + btree_node_wait_on_io(b); } void bch2_btree_update_done(struct btree_update *as) -- cgit From 51c804ed2a60d995c1a358018491471e34bb76b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Apr 2021 15:28:34 -0400 Subject: bcachefs: Punt btree writes to workqueue to submit We don't want to be submitting IO with btree locks held, and btree writes usually aren't latency sensitive. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 20 ++++++++++++-------- fs/bcachefs/btree_io.h | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fd90e434c78c..2f5b7c629a9c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1340,6 +1340,13 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } +static void btree_write_submit(struct work_struct *work) +{ + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); +} + void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) { struct btree_write_bio *wbio; @@ -1347,7 +1354,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1358,8 +1364,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; - bch2_bkey_buf_init(&k); - if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1538,6 +1542,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) wbio_init(&wbio->wbio.bio); wbio->data = data; wbio->bytes = bytes; + wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; @@ -1559,9 +1564,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) * just make all btree node writes FUA to keep things sane. 
*/ - bch2_bkey_buf_copy(&k, c, &b->key); + bkey_copy(&wbio->key, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) ptr->offset += b->written; b->written += sectors_to_write; @@ -1569,9 +1574,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) atomic64_inc(&c->btree_writes_nr); atomic64_add(sectors_to_write, &c->btree_writes_sectors); - /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); - bch2_bkey_buf_exit(&k, c); + INIT_WORK(&wbio->work, btree_write_submit); + schedule_work(&wbio->work); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 95c351611045..c8a8b05a19b0 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -42,6 +42,7 @@ struct btree_read_bio { struct btree_write_bio { struct work_struct work; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); void *data; unsigned bytes; struct bch_write_bio wbio; -- cgit From 7b7278bbaf2a5d2e6955d763a8a30a3ba190dc10 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Apr 2021 20:21:39 -0400 Subject: bcachefs: Fix two btree iterator leaks Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 1 + fs/bcachefs/move.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 370f9e6916f3..7d681a7254c4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1621,6 +1621,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) if (ret) break; } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1c4ead7bd0bf..bce10eb4eb4c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -761,7 +761,7 @@ static int bch2_move_btree(struct bch_fs *c, id == start_btree_id ? start_pos : POS_MIN, BTREE_ITER_PREFETCH, b) { if (kthread && kthread_should_stop()) - goto out; + break; if ((cmp_int(id, end_btree_id) ?: bkey_cmp(b->key.k.p, end_pos)) > 0) @@ -788,8 +788,10 @@ next: } ret = bch2_trans_iter_free(&trans, iter) ?: ret; + if (kthread && kthread_should_stop()) + break; } -out: + bch2_trans_exit(&trans); if (ret) -- cgit From 6adaac0b95dd39da290f6006822fbf7560d2d1b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Apr 2021 20:21:12 -0400 Subject: bcachefs: Update bch2_btree_verify() bch2_btree_verify() verifies that the btree node on disk matches what we have in memory. This patch changes it to verify every replica, and also fixes it for interior btree nodes - there's a mem_ptr field which is used as a scratch space and needs to be zeroed out for comparing with what's on disk. 
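Two details from the diff below are worth pulling out: interior nodes need their mem_ptr scratch field zeroed before the comparison, and every replica is now read back and checked, not just the one the read path would normally pick. Condensed, with names as in the patch:

    /* mem_ptr is in-memory scratch space - zero it before comparing: */
    for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k))
            if (k->type == KEY_TYPE_btree_ptr_v2) {
                    struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
                    v->mem_ptr = 0;
            }

    /* check every replica, not just one: */
    bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
            failed |= bch2_btree_verify_replica(c, b, p);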
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 12 ++--- fs/bcachefs/btree_cache.c | 22 +-------- fs/bcachefs/btree_cache.h | 1 + fs/bcachefs/debug.c | 120 ++++++++++++++++++++++++++++++---------------- fs/bcachefs/debug.h | 4 -- 5 files changed, 87 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 55e40b3acb90..dc7b78b7e2a2 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -259,7 +259,11 @@ do { \ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ "Disables rewriting of btree nodes during mark and sweep")\ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache") + "Disables the shrinker callback for the btree node cache")\ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") /* Parameters that should only be compiled in in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ @@ -273,10 +277,6 @@ do { \ "information) when iterating over keys") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ @@ -816,11 +816,9 @@ mempool_t bio_bounce_pages; /* DEBUG JUNK */ struct dentry *debug; struct btree_debug btree_debug[BTREE_ID_NR]; -#ifdef CONFIG_BCACHEFS_DEBUG struct btree *verify_data; struct btree_node *verify_ondisk; struct mutex verify_lock; -#endif u64 *unused_inode_hints; unsigned inode_shard_bits; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8ed8610796fb..cc703c2602cf 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -101,7 +101,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) return b; } -static struct btree *btree_node_mem_alloc(struct bch_fs *c) +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b = __btree_node_mem_alloc(c); @@ -367,12 +367,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) flags = memalloc_nofs_save(); mutex_lock(&bc->lock); -#ifdef CONFIG_BCACHEFS_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &bc->live); kvpfree(c->verify_ondisk, btree_bytes(c)); -#endif for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) @@ -426,31 +424,15 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c)) { + if (!__bch2_btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } list_splice_init(&bc->live, &bc->freeable); -#ifdef CONFIG_BCACHEFS_DEBUG mutex_init(&c->verify_lock); - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); - if (!c->verify_ondisk) { - ret = -ENOMEM; - goto out; - } - - c->verify_data = btree_node_mem_alloc(c); - if (!c->verify_data) { - ret = -ENOMEM; - goto out; - } - - list_del_init(&c->verify_data->list); -#endif - bc->shrink.count_objects = bch2_btree_cache_count; bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4791c3b64452..c517cc029454 100644 --- 
a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,6 +17,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index eb8c57d253fb..d6dde62b6d48 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -29,40 +29,19 @@ static struct dentry *bch_debug; -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + struct extent_ptr_decoded pick) { struct btree *v = c->verify_data; - struct btree_node *n_ondisk, *n_sorted, *n_inmemory; - struct bset *sorted, *inmemory; - struct extent_ptr_decoded pick; - struct bch_dev *ca; + struct btree_node *n_ondisk = c->verify_ondisk; + struct btree_node *n_sorted = c->verify_data->data; + struct bset *sorted, *inmemory = &b->data->keys; + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; + bool failed = false; - if (c->opts.nochanges) - return; - - btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - n_ondisk = c->verify_ondisk; - n_sorted = c->verify_data->data; - n_inmemory = b->data; - - bkey_copy(&v->key, &b->key); - v->written = 0; - v->c.level = b->c.level; - v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v); - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick) <= 0) - return; - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (!bch2_dev_get_ioref(ca, READ)) - return; + return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_sorted, btree_bytes(c)), @@ -79,12 +58,12 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); + v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false)) - goto out; + return false; n_sorted = c->verify_data->data; sorted = &n_sorted->keys; - inmemory = &n_inmemory->keys; if (inmemory->u64s != sorted->u64s || memcmp(inmemory->start, @@ -102,8 +81,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) printk(KERN_ERR "*** read back in:\n"); bch2_dump_bset(c, v, sorted, 0); - while (offset < b->written) { - if (!offset ) { + while (offset < v->written) { + if (!offset) { i = &n_ondisk->keys; sectors = vstruct_blocks(n_ondisk, c->block_bits) << c->block_bits; @@ -122,25 +101,84 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) offset += sectors; } - printk(KERN_ERR "*** block %u/%u not written\n", - offset >> c->block_bits, btree_blocks(c)); - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) if (inmemory->_data[j] != sorted->_data[j]) break; - printk(KERN_ERR "b->written %u\n", b->written); - console_unlock(); - panic("verify failed at %u\n", j); + bch_err(c, "verify failed at key %u", j); + + failed = true; + } + + if (v->written != b->written) { + bch_err(c, "written wrong: expected %u, got %u", + b->written, v->written); + failed = true; + } + + return failed; +} + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + struct btree *v; + struct bset *inmemory = &b->data->keys; + struct bkey_packed *k; + bool failed = false; + + if (c->opts.nochanges) 
+ return; + + btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + if (!c->verify_ondisk) { + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!c->verify_ondisk) + goto out; + } + + if (!c->verify_data) { + c->verify_data = __bch2_btree_node_mem_alloc(c); + if (!c->verify_data) + goto out; + + list_del_init(&c->verify_data->list); + } + + BUG_ON(b->nsets != 1); + + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) + if (k->type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); + v->mem_ptr = 0; + } + + v = c->verify_data; + bkey_copy(&v->key, &b->key); + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; + bch2_btree_keys_init(v); + + ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); } out: mutex_unlock(&c->verify_lock); btree_node_io_unlock(b); } -#endif - #ifdef CONFIG_DEBUG_FS /* XXX: bch_fs refcounting */ diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 7ac1615e9447..0b86736e5e1b 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -8,11 +8,7 @@ struct bio; struct btree; struct bch_fs; -#ifdef CONFIG_BCACHEFS_DEBUG void __bch2_btree_verify(struct bch_fs *, struct btree *); -#else -static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -#endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { -- cgit From f09517fc5136ad2ea67b6f332ed2445f053ccfe6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Apr 2021 17:09:25 -0400 Subject: bcachefs: Fix a deadlock on journal reclaim Flushing the btree key cache needs to use allocation reserves - journal reclaim depends on flushing the btree key cache for making forward progress, and the allocator and copygc depend on journal reclaim making forward progress. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 6 ++++++ fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/movinggc.c | 13 ++++++++++++- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ac844f47b8dd..0716c3314a36 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -386,12 +386,18 @@ retry: goto evict; } + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: + * */ ret = bch2_btree_iter_traverse(b_iter) ?: bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) ? 
BTREE_INSERT_JOURNAL_RESERVED : 0)| diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e2086c76d104..416f8611f008 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -634,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) msecs_to_jiffies(j->reclaim_delay_ms))) min_nr = 1; - if (j->prereserved.reserved * 2 > j->prereserved.remaining) + if (j->prereserved.reserved * 4 > j->prereserved.remaining) min_nr = 1; if (fifo_free(&j->pin) <= 32) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 3d57a72e63e4..f9146ccd70ef 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -87,9 +87,20 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, if (i >= 0 && p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && p.ptr.gen == h->data[i].gen) { + /* + * We need to use the journal reserve here, because + * - journal reclaim depends on btree key cache + * flushing to make forward progress, + * - which has to make forward progress when the + * journal is pre-reservation full, + * - and depends on allocation - meaning allocator and + * copygc + */ + data_opts->target = io_opts->background_target; data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED; data_opts->rewrite_dev = p.ptr.dev; if (p.has_ec) -- cgit From eb365fbc333519651bed300eb866f26dbc1efd05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Apr 2021 18:08:39 -0400 Subject: bcachefs: Don't BUG() in update_replicas Apparently, we have a bug where in mark and sweep while accounting for a key, a replicas entry isn't found. Change the code to print out the key we couldn't mark and halt instead of a BUG_ON(). 
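The shape of the change, condensed from the diff below: update_replicas() now reports a missing replicas entry to its callers, which print the offending key and take the filesystem fatally offline rather than hitting a BUG_ON():

    static inline int update_replicas(struct bch_fs *c,
                                      struct bch_fs_usage *fs_usage,
                                      struct bch_replicas_entry *r,
                                      s64 sectors)
    {
            int idx = bch2_replicas_entry_idx(c, r);

            if (idx < 0)
                    return -1;      /* caller prints the key and calls bch2_fs_fatal_error() */

            fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
            fs_usage->replicas[idx] += sectors;
            return 0;
    }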
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b51f311ad60e..e6e984587b5d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -399,20 +399,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -static inline void update_replicas(struct bch_fs *c, +static inline int update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - BUG_ON(idx < 0); + if (idx < 0) + return -1; fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; + return 0; } -static inline void update_cached_sectors(struct bch_fs *c, +static inline int update_cached_sectors(struct bch_fs *c, struct bch_fs_usage *fs_usage, unsigned dev, s64 sectors) { @@ -420,7 +422,7 @@ static inline void update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - update_replicas(c, fs_usage, &r.e, sectors); + return update_replicas(c, fs_usage, &r.e, sectors); } static struct replicas_delta_list * @@ -573,8 +575,12 @@ static int bch2_mark_alloc(struct bch_fs *c, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - update_cached_sectors(c, fs_usage, ca->dev_idx, - -old_m.cached_sectors); + if (update_cached_sectors(c, fs_usage, ca->dev_idx, + -old_m.cached_sectors)) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); + return -1; + } + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), old_m.cached_sectors); } @@ -956,8 +962,12 @@ static int bch2_mark_extent(struct bch_fs *c, if (p.ptr.cached) { if (!stale) - update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors); + if (update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors)) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + return -1; + + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -976,8 +986,15 @@ static int bch2_mark_extent(struct bch_fs *c, } } - if (r.e.nr_devs) - update_replicas(c, fs_usage, &r.e, dirty_sectors); + if (r.e.nr_devs) { + if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + return -1; + } + } return 0; } @@ -1051,8 +1068,14 @@ static int bch2_mark_stripe(struct bch_fs *c, return ret; } - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); + if (update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant))) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + return -1; + } } return 0; @@ -1312,7 +1335,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - update_replicas(c, dst, &d->r, d->delta); + BUG_ON(update_replicas(c, dst, &d->r, d->delta)); } dst->nr_inodes += deltas->nr_inodes; -- cgit From 58686a259ed28f1ee50cc0aaa039e986aa218e0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Apr 2021 22:19:18 -0400 Subject: bcachefs: Lookup/create lost+found lazily This is prep work for subvolumes - each subvolume will have its own lost+found. 
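In practice this replaces the up-front check_lostfound() pass with a lookup_lostfound() helper that reattach_inode() calls on demand, creating the directory the first time it is actually needed; a condensed sketch of the new flow, taken from the diff below:

    ret = lookup_lostfound(trans, &lostfound);      /* looks it up, creates it if missing */
    if (ret)
            return ret;
    ...
    inode->bi_dir        = lostfound.bi_inum;
    inode->bi_dir_offset = dir_offset;

    return write_inode(trans, inode, U32_MAX);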
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 222 ++++++++++++++++++++++++++--------------------------- 1 file changed, 111 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cfe606342032..1ce038846476 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -38,9 +38,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) +static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter *iter; struct bkey_s_c k; @@ -63,19 +63,34 @@ err: return ret; } -static int write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); +} + +static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) { struct btree_iter *inode_iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, SPOS(0, inode->bi_inum, snapshot), BTREE_ITER_INTENT); + int ret = bch2_inode_write(trans, inode_iter, inode); + bch2_trans_iter_put(trans, inode_iter); + return ret; +} + +static int write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_inode_write(trans, inode_iter, inode)); - bch2_trans_iter_put(trans, inode_iter); + __write_inode(trans, inode, snapshot)); if (ret) bch_err(trans->c, "error in fsck: error %i updating inode", ret); return ret; @@ -114,57 +129,101 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) return ret; } -static int __reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, - u64 inum) +/* Get lost+found, create if it doesn't exist: */ +static int lookup_lostfound(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound) { - struct bch_hash_info dir_hash = - bch2_hash_info_init(trans->c, lostfound); - struct bch_inode_unpacked inode_u; + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root; + struct bch_hash_info root_hash_info; + struct qstr lostfound_str = QSTR("lost+found"); + u64 inum; + u32 snapshot; + int ret; + + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); + if (ret && ret != -ENOENT) + return ret; + + root_hash_info = bch2_hash_info_init(c, &root); + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, + &lostfound_str); + if (!inum) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + ret = lookup_inode(trans, inum, lostfound, &snapshot); + if (ret && ret != -ENOENT) { + /* + * The check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + bch_err(c, "error looking up lost+found: %i", ret); + return ret; + } + + if (ret == -ENOENT) { +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(trans, + BCACHEFS_ROOT_INO, &root, + lostfound, + &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); + if (ret) + bch_err(c, "error creating lost+found: %i", ret); + } + + return 0; +} + +static int 
reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; char name_buf[20]; struct qstr name; u64 dir_offset = 0; - u32 snapshot; int ret; - snprintf(name_buf, sizeof(name_buf), "%llu", inum); - name = (struct qstr) QSTR(name_buf); - - ret = lookup_inode(trans, inum, &inode_u, &snapshot); + ret = lookup_lostfound(trans, &lostfound); if (ret) return ret; - if (S_ISDIR(inode_u.bi_mode)) { - lostfound->bi_nlink++; + if (S_ISDIR(inode->bi_mode)) { + lostfound.bi_nlink++; - ret = write_inode(trans, lostfound, U32_MAX); + ret = write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } - ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, - mode_to_type(inode_u.bi_mode), - &name, inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); - if (ret) - return ret; + dir_hash = bch2_hash_info_init(trans->c, &lostfound); - inode_u.bi_dir = lostfound->bi_inum; - inode_u.bi_dir_offset = dir_offset; + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + name = (struct qstr) QSTR(name_buf); - return write_inode(trans, &inode_u, U32_MAX); -} + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, + mode_to_type(inode->bi_mode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE)); + if (ret) { + bch_err(trans->c, "error %i reattaching inode %llu", + ret, inode->bi_inum); + return ret; + } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, - u64 inum) -{ - int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - __reattach_inode(trans, lostfound, inum)); - if (ret) - bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; - return ret; + return write_inode(trans, inode, U32_MAX); } static int remove_backpointer(struct btree_trans *trans, @@ -931,58 +990,6 @@ create_root: BTREE_INSERT_LAZY_RW); } -/* Get lost+found, create if it doesn't exist: */ -static int check_lostfound(struct bch_fs *c, - struct bch_inode_unpacked *root_inode, - struct bch_inode_unpacked *lostfound_inode) -{ - struct qstr lostfound = QSTR("lost+found"); - struct bch_hash_info root_hash_info = - bch2_hash_info_init(c, root_inode); - u64 inum; - u32 snapshot; - int ret; - - bch_verbose(c, "checking lost+found"); - - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, - &lostfound); - if (!inum) { - bch_notice(c, "creating lost+found"); - goto create_lostfound; - } - - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, inum, lostfound_inode, &snapshot)); - if (ret && ret != -ENOENT) - return ret; - - if (fsck_err_on(ret, c, "lost+found missing")) - goto create_lostfound; - - if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, - "lost+found inode not a directory")) - goto create_lostfound; - - return 0; -fsck_err: - return ret; -create_lostfound: - bch2_inode_init_early(c, lostfound_inode); - - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_create_trans(&trans, - BCACHEFS_ROOT_INO, root_inode, - lostfound_inode, &lostfound, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) - bch_err(c, "error creating lost+found: %i", ret); - - return ret; -} - struct pathbuf { size_t nr; size_t size; @@ -1014,7 +1021,6 @@ static int path_down(struct pathbuf *p, u64 inum) } static int check_path(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, 
struct pathbuf *p, struct bch_inode_unpacked *inode) { @@ -1038,7 +1044,7 @@ static int check_path(struct btree_trans *trans, inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, lostfound, inode->bi_inum); + ret = reattach_inode(trans, inode); break; } ret = 0; @@ -1067,12 +1073,11 @@ static int check_path(struct btree_trans *trans, break; } - ret = reattach_inode(trans, lostfound, inode->bi_inum); + ret = reattach_inode(trans, inode); break; } - ret = lockrestart_do(trans, - lookup_inode(trans, inode->bi_dir, inode, &snapshot)); + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ bch_err(c, "error looking up parent directory: %i", ret); @@ -1090,8 +1095,7 @@ fsck_err: * After check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ -static int check_directory_structure(struct bch_fs *c, - struct bch_inode_unpacked *lostfound) +static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; struct btree_iter *iter; @@ -1113,7 +1117,7 @@ static int check_directory_structure(struct bch_fs *c, break; } - ret = check_path(&trans, lostfound, &path, &u); + ret = check_path(&trans, &path, &u); if (ret) break; } @@ -1190,7 +1194,6 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, } static int check_inode_nlink(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound_inode, struct btree_iter *iter, struct bkey_s_c_inode inode, unsigned nlink) @@ -1238,7 +1241,6 @@ fsck_err: noinline_for_stack static int bch2_gc_walk_inodes(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, nlink_table *links, u64 range_start, u64 range_end) { @@ -1259,7 +1261,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, continue; link = genradix_ptr(links, k.k->p.offset - range_start); - ret = check_inode_nlink(&trans, lostfound_inode, iter, + ret = check_inode_nlink(&trans, iter, bkey_s_c_to_inode(k), link ? link->count : 0); if (ret) break; @@ -1275,8 +1277,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, } noinline_for_stack -static int check_nlinks(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode) +static int check_nlinks(struct bch_fs *c) { nlink_table links; u64 this_iter_range_start, next_iter_range_start = 0; @@ -1296,7 +1297,7 @@ static int check_nlinks(struct bch_fs *c, if (ret) break; - ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, + ret = bch2_gc_walk_inodes(c, &links, this_iter_range_start, next_iter_range_start); if (ret) @@ -1316,16 +1317,15 @@ static int check_nlinks(struct bch_fs *c, */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode, lostfound_inode; + struct bch_inode_unpacked root_inode; return check_inodes(c, true) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: check_root(c, &root_inode) ?: - check_lostfound(c, &root_inode, &lostfound_inode) ?: - check_directory_structure(c, &lostfound_inode) ?: - check_nlinks(c, &lostfound_inode); + check_directory_structure(c) ?: + check_nlinks(c); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) -- cgit From 1c8441bea5366ec1e4f077ab675b49a603d7cad8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Apr 2021 16:18:43 -0400 Subject: bcachefs: Fix repair leading to replicas not marked bch2_check_fix_ptrs() was being called after checking if the replicas set was marked - but repair could change which replicas set needed to be marked. Oops. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a7ffd566d431..6983a1197905 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -330,6 +330,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > journal_cur_seq(&c->journal)); + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + if (ret) + goto err; + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k->k->version.lo, @@ -346,8 +350,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, goto err; } } - - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); } ptrs = bch2_bkey_ptrs_c(*k); -- cgit From 537c32f5212396751bf39b10d71af60af32dc257 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Apr 2021 16:05:49 -0400 Subject: bcachefs: Don't BUG_ON() btree topology error This replaces an assertion in the btree merge path with a bch2_inconsistent_error() - fsck will fix it. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 45 ++++++++++++++++++++++++++++++------- fs/bcachefs/btree_update_interior.c | 14 +++++++++++- 2 files changed, 50 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index cc703c2602cf..5a5eb99baefa 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -686,6 +686,41 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; } +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ + char buf1[100], buf2[100], buf3[100], buf4[100]; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : POS_MIN); + bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); + + bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); + bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree: ptr %u header %llu\n" + "level: ptr %u header %llu\n" + "min ptr %s node header %s\n" + "max ptr %s node header %s", + b->c.btree_id, BTREE_NODE_ID(b->data), + b->c.level, BTREE_NODE_LEVEL(b->data), + buf1, buf2, buf3, buf4); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) +{ + if (b->c.btree_id != BTREE_NODE_ID(b->data) || + b->c.level != BTREE_NODE_LEVEL(b->data) || + bpos_cmp(b->data->max_key, b->key.k.p) || + (b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) + btree_bad_header(c, b); +} + /** * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. 
@@ -803,10 +838,7 @@ lock_node: EBUG_ON(b->c.btree_id != iter->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); - EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + btree_check_header(c, b); return b; } @@ -886,10 +918,7 @@ lock_node: EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); - EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + btree_check_header(c, b); out: bch2_btree_cache_cannibalize_unlock(c); return b; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index dabd1a3e3aa3..986b396ba177 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1606,7 +1606,19 @@ retry: next = m; } - BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { + char buf1[100], buf2[100]; + + bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); + bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); + bch2_fs_inconsistent(c, + "btree topology error in btree merge:\n" + "prev ends at %s\n" + "next starts at %s\n", + buf1, buf2); + ret = -EIO; + goto err; + } bch2_bkey_format_init(&new_s); bch2_bkey_format_add_pos(&new_s, prev->data->min_key); -- cgit From 65c0601a329580f6a016298f7148305288b7d719 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 00:38:16 -0400 Subject: bcachefs: Use mmap() instead of vmalloc_exec() in userspace Calling mmap() directly is much better than malloc() then mprotect(), we end up with much less address space fragmentation. 
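For reference, a minimal standalone userspace sketch of the same idea: reserving a writable, executable buffer directly with mmap() instead of malloc() followed by mprotect(). This is illustrative only, not bcachefs code; alloc_exec_buf()/free_exec_buf() are made-up names.

        /* Sketch: one mmap() call gives an anonymous mapping with the
         * desired protections up front, so no later mprotect() is needed
         * and the allocator's heap is not fragmented by protection splits. */
        #include <stdio.h>
        #include <stddef.h>
        #include <sys/mman.h>

        static void *alloc_exec_buf(size_t size)
        {
                void *p = mmap(NULL, size, PROT_READ|PROT_WRITE|PROT_EXEC,
                               MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
                return p == MAP_FAILED ? NULL : p;
        }

        static void free_exec_buf(void *p, size_t size)
        {
                munmap(p, size);
        }

        int main(void)
        {
                size_t size = 1 << 16;
                void *buf = alloc_exec_buf(size);

                if (!buf) {
                        perror("mmap");
                        return 1;
                }
                printf("mapped %zu executable bytes at %p\n", size, buf);
                free_exec_buf(buf, size);
                return 0;
        }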
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5a5eb99baefa..a42e0922f5e9 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -33,21 +33,21 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } -static void __btree_node_data_free(struct bch_fs *c, struct btree *b) +static void btree_node_data_free(struct bch_fs *c, struct btree *b) { + struct btree_cache *bc = &c->btree_cache; + EBUG_ON(btree_node_write_in_flight(b)); kvpfree(b->data, btree_bytes(c)); b->data = NULL; +#ifdef __KERNEL__ kvfree(b->aux_data); +#else + munmap(b->aux_data, btree_aux_data_bytes(b)); +#endif b->aux_data = NULL; -} -static void btree_node_data_free(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - __btree_node_data_free(c, b); bc->used--; list_move(&b->list, &bc->freed); } @@ -75,8 +75,15 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) return -ENOMEM; - +#ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); +#else + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (b->aux_data == MAP_FAILED) + b->aux_data = NULL; +#endif if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; -- cgit From bc2e5d5c6636f6a5794e4ed306fb008b02e6f419 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 00:42:02 -0400 Subject: bcachefs: Fix an out of bounds read bch2_varint_decode() can read up to 7 bytes past the end of the buffer, which means we need to allocate slightly larger key cache buffers. 
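To illustrate why one extra u64 of slack is needed, here is a simplified standalone sketch - not the bcachefs varint format - of a decoder that always loads a full 8-byte word: it may touch up to 7 bytes past the last encoded byte, so the caller over-allocates accordingly.

        /* Sketch: a toy varint where the number of trailing zero bits in the
         * first byte gives the encoded length (1..8 bytes for valid input).
         * The decoder loads 8 bytes unconditionally, which is the over-read
         * the buffer slack has to absorb. */
        #include <stdint.h>
        #include <string.h>
        #include <stdio.h>

        static int varint_decode(const uint8_t *in, uint64_t *out)
        {
                uint64_t v;
                int bytes = __builtin_ctz(in[0] | 0x100) + 1;   /* 1..8 */

                memcpy(&v, in, 8);              /* may read past the value */
                *out = v >> bytes;
                return bytes;
        }

        int main(void)
        {
                /* buffer sized for the value plus 7 bytes of slack: */
                uint8_t buf[1 + 7] = { 0x03 };  /* 1-byte encoding of 1 */
                uint64_t v;
                int used = varint_decode(buf, &v);

                printf("decoded %llu from %d byte(s)\n",
                       (unsigned long long) v, used);
                return 0;
        }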
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++++++++-- fs/bcachefs/btree_update_leaf.c | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0716c3314a36..019d4d164553 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -219,8 +219,14 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; } - if (k.k->u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(k.k->u64s); + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + new_u64s = k.k->u64s + 1; + + if (new_u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { ret = -ENOMEM; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 96b53effded7..e537bd64e1fb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -293,6 +293,12 @@ btree_key_can_insert_cached(struct btree_trans *trans, !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; -- cgit From 8058b532ac3bb5a0cec80056fde8a4669de2b551 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 00:59:29 -0400 Subject: bcachefs: Fix bch2_verify_keylist_sorted Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/keylist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 864dfaa67b7a..cda77835b9ea 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -62,6 +62,6 @@ void bch2_verify_keylist_sorted(struct keylist *l) for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && - bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); + bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); } #endif -- cgit From bcd25dac53344577b784bad47775f97f02c47285 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 02:47:41 -0400 Subject: bcachefs: Rewrite btree nodes with errors This patch adds self healing functionality for btree nodes - if we notice a problem when reading a btree node, we just rewrite it. 
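A simplified userspace sketch of the self-healing pattern described above, assuming a node with multiple replicas: remember whether any copy had to be retried, and if the node was still readable, queue it for a background rewrite. read_node(), rewrite_node_async() and struct node are illustrative stand-ins, not bcachefs code.

        #include <stdbool.h>
        #include <stdio.h>

        struct node { unsigned long long seq; };

        static bool read_one_copy(struct node *n, int copy)
        {
                /* pretend copy 0 has a bad checksum and copy 1 is fine: */
                return copy != 0;
        }

        static void rewrite_node_async(struct node *n)
        {
                /* the real code queues a work item; here we just log it: */
                printf("queueing rewrite of node %llu\n", n->seq);
        }

        static int read_node(struct node *n, int nr_copies)
        {
                bool saw_error = false;
                int copy;

                for (copy = 0; copy < nr_copies; copy++) {
                        if (read_one_copy(n, copy))
                                break;
                        saw_error = true;       /* this copy needed a retry */
                }

                if (copy == nr_copies)
                        return -1;              /* every copy was bad */

                if (saw_error)
                        rewrite_node_async(n);  /* self-heal in the background */
                return 0;
        }

        int main(void)
        {
                struct node n = { .seq = 1234 };

                return read_node(&n, 2) ? 1 : 0;
        }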
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 ++++++ fs/bcachefs/btree_update.h | 1 + fs/bcachefs/btree_update_interior.c | 50 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2f5b7c629a9c..cea151a5d4f8 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -986,6 +986,7 @@ static void btree_node_read_work(struct work_struct *work) struct bch_io_failures failed = { .nr = 0 }; char buf[200]; struct printbuf out; + bool saw_error = false; bool can_retry; goto start; @@ -1022,6 +1023,8 @@ start: !bch2_btree_node_read_done(c, ca, b, can_retry)) break; + saw_error = true; + if (!can_retry) { set_btree_node_read_error(b); break; @@ -1031,6 +1034,10 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); + + if (saw_error && !btree_node_read_error(b)) + bch2_btree_node_rewrite_async(c, b); + clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 0c7caa7e91a0..56131ac516ce 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -72,6 +72,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); +void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 986b396ba177..3ca4114c74ad 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1797,6 +1797,56 @@ out: return ret; } +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + enum btree_id btree_id; + unsigned level; + struct bpos pos; + __le64 seq; +}; + +void async_btree_node_rewrite_work(struct work_struct *work) +{ + struct async_btree_rewrite *a = + container_of(work, struct async_btree_rewrite, work); + struct bch_fs *c = a->c; + struct btree_trans trans; + struct btree_iter *iter; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + bch2_btree_node_rewrite(c, iter, a->seq, 0); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); + kfree(a); +} + +void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) +{ + struct async_btree_rewrite *a; + + if (!percpu_ref_tryget(&c->writes)) + return; + + a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) { + percpu_ref_put(&c->writes); + return; + } + + a->c = c; + a->btree_id = b->c.btree_id; + a->level = b->c.level; + a->pos = b->key.k.p; + a->seq = b->data->keys.seq; + + INIT_WORK(&a->work, async_btree_node_rewrite_work); + queue_work(system_long_wq, &a->work); +} + static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree_update *as, struct btree_iter *iter, -- cgit From 0098376f03c1e9591beeab9815c4d756dd2d68a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Apr 2021 19:25:27 -0400 Subject: bcachefs: New helper __bch2_btree_insert_keys_interior() Consolidate common parts of bch2_btree_insert_keys_interior() and btree_split_insert_keys() - prep work for adding some new topology assertions. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 48 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3ca4114c74ad..113e81e8730c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1154,6 +1154,27 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b set_btree_node_need_write(b); } +static void +__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys, + struct btree_node_iter node_iter) +{ + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); + + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + while (!bch2_keylist_empty(keys)) { + bch2_insert_fixup_btree_ptr(as, b, iter, + bch2_keylist_front(keys), &node_iter); + bch2_keylist_pop_front(keys); + } +} + /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) @@ -1284,16 +1305,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; - BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); - bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - while (!bch2_keylist_empty(keys)) { - k = bch2_keylist_front(keys); - - bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); - bch2_keylist_pop_front(keys); - } + __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1439,24 +1453,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, struct btree_iter *iter, struct keylist *keys) { struct btree_iter *linked; - struct btree_node_iter node_iter; - struct bkey_i *insert = bch2_keylist_front(keys); - struct bkey_packed *k; - - /* Don't screw up @iter's position: */ - node_iter = iter->l[b->c.level].iter; - - /* - * btree_split(), btree_gc_coalesce() will insert keys before - * the iterator's current position - they know the keys go in - * the node the iterator points to: - */ - while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) - ; - for_each_keylist_key(keys, insert) - bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); + __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); btree_update_updated_node(as, b); -- cgit From 4932e07ea04bcc7f1649052183d1ebbab30c711c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 18:02:59 -0400 Subject: bcachefs: Fix key cache assertion Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_key_cache.c | 4 +++- fs/bcachefs/recovery.c | 11 +++++++---- fs/bcachefs/super.c | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dc7b78b7e2a2..64a9094ec748 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -489,6 +489,7 @@ enum { BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_RW, + BCH_FS_WAS_RW, /* shutdown: */ BCH_FS_STOPPING, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 019d4d164553..f8b9ca4dfb2b 100644 --- 
a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -683,7 +683,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) kmem_cache_free(bch2_key_cache, ck); } - BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); + BUG_ON(atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)); BUG_ON(atomic_long_read(&bc->nr_keys)); mutex_unlock(&bc->lock); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2dc3dee4efc8..fe6886e42216 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -973,7 +973,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct jset *last_journal_entry = NULL; u64 blacklist_seq, journal_seq; bool write_sb = false; - int ret; + int ret = 0; if (c->sb.clean) clean = read_superblock_clean(c); @@ -1253,10 +1253,9 @@ use_clean: if (c->journal_seq_blacklist_table && c->journal_seq_blacklist_table->nr > 128) queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); -out: + ret = 0; -err: -fsck_err: +out: set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); @@ -1270,6 +1269,10 @@ fsck_err: else bch_verbose(c, "ret %i", ret); return ret; +err: +fsck_err: + bch2_fs_emergency_read_only(c); + goto out; } int bch2_fs_initialize(struct bch_fs *c) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 18ad2db9f4bf..64bc5ed33203 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -441,6 +441,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); + set_bit(BCH_FS_WAS_RW, &c->flags); return 0; err: __bch2_fs_read_only(c); -- cgit From aae15aafcd43ec5346ac6c3f61c09798d26593ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 16:32:35 -0400 Subject: bcachefs: New and improved topology repair code This splits out btree topology repair into a separate pass, and makes some improvements: - When we have to pick which of two overlapping nodes to drop keys from, we use the btree node header sequence number to preserve the newer node - the gc code has been changed so that it doesn't bail out if we're continuing/ignoring on fsck error - this way the dump tool can skip running the repair pass but still walk all reachable metadata - add a new superblock flag indicating when a filesystem is known to have btree topology issues, and the topology repair pass should be run - changing the start/end of a node might mean keys in that node have to be deleted: this patch handles that better by splitting it out into a separate function and running it explicitly in the topology repair code, previously those keys were only being dropped when the btree node was read in. 
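As a rough illustration of the first point above, a small standalone sketch (illustrative only, not the bcachefs repair code) of deciding which of two overlapping sibling nodes to shrink: the node with the older header sequence number gives up the overlapping range, so the newer node's data is preserved.

        #include <stdint.h>
        #include <stdio.h>

        struct node {
                uint64_t min, max;      /* key range covered, inclusive */
                uint64_t seq;           /* header sequence number */
        };

        static void repair_overlap(struct node *prev, struct node *cur)
        {
                if (prev->max < cur->min)
                        return;                         /* no overlap */

                if (cur->seq > prev->seq)
                        prev->max = cur->min - 1;       /* shrink the older prev */
                else
                        cur->min = prev->max + 1;       /* shrink the older cur */
        }

        int main(void)
        {
                struct node prev = { .min = 0,   .max = 120, .seq = 5 };
                struct node cur  = { .min = 100, .max = 200, .seq = 9 };

                repair_overlap(&prev, &cur);
                printf("prev: [%llu,%llu] cur: [%llu,%llu]\n",
                       (unsigned long long) prev.min, (unsigned long long) prev.max,
                       (unsigned long long) cur.min, (unsigned long long) cur.max);
                return 0;
        }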
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/btree_gc.c | 455 +++++++++++++++++++++++++++++------- fs/bcachefs/btree_io.c | 58 ++++- fs/bcachefs/btree_io.h | 2 + fs/bcachefs/btree_update_interior.c | 11 +- fs/bcachefs/error.c | 18 +- fs/bcachefs/error.h | 3 + fs/bcachefs/recovery.c | 3 +- fs/bcachefs/super-io.c | 7 + fs/bcachefs/super.c | 5 + 11 files changed, 471 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 64a9094ec748..323705f352de 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -485,6 +485,7 @@ enum { BCH_FS_ALLOCATOR_RUNNING, BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, @@ -498,7 +499,9 @@ enum { /* errors: */ BCH_FS_ERROR, + BCH_FS_TOPOLOGY_ERROR, BCH_FS_ERRORS_FIXED, + BCH_FS_ERRORS_NOT_FIXED, /* misc: */ BCH_FS_NEED_ANOTHER_GC, diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f2b5f5c06ee0..98a87e4928ab 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1317,12 +1317,10 @@ LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); +LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); -/* bit 61 was reflink option */ LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); -/* 61-64 unused */ - LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6983a1197905..09a49dc63144 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -66,8 +66,6 @@ static int bch2_gc_check_topology(struct bch_fs *c, ? 
node_start : bpos_successor(prev->k->k.p); char buf1[200], buf2[200]; - bool update_min = false; - bool update_max = false; int ret = 0; if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { @@ -81,83 +79,341 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); } - if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) - update_min = true; + if (bpos_cmp(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + + if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + } + + if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + bch2_topology_error(c); + + if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); +fsck_err: + return ret; +} + +static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) +{ + switch (b->key.k.type) { + case KEY_TYPE_btree_ptr: { + struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); + + dst->k.p = src->k.p; + dst->v.mem_ptr = 0; + dst->v.seq = b->data->keys.seq; + dst->v.sectors_written = 0; + dst->v.flags = 0; + dst->v.min_key = b->data->min_key; + set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); + memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); + break; + } + case KEY_TYPE_btree_ptr_v2: + bkey_copy(&dst->k_i, &b->key); + break; + default: + BUG(); + } +} + +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + + btree_ptr_to_v2(b, new); + b->data->min_key = new_min; + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + + return 0; +} + +static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); + if (ret) + return ret; + + new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + + btree_ptr_to_v2(b, new); + b->data->max_key = new_max; + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + + 
mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + return 0; +} + +static int btree_repair_node_start(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur) +{ + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); + char buf1[200], buf2[200]; + int ret = 0; + + if (!prev) { + struct printbuf out = PBUF(buf1); + pr_buf(&out, "start of node: "); + bch2_bpos_to_text(&out, b->data->min_key); + } else { + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); + } + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) { + if (prev && + bpos_cmp(expected_start, cur->data->min_key) > 0 && + BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + else + ret = set_node_min(c, cur, expected_start); + if (ret) + return ret; } +fsck_err: + return ret; +} - if (fsck_err_on(is_last && - bpos_cmp(cur.k->k.p, node_end), c, +static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) +{ + char buf1[200], buf2[200]; + int ret = 0; + + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) - update_max = true; + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) + return ret; + } +fsck_err: + return ret; +} - bch2_bkey_buf_copy(prev, c, cur.k); +#define DROP_THIS_NODE 10 - if (update_min || update_max) { - struct bkey_i *new; - struct bkey_i_btree_ptr_v2 *bp = NULL; - struct btree *n; +static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) +{ + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf tmp; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; + char buf[200]; + int ret = 0; - if (update_max) { + if (!b->c.level) + return 0; +again: + have_child = dropped_children = false; + bch2_bkey_buf_init(&tmp); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + + bch2_btree_and_journal_iter_advance(&iter); + bch2_bkey_buf_reassemble(&tmp, c, k); + + cur = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); + + if (mustfix_fsck_err_on(ret == -EIO, c, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur.k->k.p); + b->c.level, tmp.k->k.p); if (ret) - return ret; + goto err; + continue; } - new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); 
- if (!new) { - bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; + if (ret) { + bch_err(c, "%s: error %i getting btree node", + __func__, ret); + break; } - bkey_copy(new, cur.k); + ret = btree_repair_node_start(c, b, prev, cur); + if (prev) + six_unlock_read(&prev->c.lock); + prev = cur; + cur = NULL; + + if (ret) + break; + } + + if (!ret && !IS_ERR_OR_NULL(prev)) { + BUG_ON(cur); + ret = btree_repair_node_end(c, b, prev); + } + + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + prev = NULL; + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + cur = NULL; - if (new->k.type == KEY_TYPE_btree_ptr_v2) - bp = bkey_i_to_btree_ptr_v2(new); + if (ret) + goto err; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_and_journal_iter_advance(&iter); - if (update_min) - bp->v.min_key = expected_start; - if (update_max) - new->k.p = node_end; - if (bp) - SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); + cur = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); if (ret) { - kfree(new); - return ret; + bch_err(c, "%s: error %i getting btree node", + __func__, ret); + goto err; } - n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, - b->c.level - 1, true); - if (n) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, n); - - bkey_copy(&n->key, new); - if (update_min) - n->data->min_key = expected_start; - if (update_max) - n->data->max_key = node_end; - - ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&n->c.lock); + ret = bch2_btree_repair_topology_recurse(c, cur); + six_unlock_read(&cur->c.lock); + cur = NULL; + + if (ret == DROP_THIS_NODE) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, tmp.k->k.p); + dropped_children = true; } + + if (ret) + goto err; + + have_child = true; } + + if (mustfix_fsck_err_on(!have_child, c, + "empty interior btree node at btree %s level %u\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) + ret = DROP_THIS_NODE; +err: fsck_err: + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&tmp, c); + + if (!ret && dropped_children) + goto again; + + return ret; +} + +static int bch2_repair_topology(struct bch_fs *c) +{ + struct btree *b; + unsigned i; + int ret = 0; + + for (i = 0; i < BTREE_ID_NR && !ret; i++) { + b = c->btree_roots[i].b; + if (btree_node_fake(b)) + continue; + + six_lock_read(&b->c.lock, NULL, NULL); + ret = bch2_btree_repair_topology_recurse(c, b); + six_unlock_read(&b->c.lock); + + if (ret == DROP_THIS_NODE) { + bch_err(c, "empty btree root - repair unimplemented"); + ret = FSCK_ERR_EXIT; + } + } + return ret; } @@ -483,6 +739,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, struct bkey_s_c k; struct bkey_buf cur, prev; u8 max_stale = 0; + char buf[200]; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -498,7 +755,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, &k, &max_stale, true); if (ret) { 
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); - break; + goto fsck_err; } if (b->c.level) { @@ -511,7 +768,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, &prev, cur, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) - break; + goto fsck_err; } else { bch2_btree_and_journal_iter_advance(&iter); } @@ -532,18 +789,25 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, false); ret = PTR_ERR_OR_ZERO(child); - if (fsck_err_on(ret == -EIO, c, - "unreadable btree node")) { - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur.k->k.p); - if (ret) - return ret; - - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - continue; - } - - if (ret) { + if (ret == -EIO) { + bch2_topology_error(c); + + if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; + } else { + /* Continue marking when opted to not + * fix the error: */ + ret = 0; + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + continue; + } + } else if (ret) { bch_err(c, "%s: error %i getting btree node", __func__, ret); break; @@ -583,16 +847,20 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); - if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { - BUG(); + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; } - if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, "btree root with incorrect max_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { - BUG(); + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; } if (b->c.level >= target_depth) @@ -607,7 +875,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, fsck_err: six_unlock_read(&b->c.lock); - if (ret) + if (ret < 0) bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -622,23 +890,20 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; unsigned i; + int ret = 0; for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < BTREE_ID_NR; i++) { - enum btree_id id = ids[i]; - int ret = initial - ? bch2_gc_btree_init(c, id, metadata_only) - : bch2_gc_btree(c, id, initial, metadata_only); - if (ret) { - bch_err(c, "%s: ret %i", __func__, ret); - return ret; - } - } + for (i = 0; i < BTREE_ID_NR && !ret; i++) + ret = initial + ? 
bch2_gc_btree_init(c, ids[i], metadata_only) + : bch2_gc_btree(c, ids[i], initial, metadata_only); - return 0; + if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); + return ret; } static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, @@ -1025,7 +1290,27 @@ again: bch2_mark_superblocks(c); + if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && + c->opts.fix_errors != FSCK_OPT_NO) { + bch_info(c, "starting topology repair pass"); + ret = bch2_repair_topology(c); + if (ret) + goto out; + bch_info(c, "topology repair pass done"); + } + ret = bch2_gc_btrees(c, initial, metadata_only); + + if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + ret = 0; + } + + if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) + ret = FSCK_ERR_EXIT; + if (ret) goto out; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cea151a5d4f8..389524ce1fb6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -558,6 +558,55 @@ out: \ #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) +/* + * When btree topology repair changes the start or end of a node, that might + * mean we have to drop keys that are no longer inside the node: + */ +__cold +void bch2_btree_node_drop_keys_outside_node(struct btree *b) +{ + struct bset_tree *t; + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) + break; + + if (k != i->start) { + unsigned shift = (u64 *) k - (u64 *) i->start; + + memmove_u64s_down(i->start, k, + (u64 *) vstruct_end(i) - (u64 *) k); + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) + break; + + if (k != vstruct_last(i)) { + i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } + } + + bch2_btree_build_aux_trees(b); + + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + } +} + static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, unsigned sectors, int write, bool have_retry) @@ -680,6 +729,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; for (k = i->start; @@ -713,7 +764,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, u.s_c) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { char buf[160]; @@ -770,6 +821,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; int ret, retry_read = 0, write = READ; @@ -917,6 +970,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); + i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index c8a8b05a19b0..cadcf7f886d7 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -131,6 +131,8 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); +void bch2_btree_node_drop_keys_outside_node(struct btree *); + void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct bch_fs *, struct btree *, struct btree_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 113e81e8730c..ea522b4583fd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1609,11 +1609,12 @@ retry: bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); - bch2_fs_inconsistent(c, - "btree topology error in btree merge:\n" - "prev ends at %s\n" - "next starts at %s\n", - buf1, buf2); + bch_err(c, + "btree topology error in btree merge:\n" + " prev ends at %s\n" + " next starts at %s", + buf1, buf2); + bch2_topology_error(c); ret = -EIO; goto err; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a8ee1db8aa39..90c3b986c264 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -25,6 +25,13 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } +void bch2_topology_error(struct bch_fs *c) +{ + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + bch2_inconsistent_error(c); +} + void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) @@ -74,9 +81,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, vprintk(fmt, args); va_end(args); - return bch2_inconsistent_error(c) - ? FSCK_ERR_EXIT - : FSCK_ERR_FIX; + if (c->opts.errors == BCH_ON_ERROR_continue) { + bch_err(c, "fixing"); + return FSCK_ERR_FIX; + } else { + bch2_inconsistent_error(c); + return FSCK_ERR_EXIT; + } } mutex_lock(&c->fsck_error_lock); @@ -146,6 +157,7 @@ print: set_bit(BCH_FS_ERRORS_FIXED, &c->flags); return FSCK_ERR_FIX; } else { + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); set_bit(BCH_FS_ERROR, &c->flags); return c->opts.fix_errors == FSCK_OPT_EXIT || !(flags & FSCK_CAN_IGNORE) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 0e49fd728e44..d8cd19b3f63c 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -29,6 +29,8 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); +void bch2_topology_error(struct bch_fs *); + #define bch2_fs_inconsistent(c, ...) 
\ ({ \ bch_err(c, __VA_ARGS__); \ @@ -88,6 +90,7 @@ enum fsck_err_ret { FSCK_ERR_IGNORE = 0, FSCK_ERR_FIX = 1, FSCK_ERR_EXIT = 2, + FSCK_ERR_START_TOPOLOGY_REPAIR = 3, }; struct fsck_err_state { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fe6886e42216..a9ccd14effe7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1241,8 +1241,9 @@ use_clean: if (c->opts.fsck && !test_bit(BCH_FS_ERROR, &c->flags) && - BCH_SB_HAS_ERRORS(c->disk_sb.sb)) { + !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bf36a5743607..e0de6f0c0cb4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -439,6 +439,11 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); + if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) + set_bit(BCH_FS_ERROR, &c->flags); + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) return ret; @@ -715,6 +720,8 @@ int bch2_write_super(struct bch_fs *c) if (test_bit(BCH_FS_ERROR, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 64bc5ed33203..78db2c0a5f5a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -388,6 +388,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) unsigned i; int ret; + if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); + return -EROFS; + } + if (test_bit(BCH_FS_RW, &c->flags)) return 0; -- cgit From e3b4b48c17a0f749f2786e756714a56316a519b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Apr 2021 22:33:25 -0400 Subject: bcachefs: Fix a null ptr deref Fix a few memory safety issues, found by asan in userspace. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1ce038846476..b867576b3ffd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -290,21 +290,24 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { - struct bkey_i delete; + struct bkey_i *delete; struct bkey_i *tmp; + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); + tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if (IS_ERR(tmp)) return PTR_ERR(tmp); bkey_reassemble(tmp, k); - bkey_init(&delete.k); - delete.k.p = k_iter->pos; - bch2_trans_update(trans, k_iter, &delete, 0); + bkey_init(&delete->k); + delete->k.p = k_iter->pos; + bch2_trans_update(trans, k_iter, delete, 0); - return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, - tmp, 0); + return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -377,9 +380,8 @@ static int hash_check_key(struct btree_trans *trans, return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " - "hashed to %llu should be at %llu\n%s", - desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, - hash, iter->pos.offset, + "hashed to %llu\n%s", + desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) return 0; -- cgit From fc51b041b72a7cbffc60811ff14d25207a4f7624 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Apr 2021 21:08:49 -0400 Subject: bcachefs: New check_nlinks algorithm for snapshots With snapshots, using a radix tree for the table of link counts won't work anymore because we also need to distinguish between inodes with different snapshot IDs. Instead, this patch builds up a sorted array of inodes that have hardlinks that we can binary search on - taking advantage of the fact that with inode backpointers, the check_nlinks() pass _only_ needs to concern itself with inodes that have hardlinks now. 
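A small standalone sketch of the data structure described above (simplified: no snapshot IDs, minimal error handling; not the bcachefs implementation): a growable array of (inum, count) entries, sorted by inode number, that the dirents pass can binary-search to bump link counts.

        #include <stdint.h>
        #include <stdlib.h>
        #include <stdio.h>

        struct nlink { uint64_t inum; uint32_t count; };

        struct nlink_table {
                size_t nr, size;
                struct nlink *d;
        };

        static int add_nlink(struct nlink_table *t, uint64_t inum)
        {
                if (t->nr == t->size) {
                        size_t new_size = t->size ? t->size * 2 : 128;
                        struct nlink *d = realloc(t->d, new_size * sizeof(*d));

                        if (!d)
                                return -1;
                        t->d = d;
                        t->size = new_size;
                }

                t->d[t->nr++] = (struct nlink) { .inum = inum };
                return 0;
        }

        static int nlink_cmp(const void *l, const void *r)
        {
                const struct nlink *a = l, *b = r;

                return a->inum < b->inum ? -1 : a->inum > b->inum;
        }

        /* called once per dirent; only inodes added above are counted: */
        static void inc_link(struct nlink_table *t, uint64_t inum)
        {
                struct nlink key = { .inum = inum }, *l;

                l = bsearch(&key, t->d, t->nr, sizeof(t->d[0]), nlink_cmp);
                if (l)
                        l->count++;
        }

        int main(void)
        {
                struct nlink_table t = { 0 };
                size_t i;

                /* pass 1: inodes 10 and 42 have hardlinks: */
                add_nlink(&t, 10);
                add_nlink(&t, 42);
                qsort(t.d, t.nr, sizeof(t.d[0]), nlink_cmp);

                /* pass 2: dirents reference 10 once and 42 twice: */
                inc_link(&t, 10);
                inc_link(&t, 42);
                inc_link(&t, 42);

                for (i = 0; i < t.nr; i++)
                        printf("inode %llu: %u links\n",
                               (unsigned long long) t.d[i].inum, t.d[i].count);
                free(t.d);
                return 0;
        }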
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 228 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 145 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b867576b3ffd..8ae4e4c30933 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,8 +12,8 @@ #include "super.h" #include "xattr.h" +#include #include /* struct qstr */ -#include #define QSTR(n) { { { .len = strlen(n) } }, .name = n } @@ -1132,38 +1132,120 @@ static int check_directory_structure(struct bch_fs *c) return bch2_trans_exit(&trans) ?: ret; } -struct nlink { - u32 count; -}; +struct nlink_table { + size_t nr; + size_t size; -typedef GENRADIX(struct nlink) nlink_table; + struct nlink { + u64 inum; + u32 snapshot; + u32 count; + } *d; +}; -static void inc_link(struct bch_fs *c, nlink_table *links, - u64 range_start, u64 *range_end, u64 inum) +static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) { - struct nlink *link; + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + if (!d) { + return -ENOMEM; + } - if (inum < range_start || inum >= *range_end) - return; + memcpy(d, t->d, t->size * sizeof(t->d[0])); + kvfree(t->d); - if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { - *range_end = inum; - return; + t->d = d; + t->size = new_size; } - link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); - if (!link) { - bch_verbose(c, "allocation failed during fsck - will need another pass"); - *range_end = inum; + + t->d[t->nr++] = (struct nlink) { + .inum = inum, + .snapshot = snapshot, + }; + + return 0; +} + +static int nlink_cmp(const void *_l, const void *_r) +{ + const struct nlink *l = _l; + const struct nlink *r = _r; + + return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); +} + +static void inc_link(struct bch_fs *c, struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum) +{ + struct nlink *link, key = { + .inum = inum, .snapshot = U32_MAX, + }; + + if (inum < range_start || inum >= range_end) return; + + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); + if (link) + link->count++; +} + +noinline_for_stack +static int check_nlinks_find_hardlinks(struct bch_fs *c, + struct nlink_table *t, + u64 start, u64 *end) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, start), 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + + inode = bkey_s_c_to_inode(k); + + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + continue; + + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(inode, &u)); + + if (!u.bi_nlink) + continue; + + ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); - link->count++; + return ret; } noinline_for_stack -static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, - u64 range_start, u64 *range_end) 
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, + u64 range_start, u64 range_end) { struct btree_trans trans; struct btree_iter *iter; @@ -1195,80 +1277,58 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, return ret; } -static int check_inode_nlink(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_inode inode, - unsigned nlink) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; - int ret = 0; - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) - return 0; - - if (!nlink) { - bch_err(c, "no links found to inode %llu", inode.k->p.offset); - return -EINVAL; - } - - ret = bch2_inode_unpack(inode, &u); - - /* Should never happen, checked by bch2_inode_invalid: */ - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fsck", - inode.k->p.inode)) - return ret; - - if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, - "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), - bch2_inode_nlink_get(&u), nlink)) { - bch2_inode_nlink_set(&u, nlink); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_inode_write(trans, iter, &u)); - if (ret) - bch_err(c, "error in fsck: error %i updating inode", ret); - } -fsck_err: - return ret; -} - noinline_for_stack -static int bch2_gc_walk_inodes(struct bch_fs *c, - nlink_table *links, +static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, u64 range_start, u64 range_end) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct nlink *link; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; + struct nlink *link = links->d; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, range_start), 0, k, ret) { - if (!k.k || k.k->p.offset >= range_end) + if (k.k->p.offset >= range_end) break; if (k.k->type != KEY_TYPE_inode) continue; - link = genradix_ptr(links, k.k->p.offset - range_start); - ret = check_inode_nlink(&trans, iter, - bkey_s_c_to_inode(k), link ? 
link->count : 0); - if (ret) - break; + inode = bkey_s_c_to_inode(k); + if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + continue; + + BUG_ON(bch2_inode_unpack(inode, &u)); + if (!u.bi_nlink) + continue; + + while (link->inum < k.k->p.offset) { + link++; + BUG_ON(link >= links->d + links->nr); + } + + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", + u.bi_inum, mode_to_type(u.bi_mode), + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_inode_write(&trans, iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i updating inode", ret); + } } +fsck_err: bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); @@ -1281,34 +1341,36 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, noinline_for_stack static int check_nlinks(struct bch_fs *c) { - nlink_table links; + struct nlink_table links = { 0 }; u64 this_iter_range_start, next_iter_range_start = 0; int ret = 0; bch_verbose(c, "checking inode nlinks"); - genradix_init(&links); - do { this_iter_range_start = next_iter_range_start; next_iter_range_start = U64_MAX; - ret = bch2_gc_walk_dirents(c, &links, + ret = check_nlinks_find_hardlinks(c, &links, + this_iter_range_start, + &next_iter_range_start); + + ret = check_nlinks_walk_dirents(c, &links, this_iter_range_start, - &next_iter_range_start); + next_iter_range_start); if (ret) break; - ret = bch2_gc_walk_inodes(c, &links, + ret = check_nlinks_update_hardlinks(c, &links, this_iter_range_start, next_iter_range_start); if (ret) break; - genradix_free(&links); + links.nr = 0; } while (next_iter_range_start != U64_MAX); - genradix_free(&links); + kvfree(links.d); return ret; } -- cgit From ceda1b9a179ffd8ece3f7d15d5b1379eb2552215 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Apr 2021 16:24:03 -0400 Subject: bcachefs: Evict btree nodes we're deleting There was a bug that led to duplicate btree node pointers being inserted at the wrong level. The new topology repair code can fix that, except that the btree cache code gets confused when we read in a btree node from the pointer that was at the wrong level. This patch evicts nodes that we're deleting to, which nicely solves the problem. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 30 ++++++++++++++++++++++++++++++ fs/bcachefs/btree_cache.h | 2 ++ fs/bcachefs/btree_gc.c | 2 ++ 3 files changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a42e0922f5e9..85c19e4e5216 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -948,6 +948,36 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); } +void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = btree_cache_find(bc, k); + if (!b) + return; + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + __bch2_btree_node_write(c, b); + + /* wait for any in flight btree write */ + btree_node_wait_on_io(b); + + BUG_ON(btree_node_dirty(b)); + + mutex_lock(&bc->lock); + btree_node_data_free(c, b); + bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b) { diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c517cc029454..40dd263a7caa 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -30,6 +30,8 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const struct bkey_i *, enum btree_id, unsigned); +void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); + void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); void bch2_fs_btree_cache_init_early(struct btree_cache *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 09a49dc63144..1c2eab41f7ca 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -298,6 +298,7 @@ again: bch2_btree_ids[b->c.btree_id], b->c.level - 1, (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { + bch2_btree_node_evict(c, tmp.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, tmp.k->k.p); if (ret) @@ -359,6 +360,7 @@ again: cur = NULL; if (ret == DROP_THIS_NODE) { + bch2_btree_node_evict(c, tmp.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, tmp.k->k.p); dropped_children = true; -- cgit From d36cdb045aa7029f8495d18760da6c944ab95cb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Apr 2021 11:12:17 -0400 Subject: bcachefs: Fix __bch2_trans_get_iter() We need to also set iter->uptodate to indicate it needs to be traversed. 
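A minimal userspace sketch of the intended behaviour, with a made-up iterator struct standing in for struct btree_iter: walk the current level toward the requested depth one step at a time, and flag the iterator as needing a re-traverse whenever a level is dropped.

  #include <stdio.h>
  #include <stdbool.h>

  struct iter_sketch {
      int level;          /* level currently locked/cached */
      bool need_traverse; /* stand-in for BTREE_ITER_NEED_TRAVERSE */
  };

  static void iter_set_depth(struct iter_sketch *iter, int depth)
  {
      while (iter->level != depth) {
          /* whatever was held for this level is dropped here */
          iter->need_traverse = true;

          if (iter->level < depth)
              iter->level++;
          else
              iter->level--;
      }
  }

  int main(void)
  {
      struct iter_sketch it = { .level = 3, .need_traverse = false };

      iter_set_depth(&it, 1);
      printf("level %d, need_traverse %d\n", it.level, it.need_traverse);
      return 0;
  }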
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 11f7b47e3e7f..757e5650f33c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2085,15 +2085,16 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, btree_iter_get_locks(iter, true, false); } - while (iter->level < depth) { + while (iter->level != depth) { btree_node_unlock(iter, iter->level); iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; - iter->level++; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (iter->level < depth) + iter->level++; + else + iter->level--; } - while (iter->level > depth) - iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; - iter->min_depth = depth; bch2_btree_iter_set_pos(iter, pos); -- cgit From 3dea728ce64bdd3c8c5e98cb7b8a148d52c8f0d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 16:56:17 -0400 Subject: bcachefs: New tracepoint for bch2_trans_get_iter() Trying to debug an issue where after traverse_all() we shouldn't have to traverse any iterators... yet we are Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 32 +++++++++++++++++++-------- fs/bcachefs/trace.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 757e5650f33c..525afe3509ec 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2020,6 +2020,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter, *best = NULL; + struct bpos real_pos, pos_min = POS_MIN; + + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_node_type_is_extents(btree_id) && + !(flags & BTREE_ITER_NOT_EXTENTS) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + flags |= BTREE_ITER_IS_EXTENTS; if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && !btree_type_has_snapshots(btree_id)) @@ -2029,6 +2036,12 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, pos.snapshot = btree_type_has_snapshots(btree_id) ? U32_MAX : 0; + real_pos = pos; + + if ((flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(pos, POS_MAX)) + real_pos = bpos_nosnap_successor(pos); + trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2037,8 +2050,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, continue; if (best) { - int cmp = bkey_cmp(bpos_diff(best->real_pos, pos), - bpos_diff(iter->real_pos, pos)); + int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), + bpos_diff(iter->real_pos, real_pos)); if (cmp < 0 || ((cmp == 0 && btree_iter_keep(trans, iter)))) @@ -2048,6 +2061,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, best = iter; } + trace_trans_get_iter(_RET_IP_, trans->ip, + btree_id, + &real_pos, locks_want, + best ? &best->real_pos : &pos_min, + best ? best->locks_want : 0, + best ? 
best->uptodate : BTREE_ITER_NEED_TRAVERSE); + if (!best) { iter = btree_trans_iter_alloc(trans); bch2_btree_iter_init(trans, iter, btree_id); @@ -2061,12 +2081,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, trans->iters_live |= 1ULL << iter->idx; trans->iters_touched |= 1ULL << iter->idx; - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS) && - !(flags & BTREE_ITER_ALL_SNAPSHOTS)) - flags |= BTREE_ITER_IS_EXTENTS; - iter->flags = flags; iter->snapshot = pos.snapshot; @@ -2098,7 +2112,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, iter->min_depth = depth; bch2_btree_iter_set_pos(iter, pos); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + btree_iter_set_search_pos(iter, real_pos); return iter; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c6d98f4c50e7..7e518f7618d4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -528,6 +528,62 @@ TRACE_EVENT(copygc, __entry->buckets_moved, __entry->buckets_not_moved) ); +TRACE_EVENT(trans_get_iter, + TP_PROTO(unsigned long caller, unsigned long ip, + enum btree_id btree_id, + struct bpos *pos_want, + unsigned locks_want, + struct bpos *pos_found, + unsigned locks_found, + unsigned uptodate), + TP_ARGS(caller, ip, btree_id, + pos_want, locks_want, + pos_found, locks_found, + uptodate), + + TP_STRUCT__entry( + __field(unsigned long, caller ) + __field(unsigned long, ip ) + __field(u8, btree_id ) + __field(u8, uptodate ) + __field(u8, locks_want ) + __field(u8, locks_found ) + __field(u64, pos_want_inode ) + __field(u64, pos_want_offset ) + __field(u32, pos_want_snapshot ) + __field(u64, pos_found_inode ) + __field(u64, pos_found_offset ) + __field(u32, pos_found_snapshot ) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->ip = ip; + __entry->btree_id = btree_id; + __entry->uptodate = uptodate; + __entry->pos_want_inode = pos_want->inode; + __entry->pos_want_offset = pos_want->offset; + __entry->pos_want_snapshot = pos_want->snapshot; + __entry->pos_found_inode = pos_found->inode; + __entry->pos_found_offset = pos_found->offset; + __entry->pos_found_snapshot = pos_found->snapshot; + ), + + TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u", + (void *) __entry->caller, + (void *) __entry->ip, + __entry->btree_id, + __entry->uptodate, + __entry->pos_want_inode, + __entry->pos_want_offset, + __entry->pos_want_snapshot, + __entry->locks_want, + __entry->pos_found_inode, + __entry->pos_found_offset, + __entry->pos_found_snapshot, + __entry->locks_found) +); + TRACE_EVENT(transaction_restart_ip, TP_PROTO(unsigned long caller, unsigned long ip), TP_ARGS(caller, ip), -- cgit From d99af4f194d7af8aa11233707826875ef0704034 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 22:32:44 -0400 Subject: bcachefs: Call bch2_inconsistent_error() on missing stripe/indirect extent Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 4 ++++ fs/bcachefs/io.c | 1 + 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e6e984587b5d..d6f0325affcc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -902,6 +902,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); + bch2_inconsistent_error(c); return -EIO; } @@ 
-1019,6 +1020,7 @@ static int bch2_mark_stripe(struct bch_fs *c, if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); + bch2_inconsistent_error(c); return -1; } @@ -1503,6 +1505,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_fs_inconsistent(c, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); + bch2_inconsistent_error(c); ret = -EIO; goto out; } @@ -1743,6 +1746,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, bch2_fs_inconsistent(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", p.k->p.inode, p.k->p.offset, p.k->size, idx); + bch2_inconsistent_error(c); ret = -EIO; goto err; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9c46f67c0d8e..1e0effcece7f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1960,6 +1960,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, k.k->type != KEY_TYPE_indirect_inline_data) { bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, "pointer to nonexistent indirect extent"); + bch2_inconsistent_error(trans->c); ret = -EIO; goto err; } -- cgit From baa6502905df0acb94afbf6c93f51a1f2aa4c7ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Apr 2021 14:02:00 -0400 Subject: bcachefs: Change bch2_btree_key_cache_count() to exclude dirty keys We're seeing livelocks that appear to be due to bch2_btree_key_cache_scan repeatedly scanning and blocking other tasks from using the key cache lock - we probably shouldn't be reporting objects that can't actually be freed yet. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f8b9ca4dfb2b..221cb0f46db0 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -646,8 +646,10 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, struct bch_fs *c = container_of(shrink, struct bch_fs, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; + long nr = atomic_long_read(&bc->nr_keys) - + atomic_long_read(&bc->nr_dirty); - return atomic_long_read(&bc->nr_keys); + return max(0L, nr); } void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) -- cgit From d4b4422345fcb4e284260bd52166e189c137e846 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Apr 2021 14:03:13 -0400 Subject: bcachefs: Change copygc wait amount to be min of per device waits We're seeing a filesystem get stuck when all devices but one have no more reclaimable buckets - because the copygc wait amount is curretly filesystem wide. This patch should fix that, possibly at the expensive of running too much when only one or a few devices is full and the rebalance thread needs to move data around. 
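A standalone sketch of the new calculation, with invented numbers: each device contributes its own wait (allowed fragmentation minus current fragmentation, floored at zero), and copygc waits for the minimum of those rather than for a filesystem-wide total.

  #include <stdio.h>
  #include <stdint.h>

  struct dev_frag {
      int64_t fragmented_allowed; /* half the reclaimable bucket space */
      int64_t fragmented;         /* space currently lost to fragmentation */
  };

  static int64_t copygc_wait_amount(const struct dev_frag *d, int nr)
  {
      int64_t wait = INT64_MAX;

      for (int i = 0; i < nr; i++) {
          int64_t this_wait = d[i].fragmented_allowed - d[i].fragmented;

          if (this_wait < 0)
              this_wait = 0;
          if (this_wait < wait)
              wait = this_wait;
      }
      return wait;
  }

  int main(void)
  {
      /* illustrative only: one nearly-empty device, one full one */
      struct dev_frag devs[] = {
          { 1 << 20, 1 << 10 },
          { 1 << 20, 1 << 20 },
      };

      printf("wait = %lld\n", (long long) copygc_wait_amount(devs, 2));
      return 0;
  }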
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index f9146ccd70ef..acb4d943db79 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -293,17 +293,19 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_idx; - u64 fragmented_allowed = 0, fragmented = 0; + s64 wait = S64_MAX, fragmented_allowed, fragmented; for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed += ((__dev_buckets_reclaimable(ca, usage) * + fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * ca->mi.bucket_size) >> 1); - fragmented += usage.d[BCH_DATA_user].fragmented; + fragmented = usage.d[BCH_DATA_user].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } - return max_t(s64, 0, fragmented_allowed - fragmented); + return wait; } static int bch2_copygc_thread(void *arg) -- cgit From 050197b1c1df1cfee84523bf2183c8674e06d10f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 19:36:12 -0400 Subject: bcachefs: Ensure that fpunch updates inode timestamps Fixes xfstests generic/059 Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 14 ++++++++++++++ fs/bcachefs/fs.c | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a7cd85647354..fbf171a4c191 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2407,6 +2407,15 @@ err: /* fallocate: */ +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -2444,6 +2453,11 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME) ?: ret; + mutex_unlock(&inode->ei_update_lock); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77db405e3418..67e9a354ad37 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -145,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - bch2_trans_init(&trans, c, 0, 0); + bch2_trans_init(&trans, c, 0, 256); retry: bch2_trans_begin(&trans); -- cgit From 2ce867df3161886cfc6baf54aa9ef53f2281cdee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 22:12:07 -0400 Subject: bcachefs: Make sure to initialize j->last_flushed If the journal reclaim thread makes it to the timeout without ever initializing j->last_flushed, we could end up sleeping for a very long time. 
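The hazard is easiest to see with plain numbers. Treating times as simple counters (standing in for jiffies, so wraparound is ignored in this sketch), a stale or never-initialized last_flushed makes last_flushed + delay land far outside the window we actually want to sleep for, so the next wakeup has to be clamped to [now, now + delay].

  #include <stdio.h>

  static unsigned long next_wakeup(unsigned long last_flushed,
                                   unsigned long now, unsigned long delay)
  {
      unsigned long next = last_flushed + delay;

      /* clamp into the window we are actually willing to sleep for */
      if (next < now || next > now + delay)
          next = now + delay;
      return next;
  }

  int main(void)
  {
      unsigned long now = 1000000, delay = 100;
      unsigned long next = next_wakeup(0, now, delay);

      /*
       * Without the clamp, next would be 100 - far in the past - and an
       * unsigned sleep of next - now would underflow to a huge value.
       */
      printf("sleep for %lu\n", next - now);  /* 100 */
      return 0;
  }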
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 4 ++++ fs/bcachefs/journal_reclaim.c | 18 ++++++++++++------ fs/bcachefs/journal_reclaim.h | 8 +++----- fs/bcachefs/journal_types.h | 1 + 4 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2724a58ada05..a70540853586 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1189,6 +1189,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "nr noflush writes:\t%llu\n" "nr direct reclaim:\t%llu\n" "nr background reclaim:\t%llu\n" + "reclaim kicked:\t\t%u\n" + "reclaim runs in:\t%u ms\n" "current entry sectors:\t%u\n" "current entry error:\t%u\n" "current entry:\t\t", @@ -1204,6 +1206,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->nr_noflush_writes, j->nr_direct_reclaim, j->nr_background_reclaim, + j->reclaim_kicked, + jiffies_to_msecs(j->next_reclaim - jiffies), j->cur_entry_sectors, j->cur_entry_error); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 416f8611f008..812620d3de31 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -677,13 +677,15 @@ int bch2_journal_reclaim(struct journal *j) static int bch2_journal_reclaim_thread(void *arg) { struct journal *j = arg; - unsigned long next; + unsigned long delay, now; int ret = 0; set_freezable(); kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + j->last_flushed = jiffies; + while (!ret && !kthread_should_stop()) { j->reclaim_kicked = false; @@ -691,18 +693,22 @@ static int bch2_journal_reclaim_thread(void *arg) ret = __bch2_journal_reclaim(j, false); mutex_unlock(&j->reclaim_lock); - next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + now = jiffies; + delay = msecs_to_jiffies(j->reclaim_delay_ms); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) + j->next_reclaim = now + delay; while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (kthread_should_stop()) break; if (j->reclaim_kicked) break; - if (time_after_eq(jiffies, next)) + if (time_after_eq(jiffies, j->next_reclaim)) break; - schedule_timeout(next - jiffies); - try_to_freeze(); + schedule_timeout(j->next_reclaim - jiffies); } __set_current_state(TASK_RUNNING); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index adf1f5c981cd..0fd1af120db5 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -8,11 +8,9 @@ static inline void journal_reclaim_kick(struct journal *j) { struct task_struct *p = READ_ONCE(j->reclaim_thread); - if (p && !j->reclaim_kicked) { - j->reclaim_kicked = true; - if (p) - wake_up_process(p); - } + j->reclaim_kicked = true; + if (p) + wake_up_process(p); } unsigned bch2_journal_dev_buckets_available(struct journal *, diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index f597eb78e66e..6e2a2d6b8346 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -248,6 +248,7 @@ struct journal { wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; + unsigned long next_reclaim; u64 nr_direct_reclaim; u64 nr_background_reclaim; -- cgit From 4f6dad46cb4b6db75758bc790a74d7f0f04e450b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 00:21:54 -0400 Subject: bcachefs: Add a tracepoint for 
when we block on journal reclaim Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 ++ fs/bcachefs/trace.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e537bd64e1fb..2e7b9210564d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -725,6 +725,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); + trace_trans_blocked_journal_reclaim(trans->ip); + wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); if (ret < 0) diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7e518f7618d4..b0a696ae4fc1 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -621,6 +621,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, -- cgit From 595c1e9bab7fd5512250d0e297e50a549af59b1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 22:51:42 -0400 Subject: bcachefs: Fix time handling There were some overflows in the time conversion functions - fix this by converting tv_sec and tv_nsec separately. Also, set sb->time_min and sb->time_max. Fixes xfstest generic/258. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 23 ++++++++++++++--------- fs/bcachefs/fs.c | 4 +++- fs/bcachefs/super-io.c | 10 ++++++++-- 3 files changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 323705f352de..c47e69931b8a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -605,11 +605,13 @@ struct bch_fs { u64 time_base_lo; u32 time_base_hi; - u32 time_precision; + unsigned time_units_per_sec; + unsigned nsec_per_time_unit; u64 features; u64 compat; } sb; + struct bch_sb_handle disk_sb; unsigned short block_bits; /* ilog2(block_size) */ @@ -872,19 +874,22 @@ static inline unsigned block_bytes(const struct bch_fs *c) return c->opts.block_size << 9; } -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) +static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) { - return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); + struct timespec64 t; + s32 rem; + + time += c->sb.time_base_lo; + + t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); + t.tv_nsec = rem * c->sb.nsec_per_time_unit; + return t; } static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) { - s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; - - if (c->sb.time_precision == 1) - return ns; - - return div_s64(ns, c->sb.time_precision); + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; } static inline s64 bch2_current_time(struct bch_fs *c) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 67e9a354ad37..b00f35201132 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1565,7 +1565,9 @@ got_sb: #endif sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; - sb->s_time_gran = c->sb.time_precision; + sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, 
c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); c->vfs_sb = sb; strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0de6f0c0cb4..4c7cea4cfc2b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -373,9 +373,15 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); - c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; + + /* XXX this is wrong, we need a 96 or 128 bit integer type */ + c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), + c->sb.nsec_per_time_unit); c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); - c->sb.time_precision = le32_to_cpu(src->time_precision); + c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); -- cgit From e68031fb468aff9963d344d04e2de8824c83c8ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 16:55:26 -0400 Subject: bcachefs: Mark newly allocated btree nodes as accessed This was a major oversight - this means under memory pressure we can end up reading in a btree node, then having it evicted before we get to use it. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 85c19e4e5216..a13e5eef868e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -590,6 +590,7 @@ out: b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); + set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); -- cgit From ac1019d32b31c8bca6e0e5f86b403e91a1da5786 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 15:37:47 -0400 Subject: bcachefs: Clean up bch2_btree_and_journal_walk() Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 13 +++++-------- fs/bcachefs/alloc_background.h | 3 +-- fs/bcachefs/ec.c | 14 +++++--------- fs/bcachefs/ec.h | 3 +-- fs/bcachefs/recovery.c | 36 ++++++++++++------------------------ fs/bcachefs/recovery.h | 7 ++----- 6 files changed, 26 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2d532fe4d30b..996b1afd1380 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -261,16 +261,14 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_s_c k) +static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) { struct bch_dev *ca; struct bucket *g; struct bkey_alloc_unpacked u; - if (level || - (k.k->type != KEY_TYPE_alloc && - k.k->type != KEY_TYPE_alloc_v2)) + if (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -289,13 +287,12 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, return 0; } -int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +int bch2_alloc_read(struct bch_fs *c) { int ret; down_read(&c->gc_lock); - ret = 
bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, - NULL, bch2_alloc_read_fn); + ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); up_read(&c->gc_lock); if (ret) { bch_err(c, "error reading alloc info: %i", ret); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ad15a80602c0..9cadfdb5b83d 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -91,8 +91,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } -struct journal_keys; -int bch2_alloc_read(struct bch_fs *, struct journal_keys *); +int bch2_alloc_read(struct bch_fs *); static inline void bch2_wake_allocator(struct bch_dev *ca) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7d681a7254c4..4fc774631d20 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1630,26 +1630,22 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) return ret; } -static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_s_c k) +static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) { int ret = 0; - if (k.k->type == KEY_TYPE_stripe) { + if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_NOATOMIC); - if (ret) - return ret; - } return ret; } -int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) +int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes, - NULL, bch2_stripes_read_fn); + int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, + bch2_stripes_read_fn); if (ret) bch_err(c, "error reading stripes: %i", ret); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 744e51eaf327..e79626b59509 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -215,8 +215,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); void bch2_stripes_heap_start(struct bch_fs *); -struct journal_keys; -int bch2_stripes_read(struct bch_fs *, struct journal_keys *); +int bch2_stripes_read(struct bch_fs *); int bch2_stripes_write(struct bch_fs *, unsigned); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a9ccd14effe7..b35b297d4446 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -323,9 +323,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, } static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, enum btree_id btree_id, - btree_walk_node_fn node_fn, btree_walk_key_fn key_fn) { struct btree_and_journal_iter iter; @@ -338,15 +336,9 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - ret = key_fn(c, btree_id, b->c.level, k); - if (ret) - break; - if (b->c.level) { bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_and_journal_iter_advance(&iter); - child = bch2_btree_node_get_noiter(c, tmp.k, b->c.btree_id, b->c.level - 1, false); @@ -357,16 +349,17 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b btree_and_journal_iter_prefetch(c, b, iter); - ret = (node_fn ? 
node_fn(c, b) : 0) ?: - bch2_btree_and_journal_walk_recurse(c, child, - journal_keys, btree_id, node_fn, key_fn); + ret = bch2_btree_and_journal_walk_recurse(c, child, + btree_id, key_fn); six_unlock_read(&child->c.lock); - - if (ret) - break; } else { - bch2_btree_and_journal_iter_advance(&iter); + ret = key_fn(c, k); } + + if (ret) + break; + + bch2_btree_and_journal_iter_advance(&iter); } bch2_btree_and_journal_iter_exit(&iter); @@ -374,9 +367,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b return ret; } -int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, - enum btree_id btree_id, - btree_walk_node_fn node_fn, +int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, btree_walk_key_fn key_fn) { struct btree *b = c->btree_roots[btree_id].b; @@ -386,10 +377,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_k return 0; six_lock_read(&b->c.lock, NULL, NULL); - ret = (node_fn ? node_fn(c, b) : 0) ?: - bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, - node_fn, key_fn) ?: - key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); + ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); six_unlock_read(&b->c.lock); return ret; @@ -1120,14 +1108,14 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c, &c->journal_keys); + ret = bch2_alloc_read(c); if (ret) goto err; bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); err = "error reading stripes"; - ret = bch2_stripes_read(c, &c->journal_keys); + ret = bch2_stripes_read(c); if (ret) goto err; bch_verbose(c, "stripes_read done"); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index fa91851b9ed7..e5565e4f335a 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -45,12 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); -typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_s_c k); +typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); -int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, - btree_walk_node_fn, btree_walk_key_fn); +int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); -- cgit From 1784d43a88ecf0da66514f4b7f0f52b4182ef81c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 May 2021 23:32:26 -0400 Subject: bcachefs: Fix usage of last_seq + encryption jset->last_seq is in the region that's encrypted - on journal write completion, we were using it and getting garbage. This patch shadows it to fix. 
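A toy illustration of the shadow-copy approach; the xor scramble below merely stands in for the real encryption, the point being that any field needed after the buffer has been encrypted (here last_seq) must be copied aside first.

  #include <stdio.h>
  #include <stdint.h>

  struct jset_sketch {
      uint64_t seq;
      uint64_t last_seq;  /* lives inside the encrypted region */
  };

  struct journal_buf_sketch {
      struct jset_sketch data;
      uint64_t last_seq;  /* plaintext shadow of data.last_seq */
  };

  static void scramble(void *p, size_t len)   /* stand-in for encryption */
  {
      unsigned char *c = p;

      while (len--)
          *c++ ^= 0x5a;
  }

  int main(void)
  {
      struct journal_buf_sketch w = { .data = { .seq = 10, .last_seq = 7 } };

      w.last_seq = w.data.last_seq;       /* shadow before encrypting */
      scramble(&w.data, sizeof(w.data));

      /* on write completion, only the shadow is meaningful */
      printf("last_seq %llu (raw field now garbage: %llu)\n",
             (unsigned long long) w.last_seq,
             (unsigned long long) w.data.last_seq);
      return 0;
  }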
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 ++- fs/bcachefs/journal_io.c | 7 +++---- fs/bcachefs/journal_types.h | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a70540853586..a7cc0b167072 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -189,7 +189,8 @@ static bool __journal_entry_close(struct journal *j) * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ - buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4ab9cebee218..2d3fc33720b8 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1236,7 +1236,7 @@ static void journal_write_done(struct closure *cl) bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq, last_seq; + u64 v, seq; int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); @@ -1255,7 +1255,6 @@ static void journal_write_done(struct closure *cl) spin_lock(&j->lock); seq = le64_to_cpu(w->data->seq); - last_seq = le64_to_cpu(w->data->last_seq); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; @@ -1266,7 +1265,7 @@ static void journal_write_done(struct closure *cl) if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = last_seq; + j->last_seq_ondisk = w->last_seq; } /* @@ -1400,7 +1399,7 @@ void bch2_journal_write(struct closure *cl) test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = 0; + jset->last_seq = w->last_seq = 0; j->nr_noflush_writes++; } else { diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 6e2a2d6b8346..b4e4d5e0bc93 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -23,6 +23,7 @@ struct journal_buf { __BKEY_PADDED(key, BCH_REPLICAS_MAX); struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ unsigned buf_size; /* size in bytes of @data */ unsigned sectors; /* maximum size for current entry */ -- cgit From 5bc38f44fa8e938044bb3b69c8881f3682fe97f6 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Fri, 7 May 2021 22:29:02 -0400 Subject: bcachefs: Fix oob write in __bch2_btree_node_write Fix a possible out of bounds write in __bch2_btree_node_write when the data buffer padding is cleared up to the block size. The out of bounds write is possible if the data buffers size is not a multiple of the block size. 
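A minimal demonstration of the sizing rule, with a made-up 512-byte block size: if the padding is later cleared out to the block boundary, the allocation itself has to be rounded up to that boundary first.

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* round n up to the next multiple of a power-of-two block size */
  static size_t round_up_pow2(size_t n, size_t block)
  {
      return (n + block - 1) & ~(block - 1);
  }

  int main(void)
  {
      size_t block = 512, used = 1000;
      size_t bytes = round_up_pow2(used, block);      /* 1024, not 1000 */
      unsigned char *buf = malloc(bytes);

      if (!buf)
          return 1;

      /* clearing the padding out to the block boundary stays in bounds */
      memset(buf + used, 0, bytes - used);

      free(buf);
      printf("allocated %zu bytes for %zu used\n", bytes, used);
      return 0;
  }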
Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 389524ce1fb6..dbaa05ac764c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1500,6 +1500,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ bytes += 8; + /* buffer must be a multiple of the block size */ + bytes = round_up(bytes, block_bytes(c)); + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { -- cgit From 3a402c8dabf11142d78d0f6174b50db6ba846c4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 May 2021 20:43:43 -0400 Subject: bcachefs: Fix some refcounting bugs We really need debug mode assertions that ca->ref and ca->io_ref are used correctly. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_gc.c | 6 ++++-- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/recovery.c | 4 +++- fs/bcachefs/super.c | 12 ++++++++---- fs/bcachefs/super.h | 5 +---- 6 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 996b1afd1380..791066b6b39b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -371,7 +371,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) ret = bch2_alloc_write_key(&trans, iter, flags); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->ref); goto err; } bch2_btree_iter_next_slot(iter); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1c2eab41f7ca..bd4dd1d67a1d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1023,7 +1023,7 @@ static void bch2_gc_free(struct bch_fs *c) static int bch2_gc_done(struct bch_fs *c, bool initial, bool metadata_only) { - struct bch_dev *ca; + struct bch_dev *ca = NULL; bool verify = !metadata_only && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; @@ -1169,6 +1169,8 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: + if (ca) + percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); return ret; @@ -1177,7 +1179,7 @@ fsck_err: static int bch2_gc_start(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; + struct bch_dev *ca = NULL; unsigned i; int ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d6f0325affcc..a83d5de87d39 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2071,7 +2071,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_do(c, NULL, NULL, 0, + return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, __bch2_trans_mark_dev_sb(&trans, ca)); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b35b297d4446..cd538ecc1f3f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1328,8 +1328,10 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) + if (ret) { + percpu_ref_put(&ca->ref); goto err; + } } bch2_inode_init(c, &root_inode, 0, 0, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 78db2c0a5f5a..9065e264c567 100644 --- a/fs/bcachefs/super.c +++ 
b/fs/bcachefs/super.c @@ -629,9 +629,11 @@ static const char *bch2_fs_online(struct bch_fs *c) down_write(&c->state_lock); err = "error creating sysfs objects"; - __for_each_member_device(ca, c, i, NULL) - if (bch2_dev_sysfs_online(c, ca)) + for_each_member_device(ca, c, i) + if (bch2_dev_sysfs_online(c, ca)) { + percpu_ref_put(&ca->ref); goto err; + } list_add(&c->list, &bch_fs_list); err = NULL; @@ -1839,12 +1841,14 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) if (ret) return ERR_PTR(ret); - for_each_member_device(ca, c, i) + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) if (ca->disk_sb.bdev->bd_dev == dev) goto found; - ca = ERR_PTR(-ENOENT); found: + rcu_read_unlock(); + return ca; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 28e6d78f9fcd..b151bffcd3a3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -107,11 +107,8 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, return ca; } -#define __for_each_member_device(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) - #define for_each_member_device_rcu(ca, c, iter, mask) \ - __for_each_member_device(ca, c, iter, mask) + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) { -- cgit From 933532b8b25cd7b739f30d66a0e9fcc9e03d57ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 May 2021 20:31:27 -0400 Subject: bcachefs: Fix reflink trigger The trigger for reflink pointers wasn't always incrementing/decrementing the refcounts correctly - this patch fixes that logic. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 43 +++++++++++++++++++++++++++++++++++++------ fs/bcachefs/io.c | 5 ++++- 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a83d5de87d39..8806e8306e4e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1710,9 +1710,28 @@ static __le64 *bkey_refcount(struct bkey_i *k) } } +static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p, + u64 start, u64 end, + struct bkey_s_c k) +{ + if (start == end) + return false; + + start += le64_to_cpu(p.v->idx); + end += le64_to_cpu(p.v->idx); + + if (end <= bkey_start_offset(k.k)) + return false; + if (start >= k.k->p.offset) + return false; + return true; +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, + unsigned front_frag, + unsigned back_frag, unsigned flags) { struct bch_fs *c = trans->c; @@ -1720,6 +1739,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c k; struct bkey_i *n; __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; s64 ret; ret = trans_get_key(trans, BTREE_ID_reflink, @@ -1727,12 +1747,17 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret < 0) return ret; - if ((flags & BTREE_TRIGGER_OVERWRITE) && - (bkey_start_offset(k.k) < idx || - k.k->p.offset > idx + sectors)) + if (reflink_p_frag_references(p, 0, front_frag, k) && + reflink_p_frag_references(p, back_frag, p.k->size, k)) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); + add = -add; + } else if (reflink_p_frag_references(p, 0, front_frag, k) || + reflink_p_frag_references(p, back_frag, p.k->size, k)) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); goto out; + } - sectors = k.k->p.offset - idx; + sectors = min_t(u64, sectors, k.k->p.offset - idx); n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); @@ -1751,7 +1776,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, goto err; } - le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); + BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); + le64_add_cpu(refcount, add); if (!*refcount) { n->k.type = KEY_TYPE_deleted; @@ -1772,13 +1798,18 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, s64 sectors, unsigned flags) { u64 idx = le64_to_cpu(p.v->idx) + offset; + unsigned front_frag, back_frag; s64 ret = 0; sectors = abs(sectors); BUG_ON(offset + sectors > p.k->size); + front_frag = offset; + back_frag = offset + sectors; + while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, + front_frag, back_frag, flags); if (ret < 0) break; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1e0effcece7f..2b2e118114bd 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1959,7 +1959,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_reflink_v && k.k->type != KEY_TYPE_indirect_inline_data) { bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, - "pointer to nonexistent indirect extent"); + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); bch2_inconsistent_error(trans->c); ret = -EIO; goto err; -- cgit From 360746bf6f7cae871a683e1718bedba6a29f10dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 29 Apr 2021 21:44:05 -0400 Subject: bcachefs: Fix bch2_btree_iter_peek_with_updates() By not re-fetching the next update we were going into an infinite loop. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 525afe3509ec..8c343d5cec0c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1618,16 +1618,17 @@ static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) { struct bpos search_key = btree_iter_search_key(iter); - struct bkey_i *next_update = with_updates - ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) - : NULL; + struct bkey_i *next_update; struct bkey_s_c k; int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - +start: + next_update = with_updates + ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) + : NULL; btree_iter_set_search_pos(iter, search_key); while (1) { @@ -1643,9 +1644,8 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi if (likely(k.k)) { if (bkey_deleted(k.k)) { - btree_iter_set_search_pos(iter, - bkey_successor(iter, k.k->p)); - continue; + search_key = bkey_successor(iter, k.k->p); + goto start; } break; -- cgit From 909004d2f9f8d56997010eac3ae975e214ff9d0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 May 2021 16:56:26 -0400 Subject: bcachefs: Make sure to use BTREE_ITER_PREFETCH in fsck Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8ae4e4c30933..fcdcf42f85a4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -512,7 +512,9 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -621,7 +623,8 @@ static int check_extents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT); + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -719,7 +722,9 @@ static int check_dirents(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), 0); + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -920,7 +925,9 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), 0); + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -1108,7 +1115,9 @@ static int check_directory_structure(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1207,7 +1216,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_inodes, - POS(0, start), 0, k, ret) { + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1255,7 +1266,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1293,7 +1306,9 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_inodes, 
- POS(0, range_start), 0, k, ret) { + POS(0, range_start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.offset >= range_end) break; -- cgit From ffcf9ec78c133fb85ff13d8119ff404e11820834 Mon Sep 17 00:00:00 2001 From: Stijn Tintel Date: Thu, 13 May 2021 23:08:47 +0300 Subject: bcachefs: avoid out-of-bounds in split_devs Calling mount with an empty source string causes an out-of-bounds error in split_devs. Check the length of the source string to avoid this. Signed-off-by: Stijn Tintel Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b00f35201132..5eef67358cfb 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -32,6 +32,7 @@ #include #include #include +#include #include static struct kmem_cache *bch2_inode_cache; @@ -1324,6 +1325,9 @@ static char **split_devs(const char *_dev_name, unsigned *nr) char *dev_name = NULL, **devs = NULL, *s; size_t i, nr_devs = 0; + if (strlen(_dev_name) == 0) + return NULL; + dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) return NULL; -- cgit From baf056b87da88b0c9812d53f8b12072652d1c07b Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Sun, 9 May 2021 18:52:23 -0400 Subject: bcachefs: Fix error in parsing of mount options When parsing the mount options duplicate the given options. This is required as the options are parsed twice and strsep is used in parsing. The options will be modified into a possibly invalid options set for the second round of parsing if the options are not duplicated before parsing. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0cfbb56a57c1..64bf5a382d63 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -315,11 +315,20 @@ int bch2_opts_check_may_set(struct bch_fs *c) int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, char *options) { + char *copied_opts, *copied_opts_start; char *opt, *name, *val; int ret, id; u64 v; - while ((opt = strsep(&options, ",")) != NULL) { + if (!options) + return 0; + + copied_opts = kstrdup(options, GFP_KERNEL); + if (!copied_opts) + return -1; + copied_opts_start = copied_opts; + + while ((opt = strsep(&copied_opts, ",")) != NULL) { name = strsep(&opt, "="); val = opt; @@ -363,16 +372,24 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, bch2_opt_set_by_id(opts, id, v); } - return 0; + ret = 0; + goto out; + bad_opt: pr_err("Bad mount option %s", name); - return -1; + ret = -1; + goto out; bad_val: pr_err("Invalid value %s for mount option %s", val, name); - return -1; + ret = -1; + goto out; no_val: pr_err("Mount option %s requires a value", name); - return -1; + ret = -1; + goto out; +out: + kfree(copied_opts_start); + return ret; } /* io opts: */ -- cgit From ec4ab9d2fc08132113dc5d45ac68af2158377122 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 12 May 2021 14:07:57 -0400 Subject: bcachefs: Fix possible null deref on mount Ensure that the block device pointer in a superblock handle is not null before dereferencing it in bch2_dev_to_fs. The block device pointer may be null when mounting a new bcachefs filesystem given another mounted bcachefs filesystem exists that has at least one device that is offline. 
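The shape of the fix, as a tiny standalone sketch with invented types: a member device that is offline has no block device attached, so the pointer has to be tested before anything is compared through it.

  #include <stdio.h>

  struct bdev_sketch { unsigned long dev; };

  struct member_sketch {
      struct bdev_sketch *bdev;   /* NULL while the device is offline */
  };

  static int member_matches(const struct member_sketch *ca, unsigned long dev)
  {
      return ca->bdev && ca->bdev->dev == dev;
  }

  int main(void)
  {
      struct member_sketch offline = { .bdev = NULL };

      /* safely returns 0 instead of dereferencing a NULL pointer */
      printf("%d\n", member_matches(&offline, 0x801));
      return 0;
  }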
Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9065e264c567..71493b5ff695 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -117,7 +117,7 @@ struct bch_fs *bch2_dev_to_fs(dev_t dev) list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(ca, c, i, NULL) - if (ca->disk_sb.bdev->bd_dev == dev) { + if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; } -- cgit From 2b25de552f8a8d9cae5b54c83137c67e03ee1957 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 12 May 2021 20:54:37 -0400 Subject: bcachefs: Fix null deref in bch2_ioctl_read_super Do not attempt to cleanup the returned value of bch2_device_lookup if the returned value was an error pointer. We currently check to see if the returned value is null and run the cleanup otherwise. As a result, we attempt to run the cleanup on a error pointer. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 34085e32a159..b0cbbb70161d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -523,7 +523,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c, ret = copy_to_user((void __user *)(unsigned long)arg.sb, sb, vstruct_bytes(sb)); err: - if (ca) + if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); mutex_unlock(&c->sb_lock); return ret; -- cgit From faf1a5f41772984d492f9805ded9a34dcdce724d Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 5 May 2021 07:09:43 -0400 Subject: bcachefs: Fix out of bounds read in fs usage ioctl Fix a possible read out of bounds if bch2_ioctl_fs_usage is called when replica_entries_bytes is set to a value that is smaller than the size of bch_replicas_usage. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index b0cbbb70161d..99f112072ae5 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -414,7 +414,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_replicas_entry *src_e = cpu_replicas_entry(&c->replicas, i); - if (replicas_usage_next(dst_e) > dst_end) { + /* check that we have enough space for one replicas entry */ + if (dst_e + 1 > dst_end) { ret = -ERANGE; break; } -- cgit From e1036ce5810222620c98660d64ccc649652cf6c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 May 2021 21:28:37 -0400 Subject: bcachefs: Repair code for multiple types of data in same bucket bch2_check_fix_ptrs() is awkward, we need to find a way to improve it. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 53 +++++++++++++++++++++++++++++++++++++++----------- fs/bcachefs/extents.h | 24 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bd4dd1d67a1d..5b839cca8a9d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -427,18 +427,38 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; bool do_update = false; + char buf[200]; int ret = 0; bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (fsck_err_on(g->mark.data_type && + g->mark.data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->mark.data_type], + bch2_data_types[data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (data_type == BCH_DATA_btree) { + g2->_mark.data_type = g->_mark.data_type = data_type; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } if (fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen)) { + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { g2->_mark.gen = g->_mark.gen = p.ptr.gen; g2->gen_valid = g->gen_valid = true; @@ -449,10 +469,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen)) { + p.ptr.gen, g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { g2->_mark.gen = g->_mark.gen = p.ptr.gen; g2->gen_valid = g->gen_valid = true; @@ -468,23 +490,29 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u", + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen)) + p.ptr.gen, g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) do_update = true; if (p.has_ec) { struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); if (fsck_err_on(!m || !m->alive, c, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx)) + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) do_update = true; if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, - "pointer does not match stripe %llu", - (u64) p.ec.idx)) + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (bch2_bkey_val_to_text(&PBUF(buf), c, 
*k), buf))) do_update = true; } } @@ -525,11 +553,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr, true); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0); + gen_cmp(ptr->gen, g->mark.gen) < 0) || + (g->mark.data_type && + g->mark.data_type != data_type); })); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ccee43a2019d..9999805f955e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -529,6 +529,30 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } +static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return BCH_DATA_user; + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } +} + unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -- cgit From d125615a4e22194dff9cecd12ec5a608f8c0f920 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Fri, 14 May 2021 20:02:44 -0400 Subject: bcachefs: properly initialize used values - Ensure the second key value in bch_hash_info is initialized to zero if the info type is of type BCH_STR_HASH_SIPHASH. - Initialize the possibly returned value in bch2_inode_create. Assuming bch2_btree_iter_peek returns bkey_s_c_null, the uninitialized value of ret could be returned to the user as an error pointer. 
- Fix compiler warning in initialization of bkey_s_c_stripe fs/bcachefs/buckets.c:1646:35: warning: suggest braces around initialization of subobject [-Wmissing-braces] struct bkey_s_c_stripe new_s = { NULL }; ^~~~ Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/str_hash.h | 17 ++++++++++------- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8806e8306e4e..dad1c7d27bab 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1646,8 +1646,8 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - struct bkey_s_c_stripe old_s = { NULL }; - struct bkey_s_c_stripe new_s = { NULL }; + struct bkey_s_c_stripe old_s = { .k = NULL }; + struct bkey_s_c_stripe new_s = { .k = NULL }; struct bch_replicas_padded r; unsigned i; int ret = 0; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 66b0bc01c75e..2ae55467c583 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -478,7 +478,7 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, pos, *hint; - int ret; + int ret = 0; u64 cpu = raw_smp_processor_id(); unsigned bits = (c->opts.inodes_32bit diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index b85f895de346..eab669af7032 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -33,10 +33,11 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u8 type; - union { - __le64 crc_key; - SIPHASH_KEY siphash_key; - }; + /* + * For crc32 or crc64 string hashes the first key value of + * the siphash_key (k0) is used as the key. + */ + SIPHASH_KEY siphash_key; }; static inline struct bch_hash_info @@ -46,7 +47,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) struct bch_hash_info info = { .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ~(~0U << INODE_STR_HASH_BITS), - .crc_key = bi->bi_hash_seed, + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { @@ -76,10 +77,12 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, { switch (info->type) { case BCH_STR_HASH_CRC32C: - ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); + ctx->crc32c = crc32c(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); break; case BCH_STR_HASH_CRC64: - ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); + ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); break; case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: -- cgit From 6ebe32b94c77f80aca235e476bad321ecc355035 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 May 2021 23:46:08 -0400 Subject: bcachefs: Fix locking in __bch2_set_nr_journal_buckets() We weren't holding mark_lock correctly - it's needed for the new_fs path. 
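In outline, the fix takes mark_lock on the new_fs path around both the bucket allocation and the later bch2_mark_metadata_bucket() call, guarding the locks on c being non-NULL as the diff does. A rough sketch distilled from the diff below (error handling and the ja->buckets bookkeeping elided):

	if (new_fs) {
		if (c)
			percpu_down_read(&c->mark_lock);
		b = bch2_bucket_alloc_new_fs(ca);
		/* ... error handling ... */
	}

	if (c)
		spin_lock(&c->journal.lock);
	/* ... grow ja->buckets / ja->bucket_seq, pick the insert position ... */
	if (c)
		spin_unlock(&c->journal.lock);

	if (new_fs) {
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), 0);
		if (c)
			percpu_up_read(&c->mark_lock);
	}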
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a7cc0b167072..7cbea06f57fd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -804,8 +804,11 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long b; if (new_fs) { + if (c) + percpu_down_read(&c->mark_lock); b = bch2_bucket_alloc_new_fs(ca); if (b < 0) { + percpu_up_read(&c->mark_lock); ret = -ENOSPC; goto err; } @@ -820,10 +823,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } b = sector_to_bucket(ca, ob->ptr.offset); + } - percpu_down_read(&c->mark_lock); + if (c) spin_lock(&c->journal.lock); - } /* * XXX @@ -850,15 +853,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + if (c) + spin_unlock(&c->journal.lock); + if (new_fs) { bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); + if (c) + percpu_up_read(&c->mark_lock); } else { - spin_unlock(&c->journal.lock); - percpu_up_read(&c->mark_lock); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, b, BCH_DATA_journal, -- cgit From 2cd0563461b68b895bd683e89ad70f3975e98e85 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Sun, 16 May 2021 21:53:55 -0600 Subject: bcachefs: made changes to support clang, fixed a couple bugs fs/bcachefs/bset.c edited prefetch macro to add clang support fs/bcachefs/btree_iter.c bugfix: initialize iter->real_pos in bch2_btree_iter_init for later use fs/bcachefs/io.c bugfix: eliminated undefined behavior (negative bitshift) fs/bcachefs/buckets.c bugfix: invert sign to handle 64bit abs() Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/buckets.c | 4 +++- fs/bcachefs/io.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 8c038da3c108..e569d9a9b906 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1181,7 +1181,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, static inline void prefetch_four_cachelines(void *p) { -#ifdef CONFIG_X86_64 +#if (CONFIG_X86_64 && !defined(__clang__)) asm(".intel_syntax noprefix;" "prefetcht0 [%0 - 127 + 64 * 0];" "prefetcht0 [%0 - 127 + 64 * 1];" diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8c343d5cec0c..745f1ac4f538 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1898,6 +1898,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->trans = trans; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; + iter->real_pos = POS_MIN; iter->level = 0; iter->min_depth = 0; iter->locks_want = 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index dad1c7d27bab..a167e30e1412 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1801,7 +1801,9 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, unsigned front_frag, back_frag; s64 ret = 0; - sectors = abs(sectors); + if (sectors < 0) + sectors = -sectors; + BUG_ON(offset + sectors > p.k->size); front_frag = offset; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 2b2e118114bd..9fc858a0a496 100644 --- a/fs/bcachefs/io.c 
+++ b/fs/bcachefs/io.c @@ -121,7 +121,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) * the time: */ if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0 << 5)) + now & ~(~0U << 5)) break; new = ewma_add(old, io_latency, 5); -- cgit From 2e8f9d23cbc75a1d45506186da8c7a5c1605b622 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 May 2021 00:08:06 -0400 Subject: bcachefs: Make sure to pass a disk reservation to bch2_extent_update() It's needed when we split an existing compressed extent - we get a null ptr deref without it. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 0978ad92614c..405a194d10e5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "buckets.h" #include "extents.h" #include "inode.h" #include "io.h" @@ -224,6 +225,8 @@ s64 bch2_remap_range(struct bch_fs *c, BTREE_ITER_INTENT); while (ret == 0 || ret == -EINTR) { + struct disk_reservation disk_res = { 0 }; + bch2_trans_begin(&trans); if (fatal_signal_pending(current)) { @@ -287,8 +290,9 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset - dst_iter->pos.offset)); ret = bch2_extent_update(&trans, dst_iter, new_dst.k, - NULL, journal_seq, + &disk_res, journal_seq, new_i_size, i_sectors_delta); + bch2_disk_reservation_put(c, &disk_res); if (ret) continue; -- cgit From bbfcb4519d61cdd83d1f9741e4f8525c0ffa8a8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 May 2021 00:28:50 -0400 Subject: bcachefs: Fix bch2_extent_can_insert() call It was being skipped when hole punching, leading to problems when splitting compressed extents. 
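The fix itself is just a reordering in extent_update_to_keys(): run bch2_extent_can_insert() before the early return for deleted keys, so hole punches (which insert deleted keys over existing extents) still get the check. Roughly, mirroring the diff below:

	ret = bch2_extent_can_insert(trans, n.iter, n.k);
	if (ret)
		return ret;

	/* deleted keys from hole punching must not skip the check above */
	if (bkey_deleted(&n.k->k))
		return 0;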
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 6 +++--- fs/bcachefs/buckets.c | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2e7b9210564d..fbe6a17ffd8a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -806,13 +806,13 @@ static int extent_update_to_keys(struct btree_trans *trans, { int ret; - if (bkey_deleted(&n.k->k)) - return 0; - ret = bch2_extent_can_insert(trans, n.iter, n.k); if (ret) return ret; + if (bkey_deleted(&n.k->k)) + return 0; + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a167e30e1412..afee0594efae 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1270,14 +1270,15 @@ int bch2_mark_update(struct btree_trans *trans, static noinline __cold void fs_usage_apply_warn(struct btree_trans *trans, - unsigned disk_res_sectors) + unsigned disk_res_sectors, + s64 should_not_have_added) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; char buf[200]; - bch_err(c, "disk usage increased more than %u sectors reserved", - disk_res_sectors); + bch_err(c, "disk usage increased %lli more than %u sectors reserved", + should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { pr_err("while inserting"); @@ -1309,6 +1310,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, } } } + __WARN(); } void bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1367,7 +1369,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, preempt_enable(); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - fs_usage_apply_warn(trans, disk_res_sectors); + fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); } /* trans_mark: */ -- cgit From 82355e2882339067e1d6deaec68e629f63259c0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 May 2021 16:10:06 -0400 Subject: bcachefs: Fix a memcpy call Not supposed to pass a null ptr to memcpy (even if the size is 0). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fcdcf42f85a4..a40459d2b0f0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1161,7 +1161,8 @@ static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) return -ENOMEM; } - memcpy(d, t->d, t->size * sizeof(t->d[0])); + if (t->d) + memcpy(d, t->d, t->size * sizeof(t->d[0])); kvfree(t->d); t->d = d; -- cgit From c21d5377791d94fca0d990eee82111572412640c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 May 2021 16:43:30 -0400 Subject: bcachefs: Fix for bch2_bkey_pack_pos() not initializing len/version fields This bug led to push_whiteout() generating whiteouts that failed bch2_bkey_invalid() due to nonzero length fields - oops. 
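The point of the fix: a whiteout is expected to unpack to a key with zero length, but packing just a position only writes the position fields, so stale bytes where the len/version fields live in the packed layout would survive. The diff below replaces the single-word clear with a loop over the whole packed key - in essence:

	unsigned i;

	for (i = 0; i < f->key_u64s; i++)	/* was just *w = 0 */
		w[i] = 0;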
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index a0379f980f7e..5de88a93f33f 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -446,8 +446,15 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, struct bpos orig = in; #endif bool exact = true; + unsigned i; - *w = 0; + /* + * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 + * byte header, but pack_pos() won't if the len/version fields are big + * enough - we need to make sure to zero them out: + */ + for (i = 0; i < f->key_u64s; i++) + w[i] = 0; if (unlikely(in.snapshot < le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { -- cgit From ed34341189478344eb54588ce73f190b86da4d5e Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Tue, 18 May 2021 20:36:20 -0400 Subject: bcachefs: statfs resports incorrect avail blocks The current implementation of bch_statfs does not scale the number of available blocks provided in f_bavail by the reserve factor. This causes an allocation of a file of this size to fail. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 7 ------- fs/bcachefs/buckets.h | 7 +++++++ fs/bcachefs/fs.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index afee0594efae..b37cdf7279de 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -261,18 +261,11 @@ void bch2_fs_usage_to_text(struct printbuf *out, } } -#define RESERVE_FACTOR 6 - static u64 reserve_factor(u64 r) { return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); } -static u64 avail_factor(u64 r) -{ - return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); -} - u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { return min(fs_usage->u.hidden + diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7463e6420b14..04a2a9310cdd 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -294,6 +294,13 @@ static inline int bch2_disk_reservation_get(struct bch_fs *c, return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); } +#define RESERVE_FACTOR 6 + +static inline u64 avail_factor(u64 r) +{ + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); +} + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); void bch2_dev_buckets_free(struct bch_dev *); int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5eef67358cfb..9a595c205dbf 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1274,8 +1274,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = (usage.capacity - usage.used) >> shift; - buf->f_bavail = buf->f_bfree; + buf->f_bfree = usage.free >> shift; + buf->f_bavail = avail_factor(usage.free) >> shift; buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; -- cgit From ef1b20924b0f584740094fdf7166acfb80338f0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 May 2021 23:53:43 -0400 Subject: bcachefs: Ratelimiting for writeback IOs Writeback throttling is a kernel config option and not always enabled. When it's not enabled we need a fallback, to avoid unbounded memory pinning and work item backlogs. 
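The fallback is a plain counting semaphore bounding how many writepage IOs can be in flight at once; pulled together from the three hunks below, the pattern is roughly:

	/* in struct bch_fs: */
	struct semaphore	io_in_flight;

	/* at fs allocation time, allow up to 128 writepage bios in flight: */
	sema_init(&c->io_in_flight, 128);

	/* before submitting a writepage io: */
	down(&io->op.c->io_in_flight);
	closure_call(&io->op.cl, bch2_write, NULL, &io->cl);

	/* and in the writepage completion path: */
	up(&io->op.c->io_in_flight);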
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/fs-io.c | 4 ++++ fs/bcachefs/super.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c47e69931b8a..c5cafbd6d87a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -191,6 +191,7 @@ #include #include #include +#include #include #include #include @@ -746,6 +747,7 @@ struct bch_fs { struct rw_semaphore gc_lock; /* IO PATH */ + struct semaphore io_in_flight; struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index fbf171a4c191..763195ed0b3c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -997,6 +997,8 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio_vec *bvec; unsigned i; + up(&io->op.c->io_in_flight); + if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); @@ -1059,6 +1061,8 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; + down(&io->op.c->io_in_flight); + w->io = NULL; closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 71493b5ff695..b0bcd3bbb53b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -717,6 +717,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); + sema_init(&c->io_in_flight, 128); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; -- cgit From d6462f494dcf57422f36f9c4704c2c128354f6c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 18 May 2021 23:17:03 -0400 Subject: bcachefs: Split extents if necessary in bch2_trans_update() Currently, we handle multiple overlapping extents in the same transaction commit by doing fixups in bch2_trans_update() - this patch extents that to split updates when necessary. The next patch that changes the reflink code to not fragment extents when making them indirect will require this. 
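A concrete (made-up) example of the new case: the transaction already carries an update for an extent spanning 32..96, and a second update for 48..64 is added. Previously that tripped the BUG_ON; now the existing entry is duplicated, the copy is cut back to end where the new key starts (leaving 32..48) and inserted as an extra update, after which the existing fixup for an extent overwriting the start of another update handles the rest. The splitting step from the diff below, in miniature:

	struct btree_insert_entry split = *i;

	split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k));
	bkey_copy(split.k, i->k);
	bch2_cut_back(bkey_start_pos(&n.k->k), split.k);	/* keep the front part */

	/* get an iterator at the split's start position and insert the
	 * split entry into trans->updates just before the original */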
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 27 ++++++++++++++++++++++++--- fs/bcachefs/io.c | 5 ++--- fs/bcachefs/reflink.c | 6 ++++-- 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fbe6a17ffd8a..cc844ca81bcc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1099,9 +1099,30 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && i->btree_id == n.btree_id && bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { - /* We don't handle splitting extents here: */ - BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i->k->k)) > 0); + if (bkey_cmp(bkey_start_pos(&n.k->k), + bkey_start_pos(&i->k->k)) > 0) { + struct btree_insert_entry split = *i; + int ret; + + BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); + + split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); + ret = PTR_ERR_OR_ZERO(split.k); + if (ret) + return ret; + + bkey_copy(split.k, i->k); + bch2_cut_back(bkey_start_pos(&n.k->k), split.k); + + split.iter = bch2_trans_get_iter(trans, split.btree_id, + bkey_start_pos(&split.k->k), + BTREE_ITER_INTENT); + split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, split.iter); + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, split); + i++; + } /* * When we have an extent that overwrites the start of another diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9fc858a0a496..5a45e738bc80 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -342,9 +342,8 @@ int bch2_extent_update(struct btree_trans *trans, bch2_trans_iter_put(trans, inode_iter); } - bch2_trans_update(trans, iter, k, 0); - - ret = bch2_trans_commit(trans, disk_res, journal_seq, + ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); if (ret) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 405a194d10e5..ec8532b39a49 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -155,7 +155,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - bch2_trans_update(trans, reflink_iter, r_v, 0); + ret = bch2_trans_update(trans, reflink_iter, r_v, 0); + if (ret) + goto err; r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) { @@ -168,7 +170,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: if (!IS_ERR(reflink_iter)) c->reflink_hint = reflink_iter->pos.offset; -- cgit From e7084c9c8151b226c83258ccdccf172b74688324 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 May 2021 21:21:49 -0400 Subject: bcachefs: Make bch2_remap_range respect O_SYNC Caught by xfstest generic/628 Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 763195ed0b3c..162f0eeb38df 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2941,6 +2941,11 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (pos_dst + ret > dst->v.i_size) i_size_write(&dst->v, pos_dst + ret); 
spin_unlock(&dst->v.i_lock); + + if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) && + !c->opts.journal_flush_disabled) + ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -- cgit From 16ac8c9523a2744545bb773b41433a5007deeacb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 May 2021 00:09:47 -0400 Subject: bcachefs: Fix inode backpointers in RENAME_OVERWRITE When we delete the dirent an inode points to, we need to zero out the backpointer fields - this was missed in the RENAME_OVERWRITE case. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 5 ++++- fs/bcachefs/fs-common.c | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index ec4666143f23..3bf6379cefe6 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -210,6 +210,8 @@ int bch2_dirent_rename(struct btree_trans *trans, if (mode != BCH_RENAME) *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter->pos.offset; /* Lookup src: */ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, @@ -290,7 +292,8 @@ int bch2_dirent_rename(struct btree_trans *trans, bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); out_set_offset: - *src_offset = new_src->k.p.offset; + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; out: bch2_trans_iter_put(trans, src_iter); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 34d69c3f6680..08c6af886df7 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -289,6 +289,13 @@ int bch2_rename_trans(struct btree_trans *trans, dst_inode_u->bi_dir = src_dir_u->bi_inum; dst_inode_u->bi_dir_offset = src_offset; } + + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; + } } if (mode == BCH_RENAME_OVERWRITE) { -- cgit From a6336910b1c30703fe0d5078ef1c656ea1d39096 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 May 2021 15:49:23 -0400 Subject: bcachefs: Fix for buffered writes getting -ENOSPC Buffered writes may have to increase their disk reservation at btree update time, due to compression and erasure coding being unpredictable: O_DIRECT writes should be checking for -ENOSPC, but buffered writes have already been accepted and should not. 
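The shape of the fix, pulled together from the hunks below: bch2_extent_update() gains a check_enospc argument, and the write path derives it from a new BCH_WRITE_CHECK_ENOSPC flag that the O_DIRECT path sets and the buffered/writeback path leaves clear. A sketch:

	/* callers now say whether running out of space is allowed to fail: */
	int bch2_extent_update(struct btree_trans *trans, struct btree_iter *iter,
			       struct bkey_i *k, struct disk_reservation *disk_res,
			       u64 *journal_seq, u64 new_i_size,
			       s64 *i_sectors_delta_total, bool check_enospc);

	/* inside, if usage isn't actually increasing we never fail: */
	if (!usage_increasing)
		check_enospc = false;

	ret = bch2_disk_reservation_add(trans->c, disk_res,
			disk_sectors_delta - disk_res->sectors,
			!check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);

	/* O_DIRECT writes opt in; buffered writes don't: */
	dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;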
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 +++ fs/bcachefs/fs-io.c | 3 ++- fs/bcachefs/io.c | 26 ++++++++++++++++---------- fs/bcachefs/io.h | 9 +++++---- fs/bcachefs/reflink.c | 3 ++- 5 files changed, 28 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cc844ca81bcc..3c4bf13d4ef9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -690,6 +690,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, } break; case BTREE_INSERT_ENOSPC: + BUG_ON(flags & BTREE_INSERT_NOFAIL); ret = -ENOSPC; break; case BTREE_INSERT_NEED_MARK_REPLICAS: @@ -743,6 +744,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, break; } + BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); + return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 162f0eeb38df..45e58ba34463 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1910,6 +1910,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) dio->op.flags |= BCH_WRITE_FLUSH; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); @@ -2725,7 +2726,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ret = bch2_extent_update(&trans, iter, &reservation.k_i, &disk_res, &inode->ei_journal_seq, - 0, &i_sectors_delta); + 0, &i_sectors_delta, true); i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5a45e738bc80..eafefb651d1e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -197,7 +197,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, bool *maybe_extending, - bool *should_check_enospc, + bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) { @@ -209,7 +209,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, int ret = 0; *maybe_extending = true; - *should_check_enospc = false; + *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; @@ -229,10 +229,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) : 0; - if (!*should_check_enospc && + if (!*usage_increasing && (new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *should_check_enospc = true; + *usage_increasing = true; if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* @@ -267,11 +267,12 @@ int bch2_extent_update(struct btree_trans *trans, struct disk_reservation *disk_res, u64 *journal_seq, u64 new_i_size, - s64 *i_sectors_delta_total) + s64 *i_sectors_delta_total, + bool check_enospc) { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; - bool extending = false, should_check_enospc; + bool extending = false, usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -281,17 +282,20 @@ int bch2_extent_update(struct btree_trans *trans, ret = bch2_sum_sector_overwrites(trans, iter, k, &extending, - &should_check_enospc, + &usage_increasing, &i_sectors_delta, &disk_sectors_delta); if (ret) return ret; + if (!usage_increasing) + check_enospc = false; + if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - !should_check_enospc + !check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; @@ -346,6 +350,7 @@ int bch2_extent_update(struct btree_trans *trans, bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); + BUG_ON(ret == -ENOSPC); if (ret) return ret; @@ -384,7 +389,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ret = bch2_extent_update(trans, iter, &delete, &disk_res, journal_seq, - 0, i_sectors_delta); + 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); btree_err: if (ret == -EINTR) { @@ -457,7 +462,8 @@ int bch2_write_index_default(struct bch_write_op *op) ret = bch2_extent_update(&trans, iter, sk.k, &op->res, op_journal_seq(op), - op->new_i_size, &op->i_sectors_delta); + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); if (ret == -EINTR) continue; if (ret) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index ccbd8c3e6642..d1fd37ef2fc0 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -38,11 +38,12 @@ enum bch_write_flags { BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), BCH_WRITE_FROM_INTERNAL = (1 << 8), + BCH_WRITE_CHECK_ENOSPC = (1 << 9), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), - BCH_WRITE_DONE = (1 << 11), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), + BCH_WRITE_DONE = (1 << 12), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -68,7 +69,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct disk_reservation *, - u64 *, u64, s64 *); + u64 *, u64, s64 *, bool); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, struct bpos, u64 *, s64 *); int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ec8532b39a49..c624fabe1e1c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -293,7 +293,8 @@ s64 bch2_remap_range(struct bch_fs *c, ret = bch2_extent_update(&trans, dst_iter, new_dst.k, &disk_res, journal_seq, - new_i_size, i_sectors_delta); + new_i_size, 
i_sectors_delta, + true); bch2_disk_reservation_put(c, &disk_res); if (ret) continue; -- cgit From 304b7e08c72ee20f315142047ee4e133102ab911 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 May 2021 20:47:27 -0400 Subject: bcachefs: Fix an uninitialized var this fixes a valgrind complaint Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ea522b4583fd..ea1b09b36059 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); + memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); -- cgit From 596d3bdc1e749c63e4defc6abbe2552d035e4470 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 May 2021 16:06:54 -0400 Subject: bcachefs: Don't repair btree nodes until after interior journal replay is done We need the btree to be in a consistent state before we can rewrite btree nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ea1b09b36059..1f14f3e56aef 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1828,6 +1828,9 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; + if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + return; + if (!percpu_ref_tryget(&c->writes)) return; -- cgit From 1ce0cf5fe9300f28e834e6fa001cc5a114cd0493 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 May 2021 23:57:37 -0400 Subject: bcachefs: Add a debug mode that always reads from every btree replica There's a new module parameter, verify_all_btree_replicas, that enables reading from every btree replica when reading in btree nodes and comparing them against each other. We've been seeing some strange btree corruption - this will hopefully aid in tracking it down and catching it more often. 
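Consolidated from the diff below, the hook sits at the front of bch2_btree_node_read(): when the debug parameter is set, every replica is read into a bounce buffer and the completion path compares them, complaining (and keeping the best copy) on mismatch. Roughly:

	set_btree_node_read_in_flight(b);

	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;

	/* in btree_node_read_all_replicas_done(), each replica is checked
	 * against the first good copy: */
	btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
		     BTREE_ERR_FIXABLE, c, NULL, b, NULL,
		     "btree node replicas content mismatch");

As with the other BCH_DEBUG_PARAMs, this presumably ends up exposed as a bcachefs module parameter of the same name, though that plumbing isn't part of this diff.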
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 +- fs/bcachefs/btree_io.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/btree_io.h | 4 + 3 files changed, 274 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c5cafbd6d87a..3de62571fb9f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -264,7 +264,10 @@ do { \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ - "done in memory") + "done in memory") \ + BCH_DEBUG_PARAM(verify_all_btree_replicas, \ + "When reading btree nodes, read all replicas and " \ + "compare them") /* Parameters that should only be compiled in in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index dbaa05ac764c..69b1435653a4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -521,7 +521,7 @@ enum btree_validate_ret { \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf2); \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -1036,8 +1036,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct btree *b = rb->b; struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); - struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; char buf[200]; @@ -1112,6 +1112,262 @@ static void btree_node_read_endio(struct bio *bio) queue_work(system_unbound_wq, &rb->work); } +struct btree_node_read_all { + struct closure cl; + struct bch_fs *c; + struct btree *b; + unsigned nr; + void *buf[BCH_REPLICAS_MAX]; + struct bio *bio[BCH_REPLICAS_MAX]; + int err[BCH_REPLICAS_MAX]; +}; + +static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + unsigned offset = 0; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + + while (offset < c->opts.btree_node_size) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { + bne = data + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + offset += vstruct_sectors(bne, c->block_bits); + } + } + + return offset; +} + +static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + + if (!offset) + return false; + + while (offset < c->opts.btree_node_size) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; + offset++; + } + + return false; + return offset; +} + +static void btree_node_read_all_replicas_done(struct closure *cl) +{ + struct btree_node_read_all *ra = + container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; + bool have_good_copy = false; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, write = READ; + unsigned i, written, written2; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + + for (i = 0; i < ra->nr; i++) { + if (ra->err[i]) + continue; + + if (!have_good_copy) { + memcpy(b->data, ra->buf[i], btree_bytes(c)); + have_good_copy = true; + written = btree_node_sectors_written(c, b->data); + } + + /* Try to get the right btree node: */ + if (have_good_copy && + seq && + b->data->keys.seq != seq && + ((struct btree_node *) ra->buf[i])->keys.seq == seq) { + memcpy(b->data, ra->buf[i], btree_bytes(c)); + written = btree_node_sectors_written(c, b->data); + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); + if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(b->data, ra->buf[i], written << 9), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; + memcpy(b->data, ra->buf[i], btree_bytes(c)); + } + } +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { + char buf[200]; + struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; + bool gap = false; + + if (ra->err[i]) + continue; + + while (offset < c->opts.btree_node_size) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + sectors = vstruct_sectors(bne, c->block_bits); + } + + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + offset += sectors; + } + + while (offset < c->opts.btree_node_size) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) + pr_buf(&out, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + } + offset++; + } + + bch_err(c, "replica %u:%s", i, buf); + } + } + + if (have_good_copy) + bch2_btree_node_read_done(c, NULL, b, false); + else + set_btree_node_read_error(b); + + for (i = 0; i < ra->nr; i++) { + mempool_free(ra->buf[i], &c->btree_bounce_pool); + bio_put(ra->bio[i]); + } + + closure_debug_destroy(&ra->cl); + kfree(ra); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_all_replicas_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + struct btree_node_read_all *ra = rb->ra; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } + + ra->err[rb->idx] = bio->bi_status; + closure_put(&ra->cl); +} + +/* + * XXX This allocates multiple times from the same mempools, and can deadlock + * under sufficient memory pressure (but is only a debug path) + */ +static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) +{ + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded 
pick; + struct btree_node_read_all *ra; + unsigned i; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + closure_init(&ra->cl, NULL); + ra->c = c; + ra->b = b; + ra->nr = bch2_bkey_nr_ptrs(k); + + for (i = 0; i < ra->nr; i++) { + ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + ra->bio[i] = bio_alloc_bioset(NULL, + buf_pages(ra->buf[i], btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, + &c->btree_bio); + } + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct btree_read_bio *rb = + container_of(ra->bio[i], struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = ra; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->idx = i; + rb->pick = pick; + rb->bio.bi_iter.bi_sector = pick.ptr.offset; + rb->bio.bi_end_io = btree_node_read_all_replicas_endio; + bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(&rb->bio)); + bio_set_dev(&rb->bio, ca->disk_sb.bdev); + + closure_get(&ra->cl); + submit_bio(&rb->bio); + } else { + ra->err[i] = BLK_STS_REMOVED; + } + + i++; + } + + if (sync) { + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl); + } else { + continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq); + } + + return 0; +} + void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bool sync) { @@ -1125,6 +1381,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); + set_btree_node_read_in_flight(b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, @@ -1143,17 +1405,16 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; + rb->b = b; + rb->ra = NULL; rb->start_time = local_clock(); rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bio->bi_private = b; bch2_bio_map(bio, b->data, btree_bytes(c)); - set_btree_node_read_in_flight(b); - if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -1162,7 +1423,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - bio->bi_private = b; btree_node_read_work(&rb->work); } else { submit_bio(bio); @@ -1174,7 +1434,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_node_read_work(&rb->work); else queue_work(system_unbound_wq, &rb->work); - } } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index cadcf7f886d7..abbc4675964a 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -13,6 +13,7 @@ struct bch_fs; struct btree_write; struct btree; struct btree_iter; +struct btree_node_read_all; static inline bool btree_node_dirty(struct btree *b) { @@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) struct btree_read_bio { struct bch_fs *c; + struct btree *b; + struct btree_node_read_all *ra; u64 start_time; unsigned have_ioref:1; + unsigned idx:7; struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; 
-- cgit From 2eba51a69a7aba82bd4639a71b0906be7e23ca5c Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 21 May 2021 16:45:38 -0600 Subject: bcachefs: rewrote prefetch asm in gas syntax for clang compatibility Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index e569d9a9b906..84c4664c9912 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1181,13 +1181,11 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, static inline void prefetch_four_cachelines(void *p) { -#if (CONFIG_X86_64 && !defined(__clang__)) - asm(".intel_syntax noprefix;" - "prefetcht0 [%0 - 127 + 64 * 0];" - "prefetcht0 [%0 - 127 + 64 * 1];" - "prefetcht0 [%0 - 127 + 64 * 2];" - "prefetcht0 [%0 - 127 + 64 * 3];" - ".att_syntax prefix;" +#if CONFIG_X86_64 + asm("prefetcht0 (-127 + 64 * 0)(%0);" + "prefetcht0 (-127 + 64 * 1)(%0);" + "prefetcht0 (-127 + 64 * 2)(%0);" + "prefetcht0 (-127 + 64 * 3)(%0);" : : "r" (p + 127)); #else -- cgit From 731bdd2eff41808ded960fbe62a83672904e1600 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 May 2021 17:37:25 -0400 Subject: bcachefs: Add a workqueue for btree io completions Also, clean up workqueue usage - we shouldn't be using system workqueues, pretty much everything we do needs to be on our own WQ_MEM_RECLAIM workqueues. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/btree_io.c | 13 +++++++------ fs/bcachefs/btree_update_interior.c | 5 +++-- fs/bcachefs/io.c | 9 ++++++++- fs/bcachefs/io.h | 2 +- fs/bcachefs/journal.c | 6 ++++-- fs/bcachefs/journal_io.c | 14 +++++++------- fs/bcachefs/super.c | 10 +++++++--- 8 files changed, 39 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3de62571fb9f..2b0c9b1c841b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -627,6 +627,7 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; + struct workqueue_struct *io_complete_wq; struct btree_root btree_roots[BTREE_ID_NR]; struct mutex btree_root_lock; @@ -664,7 +665,7 @@ struct bch_fs { struct btree_key_cache btree_key_cache; - struct workqueue_struct *wq; + struct workqueue_struct *btree_update_wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 69b1435653a4..4d06e765a777 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1109,7 +1109,7 @@ static void btree_node_read_endio(struct bio *bio) bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(system_unbound_wq, &rb->work); + queue_work(c->io_complete_wq, &rb->work); } struct btree_node_read_all { @@ -1362,7 +1362,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool closure_sync(&ra->cl); btree_node_read_all_replicas_done(&ra->cl); } else { - continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq); + continue_at(&ra->cl, btree_node_read_all_replicas_done, + c->io_complete_wq); } return 0; @@ -1433,7 +1434,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(system_unbound_wq, &rb->work); + queue_work(c->io_complete_wq, &rb->work); } } @@ -1600,7 +1601,7 @@ static void btree_node_write_work(struct work_struct *work) bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - queue_work(c->wq, &c->btree_write_error_work); + queue_work(c->btree_update_wq, &c->btree_write_error_work); return; } @@ -1639,7 +1640,7 @@ static void btree_node_write_endio(struct bio *bio) container_of(orig, struct btree_write_bio, wbio); INIT_WORK(&wb->work, btree_node_write_work); - queue_work(system_unbound_wq, &wb->work); + queue_work(c->io_complete_wq, &wb->work); } } @@ -1900,7 +1901,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) atomic64_add(sectors_to_write, &c->btree_writes_sectors); INIT_WORK(&wbio->work, btree_write_submit); - schedule_work(&wbio->work); + queue_work(c->io_complete_wq, &wbio->work); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1f14f3e56aef..6eeb0ca58b6a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -908,7 +908,8 @@ void bch2_btree_update_done(struct btree_update *as) bch2_btree_reserve_put(as); - continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); } struct btree_update * @@ -1847,7 +1848,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - queue_work(system_long_wq, &a->work); + queue_work(c->btree_interior_update_worker, &a->work); } static void __bch2_btree_node_update_key(struct bch_fs *c, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index eafefb651d1e..bf59875db546 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1440,7 +1440,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_migrate_read_done(&op->write, rbio); closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); closure_return_with_destructor(cl, promote_done); } @@ -1823,6 +1823,13 @@ static void __bch2_read_endio(struct work_struct *work) if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) goto csum_err; + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while 
doing btree updates which is bad for memory + * reclaim. + */ if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index d1fd37ef2fc0..5d692a2228a6 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -62,7 +62,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC ? op->c->copygc_wq - : op->c->wq; + : op->c->btree_update_wq; } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7cbea06f57fd..f72e3124d351 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -117,7 +117,9 @@ void bch2_journal_halt(struct journal *j) void __bch2_journal_buf_put(struct journal *j) { - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } /* @@ -303,7 +305,7 @@ static int journal_entry_open(struct journal *j) j->res_get_blocked_start); j->res_get_blocked_start = 0; - mod_delayed_work(system_freezable_wq, + mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(j->write_delay_ms)); journal_wake(j); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2d3fc33720b8..66a0e44a21a6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1295,12 +1295,12 @@ static void journal_write_done(struct closure *cl) journal_wake(j); if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(system_freezable_wq, &j->write_work, 0); + mod_delayed_work(c->io_complete_wq, &j->write_work, 0); spin_unlock(&j->lock); if (new.unwritten_idx != new.idx && !journal_state_count(new, new.unwritten_idx)) - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } static void journal_write_endio(struct bio *bio) @@ -1367,7 +1367,7 @@ static void do_journal_write(struct closure *cl) le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; } @@ -1506,7 +1506,7 @@ retry_alloc: journal_debug_buf); kfree(journal_debug_buf); bch2_fatal_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; } @@ -1537,14 +1537,14 @@ retry_alloc: bch2_bucket_seq_cleanup(c); - continue_at(cl, do_journal_write, system_highpri_wq); + continue_at(cl, do_journal_write, c->io_complete_wq); return; no_io: bch2_bucket_seq_cleanup(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; err: bch2_inconsistent_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b0bcd3bbb53b..62f9fee51b37 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -510,10 +510,12 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); + if (c->io_complete_wq ) + destroy_workqueue(c->io_complete_wq ); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->wq) - destroy_workqueue(c->wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); kvpfree(c, sizeof(*c)); @@ -762,10 +764,12 @@ static struct bch_fs 
*bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->wq = alloc_workqueue("bcachefs", + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -- cgit From 4495cbed568b074c8fbd9a3b761176d38a4955e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 May 2021 21:13:17 -0400 Subject: bcachefs: Improve FS_IOC_GOINGDOWN ioctl We weren't interpreting the flags argument at all. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 60 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 09a9567b402c..6d6368555875 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -13,6 +13,9 @@ #include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) +#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ struct flags_set { unsigned mask; @@ -247,11 +250,54 @@ err1: return ret; } +static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) +{ + u32 flags; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, arg)) + return -EFAULT; + + bch_notice(c, "shutdown by ioctl type %u", flags); + + down_write(&c->vfs_sb->s_umount); + + switch (flags) { + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); + if (ret) + goto err; + + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); + break; + + case FSOP_GOING_FLAGS_LOGFLUSH: + bch2_journal_flush(&c->journal); + fallthrough; + + case FSOP_GOING_FLAGS_NOLOGFLUSH: + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + break; + default: + ret = -EINVAL; + break; + } +err: + up_write(&c->vfs_sb->s_umount); + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); - struct super_block *sb = inode->v.i_sb; - struct bch_fs *c = sb->s_fs_info; + struct bch_fs *c = inode->v.i_sb->s_fs_info; switch (cmd) { case FS_IOC_GETFLAGS: @@ -276,15 +322,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -ENOTTY; case FS_IOC_GOINGDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - down_write(&sb->s_umount); - sb->s_flags |= SB_RDONLY; - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only due to ioctl"); - up_write(&sb->s_umount); - return 0; + return bch2_ioc_goingdown(c, (u32 __user *) arg); default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); -- cgit From 9dd89a05fd946869c23e12fcfb9b9643b337fd56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 May 2021 21:43:20 -0400 Subject: bcachefs: Fix an issue with inconsistent btree writes after unclean shutdown After unclean shutdown, btree writes may have completed on one device and not others - and this inconsistency could lead us to writing new bsets with a gap 
in our btree node in one of our replicas. Fortunately, this is only an issue with bsets that are newer than the most recent journal flush, and we already have a mechanism for detecting and blacklisting those. We just need to make sure to start new btree writes after the most recent _non_ blacklisted bset. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 4d06e765a777..18d12c012cc6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -824,6 +824,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; + unsigned nonblacklisted_written = 0; int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; @@ -943,15 +944,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, whiteout_u64s), vstruct_last(i)); + + nonblacklisted_written = b->written; } for (bne = write_block(b); bset_byte_offset(b, bne) < btree_bytes(c); bne = (void *) bne + block_bytes(c)) - btree_err_on(bne->keys.seq == b->data->keys.seq, + btree_err_on(bne->keys.seq == b->data->keys.seq && + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), BTREE_ERR_WANT_RETRY, c, ca, b, NULL, "found bset signature after last bset"); + /* + * Blacklisted bsets are those that were written after the most recent + * (flush) journal write. Since there wasn't a flush, they may not have + * made it to all devices - which means we shouldn't write new bsets + * after them, as that could leave a gap and then reads from that device + * wouldn't find all the bsets in that btree node - which means it's + * important that we start writing new bsets after the most recent _non_ + * blacklisted bset: + */ + b->written = nonblacklisted_written; + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); sorted->keys.u64s = 0; -- cgit From 443d2760e51f3e17ddb47ddaf969ba49e6dbd06d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 May 2021 18:42:51 -0400 Subject: bcachefs: Fix a null ptr deref bch2_btree_iter_peek() won't always return a key - whoops. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index bce10eb4eb4c..8807b18ddc43 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -522,6 +522,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto err; + if (!k.k || bkey_cmp(k.k->p, pos)) { + ret = -ENOENT; + goto err; + } + ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; if (ret) goto err; -- cgit From c4d4b2f01abd95af7c6dccf236e2760aef5a9b69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 May 2021 18:42:05 -0400 Subject: bcachefs: Add a cond_resched call to the copygc main loop We seem to have a bug where the copygc thread ends up spinning and making the system unusable - this will at least prevent it from locking up the machine, and it's a good thing to have anyways. 
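The fix is just a scheduling point at the top of the copygc kthread's main
loop. A sketch of the resulting loop shape, with the body of the loop elided
(the full two-line change is in the hunk below):

	while (!kthread_should_stop()) {
		/* yield to the scheduler even if copygc keeps finding work: */
		cond_resched();

		if (kthread_wait_freezable(c->copy_gc_enabled))
			break;

		/* ... wait on the io clock, pick buckets, move data ... */
	}
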
Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index acb4d943db79..e8f9d70af930 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -317,6 +317,8 @@ static int bch2_copygc_thread(void *arg) set_freezable(); while (!kthread_should_stop()) { + cond_resched(); + if (kthread_wait_freezable(c->copy_gc_enabled)) break; -- cgit From 19d2819d2d01bf46d0a12cafc532af0aab9b1cc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 May 2021 01:03:35 -0400 Subject: bcachefs: Add a tracepoint for copygc waiting Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 1 + fs/bcachefs/trace.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e8f9d70af930..b805371fe99f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -326,6 +326,7 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + trace_copygc_wait(c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index b0a696ae4fc1..1f62d82624bd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -528,6 +528,27 @@ TRACE_EVENT(copygc, __entry->buckets_moved, __entry->buckets_not_moved) ); +TRACE_EVENT(copygc_wait, + TP_PROTO(struct bch_fs *c, + u64 wait_amount, u64 until), + TP_ARGS(c, wait_amount, until), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, wait_amount ) + __field(u64, until ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->wait_amount = wait_amount; + __entry->until = until; + ), + + TP_printk("%pU waiting for %llu sectors until %llu", + __entry->uuid, __entry->wait_amount, __entry->until) +); + TRACE_EVENT(trans_get_iter, TP_PROTO(unsigned long caller, unsigned long ip, enum btree_id btree_id, -- cgit From ddc7dd62f0971d5c46c155134c647e7d493b2045 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 May 2021 19:15:44 -0400 Subject: bcachefs: Don't use uuid in tracepoints %pU for printing out pointers to uuids doesn't work in perf trace Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 2 ++ fs/bcachefs/trace.h | 93 +++++++++++++++++++++++--------------------------- 3 files changed, 45 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2b0c9b1c841b..2ed795a58c81 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -568,6 +568,7 @@ struct bch_fs { int minor; struct device *chardev; struct super_block *vfs_sb; + dev_t dev; char name[40]; /* ro/rw, add/remove/resize devices: */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 9a595c205dbf..b1b7ee722f2c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1591,6 +1591,8 @@ got_sb: break; } + c->dev = sb->s_dev; + #ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) sb->s_flags |= SB_POSIXACL; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 1f62d82624bd..d447b79bd6ee 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -49,14 +49,14 @@ DECLARE_EVENT_CLASS(bch_fs, TP_ARGS(c), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; ), - TP_printk("%pU", 
__entry->uuid) + TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); DECLARE_EVENT_CLASS(bio, @@ -131,7 +131,7 @@ TRACE_EVENT(journal_reclaim_start, btree_key_cache_dirty, btree_key_cache_total), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(u64, min_nr ) __field(u64, prereserved ) __field(u64, prereserved_total ) @@ -142,7 +142,7 @@ TRACE_EVENT(journal_reclaim_start, ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->min_nr = min_nr; __entry->prereserved = prereserved; __entry->prereserved_total = prereserved_total; @@ -152,8 +152,8 @@ TRACE_EVENT(journal_reclaim_start, __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", - __entry->uuid, + TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->min_nr, __entry->prereserved, __entry->prereserved_total, @@ -168,16 +168,18 @@ TRACE_EVENT(journal_reclaim_finish, TP_ARGS(c, nr_flushed), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, nr_flushed ) + __field(dev_t, dev ) + __field(u64, nr_flushed ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->nr_flushed = nr_flushed; + __entry->dev = c->dev; + __entry->nr_flushed = nr_flushed; ), - TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) + TP_printk("%d%d flushed %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_flushed) ); /* bset.c: */ @@ -194,7 +196,7 @@ DECLARE_EVENT_CLASS(btree_node, TP_ARGS(c, b), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(u8, level ) __field(u8, id ) __field(u64, inode ) @@ -202,15 +204,16 @@ DECLARE_EVENT_CLASS(btree_node, ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->level = b->c.level; __entry->id = b->c.btree_id; __entry->inode = b->key.k.p.inode; __entry->offset = b->key.k.p.offset; ), - TP_printk("%pU %u id %u %llu:%llu", - __entry->uuid, __entry->level, __entry->id, + TP_printk("%d,%d %u id %u %llu:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, __entry->id, __entry->inode, __entry->offset) ); @@ -254,32 +257,17 @@ DEFINE_EVENT(btree_node, btree_node_reap, TP_ARGS(c, b) ); -DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - -DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, +DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, +DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, +DEFINE_EVENT(bch_fs, btree_node_cannibalize, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); @@ -294,18 +282,19 @@ TRACE_EVENT(btree_reserve_get_fail, TP_ARGS(c, required, cl), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(size_t, required ) __field(struct closure *, cl ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->required = required; __entry->cl = cl; ), - TP_printk("%pU required %zu by %p", __entry->uuid, + 
TP_printk("%d,%d required %zu by %p", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->required, __entry->cl) ); @@ -483,19 +472,20 @@ TRACE_EVENT(move_data, TP_ARGS(c, sectors_moved, keys_moved), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(u64, sectors_moved ) __field(u64, keys_moved ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->sectors_moved = sectors_moved; __entry->keys_moved = keys_moved; ), - TP_printk("%pU sectors_moved %llu keys_moved %llu", - __entry->uuid, __entry->sectors_moved, __entry->keys_moved) + TP_printk("%d,%d sectors_moved %llu keys_moved %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->sectors_moved, __entry->keys_moved) ); TRACE_EVENT(copygc, @@ -507,7 +497,7 @@ TRACE_EVENT(copygc, buckets_moved, buckets_not_moved), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(u64, sectors_moved ) __field(u64, sectors_not_moved ) __field(u64, buckets_moved ) @@ -515,17 +505,17 @@ TRACE_EVENT(copygc, ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->sectors_moved = sectors_moved; __entry->sectors_not_moved = sectors_not_moved; __entry->buckets_moved = buckets_moved; __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", - __entry->uuid, - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) ); TRACE_EVENT(copygc_wait, @@ -534,19 +524,20 @@ TRACE_EVENT(copygc_wait, TP_ARGS(c, wait_amount, until), TP_STRUCT__entry( - __array(char, uuid, 16 ) + __field(dev_t, dev ) __field(u64, wait_amount ) __field(u64, until ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->dev = c->dev; __entry->wait_amount = wait_amount; __entry->until = until; ), - TP_printk("%pU waiting for %llu sectors until %llu", - __entry->uuid, __entry->wait_amount, __entry->until) + TP_printk("%d,%u waiting for %llu sectors until %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait_amount, __entry->until) ); TRACE_EVENT(trans_get_iter, -- cgit From 7e94eeffe0e79a54e525ad05302eb454fb96affd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2022 22:28:09 -0400 Subject: bcachefs: Inline fastpath of bch2_disk_reservation_add() The fastpath now doesn't even disable preemption - instead we use a (non locked) cmpxchg. 
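The fastpath follows the usual optimistic per-cpu pattern: read the local
pool, try to claim the sectors with a cmpxchg, and only take the locked
slowpath when the local pool is too small. A sketch of the new inline helper
(pared down from the buckets.h hunk below):

	do {
		old = this_cpu_read(c->pcpu->sectors_available);
		if (sectors > old)
			/* not enough in the per-cpu pool - locked slowpath: */
			return __bch2_disk_reservation_add(c, res, sectors, flags);

		new = old - sectors;
	} while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);

	this_cpu_add(*c->online_reserved, sectors);
	res->sectors += sectors;
	return 0;
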
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/buckets.h | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b37cdf7279de..6389ec7ba18b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2107,7 +2107,7 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) #define SECTORS_CACHE 1024 -int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, +int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 04a2a9310cdd..61be96a7b03d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -261,15 +261,35 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - this_cpu_sub(*c->online_reserved, res->sectors); - res->sectors = 0; + if (res->sectors) { + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; + } } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -int bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - u64, int); +int __bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + u64, int); + +static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ + u64 old, new; + + do { + old = this_cpu_read(c->pcpu->sectors_available); + if (sectors > old) + return __bch2_disk_reservation_add(c, res, sectors, flags); + + new = old - sectors; + } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + return 0; +} static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -- cgit From af171183194f73cca9a2f44ba13907ecc9c761a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Oct 2022 23:57:01 -0400 Subject: bcachefs: Kill bch_write_op.index_update_fn This deletes bch_write_op.index_update_fn: indirect function calls have gotten considerably more expensive post spectre/meltdown, and we only have two different index_update_fns - this patch adds a flag to specify which one to use (normal vs. data move path). 
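Concretely, __bch2_write_index() now picks the index update path with a flag
test instead of an indirect call - roughly:

	ret = !(op->flags & BCH_WRITE_MOVE)
		? bch2_write_index_default(op)
		: bch2_migrate_index_update(op);
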
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 11 +++++++---- fs/bcachefs/io.h | 49 +++++++++++++++++++++++++++++++------------------ fs/bcachefs/io_types.h | 2 -- fs/bcachefs/move.c | 6 +++--- fs/bcachefs/move.h | 1 + 5 files changed, 42 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bf59875db546..1b457e90a172 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -432,7 +432,7 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, return ret; } -int bch2_write_index_default(struct bch_write_op *op) +static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; @@ -577,7 +577,7 @@ static void __bch2_write_index(struct bch_write_op *op) struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; unsigned dev; - int ret; + int ret = 0; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); @@ -614,7 +614,10 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - int ret = op->index_update_fn(op); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_migrate_index_update(op); BUG_ON(ret == -EINTR); BUG_ON(keylist_sectors(keys) && !ret); @@ -624,7 +627,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (ret) { bch_err_inum_ratelimited(c, op->pos.inode, "write error %i from btree update", ret); - op->error = ret; + goto err; } } out: diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 5d692a2228a6..f21ffb53c1e4 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -29,23 +29,39 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, const char *bch2_blk_status_to_str(blk_status_t); enum bch_write_flags { - BCH_WRITE_ALLOC_NOWAIT = (1 << 0), - BCH_WRITE_CACHED = (1 << 1), - BCH_WRITE_FLUSH = (1 << 2), - BCH_WRITE_DATA_ENCODED = (1 << 3), - BCH_WRITE_PAGES_STABLE = (1 << 4), - BCH_WRITE_PAGES_OWNED = (1 << 5), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), - BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), - BCH_WRITE_FROM_INTERNAL = (1 << 8), - BCH_WRITE_CHECK_ENOSPC = (1 << 9), - - /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), - BCH_WRITE_DONE = (1 << 12), + __BCH_WRITE_ALLOC_NOWAIT, + __BCH_WRITE_CACHED, + __BCH_WRITE_FLUSH, + __BCH_WRITE_DATA_ENCODED, + __BCH_WRITE_PAGES_STABLE, + __BCH_WRITE_PAGES_OWNED, + __BCH_WRITE_ONLY_SPECIFIED_DEVS, + __BCH_WRITE_WROTE_DATA_INLINE, + __BCH_WRITE_FROM_INTERNAL, + __BCH_WRITE_CHECK_ENOSPC, + __BCH_WRITE_MOVE, + __BCH_WRITE_JOURNAL_SEQ_PTR, + __BCH_WRITE_SKIP_CLOSURE_PUT, + __BCH_WRITE_DONE, }; +#define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) +#define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED) +#define BCH_WRITE_FLUSH (1U << __BCH_WRITE_FLUSH) +#define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED) +#define BCH_WRITE_PAGES_STABLE (1U << __BCH_WRITE_PAGES_STABLE) +#define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED) +#define BCH_WRITE_ONLY_SPECIFIED_DEVS (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS) +#define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE) +#define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL) +#define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC) +#define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) + +/* Internal: */ +#define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR) +#define BCH_WRITE_SKIP_CLOSURE_PUT (1U << 
__BCH_WRITE_SKIP_CLOSURE_PUT) +#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) + static inline u64 *op_journal_seq(struct bch_write_op *op) { return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) @@ -74,8 +90,6 @@ int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, struct bpos, u64 *, s64 *); int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); -int bch2_write_index_default(struct bch_write_op *); - static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { @@ -101,7 +115,6 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->journal_seq = 0; op->new_i_size = U64_MAX; op->i_sectors_delta = 0; - op->index_update_fn = bch2_write_index_default; } void bch2_write(struct closure *); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 99b4b4c4a53b..148808bdea50 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -146,8 +146,6 @@ struct bch_write_op { u64 new_i_size; s64 i_sectors_delta; - int (*index_update_fn)(struct bch_write_op *); - struct bch_devs_mask failed; struct keylist insert_keys; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8807b18ddc43..61c9be4acd5f 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -52,7 +52,7 @@ struct moving_context { wait_queue_head_t wait; }; -static int bch2_migrate_index_update(struct bch_write_op *op) +int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans trans; @@ -266,11 +266,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| - BCH_WRITE_FROM_INTERNAL; + BCH_WRITE_FROM_INTERNAL| + BCH_WRITE_MOVE; m->op.nr_replicas = data_opts.nr_replicas; m->op.nr_replicas_required = data_opts.nr_replicas; - m->op.index_update_fn = bch2_migrate_index_update; switch (data_cmd) { case DATA_ADD_REPLICAS: { diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 5076153689d1..99d6acb10880 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -41,6 +41,7 @@ struct migrate_write { struct bch_write_op op; }; +int bch2_migrate_index_update(struct bch_write_op *); void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct write_point_specifier, -- cgit From 9f311f2166eb969dbe3d69ab24cd78567a30d62c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Oct 2022 02:47:33 -0400 Subject: bcachefs: Don't use bch_write_op->cl for delivering completions We already had op->end_io as an alternative mechanism to op->cl.parent for delivering write completions; this switches all code paths to using op->end_io. Two reasons: - op->end_io is more efficient, due to fewer atomic ops, this completes the conversion that was originally only done for the direct IO path. - We'll be restructing the write path to use a different mechanism for punting to process context, refactoring to not use op->cl will make that easier. 
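As an example of what callers look like after the conversion, the writepage
path sets op->end_io and recovers its container in the callback, instead of
parenting a closure on op->cl (sketch, pared down from the fs-io.c hunks
below):

	static void bch2_writepage_io_done(struct bch_write_op *op)
	{
		struct bch_writepage_io *io =
			container_of(op, struct bch_writepage_io, op);

		/* ... per-page writeback accounting ... */

		bio_put(&io->op.wbio.bio);
	}

	/* at submission time: */
	op->end_io = bch2_writepage_io_done;
	closure_call(&io->op.cl, bch2_write, NULL, NULL);
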
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 23 ++++++----------------- fs/bcachefs/io.c | 21 +++++++-------------- fs/bcachefs/move.c | 29 +++++++++++++++-------------- 3 files changed, 28 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 45e58ba34463..08746a6cbc31 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -65,7 +65,6 @@ struct quota_res { }; struct bch_writepage_io { - struct closure cl; struct bch_inode_info *inode; /* must be last: */ @@ -979,18 +978,10 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs }; } -static void bch2_writepage_io_free(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_io_done(struct closure *cl) +static void bch2_writepage_io_done(struct bch_write_op *op) { - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; @@ -1054,7 +1045,7 @@ static void bch2_writepage_io_done(struct closure *cl) end_page_writeback(bvec->bv_page); } - closure_return_with_destructor(&io->cl, bch2_writepage_io_free); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_do_io(struct bch_writepage_state *w) @@ -1064,8 +1055,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) down(&io->op.c->io_in_flight); w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, &io->cl); - continue_at(&io->cl, bch2_writepage_io_done, NULL); + closure_call(&io->op.cl, bch2_write, NULL, NULL); } /* @@ -1087,9 +1077,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); - closure_init(&w->io->cl, NULL); w->io->inode = inode; - op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; @@ -1098,6 +1086,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1b457e90a172..4424129cad46 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -558,13 +558,9 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - op->end_io(op); - } else { - closure_return(cl); - } + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + op->end_io(op); } /** @@ -1357,7 +1353,6 @@ err: /* Cache promotion on read */ struct promote_op { - struct closure cl; struct rcu_head rcu; u64 start_time; @@ -1411,10 +1406,10 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) kfree_rcu(op, rcu); } -static void promote_done(struct closure *cl) +static void promote_done(struct bch_write_op *wop) { struct promote_op *op = - container_of(cl, struct promote_op, cl); + container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; bch2_time_stats_update(&c->times[BCH_TIME_data_promote], @@ -1427,7 +1422,6 @@ static void promote_done(struct closure *cl) static void promote_start(struct promote_op *op, struct 
bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; trace_promote(&rbio->bio); @@ -1442,9 +1436,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_migrate_read_done(&op->write, rbio); - closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); - closure_return_with_destructor(cl, promote_done); + closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, NULL); } static struct promote_op *__promote_alloc(struct bch_fs *c, @@ -1509,6 +1501,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, }, btree_id, k); BUG_ON(ret); + op->write.op.end_io = promote_done; return op; err: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 61c9be4acd5f..41e67f7de275 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -324,9 +324,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, return 0; } -static void move_free(struct closure *cl) +static void move_free(struct moving_io *io) { - struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; struct bvec_iter_all iter; struct bio_vec *bv; @@ -342,28 +341,28 @@ static void move_free(struct closure *cl) kfree(io); } -static void move_write_done(struct closure *cl) +static void move_write_done(struct bch_write_op *op) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_io *io = container_of(op, struct moving_io, write.op); + struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - closure_return_with_destructor(cl, move_free); + move_free(io); + closure_put(&ctxt->cl); } -static void move_write(struct closure *cl) +static void move_write(struct moving_io *io) { - struct moving_io *io = container_of(cl, struct moving_io, cl); - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { - closure_return_with_destructor(cl, move_free); + move_free(io); return; } - bch2_migrate_read_done(&io->write, &io->rbio); - + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - closure_call(&io->write.op.cl, bch2_write, NULL, cl); - continue_at(cl, move_write_done, NULL); + + bch2_migrate_read_done(&io->write, &io->rbio); + closure_call(&io->write.op.cl, bch2_write, NULL, NULL); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -394,7 +393,7 @@ static void do_pending_writes(struct moving_context *ctxt) while ((io = next_pending_write(ctxt))) { list_del(&io->list); - closure_call(&io->cl, move_write, NULL, &ctxt->cl); + move_write(io); } } @@ -480,6 +479,8 @@ static int bch2_move_extent(struct btree_trans *trans, if (ret) goto err_free_pages; + io->write.op.end_io = move_write_done; + atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); -- cgit From b282a74faebc9475355179aff40b98b5dbe0ae01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 May 2021 20:20:20 -0400 Subject: bcachefs: Add an option to control sharding new inode numbers We're seeing a bug where inode creates end up spinning in bch2_inode_create - disabling sharding will simplify what we're testing. 
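With the new option, bch2_inode_create() only carves the inode number space
into per-cpu shards when shard_inode_numbers is set; otherwise every CPU
allocates out of a single global range. A sketch of the branch (the exact
code is in the inode.c hunk below):

	unsigned bits = c->opts.inodes_32bit ? 31 : 63;

	if (c->opts.shard_inode_numbers) {
		u64 cpu = raw_smp_processor_id();

		bits -= c->inode_shard_bits;

		min = (cpu << bits);
		max = (cpu << bits) | ~(ULLONG_MAX << bits);

		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
		hint = c->unused_inode_hints + cpu;
	} else {
		min = BLOCKDEV_INODE_MAX;
		max = ~(ULLONG_MAX << bits);
		hint = c->unused_inode_hints;
	}
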
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/inode.c | 21 ++++++++++++++------- fs/bcachefs/opts.h | 7 ++++++- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 98a87e4928ab..6cfb8959d579 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1348,6 +1348,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); +LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); /* * Features: diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2ae55467c583..0af493c8669d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -479,16 +479,23 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, struct bkey_s_c k; u64 min, max, start, pos, *hint; int ret = 0; + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - u64 cpu = raw_smp_processor_id(); - unsigned bits = (c->opts.inodes_32bit - ? 31 : 63) - c->inode_shard_bits; + if (c->opts.shard_inode_numbers) { + u64 cpu = raw_smp_processor_id(); - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + bits -= c->inode_shard_bits; - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + } else { + min = BLOCKDEV_INODE_MAX; + max = ~(ULLONG_MAX << bits); + hint = c->unused_inode_hints; + } start = READ_ONCE(*hint); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 42bf38922d46..074ab2d4f0fa 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -165,8 +165,13 @@ enum opt_type { x(inodes_32bit, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_INODE_32BIT, false, \ + BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, false, \ + NULL, "Shard new inode numbers by CPU id") \ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ -- cgit From 9eba7c8d15ce236b8033ae1f50fc1168984a507b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 May 2021 21:16:50 -0400 Subject: bcachefs: Reflink refcount fix __bch2_trans_mark_reflink_p wasn't always correctly returning the number of sectors processed - the new logic is a bit more straightforward overall too. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6389ec7ba18b..b452ff003e6c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1735,6 +1735,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + int frags_referenced; s64 ret; ret = trans_get_key(trans, BTREE_ID_reflink, @@ -1742,18 +1743,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret < 0) return ret; - if (reflink_p_frag_references(p, 0, front_frag, k) && - reflink_p_frag_references(p, back_frag, p.k->size, k)) { + sectors = min_t(u64, sectors, k.k->p.offset - idx); + + frags_referenced = + reflink_p_frag_references(p, 0, front_frag, k) + + reflink_p_frag_references(p, back_frag, p.k->size, k); + + if (frags_referenced == 2) { BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); add = -add; - } else if (reflink_p_frag_references(p, 0, front_frag, k) || - reflink_p_frag_references(p, back_frag, p.k->size, k)) { + } else if (frags_referenced == 1) { BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); goto out; } - sectors = min_t(u64, sectors, k.k->p.offset - idx); - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1808,14 +1811,13 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, front_frag, back_frag, flags); if (ret < 0) - break; + return ret; - idx += ret; - sectors = max_t(s64, 0LL, sectors - ret); - ret = 0; + idx += ret; + sectors -= ret; } - return ret; + return 0; } int bch2_trans_mark_key(struct btree_trans *trans, -- cgit From d797ca3d8ed3b493d6558426fb4b4980ff186c37 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 May 2021 23:16:25 -0400 Subject: bcachefs: Fix journal write error path Journal write errors were racing with the submission path - potentially causing writes to other replicas to not get submitted. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 25 +++++++++++-------------- fs/bcachefs/journal_types.h | 1 + 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 66a0e44a21a6..e42ca2a5e201 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1232,8 +1232,6 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_devs_list devs = - bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; @@ -1241,11 +1239,12 @@ static void journal_write_done(struct closure *cl) bch2_time_stats_update(j->write_time, j->write_start_time); - if (!devs.nr) { + if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); if (bch2_mark_replicas(c, &replicas.e)) err = -EIO; } @@ -1257,7 +1256,7 @@ static void journal_write_done(struct closure *cl) seq = le64_to_cpu(w->data->seq); if (seq >= j->pin.front) - journal_seq_pin(j, seq)->devs = devs; + journal_seq_pin(j, seq)->devs = w->devs_written; j->seq_ondisk = seq; if (err && (!j->err_seq || seq < j->err_seq)) @@ -1307,15 +1306,15 @@ static void journal_write_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; + struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: 
%s", + le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); - bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); + bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } @@ -1510,10 +1509,8 @@ retry_alloc: return; } - /* - * XXX: we really should just disable the entire journal in nochanges - * mode - */ + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + if (c->opts.nochanges) goto no_io; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index b4e4d5e0bc93..cce02bad850c 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -21,6 +21,7 @@ struct journal_buf { struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); + struct bch_devs_list devs_written; struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ -- cgit From bff796ae6539b50bb3fe595e8db7b72e3255a34d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 May 2021 05:06:18 -0400 Subject: bcachefs: Fix pathalogical behaviour with inode sharding by cpu ID If the transactior restarts on a different CPU, it could end up needing to read in a different btree node, which makes another transaction restart more likely... Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 3 ++- fs/bcachefs/inode.c | 4 +--- fs/bcachefs/inode.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 08c6af886df7..00a63fecb976 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -23,6 +23,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, struct btree_iter *inode_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); u64 dir_offset = 0; int ret; @@ -36,7 +37,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); + inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0af493c8669d..524da018e9d9 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -472,7 +472,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) struct btree_iter *bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, - u32 snapshot) + u32 snapshot, u64 cpu) { struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; @@ -482,8 +482,6 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, unsigned bits = (c->opts.inodes_32bit ? 
31 : 63); if (c->opts.shard_inode_numbers) { - u64 cpu = raw_smp_processor_id(); - bits -= c->inode_shard_bits; min = (cpu << bits); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 558d5464095d..2cb081ae44d9 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, u32); + struct bch_inode_unpacked *, u32, u64); int bch2_inode_rm(struct bch_fs *, u64, bool); -- cgit From 9f2772c45460e420de9a88980129bef135c6b76f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 May 2021 21:38:00 -0400 Subject: bcachefs: Split out btree_error_wq We can't use btree_update_wq becuase btree updates may be waiting on btree writes to complete. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/super.c | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2ed795a58c81..6962b3ddf575 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -667,6 +667,7 @@ struct bch_fs { struct btree_key_cache btree_key_cache; struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_error_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 18d12c012cc6..0f2a730e04b5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1618,7 +1618,7 @@ static void btree_node_write_work(struct work_struct *work) bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - queue_work(c->btree_update_wq, &c->btree_write_error_work); + queue_work(c->btree_error_wq, &c->btree_write_error_work); return; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 62f9fee51b37..3bea7917004f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -514,6 +514,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->io_complete_wq ); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); + if (c->btree_error_wq) + destroy_workqueue(c->btree_error_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -766,6 +768,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->btree_error_wq = alloc_workqueue("bcachefs_error", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", -- cgit From ee7570546e89ece9b56eaa22c94a6ec0941ec827 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 23:33:08 -0400 Subject: bcachefs: Fix a deadlock Waiting on a btree node write with btree locks held can deadlock, if the write errors: the write error path has to do do a btree update to drop the pointer to the replica that errored. The interior update path has to wait on in flight btree writes before freeing nodes on disk. Previously, this was done in bch2_btree_interior_update_will_free_node(), and could deadlock; now, we just stash a pointer to the node and do it in btree_update_nodes_written(), just prior to the transactional part of the update. 
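In outline: bch2_btree_interior_update_will_free_node() now just records the
node and its on-disk sequence number,

	as->old_nodes[as->nr_old_nodes]		= b;
	as->old_nodes_seq[as->nr_old_nodes]	= b->data->keys.seq;
	as->nr_old_nodes++;

and btree_update_nodes_written() does the wait just prior to the transactional
part of the update (sketch, from the hunks below):

	for (i = 0; i < as->nr_old_nodes; i++) {
		struct btree *old = as->old_nodes[i];
		__le64 seq;

		six_lock_read(&old->c.lock, NULL, NULL);
		seq = old->data ? old->data->keys.seq : 0;
		six_unlock_read(&old->c.lock);

		if (seq == as->old_nodes_seq[i])
			btree_node_wait_on_io(old);
	}
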
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 ++++ fs/bcachefs/btree_update_interior.c | 26 +++++++++++++++++++------- fs/bcachefs/btree_update_interior.h | 4 ++++ 3 files changed, 27 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0f2a730e04b5..4ffdc11f4d9a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1727,6 +1727,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) return; if (old & (1 << BTREE_NODE_write_in_flight)) { + /* + * XXX waiting on btree writes with btree locks held - + * this can deadlock, and we hit the write error path + */ btree_node_wait_on_io(b); continue; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6eeb0ca58b6a..569db972f3bb 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as) BUG_ON(!journal_pin_active(&as->journal)); + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: + */ + for (i = 0; i < as->nr_old_nodes; i++) { + struct btree *old = as->old_nodes[i]; + __le64 seq; + + six_lock_read(&old->c.lock, NULL, NULL); + seq = old->data ? old->data->keys.seq : 0; + six_unlock_read(&old->c.lock); + + if (seq == as->old_nodes_seq[i]) + btree_node_wait_on_io(old); + } + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have @@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_update_will_delete_key(as, &b->key); - /* - * XXX: Waiting on io with btree node locks held, we don't want to be - * doing this. We can't have btree writes happening after the space has - * been freed, but we really only need to block before - * btree_update_nodes_written_trans() happens. 
- */ - btree_node_wait_on_io(b); + as->old_nodes[as->nr_old_nodes] = b; + as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; + as->nr_old_nodes++; } void bch2_btree_update_done(struct btree_update *as) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7eef3dbb6ef1..7ed67b47e1b9 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -92,6 +92,10 @@ struct btree_update { struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; unsigned nr_new_nodes; + struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; + __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; + unsigned nr_old_nodes; + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * BCH_REPLICAS_MAX]; open_bucket_idx_t nr_open_buckets; -- cgit From c0ebe3e48c75ab075eb1c67aef957109c67e1643 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 May 2021 17:04:13 -0400 Subject: bcachefs: Assorted endianness fixes Found by sparse Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 ++ fs/bcachefs/journal_io.c | 5 +++-- fs/bcachefs/journal_seq_blacklist.c | 6 ++---- fs/bcachefs/move.c | 4 ++-- fs/bcachefs/recovery.c | 18 +++++++++--------- fs/bcachefs/super-io.c | 14 +++++++------- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 2 +- 8 files changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 745f1ac4f538..4b590b2096a7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2268,6 +2268,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) + __acquires(&c->btree_trans_barrier) { memset(trans, 0, sizeof(*trans)); trans->c = c; @@ -2302,6 +2303,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, } int bch2_trans_exit(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e42ca2a5e201..a7d08657cb4f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -833,7 +833,7 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); u64 offset; div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); @@ -1398,7 +1398,8 @@ void bch2_journal_write(struct closure *cl) test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = w->last_seq = 0; + jset->last_seq = 0; + w->last_seq = 0; j->nr_noflush_writes++; } else { diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e1b63f3879f4..f2060f903cbc 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -111,8 +111,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl->start[nr].start = cpu_to_le64(start); bl->start[nr].end = cpu_to_le64(end); out_write_sb: - c->disk_sb.sb->features[0] |= - 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); out: @@ -298,8 +297,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) BUG_ON(new_nr && !bl); if (!new_nr) - 
c->disk_sb.sb->features[0] &= - ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); bch2_write_super(c); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 41e67f7de275..638cd7e97a46 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -926,8 +926,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cd538ecc1f3f..9bd6348842e0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -716,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_dev_usage: { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / sizeof(struct jset_entry_dev_usage_type); @@ -755,7 +755,7 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - atomic64_set(&c->io_clock[clock->rw].now, clock->time); + atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } @@ -1217,13 +1217,13 @@ use_clean: mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { - c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); write_sb = true; } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } @@ -1278,12 +1278,12 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); if (c->opts.version_upgrade) { - c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c7cea4cfc2b..fc6fb302706a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -984,7 +984,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, 
false); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1001,7 +1001,7 @@ static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) * The u64s field counts from the start of data, ignoring the shared * fields. */ - entry->u64s = u64s - 1; + entry->u64s = cpu_to_le16(u64s - 1); *end = vstruct_next(*end); return entry; @@ -1095,7 +1095,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, clock->entry.type = BCH_JSET_ENTRY_clock; clock->rw = i; - clock->time = atomic64_read(&c->io_clock[i].now); + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); } } @@ -1112,10 +1112,10 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; - c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); - c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3bea7917004f..ef7322a8b460 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1445,7 +1445,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ -int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; size_t i; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 21ef7719cf55..84a7acb04d01 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -312,7 +312,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c return 0; } -void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); bch2_bpos_to_text(out, c->gc_gens_pos); -- cgit From 890b74f03d70946e46f8169c94232fb717f46989 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 May 2021 02:31:33 -0400 Subject: bcachefs: Fsck for reflink refcounts Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 11 +++ fs/bcachefs/btree_gc.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/buckets.c | 152 ++++++++++++++++++++++++++++++------- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/reflink.h | 24 ++++++ 5 files changed, 359 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6962b3ddf575..9bd60369703f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -391,6 +391,14 @@ struct gc_pos { unsigned level; }; +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + struct io_count { u64 sectors[2][BCH_DATA_NR]; }; @@ -806,6 +814,9 @@ mempool_t bio_bounce_pages; /* REFLINK */ u64 reflink_hint; + reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; + 
size_t reflink_gc_idx; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5b839cca8a9d..5a2acaba04c9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -23,6 +23,7 @@ #include "keylist.h" #include "move.h" #include "recovery.h" +#include "reflink.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -1285,6 +1286,201 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } +static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k) +{ + struct reflink_gc *r; + const __le64 *refcount = bkey_refcount_c(k); + char buf[200]; + int ret = 0; + + if (!refcount) + return 0; + + r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); + if (!r) + return -ENOMEM; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + r->refcount)) { + struct bkey_i *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto fsck_err; + } + + bkey_reassemble(new, k); + + if (!r->refcount) { + new->k.type = KEY_TYPE_deleted; + new->k.size = 0; + } else { + *bkey_refcount(new) = cpu_to_le64(r->refcount); + } + + ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); + if (ret) + kfree(new); + } +fsck_err: + return ret; +} + +static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct reflink_gc *r; + size_t idx = 0; + char buf[200]; + int ret = 0; + + if (metadata_only) + return 0; + + if (initial) { + c->reflink_gc_idx = 0; + + ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink, + bch2_gc_reflink_done_initial_fn); + goto out; + } + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr(&c->reflink_gc_table, idx); + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + ret = -EINVAL; + break; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + r->refcount)) { + struct bkey_i *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + break; + } + + bkey_reassemble(new, k); + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + kfree(new); + + if (ret) + break; + } + } +fsck_err: + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); +out: + genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + return ret; +} + +static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k) +{ + + struct reflink_gc *r; + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + return 0; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->offset = k.k->p.offset; + 
r->size = k.k->size; + r->refcount = 0; + return 0; +} + +static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret; + + if (metadata_only) + return 0; + + genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + + if (initial) + return bch2_btree_and_journal_walk(c, BTREE_ID_reflink, + bch2_gc_reflink_start_initial_fn); + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -ENOMEM; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + } + bch2_trans_iter_put(&trans, iter); + + bch2_trans_exit(&trans); + return 0; +} + /** * bch2_gc - walk _all_ references to buckets, and recompute them: * @@ -1319,7 +1515,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; @@ -1381,7 +1578,8 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); } else { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b452ff003e6c..ba6b1e770dcf 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "movinggc.h" +#include "reflink.h" #include "replicas.h" #include "trace.h" @@ -1076,6 +1077,124 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } +static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, + u64 p_start, u64 p_end, + u64 v_start, u64 v_end) +{ + if (p_start == p_end) + return false; + + p_start += le64_to_cpu(p.v->idx); + p_end += le64_to_cpu(p.v->idx); + + if (p_end <= v_start) + return false; + if (p_start >= v_end) + return false; + return true; +} + +static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, + u64 start, u64 end, + struct bkey_s_c k) +{ + return __reflink_p_frag_references(p, start, end, + bkey_start_offset(k.k), + k.k->p.offset); +} + +static int __bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned front_frag, + unsigned back_frag, + unsigned flags, + size_t *r_idx) +{ + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + int frags_referenced; + + while (1) { + if (*r_idx >= c->reflink_gc_nr) + goto not_found; + r = genradix_ptr(&c->reflink_gc_table, *r_idx); + BUG_ON(!r); + + if (r->offset > idx) + break; + (*r_idx)++; + } + + frags_referenced = + __reflink_p_frag_references(p, 0, front_frag, + r->offset - r->size, r->offset) + + __reflink_p_frag_references(p, back_frag, p.k->size, + r->offset - r->size, r->offset); + + if (frags_referenced == 2) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); + add = -add; + } else if (frags_referenced == 1) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); + add = 0; + } + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + return min_t(u64, sectors, r->offset - idx); +not_found: + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + bch2_inconsistent_error(c); + return -EIO; +} + +static int bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p, unsigned offset, + s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + struct reflink_gc *ref; + size_t l, r, m; + unsigned front_frag, back_frag; + s64 ret = 0; + + if (sectors < 0) + sectors = -sectors; + + BUG_ON(offset + sectors > p.k->size); + + front_frag = offset; + back_frag = offset + sectors; + + l = 0; + r = c->reflink_gc_nr; + while (l < r) { + m = l + (r - l) / 2; + + ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (sectors) { + ret = __bch2_mark_reflink_p(c, p, idx, sectors, + front_frag, back_frag, flags, &l); + if (ret < 0) + return ret; + + idx += ret; + sectors -= ret; + } + + return 0; +} + static int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c old, struct bkey_s_c new, @@ -1131,6 +1250,10 @@ static int bch2_mark_key_locked(struct bch_fs *c, fs_usage->persistent_reserved[replicas - 1] += sectors; break; } + case KEY_TYPE_reflink_p: + ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), + offset, sectors, flags); + break; } preempt_enable(); @@ -1693,35 +1816,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -static __le64 *bkey_refcount(struct bkey_i *k) -{ - switch (k->k.type) { - case KEY_TYPE_reflink_v: - return &bkey_i_to_reflink_v(k)->v.refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_i_to_indirect_inline_data(k)->v.refcount; - default: - return NULL; - } -} - -static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 start, u64 end, - struct bkey_s_c k) -{ - if (start == end) - return false; - - start += le64_to_cpu(p.v->idx); - end += le64_to_cpu(p.v->idx); - - if (end <= bkey_start_offset(k.k)) - return false; - if (start >= k.k->p.offset) - return false; - return true; -} - static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c624fabe1e1c..e986b5284d37 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - refcount = (void *) &r_v->v; + refcount = bkey_refcount(r_v); *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 9d5e7dc58f2b..bfc785619ee8 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -34,6 +34,30 @@ void 
bch2_indirect_inline_data_to_text(struct printbuf *, .val_to_text = bch2_indirect_inline_data_to_text, \ } +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_reflink_v: + return &bkey_s_c_to_reflink_v(k).v->refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_s_c_to_indirect_inline_data(k).v->refcount; + default: + return NULL; + } +} + +static inline __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, u64, u64 *, u64, s64 *); -- cgit From 649d9a4dfc834230b55c4ae34b60067d1d54c4d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 22:14:10 -0400 Subject: bcachefs: Don't fragment extents when making them indirect This fixes a "disk usage increased without a reservation" bug, when reflinking compressed extents. Also, there's no good reason for reflink to be fragmenting extents anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 52 ++++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e986b5284d37..a420729288d4 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -181,18 +181,19 @@ err: static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) { - struct bkey_s_c k = bch2_btree_iter_peek(iter); + struct bkey_s_c k; int ret; for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) - return bkey_s_c_null; + break; if (bkey_extent_is_data(k.k)) - break; + return k; } - return k; + bch2_btree_iter_set_pos(iter, end); + return bkey_s_c_null; } s64 bch2_remap_range(struct bch_fs *c, @@ -205,8 +206,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = src_start; - struct bpos dst_want, src_want; - u64 src_done, dst_done; + struct bpos src_want; + u64 dst_done; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -226,7 +227,8 @@ s64 bch2_remap_range(struct bch_fs *c, dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while (ret == 0 || ret == -EINTR) { + while ((ret == 0 || ret == -EINTR) && + bkey_cmp(dst_iter->pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -236,32 +238,29 @@ s64 bch2_remap_range(struct bch_fs *c, break; } + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); + src_k = get_next_src(src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; - src_done = bpos_min(src_iter->pos, src_end).offset - - src_start.offset; - dst_want = POS(dst_start.inode, dst_start.offset + src_done); - - if (bkey_cmp(dst_iter->pos, dst_want) < 0) { - ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - journal_seq, i_sectors_delta); + if (bkey_cmp(src_want, src_iter->pos) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, + bpos_min(dst_end, + POS(dst_iter->pos.inode, dst_iter->pos.offset + + src_iter->pos.offset - src_want.offset)), + journal_seq, i_sectors_delta); continue; } - BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); - - if (!bkey_cmp(dst_iter->pos, 
dst_end)) - break; - if (src_k.k->type != KEY_TYPE_reflink_p) { bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - bch2_cut_front(src_iter->pos, new_src.k); - bch2_cut_back(src_end, new_src.k); + bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k); @@ -278,7 +277,7 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_reflink_p_init(new_dst.k); u64 offset = le64_to_cpu(src_p.v->idx) + - (src_iter->pos.offset - + (src_want.offset - bkey_start_offset(src_k.k)); dst_p->v.idx = cpu_to_le64(offset); @@ -288,20 +287,13 @@ s64 bch2_remap_range(struct bch_fs *c, new_dst.k->k.p = dst_iter->pos; bch2_key_resize(&new_dst.k->k, - min(src_k.k->p.offset - src_iter->pos.offset, + min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, new_dst.k, &disk_res, journal_seq, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); - if (ret) - continue; - - dst_done = dst_iter->pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(src_iter, src_want); } bch2_trans_iter_put(&trans, dst_iter); bch2_trans_iter_put(&trans, src_iter);
-- cgit From d7fc453bdb012d5bc021e693896989fd5973823f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 May 2021 00:13:39 -0400 Subject: bcachefs: Journal space calculation fix When devices have different bucket sizes, we may accumulate a journal write that doesn't fit on some of our devices - previously, we'd underflow when calculating space on that device, and the space calculations that followed would be nonsense. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 812620d3de31..54d8e2f32a37 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -93,6 +93,10 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, * until we write it out - thus, account for it here: */ while ((unwritten = get_unwritten_sectors(j, &idx))) { + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; + if (unwritten >= sectors) { if (!buckets) { sectors = 0;
-- cgit From 01254036a326e0663eaeabb2eb3359c04caed2a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 May 2021 20:52:39 -0400 Subject: bcachefs: Check for allocator thread shutdown We were missing a kthread_should_stop() check in the loop in bch2_invalidate_buckets(), very occasionally leading to us getting stuck while shutting down. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 791066b6b39b..b6bfba7aea0a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -836,6 +836,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) while (!ret && !fifo_full(&ca->free_inc) && ca->alloc_heap.used) { + if (kthread_should_stop()) { + ret = 1; + break; + } + ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, (!fifo_empty(&ca->free_inc) ?
BTREE_INSERT_NOWAIT : 0)); -- cgit From bc3f8b25f3a4403d6e8c9067e6f0fc9cf23be6fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Jun 2021 00:15:07 -0400 Subject: bcachefs: Check for errors from bch2_trans_update() Upcoming refactoring is going to change bch2_trans_update() to start returning transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc_background.c | 9 +++++---- fs/bcachefs/btree_gc.c | 5 +++-- fs/bcachefs/btree_update_leaf.c | 10 +++++----- fs/bcachefs/buckets.c | 4 +++- fs/bcachefs/ec.c | 15 ++++++--------- fs/bcachefs/fsck.c | 17 ++++++++--------- fs/bcachefs/inode.c | 8 +++----- fs/bcachefs/io.c | 12 ++++++++---- fs/bcachefs/migrate.c | 5 ++--- fs/bcachefs/move.c | 5 ++--- fs/bcachefs/str_hash.h | 14 +++++++------- fs/bcachefs/tests.c | 2 +- 13 files changed, 54 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index afb9562be2b2..be31d27443bc 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -386,7 +386,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, iter, &new->k_i, 0); + ret = bch2_trans_update(trans, iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b6bfba7aea0a..15f9adf0876a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -340,9 +340,9 @@ retry: return 0; bch2_alloc_pack(c, &a, new_u); - bch2_trans_update(trans, iter, &a.k, - BTREE_TRIGGER_NORUN); - ret = bch2_trans_commit(trans, NULL, NULL, + ret = bch2_trans_update(trans, iter, &a.k, + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) @@ -726,7 +726,8 @@ static int bucket_invalidate_btree(struct btree_trans *trans, u.write_time = atomic64_read(&c->io_clock[WRITE].now); bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); + ret = bch2_trans_update(trans, iter, &a->k, + BTREE_TRIGGER_BUCKET_INVALIDATE); err: bch2_trans_iter_put(trans, iter); return ret; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5a2acaba04c9..20830b2e007f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1668,9 +1668,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_trans_update(&trans, iter, sk.k, 0); - commit_err = bch2_trans_commit(&trans, NULL, NULL, + commit_err = + bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOWAIT| BTREE_INSERT_NOFAIL); if (commit_err == -EINTR) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3c4bf13d4ef9..839262c9501a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1198,9 +1198,9 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, iter, &k, 0); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + return bch2_trans_update(trans, iter, &k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); } int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, @@ -1251,8 +1251,8 @@ retry: break; } - bch2_trans_update(trans, iter, &delete, 0); - ret = bch2_trans_commit(trans, NULL, journal_seq, + ret = bch2_trans_update(trans, 
iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_NOFAIL); if (ret) break; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ba6b1e770dcf..c9e299706c74 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1877,7 +1877,9 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - bch2_trans_update(trans, iter, n, 0); + ret = bch2_trans_update(trans, iter, n, 0); + if (ret) + goto err; out: ret = sectors; err: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4fc774631d20..0ee8ecd9d8a0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -741,9 +741,8 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, iter, &stripe->k_i, 0); - - ret = bch2_trans_commit(&trans, res, NULL, + ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: + bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -791,7 +790,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, stripe_blockcount_set(&new->v, i, stripe_blockcount_get(existing, i)); - bch2_trans_update(trans, iter, &new->k_i, 0); + ret = bch2_trans_update(trans, iter, &new->k_i, 0); err: bch2_trans_iter_put(trans, iter); return ret; @@ -864,9 +863,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, sk.k, 0); - - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); if (ret == -EINTR) ret = 0; @@ -1588,8 +1586,7 @@ write: stripe_blockcount_set(&new_key->v, i, m->block_sectors[i]); - bch2_trans_update(trans, iter, &new_key->k_i, 0); - return 0; + return bch2_trans_update(trans, iter, &new_key->k_i, 0); } int bch2_stripes_write(struct bch_fs *c, unsigned flags) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a40459d2b0f0..89a130d9c537 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -305,9 +305,8 @@ static int hash_redo_key(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = k_iter->pos; - bch2_trans_update(trans, k_iter, delete, 0); - - return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); + return bch2_trans_update(trans, k_iter, delete, 0) ?: + bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -563,12 +562,12 @@ static int fix_overlapping_extent(struct btree_trans *trans, BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); bch2_trans_iter_put(trans, iter); - - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + return ret; } static int inode_backpointer_exists(struct btree_trans *trans, @@ -887,7 +886,7 @@ retry: ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); + bch2_trans_update(&trans, iter, &n->k_i, 0)); kfree(n); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 524da018e9d9..18b568887144 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -333,8 +333,7 @@ int 
bch2_inode_write(struct btree_trans *trans, bch2_inode_pack(trans->c, inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - return 0; + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -629,9 +628,8 @@ retry: delete.k.p = iter->pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - bch2_trans_update(&trans, iter, &delete.k_i, 0); - - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4424129cad46..e13382fc5b01 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -311,8 +311,9 @@ int bch2_extent_update(struct btree_trans *trans, inode_iter = bch2_inode_peek(trans, &inode_u, k->k.p.inode, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + return ret; /* * XXX: @@ -339,11 +340,14 @@ int bch2_extent_update(struct btree_trans *trans, inode_p.inode.k.p.snapshot = iter->snapshot; - bch2_trans_update(trans, inode_iter, + ret = bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } bch2_trans_iter_put(trans, inode_iter); + + if (ret) + return ret; } ret = bch2_trans_update(trans, iter, k, 0) ?: @@ -1780,7 +1784,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - bch2_trans_update(trans, iter, new, 0); + ret = bch2_trans_update(trans, iter, new, 0); out: bch2_trans_iter_put(trans, iter); return ret; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ef69a19f494a..6ebe49ba2248 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -73,9 +73,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, sk.k, 0); - - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 638cd7e97a46..93d7beaa5fce 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -162,9 +162,8 @@ int bch2_migrate_index_update(struct bch_write_op *op) goto out; } - bch2_trans_update(&trans, iter, insert, 0); - - ret = bch2_trans_commit(&trans, &op->res, + ret = bch2_trans_update(&trans, iter, insert, 0) ?: + bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index eab669af7032..2ff8e5bd2744 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -281,7 +281,7 @@ not_found: swap(iter, slot); insert->k.p = iter->pos; - bch2_trans_update(trans, iter, insert, 0); + ret = bch2_trans_update(trans, iter, insert, 0); } goto out; @@ -296,20 +296,20 @@ int bch2_hash_delete_at(struct btree_trans *trans, struct bkey_i *delete; int ret; + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + return ret; + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); if (ret < 0) return ret; - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - if (IS_ERR(delete)) - return PTR_ERR(delete); - bkey_init(&delete->k); delete->k.p = 
iter->pos; delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, iter, delete, 0); - return 0; + return bch2_trans_update(trans, iter, delete, 0); } static __always_inline diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bb4756566377..87208da5e350 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -621,7 +621,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bkey_init(&delete.k); delete.k.p = k.k->p; - bch2_trans_update(trans, iter, &delete, 0); + ret = bch2_trans_update(trans, iter, &delete, 0); err: bch2_trans_iter_put(trans, iter); return ret; -- cgit From f7beb4ca04521f26d9ac2e9bee1bfbf99c55fabb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Jun 2021 23:31:42 -0400 Subject: bcachefs: Preallocate transaction mem This helps avoid transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ++-- fs/bcachefs/fs.c | 2 +- fs/bcachefs/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 08746a6cbc31..132ff0497b39 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2531,7 +2531,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, } bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); src = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); @@ -2651,7 +2651,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b1b7ee722f2c..99885fb97aa2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -146,7 +146,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - bch2_trans_init(&trans, c, 0, 256); + bch2_trans_init(&trans, c, 0, 512); retry: bch2_trans_begin(&trans); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 18b568887144..c65bfee1897e 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -579,7 +579,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); + bch2_trans_init(&trans, c, 0, 1024); /* * If this was a directory, there shouldn't be any real dirents left - -- cgit From 531a0095c958d258ee0c336851acd2bdb642b967 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Jun 2021 15:18:10 -0400 Subject: bcachefs: Improve btree iterator tracepoints This patch adds some new tracepoints to the btree iterator code, and adds new fields to the existing tracepoints - primarily for the iterator position. 
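The central convention in the diff below is that tracepoints now carry two instruction pointers: trans->ip, captured when the transaction was started, and a trace_ip argument identifying the call site that triggered the event, threaded down through the iterator helpers via the standard kernel helpers _RET_IP_ and _THIS_IP_ and printed with %pS. A rough sketch of that pattern only - demo_traverse() and demo_traverse_one() are made-up names, and pr_debug() stands in for the real tracepoints:

#include <linux/kernel.h>	/* _RET_IP_ */
#include <linux/printk.h>

/* assumes the bcachefs btree_types.h definition of struct btree_trans */

static int demo_traverse_one(struct btree_trans *trans, unsigned long trace_ip)
{
	/*
	 * The real traversal work would go here; when something forces a
	 * restart, report both where the transaction began and which
	 * external call site we were servicing:
	 */
	pr_debug("restart: trans started at %pS, triggered from %pS\n",
		 (void *) trans->ip, (void *) trace_ip);
	return -EINTR;
}

int demo_traverse(struct btree_trans *trans)
{
	/* capture the external caller's address once, at the API boundary */
	return demo_traverse_one(trans, _RET_IP_);
}

The patch applies the same idea to btree_iter_get_locks(), btree_iter_traverse_one() and friends: functions at the public boundary pass _RET_IP_, internal callers pass _THIS_IP_, so a restart can always be attributed to the original call site.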
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 +- fs/bcachefs/btree_iter.c | 106 +++++---- fs/bcachefs/btree_iter.h | 1 - fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 8 +- fs/bcachefs/btree_update_leaf.c | 51 ++-- fs/bcachefs/trace.h | 455 +++++++++++++++++++++++++----------- 7 files changed, 429 insertions(+), 198 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a13e5eef868e..5991ebee228c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -816,7 +816,10 @@ lock_node: if (bch2_btree_node_relock(iter, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(iter->trans->ip); + trace_trans_restart_btree_node_reused(iter->trans->ip, + trace_ip, + iter->btree_id, + &iter->real_pos); return ERR_PTR(-EINTR); } } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4b590b2096a7..78eae2169347 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -178,8 +178,8 @@ success: return true; } -static inline bool btree_iter_get_locks(struct btree_iter *iter, - bool upgrade, bool trace) +static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, + unsigned long trace_ip) { unsigned l = iter->level; int fail_idx = -1; @@ -191,16 +191,17 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, if (!(upgrade ? bch2_btree_node_upgrade(iter, l) : bch2_btree_node_relock(iter, l))) { - if (trace) - (upgrade - ? trace_node_upgrade_fail - : trace_node_relock_fail)(l, iter->l[l].lock_seq, - is_btree_node(iter, l) - ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? iter->l[l].b->c.lock.state.seq - : 0); + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(iter->trans->ip, trace_ip, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? 
iter->l[l].b->c.lock.state.seq + : 0); fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -379,9 +380,9 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten -bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) +static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) { - return btree_iter_get_locks(iter, false, trace); + return btree_iter_get_locks(iter, false, trace_ip); } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, @@ -393,7 +394,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, iter->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true, true)) + if (btree_iter_get_locks(iter, true, _THIS_IP_)) return true; /* @@ -421,7 +422,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, linked->btree_id == iter->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true, false); + btree_iter_get_locks(linked, true, _THIS_IP_); } return false; @@ -467,8 +468,9 @@ bool bch2_trans_relock(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (!bch2_btree_iter_relock(iter, true)) { - trace_trans_restart_relock(trans->ip); + if (!bch2_btree_iter_relock(iter, _RET_IP_)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos); return false; } return true; @@ -1182,7 +1184,8 @@ err: static int btree_iter_traverse_one(struct btree_iter *, unsigned long); -static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) +static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_iter *iter; @@ -1199,7 +1202,7 @@ retry_all: relock_fail = false; trans_for_each_iter(trans, iter) { - if (!bch2_btree_iter_relock(iter, true)) + if (!bch2_btree_iter_relock(iter, _THIS_IP_)) relock_fail = true; sorted[nr_sorted++] = iter->idx; } @@ -1276,13 +1279,13 @@ out: trans->in_traverse_all = false; - trace_trans_traverse_all(trans->ip); + trace_trans_traverse_all(trans->ip, trace_ip); return ret; } int bch2_btree_iter_traverse_all(struct btree_trans *trans) { - return __btree_iter_traverse_all(trans, 0); + return __btree_iter_traverse_all(trans, 0, _RET_IP_); } static inline bool btree_iter_good_node(struct btree_iter *iter, @@ -1327,6 +1330,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, unsigned long trace_ip) { unsigned depth_want = iter->level; + int ret = 0; /* * if we need interior nodes locked, call btree_iter_relock() to make @@ -1334,16 +1338,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter, */ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || iter->locks_want > 1) - bch2_btree_iter_relock(iter, false); + bch2_btree_iter_relock(iter, _THIS_IP_); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) - return bch2_btree_iter_traverse_cached(iter); + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + ret = bch2_btree_iter_traverse_cached(iter); + goto out; + } if (iter->uptodate < BTREE_ITER_NEED_RELOCK) - return 0; + goto out; if (unlikely(iter->level >= BTREE_MAX_DEPTH)) - return 0; + goto out; iter->level = btree_iter_up_until_good_node(iter, 0); @@ -1354,12 +1360,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter, * btree_iter_lock_root() comes next and that it can't fail */ while (iter->level > depth_want) { - int ret = btree_iter_node(iter, iter->level) + ret = btree_iter_node(iter, iter->level) ? 
btree_iter_down(iter, trace_ip) : btree_iter_lock_root(iter, depth_want, trace_ip); if (unlikely(ret)) { - if (ret == 1) - return 0; + if (ret == 1) { + /* + * Got to the end of the btree (in + * BTREE_ITER_NODES mode) + */ + ret = 0; + goto out; + } iter->level = depth_want; @@ -1371,14 +1383,16 @@ static int btree_iter_traverse_one(struct btree_iter *iter, iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; } - return ret; + goto out; } } iter->uptodate = BTREE_ITER_NEED_PEEK; - +out: + trace_iter_traverse(iter->trans->ip, trace_ip, + iter->btree_id, &iter->real_pos, ret); bch2_btree_iter_verify(iter); - return 0; + return ret; } static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) @@ -1389,7 +1403,7 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ret = bch2_trans_cond_resched(trans) ?: btree_iter_traverse_one(iter, _RET_IP_); if (unlikely(ret)) - ret = __btree_iter_traverse_all(trans, ret); + ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); return ret; } @@ -1505,6 +1519,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { + struct bpos old_pos = iter->real_pos; int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; @@ -1515,7 +1530,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); - iter->l[0].b = BTREE_ITER_NO_NODE_UP; + iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); return; } @@ -1544,6 +1559,11 @@ out: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_iter_verify(iter); +#ifdef CONFIG_BCACHEFS_DEBUG + trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, + iter->btree_id, + &old_pos, &new_pos, l); +#endif } inline bool bch2_btree_iter_advance(struct btree_iter *iter) @@ -2062,13 +2082,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, best = iter; } - trace_trans_get_iter(_RET_IP_, trans->ip, - btree_id, - &real_pos, locks_want, - best ? &best->real_pos : &pos_min, - best ? best->locks_want : 0, - best ? best->uptodate : BTREE_ITER_NEED_TRAVERSE); - if (!best) { iter = btree_trans_iter_alloc(trans); bch2_btree_iter_init(trans, iter, btree_id); @@ -2097,7 +2110,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > iter->locks_want) { iter->locks_want = locks_want; - btree_iter_get_locks(iter, true, false); + btree_iter_get_locks(iter, true, _THIS_IP_); } while (iter->level != depth) { @@ -2115,6 +2128,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, pos); btree_iter_set_search_pos(iter, real_pos); + trace_trans_get_iter(_RET_IP_, trans->ip, + btree_id, + &real_pos, locks_want, iter->uptodate, + best ? &best->real_pos : &pos_min, + best ? best->locks_want : U8_MAX, + best ? 
best->uptodate : U8_MAX); + return iter; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2f63adb9e420..01b834bf79f7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -111,7 +111,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_iter_relock(struct btree_iter *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 493d65882222..50595f5f158b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -230,6 +230,7 @@ enum btree_iter_uptodate { #define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) #define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) +#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) /* * @pos - iterator's current position diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 569db972f3bb..bb01b036c7a2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -955,7 +955,9 @@ retry: * instead of locking/reserving all the way to the root: */ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip); + trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, + iter->btree_id, + &iter->real_pos); return ERR_PTR(-EINTR); } @@ -996,7 +998,7 @@ retry: * closure argument */ if (flags & BTREE_INSERT_NOUNLOCK) { - trace_trans_restart_journal_preres_get(trans->ip); + trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); ret = -EINTR; goto err; } @@ -1012,7 +1014,7 @@ retry: BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans->ip); + trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 839262c9501a..9eb31d31ed42 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -228,7 +228,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, } static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, + unsigned long trace_ip) { struct bch_fs *c = trans->c; int ret; @@ -241,7 +242,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) return ret; if (!bch2_trans_relock(trans)) { - trace_trans_restart_journal_preres_get(trans->ip); + trace_trans_restart_journal_preres_get(trans->ip, trace_ip); return -EINTR; } @@ -368,7 +369,8 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -378,7 +380,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->ip); + trace_trans_restart_fault_inject(trans->ip, trace_ip); return -EINTR; } @@ -525,7 +527,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree * Get journal reservation, take write locks, and attempt to do btree update(s): */ static inline int do_bch2_trans_commit(struct btree_trans *trans, 
- struct btree_insert_entry **stopped_at) + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -559,7 +562,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ? JOURNAL_RES_GET_RESERVED : 0)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, - trans->journal_preres_u64s); + trans->journal_preres_u64s, trace_ip); if (unlikely(ret)) return ret; @@ -578,7 +581,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (iter->nodes_locked != iter->nodes_intent_locked) { if (btree_iter_keep(trans, iter)) { if (!bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip); + trace_trans_restart_upgrade(trans->ip, trace_ip, + iter->btree_id, + &iter->real_pos); return -EINTR; } } else { @@ -606,7 +611,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); - ret = bch2_trans_commit_write_locked(trans, stopped_at); + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) @@ -644,7 +649,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c) static noinline int bch2_trans_commit_error(struct btree_trans *trans, struct btree_insert_entry *i, - int ret) + int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; unsigned flags = trans->flags; @@ -685,7 +690,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (!ret || ret == -EINTR || (flags & BTREE_INSERT_NOUNLOCK)) { - trace_trans_restart_btree_node_split(trans->ip); + trace_trans_restart_btree_node_split(trans->ip, trace_ip, + i->iter->btree_id, + &i->iter->real_pos); ret = -EINTR; } break; @@ -703,7 +710,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_mark_replicas(trans->ip); + trace_trans_restart_mark_replicas(trans->ip, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: @@ -720,13 +727,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_res_get(trans->ip); + trace_trans_restart_journal_res_get(trans->ip, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->ip); + trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -736,7 +743,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_reclaim(trans->ip); + trace_trans_restart_journal_reclaim(trans->ip, trace_ip); ret = -EINTR; break; default: @@ -950,7 +957,9 @@ int __bch2_trans_commit(struct btree_trans *trans) i->trigger_flags); if (unlikely(ret)) { if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->iter->btree_id, + &i->iter->pos); goto out; } } @@ -976,12 +985,16 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update2(trans, i) { ret = bch2_btree_iter_traverse(i->iter); if (unlikely(ret)) { - trace_trans_restart_traverse(trans->ip); + trace_trans_restart_traverse(trans->ip, _RET_IP_, + i->iter->btree_id, + &i->iter->pos); goto out; } if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { - trace_trans_restart_upgrade(trans->ip); + 
trace_trans_restart_upgrade(trans->ip, _RET_IP_, + i->iter->btree_id, + &i->iter->pos); ret = -EINTR; goto out; } @@ -997,7 +1010,7 @@ int __bch2_trans_commit(struct btree_trans *trans) retry: memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, &i); + ret = do_bch2_trans_commit(trans, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_btree_trans_verify_locks(trans); @@ -1023,7 +1036,7 @@ out_reset: return ret; err: - ret = bch2_trans_commit_error(trans, i, ret); + ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); if (ret) goto out; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d447b79bd6ee..ae2aee8ddee8 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -541,59 +541,66 @@ TRACE_EVENT(copygc_wait, ); TRACE_EVENT(trans_get_iter, - TP_PROTO(unsigned long caller, unsigned long ip, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, enum btree_id btree_id, - struct bpos *pos_want, - unsigned locks_want, - struct bpos *pos_found, - unsigned locks_found, - unsigned uptodate), - TP_ARGS(caller, ip, btree_id, - pos_want, locks_want, - pos_found, locks_found, - uptodate), + struct bpos *got_pos, + unsigned got_locks, + unsigned got_uptodate, + struct bpos *src_pos, + unsigned src_locks, + unsigned src_uptodate), + TP_ARGS(trans_ip, caller_ip, btree_id, + got_pos, got_locks, got_uptodate, + src_pos, src_locks, src_uptodate), TP_STRUCT__entry( - __field(unsigned long, caller ) - __field(unsigned long, ip ) - __field(u8, btree_id ) - __field(u8, uptodate ) - __field(u8, locks_want ) - __field(u8, locks_found ) - __field(u64, pos_want_inode ) - __field(u64, pos_want_offset ) - __field(u32, pos_want_snapshot ) - __field(u64, pos_found_inode ) - __field(u64, pos_found_offset ) - __field(u32, pos_found_snapshot ) + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, got_pos_inode ) + __field(u64, got_pos_offset ) + __field(u32, got_pos_snapshot ) + __field(u8, got_locks ) + __field(u8, got_uptodate ) + __field(u64, src_pos_inode ) + __field(u64, src_pos_offset ) + __field(u32, src_pos_snapshot ) + __field(u8, src_locks ) + __field(u8, src_uptodate ) ), TP_fast_assign( - __entry->caller = caller; - __entry->ip = ip; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; - __entry->uptodate = uptodate; - __entry->pos_want_inode = pos_want->inode; - __entry->pos_want_offset = pos_want->offset; - __entry->pos_want_snapshot = pos_want->snapshot; - __entry->pos_found_inode = pos_found->inode; - __entry->pos_found_offset = pos_found->offset; - __entry->pos_found_snapshot = pos_found->snapshot; - ), - - TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u", - (void *) __entry->caller, - (void *) __entry->ip, + __entry->got_pos_inode = got_pos->inode; + __entry->got_pos_offset = got_pos->offset; + __entry->got_pos_snapshot = got_pos->snapshot; + __entry->got_locks = got_locks; + __entry->got_uptodate = got_uptodate; + __entry->src_pos_inode = src_pos->inode; + __entry->src_pos_offset = src_pos->offset; + __entry->src_pos_snapshot = src_pos->snapshot; + __entry->src_locks = src_locks; + __entry->src_uptodate = src_uptodate; + ), + + TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " + "src %llu:%llu:%u l %u u %u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, __entry->btree_id, - __entry->uptodate, - __entry->pos_want_inode, - __entry->pos_want_offset, 
- __entry->pos_want_snapshot, - __entry->locks_want, - __entry->pos_found_inode, - __entry->pos_found_offset, - __entry->pos_found_snapshot, - __entry->locks_found) + __entry->got_pos_inode, + __entry->got_pos_offset, + __entry->got_pos_snapshot, + __entry->got_locks, + __entry->got_uptodate, + __entry->src_pos_inode, + __entry->src_pos_offset, + __entry->src_pos_snapshot, + __entry->src_locks, + __entry->src_uptodate) ); TRACE_EVENT(transaction_restart_ip, @@ -614,28 +621,241 @@ TRACE_EVENT(transaction_restart_ip, ); DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(unsigned long ip), - TP_ARGS(ip), + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip), TP_STRUCT__entry( - __field(unsigned long, ip ) + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) ), TP_fast_assign( - __entry->ip = ip; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; ), - TP_printk("%pS", (void *) __entry->ip) + TP_printk("%pS %pS", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_traverse_all, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip), + TP_ARGS(trans_ip, caller_ip) +); + +DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + ), + + TP_printk("%ps %pS btree %u pos %llu:%llu:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + 
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_ip, caller_ip, btree_id, pos) +); + +TRACE_EVENT(iter_traverse, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + int ret), + TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + __field(s32, ret ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + __entry->ret = ret; + ), + + TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->ret) +); + +TRACE_EVENT(iter_set_search_pos, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *old_pos, + struct bpos *new_pos, + unsigned good_level), + TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, old_pos_inode ) + __field(u64, old_pos_offset ) + __field(u32, old_pos_snapshot ) + __field(u64, new_pos_inode ) + __field(u64, new_pos_offset ) + __field(u32, new_pos_snapshot ) + __field(u8, good_level ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->old_pos_inode = old_pos->inode; + __entry->old_pos_offset = old_pos->offset; + __entry->old_pos_snapshot = old_pos->snapshot; + __entry->new_pos_inode = new_pos->inode; + __entry->new_pos_offset = new_pos->offset; + __entry->new_pos_snapshot = new_pos->snapshot; + __entry->good_level = good_level; + ), + + TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->old_pos_inode, + __entry->old_pos_offset, + 
__entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot, + __entry->good_level) ); TRACE_EVENT(trans_restart_would_deadlock, @@ -730,97 +950,70 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_mark, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_upgrade, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_relock, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_restart_traverse, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - -DEFINE_EVENT(transaction_restart, trans_traverse_all, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - DECLARE_EVENT_CLASS(node_lock_fail, - TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(level, iter_seq, node, node_seq), + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(trans_ip, caller_ip, btree_id, pos, + level, iter_seq, node, node_seq), TP_STRUCT__entry( - __field(u32, level) - __field(u32, iter_seq) - __field(u32, node) - __field(u32, node_seq) + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + __field(u32, level ) + __field(u32, iter_seq ) + __field(u32, node ) + __field(u32, node_seq ) ), TP_fast_assign( - __entry->level = level; - __entry->iter_seq = iter_seq; - __entry->node = node; - __entry->node_seq = node_seq; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + __entry->level = level; + __entry->iter_seq = iter_seq; + __entry->node = node; + __entry->node_seq = node_seq; ), - TP_printk("level %u iter seq %u node %u node seq %u", + TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, __entry->level, __entry->iter_seq, __entry->node, __entry->node_seq) ); DEFINE_EVENT(node_lock_fail, node_upgrade_fail, - TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(level, iter_seq, node, node_seq) + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 
node_seq), + TP_ARGS(trans_ip, caller_ip, btree_id, pos, + level, iter_seq, node, node_seq) ); DEFINE_EVENT(node_lock_fail, node_relock_fail, - TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(level, iter_seq, node, node_seq) + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(trans_ip, caller_ip, btree_id, pos, + level, iter_seq, node, node_seq) ); #endif /* _TRACE_BCACHEFS_H */ -- cgit From 66a0a49750d77926a6d4b47d7ac1f4904094fb7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Jun 2021 17:17:45 -0400 Subject: bcachefs: btree_iter->should_be_locked Add a field to struct btree_iter for tracking whether it should be locked - this fixes spurious transaction restarts in bch2_trans_relock(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 ++++++++++++++++++++++++++-- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_types.h | 8 +++++++- 3 files changed, 34 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 78eae2169347..7246519b3250 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -463,12 +463,20 @@ void bch2_trans_downgrade(struct btree_trans *trans) /* Btree transaction locking: */ +static inline bool btree_iter_should_be_locked(struct btree_trans *trans, + struct btree_iter *iter) +{ + return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || + iter->should_be_locked; +} + bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (!bch2_btree_iter_relock(iter, _RET_IP_)) { + if (!bch2_btree_iter_relock(iter, _RET_IP_) && + btree_iter_should_be_locked(trans, iter)) { trace_trans_restart_relock(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos); return false; @@ -1427,9 +1435,16 @@ btree_iter_traverse(struct btree_iter *iter) int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { + int ret; + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - return btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); + if (ret) + return ret; + + iter->should_be_locked = true; + return 0; } /* Iterate across nodes (leaf and interior nodes) */ @@ -1455,6 +1470,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->pos = iter->real_pos = b->key.k.p; bch2_btree_iter_verify(iter); + iter->should_be_locked = true; return b; } @@ -1511,6 +1527,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->pos = iter->real_pos = b->key.k.p; bch2_btree_iter_verify(iter); + iter->should_be_locked = true; return b; } @@ -1527,6 +1544,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p goto out; iter->real_pos = new_pos; + iter->should_be_locked = false; if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); @@ -1686,6 +1704,7 @@ start: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + iter->should_be_locked = true; return k; } @@ -1770,6 +1789,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + iter->should_be_locked = true; return k; no_key: /* @@ -1869,6 +1889,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + iter->should_be_locked = true; + 
return k; } @@ -1906,6 +1928,8 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) bkey_cmp(iter->pos, ck->key.pos)); BUG_ON(!ck->valid); + iter->should_be_locked = true; + return bkey_i_to_s_c(ck->k); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 01b834bf79f7..a2ce711fd61f 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -178,6 +178,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; + iter->should_be_locked = false; } /* Sort order for locking btree iterators: */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 50595f5f158b..bb0f92e3b3ab 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -252,7 +252,13 @@ struct btree_iter { u8 idx; enum btree_id btree_id:4; - enum btree_iter_uptodate uptodate:4; + enum btree_iter_uptodate uptodate:3; + /* + * True if we've returned a key (and thus are expected to keep it + * locked), false after set_pos - for avoiding spurious transaction + * restarts in bch2_trans_relock(): + */ + bool should_be_locked:1; unsigned level:4, min_depth:4, locks_want:4, -- cgit From 59e2480ff7360b5c6ecd418aee795a7087a3e8f6 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Sat, 5 Jun 2021 19:03:16 -0400 Subject: bcachefs: do not compile acl mod on minimal config Do not compile the acl.o target if BCACHEFS_POSIX_ACL is not enabled. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 ++- fs/bcachefs/xattr.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index dad2fe2530e5..af3b83f871df 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -2,7 +2,6 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o bcachefs-y := \ - acl.o \ alloc_background.o \ alloc_foreground.o \ bkey.o \ @@ -59,3 +58,5 @@ bcachefs-y := \ util.o \ varint.o \ xattr.o + +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f18a795620d8..1993bfcee788 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -562,8 +562,10 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_user_handler, +#ifdef CONFIG_BCACHEFS_POSIX_ACL &nop_posix_acl_access, &nop_posix_acl_default, +#endif &bch_xattr_trusted_handler, &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS -- cgit From ca47fa236278312e1931b303416a79863f80936b Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Sun, 6 Jun 2021 09:29:42 -0600 Subject: bcachefs: Fix unitialized use of a value Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 5a8b0a7b7197..64fdf53a630a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -442,6 +442,8 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, unsigned i; int ret; + memset(&search, 0, sizeof(search)); + for (i = 0; i < cached.nr; i++) { bch2_replicas_entry_cached(&search.e, cached.devs[i]); -- cgit From 7138f220973307d4f130eec33e4cb3f571a0f413 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Jun 2021 16:29:24 -0400 Subject: bcachefs: Fix a spurious debug mode assertion When we 
switched to using bch2_btree_bset_insert_key() for extents it turned out it started leaving invalid keys around - of type deleted but nonzero size - but this is fine (if ugly) because they're never written out. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7246519b3250..17338410d1fe 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -846,7 +846,14 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (bch2_debug_check_bkeys) + /* + * XXX: bch2_btree_bset_insert_key() generates invalid keys when we + * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key + * being overwritten but doesn't change k->size. But this is ok, because + * those keys are never written out, we just have to avoid a spurious + * assertion here: + */ + if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; -- cgit From 224ec3e677ca82eb4216a0916ce7b04f401e90e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Jun 2021 22:50:30 -0400 Subject: bcachefs: Don't mark superblocks past end of usable space bcachefs-tools recently started putting a backup superblock at the end of the device. This causes a problem if the bucket size doesn't divide the device size - but we can fix it by just skipping marking that part. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 12 ++++++++++++ fs/bcachefs/super.c | 5 +++++ 2 files changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c9e299706c74..db8c3b7f5fa1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -635,6 +635,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(type != BCH_DATA_sb && type != BCH_DATA_journal); + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return; + preempt_disable(); if (likely(c)) { @@ -2088,6 +2094,12 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, }; int ret = 0; + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); if (IS_ERR(a)) return PTR_ERR(a); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ef7322a8b460..619cfdcd2934 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1826,6 +1826,11 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) goto err; } + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + goto err; + } + mutex_lock(&c->sb_lock); mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; mi->nbuckets = cpu_to_le64(nbuckets); -- cgit From 74cc1abdbf50fcdaedf4bcf9a800c575ec44e17a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 13:21:39 -0400 Subject: bcachefs: Fix a buffer overrun In make_extent_indirect(), we were allocating too small of a buffer for the new indirect extent. 
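The fix below swaps bkey_val_bytes() for bkey_bytes(): the former counts only the value portion of the key, the latter the bkey header plus the value, which is what the new indirect extent actually needs (a refcount followed by the whole key). A rough sketch of the distinction, assuming the usual semantics of these helpers at this point in the code's history:

	/* sketch, not from the patch: what each helper measures */
	unsigned val_bytes = bkey_val_bytes(&orig->k);	/* value only */
	unsigned key_bytes = bkey_bytes(&orig->k);	/* bkey header + value */

	/* the indirect extent needs a refcount plus the whole key: */
	r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + key_bytes);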
Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a420729288d4..6aa37726341d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -138,7 +138,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, /* rewind iter to start of hole, if necessary: */ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; @@ -159,12 +159,6 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; - r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); - if (IS_ERR(r_p)) { - ret = PTR_ERR(r_p); - goto err; - } - orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); -- cgit From 4351d3ecb4a2d0c7165b3b72bc4bd1c02371685d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Jun 2021 13:28:50 -0400 Subject: bcachefs: More topology repair code This improves the handling of overlapping btree nodes; now, we handle the case where one btree node completely overwrites another. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 129 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 93 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 20830b2e007f..b94fac1bc114 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -36,6 +36,9 @@ #include #include +#define DROP_THIS_NODE 10 +#define DROP_PREV_NODE 11 + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -203,8 +206,8 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_start(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur) { struct bpos expected_start = !prev ? 
b->data->min_key @@ -220,22 +223,50 @@ static int btree_repair_node_start(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); } - if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) { - if (prev && - bpos_cmp(expected_start, cur->data->min_key) > 0 && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_cmp(expected_start, cur->data->min_key) > 0 && + BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { + /* cur overwrites prev: */ + + if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, + cur->data->min_key) >= 0, c, + "btree node overwritten by next node at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, buf2)) + return DROP_PREV_NODE; + + if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, buf2)) ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - else - ret = set_node_min(c, cur, expected_start); - if (ret) - return ret; + bpos_predecessor(cur->data->min_key)); + } else { + /* prev overwrites cur: */ + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, + cur->data->max_key) >= 0, c, + "btree node overwritten by prev node at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, buf2)) + return DROP_THIS_NODE; + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, buf2)) + ret = set_node_min(c, cur, expected_start); } fsck_err: return ret; @@ -262,13 +293,11 @@ fsck_err: return ret; } -#define DROP_THIS_NODE 10 - static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf tmp; + struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; char buf[200]; @@ -277,8 +306,10 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) if (!b->c.level) return 0; again: + prev = NULL; have_child = dropped_children = false; - bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_init(&prev_k); + bch2_bkey_buf_init(&cur_k); bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { @@ -286,9 +317,9 @@ again: BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); bch2_btree_and_journal_iter_advance(&iter); - bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_bkey_buf_reassemble(&cur_k, c, k); - cur = bch2_btree_node_get_noiter(c, tmp.k, + cur = bch2_btree_node_get_noiter(c, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -298,12 +329,12 @@ again: " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { - bch2_btree_node_evict(c, tmp.k); + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { + bch2_btree_node_evict(c, cur_k.k); ret = 
bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, tmp.k->k.p); + b->c.level, cur_k.k->k.p); if (ret) - goto err; + break; continue; } @@ -313,14 +344,39 @@ again: break; } - ret = btree_repair_node_start(c, b, prev, cur); + ret = btree_repair_node_boundaries(c, b, prev, cur); + + if (ret == DROP_THIS_NODE) { + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + if (ret) + break; + continue; + } + if (prev) six_unlock_read(&prev->c.lock); - prev = cur; - cur = NULL; + prev = NULL; - if (ret) + if (ret == DROP_PREV_NODE) { + bch2_btree_node_evict(c, prev_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, prev_k.k->k.p); + if (ret) + break; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); + goto again; + } else if (ret) break; + + prev = cur; + cur = NULL; + bch2_bkey_buf_copy(&prev_k, c, cur_k.k); } if (!ret && !IS_ERR_OR_NULL(prev)) { @@ -342,10 +398,10 @@ again: bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); - cur = bch2_btree_node_get_noiter(c, tmp.k, + cur = bch2_btree_node_get_noiter(c, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -361,9 +417,9 @@ again: cur = NULL; if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(c, tmp.k); + bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, tmp.k->k.p); + b->c.level, cur_k.k->k.p); dropped_children = true; } @@ -388,7 +444,8 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); if (!ret && dropped_children) goto again; -- cgit From c205321b12058afc757a5a41352b5042a27b7223 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Apr 2021 22:26:53 -0400 Subject: bcachefs: Drop all btree locks when submitting btree node reads As a rule we don't want to be holding btree locks while submitting IO - this will improve overall filesystem latency. 
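The pattern the patch applies, condensed from the bch2_btree_node_fill() hunk in the diff below (error paths elided): drop all transaction locks before issuing the read, then try to retake them and restart the transaction if that fails.

	/* condensed from bch2_btree_node_fill() in this patch: */
	if (iter && sync)
		bch2_trans_unlock(iter->trans);	/* never hold btree locks across IO */

	bch2_btree_node_read(c, b, sync);

	if (iter && !bch2_trans_relock(iter->trans)) {
		six_unlock_intent(&b->c.lock);
		return ERR_PTR(-EINTR);		/* caller restarts the transaction */
	}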
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 39 +++++++++++++++++++++++++++++---------- fs/bcachefs/btree_iter.c | 4 ++++ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5991ebee228c..15f597ab03e7 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -663,13 +663,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } - /* - * Unlock before doing IO: - * - * XXX: ideally should be dropping all btree node locks here - */ - if (iter && btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + /* Unlock before doing IO: */ + if (iter && sync) + bch2_trans_unlock(iter->trans); bch2_btree_node_read(c, b, sync); @@ -680,6 +676,16 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } + /* + * XXX: this will probably always fail because btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid btree + * node + */ + if (iter && !bch2_trans_relock(iter->trans)) { + six_unlock_intent(&b->c.lock); + return ERR_PTR(-EINTR); + } + if (lock_type == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); @@ -824,9 +830,22 @@ lock_node: } } - /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + if (unlikely(btree_node_read_in_flight(b))) { + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(iter->trans); + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + /* + * XXX: check if this always fails - btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid + * btree node + */ + if (iter && !bch2_trans_relock(iter->trans)) + return ERR_PTR(-EINTR); + goto retry; + } prefetch(b->aux_data); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 17338410d1fe..5c38562ab206 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1191,7 +1191,11 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); iter->level = level; + + bch2_btree_iter_verify_locks(iter); err: bch2_bkey_buf_exit(&tmp, c); return ret; -- cgit From 509d3e0a8dc9714ca9ac2477f188eced3bc4a205 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Mar 2021 15:12:05 -0400 Subject: bcachefs: Child btree iterators This adds the ability for btree iterators to own child iterators - to be used by an upcoming rework of bch2_btree_iter_peek_slot(), so we can scan forwards while maintaining our current position. 
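The intended use appears in a later patch in this series reworking bch2_btree_iter_peek_slot(); roughly, the parent scans ahead on a child iterator so its own position and locks are untouched:

	/* sketch of the upcoming peek_slot() usage: */
	struct btree_iter *child = btree_iter_child_alloc(iter, _THIS_IP_);

	btree_iter_copy(child, iter);		/* child starts where the parent is */
	k = bch2_btree_iter_peek(child);	/* advances the child, not the parent */

	if (k.k && !bkey_err(k))
		iter->k = child->k;		/* parent reports the key, pos unchanged */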
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 68 ++++++++++++++++++++++++++++++++++++++++------- fs/bcachefs/btree_iter.h | 6 +++++ fs/bcachefs/btree_types.h | 18 ++++++++----- 3 files changed, 75 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5c38562ab206..eccc7a39df01 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -18,6 +18,9 @@ #include static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); +static void btree_iter_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { @@ -1967,9 +1970,39 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ +static void btree_iter_child_free(struct btree_iter *iter) +{ + struct btree_iter *child = btree_iter_child(iter); + + if (child) { + bch2_trans_iter_free(iter->trans, child); + iter->child_idx = U8_MAX; + } +} + +static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, + unsigned long ip) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter *child = btree_iter_child(iter); + + if (!child) { + child = btree_trans_iter_alloc(trans); + child->ip_allocated = ip; + iter->child_idx = child->idx; + + trans->iters_live |= 1ULL << child->idx; + trans->iters_touched |= 1ULL << child->idx; + } + + return child; +} + static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { + btree_iter_child_free(&trans->iters[idx]); + __bch2_btree_iter_unlock(&trans->iters[idx]); trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); @@ -2037,6 +2070,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { + struct btree_iter *iter; unsigned idx; if (unlikely(trans->iters_linked == @@ -2044,21 +2078,28 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) btree_trans_iter_alloc_fail(trans); idx = __ffs64(~trans->iters_linked); - + iter = &trans->iters[idx]; + + iter->trans = trans; + iter->idx = idx; + iter->child_idx = U8_MAX; + iter->flags = 0; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; trans->iters_linked |= 1ULL << idx; - trans->iters[idx].idx = idx; - trans->iters[idx].flags = 0; - return &trans->iters[idx]; + return iter; } -static inline void btree_iter_copy(struct btree_iter *dst, - struct btree_iter *src) +static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) { - unsigned i, idx = dst->idx; + unsigned i, offset = offsetof(struct btree_iter, flags); - *dst = *src; - dst->idx = idx; - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + __bch2_btree_iter_unlock(dst); + btree_iter_child_free(dst); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_iter) - offset); for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(dst, i)) @@ -2365,6 +2406,13 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG + if (trans->iters_live) { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + btree_iter_child_free(iter); + } + if (trans->iters_live) { struct btree_iter *iter; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 
a2ce711fd61f..18732ca531ec 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -181,6 +181,12 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->should_be_locked = false; } +static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +{ + return iter->child_idx == U8_MAX ? NULL + : iter->trans->iters + iter->child_idx; +} + /* Sort order for locking btree iterators: */ static inline int btree_iter_lock_cmp(const struct btree_iter *l, const struct btree_iter *r) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bb0f92e3b3ab..97e021648685 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -241,15 +241,20 @@ enum btree_iter_uptodate { */ struct btree_iter { struct btree_trans *trans; - struct bpos pos; - /* what we're searching for/what the iterator actually points to: */ - struct bpos real_pos; - struct bpos pos_after_commit; + unsigned long ip_allocated; + + u8 idx; + u8 child_idx; + + /* btree_iter_copy starts here: */ + u16 flags; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ unsigned snapshot; - u16 flags; - u8 idx; + struct bpos pos; + struct bpos real_pos; + struct bpos pos_after_commit; enum btree_id btree_id:4; enum btree_iter_uptodate uptodate:3; @@ -276,7 +281,6 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; - unsigned long ip_allocated; }; static inline enum btree_iter_type -- cgit From 5288e66a7b732aae8a905ddba86b3b65acb6a911 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Jun 2021 00:29:49 -0400 Subject: bcachefs: BTREE_ITER_WITH_UPDATES This drops bch2_btree_iter_peek_with_updates() and replaces it with a new flag, BTREE_ITER_WITH_UPDATES, and also reworks bch2_btree_iter_peek_slot() to respect it too. 
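Callers that previously used the _with_updates() variants now just pass the flag when they get the iterator; condensed from the extent_handle_overwrites() change in the diff below:

	/* the flag makes peek() also return keys from this transaction's
	 * pending (uncommitted) updates: */
	for_each_btree_key(trans, iter, btree_id, start,
			   BTREE_ITER_INTENT|
			   BTREE_ITER_WITH_UPDATES, k, ret) {
		if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
			break;
		/* trim/split the overlapping extent as before */
	}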
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 79 ++++++++++++++++++++--------------------- fs/bcachefs/btree_iter.h | 3 -- fs/bcachefs/btree_types.h | 13 +++---- fs/bcachefs/btree_update_leaf.c | 12 +++---- 4 files changed, 50 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eccc7a39df01..d6de24e92339 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -864,10 +864,9 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, /* peek_all() doesn't skip deleted keys */ static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, - struct btree_iter_level *l, - struct bkey *u) + struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, u, + return __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } @@ -1651,23 +1650,39 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return ret; } -static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) +static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter, + struct bpos pos) { struct btree_insert_entry *i; + struct bkey_i *ret = NULL; - trans_for_each_update2(trans, i) - if ((cmp_int(btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->iter->btree_id) - return i->k; + trans_for_each_update2(iter->trans, i) { + if (i->btree_id < iter->btree_id) + continue; + if (i->btree_id > iter->btree_id) break; - } + if (bpos_cmp(i->k->k.p, pos) < 0) + continue; + if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) + ret = i->k; + } - return NULL; + return ret; +} + +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, + struct bpos pos) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __btree_trans_peek_updates(iter, pos) + : NULL; } -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; @@ -1678,9 +1693,7 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); start: - next_update = with_updates - ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) - : NULL; + next_update = btree_trans_peek_updates(iter, search_key); btree_iter_set_search_pos(iter, search_key); while (1) { @@ -1691,8 +1704,10 @@ start: k = btree_iter_level_peek(iter, &iter->l[0]); if (next_update && - bpos_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { + iter->k = next_update->k; k = bkey_i_to_s_c(next_update); + } if (likely(k.k)) { if (bkey_deleted(k.k)) { @@ -1722,15 +1737,6 @@ start: return k; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, false); -} - /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -1743,19 +1749,6 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, true); -} - -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_with_updates(iter); -} - /** * bch2_btree_iter_peek_prev: returns first key less than or equal to * iterator's current position @@ -1767,6 +1760,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1890,7 +1884,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - k = btree_iter_level_peek_all(iter, l, &iter->k); + k = btree_iter_level_peek_all(iter, l); EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); @@ -1926,12 +1920,17 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) { + struct bkey_i *next_update; struct bkey_cached *ck; int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); bch2_btree_iter_verify(iter); + next_update = btree_trans_peek_updates(iter, iter->pos); + if (next_update && !bpos_cmp(next_update->k.p, iter->pos)) + return bkey_i_to_s_c(next_update); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 18732ca531ec..ba98cfea4d60 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -153,9 +153,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); - struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 97e021648685..89780b4aa057 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -209,12 +209,13 @@ enum btree_iter_type { * @pos or the first key strictly greater than @pos */ #define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_ERROR (1 << 7) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) 
-#define BTREE_ITER_CACHED_NOFILL (1 << 9) -#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -#define BTREE_ITER_NOT_EXTENTS (1 << 11) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_NOT_EXTENTS (1 << 7) +#define BTREE_ITER_ERROR (1 << 8) +#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) +#define BTREE_ITER_CACHED_NOFILL (1 << 10) +#define BTREE_ITER_CACHED_NOCREATE (1 << 11) +#define BTREE_ITER_WITH_UPDATES (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9eb31d31ed42..123127980853 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -841,13 +841,11 @@ static int extent_handle_overwrites(struct btree_trans *trans, struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; struct bkey_s_c k; - int ret = 0; - - iter = bch2_trans_get_iter(trans, btree_id, start, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_with_updates(iter); + int ret; - while (k.k && !(ret = bkey_err(k))) { + for_each_btree_key(trans, iter, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES, k, ret) { if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) break; @@ -898,8 +896,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_trans_iter_put(trans, update_iter); break; } - - k = bch2_btree_iter_next_with_updates(iter); } bch2_trans_iter_put(trans, iter); -- cgit From e750296bf5599b748360b3497bcfc7243dceb185 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Dec 2022 22:41:38 -0500 Subject: bcachefs: bch2_btree_iter_peek_slot() now supports BTREE_ITER_WITH_UPDATES Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d6de24e92339..fa21739c24bc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1868,6 +1868,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + struct bkey_i *next_update; struct bkey_s_c k; int ret; @@ -1885,9 +1886,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) return bkey_s_c_err(ret); k = btree_iter_level_peek_all(iter, l); - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + next_update = btree_trans_peek_updates(iter, iter->pos); + if (next_update && + (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + if (!k.k || bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); -- cgit From 1d214eb18d6670e3b0415340df0a68b47ca6e5fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Dec 2022 19:15:53 -0500 Subject: bcachefs: Kill __bch2_btree_iter_peek_slot_extents() This codepath won't just be for extents in the future, it'll also be for BTREE_ITER_FILTER_SNAPSHOTS mode. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 114 ++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fa21739c24bc..6f313e598335 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1822,53 +1822,9 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) return bch2_btree_iter_peek_prev(iter); } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) -{ - struct bkey_s_c k; - struct bpos pos, next_start; - - /* keys & holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); - } - - pos = iter->pos; - k = bch2_btree_iter_peek(iter); - iter->pos = pos; - - if (bkey_err(k)) - return k; - - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) - return k; - - next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; - - bkey_init(&iter->k); - iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next_start.inode == iter->pos.inode - ? next_start.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); - - EBUG_ON(!iter->k.size); - - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - - return (struct bkey_s_c) { &iter->k, NULL }; -} - struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_i *next_update; + struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; int ret; @@ -1876,30 +1832,66 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + btree_iter_set_search_pos(iter, search_key); + + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return __bch2_btree_iter_peek_slot_extents(iter); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - k = btree_iter_level_peek_all(iter, l); - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bkey_i *next_update = btree_trans_peek_updates(iter, search_key); - next_update = btree_trans_peek_updates(iter, iter->pos); - if (next_update && - (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - } + k = btree_iter_level_peek_all(iter, &iter->l[0]); + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); - if (!k.k || bkey_cmp(iter->pos, k.k->p)) { - /* hole */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - k = (struct bkey_s_c) { &iter->k, NULL }; + if (next_update && + (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + + if (!k.k || + ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) + ? 
bpos_cmp(iter->pos, k.k->p) + : bkey_cmp(iter->pos, k.k->p))) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + k = (struct bkey_s_c) { &iter->k, NULL }; + } + } else { + struct bpos next; + struct bpos pos = iter->pos; + + k = bch2_btree_iter_peek(iter); + iter->pos = pos; + + if (unlikely(bkey_err(k))) + return k; + + next = k.k ? bkey_start_pos(k.k) : POS_MAX; + + if (bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + + k = (struct bkey_s_c) { &iter->k, NULL }; + EBUG_ON(!k.k->size); + } } bch2_btree_iter_verify_entry_exit(iter); -- cgit From b1d87f527d7e6eb89395d4a0218b7e4e3974ff1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Dec 2022 19:15:53 -0500 Subject: bcachefs: bch2_btree_iter_peek_slot() now saves initial position when searching Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6f313e598335..491cc279a973 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1869,10 +1869,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } else { struct bpos next; - struct bpos pos = iter->pos; - k = bch2_btree_iter_peek(iter); - iter->pos = pos; + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter *child = + btree_iter_child_alloc(iter, _THIS_IP_); + + btree_iter_copy(child, iter); + k = bch2_btree_iter_peek(child); + + if (k.k && !bkey_err(k)) + iter->k = child->k; + } else { + struct bpos pos = iter->pos; + + k = bch2_btree_iter_peek(iter); + iter->pos = pos; + } if (unlikely(bkey_err(k))) return k; -- cgit From 8e6bbc4181c9eb1bc8dcb0a96522447c6b6ad76e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Jun 2021 00:18:34 -0400 Subject: bcachefs: Move extent_handle_overwrites() to bch2_trans_update() This lifts handling of overlapping extents out of __bch2_trans_commit() and moves it to where we first do the update - which means that BTREE_ITER_WITH_UPDATES can now work correctly in extents mode. Also, this patch reworks how extent triggers work: previously, on partial extent overwrite we would pass this information to the trigger, telling it what part of the extent was being overwritten. But, this approach has had too many subtle corner cases - now, we only mark whole extents, meaning on partial extent overwrite we unmark the old extent and mark the new extent. 
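Concretely, the trigger path now makes two whole-extent passes instead of computing partial-overlap deltas; condensed from the bch2_trans_mark_update() hunk in the buckets.c diff below:

	/* mark the new extent in full, then unmark the old extent in full: */
	ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
				  0, new->k.size,
				  BTREE_TRIGGER_INSERT|flags) ?:
	      bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
				  0, -((s64) old.k->size),
				  BTREE_TRIGGER_OVERWRITE|flags);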
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_update_leaf.c | 157 +++++++++++----------------------------- fs/bcachefs/buckets.c | 145 +++++++++---------------------------- 3 files changed, 77 insertions(+), 227 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 491cc279a973..93952d9f7ebc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1656,7 +1656,7 @@ static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *ite struct btree_insert_entry *i; struct bkey_i *ret = NULL; - trans_for_each_update2(iter->trans, i) { + trans_for_each_update(iter->trans, i) { if (i->btree_id < iter->btree_id) continue; if (i->btree_id > iter->btree_id) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 123127980853..0a31270e3caf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -775,7 +775,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static void __bch2_trans_update2(struct btree_trans *trans, +static void bch2_trans_update2(struct btree_trans *trans, struct btree_insert_entry n) { struct btree_insert_entry *i; @@ -798,44 +798,23 @@ static void __bch2_trans_update2(struct btree_trans *trans, i - trans->updates2, n); } -static void bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - __bch2_trans_update2(trans, (struct btree_insert_entry) { - .bkey_type = __btree_node_type(iter->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->level, - .iter = iter, - .k = insert, - }); -} - static int extent_update_to_keys(struct btree_trans *trans, struct btree_insert_entry n) { - int ret; - - ret = bch2_extent_can_insert(trans, n.iter, n.k); - if (ret) - return ret; - - if (bkey_deleted(&n.k->k)) - return 0; - n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); n.is_extent = false; - __bch2_trans_update2(trans, n); + bch2_trans_update2(trans, n); bch2_trans_iter_put(trans, n.iter); return 0; } static int extent_handle_overwrites(struct btree_trans *trans, enum btree_id btree_id, - struct bkey_i *insert) + struct bkey_i *insert, + unsigned trigger_flags) { struct btree_iter *iter, *update_iter; struct bpos start = bkey_start_pos(&insert->k); @@ -861,7 +840,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); + bch2_trans_update(trans, update_iter, update, + trigger_flags); bch2_trans_iter_put(trans, update_iter); } @@ -877,7 +857,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); + bch2_trans_update(trans, update_iter, update, + trigger_flags); bch2_trans_iter_put(trans, update_iter); } @@ -892,7 +873,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); + bch2_trans_update(trans, update_iter, update, + trigger_flags); bch2_trans_iter_put(trans, update_iter); break; } @@ -962,18 +944,10 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while 
(trans_trigger_run); - /* Turn extents updates into keys: */ - trans_for_each_update(trans, i) - if (i->is_extent) { - ret = extent_handle_overwrites(trans, i->btree_id, i->k); - if (unlikely(ret)) - goto out; - } - trans_for_each_update(trans, i) { ret = i->is_extent ? extent_update_to_keys(trans, *i) - : (__bch2_trans_update2(trans, *i), 0); + : (bch2_trans_update2(trans, *i), 0); if (unlikely(ret)) goto out; } @@ -1051,6 +1025,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .iter = iter, .k = k }; + int ret = 0; BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); @@ -1067,97 +1042,47 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, } #endif - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (n.is_extent) { + ret = bch2_extent_can_insert(trans, n.iter, n.k); + if (ret) + return ret; + + ret = extent_handle_overwrites(trans, n.btree_id, n.k, flags); + if (ret) + return ret; + iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + + if (bkey_deleted(&n.k->k)) + return 0; + + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_put(trans, n.iter); + n.is_extent = false; } + BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); + + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - if (!n.is_extent) { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; - - if (i < trans->updates + trans->nr_updates && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - } else { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) < 0) - break; - - while (i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i[-1].k->k)) <= 0) { - --i; - array_remove_item(trans->updates, trans->nr_updates, - i - trans->updates); - } - - if (i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) - bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); - - if (i < trans->updates + trans->nr_updates && - i->btree_id == n.btree_id && - bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { - if (bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i->k->k)) > 0) { - struct btree_insert_entry split = *i; - int ret; - - BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); - - split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); - ret = PTR_ERR_OR_ZERO(split.k); - if (ret) - return ret; - - bkey_copy(split.k, i->k); - bch2_cut_back(bkey_start_pos(&n.k->k), split.k); - - split.iter = bch2_trans_get_iter(trans, split.btree_id, - bkey_start_pos(&split.k->k), - BTREE_ITER_INTENT); - split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, split.iter); - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, split); - i++; - } - - /* - * When we have an extent that overwrites the start of another - * update, trimming that extent will mean the iterator's - * position has to change since the iterator position has to - * match the extent's start pos - but we don't want to change - * the iterator pos if some other code is using it, so we may - * need to clone it: - */ - if (btree_iter_live(trans, i->iter)) { - i->iter = bch2_trans_copy_iter(trans, i->iter); - - i->iter->flags |= 
BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, i->iter); - } - - bch2_cut_front(n.k->k.p, i->k); - bch2_btree_iter_set_pos(i->iter, n.k->k.p); - } + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) + break; + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->trans_triggers_run); + *i = n; + } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - } return 0; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index db8c3b7f5fa1..3c5c73f97b8c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1519,29 +1519,6 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, return NULL; } -static int trans_get_key(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct btree_iter **iter, - struct bkey_s_c *k) -{ - unsigned flags = btree_id != BTREE_ID_alloc - ? BTREE_ITER_SLOTS - : BTREE_ITER_CACHED; - int ret; - - *iter = trans_get_update(trans, btree_id, pos, k); - if (*iter) - return 1; - - *iter = bch2_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_INTENT); - *k = __bch2_btree_iter_peek(*iter, flags); - ret = bkey_err(*k); - if (ret) - bch2_trans_iter_put(trans, *iter); - return ret; -} - static struct bkey_alloc_buf * bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, const struct bch_extent_ptr *ptr, @@ -1621,9 +1598,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); - if (ret < 0) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, @@ -1631,7 +1612,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, (u64) p.ec.idx); bch2_inconsistent_error(c); ret = -EIO; - goto out; + goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { @@ -1639,13 +1620,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; - goto out; + goto err; } s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(s); if (ret) - goto out; + goto err; bkey_reassemble(&s->k_i, k); stripe_blockcount_set(&s->v, p.ec.block, @@ -1656,7 +1637,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, &r.e, sectors); -out: +err: bch2_trans_iter_put(trans, iter); return ret; } @@ -1838,10 +1819,13 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, int frags_referenced; s64 ret; - ret = trans_get_key(trans, BTREE_ID_reflink, - POS(0, idx), &iter, &k); - if (ret < 0) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; sectors = min_t(u64, sectors, k.k->p.offset - idx); @@ -1994,86 +1978,27 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (!btree_node_type_is_extents(iter->btree_id)) { - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); 
- ret = bkey_err(old); - if (ret) - return ret; - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - - BUG_ON(!ck->valid); - old = bkey_i_to_s_c(ck->k); - } - - if (old.k->type == new->k.type) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_OVERWRITE|flags); - } - } else { - struct btree_iter *copy; - struct bkey _old; - - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - - bkey_init(&_old); - old = (struct bkey_s_c) { &_old, NULL }; - - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - 0, new->k.size, - BTREE_TRIGGER_INSERT); + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(old); if (ret) return ret; + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - copy = bch2_trans_copy_iter(trans, iter); - - for_each_btree_key_continue(copy, 0, old, ret) { - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; - - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); + BUG_ON(!ck->valid); + old = bkey_i_to_s_c(ck->k); + } - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - offset, sectors, flags); - if (ret) - break; - } - bch2_trans_iter_put(trans, copy); + if (old.k->type == new->k.type && + !btree_node_type_is_extents(iter->btree_id)) { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, -((s64) old.k->size), + BTREE_TRIGGER_OVERWRITE|flags); } return ret; -- cgit From c1949baa51cb2794bf3be8f092f0f43a7ca0904b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Jun 2021 13:39:21 -0400 Subject: bcachefs: Simplify reflink trigger Now that we only mark entire extents, we can ditch the "reflink_p_frag_references" code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 67 +++------------------------------------------------ 1 file changed, 3 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3c5c73f97b8c..02aa84e896e5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1083,32 +1083,6 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } -static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 p_start, u64 p_end, - u64 v_start, u64 v_end) -{ - if (p_start == p_end) - return false; - - p_start += le64_to_cpu(p.v->idx); - p_end += le64_to_cpu(p.v->idx); - - if (p_end <= v_start) - return false; - if (p_start >= v_end) - return false; - return true; -} - -static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 start, u64 end, - struct bkey_s_c k) -{ - return __reflink_p_frag_references(p, start, end, - bkey_start_offset(k.k), - k.k->p.offset); -} - static int __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1119,7 +1093,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - int frags_referenced; while (1) { if (*r_idx >= c->reflink_gc_nr) @@ -1132,20 +1105,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, (*r_idx)++; } - frags_referenced = - __reflink_p_frag_references(p, 0, front_frag, - r->offset - r->size, r->offset) + - __reflink_p_frag_references(p, back_frag, p.k->size, - r->offset - r->size, r->offset); - - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - add = 0; - } - BUG_ON((s64) r->refcount + add < 0); r->refcount += add; @@ -1806,8 +1765,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, - unsigned front_frag, - unsigned back_frag, unsigned flags) { struct bch_fs *c = trans->c; @@ -1816,7 +1773,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - int frags_referenced; s64 ret; iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), @@ -1829,18 +1785,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, sectors = min_t(u64, sectors, k.k->p.offset - idx); - frags_referenced = - reflink_p_frag_references(p, 0, front_frag, k) + - reflink_p_frag_references(p, back_frag, p.k->size, k); - - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - goto out; - } - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1870,7 +1814,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ret = bch2_trans_update(trans, iter, n, 0); if (ret) goto err; -out: + ret = sectors; err: bch2_trans_iter_put(trans, iter); @@ -1882,20 +1826,15 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, s64 sectors, unsigned flags) { u64 idx = le64_to_cpu(p.v->idx) + offset; - unsigned front_frag, back_frag; s64 ret = 0; if (sectors < 0) sectors = -sectors; - BUG_ON(offset + sectors > p.k->size); - - front_frag = offset; - back_frag = offset + sectors; + BUG_ON(offset || sectors != p.k->size); while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, - front_frag, back_frag, flags); + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); if (ret < 0) return ret; -- cgit From cd8319fdd91a600594b2edb6c6eed65dc74354c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Jun 2021 14:54:56 -0400 Subject: bcachefs: Kill trans->updates2 Now that extent handling has been lifted to bch2_trans_update(), we don't need to keep two different lists of updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 -- fs/bcachefs/btree_types.h | 3 -- fs/bcachefs/btree_update.h | 5 --- fs/bcachefs/btree_update_leaf.c | 78 +++++++++-------------------------------- 4 files changed, 16 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 93952d9f7ebc..c03a7de19f78 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2329,7 +2329,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; trans->nr_updates = 0; - trans->nr_updates2 = 0; trans->mem_top = 0; trans->hooks = NULL; @@ -2368,7 +2367,6 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; - trans->updates2 = p; p += updates_bytes; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2574,7 +2572,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return init_srcu_struct(&c->btree_trans_barrier) ?: mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + - sizeof(struct btree_insert_entry) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 89780b4aa057..e9c209f7a39f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -345,7 +345,6 @@ struct btree_insert_entry { enum btree_id btree_id:8; u8 level; unsigned trans_triggers_run:1; - unsigned is_extent:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -381,7 +380,6 @@ struct btree_trans { int srcu_idx; u8 nr_updates; - u8 nr_updates2; unsigned used_mempool:1; 
unsigned error:1; unsigned in_traverse_all:1; @@ -396,7 +394,6 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; - struct btree_insert_entry *updates2; /* update path: */ struct btree_trans_commit_hook *hooks; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 56131ac516ce..cbfc8544def4 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -140,9 +140,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update2(_trans, _i) \ - for ((_i) = (_trans)->updates2; \ - (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ - (_i)++) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0a31270e3caf..e16feeebcf2c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -32,7 +32,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates2 && + return i != trans->updates && iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } @@ -222,7 +222,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); } @@ -400,7 +400,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, h = h->next; } - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -458,10 +458,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version = MAX_VERSION; } @@ -476,7 +476,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); err: if (marking) { @@ -504,7 +504,7 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree BUG_ON(iter->level); - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { if (iter_l(i->iter)->b != b) continue; @@ -535,7 +535,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { struct btree *b; BUG_ON(!btree_node_intent_locked(i->iter, i->level)); @@ -552,7 +552,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ret = bch2_journal_preres_get(&c->journal, @@ -592,7 +592,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { const char *invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type); 
if (invalid) { @@ -606,14 +606,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } bch2_btree_trans_verify_locks(trans); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, i->iter); @@ -775,42 +775,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static void bch2_trans_update2(struct btree_trans *trans, - struct btree_insert_entry n) -{ - struct btree_insert_entry *i; - - btree_insert_entry_checks(trans, &n); - - EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); - - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans_for_each_update2(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; - - if (i < trans->updates2 + trans->nr_updates2 && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates2, trans->nr_updates2, - i - trans->updates2, n); -} - -static int extent_update_to_keys(struct btree_trans *trans, - struct btree_insert_entry n) -{ - n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - n.is_extent = false; - - bch2_trans_update2(trans, n); - bch2_trans_iter_put(trans, n.iter); - return 0; -} - static int extent_handle_overwrites(struct btree_trans *trans, enum btree_id btree_id, struct bkey_i *insert, @@ -945,14 +909,6 @@ int __bch2_trans_commit(struct btree_trans *trans) } while (trans_trigger_run); trans_for_each_update(trans, i) { - ret = i->is_extent - ? extent_update_to_keys(trans, *i) - : (bch2_trans_update2(trans, *i), 0); - if (unlikely(ret)) - goto out; - } - - trans_for_each_update2(trans, i) { ret = bch2_btree_iter_traverse(i->iter); if (unlikely(ret)) { trace_trans_restart_traverse(trans->ip, _RET_IP_, @@ -1021,28 +977,27 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, - .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, .iter = iter, .k = k }; + bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; int ret = 0; BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); #ifdef CONFIG_BCACHEFS_DEBUG BUG_ON(bkey_cmp(iter->pos, - n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); + is_extent ? bkey_start_pos(&k->k) : k->k.p)); trans_for_each_update(trans, i) { - BUG_ON(bkey_cmp(i->iter->pos, - i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); + BUG_ON(bkey_cmp(i->iter->pos, i->k->k.p)); BUG_ON(i != trans->updates && btree_insert_entry_cmp(i - 1, i) >= 0); } #endif - if (n.is_extent) { + if (is_extent) { ret = bch2_extent_can_insert(trans, n.iter, n.k); if (ret) return ret; @@ -1061,7 +1016,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); bch2_trans_iter_put(trans, n.iter); - n.is_extent = false; } BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); -- cgit From 59ba21d99fc7a19d32fc4c2cb21509b8876d8e01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 23:49:30 -0400 Subject: bcachefs: Clean up key merging This patch simplifies the key merging code by getting rid of partial merges - it's simpler and saner if we just don't merge extents when they'd overflow k->size. 
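To make the simplification concrete, here is a minimal before/after sketch of the calling convention. The caller shown is an assumption for illustration; only the function and constant names are taken from this patch:

    /* before: three-way result; a partial merge resized l and cut the
     * front off r:
     */
    switch (bch2_bkey_merge(c, l, r)) {
    case BCH_MERGE_MERGE:    /* l absorbed all of r */
        break;
    case BCH_MERGE_PARTIAL:  /* l grew to KEY_SIZE_MAX, r shrank */
        break;
    case BCH_MERGE_NOMERGE:  /* nothing changed */
        break;
    }

    /* after: all or nothing - bch2_bkey_merge() folds in the
     * bch2_bkey_maybe_mergable() checks, including the
     * (u64) l->size + r->size <= KEY_SIZE_MAX overflow test,
     * so callers only see a boolean:
     */
    if (bch2_bkey_merge(c, l, r)) {
        /* l now covers both keys; r can be dropped */
    }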
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 19 +++--------------- fs/bcachefs/bkey_methods.h | 29 +++++++++++++-------------- fs/bcachefs/extents.c | 50 +++++++++++++++++++--------------------------- fs/bcachefs/extents.h | 6 ++---- fs/bcachefs/reflink.c | 22 ++++++++++---------- fs/bcachefs/reflink.h | 3 +-- 6 files changed, 50 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index cf2e054cca2f..ff9d770aabea 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -84,7 +84,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } -static const struct bkey_ops bch2_bkey_ops[] = { +const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x @@ -292,24 +292,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) : false; } -enum merge_result bch2_bkey_merge(struct bch_fs *c, - struct bkey_s l, struct bkey_s r) +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; - enum merge_result ret; - if (bch2_key_merging_disabled || - !ops->key_merge || - l.k->type != r.k->type || - bversion_cmp(l.k->version, r.k->version) || - bpos_cmp(l.k->p, bkey_start_pos(r.k))) - return BCH_MERGE_NOMERGE; - - ret = ops->key_merge(c, l, r); - - if (ret != BCH_MERGE_NOMERGE) - l.k->needs_whiteout |= r.k->needs_whiteout; - return ret; + return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); } static const struct old_bkey_type { diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index bfa6f112aeed..3012035db1a3 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -11,17 +11,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; -enum merge_result { - BCH_MERGE_NOMERGE, - - /* - * The keys were mergeable, but would have overflowed size - so instead - * l was changed to the maximum size, and both keys were modified: - */ - BCH_MERGE_PARTIAL, - BCH_MERGE_MERGE, -}; - struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, @@ -30,13 +19,14 @@ struct bkey_ops { struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); - enum merge_result (*key_merge)(struct bch_fs *, - struct bkey_s, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); }; +extern const struct bkey_ops bch2_bkey_ops[]; + const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); @@ -57,8 +47,17 @@ void bch2_bkey_swab_val(struct bkey_s); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -enum merge_result bch2_bkey_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) +{ + return l->type == r->type && + !bversion_cmp(l->version, r->version) && + !bpos_cmp(l->p, bkey_start_pos(r)) && + (u64) l->size + r->size <= KEY_SIZE_MAX && + bch2_bkey_ops[l->type].key_merge && + !bch2_key_merging_disabled; +} + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); void bch2_bkey_renumber(enum 
btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1f28dea26ca2..2ced3557e13b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -229,17 +229,16 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); + struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; + const union bch_extent_entry *en_r = r.v->start; struct bch_extent_crc_unpacked crc_l, crc_r; if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; + return false; crc_l = bch2_extent_crc_unpack(l.k, NULL); @@ -247,7 +246,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; + return false; switch (extent_entry_type(en_l)) { case BCH_EXTENT_ENTRY_ptr: { @@ -258,20 +257,20 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, if (lp->offset + crc_l.compressed_size != rp->offset || lp->dev != rp->dev || lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + return false; /* We don't allow extents to straddle buckets: */ ca = bch_dev_bkey_exists(c, lp->dev); if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; + return false; break; } case BCH_EXTENT_ENTRY_stripe_ptr: if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; + return false; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: @@ -282,30 +281,30 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, if (crc_l.csum_type != crc_r.csum_type || crc_l.compression_type != crc_r.compression_type || crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; + return false; if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || crc_r.offset) - return BCH_MERGE_NOMERGE; + return false; if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; + return false; if (crc_is_compressed(crc_l)) - return BCH_MERGE_NOMERGE; + return false; if (crc_l.csum_type && crc_l.uncompressed_size + crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; + return false; if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; + return false; break; default: - return BCH_MERGE_NOMERGE; + return false; } } @@ -333,8 +332,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, } bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* KEY_TYPE_reservation: */ @@ -362,25 +360,17 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, r.v->nr_replicas); } -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); if (l.v->generation != r.v->generation || l.v->nr_replicas != r.v->nr_replicas) - 
return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* Extent checksum entries: */ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9999805f955e..3f6224f75ce8 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -394,8 +394,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_extent_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ @@ -409,8 +408,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reservation_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation (struct bkey_ops) { \ .key_invalid = bch2_reservation_invalid, \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6aa37726341d..ead31f9e31aa 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -42,24 +42,22 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); } -enum merge_result bch2_reflink_p_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) - return BCH_MERGE_NOMERGE; + /* + * Disabled for now, the triggers code needs to be reworked for merging + * of reflink pointers to work: + */ + return false; - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, _r); - return BCH_MERGE_PARTIAL; - } + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* indirect extents */ diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index bfc785619ee8..68c5cb5a2780 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -5,8 +5,7 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reflink_p_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ -- cgit From 81d22e5d832452ee479c64d5678c7422bc1bef5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 23:52:19 -0400 Subject: bcachefs: Refactor extent_handle_overwrites() Prep work for extent merging Signed-off-by: Kent Overstreet --- 
fs/bcachefs/btree_update_leaf.c | 94 +++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e16feeebcf2c..1c12a11e45f7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -775,74 +775,94 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned trigger_flags) +{ + struct btree_iter *iter; + struct bkey_i *update; + int ret; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = pos; + + iter = bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update(trans, iter, update, trigger_flags); + bch2_trans_iter_put(trans, iter); + return 0; +} + static int extent_handle_overwrites(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_i *insert, - unsigned trigger_flags) + struct btree_insert_entry *i) { struct btree_iter *iter, *update_iter; - struct bpos start = bkey_start_pos(&insert->k); + struct bpos start = bkey_start_pos(&i->k->k); struct bkey_i *update; struct bkey_s_c k; - int ret; + int ret = 0; - for_each_btree_key(trans, iter, btree_id, start, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES, k, ret) { - if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) - break; + iter = bch2_trans_get_iter(trans, i->btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(iter); + if (!k.k || (ret = bkey_err(k))) + goto out; + if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) + goto next; + + while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - break; + goto out; bkey_reassemble(update, k); bch2_cut_back(start, update); - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update(trans, update_iter, update, - trigger_flags); + bch2_trans_update(trans, update_iter, update, i->trigger_flags); bch2_trans_iter_put(trans, update_iter); } - if (bkey_cmp(k.k->p, insert->k.p) < 0 || - (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); - if ((ret = PTR_ERR_OR_ZERO(update))) - break; - - bkey_init(&update->k); - update->k.p = k.k->p; - - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update(trans, update_iter, update, - trigger_flags); - bch2_trans_iter_put(trans, update_iter); + if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { + ret = __btree_delete_at(trans, i->btree_id, k.k->p, + i->trigger_flags); + if (ret) + goto out; } - if (bkey_cmp(k.k->p, insert->k.p) > 0) { + if (bkey_cmp(k.k->p, i->k->k.p) > 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - break; + goto out; bkey_reassemble(update, k); - bch2_cut_front(insert->k.p, update); + bch2_cut_front(i->k->k.p, update); - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| 
BTREE_ITER_INTENT); bch2_trans_update(trans, update_iter, update, - trigger_flags); + i->trigger_flags); bch2_trans_iter_put(trans, update_iter); - break; + goto out; } +next: + k = bch2_btree_iter_next(iter); + if (!k.k || (ret = bkey_err(k))) + goto out; } +out: bch2_trans_iter_put(trans, iter); return ret; @@ -1002,7 +1022,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - ret = extent_handle_overwrites(trans, n.btree_id, n.k, flags); + ret = extent_handle_overwrites(trans, &n); if (ret) return ret; @@ -1012,7 +1032,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (bkey_deleted(&n.k->k)) return 0; - n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); bch2_trans_iter_put(trans, n.iter); -- cgit From 5db95e50e11bdd0a191a22fdbc1e68ec900aff89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Apr 2021 23:52:19 -0400 Subject: bcachefs: Re-implement extent merging in transaction commit path We haven't had extent merging in quite some time. It used to be done by the btree code when sorting btree nodes, but that was eliminated as part of the work to separate extent handling from core btree code. This patch re-implements extent merging in the transaction commit path. We don't currently have the ability to merge reflink pointers, we need to do some work on the triggers code to be able to do that without ending up with incorrect refcounts. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 6 +++++- fs/bcachefs/bkey_methods.h | 5 +---- fs/bcachefs/btree_update_leaf.c | 41 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index ff9d770aabea..1ad6a9d30bb5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -296,7 +296,11 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; - return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); + return bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && + bch2_bkey_ops[l.k->type].key_merge && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); } static const struct old_bkey_type { diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3012035db1a3..4e316c2f6954 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -51,10 +51,7 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b { return l->type == r->type && !bversion_cmp(l->version, r->version) && - !bpos_cmp(l->p, bkey_start_pos(r)) && - (u64) l->size + r->size <= KEY_SIZE_MAX && - bch2_bkey_ops[l->type].key_merge && - !bch2_key_merging_disabled; + !bpos_cmp(l->p, bkey_start_pos(r)); } bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1c12a11e45f7..4040771d645b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -797,9 +797,38 @@ static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, return 0; } +static noinline int extent_front_merge(struct btree_trans *trans, + struct bkey_s_c k, + struct btree_insert_entry *i) +{ + struct bch_fs *c = trans->c; + struct bpos l_pos = 
k.k->p; + struct bkey_i *update; + int ret; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { + ret = __btree_delete_at(trans, i->btree_id, l_pos, + i->trigger_flags); + if (ret) + return ret; + + i->k = update; + } + + return 0; +} + static int extent_handle_overwrites(struct btree_trans *trans, struct btree_insert_entry *i) { + struct bch_fs *c = trans->c; struct btree_iter *iter, *update_iter; struct bpos start = bkey_start_pos(&i->k->k); struct bkey_i *update; @@ -814,8 +843,15 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (!k.k || (ret = bkey_err(k))) goto out; - if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) + if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) { + if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { + ret = extent_front_merge(trans, k, i); + if (ret) + goto out; + } + goto next; + } while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { @@ -862,6 +898,9 @@ next: if (!k.k || (ret = bkey_err(k))) goto out; } + + if (bch2_bkey_maybe_mergable(&i->k->k, k.k)) + bch2_bkey_merge(c, bkey_i_to_s(i->k), k); out: bch2_trans_iter_put(trans, iter); -- cgit From c2177e4da332df6fb91b08b874a79051f13897bc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 May 2021 00:37:37 -0400 Subject: bcachefs: Improved extent merging Previously, checksummed extents could only be merged when the checksum covered only the currently live data. xfstest generic/064 creates a test file, then uses finsert calls to split the extent, then collapse calls to see if they get merged. But without any reads to trigger the narrow_crcs path, each of the split extents will still have a checksum for the entire original extent. This patch improves the extent merge path so that if either of the extents we're attempting to merge has a checksum that covers the entire merged extent, we just use that checksum. 
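As a worked illustration (the sizes here are an assumption, not taken from the test): suppose a 128 sector extent is written and checksummed as one unit, then split in half by finsert. Each half still carries crc.uncompressed_size = 128 and live_size = 64, the left half at crc.offset = 0 and the right half at crc.offset = 64. When collapse makes the halves adjacent again, the left entry satisfies the new merge condition

    crc_l.offset + crc_l.live_size + crc_r.live_size <= crc_l.uncompressed_size
         0       +       64        +       64        <=          128

so the merged extent can simply keep the left extent's checksum entry, and the merge succeeds without recomputing or merging any checksums.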
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 139 ++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2ced3557e13b..abb15688a664 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -233,102 +233,121 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_extent l = bkey_s_to_extent(_l); struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - const union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) return false; - crc_l = bch2_extent_crc_unpack(l.k, NULL); - extent_for_each_entry(l, en_l) { en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); if (extent_entry_type(en_l) != extent_entry_type(en_r)) return false; + } - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return false; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return false; + en_l = l.v->start; + en_r = r.v->start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, extent_entry_last(l), lp, en_l) && + __bkey_ptr_next_decode(r.k, extent_entry_last(r), rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.has_ec != rp.has_ec) + return false; - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return false; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return false; + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return false; + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; - if (!bch2_checksum_mergeable(crc_l.csum_type)) + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset ) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + 
!bch2_checksum_mergeable(lp.crc.csum_type)) return false; - if (crc_is_compressed(crc_l)) + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) return false; - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > c->sb.encoded_extent_max) return false; - if (crc_l.uncompressed_size + crc_r.uncompressed_size > + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; - - break; - default: - return false; } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } + use_right_ptr = false; extent_for_each_entry(l, en_l) { struct bch_extent_crc_unpacked crc_l, crc_r; en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + if (!extent_entry_is_crc(en_l)) continue; + use_right_ptr = false; + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset ) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } } bch2_key_resize(l.k, l.k->size + r.k->size); -- cgit From b058ac20916c9bfbcd5b9caa356bd95269d5c78c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 May 2021 15:04:08 -0400 Subject: bcachefs: Merging for indirect extents Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 95 +++++++++++++++++++++++++++------------------------ fs/bcachefs/reflink.c | 8 +++++ 2 files changed, 58 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index abb15688a664..c7643e6c8816 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -229,33 +229,36 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); union bch_extent_entry *en_l; const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; struct bch_dev *ca; - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return false; - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - + en_l = l_ptrs.start; + en_r = r_ptrs.start; + 
while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_type(en_l) != extent_entry_type(en_r)) return false; + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } - en_l = l.v->start; - en_r = r.v->start; + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); - while (__bkey_ptr_next_decode(l.k, extent_entry_last(l), lp, en_l) && - __bkey_ptr_next_decode(r.k, extent_entry_last(r), rp, en_r)) { + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != rp.ptr.offset + rp.crc.offset || lp.ptr.dev != rp.ptr.dev || @@ -311,43 +314,45 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) } use_right_ptr = false; - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && use_right_ptr) en_l->ptr = en_r->ptr; - if (!extent_entry_is_crc(en_l)) - continue; - - use_right_ptr = false; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.offset + crc_l.live_size + crc_r.live_size <= - crc_l.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset ) { - /* can use right extent's crc entry */ - crc_r.offset -= crc_l.live_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, - extent_entry_type(en_l)); - use_right_ptr = true; - } else { - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset ) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } bch2_key_resize(l.k, l.k->size + r.k->size); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ead31f9e31aa..ba700810a4be 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -82,6 +82,14 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + 
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + /* indirect inline data */ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, -- cgit From 7ed158f2949a3c142b2be9a0bc67775cb69b9df5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Jun 2021 16:50:30 -0400 Subject: bcachefs: Always zero memory from bch2_trans_kmalloc() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c03a7de19f78..d9ce82e04e76 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2299,6 +2299,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } -- cgit From 90d22a660a4ed5e1500602f15edcb91ab38a1e7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 23:51:09 -0400 Subject: bcachefs: Fix overflow in journal_replay_entry_early If filesystem on disk was used by a version with a larger BCH_DATA_NR thas the currently running version, we don't want this to cause a buffer overrun. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9bd6348842e0..f32414171aab 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -725,7 +725,7 @@ static int journal_replay_entry_early(struct bch_fs *c, ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); - for (i = 0; i < nr_types; i++) { + for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); -- cgit From a49e9a0589d1828af787bacf0a1e18eca7facb88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Jun 2021 15:45:56 -0400 Subject: bcachefs: Fix null ptr deref when splitting compressed extents Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_types.h | 6 +++++- fs/bcachefs/btree_update_leaf.c | 37 +++++++++++++++++++++---------------- fs/bcachefs/extent_update.c | 35 ----------------------------------- fs/bcachefs/extent_update.h | 4 ---- 5 files changed, 27 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d9ce82e04e76..0dd7938101a5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2329,6 +2329,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; + trans->extra_journal_res = 0; trans->nr_updates = 0; trans->mem_top = 0; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e9c209f7a39f..23de816b4757 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -383,6 +383,11 @@ struct btree_trans { unsigned used_mempool:1; unsigned error:1; unsigned in_traverse_all:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; u64 iters_linked; u64 iters_live; @@ -680,7 +685,6 @@ enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - 
BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, BTREE_INSERT_NEED_JOURNAL_RECLAIM, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4040771d645b..852d4e39199f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -696,10 +696,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = -EINTR; } break; - case BTREE_INSERT_ENOSPC: - BUG_ON(flags & BTREE_INSERT_NOFAIL); - ret = -ENOSPC; - break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -833,7 +829,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, struct bpos start = bkey_start_pos(&i->k->k); struct bkey_i *update; struct bkey_s_c k; - int ret = 0; + int ret = 0, compressed_sectors; iter = bch2_trans_get_iter(trans, i->btree_id, start, BTREE_ITER_INTENT| @@ -854,6 +850,16 @@ static int extent_handle_overwrites(struct btree_trans *trans, } while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && + bkey_cmp(k.k->p, i->k->k.p) > 0 && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -992,6 +998,15 @@ int __bch2_trans_commit(struct btree_trans *trans) trans->journal_preres_u64s += u64s; trans->journal_u64s += u64s; } + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(trans->c, trans->disk_res, + trans->extra_journal_res, + (trans->flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } retry: memset(&trans->journal_res, 0, sizeof(trans->journal_res)); @@ -1045,22 +1060,12 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); #ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(bkey_cmp(iter->pos, - is_extent ? bkey_start_pos(&k->k) : k->k.p)); - - trans_for_each_update(trans, i) { - BUG_ON(bkey_cmp(i->iter->pos, i->k->k.p)); - + trans_for_each_update(trans, i) BUG_ON(i != trans->updates && btree_insert_entry_cmp(i - 1, i) >= 0); - } #endif if (is_extent) { - ret = bch2_extent_can_insert(trans, n.iter, n.k); - if (ret) - return ret; - ret = extent_handle_overwrites(trans, &n); if (ret) return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index bb4b2b4352e0..ef4aaf1c30ed 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -173,38 +173,3 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) return !bkey_cmp(end, k->k.p); } - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bkey_s_c k; - int ret, sectors; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - /* Check if we're splitting a compressed extent: */ - - if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && - bkey_cmp(insert->k.p, k.k->p) < 0 && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 38dc084627d2..2fa4602967e0 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -9,8 +9,4 @@ int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, - struct bkey_i *); - #endif /* _BCACHEFS_EXTENT_UPDATE_H */ -- cgit From 044c8c9e05bc87cdc610eb320e8fdc694f3866df Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Thu, 10 Jun 2021 07:52:42 -0400 Subject: bcachefs: mount: fix null deref with null devname - Fix null deref on mount when given a null device name. - Move the dev_name checks to return EINVAL when it is invalid. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 99885fb97aa2..efb467316756 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1325,9 +1325,6 @@ static char **split_devs(const char *_dev_name, unsigned *nr) char *dev_name = NULL, **devs = NULL, *s; size_t i, nr_devs = 0; - if (strlen(_dev_name) == 0) - return NULL; - dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) return NULL; @@ -1503,6 +1500,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (ret) return ERR_PTR(ret); + if (!dev_name || strlen(dev_name) == 0) + return ERR_PTR(-EINVAL); + devs = split_devs(dev_name, &nr_devs); if (!devs) return ERR_PTR(-ENOMEM); -- cgit From 45c2e33f791a6c5ebae0accdbfbfef4720701339 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Jun 2021 17:20:02 -0400 Subject: bcachefs: Allow shorter JSET_ENTRY_dev_usage entries If the last entry(ies) would be all zeros, there's no need to write them out - the read path already handles that. 
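To sketch the write-side idea (hypothetical code, not from this patch - the change below only relaxes the read-side validation so shorter entries are accepted): trailing all-zero per-data-type counters can simply be trimmed before the entry is sized:

    unsigned nr = BCH_DATA_NR;

    while (nr &&
           !u->d[nr - 1].buckets &&
           !u->d[nr - 1].sectors &&
           !u->d[nr - 1].fragmented)
        nr--;

    /* only entries 0..nr-1 need to go into the jset entry; the read
     * path already treats missing trailing entries as zero
     */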
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a7d08657cb4f..e797d6376a82 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -449,7 +449,7 @@ static int journal_entry_validate_dev_usage(struct bch_fs *c, struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned expected = sizeof(*u); unsigned dev; int ret = 0; -- cgit From 953ee28a3ee4672455aa13f805c81c96a08ceded Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 20:15:50 -0400 Subject: bcachefs: Kill bch2_btree_iter_peek_cached() It's now been rolled into bch2_btree_iter_peek_slot() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 65 +++++++++++++++++++++--------------------------- fs/bcachefs/btree_iter.h | 11 +++----- fs/bcachefs/buckets.c | 15 +++-------- fs/bcachefs/inode.c | 17 ++++++------- 4 files changed, 42 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0dd7938101a5..52ce2fb87cd7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1824,35 +1824,54 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { - struct bpos search_key = btree_iter_search_key(iter); + struct bpos search_key; struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS && + btree_iter_type(iter) != BTREE_ITER_CACHED); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, search_key); - /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && - iter->pos.offset == KEY_OFFSET_MAX) { + unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); } + search_key = btree_iter_search_key(iter); + btree_iter_set_search_pos(iter, search_key); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { - struct bkey_i *next_update = btree_trans_peek_updates(iter, search_key); + if (btree_iter_type(iter) == BTREE_ITER_CACHED || + !(iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bkey_i *next_update; + struct bkey_cached *ck; - k = btree_iter_level_peek_all(iter, &iter->l[0]); - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + switch (btree_iter_type(iter)) { + case BTREE_ITER_KEYS: + k = btree_iter_level_peek_all(iter, &iter->l[0]); + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + break; + case BTREE_ITER_CACHED: + ck = (void *) iter->l[0].b; + EBUG_ON(iter->btree_id != ck->key.btree_id || + bkey_cmp(iter->pos, ck->key.pos)); + BUG_ON(!ck->valid); + k = bkey_i_to_s_c(ck->k); + break; + case BTREE_ITER_NODES: + BUG(); + } + + next_update = btree_trans_peek_updates(iter, search_key); if (next_update && (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { iter->k = next_update->k; @@ -1929,34 +1948,6 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } -struct 
bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) -{ - struct bkey_i *next_update; - struct bkey_cached *ck; - int ret; - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); - bch2_btree_iter_verify(iter); - - next_update = btree_trans_peek_updates(iter, iter->pos); - if (next_update && !bpos_cmp(next_update->k.p, iter->pos)) - return bkey_i_to_s_c(next_update); - - ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - ck = (void *) iter->l[0].b; - - EBUG_ON(iter->btree_id != ck->key.btree_id || - bkey_cmp(iter->pos, ck->key.pos)); - BUG_ON(!ck->valid); - - iter->should_be_locked = true; - - return bkey_i_to_s_c(ck->k); -} - static inline void bch2_btree_iter_init(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree_id) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ba98cfea4d60..27c685a482ec 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -160,8 +160,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); - bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); @@ -224,12 +222,9 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) { - if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) - return bch2_btree_iter_peek_cached(iter); - else - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); } static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 02aa84e896e5..c0e4cec21b74 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1917,17 +1917,10 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(old); - if (ret) - return ret; - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - - BUG_ON(!ck->valid); - old = bkey_i_to_s_c(ck->k); - } + old = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(old); + if (ret) + return ret; if (old.k->type == new->k.type && !btree_node_type_is_extents(iter->btree_id)) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c65bfee1897e..c5f93b8ca1c6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -302,7 +302,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_CACHED|flags); - k = bch2_btree_iter_peek_cached(iter); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; @@ -600,15 +600,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) retry: bch2_trans_begin(&trans); - if (cached) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_cached(iter); - } else { - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - } + iter = 
bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + (cached + ? BTREE_ITER_CACHED + : BTREE_ITER_SLOTS)| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) -- cgit From 290448ed2e0ac1165d2dd7a37da8b1e58e69bc08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 23:33:27 -0400 Subject: bcachefs: Don't underflow c->sectors_available This rarely used error path should've been checking for underflow - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c0e4cec21b74..da24c4038fc9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1437,7 +1437,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, */ should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); + u64 old, new, v = atomic64_read(&c->sectors_available); + + do { + old = v; + new = max_t(s64, 0, old - should_not_have_added); + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, new)) != old); + added -= should_not_have_added; warn = true; } -- cgit From bb6bbf4a06c57c064a95a8b0c7a9b0746187170a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Jun 2021 22:33:53 -0400 Subject: bcachefs: Clear iter->should_be_locked in bch2_trans_reset Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 52ce2fb87cd7..e9351632cc03 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2312,9 +2312,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter; - trans_for_each_iter(trans, iter) + trans_for_each_iter(trans, iter) { iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| BTREE_ITER_SET_POS_AFTER_COMMIT); + iter->should_be_locked = false; + } bch2_trans_unlink_iters(trans); -- cgit From 120f63e321fa9fdb1837dd64fcd50e2b144b0d6d Mon Sep 17 00:00:00 2001 From: Janpieter Sollie Date: Sun, 13 Jun 2021 22:01:08 +0200 Subject: bcachefs: fix a possible bcachefs checksum mapping error opt-checksum enum to type-checksum enum This fixes some rare cases where the metadata checksum option specified may map to the wrong actual checksum type. 
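In other words, BCH_SB_CSUM_TYPE() stores an on-disk checksum type, while c->opts.metadata_checksum holds a value from the checksum option enum, and the two enums are not numbered identically, so writing the option value straight into the superblock can select the wrong algorithm. The before/after call below is taken from the patch; the comment on the second argument is an assumption based on the call site:

    /* before: option enum value stored directly as the on-disk type */
    SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);

    /* after: translate option -> type first (false appears to select
     * the non-data, i.e. metadata/superblock, variant)
     */
    SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));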
Signed-off-by: Janpieter Sollie Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index fc6fb302706a..403e77e2c515 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -684,7 +684,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) sb->offset = sb->layout.sb_offset[idx]; - SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); + SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), null_nonce(), sb); -- cgit From 2ed5cd508d0b893ded6a1ba586f7c6cd7a95cc36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 14:47:26 -0400 Subject: bcachefs: Fix a memory leak in dio write path Commit c42bca92be928ce7dece5fc04cf68d0e37ee6718 "bio: don't copy bvec for direct IO" changed bio_iov_iter_get_pages() to point bio->bi_iovec at the incoming biovec, meaning if we already allocated one, it'll be leaked. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 132ff0497b39..e39e22581e4b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2025,7 +2025,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(NULL, - iov_iter_npages(iter, BIO_MAX_VECS), + iov_iter_is_bvec(iter) + ? 0 + : iov_iter_npages(iter, BIO_MAX_VECS), REQ_OP_WRITE, GFP_KERNEL, &c->dio_write_bioset); -- cgit From 8ee529e9c16b9c72293794f441270e954a9c6508 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 16:32:44 -0400 Subject: bcachefs: Make sure bch2_trans_mark_update uses correct iter flags Now that bch2_btree_iter_peek_with_updates() has been removed in favor of BTREE_ITER_WITH_UPDATES, we need to make sure it's not used where we don't want it. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index da24c4038fc9..c427744a665f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1916,7 +1916,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, unsigned flags) { struct bkey_s_c old; - int ret; + int iter_flags, ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; @@ -1924,7 +1924,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; + iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; + iter->flags &= ~BTREE_ITER_WITH_UPDATES; + old = bch2_btree_iter_peek_slot(iter); + + iter->flags |= iter_flags; + ret = bkey_err(old); if (ret) return ret; -- cgit From b89726ab86aab15ea8526fb365da6a0f5d474241 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 16:35:03 -0400 Subject: bcachefs: Kill __btree_delete_at() With trans->updates2 gone, we can now drop this helper and use bch2_btree_delete_at() instead. 
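The resulting caller pattern, taken from the updated callers in this patch (the surrounding transaction and iterator setup is assumed): bch2_btree_delete_at() now only queues a deletion key at the iterator's position as a transaction update, without committing, so callers that previously relied on it committing wrap it in __bch2_trans_do():

    ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                          bch2_btree_delete_at(&trans, iter, 0));
    if (ret)
        goto err;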
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 64 +++++++++++++++-------------------------- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/tests.c | 9 ++++-- 3 files changed, 30 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 852d4e39199f..634e25e94425 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -771,34 +771,12 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned trigger_flags) -{ - struct btree_iter *iter; - struct bkey_i *update; - int ret; - - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = pos; - - iter = bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update(trans, iter, update, trigger_flags); - bch2_trans_iter_put(trans, iter); - return 0; -} - static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - struct bpos l_pos = k.k->p; struct bkey_i *update; int ret; @@ -810,8 +788,13 @@ static noinline int extent_front_merge(struct btree_trans *trans, bkey_reassemble(update, k); if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { - ret = __btree_delete_at(trans, i->btree_id, l_pos, - i->trigger_flags); + struct btree_iter *update_iter = + bch2_trans_copy_iter(trans, iter); + + ret = bch2_btree_delete_at(trans, update_iter, + i->trigger_flags); + bch2_trans_iter_put(trans, update_iter); + if (ret) return ret; @@ -841,7 +824,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) { if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { - ret = extent_front_merge(trans, k, i); + ret = extent_front_merge(trans, iter, k, i); if (ret) goto out; } @@ -877,8 +860,11 @@ static int extent_handle_overwrites(struct btree_trans *trans, } if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { - ret = __btree_delete_at(trans, i->btree_id, k.k->p, - i->trigger_flags); + update_iter = bch2_trans_copy_iter(trans, iter); + ret = bch2_btree_delete_at(trans, update_iter, + i->trigger_flags); + bch2_trans_iter_put(trans, update_iter); + if (ret) goto out; } @@ -891,12 +877,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(i->k->k.p, update); - update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update(trans, update_iter, update, - i->trigger_flags); - bch2_trans_iter_put(trans, update_iter); + bch2_trans_update(trans, iter, update, i->trigger_flags); goto out; } next: @@ -1143,16 +1124,17 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, } int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) + struct btree_iter *iter, unsigned trigger_flags) { - struct bkey_i k; + struct bkey_i *k; - bkey_init(&k.k); - k.k.p = iter->pos; + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); - return bch2_trans_update(trans, iter, &k, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + bkey_init(&k->k); + k->k.p = iter->pos; + return bch2_trans_update(trans, iter, k, trigger_flags); } int 
bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 89a130d9c537..1bb595f4003a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -761,7 +761,7 @@ retry: mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = lockrestart_do(&trans, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_btree_delete_at(&trans, iter, 0)); if (ret) goto err; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 87208da5e350..fa9f600fc17c 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -54,14 +54,16 @@ static int test_delete(struct bch_fs *c, u64 nr) } pr_info("deleting once"); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error (first) in test_delete: %i", ret); goto err; } pr_info("deleting twice"); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error (second) in test_delete: %i", ret); goto err; @@ -101,7 +103,8 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_journal_flush_all_pins(&c->journal); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error in test_delete_written: %i", ret); goto err; -- cgit From 8c3f6da9fc526e7ba0f6449efa1040084406e9ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 18:16:10 -0400 Subject: bcachefs: Improve iter->should_be_locked Adding iter->should_be_locked introduced a regression where it ended up not being set on the iterator passed to bch2_btree_update_start(), which is definitely not what we want. This patch requires it to be set when calling bch2_trans_update(), and adds various fixups to make that happen. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 6 ++++++ fs/bcachefs/btree_update_interior.c | 2 ++ fs/bcachefs/btree_update_leaf.c | 13 +++++++++---- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/extent_update.c | 4 ++++ fs/bcachefs/fs-common.c | 3 ++- fs/bcachefs/fs-io.c | 3 ++- fs/bcachefs/fsck.c | 12 +++++++++--- fs/bcachefs/recovery.c | 15 ++++----------- fs/bcachefs/reflink.c | 6 +++--- fs/bcachefs/tests.c | 16 ++++++++-------- 11 files changed, 50 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 27c685a482ec..6efea281d87f 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -176,6 +176,12 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->should_be_locked = false; } +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +{ + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + iter->pos = bkey_start_pos(&iter->k); +} + static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) { return iter->child_idx == U8_MAX ? 
NULL diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bb01b036c7a2..e35e842efe81 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -937,6 +937,8 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, int journal_flags = 0; int ret = 0; + BUG_ON(!iter->should_be_locked); + if (flags & BTREE_INSERT_JOURNAL_RESERVED) journal_flags |= JOURNAL_RES_GET_RESERVED; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 634e25e94425..00706b952630 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -855,6 +855,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(update_iter); + if (ret) + goto out; + bch2_trans_update(trans, update_iter, update, i->trigger_flags); bch2_trans_iter_put(trans, update_iter); } @@ -1039,6 +1043,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, int ret = 0; BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(!iter->should_be_locked); #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) @@ -1102,7 +1107,8 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_trans_update(trans, iter, k, 0); + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, 0); bch2_trans_iter_put(trans, iter); return ret; } @@ -1147,13 +1153,12 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); retry: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(iter)).k) && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; - bch2_trans_begin(trans); - bkey_init(&delete.k); /* diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c427744a665f..3dd206d3b546 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1817,7 +1817,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&n->k, 0); } - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + bch2_btree_iter_set_pos_to_extent_start(iter); ret = bch2_trans_update(trans, iter, n, 0); if (ret) goto err; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index ef4aaf1c30ed..4a8dd085f7fb 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -104,6 +104,10 @@ int bch2_extent_atomic_end(struct btree_iter *iter, unsigned nr_iters = 0; int ret; + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + *end = insert->k.p; /* extent_update_to_keys(): */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 00a63fecb976..60c54438074e 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -85,7 +85,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, inode_iter->snapshot = U32_MAX; bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); - ret = bch2_inode_write(trans, inode_iter, new_inode); + ret = bch2_btree_iter_traverse(inode_iter) ?: + bch2_inode_write(trans, inode_iter, new_inode); err: bch2_trans_iter_put(trans, inode_iter); bch2_trans_iter_put(trans, dir_iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 
e39e22581e4b..0ffc3971d1b2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2611,7 +2611,8 @@ reassemble: BUG_ON(ret); } - ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: + ret = bch2_btree_iter_traverse(del) ?: + bch2_trans_update(&trans, del, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1bb595f4003a..7ea1a41ac637 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -78,7 +78,8 @@ static int __write_inode(struct btree_trans *trans, bch2_trans_get_iter(trans, BTREE_ID_inodes, SPOS(0, inode->bi_inum, snapshot), BTREE_ITER_INTENT); - int ret = bch2_inode_write(trans, inode_iter, inode); + int ret = bch2_btree_iter_traverse(inode_iter) ?: + bch2_inode_write(trans, inode_iter, inode); bch2_trans_iter_put(trans, inode_iter); return ret; } @@ -305,7 +306,8 @@ static int hash_redo_key(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = k_iter->pos; - return bch2_trans_update(trans, k_iter, delete, 0) ?: + return bch2_btree_iter_traverse(k_iter) ?: + bch2_trans_update(trans, k_iter, delete, 0) ?: bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); } @@ -491,6 +493,7 @@ static int check_inode(struct btree_trans *trans, ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, + bch2_btree_iter_traverse(iter) ?: bch2_inode_write(trans, iter, &u)); if (ret) bch_err(c, "error in fsck: error %i " @@ -562,7 +565,8 @@ static int fix_overlapping_extent(struct btree_trans *trans, BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); @@ -886,6 +890,7 @@ retry: ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &n->k_i, 0)); kfree(n); if (ret) @@ -1338,6 +1343,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, + bch2_btree_iter_traverse(iter) ?: bch2_inode_write(&trans, iter, &u)); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f32414171aab..c6fa4ca31ae9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -509,16 +509,8 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, iter = bch2_trans_get_node_iter(trans, id, k->k.p, BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT); - - /* - * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run - * extent_handle_overwrites() and extent_update_to_keys() - but we don't - * want that here, journal replay is supposed to treat extents like - * regular keys: - */ - BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); @@ -546,7 +538,8 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + ret = 
bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ba700810a4be..ebf391245470 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -142,7 +142,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, goto err; /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + bch2_btree_iter_set_pos_to_extent_start(reflink_iter); r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -257,11 +257,11 @@ s64 bch2_remap_range(struct bch_fs *c, } if (src_k.k->type != KEY_TYPE_reflink_p) { + bch2_btree_iter_set_pos_to_extent_start(src_iter); + bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); - ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k); if (ret) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index fa9f600fc17c..a8b8e3a072ad 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -40,13 +40,8 @@ static int test_delete(struct bch_fs *c, u64 nr) iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch_err(c, "lookup error in test_delete: %i", ret); - goto err; - } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete: %i", ret); @@ -55,7 +50,8 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting once"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(iter) ?: + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error (first) in test_delete: %i", ret); goto err; @@ -63,7 +59,8 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting twice"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(iter) ?: + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error (second) in test_delete: %i", ret); goto err; @@ -591,6 +588,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) k.k.p = iter->pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in rand_mixed: %i", ret); @@ -671,6 +669,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) insert.k.p = iter->pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &insert.k_i, 0)); if (ret) { bch_err(c, "error in seq_insert: %i", ret); @@ -719,6 +718,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &u.k_i, 0)); if (ret) { bch_err(c, "error in seq_overwrite: %i", ret); -- cgit From 68a507a2e8cdc9b90599bb5d220a696abdc54838 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 22:29:54 -0400 Subject: bcachefs: fix truncate with ATTR_MODE After the v5.12 rebase, we started oopsing when truncate was passed ATTR_MODE, due to not passing mnt_userns to setattr_copy(). 
This refactors things so that truncate/extend finish by using bch2_setattr_nonsize(), which solves the problem. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 43 +++++++++++++++++++++++-------------------- fs/bcachefs/fs-io.h | 3 ++- fs/bcachefs/fs.c | 11 +++++++---- fs/bcachefs/fs.h | 4 ++++ 4 files changed, 36 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0ffc3971d1b2..a25c3b70ef74 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2252,11 +2252,11 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, +static int bch2_extend(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, struct iattr *iattr) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; @@ -2270,25 +2270,15 @@ static int bch2_extend(struct bch_inode_info *inode, return ret; truncate_setsize(&inode->v, iattr->ia_size); - /* ATTR_MODE will never be set here, ns argument isn't needed: */ - setattr_copy(NULL, &inode->v, iattr); - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_setattr_nonsize(idmap, inode, iattr); } static int bch2_truncate_finish_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); return 0; } @@ -2302,7 +2292,8 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode, return 0; } -int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) +int bch2_truncate(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; @@ -2313,6 +2304,18 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) s64 i_sectors_delta = 0; int ret = 0; + /* + * Don't update timestamps if we're not doing anything: + */ + if (iattr->ia_size == inode->v.i_size) + return 0; + + if (!(iattr->ia_valid & ATTR_MTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_mtime); + if (!(iattr->ia_valid & ATTR_CTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_ctime); + iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); @@ -2342,10 +2345,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { - ret = bch2_extend(inode, &inode_u, iattr); + ret = bch2_extend(idmap, inode, &inode_u, iattr); goto err; } + iattr->ia_valid &= ~ATTR_SIZE; + ret = bch2_truncate_page(inode, iattr->ia_size); if (unlikely(ret)) goto err; @@ -2389,13 +2394,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (unlikely(ret)) goto err; - /* ATTR_MODE will never be set here, ns argument isn't needed: */ - setattr_copy(NULL, &inode->v, iattr); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, - ATTR_MTIME|ATTR_CTIME); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); mutex_unlock(&inode->ei_update_lock); + + ret = bch2_setattr_nonsize(idmap, inode, iattr); err: 
bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 2a2df58a46bb..64b16b44e25a 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -31,7 +31,8 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct bch_inode_info *, struct iattr *); +int bch2_truncate(struct mnt_idmap *, + struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index efb467316756..71e738b98967 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -662,6 +662,9 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_GID) bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_SIZE) + bi->bi_size = attr->ia_size; + if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); if (ia_valid & ATTR_MTIME) @@ -682,9 +685,9 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, } } -static int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct iattr *attr) +int bch2_setattr_nonsize(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; @@ -808,7 +811,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(inode, iattr) + ? bch2_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index f3072780af51..c08a828d66cd 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -166,6 +166,10 @@ void bch2_inode_update_after_write(struct bch_fs *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); +int bch2_setattr_nonsize(struct mnt_idmap *, + struct bch_inode_info *, + struct iattr *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); -- cgit From 297d89343dd9be32878a747c53d3477c986a01b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 21:44:27 -0400 Subject: bcachefs: Extensive triggers cleanups - We no longer mark subsets of extents, they're marked like regular keys now - which means we can drop the offset & sectors arguments to trigger functions - Drop other arguments that are no longer needed anymore in various places - fs_usage - Drop the logic for handling extents in bch2_mark_update() that isn't needed anymore, to match bch2_trans_mark_update() - Better logic for hanlding the BTREE_ITER_CACHED_NOFILL case, where we don't have an old key to mark Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 +- fs/bcachefs/btree_types.h | 6 +- fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/btree_update_leaf.c | 4 +- fs/bcachefs/buckets.c | 573 ++++++++++++++++-------------------- fs/bcachefs/buckets.h | 23 +- fs/bcachefs/ec.c | 3 +- fs/bcachefs/extents.h | 11 + 8 files changed, 284 insertions(+), 346 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b94fac1bc114..480d4afb9aae 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -669,6 +669,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; unsigned flags = + BTREE_TRIGGER_INSERT| 
BTREE_TRIGGER_GC| (initial ? BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; @@ -710,7 +711,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags); + bch2_mark_key(c, *k, flags); fsck_err: err: if (ret) @@ -1081,8 +1082,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - 0, 0, NULL, 0, - BTREE_TRIGGER_GC); + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 23de816b4757..3e7edaffbb9d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -644,7 +644,6 @@ enum btree_trigger_flags { __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_OVERWRITE_SPLIT, __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, @@ -655,12 +654,15 @@ enum btree_trigger_flags { #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e35e842efe81..f577fd386ab4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -511,7 +511,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, bkey_s_c_null, bkey_i_to_s_c(k), - 0, 0, BTREE_TRIGGER_INSERT); + BTREE_TRIGGER_INSERT); if (ret) return ret; } @@ -520,7 +520,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), bkey_s_c_null, - 0, 0, BTREE_TRIGGER_OVERWRITE); + BTREE_TRIGGER_OVERWRITE); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 00706b952630..81c111176b1f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -362,7 +362,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i->iter, i->k, NULL, + bch2_mark_update(trans, i->iter, i->k, i->trigger_flags|BTREE_TRIGGER_GC); } } @@ -468,7 +468,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, - NULL, i->trigger_flags); + i->trigger_flags); if (marking && trans->fs_usage_deltas) bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3dd206d3b546..71900e7e921f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -355,17 +355,16 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, } static void bch2_dev_usage_update(struct bch_fs *c, struct 
bch_dev *ca, - struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, u64 journal_seq, bool gc) { + struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - if (!fs_usage) - fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) @@ -393,30 +392,48 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } +static inline int __update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) +{ + int idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) + return -1; + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + return 0; +} + static inline int update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) { + struct bch_fs_usage __percpu *fs_usage; int idx = bch2_replicas_entry_idx(c, r); if (idx < 0) return -1; + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; + preempt_enable(); return 0; } static inline int update_cached_sectors(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - unsigned dev, s64 sectors) + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, fs_usage, &r.e, sectors); + return update_replicas(c, &r.e, sectors, journal_seq, gc); } static struct replicas_delta_list * @@ -511,7 +528,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c old, struct bkey_s_c new, - struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; @@ -553,7 +569,7 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); + bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -569,8 +585,8 @@ static int bch2_mark_alloc(struct bch_fs *c, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - if (update_cached_sectors(c, fs_usage, ca->dev_idx, - -old_m.cached_sectors)) { + if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, + journal_seq, gc)) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return -1; } @@ -621,8 +637,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, old.dirty_sectors, sectors); if (c) - bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, 0, gc); + bch2_dev_usage_update(c, ca, old, new, 0, gc); return 0; } @@ -641,54 +656,20 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (b >= ca->mi.nbuckets) return; - preempt_disable(); - if (likely(c)) { do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ca, b, type, sectors); } else { __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); } - - preempt_enable(); } -static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) +static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) { - return DIV_ROUND_UP(sectors * n, d); -} - -static s64 
__ptr_disk_sectors_delta(unsigned old_size, - unsigned offset, s64 delta, - unsigned flags, - unsigned n, unsigned d) -{ - BUG_ON(!n || !d); - - if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { - BUG_ON(offset + -delta > old_size); - - return -disk_sectors_scaled(n, d, old_size) + - disk_sectors_scaled(n, d, offset) + - disk_sectors_scaled(n, d, old_size - offset + delta); - } else if (flags & BTREE_TRIGGER_OVERWRITE) { - BUG_ON(offset + -delta > old_size); - - return -disk_sectors_scaled(n, d, old_size) + - disk_sectors_scaled(n, d, old_size + delta); - } else { - return disk_sectors_scaled(n, d, delta); - } -} - -static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - unsigned offset, s64 delta, - unsigned flags) -{ - return __ptr_disk_sectors_delta(p.crc.live_size, - offset, delta, flags, - p.crc.compressed_size, - p.crc.uncompressed_size); + return p.crc.compression_type + ? DIV_ROUND_UP(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; } static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, @@ -767,7 +748,6 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, unsigned ptr_idx, - struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -809,7 +789,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); return 0; } @@ -838,7 +818,6 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; @@ -876,7 +855,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); @@ -886,8 +865,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags) + s64 sectors, + unsigned journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; struct bch_replicas_padded r; @@ -922,40 +901,46 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, fs_usage, &r.e, sectors); + update_replicas(c, &r.e, sectors, journal_seq, gc); return 0; } static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c old, struct bkey_s_c new, - unsigned offset, s64 sectors, - enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { + bool gc = flags & BTREE_TRIGGER_GC; struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? 
c->opts.btree_node_size + : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; - BUG_ON(!sectors); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_btree - ? sectors - : ptr_disk_sectors_delta(p, offset, sectors, flags); + s64 disk_sectors = ptr_disk_sectors(sectors, p); ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + journal_seq, flags); if (ret < 0) return ret; @@ -963,8 +948,8 @@ static int bch2_mark_extent(struct bch_fs *c, if (p.ptr.cached) { if (!stale) - if (update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors)) { + if (update_cached_sectors(c, p.ptr.dev, disk_sectors, + journal_seq, gc)) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); return -1; @@ -974,7 +959,7 @@ static int bch2_mark_extent(struct bch_fs *c, r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - fs_usage, disk_sectors, flags); + disk_sectors, journal_seq, flags); if (ret) return ret; @@ -988,7 +973,7 @@ static int bch2_mark_extent(struct bch_fs *c, } if (r.e.nr_devs) { - if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { + if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, k); @@ -1001,9 +986,8 @@ static int bch2_mark_extent(struct bch_fs *c, } static int bch2_mark_stripe(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; size_t idx = new.k->p.offset; @@ -1064,14 +1048,14 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(c, new, i, fs_usage, - journal_seq, flags); + ret = mark_stripe_bucket(c, new, i, journal_seq, flags); if (ret) return ret; } - if (update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant))) { + if (update_replicas(c, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc)) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, new); @@ -1083,13 +1067,47 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } -static int __bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c_reflink_p p, - u64 idx, unsigned sectors, - unsigned front_frag, - unsigned back_frag, - unsigned flags, - size_t *r_idx) +static int bch2_mark_inode(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) +{ + struct bch_fs_usage __percpu *fs_usage; + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; + fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; + preempt_enable(); + return 0; +} + +static int bch2_mark_reservation(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) +{ + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; + preempt_enable(); + + return 0; +} + +static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, + u64 idx, unsigned flags, size_t *r_idx) { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; @@ -1100,7 +1118,7 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, r = genradix_ptr(&c->reflink_gc_table, *r_idx); BUG_ON(!r); - if (r->offset > idx) + if (idx < r->offset) break; (*r_idx)++; } @@ -1108,7 +1126,7 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, BUG_ON((s64) r->refcount + add < 0); r->refcount += add; - return min_t(u64, sectors, r->offset - idx); + return r->offset - idx; not_found: bch2_fs_inconsistent(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", @@ -1118,22 +1136,19 @@ not_found: } static int bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c_reflink_p p, unsigned offset, - s64 sectors, unsigned flags) + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) { - u64 idx = le64_to_cpu(p.v->idx) + offset; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - unsigned front_frag, back_frag; + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = p.k->size; s64 ret = 0; - if (sectors < 0) - sectors = -sectors; - - BUG_ON(offset + sectors > p.k->size); - - front_frag = offset; - back_frag = offset + sectors; + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); l = 0; r = c->reflink_gc_nr; @@ -1148,11 +1163,11 @@ static int bch2_mark_reflink_p(struct bch_fs *c, } while (sectors) { - ret = __bch2_mark_reflink_p(c, p, idx, sectors, - front_frag, back_frag, flags, &l); + ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); if (ret < 0) return ret; + ret = min_t(s64, ret, sectors); idx += ret; sectors -= ret; } @@ -1163,99 +1178,55 @@ static int bch2_mark_reflink_p(struct bch_fs *c, static int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c old, struct bkey_s_c new, - unsigned offset, s64 sectors, - struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - int ret = 0; BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); - preempt_disable(); - - if (!fs_usage || (flags & BTREE_TRIGGER_GC)) - fs_usage = fs_usage_ptr(c, journal_seq, - flags & BTREE_TRIGGER_GC); - switch (k.k->type) { case KEY_TYPE_alloc: case KEY_TYPE_alloc_v2: - ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); - break; + return bch2_mark_alloc(c, old, new, journal_seq, flags); case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - sectors = !(flags & BTREE_TRIGGER_OVERWRITE) - ? 
c->opts.btree_node_size - : -c->opts.btree_node_size; - - ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_btree, fs_usage, journal_seq, flags); - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_user, fs_usage, journal_seq, flags); - break; + return bch2_mark_extent(c, old, new, journal_seq, flags); case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); - break; + return bch2_mark_stripe(c, old, new, journal_seq, flags); case KEY_TYPE_inode: - fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; - fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; - break; - case KEY_TYPE_reservation: { - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - - sectors *= replicas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(fs_usage->persistent_reserved)); - - fs_usage->reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - break; - } + return bch2_mark_inode(c, old, new, journal_seq, flags); + case KEY_TYPE_reservation: + return bch2_mark_reservation(c, old, new, journal_seq, flags); case KEY_TYPE_reflink_p: - ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), - offset, sectors, flags); - break; + return bch2_mark_reflink_p(c, old, new, journal_seq, flags); + default: + return 0; } - - preempt_enable(); - - return ret; } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, - unsigned offset, s64 sectors, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) { - struct bkey deleted; + struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; - bkey_init(&deleted); - percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, old, new, offset, sectors, - fs_usage, journal_seq, - BTREE_TRIGGER_INSERT|flags); + ret = bch2_mark_key_locked(c, old, new, 0, flags); percpu_up_read(&c->mark_lock); return ret; } -int bch2_mark_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags) +int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *new, unsigned flags) { struct bch_fs *c = trans->c; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; - struct bkey unpacked; - int ret = 0; + int iter_flags, ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; @@ -1263,87 +1234,36 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bkey_init(&unpacked); - old = (struct bkey_s_c) { &unpacked, NULL }; + if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { + iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; + iter->flags &= ~BTREE_ITER_WITH_UPDATES; - if (!btree_node_type_is_extents(iter->btree_id)) { - /* iterators should be uptodate, shouldn't get errors here: */ - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); - BUG_ON(bkey_err(old)); - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; + old = bch2_btree_iter_peek_slot(iter); + iter->flags |= iter_flags; - if (ck->valid) - old = bkey_i_to_s_c(ck->k); - } + ret = bkey_err(old); + if (ret) + return ret; + } else { + /* + * If BTREE_ITER_CACHED_NOFILL was used, we better not be + * running triggers that do anything on removal (alloc btree): + */ + old = 
deleted; + } - if (old.k->type == new->k.type) { - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + trans->journal_res.seq, BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - - } else { - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_OVERWRITE|flags); - } } else { - struct btree_iter *copy; - - BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - 0, new->k.size, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); - - copy = bch2_trans_copy_iter(trans, iter); - - for_each_btree_key_continue(copy, 0, old, ret) { - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; - - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - - ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - offset, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; - if (ret <= 0) - break; - } - bch2_trans_iter_put(trans, copy); + ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new), + trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key_locked(c, old, deleted, + trans->journal_res.seq, + BTREE_TRIGGER_OVERWRITE|flags); } return ret; @@ -1420,7 +1340,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - BUG_ON(update_replicas(c, dst, &d->r, d->delta)); + BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); } dst->nr_inodes += deltas->nr_inodes; @@ -1609,31 +1529,38 @@ err: } static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, unsigned offset, - s64 sectors, unsigned flags, - enum bch_data_type data_type) + struct bkey_s_c k, unsigned flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? c->opts.btree_node_size + : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; - BUG_ON(!sectors); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_btree - ? 
sectors - : ptr_disk_sectors_delta(p, offset, sectors, flags); + s64 disk_sectors = ptr_disk_sectors(sectors, p); - ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, - data_type); + ret = bch2_trans_mark_pointer(trans, k, p, + disk_sectors, data_type); if (ret < 0) return ret; @@ -1769,10 +1696,49 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } +static int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) +{ + int nr = (new.k->type == KEY_TYPE_inode) - + (old.k->type == KEY_TYPE_inode); + + if (nr) { + struct replicas_delta_list *d = + replicas_deltas_realloc(trans, 0); + d->nr_inodes += nr; + } + + return 0; +} + +static int bch2_trans_mark_reservation(struct btree_trans *trans, + struct bkey_s_c k, unsigned flags) +{ + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; + + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + d = replicas_deltas_realloc(trans, 0); + + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->persistent_reserved)); + + d->persistent_reserved[replicas - 1] += sectors; + return 0; +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - u64 idx, unsigned sectors, - unsigned flags) + u64 idx, unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter *iter; @@ -1790,8 +1756,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret) goto err; - sectors = min_t(u64, sectors, k.k->p.offset - idx); - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1822,29 +1786,26 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret) goto err; - ret = sectors; + ret = k.k->p.offset - idx; err: bch2_trans_iter_put(trans, iter); return ret; } static int bch2_trans_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, unsigned offset, - s64 sectors, unsigned flags) + struct bkey_s_c k, unsigned flags) { - u64 idx = le64_to_cpu(p.v->idx) + offset; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = p.k->size; s64 ret = 0; - if (sectors < 0) - sectors = -sectors; - - BUG_ON(offset || sectors != p.k->size); - while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); if (ret < 0) return ret; + ret = min_t(s64, ret, sectors); idx += ret; sectors -= ret; } @@ -1852,59 +1813,27 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return 0; } -int bch2_trans_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned offset, s64 sectors, unsigned flags) +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + struct bkey_s_c new, unsigned flags) { - struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - struct replicas_delta_list *d; BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - sectors = !(flags & BTREE_TRIGGER_OVERWRITE) - ? 
c->opts.btree_node_size - : -c->opts.btree_node_size; - - return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_btree); case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_user); + return bch2_trans_mark_extent(trans, k, flags); case KEY_TYPE_stripe: return bch2_trans_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: { - int nr = (new.k->type == KEY_TYPE_inode) - - (old.k->type == KEY_TYPE_inode); - - if (nr) { - d = replicas_deltas_realloc(trans, 0); - d->nr_inodes += nr; - } - - return 0; - } - case KEY_TYPE_reservation: { - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - - d = replicas_deltas_realloc(trans, 0); - - sectors *= replicas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; - return 0; - } + case KEY_TYPE_inode: + return bch2_trans_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: + return bch2_trans_mark_reservation(trans, k, flags); case KEY_TYPE_reflink_p: - return bch2_trans_mark_reflink_p(trans, - bkey_s_c_to_reflink_p(k), - offset, sectors, flags); + return bch2_trans_mark_reflink_p(trans, k, flags); default: return 0; } @@ -1915,7 +1844,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, struct bkey_i *new, unsigned flags) { - struct bkey_s_c old; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; int iter_flags, ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1924,25 +1855,33 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; - iter->flags &= ~BTREE_ITER_WITH_UPDATES; - old = bch2_btree_iter_peek_slot(iter); + if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { + iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; + iter->flags &= ~BTREE_ITER_WITH_UPDATES; - iter->flags |= iter_flags; + old = bch2_btree_iter_peek_slot(iter); + iter->flags |= iter_flags; - ret = bkey_err(old); - if (ret) - return ret; + ret = bkey_err(old); + if (ret) + return ret; + } else { + /* + * If BTREE_ITER_CACHED_NOFILL was used, we better not be + * running triggers that do anything on removal (alloc btree): + */ + old = deleted; + } if (old.k->type == new->k.type && - !btree_node_type_is_extents(iter->btree_id)) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, -((s64) old.k->size), + bch2_trans_mark_key(trans, old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 61be96a7b03d..3fb91ef60685 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -125,20 +125,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, - unsigned live_size) -{ - return live_size && p.crc.compression_type - ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, - p.crc.uncompressed_size)) - : live_size; -} - -static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) -{ - return __ptr_disk_sectors(p, p.crc.live_size); -} - /* bucket gc marks */ static inline unsigned bucket_sectors_used(struct bucket_mark mark) @@ -240,14 +226,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, - s64, struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bch_fs_usage *, unsigned); + struct bkey_i *, unsigned); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, - unsigned, s64, unsigned); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0ee8ecd9d8a0..7cc7ca94b082 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1633,7 +1633,8 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(c, k, 0, 0, NULL, 0, + bch2_mark_key(c, k, + BTREE_TRIGGER_INSERT| BTREE_TRIGGER_NOATOMIC); return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3f6224f75ce8..43cef0a3bdf3 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -426,6 +426,17 @@ void bch2_extent_crc_append(struct bkey_i *, /* Generic code for keys with pointers: */ +static inline bool bkey_is_btree_ptr(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return true; + default: + return false; + } +} + static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { -- cgit From 109a4277124ca1a0ff670b519315c4f381c83ad3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Jun 2021 23:34:02 -0400 Subject: bcachefs: Don't disable preemption unnecessarily Small improvements to some percpu utility code. 
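Roughly, the pattern being removed versus its replacement (a sketch, not the exact hunks): open-coding a per-cpu store with a preempt_disable()/preempt_enable() pair around this_cpu_ptr(), versus letting this_cpu_write() perform a single preemption-safe per-cpu access:

    /* before: manual preemption fencing around the dereference */
    preempt_disable();
    *this_cpu_ptr(dst) = src;
    preempt_enable();

    /* after: this_cpu_write() is already safe against migration */
    this_cpu_write(*dst, src);

In bch2_acc_percpu_u64s() the this_cpu_ptr() result is only used as the slot the other CPUs' counters are summed into, so it does not matter which CPU's copy it ends up pointing at and the preempt_disable()/preempt_enable() pair can simply be dropped.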
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 6 +----- fs/bcachefs/util.h | 5 +---- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index f183c9d80e2c..f287bca8498d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -893,13 +893,9 @@ void eytzinger0_find_test(void) */ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) { - u64 *ret; + u64 *ret = this_cpu_ptr(p); int cpu; - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); - for_each_possible_cpu(cpu) { u64 *i = per_cpu_ptr(p, cpu); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 91aa8c0a0e09..a0cbebf190b4 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -715,10 +715,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) for_each_possible_cpu(cpu) *per_cpu_ptr(dst, cpu) = 0; - - preempt_disable(); - *this_cpu_ptr(dst) = src; - preempt_enable(); + this_cpu_write(*dst, src); } static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -- cgit From 33a1f84b76b94a64cc8244b176567257170c6695 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 16 Jun 2021 23:21:23 -0400 Subject: bcachefs: ensure iter->should_be_locked is set Ensure that iter->should_be_locked value is set to true before we call bch2_trans_update in ec_stripe_update_ptrs. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7cc7ca94b082..c0855245f2ec 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -863,7 +863,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); if (ret == -EINTR) -- cgit From 0806151913dff8547aae3713c1f793eb5d702b96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Jun 2021 16:28:43 -0400 Subject: bcachefs: Don't ratelimit certain fsck errors It's unhelpful if we see "Halting mark and sweep to start topology repair" but we don't see the error that triggered it. 
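Put differently: the topology checks now pass FSCK_NO_RATELIMIT, and the ratelimiter honours it, so the error that triggered the repair pass is always printed even while other fsck errors are being suppressed. Condensed from the error.c hunk below, the suppression decision becomes roughly:

    if (c->opts.ratelimit_errors &&
        !(flags & FSCK_NO_RATELIMIT) &&     /* new: callers may opt out */
        s->nr >= FSCK_ERR_RATELIMIT_NR) {
            /* existing suppression path runs only when no opt-out was given */
    }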
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 46 +++++++++++++++++++++++++++++----------------- fs/bcachefs/error.c | 1 + fs/bcachefs/error.h | 1 + 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 480d4afb9aae..de0d88a64244 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -86,12 +86,16 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (bpos_cmp(expected_start, bp->v.min_key)) { bch2_topology_error(c); - if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { bch_info(c, "Halting mark and sweep to start topology repair pass"); return FSCK_ERR_START_TOPOLOGY_REPAIR; } else { @@ -103,12 +107,16 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (is_last && bpos_cmp(cur.k->k.p, node_end)) { bch2_topology_error(c); - if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { bch_info(c, "Halting mark and sweep to start topology repair pass"); return FSCK_ERR_START_TOPOLOGY_REPAIR; } else { @@ -884,11 +892,15 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, if (ret == -EIO) { bch2_topology_error(c); - if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n" - " %s", - bch2_btree_ids[b->c.btree_id], - b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { ret = FSCK_ERR_START_TOPOLOGY_REPAIR; bch_info(c, "Halting mark and sweep to start topology repair pass"); goto fsck_err; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 90c3b986c264..2cea694575e9 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -111,6 +111,7 @@ found: list_move(&s->list, &c->fsck_errors); s->nr++; if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && s->nr >= FSCK_ERR_RATELIMIT_NR) { if (s->nr == FSCK_ERR_RATELIMIT_NR) suppressing = true; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d8cd19b3f63c..986938298adc 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -104,6 +104,7 @@ struct fsck_err_state { #define FSCK_CAN_FIX (1 << 0) #define FSCK_CAN_IGNORE (1 << 1) #define FSCK_NEED_FSCK (1 << 2) +#define FSCK_NO_RATELIMIT (1 << 3) __printf(3, 4) __cold enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -- cgit From 
d976a84e3b486b04314dcf5183af3123549cda95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Jun 2021 20:44:54 -0400 Subject: bcachefs: Don't loop into topology repair Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_gc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9bd60369703f..bed2e76e6dc8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -498,6 +498,7 @@ enum { BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index de0d88a64244..1d690190d958 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -95,7 +95,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); return FSCK_ERR_START_TOPOLOGY_REPAIR; } else { @@ -116,7 +117,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); return FSCK_ERR_START_TOPOLOGY_REPAIR; } else { @@ -900,7 +902,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ret = FSCK_ERR_START_TOPOLOGY_REPAIR; bch_info(c, "Halting mark and sweep to start topology repair pass"); goto fsck_err; @@ -1599,11 +1602,14 @@ again: if (ret) goto out; bch_info(c, "topology repair pass done"); + + set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); } ret = bch2_gc_btrees(c, initial, metadata_only); if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ret = 0; -- cgit From 50ad5d097977dc5c688e7a1a6dc2e74c37da5adf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Jun 2021 21:51:17 -0400 Subject: bcachefs: Fix btree_node_read_all_replicas() error handling We weren't checking bch2_btree_node_read_done() for errors, oops. 
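Besides checking the return value, the rewrite stops copying every plausible replica into b->data as it is examined; it remembers the index of the best candidate, copies it once at the end, and - the actual bug fix - propagates the result of bch2_btree_node_read_done(). The tail of the function ends up shaped like this:

    if (best >= 0) {
            memcpy(b->data, ra->buf[best], btree_bytes(c));
            ret = bch2_btree_node_read_done(c, NULL, b, false);
    } else {
            ret = -1;                       /* no usable replica found */
    }

    if (ret)
            set_btree_node_read_error(b);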
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 4ffdc11f4d9a..6274211d09f1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1187,31 +1187,27 @@ static void btree_node_read_all_replicas_done(struct closure *cl) container_of(cl, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; - bool have_good_copy = false; bool dump_bset_maps = false; bool have_retry = false; - int ret = 0, write = READ; + int ret = 0, best = -1, write = READ; unsigned i, written, written2; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + if (ra->err[i]) continue; - if (!have_good_copy) { - memcpy(b->data, ra->buf[i], btree_bytes(c)); - have_good_copy = true; - written = btree_node_sectors_written(c, b->data); - } + if (le64_to_cpu(bn->magic) != bset_magic(c) || + (seq && seq != bn->keys.seq)) + continue; - /* Try to get the right btree node: */ - if (have_good_copy && - seq && - b->data->keys.seq != seq && - ((struct btree_node *) ra->buf[i])->keys.seq == seq) { - memcpy(b->data, ra->buf[i], btree_bytes(c)); - written = btree_node_sectors_written(c, b->data); + if (best < 0) { + best = i; + written = btree_node_sectors_written(c, bn); + continue; } written2 = btree_node_sectors_written(c, ra->buf[i]); @@ -1221,14 +1217,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl) btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), BTREE_ERR_FIXABLE, c, NULL, b, NULL, "found bset signature after last bset") || - btree_err_on(memcmp(b->data, ra->buf[i], written << 9), + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), BTREE_ERR_FIXABLE, c, NULL, b, NULL, "btree node replicas content mismatch")) dump_bset_maps = true; if (written2 > written) { written = written2; - memcpy(b->data, ra->buf[i], btree_bytes(c)); + best = i; } } fsck_err: @@ -1281,9 +1277,14 @@ fsck_err: } } - if (have_good_copy) - bch2_btree_node_read_done(c, NULL, b, false); - else + if (best >= 0) { + memcpy(b->data, ra->buf[best], btree_bytes(c)); + ret = bch2_btree_node_read_done(c, NULL, b, false); + } else { + ret = -1; + } + + if (ret) set_btree_node_read_error(b); for (i = 0; i < ra->nr; i++) { -- cgit From 077ed08ec7edbcf0bfcadfa01260a5b0cb4ed4db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Jun 2021 15:44:11 -0400 Subject: bcachefs: Use memalloc_nofs_save() in bch2_read_endio() This solves a problematic memory allocation in bch2_bio_uncompress() -> vmap(). 
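memalloc_nofs_save()/memalloc_nofs_restore() make every allocation by the current task behave as if GFP_NOFS had been passed, which keeps reclaim from recursing back into the filesystem from deep inside the read completion path - useful for helpers like vmap() that allocate internally and take no gfp_t. The general pattern, as a standalone sketch (function name is illustrative):

    #include <linux/sched/mm.h>

    static void example_endio_work(void)
    {
            unsigned nofs_flags;

            nofs_flags = memalloc_nofs_save();

            /*
             * Anything allocated here - including allocations buried in
             * helpers that take no gfp_t - is implicitly GFP_NOFS, so
             * reclaim won't re-enter the filesystem.
             */

            memalloc_nofs_restore(nofs_flags);
    }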
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index e13382fc5b01..f80b3ce4c7d7 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1808,8 +1808,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; struct bch_csum csum; + nofs_flags = memalloc_nofs_save(); + /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { src->bi_iter.bi_size = crc.compressed_size << 9; @@ -1874,6 +1877,8 @@ nodecode: rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } +out: + memalloc_nofs_restore(nofs_flags); return; csum_err: /* @@ -1884,7 +1889,7 @@ csum_err: if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { rbio->flags |= BCH_READ_MUST_BOUNCE; bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - return; + goto out; } bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, @@ -1892,12 +1897,12 @@ csum_err: rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - return; + goto out; decompression_err: bch_err_inum_ratelimited(c, rbio->read_pos.inode, "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - return; + goto out; } static void bch2_read_endio(struct bio *bio) -- cgit From 729608a606fdef3dc97762081442d7b4a40d16a4 Mon Sep 17 00:00:00 2001 From: Christopher James Halse Rogers Date: Fri, 25 Jun 2021 11:45:19 +1000 Subject: bcachefs: Fix unused variable warning when !BCACHEFS_DEBUG Signed-off-by: Christopher James Halse Rogers Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e9351632cc03..cf26da1ab895 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1549,7 +1549,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { +#ifdef CONFIG_BCACHEFS_DEBUG struct bpos old_pos = iter->real_pos; +#endif int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; -- cgit From 32abe2e38f5da39a79561b8e42f22062171c798d Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Tue, 29 Jun 2021 18:52:13 -0400 Subject: bcachefs: ensure iter->should_be_locked is set Ensure that iter->should_be_locked is set to true before we call bch2_trans_update in __bch2_dev_usrdata_drop. 
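Repositioning the iterator with bch2_btree_iter_set_pos() leaves it un-traversed (and therefore not marked should_be_locked), hence the extra bch2_btree_iter_traverse() in the chain before queueing the update - the same shape as the earlier ec.c fix:

    bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));

    ret = bch2_btree_iter_traverse(iter) ?:     /* re-traverse, reacquire locks */
          bch2_trans_update(&trans, iter, sk.k, 0) ?:
          bch2_trans_commit(&trans, NULL, NULL,
                            BTREE_INSERT_NOFAIL);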
Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 6ebe49ba2248..91a9f584dd6d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -73,7 +73,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); -- cgit From 6f152b0f375450b72724b6eb2ec00f7669fc910e Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Fri, 2 Jul 2021 21:22:06 -0400 Subject: bcachefs: fix ifdef for x86_64 asm The implementation of prefetch_four_cachelines should use ifdef CONFIG_X86_64 to conditionally compile x86_64 asm. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 84c4664c9912..9484f28be6de 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1181,7 +1181,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, static inline void prefetch_four_cachelines(void *p) { -#if CONFIG_X86_64 +#ifdef CONFIG_X86_64 asm("prefetcht0 (-127 + 64 * 0)(%0);" "prefetcht0 (-127 + 64 * 1)(%0);" "prefetcht0 (-127 + 64 * 2)(%0);" -- cgit From 78d66ab1ca541ba95a9ad89780466398b348c230 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Sun, 27 Jun 2021 20:54:34 -0400 Subject: bcachefs: fix truncate without a size change Do not attempt to shortcut a truncate when the given new size is the same as the current size. There may be blocks allocated to the file that extend beyond the i_size. The ctime and mtime should not be updated in this case. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a25c3b70ef74..bcf954a2394f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2305,16 +2305,17 @@ int bch2_truncate(struct mnt_idmap *idmap, int ret = 0; /* - * Don't update timestamps if we're not doing anything: + * If the truncate call with change the size of the file, the + * cmtimes should be updated. If the size will not change, we + * do not need to update the cmtimes. 
*/ - if (iattr->ia_size == inode->v.i_size) - return 0; - - if (!(iattr->ia_valid & ATTR_MTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_mtime); - if (!(iattr->ia_valid & ATTR_CTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_ctime); - iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + if (iattr->ia_size != inode->v.i_size) { + if (!(iattr->ia_valid & ATTR_MTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_mtime); + if (!(iattr->ia_valid & ATTR_CTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_ctime); + iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); -- cgit From fdc6b08451167bfc1ae260b252ad2bf2f9735f50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Jun 2021 13:19:25 -0400 Subject: bcachefs: Fix shift-by-64 in bch2_bkey_format_validate() We need to ensure that packed formats can't represent fields larger than the unpacked format, which is a bit tricky since the calculations can also overflow a u64. This patch fixes a shift and simplifies the overall calculations. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 5de88a93f33f..3e62eeb6774e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -623,22 +623,22 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) if (f->nr_fields != BKEY_NR_FIELDS) return "incorrect number of fields"; + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ for (i = 0; i < f->nr_fields; i++) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > unpacked_bits) + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) return "field too large"; - if ((f->bits_per_field[i] == unpacked_bits) && field_offset) - return "offset + bits overflow"; - - if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & - unpacked_mask) < - field_offset) - return "offset + bits overflow"; - bits += f->bits_per_field[i]; } -- cgit From e8e9607f3c1bb927002b7582b68d36c7eb3e92e2 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 23 Jun 2021 19:25:00 -0400 Subject: bcachefs: statfs bfree and bavail should be the same The value of f_bfree and f_bavail should be the same. The value of f_bfree is not currently scaled by the availability factor. 
Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 71e738b98967..1b0d63219c3b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1277,8 +1277,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = usage.free >> shift; - buf->f_bavail = avail_factor(usage.free) >> shift; + buf->f_bfree = avail_factor(usage.free) >> shift; + buf->f_bavail = buf->f_bfree; buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; -- cgit From 31029f2f70e6abc833ceefb0f32adf4e7bf42235 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 23 Jun 2021 21:52:41 -0400 Subject: bcachefs: Fix bch2_acl_chmod() cleanup on error Avoid calling kfree on the returned error pointer if bch2_acl_from_disk fails. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index be31d27443bc..1642518d3233 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -372,7 +372,7 @@ int bch2_acl_chmod(struct btree_trans *trans, acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (ret || !acl) + if (IS_ERR_OR_NULL(acl)) goto err; ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); @@ -391,7 +391,8 @@ int bch2_acl_chmod(struct btree_trans *trans, acl = NULL; err: bch2_trans_iter_put(trans, iter); - kfree(acl); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); return ret; } -- cgit From 508b1f71396486648e3d42d1814e7f148fbb7751 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Jul 2021 23:57:09 -0400 Subject: bcachefs: Fix bch2_btree_iter_peek_prev() In !BTREE_ITER_IS_EXTENTS mode, we shouldn't be looking at k->size, i.e. we shouldn't use bkey_start_pos(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cf26da1ab895..31d8c89ae255 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1779,7 +1779,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 - : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) + : bkey_cmp(k.k->p, iter->pos) > 0)) k = btree_iter_level_prev(iter, l); if (likely(k.k)) -- cgit From a515d0a50c6e8bb9b8ce58d6e396b4c5eeea6905 Mon Sep 17 00:00:00 2001 From: Tobias Geerinckx-Rice Date: Sun, 4 Jul 2021 21:35:32 +0200 Subject: bcachefs: Enforce SYS_CAP_ADMIN within ioctls bch2_fs_ioctl() didn't distinguish between unsupported ioctls and those which the current user is unauthorised to perform. That kept the code simple but meant that, for example, an unprivileged TIOCGWINSZ ioctl on a bcachefs file would return -EPERM instead of the expected -ENOTTY. The same call made by a privileged user would correctly return -ENOTTY. Fix this discrepancy by moving the check for CAP_SYS_ADMIN into each privileged ioctl function. 
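Concretely, each privileged handler now performs its own check up front, so the top-level dispatcher no longer rejects every non-admin caller before even looking at the command, and unknown ioctls fall through to the usual -ENOTTY handling for all callers. The per-handler pattern, as it appears in the hunks below (body elided):

    static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
    {
            if (!capable(CAP_SYS_ADMIN))
                    return -EPERM;          /* privileged op, unprivileged caller */

            if (arg.flags || arg.pad)
                    return -EINVAL;

            /* ... actual work elided ... */
    }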
Signed-off-by: Tobias Geerinckx-Rice Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 99f112072ae5..c1d8143b3fd8 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -157,6 +157,9 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c, #if 0 static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -165,6 +168,9 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) static long bch2_ioctl_stop(struct bch_fs *c) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + bch2_fs_stop(c); return 0; } @@ -175,6 +181,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) char *path; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -192,6 +201,9 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) { struct bch_dev *ca; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| @@ -211,6 +223,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) char *path; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -228,6 +243,9 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| @@ -250,6 +268,9 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| @@ -331,6 +352,9 @@ static long bch2_ioctl_data(struct bch_fs *c, unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; int ret, fd = -1; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.op >= BCH_DATA_OP_NR || arg.flags) return -EINVAL; @@ -497,6 +521,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, struct bch_sb *sb; int ret = 0; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || arg.pad) return -EINVAL; @@ -537,6 +564,9 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, struct bch_dev *ca; unsigned i; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for_each_online_member(ca, c, i) if (ca->disk_sb.bdev->bd_dev == dev) { percpu_ref_put(&ca->io_ref); @@ -552,6 +582,9 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~BCH_BY_INDEX) || arg.pad) return -EINVAL; @@ -572,6 +605,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~BCH_BY_INDEX) || arg.pad) return -EINVAL; @@ -597,7 +633,6 @@ do { \ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) { - /* ioctls that don't require admin cap: */ switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); @@ -605,12 +640,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return bch2_ioctl_fs_usage(c, arg); 
case BCH_IOCTL_DEV_USAGE: return bch2_ioctl_dev_usage(c, arg); - } - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { #if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); @@ -626,7 +655,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EINVAL; - /* ioctls that do require admin cap: */ switch (cmd) { case BCH_IOCTL_DISK_ADD: BCH_IOCTL(disk_add, struct bch_ioctl_disk); -- cgit From 80ff5d18ee975f201c8913be066ebedf887003f9 Mon Sep 17 00:00:00 2001 From: jpsollie Date: Thu, 17 Jun 2021 11:29:59 +0200 Subject: bcachefs: Prepare checksums for more advanced algorithms Perform abstraction of hash calculation for advanced checksum algorithms. Algorithms like xxhash do not store their state as a u64 int. Signed-off-by: jpsollie Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 95 +++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3d88719ba86c..6c23a9073dbf 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -16,53 +16,68 @@ #include #include -static u64 bch2_checksum_init(unsigned type) +/* + * bch2_checksum state is an abstraction of the checksum state calculated over different pages. + * it features page merging without having the checksum algorithm lose its state. + * for native checksum aglorithms (like crc), a default seed value will do. + * for hash-like algorithms, a state needs to be stored + */ + +struct bch2_checksum_state { + union { + u64 seed; + }; + unsigned int type; +}; + +static void bch2_checksum_init(struct bch2_checksum_state *state) { - switch (type) { + switch (state->type) { case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C_NONZERO: - return U32_MAX; - case BCH_CSUM_CRC64_NONZERO: - return U64_MAX; case BCH_CSUM_CRC32C: - return 0; case BCH_CSUM_CRC64: - return 0; + state->seed = 0; + break; + case BCH_CSUM_CRC32C_NONZERO: + state->seed = U32_MAX; + break; + case BCH_CSUM_CRC64_NONZERO: + state->seed = U64_MAX; + break; default: BUG(); } } -static u64 bch2_checksum_final(unsigned type, u64 crc) +static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { - switch (type) { + switch (state->type) { case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C_NONZERO: - return crc ^ U32_MAX; - case BCH_CSUM_CRC64_NONZERO: - return crc ^ U64_MAX; case BCH_CSUM_CRC32C: - return crc; case BCH_CSUM_CRC64: - return crc; + return state->seed; + case BCH_CSUM_CRC32C_NONZERO: + return state->seed ^ U32_MAX; + case BCH_CSUM_CRC64_NONZERO: + return state->seed ^ U64_MAX; default: BUG(); } } -static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) +static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { - switch (type) { + switch (state->type) { case BCH_CSUM_NONE: - return 0; + return; case BCH_CSUM_CRC32C_NONZERO: case BCH_CSUM_CRC32C: - return crc32c(crc, data, len); + state->seed = crc32c(state->seed, data, len); + break; case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC64: - return crc64_be(crc, data, len); + state->seed = crc64_be(state->seed, data, len); + break; default: BUG(); } @@ -141,12 +156,14 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { - u64 crc = bch2_checksum_init(type); + struct bch2_checksum_state state; - crc = 
bch2_checksum_update(type, crc, data, len); - crc = bch2_checksum_final(type, crc); + state.type = type; - return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + bch2_checksum_init(&state); + bch2_checksum_update(&state, data, len); + + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -190,23 +207,23 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { - u64 crc = bch2_checksum_init(type); + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; - crc = bch2_checksum_update(type, - crc, p, bv.bv_len); + bch2_checksum_update(&state, p, bv.bv_len); kunmap_atomic(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) - crc = bch2_checksum_update(type, crc, - page_address(bv.bv_page) + bv.bv_offset, + bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif - crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -284,16 +301,22 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, struct bch_csum b, size_t b_len) { + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + state.seed = a.lo; + BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { unsigned b = min_t(unsigned, b_len, PAGE_SIZE); - a.lo = bch2_checksum_update(type, a.lo, + bch2_checksum_update(&state, page_address(ZERO_PAGE(0)), b); b_len -= b; } - + a.lo = bch2_checksum_final(&state); a.lo ^= b.lo; a.hi ^= b.hi; return a; -- cgit From 41e633826a1418f3b492d9137d395289e6e67d15 Mon Sep 17 00:00:00 2001 From: jpsollie Date: Thu, 17 Jun 2021 13:42:09 +0200 Subject: bcachefs: add bcachefs xxhash support xxhash is a much faster algorithm compared to crc32. could be used to speed up checksum calculation. xxhash 64-bit only, as it is much faster on 64-bit CPUs compared to xxh32. 
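The new checksum type plugs into the bch2_checksum_state union introduced by the previous patch, using the kernel's <linux/xxhash.h> streaming interface. A minimal standalone illustration of that interface, independent of bcachefs (function name is made up):

    #include <linux/xxhash.h>

    static u64 xxh64_of_two_buffers(const void *a, size_t a_len,
                                    const void *b, size_t b_len)
    {
            struct xxh64_state state;

            xxh64_reset(&state, 0);         /* seed 0, as in bch2_checksum_init() */
            xxh64_update(&state, a, a_len); /* state carries across chunks, */
            xxh64_update(&state, b, b_len); /* which is what the bio loops need */

            return xxh64_digest(&state);
    }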
Signed-off-by: jpsollie Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 1 + fs/bcachefs/bcachefs_format.h | 7 +++++-- fs/bcachefs/checksum.c | 12 ++++++++++++ fs/bcachefs/checksum.h | 2 ++ 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 151c4b10d543..bfe7e6c9c064 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -19,6 +19,7 @@ config BCACHEFS_FS select KEYS select RAID6_PQ select XOR_BLOCKS + select XXHASH select SRCU help The bcachefs filesystem - a modern, copy on write filesystem, with diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6cfb8959d579..63f7c7c8f390 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1460,7 +1460,8 @@ enum bch_csum_type { BCH_CSUM_CHACHA20_POLY1305_128 = 4, BCH_CSUM_CRC32C = 5, BCH_CSUM_CRC64 = 6, - BCH_CSUM_NR = 7, + BCH_CSUM_XXHASH = 7, + BCH_CSUM_NR = 8, }; static const unsigned bch_crc_bytes[] = { @@ -1469,6 +1470,7 @@ static const unsigned bch_crc_bytes[] = { [BCH_CSUM_CRC32C] = 4, [BCH_CSUM_CRC64_NONZERO] = 8, [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_XXHASH] = 8, [BCH_CSUM_CHACHA20_POLY1305_80] = 10, [BCH_CSUM_CHACHA20_POLY1305_128] = 16, }; @@ -1487,7 +1489,8 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) #define BCH_CSUM_OPTS() \ x(none, 0) \ x(crc32c, 1) \ - x(crc64, 2) + x(crc64, 2) \ + x(xxhash, 3) enum bch_csum_opts { #define x(t, n) BCH_CSUM_OPT_##t = n, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 6c23a9073dbf..d20924e579bf 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,7 @@ struct bch2_checksum_state { union { u64 seed; + struct xxh64_state h64state; }; unsigned int type; }; @@ -44,6 +46,9 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) case BCH_CSUM_CRC64_NONZERO: state->seed = U64_MAX; break; + case BCH_CSUM_XXHASH: + xxh64_reset(&state->h64state, 0); + break; default: BUG(); } @@ -60,6 +65,8 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) return state->seed ^ U32_MAX; case BCH_CSUM_CRC64_NONZERO: return state->seed ^ U64_MAX; + case BCH_CSUM_XXHASH: + return xxh64_digest(&state->h64state); default: BUG(); } @@ -78,6 +85,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * case BCH_CSUM_CRC64: state->seed = crc64_be(state->seed, data, len); break; + case BCH_CSUM_XXHASH: + xxh64_update(&state->h64state, data, len); + break; default: BUG(); } @@ -155,6 +165,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, case BCH_CSUM_CRC32C_NONZERO: case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: + case BCH_CSUM_XXHASH: case BCH_CSUM_CRC64: { struct bch2_checksum_state state; @@ -206,6 +217,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, case BCH_CSUM_CRC32C_NONZERO: case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: + case BCH_CSUM_XXHASH: case BCH_CSUM_CRC64: { struct bch2_checksum_state state; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 728b7ef1a149..6841fb16568a 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -83,6 +83,8 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; case BCH_CSUM_OPT_crc64: return data ? 
BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + case BCH_CSUM_OPT_xxhash: + return BCH_CSUM_XXHASH; default: BUG(); } -- cgit From 618b1c0e20ac7bccebba0346ab1e1403fadd6aa0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jul 2021 22:02:07 -0400 Subject: bcachefs: Split out SPOS_MAX Internal btree code really wants a POS_MAX with all fields ~0; external code more likely wants the snapshot field to be 0, because when we're passing it to bch2_trans_get_iter() it's used for the snapshot we're operating in, which should be 0 for most btrees that don't use snapshots. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/debug.c | 4 ++-- fs/bcachefs/move.c | 4 ++-- 7 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 63f7c7c8f390..24853bd8923b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -152,7 +152,8 @@ static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) } #define POS_MIN SPOS(0, 0, 0) -#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) +#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) #define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1d690190d958..8bddab73a461 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -962,7 +962,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, goto fsck_err; } - if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, "btree root with incorrect max_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { bch_err(c, "repair unimplemented"); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index e9a87394370a..59dfb069e699 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -87,7 +87,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) */ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) { - return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); + return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 31d8c89ae255..8d42feae070a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1081,7 +1081,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, } lock_type = __btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, + if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, iter, lock_type, lock_root_check_fn, rootp, trace_ip))) @@ -1602,7 +1602,7 @@ out: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = bpos_cmp(pos, POS_MAX) != 0; + bool ret = bpos_cmp(pos, SPOS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); @@ -1624,7 +1624,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bpos_cmp(next_pos, POS_MAX) != 0; + 
bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0; /* * Typically, we don't want to modify iter->pos here, since that @@ -1634,7 +1634,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) if (ret) btree_iter_set_search_pos(iter, bpos_successor(next_pos)); else - bch2_btree_iter_set_pos(iter, POS_MAX); + bch2_btree_iter_set_pos(iter, SPOS_MAX); return ret; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f577fd386ab4..361ae8063bb2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -367,7 +367,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) struct btree *b = bch2_btree_node_alloc(as, level); btree_set_min(b, POS_MIN); - btree_set_max(b, POS_MAX); + btree_set_max(b, SPOS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); @@ -1590,7 +1590,7 @@ retry: b = iter->l[level].b; if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -2014,7 +2014,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->c.btree_id = id; bkey_btree_ptr_init(&b->key); - b->key.k.p = POS_MAX; + b->key.k.p = SPOS_MAX; *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); @@ -2022,7 +2022,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->data->flags = 0; btree_set_min(b, POS_MIN); - btree_set_max(b, POS_MAX); + btree_set_max(b, SPOS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d6dde62b6d48..cbadb38f680f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -313,7 +313,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (err) return err; - if (!i->size || !bpos_cmp(POS_MAX, i->from)) + if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); @@ -329,7 +329,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * can't easily correctly restart a btree node traversal across * all nodes, meh */ - i->from = bpos_cmp(POS_MAX, b->key.k.p) + i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 93d7beaa5fce..f1337a16cafb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -769,7 +769,7 @@ static int bch2_move_btree(struct bch_fs *c, break; if ((cmp_int(id, end_btree_id) ?: - bkey_cmp(b->key.k.p, end_pos)) > 0) + bpos_cmp(b->key.k.p, end_pos)) > 0) break; stats->pos = iter->pos; @@ -921,7 +921,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) ret = bch2_move_btree(c, 0, POS_MIN, - BTREE_ID_NR, POS_MAX, + BTREE_ID_NR, SPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); -- cgit From c21affdd06661c8eb73c71fc54cdb8ec921b968f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jul 2021 22:08:28 -0400 Subject: bcachefs: Fix bch2_btree_iter_peek_slot() assertion This assertion is checking that what the iterator points to is consistent with iter->real_pos, and since it's an internal btree ordering property it should be using bpos_cmp. 
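For context (my reading of the two helpers, hedged): bkey_cmp() compares only the inode and offset fields, while bpos_cmp() also compares the snapshot field and so reflects the full internal btree ordering - which is also why the previous patch splits POS_MAX (snapshot 0) from SPOS_MAX (snapshot max). Roughly:

    /* illustration only; semantics as read from the surrounding patches */
    struct bpos a = SPOS(1, 100, 0);                 /* snapshot 0   */
    struct bpos b = SPOS(1, 100, KEY_SNAPSHOT_MAX);  /* snapshot max */

    bkey_cmp(a, b);   /* == 0: snapshot field not considered   */
    bpos_cmp(a, b);   /*  < 0: snapshot field breaks the tie   */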
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8d42feae070a..1592929178bf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1859,7 +1859,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) switch (btree_iter_type(iter)) { case BTREE_ITER_KEYS: k = btree_iter_level_peek_all(iter, &iter->l[0]); - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); break; case BTREE_ITER_CACHED: ck = (void *) iter->l[0].b; -- cgit From d5bee8ca5a118f352dc81f2452ac41dcaf5100c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jul 2021 22:18:07 -0400 Subject: bcachefs: bch2_d_types[] Add readable names for d_type, and use it in dirent_to_text(). Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/opts.c | 12 ++++++++++++ fs/bcachefs/opts.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 3bf6379cefe6..d5883ab7de21 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -112,7 +112,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); + pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 64bf5a382d63..fd3f7cddb9ab 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -63,6 +63,18 @@ const char * const bch2_member_states[] = { #undef x +const char * const bch2_d_types[] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", + [DT_DIR] = "dir", + [DT_BLK] = "blk", + [DT_REG] = "reg", + [DT_LNK] = "lnk", + [DT_SOCK] = "sock", + [DT_WHT] = "whiteout", +}; + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 074ab2d4f0fa..707659e31893 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,6 +18,7 @@ extern const char * const bch2_str_hash_types[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; +extern const char * const bch2_d_types[]; /* * Mount options; we also store defaults in the superblock. -- cgit From b00fde8fb1e7271a4c07ed89249a115ccea96cc3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jul 2021 22:16:02 -0400 Subject: bcachefs: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE Add a new flag to control assertions about updating to internal snapshot nodes, that normally should not be written to - to be used in an upcoming patch. Also do some renaming - trigger_flags is now update_flags. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 4 +++- fs/bcachefs/btree_types.h | 8 ++++++-- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_leaf.c | 29 +++++++++++++++-------------- 4 files changed, 25 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 221cb0f46db0..bafa1f0abc88 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -398,7 +398,9 @@ retry: * to be using alloc reserves: * */ ret = bch2_btree_iter_traverse(b_iter) ?: - bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: + bch2_trans_update(trans, b_iter, ck->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3e7edaffbb9d..134d221d150e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -340,7 +340,7 @@ struct bkey_cached { }; struct btree_insert_entry { - unsigned trigger_flags; + unsigned flags; u8 bkey_type; enum btree_id btree_id:8; u8 level; @@ -639,7 +639,9 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -enum btree_trigger_flags { +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ __BTREE_TRIGGER_INSERT, @@ -650,6 +652,8 @@ enum btree_trigger_flags { __BTREE_TRIGGER_NOATOMIC, }; +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) + #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index cbfc8544def4..1c085a28b832 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -77,7 +77,7 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i *); int bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_trigger_flags); + struct bkey_i *, enum btree_update_flags); void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 81c111176b1f..984e060f6732 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -363,7 +363,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i->iter, i->k, - i->trigger_flags|BTREE_TRIGGER_GC); + i->flags|BTREE_TRIGGER_GC); } } @@ -468,7 +468,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, - i->trigger_flags); + i->flags); if (marking && trans->fs_usage_deltas) bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); @@ -791,8 +791,7 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *update_iter = bch2_trans_copy_iter(trans, iter); - ret = bch2_btree_delete_at(trans, update_iter, - i->trigger_flags); + ret = bch2_btree_delete_at(trans, update_iter, i->flags); bch2_trans_iter_put(trans, update_iter); if (ret) @@ -859,14 +858,16 @@ static int extent_handle_overwrites(struct btree_trans *trans, if 
(ret) goto out; - bch2_trans_update(trans, update_iter, update, i->trigger_flags); + bch2_trans_update(trans, update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + i->flags); bch2_trans_iter_put(trans, update_iter); } if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { update_iter = bch2_trans_copy_iter(trans, iter); ret = bch2_btree_delete_at(trans, update_iter, - i->trigger_flags); + i->flags); bch2_trans_iter_put(trans, update_iter); if (ret) @@ -881,7 +882,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(i->k->k.p, update); - bch2_trans_update(trans, iter, update, i->trigger_flags); + bch2_trans_update(trans, iter, update, i->flags); goto out; } next: @@ -927,7 +928,7 @@ int __bch2_trans_commit(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && - !(i->trigger_flags & BTREE_TRIGGER_NORUN)) + !(i->flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, i->btree_id, i->k->k.p); #endif @@ -945,8 +946,8 @@ int __bch2_trans_commit(struct btree_trans *trans) i->trans_triggers_run = true; trans_trigger_run = true; - ret = bch2_trans_mark_update(trans, i->iter, i->k, - i->trigger_flags); + ret = bch2_trans_mark_update(trans, i->iter, + i->k, i->flags); if (unlikely(ret)) { if (ret == -EINTR) trace_trans_restart_mark(trans->ip, _RET_IP_, @@ -1029,10 +1030,10 @@ err: } int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_trigger_flags flags) + struct bkey_i *k, enum btree_update_flags flags) { struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .trigger_flags = flags, + .flags = flags, .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, @@ -1130,7 +1131,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, } int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned trigger_flags) + struct btree_iter *iter, unsigned update_flags) { struct bkey_i *k; @@ -1140,7 +1141,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k->k); k->k.p = iter->pos; - return bch2_trans_update(trans, iter, k, trigger_flags); + return bch2_trans_update(trans, iter, k, update_flags); } int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, -- cgit From f0412b6e44bba6dc6f36feab560c4d325f9f021e Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Thu, 8 Jul 2021 18:15:38 -0400 Subject: bcachefs: set disk state should check new_state A new device state that is not a valid state should return -EINVAL in the disk set state ioctl. 
Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index c1d8143b3fd8..aae9a2db8b0d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -275,7 +275,8 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| BCH_BY_INDEX)) || - arg.pad[0] || arg.pad[1] || arg.pad[2]) + arg.pad[0] || arg.pad[1] || arg.pad[2] || + arg.new_state >= BCH_MEMBER_STATE_NR) return -EINVAL; ca = bch2_device_lookup(c, arg.dev, arg.flags); -- cgit From d38494c462b457d1d4b551b0a84425f5d1d19606 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Wed, 7 Jul 2021 22:31:36 -0400 Subject: bcachefs: docs: add docs for bch2_trans_reset Add basic kernel docs for bch2_trans_reset and bch2_trans_begin. Signed-off-by: Dan Robertson Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 16 ++++++++++++++++ fs/bcachefs/btree_iter.h | 7 +++++++ 2 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1592929178bf..13e4dea8a09b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2310,6 +2310,22 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) } } +/** + * bch2_trans_reset() - reset a transaction after a interrupted attempt + * @trans: transaction to reset + * @flags: transaction reset flags. + * + * While iterating over nodes or updating nodes a attempt to lock a btree + * node may return EINTR when the trylock fails. When this occurs + * bch2_trans_reset() or bch2_trans_begin() should be called and the + * transaction retried. + * + * Transaction reset flags include: + * + * - TRANS_RESET_NOUNLOCK - Do not attempt to unlock and reschedule the + * transaction. + * - TRANS_RESET_NOTRAVERSE - Do not traverse all linked iters. + */ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6efea281d87f..31175cf00c0a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -319,6 +319,13 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr void bch2_trans_reset(struct btree_trans *, unsigned); +/** + * bch2_trans_begin() - ensure lock consistency of transaction on retry + * @trans: transaction to prepare + * + * Ensure lock ordering is correct before potentially retrying a transaction + * after a failed trylock. + */ static inline void bch2_trans_begin(struct btree_trans *trans) { return bch2_trans_reset(trans, 0); -- cgit From e3a67bdb6e3ab6a6850c546a3772998fe5242069 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Jul 2021 23:22:06 -0400 Subject: bcachefs: Regularize argument passing of btree_trans btree_trans should always be passed when we have one - iter->trans is disfavoured. This mainly updates old code in btree_update_interior.c, some of which predates btree_trans. 
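The mechanical effect is that the interior-update helpers gain an explicit struct btree_trans * parameter instead of fishing the transaction out of the iterator (or taking a bare struct bch_fs *). For example, from the btree_update.h hunk below:

    /* before */
    int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                                __le64 seq, unsigned flags);

    /* after: the transaction is passed explicitly; c is available as trans->c */
    int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter,
                                __le64 seq, unsigned flags);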
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 +-- fs/bcachefs/btree_io.c | 8 +++-- fs/bcachefs/btree_io.h | 4 +-- fs/bcachefs/btree_update.h | 8 ++--- fs/bcachefs/btree_update_interior.c | 60 +++++++++++++++++++++++-------------- fs/bcachefs/btree_update_interior.h | 25 +++++++--------- fs/bcachefs/btree_update_leaf.c | 17 ++++++----- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 2 +- 9 files changed, 73 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8bddab73a461..f0a5b6b2b189 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -800,13 +800,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!initial) { if (max_stale > 64) - bch2_btree_node_rewrite(c, iter, + bch2_btree_node_rewrite(&trans, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!bch2_btree_gc_rewrite_disabled && (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(c, iter, + bch2_btree_node_rewrite(&trans, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6274211d09f1..0095c789f6e5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -420,9 +420,11 @@ void bch2_btree_build_aux_trees(struct btree *b) * * Returns true if we sorted (i.e. invalidated iterators */ -void bch2_btree_init_next(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +void bch2_btree_init_next(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { + struct bch_fs *c = trans->c; struct btree_node_entry *bne; bool reinit_iter = false; @@ -1563,7 +1565,7 @@ retry: if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, k.k); + ret = bch2_btree_node_update_key(&trans, iter, b, k.k); if (ret == -EINTR) goto retry; if (ret) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index abbc4675964a..fae67622c127 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -138,8 +138,8 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_node_drop_keys_outside_node(struct btree *); void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct bch_fs *, struct btree *, - struct btree_iter *); +void bch2_btree_init_next(struct btree_trans *, struct btree_iter *, + struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1c085a28b832..12065bba82dd 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -8,8 +8,8 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, - struct btree_iter *); +void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, + struct btree *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); @@ -70,10 +70,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, __le64, unsigned); void bch2_btree_node_rewrite_async(struct 
bch_fs *, struct btree *); -int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *); int bch2_trans_update(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 361ae8063bb2..a5d3973e163a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -22,6 +22,10 @@ #include +static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_iter *, struct btree *, + struct keylist *, unsigned); + /* Debug code: */ /* @@ -1355,8 +1359,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, +static void btree_split(struct btree_update *as, + struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; @@ -1422,7 +1427,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); } else if (n3) { bch2_btree_set_root(as, n3, iter); } else { @@ -1460,7 +1465,7 @@ static void btree_split(struct btree_update *as, struct btree *b, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_trans_verify_locks(trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], start_time); @@ -1494,9 +1499,10 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ -void bch2_btree_insert_node(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - unsigned flags) +static void bch2_btree_insert_node(struct btree_update *as, + struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct keylist *keys, + unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1509,7 +1515,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - bch2_btree_node_lock_for_insert(c, b, iter); + bch2_btree_node_lock_for_insert(trans, iter, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(b, iter); @@ -1537,12 +1543,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, btree_node_interior_verify(c, b); return; split: - btree_split(as, b, iter, keys, flags); + btree_split(as, trans, iter, b, keys, flags); } -int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_split_leaf(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { + struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; struct btree_update *as; unsigned l; @@ -1553,22 +1561,22 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, b, iter, NULL, flags); + btree_split(as, trans, iter, b, NULL, flags); bch2_btree_update_done(as); for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) - ret = bch2_foreground_maybe_merge(c, iter, l, flags); + ret = bch2_foreground_maybe_merge(trans, iter, l, flags); return ret; } -int __bch2_foreground_maybe_merge(struct bch_fs *c, +int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree_iter *iter, unsigned level, unsigned flags, enum btree_node_sibling sib) { - struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; struct btree_iter *sib_iter = NULL; struct btree_update *as; struct bkey_format_state new_s; @@ -1697,7 +1705,7 @@ retry: bch2_btree_node_write(c, n, SIX_LOCK_intent); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); bch2_btree_update_get_open_buckets(as, n); @@ -1750,9 +1758,11 @@ err: /** * bch_btree_node_rewrite - Rewrite/move a btree node */ -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, __le64 seq, unsigned flags) { + struct bch_fs *c = trans->c; struct btree *b, *n, *parent; struct btree_update *as; int ret; @@ -1795,7 +1805,8 @@ retry: if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, iter, parent, + &as->parent_keys, flags); } else { bch2_btree_set_root(as, n, iter); } @@ -1834,7 +1845,7 @@ void async_btree_node_rewrite_work(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, BTREE_MAX_DEPTH, a->level, 0); - bch2_btree_node_rewrite(c, iter, a->seq, 0); + bch2_btree_node_rewrite(&trans, iter, a->seq, 0); bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); @@ -1867,12 +1878,13 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) queue_work(c->btree_interior_update_worker, &a->work); } -static void 
__bch2_btree_node_update_key(struct bch_fs *c, - struct btree_update *as, +static void __bch2_btree_node_update_key(struct btree_update *as, + struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct btree *new_hash, struct bkey_i *new_key) { + struct bch_fs *c = as->c; struct btree *parent; int ret; @@ -1889,7 +1901,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, } bch2_keylist_add(&as->parent_keys, new_key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); + bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1926,10 +1938,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_update_done(as); } -int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_node_update_key(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b, struct bkey_i *new_key) { + struct bch_fs *c = trans->c; struct btree *parent = btree_node_parent(iter, b); struct btree_update *as = NULL; struct btree *new_hash = NULL; @@ -1962,7 +1976,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + __bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key); bch2_btree_iter_downgrade(iter); err: diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7ed67b47e1b9..e88e737ee813 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -131,15 +131,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -void bch2_btree_insert_node(struct btree_update *, struct btree *, - struct btree_iter *, struct keylist *, - unsigned); -int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); -int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, unsigned, unsigned, enum btree_node_sibling); -static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, struct btree_iter *iter, unsigned level, unsigned flags, enum btree_node_sibling sib) @@ -153,20 +150,20 @@ static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, return 0; b = iter->l[level].b; - if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib); } -static inline int bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - unsigned flags) +static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_iter *iter, + unsigned level, + unsigned flags) { - return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + return bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, btree_prev_sib) ?: - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, btree_next_sib); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c 
index 984e060f6732..c593c8d712c5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -36,9 +36,12 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } -inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { + struct bch_fs *c = trans->c; + bch2_btree_node_lock_write(b, iter); if (btree_iter_type(iter) == BTREE_ITER_CACHED) @@ -53,7 +56,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, * a new bset to insert into: */ if (want_new_bset(c, b)) - bch2_btree_init_next(c, b, iter); + bch2_btree_init_next(trans, iter, b); } /* Inserting into a given leaf node (last stage of insert): */ @@ -518,7 +521,7 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree } return u64s_delta <= 0 - ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level, + ? (bch2_foreground_maybe_merge(trans, iter, iter->level, trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) : 0; } @@ -608,8 +611,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(c, - iter_l(i->iter)->b, i->iter); + bch2_btree_node_lock_for_insert(trans, i->iter, + iter_l(i->iter)->b); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); @@ -662,7 +665,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(c, i->iter, flags); + ret = bch2_btree_split_leaf(trans, i->iter, flags); /* * if the split succeeded without dropping locks the insert will diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 91a9f584dd6d..aacd6385db1f 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -139,7 +139,7 @@ retry: break; } - ret = bch2_btree_node_update_key(c, iter, b, k.k); + ret = bch2_btree_node_update_key(&trans, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); ret = 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f1337a16cafb..3069f32efddd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -786,7 +786,7 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(c, iter, + ret = bch2_btree_node_rewrite(&trans, iter, b->data->keys.seq, 0) ?: ret; next: bch2_trans_cond_resched(&trans); -- cgit From 19d5432445fd0898231063518c375f7551d7c3d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Jul 2021 23:03:15 -0400 Subject: bcachefs: Really don't hold btree locks while btree IOs are in flight This is something we've attempted to stick to for quite some time, as it helps guarantee filesystem latency - but there's a few remaining paths that this patch fixes. This is also necessary for an upcoming patch to update btree pointers after every btree write - since the btree write completion path will now be doing btree operations. 
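The rule this patch enforces is: if a node has IO in flight, drop every btree lock first, wait for the IO bits to clear, then retake the locks and recheck. Below is a minimal userspace sketch of that drop-wait-retry shape using pthreads; the node struct, the separate flags mutex and the helper names are illustrative stand-ins, not the bcachefs six-lock types used in the diff that follows.

/*
 * Illustrative sketch only: "never wait on IO while holding node locks".
 * The types here are hypothetical; they model the wait_on_io/goto pattern
 * added to __btree_node_reclaim() below.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	pthread_mutex_t lock;        /* stands in for the node's six lock */
	pthread_mutex_t flags_lock;  /* protects the IO flags and condvar */
	pthread_cond_t  io_done;
	bool            read_in_flight;
	bool            write_in_flight;
};

static void node_wait_on_io(struct node *b)
{
	/* Mirrors the wait helpers: called with no node locks held. */
	pthread_mutex_lock(&b->flags_lock);
	while (b->read_in_flight || b->write_in_flight)
		pthread_cond_wait(&b->io_done, &b->flags_lock);
	pthread_mutex_unlock(&b->flags_lock);
}

static int node_reclaim(struct node *b)
{
wait_on_io:
	node_wait_on_io(b);            /* wait first, with no locks held */

	pthread_mutex_lock(&b->lock);  /* then take the node lock */

	pthread_mutex_lock(&b->flags_lock);
	if (b->read_in_flight || b->write_in_flight) {
		/* IO started again while we were acquiring the lock:
		 * drop everything and go back to waiting. */
		pthread_mutex_unlock(&b->flags_lock);
		pthread_mutex_unlock(&b->lock);
		goto wait_on_io;
	}
	pthread_mutex_unlock(&b->flags_lock);

	printf("reclaimed node with no IO in flight\n");
	pthread_mutex_unlock(&b->lock);
	return 0;
}

int main(void)
{
	struct node b = {
		.lock       = PTHREAD_MUTEX_INITIALIZER,
		.flags_lock = PTHREAD_MUTEX_INITIALIZER,
		.io_done    = PTHREAD_COND_INITIALIZER,
	};

	return node_reclaim(&b);
}

The recheck under the lock matters: between the wait and retaking the lock another thread may have started new IO, which is why __btree_node_reclaim() in the patch loops back to its wait_on_io label rather than assuming the node stayed quiet.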
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 95 +++++++++++++++++++++++-------------- fs/bcachefs/btree_io.c | 51 ++++++++++++++++++-- fs/bcachefs/btree_io.h | 26 +++------- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/debug.c | 4 +- 5 files changed, 116 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 15f597ab03e7..051d2867ad63 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + return -ENOMEM; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; @@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + goto out_unlock; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + if (btree_node_noevict(b)) goto out_unlock; if (!btree_node_may_write(b)) goto out_unlock; - if (btree_node_dirty(b) && - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - goto out_unlock; - - if (btree_node_dirty(b) || - btree_node_write_in_flight(b) || - btree_node_read_in_flight(b)) { - if (!flush) + if (btree_node_dirty(b)) { + if (!flush || + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) goto out_unlock; - - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) else __bch2_btree_node_write(c, b); - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; } out: if (b->hash_val && !ret) @@ -581,6 +594,7 @@ got_node: } BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); out: b->flags = 0; @@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, { struct btree_cache *bc = &c->btree_cache; struct btree *b; + u32 seq; BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* @@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } + set_btree_node_read_in_flight(b); + + six_unlock_write(&b->c.lock); + seq = b->c.lock.state.seq; + six_unlock_intent(&b->c.lock); + /* Unlock before doing IO: */ if (iter && sync) bch2_trans_unlock(iter->trans); bch2_btree_node_read(c, b, sync); - six_unlock_write(&b->c.lock); - - if (!sync) { - six_unlock_intent(&b->c.lock); + if (!sync) return NULL; - } /* * XXX: this will probably always fail because btree_iter_relock() * currently fails for iterators that aren't pointed at a valid btree * node */ - if (iter && !bch2_trans_relock(iter->trans)) { - six_unlock_intent(&b->c.lock); + if (iter && !bch2_trans_relock(iter->trans)) return ERR_PTR(-EINTR); - } - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); + if 
(!six_relock_type(&b->c.lock, lock_type, seq)) + return ERR_PTR(-EINTR); return b; } @@ -831,11 +846,12 @@ lock_node: } if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = b->c.lock.state.seq; + six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(iter->trans); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + bch2_btree_node_wait_on_read(b); /* * XXX: check if this always fails - btree_iter_relock() @@ -844,7 +860,9 @@ lock_node: */ if (iter && !bch2_trans_relock(iter->trans)) return ERR_PTR(-EINTR); - goto retry; + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; } prefetch(b->aux_data); @@ -923,8 +941,7 @@ lock_node: } /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + __bch2_btree_node_wait_on_read(b); prefetch(b->aux_data); @@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + * */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); six_lock_intent(&b->c.lock, NULL, NULL); six_lock_write(&b->c.lock, NULL, NULL); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - __bch2_btree_node_write(c, b); - - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } BUG_ON(btree_node_dirty(b)); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0095c789f6e5..2974b2ad6966 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -22,6 +22,44 @@ #include +void bch2_btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +void bch2_btree_node_io_lock(struct btree *b) +{ + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + static void verify_no_dups(struct btree *b, struct bkey_packed *start, struct bkey_packed *end) @@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans, EBUG_ON(iter && iter->l[b->c.level].b != b); BUG_ON(bset_written(b, bset(b, &b->set[1]))); - if (b->nsets == MAX_BSETS) { + if (b->nsets == MAX_BSETS && + !btree_node_write_in_flight(b)) { unsigned log_u64s[] = { ilog2(bset_u64s(&b->set[0])), ilog2(bset_u64s(&b->set[1])), @@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); - set_btree_node_read_in_flight(b); - if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) return; @@ -1480,6 +1517,8 @@ int 
bch2_btree_root_read(struct bch_fs *c, enum btree_id id, bkey_copy(&b->key, k); BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + set_btree_node_read_in_flight(b); + bch2_btree_node_read(c, b, true); if (btree_node_read_error(b)) { @@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_write *w = btree_prev_write(b); bch2_btree_complete_write(c, b, w); - btree_node_io_unlock(b); + bch2_btree_node_io_unlock(b); } static void bch2_btree_node_write_error(struct bch_fs *c, @@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; + BUG_ON(btree_node_write_in_flight(b)); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) * XXX waiting on btree writes with btree locks held - * this can deadlock, and we hit the write error path */ - btree_node_wait_on_io(b); + bch2_btree_node_wait_on_write(b); continue; } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index fae67622c127..89fd4aba5218 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -52,24 +52,12 @@ struct btree_write_bio { struct bch_write_bio wbio; }; -static inline void btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - clear_btree_node_write_in_flight(b); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -static inline void btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -static inline void btree_node_wait_on_io(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} +void bch2_btree_node_io_unlock(struct btree *); +void bch2_btree_node_io_lock(struct btree *); +void __bch2_btree_node_wait_on_read(struct btree *); +void __bch2_btree_node_wait_on_write(struct btree *); +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); static inline bool btree_node_may_write(struct btree *b) { @@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, } six_unlock_type(&b->c.lock, lock_held); - btree_node_wait_on_io(b); + bch2_btree_node_wait_on_write(b); btree_node_lock_type(c, b, lock_held); } } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a5d3973e163a..37dadbae41e5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as) six_unlock_read(&old->c.lock); if (seq == as->old_nodes_seq[i]) - btree_node_wait_on_io(old); + bch2_btree_node_wait_on_write(old); } /* diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cbadb38f680f..6a28de30ea3b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (c->opts.nochanges) return; - btree_node_io_lock(b); + bch2_btree_node_io_lock(b); mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { @@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) } out: mutex_unlock(&c->verify_lock); - btree_node_io_unlock(b); + bch2_btree_node_io_unlock(b); } #ifdef CONFIG_DEBUG_FS -- cgit From 2680325b7803c336bb675addfe38c06c44e54273 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Jul 2021 13:54:07 -0400 Subject: 
bcachefs: Mask out unknown compat features when going read-write Compat features should be cleared if the filesystem was touched by a version that doesn't support them. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 403e77e2c515..be080c407286 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -985,6 +985,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit From 0a70089062c63b0861217d9ffb76d3ac073d3fde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Jul 2021 16:41:14 -0400 Subject: bcachefs: Kick off btree node writes from write completions This is a performance improvement by removing the need to wait for the in flight btree write to complete before kicking one off, which is going to be needed to avoid a performance regression with the upcoming patch to update btree ptrs after every btree write. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++-- fs/bcachefs/btree_io.c | 61 +++++++++++++++++++++++++++++++++++++---------- fs/bcachefs/btree_io.h | 19 +++++---------- 3 files changed, 56 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 051d2867ad63..430d5951263f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -234,7 +234,7 @@ wait_on_io: if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -1009,7 +1009,7 @@ wait_on_io: six_lock_write(&b->c.lock, NULL, NULL); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2974b2ad6966..1d4b5fcd1e39 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1562,9 +1562,47 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; bch2_btree_complete_write(c, b, w); - bch2_btree_node_io_unlock(b); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + if (old & (1U << BTREE_NODE_need_write)) + goto do_write; + + new &= ~(1U << BTREE_NODE_write_in_flight); + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + return; + +do_write: + six_lock_read(&b->c.lock, NULL, NULL); + v = READ_ONCE(b->flags); + do { + old = new = v; + + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && + btree_node_may_write(b)) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); + new |= (1U << BTREE_NODE_just_written); + new ^= (1U << BTREE_NODE_write_idx); + } else { + new &= ~(1U << BTREE_NODE_write_in_flight); + } + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) + 
__bch2_btree_node_write(c, b, true); + + six_unlock_read(&b->c.lock); } static void bch2_btree_node_write_error(struct bch_fs *c, @@ -1729,7 +1767,7 @@ static void btree_write_submit(struct work_struct *work) bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1746,7 +1784,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; - BUG_ON(btree_node_write_in_flight(b)); + if (already_started) + goto do_write; if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1770,14 +1809,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) if (old & (1 << BTREE_NODE_never_write)) return; - if (old & (1 << BTREE_NODE_write_in_flight)) { - /* - * XXX waiting on btree writes with btree locks held - - * this can deadlock, and we hit the write error path - */ - bch2_btree_node_wait_on_write(b); - continue; - } + BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); @@ -1786,6 +1818,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2041,7 +2076,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_intent || (lock_type_held == SIX_LOCK_read && six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -2053,7 +2088,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); if (lock_type_held == SIX_LOCK_write && btree_node_just_written(b)) bch2_btree_post_write_cleanup(c, b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 89fd4aba5218..3732d135de8d 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -139,7 +139,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); void bch2_btree_write_error_work(struct work_struct *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *); +void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -148,18 +148,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *, static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, enum six_lock_type lock_held) { - while (b->written && - btree_node_need_write(b) && - btree_node_may_write(b)) { - if (!btree_node_write_in_flight(b)) { - bch2_btree_node_write(c, b, lock_held); - break; - } - - six_unlock_type(&b->c.lock, lock_held); - bch2_btree_node_wait_on_write(b); - btree_node_lock_type(c, b, lock_held); - } + if (b->written && + btree_node_need_write(b) && + btree_node_may_write(b) && + !btree_node_write_in_flight(b)) + bch2_btree_node_write(c, b, lock_held); } #define bch2_btree_node_write_cond(_c, _b, cond) \ -- cgit From 
003e738d4f0f61a8711ce41a03d8fb01a1cd7733 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Jul 2021 23:17:15 -0400 Subject: bcachefs: Ensure bad d_type doesn't oops in bch2_dirent_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 5 ++++- fs/bcachefs/opts.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d5883ab7de21..a95165b8eddf 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -112,7 +112,10 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]); + pr_buf(out, " -> %llu type %s", d.v->d_inum, + d.v->d_type < DT_MAX + ? bch2_d_types[d.v->d_type] + : "(bad d_type)"); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index fd3f7cddb9ab..5de296078219 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { #undef x -const char * const bch2_d_types[] = { +const char * const bch2_d_types[DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", -- cgit From 2e655e6de202d891f0232cfd3c56b8f8c176cf99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Jul 2021 23:52:49 -0400 Subject: bcachefs: Add open_buckets to sysfs This is to help debug a rare shutdown deadlock in the allocator code - the btree code is leaking open_buckets. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 19 +++++++++++++++++++ fs/bcachefs/alloc_background.h | 2 ++ fs/bcachefs/sysfs.c | 7 +++++++ 3 files changed, 28 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 15f9adf0876a..fc20649b19cf 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1232,3 +1232,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 9cadfdb5b83d..a4f6bf56b18f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -132,4 +132,6 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 84a7acb04d01..9b1ffbf96e14 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -171,6 +171,7 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); +read_attribute(open_buckets); read_attribute(internal_uuid); @@ -409,6 +410,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_open_buckets) { + bch2_open_buckets_to_text(&out, c); + return out.pos - buf; + } + if (attr == 
&sysfs_compression_stats) { bch2_compression_stats_to_text(&out, c); return out.pos - buf; @@ -567,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, + &sysfs_open_buckets, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, -- cgit From 8d3445878166ea726bc24326003ea7b9739cdc00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Jul 2021 16:03:51 -0400 Subject: bcachefs: Add safe versions of varint encode/decode This adds safe versions of bch2_varint_(encode|decode) that don't read or write past the end of the buffer, or varint being encoded. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +-- fs/bcachefs/inode.c | 6 ++-- fs/bcachefs/varint.c | 73 +++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/varint.h | 3 ++ 4 files changed, 80 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fc20649b19cf..26aca7d3977b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode(in, end, &v); \ + ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ @@ -166,7 +166,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, nr_fields++; \ \ if (src._name) { \ - out += bch2_varint_encode(out, src._name); \ + out += bch2_varint_encode_fast(out, src._name); \ \ last_nonzero_field = out; \ last_nonzero_fieldnr = nr_fields; \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c5f93b8ca1c6..565aebba30e6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -137,7 +137,7 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, nr_fields++; \ \ if (inode->_name) { \ - ret = bch2_varint_encode(out, inode->_name); \ + ret = bch2_varint_encode_fast(out, inode->_name); \ out += ret; \ \ if (_bits > 64) \ @@ -246,13 +246,13 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, #define x(_name, _bits) \ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ - ret = bch2_varint_decode(in, end, &v[0]); \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ if (ret < 0) \ return ret; \ in += ret; \ \ if (_bits > 64) { \ - ret = bch2_varint_decode(in, end, &v[1]); \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ if (ret < 0) \ return ret; \ in += ret; \ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 0f3d06a6a685..6955ff5dc19c 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -2,10 +2,18 @@ #include #include +#include #include #include "varint.h" +/** + * bch2_varint_encode - encode a variable length integer + * @out - destination to encode to + * @v - unsigned integer to encode + * + * Returns the size in bytes of the encoded integer - at most 9 bytes + */ int bch2_varint_encode(u8 *out, u64 v) { unsigned bits = fls64(v|1); @@ -14,16 +22,79 @@ int bch2_varint_encode(u8 *out, u64 v) if (likely(bytes < 9)) { v <<= bytes; v |= ~(~0 << (bytes - 1)); + v = cpu_to_le64(v); + memcpy(out, &v, bytes); } else { *out++ = 255; bytes = 9; + put_unaligned_le64(v, out); } - put_unaligned_le64(v, out); return bytes; } +/** + * bch2_varint_decode - encode a variable length integer + * @in - varint to decode + * @end - end of buffer to decode from + * @out - on success, decoded integer + * + * Returns the size in bytes of the decoded integer - or -1 on failure 
(would + * have read past the end of the buffer) + */ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + unsigned bytes = likely(in < end) + ? ffz(*in & 255) + 1 + : 1; + u64 v; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v = 0; + memcpy(&v, in, bytes); + v = le64_to_cpu(v); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} + +/** + * bch2_varint_encode_fast - fast version of bch2_varint_encode + * + * This version assumes it's always safe to write 8 bytes to @out, even if the + * encoded integer would be smaller. + */ +int bch2_varint_encode_fast(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +/** + * bch2_varint_decode_fast - fast version of bch2_varint_decode + * + * This version assumes that it is safe to read at most 8 bytes past the end of + * @end (we still return an error if the varint extends past @end). + */ +int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) { u64 v = get_unaligned_le64(in); unsigned bytes = ffz(v & 255) + 1; diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h index 8daf813576b7..92a182fb3d7a 100644 --- a/fs/bcachefs/varint.h +++ b/fs/bcachefs/varint.h @@ -5,4 +5,7 @@ int bch2_varint_encode(u8 *, u64); int bch2_varint_decode(const u8 *, const u8 *, u64 *); +int bch2_varint_encode_fast(u8 *, u64); +int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); + #endif /* _BCACHEFS_VARINT_H */ -- cgit From 9f6e1f7bb0e136871159a71d2bcd31c5dd8cbe81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Jul 2021 16:12:00 -0400 Subject: bcachefs: Fix an allocator shutdown deadlock On fstest generic/388, we were seeing sporadic deadlocks in the emergency shutdown, where we'd get stuck shutting down the allocator because bch2_btree_update_start() -> bch2_btree_reserve_get() allocated and then deallocated some btree nodes, putting them back on the btree_reserve_cache, after the allocator shutdown code had already cleared out that cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 37dadbae41e5..0b78fb9d3561 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -948,13 +948,6 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, closure_init_stack(&cl); retry: - /* - * This check isn't necessary for correctness - it's just to potentially - * prevent us from doing a lot of work that'll end up being wasted: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - return ERR_PTR(ret); /* * XXX: figure out how far we might need to split, @@ -995,6 +988,22 @@ retry: bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, as->inline_keys); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* + * We don't want to allocate if we're in an error state, that can cause + * deadlock on emergency shutdown due to open buckets getting stuck in + * the btree_reserve_cache after allocator shutdown has cleared it out. 
+ * This check needs to come after adding us to the btree_interior_update + * list but before calling bch2_btree_reserve_get, to synchronize with + * __bch2_fs_read_only(). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); @@ -1046,10 +1055,6 @@ retry: atomic64_read(&c->journal.seq), &as->journal, NULL); - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - return as; err: bch2_btree_update_free(as); -- cgit From 996fb577fdff69b1e5a72d7ca0f152ff7bb42bc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Jun 2021 17:07:18 -0400 Subject: bcachefs: Add an option for whether inodes use the key cache We probably don't ever want to flip this off in production, but it may be useful for certain kinds of testing. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/inode.c | 17 ++++++++++------- fs/bcachefs/opts.h | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 24853bd8923b..e6be594fd0be 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1350,6 +1350,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); /* * Features: diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 565aebba30e6..8c6627907431 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -300,8 +300,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_CACHED|flags); + if (trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -577,8 +579,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; + unsigned iter_flags = BTREE_ITER_INTENT; int ret; + if (cached && c->opts.inodes_use_key_cache) + iter_flags |= BTREE_ITER_CACHED; + bch2_trans_init(&trans, c, 0, 1024); /* @@ -600,11 +606,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), - (cached - ? 
BTREE_ITER_CACHED - : BTREE_ITER_SLOTS)| - BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, + POS(0, inode_nr), iter_flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 707659e31893..2cba0e137b58 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -173,6 +173,11 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_SHARD_INUMS, false, \ NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ -- cgit From 5468f1195dcfb9256b46484822dde31c9ec0af1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Jul 2021 00:14:45 -0400 Subject: bcachefs: Fix a memory leak in the dio write path There were some error paths where we were leaking page refs - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index bcf954a2394f..30c12864d537 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1826,8 +1826,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; - struct bvec_iter_all iter; - struct bio_vec *bv; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; @@ -1882,8 +1880,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); ret = -EFAULT; goto err; } @@ -1938,6 +1934,7 @@ loop: spin_unlock(&inode->v.i_lock); bio_release_pages(bio, false); + bio->bi_vcnt = 0; if (dio->op.error) { set_bit(EI_INODE_ERROR, &inode->ei_flags); @@ -1959,6 +1956,7 @@ err: if (dio->free_iov) kfree(dio->iter.__iov); + bio_release_pages(bio, false); bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ -- cgit From 5aab66353423f6398975ed9d7174f58628f6eb19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Jul 2021 15:13:27 -0400 Subject: bcachefs: Tighten up btree_iter locking assertions We weren't correctly verifying that we had interior node intent locks - this patch also fixes bugs uncovered by the new assertions. 
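As a rough picture of what the tightened assertions demand, the sketch below models an iterator that wants intent locks on its first locks_want levels and asserts it actually holds them. The types and the exact rule are simplified stand-ins, not the real checks in bch2_btree_iter_verify_locks().

/*
 * Toy model: every level this iterator has a node for, below locks_want,
 * must be intent locked.  Not the bcachefs definitions.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_DEPTH 4

enum lock_state { UNLOCKED, READ_LOCKED, INTENT_LOCKED };

struct toy_iter {
	unsigned        level;       /* level the iterator points at */
	unsigned        locks_want;  /* levels [level, locks_want) need intent */
	bool            have_node[MAX_DEPTH];
	enum lock_state lock[MAX_DEPTH];
};

static void verify_intent_locks(const struct toy_iter *iter)
{
	for (unsigned l = iter->level; l < iter->locks_want && l < MAX_DEPTH; l++)
		if (iter->have_node[l])
			assert(iter->lock[l] == INTENT_LOCKED);
}

int main(void)
{
	struct toy_iter iter = {
		.level      = 0,
		.locks_want = 2,
		.have_node  = { true, true, true, false },
		.lock       = { INTENT_LOCKED, INTENT_LOCKED, READ_LOCKED },
	};

	verify_intent_locks(&iter);  /* passes: levels 0 and 1 are intent locked */
	printf("lock invariant holds\n");
	return 0;
}

The new loop in btree_iter_traverse_one() below exists to make this kind of invariant true: if a needed interior level cannot be relocked with intent, the iterator walks back up instead of carrying on with a weaker lock.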
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 ++++++-- fs/bcachefs/btree_iter.c | 41 +++++++++++++++++++++++++---------------- fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/btree_key_cache.c | 4 +++- 4 files changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 430d5951263f..19afbdcae5e4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -698,7 +698,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * currently fails for iterators that aren't pointed at a valid btree * node */ - if (iter && !bch2_trans_relock(iter->trans)) + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -858,7 +860,9 @@ lock_node: * currently fails for iterators that aren't pointed at a valid * btree node */ - if (iter && !bch2_trans_relock(iter->trans)) + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 13e4dea8a09b..22419929ac1b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -361,7 +361,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) return; } - for (l = 0; is_btree_node(iter, l); l++) { + for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) continue; @@ -383,7 +383,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten -static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) +bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) { return btree_iter_get_locks(iter, false, trace_ip); } @@ -607,6 +607,8 @@ err: static void bch2_btree_iter_verify(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; enum btree_iter_type type = btree_iter_type(iter); unsigned i; @@ -625,10 +627,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - bch2_btree_iter_verify_locks(iter); + for (i = 0; i < (type != BTREE_ITER_CACHED ? 
BTREE_MAX_DEPTH : 1); i++) { + if (!iter->l[i].b) { + BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); + break; + } - for (i = 0; i < BTREE_MAX_DEPTH; i++) bch2_btree_iter_verify_level(iter, i); + } + + bch2_btree_iter_verify_locks(iter); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -1350,30 +1358,30 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, static int btree_iter_traverse_one(struct btree_iter *iter, unsigned long trace_ip) { - unsigned depth_want = iter->level; + unsigned l, depth_want = iter->level; int ret = 0; - /* - * if we need interior nodes locked, call btree_iter_relock() to make - * sure we walk back up enough that we lock them: - */ - if (iter->uptodate == BTREE_ITER_NEED_RELOCK || - iter->locks_want > 1) - bch2_btree_iter_relock(iter, _THIS_IP_); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ret = bch2_btree_iter_traverse_cached(iter); goto out; } - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) - goto out; - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) goto out; iter->level = btree_iter_up_until_good_node(iter, 0); + /* If we need intent locks, take them too: */ + for (l = iter->level + 1; + l < iter->locks_want && btree_iter_node(iter, l); + l++) + if (!bch2_btree_node_relock(iter, l)) + while (iter->level <= l) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; + iter->level++; + } + /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1394,6 +1402,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, goto out; } + __bch2_btree_iter_unlock(iter); iter->level = depth_want; if (ret == -EIO) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 31175cf00c0a..58f15b716d49 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -111,6 +111,8 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); +bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); + bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index bafa1f0abc88..d60b6084fdf0 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -271,7 +271,9 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) BUG_ON(iter->level); - if (btree_node_locked(iter, 0)) { + iter->l[1].b = NULL; + + if (bch2_btree_node_relock(iter, 0)) { ck = (void *) iter->l[0].b; goto fill; } -- cgit From 914f2786b8923232eb925fe75cb7d0b0b3788d91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Jul 2021 20:28:27 -0400 Subject: bcachefs: Improvements to fsck check_dirents() The fsck code handles transaction restarts in a very ad hoc way, and not always correctly. This patch makes some improvements to check_dirents(), but more work needs to be done to figure out how this kind of code should be structured. 
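The structural change is that the per-dirent work now runs inside lockrestart_do(), which simply reruns the expression whenever it returns -EINTR (a transaction restart). A compile-and-run userspace model of that loop, with a toy transaction type standing in for btree_trans, might look like this; lockrestart_do and bch2_trans_begin are names from the patch, everything else is a stand-in.

/*
 * Simplified model of the restart loop check_dirents() is moved onto.
 * Uses a GCC/Clang statement expression, as the kernel macro does.
 */
#include <errno.h>
#include <stdio.h>

struct toy_trans {
	int restarts_left;   /* pretend the first N attempts restart */
};

static void trans_begin(struct toy_trans *trans)
{
	/* In bcachefs this resets the transaction before the next attempt. */
	(void) trans;
}

#define lockrestart_do(_trans, _do)                     \
({                                                      \
	int _ret;                                       \
	do {                                            \
		trans_begin(_trans);                    \
		_ret = (_do);                           \
	} while (_ret == -EINTR);                       \
	_ret;                                           \
})

static int check_one_dirent(struct toy_trans *trans)
{
	if (trans->restarts_left > 0) {
		trans->restarts_left--;
		return -EINTR;   /* simulate a lock restart */
	}
	printf("dirent checked\n");
	return 0;
}

int main(void)
{
	struct toy_trans trans = { .restarts_left = 2 };
	int ret = lockrestart_do(&trans, check_one_dirent(&trans));

	printf("final ret %d\n", ret);
	return ret;
}

check_dirents() in the patch does the same thing with check_dirent() as the retried expression, only advancing the iterator once an attempt finishes without restarting; check_dirent() deliberately returns -EINTR after writing an inode so the whole check reruns from a clean transaction.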
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 345 +++++++++++++++++++++++++++-------------------------- 1 file changed, 178 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7ea1a41ac637..bedfd34803ce 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -267,11 +267,11 @@ static struct inode_walker inode_walker_init(void) }; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int __walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); + int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); if (ret && ret != -ENOENT) return ret; @@ -286,6 +286,12 @@ static int walk_inode(struct btree_trans *trans, return 0; } +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) +{ + return lockrestart_do(trans, __walk_inode(trans, w, inum)); +} + static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -704,210 +710,215 @@ fsck_err: return bch2_trans_exit(&trans) ?: ret; } -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -noinline_for_stack -static int check_dirents(struct bch_fs *c) +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *w, unsigned *nr_subdirs) { - struct inode_walker w = inode_walker_init(); - struct bch_hash_info hash_info; - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + u32 target_snapshot; + bool have_target; + bool backpointer_exists = true; + u64 d_inum; char buf[200]; - unsigned nr_subdirs = 0; - int ret = 0; + int ret; - bch_verbose(c, "checking dirents"); + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 1; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + ret = bkey_err(k); + if (ret) + return ret; - iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; - u32 target_snapshot; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; + if (w->have_inode && + w->cur_inum != k.k->p.inode && + fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, + "directory %llu with wrong i_nlink: got %u, should be %u", + w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { + w->inode.bi_nlink = *nr_subdirs; + ret = write_inode(trans, &w->inode, w->snapshot); + return ret ?: -EINTR; + } - if (w.have_inode && - w.cur_inum != k.k->p.inode && - fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { - w.inode.bi_nlink = nr_subdirs; - ret = write_inode(&trans, &w.inode, w.snapshot); - if (ret) - break; - } + ret = __walk_inode(trans, w, k.k->p.inode); + if (ret) + return ret; - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; + if (w->first_this_inode) + *nr_subdirs = 0; - if (w.first_this_inode) - nr_subdirs = 0; + if (fsck_err_on(!w->have_inode, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || + 
fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, + "dirent in non directory inode type %u:\n%s", + mode_to_type(w->inode.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, 0, + bch2_btree_delete_at(trans, iter, 0)); - if (fsck_err_on(!w.have_inode, c, - "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf)) || - fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, - "dirent in non directory inode type %u:\n%s", - mode_to_type(w.inode.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_delete_at(&trans, iter, 0)); - if (ret) - goto err; - goto next; - } + if (!w->have_inode) + return 0; - if (!w.have_inode) - goto next; + if (w->first_this_inode) + *hash_info = bch2_hash_info_init(c, &w->inode); - if (w.first_this_inode) - hash_info = bch2_hash_info_init(c, &w.inode); + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) + return ret; + if (ret) /* dirent has been deleted */ + return 0; - ret = hash_check_key(&trans, bch2_dirent_hash_desc, - &hash_info, iter, k); - if (ret > 0) { - ret = 0; - goto next; - } - if (ret) - goto fsck_err; + if (k.k->type != KEY_TYPE_dirent) + return 0; + + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); - if (k.k->type != KEY_TYPE_dirent) - goto next; + ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; - d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); + have_target = !ret; + ret = 0; - ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); - if (ret && ret != -ENOENT) - break; + if (fsck_err_on(!have_target, c, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) + return remove_dirent(trans, d.k->p); - have_target = !ret; + if (!have_target) + return 0; + + if (!target.bi_dir && + !target.bi_dir_offset) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + + ret = __write_inode(trans, &target, target_snapshot) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOUNLOCK); + if (ret) + return ret; + return -EINTR; + } + + if (!inode_backpointer_matches(d, &target)) { + ret = inode_backpointer_exists(trans, &target); + if (ret < 0) + return ret; + + backpointer_exists = ret; ret = 0; - if (fsck_err_on(!have_target, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = remove_dirent(&trans, d.k->p); - if (ret) - goto err; - goto next; + if (fsck_err_on(S_ISDIR(target.bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target.bi_inum)) + return remove_dirent(trans, d.k->p); + + if (fsck_err_on(backpointer_exists && + !target.bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + d_inum)) { + target.bi_nlink++; + target.bi_flags &= ~BCH_INODE_UNLINKED; + + ret = write_inode(trans, &target, target_snapshot); + return ret ?: -EINTR; } - if (!have_target) - goto next; - - if (!target.bi_dir && - !target.bi_dir_offset) { + if (fsck_err_on(!backpointer_exists, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset)) { target.bi_dir = k.k->p.inode; target.bi_dir_offset = k.k->p.offset; - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; + ret = 
write_inode(trans, &target, target_snapshot); + return ret ?: -EINTR; } + } - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(&trans, &target); - if (ret < 0) - goto err; - - backpointer_exists = ret; - ret = 0; + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + struct bkey_i_dirent *n; - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) { - ret = remove_dirent(&trans, d.k->p); - if (ret) - goto err; - continue; - } + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) + return -ENOMEM; - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target.bi_mode); - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, &n->k_i, 0)); + kfree(n); + return ret ?: -EINTR; + } - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } - } + *nr_subdirs += d.v->d_type == DT_DIR; + return 0; +fsck_err: + return ret; +} - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; +/* + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +noinline_for_stack +static int check_dirents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter *iter; + unsigned nr_subdirs = 0; + int ret = 0; - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } + bch_verbose(c, "checking dirents"); - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &n->k_i, 0)); - kfree(n); - if (ret) - goto err; + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); + while (1) { + ret = lockrestart_do(&trans, + check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); + if (ret == 1) { + /* at end */ + ret = 0; + break; } + if (ret) + break; - nr_subdirs += d.v->d_type == DT_DIR; -next: bch2_btree_iter_advance(iter); } -err: -fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; } -- cgit From 71f892a48239a7ec025fdd7f88d252823fe9bff4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Jul 2021 23:35:11 -0400 Subject: bcachefs: Fix bch2_btree_iter_rewind() We'd hit a BUG() when rewinding at the start of 
the btree on btrees with snapshots. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 22419929ac1b..745bf48241fd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1622,7 +1622,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = bpos_cmp(pos, POS_MIN) != 0; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, POS_MIN) + : bkey_cmp(pos, POS_MIN)) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(iter, pos); -- cgit From eb7f44db8da087e571f82eb7a8b9d9336bee60f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Jul 2021 21:25:55 -0400 Subject: bcachefs: Fixes for unit tests The unit tests hadn't been updated for various recent btree changes - this patch makes them work again. Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index a8b8e3a072ad..9c7812f62935 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -34,6 +34,7 @@ static int test_delete(struct bch_fs *c, u64 nr) int ret; bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); @@ -79,29 +80,27 @@ static int test_delete_written(struct bch_fs *c, u64 nr) int ret; bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch_err(c, "lookup error in test_delete_written: %i", ret); - goto err; - } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete_written: %i", ret); goto err; } + bch2_trans_unlock(&trans); bch2_journal_flush_all_pins(&c->journal); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(iter) ?: + bch2_btree_delete_at(&trans, iter, 0)); if (ret) { bch_err(c, "delete error in test_delete_written: %i", ret); goto err; @@ -131,6 +130,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i; + k.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); @@ -185,6 +185,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i + 8; + k.k.p.snapshot = U32_MAX; k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, @@ -240,6 +241,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; + k.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); @@ -303,6 +305,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i + 16; + k.k.p.snapshot = U32_MAX; k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, @@ -410,6 +413,7 @@ static int insert_test_extent(struct bch_fs *c, bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; -- cgit From 
f8f86c6aec1ecb21839933ff3615dcd219ef026f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Jul 2021 13:42:43 -0400 Subject: bcachefs: Improve btree_bad_header() error message We should always print out the full btree node ptr. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 25 ++++++++++++------------- fs/bcachefs/extents.c | 5 +++-- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 19afbdcae5e4..e2c02ae98f83 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -719,26 +719,25 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - char buf1[100], buf2[100], buf3[100], buf4[100]; + char buf1[200], buf2[100], buf3[100]; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key - : POS_MIN); + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); + bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); - bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); - bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" - "btree: ptr %u header %llu\n" - "level: ptr %u header %llu\n" - "min ptr %s node header %s\n" - "max ptr %s node header %s", - b->c.btree_id, BTREE_NODE_ID(b->data), - b->c.level, BTREE_NODE_LEVEL(b->data), - buf1, buf2, buf3, buf4); + "btree %s level %u\n" + "ptr: %s\n" + "header: btree %s level %llu\n" + "min %s max %s\n", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data), + buf2, buf3); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c7643e6c8816..6524703f3da4 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -191,9 +191,10 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx written %u min_key ", + pr_buf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors_written)); + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); bch2_bpos_to_text(out, bp.v->min_key); pr_buf(out, " "); -- cgit From 9f1833cadda7bb40a77dc9fd1b85798e20d92195 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Jul 2021 13:44:42 -0400 Subject: bcachefs: Update btree ptrs after every write This closes a significant hole (and last known hole) in our ability to verify metadata. Previously, since btree nodes are log structured, we couldn't detect lost btree writes that weren't the first write to a given node. Additionally, this seems to have led to some significant metadata corruption on multi device filesystems with metadata replication: since a write may have made it to one device and not another, if we read that btree node back from the replica that did have that write and started appending after that point, the other replica would have a gap in the bset entries and reading from that replica wouldn't find the rest of the bsets. 
But, since updates to interior btree nodes are now journalled, we can close this hole by updating pointers to btree nodes after every write with the currently written number of sectors, without negatively affecting performance. This means we will always detect lost or corrupt metadata - it also means that our btree is now a curious hybrid of COW and non COW btrees, with all the benefits of both (excluding complexity). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_io.c | 222 +++++++++++++++--------------------- fs/bcachefs/btree_io.h | 11 +- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_types.h | 2 + fs/bcachefs/btree_update.h | 4 +- fs/bcachefs/btree_update_interior.c | 194 ++++++++++++++++++++----------- fs/bcachefs/btree_update_leaf.c | 3 +- fs/bcachefs/io_types.h | 3 +- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/recovery.c | 5 + fs/bcachefs/super.c | 9 +- 13 files changed, 250 insertions(+), 214 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bed2e76e6dc8..6a289b6f1fb4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -676,7 +676,7 @@ struct bch_fs { struct btree_key_cache btree_key_cache; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_error_wq; + struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; @@ -827,8 +827,6 @@ mempool_t bio_bounce_pages; atomic64_t btree_writes_nr; atomic64_t btree_writes_sectors; - struct bio_list btree_write_error_list; - struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; /* ERRORS */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e6be594fd0be..659bcfe09fb4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1214,7 +1214,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_inode_btree_change = 11, bcachefs_metadata_version_snapshot = 12, bcachefs_metadata_version_inode_backpointers = 13, - bcachefs_metadata_version_max = 14, + bcachefs_metadata_version_btree_ptr_sectors_written = 14, + bcachefs_metadata_version_max = 15, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1d4b5fcd1e39..b99e4198bdbe 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -26,6 +26,7 @@ void bch2_btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); + clear_btree_node_write_in_flight_inner(b); clear_btree_node_write_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -865,7 +866,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned nonblacklisted_written = 0; + unsigned blacklisted_written, nonblacklisted_written = 0; + unsigned ptr_written = btree_ptr_sectors_written(&b->key); int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; @@ -896,7 +898,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->data->keys.seq, bp->seq); } - while (b->written < c->opts.btree_node_size) { + while (b->written < (ptr_written ?: c->opts.btree_node_size)) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; struct bch_csum csum; @@ -976,6 +978,10 @@ int 
bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, BTREE_ERR_FIXABLE, c, ca, b, i, "first btree node bset has blacklisted journal seq"); + + btree_err_on(blacklisted && ptr_written, + BTREE_ERR_FIXABLE, c, ca, b, i, + "found blacklisted bset in btree node with sectors_written"); if (blacklisted && !first) continue; @@ -989,26 +995,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonblacklisted_written = b->written; } - for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); - bne = (void *) bne + block_bytes(c)) - btree_err_on(bne->keys.seq == b->data->keys.seq && - !bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), - true), + if (ptr_written) { + btree_err_on(b->written < ptr_written, BTREE_ERR_WANT_RETRY, c, ca, b, NULL, - "found bset signature after last bset"); + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq && + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "found bset signature after last bset"); - /* - * Blacklisted bsets are those that were written after the most recent - * (flush) journal write. Since there wasn't a flush, they may not have - * made it to all devices - which means we shouldn't write new bsets - * after them, as that could leave a gap and then reads from that device - * wouldn't find all the bsets in that btree node - which means it's - * important that we start writing new bsets after the most recent _non_ - * blacklisted bset: - */ - b->written = nonblacklisted_written; + /* + * Blacklisted bsets are those that were written after the most recent + * (flush) journal write. 
Since there wasn't a flush, they may not have + * made it to all devices - which means we shouldn't write new bsets + * after them, as that could leave a gap and then reads from that device + * wouldn't find all the bsets in that btree node - which means it's + * important that we start writing new bsets after the most recent _non_ + * blacklisted bset: + */ + blacklisted_written = b->written; + b->written = nonblacklisted_written; + } sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); sorted->keys.u64s = 0; @@ -1076,6 +1090,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ca->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } + + if (!ptr_written) + set_btree_node_need_rewrite(b); out: mempool_free(iter, &c->fill_iter); return retry_read; @@ -1574,6 +1591,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) goto do_write; new &= ~(1U << BTREE_NODE_write_in_flight); + new &= ~(1U << BTREE_NODE_write_in_flight_inner); } while ((v = cmpxchg(&b->flags, old, new)) != old); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); @@ -1592,10 +1610,12 @@ do_write: new &= ~(1U << BTREE_NODE_dirty); new &= ~(1U << BTREE_NODE_need_write); new |= (1U << BTREE_NODE_write_in_flight); + new |= (1U << BTREE_NODE_write_in_flight_inner); new |= (1U << BTREE_NODE_just_written); new ^= (1U << BTREE_NODE_write_idx); } else { new &= ~(1U << BTREE_NODE_write_in_flight); + new &= ~(1U << BTREE_NODE_write_in_flight_inner); } } while ((v = cmpxchg(&b->flags, old, new)) != old); @@ -1605,52 +1625,38 @@ do_write: six_unlock_read(&b->c.lock); } -static void bch2_btree_node_write_error(struct bch_fs *c, - struct btree_write_bio *wbio) +static void btree_node_write_work(struct work_struct *work) { + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - struct bkey_buf k; struct bch_extent_ptr *ptr; - struct btree_trans trans; - struct btree_iter *iter; int ret; - bch2_bkey_buf_init(&k); - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, 0); -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; - - /* has node been freed? 
*/ - if (iter->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); - - bch2_bkey_buf_copy(&k, c, &b->key); + btree_bounce_free(c, + wbio->data_bytes, + wbio->wbio.used_mempool, + wbio->data); - bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) goto err; - ret = bch2_btree_node_update_key(&trans, iter, b, k.k); - if (ret == -EINTR) - goto retry; - if (ret) - goto err; + if (wbio->wbio.first_btree_write) { + if (wbio->wbio.failed.nr) { + + } + } else { + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, + !wbio->wbio.failed.nr)); + if (ret) + goto err; + } out: - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&k, c); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; @@ -1660,58 +1666,14 @@ err: goto out; } -void bch2_btree_write_error_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - btree_write_error_work); - struct bio *bio; - - while (1) { - spin_lock_irq(&c->btree_write_error_lock); - bio = bio_list_pop(&c->btree_write_error_list); - spin_unlock_irq(&c->btree_write_error_lock); - - if (!bio) - break; - - bch2_btree_node_write_error(c, - container_of(bio, struct btree_write_bio, wbio.bio)); - } -} - -static void btree_node_write_work(struct work_struct *work) -{ - struct btree_write_bio *wbio = - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; - - btree_bounce_free(c, - wbio->bytes, - wbio->wbio.used_mempool, - wbio->data); - - if (wbio->wbio.failed.nr) { - unsigned long flags; - - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - - queue_work(c->btree_error_wq, &c->btree_write_error_work); - return; - } - - bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b); -} - static void btree_node_write_endio(struct bio *bio) { struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; + struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); struct bch_fs *c = wbio->c; + struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); unsigned long flags; @@ -1732,13 +1694,13 @@ static void btree_node_write_endio(struct bio *bio) if (parent) { bio_put(bio); bio_endio(&parent->bio); - } else { - struct btree_write_bio *wb = - container_of(orig, struct btree_write_bio, wbio); - - INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->io_complete_wq, &wb->work); + return; } + + clear_btree_node_write_in_flight_inner(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); + INIT_WORK(&wb->work, btree_node_write_work); + queue_work(c->btree_io_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, @@ -1763,8 +1725,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + struct bch_extent_ptr *ptr; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + + bkey_copy(&tmp.k, &wbio->key); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) + ptr->offset += wbio->sector_offset; - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); } void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) @@ -1774,7 +1743,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; unsigned bytes_to_write, sectors_to_write, bytes, u64s; @@ -1814,6 +1782,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); new |= (1 << BTREE_NODE_write_in_flight); + new |= (1 << BTREE_NODE_write_in_flight_inner); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); @@ -1967,36 +1936,29 @@ do_write: struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); wbio->data = data; - wbio->bytes = bytes; + wbio->data_bytes = bytes; + wbio->sector_offset = b->written; wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; + wbio->wbio.first_btree_write = !b->written; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); - /* - * If we're appending to a leaf node, we don't technically need FUA - - * this write just needs to be persisted before the next journal write, - * which will be marked FLUSH|FUA. - * - * Similarly if we're writing a new btree root - the pointer is going to - * be in the next journal entry. - * - * But if we're writing a new btree node (that isn't a root) or - * appending to a non leaf btree node, we need either FUA or a flush - * when we write the parent with the new pointer. FUA is cheaper than a - * flush, and writes appending to leaf nodes aren't blocking anything so - * just make all btree node writes FUA to keep things sane. 
- */ - bkey_copy(&wbio->key, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) - ptr->offset += b->written; - b->written += sectors_to_write; + if (wbio->wbio.first_btree_write && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(b->written); + + if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = + cpu_to_le16(b->written); + atomic64_inc(&c->btree_writes_nr); atomic64_add(sectors_to_write, &c->btree_writes_sectors); @@ -2005,6 +1967,10 @@ do_write: return; err: set_btree_node_noevict(b); + if (!b->written && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(sectors_to_write); b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 3732d135de8d..7fdcf879c7d4 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) atomic_dec(&c->btree_cache.dirty); } +static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + : 0; +} + struct btree_read_bio { struct bch_fs *c; struct btree *b; @@ -48,7 +55,8 @@ struct btree_write_bio { struct work_struct work; __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); void *data; - unsigned bytes; + unsigned data_bytes; + unsigned sector_offset; struct bch_write_bio wbio; }; @@ -137,7 +145,6 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); -void bch2_btree_write_error_work(struct work_struct *); void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 58f15b716d49..7385cca43f8b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -132,7 +132,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) { - unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); + unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT); if (iter->locks_want > new_locks_want) __bch2_btree_iter_downgrade(iter, new_locks_want); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 134d221d150e..78b312e5bcf3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -435,6 +435,7 @@ enum btree_flags { BTREE_NODE_write_idx, BTREE_NODE_accessed, BTREE_NODE_write_in_flight, + BTREE_NODE_write_in_flight_inner, BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, @@ -449,6 +450,7 @@ BTREE_FLAG(noevict); BTREE_FLAG(write_idx); BTREE_FLAG(accessed); BTREE_FLAG(write_in_flight); +BTREE_FLAG(write_in_flight_inner); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 12065bba82dd..bab135fae0b0 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -74,7 +74,9 @@ int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, __le64, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *); + struct btree *, struct bkey_i *, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct bkey_i *, bool); int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0b78fb9d3561..e9b7af4c3574 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -246,11 +246,7 @@ retry: goto retry; } - if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) - bkey_btree_ptr_v2_init(&tmp.k); - else - bkey_btree_ptr_init(&tmp.k); - + bkey_btree_ptr_v2_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); bch2_open_bucket_get(c, wp, &ob); @@ -567,7 +563,8 @@ static void btree_update_nodes_written(struct btree_update *as) six_unlock_read(&old->c.lock); if (seq == as->old_nodes_seq[i]) - bch2_btree_node_wait_on_write(old); + wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, + TASK_UNINTERRUPTIBLE); } /* @@ -1153,6 +1150,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_packed *k; const char *invalid; + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { @@ -1395,6 +1395,7 @@ static void btree_split(struct btree_update *as, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + bch2_btree_node_write(c, n1, SIX_LOCK_intent); bch2_btree_node_write(c, n2, SIX_LOCK_intent); /* @@ -1422,12 +1423,12 @@ static void btree_split(struct btree_update *as, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + bch2_btree_node_write(c, n1, SIX_LOCK_intent); + if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); } - bch2_btree_node_write(c, n1, SIX_LOCK_intent); - /* New nodes all written, now make them visible: */ if (parent) { @@ -1703,13 +1704,13 @@ retry: bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + bch2_btree_node_write(c, n, SIX_LOCK_intent); + bkey_init(&delete.k); delete.k.p = prev->key.k.p; bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); - 
bch2_btree_node_write(c, n, SIX_LOCK_intent); - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); bch2_btree_update_get_open_buckets(as, n); @@ -1883,74 +1884,109 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) queue_work(c->btree_interior_update_worker, &a->work); } -static void __bch2_btree_node_update_key(struct btree_update *as, - struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, struct btree *new_hash, - struct bkey_i *new_key) +static int __bch2_btree_node_update_key(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i *new_key, + bool skip_triggers) { - struct bch_fs *c = as->c; + struct bch_fs *c = trans->c; + struct btree_iter *iter2 = NULL; struct btree *parent; + u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; - btree_update_will_delete_key(as, &b->key); - btree_update_will_add_key(as, new_key); + if (!skip_triggers) { + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(new_key), + BTREE_TRIGGER_INSERT); + if (ret) + return ret; + + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(&b->key), + bkey_s_c_null, + BTREE_TRIGGER_OVERWRITE); + if (ret) + return ret; + } + + if (new_hash) { + bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->c.level, b->c.btree_id); + BUG_ON(ret); + } parent = btree_node_parent(iter, b); if (parent) { - if (new_hash) { - bkey_copy(&new_hash->key, new_key); - ret = bch2_btree_node_hash_insert(&c->btree_cache, - new_hash, b->c.level, b->c.btree_id); - BUG_ON(ret); - } + iter2 = bch2_trans_copy_iter(trans, iter); - bch2_keylist_add(&as->parent_keys, new_key); - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0); + BUG_ON(iter2->level != b->c.level); + BUG_ON(bpos_cmp(iter2->pos, new_key->k.p)); - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + btree_node_unlock(iter2, iter2->level); + iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP; + iter2->level++; - bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new_key); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } else { - bkey_copy(&b->key, new_key); - } + ret = bch2_btree_iter_traverse(iter2) ?: + bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) + goto err; } else { BUG_ON(btree_node_root(c, b) != b); - bch2_btree_node_lock_write(b, iter); - bkey_copy(&b->key, new_key); + trans->extra_journal_entries = (void *) &journal_entries[0]; + trans->extra_journal_entry_u64s = + journal_entry_set((void *) &journal_entries[0], + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + } - if (btree_ptr_hash_val(&b->key) != b->hash_val) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED| + BTREE_INSERT_NOUNLOCK); + if (ret) + goto err; - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } + bch2_btree_node_lock_write(b, iter); - btree_update_updated_root(as, b); - bch2_btree_node_unlock_write(b, iter); + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + 
bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, new_key); } - bch2_btree_update_done(as); + bch2_btree_node_unlock_write(b, iter); +out: + bch2_trans_iter_put(trans, iter2); + return ret; +err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + } + goto out; } -int bch2_btree_node_update_key(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, - struct bkey_i *new_key) +int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct bkey_i *new_key, + bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree *parent = btree_node_parent(iter, b); - struct btree_update *as = NULL; struct btree *new_hash = NULL; struct closure cl; int ret = 0; @@ -1964,27 +2000,18 @@ int bch2_btree_node_update_key(struct btree_trans *trans, if (btree_ptr_hash_val(new_key) != b->hash_val) { ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_trans_unlock(iter->trans); + bch2_trans_unlock(trans); closure_sync(&cl); - if (!bch2_trans_relock(iter->trans)) + if (!bch2_trans_relock(trans)) return -EINTR; } new_hash = bch2_btree_node_mem_alloc(c); } - as = bch2_btree_update_start(iter, b->c.level, - parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - goto err; - } - - __bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key); + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, + new_key, skip_triggers); - bch2_btree_iter_downgrade(iter); -err: if (new_hash) { mutex_lock(&c->btree_cache.lock); list_move(&new_hash->list, &c->btree_cache.freeable); @@ -1998,6 +2025,35 @@ err: return ret; } +int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + struct btree *b, struct bkey_i *new_key, + bool skip_triggers) +{ + struct btree_iter *iter; + int ret; + + iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + /* has node been freed? 
*/ + if (iter->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + /* Init code: */ /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c593c8d712c5..c9de49286fb7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -910,7 +910,8 @@ int __bch2_trans_commit(struct btree_trans *trans) unsigned u64s, reset_flags = 0; int ret = 0; - if (!trans->nr_updates) + if (!trans->nr_updates && + !trans->extra_journal_entry_u64s) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 148808bdea50..50361f2fb8f1 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -95,7 +95,8 @@ struct bch_write_bio { bounce:1, put_bio:1, have_ioref:1, - used_mempool:1; + used_mempool:1, + first_btree_write:1; ); struct bio bio; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index aacd6385db1f..1f65eca48c6e 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -139,7 +139,7 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, iter, b, k.k); + ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); ret = 0; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c6fa4ca31ae9..84e224fb0d01 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1005,6 +1005,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.fix_errors = FSCK_OPT_YES; } + if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { + bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); + c->opts.version_upgrade = true; + } + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 619cfdcd2934..11557a863d3d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -514,8 +514,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->io_complete_wq ); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_error_wq) - destroy_workqueue(c->btree_error_wq); + if (c->btree_io_complete_wq) + destroy_workqueue(c->btree_io_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -567,7 +567,6 @@ void __bch2_fs_stop(struct bch_fs *c) for_each_member_device(ca, c, i) cancel_work_sync(&ca->io_error_work); - cancel_work_sync(&c->btree_write_error_work); cancel_work_sync(&c->read_only_work); } @@ -696,9 +695,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->bio_bounce_pages_lock); - bio_list_init(&c->btree_write_error_list); spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); @@ -768,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->btree_error_wq = alloc_workqueue("bcachefs_error", + !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", 
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -- cgit From 47924527e643e6160c6726669b90cad8aeb6d977 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 23:35:02 -0400 Subject: Revert "bcachefs: statfs bfree and bavail should be the same" This reverts commit 664f9847bec525d396d62d2db094ca9020289ae0. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 1b0d63219c3b..71e738b98967 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1277,8 +1277,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = avail_factor(usage.free) >> shift; - buf->f_bavail = buf->f_bfree; + buf->f_bfree = usage.free >> shift; + buf->f_bavail = avail_factor(usage.free) >> shift; buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; -- cgit From e719fc34f00004813f98c8c3f8f3f364b1d77afc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jul 2021 12:57:27 -0400 Subject: bcachefs: BSET_OFFSET() Add a field to struct bset for the sector offset within the btree node where it was written. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +++ fs/bcachefs/btree_io.c | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 659bcfe09fb4..156198850b67 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1749,6 +1749,9 @@ LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, struct bset, flags, 5, 6); +/* Sector offset within the btree node: */ +LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); + struct btree_node { struct bch_csum csum; __le64 magic; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b99e4198bdbe..e42ade7cbc4b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -651,7 +651,8 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned sectors, int write, bool have_retry) + unsigned offset, unsigned sectors, + int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -689,18 +690,23 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FATAL, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + if (btree_err_on(offset + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; return 0; } - btree_err_on(b->written && !i->u64s, + btree_err_on(offset && !i->u64s, BTREE_ERR_FIXABLE, c, ca, b, i, "empty bset"); - if (!b->written) { + btree_err_on(BSET_OFFSET(i) && + BSET_OFFSET(i) != offset, + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "bset at wrong sector offset"); + + if (!offset) { struct btree_node *bn = container_of(i, struct btree_node, keys); /* These indicate that we read the wrong btree node: */ @@ -954,7 +960,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, sectors, + ret = validate_bset(c, ca, b, i, b->written, sectors, 
READ, have_retry); if (ret) goto fsck_err; @@ -1713,7 +1719,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return -1; ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: - validate_bset(c, NULL, b, i, sectors, WRITE, false); + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1876,6 +1882,7 @@ do_write: i->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le16(BCH_BSET_VERSION_OLD) : cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) -- cgit From 5f87f3c116f67fe22ba8a9f461830480a716fe9b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Jul 2021 20:14:44 -0400 Subject: bcachefs: Don't downgrade in traverse() Downgrading of btree iterators is something that should only happen explicitly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d60b6084fdf0..d89cfab4df81 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -341,12 +341,10 @@ fill: iter->uptodate = BTREE_ITER_NEED_PEEK; - if (!(iter->flags & BTREE_ITER_INTENT)) - bch2_btree_iter_downgrade(iter); - else if (!iter->locks_want) { - if (!__bch2_btree_iter_upgrade(iter, 1)) - ret = -EINTR; - } + if ((iter->flags & BTREE_ITER_INTENT) && + !iter->locks_want && + __bch2_btree_iter_upgrade(iter, 1)) + ret = -EINTR; return ret; err: -- cgit From 4909fe50b31ad919d998329002d707dd097094ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Jul 2021 21:07:21 -0400 Subject: bcachefs: Handle lock restarts in bch2_xattr_get() Snapshots add another btree lookup, thus we need to handle lock restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 1993bfcee788..d7160e8cdc07 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -118,18 +118,15 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, le16_to_cpu(xattr.v->x_val_len)); } -int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) { - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct btree_trans trans; + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); struct btree_iter *iter; struct bkey_s_c_xattr xattr; int ret; - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, inode->v.i_ino, &X_SEARCH(type, name, strlen(name)), 0); @@ -145,14 +142,18 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, else memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_put(trans, iter); err: - bch2_trans_exit(&trans); - - BUG_ON(ret == -EINTR); return ret == -ENOENT ? 
-ENODATA : ret; } +int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); +} + int bch2_xattr_set(struct btree_trans *trans, u64 inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, -- cgit From b97bbd4ec35d5359398513d381345765cd80c012 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Jul 2021 21:18:16 -0400 Subject: bcachefs: Use bch2_inode_find_by_inum() in truncate This is needed for snapshots because we need to start handling lock restarts even when just calling bch2_inode_peek(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 30c12864d537..4af8cd018e3a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2296,8 +2296,6 @@ int bch2_truncate(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; - struct btree_trans trans; - struct btree_iter *iter; u64 new_i_size = iattr->ia_size; s64 i_sectors_delta = 0; int ret = 0; @@ -2318,16 +2316,7 @@ int bch2_truncate(struct mnt_idmap *idmap, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - /* - * fetch current on disk i_size: inode is locked, i_size can only - * increase underneath us: - */ - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); - ret = PTR_ERR_OR_ZERO(iter); - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); - + ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); if (ret) goto err; -- cgit From 382005442674a327e7238d6dc8c16071bfc723f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Jul 2021 13:23:50 -0400 Subject: bcachefs: Don't squash return code in check_dirents() We were squashing BCH_FSCK_ERRORS_NOT_FIXED. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index bedfd34803ce..63d42542c194 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -727,7 +727,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, k = bch2_btree_iter_peek(iter); if (!k.k) - return 1; + return 0; ret = bkey_err(k); if (ret) @@ -904,19 +904,12 @@ static int check_dirents(struct bch_fs *c) BTREE_ITER_INTENT| BTREE_ITER_PREFETCH); - while (1) { + do { ret = lockrestart_do(&trans, check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); - if (ret == 1) { - /* at end */ - ret = 0; - break; - } if (ret) break; - - bch2_btree_iter_advance(iter); - } + } while (bch2_btree_iter_advance(iter)); bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; -- cgit From d7b21954b9049d749b98b50dad14f35d44f3340f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Jul 2021 13:55:51 -0400 Subject: bcachefs: Pretty-ify bch2_bkey_val_to_text() Don't print out the ": " when there isn't a value to print. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 1ad6a9d30bb5..465be5fee7d9 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -269,7 +269,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, { bch2_bkey_to_text(out, k.k); - if (k.k) { + if (bkey_val_bytes(k.k)) { pr_buf(out, ": "); bch2_val_to_text(out, c, k); } -- cgit From ed5580b43b083a96fe2cbcf6b1cfe340fa52fca2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 20:20:02 -0400 Subject: bcachefs: Fix a btree iterator leak Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c9de49286fb7..18f4ba1e305c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -858,8 +858,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(update_iter); - if (ret) + if (ret) { + bch2_trans_iter_put(trans, update_iter); goto out; + } bch2_trans_update(trans, update_iter, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -- cgit From a6eba44b889f9da03ff701f9676297148ed5dcc3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Jul 2021 18:26:38 -0400 Subject: bcachefs: Use bch2_trans_do() in bch2_btree_key_cache_journal_flush() We're working to standardize handling of transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d89cfab4df81..cb71fe0dd742 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -376,10 +376,9 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); -retry: ret = bch2_btree_iter_traverse(c_iter); if (ret) - goto err; + goto out; ck = (void *) c_iter->l[0].b; if (!ck || @@ -410,15 +409,10 @@ retry: ? 
BTREE_INSERT_JOURNAL_RESERVED : 0)| commit_flags); -err: - if (ret == -EINTR) - goto retry; - - if (ret == -EAGAIN) - goto out; - if (ret) { - bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + bch2_fs_fatal_err_on(ret != -EINTR && + ret != -EAGAIN && + !bch2_journal_error(j), c, "error flushing key cache: %i", ret); goto out; } @@ -466,7 +460,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct btree_trans trans; int ret = 0; int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -481,10 +474,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - bch2_trans_init(&trans, c, 0, 0); - ret = btree_key_cache_flush_pos(&trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false); - bch2_trans_exit(&trans); + ret = bch2_trans_do(c, NULL, NULL, 0, + btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); -- cgit From 6e075b54a3749d3f94e4b87ed8294f8d6ab09bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 17:12:51 -0400 Subject: bcachefs: bch2_btree_iter_relock_intent() This adds a new helper for btree_cache.c that does what we want where the iterator is still being traversed - and also eliminates some unnecessary transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 23 +++++++++-------------- fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_iter.c | 34 +++++++++++++++++++++++++++++++--- fs/bcachefs/btree_iter.h | 1 + 4 files changed, 42 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e2c02ae98f83..6d5cf2a5a159 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -693,14 +693,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!sync) return NULL; - /* - * XXX: this will probably always fail because btree_iter_relock() - * currently fails for iterators that aren't pointed at a valid btree - * node - */ if (iter && (!bch2_trans_relock(iter->trans) || - !bch2_btree_iter_relock(iter, _THIS_IP_))) + !bch2_btree_iter_relock_intent(iter))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -760,11 +755,12 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) * The btree node will have either a read or a write lock held, depending on * the @write parameter. 
*/ -struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsigned long trace_ip) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; @@ -838,7 +834,7 @@ lock_node: if (bch2_btree_node_relock(iter, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(iter->trans->ip, + trace_trans_restart_btree_node_reused(trans->ip, trace_ip, iter->btree_id, &iter->real_pos); @@ -850,18 +846,17 @@ lock_node: u32 seq = b->c.lock.state.seq; six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(iter->trans); + bch2_trans_unlock(trans); bch2_btree_node_wait_on_read(b); /* - * XXX: check if this always fails - btree_iter_relock() - * currently fails for iterators that aren't pointed at a valid - * btree node + * should_be_locked is not set on this iterator yet, so we need + * to relock it specifically: */ if (iter && - (!bch2_trans_relock(iter->trans) || - !bch2_btree_iter_relock(iter, _THIS_IP_))) + (!bch2_trans_relock(trans) || + !bch2_btree_iter_relock_intent(iter))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 40dd263a7caa..6e9f08597d94 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -20,7 +20,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 745bf48241fd..5c3404699136 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -205,7 +205,6 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, is_btree_node(iter, l) ? iter->l[l].b->c.lock.state.seq : 0); - fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } @@ -382,6 +381,34 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif +/* + * Only for btree_cache.c - only relocks intent locks + */ +bool bch2_btree_iter_relock_intent(struct btree_iter *iter) +{ + unsigned l; + + for (l = iter->level; + l < iter->locks_want && btree_iter_node(iter, l); + l++) { + if (!bch2_btree_node_relock(iter, l)) { + trace_node_relock_fail(iter->trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? 
iter->l[l].b->c.lock.state.seq + : 0); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + return false; + } + } + + return true; +} + __flatten bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) { @@ -1172,7 +1199,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, static __always_inline int btree_iter_down(struct btree_iter *iter, unsigned long trace_ip) { - struct bch_fs *c = iter->trans->c; + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; unsigned level = iter->level - 1; @@ -1186,7 +1214,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) goto err; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7385cca43f8b..3889683e16f8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -111,6 +111,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); +bool bch2_btree_iter_relock_intent(struct btree_iter *); bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); bool bch2_trans_relock(struct btree_trans *); -- cgit From 2b4e4b8cfa3857cd44506b2e57fd587e7494d777 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 17:31:25 -0400 Subject: bcachefs: Minor tracepoint improvements Btree iterator tracepoints should print whether they're for the key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++++- fs/bcachefs/trace.h | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5c3404699136..37b314f0a002 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -197,6 +197,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, (upgrade ? 
trace_node_upgrade_fail : trace_node_relock_fail)(iter->trans->ip, trace_ip, + btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, is_btree_node(iter, l) @@ -393,6 +394,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) l++) { if (!bch2_btree_node_relock(iter, l)) { trace_node_relock_fail(iter->trans->ip, _RET_IP_, + btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, is_btree_node(iter, l) @@ -1386,6 +1388,7 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, static int btree_iter_traverse_one(struct btree_iter *iter, unsigned long trace_ip) { + struct btree_trans *trans = iter->trans; unsigned l, depth_want = iter->level; int ret = 0; @@ -1447,7 +1450,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, iter->uptodate = BTREE_ITER_NEED_PEEK; out: - trace_iter_traverse(iter->trans->ip, trace_ip, + trace_iter_traverse(trans->ip, trace_ip, + btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, ret); bch2_btree_iter_verify(iter); return ret; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index ae2aee8ddee8..af1f415fb5e7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -775,14 +775,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, TRACE_EVENT(iter_traverse, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool key_cache, enum btree_id btree_id, struct bpos *pos, int ret), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret), + TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret), TP_STRUCT__entry( __field(unsigned long, trans_ip ) __field(unsigned long, caller_ip ) + __field(u8, key_cache ) __field(u8, btree_id ) __field(u64, pos_inode ) __field(u64, pos_offset ) @@ -793,6 +795,7 @@ TRACE_EVENT(iter_traverse, TP_fast_assign( __entry->trans_ip = trans_ip; __entry->caller_ip = caller_ip; + __entry->key_cache = key_cache; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; __entry->pos_offset = pos->offset; @@ -800,9 +803,10 @@ TRACE_EVENT(iter_traverse, __entry->ret = ret; ), - TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i", + TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i", (void *) __entry->trans_ip, (void *) __entry->caller_ip, + __entry->key_cache, __entry->btree_id, __entry->pos_inode, __entry->pos_offset, @@ -953,15 +957,17 @@ TRACE_EVENT(trans_restart_mem_realloced, DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool key_cache, enum btree_id btree_id, struct bpos *pos, unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, + TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, level, iter_seq, node, node_seq), TP_STRUCT__entry( __field(unsigned long, trans_ip ) __field(unsigned long, caller_ip ) + __field(u8, key_cache ) __field(u8, btree_id ) __field(u64, pos_inode ) __field(u64, pos_offset ) @@ -975,6 +981,7 @@ DECLARE_EVENT_CLASS(node_lock_fail, TP_fast_assign( __entry->trans_ip = trans_ip; __entry->caller_ip = caller_ip; + __entry->key_cache = key_cache; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; __entry->pos_offset = pos->offset; @@ -985,9 +992,10 @@ DECLARE_EVENT_CLASS(node_lock_fail, __entry->node_seq = node_seq; ), - TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", + TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", 
(void *) __entry->trans_ip, (void *) __entry->caller_ip, + __entry->key_cache, __entry->btree_id, __entry->pos_inode, __entry->pos_offset, @@ -999,20 +1007,22 @@ DECLARE_EVENT_CLASS(node_lock_fail, DEFINE_EVENT(node_lock_fail, node_upgrade_fail, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool key_cache, enum btree_id btree_id, struct bpos *pos, unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, + TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, level, iter_seq, node, node_seq) ); DEFINE_EVENT(node_lock_fail, node_relock_fail, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool key_cache, enum btree_id btree_id, struct bpos *pos, unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, + TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, level, iter_seq, node, node_seq) ); -- cgit From a32b9573c742c9a9289aa26e0a28d0f9a9c439a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Jul 2021 15:52:41 -0400 Subject: bcachefs: Add an option for btree node mem ptr optimization bch2_btree_node_ptr_v2 has a field for stashing a pointer to the in memory btree node; this is safe because we clear this field when reading in nodes from disk and we never free in memory btree nodes - but, we have bug reports that indicate something might be faulty with this optimization, so let's add an option for it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 16 ++++++++++------ fs/bcachefs/opts.h | 5 +++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6d5cf2a5a159..818b8df063da 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -767,9 +767,11 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter * EBUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; + } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -902,9 +904,11 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, EBUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; + } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 2cba0e137b58..0799c9d2bee0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -178,6 +178,11 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_INODES_USE_KEY_CACHE, true, \ NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ -- cgit From 6918bb55f6ae106c14fa3bb0428fd6588e107982 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jul 2021 14:20:43 -0400 Subject: bcachefs: Don't traverse iterators in __bch2_trans_commit() They should already be traversed, and we're asserting that since the introduction of iter->should_be_locked Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c 
b/fs/bcachefs/btree_update_leaf.c index 18f4ba1e305c..b354624133a1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -966,13 +966,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } while (trans_trigger_run); trans_for_each_update(trans, i) { - ret = bch2_btree_iter_traverse(i->iter); - if (unlikely(ret)) { - trace_trans_restart_traverse(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - goto out; - } + BUG_ON(!i->iter->should_be_locked); if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { trace_trans_restart_upgrade(trans->ip, _RET_IP_, @@ -1072,7 +1066,11 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); + ret = bch2_btree_iter_traverse(n.iter); bch2_trans_iter_put(trans, n.iter); + + if (ret) + return ret; } BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); -- cgit From fe5233979ace0008b2fab4a05be897d449e44094 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jul 2021 17:58:58 -0400 Subject: bcachefs: bch2_trans_relock() only relocks iters that should be locked This avoids unexpected lock restarts in bch2_btree_iter_traverse_all(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 37b314f0a002..a78bdcda2467 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -495,8 +495,7 @@ void bch2_trans_downgrade(struct btree_trans *trans) /* Btree transaction locking: */ -static inline bool btree_iter_should_be_locked(struct btree_trans *trans, - struct btree_iter *iter) +static inline bool btree_iter_should_be_locked(struct btree_iter *iter) { return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || iter->should_be_locked; @@ -507,8 +506,8 @@ bool bch2_trans_relock(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (!bch2_btree_iter_relock(iter, _RET_IP_) && - btree_iter_should_be_locked(trans, iter)) { + if (btree_iter_should_be_locked(iter) && + !bch2_btree_iter_relock(iter, _RET_IP_)) { trace_trans_restart_relock(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos); return false; -- cgit From 67b07638f1fab974284846d77cce771fed88ded3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 17:43:35 -0400 Subject: bcachefs: traverse_all() is responsible for clearing should_be_locked bch2_btree_iter_traverse_all() may loop, and it needs to clear iter->should_be_locked on every iteration. 
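The subtle part is that the clear has to happen at the top of every pass of the retry loop, not just once on entry: if a traversal fails and the loop goes around again, flags set during the failed pass would otherwise survive into the retry. A minimal stand-alone sketch of that shape (the struct and function names below are simplified stand-ins for illustration, not the real bcachefs types):

#include <stdbool.h>
#include <stdio.h>

struct iter_model { bool should_be_locked; };
struct trans_model { struct iter_model iters[4]; unsigned nr; };

/* pretend traversal: fails on the first pass, succeeds afterwards */
static bool traverse_one(unsigned pass)
{
	return pass > 0;
}

static void traverse_all_model(struct trans_model *trans)
{
	unsigned i, pass = 0;
retry_all:
	/* reset per-iterator state on *every* pass; stale flags must not survive a retry */
	for (i = 0; i < trans->nr; i++)
		trans->iters[i].should_be_locked = false;

	for (i = 0; i < trans->nr; i++)
		if (!traverse_one(pass)) {
			pass++;
			goto retry_all;
		}
}

int main(void)
{
	struct trans_model t = { .nr = 2 };

	t.iters[0].should_be_locked = true;
	traverse_all_model(&t);
	printf("should_be_locked after traverse_all: %d\n",
	       t.iters[0].should_be_locked);
	return 0;
}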
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a78bdcda2467..2a649166c042 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1249,7 +1249,6 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, struct btree_iter *iter; u8 sorted[BTREE_ITER_MAX]; int i, nr_sorted = 0; - bool relock_fail; if (trans->in_traverse_all) return -EINTR; @@ -1257,17 +1256,10 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, trans->in_traverse_all = true; retry_all: nr_sorted = 0; - relock_fail = false; trans_for_each_iter(trans, iter) { - if (!bch2_btree_iter_relock(iter, _THIS_IP_)) - relock_fail = true; sorted[nr_sorted++] = iter->idx; - } - - if (!relock_fail) { - trans->in_traverse_all = false; - return 0; + iter->should_be_locked = false; } #define btree_iter_cmp_by_idx(_l, _r) \ @@ -2372,11 +2364,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter; - trans_for_each_iter(trans, iter) { + trans_for_each_iter(trans, iter) iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| BTREE_ITER_SET_POS_AFTER_COMMIT); - iter->should_be_locked = false; - } bch2_trans_unlink_iters(trans); -- cgit From 8b3e9bd65f61dde939538452cbb2a608bc562d34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 19:50:40 -0400 Subject: bcachefs: Always check for transaction restarts On transaction restart iterators won't be locked anymore - make sure we're always checking for errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 18 ++++++++++++++++-- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_cache.c | 11 ++++++----- fs/bcachefs/btree_cache.h | 4 ++-- fs/bcachefs/btree_iter.c | 12 +++++++----- fs/bcachefs/dirent.c | 18 +++++++++++++++--- fs/bcachefs/fs-common.c | 4 ++++ fs/bcachefs/fs-io.c | 4 ++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/io.c | 6 +++++- fs/bcachefs/move.c | 2 +- fs/bcachefs/reflink.c | 5 +++-- fs/bcachefs/str_hash.h | 2 +- fs/bcachefs/xattr.c | 8 +++++++- 14 files changed, 71 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 1642518d3233..d3e2e24758ba 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -222,6 +222,8 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct btree_iter *iter; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; + struct bkey_s_c k; + int ret; bch2_trans_init(&trans, c, 0, 0); retry: @@ -240,7 +242,14 @@ retry: goto out; } - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + acl = ERR_PTR(ret); + goto out; + } + + xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); @@ -358,6 +367,7 @@ int bch2_acl_chmod(struct btree_trans *trans, struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; + struct bkey_s_c k; int ret; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, @@ -368,7 +378,11 @@ int bch2_acl_chmod(struct btree_trans *trans, if (ret) return ret == -ENOENT ? 
0 : ret; - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + k = bch2_btree_iter_peek_slot(iter); + xattr = bkey_s_c_to_xattr(k); + if (ret) + goto err; + acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 26aca7d3977b..932a8176dff7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -374,7 +374,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) percpu_ref_put(&ca->ref); goto err; } - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(iter); } } err: diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 818b8df063da..252801dee028 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -973,9 +973,9 @@ out: return b; } -void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, - enum btree_id btree_id, unsigned level) +int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -985,9 +985,10 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, b = btree_cache_find(bc, k); if (b) - return; + return 0; - bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6e9f08597d94..eb57dc3c70b7 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -27,8 +27,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, - const struct bkey_i *, enum btree_id, unsigned); +int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, enum btree_id, unsigned); void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2a649166c042..89de6a8a8a9e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1142,7 +1142,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, } noinline -static void btree_iter_prefetch(struct btree_iter *iter) +static int btree_iter_prefetch(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -1153,10 +1153,11 @@ static void btree_iter_prefetch(struct btree_iter *iter) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 
1 : 16); bool was_locked = btree_node_locked(iter, iter->level); + int ret = 0; bch2_bkey_buf_init(&tmp); - while (nr) { + while (nr && !ret) { if (!bch2_btree_node_relock(iter, iter->level)) break; @@ -1166,14 +1167,15 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, - iter->level - 1); + ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, + iter->level - 1); } if (!was_locked) btree_node_unlock(iter, iter->level); bch2_bkey_buf_exit(&tmp, c); + return ret; } static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, @@ -1228,7 +1230,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, btree_node_mem_ptr_set(iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) - btree_iter_prefetch(iter); + ret = btree_iter_prefetch(iter); if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a95165b8eddf..02b29681f695 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -210,6 +210,9 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; old_dst = bch2_btree_iter_peek_slot(dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; if (mode != BCH_RENAME) *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); @@ -225,6 +228,10 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; old_src = bch2_btree_iter_peek_slot(src_iter); + ret = bkey_err(old_src); + if (ret) + goto out; + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ @@ -329,20 +336,25 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, struct btree_iter *iter; struct bkey_s_c k; u64 inum = 0; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); iter = __bch2_dirent_lookup_trans(&trans, dir_inum, hash_info, name, 0); - if (IS_ERR(iter)) { - BUG_ON(PTR_ERR(iter) == -EINTR); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) goto out; - } k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto out; + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); bch2_trans_iter_put(&trans, iter); out: + BUG_ON(ret == -EINTR); bch2_trans_exit(&trans); return inum; } diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 60c54438074e..2189a11ccad8 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -168,6 +168,10 @@ int bch2_unlink_trans(struct btree_trans *trans, goto err; k = bch2_btree_iter_peek_slot(dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4af8cd018e3a..1a06f77101ab 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2668,13 +2668,13 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (k.k->type == KEY_TYPE_reservation && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(iter); continue; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8c6627907431..102bd3c9d30f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -519,7 +519,7 @@ again: if (k.k->p.snapshot == snapshot && k.k->type != 
KEY_TYPE_inode && !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f80b3ce4c7d7..5ff8fea80fba 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -245,8 +245,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, * writing to, because i_size could be up to one block * less: */ - if (!bkey_cmp(old.k->p, new->k.p)) + if (!bkey_cmp(old.k->p, new->k.p)) { old = bch2_btree_iter_next(iter); + ret = bkey_err(old); + if (ret) + break; + } if (old.k && !bkey_err(old) && old.k->p.inode == extent_iter->pos.inode && diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3069f32efddd..7dea6637ae58 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -190,7 +190,7 @@ nomatch: } atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(iter); goto next; } out: diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ebf391245470..8b168246ca38 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -192,8 +192,9 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) return k; } - bch2_btree_iter_set_pos(iter, end); - return bkey_s_c_null; + if (bkey_cmp(iter->pos, end) >= 0) + bch2_btree_iter_set_pos(iter, end); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } s64 bch2_remap_range(struct bch_fs *c, diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 2ff8e5bd2744..236023494191 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -209,7 +209,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, iter = bch2_trans_copy_iter(trans, start); - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index d7160e8cdc07..bf4164f98743 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -124,6 +124,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); struct btree_iter *iter; struct bkey_s_c_xattr xattr; + struct bkey_s_c k; int ret; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, @@ -134,7 +135,12 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info if (ret) goto err; - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) -- cgit From 700c25b32a776a70849c025d898ba1a7431279e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 20:24:10 -0400 Subject: bcachefs: Use bch2_trans_begin() more consistently Upcoming patch will require that a transaction restart is always immediately followed by bch2_trans_begin(). 
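The calling convention the series settles on is the one the lockrestart_do()/bch2_trans_do() helpers already encode: every attempt starts with bch2_trans_begin(), and -EINTR just sends control back to the top. A schematic caller looks roughly like this (do_update() is a placeholder for whatever btree work the transaction performs, not an existing function):

	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	do {
		/*
		 * Must be the first thing after a restart: resets
		 * per-transaction state before the next attempt.
		 */
		bch2_trans_begin(&trans);

		ret = do_update(&trans);	/* placeholder */
	} while (ret == -EINTR);

	bch2_trans_exit(&trans);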
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/fs-io.c | 4 ++++ fs/bcachefs/fs.c | 2 ++ fs/bcachefs/io.c | 17 +++++++++-------- fs/bcachefs/move.c | 7 ++++--- fs/bcachefs/reflink.c | 4 ++-- 6 files changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index bab135fae0b0..b5f35a419004 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -114,7 +114,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, _ret = (_do); \ if (_ret != -EINTR) \ break; \ - bch2_trans_reset(_trans, 0); \ + bch2_trans_begin(_trans); \ } \ \ _ret; \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1a06f77101ab..30e5acd2e97c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -784,6 +784,8 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&sk); retry: + bch2_trans_begin(trans); + while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -2541,6 +2543,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos atomic_end; unsigned trigger_flags = 0; + bch2_trans_begin(&trans); + k = insert ? bch2_btree_iter_peek_prev(src) : bch2_btree_iter_peek(src); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 71e738b98967..b2d6e80bbb78 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -911,6 +911,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(ei->v.i_ino, start >> 9), 0); retry: + bch2_trans_begin(&trans); + while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5ff8fea80fba..f293876e0bbc 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -376,14 +376,13 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; int ret = 0, ret2 = 0; - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(iter)).k) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; - bch2_trans_begin(trans); - ret = bkey_err(k); if (ret) goto btree_err; @@ -2278,12 +2277,13 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS); +retry: + bch2_trans_begin(&trans); + while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; @@ -2339,19 +2339,20 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } - bch2_trans_iter_put(&trans, iter); if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + if (ret) { bch_err_inum_ratelimited(c, inode, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7dea6637ae58..9a423a3e4570 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -83,7 +83,7 @@ int bch2_migrate_index_update(struct bch_write_op 
*op) bool extending = false, should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - bch2_trans_reset(&trans, 0); + bch2_trans_begin(&trans); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); @@ -597,6 +597,8 @@ static int __bch2_move_data(struct bch_fs *c, } } while (delay); + bch2_trans_begin(&trans); + k = bch2_btree_iter_peek(iter); stats->pos = iter->pos; @@ -652,8 +654,7 @@ static int __bch2_move_data(struct bch_fs *c, data_cmd, data_opts); if (ret2) { if (ret2 == -EINTR) { - bch2_trans_reset(&trans, 0); - bch2_trans_cond_resched(&trans); + bch2_trans_begin(&trans); continue; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8b168246ca38..3d9c5c5b0eba 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -305,12 +305,12 @@ s64 bch2_remap_range(struct bch_fs *c, dst_done = dst_iter->pos.offset - dst_start.offset; new_i_size = min(dst_iter->pos.offset << 9, new_i_size); - bch2_trans_begin(&trans); - do { struct bch_inode_unpacked inode_u; struct btree_iter *inode_iter; + bch2_trans_begin(&trans); + inode_iter = bch2_inode_peek(&trans, &inode_u, dst_start.inode, BTREE_ITER_INTENT); ret2 = PTR_ERR_OR_ZERO(inode_iter); -- cgit From a88171c9e6150e0692b392362fc2d56d83cef998 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 17:38:15 -0400 Subject: bcachefs: Clean up interior update paths Btree node merging now happens prior to transaction commit, not after, so we don't need to pay attention to BTREE_INSERT_NOUNLOCK. Also, foreground_maybe_merge shouldn't be calling bch2_btree_iter_traverse_all() - this is becoming private to the btree iterator code and should only be called by bch2_trans_begin(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_iter.h | 2 -- fs/bcachefs/btree_update_interior.c | 29 +++-------------------------- 3 files changed, 4 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 89de6a8a8a9e..df3aba54526c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1335,7 +1335,7 @@ out: return ret; } -int bch2_btree_iter_traverse_all(struct btree_trans *trans) +static int bch2_btree_iter_traverse_all(struct btree_trans *trans) { return __btree_iter_traverse_all(trans, 0, _RET_IP_); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3889683e16f8..bcb8f0ebbdf4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -148,8 +148,6 @@ void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); int __must_check bch2_btree_iter_traverse(struct btree_iter *); -int bch2_btree_iter_traverse_all(struct btree_trans *); - struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e9b7af4c3574..23a5a4941df0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -960,9 +960,6 @@ retry: if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); else if (!down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) - return ERR_PTR(-EINTR); - bch2_trans_unlock(trans); down_read(&c->gc_lock); if (!bch2_trans_relock(trans)) { @@ -1005,16 +1002,6 @@ retry: BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { - /* - * this would be cleaner if bch2_journal_preres_get() took a - * closure argument - */ - if 
(flags & BTREE_INSERT_NOUNLOCK) { - trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); - ret = -EINTR; - goto err; - } - bch2_trans_unlock(trans); if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { @@ -1043,8 +1030,7 @@ retry: if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, - !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); if (ret) goto err; @@ -1057,8 +1043,6 @@ err: bch2_btree_update_free(as); if (ret == -EAGAIN) { - BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_trans_unlock(trans); closure_sync(&cl); ret = -EINTR; @@ -1593,12 +1577,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, size_t sib_u64s; int ret = 0, ret2 = 0; - BUG_ON(!btree_node_locked(iter, level)); retry: ret = bch2_btree_iter_traverse(iter); if (ret) - goto err; + return ret; + BUG_ON(!iter->should_be_locked); BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; @@ -1751,13 +1735,6 @@ err: if (ret == -EINTR && bch2_trans_relock(trans)) goto retry; - if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { - ret2 = ret; - ret = bch2_btree_iter_traverse_all(trans); - if (!ret) - goto retry; - } - goto out; } -- cgit From 3cc5288a62f2657b99faab428f993673ca9d033f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jul 2021 16:17:10 -0400 Subject: bcachefs: Change lockrestart_do() to always call bch2_trans_begin() More consistent behaviour means less likely to trip over ourselves in silly ways. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b5f35a419004..a1f2d9822821 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -110,12 +110,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans, ({ \ int _ret; \ \ - while (1) { \ - _ret = (_do); \ - if (_ret != -EINTR) \ - break; \ + do { \ bch2_trans_begin(_trans); \ - } \ + _ret = (_do); \ + } while (_ret == -EINTR); \ \ _ret; \ }) -- cgit From e5af273fcefb13cdd9e3c27d9ee6c52ee6b12264 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jul 2021 17:19:52 -0400 Subject: bcachefs: trans->restarted Start tracking when btree transactions have been restarted - and assert that we're always calling bch2_trans_begin() immediately after transaction restart. 
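For orientation, the core of the new mechanism is tiny: whoever gives up mid-transaction marks it restarted, unlocks, and returns -EINTR, and only the bch2_trans_begin()/traverse_all() path clears the flag again. The helper below is quoted from the btree_iter.h hunk further down in this patch, with a comment added:

	__always_inline
	static inline int btree_trans_restart(struct btree_trans *trans)
	{
		trans->restarted = true;	/* stays set until bch2_trans_begin() */
		bch2_trans_unlock(trans);
		return -EINTR;
	}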
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 19 ++++++++++---- fs/bcachefs/btree_gc.c | 3 ++- fs/bcachefs/btree_iter.c | 50 ++++++++++++++++++++++++++++--------- fs/bcachefs/btree_iter.h | 8 ++++++ fs/bcachefs/btree_key_cache.c | 23 ++++++++++------- fs/bcachefs/btree_types.h | 7 +++--- fs/bcachefs/btree_update_interior.c | 1 + fs/bcachefs/btree_update_leaf.c | 34 ++++++++++++++++++------- 8 files changed, 106 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 252801dee028..5c12897964b6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -655,8 +655,10 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (iter && !bch2_btree_node_relock(iter, level + 1)) + if (iter && !bch2_btree_node_relock(iter, level + 1)) { + btree_trans_restart(iter->trans); return ERR_PTR(-EINTR); + } b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) @@ -695,11 +697,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (iter && (!bch2_trans_relock(iter->trans) || - !bch2_btree_iter_relock_intent(iter))) + !bch2_btree_iter_relock_intent(iter))) { + BUG_ON(!iter->trans->restarted); return ERR_PTR(-EINTR); + } - if (!six_relock_type(&b->c.lock, lock_type, seq)) + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + btree_trans_restart(iter->trans); return ERR_PTR(-EINTR); + } return b; } @@ -824,7 +830,7 @@ lock_node: if (!btree_node_lock(b, k->k.p, level, iter, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { - if (b->hash_val != btree_ptr_hash_val(k)) + if (!trans->restarted) goto retry; return ERR_PTR(-EINTR); } @@ -840,6 +846,7 @@ lock_node: trace_ip, iter->btree_id, &iter->real_pos); + btree_trans_restart(trans); return ERR_PTR(-EINTR); } } @@ -858,8 +865,10 @@ lock_node: */ if (iter && (!bch2_trans_relock(trans) || - !bch2_btree_iter_relock_intent(iter))) + !bch2_btree_iter_relock_intent(iter))) { + BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); + } if (!six_relock_type(&b->c.lock, lock_type, seq)) goto retry; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f0a5b6b2b189..2a84685f4e60 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1735,7 +1735,8 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(&trans), + k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { c->gc_gens_pos = iter->pos; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index df3aba54526c..816b9369c833 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -316,7 +316,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } if (unlikely(deadlock_iter)) { - trace_trans_restart_would_deadlock(iter->trans->ip, ip, + trace_trans_restart_would_deadlock(trans->ip, ip, trans->in_traverse_all, reason, deadlock_iter->btree_id, btree_iter_type(deadlock_iter), @@ -324,6 +324,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, iter->btree_id, btree_iter_type(iter), &pos); + btree_trans_restart(trans); return false; } @@ -404,6 +405,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) ? 
iter->l[l].b->c.lock.state.seq : 0); btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + btree_trans_restart(iter->trans); return false; } } @@ -414,7 +416,11 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) __flatten bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) { - return btree_iter_get_locks(iter, false, trace_ip); + bool ret = btree_iter_get_locks(iter, false, trace_ip); + + if (!ret) + btree_trans_restart(iter->trans); + return ret; } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, @@ -457,6 +463,8 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, btree_iter_get_locks(linked, true, _THIS_IP_); } + if (iter->should_be_locked) + btree_trans_restart(iter->trans); return false; } @@ -505,11 +513,15 @@ bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; + if (unlikely(trans->restarted)) + return false; + trans_for_each_iter(trans, iter) if (btree_iter_should_be_locked(iter) && !bch2_btree_iter_relock(iter, _RET_IP_)) { trace_trans_restart_relock(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos); + BUG_ON(!trans->restarted); return false; } return true; @@ -1088,11 +1100,12 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) return b == *rootp ? 0 : -1; } -static inline int btree_iter_lock_root(struct btree_iter *iter, +static inline int btree_iter_lock_root(struct btree_trans *trans, + struct btree_iter *iter, unsigned depth_want, unsigned long trace_ip) { - struct bch_fs *c = iter->trans->c; + struct bch_fs *c = trans->c; struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; enum six_lock_type lock_type; unsigned i; @@ -1120,8 +1133,11 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, iter, lock_type, lock_root_check_fn, rootp, - trace_ip))) - return -EINTR; + trace_ip))) { + if (trans->restarted) + return -EINTR; + continue; + } if (likely(b == READ_ONCE(*rootp) && b->c.level == iter->level && @@ -1199,10 +1215,10 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, btree_node_unlock(iter, plevel); } -static __always_inline int btree_iter_down(struct btree_iter *iter, +static __always_inline int btree_iter_down(struct btree_trans *trans, + struct btree_iter *iter, unsigned long trace_ip) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; @@ -1257,6 +1273,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, trans->in_traverse_all = true; retry_all: + trans->restarted = false; + nr_sorted = 0; trans_for_each_iter(trans, iter) { @@ -1319,11 +1337,11 @@ retry_all: } if (hweight64(trans->iters_live) > 1) - ret = -EINTR; + ret = btree_trans_restart(trans); else trans_for_each_iter(trans, iter) if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { - ret = -EINTR; + ret = btree_trans_restart(trans); break; } out: @@ -1414,8 +1432,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, */ while (iter->level > depth_want) { ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(iter, trace_ip) - : btree_iter_lock_root(iter, depth_want, trace_ip); + ? 
btree_iter_down(trans, iter, trace_ip) + : btree_iter_lock_root(trans, iter, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) { /* @@ -1443,6 +1461,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, iter->uptodate = BTREE_ITER_NEED_PEEK; out: + BUG_ON((ret == -EINTR) != !!trans->restarted); trace_iter_traverse(trans->ip, trace_ip, btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, ret); @@ -1589,6 +1608,8 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; + EBUG_ON(iter->trans->restarted); + if (!cmp) goto out; @@ -2158,6 +2179,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, struct btree_iter *iter, *best = NULL; struct bpos real_pos, pos_min = POS_MIN; + EBUG_ON(trans->restarted); + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && btree_node_type_is_extents(btree_id) && !(flags & BTREE_ITER_NOT_EXTENTS) && @@ -2322,6 +2345,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (old_bytes) { trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + btree_trans_restart(trans); return ERR_PTR(-EINTR); } } @@ -2396,6 +2420,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) if (!(flags & TRANS_RESET_NOTRAVERSE) && trans->iters_linked) bch2_btree_iter_traverse_all(trans); + + trans->restarted = false; } static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bcb8f0ebbdf4..243f65f0b7ad 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -117,6 +117,14 @@ bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +__always_inline +static inline int btree_trans_restart(struct btree_trans *trans) +{ + trans->restarted = true; + bch2_trans_unlock(trans); + return -EINTR; +} + bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index cb71fe0dd742..8fb18ad2e1ae 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -215,7 +215,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (!bch2_btree_node_relock(ck_iter, 0)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); - ret = -EINTR; + ret = btree_trans_restart(trans); goto err; } @@ -234,6 +234,10 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } + /* + * XXX: not allowed to be holding read locks when we take a write lock, + * currently + */ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); if (new_k) { kfree(ck->k); @@ -300,10 +304,8 @@ retry: if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, bkey_cached_check_fn, iter, _THIS_IP_)) { - if (ck->key.btree_id != iter->btree_id || - bpos_cmp(ck->key.pos, iter->pos)) { + if (!trans->restarted) goto retry; - } trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = -EINTR; @@ -323,10 +325,10 @@ retry: iter->l[0].b = (void *) ck; fill: if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { - if (!btree_node_intent_locked(iter, 0)) - bch2_btree_iter_upgrade(iter, 1); - if (!btree_node_intent_locked(iter, 0)) { + if (!iter->locks_want && + !!__bch2_btree_iter_upgrade(iter, 1)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); + 
BUG_ON(!trans->restarted); ret = -EINTR; goto err; } @@ -342,9 +344,12 @@ fill: iter->uptodate = BTREE_ITER_NEED_PEEK; if ((iter->flags & BTREE_ITER_INTENT) && - !iter->locks_want && - __bch2_btree_iter_upgrade(iter, 1)) + !bch2_btree_iter_upgrade(iter, 1)) { + BUG_ON(!trans->restarted); ret = -EINTR; + } + + BUG_ON(!ret && !btree_node_locked(iter, 0)); return ret; err: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 78b312e5bcf3..4fa37fbc41fa 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -380,9 +380,10 @@ struct btree_trans { int srcu_idx; u8 nr_updates; - unsigned used_mempool:1; - unsigned error:1; - unsigned in_traverse_all:1; + bool used_mempool:1; + bool error:1; + bool in_traverse_all:1; + bool restarted:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 23a5a4941df0..2e8697196ac9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1006,6 +1006,7 @@ retry: if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { bch2_btree_update_free(as); + btree_trans_restart(trans); return ERR_PTR(ret); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b354624133a1..3fbdf3e5fe01 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -384,6 +384,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (race_fault()) { trace_trans_restart_fault_inject(trans->ip, trace_ip); + trans->restarted = true; return -EINTR; } @@ -520,10 +521,17 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; } - return u64s_delta <= 0 - ? 
(bch2_foreground_maybe_merge(trans, iter, iter->level, trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) : 0; + if (u64s_delta > 0) + return 0; + + ret = bch2_foreground_maybe_merge(trans, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK); + if (!ret) { + ret = -EINTR; + trans->restarted = true; + } + + return ret; } /* @@ -587,6 +595,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trace_trans_restart_upgrade(trans->ip, trace_ip, iter->btree_id, &iter->real_pos); + trans->restarted = true; return -EINTR; } } else { @@ -696,6 +705,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, trace_trans_restart_btree_node_split(trans->ip, trace_ip, i->iter->btree_id, &i->iter->real_pos); + trans->restarted = true; ret = -EINTR; } break; @@ -704,7 +714,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); if (ret) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -716,12 +726,15 @@ int bch2_trans_commit_error(struct btree_trans *trans, bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) - return -EAGAIN; + !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { + trans->restarted = true; + ret = -EAGAIN; + break; + } ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -737,7 +750,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); if (ret < 0) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -750,6 +763,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, break; } + BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); return ret; @@ -972,6 +986,7 @@ int __bch2_trans_commit(struct btree_trans *trans) trace_trans_restart_upgrade(trans->ip, _RET_IP_, i->iter->btree_id, &i->iter->pos); + trans->restarted = true; ret = -EINTR; goto out; } @@ -994,6 +1009,7 @@ int __bch2_trans_commit(struct btree_trans *trans) goto err; } retry: + BUG_ON(trans->restarted); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ret = do_bch2_trans_commit(trans, &i, _RET_IP_); -- cgit From b4e09b351be7085cada5c08c1eabee5f0eb498b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jul 2021 18:01:52 -0400 Subject: bcachefs: bch2_btree_iter_traverse() shouldn't normally call traverse_all() If there's more than one iterator in the btree_trans, it's required to call bch2_trans_begin() to handle transaction restarts.
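The guard the next hunk adds is just a popcount test on the trans->iters_linked bitmask: falling back to the whole-transaction traversal is only safe when a single iterator is linked; with more than one, the failure has to surface as a restart for the caller's bch2_trans_begin() loop to handle. A stand-alone illustration of that test (hweight64() is the kernel's 64-bit population count, modeled here with the GCC builtin):

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

/* model of trans->iters_linked: one bit per linked iterator */
static bool may_fall_back_to_traverse_all(uint64_t iters_linked)
{
	return __builtin_popcountll(iters_linked) == 1;
}

int main(void)
{
	assert(may_fall_back_to_traverse_all(1 << 3));		/* one iterator linked */
	assert(!may_fall_back_to_traverse_all((1 << 3) | 1));	/* two iterators linked */
	return 0;
}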
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 816b9369c833..ac7a7c18ebe2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1476,8 +1476,10 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ret = bch2_trans_cond_resched(trans) ?: btree_iter_traverse_one(iter, _RET_IP_); - if (unlikely(ret)) + if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); + BUG_ON(ret == -EINTR); + } return ret; } -- cgit From e829b7175b54c9ccdf4412baea592bc495bc97f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Jul 2021 12:39:11 -0400 Subject: bcachefs: Ensure btree_iter_traverse() obeys iter->should_be_locked iter->should_be_locked means that if bch2_btree_iter_relock() fails, we need to restart the transaction. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ac7a7c18ebe2..b7292f932000 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1403,6 +1403,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter, unsigned l, depth_want = iter->level; int ret = 0; + /* + * Ensure we obey iter->should_be_locked: if it's set, we can't unlock + * and re-traverse the iterator without a transaction restart: + */ + if (iter->should_be_locked) { + ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR; + goto out; + } + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ret = bch2_btree_iter_traverse_cached(iter); goto out; -- cgit From 955af634410cbaead0498485ac376165b0f947fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 23:57:28 -0400 Subject: bcachefs: __bch2_trans_commit() no longer calls bch2_trans_reset() It's now the caller's responsibility to call bch2_trans_begin. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 25 +++++++++---------------- fs/bcachefs/btree_iter.h | 17 +---------------- fs/bcachefs/btree_update_leaf.c | 20 ++++++++++++++------ 3 files changed, 24 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b7292f932000..929bf20c1029 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2382,22 +2382,14 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) } /** - * bch2_trans_reset() - reset a transaction after a interrupted attempt + * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset - * @flags: transaction reset flags. * * While iterating over nodes or updating nodes a attempt to lock a btree * node may return EINTR when the trylock fails. When this occurs - * bch2_trans_reset() or bch2_trans_begin() should be called and the - * transaction retried. - * - * Transaction reset flags include: - * - * - TRANS_RESET_NOUNLOCK - Do not attempt to unlock and reschedule the - * transaction. - * - TRANS_RESET_NOTRAVERSE - Do not traverse all linked iters. + * bch2_trans_begin() should be called and the transaction retried. 
*/ -void bch2_trans_reset(struct btree_trans *trans, unsigned flags) +void bch2_trans_begin(struct btree_trans *trans) { struct btree_iter *iter; @@ -2405,8 +2397,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| BTREE_ITER_SET_POS_AFTER_COMMIT); + /* + * XXX: we shouldn't be doing this if the transaction was restarted, but + * currently we still overflow transaction iterators if we do that + * */ bch2_trans_unlink_iters(trans); - trans->iters_touched &= trans->iters_live; trans->extra_journal_res = 0; @@ -2425,11 +2420,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } - if (!(flags & TRANS_RESET_NOUNLOCK)) - bch2_trans_cond_resched(trans); + bch2_trans_cond_resched(trans); - if (!(flags & TRANS_RESET_NOTRAVERSE) && - trans->iters_linked) + if (trans->restarted) bch2_btree_iter_traverse_all(trans); trans->restarted = false; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 243f65f0b7ad..aeabc07d2c9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -323,22 +323,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr trans->iters_touched &= ~(1ULL << iter->idx); } -#define TRANS_RESET_NOTRAVERSE (1 << 0) -#define TRANS_RESET_NOUNLOCK (1 << 1) - -void bch2_trans_reset(struct btree_trans *, unsigned); - -/** - * bch2_trans_begin() - ensure lock consistency of transaction on retry - * @trans: transaction to prepare - * - * Ensure lock ordering is correct before potentially retrying a transaction - * after a failed trylock. - */ -static inline void bch2_trans_begin(struct btree_trans *trans) -{ - return bch2_trans_reset(trans, 0); -} +void bch2_trans_begin(struct btree_trans *); void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3fbdf3e5fe01..b119bb9eb6f1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -923,7 +923,7 @@ int __bch2_trans_commit(struct btree_trans *trans) struct btree_insert_entry *i = NULL; struct btree_iter *iter; bool trans_trigger_run; - unsigned u64s, reset_flags = 0; + unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1030,11 +1030,19 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_reset: - if (!ret) - reset_flags |= TRANS_RESET_NOTRAVERSE; - if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) - reset_flags |= TRANS_RESET_NOUNLOCK; - bch2_trans_reset(trans, reset_flags); + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; + trans->extra_journal_entries = NULL; + trans->extra_journal_entry_u64s = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } return ret; err: -- cgit From b253a90d06b8a8004eeb057e82f72dc8120d972b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jul 2021 14:25:01 -0400 Subject: bcachefs: Btree splits no longer automatically cause a transaction restart With the new and improved handling of transaction restarts, this should finally be safe. 
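Concretely, "safe" here means the error path can treat a full leaf like any other recoverable condition: if bch2_btree_split_leaf() succeeds the commit is simply retried within the same transaction, and if it has to restart, the -EINTR now flows back to the caller's bch2_trans_begin() loop instead of being special-cased. The surviving branch looks roughly like this (condensed from the hunk below, not a verbatim excerpt):

	case BTREE_INSERT_BTREE_NODE_FULL:
		ret = bch2_btree_split_leaf(trans, i->iter, trans->flags);
		if (!ret)
			return 0;	/* split succeeded: retry the commit */

		if (ret == -EINTR)
			trace_trans_restart_btree_node_split(trans->ip, trace_ip,
							     i->iter->btree_id,
							     &i->iter->real_pos);
		break;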
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 49 +++++------------------------------------ 1 file changed, 5 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b119bb9eb6f1..7ce94340f817 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -524,14 +524,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree if (u64s_delta > 0) return 0; - ret = bch2_foreground_maybe_merge(trans, iter, iter->level, + return bch2_foreground_maybe_merge(trans, iter, iter->level, trans->flags & ~BTREE_INSERT_NOUNLOCK); - if (!ret) { - ret = -EINTR; - trans->restarted = true; - } - - return ret; } /* @@ -664,50 +658,17 @@ int bch2_trans_commit_error(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - unsigned flags = trans->flags; - - /* - * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree - * update; if we haven't done anything yet it doesn't apply - */ - flags &= ~BTREE_INSERT_NOUNLOCK; switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(trans, i->iter, flags); - - /* - * if the split succeeded without dropping locks the insert will - * still be atomic (what the caller peeked() and is overwriting - * won't have changed) - */ -#if 0 - /* - * XXX: - * split -> btree node merging (of parent node) might still drop - * locks when we're not passing it BTREE_INSERT_NOUNLOCK - * - * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that - * will inhibit merging - but we don't have a reliable way yet - * (do we?) of checking if we dropped locks in this path - */ + ret = bch2_btree_split_leaf(trans, i->iter, trans->flags); if (!ret) - goto retry; -#endif + return 0; - /* - * don't care if we got ENOSPC because we told split it - * couldn't block: - */ - if (!ret || - ret == -EINTR || - (flags & BTREE_INSERT_NOUNLOCK)) { + if (ret == -EINTR) trace_trans_restart_btree_node_split(trans->ip, trace_ip, i->iter->btree_id, &i->iter->real_pos); - trans->restarted = true; - ret = -EINTR; - } break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -764,7 +725,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); - BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); + BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); return ret; } -- cgit From 1a488e73067c65086191a63fe61e57692383fb27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jul 2021 22:15:04 -0400 Subject: bcachefs: Kill BTREE_INSERT_NOUNLOCK With the recent transaction restart changes, it's no longer needed - all transaction commits have BTREE_INSERT_NOUNLOCK semantics. 
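Callers change mechanically: everywhere BTREE_INSERT_NOUNLOCK was passed, a plain 0 goes in its place, as the diff below shows for fs.c and friends. The flag itself drops out of the position-derived bit scheme in btree_update.h; the sketch below shows that idiom in isolation (an illustrative excerpt, not the full flag list). Deleting an enumerator renumbers the remaining bits, which is harmless here because these commit flags only exist in memory and are never persisted:

/* bit positions come from enumerator order... */
enum btree_insert_flags {
	/* __BTREE_INSERT_NOUNLOCK used to sit first; removing it renumbers the rest */
	__BTREE_INSERT_NOFAIL,
	__BTREE_INSERT_NOCHECK_RW,
	__BTREE_INSERT_LAZY_RW,
};

/* ...and the flags callers actually pass are derived from them */
#define BTREE_INSERT_NOFAIL	(1 << __BTREE_INSERT_NOFAIL)
#define BTREE_INSERT_NOCHECK_RW	(1 << __BTREE_INSERT_NOCHECK_RW)
#define BTREE_INSERT_LAZY_RW	(1 << __BTREE_INSERT_LAZY_RW)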
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 +-- fs/bcachefs/btree_key_cache.c | 1 - fs/bcachefs/btree_update.h | 6 ------ fs/bcachefs/btree_update_interior.c | 3 +-- fs/bcachefs/btree_update_leaf.c | 4 ++-- fs/bcachefs/fs.c | 12 +++--------- fs/bcachefs/fsck.c | 3 +-- fs/bcachefs/quota.c | 2 +- 8 files changed, 9 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index d3e2e24758ba..39ac6d2e178d 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -336,8 +336,7 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK); + &inode->ei_journal_seq, 0); btree_err: bch2_trans_iter_put(&trans, inode_iter); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 8fb18ad2e1ae..ac8f40810d7a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -406,7 +406,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a1f2d9822821..217b52e1a168 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,7 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, @@ -29,11 +28,6 @@ enum btree_insert_flags { __BCH_HASH_SET_MUST_REPLACE, }; -/* - * Don't drop locks _after_ successfully updating btree: - */ -#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) - /* Don't check for -ENOSPC: */ #define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2e8697196ac9..2594738f3d53 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1927,8 +1927,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED| - BTREE_INSERT_NOUNLOCK); + BTREE_INSERT_JOURNAL_RESERVED); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7ce94340f817..882b9da8fe61 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -524,8 +524,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree if (u64s_delta > 0) return 0; - return bch2_foreground_maybe_merge(trans, iter, iter->level, - trans->flags & ~BTREE_INSERT_NOUNLOCK); + return bch2_foreground_maybe_merge(trans, iter, + iter->level, trans->flags); } /* diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b2d6e80bbb78..f6c058540712 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,7 +157,6 @@ retry: bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); /* @@ -295,8 +294,7 @@ retry: if (unlikely(ret)) goto err_before_quota; - ret = bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_NOUNLOCK); + ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, 
bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -417,8 +415,7 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK, + ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, bch2_link_trans(&trans, dir->v.i_ino, inode->v.i_ino, &dir_u, &inode_u, @@ -470,7 +467,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_trans_init(&trans, c, 4, 1024); ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, dir->v.i_ino, &dir_u, @@ -591,8 +587,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = __bch2_trans_do(&trans, NULL, &journal_seq, - BTREE_INSERT_NOUNLOCK, + ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, bch2_rename_trans(&trans, src_dir->v.i_ino, &src_dir_u, dst_dir->v.i_ino, &dst_dir_u, @@ -735,7 +730,6 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); btree_err: bch2_trans_iter_put(&trans, inode_iter); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 63d42542c194..36eba46d566e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -803,8 +803,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __write_inode(trans, &target, target_snapshot) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOUNLOCK); + BTREE_INSERT_LAZY_RW); if (ret) return ret; return -EINTR; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 35b409e0f366..7861781a4a7f 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -760,7 +760,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); -- cgit From 0d32711ef65511ff52b5c42e5618a6e83b406cad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jul 2021 22:32:05 -0400 Subject: bcachefs: traverse_all() shouldn't be restarting the transaction We're only called by bch2_trans_begin() now. 
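Restart handling now lives entirely in the caller; condensed from the
bch2_trans_begin() change earlier in this series (not a verbatim copy):

  void bch2_trans_begin(struct btree_trans *trans)
  {
          /* ... reset iterators, journal reservation, fs_usage deltas ... */

          if (trans->restarted)
                  bch2_btree_iter_traverse_all(trans);

          trans->restarted = false;
  }

Since the only caller is bch2_trans_begin(), which runs at the top of a
retry anyway, signalling yet another restart from inside traverse_all()
no longer serves any purpose.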
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 929bf20c1029..d1a03fdba9ce 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1335,15 +1335,6 @@ retry_all: if (ret) goto retry_all; } - - if (hweight64(trans->iters_live) > 1) - ret = btree_trans_restart(trans); - else - trans_for_each_iter(trans, iter) - if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { - ret = btree_trans_restart(trans); - break; - } out: bch2_btree_cache_cannibalize_unlock(c); -- cgit From 9cba7bf7c7edc6ae6579945588964b3b90e33258 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jul 2021 22:28:39 -0400 Subject: bcachefs: Don't drop read locks at transaction commit time Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 882b9da8fe61..20275be97256 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -348,11 +348,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) -{ - __bch2_btree_iter_unlock(iter); -} - static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -582,21 +577,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, * or anything else that might call bch2_trans_relock(), since that * would just retake the read locks: */ - trans_for_each_iter(trans, iter) { - if (iter->nodes_locked != iter->nodes_intent_locked) { - if (btree_iter_keep(trans, iter)) { - if (!bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); - trans->restarted = true; - return -EINTR; - } - } else { - bch2_btree_iter_unlock_noinline(iter); - } + trans_for_each_iter(trans, iter) + if (iter->nodes_locked != iter->nodes_intent_locked && + !bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip, trace_ip, + iter->btree_id, + &iter->real_pos); + trans->restarted = true; + return -EINTR; } - } trans_for_each_update(trans, i) { const char *invalid = bch2_bkey_invalid(c, -- cgit From 877da05ffb13c1a998070707e0d15df0167f9364 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Jul 2021 14:33:06 -0400 Subject: bcachefs: Zero out mem_ptr field in btree ptr keys from journal replay This fixes a bad ptr deref on recovery from unclean shutdown in bch2_btree_node_get_noiter(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 84e224fb0d01..afb72648fe54 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -39,6 +39,20 @@ static void drop_alloc_keys(struct journal_keys *keys) keys->nr = dst; } +/* + * Btree node pointers have a field to stack a pointer to the in memory btree + * node; we need to zero out this field when reading in btree nodes, or when + * reading in keys from the journal: + */ +static void zero_out_btree_mem_ptr(struct journal_keys *keys) +{ + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + /* iterate over keys read from the journal: */ static int __journal_key_cmp(enum btree_id l_btree_id, @@ -1072,6 +1086,8 @@ use_clean: drop_alloc_keys(&c->journal_keys); } + zero_out_btree_mem_ptr(&c->journal_keys); + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; -- cgit From 0423fb7185e3c0178b3a09f24afc3777c2ef9522 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Jun 2021 15:45:45 -0400 Subject: bcachefs: Keep a sorted list of btree iterators This will be used to make other operations on btree iterators within a transaction more efficient, and enable some other improvements to how we manage btree iterators. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 235 ++++++++++++++++++++++++++++++++++++++-------- fs/bcachefs/btree_iter.h | 43 +++++++-- fs/bcachefs/btree_types.h | 4 + fs/bcachefs/util.h | 14 +++ 4 files changed, 249 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d1a03fdba9ce..c14be8093116 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -18,10 +18,20 @@ #include static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static inline void btree_trans_sort_iters(struct btree_trans *); static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, + struct btree_iter *); static void btree_iter_copy(struct btree_iter *, struct btree_iter *); +static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: + bkey_cmp(l->real_pos, r->real_pos); +} + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); @@ -925,6 +935,7 @@ static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); iter->real_pos = k.k ? k.k->p : l->b->key.k.p; + iter->trans->iters_sorted = false; return k; } @@ -935,6 +946,7 @@ static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, bch2_btree_node_iter_prev(&l->iter, l->b)); iter->real_pos = k.k ? 
k.k->p : l->b->data->min_key; + iter->trans->iters_sorted = false; return k; } @@ -1264,9 +1276,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter *iter; - u8 sorted[BTREE_ITER_MAX]; - int i, nr_sorted = 0; + struct btree_iter *iter, *prev = NULL; + int i; if (trans->in_traverse_all) return -EINTR; @@ -1275,28 +1286,21 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, retry_all: trans->restarted = false; - nr_sorted = 0; - - trans_for_each_iter(trans, iter) { - sorted[nr_sorted++] = iter->idx; + trans_for_each_iter(trans, iter) iter->should_be_locked = false; - } - -#define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) - bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -#undef btree_iter_cmp_by_idx + btree_trans_sort_iters(trans); - for (i = nr_sorted - 2; i >= 0; --i) { - struct btree_iter *iter1 = trans->iters + sorted[i]; - struct btree_iter *iter2 = trans->iters + sorted[i + 1]; + trans_for_each_iter_inorder_reverse(trans, iter, i) { + if (prev) { + if (iter->btree_id == prev->btree_id && + iter->locks_want < prev->locks_want) + __bch2_btree_iter_upgrade(iter, prev->locks_want); + else if (!iter->locks_want && prev->locks_want) + __bch2_btree_iter_upgrade(iter, 1); + } - if (iter1->btree_id == iter2->btree_id && - iter1->locks_want < iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, iter2->locks_want); - else if (!iter1->locks_want && iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, 1); + prev = iter; } bch2_trans_unlock(trans); @@ -1321,20 +1325,29 @@ retry_all: BUG_ON(ret && ret != -EINTR); /* Now, redo traversals in correct order: */ - for (i = 0; i < nr_sorted; i++) { - unsigned idx = sorted[i]; + i = 0; + while (i < trans->nr_sorted) { + iter = trans->iters + trans->sorted[i]; - /* - * sucessfully traversing one iterator can cause another to be - * unlinked, in btree_key_cache_fill() - */ - if (!(trans->iters_linked & (1ULL << idx))) - continue; + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); - ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); + ret = btree_iter_traverse_one(iter, _THIS_IP_); if (ret) goto retry_all; + + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + + if (iter->nodes_locked) + i++; } + + /* + * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() + * and relock(), relock() won't relock since iter->should_be_locked + * isn't set yet, which is all fine + */ + trans_for_each_iter(trans, iter) + BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1536,6 +1549,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); iter->pos = iter->real_pos = b->key.k.p; + iter->trans->iters_sorted = false; bch2_btree_iter_verify(iter); iter->should_be_locked = true; @@ -1593,6 +1607,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) } iter->pos = iter->real_pos = b->key.k.p; + iter->trans->iters_sorted = false; bch2_btree_iter_verify(iter); iter->should_be_locked = true; @@ -1617,6 +1632,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p iter->real_pos = new_pos; iter->should_be_locked = false; + iter->trans->iters_sorted = false; if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); @@ -2032,6 +2048,135 @@ static inline void bch2_btree_iter_init(struct 
btree_trans *trans, /* new transactional stuff: */ +#ifdef CONFIG_BCACHEFS_DEBUG +static void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ + struct btree_iter *iter; + unsigned i; + + BUG_ON(trans->nr_sorted != hweight64(trans->iters_linked)); + + trans_for_each_iter(trans, iter) { + BUG_ON(iter->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); + } + + for (i = 0; i < trans->nr_sorted; i++) { + unsigned idx = trans->sorted[i]; + + EBUG_ON(!(trans->iters_linked & (1ULL << idx))); + BUG_ON(trans->iters[idx].sorted_idx != i); + } +} +#else +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +#endif + +static void btree_trans_verify_sorted(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_iter *iter, *prev = NULL; + unsigned i; + + trans_for_each_iter_inorder(trans, iter, i) { + BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); + prev = iter; + } +#endif +} + +static noinline void __btree_trans_sort_iters(struct btree_trans *trans) +{ + int i, l = 0, r = trans->nr_sorted, inc = 1; + bool swapped; + + /* + * Cocktail shaker sort: this is efficient because iterators will be + * mostly sorteda. + */ + do { + swapped = false; + + for (i = inc > 0 ? l : r - 2; + i + 1 < r && i >= l; + i += inc) { + if (btree_iter_cmp(trans->iters + trans->sorted[i], + trans->iters + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->iters[trans->sorted[i]].sorted_idx = i; + trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } + } + + if (inc > 0) + --r; + else + l++; + inc = -inc; + } while (swapped); + + trans->iters_sorted = true; + + btree_trans_verify_sorted(trans); +} + +static inline void btree_trans_sort_iters(struct btree_trans *trans) +{ + btree_trans_verify_sorted_refs(trans); + + if (trans->iters_sorted) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + btree_trans_verify_sorted(trans); + return; + } + __btree_trans_sort_iters(trans); +} + +static inline void btree_iter_list_remove(struct btree_trans *trans, + struct btree_iter *iter) +{ + unsigned i; + + EBUG_ON(iter->sorted_idx >= trans->nr_sorted); +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + trans->nr_sorted--; + memmove_u64s_down_small(trans->sorted + iter->sorted_idx, + trans->sorted + iter->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8)); +#else + array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); +#endif + for (i = iter->sorted_idx; i < trans->nr_sorted; i++) + trans->iters[trans->sorted[i]].sorted_idx = i; + + iter->sorted_idx = U8_MAX; +} + +static inline void btree_iter_list_add(struct btree_trans *trans, + struct btree_iter *pos, + struct btree_iter *iter) +{ + unsigned i; + + iter->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + memmove_u64s_up_small(trans->sorted + iter->sorted_idx + 1, + trans->sorted + iter->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8)); + trans->nr_sorted++; + trans->sorted[iter->sorted_idx] = iter->idx; +#else + array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); +#endif + + for (i = iter->sorted_idx; i < trans->nr_sorted; i++) + trans->iters[trans->sorted[i]].sorted_idx = i; + + btree_trans_verify_sorted_refs(trans); +} + static void btree_iter_child_free(struct btree_iter *iter) { struct btree_iter *child = btree_iter_child(iter); @@ -2049,7 +2194,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, struct btree_iter *child = btree_iter_child(iter); if (!child) { - child = btree_trans_iter_alloc(trans); + child = btree_trans_iter_alloc(trans, iter); child->ip_allocated = ip; iter->child_idx = child->idx; @@ -2065,10 +2210,14 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, { btree_iter_child_free(&trans->iters[idx]); + btree_iter_list_remove(trans, &trans->iters[idx]); + __bch2_btree_iter_unlock(&trans->iters[idx]); trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); trans->iters_touched &= ~(1ULL << idx); + + btree_trans_verify_sorted_refs(trans); } int bch2_trans_iter_put(struct btree_trans *trans, @@ -2109,12 +2258,15 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) struct btree_iter *iter; struct btree_insert_entry *i; + unsigned idx; char buf[100]; - trans_for_each_iter(trans, iter) + btree_trans_sort_iters(trans); + + trans_for_each_iter_inorder(trans, iter, idx) printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), + (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), btree_iter_live(trans, iter) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", @@ -2130,11 +2282,14 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) panic("trans iter oveflow\n"); } -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, + struct btree_iter *pos) { struct btree_iter *iter; unsigned idx; + btree_trans_verify_sorted_refs(trans); + if (unlikely(trans->iters_linked == ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) btree_trans_iter_alloc_fail(trans); @@ -2145,10 +2300,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) iter->trans = trans; iter->idx = idx; iter->child_idx = U8_MAX; + iter->sorted_idx = U8_MAX; iter->flags = 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; trans->iters_linked |= 1ULL << idx; + + btree_iter_list_add(trans, pos, iter); return iter; } @@ -2170,6 +2328,7 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + dst->trans->iters_sorted = false; } struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, @@ -2223,10 +2382,10 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, } if (!best) { - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, best); bch2_btree_iter_init(trans, iter, btree_id); } else if (btree_iter_keep(trans, best)) { - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, best); btree_iter_copy(iter, best); } else { iter = best; @@ -2307,7 +2466,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, { struct btree_iter *iter; - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, src); btree_iter_copy(iter, src); trans->iters_live |= 1ULL << iter->idx; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index aeabc07d2c9c..6fb0cb8252eb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -71,6 +71,36 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) (_iter); \ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) +static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +{ + unsigned idx = iter ? iter->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted + ? trans->iters + trans->sorted[idx] + : NULL; +} + +static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +{ + unsigned idx = iter ? iter->sorted_idx : trans->nr_sorted; + + return idx + ? trans->iters + trans->sorted[idx - 1] + : NULL; +} + +#define trans_for_each_iter_inorder(_trans, _iter, _i) \ + for (_i = 0; \ + ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + _i++) + +#define trans_for_each_iter_inorder_reverse(_trans, _iter, _i) \ + for (_i = trans->nr_sorted - 1; \ + ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) >= 0;\ + --_i) + static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) { @@ -191,19 +221,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) { - return iter->child_idx == U8_MAX ? NULL - : iter->trans->iters + iter->child_idx; + return idx != U8_MAX ? 
trans->iters + idx : NULL; } -/* Sort order for locking btree iterators: */ -static inline int btree_iter_lock_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) { - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->real_pos, r->real_pos); + return idx_to_btree_iter(iter->trans, iter->child_idx); } /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4fa37fbc41fa..7a9aece2eb87 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -246,6 +246,7 @@ struct btree_iter { u8 idx; u8 child_idx; + u8 sorted_idx; /* btree_iter_copy starts here: */ u16 flags; @@ -379,11 +380,13 @@ struct btree_trans { unsigned long ip; int srcu_idx; + u8 nr_sorted; u8 nr_updates; bool used_mempool:1; bool error:1; bool in_traverse_all:1; bool restarted:1; + bool iters_sorted:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: @@ -398,6 +401,7 @@ struct btree_trans { unsigned mem_bytes; void *mem; + u8 sorted[BTREE_ITER_MAX + 8]; struct btree_iter *iters; struct btree_insert_entry *updates; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index a0cbebf190b4..41dfc5867c9e 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -593,6 +593,20 @@ static inline void memmove_u64s_down(void *dst, const void *src, __memmove_u64s_down(dst, src, u64s); } +static inline void __memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + memcpy_u64s_small(dst, src, u64s); +} + +static inline void memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down_small(dst, src, u64s); +} + static inline void __memmove_u64s_up_small(void *_dst, const void *_src, unsigned u64s) { -- cgit From 0f120eac34f31e7e6c4904d9e92e2e95e2edfbaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Jul 2021 17:59:37 -0400 Subject: bcachefs: Add flags field to bch2_inode_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 23 +++++++++++++++++------ fs/bcachefs/inode.h | 2 ++ 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 102bd3c9d30f..63f50891594c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -371,6 +371,22 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } +static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); + +#define x(_name, _bits) \ + pr_buf(out, #_name " %llu ", (u64) inode->_name); + BCH_INODE_FIELDS() +#undef x +} + +void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "inum: %llu ", inode->bi_inum); + __bch2_inode_unpacked_to_text(out, inode); +} + void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -382,12 +398,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } - pr_buf(out, "mode: %o ", unpacked.bi_mode); - -#define x(_name, _bits) \ - pr_buf(out, #_name ": %llu ", (u64) unpacked._name); - BCH_INODE_FIELDS() -#undef x + __bch2_inode_unpacked_to_text(out, &unpacked); } const char *bch2_inode_generation_invalid(const struct bch_fs *c, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 2cb081ae44d9..d67af4f56f05 100644 --- 
a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -55,6 +55,8 @@ void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + struct btree_iter *bch2_inode_peek(struct btree_trans *, struct bch_inode_unpacked *, u64, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, -- cgit From e3637266024de9bafa64f315d7d03cf0291ace6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 30 Jul 2021 18:01:33 -0400 Subject: bcachefs: Ensure that new inodes hit underlying btree Inode creation is done with non-cached btree iterators, but then in the same transaction the inode may be updated again with a cached iterator - it makes cache coherency easier if new inodes always land in the underlying btree. This patch adds a check to bch2_trans_update() - if the same key is updated multiple times in the same transaction with both cached and non cache iterators, use the non cached iterator. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 20275be97256..179091e4c561 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1062,7 +1062,19 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { BUG_ON(i->trans_triggers_run); - *i = n; + + /* + * This is a hack to ensure that inode creates update the btree, + * not the key cache, which helps with cache coherency issues in + * other areas: + */ + if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && + btree_iter_type(i->iter) != BTREE_ITER_CACHED) { + i->k = n.k; + i->flags = n.flags; + } else { + *i = n; + } } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); -- cgit From 3737e0ddfbce4791a6415fa685e235b03924ff01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Aug 2021 13:02:39 -0400 Subject: bcachefs: Fix an unhandled transaction restart __bch2_read() -> __bch2_read_extent() -> bch2_bucket_io_time_reset() may cause a transaction restart, which we don't return an error for because it doesn't prevent us from making forward progress on the read we're submitting. Instead, change __bch2_read() and bchfs_read() to check for transaction restarts. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 9 +++++++++ fs/bcachefs/io.c | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 30e5acd2e97c..9ac10b72d1cf 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -791,6 +791,15 @@ retry: unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + break; + } + bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f293876e0bbc..30d9b6e4abf7 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2288,6 +2288,15 @@ retry: unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(&trans)) { + ret = -EINTR; + break; + } + bch2_btree_iter_set_pos(iter, POS(inode, bvec_iter.bi_sector)); -- cgit From d2c50773946dfc21021c18c03aa35f1ccb47669a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Aug 2021 18:21:35 -0400 Subject: bcachefs: Fix btree_trans_peek_updates() Should have been using bpos_cmp(), not bkey_cmp(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c14be8093116..fa19ef141252 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1727,8 +1727,7 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return ret; } -static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter, - struct bpos pos) +static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; struct bkey_i *ret = NULL; @@ -1738,7 +1737,7 @@ static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *ite continue; if (i->btree_id > iter->btree_id) break; - if (bpos_cmp(i->k->k.p, pos) < 0) + if (bpos_cmp(i->k->k.p, iter->real_pos) < 0) continue; if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) ret = i->k; @@ -1747,11 +1746,10 @@ static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *ite return ret; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, - struct bpos pos) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) { return iter->flags & BTREE_ITER_WITH_UPDATES - ? __btree_trans_peek_updates(iter, pos) + ? 
__btree_trans_peek_updates(iter) : NULL; } @@ -1770,7 +1768,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); start: - next_update = btree_trans_peek_updates(iter, search_key); btree_iter_set_search_pos(iter, search_key); while (1) { @@ -1778,8 +1775,13 @@ start: if (unlikely(ret)) return bkey_s_c_err(ret); + /* + * btree_iter_level_peek() mutates iter->real_pos, which + * btree_trans_peek_updates() checks against, so we have to call + * them in this order: + */ + next_update = btree_trans_peek_updates(iter); k = btree_iter_level_peek(iter, &iter->l[0]); - if (next_update && bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { iter->k = next_update->k; @@ -1931,6 +1933,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_i *next_update; struct bkey_cached *ck; + next_update = btree_trans_peek_updates(iter); + switch (btree_iter_type(iter)) { case BTREE_ITER_KEYS: k = btree_iter_level_peek_all(iter, &iter->l[0]); @@ -1948,7 +1952,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) BUG(); } - next_update = btree_trans_peek_updates(iter, search_key); if (next_update && (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { iter->k = next_update->k; -- cgit From c8476a4eb29972d756df33c3eb85c5792f798748 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Aug 2021 18:19:33 -0400 Subject: bcachefs: Minor btree iter refactoring This makes the flow control in bch2_btree_iter_peek() and bch2_btree_iter_peek_prev() a bit cleaner. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 59 ++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fa19ef141252..f38f231fb296 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1696,37 +1696,6 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) -{ - struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0; - - /* - * Typically, we don't want to modify iter->pos here, since that - * indicates where we searched from - unless we got to the end of the - * btree, in that case we want iter->pos to reflect that: - */ - if (ret) - btree_iter_set_search_pos(iter, bpos_successor(next_pos)); - else - bch2_btree_iter_set_pos(iter, SPOS_MAX); - - return ret; -} - -static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) -{ - struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bpos_cmp(next_pos, POS_MIN) != 0; - - if (ret) - btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); - else - bch2_btree_iter_set_pos(iter, POS_MIN); - - return ret; -} - static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; @@ -1767,10 +1736,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); -start: - btree_iter_set_search_pos(iter, search_key); while (1) { + btree_iter_set_search_pos(iter, search_key); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1791,14 +1760,20 @@ start: if (likely(k.k)) { if (bkey_deleted(k.k)) { search_key = bkey_successor(iter, k.k->p); - goto start; + continue; } 
break; } - if (!btree_iter_set_pos_to_next_leaf(iter)) - return bkey_s_c_null; + if (unlikely(!bpos_cmp(iter->l[0].b->key.k.p, SPOS_MAX))) { + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + + /* Advance to next leaf node: */ + search_key = bpos_successor(iter->l[0].b->key.k.p); } /* @@ -1810,6 +1785,7 @@ start: else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); +out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); iter->should_be_locked = true; @@ -1834,6 +1810,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { + struct bpos search_key = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; @@ -1843,9 +1820,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, iter->pos); - while (1) { + btree_iter_set_search_pos(iter, search_key); + ret = btree_iter_traverse(iter); if (unlikely(ret)) { k = bkey_s_c_err(ret); @@ -1862,10 +1839,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (likely(k.k)) break; - if (!btree_iter_set_pos_to_prev_leaf(iter)) { + if (unlikely(!bpos_cmp(iter->l[0].b->data->min_key, POS_MIN))) { + bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; goto no_key; } + + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(iter->l[0].b->data->min_key); } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); -- cgit From 8ddef4d6ccedcd571c9b81f6cd8dff8ddcdb918a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Aug 2021 15:03:53 -0400 Subject: bcachefs: Fix a valgrind conditional jump Valgrind was complaining about a jump depending on uninitialized memory - we weren't, but this change makes the code less confusing for valgrind to follow. Signed-off-by: Kent Overstreet --- fs/bcachefs/varint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 6955ff5dc19c..e87da470c581 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -97,7 +97,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v) int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) { u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(v & 255) + 1; + unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) return -1; -- cgit From 62df3d443c389115f1664676ef75ba0fd157c2a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Aug 2021 15:29:21 -0400 Subject: bcachefs: Disk space accounting fix DIV_ROUND_UP() wasn't doing what we wanted when passing it negative numbers - fix it by just not passing it negative numbers anymore. Also, no need to do the scaling by compression ratio for incompressible data. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 71900e7e921f..e00c02dcb63e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -666,7 +666,10 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) { - return p.crc.compression_type + EBUG_ON(sectors < 0); + + return p.crc.compression_type && + p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible ? 
DIV_ROUND_UP(sectors * p.crc.compressed_size, p.crc.uncompressed_size) : sectors; @@ -929,9 +932,6 @@ static int bch2_mark_extent(struct bch_fs *c, BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -939,6 +939,9 @@ static int bch2_mark_extent(struct bch_fs *c, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = ptr_disk_sectors(sectors, p); + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, journal_seq, flags); if (ret < 0) @@ -1549,9 +1552,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -1559,6 +1559,9 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = ptr_disk_sectors(sectors, p); + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, data_type); if (ret < 0) -- cgit From 28624ba424fd73154438fd6552e6b555b3376a65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Aug 2021 16:19:28 -0400 Subject: bcachefs: Be sure to check ptr->dev in copygc pred function Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b805371fe99f..651381a5ccc5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -85,6 +85,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, BUG_ON(i != j); #endif if (i >= 0 && + p.ptr.dev == h->data[i].dev && p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && p.ptr.gen == h->data[i].gen) { /* -- cgit From fd0bd123d5bc11d665ad0e80eaf4b458619f8b3d Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 17 Aug 2021 17:14:26 -0600 Subject: bcachefs: Fix 32 bit build failures This fix replaces multiple 64 bit divisions with do_div() equivalents. Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/tests.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e00c02dcb63e..d7994229ad5a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -670,7 +670,7 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) return p.crc.compression_type && p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible - ? DIV_ROUND_UP(sectors * p.crc.compressed_size, + ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, p.crc.uncompressed_size) : sectors; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 9c7812f62935..43b514974d91 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -778,7 +778,7 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - ret = j->fn(j->c, j->nr / j->nr_threads); + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); if (ret) j->ret = ret; @@ -854,11 +854,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, scnprintf(name_buf, sizeof(name_buf), "%s:", testname); bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); + bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", name_buf, nr_buf, nr_threads, - time / NSEC_PER_SEC, - time * nr_threads / nr, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), per_sec_buf); return j.ret; } -- cgit From 8dd6ed9451894e2168328e2203d227303b472ca3 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 23 Jul 2021 13:57:19 -0600 Subject: bcachefs: add progress stats to sysfs This adds progress stats to sysfs for copygc, rebalance, recovery, and the cmd_job ioctls. Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++++ fs/bcachefs/move.c | 31 +++++++++++++++++++++++++++++++ fs/bcachefs/move.h | 4 ++++ fs/bcachefs/move_types.h | 2 ++ fs/bcachefs/movinggc.c | 3 ++- fs/bcachefs/rebalance.c | 11 +++++------ fs/bcachefs/rebalance_types.h | 1 - fs/bcachefs/recovery.c | 4 +++- fs/bcachefs/super.c | 3 +++ fs/bcachefs/sysfs.c | 40 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 94 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6a289b6f1fb4..e2aac1da18ae 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -792,6 +792,10 @@ mempool_t bio_bounce_pages; struct write_point copygc_write_point; s64 copygc_wait; + /* DATA PROGRESS STATS */ + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9a423a3e4570..8d28d8fc5395 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -686,6 +686,30 @@ out: return ret; } +inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + + scnprintf(stats->name, sizeof(stats->name), + "%s", name); +} + +static inline void progress_list_add(struct bch_fs *c, + struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static inline void progress_list_del(struct bch_fs *c, + struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} + int bch2_move_data(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, enum btree_id end_btree_id, struct bpos end_pos, @@ -698,6 +722,7 @@ int bch2_move_data(struct bch_fs *c, enum btree_id id; int ret; + progress_list_add(c, stats); closure_init_stack(&ctxt.cl); INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); @@ -731,6 +756,7 @@ int bch2_move_data(struct bch_fs *c, atomic64_read(&stats->sectors_moved), atomic64_read(&stats->keys_moved)); + progress_list_del(c, stats); 
return ret; } @@ -755,6 +781,7 @@ static int bch2_move_btree(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); + progress_list_add(c, stats); stats->data_type = BCH_DATA_btree; @@ -803,6 +830,7 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); + progress_list_del(c, stats); return ret; } @@ -944,6 +972,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: + bch_move_stats_init(stats, "rereplicate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -968,6 +997,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; + bch_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); @@ -985,6 +1015,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: + bch_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); break; default: diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 99d6acb10880..901d8f875946 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -67,4 +67,8 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); +inline void bch_move_stats_init(struct bch_move_stats *stats, + char *name); + + #endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index fc0de165af9f..9df6d18137a5 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -6,6 +6,8 @@ struct bch_move_stats { enum bch_data_type data_type; enum btree_id btree_id; struct bpos pos; + struct list_head list; + char name[32]; atomic64_t keys_moved; atomic64_t keys_raced; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 651381a5ccc5..b05dcbbd1a47 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -147,7 +147,8 @@ static int bch2_copygc(struct bch_fs *c) size_t b, heap_size = 0; int ret; - memset(&move_stats, 0, sizeof(move_stats)); + bch_move_stats_init(&move_stats, "copygc"); + /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index b7e61da0f4d1..fe0a1dbac199 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs_rebalance *r = &c->rebalance; struct io_clock *clock = &c->io_clock[WRITE]; struct rebalance_work w, p; + struct bch_move_stats move_stats; unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; @@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg) prev_start = jiffies; prev_cputime = curr_cputime(); + bch_move_stats_init(&move_stats, "rebalance"); while (!kthread_wait_freezable(r->enabled)) { cond_resched(); @@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg) prev_cputime = cputime; r->state = REBALANCE_RUNNING; - memset(&r->move_stats, 0, sizeof(r->move_stats)); + memset(&move_stats, 0, sizeof(move_stats)); rebalance_work_reset(c); bch2_move_data(c, @@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg) NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), rebalance_pred, NULL, - &r->move_stats); + &move_stats); } return 0; @@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) h1); break; 
case REBALANCE_RUNNING: - pr_buf(out, "running\n" - "pos "); - bch2_bpos_to_text(out, r->move_stats.pos); - pr_buf(out, "\n"); + pr_buf(out, "running\n"); break; } } diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 2f62a643c39f..7462a92e9598 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -19,7 +19,6 @@ struct bch_fs_rebalance { enum rebalance_state state; u64 throttled_until_iotime; unsigned long throttled_until_cputime; - struct bch_move_stats move_stats; unsigned enabled:1; }; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index afb72648fe54..b02af94f4037 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1216,7 +1216,9 @@ use_clean: if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - struct bch_move_stats stats = { 0 }; + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 11557a863d3d..1d793e554084 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -712,6 +712,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_stripe_new_list); mutex_init(&c->ec_stripe_new_lock); + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 9b1ffbf96e14..b5ce336f00ca 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -203,6 +203,8 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); +read_attribute(data_op_data_progress); + #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ @@ -239,6 +241,37 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) return nr ? 
div64_u64(sectors, nr) : 0; } +static long stats_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_move_stats *stats) +{ + pr_buf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + pr_buf(out, "%s", "\n"); + + return 0; +} + +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; + struct bch_move_stats *iter; + + mutex_lock(&c->data_progress_lock); + + if (list_empty(&c->data_progress_list)) + pr_buf(out, "%s", "no progress to report\n"); + else + list_for_each_entry(iter, &c->data_progress_list, list) { + stats_to_text(out, c, iter); + } + + mutex_unlock(&c->data_progress_lock); + return ret; +} + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); @@ -434,6 +467,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_data_op_data_progress) { + data_progress_to_text(&out, c); + return out.pos - buf; + } + return 0; } @@ -596,6 +634,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_read, &sysfs_io_timers_write, + &sysfs_data_op_data_progress, + &sysfs_internal_uuid, NULL }; -- cgit From f4ccfe07e20e52a5f6ace9ef995bca1c07a70fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Aug 2021 12:56:56 -0400 Subject: bcachefs: Fix unhandled transaction restart in bch2_gc_btree_gens() This fixes https://github.com/koverstreet/bcachefs/issues/305 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2a84685f4e60..4a3f39a619a1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1736,8 +1736,14 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) BTREE_ITER_ALL_SNAPSHOTS); while ((bch2_trans_begin(&trans), - k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { + k = bch2_btree_iter_peek(iter)).k) { + ret = bkey_err(k); + + if (ret == -EINTR) + continue; + if (ret) + break; + c->gc_gens_pos = iter->pos; if (gc_btree_gens_key(c, k) && !commit_err) { -- cgit From dc02bed6d9b8c85a8db686fecac5749fbde8cba1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Aug 2021 17:19:17 -0400 Subject: bcachefs: Free iterator if we have duplicate This helps - but does not fully fix - the outstanding "transaction iterator overflow" bugs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 21 +++++++++++++++++++-- fs/bcachefs/btree_update_leaf.c | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f38f231fb296..c97569450741 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2204,6 +2204,22 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, btree_trans_verify_sorted_refs(trans); } +static bool have_iter_at_pos(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct btree_iter *n; + + n = prev_btree_iter(trans, iter); + if (n && !btree_iter_cmp(n, iter)) + return true; + + n = next_btree_iter(trans, iter); + if (n && !btree_iter_cmp(n, iter)) + return true; + + return false; +} + int bch2_trans_iter_put(struct btree_trans *trans, struct btree_iter *iter) { @@ -2217,8 +2233,9 @@ int bch2_trans_iter_put(struct btree_trans *trans, ret = btree_iter_err(iter); - if (!(trans->iters_touched & (1ULL << iter->idx)) && - !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) + if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) && + (!(trans->iters_touched & (1ULL << iter->idx)) || + have_iter_at_pos(trans, iter))) __bch2_trans_iter_free(trans, iter->idx); trans->iters_live &= ~(1ULL << iter->idx); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 179091e4c561..cfb691fa65cf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1040,6 +1040,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS); + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ret = bch2_btree_iter_traverse(n.iter); bch2_trans_iter_put(trans, n.iter); -- cgit From 1865ccff159cce61d017d1c93e024d2f482d6a0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Aug 2021 20:31:44 -0400 Subject: bcachefs: Add SPOS_MAX to bpos_to_text() Better pretty printing ftw Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 465be5fee7d9..8efef485c6d8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -215,6 +215,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) pr_buf(out, "POS_MIN"); else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); + else if (!bpos_cmp(pos, SPOS_MAX)) + pr_buf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) pr_buf(out, "U64_MAX"); -- cgit From e6e024e9bf250e24cc8f8546a4645d7271d7f090 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Aug 2021 16:54:36 -0400 Subject: bcachefs: Ensure iter->real_pos is consistent with key returned iter->real_pos needs to match the key returned or bad things will happen when we go to update the key at that position. When we returned a pending update from btree_trans_peek_updates(), this wasn't necessarily the case. 
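The invariant being restored, written as an illustrative assertion
rather than code from the tree:

  struct bkey_s_c k = bch2_btree_iter_peek(iter);

  if (!bkey_err(k) && k.k)
          BUG_ON(bpos_cmp(iter->real_pos, k.k->p));

Whatever key peek() returns, pending update or not, iter->real_pos has
to point at it, because that is the position a following
bch2_trans_update() will operate on.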
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 84 +++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c97569450741..ee3b4098fa76 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1728,6 +1728,7 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { + struct btree_iter_level *l = &iter->l[0]; struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; @@ -1741,39 +1742,47 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) btree_iter_set_search_pos(iter, search_key); ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } - /* - * btree_iter_level_peek() mutates iter->real_pos, which - * btree_trans_peek_updates() checks against, so we have to call - * them in this order: - */ next_update = btree_trans_peek_updates(iter); - k = btree_iter_level_peek(iter, &iter->l[0]); + k = btree_iter_level_peek_all(iter, l); + + /* * In the btree, deleted keys sort before non deleted: */ + if (k.k && bkey_deleted(k.k) && + (!next_update || + bpos_cmp(k.k->p, next_update->k.p) <= 0)) { + search_key = k.k->p; + continue; + } + if (next_update && - bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { + bpos_cmp(next_update->k.p, + k.k ? k.k->p : l->b->key.k.p) <= 0) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } if (likely(k.k)) { - if (bkey_deleted(k.k)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } - - break; - } - - if (unlikely(!bpos_cmp(iter->l[0].b->key.k.p, SPOS_MAX))) { + if (likely(!bkey_deleted(k.k))) + break; + + /* Advance to next key: */ + search_key = bkey_successor(iter, k.k->p); + } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); + iter->real_pos = SPOS_MAX; k = bkey_s_c_null; goto out; } - - /* Advance to next leaf node: */ - search_key = bpos_successor(iter->l[0].b->key.k.p); } /* @@ -1784,11 +1793,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = k.k->p; else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - + iter->real_pos = k.k->p; out: + iter->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return k; } @@ -1825,8 +1834,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ret = btree_iter_traverse(iter); if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto no_key; + goto out; } k = btree_iter_level_peek(iter, l); @@ -1836,17 +1847,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) : bkey_cmp(k.k->p, iter->pos) > 0)) k = btree_iter_level_prev(iter, l); - if (likely(k.k)) + if (likely(k.k)) { break; - - if (unlikely(!bpos_cmp(iter->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = 
bpos_predecessor(l->b->data->min_key); + } else { + /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto no_key; + goto out; } - - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(iter->l[0].b->data->min_key); } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); @@ -1855,19 +1866,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; out: + iter->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return k; -no_key: - /* - * btree_iter_level_peek() may have set iter->k to a key we didn't want, and - * then we errored going to the previous leaf - make sure it's - * consistent with iter->pos: - */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - goto out; } /** -- cgit From 84841b0d1320c2084facb2f040d6ac0dfced943e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Aug 2021 21:26:43 -0400 Subject: bcachefs: bch2_dump_trans_iters_updates() This factors out bch2_dump_trans_iters_updates() from the iter alloc overflow path, and makes some small improvements to what it prints. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 ++++++++++++++--------------- fs/bcachefs/btree_iter.h | 2 ++ fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 3 ++- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ee3b4098fa76..5bb714d1525b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2256,33 +2256,30 @@ int bch2_trans_iter_free(struct btree_trans *trans, } noinline __cold -static void btree_trans_iter_alloc_fail(struct btree_trans *trans) +void bch2_dump_trans_iters_updates(struct btree_trans *trans) { - struct btree_iter *iter; struct btree_insert_entry *i; unsigned idx; - char buf[100]; + char buf1[300], buf2[100]; btree_trans_sort_iters(trans); trans_for_each_iter_inorder(trans, iter, idx) - printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", + printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n", bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), + (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1), + (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2), btree_iter_live(trans, iter) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", (void *) iter->ip_allocated); - trans_for_each_update(trans, i) { - char buf[300]; - - bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); - printk(KERN_ERR "update: btree %s %s\n", - bch2_btree_ids[i->iter->btree_id], buf); - } - panic("trans iter oveflow\n"); + trans_for_each_update(trans, i) + printk(KERN_ERR "update: btree %s %s %pS\n", + bch2_btree_ids[i->btree_id], + (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1), + (void *) i->ip_allocated); } static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, @@ -2294,8 +2291,10 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, btree_trans_verify_sorted_refs(trans); if (unlikely(trans->iters_linked == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_trans_iter_alloc_fail(trans); + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { + bch2_dump_trans_iters_updates(trans); + panic("trans iter oveflow\n"); + } idx = __ffs64(~trans->iters_linked); iter = &trans->iters[idx]; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6fb0cb8252eb..5c754d466543 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -295,6 +295,8 @@ static inline int bkey_err(struct bkey_s_c k) /* new multiple iterator interface: */ +void bch2_dump_trans_iters_updates(struct btree_trans *); + int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7a9aece2eb87..6ef3285541f2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -348,6 +348,7 @@ struct btree_insert_entry { unsigned trans_triggers_run:1; struct bkey_i *k; struct btree_iter *iter; + unsigned long ip_allocated; }; #ifndef CONFIG_LOCKDEP diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cfb691fa65cf..791c121adeb2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1012,7 +1012,8 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .btree_id = iter->btree_id, .level = iter->level, .iter = iter, - .k = k + .k = k, + .ip_allocated = _RET_IP_, }; bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; int ret = 0; -- cgit From 9f6bd30703a32c25988c6aca87c1be1baddc57bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Aug 2021 21:30:06 -0400 Subject: bcachefs: Reduce iter->trans usage Disfavoured, and should go away. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 150 ++++++++++++++++++++---------------- fs/bcachefs/btree_iter.h | 27 ++++--- fs/bcachefs/btree_key_cache.c | 4 +- fs/bcachefs/btree_locking.h | 17 ++-- fs/bcachefs/btree_update.h | 5 +- fs/bcachefs/btree_update_interior.c | 146 +++++++++++++++++------------------ fs/bcachefs/btree_update_interior.h | 14 ---- fs/bcachefs/btree_update_leaf.c | 27 +++---- fs/bcachefs/ec.c | 10 +-- fs/bcachefs/extent_update.c | 22 ++---- fs/bcachefs/extent_update.h | 8 +- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/io.c | 2 +- 14 files changed, 220 insertions(+), 216 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e42ade7cbc4b..99799d93cf09 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -498,7 +498,7 @@ void bch2_btree_init_next(struct btree_trans *trans, bch2_btree_build_aux_trees(b); if (iter && reinit_iter) - bch2_btree_iter_reinit_node(iter, b); + bch2_btree_iter_reinit_node(trans, iter, b); } static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5bb714d1525b..ef5e7e9884f5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -19,10 +19,11 @@ static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); static inline void btree_trans_sort_iters(struct btree_trans *); -static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); +static struct btree_iter *btree_iter_child_alloc(struct btree_trans *, + struct btree_iter *, unsigned long); static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, struct btree_iter *); -static void btree_iter_copy(struct btree_iter *, struct btree_iter *); +static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *); static inline int btree_iter_cmp(const struct btree_iter *l, const struct btree_iter *r) @@ -100,19 +101,21 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, /* Btree node locking: */ -void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { - bch2_btree_node_unlock_write_inlined(b, iter); + bch2_btree_node_unlock_write_inlined(trans, iter, b); } -void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +void __bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; unsigned readers = 0; EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (linked->l[b->c.level].b == b && btree_node_read_locked(linked, b->c.level)) readers++; @@ -129,7 +132,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) else this_cpu_sub(*b->c.lock.readers, readers); - btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); + btree_node_lock_type(trans->c, b, SIX_LOCK_write); if (!b->c.lock.readers) atomic64_add(__SIX_VAL(read_lock, readers), @@ -191,8 +194,9 @@ success: return true; } -static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, - unsigned long trace_ip) +static inline bool btree_iter_get_locks(struct btree_trans *trans, + struct btree_iter *iter, + bool upgrade, unsigned long trace_ip) { unsigned l = iter->level; int fail_idx = -1; @@ -206,7 +210,7 @@ static inline bool 
btree_iter_get_locks(struct btree_iter *iter, bool upgrade, : bch2_btree_node_relock(iter, l))) { (upgrade ? trace_node_upgrade_fail - : trace_node_relock_fail)(iter->trans->ip, trace_ip, + : trace_node_relock_fail)(trans->ip, trace_ip, btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, @@ -237,7 +241,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, if (iter->uptodate == BTREE_ITER_NEED_RELOCK) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_trans_verify_locks(trans); return iter->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -363,11 +367,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_locks(struct btree_iter *iter) +static void bch2_btree_iter_verify_locks(struct btree_trans *trans, + struct btree_iter *iter) { unsigned l; - if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { + if (!(trans->iters_linked & (1ULL << iter->idx))) { BUG_ON(iter->nodes_locked); return; } @@ -387,10 +392,11 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_locks(trans, iter); } #else -static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans, + struct btree_iter *iter) {} #endif /* @@ -398,13 +404,14 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} */ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; unsigned l; for (l = iter->level; l < iter->locks_want && btree_iter_node(iter, l); l++) { if (!bch2_btree_node_relock(iter, l)) { - trace_node_relock_fail(iter->trans->ip, _RET_IP_, + trace_node_relock_fail(trans->ip, _RET_IP_, btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, @@ -415,7 +422,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) ? iter->l[l].b->c.lock.state.seq : 0); btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - btree_trans_restart(iter->trans); + btree_trans_restart(trans); return false; } } @@ -424,25 +431,27 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) } __flatten -bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) +static bool bch2_btree_iter_relock(struct btree_trans *trans, + struct btree_iter *iter, unsigned long trace_ip) { - bool ret = btree_iter_get_locks(iter, false, trace_ip); + bool ret = btree_iter_get_locks(trans, iter, false, trace_ip); if (!ret) - btree_trans_restart(iter->trans); + btree_trans_restart(trans); return ret; } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) { + struct btree_trans *trans = iter->trans; struct btree_iter *linked; EBUG_ON(iter->locks_want >= new_locks_want); iter->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true, _THIS_IP_)) + if (btree_iter_get_locks(trans, iter, true, _THIS_IP_)) return true; /* @@ -464,17 +473,17 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * before interior nodes - now that's handled by * bch2_btree_iter_traverse_all(). 
*/ - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (linked != iter && btree_iter_type(linked) == btree_iter_type(iter) && linked->btree_id == iter->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true, _THIS_IP_); + btree_iter_get_locks(trans, linked, true, _THIS_IP_); } if (iter->should_be_locked) - btree_trans_restart(iter->trans); + btree_trans_restart(trans); return false; } @@ -528,7 +537,7 @@ bool bch2_trans_relock(struct btree_trans *trans) trans_for_each_iter(trans, iter) if (btree_iter_should_be_locked(iter) && - !bch2_btree_iter_relock(iter, _RET_IP_)) { + !bch2_btree_iter_relock(trans, iter, _RET_IP_)) { trace_trans_restart_relock(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos); BUG_ON(!trans->restarted); @@ -686,7 +695,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) bch2_btree_iter_verify_level(iter, i); } - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_locks(trans, iter); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -753,13 +762,14 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b, struct bkey_packed *where) { struct btree_iter *linked; - trans_for_each_iter_with_node(iter->trans, b, linked) { + trans_for_each_iter_with_node(trans, b, linked) { __bch2_btree_iter_fix_key_modified(linked, b, where); bch2_btree_iter_verify_level(linked, b->c.level); } @@ -863,7 +873,8 @@ fixup_done: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_node_iter_fix(struct btree_iter *iter, +void bch2_btree_node_iter_fix(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, struct bkey_packed *where, @@ -881,7 +892,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_iter_with_node(iter->trans, b, linked) { + trans_for_each_iter_with_node(trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); @@ -1055,12 +1066,13 @@ static inline void btree_iter_node_set(struct btree_iter *iter, * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +void bch2_btree_iter_node_replace(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { enum btree_node_locked_type t; struct btree_iter *linked; - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (btree_iter_type(linked) != BTREE_ITER_CACHED && btree_iter_pos_in_node(linked, b)) { /* @@ -1080,12 +1092,13 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) } } -void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) +void bch2_btree_iter_node_drop(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; unsigned level = b->c.level; - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (linked->l[level].b == b) { btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NO_NODE_DROP; @@ -1096,11 +1109,12 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree 
*b) * A btree node has been modified in such a way as to invalidate iterators - fix * them: */ -void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) +void bch2_btree_iter_reinit_node(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(trans, b, linked) __btree_iter_init(linked, b->c.level); } @@ -1170,9 +1184,9 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } noinline -static int btree_iter_prefetch(struct btree_iter *iter) +static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter) { - struct bch_fs *c = iter->trans->c; + struct bch_fs *c = trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; @@ -1258,19 +1272,20 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, btree_node_mem_ptr_set(iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) - ret = btree_iter_prefetch(iter); + ret = btree_iter_prefetch(trans, iter); if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); iter->level = level; - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_locks(trans, iter); err: bch2_bkey_buf_exit(&tmp, c); return ret; } -static int btree_iter_traverse_one(struct btree_iter *, unsigned long); +static int btree_iter_traverse_one(struct btree_trans *, + struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, unsigned long trace_ip) @@ -1331,7 +1346,7 @@ retry_all: EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); - ret = btree_iter_traverse_one(iter, _THIS_IP_); + ret = btree_iter_traverse_one(trans, iter, _THIS_IP_); if (ret) goto retry_all; @@ -1400,10 +1415,10 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter, +static int btree_iter_traverse_one(struct btree_trans *trans, + struct btree_iter *iter, unsigned long trace_ip) { - struct btree_trans *trans = iter->trans; unsigned l, depth_want = iter->level; int ret = 0; @@ -1412,7 +1427,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, * and re-traverse the iterator without a transaction restart: */ if (iter->should_be_locked) { - ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR; + ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 
0 : -EINTR; goto out; } @@ -1488,7 +1503,7 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) int ret; ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter, _RET_IP_); + btree_iter_traverse_one(trans, iter, _RET_IP_); if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); BUG_ON(ret == -EINTR); @@ -1619,20 +1634,21 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; #ifdef CONFIG_BCACHEFS_DEBUG struct bpos old_pos = iter->real_pos; #endif int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; - EBUG_ON(iter->trans->restarted); + EBUG_ON(trans->restarted); if (!cmp) goto out; iter->real_pos = new_pos; iter->should_be_locked = false; - iter->trans->iters_sorted = false; + trans->iters_sorted = false; if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); @@ -1666,7 +1682,7 @@ out: bch2_btree_iter_verify(iter); #ifdef CONFIG_BCACHEFS_DEBUG - trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, + trace_iter_set_search_pos(trans->ip, _RET_IP_, iter->btree_id, &old_pos, &new_pos, l); #endif @@ -1886,6 +1902,7 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key; struct bkey_s_c k; int ret; @@ -1954,9 +1971,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter *child = - btree_iter_child_alloc(iter, _THIS_IP_); + btree_iter_child_alloc(trans, iter, _THIS_IP_); - btree_iter_copy(child, iter); + btree_iter_copy(trans, child, iter); k = bch2_btree_iter_peek(child); if (k.k && !bkey_err(k)) @@ -2163,21 +2180,21 @@ static inline void btree_iter_list_add(struct btree_trans *trans, btree_trans_verify_sorted_refs(trans); } -static void btree_iter_child_free(struct btree_iter *iter) +static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_iter *child = btree_iter_child(iter); + struct btree_iter *child = btree_iter_child(trans, iter); if (child) { - bch2_trans_iter_free(iter->trans, child); + bch2_trans_iter_free(trans, child); iter->child_idx = U8_MAX; } } -static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, +static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans, + struct btree_iter *iter, unsigned long ip) { - struct btree_trans *trans = iter->trans; - struct btree_iter *child = btree_iter_child(iter); + struct btree_iter *child = btree_iter_child(trans, iter); if (!child) { child = btree_trans_iter_alloc(trans, iter); @@ -2194,7 +2211,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { - btree_iter_child_free(&trans->iters[idx]); + btree_iter_child_free(trans, &trans->iters[idx]); btree_iter_list_remove(trans, &trans->iters[idx]); @@ -2312,12 +2329,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, return iter; } -static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, + struct btree_iter *src) { unsigned i, offset = offsetof(struct btree_iter, flags); 
__bch2_btree_iter_unlock(dst); - btree_iter_child_free(dst); + btree_iter_child_free(trans, dst); memcpy((void *) dst + offset, (void *) src + offset, @@ -2330,7 +2348,7 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; - dst->trans->iters_sorted = false; + trans->iters_sorted = false; } struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, @@ -2388,7 +2406,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, bch2_btree_iter_init(trans, iter, btree_id); } else if (btree_iter_keep(trans, best)) { iter = btree_trans_iter_alloc(trans, best); - btree_iter_copy(iter, best); + btree_iter_copy(trans, iter, best); } else { iter = best; } @@ -2411,7 +2429,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > iter->locks_want) { iter->locks_want = locks_want; - btree_iter_get_locks(iter, true, _THIS_IP_); + btree_iter_get_locks(trans, iter, true, _THIS_IP_); } while (iter->level != depth) { @@ -2464,12 +2482,12 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, } struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *src) + struct btree_iter *src) { struct btree_iter *iter; iter = btree_trans_iter_alloc(trans, src); - btree_iter_copy(iter, src); + btree_iter_copy(trans, iter, src); trans->iters_live |= 1ULL << iter->idx; /* @@ -2647,7 +2665,7 @@ int bch2_trans_exit(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - btree_iter_child_free(iter); + btree_iter_child_free(trans, iter); } if (trans->iters_live) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 5c754d466543..ea129387ebb7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -135,14 +135,13 @@ static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif -void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, - struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_packed *, - unsigned, unsigned); +void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, struct btree_iter *, + struct btree *, struct bkey_packed *); +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); bool bch2_btree_iter_relock_intent(struct btree_iter *); -bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); @@ -179,10 +178,13 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) void bch2_trans_downgrade(struct btree_trans *); -void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); +void bch2_btree_iter_node_replace(struct btree_trans *trans, + struct btree_iter *, struct btree *); +void bch2_btree_iter_node_drop(struct btree_trans *, + struct btree_iter *, struct btree *); -void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); +void bch2_btree_iter_reinit_node(struct btree_trans *, + struct btree_iter *, struct btree *); int __must_check bch2_btree_iter_traverse(struct 
btree_iter *); @@ -226,9 +228,10 @@ static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, un return idx != U8_MAX ? trans->iters + idx : NULL; } -static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +static inline struct btree_iter *btree_iter_child(struct btree_trans *trans, + struct btree_iter *iter) { - return idx_to_btree_iter(iter->trans, iter->child_idx); + return idx_to_btree_iter(trans, iter->child_idx); } /* @@ -319,7 +322,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, } struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, - struct btree_iter *); + struct btree_iter *); static inline struct btree_iter * bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ac8f40810d7a..c7d223f91bf6 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -238,7 +238,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, * XXX: not allowed to be holding read locks when we take a write lock, * currently */ - bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); + bch2_btree_node_lock_write(trans, ck_iter, ck_iter->l[0].b); if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -247,7 +247,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, bkey_reassemble(ck->k, k); ck->valid = true; - bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); + bch2_btree_node_unlock_write(trans, ck_iter, ck_iter->l[0].b); /* We're not likely to need this iterator again: */ set_btree_iter_dontneed(trans, iter); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index fda164802154..0acc731df8e9 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -207,30 +207,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, * succeed: */ static inline void -bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) { struct btree_iter *linked; EBUG_ON(iter->l[b->c.level].b != b); EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(trans, b, linked) linked->l[b->c.level].lock_seq += 2; six_unlock_write(&b->c.lock); } -void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_iter *, struct btree *); -void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); +void __bch2_btree_node_lock_write(struct btree_trans *, + struct btree_iter *, struct btree *); -static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +static inline void bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { EBUG_ON(iter->l[b->c.level].b != b); EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(b, iter); + __bch2_btree_node_lock_write(trans, iter, b); } #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 217b52e1a168..5707baf10262 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -10,8 +10,9 @@ struct btree; void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, struct 
btree *); -bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_i *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2594738f3d53..4acd49900611 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,6 +25,7 @@ static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, struct btree_iter *, struct btree *, struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); /* Debug code: */ @@ -159,27 +160,14 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); } -void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) -{ - struct open_buckets ob = b->ob; - - b->ob.nr = 0; - - clear_btree_node_dirty(c, b); - - btree_node_lock_type(c, b, SIX_LOCK_write); - __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - - bch2_open_buckets_put(c, &ob); -} - -void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { + struct bch_fs *c = trans->c; struct btree_iter *linked; - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) BUG_ON(linked->l[b->c.level].b == b); six_lock_write(&b->c.lock, NULL, NULL); @@ -773,7 +761,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) * And it adds @b to the list of @as's new nodes, so that we can update sector * counts in bch2_btree_update_nodes_written: */ -void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) +static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -827,7 +815,7 @@ found: closure_put(&as->cl); } -void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) +static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) { while (b->ob.nr) as->open_buckets[as->nr_open_buckets++] = @@ -839,7 +827,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b * nodes and thus outstanding btree_updates - redirect @b's * btree_updates to point to this btree_update: */ -void bch2_btree_interior_update_will_free_node(struct btree_update *as, +static void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -911,7 +899,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, as->nr_old_nodes++; } -void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); @@ -925,11 +913,10 @@ void bch2_btree_update_done(struct btree_update *as) as->c->btree_interior_update_worker); } -struct btree_update * -bch2_btree_update_start(struct btree_iter *iter, unsigned level, - unsigned nr_nodes, unsigned flags) +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_iter *iter, + unsigned level, unsigned nr_nodes, unsigned flags) { - struct btree_trans *trans = iter->trans; struct bch_fs 
*c = trans->c; struct btree_update *as; struct closure cl; @@ -1092,8 +1079,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) * is nothing new to be done. This just guarantees that there is a * journal write. */ -static void bch2_btree_set_root(struct btree_update *as, struct btree *b, - struct btree_iter *iter) +static void bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { struct bch_fs *c = as->c; struct btree *old; @@ -1108,7 +1097,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(old, iter); + bch2_btree_node_lock_write(trans, iter, old); bch2_btree_set_root_inmem(c, b); @@ -1121,15 +1110,17 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * an intent lock on the new root, and any updates that would * depend on the new root would have to update the new root. */ - bch2_btree_node_unlock_write(old, iter); + bch2_btree_node_unlock_write(trans, iter, old); } /* Interior node updates: */ -static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - struct btree_node_iter *node_iter) + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) { struct bch_fs *c = as->c; struct bkey_packed *k; @@ -1161,15 +1152,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); - bch2_btree_bset_insert_key(iter, b, node_iter, insert); + bch2_btree_bset_insert_key(trans, iter, b, node_iter, insert); set_btree_node_dirty(c, b); set_btree_node_need_write(b); } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - struct btree_node_iter node_iter) +__bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1181,8 +1175,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ; while (!bch2_keylist_empty(keys)) { - bch2_insert_fixup_btree_ptr(as, b, iter, - bch2_keylist_front(keys), &node_iter); + bch2_insert_fixup_btree_ptr(as, trans, iter, b, + &node_iter, bch2_keylist_front(keys)); bch2_keylist_pop_front(keys); } } @@ -1308,8 +1302,10 @@ static struct btree *__btree_split_node(struct btree_update *as, * nodes that were coalesced, and thus in the middle of a child node post * coalescing: */ -static void btree_split_insert_keys(struct btree_update *as, struct btree *b, +static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct keylist *keys) { struct btree_node_iter node_iter; @@ -1319,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); + __bch2_btree_insert_keys_interior(as, trans, iter, b, node_iter, keys); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1368,7 +1364,7 
@@ static void btree_split(struct btree_update *as, bch2_btree_update_add_new_node(as, n1); if (keys) - btree_split_insert_keys(as, n1, iter, keys); + btree_split_insert_keys(as, trans, iter, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); @@ -1398,7 +1394,7 @@ static void btree_split(struct btree_update *as, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, n3, iter, &as->parent_keys); + btree_split_insert_keys(as, trans, iter, n3, &as->parent_keys); bch2_btree_node_write(c, n3, SIX_LOCK_intent); } @@ -1420,10 +1416,10 @@ static void btree_split(struct btree_update *as, /* Split a non root node */ bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); } else if (n3) { - bch2_btree_set_root(as, n3, iter); + bch2_btree_set_root(as, trans, iter, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, n1, iter); + bch2_btree_set_root(as, trans, iter, n1); } bch2_btree_update_get_open_buckets(as, n1); @@ -1435,12 +1431,12 @@ static void btree_split(struct btree_update *as, /* Successful split, update the iterator to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(trans, iter, b); if (n3) - bch2_btree_iter_node_replace(iter, n3); + bch2_btree_iter_node_replace(trans, iter, n3); if (n2) - bch2_btree_iter_node_replace(iter, n2); - bch2_btree_iter_node_replace(iter, n1); + bch2_btree_iter_node_replace(trans, iter, n2); + bch2_btree_iter_node_replace(trans, iter, n1); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1448,7 +1444,7 @@ static void btree_split(struct btree_update *as, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(trans, iter, b); if (n3) six_unlock_intent(&n3->c.lock); @@ -1463,19 +1459,23 @@ static void btree_split(struct btree_update *as, } static void -bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, + struct keylist *keys) { struct btree_iter *linked; - __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); + __bch2_btree_insert_keys_interior(as, trans, iter, b, + iter->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_trans_verify_iters(iter->trans, b); + bch2_btree_trans_verify_iters(trans, b); } /** @@ -1509,13 +1509,13 @@ static void bch2_btree_insert_node(struct btree_update *as, bch2_btree_node_lock_for_insert(trans, iter, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, iter, b); goto split; } btree_node_interior_verify(c, b); - bch2_btree_insert_keys_interior(as, b, iter, keys); + bch2_btree_insert_keys_interior(as, trans, iter, b, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1527,9 +1527,9 @@ static void bch2_btree_insert_node(struct btree_update *as, if (u64s_added > live_u64s_added && 
bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_btree_iter_reinit_node(trans, iter, b); - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, iter, b); btree_node_interior_verify(c, b); return; @@ -1547,7 +1547,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned l; int ret = 0; - as = bch2_btree_update_start(iter, iter->level, + as = bch2_btree_update_start(trans, iter, iter->level, btree_update_reserve_required(c, b), flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1660,7 +1660,7 @@ retry: goto out; parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, level, + as = bch2_btree_update_start(trans, iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| @@ -1702,15 +1702,15 @@ retry: six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&m->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_drop(iter, m); + bch2_btree_iter_node_drop(trans, iter, b); + bch2_btree_iter_node_drop(trans, iter, m); - bch2_btree_iter_node_replace(iter, n); + bch2_btree_iter_node_replace(trans, iter, n); bch2_btree_trans_verify_iters(trans, n); - bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_node_free_inmem(trans, iter, b); + bch2_btree_node_free_inmem(trans, iter, m); six_unlock_intent(&n->c.lock); @@ -1762,7 +1762,7 @@ retry: goto out; parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, b->c.level, + as = bch2_btree_update_start(trans, iter, b->c.level, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, @@ -1792,15 +1792,15 @@ retry: bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); } else { - bch2_btree_set_root(as, n, iter); + bch2_btree_set_root(as, trans, iter, n); } bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_replace(iter, n); - bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_iter_node_drop(trans, iter, b); + bch2_btree_iter_node_replace(trans, iter, n); + bch2_btree_node_free_inmem(trans, iter, b); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); @@ -1931,7 +1931,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (ret) goto err; - bch2_btree_node_lock_write(b, iter); + bch2_btree_node_lock_write(trans, iter, b); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1946,7 +1946,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, iter, b); out: bch2_trans_iter_put(trans, iter2); return ret; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e88e737ee813..07046dab614b 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -113,24 +113,10 @@ struct btree_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, - struct btree_iter *); -void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); - -void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); - struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -void bch2_btree_update_done(struct btree_update *); -struct btree_update * -bch2_btree_update_start(struct btree_iter 
*, unsigned, unsigned, unsigned); - -void bch2_btree_interior_update_will_free_node(struct btree_update *, - struct btree *); -void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 791c121adeb2..b32c8f14823a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -42,14 +42,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, { struct bch_fs *c = trans->c; - bch2_btree_node_lock_write(b, iter); + bch2_btree_node_lock_write(trans, iter, b); if (btree_iter_type(iter) == BTREE_ITER_CACHED) return; if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_btree_iter_reinit_node(trans, iter, b); /* * If the last bset has been written, or if it's gotten too big - start @@ -62,7 +62,8 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_iter *iter, +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, struct bkey_i *insert) @@ -76,7 +77,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(iter->trans->c, b)); + bch_btree_keys_u64s_remaining(trans->c, b)); EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -96,7 +97,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, insert->k.p); + push_whiteout(trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -104,7 +105,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); goto fix_iter; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_iter_fix_key_modified(trans, iter, b, k); } return true; @@ -122,7 +123,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = k->u64s; goto overwrite; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_iter_fix_key_modified(trans, iter, b, k); } } @@ -132,7 +133,7 @@ overwrite: new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(iter, b, node_iter, k, + bch2_btree_node_iter_fix(trans, iter, b, node_iter, k, clobber_u64s, new_u64s); return true; } @@ -190,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, EBUG_ON(!iter->level && !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(iter, b, + if (unlikely(!bch2_btree_bset_insert_key(trans, iter, b, &iter_l(iter)->iter, insert))) return false; @@ -212,7 +213,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_btree_iter_reinit_node(trans, iter, b); trace_btree_insert_key(c, b, insert); return true; @@ -610,8 +611,8 @@ static inline int 
do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, - i->iter); + bch2_btree_node_unlock_write_inlined(trans, i->iter, + iter_l(i->iter)->b); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -1178,7 +1179,7 @@ retry: bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - ret = bch2_extent_trim_atomic(&delete, iter); + ret = bch2_extent_trim_atomic(trans, iter, &delete); if (ret) break; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c0855245f2ec..c20c80bd344d 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -552,19 +552,19 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) return 0; } -static int ec_stripe_mem_alloc(struct bch_fs *c, +static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { size_t idx = iter->pos.offset; int ret = 0; - if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; - bch2_trans_unlock(iter->trans); + bch2_trans_unlock(trans); ret = -EINTR; - if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) return ret; return -ENOMEM; @@ -735,7 +735,7 @@ retry: found_slot: start_pos = iter->pos; - ret = ec_stripe_mem_alloc(c, iter); + ret = ec_stripe_mem_alloc(&trans, iter); if (ret) goto err; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 4a8dd085f7fb..93d55f46233f 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -94,11 +94,11 @@ static int count_iters_for_insert(struct btree_trans *trans, #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -int bch2_extent_atomic_end(struct btree_iter *iter, +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { - struct btree_trans *trans = iter->trans; struct btree_iter *copy; struct bkey_s_c k; unsigned nr_iters = 0; @@ -153,27 +153,17 @@ int bch2_extent_atomic_end(struct btree_iter *iter, return ret < 0 ? 
ret : 0; } -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) { struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter, k, &end); + ret = bch2_extent_atomic_end(trans, iter, k, &end); if (ret) return ret; bch2_cut_back(end, k); return 0; } - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 2fa4602967e0..6f5cf449361a 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -4,9 +4,9 @@ #include "bcachefs.h" -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9ac10b72d1cf..251029c33164 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2576,7 +2576,7 @@ reassemble: copy.k->k.p.offset += shift >> 9; bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); - ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); + ret = bch2_extent_atomic_end(&trans, dst, copy.k, &atomic_end); if (ret) continue; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 30d9b6e4abf7..27f6b3245741 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -280,7 +280,7 @@ int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, iter); + ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; -- cgit From 638c6ff951bfebbecde5b6912ce22a02a77fafc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Aug 2021 01:03:25 -0400 Subject: bcachefs: Refactor bch2_trans_update_extent() This consolidates the code for doing extent updates, and makes the btree iterator usage a bit cleaner and more efficient. 
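After this refactoring, extent updates take one dedicated path instead of being special-cased inside bch2_trans_update(); a simplified sketch of the new dispatch (the full bch2_trans_update_extent() is in the patch below):

    int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                          struct bkey_i *k, enum btree_update_flags flags)
    {
            if (iter->flags & BTREE_ITER_IS_EXTENTS)
                    return bch2_trans_update_extent(trans, iter, k, flags);

            /* non-extent path continues as before */
            ...
    }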
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 313 ++++++++++++++++++++-------------------- 1 file changed, 156 insertions(+), 157 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b32c8f14823a..1b4945145428 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -739,136 +739,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct btree_insert_entry *i) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - bkey_reassemble(update, k); - - if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { - struct btree_iter *update_iter = - bch2_trans_copy_iter(trans, iter); - - ret = bch2_btree_delete_at(trans, update_iter, i->flags); - bch2_trans_iter_put(trans, update_iter); - - if (ret) - return ret; - - i->k = update; - } - - return 0; -} - -static int extent_handle_overwrites(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter, *update_iter; - struct bpos start = bkey_start_pos(&i->k->k); - struct bkey_i *update; - struct bkey_s_c k; - int ret = 0, compressed_sectors; - - iter = bch2_trans_get_iter(trans, i->btree_id, start, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(iter); - if (!k.k || (ret = bkey_err(k))) - goto out; - - if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) { - if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { - ret = extent_front_merge(trans, iter, k, i); - if (ret) - goto out; - } - - goto next; - } - - while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && - bkey_cmp(k.k->p, i->k->k.p) > 0 && - (compressed_sectors = bch2_bkey_sectors_compressed(k))) - trans->extra_journal_res += compressed_sectors; - - if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto out; - - bkey_reassemble(update, k); - - bch2_cut_back(start, update); - - update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(update_iter); - if (ret) { - bch2_trans_iter_put(trans, update_iter); - goto out; - } - - bch2_trans_update(trans, update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - i->flags); - bch2_trans_iter_put(trans, update_iter); - } - - if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - ret = bch2_btree_delete_at(trans, update_iter, - i->flags); - bch2_trans_iter_put(trans, update_iter); - - if (ret) - goto out; - } - - if (bkey_cmp(k.k->p, i->k->k.p) > 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto out; - - bkey_reassemble(update, k); - bch2_cut_front(i->k->k.p, update); - - bch2_trans_update(trans, iter, update, i->flags); - goto out; - } -next: - k = bch2_btree_iter_next(iter); - if (!k.k || (ret = bkey_err(k))) - goto out; - } - - if (bch2_bkey_maybe_mergable(&i->k->k, k.k)) - bch2_bkey_merge(c, 
bkey_i_to_s(i->k), k); -out: - bch2_trans_iter_put(trans, iter); - - return ret; -} - int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; @@ -1004,6 +874,157 @@ err: goto retry; } +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) { + struct btree_iter *update_iter = + bch2_trans_copy_iter(trans, iter); + + ret = bch2_btree_delete_at(trans, update_iter, flags); + bch2_trans_iter_put(trans, update_iter); + + if (ret) + return ret; + + *insert = update; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter, *update_iter; + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; + + orig_iter->pos_after_commit = insert->k.p; + orig_iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + + iter = bch2_trans_get_iter(trans, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, iter, k, &insert, flags); + if (ret) + goto out; + } + + goto next; + } + + if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) + goto next; + + while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && + bkey_cmp(k.k->p, insert->k.p) > 0 && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_back(start, update); + + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(update_iter) ?: + bch2_trans_update(trans, update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; + } + + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + ret = bch2_btree_delete_at(trans, update_iter, + flags); + bch2_trans_iter_put(trans, update_iter); + + if (ret) + goto err; + } + + if (bkey_cmp(k.k->p, insert->k.p) > 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + + update_iter = bch2_trans_copy_iter(trans, iter); + bch2_trans_update(trans, update_iter, update, flags); + bch2_trans_iter_put(trans, update_iter); + goto out; + } +next: + k = 
bch2_btree_iter_next(iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) + bch2_bkey_merge(c, bkey_i_to_s(insert), k); +out: + if (!bkey_deleted(&insert->k)) { + bch2_btree_iter_set_pos(iter, insert->k.p); + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, insert, flags); + } else { + set_btree_iter_dontneed(trans, iter); + } +err: + bch2_trans_iter_put(trans, iter); + + return ret; +} + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1016,41 +1037,19 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .k = k, .ip_allocated = _RET_IP_, }; - bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; - int ret = 0; - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); BUG_ON(!iter->should_be_locked); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(i != trans->updates && btree_insert_entry_cmp(i - 1, i) >= 0); #endif - - if (is_extent) { - ret = extent_handle_overwrites(trans, &n); - if (ret) - return ret; - - iter->pos_after_commit = k->k.p; - iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; - - if (bkey_deleted(&n.k->k)) - return 0; - - n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - ret = bch2_btree_iter_traverse(n.iter); - bch2_trans_iter_put(trans, n.iter); - - if (ret) - return ret; - } - - BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(bpos_cmp(n.k->k.p, n.iter->real_pos)); n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -- cgit From 5f8077cca89bdcc3d7660567462b17831356826d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Aug 2021 19:34:37 -0400 Subject: bcachefs: Kill BTREE_ITER_SET_POS_AFTER_COMMIT BTREE_ITER_SET_POS_AFTER_COMMIT is used internally to automagically advance extent btree iterators on successful commit. But with the upcoming btree_path patch it's getting more awkward to support, and it adds overhead to core data structures that's only used in a few places, and can be easily done by the caller instead.
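For illustration, the caller-side replacement looks roughly like this - a minimal sketch, where update_and_advance() is a hypothetical helper name and the real conversions are the ec.c, io.c and move.c hunks below: remember the position the extent ends at, commit, and only then move the iterator forward by hand.

	static int update_and_advance(struct btree_trans *trans,
				      struct btree_iter *iter,
				      struct bkey_i *k)
	{
		/* remember where this extent ends before committing: */
		struct bpos next_pos = k->k.p;
		int ret;

		ret = bch2_trans_update(trans, iter, k, 0) ?:
		      bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);

		/*
		 * on success, advance the iterator ourselves instead of
		 * relying on BTREE_ITER_SET_POS_AFTER_COMMIT:
		 */
		if (!ret)
			bch2_btree_iter_set_pos(iter, next_pos);

		return ret;
	}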
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +--- fs/bcachefs/btree_types.h | 10 ++++------ fs/bcachefs/btree_update_leaf.c | 9 --------- fs/bcachefs/ec.c | 5 +++++ fs/bcachefs/io.c | 5 +++++ fs/bcachefs/move.c | 9 +++++++-- 6 files changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ef5e7e9884f5..70995d61dd49 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2347,7 +2347,6 @@ static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, __btree_lock_want(dst, i)); dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; trans->iters_sorted = false; } @@ -2564,8 +2563,7 @@ void bch2_trans_begin(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| - BTREE_ITER_SET_POS_AFTER_COMMIT); + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; /* * XXX: we shouldn't be doing this if the transaction was restarted, but diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6ef3285541f2..577cc57174fa 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -211,11 +211,10 @@ enum btree_iter_type { #define BTREE_ITER_IS_EXTENTS (1 << 6) #define BTREE_ITER_NOT_EXTENTS (1 << 7) #define BTREE_ITER_ERROR (1 << 8) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) -#define BTREE_ITER_CACHED_NOFILL (1 << 10) -#define BTREE_ITER_CACHED_NOCREATE (1 << 11) -#define BTREE_ITER_WITH_UPDATES (1 << 12) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_CACHED_NOFILL (1 << 9) +#define BTREE_ITER_CACHED_NOCREATE (1 << 10) +#define BTREE_ITER_WITH_UPDATES (1 << 11) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -256,7 +255,6 @@ struct btree_iter { struct bpos pos; struct bpos real_pos; - struct bpos pos_after_commit; enum btree_id btree_id:4; enum btree_iter_uptodate uptodate:3; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1b4945145428..2409696dc63f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -742,7 +742,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; - struct btree_iter *iter; bool trans_trigger_run; unsigned u64s; int ret = 0; @@ -840,11 +839,6 @@ retry: if (ret) goto err; - - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); @@ -920,9 +914,6 @@ static int bch2_trans_update_extent(struct btree_trans *trans, enum btree_id btree_id = orig_iter->btree_id; int ret = 0, compressed_sectors; - orig_iter->pos_after_commit = insert->k.p; - orig_iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; - iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES| diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c20c80bd344d..7ad74987757f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -824,6 +824,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey_s_c k; struct bkey_s_extent e; struct bkey_buf sk; + struct bpos next_pos; int ret = 0, dev, block; bch2_bkey_buf_init(&sk); @@ -863,10 +864,14 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, 
extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + next_pos = sk.k->k.p; + ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(&trans, iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + if (!ret) + bch2_btree_iter_set_pos(iter, next_pos); if (ret == -EINTR) ret = 0; if (ret) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 27f6b3245741..34295419190d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -276,6 +276,7 @@ int bch2_extent_update(struct btree_trans *trans, { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; + struct bpos next_pos; bool extending = false, usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -354,6 +355,8 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } + next_pos = k->k.p; + ret = bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| @@ -362,6 +365,8 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; + bch2_btree_iter_set_pos(iter, next_pos); + if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; return 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8d28d8fc5395..c804af8b81de 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -79,6 +79,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_i_extent *new; const union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bpos next_pos; bool did_work = false; bool extending = false, should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -162,14 +163,18 @@ int bch2_migrate_index_update(struct bch_write_op *op) goto out; } + next_pos = insert->k.p; + ret = bch2_trans_update(&trans, iter, insert, 0) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); -err: - if (!ret) + if (!ret) { + bch2_btree_iter_set_pos(iter, next_pos); atomic_long_inc(&c->extent_migrate_done); + } +err: if (ret == -EINTR) ret = 0; if (ret) -- cgit From 05046a962f0cdfbeec91d64714df84456ce09a1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Aug 2021 20:55:44 -0400 Subject: bcachefs: Better algorithm for btree node merging in write path The existing algorithm was O(n^2) in the number of updates in the commit. 
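In outline, the new code makes a single pass over the commit's updates, which are kept sorted so that updates to the same leaf are adjacent: it accumulates the net change in key u64s for the current leaf and attempts a foreground merge only once per leaf, after the last update touching that leaf has been processed. A condensed sketch of the shape of that loop (assuming plain BTREE_ITER_KEYS updates; the hunk below is the real version, which also skips non-BTREE_ITER_KEYS iterators):

	trans_for_each_update(trans, i) {
		struct bkey_s_c old = bch2_btree_iter_peek_slot(i->iter);

		ret = bkey_err(old);
		if (ret)
			return ret;

		/* net u64s this commit adds to the current leaf: */
		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
		u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;

		if (!same_leaf_as_next(trans, i)) {
			/* last update to this leaf - try a merge if it shrank: */
			if (u64s_delta <= 0) {
				ret = bch2_foreground_maybe_merge(trans, i->iter,
						i->iter->level, trans->flags);
				if (ret)
					return ret;
			}

			u64s_delta = 0;
		}
	}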
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 79 ++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2409696dc63f..92b6b5cec2ae 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -36,6 +36,13 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates && + iter_l(i[0].iter)->b == iter_l(i[1].iter)->b; +} + inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, struct btree_iter *iter, struct btree *b) @@ -486,44 +493,6 @@ err: return ret; } -static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree_insert_entry *i; - struct btree *b = iter_l(iter)->b; - struct bkey_s_c old; - int u64s_delta = 0; - int ret; - - /* - * Inserting directly into interior nodes is an uncommon operation with - * various weird edge cases: also, a lot of things about - * BTREE_ITER_NODES iters need to be audited - */ - if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) - return 0; - - BUG_ON(iter->level); - - trans_for_each_update(trans, i) { - if (iter_l(i->iter)->b != b) - continue; - - old = bch2_btree_iter_peek_slot(i->iter); - ret = bkey_err(old); - if (ret) - return ret; - - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; - } - - if (u64s_delta > 0) - return 0; - - return bch2_foreground_maybe_merge(trans, iter, - iter->level, trans->flags); -} - /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -534,22 +503,34 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *iter; - int ret; + struct bkey_s_c old; + int ret, u64s_delta = 0; trans_for_each_update(trans, i) { - struct btree *b; + /* + * peek_slot() doesn't work on a BTREE_ITER_NODES iter; those + * iterator types should probably go away + */ + if (btree_iter_type(i->iter) != BTREE_ITER_KEYS) + continue; - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + old = bch2_btree_iter_peek_slot(i->iter); + ret = bkey_err(old); + if (unlikely(ret)) + return ret; - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) - continue; + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->iter, + i->iter->level, trans->flags); + if (unlikely(ret)) + return ret; + } - b = iter_l(i->iter)->b; - if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || - b->sib_u64s[1] < c->btree_foreground_merge_threshold) { - ret = maybe_do_btree_merge(trans, i->iter); - if (unlikely(ret)) - return ret; + u64s_delta = 0; } } -- cgit From 78cf784eaac1ebfc8982e815618b0fcc5927fafb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 14:22:43 -0400 Subject: bcachefs: Further reduce iter->trans usage This is prep work for splitting btree_path out from btree_iter - btree_path will not have a pointer to btree_trans. 
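The conversion is mostly mechanical: a helper that used to reach the transaction through iter->trans now takes it as an explicit argument, so that a future btree_path can drop the back-pointer entirely. Roughly, taking bch2_btree_node_relock() from the hunks below as the example:

	/* before: the iterator carried a back-pointer to its transaction */
	bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level);

	/* after: the caller passes the transaction explicitly */
	bool bch2_btree_node_relock(struct btree_trans *trans,
				    struct btree_iter *iter, unsigned level);

Call sites change accordingly, e.g. bch2_btree_node_relock(iter, level) becomes bch2_btree_node_relock(trans, iter, level).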
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 40 +++++----- fs/bcachefs/btree_cache.h | 5 +- fs/bcachefs/btree_iter.c | 155 ++++++++++++++++++------------------ fs/bcachefs/btree_iter.h | 10 ++- fs/bcachefs/btree_key_cache.c | 13 ++- fs/bcachefs/btree_key_cache.h | 2 +- fs/bcachefs/btree_locking.h | 30 ++++--- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/btree_update_leaf.c | 5 +- fs/bcachefs/recovery.c | 4 +- 11 files changed, 136 insertions(+), 132 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5c12897964b6..354c75f59730 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -639,6 +639,7 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_trans *trans, struct btree_iter *iter, const struct bkey_i *k, enum btree_id btree_id, @@ -655,8 +656,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (iter && !bch2_btree_node_relock(iter, level + 1)) { - btree_trans_restart(iter->trans); + if (trans && !bch2_btree_node_relock(trans, iter, level + 1)) { + btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -687,23 +688,23 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (iter && sync) - bch2_trans_unlock(iter->trans); + if (trans && sync) + bch2_trans_unlock(trans); bch2_btree_node_read(c, b, sync); if (!sync) return NULL; - if (iter && - (!bch2_trans_relock(iter->trans) || - !bch2_btree_iter_relock_intent(iter))) { - BUG_ON(!iter->trans->restarted); + if (trans && + (!bch2_trans_relock(trans) || + !bch2_btree_iter_relock_intent(trans, iter))) { + BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - btree_trans_restart(iter->trans); + btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -786,7 +787,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + b = bch2_btree_node_fill(c, trans, iter, k, iter->btree_id, level, lock_type, true); /* We raced and found the btree node in the cache */ @@ -828,7 +829,7 @@ lock_node: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + if (!btree_node_lock(trans, iter, b, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { if (!trans->restarted) goto retry; @@ -839,7 +840,7 @@ lock_node: b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(iter, level + 1)) + if (bch2_btree_node_relock(trans, iter, level + 1)) goto retry; trace_trans_restart_btree_node_reused(trans->ip, @@ -863,9 +864,9 @@ lock_node: * should_be_locked is not set on this iterator yet, so we need * to relock it specifically: */ - if (iter && + if (trans && (!bch2_trans_relock(trans) || - !bch2_btree_iter_relock_intent(iter))) { + !bch2_btree_iter_relock_intent(trans, iter))) { BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } @@ -924,7 +925,7 @@ retry: if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, k, btree_id, + b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in 
the cache */ @@ -982,21 +983,24 @@ out: return b; } -int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_node_prefetch(struct bch_fs *c, + struct btree_trans *trans, + struct btree_iter *iter, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(iter && !btree_node_locked(iter, level + 1)); + BUG_ON(trans && !btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return 0; - b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + b = bch2_btree_node_fill(c, trans, iter, k, btree_id, + level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index eb57dc3c70b7..3b671cf0056d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -27,8 +27,9 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, - const struct bkey_i *, enum btree_id, unsigned); +int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, + struct btree_iter *, const struct bkey_i *, + enum btree_id, unsigned); void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 70995d61dd49..16b9f6a986f4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -107,17 +107,14 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, bch2_btree_node_unlock_write_inlined(trans, iter, b); } -void __bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) +void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_iter *iter; unsigned readers = 0; - EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); - - trans_for_each_iter(trans, linked) - if (linked->l[b->c.level].b == b && - btree_node_read_locked(linked, b->c.level)) + trans_for_each_iter(trans, iter) + if (iter->l[b->c.level].b == b && + btree_node_read_locked(iter, b->c.level)) readers++; /* @@ -141,7 +138,8 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, this_cpu_add(*b->c.lock.readers, readers); } -bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_iter *iter, unsigned level) { struct btree *b = btree_iter_node(iter, level); int want = __btree_lock_want(iter, level); @@ -154,7 +152,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter->trans, b, level, want))) { + btree_node_lock_increment(trans, b, level, want))) { mark_btree_node_locked(iter, level, want); return true; } else { @@ -162,7 +160,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) } } -static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) +static bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_iter *iter, unsigned level) { struct btree *b = iter->l[level].b; @@ -183,7 +182,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) goto success; if 
(btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; } @@ -206,8 +205,8 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, break; if (!(upgrade - ? bch2_btree_node_upgrade(iter, l) - : bch2_btree_node_relock(iter, l))) { + ? bch2_btree_node_upgrade(trans, iter, l) + : bch2_btree_node_relock(trans, iter, l))) { (upgrade ? trace_node_upgrade_fail : trace_node_relock_fail)(trans->ip, trace_ip, @@ -255,13 +254,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, } /* Slowpath: */ -bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, struct btree_iter *iter, +bool __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); unsigned reason = 9; @@ -367,16 +366,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_locks(struct btree_trans *trans, - struct btree_iter *iter) +static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - if (!(trans->iters_linked & (1ULL << iter->idx))) { - BUG_ON(iter->nodes_locked); - return; - } - for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) @@ -392,25 +385,24 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) - bch2_btree_iter_verify_locks(trans, iter); + bch2_btree_iter_verify_locks(iter); } #else -static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans, - struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif /* * Only for btree_cache.c - only relocks intent locks */ -bool bch2_btree_iter_relock_intent(struct btree_iter *iter) +bool bch2_btree_iter_relock_intent(struct btree_trans *trans, + struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; unsigned l; for (l = iter->level; l < iter->locks_want && btree_iter_node(iter, l); l++) { - if (!bch2_btree_node_relock(iter, l)) { + if (!bch2_btree_node_relock(trans, iter, l)) { trace_node_relock_fail(trans->ip, _RET_IP_, btree_iter_type(iter) == BTREE_ITER_CACHED, iter->btree_id, &iter->real_pos, @@ -441,10 +433,10 @@ static bool bch2_btree_iter_relock(struct btree_trans *trans, return ret; } -bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +bool __bch2_btree_iter_upgrade(struct btree_trans *trans, + struct btree_iter *iter, unsigned new_locks_want) { - struct btree_trans *trans = iter->trans; struct btree_iter *linked; EBUG_ON(iter->locks_want >= new_locks_want); @@ -509,7 +501,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, } } - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_iter_verify_locks(iter); } void bch2_trans_downgrade(struct btree_trans *trans) @@ -558,12 +550,13 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_cached(struct btree_iter *iter) +static void bch2_btree_iter_verify_cached(struct 
btree_trans *trans, + struct btree_iter *iter) { struct bkey_cached *ck; bool locked = btree_node_locked(iter, 0); - if (!bch2_btree_node_relock(iter, 0)) + if (!bch2_btree_node_relock(trans, iter, 0)) return; ck = (void *) iter->l[0].b; @@ -574,8 +567,8 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) btree_node_unlock(iter, 0); } -static void bch2_btree_iter_verify_level(struct btree_iter *iter, - unsigned level) +static void bch2_btree_iter_verify_level(struct btree_trans *trans, + struct btree_iter *iter, unsigned level) { struct btree_iter_level *l; struct btree_node_iter tmp; @@ -593,7 +586,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (btree_iter_type(iter) == BTREE_ITER_CACHED) { if (!level) - bch2_btree_iter_verify_cached(iter); + bch2_btree_iter_verify_cached(trans, iter); return; } @@ -602,7 +595,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!btree_iter_node(iter, level)) return; - if (!bch2_btree_node_relock(iter, level)) + if (!bch2_btree_node_relock(trans, iter, level)) return; BUG_ON(!btree_iter_pos_in_node(iter, l->b)); @@ -692,10 +685,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) break; } - bch2_btree_iter_verify_level(iter, i); + bch2_btree_iter_verify_level(trans, iter, i); } - bch2_btree_iter_verify_locks(trans, iter); + bch2_btree_iter_verify_locks(iter); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -719,12 +712,13 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) return; trans_for_each_iter_with_node(trans, b, iter) - bch2_btree_iter_verify_level(iter, b->c.level); + bch2_btree_iter_verify_level(trans, iter, b->c.level); } #else -static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_iter_verify_level(struct btree_trans *trans, + struct btree_iter *iter, unsigned l) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} @@ -771,7 +765,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, trans_for_each_iter_with_node(trans, b, linked) { __bch2_btree_iter_fix_key_modified(linked, b, where); - bch2_btree_iter_verify_level(linked, b->c.level); + bch2_btree_iter_verify_level(trans, linked, b->c.level); } } @@ -896,7 +890,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); - bch2_btree_iter_verify_level(linked, b->c.level); + bch2_btree_iter_verify_level(trans, linked, b->c.level); } } @@ -983,7 +977,8 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, /* * Verify that iterator for parent node points to child node: */ -static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) +static void btree_iter_verify_new_node(struct btree_trans *trans, + struct btree_iter *iter, struct btree *b) { struct btree_iter_level *l; unsigned plevel; @@ -999,7 +994,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) parent_locked = btree_node_locked(iter, plevel); - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, iter, plevel)) return; l = &iter->l[plevel]; @@ -1013,7 +1008,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - 
bch2_dump_btree_node(iter->trans->c, l->b); + bch2_dump_btree_node(trans->c, l->b); bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); @@ -1030,8 +1025,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->c.level + 1); } -static inline void __btree_iter_init(struct btree_iter *iter, - unsigned level) +static inline void __btree_iter_level_init(struct btree_iter *iter, + unsigned level) { struct btree_iter_level *l = &iter->l[level]; @@ -1047,19 +1042,20 @@ static inline void __btree_iter_init(struct btree_iter *iter, btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -static inline void btree_iter_node_set(struct btree_iter *iter, - struct btree *b) +static inline void btree_iter_level_init(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) { BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - btree_iter_verify_new_node(iter, b); + btree_iter_verify_new_node(trans, iter, b); EBUG_ON(!btree_iter_pos_in_node(iter, b)); EBUG_ON(b->c.lock.state.seq & 1); iter->l[b->c.level].lock_seq = b->c.lock.state.seq; iter->l[b->c.level].b = b; - __btree_iter_init(iter, b->c.level); + __btree_iter_level_init(iter, b->c.level); } /* @@ -1088,7 +1084,7 @@ void bch2_btree_iter_node_replace(struct btree_trans *trans, mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t); } - btree_iter_node_set(linked, b); + btree_iter_level_init(trans, linked, b); } } @@ -1115,7 +1111,7 @@ void bch2_btree_iter_reinit_node(struct btree_trans *trans, struct btree_iter *linked; trans_for_each_iter_with_node(trans, b, linked) - __btree_iter_init(linked, b->c.level); + __btree_iter_level_init(linked, b->c.level); } static int lock_root_check_fn(struct six_lock *lock, void *p) @@ -1156,8 +1152,8 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, - iter, lock_type, + if (unlikely(!btree_node_lock(trans, iter, b, SPOS_MAX, + iter->level, lock_type, lock_root_check_fn, rootp, trace_ip))) { if (trans->restarted) @@ -1175,7 +1171,7 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, iter->l[i].b = NULL; mark_btree_node_locked(iter, iter->level, lock_type); - btree_iter_node_set(iter, b); + btree_iter_level_init(trans, iter, b); return 0; } @@ -1200,7 +1196,7 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite bch2_bkey_buf_init(&tmp); while (nr && !ret) { - if (!bch2_btree_node_relock(iter, iter->level)) + if (!bch2_btree_node_relock(trans, iter, iter->level)) break; bch2_btree_node_iter_advance(&node_iter, l->b); @@ -1209,8 +1205,8 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, - iter->level - 1); + ret = bch2_btree_node_prefetch(c, trans, iter, tmp.k, + iter->btree_id, iter->level - 1); } if (!was_locked) @@ -1220,7 +1216,8 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite return ret; } -static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_iter *iter, unsigned plevel, struct btree *b) { struct btree_iter_level *l = &iter->l[plevel]; @@ -1228,7 +1225,7 @@ static noinline void 
btree_node_mem_ptr_set(struct btree_iter *iter, struct bkey_packed *k; struct bch_btree_ptr_v2 *bp; - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, iter, plevel)) return; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -1265,11 +1262,11 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, goto err; mark_btree_node_locked(iter, level, lock_type); - btree_iter_node_set(iter, b); + btree_iter_level_init(trans, iter, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && unlikely(b != btree_node_mem_ptr(tmp.k))) - btree_node_mem_ptr_set(iter, level + 1, b); + btree_node_mem_ptr_set(trans, iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) ret = btree_iter_prefetch(trans, iter); @@ -1278,7 +1275,7 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, btree_node_unlock(iter, level + 1); iter->level = level; - bch2_btree_iter_verify_locks(trans, iter); + bch2_btree_iter_verify_locks(iter); err: bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1310,9 +1307,9 @@ retry_all: if (prev) { if (iter->btree_id == prev->btree_id && iter->locks_want < prev->locks_want) - __bch2_btree_iter_upgrade(iter, prev->locks_want); + __bch2_btree_iter_upgrade(trans, iter, prev->locks_want); else if (!iter->locks_want && prev->locks_want) - __bch2_btree_iter_upgrade(iter, 1); + __bch2_btree_iter_upgrade(trans, iter, 1); } prev = iter; @@ -1377,11 +1374,12 @@ static int bch2_btree_iter_traverse_all(struct btree_trans *trans) return __btree_iter_traverse_all(trans, 0, _RET_IP_); } -static inline bool btree_iter_good_node(struct btree_iter *iter, +static inline bool btree_iter_good_node(struct btree_trans *trans, + struct btree_iter *iter, unsigned l, int check_pos) { if (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l)) + !bch2_btree_node_relock(trans, iter, l)) return false; if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) @@ -1391,13 +1389,14 @@ static inline bool btree_iter_good_node(struct btree_iter *iter, return true; } -static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, +static inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, + struct btree_iter *iter, int check_pos) { unsigned l = iter->level; while (btree_iter_node(iter, l) && - !btree_iter_good_node(iter, l, check_pos)) { + !btree_iter_good_node(trans, iter, l, check_pos)) { btree_node_unlock(iter, l); iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; @@ -1432,20 +1431,20 @@ static int btree_iter_traverse_one(struct btree_trans *trans, } if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - ret = bch2_btree_iter_traverse_cached(iter); + ret = bch2_btree_iter_traverse_cached(trans, iter); goto out; } if (unlikely(iter->level >= BTREE_MAX_DEPTH)) goto out; - iter->level = btree_iter_up_until_good_node(iter, 0); + iter->level = btree_iter_up_until_good_node(trans, iter, 0); /* If we need intent locks, take them too: */ for (l = iter->level + 1; l < iter->locks_want && btree_iter_node(iter, l); l++) - if (!bch2_btree_node_relock(iter, l)) + if (!bch2_btree_node_relock(trans, iter, l)) while (iter->level <= l) { btree_node_unlock(iter, iter->level); iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; @@ -1657,7 +1656,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p return; } - l = btree_iter_up_until_good_node(iter, cmp); + l = btree_iter_up_until_good_node(trans, iter, cmp); if (btree_iter_node(iter, l)) { /* @@ -1668,7 +1667,7 @@ static void btree_iter_set_search_pos(struct 
btree_iter *iter, struct bpos new_p */ if (cmp < 0 || !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) - __btree_iter_init(iter, l); + __btree_iter_level_init(iter, l); /* Don't leave it locked if we're not supposed to: */ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ea129387ebb7..a175eb5f26fd 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -141,7 +141,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_iter_relock_intent(struct btree_iter *); +bool bch2_btree_iter_relock_intent(struct btree_trans *, struct btree_iter *); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); @@ -154,15 +154,17 @@ static inline int btree_trans_restart(struct btree_trans *trans) return -EINTR; } -bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +bool __bch2_btree_iter_upgrade(struct btree_trans *, + struct btree_iter *, unsigned); -static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, +static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, + struct btree_iter *iter, unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? __bch2_btree_iter_upgrade(iter, new_locks_want) + ? __bch2_btree_iter_upgrade(trans, iter, new_locks_want) : iter->uptodate <= BTREE_ITER_NEED_PEEK; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index c7d223f91bf6..ba03581c5290 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -213,7 +213,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (!bch2_btree_node_relock(ck_iter, 0)) { + if (!bch2_btree_node_relock(trans, ck_iter, 0)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = btree_trans_restart(trans); goto err; @@ -266,9 +266,8 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) } __flatten -int bch2_btree_iter_traverse_cached(struct btree_iter *iter) +int bch2_btree_iter_traverse_cached(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey_cached *ck; int ret = 0; @@ -277,7 +276,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) iter->l[1].b = NULL; - if (bch2_btree_node_relock(iter, 0)) { + if (bch2_btree_node_relock(trans, iter, 0)) { ck = (void *) iter->l[0].b; goto fill; } @@ -302,7 +301,7 @@ retry: } else { enum six_lock_type lock_want = __btree_lock_want(iter, 0); - if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, + if (!btree_node_lock(trans, iter, (void *) ck, iter->pos, 0, lock_want, bkey_cached_check_fn, iter, _THIS_IP_)) { if (!trans->restarted) goto retry; @@ -326,7 +325,7 @@ retry: fill: if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { if (!iter->locks_want && - !!__bch2_btree_iter_upgrade(iter, 1)) { + !!__bch2_btree_iter_upgrade(trans, iter, 1)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); BUG_ON(!trans->restarted); ret = -EINTR; @@ -344,7 +343,7 @@ fill: iter->uptodate = BTREE_ITER_NEED_PEEK; if ((iter->flags & BTREE_ITER_INTENT) && - !bch2_btree_iter_upgrade(iter, 1)) { + !bch2_btree_iter_upgrade(trans, iter, 1)) { BUG_ON(!trans->restarted); ret = -EINTR; } diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h 
index 7e2b0a08f745..d890632e4425 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -26,7 +26,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *, struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -int bch2_btree_iter_traverse_cached(struct btree_iter *); +int bch2_btree_iter_traverse_cached(struct btree_trans *, struct btree_iter *); bool bch2_btree_insert_key_cached(struct btree_trans *, struct btree_iter *, struct bkey_i *); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 0acc731df8e9..b490e4808631 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -166,40 +166,38 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type, - six_lock_should_sleep_fn, void *, - unsigned long); +bool __bch2_btree_node_lock(struct btree_trans *, struct btree_iter *, + struct btree *, struct bpos, unsigned, + enum six_lock_type, six_lock_should_sleep_fn, + void *, unsigned long); -static inline bool btree_node_lock(struct btree *b, - struct bpos pos, unsigned level, +static inline bool btree_node_lock(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; - EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, type, + __bch2_btree_node_lock(trans, iter, b, pos, level, type, should_sleep_fn, p, ip); } -bool __bch2_btree_node_relock(struct btree_iter *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_iter *, unsigned); -static inline bool bch2_btree_node_relock(struct btree_iter *iter, - unsigned level) +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_iter *iter, unsigned level) { EBUG_ON(btree_node_locked(iter, level) && btree_node_locked_type(iter, level) != __btree_lock_want(iter, level)); return likely(btree_node_locked(iter, level)) || - __bch2_btree_node_relock(iter, level); + __bch2_btree_node_relock(trans, iter, level); } /* @@ -224,8 +222,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_ite void bch2_btree_node_unlock_write(struct btree_trans *, struct btree_iter *, struct btree *); -void __bch2_btree_node_lock_write(struct btree_trans *, - struct btree_iter *, struct btree *); +void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); static inline void bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_iter *iter, @@ -233,9 +230,10 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, { EBUG_ON(iter->l[b->c.level].b != b); EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(trans, iter, b); + __bch2_btree_node_lock_write(trans, b); } #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4acd49900611..c1d4227738cf 100644 --- a/fs/bcachefs/btree_update_interior.c +++ 
b/fs/bcachefs/btree_update_interior.c @@ -937,7 +937,7 @@ retry: * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(trans, iter, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 07046dab614b..13b3a1bf0f4f 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -132,7 +132,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) return 0; - if (!bch2_btree_node_relock(iter, level)) + if (!bch2_btree_node_relock(trans, iter, level)) return 0; b = iter->l[level].b; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 92b6b5cec2ae..e93db33fcfb7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -561,7 +561,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ trans_for_each_iter(trans, iter) if (iter->nodes_locked != iter->nodes_intent_locked && - !bch2_btree_iter_upgrade(iter, 1)) { + !bch2_btree_iter_upgrade(trans, iter, 1)) { trace_trans_restart_upgrade(trans->ip, trace_ip, iter->btree_id, &iter->real_pos); @@ -783,7 +783,8 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update(trans, i) { BUG_ON(!i->iter->should_be_locked); - if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { + if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, + i->level + 1))) { trace_trans_restart_upgrade(trans->ip, _RET_IP_, i->iter->btree_id, &i->iter->pos); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b02af94f4037..71b0f14f41f3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -326,8 +326,8 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, (k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_node_prefetch(c, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); + bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, + b->c.btree_id, b->c.level - 1); bch2_btree_and_journal_iter_advance(&iter); i++; -- cgit From f7a966a3e2546a7fc76bc34b78e7dad7cebfa8c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 14:36:03 -0400 Subject: bcachefs: Clean up/rename bch2_trans_node_* fns These utility functions are for managing btree node state within a btree_trans - rename them for consistency, and drop some unneeded arguments. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 9 +++---- fs/bcachefs/btree_io.h | 3 +-- fs/bcachefs/btree_iter.c | 52 +++++++++++++++++-------------------- fs/bcachefs/btree_iter.h | 12 +++------ fs/bcachefs/btree_update_interior.c | 35 ++++++++++++------------- fs/bcachefs/btree_update_leaf.c | 11 ++++---- 6 files changed, 54 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 99799d93cf09..f51dd3ec0797 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -459,16 +459,13 @@ void bch2_btree_build_aux_trees(struct btree *b) * * Returns true if we sorted (i.e. 
invalidated iterators */ -void bch2_btree_init_next(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b) +void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; struct btree_node_entry *bne; bool reinit_iter = false; EBUG_ON(!(b->c.lock.state.seq & 1)); - EBUG_ON(iter && iter->l[b->c.level].b != b); BUG_ON(bset_written(b, bset(b, &b->set[1]))); if (b->nsets == MAX_BSETS && @@ -497,8 +494,8 @@ void bch2_btree_init_next(struct btree_trans *trans, bch2_btree_build_aux_trees(b); - if (iter && reinit_iter) - bch2_btree_iter_reinit_node(trans, iter, b); + if (reinit_iter) + bch2_trans_node_reinit_iter(trans, b); } static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 7fdcf879c7d4..0f20224e2a77 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -134,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_node_drop_keys_outside_node(struct btree *); void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct btree_trans *, struct btree_iter *, - struct btree *); +void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 16b9f6a986f4..91047f378e8f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -757,15 +757,14 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, } void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b, struct bkey_packed *where) { - struct btree_iter *linked; + struct btree_iter *iter; - trans_for_each_iter_with_node(trans, b, linked) { - __bch2_btree_iter_fix_key_modified(linked, b, where); - bch2_btree_iter_verify_level(trans, linked, b->c.level); + trans_for_each_iter_with_node(trans, b, iter) { + __bch2_btree_iter_fix_key_modified(iter, b, where); + bch2_btree_iter_verify_level(trans, iter, b->c.level); } } @@ -1062,42 +1061,40 @@ static inline void btree_iter_level_init(struct btree_trans *trans, * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_btree_iter_node_replace(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) { enum btree_node_locked_type t; - struct btree_iter *linked; + struct btree_iter *iter; - trans_for_each_iter(trans, linked) - if (btree_iter_type(linked) != BTREE_ITER_CACHED && - btree_iter_pos_in_node(linked, b)) { + trans_for_each_iter(trans, iter) + if (btree_iter_type(iter) != BTREE_ITER_CACHED && + btree_iter_pos_in_node(iter, b)) { /* - * bch2_btree_iter_node_drop() has already been called - + * bch2_trans_node_drop() has already been called - * the old node we're replacing has already been * unlocked and the pointer invalidated */ - BUG_ON(btree_node_locked(linked, b->c.level)); + BUG_ON(btree_node_locked(iter, b->c.level)); - t = btree_lock_want(linked, b->c.level); + t = btree_lock_want(iter, b->c.level); if (t != BTREE_NODE_UNLOCKED) { six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(iter, b->c.level, (enum six_lock_type) t); } - btree_iter_level_init(trans, linked, b); + btree_iter_level_init(trans, iter, b); } } -void 
bch2_btree_iter_node_drop(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) +void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_iter *iter; unsigned level = b->c.level; - trans_for_each_iter(trans, linked) - if (linked->l[level].b == b) { - btree_node_unlock(linked, level); - linked->l[level].b = BTREE_ITER_NO_NODE_DROP; + trans_for_each_iter(trans, iter) + if (iter->l[level].b == b) { + btree_node_unlock(iter, level); + iter->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -1105,13 +1102,12 @@ void bch2_btree_iter_node_drop(struct btree_trans *trans, * A btree node has been modified in such a way as to invalidate iterators - fix * them: */ -void bch2_btree_iter_reinit_node(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) +void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_iter *iter; - trans_for_each_iter_with_node(trans, b, linked) - __btree_iter_level_init(linked, b->c.level); + trans_for_each_iter_with_node(trans, b, iter) + __btree_iter_level_init(iter, b->c.level); } static int lock_root_check_fn(struct six_lock *lock, void *p) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index a175eb5f26fd..0f65e24bf102 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -135,7 +135,7 @@ static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif -void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, struct btree_iter *, +void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, struct btree *, struct btree_node_iter *, @@ -180,13 +180,9 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) void bch2_trans_downgrade(struct btree_trans *); -void bch2_btree_iter_node_replace(struct btree_trans *trans, - struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop(struct btree_trans *, - struct btree_iter *, struct btree *); - -void bch2_btree_iter_reinit_node(struct btree_trans *, - struct btree_iter *, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_drop(struct btree_trans *, struct btree *); +void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check bch2_btree_iter_traverse(struct btree_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c1d4227738cf..80227e032103 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -161,14 +161,13 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_free_inmem(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_iter *linked; + struct btree_iter *iter; - trans_for_each_iter(trans, linked) - BUG_ON(linked->l[b->c.level].b == b); + trans_for_each_iter(trans, iter) + BUG_ON(iter->l[b->c.level].b == b); six_lock_write(&b->c.lock, NULL, NULL); __btree_node_free(c, b); @@ -1431,12 +1430,12 @@ static void btree_split(struct btree_update *as, /* Successful split, update the iterator to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(trans, iter, b); + 
bch2_trans_node_drop(trans, b); if (n3) - bch2_btree_iter_node_replace(trans, iter, n3); + bch2_trans_node_add(trans, n3); if (n2) - bch2_btree_iter_node_replace(trans, iter, n2); - bch2_btree_iter_node_replace(trans, iter, n1); + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1444,7 +1443,7 @@ static void btree_split(struct btree_update *as, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, iter, b); + bch2_btree_node_free_inmem(trans, b); if (n3) six_unlock_intent(&n3->c.lock); @@ -1527,7 +1526,7 @@ static void bch2_btree_insert_node(struct btree_update *as, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(trans, iter, b); + bch2_trans_node_reinit_iter(trans, b); bch2_btree_node_unlock_write(trans, iter, b); @@ -1702,15 +1701,15 @@ retry: six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&m->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(trans, iter, b); - bch2_btree_iter_node_drop(trans, iter, m); + bch2_trans_node_drop(trans, b); + bch2_trans_node_drop(trans, m); - bch2_btree_iter_node_replace(trans, iter, n); + bch2_trans_node_add(trans, n); bch2_btree_trans_verify_iters(trans, n); - bch2_btree_node_free_inmem(trans, iter, b); - bch2_btree_node_free_inmem(trans, iter, m); + bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, m); six_unlock_intent(&n->c.lock); @@ -1798,9 +1797,9 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(trans, iter, b); - bch2_btree_iter_node_replace(trans, iter, n); - bch2_btree_node_free_inmem(trans, iter, b); + bch2_trans_node_drop(trans, b); + bch2_trans_node_add(trans, n); + bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e93db33fcfb7..20b950ce31f1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -56,14 +56,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(trans, iter, b); + bch2_trans_node_reinit_iter(trans, b); /* * If the last bset has been written, or if it's gotten too big - start * a new bset to insert into: */ if (want_new_bset(c, b)) - bch2_btree_init_next(trans, iter, b); + bch2_btree_init_next(trans, b); } /* Inserting into a given leaf node (last stage of insert): */ @@ -85,7 +85,6 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, b)); - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -112,7 +111,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, bch2_bset_delete(b, k, clobber_u64s); goto fix_iter; } else { - bch2_btree_iter_fix_key_modified(trans, iter, b, k); + bch2_btree_iter_fix_key_modified(trans, b, k); } return true; @@ -130,7 +129,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, clobber_u64s = k->u64s; goto overwrite; } else { - bch2_btree_iter_fix_key_modified(trans, iter, b, k); + bch2_btree_iter_fix_key_modified(trans, b, k); } } 
@@ -220,7 +219,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(trans, iter, b); + bch2_trans_node_reinit_iter(trans, b); trace_btree_insert_key(c, b, insert); return true; -- cgit From a0a568794d09a2092062ed4137499ed0884cf2b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 14:45:11 -0400 Subject: bcachefs: More renaming Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- fs/bcachefs/btree_iter.h | 10 +++++----- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/btree_update_leaf.c | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 91047f378e8f..bc93fac24ce1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -240,7 +240,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, if (iter->uptodate == BTREE_ITER_NEED_RELOCK) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); return iter->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -380,7 +380,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) } } -void bch2_btree_trans_verify_locks(struct btree_trans *trans) +void bch2_trans_verify_locks(struct btree_trans *trans) { struct btree_iter *iter; @@ -704,7 +704,7 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_cmp(iter->pos, iter->k.p) > 0)); } -void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0f65e24bf102..7aee1a38d9a8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -127,12 +127,12 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -void bch2_btree_trans_verify_locks(struct btree_trans *); +void bch2_trans_verify_iters(struct btree_trans *, struct btree *); +void bch2_trans_verify_locks(struct btree_trans *); #else -static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, - struct btree *b) {} -static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} +static inline void bch2_trans_verify_iters(struct btree_trans *trans, + struct btree *b) {} +static inline void bch2_trans_verify_locks(struct btree_trans *iter) {} #endif void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 80227e032103..b115b9246880 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1451,7 +1451,7 @@ static void btree_split(struct btree_update *as, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], start_time); @@ -1474,7 +1474,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, trans_for_each_iter_with_node(trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_trans_verify_iters(trans, b); + bch2_trans_verify_iters(trans, b); } /** @@ -1706,7 +1706,7 @@ retry: 
bch2_trans_node_add(trans, n); - bch2_btree_trans_verify_iters(trans, n); + bch2_trans_verify_iters(trans, n); bch2_btree_node_free_inmem(trans, b); bch2_btree_node_free_inmem(trans, m); @@ -1715,7 +1715,7 @@ retry: bch2_btree_update_done(as); out: - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); bch2_trans_iter_free(trans, sib_iter); /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 20b950ce31f1..4bcfa25e68e3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -580,7 +580,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } btree_insert_entry_checks(trans, i); } - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) @@ -816,7 +816,7 @@ retry: ret = do_bch2_trans_commit(trans, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); if (ret) goto err; -- cgit From 6fba6b83b426cf489c3c133d0e2a0260889aba2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 16:08:34 -0400 Subject: bcachefs: Prefer using btree_insert_entry to btree_iter This moves some data dependencies forward, to improve pipelining. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update_leaf.c | 66 ++++++++++++++++++++--------------------- fs/bcachefs/buckets.c | 2 +- 3 files changed, 36 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 577cc57174fa..1ddb2eea5b15 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -343,7 +343,8 @@ struct btree_insert_entry { u8 bkey_type; enum btree_id btree_id:8; u8 level; - unsigned trans_triggers_run:1; + bool cached:1; + bool trans_triggers_run:1; struct bkey_i *k; struct btree_iter *iter; unsigned long ip_allocated; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4bcfa25e68e3..3aa2ca82a62d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -29,18 +29,23 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, bpos_cmp(l->k->k.p, r->k->k.p); } +static inline struct btree_iter_level *insert_l(struct btree_insert_entry *i) +{ + return i->iter->l + i->level; +} + static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + insert_l(&i[0])->b == insert_l(&i[-1])->b; } static inline bool same_leaf_as_next(struct btree_trans *trans, struct btree_insert_entry *i) { return i + 1 < trans->updates + trans->nr_updates && - iter_l(i[0].iter)->b == iter_l(i[1].iter)->b; + insert_l(&i[0])->b == insert_l(&i[1])->b; } inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, @@ -183,22 +188,21 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, * btree_insert_key - insert a key one key into a leaf node */ static bool btree_insert_key_leaf(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; + struct btree *b = insert_l(insert)->b; struct bset_tree *t = bset_tree_last(b); struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!iter->level 
&& + EBUG_ON(!insert->level && !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(trans, iter, b, - &iter_l(iter)->iter, insert))) + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->iter, b, + &insert_l(insert)->iter, insert->k))) return false; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, @@ -221,7 +225,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); - trace_btree_insert_key(c, b, insert); + trace_btree_insert_key(c, b, insert->k); return true; } @@ -274,13 +278,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } -static enum btree_insert_ret +static inline enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, - struct btree_iter *iter, + struct btree *b, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; if (!bch2_btree_node_insert_fits(c, b, u64s)) return BTREE_INSERT_BTREE_NODE_FULL; @@ -297,7 +300,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, unsigned new_u64s; struct bkey_i *new_k; - BUG_ON(iter->level); + EBUG_ON(iter->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(trans->c) && @@ -335,8 +338,8 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->k->k.needs_whiteout = false; - did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) - ? btree_insert_key_leaf(trans, i->iter, i->k) + did_work = !i->cached + ? btree_insert_key_leaf(trans, i) : bch2_btree_insert_key_cached(trans, i->iter, i->k); if (!did_work) return; @@ -364,9 +367,9 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) /* * XXX: synchronization of cached update triggers with gc */ - BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); + BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) bch2_mark_update(trans, i->iter, i->k, i->flags|BTREE_TRIGGER_GC); } @@ -412,8 +415,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s = 0; u64s += i->k->k.u64s; - ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, u64s) + ret = !i->cached + ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) : btree_key_can_insert_cached(trans, i->iter, u64s); if (ret) { *stopped_at = i; @@ -473,8 +476,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->iter, i->k, - i->flags); + bch2_mark_update(trans, i->iter, i->k, i->flags); if (marking && trans->fs_usage_deltas) bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); @@ -524,7 +526,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { ret = bch2_foreground_maybe_merge(trans, i->iter, - i->iter->level, trans->flags); + i->level, trans->flags); if (unlikely(ret)) return ret; } @@ -585,14 +587,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans, i->iter, - iter_l(i->iter)->b); + insert_l(i)->b); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(trans, i->iter, - iter_l(i->iter)->b); + insert_l(i)->b); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -637,8 +639,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret == -EINTR) trace_trans_restart_btree_node_split(trans->ip, trace_ip, - i->iter->btree_id, - &i->iter->real_pos); + i->btree_id, &i->iter->real_pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -747,7 +748,7 @@ int __bch2_trans_commit(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && + if (!i->cached && !(i->flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, i->btree_id, i->k->k.p); @@ -771,7 +772,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (unlikely(ret)) { if (ret == -EINTR) trace_trans_restart_mark(trans->ip, _RET_IP_, - i->iter->btree_id, + i->btree_id, &i->iter->pos); goto out; } @@ -785,8 +786,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, i->level + 1))) { trace_trans_restart_upgrade(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); + i->btree_id, &i->iter->pos); trans->restarted = true; ret = -EINTR; goto out; @@ -795,7 +795,7 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!btree_node_intent_locked(i->iter, i->level)); u64s = jset_u64s(i->k->k.u64s); - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; trans->journal_u64s += u64s; @@ -1005,6 +1005,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, + .cached = btree_iter_is_cached(iter), .iter = iter, .k = k, .ip_allocated = _RET_IP_, @@ -1042,8 +1043,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, * not the key cache, which helps with cache coherency issues in * other areas: */ - if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && - btree_iter_type(i->iter) != BTREE_ITER_CACHED) { + if (n.cached && !i->cached) { i->k = n.k; i->flags = n.flags; } else { diff --git a/fs/bcachefs/buckets.c 
b/fs/bcachefs/buckets.c index d7994229ad5a..a1d4a25bc42c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1290,7 +1290,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, pr_err("%s", buf); pr_err("overlapping with"); - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { + if (!i->cached) { struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); struct bkey_s_c k; int ret; -- cgit From deb0e573b4c1f1f0733662976230a77624160794 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 17:31:09 -0400 Subject: bcachefs: Kill BTREE_ITER_NEED_PEEK This was used for an optimization that hasn't existing in quite awhile - iter->uptodate will probably be going away as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 15 ++------------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_types.h | 5 ++--- 4 files changed, 6 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bc93fac24ce1..fc0a32dd427f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -238,7 +238,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, } if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - iter->uptodate = BTREE_ITER_NEED_PEEK; + iter->uptodate = BTREE_ITER_UPTODATE; bch2_trans_verify_locks(trans); @@ -752,8 +752,6 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, @@ -859,11 +857,6 @@ fixup_done: b, t, k2); } } - - if (!b->c.level && - node_iter == &iter->l[0].iter && - iter_current_key_modified) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void bch2_btree_node_iter_fix(struct btree_trans *trans, @@ -1037,8 +1030,6 @@ static inline void __btree_iter_level_init(struct btree_iter *iter, */ if (level) bch2_btree_node_iter_peek(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } static inline void btree_iter_level_init(struct btree_trans *trans, @@ -1482,7 +1473,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, } } - iter->uptodate = BTREE_ITER_NEED_PEEK; + iter->uptodate = BTREE_ITER_UPTODATE; out: BUG_ON((ret == -EINTR) != !!trans->restarted); trace_iter_traverse(trans->ip, trace_ip, @@ -1672,8 +1663,6 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p out: if (l != iter->level) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_iter_verify(iter); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7aee1a38d9a8..4ba55e02d4b7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -165,7 +165,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, return iter->locks_want < new_locks_want ? 
__bch2_btree_iter_upgrade(trans, iter, new_locks_want) - : iter->uptodate <= BTREE_ITER_NEED_PEEK; + : iter->uptodate == BTREE_ITER_UPTODATE; } void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ba03581c5290..61210db57f56 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -340,7 +340,7 @@ fill: if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - iter->uptodate = BTREE_ITER_NEED_PEEK; + iter->uptodate = BTREE_ITER_UPTODATE; if ((iter->flags & BTREE_ITER_INTENT) && !bch2_btree_iter_upgrade(trans, iter, 1)) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 1ddb2eea5b15..d288404e35c4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -218,9 +218,8 @@ enum btree_iter_type { enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, }; #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -- cgit From f21566f17aa8f100b6d106f657a75964a3482f54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 15:54:41 -0400 Subject: bcachefs: Kill BTREE_ITER_NODES We really only need to distinguish between btree iterators and btree key cache iterators - this is more prep work for btree_path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 173 +++++++++++++++++----------------------- fs/bcachefs/btree_types.h | 44 ++++------ fs/bcachefs/btree_update_leaf.c | 10 +-- 3 files changed, 94 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fc0a32dd427f..7acec1e6db3d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -28,15 +28,14 @@ static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct bt static inline int btree_iter_cmp(const struct btree_iter *l, const struct btree_iter *r) { - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->real_pos, r->real_pos); + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->cached, r->cached) ?: + bkey_cmp(l->real_pos, r->real_pos) ?: + -cmp_int(l->level, r->level); } static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); - /* Are we iterating over keys in all snapshots? */ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { p = bpos_successor(p); @@ -50,8 +49,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); - /* Are we iterating over keys in all snapshots? */ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { p = bpos_predecessor(p); @@ -210,7 +207,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, (upgrade ? 
trace_node_upgrade_fail : trace_node_relock_fail)(trans->ip, trace_ip, - btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->cached, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, is_btree_node(iter, l) @@ -246,9 +243,9 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, } static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { - return type != BTREE_ITER_CACHED + return !cached ? container_of(_b, struct btree, c)->key.k.p : container_of(_b, struct bkey_cached, c)->key.pos; } @@ -301,8 +298,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, * Within the same btree, cached iterators come before non * cached iterators: */ - if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { - if (btree_iter_is_cached(iter)) { + if (linked->cached != iter->cached) { + if (iter->cached) { deadlock_iter = linked; reason = 4; } @@ -322,7 +319,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) { + linked->cached)) <= 0) { deadlock_iter = linked; reason = 7; } @@ -332,10 +329,10 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, trace_trans_restart_would_deadlock(trans->ip, ip, trans->in_traverse_all, reason, deadlock_iter->btree_id, - btree_iter_type(deadlock_iter), + deadlock_iter->cached, &deadlock_iter->real_pos, iter->btree_id, - btree_iter_type(iter), + iter->cached, &pos); btree_trans_restart(trans); return false; @@ -404,7 +401,7 @@ bool bch2_btree_iter_relock_intent(struct btree_trans *trans, l++) { if (!bch2_btree_node_relock(trans, iter, l)) { trace_node_relock_fail(trans->ip, _RET_IP_, - btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->cached, iter->btree_id, &iter->real_pos, l, iter->l[l].lock_seq, is_btree_node(iter, l) @@ -467,7 +464,7 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, */ trans_for_each_iter(trans, linked) if (linked != iter && - btree_iter_type(linked) == btree_iter_type(iter) && + linked->cached == iter->cached && linked->btree_id == iter->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; @@ -584,7 +581,7 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, tmp = l->iter; locked = btree_node_locked(iter, level); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (iter->cached) { if (!level) bch2_btree_iter_verify_cached(trans, iter); return; @@ -600,13 +597,6 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, BUG_ON(!btree_iter_pos_in_node(iter, l->b)); - /* - * node iterators don't use leaf node iterator: - */ - if (btree_iter_type(iter) == BTREE_ITER_NODES && - level <= iter->min_depth) - goto unlock; - bch2_btree_node_iter_verify(&l->iter, l->b); /* @@ -630,7 +620,7 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, msg = "after"; goto err; } -unlock: + if (!locked) btree_node_unlock(iter, level); return; @@ -661,7 +651,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; - enum btree_iter_type type = btree_iter_type(iter); unsigned i; EBUG_ON(iter->btree_id >= BTREE_ID_NR); @@ -672,14 +661,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - BUG_ON(type == BTREE_ITER_NODES && - 
!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - - BUG_ON(type != BTREE_ITER_NODES && + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) { + for (i = 0; i < (!iter->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!iter->l[i].b) { BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); break; @@ -693,15 +679,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - enum btree_iter_type type = btree_iter_type(iter); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); + BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0); } void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) @@ -1036,7 +1018,7 @@ static inline void btree_iter_level_init(struct btree_trans *trans, struct btree_iter *iter, struct btree *b) { - BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + BUG_ON(iter->cached); btree_iter_verify_new_node(trans, iter, b); @@ -1058,7 +1040,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (btree_iter_type(iter) != BTREE_ITER_CACHED && + if (!iter->cached && btree_iter_pos_in_node(iter, b)) { /* * bch2_trans_node_drop() has already been called - @@ -1417,7 +1399,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, goto out; } - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (iter->cached) { ret = bch2_btree_iter_traverse_cached(trans, iter); goto out; } @@ -1451,8 +1433,8 @@ static int btree_iter_traverse_one(struct btree_trans *trans, if (unlikely(ret)) { if (ret == 1) { /* - * Got to the end of the btree (in - * BTREE_ITER_NODES mode) + * No nodes at this level - got to the end of + * the btree: */ ret = 0; goto out; @@ -1477,7 +1459,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, out: BUG_ON((ret == -EINTR) != !!trans->restarted); trace_iter_traverse(trans->ip, trace_ip, - btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->cached, iter->btree_id, &iter->real_pos, ret); bch2_btree_iter_verify(iter); return ret; @@ -1533,42 +1515,44 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) { - struct btree *b; + struct btree *b = NULL; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + EBUG_ON(iter->cached); bch2_btree_iter_verify(iter); ret = btree_iter_traverse(iter); if (ret) - return NULL; + goto out; b = btree_iter_node(iter, iter->level); if (!b) - return NULL; + goto out; BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = iter->real_pos = b->key.k.p; iter->trans->iters_sorted = false; - - bch2_btree_iter_verify(iter); iter->should_be_locked = true; +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return b; } struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { - struct btree *b; + struct btree *b = NULL; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + EBUG_ON(iter->cached); bch2_btree_iter_verify(iter); /* already got to end? 
*/ if (!btree_iter_node(iter, iter->level)) - return NULL; + goto out; bch2_trans_cond_resched(iter->trans); @@ -1579,12 +1563,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = btree_iter_traverse(iter); if (ret) - return NULL; + goto out; /* got to end? */ b = btree_iter_node(iter, iter->level); if (!b) - return NULL; + goto out; if (bpos_cmp(iter->pos, b->key.k.p) < 0) { /* @@ -1601,17 +1585,21 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_btree_iter_verify(iter); ret = btree_iter_traverse(iter); - if (ret) - return NULL; + if (ret) { + b = NULL; + goto out; + } b = iter->l[iter->level].b; } - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = iter->real_pos = b->key.k.p; iter->trans->iters_sorted = false; - - bch2_btree_iter_verify(iter); iter->should_be_locked = true; +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return b; } @@ -1636,7 +1624,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p iter->should_be_locked = false; trans->iters_sorted = false; - if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + if (unlikely(iter->cached)) { btree_node_unlock(iter, 0); iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -1734,7 +1722,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->cached || iter->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1824,7 +1812,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->cached || iter->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1891,8 +1879,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS && - btree_iter_type(iter) != BTREE_ITER_CACHED); + EBUG_ON(iter->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1912,28 +1899,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - if (btree_iter_type(iter) == BTREE_ITER_CACHED || - !(iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { struct bkey_i *next_update; - struct bkey_cached *ck; next_update = btree_trans_peek_updates(iter); - switch (btree_iter_type(iter)) { - case BTREE_ITER_KEYS: + if (!iter->cached) { k = btree_iter_level_peek_all(iter, &iter->l[0]); EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); - break; - case BTREE_ITER_CACHED: - ck = (void *) iter->l[0].b; + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; EBUG_ON(iter->btree_id != ck->key.btree_id || bkey_cmp(iter->pos, ck->key.pos)); BUG_ON(!ck->valid); k = bkey_i_to_s_c(ck->k); - break; - case BTREE_ITER_NODES: - BUG(); } if (next_update && @@ -2345,14 +2325,12 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, EBUG_ON(trans->restarted); - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS) && - !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(flags & 
(BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - !btree_type_has_snapshots(btree_id)) + if (!btree_type_has_snapshots(btree_id) && + !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) @@ -2366,7 +2344,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, real_pos = bpos_nosnap_successor(pos); trans_for_each_iter(trans, iter) { - if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + if (iter->cached != (flags & BTREE_ITER_CACHED)) continue; if (iter->btree_id != btree_id) @@ -2397,9 +2375,9 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, trans->iters_live |= 1ULL << iter->idx; trans->iters_touched |= 1ULL << iter->idx; - iter->flags = flags; - - iter->snapshot = pos.snapshot; + iter->cached = flags & BTREE_ITER_CACHED; + iter->flags = flags; + iter->snapshot = pos.snapshot; /* * If the iterator has locks_want greater than requested, we explicitly @@ -2450,8 +2428,8 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, struct btree_iter *iter = __bch2_trans_get_iter(trans, btree_id, pos, locks_want, depth, - BTREE_ITER_NODES| BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_ALL_SNAPSHOTS| flags); @@ -2705,21 +2683,20 @@ int bch2_trans_exit(struct btree_trans *trans) static void __maybe_unused bch2_btree_iter_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { pr_buf(out, " l=%u %s:", _b->level, bch2_btree_ids[_b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(_b, type)); + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } #ifdef CONFIG_BCACHEFS_DEBUG -static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +static bool trans_has_locks(struct btree_trans *trans) { struct btree_iter *iter; trans_for_each_iter(trans, iter) - if (btree_iter_type(iter) != BTREE_ITER_CACHED && - iter->nodes_locked) + if (iter->nodes_locked) return true; return false; } @@ -2735,7 +2712,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_btree_nodes_locked(trans)) + if (!trans_has_locks(trans)) continue; pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); @@ -2746,7 +2723,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) pr_buf(out, " iter %u %c %s:", iter->idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + iter->cached ? 'c' : 'b', bch2_btree_ids[iter->btree_id]); bch2_bpos_to_text(out, iter->pos); pr_buf(out, "\n"); @@ -2757,7 +2734,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) btree_node_intent_locked(iter, l) ? "i" : "r", l); bch2_btree_iter_node_to_text(out, (void *) iter->l[l].b, - btree_iter_type(iter)); + iter->cached); pr_buf(out, "\n"); } } @@ -2768,7 +2745,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) iter = &trans->iters[trans->locking_iter_idx]; pr_buf(out, " locking iter %u %c l=%u %s:", trans->locking_iter_idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + iter->cached ? 
'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); @@ -2776,7 +2753,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) pr_buf(out, " node "); bch2_btree_iter_node_to_text(out, (void *) b, - btree_iter_type(iter)); + iter->cached); pr_buf(out, "\n"); } } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d288404e35c4..56dc5fbb7c91 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -176,44 +176,38 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -enum btree_iter_type { - BTREE_ITER_KEYS, - BTREE_ITER_NODES, - BTREE_ITER_CACHED, -}; - -#define BTREE_ITER_TYPE ((1 << 2) - 1) - /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_SLOTS (1 << 0) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_INTENT (1 << 1) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 4) +#define BTREE_ITER_PREFETCH (1 << 2) /* * Indicates that this iterator should not be reused until transaction commit, * either because a pending update references it or because the update depends * on that particular key being locked (e.g. by the str_hash code, for hash * table consistency) */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) +#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_NOT_EXTENTS (1 << 7) -#define BTREE_ITER_ERROR (1 << 8) -#define BTREE_ITER_CACHED_NOFILL (1 << 9) -#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -#define BTREE_ITER_WITH_UPDATES (1 << 11) +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) +#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_CACHED (1 << 7) +#define BTREE_ITER_CACHED_NOFILL (1 << 8) +#define BTREE_ITER_CACHED_NOCREATE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { @@ -256,7 +250,8 @@ struct btree_iter { struct bpos real_pos; enum btree_id btree_id:4; - enum btree_iter_uptodate uptodate:3; + bool cached:1; + enum btree_iter_uptodate uptodate:2; /* * True if we've returned a key (and thus are expected to keep it * locked), false after set_pos - for avoiding spurious transaction @@ -282,17 +277,6 @@ struct btree_iter { struct bkey k; }; -static inline enum btree_iter_type -btree_iter_type(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_TYPE; -} - -static inline bool btree_iter_is_cached(const struct btree_iter *iter) -{ - return btree_iter_type(iter) == BTREE_ITER_CACHED; -} - static inline struct btree_iter_level *iter_l(struct btree_iter *iter) { return iter->l + iter->level; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3aa2ca82a62d..3a59f10e0b87 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -56,7 +56,7 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, bch2_btree_node_lock_write(trans, iter, b); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) + if (iter->cached) return; if (unlikely(btree_node_just_written(b)) && @@ -509,10 
+509,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) { /* - * peek_slot() doesn't work on a BTREE_ITER_NODES iter; those - * iterator types should probably go away + * peek_slot() doesn't yet work on iterators that point to + * interior nodes: */ - if (btree_iter_type(i->iter) != BTREE_ITER_KEYS) + if (i->cached || i->level) continue; old = bch2_btree_iter_peek_slot(i->iter); @@ -1005,7 +1005,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, - .cached = btree_iter_is_cached(iter), + .cached = iter->cached, .iter = iter, .k = k, .ip_allocated = _RET_IP_, -- cgit From cab8e233734d89cd3c857ee42fe26e49f57d5e75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Sep 2021 00:50:18 -0400 Subject: bcachefs: Add an assertion for removing btree nodes from cache Chasing a bug that has something to do with the btree node cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 ++- fs/bcachefs/btree_update_interior.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 354c75f59730..c94ed4da1ca4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -129,7 +129,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + BUG_ON(ret); /* Cause future lookups for this node to fail: */ b->hash_val = 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b115b9246880..652f08dea804 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -153,8 +153,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) clear_btree_node_noevict(b); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); @@ -170,7 +168,10 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, BUG_ON(iter->l[b->c.level].b == b); six_lock_write(&b->c.lock, NULL, NULL); + + bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); + six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } -- cgit From fbf14104dabe91e6c3b5544f9289eebc94a52c93 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Sep 2021 18:06:01 -0400 Subject: bcachefs: Improve an error message When we detect an invalid key being inserted, we should print what code was doing the update. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3a59f10e0b87..5e57ff5a5ceb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -577,7 +577,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); + bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", + buf, (void *) trans->ip, + (void *) i->ip_allocated, invalid); bch2_fatal_error(c); } btree_insert_entry_checks(trans, i); -- cgit From 8f54337dc6825f323f7761c182d98efdd180ce70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Sep 2021 17:32:42 -0400 Subject: bcachefs: Fix initialization of bch_write_op.nonce If an extent ends up with a replica that is encrypted an a replica that isn't encrypted (due the user changing options), and then copygc/rebalance moves one of the replicas by reading from the unencrypted replica, we had a bug where we wouldn't correctly initialize op->nonce - for each crc field in an extent, crc.offset + crc.nonce must be equal. This patch fixes that by moving op.nonce initialization to bch2_migrate_write_init. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c804af8b81de..3c2e566beb2d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -220,11 +220,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) m->op.crc = rbio->pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { - m->op.nonce = m->op.crc.nonce + m->op.crc.offset; - m->op.csum_type = m->op.crc.csum_type; - } - if (m->data_cmd == DATA_REWRITE) bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); } @@ -239,6 +234,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; struct extent_ptr_decoded p; int ret; @@ -259,6 +255,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; + /* + * op->csum_type is normally initialized from the fs/file's current + * options - but if an extent is encrypted, we require that it stays + * encrypted: + */ + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (bch2_csum_type_is_encryption(crc.csum_type)) { + m->op.nonce = crc.nonce + crc.offset; + m->op.csum_type = crc.csum_type; + break; + } + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -- cgit From 67e0dd8f0d8b4bf09098c4692abcb43a20089dff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 15:18:31 -0400 Subject: bcachefs: btree_path This splits btree_iter into two components: btree_iter is now the externally visible componont, and it points to a btree_path which is now reference counted. This means we no longer have to clone iterators up front if they might be mutated - btree_path can be shared by multiple iterators, and cloned if an iterator would mutate a shared btree_path. 
This will help us use iterators more efficiently, as well as slimming down the main long lived state in btree_trans, and significantly cleans up the logic for iterator lifetimes. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 41 +- fs/bcachefs/alloc_background.c | 55 +- fs/bcachefs/bcachefs.h | 8 +- fs/bcachefs/bset.c | 4 +- fs/bcachefs/btree_cache.c | 36 +- fs/bcachefs/btree_cache.h | 7 +- fs/bcachefs/btree_gc.c | 36 +- fs/bcachefs/btree_iter.c | 2068 ++++++++++++++++++----------------- fs/bcachefs/btree_iter.h | 265 ++--- fs/bcachefs/btree_key_cache.c | 135 ++- fs/bcachefs/btree_key_cache.h | 5 +- fs/bcachefs/btree_locking.h | 117 +- fs/bcachefs/btree_types.h | 94 +- fs/bcachefs/btree_update.h | 13 +- fs/bcachefs/btree_update_interior.c | 217 ++-- fs/bcachefs/btree_update_interior.h | 20 +- fs/bcachefs/btree_update_leaf.c | 216 ++-- fs/bcachefs/buckets.c | 177 +-- fs/bcachefs/buckets.h | 6 +- fs/bcachefs/debug.c | 32 +- fs/bcachefs/dirent.c | 77 +- fs/bcachefs/dirent.h | 3 +- fs/bcachefs/ec.c | 76 +- fs/bcachefs/extent_update.c | 10 +- fs/bcachefs/extents.c | 4 +- fs/bcachefs/fs-common.c | 113 +- fs/bcachefs/fs-io.c | 82 +- fs/bcachefs/fs.c | 44 +- fs/bcachefs/fsck.c | 159 +-- fs/bcachefs/inode.c | 61 +- fs/bcachefs/inode.h | 8 +- fs/bcachefs/io.c | 98 +- fs/bcachefs/journal_seq_blacklist.c | 4 +- fs/bcachefs/migrate.c | 26 +- fs/bcachefs/move.c | 62 +- fs/bcachefs/quota.c | 20 +- fs/bcachefs/recovery.c | 34 +- fs/bcachefs/reflink.c | 76 +- fs/bcachefs/str_hash.h | 65 +- fs/bcachefs/super.c | 8 +- fs/bcachefs/sysfs.c | 3 +- fs/bcachefs/tests.c | 189 ++-- fs/bcachefs/trace.h | 4 +- fs/bcachefs/xattr.c | 26 +- 44 files changed, 2385 insertions(+), 2419 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 39ac6d2e178d..93b78e4e6e0d 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -219,7 +219,7 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; @@ -229,20 +229,19 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, retry: bch2_trans_begin(&trans); - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, &hash, inode->v.i_ino, &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); - if (IS_ERR(iter)) { - if (PTR_ERR(iter) == -EINTR) + if (ret) { + if (ret == -EINTR) goto retry; - - if (PTR_ERR(iter) != -ENOENT) - acl = ERR_CAST(iter); + if (ret != -ENOENT) + acl = ERR_PTR(ret); goto out; } - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) { acl = ERR_PTR(ret); @@ -255,8 +254,8 @@ retry: if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); - bch2_trans_iter_put(&trans, iter); out: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return acl; } @@ -298,7 +297,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct bch_hash_info hash_info; struct posix_acl *acl; @@ -311,9 +310,8 @@ retry: bch2_trans_begin(&trans); acl = _acl; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - 
BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -334,11 +332,11 @@ retry: inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, 0); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; @@ -362,22 +360,21 @@ int bch2_acl_chmod(struct btree_trans *trans, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; struct bkey_s_c k; int ret; - iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash_info, inode->bi_inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret == -ENOENT ? 0 : ret; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); xattr = bkey_s_c_to_xattr(k); if (ret) goto err; @@ -398,12 +395,12 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } - new->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + new->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); if (!IS_ERR_OR_NULL(acl)) kfree(acl); return ret; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 932a8176dff7..54fbfb22d671 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -353,32 +353,32 @@ err: int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bch_dev *ca; unsigned i; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(iter, + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, ca->mi.first_bucket)); - while (iter->pos.offset < ca->mi.nbuckets) { + while (iter.pos.offset < ca->mi.nbuckets) { bch2_trans_cond_resched(&trans); - ret = bch2_alloc_write_key(&trans, iter, flags); + ret = bch2_alloc_write_key(&trans, &iter, flags); if (ret) { percpu_ref_put(&ca->ref); goto err; } - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); } } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -390,18 +390,18 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - struct btree_iter *iter; + struct btree_iter iter; struct bucket *g; struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, 
bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -412,7 +412,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); time = rw == READ ? &u.read_time : &u.write_time; @@ -423,10 +423,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, *time = now; bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, 0) ?: + ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -695,27 +695,28 @@ static int bucket_invalidate_btree(struct btree_trans *trans, struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - struct btree_iter *iter = - bch2_trans_get_iter(trans, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); + struct btree_iter iter; int ret; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + a = bch2_trans_kmalloc(trans, sizeof(*a)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto err; - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto err; percpu_down_read(&c->mark_lock); g = bucket(ca, b); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(iter, g, m); + u = alloc_mem_to_key(&iter, g, m); percpu_up_read(&c->mark_lock); u.gen++; @@ -726,10 +727,10 @@ static int bucket_invalidate_btree(struct btree_trans *trans, u.write_time = atomic64_read(&c->io_clock[WRITE].now); bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, + ret = bch2_trans_update(trans, &iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e2aac1da18ae..114ae77a8a02 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -558,8 +558,8 @@ struct journal_keys { u64 journal_seq_base; }; -struct btree_iter_buf { - struct btree_iter *iter; +struct btree_path_buf { + struct btree_path *path; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) @@ -667,9 +667,9 @@ struct bch_fs { /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t btree_iters_pool; + mempool_t btree_paths_pool; mempool_t btree_trans_mem_pool; - struct btree_iter_buf __percpu *btree_iters_bufs; + struct btree_path_buf __percpu *btree_paths_bufs; struct srcu_struct btree_trans_barrier; diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 9484f28be6de..2e0ad3a4fa67 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -185,9 +185,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, return; /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, set) { + BUG_ON(set->k > set->end); btree_node_iter_for_each(iter, s2) BUG_ON(set != s2 && set->end == s2->end); + } /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c94ed4da1ca4..d45218d5fd35 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c 
@@ -641,7 +641,7 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level, @@ -657,7 +657,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (trans && !bch2_btree_node_relock(trans, iter, level + 1)) { + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -699,7 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (trans && (!bch2_trans_relock(trans) || - !bch2_btree_iter_relock_intent(trans, iter))) { + !bch2_btree_path_relock_intent(trans, path))) { BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } @@ -763,7 +763,7 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) * The btree node will have either a read or a write lock held, depending on * the @write parameter. */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter, +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsigned long trace_ip) @@ -788,7 +788,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, trans, iter, k, iter->btree_id, + b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, level, lock_type, true); /* We raced and found the btree node in the cache */ @@ -827,10 +827,10 @@ lock_node: * the parent was modified, when the pointer to the node we want * was removed - and we'll bail out: */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); - if (!btree_node_lock(trans, iter, b, k->k.p, level, lock_type, + if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { if (!trans->restarted) goto retry; @@ -841,13 +841,13 @@ lock_node: b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, iter, level + 1)) + if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; trace_trans_restart_btree_node_reused(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); + path->btree_id, + &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -862,12 +862,12 @@ lock_node: bch2_btree_node_wait_on_read(b); /* - * should_be_locked is not set on this iterator yet, so we need - * to relock it specifically: + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: */ if (trans && (!bch2_trans_relock(trans) || - !bch2_btree_iter_relock_intent(trans, iter))) { + !bch2_btree_path_relock_intent(trans, path))) { BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } @@ -895,7 +895,7 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(b->c.btree_id != path->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); @@ -986,21 +986,21 @@ out: int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { struct 
btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(iter, level + 1)); + BUG_ON(trans && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return 0; - b = bch2_btree_node_fill(c, trans, iter, k, btree_id, + b = bch2_btree_node_fill(c, trans, path, k, btree_id, level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 3b671cf0056d..2f6e0ea87616 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -20,16 +20,15 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, - struct btree_iter *, const struct bkey_i *, - enum btree_id, unsigned); +int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, + const struct bkey_i *, enum btree_id, unsigned); void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4a3f39a619a1..66367ab9f20a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -775,7 +775,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bool initial, bool metadata_only) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; unsigned depth = metadata_only ? 1 : bch2_expensive_debug_checks ? 
0 @@ -800,13 +800,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!initial) { if (max_stale > 64) - bch2_btree_node_rewrite(&trans, iter, + bch2_btree_node_rewrite(&trans, &iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!bch2_btree_gc_rewrite_disabled && (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(&trans, iter, + bch2_btree_node_rewrite(&trans, &iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); @@ -814,7 +814,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_trans_cond_resched(&trans); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -1414,7 +1414,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bool metadata_only) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; size_t idx = 0; @@ -1480,7 +1480,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } } fsck_err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); out: genradix_free(&c->reflink_gc_table); @@ -1512,7 +1512,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bool metadata_only) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; int ret; @@ -1547,7 +1547,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, r->size = k.k->size; r->refcount = 0; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; @@ -1722,7 +1722,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0, commit_err = 0; @@ -1730,13 +1730,13 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); while ((bch2_trans_begin(&trans), - k = bch2_btree_iter_peek(iter)).k) { + k = bch2_btree_iter_peek(&iter)).k) { ret = bkey_err(k); if (ret == -EINTR) @@ -1744,7 +1744,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) if (ret) break; - c->gc_gens_pos = iter->pos; + c->gc_gens_pos = iter.pos; if (gc_btree_gens_key(c, k) && !commit_err) { bch2_bkey_buf_reassemble(&sk, c, k); @@ -1752,7 +1752,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) commit_err = - bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_update(&trans, &iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOWAIT| BTREE_INSERT_NOFAIL); @@ -1762,9 +1762,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) } } - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7acec1e6db3d..06379f3e40a6 100644 --- 
a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -17,21 +17,30 @@ #include -static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); -static inline void btree_trans_sort_iters(struct btree_trans *); -static struct btree_iter *btree_iter_child_alloc(struct btree_trans *, - struct btree_iter *, unsigned long); -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, - struct btree_iter *); -static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *); +static inline void btree_trans_sort_paths(struct btree_trans *); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, + struct bpos r_pos, + unsigned r_level) { - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->cached, r->cached) ?: - bkey_cmp(l->real_pos, r->real_pos) ?: - -cmp_int(l->level, r->level); + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int(l->cached, r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); +} + +static inline int btree_path_cmp(const struct btree_path *l, + const struct btree_path *r) +{ + return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); } static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) @@ -60,10 +69,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos return p; } -static inline bool is_btree_node(struct btree_iter *iter, unsigned l) +static inline bool is_btree_node(struct btree_path *path, unsigned l) { return l < BTREE_MAX_DEPTH && - (unsigned long) iter->l[l].b >= 128; + (unsigned long) path->l[l].b >= 128; } static inline struct bpos btree_iter_search_key(struct btree_iter *iter) @@ -76,42 +85,42 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) return pos; } -static inline bool btree_iter_pos_before_node(struct btree_iter *iter, +static inline bool btree_path_pos_before_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(iter->real_pos, b->data->min_key) < 0; + return bpos_cmp(path->pos, b->data->min_key) < 0; } -static inline bool btree_iter_pos_after_node(struct btree_iter *iter, +static inline bool btree_path_pos_after_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(b->key.k.p, iter->real_pos) < 0; + return bpos_cmp(b->key.k.p, path->pos) < 0; } -static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +static inline bool btree_path_pos_in_node(struct btree_path *path, struct btree *b) { - return iter->btree_id == b->c.btree_id && - !btree_iter_pos_before_node(iter, b) && - !btree_iter_pos_after_node(iter, b); + return path->btree_id == b->c.btree_id && + !btree_path_pos_before_node(path, b) && + !btree_path_pos_after_node(path, b); } /* Btree node locking: */ void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) + struct btree_path *path, struct btree *b) { - bch2_btree_node_unlock_write_inlined(trans, iter, b); + bch2_btree_node_unlock_write_inlined(trans, path, b); } void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - struct btree_iter 
*iter; + struct btree_path *linked; unsigned readers = 0; - trans_for_each_iter(trans, iter) - if (iter->l[b->c.level].b == b && - btree_node_read_locked(iter, b->c.level)) + trans_for_each_path(trans, linked) + if (linked->l[b->c.level].b == b && + btree_node_read_locked(linked, b->c.level)) readers++; /* @@ -136,21 +145,21 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) } bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_iter *iter, unsigned level) + struct btree_path *path, unsigned level) { - struct btree *b = btree_iter_node(iter, level); - int want = __btree_lock_want(iter, level); + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); - if (!is_btree_node(iter, level)) + if (!is_btree_node(path, level)) return false; if (race_fault()) return false; - if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || - (btree_node_lock_seq_matches(iter, b, level) && + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(iter, level, want); + mark_btree_node_locked(path, level, want); return true; } else { return false; @@ -158,88 +167,88 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, } static bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_iter *iter, unsigned level) + struct btree_path *path, unsigned level) { - struct btree *b = iter->l[level].b; + struct btree *b = path->l[level].b; - EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); + EBUG_ON(btree_lock_want(path, level) != BTREE_NODE_INTENT_LOCKED); - if (!is_btree_node(iter, level)) + if (!is_btree_node(path, level)) return false; - if (btree_node_intent_locked(iter, level)) + if (btree_node_intent_locked(path, level)) return true; if (race_fault()) return false; - if (btree_node_locked(iter, level) + if (btree_node_locked(path, level) ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) goto success; - if (btree_node_lock_seq_matches(iter, b, level) && + if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(iter, level); + btree_node_unlock(path, level); goto success; } return false; success: - mark_btree_node_intent_locked(iter, level); + mark_btree_node_intent_locked(path, level); return true; } -static inline bool btree_iter_get_locks(struct btree_trans *trans, - struct btree_iter *iter, +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, bool upgrade, unsigned long trace_ip) { - unsigned l = iter->level; + unsigned l = path->level; int fail_idx = -1; do { - if (!btree_iter_node(iter, l)) + if (!btree_path_node(path, l)) break; if (!(upgrade - ? bch2_btree_node_upgrade(trans, iter, l) - : bch2_btree_node_relock(trans, iter, l))) { + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) { (upgrade ? trace_node_upgrade_fail : trace_node_relock_fail)(trans->ip, trace_ip, - iter->cached, - iter->btree_id, &iter->real_pos, - l, iter->l[l].lock_seq, - is_btree_node(iter, l) + path->cached, + path->btree_id, &path->pos, + l, path->l[l].lock_seq, + is_btree_node(path, l) ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? 
iter->l[l].b->c.lock.state.seq + : (unsigned long) path->l[l].b, + is_btree_node(path, l) + ? path->l[l].b->c.lock.state.seq : 0); fail_idx = l; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } l++; - } while (l < iter->locks_want); + } while (l < path->locks_want); /* * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ while (fail_idx >= 0) { - btree_node_unlock(iter, fail_idx); - iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_node_unlock(path, fail_idx); + path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; --fail_idx; } - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - iter->uptodate = BTREE_ITER_UPTODATE; + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; bch2_trans_verify_locks(trans); - return iter->uptodate < BTREE_ITER_NEED_RELOCK; + return path->uptodate < BTREE_ITER_NEED_RELOCK; } static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, @@ -252,19 +261,20 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, /* Slowpath: */ bool __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, struct bpos pos, unsigned level, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_iter *linked, *deadlock_iter = NULL; + struct btree_path *linked, *deadlock_path = NULL; u64 start_time = local_clock(); unsigned reason = 9; bool ret; /* Check if it's safe to block: */ - trans_for_each_iter(trans, linked) { + trans_for_each_path(trans, linked) { if (!linked->nodes_locked) continue; @@ -282,25 +292,25 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - deadlock_iter = linked; + deadlock_path = linked; reason = 1; } - if (linked->btree_id != iter->btree_id) { - if (linked->btree_id > iter->btree_id) { - deadlock_iter = linked; + if (linked->btree_id != path->btree_id) { + if (linked->btree_id > path->btree_id) { + deadlock_path = linked; reason = 3; } continue; } /* - * Within the same btree, cached iterators come before non - * cached iterators: + * Within the same btree, cached paths come before non + * cached paths: */ - if (linked->cached != iter->cached) { - if (iter->cached) { - deadlock_iter = linked; + if (linked->cached != path->cached) { + if (path->cached) { + deadlock_path = linked; reason = 4; } continue; @@ -308,11 +318,11 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, /* * Interior nodes must be locked before their descendants: if - * another iterator has possible descendants locked of the node + * another path has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - deadlock_iter = linked; + deadlock_path = linked; reason = 5; } @@ -320,19 +330,19 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, linked->cached)) <= 0) { - deadlock_iter = linked; + deadlock_path = linked; reason = 7; } } - if (unlikely(deadlock_iter)) { + if (unlikely(deadlock_path)) { 
trace_trans_restart_would_deadlock(trans->ip, ip, trans->in_traverse_all, reason, - deadlock_iter->btree_id, - deadlock_iter->cached, - &deadlock_iter->real_pos, - iter->btree_id, - iter->cached, + deadlock_path->btree_id, + deadlock_path->cached, + &deadlock_path->pos, + path->btree_id, + path->cached, &pos); btree_trans_restart(trans); return false; @@ -342,9 +352,9 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, return true; #ifdef CONFIG_BCACHEFS_DEBUG - trans->locking_iter_idx = iter->idx; + trans->locking_path_idx = path->idx; trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; + trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking = b; #endif @@ -363,54 +373,57 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + +static void bch2_btree_path_verify_locks(struct btree_path *path) { unsigned l; - for (l = 0; btree_iter_node(iter, l); l++) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && - !btree_node_locked(iter, l)) + for (l = 0; btree_path_node(path, l); l++) { + if (path->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(path, l)) continue; - BUG_ON(btree_lock_want(iter, l) != - btree_node_locked_type(iter, l)); + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); } } void bch2_trans_verify_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_verify_locks(iter); + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); } #else -static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} #endif +/* Btree path locking: */ + /* * Only for btree_cache.c - only relocks intent locks */ -bool bch2_btree_iter_relock_intent(struct btree_trans *trans, - struct btree_iter *iter) +bool bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) { unsigned l; - for (l = iter->level; - l < iter->locks_want && btree_iter_node(iter, l); + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); l++) { - if (!bch2_btree_node_relock(trans, iter, l)) { + if (!bch2_btree_node_relock(trans, path, l)) { trace_node_relock_fail(trans->ip, _RET_IP_, - iter->cached, - iter->btree_id, &iter->real_pos, - l, iter->l[l].lock_seq, - is_btree_node(iter, l) + path->cached, + path->btree_id, &path->pos, + l, path->l[l].lock_seq, + is_btree_node(path, l) ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? iter->l[l].b->c.lock.state.seq + : (unsigned long) path->l[l].b, + is_btree_node(path, l) + ? 
path->l[l].b->c.lock.state.seq : 0); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); btree_trans_restart(trans); return false; } @@ -420,27 +433,27 @@ bool bch2_btree_iter_relock_intent(struct btree_trans *trans, } __flatten -static bool bch2_btree_iter_relock(struct btree_trans *trans, - struct btree_iter *iter, unsigned long trace_ip) +static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) { - bool ret = btree_iter_get_locks(trans, iter, false, trace_ip); + bool ret = btree_path_get_locks(trans, path, false, trace_ip); if (!ret) btree_trans_restart(trans); return ret; } -bool __bch2_btree_iter_upgrade(struct btree_trans *trans, - struct btree_iter *iter, +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { - struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->locks_want >= new_locks_want); + EBUG_ON(path->locks_want >= new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - if (btree_iter_get_locks(trans, iter, true, _THIS_IP_)) + if (btree_path_get_locks(trans, path, true, _THIS_IP_)) return true; /* @@ -448,7 +461,7 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, * iterators in the btree_trans here. * * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_iter_traverse() + * calling get_locks() is sufficient to make bch2_btree_path_traverse() * get the locks we want on transaction restart. * * But if this iterator was a clone, on transaction restart what we did @@ -460,75 +473,67 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, * * The code below used to be needed to ensure ancestor nodes get locked * before interior nodes - now that's handled by - * bch2_btree_iter_traverse_all(). + * bch2_btree_path_traverse_all(). 
*/ - trans_for_each_iter(trans, linked) - if (linked != iter && - linked->cached == iter->cached && - linked->btree_id == iter->btree_id && + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(trans, linked, true, _THIS_IP_); + btree_path_get_locks(trans, linked, true, _THIS_IP_); } - if (iter->should_be_locked) - btree_trans_restart(trans); return false; } -void __bch2_btree_iter_downgrade(struct btree_iter *iter, +void __bch2_btree_path_downgrade(struct btree_path *path, unsigned new_locks_want) { unsigned l; - EBUG_ON(iter->locks_want < new_locks_want); + EBUG_ON(path->locks_want < new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); + while (path->nodes_locked && + (l = __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(path, l); } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + path->nodes_intent_locked ^= 1 << l; } break; } } - bch2_btree_iter_verify_locks(iter); + bch2_btree_path_verify_locks(path); } void bch2_trans_downgrade(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_downgrade(iter); + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(path); } /* Btree transaction locking: */ -static inline bool btree_iter_should_be_locked(struct btree_iter *iter) -{ - return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || - iter->should_be_locked; -} - bool bch2_trans_relock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; if (unlikely(trans->restarted)) return false; - trans_for_each_iter(trans, iter) - if (btree_iter_should_be_locked(iter) && - !bch2_btree_iter_relock(trans, iter, _RET_IP_)) { + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { trace_trans_restart_relock(trans->ip, _RET_IP_, - iter->btree_id, &iter->real_pos); + path->btree_id, &path->pos); BUG_ON(!trans->restarted); return false; } @@ -537,37 +542,37 @@ bool bch2_trans_relock(struct btree_trans *trans) void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - __bch2_btree_iter_unlock(iter); + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); } /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_cached(struct btree_trans *trans, - struct btree_iter *iter) +static void bch2_btree_path_verify_cached(struct btree_trans *trans, + struct btree_path *path) { struct bkey_cached *ck; - bool locked = btree_node_locked(iter, 0); + bool locked = btree_node_locked(path, 0); - if (!bch2_btree_node_relock(trans, iter, 0)) + if (!bch2_btree_node_relock(trans, path, 0)) return; - ck = (void *) iter->l[0].b; - BUG_ON(ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)); + ck = (void *) path->l[0].b; + BUG_ON(ck->key.btree_id != path->btree_id || + bkey_cmp(ck->key.pos, path->pos)); if (!locked) - 
btree_node_unlock(iter, 0); + btree_node_unlock(path, 0); } -static void bch2_btree_iter_verify_level(struct btree_trans *trans, - struct btree_iter *iter, unsigned level) +static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree_iter_level *l; + struct btree_path_level *l; struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; @@ -577,25 +582,23 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, if (!bch2_debug_check_iterators) return; - l = &iter->l[level]; + l = &path->l[level]; tmp = l->iter; - locked = btree_node_locked(iter, level); + locked = btree_node_locked(path, level); - if (iter->cached) { + if (path->cached) { if (!level) - bch2_btree_iter_verify_cached(trans, iter); + bch2_btree_path_verify_cached(trans, path); return; } - BUG_ON(iter->level < iter->min_depth); - - if (!btree_iter_node(iter, level)) + if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(trans, iter, level)) + if (!bch2_btree_node_relock(trans, path, level)) return; - BUG_ON(!btree_iter_pos_in_node(iter, l->b)); + BUG_ON(!btree_path_pos_in_node(path, l->b)); bch2_btree_node_iter_verify(&l->iter, l->b); @@ -606,29 +609,29 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - p = level || btree_node_type_is_extents(iter->btree_id) + p = level || btree_node_type_is_extents(path->btree_id) ? bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { msg = "after"; goto err; } if (!locked) - btree_node_unlock(iter, level); + btree_node_unlock(path, level); return; err: strcpy(buf2, "(none)"); strcpy(buf3, "(none)"); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bpos_to_text(&PBUF(buf1), path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); @@ -640,20 +643,51 @@ err: bch2_bkey_to_text(&PBUF(buf3), &uk); } - panic("iterator should be %s key at level %u:\n" - "iter pos %s\n" + panic("path should be %s key at level %u:\n" + "path pos %s\n" "prev key %s\n" "cur key %s\n", msg, level, buf1, buf2, buf3); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; unsigned i; - EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON(path->btree_id >= BTREE_ID_NR); + + for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(c->btree_roots[path->btree_id].b->c.level > i); + break; + } + + bch2_btree_path_verify_level(trans, path, i); + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_verify_paths(struct btree_trans *trans) +{ + struct btree_path *path; + + if (!bch2_debug_check_iterators) + return; + + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); +} + +static void bch2_btree_iter_verify(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + BUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); @@ -665,16 +699,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - for (i = 0; i < (!iter->cached ? BTREE_MAX_DEPTH : 1); i++) { - if (!iter->l[i].b) { - BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); - break; - } - - bch2_btree_iter_verify_level(trans, iter, i); - } - - bch2_btree_iter_verify_locks(iter); + bch2_btree_path_verify(trans, iter->path); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -686,26 +711,19 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_cmp(iter->pos, iter->k.p) > 0); } -void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) -{ - struct btree_iter *iter; - - if (!bch2_debug_check_iterators) - return; - - trans_for_each_iter_with_node(trans, b, iter) - bch2_btree_iter_verify_level(trans, iter, b->c.level); -} - #else -static inline void bch2_btree_iter_verify_level(struct btree_trans *trans, - struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned l) {} +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} #endif +/* Btree path: fixups after btree updates */ + static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, struct btree *b, struct bset_tree *t, @@ -723,38 +741,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } -static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +static void __bch2_btree_path_fix_key_modified(struct btree_path *path, struct btree *b, struct bkey_packed *where) { - struct btree_iter_level *l = &iter->l[b->c.level]; + struct btree_path_level *l = &path->l[b->c.level]; if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); } -void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *b, struct bkey_packed *where) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter_with_node(trans, b, iter) { - __bch2_btree_iter_fix_key_modified(iter, b, where); - bch2_btree_iter_verify_level(trans, iter, b->c.level); + trans_for_each_path_with_node(trans, b, path) { + __bch2_btree_path_fix_key_modified(path, b, where); + bch2_btree_path_verify_level(trans, 
path, b->c.level); } } -static void __bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) +static void __bch2_btree_node_iter_fix(struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { const struct bkey_packed *end = btree_bkey_last(b, t); struct btree_node_iter_set *set; @@ -772,7 +790,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -787,7 +805,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -814,7 +832,7 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && (b->c.level || - btree_node_type_is_extents(iter->btree_id))) { + btree_node_type_is_extents(path->btree_id))) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; @@ -842,7 +860,7 @@ fixup_done: } void bch2_btree_node_iter_fix(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_packed *where, @@ -850,26 +868,28 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, unsigned new_u64s) { struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); - struct btree_iter *linked; + struct btree_path *linked; - if (node_iter != &iter->l[b->c.level].iter) { - __bch2_btree_node_iter_fix(iter, b, node_iter, t, + if (node_iter != &path->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_iter_with_node(trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); - bch2_btree_iter_verify_level(trans, linked, b->c.level); + bch2_btree_path_verify_level(trans, linked, b->c.level); } } -static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, - struct btree_iter_level *l, +/* Btree path level: pointer to a particular btree node and node iter */ + +static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct btree_path_level *l, struct bkey *u, struct bkey_packed *k) { @@ -894,50 +914,54 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, * assertion here: */ if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + bch2_bkey_debugcheck(c, l->b, ret); return ret; } -/* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u) { - return __btree_iter_unpack(iter, l, &iter->k, + return __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c 
btree_iter_level_peek(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->key.k.p; - iter->trans->iters_sorted = false; + path->pos = k.k ? k.k->p : l->b->key.k.p; + trans->paths_sorted = false; return k; } -static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_prev(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->data->min_key; - iter->trans->iters_sorted = false; + path->pos = k.k ? k.k->p : l->b->data->min_key; + trans->paths_sorted = false; return k; } -static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, - struct btree_iter_level *l, +static inline bool btree_path_advance_to_pos(struct btree_path *path, + struct btree_path_level *l, int max_advance) { struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -951,10 +975,10 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, /* * Verify that iterator for parent node points to child node: */ -static void btree_iter_verify_new_node(struct btree_trans *trans, - struct btree_iter *iter, struct btree *b) +static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - struct btree_iter_level *l; + struct btree_path_level *l; unsigned plevel; bool parent_locked; struct bkey_packed *k; @@ -963,15 +987,15 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, return; plevel = b->c.level + 1; - if (!btree_iter_node(iter, plevel)) + if (!btree_path_node(path, plevel)) return; - parent_locked = btree_node_locked(iter, plevel); + parent_locked = btree_node_locked(path, plevel); - if (!bch2_btree_node_relock(trans, iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; - l = &iter->l[plevel]; + l = &path->l[plevel]; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (!k || bkey_deleted(k) || @@ -983,7 +1007,7 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, struct bkey uk = bkey_unpack_key(b, k); bch2_dump_btree_node(trans->c, l->b); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); @@ -991,20 +1015,20 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, "iter pos %s %s\n" "iter key %s\n" "new node %s-%s\n", - bch2_btree_ids[iter->btree_id], buf1, + bch2_btree_ids[path->btree_id], buf1, buf2, buf3, buf4); } if (!parent_locked) - btree_node_unlock(iter, b->c.level + 1); + btree_node_unlock(path, b->c.level + 1); } -static inline void __btree_iter_level_init(struct btree_iter *iter, +static 
inline void __btree_path_level_init(struct btree_path *path, unsigned level) { - struct btree_iter_level *l = &iter->l[level]; + struct btree_path_level *l = &path->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); /* * Iterators to interior nodes should always be pointed at the first non @@ -1014,22 +1038,24 @@ static inline void __btree_iter_level_init(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b); } -static inline void btree_iter_level_init(struct btree_trans *trans, - struct btree_iter *iter, +static inline void btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - BUG_ON(iter->cached); + BUG_ON(path->cached); - btree_iter_verify_new_node(trans, iter, b); + btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); - iter->l[b->c.level].lock_seq = b->c.lock.state.seq; - iter->l[b->c.level].b = b; - __btree_iter_level_init(iter, b->c.level); + path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].b = b; + __btree_path_level_init(path, b->c.level); } +/* Btree path: fixups after btree node updates: */ + /* * A btree node is being replaced - update the iterator to point to the new * node: @@ -1037,37 +1063,37 @@ static inline void btree_iter_level_init(struct btree_trans *trans, void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) { enum btree_node_locked_type t; - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (!iter->cached && - btree_iter_pos_in_node(iter, b)) { + trans_for_each_path(trans, path) + if (!path->cached && + btree_path_pos_in_node(path, b)) { /* - * bch2_trans_node_drop() has already been called - + * bch2_btree_path_node_drop() has already been called - * the old node we're replacing has already been * unlocked and the pointer invalidated */ - BUG_ON(btree_node_locked(iter, b->c.level)); + BUG_ON(btree_node_locked(path, b->c.level)); - t = btree_lock_want(iter, b->c.level); + t = btree_lock_want(path, b->c.level); if (t != BTREE_NODE_UNLOCKED) { six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(iter, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t); } - btree_iter_level_init(trans, iter, b); + btree_path_level_init(trans, path, b); } } void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) { - struct btree_iter *iter; + struct btree_path *path; unsigned level = b->c.level; - trans_for_each_iter(trans, iter) - if (iter->l[level].b == b) { - btree_node_unlock(iter, level); - iter->l[level].b = BTREE_ITER_NO_NODE_DROP; + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(path, level); + path->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -1077,12 +1103,14 @@ void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) */ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter_with_node(trans, b, iter) - __btree_iter_level_init(iter, b->c.level); + trans_for_each_path_with_node(trans, b, path) + __btree_path_level_init(path, b->c.level); } +/* Btree path: traverse, set_pos: */ + static int lock_root_check_fn(struct six_lock *lock, void *p) { struct btree *b = container_of(lock, struct btree, c.lock); @@ -1091,38 
+1119,38 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) return b == *rootp ? 0 : -1; } -static inline int btree_iter_lock_root(struct btree_trans *trans, - struct btree_iter *iter, +static inline int btree_path_lock_root(struct btree_trans *trans, + struct btree_path *path, unsigned depth_want, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; + struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; - EBUG_ON(iter->nodes_locked); + EBUG_ON(path->nodes_locked); while (1) { b = READ_ONCE(*rootp); - iter->level = READ_ONCE(b->c.level); + path->level = READ_ONCE(b->c.level); - if (unlikely(iter->level < depth_want)) { + if (unlikely(path->level < depth_want)) { /* * the root is at a lower depth than the depth we want: * got to the end of the btree, or we're walking nodes * greater than some depth and there are no nodes >= * that depth */ - iter->level = depth_want; - for (i = iter->level; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; + path->level = depth_want; + for (i = path->level; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; return 1; } - lock_type = __btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(trans, iter, b, SPOS_MAX, - iter->level, lock_type, + lock_type = __btree_lock_want(path, path->level); + if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, + path->level, lock_type, lock_root_check_fn, rootp, trace_ip))) { if (trans->restarted) @@ -1131,16 +1159,16 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } if (likely(b == READ_ONCE(*rootp) && - b->c.level == iter->level && + b->c.level == path->level && !race_fault())) { - for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; - iter->l[iter->level].b = b; - for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; - - mark_btree_node_locked(iter, iter->level, lock_type); - btree_iter_level_init(trans, iter, b); + for (i = 0; i < path->level; i++) + path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[path->level].b = b; + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + + mark_btree_node_locked(path, path->level, lock_type); + btree_path_level_init(trans, path, b); return 0; } @@ -1149,23 +1177,23 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } noinline -static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter) +static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_path_level *l = path_l(path); struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) - ? (iter->level > 1 ? 0 : 2) - : (iter->level > 1 ? 1 : 16); - bool was_locked = btree_node_locked(iter, iter->level); + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(path, path->level); int ret = 0; bch2_bkey_buf_init(&tmp); while (nr && !ret) { - if (!bch2_btree_node_relock(trans, iter, iter->level)) + if (!bch2_btree_node_relock(trans, path, path->level)) break; bch2_btree_node_iter_advance(&node_iter, l->b); @@ -1174,27 +1202,27 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(c, trans, iter, tmp.k, - iter->btree_id, iter->level - 1); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); } if (!was_locked) - btree_node_unlock(iter, iter->level); + btree_node_unlock(path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; } static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned plevel, struct btree *b) { - struct btree_iter_level *l = &iter->l[plevel]; - bool locked = btree_node_locked(iter, plevel); + struct btree_path_level *l = &path->l[plevel]; + bool locked = btree_node_locked(path, plevel); struct bkey_packed *k; struct bch_btree_ptr_v2 *bp; - if (!bch2_btree_node_relock(trans, iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -1204,60 +1232,61 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(iter, plevel); + btree_node_unlock(path, plevel); } -static __always_inline int btree_iter_down(struct btree_trans *trans, - struct btree_iter *iter, +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_path_level *l = path_l(path); struct btree *b; - unsigned level = iter->level - 1; - enum six_lock_type lock_type = __btree_lock_want(iter, level); + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); struct bkey_buf tmp; int ret; - EBUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) goto err; - mark_btree_node_locked(iter, level, lock_type); - btree_iter_level_init(trans, iter, b); + mark_btree_node_locked(path, level, lock_type); + btree_path_level_init(trans, path, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && unlikely(b != btree_node_mem_ptr(tmp.k))) - btree_node_mem_ptr_set(trans, iter, level + 1, b); + btree_node_mem_ptr_set(trans, path, level + 1, b); - if (iter->flags & BTREE_ITER_PREFETCH) - ret = btree_iter_prefetch(trans, iter); + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch(trans, path); - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); - iter->level = level; + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; - bch2_btree_iter_verify_locks(iter); + bch2_btree_path_verify_locks(path); err: bch2_bkey_buf_exit(&tmp, c); return ret; } -static int btree_iter_traverse_one(struct btree_trans *, - struct btree_iter *, unsigned 
long); +static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); -static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, +static int __btree_path_traverse_all(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter *iter, *prev = NULL; + struct btree_path *path, *prev = NULL; int i; if (trans->in_traverse_all) @@ -1267,21 +1296,21 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, retry_all: trans->restarted = false; - trans_for_each_iter(trans, iter) - iter->should_be_locked = false; + trans_for_each_path(trans, path) + path->should_be_locked = false; - btree_trans_sort_iters(trans); + btree_trans_sort_paths(trans); - trans_for_each_iter_inorder_reverse(trans, iter, i) { + trans_for_each_path_inorder_reverse(trans, path, i) { if (prev) { - if (iter->btree_id == prev->btree_id && - iter->locks_want < prev->locks_want) - __bch2_btree_iter_upgrade(trans, iter, prev->locks_want); - else if (!iter->locks_want && prev->locks_want) - __bch2_btree_iter_upgrade(trans, iter, 1); + if (path->btree_id == prev->btree_id && + path->locks_want < prev->locks_want) + __bch2_btree_path_upgrade(trans, path, prev->locks_want); + else if (!path->locks_want && prev->locks_want) + __bch2_btree_path_upgrade(trans, path, 1); } - prev = iter; + prev = path; } bch2_trans_unlock(trans); @@ -1308,27 +1337,27 @@ retry_all: /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { - iter = trans->iters + trans->sorted[i]; + path = trans->paths + trans->sorted[i]; - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - ret = btree_iter_traverse_one(trans, iter, _THIS_IP_); + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); if (ret) goto retry_all; - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - if (iter->nodes_locked) + if (path->nodes_locked) i++; } /* * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() - * and relock(), relock() won't relock since iter->should_be_locked + * and relock(), relock() won't relock since path->should_be_locked * isn't set yet, which is all fine */ - trans_for_each_iter(trans, iter) - BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE); + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1338,36 +1367,36 @@ out: return ret; } -static int bch2_btree_iter_traverse_all(struct btree_trans *trans) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { - return __btree_iter_traverse_all(trans, 0, _RET_IP_); + return __btree_path_traverse_all(trans, 0, _RET_IP_); } -static inline bool btree_iter_good_node(struct btree_trans *trans, - struct btree_iter *iter, +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, unsigned l, int check_pos) { - if (!is_btree_node(iter, l) || - !bch2_btree_node_relock(trans, iter, l)) + if (!is_btree_node(path, l) || + !bch2_btree_node_relock(trans, path, l)) return false; - if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) return false; - if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) return false; return true; } -static 
inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, - struct btree_iter *iter, +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, int check_pos) { - unsigned l = iter->level; + unsigned l = path->level; - while (btree_iter_node(iter, l) && - !btree_iter_good_node(trans, iter, l, check_pos)) { - btree_node_unlock(iter, l); - iter->l[l].b = BTREE_ITER_NO_NODE_UP; + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } @@ -1383,53 +1412,54 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_trans *trans, - struct btree_iter *iter, +static int btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { - unsigned l, depth_want = iter->level; + unsigned l, depth_want = path->level; int ret = 0; /* - * Ensure we obey iter->should_be_locked: if it's set, we can't unlock - * and re-traverse the iterator without a transaction restart: + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: */ - if (iter->should_be_locked) { - ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 0 : -EINTR; + if (path->should_be_locked) { + ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; goto out; } - if (iter->cached) { - ret = bch2_btree_iter_traverse_cached(trans, iter); + if (path->cached) { + ret = bch2_btree_path_traverse_cached(trans, path, flags); goto out; } - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; - iter->level = btree_iter_up_until_good_node(trans, iter, 0); + path->level = btree_path_up_until_good_node(trans, path, 0); /* If we need intent locks, take them too: */ - for (l = iter->level + 1; - l < iter->locks_want && btree_iter_node(iter, l); + for (l = path->level + 1; + l < path->locks_want && btree_path_node(path, l); l++) - if (!bch2_btree_node_relock(trans, iter, l)) - while (iter->level <= l) { - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; - iter->level++; + if (!bch2_btree_node_relock(trans, path, l)) + while (path->level <= l) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; } /* - * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, * here it indicates that relocking the root failed - it's critical that - * btree_iter_lock_root() comes next and that it can't fail + * btree_path_lock_root() comes next and that it can't fail */ - while (iter->level > depth_want) { - ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(trans, iter, trace_ip) - : btree_iter_lock_root(trans, iter, depth_want, trace_ip); + while (path->level > depth_want) { + ret = btree_path_node(path, path->level) + ? 
btree_path_down(trans, path, flags, trace_ip) + : btree_path_lock_root(trans, path, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) { /* @@ -1440,74 +1470,405 @@ static int btree_iter_traverse_one(struct btree_trans *trans, goto out; } - __bch2_btree_iter_unlock(iter); - iter->level = depth_want; + __bch2_btree_path_unlock(path); + path->level = depth_want; - if (ret == -EIO) { - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = + if (ret == -EIO) + path->l[path->level].b = BTREE_ITER_NO_NODE_ERROR; - } else { - iter->l[iter->level].b = + else + path->l[path->level].b = BTREE_ITER_NO_NODE_DOWN; - } goto out; } } - iter->uptodate = BTREE_ITER_UPTODATE; + path->uptodate = BTREE_ITER_UPTODATE; out: BUG_ON((ret == -EINTR) != !!trans->restarted); trace_iter_traverse(trans->ip, trace_ip, - iter->cached, - iter->btree_id, &iter->real_pos, ret); - bch2_btree_iter_verify(iter); + path->cached, + path->btree_id, &path->pos, ret); + bch2_btree_path_verify(trans, path); return ret; } -static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); + +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) { - struct btree_trans *trans = iter->trans; int ret; + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(trans, iter, _RET_IP_); - if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { - ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); + btree_path_traverse_one(trans, path, flags, _RET_IP_); + if (unlikely(ret) && hweight64(trans->paths_allocated) == 1) { + ret = __btree_path_traverse_all(trans, ret, _RET_IP_); BUG_ON(ret == -EINTR); } return ret; } -/* - * Note: - * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is - * for internal btree iterator users - * - * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, - * btree_iter_traverse() does not: - */ -static inline int __must_check -btree_iter_traverse(struct btree_iter *iter) +static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i, offset = offsetof(struct btree_path, pos); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_path) - offset); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + + trans->paths_sorted = false; +} + +struct btree_path * __must_check +__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent) +{ + struct btree_path *new = btree_path_alloc(trans, path); + + btree_path_copy(trans, new, path); + __btree_path_get(new, intent); + __btree_path_put(path, intent); + path = new; + path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = _RET_IP_; +#endif + return path; +} + +static struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, int cmp) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos old_pos = path->pos; +#endif + unsigned l = path->level; + + EBUG_ON(trans->restarted); + EBUG_ON(!path->ref); + + path = bch2_btree_path_make_mut(trans, path, intent); + + path->pos = new_pos; + path->should_be_locked = false; + trans->paths_sorted = false; + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); + 
path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } + + l = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, l)) { + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). + */ + if (cmp < 0 || + !btree_path_advance_to_pos(path, &path->l[l], 8)) + __btree_path_level_init(path, l); + + /* Don't leave it locked if we're not supposed to: */ + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); + } + + if (l != path->level) + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +out: + bch2_btree_path_verify(trans, path); +#ifdef CONFIG_BCACHEFS_DEBUG + trace_path_set_pos(trans->ip, _RET_IP_, path->btree_id, + &old_pos, &new_pos, l); +#endif + return path; +} + +static inline struct btree_path * __must_check +btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent) +{ + int cmp = bpos_cmp(new_pos, path->pos); + + return cmp + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, cmp) + : path; +} + +/* Btree path: main interface: */ + +static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + next = next_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + return NULL; +} + +static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && path_l(next)->b == path_l(path)->b) + return true; + + next = next_btree_path(trans, path); + if (next && path_l(next)->b == path_l(path)->b) + return true; + + return false; +} + +static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? __bch2_btree_iter_traverse(iter) - : 0; + __bch2_btree_path_unlock(path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); } +void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + /* + * Perhaps instead we should check for duplicate paths in traverse_all: + */ + if (path->preserve && + (dup = have_path_at_pos(trans, path))) { + dup->preserve = true; + path->preserve = false; + } + + if (!path->preserve && + have_node_at_pos(trans, path)) + __bch2_path_free(trans, path); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; + struct btree_insert_entry *i; + unsigned idx; + char buf[300]; + + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, idx) + printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", + path->idx, path->ref, path->intent_ref, + path->preserve ? 
" preserve" : "", + bch2_btree_ids[path->btree_id], + (bch2_bpos_to_text(&PBUF(buf), path->pos), buf), +#ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated +#else + NULL +#endif + ); + + trans_for_each_update(trans, i) + printk(KERN_ERR "update: btree %s %s %pS\n", + bch2_btree_ids[i->btree_id], + (bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)), buf), + (void *) i->ip_allocated); +} + +static struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) +{ + struct btree_path *path; + unsigned idx; + + if (unlikely(trans->paths_allocated == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); + } + + idx = __ffs64(~trans->paths_allocated); + trans->paths_allocated |= 1ULL << idx; + + path = &trans->paths[idx]; + + path->idx = idx; + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + + btree_path_list_add(trans, pos, path); + return path; +} + +struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + bool intent) +{ + struct btree_path *path, *best = NULL; + struct bpos pos_min = POS_MIN; + int i; + + BUG_ON(trans->restarted); + + trans_for_each_path(trans, path) { + if (path->cached != cached || + path->btree_id != btree_id || + path->level != level) + continue; + + if (best) { + int cmp = bkey_cmp(bpos_diff(best->pos, pos), + bpos_diff(path->pos, pos)); + + if (cmp < 0 || + ((cmp == 0 && (path->ref || path->preserve)))) + continue; + } + + best = path; + } + + if (best) { + __btree_path_get(best, intent); + path = btree_path_set_pos(trans, best, pos, intent); + path->preserve = true; + } else { + path = btree_path_alloc(trans, NULL); + + __btree_path_get(path, intent); + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; + path->preserve = true; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; + path->locks_want = locks_want; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = _RET_IP_; +#endif + trans->paths_sorted = false; + } + + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to + * upgrade locks, we might be putting/getting the iterator again. + * Downgrading iterators only happens via bch2_trans_downgrade(), after + * a successful transaction commit. + */ + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; + btree_path_get_locks(trans, path, true, _THIS_IP_); + } + + trace_trans_get_path(_RET_IP_, trans->ip, btree_id, + &pos, locks_want, path->uptodate, + best ? &best->pos : &pos_min, + best ? best->locks_want : U8_MAX, + best ? best->uptodate : U8_MAX); + + return path; +} + +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + + struct bkey_s_c k; + + BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + if (!path->cached) { + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k = + bch2_btree_node_iter_peek_all(&l->iter, l->b); + + k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); + + if (!k.k || bpos_cmp(path->pos, k.k->p)) + goto hole; + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + + EBUG_ON(path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos)); + + /* BTREE_ITER_CACHED_NOFILL? */ + if (unlikely(!ck->valid)) + goto hole; + + k = bkey_i_to_s_c(ck->k); + } + + return k; +hole: + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +/* Btree iterators: */ + int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + iter->path = btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) return ret; - iter->should_be_locked = true; + iter->path->should_be_locked = true; return 0; } @@ -1518,23 +1879,22 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - EBUG_ON(iter->cached); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) goto out; - b = btree_iter_node(iter, iter->level); + b = btree_path_node(iter->path, iter->path->level); if (!b) goto out; BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); bkey_init(&iter->k); - iter->k.p = iter->pos = iter->real_pos = b->key.k.p; - iter->trans->iters_sorted = false; - iter->should_be_locked = true; + iter->k.p = iter->pos = b->key.k.p; + iter->path->should_be_locked = true; out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1544,29 +1904,31 @@ out: struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; struct btree *b = NULL; int ret; - EBUG_ON(iter->cached); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); /* already got to end? */ - if (!btree_iter_node(iter, iter->level)) + if (!btree_path_node(path, path->level)) goto out; - bch2_trans_cond_resched(iter->trans); + bch2_trans_cond_resched(trans); - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; - iter->level++; + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = btree_iter_traverse(iter); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) goto out; /* got to end? 
*/ - b = btree_iter_node(iter, iter->level); + b = btree_path_node(path, path->level); if (!b) goto out; @@ -1575,28 +1937,29 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); + path = iter->path = + btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT); /* Unlock to avoid screwing up our lock invariants: */ - btree_node_unlock(iter, iter->level); + btree_node_unlock(path, path->level); - iter->level = iter->min_depth; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + path->level = iter->min_depth; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) { b = NULL; goto out; } - b = iter->l[iter->level].b; + b = path->l[path->level].b; } bkey_init(&iter->k); - iter->k.p = iter->pos = iter->real_pos = b->key.k.p; - iter->trans->iters_sorted = false; - iter->should_be_locked = true; + iter->k.p = iter->pos = b->key.k.p; + iter->path->should_be_locked = true; out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1606,60 +1969,6 @@ out: /* Iterate across keys (in leaf nodes only) */ -static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) -{ - struct btree_trans *trans = iter->trans; -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos old_pos = iter->real_pos; -#endif - int cmp = bpos_cmp(new_pos, iter->real_pos); - unsigned l = iter->level; - - EBUG_ON(trans->restarted); - - if (!cmp) - goto out; - - iter->real_pos = new_pos; - iter->should_be_locked = false; - trans->iters_sorted = false; - - if (unlikely(iter->cached)) { - btree_node_unlock(iter, 0); - iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - return; - } - - l = btree_iter_up_until_good_node(trans, iter, cmp); - - if (btree_iter_node(iter, l)) { - /* - * We might have to skip over many keys, or just a few: try - * advancing the node iterator, and if we have to skip over too - * many keys just reinit it (or if we're rewinding, since that - * is expensive). 
- */ - if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) - __btree_iter_level_init(iter, l); - - /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, l); - } -out: - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - bch2_btree_iter_verify(iter); -#ifdef CONFIG_BCACHEFS_DEBUG - trace_iter_set_search_pos(trans->ip, _RET_IP_, - iter->btree_id, - &old_pos, &new_pos, l); -#endif -} - inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; @@ -1684,7 +1993,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter) +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; struct bkey_i *ret = NULL; @@ -1694,7 +2003,7 @@ static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *ite continue; if (i->btree_id > iter->btree_id) break; - if (bpos_cmp(i->k->k.p, iter->real_pos) < 0) + if (bpos_cmp(i->k->k.p, iter->path->pos) < 0) continue; if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) ret = i->k; @@ -1703,33 +2012,27 @@ static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *ite return ret; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_WITH_UPDATES - ? __btree_trans_peek_updates(iter) - : NULL; -} - /** * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; - int ret; + int ret, cmp; - EBUG_ON(iter->cached || iter->level); + EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); while (1) { - btree_iter_set_search_pos(iter, search_key); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); @@ -1738,7 +2041,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } next_update = btree_trans_peek_updates(iter); - k = btree_iter_level_peek_all(iter, l); + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); /* * In the btree, deleted keys sort before non deleted: */ if (k.k && bkey_deleted(k.k) && @@ -1750,7 +2053,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (next_update && bpos_cmp(next_update->k.p, - k.k ? k.k->p : l->b->key.k.p) <= 0) { + k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -1761,13 +2064,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) /* Advance to next key: */ search_key = bkey_successor(iter, k.k->p); - } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ - search_key = bpos_successor(l->b->key.k.p); + search_key = bpos_successor(iter->path->l[0].b->key.k.p); } else { /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); - iter->real_pos = SPOS_MAX; k = bkey_s_c_null; goto out; } @@ -1781,9 +2083,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = k.k->p; else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->real_pos = k.k->p; + + cmp = bpos_cmp(k.k->p, iter->path->pos); + if (cmp) { + iter->path->pos = k.k->p; + trans->paths_sorted = false; + } out: - iter->should_be_locked = true; + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); return k; @@ -1807,20 +2115,21 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; - struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; - EBUG_ON(iter->cached || iter->level); + EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); while (1) { - btree_iter_set_search_pos(iter, search_key); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); @@ -1828,18 +2137,20 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out; } - k = btree_iter_level_peek(iter, l); + k = btree_path_level_peek(trans, iter->path, + &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 : bkey_cmp(k.k->p, iter->pos) > 0)) - k = btree_iter_level_prev(iter, l); + k = btree_path_level_prev(trans, iter->path, + &iter->path->l[0], &iter->k); if (likely(k.k)) { break; - } else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) { + } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ - search_key = bpos_predecessor(l->b->data->min_key); + search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); } else { /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); @@ -1854,9 +2165,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; out: - iter->should_be_locked = true; + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + return k; } @@ -1879,7 +2192,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(iter->level); + EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1893,9 +2206,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } search_key = btree_iter_search_key(iter); - btree_iter_set_search_pos(iter, search_key); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1903,23 +2217,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_i *next_update; next_update = btree_trans_peek_updates(iter); - - if (!iter->cached) { - k = btree_iter_level_peek_all(iter, &iter->l[0]); - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - EBUG_ON(iter->btree_id != ck->key.btree_id || - bkey_cmp(iter->pos, ck->key.pos)); - BUG_ON(!ck->valid); - - k = bkey_i_to_s_c(ck->k); - } - if (next_update && - (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + !bpos_cmp(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); + } else { + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } if (!k.k || @@ -1934,14 +2237,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos next; if (iter->flags & BTREE_ITER_INTENT) { - struct btree_iter *child = - btree_iter_child_alloc(trans, iter, _THIS_IP_); + struct btree_iter iter2; - btree_iter_copy(trans, child, iter); - k = bch2_btree_iter_peek(child); + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek(&iter2); - if (k.k && !bkey_err(k)) - iter->k = child->k; + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; + k.k = &iter->k; + } + bch2_trans_iter_exit(trans, &iter2); } else { struct bpos pos = iter->pos; @@ -1969,9 +2274,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return k; } @@ -1992,47 +2298,26 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } -static inline void bch2_btree_iter_init(struct btree_trans *trans, - struct btree_iter *iter, enum btree_id btree_id) -{ - struct bch_fs *c = trans->c; - unsigned i; - - iter->trans = trans; - iter->uptodate = 
BTREE_ITER_NEED_TRAVERSE; - iter->btree_id = btree_id; - iter->real_pos = POS_MIN; - iter->level = 0; - iter->min_depth = 0; - iter->locks_want = 0; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = BTREE_ITER_NO_NODE_INIT; - - prefetch(c->btree_roots[btree_id].b); -} - /* new transactional stuff: */ #ifdef CONFIG_BCACHEFS_DEBUG static void btree_trans_verify_sorted_refs(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != hweight64(trans->iters_linked)); + BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); - trans_for_each_iter(trans, iter) { - BUG_ON(iter->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); + trans_for_each_path(trans, path) { + BUG_ON(path->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[path->sorted_idx] != path->idx); } for (i = 0; i < trans->nr_sorted; i++) { unsigned idx = trans->sorted[i]; - EBUG_ON(!(trans->iters_linked & (1ULL << idx))); - BUG_ON(trans->iters[idx].sorted_idx != i); + EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(trans->paths[idx].sorted_idx != i); } } #else @@ -2042,17 +2327,17 @@ static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} static void btree_trans_verify_sorted(struct btree_trans *trans) { #ifdef CONFIG_BCACHEFS_DEBUG - struct btree_iter *iter, *prev = NULL; + struct btree_path *path, *prev = NULL; unsigned i; - trans_for_each_iter_inorder(trans, iter, i) { - BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); - prev = iter; + trans_for_each_path_inorder(trans, path, i) { + BUG_ON(prev && btree_path_cmp(prev, path) > 0); + prev = path; } #endif } -static noinline void __btree_trans_sort_iters(struct btree_trans *trans) +static noinline void __btree_trans_sort_paths(struct btree_trans *trans) { int i, l = 0, r = trans->nr_sorted, inc = 1; bool swapped; @@ -2067,11 +2352,11 @@ static noinline void __btree_trans_sort_iters(struct btree_trans *trans) for (i = inc > 0 ? 
l : r - 2; i + 1 < r && i >= l; i += inc) { - if (btree_iter_cmp(trans->iters + trans->sorted[i], - trans->iters + trans->sorted[i + 1]) > 0) { + if (btree_path_cmp(trans->paths + trans->sorted[i], + trans->paths + trans->sorted[i + 1]) > 0) { swap(trans->sorted[i], trans->sorted[i + 1]); - trans->iters[trans->sorted[i]].sorted_idx = i; - trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; + trans->paths[trans->sorted[i]].sorted_idx = i; + trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; swapped = true; } } @@ -2083,246 +2368,82 @@ static noinline void __btree_trans_sort_iters(struct btree_trans *trans) inc = -inc; } while (swapped); - trans->iters_sorted = true; + trans->paths_sorted = true; btree_trans_verify_sorted(trans); } -static inline void btree_trans_sort_iters(struct btree_trans *trans) +static inline void btree_trans_sort_paths(struct btree_trans *trans) { btree_trans_verify_sorted_refs(trans); - if (trans->iters_sorted) { + if (trans->paths_sorted) { if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) btree_trans_verify_sorted(trans); return; } - __btree_trans_sort_iters(trans); + __btree_trans_sort_paths(trans); } -static inline void btree_iter_list_remove(struct btree_trans *trans, - struct btree_iter *iter) +static inline void btree_path_list_remove(struct btree_trans *trans, + struct btree_path *path) { unsigned i; - EBUG_ON(iter->sorted_idx >= trans->nr_sorted); + EBUG_ON(path->sorted_idx >= trans->nr_sorted); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS trans->nr_sorted--; - memmove_u64s_down_small(trans->sorted + iter->sorted_idx, - trans->sorted + iter->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8)); + memmove_u64s_down_small(trans->sorted + path->sorted_idx, + trans->sorted + path->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); #else - array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); + array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); #endif - for (i = iter->sorted_idx; i < trans->nr_sorted; i++) - trans->iters[trans->sorted[i]].sorted_idx = i; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; - iter->sorted_idx = U8_MAX; + path->sorted_idx = U8_MAX; } -static inline void btree_iter_list_add(struct btree_trans *trans, - struct btree_iter *pos, - struct btree_iter *iter) +static inline void btree_path_list_add(struct btree_trans *trans, + struct btree_path *pos, + struct btree_path *path) { unsigned i; - iter->sorted_idx = pos ? pos->sorted_idx + 1 : 0; + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - memmove_u64s_up_small(trans->sorted + iter->sorted_idx + 1, - trans->sorted + iter->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8)); + memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, + trans->sorted + path->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); trans->nr_sorted++; - trans->sorted[iter->sorted_idx] = iter->idx; + trans->sorted[path->sorted_idx] = path->idx; #else - array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); #endif - for (i = iter->sorted_idx; i < trans->nr_sorted; i++) - trans->iters[trans->sorted[i]].sorted_idx = i; - - btree_trans_verify_sorted_refs(trans); -} - -static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree_iter *child = btree_iter_child(trans, iter); - - if (child) { - bch2_trans_iter_free(trans, child); - iter->child_idx = U8_MAX; - } -} - -static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans, - struct btree_iter *iter, - unsigned long ip) -{ - struct btree_iter *child = btree_iter_child(trans, iter); - - if (!child) { - child = btree_trans_iter_alloc(trans, iter); - child->ip_allocated = ip; - iter->child_idx = child->idx; - - trans->iters_live |= 1ULL << child->idx; - trans->iters_touched |= 1ULL << child->idx; - } - - return child; -} - -static inline void __bch2_trans_iter_free(struct btree_trans *trans, - unsigned idx) -{ - btree_iter_child_free(trans, &trans->iters[idx]); - - btree_iter_list_remove(trans, &trans->iters[idx]); - - __bch2_btree_iter_unlock(&trans->iters[idx]); - trans->iters_linked &= ~(1ULL << idx); - trans->iters_live &= ~(1ULL << idx); - trans->iters_touched &= ~(1ULL << idx); - - btree_trans_verify_sorted_refs(trans); -} - -static bool have_iter_at_pos(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree_iter *n; - - n = prev_btree_iter(trans, iter); - if (n && !btree_iter_cmp(n, iter)) - return true; - - n = next_btree_iter(trans, iter); - if (n && !btree_iter_cmp(n, iter)) - return true; - - return false; -} - -int bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) -{ - int ret; - - if (IS_ERR_OR_NULL(iter)) - return 0; - - BUG_ON(trans->iters + iter->idx != iter); - BUG_ON(!btree_iter_live(trans, iter)); - - ret = btree_iter_err(iter); - - if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) && - (!(trans->iters_touched & (1ULL << iter->idx)) || - have_iter_at_pos(trans, iter))) - __bch2_trans_iter_free(trans, iter->idx); - - trans->iters_live &= ~(1ULL << iter->idx); - return ret; -} - -int bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) -{ - if (IS_ERR_OR_NULL(iter)) - return 0; - - set_btree_iter_dontneed(trans, iter); - - return bch2_trans_iter_put(trans, iter); -} - -noinline __cold -void bch2_dump_trans_iters_updates(struct btree_trans *trans) -{ - struct btree_iter *iter; - struct btree_insert_entry *i; - unsigned idx; - char buf1[300], buf2[100]; - - btree_trans_sort_iters(trans); - - trans_for_each_iter_inorder(trans, iter, idx) - printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n", - bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1), - (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2), - btree_iter_live(trans, iter) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? 
" touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - - trans_for_each_update(trans, i) - printk(KERN_ERR "update: btree %s %s %pS\n", - bch2_btree_ids[i->btree_id], - (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1), - (void *) i->ip_allocated); -} - -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, - struct btree_iter *pos) -{ - struct btree_iter *iter; - unsigned idx; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); - - if (unlikely(trans->iters_linked == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { - bch2_dump_trans_iters_updates(trans); - panic("trans iter oveflow\n"); - } - - idx = __ffs64(~trans->iters_linked); - iter = &trans->iters[idx]; - - iter->trans = trans; - iter->idx = idx; - iter->child_idx = U8_MAX; - iter->sorted_idx = U8_MAX; - iter->flags = 0; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - trans->iters_linked |= 1ULL << idx; - - btree_iter_list_add(trans, pos, iter); - return iter; } -static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, - struct btree_iter *src) +void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { - unsigned i, offset = offsetof(struct btree_iter, flags); - - __bch2_btree_iter_unlock(dst); - btree_iter_child_free(trans, dst); - - memcpy((void *) dst + offset, - (void *) src + offset, - sizeof(struct btree_iter) - offset); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->c.lock, - __btree_lock_want(dst, i)); - - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - trans->iters_sorted = false; + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; } -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +static void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) { - struct btree_iter *iter, *best = NULL; - struct bpos real_pos, pos_min = POS_MIN; - EBUG_ON(trans->restarted); if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && @@ -2337,127 +2458,58 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, pos.snapshot = btree_type_has_snapshots(btree_id) ? 
U32_MAX : 0; - real_pos = pos; - - if ((flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(pos, POS_MAX)) - real_pos = bpos_nosnap_successor(pos); - - trans_for_each_iter(trans, iter) { - if (iter->cached != (flags & BTREE_ITER_CACHED)) - continue; - - if (iter->btree_id != btree_id) - continue; - - if (best) { - int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), - bpos_diff(iter->real_pos, real_pos)); - - if (cmp < 0 || - ((cmp == 0 && btree_iter_keep(trans, iter)))) - continue; - } - - best = iter; - } - - if (!best) { - iter = btree_trans_iter_alloc(trans, best); - bch2_btree_iter_init(trans, iter, btree_id); - } else if (btree_iter_keep(trans, best)) { - iter = btree_trans_iter_alloc(trans, best); - btree_iter_copy(trans, iter, best); - } else { - iter = best; - } - - trans->iters_live |= 1ULL << iter->idx; - trans->iters_touched |= 1ULL << iter->idx; - - iter->cached = flags & BTREE_ITER_CACHED; + iter->trans = trans; + iter->path = NULL; + iter->btree_id = btree_id; + iter->min_depth = depth; iter->flags = flags; iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; - /* - * If the iterator has locks_want greater than requested, we explicitly - * do not downgrade it here - on transaction restart because btree node - * split needs to upgrade locks, we might be putting/getting the - * iterator again. Downgrading iterators only happens via an explicit - * bch2_trans_downgrade(). - */ - - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > iter->locks_want) { - iter->locks_want = locks_want; - btree_iter_get_locks(trans, iter, true, _THIS_IP_); - } - - while (iter->level != depth) { - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (iter->level < depth) - iter->level++; - else - iter->level--; - } - - iter->min_depth = depth; - - bch2_btree_iter_set_pos(iter, pos); - btree_iter_set_search_pos(iter, real_pos); - - trace_trans_get_iter(_RET_IP_, trans->ip, - btree_id, - &real_pos, locks_want, iter->uptodate, - best ? &best->real_pos : &pos_min, - best ? best->locks_want : U8_MAX, - best ? 
best->uptodate : U8_MAX); - - return iter; + iter->path = bch2_path_get(trans, + flags & BTREE_ITER_CACHED, + btree_id, + btree_iter_search_key(iter), + locks_want, + depth, + flags & BTREE_ITER_INTENT); } -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) { - struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, - locks_want, depth, - BTREE_ITER_NOT_EXTENTS| - __BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags); - - BUG_ON(bkey_cmp(iter->pos, pos)); - BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->level != depth); - BUG_ON(iter->min_depth != depth); - iter->ip_allocated = _RET_IP_; - - return iter; + __bch2_trans_iter_init(trans, iter, btree_id, pos, + 0, 0, flags); } -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *src) +void bch2_trans_node_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) { - struct btree_iter *iter; - - iter = btree_trans_iter_alloc(trans, src); - btree_iter_copy(trans, iter, src); - - trans->iters_live |= 1ULL << iter->idx; - /* - * We don't need to preserve this iter since it's cheap to copy it - * again - this will cause trans_iter_put() to free it right away: - */ - set_btree_iter_dontneed(trans, iter); + __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, + BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags); + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); +} - return iter; +void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +{ + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) @@ -2498,20 +2550,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -inline void bch2_trans_unlink_iters(struct btree_trans *trans) -{ - u64 iters = trans->iters_linked & - ~trans->iters_touched & - ~trans->iters_live; - - while (iters) { - unsigned idx = __ffs64(iters); - - iters &= ~(1ULL << idx); - __bch2_trans_iter_free(trans, idx); - } -} - /** * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset @@ -2522,17 +2560,11 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) */ void bch2_trans_begin(struct btree_trans *trans) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + struct btree_insert_entry *i; + struct btree_path *path; - /* - * XXX: we shouldn't be doing this if the transaction was restarted, but - * currently we still overflow transaction iterators if we do that - * */ - bch2_trans_unlink_iters(trans); - trans->iters_touched &= trans->iters_live; + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); trans->extra_journal_res = 0; trans->nr_updates = 0; @@ -2550,29 +2582,41 @@ void bch2_trans_begin(struct btree_trans *trans) (void *) &trans->fs_usage_deltas->memset_start); } + trans_for_each_path(trans, path) { + /* + * XXX: we probably shouldn't be 
doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); + else + path->preserve = path->should_be_locked = false; + } + bch2_trans_cond_resched(trans); if (trans->restarted) - bch2_btree_iter_traverse_all(trans); + bch2_btree_path_traverse_all(trans); trans->restarted = false; } -static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { - size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; void *p = NULL; BUG_ON(trans->used_mempool); #ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); + p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); #endif if (!p) - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - trans->iters = p; p += iters_bytes; + trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; } @@ -2585,11 +2629,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->c = c; trans->ip = _RET_IP_; - /* - * reallocating iterators currently completely breaks - * bch2_trans_iter_put(), we always allocate the max: - */ - bch2_trans_alloc_iters(trans, c); + bch2_trans_alloc_paths(trans, c); if (expected_mem_bytes) { expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes); @@ -2613,54 +2653,63 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, #endif } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->ref) + goto leaked; + return; +leaked: + bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", + bch2_btree_ids[path->btree_id], + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); +#endif +} + int bch2_trans_exit(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { + struct btree_insert_entry *i; struct bch_fs *c = trans->c; bch2_trans_unlock(trans); -#ifdef CONFIG_BCACHEFS_DEBUG - if (trans->iters_live) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) - btree_iter_child_free(trans, iter); - } + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + trans->nr_updates = 0; - if (trans->iters_live) { - struct btree_iter *iter; - - bch_err(c, "btree iterators leaked!"); - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter)) - printk(KERN_ERR " btree %s allocated at %pS\n", - bch2_btree_ids[iter->btree_id], - (void *) iter->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); - } + check_btree_paths_leaked(trans); - mutex_lock(&trans->c->btree_trans_lock); +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); list_del(&trans->list); - mutex_unlock(&trans->c->btree_trans_lock); + mutex_unlock(&c->btree_trans_lock); #endif srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + 
sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) mempool_free(trans->fs_usage_deltas, - &trans->c->replicas_delta_pool); + &c->replicas_delta_pool); else kfree(trans->fs_usage_deltas); } if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) - mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); + mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); @@ -2668,20 +2717,20 @@ int bch2_trans_exit(struct btree_trans *trans) /* * Userspace doesn't have a real percpu implementation: */ - trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); #endif - if (trans->iters) - mempool_free(trans->iters, &trans->c->btree_iters_pool); + if (trans->paths) + mempool_free(trans->paths, &c->btree_paths_pool); trans->mem = (void *) 0x1; - trans->iters = (void *) 0x1; + trans->paths = (void *) 0x1; return trans->error ? -EIO : 0; } static void __maybe_unused -bch2_btree_iter_node_to_text(struct printbuf *out, +bch2_btree_path_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, bool cached) { @@ -2693,10 +2742,10 @@ bch2_btree_iter_node_to_text(struct printbuf *out, #ifdef CONFIG_BCACHEFS_DEBUG static bool trans_has_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (iter->nodes_locked) + trans_for_each_path(trans, path) + if (path->nodes_locked) return true; return false; } @@ -2706,7 +2755,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG struct btree_trans *trans; - struct btree_iter *iter; + struct btree_path *path; struct btree *b; unsigned l; @@ -2717,24 +2766,24 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); - trans_for_each_iter(trans, iter) { - if (!iter->nodes_locked) + trans_for_each_path(trans, path) { + if (!path->nodes_locked) continue; - pr_buf(out, " iter %u %c %s:", - iter->idx, - iter->cached ? 'c' : 'b', - bch2_btree_ids[iter->btree_id]); - bch2_bpos_to_text(out, iter->pos); + pr_buf(out, " path %u %c %s:", + path->idx, + path->cached ? 'c' : 'b', + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); pr_buf(out, "\n"); for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(iter, l)) { + if (btree_node_locked(path, l)) { pr_buf(out, " %s l=%u ", - btree_node_intent_locked(iter, l) ? "i" : "r", l); - bch2_btree_iter_node_to_text(out, - (void *) iter->l[l].b, - iter->cached); + btree_node_intent_locked(path, l) ? "i" : "r", l); + bch2_btree_path_node_to_text(out, + (void *) path->l[l].b, + path->cached); pr_buf(out, "\n"); } } @@ -2742,18 +2791,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - iter = &trans->iters[trans->locking_iter_idx]; - pr_buf(out, " locking iter %u %c l=%u %s:", - trans->locking_iter_idx, - iter->cached ? 'c' : 'b', + path = &trans->paths[trans->locking_path_idx]; + pr_buf(out, " locking path %u %c l=%u %s:", + trans->locking_path_idx, + path->cached ? 
'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); pr_buf(out, " node "); - bch2_btree_iter_node_to_text(out, - (void *) b, - iter->cached); + bch2_btree_path_node_to_text(out, + (void *) b, path->cached); pr_buf(out, "\n"); } } @@ -2764,7 +2812,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_iters_pool); + mempool_exit(&c->btree_paths_pool); cleanup_srcu_struct(&c->btree_trans_barrier); } @@ -2776,8 +2824,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mutex_init(&c->btree_trans_lock); return init_srcu_struct(&c->btree_trans_barrier) ?: - mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, - sizeof(struct btree_iter) * nr + + mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4ba55e02d4b7..983d61122458 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -5,40 +5,49 @@ #include "bset.h" #include "btree_types.h" -static inline void btree_iter_set_dirty(struct btree_iter *iter, - enum btree_iter_uptodate u) +static inline void __btree_path_get(struct btree_path *path, bool intent) { - iter->uptodate = max_t(unsigned, iter->uptodate, u); + path->ref++; + path->intent_ref += intent; } -static inline struct btree *btree_iter_node(struct btree_iter *iter, +static inline bool __btree_path_put(struct btree_path *path, bool intent) +{ + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; + return --path->ref == 0; +} + +static inline void btree_path_set_dirty(struct btree_path *path, + enum btree_path_uptodate u) +{ + path->uptodate = max_t(unsigned, path->uptodate, u); +} + +static inline struct btree *btree_path_node(struct btree_path *path, unsigned level) { - return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; + return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; } -static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, +static inline bool btree_node_lock_seq_matches(const struct btree_path *path, const struct btree *b, unsigned level) { /* * We don't compare the low bits of the lock sequence numbers because - * @iter might have taken a write lock on @b, and we don't want to skip - * the linked iterator if the sequence numbers were equal before taking - * that write lock. The lock sequence number is incremented by taking - * and releasing write locks and is even when unlocked: + * @path might have taken a write lock on @b, and we don't want to skip + * the linked path if the sequence numbers were equal before taking that + * write lock. 
The lock sequence number is incremented by taking and + * releasing write locks and is even when unlocked: */ - return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; } -static inline struct btree *btree_node_parent(struct btree_iter *iter, +static inline struct btree *btree_node_parent(struct btree_path *path, struct btree *b) { - return btree_iter_node(iter, b->c.level + 1); -} - -static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) -{ - return hweight64(trans->iters_linked) > 1; + return btree_path_node(path, b->c.level + 1); } static inline int btree_iter_err(const struct btree_iter *iter) @@ -46,102 +55,121 @@ static inline int btree_iter_err(const struct btree_iter *iter) return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; } -/* Iterate over iters within a transaction: */ +/* Iterate over paths within a transaction: */ -static inline struct btree_iter * -__trans_next_iter(struct btree_trans *trans, unsigned idx) +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned idx) { u64 l; if (idx == BTREE_ITER_MAX) return NULL; - l = trans->iters_linked >> idx; + l = trans->paths_allocated >> idx; if (!l) return NULL; idx += __ffs64(l); EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->iters[idx].idx != idx); - return &trans->iters[idx]; + EBUG_ON(trans->paths[idx].idx != idx); + return &trans->paths[idx]; } -#define trans_for_each_iter(_trans, _iter) \ - for (_iter = __trans_next_iter((_trans), 0); \ - (_iter); \ - _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ + _path = __trans_next_path((_trans), (_path)->idx + 1)) -static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { - unsigned idx = iter ? iter->sorted_idx + 1 : 0; + unsigned idx = path ? path->sorted_idx + 1 : 0; EBUG_ON(idx > trans->nr_sorted); return idx < trans->nr_sorted - ? trans->iters + trans->sorted[idx] + ? trans->paths + trans->sorted[idx] : NULL; } -static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) { - unsigned idx = iter ? iter->sorted_idx : trans->nr_sorted; + unsigned idx = path ? path->sorted_idx : trans->nr_sorted; return idx - ? trans->iters + trans->sorted[idx - 1] + ? 
trans->paths + trans->sorted[idx - 1] : NULL; } -#define trans_for_each_iter_inorder(_trans, _iter, _i) \ +#define trans_for_each_path_inorder(_trans, _path, _i) \ for (_i = 0; \ - ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ _i++) -#define trans_for_each_iter_inorder_reverse(_trans, _iter, _i) \ +#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ for (_i = trans->nr_sorted - 1; \ - ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) >= 0;\ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ --_i) -static inline bool __iter_has_node(const struct btree_iter *iter, +static inline bool __path_has_node(const struct btree_path *path, const struct btree *b) { - return iter->l[b->c.level].b == b && - btree_node_lock_seq_matches(iter, b, b->c.level); + return path->l[b->c.level].b == b && + btree_node_lock_seq_matches(path, b, b->c.level); } -static inline struct btree_iter * -__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, +static inline struct btree_path * +__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, unsigned idx) { - struct btree_iter *iter = __trans_next_iter(trans, idx); + struct btree_path *path = __trans_next_path(trans, idx); + + while (path && !__path_has_node(path, b)) + path = __trans_next_path(trans, path->idx + 1); - while (iter && !__iter_has_node(iter, b)) - iter = __trans_next_iter(trans, iter->idx + 1); + return path; +} + +#define trans_for_each_path_with_node(_trans, _b, _path) \ + for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ + (_path); \ + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, + struct btree_path *, bool); - return iter; +static inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent) +{ + if (path->ref > 1 || path->preserve) + path = __bch2_btree_path_make_mut(trans, path, intent); + return path; } -#define trans_for_each_iter_with_node(_trans, _b, _iter) \ - for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ - (_iter); \ - _iter = __trans_next_iter_with_node((_trans), (_b), \ - (_iter)->idx + 1)) +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, + struct bpos, unsigned, unsigned, bool); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_trans_verify_iters(struct btree_trans *, struct btree *); +void bch2_trans_verify_paths(struct btree_trans *); void bch2_trans_verify_locks(struct btree_trans *); #else -static inline void bch2_trans_verify_iters(struct btree_trans *trans, - struct btree *b) {} -static inline void bch2_trans_verify_locks(struct btree_trans *iter) {} +static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} #endif -void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, struct btree *, struct btree_node_iter 
*, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_iter_relock_intent(struct btree_trans *, struct btree_iter *); +bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); @@ -154,28 +182,28 @@ static inline int btree_trans_restart(struct btree_trans *trans) return -EINTR; } -bool __bch2_btree_iter_upgrade(struct btree_trans *, - struct btree_iter *, unsigned); +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); -static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, - struct btree_iter *iter, +static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - return iter->locks_want < new_locks_want - ? __bch2_btree_iter_upgrade(trans, iter, new_locks_want) - : iter->uptodate == BTREE_ITER_UPTODATE; + return path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE; } -void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); +void __bch2_btree_path_downgrade(struct btree_path *, unsigned); -static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +static inline void bch2_btree_path_downgrade(struct btree_path *path) { - unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT); + unsigned new_locks_want = path->level + !!path->intent_ref; - if (iter->locks_want > new_locks_want) - __bch2_btree_iter_downgrade(iter, new_locks_want); + if (path->locks_want > new_locks_want) + __bch2_btree_path_downgrade(path, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); @@ -212,7 +240,8 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; - iter->should_be_locked = false; + if (iter->path->ref == 1) + iter->path->should_be_locked = false; } static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) @@ -221,17 +250,6 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) -{ - return idx != U8_MAX ? 
trans->iters + idx : NULL; -} - -static inline struct btree_iter *btree_iter_child(struct btree_trans *trans, - struct btree_iter *iter) -{ - return idx_to_btree_iter(trans, iter->child_idx); -} - /* * Unlocks before scheduling * Note: does not revalidate iterator @@ -249,11 +267,11 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) #define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ _locks_want, _depth, _flags, _b) \ - for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ _start, _locks_want, _depth, _flags), \ - _b = bch2_btree_iter_peek_node(_iter); \ + _b = bch2_btree_iter_peek_node(&(_iter)); \ (_b); \ - (_b) = bch2_btree_iter_next_node(_iter)) + (_b) = bch2_btree_iter_next_node(&(_iter))) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ _flags, _b) \ @@ -283,77 +301,36 @@ static inline int bkey_err(struct bkey_s_c k) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)), \ + (_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) + (_k) = __bch2_btree_iter_next(&(_iter), _flags)) #define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ - for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ + for ((_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) + (_k) = __bch2_btree_iter_next(&(_iter), _flags)) /* new multiple iterator interface: */ -void bch2_dump_trans_iters_updates(struct btree_trans *); - -int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); - -void bch2_trans_unlink_iters(struct btree_trans *); - -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned, - unsigned, unsigned); - -static inline struct btree_iter * -bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, - (flags & BTREE_ITER_INTENT) != 0, 0, - flags); - iter->ip_allocated = _THIS_IP_; - return iter; -} - -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, - struct btree_iter *); -static inline struct btree_iter * -bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -{ - struct btree_iter *iter = - __bch2_trans_copy_iter(trans, src); - - iter->ip_allocated = _THIS_IP_; - return iter; -} - -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); - -static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) -{ - return (trans->iters_live & (1ULL << iter->idx)) != 0; -} +void bch2_dump_trans_paths_updates(struct btree_trans *); -static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) -{ - return btree_iter_live(trans, iter) || - (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -} +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, + unsigned, struct bpos, unsigned); +void 
bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) +static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - trans->iters_touched &= ~(1ULL << iter->idx); + iter->path->preserve = false; } -void bch2_trans_begin(struct btree_trans *); - void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_trans_begin(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 61210db57f56..9bdc2c3f21bf 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -197,23 +197,23 @@ btree_key_cache_create(struct btree_key_cache *c, } static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_iter *ck_iter, + struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; int ret; - iter = bch2_trans_get_iter(trans, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, + ck->key.pos, BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_iter, 0)) { + if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = btree_trans_restart(trans); goto err; @@ -238,7 +238,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, * XXX: not allowed to be holding read locks when we take a write lock, * currently */ - bch2_btree_node_lock_write(trans, ck_iter, ck_iter->l[0].b); + bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -247,62 +247,64 @@ static int btree_key_cache_fill(struct btree_trans *trans, bkey_reassemble(ck->k, k); ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_iter, ck_iter->l[0].b); + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(trans, iter); + set_btree_iter_dontneed(&iter); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } static int bkey_cached_check_fn(struct six_lock *lock, void *p) { struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_iter *iter = p; + const struct btree_path *path = p; - return ck->key.btree_id == iter->btree_id && - !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1; + return ck->key.btree_id == path->btree_id && + !bpos_cmp(ck->key.pos, path->pos) ? 
0 : -1; } __flatten -int bch2_btree_iter_traverse_cached(struct btree_trans *trans, struct btree_iter *iter) +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_cached *ck; int ret = 0; - BUG_ON(iter->level); + BUG_ON(path->level); - iter->l[1].b = NULL; + path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, iter, 0)) { - ck = (void *) iter->l[0].b; + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; goto fill; } retry: - ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { - iter->l[0].b = NULL; + if (flags & BTREE_ITER_CACHED_NOCREATE) { + path->l[0].b = NULL; return 0; } ck = btree_key_cache_create(&c->btree_key_cache, - iter->btree_id, iter->pos); + path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(iter, 0, SIX_LOCK_intent); - iter->locks_want = 1; + mark_btree_node_locked(path, 0, SIX_LOCK_intent); + path->locks_want = 1; } else { - enum six_lock_type lock_want = __btree_lock_want(iter, 0); + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock(trans, iter, (void *) ck, iter->pos, 0, lock_want, - bkey_cached_check_fn, iter, _THIS_IP_)) { + if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, + lock_want, + bkey_cached_check_fn, path, _THIS_IP_)) { if (!trans->restarted) goto retry; @@ -311,28 +313,27 @@ retry: goto err; } - if (ck->key.btree_id != iter->btree_id || - bpos_cmp(ck->key.pos, iter->pos)) { + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } - mark_btree_node_locked(iter, 0, lock_want); + mark_btree_node_locked(path, 0, lock_want); } - iter->l[0].lock_seq = ck->c.lock.state.seq; - iter->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; fill: - if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { - if (!iter->locks_want && - !!__bch2_btree_iter_upgrade(trans, iter, 1)) { + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); - BUG_ON(!trans->restarted); - ret = -EINTR; + ret = btree_trans_restart(trans); goto err; } - ret = btree_key_cache_fill(trans, iter, ck); + ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; } @@ -340,22 +341,14 @@ fill: if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - iter->uptodate = BTREE_ITER_UPTODATE; - - if ((iter->flags & BTREE_ITER_INTENT) && - !bch2_btree_iter_upgrade(trans, iter, 1)) { - BUG_ON(!trans->restarted); - ret = -EINTR; - } - - BUG_ON(!ret && !btree_node_locked(iter, 0)); + path->uptodate = BTREE_ITER_UPTODATE; + BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; err: if (ret != -EINTR) { - btree_node_unlock(iter, 0); - iter->flags |= BTREE_ITER_ERROR; - iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; + btree_node_unlock(path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_ERROR; } return ret; } @@ -368,23 +361,23 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree_iter *c_iter = NULL, *b_iter = NULL; + struct btree_iter c_iter, b_iter; 
struct bkey_cached *ck = NULL; int ret; - b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT); - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(c_iter); + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&c_iter); if (ret) goto out; - ck = (void *) c_iter->l[0].b; + ck = (void *) c_iter.path->l[0].b; if (!ck || (journal_seq && ck->journal.seq != journal_seq)) goto out; @@ -400,8 +393,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: * */ - ret = bch2_btree_iter_traverse(b_iter) ?: - bch2_trans_update(trans, b_iter, ck->k, + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, @@ -423,7 +416,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - BUG_ON(!btree_node_locked(c_iter, 0)); + BUG_ON(!btree_node_locked(c_iter.path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -432,10 +425,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { evict: - BUG_ON(!btree_node_intent_locked(c_iter, 0)); + BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - mark_btree_node_unlocked(c_iter, 0); - c_iter->l[0].b = NULL; + mark_btree_node_unlocked(c_iter.path, 0); + c_iter.path->l[0].b = NULL; six_lock_write(&ck->c.lock, NULL, NULL); @@ -451,8 +444,8 @@ evict: mutex_unlock(&c->btree_key_cache.lock); } out: - bch2_trans_iter_put(trans, b_iter); - bch2_trans_iter_put(trans, c_iter); + bch2_trans_iter_exit(trans, &b_iter); + bch2_trans_iter_exit(trans, &c_iter); return ret; } @@ -503,11 +496,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bkey_cached *ck = (void *) path->l[0].b; bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index d890632e4425..0768ef3ca776 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -26,10 +26,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *, struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -int bch2_btree_iter_traverse_cached(struct btree_trans *, struct btree_iter *); +int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, + unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, - struct btree_iter *, struct bkey_i *); + struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b490e4808631..d05689180c63 100644 --- a/fs/bcachefs/btree_locking.h +++ 
b/fs/bcachefs/btree_locking.h @@ -20,7 +20,7 @@ enum btree_node_locked_type { BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, }; -static inline int btree_node_locked_type(struct btree_iter *iter, +static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { /* @@ -29,35 +29,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter, * branches: */ return BTREE_NODE_UNLOCKED + - ((iter->nodes_locked >> level) & 1) + - ((iter->nodes_intent_locked >> level) & 1); + ((path->nodes_locked >> level) & 1) + + ((path->nodes_intent_locked >> level) & 1); } -static inline bool btree_node_intent_locked(struct btree_iter *iter, +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; + return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_read_locked(struct btree_iter *iter, +static inline bool btree_node_read_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; } -static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) +static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - return iter->nodes_locked & (1 << level); + return path->nodes_locked & (1 << level); } -static inline void mark_btree_node_unlocked(struct btree_iter *iter, +static inline void mark_btree_node_unlocked(struct btree_path *path, unsigned level) { - iter->nodes_locked &= ~(1 << level); - iter->nodes_intent_locked &= ~(1 << level); + path->nodes_locked &= ~(1 << level); + path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_iter *iter, +static inline void mark_btree_node_locked(struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -65,52 +65,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter, BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - iter->nodes_locked |= 1 << level; - iter->nodes_intent_locked |= type << level; + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; } -static inline void mark_btree_node_intent_locked(struct btree_iter *iter, +static inline void mark_btree_node_intent_locked(struct btree_path *path, unsigned level) { - mark_btree_node_locked(iter, level, SIX_LOCK_intent); + mark_btree_node_locked(path, level, SIX_LOCK_intent); } -static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) { - return level < iter->locks_want + return level < path->locks_want ? 
SIX_LOCK_intent : SIX_LOCK_read; } static inline enum btree_node_locked_type -btree_lock_want(struct btree_iter *iter, int level) +btree_lock_want(struct btree_path *path, int level) { - if (level < iter->level) + if (level < path->level) return BTREE_NODE_UNLOCKED; - if (level < iter->locks_want) + if (level < path->locks_want) return BTREE_NODE_INTENT_LOCKED; - if (level == iter->level) + if (level == path->level) return BTREE_NODE_READ_LOCKED; return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_path *path, unsigned level) { - int lock_type = btree_node_locked_type(iter, level); + int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->l[level].b->c.lock, lock_type); - mark_btree_node_unlocked(iter, level); + six_unlock_type(&path->l[level].b->c.lock, lock_type); + mark_btree_node_unlocked(path, level); } -static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +static inline void __bch2_btree_path_unlock(struct btree_path *path) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); + while (path->nodes_locked) + btree_node_unlock(path, __ffs(path->nodes_locked)); } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) @@ -154,11 +154,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, struct btree *b, unsigned level, enum btree_node_locked_type want) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (iter->l[level].b == b && - btree_node_locked_type(iter, level) >= want) { + trans_for_each_path(trans, path) + if (path->l[level].b == b && + btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->c.lock, want); return true; } @@ -166,38 +166,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -bool __bch2_btree_node_lock(struct btree_trans *, struct btree_iter *, +bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, struct btree *, struct bpos, unsigned, - enum six_lock_type, six_lock_should_sleep_fn, - void *, unsigned long); + enum six_lock_type, + six_lock_should_sleep_fn, void *, + unsigned long); static inline bool btree_node_lock(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(trans, iter, b, pos, level, type, + __bch2_btree_node_lock(trans, path, b, pos, level, type, should_sleep_fn, p, ip); } -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_iter *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); static inline bool bch2_btree_node_relock(struct btree_trans *trans, - struct btree_iter *iter, unsigned level) + struct btree_path *path, unsigned level) { - EBUG_ON(btree_node_locked(iter, level) && - btree_node_locked_type(iter, level) != - 
__btree_lock_want(iter, level)); + EBUG_ON(btree_node_locked(path, level) && + btree_node_locked_type(path, level) != + __btree_lock_want(path, level)); - return likely(btree_node_locked(iter, level)) || - __bch2_btree_node_relock(trans, iter, level); + return likely(btree_node_locked(path, level)) || + __bch2_btree_node_relock(trans, path, level); } /* @@ -205,32 +206,32 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, * succeed: */ static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_iter *iter, +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - trans_for_each_iter_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) linked->l[b->c.level].lock_seq += 2; six_unlock_write(&b->c.lock); } void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_iter *, struct btree *); + struct btree_path *, struct btree *); void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); static inline void bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b) { - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); - EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->c.level)); if (unlikely(!six_trylock_write(&b->c.lock))) __bch2_btree_node_lock_write(trans, b); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 56dc5fbb7c91..b7cded2095ff 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -210,7 +210,7 @@ struct btree_node_iter { #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -enum btree_iter_uptodate { +enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, BTREE_ITER_NEED_RELOCK = 1, BTREE_ITER_NEED_TRAVERSE = 2, @@ -225,51 +225,66 @@ enum btree_iter_uptodate { #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) #define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - struct btree_trans *trans; - unsigned long ip_allocated; - +struct btree_path { u8 idx; - u8 child_idx; u8 sorted_idx; + u8 ref; + u8 intent_ref; /* btree_iter_copy starts here: */ - u16 flags; - - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - struct bpos pos; - struct bpos real_pos; enum btree_id btree_id:4; bool cached:1; - enum btree_iter_uptodate uptodate:2; + bool preserve:1; + enum btree_path_uptodate uptodate:2; /* - * True if we've returned a key (and thus are expected to keep it - * locked), false after set_pos - for avoiding spurious transaction - * restarts in bch2_trans_relock(): + * When true, failing to relock this path will cause the transaction to + * restart: */ bool 
should_be_locked:1; - unsigned level:4, - min_depth:4, + unsigned level:3, locks_want:4, nodes_locked:4, nodes_intent_locked:4; - struct btree_iter_level { + struct btree_path_level { struct btree *b; struct btree_node_iter iter; u32 lock_seq; } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif +}; +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + + enum btree_id btree_id:4; + unsigned min_depth:4; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + + struct bpos pos; + struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. @@ -277,11 +292,6 @@ struct btree_iter { struct bkey k; }; -static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -{ - return iter->l + iter->level; -} - struct btree_key_cache { struct mutex lock; struct rhashtable table; @@ -329,7 +339,7 @@ struct btree_insert_entry { bool cached:1; bool trans_triggers_run:1; struct bkey_i *k; - struct btree_iter *iter; + struct btree_path *path; unsigned long ip_allocated; }; @@ -354,7 +364,7 @@ struct btree_trans { #ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; - unsigned locking_iter_idx; + unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; @@ -369,23 +379,21 @@ struct btree_trans { bool error:1; bool in_traverse_all:1; bool restarted:1; - bool iters_sorted:1; + bool paths_sorted:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: */ unsigned extra_journal_res; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; + u64 paths_allocated; unsigned mem_top; unsigned mem_bytes; void *mem; u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_iter *iters; + struct btree_path *paths; struct btree_insert_entry *updates; /* update path: */ @@ -589,16 +597,6 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } -static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -{ - return __btree_node_type(iter->level, iter->btree_id); -} - -static inline bool btree_iter_is_extents(struct btree_iter *iter) -{ - return btree_node_type_is_extents(btree_iter_key_type(iter)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ (1U << BKEY_TYPE_inodes)| \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 5707baf10262..058d283a105c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -8,9 +8,9 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, +void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, struct btree *); -bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_iter *, +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); void 
bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); @@ -135,4 +135,13 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *); + +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __bch2_btree_trans_peek_updates(iter) + : NULL; +} + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 652f08dea804..6dcce175fd8b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -23,7 +23,7 @@ #include static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_iter *, struct btree *, + struct btree_path *, struct btree *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -162,10 +162,10 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - BUG_ON(iter->l[b->c.level].b == b); + trans_for_each_path(trans, path) + BUG_ON(path->l[b->c.level].b == b); six_lock_write(&b->c.lock, NULL, NULL); @@ -914,7 +914,7 @@ static void bch2_btree_update_done(struct btree_update *as) } static struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, struct btree_iter *iter, +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned level, unsigned nr_nodes, unsigned flags) { struct bch_fs *c = trans->c; @@ -925,7 +925,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_iter *iter, int journal_flags = 0; int ret = 0; - BUG_ON(!iter->should_be_locked); + BUG_ON(!path->should_be_locked); if (flags & BTREE_INSERT_JOURNAL_RESERVED) journal_flags |= JOURNAL_RES_GET_RESERVED; @@ -937,11 +937,11 @@ retry: * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(trans, iter, U8_MAX)) { + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, - iter->btree_id, - &iter->real_pos); - return ERR_PTR(-EINTR); + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); } if (flags & BTREE_INSERT_GC_LOCK_HELD) @@ -961,7 +961,7 @@ retry: as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); - as->btree_id = iter->btree_id; + as->btree_id = path->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1081,7 +1081,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) */ static void bch2_btree_set_root(struct btree_update *as, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b) { struct bch_fs *c = as->c; @@ -1097,7 +1097,7 @@ static void bch2_btree_set_root(struct btree_update *as, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(trans, iter, old); + bch2_btree_node_lock_write(trans, path, old); bch2_btree_set_root_inmem(c, b); @@ -1110,14 +1110,14 @@ static void bch2_btree_set_root(struct btree_update *as, * an intent lock on the new root, and any updates that would * depend on the new 
root would have to update the new root. */ - bch2_btree_node_unlock_write(trans, iter, old); + bch2_btree_node_unlock_write(trans, path, old); } /* Interior node updates: */ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_i *insert) @@ -1152,7 +1152,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); - bch2_btree_bset_insert_key(trans, iter, b, node_iter, insert); + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty(c, b); set_btree_node_need_write(b); } @@ -1160,7 +1160,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, static void __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct btree_node_iter node_iter, struct keylist *keys) @@ -1175,7 +1175,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, ; while (!bch2_keylist_empty(keys)) { - bch2_insert_fixup_btree_ptr(as, trans, iter, b, + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, bch2_keylist_front(keys)); bch2_keylist_pop_front(keys); } @@ -1186,8 +1186,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, * node) */ static struct btree *__btree_split_node(struct btree_update *as, - struct btree *n1, - struct btree_iter *iter) + struct btree *n1) { struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; @@ -1304,7 +1303,7 @@ static struct btree *__btree_split_node(struct btree_update *as, */ static void btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct keylist *keys) { @@ -1315,7 +1314,7 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - __bch2_btree_insert_keys_interior(as, trans, iter, b, node_iter, keys); + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1345,18 +1344,17 @@ static void btree_split_insert_keys(struct btree_update *as, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, - struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct keylist *keys, - unsigned flags) +static void btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(iter, b); + struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); bch2_btree_interior_update_will_free_node(as, b); @@ -1364,12 +1362,12 @@ static void btree_split(struct btree_update *as, bch2_btree_update_add_new_node(as, n1); if (keys) - btree_split_insert_keys(as, trans, iter, n1, keys); + btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); - n2 = __btree_split_node(as, n1, iter); + n2 = 
__btree_split_node(as, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1394,7 +1392,7 @@ static void btree_split(struct btree_update *as, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, trans, iter, n3, &as->parent_keys); + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); bch2_btree_node_write(c, n3, SIX_LOCK_intent); } @@ -1414,12 +1412,12 @@ static void btree_split(struct btree_update *as, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); } else if (n3) { - bch2_btree_set_root(as, trans, iter, n3); + bch2_btree_set_root(as, trans, path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, iter, n1); + bch2_btree_set_root(as, trans, path, n1); } bch2_btree_update_get_open_buckets(as, n1); @@ -1428,7 +1426,7 @@ static void btree_split(struct btree_update *as, if (n3) bch2_btree_update_get_open_buckets(as, n3); - /* Successful split, update the iterator to point to the new nodes: */ + /* Successful split, update the path to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); bch2_trans_node_drop(trans, b); @@ -1461,21 +1459,21 @@ static void btree_split(struct btree_update *as, static void bch2_btree_insert_keys_interior(struct btree_update *as, struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct keylist *keys) { - struct btree_iter *linked; + struct btree_path *linked; - __bch2_btree_insert_keys_interior(as, trans, iter, b, - iter->l[b->c.level].iter, keys); + __bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_iter_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_trans_verify_iters(trans, b); + bch2_trans_verify_paths(trans); } /** @@ -1490,10 +1488,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ -static void bch2_btree_insert_node(struct btree_update *as, - struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct keylist *keys, - unsigned flags) +static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1501,21 +1498,21 @@ static void bch2_btree_insert_node(struct btree_update *as, int live_u64s_added, u64s_added; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - bch2_btree_node_lock_for_insert(trans, iter, b); + bch2_btree_node_lock_for_insert(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(trans, iter, b); + bch2_btree_node_unlock_write(trans, path, b); goto split; } btree_node_interior_verify(c, b); - bch2_btree_insert_keys_interior(as, trans, iter, b, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1529,46 +1526,46 @@ static void bch2_btree_insert_node(struct btree_update *as, bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); - bch2_btree_node_unlock_write(trans, iter, b); + bch2_btree_node_unlock_write(trans, path, b); btree_node_interior_verify(c, b); return; split: - btree_split(as, trans, iter, b, keys, flags); + btree_split(as, trans, path, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; + struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(trans, iter, iter->level, + as = bch2_btree_update_start(trans, path, path->level, btree_update_reserve_required(c, b), flags); if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, trans, iter, b, NULL, flags); + btree_split(as, trans, path, b, NULL, flags); bch2_btree_update_done(as); - for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) - ret = bch2_foreground_maybe_merge(trans, iter, l, flags); + for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_iter *sib_iter = NULL; + struct btree_path *sib_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1579,14 +1576,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, int ret = 0, ret2 = 0; retry: - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, path, false); if (ret) return ret; - BUG_ON(!iter->should_be_locked); - BUG_ON(!btree_node_locked(iter, level)); + BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); - b = iter->l[level].b; + b = path->l[level].b; if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && 
!bpos_cmp(b->data->max_key, SPOS_MAX))) { @@ -1598,17 +1595,18 @@ retry: ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, - sib_pos, U8_MAX, level, - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(sib_iter); + sib_path = bch2_path_get(trans, false, path->btree_id, + sib_pos, U8_MAX, level, true); + ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - m = sib_iter->l[level].b; + sib_path->should_be_locked = true; + + m = sib_path->l[level].b; - if (btree_node_parent(iter, b) != - btree_node_parent(sib_iter, m)) { + if (btree_node_parent(path, b) != + btree_node_parent(sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1659,8 +1657,8 @@ retry: if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(trans, iter, level, + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| @@ -1696,7 +1694,7 @@ retry: bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); bch2_btree_update_get_open_buckets(as, n); @@ -1707,7 +1705,7 @@ retry: bch2_trans_node_add(trans, n); - bch2_trans_verify_iters(trans, n); + bch2_trans_verify_paths(trans); bch2_btree_node_free_inmem(trans, b); bch2_btree_node_free_inmem(trans, m); @@ -1717,7 +1715,8 @@ retry: bch2_btree_update_done(as); out: bch2_trans_verify_locks(trans); - bch2_trans_iter_free(trans, sib_iter); + if (sib_path) + bch2_path_put(trans, sib_path, true); /* * Don't downgrade locks here: we're called after successful insert, @@ -1730,8 +1729,9 @@ out: */ return ret ?: ret2; err: - bch2_trans_iter_put(trans, sib_iter); - sib_iter = NULL; + if (sib_path) + bch2_path_put(trans, sib_path, true); + sib_path = NULL; if (ret == -EINTR && bch2_trans_relock(trans)) goto retry; @@ -1761,8 +1761,8 @@ retry: if (!b || b->data->keys.seq != seq) goto out; - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(trans, iter, b->c.level, + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, (parent ? 
btree_update_reserve_required(c, parent) : 0) + 1, @@ -1789,10 +1789,10 @@ retry: if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter, parent, + bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys, flags); } else { - bch2_btree_set_root(as, trans, iter, n); + bch2_btree_set_root(as, trans, iter->path, n); } bch2_btree_update_get_open_buckets(as, n); @@ -1805,7 +1805,7 @@ retry: bch2_btree_update_done(as); out: - bch2_btree_iter_downgrade(iter); + bch2_btree_path_downgrade(iter->path); return ret; } @@ -1824,13 +1824,13 @@ void async_btree_node_rewrite_work(struct work_struct *work) container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, + bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos, BTREE_MAX_DEPTH, a->level, 0); - bch2_btree_node_rewrite(&trans, iter, a->seq, 0); - bch2_trans_iter_put(&trans, iter); + bch2_btree_node_rewrite(&trans, &iter, a->seq, 0); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); kfree(a); @@ -1869,7 +1869,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree_iter *iter2 = NULL; + struct btree_iter iter2 = { NULL }; struct btree *parent; u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; @@ -1897,19 +1897,22 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter, b); + parent = btree_node_parent(iter->path, b); if (parent) { - iter2 = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(&iter2, iter); - BUG_ON(iter2->level != b->c.level); - BUG_ON(bpos_cmp(iter2->pos, new_key->k.p)); + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_INTENT); - btree_node_unlock(iter2, iter2->level); - iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP; - iter2->level++; + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - ret = bch2_btree_iter_traverse(iter2) ?: - bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN); + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); if (ret) goto err; } else { @@ -1931,7 +1934,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (ret) goto err; - bch2_btree_node_lock_write(trans, iter, b); + bch2_btree_node_lock_write(trans, iter->path, b); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1946,9 +1949,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(trans, iter, b); + bch2_btree_node_unlock_write(trans, iter->path, b); out: - bch2_trans_iter_put(trans, iter2); + bch2_trans_iter_exit(trans, &iter2); return ret; err: if (new_hash) { @@ -2006,18 +2009,18 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, struct btree *b, struct bkey_i *new_key, bool skip_triggers) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + 
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; /* has node been freed? */ - if (iter->l[b->c.level].b != b) { + if (iter.path->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -2025,9 +2028,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers); + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 13b3a1bf0f4f..c06cfcc66db7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -117,39 +117,39 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct btree *b; - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + if (path->uptodate >= BTREE_ITER_NEED_TRAVERSE) return 0; - if (!bch2_btree_node_relock(trans, iter, level)) + if (!bch2_btree_node_relock(trans, path, level)) return 0; - b = iter->l[level].b; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags) { - return bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: - bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, + bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_next_sib); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5e57ff5a5ceb..4fb5a5666e20 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -29,9 +29,9 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, bpos_cmp(l->k->k.p, r->k->k.p); } -static inline struct btree_iter_level *insert_l(struct btree_insert_entry *i) +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) { - return i->iter->l + i->level; + return i->path->l + i->level; } static inline bool same_leaf_as_prev(struct btree_trans *trans, @@ -49,14 +49,14 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, } inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b) { struct bch_fs *c = trans->c; - bch2_btree_node_lock_write(trans, iter, b); + 
bch2_btree_node_lock_write(trans, path, b); - if (iter->cached) + if (path->cached) return; if (unlikely(btree_node_just_written(b)) && @@ -75,7 +75,7 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, /* Handle overwrites and do insert, for non extents: */ bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_i *insert) @@ -116,7 +116,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, bch2_bset_delete(b, k, clobber_u64s); goto fix_iter; } else { - bch2_btree_iter_fix_key_modified(trans, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } return true; @@ -134,7 +134,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, clobber_u64s = k->u64s; goto overwrite; } else { - bch2_btree_iter_fix_key_modified(trans, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } } @@ -144,7 +144,7 @@ overwrite: new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, iter, b, node_iter, k, + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, clobber_u64s, new_u64s); return true; } @@ -201,7 +201,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, EBUG_ON(!insert->level && !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->iter, b, + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) return false; @@ -236,9 +236,10 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); - BUG_ON(i->level != i->iter->level); - BUG_ON(i->btree_id != i->iter->btree_id); + BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); } static noinline int @@ -293,14 +294,14 @@ btree_key_can_insert(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned u64s) { - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; - EBUG_ON(iter->level); + EBUG_ON(path->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(trans->c) && @@ -340,7 +341,7 @@ static inline void do_btree_insert_one(struct btree_trans *trans, did_work = !i->cached ? btree_insert_key_leaf(trans, i) - : bch2_btree_insert_key_cached(trans, i->iter, i->k); + : bch2_btree_insert_key_cached(trans, i->path, i->k); if (!did_work) return; @@ -366,11 +367,12 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc */ BUG_ON(i->cached || i->level); if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) - bch2_mark_update(trans, i->iter, i->k, + bch2_mark_update(trans, i->path, i->k, i->flags|BTREE_TRIGGER_GC); } } @@ -417,7 +419,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = !i->cached ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, i->iter, u64s); + : btree_key_can_insert_cached(trans, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -476,7 +478,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->iter, i->k, i->flags); + bch2_mark_update(trans, i->path, i->k, i->flags); if (marking && trans->fs_usage_deltas) bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); @@ -503,11 +505,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *iter; + struct btree_path *path; struct bkey_s_c old; int ret, u64s_delta = 0; trans_for_each_update(trans, i) { + struct bkey u; + /* * peek_slot() doesn't yet work on iterators that point to * interior nodes: @@ -515,7 +519,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (i->cached || i->level) continue; - old = bch2_btree_iter_peek_slot(i->iter); + old = bch2_btree_path_peek_slot(i->path, &u); ret = bkey_err(old); if (unlikely(ret)) return ret; @@ -525,7 +529,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->iter, + ret = bch2_foreground_maybe_merge(trans, i->path, i->level, trans->flags); if (unlikely(ret)) return ret; @@ -536,7 +540,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } trans_for_each_update(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + BUG_ON(!btree_node_intent_locked(i->path, i->level)); ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, @@ -560,14 +564,12 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, * or anything else that might call bch2_trans_relock(), since that * would just retake the read locks: */ - trans_for_each_iter(trans, iter) - if (iter->nodes_locked != iter->nodes_intent_locked && - !bch2_btree_iter_upgrade(trans, iter, 1)) { + trans_for_each_path(trans, path) + if (path->nodes_locked != path->nodes_intent_locked && + !bch2_btree_path_upgrade(trans, path, path->level + 1)) { trace_trans_restart_upgrade(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); - trans->restarted = true; - return -EINTR; + path->btree_id, &path->pos); + return btree_trans_restart(trans); } trans_for_each_update(trans, i) { @@ -581,6 +583,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, buf, (void *) trans->ip, (void *) i->ip_allocated, invalid); bch2_fatal_error(c); + return -EINVAL; } btree_insert_entry_checks(trans, i); } @@ -588,14 +591,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans, i->iter, + bch2_btree_node_lock_for_insert(trans, i->path, insert_l(i)->b); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->iter, + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); if (!ret && trans->journal_pin) @@ -635,13 +638,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(trans, i->iter, trans->flags); + ret 
= bch2_btree_split_leaf(trans, i->path, trans->flags); if (!ret) return 0; if (ret == -EINTR) trace_trans_restart_btree_node_split(trans->ip, trace_ip, - i->btree_id, &i->iter->real_pos); + i->btree_id, &i->path->pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -749,6 +752,10 @@ int __bch2_trans_commit(struct btree_trans *trans) } #ifdef CONFIG_BCACHEFS_DEBUG + /* + * if BTREE_TRIGGER_NORUN is set, it means we're probably being called + * from the key cache flush code: + */ trans_for_each_update(trans, i) if (!i->cached && !(i->flags & BTREE_TRIGGER_NORUN)) @@ -769,13 +776,12 @@ int __bch2_trans_commit(struct btree_trans *trans) i->trans_triggers_run = true; trans_trigger_run = true; - ret = bch2_trans_mark_update(trans, i->iter, + ret = bch2_trans_mark_update(trans, i->path, i->k, i->flags); if (unlikely(ret)) { if (ret == -EINTR) trace_trans_restart_mark(trans->ip, _RET_IP_, - i->btree_id, - &i->iter->pos); + i->btree_id, &i->path->pos); goto out; } } @@ -783,18 +789,16 @@ int __bch2_trans_commit(struct btree_trans *trans) } while (trans_trigger_run); trans_for_each_update(trans, i) { - BUG_ON(!i->iter->should_be_locked); + BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, - i->level + 1))) { + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { trace_trans_restart_upgrade(trans->ip, _RET_IP_, - i->btree_id, &i->iter->pos); - trans->restarted = true; - ret = -EINTR; + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); goto out; } - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + BUG_ON(!btree_node_intent_locked(i->path, i->level)); u64s = jset_u64s(i->k->k.u64s); if (i->cached && @@ -828,6 +832,9 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_reset: + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + trans->extra_journal_res = 0; trans->nr_updates = 0; trans->hooks = NULL; @@ -869,11 +876,11 @@ static noinline int extent_front_merge(struct btree_trans *trans, bkey_reassemble(update, k); if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) { - struct btree_iter *update_iter = - bch2_trans_copy_iter(trans, iter); + struct btree_iter update_iter; - ret = bch2_btree_delete_at(trans, update_iter, flags); - bch2_trans_iter_put(trans, update_iter); + bch2_trans_copy_iter(&update_iter, iter); + ret = bch2_btree_delete_at(trans, &update_iter, flags); + bch2_trans_iter_exit(trans, &update_iter); if (ret) return ret; @@ -890,18 +897,18 @@ static int bch2_trans_update_extent(struct btree_trans *trans, enum btree_update_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter, *update_iter; + struct btree_iter iter, update_iter; struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; struct bkey_s_c k; enum btree_id btree_id = orig_iter->btree_id; int ret = 0, compressed_sectors; - iter = bch2_trans_get_iter(trans, btree_id, start, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(&iter); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -909,7 +916,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) { if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, iter, k, 
&insert, flags); + ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) goto out; } @@ -940,23 +947,22 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bch2_cut_back(start, update); - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(update_iter) ?: - bch2_trans_update(trans, update_iter, update, + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| flags); - bch2_trans_iter_put(trans, update_iter); + bch2_trans_iter_exit(trans, &update_iter); if (ret) goto err; } if (bkey_cmp(k.k->p, insert->k.p) <= 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - ret = bch2_btree_delete_at(trans, update_iter, - flags); - bch2_trans_iter_put(trans, update_iter); + bch2_trans_copy_iter(&update_iter, &iter); + ret = bch2_btree_delete_at(trans, &update_iter, flags); + bch2_trans_iter_exit(trans, &update_iter); if (ret) goto err; @@ -970,13 +976,13 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - update_iter = bch2_trans_copy_iter(trans, iter); - bch2_trans_update(trans, update_iter, update, flags); - bch2_trans_iter_put(trans, update_iter); + bch2_trans_copy_iter(&update_iter, &iter); + bch2_trans_update(trans, &update_iter, update, flags); + bch2_trans_iter_exit(trans, &update_iter); goto out; } next: - k = bch2_btree_iter_next(iter); + k = bch2_btree_iter_next(&iter); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -987,14 +993,12 @@ next: bch2_bkey_merge(c, bkey_i_to_s(insert), k); out: if (!bkey_deleted(&insert->k)) { - bch2_btree_iter_set_pos(iter, insert->k.p); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, insert, flags); - } else { - set_btree_iter_dontneed(trans, iter); + bch2_btree_iter_set_pos(&iter, insert->k.p); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, insert, flags); } err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1002,31 +1006,34 @@ err: int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - struct btree_insert_entry *i, n = (struct btree_insert_entry) { + struct btree_insert_entry *i, n; + + BUG_ON(!iter->path->should_be_locked); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); + + n = (struct btree_insert_entry) { .flags = flags, - .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), .btree_id = iter->btree_id, - .level = iter->level, - .cached = iter->cached, - .iter = iter, + .level = iter->path->level, + .cached = iter->flags & BTREE_ITER_CACHED, + .path = iter->path, .k = k, .ip_allocated = _RET_IP_, }; - BUG_ON(!iter->should_be_locked); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); + __btree_path_get(n.path, true); #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(i != trans->updates && btree_insert_entry_cmp(i - 1, i) >= 0); #endif - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - 
BUG_ON(bpos_cmp(n.k->k.p, n.iter->real_pos)); - - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; /* * Pending updates are kept sorted: first, find position of new update, @@ -1048,7 +1055,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (n.cached && !i->cached) { i->k = n.k; i->flags = n.flags; + + __btree_path_get(n.path, false); } else { + bch2_path_put(trans, i->path, true); *i = n; } } else @@ -1068,15 +1078,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, 0); - bch2_trans_iter_put(trans, iter); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1114,16 +1124,16 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); retry: while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(iter)).k) && + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { struct bkey_i delete; bkey_init(&delete.k); @@ -1142,9 +1152,9 @@ retry: * (bch2_btree_iter_peek() does guarantee that iter.pos >= * bkey_start_pos(k.k)). 
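The conversion running through these hunks is mechanical but worth spelling out once: iterators now live on the caller's stack and are paired init/exit instead of get/put, so there is no error-encoded pointer to check and no transaction-owned iterator to leak. A minimal sketch of the new shape, mirroring __bch2_btree_insert() as changed above (assumes the in-tree bcachefs headers; example_insert_one is a hypothetical name):

static int example_insert_one(struct btree_trans *trans, enum btree_id id,
			      struct bkey_i *k)
{
	struct btree_iter iter;		/* on the stack now, no get/put */
	int ret;

	bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
			     BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter) ?:
	      bch2_trans_update(trans, &iter, k, 0);
	bch2_trans_iter_exit(trans, &iter);	/* replaces bch2_trans_iter_put() */

	return ret;
}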
*/ - delete.k.p = iter->pos; + delete.k.p = iter.pos; - if (btree_node_type_is_extents(iter->btree_id)) { + if (btree_node_type_is_extents(id)) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); @@ -1152,12 +1162,12 @@ retry: bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - ret = bch2_extent_trim_atomic(trans, iter, &delete); + ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) break; } - ret = bch2_trans_update(trans, iter, &delete, 0) ?: + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_NOFAIL); if (ret) @@ -1171,7 +1181,7 @@ retry: goto retry; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a1d4a25bc42c..6831c002961d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1222,38 +1222,23 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) return ret; } -int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, +int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey_i *new, unsigned flags) { struct bch_fs *c = trans->c; struct bkey _deleted = KEY(0, 0, 0); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; - int iter_flags, ret; + struct bkey unpacked; + int ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(iter->btree_id)) + if (!btree_node_type_needs_gc(path->btree_id)) return 0; - if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { - iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; - iter->flags &= ~BTREE_ITER_WITH_UPDATES; - - old = bch2_btree_iter_peek_slot(iter); - iter->flags |= iter_flags; - - ret = bkey_err(old); - if (ret) - return ret; - } else { - /* - * If BTREE_ITER_CACHED_NOFILL was used, we better not be - * running triggers that do anything on removal (alloc btree): - */ - old = deleted; - } + old = bch2_btree_path_peek_slot(path, &unpacked); if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { @@ -1291,22 +1276,13 @@ void fs_usage_apply_warn(struct btree_trans *trans, pr_err("overlapping with"); if (!i->cached) { - struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); - struct bkey_s_c k; - int ret; - - for_each_btree_key_continue(copy, 0, k, ret) { - if (btree_node_type_is_extents(i->iter->btree_id) - ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(i->k->k.p, k.k->p)) - break; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } - bch2_trans_iter_put(trans, copy); + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); } else { - struct bkey_cached *ck = (void *) i->iter->l[0].b; + struct bkey_cached *ck = (void *) i->path->l[0].b; if (ck->valid) { bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); @@ -1385,31 +1361,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static struct btree_iter *trans_get_update(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct bkey_s_c *k) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if (i->iter->btree_id == btree_id && - (btree_node_type_is_extents(btree_id) - ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && - bkey_cmp(pos, i->k->k.p) < 0 - : !bkey_cmp(pos, i->iter->pos))) { - *k = bkey_i_to_s_c(i->k); - - /* ugly hack.. */ - BUG_ON(btree_iter_live(trans, i->iter)); - trans->iters_live |= 1ULL << i->iter->idx; - return i->iter; - } - - return NULL; -} - static struct bkey_alloc_buf * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, const struct bch_extent_ptr *ptr, struct bkey_alloc_unpacked *u) { @@ -1417,36 +1370,34 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); struct bucket *g; - struct btree_iter *iter; - struct bkey_s_c k; struct bkey_alloc_buf *a; + struct bkey_i *update; int ret; a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); if (IS_ERR(a)) return a; - iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); - if (iter) { - *u = bch2_alloc_unpack(k); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); - } + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(iter); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + update = __bch2_btree_trans_peek_updates(iter); + if (update && !bpos_cmp(update->k.p, pos)) { + *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); + } else { percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } - *_iter = iter; return a; } @@ -1455,7 +1406,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; struct bkey_alloc_buf *a; int ret; @@ -1470,9 +1421,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1481,16 +1432,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; struct bch_replicas_padded r; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1521,13 +1472,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - bch2_trans_update(trans, iter, &s->k_i, 0); + bch2_trans_update(trans, &iter, &s->k_i, 0); bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, 
&r.e, sectors); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1599,7 +1550,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct bkey_alloc_buf *a; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; int ret = 0; @@ -1623,7 +1574,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, if (!deleting) { if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", - iter->pos.inode, iter->pos.offset, u.gen, + iter.pos.inode, iter.pos.offset, u.gen, u.stripe, s.k->p.offset)) { ret = -EIO; goto err; @@ -1637,9 +1588,9 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, } bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1744,17 +1695,17 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, u64 idx, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; s64 ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1784,14 +1735,14 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&n->k, 0); } - bch2_btree_iter_set_pos_to_extent_start(iter); - ret = bch2_trans_update(trans, iter, n, 0); + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); if (ret) goto err; ret = k.k->p.offset - idx; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1843,39 +1794,23 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, } int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct bkey_i *new, unsigned flags) { struct bkey _deleted = KEY(0, 0, 0); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; - int iter_flags, ret; + struct bkey unpacked; + int ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(iter->btree_id)) + if (!btree_node_type_needs_gc(path->btree_id)) return 0; - - if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { - iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; - iter->flags &= ~BTREE_ITER_WITH_UPDATES; - - old = bch2_btree_iter_peek_slot(iter); - iter->flags |= iter_flags; - - ret = bkey_err(old); - if (ret) - return ret; - } else { - /* - * If BTREE_ITER_CACHED_NOFILL was used, we better not be - * running triggers that do anything on removal (alloc btree): - */ - old = deleted; - } + old = bch2_btree_path_peek_slot(path, &unpacked); if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { @@ -1897,7 +1832,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, unsigned 
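bch2_mark_update() and bch2_trans_mark_update() now take a struct btree_path rather than an iterator, and read the old key with bch2_btree_path_peek_slot() instead of toggling BTREE_ITER_WITH_UPDATES on an iterator. A deliberately simplified sketch of that flow (assumes the in-tree headers; example_mark_update and the final bch2_trans_mark_key() call stand in for the real trigger dispatch, which also handles BTREE_TRIGGER_WANTS_OLD_AND_NEW):

static int example_mark_update(struct btree_trans *trans,
			       struct btree_path *path,
			       struct bkey_i *new, unsigned flags)
{
	struct bkey unpacked;
	struct bkey_s_c old;

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (!btree_node_type_needs_gc(path->btree_id))
		return 0;

	/* the old value now comes straight off the btree_path: */
	old = bch2_btree_path_peek_slot(path, &unpacked);

	/* hand old and new to the type-specific trigger: */
	return bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), flags);
}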
sectors) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { @@ -1920,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter->pos.inode, iter->pos.offset, u.gen, + iter.pos.inode, iter.pos.offset, u.gen, bch2_data_types[u.data_type], bch2_data_types[type], bch2_data_types[type]); @@ -1932,9 +1867,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.dirty_sectors = sectors; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3fb91ef60685..4687fba2eed6 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -228,13 +228,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_iter *, +int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, - struct bkey_i *insert, unsigned); +int bch2_trans_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 6a28de30ea3b..5ffb7f0a3bf6 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int err; @@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); @@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->buf[i->bytes] = '\n'; i->bytes++; - k = bch2_btree_iter_next(iter); - i->from = iter->pos; + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) @@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; int err; @@ -336,7 +336,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size) 
break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct btree *prev_node = NULL; int err; @@ -373,11 +373,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(err = bkey_err(k))) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); @@ -396,8 +396,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_advance(iter); - i->from = iter->pos; + bch2_btree_iter_advance(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 02b29681f695..1d510f7728b6 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -183,7 +183,8 @@ int bch2_dirent_rename(struct btree_trans *trans, const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { - struct btree_iter *src_iter = NULL, *dst_iter = NULL; + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = @@ -199,17 +200,16 @@ int bch2_dirent_rename(struct btree_trans *trans, * the target already exists - we're relying on the VFS * to do that check for us for correctness: */ - dst_iter = mode == BCH_RENAME - ? bch2_hash_hole(trans, bch2_dirent_hash_desc, + ret = mode == BCH_RENAME + ? 
bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, dst_hash, dst_dir, dst_name) - : bch2_hash_lookup(trans, bch2_dirent_hash_desc, + : bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_iter); if (ret) goto out; - old_dst = bch2_btree_iter_peek_slot(dst_iter); + old_dst = bch2_btree_iter_peek_slot(&dst_iter); ret = bkey_err(old_dst); if (ret) goto out; @@ -217,17 +217,16 @@ int bch2_dirent_rename(struct btree_trans *trans, if (mode != BCH_RENAME) *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter->pos.offset; + *src_offset = dst_iter.pos.offset; /* Lookup src: */ - src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_iter); + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); if (ret) goto out; - old_src = bch2_btree_iter_peek_slot(src_iter); + old_src = bch2_btree_iter_peek_slot(&src_iter); ret = bkey_err(old_src); if (ret) goto out; @@ -241,7 +240,7 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter->pos; + new_dst->k.p = dst_iter.pos; /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { @@ -251,7 +250,7 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; } else { new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(new_src); @@ -259,10 +258,10 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; bkey_init(&new_src->k); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; - if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && - bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { /* * We have a hash collision for the new dst key, * and new_src - the key we're deleting - is between @@ -275,8 +274,8 @@ int bch2_dirent_rename(struct btree_trans *trans, * If we're not overwriting, we can just insert * new_dst at the src position: */ - new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, src_iter, + new_dst->k.p = src_iter.pos; + bch2_trans_update(trans, &src_iter, &new_dst->k_i, 0); goto out_set_offset; } else { @@ -290,7 +289,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - src_hash, src_iter); + src_hash, &src_iter); if (ret < 0) goto out; @@ -299,15 +298,15 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i, 0); - bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + bch2_trans_update(trans, &src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); out_set_offset: if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; out: - bch2_trans_iter_put(trans, src_iter); - bch2_trans_iter_put(trans, dst_iter); + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); return ret; } @@ -319,12 +318,13 @@ int bch2_dirent_delete_at(struct btree_trans *trans, hash_info, iter); } -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *trans, 
u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, unsigned flags) +int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name, unsigned flags) { - return bch2_hash_lookup(trans, bch2_dirent_hash_desc, + return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, hash_info, dir_inum, name, flags); } @@ -333,26 +333,25 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, const struct qstr *name) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 inum = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - iter = __bch2_dirent_lookup_trans(&trans, dir_inum, - hash_info, name, 0); - ret = PTR_ERR_OR_ZERO(iter); + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, + hash_info, name, 0); if (ret) goto out; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto out; inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); out: BUG_ON(ret == -EINTR); bch2_trans_exit(&trans); @@ -361,7 +360,7 @@ out: int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; @@ -375,7 +374,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -383,7 +382,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; int ret; @@ -412,7 +411,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) break; ctx->pos = dirent.k->p.offset + 1; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e1d8ce377d43..c14f6029e1c9 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -50,8 +50,7 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, u64 *, u64 *, enum bch_rename_mode); -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *, u64, +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, const struct bch_hash_info *, const struct qstr *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7ad74987757f..2c538f9b54f8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -429,13 +429,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -445,6 +446,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip } 
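Helpers that used to hand back an error-encoded iterator pointer (checked with PTR_ERR_OR_ZERO()) now take the iterator as an out-parameter and return an int. A hedged sketch of the new caller shape, loosely following bch2_dirent_lookup() as converted above (assumes the in-tree headers; example_dirent_lookup is a hypothetical wrapper, error handling abbreviated):

static u64 example_dirent_lookup(struct bch_fs *c, u64 dir_inum,
				 const struct bch_hash_info *hash_info,
				 const struct qstr *name)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 inum = 0;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/* int return replaces the old PTR_ERR_OR_ZERO() dance: */
	ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum,
					 hash_info, name, 0);
	if (!ret) {
		k = bch2_btree_iter_peek_slot(&iter);
		if (!bkey_err(k))
			inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
		bch2_trans_iter_exit(&trans, &iter);
	}

	bch2_trans_exit(&trans);
	return inum;
}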
bkey_reassemble(&stripe->key.k_i, k); err: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -704,7 +706,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct disk_reservation *res) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); @@ -719,7 +721,7 @@ retry: if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(iter, start_pos); + bch2_btree_iter_set_pos(&iter, start_pos); continue; } @@ -733,19 +735,19 @@ retry: goto err; found_slot: - start_pos = iter->pos; + start_pos = iter.pos; - ret = ec_stripe_mem_alloc(&trans, iter); + ret = ec_stripe_mem_alloc(&trans, &iter); if (ret) goto err; - stripe->k.p = iter->pos; + stripe->k.p = iter.pos; - ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: + ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -759,15 +761,15 @@ err: static int ec_stripe_bkey_update(struct btree_trans *trans, struct bkey_i_stripe *new) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *existing; unsigned i; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -790,9 +792,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, stripe_blockcount_set(&new->v, i, stripe_blockcount_get(existing, i)); - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -820,7 +822,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey *pos) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_extent e; struct bkey_buf sk; @@ -832,23 +834,23 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, /* XXX this doesn't support the reflink btree */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(pos), - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(pos), + BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } block = bkey_matches_stripe(&s->key.v, k); if (block < 0) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -863,21 +865,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, block); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); next_pos = sk.k->k.p; - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, 0) ?: 
bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); if (!ret) - bch2_btree_iter_set_pos(iter, next_pos); + bch2_btree_iter_set_pos(&iter, next_pos); if (ret == -EINTR) ret = 0; if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1598,7 +1600,7 @@ write: int bch2_stripes_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct genradix_iter giter; struct bkey_i_stripe *new_key; struct stripe *m; @@ -1609,8 +1611,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { if (!m->alive) @@ -1618,13 +1620,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags, - __bch2_stripe_write_key(&trans, iter, m, + __bch2_stripe_write_key(&trans, &iter, m, giter.pos, new_key)); if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -1659,19 +1661,19 @@ int bch2_stripes_read(struct bch_fs *c) int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; size_t i, idx = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); - k = bch2_btree_iter_prev(iter); + k = bch2_btree_iter_prev(&iter); if (!IS_ERR_OR_NULL(k.k)) idx = k.k->p.offset + 1; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 93d55f46233f..9d959b053def 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -58,7 +58,7 @@ static int count_iters_for_insert(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c r_k; for_each_btree_key(trans, iter, @@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans, break; } } + bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_put(trans, iter); break; } } @@ -99,7 +99,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, struct bkey_i *insert, struct bpos *end) { - struct btree_iter *copy; + struct btree_iter copy; struct bkey_s_c k; unsigned nr_iters = 0; int ret; @@ -118,7 +118,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, if (ret < 0) return ret; - copy = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(©, iter); for_each_btree_key_continue(copy, 0, k, ret) { unsigned offset = 0; @@ -149,7 +149,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, break; } - bch2_trans_iter_put(trans, copy); + bch2_trans_iter_exit(trans, ©); return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6524703f3da4..0190605711e5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -615,7 +615,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas, bool compressed) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bpos end = pos; struct bkey_s_c k; bool ret = true; @@ -636,7 +636,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 2189a11ccad8..a6617455ea12 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -19,16 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, struct posix_acl *acl) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL; - struct btree_iter *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); u64 dir_offset = 0; int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -37,8 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); if (ret) goto err; @@ -63,7 +61,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (S_ISDIR(new_inode->bi_mode)) dir_u->bi_nlink++; - ret = bch2_inode_write(trans, dir_iter, dir_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; @@ -82,14 +80,14 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, } /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter->snapshot = U32_MAX; - bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + inode_iter.snapshot = U32_MAX; + bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); - ret = bch2_btree_iter_traverse(inode_iter) ?: - bch2_inode_write(trans, inode_iter, new_inode); + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); return ret; } @@ -98,22 +96,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, struct bch_inode_unpacked *inode_u, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; int ret; - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); - ret = 
PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, 0); if (ret) goto err; @@ -133,11 +130,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_dir_offset = dir_offset; } - ret = bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, dir_iter); - bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); return ret; } @@ -147,35 +144,33 @@ int bch2_unlink_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, - *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; u64 inum, now = bch2_current_time(c); struct bkey_s_c k; int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, - name, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dirent_iter); + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); if (ret) goto err; - k = bch2_btree_iter_peek_slot(dirent_iter); + k = bch2_btree_iter_peek_slot(&dirent_iter); ret = bkey_err(k); if (ret) goto err; inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -192,13 +187,13 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = (S_ISDIR(inode_u->bi_mode) ? 
bch2_empty_dir_trans(trans, inum) : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dirent_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(trans, &dir_iter); return ret; } @@ -236,25 +231,25 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; - struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; + struct btree_iter src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; u64 src_inode, src_offset, dst_inode, dst_offset; u64 now = bch2_current_time(c); int ret; - src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_dir_iter); + ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, + BTREE_ITER_INTENT); if (ret) goto err; src_hash = bch2_hash_info_init(c, src_dir_u); if (dst_dir != src_dir) { - dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_dir_iter); + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); if (ret) goto err; @@ -273,16 +268,14 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_inode_iter); + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, + BTREE_ITER_INTENT); if (ret) goto err; if (dst_inode) { - dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_inode_iter); + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, + BTREE_ITER_INTENT); if (ret) goto err; } @@ -357,18 +350,18 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_inode) dst_inode_u->bi_ctime = now; - ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: (src_dir != dst_dir - ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: - bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: (dst_inode - ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: - bch2_trans_iter_put(trans, dst_inode_iter); - bch2_trans_iter_put(trans, src_inode_iter); - bch2_trans_iter_put(trans, dst_dir_iter); - bch2_trans_iter_put(trans, src_dir_iter); + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); + bch2_trans_iter_exit(trans, &dst_dir_iter); + bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 251029c33164..909db2f104cd 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -867,7 +867,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct page *page; struct readpages_iter readpages_iter; int ret; @@ -876,8 +876,8 @@ void bch2_readahead(struct readahead_control *ractl) BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -898,13 +898,13 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); - bchfs_read(&trans, iter, rbio, inode->v.i_ino, + bchfs_read(&trans, &iter, rbio, inode->v.i_ino, &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); } @@ -913,7 +913,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, u64 inum, struct page *page) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; bch2_page_state_create(page, __GFP_NOFAIL); @@ -923,12 +923,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS); - bchfs_read(&trans, iter, rbio, inum, NULL); + bchfs_read(&trans, &iter, rbio, inum, NULL); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); } @@ -2146,7 +2146,7 @@ static inline int range_has_data(struct bch_fs *c, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -2161,7 +2161,7 @@ static inline int range_has_data(struct bch_fs *c, break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); return bch2_trans_exit(&trans) ?: ret; } @@ -2471,7 +2471,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst, *del; + struct btree_iter src, dst, del; loff_t shift, new_size; u64 src_start; int ret = 0; @@ -2536,11 +2536,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bch2_bkey_buf_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 
9), BTREE_ITER_INTENT); - dst = bch2_trans_copy_iter(&trans, src); - del = bch2_trans_copy_iter(&trans, src); + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); while (ret == 0 || ret == -EINTR) { struct disk_reservation disk_res = @@ -2555,8 +2555,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bch2_trans_begin(&trans); k = insert - ? bch2_btree_iter_peek_prev(src) - : bch2_btree_iter_peek(src); + ? bch2_btree_iter_peek_prev(&src) + : bch2_btree_iter_peek(&src); if ((ret = bkey_err(k))) continue; @@ -2574,9 +2574,9 @@ reassemble: bch2_cut_front(move_pos, copy.k); copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); + bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - ret = bch2_extent_atomic_end(&trans, dst, copy.k, &atomic_end); + ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); if (ret) continue; @@ -2594,7 +2594,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); + bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2615,20 +2615,20 @@ reassemble: BUG_ON(ret); } - ret = bch2_btree_iter_traverse(del) ?: - bch2_trans_update(&trans, del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); if (!ret) - bch2_btree_iter_set_pos(src, next_pos); + bch2_btree_iter_set_pos(&src, next_pos); } - bch2_trans_iter_put(&trans, del); - bch2_trans_iter_put(&trans, dst); - bch2_trans_iter_put(&trans, src); + bch2_trans_iter_exit(&trans, &del); + bch2_trans_iter_exit(&trans, &dst); + bch2_trans_iter_exit(&trans, &src); bch2_trans_exit(&trans); bch2_bkey_buf_exit(©, c); @@ -2653,18 +2653,18 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { + while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; @@ -2674,20 +2674,20 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; /* already reserved */ if (k.k->type == KEY_TYPE_reservation && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -2696,7 +2696,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 
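bch2_trans_copy_iter() likewise now clones into a caller-supplied on-stack iterator instead of allocating one from the transaction, so there is no return value to check. A minimal sketch of the duplicate-then-delete pattern used by extent_front_merge() and the fcollapse/finsert code above (assumes the in-tree headers; example_delete_dup is hypothetical):

static int example_delete_dup(struct btree_trans *trans,
			      struct btree_iter *iter, unsigned flags)
{
	struct btree_iter copy;
	int ret;

	bch2_trans_copy_iter(&copy, iter);	/* void: nothing to check */
	ret = bch2_btree_delete_at(trans, &copy, flags);
	bch2_trans_iter_exit(trans, &copy);

	return ret;
}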
reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_front(iter.pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; @@ -2720,7 +2720,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_extent_update(&trans, iter, &reservation.k_i, + ret = bch2_extent_update(&trans, &iter, &reservation.k_i, &disk_res, &inode->ei_journal_seq, 0, &i_sectors_delta, true); i_sectors_acct(c, inode, "a_res, i_sectors_delta); @@ -2730,7 +2730,7 @@ bkey_err: if (ret == -EINTR) ret = 0; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -3010,7 +3010,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 isize, next_data = MAX_LFS_FILESIZE; int ret; @@ -3031,7 +3031,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) } else if (k.k->p.offset >> 9 > isize) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3106,7 +3106,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 isize, next_hole = MAX_LFS_FILESIZE; int ret; @@ -3135,7 +3135,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f6c058540712..570ae826ebb5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -142,7 +142,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; @@ -150,11 +150,10 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, iter, &inode_u) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_NOFAIL); @@ -166,7 +165,7 @@ retry: if (!ret) bch2_inode_update_after_write(c, inode, &inode_u, fields); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -687,7 +686,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; int ret; @@ -713,9 +712,8 @@ retry: kfree(acl); acl = NULL; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -727,12 +725,12 @@ retry: goto btree_err; } - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_NOFAIL); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; @@ -883,7 +881,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); @@ -902,23 +900,23 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(ei->v.i_ino, start >> 9), 0); retry: bch2_trans_begin(&trans); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -939,7 +937,7 @@ retry: offset_into_extent), cur.k); bch2_key_resize(&cur.k->k, sectors); - cur.k->k.p = iter->pos; + cur.k->k.p = iter.pos; cur.k->k.p.offset += cur.k->k.size; if (have_extent) { @@ -952,8 +950,8 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(iter, - POS(iter->pos.inode, iter->pos.offset + sectors)); + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); } if (ret == -EINTR) @@ -963,7 +961,7 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 36eba46d566e..eb979e79eaac 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -19,7 +19,7 
@@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 sectors = 0; int ret; @@ -33,7 +33,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) sectors += k.k->size; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret ?: sectors; } @@ -42,24 +42,24 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, - POS(0, inode_nr), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; if (snapshot) - *snapshot = iter->pos.snapshot; + *snapshot = iter.pos.snapshot; ret = k.k->type == KEY_TYPE_inode ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; err: - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -74,13 +74,16 @@ static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - struct btree_iter *inode_iter = - bch2_trans_get_iter(trans, BTREE_ID_inodes, - SPOS(0, inode->bi_inum, snapshot), - BTREE_ITER_INTENT); - int ret = bch2_btree_iter_traverse(inode_iter) ?: - bch2_inode_write(trans, inode_iter, inode); - bch2_trans_iter_put(trans, inode_iter); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_inode_write(trans, &iter, inode); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -100,7 +103,7 @@ static int write_inode(struct btree_trans *trans, static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; int ret; @@ -111,11 +114,11 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) dir_hash_info = bch2_hash_info_init(c, &dir_inode); - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, iter); - bch2_trans_iter_put(trans, iter); + &dir_hash_info, &iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -230,13 +233,13 @@ static int reattach_inode(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto out; @@ -247,7 +250,7 @@ static int remove_backpointer(struct btree_trans *trans, ret = remove_dirent(trans, k.k->p); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -343,7 +346,7 @@ static int hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { 
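Where an iterator may never get initialized before the common cleanup path runs, the converted code zero-initializes it with { NULL } (bch2_write_inode() above and hash_check_key() below do this), presumably so the unconditional bch2_trans_iter_exit() stays safe. A hedged sketch of that pattern around the new int-returning bch2_inode_peek() (assumes the in-tree headers; example_touch_inode is hypothetical):

static int example_touch_inode(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;

	ret = bch2_inode_peek(trans, &iter, &inode_u, inum,
			      BTREE_ITER_INTENT) ?:
	      bch2_inode_write(trans, &iter, &inode_u);

	/* exit is still fine if bch2_inode_peek() bailed early: */
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}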
struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; char buf[200]; struct bkey_s_c k; u64 hash; @@ -378,12 +381,12 @@ static int hash_check_key(struct btree_trans *trans, } if (bkey_deleted(k.k)) { - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); goto bad_hash; } } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " @@ -513,7 +516,7 @@ noinline_for_stack static int check_inodes(struct bch_fs *c, bool full) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; int ret; @@ -532,12 +535,12 @@ static int check_inodes(struct bch_fs *c, bool full) (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| BCH_INODE_I_SECTORS_DIRTY| BCH_INODE_UNLINKED))) { - ret = check_inode(&trans, iter, inode); + ret = check_inode(&trans, &iter, inode); if (ret) break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(ret == -EINTR); @@ -547,7 +550,7 @@ static int check_inodes(struct bch_fs *c, bool full) static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i *u; int ret; @@ -567,29 +570,29 @@ static int fix_overlapping_extent(struct btree_trans *trans, * assume things about extent overwrites - we should be running the * triggers manually here */ - iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } static int inode_backpointer_exists(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto out; @@ -598,7 +601,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; out: - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -618,7 +621,7 @@ static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf prev; u64 i_sectors = 0; @@ -630,12 +633,12 @@ static int check_extents(struct bch_fs *c) bch_verbose(c, "checking extents"); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 
0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); retry: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { if (w.have_inode && w.cur_inum != k.k->p.inode && @@ -700,12 +703,12 @@ retry: i_sectors += k.k->size; bch2_bkey_buf_reassemble(&prev, c, k); - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); } fsck_err: if (ret == -EINTR) goto retry; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_bkey_buf_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } @@ -890,7 +893,7 @@ static int check_dirents(struct bch_fs *c) struct inode_walker w = inode_walker_init(); struct bch_hash_info hash_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; unsigned nr_subdirs = 0; int ret = 0; @@ -898,18 +901,18 @@ static int check_dirents(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); do { ret = lockrestart_do(&trans, - check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); + check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs)); if (ret) break; - } while (bch2_btree_iter_advance(iter)); - bch2_trans_iter_put(&trans, iter); + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); return bch2_trans_exit(&trans) ?: ret; } @@ -923,7 +926,7 @@ static int check_xattrs(struct bch_fs *c) struct inode_walker w = inode_walker_init(); struct bch_hash_info hash_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -931,12 +934,12 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); retry: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) @@ -945,7 +948,7 @@ retry: if (fsck_err_on(!w.have_inode, c, "xattr for missing inode %llu", k.k->p.inode)) { - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = bch2_btree_delete_at(&trans, &iter, 0); if (ret) break; continue; @@ -955,17 +958,17 @@ retry: hash_info = bch2_hash_info_init(c, &w.inode); ret = hash_check_key(&trans, bch2_xattr_hash_desc, - &hash_info, iter, k); + &hash_info, &iter, k); if (ret) break; - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); } fsck_err: if (ret == -EINTR) goto retry; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); return bch2_trans_exit(&trans) ?: ret; } @@ -1114,7 +1117,7 @@ fsck_err: static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; struct pathbuf path = { 0, 0, NULL }; @@ -1139,7 +1142,7 @@ static int check_directory_structure(struct bch_fs *c) if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(ret == -EINTR); @@ -1215,7 +1218,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, u64 
start, u64 *end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; struct bch_inode_unpacked u; @@ -1253,7 +1256,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) @@ -1267,7 +1270,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; int ret; @@ -1289,7 +1292,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links bch2_trans_cond_resched(&trans); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -1304,7 +1307,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; struct bch_inode_unpacked u; @@ -1346,14 +1349,14 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_inode_write(&trans, iter, &u)); + bch2_btree_iter_traverse(&iter) ?: + bch2_inode_write(&trans, &iter, &u)); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); } } fsck_err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 63f50891594c..2b653ee03f4f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -292,18 +292,18 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, return 0; } -struct btree_iter *bch2_inode_peek(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u64 inum, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; int ret; if (trans->c->opts.inodes_use_key_cache) flags |= BTREE_ITER_CACHED; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -317,10 +317,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, if (ret) goto err; - return iter; + return 0; err: - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } int bch2_inode_write(struct btree_trans *trans, @@ -482,12 +482,12 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -struct btree_iter *bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) { struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, pos, *hint; int ret = 0; @@ -513,9 +513,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, start = min; pos = start; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, 
BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -553,8 +553,8 @@ again: ret = -ENOSPC; if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* Retry from start */ @@ -566,8 +566,8 @@ found_slot: k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* We may have raced while the iterator wasn't pointing at pos: */ @@ -578,13 +578,13 @@ found_slot: *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - return iter; + return 0; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); @@ -617,9 +617,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, - POS(0, inode_nr), iter_flags); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), iter_flags); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -636,14 +636,14 @@ retry: bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter->pos; + delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: + ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -654,12 +654,11 @@ err: static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter = { NULL }; int ret; - iter = bch2_inode_peek(trans, inode, inode_nr, 0); - ret = PTR_ERR_OR_ZERO(iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d67af4f56f05..25bef104ebcc 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -57,8 +57,8 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_peek(struct btree_trans *, - struct bch_inode_unpacked *, u64, unsigned); +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u64, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -71,8 +71,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, u32, u64); +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); int bch2_inode_rm(struct bch_fs *, u64, bool); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 
34295419190d..bee33258c0d8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -202,7 +202,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, s64 *disk_sectors_delta) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c old; unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); @@ -213,7 +213,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, *i_sectors_delta = 0; *disk_sectors_delta = 0; - iter = bch2_trans_copy_iter(trans, extent_iter); + bch2_trans_copy_iter(&iter, extent_iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - @@ -246,7 +246,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, * less: */ if (!bkey_cmp(old.k->p, new->k.p)) { - old = bch2_btree_iter_next(iter); + old = bch2_btree_iter_next(&iter); ret = bkey_err(old); if (ret) break; @@ -261,7 +261,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -311,12 +311,11 @@ int bch2_extent_update(struct btree_trans *trans, : 0; if (i_sectors_delta || new_i_size) { - struct btree_iter *inode_iter; + struct btree_iter inode_iter; struct bch_inode_unpacked inode_u; - inode_iter = bch2_inode_peek(trans, &inode_u, + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, k->k.p.inode, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) return ret; @@ -345,11 +344,11 @@ int bch2_extent_update(struct btree_trans *trans, inode_p.inode.k.p.snapshot = iter->snapshot; - ret = bch2_trans_update(trans, inode_iter, + ret = bch2_trans_update(trans, &inode_iter, &inode_p.inode.k_i, 0); } - bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (ret) return ret; @@ -424,18 +423,18 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, u64 *journal_seq, s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(inum, start), BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + ret = bch2_fpunch_at(&trans, &iter, POS(inum, end), journal_seq, i_sectors_delta); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret == -EINTR) @@ -451,28 +450,28 @@ static int bch2_write_index_default(struct bch_write_op *op) struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; int ret; bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); - k->k.p.snapshot = iter->snapshot; + k->k.p.snapshot = iter.snapshot; bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); - bch2_cut_front(iter->pos, sk.k); + bch2_cut_front(iter.pos, sk.k); - ret = bch2_extent_update(&trans, iter, sk.k, + ret = bch2_extent_update(&trans, &iter, sk.k, &op->res, 
op_journal_seq(op), op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); @@ -481,11 +480,11 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) >= 0) + if (bkey_cmp(iter.pos, k->k.p) >= 0) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1638,7 +1637,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; @@ -1649,12 +1648,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if (bkey_err(k)) goto err; @@ -1681,7 +1680,7 @@ retry: goto err; out: bch2_rbio_done(rbio); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return; @@ -1747,7 +1746,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - struct btree_iter *iter = NULL; + struct btree_iter iter; struct bkey_i *new; struct bkey_s_c k; int ret = 0; @@ -1755,9 +1754,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto out; @@ -1792,9 +1791,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - ret = bch2_trans_update(trans, iter, new, 0); + ret = bch2_trans_update(trans, &iter, new, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1965,7 +1964,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, struct bkey_buf *orig_k) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 reflink_offset; int ret; @@ -1973,10 +1972,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1993,10 +1992,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, goto err; } - *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: - 
bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2273,7 +2272,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; @@ -2282,10 +2281,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS); retry: bch2_trans_begin(&trans); @@ -2302,15 +2300,15 @@ retry: break; } - bch2_btree_iter_set_pos(iter, + bch2_btree_iter_set_pos(&iter, POS(inode, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -2341,7 +2339,7 @@ retry: if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags); if (ret) @@ -2357,7 +2355,7 @@ retry: if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f2060f903cbc..68fb2ebd91ac 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -250,7 +250,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; for_each_btree_node(&trans, iter, i, POS_MIN, @@ -259,7 +259,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) bch2_trans_exit(&trans); return; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); } ret = bch2_trans_exit(&trans); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 1f65eca48c6e..1899326d9754 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags enum btree_id btree_id) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0; @@ -47,13 +47,13 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -71,10 +71,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); 
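The inode.c and io.c hunks above show the second half of the interface change: helpers that used to return a struct btree_iter * (or an ERR_PTR on failure), such as bch2_inode_peek() and bch2_inode_create(), now take a caller-owned iterator as an out-parameter and return an int. A sketch of the new calling convention, modeled on the converted bch2_setattr_nonsize() and bch2_extent_update() callers (the example_update_inode() name is illustrative and the error handling is simplified):

static int example_update_inode(struct btree_trans *trans, u64 inum)
{
        struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;

        /* Fills iter and inode_u; returns 0 or a negative error. */
        ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;

        /* ... modify inode_u here ... */

        ret = bch2_inode_write(trans, &iter, &inode_u);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

On failure bch2_inode_peek() exits the iterator itself, and the converted callers (bch2_setattr_nonsize(), bch2_inode_find_by_inum_trans()) still follow up with an unconditional bch2_trans_iter_exit(), so repeating the exit appears to be safe; the { NULL } initializer covers paths where the helper is never reached.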
+ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -88,7 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&sk, c); @@ -107,7 +107,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct closure cl; struct btree *b; struct bkey_buf k; @@ -139,9 +139,9 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false); + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(iter); + b = bch2_btree_iter_peek_node(&iter); ret = 0; goto retry; } @@ -150,7 +150,7 @@ retry: break; } } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret) goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3c2e566beb2d..eb2b91f7e682 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -56,7 +56,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; @@ -69,9 +69,9 @@ int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k; @@ -86,7 +86,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -102,9 +102,9 @@ int bch2_migrate_index_update(struct bch_write_op *op) bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); new = bkey_i_to_extent(_new.k); - bch2_cut_front(iter->pos, &new->k_i); + bch2_cut_front(iter.pos, &new->k_i); - bch2_cut_front(iter->pos, insert); + bch2_cut_front(iter.pos, insert); bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); @@ -146,7 +146,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - ret = bch2_sum_sector_overwrites(&trans, iter, insert, + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, &extending, &should_check_enospc, &i_sectors_delta, @@ -165,13 +165,13 @@ int bch2_migrate_index_update(struct bch_write_op *op) next_pos = insert->k.p; - ret = bch2_trans_update(&trans, iter, insert, 0) ?: + ret = bch2_trans_update(&trans, &iter, insert, 0) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { - bch2_btree_iter_set_pos(iter, next_pos); + bch2_btree_iter_set_pos(&iter, next_pos); atomic_long_inc(&c->extent_migrate_done); } err: @@ -180,7 +180,7 @@ err: 
if (ret) break; next: - while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; @@ -188,18 +188,18 @@ next: continue; nomatch: if (m->ctxt) { - BUG_ON(k.k->p.offset <= iter->pos.offset); + BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); - atomic64_add(k.k->p.offset - iter->pos.offset, + atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); goto next; } out: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); @@ -524,13 +524,13 @@ err: static int lookup_inode(struct btree_trans *trans, struct bpos pos, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -548,7 +548,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto err; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -566,7 +566,7 @@ static int __bch2_move_data(struct bch_fs *c, struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; @@ -580,8 +580,8 @@ static int __bch2_move_data(struct bch_fs *c, stats->btree_id = btree_id; stats->pos = start; - iter = bch2_trans_get_iter(&trans, btree_id, start, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, start, + BTREE_ITER_PREFETCH); if (rate) bch2_ratelimit_reset(rate); @@ -612,9 +612,9 @@ static int __bch2_move_data(struct bch_fs *c, bch2_trans_begin(&trans); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); - stats->pos = iter->pos; + stats->pos = iter.pos; if (!k.k) break; @@ -687,12 +687,12 @@ next: atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); bch2_trans_cond_resched(&trans); } out: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&sk, c); @@ -786,7 +786,7 @@ static int bch2_move_btree(struct bch_fs *c, bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; enum btree_id id; struct data_opts data_opts; @@ -813,7 +813,7 @@ static int bch2_move_btree(struct bch_fs *c, bpos_cmp(b->key.k.p, end_pos)) > 0) break; - stats->pos = iter->pos; + stats->pos = iter.pos; switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: @@ -827,13 +827,13 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(&trans, iter, + ret = bch2_btree_node_rewrite(&trans, &iter, b->data->keys.seq, 0) ?: ret; next: 
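The migrate.c and move.c conversions above keep the existing loop structure; only the iterator changes from a pointer to an on-stack value, so bch2_btree_iter_peek(), bch2_btree_iter_advance() and the update helpers now take &iter. A condensed sketch of that loop shape after the conversion, following the pattern of the bch2_fiemap() and check_extents() hunks (the example_walk_extents() name is illustrative, and the per-key work is elided):

static int example_walk_extents(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
                             BTREE_ITER_PREFETCH);
retry:
        bch2_trans_begin(&trans);

        while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
                /* ... inspect or rewrite the extent here ... */
                bch2_btree_iter_advance(&iter);
        }

        if (ret == -EINTR)
                goto retry;

        bch2_trans_iter_exit(&trans, &iter);
        return bch2_trans_exit(&trans) ?: ret;
}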
bch2_trans_cond_resched(&trans); } + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (kthread && kthread_should_stop()) break; } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7861781a4a7f..9b0f4d3f176d 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -357,7 +357,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -372,7 +372,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); return bch2_trans_exit(&trans) ?: ret; } @@ -419,7 +419,7 @@ int bch2_fs_quota_read(struct bch_fs *c) unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bch_inode_unpacked u; struct bkey_s_c k; int ret; @@ -450,7 +450,7 @@ int bch2_fs_quota_read(struct bch_fs *c) KEY_TYPE_QUOTA_NOCHECK); } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); return bch2_trans_exit(&trans) ?: ret; } @@ -717,13 +717,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_i_quota *new_quota, struct qc_dqblk *qdq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (unlikely(ret)) @@ -742,8 +742,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, if (qdq->d_fieldmask & QC_INO_HARD) new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); - bch2_trans_iter_put(trans, iter); + ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 71b0f14f41f3..11208e83fabe 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -327,7 +327,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, bch2_bkey_buf_reassemble(&tmp, c, k); bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); + b->c.btree_id, b->c.level - 1); bch2_btree_and_journal_iter_advance(&iter); i++; @@ -518,16 +518,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, enum btree_id id, unsigned level, struct bkey_i *k) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_node_iter(trans, id, k->k.p, - BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); + bch2_trans_node_iter_init(trans, &iter, id, k->k.p, + BTREE_MAX_DEPTH, level, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -545,16 +545,16 @@ static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) static int 
__bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3d9c5c5b0eba..576cfbccf5b5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -116,7 +116,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, struct bkey_i *orig) { struct bch_fs *c = trans->c; - struct btree_iter *reflink_iter; + struct btree_iter reflink_iter = { NULL }; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -129,8 +129,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { - if (reflink_iter->pos.inode) { - bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + if (reflink_iter.pos.inode) { + bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); continue; } @@ -142,7 +142,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, goto err; /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos_to_extent_start(reflink_iter); + bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bkey_init(&r_v->k); r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter->pos; + r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); r_v->k.version = orig->k.version; @@ -161,7 +161,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - ret = bch2_trans_update(trans, reflink_iter, r_v, 0); + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); if (ret) goto err; @@ -172,9 +172,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: - if (!IS_ERR(reflink_iter)) - c->reflink_hint = reflink_iter->pos.offset; - bch2_trans_iter_put(trans, reflink_iter); + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); return ret; } @@ -184,7 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue(iter, 0, k, ret) { + for_each_btree_key_continue(*iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) break; @@ -203,7 +202,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *dst_iter, *src_iter; + struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = src_start; @@ -223,13 +222,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - 
src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); - dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); while ((ret == 0 || ret == -EINTR) && - bkey_cmp(dst_iter->pos, dst_end) < 0) { + bkey_cmp(dst_iter.pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -239,31 +238,31 @@ s64 bch2_remap_range(struct bch_fs *c, break; } - dst_done = dst_iter->pos.offset - dst_start.offset; + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(src_iter, src_want); + bch2_btree_iter_set_pos(&src_iter, src_want); - src_k = get_next_src(src_iter, src_end); + src_k = get_next_src(&src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; - if (bkey_cmp(src_want, src_iter->pos) < 0) { - ret = bch2_fpunch_at(&trans, dst_iter, + if (bkey_cmp(src_want, src_iter.pos) < 0) { + ret = bch2_fpunch_at(&trans, &dst_iter, bpos_min(dst_end, - POS(dst_iter->pos.inode, dst_iter->pos.offset + - src_iter->pos.offset - src_want.offset)), + POS(dst_iter.pos.inode, dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset)), journal_seq, i_sectors_delta); continue; } if (src_k.k->type != KEY_TYPE_reflink_p) { - bch2_btree_iter_set_pos_to_extent_start(src_iter); + bch2_btree_iter_set_pos_to_extent_start(&src_iter); bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - ret = bch2_make_extent_indirect(&trans, src_iter, + ret = bch2_make_extent_indirect(&trans, &src_iter, new_src.k); if (ret) continue; @@ -286,43 +285,42 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k->k.p = dst_iter->pos; + new_dst.k->k.p = dst_iter.pos; bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, new_dst.k, + dst_end.offset - dst_iter.pos.offset)); + ret = bch2_extent_update(&trans, &dst_iter, new_dst.k, &disk_res, journal_seq, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_put(&trans, dst_iter); - bch2_trans_iter_put(&trans, src_iter); + bch2_trans_iter_exit(&trans, &dst_iter); + bch2_trans_iter_exit(&trans, &src_iter); - BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); - BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); + BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); - dst_done = dst_iter->pos.offset - dst_start.offset; - new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + dst_done = dst_iter.pos.offset - dst_start.offset; + new_i_size = min(dst_iter.pos.offset << 9, new_i_size); do { struct bch_inode_unpacked inode_u; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; bch2_trans_begin(&trans); - inode_iter = bch2_inode_peek(&trans, &inode_u, + ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, dst_start.inode, BTREE_ITER_INTENT); - ret2 = PTR_ERR_OR_ZERO(inode_iter); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, journal_seq, 0); } - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, 
&inode_iter); } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 236023494191..c6a132b3c5bb 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -139,18 +139,18 @@ struct bch_hash_desc { bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); }; -static __always_inline struct btree_iter * +static __always_inline int bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, u64 inode, const void *key, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; int ret; - for_each_btree_key(trans, iter, desc.btree_id, + for_each_btree_key(trans, *iter, desc.btree_id, POS(inode, desc.hash_key(info, key)), BTREE_ITER_SLOTS|flags, k, ret) { if (iter->pos.inode != inode) @@ -158,7 +158,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, key)) - return iter; + return 0; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -166,35 +166,33 @@ bch2_hash_lookup(struct btree_trans *trans, break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret ?: -ENOENT); + return ret ?: -ENOENT; } -static __always_inline struct btree_iter * +static __always_inline int bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, u64 inode, const void *key) { - struct btree_iter *iter; struct bkey_s_c k; int ret; - for_each_btree_key(trans, iter, desc.btree_id, + for_each_btree_key(trans, *iter, desc.btree_id, POS(inode, desc.hash_key(info, key)), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; if (k.k->type != desc.key_type) - return iter; + return 0; } + bch2_trans_iter_exit(trans, iter); - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, iter); - - return ERR_PTR(ret ?: -ENOSPC); + return ret ?: -ENOSPC; } static __always_inline @@ -203,13 +201,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, const struct bch_hash_info *info, struct btree_iter *start) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_copy_iter(trans, start); + bch2_trans_copy_iter(&iter, start); - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && @@ -218,13 +216,12 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) { - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ret = 1; break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -234,7 +231,7 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_info *info, u64 inode, struct bkey_i *insert, int flags) { - struct btree_iter *iter, *slot = NULL; + struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; int ret; @@ -242,7 +239,7 @@ int bch2_hash_set(struct btree_trans *trans, for_each_btree_key(trans, iter, desc.btree_id, POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter.pos.inode != inode) break; if (k.k->type == desc.key_type) { @@ -253,9 +250,9 @@ int bch2_hash_set(struct btree_trans *trans, continue; } - if (!slot && + if (!slot.path && !(flags & 
BCH_HASH_SET_MUST_REPLACE)) - slot = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -264,8 +261,8 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - bch2_trans_iter_put(trans, slot); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(trans, &iter); return ret; found: @@ -277,11 +274,11 @@ not_found: } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { - if (!found && slot) + if (!found && slot.path) swap(iter, slot); - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, 0); + insert->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, insert, 0); } goto out; @@ -318,16 +315,16 @@ int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_info *info, u64 inode, const void *key) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_hash_lookup(trans, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); + if (ret) + return ret; - ret = bch2_hash_delete_at(trans, desc, info, iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1d793e554084..b18ca3947ac8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -494,11 +494,11 @@ static void __bch2_fs_free(struct bch_fs *c) percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); - if (c->btree_iters_bufs) + if (c->btree_paths_bufs) for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - free_percpu(c->btree_iters_bufs); + free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -783,7 +783,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b5ce336f00ca..92e58f5c6bbf 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -290,7 +290,7 @@ static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, nr_compressed_extents = 0, @@ -325,6 +325,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c break; } } + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 43b514974d91..1b583b134853 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -29,7 +29,7 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct 
bkey_i_cookie k; int ret; @@ -37,13 +37,12 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete: %i", ret); goto err; @@ -51,8 +50,8 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting once"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (first) in test_delete: %i", ret); goto err; @@ -60,14 +59,14 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting twice"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (second) in test_delete: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -75,7 +74,7 @@ err: static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -84,12 +83,12 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete_written: %i", ret); goto err; @@ -99,14 +98,14 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_journal_flush_all_pins(&c->journal); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error in test_delete_written: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -114,7 +113,7 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -156,12 +155,12 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) BUG_ON(k.k->p.offset != --i); BUG_ON(i); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -169,7 +168,7 @@ err: static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter 
= { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -210,14 +209,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); } BUG_ON(i); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -225,7 +224,7 @@ err: static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -263,7 +262,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i); i += 2; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr * 2); @@ -280,7 +279,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return ret; @@ -289,7 +288,7 @@ err: static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -326,7 +325,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr); @@ -345,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return 0; @@ -358,21 +357,19 @@ err: static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -380,21 +377,19 @@ static int test_peek_end(struct bch_fs *c, u64 nr) static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -540,18 +535,18 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - 
bch2_btree_iter_set_pos(iter, POS(0, test_rand())); + bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) { bch_err(c, "error in rand_lookup: %i", ret); @@ -559,63 +554,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } +static int rand_mixed_trans(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i_cookie *cookie, + u64 i, u64 pos) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_set_pos(iter, POS(0, pos)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && ret != -EINTR) + bch_err(trans->c, "lookup error in rand_mixed: %i", ret); + if (ret) + return ret; + + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; + bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + + return 0; +} + static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter iter; + struct bkey_i_cookie cookie; int ret = 0; - u64 i; + u64 i, rand; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(iter, POS(0, test_rand())); - - k = bch2_btree_iter_peek(iter); - ret = bkey_err(k); + rand = test_rand(); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "lookup error in rand_mixed: %i", ret); + bch_err(c, "update error in rand_mixed: %i", ret); break; } - - if (!(i & 3) && k.k) { - struct bkey_i_cookie k; - - bkey_cookie_init(&k.k_i); - k.k.p = iter->pos; - - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); - if (ret) { - bch_err(c, "update error in rand_mixed: %i", ret); - break; - } - } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i delete; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -626,9 +631,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bkey_init(&delete.k); delete.k.p = k.k->p; - ret = bch2_trans_update(trans, iter, &delete, 0); + ret = bch2_trans_update(trans, &iter, &delete, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -658,7 +663,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie insert; int ret = 0; @@ -670,11 +675,11 @@ static int seq_insert(struct bch_fs *c, u64 nr) for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - insert.k.p = iter->pos; + insert.k.p = iter.pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - 
bch2_trans_update(&trans, iter, &insert.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &insert.k_i, 0)); if (ret) { bch_err(c, "error in seq_insert: %i", ret); break; @@ -683,7 +688,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) if (++i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -692,7 +697,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -700,7 +705,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -709,7 +714,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -722,14 +727,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &u.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &u.k_i, 0)); if (ret) { bch_err(c, "error in seq_overwrite: %i", ret); break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index af1f415fb5e7..44a556518d4a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -540,7 +540,7 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -TRACE_EVENT(trans_get_iter, +TRACE_EVENT(trans_get_path, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, enum btree_id btree_id, @@ -814,7 +814,7 @@ TRACE_EVENT(iter_traverse, __entry->ret) ); -TRACE_EVENT(iter_set_search_pos, +TRACE_EVENT(path_set_pos, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, enum btree_id btree_id, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index bf4164f98743..babbfaadeb3f 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -122,23 +122,22 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info const char *name, void *buffer, size_t size, int type) { struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_s_c k; int ret; - iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, - &X_SEARCH(type, name, strlen(name)), - 0); - ret = PTR_ERR_OR_ZERO(iter); + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); if (ret) - goto err; + goto err1; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - goto err; + goto err2; xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); @@ -148,8 +147,9 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_trans_iter_put(trans, iter); -err: +err2: + bch2_trans_iter_exit(trans, &iter); +err1: return ret == -ENOENT ? 
-ENODATA : ret; } @@ -279,7 +279,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 inum = dentry->d_inode->i_ino; @@ -301,7 +301,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; -- cgit From 807dda8c83620ab0fd1d93bbe8bdc4a289cbd045 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Aug 2021 15:18:31 -0400 Subject: bcachefs: Kill bpos_diff() XXX check for perf regression This improves the btree iterator lookup code by using trans_for_each_iter_inorder(). Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 31 ------------------------------- fs/bcachefs/btree_iter.c | 43 +++++++++++++++++++++---------------------- 2 files changed, 21 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 72b4267031d8..904ceb67a029 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -171,37 +171,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) return bpos_cmp(l, r) > 0 ? l : r; } -#define sbb(a, b, borrow) \ -do { \ - typeof(a) d1, d2; \ - \ - d1 = a - borrow; \ - borrow = d1 > a; \ - \ - d2 = d1 - b; \ - borrow += d2 > d1; \ - a = d2; \ -} while (0) - -/* returns a - b: */ -static inline struct bpos bpos_sub(struct bpos a, struct bpos b) -{ - int borrow = 0; - - sbb(a.snapshot, b.snapshot, borrow); - sbb(a.offset, b.offset, borrow); - sbb(a.inode, b.inode, borrow); - return a; -} - -static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -{ - if (bpos_cmp(l, r) > 0) - swap(l, r); - - return bpos_sub(r, l); -} - void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 06379f3e40a6..a856f5e90727 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1739,36 +1739,35 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, unsigned locks_want, unsigned level, bool intent) { - struct btree_path *path, *best = NULL; + struct btree_path *path, *path_pos = NULL; struct bpos pos_min = POS_MIN; int i; BUG_ON(trans->restarted); - trans_for_each_path(trans, path) { - if (path->cached != cached || - path->btree_id != btree_id || - path->level != level) - continue; - - if (best) { - int cmp = bkey_cmp(bpos_diff(best->pos, pos), - bpos_diff(path->pos, pos)); + btree_trans_sort_paths(trans); - if (cmp < 0 || - ((cmp == 0 && (path->ref || path->preserve)))) - continue; - } + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, + pos, + level) > 0) + break; - best = path; + path_pos = path; } - if (best) { - __btree_path_get(best, intent); - path = btree_path_set_pos(trans, best, pos, intent); + if (path_pos && + path_pos->cached == cached && + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = btree_path_set_pos(trans, path_pos, pos, intent); path->preserve = true; } else { - path = btree_path_alloc(trans, NULL); + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; __btree_path_get(path, 
intent); path->pos = pos; @@ -1808,9 +1807,9 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, trace_trans_get_path(_RET_IP_, trans->ip, btree_id, &pos, locks_want, path->uptodate, - best ? &best->pos : &pos_min, - best ? best->locks_want : U8_MAX, - best ? best->uptodate : U8_MAX); + path_pos ? &path_pos->pos : &pos_min, + path_pos ? path_pos->locks_want : U8_MAX, + path_pos ? path_pos->uptodate : U8_MAX); return path; } -- cgit From 068bcaa589e268fe0bca1f972b3a08a18be8c5dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Sep 2021 17:18:57 -0400 Subject: bcachefs: Add more assertions for locking btree iterators out of order btree_path_traverse_all() traverses btree iterators in sorted order, and thus shouldn't see transaction restarts due to potential deadlocks - but sometimes we do. This patch adds some more assertions and tracks some more state to help track this down. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 50 ++++++++++++++++++++----------------------- fs/bcachefs/btree_iter.h | 10 +++++++++ fs/bcachefs/btree_key_cache.c | 4 ++-- fs/bcachefs/btree_locking.h | 18 +++++++++++++--- fs/bcachefs/btree_types.h | 2 ++ 5 files changed, 52 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a856f5e90727..edb4084f7b90 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -17,8 +17,6 @@ #include -static inline void btree_trans_sort_paths(struct btree_trans *); - static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, struct btree_path *); @@ -159,7 +157,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(path, level, want); + mark_btree_node_locked(trans, path, level, want); return true; } else { return false; @@ -195,7 +193,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, return false; success: - mark_btree_node_intent_locked(path, level); + mark_btree_node_intent_locked(trans, path, level); return true; } @@ -1078,7 +1076,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) t = btree_lock_want(path, b->c.level); if (t != BTREE_NODE_UNLOCKED) { six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); } btree_path_level_init(trans, path, b); @@ -1167,7 +1165,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(path, path->level, lock_type); + mark_btree_node_locked(trans, path, path->level, lock_type); btree_path_level_init(trans, path, b); return 0; } @@ -1259,7 +1257,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(path, level, lock_type); + mark_btree_node_locked(trans, path, level, lock_type); btree_path_level_init(trans, path, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && @@ -1340,6 +1338,9 @@ retry_all: path = trans->paths + trans->sorted[i]; EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); +#ifdef CONFIG_BCACHEFS_DEBUG + trans->traverse_all_idx = 
path->idx; +#endif ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); if (ret) @@ -1361,6 +1362,9 @@ retry_all: out: bch2_btree_cache_cannibalize_unlock(c); +#ifdef CONFIG_BCACHEFS_DEBUG + trans->traverse_all_idx = U8_MAX; +#endif trans->in_traverse_all = false; trace_trans_traverse_all(trans->ip, trace_ip); @@ -2319,13 +2323,9 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) BUG_ON(trans->paths[idx].sorted_idx != i); } } -#else -static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} -#endif static void btree_trans_verify_sorted(struct btree_trans *trans) { -#ifdef CONFIG_BCACHEFS_DEBUG struct btree_path *path, *prev = NULL; unsigned i; @@ -2333,14 +2333,22 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) BUG_ON(prev && btree_path_cmp(prev, path) > 0); prev = path; } -#endif } +#else +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} +#endif -static noinline void __btree_trans_sort_paths(struct btree_trans *trans) +void __bch2_btree_trans_sort_paths(struct btree_trans *trans) { int i, l = 0, r = trans->nr_sorted, inc = 1; bool swapped; + btree_trans_verify_sorted_refs(trans); + + if (trans->paths_sorted) + goto out; + /* * Cocktail shaker sort: this is efficient because iterators will be * mostly sorteda. @@ -2368,22 +2376,10 @@ static noinline void __btree_trans_sort_paths(struct btree_trans *trans) } while (swapped); trans->paths_sorted = true; - +out: btree_trans_verify_sorted(trans); } -static inline void btree_trans_sort_paths(struct btree_trans *trans) -{ - btree_trans_verify_sorted_refs(trans); - - if (trans->paths_sorted) { - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - btree_trans_verify_sorted(trans); - return; - } - __btree_trans_sort_paths(trans); -} - static inline void btree_path_list_remove(struct btree_trans *trans, struct btree_path *path) { @@ -2410,7 +2406,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, { unsigned i; - path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 983d61122458..19ce6a6ece7d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -57,6 +57,16 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over paths within a transaction: */ +void __bch2_btree_trans_sort_paths(struct btree_trans *); + +static inline void btree_trans_sort_paths(struct btree_trans *trans) +{ + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + trans->paths_sorted) + return; + __bch2_btree_trans_sort_paths(trans); +} + static inline struct btree_path * __trans_next_path(struct btree_trans *trans, unsigned idx) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9bdc2c3f21bf..9d3a64f37f09 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -297,7 +297,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -319,7 +319,7 @@ retry: goto retry; } - mark_btree_node_locked(path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index d05689180c63..ff58870311f3 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -57,7 +57,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_path *path, +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -67,12 +68,20 @@ static inline void mark_btree_node_locked(struct btree_path *path, path->nodes_locked |= 1 << level; path->nodes_intent_locked |= type << level; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_locked = _RET_IP_; + btree_trans_sort_paths(trans); + BUG_ON(trans->in_traverse_all && + trans->traverse_all_idx != U8_MAX && + path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx); +#endif } -static inline void mark_btree_node_intent_locked(struct btree_path *path, +static inline void mark_btree_node_intent_locked(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - mark_btree_node_locked(path, level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -111,6 +120,9 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path) while (path->nodes_locked) btree_node_unlock(path, __ffs(path->nodes_locked)); +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_locked = 0; +#endif } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b7cded2095ff..ce64d3ad768b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -255,6 +255,7 @@ struct btree_path { } l[BTREE_MAX_DEPTH]; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; + unsigned long ip_locked; #endif }; @@ -368,6 +369,7 @@ struct btree_trans { struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; + u8 traverse_all_idx; pid_t pid; #endif unsigned long ip; 
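To make the sorting strategy above concrete: __bch2_btree_trans_sort_paths() uses a cocktail shaker sort because the paths array is usually already close to sorted, so each sweep does very little work before the sort terminates. Below is a minimal, self-contained userspace sketch of the same strategy, assuming plain integers stand in for struct btree_path and the int comparison stands in for btree_path_cmp(); it is an illustration only, not the bcachefs implementation.

/* Illustrative sketch, not bcachefs code: cocktail shaker sort. */
#include <stdio.h>

static void swap_int(int *a, int *b)
{
    int t = *a; *a = *b; *b = t;
}

static void shaker_sort(int *v, int n)
{
    int l = 0, r = n, i, inc = 1;
    int swapped;

    do {
        swapped = 0;
        /* sweep forwards, then backwards, over a shrinking window: */
        for (i = inc > 0 ? l : r - 2;
             i + 1 < r && i >= l;
             i += inc) {
            if (v[i] > v[i + 1]) {
                swap_int(&v[i], &v[i + 1]);
                swapped = 1;
            }
        }

        if (inc > 0)
            --r;    /* largest element of the window is now in place */
        else
            ++l;    /* smallest element of the window is now in place */
        inc = -inc;
    } while (swapped);
}

int main(void)
{
    int v[] = { 1, 2, 4, 3, 5, 6, 8, 7 };  /* mostly sorted already */
    int i, n = sizeof(v) / sizeof(v[0]);

    shaker_sort(v, n);
    for (i = 0; i < n; i++)
        printf("%d ", v[i]);
    printf("\n");
    return 0;
}

On nearly-sorted input like the example, the sort settles after a couple of sweeps, which is why it suits an array of paths that is re-sorted frequently after small changes.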
-- cgit From 1ae29c1faaa3af9e8c490206634f2648016634cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Sep 2021 21:19:48 -0400 Subject: bcachefs: Extent btree iterators are no longer special Since iter->real_pos was introduced, we no longer have to deal with extent btree iterators that have skipped past deleted keys - this is a real performance improvement on btree updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index edb4084f7b90..16384543149e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -601,13 +601,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, bch2_btree_node_iter_verify(&l->iter, l->b); /* - * For interior nodes, the iterator will have skipped past - * deleted keys: - * - * For extents, the iterator may have skipped past deleted keys (but not - * whiteouts) + * For interior nodes, the iterator will have skipped past deleted keys: */ - p = level || btree_node_type_is_extents(path->btree_id) + p = level ? bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -829,8 +825,7 @@ fixup_done: */ if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && - (b->c.level || - btree_node_type_is_extents(path->btree_id))) { + b->c.level) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; -- cgit From 1d3ecd7ea790cb650d8c80741ecd4f03780ff78b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Sep 2021 21:23:11 -0400 Subject: bcachefs: Tighten up btree locking invariants New rule is: if a btree path holds any locks it should be holding precisely the locks wanted (accoringing to path->level and path->locks_want). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 69 +++++++++++++++---------------------- fs/bcachefs/btree_iter.h | 1 - fs/bcachefs/btree_update_interior.c | 8 ++--- 3 files changed, 32 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 16384543149e..790c1185db63 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -224,7 +224,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, ? 
path->l[l].b->c.lock.state.seq : 0); fail_idx = l; - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } l++; @@ -235,10 +234,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - while (fail_idx >= 0) { - btree_node_unlock(path, fail_idx); - path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; - --fail_idx; + if (fail_idx >= 0) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + --fail_idx; + } while (fail_idx >= 0); } if (path->uptodate == BTREE_ITER_NEED_RELOCK) @@ -376,14 +379,14 @@ static void bch2_btree_path_verify_locks(struct btree_path *path) { unsigned l; - for (l = 0; btree_path_node(path, l); l++) { - if (path->uptodate >= BTREE_ITER_NEED_RELOCK && - !btree_node_locked(path, l)) - continue; + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + return; + } + for (l = 0; btree_path_node(path, l); l++) BUG_ON(btree_lock_want(path, l) != btree_node_locked_type(path, l)); - } } void bch2_trans_verify_locks(struct btree_trans *trans) @@ -421,6 +424,7 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, is_btree_node(path, l) ? path->l[l].b->c.lock.state.seq : 0); + __bch2_btree_path_unlock(path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); btree_trans_restart(trans); return false; @@ -668,9 +672,6 @@ void bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; - if (!bch2_debug_check_iterators) - return; - trans_for_each_path(trans, path) bch2_btree_path_verify(trans, path); } @@ -1013,7 +1014,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans, } if (!parent_locked) - btree_node_unlock(path, b->c.level + 1); + btree_node_unlock(path, plevel); } static inline void __btree_path_level_init(struct btree_path *path, @@ -1055,21 +1056,17 @@ static inline void btree_path_level_init(struct btree_trans *trans, */ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) { - enum btree_node_locked_type t; struct btree_path *path; trans_for_each_path(trans, path) if (!path->cached && btree_path_pos_in_node(path, b)) { - /* - * bch2_btree_path_node_drop() has already been called - - * the old node we're replacing has already been - * unlocked and the pointer invalidated - */ - BUG_ON(btree_node_locked(path, b->c.level)); + enum btree_node_locked_type t = + btree_lock_want(path, b->c.level); - t = btree_lock_want(path, b->c.level); - if (t != BTREE_NODE_UNLOCKED) { + if (path->nodes_locked && + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); } @@ -1078,18 +1075,6 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) } } -void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) -{ - struct btree_path *path; - unsigned level = b->c.level; - - trans_for_each_path(trans, path) - if (path->l[level].b == b) { - btree_node_unlock(path, level); - path->l[level].b = BTREE_ITER_NO_NODE_DROP; - } -} - /* * A btree node has been modified in such a way as to invalidate iterators - fix * them: @@ -1392,6 +1377,9 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, { unsigned l = path->level; + if (!path->nodes_locked) + btree_path_get_locks(trans, path, false, _THIS_IP_); 
+ while (btree_path_node(path, l) && !btree_path_good_node(trans, path, l, check_pos)) { btree_node_unlock(path, l); @@ -1584,14 +1572,12 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (cmp < 0 || !btree_path_advance_to_pos(path, &path->l[l], 8)) __btree_path_level_init(path, l); - - /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); } - if (l != path->level) + if (l != path->level) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(path); + } out: bch2_btree_path_verify(trans, path); #ifdef CONFIG_BCACHEFS_DEBUG @@ -2760,9 +2746,10 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) if (!path->nodes_locked) continue; - pr_buf(out, " path %u %c %s:", + pr_buf(out, " path %u %c l=%u %s:", path->idx, path->cached ? 'c' : 'b', + path->level, bch2_btree_ids[path->btree_id]); bch2_bpos_to_text(out, path->pos); pr_buf(out, "\n"); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 19ce6a6ece7d..2459291231ea 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -219,7 +219,6 @@ static inline void bch2_btree_path_downgrade(struct btree_path *path) void bch2_trans_downgrade(struct btree_trans *); void bch2_trans_node_add(struct btree_trans *trans, struct btree *); -void bch2_trans_node_drop(struct btree_trans *, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check bch2_btree_iter_traverse(struct btree_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6dcce175fd8b..f31db1310715 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1429,7 +1429,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, /* Successful split, update the path to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_trans_node_drop(trans, b); if (n3) bch2_trans_node_add(trans, n3); if (n2) @@ -1694,14 +1693,16 @@ retry: bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); + bch2_trans_verify_paths(trans); + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + bch2_trans_verify_paths(trans); + bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&m->c.lock, SIX_LOCK_intent); - bch2_trans_node_drop(trans, b); - bch2_trans_node_drop(trans, m); bch2_trans_node_add(trans, n); @@ -1798,7 +1799,6 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_trans_node_drop(trans, b); bch2_trans_node_add(trans, n); bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); -- cgit From f48361b00c4e6854a66ebf32c11849d4762a239e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Sep 2021 00:05:08 -0400 Subject: bcachefs: Drop some fast path tracepoints These haven't turned out to be useful Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 42 +------ fs/bcachefs/btree_update_leaf.c | 1 - fs/bcachefs/trace.h | 247 ---------------------------------------- 3 files changed, 1 insertion(+), 289 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 790c1185db63..81351673ede3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -210,21 +210,8 @@ static inline bool btree_path_get_locks(struct btree_trans 
*trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) { - (upgrade - ? trace_node_upgrade_fail - : trace_node_relock_fail)(trans->ip, trace_ip, - path->cached, - path->btree_id, &path->pos, - l, path->l[l].lock_seq, - is_btree_node(path, l) - ? 0 - : (unsigned long) path->l[l].b, - is_btree_node(path, l) - ? path->l[l].b->c.lock.state.seq - : 0); + : bch2_btree_node_relock(trans, path, l))) fail_idx = l; - } l++; } while (l < path->locks_want); @@ -414,16 +401,6 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, l < path->locks_want && btree_path_node(path, l); l++) { if (!bch2_btree_node_relock(trans, path, l)) { - trace_node_relock_fail(trans->ip, _RET_IP_, - path->cached, - path->btree_id, &path->pos, - l, path->l[l].lock_seq, - is_btree_node(path, l) - ? 0 - : (unsigned long) path->l[l].b, - is_btree_node(path, l) - ? path->l[l].b->c.lock.state.seq - : 0); __bch2_btree_path_unlock(path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); btree_trans_restart(trans); @@ -1473,9 +1450,6 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->uptodate = BTREE_ITER_UPTODATE; out: BUG_ON((ret == -EINTR) != !!trans->restarted); - trace_iter_traverse(trans->ip, trace_ip, - path->cached, - path->btree_id, &path->pos, ret); bch2_btree_path_verify(trans, path); return ret; } @@ -1539,9 +1513,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, int cmp) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos old_pos = path->pos; -#endif unsigned l = path->level; EBUG_ON(trans->restarted); @@ -1580,10 +1551,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } out: bch2_btree_path_verify(trans, path); -#ifdef CONFIG_BCACHEFS_DEBUG - trace_path_set_pos(trans->ip, _RET_IP_, path->btree_id, - &old_pos, &new_pos, l); -#endif return path; } @@ -1725,7 +1692,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, bool intent) { struct btree_path *path, *path_pos = NULL; - struct bpos pos_min = POS_MIN; int i; BUG_ON(trans->restarted); @@ -1790,12 +1756,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, btree_path_get_locks(trans, path, true, _THIS_IP_); } - trace_trans_get_path(_RET_IP_, trans->ip, btree_id, - &pos, locks_want, path->uptodate, - path_pos ? &path_pos->pos : &pos_min, - path_pos ? path_pos->locks_want : U8_MAX, - path_pos ? 
path_pos->uptodate : U8_MAX); - return path; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4fb5a5666e20..dfa1086e5247 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -225,7 +225,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); - trace_btree_insert_key(c, b, insert->k); return true; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 44a556518d4a..960dcc8ce3e6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->required, __entry->cl) ); -TRACE_EVENT(btree_insert_key, - TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), - TP_ARGS(c, b, k), - - TP_STRUCT__entry( - __field(u8, id ) - __field(u64, inode ) - __field(u64, offset ) - __field(u32, size ) - ), - - TP_fast_assign( - __entry->id = b->c.btree_id; - __entry->inode = k->k.p.inode; - __entry->offset = k->k.p.offset; - __entry->size = k->k.size; - ), - - TP_printk("btree %u: %llu:%llu len %u", __entry->id, - __entry->inode, __entry->offset, __entry->size) -); - DEFINE_EVENT(btree_node, btree_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -540,69 +518,6 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -TRACE_EVENT(trans_get_path, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *got_pos, - unsigned got_locks, - unsigned got_uptodate, - struct bpos *src_pos, - unsigned src_locks, - unsigned src_uptodate), - TP_ARGS(trans_ip, caller_ip, btree_id, - got_pos, got_locks, got_uptodate, - src_pos, src_locks, src_uptodate), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, got_pos_inode ) - __field(u64, got_pos_offset ) - __field(u32, got_pos_snapshot ) - __field(u8, got_locks ) - __field(u8, got_uptodate ) - __field(u64, src_pos_inode ) - __field(u64, src_pos_offset ) - __field(u32, src_pos_snapshot ) - __field(u8, src_locks ) - __field(u8, src_uptodate ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->got_pos_inode = got_pos->inode; - __entry->got_pos_offset = got_pos->offset; - __entry->got_pos_snapshot = got_pos->snapshot; - __entry->got_locks = got_locks; - __entry->got_uptodate = got_uptodate; - __entry->src_pos_inode = src_pos->inode; - __entry->src_pos_offset = src_pos->offset; - __entry->src_pos_snapshot = src_pos->snapshot; - __entry->src_locks = src_locks; - __entry->src_uptodate = src_uptodate; - ), - - TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " - "src %llu:%llu:%u l %u u %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->got_pos_inode, - __entry->got_pos_offset, - __entry->got_pos_snapshot, - __entry->got_locks, - __entry->got_uptodate, - __entry->src_pos_inode, - __entry->src_pos_offset, - __entry->src_pos_snapshot, - __entry->src_locks, - __entry->src_uptodate) -); - TRACE_EVENT(transaction_restart_ip, TP_PROTO(unsigned long caller, unsigned long ip), TP_ARGS(caller, ip), @@ -772,96 +687,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, TP_ARGS(trans_ip, caller_ip, btree_id, pos) ); -TRACE_EVENT(iter_traverse, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, 
- int ret), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, key_cache ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(s32, ret ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->key_cache = key_cache; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->ret = ret; - ), - - TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->key_cache, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->ret) -); - -TRACE_EVENT(path_set_pos, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *old_pos, - struct bpos *new_pos, - unsigned good_level), - TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, old_pos_inode ) - __field(u64, old_pos_offset ) - __field(u32, old_pos_snapshot ) - __field(u64, new_pos_inode ) - __field(u64, new_pos_offset ) - __field(u32, new_pos_snapshot ) - __field(u8, good_level ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->old_pos_inode = old_pos->inode; - __entry->old_pos_offset = old_pos->offset; - __entry->old_pos_snapshot = old_pos->snapshot; - __entry->new_pos_inode = new_pos->inode; - __entry->new_pos_offset = new_pos->offset; - __entry->new_pos_snapshot = new_pos->snapshot; - __entry->good_level = good_level; - ), - - TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - __entry->good_level) -); - TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, @@ -954,78 +779,6 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -DECLARE_EVENT_CLASS(node_lock_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, - level, iter_seq, node, node_seq), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, key_cache ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(u32, level ) - __field(u32, iter_seq ) - __field(u32, node ) - __field(u32, node_seq ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->key_cache = key_cache; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->level = level; - __entry->iter_seq = iter_seq; - __entry->node = node; - __entry->node_seq = node_seq; - ), - - TP_printk("%ps %pS key cache %u btree %u pos 
%llu:%llu:%u level %u iter seq %u node %u node seq %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->key_cache, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, __entry->iter_seq, - __entry->node, __entry->node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_upgrade_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, - level, iter_seq, node, node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_relock_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, - level, iter_seq, node, node_seq) -); - #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From 22b383ad7e1928de8da14d66a7154a9bfebf9a46 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Sep 2021 00:22:32 -0400 Subject: bcachefs: Kill retry loop in btree merge path Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 34 +++++----------------------------- fs/bcachefs/btree_update_interior.h | 6 +----- 2 files changed, 6 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f31db1310715..73a79563487d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1572,12 +1572,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; - int ret = 0, ret2 = 0; - -retry: - ret = bch2_btree_path_traverse(trans, path, false); - if (ret) - return ret; + int ret = 0; BUG_ON(!path->should_be_locked); BUG_ON(!btree_node_locked(path, level)); @@ -1587,7 +1582,7 @@ retry: if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; - goto out; + return 0; } sib_pos = sib == btree_prev_sib @@ -1715,29 +1710,10 @@ retry: bch2_btree_update_done(as); out: - bch2_trans_verify_locks(trans); - if (sib_path) - bch2_path_put(trans, sib_path, true); - - /* - * Don't downgrade locks here: we're called after successful insert, - * and the caller will downgrade locks after a successful insert - * anyways (in case e.g. 
a split was required first) - * - * And we're also called when inserting into interior nodes in the - * split path, and downgrading to read locks in there is potentially - * confusing: - */ - return ret ?: ret2; err: - if (sib_path) - bch2_path_put(trans, sib_path, true); - sib_path = NULL; - - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - - goto out; + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; } /** diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c06cfcc66db7..8e03bd987d6d 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -129,11 +129,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, { struct btree *b; - if (path->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return 0; - - if (!bch2_btree_node_relock(trans, path, level)) - return 0; + EBUG_ON(!btree_node_locked(path, level)); b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) -- cgit From cf3c68cda684b41bcdd4d9a8dba4abd4abb32881 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Sep 2021 15:38:12 -0400 Subject: bcachefs: No need to clone iterators for update Since btree_path is now internally refcounted, we don't need to clone an iterator before calling bch2_trans_update() if we'll be mutating it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index dfa1086e5247..310442fcc37f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -875,12 +875,7 @@ static noinline int extent_front_merge(struct btree_trans *trans, bkey_reassemble(update, k); if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) { - struct btree_iter update_iter; - - bch2_trans_copy_iter(&update_iter, iter); - ret = bch2_btree_delete_at(trans, &update_iter, flags); - bch2_trans_iter_exit(trans, &update_iter); - + ret = bch2_btree_delete_at(trans, iter, flags); if (ret) return ret; @@ -959,10 +954,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, } if (bkey_cmp(k.k->p, insert->k.p) <= 0) { - bch2_trans_copy_iter(&update_iter, &iter); - ret = bch2_btree_delete_at(trans, &update_iter, flags); - bch2_trans_iter_exit(trans, &update_iter); - + ret = bch2_btree_delete_at(trans, &iter, flags); if (ret) goto err; } @@ -975,9 +967,10 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - bch2_trans_copy_iter(&update_iter, &iter); - bch2_trans_update(trans, &update_iter, update, flags); - bch2_trans_iter_exit(trans, &update_iter); + ret = bch2_trans_update(trans, &iter, update, flags); + if (ret) + goto err; + goto out; } next: @@ -1081,8 +1074,7 @@ int __bch2_btree_insert(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - + BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, 0); bch2_trans_iter_exit(trans, &iter); -- cgit From 8ffa63cd7eed11597d90d83a5fdac0ba5fdf6834 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 00:58:13 -0400 Subject: bcachefs: Enabled shard_inode_numbers by default We'd like performance increasing options to be on by default. 
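The point about refcounting can be made with a tiny model: once a path handle carries a reference count, the caller no longer has to clone it defensively before a mutating call - the mutating helper clones only when the count shows other users, which is what btree_path_make_mut()/btree_path_clone() arrange in the previous commit. The sketch below uses invented names (struct path, path_make_mut, and so on) and is not the bcachefs API; it only illustrates the clone-on-write pattern.

/* Illustrative sketch, not bcachefs code: clone-on-write via refcounts. */
#include <stdio.h>
#include <stdlib.h>

struct path {
    int ref;
    int pos;
};

static struct path *path_alloc(int pos)
{
    struct path *p = malloc(sizeof(*p));

    p->ref = 1;
    p->pos = pos;
    return p;
}

static struct path *path_get(struct path *p)
{
    p->ref++;
    return p;
}

static void path_put(struct path *p)
{
    if (--p->ref == 0)
        free(p);
}

/* Clone only if somebody else still holds a reference: */
static struct path *path_make_mut(struct path *p)
{
    if (p->ref > 1) {
        struct path *n = path_alloc(p->pos);

        path_put(p);
        return n;
    }
    return p;
}

int main(void)
{
    struct path *a = path_alloc(10);
    struct path *b = path_get(a);   /* second user of the same path */

    b = path_make_mut(b);           /* private copy; a is untouched  */
    b->pos = 20;

    printf("a->pos=%d b->pos=%d\n", a->pos, b->pos);
    path_put(a);
    path_put(b);
    return 0;
}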
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 0799c9d2bee0..4b79e3bc0ac0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -171,7 +171,7 @@ enum opt_type { x(shard_inode_numbers, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, false, \ + BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FORMAT|OPT_MOUNT, \ -- cgit From c404f2038602580b6bdddeff5e9a4d42717da3b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 13:55:33 -0400 Subject: bcachefs: Add a missing btree_path_make_mut() call Also add another small helper, btree_path_clone(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 81351673ede3..3aa2777e40a5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1491,16 +1491,22 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, trans->paths_sorted = false; } -struct btree_path * __must_check -__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent) +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, + bool intent) { - struct btree_path *new = btree_path_alloc(trans, path); + struct btree_path *new = btree_path_alloc(trans, src); - btree_path_copy(trans, new, path); + btree_path_copy(trans, new, src); __btree_path_get(new, intent); + return new; +} + +struct btree_path * __must_check +__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent) +{ __btree_path_put(path, intent); - path = new; + path = btree_path_clone(trans, path, intent); path->preserve = false; #ifdef CONFIG_BCACHEFS_DEBUG path->ip_allocated = _RET_IP_; @@ -2030,6 +2036,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) cmp = bpos_cmp(k.k->p, iter->path->pos); if (cmp) { + iter->path = bch2_btree_path_make_mut(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); iter->path->pos = k.k->p; trans->paths_sorted = false; } -- cgit From db92f2ea5ed576748b538d15446cebb65bb8d31f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 15:34:16 -0400 Subject: bcachefs: Optimize btree lookups in write path This patch significantly reduces the number of btree lookups required in the extent update path. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 +++++++- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_update_leaf.c | 9 ++++++++- fs/bcachefs/io.c | 10 ++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3aa2777e40a5..d2ee6e9aa370 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1805,6 +1805,12 @@ hole: /* Btree iterators: */ +int __must_check +__bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); +} + int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { @@ -2416,7 +2422,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->path = bch2_path_get(trans, flags & BTREE_ITER_CACHED, btree_id, - btree_iter_search_key(iter), + iter->pos, locks_want, depth, flags & BTREE_ITER_INTENT); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2459291231ea..58add0bb1c81 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -221,6 +221,7 @@ void bch2_trans_downgrade(struct btree_trans *); void bch2_trans_node_add(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 310442fcc37f..eb4217a3b719 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -985,7 +985,14 @@ next: bch2_bkey_merge(c, bkey_i_to_s(insert), k); out: if (!bkey_deleted(&insert->k)) { - bch2_btree_iter_set_pos(&iter, insert->k.p); + /* + * Rewinding iterators is expensive: get a new one and the one + * that points to the start of insert will be cloned from: + */ + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, insert, flags); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bee33258c0d8..f95ceb820faa 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -281,6 +281,16 @@ int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; -- cgit From 8ee0134e03b541f3723d92586f1385ab50e42ac2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 20:23:30 -0400 Subject: bcachefs: Consolidate intent lock code in btree_path_up_until_good_node We need to take all needed intent locks when relocking an iterator: bch2_btree_path_traverse() had a special cased, faster version of this, but it really should be in up_until_good_node() so that set_pos() can use it too. 
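The saving comes from not repeating the search: the write path now keeps a path traversed at the insert position, so the later peek and merge steps reuse it instead of doing another lookup from scratch. As a loose analogy only - invented names, not the bcachefs data structures - a lookup routine that remembers its last result can answer nearby queries by walking a few elements rather than restarting:

/* Illustrative sketch, not bcachefs code: reusing the last lookup position. */
#include <stdio.h>
#include <stddef.h>

struct cached_lookup {
    const int *tbl;     /* sorted table */
    size_t nr;
    size_t hint;        /* index returned by the previous lookup */
};

/* Return the index of the first element >= key, starting from the hint: */
static size_t lookup(struct cached_lookup *c, int key)
{
    size_t i = c->hint;

    if (i == c->nr || c->tbl[i] >= key) {
        /* hint is at or past the answer: walk backwards */
        while (i > 0 && c->tbl[i - 1] >= key)
            i--;
    } else {
        /* hint is before the answer: walk forwards */
        while (i < c->nr && c->tbl[i] < key)
            i++;
    }

    c->hint = i;
    return i;
}

int main(void)
{
    const int tbl[] = { 2, 4, 8, 16, 32, 64 };
    struct cached_lookup c = { .tbl = tbl, .nr = 6, .hint = 0 };

    printf("%zu\n", lookup(&c, 10));    /* walks from the start: index 3 */
    printf("%zu\n", lookup(&c, 12));    /* nearby query: reuses the hint */
    return 0;
}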
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d2ee6e9aa370..fbe1a1170df6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1352,10 +1352,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, struct btree_path *path, int check_pos) { - unsigned l = path->level; - - if (!path->nodes_locked) - btree_path_get_locks(trans, path, false, _THIS_IP_); + unsigned i, l = path->level; while (btree_path_node(path, l) && !btree_path_good_node(trans, path, l, check_pos)) { @@ -1364,6 +1361,17 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, l++; } + /* If we need intent locks, take them too: */ + for (i = l + 1; + i < path->locks_want && btree_path_node(path, i); + i++) + if (!bch2_btree_node_relock(trans, path, i)) + while (l <= i) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } + return l; } @@ -1381,7 +1389,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { - unsigned l, depth_want = path->level; + unsigned depth_want = path->level; int ret = 0; /* @@ -1403,17 +1411,6 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->level = btree_path_up_until_good_node(trans, path, 0); - /* If we need intent locks, take them too: */ - for (l = path->level + 1; - l < path->locks_want && btree_path_node(path, l); - l++) - if (!bch2_btree_node_relock(trans, path, l)) - while (path->level <= l) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; - } - /* * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, -- cgit From b301105b48d2805ca0e29b1b0f660cf2232511ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 21:24:05 -0400 Subject: bcachefs: normalize_read_intent_locks This is a new approach to avoiding the self deadlock we'd get if we tried to take a write lock on a node while holding a read lock - we simply upgrade the readers to intent. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 73 ++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index eb4217a3b719..576f0739fdbd 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -48,14 +48,12 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, insert_l(&i[0])->b == insert_l(&i[1])->b; } -inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { struct bch_fs *c = trans->c; - bch2_btree_node_lock_write(trans, path, b); - if (path->cached) return; @@ -71,6 +69,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + bch2_btree_node_lock_write(trans, path, b); + bch2_btree_node_prep_for_write(trans, path, b); +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -495,6 +501,50 @@ err: return ret; } +static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) +{ + struct btree *b = path_l(path)->b; + + do { + if (path->nodes_locked && + path->nodes_locked != path->nodes_intent_locked) + BUG_ON(!bch2_btree_path_upgrade(trans, path, path->level + 1)); + } while ((path = prev_btree_path(trans, path)) && + path_l(path)->b == b); +} + +/* + * Check for nodes that we have both read and intent locks on, and upgrade the + * readers to intent: + */ +static inline void normalize_read_intent_locks(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i, nr_read = 0, nr_intent = 0; + + trans_for_each_path_inorder(trans, path, i) { + struct btree_path *next = i + 1 < trans->nr_sorted + ? 
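The self deadlock being avoided is simple to state: taking a write lock means waiting for every shared (read) holder of the node to drain, and if the committing transaction is itself one of those holders it waits on itself forever. Upgrading its own read locks to intent locks removes it from the reader count, so the write lock can then be taken. The toy, single-threaded model below - deliberately much simpler than the real SIX lock, with invented names - demonstrates only that counting rule.

/* Toy model, not the bcachefs SIX lock. */
#include <stdio.h>
#include <stdbool.h>

struct toy_lock {
    int readers;    /* shared holders */
    bool intent;    /* intent holder present (not counted as a reader) */
};

static bool write_lock_would_succeed(const struct toy_lock *l)
{
    /* every shared holder must drain first - including ourselves */
    return l->readers == 0;
}

int main(void)
{
    /* we hold an intent lock plus, via another path, a read lock: */
    struct toy_lock node = { .readers = 1, .intent = true };

    printf("write lock while we still hold a read lock: %s\n",
           write_lock_would_succeed(&node) ? "ok" : "self deadlock");

    /* normalize: upgrade our read lock to intent before committing */
    node.readers--;

    printf("after upgrading our read lock to intent:    %s\n",
           write_lock_would_succeed(&node) ? "ok" : "self deadlock");
    return 0;
}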
trans->paths + trans->sorted[i + 1] + : NULL; + + if (path->nodes_locked) { + if (path->nodes_intent_locked) + nr_intent++; + else + nr_read++; + } + + if (!next || path_l(path)->b != path_l(next)->b) { + if (nr_read && nr_intent) + upgrade_readers(trans, path); + + nr_read = nr_intent = 0; + } + } + + bch2_trans_verify_locks(trans); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -538,9 +588,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - trans_for_each_update(trans, i) - BUG_ON(!btree_node_intent_locked(i->path, i->level)); - ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| @@ -586,12 +633,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } btree_insert_entry_checks(trans, i); } - bch2_trans_verify_locks(trans); + + normalize_read_intent_locks(trans); trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans, i->path, - insert_l(i)->b); + if (!same_leaf_as_prev(trans, i)) { + btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write); + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); -- cgit From caaa66aa546a27e75fb3cf32df1906140f85f1c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 21:25:32 -0400 Subject: bcachefs: Better approach to write vs. read lock deadlocks Instead of unconditionally upgrading read locks to intent locks in do_bch2_trans_commit(), this patch changes the path that takes write locks to first trylock, and then if trylock fails check if we have a conflicting read lock, and restart the transaction if necessary. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 108 +++++++++++++++++++++++++--------------- fs/bcachefs/trace.h | 15 ++++++ 2 files changed, 82 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 576f0739fdbd..ab5cca892e1a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -545,6 +545,54 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans) bch2_trans_verify_locks(trans); } +static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { + //if (path == pos) + // break; + + if (path->nodes_locked != path->nodes_intent_locked) + return true; + } + + return false; +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (!six_trylock_write(&insert_l(i)->b->c.lock)) { + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + + __btree_node_lock_type(trans->c, insert_l(i)->b, + SIX_LOCK_write); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; +fail: + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + + trace_trans_restart_would_deadlock_write(trans->ip); + return btree_trans_restart(trans); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -554,10 +602,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_path *path; struct bkey_s_c old; int ret, u64s_delta = 0; + trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", + buf, (void *) trans->ip, + (void *) i->ip_allocated, invalid); + bch2_fatal_error(c); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } + trans_for_each_update(trans, i) { struct bkey u; @@ -599,48 +662,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - /* - * Can't be holding any read locks when we go to take write locks: - * another thread could be holding an intent lock on the same node we - * have a read lock on, and it'll block trying to take a write lock - * (because we hold a read lock) and it could be blocking us by holding - * its own read lock (while we're trying to to take write locks). 
- * - * note - this must be done after bch2_trans_journal_preres_get_cold() - * or anything else that might call bch2_trans_relock(), since that - * would just retake the read locks: - */ - trans_for_each_path(trans, path) - if (path->nodes_locked != path->nodes_intent_locked && - !bch2_btree_path_upgrade(trans, path, path->level + 1)) { - trace_trans_restart_upgrade(trans->ip, trace_ip, - path->btree_id, &path->pos); - return btree_trans_restart(trans); - } - - trans_for_each_update(trans, i) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", - buf, (void *) trans->ip, - (void *) i->ip_allocated, invalid); - bch2_fatal_error(c); - return -EINVAL; - } - btree_insert_entry_checks(trans, i); - } - normalize_read_intent_locks(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) { - btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write); - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 960dcc8ce3e6..21d026277540 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -756,6 +756,21 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot) ); +TRACE_EVENT(trans_restart_would_deadlock_write, + TP_PROTO(unsigned long trans_ip), + TP_ARGS(trans_ip), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + ), + + TP_printk("%ps", (void *) __entry->trans_ip) +); + TRACE_EVENT(trans_restart_mem_realloced, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, unsigned long bytes), -- cgit From 5b5b03e7320dedcdfef968a1add47c18ab7b98a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Sep 2021 23:04:04 -0400 Subject: bcachefs: Add missing BTREE_ITER_INTENT No reason not to be using it here... 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index a6617455ea12..6bc82559c9b1 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -110,7 +110,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, 0); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); if (ret) goto err; -- cgit From c79272d1e4bb2ff65f4d68f193239d900658c5e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Sep 2021 19:05:34 -0400 Subject: bcachefs: Fix some compiler warnings gcc couldn't always deduce that written wasn't used uninitialized Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f51dd3ec0797..f01f78952942 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1251,7 +1251,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) bool dump_bset_maps = false; bool have_retry = false; int ret = 0, best = -1, write = READ; - unsigned i, written, written2; + unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; -- cgit From aa76bd3321f018af08629bb98ba2d5f0b4b18546 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Sep 2021 12:38:40 -0400 Subject: bcachefs: Add a missing bch2_trans_relock() call This was causing an assertion to pop in fsck, in one of the repair paths. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ab5cca892e1a..8ab771334557 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -795,6 +795,9 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (ret) return ret; + if (!bch2_trans_relock(trans)) + return -EINTR; + percpu_ref_get(&c->writes); return 0; } -- cgit From aae4eea60cf0c824abe6be809f0260df8574f49a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Sep 2021 16:04:49 -0400 Subject: bcachefs: Improve btree_node_mem_ptr optimization This patch checks b->hash_val before attempting to lock the node in the btree, which makes it more equivalent to the "lookup in hash table" path - and potentially avoids an unnecessary transaction restart if btree_node_mem_ptr(k) no longer points to the node we want. 
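A minimal sketch of the pattern this describes, in plain C with a pthread mutex (invented names such as cached_ref and lookup_slowpath, not the bcachefs implementation): validate the cached pointer's identity with a cheap field compare before taking its lock, so a stale pointer falls back to the authoritative lookup instead of locking the wrong node and forcing a restart.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

struct node {
	uint64_t        hash_val;       /* identity of the key this node holds */
	pthread_mutex_t lock;
};

struct cached_ref {
	struct node     *mem_ptr;       /* may point at a reused/stale node */
	uint64_t        hash_val;       /* identity we expect mem_ptr to have */
};

/* Stand-in for the authoritative hash table lookup. */
static struct node *lookup_slowpath(uint64_t hash_val)
{
	(void) hash_val;
	return NULL;                    /* "not found" in this toy version */
}

static struct node *get_node_locked(struct cached_ref *ref)
{
	struct node *n = ref->mem_ptr;

	/* Cheap identity check first: don't lock a node that no longer matches. */
	if (n && n->hash_val == ref->hash_val) {
		pthread_mutex_lock(&n->lock);
		if (n->hash_val == ref->hash_val)
			return n;       /* still the node we wanted */
		pthread_mutex_unlock(&n->lock);
	}
	return lookup_slowpath(ref->hash_val);
}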
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d45218d5fd35..7f5620a4d7c5 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -777,7 +777,12 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (c->opts.btree_node_mem_ptr_optimization) { b = btree_node_mem_ptr(k); - if (b) + /* + * Check b->hash_val _before_ calling btree_node_lock() - this + * might not be the node we want anymore, and trying to lock the + * wrong node could cause an unneccessary transaction restart: + */ + if (b && b->hash_val == btree_ptr_hash_val(k)) goto lock_node; } retry: -- cgit From 3074bc0f7de749440c5e2b01a03ee2226fe69b52 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Sep 2021 11:15:18 -0400 Subject: Revert "bcachefs: Add more assertions for locking btree iterators out of order" Figured out the bug we were chasing, and it had nothing to do with locking btree iterators/paths out of order. This reverts commit ff08733dd298c969aec7c7828095458f73fd5374. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 18 +++++------------- fs/bcachefs/btree_key_cache.c | 4 ++-- fs/bcachefs/btree_locking.h | 18 +++--------------- fs/bcachefs/btree_types.h | 2 -- 4 files changed, 10 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fbe1a1170df6..4cfd793f85e7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -157,7 +157,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); + mark_btree_node_locked(path, level, want); return true; } else { return false; @@ -193,7 +193,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, return false; success: - mark_btree_node_intent_locked(trans, path, level); + mark_btree_node_intent_locked(path, level); return true; } @@ -1045,7 +1045,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) t != BTREE_NODE_UNLOCKED) { btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t); } btree_path_level_init(trans, path, b); @@ -1122,7 +1122,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(trans, path, path->level, lock_type); + mark_btree_node_locked(path, path->level, lock_type); btree_path_level_init(trans, path, b); return 0; } @@ -1214,7 +1214,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(trans, path, level, lock_type); + mark_btree_node_locked(path, level, lock_type); btree_path_level_init(trans, path, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && @@ -1295,9 +1295,6 @@ retry_all: path = trans->paths + trans->sorted[i]; EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->traverse_all_idx = path->idx; -#endif ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); if (ret) @@ -1319,11 +1316,6 @@ 
retry_all: out: bch2_btree_cache_cannibalize_unlock(c); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->traverse_all_idx = U8_MAX; -#endif - trans->in_traverse_all = false; - trace_trans_traverse_all(trans->ip, trace_ip); return ret; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9d3a64f37f09..9bdc2c3f21bf 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -297,7 +297,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -319,7 +319,7 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ff58870311f3..d05689180c63 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -57,8 +57,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, +static inline void mark_btree_node_locked(struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -68,20 +67,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, path->nodes_locked |= 1 << level; path->nodes_intent_locked |= type << level; -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_locked = _RET_IP_; - btree_trans_sort_paths(trans); - BUG_ON(trans->in_traverse_all && - trans->traverse_all_idx != U8_MAX && - path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx); -#endif } -static inline void mark_btree_node_intent_locked(struct btree_trans *trans, - struct btree_path *path, +static inline void mark_btree_node_intent_locked(struct btree_path *path, unsigned level) { - mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); + mark_btree_node_locked(path, level, SIX_LOCK_intent); } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -120,9 +111,6 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path) while (path->nodes_locked) btree_node_unlock(path, __ffs(path->nodes_locked)); -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_locked = 0; -#endif } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ce64d3ad768b..b7cded2095ff 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -255,7 +255,6 @@ struct btree_path { } l[BTREE_MAX_DEPTH]; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; - unsigned long ip_locked; #endif }; @@ -369,7 +368,6 @@ struct btree_trans { struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; - u8 traverse_all_idx; pid_t pid; #endif unsigned long ip; -- cgit From 8948fc8f1521702d87a21b9c43c1228e53e3fc29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Sep 2021 13:54:14 -0400 Subject: bcachefs: Disable quota support Existing quota support breaks badly with snapshots. We're not deleting the code because some of it will be needed when we reimplement quotas along the lines of btrfs subvolume quotas. 
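A hedged, generic sketch of the x-macro option-table idea the diff below touches (toy names, not the actual bcachefs opts machinery): clearing an option's flags removes it from every interface that filters the table by those flags, while the option and its handling code stay in place for later reuse.

#include <stdio.h>

#define OPT_MOUNT  (1 << 0)
#define OPT_FORMAT (1 << 1)

#define MY_OPTS()                                 \
	x(acl,      OPT_FORMAT|OPT_MOUNT)         \
	x(usrquota, 0) /* flags cleared: not offered at format or mount time */

struct opt { const char *name; unsigned flags; };

static const struct opt opts[] = {
#define x(_name, _flags) { #_name, _flags },
	MY_OPTS()
#undef x
};

int main(void)
{
	/* Only options still carrying OPT_MOUNT are accepted at mount time. */
	for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		if (opts[i].flags & OPT_MOUNT)
			printf("mount option: %s\n", opts[i].name);
	return 0;
}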
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 4b79e3bc0ac0..c5def5b1f558 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -215,19 +215,19 @@ enum opt_type { BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_USRQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_MOUNT, \ -- cgit From 14b393ee768e8339b9c64f82df24e8c081bdbff9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 00:42:25 -0400 Subject: bcachefs: Subvolumes, snapshots This patch adds subvolume.c - support for the subvolumes and snapshots btrees and related data types and on disk data structures. The next patches will start hooking up this new code to existing code. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 23 + fs/bcachefs/bcachefs_format.h | 55 ++- fs/bcachefs/bcachefs_ioctl.h | 15 + fs/bcachefs/bkey_methods.c | 5 + fs/bcachefs/btree_key_cache.c | 5 + fs/bcachefs/btree_types.h | 15 +- fs/bcachefs/btree_update_leaf.c | 6 + fs/bcachefs/buckets.c | 3 + fs/bcachefs/dirent.c | 5 +- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/inode.c | 15 +- fs/bcachefs/opts.c | 3 +- fs/bcachefs/recovery.c | 121 ++++- fs/bcachefs/subvolume.c | 981 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/subvolume.h | 77 ++++ fs/bcachefs/super.c | 4 + 17 files changed, 1314 insertions(+), 24 deletions(-) create mode 100644 fs/bcachefs/subvolume.c create mode 100644 fs/bcachefs/subvolume.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index af3b83f871df..a2769a85b029 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -50,6 +50,7 @@ bcachefs-y := \ replicas.o \ siphash.o \ six.o \ + subvolume.o \ super.o \ super-io.o \ sysfs.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 114ae77a8a02..1608faae0d0b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -381,6 +381,8 @@ enum gc_phase { GC_PHASE_BTREE_alloc, GC_PHASE_BTREE_quotas, GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, GC_PHASE_PENDING_DELETE, }; @@ -564,6 +566,21 @@ struct btree_path_buf { #define REPLICAS_DELTA_LIST_MAX (1U << 16) +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + struct bch_fs { struct closure cl; @@ -635,6 +652,12 @@ struct bch_fs { struct closure sb_write; struct mutex sb_lock; + /* snapshot.c: */ + GENRADIX(struct snapshot_t) snapshots; + struct bch_snapshot_table __rcu *snapshot_table; + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + /* BTREE CACHE */ struct bio_set btree_bio; struct workqueue_struct *io_complete_wq; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 156198850b67..ae8f3a5bc787 100644 --- a/fs/bcachefs/bcachefs_format.h +++ 
b/fs/bcachefs/bcachefs_format.h @@ -346,7 +346,9 @@ static inline void bkey_init(struct bkey *k) x(inline_data, 17) \ x(btree_ptr_v2, 18) \ x(indirect_inline_data, 19) \ - x(alloc_v2, 20) + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -690,6 +692,10 @@ struct bch_inode_generation { __le32 pad; } __attribute__((packed, aligned(8))); +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + #define BCH_INODE_FIELDS() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ @@ -713,7 +719,9 @@ struct bch_inode_generation { x(bi_erasure_code, 16) \ x(bi_fields_set, 16) \ x(bi_dir, 64) \ - x(bi_dir_offset, 64) + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -796,6 +804,9 @@ struct bch_dirent { __u8 d_name[]; } __attribute__((packed, aligned(8))); +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name)) @@ -932,6 +943,42 @@ struct bch_inline_data { u8 data[0]; }; +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + __le32 pad; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1702,7 +1749,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(alloc, 4) \ x(quotas, 5) \ x(stripes, 6) \ - x(reflink, 7) + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 1ef9907e07ad..66ab3aea9767 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -78,6 +78,9 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal { __u64 nbuckets; }; +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8efef485c6d8..42fdcc4487de 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -11,6 +11,7 
@@ #include "inode.h" #include "quota.h" #include "reflink.h" +#include "subvolume.h" #include "xattr.h" const char * const bch2_bkey_types[] = { @@ -126,6 +127,10 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_reflink] = (1U << KEY_TYPE_reflink_v)| (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_snapshot), [BKEY_TYPE_btree] = (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9bdc2c3f21bf..7be580555374 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -164,6 +164,11 @@ btree_key_cache_create(struct btree_key_cache *c, was_new = false; } + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); + ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b7cded2095ff..9250ac69e8b1 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -606,7 +606,8 @@ static inline bool btree_node_is_extents(struct btree *b) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_stripes)) + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -653,7 +654,8 @@ enum btree_update_flags { #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ((1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)) + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_snapshot)) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -670,11 +672,6 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ @@ -695,8 +692,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8ab771334557..6e904f9195cc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -15,6 +15,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "subvolume.h" #include "replicas.h" #include "trace.h" @@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != i->path->cached); BUG_ON(i->level != i->path->level); BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6831c002961d..2d2bdfb7977d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -16,6 +16,7 @@ #include "movinggc.h" #include "reflink.h" #include "replicas.h" +#include "subvolume.h" #include "trace.h" #include @@ -1204,6 +1205,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, return bch2_mark_reservation(c, old, new, journal_seq, flags); case KEY_TYPE_reflink_p: return bch2_mark_reflink_p(c, old, new, journal_seq, 
flags); + case KEY_TYPE_snapshot: + return bch2_mark_snapshot(c, old, new, journal_seq, flags); default: return 0; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1d510f7728b6..53c7687a9ca8 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -99,7 +99,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (memchr(d.v->d_name, '/', len)) return "invalid name"; - if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) + if (d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode) return "dirent points to own directory"; return NULL; @@ -113,7 +114,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); pr_buf(out, " -> %llu type %s", d.v->d_inum, - d.v->d_type < DT_MAX + d.v->d_type < BCH_DT_MAX ? bch2_d_types[d.v->d_type] : "(bad d_type)"); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eb979e79eaac..62158c0803db 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -9,6 +9,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "subvolume.h" #include "super.h" #include "xattr.h" @@ -1410,7 +1411,8 @@ int bch2_fsck_full(struct bch_fs *c) { struct bch_inode_unpacked root_inode; - return check_inodes(c, true) ?: + return bch2_fs_snapshots_check(c) ?: + check_inodes(c, true) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2b653ee03f4f..3b19dc6b9ddc 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -8,6 +8,7 @@ #include "extents.h" #include "inode.h" #include "str_hash.h" +#include "subvolume.h" #include "varint.h" #include @@ -340,8 +341,8 @@ int bch2_inode_write(struct btree_trans *trans, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; if (k.k->p.inode) return "nonzero k.p.inode"; @@ -368,6 +369,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) unpacked.bi_nlink != 0) return "flagged as unlinked but bi_nlink != 0"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + return NULL; } @@ -635,6 +639,13 @@ retry: bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); + if (ret) + goto err; + } + bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 5de296078219..ff99c6d24abd 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { #undef x -const char * const bch2_d_types[DT_MAX] = { +const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", @@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = { [DT_LNK] = "lnk", [DT_SOCK] = "sock", [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", }; void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 11208e83fabe..2aab57cf09e1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -20,6 +20,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include @@ -961,6 +962,81 @@ fsck_err: return ret; } +static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; + root_snapshot.v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + ret = bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + return 0; +} + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + struct bkey_inode_buf *packed; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, BCACHEFS_ROOT_INO), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_inode) { + bch_err(c, "root inode not found"); + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + packed = bch2_trans_kmalloc(trans, sizeof(*packed)); + ret = PTR_ERR_OR_ZERO(packed); + if (ret) + goto err; + + bch2_inode_pack(c, packed, &inode); + ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; @@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } - - if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { + } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); c->opts.version_upgrade = true; + } else if (c->sb.version < 
bcachefs_metadata_version_snapshot) { + bch_info(c, "filesystem version is prior to snapshot field - upgrading"); + c->opts.version_upgrade = true; } ret = bch2_blacklist_table_initialize(c); @@ -1190,6 +1267,29 @@ use_clean: bch_verbose(c, "alloc write done"); } + if (c->sb.version < bcachefs_metadata_version_snapshot) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + if (c->sb.version < bcachefs_metadata_version_snapshot) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + goto err; + } + if (c->opts.fsck) { bch_info(c, "starting fsck"); err = "error in fsck"; @@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c) } } + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; bch2_inode_pack(c, &packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 index 000000000000..ff3b4d2d86b9 --- /dev/null +++ b/fs/bcachefs/subvolume.c @@ -0,0 +1,981 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "error.h" +#include "subvolume.h" + +/* Snapshot tree: */ + +static void bch2_delete_dead_snapshots_work(struct work_struct *); +static void bch2_delete_dead_snapshots(struct bch_fs *); + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol)); +} + +const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || + bkey_cmp(k.k->p, POS(0, 1)) < 0) + return "bad pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) + return "bad val size"; + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) + return "bad parent node"; + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) + return "children not normalized"; + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) + return "duplicate child nodes"; + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) + return "bad child node"; + } + + return NULL; +} + +int bch2_mark_snapshot(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) +{ + struct snapshot_t *t; + + t = 
genradix_ptr_alloc(&c->snapshots, + U32_MAX - new.k->p.offset, + GFP_KERNEL); + if (!t) + return -ENOMEM; + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + } else { + t->parent = 0; + t->children[0] = 0; + t->children[1] = 0; + t->subvol = 0; + } + + return 0; +} + +static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_subvolume(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_snapshot(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +static int bch2_snapshots_set_equiv(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + unsigned i; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; + unsigned nr_live = 0, live_idx; + + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) + break; + + if (ret) + live_idx = i; + nr_live += ret; + } + + snapshot_t(c, id)->equiv = nr_live == 1 + ? 
snapshot_t(c, child[live_idx])->equiv + : id; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + bch_err(c, "error walking snapshots: %i", ret); + + return ret; +} + +/* fsck: */ +static int bch2_snapshot_check(struct btree_trans *trans, + struct bkey_s_c_snapshot s) +{ + struct bch_subvolume subvol; + struct bch_snapshot v; + u32 i, id; + int ret; + + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + return -EINVAL; + } + + id = le32_to_cpu(s.v->parent); + if (id) { + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { + bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); + return -EINVAL; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { + bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); + return -EINVAL; + } + } + + return 0; +} + +int bch2_fs_snapshots_check(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_snapshot s; + unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error %i checking snapshots", ret); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; +again_2: + id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + ret = snapshot_lookup(&trans, id, &s); + + if (ret == -EINTR) { + k = bch2_btree_iter_peek(&iter); + goto again_2; + } else if (ret == -ENOENT) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, id); + else if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + genradix_free(&c->snapshots); +} + +int bch2_fs_snapshots_start(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(c, "found wrong key type %u in snapshot node table", + k.k->type); + continue; + } + + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + have_deleted = true; + + ret = 
bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; +err: + bch2_trans_exit(&trans); + + if (!ret && have_deleted) { + bch_info(c, "restarting deletion of dead snapshots"); + if (c->opts.fsck) { + bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); + } else { + bch2_delete_dead_snapshots(c); + } + } + + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_snapshot *s; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + /* already deleted? */ + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + goto err; + + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto err; + + bkey_reassemble(&s->k_i, k); + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct bkey_s_c k; + struct bkey_s_c_snapshot s; + struct bkey_i_snapshot *parent; + u32 parent_id; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_snapshot(k); + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, + POS(0, parent_id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&p_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); + ret = -ENOENT; + goto err; + } + + parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + ret = PTR_ERR_OR_ZERO(parent); + if (ret) + goto err; + + bkey_reassemble(&parent->k_i, k); + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(trans->c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + + ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); + if (ret) + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, 
BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -ENOSPC; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + bch2_trans_update(trans, &iter, &n->k_i, 0); + + ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0); + if (ret) + break; + + new_snapids[i] = iter.pos.offset; + } + + if (parent) { + bch2_btree_iter_set_pos(&iter, POS(0, parent)); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", parent); + ret = -ENOENT; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, k); + + if (n->v.children[0] || n->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + bch2_trans_update(trans, &iter, &n->k_i, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* List of snapshot IDs that are being deleted: */ +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + +static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + + if (s->nr == s->size) { + size_t new_size = max(8U, s->size * 2); + void *n = krealloc(s->d, + new_size * sizeof(s->d[0]), + GFP_KERNEL); + if (!n) { + pr_err("error allocating snapshot ID list"); + return -ENOMEM; + } + + s->d = n; + s->size = new_size; + }; + + s->d[s->nr++] = id; + return 0; +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, + struct snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + + /* + * XXX: We should also delete whiteouts that no longer overwrite + * anything + */ + + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k))) { + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + + if (bkey_cmp(k.k->p, last_pos)) + equiv_seen.nr = 0; + last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(&equiv_seen, equiv)) { + if (btree_id == BTREE_ID_inodes && + bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) + continue; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + break; + 
} else { + ret = snapshot_id_add(&equiv_seen, equiv); + if (ret) + break; + } + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); + + kfree(equiv_seen.d); + + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + struct snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + continue; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(&trans, children[0]) ?: + snapshot_live(&trans, children[1]); + if (ret < 0) + break; + if (ret) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_id_add(&deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_snapshots(id)) + continue; + + ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + if (ret) { + bch_err(c, "error deleting snapshot keys: %i", ret); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.d[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", + deleted.d[i], ret); + goto err; + } + } +err: + kfree(deleted.d); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +static void bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) + percpu_ref_put(&c->writes); +} + +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + bch2_delete_dead_snapshots(trans->c); + return 0; +} + +/* Subvolumes: */ + +const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) + return "invalid pos"; + + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + return "invalid pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) + return "bad val size"; + + return NULL; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + pr_buf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); +} + 
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvol), + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + ret = -EIO; + goto err; + } + + *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* XXX: mark snapshot id for deletion, walk btree and delete: */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, + int deleting_snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; + struct bkey_i *delete; + u32 snapid; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + + if (deleting_snapshot >= 0 && + deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { + ret = -ENOENT; + goto err; + } + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + goto err; + + bkey_init(&delete->k); + delete->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, delete, 0); + if (ret) + goto err; + + ret = bch2_snapshot_node_set_deleted(trans, snapid); + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->fn = bch2_delete_dead_snapshots_hook; + bch2_trans_commit_hook(trans, h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + struct bkey_s_c k; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + break; + if (bkey_deleted(k.k)) + goto found_slot; + } + + if (!ret) + ret = -ENOSPC; + goto err; +found_slot: + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, + POS(0, src_subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(trans->c, "subvolume %u not found", src_subvolid); + ret = -ENOENT; + goto err; + } + + bkey_reassemble(&src_subvol->k_i, k); + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 
2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; + bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + return 0; +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 index 000000000000..cea4c665af32 --- /dev/null +++ b/fs/bcachefs/subvolume.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ +} + +int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c, + struct bkey_s_c, u64, unsigned); + +static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return genradix_ptr(&c->snapshots, U32_MAX - id); +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); + + return s->children[0] || s->children[1]; +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + u32 parent = bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = bch2_snapshot_parent(c, id); + + return id == ancestor; +} + +int bch2_fs_snapshots_check(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + +const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ +} + +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +int bch2_subvolume_delete(struct btree_trans *, u32, int); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b18ca3947ac8..a861ec32dbde 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -39,6 +39,7 @@ #include "rebalance.h" 
#include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -475,6 +476,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); @@ -694,6 +696,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->usage_scratch_lock); mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); spin_lock_init(&c->btree_write_error_lock); @@ -797,6 +800,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) || bch2_fs_btree_iter_init(c) || bch2_fs_btree_interior_update_init(c) || + bch2_fs_subvolumes_init(c) || bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || -- cgit From b9e1adf57988fb4632b86c43fde1551a24299b86 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 00:46:26 -0400 Subject: bcachefs: Add support for dirents that point to subvolumes Dirents currently always point to inodes. Subvolumes add a new type of dirent, with d_type DT_SUBVOL, that instead points to an entry in the subvolumes btree, and the subvolume has a pointer to the root inode. This patch adds bch2_dirent_read_target() to get the inode (and potentially subvolume) a dirent points to, and changes existing code to use that instead of reading from d_inum directly. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 105 +++++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/dirent.h | 14 ++++++- fs/bcachefs/fs-common.c | 9 +---- fs/bcachefs/fsck.c | 23 ++++++++++- 4 files changed, 123 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 53c7687a9ca8..f3aef0686928 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -177,6 +177,61 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int __bch2_dirent_read_target(struct btree_trans *trans, + struct bkey_s_c_dirent d, + u32 *subvol, u32 *snapshot, u64 *inum, + bool is_fsck) +{ + int ret = 0; + + *subvol = 0; + *snapshot = d.k->p.snapshot; + + if (likely(d.v->d_type != DT_SUBVOL)) { + *inum = le64_to_cpu(d.v->d_inum); + } else { + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + int ret; + + *subvol = le64_to_cpu(d.v->d_inum); + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, *subvol), + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_subvolume(k); + *snapshot = le32_to_cpu(s.v->snapshot); + *inum = le64_to_cpu(s.v->inode); +err: + if (ret == -ENOENT && !is_fsck) + bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u", + *subvol); + + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +int bch2_dirent_read_target(struct btree_trans *trans, + struct bkey_s_c_dirent d, u64 *target) +{ + u32 subvol, snapshot; + + return __bch2_dirent_read_target(trans, d, &subvol, + &snapshot, target, false); +} + int bch2_dirent_rename(struct btree_trans *trans, u64 src_dir, struct bch_hash_info *src_hash, u64 dst_dir, struct bch_hash_info *dst_hash, @@ -323,10 +378,32 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, u64 
dir_inum, const struct bch_hash_info *hash_info, - const struct qstr *name, unsigned flags) + const struct qstr *name, u64 *inum, + unsigned flags) { - return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); + struct bkey_s_c k; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir_inum, name, flags); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, d, inum); + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -335,26 +412,18 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, { struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; u64 inum = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, hash_info, + name, &inum, 0); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, - hash_info, name, 0); - if (ret) - goto out; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); bch2_trans_iter_exit(&trans, &iter); -out: - BUG_ON(ret == -EINTR); + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return inum; } @@ -408,7 +477,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), le64_to_cpu(dirent.v->d_inum), - dirent.v->d_type)) + vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index c14f6029e1c9..3cd05a2454e1 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -37,6 +37,17 @@ int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, struct btree_iter *); +int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, + u32 *, u32 *, u64 *, bool); + +int bch2_dirent_read_target(struct btree_trans *, + struct bkey_s_c_dirent, u64 *); + +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? 
DT_DIR : type; +} + enum bch_rename_mode { BCH_RENAME, BCH_RENAME_OVERWRITE, @@ -52,7 +63,8 @@ int bch2_dirent_rename(struct btree_trans *, int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, const struct bch_hash_info *, - const struct qstr *, unsigned); + const struct qstr *, u64 *, + unsigned); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 6bc82559c9b1..96b09b005d0b 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -159,17 +159,10 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, - name, BTREE_ITER_INTENT); + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - k = bch2_btree_iter_peek_slot(&dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 62158c0803db..dca4abda2c41 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -723,6 +723,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d; struct bch_inode_unpacked target; u32 target_snapshot; + u32 target_subvol; bool have_target; bool backpointer_exists = true; u64 d_inum; @@ -783,6 +784,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); + ret = bch2_dirent_read_target(trans, d, &d_inum); + if (ret && ret != -ENOENT) + return ret; + ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); if (ret && ret != -ENOENT) return ret; @@ -855,7 +860,23 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } } - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, + target_subvol = d.v->d_type == DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) : 0; + + if (fsck_err_on(target.bi_subvol != target_subvol, c, + "subvol root %llu has wrong subvol field:\n" + "got %u\n" + "should be %u", + target.bi_inum, + target.bi_subvol, + target_subvol)) { + target.bi_subvol = target_subvol; + + ret = write_inode(trans, &target, target_snapshot); + return ret ?: -EINTR; + } + + if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, -- cgit From 81ed9ce3671125ee384c1a205747a853ca2a1739 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Apr 2021 23:31:40 -0400 Subject: bcachefs: Per subvolume lost+found On existing filesystems, we have a single global lost+found. Introducing subvolumes means we need to introduce per subvolume lost+found directories, because inodes are added to lost+found by their inode number, and inode numbers are now only unique within a subvolume. This patch adds support to fsck for per subvolume lost+found. 
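(Illustrative sketch, not part of the patch: the lookup chain fsck now follows is snapshot ID -> owning subvolume -> that subvolume's root inode -> "lost+found" dirent under that root. The helper names below match the ones added in fsck.c in this patch; bodies and error details are elided, and reattach_inode_sketch() is a made-up name used only for illustration.)

static int reattach_inode_sketch(struct btree_trans *trans,
				 struct bch_inode_unpacked *inode,
				 u32 snapshot)
{
	struct bch_inode_unpacked lostfound;
	u32 subvol;
	int ret;

	/* snapshot ID -> owning subvolume, via the snapshots btree */
	ret = snapshot_lookup_subvol(trans, snapshot, &subvol);
	if (ret)
		return ret;

	/*
	 * subvolume -> its root inode -> "lost+found" dirent under that
	 * root, created if missing; lookup_lostfound() wraps
	 * subvol_lookup_root() plus the dirent lookup, so lost+found is
	 * now resolved per subvolume instead of from BCACHEFS_ROOT_INO.
	 */
	ret = lookup_lostfound(trans, subvol, &lostfound);
	if (ret)
		return ret;

	/* ... link @inode into this subvolume's lost+found as before ... */
	return 0;
}
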
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index dca4abda2c41..e4ca05aae76c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -39,6 +39,71 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS(0, snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", snapshot); + ret = -ENOENT; + goto err; + } + + *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); +} + +static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol, + u64 *inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvol), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(trans->c, "subvolume %u not found", subvol); + ret = -ENOENT; + goto err; + } + + *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int subvol_lookup_root(struct btree_trans *trans, u32 subvol, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup_root(trans, subvol, inum)); +} + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) @@ -136,6 +201,7 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, + u32 subvol, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; @@ -146,12 +212,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot; int ret; - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); + ret = subvol_lookup_root(trans, subvol, &inum); + + ret = lookup_inode(trans, inum, &root, &snapshot); if (ret && ret != -ENOENT) return ret; root_hash_info = bch2_hash_info_init(c, &root); - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, + inum = bch2_dirent_lookup(c, root.bi_inum, &root_hash_info, &lostfound_str); if (!inum) { bch_notice(c, "creating lost+found"); @@ -188,16 +256,22 @@ create_lostfound: } static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 subvol; int ret; - ret = lookup_lostfound(trans, &lostfound); + ret = snapshot_lookup_subvol(trans, snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); if (ret) return ret; @@ -1063,10 +1137,10 @@ static int path_down(struct pathbuf *p, u64 inum) static int check_path(struct btree_trans *trans, struct
pathbuf *p, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct bch_fs *c = trans->c; - u32 snapshot; size_t i; int ret = 0; @@ -1085,7 +1159,7 @@ static int check_path(struct btree_trans *trans, inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } ret = 0; @@ -1108,13 +1182,13 @@ static int check_path(struct btree_trans *trans, return 0; ret = lockrestart_do(trans, - remove_backpointer(trans, inode)); + remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); break; } - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } @@ -1160,7 +1234,7 @@ static int check_directory_structure(struct bch_fs *c) break; } - ret = check_path(&trans, &path, &u); + ret = check_path(&trans, &path, &u, iter.pos.snapshot); if (ret) break; } -- cgit From 284ae18c1d7aa44232baedf860a004ceb32fea62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 01:33:39 -0400 Subject: bcachefs: Add subvolume to ei_inode_info Filesystem operations generally operate within a subvolume: at the start of every btree transaction we'll be looking up (and locking) the subvolume to get the current snapshot ID, which we then use for our other btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. But inodes don't record what subvolume they're in - they can't, because if they did we'd have to update every single inode within a subvolume when taking a snapshot in order to keep that field up to date. So it needs to be tracked in memory, based on how we got to that inode. Hence this patch adds a subvolume field to ei_inode_info, and switches to iget5() so we can index by it in the inode hash table. 
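(Illustrative sketch, not part of the patch: a minimal view of the new in-memory keying described above. It assumes subvol_inum is a { u32 subvol; u64 inum; } pair, as its usage in the diff below suggests; the hash and test helpers here mirror the bch2_inode_hash() and bch2_iget5_test() functions this patch adds to fs.c, and the *_sketch names are invented for illustration.)

#include <linux/jhash.h>		/* jhash_3words(), JHASH_INITVAL */

typedef struct {
	u32	subvol;
	u64	inum;
} subvol_inum_sketch;			/* assumed layout of subvol_inum */

static unsigned inode_hash_sketch(subvol_inum_sketch inum)
{
	/* both halves of the key feed the inode hash table index */
	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum,
			    JHASH_INITVAL);
}

static int iget5_test_sketch(struct inode *vinode, void *p)
{
	struct bch_inode_info *inode = to_bch_ei(vinode);
	subvol_inum_sketch *inum = p;

	/* two cached inodes may share bi_inum but live in different subvolumes */
	return inode->ei_subvol == inum->subvol &&
	       inode->ei_inode.bi_inum == inum->inum;
}

With helpers like these, iget5_locked() replaces the old iget_locked() call, so the VFS inode cache can hold the same inode number once per subvolume.
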
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 6 ++-- fs/bcachefs/fs.c | 85 ++++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/fs.h | 12 ++++++- 3 files changed, 76 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 6d6368555875..ff6b1739342d 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - u64 inum; + subvol_inum inum = { .subvol = 1 }; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -206,9 +206,9 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.name = kname; ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, + inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, &qstr); - if (!inum) + if (!inum.inum) goto err1; vinode = bch2_vfs_inode_get(c, inum); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 570ae826ebb5..7a994f3f9d20 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -37,7 +37,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, +static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); @@ -209,40 +209,68 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; int ret; - inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + /* + * debug assert, to be removed when we start creating + * subvolumes/snapshots: + */ + BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); + + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inode, &inode_u); + bch2_vfs_inode_init(c, inum, inode, &inode_u); - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); + inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); unlock_new_inode(&inode->v); return &inode->v; } -static int inum_test(struct inode *inode, void *p) -{ - unsigned long *ino = p; - - return *ino == inode->i_ino; -} - static struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, @@ -254,6 +282,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; u64 journal_seq = 0; int ret; @@ 
-310,7 +339,10 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_vfs_inode_init(c, inode, &inode_u); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + bch2_vfs_inode_init(c, inum, inode, &inode_u); journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -323,8 +355,12 @@ err_before_quota: */ inode->v.i_state |= I_CREATING; - old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, - inum_test, NULL, &inode->v.i_ino)); + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); BUG_ON(!old); if (unlikely(old != inode)) { @@ -370,12 +406,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; - u64 inum; + subvol_inum inum = { .subvol = 1 }; - inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, + inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, &dentry->d_name); - if (inum) + if (inum.inum) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -1098,6 +1134,7 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; +#if 0 static struct inode *bch2_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1131,14 +1168,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, return generic_fh_to_parent(sb, fid, fh_len, fh_type, bch2_nfs_get_inode); } +#endif static const struct export_operations bch_export_ops = { - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, + //.fh_to_dentry = bch2_fh_to_dentry, + //.fh_to_parent = bch2_fh_to_parent, //.get_parent = bch2_get_parent, }; -static void bch2_vfs_inode_init(struct bch_fs *c, +static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { @@ -1154,6 +1192,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1595,7 +1634,7 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", (int) PTR_ERR(vinode)); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c08a828d66cd..6dae425bf616 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -44,10 +44,20 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; + u32 ei_subvol; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -153,7 +163,7 @@ static inline int bch2_set_projid(struct bch_fs *c, KEY_TYPE_QUOTA_PREALLOC); } -struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, -- 
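(Illustrative sketch, not part of any patch: for context between these two commits, how a directory lookup now reaches a VFS inode with the subvolume carried along. This condenses bch2_lookup() from the diff above; lookup_sketch() is an invented name. At this point in the series the subvolume is still hard-coded to the root subvolume, and the later "Plumb through subvolume id" patch replaces that with the value read from the dirent.)

static struct inode *lookup_sketch(struct bch_fs *c,
				   struct bch_inode_info *dir,
				   const struct qstr *name)
{
	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
	subvol_inum inum = { .subvol = 1 };	/* root subvolume, for now */

	/* the dirent lookup still returns a bare inode number here */
	inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, name);
	if (!inum.inum)
		return NULL;

	/* keyed by (subvol, inum); uses iget5_locked() internally */
	return bch2_vfs_inode_get(c, inum);
}
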
cgit From c075ff700ff397671636bf45f6ef6ef330258d3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Mar 2021 22:29:25 -0500 Subject: bcachefs: BTREE_ITER_FILTER_SNAPSHOTS For snapshots, we need to implement btree lookups that return the first key that's an ancestor of the snapshot ID the lookup is being done in - and filter out keys in unrelated snapshots. This patch adds the btree iterator flag BTREE_ITER_FILTER_SNAPSHOTS which does that filtering. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 168 ++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/btree_iter.h | 9 +++ fs/bcachefs/btree_key_cache.c | 3 +- fs/bcachefs/btree_types.h | 1 + 4 files changed, 166 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4cfd793f85e7..b589b96bc9e7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "extents.h" #include "journal.h" #include "replicas.h" +#include "subvolume.h" #include "trace.h" #include @@ -683,6 +684,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_cmp(iter->pos, iter->k.p) > 0); } +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 0; + + if (!bch2_debug_check_iterators) + return 0; + + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + char buf1[100], buf2[200]; + + bch2_bkey_to_text(&PBUF(buf1), k.k); + bch2_bkey_to_text(&PBUF(buf2), prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1, buf2); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + #else static inline void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -691,6 +741,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } #endif @@ -2004,11 +2055,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } if (likely(k.k)) { - if (likely(!bkey_deleted(k.k))) - break; + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; + } - /* Advance to next key: */ - search_key = bkey_successor(iter, k.k->p); + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ search_key = bpos_successor(iter->path->l[0].b->key.k.p); @@ -2029,6 +2094,9 @@ struct bkey_s_c 
bch2_btree_iter_peek(struct btree_iter *iter) else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + cmp = bpos_cmp(k.k->p, iter->path->pos); if (cmp) { iter->path = bch2_btree_path_make_mut(trans, iter->path, @@ -2041,6 +2109,10 @@ out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); + return k; } @@ -2064,7 +2136,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; struct bkey_s_c k; + struct bkey saved_k; + const struct bch_val *saved_v; int ret; EBUG_ON(iter->path->cached || iter->path->level); @@ -2072,6 +2147,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT); @@ -2088,12 +2166,55 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 - : bkey_cmp(k.k->p, iter->pos) > 0)) + ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } + break; } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ @@ -2111,7 +2232,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; out: + if (saved_path) + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); @@ -2160,7 +2286,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & 
(BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; next_update = btree_trans_peek_updates(iter); @@ -2209,15 +2336,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (bkey_cmp(iter->pos, next) < 0) { bkey_init(&iter->k); iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next.inode == iter->pos.inode - ? next.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + EBUG_ON(!iter->k.size); + } k = (struct bkey_s_c) { &iter->k, NULL }; - EBUG_ON(!k.k->size); } } @@ -2225,6 +2355,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); return k; } @@ -2392,6 +2525,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (!btree_type_has_snapshots(btree_id) && !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; +#if 0 + /* let's have this be explicitly set: */ + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_type_has_snapshots(btree_id) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; +#endif if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) pos.snapshot = btree_type_has_snapshots(btree_id) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 58add0bb1c81..feb2fcff1485 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -260,6 +260,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +{ + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); +} + /* * Unlocks before scheduling * Note: does not revalidate iterator diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7be580555374..50b44e55dfe7 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -372,7 +372,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, BTREE_ITER_SLOTS| - BTREE_ITER_INTENT); + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9250ac69e8b1..081b82d3848e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -209,6 +209,7 @@ struct btree_node_iter { #define BTREE_ITER_WITH_UPDATES (1 << 10) #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, -- cgit From 6fed42bb7750e217b0d1169ccfccc7639a3e1d3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 00:28:17 -0400 Subject: bcachefs: Plumb through subvolume id To implement snapshots, we need every filesystem btree operation (every btree operation without a subvolume) to start by looking up the subvolume and getting the current snapshot ID, with bch2_subvolume_get_snapshot() - then, that snapshot ID is used for doing 
btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. This patch adds those bch2_subvolume_get_snapshot() calls, and also switches to passing around a subvol_inum instead of just an inode number. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 25 +++++----- fs/bcachefs/acl.h | 11 ++--- fs/bcachefs/dirent.c | 107 +++++++++++++++++++++++++++------------- fs/bcachefs/dirent.h | 29 +++++------ fs/bcachefs/extents.c | 32 ------------ fs/bcachefs/extents.h | 1 - fs/bcachefs/fs-common.c | 127 ++++++++++++++++++++++++++++++------------------ fs/bcachefs/fs-common.h | 21 ++++---- fs/bcachefs/fs-io.c | 117 +++++++++++++++++++++++++++++++++++++++----- fs/bcachefs/fs-ioctl.c | 8 ++- fs/bcachefs/fs.c | 77 +++++++++++++++++------------ fs/bcachefs/fs.h | 4 ++ fs/bcachefs/fsck.c | 5 +- fs/bcachefs/inode.c | 109 +++++++++++++++++++++++++++++++++-------- fs/bcachefs/inode.h | 7 +-- fs/bcachefs/io.c | 5 +- fs/bcachefs/move.c | 3 +- fs/bcachefs/recovery.c | 5 +- fs/bcachefs/reflink.c | 18 ++++++- fs/bcachefs/reflink.h | 4 +- fs/bcachefs/str_hash.h | 41 +++++++++++----- fs/bcachefs/xattr.c | 23 +++++++-- fs/bcachefs/xattr.h | 3 +- 23 files changed, 526 insertions(+), 256 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 93b78e4e6e0d..2afa15b26700 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -230,7 +230,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, - &hash, inode->v.i_ino, + &hash, inode_inum(inode), &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { @@ -260,11 +260,11 @@ out: return acl; } -int bch2_set_acl_trans(struct btree_trans *trans, +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && @@ -277,14 +277,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); } return ret == -ENOENT ? 
0 : ret; @@ -299,7 +299,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, struct btree_trans trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -310,7 +309,7 @@ retry: bch2_trans_begin(&trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -323,9 +322,7 @@ retry: goto btree_err; } - hash_info = bch2_hash_info_init(c, &inode_u); - - ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; @@ -354,7 +351,7 @@ err: return ret; } -int bch2_acl_chmod(struct btree_trans *trans, +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) @@ -368,7 +365,7 @@ int bch2_acl_chmod(struct btree_trans *trans, int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inode->bi_inum, + &hash_info, inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (ret) diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index f11eb9d4592c..bb21d8d696a2 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,25 +28,24 @@ typedef struct { struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); -int bch2_set_acl_trans(struct btree_trans *, +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, - const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else -static inline int bch2_set_acl_trans(struct btree_trans *trans, +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { return 0; } -static inline int bch2_acl_chmod(struct btree_trans *trans, +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f3aef0686928..f290580594ce 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -8,6 +8,7 @@ #include "fs.h" #include "keylist.h" #include "str_hash.h" +#include "subvolume.h" #include @@ -150,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, int flags) { @@ -164,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -223,31 +224,40 @@ err: return ret; } -int bch2_dirent_read_target(struct btree_trans *trans, - struct bkey_s_c_dirent d, u64 *target) +static int 
bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) { - u32 subvol, snapshot; + u32 snapshot; + int ret = 0; - return __bch2_dirent_read_target(trans, d, &subvol, - &snapshot, target, false); + ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, + &target->inum, false); + if (!target->subvol) + target->subvol = dir.subvol; + + return ret; } int bch2_dirent_rename(struct btree_trans *trans, - u64 src_dir, struct bch_hash_info *src_hash, - u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, u64 *src_offset, - const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) { struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = - POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); int ret = 0; - *src_inum = *dst_inum = 0; + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; + + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); /* * Lookup dst: @@ -270,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - if (mode != BCH_RENAME) - *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + if (mode != BCH_RENAME) { + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + } if (mode != BCH_RENAME_EXCHANGE) *src_offset = dst_iter.pos.offset; @@ -287,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); + if (ret) + goto out; /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -376,17 +393,22 @@ int bch2_dirent_delete_at(struct btree_trans *trans, int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, - u64 dir_inum, + subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name, u64 *inum, + const struct qstr *name, subvol_inum *inum, unsigned flags) { struct bkey_s_c k; struct bkey_s_c_dirent d; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); + hash_info, dir, name, flags); if (ret) return ret; @@ -399,44 +421,49 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, d, inum); + ret = bch2_dirent_read_target(trans, dir, d, inum); if (ret) bch2_trans_iter_exit(trans, iter); return ret; } -u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, subvol_inum *inum) { struct btree_trans trans; struct btree_iter iter; - u64 inum = 0; - int ret = 0; + int ret; bch2_trans_init(&trans, c, 0, 0); retry: 
bch2_trans_begin(&trans); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, hash_info, - name, &inum, 0); + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; bch2_trans_exit(&trans); - return inum; + return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) { struct btree_iter iter; struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, BTREE_ID_dirents, - POS(dir_inum, 0), 0, k, ret) { - if (k.k->p.inode > dir_inum) + SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) break; if (k.k->type == KEY_TYPE_dirent) { @@ -449,19 +476,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(inum, ctx->pos), 0, k, ret) { - if (k.k->p.inode > inum) + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) break; if (k.k->type != KEY_TYPE_dirent) @@ -482,6 +516,9 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ctx->pos = dirent.k->p.offset + 1; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 3cd05a2454e1..88b784a99cb5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,7 +29,7 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int bch2_dirent_create(struct btree_trans *, u64, +int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); @@ -40,9 +40,6 @@ int bch2_dirent_delete_at(struct btree_trans *, int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, u32 *, u32 *, u64 *, bool); -int bch2_dirent_read_target(struct btree_trans *, - struct bkey_s_c_dirent, u64 *); - static inline unsigned vfs_d_type(unsigned type) { return type == DT_SUBVOL ? 
DT_DIR : type; @@ -55,20 +52,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - u64, struct bch_hash_info *, - u64, struct bch_hash_info *, - const struct qstr *, u64 *, u64 *, - const struct qstr *, u64 *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, - const struct bch_hash_info *, - const struct qstr *, u64 *, - unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); -int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_readdir(struct bch_fs *, u64, struct dir_context *); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0190605711e5..966d6ef41793 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -611,38 +611,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 43cef0a3bdf3..afd3067bb64e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 96b09b005d0b..02bf32cc7659 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -6,28 +6,38 @@ #include "dirent.h" #include "fs-common.h" #include "inode.h" +#include "subvolume.h" #include "xattr.h" #include -int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *new_inode, const struct qstr *name, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, - struct posix_acl *acl) + 
struct posix_acl *acl, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; struct btree_iter inode_iter = { NULL }; - struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); u64 dir_offset = 0; + u64 dir_target; + u32 snapshot; + unsigned dir_type; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -36,19 +46,23 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) goto err; + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + dir_type = mode_to_type(new_inode->bi_mode); + if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, + ret = bch2_set_acl_trans(trans, new_inum, new_inode, default_acl, ACL_TYPE_DEFAULT); if (ret) goto err; } if (acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, + ret = bch2_set_acl_trans(trans, new_inum, new_inode, acl, ACL_TYPE_ACCESS); if (ret) goto err; @@ -56,18 +70,19 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (name) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - dir_u->bi_mtime = dir_u->bi_ctime = now; if (S_ISDIR(new_inode->bi_mode)) dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(new_inode->bi_mode), - name, new_inode->bi_inum, + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) @@ -79,9 +94,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, new_inode->bi_dir_offset = dir_offset; } - /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter.snapshot = U32_MAX; - bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); @@ -91,9 +105,10 @@ err: return ret; } -int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, const struct qstr *name) +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -103,6 +118,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 dir_offset = 0; int ret; + if (dir.subvol != inum.subvol) + return -EXDEV; + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -110,7 +128,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ 
-118,15 +136,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum, &dir_offset, + name, inum.inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir_inum; + inode_u->bi_dir = dir.inum; inode_u->bi_dir_offset = dir_offset; } @@ -139,7 +157,8 @@ err: } int bch2_unlink_trans(struct btree_trans *trans, - u64 dir_inum, struct bch_inode_unpacked *dir_u, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { @@ -148,39 +167,49 @@ int bch2_unlink_trans(struct btree_trans *trans, struct btree_iter dirent_iter = { NULL }; struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(c); - struct bkey_s_c k; + subvol_inum inum; + u64 now = bch2_current_time(c); int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); if (ret) goto err; - if (inode_u->bi_dir == k.k->p.inode && - inode_u->bi_dir_offset == k.k->p.offset) { + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { inode_u->bi_dir = 0; inode_u->bi_dir_offset = 0; } + if (S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (dir.subvol != inum.subvol) { + ret = bch2_subvolume_delete(trans, inum.subvol, false); + if (ret) + goto err; + } + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); bch2_inode_nlink_dec(inode_u); - ret = (S_ISDIR(inode_u->bi_mode) - ? 
bch2_empty_dir_trans(trans, inum) - : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -215,8 +244,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, } int bch2_rename_trans(struct btree_trans *trans, - u64 src_dir, struct bch_inode_unpacked *src_dir_u, - u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, struct bch_inode_unpacked *src_inode_u, struct bch_inode_unpacked *dst_inode_u, const struct qstr *src_name, @@ -229,7 +258,8 @@ int bch2_rename_trans(struct btree_trans *trans, struct btree_iter src_inode_iter = { NULL }; struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, src_offset, dst_inode, dst_offset; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; u64 now = bch2_current_time(c); int ret; @@ -240,7 +270,8 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir != src_dir) { + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_INTENT); if (ret) @@ -255,19 +286,19 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, &src_offset, - dst_name, &dst_inode, &dst_offset, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, mode); if (ret) goto err; - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, BTREE_ITER_INTENT); if (ret) goto err; - if (dst_inode) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -298,7 +329,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) { + bch2_empty_dir_trans(trans, dst_inum)) { ret = -ENOTEMPTY; goto err; } @@ -322,7 +353,7 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } - if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } @@ -333,22 +364,22 @@ int bch2_rename_trans(struct btree_trans *trans, src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; - if (src_dir != dst_dir) { + if (src_dir.inum != dst_dir.inum) { dst_dir_u->bi_mtime = now; dst_dir_u->bi_ctime = now; } src_inode_u->bi_ctime = now; - if (dst_inode) + if (dst_inum.inum) dst_inode_u->bi_ctime = now; ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir != dst_dir + (src_dir.inum != dst_dir.inum ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inode + (dst_inum.inum ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 2273b7961c9b..1bb2ac4dc13a 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -4,27 +4,30 @@ struct posix_acl; -int bch2_create_trans(struct btree_trans *, u64, +#define BCH_CREATE_TMPFILE (1U << 0) + +int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, - struct posix_acl *); + struct posix_acl *, + unsigned); -int bch2_link_trans(struct btree_trans *, u64, - u64, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, const struct qstr *); -int bch2_unlink_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *); int bch2_rename_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, - u64, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 909db2f104cd..7a0772195182 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1790,6 +1790,49 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) /* O_DIRECT writes */ +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + + if (nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (err == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return err ? 
false : ret; +} + /* * We're going to return -EIOCBQUEUED, but we haven't finished consuming the * iov_iter yet, so we need to stash a copy of the iovec: it might be on the @@ -1911,8 +1954,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), + !bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), dio->op.opts.data_replicas, dio->op.opts.compression != 0)) goto err; @@ -2141,9 +2184,9 @@ out: /* truncate: */ -static inline int range_has_data(struct bch_fs *c, - struct bpos start, - struct bpos end) +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) { struct btree_trans trans; struct btree_iter iter; @@ -2151,6 +2194,12 @@ static inline int range_has_data(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -2161,7 +2210,11 @@ static inline int range_has_data(struct bch_fs *c, break; } } + start = iter.pos; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -2193,7 +2246,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * page */ - ret = range_has_data(c, + ret = range_has_data(c, inode->ei_subvol, POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); if (ret <= 0) @@ -2327,7 +2380,7 @@ int bch2_truncate(struct mnt_idmap *idmap, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) goto err; @@ -2551,6 +2604,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); bch2_trans_begin(&trans); @@ -2671,9 +2736,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; + u32 snapshot; bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -2918,8 +2991,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, mark_range_unallocated(src, pos_src, pos_src + aligned_len); ret = bch2_remap_range(c, - POS(dst->v.i_ino, pos_dst >> 9), - POS(src->v.i_ino, pos_src >> 9), + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, aligned_len >> 9, &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); @@ -3012,7 +3085,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) 
struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3020,9 +3095,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), 0, k, ret) { + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -3032,6 +3113,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3108,7 +3192,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3116,9 +3202,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), + SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, @@ -3136,6 +3228,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index ff6b1739342d..91f52ab9b4e2 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - subvol_inum inum = { .subvol = 1 }; + subvol_inum inum; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -205,10 +205,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.len = ret; qstr.name = kname; - ret = -ENOENT; - inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, - &qstr); - if (!inum.inum) + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) goto err1; vinode = bch2_vfs_inode_get(c, inum); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7a994f3f9d20..0d47d9d5737b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -150,7 +150,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: @@ -256,7 +256,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); + ret = bch2_inode_find_by_inum(c, inum, &inode_u); if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); @@ -271,10 +271,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return &inode->v; } -static struct bch_inode_info * +struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, bool tmpfile) + umode_t mode, dev_t rdev, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -303,20 +303,23 @@ __bch2_create(struct mnt_idmap *idmap, bch2_inode_init_early(c, &inode_u); - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c, 8, - 2048 + (!tmpfile ? dentry->d_name.len : 0)); + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); - ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, - !tmpfile ? &dentry->d_name : NULL, + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? &dentry->d_name : NULL, from_kuid(i_user_ns(&dir->v), current_fsuid()), from_kgid(i_user_ns(&dir->v), current_fsgid()), mode, rdev, - default_acl, acl) ?: + default_acl, acl, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) @@ -332,7 +335,7 @@ err_before_quota: goto err_trans; } - if (!tmpfile) { + if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); journal_seq_copy(c, dir, journal_seq); @@ -387,7 +390,7 @@ err: posix_acl_release(acl); return inode; err_trans: - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_exit(&trans); @@ -407,11 +410,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; subvol_inum inum = { .subvol = 1 }; + int ret; - inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, - &dentry->d_name); + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); - if (inum.inum) + if (!ret) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -422,7 +426,7 @@ static int bch2_mknod(struct mnt_idmap *idmap, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -452,8 +456,8 @@ static int __bch2_link(struct bch_fs *c, ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, bch2_link_trans(&trans, - dir->v.i_ino, - inode->v.i_ino, &dir_u, &inode_u, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { @@ -504,7 +508,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, - dir->v.i_ino, &dir_u, + inode_inum(dir), &dir_u, &inode_u, &dentry->d_name)); if (likely(!ret)) { @@ -531,7 +535,8 @@ static int bch2_symlink(struct mnt_idmap *idmap, struct bch_inode_info *dir = 
to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -624,8 +629,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, @@ -748,7 +753,7 @@ retry: kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -756,7 +761,8 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -848,7 +854,8 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), - file->f_path.dentry, mode, 0, true); + file->f_path.dentry, mode, 0, + BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -923,6 +930,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; + u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -932,15 +940,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + start >>= 9; + bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter.pos, end) < 0) { @@ -989,7 +1003,9 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } - + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: if (ret == -EINTR) goto retry; @@ -997,7 +1013,6 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); @@ -1034,7 +1049,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode->v.i_ino, ctx); + return bch2_readdir(c, inode_inum(inode), ctx); } static const struct file_operations bch_file_operations = { @@ -1290,7 +1305,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino, true); + bch2_inode_rm(c, inode_inum(inode), true); } } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6dae425bf616..aa755987b36c 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -144,6 +144,10 @@ struct bch_inode_unpacked; 
#ifndef NO_BCACHEFS_FS +struct bch_inode_info * +__bch2_create(struct mnt_idmap *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, unsigned); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e4ca05aae76c..40b107715cdd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -858,7 +858,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); - ret = bch2_dirent_read_target(trans, d, &d_inum); + ret = __bch2_dirent_read_target(&trans, d, + &target_subvol, + &target_snapshot, + &target_inum); if (ret && ret != -ENOENT) return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3b19dc6b9ddc..7fccf842a46b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,6 +6,7 @@ #include "btree_update.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "inode.h" #include "str_hash.h" #include "subvolume.h" @@ -296,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, int bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) + subvol_inum inum, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; if (trans->c->opts.inodes_use_key_cache) flags |= BTREE_ITER_CACHED; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -486,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } +/* + * This just finds an empty slot: + */ int bch2_inode_create(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode_u, @@ -585,16 +595,74 @@ found_slot: return 0; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + u64 offset = 0; + int ret = 0; + + while (!ret || ret == -EINTR) { + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_trans_iter_init(trans, &iter, id, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + + if (!k.k || iter.pos.inode != inum.inum) { + bch2_trans_iter_exit(trans, &iter); + break; + } + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + if (btree_node_type_is_extents(iter.btree_id)) { + unsigned max_sectors = + min_t(u64, U64_MAX - iter.pos.offset, + KEY_SIZE_MAX & (~0 << trans->c->block_bits)); + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + + ret = bch2_extent_trim_atomic(trans, &iter, &delete); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; - struct bpos start = POS(inode_nr, 
0); - struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; int ret; if (cached && c->opts.inodes_use_key_cache) @@ -610,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, - start, end, NULL); + ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), iter_flags); + SPOS(0, inum.inum, snapshot), iter_flags); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -632,7 +701,7 @@ retry: if (k.k->type != KEY_TYPE_inode) { bch2_fs_inconsistent(trans.c, "inode %llu not found when deleting", - inode_nr); + inum.inum); ret = -EIO; goto err; } @@ -662,20 +731,22 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_unpacked *inode) { - struct btree_iter iter = { NULL }; + struct btree_iter iter; int ret; - ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0); - bch2_trans_iter_exit(trans, &iter); + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + bch2_inode_find_by_inum_trans(&trans, inum, inode)); } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 25bef104ebcc..9e84cddcc6cb 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u64, unsigned); + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, u64, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); -int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f95ceb820faa..0f5e0099b848 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -325,7 +325,10 @@ int bch2_extent_update(struct btree_trans *trans, struct bch_inode_unpacked 
inode_u; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); + (subvol_inum) { + .subvol = BCACHEFS_ROOT_SUBVOL, + .inum = k->k.p.inode, + }, BTREE_ITER_INTENT); if (ret) return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index eb2b91f7e682..9dc6684139de 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -581,7 +581,8 @@ static int __bch2_move_data(struct bch_fs *c, stats->pos = start; bch2_trans_iter_init(&trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); if (rate) bch2_ratelimit_reset(rate); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2aab57cf09e1..47c8fecc6839 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1480,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL)); + NULL, NULL, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 576cfbccf5b5..be4b47bc7438 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -7,6 +7,7 @@ #include "inode.h" #include "io.h" #include "reflink.h" +#include "subvolume.h" #include @@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } s64 bch2_remap_range(struct bch_fs *c, - struct bpos dst_start, struct bpos src_start, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, u64 remap_sectors, u64 *journal_seq, u64 new_i_size, s64 *i_sectors_delta) { @@ -205,6 +207,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; @@ -238,6 +242,16 @@ s64 bch2_remap_range(struct bch_fs *c, break; } + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_iter.snapshot); + if (ret) + continue; + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_iter.snapshot); + if (ret) + continue; + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -311,7 +325,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(&trans); ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, - dst_start.inode, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 68c5cb5a2780..4c1b82860b0b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } } -s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, - u64, u64 *, u64, s64 *); +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index c6a132b3c5bb..6418089531ad 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -8,6 +8,7 @@ #include "error.h" #include "inode.h" #include "siphash.h" +#include 
"subvolume.h" #include "super.h" #include @@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key, + subvol_inum inum, const void *key, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type != desc.key_type) @@ -229,17 +240,25 @@ static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) + subvol_inum inum, + struct bkey_i *insert, int flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inode) + if (iter.pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -313,12 +332,12 @@ static __always_inline int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct btree_iter iter; int ret; - ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_INTENT); if (ret) return ret; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index babbfaadeb3f..ff81a25698ff 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, + inode_inum(inode), &X_SEARCH(type, name, strlen(name)), 0); if (ret) @@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); } -int bch2_xattr_set(struct btree_trans *trans, u64 inum, +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) @@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct btree_iter iter; struct bkey_s_c k; struct 
xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 inum = dentry->d_inode->i_ino; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS(inum, 0), 0, k, ret) { + SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } + + offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; @@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, &hash, + bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 4151065ab853..f4f896545e1c 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -39,7 +39,8 @@ struct bch_inode_info; int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, +int bch2_xattr_set(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -- cgit From ef1669ffc69c4926066451e1d0bc32e5ed6fc7d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Apr 2021 00:15:44 -0400 Subject: bcachefs: Update fsck for snapshots This updates the fsck algorithms to handle snapshots - meaning there will be multiple versions of the same key (extents, inodes, dirents, xattrs) in different snapshots, and we have to carefully consider which keys are visible in which snapshot. 
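For background, a rough sketch of the visibility rule this series relies on (illustration only, not code added by this patch): a key version written in snapshot `ancestor` is visible in snapshot `id` if `id` descends from `ancestor` and no intermediate snapshot has its own, newer version of the same key. The helper name `is_ancestor()` and the `seen[]` array below are placeholders standing in for the snapshot-tree ancestry check and the list of snapshot IDs already visited at this key position, not the actual bcachefs API.

	#include <stdbool.h>
	#include <stddef.h>

	/*
	 * Illustration only.  seen[] holds the snapshot IDs (in increasing
	 * order) of the versions of this key already visited; the last entry
	 * is @ancestor, the snapshot the candidate version was written in.
	 */
	static bool demo_key_visible(const unsigned *seen, size_t nr,
				     unsigned id, unsigned ancestor,
				     bool (*is_ancestor)(unsigned child, unsigned parent))
	{
		long i;

		if (id == ancestor)
			return true;		/* same snapshot: trivially visible */

		if (!is_ancestor(id, ancestor))
			return false;		/* @id does not descend from @ancestor */

		/*
		 * If some snapshot between @id and @ancestor also wrote this
		 * key, that newer version shadows the one in @ancestor:
		 */
		for (i = (long) nr - 2; i >= 0 && seen[i] >= id; --i)
			if (is_ancestor(id, seen[i]) &&
			    is_ancestor(seen[i], ancestor))
				return false;

		return true;
	}

The fsck passes below iterate each btree with BTREE_ITER_ALL_SNAPSHOTS and use a check along these lines to decide which inode version an extent, dirent or xattr should be counted against.
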
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_leaf.c | 5 +- fs/bcachefs/fsck.c | 1400 +++++++++++++++++++++++++++++---------- 3 files changed, 1044 insertions(+), 363 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 058d283a105c..6f19b67c398f 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e904f9195cc..1922bf8236f7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1204,13 +1204,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, + unsigned iter_flags, u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); retry: while ((bch2_trans_begin(trans), (k = bch2_btree_iter_peek(&iter)).k) && @@ -1277,5 +1278,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 40b107715cdd..b4a6b3d2ed07 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -18,7 +18,8 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -26,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) int ret; for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(inum, 0), 0, k, ret) { + SPOS(inum, 0, snapshot), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -39,6 +40,33 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode != inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvol) { @@ -72,8 +100,8 @@ static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); } -static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol, - u64 *inum) +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + 
u32 *snapshot, u64 *inum) { struct btree_iter iter; struct bkey_s_c k; @@ -92,6 +120,7 @@ static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol, goto err; } + *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); err: bch2_trans_iter_exit(trans, &iter); @@ -99,9 +128,10 @@ err: } -static int subvol_lookup_root(struct btree_trans *trans, u32 subvol, u64 *inum) +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) { - return lockrestart_do(trans, __subvol_lookup_root(trans, subvol, inum)); + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); } static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, @@ -113,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), 0); + SPOS(0, inode_nr, *snapshot), 0); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (snapshot) - *snapshot = iter.pos.snapshot; + *snapshot = iter.pos.snapshot; ret = k.k->type == KEY_TYPE_inode ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; @@ -136,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); } +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + return lockrestart_do(trans, + __lookup_dirent(trans, hash_info, dir, name, target, type)); +} + static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) @@ -166,6 +225,71 @@ static int write_inode(struct btree_trans *trans, return ret; } +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_inode) { + bch2_fs_inconsistent(trans->c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); + if (ret) + goto err; + } + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == -EINTR) + goto retry; + + return ret; +} + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -200,32 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, - u32 subvol, +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; struct bch_inode_unpacked root; struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - u64 inum; + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; u32 snapshot; int ret; - ret = subvol_lookup_root(trans, subvol, &inum); + ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; - ret = lookup_inode(trans, inum, &root, &snapshot); - if (ret && ret != -ENOENT) + ret = lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) { + bch_err(c, "error fetching subvol root: %i", ret); return ret; + } root_hash_info = bch2_hash_info_init(c, &root); - inum = bch2_dirent_lookup(c, root.bi_inum, &root_hash_info, - &lostfound_str); - if (!inum) { + + ret = lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (ret == -ENOENT) { bch_notice(c, "creating lost+found"); goto create_lostfound; } + if (ret) { + bch_err(c, "error looking up lost+found: %i", ret); + return ret; + } + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return ret; + + } + ret = lookup_inode(trans, inum, lostfound, &snapshot); if (ret && ret != -ENOENT) { /* @@ -243,11 +384,9 @@ create_lostfound: ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_create_trans(trans, - BCACHEFS_ROOT_INO, &root, - lostfound, - &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); + bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, 0)); if (ret) bch_err(c, "error creating lost+found: %i", ret); } @@ -257,7 +396,7 @@ create_lostfound: static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, - u32 snapshot) + u32 inode_snapshot) { struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; @@ -267,7 +406,7 @@ static int reattach_inode(struct btree_trans *trans, u32 subvol; int ret; - ret = snapshot_lookup_subvol(trans, snapshot, &subvol); + ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol); if (ret) return ret; @@ -289,10 +428,15 @@ static int reattach_inode(struct btree_trans *trans, name = (struct qstr) QSTR(name_buf); ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, - mode_to_type(inode->bi_mode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE)); + bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + mode_to_type(inode->bi_mode), + &name, inode->bi_inum, 
&dir_offset, + BCH_HASH_SET_MUST_CREATE)); if (ret) { bch_err(trans->c, "error %i reattaching inode %llu", ret, inode->bi_inum); @@ -302,7 +446,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return write_inode(trans, inode, U32_MAX); + return write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -329,45 +473,287 @@ out: return ret; } +struct snapshots_seen { + struct bpos pos; + size_t nr; + size_t size; + u32 *d; +}; + +static void snapshots_seen_exit(struct snapshots_seen *s) +{ + kfree(s->d); + s->d = NULL; +} + +static void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +{ + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) + s->nr = 0; + s->pos = pos; + + if (s->nr == s->size) { + size_t new_size = max(s->size, 128UL) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "error reallocating snapshots_seen table (new size %zu)", + new_size); + return -ENOMEM; + } + + s->size = new_size; + s->d = d; + } + + /* Might get called multiple times due to lock restarts */ + if (s->nr && s->d[s->nr - 1] == pos.snapshot) + return 0; + + s->d[s->nr++] = pos.snapshot; + return 0; +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + + BUG_ON(id > ancestor); + + id = snapshot_t(c, id)->equiv; + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ + BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + for (i = seen->nr - 2; + i >= 0 && seen->d[i] >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && + bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * This assumes we're visiting @src keys in natural key order. + * + * @s - list of snapshot IDs already seen at @src + * @src - snapshot ID of src key + * @dst - snapshot ID of dst key + */ +static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ? 
key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + struct inode_walker { - bool first_this_inode; - bool have_inode; - u64 cur_inum; - u32 snapshot; - struct bch_inode_unpacked inode; + bool first_this_inode; + u64 cur_inum; + + size_t nr; + size_t size; + struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; + } *d; }; +static void inode_walker_exit(struct inode_walker *w) +{ + kfree(w->d); + w->d = NULL; +} + static struct inode_walker inode_walker_init(void) { - return (struct inode_walker) { - .cur_inum = -1, - .have_inode = false, + return (struct inode_walker) { 0, }; +} + +static int inode_walker_realloc(struct inode_walker *w) +{ + if (w->nr == w->size) { + size_t new_size = max_t(size_t, 8UL, w->size * 2); + void *d = krealloc(w->d, new_size * sizeof(w->d[0]), + GFP_KERNEL); + if (!d) + return -ENOMEM; + + w->d = d; + w->size = new_size; + } + + return 0; +} + +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c_inode inode) +{ + struct bch_inode_unpacked u; + int ret; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + w->d[w->nr++] = (struct inode_walker_entry) { + .inode = u, + .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, }; + + return 0; } static int __walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) + struct inode_walker *w, struct bpos pos) { - if (inum != w->cur_inum) { - int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i, ancestor_pos; + int ret; - if (ret && ret != -ENOENT) - return ret; + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; - w->have_inode = !ret; - w->cur_inum = inum; - w->first_this_inode = true; - } else { + if (pos.inode == w->cur_inum) { w->first_this_inode = false; + goto lookup_snapshot; } - return 0; + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != pos.inode) + break; + + if (k.k->type == KEY_TYPE_inode) + add_inode(c, w, bkey_s_c_to_inode(k)); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: + for (i = 0; i < w->nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + goto found; + return INT_MAX; +found: + BUG_ON(pos.snapshot > w->d[i].snapshot); + + if (pos.snapshot != w->d[i].snapshot) { + ancestor_pos = i; + + while (i && w->d[i - 1].snapshot > pos.snapshot) + --i; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); + w->d[i].snapshot = pos.snapshot; + w->d[i].count = 0; + } + + return i; } static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) + struct inode_walker *w, struct bpos pos) { - return lockrestart_do(trans, __walk_inode(trans, w, inum)); + return lockrestart_do(trans, __walk_inode(trans, w, pos)); +} + +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + 
w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (k.k->type != KEY_TYPE_inode) + continue; + + if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { + add_inode(c, w, bkey_s_c_to_inode(k)); + if (k.k->p.snapshot >= s->pos.snapshot) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + char buf[200]; + int ret = 0; + + if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + return ret ?: -EINTR; + } +fsck_err: + return ret; } static int hash_redo_key(struct btree_trans *trans, @@ -375,6 +761,9 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { + bch_err(trans->c, "hash_redo_key() not implemented yet"); + return -EINVAL; +#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -393,6 +782,7 @@ static int hash_redo_key(struct btree_trans *trans, return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); +#endif } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -484,30 +874,29 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_inode inode) + struct bch_inode_unpacked *prev, + struct bch_inode_unpacked u) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; bool do_update = false; int ret = 0; - ret = bch2_inode_unpack(inode, &u); - - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fsck", - inode.k->p.inode)) - return ret; + if (fsck_err_on(prev && + (prev->bi_hash_seed != u.bi_hash_seed || + mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch_verbose(c, "deleting inode %llu", u.bi_inum); - bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum, false); + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; @@ -527,9 +916,10 @@ static int check_inode(struct btree_trans *trans, * just switch units to bytes and that issue goes away */ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), POS(u.bi_inum, U64_MAX), - NULL); + 0, NULL); if (ret) { bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; @@ -554,7 +944,7 @@ static int check_inode(struct btree_trans *trans, bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); - sectors = bch2_count_inode_sectors(trans, u.bi_inum); + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); @@ -574,11 +964,7 @@ 
static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_inode_write(trans, iter, &u)); + ret = write_inode(trans, &u, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -594,26 +980,49 @@ static int check_inodes(struct bch_fs *c, bool full) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; + struct bch_inode_unpacked prev, u; int ret; + memset(&prev, 0, sizeof(prev)); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = check_key_has_snapshot(&trans, &iter, k); + if (ret) + break; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + continue; + if (k.k->type != KEY_TYPE_inode) continue; inode = bkey_s_c_to_inode(k); - if (full || - (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) { - ret = check_inode(&trans, &iter, inode); - if (ret) - break; - } + if (!full && + !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + continue; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + ret = check_inode(&trans, &iter, + full && prev.bi_inum == u.bi_inum + ? &prev : NULL, u); + if (ret) + break; + + prev = u; } bch2_trans_iter_exit(&trans, &iter); @@ -622,6 +1031,29 @@ static int check_inodes(struct bch_fs *c, bool full) return bch2_trans_exit(&trans) ?: ret; } +noinline_for_stack +static int check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +/* + * Checking for overlapping extents needs to be reimplemented + */ +#if 0 static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { @@ -638,55 +1070,195 @@ static int fix_overlapping_extent(struct btree_trans *trans, bch2_cut_front(cut_at, u); - /* - * We don't want to go through the extent_handle_overwrites path: - * - * XXX: this is going to screw up disk accounting, extent triggers - * assume things about extent overwrites - we should be running the - * triggers manually here - */ - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + /* + * We don't want to go through the extent_handle_overwrites path: + * + * XXX: this is going to screw up disk accounting, extent triggers + * assume things about extent overwrites - we should be running the + * triggers manually here + */ + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + + BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + bch2_trans_iter_exit(trans, &iter); + return ret; +} +#endif + +static int 
inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k->type != KEY_TYPE_dirent) + goto out; + + ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static bool inode_backpointer_matches(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.k->p.inode == inode->bi_dir && + d.k->p.offset == inode->bi_dir_offset; +} + +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_sectors == i->count) + continue; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, + i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) + continue; + + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -EINTR; + } +fsck_err: + return ret ?: ret2; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker *inode, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; + char buf[200]; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + return ret; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (ret == INT_MAX) + return 0; - BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - bch2_trans_iter_exit(trans, &iter); - return ret; -} + i = inode->d + ret; + ret = 0; -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - struct 
btree_iter iter; - struct bkey_s_c k; - int ret; + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; + } + } + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) - goto out; + if (bkey_extent_is_allocation(k.k)) + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + i->count += k.k->size; +#if 0 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif - ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; -out: - bch2_trans_iter_exit(trans, &iter); +fsck_err: return ret; } -static bool inode_backpointer_matches(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.k->p.inode == inode->bi_dir && - d.k->p.offset == inode->bi_dir_offset; -} - /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -695,15 +1267,17 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf prev; - u64 i_sectors = 0; int ret = 0; +#if 0 + struct bkey_buf prev; bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); +#endif + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); @@ -711,96 +1285,172 @@ static int check_extents(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k))) { - if (w.have_inode && - w.cur_inum != k.k->p.inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - fsck_err_on(w.inode.bi_sectors != i_sectors, c, - "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - w.inode.bi_sectors = i_sectors; - - ret = write_inode(&trans, &w.inode, w.snapshot); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + + do { + ret = lockrestart_do(&trans, + check_extent(&trans, &iter, &w, &s)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); +#if 0 + bch2_bkey_buf_exit(&prev, c); +#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + 
struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + + if (fsck_err_on(i->inode.bi_nlink != i->count, c, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; + ret2 = -EINTR; } + } +fsck_err: + return ret ?: ret2; +} - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + bool backpointer_exists = true; + char buf[200]; + int ret = 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (!inode_backpointer_matches(d, target)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); + backpointer_exists = ret; + ret = 0; - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; + if (fsck_err_on(S_ISDIR(target->bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { + ret = remove_dirent(trans, d.k->p); + if (ret) + goto err; + return 0; } - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; + if (fsck_err_on(backpointer_exists && + !target->bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + target->bi_inum)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; - if (w.first_this_inode) - i_sectors = 0; - - if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || - fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, 0), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + if (fsck_err_on(!backpointer_exists, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, + target->bi_dir, + 
target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; } + } - if (bkey_extent_is_allocation(k.k)) - i_sectors += k.k->size; - bch2_bkey_buf_reassemble(&prev, c, k); + if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target->bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + struct bkey_i_dirent *n; - bch2_btree_iter_advance(&iter); + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto err; + } + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target->bi_mode); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_trans_update(trans, iter, &n->k_i, 0)); + kfree(n); + if (ret) + goto err; } +err: fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_exit(&trans, &iter); - bch2_bkey_buf_exit(&prev, c); - return bch2_trans_exit(&trans) ?: ret; + return ret; } static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, - struct inode_walker *w, unsigned *nr_subdirs) + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; + struct inode_walker_entry *i; u32 target_snapshot; u32 target_subvol; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; + u64 target_inum; char buf[200]; int ret; @@ -812,38 +1462,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (w->have_inode && - w->cur_inum != k.k->p.inode && - fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { - w->inode.bi_nlink = *nr_subdirs; - ret = write_inode(trans, &w->inode, w->snapshot); - return ret ?: -EINTR; - } + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; - ret = __walk_inode(trans, w, k.k->p.inode); + ret = snapshots_seen_update(c, s, k.k->p); if (ret) return ret; - if (w->first_this_inode) - *nr_subdirs = 0; + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) + return ret; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) + return ret; - if (fsck_err_on(!w->have_inode, c, + if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || - fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (ret == INT_MAX) + return 0; + + i = dir->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %u:\n%s", - mode_to_type(w->inode.bi_mode), + mode_to_type(i->inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) return __bch2_trans_do(trans, NULL, NULL, 0, bch2_btree_delete_at(trans, iter, 0)); - if (!w->have_inode) - return 0; - - if (w->first_this_inode) - *hash_info = bch2_hash_info_init(c, &w->inode); + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); 
ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); @@ -856,128 +1517,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, return 0; d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - ret = __bch2_dirent_read_target(&trans, d, + ret = __bch2_dirent_read_target(trans, d, &target_subvol, &target_snapshot, - &target_inum); + &target_inum, + true); if (ret && ret != -ENOENT) return ret; - ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); - if (ret && ret != -ENOENT) - return ret; + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", + le64_to_cpu(d.v->d_inum))) + return remove_dirent(trans, d.k->p); - have_target = !ret; - ret = 0; + if (target_subvol) { + struct bch_inode_unpacked subvol_root; - if (fsck_err_on(!have_target, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) - return remove_dirent(trans, d.k->p); + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; - if (!have_target) - return 0; + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } - if (!target.bi_dir && - !target.bi_dir_offset) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } - ret = __write_inode(trans, &target, target_snapshot) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); if (ret) return ret; - return -EINTR; - } - - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(trans, &target); - if (ret < 0) + } else { + ret = __get_visible_inodes(trans, target, s, target_inum); + if (ret) return ret; - backpointer_exists = ret; - ret = 0; - - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) - return remove_dirent(trans, d.k->p); - - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + if (fsck_err_on(!target->nr, c, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = remove_dirent(trans, d.k->p); + if (ret) + return ret; } - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + for (i = target->d; i < target->d + target->nr; i++) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) + return ret; } } - target_subvol = d.v->d_type == DT_SUBVOL - ? 
le64_to_cpu(d.v->d_inum) : 0; - - if (fsck_err_on(target.bi_subvol != target_subvol, c, - "subvol root %llu has wrong subvol field:\n" - "got %u\n" - "should be %u", - target.bi_inum, - target.bi_subvol, - target_subvol)) { - target.bi_subvol = target_subvol; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; - } - - if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) - return -ENOMEM; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, &n->k_i, 0)); - kfree(n); - return ret ?: -EINTR; - } + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; - *nr_subdirs += d.v->d_type == DT_DIR; - return 0; fsck_err: return ret; } @@ -989,31 +1598,39 @@ fsck_err: noinline_for_stack static int check_dirents(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; - unsigned nr_subdirs = 0; int ret = 0; bch_verbose(c, "checking dirents"); + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); do { ret = lockrestart_do(&trans, - check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs)); + check_dirent(&trans, &iter, &hash_info, + &dir, &target, &s)); if (ret) break; } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + return ret; } /* @@ -1036,15 +1653,22 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); retry: + bch2_trans_begin(&trans); + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { - ret = walk_inode(&trans, &w, k.k->p.inode); + ret = check_key_has_snapshot(&trans, &iter, k); if (ret) break; - if (fsck_err_on(!w.have_inode, c, + ret = walk_inode(&trans, &w, k.k->p); + if (ret < 0) + break; + + if (fsck_err_on(ret == INT_MAX, c, "xattr for missing inode %llu", k.k->p.inode)) { ret = bch2_btree_delete_at(&trans, &iter, 0); @@ -1053,14 +1677,18 @@ retry: continue; } - if (w.first_this_inode && w.have_inode) - hash_info = bch2_hash_info_init(c, &w.inode); + if (ret == INT_MAX) + goto next; + ret = 0; + + if (w.first_this_inode) + hash_info = bch2_hash_info_init(c, &w.d[0].inode); ret = hash_check_key(&trans, bch2_xattr_hash_desc, &hash_info, &iter, k); if (ret) break; - +next: bch2_btree_iter_advance(&iter); } fsck_err: @@ -1072,40 +1700,63 @@ fsck_err: } /* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +static int check_root(struct bch_fs *c) { - struct 
bkey_inode_buf packed; + struct btree_trans trans; + struct bch_inode_unpacked root_inode; u32 snapshot; + u64 inum; int ret; + bch2_trans_init(&trans, c, 0, 0); + bch_verbose(c, "checking root directory"); - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); + ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && ret != -ENOENT) return ret; - if (fsck_err_on(ret, c, "root directory missing")) - goto create_root; + if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + struct bkey_i_subvolume root_subvol; - if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, - "root inode not a directory")) - goto create_root; + snapshot = U32_MAX; + inum = BCACHEFS_ROOT_INO; - return 0; -fsck_err: - return ret; -create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode->bi_inum = BCACHEFS_ROOT_INO; + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { + bch_err(c, "error writing root subvol: %i", ret); + goto err; + } + + } + + ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && ret != -ENOENT) + return ret; - bch2_inode_pack(c, &packed, root_inode); + if (mustfix_fsck_err_on(ret, c, "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; - return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = write_inode(&trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %i", ret); + } +err: +fsck_err: + bch2_trans_exit(&trans); + return ret; } struct pathbuf { @@ -1147,17 +1798,18 @@ static int check_path(struct btree_trans *trans, size_t i; int ret = 0; + snapshot = snapshot_t(c, snapshot)->equiv; p->nr = 0; while (inode->bi_inum != BCACHEFS_ROOT_INO) { ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode)); + inode_backpointer_exists(trans, inode, snapshot)); if (ret < 0) break; if (!ret) { - if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", - inode->bi_inum, + if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, mode_to_type(inode->bi_mode), inode->bi_nlink, inode->bi_dir, @@ -1226,7 +1878,8 @@ static int check_directory_structure(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1237,6 +1890,9 @@ static int check_directory_structure(struct bch_fs *c) break; } + if (u.bi_flags & BCH_INODE_UNLINKED) + continue; + ret = check_path(&trans, &path, &u, iter.pos.snapshot); if (ret) break; @@ -1295,8 +1951,9 @@ static int nlink_cmp(const void *_l, const void *_r) return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); } -static void inc_link(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum) +static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 
+ struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) { struct nlink *link, key = { .inum = inum, .snapshot = U32_MAX, @@ -1307,8 +1964,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links, link = __inline_bsearch(&key, links->d, links->nr, sizeof(links->d[0]), nlink_cmp); - if (link) - link->count++; + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } } noinline_for_stack @@ -1328,7 +1995,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1369,23 +2037,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links u64 range_start, u64 range_end) { struct btree_trans trans; + struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; int ret; + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, k.k->p); + if (ret) + break; + switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); - if (d.v->d_type != DT_DIR) - inc_link(c, links, range_start, range_end, - le64_to_cpu(d.v->d_inum)); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); break; } @@ -1393,10 +2071,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links } bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); return ret; } @@ -1418,7 +2097,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset >= range_end) break; @@ -1434,7 +2114,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, if (!u.bi_nlink) continue; - while (link->inum < k.k->p.offset) { + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { link++; BUG_ON(link >= links->d + links->nr); } @@ -1507,14 +2188,13 @@ static int check_nlinks(struct bch_fs *c) */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode; - return bch2_fs_snapshots_check(c) ?: check_inodes(c, true) ?: + check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: - check_root(c, &root_inode) ?: + check_root(c) ?: check_directory_structure(c) ?: check_nlinks(c); } -- cgit From 8c6d298ab22fc1b2912ccef4ffd4a01b35f9c5b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Mar 2021 20:30:39 -0500 Subject: bcachefs: Convert io paths for snapshots This plumbs around the subvolume ID as was done previously for other filesystem code, but now for the IO paths - the control flow in the IO paths is trickier 
so the changes in this patch are more involved. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 64 ++++++++++++++----------- fs/bcachefs/io.c | 128 ++++++++++++++++++++++++++++++------------------- fs/bcachefs/io.h | 19 ++++---- fs/bcachefs/io_types.h | 2 + fs/bcachefs/reflink.c | 24 ++++++---- 5 files changed, 144 insertions(+), 93 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7a0772195182..736dd71419a5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -769,23 +769,35 @@ static void readpage_bio_extend(struct readpages_iter *iter, } } -static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, - struct bch_read_bio *rbio, u64 inum, +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + u32 snapshot; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); retry: bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -800,15 +812,15 @@ retry: break; } - bch2_btree_iter_set_pos(iter, - POS(inum, rbio->bio.bi_iter.bi_sector)); + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -838,7 +850,7 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(trans, rbio, iter->pos, + bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) @@ -847,12 +859,14 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } +err: + bch2_trans_iter_exit(trans, &iter); if (ret == -EINTR) goto retry; if (ret) { - bch_err_inum_ratelimited(c, inum, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); @@ -867,7 +881,6 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter iter; struct page *page; struct readpages_iter readpages_iter; int ret; @@ -876,8 +889,6 @@ void bch2_readahead(struct readahead_control *ractl) BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -898,22 +909,20 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); - bchfs_read(&trans, &iter, rbio, inode->v.i_ino, + bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); - bch2_trans_iter_exit(&trans, &iter); 
bch2_trans_exit(&trans); kfree(readpages_iter.pages); } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inum, struct page *page) + subvol_inum inum, struct page *page) { struct btree_trans trans; - struct btree_iter iter; bch2_page_state_create(page, __GFP_NOFAIL); @@ -923,12 +932,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); - - bchfs_read(&trans, &iter, rbio, inum, NULL); - - bch2_trans_iter_exit(&trans, &iter); + bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } @@ -951,7 +955,7 @@ static int bch2_read_single_page(struct page *page, rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1096,6 +1100,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->wbio.bio.bi_iter.bi_sector = sector; @@ -1733,7 +1738,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } iter->count += shorten; @@ -1816,7 +1821,8 @@ retry: if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) break; - if (nr_replicas > bch2_bkey_replicas(c, k) || + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || (!compressed && bch2_bkey_sectors_compressed(k))) { ret = false; break; @@ -1944,6 +1950,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); if ((req->ki_flags & IOCB_DSYNC) && @@ -2438,7 +2445,7 @@ int bch2_truncate(struct mnt_idmap *idmap, truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); @@ -2498,7 +2505,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len if (discard_start < discard_end) { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), discard_start, discard_end, &inode->ei_journal_seq, &i_sectors_delta); @@ -2577,7 +2584,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, } else { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, &inode->ei_journal_seq, &i_sectors_delta); @@ -2793,7 +2800,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_extent_update(&trans, &iter, &reservation.k_i, + ret = bch2_extent_update(&trans, inode_inum(inode), &iter, + &reservation.k_i, &disk_res, &inode->ei_journal_seq, 0, 
&i_sectors_delta, true); i_sectors_acct(c, inode, "a_res, i_sectors_delta); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f5e0099b848..bd96c6bebe18 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -27,6 +27,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include "trace.h" @@ -230,7 +231,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, : 0; if (!*usage_increasing && - (new_replicas > bch2_bkey_replicas(c, old) || + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; @@ -266,6 +268,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, @@ -324,11 +327,8 @@ int bch2_extent_update(struct btree_trans *trans, struct btree_iter inode_iter; struct bch_inode_unpacked inode_u; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, - (subvol_inum) { - .subvol = BCACHEFS_ROOT_SUBVOL, - .inum = k->k.p.inode, - }, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); if (ret) return ret; @@ -384,22 +384,37 @@ int bch2_extent_update(struct btree_trans *trans, return 0; } +/* + * Returns -EINTR if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, - s64 *i_sectors_delta) + subvol_inum inum, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(iter)).k) && - bkey_cmp(iter->pos, end) < 0) { + while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); + if (bkey_cmp(iter->pos, end_pos) >= 0) + break; + ret = bkey_err(k); if (ret) goto btree_err; @@ -409,9 +424,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, + ret = bch2_extent_update(trans, inum, iter, &delete, &disk_res, journal_seq, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); @@ -424,36 +439,31 @@ btree_err: break; } - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } + if (bkey_cmp(iter->pos, end_pos) > 0) + bch2_btree_iter_set_pos(iter, end_pos); return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, u64 *journal_seq, s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter iter; - int ret = 0; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum, start), - BTREE_ITER_INTENT); + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = 
bch2_fpunch_at(&trans, &iter, POS(inum, end), + ret = bch2_fpunch_at(&trans, &iter, inum, end, journal_seq, i_sectors_delta); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret == -EINTR) - ret = 0; - - return ret; + return ret == -EINTR ? 0 : ret; } static int bch2_write_index_default(struct bch_write_op *op) @@ -464,40 +474,51 @@ static int bch2_write_index_default(struct bch_write_op *op) struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; + BUG_ON(!inum.subvol); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); - k->k.p.snapshot = iter.snapshot; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (ret == -EINTR) + continue; + if (ret) + break; - bch2_bkey_buf_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter.pos, sk.k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, &iter, sk.k, + ret = bch2_extent_update(&trans, inum, &iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) continue; if (ret) break; if (bkey_cmp(iter.pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1645,7 +1666,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { @@ -1709,7 +1730,10 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->read_pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1725,12 +1749,12 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); } else { flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - __bch2_read(c, rbio, iter, inode, &failed, flags); + __bch2_read(c, rbio, iter, inum, &failed, flags); } } @@ -2174,6 +2198,7 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; + rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; @@ -2281,25 +2306,31 @@ out_read_done: } void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c 
k; + u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); retry: bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; @@ -2314,7 +2345,7 @@ retry: } bch2_btree_iter_set_pos(&iter, - POS(inode, bvec_iter.bi_sector)); + POS(inum.inum, bvec_iter.bi_sector)); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -2364,16 +2395,17 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } +err: + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_ratelimited(c, inode, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index f21ffb53c1e4..ebb0944b4ca3 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -83,12 +83,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct disk_reservation *, - u64 *, u64, s64 *, bool); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64 *, u64, s64 *, bool); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + subvol_inum, u64, u64 *, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) @@ -108,6 +109,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_have.nr = 0; op->target = 0; op->opts = opts; + op->subvol = 0; op->pos = POS_MAX; op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; @@ -174,10 +176,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_io_failures *, unsigned flags); + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) + subvol_inum inum) { struct bch_io_failures failed = { .nr = 0 }; @@ -185,8 +187,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 50361f2fb8f1..53270f0a08a3 100644 
--- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -62,6 +62,7 @@ struct bch_read_bio { /* * pos we read from - different from data_pos for indirect extents: */ + u32 subvol; struct bpos read_pos; /* @@ -124,6 +125,7 @@ struct bch_write_op { u16 nonce; struct bch_io_opts opts; + u32 subvol; struct bpos pos; struct bversion version; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index be4b47bc7438..92ff609453b8 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -212,6 +212,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; + u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -243,15 +244,19 @@ s64 bch2_remap_range(struct bch_fs *c, } ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, - &src_iter.snapshot); + &src_snapshot); if (ret) continue; + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, - &dst_iter.snapshot); + &dst_snapshot); if (ret) continue; + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -262,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c, continue; if (bkey_cmp(src_want, src_iter.pos) < 0) { - ret = bch2_fpunch_at(&trans, &dst_iter, - bpos_min(dst_end, - POS(dst_iter.pos.inode, dst_iter.pos.offset + - src_iter.pos.offset - src_want.offset)), - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + journal_seq, i_sectors_delta); continue; } @@ -303,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(&trans, &dst_iter, new_dst.k, - &disk_res, journal_seq, + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, journal_seq, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); -- cgit From 7a7d17b2f7c23c0891b0cbd13fafd3bc805b1b29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Feb 2021 17:09:10 -0500 Subject: bcachefs: Whiteouts for snapshots This patch adds KEY_TYPE_whiteout, a new type of whiteout for snapshots, when we're deleting and the key being deleted is in an ancestor snapshot - and updates the transaction update/commit path to use it. 
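To make the rule concrete, here is a minimal, self-contained sketch, assuming a toy snapshot table with parent pointers (the structures and helper names below are simplified stand-ins, not the real bcachefs ones): a deletion has to be emitted as a whiteout whenever the key being deleted is still live in an ancestor snapshot, since a plain deletion would let the ancestor's key show through again.

#include <stdbool.h>
#include <stdio.h>

/* Simplified snapshot tree: parent id per snapshot, 0 = no parent. */
struct snapshot { unsigned parent; };

static const struct snapshot snapshots[] = {
	[1] = { .parent = 0 },		/* root snapshot */
	[2] = { .parent = 1 },		/* child of 1 */
};

/* Walk parent pointers: is "ancestor" an ancestor of (or equal to) "id"? */
static bool snapshot_is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id != ancestor)
		id = snapshots[id].parent;
	return id == ancestor;
}

/*
 * A deletion at some position in snapshot "del_snap" must be written as a
 * whiteout if a key at that position is still live in an ancestor snapshot.
 */
static bool need_whiteout(const unsigned *live_snapshots, int nr,
			  unsigned del_snap)
{
	for (int i = 0; i < nr; i++)
		if (live_snapshots[i] != del_snap &&
		    snapshot_is_ancestor(del_snap, live_snapshots[i]))
			return true;
	return false;
}

int main(void)
{
	unsigned live[] = { 1 };	/* the key also exists in snapshot 1 */

	printf("delete in snapshot 2 -> whiteout needed: %d\n",
	       need_whiteout(live, 1, 2));	/* 1: ancestor still has the key */
	printf("delete in snapshot 1 -> whiteout needed: %d\n",
	       need_whiteout(live, 1, 1));	/* 0: plain deletion is enough */
	return 0;
}

The need_whiteout_for_snapshot() helper added by this patch performs the same kind of check, but by scanning the btree at the deletion position across snapshots.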
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/bkey.h | 2 +- fs/bcachefs/bkey_methods.c | 26 ++++++--- fs/bcachefs/btree_update_leaf.c | 113 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 127 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ae8f3a5bc787..f922302332ee 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -327,7 +327,7 @@ static inline void bkey_init(struct bkey *k) */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ - x(discard, 1) \ + x(whiteout, 1) \ x(error, 2) \ x(cookie, 3) \ x(hash_whiteout, 4) \ @@ -361,7 +361,7 @@ struct bch_deleted { struct bch_val v; }; -struct bch_discard { +struct bch_whiteout { struct bch_val v; }; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 904ceb67a029..6a637a408a9f 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -63,7 +63,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) enum bkey_lr_packed { BKEY_PACKED_BOTH, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 42fdcc4487de..3133db236b7b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -31,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -#define bch2_bkey_ops_discard (struct bkey_ops) { \ +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ } @@ -101,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_extents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_error)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_extent)| @@ -108,30 +110,43 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_reflink_p)| (1U << KEY_TYPE_inline_data), [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_dirent), [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_xattr), [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| (1U << KEY_TYPE_alloc_v2), [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_stripe), [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_reflink_v)| (1U << KEY_TYPE_indirect_inline_data), [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_subvolume), [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_snapshot), [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), }; @@ -139,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = { const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { - unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| - bch2_key_types_allowed[type] ; - if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if 
(!(key_types_allowed & (1U << k.k->type))) + if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) return "invalid key type for this btree"; if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (btree_node_type_is_extents(type)) { - if ((k.k->size == 0) != bkey_deleted(k.k)) + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (k.k->size == 0) return "bad size field"; if (k.k->size > k.k->p.offset) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1922bf8236f7..2fc134e34572 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1002,21 +1002,24 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto next; } - if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) + if (!bkey_cmp(k.k->p, start)) goto next; while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { + bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; + bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; + /* * If we're going to be splitting a compressed extent, note it * so that __bch2_trans_commit() can increase our disk * reservation: */ - if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && - bkey_cmp(k.k->p, insert->k.p) > 0 && + if (((front_split && back_split) || + ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && (compressed_sectors = bch2_bkey_sectors_compressed(k))) trans->extra_journal_res += compressed_sectors; - if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + if (front_split) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1027,6 +1030,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + } + + if (k.k->p.snapshot != insert->k.p.snapshot && + (front_split || back_split)) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_front(start, update); + bch2_cut_back(insert->k.p, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&update_iter) ?: bch2_trans_update(trans, &update_iter, update, @@ -1038,12 +1067,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, } if (bkey_cmp(k.k->p, insert->k.p) <= 0) { - ret = bch2_btree_delete_at(trans, &iter, flags); + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_init(&update->k); + update->k.p = k.k->p; + + if (insert->k.p.snapshot != k.k->p.snapshot) { + update->k.p.snapshot = insert->k.p.snapshot; + update->k.type = KEY_TYPE_whiteout; + } + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) goto err; } - if (bkey_cmp(k.k->p, insert->k.p) > 0) { + if (back_split) { update = bch2_trans_kmalloc(trans, 
bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1051,10 +1100,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - ret = bch2_trans_update(trans, &iter, update, flags); + bch2_trans_copy_iter(&update_iter, &iter); + update_iter.pos = update->k.p; + ret = bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) goto err; - goto out; } next: @@ -1086,6 +1140,39 @@ err: return ret; } +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1118,6 +1205,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, btree_insert_entry_cmp(i - 1, i) >= 0); #endif + if (bkey_deleted(&n.k->k) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + n.k->k.type = KEY_TYPE_whiteout; + } + /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: -- cgit From 18443cb9f005b5563e2e3da9b8ccd374a552c3b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Aug 2021 00:41:41 -0400 Subject: bcachefs: Update data move path for snapshots The data move path operates on existing extents, and not within a subvolume as the regular IO paths do. It needs to change because it may cause existing extents to be split, and when splitting an existing extent in an ancestor snapshot we need to make sure the new split has the same visibility in child snapshots as the existing extent. 
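To see why, consider a minimal toy model (point keys rather than real extents, and simplified stand-in types - none of this is code from the patch): a key written at a new btree position in an ancestor snapshot must not become visible in a child snapshot that had overridden the original, which is what the inserted whiteouts guarantee.

#include <stdbool.h>
#include <stdio.h>

/* Toy model: point keys identified by (pos, snapshot); 2 is a child of 1. */
struct key { unsigned pos, snapshot; bool whiteout; };

static const unsigned parent_of[] = { [1] = 0, [2] = 1 };

static bool is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id != ancestor)
		id = parent_of[id];
	return id == ancestor;
}

/*
 * Visibility rule: of the keys at "pos" whose snapshot is the reader's
 * snapshot or an ancestor of it, the one closest to the reader wins;
 * a whiteout hides the position entirely.
 */
static const struct key *lookup(const struct key *keys, int nr,
				unsigned pos, unsigned snapshot)
{
	const struct key *best = NULL;

	for (int i = 0; i < nr; i++)
		if (keys[i].pos == pos &&
		    is_ancestor(snapshot, keys[i].snapshot) &&
		    (!best || is_ancestor(keys[i].snapshot, best->snapshot)))
			best = &keys[i];

	return best && !best->whiteout ? best : NULL;
}

int main(void)
{
	/*
	 * Snapshot 1 had an extent at pos 10; child snapshot 2 had overwritten
	 * it, so snapshot 2 never saw the parent's data there.  The move path
	 * then writes a split of the parent's extent as a new key at pos 12.
	 */
	const struct key moved[] = {
		{ .pos = 10, .snapshot = 2 },
		{ .pos = 12, .snapshot = 1 },
	};
	const struct key moved_with_whiteout[] = {
		{ .pos = 10, .snapshot = 2 },
		{ .pos = 12, .snapshot = 1 },
		{ .pos = 12, .snapshot = 2, .whiteout = true },
	};

	printf("without whiteout, snapshot 2 sees pos 12: %s\n",
	       lookup(moved, 2, 12, 2) ? "yes (visibility changed)" : "no");
	printf("with whiteout,    snapshot 2 sees pos 12: %s\n",
	       lookup(moved_with_whiteout, 3, 12, 2) ? "yes" : "no (as before)");
	return 0;
}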
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 81 +++++++++++++++++++++++++++++++++++------ fs/bcachefs/fsck.c | 35 +----------------- fs/bcachefs/io.c | 3 +- fs/bcachefs/migrate.c | 6 ++- fs/bcachefs/move.c | 81 ++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/subvolume.h | 38 +++++++++++++++++++ 6 files changed, 195 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2fc134e34572..b4a2f2e32248 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -941,6 +941,43 @@ err: goto retry; } +static int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (bkey_cmp(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -958,14 +995,40 @@ static noinline int extent_front_merge(struct btree_trans *trans, bkey_reassemble(update, k); - if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) { - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; - *insert = update; - } + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + bch2_bkey_merge(c, bkey_i_to_s(insert), k); return 0; } @@ -974,7 +1037,6 @@ static int bch2_trans_update_extent(struct btree_trans *trans, struct bkey_i *insert, enum btree_update_flags flags) { - struct bch_fs *c = trans->c; struct btree_iter iter, update_iter; struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; @@ -1002,9 +1064,6 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto next; } - if (!bkey_cmp(k.k->p, start)) - goto next; - while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; @@ -1120,7 +1179,7 @@ next: } if (bch2_bkey_maybe_mergable(&insert->k, k.k)) - bch2_bkey_merge(c, bkey_i_to_s(insert), k); + extent_back_merge(trans, &iter, insert, k); out: if (!bkey_deleted(&insert->k)) { /* diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b4a6b3d2ed07..f9a6a0b3ce7a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -473,24 +473,6 @@ out: return ret; } -struct snapshots_seen 
{ - struct bpos pos; - size_t nr; - size_t size; - u32 *d; -}; - -static void snapshots_seen_exit(struct snapshots_seen *s) -{ - kfree(s->d); - s->d = NULL; -} - -static void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); -} - static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) { pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; @@ -499,26 +481,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str s->nr = 0; s->pos = pos; - if (s->nr == s->size) { - size_t new_size = max(s->size, 128UL) * 2; - u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "error reallocating snapshots_seen table (new size %zu)", - new_size); - return -ENOMEM; - } - - s->size = new_size; - s->d = d; - } - /* Might get called multiple times due to lock restarts */ if (s->nr && s->d[s->nr - 1] == pos.snapshot) return 0; - s->d[s->nr++] = pos.snapshot; - return 0; + return snapshots_seen_add(c, s, pos.snapshot); } /** diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bd96c6bebe18..002fd35e6bfe 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1828,7 +1828,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - ret = bch2_trans_update(trans, &iter, new, 0); + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 1899326d9754..7c764ee4ea09 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { @@ -74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9dc6684139de..2e7d8e2fe331 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -14,6 +14,7 @@ #include "keylist.h" #include "move.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include "trace.h" @@ -52,6 +53,81 @@ struct moving_context { wait_queue_head_t wait; }; +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + struct snapshots_seen s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + snapshots_seen_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { +next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i 
*update; + size_t i; + + for (i = 0; i < s.nr; i++) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + kfree(s.d); + + return ret; +} + int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -165,7 +241,10 @@ int bch2_migrate_index_update(struct bch_write_op *op) next_pos = insert->k.p; - ret = bch2_trans_update(&trans, &iter, insert, 0) ?: + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index cea4c665af32..0740c7b7f772 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -54,6 +54,44 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances return id == ancestor; } +struct snapshots_seen { + struct bpos pos; + size_t nr; + size_t size; + u32 *d; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + kfree(s->d); + s->d = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + if (s->nr == s->size) { + size_t new_size = max(s->size, 128UL) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "error reallocating snapshots_seen table (new size %zu)", + new_size); + return -ENOMEM; + } + + s->size = new_size; + s->d = d; + } + + s->d[s->nr++] = id; + return 0; +} + int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -- cgit From 6f83cb84bb74cd766f888380cfb5f9268e55d9f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Dec 2021 20:38:56 -0500 Subject: bcachefs: Fix unit & perf tests for snapshots This finishes updating the unit & perf tests for snapshots - btrees that use snapshots now always require the snapshot field of the start position to be a valid snapshot ID. 
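For illustration only (not part of the patch), the conversion is mostly mechanical: test code that used to start iteration at POS_MIN / POS(0, 0) now spells out a snapshot ID, U32_MAX in these tests, via SPOS():

    /* before: snapshot field of the start position left as zero */
    for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
                       POS_MIN, 0, k, ret)
            ;

    /* after: the snapshot ID is passed explicitly */
    for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
                       SPOS(0, 0, U32_MAX), 0, k, ret)
            ;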
Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 61 ++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 1b583b134853..6023661ece16 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -14,12 +14,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - POS(0, 0), POS(0, U64_MAX), + SPOS(0, 0, U32_MAX), + SPOS(0, U64_MAX, U32_MAX), NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), + SPOS(0, 0, U32_MAX), + SPOS(0, U64_MAX, U32_MAX), NULL); BUG_ON(ret); } @@ -144,7 +146,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; @@ -200,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } @@ -254,8 +256,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; @@ -270,7 +272,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -319,8 +322,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; @@ -333,7 +336,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -361,7 +365,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); @@ -381,7 +386,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); @@ -404,8 +410,6 @@ static int insert_test_extent(struct bch_fs *c, struct bkey_i_cookie k; int ret; - //pr_info("inserting %llu-%llu v %llu", start, end, test_version); - bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; @@ -541,10 +545,11 @@ static int rand_lookup(struct bch_fs *c, u64 nr) u64 i; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); 
for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); @@ -567,7 +572,7 @@ static int rand_mixed_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); k = bch2_btree_iter_peek(iter); ret = bkey_err(k); @@ -594,7 +599,8 @@ static int rand_mixed(struct bch_fs *c, u64 nr) u64 i, rand; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { rand = test_rand(); @@ -614,7 +620,6 @@ static int rand_mixed(struct bch_fs *c, u64 nr) static int __do_delete(struct btree_trans *trans, struct bpos pos) { struct btree_iter iter; - struct bkey_i delete; struct bkey_s_c k; int ret = 0; @@ -628,10 +633,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) if (!k.k) goto err; - bkey_init(&delete.k); - delete.k.p = k.k->p; - - ret = bch2_trans_update(trans, &iter, &delete, 0); + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -646,7 +648,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - struct bpos pos = POS(0, test_rand()); + struct bpos pos = SPOS(0, test_rand(), U32_MAX); ret = __bch2_trans_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); @@ -673,7 +675,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter.pos; @@ -703,7 +705,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) ; bch2_trans_iter_exit(&trans, &iter); @@ -720,7 +723,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; @@ -745,8 +749,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), - NULL); + SPOS(0, 0, U32_MAX), POS_MAX, NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); return ret; -- cgit From a861c7225b9e31da745c262711d625782b5d766a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Mar 2021 22:34:00 -0400 Subject: bcachefs: Require snapshot id to be set Now that all the existing code has been converted for snapshots, this patch changes the code for initializing a btree iterator to require a snapshot to be specified, and also change bkey_invalid() to allow for non U32_MAX snapshot IDs. 
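A minimal sketch of the resulting rule for callers (illustrative only; inum, offset and snapshot are placeholder variables): positions on btrees that have snapshots must carry an explicit snapshot ID unless the iterator asks for all snapshots. bkey_invalid() correspondingly now only rejects a snapshot field of zero, rather than anything other than U32_MAX.

    /* filtered iteration within one snapshot: a snapshot ID is required */
    bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                         SPOS(inum, offset, snapshot), 0);

    /* iterating keys from every snapshot: POS_MIN et al. are still fine */
    bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
                         BTREE_ITER_ALL_SNAPSHOTS);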
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/btree_iter.c | 20 ++++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 3133db236b7b..f7f4139072b5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -182,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - k.k->p.snapshot != U32_MAX) + !k.k->p.snapshot) return "invalid snapshot field"; if (type != BKEY_TYPE_btree && diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b589b96bc9e7..df542a59fe12 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -677,6 +677,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); @@ -2522,20 +2525,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; - if (!btree_type_has_snapshots(btree_id) && - !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; -#if 0 - /* let's have this be explicitly set: */ - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - btree_type_has_snapshots(btree_id) && - !(flags & BTREE_ITER_ALL_SNAPSHOTS)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; -#endif - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) - pos.snapshot = btree_type_has_snapshots(btree_id) - ? U32_MAX : 0; + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; iter->trans = trans; iter->path = NULL; -- cgit From 42d237320e9817a94f3a0a2de28156523596b086 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 23:28:43 -0400 Subject: bcachefs: Snapshot creation, deletion This is the final patch in the patch series implementing snapshots. This patch implements two new ioctls that work like creation and deletion of directories, but fancier. 
- BCH_IOCTL_SUBVOLUME_CREATE, for creating new subvolumes and snaphots - BCH_IOCTL_SUBVOLUME_DESTROY, for deleting subvolumes and snapshots Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 --- fs/bcachefs/dirent.h | 4 -- fs/bcachefs/fs-common.c | 182 +++++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/fs-common.h | 7 +- fs/bcachefs/fs-ioctl.c | 168 ++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fs.c | 29 ++++---- fs/bcachefs/fs.h | 3 +- fs/bcachefs/fsck.c | 7 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/str_hash.h | 7 +- 10 files changed, 348 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f290580594ce..8653a106809d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -383,14 +383,6 @@ out: return ret; } -int bch2_dirent_delete_at(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - struct btree_iter *iter) -{ - return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, iter); -} - int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, subvol_inum dir, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 88b784a99cb5..e7f65fbd8e65 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -33,10 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int bch2_dirent_delete_at(struct btree_trans *, - const struct bch_hash_info *, - struct btree_iter *); - int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, u32 *, u32 *, u64 *, bool); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 02bf32cc7659..3e8e3c5bf870 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -11,6 +11,11 @@ #include +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + int bch2_create_trans(struct btree_trans *trans, subvol_inum dir, struct bch_inode_unpacked *dir_u, @@ -19,6 +24,7 @@ int bch2_create_trans(struct btree_trans *trans, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, struct posix_acl *acl, + subvol_inum snapshot_src, unsigned flags) { struct bch_fs *c = trans->c; @@ -27,10 +33,9 @@ int bch2_create_trans(struct btree_trans *trans, subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); - u64 dir_offset = 0; u64 dir_target; u32 snapshot; - unsigned dir_type; + unsigned dir_type = mode_to_type(mode); int ret; ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); @@ -41,37 +46,122 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); - if (!name) - new_inode->bi_flags |= BCH_INODE_UNLINKED; + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); - if (ret) - goto err; + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + 
if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct btree_iter subvol_iter; + struct bkey_s_c k; + + bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, + POS(0, snapshot_src.subvol), 0); + k = bch2_btree_iter_peek_slot(&subvol_iter); + + ret = bkey_err(k); + if (!ret && k.k->type != KEY_TYPE_subvolume) { + bch_err(c, "subvolume %u not found", + snapshot_src.subvol); + ret = -ENOENT; + } + + if (!ret) + snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); + bch2_trans_iter_exit(trans, &subvol_iter); + + if (ret) + goto err; + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; + } new_inum.inum = new_inode->bi_inum; dir_target = new_inode->bi_inum; - dir_type = mode_to_type(new_inode->bi_mode); - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - default_acl, ACL_TYPE_DEFAULT); + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); if (ret) goto err; - } - if (acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - acl, ACL_TYPE_ACCESS); + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } - if (name) { + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & BCH_CREATE_TMPFILE)) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + u64 dir_offset; - if (S_ISDIR(new_inode->bi_mode)) + if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; @@ -87,11 +177,11 @@ int bch2_create_trans(struct btree_trans *trans, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; - } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } } inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; @@ -160,7 +250,8 @@ int bch2_unlink_trans(struct btree_trans *trans, subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + const struct qstr *name, + int deleting_snapshot) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -169,6 +260,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); + struct bkey_s_c k; int ret; ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, 
BTREE_ITER_INTENT); @@ -187,29 +279,51 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (inode_u->bi_dir == dirent_iter.pos.inode && - inode_u->bi_dir_offset == dirent_iter.pos.offset) { - inode_u->bi_dir = 0; - inode_u->bi_dir_offset = 0; + if (deleting_snapshot == 1 && !inode_u->bi_subvol) { + ret = -ENOENT; + goto err; } - if (S_ISDIR(inode_u->bi_mode)) { + if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (dir.subvol != inum.subvol) { - ret = bch2_subvolume_delete(trans, inum.subvol, false); + if (inode_u->bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, + deleting_snapshot); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); if (ret) goto err; } + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); bch2_inode_nlink_dec(inode_u); - ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -348,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(src_inode_u->bi_mode)) { + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; } - if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 1bb2ac4dc13a..9bb0a9676147 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -5,6 +5,9 @@ struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, @@ -13,7 +16,7 @@ int bch2_create_trans(struct btree_trans *, subvol_inum, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, struct posix_acl *, - unsigned); + subvol_inum, unsigned); int bch2_link_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, @@ -23,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *); + const struct qstr *, int); int bch2_rename_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 91f52ab9b4e2..ae402d350d4c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -10,7 +10,11 @@ #include "quota.h" #include +#include #include +#include +#include +#include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -292,6 +296,154 @@ err: return 
ret; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? */ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -ENOENT; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_idmap(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + path_put(&path); + return -EXDEV; + } + + ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); + path_put(&path); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -322,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, 
unsigned cmd, unsigned long arg) case FS_IOC_GOINGDOWN: return bch2_ioc_goingdown(c, (u32 __user *) arg); + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_create(c, file, i); + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_destroy(c, file, i); + } + default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0d47d9d5737b..7475830bb33f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -240,12 +240,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_info *inode; int ret; - /* - * debug assert, to be removed when we start creating - * subvolumes/snapshots: - */ - BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); - inode = to_bch_ei(iget5_locked(c->vfs_sb, bch2_inode_hash(inum), bch2_iget5_test, @@ -274,7 +268,8 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, unsigned flags) + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -319,7 +314,7 @@ retry: from_kuid(i_user_ns(&dir->v), current_fsuid()), from_kgid(i_user_ns(&dir->v), current_fsgid()), mode, rdev, - default_acl, acl, flags) ?: + default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) @@ -426,7 +421,8 @@ static int bch2_mknod(struct mnt_idmap *idmap, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 0); + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -493,7 +489,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + int deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -509,7 +506,8 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name)); + &inode_u, &dentry->d_name, + deleting_snapshot)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -527,6 +525,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, -1); +} + static int bch2_symlink(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, const char *symname) @@ -536,7 +539,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, int ret; inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, - BCH_CREATE_TMPFILE); + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -855,7 +858,7 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), file->f_path.dentry, mode, 0, - BCH_CREATE_TMPFILE); + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if 
(IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index aa755987b36c..40898c4d197b 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -146,7 +146,7 @@ struct bch_inode_unpacked; struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, - struct dentry *, umode_t, dev_t, unsigned); + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, @@ -183,6 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, int); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f9a6a0b3ce7a..16a1eae9b374 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -307,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter); + &dir_hash_info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -386,7 +386,8 @@ create_lostfound: BTREE_INSERT_LAZY_RW, bch2_create_trans(trans, root_inum, &root, lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, 0)); + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0)); if (ret) bch_err(c, "error creating lost+found: %i", ret); } @@ -759,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans, { int ret; retry: - ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 47c8fecc6839..64e0b542e779 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1485,7 +1485,7 @@ int bch2_fs_initialize(struct bch_fs *c) &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL, 0)); + NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6418089531ad..6486e709b700 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -307,7 +307,8 @@ static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter) + struct btree_iter *iter, + unsigned update_flags) { struct bkey_i *delete; int ret; @@ -325,7 +326,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, 0); + return bch2_trans_update(trans, iter, delete, update_flags); } static __always_inline @@ -342,7 +343,7 @@ int bch2_hash_delete(struct btree_trans *trans, if (ret) return ret; - ret = bch2_hash_delete_at(trans, desc, info, &iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } -- cgit From 71ed0056dc1f03346eabcdaa37272041e5d52fe9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Sep 2021 01:56:31 -0400 Subject: bcachefs: Fix an assertion We can end up in a strange situation where a btree_path points to a node being freed even after pointers to it should have been replaced by pointers to the new node - if the btree node has been reused since the pointer to it was created. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 73a79563487d..978bb56275de 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -165,7 +165,8 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree_path *path; trans_for_each_path(trans, path) - BUG_ON(path->l[b->c.level].b == b); + BUG_ON(path->l[b->c.level].b == b && + path->l[b->c.level].lock_seq == b->c.lock.state.seq); six_lock_write(&b->c.lock, NULL, NULL); -- cgit From 0476fa948e6fac0b7fa04b5bb0ed30631cbf50ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Sep 2021 13:25:18 -0400 Subject: bcachefs: Rev the on disk format version for snapshots This will cause the compat code to be run that creates entries in the subvolumes and snapshots btrees. 
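A rough sketch, taken from the recovery hunks below, of how the bumped version triggers the compat work (bch2_fs_initialize_subvolumes() creates the root snapshot and subvolume entries):

    if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
            bch2_fs_lazy_rw(c);

            err = "error creating root snapshot node";
            ret = bch2_fs_initialize_subvolumes(c);
            if (ret)
                    goto err;
    }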
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/recovery.c | 21 ++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f922302332ee..54023edc995e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1262,7 +1262,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_snapshot = 12, bcachefs_metadata_version_inode_backpointers = 13, bcachefs_metadata_version_btree_ptr_sectors_written = 14, - bcachefs_metadata_version_max = 15, + bcachefs_metadata_version_snapshot_2 = 15, + bcachefs_metadata_version_max = 16, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 64e0b542e779..6afb37a2e1b0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1004,11 +1004,10 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked inode; - struct bkey_inode_buf *packed; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, BCACHEFS_ROOT_INO), 0); + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1025,13 +1024,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - packed = bch2_trans_kmalloc(trans, sizeof(*packed)); - ret = PTR_ERR_OR_ZERO(packed); - if (ret) - goto err; - - bch2_inode_pack(c, packed, &inode); - ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0); + ret = bch2_inode_write(trans, &iter, &inode); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1096,8 +1089,8 @@ int bch2_fs_recovery(struct bch_fs *c) } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); c->opts.version_upgrade = true; - } else if (c->sb.version < bcachefs_metadata_version_snapshot) { - bch_info(c, "filesystem version is prior to snapshot field - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch_info(c, "filesystem version is prior to snapshots - upgrading"); c->opts.version_upgrade = true; } @@ -1267,7 +1260,9 @@ use_clean: bch_verbose(c, "alloc write done"); } - if (c->sb.version < bcachefs_metadata_version_snapshot) { + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) @@ -1281,7 +1276,7 @@ use_clean: goto err; bch_verbose(c, "reading snapshots done"); - if (c->sb.version < bcachefs_metadata_version_snapshot) { + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { /* set bi_subvol on root inode */ err = "error upgrade root inode for subvolumes"; ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, -- cgit From ea0531f84eec65a1204a13167965bc151e0f072c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Sep 2021 11:09:26 -0400 Subject: bcachefs: Fix check_inode_update_hardlinks() We were incorrectly using bch2_inode_write(), which gets the snapshot ID from the iterator, with a BTREE_ITER_ALL_SNAPSHOTS iterator - fortunately caught by an assertion in the update path. 
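Sketch of the fix (the full hunk follows): with a BTREE_ITER_ALL_SNAPSHOTS iterator the snapshot has to be supplied explicitly, so the update goes through fsck's write_inode() helper rather than bch2_inode_write():

    /* wrong: takes the snapshot from the iterator position */
    ret = bch2_inode_write(&trans, &iter, &u);

    /* fixed: the snapshot of the key being updated is passed explicitly */
    ret = write_inode(&trans, &u, k.k->p.snapshot);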
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 16a1eae9b374..3622fb4d18e2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2094,11 +2094,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(&iter) ?: - bch2_inode_write(&trans, &iter, &u)); + ret = write_inode(&trans, &u, k.k->p.snapshot); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); } -- cgit From e59a4d787507592ea19dbf48bc71b3120ff5df4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Sep 2021 17:51:18 -0400 Subject: bcachefs: Fix a spurious fsck error We were getting spurious "multiple types of data in same bucket" errors in fsck, because the check was running for (cached) stale pointers - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 66367ab9f20a..f14667390e2c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -504,22 +504,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (fsck_err_on(g->mark.data_type && - g->mark.data_type != data_type, c, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], - bch2_data_types[data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } else { - do_update = true; - } - } - if (fsck_err_on(!g->gen_valid, c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", @@ -536,6 +520,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } } + if (fsck_err_on(data_type == BCH_DATA_btree && + g->mark.gen != p.ptr.gen, c, + "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + g2->_mark.data_type = g->_mark.data_type = data_type; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", @@ -566,6 +563,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) do_update = true; + if (p.ptr.gen != g->mark.gen) + continue; + + if (fsck_err_on(g->mark.data_type && + g->mark.data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->mark.data_type], + bch2_data_types[data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (data_type == BCH_DATA_btree) { + 
g2->_mark.data_type = g->_mark.data_type = data_type; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + if (p.has_ec) { struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); -- cgit From 69294246b7a441a112d1a550ffc8e4e1e45142a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Oct 2021 10:08:13 -0400 Subject: bcachefs: Fix allocator shutdown error message We return 1 to indicate kthread_should_stop() returned true - we shouldn't be printing an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54fbfb22d671..fc1b4b354b05 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -857,10 +857,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) ret = 0; - if (ret) { + if (ret < 0) bch_err(ca, "error invalidating buckets: %i", ret); + if (ret) return ret; - } if (journal_seq) ret = bch2_journal_flush_seq(&c->journal, journal_seq); -- cgit From 97996ddfdb9b0b4df22913d04ce01a069a944430 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Sep 2021 19:46:23 -0400 Subject: bcachefs: bch2_subvolume_get() Factor out a little helper. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 27 ++++------------------ fs/bcachefs/fs-common.c | 22 +++++------------- fs/bcachefs/fsck.c | 23 ++++--------------- fs/bcachefs/subvolume.c | 59 ++++++++++++++++++++++--------------------------- fs/bcachefs/subvolume.h | 2 ++ 5 files changed, 41 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 8653a106809d..c7344ac87fcd 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -191,34 +191,15 @@ int __bch2_dirent_read_target(struct btree_trans *trans, if (likely(d.v->d_type != DT_SUBVOL)) { *inum = le64_to_cpu(d.v->d_inum); } else { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_subvolume s; + struct bch_subvolume s; int ret; *subvol = le64_to_cpu(d.v->d_inum); - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, *subvol), - BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - ret = -ENOENT; - goto err; - } - s = bkey_s_c_to_subvolume(k); - *snapshot = le32_to_cpu(s.v->snapshot); - *inum = le64_to_cpu(s.v->inode); -err: - if (ret == -ENOENT && !is_fsck) - bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u", - *subvol); + ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); - bch2_trans_iter_exit(trans, &iter); + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); } return ret; diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 3e8e3c5bf870..00c7ba17f6c8 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -67,26 +67,14 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ - struct btree_iter subvol_iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, - POS(0, snapshot_src.subvol), 0); - k = bch2_btree_iter_peek_slot(&subvol_iter); - - ret = bkey_err(k); - if (!ret && k.k->type != KEY_TYPE_subvolume) { - bch_err(c, 
"subvolume %u not found", - snapshot_src.subvol); - ret = -ENOENT; - } - - if (!ret) - snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); - bch2_trans_iter_exit(trans, &subvol_iter); + struct bch_subvolume s; + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, + BTREE_ITER_CACHED, &s); if (ret) goto err; + + snapshot_src.inum = le64_to_cpu(s.inode); } ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3622fb4d18e2..208bf6df82b5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -103,29 +103,14 @@ static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, static int __subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { - struct btree_iter iter; - struct bkey_s_c k; + struct bch_subvolume s; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvol), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; + ret = bch2_subvolume_get(trans, subvol, false, 0, &s); - if (k.k->type != KEY_TYPE_subvolume) { - bch_err(trans->c, "subvolume %u not fonud", subvol); - ret = -ENOENT; - goto err; - } - - *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); - *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); -err: - bch2_trans_iter_exit(trans, &iter); + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); return ret; - } static int subvol_lookup(struct btree_trans *trans, u32 subvol, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ff3b4d2d86b9..d1c111050c35 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -89,23 +89,6 @@ int bch2_mark_snapshot(struct bch_fs *c, return 0; } -static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; - - if (!ret) - *s = *bkey_s_c_to_subvolume(k).v; - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { @@ -195,7 +178,7 @@ static int bch2_snapshot_check(struct btree_trans *trans, int ret; id = le32_to_cpu(s.v->subvol); - ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol)); + ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); if (ret == -ENOENT) bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", s.k->p.offset, id); @@ -798,34 +781,44 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->snapshot)); } -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, - u32 *snapid) +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) { struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvol), - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES); + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags); k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; - if (k.k->type != KEY_TYPE_subvolume) { + if (ret == -ENOENT && inconsistent_if_not_found) bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); - ret = -EIO; - goto err; - } + if (!ret) + *s = *bkey_s_c_to_subvolume(k).v; - *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); -err: bch2_trans_iter_exit(trans, &iter); return ret; } +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, true, + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES, + &s); + + *snapid = le32_to_cpu(s.snapshot); + return ret; +} + /* XXX: mark snapshot id for deletion, walk btree and delete: */ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, int deleting_snapshot) diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 0740c7b7f772..ed02b982ff96 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -104,6 +104,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) .val_to_text = bch2_subvolume_to_text, \ } +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvolume_delete(struct btree_trans *, u32, int); -- cgit From a9cb0a6706038292bbc22f50546859783ac492bc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 14:53:21 -0400 Subject: bcachefs: Fix bch2_dev_remove_alloc() It was missing a lockrestart_do(), to call bch2_trans_begin() and also handle transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a861ec32dbde..db38d6b0f2ad 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1458,15 +1458,18 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < ca->mi.nbuckets; i++) { - ret = bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i)); + ret = lockrestart_do(&trans, + bch2_btree_key_cache_flush(&trans, + BTREE_ID_alloc, POS(ca->dev_idx, i))); if (ret) break; } bch2_trans_exit(&trans); - if (ret) + if (ret) { + bch_err(c, "error %i removing dev alloc info", ret); return ret; + } return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), -- cgit From 502027a8b2c0f0d46daf948cd1315694a46294e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 14:54:50 -0400 Subject: bcachefs: Ensure btree_path consistent with node iterators Btree node iterators want the interior btree_path to point to the same pos as the returned btree node - this fixes a regression from the introduction of btree_path, where rewriting/updating keys of btree nodes (e.g. in bch2_dev_metadata_drop()) via btree node iterators. 
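A minimal sketch of the fix (the hunk follows): once the node iterator has found a node, the underlying path is moved to that node's key position, so later updates through the same iterator operate on the node that was actually returned:

    /* inside bch2_btree_iter_peek_node()/next_node(), once b is found: */
    bkey_init(&iter->k);
    iter->k.p = iter->pos = b->key.k.p;

    /* keep the path consistent with the node we just returned: */
    iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
                                    iter->flags & BTREE_ITER_INTENT);
    iter->path->should_be_locked = true;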
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index df542a59fe12..042b8bdf4445 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1875,13 +1875,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto out; @@ -1893,7 +1894,11 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; + + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1958,7 +1963,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; + + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); -- cgit From d697b9abbae2f2c387aaa4e702afe12e31984a18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 14:56:56 -0400 Subject: bcachefs: More btree iterator fixes - check for getting to the end of the btree in bch2_path_verify_locks and __btree_path_traverse_all(), this fixes an infinite loop in __btree_path_traverse_all(). - relax requirement in bch2_btree_node_upgrade() that we must want an intent lock, this fixes bugs with paths that point to interior nodes (nonzero level). 
- bch2_btree_node_update_key(): fix it to upgrade the path to an intent lock, if necessary Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 19 +++++++++++++++---- fs/bcachefs/btree_update_interior.c | 9 +++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 042b8bdf4445..b086f05b117b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -170,11 +170,20 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, { struct btree *b = path->l[level].b; - EBUG_ON(btree_lock_want(path, level) != BTREE_NODE_INTENT_LOCKED); - if (!is_btree_node(path, level)) return false; + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + } + if (btree_node_intent_locked(path, level)) return true; @@ -368,7 +377,8 @@ static void bch2_btree_path_verify_locks(struct btree_path *path) unsigned l; if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); return; } @@ -1356,7 +1366,8 @@ retry_all: EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - if (path->nodes_locked) + if (path->nodes_locked || + !btree_path_node(path, path->level)) i++; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 978bb56275de..4ca2de360b22 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1945,9 +1945,16 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; + struct btree_path *path = iter->path; struct closure cl; int ret = 0; + if (!btree_node_intent_locked(path, b->c.level) && + !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { + btree_trans_restart(trans); + return -EINTR; + } + closure_init_stack(&cl); /* @@ -1966,8 +1973,10 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite new_hash = bch2_btree_node_mem_alloc(c); } + path->intent_ref++; ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, skip_triggers); + --path->intent_ref; if (new_hash) { mutex_lock(&c->btree_cache.lock); -- cgit From 56767d66e888e0e998eabbbcffa5c6da49ef5402 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 14:59:00 -0400 Subject: bcachefs: Fixes for usrdata/metadata drop paths These paths weren't updated for btree_path and snapshots - a couple of minor fixes. 
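The key structural change is running bch2_trans_begin() at the top of every loop iteration so that a transaction restart can simply retry; for the metadata (btree node) path the loop now looks roughly like this (do_work() is a hypothetical placeholder for the per-node work):

    bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
                              BTREE_ITER_PREFETCH);

    while (bch2_trans_begin(&trans),        /* reset the transaction each iteration */
           (b = bch2_btree_iter_peek_node(&iter))) {
            ret = do_work(&trans, &iter, b);        /* hypothetical helper */
            if (ret == -EINTR) {                    /* transaction restart: retry this node */
                    ret = 0;
                    continue;
            }
            if (ret)
                    break;

            bch2_btree_iter_next_node(&iter);
    }
    bch2_trans_iter_exit(&trans, &iter);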
Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 7c764ee4ea09..9f9eb799337e 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -51,7 +51,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(&iter)).k && + while ((bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { bch2_btree_iter_advance(&iter); @@ -72,8 +73,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); - ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, sk.k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: @@ -125,12 +124,14 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH, b) { -retry: + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); + + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter))) { if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) - continue; + goto next; bch2_bkey_buf_copy(&k, c, &b->key); @@ -143,14 +144,16 @@ retry: ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(&iter); ret = 0; - goto retry; + continue; } + if (ret) { bch_err(c, "Error updating btree node key: %i", ret); break; } +next: + bch2_btree_iter_next_node(&iter); } bch2_trans_iter_exit(&trans, &iter); -- cgit From 4b09ef12e76c3c0e37ecce6c1e33243d65026398 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 18:08:01 -0400 Subject: bcachefs: Fix bch2_move_btree() bch2_trans_begin() is now required for transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2e7d8e2fe331..5e61cd431ef9 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -883,9 +883,11 @@ static int bch2_move_btree(struct bch_fs *c, id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, - id == start_btree_id ? start_pos : POS_MIN, - BTREE_ITER_PREFETCH, b) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); + + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter))) { if (kthread && kthread_should_stop()) break; @@ -911,6 +913,7 @@ static int bch2_move_btree(struct bch_fs *c, b->data->keys.seq, 0) ?: ret; next: bch2_trans_cond_resched(&trans); + bch2_btree_iter_next_node(&iter); } bch2_trans_iter_exit(&trans, &iter); -- cgit From 107fe5af562303cda985c6bb72d36dbcd2076f06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Oct 2021 18:18:01 -0400 Subject: bcachefs: Fix a pcpu var splat this_cpu_ptr() emits a warning when used without preemption disabled - harmless in this case, as we have other locking where bch2_acc_percpu_u64s() is used. 
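The fix is the usual pattern for taking a per-cpu pointer from a preemptible context when any CPU's instance will do (mirroring the hunk below); using the pointer after preempt_enable() is only safe here because, as noted above, callers already hold other locks:

    u64 *ret;

    /* access to pcpu vars has to be blocked by other locking */
    preempt_disable();
    ret = this_cpu_ptr(p);
    preempt_enable();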
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index f287bca8498d..8211c9a1b6cb 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -893,9 +893,14 @@ void eytzinger0_find_test(void) */ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) { - u64 *ret = this_cpu_ptr(p); + u64 *ret; int cpu; + /* access to pcpu vars has to be blocked by other locking */ + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + for_each_possible_cpu(cpu) { u64 *i = per_cpu_ptr(p, cpu); -- cgit From 7bd68c73044f1ba39f505082bbed3b2e26f69a13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Sep 2021 20:09:08 -0400 Subject: bcachefs: Snapshot deletion fix When we delete a snapshot, we unlink the inode but we don't want to run the inode_rm path - the unlink path deletes the subvolume directly, which does everything we need. Also allowing the inode_rm path to run was getting us "missing subvolume" errors. There's still another bug with snapshot deletion: we need to make snapshot deletion a multi stage process, where we unlink the root dentry, then tear down the page cache, then delete the snapshot. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 00c7ba17f6c8..c49de741e1e3 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -267,18 +267,33 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (deleting_snapshot == 1 && !inode_u->bi_subvol) { - ret = -ENOENT; - goto err; - } - if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (inode_u->bi_subvol) { + if (deleting_snapshot < 0 && + inode_u->bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true, + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES, + &s); + if (ret) + goto err; + + if (BCH_SUBVOLUME_SNAP(&s)) + deleting_snapshot = 1; + } + + if (deleting_snapshot == 1) { + if (!inode_u->bi_subvol) { + ret = -ENOENT; + goto err; + } + ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, deleting_snapshot); if (ret) @@ -297,6 +312,8 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = bch2_btree_iter_traverse(&dirent_iter); if (ret) goto err; + } else { + bch2_inode_nlink_dec(inode_u); } if (inode_u->bi_dir == dirent_iter.pos.inode && @@ -307,7 +324,6 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - bch2_inode_nlink_dec(inode_u); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, -- cgit From e8bde78a178798a414289f2ad5c1f015d8d53139 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Oct 2021 14:15:45 -0400 Subject: bcachefs: Fix rereplicate_pred() It was switching off of the key type incorrectly - this code must've been quite old, and not rereplicating anything that wasn't a btree_ptr_v1 or a plain old extent. 
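For context, the old code defaulted replicas to 0 for any key type its switch didn't list:

        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
                replicas = c->opts.metadata_replicas;
                break;
        case KEY_TYPE_extent:
                replicas = io_opts->data_replicas;
                break;
        /* btree_ptr_v2, reflink_v, etc.: replicas stays 0 */
        }

so for the newer key types the nr_good >= replicas check was trivially true and the key was always skipped. Switching on bkey_is_btree_ptr() instead covers every btree pointer and extent-style key type.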
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 5e61cd431ef9..fbb6c043ad9b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -946,16 +946,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, struct data_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - replicas = c->opts.metadata_replicas; - break; - case KEY_TYPE_extent: - replicas = io_opts->data_replicas; - break; - } + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) return DATA_SKIP; -- cgit From 395576807555fa9ffb2ae038cae1fe2699f85b89 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 12 Oct 2021 21:11:25 -0600 Subject: bcachefs: Add a valgrind memcheck hint Prevent false positives in bch2_varint_decode_fast() Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/varint.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index e87da470c581..5143b603bf67 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -5,6 +5,10 @@ #include #include +#ifdef CONFIG_VALGRIND +#include +#endif + #include "varint.h" /** @@ -96,6 +100,9 @@ int bch2_varint_encode_fast(u8 *out, u64 v) */ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) { +#ifdef CONFIG_VALGRIND + VALGRIND_MAKE_MEM_DEFINED(in, 8); +#endif u64 v = get_unaligned_le64(in); unsigned bytes = ffz(*in) + 1; -- cgit From 776eaddb2cef366b16dac8857899dbb4cc5dfdb1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Oct 2021 14:25:13 -0400 Subject: bcachefs: Fix deletion in __bch2_dev_usrdata_drop() With snapshots, __bch2_dev_usr_data_drop() now uses an ALL_SNAPSHOTS iterator, which isn't an extent iterator - meaning we shouldn't be inserting whiteouts with nonzero size to delete. This fixes a bug where we go RO because we tried to insert an invalid key in the device remove path. Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 9f9eb799337e..94d5d99ffd2a 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -73,6 +73,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&sk.k->k)) + sk.k->k.size = 0; + ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, sk.k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -- cgit From b9a7d8ac5f6d66619de8a4e37b23574d1ca107cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Oct 2021 13:12:26 -0400 Subject: bcachefs: Fix implementation of KEY_TYPE_error When force-removing a device, we were silently dropping extents that we no longer had pointers for - we should have been switching them to KEY_TYPE_error, so that reads for data that was lost return errors. 
This patch adds the logic for switching a key to KEY_TYPE_error to bch2_bkey_drop_ptr(), and improves the logic somewhat. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/extents.c | 90 ++++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/extents.h | 14 ++++---- fs/bcachefs/move.c | 2 +- 4 files changed, 74 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f14667390e2c..dbf44704d0fc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1767,7 +1767,6 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - commit_err = bch2_trans_update(&trans, &iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 966d6ef41793..7f1a5c81ef09 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -479,7 +479,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -784,41 +784,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; bool drop_crc = true; EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; break; } + } + + extent_entry_drop(k, entry); - dst = prev; + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + return ret; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + __bch2_bkey_drop_ptr(k, ptr); + + /* + * If we deleted all the dirty pointers and 
there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } - return dst; + return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -888,10 +932,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_deleted; - return bkey_deleted(k.k); } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index afd3067bb64e..9c2567274a2b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - switch (extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_ptr: - return true; - default: - return false; - } + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) @@ -578,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fbb6c043ad9b..0db0ce503cd5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -195,7 +195,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) extent_for_each_ptr(extent_i_to_s(new), new_ptr) new_ptr->cached = true; - bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); } extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { -- cgit From edeb986b017e9489add4daa7e61bc79cdbfb913c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Oct 2021 13:45:46 -0400 Subject: bcachefs: Don't allocate too-big bios This fixes a null ptr deref in bio_alloc_bioset() -> biovec_slab() Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 002fd35e6bfe..ea2adcc213d0 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -782,6 +782,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ? ((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); + pages = min(pages, BIO_MAX_VECS); + bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); -- cgit From 60816d9ba69bf746e3f6beda4b6e62e914690024 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Oct 2021 11:47:52 -0400 Subject: bcachefs: Improve bch2_dump_trans_paths_updates() Also print the key beyng overwritten for each update. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b086f05b117b..dd0cd4aecc94 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1695,7 +1695,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) struct btree_path *path; struct btree_insert_entry *i; unsigned idx; - char buf[300]; + char buf1[300], buf2[300]; btree_trans_sort_paths(trans); @@ -1704,7 +1704,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) path->idx, path->ref, path->intent_ref, path->preserve ? " preserve" : "", bch2_btree_ids[path->btree_id], - (bch2_bpos_to_text(&PBUF(buf), path->pos), buf), + (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated #else @@ -1712,11 +1712,16 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) #endif ); - trans_for_each_update(trans, i) - printk(KERN_ERR "update: btree %s %s %pS\n", + trans_for_each_update(trans, i) { + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", bch2_btree_ids[i->btree_id], - (bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)), buf), - (void *) i->ip_allocated); + (void *) i->ip_allocated, + (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), + (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + } } static struct btree_path *btree_path_alloc(struct btree_trans *trans, -- cgit From bd547c8acb1aa76ceab1987109ac6ceae698fcc9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Oct 2021 13:14:40 -0400 Subject: bcachefs: Fix __bch2_dirent_read_target() We were shadowing our exist status, oops Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index c7344ac87fcd..cd5468b15ba2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -183,6 +183,7 @@ int __bch2_dirent_read_target(struct btree_trans *trans, u32 *subvol, u32 *snapshot, u64 *inum, bool is_fsck) { + struct bch_subvolume s; int ret = 0; *subvol = 0; @@ -191,9 +192,6 @@ int __bch2_dirent_read_target(struct btree_trans *trans, if (likely(d.v->d_type != DT_SUBVOL)) { *inum = le64_to_cpu(d.v->d_inum); } else { - struct bch_subvolume s; - int ret; - *subvol = le64_to_cpu(d.v->d_inum); ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); -- cgit From 70d61a7036c8d046889a4bf4eda504a6a56b2642 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Oct 2021 11:32:06 -0400 Subject: bcachefs: Zero out reflink_p val in bch2_make_extent_indirect() This bug was only discovered when we started using the 2nd word in the val, which should have been zeroed out as those fields had never been used before - ouch. 
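bch2_make_extent_indirect() turns the original extent key into the reflink_p in place, so whatever bytes the extent's value happened to contain are still sitting there after the retype - only the fields that get explicitly assigned are meaningful. The fix is simply to clear the value before filling it in; roughly:

        orig->k.type = KEY_TYPE_reflink_p;
        r_p = bkey_i_to_reflink_p(orig);
        set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
        memset(&r_p->v, 0, sizeof(r_p->v));     /* don't inherit the old extent's val bytes */
        r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));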
Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 92ff609453b8..c63c95fc49b1 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -166,9 +166,15 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + memset(&r_p->v, 0, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); -- cgit From 6a0f414e2018fe7a2b001fbc8ccd4a8f4f946214 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Sat, 16 Oct 2021 19:13:53 -0600 Subject: bcachefs: Fix compiler warnings Type size_t is architecture-specific. Fix warnings for some non-amd64 arches. Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/subvolume.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 54d8e2f32a37..0bc4681ccc24 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -653,7 +653,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index ed02b982ff96..f98c8c0dbea2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -75,7 +75,7 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) { if (s->nr == s->size) { - size_t new_size = max(s->size, 128UL) * 2; + size_t new_size = max(s->size, (size_t) 128) * 2; u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); if (!d) { -- cgit From ca130b9c5e120994483a34c72526dcd4bf308d84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Oct 2021 14:46:57 -0400 Subject: bcachefs: Fix a cache coherency bug in bch2_subvolume_create() Subvolume deletion doesn't flush & evict the btree key cache - ideally it would, but that's tricky, so instead bch2_subvolume_create() needs to make sure the slot doesn't exist in the key cache to avoid triggering assertions. 
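The underlying issue: deleting a subvolume removes the key from the btree, but a stale entry for that position can remain in the btree key cache. If the slot-search loop then picks that position because it looks empty on disk, the later update trips the key cache coherency assertions - so the search has to skip slots that still exist in the key cache. Roughly, inside the loop over BTREE_ID_subvolumes slots:

        if (bkey_deleted(k.k) &&
            !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
                goto found_slot;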
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d1c111050c35..9bd8d61c96fe 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -886,6 +886,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, u32 *new_snapshotid, bool ro) { + struct bch_fs *c = trans->c; struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; @@ -897,7 +898,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) break; - if (bkey_deleted(k.k)) + + /* + * bch2_subvolume_delete() doesn't flush the btree key cache - + * ideally it would but that's tricky + */ + if (bkey_deleted(k.k) && + !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) goto found_slot; } @@ -925,7 +932,7 @@ found_slot: goto err; if (k.k->type != KEY_TYPE_subvolume) { - bch_err(trans->c, "subvolume %u not found", src_subvolid); + bch_err(c, "subvolume %u not found", src_subvolid); ret = -ENOENT; goto err; } -- cgit From 488f97764a9adb68d2ebec0a6e5b96f0f0a7bf38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 01:08:05 -0400 Subject: bcachefs: Fix check_path() across subvolumes Checking of directory structure across subvolumes was broken - we need to look up the snapshot ID of the parent subvolume when crossing subvol boundaries. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 208bf6df82b5..826a3577ee93 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1050,6 +1050,8 @@ static int inode_backpointer_exists(struct btree_trans *trans, { struct btree_iter iter; struct bkey_s_c k; + u32 target_subvol, target_snapshot; + u64 target_inum; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, @@ -1061,7 +1063,15 @@ static int inode_backpointer_exists(struct btree_trans *trans, if (k.k->type != KEY_TYPE_dirent) goto out; - ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; + ret = __bch2_dirent_read_target(trans, bkey_s_c_to_dirent(k), + &target_subvol, + &target_snapshot, + &target_inum, + true); + if (ret) + goto out; + + ret = target_inum == inode->bi_inum; out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1754,7 +1764,17 @@ static int check_path(struct btree_trans *trans, snapshot = snapshot_t(c, snapshot)->equiv; p->nr = 0; - while (inode->bi_inum != BCACHEFS_ROOT_INO) { + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + if (inode->bi_parent_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &snapshot, &inum); + if (ret) + break; + } + ret = lockrestart_do(trans, inode_backpointer_exists(trans, inode, snapshot)); if (ret < 0) -- cgit From dfc276df911cb7bf026482a9af7c30a60726daff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 12:27:47 -0400 Subject: bcachefs: Improve reflink repair code When a reflink pointer points to an indirect extent that doesn't exist, we need to replace it with a KEY_TYPE_error key. 
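The repair itself (done from the gc/fsck path, behind an fsck_err() prompt) allocates a replacement key at the same position and size, types it KEY_TYPE_error, and queues it with bch2_journal_key_insert() - roughly:

        struct bkey_i_error *new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        bkey_init(&new->k);
        new->k.type = KEY_TYPE_error;
        new->k.p    = p.k->p;
        new->k.size = p.k->size;

        ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i);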
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/buckets.c | 51 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dbf44704d0fc..ea3f7339ba58 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -738,7 +738,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, *k, flags); + ret = bch2_mark_key(c, *k, flags); fsck_err: err: if (ret) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2d2bdfb7977d..9c5d18b4efaa 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "movinggc.h" +#include "recovery.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" @@ -1115,10 +1116,9 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + s64 ret = 0; - while (1) { - if (*r_idx >= c->reflink_gc_nr) - goto not_found; + while (*r_idx < c->reflink_gc_nr) { r = genradix_ptr(&c->reflink_gc_table, *r_idx); BUG_ON(!r); @@ -1127,16 +1127,49 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, (*r_idx)++; } + if (*r_idx >= c->reflink_gc_nr || + idx < r->offset - r->size) { + ret = p.k->size; + goto not_found; + } + BUG_ON((s64) r->refcount + add < 0); r->refcount += add; return r->offset - idx; not_found: - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); - return -EIO; + if ((flags & BTREE_TRIGGER_GC) && + (flags & BTREE_TRIGGER_NOATOMIC)) { + /* + * XXX: we're replacing the entire reflink pointer with an error + * key, we should just be replacing the part that was missing: + */ + if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx)) { + struct bkey_i_error *new; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = p.k->p; + new->k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + + } + } else { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + bch2_inconsistent_error(c); + ret = -EIO; + } +fsck_err: + return ret; } static int bch2_mark_reflink_p(struct bch_fs *c, @@ -1168,7 +1201,7 @@ static int bch2_mark_reflink_p(struct bch_fs *c, while (sectors) { ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); - if (ret < 0) + if (ret <= 0) return ret; ret = min_t(s64, ret, sectors); -- cgit From d355c6f4f73060c8f3eba95b9ae72929669f7516 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 14:20:50 -0400 Subject: bcachefs: for_each_btree_node() now returns errors directly This changes for_each_btree_node() to work like for_each_btree_key(), and to that end bch2_btree_iter_peek_node() and next_node() also return error ptrs. 
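Callers that walk btree nodes by hand now have to treat the returned pointer as a possible ERR_PTR; the loop shape used in the converted callers is roughly:

        bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, 0,
                                  BTREE_ITER_PREFETCH);
retry:
        ret = 0;
        while (bch2_trans_begin(&trans),
               (b = bch2_btree_iter_peek_node(&iter)) &&
               !(ret = PTR_ERR_OR_ZERO(b))) {
                /* ... use b ... */
                bch2_btree_iter_next_node(&iter);
        }
        if (ret == -EINTR)
                goto retry;

        bch2_trans_iter_exit(&trans, &iter);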
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_iter.c | 20 +++++++++++++------- fs/bcachefs/btree_iter.h | 10 +++++----- fs/bcachefs/btree_update_interior.c | 4 ++++ fs/bcachefs/debug.c | 2 +- fs/bcachefs/journal_seq_blacklist.c | 4 ++-- fs/bcachefs/migrate.c | 8 ++++++-- fs/bcachefs/move.c | 8 ++++++-- 8 files changed, 39 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ea3f7339ba58..315a78b5ba8b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -806,7 +806,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); __for_each_btree_node(&trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b) { + 0, depth, BTREE_ITER_PREFETCH, b, ret) { bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); @@ -833,7 +833,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index dd0cd4aecc94..339b3657683a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1900,7 +1900,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) - goto out; + goto err; b = btree_path_node(iter->path, iter->path->level); if (!b) @@ -1920,6 +1920,9 @@ out: bch2_btree_iter_verify(iter); return b; +err: + b = ERR_PTR(ret); + goto out; } struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) @@ -1936,7 +1939,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!btree_path_node(path, path->level)) goto out; - bch2_trans_cond_resched(trans); + ret = bch2_trans_cond_resched(trans); + if (ret) + goto err; btree_node_unlock(path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; @@ -1945,7 +1950,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) - goto out; + goto err; /* got to end? 
*/ b = btree_path_node(path, path->level); @@ -1969,10 +1974,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, path, iter->flags); - if (ret) { - b = NULL; - goto out; - } + if (ret) + goto err; b = path->l[path->level].b; } @@ -1989,6 +1992,9 @@ out: bch2_btree_iter_verify(iter); return b; +err: + b = ERR_PTR(ret); + goto out; } /* Iterate across keys (in leaf nodes only) */ diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index feb2fcff1485..1cb4261bd66e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -284,18 +284,18 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) } } -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b) \ +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(&(_iter)); \ - (_b); \ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ (_b) = bch2_btree_iter_next_node(&(_iter))) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b) \ + _flags, _b, _ret) \ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b) + 0, 0, _flags, _b, _ret) static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4ca2de360b22..14ecd3f863de 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1736,6 +1736,10 @@ retry: goto out; b = bch2_btree_iter_peek_node(iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + if (!b || b->data->keys.seq != seq) goto out; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5ffb7f0a3bf6..8b25ef9e1e05 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -318,7 +318,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); i->bytes = strlen(i->buf); err = flush_buf(i); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 68fb2ebd91ac..f84a63ac15af 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -254,7 +254,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) struct btree *b; for_each_btree_node(&trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH, b) + BTREE_ITER_PREFETCH, b, ret) if (test_bit(BCH_FS_STOPPING, &c->flags)) { bch2_trans_exit(&trans); return; @@ -262,7 +262,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) bch2_trans_iter_exit(&trans, &iter); } - ret = bch2_trans_exit(&trans); + bch2_trans_exit(&trans); if (ret) return; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 94d5d99ffd2a..111a41159eb2 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -135,9 +135,10 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); - +retry: while (bch2_trans_begin(&trans), - (b = bch2_btree_iter_peek_node(&iter))) { + (b = 
bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) goto next; @@ -164,6 +165,9 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) next: bch2_btree_iter_next_node(&iter); } + if (ret == -EINTR) + goto retry; + bch2_trans_iter_exit(&trans, &iter); if (ret) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0db0ce503cd5..2f608631cc43 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -885,9 +885,10 @@ static int bch2_move_btree(struct bch_fs *c, bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); - +retry: while (bch2_trans_begin(&trans), - (b = bch2_btree_iter_peek_node(&iter))) { + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -915,6 +916,9 @@ next: bch2_trans_cond_resched(&trans); bch2_btree_iter_next_node(&iter); } + if (ret == -EINTR) + goto retry; + bch2_trans_iter_exit(&trans, &iter); if (kthread && kthread_should_stop()) -- cgit From 9a796fdb06b56a1811f1afdd40b793e2848a990e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 15:08:00 -0400 Subject: bcachefs: bch2_trans_exit() no longer returns errors Now that peek_node()/next_node() are converted to return errors directly, we don't need bch2_trans_exit() to return errors - it's cleaner this way and wasn't used much anymore. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++------ fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update.h | 6 +++--- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/ec.c | 5 +++-- fs/bcachefs/fs-io.c | 7 ++++--- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 9 ++++++--- fs/bcachefs/migrate.c | 4 ++-- fs/bcachefs/move.c | 2 +- fs/bcachefs/quota.c | 6 ++++-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/xattr.c | 2 +- 15 files changed, 31 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 339b3657683a..25ed4f2ce19c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1346,10 +1346,8 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) { - trans->error = true; + if (unlikely(ret == -EIO)) goto out; - } BUG_ON(ret && ret != -EINTR); @@ -2781,7 +2779,7 @@ leaked: #endif } -int bch2_trans_exit(struct btree_trans *trans) +void bch2_trans_exit(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct btree_insert_entry *i; @@ -2831,8 +2829,6 @@ int bch2_trans_exit(struct btree_trans *trans) trans->mem = (void *) 0x1; trans->paths = (void *) 0x1; - - return trans->error ? 
-EIO : 0; } static void __maybe_unused diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1cb4261bd66e..4cd05fd06e64 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -351,7 +351,7 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_begin(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -int bch2_trans_exit(struct btree_trans *); +void bch2_trans_exit(struct btree_trans *); void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 081b82d3848e..14acbdf34f7b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -377,7 +377,6 @@ struct btree_trans { u8 nr_sorted; u8 nr_updates; bool used_mempool:1; - bool error:1; bool in_traverse_all:1; bool restarted:1; bool paths_sorted:1; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 6f19b67c398f..2ffee9029f34 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -120,14 +120,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans, #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ - int _ret, _ret2; \ + int _ret; \ \ bch2_trans_init(&trans, (_c), 0, 0); \ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ _do); \ - _ret2 = bch2_trans_exit(&trans); \ + bch2_trans_exit(&trans); \ \ - _ret ?: _ret2; \ + _ret; \ }) #define trans_for_each_update(_trans, _i) \ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index cd5468b15ba2..26df20ad090c 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -491,7 +491,7 @@ err: if (ret == -EINTR) goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2c538f9b54f8..7dfa052e9765 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1670,11 +1670,12 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); k = bch2_btree_iter_prev(&iter); - if (!IS_ERR_OR_NULL(k.k)) + ret = bkey_err(k); + if (!ret && k.k) idx = k.k->p.offset + 1; bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans); + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 736dd71419a5..079c20cbf10e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2223,7 +2223,8 @@ err: if (ret == -EINTR) goto retry; - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } static int __bch2_truncate_page(struct bch_inode_info *inode, @@ -3125,7 +3126,7 @@ err: if (ret == -EINTR) goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; @@ -3240,7 +3241,7 @@ err: if (ret == -EINTR) goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7475830bb33f..334cd335ff11 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1016,7 +1016,7 @@ err: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 826a3577ee93..a36bc840a62c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -981,7 +981,8 @@ static int check_inodes(struct bch_fs *c, bool full) BUG_ON(ret == -EINTR); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } noinline_for_stack @@ -1659,7 +1660,8 @@ fsck_err: goto retry; bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } /* Get root directory, create if it doesn't exist: */ @@ -1876,7 +1878,8 @@ static int check_directory_structure(struct bch_fs *c) kfree(path.entries); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } struct nlink_table { diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 111a41159eb2..00ba6e1c92ee 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -100,7 +100,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -180,7 +180,7 @@ next: ret = 0; err: - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2f608631cc43..af02f2cf6ee0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -773,7 +773,7 @@ next_nondata: out: bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 9b0f4d3f176d..17fd5bf107bb 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -374,7 +374,8 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) } bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } void bch2_fs_quota_exit(struct bch_fs *c) @@ -452,7 +453,8 @@ int bch2_fs_quota_read(struct bch_fs *c) } bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } /* Enable/disable/delete quotas for an entire filesystem: */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c63c95fc49b1..9bcf4216a286 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -349,7 +349,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(&trans, &inode_iter); } while (ret2 == -EINTR); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 92e58f5c6bbf..51eb19b84a28 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -327,7 +327,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c } bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ff81a25698ff..dcd2f6a91a72 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -316,7 +316,7 @@ err: if (ret == -EINTR) goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; -- cgit From b71717dac64d76879ba3c70cfcfccf57991205f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 15:11:45 -0400 Subject: 
bcachefs: Handle transaction restarts in bch2_blacklist_entries_gc() It shouldn't be necessary when we're only using a single iterator and not doing updates, but that's harder to debug at the moment. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 21 +++++++++++++++------ fs/bcachefs/migrate.c | 1 + fs/bcachefs/move.c | 1 + 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f84a63ac15af..79bc0e49389b 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -253,12 +253,21 @@ void bch2_blacklist_entries_gc(struct work_struct *work) struct btree_iter iter; struct btree *b; - for_each_btree_node(&trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH, b, ret) - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - bch2_trans_exit(&trans); - return; - } + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (ret == -EINTR) + goto retry; + bch2_trans_iter_exit(&trans, &iter); } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 00ba6e1c92ee..6defc33322b3 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -136,6 +136,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: + ret = 0; while (bch2_trans_begin(&trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index af02f2cf6ee0..4e2bd1474a0a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -886,6 +886,7 @@ static int bch2_move_btree(struct bch_fs *c, bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: + ret = 0; while (bch2_trans_begin(&trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { -- cgit From bfe88863cf3063204fc49a04307fa6635554d6e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Oct 2021 17:30:16 -0400 Subject: bcachefs: New on disk format to fix reflink_p pointers We had a bug where reflink_p pointers weren't being initialized to 0, and when we started using the second word, things broke badly. This patch revs the on disk format version and adds cleanup code to zero out the second word of reflink_p pointers before we start using it. 
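Upgrade handling is two-sided: recovery notices the old superblock version and forces an upgrade plus fsck, and the new fix_reflink_p() pass is a no-op on filesystems already at or past the new version - roughly:

        /* in recovery: */
        if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) {
                bch_info(c, "filesystem version is prior to reflink_p fix - upgrading");
                c->opts.version_upgrade = true;
                c->opts.fsck = true;
        }

        /* in fsck: */
        if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
                return 0;       /* reflink_p vals are already guaranteed clean */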
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 8 ++--- fs/bcachefs/fsck.c | 68 ++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/recovery.c | 8 ++--- 3 files changed, 73 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 54023edc995e..579acb69115d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -917,10 +917,7 @@ struct bch_stripe { struct bch_reflink_p { struct bch_val v; __le64 idx; - - __le32 reservation_generation; - __u8 nr_replicas; - __u8 pad[3]; + __le64 v2; }; struct bch_reflink_v { @@ -1263,7 +1260,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_inode_backpointers = 13, bcachefs_metadata_version_btree_ptr_sectors_written = 14, bcachefs_metadata_version_snapshot_2 = 15, - bcachefs_metadata_version_max = 16, + bcachefs_metadata_version_reflink_p_fix = 16, + bcachefs_metadata_version_max = 17, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a36bc840a62c..b43c31b95dff 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2154,6 +2154,71 @@ static int check_nlinks(struct bch_fs *c) return ret; } +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + + p = bkey_s_c_to_reflink_p(k); + + if (!p.v->v2) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(&u->k_i, k); + u->v.v2 = 0; + + return bch2_trans_update(trans, iter, &u->k_i, 0); +} + +static int fix_reflink_p(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->type == KEY_TYPE_reflink_p) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter)); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + /* * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
* Doesn't fix them yet, mainly because they haven't yet been observed: @@ -2168,7 +2233,8 @@ int bch2_fsck_full(struct bch_fs *c) check_xattrs(c) ?: check_root(c) ?: check_directory_structure(c) ?: - check_nlinks(c); + check_nlinks(c) ?: + fix_reflink_p(c); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6afb37a2e1b0..8c53b1e977d1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1086,12 +1086,10 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { - bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); - c->opts.version_upgrade = true; - } else if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - bch_info(c, "filesystem version is prior to snapshots - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) { + bch_info(c, "filesystem version is prior to reflink_p fix - upgrading"); c->opts.version_upgrade = true; + c->opts.fsck = true; } ret = bch2_blacklist_table_initialize(c); -- cgit From 6d76aefea1902a11c47e20fec5495d30a39891f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Oct 2021 09:54:47 -0400 Subject: bcachefs: Fix for leaking of reflinked extents When a reflink pointer points to only part of an indirect extent, and then that indirect extent is fragmented (e.g. by copygc), if the reflink pointer only points to one of the fragments we leak a reference. Fix this by storing front/back pad values in reflink pointers - when inserting reflink pointesr, we initialize them to cover the full range of the indirect extents we reference. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 15 ++++++++++++--- fs/bcachefs/buckets.c | 45 +++++++++++++++++++++++++++++++++++++------ fs/bcachefs/fsck.c | 5 +++-- fs/bcachefs/reflink.c | 4 ++++ 4 files changed, 58 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 579acb69115d..4b2bf8f7b28a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -917,15 +917,24 @@ struct bch_stripe { struct bch_reflink_p { struct bch_val v; __le64 idx; - __le64 v2; -}; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. 
+ */ + __le32 front_pad; + __le32 back_pad; +} __attribute__((packed, aligned(8))); struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; __u64 _data[0]; -}; +} __attribute__((packed, aligned(8))); struct bch_indirect_inline_data { struct bch_val v; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9c5d18b4efaa..ee1c71e011c7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1180,8 +1180,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = p.k->size; + u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + u64 sectors = (u64) le32_to_cpu(p.v->front_pad) + + le32_to_cpu(p.v->back_pad) + + p.k->size; s64 ret = 0; BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == @@ -1758,12 +1760,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, bch2_fs_inconsistent(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); ret = -EIO; goto err; } - BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u idx %llu indirect extent refcount underflow", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(k.k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + le64_add_cpu(refcount, add); if (!*refcount) { @@ -1786,10 +1809,20 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c k, unsigned flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = p.k->size; + u64 idx, sectors; s64 ret = 0; + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + + v->front_pad = v->back_pad = 0; + } + + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + sectors = (u64) le32_to_cpu(p.v->front_pad) + + le32_to_cpu(p.v->back_pad) + + p.k->size; + while (sectors) { ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); if (ret < 0) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b43c31b95dff..c99e1514fd4f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2174,7 +2174,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) p = bkey_s_c_to_reflink_p(k); - if (!p.v->v2) + if (!p.v->front_pad && !p.v->back_pad) return 0; u = bch2_trans_kmalloc(trans, sizeof(*u)); @@ -2183,7 +2183,8 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) return ret; bkey_reassemble(&u->k_i, k); - u->v.v2 = 0; + u->v.front_pad = 0; + u->v.back_pad = 0; return bch2_trans_update(trans, iter, &u->k_i, 0); } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 9bcf4216a286..2827d0ef1019 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -32,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_bytes(p.k) != sizeof(*p.v)) 
return "incorrect value size"; + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) + return "idx < front_pad"; + return NULL; } -- cgit From 6e0c886d3ccd81d87054269b96de6e4eb6ba0edd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Oct 2021 17:59:38 -0400 Subject: bcachefs: Fix check_path() for snapshots check_path() wasn't checking the snapshot ID when checking for directory structure loops - so, snapshots would cause us to detect a loop that wasn't actually a loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 64 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c99e1514fd4f..d6f37b9e00fb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1357,10 +1357,10 @@ static int check_dirent_target(struct btree_trans *trans, } if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" + "inode %llu:%u has wrong backpointer:\n" "got %llu:%llu\n" "should be %llu:%llu", - target->bi_inum, + target->bi_inum, target_snapshot, target->bi_dir, target->bi_dir_offset, d.k->p.inode, @@ -1730,10 +1730,23 @@ struct pathbuf { struct pathbuf_entry { u64 inum; + u32 snapshot; } *entries; }; -static int path_down(struct pathbuf *p, u64 inum) +static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + + for (i = p->entries; i < p->entries + p->nr; i++) + if (i->inum == inum && + i->snapshot == snapshot) + return true; + + return false; +} + +static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) { if (p->nr == p->size) { size_t new_size = max_t(size_t, 256UL, p->size * 2); @@ -1749,18 +1762,23 @@ static int path_down(struct pathbuf *p, u64 inum) }; p->entries[p->nr++] = (struct pathbuf_entry) { - .inum = inum, + .inum = inum, + .snapshot = snapshot, }; return 0; } +/* + * Check that a given inode is reachable from the root: + * + * XXX: we should also be verifying that inodes are in the right subvolumes + */ static int check_path(struct btree_trans *trans, struct pathbuf *p, struct bch_inode_unpacked *inode, u32 snapshot) { struct bch_fs *c = trans->c; - size_t i; int ret = 0; snapshot = snapshot_t(c, snapshot)->equiv; @@ -1768,17 +1786,19 @@ static int check_path(struct btree_trans *trans, while (!(inode->bi_inum == BCACHEFS_ROOT_INO && inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + u32 parent_snapshot = snapshot; + if (inode->bi_parent_subvol) { u64 inum; ret = subvol_lookup(trans, inode->bi_parent_subvol, - &snapshot, &inum); + &parent_snapshot, &inum); if (ret) break; } ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode, snapshot)); + inode_backpointer_exists(trans, inode, parent_snapshot)); if (ret < 0) break; @@ -1797,17 +1817,31 @@ static int check_path(struct btree_trans *trans, if (!S_ISDIR(inode->bi_mode)) break; - ret = path_down(p, inode->bi_inum); + ret = path_down(p, inode->bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; } - for (i = 0; i < p->nr; i++) { - if (inode->bi_dir != p->entries[i].inum) - continue; + snapshot = parent_snapshot; + + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; + } + + if (path_is_dup(p, inode->bi_inum, snapshot)) { + struct pathbuf_entry *i; /* XXX print path */ + bch_err(c, 
"directory structure loop"); + + for (i = p->entries; i < p->entries + p->nr; i++) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + if (!fsck_err(c, "directory structure loop")) return 0; @@ -1819,14 +1853,6 @@ static int check_path(struct btree_trans *trans, } ret = reattach_inode(trans, inode, snapshot); - break; - } - - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); - break; } } fsck_err: -- cgit From 521b80676927df6b4ad4fc87b7756aad43b96d12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Oct 2021 20:50:07 -0400 Subject: bcachefs: Delete dentry when deleting snapshots This fixes a bug where subsequently doing creates with the same name fails. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index ae402d350d4c..a12b591ec9ca 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -422,6 +422,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { struct path path; + struct inode *dir; int ret = 0; if (arg.flags) @@ -438,7 +439,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, return -EXDEV; } - ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, 1); + if (!ret) { + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); + } path_put(&path); return ret; -- cgit From d17bc1739c5adaf9421cbc51b40e50da677c6b54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Oct 2021 00:38:13 -0400 Subject: bcachefs: cached data shouldn't prevent fs from mounting It's not an error if we don't have cached data - skip BCH_DATA_cached in bch2_have_enough_devs(). Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 64fdf53a630a..57e093983dfc 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1017,6 +1017,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; + if (e->data_type == BCH_DATA_cached) + continue; + for (i = 0; i < e->nr_devs; i++) { struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); -- cgit From e5fa91d7ac88ac6a8385c14dbc8dcbe1a053e62f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Oct 2021 12:05:21 -0400 Subject: bcachefs: Fix restart handling in for_each_btree_key() Code that uses for_each_btree_key often wants transaction restarts to be handled locally and not returned. Originally, we wouldn't return transaction restarts if there was a single iterator in the transaction - the reasoning being if there weren't other iterators being invalidated, and the current iterator was being advanced/retraversed, there weren't any locks or iterators we were required to preserve. 
But with the btree_path conversion that approach doesn't work anymore - even when we're using for_each_btree_key() with a single iterator there will still be two paths in the transaction, since we now always preserve the path at the pos the iterator was initialized at - the reason being that on restart we often restart from the same place. And it turns out there's now a lot of for_each_btree_key() uses that _do not_ want transaction restarts handled locally, and should be returning them. This patch splits out for_each_btree_key_norestart() and for_each_btree_key_continue_norestart(), and converts existing users as appropriate. for_each_btree_key(), for_each_btree_key_continue(), and for_each_btree_node() now handle transaction restarts themselves by calling bch2_trans_begin() when necessary - and the old hack to not return transaction restarts when there's a single path in the transaction has been deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 - fs/bcachefs/btree_iter.c | 14 +----- fs/bcachefs/btree_iter.h | 98 +++++++++++++++++++++++++++-------------- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/extent_update.c | 4 +- fs/bcachefs/fs-io.c | 8 ++-- fs/bcachefs/io.c | 2 +- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/str_hash.h | 8 ++-- fs/bcachefs/xattr.c | 2 +- 11 files changed, 83 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4b2bf8f7b28a..6ed3ca075ba0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1041,8 +1041,6 @@ LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -#define BCH_TIER_MAX 4U - #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 25ed4f2ce19c..2b51245dd5fe 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1511,19 +1511,11 @@ static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - int ret; - if (path->uptodate < BTREE_ITER_NEED_RELOCK) return 0; - ret = bch2_trans_cond_resched(trans) ?: + return bch2_trans_cond_resched(trans) ?: btree_path_traverse_one(trans, path, flags, _RET_IP_); - if (unlikely(ret) && hweight64(trans->paths_allocated) == 1) { - ret = __btree_path_traverse_all(trans, ret, _RET_IP_); - BUG_ON(ret == -EINTR); - } - - return ret; } static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, @@ -1937,10 +1929,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!btree_path_node(path, path->level)) goto out; - ret = bch2_trans_cond_resched(trans); - if (ret) - goto err; - btree_node_unlock(path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; path->level++; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4cd05fd06e64..fea1101155be 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -284,11 +284,39 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) } } +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_init(struct btree_trans *, 
struct btree_iter *, + unsigned, struct bpos, unsigned); +void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +static inline void set_btree_iter_dontneed(struct btree_iter *iter) +{ + iter->path->preserve = false; +} + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_trans_begin(struct btree_trans *); + +static inline struct btree * +__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + PTR_ERR_OR_ZERO(b) == -EINTR) + bch2_trans_begin(trans); + + return b; +} + #define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ _locks_want, _depth, _flags, _b, _ret) \ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags), \ - _b = bch2_btree_iter_peek_node(&(_iter)); \ + _start, _locks_want, _depth, _flags); \ + (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ (_b) = bch2_btree_iter_next_node(&(_iter))) @@ -297,6 +325,11 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _ret) +static inline int bkey_err(struct bkey_s_c k) +{ + return PTR_ERR_OR_ZERO(k.k); +} + static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) { @@ -305,51 +338,50 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, : bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, - unsigned flags) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS - ? 
bch2_btree_iter_next_slot(iter) - : bch2_btree_iter_next(iter); -} + struct bkey_s_c k; -static inline int bkey_err(struct bkey_s_c k) -{ - return PTR_ERR_OR_ZERO(k.k); + while (k = __bch2_btree_iter_peek(iter, flags), + bkey_err(k) == -EINTR) + bch2_trans_begin(trans); + + return k; } #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(&(_iter), _flags)) + bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ - for ((_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(&(_iter), _flags)) + bch2_btree_iter_advance(&(_iter))) -/* new multiple iterator interface: */ - -void bch2_dump_trans_paths_updates(struct btree_trans *); +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, - unsigned, struct bpos, unsigned); -void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -static inline void set_btree_iter_dontneed(struct btree_iter *iter) -{ - iter->path->preserve = false; -} +/* new multiple iterator interface: */ -void *bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_begin(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); void bch2_trans_exit(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b4a2f2e32248..b344979ca7ed 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1216,7 +1216,7 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; - for_each_btree_key(trans, iter, btree_id, pos, + for_each_btree_key_norestart(trans, iter, btree_id, pos, BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (bkey_cmp(k.k->p, pos)) break; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 26df20ad090c..00dac68701f5 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -432,7 +432,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) if (ret) return ret; - for_each_btree_key(trans, iter, BTREE_ID_dirents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, SPOS(dir.inum, 0, snapshot), 0, k, ret) { if (k.k->p.inode > dir.inum) break; @@ -464,7 +464,7 @@ retry: if (ret) 
goto err; - for_each_btree_key(&trans, iter, BTREE_ID_dirents, + for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { if (k.k->p.inode > inum.inum) break; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 9d959b053def..58b2c96f450c 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -61,7 +61,7 @@ static int count_iters_for_insert(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c r_k; - for_each_btree_key(trans, iter, + for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), @@ -120,7 +120,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_continue(copy, 0, k, ret) { + for_each_btree_key_continue_norestart(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 079c20cbf10e..f4c97fc0e3d1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1815,7 +1815,7 @@ retry: if (err) goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) @@ -2208,7 +2208,7 @@ retry: if (ret) goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -3111,7 +3111,7 @@ retry: if (ret) goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; @@ -3218,7 +3218,7 @@ retry: if (ret) goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ea2adcc213d0..772fdeb722c7 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -216,7 +216,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 2827d0ef1019..8e66e6390e62 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -131,7 +131,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, + for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { if (reflink_iter.pos.inode) { @@ -194,7 +194,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue(*iter, 0, k, ret) { + for_each_btree_key_continue_norestart(*iter, 0, k, ret) { if 
(bkey_cmp(iter->pos, end) >= 0) break; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6486e709b700..3e54d0b0fb5c 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key(trans, *iter, desc.btree_id, + for_each_btree_key_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { if (iter->pos.inode != inum.inum) @@ -192,7 +192,7 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key(trans, *iter, desc.btree_id, + for_each_btree_key_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inum.inum) @@ -220,7 +220,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_advance(&iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -253,7 +253,7 @@ int bch2_hash_set(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key(trans, iter, desc.btree_id, + for_each_btree_key_norestart(trans, iter, desc.btree_id, SPOS(inum.inum, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index dcd2f6a91a72..181af89b0553 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -295,7 +295,7 @@ retry: if (ret) goto err; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); -- cgit From 4db650277d42c0c80cde8fa3571ff1fb2fded8d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Oct 2021 12:06:02 -0400 Subject: bcachefs: Subvol dirents are now only visible in parent subvol This changes the on disk format for dirents that point to subvols so that they also record the subvolid of the parent subvol, so that we can filter them out in other subvolumes. This also updates the dirent code to do that filtering, and in particular tweaks the rename code - we need to ensure that there's only ever one dirent (counting multiplicities in different snapshots) that point to a subvolume. 
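In other words, a DT_SUBVOL dirent now records both the child and the parent subvolume, and is filtered out when read from any subvolume other than its parent. Condensed from the hunks below (target encoding plus the new visibility hook):

	/* target encoding in struct bch_dirent: */
	union {
		__le64	d_inum;			/* normal dirents */
		struct {			/* DT_SUBVOL dirents */
			__le32	d_child_subvol;
			__le32	d_parent_subvol;
		};
	};

	/* visibility filter used by the dirent hash lookups: */
	static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
	{
		struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

		if (d.v->d_type == DT_SUBVOL)
			return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
		return true;
	}
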
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 9 ++- fs/bcachefs/dirent.c | 180 ++++++++++++++++++++++++++---------------- fs/bcachefs/dirent.h | 3 - fs/bcachefs/fsck.c | 152 +++++++++++++++++++++++------------ fs/bcachefs/recovery.c | 4 +- fs/bcachefs/str_hash.h | 13 ++- 6 files changed, 232 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6ed3ca075ba0..481bf643bd6f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -793,7 +793,13 @@ struct bch_dirent { struct bch_val v; /* Target inode number: */ + union { __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; /* * Copy of mode bits 12-15 from the target inode - so userspace can get @@ -1268,7 +1274,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_btree_ptr_sectors_written = 14, bcachefs_metadata_version_snapshot_2 = 15, bcachefs_metadata_version_reflink_p_fix = 16, - bcachefs_metadata_version_max = 17, + bcachefs_metadata_version_subvol_dirent = 17, + bcachefs_metadata_version_max = 18, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 00dac68701f5..2ab9cbaf71f2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -64,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); } +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) + return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; + return true; +} + const struct bch_hash_desc bch2_dirent_hash_desc = { .btree_id = BTREE_ID_dirents, .key_type = KEY_TYPE_dirent, @@ -71,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .hash_bkey = dirent_hash_bkey, .cmp_key = dirent_cmp_key, .cmp_bkey = dirent_cmp_bkey, + .is_visible = dirent_is_visible, }; const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -114,14 +124,18 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", d.v->d_inum, + pr_buf(out, " -> %llu type %s", + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), d.v->d_type < BCH_DT_MAX ? 
bch2_d_types[d.v->d_type] : "(bad d_type)"); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - u8 type, const struct qstr *name, u64 dst) + subvol_inum dir, u8 type, + const struct qstr *name, u64 dst) { struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); @@ -137,7 +151,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; - dirent->v.d_inum = cpu_to_le64(dst); + + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); + } + dirent->v.d_type = type; memcpy(dirent->v.d_name, name->name, name->len); @@ -159,7 +180,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, type, name, dst_inum); + dirent = dirent_create_key(trans, dir, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -178,45 +199,30 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -int __bch2_dirent_read_target(struct btree_trans *trans, - struct bkey_s_c_dirent d, - u32 *subvol, u32 *snapshot, u64 *inum, - bool is_fsck) +static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) { struct bch_subvolume s; int ret = 0; - *subvol = 0; - *snapshot = d.k->p.snapshot; + if (d.v->d_type == DT_SUBVOL && + d.v->d_parent_subvol != dir.subvol) + return 1; if (likely(d.v->d_type != DT_SUBVOL)) { - *inum = le64_to_cpu(d.v->d_inum); + target->subvol = dir.subvol; + target->inum = le64_to_cpu(d.v->d_inum); } else { - *subvol = le64_to_cpu(d.v->d_inum); + target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); - *snapshot = le32_to_cpu(s.snapshot); - *inum = le64_to_cpu(s.inode); + target->inum = le64_to_cpu(s.inode); } return ret; } -static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) -{ - u32 snapshot; - int ret = 0; - - ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, - &target->inum, false); - if (!target->subvol) - target->subvol = dir.subvol; - - return ret; -} - int bch2_dirent_rename(struct btree_trans *trans, subvol_inum src_dir, struct bch_hash_info *src_hash, subvol_inum dst_dir, struct bch_hash_info *dst_hash, @@ -230,6 +236,7 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + unsigned src_type = 0, dst_type = 0, src_update_flags = 0; int ret = 0; if (src_dir.subvol != dst_dir.subvol) @@ -238,36 +245,6 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); - /* - * Lookup dst: - * - * Note that in BCH_RENAME mode, we're _not_ checking if - * the target already exists - we're relying on the VFS - * to do that check for us for correctness: - */ - ret = mode == BCH_RENAME - ? 
bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name) - : bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_dst = bch2_btree_iter_peek_slot(&dst_iter); - ret = bkey_err(old_dst); - if (ret) - goto out; - - if (mode != BCH_RENAME) { - ret = bch2_dirent_read_target(trans, dst_dir, - bkey_s_c_to_dirent(old_dst), dst_inum); - if (ret) - goto out; - } - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter.pos.offset; - /* Lookup src: */ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, src_hash, src_dir, src_name, @@ -285,8 +262,51 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; + src_type = bkey_s_c_to_dirent(old_src).v->d_type; + + if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) + return -EOPNOTSUPP; + + + /* Lookup dst: */ + if (mode == BCH_RENAME) { + /* + * Note that we're _not_ checking if the target already exists - + * we're relying on the VFS to do that check for us for + * correctness: + */ + ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name); + if (ret) + goto out; + } else { + ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + + dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; + + if (dst_type == DT_SUBVOL) + return -EOPNOTSUPP; + } + + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; + /* Create new dst key: */ - new_dst = dirent_create_key(trans, 0, dst_name, 0); + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -296,7 +316,7 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -326,10 +346,9 @@ int bch2_dirent_rename(struct btree_trans *trans, * If we're not overwriting, we can just insert * new_dst at the src position: */ - new_dst->k.p = src_iter.pos; - bch2_trans_update(trans, &src_iter, - &new_dst->k_i, 0); - goto out_set_offset; + new_src = new_dst; + new_src->k.p = src_iter.pos; + goto out_set_src; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -350,9 +369,25 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, &src_iter, &new_src->k_i, 0); bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); -out_set_offset: +out_set_src: + + /* + * If we're deleting a subvolume, we need to really delete the dirent, + * not just emit a whiteout in the current snapshot: + */ + if (src_type == DT_SUBVOL) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter); + if (ret) + goto out; + + new_src->k.p = src_iter.pos; + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + + bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; @@ -393,6 +428,8 @@ int __bch2_dirent_lookup_trans(struct 
btree_trans *trans, d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; if (ret) bch2_trans_iter_exit(trans, iter); @@ -453,6 +490,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + subvol_inum target; u32 snapshot; int ret; @@ -474,6 +512,12 @@ retry: dirent = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + if (ret < 0) + break; + if (ret) + continue; + /* * XXX: dir_emit() can fault and block, while we're holding * locks @@ -481,7 +525,7 @@ retry: ctx->pos = dirent.k->p.offset; if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), - le64_to_cpu(dirent.v->d_inum), + target.inum, vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e7f65fbd8e65..8ae407765fe4 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -33,9 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, - u32 *, u32 *, u64 *, bool); - static inline unsigned vfs_d_type(unsigned type) { return type == DT_SUBVOL ? DT_DIR : type; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index d6f37b9e00fb..58d42734c252 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -134,10 +134,11 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (ret) goto err; - *snapshot = iter.pos.snapshot; ret = k.k->type == KEY_TYPE_inode ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; + if (!ret) + *snapshot = iter.pos.snapshot; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1045,46 +1046,60 @@ static int fix_overlapping_extent(struct btree_trans *trans, } #endif +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (!ret && k.k->type != KEY_TYPE_dirent) + ret = -ENOENT; + if (ret) { + bch2_trans_iter_exit(trans, iter); + return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; + } + + return bkey_s_c_to_dirent(k); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int inode_backpointer_exists(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { struct btree_iter iter; - struct bkey_s_c k; - u32 target_subvol, target_snapshot; - u64 target_inum; + struct bkey_s_c_dirent d; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + ret = bkey_err(d.s_c); if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) - goto out; - - ret = __bch2_dirent_read_target(trans, bkey_s_c_to_dirent(k), - &target_subvol, - &target_snapshot, - &target_inum, - true); - if (ret) - goto out; + return ret; - ret = target_inum == inode->bi_inum; -out: + ret = dirent_points_to_inode(d, inode); bch2_trans_iter_exit(trans, &iter); return ret; } -static bool inode_backpointer_matches(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.k->p.inode == inode->bi_dir && - d.k->p.offset == inode->bi_dir_offset; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1326,7 +1341,7 @@ static int check_dirent_target(struct btree_trans *trans, goto err; } - if (!inode_backpointer_matches(d, target)) { + if (!inode_points_to_dirent(target, d)) { ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); if (ret < 0) goto err; @@ -1394,8 +1409,34 @@ static int check_dirent_target(struct btree_trans *trans, BTREE_INSERT_LAZY_RW, bch2_trans_update(trans, iter, &n->k_i, 0)); kfree(n); - if (ret) + + return ret ?: -EINTR; + } + + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || + fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + struct bkey_i_dirent *n; + + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; goto err; + } + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_trans_update(trans, iter, &n->k_i, 0)); + kfree(n); + + return ret ?: -EINTR; } err: fsck_err: @@ -1412,9 +1453,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; struct bkey_s_c_dirent d; struct inode_walker_entry *i; - u32 target_snapshot; - u32 target_subvol; - u64 target_inum; char buf[200]; int ret; @@ -1482,21 +1520,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); - ret = __bch2_dirent_read_target(trans, d, - &target_subvol, - &target_snapshot, - &target_inum, - true); - if (ret && ret != -ENOENT) - return ret; + if (d.v->d_type == DT_SUBVOL) { + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; - if (fsck_err_on(ret, c, - "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_inum))) - return remove_dirent(trans, d.k->p); + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && ret != -ENOENT) + return ret; - if (target_subvol) { - struct bch_inode_unpacked subvol_root; 
+ if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", + le64_to_cpu(d.v->d_child_subvol))) + return remove_dirent(trans, d.k->p); ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); @@ -1526,7 +1564,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; } else { - ret = __get_visible_inodes(trans, target, s, target_inum); + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) return ret; @@ -1786,9 +1824,11 @@ static int check_path(struct btree_trans *trans, while (!(inode->bi_inum == BCACHEFS_ROOT_INO && inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_parent_subvol) { + if (inode->bi_subvol) { u64 inum; ret = subvol_lookup(trans, inode->bi_parent_subvol, @@ -1798,11 +1838,18 @@ static int check_path(struct btree_trans *trans, } ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode, parent_snapshot)); - if (ret < 0) + PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot))).k)); + if (ret && ret != -ENOENT) break; - if (!ret) { + if (!ret && !dirent_points_to_inode(d, inode)) { + bch2_trans_iter_exit(trans, &dirent_iter); + ret = -ENOENT; + } + + if (ret == -ENOENT) { if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", inode->bi_inum, snapshot, mode_to_type(inode->bi_mode), @@ -1812,7 +1859,8 @@ static int check_path(struct btree_trans *trans, ret = reattach_inode(trans, inode, snapshot); break; } - ret = 0; + + bch2_trans_iter_exit(trans, &dirent_iter); if (!S_ISDIR(inode->bi_mode)) break; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8c53b1e977d1..6bf9c48a7871 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1086,8 +1086,8 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) { - bch_info(c, "filesystem version is prior to reflink_p fix - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); c->opts.version_upgrade = true; c->opts.fsck = true; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 3e54d0b0fb5c..789dde7c6ac6 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -138,8 +138,15 @@ struct bch_hash_desc { u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); bool (*cmp_key)(struct bkey_s_c, const void *); bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + bool (*is_visible)(subvol_inum inum, struct bkey_s_c); }; +static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) +{ + return k.k->type == desc.key_type && + (!desc.is_visible || desc.is_visible(inum, k)); +} + static __always_inline int bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, @@ -162,7 +169,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (iter->pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) return 0; } else if (k.k->type == KEY_TYPE_hash_whiteout) { @@ -198,7 +205,7 @@ bch2_hash_hole(struct btree_trans *trans, if (iter->pos.inode != inum.inum) break; - if (k.k->type != desc.key_type) + if (!is_visible_key(desc, 
inum, k)) return 0; } bch2_trans_iter_exit(trans, iter); @@ -261,7 +268,7 @@ int bch2_hash_set(struct btree_trans *trans, if (iter.pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; -- cgit From de924abbe7a62bdeb6baaba0f2fe2d1c64ef888b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Oct 2021 14:33:31 -0400 Subject: bcachefs: Fix error handling in bch2_trans_extent_merging The back merging case wasn't returning errors correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b344979ca7ed..22fce150781e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1058,7 +1058,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) - goto out; + goto err; } goto next; @@ -1178,8 +1178,11 @@ next: goto out; } - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) - extent_back_merge(trans, &iter, insert, k); + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, &iter, insert, k); + if (ret) + goto err; + } out: if (!bkey_deleted(&insert->k)) { /* -- cgit From ab44d7bdeebe316a03f37c08c1f66c13ea9aa5d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Oct 2021 17:33:38 -0400 Subject: bcachefs: Fix a transaction path overflow readdir() in a directory with many subvolumes could overflow transaction paths - this is a simple hack around the issue. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 2ab9cbaf71f2..6be3ec4ec4a6 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -529,6 +529,15 @@ retry: vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; + + /* + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ + if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) { + ret = -EINTR; + break; + } } bch2_trans_iter_exit(&trans, &iter); err: -- cgit From 114eea75c73dfd95ae529eb841aad0330793f446 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Oct 2021 11:57:47 -0400 Subject: bcachefs: Fix dev accounting after device add This is a hacky but effective fix to device usage stats for superblock and journal being wrong on a newly added device (following the comment that already told us how it needed to be done!) 
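The core of the change, condensed from the hunk below (in bch2_dev_add(); declarations and error handling omitted):

	/*
	 * Clear the in-memory bucket marks before marking transactionally
	 * in the btree, so per-device accounting starts from zero:
	 */
	down_read(&ca->bucket_lock);
	buckets = bucket_array(ca);
	for_each_bucket(g, buckets)
		atomic64_set(&g->_mark.v, 0);
	up_read(&ca->bucket_lock);

	ret = bch2_trans_mark_dev_sb(c, ca);
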
Reported-by: Chris Webb Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index db38d6b0f2ad..d17e4f005b3f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1591,6 +1591,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; + struct bucket_array *buckets; + struct bucket *g; unsigned dev_idx, nr_devices, u64s; int ret; @@ -1694,6 +1696,16 @@ have_slot: bch2_dev_usage_journal_reserve(c); + /* + * Clear marks before marking transactionally in the btree, so that + * per-device accounting gets done correctly: + */ + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for_each_bucket(g, buckets) + atomic64_set(&g->_mark.v, 0); + up_read(&ca->bucket_lock); + err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); if (ret) -- cgit From b0d1b70af85718a58e4edd796251a3600a20b6d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Oct 2021 16:40:05 -0400 Subject: bcachefs: Must check for errors from bch2_trans_cond_resched() But we don't need to call it from outside the btree iterator code anymore, since it's called by bch2_trans_begin() and bch2_btree_path_traverse(). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 -- fs/bcachefs/btree_gc.c | 2 -- fs/bcachefs/btree_iter.c | 15 +++++++++++++++ fs/bcachefs/btree_iter.h | 15 --------------- fs/bcachefs/btree_update_leaf.c | 2 -- fs/bcachefs/fsck.c | 2 -- fs/bcachefs/move.c | 2 -- 7 files changed, 15 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fc1b4b354b05..2551ccc99d38 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -367,8 +367,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) POS(ca->dev_idx, ca->mi.first_bucket)); while (iter.pos.offset < ca->mi.nbuckets) { - bch2_trans_cond_resched(&trans); - ret = bch2_alloc_write_key(&trans, &iter, flags); if (ret) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 315a78b5ba8b..75d881b7a6c9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -828,8 +828,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); } - - bch2_trans_cond_resched(&trans); } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2b51245dd5fe..189c020d289a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -24,6 +24,21 @@ static inline void btree_path_list_add(struct btree_trans *, struct btree_path * static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline int bch2_trans_cond_resched(struct btree_trans *trans) +{ + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); + schedule(); + return bch2_trans_relock(trans) ? 
0 : -EINTR; + } else { + return 0; + } +} + static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, bool r_cached, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index fea1101155be..e58cad4b8fc6 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -269,21 +269,6 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna bch2_btree_iter_set_pos(iter, pos); } -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline int bch2_trans_cond_resched(struct btree_trans *trans) -{ - if (need_resched() || race_fault()) { - bch2_trans_unlock(trans); - schedule(); - return bch2_trans_relock(trans) ? 0 : -EINTR; - } else { - return 0; - } -} - void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, unsigned, struct bpos, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 22fce150781e..8b4933add017 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1414,8 +1414,6 @@ retry: BTREE_INSERT_NOFAIL); if (ret) break; - - bch2_trans_cond_resched(trans); } if (ret == -EINTR) { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 58d42734c252..197b9079e2b8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2116,8 +2116,6 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links d.k->p.snapshot); break; } - - bch2_trans_cond_resched(&trans); } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4e2bd1474a0a..db9800141728 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -768,7 +768,6 @@ next: &stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); - bch2_trans_cond_resched(&trans); } out: @@ -914,7 +913,6 @@ retry: ret = bch2_btree_node_rewrite(&trans, &iter, b->data->keys.seq, 0) ?: ret; next: - bch2_trans_cond_resched(&trans); bch2_btree_iter_next_node(&iter); } if (ret == -EINTR) -- cgit From 979735df980972dc9ffdaca8a5171664ac658248 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Oct 2021 16:55:17 -0400 Subject: bcachefs: Fix bch2_btree_iter_next_node() We were modifying state, then return -EINTR, causing us to skip nodes - ouch. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 50 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 189c020d289a..c1dc33248265 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1462,6 +1462,11 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned depth_want = path->level; int ret = 0; + if (unlikely(trans->restarted)) { + ret = -EINTR; + goto out; + } + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1935,30 +1940,41 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_path *path = iter->path; struct btree *b = NULL; + unsigned l; int ret; + BUG_ON(trans->restarted); EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - /* already got to end? */ + /* already at end? 
*/ if (!btree_path_node(path, path->level)) - goto out; + return NULL; - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + return NULL; + } - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_path_traverse(trans, path, iter->flags); - if (ret) + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_trans_restart(trans); + ret = -EINTR; goto err; + } - /* got to end? */ - b = btree_path_node(path, path->level); - if (!b) - goto out; + b = btree_path_node(path, path->level + 1); - if (bpos_cmp(iter->pos, b->key.k.p) < 0) { + if (!bpos_cmp(iter->pos, b->key.k.p)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + } else { /* * Haven't gotten to the end of the parent node: go back down to * the next child node @@ -1967,10 +1983,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_path_set_pos(trans, path, bpos_successor(iter->pos), iter->flags & BTREE_ITER_INTENT); - /* Unlock to avoid screwing up our lock invariants: */ - btree_node_unlock(path, path->level); - path->level = iter->min_depth; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_iter_verify(iter); -- cgit From f3cf0999ac1c70676ba2b4d3db7b6f02f213a2d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Oct 2021 16:59:33 -0400 Subject: bcachefs: bch2_btree_node_rewrite() now returns transaction restarts We have been getting away from handling transaction restarts locally - convert bch2_btree_node_rewrite() to the newer style. 
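With this change -EINTR (a transaction restart) is returned to the caller instead of being retried internally; callers retry themselves, e.g. the move path now does (condensed from the hunk below):

	ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
	if (ret == -EINTR)
		continue;	/* transaction restarted: retry this node */
	if (ret)
		break;
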
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 ++--- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_interior.c | 53 ++++++++++++++++++++----------------- fs/bcachefs/move.c | 7 +++-- 4 files changed, 37 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 75d881b7a6c9..4fc882b15d93 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -817,15 +817,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!initial) { if (max_stale > 64) - bch2_btree_node_rewrite(&trans, &iter, - b->data->keys.seq, + bch2_btree_node_rewrite(&trans, &iter, b, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!bch2_btree_gc_rewrite_disabled && (bch2_btree_gc_always_rewrite || max_stale > 16)) bch2_btree_node_rewrite(&trans, &iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| + b, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); } } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 2ffee9029f34..4c1a1b617bf1 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -66,7 +66,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - __le64, unsigned); + struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, bool); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 14ecd3f863de..a28c7cf381ce 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1722,26 +1722,15 @@ err: */ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, - __le64 seq, unsigned flags) + struct btree *b, + unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b, *n, *parent; + struct btree *n, *parent; struct btree_update *as; int ret; flags |= BTREE_INSERT_NOFAIL; -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; - - b = bch2_btree_iter_peek_node(iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - - if (!b || b->data->keys.seq != seq) - goto out; parent = btree_node_parent(iter->path, b); as = bch2_btree_update_start(trans, iter->path, b->c.level, @@ -1750,8 +1739,6 @@ retry: : 0) + 1, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret == -EINTR) - goto retry; if (ret) { trace_btree_gc_rewrite_node_fail(c, b); goto out; @@ -1799,20 +1786,38 @@ struct async_btree_rewrite { __le64 seq; }; +static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct async_btree_rewrite *a) +{ + struct btree_iter iter; + struct btree *b; + int ret; + + bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + if (!b || b->data->keys.seq != a->seq) + goto out; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +out : + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - struct btree_trans trans; - struct btree_iter iter; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos, - BTREE_MAX_DEPTH, a->level, 0); - 
bch2_btree_node_rewrite(&trans, &iter, a->seq, 0); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_do(c, NULL, NULL, 0, + async_btree_node_rewrite_trans(&trans, a)); percpu_ref_put(&c->writes); kfree(a); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index db9800141728..1d7b8696af01 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -910,8 +910,11 @@ retry: BUG(); } - ret = bch2_btree_node_rewrite(&trans, &iter, - b->data->keys.seq, 0) ?: ret; + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + if (ret == -EINTR) + continue; + if (ret) + break; next: bch2_btree_iter_next_node(&iter); } -- cgit From 23af498cc44bc7615f3f208f39daab637bcac023 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Oct 2021 17:00:33 -0400 Subject: bcachefs: Ensure we flush btree updates in evacuate path This fixes a possible race where we fail to remove a device because of btree nodes still on it, that are being deleted by in flight btree updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1d7b8696af01..5f50b66fe206 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -932,6 +932,10 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); + /* flush relevant btree updates */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + progress_list_del(c, stats); return ret; } @@ -1075,10 +1079,6 @@ int bch2_data_job(struct bch_fs *c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, rereplicate_btree_pred, c, stats) ?: ret; - - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, -- cgit From 396a887d8fdf37d9e3e9a5b2db823184f8ec2eaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Oct 2021 15:48:05 -0400 Subject: bcachefs: Fix fsck path for refink pointers The way __bch2_mark_reflink_p returns errors was clashing with returning the number of sectors processed - we weren't returning FSCK_ERR_EXIT correctly. Fix this by only using the return code for errors, which actually ends up simplifying the overall logic. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 122 +++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ee1c71e011c7..2982f71bcf2d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1112,61 +1112,47 @@ static int bch2_mark_reservation(struct bch_fs *c, } static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, - u64 idx, unsigned flags, size_t *r_idx) + u64 *idx, unsigned flags, size_t r_idx) { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; s64 ret = 0; - while (*r_idx < c->reflink_gc_nr) { - r = genradix_ptr(&c->reflink_gc_table, *r_idx); - BUG_ON(!r); - - if (idx < r->offset) - break; - (*r_idx)++; - } + if (r_idx >= c->reflink_gc_nr) + goto not_found; - if (*r_idx >= c->reflink_gc_nr || - idx < r->offset - r->size) { - ret = p.k->size; + r = genradix_ptr(&c->reflink_gc_table, r_idx); + if (*idx < r->offset - r->size) goto not_found; - } BUG_ON((s64) r->refcount + add < 0); r->refcount += add; - return r->offset - idx; + *idx = r->offset; + return 0; not_found: - if ((flags & BTREE_TRIGGER_GC) && - (flags & BTREE_TRIGGER_NOATOMIC)) { - /* - * XXX: we're replacing the entire reflink pointer with an error - * key, we should just be replacing the part that was missing: - */ - if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx)) { - struct bkey_i_error *new; - - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (!new) { - bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; - } + *idx = U64_MAX; + ret = -EIO; - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = p.k->p; - new->k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + /* + * XXX: we're replacing the entire reflink pointer with an error + * key, we should just be replacing the part that was missing: + */ + if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + struct bkey_i_error *new; + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; } - } else { - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); - ret = -EIO; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = p.k->p; + new->k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); } fsck_err: return ret; @@ -1181,10 +1167,9 @@ static int bch2_mark_reflink_p(struct bch_fs *c, struct reflink_gc *ref; size_t l, r, m; u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 sectors = (u64) le32_to_cpu(p.v->front_pad) + - le32_to_cpu(p.v->back_pad) + - p.k->size; - s64 ret = 0; + u64 end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + int ret = 0; BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); @@ -1201,17 +1186,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, r = m; } - while (sectors) { - ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); - if (ret <= 0) - return ret; + while (idx < end_idx && !ret) + ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); - ret = min_t(s64, ret, sectors); - idx += ret; - sectors -= ret; - } - - return 0; + return ret; } static int bch2_mark_key_locked(struct bch_fs *c, @@ -1730,7 +1708,7 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - u64 idx, unsigned flags) + u64 *idx, unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -1738,9 +1716,9 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - s64 ret; + int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx), + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES); k = bch2_btree_iter_peek_slot(&iter); @@ -1759,7 +1737,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (!refcount) { bch2_fs_inconsistent(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); + p.k->p.inode, p.k->p.offset, p.k->size, *idx); ret = -EIO; goto err; } @@ -1767,7 +1745,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { bch2_fs_inconsistent(c, "%llu:%llu len %u idx %llu indirect extent refcount underflow", - p.k->p.inode, p.k->p.offset, p.k->size, idx); + p.k->p.inode, p.k->p.offset, p.k->size, *idx); ret = -EIO; goto err; } @@ -1799,7 +1777,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret) goto err; - ret = k.k->p.offset - idx; + *idx = k.k->p.offset; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1809,8 +1787,8 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c k, unsigned flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx, sectors; - s64 ret = 0; + u64 idx, end_idx; + int ret = 0; if (flags & BTREE_TRIGGER_INSERT) { struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; @@ -1818,22 +1796,14 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, v->front_pad = v->back_pad = 0; } - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - sectors = (u64) le32_to_cpu(p.v->front_pad) + - le32_to_cpu(p.v->back_pad) + - p.k->size; - - while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); - if (ret < 0) - return ret; + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); - ret = min_t(s64, ret, sectors); - idx += ret; - sectors -= ret; - } + while (idx < end_idx && !ret) + ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); - return 0; + return ret; } int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, -- cgit From d121172561d670c8152559614b3575322d709d8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Oct 2021 18:30:28 -0400 Subject: bcachefs: More general fix for transaction paths overflow for_each_btree_key() now calls bch2_trans_begin() as needed; that means, we can also call it when we're in danger of overflowing transaction paths. 
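A rough userspace sketch of the restart condition this adds (the constants, bitmap and helper names below are stand-ins, not the in-tree ones; only the control flow mirrors the patch): restart the transaction either when more than half of the path bitmap is already in use, or when the peek comes back with a retryable error.

/*
 * Illustration only: popcount over a 64-bit bitmap plays the role of
 * hweight64(trans->paths_allocated), and RETRY plays the role of -EINTR.
 */
#include <stdint.h>
#include <stdio.h>

#define ITER_MAX 64
#define RETRY    (-4)

static uint64_t paths_allocated = ~0ULL >> 8;   /* 56 of 64 paths in use */
static int peek_calls;

static void trans_begin(void)
{
        paths_allocated = 0;                    /* a restart releases all paths */
}

static int iter_peek(void)
{
        return ++peek_calls < 3 ? RETRY : 0;    /* succeeds on the third attempt */
}

int main(void)
{
        int ret;

        /* same shape as the patched __bch2_btree_iter_peek_and_restart(): the
         * left-hand test restarts before peeking when paths are nearly
         * exhausted; the comma expression peeks, then tests only the error */
        while (__builtin_popcountll(paths_allocated) > ITER_MAX / 2 ||
               (ret = iter_peek(), ret == RETRY))
                trans_begin();

        printf("peek succeeded after %d calls\n", peek_calls);
        return 0;
}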
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e58cad4b8fc6..16fa0fe1c5b7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -329,8 +329,9 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, { struct bkey_s_c k; - while (k = __bch2_btree_iter_peek(iter, flags), - bkey_err(k) == -EINTR) + while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) || + (k = __bch2_btree_iter_peek(iter, flags), + bkey_err(k) == -EINTR)) bch2_trans_begin(trans); return k; -- cgit From 6b3d8b8992e59d3a145f67173a0d75fa25e6e750 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Oct 2021 19:30:24 -0400 Subject: bcachefs: Don't run triggers in fix_reflink_p_key() It seems some users have reflink pointers which span many indirect extents, from a short window in time when merging of reflink pointers was allowed. Now, we're seeing transaction path overflows in fix_reflink_p(), the code path to clear out the reflink_p fields now used for front/back pad - but, we don't actually need to be running triggers in that path, which is an easy partial fix. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 197b9079e2b8..a61d380a47b6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2258,7 +2258,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) u->v.front_pad = 0; u->v.back_pad = 0; - return bch2_trans_update(trans, iter, &u->k_i, 0); + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); } static int fix_reflink_p(struct bch_fs *c) -- cgit From f3b1e1937973624d3bc5f3ba0824e228ae256b88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Oct 2021 14:07:43 -0400 Subject: bcachefs: Improve error messages in trans_mark_reflink_p() We should always print out the key we were marking. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2982f71bcf2d..fc4d9d75794c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1716,6 +1716,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + char buf[200]; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -1735,17 +1736,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, *idx); + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf); ret = -EIO; goto err; } if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); bch2_fs_inconsistent(c, - "%llu:%llu len %u idx %llu indirect extent refcount underflow", - p.k->p.inode, p.k->p.offset, p.k->size, *idx); + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf); ret = -EIO; goto err; } -- cgit From 2027875bd8318171159495c948461eae2f84936d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Oct 2021 12:03:19 -0400 Subject: bcachefs: Add BCH_SUBVOLUME_UNLINKED Snapshot deletion needs to become a multi step process, where we unlink, then tear down the page cache, then delete the subvolume - the deleting flag is equivalent to an inode with i_nlink = 0. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/fs-common.c | 30 ++----- fs/bcachefs/fs-common.h | 2 +- fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/fs.c | 11 ++- fs/bcachefs/fs.h | 2 +- fs/bcachefs/fsck.c | 18 ++++- fs/bcachefs/inode.c | 6 +- fs/bcachefs/subvolume.c | 182 ++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/subvolume.h | 5 +- fs/bcachefs/subvolume_types.h | 11 +++ 12 files changed, 223 insertions(+), 51 deletions(-) create mode 100644 fs/bcachefs/subvolume_types.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1608faae0d0b..567270015008 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -353,6 +353,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "subvolume_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ @@ -657,6 +658,9 @@ struct bch_fs { struct bch_snapshot_table __rcu *snapshot_table; struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + struct snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ struct bio_set btree_bio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 481bf643bd6f..8e1423b138a6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -974,6 +974,7 @@ LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) * can delete it (or whether it should just be rm -rf'd) */ LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) /* Snapshots */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index c49de741e1e3..5f3429e99115 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -239,7 +239,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - int deleting_snapshot) + bool deleting_snapshot) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -267,35 +267,19 @@ int 
bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot < 0 && - inode_u->bi_subvol) { - struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true, - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES, - &s); - if (ret) - goto err; - - if (BCH_SUBVOLUME_SNAP(&s)) - deleting_snapshot = 1; + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -ENOENT; + goto err; } - if (deleting_snapshot == 1) { - if (!inode_u->bi_subvol) { - ret = -ENOENT; - goto err; - } - - ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, - deleting_snapshot); + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 9bb0a9676147..dde237859514 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -26,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *, int); + const struct qstr *, bool); int bch2_rename_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a12b591ec9ca..de94895ace9f 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -441,7 +441,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, dir = path.dentry->d_parent->d_inode; - ret = __bch2_unlink(dir, path.dentry, 1); + ret = __bch2_unlink(dir, path.dentry, true); if (!ret) { fsnotify_rmdir(dir, path.dentry); d_delete(path.dentry); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 334cd335ff11..c325e5c4325c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -490,7 +490,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, } int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - int deleting_snapshot) + bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -527,7 +527,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { - return __bch2_unlink(vdir, dentry, -1); + return __bch2_unlink(vdir, dentry, false); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -1292,6 +1292,12 @@ static int bch2_vfs_write_inode(struct inode *vinode, return ret; } +static int bch2_drop_inode(struct inode *vinode) +{ + + return generic_drop_inode(vinode); +} + static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; @@ -1496,6 +1502,7 @@ static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, .write_inode = bch2_vfs_write_inode, + .drop_inode = bch2_drop_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 40898c4d197b..2616b15eb51c 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -183,7 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); -int __bch2_unlink(struct inode *, struct dentry *, int); +int __bch2_unlink(struct inode 
*, struct dentry *, bool); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a61d380a47b6..6b3eecdef81a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -256,7 +256,7 @@ retry: /* Subvolume root? */ if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); if (ret) goto err; } @@ -992,12 +992,28 @@ static int check_subvols(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + subvol = bkey_s_c_to_subvolume(k); + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_subvolume_delete(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, "error deleting subvolume %llu: %i", + iter.pos.offset, ret); + break; + } + } } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7fccf842a46b..3ae321a99cee 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -709,11 +709,7 @@ retry: bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); /* Subvolume root? */ - if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); - if (ret) - goto err; - } + BUG_ON(inode_u.bi_subvol); bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 9bd8d61c96fe..58cda98989b1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -4,6 +4,7 @@ #include "btree_key_cache.h" #include "btree_update.h" #include "error.h" +#include "fs.h" #include "subvolume.h" /* Snapshot tree: */ @@ -541,13 +542,6 @@ err: return ret; } -/* List of snapshot IDs that are being deleted: */ -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; - static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) { unsigned i; @@ -819,9 +813,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, return ret; } -/* XXX: mark snapshot id for deletion, walk btree and delete: */ -int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, - int deleting_snapshot) +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; struct bkey_s_c k; @@ -849,12 +845,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - if (deleting_snapshot >= 0 && - deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { - ret = -ENOENT; - goto err; - } - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ret = PTR_ERR_OR_ZERO(delete); if (ret) @@ -880,6 +870,163 @@ err: return ret; } +static void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + 
spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + +void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + struct snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.d; id < s.d + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { + bch_err(c, "error %i deleting subvolume %u", ret, *id); + break; + } + } + + kfree(s.d); + } + + percpu_ref_put(&c->writes); +} + +struct subvolume_unlink_hook { + struct btree_trans_commit_hook h; + u32 subvol; +}; + +int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; + + if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + percpu_ref_put(&c->writes); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, k); + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_subvolume_create(struct btree_trans *trans, u64 inode, 
u32 src_subvolid, u32 *new_subvolid, @@ -977,5 +1124,8 @@ err: int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f98c8c0dbea2..45234c9de0f6 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "subvolume_types.h" + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); @@ -108,7 +110,8 @@ int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -int bch2_subvolume_delete(struct btree_trans *, u32, int); +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32 *, u32 *, bool); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 index 000000000000..9410b9587591 --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -- cgit From f124345e2bed01f852a77776aaed1d106cabafbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Oct 2021 16:03:28 -0400 Subject: bcachefs: Drop bch2_journal_meta() call when going RW Back when we relied on the journal sequence number blacklist machinery for consistency between btree and the journal, we needed to ensure a new journal entry was written before any btree writes were done. But, this had the side effect of consuming some space in the journal prior to doing journal replay - which could lead to a very wedged filesystem, since we don't yet have a way to grow the journal prior to going RW. Fortunately, the journal sequence number blacklist machinery isn't needed anymore, as btree node pointers now record the numer of sectors currently written to that node - that code should all be ripped out. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d17e4f005b3f..ca2acb7c6134 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -412,13 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - /* - * We need to write out a journal entry before we start doing btree - * updates, to ensure that on unclean shutdown new journal blacklist - * entries are created: - */ - bch2_journal_meta(&c->journal); - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) -- cgit From 8325cd1ed480633651edd33fbb5f3be16c4afa47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Oct 2021 17:53:20 -0400 Subject: bcachefs: Don't do upgrades in nochanges mode nochanges mode is often used for getting data off of otherwise nonrecoverable filesystems, which is often because of errors hit during fsck. Don't force version upgrade & fsck in nochanges mode, so that it's more likely to mount. 
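A minimal sketch of the resulting mount-time decision (structure members and version numbers are made up for illustration; only the guard matches the patch): recovery asks for an upgrade and fsck only when the mount is allowed to write.

/* Illustration of the nochanges guard; not the in-tree option handling. */
#include <stdbool.h>
#include <stdio.h>

struct opts {
        bool nochanges;
        bool version_upgrade;
        bool fsck;
};

static void maybe_request_upgrade(struct opts *o, unsigned on_disk, unsigned required)
{
        if (o->nochanges)
                return;         /* read-only rescue mount: never force upgrade/fsck */

        if (on_disk < required) {
                o->version_upgrade = true;
                o->fsck = true;
        }
}

int main(void)
{
        struct opts rescue = { .nochanges = true };
        struct opts normal = { .nochanges = false };

        maybe_request_upgrade(&rescue, 10, 14);
        maybe_request_upgrade(&normal, 10, 14);

        printf("rescue mount: upgrade=%d fsck=%d\n", rescue.version_upgrade, rescue.fsck);
        printf("normal mount: upgrade=%d fsck=%d\n", normal.version_upgrade, normal.fsck);
        return 0;
}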
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6bf9c48a7871..da9c3ea528e7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1081,15 +1081,17 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { - bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { - bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); - c->opts.version_upgrade = true; - c->opts.fsck = true; + if (!c->opts.nochanges) { + if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { + bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + } } ret = bch2_blacklist_table_initialize(c); -- cgit From 41f9b7d39fb11c9f306809681bb6991ac96f9b2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Oct 2021 16:24:39 -0400 Subject: bcachefs: Move bch2_evict_subvolume_inodes() to fs.c This fixes building in userspace - code that's coupled to the kernel VFS interface should live in fs.c Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 54 ++++++++++++++++++++++++++++++++++++++++------ fs/bcachefs/fs.h | 4 ++++ fs/bcachefs/subvolume.c | 57 ------------------------------------------------- fs/bcachefs/subvolume.h | 10 +++++++++ 4 files changed, 61 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c325e5c4325c..7647e117013d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1292,12 +1292,6 @@ static int bch2_vfs_write_inode(struct inode *vinode, return ret; } -static int bch2_drop_inode(struct inode *vinode) -{ - - return generic_drop_inode(vinode); -} - static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; @@ -1318,6 +1312,53 @@ static void bch2_evict_inode(struct inode *vinode) } } +void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, 
&wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -1502,7 +1543,6 @@ static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, .write_inode = bch2_vfs_write_inode, - .drop_inode = bch2_drop_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 2616b15eb51c..38c04282da64 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -185,11 +185,15 @@ int bch2_setattr_nonsize(struct mnt_idmap *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); +void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); #else +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 58cda98989b1..4d385c9e9268 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -542,16 +542,6 @@ err: return ret; } -static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) -{ - unsigned i; - - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) - return true; - return false; -} - static int snapshot_id_add(struct snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); @@ -870,53 +860,6 @@ err: return ret; } -static void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) -{ - struct super_block *sb = c->vfs_sb; - struct inode *inode; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; - - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); -again: - cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; - - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); - goto again; - } - - spin_unlock(&inode->i_lock); - } - spin_unlock(&sb->s_inode_list_lock); -} - void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 45234c9de0f6..b5067dc68fc7 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -94,6 +94,16 @@ static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, return 0; } +static inline bool 
snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -- cgit From 6caf05785060b2522f577b000849bbc172efb135 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Oct 2021 16:34:17 -0400 Subject: bcachefs: Fix bch2_btree_iter_advance() Was popping an assertion on !BTREE_ITER_ALL_SNAPSHOTS iters when getting to the end. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c1dc33248265..51dd10518214 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2021,7 +2021,9 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = bpos_cmp(pos, SPOS_MAX) != 0; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); -- cgit From 285b181ad460bb240041a9ca7935f9e884040405 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Oct 2021 16:16:55 -0400 Subject: bcachefs: Improve transaction restart handling in fsck code The fsck code has been handling transaction restarts locally, to avoid calling fsck_err() multiple times (and asking the user/logging the error multiple times) on transaction restart. However, with our improving assertions about iterator validity, this isn't working anymore - the code wasn't entirely correct, in ways that are fine for now but are going to matter once we start wanting online fsck. This code converts much of the fsck code to handle transaction restarts in a more rigorously correct way - moving restart handling up to the top level of check_dirent, check_xattr and others - at the cost of logging errors multiple times on transaction restart. Fixing the issues with logging errors multiple times is probably going to require memoizing calls to fsck_err() - we'll leave that for future improvements. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/fsck.c | 567 +++++++++++++++++++++++++-------------------------- fs/bcachefs/inode.h | 5 + fs/bcachefs/opts.h | 5 + 4 files changed, 291 insertions(+), 290 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 6be3ec4ec4a6..9267eea810f8 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -128,9 +128,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, d.v->d_type != DT_SUBVOL ? le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), - d.v->d_type < BCH_DT_MAX - ? 
bch2_d_types[d.v->d_type] - : "(bad d_type)"); + bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6b3eecdef81a..5bc04c7bbb83 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -94,12 +94,6 @@ err: } -static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, - u32 *subvol) -{ - return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); -} - static int __subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { @@ -140,6 +134,9 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu:%u", + ret, inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -172,15 +169,6 @@ static int __lookup_dirent(struct btree_trans *trans, return 0; } -static int lookup_dirent(struct btree_trans *trans, - struct bch_hash_info hash_info, - subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) -{ - return lockrestart_do(trans, - __lookup_dirent(trans, hash_info, dir, name, target, type)); -} - static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) @@ -284,7 +272,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) struct bch_hash_info dir_hash_info; int ret; - ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); + ret = __lookup_inode(trans, pos.inode, &dir_inode, NULL); if (ret) return ret; @@ -298,17 +286,6 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) return ret; } -static int remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - int ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __remove_dirent(trans, pos)); - if (ret) - bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); - return ret; -} - /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, struct bch_inode_unpacked *lostfound) @@ -323,65 +300,52 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, u32 snapshot; int ret; - ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); if (ret) return ret; - ret = lookup_inode(trans, root_inum.inum, &root, &snapshot); - if (ret) { - bch_err(c, "error fetching subvol root: %i", ret); + ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) return ret; - } root_hash_info = bch2_hash_info_init(c, &root); - ret = lookup_dirent(trans, root_hash_info, root_inum, + ret = __lookup_dirent(trans, root_hash_info, root_inum, &lostfound_str, &inum, &d_type); if (ret == -ENOENT) { bch_notice(c, "creating lost+found"); goto create_lostfound; } - if (ret) { + if (ret && ret != -EINTR) bch_err(c, "error looking up lost+found: %i", ret); + if (ret) return ret; - } if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); return ret; - } - ret = lookup_inode(trans, inum, lostfound, &snapshot); - if (ret && ret != -ENOENT) { - /* - * The check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ - bch_err(c, "error looking up lost+found: %i", ret); - return ret; - } + /* + * The check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, 
lostfound, &snapshot); - if (ret == -ENOENT) { create_lostfound: - bch2_inode_init_early(c, lostfound); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_create_trans(trans, root_inum, &root, - lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, - (subvol_inum) { }, 0)); - if (ret) - bch_err(c, "error creating lost+found: %i", ret); - } - - return 0; + bch2_inode_init_early(c, lostfound); + + ret = bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); + if (ret && ret != -EINTR) + bch_err(c, "error creating lost+found: %i", ret); + return ret; } -static int reattach_inode(struct btree_trans *trans, +static int __reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { @@ -393,7 +357,7 @@ static int reattach_inode(struct btree_trans *trans, u32 subvol; int ret; - ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); if (ret) return ret; @@ -404,7 +368,7 @@ static int reattach_inode(struct btree_trans *trans, if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = write_inode(trans, &lostfound, U32_MAX); + ret = __write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -414,26 +378,39 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_dirent_create(trans, - (subvol_inum) { - .subvol = subvol, - .inum = lostfound.bi_inum, - }, - &dir_hash, - mode_to_type(inode->bi_mode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE)); + ret = bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + + return __write_inode(trans, inode, inode_snapshot); +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); if (ret) { bch_err(trans->c, "error %i reattaching inode %llu", ret, inode->bi_inum); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; - - return write_inode(trans, inode, inode_snapshot); + return ret; } static int remove_backpointer(struct btree_trans *trans, @@ -454,7 +431,7 @@ static int remove_backpointer(struct btree_trans *trans, goto out; } - ret = remove_dirent(trans, k.k->p); + ret = __remove_dirent(trans, k.k->p); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -653,12 +630,6 @@ found: return i; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos) -{ - return lockrestart_do(trans, __walk_inode(trans, w, pos)); -} - static int __get_visible_inodes(struct btree_trans *trans, struct inode_walker *w, struct snapshots_seen *s, @@ -700,12 +671,9 @@ static int check_key_has_snapshot(struct btree_trans *trans, if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = __bch2_trans_do(trans, NULL, NULL, 
BTREE_INSERT_LAZY_RW, - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); - return ret ?: -EINTR; - } + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: return ret; } @@ -739,26 +707,6 @@ static int hash_redo_key(struct btree_trans *trans, #endif } -static int fsck_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *info, - struct btree_iter *iter) -{ - int ret; -retry: - ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret == -EINTR) { - ret = bch2_btree_iter_traverse(iter); - if (!ret) - goto retry; - } - - return ret; -} - static int hash_check_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -792,10 +740,7 @@ static int hash_check_key(struct btree_trans *trans, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf))) { - ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); - if (ret) - return ret; - ret = 1; + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } @@ -814,9 +759,7 @@ bad_hash: (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) return 0; - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -829,15 +772,53 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *prev, - struct bch_inode_unpacked u) + bool full) { struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; bool do_update = false; - int ret = 0; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; - if (fsck_err_on(prev && - (prev->bi_hash_seed != u.bi_hash_seed || - mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c, + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? 
ret : 0; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + return 0; + + if (k.k->type != KEY_TYPE_inode) + return 0; + + inode = bkey_s_c_to_inode(k); + + if (!full && + !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + return 0; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || + inode_d_type(prev) != inode_d_type(&u), c, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); return -EINVAL; @@ -932,58 +913,61 @@ static int check_inodes(struct bch_fs *c, bool full) { struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_inode inode; - struct bch_inode_unpacked prev, u; + struct bch_inode_unpacked prev = { 0 }; int ret; - memset(&prev, 0, sizeof(prev)); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = check_key_has_snapshot(&trans, &iter, k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, &prev, full)); if (ret) break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - /* - * if snapshot id isn't a leaf node, skip it - deletion in - * particular is not atomic, so on the internal snapshot nodes - * we can see inodes marked for deletion after a clean shutdown - */ - if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) - continue; + bch2_trans_exit(&trans); + return ret; +} - if (k.k->type != KEY_TYPE_inode) - continue; +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + int ret; - inode = bkey_s_c_to_inode(k); + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - if (!full && - !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) - continue; + ret = bkey_err(k); + if (ret) + return ret; - BUG_ON(bch2_inode_unpack(inode, &u)); + if (k.k->type != KEY_TYPE_subvolume) + return 0; - ret = check_inode(&trans, &iter, - full && prev.bi_inum == u.bi_inum - ? 
&prev : NULL, u); - if (ret) - break; + subvol = bkey_s_c_to_subvolume(k); - prev = u; + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret && ret != -EINTR) + bch_err(trans->c, "error deleting subvolume %llu: %i", + iter->pos.offset, ret); + if (ret) + return ret; } - bch2_trans_iter_exit(&trans, &iter); - - BUG_ON(ret == -EINTR); - bch2_trans_exit(&trans); - return ret; + return 0; } noinline_for_stack @@ -991,30 +975,23 @@ static int check_subvols(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_subvolume subvol; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - subvol = bkey_s_c_to_subvolume(k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, + POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bch2_subvolume_delete(&trans, iter.pos.offset)); - if (ret) { - bch_err(c, "error deleting subvolume %llu: %i", - iter.pos.offset, ret); - break; - } - } - } + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -1174,7 +1151,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ret = check_key_has_snapshot(trans, iter, k); if (ret) - return ret; + return ret < 0 ? ret : 0; ret = snapshots_seen_update(c, s, k.k->p); if (ret) @@ -1207,9 +1184,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(ret == INT_MAX, c, "extent in missing inode:\n %s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); if (ret == INT_MAX) return 0; @@ -1222,9 +1198,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { @@ -1284,7 +1259,9 @@ static int check_extents(struct bch_fs *c) BTREE_ITER_ALL_SNAPSHOTS); do { - ret = lockrestart_do(&trans, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, check_extent(&trans, &iter, &w, &s)); if (ret) break; @@ -1343,6 +1320,7 @@ static int check_dirent_target(struct btree_trans *trans, u32 target_snapshot) { struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; bool backpointer_exists = true; char buf[200]; int ret = 0; @@ -1352,7 +1330,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = write_inode(trans, target, target_snapshot); + ret = __write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1369,7 
+1347,7 @@ static int check_dirent_target(struct btree_trans *trans, backpointer_exists, c, "directory %llu with multiple links", target->bi_inum)) { - ret = remove_dirent(trans, d.k->p); + ret = __remove_dirent(trans, d.k->p); if (ret) goto err; return 0; @@ -1382,7 +1360,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_nlink++; target->bi_flags &= ~BCH_INODE_UNLINKED; - ret = write_inode(trans, target, target_snapshot); + ret = __write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1399,34 +1377,30 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = write_inode(trans, target, target_snapshot); + ret = __write_inode(trans, target, target_snapshot); if (ret) goto err; } } - if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target->bi_mode), + if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target->bi_mode); + n->v.d_type = inode_d_type(target); - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_trans_update(trans, iter, &n->k_i, 0)); - kfree(n); + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; - return ret ?: -EINTR; + d = dirent_i_to_s_c(n); } if (d.v->d_type == DT_SUBVOL && @@ -1435,24 +1409,19 @@ static int check_dirent_target(struct btree_trans *trans, fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", le32_to_cpu(d.v->d_parent_subvol), target->bi_parent_subvol))) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; bkey_reassemble(&n->k_i, d.s_c); n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_trans_update(trans, iter, &n->k_i, 0)); - kfree(n); + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; - return ret ?: -EINTR; + d = dirent_i_to_s_c(n); } err: fsck_err: @@ -1482,7 +1451,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = check_key_has_snapshot(trans, iter, k); if (ret) - return ret; + return ret < 0 ? 
ret : 0; ret = snapshots_seen_update(c, s, k.k->p); if (ret) @@ -1504,9 +1473,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); if (ret == INT_MAX) return 0; @@ -1515,11 +1483,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, - "dirent in non directory inode type %u:\n%s", - mode_to_type(i->inode.bi_mode), + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return __bch2_trans_do(trans, NULL, NULL, 0, - bch2_btree_delete_at(trans, iter, 0)); + return bch2_btree_delete_at(trans, iter, 0); if (dir->first_this_inode) *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); @@ -1550,7 +1517,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(ret, c, "dirent points to missing subvolume %llu", le64_to_cpu(d.v->d_child_subvol))) - return remove_dirent(trans, d.k->p); + return __remove_dirent(trans, d.k->p); ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); @@ -1570,7 +1537,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, target_inum, subvol_root.bi_subvol, target_subvol)) { subvol_root.bi_subvol = target_subvol; - ret = write_inode(trans, &subvol_root, target_snapshot); + ret = __write_inode(trans, &subvol_root, target_snapshot); if (ret) return ret; } @@ -1588,7 +1555,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(trans, d.k->p); + ret = __remove_dirent(trans, d.k->p); if (ret) return ret; } @@ -1636,7 +1603,9 @@ static int check_dirents(struct bch_fs *c) BTREE_ITER_ALL_SNAPSHOTS); do { - ret = lockrestart_do(&trans, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, check_dirent(&trans, &iter, &hash_info, &dir, &target, &s)); if (ret) @@ -1651,17 +1620,58 @@ static int check_dirents(struct bch_fs *c) return ret; } +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); + + if (ret == INT_MAX) + return 0; + + ret = 0; + + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + return ret; +} + /* * Walk xattrs: verify that they all have a corresponding inode */ noinline_for_stack static int check_xattrs(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker inode = 
inode_walker_init(); struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking xattrs"); @@ -1673,65 +1683,31 @@ static int check_xattrs(struct bch_fs *c) BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); -retry: - bch2_trans_begin(&trans); - - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k))) { - ret = check_key_has_snapshot(&trans, &iter, k); - if (ret) - break; - - ret = walk_inode(&trans, &w, k.k->p); - if (ret < 0) - break; - - if (fsck_err_on(ret == INT_MAX, c, - "xattr for missing inode %llu", - k.k->p.inode)) { - ret = bch2_btree_delete_at(&trans, &iter, 0); - if (ret) - break; - continue; - } - - if (ret == INT_MAX) - goto next; - ret = 0; - - if (w.first_this_inode) - hash_info = bch2_hash_info_init(c, &w.d[0].inode); - ret = hash_check_key(&trans, bch2_xattr_hash_desc, - &hash_info, &iter, k); + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, &hash_info, + &inode)); if (ret) break; -next: - bch2_btree_iter_advance(&iter); - } -fsck_err: - if (ret == -EINTR) - goto retry; - + } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); return ret; } -/* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c) +static int check_root_trans(struct btree_trans *trans) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct bch_inode_unpacked root_inode; u32 snapshot; u64 inum; int ret; - bch2_trans_init(&trans, c, 0, 0); - - bch_verbose(c, "checking root directory"); - - ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && ret != -ENOENT) return ret; @@ -1746,10 +1722,10 @@ static int check_root(struct bch_fs *c) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); if (ret) { bch_err(c, "error writing root subvol: %i", ret); goto err; @@ -1757,7 +1733,7 @@ static int check_root(struct bch_fs *c) } - ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); if (ret && ret != -ENOENT) return ret; @@ -1768,16 +1744,27 @@ static int check_root(struct bch_fs *c) 0, NULL); root_inode.bi_inum = inum; - ret = write_inode(&trans, &root_inode, snapshot); + ret = __write_inode(trans, &root_inode, snapshot); if (ret) bch_err(c, "error writing root inode: %i", ret); } err: fsck_err: - bch2_trans_exit(&trans); return ret; } +/* Get root directory, create if it doesn't exist: */ +noinline_for_stack +static int check_root(struct bch_fs *c) +{ + bch_verbose(c, "checking root directory"); + + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + check_root_trans(&trans)); +} + struct pathbuf { size_t nr; size_t size; @@ -1866,9 +1853,9 @@ static int check_path(struct btree_trans *trans, } if (ret == -ENOENT) { - if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", + if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", 
inode->bi_inum, snapshot, - mode_to_type(inode->bi_mode), + bch2_d_type_str(inode_d_type(inode)), inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) @@ -1909,7 +1896,9 @@ static int check_path(struct btree_trans *trans, if (!fsck_err(c, "directory structure loop")) return 0; - ret = lockrestart_do(trans, + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); @@ -1930,6 +1919,7 @@ fsck_err: * After check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ +noinline_for_stack static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; @@ -2277,6 +2267,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); } +noinline_for_stack static int fix_reflink_p(struct bch_fs *c) { struct btree_trans trans; @@ -2287,6 +2278,8 @@ static int fix_reflink_p(struct bch_fs *c) if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; + bch_verbose(c, "fixing reflink_p keys"); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9e84cddcc6cb..009b807cc167 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -134,6 +134,11 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } +static inline u8 inode_d_type(struct bch_inode_unpacked *inode) +{ + return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c5def5b1f558..4e59bff09578 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -20,6 +20,11 @@ extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; extern const char * const bch2_d_types[]; +static inline const char *bch2_d_type_str(unsigned d_type) +{ + return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; +} + /* * Mount options; we also store defaults in the superblock. * -- cgit From fae1157d184084f1716a10273423f8e949d8471f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Oct 2021 18:22:25 -0400 Subject: bcachefs: Ensure journal doesn't get stuck in nochanges mode This tweaks the journal code to always act as if there's space available in nochanges mode, when we're not going to be doing any writes. This helps in recovering filesystems that won't mount because they need journal replay and the journal has gotten stuck. 
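An illustrative sketch of the approach (a simplified userspace model with invented names, not the kernel code itself): when the nochanges flag is set, the bucket-availability calculation simply reports the whole journal ring as free, so reservations never wait on a write that will never be issued.

  /* Simplified model of per-device journal bucket accounting */
  struct journal_dev {
          unsigned nr;        /* total journal buckets on the device */
          unsigned cur_idx;   /* bucket currently being written */
          unsigned last_idx;  /* oldest bucket that must be kept */
  };

  static unsigned dev_buckets_available(const struct journal_dev *ja, int nochanges)
  {
          /*
           * In nochanges mode no journal writes happen, so pretend the
           * whole ring is free instead of measuring the distance between
           * the write head and the reclaim tail:
           */
          if (nochanges)
                  return ja->nr;

          return (ja->last_idx - ja->cur_idx - 1 + ja->nr) % ja->nr;
  }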
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 1 + fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 6 ++++-- fs/bcachefs/journal_types.h | 1 + fs/bcachefs/super.c | 3 +++ 5 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1d556790b38e..99fd253648bf 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -446,6 +446,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, ret = 0; if ((flags & JOURNAL_RES_GET_RESERVED) || + test_bit(JOURNAL_NOCHANGES, &j->flags) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e797d6376a82..ed8d7f90b607 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1512,7 +1512,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) + if (test_bit(JOURNAL_NOCHANGES, &j->flags)) goto no_io; for_each_rw_member(ca, c, i) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0bc4681ccc24..3f417af16e59 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -34,8 +34,10 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; + unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) + ? ((journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr) + : ja->nr; /* * Don't use the last bucket unless writing the new last_seq diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index cce02bad850c..0647a53eb35c 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -154,6 +154,7 @@ enum { JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NOCHANGES, }; /* Embedded in struct bch_fs */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ca2acb7c6134..501fe129ea9c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -801,6 +801,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_fsio_init(c)) goto err; + if (c->opts.nochanges) + set_bit(JOURNAL_NOCHANGES, &c->journal.flags); + mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -- cgit From 37f72492f401671f1f773cc62dddf742e7fc553b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Oct 2021 18:21:05 -0400 Subject: bcachefs: Fix bch2_mark_update() When the old or new key doesn't exist, we should still pass in a deleted key with the correct pos. This fixes a bug in the ec code, when bch2_mark_stripe() was looking up the wrong in-memory stripe. 
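The core of the fix, as a hedged standalone fragment (types pared down for illustration; only the deleted.p assignment mirrors the patch below): the placeholder "deleted" key that stands in for a nonexistent old or new key must still carry the position of the key being marked, otherwise code that looks up state by position - such as the in-memory stripe for a stripe key - resolves the wrong entry.

  /* Pared-down key types, just enough to show the placeholder pattern */
  struct bpos { unsigned long long inode, offset; unsigned snapshot; };
  struct bkey { struct bpos p; /* type, size, ... */ };

  static void mark_with_placeholder(const struct bkey *new_key)
  {
          struct bkey deleted = { 0 };

          /*
           * The placeholder stands in for a key that doesn't exist, but it
           * must still point at the same position as the key being marked:
           */
          deleted.p = new_key->p;

          /* ... mark(old = &deleted, new = new_key, ...) ... */
  }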
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fc4d9d75794c..48687a70411e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1231,6 +1231,8 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; + deleted.p = new.k->p; + percpu_down_read(&c->mark_lock); ret = bch2_mark_key_locked(c, old, new, 0, flags); percpu_up_read(&c->mark_lock); @@ -1248,6 +1250,8 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey unpacked; int ret; + _deleted.p = path->pos; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; @@ -1846,6 +1850,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, struct bkey unpacked; int ret; + _deleted.p = path->pos; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; -- cgit From 961b2d62821f23f9f963ee069b64eb8806f05e40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Oct 2021 16:29:13 -0400 Subject: bcachefs: Assorted ec fixes - The backpointer that ec_stripe_update_ptrs() uses now needs to include the snapshot ID, which means we have to change where we add the backpointer to after getting the snapshot ID for the new extents - ec_stripe_update_ptrs() needs to be calling bch2_trans_begin() - improve error message in bch2_mark_stripe() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 ++++-- fs/bcachefs/buckets.c | 9 +++++++-- fs/bcachefs/ec.c | 23 +++++++++++------------ fs/bcachefs/ec.h | 4 ++-- fs/bcachefs/io.c | 8 ++++---- fs/bcachefs/move.c | 4 ++++ 6 files changed, 32 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4fc882b15d93..2d9e5c91c9d0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -699,6 +699,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, BTREE_TRIGGER_INSERT| BTREE_TRIGGER_GC| (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); + char buf[200]; int ret = 0; if (initial) { @@ -717,8 +718,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, - "superblock not marked as containing replicas (type %u)", - k->k->type)) { + "superblock not marked as containing replicas\n" + " while marking %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ret = bch2_mark_bkey_replicas(c, *k); if (ret) { bch_err(c, "error marking bkey replicas: %i", ret); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 48687a70411e..4cd44a50beab 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1007,8 +1007,13 @@ static int bch2_mark_stripe(struct bch_fs *c, BUG_ON(gc && old_s); if (!m || (old_s && !m->alive)) { - bch_err_ratelimited(c, "error marking nonexistent stripe %zu", - idx); + char buf1[200], buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, old); + bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" + "old %s\n" + "new %s", idx, buf1, buf2); bch2_inconsistent_error(c); return -1; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7dfa052e9765..ed4a73345e3a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -837,8 +837,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, bkey_start_pos(pos), BTREE_ITER_INTENT); - - while ((k = bch2_btree_iter_peek(&iter)).k && +retry: + while (bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { struct bch_extent_ptr *ptr, *ec_ptr = NULL; @@ -874,11 +875,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, BTREE_INSERT_NOFAIL); if (!ret) bch2_btree_iter_set_pos(&iter, next_pos); - if (ret == -EINTR) - ret = 0; if (ret) break; } + if (ret == -EINTR) + goto retry; bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -1069,16 +1070,14 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } -void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, - struct bpos pos, unsigned sectors) +void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, + struct bkey *k) { - struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct ec_stripe_new *ec; + struct ec_stripe_new *ec = ob->ec; - if (!ob) + if (!ec) return; - ec = ob->ec; mutex_lock(&ec->lock); if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, @@ -1088,8 +1087,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, } bkey_init(&ec->keys.top->k); - ec->keys.top->k.p = pos; - bch2_key_resize(&ec->keys.top->k, sectors); + ec->keys.top->k.p = k->p; + ec->keys.top->k.size = k->size; bch2_keylist_push(&ec->keys); mutex_unlock(&ec->lock); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index e79626b59509..eb16e140e2c8 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -193,8 +193,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, - struct bpos, unsigned); +void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, + struct bkey *); void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void 
bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 772fdeb722c7..bf04b61ae882 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -470,6 +470,7 @@ static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; @@ -513,6 +514,9 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ret) break; + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); + if (bkey_cmp(iter.pos, k->k.p) >= 0) bch2_keylist_pop_front(&op->insert_keys); else @@ -950,7 +954,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; void *ec_buf; - struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -1120,9 +1123,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_iter.bi_size = total_output; do_write: - /* might have done a realloc... */ - bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - *_dst = dst; return more; csum_err: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 5f50b66fe206..2f260360b089 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -8,6 +8,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -135,6 +136,7 @@ int bch2_migrate_index_update(struct bch_write_op *op) struct btree_iter iter; struct migrate_write *m = container_of(op, struct migrate_write, op); + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_buf _new, _insert; int ret = 0; @@ -252,6 +254,8 @@ int bch2_migrate_index_update(struct bch_write_op *op) if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &insert->k); } err: if (ret == -EINTR) -- cgit From 904823de497fa6637db8bc7c3b017f121b72bdf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Oct 2021 18:43:18 -0400 Subject: bcachefs: Convert bch2_mark_key() to take a btree_trans * This helps to unify the interface between bch2_mark_key() and bch2_trans_mark_key() - and it also gives access to the journal reservation and journal seq in the mark_key path. 
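The shape of the refactor, sketched with minimal stand-in types (the helper below is illustrative, not the actual kernel function): callers that previously took a bch_fs pointer plus an explicit journal_seq now take the transaction, recover the filesystem pointer from trans->c, and read the sequence number from the transaction's journal reservation.

  /* Minimal stand-ins so the sketch is self-contained: */
  struct bch_fs;
  struct journal_res { unsigned long long seq; };
  struct btree_trans { struct bch_fs *c; struct journal_res journal_res; };
  struct bkey_s_c { const void *k, *v; };

  /* Before: fs pointer and journal_seq threaded through every caller:
   *   int mark_key(struct bch_fs *c, struct bkey_s_c k, u64 seq, unsigned flags);
   * After: the transaction carries both: */
  static int mark_key(struct btree_trans *trans, struct bkey_s_c k, unsigned flags)
  {
          struct bch_fs *c = trans->c;                     /* fs recovered from the trans */
          unsigned long long seq = trans->journal_res.seq; /* seq of the current reservation */

          /* ... marking logic unchanged, but journal res/seq now available ... */
          return 0;
  }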
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++- fs/bcachefs/btree_gc.c | 85 ++++++++++++++++------------ fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/buckets.c | 124 +++++++++++++++++++++++------------------ fs/bcachefs/buckets.h | 2 +- fs/bcachefs/ec.c | 14 +++-- fs/bcachefs/recovery.c | 12 ++-- fs/bcachefs/recovery.h | 4 +- fs/bcachefs/subvolume.c | 9 +-- fs/bcachefs/subvolume.h | 4 +- 10 files changed, 152 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2551ccc99d38..3b6af70fa186 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -261,8 +261,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bch_dev *ca; struct bucket *g; struct bkey_alloc_unpacked u; @@ -289,11 +290,14 @@ static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) int bch2_alloc_read(struct bch_fs *c) { + struct btree_trans trans; int ret; + bch2_trans_init(&trans, c, 0, 0); down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); up_read(&c->gc_lock); + bch2_trans_exit(&trans); if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2d9e5c91c9d0..48fd89195357 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -688,11 +688,12 @@ fsck_err: /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k, u8 *max_stale, bool initial) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; unsigned flags = @@ -740,7 +741,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - ret = bch2_mark_key(c, *k, flags); + ret = bch2_mark_key(trans, *k, flags); fsck_err: err: if (ret) @@ -748,9 +749,10 @@ err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, bool initial) { + struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -768,7 +770,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, max_stale, initial); if (ret) break; @@ -790,10 +792,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return ret; } -static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, bool initial, bool metadata_only) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; unsigned depth = metadata_only ? 
1 @@ -803,35 +805,32 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, u8 max_stale = 0; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, + __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b, ret) { bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, initial); + ret = btree_gc_mark_node(trans, b, &max_stale, initial); if (ret) break; if (!initial) { if (max_stale > 64) - bch2_btree_node_rewrite(&trans, &iter, b, + bch2_btree_node_rewrite(trans, &iter, b, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!bch2_btree_gc_rewrite_disabled && (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(&trans, &iter, + bch2_btree_node_rewrite(trans, &iter, b, BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); if (ret) return ret; @@ -840,7 +839,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, &k, &max_stale, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); @@ -849,9 +848,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; } -static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, +static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, unsigned target_depth) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; @@ -868,7 +868,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, &max_stale, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); @@ -935,7 +935,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, break; } - ret = bch2_gc_btree_init_recurse(c, child, + ret = bch2_gc_btree_init_recurse(trans, child, target_depth); six_unlock_read(&child->c.lock); @@ -950,10 +950,11 @@ fsck_err: return ret; } -static int bch2_gc_btree_init(struct bch_fs *c, +static int bch2_gc_btree_init(struct btree_trans *trans, enum btree_id btree_id, bool metadata_only) { + struct bch_fs *c = trans->c; struct btree *b; unsigned target_depth = metadata_only ? 1 : bch2_expensive_debug_checks ? 
0 @@ -986,12 +987,12 @@ static int bch2_gc_btree_init(struct bch_fs *c, } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, target_depth); + ret = bch2_gc_btree_init_recurse(trans, b, target_depth); if (!ret) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, &k, &max_stale, true); } fsck_err: @@ -1010,21 +1011,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { + struct btree_trans trans; enum btree_id ids[BTREE_ID_NR]; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); for (i = 0; i < BTREE_ID_NR && !ret; i++) ret = initial - ? bch2_gc_btree_init(c, ids[i], metadata_only) - : bch2_gc_btree(c, ids[i], initial, metadata_only); + ? bch2_gc_btree_init(&trans, ids[i], metadata_only) + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); if (ret < 0) bch_err(c, "%s: ret %i", __func__, ret); + + bch2_trans_exit(&trans); return ret; } @@ -1373,8 +1379,10 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } -static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, + struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct reflink_gc *r; const __le64 *refcount = bkey_refcount_c(k); char buf[200]; @@ -1439,16 +1447,16 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (metadata_only) return 0; + bch2_trans_init(&trans, c, 0, 0); + if (initial) { c->reflink_gc_idx = 0; - ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink, + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, bch2_gc_reflink_done_initial_fn); goto out; } - bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1496,16 +1504,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } fsck_err: bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); out: genradix_free(&c->reflink_gc_table); c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); return ret; } -static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, + struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct reflink_gc *r; const __le64 *refcount = bkey_refcount_c(k); @@ -1530,19 +1540,20 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; - int ret; + int ret = 0; if (metadata_only) return 0; + bch2_trans_init(&trans, c, 0, 0); genradix_free(&c->reflink_gc_table); c->reflink_gc_nr = 0; - if (initial) - return bch2_btree_and_journal_walk(c, BTREE_ID_reflink, - bch2_gc_reflink_start_initial_fn); - - bch2_trans_init(&trans, c, 0, 0); + if (initial) { + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, + bch2_gc_reflink_start_initial_fn); + goto out; + } for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { @@ -1563,9 +1574,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, r->refcount = 0; } bch2_trans_iter_exit(&trans, &iter); - +out: bch2_trans_exit(&trans); - return 0; + return ret; } /** diff --git 
a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51dd10518214..baa8391a4226 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2693,6 +2693,7 @@ void bch2_trans_begin(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(i->path, true); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); trans->extra_journal_res = 0; trans->nr_updates = 0; trans->mem_top = 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4cd44a50beab..a114a1142340 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -528,11 +528,13 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } -static int bch2_mark_alloc(struct bch_fs *c, +static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; @@ -677,7 +679,8 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) : sectors; } -static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, +static int check_bucket_ref(struct bch_fs *c, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 bucket_data_type, @@ -751,10 +754,12 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, - unsigned ptr_idx, - u64 journal_seq, unsigned flags) +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + u64 journal_seq, unsigned flags) { + struct bch_fs *c = trans->c; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; @@ -798,7 +803,8 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, @@ -807,7 +813,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, u16 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; - int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, bucket_gen, *bucket_data_type, *dirty_sectors, *cached_sectors); @@ -820,12 +826,15 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -838,7 +847,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, new.v.counter = old.v.counter = v; bucket_data_type = new.data_type; - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, new.gen, &bucket_data_type, &new.dirty_sectors, &new.cached_sectors); @@ -867,13 +877,14 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_stripe_ptr(struct bch_fs *c, +static int bch2_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, s64 sectors, - unsigned journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct stripe *m; unsigned i, blocks_nonempty = 0; @@ -906,16 +917,18 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, &r.e, sectors, journal_seq, gc); + update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); return 0; } -static int bch2_mark_extent(struct bch_fs *c, +static int bch2_mark_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - unsigned journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -944,8 +957,8 @@ static int bch2_mark_extent(struct bch_fs *c, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, - journal_seq, flags); + ret = bch2_mark_pointer(trans, k, p, disk_sectors, + data_type, flags); if (ret < 0) return ret; @@ -963,8 +976,8 @@ static int bch2_mark_extent(struct bch_fs *c, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - disk_sectors, journal_seq, flags); + ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, + disk_sectors, flags); if (ret) return ret; @@ -990,11 +1003,13 @@ static int bch2_mark_extent(struct bch_fs *c, return 0; } -static int bch2_mark_stripe(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; size_t idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(old).v : NULL; @@ -1058,7 +1073,7 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(c, new, i, journal_seq, flags); + ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); if (ret) return ret; } @@ -1077,24 +1092,26 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } -static int bch2_mark_inode(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { + struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; preempt_enable(); return 0; } -static int bch2_mark_reservation(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { + struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; struct bch_fs_usage __percpu *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1105,7 +1122,7 @@ static int bch2_mark_reservation(struct bch_fs *c, sectors *= replicas; preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); @@ -1163,10 +1180,11 @@ fsck_err: return ret; } -static int bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { + struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; @@ -1197,10 +1215,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, return ret; } -static int bch2_mark_key_locked(struct bch_fs *c, +static int bch2_mark_key_locked(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) + unsigned flags) { struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; @@ -1209,29 +1227,30 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: case KEY_TYPE_alloc_v2: - return bch2_mark_alloc(c, old, new, journal_seq, flags); + return bch2_mark_alloc(trans, old, new, flags); case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return bch2_mark_extent(c, old, new, journal_seq, flags); + return bch2_mark_extent(trans, old, new, flags); case KEY_TYPE_stripe: - return bch2_mark_stripe(c, old, new, journal_seq, flags); + return bch2_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: - return bch2_mark_inode(c, old, new, journal_seq, flags); + return bch2_mark_inode(trans, old, new, flags); case KEY_TYPE_reservation: - return bch2_mark_reservation(c, old, new, journal_seq, flags); + return bch2_mark_reservation(trans, old, new, flags); case KEY_TYPE_reflink_p: - return bch2_mark_reflink_p(c, old, new, journal_seq, flags); + return bch2_mark_reflink_p(trans, old, new, flags); case KEY_TYPE_snapshot: - return bch2_mark_snapshot(c, old, new, journal_seq, flags); + return bch2_mark_snapshot(trans, old, new, flags); default: return 0; } } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) +int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) { + struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; @@ -1239,7 +1258,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) deleted.p = new.k->p; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, old, new, 0, flags); + ret = bch2_mark_key_locked(trans, old, new, flags); percpu_up_read(&c->mark_lock); return ret; @@ -1248,7 +1267,6 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey_i *new, unsigned flags) { - struct bch_fs *c = trans->c; struct bkey _deleted = KEY(0, 0, 0); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; @@ -1267,15 +1285,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - trans->journal_res.seq, + ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new), - trans->journal_res.seq, + ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key_locked(c, old, deleted, - trans->journal_res.seq, + bch2_mark_key_locked(trans, old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } @@ -1440,7 +1455,8 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, + ret = __mark_pointer(trans, k, &p.ptr, sectors, 
data_type, + u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4687fba2eed6..41374463710c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -226,7 +226,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ed4a73345e3a..74dc6c40dc9e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1634,13 +1634,14 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) return ret; } -static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; int ret = 0; if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(c, k, + bch2_mark_key(trans, k, BTREE_TRIGGER_INSERT| BTREE_TRIGGER_NOATOMIC); @@ -1649,8 +1650,13 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, - bch2_stripes_read_fn); + struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, + bch2_stripes_read_fn); + bch2_trans_exit(&trans); if (ret) bch_err(c, "error reading stripes: %i", ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index da9c3ea528e7..29fae6dbce76 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -337,10 +337,11 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, bch2_bkey_buf_exit(&tmp, c); } -static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, +static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, enum btree_id btree_id, btree_walk_key_fn key_fn) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf tmp; @@ -364,11 +365,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b btree_and_journal_iter_prefetch(c, b, iter); - ret = bch2_btree_and_journal_walk_recurse(c, child, + ret = bch2_btree_and_journal_walk_recurse(trans, child, btree_id, key_fn); six_unlock_read(&child->c.lock); } else { - ret = key_fn(c, k); + ret = key_fn(trans, k); } if (ret) @@ -382,9 +383,10 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b return ret; } -int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, +int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, btree_walk_key_fn key_fn) { + struct bch_fs *c = trans->c; struct btree *b = c->btree_roots[btree_id].b; int ret = 0; @@ -392,7 +394,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, return 0; six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); + ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); six_unlock_read(&b->c.lock); return ret; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e5565e4f335a..e45c70b3693f 100644 --- 
a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -45,9 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); +typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); -int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); +int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 4d385c9e9268..0ef625d21672 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -61,10 +61,11 @@ const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -int bch2_mark_snapshot(struct bch_fs *c, +int bch2_mark_snapshot(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) + unsigned flags) { + struct bch_fs *c = trans->c; struct snapshot_t *t; t = genradix_ptr_alloc(&c->snapshots, @@ -308,7 +309,7 @@ int bch2_fs_snapshots_start(struct bch_fs *c) if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) have_deleted = true; - ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0); + ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); if (ret) break; } @@ -499,7 +500,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, bch2_trans_update(trans, &iter, &n->k_i, 0); - ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0); + ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) break; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index b5067dc68fc7..dde755b45392 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -12,8 +12,8 @@ const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_snapshot_to_text, \ } -int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c, - struct bkey_s_c, u64, unsigned); +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) { -- cgit From 2debb1b875c140c7a5490d5eb9e88b3c51f375e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Oct 2021 18:58:50 -0400 Subject: bcachefs: BTREE_TRIGGER_INSERT now only means insert This allows triggers to distinguish between a key entering the btree - i.e. being called from the trans commit path - vs. being called on a key that already exists, i.e. by GC. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 +--- fs/bcachefs/buckets.c | 26 +++++--------------------- fs/bcachefs/ec.c | 1 - 3 files changed, 6 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 48fd89195357..197f5c0f3a9a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -697,7 +697,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; unsigned flags = - BTREE_TRIGGER_INSERT| BTREE_TRIGGER_GC| (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); char buf[200]; @@ -1117,8 +1116,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC); + bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a114a1142340..6e1837a0fc64 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -929,7 +929,7 @@ static int bch2_mark_extent(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -944,9 +944,6 @@ static int bch2_mark_extent(struct btree_trans *trans, bool stale; int ret; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -1112,7 +1109,7 @@ static int bch2_mark_reservation(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bch_fs_usage __percpu *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1185,7 +1182,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; @@ -1194,9 +1191,6 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, le32_to_cpu(p.v->back_pad); int ret = 0; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - l = 0; r = c->reflink_gc_nr; while (l < r) { @@ -1220,9 +1214,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; switch (k.k->type) { case KEY_TYPE_alloc: @@ -1541,9 +1533,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bool stale; int ret; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -1715,9 +1704,6 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; @@ -1837,9 +1823,7 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; switch (k.k->type) { case KEY_TYPE_btree_ptr: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 74dc6c40dc9e..32b17f05a750 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1642,7 +1642,6 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(trans, k, - BTREE_TRIGGER_INSERT| BTREE_TRIGGER_NOATOMIC); return ret; -- cgit From d7407292723ea79028afe6729432602ced243972 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Oct 2021 05:28:27 -0400 Subject: bcachefs: Fix faulty assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index baa8391a4226..806663799a08 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -661,7 +661,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { - BUG_ON(c->btree_roots[path->btree_id].b->c.level > i); + BUG_ON(!path->cached && + c->btree_roots[path->btree_id].b->c.level > i); break; } -- cgit From f527afea5a2f3c2645080584dafeb6cc5314b652 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 12:08:02 -0400 Subject: bcachefs: Fix upgrade_readers() The bch2_btree_path_upgrade() call was failing and tripping an assert - path->level + 1 is in this case not necessarily exactly what we want, fix it by upgrading exactly the locks we want. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_iter.h | 3 +++ fs/bcachefs/btree_update_leaf.c | 11 ++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 806663799a08..2c28e65fdeb5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -180,8 +180,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, } } -static bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 16fa0fe1c5b7..64a3969db263 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -192,6 +192,9 @@ static inline int btree_trans_restart(struct btree_trans *trans) return -EINTR; } +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + bool __bch2_btree_path_upgrade(struct btree_trans *, struct btree_path *, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8b4933add017..43ae2d83cfa7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -507,6 +507,15 @@ err: return ret; } +static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) +{ + unsigned l; + + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); +} + static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) { struct btree *b = path_l(path)->b; @@ -514,7 +523,7 @@ static 
inline void upgrade_readers(struct btree_trans *trans, struct btree_path do { if (path->nodes_locked && path->nodes_locked != path->nodes_intent_locked) - BUG_ON(!bch2_btree_path_upgrade(trans, path, path->level + 1)); + path_upgrade_readers(trans, path); } while ((path = prev_btree_path(trans, path)) && path_l(path)->b == b); } -- cgit From 85eb2bae7fd18f424b28c3f873d106c95e6b9733 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 17:23:03 -0400 Subject: bcachefs: Fix trans_lock_write() On failure to get a write lock (because we had a conflicting read lock), we need to make sure to upgrade the read lock to an intent lock - or we could end up spinning. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 43ae2d83cfa7..fa6ba018378b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -569,7 +569,8 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct //if (path == pos) // break; - if (path->nodes_locked != path->nodes_intent_locked) + if (path->nodes_locked != path->nodes_intent_locked && + !bch2_btree_path_upgrade(trans, path, path->level + 1)) return true; } -- cgit From d647db314adb8b5b60224a7604566746e7203c22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 17:23:49 -0400 Subject: bcachefs: Improve error message in bch2_write_super() It's helpful to know what the superblock write is for. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index be080c407286..637408d76270 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -809,7 +809,8 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, - "Unable to write superblock to sufficient devices")) + "Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) ret = -1; out: /* Make new options visible after they're persistent: */ -- cgit From 47f80bbf38df59e41c98fd1f9681cf63af82a5fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 20:25:35 -0400 Subject: bcachefs: Fix check_inodes() We were starting at the wrong btree position, and thus not actually checking any inodes - oops. Also, make check_key_has_snapshot() a mustfix fsck error, since later fsck code assumes that all keys have valid snapshot IDs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5bc04c7bbb83..8a9cfccf5ee8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -669,7 +669,7 @@ static int check_key_has_snapshot(struct btree_trans *trans, char buf[200]; int ret = 0; - if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, "key in missing snapshot: %s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) return bch2_btree_delete_at(trans, iter, @@ -918,8 +918,7 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - POS(BCACHEFS_ROOT_INO, 0), + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); -- cgit From c27314b448d3ef1aa44a82d77ff1e1eec0c5a682 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 21:22:46 -0400 Subject: bcachefs: Fix __remove_dirent() __lookup_inode() doesn't work for what __remove_dirent() wants - it just wants the first inode at a given inode number, they all have the same hash info. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8a9cfccf5ee8..9519ced976f2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -113,6 +113,35 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); } +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); +err: + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu", + ret, inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) @@ -272,7 +301,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) struct bch_hash_info dir_hash_info; int ret; - ret = __lookup_inode(trans, pos.inode, &dir_inode, NULL); + ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) return ret; -- cgit From 34d74830b25c7d0b4f6affda90225e4849296255 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 31 Dec 2022 00:15:23 -0500 Subject: bcachefs: BTREE_UPDATE_NOJOURNAL We're going to have btree updates that don't need to be journalled; add a flag for that. 
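A minimal sketch of the mechanism (standalone C with invented names, not the kernel structures): the flag is one more bit in the per-update flags word, and the commit path consults it both when sizing the journal reservation and when deciding whether to append a key to the journal.

  enum update_flags {
          UPDATE_INTERNAL_SNAPSHOT_NODE = 1U << 0,
          UPDATE_NOJOURNAL              = 1U << 1,  /* skip journalling this update */
  };

  struct update { unsigned flags, u64s; /* ... the key itself ... */ };

  static unsigned journal_u64s_needed(const struct update *updates, unsigned nr)
  {
          unsigned i, total = 0;

          for (i = 0; i < nr; i++)
                  if (!(updates[i].flags & UPDATE_NOJOURNAL))
                          total += updates[i].u64s;  /* only journalled updates reserve space */

          return total;
  }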
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 14acbdf34f7b..d8c35ba9ec89 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -630,6 +630,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id) enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_NOJOURNAL, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -642,6 +643,7 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fa6ba018378b..b9c93182f2de 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -356,7 +356,8 @@ static inline void do_btree_insert_one(struct btree_trans *trans, if (!did_work) return; - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) && + !(i->flags & BTREE_UPDATE_NOJOURNAL)) { bch2_journal_add_keys(j, &trans->journal_res, i->btree_id, i->level, @@ -897,7 +898,9 @@ int __bch2_trans_commit(struct btree_trans *trans) if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; - trans->journal_u64s += u64s; + + if (!(i->flags & BTREE_UPDATE_NOJOURNAL)) + trans->journal_u64s += u64s; } if (trans->extra_journal_res) { -- cgit From 1db84979c7b640c15abae8a013485546bcca3623 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 22:35:34 -0400 Subject: bcachefs: Update inode on every write This is going to be a performance regression until we get the btree key cache re-enabled - but it's needed for fixing fsync. Upcoming patches will record the journal_seq an inode was updated at in the inode itself. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 76 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bf04b61ae882..ca4e7a5a64b9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -279,7 +279,10 @@ int bch2_extent_update(struct btree_trans *trans, { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; struct bpos next_pos; + struct bkey_s_c inode; bool extending = false, usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -298,6 +301,9 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; + new_i_size = min(k->k.p.offset << 9, new_i_size); + next_pos = k->k.p; + ret = bch2_sum_sector_overwrites(trans, iter, k, &extending, &usage_increasing, @@ -306,14 +312,11 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - if (!usage_increasing) - check_enospc = false; - if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - !check_enospc + !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; @@ -323,26 +326,25 @@ int bch2_extent_update(struct btree_trans *trans, ? 
min(k->k.p.offset << 9, new_i_size) : 0; - if (i_sectors_delta || new_i_size) { - struct btree_iter inode_iter; - struct bch_inode_unpacked inode_u; + bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inum.inum, iter->snapshot), + BTREE_ITER_INTENT| + (trans->c->opts.inodes_use_key_cache + ? BTREE_ITER_CACHED + : 0)); + inode = bch2_btree_iter_peek_slot(&inode_iter); + ret = bkey_err(inode); + if (ret) + goto err; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, - BTREE_ITER_INTENT); - if (ret) - return ret; + ret = inode.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + if (ret) + goto err; - /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. - * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - */ - BUG_ON(new_i_size > inode_u.bi_size && !extending); + if (i_sectors_delta || new_i_size) { + ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), &inode_u); + if (ret) + goto err; if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && new_i_size > inode_u.bi_size) @@ -351,36 +353,38 @@ int bch2_extent_update(struct btree_trans *trans, new_i_size = 0; inode_u.bi_sectors += i_sectors_delta; + } - if (i_sectors_delta || new_i_size) { - bch2_inode_pack(trans->c, &inode_p, &inode_u); - - inode_p.inode.k.p.snapshot = iter->snapshot; + if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); - ret = bch2_trans_update(trans, &inode_iter, - &inode_p.inode.k_i, 0); - } + inode_p.inode.k.p.snapshot = iter->snapshot; - bch2_trans_iter_exit(trans, &inode_iter); + ret = bch2_trans_update(trans, &inode_iter, + &inode_p.inode.k_i, 0); + } else { + bkey_reassemble(&inode_p.inode.k_i, inode); + ret = bch2_trans_update(trans, &inode_iter, + &inode_p.inode.k_i, + BTREE_UPDATE_NOJOURNAL); if (ret) - return ret; + goto err; } - next_pos = k->k.p; - ret = bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); - BUG_ON(ret == -ENOSPC); +err: + bch2_trans_iter_exit(trans, &inode_iter); if (ret) return ret; - bch2_btree_iter_set_pos(iter, next_pos); - if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; } -- cgit From 3e52c22255143bb86860abf26ef29a077ac30314 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Oct 2021 21:14:23 -0400 Subject: bcachefs: Add journal_seq to inode & alloc keys Add fields to inode & alloc keys that record the journal sequence number when they were most recently modified. For alloc keys, this is needed to know what journal sequence number we have to flush before the bucket can be reused. Currently this is tracked in memory, but we'll be getting rid of the in memory bucket array. For inodes, this is needed for fsync when the inode has been evicted from the vfs cache. Currently we use a bloom filter per outstanding journal buf - but that mechanism has been broken since we added the ability to not issue a flush/fua for every journal write. 
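For orientation, a minimal sketch (not the patch itself) of how the new inode field is meant to be consumed on the fsync side; the real implementation lands later in this series as bch2_flush_inode(), and the helper name below is made up for illustration. The alloc-side consumer - deciding what journal sequence number must be flushed before a bucket can be reused - follows the same pattern.

    /* Sketch: flush everything a given inode's last update depends on. */
    static int flush_inode_sketch(struct bch_fs *c, subvol_inum inum)
    {
            struct bch_inode_unpacked u;
            int ret;

            ret = bch2_inode_find_by_inum(c, inum, &u);
            if (ret)
                    return ret;

            /* bi_journal_seq was stamped by the insert trigger at commit time */
            return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq);
    }
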
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 76 +++++++++++++-- fs/bcachefs/alloc_background.h | 26 +++-- fs/bcachefs/bcachefs_format.h | 31 +++++- fs/bcachefs/bkey_methods.c | 4 +- fs/bcachefs/btree_types.h | 7 +- fs/bcachefs/buckets.c | 41 ++++++-- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 58 +++++------ fs/bcachefs/inode.c | 211 ++++++++++++++++++++--------------------- fs/bcachefs/inode.h | 17 +++- fs/bcachefs/io.c | 4 +- fs/bcachefs/move.c | 4 +- fs/bcachefs/quota.c | 5 +- fs/bcachefs/recovery.c | 7 +- 14 files changed, 307 insertions(+), 186 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3b6af70fa186..10514476cffe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -147,10 +147,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, return 0; } -static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, +static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); unsigned nr_fields = 0, last_nonzero_fieldnr = 0; u8 *out = a->v.data; u8 *end = (void *) &dst[1]; @@ -161,6 +195,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, a->v.gen = src.gen; a->v.oldest_gen = src.oldest_gen; a->v.data_type = src.data_type; + a->v.journal_seq = cpu_to_le64(src.journal_seq); #define x(_name, _bits) \ nr_fields++; \ @@ -194,10 +229,17 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) .gen = 0, }; - if (k.k->type == KEY_TYPE_alloc_v2) - bch2_alloc_unpack_v2(&ret, k); - else if (k.k->type == KEY_TYPE_alloc) + switch (k.k->type) { + case KEY_TYPE_alloc: bch2_alloc_unpack_v1(&ret, k); + break; + case KEY_TYPE_alloc_v2: + bch2_alloc_unpack_v2(&ret, k); + break; + case KEY_TYPE_alloc_v3: + bch2_alloc_unpack_v3(&ret, k); + break; + } return ret; } @@ -206,7 +248,7 @@ void bch2_alloc_pack(struct bch_fs *c, struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v2(dst, src); + bch2_alloc_pack_v3(dst, src); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -249,13 +291,28 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } +const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; + + return NULL; +} + void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - pr_buf(out, "gen %u oldest_gen %u data_type 
%s", - u.gen, u.oldest_gen, bch2_data_types[u.data_type]); + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", + u.gen, u.oldest_gen, bch2_data_types[u.data_type], + u.journal_seq); #define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); BCH_ALLOC_FIELDS_V2() #undef x @@ -268,8 +325,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) struct bucket *g; struct bkey_alloc_unpacked u; - if (k.k->type != KEY_TYPE_alloc && - k.k->type != KEY_TYPE_alloc_v2) + if (!bkey_is_alloc(k.k)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index a4f6bf56b18f..370573f8e05d 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -9,6 +9,7 @@ extern const char * const bch2_allocator_states[]; struct bkey_alloc_unpacked { + u64 journal_seq; u64 bucket; u8 dev; u8 gen; @@ -21,19 +22,11 @@ struct bkey_alloc_unpacked { struct bkey_alloc_buf { struct bkey_i k; + struct bch_alloc_v3 v; - union { - struct { #define x(_name, _bits) + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; #undef x - } _v1; - struct { -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; -#undef x - } _v2; - }; } __attribute__((packed, aligned(8))); /* How out of date a pointer gen is allowed to be: */ @@ -79,6 +72,7 @@ alloc_mem_to_key(struct btree_iter *iter, const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ @@ -91,6 +85,18 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } +#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +static inline bool bkey_is_alloc(const struct bkey *k) +{ + return k->type == KEY_TYPE_alloc || + k->type == KEY_TYPE_alloc_v2 || + k->type == KEY_TYPE_alloc_v3; +} + int bch2_alloc_read(struct bch_fs *); static inline void bch2_wake_allocator(struct bch_dev *ca) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8e1423b138a6..21f1948ef8d0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -348,7 +348,9 @@ static inline void bkey_init(struct bkey *k) x(indirect_inline_data, 19) \ x(alloc_v2, 20) \ x(subvolume, 21) \ - x(snapshot, 22) + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -685,6 +687,16 @@ struct bch_inode { __u8 fields[0]; } __attribute__((packed, aligned(8))); +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __attribute__((packed, aligned(8))); + struct bch_inode_generation { struct bch_val v; @@ -776,6 +788,9 @@ LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + /* Dirents */ /* @@ -870,6 +885,17 @@ 
struct bch_alloc_v2 { x(stripe, 32) \ x(stripe_redundancy, 8) +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() @@ -1276,7 +1302,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_snapshot_2 = 15, bcachefs_metadata_version_reflink_p_fix = 16, bcachefs_metadata_version_subvol_dirent = 17, - bcachefs_metadata_version_max = 18, + bcachefs_metadata_version_inode_v2 = 18, + bcachefs_metadata_version_max = 19, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f7f4139072b5..c93004741b87 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -113,6 +113,7 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = (1U << KEY_TYPE_deleted)| @@ -128,7 +129,8 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_alloc] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| - (1U << KEY_TYPE_alloc_v2), + (1U << KEY_TYPE_alloc_v2)| + (1U << KEY_TYPE_alloc_v3), [BKEY_TYPE_quotas] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d8c35ba9ec89..5331626e62a5 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -606,6 +606,7 @@ static inline bool btree_node_is_extents(struct btree *b) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ (1U << BKEY_TYPE_stripes)| \ (1U << BKEY_TYPE_snapshots)) @@ -655,8 +656,12 @@ enum btree_update_flags { #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_stripe)| \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_stripe)| \ (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ (1U << KEY_TYPE_snapshot)) static inline bool btree_node_type_needs_gc(enum btree_node_type type) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6e1837a0fc64..b51b1cf3ca25 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -13,6 +13,7 @@ #include "buckets.h" #include "ec.h" #include "error.h" +#include "inode.h" #include "movinggc.h" #include "recovery.h" #include "reflink.h" @@ -541,8 +542,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc && - new.k->type != KEY_TYPE_alloc_v2) + if (!bkey_is_alloc(new.k)) return 0; /* @@ -552,6 +552,15 @@ static int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_alloc_v3); + + v->journal_seq = cpu_to_le64(journal_seq); + } + ca = bch_dev_bkey_exists(c, new.k->p.inode); if (new.k->p.offset >= ca->mi.nbuckets) @@ -1095,12 +1104,24 @@ static int bch2_mark_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; + u64 journal_seq = trans->journal_res.seq; - 
preempt_disable(); - fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; - fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; - preempt_enable(); + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v2); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + preempt_enable(); + } return 0; } @@ -1219,6 +1240,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, switch (k.k->type) { case KEY_TYPE_alloc: case KEY_TYPE_alloc_v2: + case KEY_TYPE_alloc_v3: return bch2_mark_alloc(trans, old, new, flags); case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: @@ -1228,6 +1250,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, case KEY_TYPE_stripe: return bch2_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: return bch2_mark_inode(trans, old, new, flags); case KEY_TYPE_reservation: return bch2_mark_reservation(trans, old, new, flags); @@ -1685,8 +1708,7 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) { - int nr = (new.k->type == KEY_TYPE_inode) - - (old.k->type == KEY_TYPE_inode); + int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); if (nr) { struct replicas_delta_list *d = @@ -1834,6 +1856,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, case KEY_TYPE_stripe: return bch2_trans_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: return bch2_trans_mark_inode(trans, old, new, flags); case KEY_TYPE_reservation: return bch2_trans_mark_reservation(trans, k, flags); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7647e117013d..64627543fe17 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1207,7 +1207,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, inode->v.i_size = bi->bi_size; inode->ei_flags = 0; - inode->ei_journal_seq = 0; + inode->ei_journal_seq = bi->bi_journal_seq; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9519ced976f2..361dbf338023 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -133,7 +133,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, goto err; } - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); err: if (ret && ret != -EINTR) bch_err(trans->c, "error %i fetching inode %llu", @@ -157,8 +157,8 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (ret) goto err; - ret = k.k->type == KEY_TYPE_inode - ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) : -ENOENT; if (!ret) *snapshot = iter.pos.snapshot; @@ -261,7 +261,7 @@ retry: if (ret) goto err; - if (k.k->type != KEY_TYPE_inode) { + if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans->c, "inode %llu:%u not found when deleting", inum, snapshot); @@ -269,7 +269,7 @@ retry: goto err; } - bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + bch2_inode_unpack(k, &inode_u); /* Subvolume root? 
*/ if (inode_u.bi_subvol) { @@ -581,7 +581,7 @@ static int inode_walker_realloc(struct inode_walker *w) } static int add_inode(struct bch_fs *c, struct inode_walker *w, - struct bkey_s_c_inode inode) + struct bkey_s_c inode) { struct bch_inode_unpacked u; int ret; @@ -623,8 +623,8 @@ static int __walk_inode(struct btree_trans *trans, if (k.k->p.offset != pos.inode) break; - if (k.k->type == KEY_TYPE_inode) - add_inode(c, w, bkey_s_c_to_inode(k)); + if (bkey_is_inode(k.k)) + add_inode(c, w, k); } bch2_trans_iter_exit(trans, &iter); @@ -676,11 +676,11 @@ static int __get_visible_inodes(struct btree_trans *trans, if (k.k->p.offset != inum) break; - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { - add_inode(c, w, bkey_s_c_to_inode(k)); + add_inode(c, w, k); if (k.k->p.snapshot >= s->pos.snapshot) break; } @@ -805,7 +805,6 @@ static int check_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; bool do_update = false; int ret; @@ -830,19 +829,17 @@ static int check_inode(struct btree_trans *trans, if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) return 0; - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) return 0; - inode = bkey_s_c_to_inode(k); + BUG_ON(bch2_inode_unpack(k, &u)); if (!full && - !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) + !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) return 0; - BUG_ON(bch2_inode_unpack(inode, &u)); - if (prev->bi_inum != u.bi_inum) *prev = u; @@ -1963,10 +1960,10 @@ static int check_directory_structure(struct bch_fs *c) BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + ret = bch2_inode_unpack(k, &u); if (ret) { /* Should have been caught earlier in fsck: */ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); @@ -2070,7 +2067,6 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; int ret = 0; @@ -2081,21 +2077,19 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); /* * Backpointer and directory structure checks are sufficient for * directories, since they can't have hardlinks: */ - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + if (S_ISDIR(le16_to_cpu(u.bi_mode))) continue; - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(inode, &u)); - if (!u.bi_nlink) continue; @@ -2169,7 +2163,6 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; struct nlink *link = links->d; int ret = 0; @@ -2184,14 +2177,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, if (k.k->p.offset >= range_end) break; - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); - if 
(S_ISDIR(le16_to_cpu(inode.v->bi_mode))) - continue; + BUG_ON(bch2_inode_unpack(k, &u)); - BUG_ON(bch2_inode_unpack(inode, &u)); + if (S_ISDIR(le16_to_cpu(u.bi_mode))) + continue; if (!u.bi_nlink) continue; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3ae321a99cee..728545141a39 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -35,29 +35,6 @@ static const u8 bits_table[8] = { 13 * 8 - 8, }; -static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -{ - __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; - unsigned shift, bytes, bits = likely(!hi) - ? fls64(lo) - : fls64(hi) + 64; - - for (shift = 1; shift <= 8; shift++) - if (bits < bits_table[shift - 1]) - goto got_shift; - - BUG(); -got_shift: - bytes = byte_table[shift - 1]; - - BUG_ON(out + bytes > end); - - memcpy(out, (u8 *) in + 16 - bytes, bytes); - *out |= (1 << 8) >> shift; - - return bytes; -} - static int inode_decode_field(const u8 *in, const u8 *end, u64 out[2], unsigned *out_bits) { @@ -92,42 +69,11 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - -#define x(_name, _bits) \ - out += inode_encode_field(out, end, 0, inode->_name); \ - nr_fields++; \ - \ - if (inode->_name) { \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } - - BCH_INODE_FIELDS() -#undef x - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - struct bkey_i_inode *k = &packed->inode; + struct bkey_i_inode_v2 *k = &packed->inode; u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; @@ -135,6 +81,14 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, unsigned bytes; int ret; + bkey_inode_v2_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + #define x(_name, _bits) \ nr_fields++; \ \ @@ -165,30 +119,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - - if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { - SET_INODE_NEW_VARINT(&packed->inode.v, true); - 
bch2_inode_pack_v2(packed, inode); - } else { - bch2_inode_pack_v1(packed, inode); - } + SET_INODEv2_NR_FIELDS(&k->v, nr_fields); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), + int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); @@ -237,17 +173,16 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return 0; } -static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + const u8 *in, const u8 *end, + unsigned nr_fields) { - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); unsigned fieldnr = 0; int ret; u64 v[2]; #define x(_name, _bits) \ - if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + if (fieldnr < nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v[0]); \ if (ret < 0) \ return ret; \ @@ -277,21 +212,43 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, return 0; } -int bch2_inode_unpack(struct bkey_s_c_inode inode, +int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - if (INODE_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(inode, unpacked); - } else { - return bch2_inode_unpack_v1(inode, unpacked); + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODE_NR_FIELDS(inode.v)); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } + break; + } + case KEY_TYPE_inode_v2: { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODEv2_NR_FIELDS(inode.v)); + } + default: + BUG(); } - - return 0; } int bch2_inode_peek(struct btree_trans *trans, @@ -317,11 +274,11 @@ int bch2_inode_peek(struct btree_trans *trans, if (ret) goto err; - ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + ret = bkey_is_inode(k.k) ? 
0 : -ENOENT; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; @@ -363,7 +320,43 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) return "invalid str hash type"; - if (bch2_inode_unpack(inode, &unpacked)) + if (bch2_inode_unpack(k, &unpacked)) + return "invalid variable length fields"; + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + + return NULL; +} + +const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + struct bch_inode_unpacked unpacked; + + if (k.k->p.inode) + return "nonzero k.p.inode"; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + + if (k.k->p.offset < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; + + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + + if (bch2_inode_unpack(k, &unpacked)) return "invalid variable length fields"; if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) @@ -384,10 +377,12 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); + pr_buf(out, "mode %o flags %x journal_seq %llu", + inode->bi_mode, inode->bi_flags, + inode->bi_journal_seq); #define x(_name, _bits) \ - pr_buf(out, #_name " %llu ", (u64) inode->_name); + pr_buf(out, " "#_name " %llu", (u64) inode->_name); BCH_INODE_FIELDS() #undef x } @@ -401,15 +396,14 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bch_inode_unpacked inode; - if (bch2_inode_unpack(inode, &unpacked)) { + if (bch2_inode_unpack(k, &inode)) { pr_buf(out, "(unpack error)"); return; } - __bch2_inode_unpacked_to_text(out, &unpacked); + __bch2_inode_unpacked_to_text(out, &inode); } const char *bch2_inode_generation_invalid(const struct bch_fs *c, @@ -485,6 +479,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: BUG(); case KEY_TYPE_inode_generation: return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); @@ -542,7 +537,7 @@ again: } if (k.k->p.snapshot == snapshot && - k.k->type != KEY_TYPE_inode && + !bkey_is_inode(k.k) && !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { bch2_btree_iter_advance(iter); continue; @@ -585,7 +580,7 @@ found_slot: } /* We may have raced while the iterator wasn't pointing at pos: */ - if (k.k->type == KEY_TYPE_inode || + if (bkey_is_inode(k.k) || bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) goto again; @@ -698,7 +693,7 @@ retry: if (ret) goto err; - if (k.k->type != KEY_TYPE_inode) { + if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans.c, "inode %llu not found 
when deleting", inum.inum); @@ -706,7 +701,7 @@ retry: goto err; } - bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + bch2_inode_unpack(k, &inode_u); /* Subvolume root? */ BUG_ON(inode_u.bi_subvol); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 009b807cc167..d433d48de4e0 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,6 +7,7 @@ extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ @@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ } +#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ +} + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || + k->type == KEY_TYPE_inode_v2; +} + const char *bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, @@ -34,6 +46,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u64 bi_journal_seq; __le64 bi_hash_seed; u32 bi_flags; u16 bi_mode; @@ -44,7 +57,7 @@ struct bch_inode_unpacked { }; struct bkey_inode_buf { - struct bkey_i_inode inode; + struct bkey_i_inode_v2 inode; #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; @@ -53,7 +66,7 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ca4e7a5a64b9..0a9cb4d489f4 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -337,12 +337,12 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) goto err; - ret = inode.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + ret = bkey_is_inode(inode.k) ? 0 : -ENOENT; if (ret) goto err; if (i_sectors_delta || new_i_size) { - ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), &inode_u); + ret = bch2_inode_unpack(inode, &inode_u); if (ret) goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2f260360b089..249d0b2be167 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -623,11 +623,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, goto err; } - ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + ret = bkey_is_inode(k.k) ? 
0 : -EIO; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; err: diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 17fd5bf107bb..5f1216da76d0 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -439,9 +439,8 @@ int bch2_fs_quota_read(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - switch (k.k->type) { - case KEY_TYPE_inode: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, &u); if (ret) return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 29fae6dbce76..d8e511a0664e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1015,13 +1015,13 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) if (ret) goto err; - if (k.k->type != KEY_TYPE_inode) { + if (!bkey_is_inode(k.k)) { bch_err(c, "root inode not found"); ret = -ENOENT; goto err; } - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode); + ret = bch2_inode_unpack(k, &inode); BUG_ON(ret); inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; @@ -1093,6 +1093,9 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); c->opts.version_upgrade = true; c->opts.fsck = true; + } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { + bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + c->opts.version_upgrade = true; } } -- cgit From 0e030f5e2014bf9a33e977820cf64fce4258cf1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Nov 2021 22:33:32 -0400 Subject: bcachefs: Kill journal buf bloom filter This was used for recording which inodes have been modified by in flight journal writes, but was broken and has been superceded. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 --- fs/bcachefs/fs.c | 4 ---- fs/bcachefs/journal.c | 51 ----------------------------------------- fs/bcachefs/journal.h | 15 ------------ fs/bcachefs/journal_types.h | 2 -- 5 files changed, 75 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b9c93182f2de..4e9f7e3b5a61 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -363,9 +363,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->level, i->k); - bch2_journal_set_has_inode(j, &trans->journal_res, - i->k->k.p.inode); - if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 64627543fe17..12178bd15c34 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -58,8 +58,6 @@ static void journal_seq_copy(struct bch_fs *c, if (old >= journal_seq) break; } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); - - bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } static void __pagecache_lock_put(struct pagecache_lock *lock, long i) @@ -258,8 +256,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) bch2_vfs_inode_init(c, inum, inode, &inode_u); - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); - unlock_new_inode(&inode->v); return &inode->v; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f72e3124d351..1abd1ac560e6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -87,8 +87,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->must_flush = false; buf->separate_flush = false; - memset(buf->has_inode, 0, sizeof(buf->has_inode)); - memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; @@ -334,55 +332,6 @@ static void journal_write_work(struct work_struct *work) journal_entry_close(j); } -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - union journal_res_state s; - unsigned i; - u64 seq; - - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - s = READ_ONCE(j->reservations); - i = s.idx; - - while (1) { - if (test_bit(h, j->buf[i].has_inode)) - goto out; - - if (i == s.unwritten_idx) - break; - - i = (i - 1) & JOURNAL_BUF_MASK; - seq--; - } - - seq = 0; -out: - spin_unlock(&j->lock); - - return seq; -} - -void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - struct journal_buf *buf; - - spin_lock(&j->lock); - - if ((buf = journal_seq_to_buf(j, seq))) - set_bit(h, buf->has_inode); - - spin_unlock(&j->lock); -} - static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 99fd253648bf..2cfb6c7f0d14 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -141,9 +141,6 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } -u64 bch2_inode_journal_seq(struct journal *, u64); -void bch2_journal_set_has_inum(struct journal *, u64, u64); - static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { @@ -163,18 +160,6 @@ 
static inline void journal_state_inc(union journal_res_state *s) s->buf3_count += s->idx == 3; } -static inline void bch2_journal_set_has_inode(struct journal *j, - struct journal_res *res, - u64 inum) -{ - struct journal_buf *buf = &j->buf[res->idx]; - unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); - - /* avoid atomic op if possible */ - if (unlikely(!test_bit(bit, buf->has_inode))) - set_bit(bit, buf->has_inode); -} - /* * Amount of space that will be taken up by some keys in the journal (i.e. * including the jset header) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 0647a53eb35c..0fc6569ef149 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -34,8 +34,6 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; - /* bloom filter: */ - unsigned long has_inode[1024 / sizeof(unsigned long)]; }; /* -- cgit From e15a57ac05a9384d81f340ff870633dde62e5d5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Nov 2021 11:44:13 -0400 Subject: bcachefs: Kill bucket quantiles sysfs code We're getting rid of code that uses the in memory bucket array - and we now have better mechanisms for viewing most of what the bucket quantiles code gave us (especially internal fragmentation). Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 90 ----------------------------------------------------- 1 file changed, 90 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 51eb19b84a28..864be8601868 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -155,11 +155,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(bucket_quantiles_last_read); -read_attribute(bucket_quantiles_last_write); -read_attribute(bucket_quantiles_fragmentation); -read_attribute(bucket_quantiles_oldest_gen); - read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -751,76 +746,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, - size_t, void *); - -static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - int rw = (private ? 
1 : 0); - - return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; -} - -static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - struct bucket *g = bucket(ca, b); - return bucket_sectors_used(g->mark); -} - -static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - return bucket_gc_gen(bucket(ca, b)); -} - -static int unsigned_cmp(const void *_l, const void *_r) -{ - const unsigned *l = _l; - const unsigned *r = _r; - - return cmp_int(*l, *r); -} - -static int quantiles_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_dev *ca, - bucket_map_fn *fn, void *private) -{ - size_t i, n; - /* Compute 31 quantiles */ - unsigned q[31], *p; - - down_read(&ca->bucket_lock); - n = ca->mi.nbuckets; - - p = vzalloc(n * sizeof(unsigned)); - if (!p) { - up_read(&ca->bucket_lock); - return -ENOMEM; - } - - for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(c, ca, i, private); - - sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); - up_read(&ca->bucket_lock); - - while (n && - !p[n - 1]) - --n; - - for (i = 0; i < ARRAY_SIZE(q); i++) - q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; - - vfree(p); - - for (i = 0; i < ARRAY_SIZE(q); i++) - pr_buf(out, "%u ", q[i]); - pr_buf(out, "\n"); - return 0; -} - static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) { enum alloc_reserve i; @@ -982,15 +907,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_bucket_quantiles_last_read) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_last_write) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_fragmentation) - return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_oldest_gen) - return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) { reserve_stats_to_text(&out, ca); return out.pos - buf; @@ -1082,12 +998,6 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - /* alloc info - other stats: */ - &sysfs_bucket_quantiles_last_read, - &sysfs_bucket_quantiles_last_write, - &sysfs_bucket_quantiles_fragmentation, - &sysfs_bucket_quantiles_oldest_gen, - &sysfs_reserve_stats, /* debug: */ -- cgit From 68a2054d88f7cd2866806148d9a2e4389eb46992 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Nov 2021 15:17:13 -0400 Subject: bcachefs: Switch fsync to use bi_journal_seq Now that we're recording in each inode the journal sequence number of the most recent update, fsync becomes a lot simpler and we can delete all the plumbing for ei_journal_seq. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 +-- fs/bcachefs/fs-io.c | 58 +++++++++++++++++++++++++-------------------------- fs/bcachefs/fs.c | 52 +++++++-------------------------------------- fs/bcachefs/fs.h | 1 - fs/bcachefs/io.c | 9 ++++---- fs/bcachefs/io.h | 10 ++------- fs/bcachefs/reflink.c | 8 +++---- fs/bcachefs/reflink.h | 2 +- fs/bcachefs/xattr.c | 18 +++++++++++++++- 9 files changed, 65 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 2afa15b26700..51a0b48a5313 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -330,8 +330,7 @@ retry: inode_u.bi_mode = mode; ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, 0); + bch2_trans_commit(&trans, NULL, NULL, 0); btree_err: bch2_trans_iter_exit(&trans, &inode_iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f4c97fc0e3d1..7de6b7a7aa60 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1096,7 +1096,6 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; - op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); @@ -1947,7 +1946,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; - op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; @@ -2164,29 +2162,36 @@ unlock: /* fsync: */ -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +/* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2; + struct bch_inode_unpacked inode; + int ret; - ret = file_write_and_wait_range(file, start, end); + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inum, &inode); if (ret) return ret; - if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) - goto out; + return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); +} - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - return ret; -out: - if (!c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, - inode->ei_journal_seq); - ret2 = file_check_and_advance_wb_err(file); +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; + + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2; + return ret ?: ret2 ?: ret3; } /* truncate: */ @@ -2448,7 +2453,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); if 
(unlikely(ret)) @@ -2508,7 +2513,6 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len ret = bch2_fpunch(c, inode_inum(inode), discard_start, discard_end, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } @@ -2587,7 +2591,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); @@ -2691,8 +2694,7 @@ reassemble: ret = bch2_btree_iter_traverse(&del) ?: bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, + bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); @@ -2803,7 +2805,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ret = bch2_extent_update(&trans, inode_inum(inode), &iter, &reservation.k_i, - &disk_res, &inode->ei_journal_seq, + &disk_res, NULL, 0, &i_sectors_delta, true); i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: @@ -3003,7 +3005,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); if (ret < 0) goto err; @@ -3021,10 +3022,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); - if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) && - !c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 12178bd15c34..92919b16f2f5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -41,25 +41,6 @@ static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); -static void journal_seq_copy(struct bch_fs *c, - struct bch_inode_info *dst, - u64 journal_seq) -{ - /* - * atomic64_cmpxchg has a fallback for archs that don't support it, - * cmpxchg does not: - */ - atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; - u64 old, v = READ_ONCE(dst->ei_journal_seq); - - do { - old = v; - - if (old >= journal_seq) - break; - } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); -} - static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { BUG_ON(atomic_long_read(&lock->v) == 0); @@ -152,9 +133,7 @@ retry: BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL); + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -329,7 +308,6 @@ err_before_quota: if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } @@ -337,7 +315,6 @@ err_before_quota: inum.inum = inode_u.bi_inum; bch2_vfs_inode_init(c, inum, inode, &inode_u); - journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -362,7 +339,6 @@ err_before_quota: * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -446,7 +422,7 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, @@ -455,7 +431,6 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -498,7 +473,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, @@ -508,7 +483,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, @@ -550,8 +524,6 @@ static int bch2_symlink(struct mnt_idmap *idmap, if (unlikely(ret)) goto err; - journal_seq_copy(c, dir, inode->ei_journal_seq); - ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; @@ -586,7 +558,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; - u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) @@ -626,7 +597,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, @@ -644,23 +615,17 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, src_dir, journal_seq); - if (src_dir != dst_dir) { + if (src_dir != dst_dir) bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dst_dir, journal_seq); - } bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - journal_seq_copy(c, src_inode, journal_seq); - if (dst_inode) { + if (dst_inode) bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(c, dst_inode, journal_seq); - } err: bch2_trans_exit(&trans); @@ -767,8 +732,7 @@ retry: } ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: bch2_trans_iter_exit(&trans, &inode_iter); @@ -1203,7 +1167,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, inode->v.i_size = bi->bi_size; inode->ei_flags = 0; - inode->ei_journal_seq = bi->bi_journal_seq; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; @@ -1242,7 +1205,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_journal_seq = 0; return &inode->v; } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 38c04282da64..1c8936df9fbb 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -36,7 +36,6 @@ struct bch_inode_info { unsigned long ei_flags; struct mutex ei_update_lock; - u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; struct pagecache_lock ei_pagecache_lock; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0a9cb4d489f4..dc41286c229e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -393,7 +393,7 @@ err: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, subvol_inum inum, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) + s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); @@ -431,7 +431,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, journal_seq, + &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); btree_err: @@ -450,7 +450,7 @@ btree_err: } int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) + s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter iter; @@ -461,8 +461,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, POS(inum.inum, start), BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, &iter, inum, end, - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h 
index ebb0944b4ca3..8be77561badb 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -68,12 +68,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? op->journal_seq_p : &op->journal_seq; } -static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -{ - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC @@ -88,8 +82,8 @@ int bch2_extent_update(struct btree_trans *, subvol_inum, struct disk_reservation *, u64 *, u64, s64 *, bool); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8e66e6390e62..d003f4088dfc 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -210,7 +210,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, u64 *journal_seq, + u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; @@ -281,7 +281,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(dst_end.offset, dst_iter.pos.offset + src_iter.pos.offset - src_want.offset), - journal_seq, i_sectors_delta); + i_sectors_delta); continue; } @@ -320,7 +320,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset - dst_iter.pos.offset)); ret = bch2_extent_update(&trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, journal_seq, + new_dst.k, &disk_res, NULL, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); @@ -347,7 +347,7 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, 0); + bch2_trans_commit(&trans, NULL, NULL, 0); } bch2_trans_iter_exit(&trans, &inode_iter); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4c1b82860b0b..3745873fd88d 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -58,6 +58,6 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64 *, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 181af89b0553..21823ce69237 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -165,8 +165,24 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const char *name, const void *value, size_t size, int type, int flags) { + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; int ret; + /* + * We need to do an inode update so that bi_journal_sync gets updated + * and fsync works: + * + * Perhaps we should be updating bi_mtime too? 
+ */ + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + bch2_inode_write(trans, &inode_iter, &inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + if (value) { struct bkey_i_xattr *xattr; unsigned namelen = strlen(name); @@ -352,7 +368,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, + return bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); -- cgit From 076c783cd31f0d46782f9365fd79725d1248712d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Nov 2021 00:05:12 -0400 Subject: bcachefs: Fix upgrade path for reflink_p fix Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b51b1cf3ca25..d4d41646b2e6 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1207,11 +1207,15 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 end_idx = le64_to_cpu(p.v->idx) + p.k->size + - le32_to_cpu(p.v->back_pad); + u64 idx = le64_to_cpu(p.v->idx); + u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } + l = 0; r = c->reflink_gc_nr; while (l < r) { @@ -1224,7 +1228,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, r = m; } - while (idx < end_idx && !ret) + while (idx < end && !ret) ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); return ret; -- cgit From 7be9ab637fb9c5283c5d1b3f41d81bc2c01ad548 Mon Sep 17 00:00:00 2001 From: Chris Webb Date: Thu, 4 Nov 2021 21:03:16 +0000 Subject: bcachefs: Return -ENOKEY/EINVAL when mount decryption fails bch2_fs_encryption_init() correctly passes back -ENOKEY from request_key() when no unlock key is found, or -EINVAL if superblock decryption fails because of an invalid key. However, these get absorbed into a generic NULL return from bch2_fs_alloc() and later returned to user space as -ENOMEM, leading to a misleading error from mount(1): mount(2) system call failed: Out of memory. Return explicit error pointers out of bch2_fs_alloc() and handle them in both callers, so the user instead sees mount(2) system call failed: Required key not available. when attempting to mount a filesystem which is still locked. 
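For illustration only — a minimal sketch of the error-pointer convention the patch adopts on the caller side, assuming the standard <linux/err.h> helpers (the real hunks are in the diff below):

	c = bch2_fs_alloc(sb[best_sb].sb, opts);
	if (IS_ERR(c)) {
		ret = PTR_ERR(c);	/* e.g. -ENOKEY when the unlock key is missing */
		goto err;
	}

This lets mount(1) report the real cause instead of a blanket out-of-memory error.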
Signed-off-by: Chris Webb Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 501fe129ea9c..ea5f8269c3f5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -646,12 +646,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) struct bch_fs *c; unsigned i, iter_size; const char *err; + int ret = 0; pr_verbose_init(opts, ""); c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) + if (!c) { + c = ERR_PTR(-ENOMEM); goto out; + } __module_get(THIS_MODULE); @@ -732,13 +735,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->sectors_available_lock); - if (percpu_init_rwsem(&c->mark_lock)) + if (percpu_init_rwsem(&c->mark_lock)) { + ret = -ENOMEM; goto err; + } mutex_lock(&c->sb_lock); if (bch2_sb_to_fs(c, sb)) { mutex_unlock(&c->sb_lock); + ret = -ENOMEM; goto err; } @@ -753,8 +759,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->block_bits = ilog2(c->opts.block_size); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - if (bch2_fs_init_fault("fs_alloc")) + if (bch2_fs_init_fault("fs_alloc")) { + ret = -ENOMEM; goto err; + } iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * @@ -795,10 +803,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_interior_update_init(c) || bch2_fs_subvolumes_init(c) || bch2_fs_io_init(c) || - bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || bch2_fs_ec_init(c) || - bch2_fs_fsio_init(c)) + bch2_fs_fsio_init(c)) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_fs_encryption_init(c); + if (ret) goto err; if (c->opts.nochanges) @@ -807,8 +820,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && - bch2_dev_alloc(c, i)) + bch2_dev_alloc(c, i)) { + ret = -ENOMEM; goto err; + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, @@ -823,14 +838,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_unlock(&bch_fs_list_lock); if (err) { bch_err(c, "bch2_fs_online() error: %s", err); + ret = -ENOMEM; goto err; } out: - pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err: bch2_fs_free(c); - c = NULL; + c = ERR_PTR(ret); goto out; } @@ -1943,10 +1959,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, i++; } - ret = -ENOMEM; c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (!c) + if (IS_ERR(c)) { + ret = PTR_ERR(c); goto err; + } err = "bch2_dev_online() error"; down_write(&c->state_lock); @@ -1977,7 +1994,7 @@ err_print: devices[0], err); ret = -EINVAL; err: - if (c) + if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); for (i = 0; i < nr_devices; i++) bch2_free_super(&sb[i]); @@ -2006,12 +2023,12 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, if (err) goto err; } else { + allocated_fs = true; c = bch2_fs_alloc(sb->sb, opts); - err = "cannot allocate memory"; - if (!c) - goto err; - allocated_fs = true; + err = "bch2_fs_alloc() error"; + if (IS_ERR(c)) + goto err; } err = "bch2_dev_online() error"; @@ -2037,7 +2054,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, err: mutex_unlock(&bch_fs_list_lock); - if (allocated_fs) + if (allocated_fs && !IS_ERR(c)) bch2_fs_stop(c); else if (c) closure_put(&c->cl); -- cgit From e2b605601ad56904c700569a11fc73db7ecc7acd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Nov 2021 21:28:17 -0400 Subject: bcachefs: Clean up error reporting in the startup path It used to be that error reporting in the startup path was done by returning strings describing the error, but that turned out to be a rather silly idea - if there's something we can describe about the error, just print it right away. This converts a good chunk of code to returning error codes, as is more typical style. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 177 ++++++++++++++++++++++++++-------------------------- 1 file changed, 87 insertions(+), 90 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ea5f8269c3f5..b24e64317a73 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -596,48 +596,53 @@ void bch2_fs_stop(struct bch_fs *c) bch2_fs_free(c); } -static const char *bch2_fs_online(struct bch_fs *c) +static int bch2_fs_online(struct bch_fs *c) { struct bch_dev *ca; - const char *err = NULL; unsigned i; - int ret; + int ret = 0; lockdep_assert_held(&bch_fs_list_lock); - if (!list_empty(&c->list)) - return NULL; - - if (__bch2_uuid_to_fs(c->sb.uuid)) - return "filesystem UUID already open"; + if (__bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); + return -EINVAL; + } ret = bch2_fs_chardev_init(c); - if (ret) - return "error creating character device"; + if (ret) { + bch_err(c, "error creating character device"); + return ret; + } bch2_fs_debug_init(c); - if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || - kobject_add(&c->internal, &c->kobj, "internal") || - kobject_add(&c->opts_dir, &c->kobj, "options") || - kobject_add(&c->time_stats, &c->kobj, "time_stats") || - bch2_opts_create_sysfs_files(&c->opts_dir)) - return "error creating sysfs objects"; + ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + bch2_opts_create_sysfs_files(&c->opts_dir); + if (ret) { + bch_err(c, "error creating sysfs objects"); + return ret; + } down_write(&c->state_lock); - err = "error creating sysfs objects"; - for_each_member_device(ca, c, i) - if 
(bch2_dev_sysfs_online(c, ca)) { + for_each_member_device(ca, c, i) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); percpu_ref_put(&ca->ref); goto err; } + } + BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); - err = NULL; err: up_write(&c->state_lock); - return err; + return ret; } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) @@ -645,7 +650,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) struct bch_sb_field_members *mi; struct bch_fs *c; unsigned i, iter_size; - const char *err; int ret = 0; pr_verbose_init(opts, ""); @@ -735,20 +739,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->sectors_available_lock); - if (percpu_init_rwsem(&c->mark_lock)) { - ret = -ENOMEM; + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) goto err; - } mutex_lock(&c->sb_lock); + ret = bch2_sb_to_fs(c, sb); + mutex_unlock(&c->sb_lock); - if (bch2_sb_to_fs(c, sb)) { - mutex_unlock(&c->sb_lock); - ret = -ENOMEM; + if (ret) goto err; - } - - mutex_unlock(&c->sb_lock); scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); @@ -760,7 +760,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); if (bch2_fs_init_fault("fs_alloc")) { - ret = -ENOMEM; + bch_err(c, "fs_alloc fault injected"); + ret = -EFAULT; goto err; } @@ -792,25 +793,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL)) || - bch2_io_clock_init(&c->io_clock[READ]) || - bch2_io_clock_init(&c->io_clock[WRITE]) || - bch2_fs_journal_init(&c->journal) || - bch2_fs_replicas_init(c) || - bch2_fs_btree_cache_init(c) || - bch2_fs_btree_key_cache_init(&c->btree_key_cache) || - bch2_fs_btree_iter_init(c) || - bch2_fs_btree_interior_update_init(c) || - bch2_fs_subvolumes_init(c) || - bch2_fs_io_init(c) || - bch2_fs_compress_init(c) || - bch2_fs_ec_init(c) || - bch2_fs_fsio_init(c)) { + sizeof(u64), GFP_KERNEL))) { ret = -ENOMEM; goto err; } - ret = bch2_fs_encryption_init(c); + ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_replicas_init(c) ?: + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: + bch2_fs_fsio_init(c); if (ret) goto err; @@ -821,7 +822,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && bch2_dev_alloc(c, i)) { - ret = -ENOMEM; + ret = -EEXIST; goto err; } @@ -834,13 +835,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); mutex_lock(&bch_fs_list_lock); - err = bch2_fs_online(c); + ret = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); - if (err) { - bch_err(c, "bch2_fs_online() error: %s", err); - ret = -ENOMEM; + + if (ret) goto err; - } out: pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; @@ -886,7 +885,6 @@ static 
void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); @@ -922,10 +920,11 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - err = "dynamic fault"; ret = -EINVAL; - if (bch2_fs_init_fault("fs_start")) + if (bch2_fs_init_fault("fs_start")) { + bch_err(c, "fs_start fault injected"); goto err; + } set_bit(BCH_FS_STARTED, &c->flags); @@ -946,7 +945,6 @@ int bch2_fs_start(struct bch_fs *c) if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - err = "error going read write"; ret = !test_bit(BCH_FS_RW, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); @@ -964,25 +962,22 @@ err: case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("mount with -o fix_errors to repair\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_UNIMPLEMENTED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("repair unimplemented: inform the developers so that it can be added\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_IMPOSSIBLE: bch_err(c, "filesystem contains errors, but repair impossible"); - err = "fsck error"; break; case BCH_FSCK_UNKNOWN_VERSION: - err = "unknown metadata version";; + bch_err(c, "unknown metadata version"); break; case -ENOMEM: - err = "cannot allocate memory"; + bch_err(c, "cannot allocate memory"); break; case -EIO: - err = "IO error"; + bch_err(c, "IO error"); break; } @@ -1400,7 +1395,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) bch2_copygc_start(c); } -static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1409,10 +1404,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (bch2_dev_allocator_start(ca)) - return "error starting allocator thread"; - - return NULL; + return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1438,9 +1430,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_rw && - __bch2_dev_read_write(c, ca)) - ret = -ENOMEM; + if (new_state == BCH_MEMBER_STATE_rw) + ret = __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1724,8 +1715,8 @@ have_slot: goto err_late; if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if (err) + ret = __bch2_dev_read_write(c, ca); + if (ret) goto err_late; } @@ -1769,24 +1760,27 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (err) + if (err) { + bch_err(c, "error bringing %s online: %s", path, err); goto err; + } - if (bch2_dev_attach_bdev(c, &sb)) { - err = "bch2_dev_attach_bdev() error"; + ret = bch2_dev_attach_bdev(c, &sb); + if (ret) goto err; - } ca = bch_dev_locked(c, dev_idx); - if (bch2_trans_mark_dev_sb(c, ca)) { - err = "bch2_trans_mark_dev_sb() error"; + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", + path, ret); goto err; } if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if 
(err) + ret = __bch2_dev_read_write(c, ca); + if (ret) goto err; } @@ -1804,7 +1798,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) err: up_write(&c->state_lock); bch2_free_super(&sb); - bch_err(c, "error bringing %s online: %s", path, err); return -EINVAL; } @@ -1908,7 +1901,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; - int ret = -ENOMEM; + int ret = 0; pr_verbose_init(opts, ""); @@ -1923,8 +1916,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) + if (!sb) { + ret = -ENOMEM; goto err; + } for (i = 0; i < nr_devices; i++) { ret = bch2_read_super(devices[i], &opts, &sb[i]); @@ -1965,13 +1960,14 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - err = "bch2_dev_online() error"; down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) - if (bch2_dev_attach_bdev(c, &sb[i])) { + for (i = 0; i < nr_devices; i++) { + ret = bch2_dev_attach_bdev(c, &sb[i]); + if (ret) { up_write(&c->state_lock); - goto err_print; + goto err; } + } up_write(&c->state_lock); err = "insufficient devices"; @@ -1996,8 +1992,9 @@ err_print: err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); + if (sb) + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } -- cgit From 9a74f63c97a5c8fcfd0469a87afb0aa95f55f806 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Nov 2021 10:19:37 -0500 Subject: bcachefs: path->should_be_locked fixes - We should only be clearing should_be_locked in btree_path_set_pos() - it's the responsiblity of the btree_path code, not the btree_iter code. - bch2_path_put() needs to pay attention to path->should_be_locked, to ensure we don't drop locks we're supposed to be keeping. 
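In condensed form (lifted from the bch2_path_put() hunk below, not a complete function), the rule is: a path flagged should_be_locked may only be freed if a duplicate path actually holds the node lock, and the duplicate inherits the flag:

	if (path->should_be_locked &&
	    !btree_node_locked(dup, path->level))
		return;		/* keep the path: we must not drop this lock */

	dup->should_be_locked |= path->should_be_locked;
	__bch2_path_free(trans, path);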
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 25 +++++++++++++++++-------- fs/bcachefs/btree_iter.h | 2 -- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2c28e65fdeb5..94ba43626cde 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1654,19 +1654,19 @@ static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btr return NULL; } -static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) { struct btree_path *next; next = prev_btree_path(trans, path); - if (next && path_l(next)->b == path_l(path)->b) - return true; + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; next = next_btree_path(trans, path); - if (next && path_l(next)->b == path_l(path)->b) - return true; + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; - return false; + return NULL; } static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) @@ -1693,11 +1693,20 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte (dup = have_path_at_pos(trans, path))) { dup->preserve = true; path->preserve = false; + goto free; } if (!path->preserve && - have_node_at_pos(trans, path)) - __bch2_path_free(trans, path); + (dup = have_node_at_pos(trans, path))) + goto free; + return; +free: + if (path->should_be_locked && + !btree_node_locked(dup, path->level)) + return; + + dup->should_be_locked |= path->should_be_locked; + __bch2_path_free(trans, path); } noinline __cold diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 64a3969db263..c71e42a782d6 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -253,8 +253,6 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; - if (iter->path->ref == 1) - iter->path->should_be_locked = false; } static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) -- cgit From 32b26e8c7f6418b2d8bd404c7482c44141ba52e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Nov 2021 00:03:40 -0400 Subject: bcachefs: bch2_assert_pos_locked() This adds a new assertion to be used by bch2_inode_update_after_write(), which updates the VFS inode based on the update to the btree inode we just did - we require that the btree inode still be locked when we do that update. 
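Condensed sketch of how the assertion is used in bch2_inode_update_after_write(), taken from the hunks below (surrounding code omitted):

	struct bch_fs *c = trans->c;

	BUG_ON(bi->bi_inum != inode->v.i_ino);

	/* the inodes-btree key for this inode must still be locked by this
	 * transaction; note the key-cache check is disabled (0 &&) in this patch */
	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
			       POS(0, bi->bi_inum),
			       0 && c->opts.inodes_use_key_cache);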
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/btree_iter.c | 45 ++++++++++++++++++++++++++++++++++--- fs/bcachefs/btree_iter.h | 4 ++++ fs/bcachefs/fs.c | 58 +++++++++++++++++++++++++++++------------------- fs/bcachefs/fs.h | 2 +- fs/bcachefs/inode.c | 6 ++--- fs/bcachefs/inode.h | 2 ++ 7 files changed, 88 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 51a0b48a5313..00cd40a8d7fa 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -339,7 +339,7 @@ btree_err: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 94ba43626cde..1ad81cad36f1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -46,7 +46,7 @@ static inline int __btree_path_cmp(const struct btree_path *l, unsigned r_level) { return cmp_int(l->btree_id, r_btree_id) ?: - cmp_int(l->cached, r_cached) ?: + cmp_int((int) l->cached, (int) r_cached) ?: bpos_cmp(l->pos, r_pos) ?: -cmp_int(l->level, r_level); } @@ -762,6 +762,43 @@ out: return ret; } +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + char buf[100]; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!(path->nodes_locked & 1) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && + bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + return; + } else { + if (!bkey_cmp(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], + (bch2_bpos_to_text(&PBUF(buf), pos), buf), + key_cache ? " cached" : ""); +} + #else static inline void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -1720,11 +1757,13 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, idx) - printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, - path->preserve ? " preserve" : "", + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", bch2_btree_ids[path->btree_id], (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated #else diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c71e42a782d6..72b9605cf3e7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -166,9 +166,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_trans_verify_locks(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 92919b16f2f5..5596081b93c1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -37,7 +37,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); @@ -93,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -void bch2_inode_update_after_write(struct bch_fs *c, +void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + 0 && c->opts.inodes_use_key_cache); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); @@ -126,6 +134,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, int ret; bch2_trans_init(&trans, c, 0, 512); + trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); @@ -140,7 +149,7 @@ retry: * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); bch2_trans_iter_exit(&trans, &iter); @@ -215,6 +224,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; + struct btree_trans trans; int ret; inode = to_bch_ei(iget5_locked(c->vfs_sb, @@ -227,14 +237,19 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_trans_exit(&trans); + if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inum, inode, &inode_u); - unlock_new_inode(&inode->v); return &inode->v; @@ -306,7 +321,7 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } @@ -314,7 +329,8 @@ err_before_quota: 
inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - bch2_vfs_inode_init(c, inum, inode, &inode_u); + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -429,11 +445,9 @@ static int __bch2_link(struct bch_fs *c, &dentry->d_name)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_exit(&trans); @@ -481,11 +495,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, deleting_snapshot)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME); } @@ -613,18 +625,18 @@ static int bch2_rename2(struct mnt_idmap *idmap, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(c, src_dir, &src_dir_u, + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, src_inode, &src_inode_u, + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: bch2_trans_exit(&trans); @@ -742,7 +754,7 @@ btree_err: if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -1154,11 +1166,11 @@ static const struct export_operations bch_export_ops = { //.get_parent = bch2_get_parent, }; -static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { - bch2_inode_update_after_write(c, inode, bi, ~0); + bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 1c8936df9fbb..530238780a88 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -172,7 +172,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -void bch2_inode_update_after_write(struct bch_fs *, +void bch2_inode_update_after_write(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 728545141a39..a24bbc5228c1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -722,9 +722,9 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct 
btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) { struct btree_iter iter; int ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d433d48de4e0..723186d8afb6 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -89,6 +89,8 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); -- cgit From 0397a2e8e1778ce5c8fe893e0b0377d11d57cc5f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Nov 2021 12:10:57 -0500 Subject: bcachefs: Refactor bch2_fpunch_at() This cleans up the error hanlding and flow control a bit. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index dc41286c229e..3acd357919a2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -402,26 +402,31 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int ret = 0, ret2 = 0; u32 snapshot; - while (1) { + while (!ret || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + if (ret) + ret2 = ret; + bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - goto btree_err; + continue; bch2_btree_iter_set_snapshot(iter, snapshot); k = bch2_btree_iter_peek(iter); - if (bkey_cmp(iter->pos, end_pos) >= 0) + if (bkey_cmp(iter->pos, end_pos) >= 0) { + bch2_btree_iter_set_pos(iter, end_pos); break; + } ret = bkey_err(k); if (ret) - goto btree_err; + continue; bkey_init(&delete.k); delete.k.p = iter->pos; @@ -434,18 +439,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; } - if (bkey_cmp(iter->pos, end_pos) > 0) - bch2_btree_iter_set_pos(iter, end_pos); - return ret ?: ret2; } -- cgit From 74163da7c81ac5bb4ecd625f9e41a241ec5d8758 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Nov 2021 13:39:42 -0400 Subject: bcachefs: Fallocate fixes - fpunch wasn't always correctly updating i_size - when we drop buffered writes that were extending a file, we become responsible for writing i_size. 
- fzero was sometimes zeroing out more data that it should have - block_start and block_end were being rounded in the wrong directions Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 201 ++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7de6b7a7aa60..12b785c5005f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2296,6 +2296,14 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, s->s[i].state = SECTOR_UNALLOCATED; } + /* + * Caller needs to know whether this page will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update: + */ + ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), + PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + zero_user_segment(page, start_offset, end_offset); /* @@ -2304,8 +2312,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); - BUG_ON(ret); + BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); /* * This removes any writeable userspace mappings; we need to force @@ -2327,6 +2334,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } +static int bch2_truncate_pages(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + start, end); + return ret; +} + static int bch2_extend(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, @@ -2417,7 +2438,7 @@ int bch2_truncate(struct mnt_idmap *idmap, iattr->ia_valid &= ~ATTR_SIZE; ret = bch2_truncate_page(inode, iattr->ia_size); - if (unlikely(ret)) + if (unlikely(ret < 0)) goto err; /* @@ -2483,48 +2504,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode, static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 discard_start = round_up(offset, block_bytes(c)) >> 9; - u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; + u64 end = offset + len; + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; int ret = 0; - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) goto err; - if (offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) { - ret = __bch2_truncate_page(inode, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto err; - } + truncated_last_page = ret; - truncate_pagecache_range(&inode->v, offset, offset + len - 1); + truncate_pagecache_range(&inode->v, offset, end - 1); - if (discard_start < discard_end) { + if (block_start < block_end ) { s64 i_sectors_delta = 0; ret = bch2_fpunch(c, inode_inum(inode), - discard_start, discard_end, + block_start >> 9, block_end >> 9, &i_sectors_delta); 
i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME) ?: ret; + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } mutex_unlock(&inode->ei_update_lock); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; } @@ -2544,31 +2556,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - */ - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - if (insert) { - ret = -EFBIG; if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - goto err; + return -EFBIG; - ret = -EINVAL; if (offset >= inode->v.i_size) - goto err; + return -EINVAL; src_start = U64_MAX; shift = len; } else { - ret = -EINVAL; if (offset + len >= inode->v.i_size) - goto err; + return -EINVAL; src_start = offset + len; shift = -len; @@ -2578,7 +2577,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) - goto err; + return ret; if (insert) { i_size_write(&inode->v, new_size); @@ -2595,7 +2594,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) - goto err; + return ret; } bch2_bkey_buf_init(©); @@ -2708,18 +2707,19 @@ reassemble: bch2_bkey_buf_exit(©, c); if (ret) - goto err; + return ret; + mutex_lock(&inode->ei_update_lock); if (!insert) { i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, new_size, ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -2814,6 +2814,17 @@ bkey_err: if (ret == -EINTR) ret = 0; } + + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -2822,77 +2833,58 @@ bkey_err: static long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = round_up(end, block_bytes(c)); - int ret; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end 
= round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); if (ret) - goto err; + return ret; } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); - - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) + return ret; - if (unlikely(ret)) - goto err; + truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); } ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - if (ret) - goto err; /* - * Do we need to extend the file? - * - * If we zeroed up to the end of the file, we dropped whatever writes - * were going to write out the current i_size, so we have to extend - * manually even if FL_KEEP_SIZE was set: + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: */ - if (end >= inode->v.i_size && - (!(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_ZERO_RANGE))) { + if (ret && + !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; - /* - * Sync existing appends before extending i_size, - * as in bch2_extend(): - */ - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); - if (ret) - goto err; + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; - if (mode & FALLOC_FL_KEEP_SIZE) - end = inode->v.i_size; - else - i_size_write(&inode->v, end); + if (end >= inode->v.i_size && + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, end, 0); + ret2 = bch2_write_inode_size(c, inode, end, 0); mutex_unlock(&inode->ei_update_lock); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; + + return ret ?: ret2; } long bch2_fallocate_dispatch(struct file *file, int mode, @@ -2905,6 +2897,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (!percpu_ref_tryget(&c->writes)) return -EROFS; + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -2916,6 +2912,9 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; + + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); percpu_ref_put(&c->writes); return ret; -- cgit From 770e821485e0021ea325f7aa2133fddb46ba0821 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Nov 2021 17:20:06 -0500 Subject: bcachefs: Inode updates should generally be BTREE_INSERT_NOFAIL This fixes a bug where i_size may become inconsistent between the VFS cache and the btree, when the filesystem is nearly full. 
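The change itself is one flag; in sketch form, the i_size update in bch2_remap_range() becomes:

	inode_u.bi_size = new_i_size;
	ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
	       bch2_trans_commit(&trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL);

so the commit cannot be rejected for lack of space after the VFS-side i_size has already been extended.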
Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d003f4088dfc..22230f82b8b9 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -347,7 +347,8 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, 0); + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); } bch2_trans_iter_exit(&trans, &inode_iter); -- cgit From f74a5051b0e58a8f4fab26a2fc65b95ee17df7a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Nov 2021 13:02:03 -0500 Subject: bcachefs: Don't check for -ENOSPC in page writeback If at all possible we'd prefer to not fail page writeback unless the filesystem has been shutdown; allowing errors in page writeback means things we'd like to assert about i_size consistency between the VFS and the btree go out the window. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 12b785c5005f..ac013bb99a43 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1144,16 +1144,16 @@ static int __bch2_writepage(struct folio *folio, do_io: s = bch2_page_state_create(page, __GFP_NOFAIL); - ret = bch2_get_page_disk_reservation(c, inode, page, true); - if (ret) { - SetPageError(page); - mapping_set_error(page->mapping, ret); - unlock_page(page); - return 0; - } + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_page_disk_reservation(c, inode, page, false); + BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); orig = *s; + spin_unlock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) { if (s->s[i].state < SECTOR_DIRTY) @@ -1186,7 +1186,7 @@ do_io: offset = 0; while (1) { - unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; while (offset < PAGE_SECTORS && @@ -1196,16 +1196,15 @@ do_io: if (offset == PAGE_SECTORS) break; - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) + orig.s[offset + sectors].state >= SECTOR_DIRTY) { + reserved_sectors += orig.s[offset + sectors].replicas_reserved; + dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; sectors++; - - for (i = offset; i < offset + sectors; i++) { - reserved_sectors += orig.s[i].replicas_reserved; - dirty_sectors += orig.s[i].state == SECTOR_DIRTY; } + BUG_ON(!sectors); + + sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || -- cgit From 54b2db3d58eadb4496a671d43b1e7c0506dd0220 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Nov 2021 15:50:22 -0500 Subject: bcachefs: Fix infinite loop in bch2_btree_cache_scan() When attempting to free btree nodes, we might not be able to free all the nodes that were requested. But the code was looping until it had freed _all_ the nodes requested, when it should have only been attempting to free nr nodes. 
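Condensed sketch of the corrected loop bound (from the hunk below; the save-position logic on exit is omitted) — reclaim is best-effort, so the scan has to terminate on nodes touched rather than nodes freed:

	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (touched >= nr)	/* was: freed >= nr, which could spin forever */
			break;

		if (!btree_node_reclaim(c, b)) {
			/* reclaimable: the node gets freed here (details omitted) */
			freed++;
		}
	}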
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7f5620a4d7c5..4e855ae51731 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -309,7 +309,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, touched++; - if (freed >= nr) + if (touched >= nr) break; if (!btree_node_reclaim(c, b)) { @@ -323,7 +323,7 @@ restart: list_for_each_entry_safe(b, t, &bc->live, list) { touched++; - if (freed >= nr) { + if (touched >= nr) { /* Save position */ if (&t->list != &bc->live) list_move_tail(&bc->live, &t->list); -- cgit From 496b7238794ac9209c68fba3592b59576140fa55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 12:57:00 -0500 Subject: bcachefs: Fix an exiting of uninitialized iterator bch2_dirent_lookup had an error path where we'd exit a btree_iter that hadn't been properly initialized. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 9267eea810f8..5db1426faaf3 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -418,16 +418,15 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } + if (ret) + goto err; d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, dir, d, inum); if (ret > 0) ret = -ENOENT; +err: if (ret) bch2_trans_iter_exit(trans, iter); @@ -448,10 +447,10 @@ retry: ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, name, inum, 0); - - bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } -- cgit From e3f2db39b39b69538db5bfbd9e359e99dcf1c986 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 13:36:26 -0500 Subject: bcachefs: Tweak vfs cache shrinker behaviour In bcachefs, inodes and dentries are also cached - more compactly - by the btree node cache, they don't require seeks to recreate. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5596081b93c1..4561c60c95e3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1669,6 +1669,8 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif + sb->s_shrink.seeks = 0; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", -- cgit From 6404dcc9c246c1b71ace52c1a942c675c89c4ffe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Nov 2021 12:11:33 -0500 Subject: bcachefs: More enum strings This patch converts more enums in the on disk format to our standard x-macro-with-strings deal - to enable better pretty-printing. 
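The pattern, in abridged and illustrative form (only a few checksum types shown; the exact expansion macros used in opts.c may differ slightly):

	#define BCH_CSUM_TYPES()			\
		x(none,			0)		\
		x(crc32c,		5)		\
		x(crc64,		6)

	enum bch_csum_type {
	#define x(t, n) BCH_CSUM_##t = n,
		BCH_CSUM_TYPES()
	#undef x
		BCH_CSUM_NR
	};

	/* the same list also generates the strings used for pretty-printing */
	const char * const bch2_csum_types[] = {
	#define x(t, n) [n] = #t,
		BCH_CSUM_TYPES()
	#undef x
		NULL
	};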
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 60 ++++++++++++++++++++++---------------- fs/bcachefs/checksum.c | 68 +++++++++++++++++++++---------------------- fs/bcachefs/checksum.h | 20 ++++++------- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extents.c | 6 ++-- fs/bcachefs/io.c | 2 +- fs/bcachefs/opts.c | 15 ++++++++++ fs/bcachefs/opts.h | 5 +++- fs/bcachefs/str_hash.h | 34 +++++++++++----------- 9 files changed, 120 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 21f1948ef8d0..7c2846791286 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1447,7 +1447,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist * reflink: gates KEY_TYPE_reflink * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_siphash: gates BCH_STR_HASH_siphash * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE */ #define BCH_SB_FEATURES() \ @@ -1523,12 +1523,17 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + enum bch_str_hash_type { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH_OLD = 2, - BCH_STR_HASH_SIPHASH = 3, - BCH_STR_HASH_NR = 4, +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR }; #define BCH_STR_HASH_OPTS() \ @@ -1543,34 +1548,39 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR }; +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_XXHASH = 7, - BCH_CSUM_NR = 8, +#define x(t, n) BCH_CSUM_##t = n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR }; static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_XXHASH] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) { switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: return true; default: return false; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d20924e579bf..fbe8603cfb30 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -35,18 +35,18 @@ struct bch2_checksum_state { static void bch2_checksum_init(struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: state->seed = 0; break; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: state->seed = U32_MAX; break; - case 
BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: state->seed = U64_MAX; break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_reset(&state->h64state, 0); break; default: @@ -57,15 +57,15 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return state->seed; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: return state->seed ^ U32_MAX; - case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: return state->seed ^ U64_MAX; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: return xxh64_digest(&state->h64state); default: BUG(); @@ -75,17 +75,17 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { switch (state->type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC32C: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: state->seed = crc32c(state->seed, data, len); break; - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC64: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: state->seed = crc64_be(state->seed, data, len); break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_update(&state->h64state, data, len); break; default: @@ -161,12 +161,12 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, struct nonce nonce, const void *data, size_t len) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -177,8 +177,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -212,13 +212,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, struct bio_vec bv; switch (type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return (struct bch_csum) { 0 }; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -238,8 +238,8 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 6841fb16568a..f5c1a609c5c4 100644 --- 
a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -13,9 +13,9 @@ static inline bool bch2_checksum_mergeable(unsigned type) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return true; default: return false; @@ -78,13 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, { switch (type) { case BCH_CSUM_OPT_none: - return BCH_CSUM_NONE; + return BCH_CSUM_none; case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_XXHASH; + return BCH_CSUM_xxhash; default: BUG(); } @@ -95,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, { if (c->sb.encryption_type) return c->opts.wide_macs - ? BCH_CSUM_CHACHA20_POLY1305_128 - : BCH_CSUM_CHACHA20_POLY1305_80; + ? BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; return bch2_csum_opt_to_type(opt, true); } @@ -104,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) - return BCH_CSUM_CHACHA20_POLY1305_128; + return BCH_CSUM_chacha20_poly1305_128; return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 32b17f05a750..bc8bb963ae43 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1154,7 +1154,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; + s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7f1a5c81ef09..8592a0f6327e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -968,12 +968,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 3acd357919a2..c5bd17f913f2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2104,7 +2104,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_NONE && + (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index ff99c6d24abd..a955ef2008c9 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -31,17 +31,32 @@ const char * const 
bch2_btree_ids[] = { NULL }; +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + const char * const bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + const char * const bch2_compression_opts[] = { BCH_COMPRESSION_OPTS() NULL }; const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { BCH_STR_HASH_OPTS() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 4e59bff09578..10c022ec6ee0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -12,9 +12,12 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; @@ -140,7 +143,7 @@ enum opt_type { NULL, NULL) \ x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ + OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 789dde7c6ac6..57d636740d2f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -20,13 +20,13 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { switch (opt) { case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_CRC32C; + return BCH_STR_HASH_crc32c; case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_CRC64; + return BCH_STR_HASH_crc64; case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? BCH_STR_HASH_SIPHASH - : BCH_STR_HASH_SIPHASH_OLD; + ? 
BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; default: BUG(); } @@ -51,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) .siphash_key = { .k0 = bi->bi_hash_seed } }; - if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; @@ -77,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; default: @@ -99,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, const void *data, size_t len) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(ctx->crc32c, data, len); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Update(&ctx->siphash, data, len); break; default: @@ -118,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: return ctx->crc32c; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: return ctx->crc64 >> 1; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: return SipHash24_End(&ctx->siphash) >> 1; default: BUG(); -- cgit From 61d876c25d65c6732f1db1e7611a158fc2c55be3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 17:44:13 -0500 Subject: bcachefs: Improve bch2_reflink_p_to_text() .to_text methods generally ought to print all the value fields. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 22230f82b8b9..8dcac7815c9f 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -44,7 +44,10 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); + pr_buf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -- cgit From 531b69e9afed954156b193264daf32c067454952 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 17:53:55 -0500 Subject: bcachefs: Convert journal BUG_ON() to a warning It's definitely indicative of a bug if we request to flush a journal sequence number that hasn't happened yet, but it's more useful if we warn and print out the relevant sequence numbers instead of just dying. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1abd1ac560e6..1ee012d94b4a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -550,7 +550,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { -- cgit From 1c9e6d50e28c89d03bebfe3e3946746dc1eeab74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 17:57:52 -0500 Subject: bcachefs: Fix missing field initialization When unpacking v1 inodes, we were failing to initialize the journal_seq field, leading to a BUG_ON() when fsync tries to flush a garbage journal sequence number. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a24bbc5228c1..fb5ed3a07ad7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -220,6 +220,7 @@ int bch2_inode_unpack(struct bkey_s_c k, struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -- cgit From 697e546fb38fb8c3e274c1561aaaab18178809a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Oct 2021 17:35:58 -0400 Subject: bcachefs: Refactor journal replay code This consolidates duplicated code in journal replay - it's only a few flags that are different for replaying alloc keys. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 53 ++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d8e511a0664e..373e309299bb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -518,57 +518,38 @@ static void replay_now_at(struct journal *j, u64 seq) } static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, unsigned level, - struct bkey_i *k) + struct journal_key *k) { struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; int ret; - bch2_trans_node_iter_init(trans, &iter, id, k->k.p, - BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW; + unsigned commit_flags = + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED; if (!k->allocated) commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); -} - -static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_alloc_replay_key(&trans, k)); + __bch2_journal_replay_key(&trans, k)); } static int journal_sort_seq_cmp(const void *_l, const void *_r) @@ -606,7 +587,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (!i->level && i->btree_id == BTREE_ID_alloc) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_alloc_replay_key(c, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } -- cgit From 85e95ca7cc48c23f772387b069d794f69116192b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Nov 2021 19:49:14 -0500 Subject: bcachefs: Update export_operations for snapshots When support for snapshots was merged, export operations weren't updated yet. This patch adds new filehandle types for bcachefs that include the subvolume ID and updates export operations for subvolumes - and also .get_parent, support for which was added just prior to snapshots. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/dirent.h | 3 + fs/bcachefs/fs.c | 230 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/exportfs.h | 6 ++ 4 files changed, 218 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 5db1426faaf3..4dfcc955675b 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -197,8 +197,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) { struct bch_subvolume s; int ret = 0; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 8ae407765fe4..1bb4d802bc1d 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,6 +29,9 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4561c60c95e3..61027d349cd8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1124,46 +1124,230 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; -#if 0 -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) { - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode; + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} + +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + + if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) + return FILEID_INVALID; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; - if (ino < BCACHEFS_ROOT_INO) - return ERR_PTR(-ESTALE); + *fid = bch2_inode_to_fid(inode); - vinode = bch2_vfs_inode_get(c, ino); - if (IS_ERR(vinode)) - return ERR_CAST(vinode); - if (generation && vinode->i_generation != generation) { - /* we didn't find the right inode.. 
*/ + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} + +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + struct bcachefs_fid fid) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); - return ERR_PTR(-ESTALE); + vinode = ERR_PTR(-ESTALE); } return vinode; } -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + if (!parent_inum.inum) + return NULL; + + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + unsigned name_len; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -ENOENT; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is 
to the wrong + * directory - linear search: + */ + for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); + bch2_trans_iter_exit(&trans, &iter2); + bch2_trans_exit(&trans); + + return ret; } -#endif static const struct export_operations bch_export_ops = { - //.fh_to_dentry = bch2_fh_to_dentry, - //.fh_to_parent = bch2_fh_to_parent, - //.get_parent = bch2_get_parent, + .encode_fh = bch2_encode_fh, + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, }; static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 11fbd0ee1370..f49a7d31167e 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { */ FILEID_FAT_WITH_PARENT = 0x72, + /* + * 64 bit inode number, 32 bit subvolume, 32 bit generation number: + */ + FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, + FILEID_BCACHEFS_WITH_PARENT = 0x81, + /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) -- cgit From 0a84a066f9a1455ce703850ac5918270d7a4019d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Nov 2021 15:03:06 -0500 Subject: bcachefs: Also log device name in userspace Change log messages in userspace to be closer to what they are in kernel space, and include the device name - it's also useful in userspace. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/recovery.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 567270015008..966e185201d1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -218,8 +218,8 @@ #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) #endif #define bch_info(c, fmt, ...) 
\ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 373e309299bb..be8912605527 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1031,6 +1031,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + else + bch_info(c, "recovering from unclean shutdown"); if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -1049,7 +1051,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ret = -EINVAL; goto err; - } if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { -- cgit From c714614bd06cc422f56c02475adf03dc618bf385 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Nov 2021 15:02:13 -0500 Subject: bcachefs: Disk space accounting fix on brand-new fs The filesystem initialization path first marks superblock and journal buckets non transactionally, since the btree isn't functional yet. That path was updating the per-journal-buf percpu counters via bch2_dev_usage_update(), and updating the wrong set of counters so those updates didn't get written out until journal entry 4. The relevant code is going to get significantly rewritten in the future as we transition away from the in memory bucket array, so this just hacks around it for now. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 11 +++++++++++ fs/bcachefs/super-io.c | 8 ++++++++ 3 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 966e185201d1..077d366961ff 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -495,6 +495,7 @@ struct bch_dev { enum { /* startup: */ + BCH_FS_INITIALIZED, BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_RUNNING, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d4d41646b2e6..c3387689fbb6 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -117,6 +117,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? ca->usage_gc : ca->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -142,6 +144,8 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? 
c->usage_gc : c->usage[journal_seq & JOURNAL_BUF_MASK]; @@ -364,6 +368,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; + /* + * Hack for bch2_fs_initialize path, where we're first marking sb and + * journal non-transactionally: + */ + if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) + journal_seq = 1; + percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 637408d76270..3cc5d8ea743f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -447,8 +447,16 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) set_bit(BCH_FS_ERROR, &c->flags); + else + clear_bit(BCH_FS_ERROR, &c->flags); + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + else + clear_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) + set_bit(BCH_FS_INITIALIZED, &c->flags); ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) -- cgit From f0c3f88b35e1fac6e3b7cec5635e43d4e595cf7a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Oct 2021 12:51:12 -0400 Subject: bcachefs: Run insert triggers before overwrite triggers Currently, btree triggers are run in natural key order, which presents a problem for fallocate in INSERT_RANGE mode: since we're moving existing extents to higher offsets, the trigger for deleting the old extent runs before the trigger that adds the new extent, potentially leading to indirect extents being deleted that shouldn't be when the delete causes the refcount to hit 0. This changes the order we run triggers so that for a given btree, we run all insert triggers before overwrite triggers, nicely sidestepping this issue. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update_leaf.c | 133 ++++++++++++++++++++++++++++++++-------- fs/bcachefs/buckets.c | 35 ----------- fs/bcachefs/buckets.h | 2 - 4 files changed, 109 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5331626e62a5..25b0df22366b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -338,7 +338,8 @@ struct btree_insert_entry { enum btree_id btree_id:8; u8 level; bool cached:1; - bool trans_triggers_run:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; struct bkey_i *k; struct btree_path *path; unsigned long ip_allocated; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4e9f7e3b5a61..61c87525e48d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -816,10 +816,112 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; }
+ */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->insert_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; + + BUG_ON(i->overwrite_trigger_run); + + i->insert_trigger_run = true; + trans_trigger_run = true; + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|i->flags); + } + + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } while (trans_trigger_run); + + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->overwrite_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; + + BUG_ON(!i->insert_trigger_run); + + i->overwrite_trigger_run = true; + trans_trigger_run = true; + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + ret = bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|i->flags); + + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } while (trans_trigger_run); + } + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; - bool trans_trigger_run; unsigned u64s; int ret = 0; @@ -854,30 +956,9 @@ int __bch2_trans_commit(struct btree_trans *trans) i->btree_id, i->k->k.p); #endif - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - trans_for_each_update(trans, i) { - if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - !i->trans_triggers_run) { - i->trans_triggers_run = true; - trans_trigger_run = true; - - ret = bch2_trans_mark_update(trans, i->path, - i->k, i->flags); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->btree_id, &i->path->pos); - goto out; - } - } - } - } while (trans_trigger_run); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out; trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); @@ -1297,7 +1378,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { - BUG_ON(i->trans_triggers_run); + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); /* * This is a hack to ensure that inode creates update the 
btree, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c3387689fbb6..2c0a385ace50 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1882,41 +1882,6 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, } } -int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *new, - unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret; - - _deleted.p = path->pos; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(path->btree_id)) - return 0; - - old = bch2_btree_path_peek_slot(path, &unpacked); - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 41374463710c..54a29bf69d67 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -233,8 +233,6 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_update(struct btree_trans *, struct btree_path *, - struct bkey_i *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, -- cgit From 9be1efe9c57e3eed5fc569caee47d0ddc96530db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Nov 2021 17:30:11 -0500 Subject: bcachefs: Fix error reporting from bch2_journal_flush_seq - bch2_journal_halt() was unconditionally overwriting j->err_seq, the sequence number that we failed to write - journal_write_done was updating seq_ondisk and flushed_seq_ondisk even for writes that errored, which broke the way bch2_journal_flush_seq_async() locklessly checked for completions. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 7 ++++++- fs/bcachefs/journal_io.c | 15 ++++++++------- fs/bcachefs/recovery.c | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1ee012d94b4a..56c477bbce0f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -106,7 +106,12 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - j->err_seq = journal_cur_seq(j); + /* + * XXX: we're not using j->lock here because this can be called from + * interrupt context, this can race with journal_write_done() + */ + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ed8d7f90b607..0cd5ad3118e9 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1258,14 +1258,15 @@ static void journal_write_done(struct closure *cl) if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; - j->seq_ondisk = seq; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; + if (!err) { + j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - } + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index be8912605527..c3b4d116275c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1480,7 +1480,7 @@ int bch2_fs_initialize(struct bch_fs *c) } err = "error writing first journal entry"; - ret = bch2_journal_meta(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) goto err; -- cgit From e5464a371d048865cd4fcba48879c4b37727df2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Nov 2021 22:59:25 -0500 Subject: bcachefs: Add a bit of missing repair code This adds repair code to drop very stale pointers. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 197f5c0f3a9a..ec5b7e2c7271 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -498,6 +498,10 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, char buf[200]; int ret = 0; + /* + * XXX + * use check_bucket_ref here + */ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); @@ -553,6 +557,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } } + if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" @@ -644,6 +657,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || (!ptr->cached && gen_cmp(ptr->gen, g->mark.gen) < 0) || + gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || (g->mark.data_type && g->mark.data_type != data_type); })); -- cgit From 7468c4effc8c93464ec0fd4336494312bebb8033 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Nov 2021 16:15:48 -0500 Subject: bcachefs: Fix BCH_FS_ERROR flag handling We were setting BCH_FS_ERROR on startup if the superblock was marked as containing errors, which is not what we wanted - BCH_FS_ERROR indicates whether errors have been found, so that after a successful fsck we're able to clear the error bit in the superblock. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/super-io.c | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ec5b7e2c7271..a15b3bfa9d47 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1632,7 +1632,7 @@ again: bch2_mark_superblocks(c); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) && + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && c->opts.fix_errors != FSCK_OPT_NO) { bch_info(c, "starting topology repair pass"); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 3cc5d8ea743f..170f7d46fa34 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -445,16 +445,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_ERROR, &c->flags); - else - clear_bit(BCH_FS_ERROR, &c->flags); - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - else - clear_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) set_bit(BCH_FS_INITIALIZED, &c->flags); -- cgit From 8810386f6bd55cf3287a1219901c2993b3bac959 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Nov 2021 22:34:26 -0500 Subject: bcachefs: Fix an i_sectors accounting bug We weren't checking for errors before calling i_sectors_acct() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ac013bb99a43..dff6d7547212 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2806,6 +2806,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, &reservation.k_i, &disk_res, NULL, 0, &i_sectors_delta, true); + if (ret) + goto bkey_err; i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); -- cgit From b19d307dc11586df2e2b7430c0ccbfa439bf7a0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Nov 2021 12:47:20 -0500 Subject: bcachefs: Fix i_sectors_leak in bch2_truncate_page When bch2_truncate_page() discards dirty sectors in the page cache, we need to account for that - we don't need to account for allocated sectors because that'll be done by the bch2_fpunch() call when it updates the btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index dff6d7547212..68e707fcf9cd 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2241,6 +2241,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; struct page *page; + s64 i_sectors_delta = 0; int ret = 0; /* Page boundary? 
Nothing to do */ @@ -2292,9 +2293,13 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; + if (s->s[i].state == SECTOR_DIRTY) + i_sectors_delta--; s->s[i].state = SECTOR_UNALLOCATED; } + i_sectors_acct(c, inode, NULL, i_sectors_delta); + /* * Caller needs to know whether this page will be written out by * writeback - doing an i_size update if necessary - or whether it will -- cgit From b44a66a64123efb3e6aebaa0cedec722ecbfbba4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Nov 2021 17:05:56 -0500 Subject: bcachefs: SECTOR_DIRTY_RESERVED This fixes another i_sectors accounting bug - we need to differentiate between dirty writes that overwrite a reservation and dirty writes to unallocated space - dirty writes to unallocated space increase i_sectors, dirty writes over a reservation do not. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 97 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 68e707fcf9cd..c014f10885dc 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -231,6 +231,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); + BUG_ON((s64) inode->v.i_blocks + sectors < 0); + inode->v.i_blocks += sectors; + #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -242,7 +245,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif - inode->v.i_blocks += sectors; mutex_unlock(&inode->ei_quota_lock); } @@ -251,19 +253,20 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ struct bch_page_sector { - /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:3; + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; - /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:3; + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; /* i_sectors: */ enum { SECTOR_UNALLOCATED, SECTOR_RESERVED, SECTOR_DIRTY, + SECTOR_DIRTY_RESERVED, SECTOR_ALLOCATED, - } state:2; + } state:8; }; struct bch_page_state { @@ -319,6 +322,36 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } +static unsigned bkey_to_sector_state(const struct bkey *k) +{ + if (k->type == KEY_TYPE_reservation) + return SECTOR_RESERVED; + if (bkey_extent_is_allocation(k)) + return SECTOR_ALLOCATED; + return SECTOR_UNALLOCATED; +} + +static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + bio_for_each_segment(bv, bio, iter) { + struct bch_page_state *s = bch2_page_state(bv.bv_page); + unsigned i; + + for (i = bv.bv_offset >> 9; + i < (bv.bv_offset + bv.bv_len) >> 9; + i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } + } +} + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -457,16 +490,23 @@ static void bch2_clear_page_bits(struct page *page) disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - if (s->s[i].state == SECTOR_DIRTY) { - dirty_sectors++; + switch (s->s[i].state) { + case SECTOR_DIRTY: s->s[i].state = SECTOR_UNALLOCATED; + --dirty_sectors; + break; + case SECTOR_DIRTY_RESERVED: + s->s[i].state = SECTOR_RESERVED; + break; + default: + break; } } bch2_disk_reservation_put(c, &disk_res); if (dirty_sectors) - i_sectors_acct(c, inode, NULL, -dirty_sectors); + i_sectors_acct(c, inode, NULL, dirty_sectors); bch2_page_state_release(page); } @@ -499,10 +539,17 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED) + switch (s->s[i].state) { + case SECTOR_UNALLOCATED: + s->s[i].state = SECTOR_DIRTY; dirty_sectors++; - - s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); + break; + case SECTOR_RESERVED: + s->s[i].state = SECTOR_DIRTY_RESERVED; + break; + default: + break; + } } spin_unlock(&s->lock); @@ -686,29 +733,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) return iter->pages[iter->idx]; } -static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = k.k->type == KEY_TYPE_reservation - ? SECTOR_RESERVED - : SECTOR_ALLOCATED; - - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = bch2_page_state(bv.bv_page); - unsigned i; - - for (i = bv.bv_offset >> 9; - i < (bv.bv_offset + bv.bv_len) >> 9; - i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; - } - } -} - static bool extent_partial_reads_expensive(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -847,8 +871,7 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(&rbio->bio, k); + bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); -- cgit From 9ca4853b98af5fa15a2ddc47a45f8e103027f95d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Oct 2021 13:05:56 -0400 Subject: bcachefs: Fix quota support for snapshots Quota support was disabled when snapshots were released, because of some tricky interactions with snapshots. We're sidestepping that for now - we're simply disabling quota accounting on snapshot subvolumes.
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 28 ++++++++++++++------ fs/bcachefs/fs.h | 6 +++++ fs/bcachefs/opts.h | 12 ++++----- fs/bcachefs/quota.c | 69 +++++++++++++++++++++++++++++++++++++------------ fs/bcachefs/subvolume.c | 9 +++++++ fs/bcachefs/subvolume.h | 2 ++ 6 files changed, 96 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 61027d349cd8..31adc0e0d452 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,7 +39,8 @@ static struct kmem_cache *bch2_inode_cache; static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, - struct bch_inode_unpacked *); + struct bch_inode_unpacked *, + struct bch_subvolume *); static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { @@ -225,6 +226,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; struct btree_trans trans; + struct bch_subvolume subvol; int ret; inode = to_bch_ei(iget5_locked(c->vfs_sb, @@ -239,10 +241,11 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) bch2_trans_init(&trans, c, 8, 0); ret = lockrestart_do(&trans, + bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); if (!ret) - bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); bch2_trans_exit(&trans); if (ret) { @@ -268,6 +271,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; + struct bch_subvolume subvol; u64 journal_seq = 0; int ret; @@ -310,7 +314,12 @@ retry: if (unlikely(ret)) goto err_before_quota; - ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(&trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(&trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -326,11 +335,8 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; - inum.inum = inode_u.bi_inum; - bch2_iget5_set(&inode->v, &inum); - bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -1352,10 +1358,16 @@ static const struct export_operations bch_export_ops = { static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, - struct bch_inode_unpacked *bi) + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) { bch2_inode_update_after_write(trans, inode, bi, ~0); + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 530238780a88..a67ab1ad2a31 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -63,6 +63,12 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) */ #define EI_INODE_ERROR 0 +/* + * Set in the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + 
#define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 10c022ec6ee0..896b8c9c1180 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -223,19 +223,19 @@ enum opt_type { BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_USRQUOTA, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_GRPQUOTA, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_MOUNT, \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 5f1216da76d0..8f8f4b0accd6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -3,6 +3,7 @@ #include "btree_update.h" #include "inode.h" #include "quota.h" +#include "subvolume.h" #include "super-io.h" static const char *bch2_sb_validate_quota(struct bch_sb *sb, @@ -415,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c) } } +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_subvolume subvolume; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 1; + + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; + + /* + * We don't do quota accounting in snapshots: + */ + if (BCH_SUBVOLUME_SNAP(&subvolume)) + goto advance; + + if (!bkey_is_inode(k.k)) + goto advance; + + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + return 0; +} + int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; struct btree_trans trans; struct btree_iter iter; - struct bch_inode_unpacked u; - struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); @@ -437,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, &u); - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); - } - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + do { + ret = lockrestart_do(&trans, + bch2_fs_quota_read_inode(&trans, &iter)); + } while (!ret); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return ret; + return ret < 0 ? 
ret : 0; } /* Enable/disable/delete quotas for an entire filesystem: */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0ef625d21672..7e909a118189 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -789,6 +789,15 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, return ret; } +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, u32 *snapid) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index dde755b45392..e4c3fdcdf22f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -118,6 +118,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvolume_delete(struct btree_trans *, u32); -- cgit From 084d42bbd67c5b2de607f56a94c7295459b16b61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Nov 2021 19:00:23 -0500 Subject: bcachefs: Apply workaround for too many btree iters to read path Reading from cached data, which calls bch2_bucket_io_time_reset(), is leading to transaction iterator overflows - this standardizes the workaround. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 8 +++++++- fs/bcachefs/dirent.c | 5 ++--- fs/bcachefs/fs-io.c | 4 ++++ fs/bcachefs/io.c | 4 ++++ 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 72b9605cf3e7..af1922c448ed 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -328,13 +328,19 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, : bch2_btree_iter_peek(iter); } +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 + ? 
-EINTR : 0; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) { struct bkey_s_c k; - while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) || + while (btree_trans_too_many_iters(trans) || (k = __bch2_btree_iter_peek(iter, flags), bkey_err(k) == -EINTR)) bch2_trans_begin(trans); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4dfcc955675b..fe4a85a6a8cb 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -531,10 +531,9 @@ retry: * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it */ - if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) { - ret = -EINTR; + ret = btree_trans_too_many_iters(&trans); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); err: diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c014f10885dc..b36685ebba69 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -881,6 +881,10 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; } err: bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c5bd17f913f2..03bea2ddfb39 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2395,6 +2395,10 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; } err: bch2_trans_iter_exit(&trans, &iter); -- cgit From 7279c1a24c3dd523b5824aaf24cee9e2a55c76f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Nov 2021 20:00:34 -0500 Subject: bcachefs: Kill PAGE_SECTOR_SHIFT Replace it with the new, standard PAGE_SECTORS_SHIFT Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 12 ++++++------ fs/bcachefs/util.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b36685ebba69..16edd60f9b78 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -752,7 +752,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; struct page *page = readpage_iter_next(iter); int ret; @@ -932,7 +932,7 @@ void bch2_readahead(struct readahead_control *ractl) readpages_iter.idx++; - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); @@ -955,7 +955,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTOR_SHIFT; + (sector_t) page->index << PAGE_SECTORS_SHIFT; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); @@ -1231,7 +1231,7 @@ do_io: } BUG_ON(!sectors); - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; + sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || @@ -2287,8 +2287,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * page */ ret = range_has_data(c, inode->ei_subvol, - POS(inode->v.i_ino, 
index << PAGE_SECTOR_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); if (ret <= 0) return ret; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 41dfc5867c9e..969139fef086 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -18,8 +18,6 @@ #include #include -#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) - struct closure; #ifdef CONFIG_BCACHEFS_DEBUG -- cgit From e6ec361f95fcbaffab65221cd4a56e0e10fdafc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Nov 2021 18:17:04 -0500 Subject: bcachefs: Fix page state when reading into !PageUptodate pages This patch adds code to read page state before writing to pages that aren't uptodate, which corrects i_sectors being temporarily too large and means we may not need to get a disk reservation. Signed-off-by: Kent Overstreet # Conflicts: # fs/bcachefs/fs-io.c --- fs/bcachefs/fs-io.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 16edd60f9b78..53998df4c14b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -272,6 +272,7 @@ struct bch_page_sector { struct bch_page_state { spinlock_t lock; atomic_t write_count; + bool uptodate; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -331,6 +332,86 @@ static unsigned bkey_to_sector_state(const struct bkey *k) return SECTOR_UNALLOCATED; } +static void __bch2_page_state_set(struct page *page, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); + unsigned i; + + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } + + if (i == PAGE_SECTORS) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, + struct page **pages, unsigned nr_pages) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; + unsigned pg_idx = 0; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + while (pg_idx < nr_pages) { + struct page *page = pages[pg_idx]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; + unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + + BUG_ON(k.k->p.offset < pg_start); + BUG_ON(bkey_start_offset(k.k) > pg_end); + + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) + __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + + if (k.k->p.offset < pg_end) + break; + pg_idx++; + } + + if (pg_idx == nr_pages) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} +
static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; @@ -339,17 +420,9 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k.k); - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = bch2_page_state(bv.bv_page); - unsigned i; - - for (i = bv.bv_offset >> 9; - i < (bv.bv_offset + bv.bv_len) >> 9; - i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; - } - } + bio_for_each_segment(bv, bio, iter) + __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, + bv.bv_len >> 9, nr_ptrs, state); } static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) @@ -436,6 +509,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, if (!s) return -ENOMEM; + BUG_ON(!s->uptodate); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -609,7 +684,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch2_page_reservation res; unsigned len; loff_t isize; - int ret = VM_FAULT_LOCKED; + int ret; bch2_page_reservation_init(c, inode, &res); @@ -635,6 +710,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + } + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; @@ -645,6 +728,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_page_reservation_put(c, inode, &res); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); @@ -1348,6 +1432,12 @@ readpage: if (ret) goto err; out: + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) + goto out; + } + ret = bch2_page_reservation_get(c, inode, page, res, offset, len, true); if (ret) { @@ -1477,20 +1567,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } while (reserved < len) { - struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned i = (offset + reserved) >> PAGE_SHIFT; + struct page *page = pages[i]; unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - reserved, PAGE_SIZE - pg_offset); -retry_reservation: - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); - if (ret && !PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); - if (!ret) - goto retry_reservation; + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), + pages + i, nr_pages - i); + if (ret) + goto out; } + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); if (ret) goto out; -- cgit From dcfc593f7b3a35e340f0cefa3281a3285ddb48e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Nov 2021 18:21:09 -0500 Subject: bcachefs: Fix page state after fallocate This tweaks the fallocate code to also update the page cache to reflect the new on disk reservations, giving us better i_sectors consistency. 
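To make the page-cache side of this concrete, here is a minimal standalone sketch of the per-sector transition (an illustration with an invented helper name, not bcachefs code; it mirrors the switch added to mark_pagecache_reserved() in the diff below, with locking, folio batching and quota handling stripped out):

  /* Sector states as tracked in bch_page_state; only the transitions
   * fallocate cares about are shown. */
  enum sector_state {
          SECTOR_UNALLOCATED,
          SECTOR_RESERVED,
          SECTOR_DIRTY,
          SECTOR_DIRTY_RESERVED,
  };

  static long sketch_mark_reserved(enum sector_state *s, unsigned nr)
  {
          long i_sectors_delta = 0;
          unsigned i;

          for (i = 0; i < nr; i++)
                  switch (s[i]) {
                  case SECTOR_UNALLOCATED:
                          s[i] = SECTOR_RESERVED;
                          break;
                  case SECTOR_DIRTY:
                          /* already accounted when it was dirtied; the new
                           * on-disk reservation accounts for it too, so back
                           * one sector out to keep i_sectors consistent */
                          s[i] = SECTOR_DIRTY_RESERVED;
                          i_sectors_delta--;
                          break;
                  default:
                          break;
                  }
          return i_sectors_delta;
  }

The decrement compensates for dirty sectors that were counted when they were dirtied; once the on-disk reservation also accounts for them, counting them again would inflate i_sectors.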
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 149 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 53998df4c14b..d5320719dc95 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -425,6 +425,108 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) bv.bv_len >> 9, nr_ptrs, state); } +static void mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + folio_lock(folio); + s = bch2_page_state(&folio->page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +static void mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + folio_lock(folio); + s = bch2_page_state(&folio->page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + switch (s->s[j].state) { + case SECTOR_UNALLOCATED: + s->s[j].state = SECTOR_RESERVED; + break; + case SECTOR_DIRTY: + s->s[j].state = SECTOR_DIRTY_RESERVED; + i_sectors_delta--; + break; + default: + break; + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -580,8 +682,7 @@ static void bch2_clear_page_bits(struct page *page) bch2_disk_reservation_put(c, &disk_res); - if (dirty_sectors) - i_sectors_acct(c, inode, NULL, dirty_sectors); + i_sectors_acct(c, inode, NULL, dirty_sectors); bch2_page_state_release(page); } @@ -629,8 +730,7 @@ static void bch2_set_page_dirty(struct bch_fs *c, spin_unlock(&s->lock); - if 
(dirty_sectors) - i_sectors_acct(c, inode, &res->quota, dirty_sectors); + i_sectors_acct(c, inode, &res->quota, dirty_sectors); if (!PageDirty(page)) filemap_dirty_folio(inode->v.i_mapping, page_folio(page)); @@ -2599,6 +2699,8 @@ int bch2_truncate(struct mnt_idmap *idmap, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); + BUG_ON(!inode->v.i_size && inode->v.i_blocks); + if (unlikely(ret)) goto err; @@ -2939,6 +3041,9 @@ bkey_err: ret = 0; } + bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ + mark_pagecache_reserved(inode, start_sector, iter.pos.offset); + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3044,39 +3149,6 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return ret; } -static void mark_range_unallocated(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - pgoff_t index = start >> PAGE_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - struct bch_page_state *s; - - folio_lock(folio); - s = bch2_page_state(&folio->page); - - if (s) { - spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3122,7 +3194,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - mark_range_unallocated(src, pos_src, pos_src + aligned_len); + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, -- cgit From fc6c01e2ea5292a740ddedb5b2b3805e8ecb3f4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Nov 2021 13:42:05 -0500 Subject: bcachefs: Convert bucket_alloc_ret to negative error codes Start a new header, errcode.h, for bcachefs-private error codes - more error codes will be converted later. This patch just converts bucket_alloc_ret so that they can be mixed with standard error codes and passed as ERR_PTR errors - the ec.c code was doing this already, but incorrectly. 
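To illustrate the convention being adopted, here is a small self-contained userspace sketch (invented names, not bcachefs code): private error codes are defined above the errno range and always returned negated, so they mix freely with -ENOMEM and friends and survive an ERR_PTR()-style round trip:

  #include <errno.h>
  #include <stdio.h>

  /* Private codes start above the errno range, as in errcode.h. */
  enum {
          EX_OPEN_BUCKETS_EMPTY = 2048,
          EX_FREELIST_EMPTY,              /* allocator not keeping up */
          EX_INSUFFICIENT_DEVICES,
  };

  /* Stand-ins for the kernel's ERR_PTR()/PTR_ERR() helpers. */
  static void *err_ptr(long err)          { return (void *) err; }
  static long ptr_err(const void *p)      { return (long) p; }

  static int alloc_buckets(int want, int have)
  {
          if (!have)
                  return -ENOMEM;                         /* standard errno */
          if (have < want)
                  return -EX_INSUFFICIENT_DEVICES;        /* private, also negative */
          return 0;
  }

  static void *get_write_point(int want, int have)
  {
          int ret = alloc_buckets(want, have);

          /* A negative code passes straight through an ERR_PTR()-style
           * encoding; a positive enum has to be negated at every boundary,
           * which is the kind of sign error the ec.c path had. */
          return ret ? err_ptr(ret) : (void *) "write point";
  }

  int main(void)
  {
          long err = ptr_err(get_write_point(3, 1));

          if (err == -EX_INSUFFICIENT_DEVICES)
                  printf("insufficient devices: %ld\n", err);
          return 0;
  }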
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 35 ++++++++++++++++------------------- fs/bcachefs/alloc_foreground.h | 10 +--------- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/ec.c | 9 ++++----- fs/bcachefs/errcode.h | 12 ++++++++++++ 5 files changed, 34 insertions(+), 33 deletions(-) create mode 100644 fs/bcachefs/errcode.h (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6bf4140477a0..e3fe4d7bbe21 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -348,8 +348,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *c, +int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -363,7 +362,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; - enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; + int ret = -INSUFFICIENT_DEVICES; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -381,7 +380,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { - ret = -PTR_ERR(ob); + ret = PTR_ERR(ob); if (cl) return ret; @@ -394,7 +393,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, bch2_dev_stripe_increment(ca, stripe); if (*nr_effective >= nr_replicas) - return ALLOC_SUCCESS; + return 0; } return ret; @@ -408,8 +407,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static enum bucket_alloc_ret -bucket_alloc_from_stripe(struct bch_fs *c, +static int bucket_alloc_from_stripe(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, @@ -505,8 +503,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static enum bucket_alloc_ret -open_bucket_add_buckets(struct bch_fs *c, +static int open_bucket_add_buckets(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, @@ -522,7 +519,7 @@ open_bucket_add_buckets(struct bch_fs *c, struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - enum bucket_alloc_ret ret; + int ret; unsigned i; rcu_read_lock(); @@ -550,8 +547,8 @@ open_bucket_add_buckets(struct bch_fs *c, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == FREELIST_EMPTY || - ret == OPEN_BUCKETS_EMPTY) + if (ret == -FREELIST_EMPTY || + ret == -OPEN_BUCKETS_EMPTY) return ret; if (*nr_effective >= nr_replicas) return 0; @@ -575,7 +572,7 @@ retry_blocking: ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { + if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { cl = _cl; goto retry_blocking; } @@ -772,7 +769,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective, write_points_nr; unsigned ob_flags = 0; bool have_cache; - enum bucket_alloc_ret ret; + int ret; int i; if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) @@ -821,7 +818,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == INSUFFICIENT_DEVICES && + if (ret == -INSUFFICIENT_DEVICES && nr_effective >= nr_replicas_required) ret = 0; @@ -854,15 +851,15 @@ err: 
mutex_unlock(&wp->lock); - if (ret == FREELIST_EMPTY && + if (ret == -FREELIST_EMPTY && try_decrease_writepoints(c, write_points_nr)) goto retry; switch (ret) { - case OPEN_BUCKETS_EMPTY: - case FREELIST_EMPTY: + case -OPEN_BUCKETS_EMPTY: + case -FREELIST_EMPTY: return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); - case INSUFFICIENT_DEVICES: + case -INSUFFICIENT_DEVICES: return ERR_PTR(-EROFS); default: BUG(); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index c658295cb8e0..2e81712ba8d1 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -12,13 +12,6 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; -enum bucket_alloc_ret { - ALLOC_SUCCESS, - OPEN_BUCKETS_EMPTY, - FREELIST_EMPTY, /* Allocator thread not keeping up */ - INSUFFICIENT_DEVICES, -}; - struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; }; @@ -98,8 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 077d366961ff..4d3cfb64a656 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -200,6 +200,7 @@ #include #include "bcachefs_format.h" +#include "errcode.h" #include "fifo.h" #include "opts.h" #include "util.h" diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index bc8bb963ae43..9624cd5e5ada 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1272,16 +1272,15 @@ found: return h; } -static enum bucket_alloc_ret -new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, - struct closure *cl) +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; - enum bucket_alloc_ret ret = ALLOC_SUCCESS; + int ret = 0; for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { if (test_bit(i, h->s->blocks_gotten)) { @@ -1516,7 +1515,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, err: bch2_ec_stripe_head_put(c, h); - return ERR_PTR(-ret); + return ERR_PTR(ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 index 000000000000..f7d12915c1cc --- /dev/null +++ b/fs/bcachefs/errcode.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +enum { + /* Bucket allocator: */ + OPEN_BUCKETS_EMPTY = 2048, + FREELIST_EMPTY, /* Allocator thread not keeping up */ + INSUFFICIENT_DEVICES, +}; + +#endif /* _BCACHFES_ERRCODE_H */ -- cgit From f449bedb068447d3e9b1d64a41852b2aaca36fda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Nov 2021 16:36:50 -0500 Subject: bcachefs: Fix reflink path for snapshots make_extent_indirect() was missing the BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE - it's updating the extent in the original snapshot, not the current one.
Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8dcac7815c9f..c8d6d73681e0 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -184,7 +184,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); err: c->reflink_hint = reflink_iter.pos.offset; bch2_trans_iter_exit(trans, &reflink_iter); -- cgit From 502cfb3591ec1f3d133c7eb281b8b93ca2bb2768 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Nov 2021 14:08:58 -0500 Subject: bcachefs: Kill bch2_replicas_delta_list_marked() This changes bch2_trans_fs_usage_apply() to handle failure (replicas entry missing) by reverting the changes it made - meaning we can make the main transaction commit path a bit slimmer, and perhaps also simplify some locking in upcoming patches. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 30 ++++++++++++------------------ fs/bcachefs/buckets.c | 17 +++++++++++++---- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/replicas.c | 14 -------------- fs/bcachefs/replicas.h | 1 - 5 files changed, 26 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 61c87525e48d..c5cbd3a4d66b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -438,17 +438,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, marking = true; } - if (marking) { - percpu_down_read(&c->mark_lock); - } - - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - /* * Don't get journal reservation until after we know insert will * succeed: @@ -457,7 +446,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); if (ret) - goto err; + return ret; } else { trans->journal_res.seq = c->journal.replay_journal_seq; } @@ -485,22 +474,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } + if (marking) + percpu_down_read(&c->mark_lock); + + if (marking && trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) { + percpu_up_read(&c->mark_lock); + return BTREE_INSERT_NEED_MARK_REPLICAS; + } + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->path, i->k, i->flags); - if (marking && trans->fs_usage_deltas) - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); - if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -err: - if (marking) { + + if (marking) percpu_up_read(&c->mark_lock); - } return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2c0a385ace50..e919afe6a110 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1363,14 +1363,14 @@ void fs_usage_apply_warn(struct btree_trans *trans, __WARN(); } -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list 
*deltas) { struct bch_fs *c = trans->c; static int warned_disk_usage = 0; bool warn = false; unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d; + struct replicas_delta *d = deltas->d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; @@ -1389,7 +1389,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; } dst->nr_inodes += deltas->nr_inodes; @@ -1427,6 +1428,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + return 0; +need_mark: + /* revert changes: */ + for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); + return -1; } /* trans_mark: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 54a29bf69d67..7b7d08af2253 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -233,7 +233,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 57e093983dfc..9bfe7fa51d32 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -475,20 +475,6 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, /* replicas delta list: */ -bool bch2_replicas_delta_list_marked(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - - percpu_rwsem_assert_held(&c->mark_lock); - - for (d = r->d; d != top; d = replicas_delta_next(d)) - if (bch2_replicas_entry_idx(c, &d->r) < 0) - return false; - return true; -} - int bch2_replicas_delta_list_mark(struct bch_fs *c, struct replicas_delta_list *r) { diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 72ac544f16d8..66ca88deb0c0 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -48,7 +48,6 @@ replicas_delta_next(struct replicas_delta *d) return (void *) d + replicas_entry_bytes(&d->r) + 8; } -bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -- cgit From 58e1ea4bcb057388636b0098524d6e0647eb40c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Nov 2021 14:31:19 -0500 Subject: bcachefs: Push c->mark_lock usage down to where it is needed This changes the bch2_mark_key() and related paths to take mark lock where it is needed, instead of taking it in the upper transaction commit path - by pushing down locking we'll be able to handle fsck errors locally instead of requiring a separate check in the btree_gc code for replicas being marked. 
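The intended structure, as a simplified standalone sketch (invented names; a plain rwlock stands in for the percpu mark_lock, and a flat array stands in for the usage counters):

  #include <pthread.h>

  static pthread_rwlock_t mark_lock = PTHREAD_RWLOCK_INITIALIZER;
  static long replicas_usage[8];

  /* The helper that touches the counters now takes the lock itself, so it
   * can also drop it, repair missing state and retry without involving the
   * caller. */
  static int update_replicas(unsigned idx, long sectors)
  {
          int ret = 0;

          pthread_rwlock_rdlock(&mark_lock);
          if (idx >= 8)
                  ret = -1;               /* error detected and handled locally */
          else
                  replicas_usage[idx] += sectors;
          pthread_rwlock_unlock(&mark_lock);
          return ret;
  }

  /* The transaction commit path just calls the helpers; it no longer
   * brackets all of them with percpu_down_read()/percpu_up_read(). */
  static int commit_path(void)
  {
          return update_replicas(0, 64) ?: update_replicas(1, -64);
  }

  int main(void)
  {
          return commit_path();
  }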
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 +- fs/bcachefs/btree_update_leaf.c | 12 +--- fs/bcachefs/buckets.c | 124 +++++++++++++++++++++++----------------- fs/bcachefs/buckets.h | 2 +- fs/bcachefs/ec.c | 6 +- 5 files changed, 84 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a15b3bfa9d47..a6cba09dae3e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -710,12 +710,16 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; unsigned flags = BTREE_TRIGGER_GC| (initial ? BTREE_TRIGGER_NOATOMIC : 0); char buf[200]; int ret = 0; + deleted.p = k->k->p; + if (initial) { BUG_ON(bch2_journal_seq_verify && k->k->version.lo > journal_cur_seq(&c->journal)); @@ -754,7 +758,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - ret = bch2_mark_key(trans, *k, flags); + ret = bch2_mark_key(trans, old, *k, flags); fsck_err: err: if (ret) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c5cbd3a4d66b..205eaee11da0 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -474,14 +474,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - if (marking) - percpu_down_read(&c->mark_lock); - - if (marking && trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) { - percpu_up_read(&c->mark_lock); + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return BTREE_INSERT_NEED_MARK_REPLICAS; - } trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) @@ -493,9 +488,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) do_btree_insert_one(trans, i); - if (marking) - percpu_up_read(&c->mark_lock); - return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e919afe6a110..78a134d5e63b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -144,6 +144,7 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + percpu_rwsem_assert_held(&c->mark_lock); BUG_ON(!gc && !journal_seq); return this_cpu_ptr(gc @@ -375,8 +376,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) journal_seq = 1; - percpu_rwsem_assert_held(&c->mark_lock); - preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); @@ -426,17 +425,24 @@ static inline int update_replicas(struct bch_fs *c, unsigned journal_seq, bool gc) { struct bch_fs_usage __percpu *fs_usage; - int idx = bch2_replicas_entry_idx(c, r); + int idx, ret = 0; - if (idx < 0) - return -1; + percpu_down_read(&c->mark_lock); + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0) { + ret = -1; + goto err; + } preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); - return 0; +err: + percpu_up_read(&c->mark_lock); + return ret; } static inline int update_cached_sectors(struct bch_fs *c, @@ -551,6 +557,7 @@ static int 
bch2_mark_alloc(struct btree_trans *trans, struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; + int ret = 0; /* We don't do anything for deletions - do we?: */ if (!bkey_is_alloc(new.k)) @@ -577,6 +584,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, if (new.k->p.offset >= ca->mi.nbuckets) return 0; + percpu_down_read(&c->mark_lock); g = __bucket(ca, new.k->p.offset, gc); u = bch2_alloc_unpack(new); @@ -601,6 +609,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, g->gen_valid = 1; g->stripe = u.stripe; g->stripe_redundancy = u.stripe_redundancy; + percpu_up_read(&c->mark_lock); /* * need to know if we're getting called from the invalidate path or @@ -609,10 +618,11 @@ static int bch2_mark_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, - journal_seq, gc)) { + ret = update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, + journal_seq, gc); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); - return -1; + return ret; } trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), @@ -786,24 +796,28 @@ static int mark_stripe_bucket(struct btree_trans *trans, const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket *g; struct bucket_mark new, old; char buf[200]; - int ret; + int ret = 0; + + percpu_down_read(&c->mark_lock); + g = PTR_BUCKET(ca, ptr, gc); if (g->stripe && g->stripe != k.k->p.offset) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EINVAL; + ret = -EINVAL; + goto err; } old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, new.dirty_sectors, new.cached_sectors); if (ret) - return ret; + goto err; if (parity) { new.data_type = BCH_DATA_parity; @@ -820,6 +834,9 @@ static int mark_stripe_bucket(struct btree_trans *trans, g->stripe_redundancy = s->nr_redundant; bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); +err: + percpu_up_read(&c->mark_lock); + return 0; } @@ -857,10 +874,13 @@ static int bch2_mark_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); + struct bucket *g; u8 bucket_data_type; u64 v; - int ret; + int ret = 0; + + percpu_down_read(&c->mark_lock); + g = PTR_BUCKET(ca, &p.ptr, gc); v = atomic64_read(&g->_mark.v); do { @@ -873,7 +893,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, &new.dirty_sectors, &new.cached_sectors); if (ret) - return ret; + goto err; new.data_type = bucket_data_type; @@ -893,8 +913,10 @@ static int bch2_mark_pointer(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); +err: + percpu_up_read(&c->mark_lock); - return 0; + return ret; } static int bch2_mark_stripe_ptr(struct btree_trans *trans, @@ -982,13 +1004,14 @@ static int bch2_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (!stale) - if (update_cached_sectors(c, p.ptr.dev, disk_sectors, - journal_seq, gc)) { + if (!stale) { + ret = update_cached_sectors(c, p.ptr.dev, 
disk_sectors, + journal_seq, gc); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); - return -1; - + return ret; } + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -1008,12 +1031,13 @@ static int bch2_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) { - if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { + ret = update_replicas(c, &r.e, dirty_sectors, journal_seq, gc); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } @@ -1095,14 +1119,15 @@ static int bch2_mark_stripe(struct btree_trans *trans, return ret; } - if (update_replicas(c, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc)) { + ret = update_replicas(c, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, new); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } @@ -1127,11 +1152,15 @@ static int bch2_mark_inode(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); fs_usage->nr_inodes += bkey_is_inode(new.k); fs_usage->nr_inodes -= bkey_is_inode(old.k); + preempt_enable(); + percpu_up_read(&c->mark_lock); } return 0; } @@ -1150,14 +1179,18 @@ static int bch2_mark_reservation(struct btree_trans *trans, sectors = -sectors; sectors *= replicas; + percpu_down_read(&c->mark_lock); preempt_disable(); + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; } @@ -1245,10 +1278,10 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -static int bch2_mark_key_locked(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) +int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) { struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; @@ -1278,22 +1311,6 @@ static int bch2_mark_key_locked(struct btree_trans *trans, } } -int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - int ret; - - deleted.p = new.k->p; - - percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(trans, old, new, flags); - percpu_up_read(&c->mark_lock); - - return ret; -} - int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey_i *new, unsigned flags) { @@ -1315,12 +1332,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key_locked(trans, old, deleted, + bch2_mark_key(trans, old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } @@ -1376,8 +1393,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, s64 added = 0, should_not_have_added; unsigned i; - percpu_rwsem_assert_held(&c->mark_lock); - + percpu_down_read(&c->mark_lock); preempt_disable(); dst = fs_usage_ptr(c, trans->journal_res.seq, false); @@ -1425,6 +1441,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, } preempt_enable(); + percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); @@ -1435,6 +1452,7 @@ need_mark: BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); preempt_enable(); + percpu_up_read(&c->mark_lock); return -1; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7b7d08af2253..cc3e8b9b8faf 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -226,7 +226,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9624cd5e5ada..b8e9bc91bf0b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1635,12 +1635,16 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; struct bch_fs *c = trans->c; int ret = 0; + deleted.p = k.k->p; + if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(trans, k, + bch2_mark_key(trans, old, k, BTREE_TRIGGER_NOATOMIC); return ret; -- cgit From 181fe42a75c60ecf37509f6c39162115cc66216b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Nov 2021 15:13:54 -0500 Subject: bcachefs: Handle replica marking fsck errors locally This simplifies the code quite a bit and eliminates an inconsistency - a given bkey doesn't necessarily translate to a single replicas entry for disk space accounting. 
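The shape of the new error handling, as a small self-contained sketch (invented types, locking elided; it mirrors the retry added to update_replicas() in the diff below): if the replicas entry for a key is missing, the accounting helper creates it and retries on the spot rather than relying on an earlier pass having marked it:

  #include <stdbool.h>
  #include <stdio.h>

  struct fs {
          bool have_entry[4];     /* stands in for the superblock replicas table */
          long usage[4];
  };

  static int replicas_entry_idx(struct fs *c, unsigned r)
  {
          return r < 4 && c->have_entry[r] ? (int) r : -1;
  }

  static int mark_replicas(struct fs *c, unsigned r)
  {
          if (r >= 4)
                  return -1;
          c->have_entry[r] = true;        /* "add it to the superblock" */
          return 0;
  }

  static int update_replicas(struct fs *c, unsigned r, long sectors)
  {
          int idx = replicas_entry_idx(c, r);

          if (idx < 0) {
                  /* fsck error handled here: create the missing entry and
                   * retry, instead of a separate pre-check elsewhere */
                  if (mark_replicas(c, r))
                          return -1;
                  idx = replicas_entry_idx(c, r);
          }
          c->usage[idx] += sectors;
          return 0;
  }

  int main(void)
  {
          struct fs c = { { false } };

          if (!update_replicas(&c, 2, 128))
                  printf("usage[2] = %ld\n", c.usage[2]);
          return 0;
  }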
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 13 ------------- fs/bcachefs/buckets.c | 38 ++++++++++++++++++++++++++---------- fs/bcachefs/replicas.c | 52 -------------------------------------------------- fs/bcachefs/replicas.h | 2 -- 4 files changed, 28 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a6cba09dae3e..b692451f91b5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -715,7 +715,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned flags = BTREE_TRIGGER_GC| (initial ? BTREE_TRIGGER_NOATOMIC : 0); - char buf[200]; int ret = 0; deleted.p = k->k->p; @@ -733,18 +732,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, k->k->version.lo, atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k->k->version.lo); - - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, - "superblock not marked as containing replicas\n" - " while marking %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - ret = bch2_mark_bkey_replicas(c, *k); - if (ret) { - bch_err(c, "error marking bkey replicas: %i", ret); - goto err; - } - } } ptrs = bch2_bkey_ptrs_c(*k); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 78a134d5e63b..c4d72a499955 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -420,16 +420,30 @@ static inline int __update_replicas(struct bch_fs *c, return 0; } -static inline int update_replicas(struct bch_fs *c, +static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct bch_replicas_entry *r, s64 sectors, unsigned journal_seq, bool gc) { struct bch_fs_usage __percpu *fs_usage; int idx, ret = 0; + char buf[200]; percpu_down_read(&c->mark_lock); idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); + if (ret) + return ret; + + percpu_down_read(&c->mark_lock); + idx = bch2_replicas_entry_idx(c, r); + } if (idx < 0) { ret = -1; goto err; @@ -441,11 +455,13 @@ static inline int update_replicas(struct bch_fs *c, fs_usage->replicas[idx] += sectors; preempt_enable(); err: +fsck_err: percpu_up_read(&c->mark_lock); return ret; } static inline int update_cached_sectors(struct bch_fs *c, + struct bkey_s_c k, unsigned dev, s64 sectors, unsigned journal_seq, bool gc) { @@ -453,7 +469,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, &r.e, sectors, journal_seq, gc); + return update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static struct replicas_delta_list * @@ -618,8 +634,9 @@ static int bch2_mark_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - ret = update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, - journal_seq, gc); + ret = update_cached_sectors(c, new, ca->dev_idx, + -old_m.cached_sectors, + journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return ret; @@ -920,6 +937,7 @@ err: } static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, s64 sectors, @@ -959,7 +977,7 @@ static int 
bch2_mark_stripe_ptr(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); return 0; } @@ -1005,8 +1023,8 @@ static int bch2_mark_extent(struct btree_trans *trans, if (p.ptr.cached) { if (!stale) { - ret = update_cached_sectors(c, p.ptr.dev, disk_sectors, - journal_seq, gc); + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); return ret; @@ -1016,7 +1034,7 @@ static int bch2_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, + ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, disk_sectors, flags); if (ret) return ret; @@ -1031,7 +1049,7 @@ static int bch2_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) { - ret = update_replicas(c, &r.e, dirty_sectors, journal_seq, gc); + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); if (ret) { char buf[200]; @@ -1119,7 +1137,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, return ret; } - ret = update_replicas(c, &m->r.e, + ret = update_replicas(c, new, &m->r.e, ((s64) m->sectors * m->nr_redundant), journal_seq, gc); if (ret) { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 9bfe7fa51d32..33bba6fdb180 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -434,45 +434,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) return __bch2_mark_replicas(c, r, false); } -static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, - bool check) -{ - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - memset(&search, 0, sizeof(search)); - - for (i = 0; i < cached.nr; i++) { - bch2_replicas_entry_cached(&search.e, cached.devs[i]); - - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - } - - bch2_bkey_to_replicas(&search.e, k); - - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - - if (search.e.data_type == BCH_DATA_parity) { - search.e.data_type = BCH_DATA_cached; - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - - search.e.data_type = BCH_DATA_user; - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - } - - return 0; -} - /* replicas delta list: */ int bch2_replicas_delta_list_mark(struct bch_fs *c, @@ -487,19 +448,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c, return ret; } -/* bkey replicas: */ - -bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k) -{ - return __bch2_mark_bkey_replicas(c, k, true) == 0; -} - -int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -{ - return __bch2_mark_bkey_replicas(c, k, false); -} - /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 66ca88deb0c0..d237d7c51ccb 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -51,8 +51,6 @@ replicas_delta_next(struct replicas_delta *d) int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -bool 
bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev) -- cgit From b547d005d54209dc3a14ffd7924c73e32ba2e3a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Nov 2021 16:38:27 -0500 Subject: bcachefs: Erasure coding fixes When we added the stripe and stripe_redundancy fields to alloc keys, we neglected to add them to the functions that convert back and forth with the in-memory types. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++ fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/btree_gc.c | 16 +++--- fs/bcachefs/buckets.c | 119 +++++++++++++++++++++++++++++------------ fs/bcachefs/ec.c | 39 +++++++++++--- 5 files changed, 130 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 10514476cffe..dc1e09b138b6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -336,6 +336,9 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) g->_mark.data_type = u.data_type; g->_mark.dirty_sectors = u.dirty_sectors; g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 370573f8e05d..b1efc1494dc4 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -65,6 +65,8 @@ alloc_mem_to_key(struct btree_iter *iter, .cached_sectors = m.cached_sectors, .read_time = g->io_time[READ], .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, }; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b692451f91b5..6cde4234f5e9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1176,14 +1176,14 @@ static int bch2_gc_done(struct bch_fs *c, set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ - if (dst->b[b].mark._f != src->b[b].mark._f) { \ + if (dst->b[b]._f != src->b[b]._f) { \ if (verify) \ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", dev, b, \ dst->b[b].mark.gen, \ bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b].mark._f, src->b[b].mark._f); \ - dst->b[b]._mark._f = src->b[b].mark._f; \ + dst->b[b]._f, src->b[b]._f); \ + dst->b[b]._f = src->b[b]._f; \ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) 
\ @@ -1229,11 +1229,13 @@ static int bch2_gc_done(struct bch_fs *c, size_t b; for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(gen); - copy_bucket_field(data_type); + copy_bucket_field(_mark.gen); + copy_bucket_field(_mark.data_type); + copy_bucket_field(_mark.stripe); + copy_bucket_field(_mark.dirty_sectors); + copy_bucket_field(_mark.cached_sectors); + copy_bucket_field(stripe_redundancy); copy_bucket_field(stripe); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); dst->b[b].oldest_gen = src->b[b].oldest_gen; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c4d72a499955..66f072905173 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -810,6 +810,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -818,10 +820,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, char buf[200]; int ret = 0; + /* * XXX doesn't handle deletion */ + percpu_down_read(&c->mark_lock); g = PTR_BUCKET(ca, ptr, gc); - if (g->stripe && g->stripe != k.k->p.offset) { + if (g->mark.dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, @@ -831,20 +836,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, } old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, new.dirty_sectors, new.cached_sectors); if (ret) goto err; - if (parity) { - new.data_type = BCH_DATA_parity; - new.dirty_sectors = le16_to_cpu(s->sectors); - } + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; if (journal_seq) { new.journal_seq_valid = 1; new.journal_seq = journal_seq; } + + new.stripe = true; })); g->stripe = k.k->p.offset; @@ -1124,6 +1131,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, } if (gc) { + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + /* * gc recalculates this field from stripe ptr * references: @@ -1656,50 +1668,75 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct bkey_alloc_buf *a; struct btree_iter iter; struct bkey_alloc_unpacked u; - bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; int ret = 0; + if (deleting) + sectors = -sectors; + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); if (IS_ERR(a)) return PTR_ERR(a); - if (parity) { - s64 sectors = le16_to_cpu(s.v->sectors); - - if (deleting) - sectors = -sectors; - - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? BCH_DATA_parity - : 0; - } + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + u.gen, u.data_type, + u.dirty_sectors, u.cached_sectors); + if (ret) + goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, - "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + if (bch2_fs_inconsistent_on(u.stripe || + u.stripe_redundancy, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, u.stripe, s.k->p.offset)) { ret = -EIO; goto err; } + if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + u.stripe = s.k->p.offset; u.stripe_redundancy = s.v->nr_redundant; } else { + if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || + u.stripe_redundancy != s.v->nr_redundant, c, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, u.gen, + s.k->p.offset, u.stripe)) { + ret = -EIO; + goto err; + } + u.stripe = 0; u.stripe_redundancy = 0; } + u.dirty_sectors += sectors; + if (data_type) + u.data_type = !deleting ? data_type : 0; + bch2_alloc_pack(c, a, u); bch2_trans_update(trans, &iter, &a->k, 0); err: @@ -1714,7 +1751,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c_stripe old_s = { .k = NULL }; struct bkey_s_c_stripe new_s = { .k = NULL }; struct bch_replicas_padded r; - unsigned i; + unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) @@ -1732,18 +1769,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; + BUG_ON(new_s.k && old_s.k && + (new_s.v->nr_blocks != old_s.v->nr_blocks || + new_s.v->nr_redundant != old_s.v->nr_redundant)); + + nr_blocks = new_s.k ? 
new_s.v->nr_blocks : old_s.v->nr_blocks; + if (new_s.k) { s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - - for (i = 0; i < new_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, - i, false); - if (ret) - return ret; - } } if (old_s.k) { @@ -1751,12 +1787,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, old); update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + } + + for (i = 0; i < nr_blocks; i++) { + if (new_s.k && old_s.k && + !memcmp(&new_s.v->ptrs[i], + &old_s.v->ptrs[i], + sizeof(new_s.v->ptrs[i]))) + continue; - for (i = 0; i < old_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, - i, true); + if (new_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); if (ret) - return ret; + break; + } + + if (old_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (ret) + break; } } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b8e9bc91bf0b..689602d18589 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -15,6 +15,7 @@ #include "io.h" #include "keylist.h" #include "recovery.h" +#include "replicas.h" #include "super-io.h" #include "util.h" @@ -1635,17 +1636,41 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + const struct bch_stripe *s; struct bch_fs *c = trans->c; + struct stripe *m; + unsigned i; int ret = 0; - deleted.p = k.k->p; + if (k.k->type != KEY_TYPE_stripe) + return 0; + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + return ret; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->stripes[0], k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < s->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(s, i); + m->blocks_nonempty += !!m->block_sectors[i]; + m->ptrs[i] = s->ptrs[i]; + } + + bch2_bkey_to_replicas(&m->r.e, k); - if (k.k->type == KEY_TYPE_stripe) - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(trans, old, k, - BTREE_TRIGGER_NOATOMIC); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); return ret; } -- cgit From 92d2ec10926d2ba8c38ba0ecada69cfd7a4dd3c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Dec 2021 03:47:54 -0500 Subject: bcachefs: Fix btree_path leaks in bch2_trans_update() bch2_trans_update() had some dodgy gets() and puts() - this fixes a few leaks. 
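As a sketch of the pattern rather than part of the patch itself: making a journal parameter a normal option means wiring it up in three places, all visible in the diff below. For journal_flush_delay, roughly:

        /* superblock field (bcachefs_format.h): */
        LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY, struct bch_sb, flags[3], 30, 62);

        /*
         * opts.h table entry binds the mount option to that field and gives
         * it its 1000ms default (abbreviated):
         * x(journal_flush_delay, u32, OPT_MOUNT|OPT_RUNTIME,
         *   OPT_UINT(0, U32_MAX), BCH_SB_JOURNAL_FLUSH_DELAY, 1000, ...)
         */

        /* callers now read the cached option instead of j->write_delay_ms: */
        mod_delayed_work(c->io_complete_wq, &j->write_work,
                         msecs_to_jiffies(c->opts.journal_flush_delay));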
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 205eaee11da0..65facdd1536f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1336,8 +1336,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .ip_allocated = _RET_IP_, }; - __btree_path_get(n.path, true); - #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(i != trans->updates && @@ -1374,16 +1372,17 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (n.cached && !i->cached) { i->k = n.k; i->flags = n.flags; - - __btree_path_get(n.path, false); - } else { - bch2_path_put(trans, i->path, true); - *i = n; + return 0; } + + bch2_path_put(trans, i->path, true); + *i = n; } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); + __btree_path_get(n.path, true); + return 0; } -- cgit From 2430e72f42778a9448ff386686856b61b49f5074 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Dec 2021 20:07:19 -0500 Subject: bcachefs: Convert journal sysfs params to regular options This converts journal_write_delay, journal_flush_disabled, and journal_reclaim_delay to normal filesystems options, and also adds them to the superblock. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +++ fs/bcachefs/journal.c | 5 +---- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 5 +++-- fs/bcachefs/journal_types.h | 2 -- fs/bcachefs/opts.h | 12 +++++++++++- fs/bcachefs/super.c | 9 +++++++++ fs/bcachefs/sysfs.c | 20 -------------------- 8 files changed, 28 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7c2846791286..bef924ab12a8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1440,6 +1440,9 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); /* * Features: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 56c477bbce0f..24d2ca676cad 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -310,7 +310,7 @@ static int journal_entry_open(struct journal *j) mod_delayed_work(c->io_complete_wq, &j->write_work, - msecs_to_jiffies(j->write_delay_ms)); + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); return 0; } @@ -1103,9 +1103,6 @@ int bch2_fs_journal_init(struct journal *j) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->write_delay_ms = 1000; - j->reclaim_delay_ms = 100; - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0cd5ad3118e9..680ddba1889d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1395,7 +1395,7 @@ void bch2_journal_write(struct closure *cl) spin_lock(&j->lock); if (c->sb.features & (1ULL << 
BCH_FEATURE_journal_no_flush) && !w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3f417af16e59..4462beb52461 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -637,7 +637,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) * make sure to flush at least one journal pin: */ if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) + msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; if (j->prereserved.reserved * 4 > j->prereserved.remaining) @@ -683,6 +683,7 @@ int bch2_journal_reclaim(struct journal *j) static int bch2_journal_reclaim_thread(void *arg) { struct journal *j = arg; + struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned long delay, now; int ret = 0; @@ -700,7 +701,7 @@ static int bch2_journal_reclaim_thread(void *arg) mutex_unlock(&j->reclaim_lock); now = jiffies; - delay = msecs_to_jiffies(j->reclaim_delay_ms); + delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); j->next_reclaim = j->last_flushed + delay; if (!time_in_range(j->next_reclaim, now, now + delay)) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 0fc6569ef149..9f59e4889f52 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -262,8 +262,6 @@ struct journal { struct mutex discard_lock; bool can_discard; - unsigned write_delay_ms; - unsigned reclaim_delay_ms; unsigned long last_flush_write; u64 res_get_blocked_start; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 896b8c9c1180..187cb37cba2a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -257,13 +257,23 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ NULL, "Disable journal flush on sync/fsync\n" \ "If enabled, writes can be lost, but only since the\n"\ "last journal write (default 1 second)") \ + x(journal_reclaim_delay, u32, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ + NULL, "Delay in milliseconds before automatic journal reclaim")\ x(fsck, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b24e64317a73..22dbbf77d687 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -752,6 +752,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + /* Compat: */ + if (sb->version <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + + if (sb->version <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + c->opts = bch2_opts_default; bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); bch2_opts_apply(&c->opts, opts); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 
864be8601868..fae2356061b0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -140,8 +140,6 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(bucket_size); -read_attribute(block_size); -read_attribute(btree_node_size); read_attribute(first_bucket); read_attribute(nbuckets); read_attribute(durability); @@ -178,9 +176,6 @@ read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); -rw_attribute(journal_write_delay_ms); -rw_attribute(journal_reclaim_delay_ms); - rw_attribute(discard); rw_attribute(cache_replacement_policy); rw_attribute(label); @@ -357,11 +352,6 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - - sysfs_print(block_size, block_bytes(c)); - sysfs_print(btree_node_size, btree_bytes(c)); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); @@ -475,9 +465,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - if (attr == &sysfs_btree_gc_periodic) { ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ?: (ssize_t) size; @@ -564,14 +551,9 @@ SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { &sysfs_minor, - &sysfs_block_size, - &sysfs_btree_node_size, &sysfs_btree_cache_size, &sysfs_btree_avg_write_size, - &sysfs_journal_write_delay_ms, - &sysfs_journal_reclaim_delay_ms, - &sysfs_promote_whole_extents, &sysfs_compression_stats, @@ -846,7 +828,6 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); sysfs_print(bucket_size, bucket_bytes(ca)); - sysfs_print(block_size, block_bytes(c)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); sysfs_print(durability, ca->mi.durability); @@ -978,7 +959,6 @@ SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, &sysfs_bucket_size, - &sysfs_block_size, &sysfs_first_bucket, &sysfs_nbuckets, &sysfs_durability, -- cgit From 47b15c5760d5eec6c69f207eda7f779c2170e285 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Dec 2021 21:52:09 -0500 Subject: bcachefs: Fix copygc sectors_to_move calculation With erasure coding, copygc's count of sectors to move was off, which matters for the debug statement it prints out when it's not able to move all the data it tried to. 
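A worked example with made-up numbers, not taken from the patch: a heap entry with i->sectors = 128 and i->replicas = 2 represents 128 sectors of user data to relocate but 256 sectors that copygc must actually write, so the reserve check has to compare against the latter. The corrected accounting, abbreviated from the diff below:

        for (i = h->data; i < h->data + h->used; i++) {
                sectors_to_move  += i->sectors;               /* data relocated */
                sectors_to_write += i->sectors * i->replicas; /* must fit in reserve */
        }

        while (sectors_to_write > sectors_reserved) {
                BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
                sectors_to_write -= e.sectors * e.replicas;
        }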
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 +-- fs/bcachefs/movinggc.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 249d0b2be167..482dfc29385e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -768,8 +768,7 @@ static int __bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), - &stats->sectors_seen); + atomic64_add(k.k->size, &stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b05dcbbd1a47..346b9ee667ec 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -139,7 +139,7 @@ static int bch2_copygc(struct bch_fs *c) struct copygc_heap_entry e, *i; struct bucket_array *buckets; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; @@ -205,22 +205,23 @@ static int bch2_copygc(struct bch_fs *c) up_read(&ca->bucket_lock); } + /* + * Our btree node allocations also come out of RESERVE_MOVINGGC: + */ + sectors_reserved = (sectors_reserved * 3) / 4; if (!sectors_reserved) { bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); return -1; } - /* - * Our btree node allocations also come out of RESERVE_MOVINGGC: - */ - sectors_to_move = (sectors_to_move * 3) / 4; - - for (i = h->data; i < h->data + h->used; i++) - sectors_to_move += i->sectors * i->replicas; + for (i = h->data; i < h->data + h->used; i++) { + sectors_to_move += i->sectors; + sectors_to_write += i->sectors * i->replicas; + } - while (sectors_to_move > sectors_reserved) { + while (sectors_to_write > sectors_reserved) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_move -= e.sectors * e.replicas; + sectors_to_write -= e.sectors * e.replicas; } buckets_to_move = h->used; -- cgit From 506717865bd68997dd0490409fd51a9f68915cf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Dec 2021 21:53:13 -0500 Subject: bcachefs: Specify filesystem options We've got three types of options now - filesystem, device and inode, and a given option may belong to more than one of those types. This patch changes the options to specify explicitly when they're a filesystem option - in the future we'll probably be adding more device options. 
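To make the effect concrete (illustration only): with the option type bits split out from the "when may it be set" bits, callers can filter on what an option applies to instead of when it can be set. The sysfs hunk below reduces to a single flag test; roughly, with i iterating the option table as in bch2_opts_create_sysfs_files(), and the error handling sketched rather than copied:

        for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) {
                /* device- and inode-only options get no fs-level sysfs file: */
                if (!(i->mode & OPT_FS))
                        continue;

                ret = sysfs_create_file(kobj, &i->attr);
                if (ret)
                        return ret;
        }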
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 107 ++++++++++++++++++++++++++-------------------------- fs/bcachefs/sysfs.c | 2 +- 2 files changed, 55 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 187cb37cba2a..aad57a82e5fd 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -45,11 +45,12 @@ LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); /* When can be set: */ enum opt_mode { - OPT_FORMAT = (1 << 0), - OPT_MOUNT = (1 << 1), - OPT_RUNTIME = (1 << 2), - OPT_INODE = (1 << 3), - OPT_DEVICE = (1 << 4), + OPT_FS = (1 << 0), /* Filesystem option */ + OPT_DEVICE = (1 << 1), /* Device option */ + OPT_INODE = (1 << 2), /* Inode option */ + OPT_FORMAT = (1 << 3), /* May be specified at format time */ + OPT_MOUNT = (1 << 4), /* May be specified at mount time */ + OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ }; enum opt_type { @@ -87,226 +88,226 @@ enum opt_type { #define BCH_OPTS() \ x(block_size, u16, \ - OPT_FORMAT, \ + OPT_FS|OPT_FORMAT, \ OPT_SECTORS(1, 128), \ BCH_SB_BLOCK_SIZE, 8, \ "size", NULL) \ x(btree_node_size, u16, \ - OPT_FORMAT, \ + OPT_FS|OPT_FORMAT, \ OPT_SECTORS(1, 512), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_WANT, 1, \ "#", "Number of metadata replicas") \ x(data_replicas, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_WANT, 1, \ "#", "Number of data replicas") \ x(metadata_replicas_required, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_REQ, 1, \ "#", NULL) \ x(data_replicas_required, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ x(metadata_checksum, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_compression_opts), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_compression_opts), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ 
BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ "(target)", "Device or disk group for foreground writes") \ x(background_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ "(target)", "Device or disk group to move data to in the background")\ x(promote_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ "(target)", "Device or disk group to promote data to on read")\ x(erasure_code, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_INODES_USE_KEY_CACHE, true, \ NULL, "Use the btree key cache for the inodes btree") \ x(btree_node_mem_ptr_optimization, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ BCH_SB_GC_RESERVE, 8, \ "%", "Percentage of disk space to reserve for copygc")\ x(gc_reserve_bytes, u64, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_SECTORS(0, U64_MAX), \ BCH_SB_GC_RESERVE_BYTES, 0, \ "%", "Amount of disk space to reserve for copygc\n" \ "Takes precedence over gc_reserve_percent if set")\ x(root_reserve_percent, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(0, 100), \ BCH_SB_ROOT_RESERVE, 0, \ "%", "Percentage of disk space to reserve for superuser")\ x(wide_macs, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_128_BIT_MACS, false, \ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ x(inline_data, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_USRQUOTA, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_GRPQUOTA, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ 
NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ NULL, "Disable journal flush on sync/fsync\n" \ "If enabled, writes can be lost, but only since the\n"\ "last journal write (default 1 second)") \ x(journal_reclaim_delay, u32, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ NULL, "Delay in milliseconds before automatic journal reclaim")\ x(fsck, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, RATELIMIT_ERRORS, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ x(rebuild_replicas, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ - OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ @@ -316,7 +317,7 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ x(noexcl, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ @@ -326,7 +327,7 @@ enum opt_type { NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - 0, \ + OPT_FS, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, NULL) \ @@ -336,12 +337,12 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index fae2356061b0..d5d32bf16d68 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -685,7 +685,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) + if (!(i->mode & OPT_FS)) continue; ret = sysfs_create_file(kobj, &i->attr); -- cgit From 
1d81313f22205bfd844bd2e13e7e3ea5d50cd673 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Dec 2021 22:03:07 -0500 Subject: bcachefs: Make __bch2_journal_debug_to_text() more readable Switch to one line of output per pr_buf() call - longer lines but quite a bit more readable. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 83 +++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 24d2ca676cad..3ce6a78263ba 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1134,44 +1134,29 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; + unsigned long now = jiffies; unsigned i; rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, - "active journal entries:\t%llu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "flushed_seq_ondisk:\t%llu\n" - "prereserved:\t\t%u/%u\n" - "each entry reserved:\t%u\n" - "nr flush writes:\t%llu\n" - "nr noflush writes:\t%llu\n" - "nr direct reclaim:\t%llu\n" - "nr background reclaim:\t%llu\n" - "reclaim kicked:\t\t%u\n" - "reclaim runs in:\t%u ms\n" - "current entry sectors:\t%u\n" - "current entry error:\t%u\n" - "current entry:\t\t", - fifo_used(&j->pin), - journal_cur_seq(j), - journal_last_seq(j), - j->last_seq_ondisk, - j->flushed_seq_ondisk, - j->prereserved.reserved, - j->prereserved.remaining, - j->entry_u64s_reserved, - j->nr_flush_writes, - j->nr_noflush_writes, - j->nr_direct_reclaim, - j->nr_background_reclaim, - j->reclaim_kicked, - jiffies_to_msecs(j->next_reclaim - jiffies), - j->cur_entry_sectors, - j->cur_entry_error); + pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); + pr_buf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1181,15 +1166,11 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "closed\n"); break; default: - pr_buf(out, "%u/%u\n", - s.cur_entry_offset, - j->cur_entry_u64s); + pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(out, - "current entry:\t\tidx %u refcount %u\n", - s.idx, journal_state_count(s, s.idx)); + pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); i = s.idx; while (i != s.unwritten_idx) { @@ -1229,22 +1210,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - pr_buf(out, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tbucket size\t%u\n" - "\tavailable\t%u:%u\n" - "\tdiscard_idx\t%u\n" - "\tdirty_ondisk\t%u (seq %llu)\n" - "\tdirty_idx\t%u (seq %llu)\n" - "\tcur_idx\t\t%u (seq %llu)\n", - i, ja->nr, ca->mi.bucket_size, - bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), - ja->sectors_free, - ja->discard_idx, - ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], - ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], - ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + pr_buf(out, "dev %u:\n", i); + pr_buf(out, "\tnr\t\t%u\n", ja->nr); + pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } rcu_read_unlock(); -- cgit From 94a3e1a6c1bd441b58972ee0216593fc0b09ee75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Dec 2021 00:30:49 -0500 Subject: bcachefs: bch2_trans_update() is now __must_check With snapshots, bch2_trans_update() has to check if we need a whitout, which can cause a transaction restart, so this is important now. 
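The annotation forces callers to either check the result or chain it into the rest of the transaction, as the converted call sites below do. A typical caller now looks like this sketch, where new_key is a stand-in name rather than anything from the patch:

        ret = bch2_trans_update(trans, &iter, &new_key->k_i, 0) ?:
              bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
        if (ret)
                goto err;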
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/btree_update_leaf.c | 4 ++-- fs/bcachefs/buckets.c | 16 ++++++++++++---- fs/bcachefs/dirent.c | 8 ++++++-- fs/bcachefs/subvolume.c | 21 +++++++++++++-------- fs/bcachefs/tests.c | 4 ++-- 6 files changed, 37 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4c1a1b617bf1..a61b64fc0859 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -73,8 +73,8 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, bool); -int bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 65facdd1536f..1079daef4e86 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1312,8 +1312,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } -int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_insert_entry *i, n; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 66f072905173..ef018c27d276 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1549,7 +1549,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_trans_update(trans, &iter, &a->k, 0); + if (ret) + goto out; out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1600,7 +1602,9 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - bch2_trans_update(trans, &iter, &s->k_i, 0); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; @@ -1738,7 +1742,9 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, u.data_type = !deleting ? 
data_type : 0; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_trans_update(trans, &iter, &a->k, 0); + if (ret) + goto err; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2017,7 +2023,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.dirty_sectors = sectors; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_trans_update(trans, &iter, &a->k, 0); + if (ret) + goto out; out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index fe4a85a6a8cb..a165d08c3668 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -367,7 +367,9 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + if (ret) + goto out; out_set_src: /* @@ -384,7 +386,9 @@ out_set_src: src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; } - bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (ret) + goto out; if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7e909a118189..8aeb2e417a15 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -488,7 +488,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n = bch2_trans_kmalloc(trans, sizeof(*n)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_snapshot_init(&n->k_i); n->k.p = iter.pos; @@ -498,11 +498,10 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.pad = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - bch2_trans_update(trans, &iter, &n->k_i, 0); - - ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: + bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) - break; + goto err; new_snapids[i] = iter.pos.offset; } @@ -536,7 +535,9 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[0] = cpu_to_le32(new_snapids[0]); n->v.children[1] = cpu_to_le32(new_snapids[1]); SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; } err: bch2_trans_iter_exit(trans, &iter); @@ -1049,7 +1050,9 @@ found_slot: if (src_subvolid) { src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); - bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + if (ret) + goto err; } new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); @@ -1064,7 +1067,9 @@ found_slot: SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); new_subvol->k.p = dst_iter.pos; - bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + if (ret) + goto err; *new_subvolid = new_subvol->k.p.offset; *new_snapshotid = new_nodes[0]; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 6023661ece16..145b85320d22 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -584,10 +584,10 @@ static int rand_mixed_trans(struct btree_trans *trans, if (!(i & 3) && k.k) { bkey_cookie_init(&cookie->k_i); cookie->k.p = iter->pos; - 
bch2_trans_update(trans, iter, &cookie->k_i, 0); + ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); } - return 0; + return ret; } static int rand_mixed(struct bch_fs *c, u64 nr) -- cgit From f54788cc8c79cad2ac8016d1c4a8a1373a4d7707 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Dec 2021 13:31:36 -0500 Subject: bcachefs: Convert a BUG_ON() to a warning A user reported hitting this assertion, and we can't reproduce it yet, but it shouldn't be fatal - so convert it to a warning. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d5320719dc95..2f144fa3298d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1249,7 +1249,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - BUG_ON(io->op.i_sectors_delta > 0); + WARN_ON(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up -- cgit From 990d42d1873c16b6080c887f6bb27e56c0f885cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Dec 2021 23:07:33 -0500 Subject: bcachefs: Split out struct gc_stripe from struct stripe We have two radix trees of stripes - one that mirrors some information from the stripes btree in normal operation, and another that GC uses to recalculate block usage counts. The normal one is now only used for finding partially empty stripes in order to reuse them - the normal stripes radix tree and the GC stripes radix tree are used significantly differently, so this patch splits them into separate types. In an upcoming patch we'll be replacing c->stripes with a btree that indexes stripes by the order we want to reuse them. 
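In rough terms (m, g and idx are stand-in names, not from the patch), lookups now split by caller rather than by an array index:

        /* normal operation: partially-empty-stripe reuse, stripes heap backpointers */
        struct stripe *m = genradix_ptr(&c->stripes, idx);

        /* gc: per-block sector counts recalculated from extent references */
        struct gc_stripe *g = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
        if (!g)
                return -ENOMEM;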
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/btree_gc.c | 110 ++++++++++++++++++++++++------------ fs/bcachefs/buckets.c | 116 +++++++++++++++++++------------------- fs/bcachefs/ec.c | 147 ++++++++++--------------------------------------- fs/bcachefs/ec.h | 3 +- fs/bcachefs/ec_types.h | 9 +++ fs/bcachefs/recovery.c | 3 +- 7 files changed, 176 insertions(+), 215 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4d3cfb64a656..5e9378843476 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -826,7 +826,8 @@ mempool_t bio_bounce_pages; struct mutex data_progress_lock; /* STRIPES: */ - GENRADIX(struct stripe) stripes[2]; + GENRADIX(struct stripe) stripes; + GENRADIX(struct gc_stripe) gc_stripes; ec_stripes_heap ec_stripes_heap; spinlock_t ec_stripes_heap_lock; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6cde4234f5e9..dc4562a1e122 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, c, "pointer to nonexistent stripe %llu\n" @@ -665,7 +665,7 @@ again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct stripe *m = genradix_ptr(&c->stripes[true], + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, entry->stripe_ptr.idx); union bch_extent_entry *next_ptr; @@ -1132,7 +1132,8 @@ static void bch2_gc_free(struct bch_fs *c) struct bch_dev *ca; unsigned i; - genradix_free(&c->stripes[1]); + genradix_free(&c->reflink_gc_table); + genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { kvpfree(rcu_dereference_protected(ca->buckets[1], 1), @@ -1191,35 +1192,6 @@ static int bch2_gc_done(struct bch_fs *c, #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { - struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); - struct stripe *dst, *src; - - while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { - dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); - - if (dst->alive != src->alive || - dst->sectors != src->sectors || - dst->algorithm != src->algorithm || - dst->nr_blocks != src->nr_blocks || - dst->nr_redundant != src->nr_redundant) { - bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); - ret = -EINVAL; - goto fsck_err; - } - - for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) - copy_stripe_field(block_sectors[i], - "block_sectors[%u]", i); - - dst->blocks_nonempty = 0; - for (i = 0; i < dst->nr_blocks; i++) - dst->blocks_nonempty += dst->block_sectors[i] != 0; - - genradix_iter_advance(&iter, &c->stripes[1]); - } - } - for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); @@ -1510,12 +1482,82 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, fsck_err: bch2_trans_iter_exit(&trans, &iter); out: - genradix_free(&c->reflink_gc_table); c->reflink_gc_nr = 0; bch2_trans_exit(&trans); return ret; } +static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct gc_stripe *m; + const struct bch_stripe *s; + char buf[200]; + unsigned i; + int ret = 0; + + if (k.k->type != KEY_TYPE_stripe) + return 0; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + return 0; +inconsistent: + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto fsck_err; + } + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); + + ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); + if (ret) + kfree(new); + } +fsck_err: + return ret; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + if (initial) { + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, + bch2_gc_stripes_done_initial_fn); + } else { + BUG(); + } + + bch2_trans_exit(&trans); + return ret; +} + static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, struct bkey_s_c k) { @@ -1551,7 +1593,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, return 0; bch2_trans_init(&trans, c, 0, 0); - genradix_free(&c->reflink_gc_table); c->reflink_gc_nr = 0; if (initial) { @@ -1685,6 +1726,7 @@ out: percpu_down_write(&c->mark_lock); ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ef018c27d276..bf564757aa28 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -953,39 +953,34 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bch_replicas_padded r; - struct stripe *m; - unsigned i, blocks_nonempty = 0; - m = genradix_ptr(&c->stripes[gc], p.idx); + if (!gc) { + BUG(); + } else { + struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - spin_lock(&c->ec_stripes_heap_lock); + if (!m) + return -ENOMEM; - if (!m || !m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } + spin_lock(&c->ec_stripes_heap_lock); - m->block_sectors[p.block] += sectors; + if (!m || !m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + bch2_inconsistent_error(c); + return -EIO; + } - r = m->r; + m->block_sectors[p.block] += sectors; - for (i = 0; i < m->nr_blocks; i++) - blocks_nonempty += m->block_sectors[i] != 0; + r = m->r; + spin_unlock(&c->ec_stripes_heap_lock); - if (m->blocks_nonempty != blocks_nonempty) { - m->blocks_nonempty = blocks_nonempty; - if (!gc) - bch2_stripes_heap_update(c, m, p.idx); + r.e.data_type = data_type; + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); } - spin_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); - return 0; } @@ -1081,67 +1076,69 @@ static int bch2_mark_stripe(struct btree_trans *trans, ? bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(new).v : NULL; - struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; int ret; BUG_ON(gc && old_s); - if (!m || (old_s && !m->alive)) { - char buf1[200], buf2[200]; + if (!gc) { + struct stripe *m = genradix_ptr(&c->stripes, idx); - bch2_bkey_val_to_text(&PBUF(buf1), c, old); - bch2_bkey_val_to_text(&PBUF(buf2), c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" - "old %s\n" - "new %s", idx, buf1, buf2); - bch2_inconsistent_error(c); - return -1; - } + if (!m || (old_s && !m->alive)) { + char buf1[200], buf2[200]; - if (!new_s) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - - memset(m, 0, sizeof(*m)); - } else { - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; + bch2_bkey_val_to_text(&PBUF(buf1), c, old); + bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" + "old %s\n" + "new %s", idx, buf1, buf2); + bch2_inconsistent_error(c); + return -1; + } - for (i = 0; i < new_s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(new_s, i); - m->blocks_nonempty += !!m->block_sectors[i]; + if (!new_s) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); - m->ptrs[i] = new_s->ptrs[i]; - } + memset(m, 0, sizeof(*m)); + } else { + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; - bch2_bkey_to_replicas(&m->r.e, new); + for (i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); spin_unlock(&c->ec_stripes_heap_lock); } - } + } else { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx); - if (gc) { /* * This will be wrong when we bring back runtime gc: we should * be unmarking the old key and then marking the new key */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); /* * gc recalculates this field from stripe ptr * references: */ memset(m->block_sectors, 0, sizeof(m->block_sectors)); - m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); @@ -1602,6 +1599,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); if (ret) goto err; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 689602d18589..2b6a68b4c4d6 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -545,11 +545,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) free_heap(&n); } - if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) + if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -ENOMEM; if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && - !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) + !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -ENOMEM; return 0; @@ 
-594,13 +594,13 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, { struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; + genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; } static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes[0], idx); + struct stripe *m = genradix_ptr(&c->stripes, idx); BUG_ON(!m->alive); BUG_ON(m->heap_idx >= h->used); @@ -692,7 +692,7 @@ static void ec_stripe_delete_work(struct work_struct *work) break; } - bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); spin_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) @@ -702,22 +702,18 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ -static int ec_stripe_bkey_insert(struct bch_fs *c, +static int ec_stripe_bkey_insert(struct btree_trans *trans, struct bkey_i_stripe *stripe, struct disk_reservation *res) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, + for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { @@ -738,29 +734,24 @@ retry: found_slot: start_pos = iter.pos; - ret = ec_stripe_mem_alloc(&trans, &iter); + ret = ec_stripe_mem_alloc(trans, &iter); if (ret) goto err; stripe->k.p = iter.pos; - ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: - bch2_trans_commit(&trans, res, NULL, - BTREE_INSERT_NOFAIL); -err: - bch2_trans_iter_exit(&trans, &iter); + ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); - if (ret == -EINTR) - goto retry; - - c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; - bch2_trans_exit(&trans); + c->ec_stripe_hint = start_pos.offset; +err: + bch2_trans_iter_exit(trans, &iter); return ret; } static int ec_stripe_bkey_update(struct btree_trans *trans, - struct bkey_i_stripe *new) + struct bkey_i_stripe *new, + struct disk_reservation *res) { struct btree_iter iter; struct bkey_s_c k; @@ -947,10 +938,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - ret = s->have_existing_stripe - ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_bkey_update(&trans, &s->new_stripe.key)) - : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); + ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + s->have_existing_stripe + ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) + : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -965,7 +956,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); + m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); @@ -1381,7 +1372,7 @@ static s64 get_existing_stripe(struct bch_fs *c, continue; stripe_idx = h->data[heap_idx].idx; - m = genradix_ptr(&c->stripes[0], stripe_idx); + m = genradix_ptr(&c->stripes, stripe_idx); if (m->algorithm == head->algo && m->nr_redundant == head->redundancy && @@ -1555,85 +1546,11 @@ void bch2_stripes_heap_start(struct bch_fs *c) struct genradix_iter iter; struct stripe *m; - genradix_for_each(&c->stripes[0], iter, m) + genradix_for_each(&c->stripes, iter, m) if (m->alive) bch2_stripes_heap_insert(c, m, iter.pos); } -static int __bch2_stripe_write_key(struct btree_trans *trans, - struct btree_iter *iter, - struct stripe *m, - size_t idx, - struct bkey_i_stripe *new_key) -{ - const struct bch_stripe *v; - struct bkey_s_c k; - unsigned i; - int ret; - - bch2_btree_iter_set_pos(iter, POS(0, idx)); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_stripe) - return -EIO; - - v = bkey_s_c_to_stripe(k).v; - for (i = 0; i < v->nr_blocks; i++) - if (m->block_sectors[i] != stripe_blockcount_get(v, i)) - goto write; - return 0; -write: - bkey_reassemble(&new_key->k_i, k); - - for (i = 0; i < new_key->v.nr_blocks; i++) - stripe_blockcount_set(&new_key->v, i, - m->block_sectors[i]); - - return bch2_trans_update(trans, iter, &new_key->k_i, 0); -} - -int bch2_stripes_write(struct bch_fs *c, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct genradix_iter giter; - struct bkey_i_stripe *new_key; - struct stripe *m; - int ret = 0; - - new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); - BUG_ON(!new_key); - - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - genradix_for_each(&c->stripes[0], giter, m) { - if (!m->alive) - continue; - - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags, - __bch2_stripe_write_key(&trans, &iter, m, - giter.pos, new_key)); - - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - kfree(new_key); - - return ret; -} - static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { const struct bch_stripe *s; @@ -1651,7 +1568,7 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->stripes[0], k.k->p.offset); + m = genradix_ptr(&c->stripes, k.k->p.offset); m->alive = true; m->sectors = le16_to_cpu(s->sectors); m->algorithm = s->algorithm; @@ -1659,14 +1576,8 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) m->nr_redundant = s->nr_redundant; m->blocks_nonempty = 0; - for (i = 0; i < s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(s, i); - m->blocks_nonempty += !!m->block_sectors[i]; - m->ptrs[i] = s->ptrs[i]; - } - - bch2_bkey_to_replicas(&m->r.e, k); + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, 
i); spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, k.k->p.offset); @@ -1722,7 +1633,9 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); #else for (i = 0; i < idx; i++) - if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) + if (!gc + ? !genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL) + : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL)) return -ENOMEM; #endif return 0; @@ -1736,7 +1649,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) spin_lock(&c->ec_stripes_heap_lock); for (i = 0; i < min_t(size_t, h->used, 20); i++) { - m = genradix_ptr(&c->stripes[0], h->data[i].idx); + m = genradix_ptr(&c->stripes, h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, h->data[i].blocks_nonempty, @@ -1794,7 +1707,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes[0]); + genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index eb16e140e2c8..468141072bb4 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -108,7 +108,7 @@ static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, le16_to_cpu(s->sectors)); } -static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, +static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, struct extent_ptr_decoded p) { unsigned nr_data = m->nr_blocks - m->nr_redundant; @@ -216,7 +216,6 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); void bch2_stripes_heap_start(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -int bch2_stripes_write(struct bch_fs *, unsigned); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 3fc31222459a..edd93da663c1 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -21,6 +21,15 @@ struct stripe { unsigned alive:1; /* does a corresponding key exist in stripes btree? */ unsigned on_heap:1; u8 blocks_nonempty; +}; + +struct gc_stripe { + u16 sectors; + + u8 nr_blocks; + u8 nr_redundant; + + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c3b4d116275c..460b1ba22c8e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1238,8 +1238,7 @@ use_clean: */ bch_verbose(c, "writing allocation info"); err = "error writing out alloc info"; - ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: - bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); + ret = bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error writing alloc info"); goto err; -- cgit From bf0fdb4d89bf16bbcd3a0a340a10ffde25b13d57 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Dec 2021 14:19:18 -0500 Subject: bcachefs: Don't erasure code cached ptrs It doesn't make much sense to be erasure coding cached pointers, we should be erasure coding one of the dirty pointers in an extent. 
This patch makes sure we're passing BCH_WRITE_CACHED when we expect the new pointer to be a cached pointer, and tweaks the write path to not allocate from a stripe when BCH_WRITE_CACHED is set - and fixes an assertion we were hitting in the ec path where when adding the stripe to an extent and deleting the other pointers the pointer to the stripe didn't exist (because dropping all dirty pointers from an extent turns it into a KEY_TYPE_error key). Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 21 ++++++++++++++------- fs/bcachefs/io.c | 2 +- fs/bcachefs/move.c | 12 ++++++++---- 3 files changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2b6a68b4c4d6..4424cb3ac822 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -143,8 +143,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, } /* returns blocknr in stripe that we matched: */ -static int bkey_matches_stripe(struct bch_stripe *s, - struct bkey_s_c k) +static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -153,10 +153,12 @@ static int bkey_matches_stripe(struct bch_stripe *s, bkey_for_each_ptr(ptrs, ptr) for (i = 0; i < nr_data; i++) if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, - le16_to_cpu(s->sectors))) - return i; + le16_to_cpu(s->sectors))) { + *block = i; + return ptr; + } - return -1; + return NULL; } static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) @@ -834,6 +836,7 @@ retry: (k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { @@ -841,8 +844,12 @@ retry: continue; } - block = bkey_matches_stripe(&s->key.v, k); - if (block < 0) { + ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? 
+ */ + if (!ptr_c || ptr_c->cached) { bch2_btree_iter_advance(&iter); continue; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 03bea2ddfb39..814984ec608c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1179,7 +1179,7 @@ again: */ wp = bch2_alloc_sectors_start(c, op->target, - op->opts.erasure_code, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), op->write_point, &op->devs_have, op->nr_replicas, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 482dfc29385e..8756df0414a8 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -394,10 +394,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == data_opts.rewrite_dev && - !p.ptr.cached && - crc_is_compressed(p.crc)) - compressed_sectors += p.crc.compressed_size; + if (p.ptr.dev == data_opts.rewrite_dev) { + if (p.ptr.cached) + m->op.flags |= BCH_WRITE_CACHED; + + if (!p.ptr.cached && + crc_is_compressed(p.crc)) + compressed_sectors += p.crc.compressed_size; + } if (compressed_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, -- cgit From 198141e51cc0dadc5ddf7c7fe5e975c3bb6f546b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Dec 2021 15:21:26 -0500 Subject: bcachefs: Fix null ptr deref in fsck_inode_rm() bch2_btree_delete_range() can split compressed extents, thus needs to pass in a disk reservation when we're operating on extents btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1079daef4e86..96fc2cd13f21 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1452,6 +1452,8 @@ retry: (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k)) && bkey_cmp(iter.pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; bkey_init(&delete.k); @@ -1486,8 +1488,9 @@ retry: } ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, journal_seq, + bch2_trans_commit(trans, &disk_res, journal_seq, BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); if (ret) break; } -- cgit From f44906775981e368b77474f0c0750e9d1a4f229b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Dec 2021 14:03:42 -0500 Subject: bcachefs: Print out OPT_SECTORS options in bytes This matches the conversion the parsing code does. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index a955ef2008c9..e81e07a383bb 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -291,7 +291,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "%lli", v); break; case BCH_OPT_SECTORS: - bch2_hprint(out, v); + bch2_hprint(out, v << 9); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) -- cgit From 991ba0211290884df42f9506499aba7e933a2bb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Dec 2021 15:41:38 -0500 Subject: bcachefs: Add more time_stats This adds more latency/event measurements and breaks some apart into more events. 
Journal writes are broken apart into flush writes and noflush writes, btree compactions are broken out from btree splits, btree mergers are added, as well as btree_interior_updates - foreground and total. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 8 ++++++-- fs/bcachefs/btree_gc.c | 3 +++ fs/bcachefs/btree_update_interior.c | 25 +++++++++++++++++++++++-- fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/fs-common.c | 1 + fs/bcachefs/journal.c | 6 ++++++ fs/bcachefs/journal_io.c | 4 +++- fs/bcachefs/journal_types.h | 4 ++-- fs/bcachefs/opts.h | 6 +++--- fs/bcachefs/super.c | 8 ++++---- 10 files changed, 52 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5e9378843476..dde919a95585 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -321,8 +321,12 @@ BCH_DEBUG_PARAMS_DEBUG() #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ + x(btree_node_compact) \ + x(btree_node_merge) \ x(btree_node_sort) \ x(btree_node_read) \ + x(btree_interior_update_foreground) \ + x(btree_interior_update_total) \ x(btree_gc) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ @@ -330,8 +334,8 @@ BCH_DEBUG_PARAMS_DEBUG() x(data_write) \ x(data_read) \ x(data_promote) \ - x(journal_write) \ - x(journal_delay) \ + x(journal_flush_write) \ + x(journal_noflush_write) \ x(journal_flush_seq) \ x(blocked_journal) \ x(blocked_allocate) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dc4562a1e122..ccb85850080b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1849,6 +1849,7 @@ int bch2_gc_gens(struct bch_fs *c) struct bch_dev *ca; struct bucket_array *buckets; struct bucket *g; + u64 start_time = local_clock(); unsigned i; int ret; @@ -1892,6 +1893,8 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_gens_pos = POS_MIN; c->gc_count++; + + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); err: up_read(&c->gc_lock); return ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a28c7cf381ce..8865ab7d087b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -455,15 +455,23 @@ static void bch2_btree_update_free(struct btree_update *as) bch2_disk_reservation_put(c, &as->disk_res); bch2_btree_reserve_put(as); + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + as->start_time); + mutex_lock(&c->btree_interior_update_lock); list_del(&as->unwritten_list); list_del(&as->list); - mutex_unlock(&c->btree_interior_update_lock); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); + /* + * Have to do the wakeup with btree_interior_update_lock still held, + * since being on btree_interior_update_list is our ref on @c: + */ closure_wake_up(&c->btree_interior_update_wait); + + mutex_unlock(&c->btree_interior_update_lock); } static void btree_update_will_delete_key(struct btree_update *as, @@ -902,6 +910,9 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, static void bch2_btree_update_done(struct btree_update *as) { + struct bch_fs *c = as->c; + u64 start_time = as->start_time; + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); if (as->took_gc_lock) @@ -912,6 +923,9 @@ static void bch2_btree_update_done(struct btree_update *as) continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); + + 
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + start_time); } static struct btree_update * @@ -921,6 +935,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct bch_fs *c = trans->c; struct btree_update *as; struct closure cl; + u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; int journal_flags = 0; @@ -960,6 +975,7 @@ retry: memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; + as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); as->btree_id = path->btree_id; @@ -1452,7 +1468,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_trans_verify_locks(trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + bch2_time_stats_update(&c->times[n2 + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], start_time); } @@ -1573,6 +1591,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; + u64 start_time = local_clock(); int ret = 0; BUG_ON(!path->should_be_locked); @@ -1710,6 +1729,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: bch2_path_put(trans, sib_path, true); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 8e03bd987d6d..d4574161a733 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -35,6 +35,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, struct btree_update { struct closure cl; struct bch_fs *c; + u64 start_time; struct list_head list; struct list_head unwritten_list; diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 5f3429e99115..d543480be111 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -329,6 +329,7 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, bool ret = false; for (id = 0; id < Inode_opt_nr; id++) { + /* Skip attributes that were explicitly set on this inode */ if (dst_u->bi_fields_set & (1 << id)) continue; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3ce6a78263ba..f15d265ef1b6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -625,6 +625,12 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; + /* + * Don't update time_stats when @seq is already flushed: + */ + if (seq <= j->flushed_seq_ondisk) + return 0; + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); if (!ret) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 680ddba1889d..1a8c0a7eaca7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1237,7 +1237,9 @@ static void journal_write_done(struct closure *cl) u64 v, seq; int err = 0; - bch2_time_stats_update(j->write_time, j->write_start_time); + bch2_time_stats_update(!JSET_NO_FLUSH(w->data) + ? 
j->flush_write_time + : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 9f59e4889f52..0c4df603280d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -271,8 +271,8 @@ struct journal { u64 nr_flush_writes; u64 nr_noflush_writes; - struct bch2_time_stats *write_time; - struct bch2_time_stats *delay_time; + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *blocked_time; struct bch2_time_stats *flush_seq_time; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index aad57a82e5fd..bb2ecc778a8c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -81,9 +81,9 @@ enum opt_type { */ #ifdef __KERNEL__ -#define RATELIMIT_ERRORS true +#define RATELIMIT_ERRORS_DEFAULT true #else -#define RATELIMIT_ERRORS false +#define RATELIMIT_ERRORS_DEFAULT false #endif #define BCH_OPTS() \ @@ -288,7 +288,7 @@ enum opt_type { x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, RATELIMIT_ERRORS, \ + NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_FS|OPT_MOUNT, \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 22dbbf77d687..afa1a8fa493b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -730,10 +730,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->rebalance.enabled = 1; c->promote_whole_extents = true; - c->journal.write_time = &c->times[BCH_TIME_journal_write]; - c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); -- cgit From fb0e480872ac858d836d5d6d713d0f31ae08c64d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Dec 2021 20:58:44 -0500 Subject: bcachefs: bch2_alloc_write() This adds a new helper that much like the one we have for inode updates, that allocates the packed alloc key, packs it and calls bch2_trans_update. 
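As a rough sketch of the caller-side shape this enables (modeled on the bch2_bucket_io_time_reset() hunk below, with error handling trimmed - not a standalone example): callers work only with the unpacked form, and the helper now does the allocation, packing and bch2_trans_update() call.

	struct bkey_alloc_unpacked u;
	int ret;

	/* read the current in-memory bucket state into the unpacked form: */
	u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark));
	u.read_time = atomic64_read(&c->io_clock[READ].now);

	/* packing and queueing the btree update happen inside the helper: */
	ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
	      bch2_trans_commit(trans, NULL, NULL, 0);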
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 54 +++++++++++++++++++++++------------------- fs/bcachefs/alloc_background.h | 15 +++--------- fs/bcachefs/buckets.c | 43 ++++++++++++--------------------- fs/bcachefs/recovery.c | 2 +- 4 files changed, 49 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index dc1e09b138b6..4953cbee2655 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -38,6 +38,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -244,13 +253,26 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +static void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) { bch2_alloc_pack_v3(dst, src); } +int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_alloc_unpacked *u, unsigned trigger_flags) +{ + struct bkey_alloc_buf *a; + + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return PTR_ERR(a); + + bch2_alloc_pack(trans->c, a, *u); + return bch2_trans_update(trans, iter, &a->k, trigger_flags); +} + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -375,7 +397,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - struct bkey_alloc_buf a; int ret; retry: bch2_trans_begin(trans); @@ -402,8 +423,7 @@ retry: if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; - bch2_alloc_pack(c, &a, new_u); - ret = bch2_trans_update(trans, iter, &a.k, + ret = bch2_alloc_write(trans, iter, &new_u, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -413,7 +433,7 @@ err: return ret; } -int bch2_alloc_write(struct bch_fs *c, unsigned flags) +int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter iter; @@ -453,7 +473,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter iter; struct bucket *g; - struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; @@ -466,11 +485,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) goto out; - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); @@ -483,8 +497,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, *time = now; - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: + ret = bch2_alloc_write(trans, &iter, &u, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_exit(trans, &iter); @@ -752,7 +765,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, struct bch_dev *ca, u64 b) { struct bch_fs *c = trans->c; - struct 
bkey_alloc_buf *a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; @@ -765,11 +777,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - a = bch2_trans_kmalloc(trans, sizeof(*a)); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; - ret = bch2_btree_iter_traverse(&iter); if (ret) goto err; @@ -787,9 +794,8 @@ static int bucket_invalidate_btree(struct btree_trans *trans, u.read_time = atomic64_read(&c->io_clock[READ].now); u.write_time = atomic64_read(&c->io_clock[WRITE].now); - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, &iter, &a->k, - BTREE_TRIGGER_BUCKET_INVALIDATE); + ret = bch2_alloc_write(trans, &iter, &u, + BTREE_TRIGGER_BUCKET_INVALIDATE); err: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b1efc1494dc4..6698d9c75d07 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -20,15 +20,6 @@ struct bkey_alloc_unpacked { #undef x }; -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -46,8 +37,8 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, } struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, - const struct bkey_alloc_unpacked); +int bch2_alloc_write(struct btree_trans *, struct btree_iter *, + struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); @@ -137,7 +128,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, unsigned); +int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bf564757aa28..6bbf088cd095 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1485,8 +1485,7 @@ need_mark: /* trans_mark: */ -static struct bkey_alloc_buf * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, +static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, const struct bch_extent_ptr *ptr, struct bkey_alloc_unpacked *u) { @@ -1494,14 +1493,9 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); struct bucket *g; - struct bkey_alloc_buf *a; struct bkey_i *update; int ret; - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return a; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| @@ -1509,7 +1503,7 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); + return ret; } update = __bch2_btree_trans_peek_updates(iter); @@ -1522,22 +1516,20 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter 
percpu_up_read(&c->mark_lock); } - return a; + return 0; } static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_alloc_unpacked u; - struct bkey_alloc_buf *a; int ret; - a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); + ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (ret) + return ret; ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, @@ -1545,8 +1537,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret) goto out; - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_alloc_write(trans, &iter, &u, 0); if (ret) goto out; out: @@ -1676,7 +1667,6 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct bkey_alloc_buf *a; struct btree_iter iter; struct bkey_alloc_unpacked u; enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant @@ -1687,9 +1677,9 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (deleting) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); + ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (ret) + return ret; ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, u.gen, u.data_type, @@ -1739,8 +1729,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (data_type) u.data_type = !deleting ? data_type : 0; - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_alloc_write(trans, &iter, &u, 0); if (ret) goto err; err: @@ -1988,7 +1977,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_alloc_unpacked u; - struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { .dev = ca->dev_idx, .offset = bucket_to_sector(ca, b), @@ -2001,9 +1989,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (b >= ca->mi.nbuckets) return 0; - a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); + ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (ret) + return ret; if (u.data_type && u.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -2020,8 +2008,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.data_type = type; u.dirty_sectors = sectors; - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, &iter, &a->k, 0); + ret = bch2_alloc_write(trans, &iter, &u, 0); if (ret) goto out; out: diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 460b1ba22c8e..29fe6260ace5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1238,7 +1238,7 @@ use_clean: */ bch_verbose(c, "writing allocation info"); err = "error writing out alloc info"; - ret = bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); + ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error writing alloc info"); goto err; -- cgit From 20572300dcc537c22b435a1f01b810a9d7c140c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Dec 2021 21:24:36 -0500 Subject: bcachefs: Improve alloc_mem_to_key() This moves some common code into alloc_mem_to_key(), which 
translates from the in-memory format for a bucket to the btree key format. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 ++++------------------------ fs/bcachefs/alloc_background.h | 25 ++++++++++++++++++------- fs/bcachefs/buckets.c | 12 +++--------- 3 files changed, 25 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4953cbee2655..39538dada301 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -393,9 +393,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_s_c k; - struct bch_dev *ca; - struct bucket *g; - struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; int ret; retry: @@ -411,14 +408,8 @@ retry: if (ret) goto err; - old_u = bch2_alloc_unpack(k); - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(iter, g, m); - percpu_up_read(&c->mark_lock); + old_u = bch2_alloc_unpack(k); + new_u = alloc_mem_to_key(c, iter); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; @@ -470,9 +461,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter iter; - struct bucket *g; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; @@ -485,10 +474,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) goto out; - percpu_down_read(&c->mark_lock); - g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); - percpu_up_read(&c->mark_lock); + u = alloc_mem_to_key(c, &iter); time = rw == READ ? 
&u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); @@ -766,8 +752,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_alloc_unpacked u; - struct bucket *g; - struct bucket_mark m; struct btree_iter iter; int ret; @@ -781,11 +765,7 @@ static int bucket_invalidate_btree(struct btree_trans *trans, if (ret) goto err; - percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - u = alloc_mem_to_key(&iter, g, m); - percpu_up_read(&c->mark_lock); + u = alloc_mem_to_key(c, &iter); u.gen++; u.data_type = 0; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 6698d9c75d07..e3cdb8bc1dd8 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -4,7 +4,9 @@ #include "bcachefs.h" #include "alloc_types.h" +#include "buckets.h" #include "debug.h" +#include "super.h" extern const char * const bch2_allocator_states[]; @@ -43,22 +45,31 @@ int bch2_alloc_write(struct btree_trans *, struct btree_iter *, int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct btree_iter *iter, - struct bucket *g, struct bucket_mark m) +alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) { - return (struct bkey_alloc_unpacked) { + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked ret; + + percpu_down_read(&c->mark_lock); + ca = bch_dev_bkey_exists(c, iter->pos.inode); + g = bucket(ca, iter->pos.offset); + ret = (struct bkey_alloc_unpacked) { .dev = iter->pos.inode, .bucket = iter->pos.offset, - .gen = m.gen, + .gen = g->mark.gen, .oldest_gen = g->oldest_gen, - .data_type = m.data_type, - .dirty_sectors = m.dirty_sectors, - .cached_sectors = m.cached_sectors, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, .read_time = g->io_time[READ], .write_time = g->io_time[WRITE], .stripe = g->stripe, .stripe_redundancy = g->stripe_redundancy, }; + percpu_up_read(&c->mark_lock); + + return ret; } #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6bbf088cd095..4fef482ad60e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1492,7 +1492,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); - struct bucket *g; struct bkey_i *update; int ret; @@ -1507,14 +1506,9 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree } update = __bch2_btree_trans_peek_updates(iter); - if (update && !bpos_cmp(update->k.p, pos)) { - *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); - } else { - percpu_down_read(&c->mark_lock); - g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); - percpu_up_read(&c->mark_lock); - } + *u = update && !bpos_cmp(update->k.p, pos) + ? bch2_alloc_unpack(bkey_i_to_s_c(update)) + : alloc_mem_to_key(c, iter); return 0; } -- cgit From dbd8b46b90852c0dbaffc48fc1d8b3869b078cf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Dec 2021 00:08:06 -0500 Subject: bcachefs: Add missing bch2_trans_iter_exit() call This fixes a bug where the filesystem goes read only when reading from debugfs. 
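For context, the expected pairing (a sketch following the convention visible in the other hunks in this series) is that every iterator is torn down before the transaction:

	bch2_trans_init(&trans, c, 0, 0);
	bch2_trans_iter_init(&trans, &iter, btree_id, pos, flags);

	/* ... walk keys ... */

	bch2_trans_iter_exit(&trans, &iter);	/* this call was missing */
	bch2_trans_exit(&trans);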
Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8b25ef9e1e05..5ea29528ab67 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -406,6 +406,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) break; } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; -- cgit From 2a863c6c80e3c14eb3920c0d8474ba112c82197a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Dec 2021 16:05:47 -0500 Subject: bcachefs: Fix debug build in userspace This fixes some compiler warnings that only trigger in userspace - dead code, a maybe uninitialed variable, a maybe null ptr passed to printk. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 3 ++- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/inode.c | 10 ---------- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f01f78952942..5a678e6e993a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -560,7 +560,8 @@ enum btree_validate_ret { \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf2); \ + if (_buf2) \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a165d08c3668..6f699b736b34 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -230,7 +230,7 @@ int bch2_dirent_rename(struct btree_trans *trans, { struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; - struct bkey_s_c old_src, old_dst; + struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fb5ed3a07ad7..d9ccc7c063ac 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -24,16 +24,6 @@ const char * const bch2_inode_opts[] = { }; static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -static const u8 bits_table[8] = { - 1 * 8 - 1, - 2 * 8 - 2, - 3 * 8 - 3, - 4 * 8 - 4, - 6 * 8 - 5, - 8 * 8 - 6, - 10 * 8 - 7, - 13 * 8 - 8, -}; static int inode_decode_field(const u8 *in, const u8 *end, u64 out[2], unsigned *out_bits) -- cgit From 51c4e406aa5706cdb224ff16eef1d560c504c3ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Dec 2021 20:35:45 -0500 Subject: bcachefs: Fix an assertion in bch2_truncate() We recently added an assertion that when we truncate a file to 0, i_blocks should also go to 0 - but that's not necessarily true if we're doing an emergency shutdown, lots of invariants no longer hold true in that case. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2f144fa3298d..28bbbac5cd67 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2699,7 +2699,8 @@ int bch2_truncate(struct mnt_idmap *idmap, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); - BUG_ON(!inode->v.i_size && inode->v.i_blocks); + WARN_ON(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal)); if (unlikely(ret)) goto err; -- cgit From b84d42c31f34094c74d6306e11e3a0bc224c7575 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Dec 2021 20:36:26 -0500 Subject: bcachefs: Split out CONFIG_BCACHEFS_DEBUG_TRANSACTIONS This puts the btree_transactions sysfs/debugfs file behind a separate config option - it's highly useful, but not cheap enough to enable permenantly. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 9 +++++++++ fs/bcachefs/btree_iter.c | 31 ++++++++++++++----------------- fs/bcachefs/btree_types.h | 2 -- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index bfe7e6c9c064..d2eb65e9032b 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -35,6 +35,15 @@ config BCACHEFS_POSIX_ACL depends on BCACHEFS_FS select FS_POSIX_ACL +config BCACHEFS_DEBUG_TRANSACTIONS + bool "bcachefs runtime info" + depends on BCACHEFS_FS + default y + help + This makes the list of running btree transactions available in debugfs. + + This is a highly useful debugging feature but does add a small amount of overhead. + config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1ad81cad36f1..cc1dd788cdd5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -364,19 +364,16 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, if (six_trylock_type(&b->c.lock, type)) return true; -#ifdef CONFIG_BCACHEFS_DEBUG trans->locking_path_idx = path->idx; trans->locking_pos = pos; trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking = b; -#endif ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; -#ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; -#endif + if (ret) bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], start_time); @@ -2822,12 +2819,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->pid = current->pid; - mutex_lock(&c->btree_trans_lock); - list_add(&trans->list, &c->btree_trans_list); - mutex_unlock(&c->btree_trans_lock); -#endif + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); + } } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -2866,11 +2863,11 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); -#ifdef CONFIG_BCACHEFS_DEBUG - mutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - mutex_unlock(&c->btree_trans_lock); -#endif + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&c->btree_trans_lock); + } srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); @@ -2914,7 +2911,7 @@ 
bch2_btree_path_node_to_text(struct printbuf *out, bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static bool trans_has_locks(struct btree_trans *trans) { struct btree_path *path; @@ -2928,7 +2925,7 @@ static bool trans_has_locks(struct btree_trans *trans) void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS struct btree_trans *trans; struct btree_path *path; struct btree *b; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 25b0df22366b..e1b417df4b73 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -363,7 +363,6 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; -#ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; unsigned locking_path_idx; @@ -371,7 +370,6 @@ struct btree_trans { u8 locking_btree_id; u8 locking_level; pid_t pid; -#endif unsigned long ip; int srcu_idx; -- cgit From 62d5bd955fd81320d1e03fdebb4342ee14df1d1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Dec 2021 18:59:22 -0500 Subject: bcachefs: Kill bch2_sort_repack_merge() The main function of bch2_sort_repack_merge() was to call .key_normalize on every key, which drops stale (cached) pointers - it hasn't actually merged extents in quite some time. But bch2_gc_gens() now works on individual keys - we used to gc old gens by rewriting entire btree nodes. With that gone, there's no need for internal btree code to be calling .key_normalize anymore. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 58 ------------------------------------------------- fs/bcachefs/bkey_sort.h | 5 ----- fs/bcachefs/btree_io.c | 14 ++++-------- 3 files changed, 4 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 537ab7919e88..da0b7a63b146 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -static void extent_sort_append(struct bch_fs *c, - struct bkey_format *f, - struct btree_nr_keys *nr, - struct bkey_packed **out, - struct bkey_s k) -{ - if (!bkey_deleted(k.k)) { - if (!bch2_bkey_pack_key(*out, k.k, f)) - memcpy_u64s_small(*out, k.k, BKEY_U64s); - - memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); - - btree_keys_account_key_add(nr, 0, *out); - *out = bkey_next(*out); - } -} - /* Sort + repack in a new format: */ struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, @@ -165,47 +148,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *c, - struct bset *dst, struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_buf k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - bch2_bkey_buf_init(&k); - - while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_deleted(k_packed)) - continue; - - /* - * NOTE: - * bch2_bkey_normalize may modify the key we pass it (dropping - * stale pointers) and we don't have a write lock on the src - * node; we have to make a copy of the entire key before calling - * normalize - */ - 
bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); - bch2_bkey_unpack(src, k.k, k_packed); - - if (filter_whiteouts && - bch2_bkey_normalize(c, bkey_i_to_s(k.k))) - continue; - - extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bch2_bkey_buf_exit(&k, c); - return nr; -} - static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 1059996dac78..79cf11d1b4e7 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -37,11 +37,6 @@ struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *, - struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5a678e6e993a..45f7ec41a8f1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -385,16 +385,10 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_is_extents(src)) - nr = bch2_sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - else - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); -- cgit From 7a0e4afb1a1116a3580144c7c902e6024333f20e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Dec 2021 19:01:41 -0500 Subject: bcachefs: Don't call bch2_bkey_transform() unnecessarily If the packed format isn't changing, there's no need to call bch2_bkey_transform(). Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index da0b7a63b146..b1385a77da11 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -127,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, struct bkey_format *in_f = &src->format; struct bkey_packed *in, *out = vstruct_last(dst); struct btree_nr_keys nr; + bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); memset(&nr, 0, sizeof(nr)); @@ -134,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src, if (filter_whiteouts && bkey_deleted(in)) continue; - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? in_f : &bch2_bkey_format_current, in)) + if (!transform) + bkey_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? in_f : &bch2_bkey_format_current, in)) out->format = KEY_FORMAT_LOCAL_BTREE; else bch2_bkey_unpack(src, (void *) out, in); -- cgit From 6df893fb1115083765a877302cdc25866ce5a87e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Dec 2021 14:24:04 -0500 Subject: bcachefs: Kill some obsolete sysfs code fs internal/alloc_debug doesn't show anything bcachefs fs usage shows. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index d5d32bf16d68..3f51eda749f0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -262,21 +262,6 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) return ret; } -static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); - - if (!fs_usage) - return -ENOMEM; - - bch2_fs_usage_to_text(out, c, fs_usage); - - percpu_up_read(&c->mark_lock); - - kfree(fs_usage); - return 0; -} - static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; @@ -386,9 +371,6 @@ SHOW(bch2_fs) /* Debugging: */ - if (attr == &sysfs_alloc_debug) - return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) { bch2_journal_debug_to_text(&out, &c->journal); return out.pos - buf; @@ -580,7 +562,6 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { - &sysfs_alloc_debug, &sysfs_journal_debug, &sysfs_journal_pins, &sysfs_btree_updates, @@ -588,17 +569,21 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, + &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + + &sysfs_trigger_journal_flush, + &sysfs_trigger_gc, + &sysfs_prune_cache, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, - &sysfs_trigger_journal_flush, - &sysfs_trigger_gc, &sysfs_gc_gens_pos, - &sysfs_prune_cache, &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, @@ -607,11 +592,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_new_stripes, - - &sysfs_io_timers_read, - &sysfs_io_timers_write, - &sysfs_data_op_data_progress, &sysfs_internal_uuid, -- cgit From 6be1b6d9df9dfe7065220b64b32de339b1120f1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Dec 2021 12:53:06 -0500 Subject: bcachefs: Make sure bch2_bucket_alloc_new_fs() obeys buckets_nouse This fixes the filesystem migrate tool. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e3fe4d7bbe21..646d556a5c24 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -152,6 +152,7 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) for (b = buckets->first_bucket; b < buckets->nbuckets; b++) if (is_available_bucket(buckets->b[b].mark) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)) && !buckets->b[b].mark.owned_by_allocator) goto success; b = -1; -- cgit From 1aeed4549de41cabf8e7f52c62646bac7b20a385 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Dec 2021 19:02:50 -0500 Subject: bcachefs: Optimize memory accesses in bch2_btree_node_get() This puts a load behind some branches before where it's used, so that it can execute in parallel with other loads. 
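In other words (a paraphrase of the hunk below, not new code): the pointer load is hoisted above the branches that consume it, so the dereference can be in flight while the option and hash checks are evaluated:

	b = btree_node_mem_ptr(k);			/* load issued early */

	if (likely(c->opts.btree_node_mem_ptr_optimization &&
		   b &&
		   b->hash_val == btree_ptr_hash_val(k)))
		goto lock_node;				/* checks overlap with the load */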
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4e855ae51731..5bf493a315ca 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -775,16 +775,17 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * EBUG_ON(level >= BTREE_MAX_DEPTH); - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - /* - * Check b->hash_val _before_ calling btree_node_lock() - this - * might not be the node we want anymore, and trying to lock the - * wrong node could cause an unneccessary transaction restart: - */ - if (b && b->hash_val == btree_ptr_hash_val(k)) + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (likely(c->opts.btree_node_mem_ptr_optimization && + b && + b->hash_val == btree_ptr_hash_val(k))) goto lock_node; - } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { -- cgit From 99fafb0425ea9c68b45699053d6124a3e32d844d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Dec 2021 18:18:35 -0500 Subject: bcachefs: Fix some shutdown path bugs This fixes some bugs when we hit an error very early in the filesystem startup path, before most things have been initialized. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_iter.c | 13 +++++++++---- fs/bcachefs/btree_key_cache.c | 11 ++++++----- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dde919a95585..1ad5eafb2f76 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -705,6 +705,7 @@ struct bch_fs { struct btree_path_buf __percpu *btree_paths_bufs; struct srcu_struct btree_trans_barrier; + bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cc1dd788cdd5..a9db8d05dc93 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2984,22 +2984,27 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_paths_pool); - cleanup_srcu_struct(&c->btree_trans_barrier); } int bch2_fs_btree_iter_init(struct bch_fs *c) { unsigned nr = BTREE_ITER_MAX; + int ret; INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return init_srcu_struct(&c->btree_trans_barrier) ?: - mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, sizeof(struct btree_path) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, - BTREE_TRANS_MEM_MAX); + BTREE_TRANS_MEM_MAX) ?: + init_srcu_struct(&c->btree_trans_barrier); + if (!ret) + c->btree_trans_barrier_initialized = true; + return ret; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 50b44e55dfe7..d045b3a5deed 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -663,11 +663,12 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_lock(); tbl = 
rht_dereference_rcu(bc->table.tbl, &bc->table); - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); - } + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } rcu_read_unlock(); list_for_each_entry_safe(ck, n, &bc->freed, list) { -- cgit From f3e1f4443383f72975f12caece6f13e63f21a719 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Dec 2021 20:48:26 -0500 Subject: bcachefs: BTREE_ITER_NOPRESERVE This adds a flag to not mark the initial btree_path as preserve, for paths that we expect to be cheap to reconstitute if necessary - this solves a btree_path overflow caused by need_whiteout_for_snapshot(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 20 +++++++++----------- fs/bcachefs/btree_iter.h | 4 ++-- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 3 ++- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a9db8d05dc93..777197ec2656 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1807,12 +1807,14 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, return path; } -struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, +struct btree_path *bch2_path_get(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned locks_want, unsigned level, - bool intent) + unsigned flags) { struct btree_path *path, *path_pos = NULL; + bool cached = flags & BTREE_ITER_CACHED; + bool intent = flags & BTREE_ITER_INTENT; int i; BUG_ON(trans->restarted); @@ -1836,7 +1838,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, path_pos->level == level) { __btree_path_get(path_pos, intent); path = btree_path_set_pos(trans, path_pos, pos, intent); - path->preserve = true; } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -1845,7 +1846,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, path->pos = pos; path->btree_id = btree_id; path->cached = cached; - path->preserve = true; path->uptodate = BTREE_ITER_NEED_TRAVERSE; path->should_be_locked = false; path->level = level; @@ -1860,6 +1860,9 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, trans->paths_sorted = false; } + if (!(flags & BTREE_ITER_NOPRESERVE)) + path->preserve = true; + if (path->intent_ref) locks_want = max(locks_want, level + 1); @@ -2642,13 +2645,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->k.p = pos; iter->k.size = 0; - iter->path = bch2_path_get(trans, - flags & BTREE_ITER_CACHED, - btree_id, - iter->pos, - locks_want, - depth, - flags & BTREE_ITER_INTENT); + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags); } void bch2_trans_iter_init(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index af1922c448ed..457a7601b0ce 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -159,8 +159,8 @@ bch2_btree_path_make_mut(struct btree_trans *trans, int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, - struct bpos, unsigned, unsigned, bool); +struct btree_path 
*bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, + unsigned, unsigned, unsigned); inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e1b417df4b73..223af7848fb4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -210,6 +210,7 @@ struct btree_node_iter { #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) +#define BTREE_ITER_NOPRESERVE (1 << 14) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8865ab7d087b..3e6dd2ed1c03 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1609,8 +1609,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, false, path->btree_id, - sib_pos, U8_MAX, level, true); + sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + U8_MAX, level, BTREE_ITER_INTENT); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 96fc2cd13f21..50c9caa729ff 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1297,7 +1297,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { if (bkey_cmp(k.k->p, pos)) break; -- cgit From d05117e36a7290cbfa8ebcc05c6facb60a5bcefb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Dec 2021 21:57:10 -0500 Subject: bcachefs: Fix debugfs -bfloat-failed It wasn't updated for snapshots - it's iterating across keys in all snapshots, so needs to be specifying BTREE_ITER_ALL_SNAPSHOTS. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5ea29528ab67..02a5ef5ecb3e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -373,7 +373,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(&iter)).k && !(err = bkey_err(k))) { -- cgit From 8244f3209b5b49a6bde9921d7825af9f57161b23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Dec 2021 14:24:41 -0500 Subject: bcachefs: Option improvements This adds flags for options that must be a power of two (block size and btree node size), and options that are stored in the superblock as a power of two (encoded extent max). Also: options are now stored in memory in the same units they're displayed in (bytes): we now convert when getting and setting from the superblock. 
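A rough illustration of the two conversions described above (a hypothetical helper-level sketch; the real flag names and accessors are in the opts.c/opts.h hunks below, and the variable names here are made up for illustration):

	/* options flagged as must-be-power-of-two reject other values on set: */
	if (!is_power_of_2(v))
		return -EINVAL;

	/*
	 * encoded extent max is stored in the superblock as a power of two,
	 * but held in memory - and shown to the user - in bytes:
	 */
	mem_bytes = 1ULL << sb_field;		/* superblock -> memory */
	sb_field  = ilog2(mem_bytes);		/* memory -> superblock */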
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 18 +++- fs/bcachefs/btree_cache.h | 4 +- fs/bcachefs/btree_io.c | 18 ++-- fs/bcachefs/btree_update_interior.c | 8 +- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/buckets.c | 6 +- fs/bcachefs/compress.c | 2 +- fs/bcachefs/extents.c | 2 +- fs/bcachefs/fs.c | 6 +- fs/bcachefs/io.c | 4 +- fs/bcachefs/journal_io.c | 4 +- fs/bcachefs/opts.c | 171 +++++++++++++++++++++++++----------- fs/bcachefs/opts.h | 40 +++++---- fs/bcachefs/super-io.c | 17 ++-- fs/bcachefs/super.c | 11 ++- fs/bcachefs/sysfs.c | 12 +-- fs/bcachefs/xattr.c | 2 +- 17 files changed, 205 insertions(+), 122 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1ad5eafb2f76..95b590d9ee7f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -928,10 +928,20 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca) static inline unsigned block_bytes(const struct bch_fs *c) { - return c->opts.block_size << 9; + return c->opts.block_size; } -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) +static inline unsigned block_sectors(const struct bch_fs *c) +{ + return c->opts.block_size >> 9; +} + +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> 9; +} + +static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; s32 rem; @@ -943,13 +953,13 @@ static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time return t; } -static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) { return (ts.tv_sec * c->sb.time_units_per_sec + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; } -static inline s64 bch2_current_time(struct bch_fs *c) +static inline s64 bch2_current_time(const struct bch_fs *c) { struct timespec64 now; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 2f6e0ea87616..a08d12569075 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -69,7 +69,7 @@ static inline bool btree_node_hashed(struct btree *b) static inline size_t btree_bytes(struct bch_fs *c) { - return c->opts.btree_node_size << 9; + return c->opts.btree_node_size; } static inline size_t btree_max_u64s(struct bch_fs *c) @@ -84,7 +84,7 @@ static inline size_t btree_pages(struct bch_fs *c) static inline unsigned btree_blocks(struct bch_fs *c) { - return c->opts.btree_node_size >> c->block_bits; + return btree_sectors(c) >> c->block_bits; } #define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 45f7ec41a8f1..287c45253a33 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -682,7 +682,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FATAL, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > c->opts.btree_node_size, + if (btree_err_on(offset + sectors > btree_sectors(c), BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; @@ -896,7 +896,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->data->keys.seq, bp->seq); } - while (b->written < (ptr_written ?: c->opts.btree_node_size)) { + while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; struct bch_csum csum; @@ 
-1204,7 +1204,7 @@ static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) if (le64_to_cpu(bn->magic) != bset_magic(c)) return 0; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { offset += vstruct_sectors(bn, c->block_bits); } else { @@ -1226,7 +1226,7 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * if (!offset) return false; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = data + (offset << 9); if (bne->keys.seq == bn->keys.seq) return true; @@ -1296,7 +1296,7 @@ fsck_err: if (ra->err[i]) continue; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); } else { @@ -1313,7 +1313,7 @@ fsck_err: offset += sectors; } - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) @@ -1793,8 +1793,8 @@ do_write: BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); - BUG_ON(b->written >= c->opts.btree_node_size); - BUG_ON(b->written & (c->opts.block_size - 1)); + BUG_ON(b->written >= btree_sectors(c)); + BUG_ON(b->written & (block_sectors(c) - 1)); BUG_ON(bset_written(b, btree_bset_last(b))); BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -1867,7 +1867,7 @@ do_write: memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); - BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); + BUG_ON(b->written + sectors_to_write > btree_sectors(c)); BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3e6dd2ed1c03..fd4089d19ad2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -223,12 +223,12 @@ retry: if (IS_ERR(wp)) return ERR_CAST(wp); - if (wp->sectors_free < c->opts.btree_node_size) { + if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < c->opts.btree_node_size) + if (ob->sectors_free < btree_sectors(c)) ob->sectors_free = 0; bch2_alloc_sectors_done(c, wp); @@ -236,7 +236,7 @@ retry: } bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c)); bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); @@ -1029,7 +1029,7 @@ retry: } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * c->opts.btree_node_size, + nr_nodes * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index d4574161a733..8cf59cee6e4e 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -218,7 +218,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size << 6; + ssize_t total = c->opts.btree_node_size >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4fef482ad60e..0d9d723c24bb 100644 --- a/fs/bcachefs/buckets.c +++ 
b/fs/bcachefs/buckets.c @@ -1000,7 +1000,7 @@ static int bch2_mark_extent(struct btree_trans *trans, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? c->opts.btree_node_size + ? btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; @@ -1609,7 +1609,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? c->opts.btree_node_size + ? btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; @@ -2184,7 +2184,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) alloc_heap alloc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / c->opts.btree_node_size); + ca->mi.bucket_size / btree_sectors(c)); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 78757dcede36..2d5dc2394bab 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -376,7 +376,7 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); /* If it's only one block, don't bother trying to compress: */ - if (bio_sectors(src) <= c->opts.block_size) + if (src->bi_iter.bi_size <= c->opts.block_size) return 0; dst_data = bio_map_or_bounce(c, dst, WRITE); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8592a0f6327e..161ae4fd59d9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1037,7 +1037,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_btree_ptr || k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = c->opts.btree_node_size; + size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 31adc0e0d452..bbdfccf24e53 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -868,8 +868,8 @@ static int bch2_fill_extent(struct bch_fs *c, else offset += p.crc.offset; - if ((offset & (c->opts.block_size - 1)) || - (k.k->size & (c->opts.block_size - 1))) + if ((offset & (block_sectors(c) - 1)) || + (k.k->size & (block_sectors(c) - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, @@ -1683,7 +1683,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 814984ec608c..1cfe433ded33 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1357,7 +1357,7 @@ void bch2_write(struct closure *cl) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio_sectors(bio) & (c->opts.block_size - 1)) { + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { bch_err_inum_ratelimited(c, op->pos.inode, "misaligned write"); op->error = -EIO; @@ -2437,7 +2437,7 @@ int bch2_fs_io_init(struct bch_fs *c) BIOSET_NEED_BVECS) || mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, - c->opts.btree_node_size, + btree_sectors(c), c->sb.encoded_extent_max) / PAGE_SECTORS, 0) || rhashtable_init(&c->promote_table, &bch_promote_params)) diff --git a/fs/bcachefs/journal_io.c 
b/fs/bcachefs/journal_io.c index 1a8c0a7eaca7..ae28cee127e3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -709,7 +709,7 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = c->opts.block_size; + sectors = block_sectors(c); goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; @@ -718,7 +718,7 @@ reread: * field of the journal entry we read, so try reading * again at next block boundary: */ - sectors = c->opts.block_size; + sectors = block_sectors(c); break; default: return ret; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e81e07a383bb..9b75c852bac8 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -141,41 +141,27 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -{ - struct bch_opts opts = bch2_opts_empty(); - -#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ - if (_sb_opt != NO_SB_OPT) \ - opt_set(opts, _name, _sb_opt(sb)); - BCH_OPTS() -#undef x - - return opts; -} - const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices +#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = ARRAY_SIZE(_choices),\ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ .to_text = _fn##_to_text -#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ - .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ + .mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ }, \ - .mode = _mode, \ + .flags = _flags, \ .hint = _hint, \ .help = _help, \ + .get_sb = _sb_opt, \ .set_sb = SET_##_sb_opt, \ _type \ }, @@ -218,7 +204,41 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, +static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +{ + if (v < opt->min) { + if (msg) + pr_err("invalid %s%s: too small (min %llu)", + msg, opt->attr.name, opt->min); + return -ERANGE; + } + + if (opt->max && v >= opt->max) { + if (msg) + pr_err("invalid %s%s: too big (max %llu)", + msg, opt->attr.name, opt->max); + return -ERANGE; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { + if (msg) + pr_err("invalid %s %s: not a multiple of 512", + msg, opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { + if (msg) + pr_err("invalid %s%s: must be a power of two", + msg, opt->attr.name); + return -EINVAL; + } + + return 0; +} + +int bch2_opt_parse(struct bch_fs *c, const char *msg, + const struct bch_option *opt, const char *val, u64 *res) { ssize_t ret; @@ -228,30 +248,13 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ret = kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res > 1) - return -ERANGE; break; case BCH_OPT_UINT: - ret = kstrtou64(val, 10, res); + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; - break; - case BCH_OPT_SECTORS: - ret = bch2_strtou64_h(val, res); - if (ret < 0) - return ret; - - if (*res & 511) - return -EINVAL; - - *res >>= 9; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; break; case BCH_OPT_STR: ret = match_string(opt->choices, -1, val); @@ -264,10 +267,12 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, if (!c) return 0; - return opt->parse(c, val, res); + ret = opt->parse(c, val, res); + if (ret < 0) + return ret; } - return 0; + return bch2_opt_validate(opt, msg, *res); } void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, @@ -288,10 +293,10 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: - pr_buf(out, "%lli", v); - break; - case BCH_OPT_SECTORS: - bch2_hprint(out, v << 9); + if (opt->flags & OPT_HUMAN_READABLE) + bch2_hprint(out, v); + else + pr_buf(out, "%lli", v); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) @@ -365,7 +370,8 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, "mount option ", + &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { @@ -385,7 +391,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, goto no_val; } - if (!(bch2_opt_table[id].mode & OPT_MOUNT)) + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && @@ -420,6 +426,65 @@ out: return ret; } +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) +{ + unsigned id; + int ret; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->get_sb == NO_SB_OPT) + continue; + + v 
= opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + ret = bch2_opt_validate(opt, "superblock option ", v); + if (ret) + return ret; + + bch2_opt_set_by_id(opts, id, v); + } + + return 0; +} + +void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = ilog2(v); + + opt->set_sb(sb, v); +} + +void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); + __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* io opts: */ struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index bb2ecc778a8c..45f73601e4a8 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -44,19 +44,22 @@ static inline const char *bch2_d_type_str(unsigned d_type) LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); /* When can be set: */ -enum opt_mode { +enum opt_flags { OPT_FS = (1 << 0), /* Filesystem option */ OPT_DEVICE = (1 << 1), /* Device option */ OPT_INODE = (1 << 2), /* Inode option */ OPT_FORMAT = (1 << 3), /* May be specified at format time */ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = (1 << 6), + OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, - BCH_OPT_SECTORS, BCH_OPT_STR, BCH_OPT_FN, }; @@ -88,13 +91,15 @@ enum opt_type { #define BCH_OPTS() \ x(block_size, u16, \ - OPT_FS|OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 16), \ BCH_SB_BLOCK_SIZE, 8, \ "size", NULL) \ - x(btree_node_size, u16, \ - OPT_FS|OPT_FORMAT, \ - OPT_SECTORS(1, 512), \ + x(btree_node_size, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 20), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ @@ -198,8 +203,9 @@ enum opt_type { BCH_SB_GC_RESERVE, 8, \ "%", "Percentage of disk space to reserve for copygc")\ x(gc_reserve_bytes, u64, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_SECTORS(0, U64_MAX), \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ + OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(0, U64_MAX), \ BCH_SB_GC_RESERVE_BYTES, 0, \ "%", "Amount of disk space to reserve for copygc\n" \ "Takes precedence over gc_reserve_percent if set")\ @@ -360,12 +366,12 @@ enum opt_type { "for performance testing purposes") \ x(fs_size, u64, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ @@ -424,13 +430,14 @@ struct printbuf; struct bch_option { struct attribute attr; + u64 (*get_sb)(const struct bch_sb *); void (*set_sb)(struct bch_sb *, u64); - enum opt_mode mode; enum opt_type type; + enum opt_flags flags; + u64 min, max; 
union { struct { - u64 min, max; }; struct { const char * const *choices; @@ -452,10 +459,13 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -struct bch_opts bch2_opts_from_sb(struct bch_sb *); +int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); +void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); +int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, + const char *, u64 *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 170f7d46fa34..c831d32c26fe 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -267,8 +267,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) block_size = le16_to_cpu(sb->block_size); - if (!is_power_of_2(block_size) || - block_size > PAGE_SECTORS) + if (block_size > PAGE_SECTORS) return "Bad block size"; if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) @@ -310,9 +309,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (!BCH_SB_BTREE_NODE_SIZE(sb)) return "Btree node size not set"; - if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) - return "Btree node size not a power of two"; - if (BCH_SB_GC_RESERVE(sb) < 5) return "gc reserve percentage too small"; @@ -627,8 +623,12 @@ got_super: err = "Superblock block size smaller than device block size"; ret = -EINVAL; if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) - goto err; + bdev_logical_block_size(sb->bdev)) { + pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); + goto err_no_print; + } ret = 0; sb->have_layout = true; @@ -636,8 +636,9 @@ out: pr_verbose_init(*opts, "ret %i", ret); return ret; err: - bch2_free_super(sb); pr_err("error reading superblock: %s", err); +err_no_print: + bch2_free_super(sb); goto out; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index afa1a8fa493b..e1d4fe5a8e49 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -762,10 +762,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); c->opts = bch2_opts_default; - bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); + ret = bch2_opts_from_sb(&c->opts, sb); + if (ret) + goto err; + bch2_opts_apply(&c->opts, opts); - c->block_bits = ilog2(c->opts.block_size); + c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); if (bch2_fs_init_fault("fs_alloc")) { @@ -877,7 +880,7 @@ static void print_mount_opts(struct bch_fs *c) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -1003,7 +1006,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) if (!sb_mi) return "Invalid superblock: member info area missing"; - if (le16_to_cpu(sb->block_size) != c->opts.block_size) + if (le16_to_cpu(sb->block_size) != block_sectors(c)) return 
"mismatched block size"; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3f51eda749f0..0a0798bae4d6 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -626,7 +626,7 @@ STORE(bch2_fs_opts_dir) if (!tmp) return -ENOMEM; - ret = bch2_opt_parse(c, opt, strim(tmp), &v); + ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); kfree(tmp); if (ret < 0) @@ -636,13 +636,7 @@ STORE(bch2_fs_opts_dir) if (ret < 0) return ret; - if (opt->set_sb != SET_NO_SB_OPT) { - mutex_lock(&c->sb_lock); - opt->set_sb(c->disk_sb.sb, v); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - + bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if ((id == Opt_background_target || @@ -665,7 +659,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->mode & OPT_FS)) + if (!(i->flags & OPT_FS)) continue; ret = sysfs_create_file(kobj, &i->attr); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 21823ce69237..a5122dbb2eb9 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, opt, buf, &v); + ret = bch2_opt_parse(c, NULL, opt, buf, &v); kfree(buf); if (ret < 0) -- cgit From e409999069928bc1da79f32e7adff88f981c65a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Dec 2021 14:34:03 -0500 Subject: bcachefs: Turn encoded_extent_max into a regular option It'll now be handled at format time and in sysfs like other options - it still can only be set at format time, though. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/compress.c | 21 ++++++++++----------- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extents.c | 2 +- fs/bcachefs/io.c | 14 +++++++------- fs/bcachefs/opts.h | 6 ++++++ fs/bcachefs/super-io.c | 1 - 7 files changed, 25 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 95b590d9ee7f..f0a8a0cabc65 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -634,7 +634,6 @@ struct bch_fs { u16 version; u16 version_min; - u16 encoded_extent_max; u8 nr_devices; u8 clean; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 2d5dc2394bab..f692f35a6a98 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -26,7 +26,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) { void *b; - BUG_ON(size > c->sb.encoded_extent_max << 9); + BUG_ON(size > c->opts.encoded_extent_max); b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); if (b) @@ -68,7 +68,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct page **pages = NULL; void *data; - BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + BUG_ON(start.bi_size > c->opts.encoded_extent_max); if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) @@ -231,8 +231,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size > c->sb.encoded_extent_max || - crc->compressed_size > c->sb.encoded_extent_max) { + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { bch_err(c, "error rewriting existing data: extent too big"); return -EIO; 
} @@ -272,8 +272,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, size_t dst_len = crc.uncompressed_size << 9; int ret = -ENOMEM; - if (crc.uncompressed_size > c->sb.encoded_extent_max || - crc.compressed_size > c->sb.encoded_extent_max) + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -466,7 +466,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); @@ -544,10 +544,9 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t max_extent = c->sb.encoded_extent_max << 9; size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = zstd_get_params(0, max_extent); + ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); struct { unsigned feature; unsigned type; @@ -579,14 +578,14 @@ have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } if (!mempool_initialized(&c->compression_bounce[WRITE])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4424cb3ac822..c3f86cc39842 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1152,7 +1152,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 161ae4fd59d9..3ed724e1fc98 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -302,7 +302,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.csum_type && lp.crc.uncompressed_size + - rp.crc.uncompressed_size > c->sb.encoded_extent_max) + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1cfe433ded33..649373acde73 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -806,7 +806,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, */ bch2_bio_alloc_pages_pool(c, bio, min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9)); + c->opts.encoded_extent_max)); if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = @@ -1003,8 +1003,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, size_t dst_len, src_len; if (page_alloc_failed && - bio_sectors(dst) < wp->sectors_free && - bio_sectors(dst) < c->sb.encoded_extent_max) + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; BUG_ON(op->compression_type && @@ -1024,7 +1024,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if 
(op->csum_type) dst_len = min_t(unsigned, dst_len, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); if (bounce) { swap(dst->bi_iter.bi_size, dst_len); @@ -2437,9 +2437,9 @@ int bch2_fs_io_init(struct bch_fs *c) BIOSET_NEED_BVECS) || mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, - btree_sectors(c), - c->sb.encoded_extent_max) / - PAGE_SECTORS, 0) || + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0) || rhashtable_init(&c->promote_table, &bch_promote_params)) return -ENOMEM; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 45f73601e4a8..617d9fd2ac0a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -127,6 +127,12 @@ enum opt_type { OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ + x(encoded_extent_max, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ + OPT_UINT(4096, 2U << 20), \ + BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ + "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c831d32c26fe..c69f25e1a867 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -368,7 +368,6 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; -- cgit From 73b460977eb8782a769ff81200a8b40cf1a3a91d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Dec 2021 22:39:50 -0500 Subject: bcachefs: Fix a null ptr deref in bch2_inode_delete_keys() Similarly to bch2_btree_delete_range_trans(), bch2_inode_delete_keys() may sometimes split compressed extents, and needs to pass in a disk reservation. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index d9ccc7c063ac..104575734b96 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -4,6 +4,7 @@ #include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" +#include "buckets.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -588,6 +589,8 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, int ret = 0; while (!ret || ret == -EINTR) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); struct btree_iter iter; struct bkey_s_c k; struct bkey_i delete; @@ -630,8 +633,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, NULL, + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); err: offset = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -- cgit From 7243498de74d32d0afe3b923cd893a6b49f70c3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 02:55:11 -0500 Subject: bcachefs: Kill non-lru cache replacement policies Prep work for persistent LRUs and getting rid of the in memory bucket array. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 82 +----------------------------------------- fs/bcachefs/bcachefs_format.h | 15 +------- fs/bcachefs/opts.c | 5 --- fs/bcachefs/opts.h | 1 - fs/bcachefs/super-io.h | 1 - fs/bcachefs/super_types.h | 1 - fs/bcachefs/sysfs.c | 26 -------------- 7 files changed, 2 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 39538dada301..9f98860da5cc 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -628,76 +628,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) up_read(&ca->bucket_lock); } -static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t b, start; - - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - start = ca->fifo_last_bucket; - - do { - ca->fifo_last_bucket++; - if (ca->fifo_last_bucket == ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - b = ca->fifo_last_bucket; - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } while (ca->fifo_last_bucket != start); -} - -static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t checked, i; - - for (checked = 0; - checked < ca->mi.nbuckets / 2; - checked++) { - size_t b = bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } - - sort(ca->alloc_heap.data, - ca->alloc_heap.used, - sizeof(ca->alloc_heap.data[0]), - bucket_idx_cmp, NULL); - - /* remove duplicates: */ - for (i = 0; i + 1 < ca->alloc_heap.used; i++) - if (ca->alloc_heap.data[i].bucket == - ca->alloc_heap.data[i + 1].bucket) - ca->alloc_heap.data[i].nr = 0; -} - static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { size_t i, nr = 0; @@ -705,17 +635,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; - switch (ca->mi.replacement) { - case BCH_CACHE_REPLACEMENT_lru: - find_reclaimable_buckets_lru(c, ca); - break; - case BCH_CACHE_REPLACEMENT_fifo: - find_reclaimable_buckets_fifo(c, ca); - break; - case BCH_CACHE_REPLACEMENT_random: - find_reclaimable_buckets_random(c, ca); - break; - } + find_reclaimable_buckets_lru(c, ca); heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bef924ab12a8..3c0ba301dad5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1067,8 +1067,7 @@ struct bch_member { }; LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -/* 4-10 unused, was TIER, HAS_(META)DATA */ -LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +/* 4-14 unused, was TIER, HAS_(META)DATA, 
REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) @@ -1092,18 +1091,6 @@ enum bch_member_state { BCH_MEMBER_STATE_NR }; -#define BCH_CACHE_REPLACEMENT_POLICIES() \ - x(lru, 0) \ - x(fifo, 1) \ - x(random, 2) - -enum bch_cache_replacement_policies { -#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, - BCH_CACHE_REPLACEMENT_POLICIES() -#undef x - BCH_CACHE_REPLACEMENT_NR -}; - struct bch_sb_field_members { struct bch_sb_field field; struct bch_member members[0]; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 9b75c852bac8..d9ca69f2ecde 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -66,11 +66,6 @@ const char * const bch2_data_types[] = { NULL }; -const char * const bch2_cache_replacement_policies[] = { - BCH_CACHE_REPLACEMENT_POLICIES() - NULL -}; - const char * const bch2_member_states[] = { BCH_MEMBER_STATES() NULL diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 617d9fd2ac0a..af61fe588d3f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -19,7 +19,6 @@ extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; -extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; extern const char * const bch2_d_types[]; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 62d040d571c0..f182711cc48f 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -110,7 +110,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), .state = BCH_MEMBER_STATE(mi), - .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), .durability = BCH_MEMBER_DURABILITY(mi) diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index b14b2d82c655..1c0241304f32 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -30,7 +30,6 @@ struct bch_member_cpu { u16 bucket_size; /* sectors */ u16 group; u8 state; - u8 replacement; u8 discard; u8 data_allowed; u8 durability; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 0a0798bae4d6..341ba3fdd6fc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -177,7 +177,6 @@ read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); rw_attribute(discard); -rw_attribute(cache_replacement_policy); rw_attribute(label); rw_attribute(copy_gc_enabled); @@ -826,14 +825,6 @@ SHOW(bch2_dev) return out.pos - buf; } - if (attr == &sysfs_cache_replacement_policy) { - bch2_string_opt_to_text(&out, - bch2_cache_replacement_policies, - ca->mi.replacement); - pr_buf(&out, "\n"); - return out.pos - buf; - } - if (attr == &sysfs_state_rw) { bch2_string_opt_to_text(&out, bch2_member_states, ca->mi.state); @@ -893,22 +884,6 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } - if (attr == &sysfs_cache_replacement_policy) { - ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); - - if (v < 0) - return v; - - mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - - if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { - SET_BCH_MEMBER_REPLACEMENT(mi, v); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); - } 
- if (attr == &sysfs_label) { char *tmp; int ret; @@ -939,7 +914,6 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, - &sysfs_cache_replacement_policy, &sysfs_state_rw, &sysfs_label, -- cgit From 09943313d70fd04eb9475ef9a83c1538234874fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 04:22:20 -0500 Subject: bcachefs: Rewrite bch2_bucket_alloc_new_fs() This changes bch2_bucket_alloc_new_fs() to a simple bump allocator that doesn't need to use the in memory bucket array, part of a larger patch series to entirely get rid of the in memory bucket array, except for gc/fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++++++++ fs/bcachefs/alloc_foreground.c | 22 ++++++++-------------- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 2 ++ fs/bcachefs/super.c | 2 ++ fs/bcachefs/super.h | 21 +++++++++++++++++++++ 6 files changed, 46 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9f98860da5cc..e81e05629ffc 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -513,6 +513,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, test_bit(b, ca->buckets_nouse)) return false; + if (ca->new_fs_bucket_idx) { + /* + * Device or filesystem is still being initialized, and we + * haven't fully marked superblocks & journal: + */ + if (is_superblock_bucket(ca, b)) + return false; + + if (b < ca->new_fs_bucket_idx) + return false; + } + gc_gen = bucket_gc_gen(bucket(ca, b)); ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 646d556a5c24..e38ee6bf0c46 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -144,21 +144,15 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { - struct bucket_array *buckets; - ssize_t b; + while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { + u64 b = ca->new_fs_bucket_idx++; - rcu_read_lock(); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)) && - !buckets->b[b].mark.owned_by_allocator) - goto success; - b = -1; -success: - rcu_read_unlock(); - return b; + if (!is_superblock_bucket(ca, b) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) + return b; + } + + return -1; } static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f0a8a0cabc65..c8c7f6b8ee21 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -453,6 +453,7 @@ struct bch_dev { struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ + u64 new_fs_bucket_idx; struct task_struct __rcu *alloc_thread; /* diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 29fe6260ace5..bd552a942ac6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1429,6 +1429,8 @@ int bch2_fs_initialize(struct bch_fs *c) percpu_ref_put(&ca->ref); goto err; } + + ca->new_fs_bucket_idx = 0; } err = "error creating root snapshot node"; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e1d4fe5a8e49..94429c00e87a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1726,6 +1726,8 @@ have_slot: 
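A bump allocator in this sense is just a cursor that walks the device's buckets once and hands out each usable bucket in order, with no heap or in-memory bucket array behind it. A rough sketch of the pattern, using simplified stand-in types rather than the in-tree bch_dev fields:

#include <stdbool.h>
#include <stdint.h>

struct bump_alloc {
	uint64_t next;		/* cursor: next bucket to hand out */
	uint64_t nbuckets;	/* total buckets on the device */
};

/* Returns the next usable bucket, or -1 once the device is exhausted.
 * unusable() stands in for the real checks (superblock/journal buckets,
 * buckets marked nouse). */
static int64_t bump_alloc_bucket(struct bump_alloc *a,
				 bool (*unusable)(uint64_t b))
{
	while (a->next < a->nbuckets) {
		uint64_t b = a->next++;

		if (!unusable(b))
			return (int64_t) b;
	}
	return -1;
}

Because the cursor only moves forward, no bucket is handed out twice, which is what lets this path run before the in-memory bucket marks are usable.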
if (ret) goto err_late; + ca->new_fs_bucket_idx = 0; + if (ca->mi.state == BCH_MEMBER_STATE_rw) { ret = __bch2_dev_read_write(c, ca); if (ret) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index b151bffcd3a3..a5249c54426d 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -194,6 +194,27 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) return devs; } +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); -- cgit From 77170d0dd7020ed72cd748a0c354bf0c0345b6b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 04:27:01 -0500 Subject: bcachefs: bch2_bucket_alloc_new_fs() no longer depends on bucket marks Now that bch2_bucket_alloc_new_fs() isn't looking at bucket marks to decide what buckets are eligible to allocate, we can clean up the filesystem initialization and device add paths. Previously, we had to use ancient code to mark superblock/journal buckets in the in memory bucket marks as we allocated them, and then zero that out and re-do that marking using the newer transational bucket mark paths. Now, we can simply delete the in-memory bucket marking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 17 ++----------- fs/bcachefs/btree_gc.h | 1 - fs/bcachefs/buckets.c | 66 ++++++++++++-------------------------------------- fs/bcachefs/journal.c | 12 +-------- fs/bcachefs/recovery.c | 3 --- fs/bcachefs/super.c | 26 -------------------- 6 files changed, 19 insertions(+), 106 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ccb85850080b..3fa9f5996fca 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1056,23 +1056,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, } while (start < end); } -void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) +static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; u64 b; - /* - * This conditional is kind of gross, but we may be called from the - * device add path, before the new device has actually been added to the - * running filesystem: - */ - if (c) { - lockdep_assert_held(&c->sb_lock); - percpu_down_read(&c->mark_lock); - } - for (i = 0; i < layout->nr_superblocks; i++) { u64 offset = le64_to_cpu(layout->sb_offset[i]); @@ -1091,9 +1081,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } - - if (c) - percpu_up_read(&c->mark_lock); } static void bch2_mark_superblocks(struct bch_fs *c) diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 59dfb069e699..0665f5941fcc 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); -void bch2_mark_dev_superblock(struct bch_fs *, struct 
bch_dev *, unsigned); /* * For concurrent mark and sweep (with other index updates), we define a total diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0d9d723c24bb..63409ddd975a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -369,13 +369,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; - /* - * Hack for bch2_fs_initialize path, where we're first marking sb and - * journal non-transactionally: - */ - if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) - journal_seq = 1; - preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); @@ -536,19 +529,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -#define do_mark_fn(fn, c, pos, flags, ...) \ -({ \ - int gc, ret = 0; \ - \ - percpu_rwsem_assert_held(&c->mark_lock); \ - \ - for (gc = 0; gc < 2 && !ret; gc++) \ - if (!gc == !(flags & BTREE_TRIGGER_GC) || \ - (gc && gc_visited(c, pos))) \ - ret = fn(c, __VA_ARGS__, gc); \ - ret; \ -}) - void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator) { @@ -659,17 +639,27 @@ static int bch2_mark_alloc(struct btree_trans *trans, overflow; \ }) -static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, bool gc) +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) { - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g; struct bucket_mark old, new; bool overflow; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); BUG_ON(data_type != BCH_DATA_sb && data_type != BCH_DATA_journal); + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return; + + percpu_down_read(&c->mark_lock); + g = __bucket(ca, b, true); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; overflow = checked_add(new.dirty_sectors, sectors); @@ -687,32 +677,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - if (c) - bch2_dev_usage_update(c, ca, old, new, 0, gc); - - return 0; -} - -void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type type, - unsigned sectors, struct gc_pos pos, - unsigned flags) -{ - BUG_ON(type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return; - - if (likely(c)) { - do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, - ca, b, type, sectors); - } else { - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); - } + bch2_dev_usage_update(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); } static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f15d265ef1b6..020c7b0a3469 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -769,11 +769,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long b; if (new_fs) { - if (c) - percpu_down_read(&c->mark_lock); b = bch2_bucket_alloc_new_fs(ca); if (b < 0) { - percpu_up_read(&c->mark_lock); ret = -ENOSPC; goto err; } @@ -821,14 +818,7 @@ static int 
__bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) spin_unlock(&c->journal.lock); - if (new_fs) { - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); - if (c) - percpu_up_read(&c->mark_lock); - } else { + if (!new_fs) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, b, BCH_DATA_journal, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bd552a942ac6..9916fad292be 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1383,9 +1383,6 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } - - for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 94429c00e87a..75410b5dba14 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1606,8 +1606,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; - struct bucket_array *buckets; - struct bucket *g; unsigned dev_idx, nr_devices, u64s; int ret; @@ -1637,20 +1635,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } - /* - * We want to allocate journal on the new device before adding the new - * device to the filesystem because allocating after we attach requires - * spinning up the allocator thread, and the allocator thread requires - * doing btree writes, which if the existing devices are RO isn't going - * to work - * - * So we have to mark where the superblocks are, but marking allocated - * data normally updates the filesystem usage too, so we have to mark, - * allocate the journal, reset all the marks, then remark after we - * attach... - */ - bch2_mark_dev_superblock(NULL, ca, 0); - err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) @@ -1711,16 +1695,6 @@ have_slot: bch2_dev_usage_journal_reserve(c); - /* - * Clear marks before marking transactionally in the btree, so that - * per-device accounting gets done correctly: - */ - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for_each_bucket(g, buckets) - atomic64_set(&g->_mark.v, 0); - up_read(&ca->bucket_lock); - err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); if (ret) -- cgit From c64740ef27cfe2092e3a56509b3bf44e9b10ae49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Dec 2021 19:30:42 -0500 Subject: bcachefs: Don't start allocator threads too early If the allocator threads start before journal replay has finished replaying alloc keys, journal replay might overwrite the allocator's btree updates. 
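The change below is an ordering gate: the allocator's run condition additionally checks a "replay done" flag, and recovery sets that flag and wakes the allocator threads only once the alloc keys have been replayed. A generic sketch of the pattern, with illustrative flag and field names rather than the in-tree ones:

#include <linux/bitops.h>
#include <linux/wait.h>

enum {
	FS_STATE_RW,
	FS_STATE_ALLOC_REPLAY_DONE,
};

struct fs_state {
	unsigned long		flags;
	wait_queue_head_t	allocator_wait;
};

/* checked by the allocator thread before it does any btree updates */
static bool allocator_may_run(struct fs_state *fs)
{
	return test_bit(FS_STATE_RW, &fs->flags) &&
	       test_bit(FS_STATE_ALLOC_REPLAY_DONE, &fs->flags);
}

/* called from journal replay, after the alloc keys have been applied */
static void allow_allocators(struct fs_state *fs)
{
	set_bit(FS_STATE_ALLOC_REPLAY_DONE, &fs->flags);
	wake_up(&fs->allocator_wait);	/* allocator re-evaluates allocator_may_run() */
}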
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 9 ++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e81e05629ffc..fe7bc3cdee30 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -900,7 +900,8 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) static bool allocator_thread_running(struct bch_dev *ca) { unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && + test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) ? ALLOCATOR_running : ALLOCATOR_stopped; alloc_thread_set_state(ca, state); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c8c7f6b8ee21..5f18531dc34c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -509,6 +509,7 @@ enum { BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_ALLOC_REPLAY_DONE, BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9916fad292be..d0ceac0f2b39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -567,9 +567,10 @@ static int bch2_journal_replay(struct bch_fs *c, struct journal_keys keys) { struct journal *j = &c->journal; + struct bch_dev *ca; struct journal_key *i; u64 seq; - int ret; + int ret, idx; sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); @@ -593,6 +594,11 @@ static int bch2_journal_replay(struct bch_fs *c, } } + /* Now we can start the allocator threads: */ + set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); + for_each_member_device(ca, c, idx) + bch2_wake_allocator(ca); + /* * Next replay updates to interior btree nodes: */ @@ -1391,6 +1397,7 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); + set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -- cgit From 4b674b09a950fb20aa30e902331e4eba12059b80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 03:08:06 -0500 Subject: bcachefs: Kill ptr_bucket_mark() Only used in one place, we can just delete it. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index cc3e8b9b8faf..afa29d6c5a73 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -91,18 +91,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, return ptr->cached ? 
BCH_DATA_cached : BCH_DATA_user; } -static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - struct bucket_mark m; - - rcu_read_lock(); - m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); - rcu_read_unlock(); - - return m; -} - static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); @@ -122,7 +110,13 @@ static inline int gen_after(u8 a, u8 b) static inline u8 ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); + u8 ret; + + rcu_read_lock(); + ret = gen_after(PTR_BUCKET(ca, ptr, 0)->mark.gen, ptr->gen); + rcu_read_unlock(); + + return ret; } /* bucket gc marks */ -- cgit From e75b2d4c1c829142f8e3e64a9b3cf7faedcfb640 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Dec 2021 21:35:28 -0500 Subject: bcachefs: bch2_journal_key_insert() no longer transfers ownership bch2_journal_key_insert() used to assume that the key passed to it was allocated with kmalloc(), and on success took ownership. This patch deletes that behaviour, making it more similar to bch2_trans_update()/bch2_trans_commit(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 +++++------- fs/bcachefs/buckets.c | 18 ++++++------------ fs/bcachefs/recovery.c | 35 +++++++++++++++++++++-------------- fs/bcachefs/recovery.h | 2 ++ 4 files changed, 34 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3fa9f5996fca..d525a3045766 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -169,7 +169,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new->v.min_key = new_min; SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); if (ret) { kfree(new); return ret; @@ -198,7 +198,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new->k.p = new_max; SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); if (ret) { kfree(new); return ret; @@ -690,7 +690,7 @@ found: } } - ret = bch2_journal_key_insert(c, btree_id, level, new); + ret = bch2_journal_key_insert_take(c, btree_id, level, new); if (ret) kfree(new); else @@ -1390,8 +1390,7 @@ static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, } ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - if (ret) - kfree(new); + kfree(new); } fsck_err: return ret; @@ -1516,8 +1515,7 @@ inconsistent: stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); - if (ret) - kfree(new); + kfree(new); } fsck_err: return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 63409ddd975a..1959601fe056 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1217,19 +1217,13 @@ not_found: */ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { - struct bkey_i_error *new; + struct bkey_i_error new; - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (!new) { - bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; - } - - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = p.k->p; - new->k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + bkey_init(&new.k); + new.k.type = KEY_TYPE_error; + new.k.p = p.k->p; + new.k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); } fsck_err: return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d0ceac0f2b39..118d536b4376 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -109,8 +109,8 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign iter->idx++; } -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) { struct journal_key n = { .btree_id = id, @@ -157,27 +157,34 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, return 0; } -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) { - struct bkey_i *whiteout = - kmalloc(sizeof(struct bkey), GFP_KERNEL); + struct bkey_i *n; int ret; - if (!whiteout) { - bch_err(c, "%s: error allocating new key", __func__); + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) return -ENOMEM; - } - - bkey_init(&whiteout->k); - whiteout->k.p = pos; - ret = bch2_journal_key_insert(c, id, level, whiteout); + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); if (ret) - kfree(whiteout); + kfree(n); return ret; } +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->idx - iter->keys->nr diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e45c70b3693f..1504e0bdb940 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,6 +31,8 @@ struct btree_and_journal_iter { } last; }; +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, -- cgit From 4141fde0be05beb529ee6433b9808f815254901b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 18:40:15 -0500 Subject: bcachefs: Fix bch2_journal_meta() This patch ensures that the journal entry written gets written as flush entry, which is important for the shutdown path - the last entry written needs to be a flush entry. 
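The key change, condensed from the diff below (elisions marked): after taking its journal reservation, bch2_journal_meta() marks the buffer for that sequence as a flush write before releasing the reservation, so the final write at shutdown is guaranteed to be a flush entry:

	struct journal_buf *buf;
	/* ... take journal reservation into res ... */
	buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
	buf->must_flush = true;
	set_bit(JOURNAL_NEED_WRITE, &j->flags);

	bch2_journal_res_put(j, &res);
	return bch2_journal_flush_seq(j, res.seq);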
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 +++++ fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/sysfs.c | 5 ----- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 020c7b0a3469..d27d65a17e43 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -641,6 +641,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) int bch2_journal_meta(struct journal *j) { + struct journal_buf *buf; struct journal_res res; int ret; @@ -650,6 +651,10 @@ int bch2_journal_meta(struct journal *j) if (ret) return ret; + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); + bch2_journal_res_put(j, &res); return bch2_journal_flush_seq(j, res.seq); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ae28cee127e3..bda605095825 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1445,7 +1445,7 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (journal_entry_empty(jset)) + if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) j->last_empty_seq = le64_to_cpu(jset->seq); if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 341ba3fdd6fc..1d1e2c6fc2e2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -131,7 +131,6 @@ do { \ return strtoi_h(buf, &var) ?: (ssize_t) size; \ } while (0) -write_attribute(trigger_journal_flush); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -482,9 +481,6 @@ STORE(bch2_fs) /* Debugging: */ - if (attr == &sysfs_trigger_journal_flush) - bch2_journal_meta(&c->journal); - if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -574,7 +570,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_read, &sysfs_io_timers_write, - &sysfs_trigger_journal_flush, &sysfs_trigger_gc, &sysfs_prune_cache, -- cgit From ffa7d26244e4100600e595e537f4f144a48ab517 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 20:13:47 -0500 Subject: bcachefs: Use BTREE_ITER_NOPRESERVE in bch2_btree_iter_verify_ret() This fixes a transaction path overflow. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 777197ec2656..0b5bf75fbf89 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -731,6 +731,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_NOPRESERVE| BTREE_ITER_ALL_SNAPSHOTS); prev = bch2_btree_iter_prev(©); if (!prev.k) -- cgit From 8511632d44a5846f8edc387e90858ae208715574 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Dec 2021 16:55:49 -0500 Subject: bcachefs: Journal initialization fixes This fixes a rare bug when mounting & unmounting RO - flushing a clean filesystem that never went RO should be a no op. 
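Concretely (condensed from the diff below, with elisions), bch2_fs_journal_start() now seeds flushed_seq_ondisk and last_empty_seq so that a clean filesystem already appears flushed and empty at mount time, and a read-only mount/unmount cycle does not trigger a spurious journal write:

	j->flushed_seq_ondisk = cur_seq - 1;
	/* ... */
	if (list_empty(journal_entries))
		j->last_empty_seq = cur_seq - 1;
	/* ... and for each journal entry read from disk: */
	if (journal_entry_empty(&i->j))
		j->last_empty_seq = le64_to_cpu(i->j.seq);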
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d27d65a17e43..7c5ce5b47493 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -989,10 +989,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq = last_seq; j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq - 1; + fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); @@ -1005,6 +1009,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, if (seq < last_seq) continue; + if (journal_entry_empty(&i->j)) + j->last_empty_seq = le64_to_cpu(i->j.seq); + p = journal_seq_pin(j, seq); p->devs.nr = 0; @@ -1012,6 +1019,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq; + spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); -- cgit From 04f0f77df2324e5a2c3d60df4acda7cbe464d38e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Dec 2021 16:59:36 -0500 Subject: bcachefs: Delete some obsolete journal_seq_blacklist code Since metadata version bcachefs_metadata_version_btree_ptr_sectors_written, we haven't needed the journal seq blacklist mechanism for ignoring blacklisted btree node writes - we now only need it for ignoring journal entries that were written after the newest flush journal entry, and then we only need to keep those blacklist entries around until journal replay is finished. That means we can delete the code for scanning btree nodes to GC journal_seq_blacklist entries. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/journal_seq_blacklist.c | 78 ------------------------------------- fs/bcachefs/journal_seq_blacklist.h | 2 - fs/bcachefs/recovery.c | 26 ++++++------- fs/bcachefs/super.c | 5 --- 5 files changed, 11 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5f18531dc34c..f41d9b3ac483 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -750,7 +750,6 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 79bc0e49389b..10bd23e969d2 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -235,81 +235,3 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .validate = bch2_sb_journal_seq_blacklist_validate, .to_text = bch2_sb_journal_seq_blacklist_to_text }; - -void bch2_blacklist_entries_gc(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; - struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans trans; - unsigned i, nr, new_nr; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(&trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_STOPPING, &c->flags)) - b = bch2_btree_iter_next_node(&iter); - - if (ret == -EINTR) - goto retry; - - bch2_trans_iter_exit(&trans, &iter); - } - - bch2_trans_exit(&trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); - if (!bl) - goto out; - - nr = blacklist_nr_entries(bl); - dst = bl->start; - - t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - - for (src = bl->start, i = eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty) - *dst++ = *src; - } - - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - new_nr ? 
sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); - - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); - - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e25..b4f876a04586 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,4 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); - #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 118d536b4376..ffa8ab933a11 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1162,16 +1162,6 @@ use_clean: if (ret) goto err; - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that - * happened before their corresponding journal writes - those btree - * writes need to be ignored, by skipping and blacklisting the next few - * journal sequence numbers: - */ - if (!c->sb.clean) - journal_seq += 8; - if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); @@ -1309,7 +1299,8 @@ use_clean: } if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; bch_move_stats_init(&stats, "recovery"); @@ -1326,6 +1317,15 @@ use_clean: } mutex_lock(&c->sb_lock); + /* + * With journal replay done, we can clear the journal seq blacklist + * table: + */ + BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written); + + bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); + if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1349,10 +1349,6 @@ use_clean: bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); - ret = 0; out: set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 75410b5dba14..ec55ab865b8f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -535,8 +535,6 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_STOPPING, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -700,9 +698,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); -- cgit From 57af63b286a532f425e425c0684eda6fb5f7c284 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 21:14:49 -0500 Subject: bcachefs: bch2_alloc_sectors_append_ptrs() now takes cached flag Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 14 
++++++++------ fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/io.c | 8 ++------ 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e38ee6bf0c46..af651dd9a36f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -866,7 +866,8 @@ err: * as allocated out of @ob */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors) + struct bkey_i *k, unsigned sectors, + bool cached) { struct open_bucket *ob; @@ -877,13 +878,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr tmp = ob->ptr; + struct bch_extent_ptr ptr = ob->ptr; - tmp.cached = !ca->mi.durability && - wp->type == BCH_DATA_user; + ptr.cached = cached || + (!ca->mi.durability && + wp->type == BCH_DATA_user); - tmp.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_bkey_append_ptr(k, tmp); + ptr.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_bkey_append_ptr(k, ptr); BUG_ON(sectors > ob->sectors_free); ob->sectors_free -= sectors; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 2e81712ba8d1..d8888785676d 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -106,7 +106,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, struct closure *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned); + struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fd4089d19ad2..6ef0711431a1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -236,7 +236,7 @@ retry: } bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c)); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 649373acde73..d69e63e519bf 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -746,7 +746,6 @@ static void init_append_extent(struct bch_write_op *op, struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e; - struct bch_extent_ptr *ptr; op->pos.offset += crc.uncompressed_size; @@ -760,11 +759,8 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size); - - if (op->flags & BCH_WRITE_CACHED) - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->cached = true; + bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); } -- cgit From abe19d458e8fffbebacaad3aad64604d2819913a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 21:21:46 -0500 Subject: bcachefs: Refactor open_bucket code Prep work for adding a hash table of open buckets - instead of embedding a bch_extent_ptr, we need to refer to the bucket directly so that we're not calling sector_to_bucket() in the hash table lookup code, which has an 
expensive divide. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 +-------- fs/bcachefs/alloc_background.h | 2 - fs/bcachefs/alloc_foreground.c | 100 ++++++++++++++++++++++++++--------------- fs/bcachefs/alloc_foreground.h | 5 ++- fs/bcachefs/alloc_types.h | 9 ++-- fs/bcachefs/ec.c | 8 ++-- fs/bcachefs/journal.c | 2 +- fs/bcachefs/sysfs.c | 3 +- 8 files changed, 83 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fe7bc3cdee30..8831b2a0303a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1066,7 +1066,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list && - ob->ptr.dev == ca->dev_idx) + ob->dev == ca->dev_idx) ret = true; spin_unlock(&ob->lock); } @@ -1213,22 +1213,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } - -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s\n", - ob - c->open_buckets, - atomic_read(&ob->pin), - bch2_data_types[ob->type]); - } - spin_unlock(&ob->lock); - } - -} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e3cdb8bc1dd8..86b64177b3d0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -142,6 +142,4 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); - #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index af651dd9a36f..35a2683d8807 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -45,7 +45,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { bch2_ec_bucket_written(c, ob); @@ -55,9 +55,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; - ob->type = 0; + ob->data_type = 0; spin_unlock(&ob->lock); percpu_up_read(&c->mark_lock); @@ -81,8 +81,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c, unsigned i; open_bucket_for_each(c, obs, ob, i) - if (ob->ptr.dev == dev && - ob->ec) + if (ob->dev == dev && ob->ec) bch2_ec_bucket_cancel(c, ob); } @@ -95,18 +94,19 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ob = c->open_buckets + c->open_buckets_freelist; c->open_buckets_freelist = ob->freelist; atomic_set(&ob->pin, 1); - ob->type = 0; + ob->data_type = 0; c->open_buckets_nr_free--; return ob; } + static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bool may_realloc = wp->type == BCH_DATA_user; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + bool may_realloc = wp->data_type == 
BCH_DATA_user; BUG_ON(ca->open_buckets_partial_nr > ARRAY_SIZE(ca->open_buckets_partial)); @@ -133,11 +133,13 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) struct open_bucket *ob; unsigned i; + rcu_read_lock(); open_bucket_for_each(c, obs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - BUG_ON(ptr_stale(ca, &ob->ptr)); + BUG_ON(bucket(ca, ob->bucket)->mark.gen != ob->gen); } + rcu_read_unlock(); #endif } @@ -246,13 +248,9 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; - ob->ptr = (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = bucket(ca, b)->mark.gen, - .offset = bucket_to_sector(ca, b), - .dev = ca->dev_idx, - }; - + ob->dev = ca->dev_idx; + ob->gen = bucket(ca, b)->mark.gen; + ob->bucket = b; spin_unlock(&ob->lock); if (c->blocked_allocate_open_bucket) { @@ -333,9 +331,9 @@ static void add_new_bucket(struct bch_fs *c, struct open_bucket *ob) { unsigned durability = - bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; + bch_dev_bkey_exists(c, ob->dev)->mi.durability; - __clear_bit(ob->ptr.dev, devs_may_alloc->d); + __clear_bit(ob->dev, devs_may_alloc->d); *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ? durability : 1; *have_cache |= !durability; @@ -445,13 +443,13 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, continue; ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->ptr.dev == devs_sorted.devs[i] && + if (ob->dev == devs_sorted.devs[i] && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); ob->ec_idx = ec_idx; ob->ec = h->s; @@ -481,12 +479,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (*nr_effective < nr_replicas && - test_bit(ob->ptr.dev, devs_may_alloc->d) && + test_bit(ob->dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_user && !*have_cache)) && + (wp->data_type == BCH_DATA_user && !*have_cache)) && (ob->ec || !need_ec)) { add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, @@ -518,7 +516,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, unsigned i; rcu_read_lock(); - devs = target_rw_devs(c, wp->type, target); + devs = target_rw_devs(c, wp->data_type, target); rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ @@ -526,7 +524,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, __clear_bit(devs_have->devs[i], devs.d); open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + __clear_bit(ob->dev, devs.d); if (erasure_code) { if (!ec_open_bucket(c, ptrs)) { @@ -586,7 +584,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, unsigned i, j; open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->ptr.dev == ca->dev_idx; + bool drop = !ca || ob->dev == ca->dev_idx; if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); @@ -595,7 +593,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, continue; ob2 = c->open_buckets + ob->ec->blocks[j]; - drop |= ob2->ptr.dev == ca->dev_idx; + drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } @@ -779,11 +777,11 @@ retry: wp = writepoint_find(c, write_point.v); - if 
(wp->type == BCH_DATA_user) + if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; /* metadata may not allocate on cache devices: */ - if (wp->type != BCH_DATA_user) + if (wp->data_type != BCH_DATA_user) have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -861,6 +859,20 @@ err: } } +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} + /* * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob @@ -877,14 +889,13 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, wp->sectors_free -= sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr ptr = ob->ptr; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ptr.cached = cached || (!ca->mi.durability && - wp->type == BCH_DATA_user); + wp->data_type == BCH_DATA_user); - ptr.offset += ca->mi.bucket_size - ob->sectors_free; bch2_bkey_append_ptr(k, ptr); BUG_ON(sectors > ob->sectors_free); @@ -915,7 +926,7 @@ static inline void writepoint_init(struct write_point *wp, enum bch_data_type type) { mutex_init(&wp->lock); - wp->type = type; + wp->data_type = type; } void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -952,3 +963,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) writepoint_hash(c, wp->write_point)); } } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->data_type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index d8888785676d..39d8ae5bbb96 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -85,7 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->type = wp->type; + ob->data_type = wp->data_type; atomic_inc(&ob->pin); ob_push(c, ptrs, ob); } @@ -105,6 +105,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, struct closure *); +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); @@ -127,4 +128,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4a1cd8b73d16..bd173c7c334b 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -49,12 +49,15 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - u8 type; + enum bch_data_type 
data_type:3; unsigned valid:1; unsigned on_partial_list:1; int alloc_reserve:3; + unsigned sectors_free; - struct bch_extent_ptr ptr; + u8 dev; + u8 gen; + u64 bucket; struct ec_stripe_new *ec; }; @@ -74,7 +77,7 @@ struct write_point { struct mutex lock; u64 last_used; unsigned long write_point; - enum bch_data_type type; + enum bch_data_type data_type; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c3f86cc39842..05f55b74d641 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1063,7 +1063,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) if (!ob) return NULL; - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); @@ -1318,7 +1318,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data + h->s->nr_parity); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1346,7 +1346,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1535,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) continue; ob = c->open_buckets + h->s->blocks[i]; - if (ob->ptr.dev == ca->dev_idx) + if (ob->dev == ca->dev_idx) goto found; } goto unlock; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7c5ce5b47493..4fadb41c4c1e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -789,7 +789,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } - b = sector_to_bucket(ca, ob->ptr.offset); + b = ob->bucket; } if (c) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1d1e2c6fc2e2..07e9b214bcb5 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -10,6 +10,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "alloc_foreground.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_io.h" @@ -723,7 +724,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) memset(nr, 0, sizeof(nr)); for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].type]++; + nr[c->open_buckets[i].data_type]++; pr_buf(out, "\t\t buckets\t sectors fragmented\n" -- cgit From 9ddffaf83b5ac7cf79917cfe9a1435cc07d071b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 21:43:29 -0500 Subject: bcachefs: Put open_buckets in a hashtable This is so that the copygc code doesn't have to refer to bucket_mark.owned_by_allocator - assisting in getting rid of the in memory bucket array. 
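For reference, the lookup side of the new table (taken from the diff below): open buckets are chained through hash slots keyed on (device, bucket), so callers such as copygc can ask whether a bucket is currently open without consulting bucket_mark:

	static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
	{
		open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);

		while (slot) {
			struct open_bucket *ob = &c->open_buckets[slot];

			if (ob->dev == dev && ob->bucket == bucket)
				return true;

			slot = ob->hash;
		}

		return false;
	}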
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 30 ++++++++++++++++++++++++++++-- fs/bcachefs/alloc_foreground.h | 24 ++++++++++++++++++++++++ fs/bcachefs/alloc_types.h | 4 ++++ fs/bcachefs/bcachefs.h | 2 ++ 4 files changed, 58 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 35a2683d8807..7506d54c854b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -43,6 +43,29 @@ * reference _after_ doing the index update that makes its allocation reachable. */ +static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + ob->hash = *slot; + *slot = idx; +} + +static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + while (*slot != idx) { + BUG_ON(!*slot); + slot = &c->open_buckets[*slot].hash; + } + + *slot = ob->hash; + ob->hash = 0; +} + void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); @@ -63,6 +86,8 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); + ob->freelist = c->open_buckets_freelist; c->open_buckets_freelist = ob - c->open_buckets; @@ -100,7 +125,6 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } - static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) @@ -253,6 +277,9 @@ out: ob->bucket = b; spin_unlock(&ob->lock); + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); + if (c->blocked_allocate_open_bucket) { bch2_time_stats_update( &c->times[BCH_TIME_blocked_allocate_open_bucket], @@ -267,7 +294,6 @@ out: c->blocked_allocate = 0; } - ca->nr_open_buckets++; spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 39d8ae5bbb96..d466bda9afc8 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -91,6 +91,30 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } +static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, + unsigned dev, u64 bucket) +{ + return c->open_buckets_hash + + (jhash_3words(dev, bucket, bucket >> 32, 0) & + (OPEN_BUCKETS_COUNT - 1)); +} + +static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) +{ + open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); + + while (slot) { + struct open_bucket *ob = &c->open_buckets[slot]; + + if (ob->dev == dev && ob->bucket == bucket) + return true; + + slot = ob->hash; + } + + return false; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index bd173c7c334b..409232e3d998 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -37,12 +37,16 @@ typedef FIFO(long) alloc_fifo; #define WRITE_POINT_HASH_NR 32 #define WRITE_POINT_MAX 32 +/* + * 0 is never a valid open_bucket_idx_t: + */ typedef u16 open_bucket_idx_t; struct open_bucket { spinlock_t lock; 
atomic_t pin; open_bucket_idx_t freelist; + open_bucket_idx_t hash; /* * When an open bucket has an ec_stripe attached, this is the index of diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f41d9b3ac483..6c686be28b39 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -756,10 +756,12 @@ struct bch_fs { struct closure_waitlist freelist_wait; u64 blocked_allocate; u64 blocked_allocate_open_bucket; + open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; struct closure_waitlist open_buckets_wait; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; struct write_point btree_write_point; struct write_point rebalance_write_point; -- cgit From 47ac34ec988f01e1e0d00a5281abe0812bad4fcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 22:37:19 -0500 Subject: bcachefs: Separate out gc_bucket() Since the main in memory bucket array is going away, we don't want to be calling bucket() or __bucket() when what we want is the GC in-memory bucket. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 14 +++++----- fs/bcachefs/buckets.c | 76 ++++++++++++++++++++++++-------------------------- fs/bcachefs/buckets.h | 18 +++++++++--- 3 files changed, 57 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d525a3045766..fcad6e38a599 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -504,8 +504,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, */ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -643,14 +643,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); ptr->gen = g->mark.gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && @@ -737,7 +737,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, ptrs = bch2_bkey_ptrs_c(*k); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); if (gen_after(g->oldest_gen, ptr->gen)) g->oldest_gen = ptr->gen; @@ -1753,7 +1753,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + struct bucket *g = PTR_BUCKET(ca, ptr); if (gen_after(g->mark.gen, ptr->gen) > 16) { percpu_up_read(&c->mark_lock); @@ -1763,7 +1763,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, 
ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + struct bucket *g = PTR_BUCKET(ca, ptr); if (gen_after(g->gc_gen, ptr->gen)) g->gc_gen = ptr->gen; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1959601fe056..c1b0d0be07a6 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -344,13 +344,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bucket_mark old, - struct bucket_mark new) -{ - return is_available_bucket(old) && - !is_available_bucket(new); -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -659,7 +652,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, return; percpu_down_read(&c->mark_lock); - g = __bucket(ca, b, true); + g = gc_bucket(ca, b); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; overflow = checked_add(new.dirty_sectors, sectors); @@ -779,17 +772,18 @@ static int mark_stripe_bucket(struct btree_trans *trans, enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g; struct bucket_mark new, old; char buf[200]; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); - g = PTR_BUCKET(ca, ptr, gc); + g = PTR_GC_BUCKET(ca, ptr); if (g->mark.dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { @@ -823,7 +817,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -859,7 +853,6 @@ static int bch2_mark_pointer(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bucket_mark old, new; @@ -869,8 +862,10 @@ static int bch2_mark_pointer(struct btree_trans *trans, u64 v; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + percpu_down_read(&c->mark_lock); - g = PTR_BUCKET(ca, &p.ptr, gc); + g = PTR_GC_BUCKET(ca, &p.ptr); v = atomic64_read(&g->_mark.v); do { @@ -900,9 +895,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); - - BUG_ON(!gc && bucket_became_unavailable(old, new)); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -916,37 +909,35 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bch_replicas_padded r; + struct gc_stripe *m; - if (!gc) { - BUG(); - } else { - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - - if (!m) - return -ENOMEM; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); - spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - if (!m || !m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } 
+ if (!m) + return -ENOMEM; - m->block_sectors[p.block] += sectors; + spin_lock(&c->ec_stripes_heap_lock); - r = m->r; + if (!m || !m->alive) { spin_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + bch2_inconsistent_error(c); + return -EIO; } + m->block_sectors[p.block] += sectors; + + r = m->r; + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + return 0; } @@ -954,7 +945,6 @@ static int bch2_mark_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -972,6 +962,8 @@ static int bch2_mark_extent(struct btree_trans *trans, bool stale; int ret; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -992,7 +984,7 @@ static int bch2_mark_extent(struct btree_trans *trans, if (p.ptr.cached) { if (!stale) { ret = update_cached_sectors(c, k, p.ptr.dev, - disk_sectors, journal_seq, gc); + disk_sectors, journal_seq, true); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); return ret; @@ -1017,7 +1009,7 @@ static int bch2_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) { - ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); if (ret) { char buf[200]; @@ -1168,6 +1160,8 @@ static int bch2_mark_reservation(struct btree_trans *trans, unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; @@ -1242,6 +1236,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { idx -= le32_to_cpu(p.v->front_pad); end += le32_to_cpu(p.v->back_pad); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index afa29d6c5a73..61baaa66b6b5 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -53,6 +53,11 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) return buckets->b + b; } +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, true); +} + static inline struct bucket *bucket(struct bch_dev *ca, size_t b) { return __bucket(ca, b, false); @@ -75,10 +80,15 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, } static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - bool gc) + const struct bch_extent_ptr *ptr) +{ + return bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); + return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); } static inline enum bch_data_type ptr_data_type(const struct bkey *k, @@ -113,7 +123,7 @@ static inline u8 ptr_stale(struct bch_dev *ca, u8 ret; rcu_read_lock(); - ret = gen_after(PTR_BUCKET(ca, ptr, 0)->mark.gen, ptr->gen); + ret = 
gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); rcu_read_unlock(); return ret; -- cgit From a786087744fcff140ecce0e1dd93a43186edf8ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 19:55:34 -0500 Subject: bcachefs: New in-memory array for bucket gens The main in-memory bucket array is going away, but we'll still need to keep bucket generations in memory, at least for now - ptr_stale() needs to be an efficient operation. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ fs/bcachefs/alloc_foreground.c | 4 ++-- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 43 ++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/buckets.h | 20 +++++++++++++++++++- fs/bcachefs/buckets_types.h | 7 +++++++ 6 files changed, 72 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8831b2a0303a..45becfb1ffe9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -354,6 +354,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); + *bucket_gen(ca, k.k->p.offset) = u.gen; g->_mark.gen = u.gen; g->_mark.data_type = u.data_type; g->_mark.dirty_sectors = u.dirty_sectors; @@ -748,6 +749,7 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { BUG_ON(m.data_type); bucket_cmpxchg(g, m, m.gen++); + *bucket_gen(ca, b) = m.gen; percpu_up_read(&c->mark_lock); goto out; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7506d54c854b..e2038032b872 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -161,7 +161,7 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) open_bucket_for_each(c, obs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - BUG_ON(bucket(ca, ob->bucket)->mark.gen != ob->gen); + BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); } rcu_read_unlock(); #endif @@ -273,7 +273,7 @@ out: ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; - ob->gen = bucket(ca, b)->mark.gen; + ob->gen = *bucket_gen(ca, b); ob->bucket = b; spin_unlock(&ob->lock); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6c686be28b39..c282086079fb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -445,6 +445,7 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; + struct bucket_gens *bucket_gens; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c1b0d0be07a6..4b7fe4a5def9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -535,6 +535,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } +static inline u8 bkey_alloc_gen(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_alloc: + return bkey_s_c_to_alloc(k).v->gen; + case KEY_TYPE_alloc_v2: + return bkey_s_c_to_alloc_v2(k).v->gen; + case KEY_TYPE_alloc_v3: + return bkey_s_c_to_alloc_v3(k).v->gen; + default: + return 0; + } +} + static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -573,9 +587,13 @@ static int bch2_mark_alloc(struct btree_trans *trans, if (new.k->p.offset >= ca->mi.nbuckets) return 0; + u 
= bch2_alloc_unpack(new); + percpu_down_read(&c->mark_lock); + if (!gc && u.gen != bkey_alloc_gen(old)) + *bucket_gen(ca, new.k->p.offset) = u.gen; + g = __bucket(ca, new.k->p.offset, gc); - u = bch2_alloc_unpack(new); old_m = bucket_cmpxchg(g, m, ({ m.gen = u.gen; @@ -2131,9 +2149,18 @@ static void buckets_free_rcu(struct rcu_head *rcu) buckets->nbuckets * sizeof(struct bucket)); } +static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ + struct bucket_gens *buckets = + container_of(rcu, struct bucket_gens, rcu); + + kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); +} + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; @@ -2157,6 +2184,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || + !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || @@ -2169,6 +2198,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; bch2_copygc_stop(c); @@ -2179,6 +2210,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } old_buckets = bucket_array(ca); + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { size_t n = min(buckets->nbuckets, old_buckets->nbuckets); @@ -2186,13 +2218,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); - buckets = old_buckets; + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + buckets = old_buckets; + bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); @@ -2226,6 +2263,8 @@ err: free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 61baaa66b6b5..6eeb95068b3b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -63,6 +63,24 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } +static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->bucket_gens, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); + +} + +static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + return gens->b + b; +} + /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
@@ -123,7 +141,7 @@ static inline u8 ptr_stale(struct bch_dev *ca, u8 ret; rcu_read_lock(); - ret = gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); + ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); rcu_read_unlock(); return ret; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index b2de2995c5e7..18bca269b750 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -52,6 +52,13 @@ struct bucket_array { struct bucket b[]; }; +struct bucket_gens { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + u8 b[]; +}; + struct bch_dev_usage { u64 buckets_ec; u64 buckets_unavailable; -- cgit From 36f035e90804d30dba4336daafe1b89c9a8ffe98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Dec 2021 21:41:09 -0500 Subject: bcachefs: Fix allocator + journal interaction The allocator needs to wait until the last update touching a bucket has been commited before writing to it again. However, the code was checking against the last dirty journal sequence number, not the last flushed journal sequence number. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/buckets.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 45becfb1ffe9..30bf363d2ff3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -594,7 +594,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) buckets = bucket_array(ca); ca->alloc_heap.used = 0; now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.last_seq_ondisk; + last_seq_ondisk = c->journal.flushed_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4b7fe4a5def9..917575597ce5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -50,7 +50,7 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, void bch2_bucket_seq_cleanup(struct bch_fs *c) { u64 journal_seq = atomic64_read(&c->journal.seq); - u16 last_seq_ondisk = c->journal.last_seq_ondisk; + u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; struct bch_dev *ca; struct bucket_array *buckets; struct bucket *g; -- cgit From 13f914ecb99b84e6c10cbbeba375e41735239828 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Dec 2021 22:27:10 -0500 Subject: bcachefs: Kill bch2_ec_mem_alloc() bch2_ec_mem_alloc() was only used by GC, and there's no real need to preallocate the stripes radix tree since we can cope fine with memory allocation failure when we use the radix tree. This deletes a fair bit of code, and it's also needed for the upcoming patch because bch2_btree_iter_peek_prev() won't be working before journal replay completes (and using it was incorrect previously, as well). 
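The pattern this relies on (condensed from the buckets.c change below) is to allocate the radix tree slot at the point of use and fail gracefully on -ENOMEM, rather than preallocating the whole stripes radix tree up front:

	struct gc_stripe *m =
		genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

	if (!m)
		return -ENOMEM;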
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 ------- fs/bcachefs/buckets.c | 6 +++++- fs/bcachefs/ec.c | 40 ---------------------------------------- fs/bcachefs/ec.h | 2 -- 4 files changed, 5 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index fcad6e38a599..12f2faca4fa3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1270,7 +1270,6 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca = NULL; unsigned i; - int ret; BUG_ON(c->usage_gc); @@ -1302,12 +1301,6 @@ static int bch2_gc_start(struct bch_fs *c, } } - ret = bch2_ec_mem_alloc(c, true); - if (ret) { - bch_err(c, "error allocating ec gc mem"); - return ret; - } - percpu_down_write(&c->mark_lock); /* diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 917575597ce5..a0b455b343ac 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1094,7 +1094,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); } } else { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx); + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) + return -ENOMEM; /* * This will be wrong when we bring back runtime gc: we should diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 05f55b74d641..16e1fb845ce5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1608,46 +1608,6 @@ int bch2_stripes_read(struct bch_fs *c) return ret; } -int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - size_t i, idx = 0; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); - - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (!ret && k.k) - idx = k.k->p.offset + 1; - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - if (ret) - return ret; - - if (!idx) - return 0; - - if (!gc && - !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), - GFP_KERNEL)) - return -ENOMEM; -#if 0 - ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -#else - for (i = 0; i < idx; i++) - if (!gc - ? 
!genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL) - : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL)) - return -ENOMEM; -#endif - return 0; -} - void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 468141072bb4..78d468c7680a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -217,8 +217,6 @@ void bch2_stripes_heap_start(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -int bch2_ec_mem_alloc(struct bch_fs *, bool); - void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); -- cgit From 862bfd5062a0a512369ae647b94310ee873f95f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 19:58:12 -0500 Subject: bcachefs: Update sysfs compression_stats for snapshots - BTREE_ITER_ALL_SNAPSHOTS flag is required here - change it to also walk the reflink btree - change it to accumulate stats for all pointers in an extent - change it to account for incompressible data Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 85 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 07e9b214bcb5..6d1596322ee2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -266,8 +266,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + enum btree_id id; + u64 nr_uncompressed_extents = 0, nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; int ret; @@ -277,47 +281,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) - if (k.k->type == KEY_TYPE_extent) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + for (id = 0; id < BTREE_ID_NR; id++) { + if (!((1U << id) & BTREE_ID_HAS_PTRS)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) { - if (!crc_is_compressed(p.crc)) { - nr_uncompressed_extents++; - uncompressed_sectors += e.k->size; - } else { - nr_compressed_extents++; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + incompressible_sectors += k.k->size; + break; + default: compressed_sectors_compressed += p.crc.compressed_size; compressed_sectors_uncompressed += p.crc.uncompressed_size; + compressed = true; + break; } - - /* only looking at the first ptr */ - break; } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(&trans, &iter); + } bch2_trans_exit(&trans); + if 
(ret) return ret; - pr_buf(out, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); + pr_buf(out, "uncompressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); + pr_buf(out, " size: "); + bch2_hprint(out, uncompressed_sectors << 9); + pr_buf(out, "\n"); + + pr_buf(out, "compressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); + pr_buf(out, " compressed size: "); + bch2_hprint(out, compressed_sectors_compressed << 9); + pr_buf(out, "\n"); + pr_buf(out, " uncompressed size: "); + bch2_hprint(out, compressed_sectors_uncompressed << 9); + pr_buf(out, "\n"); + + pr_buf(out, "incompressible:\n"); + pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); + pr_buf(out, " size: "); + bch2_hprint(out, incompressible_sectors << 9); + pr_buf(out, "\n"); return 0; } -- cgit From d93cf6858fe67a84e90af76994a2fe4b390c0a0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 20:05:07 -0500 Subject: bcachefs: Run scan_old_btree_nodes after version upgrade In the recovery path, we scan for old btree nodes if we don't have certain compat bits set. If we do this, we should be doing it after we upgraded to the newest on disk format. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ffa8ab933a11..f6dc557b7439 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1298,33 +1298,14 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || - le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { - struct bch_move_stats stats; - - bch_move_stats_init(&stats, "recovery"); - - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - mutex_lock(&c->sb_lock); /* * With journal replay done, we can clear the journal seq blacklist * table: */ BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written); - - bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); + if (le16_to_cpu(c->sb.version_min) >= bcachefs_metadata_version_btree_ptr_sectors_written) + bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); @@ -1349,6 +1330,24 @@ use_clean: bch2_write_super(c); mutex_unlock(&c->sb_lock); + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = 
bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + ret = 0; out: set_bit(BCH_FS_FSCK_DONE, &c->flags); -- cgit From c7ce813fe49a58344ba11219c0bd3a2fdb2e8b9e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 20:45:07 -0500 Subject: bcachefs: Add a tracepoint for the btree cache shrinker This is to help with diagnosing why the btree node cache doesn't seem to be shrinking - we've had issues in the past with granularity/batch size, since btree nodes are so big. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 +++++++++-- fs/bcachefs/trace.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5bf493a315ca..40061887f5d8 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -275,6 +275,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned long freed = 0; unsigned i, flags; + unsigned long ret = SHRINK_STOP; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -283,7 +284,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) - return -1; + goto out_norestore; flags = memalloc_nofs_save(); @@ -358,8 +359,14 @@ restart: mutex_unlock(&bc->lock); out: + ret = (unsigned long) freed * btree_pages(c); memalloc_nofs_restore(flags); - return (unsigned long) freed * btree_pages(c); +out_norestore: + trace_btree_cache_scan(sc->nr_to_scan, + sc->nr_to_scan / btree_pages(c), + btree_cache_can_free(bc), + ret); + return ret; } static unsigned long bch2_btree_cache_count(struct shrinker *shrink, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 21d026277540..a1122fa3ccc6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -318,6 +318,34 @@ DEFINE_EVENT(btree_node, btree_set_root, TP_ARGS(c, b) ); +TRACE_EVENT(btree_cache_scan, + TP_PROTO(unsigned long nr_to_scan_pages, + unsigned long nr_to_scan_nodes, + unsigned long can_free_nodes, + long ret), + TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), + + TP_STRUCT__entry( + __field(unsigned long, nr_to_scan_pages ) + __field(unsigned long, nr_to_scan_nodes ) + __field(unsigned long, can_free_nodes ) + __field(long, ret ) + ), + + TP_fast_assign( + __entry->nr_to_scan_pages = nr_to_scan_pages; + __entry->nr_to_scan_nodes = nr_to_scan_nodes; + __entry->can_free_nodes = can_free_nodes; + __entry->ret = ret; + ), + + TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", + __entry->nr_to_scan_pages, + __entry->nr_to_scan_nodes, + __entry->can_free_nodes, + __entry->ret) +); + /* Garbage collection */ DEFINE_EVENT(btree_node, btree_gc_rewrite_node, -- cgit From 5b2e599f506891eec8163c1d90800168a0016d14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 23:51:48 -0500 Subject: bcachefs: bch2_journal_noflush_seq() Add bch2_journal_noflush_seq(), for telling the journal that entries before a given sequence number should not be flushed - to be used by an upcoming allocator optimization.
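A simplified userspace model of what the new helper does (assumed names and layout, not the kernel implementation): walk the not-yet-written journal entries older than the given sequence number and mark them no-flush, unless the oldest one is already in flight as a flush write:

#include <stdbool.h>
#include <stdint.h>

struct jbuf {
	bool	noflush;	/* entry will be written without a flush/FUA */
	bool	in_flight;	/* write already submitted */
};

static bool journal_noflush_seq(struct jbuf *bufs, uint64_t first_unwritten,
				uint64_t flushed_ondisk, uint64_t seq)
{
	if (seq <= flushed_ondisk)
		return false;	/* everything before @seq is already flushed */

	for (uint64_t s = first_unwritten; s < seq; s++) {
		struct jbuf *buf = &bufs[s - first_unwritten];

		/* oldest unwritten entry already going out as a flush write: give up */
		if (s == first_unwritten && buf->in_flight && !buf->noflush)
			return false;

		buf->noflush = true;
	}

	return true;
}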
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/bcachefs/journal.h | 1 + fs/bcachefs/journal_io.c | 7 ++++--- 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 4fadb41c4c1e..3c7dce3b31c1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -704,6 +704,44 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2cfb6c7f0d14..17f9037b404a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -475,6 +475,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bda605095825..4f8dd0130b37 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1396,9 +1396,10 @@ void bch2_journal_write(struct closure *cl) spin_lock(&j->lock); if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - !w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + (w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; -- cgit From 8d65e475b20610854419fef8dba155200b45a687 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Feb 2022 02:32:11 -0500 Subject: bcachefs: Always check for bucket reuse after read Since dirty extents can be moved or overwritten, it's not just cached data that we need the ptr_stale() check in bch2_read_endio for - this fixes data checksum errors seen in the tiering ktest tests.
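The staleness test itself is a wrapping 8-bit generation comparison between the pointer and the bucket it points into; a minimal standalone sketch of that check (illustrative, mirroring the ptr_stale() helper shown earlier):

#include <stdbool.h>
#include <stdint.h>

/* Bucket gens are 8 bits and wrap, so compare via the signed difference. */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	int8_t d = (int8_t)(a - b);

	return d > 0 ? (uint8_t)d : 0;
}

/*
 * After any read completes - dirty extents as well as cached ones - the
 * bucket's current gen is compared against the gen stored in the pointer;
 * a newer bucket gen means the bucket was reused and the read must be
 * retried from an up-to-date copy of the key.
 */
static bool ptr_is_stale(uint8_t bucket_gen, uint8_t ptr_gen)
{
	return gen_after(bucket_gen, ptr_gen) != 0;
}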
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d69e63e519bf..f172da922904 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1972,9 +1972,8 @@ static void bch2_read_endio(struct bio *bio) return; } - if (rbio->pick.ptr.cached && - (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr))) { + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) -- cgit From e3ad29379e47014461d540629628c2cc158c025d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 23:56:13 -0500 Subject: bcachefs: Optimize bucket reuse If the btree updates pointing to a bucket were never flushed by the journal before the bucket became empty again, we can reuse the bucket without a journal flush. This tweaks the tracking of journal sequence numbers in alloc keys to implement this optimization: now, we only update the journal sequence number in alloc keys on transitions to and from empty. When a bucket becomes empty, we check if we can tell the journal not to flush entries starting from when the bucket was used. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 64 ++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a0b455b343ac..869f6dd19d08 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -535,20 +535,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } -static inline u8 bkey_alloc_gen(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_alloc: - return bkey_s_c_to_alloc(k).v->gen; - case KEY_TYPE_alloc_v2: - return bkey_s_c_to_alloc_v2(k).v->gen; - case KEY_TYPE_alloc_v3: - return bkey_s_c_to_alloc_v3(k).v->gen; - default: - return 0; - } -} - static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -556,16 +542,13 @@ static int bch2_mark_alloc(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked u; + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; int ret = 0; - /* We don't do anything for deletions - do we?: */ - if (!bkey_is_alloc(new.k)) - return 0; - /* * alloc btree is read in by bch2_alloc_read, not gc: */ @@ -573,13 +556,24 @@ static int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - if (flags & BTREE_TRIGGER_INSERT) { + if ((flags & BTREE_TRIGGER_INSERT) && + !old_u.data_type != !new_u.data_type && + new.k->type == KEY_TYPE_alloc_v3) { struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + u64 old_journal_seq = le64_to_cpu(v->journal_seq); BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_alloc_v3); - v->journal_seq = cpu_to_le64(journal_seq); + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + new_u.journal_seq = !new_u.data_type && + 
(journal_seq == old_journal_seq || + bch2_journal_noflush_seq(&c->journal, old_journal_seq)) + ? 0 : journal_seq; + v->journal_seq = cpu_to_le64(new_u.journal_seq); } ca = bch_dev_bkey_exists(c, new.k->p.inode); @@ -587,20 +581,18 @@ static int bch2_mark_alloc(struct btree_trans *trans, if (new.k->p.offset >= ca->mi.nbuckets) return 0; - u = bch2_alloc_unpack(new); - percpu_down_read(&c->mark_lock); - if (!gc && u.gen != bkey_alloc_gen(old)) - *bucket_gen(ca, new.k->p.offset) = u.gen; + if (!gc && new_u.gen != old_u.gen) + *bucket_gen(ca, new.k->p.offset) = new_u.gen; g = __bucket(ca, new.k->p.offset, gc); old_m = bucket_cmpxchg(g, m, ({ - m.gen = u.gen; - m.data_type = u.data_type; - m.dirty_sectors = u.dirty_sectors; - m.cached_sectors = u.cached_sectors; - m.stripe = u.stripe != 0; + m.gen = new_u.gen; + m.data_type = new_u.data_type; + m.dirty_sectors = new_u.dirty_sectors; + m.cached_sectors = new_u.cached_sectors; + m.stripe = new_u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -610,12 +602,12 @@ static int bch2_mark_alloc(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; + g->io_time[READ] = new_u.read_time; + g->io_time[WRITE] = new_u.write_time; + g->oldest_gen = new_u.oldest_gen; g->gen_valid = 1; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; percpu_up_read(&c->mark_lock); /* -- cgit From 042b0f38642f534fe92122c5b5695739390bbe34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Dec 2021 16:01:25 -0500 Subject: bcachefs: bch2_hprint(): don't print decimal if conversion was exact There's places where we parse these numbers, and our parsing doesn't cope with decimals currently - this is a hack to get the device_add path working again where for the device blocksize there doesn't ever need to be a decimal. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 8211c9a1b6cb..e7675b4597db 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -114,7 +114,7 @@ void bch2_hprint(struct printbuf *buf, s64 v) * 103 is magic: t is in the range [-1023, 1023] and we want * to turn it into [-9, 9] */ - if (u && v < 100 && v > -100) + if (u && t && v < 100 && v > -100) pr_buf(buf, ".%i", t / 103); if (u) pr_buf(buf, "%c", si_units[u]); -- cgit From e8536925884144f1966de2628f78c0b2a295d247 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Dec 2021 16:31:57 -0500 Subject: bcachefs: Improve error messages in device add path This converts the error messages in the device add to a better style, and adds some missing ones. 
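In miniature, the error-reporting style being adopted looks like the sketch below (log_err() and the stub functions are stand-ins, not bcachefs helpers): each failure is reported at the point it happens, with the step and error code, instead of setting a generic string that is printed once at the exit label:

#include <stdio.h>

#define log_err(ctx, fmt, ...) \
	fprintf(stderr, "%s: " fmt "\n", (ctx), ##__VA_ARGS__)

static int read_super(const char *path)	{ (void)path; return 0; }	/* stub */
static int validate_super(void)		{ return 0; }			/* stub */

static int dev_add(const char *ctx, const char *path)
{
	int ret = read_super(path);
	if (ret) {
		log_err(ctx, "device add error: error reading super: %i", ret);
		return ret;
	}

	ret = validate_super();
	if (ret) {
		log_err(ctx, "device add error: error validating super: %i", ret);
		return ret;
	}

	return 0;
}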
Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ec55ab865b8f..574c336e108b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1605,18 +1605,24 @@ int bch2_dev_add(struct bch_fs *c, const char *path) int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); return ret; + } err = bch2_sb_validate(&sb); - if (err) + if (err) { + bch_err(c, "device add error: error validating super: %s", err); return -EINVAL; + } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; err = bch2_dev_may_add(sb.sb, c); - if (err) + if (err) { + bch_err(c, "device add error: %s", err); return -EINVAL; + } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { @@ -1630,24 +1636,27 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } - err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); - if (ret) + if (ret) { + bch_err(c, "device add error: journal alloc failed"); goto err; + } down_write(&c->state_lock); mutex_lock(&c->sb_lock); - err = "insufficient space in new superblock"; ret = bch2_sb_from_fs(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: new device superblock too small"); goto err_unlock; + } mi = bch2_sb_get_members(ca->disk_sb.sb); if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { + bch_err(c, "device add error: new device superblock too small"); ret = -ENOSPC; goto err_unlock; } @@ -1660,7 +1669,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - err = "no slots available in superblock"; + bch_err(c, "device add error: already have maximum number of devices"); ret = -ENOSPC; goto err_unlock; @@ -1669,12 +1678,12 @@ have_slot: u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - ret = -ENOSPC; - mi = bch2_sb_resize_members(&c->disk_sb, u64s); - if (!mi) + if (!mi) { + bch_err(c, "device add error: no room in superblock for member info"); + ret = -ENOSPC; goto err_unlock; + } /* success: */ @@ -1690,17 +1699,20 @@ have_slot: bch2_dev_usage_journal_reserve(c); - err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: error marking new superblock: %i", ret); goto err_late; + } ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) { ret = __bch2_dev_read_write(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: error going RW on new device: %i", ret); goto err_late; + } } up_write(&c->state_lock); @@ -1713,11 +1725,9 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - bch_err(c, "Unable to add device: %s", err); return ret; err_late: up_write(&c->state_lock); - bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } -- cgit From 74ef5b0d3f3f8e290686b309ae595559a8963fde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Dec 2021 11:27:47 -0500 Subject: bcachefs: Fix keylist size in btree_update This fixes a buffer overrun, fortunately caught by a BUG_ON(). 
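The overrun is easy to model: a fixed backing array sized by the value-only maximum is too small once whole keys (header plus value) are pushed into it. A hedged sketch with made-up sizes, not the bcachefs constants:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAX_NODES	4
#define VAL_U64S	4			/* value alone, in u64s (illustrative) */
#define KEY_U64S	(VAL_U64S + 1)		/* header + value: what actually gets stored */

struct keylist {
	uint64_t	*top;
	uint64_t	buf[MAX_NODES * KEY_U64S];	/* sizing by VAL_U64S would overrun */
};

static void keylist_init(struct keylist *l)
{
	l->top = l->buf;
}

static void keylist_push(struct keylist *l, const uint64_t *k, unsigned u64s)
{
	/* the equivalent of the BUG_ON() that caught the original overrun */
	assert(l->top + u64s <= l->buf + MAX_NODES * KEY_U64S);

	memcpy(l->top, k, u64s * sizeof(uint64_t));
	l->top += u64s;
}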
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 8cf59cee6e4e..8dc86fa636d6 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -82,12 +82,12 @@ struct btree_update { /* Nodes being freed: */ struct keylist old_keys; u64 _old_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* Nodes being added: */ struct keylist new_keys; u64 _new_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* New nodes, that will be made reachable by this update: */ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; -- cgit From 200472e91c6c6745e6ddf42d1b33265f84b26e68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 21:28:50 -0500 Subject: bcachefs: Add an error message for copygc spinning Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 346b9ee667ec..46a0875135d5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -205,6 +205,11 @@ static int bch2_copygc(struct bch_fs *c) up_read(&ca->bucket_lock); } + if (!h->used) { + bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); + return 0; + } + /* * Our btree node allocations also come out of RESERVE_MOVINGGC: */ -- cgit From d248ee5637d4cc7952e9e2ad5a6a9099b2d54c48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Dec 2021 13:49:34 -0500 Subject: bcachefs: Add iter_flags arg to bch2_btree_delete_range() Will be used by the new snapshot tests, to pass in BTREE_ITER_ALL_SNAPSHOTS. 
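For callers this just means passing iterator flags straight through; for example, a whole-btree delete across all snapshots, along the lines of the updated test-suite hunks further down, looks roughly like:

	ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
				      POS_MIN, SPOS_MAX,
				      BTREE_ITER_ALL_SNAPSHOTS,
				      NULL);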
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/btree_update_leaf.c | 6 ++++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/quota.c | 6 +++--- fs/bcachefs/super.c | 2 +- fs/bcachefs/tests.c | 12 +++++++----- 6 files changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a61b64fc0859..7d16c35112f3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -63,7 +63,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 50c9caa729ff..e3e5b17e9191 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1475,7 +1475,7 @@ retry: */ delete.k.p = iter.pos; - if (btree_node_type_is_extents(id)) { + if (iter.flags & BTREE_ITER_IS_EXTENTS) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); @@ -1512,8 +1512,10 @@ retry: */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, + unsigned iter_flags, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, + iter_flags, journal_seq)); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 16e1fb845ce5..e18d2ecf7f07 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -677,7 +677,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx) return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), POS(0, idx + 1), - NULL); + 0, NULL); } static void ec_stripe_delete_work(struct work_struct *work) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8f8f4b0accd6..54bb2a454a5e 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -570,7 +570,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -582,7 +582,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -594,7 +594,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 574c336e108b..d92bb50d0960 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1484,7 +1484,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), - NULL); + 0, NULL); } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 145b85320d22..0247309a25e6 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -14,14 +14,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; 
ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - SPOS(0, U64_MAX, U32_MAX), + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - SPOS(0, U64_MAX, U32_MAX), + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, NULL); BUG_ON(ret); } @@ -749,7 +749,9 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS_MAX, NULL); + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, + NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); return ret; -- cgit From 5ba2fd1145444b354ee4d014e3766f642ac14d6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Dec 2021 15:55:25 -0500 Subject: bcachefs: Journal replay doesn't resort main list of keys The upcoming BTREE_ITER_WITH_JOURNAL patch will require journal keys to stay in sorted order, so the btree iterator code can overlay them over btree keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 67 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f6dc557b7439..0b923037d236 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -561,8 +561,8 @@ static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) static int journal_sort_seq_cmp(const void *_l, const void *_r) { - const struct journal_key *l = _l; - const struct journal_key *r = _r; + const struct journal_key *l = *((const struct journal_key **)_l); + const struct journal_key *r = *((const struct journal_key **)_r); return cmp_int(r->level, l->level) ?: cmp_int(l->journal_seq, r->journal_seq) ?: @@ -570,19 +570,30 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) bpos_cmp(l->k->k.p, r->k->k.p); } -static int bch2_journal_replay(struct bch_fs *c, - struct journal_keys keys) +static int bch2_journal_replay(struct bch_fs *c) { + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; struct bch_dev *ca; - struct journal_key *i; + unsigned idx; + size_t i; u64 seq; - int ret, idx; + int ret; + + keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + if (!keys_sorted) + return -ENOMEM; - sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + for (i = 0; i < keys->nr; i++) + keys_sorted[i] = &keys->d[i]; - if (keys.nr) - replay_now_at(j, keys.journal_seq_base); + sort(keys_sorted, keys->nr, + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); + + if (keys->nr) + replay_now_at(j, keys->journal_seq_base); seq = j->replay_journal_seq; @@ -590,12 +601,14 @@ static int bch2_journal_replay(struct bch_fs *c, * First replay updates to the alloc btree - these will only update the * btree key cache: */ - for_each_journal_key(keys, i) { + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; + cond_resched(); - if (!i->level && i->btree_id == BTREE_ID_alloc) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i); + if (!k->level && k->btree_id == BTREE_ID_alloc) { + j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; + ret = bch2_journal_replay_key(c, k); if (ret) goto err; } @@ -609,12 +622,14 @@ static int bch2_journal_replay(struct bch_fs *c, /* * Next replay updates to interior btree nodes: */ - for_each_journal_key(keys, i) { + for (i = 0; i < keys->nr; i++)
{ + k = keys_sorted[i]; + cond_resched(); - if (i->level) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i); + if (k->level) { + j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; + ret = bch2_journal_replay_key(c, k); if (ret) goto err; } @@ -634,15 +649,17 @@ static int bch2_journal_replay(struct bch_fs *c, /* * Now replay leaf node updates: */ - for_each_journal_key(keys, i) { + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; + cond_resched(); - if (i->level || i->btree_id == BTREE_ID_alloc) + if (k->level || k->btree_id == BTREE_ID_alloc) continue; - replay_now_at(j, keys.journal_seq_base + i->journal_seq); + replay_now_at(j, keys->journal_seq_base + k->journal_seq); - ret = bch2_journal_replay_key(c, i); + ret = bch2_journal_replay_key(c, k); if (ret) goto err; } @@ -652,10 +669,14 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); + kfree(keys_sorted); + return bch2_journal_error(j); err: bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[i->btree_id], i->level); + ret, bch2_btree_ids[k->btree_id], k->level); + kfree(keys_sorted); + return ret; } @@ -1227,7 +1248,7 @@ use_clean: bch_verbose(c, "starting journal replay"); err = "journal replay failed"; - ret = bch2_journal_replay(c, c->journal_keys); + ret = bch2_journal_replay(c); if (ret) goto err; bch_verbose(c, "journal replay done"); -- cgit From f0f41a6d74f7f682327eead3708473c11577b131 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Dec 2021 20:14:52 -0500 Subject: bcachefs: Add error messages for memory allocation failures This adds some missing diagnostics from rare but annoying to debug runtime allocation failure paths. 
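The shape these paths follow, as a compact userspace sketch (illustrative names): grow by doubling, and on failure say what was being allocated and how large the request was before returning -ENOMEM:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct walker {
	void	*d;
	size_t	nr, size;	/* elements used / allocated */
	size_t	elem_size;
};

static int walker_realloc(struct walker *w, const char *what)
{
	if (w->nr == w->size) {
		size_t new_size = w->size ? w->size * 2 : 8;
		void *d = realloc(w->d, new_size * w->elem_size);

		if (!d) {
			/* report what failed and how big the request was */
			fprintf(stderr, "error allocating memory for %s, size %zu\n",
				what, new_size);
			return -ENOMEM;
		}

		w->d = d;
		w->size = new_size;
	}

	return 0;
}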
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 28 +++++++++++++++++----------- fs/bcachefs/btree_update_leaf.c | 8 ++++++-- fs/bcachefs/buckets.c | 16 ++++++++++------ fs/bcachefs/fsck.c | 25 +++++++++++++++++-------- 4 files changed, 50 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d045b3a5deed..e5029703240c 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -147,19 +147,23 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct btree_key_cache *c, +btree_key_cache_create(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(c); + ck = bkey_cached_alloc(bc); if (unlikely(!ck)) { - ck = bkey_cached_reuse(c); - if (unlikely(!ck)) + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_ids[btree_id]); return ERR_PTR(-ENOMEM); + } was_new = false; } @@ -176,7 +180,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params))) { /* We raced with another fill: */ @@ -186,15 +190,15 @@ btree_key_cache_create(struct btree_key_cache *c, six_unlock_intent(&ck->c.lock); kfree(ck); } else { - mutex_lock(&c->lock); - bkey_cached_free(c, ck); - mutex_unlock(&c->lock); + mutex_lock(&bc->lock); + bkey_cached_free(bc, ck); + mutex_unlock(&bc->lock); } return NULL; } - atomic_long_inc(&c->nr_keys); + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); @@ -205,6 +209,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; @@ -234,6 +239,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); ret = -ENOMEM; goto err; } @@ -294,8 +301,7 @@ retry: return 0; } - ck = btree_key_cache_create(&c->btree_key_cache, - path->btree_id, path->pos); + ck = btree_key_cache_create(c, path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e3e5b17e9191..a70dc68d2fba 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -308,6 +308,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, struct btree_path *path, unsigned u64s) { + struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; @@ -315,7 +316,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, EBUG_ON(path->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(trans->c) && + bch2_btree_key_cache_must_wait(c) && !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; @@ -330,8 +331,11 @@ btree_key_can_insert_cached(struct btree_trans *trans, new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, 
new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); return -ENOMEM; + } ck->u64s = new_u64s; ck->k = new_k; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 869f6dd19d08..35ab5a5d8183 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -926,9 +926,11 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, BUG_ON(!(flags & BTREE_TRIGGER_GC)); m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - - if (!m) + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.idx); return -ENOMEM; + } spin_lock(&c->ec_stripes_heap_lock); @@ -1039,7 +1041,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - size_t idx = new.k->p.offset; + u64 idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe @@ -1057,7 +1059,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, bch2_bkey_val_to_text(&PBUF(buf1), c, old); bch2_bkey_val_to_text(&PBUF(buf2), c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" "old %s\n" "new %s", idx, buf1, buf2); bch2_inconsistent_error(c); @@ -1089,9 +1091,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - if (!m) + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); return -ENOMEM; - + } /* * This will be wrong when we bring back runtime gc: we should * be unmarking the old key and then marking the new key diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 361dbf338023..43b6159be01b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -564,14 +564,17 @@ static struct inode_walker inode_walker_init(void) return (struct inode_walker) { 0, }; } -static int inode_walker_realloc(struct inode_walker *w) +static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) { if (w->nr == w->size) { size_t new_size = max_t(size_t, 8UL, w->size * 2); void *d = krealloc(w->d, new_size * sizeof(w->d[0]), GFP_KERNEL); - if (!d) + if (!d) { + bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", + new_size); return -ENOMEM; + } w->d = d; w->size = new_size; @@ -586,7 +589,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bch_inode_unpacked u; int ret; - ret = inode_walker_realloc(w); + ret = inode_walker_realloc(c, w); if (ret) return ret; @@ -647,7 +650,7 @@ found: while (i && w->d[i - 1].snapshot > pos.snapshot) --i; - ret = inode_walker_realloc(w); + ret = inode_walker_realloc(c, w); if (ret) return ret; @@ -1812,7 +1815,8 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) +static int path_down(struct bch_fs *c, struct pathbuf *p, + u64 inum, u32 snapshot) { if (p->nr == p->size) { size_t new_size = max_t(size_t, 256UL, p->size * 2); @@ -1820,6 +1824,8 @@ static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) new_size * sizeof(p->entries[0]), GFP_KERNEL); if (!n) { + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + 
new_size); return -ENOMEM; } @@ -1893,7 +1899,7 @@ static int check_path(struct btree_trans *trans, if (!S_ISDIR(inode->bi_mode)) break; - ret = path_down(p, inode->bi_inum, snapshot); + ret = path_down(c, p, inode->bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; @@ -1998,12 +2004,15 @@ struct nlink_table { } *d; }; -static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) +static int add_nlink(struct bch_fs *c, struct nlink_table *t, + u64 inum, u32 snapshot) { if (t->nr == t->size) { size_t new_size = max_t(size_t, 128UL, t->size * 2); void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); return -ENOMEM; } @@ -2093,7 +2102,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (!u.bi_nlink) continue; - ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); if (ret) { *end = k.k->p.offset; ret = 0; -- cgit From fb64f3fdac7171d1b2c62239d512b749dec9582a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Dec 2021 16:12:54 -0500 Subject: bcachefs: BCH_JSET_ENTRY_log Add a journal entry type for logging messages, and add an option to use it to log the transaction name - this makes for a very handy debugging tool, as with it we can use the 'bcachefs list_journal' command to see not only what updates were done, but what was doing them. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 9 ++++++++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_leaf.c | 44 ++++++++++++++++++++++++++++++++++++----- fs/bcachefs/journal_io.c | 8 ++++++++ fs/bcachefs/opts.h | 5 +++++ 5 files changed, 61 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 3c0ba301dad5..7eeab46d21b5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1430,6 +1430,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); /* * Features: @@ -1667,7 +1668,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(usage, 5) \ x(data_usage, 6) \ x(clock, 7) \ - x(dev_usage, 8) + x(dev_usage, 8) \ + x(log, 9) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1739,6 +1741,11 @@ struct jset_entry_dev_usage { struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 223af7848fb4..2c8b30949e6f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -380,6 +380,7 @@ struct btree_trans { bool in_traverse_all:1; bool restarted:1; bool paths_sorted:1; + bool journal_transaction_names:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a70dc68d2fba..40deafced921 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -290,6 +290,31 
@@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned u64s = JSET_ENTRY_LOG_U64s - 1; + unsigned b, buflen = u64s * sizeof(u64); + + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 0; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + b = snprintf(l->d, buflen, "%ps", (void *) trans->ip); + while (b < buflen) + l->d[b++] = '\0'; + + trans->journal_res.offset += JSET_ENTRY_LOG_U64s; + trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; +} + static inline enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, struct btree *b, @@ -451,6 +476,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } @@ -911,6 +939,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; unsigned u64s; int ret = 0; @@ -920,15 +949,20 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&trans->c->gc_lock); + lockdep_assert_held(&c->gc_lock); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->journal_u64s = trans->extra_journal_entry_u64s; trans->journal_preres_u64s = 0; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += JSET_ENTRY_LOG_U64s; + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&trans->c->writes))) { + unlikely(!percpu_ref_tryget(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; @@ -972,7 +1006,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(trans->c, trans->disk_res, + ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, (trans->flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0); @@ -991,10 +1025,10 @@ retry: if (ret) goto err; out: - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&trans->c->writes); + percpu_ref_put(&c->writes); out_reset: trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4f8dd0130b37..012b89e9d3cd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -478,6 +478,14 @@ fsck_err: return ret; } +static int journal_entry_validate_log(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return 0; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, const char *, struct jset_entry *, unsigned, int, int); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index af61fe588d3f..f7355c455b62 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -327,6 +327,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From 528b18e6d1c67ccf4ab01cdee94299f3ac61e1ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Dec 2021 17:06:29 -0500 Subject: bcachefs: bch2_journal_entry_to_text() This adds a _to_text() pretty printer for journal entries - including every subtype - which will shortly be used by the 'bcachefs list_journal' subcommand. 
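The mechanism is a per-entry-type ops table; a hedged standalone sketch of the dispatch (simplified types and names, not the on-disk jset_entry layout):

#include <stdio.h>

enum entry_type { ENTRY_btree_keys, ENTRY_log, ENTRY_NR };

struct entry {
	enum entry_type	type;
	const char	*payload;
};

static void btree_keys_to_text(const struct entry *e)
{
	printf("btree_keys: %s", e->payload);
}

static void log_to_text(const struct entry *e)
{
	printf("log: %s", e->payload);
}

struct entry_ops {
	void (*to_text)(const struct entry *);
};

/* one printer per entry type, indexed by the type number */
static const struct entry_ops ops[] = {
	[ENTRY_btree_keys]	= { .to_text = btree_keys_to_text },
	[ENTRY_log]		= { .to_text = log_to_text },
};

static void entry_to_text(const struct entry *e)
{
	if (e->type < ENTRY_NR && ops[e->type].to_text)
		ops[e->type].to_text(e);
	else
		printf("(unknown type %u)", (unsigned) e->type);
	printf("\n");
}

int main(void)
{
	struct entry e = { ENTRY_log, "transaction name" };

	entry_to_text(&e);
	return 0;
}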
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 20 ++++-- fs/bcachefs/journal_io.c | 141 ++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/journal_io.h | 6 +- fs/bcachefs/opts.c | 10 +++ fs/bcachefs/opts.h | 2 + fs/bcachefs/recovery.c | 11 ++-- fs/bcachefs/super-io.c | 6 +- 7 files changed, 169 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7eeab46d21b5..ace3df19950d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include #include #include +#include "vstructs.h" #ifdef __KERNEL__ typedef uuid_t __uuid_t; @@ -1699,11 +1700,16 @@ struct jset_entry_blacklist_v2 { __le64 end; }; +#define BCH_FS_USAGE_TYPES() \ + x(reserved, 0) \ + x(inodes, 1) \ + x(key_version, 2) + enum { - FS_USAGE_RESERVED = 0, - FS_USAGE_INODES = 1, - FS_USAGE_KEY_VERSION = 2, - FS_USAGE_NR = 3 +#define x(f, nr) BCH_FS_USAGE_##f = nr, + BCH_FS_USAGE_TYPES() +#undef x + BCH_FS_USAGE_NR }; struct jset_entry_usage { @@ -1741,6 +1747,12 @@ struct jset_entry_dev_usage { struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ + return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +} + struct jset_entry_log { struct jset_entry entry; u8 d[]; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 012b89e9d3cd..22feea751b00 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -273,7 +273,7 @@ fsck_err: return ret; } -static int journal_entry_validate_btree_keys(struct bch_fs *c, +static int journal_entry_btree_keys_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -294,7 +294,18 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, return 0; } -static int journal_entry_validate_btree_root(struct bch_fs *c, +static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct bkey_i *k; + + pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + + vstruct_for_each(entry, k) + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -322,7 +333,13 @@ fsck_err: return ret; } -static int journal_entry_validate_prio_ptrs(struct bch_fs *c, +static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -331,7 +348,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c, return 0; } -static int journal_entry_validate_blacklist(struct bch_fs *c, +static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -346,7 +368,16 @@ fsck_err: return ret; } -static int journal_entry_validate_blacklist_v2(struct bch_fs *c, +static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, + 
struct jset_entry *entry) +{ + struct jset_entry_blacklist *bl = + container_of(entry, struct jset_entry_blacklist, entry); + + pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -372,7 +403,18 @@ fsck_err: return ret; } -static int journal_entry_validate_usage(struct bch_fs *c, +static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist_v2 *bl = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + pr_buf(out, "start=%llu end=%llu", + le64_to_cpu(bl->start), + le64_to_cpu(bl->end)); +} + +static int journal_entry_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -393,7 +435,18 @@ fsck_err: return ret; } -static int journal_entry_validate_data_usage(struct bch_fs *c, +static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + pr_buf(out, "type=%s v=%llu", + bch2_fs_usage_types[u->entry.btree_id], + le64_to_cpu(u->v)); +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -415,7 +468,17 @@ fsck_err: return ret; } -static int journal_entry_validate_clock(struct bch_fs *c, +static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + bch2_replicas_entry_to_text(out, &u->r); + pr_buf(out, "=%llu", le64_to_cpu(u->v)); +} + +static int journal_entry_clock_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -441,7 +504,16 @@ fsck_err: return ret; } -static int journal_entry_validate_dev_usage(struct bch_fs *c, +static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + pr_buf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -478,7 +550,32 @@ fsck_err: return ret; } -static int journal_entry_validate_log(struct bch_fs *c, +static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); + + for (i = 0; i < nr_types; i++) { + if (i < BCH_DATA_NR) + pr_buf(out, " %s", bch2_data_types[i]); + else + pr_buf(out, " (unknown data type %u)", i); + pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + le64_to_cpu(u->d[i].buckets), + le64_to_cpu(u->d[i].sectors), + le64_to_cpu(u->d[i].fragmented)); + } + + pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", + le64_to_cpu(u->buckets_ec), + le64_to_cpu(u->buckets_unavailable)); +} + +static int journal_entry_log_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -486,15 +583,26 @@ static int journal_entry_validate_log(struct bch_fs *c, return 0; } +static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + + bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, const char *, struct jset_entry *, unsigned, int, int); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; static const struct jset_entry_ops bch2_jset_entry_ops[] = { #define x(f, nr) \ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_validate_##f, \ + .validate = journal_entry_##f##_validate, \ + .to_text = journal_entry_##f##_to_text, \ }, BCH_JSET_ENTRY_TYPES() #undef x @@ -510,6 +618,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where, : 0; } +void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + if (entry->type < BCH_JSET_ENTRY_NR) { + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_jset_entry_ops[entry->type].to_text(out, c, entry); + } else { + pr_buf(out, "(unknown type %u)", entry->type); + } +} + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index f34281a28f12..d8425fe0d67b 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -40,8 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, - unsigned, int, int); +int bch2_journal_entry_validate(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); +void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index d9ca69f2ecde..71bf26eb13d5 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -71,6 +71,16 @@ 
const char * const bch2_member_states[] = { NULL }; +const char * const bch2_jset_entry_types[] = { + BCH_JSET_ENTRY_TYPES() + NULL +}; + +const char * const bch2_fs_usage_types[] = { + BCH_FS_USAGE_TYPES() + NULL +}; + #undef x const char * const bch2_d_types[BCH_DT_MAX] = { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f7355c455b62..4b438098aecb 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -20,6 +20,8 @@ extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; extern const char * const bch2_member_states[]; +extern const char * const bch2_jset_entry_types[]; +extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; static inline const char *bch2_d_type_str(unsigned d_type) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0b923037d236..d332fd16517b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -714,15 +714,15 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case FS_USAGE_RESERVED: + case BCH_FS_USAGE_reserved: if (entry->level < BCH_REPLICAS_MAX) c->usage_base->persistent_reserved[entry->level] = le64_to_cpu(u->v); break; - case FS_USAGE_INODES: + case BCH_FS_USAGE_inodes: c->usage_base->nr_inodes = le64_to_cpu(u->v); break; - case FS_USAGE_KEY_VERSION: + case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; @@ -742,10 +742,7 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / - sizeof(struct jset_entry_dev_usage_type); - unsigned i; + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c69f25e1a867..8928f1ba5354 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1029,7 +1029,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_INODES; + u->entry.btree_id = BCH_FS_USAGE_inodes; u->v = cpu_to_le64(c->usage_base->nr_inodes); } @@ -1039,7 +1039,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->entry.btree_id = BCH_FS_USAGE_key_version; u->v = cpu_to_le64(atomic64_read(&c->key_version)); } @@ -1049,7 +1049,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.btree_id = BCH_FS_USAGE_reserved; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } -- cgit From dfd41fb9f24699393a042f9c34bd46496da1174d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Dec 2021 17:54:13 -0500 Subject: bcachefs: Fix race between btree updates & journal replay Add a flag to indicate whether a journal replay key has been overwritten, and set/test it with appropriate btree locks 
held. This fixes a race between the allocator - invalidating buckets, and doing btree updates - and journal replay, which before this patch could clobber the allocator thread's update with an older version of the key from the journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_update_leaf.c | 12 ++++++++++++ fs/bcachefs/recovery.c | 25 +++++++++++++++++++++++-- fs/bcachefs/recovery.h | 2 ++ 4 files changed, 38 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c282086079fb..9452b6cf04a5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -561,6 +561,7 @@ struct journal_keys { enum btree_id btree_id:8; unsigned level:8; bool allocated; + bool overwritten; struct bkey_i *k; u32 journal_seq; u32 journal_offset; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 40deafced921..8af9ba464b25 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -15,6 +15,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "subvolume.h" #include "replicas.h" #include "trace.h" @@ -625,6 +626,14 @@ fail: return btree_trans_restart(trans); } +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -702,6 +711,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_drop_overwrites_from_journal(trans); + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(trans, i->path, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d332fd16517b..fcacf166f900 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -185,6 +185,19 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = journal_key_search(keys, btree, level, pos); + + if (idx < keys->nr && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + !bpos_cmp(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->idx - iter->keys->nr @@ -539,8 +552,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* Must be checked with btree locked: */ + if (k->overwritten) + goto out; + + ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); +out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 1504e0bdb940..a7a9496afb95 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -37,6 +37,8 @@ int 
bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -- cgit From 2a84de33607d66b57a49581376c850dda5f3d9e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Jan 2022 18:27:50 -0500 Subject: bcachefs: Log what we're doing when repairing Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 12f2faca4fa3..77c30157792b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -691,10 +691,16 @@ found: } ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) kfree(new); - else + else { + bch2_bkey_val_to_text(&PBUF(buf), c, *k); + bch_info(c, "updated %s", buf); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf); *k = bkey_i_to_s_c(new); + } } fsck_err: return ret; -- cgit From bf15946316757a4fd23c87e51434520bd6a87f64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Jan 2022 19:04:33 -0500 Subject: bcachefs: Improve error messages in superblock write path Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8928f1ba5354..e0b1dfadacd9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -754,11 +754,24 @@ int bch2_write_super(struct bch_fs *c) closure_sync(cl); for_each_online_member(ca, c, i) { - if (!ca->sb_write_error && - ca->disk_sb.seq != - le64_to_cpu(ca->sb_read_scratch->seq)) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock write was silently dropped! (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock modified by another process"); + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); ret = -EROFS; goto out; -- cgit From cd7c2d3d8d31d4ed3076078457a3df6d9dcaede8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Jan 2022 19:46:12 -0500 Subject: bcachefs: Make sure BCH_FS_FSCK_DONE gets set If we're not running fsck we still want to set BCH_FS_FSCK_DONE, so that bch2_fsck_err() calls are interpreted as bch2_inconsistent_error() calls(). 
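Roughly, the behaviour this enables looks like the following sketch (illustrative only - this is not the actual fs/bcachefs/error.c code, and the helper name is made up):

  /*
   * Sketch: how an error found once BCH_FS_FSCK_DONE is set should be
   * treated. Hypothetical helper, for illustration within the bcachefs tree.
   */
  static int handle_post_fsck_err(struct bch_fs *c)
  {
          if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
                  /*
                   * fsck has already run (or was never going to run): a newly
                   * detected error means the filesystem is inconsistent
                   */
                  bch2_inconsistent_error(c);
                  return -EIO;
          }

          /* still in fsck: report and attempt repair as usual */
          return 0;
  }

Without this change, skipping fsck left BCH_FS_FSCK_DONE clear, so bch2_fsck_err() calls were not escalated to inconsistent errors.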
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fcacf166f900..39b5b97704b7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -613,8 +613,10 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); - if (keys->nr) + if (keys->nr) { + bch_verbose(c, "starting journal replay, %zu keys", keys->nr); replay_now_at(j, keys->journal_seq_base); + } seq = j->replay_journal_seq; @@ -1235,6 +1237,13 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + /* + * If we're not running fsck, this ensures bch2_fsck_err() calls are + * instead interpreted as bch2_inconsistent_err() calls: + */ + if (!c->opts.fsck) + set_bit(BCH_FS_FSCK_DONE, &c->flags); + if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || @@ -1434,6 +1443,7 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); -- cgit From f28620c108a9476c7b4b25b8e36b94b6b2b29295 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Jan 2022 20:45:30 -0500 Subject: bcachefs: Tweak journal reclaim order Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 39b5b97704b7..219351654564 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -116,12 +116,19 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .btree_id = id, .level = level, .k = k, - .allocated = true + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; unsigned idx = journal_key_search(keys, id, level, k->k.p); + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + if (idx < keys->nr && journal_key_cmp(&n, &keys->d[idx]) == 0) { if (keys->d[idx].allocated) -- cgit From 5222a4607cd8b9d8882e81796917c10193d10be0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 20:07:00 -0500 Subject: bcachefs: BTREE_ITER_WITH_JOURNAL This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is automatically enabled when initializing a btree iterator before journal replay has completed - it overlays the contents of the journal with the btree. This lets us delete bch2_btree_and_journal_walk() and just use the normal btree iterator interface instead - which also lets us delete a significant amount of duplicated code. Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch - we're redoing the binary search over keys in the journal every time we call bch2_btree_iter_peek(). 
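Conceptually the overlay is simple: peek() returns whichever key sorts first, preferring the journal key on a tie. A rough sketch of what btree_trans_peek_journal() in the diff below does (simplified - the real code also bounds the comparison by the btree node's max key):

  /*
   * Sketch of the overlay done by bch2_btree_iter_peek() when
   * BTREE_ITER_WITH_JOURNAL is set; function and parameter names here are
   * illustrative, not the actual interface.
   */
  static struct bkey_s_c peek_overlaid(struct bkey_s_c btree_k,
                                       struct bkey_i *journal_k)
  {
          if (journal_k &&
              (!btree_k.k ||
               bpos_cmp(journal_k->k.p, btree_k.k->p) <= 0))
                  return bkey_i_to_s_c(journal_k);   /* journal wins ties */

          return btree_k;
  }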
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 60 +++++------ fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 185 +++++++++------------------------- fs/bcachefs/btree_iter.c | 194 ++++++++++++++++++++++++++++++------ fs/bcachefs/btree_types.h | 10 +- fs/bcachefs/btree_update_interior.c | 4 + fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/ec.c | 60 ++++++----- fs/bcachefs/recovery.c | 158 ++++++++--------------------- fs/bcachefs/recovery.h | 10 +- 10 files changed, 331 insertions(+), 353 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 30bf363d2ff3..cb4b059e796c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) +int bch2_alloc_read(struct bch_fs *c) { - struct bch_fs *c = trans->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; struct bucket *g; struct bkey_alloc_unpacked u; - - if (!bkey_is_alloc(k.k)) - return 0; - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); - u = bch2_alloc_unpack(k); - - *bucket_gen(ca, k.k->p.offset) = u.gen; - g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; - g->gen_valid = 1; - - return 0; -} - -int bch2_alloc_read(struct bch_fs *c) -{ - struct btree_trans trans; int ret; bch2_trans_init(&trans, c, 0, 0); down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (!bkey_is_alloc(k.k)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); + + *bucket_gen(ca, k.k->p.offset) = u.gen; + g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; + } + bch2_trans_iter_exit(&trans, &iter); + up_read(&c->gc_lock); bch2_trans_exit(&trans); + if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9452b6cf04a5..431cf25b38db 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -860,7 +860,6 @@ mempool_t bio_bounce_pages; u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; - size_t reflink_gc_idx; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 77c30157792b..d7de00af81c9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } -static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - const __le64 *refcount = 
bkey_refcount_c(k); - char buf[200]; - int ret = 0; - - if (!refcount) - return 0; - - r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); - if (!r) - return -ENOMEM; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - r->refcount)) { - struct bkey_i *new; - - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; - } - - bkey_reassemble(new, k); - - if (!r->refcount) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - } else { - *bkey_refcount(new) = cpu_to_le64(r->refcount); - } - - ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - kfree(new); - } -fsck_err: - return ret; -} - static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bch2_trans_init(&trans, c, 0, 0); - if (initial) { - c->reflink_gc_idx = 0; - - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_done_initial_fn); - goto out; - } - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (!refcount) continue; - r = genradix_ptr(&c->reflink_gc_table, idx); + r = genradix_ptr(&c->reflink_gc_table, idx++); if (!r || r->offset != k.k->p.offset || r->size != k.k->size) { @@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, else *bkey_refcount(new) = cpu_to_le64(r->refcount); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) + : __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); @@ -1466,104 +1407,74 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } fsck_err: bch2_trans_iter_exit(&trans, &iter); -out: c->reflink_gc_nr = 0; bch2_trans_exit(&trans); return ret; } -static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) +static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, + bool metadata_only) { - struct bch_fs *c = trans->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct gc_stripe *m; const struct bch_stripe *s; char buf[200]; unsigned i; int ret = 0; - if (k.k->type != KEY_TYPE_stripe) + if (metadata_only) return 0; - s = bkey_s_c_to_stripe(k).v; + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - return 0; + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + continue; inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - m ? 
m->block_sectors[i] : 0)) { - struct bkey_i_stripe *new; + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; - } + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + break; + } - bkey_reassemble(&new->k_i, k); + bkey_reassemble(&new->k_i, k); - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); - kfree(new); + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) + : __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); + kfree(new); + } } fsck_err: - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, - bool metadata_only) -{ - struct btree_trans trans; - int ret = 0; - - if (metadata_only) - return 0; - - bch2_trans_init(&trans, c, 0, 0); - - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_gc_stripes_done_initial_fn); - } else { - BUG(); - } + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } -static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) -{ - - struct bch_fs *c = trans->c; - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - return 0; - - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - return 0; -} - static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bch2_trans_init(&trans, c, 0, 0); c->reflink_gc_nr = 0; - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_start_initial_fn); - goto out; - } - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, r->refcount = 0; } bch2_trans_iter_exit(&trans, &iter); -out: + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0b5bf75fbf89..01c130a3ce8d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "recovery.h" #include "replicas.h" #include "subvolume.h" #include "trace.h" @@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, static void btree_path_verify_new_node(struct btree_trans *trans, struct btree_path *path, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_path_level *l; unsigned plevel; bool parent_locked; @@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; + if (trans->journal_replay_not_finished) + return; + plevel = b->c.level + 1; if (!btree_path_node(path, plevel)) return; @@ -1092,7 +1097,7 @@ static void 
btree_path_verify_new_node(struct btree_trans *trans, char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_dump_btree_node(trans->c, l->b); + bch2_dump_btree_node(c, l->b); bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); @@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat return ret; } +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, struct btree_path *path, unsigned plevel, struct btree *b) @@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, btree_node_unlock(path, plevel); } +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -1321,8 +1385,21 @@ static __always_inline int btree_path_down(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (unlikely(trans->journal_replay_not_finished)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); @@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans, mark_btree_node_locked(path, level, lock_type); btree_path_level_init(trans, path, b); - if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + if (likely(!trans->journal_replay_not_finished && + tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); - if (flags & BTREE_ITER_PREFETCH) - ret = btree_path_prefetch(trans, path); - if (btree_node_read_locked(path, 
level + 1)) btree_node_unlock(path, level + 1); path->level = level; @@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) return ret; } +static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, + struct btree_path *path) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, path->btree_id, + path->level, path->pos); + + while (idx < keys->nr && keys->d[idx].overwritten) + idx++; + + return (idx < keys->nr && + keys->d[idx].btree_id == path->btree_id && + keys->d[idx].level == path->level) + ? keys->d[idx].k + : NULL; +} + +static noinline +struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path); + + if (k && !bpos_cmp(k->k.p, iter->pos)) { + iter->k = k->k; + return bkey_i_to_s_c(k); + } else { + return bkey_s_c_null; + } +} + +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *next_journal = + __btree_trans_peek_journal(trans, iter->path); + + if (next_journal && + bpos_cmp(next_journal->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + /** * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position @@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) goto out; } - next_update = btree_trans_peek_updates(iter); k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); - /* * In the btree, deleted keys sort before non deleted: */ - if (k.k && bkey_deleted(k.k) && - (!next_update || - bpos_cmp(k.k->p, next_update->k.p) <= 0)) { - search_key = k.k->p; - continue; - } + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + + next_update = btree_trans_peek_updates(iter); if (next_update && bpos_cmp(next_update->k.p, @@ -2159,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) k = bkey_i_to_s_c(next_update); } + if (k.k && bkey_deleted(k.k)) { + /* + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. + */ + search_key = bpos_cmp(search_key, k.k->p) + ? 
k.k->p + : bpos_successor(k.k->p); + continue; + } + if (likely(k.k)) { /* * We can never have a key in a leaf node at POS_MAX, so @@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - next_update = btree_trans_peek_updates(iter); - if (next_update && + if ((next_update = btree_trans_peek_updates(iter)) && !bpos_cmp(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); - } else { - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + goto out; } - if (!k.k || - ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) - ? bpos_cmp(iter->pos, k.k->p) - : bkey_cmp(iter->pos, k.k->p))) { - bkey_init(&iter->k); - iter->k.p = iter->pos; - k = (struct bkey_s_c) { &iter->k, NULL }; - } + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (k = btree_trans_peek_slot_journal(trans, iter)).k) + goto out; + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; @@ -2455,7 +2588,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = (struct bkey_s_c) { &iter->k, NULL }; } } - +out: iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); @@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_type_has_snapshots(btree_id)) flags |= BTREE_ITER_FILTER_SNAPSHOTS; + if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + iter->trans = trans; iter->path = NULL; iter->btree_id = btree_id; @@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, memset(trans, 0, sizeof(*trans)); trans->c = c; trans->ip = _RET_IP_; + trans->journal_replay_not_finished = + !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); bch2_trans_alloc_paths(trans, c); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 2c8b30949e6f..1fd0cebe30ac 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -207,10 +207,11 @@ struct btree_node_iter { #define BTREE_ITER_CACHED_NOFILL (1 << 8) #define BTREE_ITER_CACHED_NOCREATE (1 << 9) #define BTREE_ITER_WITH_UPDATES (1 << 10) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) -#define BTREE_ITER_NOPRESERVE (1 << 14) +#define BTREE_ITER_WITH_JOURNAL (1 << 11) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) +#define BTREE_ITER_NOPRESERVE (1 << 15) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -381,6 +382,7 @@ struct btree_trans { bool restarted:1; bool paths_sorted:1; bool journal_transaction_names:1; + bool journal_replay_not_finished:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6ef0711431a1..17111c4228bd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include 
"recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8af9ba464b25..e95940ffad6b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -711,7 +711,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); - if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); trans_for_each_update(trans, i) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e18d2ecf7f07..86421f65d139 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c) bch2_stripes_heap_insert(c, m, iter.pos); } -static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) +int bch2_stripes_read(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; const struct bch_stripe *s; - struct bch_fs *c = trans->c; struct stripe *m; unsigned i; - int ret = 0; + int ret; - if (k.k->type != KEY_TYPE_stripe) - return 0; + bch2_trans_init(&trans, c, 0, 0); - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - return ret; + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; - s = bkey_s_c_to_stripe(k).v; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->alive = true; - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + s = bkey_s_c_to_stripe(k).v; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); - - return ret; -} + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); -int bch2_stripes_read(struct bch_fs *c) -{ - struct btree_trans trans; - int ret; + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } + bch2_trans_iter_exit(&trans, &iter); - bch2_trans_init(&trans, c, 0, 0); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_stripes_read_fn); bch2_trans_exit(&trans); + if (ret) bch_err(c, "error reading stripes: %i", ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 219351654564..57311ad283c7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -59,23 +59,21 @@ static 
void zero_out_btree_mem_ptr(struct journal_keys *keys) static int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, - struct journal_key *r) + const struct journal_key *r) { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: bpos_cmp(l_pos, r->k->k.p)); } -static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { - return (cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p)); + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -static size_t journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +size_t bch2_journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; @@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; - unsigned idx = journal_key_search(keys, id, level, k->k.p); + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); BUG_ON(test_bit(BCH_FS_RW, &c->flags)); @@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, return 0; } +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, unsigned level, struct bkey_i *k) { @@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { struct journal_keys *keys = &c->journal_keys; - size_t idx = journal_key_search(keys, btree, level, pos); + size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->nr && keys->d[idx].btree_id == btree && @@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->idx - iter->keys->nr - ? 
iter->keys->d + iter->idx : NULL; + struct journal_key *k = iter->keys->d + iter->idx; - if (k && - k->btree_id == iter->btree_id && - k->level == iter->level) - return k->k; + while (k < iter->keys->d + iter->keys->nr && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return k->k; + + iter->idx++; + k = iter->keys->d + iter->idx; + } - iter->idx = iter->keys->nr; return NULL; } @@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->btree_id = id; iter->level = level; iter->keys = &c->journal_keys; - iter->idx = journal_key_search(&c->journal_keys, id, level, pos); - list_add(&iter->list, &c->journal_iters); + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) { memset(iter, 0, sizeof(*iter)); iter->b = b; - bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(c, &iter->journal, - b->c.btree_id, b->c.level, b->data->min_key); -} - -/* Walk btree, overlaying keys from the journal: */ - -static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, - struct btree_and_journal_iter iter) -{ - unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; - struct bkey_s_c k; - struct bkey_buf tmp; - - BUG_ON(!b->c.level); - - bch2_bkey_buf_init(&tmp); - - while (i < nr && - (k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); - - bch2_btree_and_journal_iter_advance(&iter); - i++; - } - - bch2_bkey_buf_exit(&tmp, c); -} - -static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, - enum btree_id btree_id, - btree_walk_key_fn key_fn) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf tmp; - struct btree *child; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (b->c.level) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1, - false); - - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; - - btree_and_journal_iter_prefetch(c, b, iter); - - ret = bch2_btree_and_journal_walk_recurse(trans, child, - btree_id, key_fn); - six_unlock_read(&child->c.lock); - } else { - ret = key_fn(trans, k); - } - - if (ret) - break; - - bch2_btree_and_journal_iter_advance(&iter); - } - - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); - return ret; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); } -int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, - btree_walk_key_fn key_fn) +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void 
bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) { - struct bch_fs *c = trans->c; - struct btree *b = c->btree_roots[btree_id].b; - int ret = 0; - - if (btree_node_fake(b)) - return 0; - - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); - six_unlock_read(&b->c.lock); + struct btree_node_iter node_iter; - return ret; + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p) ?: + return journal_key_cmp(l, r) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index a7a9496afb95..21bdad9db249 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,6 +31,9 @@ struct btree_and_journal_iter { } last; }; +size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, + unsigned, struct bpos); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); - -int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); - void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); -- cgit From 8e432d98a5011de5b1304fa9c8591588bea59b96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 23:35:25 -0400 Subject: fixup! 
bcachefs: Factor out __bch2_btree_iter_set_pos() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 01c130a3ce8d..281e5895bc30 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2247,7 +2247,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; - int ret, cmp; + int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); @@ -2336,13 +2336,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - cmp = bpos_cmp(k.k->p, iter->path->pos); - if (cmp) { - iter->path = bch2_btree_path_make_mut(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); - iter->path->pos = k.k->p; - trans->paths_sorted = false; - } + iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT); + BUG_ON(!iter->path->nodes_locked); out: iter->path->should_be_locked = true; -- cgit From d8601afca840d36203d0cf2da94ce4f92003956e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 23:10:06 -0500 Subject: bcachefs: Simplify journal replay With BTREE_ITER_WITH_JOURNAL, there's no longer any restrictions on the order we have to replay keys from the journal in, and we can also start up journal reclaim right away - and delete a bunch of code. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/bcachefs.h | 2 - fs/bcachefs/btree_key_cache.h | 3 +- fs/bcachefs/btree_update_interior.c | 5 +- fs/bcachefs/btree_update_leaf.c | 3 - fs/bcachefs/journal_reclaim.c | 5 -- fs/bcachefs/journal_types.h | 1 - fs/bcachefs/recovery.c | 114 ++++++------------------------------ 8 files changed, 22 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index cb4b059e796c..ab7d972aac3a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -902,8 +902,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) static bool allocator_thread_running(struct bch_dev *ca) { unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && - test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) ? 
ALLOCATOR_running : ALLOCATOR_stopped; alloc_thread_set_state(ca, state); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 431cf25b38db..7771b4a4bb87 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -510,8 +510,6 @@ enum { BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_ALLOC_REPLAY_DONE, - BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_RW, diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 0768ef3ca776..b3d241b13453 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty && - test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + return nr_dirty > max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 17111c4228bd..51a2ea2c5cd6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -45,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) BUG_ON(!b->c.level); - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; bch2_btree_node_iter_init_from_start(&iter, b); @@ -1851,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) - return; - if (!percpu_ref_tryget(&c->writes)) return; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e95940ffad6b..1072acb0c9af 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!insert->level && - !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) return false; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 4462beb52461..d72b17dc935a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -489,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, u64 seq; int err; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return 0; - lockdep_assert_held(&j->reclaim_lock); while (1) { @@ -689,8 +686,6 @@ static int bch2_journal_reclaim_thread(void *arg) set_freezable(); - kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); - j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 0c4df603280d..73e7fbc4f109 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -148,7 +148,6 @@ enum journal_space_from { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 57311ad283c7..cb0ba84711aa 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -474,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq) 
bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int __bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) { struct btree_iter iter; unsigned iter_flags = @@ -484,7 +484,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, int ret; if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; + iter_flags |= BTREE_ITER_CACHED; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -503,29 +503,12 @@ out: return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -{ - unsigned commit_flags = - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED; - - if (!k->allocated) - commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; - - return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k)); -} - static int journal_sort_seq_cmp(const void *_l, const void *_r) { const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(r->level, l->level) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bpos_cmp(l->k->k.p, r->k->k.p); + return cmp_int(l->journal_seq, r->journal_seq); } static int bch2_journal_replay(struct bch_fs *c) @@ -533,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; - struct bch_dev *ca; - unsigned idx; size_t i; - u64 seq; int ret; keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); @@ -555,73 +535,25 @@ static int bch2_journal_replay(struct bch_fs *c) replay_now_at(j, keys->journal_seq_base); } - seq = j->replay_journal_seq; - - /* - * First replay updates to the alloc btree - these will only update the - * btree key cache: - */ - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; - - cond_resched(); - - if (!k->level && k->btree_id == BTREE_ID_alloc) { - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; - ret = bch2_journal_replay_key(c, k); - if (ret) - goto err; - } - } - - /* Now we can start the allocator threads: */ - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); - for_each_member_device(ca, c, idx) - bch2_wake_allocator(ca); - - /* - * Next replay updates to interior btree nodes: - */ - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; - - cond_resched(); - - if (k->level) { - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; - ret = bch2_journal_replay_key(c, k); - if (ret) - goto err; - } - } - - /* - * Now that the btree is in a consistent state, we can start journal - * reclaim (which will be flushing entries from the btree key cache back - * to the btree: - */ - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); - journal_reclaim_kick(j); - - j->replay_journal_seq = seq; - - /* - * Now replay leaf node updates: - */ for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; cond_resched(); - if (k->level || k->btree_id == BTREE_ID_alloc) - continue; - - replay_now_at(j, keys->journal_seq_base + k->journal_seq); + if (!k->allocated) + replay_now_at(j, keys->journal_seq_base + k->journal_seq); - ret = bch2_journal_replay_key(c, k); - if (ret) + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + 
BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[k->btree_id], k->level); goto err; + } } replay_now_at(j, j->replay_journal_seq_end); @@ -629,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); - kfree(keys_sorted); - - return bch2_journal_error(j); + ret = bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[k->btree_id], k->level); kfree(keys_sorted); - return ret; } @@ -1215,7 +1142,8 @@ use_clean: ret = bch2_journal_replay(c); if (ret) goto err; - bch_verbose(c, "journal replay done"); + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && !c->opts.nochanges) { @@ -1385,10 +1313,6 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); -- cgit From eacb2574f09f5b71acc468d44e7a1633847fd08d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Jan 2022 21:45:35 -0500 Subject: bcachefs: bch_dev->dev Add a field to bch_dev for the dev_t of the underlying block device - this fixes a null ptr deref in tracepoints. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/chardev.c | 5 ++++- fs/bcachefs/super.c | 4 +++- fs/bcachefs/trace.h | 6 +++--- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7771b4a4bb87..7c48ebed1d35 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -432,6 +432,7 @@ struct bch_dev { struct bch_sb_handle disk_sb; struct bch_sb *sb_read_scratch; int sb_write_error; + dev_t dev; struct bch_devs_mask self; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index aae9a2db8b0d..6cd0a2739ce5 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!dev) + return -EINVAL; + for_each_online_member(ca, c, i) - if (ca->disk_sb.bdev->bd_dev == dev) { + if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); return i; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d92bb50d0960..b0c2a8b847ef 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1257,6 +1257,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + ca->dev = ca->disk_sb.bdev->bd_dev; + percpu_ref_reinit(&ca->io_ref); return 0; @@ -1875,7 +1877,7 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) - if (ca->disk_sb.bdev->bd_dev == dev) + if (ca->dev == dev) goto found; ca = ERR_PTR(-ENOENT); found: diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index a1122fa3ccc6..ff67e8572ea4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -387,7 +387,7 @@ TRACE_EVENT(alloc_scan, ), TP_fast_assign( - __entry->dev = 
ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->found = found; __entry->inc_gen = inc_gen; __entry->inc_gen_skipped = inc_gen_skipped; @@ -409,7 +409,7 @@ TRACE_EVENT(invalidate, ), TP_fast_assign( - __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->offset = offset, __entry->sectors = sectors; ), @@ -431,7 +431,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, ), TP_fast_assign( - __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->reserve = reserve; ), -- cgit From 6558e61dfe1ec1d54a39cb4604d00efa5bcaaa7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Jan 2022 22:24:43 -0500 Subject: bcachefs: Fix an assertion bch2_trans_commit() can legitimately return -ENOSPC with BTREE_INSERT_NOFAIL set if BTREE_INSERT_NOWAIT was also set. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1072acb0c9af..60897fc70c58 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -816,7 +816,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); - BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); + BUG_ON(ret == -ENOSPC && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL)); return ret; } -- cgit From 17563164b392b97fedd997cbc972337801ed678f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jan 2022 00:14:39 -0500 Subject: bcachefs: Kill bch2_bset_fix_invalidated_key() Was dead code, so delete it. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 127 +++-------------------------------------------------- fs/bcachefs/bset.h | 1 - 2 files changed, 7 insertions(+), 121 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 2e0ad3a4fa67..b7b3e78bb528 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -595,10 +595,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, } __always_inline -static inline void __make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +static inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -667,34 +667,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_i *k; - - if (is_power_of_2(j) && - !min_key->u64s) { - if (!bkey_pack_pos(min_key, b->data->min_key, b)) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; - } - } - - if (is_power_of_2(j + 1) && - !max_key->u64s) { - if (!bkey_pack_pos(max_key, b->data->max_key, b)) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = b->data->max_key; - } - } - - __make_bfloat(b, t, j, min_key, max_key); -} - /* bytes remaining - only valid for last bset: */ static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { @@ -784,9 +756,9 @@ retry: /* Then we build the tree */ eytzinger1_for_each(j, t->size) - __make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); + 
make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -931,91 +903,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ -static void rw_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - unsigned j = rw_aux_tree_bsearch(b, t, offset); - - if (j < t->size && - rw_aux_tree(b, t)[j].offset == offset) - rw_aux_tree_set(b, t, j, k); - - bch2_bset_verify_rw_aux_tree(b, t); -} - -static void ro_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed min_key, max_key; - unsigned inorder, j; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - - if (bkey_next(k) == btree_bkey_last(b, t)) { - for (j = 1; j < t->size; j = j * 2 + 1) - make_bfloat(b, t, j, &min_key, &max_key); - } - - inorder = bkey_to_cacheline(b, t, k); - - if (inorder && - inorder < t->size) { - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); - - if (k == tree_to_bkey(b, t, j)) { - /* Fix the node this key corresponds to */ - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the right boundary */ - for (j = eytzinger1_left_child(j); - j < t->size; - j = eytzinger1_right_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } - - if (inorder + 1 < t->size) { - j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); - - if (k == tree_to_prev_bkey(b, t, j)) { - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the left boundary */ - for (j = eytzinger1_right_child(j); - j < t->size; - j = eytzinger1_left_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } -} - -/** - * bch2_bset_fix_invalidated_key() - given an existing key @k that has been - * modified, fix any auxiliary search tree by remaking all the nodes in the - * auxiliary search tree that @k corresponds to - */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -{ - struct bset_tree *t = bch2_bkey_to_bset_inlined(b, k); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - break; - case BSET_RO_AUX_TREE: - ro_aux_tree_fix_invalidated_key(b, t, k); - break; - case BSET_RW_AUX_TREE: - rw_aux_tree_fix_invalidated_key(b, t, k); - break; - } -} - static void bch2_bset_fix_lookup_table(struct btree *b, struct bset_tree *t, struct bkey_packed *_where, diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 8acbcb5d86c4..9708b9ffa4df 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); -- cgit From 72492d55cea359c2fce3e372da5c7387b50a21ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jan 2022 00:22:29 -0500 Subject: bcachefs: Make eytzinger size parameter more conventional Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 14 +++++++------- fs/bcachefs/eytzinger.h | 48 ++++++++++++++++++++++-------------------------- 2 files changed, 29 
insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index b7b3e78bb528..58e510fa19bd 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -461,7 +461,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, unsigned j) { return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size, t->extra), + __eytzinger1_to_inorder(j, t->size - 1, t->extra), bkey_float(b, t, j)->key_offset); } @@ -723,7 +723,7 @@ retry: t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size) { + eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) prev = k, k = bkey_next(k); @@ -755,7 +755,7 @@ retry: } /* Then we build the tree */ - eytzinger1_for_each(j, t->size) + eytzinger1_for_each(j, t->size - 1) make_bfloat(b, t, j, bkey_to_packed(&min_key), bkey_to_packed(&max_key)); @@ -857,7 +857,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, do { p = j ? tree_to_bkey(b, t, __inorder_to_eytzinger1(j--, - t->size, t->extra)) + t->size - 1, t->extra)) : btree_bkey_first(b, t); } while (p >= k); break; @@ -1137,7 +1137,7 @@ slowpath: n = n * 2 + (cmp < 0); } while (n < t->size); - inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); + inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); /* * n would have been the node we recursed to - the low bit tells us if @@ -1148,7 +1148,7 @@ slowpath: if (unlikely(!inorder)) return btree_bkey_first(b, t); - f = &base->f[eytzinger1_prev(n >> 1, t->size)]; + f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; } return cacheline_to_bkey(b, t, inorder, f->key_offset); @@ -1565,7 +1565,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, if (!inorder || inorder >= t->size) return; - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); if (k != tree_to_bkey(b, t, j)) return; diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 26d5cad7e6a5..05429c9631cd 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -17,10 +17,6 @@ * * With one based indexing each level of the tree starts at a power of two - * good for cacheline alignment: - * - * Size parameter is treated as if we were using 0 based indexing, however: - * valid nodes, and inorder indices, are in the range [1..size) - that is, there - * are actually size - 1 elements */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) @@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size - 1); + return rounddown_pow_of_two(size); } static inline unsigned eytzinger1_last(unsigned size) { - return rounddown_pow_of_two(size) - 1; + return rounddown_pow_of_two(size + 1) - 1; } /* @@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_right_child(i) < size) { + if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size) - __fls(i); - i >>= i >= size; + i <<= __fls(size + 1) - __fls(i); + i >>= i > size; } else { i >>= ffz(i) + 1; } @@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, 
unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_left_child(i) < size) { + if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size) - __fls(i); + i <<= __fls(size + 1) - __fls(i); i -= 1; - i >>= i >= size; + i >>= i > size; } else { i >>= __ffs(i) + 1; } @@ -95,17 +91,17 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size - rounddown_pow_of_two(size - 1)) << 1; + return (size + 1 - rounddown_pow_of_two(size)) << 1; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned extra) { unsigned b = __fls(i); - unsigned shift = __fls(size - 1) - b; + unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); /* * sign bit trick: @@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, shift = __ffs(i); i >>= shift + 1; - i |= 1U << (__fls(size - 1) - shift); + i |= 1U << (__fls(size) - shift); return i; } @@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i) static inline unsigned eytzinger0_first(unsigned size) { - return eytzinger1_first(size + 1) - 1; + return eytzinger1_first(size) - 1; } static inline unsigned eytzinger0_last(unsigned size) { - return eytzinger1_last(size + 1) - 1; + return eytzinger1_last(size) - 1; } static inline unsigned eytzinger0_next(unsigned i, unsigned size) { - return eytzinger1_next(i + 1, size + 1) - 1; + return eytzinger1_next(i + 1, size) - 1; } static inline unsigned eytzinger0_prev(unsigned i, unsigned size) { - return eytzinger1_prev(i + 1, size + 1) - 1; + return eytzinger1_prev(i + 1, size) - 1; } static inline unsigned eytzinger0_extra(unsigned size) { - return eytzinger1_extra(size + 1); + return eytzinger1_extra(size); } static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, unsigned extra) { - return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; + return __eytzinger1_to_inorder(i + 1, size, extra) - 1; } static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, unsigned extra) { - return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; + return __inorder_to_eytzinger1(i + 1, size, extra) - 1; } static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -- cgit From fe312f81ef62f8aec0c21dabb703baeb4a7533fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jan 2022 04:17:02 -0500 Subject: bcachefs: Use kvmalloc() for array of sorted keys in journal replay Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cb0ba84711aa..e4ba3f0aef4a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -519,7 +519,7 @@ static int bch2_journal_replay(struct bch_fs *c) size_t i; int ret; - keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); if (!keys_sorted) return -ENOMEM; @@ -563,7 +563,7 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_flush_all_pins(j); ret = bch2_journal_error(j); err: - kfree(keys_sorted); + kvfree(keys_sorted); return ret; } -- cgit From 
efe68e1d65c008dd1f19517378d0ad0688c6a643 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jan 2022 23:38:50 -0500 Subject: bcachefs: Improved superblock-related error messages This patch converts bch2_sb_validate() and the .validate methods for the various superblock sections to take printbuf, to which they can print detailed error messages, including printing the entire section that was invalid. This is a great improvement over the previous situation, where we could only return static strings that didn't have precise information about what was wrong. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 62 ++--- fs/bcachefs/journal_seq_blacklist.c | 37 +-- fs/bcachefs/quota.c | 12 +- fs/bcachefs/replicas.c | 137 +++++------ fs/bcachefs/super-io.c | 466 ++++++++++++++++++++++-------------- fs/bcachefs/super-io.h | 7 +- fs/bcachefs/super.c | 126 +++------- fs/bcachefs/super.h | 1 - 8 files changed, 450 insertions(+), 398 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c47fa0a0f450..a27fc4fb60d5 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; - struct bch_sb_field_members *mi; - struct bch_member *m; - unsigned i, nr_groups, len; - const char *err = NULL; - - mi = bch2_sb_get_members(sb); - groups = bch2_sb_get_disk_groups(sb); - nr_groups = disk_groups_nr(groups); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned nr_groups = disk_groups_nr(groups); + unsigned i, len; + int ret = -EINVAL; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; unsigned g; if (!BCH_MEMBER_GROUP(m)) @@ -42,45 +38,53 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, g = BCH_MEMBER_GROUP(m) - 1; - if (g >= nr_groups || - BCH_GROUP_DELETED(&groups->entries[g])) - return "disk has invalid group"; + if (g >= nr_groups) { + pr_buf(err, "disk %u has invalid label %u (have %u)", + i, g, nr_groups); + return -EINVAL; + } + + if (BCH_GROUP_DELETED(&groups->entries[g])) { + pr_buf(err, "disk %u has deleted label %u", i, g); + return -EINVAL; + } } if (!nr_groups) - return NULL; + return 0; + + for (i = 0; i < nr_groups; i++) { + g = groups->entries + i; - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { if (BCH_GROUP_DELETED(g)) continue; len = strnlen(g->label, sizeof(g->label)); if (!len) { - err = "group with empty label"; - goto err; + pr_buf(err, "label %u empty", i); + return -EINVAL; } } sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) - return "cannot allocate memory"; + return -ENOMEM; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - for (i = 0; i + 1 < nr_groups; i++) - if (!BCH_GROUP_DELETED(sorted + i) && - !group_cmp(sorted + i, sorted + i + 1)) { - err = "duplicate groups"; + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { + pr_buf(err, "duplicate label 
%llu.", BCH_GROUP_PARENT(g)); + bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); goto err; } - err = NULL; + ret = 0; err: kfree(sorted); - return err; + return 0; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 10bd23e969d2..428377e73a8d 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -189,27 +189,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static const char * -bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); + unsigned i, nr = blacklist_nr_entries(bl); - for (i = bl->start; i < bl->start + nr; i++) { - if (le64_to_cpu(i->start) >= - le64_to_cpu(i->end)) - return "entry start >= end"; - - if (i + 1 < bl->start + nr && - le64_to_cpu(i[0].end) > - le64_to_cpu(i[1].start)) - return "entries out of order"; + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = bl->start + i; + + if (le64_to_cpu(e->start) >= + le64_to_cpu(e->end)) { + pr_buf(err, "entry %u start >= end (%llu >= %llu)", + i, le64_to_cpu(e->start), le64_to_cpu(e->end)); + return -EINVAL; + } + + if (i + 1 < nr && + le64_to_cpu(e[0].end) > + le64_to_cpu(e[1].start)) { + pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", + i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); + return -EINVAL; + } } - return NULL; + return 0; } static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 54bb2a454a5e..6fb8224f565e 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -6,15 +6,17 @@ #include "subvolume.h" #include "super-io.h" -static const char *bch2_sb_validate_quota(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); - if (vstruct_bytes(&q->field) != sizeof(*q)) - return "invalid field quota: wrong size"; + if (vstruct_bytes(&q->field) < sizeof(*q)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + } - return NULL; + return 0; } const struct bch_sb_field_ops bch_sb_field_ops_quota = { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 33bba6fdb180..0cdd67e9ebc4 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -41,18 +41,19 @@ void bch2_replicas_entry_to_text(struct printbuf *out, { unsigned i; - pr_buf(out, "%s: %u/%u [", - bch2_data_types[e->data_type], - e->nr_required, - e->nr_devs); + if (e->data_type < BCH_DATA_NR) + pr_buf(out, "%s", bch2_data_types[e->data_type]); + else + pr_buf(out, "(invalid data type %u)", e->data_type); + pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (i = 0; i < e->nr_devs; i++) pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); pr_buf(out, "]"); } void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) + struct bch_replicas_cpu *r) { struct bch_replicas_entry *e; bool first = true; @@ -815,67 +816,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, return 0; } -static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + struct bch_sb *sb, + struct printbuf *err) { - unsigned i; + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned i, j; sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, memcmp, NULL); - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_entry *l = + for (i = 0; i < cpu_r->nr; i++) { + struct bch_replicas_entry *e = cpu_replicas_entry(cpu_r, i); - struct bch_replicas_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - if (!memcmp(l, r, cpu_r->entry_size)) - return "duplicate replicas entry"; - } + if (e->data_type >= BCH_DATA_NR) { + pr_buf(err, "invalid data type in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - return NULL; -} + if (!e->nr_devs) { + pr_buf(err, "no devices in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } -static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry *e; - const char *err; - unsigned i; + if (e->nr_required > 1 && + e->nr_required >= e->nr_devs) { + pr_buf(err, "bad nr_required in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - for_each_replicas_entry(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; + for (j = 0; j < e->nr_devs; j++) + if (!bch2_dev_exists(sb, mi, e->devs[j])) { + pr_buf(err, "invalid device %u in entry ", e->devs[j]); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; + if (i + 1 < cpu_r->nr) { + struct bch_replicas_entry *n = + cpu_replicas_entry(cpu_r, i + 1); - err = "invalid replicas entry: bad nr_required"; - if (e->nr_required > 1 && - e->nr_required >= e->nr_devs) - goto err; + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; + if (!memcmp(e, n, cpu_r->entry_size)) { + pr_buf(err, "duplicate replicas entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } + } } - err = "cannot allocate memory"; + return 0; +} + +static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_replicas_cpu cpu_r; + int ret; + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + return -ENOMEM; - err = check_dup_replicas_entries(&cpu_r); -err: + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); - return err; + return ret; } static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -900,38 +912,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { .to_text = bch2_sb_replicas_to_text, }; -static const char *bch2_sb_validate_replicas_v0(struct 
bch_sb *sb, struct bch_sb_field *f) +static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry_v0 *e; - const char *err; - unsigned i; + struct bch_replicas_cpu cpu_r; + int ret; - for_each_replicas_entry_v0(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; - - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; - - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; - } - - err = "cannot allocate memory"; if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + return -ENOMEM; - err = check_dup_replicas_entries(&cpu_r); -err: + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); - return err; + return ret; } const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0b1dfadacd9..414dfa59744f 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -30,8 +30,8 @@ const char * const bch2_sb_fields[] = { NULL }; -static const char *bch2_sb_field_validate(struct bch_sb *, - struct bch_sb_field *); +static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, + struct printbuf *); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, enum bch_sb_field_type type) @@ -207,23 +207,32 @@ static inline void __bch2_sb_layout_size_assert(void) BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); } -static const char *validate_sb_layout(struct bch_sb_layout *layout) +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) { u64 offset, prev_offset, max_sectors; unsigned i; if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && - !uuid_equal(&layout->magic, &BCHFS_MAGIC)) - return "Not a bcachefs superblock layout"; + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { + pr_buf(out, "Not a bcachefs superblock layout"); + return -EINVAL; + } - if (layout->layout_type != 0) - return "Invalid superblock layout type"; + if (layout->layout_type != 0) { + pr_buf(out, "Invalid superblock layout type %u", + layout->layout_type); + return -EINVAL; + } - if (!layout->nr_superblocks) - return "Invalid superblock layout: no superblocks"; + if (!layout->nr_superblocks) { + pr_buf(out, "Invalid superblock layout: no superblocks"); + return -EINVAL; + } - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) - return "Invalid superblock layout: too many superblocks"; + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { + pr_buf(out, "Invalid superblock layout: too many superblocks"); + return -EINVAL; + } max_sectors = 1 << layout->sb_max_size_bits; @@ -232,122 +241,134 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); - if (offset < prev_offset + max_sectors) - return "Invalid superblock layout: superblocks overlap"; + if (offset < prev_offset + max_sectors) { + pr_buf(out, "Invalid superblock layout: superblocks overlap\n" + " (sb %u ends at %llu next starts at %llu", + i - 1, prev_offset + max_sectors, offset); + return -EINVAL; + } prev_offset = offset; } - return NULL; + return 0; } -const char 
*bch2_sb_validate(struct bch_sb_handle *disk_sb) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *mi; - const char *err; u32 version, version_min; u16 block_size; + int ret; version = le16_to_cpu(sb->version); version_min = version >= bcachefs_metadata_version_new_versioning ? le16_to_cpu(sb->version_min) : version; - if (version >= bcachefs_metadata_version_max || - version_min < bcachefs_metadata_version_min) - return "Unsupported superblock version"; + if (version >= bcachefs_metadata_version_max) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } - if (version_min > version) - return "Bad minimum version"; + if (version_min < bcachefs_metadata_version_min) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min > version) { + pr_buf(out, "Bad minimum version %u, greater than version field %u", + version_min, version); + return -EINVAL; + } if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) - return "Filesystem has incompatible features"; + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { + pr_buf(out, "Filesystem has incompatible features"); + return -EINVAL; + } block_size = le16_to_cpu(sb->block_size); - if (block_size > PAGE_SECTORS) - return "Bad block size"; + if (block_size > PAGE_SECTORS) { + pr_buf(out, "Block size too big (got %u, max %u)", + block_size, PAGE_SECTORS); + return -EINVAL; + } - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) - return "Bad user UUID"; + if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { + pr_buf(out, "Bad user UUID (got zeroes)"); + return -EINVAL; + } - if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) - return "Bad internal UUID"; + if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { + pr_buf(out, "Bad intenal UUID (got zeroes)"); + return -EINVAL; + } if (!sb->nr_devices || - sb->nr_devices <= sb->dev_idx || - sb->nr_devices > BCH_SB_MEMBERS_MAX) - return "Bad number of member devices"; - - if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; - - if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; - - if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) - return "Invalid compression type"; - - if (!BCH_SB_BTREE_NODE_SIZE(sb)) - return "Btree node size not set"; + sb->nr_devices > BCH_SB_MEMBERS_MAX) { + pr_buf(out, "Bad number of member devices %u (max %u)", + sb->nr_devices, BCH_SB_MEMBERS_MAX); + return -EINVAL; + } - if (BCH_SB_GC_RESERVE(sb) < 5) - return "gc reserve percentage too small"; + if (sb->dev_idx >= sb->nr_devices) { + pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", + 
sb->dev_idx, sb->nr_devices); + return -EINVAL; + } if (!sb->time_precision || - le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) - return "invalid time precision"; + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { + pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", + le32_to_cpu(sb->time_precision), NSEC_PER_SEC); + return -EINVAL; + } /* validate layout */ - err = validate_sb_layout(&sb->layout); - if (err) - return err; + ret = validate_sb_layout(&sb->layout, out); + if (ret) + return ret; vstruct_for_each(sb, f) { - if (!f->u64s) - return "Invalid superblock: invalid optional field"; + if (!f->u64s) { + pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } - if (vstruct_next(f) > vstruct_last(sb)) - return "Invalid superblock: invalid optional field"; + if (vstruct_next(f) > vstruct_last(sb)) { + pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } } /* members must be validated first: */ mi = bch2_sb_get_members(sb); - if (!mi) - return "Invalid superblock: member info area missing"; + if (!mi) { + pr_buf(out, "Invalid superblock: member info area missing"); + return -EINVAL; + } - err = bch2_sb_field_validate(sb, &mi->field); - if (err) - return err; + ret = bch2_sb_field_validate(sb, &mi->field, out); + if (ret) + return ret; vstruct_for_each(sb, f) { if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) continue; - err = bch2_sb_field_validate(sb, f); - if (err) - return err; + ret = bch2_sb_field_validate(sb, f, out); + if (ret) + return ret; } - return NULL; + return 0; } /* device open: */ @@ -476,50 +497,77 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) /* read superblock: */ -static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) +static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { struct bch_csum csum; + u32 version, version_min; size_t bytes; + int ret; reread: bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); sb->bio->bi_iter.bi_sector = offset; bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - if (submit_bio_wait(sb->bio)) - return "IO error"; + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(err, "IO error: %i", ret); + return ret; + } if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && - !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) - return "Not a bcachefs superblock"; + !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { + pr_buf(err, "Not a bcachefs superblock"); + return -EINVAL; + } - if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || - le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) - return "Unsupported superblock version"; + version = le16_to_cpu(sb->sb->version); + version_min = version >= bcachefs_metadata_version_new_versioning + ? 
le16_to_cpu(sb->sb->version_min) + : version; + + if (version >= bcachefs_metadata_version_max) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min < bcachefs_metadata_version_min) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } bytes = vstruct_bytes(sb->sb); - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) - return "Bad superblock: too big"; + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + bytes, 512UL << sb->sb->layout.sb_max_size_bits); + return -EINVAL; + } if (bytes > sb->buffer_size) { if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) - return "cannot allocate memory"; + return -ENOMEM; goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) - return "unknown csum type"; + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -EINVAL; + } /* XXX: verify MACs */ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) - return "bad checksum reading superblock"; + if (bch2_crc_cmp(csum, sb->sb->csum)) { + pr_buf(err, "bad checksum"); + return -EINVAL; + } sb->seq = le64_to_cpu(sb->sb->seq); - return NULL; + return 0; } int bch2_read_super(const char *path, struct bch_opts *opts, @@ -527,10 +575,16 @@ int bch2_read_super(const char *path, struct bch_opts *opts, { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; - const char *err; + char *_err; + struct printbuf err; __le64 *i; int ret; + _err = kmalloc(4096, GFP_KERNEL); + if (!_err) + return -ENOMEM; + err = _PBUF(_err, 4096); + pr_verbose_init(*opts, ""); memset(sb, 0, sizeof(*sb)); @@ -562,25 +616,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts, goto out; } - err = "cannot allocate memory"; ret = bch2_sb_realloc(sb, 0); - if (ret) + if (ret) { + pr_buf(&err, "error allocating memory for superblock"); goto err; + } - ret = -EFAULT; - err = "dynamic fault"; - if (bch2_fs_init_fault("read_super")) + if (bch2_fs_init_fault("read_super")) { + pr_buf(&err, "dynamic fault"); + ret = -EFAULT; goto err; + } - ret = -EINVAL; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, offset, &err); + if (!ret) goto got_super; if (opt_defined(*opts, sb)) goto err; - pr_err("error reading default superblock: %s", err); + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + path, _err); + err = _PBUF(_err, 4096); /* * Error reading primary superblock - read location of backup @@ -594,13 +651,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts, */ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - err = "IO error"; - if (submit_bio_wait(sb->bio)) + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(&err, "IO error: %i", ret); goto err; + } memcpy(&layout, sb->sb, sizeof(layout)); - err = validate_sb_layout(&layout); - if (err) + ret = validate_sb_layout(&layout, &err); + if (ret) goto err; for (i = layout.sb_offset; @@ -610,32 +669,39 @@ int bch2_read_super(const char *path, struct bch_opts *opts, if (offset == opt_get(*opts, sb)) continue; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, offset, &err); + if (!ret) goto got_super; } - ret = -EINVAL; 
goto err; got_super: - err = "Superblock block size smaller than device block size"; - ret = -EINVAL; if (le16_to_cpu(sb->sb->block_size) << 9 < bdev_logical_block_size(sb->bdev)) { - pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", + pr_buf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); - goto err_no_print; + ret = -EINVAL; + goto err; } ret = 0; sb->have_layout = true; + + ret = bch2_sb_validate(sb, &err); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + path, _err); + goto err_no_print; + } out: pr_verbose_init(*opts, "ret %i", ret); + kfree(_err); return ret; err: - pr_err("error reading superblock: %s", err); + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + path, _err); err_no_print: bch2_free_super(sb); goto out; @@ -706,7 +772,6 @@ int bch2_write_super(struct bch_fs *c) struct closure *cl = &c->sb_write; struct bch_dev *ca; unsigned i, sb = 0, nr_wrote; - const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -733,10 +798,19 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_from_fs(c, ca); for_each_online_member(ca, c, i) { - err = bch2_sb_validate(&ca->disk_sb); - if (err) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err); - ret = -1; + struct printbuf buf = { NULL, NULL }; + + ret = bch2_sb_validate(&ca->disk_sb, &buf); + if (ret) { + char *_buf = kmalloc(4096, GFP_NOFS); + if (_buf) { + buf = _PBUF(_buf, 4096); + bch2_sb_validate(&ca->disk_sb, &buf); + } + + bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); + kfree(_buf); + percpu_ref_put(&ca->io_ref); goto out; } } @@ -849,54 +923,57 @@ static int u64_cmp(const void *_l, const void *_r) return l < r ? -1 : l > r ? 
1 : 0; } -static const char *bch2_sb_validate_journal(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_journal(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - const char *err; + int ret = -EINVAL; unsigned nr; unsigned i; u64 *b; - journal = bch2_sb_get_journal(sb); - if (!journal) - return NULL; - nr = bch2_nr_journal_buckets(journal); if (!nr) - return NULL; + return 0; b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); if (!b) - return "cannot allocate memory"; + return -ENOMEM; for (i = 0; i < nr; i++) b[i] = le64_to_cpu(journal->buckets[i]); sort(b, nr, sizeof(u64), u64_cmp, NULL); - err = "journal bucket at sector 0"; - if (!b[0]) + if (!b[0]) { + pr_buf(err, "journal bucket at sector 0"); goto err; + } - err = "journal bucket before first bucket"; - if (m && b[0] < le16_to_cpu(m->first_bucket)) + if (b[0] < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); goto err; + } - err = "journal bucket past end of device"; - if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); goto err; + } - err = "duplicate journal buckets"; for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) + if (b[i] == b[i + 1]) { + pr_buf(err, "duplicate journal buckets %llu", b[i]); goto err; + } - err = NULL; + ret = 0; err: kfree(b); - return err; + return ret; } static const struct bch_sb_field_ops bch_sb_field_ops_journal = { @@ -905,39 +982,54 @@ static const struct bch_sb_field_ops bch_sb_field_ops_journal = { /* BCH_SB_FIELD_members: */ -static const char *bch2_sb_validate_members(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_members(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_member *m; + unsigned i; if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) - return "Invalid superblock: bad member info"; + vstruct_end(&mi->field)) { + pr_buf(err, "too many devices for section size"); + return -EINVAL; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { if (!bch2_member_exists(m)) continue; - if (le64_to_cpu(m->nbuckets) > LONG_MAX) - return "Too many buckets"; + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -EINVAL; + } if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) - return "Not enough buckets"; + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -EINVAL; + } if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) - return "bucket size smaller than block size"; + le16_to_cpu(sb->block_size)) { + pr_buf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -EINVAL; + } if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) - return "bucket size smaller than btree node size"; + 
BCH_SB_BTREE_NODE_SIZE(sb)) { + pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -EINVAL; + } } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_members = { @@ -946,18 +1038,24 @@ static const struct bch_sb_field_ops bch_sb_field_ops_members = { /* BCH_SB_FIELD_crypt: */ -static const char *bch2_sb_validate_crypt(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_crypt(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -EINVAL; + } - if (BCH_CRYPT_KDF_TYPE(crypt)) - return "invalid field crypt: bad kdf type"; + if (BCH_CRYPT_KDF_TYPE(crypt)) { + pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -EINVAL; + } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { @@ -1167,15 +1265,19 @@ out: mutex_unlock(&c->sb_lock); } -static const char *bch2_sb_validate_clean(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_clean(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); - if (vstruct_bytes(&clean->field) < sizeof(*clean)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -EINVAL; + } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_clean = { @@ -1189,14 +1291,26 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; -static const char *bch2_sb_field_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *orig_err) { unsigned type = le32_to_cpu(f->type); + struct printbuf err = *orig_err; + int ret; - return type < BCH_SB_FIELD_NR - ? 
bch2_sb_field_ops[type]->validate(sb, f) - : NULL; + if (type >= BCH_SB_FIELD_NR) + return 0; + + pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); + + ret = bch2_sb_field_ops[type]->validate(sb, f, &err); + if (ret) { + pr_buf(&err, "\n"); + bch2_sb_field_to_text(&err, sb, f); + *orig_err = err; + } + + return ret; } void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index f182711cc48f..6170fa0990f1 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -38,9 +38,8 @@ BCH_SB_FIELDS() extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - const char * (*validate)(struct bch_sb *, struct bch_sb_field *); - void (*to_text)(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; static inline __le64 bch2_sb_magic(struct bch_fs *c) @@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -const char *bch2_sb_validate(struct bch_sb_handle *); - int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b0c2a8b847ef..7b7902fbdcc6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1604,18 +1604,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; + char *_errbuf; + struct printbuf errbuf; int ret; + _errbuf = kmalloc(4096, GFP_KERNEL); + if (!_errbuf) + return -ENOMEM; + + errbuf = _PBUF(_errbuf, 4096); + ret = bch2_read_super(path, &opts, &sb); if (ret) { bch_err(c, "device add error: error reading super: %i", ret); - return ret; - } - - err = bch2_sb_validate(&sb); - if (err) { - bch_err(c, "device add error: error validating super: %s", err); - return -EINVAL; + goto err; } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; @@ -1623,19 +1625,21 @@ int bch2_dev_add(struct bch_fs *c, const char *path) err = bch2_dev_may_add(sb.sb, c); if (err) { bch_err(c, "device add error: %s", err); - return -EINVAL; + ret = -EINVAL; + goto err; } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { bch2_free_super(&sb); - return -ENOMEM; + ret = -ENOMEM; + goto err; } ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) { bch2_dev_free(ca); - return ret; + goto err; } ret = bch2_dev_journal_alloc(ca); @@ -1727,10 +1731,12 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); + kfree(_errbuf); return ret; err_late: up_write(&c->state_lock); - return -EINVAL; + ca = NULL; + goto err; } /* Hot add existing device to running filesystem: */ @@ -1896,20 +1902,28 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; + char *_errbuf = NULL; + struct printbuf errbuf; int ret = 0; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + pr_verbose_init(opts, ""); if (!nr_devices) { - c = ERR_PTR(-EINVAL); - goto out2; + ret = -EINVAL; + goto err; } - if (!try_module_get(THIS_MODULE)) { - c = ERR_PTR(-ENODEV); - goto out2; + _errbuf = kmalloc(4096, GFP_KERNEL); + if (!_errbuf) { + ret = -ENOMEM; + goto err; } + errbuf = 
_PBUF(_errbuf, 4096); + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); if (!sb) { ret = -ENOMEM; @@ -1921,9 +1935,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (ret) goto err; - err = bch2_sb_validate(&sb[i]); - if (err) - goto err_print; } for (i = 1; i < nr_devices; i++) @@ -1976,8 +1987,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } out: kfree(sb); + kfree(_errbuf); module_put(THIS_MODULE); -out2: pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err_print: @@ -1994,81 +2005,6 @@ err: goto out; } -static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, - struct bch_opts opts) -{ - const char *err; - struct bch_fs *c; - bool allocated_fs = false; - int ret; - - err = bch2_sb_validate(sb); - if (err) - return err; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(sb->sb->uuid); - if (c) { - closure_get(&c->cl); - - err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); - if (err) - goto err; - } else { - allocated_fs = true; - c = bch2_fs_alloc(sb->sb, opts); - - err = "bch2_fs_alloc() error"; - if (IS_ERR(c)) - goto err; - } - - err = "bch2_dev_online() error"; - - mutex_lock(&c->sb_lock); - if (bch2_dev_attach_bdev(c, sb)) { - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = "error starting filesystem"; - ret = bch2_fs_start(c); - if (ret) - goto err; - } - - closure_put(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return NULL; -err: - mutex_unlock(&bch_fs_list_lock); - - if (allocated_fs && !IS_ERR(c)) - bch2_fs_stop(c); - else if (c) - closure_put(&c->cl); - - return err; -} - -const char *bch2_fs_open_incremental(const char *path) -{ - struct bch_sb_handle sb; - struct bch_opts opts = bch2_opts_empty(); - const char *err; - - if (bch2_read_super(path, &opts, &sb)) - return "error reading superblock"; - - err = __bch2_fs_open_incremental(&sb, opts); - bch2_free_super(&sb); - - return err; -} - /* Global interfaces/init */ static void bcachefs_exit(void) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index a5249c54426d..6414f6a6bb91 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -254,6 +254,5 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -const char *bch2_fs_open_incremental(const char *path); #endif /* _BCACHEFS_SUPER_H */ -- cgit From 365f64f36c55b79d8510a5f476b2740a22c682eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 00:06:49 -0500 Subject: bcachefs: Add verbose log messages for journal read Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++++ fs/bcachefs/journal_io.c | 2 ++ fs/bcachefs/recovery.c | 1 + 3 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7c48ebed1d35..2dd3a0a1943a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -177,7 +177,11 @@ */ #undef pr_fmt +#ifdef __KERNEL__ #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#else +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ +#endif #include #include diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 22feea751b00..c659a5a95b63 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -892,6 +892,7 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = 
container_of(ja, struct bch_dev, journal); + struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); struct journal_read_buf buf = { NULL, 0 }; @@ -943,6 +944,7 @@ static void bch2_journal_read_device(struct closure *cl) ja->discard_idx = ja->dirty_idx_ondisk = ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: + bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); closure_return(cl); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e4ba3f0aef4a..d11457c229ac 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1008,6 +1008,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { struct journal_replay *i; + bch_verbose(c, "starting journal read"); ret = bch2_journal_read(c, &c->journal_entries, &blacklist_seq, &journal_seq); if (ret) -- cgit From 98c80d6df67168035e4e84080959f070e9055bd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 00:07:23 -0500 Subject: bcachefs: Fix bch2_journal_seq_blacklist_add() The old code correctly handled the case where we were blacklisting a range that exactly matched an existing entry, but not the case where the new range partially overlaps an existing entry. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 43 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 428377e73a8d..e10b2c7c7bae 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c, return bl; } +static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, + u64 start, u64 end) +{ + return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); +} + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; @@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); nr = blacklist_nr_entries(bl); - if (bl) { - for (i = 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (start == le64_to_cpu(e->start) && - end == le64_to_cpu(e->end)) - goto out; - - if (start <= le64_to_cpu(e->start) && - end >= le64_to_cpu(e->end)) { - e->start = cpu_to_le64(start); - e->end = cpu_to_le64(end); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; - } + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (bl_entry_contig_or_overlaps(e, start, end)) { + e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); + e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } -- cgit From 669f87a5da1c7b91b64f3c6308820b316e241cc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 00:33:52 -0500 Subject: bcachefs: Switch to __func__for recording where btree_trans was initialized Symbol decoding, via %ps, isn't supported in userspace - this will also be faster when we're using trans->fn in the fast path, as with the new BCH_JSET_ENTRY_log 
journal messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 23 +++--- fs/bcachefs/btree_iter.h | 5 +- fs/bcachefs/btree_key_cache.c | 6 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/btree_update_leaf.c | 30 ++++---- fs/bcachefs/fs.c | 1 - fs/bcachefs/trace.h | 138 ++++++++++++++++-------------------- 9 files changed, 102 insertions(+), 109 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 40061887f5d8..cad5d28fed09 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -857,7 +857,7 @@ lock_node: if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans->ip, + trace_trans_restart_btree_node_reused(trans->fn, trace_ip, path->btree_id, &path->pos); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 281e5895bc30..bca677c02774 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -350,7 +350,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, } if (unlikely(deadlock_path)) { - trace_trans_restart_would_deadlock(trans->ip, ip, + trace_trans_restart_would_deadlock(trans->fn, ip, trans->in_traverse_all, reason, deadlock_path->btree_id, deadlock_path->cached, @@ -535,7 +535,7 @@ bool bch2_trans_relock(struct btree_trans *trans) trans_for_each_path(trans, path) if (path->should_be_locked && !bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans->ip, _RET_IP_, + trace_trans_restart_relock(trans->fn, _RET_IP_, path->btree_id, &path->pos); BUG_ON(!trans->restarted); return false; @@ -1505,7 +1505,9 @@ retry_all: out: bch2_btree_cache_cannibalize_unlock(c); - trace_trans_traverse_all(trans->ip, trace_ip); + trans->in_traverse_all = false; + + trace_trans_traverse_all(trans->fn, trace_ip); return ret; } @@ -2842,7 +2844,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -2925,14 +2927,15 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) trans->updates = p; p += updates_bytes; } -void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes, + const char *fn) __acquires(&c->btree_trans_barrier) { memset(trans, 0, sizeof(*trans)); trans->c = c; - trans->ip = _RET_IP_; + trans->fn = fn; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); @@ -2971,7 +2974,7 @@ static void check_btree_paths_leaked(struct btree_trans *trans) goto leaked; return; leaked: - bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); + bch_err(c, "btree paths leaked from %s!", trans->fn); trans_for_each_path(trans, path) if (path->ref) printk(KERN_ERR " btree %s %pS\n", @@ -3069,7 +3072,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) if (!trans_has_locks(trans)) continue; - pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + pr_buf(out, "%i %s\n", trans->pid, trans->fn); trans_for_each_path(trans, path) { if (!path->nodes_locked) diff --git a/fs/bcachefs/btree_iter.h 
b/fs/bcachefs/btree_iter.h index 457a7601b0ce..abbde3666942 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -379,9 +379,12 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ void bch2_dump_trans_paths_updates(struct btree_trans *); -void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, + unsigned, size_t, const char *); void bch2_trans_exit(struct btree_trans *); +#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) + void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index e5029703240c..13012f26a677 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -224,7 +224,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + trace_transaction_restart_ip(trans->fn, _THIS_IP_); ret = btree_trans_restart(trans); goto err; } @@ -319,7 +319,7 @@ retry: if (!trans->restarted) goto retry; - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + trace_transaction_restart_ip(trans->fn, _THIS_IP_); ret = -EINTR; goto err; } @@ -339,7 +339,7 @@ fill: if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + trace_transaction_restart_ip(trans->fn, _THIS_IP_); ret = btree_trans_restart(trans); goto err; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 1fd0cebe30ac..794726c4efd7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -365,6 +365,7 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; + const char *fn; struct list_head list; struct btree *locking; unsigned locking_path_idx; @@ -372,7 +373,6 @@ struct btree_trans { u8 locking_btree_id; u8 locking_level; pid_t pid; - unsigned long ip; int srcu_idx; u8 nr_sorted; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 51a2ea2c5cd6..29dda2352afd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -955,7 +955,7 @@ retry: * instead of locking/reserving all the way to the root: */ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, path->btree_id, &path->pos); ret = btree_trans_restart(trans); return ERR_PTR(ret); @@ -1019,7 +1019,7 @@ retry: BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); + trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 60897fc70c58..de33491f2535 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -266,7 +266,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, return ret; if (!bch2_trans_relock(trans)) { - trace_trans_restart_journal_preres_get(trans->ip, trace_ip); + trace_trans_restart_journal_preres_get(trans->fn, trace_ip); return -EINTR; } @@ -305,7 +305,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) l->entry.pad[0] = 0; 
l->entry.pad[1] = 0; l->entry.pad[2] = 0; - b = snprintf(l->d, buflen, "%ps", (void *) trans->ip); + b = min_t(unsigned, strlen(trans->fn), buflen); + memcpy(l->d, trans->fn, b); while (b < buflen) l->d[b++] = '\0'; @@ -426,7 +427,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->ip, trace_ip); + trace_trans_restart_fault_inject(trans->fn, trace_ip); trans->restarted = true; return -EINTR; } @@ -619,7 +620,7 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans->ip); + trace_trans_restart_would_deadlock_write(trans->fn); return btree_trans_restart(trans); } @@ -650,9 +651,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", - buf, (void *) trans->ip, - (void *) i->ip_allocated, invalid); + bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n", + buf, trans->fn, (void *) i->ip_allocated, invalid); bch2_fatal_error(c); return -EINVAL; } @@ -758,7 +758,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; if (ret == -EINTR) - trace_trans_restart_btree_node_split(trans->ip, trace_ip, + trace_trans_restart_btree_node_split(trans->fn, trace_ip, i->btree_id, &i->path->pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: @@ -771,7 +771,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_mark_replicas(trans->ip, trace_ip); + trace_trans_restart_mark_replicas(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: @@ -791,13 +791,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_res_get(trans->ip, trace_ip); + trace_trans_restart_journal_res_get(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); + trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -807,7 +807,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_reclaim(trans->ip, trace_ip); + trace_trans_restart_journal_reclaim(trans->fn, trace_ip); ret = -EINTR; break; default: @@ -902,7 +902,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) } if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, + trace_trans_restart_mark(trans->fn, _RET_IP_, i->btree_id, &i->path->pos); if (ret) return ret; @@ -932,7 +932,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) BTREE_TRIGGER_OVERWRITE|i->flags); if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, + trace_trans_restart_mark(trans->fn, _RET_IP_, i->btree_id, &i->path->pos); if (ret) return ret; @@ -999,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!i->path->should_be_locked); if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans->ip, _RET_IP_, + trace_trans_restart_upgrade(trans->fn, _RET_IP_, i->btree_id, &i->path->pos); ret = btree_trans_restart(trans); goto out; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bbdfccf24e53..10a737965beb 100644 --- 
a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -135,7 +135,6 @@ int __must_check bch2_write_inode(struct bch_fs *c, int ret; bch2_trans_init(&trans, c, 0, 512); - trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index ff67e8572ea4..69d1f42fe0f8 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -546,94 +546,81 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -TRACE_EVENT(transaction_restart_ip, - TP_PROTO(unsigned long caller, unsigned long ip), - TP_ARGS(caller, ip), - - TP_STRUCT__entry( - __field(unsigned long, caller ) - __field(unsigned long, ip ) - ), - - TP_fast_assign( - __entry->caller = caller; - __entry->ip = ip; - ), - - TP_printk("%pS %pS", (void *) __entry->caller, (void *) __entry->ip) -); - DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip), + TP_ARGS(trans_fn, caller_ip), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), - TP_printk("%pS %pS", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip) + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + +DEFINE_EVENT(transaction_restart, transaction_restart_ip, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_traverse_all, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, 
btree_id ) __field(u64, pos_inode ) @@ -642,7 +629,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; @@ -650,8 +637,8 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, __entry->pos_snapshot = pos->snapshot; ), - TP_printk("%ps %pS btree %u pos %llu:%llu:%u", - (void *) __entry->trans_ip, + TP_printk("%s %pS btree %u pos %llu:%llu:%u", + __entry->trans_fn, (void *) __entry->caller_ip, __entry->btree_id, __entry->pos_inode, @@ -660,63 +647,63 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); TRACE_EVENT(trans_restart_would_deadlock, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, bool in_traverse_all, unsigned reason, @@ -726,12 +713,12 @@ TRACE_EVENT(trans_restart_would_deadlock, enum btree_id want_btree_id, unsigned want_iter_type, struct bpos *want_pos), - TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, + TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, have_btree_id, have_iter_type, have_pos, want_btree_id, want_iter_type, want_pos), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, in_traverse_all ) __field(u8, reason ) @@ -749,7 +736,7 @@ TRACE_EVENT(trans_restart_would_deadlock, ), TP_fast_assign( - 
__entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->in_traverse_all = in_traverse_all; __entry->reason = reason; @@ -767,8 +754,8 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot = want_pos->snapshot; ), - TP_printk("%pS %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", - (void *) __entry->trans_ip, + TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", + __entry->trans_fn, (void *) __entry->caller_ip, __entry->in_traverse_all, __entry->reason, @@ -785,39 +772,40 @@ TRACE_EVENT(trans_restart_would_deadlock, ); TRACE_EVENT(trans_restart_would_deadlock_write, - TP_PROTO(unsigned long trans_ip), - TP_ARGS(trans_ip), + TP_PROTO(const char *trans_fn), + TP_ARGS(trans_fn), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ), - TP_printk("%ps", (void *) __entry->trans_ip) + TP_printk("%s", __entry->trans_fn) ); TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, unsigned long bytes), - TP_ARGS(trans_ip, caller_ip, bytes), + TP_ARGS(trans_fn, caller_ip, bytes), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(unsigned long, bytes ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->bytes = bytes; ), - TP_printk("%pS %pS bytes %lu", - (void *) __entry->trans_ip, + TP_printk("%s %pS bytes %lu", + __entry->trans_fn, (void *) __entry->caller_ip, __entry->bytes) ); -- cgit From 57cfdd8b54b945fe80191767e36595b46893e5e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 18:24:55 -0500 Subject: bcachefs: BTREE_ITER_FILTER_SNAPSHOTS is selected automatically It doesn't have to be specified - this patch deletes the two instances where it was. 
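For illustration only, a minimal sketch of how a flag such as BTREE_ITER_FILTER_SNAPSHOTS can be selected automatically in the iterator-init path; the predicate btree_type_has_snapshots() and the exact flag handling are assumptions for this sketch, not necessarily how bcachefs implements it:

static inline unsigned sketch_iter_flags(enum btree_id btree_id, unsigned flags)
{
	/*
	 * Illustrative: snapshot filtering is implied whenever the btree's
	 * keys carry a snapshot field, so callers never pass it explicitly.
	 */
	if (btree_type_has_snapshots(btree_id))
		flags |= BTREE_ITER_FILTER_SNAPSHOTS;
	return flags;
}

With something along these lines in the init path, the two call sites changed below can drop the explicit flag without changing behaviour.
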
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 28bbbac5cd67..5fce958bafc9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1005,7 +1005,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + BTREE_ITER_SLOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f172da922904..218934b4e19b 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2326,7 +2326,7 @@ retry: bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; -- cgit From 03ea3962ab99adf0cf7de9949716e6baeda230f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 19:05:08 -0500 Subject: bcachefs: Log & error message improvements - Add a shim uuid_unparse_lower() in the kernel, since %pU doesn't work in userspace - We don't need to print the bcachefs: or the filesystem name prefix in userspace - Improve a few error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/btree_io.c | 12 ++++++++---- fs/bcachefs/btree_key_cache.c | 3 +-- fs/bcachefs/checksum.c | 25 +++++++++++++------------ fs/bcachefs/recovery.c | 9 +++------ fs/bcachefs/super.c | 2 +- fs/bcachefs/util.h | 9 +++++++++ 7 files changed, 37 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2dd3a0a1943a..8ef874b3afbb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -223,8 +223,8 @@ #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) -#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) +#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif #define bch_info(c, fmt, ...) 
\ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 287c45253a33..b6551db03968 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -967,19 +967,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - b->written += sectors; - blacklisted = bch2_journal_seq_is_blacklisted(c, le64_to_cpu(i->journal_seq), true); btree_err_on(blacklisted && first, BTREE_ERR_FIXABLE, c, ca, b, i, - "first btree node bset has blacklisted journal seq"); + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, BTREE_ERR_FIXABLE, c, ca, b, i, - "found blacklisted bset in btree node with sectors_written"); + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); + + b->written += sectors; + if (blacklisted && !first) continue; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 13012f26a677..08df768fbebb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -209,7 +209,6 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; @@ -239,7 +238,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_ids[ck->key.btree_id], new_u64s); ret = -ENOMEM; goto err; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index fbe8603cfb30..a1d89923d361 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -407,16 +407,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, } #ifdef __KERNEL__ -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { - char key_description[60]; struct key *keyring_key; const struct user_key_payload *ukp; int ret; - snprintf(key_description, sizeof(key_description), - "bcachefs:%pUb", &sb->user_uuid); - keyring_key = request_key(&key_type_logon, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -436,16 +432,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #else #include -#include -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; - char key_description[60]; - char uuid[40]; - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcachefs:%s", uuid); key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); @@ -459,6 +449,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #endif +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcachefs:%s", uuid); + + return __bch2_request_key(key_description, key); +} + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 
d11457c229ac..7003cf77fdcd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -530,10 +530,8 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); - if (keys->nr) { - bch_verbose(c, "starting journal replay, %zu keys", keys->nr); + if (keys->nr) replay_now_at(j, keys->journal_seq_base); - } for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; @@ -901,7 +899,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked inode; @@ -915,7 +912,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) goto err; if (!bkey_is_inode(k.k)) { - bch_err(c, "root inode not found"); + bch_err(trans->c, "root inode not found"); ret = -ENOENT; goto err; } @@ -1138,7 +1135,7 @@ use_clean: if (c->opts.norecovery) goto out; - bch_verbose(c, "starting journal replay"); + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; ret = bch2_journal_replay(c); if (ret) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7b7902fbdcc6..47de774d18b8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -745,7 +745,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + uuid_unparse_lower(c->sb.user_uuid.b, c->name); /* Compat: */ if (sb->version <= bcachefs_metadata_version_inode_v2 && diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 969139fef086..fbe5b710e9c5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -764,4 +764,13 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%plU", uuid); +} +#else +#include +#endif + #endif /* _BCACHEFS_UTIL_H */ -- cgit From 9b6e2f1e7036d639ca07434fdb27a739b37beb76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 19:41:23 -0500 Subject: Revert "bcachefs: Delete some obsolete journal_seq_blacklist code" This reverts commit f95b61228efd04c9c158123da5827c96e9773b29. It turns out, we're seeing filesystems in the wild end up with blacklisted btree node bsets - this should not be happening, and until we understand why and fix it we need to keep this code around. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal_seq_blacklist.c | 78 +++++++++++++++++++++++++++++++++++++ fs/bcachefs/journal_seq_blacklist.h | 2 + fs/bcachefs/recovery.c | 22 +++++++---- fs/bcachefs/super.c | 5 +++ 5 files changed, 100 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8ef874b3afbb..4ebaefd408a4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -755,6 +755,7 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e10b2c7c7bae..3cc63fc202ab 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -241,3 +241,81 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .validate = bch2_sb_journal_seq_blacklist_validate, .to_text = bch2_sb_journal_seq_blacklist_to_text }; + +void bch2_blacklist_entries_gc(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter iter; + struct btree *b; + + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); + if (ret) + return; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + if (!bl) + goto out; + + nr = blacklist_nr_entries(bl); + dst = bl->start; + + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } + + new_nr = dst - bl->start; + + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index b4f876a04586..afb886ec8e25 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,4 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; +void bch2_blacklist_entries_gc(struct work_struct *); + #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7003cf77fdcd..b818093eab39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1065,6 +1065,16 @@ use_clean: if (ret) goto err; + /* + * After an unclean shutdown, skip then next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); @@ -1210,14 +1220,6 @@ use_clean: } mutex_lock(&c->sb_lock); - /* - * With journal replay done, we can clear the journal seq blacklist - * table: - */ - BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - if (le16_to_cpu(c->sb.version_min) >= bcachefs_metadata_version_btree_ptr_sectors_written) - bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); - if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1259,6 +1261,10 @@ use_clean: bch_info(c, "scanning for old btree nodes done"); } + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); + ret = 0; out: set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 47de774d18b8..55bb263a0906 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -535,6 +535,8 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_STOPPING, &c->flags); + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -698,6 +700,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); -- cgit From 9714baaa52d63416d1f7577b630831fc885bfa1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 19:45:39 -0500 Subject: bcachefs: Fix an uninitialized variable Only userspace builds were complaining about it, oddly enough. 
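As a minimal, self-contained illustration of the bug class fixed below (this is not the bcachefs code): when an early goto skips the statement that would assign ret, the value logged and returned at the out label is indeterminate unless the declaration carries an initializer.

#include <stdio.h>

static int read_device(int nr)
{
	int ret = 0;	/* the fix: without "= 0", the early-exit path below
			 * would read an indeterminate value */

	if (!nr)
		goto out;	/* nothing to read on this device */

	ret = 42;		/* stand-in for the actual read work */
out:
	printf("read done, ret %i\n", ret);
	return ret;
}

int main(void)
{
	return read_device(0) != 0;
}
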
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c659a5a95b63..4602f581198e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -898,7 +898,7 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; - int ret; + int ret = 0; if (!ja->nr) goto out; -- cgit From 4e08446db05427ad0972eba58d6447b21c1ca7e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 18:35:00 -0500 Subject: bcachefs: Fix bch2_check_fix_ptrs() The repair for for btree_ptrs was saying one thing and doing another - fortunately, that code can just be deleted. Also, when we update a btree node pointer, we also have to update node in memery, if it exists in the btree node cache - this fixes bch2_check_fix_ptrs() to do that. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 62 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d7de00af81c9..8c60f15fc63e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -155,6 +155,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } +static void bch2_btree_node_update_key_early(struct bch_fs *c, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -524,19 +552,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, } } - if (fsck_err_on(data_type == BCH_DATA_btree && - g->mark.gen != p.ptr.gen, c, - "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", @@ -576,7 +591,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) do_update = true; - if (p.ptr.gen != g->mark.gen) + if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) continue; if (fsck_err_on(g->mark.data_type && @@ -691,16 +706,19 @@ found: } ret = bch2_journal_key_insert_take(c, btree_id, level, new); - - if (ret) + if (ret) { kfree(new); - else { - bch2_bkey_val_to_text(&PBUF(buf), c, *k); - bch_info(c, "updated %s", buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf); - *k = 
bkey_i_to_s_c(new); + return ret; } + + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + + bch2_bkey_val_to_text(&PBUF(buf), c, *k); + bch_info(c, "updated %s", buf); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf); + *k = bkey_i_to_s_c(new); } fsck_err: return ret; -- cgit From 8f11548edbccc316939dddf7a52d0aa8151a5ba6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Jan 2022 23:16:15 -0500 Subject: bcachefs: Improve path for when btree_gc needs another pass btree_gc sometimes needs another pass when it corrects bucket generation numbers or data types - when it finds multiple pointers of different data types to the same bucket, it may want to keep the second one it found. When this happens, we now clear out bucket sector counts _without_ resetting the bucket generation/data types that we already found, instead of resetting them to what we have in the alloc btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 150 ++++++++++++++++++++++++++++++------------------- 1 file changed, 92 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8c60f15fc63e..e92769e010c1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -604,8 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (data_type == BCH_DATA_btree) { g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; } @@ -1327,12 +1327,6 @@ static int bch2_gc_start(struct bch_fs *c, percpu_down_write(&c->mark_lock); - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); - for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -1360,6 +1354,27 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } +static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = __bucket_array(ca, true); + struct bucket *g; + + for_each_bucket(g, buckets) { + if (metadata_only && + (g->mark.data_type == BCH_DATA_user || + g->mark.data_type == BCH_DATA_cached || + g->mark.data_type == BCH_DATA_parity)) + continue; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; + } + }; +} + static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1430,6 +1445,55 @@ fsck_err: return ret; } +static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -ENOMEM; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + 
return ret; +} + +static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct genradix_iter iter; + struct reflink_gc *r; + + genradix_for_each(&c->reflink_gc_table, iter, r) + r->refcount = 0; +} + static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1493,43 +1557,10 @@ fsck_err: return ret; } -static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, + bool metadata_only) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - int ret = 0; - - if (metadata_only) - return 0; - - bch2_trans_init(&trans, c, 0, 0); - c->reflink_gc_nr = 0; - - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -ENOMEM; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; + genradix_free(&c->gc_stripes); } /** @@ -1565,11 +1596,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) /* flush interior btree updates: */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); -again: + ret = bch2_gc_start(c, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; +again: + gc_pos_set(c, gc_phase(GC_PHASE_START)); bch2_mark_superblocks(c); @@ -1607,25 +1640,26 @@ again: if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || (!iter && bch2_test_restart_gc)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto out; + } + /* * XXX: make sure gens we fixed got saved */ - if (iter++ <= 2) { - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - - percpu_down_write(&c->mark_lock); - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - goto again; - } + bch2_gc_stripes_reset(c, initial, metadata_only); + bch2_gc_alloc_reset(c, initial, metadata_only); + bch2_gc_reflink_reset(c, initial, metadata_only); - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); + goto again; } out: if (!ret) { -- cgit From f443fa66c98f012412b677afc4f7096ed24108de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Feb 2022 18:15:35 -0500 Subject: bcachefs: Also print out in-memory gen on stale dirty pointer We're trying to track down a bug that shows itself as newly-created extents having stale dirty pointers - possibly due to the in memory gen and the btree gen being inconsistent. This patch changes the error message to also print out the in memory bucket gen when this happens. 
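For background, a hedged, standalone sketch of the stale-dirty-pointer check this message comes from (simplified; the struct and field names here are illustrative, the real check is check_bucket_ref() in the diff below): a dirty pointer is stale when its recorded generation no longer matches the bucket's current generation, and printing both the btree copy and the in-memory copy of the bucket gen makes it visible when those two sources disagree.

#include <stdio.h>

struct ptr_sketch    { unsigned gen; int cached; };
struct bucket_sketch { unsigned btree_gen; unsigned mem_gen; };

static int check_ptr(const struct bucket_sketch *b, const struct ptr_sketch *p)
{
	if (b->btree_gen != p->gen && !p->cached) {
		/* stale dirty pointer: the bucket was reused (gen bumped)
		 * while a non-cached pointer still references it */
		fprintf(stderr,
			"gen %u (mem gen %u): stale dirty ptr (gen %u)\n",
			b->btree_gen, b->mem_gen, p->gen);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct bucket_sketch b = { .btree_gen = 3, .mem_gen = 2 };
	struct ptr_sketch    p = { .gen = 2, .cached = 0 };

	return check_ptr(&b, &p) ? 1 : 0;
}
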
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 35ab5a5d8183..b80ab1ed22f7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -699,49 +699,51 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 bucket_data_type, + u8 b_gen, u8 bucket_data_type, u16 dirty_sectors, u16 cached_sectors) { - size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; char buf[200]; - if (gen_after(ptr->gen, bucket_gen)) { + if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, + *bucket_gen(ca, bucket_nr), bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != ptr->gen) + if (b_gen != ptr->gen) return 1; if (bucket_data_type && ptr_data_type && @@ -749,7 +751,7 @@ static int check_bucket_ref(struct bch_fs *c, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); @@ -760,7 +762,7 @@ static int check_bucket_ref(struct bch_fs *c, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], bucket_sectors, sectors, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -- cgit From 21aec962dfec2df11694350e5b2d3a9a9c298e7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Jan 2022 22:32:09 -0500 Subject: bcachefs: New data structure for buckets waiting on journal commit Implement a hash table, using cuckoo hashing, for empty buckets that are waiting on a journal commit before they can be reused. 
This replaces the journal_seq field of bucket_mark, and is part of eventually getting rid of the in memory bucket array. We may need to make bch2_bucket_needs_journal_commit() lockless, pending profiling and testing. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_background.c | 62 ++++----- fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/btree_update_leaf.c | 28 ++-- fs/bcachefs/buckets.c | 80 +++--------- fs/bcachefs/buckets.h | 8 -- fs/bcachefs/buckets_types.h | 9 -- fs/bcachefs/buckets_waiting_for_journal.c | 166 ++++++++++++++++++++++++ fs/bcachefs/buckets_waiting_for_journal.h | 15 +++ fs/bcachefs/buckets_waiting_for_journal_types.h | 23 ++++ fs/bcachefs/journal_io.c | 4 - fs/bcachefs/super.c | 3 + 12 files changed, 279 insertions(+), 124 deletions(-) create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index a2769a85b029..65eeab56cb4b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -16,6 +16,7 @@ bcachefs-y := \ btree_update_interior.o \ btree_update_leaf.o \ buckets.o \ + buckets_waiting_for_journal.o \ chardev.o \ checksum.o \ clock.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ab7d972aac3a..bc5053ebe18f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -9,6 +9,7 @@ #include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "ec.h" @@ -561,8 +562,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, * keys when there's only a small difference, so that we can * keep sequential buckets together: */ - return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| - (bucket_gc_gen(g) >> 4); + return bucket_gc_gen(g) >> 4; } } @@ -611,6 +611,14 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) if (!bch2_can_invalidate_bucket(ca, b, m)) continue; + if (!m.data_type && + bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + last_seq_ondisk, + ca->dev_idx, b)) { + ca->buckets_waiting_on_journal++; + continue; + } + if (e.nr && e.bucket + e.nr == b && e.key == key) { e.nr++; } else { @@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; + ca->buckets_waiting_on_journal = 0; find_reclaimable_buckets_lru(c, ca); @@ -658,28 +667,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } -/* - * returns sequence number of most recent journal entry that updated this - * bucket: - */ -static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -{ - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - return bucket_seq; - } else { - return 0; - } -} - static int bucket_invalidate_btree(struct btree_trans *trans, struct bch_dev *ca, u64 b) { @@ -745,9 +732,10 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, * gen in memory here, the incremented gen will be updated in the btree * by bch2_trans_mark_pointer(): */ - if (!m.cached_sectors && - 
!bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { - BUG_ON(m.data_type); + if (!m.data_type && + !bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + ca->dev_idx, b)) { bucket_cmpxchg(g, m, m.gen++); *bucket_gen(ca, b) = m.gen; percpu_up_read(&c->mark_lock); @@ -781,13 +769,6 @@ out: if (!top->nr) heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - - /* - * Make sure we flush the last journal entry that updated this - * bucket (i.e. deleting the last reference) before writing to - * this bucket again: - */ - *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); } else { size_t b2; @@ -954,8 +935,14 @@ static int bch2_allocator_thread(void *arg) gc_count = c->gc_count; nr = find_reclaimable_buckets(c, ca); - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); + if (!nr && ca->buckets_waiting_on_journal) { + ret = bch2_journal_flush(&c->journal); + if (ret) + goto stop; + } else if (nr < (ca->mi.nbuckets >> 6) && + ca->buckets_waiting_on_journal >= nr / 2) { + bch2_journal_flush_async(&c->journal, NULL); + } if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ca->inc_gen_really_needs_gc) && @@ -963,6 +950,9 @@ static int bch2_allocator_thread(void *arg) atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } + + trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, + ca->inc_gen_really_needs_gc); } ret = bch2_invalidate_buckets(c, ca); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4ebaefd408a4..3d1a6773393c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -355,6 +355,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" #include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" #include "clock_types.h" #include "ec_types.h" #include "journal_types.h" @@ -482,6 +483,7 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; + size_t buckets_waiting_on_journal; enum allocator_states allocator_state; @@ -777,6 +779,8 @@ struct bch_fs { struct mutex write_points_hash_lock; unsigned write_points_nr; + struct buckets_waiting_for_journal buckets_waiting_for_journal; + /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index de33491f2535..24de8604740c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -396,10 +396,11 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static noinline void bch2_trans_mark_gc(struct btree_trans *trans) +static noinline int bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + int ret = 0; trans_for_each_update(trans, i) { /* @@ -408,10 +409,15 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) - bch2_mark_update(trans, i->path, i->k, - i->flags|BTREE_TRIGGER_GC); + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = bch2_mark_update(trans, i->path, i->k, + i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } } + + return ret; } static inline int @@ -510,11 +516,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return BTREE_INSERT_NEED_MARK_REPLICAS; trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->path, i->k, i->flags); + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & 
(1U << i->bkey_type)) { + ret = bch2_mark_update(trans, i->path, i->k, i->flags); + if (ret) + return ret; + } - if (unlikely(c->gc_pos.phase)) - bch2_trans_mark_gc(trans); + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_mark_gc(trans); + if (ret) + return ret; + } trans_for_each_update(trans, i) do_btree_insert_one(trans, i); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b80ab1ed22f7..f7a750aff03f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -11,6 +11,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "ec.h" #include "error.h" #include "inode.h" @@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, } } -/* - * Clear journal_seq_valid for buckets for which it's not needed, to prevent - * wraparound: - */ -void bch2_bucket_seq_cleanup(struct bch_fs *c) -{ - u64 journal_seq = atomic64_read(&c->journal.seq); - u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; - struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - struct bucket_mark m; - unsigned i; - - if (journal_seq - c->last_bucket_seq_cleanup < - (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) - return; - - c->last_bucket_seq_cleanup = journal_seq; - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) { - bucket_cmpxchg(g, m, ({ - if (!m.journal_seq_valid || - bucket_needs_journal_commit(m, last_seq_ondisk)) - break; - - m.journal_seq_valid = 0; - })); - } - up_read(&ca->bucket_lock); - } -} - void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; @@ -576,16 +540,28 @@ static int bch2_mark_alloc(struct btree_trans *trans, v->journal_seq = cpu_to_le64(new_u.journal_seq); } - ca = bch_dev_bkey_exists(c, new.k->p.inode); + if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new_u.dev, new_u.bucket, + new_u.journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + ca = bch_dev_bkey_exists(c, new_u.dev); - if (new.k->p.offset >= ca->mi.nbuckets) + if (new_u.bucket >= ca->mi.nbuckets) return 0; percpu_down_read(&c->mark_lock); if (!gc && new_u.gen != old_u.gen) - *bucket_gen(ca, new.k->p.offset) = new_u.gen; + *bucket_gen(ca, new_u.bucket) = new_u.gen; - g = __bucket(ca, new.k->p.offset, gc); + g = __bucket(ca, new_u.bucket, gc); old_m = bucket_cmpxchg(g, m, ({ m.gen = new_u.gen; @@ -593,11 +569,6 @@ static int bch2_mark_alloc(struct btree_trans *trans, m.dirty_sectors = new_u.dirty_sectors; m.cached_sectors = new_u.cached_sectors; m.stripe = new_u.stripe != 0; - - if (journal_seq) { - m.journal_seq_valid = 1; - m.journal_seq = journal_seq; - } })); bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); @@ -625,7 +596,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, return ret; } - trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), + trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), old_m.cached_sectors); } @@ -775,9 +746,10 @@ static int check_bucket_ref(struct bch_fs *c, static int mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c k, unsigned ptr_idx, - u64 journal_seq, unsigned flags) + unsigned flags) { struct bch_fs *c = trans->c; + u64 journal_seq = trans->journal_res.seq; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 
unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; @@ -818,11 +790,6 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (data_type) new.data_type = data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - new.stripe = true; })); @@ -894,11 +861,6 @@ static int bch2_mark_pointer(struct btree_trans *trans, new.data_type = bucket_data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - if (flags & BTREE_TRIGGER_NOATOMIC) { g->_mark = new; break; @@ -1119,7 +1081,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, memset(m->block_sectors, 0, sizeof(m->block_sectors)); for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); + ret = mark_stripe_bucket(trans, new, i, flags); if (ret) return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6eeb95068b3b..4b5376684d2c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -159,13 +159,6 @@ static inline bool is_available_bucket(struct bucket_mark mark) return !mark.dirty_sectors && !mark.stripe; } -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -} - /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); @@ -240,7 +233,6 @@ bch2_fs_usage_read_short(struct bch_fs *); /* key/bucket marking: */ -void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 18bca269b750..24139831226d 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -15,18 +15,9 @@ struct bucket_mark { u8 gen; u8 data_type:3, owned_by_allocator:1, - journal_seq_valid:1, stripe:1; u16 dirty_sectors; u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket can't be - * reused until the journal sequence number written to disk is >= the - * bucket's journal sequence number: - */ - u16 journal_seq; }; }; }; diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 index 000000000000..f3774e30b5cd --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets_waiting_for_journal.h" +#include +#include + +static inline struct bucket_hashed * +bucket_hash(struct buckets_waiting_for_journal_table *t, + unsigned hash_seed_idx, u64 dev_bucket) +{ + return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); +} + +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) +{ + unsigned i; + + t->bits = bits; + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) + get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); + memset(t->d, 0, sizeof(t->d[0]) << t->bits); +} + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket) +{ + struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; + bool ret = false; + unsigned i; + + mutex_lock(&b->lock); + t = b->t; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + 
struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + + if (h->dev_bucket == dev_bucket) { + ret = h->journal_seq > flushed_seq; + break; + } + } + + mutex_unlock(&b->lock); + + return ret; +} + +static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, + struct bucket_hashed *new, + u64 flushed_seq) +{ + struct bucket_hashed *last_evicted = NULL; + unsigned tries, i; + + for (tries = 0; tries < 10; tries++) { + struct bucket_hashed *old, *victim = NULL; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + old = bucket_hash(t, i, new->dev_bucket); + + if (old->dev_bucket == new->dev_bucket || + old->journal_seq <= flushed_seq) { + *old = *new; + return true; + } + + if (last_evicted != old) + victim = old; + } + + /* hashed to same slot 3 times: */ + if (!victim) + break; + + /* Failed to find an empty slot: */ + swap(*new, *victim); + last_evicted = victim; + } + + return false; +} + +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket, + u64 journal_seq) +{ + struct buckets_waiting_for_journal_table *t, *n; + struct bucket_hashed tmp, new = { + .dev_bucket = (u64) dev << 56 | bucket, + .journal_seq = journal_seq, + }; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + int ret = 0; + + mutex_lock(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) + goto out; + + t = b->t; + size = 1UL << t->bits; + for (i = 0; i < size; i++) + nr_elements += t->d[i].journal_seq > flushed_seq; + + new_bits = t->bits + (nr_elements * 3 > size); + + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + +retry_rehash: + nr_rehashes++; + bucket_table_init(n, new_bits); + + tmp = new; + BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); + + for (i = 0; i < 1UL << t->bits; i++) { + if (t->d[i].journal_seq <= flushed_seq) + continue; + + tmp = t->d[i]; + if (!bucket_table_insert(n, &tmp, flushed_seq)) + goto retry_rehash; + } + + b->t = n; + kvfree(t); + + pr_debug("took %zu rehashes, table at %zu/%zu elements", + nr_rehashes, nr_elements, 1UL << b->t->bits); +out: + mutex_unlock(&b->lock); + + return ret; +} + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + kvfree(b->t); +} + +#define INITIAL_TABLE_BITS 3 + +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + mutex_init(&b->lock); + + b->t = kvmalloc(sizeof(*b->t) + + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); + if (!b->t) + return -ENOMEM; + + bucket_table_init(b->t, INITIAL_TABLE_BITS); + return 0; +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 index 000000000000..d2ae19cbe18c --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H +#define _BUCKETS_WAITING_FOR_JOURNAL_H + +#include "buckets_waiting_for_journal_types.h" + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64); +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64, u64); + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); + +#endif /* 
_BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 index 000000000000..e593db061d81 --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H +#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H + +#include + +struct bucket_hashed { + u64 dev_bucket; + u64 journal_seq; +}; + +struct buckets_waiting_for_journal_table { + unsigned bits; + u64 hash_seeds[3]; + struct bucket_hashed d[]; +}; + +struct buckets_waiting_for_journal { + struct mutex lock; + struct buckets_waiting_for_journal_table *t; +}; + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4602f581198e..815310e2426f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1666,13 +1666,9 @@ retry_alloc: } } - bch2_bucket_seq_cleanup(c); - continue_at(cl, do_journal_write, c->io_complete_wq); return; no_io: - bch2_bucket_seq_cleanup(c); - continue_at(cl, journal_write_done, c->io_complete_wq); return; err: diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 55bb263a0906..3094eb1e3406 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -16,6 +16,7 @@ #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" @@ -475,6 +476,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); @@ -818,6 +820,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c); bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: bch2_fs_encryption_init(c) ?: -- cgit From a74313481ae24cd301b79b5fca3161079e739a21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jan 2022 00:04:56 -0500 Subject: bcachefs: Fix check_pos_snapshot_overwritten for !snapshots It shouldn't run if the btree being checked doesn't have snapshots. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 24de8604740c..c29815e6183d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1088,6 +1088,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bkey_s_c k; int ret; + if (!btree_type_has_snapshots(id)) + return 0; + if (!snapshot_t(c, pos.snapshot)->children[0]) return 0; -- cgit From acc3e09b67a350bc9ce6dc9d0d96cc398a850e7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jan 2022 21:38:08 -0500 Subject: bcachefs: Rename data_op_data_progress -> data_jobs Mild refactoring. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 5 ++++- fs/bcachefs/sysfs.c | 36 ++++++++++++------------------------ 2 files changed, 16 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 46a0875135d5..4791e5099d93 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -231,8 +231,11 @@ static int bch2_copygc(struct bch_fs *c) buckets_to_move = h->used; - if (!buckets_to_move) + if (!buckets_to_move) { + bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", + sectors_reserved); return 0; + } eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6d1596322ee2..ed9a095063e8 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -192,7 +192,7 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); -read_attribute(data_op_data_progress); +read_attribute(data_jobs); #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); @@ -230,32 +230,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) return nr ? div64_u64(sectors, nr) : 0; } -static long stats_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_move_stats *stats) -{ - pr_buf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - pr_buf(out, "%s", "\n"); - - return 0; -} - static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { long ret = 0; - struct bch_move_stats *iter; + struct bch_move_stats *stats; mutex_lock(&c->data_progress_lock); - - if (list_empty(&c->data_progress_list)) - pr_buf(out, "%s", "no progress to report\n"); - else - list_for_each_entry(iter, &c->data_progress_list, list) { - stats_to_text(out, c, iter); - } + list_for_each_entry(stats, &c->data_progress_list, list) { + pr_buf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + pr_buf(out, "%s", "\n"); + } mutex_unlock(&c->data_progress_lock); return ret; @@ -463,7 +451,7 @@ SHOW(bch2_fs) return out.pos - buf; } - if (attr == &sysfs_data_op_data_progress) { + if (attr == &sysfs_data_jobs) { data_progress_to_text(&out, c); return out.pos - buf; } @@ -616,7 +604,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_data_op_data_progress, + &sysfs_data_jobs, &sysfs_internal_uuid, NULL -- cgit From 7d782ae447ddbbadf02bb320691ee9cb92f61790 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jan 2022 01:20:12 -0500 Subject: bcachefs: Refactor trigger code This breaks bch2_trans_commit_run_triggers() up into multiple functions, and deletes a bit of duplication - prep work for triggers on alloc keys, which will need to run last. 
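For reference, the resulting structure looks roughly like this - a condensed sketch of the code added below, with error handling and fixpoint bookkeeping abbreviated, not a verbatim copy:

    /* bch2_trans_commit_run_triggers(): one pass per btree id */
    for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++)
            run_btree_triggers(trans, btree_id, btree_id_start);

    /*
     * run_btree_triggers(): insert triggers run before overwrite triggers,
     * and each phase loops until no trigger appends further updates:
     */
    for (overwrite = 0; overwrite < 2; overwrite++)
            do {
                    trans_trigger_run = false;

                    for (i = btree_id_start;
                         i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
                         i++) {
                            ret = run_one_trigger(trans, i, overwrite);
                            if (ret < 0)
                                    return ret;
                            if (ret)
                                    trans_trigger_run = true;
                    }
            } while (trans_trigger_run);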
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 144 +++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c29815e6183d..e0e99018e5a1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -857,28 +857,63 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) { struct bkey _deleted = KEY(0, 0, 0); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; struct bkey unpacked; - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - bool trans_trigger_run; - unsigned btree_id = 0; int ret = 0; - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!overwrite) { + if (i->insert_trigger_run) + return 0; + + BUG_ON(i->overwrite_trigger_run); + i->insert_trigger_run = true; + } else { + if (i->overwrite_trigger_run) + return 0; + + BUG_ON(!i->insert_trigger_run); + i->overwrite_trigger_run = true; + } + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + if (overwrite) { + ret = bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|i->flags); + } else if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|i->flags); + } + + if (ret == -EINTR) + trace_trans_restart_mark(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + return ret ?: 1; +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 0; overwrite < 2; overwrite++) { /* * Running triggers will append more updates to the list of updates as @@ -890,66 +925,39 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) for (i = btree_id_start; i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; i++) { - if (i->insert_trigger_run || - (i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - continue; - - BUG_ON(i->overwrite_trigger_run); - - i->insert_trigger_run = true; - trans_trigger_run = true; - - old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; - - if (old.k->type == i->k->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); - } else { - ret 
= bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|i->flags); - } - - if (ret == -EINTR) - trace_trans_restart_mark(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - if (ret) + ret = run_one_trigger(trans, i, overwrite); + if (ret < 0) return ret; + if (ret) + trans_trigger_run = true; } } while (trans_trigger_run); + } - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->overwrite_trigger_run || - (i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - continue; - - BUG_ON(!i->insert_trigger_run); - - i->overwrite_trigger_run = true; - trans_trigger_run = true; + return 0; +} - old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; - ret = bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|i->flags); + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; - if (ret == -EINTR) - trace_trans_restart_mark(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - if (ret) - return ret; - } - } while (trans_trigger_run); + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; } trans_for_each_update(trans, i) -- cgit From b674bfadd86aa7f815156f15cc01af95440380f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jan 2022 03:39:54 -0500 Subject: bcachefs: Use BTREE_INSERT_USE_RESERVE in btree_update_key() bch2_btree_update_key() is used in the btree node write path - before delivering the completion we have to update the parent pointer with the number of sectors written. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29dda2352afd..0e7644a3a436 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1937,6 +1937,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_JOURNAL_RESERVED); if (ret) -- cgit From c4ecf802fbfae032730caf40fd74fb27c057a916 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jan 2022 19:07:32 -0500 Subject: bcachefs: Fix an error path in bch2_snapshot_node_create() Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 8aeb2e417a15..012d8e8c52c4 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -522,7 +522,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n = bch2_trans_kmalloc(trans, sizeof(*n)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, k); -- cgit From 7f6ff935f74e8bb3257314f7e31182b6ad96198a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Dec 2021 13:50:50 -0500 Subject: bcachefs: New snapshot unit test This still needs to be expanded more, but this adds a basic test for BTREE_ITER_FILTER_SNAPSHOTS. Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 8 +++--- fs/bcachefs/subvolume.h | 4 +++ fs/bcachefs/tests.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 012d8e8c52c4..69603327d93d 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -456,10 +456,10 @@ err: return ret; } -static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) { struct btree_iter iter; struct bkey_i_snapshot *n; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index e4c3fdcdf22f..4abe53df2788 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -122,6 +122,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 0247309a25e6..1f7f2533e544 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "journal_reclaim.h" +#include "subvolume.h" #include "tests.h" #include "linux/kthread.h" @@ -461,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) __test_extent_overwrite(c, 32, 64, 32, 128); } +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ 
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie cookie; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); + k = bch2_btree_iter_peek(&iter); + + BUG_ON(k.k->p.snapshot != U32_MAX); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_snapshots(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie cookie; + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_snapshot_node_create(&trans, U32_MAX, + snapids, + snapid_subvols, + 2)); + if (ret) + return ret; + + if (snapids[0] > snapids[1]) + swap(snapids[0], snapids[1]); + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { + bch_err(c, "err %i from test_snapshot_filter", ret); + return ret; + } + + return 0; +} + /* perf tests */ static u64 test_rand(void) @@ -789,8 +854,10 @@ static int btree_perf_test_thread(void *data) } ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); - if (ret) + if (ret) { + bch_err(j->c, "%ps: error %i", j->fn, ret); j->ret = ret; + } if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -843,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_snapshots); + if (!j.fn) { pr_err("unknown test %s", testname); return -EINVAL; -- cgit From bc82d08bae53b48ca64e204392f6d336fc9509a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jan 2022 22:59:58 -0500 Subject: bcachefs: Tracepoint improvements This improves the transaction restart tracepoints - adding distinct tracepoints for all the locations and reasons a transaction might have been restarted, and ensures that there's a tracepoint for every transaction restart. 
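Each restart site now reports its own reason; the pattern at a restart site ends up looking like this (lightly condensed from the btree_cache.c hunk below):

    if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
            trace_trans_restart_relock_parent_for_fill(trans->fn,
                            _THIS_IP_, btree_id, &path->pos);
            btree_trans_restart(trans);
            return ERR_PTR(-EINTR);
    }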
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++ fs/bcachefs/btree_iter.c | 33 +++++++++----- fs/bcachefs/btree_key_cache.c | 3 +- fs/bcachefs/trace.h | 102 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index cad5d28fed09..36b82df79fc2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -665,6 +665,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_trans_restart_relock_parent_for_fill(trans->fn, + _THIS_IP_, btree_id, &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -712,6 +714,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { + trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, + btree_id, &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bca677c02774..a2377150e29e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -166,19 +166,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, int want = __btree_lock_want(path, level); if (!is_btree_node(path, level)) - return false; + goto fail; if (race_fault()) - return false; + goto fail; if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { mark_btree_node_locked(path, level, want); return true; - } else { - return false; } +fail: + trace_btree_node_relock_fail(trans->fn, _RET_IP_, + path->btree_id, + &path->pos, + (unsigned long) b, + path->l[level].lock_seq, + is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); + return false; } bool bch2_btree_node_upgrade(struct btree_trans *trans, @@ -225,7 +231,7 @@ success: static inline bool btree_path_get_locks(struct btree_trans *trans, struct btree_path *path, - bool upgrade, unsigned long trace_ip) + bool upgrade) { unsigned l = path->level; int fail_idx = -1; @@ -427,6 +433,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, + path->btree_id, &path->pos); btree_trans_restart(trans); return false; } @@ -439,10 +447,13 @@ __flatten static bool bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - bool ret = btree_path_get_locks(trans, path, false, trace_ip); + bool ret = btree_path_get_locks(trans, path, false); - if (!ret) + if (!ret) { + trace_trans_restart_relock_path(trans->fn, trace_ip, + path->btree_id, &path->pos); btree_trans_restart(trans); + } return ret; } @@ -456,7 +467,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, path->locks_want = new_locks_want; - if (btree_path_get_locks(trans, path, true, _THIS_IP_)) + if (btree_path_get_locks(trans, path, true)) return true; /* @@ -484,7 +495,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, _THIS_IP_); + btree_path_get_locks(trans, linked, true); } return false; @@ -1955,7 +1966,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) { path->locks_want = locks_want; - btree_path_get_locks(trans, path, true, _THIS_IP_); + btree_path_get_locks(trans, path, true); } return path; @@ -2090,6 +2101,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) __bch2_btree_path_unlock(path); path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); btree_trans_restart(trans); ret = -EINTR; goto err; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 08df768fbebb..684919125b2f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -223,7 +223,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_transaction_restart_ip(trans->fn, _THIS_IP_); + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); ret = btree_trans_restart(trans); goto err; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 69d1f42fe0f8..d432c90a1491 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -346,6 +346,52 @@ TRACE_EVENT(btree_cache_scan, __entry->ret) ); +TRACE_EVENT(btree_node_relock_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned long node, + u32 iter_lock_seq, + u32 node_lock_seq), + TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) + __array(char, caller, 32 ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) 
+ __field(unsigned long, node ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + __entry->node = node; + __entry->iter_lock_seq = iter_lock_seq; + __entry->node_lock_seq = node_lock_seq; + ), + + TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + __entry->trans_fn, + __entry->caller, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->node, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + /* Garbage collection */ DEFINE_EVENT(btree_node, btree_gc_rewrite_node, @@ -621,7 +667,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_STRUCT__entry( __array(char, trans_fn, 24 ) - __field(unsigned long, caller_ip ) + __array(char, caller, 32 ) __field(u8, btree_id ) __field(u64, pos_inode ) __field(u64, pos_offset ) @@ -630,16 +676,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_fast_assign( strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; + snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; __entry->pos_offset = pos->offset; __entry->pos_snapshot = pos->snapshot; ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u", + TP_printk("%s %s btree %u pos %llu:%llu:%u", __entry->trans_fn, - (void *) __entry->caller_ip, + __entry->caller, __entry->btree_id, __entry->pos_inode, __entry->pos_offset, @@ -694,6 +740,54 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, TP_PROTO(const char *trans_fn, unsigned long caller_ip, -- cgit From a1e82d35f89793f6347945ab48d799ce1802df87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jan 2022 01:07:29 -0500 
Subject: bcachefs: Refactor bch2_btree_iter_peek() This splits bch2_btree_iter_peek() up into two functions: an inner function that handles BTREE_ITER_WITH_JOURNAL, BTREE_ITER_WITH_UPDATES, and iterating across leaf nodes, and an outer one that implements BTREE_ITER_FILTER_SNAPSHOTS. This is prep work for remembering a btree_path at our update position in BTREE_ITER_FILTER_SNAPSHOTS mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 97 +++++++++++++++++++++++++++++------------------- fs/bcachefs/btree_iter.h | 21 +++++++---- 2 files changed, 71 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a2377150e29e..29ca9410c86c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -697,9 +697,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && - iter->pos.snapshot != iter->snapshot); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -2252,21 +2249,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, @@ -2309,24 +2300,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } if (likely(k.k)) { - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } - break; } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ @@ -2339,6 +2312,56 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } } + iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT); + BUG_ON(!iter->path->nodes_locked); +out: + iter->path->should_be_locked = true; + + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + int ret; + + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { + k = __bch2_btree_iter_peek(iter, search_key); + if (!k.k || bkey_err(k)) + goto out; + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = 
bpos_successor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; + } + /* * iter->pos should be mononotically increasing, and always be equal to * the key we just returned - except extents can straddle iter->pos: @@ -2347,21 +2370,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = k.k->p; else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) +out: + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) iter->pos.snapshot = iter->snapshot; - iter->path = btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT); - BUG_ON(!iter->path->nodes_locked); -out: - iter->path->should_be_locked = true; + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) { + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_verify_ret(iter, k); - if (unlikely(ret)) - return bkey_s_c_err(ret); return k; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index abbde3666942..9bb1ef404bc9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -247,11 +247,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); -static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) - new_pos.snapshot = iter->snapshot; - iter->k.type = KEY_TYPE_deleted; iter->k.p.inode = iter->pos.inode = new_pos.inode; iter->k.p.offset = iter->pos.offset = new_pos.offset; @@ -259,6 +256,14 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.size = 0; } +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + __bch2_btree_iter_set_pos(iter, new_pos); +} + static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); @@ -320,7 +325,7 @@ static inline int bkey_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } -static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { return flags & BTREE_ITER_SLOTS @@ -341,7 +346,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct bkey_s_c k; while (btree_trans_too_many_iters(trans) || - (k = __bch2_btree_iter_peek(iter, flags), + (k = bch2_btree_iter_peek_type(iter, flags), bkey_err(k) == -EINTR)) bch2_trans_begin(trans); @@ -360,7 +365,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) @@ -372,7 +377,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for (; \ 
- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -- cgit From 1f2d9192502917a190ef9bbf7541960d129d30fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jan 2022 21:22:31 -0500 Subject: bcachefs: iter->update_path With BTREE_ITER_FILTER_SNAPSHOTS, we have to distinguish between the path where the key was found, and the path for inserting into the current snapshot. This adds a new field to struct btree_iter for saving a path for the current snapshot, and plumbs it through bch2_trans_update(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 66 +++++++++++++++++++++++++++++++++++++---- fs/bcachefs/btree_iter.h | 5 ++++ fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 6 ++++ fs/bcachefs/btree_update_leaf.c | 64 ++++++++++++++++++++------------------- 5 files changed, 105 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 29ca9410c86c..ae1628918c57 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -704,6 +704,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); bch2_btree_path_verify(trans, iter->path); } @@ -2311,13 +2313,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } } - - iter->path = btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT); - BUG_ON(!iter->path->nodes_locked); out: - iter->path->should_be_locked = true; - bch2_btree_iter_verify(iter); return k; @@ -2334,6 +2330,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bkey_s_c k; int ret; + if (iter->update_path) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + bch2_btree_iter_verify_entry_exit(iter); while (1) { @@ -2341,6 +2343,41 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (!k.k || bkey_err(k)) goto out; + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + (iter->flags & BTREE_ITER_INTENT) && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && + !iter->update_path) { + struct bpos pos = k.k->p; + + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + + iter->update_path = btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT); + + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } + /* * We can never have a key in a leaf node at POS_MAX, so * we don't have to check these successor() calls: @@ -2370,7 +2407,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = k.k->p; else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + + iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT); + 
BUG_ON(!iter->path->nodes_locked); out: + if (iter->update_path) { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } + iter->path->should_be_locked = true; + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) iter->pos.snapshot = iter->snapshot; @@ -2774,7 +2821,11 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->path) bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + if (iter->update_path) + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->path = NULL; + iter->update_path = NULL; } static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -2803,6 +2854,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->trans = trans; iter->path = NULL; + iter->update_path = NULL; iter->btree_id = btree_id; iter->min_depth = depth; iter->flags = flags; @@ -2848,6 +2900,8 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) *dst = *src; if (src->path) __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + if (src->update_path) + __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9bb1ef404bc9..c4fdfb382dcd 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -258,6 +258,11 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + if (unlikely(iter->update_path)) + bch2_path_put(iter->trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 794726c4efd7..9828bdd924af 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -276,6 +276,7 @@ static inline struct btree_path_level *path_l(struct btree_path *path) struct btree_iter { struct btree_trans *trans; struct btree_path *path; + struct btree_path *update_path; enum btree_id btree_id:4; unsigned min_depth:4; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7d16c35112f3..c8e1f43f71e3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -73,8 +73,14 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, bool); +int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + +int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); + void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e0e99018e5a1..41403942133a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1181,10 +1181,10 @@ static noinline int extent_back_merge(struct btree_trans *trans, return 0; } -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i 
*insert, - enum btree_update_flags flags) +int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) { struct btree_iter iter, update_iter; struct bpos start = bkey_start_pos(&insert->k); @@ -1308,13 +1308,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - bch2_trans_copy_iter(&update_iter, &iter); - update_iter.pos = update->k.p; - ret = bch2_trans_update(trans, &update_iter, update, + ret = bch2_trans_update_by_path(trans, iter.path, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| flags); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) goto err; goto out; @@ -1385,26 +1381,23 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, +int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, struct bkey_i *k, enum btree_update_flags flags) { struct btree_insert_entry *i, n; - BUG_ON(!iter->path->should_be_locked); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); + BUG_ON(!path->should_be_locked); BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); + BUG_ON(bpos_cmp(k->k.p, path->pos)); n = (struct btree_insert_entry) { .flags = flags, - .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->path->level, - .cached = iter->flags & BTREE_ITER_CACHED, - .path = iter->path, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, .k = k, .ip_allocated = _RET_IP_, }; @@ -1415,16 +1408,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter btree_insert_entry_cmp(i - 1, i) >= 0); #endif - if (bkey_deleted(&n.k->k) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - n.k->k.type = KEY_TYPE_whiteout; - } - /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: @@ -1455,10 +1438,29 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter i - trans->updates, n); __btree_path_get(n.path, true); - return 0; } +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path, + k, flags); +} + void bch2_trans_commit_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { -- cgit From d5030164ec53ab212f6acaff8938b352c654b67e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Dec 2021 18:25:23 -0500 Subject: bcachefs: Simplify bch2_inode_delete_keys() Had a bug report that implies bch2_inode_delete_keys() returned -EINTR before it completed, so this patch simplifies it and makes 
the flow control a little more conventional. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 57 +++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 104575734b96..79ee9ca2f1d0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -585,62 +585,49 @@ found_slot: static int bch2_inode_delete_keys(struct btree_trans *trans, subvol_inum inum, enum btree_id id) { - u64 offset = 0; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; int ret = 0; - while (!ret || ret == -EINTR) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i delete; - u32 snapshot; + /* + * We're never going to be deleting extents, no need to use an extent + * iterator: + */ + bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + while (1) { bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - continue; + goto err; - bch2_trans_iter_init(trans, &iter, id, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); - - if (!k.k || iter.pos.inode != inum.inum) { - bch2_trans_iter_exit(trans, &iter); - break; - } + bch2_btree_iter_set_snapshot(&iter, snapshot); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; + if (!k.k || iter.pos.inode != inum.inum) + break; + bkey_init(&delete.k); delete.k.p = iter.pos; - if (btree_node_type_is_extents(iter.btree_id)) { - unsigned max_sectors = - min_t(u64, U64_MAX - iter.pos.offset, - KEY_SIZE_MAX & (~0 << trans->c->block_bits)); - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, &disk_res, NULL, + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(trans->c, &disk_res); err: - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); + if (ret && ret != -EINTR) + break; } + bch2_trans_iter_exit(trans, &iter); return ret; } -- cgit From 8ede99101ec354053ac755419df9da5434a13733 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jan 2022 20:52:10 -0500 Subject: bcachefs: Handle transaction restarts in __bch2_move_data() We weren't checking for -EINTR in the main loop in __bch2_move_data - this code predates modern transaction restarts. 
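The main loop now follows the conventional restart pattern (condensed from the move.c hunk below): bch2_trans_begin() at the top of each iteration resets the transaction, and -EINTR from peeking (or from bch2_move_extent()) simply continues:

    while (1) {
            bch2_trans_begin(&trans);

            k = bch2_btree_iter_peek(&iter);
            if (!k.k)
                    break;

            ret = bkey_err(k);
            if (ret == -EINTR)
                    continue;
            if (ret)
                    break;

            /* ... move the extent, handling -EINTR from it the same way ... */
    }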
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8756df0414a8..f428e2ff99f6 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -701,17 +701,20 @@ static int __bch2_move_data(struct bch_fs *c, bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); - - stats->pos = iter.pos; - if (!k.k) break; + ret = bkey_err(k); + if (ret == -EINTR) + continue; if (ret) break; + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; + stats->pos = iter.pos; + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -754,10 +757,8 @@ static int __bch2_move_data(struct bch_fs *c, ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { - if (ret2 == -EINTR) { - bch2_trans_begin(&trans); + if (ret2 == -EINTR) continue; - } if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ -- cgit From 6214485b6f74c098615401ae3cde74f87396a298 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jan 2022 20:55:58 -0500 Subject: bcachefs: BTREE_INSERT_LAZY_RW is only for recovery path BTREE_INSERT_LAZY_RW shouldn't do anything after the filesystem has finished starting up - otherwise, it might interfere with going read-only as part of shutting down. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 41403942133a..aa5b7960e214 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -841,7 +841,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) struct bch_fs *c = trans->c; int ret; - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) return -EROFS; bch2_trans_unlock(trans); -- cgit From 1f5f52bd036ca019d77b2446b8e20981483d1b9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 00:34:48 -0500 Subject: bcachefs: Kill allocator short-circuit invalidate The allocator thread invalidates buckets (increments their generation number) prior to discarding them and putting them on freelists. We've had a short circuit path for some time to only update the in-memory bucket mark when doing the invalidate if we're not invalidating cached data, but that short-circuit path hasn't really been needed for quite some time (likely since the btree key cache code was added). We're deleting it now as part of deleting/converting code that uses the in memory bucket array. 
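With the short circuit gone, invalidating a bucket always goes through the btree update, and whether the caller needs to wait on a journal flush is decided from what that update found - roughly (condensed from the hunk below):

    ret = bch2_trans_do(c, NULL, &commit_seq,
                        BTREE_INSERT_NOCHECK_RW|
                        BTREE_INSERT_NOFAIL|
                        BTREE_INSERT_JOURNAL_RESERVED|flags,
                        bucket_invalidate_btree(&trans, ca, b, &u));

    if (!ret && u.data_type)
            /* we invalidated cached data: wait on this journal commit */
            *journal_seq = max(*journal_seq, commit_seq);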
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 78 +++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bc5053ebe18f..48f4db591c06 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -668,10 +668,10 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) } static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b) + struct bch_dev *ca, u64 b, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked u; struct btree_iter iter; int ret; @@ -685,16 +685,16 @@ static int bucket_invalidate_btree(struct btree_trans *trans, if (ret) goto err; - u = alloc_mem_to_key(c, &iter); + *u = alloc_mem_to_key(c, &iter); - u.gen++; - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = atomic64_read(&c->io_clock[READ].now); - u.write_time = atomic64_read(&c->io_clock[WRITE].now); + u->gen++; + u->data_type = 0; + u->dirty_sectors = 0; + u->cached_sectors = 0; + u->read_time = atomic64_read(&c->io_clock[READ].now); + u->write_time = atomic64_read(&c->io_clock[WRITE].now); - ret = bch2_alloc_write(trans, &iter, &u, + ret = bch2_alloc_write(trans, &iter, u, BTREE_TRIGGER_BUCKET_INVALIDATE); err: bch2_trans_iter_exit(trans, &iter); @@ -704,21 +704,24 @@ err: static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq, unsigned flags) { - struct bucket *g; - struct bucket_mark m; + struct bkey_alloc_unpacked u; size_t b; + u64 commit_seq = 0; int ret = 0; + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) + return 1; + BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); b = ca->alloc_heap.data[0].bucket; /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - - BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true); @@ -727,39 +730,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!fifo_push(&ca->free_inc, b)); spin_unlock(&c->freelist_lock); - /* - * If we're not invalidating cached data, we only increment the bucket - * gen in memory here, the incremented gen will be updated in the btree - * by bch2_trans_mark_pointer(): - */ - if (!m.data_type && - !bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - ca->dev_idx, b)) { - bucket_cmpxchg(g, m, m.gen++); - *bucket_gen(ca, b) = m.gen; - percpu_up_read(&c->mark_lock); - goto out; - } - percpu_up_read(&c->mark_lock); - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { - ret = 1; - goto out; - } - - ret = bch2_trans_do(c, NULL, journal_seq, + ret = bch2_trans_do(c, NULL, &commit_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RESERVED| flags, - bucket_invalidate_btree(&trans, ca, b)); -out: + bucket_invalidate_btree(&trans, ca, b, &u)); + if (!ret) { /* remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; @@ -769,6 +748,19 @@ out: if (!top->nr) heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + + /* + * If we invalidating cached data then we need to wait on the + * 
journal commit: + */ + if (u.data_type) + *journal_seq = max(*journal_seq, commit_seq); + + /* + * We already waiting on u.alloc_seq when we filtered out + * buckets that need journal commit: + */ + BUG_ON(*journal_seq > u.journal_seq); } else { size_t b2; -- cgit From 3763cb9566a65966cd404cf3e0c5f218e5cf5d16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 20:36:47 -0500 Subject: bcachefs: Don't use in-memory bucket array for alloc updates More prep work for getting rid of the in-memory bucket array: now that we have BTREE_ITER_WITH_JOURNAL, the allocator code can do ntree lookups before journal replay is finished, and there's no longer any need for it to get allocation information from the in-memory bucket array. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 ++++++++------- fs/bcachefs/btree_iter.c | 8 ++++++++ fs/bcachefs/btree_update.h | 9 --------- fs/bcachefs/buckets.c | 17 +++++++---------- 4 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 48f4db591c06..0cf71125c55f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -464,19 +464,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct btree_iter iter; + struct bkey_s_c k; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto out; - u = alloc_mem_to_key(c, &iter); + u = bch2_alloc_unpack(k); time = rw == READ ? &u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); @@ -673,20 +674,20 @@ static int bucket_invalidate_btree(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; + struct bkey_s_c k; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto err; - *u = alloc_mem_to_key(c, &iter); - + *u = bch2_alloc_unpack(k); u->gen++; u->data_type = 0; u->dirty_sectors = 0; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ae1628918c57..100305cf93bf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2183,6 +2183,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } +static noinline struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; @@ -2202,6 +2203,13 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) return ret; } +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? 
__bch2_btree_trans_peek_updates(iter) + : NULL; +} + static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, struct btree_path *path) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index c8e1f43f71e3..5e5a1b5e750e 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -141,13 +141,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *); - -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_WITH_UPDATES - ? __bch2_btree_trans_peek_updates(iter) - : NULL; -} - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f7a750aff03f..fb833d82222b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1429,25 +1429,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); - struct bkey_i *update; + struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, + POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), + BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); if (ret) { bch2_trans_iter_exit(trans, iter); return ret; } - update = __bch2_btree_trans_peek_updates(iter); - *u = update && !bpos_cmp(update->k.p, pos) - ? bch2_alloc_unpack(bkey_i_to_s_c(update)) - : alloc_mem_to_key(c, iter); - + *u = bch2_alloc_unpack(k); return 0; } -- cgit From 0678cbe2cbc586c0055de2c04602bf8136bcc3fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jan 2022 19:46:39 -0500 Subject: bcachefs: Ignore cached data when calculating fragmentation Previously, bucket fragmentation was considered to be bucket size - total amount of live data, both dirty and cached. This meant that if a bucket was full but only a small amount of data in it was dirty - the rest cached, we'd get stuck: copygc wouldn't move the dirty data out of the bucket and the allocator wouldn't be able to invalidate and drop the cached data. This changes fragmentation to exclude cached data, so that copygc will evacuate these buckets and copygc/the allocator will always be able to make forward progress. 
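Put differently, a bucket's fragmentation is now bucket size minus its dirty sectors whenever it holds any dirty data, and cached sectors no longer count at all. A small stand-alone illustration of the new rule (simplified types, not the kernel code):

#include <stdio.h>

struct toy_mark {
        unsigned        dirty_sectors;
        unsigned        cached_sectors;
};

/* Mirrors the new rule: cached data no longer counts as "used". */
static int toy_fragmented(unsigned bucket_size, struct toy_mark m)
{
        int frag = (int) bucket_size - (int) m.dirty_sectors;

        return m.dirty_sectors ? (frag > 0 ? frag : 0) : 0;
}

int main(void)
{
        /* mostly-cached bucket: previously looked "full", now maximally fragmented */
        struct toy_mark m = { .dirty_sectors = 8, .cached_sectors = 504 };

        printf("fragmented sectors: %d\n", toy_fragmented(512, m)); /* 504 */
        return 0;
}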
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/buckets.h | 5 ----- fs/bcachefs/movinggc.c | 21 ++++++++++++--------- 4 files changed, 15 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0cf71125c55f..1353e72bbfb0 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -544,7 +544,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, u64 now, u64 last_seq_ondisk) { - unsigned used = bucket_sectors_used(m); + unsigned used = m.cached_sectors; if (used) { /* diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fb833d82222b..acdc95d8d4c7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -291,8 +291,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m) static inline int bucket_sectors_fragmented(struct bch_dev *ca, struct bucket_mark m) { - return bucket_sectors_used(m) - ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + return m.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) : 0; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4b5376684d2c..483c8b24293f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -149,11 +149,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, /* bucket gc marks */ -static inline unsigned bucket_sectors_used(struct bucket_mark mark) -{ - return mark.dirty_sectors + mark.cached_sectors; -} - static inline bool is_available_bucket(struct bucket_mark mark) { return !mark.dirty_sectors && !mark.stripe; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4791e5099d93..64cb10c3f3db 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -69,10 +69,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, .dev = p.ptr.dev, .offset = p.ptr.offset, }; + ssize_t i; - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); + if (p.ptr.cached) + continue; + + i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); #if 0 /* eytzinger search verify code: */ ssize_t j = -1, k; @@ -185,8 +189,7 @@ static int bch2_copygc(struct bch_fs *c) if (m.owned_by_allocator || m.data_type != BCH_DATA_user || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) + m.dirty_sectors >= ca->mi.bucket_size) continue; WARN_ON(m.stripe && !g->stripe_redundancy); @@ -195,9 +198,9 @@ static int bch2_copygc(struct bch_fs *c) .dev = dev_idx, .gen = m.gen, .replicas = 1 + g->stripe_redundancy, - .fragmentation = bucket_sectors_used(m) * (1U << 15) + .fragmentation = m.dirty_sectors * (1U << 15) / ca->mi.bucket_size, - .sectors = bucket_sectors_used(m), + .sectors = m.dirty_sectors, .offset = bucket_to_sector(ca, b), }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -263,8 +266,8 @@ static int bch2_copygc(struct bch_fs *c) m = READ_ONCE(buckets->b[b].mark); if (i->gen == m.gen && - bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); + m.dirty_sectors) { + sectors_not_moved += m.dirty_sectors; buckets_not_moved++; } } -- cgit From 54460a6292b08a8045d8681ac4331dfb9c385017 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jan 2022 00:19:52 -0500 Subject: bcachefs: Delete some dead code __bch2_mark_replicas() is now only used in one place, so 
inline it into the caller. Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 0cdd67e9ebc4..c192e31d5d68 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -421,18 +421,10 @@ err: goto out; } -static int __bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r, - bool check) -{ - return likely(bch2_replicas_marked(c, r)) ? 0 - : check ? -1 - : bch2_mark_replicas_slowpath(c, r); -} - int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) { - return __bch2_mark_replicas(c, r, false); + return likely(bch2_replicas_marked(c, r)) + ? 0 : bch2_mark_replicas_slowpath(c, r); } /* replicas delta list: */ -- cgit From b74b147ddabe29a91a00d9f2cefeb6892e6a5a0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jan 2022 22:08:44 -0500 Subject: bcachefs: Log message improvements Change the error messages in bch2_inconsistent_error() and bch2_fatal_error() so we can distinguish them. Also, prefer bch2_fs_fatal_error() (which also logs an error message) to bch2_fatal_error(), and change a call to bch2_inconsistent_error() to bch2_fatal_error() when we can't continue. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 ++--- fs/bcachefs/error.c | 4 ++-- fs/bcachefs/journal_io.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index aa5b7960e214..cfcaa58f728c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -663,9 +663,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n", - buf, trans->fn, (void *) i->ip_allocated, invalid); - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", + buf, trans->fn, (void *) i->ip_allocated, invalid); return -EINVAL; } btree_insert_entry_checks(trans, i); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2cea694575e9..8279a9ba76a5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "inconsistency detected - emergency read only"); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); @@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c) void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "fatal error - emergency read only"); } void bch2_io_error_work(struct work_struct *work) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 815310e2426f..75b805732c21 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1672,6 +1672,6 @@ no_io: continue_at(cl, journal_write_done, c->io_complete_wq); return; err: - bch2_inconsistent_error(c); + bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); } -- cgit From 35228ecb7e4d45822c0e2acbb0fb9555da31ef31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Feb 2022 01:19:39 -0500 Subject: bcachefs: Don't keep nodes in btree_reserve locked These nodes aren't reachable by other threads, so there's no need to 
keep it locked - and this fixes a bug with the assertion in bch2_trans_unlock() firing on transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0e7644a3a436..7d5efb32b082 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -243,6 +243,8 @@ retry: bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); @@ -265,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + set_btree_node_accessed(b); set_btree_node_dirty(c, b); set_btree_node_need_write(b); @@ -378,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) while (as->nr_prealloc_nodes) { struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; - six_unlock_write(&b->c.lock); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -392,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) bch2_open_buckets_put(c, &b->ob); } - btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); } -- cgit From 80bf2f345411b9952a984b6105cd860500b01228 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Feb 2022 19:20:36 -0500 Subject: bcachefs: Fix freeing in bch2_dev_buckets_resize() We were double-freeing old_buckets and not freeing old_buckets_gens: also, the code was supposed to free buckets, not old_buckets; old_buckets is only needed because we have to use rcu_assign_pointer() instead of swap(), and won't be set if we hit the error path. 
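The general shape of the fix: the cleanup path frees exactly what this call allocated (buckets and bucket_gens), never old_buckets, which is only set once the new arrays have been published. A stand-alone sketch of that pattern using plain malloc/free (toy struct and names, not the kernel code):

#include <stdlib.h>

struct toy_dev {
        unsigned char   *buckets;       /* currently published */
        unsigned char   *bucket_gens;   /* currently published */
        size_t          nbuckets;
};

static int toy_resize(struct toy_dev *d, size_t nbuckets)
{
        unsigned char *buckets     = calloc(nbuckets, 1);
        unsigned char *bucket_gens = calloc(nbuckets, 1);

        if (!buckets || !bucket_gens)
                goto err;

        /* publish the new arrays, then the old ones can be freed (once each) */
        free(d->buckets);
        free(d->bucket_gens);
        d->buckets      = buckets;
        d->bucket_gens  = bucket_gens;
        d->nbuckets     = nbuckets;
        return 0;
err:
        /* free what this call allocated, never the still-published arrays */
        free(buckets);
        free(bucket_gens);
        return -1;
}

int main(void)
{
        struct toy_dev d = { 0 };

        return toy_resize(&d, 1024) ? 1 : 0;
}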
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/buckets.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3d1a6773393c..59c0963f785f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -451,7 +451,7 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; - struct bucket_gens *bucket_gens; + struct bucket_gens __rcu *bucket_gens; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index acdc95d8d4c7..ae5760315223 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2106,7 +2106,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) container_of(rcu, struct bucket_array, rcu); kvpfree(buckets, - sizeof(struct bucket_array) + + sizeof(*buckets) + buckets->nbuckets * sizeof(struct bucket)); } @@ -2115,7 +2115,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -2225,9 +2225,9 @@ err: kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) - call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); if (buckets) - call_rcu(&old_buckets->rcu, buckets_free_rcu); + call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2242,6 +2242,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); -- cgit From a9c0b125d8162bf648f7a004f70d4cff6e84ddd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jan 2022 00:49:23 -0500 Subject: bcachefs: Improve btree_key_cache_flush_pos() btree_key_cache_flush_pos() uses BTREE_ITER_CACHED_NOFILL - but it wasn't checking for !ck->valid. It does check for the entry being dirty, so it shouldn't matter, but this refactors it a bit and adds an assertion.
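The refactored checks now run in a fixed order: bail out if there is no cached entry, handle the clean case (which may still want an evict), assert that a dirty entry is valid, and only then compare journal sequence numbers. A stand-alone model of that decision order (toy types and names, not the kernel code):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_ck {
        bool            present;
        bool            dirty;
        bool            valid;
        unsigned long   seq;
};

enum { TOY_NOTHING, TOY_EVICT, TOY_FLUSH };

static int toy_flush_decision(struct toy_ck ck, bool evict, unsigned long journal_seq)
{
        if (!ck.present)
                return TOY_NOTHING;

        if (!ck.dirty)                  /* clean entry: eviction is all that's left */
                return evict ? TOY_EVICT : TOY_NOTHING;

        assert(ck.valid);               /* a dirty entry must have been filled */

        if (journal_seq && ck.seq != journal_seq)
                return TOY_NOTHING;     /* flushing on behalf of a different commit */

        return TOY_FLUSH;
}

int main(void)
{
        struct toy_ck ck = { .present = true, .dirty = true, .valid = true, .seq = 7 };

        printf("%d %d\n", toy_flush_decision(ck, false, 7),    /* TOY_FLUSH   */
                          toy_flush_decision(ck, false, 9));   /* TOY_NOTHING */
        return 0;
}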
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 684919125b2f..ba50cad14757 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -390,16 +390,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; ck = (void *) c_iter.path->l[0].b; - if (!ck || - (journal_seq && ck->journal.seq != journal_seq)) + if (!ck) goto out; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (!evict) - goto out; - goto evict; + if (evict) + goto evict; + goto out; } + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + /* * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need -- cgit From 7c8f6f980dc85fefea69dc1aa161fd2af2d8b3d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jan 2022 02:13:21 -0500 Subject: bcachefs: btree_id_cached() Add a new helper that returns true if the given btree ID uses the btree key cache. This enables some new cleanups, since the helper can check the options for whether caching is enabled on a given btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 ++++++ fs/bcachefs/btree_iter.c | 3 +++ fs/bcachefs/fs.c | 4 ++-- fs/bcachefs/inode.c | 15 +++++---------- fs/bcachefs/inode.h | 2 +- fs/bcachefs/super.c | 4 ++++ 6 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 59c0963f785f..55db3c00f8dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -716,6 +716,7 @@ struct bch_fs { bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; @@ -952,6 +953,11 @@ static inline size_t btree_sectors(const struct bch_fs *c) return c->opts.btree_node_size >> 9; } +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); +} + static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 100305cf93bf..986ee0927e4e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2860,6 +2860,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (trans->journal_replay_not_finished) flags |= BTREE_ITER_WITH_JOURNAL; + if (!btree_id_cached(trans->c, btree_id)) + flags &= ~BTREE_ITER_CACHED; + iter->trans = trans; iter->path = NULL; iter->update_path = NULL; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 10a737965beb..9e8b085e36d7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -105,7 +105,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum), - 0 && c->opts.inodes_use_key_cache); + c->opts.inodes_use_key_cache); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); @@ -1473,7 +1473,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode), true); + bch2_inode_rm(c, inode_inum(inode)); } } diff --git a/fs/bcachefs/inode.c 
b/fs/bcachefs/inode.c index 79ee9ca2f1d0..9214f68f017c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans, u32 snapshot; int ret; - if (trans->c->opts.inodes_use_key_cache) - flags |= BTREE_ITER_CACHED; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) return ret; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), flags); + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -631,20 +629,16 @@ err: return ret; } -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; - unsigned iter_flags = BTREE_ITER_INTENT; u32 snapshot; int ret; - if (cached && c->opts.inodes_use_key_cache) - iter_flags |= BTREE_ITER_CACHED; - bch2_trans_init(&trans, c, 0, 1024); /* @@ -668,7 +662,8 @@ retry: goto err; bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), iter_flags); + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 723186d8afb6..77957cc7f9dd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -87,7 +87,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3094eb1e3406..a90fa0ae550b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -770,6 +770,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_opts_apply(&c->opts, opts); + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) + c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -- cgit From ce91abd60b0aa26e50e6b44b599a0e232b80a8b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Feb 2022 22:21:44 -0500 Subject: bcachefs: bch2_btree_path_set_pos() bch2_btree_path_set_pos() is now available outside of btree_iter.c Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 ++++++++++++------------------------ fs/bcachefs/btree_iter.h | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 986ee0927e4e..e057c9b15ee0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1701,7 +1701,7 @@ __bch2_btree_path_make_mut(struct btree_trans *trans, return path; } -static struct btree_path * __must_check +struct btree_path * __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, int cmp) @@ -1747,18 +1747,6 @@ out: return path; } -static inline struct btree_path * __must_check -btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool 
intent) -{ - int cmp = bpos_cmp(new_pos, path->pos); - - return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, cmp) - : path; -} - /* Btree path: main interface: */ static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) @@ -1925,7 +1913,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path_pos->btree_id == btree_id && path_pos->level == level) { __btree_path_get(path_pos, intent); - path = btree_path_set_pos(trans, path_pos, pos, intent); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -2022,7 +2010,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - iter->path = btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT); @@ -2058,7 +2046,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; BUG_ON(iter->path->uptodate); @@ -2119,7 +2107,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * the next child node */ path = iter->path = - btree_path_set_pos(trans, path, bpos_successor(iter->pos), + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), iter->flags & BTREE_ITER_INTENT); path->level = iter->min_depth; @@ -2141,7 +2129,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; BUG_ON(iter->path->uptodate); @@ -2270,8 +2258,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2378,7 +2366,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; - iter->update_path = btree_path_set_pos(trans, + iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, iter->flags & BTREE_ITER_INTENT); @@ -2416,7 +2404,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, iter->flags & BTREE_ITER_INTENT); BUG_ON(!iter->path->nodes_locked); out: @@ -2479,7 +2467,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) search_key.snapshot = U32_MAX; while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2607,7 +2595,7 @@ struct bkey_s_c 
bch2_btree_iter_peek_slot(struct btree_iter *iter) } search_key = btree_iter_search_key(iter); - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c4fdfb382dcd..3f8aaccc5208 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -157,6 +157,22 @@ bch2_btree_path_make_mut(struct btree_trans *trans, return path; } +struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, int); + +static inline struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent) +{ + int cmp = bpos_cmp(new_pos, path->pos); + + return cmp + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, cmp) + : path; +} + int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, -- cgit From 2e63e180665d527f52b2200acca4aeced065e63f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 11:02:58 -0500 Subject: bcachefs: Stash a copy of key being overwritten in btree_insert_entry We currently need to call bch2_btree_path_peek_slot() multiple times in the transaction commit path - and some of those need to be updated to also check the keys from journal replay, too. Let's consolidate this and stash the key being overwritten in btree_insert_entry. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_types.h | 10 +++++++++- fs/bcachefs/btree_update_leaf.c | 28 ++++++++++++---------------- fs/bcachefs/buckets.c | 21 +++++---------------- 4 files changed, 27 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e057c9b15ee0..75815a1e90ec 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1987,6 +1987,7 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct if (unlikely(!ck->valid)) goto hole; + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9828bdd924af..587307ff5321 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -339,12 +339,20 @@ struct btree_insert_entry { unsigned flags; u8 bkey_type; enum btree_id btree_id:8; - u8 level; + u8 level:4; bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; unsigned long ip_allocated; }; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cfcaa58f728c..5dd86c41c631 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -653,7 +653,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct bkey_s_c old; int ret, u64s_delta = 0; trans_for_each_update(trans, i) { @@ -671,22 +670,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } 
trans_for_each_update(trans, i) { - struct bkey u; - - /* - * peek_slot() doesn't yet work on iterators that point to - * interior nodes: - */ - if (i->cached || i->level) + if (i->cached) continue; - old = bch2_btree_path_peek_slot(i->path, &u); - ret = bkey_err(old); - if (unlikely(ret)) - return ret; - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + u64s_delta -= i->old_btree_u64s; if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { @@ -1432,11 +1420,19 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr } bch2_path_put(trans, i->path, true); - *i = n; - } else + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->ip_allocated = n.ip_allocated; + } else { array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); + i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + } + __btree_path_get(n.path, true); return 0; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ae5760315223..7ca1087b5bb3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1322,25 +1322,14 @@ void fs_usage_apply_warn(struct btree_trans *trans, should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + pr_err("while inserting"); bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf); + pr_err(" %s", buf); pr_err("overlapping with"); - - if (!i->cached) { - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } else { - struct bkey_cached *ck = (void *) i->path->l[0].b; - - if (ck->valid) { - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); - } - } + bch2_bkey_val_to_text(&PBUF(buf), c, old); + pr_err(" %s", buf); } __WARN(); } -- cgit From 45e4cd9e3a088d476929c5ee245e83baeee6cdd5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 08:08:53 -0500 Subject: bcachefs: run_one_trigger() now checks journal keys Previously, when doing updates and running triggers before journal replay completes, triggers would see the incorrect key for the old key being overwritten - this patch updates the trigger code to check the journal keys when necessary, needed for the upcoming allocator rewrite. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 25 +++++-------------------- fs/bcachefs/btree_update_leaf.c | 11 +++++++++++ fs/bcachefs/recovery.c | 18 ++++++++++++++++++ fs/bcachefs/recovery.h | 2 ++ 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 75815a1e90ec..200108c0c778 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2199,30 +2199,14 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) : NULL; } -static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, - struct btree_path *path) -{ - struct journal_keys *keys = &trans->c->journal_keys; - size_t idx = bch2_journal_key_search(keys, path->btree_id, - path->level, path->pos); - - while (idx < keys->nr && keys->d[idx].overwritten) - idx++; - - return (idx < keys->nr && - keys->d[idx].btree_id == path->btree_id && - keys->d[idx].level == path->level) - ? 
keys->d[idx].k - : NULL; -} - static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path); + struct bkey_i *k = bch2_journal_keys_peek(trans->c, iter->btree_id, 0, + iter->path->pos); - if (k && !bpos_cmp(k->k.p, iter->pos)) { + if (k && !bpos_cmp(k->k.p, iter->path->pos)) { iter->k = k->k; return bkey_i_to_s_c(k); } else { @@ -2236,7 +2220,8 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - __btree_trans_peek_journal(trans, iter->path); + bch2_journal_keys_peek(trans->c, iter->btree_id, 0, + iter->path->pos); if (next_journal && bpos_cmp(next_journal->k.p, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5dd86c41c631..2af2d75a06c5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1372,6 +1372,7 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, struct bkey_i *k, enum btree_update_flags flags) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; BUG_ON(!path->should_be_locked); @@ -1431,6 +1432,16 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); + + if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } } __btree_path_get(n.path, true); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b818093eab39..383838d66edf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -94,6 +94,24 @@ size_t bch2_journal_key_search(struct journal_keys *journal_keys, return l; } +struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_key *end = keys->d + keys->nr; + struct journal_key *k = keys->d + + bch2_journal_key_search(keys, btree_id, level, pos); + + while (k < end && k->overwritten) + k++; + + if (k < end && + k->btree_id == btree_id && + k->level == level) + return k->k; + return NULL; +} + static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) { struct bkey_i *n = iter->keys->d[idx].k; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 21bdad9db249..e6927a918df3 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -33,6 +33,8 @@ struct btree_and_journal_iter { size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, unsigned, struct bpos); +struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, + unsigned, struct bpos pos); int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); -- cgit From f7b6ca23b6456b8b441b506ef977ff53972b35c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Feb 2022 23:15:12 -0500 Subject: bcachefs: BTREE_ITER_WITH_KEY_CACHE This is the start of cache coherency with the btree key cache - this adds a btree iterator flag that causes lookups to also check the key cache when we're iterating over the btree (not iterating over the key cache). 
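Conceptually, a lookup at a given position now consults two structures, and an entry in the key cache wins over whatever the underlying btree holds, since the cached copy may be newer. A stand-alone sketch of that precedence (two toy arrays standing in for the two structures, not bcachefs code):

#include <stdio.h>

#define TOY_NKEYS 4

static const int btree_vals[TOY_NKEYS] = { 10, 20, 30, 40 };
static const int cache_vals[TOY_NKEYS] = {  0, 22,  0,  0 };   /* 0 = not cached */

/* Key cache takes precedence: it may hold a newer version of the key. */
static int toy_lookup(unsigned pos)
{
        return cache_vals[pos] ? cache_vals[pos] : btree_vals[pos];
}

int main(void)
{
        unsigned pos;

        for (pos = 0; pos < TOY_NKEYS; pos++)
                printf("pos %u -> %d\n", pos, toy_lookup(pos)); /* 10 22 30 40 */
        return 0;
}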
Note that we could still race with another thread creating at item in the key cache and updating it, since we aren't holding the key cache locked if it wasn't found. The next patch for the update path will address this by causing the transaction to restart if the key cache is found to be dirty. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 89 +++++++++++++++++++++++++++++++++++++------ fs/bcachefs/btree_iter.h | 5 --- fs/bcachefs/btree_key_cache.c | 17 +++++---- fs/bcachefs/btree_types.h | 9 +++-- 4 files changed, 93 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 200108c0c778..ff98024e76fc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1964,13 +1964,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct struct bkey_s_c k; - BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - if (!path->cached) { struct btree_path_level *l = path_l(path); - struct bkey_packed *_k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); + struct bkey_packed *_k; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); @@ -1980,12 +1980,15 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct } else { struct bkey_cached *ck = (void *) path->l[0].b; - EBUG_ON(path->btree_id != ck->key.btree_id || - bkey_cmp(path->pos, ck->key.pos)); + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos))); - /* BTREE_ITER_CACHED_NOFILL? */ - if (unlikely(!ck->valid)) - goto hole; + /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? 
*/ + if (unlikely(!ck || !ck->valid)) + return bkey_s_c_null; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -2233,11 +2236,43 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: + */ +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + int ret; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + iter->key_cache_path->should_be_locked = true; + + return bch2_btree_path_peek_slot(iter->key_cache_path, &u); +} + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; struct bkey_i *next_update; - struct bkey_s_c k; + struct bkey_s_c k, k2; int ret; EBUG_ON(iter->path->cached || iter->path->level); @@ -2255,8 +2290,24 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } + iter->path->should_be_locked = true; + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + ret = bkey_err(k2); + if (ret) { + k = k2; + bch2_btree_iter_set_pos(iter, iter->pos); + goto out; + } + + k = k2; + iter->k = *k.k; + } + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); @@ -2603,6 +2654,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + goto out; + } + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; @@ -2806,8 +2864,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->update_path) bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); iter->path = NULL; iter->update_path = NULL; + iter->key_cache_path = NULL; } static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -2834,12 +2896,16 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (trans->journal_replay_not_finished) flags |= BTREE_ITER_WITH_JOURNAL; - if (!btree_id_cached(trans->c, btree_id)) + if (!btree_id_cached(trans->c, btree_id)) { flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; iter->trans = trans; iter->path = NULL; iter->update_path = NULL; + iter->key_cache_path = NULL; iter->btree_id = btree_id; iter->min_depth = depth; iter->flags = flags; @@ -2887,6 +2953,7 @@ void bch2_trans_copy_iter(struct 
btree_iter *dst, struct btree_iter *src) __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = NULL; } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3f8aaccc5208..1e3172a2885a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path, return btree_path_node(path, b->c.level + 1); } -static inline int btree_iter_err(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; -} - /* Iterate over paths within a transaction: */ void __bch2_btree_trans_sort_paths(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index ba50cad14757..29d46d0aa5d3 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -209,19 +209,20 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter iter; + struct btree_path *path; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; + struct bkey u; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + path = bch2_path_get(trans, ck->key.btree_id, ck->key.pos, 0, 0, 0); + ret = bch2_btree_path_traverse(trans, path, 0); if (ret) goto err; + k = bch2_btree_path_peek_slot(path, &u); + if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_trans_restart_relock_key_cache_fill(trans->fn, _THIS_IP_, ck_path->btree_id, &ck_path->pos); @@ -262,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + path->preserve = false; err: - bch2_trans_iter_exit(trans, &iter); + bch2_path_put(trans, path, 0); return ret; } @@ -385,6 +386,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + ret = bch2_btree_iter_traverse(&c_iter); if (ret) goto out; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 587307ff5321..6db2ac49ee3f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -202,10 +202,10 @@ struct btree_node_iter { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_NOT_EXTENTS (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) -#define BTREE_ITER_CACHED (1 << 7) -#define BTREE_ITER_CACHED_NOFILL (1 << 8) -#define BTREE_ITER_CACHED_NOCREATE (1 << 9) +#define BTREE_ITER_CACHED (1 << 6) +#define BTREE_ITER_CACHED_NOFILL (1 << 7) +#define BTREE_ITER_CACHED_NOCREATE (1 << 8) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) #define BTREE_ITER_WITH_UPDATES (1 << 10) #define BTREE_ITER_WITH_JOURNAL (1 << 11) #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) @@ -277,6 +277,7 @@ struct btree_iter { struct btree_trans *trans; struct btree_path *path; struct btree_path *update_path; + struct btree_path *key_cache_path; enum btree_id btree_id:4; unsigned min_depth:4; -- cgit From 12ce5b7df1e0e432bcac22079e4493cab5cd8b23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jan 2022 01:14:47 -0500 Subject: bcachefs: Btree key cache coherency - Updates to 
non key cache iterators will now be transparently redirected to the key cache for cached btrees. - Except when creating new keys: then the update goes to underlying btree For for iterating over a cached btree to work, we need to ensure that if a key exists in the key cache, it also exists in the btree - otherwise the iterator code will skip past it and not check the key cache. Otherwise, for consistency, all updates should go to the same place - the key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/btree_iter.c | 2 + fs/bcachefs/btree_key_cache.c | 13 ++-- fs/bcachefs/btree_key_cache.h | 10 +-- fs/bcachefs/btree_types.h | 6 ++ fs/bcachefs/btree_update.h | 2 - fs/bcachefs/btree_update_leaf.c | 136 ++++++++++++++++++++++++++++++---------- fs/bcachefs/trace.h | 6 ++ 8 files changed, 128 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1353e72bbfb0..55af41a63ff7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -271,7 +271,8 @@ int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, return PTR_ERR(a); bch2_alloc_pack(trans->c, a, *u); - return bch2_trans_update(trans, iter, &a->k, trigger_flags); + return bch2_trans_update(trans, iter, &a->k, trigger_flags| + BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ff98024e76fc..c7ba6ce27007 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2188,6 +2188,8 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) break; if (bpos_cmp(i->k->k.p, iter->path->pos) < 0) continue; + if (i->key_cache_already_flushed) + continue; if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) ret = i->k; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 29d46d0aa5d3..72a54b9d1335 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -414,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, @@ -555,13 +556,15 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, return true; } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) +void bch2_btree_key_cache_drop(struct btree_trans *trans, + struct btree_path *path) { - BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); + struct bkey_cached *ck = (void *) path->l[0].b; + + ck->valid = false; + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); } -#endif static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct shrink_control *sc) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index b3d241b13453..670746e72dab 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -32,14 +32,8 @@ bool bch2_btree_insert_key_cached(struct btree_trans *, struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *, - enum btree_id, struct bpos); -#else -static inline void 
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) {} -#endif +void bch2_btree_key_cache_drop(struct btree_trans *, + struct btree_path *); void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6db2ac49ee3f..0afade4f61f4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -344,6 +344,7 @@ struct btree_insert_entry { bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; /* * @old_k may be a key from the journal; @old_btree_u64s always refers * to the size of the key being overwritten in the btree: @@ -645,6 +646,8 @@ static inline bool btree_type_has_snapshots(enum btree_id id) enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -658,6 +661,9 @@ enum btree_update_flags { #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \ + (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 5e5a1b5e750e..d9a406a28f47 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2af2d75a06c5..dc033991a4ec 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -23,10 +23,15 @@ #include #include +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); } @@ -378,9 +383,14 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->k->k.needs_whiteout = false; - did_work = !i->cached - ? 
btree_insert_key_leaf(trans, i) - : bch2_btree_insert_key_cached(trans, i->path, i->k); + if (!i->cached) + did_work = btree_insert_key_leaf(trans, i); + else if (!i->key_cache_already_flushed) + did_work = bch2_btree_insert_key_cached(trans, i->path, i->k); + else { + bch2_btree_key_cache_drop(trans, i->path); + did_work = false; + } if (!did_work) return; @@ -987,18 +997,6 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * if BTREE_TRIGGER_NORUN is set, it means we're probably being called - * from the key cache flush code: - */ - trans_for_each_update(trans, i) - if (!i->cached && - !(i->flags & BTREE_TRIGGER_NORUN)) - bch2_btree_key_cache_verify_clean(trans, - i->btree_id, i->k->k.p); -#endif - ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out; @@ -1369,11 +1367,14 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } -int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags) +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; + int ret = 0; BUG_ON(!path->should_be_locked); @@ -1388,7 +1389,7 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr .cached = path->cached, .path = path, .k = k, - .ip_allocated = _RET_IP_, + .ip_allocated = ip, }; #ifdef CONFIG_BCACHEFS_DEBUG @@ -1409,17 +1410,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr !btree_insert_entry_cmp(&n, i)) { BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - /* - * This is a hack to ensure that inode creates update the btree, - * not the key cache, which helps with cache coherency issues in - * other areas: - */ - if (n.cached && !i->cached) { - i->k = n.k; - i->flags = n.flags; - return 0; - } - bch2_path_put(trans, i->path, true); i->flags = n.flags; i->cached = n.cached; @@ -1444,19 +1434,60 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr } } - __btree_path_get(n.path, true); - return 0; + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. 
When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && + bkey_deleted(&i->old_k) && + !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) { + struct btree_path *btree_path; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, + 1, 0, BTREE_ITER_INTENT); + + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto err; + + btree_path->should_be_locked = true; + ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); +err: + bch2_path_put(trans, btree_path, true); + } + + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) +{ + return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); } int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + if (iter->flags & BTREE_ITER_IS_EXTENTS) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -1464,8 +1495,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter k->k.type = KEY_TYPE_whiteout; } - return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path, - k, flags); + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return -EINTR; + } + + iter->key_cache_path->should_be_locked = true; + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags); } void bch2_trans_commit_hook(struct btree_trans *trans, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d432c90a1491..5e78c396e24c 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_ARGS(trans_fn, caller_ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + DECLARE_EVENT_CLASS(transaction_restart_iter, TP_PROTO(const char *trans_fn, unsigned long caller_ip, -- cgit From 
63a2edce9487b1fcea7257676614456846f9ab09 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jan 2023 02:25:08 -0500 Subject: bcachefs: Inode create no longer needs to probe key cache Now that we have full key cache coherency, we can simplify bch2_inode_create(). Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9214f68f017c..ee14ba5ee73d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -519,19 +519,8 @@ again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(k.k->p, POS(0, max)) < 0) { - while (pos < iter->pos.offset) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } - - if (k.k->p.snapshot == snapshot && - !bkey_is_inode(k.k) && - !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { - bch2_btree_iter_advance(iter); - continue; - } + if (pos < iter->pos.offset) + goto found_slot; /* * We don't need to iterate over keys in every snapshot once @@ -541,12 +530,8 @@ again: bch2_btree_iter_set_pos(iter, POS(0, pos)); } - while (!ret && pos < max) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } + if (!ret && pos < max) + goto found_slot; if (!ret && start == min) ret = -ENOSPC; @@ -569,11 +554,6 @@ found_slot: return ret; } - /* We may have raced while the iterator wasn't pointing at pos: */ - if (bkey_is_inode(k.k) || - bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) - goto again; - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); -- cgit From ec061b215d63b5e85ebf1a4ecfae661c01578c2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 20:39:19 -0500 Subject: bcachefs: btree_gc no longer uses main in-memory bucket array This changes the btree_gc code to only use the second bucket array, the one dedicated to GC. On completion, it compares what's in its in memory bucket array to the allocation information in the btree and writes it directly, instead of updating the main in-memory bucket array and writing that. 
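As a rough sketch of that compare-then-write step (using simplified, hypothetical types and helper names, not the actual bcachefs structures or transaction machinery), the idea is: for each bucket, unpack the alloc key from the btree, compare it field by field against GC's private copy, and only issue an update when they disagree:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical, simplified stand-in for per-bucket allocation info: */
    struct alloc_info {
            unsigned gen;
            unsigned data_type;
            unsigned dirty_sectors;
            unsigned cached_sectors;
    };

    /*
     * Reconcile the btree's view of a bucket with GC's recalculated view;
     * returns true if the alloc key needs to be rewritten.
     */
    static bool gc_reconcile_bucket(struct alloc_info *btree_u,
                                    const struct alloc_info *gc_u)
    {
            bool changed = false;

            if (btree_u->gen != gc_u->gen) {
                    btree_u->gen = gc_u->gen;
                    changed = true;
            }
            if (btree_u->data_type != gc_u->data_type) {
                    btree_u->data_type = gc_u->data_type;
                    changed = true;
            }
            if (btree_u->dirty_sectors != gc_u->dirty_sectors) {
                    btree_u->dirty_sectors = gc_u->dirty_sectors;
                    changed = true;
            }
            if (btree_u->cached_sectors != gc_u->cached_sectors) {
                    btree_u->cached_sectors = gc_u->cached_sectors;
                    changed = true;
            }

            return changed;
    }

    int main(void)
    {
            struct alloc_info on_disk = { .gen = 3, .dirty_sectors = 128 };
            struct alloc_info gc_copy = { .gen = 3, .dirty_sectors = 120 };

            if (gc_reconcile_bucket(&on_disk, &gc_copy))
                    printf("bucket changed: rewrite alloc key (dirty_sectors=%u)\n",
                           on_disk.dirty_sectors);
            return 0;
    }

In the real code the write goes through a btree transaction (or through journal replay during initial recovery); the sketch only shows the field-by-field comparison that replaces the old copy_bucket_field pass over the main in-memory bucket array.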
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 145 +++++++---------------- fs/bcachefs/alloc_background.h | 42 ++----- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_gc.c | 254 +++++++++++++++++++++++++++-------------- fs/bcachefs/recovery.c | 23 +--- 5 files changed, 222 insertions(+), 243 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 55af41a63ff7..700d1e00aaf9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -39,15 +39,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -254,24 +245,31 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -static void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, + const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v3(dst, src); + struct bkey_alloc_buf *dst; + + dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (!IS_ERR(dst)) + bch2_alloc_pack_v3(dst, src); + + return dst; } int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, struct bkey_alloc_unpacked *u, unsigned trigger_flags) { - struct bkey_alloc_buf *a; - - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return PTR_ERR(a); + struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); - bch2_alloc_pack(trans->c, a, *u); - return bch2_trans_update(trans, iter, &a->k, trigger_flags| + /* + * Without BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, we may end up updating + * the btree instead of the key cache - this can casue the allocator to + * self-deadlock, since updating the btree may require allocating new + * btree nodes: + */ + return PTR_ERR_OR_ZERO(a) ?: + bch2_trans_update(trans, iter, &a->k, trigger_flags| BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); } @@ -342,7 +340,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -int bch2_alloc_read(struct bch_fs *c) +int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; @@ -353,108 +351,43 @@ int bch2_alloc_read(struct bch_fs *c) int ret; bch2_trans_init(&trans, c, 0, 0); - down_read(&c->gc_lock); for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - if (!bkey_is_alloc(k.k)) - continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); + g = __bucket(ca, k.k->p.offset, gc); u = bch2_alloc_unpack(k); - *bucket_gen(ca, k.k->p.offset) = u.gen; + if (!gc) + *bucket_gen(ca, k.k->p.offset) = u.gen; + g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; + g->oldest_gen = !gc ? 
u.oldest_gen : u.gen; g->gen_valid = 1; - } - bch2_trans_iter_exit(&trans, &iter); - up_read(&c->gc_lock); - bch2_trans_exit(&trans); + if (!gc || + (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity))) { + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; + } - if (ret) { - bch_err(c, "error reading alloc info: %i", ret); - return ret; } + bch2_trans_iter_exit(&trans, &iter); - return 0; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - goto err; + bch2_trans_exit(&trans); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - new_u = alloc_mem_to_key(c, iter); - - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return 0; - - ret = bch2_alloc_write(trans, iter, &new_u, - BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); -err: - if (ret == -EINTR) - goto retry; - return ret; -} - -int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(&iter, - POS(ca->dev_idx, ca->mi.first_bucket)); + bch_err(c, "error reading alloc info: %i", ret); - while (iter.pos.offset < ca->mi.nbuckets) { - ret = bch2_alloc_write_key(&trans, &iter, flags); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } - bch2_btree_iter_advance(&iter); - } - } -err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 86b64177b3d0..98c7866e20b5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ; } +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, + const struct bkey_alloc_unpacked); int bch2_alloc_write(struct btree_trans *, struct btree_iter *, struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) -{ - struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked ret; - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - ret = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, - .data_type = g->mark.data_type, - .dirty_sectors = 
g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; - percpu_up_read(&c->mark_lock); - - return ret; -} - #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); @@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_read(struct bch_fs *, bool, bool); static inline void bch2_wake_allocator(struct bch_dev *ca) { @@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 55db3c00f8dc..91514365d72b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -536,7 +536,6 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e92769e010c1..d4b2d2657340 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -533,7 +534,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -544,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; } else { do_update = true; } @@ -560,13 +559,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; + g->_mark.data_type = 0; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -603,8 +601,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_data_types[data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.data_type = data_type; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; @@ -1169,13 +1166,14 @@ static int 
bch2_gc_done(struct bch_fs *c, unsigned i, dev; int ret = 0; + percpu_down_write(&c->mark_lock); + #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -1185,18 +1183,6 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ - } -#define copy_bucket_field(_f) \ - if (dst->b[b]._f != src->b[b]._f) { \ - if (verify) \ - fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", dev, b, \ - dst->b[b].mark.gen, \ - bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b]._f, src->b[b]._f); \ - dst->b[b]._f = src->b[b]._f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) @@ -1207,36 +1193,18 @@ static int bch2_gc_done(struct bch_fs *c, bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bucket_array *dst = __bucket_array(ca, 0); - struct bucket_array *src = __bucket_array(ca, 1); - size_t b; - - for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(_mark.gen); - copy_bucket_field(_mark.data_type); - copy_bucket_field(_mark.stripe); - copy_bucket_field(_mark.dirty_sectors); - copy_bucket_field(_mark.cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); - - dst->b[b].oldest_gen = src->b[b].oldest_gen; - } - - { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); - - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); - } + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } }; @@ -1278,7 +1246,6 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_fs_field #undef copy_dev_field -#undef copy_bucket_field #undef copy_stripe_field #undef copy_field fsck_err: @@ -1286,6 +1253,8 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); return ret; } @@ -1308,15 +1277,6 @@ static int bch2_gc_start(struct bch_fs *c, BUG_ON(ca->buckets[1]); BUG_ON(ca->usage_gc); - ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets[1]) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1325,33 +1285,151 
@@ static int bch2_gc_start(struct bch_fs *c, } } - percpu_down_write(&c->mark_lock); + return 0; +} + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool initial, bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket *g; + struct bkey_s_c k; + struct bkey_alloc_unpacked old_u, new_u, gc_u; + struct bkey_alloc_buf *a; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + old_u = new_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, iter->pos.offset); + gc_u = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, + .oldest_gen = g->oldest_gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; + percpu_up_read(&c->mark_lock); + + if (metadata_only && + gc_u.data_type != BCH_DATA_sb && + gc_u.data_type != BCH_DATA_journal && + gc_u.data_type != BCH_DATA_btree) + return 0; + + if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || + gen_after(old_u.gen, gc_u.gen)) + return 0; + +#define copy_bucket_field(_f) \ + if (fsck_err_on(new_u._f != gc_u._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + new_u.gen, \ + bch2_data_types[new_u.data_type], \ + new_u._f, gc_u._f)) \ + new_u._f = gc_u._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field + + new_u.oldest_gen = gc_u.oldest_gen; + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + + a = bch2_alloc_pack(trans, new_u); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = initial + ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) + : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + +static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - struct bucket_array *dst = __bucket_array(ca, 1); - struct bucket_array *src = __bucket_array(ca, 0); - size_t b; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; - dst->first_bucket = src->first_bucket; - dst->nbuckets = src->nbuckets; + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, + initial, metadata_only)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); - for (b = 0; b < src->nbuckets; b++) { - struct bucket *d = &dst->b[b]; - struct bucket *s = &src->b[b]; + if (ret) { + bch_err(c, "error writing alloc info: %i", ret); + percpu_ref_put(&ca->ref); + break; + } + } - d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; - d->gen_valid = s->gen_valid; + bch2_trans_exit(&trans); + return ret; +} - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) - d->_mark = s->mark; +static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + percpu_up_write(&c->mark_lock); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; } - }; - percpu_up_write(&c->mark_lock); + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets[1], buckets); + }; - return 0; + return bch2_alloc_read(c, true, metadata_only); } static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) @@ -1598,6 +1676,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_alloc_start(c, initial, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; @@ -1665,16 +1744,15 @@ out: if (!ret) { bch2_journal_block(&c->journal); - percpu_down_write(&c->mark_lock); - ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: + bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_alloc_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); - } else { - percpu_down_write(&c->mark_lock); } + percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 383838d66edf..feafb7296ddf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1113,7 +1113,11 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c); + + 
down_read(&c->gc_lock); + ret = bch2_alloc_read(c, false, false); + up_read(&c->gc_lock); + if (ret) goto err; bch_verbose(c, "alloc read done"); @@ -1171,23 +1175,6 @@ use_clean: if (c->opts.verbose || !c->sb.clean) bch_info(c, "journal replay done"); - if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && - !c->opts.nochanges) { - /* - * note that even when filesystem was clean there might be work - * to do here, if we ran gc (because of fsck) which recalculated - * oldest_gen: - */ - bch_verbose(c, "writing allocation info"); - err = "error writing out alloc info"; - ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error writing alloc info"); - goto err; - } - bch_verbose(c, "alloc write done"); - } - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); -- cgit From d73e0d2cd185c313b8a9063b11b3fb91df9db261 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Dec 2021 03:37:52 -0500 Subject: bcachefs: Copygc no longer uses bucket array This converts the copygc code to use the alloc btree directly to find buckets that need to be evacuated instead of the in-memory bucket array, which is finally going away soon. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 155 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 64cb10c3f3db..dd71c0ce0a84 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" @@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } +static int walk_buckets_to_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); + struct copygc_heap_entry e; + + u = bch2_alloc_unpack(k); + + if (u.data_type != BCH_DATA_user || + u.dirty_sectors >= ca->mi.bucket_size || + bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) + continue; + + e = (struct copygc_heap_entry) { + .dev = iter.pos.inode, + .gen = u.gen, + .replicas = 1 + u.stripe_redundancy, + .fragmentation = u.dirty_sectors * (1U << 15) + / ca->mi.bucket_size, + .sectors = u.dirty_sectors, + .offset = bucket_to_sector(ca, iter.pos.offset), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +static int bucket_inorder_cmp(const void *_l, const void *_r) +{ + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; + + return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); +} + +static int check_copygc_was_done(struct bch_fs *c, + u64 *sectors_not_moved, + u64 *buckets_not_moved) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct copygc_heap_entry *i; + int ret = 0; + + sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 
0); + + for (i = h->data; i < h->data + h->used; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + + bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); + + ret = lockrestart_do(&trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + break; + + u = bch2_alloc_unpack(k); + + if (u.gen == i->gen && u.dirty_sectors) { + *sectors_not_moved += u.dirty_sectors; + *buckets_not_moved += 1; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; - struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; - size_t b, heap_size = 0; + size_t heap_size = 0; int ret; bch_move_stats_init(&move_stats, "copygc"); @@ -178,34 +267,12 @@ static int bch2_copygc(struct bch_fs *c) spin_lock(&ca->fs->freelist_lock); sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; spin_unlock(&ca->fs->freelist_lock); + } - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket *g = buckets->b + b; - struct bucket_mark m = READ_ONCE(g->mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_user || - m.dirty_sectors >= ca->mi.bucket_size) - continue; - - WARN_ON(m.stripe && !g->stripe_redundancy); - - e = (struct copygc_heap_entry) { - .dev = dev_idx, - .gen = m.gen, - .replicas = 1 + g->stripe_redundancy, - .fragmentation = m.dirty_sectors * (1U << 15) - / ca->mi.bucket_size, - .sectors = m.dirty_sectors, - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -fragmentation_cmp, NULL); - } - up_read(&ca->bucket_lock); + ret = walk_buckets_to_copygc(c); + if (ret) { + bch2_fs_fatal_error(c, "error walking buckets to copygc!"); + return ret; } if (!h->used) { @@ -251,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); + if (ret) { + bch_err(c, "error %i from bch2_move_data() in copygc", ret); + return ret; + } - for_each_rw_member(ca, c, dev_idx) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - struct bucket_mark m; - size_t b; - - if (i->dev != dev_idx) - continue; - - b = sector_to_bucket(ca, i->offset); - m = READ_ONCE(buckets->b[b].mark); - - if (i->gen == m.gen && - m.dirty_sectors) { - sectors_not_moved += m.dirty_sectors; - buckets_not_moved++; - } - } - up_read(&ca->bucket_lock); + ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); + if (ret) { + bch_err(c, "error %i from check_copygc_was_done()", ret); + return ret; } - if (sectors_not_moved && !ret) + if (sectors_not_moved) bch_warn_ratelimited(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", sectors_not_moved, sectors_to_move, -- cgit From c45c866761671ddfc180a7fffa2e9f96bb8affd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Dec 2021 04:51:10 -0500 Subject: bcachefs: bch2_gc_gens() no longer uses bucket array Like the previous patches, this converts bch2_gc_gens() to use the alloc btree directly, and private arrays of generation numbers for its own recalculation 
of oldest_gen. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 + fs/bcachefs/btree_gc.c | 108 +++++++++++++++++++++++++++++++------------- fs/bcachefs/buckets.h | 6 --- fs/bcachefs/buckets_types.h | 1 - fs/bcachefs/super.c | 1 + fs/bcachefs/sysfs.c | 19 ++++---- 6 files changed, 90 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91514365d72b..1985af8311dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -452,6 +452,7 @@ struct bch_dev { */ struct bucket_array __rcu *buckets[2]; struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; @@ -806,6 +807,7 @@ struct bch_fs { * it's not while a gc is in progress. */ struct rw_semaphore gc_lock; + struct mutex gc_gens_lock; /* IO PATH */ struct semaphore io_in_flight; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d4b2d2657340..9c33341c7947 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1787,9 +1787,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); - if (gen_after(g->mark.gen, ptr->gen) > 16) { + if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); return true; } @@ -1797,10 +1796,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; } percpu_up_read(&c->mark_lock); @@ -1811,23 +1810,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, BTREE_ITER_PREFETCH| BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); - while ((bch2_trans_begin(&trans), + while ((bch2_trans_begin(trans), k = bch2_btree_iter_peek(&iter)).k) { ret = bkey_err(k); @@ -1843,10 +1841,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_extent_normalize(c, bkey_i_to_s(sk.k)); commit_err = - bch2_trans_update(&trans, &iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); + bch2_trans_update(trans, &iter, sk.k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); if (commit_err == -EINTR) { commit_err = 0; continue; @@ -1855,20 +1853,42 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +static int 
bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + u = bch2_alloc_unpack(k); + + if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + + return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); +} + int bch2_gc_gens(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - u64 start_time = local_clock(); + u64 b, start_time = local_clock(); unsigned i; int ret; @@ -1877,36 +1897,53 @@ int bch2_gc_gens(struct bch_fs *c) * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ + if (!mutex_trylock(&c->gc_gens_lock)) + return 0; + down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); + struct bucket_gens *gens; + + BUG_ON(ca->oldest_gen); + + ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -ENOMEM; + goto err; + } + + gens = bucket_gens(ca); - for_each_bucket(g, buckets) - g->gc_gen = g->mark.gen; - up_read(&ca->bucket_lock); + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; } for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(c, i); + ret = bch2_gc_btree_gens(&trans, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; } } - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->oldest_gen = g->gc_gen; - up_read(&ca->bucket_lock); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter)); + if (ret) { + bch_err(c, "error writing oldest_gen: %i", ret); + break; + } } + bch2_trans_iter_exit(&trans, &iter); c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; @@ -1915,7 +1952,14 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_exit(&trans); up_read(&c->gc_lock); + mutex_unlock(&c->gc_gens_lock); return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 483c8b24293f..8a3cea6f94df 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -97,12 +97,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); -} - static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 24139831226d..2c73dc60b838 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -30,7 +30,6 @@ struct bucket { u64 io_time[2]; u8 oldest_gen; - u8 
gc_gen; unsigned gen_valid:1; u8 stripe_redundancy; u32 stripe; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a90fa0ae550b..d35547fbefdb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -682,6 +682,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ed9a095063e8..b727845dd64b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -499,6 +499,17 @@ STORE(bch2_fs) /* Debugging: */ + if (!test_bit(BCH_FS_RW, &c->flags)) + return -EROFS; + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } + if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -512,14 +523,6 @@ STORE(bch2_fs) #endif } - if (attr == &sysfs_prune_cache) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); - } - #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -- cgit From aa8982c3f2cbfca89fb73daad9d6e65f7be022c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Feb 2022 03:40:44 -0500 Subject: bcachefs: Fix reflink repair code The reflink repair code was incorrectly inserting a nonzero deleted key via journal replay - this is due to bch2_journal_key_insert() being somewhat hacky, and so this fix is also hacky for now. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9c33341c7947..d1fbe3b77379 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1501,10 +1501,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bkey_reassemble(new, k); - if (!r->refcount) + if (!r->refcount) { new->k.type = KEY_TYPE_deleted; - else + /* + * XXX ugly: bch2_journal_key_insert() queues up + * the key for the journal replay code, which + * doesn't run the extent overwrite pass + */ + if (initial) + new->k.size = 0; + } else { *bkey_refcount(new) = cpu_to_le64(r->refcount); + } ret = initial ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) -- cgit From 9e34316156a2c148b0675087beeaca26f7eb79f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Feb 2022 20:42:12 -0500 Subject: bcachefs: Small fsck fix The check_dirents pass handles transaction restarts at the toplevel - check_subdir_count() was incorrectly handling transaction restarts without returning -EINTR, meaning that the iterator pointing to the dirent being checked was left invalid. 
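A minimal sketch of the pattern the fix relies on (hypothetical names, not the real fsck helpers): the inner helper must hand any error - in particular a transaction restart - back to its caller instead of retrying locally, because after a restart the caller's iterator has to be re-seeked at the toplevel:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for bch2_count_subdirs(): may fail with -EINTR on restart. */
    static int count_subdirs(long inum, long *count)
    {
            static bool restarted;

            if (!restarted) {
                    restarted = true;
                    return -EINTR;  /* pretend the btree walk raced */
            }
            *count = 2;
            return 0;
    }

    /* Stand-in for check_subdir_count(): propagate, don't swallow. */
    static int check_one_inode(long inum)
    {
            long count;
            int ret = count_subdirs(inum, &count);

            if (ret)
                    return ret;     /* caller's iterator may be invalid now */

            printf("inode %ld: %ld subdirectories\n", inum, count);
            return 0;
    }

    int main(void)
    {
            int ret;

            /* Toplevel loop: on -EINTR, restart and re-seek iterators here. */
            do {
                    ret = check_one_inode(42);
            } while (ret == -EINTR);

            return ret;
    }

The previous code wrapped the inner call in a local retry (lockrestart_do()), which consumed the restart without telling check_dirents, leaving the dirent iterator pointing at stale state.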
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 43b6159be01b..ced4d671eb8d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1316,8 +1316,9 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_nlink == i->count) continue; - count2 = lockrestart_do(trans, - bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); + count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + if (count2 < 0) + return count2; if (i->count != count2) { bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", -- cgit From 0f78264a6b84733cc9ef36d22c547133cab21270 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Feb 2022 20:47:05 -0500 Subject: bcachefs: Print a better message for mark and sweep pass Btree gc, aka mark and sweep, checks allocations - so let's just print that. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index feafb7296ddf..939f7565d290 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1144,12 +1144,12 @@ use_clean: test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bool metadata_only = c->opts.norecovery; - bch_info(c, "starting mark and sweep"); + bch_info(c, "checking allocations"); err = "error in mark and sweep"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; - bch_verbose(c, "mark and sweep done"); + bch_verbose(c, "done checking allocations"); } bch2_stripes_heap_start(c); -- cgit From 2ce8fbd9bbfaea9786f56012a633dedb86349c5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Feb 2022 22:16:45 -0500 Subject: bcachefs: Kill bch2_bkey_debugcheck The old .debugcheck methods are no more and this just calls the .invalid method, which doesn't add much since we already check that when doing btree updates and when reading metadata in. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 --- fs/bcachefs/bkey_methods.c | 16 ---------------- fs/bcachefs/bkey_methods.h | 2 -- fs/bcachefs/btree_iter.c | 16 +--------------- 4 files changed, 1 insertion(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1985af8311dc..d9ba48ce7601 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -281,9 +281,6 @@ do { \ "significantly affect performance") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bkeys, \ - "Run bkey_debugcheck (primarily checking GC/allocation "\ - "information) when iterating over keys") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(journal_seq_verify, \ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c93004741b87..f11b6d9f08d6 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -212,22 +212,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) return NULL; } -void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -{ - const char *invalid; - - BUG_ON(!k.k->u64s); - - invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, k); - if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); - } -} - void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (!bpos_cmp(pos, POS_MIN)) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 4e316c2f6954..520f7d93993d 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -34,8 +34,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); - void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); void bch2_val_to_text(struct printbuf *, struct bch_fs *, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c7ba6ce27007..c56f9e101b42 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -990,8 +990,6 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, struct bkey *u, struct bkey_packed *k) { - struct bkey_s_c ret; - if (unlikely(!k)) { /* * signal to bch2_btree_iter_peek_slot() that we're currently at @@ -1001,19 +999,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, return bkey_s_c_null; } - ret = bkey_disassemble(l->b, k, u); - - /* - * XXX: bch2_btree_bset_insert_key() generates invalid keys when we - * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key - * being overwritten but doesn't change k->size. 
But this is ok, because - * those keys are never written out, we just have to avoid a spurious - * assertion here: - */ - if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(c, l->b, ret); - - return ret; + return bkey_disassemble(l->b, k, u); } static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, -- cgit From 52eef42c5fecb037b626cbab2dd06f34e5f0fddb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Feb 2022 23:40:30 -0500 Subject: bcachefs: Fix locking in data move path We need to ensure we don't have any btree locks held when calling do_pending_writes() - besides issuing IOs, upcoming allocator changes will have allocations doing btree lookups directly. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f428e2ff99f6..04971bf847bf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -486,19 +486,22 @@ static void move_read_endio(struct bio *bio) closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt) +static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) { struct moving_io *io; + if (trans) + bch2_trans_unlock(trans); + while ((io = next_pending_write(ctxt))) { list_del(&io->list); move_write(io); } } -#define move_ctxt_wait_event(_ctxt, _cond) \ +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ do { \ - do_pending_writes(_ctxt); \ + do_pending_writes(_ctxt, _trans); \ \ if (_cond) \ break; \ @@ -506,11 +509,12 @@ do { \ next_pending_write(_ctxt) || (_cond)); \ } while (1) -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, + struct btree_trans *trans) { unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - move_ctxt_wait_event(ctxt, + move_ctxt_wait_event(ctxt, trans, !atomic_read(&ctxt->write_sectors) || atomic_read(&ctxt->write_sectors) != sectors_pending); } @@ -532,14 +536,6 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -692,12 +688,19 @@ static int __bch2_move_data(struct bch_fs *c, schedule_timeout(delay); if (unlikely(freezing(current))) { - bch2_trans_unlock(&trans); - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -762,7 +765,7 @@ static int __bch2_move_data(struct bch_fs *c, if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); + bch2_move_ctxt_wait_for_io(ctxt, &trans); continue; } @@ -847,7 +850,7 @@ int bch2_move_data(struct bch_fs *c, } - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + 
move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); EBUG_ON(atomic_read(&ctxt.write_sectors)); -- cgit From 8be1aff0092a1f747973bf978ab8411b257af461 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Feb 2022 21:45:04 -0500 Subject: bcachefs: Delete redundant tracepoint We were emitting two trace events on transaction restart in this code path - delete the redundant one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 72a54b9d1335..1841760237ec 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -320,7 +320,6 @@ retry: if (!trans->restarted) goto retry; - trace_transaction_restart_ip(trans->fn, _THIS_IP_); ret = -EINTR; goto err; } -- cgit From c7ce27328ba133d6cce76a4df7667088009d4543 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Feb 2022 22:28:37 -0500 Subject: bcachefs: Also show when blocked on write locks This consolidates some of the btree node lock path, so that when we're blocked taking a write lock on a node it shows up in bch2_btree_trans_to_text(), along with intent and read locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 27 +++++----------------- fs/bcachefs/btree_locking.h | 46 +++++++++++++++++++++++-------------- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 6 ++--- fs/bcachefs/btree_update_leaf.c | 8 ++++--- 5 files changed, 44 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c56f9e101b42..6c1fbe3e3bda 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -150,7 +150,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) else this_cpu_sub(*b->c.lock.readers, readers); - btree_node_lock_type(trans->c, b, SIX_LOCK_write); + six_lock_write(&b->c.lock, NULL, NULL); if (!b->c.lock.readers) atomic64_add(__SIX_VAL(read_lock, readers), @@ -289,9 +289,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, unsigned long ip) { struct btree_path *linked, *deadlock_path = NULL; - u64 start_time = local_clock(); unsigned reason = 9; - bool ret; /* Check if it's safe to block: */ trans_for_each_path(trans, linked) { @@ -368,23 +366,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, return false; } - if (six_trylock_type(&b->c.lock, type)) - return true; - - trans->locking_path_idx = path->idx; - trans->locking_pos = pos; - trans->locking_btree_id = path->btree_id; - trans->locking_level = level; - trans->locking = b; - - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; - - trans->locking = NULL; - - if (ret) - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return ret; + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); } /* Btree iterator locking: */ @@ -3191,6 +3174,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) struct btree_trans *trans; struct btree_path *path; struct btree *b; + static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; mutex_lock(&c->btree_trans_lock); @@ -3227,10 +3211,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { path = &trans->paths[trans->locking_path_idx]; - pr_buf(out, " locking path %u %c l=%u %s:", + pr_buf(out, " locking path %u %c l=%u %c %s:", trans->locking_path_idx, 
path->cached ? 'c' : 'b', trans->locking_level, + lock_types[trans->locking_lock_type], bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index d05689180c63..4a87fa625d7a 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -127,23 +127,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) } } -/* - * wrapper around six locks that just traces lock contended time - */ -static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) -{ - u64 start_time = local_clock(); - - six_lock_type(&b->c.lock, type, NULL, NULL); - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -} - -static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) -{ - if (!six_trylock_type(&b->c.lock, type)) - __btree_node_lock_type(c, b, type); +static inline bool btree_node_lock_type(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct bch_fs *c = trans->c; + u64 start_time; + bool ret; + + if (six_trylock_type(&b->c.lock, type)) + return true; + + start_time = local_clock(); + + trans->locking_path_idx = path->idx; + trans->locking_pos = pos; + trans->locking_btree_id = path->btree_id; + trans->locking_level = level; + trans->locking_lock_type = type; + trans->locking = b; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + trans->locking = NULL; + + if (ret) + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + + return ret; } /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0afade4f61f4..7e5b70f60444 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -383,6 +383,7 @@ struct btree_trans { struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; + u8 locking_lock_type; pid_t pid; int srcu_idx; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7d5efb32b082..07bece908691 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -607,8 +607,8 @@ err: * we're in journal error state: */ - btree_node_lock_type(c, b, SIX_LOCK_intent); - btree_node_lock_type(c, b, SIX_LOCK_write); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -662,7 +662,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_type(c, b, SIX_LOCK_read); + six_lock_read(&b->c.lock, NULL, NULL); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index dc033991a4ec..bde4bb2b7fcc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -169,7 +169,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - btree_node_lock_type(c, b, SIX_LOCK_read); + six_lock_read(&b->c.lock, NULL, NULL); bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->c.lock); @@ -626,8 +626,10 @@ static inline int 
trans_lock_write(struct btree_trans *trans) if (have_conflicting_read_lock(trans, i->path)) goto fail; - __btree_node_lock_type(trans->c, insert_l(i)->b, - SIX_LOCK_write); + btree_node_lock_type(trans, i->path, + insert_l(i)->b, + i->path->pos, i->level, + SIX_LOCK_write, NULL, NULL); } bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); -- cgit From 7abda8c1d8af41266e543160bb3290dea963fdd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Feb 2022 22:01:33 -0500 Subject: bcachefs: Fix __bch2_btree_node_lock __bch2_btree_node_lock() was implementing the wrong lock ordering for cached vs. non cached paths - this fixes it to match the btree path sort order as defined by __btree_path_cmp(), and also simplifies the code some. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 61 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6c1fbe3e3bda..7b54d662f4cb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -46,6 +46,9 @@ static inline int __btree_path_cmp(const struct btree_path *l, struct bpos r_pos, unsigned r_level) { + /* + * Must match lock ordering as defined by __bch2_btree_node_lock: + */ return cmp_int(l->btree_id, r_btree_id) ?: cmp_int((int) l->cached, (int) r_cached) ?: bpos_cmp(l->pos, r_pos) ?: @@ -288,8 +291,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_path *linked, *deadlock_path = NULL; - unsigned reason = 9; + struct btree_path *linked; + unsigned reason; /* Check if it's safe to block: */ trans_for_each_path(trans, linked) { @@ -310,28 +313,28 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - deadlock_path = linked; reason = 1; + goto deadlock; } if (linked->btree_id != path->btree_id) { - if (linked->btree_id > path->btree_id) { - deadlock_path = linked; - reason = 3; - } - continue; + if (linked->btree_id < path->btree_id) + continue; + + reason = 3; + goto deadlock; } /* - * Within the same btree, cached paths come before non - * cached paths: + * Within the same btree, non-cached paths come before cached + * paths: */ if (linked->cached != path->cached) { - if (path->cached) { - deadlock_path = linked; - reason = 4; - } - continue; + if (!linked->cached) + continue; + + reason = 4; + goto deadlock; } /* @@ -340,34 +343,32 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - deadlock_path = linked; reason = 5; + goto deadlock; } /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, linked->cached)) <= 0) { - deadlock_path = linked; reason = 7; + goto deadlock; } } - if (unlikely(deadlock_path)) { - trace_trans_restart_would_deadlock(trans->fn, ip, - trans->in_traverse_all, reason, - deadlock_path->btree_id, - deadlock_path->cached, - &deadlock_path->pos, - path->btree_id, - path->cached, - &pos); - btree_trans_restart(trans); - return false; - } - return btree_node_lock_type(trans, path, b, pos, level, type, should_sleep_fn, p); +deadlock: + trace_trans_restart_would_deadlock(trans->fn, ip, + trans->in_traverse_all, reason, + linked->btree_id, + linked->cached, + &linked->pos, + 
path->btree_id, + path->cached, + &pos); + btree_trans_restart(trans); + return false; } /* Btree iterator locking: */ -- cgit From fcf01959eaa828b1005f8f30732949e64edb8c4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Feb 2022 04:20:39 -0500 Subject: bcachefs: Kill verify_not_stale() This is ancient code that's more effectively checked in other places now. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e2038032b872..dc2f153f60c6 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -151,22 +151,6 @@ static void open_bucket_free_unused(struct bch_fs *c, } } -static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct open_bucket *ob; - unsigned i; - - rcu_read_lock(); - open_bucket_for_each(c, obs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - - BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); - } - rcu_read_unlock(); -#endif -} - /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { @@ -857,8 +841,6 @@ alloc_done: BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - verify_not_stale(c, &wp->ptrs); - return wp; err: open_bucket_for_each(c, &wp->ptrs, ob, i) -- cgit From eb331fe5a4e801dc11d96ba7fbda0a91c8bd626c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Feb 2022 00:06:59 -0500 Subject: bcachefs: Check for stale dirty pointer before reads Since we retry reads when we discover we read from a pointer that went stale, if a dirty pointer is erroneously stale it would cause us to loop retrying that read forever - unless we check before issuing the read, while the btree is still locked, when we know that a dirty pointer should never be stale. This patch adds that check, along with printing some helpful debug info.
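
As a minimal user-space sketch of that pattern (not the bcachefs code - every identifier below is invented for illustration): when already in the retry path, a non-cached (dirty) pointer whose generation no longer matches the bucket generation is reported and skipped so another replica is picked, instead of issuing the read and looping forever.

    #include <stdbool.h>
    #include <stdio.h>

    struct ptr { unsigned dev, gen; bool cached; };

    static bool ptr_is_stale(const struct ptr *p, const unsigned *bucket_gen)
    {
            return bucket_gen[p->dev] != p->gen;
    }

    /* Returns the index of a usable replica, or -1 if none is left. */
    static int pick_replica(struct ptr *ptrs, unsigned nr,
                            const unsigned *bucket_gen, bool in_retry)
    {
            for (unsigned i = 0; i < nr; i++) {
                    struct ptr *p = &ptrs[i];

                    /* A dirty pointer should never be stale: report it, re-pick. */
                    if (in_retry && !p->cached && ptr_is_stale(p, bucket_gen)) {
                            fprintf(stderr, "stale dirty pointer on dev %u\n", p->dev);
                            continue;
                    }
                    return i;       /* issue the read from this replica */
            }
            return -1;
    }

    int main(void)
    {
            unsigned bucket_gen[2] = { 7, 3 };
            struct ptr ptrs[2] = { { 0, 6, false }, { 1, 3, false } };

            return pick_replica(ptrs, 2, bucket_gen, true) == 1 ? 0 : 1;
    }
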
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 -- fs/bcachefs/io.c | 60 ++++++++++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/move.c | 6 ++++-- 3 files changed, 54 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 5fce958bafc9..9161125aec17 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1043,8 +1043,6 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - bch2_trans_unlock(trans); - if (readpages_iter) readpage_bio_extend(readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 218934b4e19b..914e22c5c247 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2032,6 +2032,33 @@ err: return ret; } +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + char buf[200]; + int ret; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), + BTREE_ITER_CACHED); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + return; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch_err(c, "%s", buf); + bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + bch2_trans_iter_exit(trans, &iter); +} + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, @@ -2041,7 +2068,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca; + struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -2058,7 +2085,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, zero_fill_bio_iter(&orig->bio, iter); goto out_read_done; } - +retry_pick: pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ @@ -2071,8 +2098,27 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto err; } - if (pick_ret > 0) - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); if (flags & BCH_READ_NODECODE) { /* @@ -2367,12 +2413,6 @@ retry: */ sectors = min(sectors, k.k->size - offset_into_extent); - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(&trans); - bytes = min(sectors, 
bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 04971bf847bf..4751d79219cb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -752,10 +752,12 @@ static int __bch2_move_data(struct bch_fs *c, BUG(); } - /* unlock before doing IO: */ + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); -- cgit From 4b59a319ad29815aa8f629513df2c291c2108bf9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Feb 2022 00:42:34 -0500 Subject: bcachefs: Fix slow tracepoints Some of our tracepoints were calling snprintf("pS") - which does symbol table lookups - in TP_fast_assign(), which turns out to be a really bad idea. This was done because perf trace wasn't correctly printing tracepoints that use %pS anymore - but it turns out trace-cmd does handle it correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/trace.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5e78c396e24c..64b7d9364fd9 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -358,7 +358,7 @@ TRACE_EVENT(btree_node_relock_fail, TP_STRUCT__entry( __array(char, trans_fn, 24 ) - __array(char, caller, 32 ) + __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u64, pos_inode ) __field(u64, pos_offset ) @@ -370,7 +370,7 @@ TRACE_EVENT(btree_node_relock_fail, TP_fast_assign( strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); + __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; __entry->pos_offset = pos->offset; @@ -380,9 +380,9 @@ TRACE_EVENT(btree_node_relock_fail, __entry->node_lock_seq = node_lock_seq; ), - TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", __entry->trans_fn, - __entry->caller, + (void *) __entry->caller_ip, __entry->btree_id, __entry->pos_inode, __entry->pos_offset, @@ -673,7 +673,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_STRUCT__entry( __array(char, trans_fn, 24 ) - __array(char, caller, 32 ) + __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u64, pos_inode ) __field(u64, pos_offset ) @@ -682,16 +682,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_fast_assign( strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); + __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; __entry->pos_offset = pos->offset; __entry->pos_snapshot = pos->snapshot; ), - TP_printk("%s %s btree %u pos %llu:%llu:%u", + TP_printk("%s %pS btree %u pos %llu:%llu:%u", __entry->trans_fn, - __entry->caller, + (void *) __entry->caller_ip, __entry->btree_id, __entry->pos_inode, __entry->pos_offset, -- cgit From 33aa419db96077993af90eddb49adac1270a96e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Feb 2022 03:13:36 -0500 Subject: bcachefs: Fix __btree_path_traverse_all The loop that traverses paths in traverse_all() needs to be a little bit tricky, because traversing a path can cause other paths to 
be added (or perhaps removed) at about the same position. The old logic was buggy, replace it with simpler logic. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7b54d662f4cb..c6c1c9da45f1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1462,17 +1462,17 @@ retry_all: while (i < trans->nr_sorted) { path = trans->paths + trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret) - goto retry_all; - - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - if (path->nodes_locked || - !btree_path_node(path, path->level)) + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) + goto retry_all; + } else { i++; + } } /* -- cgit From e7bc7cdff813719479d555d9a3ebb62bef3050ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Feb 2022 02:50:39 -0500 Subject: bcachefs: Improve journal_entry_btree_keys_to_text() This improves the formatting of journal_entry_btree_keys_to_text() by putting each key on its own line. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 12 +++++++++--- fs/bcachefs/util.h | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 75b805732c21..03bc94b586f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -298,11 +298,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs struct jset_entry *entry) { struct bkey_i *k; + bool first = true; - pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); - - vstruct_for_each(entry, k) + vstruct_for_each(entry, k) { + if (!first) { + printbuf_newline(out); + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } } static int journal_entry_btree_root_validate(struct bch_fs *c, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fbe5b710e9c5..e047e7860584 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -238,6 +238,7 @@ do { \ struct printbuf { char *pos; char *end; + unsigned indent; }; static inline size_t printbuf_remaining(struct printbuf *buf) @@ -259,6 +260,27 @@ do { \ __VA_ARGS__); \ } while (0) +static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) +{ + buf->indent += spaces; + while (spaces--) + pr_buf(buf, " "); +} + +static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) +{ + buf->indent -= spaces; +} + +static inline void printbuf_newline(struct printbuf *buf) +{ + unsigned i; + + pr_buf(buf, "\n"); + for (i = 0; i < buf->indent; i++) + pr_buf(buf, " "); +} + void bch_scnmemcpy(struct printbuf *, const char *, size_t); int bch2_strtoint_h(const char *, int *); -- cgit From c929f2306e61500bf68a39cb2a16006bfe844d52 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Feb 2022 01:58:12 -0500 Subject: bcachefs: Stale ptr cleanup is now done by gc_gens Before we had dedicated gc code for bucket->oldest_gen this was btree_gc's responsibility, but now that we have that we 
can rip it out, simplifying the already overcomplicated btree_gc. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 55 +++++++++----------------------------------------- 1 file changed, 10 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d1fbe3b77379..ba22c36e30a1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -726,11 +726,9 @@ fsck_err: static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k, - u8 *max_stale, bool initial) + bool initial) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; unsigned flags = @@ -755,17 +753,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } - ptrs = bch2_bkey_ptrs_c(*k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - if (gen_after(g->oldest_gen, ptr->gen)) - g->oldest_gen = ptr->gen; - - *max_stale = max(*max_stale, ptr_stale(ca, ptr)); - } - ret = bch2_mark_key(trans, old, *k, flags); fsck_err: err: @@ -774,8 +761,7 @@ err: return ret; } -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, - bool initial) +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { struct bch_fs *c = trans->c; struct btree_node_iter iter; @@ -784,8 +770,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma struct bkey_buf prev, cur; int ret = 0; - *max_stale = 0; - if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; @@ -796,7 +780,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, max_stale, initial); + &k, initial); if (ret) break; @@ -827,7 +811,6 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale = 0; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -838,21 +821,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(trans, b, &max_stale, initial); + ret = btree_gc_mark_node(trans, b, initial); if (ret) break; - - if (!initial) { - if (max_stale > 64) - bch2_btree_node_rewrite(trans, &iter, b, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!bch2_btree_gc_rewrite_disabled && - (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(trans, &iter, - b, BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - } } bch2_trans_iter_exit(trans, &iter); @@ -864,8 +835,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, - &k, &max_stale, initial); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + true, &k, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -880,7 +851,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - u8 max_stale = 0; char buf[200]; int ret = 0; @@ -893,8 +863,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, &max_stale, true); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); goto fsck_err; @@ -985,7 +955,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; - u8 max_stale = 0; char buf[100]; int ret = 0; @@ -1018,7 +987,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, struct bkey_s_c k = bkey_i_to_s_c(&b->key); ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, - &k, &max_stale, true); + &k, true); } fsck_err: six_unlock_read(&b->c.lock); @@ -1313,7 +1282,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, .dev = iter->pos.inode, .bucket = iter->pos.offset, .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, .data_type = g->mark.data_type, .dirty_sectors = g->mark.dirty_sectors, .cached_sectors = g->mark.cached_sectors, @@ -1330,8 +1298,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc_u.data_type != BCH_DATA_btree) return 0; - if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || - gen_after(old_u.gen, gc_u.gen)) + if (gen_after(old_u.gen, gc_u.gen)) return 0; #define copy_bucket_field(_f) \ @@ -1353,8 +1320,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, copy_bucket_field(stripe); #undef copy_bucket_field - new_u.oldest_gen = gc_u.oldest_gen; - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; -- cgit From 2232fa397c2be92ed80ee48d52de98a1a2916b06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Feb 2022 01:42:31 -0500 Subject: bcachefs: Only allocate buckets_nouse when requested It's only needed by the migrate tool - this patch adds an option to enable allocating it. 
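
A rough sketch of the shape of this change (a user-space illustration with made-up names, not the kernel code): the bitmap is allocated only when a flag asks for it, and every consumer tolerates it being NULL.

    #include <stdlib.h>

    struct dev {
            unsigned char *nouse_bitmap;    /* stays NULL unless requested */
            size_t nbuckets;
    };

    static int dev_buckets_alloc(struct dev *d, size_t nbuckets, int want_nouse)
    {
            d->nbuckets = nbuckets;
            d->nouse_bitmap = NULL;

            if (want_nouse) {
                    d->nouse_bitmap = calloc((nbuckets + 7) / 8, 1);
                    if (!d->nouse_bitmap)
                            return -1;      /* out of memory */
            }
            return 0;
    }

    static int bucket_nouse(const struct dev *d, size_t b)
    {
            /* Callers must cope with the bitmap not existing. */
            return d->nouse_bitmap
                    ? (d->nouse_bitmap[b / 8] >> (b % 8)) & 1
                    : 0;
    }

    int main(void)
    {
            struct dev d;

            if (dev_buckets_alloc(&d, 1024, /* want_nouse */ 0))
                    return 1;
            return bucket_nouse(&d, 12);    /* 0 - bitmap was never allocated */
    }
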
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 12 +++++++----- fs/bcachefs/opts.h | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7ca1087b5bb3..edc1918cf140 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2136,9 +2136,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) GFP_KERNEL|__GFP_ZERO)) || !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + (c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)) || + GFP_KERNEL|__GFP_ZERO))) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || @@ -2171,9 +2172,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(bucket_gens->b, old_bucket_gens->b, n); - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); + if (buckets_nouse) + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 4b438098aecb..57c829b6eee1 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -365,6 +365,11 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ -- cgit From bf7e49a4ae564108d08d314e514a6f802748d73b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Feb 2022 06:23:06 -0500 Subject: bcachefs: Change bch2_dev_lookup() to not use lookup_bdev() bch2_dev_lookup() is used from the extended attribute set methods, for setting the target options, where we're already holding an inode lock - it turns out pathname lookups also take inode locks, so that was susceptible to deadlocks. Fortunately we already stash the device name in ca->name. This does change user-visible behaviour though: instead of specifying e.g. /dev/sda1, user must now specify sda1. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d35547fbefdb..6a32b9a5dc0e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1883,20 +1883,14 @@ err: } /* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { struct bch_dev *ca; - dev_t dev; unsigned i; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) - if (ca->dev == dev) + if (!strcmp(name, ca->name)) goto found; ca = ERR_PTR(-ENOENT); found: -- cgit From 8f9ad91a02c4fd1391ce852cadd9a0227fdd624a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Feb 2022 00:47:45 -0500 Subject: bcachefs: Fix failure to allocate btree node in cache The error code when we fail to allocate a node in the btree node cache doesn't make it to bch2_btree_path_traverse_all(). Instead, we need to stash a flag in btree_trans so we know we have to take the cannibalize lock. 
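
A stripped-down sketch of that pattern (user-space, invented names, not the actual implementation): the inner operation records why it failed on the transaction object, because the error code it returns is only a generic "restart", and the outer retry loop consults the flag to decide whether it must free up memory before trying again.

    #include <stdbool.h>

    struct trans {
            bool mem_alloc_failed;
    };

    /* Pretend the first attempt hits an allocation failure. */
    static int inner_op(struct trans *t, int attempt)
    {
            if (attempt == 0) {
                    t->mem_alloc_failed = true;
                    return -1;      /* generic "restart the transaction" */
            }
            return 0;
    }

    static int run_with_retry(struct trans *t)
    {
            int attempt = 0, ret;

            do {
                    t->mem_alloc_failed = false;
                    ret = inner_op(t, attempt++);

                    if (ret && t->mem_alloc_failed) {
                            /*
                             * Reclaim memory before the next attempt - this is
                             * where something like the btree cache cannibalize
                             * lock would be taken.
                             */
                    }
            } while (ret);

            return ret;
    }

    int main(void)
    {
            struct trans t;

            return run_with_retry(&t);
    }
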
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 +++++++++ fs/bcachefs/btree_iter.c | 22 +++++----------------- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/trace.h | 8 ++++++++ 4 files changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 36b82df79fc2..c17db1d07187 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -672,6 +672,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } b = bch2_btree_node_mem_alloc(c); + + if (trans && b == ERR_PTR(-ENOMEM)) { + trans->memory_allocation_failure = true; + trace_trans_restart_memory_allocation_failure(trans->fn, + _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + if (IS_ERR(b)) return b; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c6c1c9da45f1..1015e89d2d68 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1407,12 +1407,12 @@ err: static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, unsigned, unsigned long); -static int __btree_path_traverse_all(struct btree_trans *trans, int ret, - unsigned long trace_ip) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_path *path, *prev = NULL; - int i; + unsigned long trace_ip = _RET_IP_; + int i, ret = 0; if (trans->in_traverse_all) return -EINTR; @@ -1441,7 +1441,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); - if (unlikely(ret == -ENOMEM)) { + if (unlikely(trans->memory_allocation_failure)) { struct closure cl; closure_init_stack(&cl); @@ -1452,11 +1452,6 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) - goto out; - - BUG_ON(ret && ret != -EINTR); - /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { @@ -1482,7 +1477,7 @@ retry_all: */ trans_for_each_path(trans, path) BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); -out: + bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; @@ -1491,11 +1486,6 @@ out: return ret; } -static int bch2_btree_path_traverse_all(struct btree_trans *trans) -{ - return __btree_path_traverse_all(trans, 0, _RET_IP_); -} - static inline bool btree_path_good_node(struct btree_trans *trans, struct btree_path *path, unsigned l, int check_pos) @@ -1619,8 +1609,6 @@ out: return ret; } -static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); - int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7e5b70f60444..89c0d2272d91 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -393,6 +393,7 @@ struct btree_trans { bool in_traverse_all:1; bool restarted:1; bool paths_sorted:1; + bool memory_allocation_failure:1; bool journal_transaction_names:1; bool journal_replay_not_finished:1; /* diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 64b7d9364fd9..b35022dc66c2 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -802,6 +802,14 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); +DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + 
TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(const char *trans_fn, unsigned long caller_ip, -- cgit From a9de137bf63107245b43e9046cddc1acc447221a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 00:42:12 -0500 Subject: bcachefs: Check for errors from crypto_skcipher_encrypt() Apparently it actually is possible for crypto_skcipher_encrypt() to return an error - not sure why that would be - but we need to replace our assertion with actual error handling. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 16 +++++++++++++--- fs/bcachefs/btree_io.h | 13 ++++++++----- fs/bcachefs/checksum.c | 47 +++++++++++++++++++++++++++++------------------ fs/bcachefs/checksum.h | 6 +++--- fs/bcachefs/io.c | 32 +++++++++++++++++++++++++------- fs/bcachefs/journal_io.c | 9 +++++++-- 6 files changed, 85 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b6551db03968..a0446df0d3da 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -917,7 +917,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i", ret)) + goto fsck_err; btree_err_on(btree_node_is_extents(b) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), @@ -944,7 +947,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i\n", ret)) + goto fsck_err; sectors = vstruct_sectors(bne, c->block_bits); } @@ -1753,6 +1759,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta unsigned long old, new; bool validate_before_checksum = false; void *data; + int ret; if (already_started) goto do_write; @@ -1893,7 +1900,10 @@ do_write: validate_bset_for_write(c, b, i, sectors_to_write)) goto err; - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error encrypting btree node: %i\n", ret)) + goto err; nonce = btree_nonce(i, b->written << 9); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 0f20224e2a77..095ad505338d 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -111,22 +111,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset) }}; } -static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) { struct nonce nonce = btree_nonce(i, offset); + int ret; if (!offset) { struct btree_node *bn = container_of(i, struct btree_node, keys); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, - bytes); + ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); + return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); } void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 
a1d89923d361..425582f60d7a 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -93,9 +93,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - struct scatterlist *sg, size_t len) +static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; @@ -104,17 +104,20 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); - BUG_ON(ret); + if (ret) + pr_err("got error %i from crypto_skcipher_encrypt()", ret); + + return ret; } -static inline void do_encrypt(struct crypto_sync_skcipher *tfm, +static inline int do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { struct scatterlist sg; sg_init_one(&sg, buf, len); - do_encrypt_sg(tfm, nonce, &sg, len); + return do_encrypt_sg(tfm, nonce, &sg, len); } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, @@ -136,25 +139,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, goto err; } - do_encrypt(chacha20, nonce, buf, len); + ret = do_encrypt(chacha20, nonce, buf, len); err: crypto_free_sync_skcipher(chacha20); return ret; } -static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) +static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) { u8 key[POLY1305_KEY_SIZE]; + int ret; nonce.d[3] ^= BCH_NONCE_POLY; memset(key, 0, sizeof(key)); - do_encrypt(c->chacha20, nonce, key, sizeof(key)); + ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); + if (ret) + return ret; desc->tfm = c->poly1305; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); + return 0; } struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -196,13 +203,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, } } -void bch2_encrypt(struct bch_fs *c, unsigned type, +int bch2_encrypt(struct bch_fs *c, unsigned type, struct nonce nonce, void *data, size_t len) { if (!bch2_csum_type_is_encryption(type)) - return; + return 0; - do_encrypt(c->chacha20, nonce, data, len); + return do_encrypt(c->chacha20, nonce, data, len); } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -277,23 +284,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, return __bch2_checksum_bio(c, type, nonce, bio, &iter); } -void bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) +int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) { struct bio_vec bv; struct bvec_iter iter; struct scatterlist sgl[16], *sg = sgl; size_t bytes = 0; + int ret = 0; if (!bch2_csum_type_is_encryption(type)) - return; + return 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, bytes); bytes = 0; @@ -307,7 +318,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, } sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } struct bch_csum bch2_checksum_merge(unsigned 
type, struct bch_csum a, diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index f5c1a609c5c4..c86c3c05d620 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); -void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, +int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, @@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, struct bch_extent_crc_unpacked *, unsigned, unsigned, unsigned); -void bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); +int bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 914e22c5c247..4b9ff76dd19f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -846,6 +846,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) struct bch_fs *c = op->c; struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum; + int ret; if (!bch2_csum_type_is_encryption(op->crc.csum_type)) return 0; @@ -860,10 +861,10 @@ static int bch2_write_decrypt(struct bch_write_op *op) if (bch2_crc_cmp(op->crc.csum, csum)) return -EIO; - bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; - return 0; + return ret; } static enum prep_encoded_ret { @@ -1078,8 +1079,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.live_size = src_len >> 9; swap(dst->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + crc.csum = bch2_checksum_bio(c, op->csum_type, extent_nonce(version, crc), dst); crc.csum_type = op->csum_type; @@ -1851,6 +1855,7 @@ static void __bch2_read_endio(struct work_struct *work) struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; struct bch_csum csum; + int ret; nofs_flags = memalloc_nofs_save(); @@ -1885,7 +1890,10 @@ static void __bch2_read_endio(struct work_struct *work) crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc_is_compressed(crc)) { - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; } else { @@ -1896,7 +1904,9 @@ static void __bch2_read_endio(struct work_struct *work) BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); src->bi_iter.bi_size = dst_iter.bi_size; - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -1909,7 +1919,10 @@ static void __bch2_read_endio(struct work_struct *work) * Re encrypt data we decrypted, so it's consistent with * rbio->crc: */ - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + promote_start(rbio->promote, rbio); 
rbio->promote = NULL; } @@ -1944,6 +1957,11 @@ decompression_err: "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); goto out; +decrypt_err: + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; } static void bch2_read_endio(struct bio *bio) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 03bc94b586f2..231f2e4bd1b9 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -724,9 +724,11 @@ static int jset_validate(struct bch_fs *c, sector, le64_to_cpu(jset->seq))) ret = JOURNAL_ENTRY_BAD; - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && @@ -1594,9 +1596,12 @@ void bch2_journal_write(struct closure *cl) jset_validate_for_write(c, jset)) goto err; - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) + goto err; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -- cgit From 72b7d6332b0a769b0b76c78b372aa733a3715c42 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 01:18:18 -0500 Subject: bcachefs: Store logical location of journal entries When viewing what's in the journal, it's more useful to have the logical location - journal bucket and offset within that bucket - than just the offset on that device. 
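
The translation itself is simple arithmetic; here is a standalone sketch (hypothetical layout, not the kernel code) of mapping an absolute device sector back to a journal bucket and an offset within that bucket:

    #include <stdio.h>

    struct loc { unsigned bucket; unsigned long long offset; };

    static struct loc sector_to_journal_loc(const unsigned long long *bucket_start,
                                            unsigned nr_buckets,
                                            unsigned long long bucket_size,
                                            unsigned long long sector)
    {
            struct loc l = { 0, 0 };

            for (unsigned i = 0; i < nr_buckets; i++)
                    if (sector >= bucket_start[i] &&
                        sector <  bucket_start[i] + bucket_size) {
                            l.bucket = i;
                            l.offset = sector - bucket_start[i];
                            break;
                    }
            return l;
    }

    int main(void)
    {
            unsigned long long buckets[] = { 2048, 4096, 6144 };
            struct loc l = sector_to_journal_loc(buckets, 3, 2048, 4100);

            printf("bucket %u offset %llu\n", l.bucket, l.offset);  /* bucket 1 offset 4 */
            return 0;
    }
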
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 25 +++++++++++++++---------- fs/bcachefs/journal_io.h | 10 +++++++++- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 231f2e4bd1b9..56ba82156c70 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -46,12 +46,12 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct bch_extent_ptr entry_ptr, + struct journal_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { struct journal_replay *i, *pos, *dup = NULL; - struct bch_extent_ptr *ptr; + struct journal_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; @@ -871,9 +871,12 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { - .dev = ca->dev_idx, - .offset = offset, + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); @@ -964,8 +967,8 @@ err: goto out; } -static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) { unsigned i; @@ -973,13 +976,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); u64 offset; - div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); if (i) pr_buf(out, " "); - pr_buf(out, "%u:%llu (offset %llu)", + pr_buf(out, "%u:%u:%u (sector %llu)", j->ptrs[i].dev, - (u64) j->ptrs[i].offset, offset); + j->ptrs[i].bucket, + j->ptrs[i].bucket_offset, + j->ptrs[i].sector); } } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index d8425fe0d67b..f2001835e43e 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -8,7 +8,12 @@ */ struct journal_replay { struct list_head list; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + struct journal_ptr { + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; + } ptrs[BCH_REPLICAS_MAX]; unsigned nr_ptrs; /* checksum error, but we may want to try using it anyways: */ @@ -45,6 +50,9 @@ int bch2_journal_entry_validate(struct bch_fs *, const char *, void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); +void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); + int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); void bch2_journal_write(struct closure *); -- cgit From 10b93677d35281766fe5439dcd7982fce67f82a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 02:39:56 -0500 Subject: bcachefs: Delete some flag bits that are no longer used Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 -- fs/bcachefs/recovery.c | 3 --- fs/bcachefs/super-io.c | 3 --- 3 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d9ba48ce7601..c5d972b16b39 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -507,8 +507,6 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_INITIALIZED, - BCH_FS_ALLOC_READ_DONE, 
BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_RUNNING, BCH_FS_ALLOCATOR_STOPPING, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 939f7565d290..3fef06faf32f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1129,8 +1129,6 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - /* * If we're not running fsck, this ensures bch2_fsck_err() calls are * instead interpreted as bch2_inconsistent_err() calls: @@ -1315,7 +1313,6 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 414dfa59744f..f89e883ff2e2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -461,9 +461,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) - set_bit(BCH_FS_INITIALIZED, &c->flags); - ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) return ret; -- cgit From 06a98c966f9ae5d978b53986eca2a9cd99d2a6f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 02:40:45 -0500 Subject: bcachefs: Change __bch2_trans_commit() to run triggers then get RW This is prep work for the next patch, which is going to change __bch2_trans_commit() to use bch2_journal_key_insert() when very early in the recovery process, so that we have a unified interface for doing btree updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index bde4bb2b7fcc..9d41711c4e9b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -982,6 +982,17 @@ int __bch2_trans_commit(struct btree_trans *trans) if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) + goto out_reset; + } + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->journal_u64s = trans->extra_journal_entry_u64s; @@ -992,17 +1003,6 @@ int __bch2_trans_commit(struct btree_trans *trans) if (trans->journal_transaction_names) trans->journal_u64s += JSET_ENTRY_LOG_U64s; - if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&c->writes))) { - ret = bch2_trans_commit_get_rw_cold(trans); - if (ret) - goto out_reset; - } - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out; - trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); -- cgit From 8ccf4dff09e49b34c6ed2e161720634e8dafb99f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 05:15:53 -0500 Subject: bcachefs: opts.read_journal_only Add an option that tells recovery to only read the journal, to be used by the list_journal command. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/recovery.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 57c829b6eee1..b03cac016f0b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -329,6 +329,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ + x(read_journal_only, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ x(journal_transaction_names, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3fef06faf32f..7def5938e24d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1072,6 +1072,9 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + if (c->opts.read_journal_only) + goto out; + if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); -- cgit From 3117db99f30b26ebf09ecc323cbefcd51d83467b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Feb 2022 05:05:29 -0500 Subject: bcachefs: Don't issue discards when in nochanges mode When the nochanges option is selected, we're supposed to never issue writes. Unfortunately, it seems discards were missed when implementing this, leading to some painful filesystem corruption. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/journal_reclaim.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 700d1e00aaf9..fac040aa0d5a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -801,7 +801,8 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (ca->mi.discard && + if (!c->opts.nochanges && + ca->mi.discard && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), ca->mi.bucket_size, GFP_NOFS); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index d72b17dc935a..84cc952a7ac5 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -286,7 +286,8 @@ void bch2_journal_do_discards(struct journal *j) struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { - if (ca->mi.discard && + if (!c->opts.nochanges && + ca->mi.discard && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, -- cgit From d4b691522c4b60220087a01c276f3fa9781405b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Feb 2022 04:52:44 -0500 Subject: bcachefs: Kill bch_scnmemcpy() bch_scnmemcpy was for printing length-limited strings that might not have a terminating null - turns out sprintf & pr_buf can do this with %.*s.
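
The trick relies on the printf family's precision for %s: with "%.*s" the precision is taken from an int argument and at most that many bytes are printed, so the buffer does not need a terminating NUL. A tiny standalone demonstration (plain printf here; the format semantics pr_buf relies on are the same):

    #include <stdio.h>

    int main(void)
    {
            const char name[5] = { 'u', 's', 'e', 'r', '!' };   /* no terminating NUL */
            int name_len = 4;

            printf("%.*s\n", name_len, name);   /* prints "user" */
            return 0;
    }
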
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 6 +++--- fs/bcachefs/disk_groups.c | 9 ++++----- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/util.c | 13 ------------- fs/bcachefs/util.h | 2 -- fs/bcachefs/xattr.c | 10 +++++----- 6 files changed, 13 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 6f699b736b34..a43a24409d37 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -122,9 +122,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - bch_scnmemcpy(out, d.v->d_name, - bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", + pr_buf(out, "%.*s -> %llu type %s", + bch2_dirent_name_bytes(d), + d.v->d_name, d.v->d_type != DT_SUBVOL ? le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index a27fc4fb60d5..e411606fd38d 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -76,8 +76,9 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, for (g = sorted; g + 1 < sorted + nr_groups; g++) if (!BCH_GROUP_DELETED(g) && !group_cmp(&g[0], &g[1])) { - pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g)); - bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); + pr_buf(err, "duplicate label %llu.%.*s", + BCH_GROUP_PARENT(g), + (int) sizeof(g->label), g->label); goto err; } @@ -376,9 +377,7 @@ void bch2_disk_path_to_text(struct printbuf *out, v = path[--nr]; g = groups->entries + v; - bch_scnmemcpy(out, g->label, - strnlen(g->label, sizeof(g->label))); - + pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) pr_buf(out, "."); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 56ba82156c70..4f0904a515a7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -595,7 +595,7 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); + pr_buf(out, "%.*s", bytes, l->d); } struct jset_entry_ops { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e7675b4597db..971f404a01e3 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -581,19 +581,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -void bch_scnmemcpy(struct printbuf *out, - const char *src, size_t len) -{ - size_t n = printbuf_remaining(out); - - if (n) { - n = min(n - 1, len); - memcpy(out->pos, src, n); - out->pos += n; - *out->pos = '\0'; - } -} - #include "eytzinger.h" static int alignment_ok(const void *base, size_t align) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index e047e7860584..fc8ffa61bbeb 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -281,8 +281,6 @@ static inline void printbuf_newline(struct printbuf *buf) pr_buf(buf, " "); } -void bch_scnmemcpy(struct printbuf *, const char *, size_t); - int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a5122dbb2eb9..9cce3953ee0c 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -111,11 +111,11 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, else pr_buf(out, "(unknown type %u)", xattr.v->x_type); - 
bch_scnmemcpy(out, xattr.v->x_name, - xattr.v->x_name_len); - pr_buf(out, ":"); - bch_scnmemcpy(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + pr_buf(out, "%.*s:%.*s", + xattr.v->x_name_len, + xattr.v->x_name, + le16_to_cpu(xattr.v->x_val_len), + (char *) xattr_val(xattr.v)); } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, -- cgit From 12bf93a429c981cf337ce2c27504ec0171157f76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Feb 2022 05:00:45 -0500 Subject: bcachefs: Add .to_text() methods for all superblock sections This patch improves the superblock .to_text() methods and adds methods for all types that were missing them. It also improves printbufs by allowing them to specify what units we want to be printing in, and adds new wrapper methods for unifying our kernel and userspace environments. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 40 +++- fs/bcachefs/disk_groups.h | 5 +- fs/bcachefs/extents.c | 18 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_seq_blacklist.c | 1 + fs/bcachefs/quota.c | 43 ++++- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/replicas.c | 85 +++++++-- fs/bcachefs/replicas.h | 1 + fs/bcachefs/super-io.c | 364 ++++++++++++++++++++++++++++++++++-- fs/bcachefs/super-io.h | 4 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/util.c | 21 +++ fs/bcachefs/util.h | 64 +++++-- 14 files changed, 588 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index e411606fd38d..e9ee37f1e07d 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -343,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, - struct bch_sb_handle *sb, - unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_get_disk_groups(sb); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; @@ -383,7 +381,7 @@ void bch2_disk_path_to_text(struct printbuf *out, } return; inval: - pr_buf(out, "invalid group %u", v); + pr_buf(out, "invalid label %u", v); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ -447,6 +445,36 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } +void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + pr_buf(out, "none"); + break; + case TARGET_DEV: { + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_member *m = mi->members + t.dev; + + if (bch2_dev_exists(sb, mi, t.dev)) { + pr_buf(out, "Device "); + pr_uuid(out, m->uuid.b); + pr_buf(out, " (%u)", t.dev); + } else { + pr_buf(out, "Bad device %u", t.dev); + } + + break; + } + case TARGET_GROUP: + bch2_disk_path_to_text(out, sb, t.group); + break; + default: + BUG(); + } +} + void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) { struct target t = target_decode(v); @@ -477,7 +505,7 @@ void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) } case TARGET_GROUP: mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, &c->disk_sb, t.group); + bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); mutex_unlock(&c->sb_lock); break; default: diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index
3d84f23c34ed..a274aacbdf92 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -75,8 +75,9 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, - unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + +void bch2_sb_target_to_text(struct printbuf *, struct bch_sb *, u64); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3ed724e1fc98..c78e10e8ec2c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -953,15 +953,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + ptr->cached ? " cached" : ""); + + if (c) { + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (ca && ptr_stale(ca, ptr)) + pr_buf(out, " stale"); + } break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4f0904a515a7..491300e3c48f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -302,7 +302,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs vstruct_for_each(entry, k) { if (!first) { - printbuf_newline(out); + pr_newline(out); pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); } pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 3cc63fc202ab..3140c8731431 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -235,6 +235,7 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, le64_to_cpu(i->start), le64_to_cpu(i->end)); } + pr_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 6fb8224f565e..b7ef8fa7bbc9 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -6,7 +6,18 @@ #include "subvolume.h" #include "super-io.h" -static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, +static const char * const bch2_quota_types[] = { + "user", + "group", + "project", +}; + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); @@ -14,13 +25,36 @@ static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, if (vstruct_bytes(&q->field) < sizeof(*q)) { pr_buf(err, "wrong size (got %llu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); + return -EINVAL; } return 0; } +static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = 
field_to_type(f, quota); + unsigned qtyp, counter; + + for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { + pr_buf(out, "%s: flags %llx", + bch2_quota_types[qtyp], + le64_to_cpu(q->q[qtyp].flags)); + + for (counter = 0; counter < Q_COUNTERS; counter++) + pr_buf(out, " %s timelimit %u warnlimit %u", + bch2_quota_counters[counter], + le32_to_cpu(q->q[qtyp].c[counter].timelimit), + le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); + + pr_newline(out); + } +} + const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_validate_quota, + .validate = bch2_sb_quota_validate, + .to_text = bch2_sb_quota_to_text, }; const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -34,11 +68,6 @@ const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7def5938e24d..d33b9e2bb1e3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -821,7 +821,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) return ERR_PTR(-ENOMEM); } - ret = bch2_sb_clean_validate(c, clean, READ); + ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { mutex_unlock(&c->sb_lock); return ERR_PTR(ret); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index c192e31d5d68..7cc2414893fc 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -36,6 +36,22 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } +void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + pr_buf(out, "%s", bch2_data_types[e->data_type]); + else + pr_buf(out, "(invalid data type %u)", e->data_type); + + pr_buf(out, ": %u [", e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); + pr_buf(out, "]"); +} + void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry *e) { @@ -867,7 +883,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, return 0; } -static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, +static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); @@ -897,14 +913,15 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, bch2_replicas_entry_to_text(out, e); } + pr_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_validate_replicas, + .validate = bch2_sb_replicas_validate, .to_text = bch2_sb_replicas_to_text, }; -static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, +static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); @@ -919,8 +936,27 @@ static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field * return ret; } +static void bch2_sb_replicas_v0_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_entry_v0 *e; + bool first = true; + + for_each_replicas_entry(sb_r, e) { + if (!first) + pr_buf(out, " "); + first = false; + + bch2_replicas_entry_v0_to_text(out, e); + } + pr_newline(out); +} + const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_validate_replicas_v0, + .validate = bch2_sb_replicas_v0_validate, + .to_text = bch2_sb_replicas_v0_to_text, }; /* Query replicas: */ @@ -977,19 +1013,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return ret; } -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { - struct bch_replicas_entry *e; - unsigned i, ret = 0; + struct bch_sb_field_replicas *replicas; + struct bch_sb_field_replicas_v0 *replicas_v0; + unsigned i, data_has = 0; + + replicas = bch2_sb_get_replicas(sb); + replicas_v0 = bch2_sb_get_replicas_v0(sb); + + if (replicas) { + struct bch_replicas_entry *r; + + for_each_replicas_entry(replicas, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } else if (replicas_v0) { + struct bch_replicas_entry_v0 *r; + + for_each_replicas_entry_v0(replicas_v0, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) - for (i = 0; i < e->nr_devs; i++) - if (e->devs[i] == ca->dev_idx) - ret |= 1 << e->data_type; + return data_has; +} - percpu_up_read(&c->mark_lock); +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned ret; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&c->sb_lock); return ret; } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index d237d7c51ccb..87820b2e1ad3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -64,6 +64,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, bool); +unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); 
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f89e883ff2e2..e1ff14eedaea 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -920,7 +920,7 @@ static int u64_cmp(const void *_l, const void *_r) return l < r ? -1 : l > r ? 1 : 0; } -static int bch2_sb_validate_journal(struct bch_sb *sb, +static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { @@ -973,13 +973,26 @@ err: return ret; } +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); + pr_newline(out); +} + static const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_validate_journal, + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, }; /* BCH_SB_FIELD_members: */ -static int bch2_sb_validate_members(struct bch_sb *sb, +static int bch2_sb_members_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { @@ -1029,13 +1042,105 @@ static int bch2_sb_validate_members(struct bch_sb *sb, return 0; } +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + if (!bch2_member_exists(m)) + continue; + + pr_buf(out, "Device: %u", i); + pr_newline(out); + + printbuf_indent_push(out, 2); + + pr_buf(out, "UUID: "); + pr_uuid(out, m->uuid.b); + pr_newline(out); + + pr_buf(out, "Size: "); + pr_units(out, device_size, device_size << 9); + pr_newline(out); + + pr_buf(out, "Bucket size: "); + pr_units(out, bucket_size, bucket_size << 9); + pr_newline(out); + + pr_buf(out, "First bucket: %u", + le16_to_cpu(m->first_bucket)); + pr_newline(out); + + pr_buf(out, "Buckets: %llu", + le64_to_cpu(m->nbuckets)); + pr_newline(out); + + pr_buf(out, "Last mount: "); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + pr_buf(out, "(never)"); + pr_newline(out); + + pr_buf(out, "State: %s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? 
bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + pr_newline(out); + + pr_buf(out, "Group: "); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + pr_buf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + pr_buf(out, "(bad disk labels section)"); + } else { + pr_buf(out, "(none)"); + } + pr_newline(out); + + pr_buf(out, "Data allowed: "); + if (BCH_MEMBER_DATA_ALLOWED(m)) + bch2_flags_to_text(out, bch2_data_types, + BCH_MEMBER_DATA_ALLOWED(m)); + else + pr_buf(out, "(none)"); + pr_newline(out); + + pr_buf(out, "Has data: "); + if (data_have) + bch2_flags_to_text(out, bch2_data_types, data_have); + else + pr_buf(out, "(none)"); + pr_newline(out); + + pr_buf(out, "Discard: %llu", + BCH_MEMBER_DISCARD(m)); + pr_newline(out); + + printbuf_indent_pop(out, 2); + } +} + static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_validate_members, + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, }; /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_validate_crypt(struct bch_sb *sb, +static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { @@ -1055,13 +1160,29 @@ static int bch2_sb_validate_crypt(struct bch_sb *sb, return 0; } +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + pr_newline(out); + pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + pr_newline(out); + pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + pr_newline(out); + pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + pr_newline(out); +} + static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_validate_crypt, + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, }; /* BCH_SB_FIELD_clean: */ -int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { struct jset_entry *entry; int ret; @@ -1251,7 +1372,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) * this should be in the write path, and we should be validating every * superblock section: */ - ret = bch2_sb_clean_validate(c, sb_clean, WRITE); + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); if (ret) { bch_err(c, "error writing marking filesystem clean: validate error"); goto out; @@ -1262,7 +1383,7 @@ out: mutex_unlock(&c->sb_lock); } -static int bch2_sb_validate_clean(struct bch_sb *sb, +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { @@ -1277,8 +1398,32 @@ static int bch2_sb_validate_clean(struct bch_sb *sb, return 0; } +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); + pr_newline(out); + pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + pr_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + pr_newline(out); + } +} + static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - 
.validate = bch2_sb_validate_clean, + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, }; static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { @@ -1302,7 +1447,7 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ret = bch2_sb_field_ops[type]->validate(sb, f, &err); if (ret) { - pr_buf(&err, "\n"); + pr_newline(&err); bch2_sb_field_to_text(&err, sb, f); *orig_err = err; } @@ -1323,7 +1468,202 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "(unknown field %u)", type); pr_buf(out, " (size %llu):", vstruct_bytes(f)); + pr_newline(out); - if (ops && ops->to_text) + if (ops && ops->to_text) { + printbuf_indent_push(out, 2); bch2_sb_field_ops[type]->to_text(out, sb, f); + printbuf_indent_pop(out, 2); + } +} + +void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) +{ + unsigned i; + + pr_buf(out, "Type: %u", l->layout_type); + pr_newline(out); + + pr_buf(out, "Superblock max size: "); + pr_units(out, + 1 << l->sb_max_size_bits, + 512 << l->sb_max_size_bits); + pr_newline(out); + + pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); + pr_newline(out); + + pr_buf(out, "Offsets: "); + for (i = 0; i < l->nr_superblocks; i++) { + if (i) + pr_buf(out, ", "); + pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + } + pr_newline(out); +} + +void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bool print_layout, unsigned fields) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field *f; + u64 fields_have = 0; + unsigned nr_devices = 0; + + mi = bch2_sb_get_members(sb); + if (mi) { + struct bch_member *m; + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) + nr_devices += bch2_member_exists(m); + } + + pr_buf(out, "External UUID: "); + pr_uuid(out, sb->user_uuid.b); + pr_newline(out); + + pr_buf(out, "Internal UUID: "); + pr_uuid(out, sb->uuid.b); + pr_newline(out); + + pr_buf(out, "Device index: %u", sb->dev_idx); + pr_newline(out); + + pr_buf(out, "Label: "); + pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); + pr_newline(out); + + pr_buf(out, "Version: %u", le16_to_cpu(sb->version)); + pr_newline(out); + + pr_buf(out, "Oldest version on disk: %u", le16_to_cpu(sb->version_min)); + pr_newline(out); + + pr_buf(out, "Created: "); + if (sb->time_base_lo) + pr_time(out, le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC); + else + pr_buf(out, "(not set)"); + pr_newline(out); + + pr_buf(out, "Squence number: %llu", le64_to_cpu(sb->seq)); + pr_newline(out); + + pr_buf(out, "Block_size: "); + pr_units(out, le16_to_cpu(sb->block_size), + (u32) le16_to_cpu(sb->block_size) << 9); + pr_newline(out); + + pr_buf(out, "Btree node size: "); + pr_units(out, BCH_SB_BTREE_NODE_SIZE(sb), + BCH_SB_BTREE_NODE_SIZE(sb) << 9); + pr_newline(out); + + pr_buf(out, "Error action: %s", + BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR + ? 
bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] + : "unknown"); + pr_newline(out); + + pr_buf(out, "Clean: %llu", BCH_SB_CLEAN(sb)); + pr_newline(out); + + pr_buf(out, "Features: "); + bch2_flags_to_text(out, bch2_sb_features, + le64_to_cpu(sb->features[0])); + pr_newline(out); + + pr_buf(out, "Compat features: "); + bch2_flags_to_text(out, bch2_sb_compat, + le64_to_cpu(sb->compat[0])); + pr_newline(out); + + pr_buf(out, "Metadata replicas: %llu", BCH_SB_META_REPLICAS_WANT(sb)); + pr_newline(out); + + pr_buf(out, "Data replicas: %llu", BCH_SB_DATA_REPLICAS_WANT(sb)); + pr_newline(out); + + pr_buf(out, "Metadata checksum type: %s (%llu)", + BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR + ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)] + : "unknown", + BCH_SB_META_CSUM_TYPE(sb)); + pr_newline(out); + + pr_buf(out, "Data checksum type: %s (%llu)", + BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR + ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)] + : "unknown", + BCH_SB_DATA_CSUM_TYPE(sb)); + pr_newline(out); + + pr_buf(out, "Compression type: %s (%llu)", + BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR + ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)] + : "unknown", + BCH_SB_COMPRESSION_TYPE(sb)); + pr_newline(out); + + pr_buf(out, "Foreground write target: "); + bch2_sb_target_to_text(out, sb, BCH_SB_FOREGROUND_TARGET(sb)); + pr_newline(out); + + pr_buf(out, "Background write target: "); + bch2_sb_target_to_text(out, sb, BCH_SB_BACKGROUND_TARGET(sb)); + pr_newline(out); + + pr_buf(out, "Promote target: "); + bch2_sb_target_to_text(out, sb, BCH_SB_PROMOTE_TARGET(sb)); + pr_newline(out); + + pr_buf(out, "Metadata target: "); + bch2_sb_target_to_text(out, sb, BCH_SB_METADATA_TARGET(sb)); + pr_newline(out); + + pr_buf(out, "String hash type: %s (%llu)", + BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR + ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] + : "unknown", + BCH_SB_STR_HASH_TYPE(sb)); + pr_newline(out); + + pr_buf(out, "32 bit inodes: %llu", BCH_SB_INODE_32BIT(sb)); + pr_newline(out); + + pr_buf(out, "GC reserve percentage: %llu%%", BCH_SB_GC_RESERVE(sb)); + pr_newline(out); + + pr_buf(out, "Root reserve percentage: %llu%%", BCH_SB_ROOT_RESERVE(sb)); + pr_newline(out); + + pr_buf(out, "Devices: %u live, %u total", + nr_devices, sb->nr_devices); + pr_newline(out); + + pr_buf(out, "Sections: "); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + bch2_flags_to_text(out, bch2_sb_fields, fields_have); + pr_newline(out); + + pr_buf(out, "Superblock size: %llu", vstruct_bytes(sb)); + pr_newline(out); + + if (print_layout) { + pr_newline(out); + pr_buf(out, "layout:"); + pr_newline(out); + printbuf_indent_push(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); + printbuf_indent_pop(out, 2); + } + + vstruct_for_each(sb, f) + if (fields & (1 << le32_to_cpu(f->type))) { + pr_newline(out); + bch2_sb_field_to_text(out, sb, f); + } } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 6170fa0990f1..ccd6fe7fdf29 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -121,12 +121,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); -int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); +void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b727845dd64b..1a3068f658a1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -825,7 +825,7 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(&out, &c->disk_sb, + bch2_disk_path_to_text(&out, c->disk_sb.sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 971f404a01e3..f170cf9d5052 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -120,6 +120,27 @@ void bch2_hprint(struct printbuf *buf, s64 v) pr_buf(buf, "%c", si_units[u]); } +void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) +{ + if (raw < 0) { + pr_buf(out, "-"); + raw = -raw; + bytes = -bytes; + } + + switch (out->units) { + case PRINTBUF_UNITS_RAW: + pr_buf(out, "%llu", raw); + break; + case PRINTBUF_UNITS_BYTES: + pr_buf(out, "%llu", bytes); + break; + case PRINTBUF_UNITS_HUMAN_READABLE: + bch2_hprint(out, bytes); + break; + } +} + void bch2_string_opt_to_text(struct printbuf *out, const char * const list[], size_t selected) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fc8ffa61bbeb..3d5a9e04b3ad 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -235,10 +235,17 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +enum printbuf_units { + PRINTBUF_UNITS_RAW, + PRINTBUF_UNITS_BYTES, + PRINTBUF_UNITS_HUMAN_READABLE, +}; + struct printbuf { - char *pos; - char *end; - unsigned indent; + char *pos; + char *end; + unsigned indent; + enum printbuf_units units; }; static 
inline size_t printbuf_remaining(struct printbuf *buf) @@ -272,7 +279,7 @@ static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) buf->indent -= spaces; } -static inline void printbuf_newline(struct printbuf *buf) +static inline void pr_newline(struct printbuf *buf) { unsigned i; @@ -281,6 +288,46 @@ static inline void printbuf_newline(struct printbuf *buf) pr_buf(buf, " "); } +void bch2_pr_units(struct printbuf *, s64, s64); +#define pr_units(...) bch2_pr_units(__VA_ARGS__) + +#ifdef __KERNEL__ +static inline void pr_time(struct printbuf *out, u64 time) +{ + pr_buf(out, "%llu", time); +} +#else +#include +static inline void pr_time(struct printbuf *out, u64 _time) +{ + char time_str[64]; + time_t time = _time; + struct tm *tm = localtime(&time); + size_t err = strftime(time_str, sizeof(time_str), "%c", tm); + if (!err) + pr_buf(out, "(formatting error)"); + else + pr_buf(out, "%s", time_str); +} +#endif + +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%plU", uuid); +} +#else +#include +#endif + +static inline void pr_uuid(struct printbuf *out, u8 *uuid) +{ + char uuid_str[40]; + + uuid_unparse_lower(uuid, uuid_str); + pr_buf(out, uuid_str); +} + int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); @@ -784,13 +831,4 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } -#ifdef __KERNEL__ -static inline void uuid_unparse_lower(u8 *uuid, char *out) -{ - sprintf(out, "%plU", uuid); -} -#else -#include -#endif - #endif /* _BCACHEFS_UTIL_H */ -- cgit From f61816d0fc6091e14b3f4ffce962dc5084a1b6cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Feb 2022 13:22:11 -0500 Subject: bcachefs: Fix a use after free In move_read_endio, we were checking if the next pending write has its read completed - but this can turn into a use after free (and we were accessing the list without a lock), so it's better to just unconditionally do the wakeup. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4751d79219cb..2eb192da8e1d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -480,9 +480,7 @@ static void move_read_endio(struct bio *bio) atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; - if (next_pending_write(ctxt)) - wake_up(&ctxt->wait); - + wake_up(&ctxt->wait); closure_put(&ctxt->cl); } -- cgit From 702a4ef07774fbc565f3e567073d2f83f9602667 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Feb 2022 04:53:48 -0500 Subject: bcachefs: Add tabstops to printbufs Now, when outputting to printbufs, we can set tabstops and left or right justify text to them - this is to be used by the userspace 'bcachefs fs usage' command.
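A sketch of the intended usage (illustrative only - the column widths and the direct assignment to the tabstops array are this example's assumptions, based on the pr_tab()/pr_tab_rjust() helpers added below):

	char line[128];
	struct printbuf out = PBUF(line);

	out.tabstops[0] = 12;		/* labels are padded out to column 12     */
	out.tabstops[1] = 24;		/* values are right justified to column 24 */

	pr_buf(&out, "Buckets:");
	pr_tab(&out);			/* left justify: pad with spaces to column 12 */
	pr_buf(&out, "%u", 128);
	pr_tab_rjust(&out);		/* right justify: shift "128" so it ends at column 24 */
	pr_newline(&out);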
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 12 ++++---- fs/bcachefs/util.c | 8 +---- fs/bcachefs/util.h | 84 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 84 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e1ff14eedaea..f95c9d754530 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1061,7 +1061,7 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "Device: %u", i); pr_newline(out); - printbuf_indent_push(out, 2); + pr_indent_push(out, 2); pr_buf(out, "UUID: "); pr_uuid(out, m->uuid.b); @@ -1129,7 +1129,7 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, BCH_MEMBER_DISCARD(m)); pr_newline(out); - printbuf_indent_pop(out, 2); + pr_indent_pop(out, 2); } } @@ -1471,9 +1471,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, pr_newline(out); if (ops && ops->to_text) { - printbuf_indent_push(out, 2); + pr_indent_push(out, 2); bch2_sb_field_ops[type]->to_text(out, sb, f); - printbuf_indent_pop(out, 2); + pr_indent_pop(out, 2); } } @@ -1656,9 +1656,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_newline(out); pr_buf(out, "layout:"); pr_newline(out); - printbuf_indent_push(out, 2); + pr_indent_push(out, 2); bch2_sb_layout_to_text(out, &sb->layout); - printbuf_indent_pop(out, 2); + pr_indent_pop(out, 2); } vstruct_for_each(sb, f) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index f170cf9d5052..a330fa30cd79 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -117,17 +117,11 @@ void bch2_hprint(struct printbuf *buf, s64 v) if (u && t && v < 100 && v > -100) pr_buf(buf, ".%i", t / 103); if (u) - pr_buf(buf, "%c", si_units[u]); + pr_char(buf, si_units[u]); } void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) { - if (raw < 0) { - pr_buf(out, "-"); - raw = -raw; - bytes = -bytes; - } - switch (out->units) { case PRINTBUF_UNITS_RAW: pr_buf(out, "%llu", raw); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3d5a9e04b3ad..426c3009f292 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -244,8 +244,12 @@ enum printbuf_units { struct printbuf { char *pos; char *end; + char *last_newline; + char *last_field; unsigned indent; enum printbuf_units units; + unsigned tabstop; + unsigned tabstops[4]; }; static inline size_t printbuf_remaining(struct printbuf *buf) @@ -253,29 +257,49 @@ static inline size_t printbuf_remaining(struct printbuf *buf) return buf->end - buf->pos; } +static inline size_t printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} + #define _PBUF(_buf, _len) \ ((struct printbuf) { \ - .pos = _buf, \ - .end = _buf + _len, \ + .pos = _buf, \ + .end = _buf + _len, \ + .last_newline = _buf, \ + .last_field = _buf, \ }) #define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) + #define pr_buf(_out, ...) 
\ do { \ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ __VA_ARGS__); \ } while (0) -static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) +static inline void pr_char(struct printbuf *out, char c) +{ + if (printbuf_remaining(out) > 1) { + *out->pos = c; + out->pos++; + } +} + +static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) { buf->indent += spaces; while (spaces--) - pr_buf(buf, " "); + pr_char(buf, ' '); } -static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) +static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) { + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + buf->pos = '\0'; + } buf->indent -= spaces; } @@ -283,14 +307,60 @@ static inline void pr_newline(struct printbuf *buf) { unsigned i; - pr_buf(buf, "\n"); + pr_char(buf, '\n'); + + buf->last_newline = buf->pos; + for (i = 0; i < buf->indent; i++) - pr_buf(buf, " "); + pr_char(buf, ' '); + + buf->last_field = buf->pos; + buf->tabstop = 0; +} + +static inline void pr_tab(struct printbuf *buf) +{ + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + while (printbuf_remaining(buf) > 1 && + printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) + pr_char(buf, ' '); + + buf->last_field = buf->pos; + buf->tabstop++; +} + +static inline void pr_tab_rjust(struct printbuf *buf) +{ + ssize_t shift = min_t(ssize_t, buf->tabstops[buf->tabstop] - + printbuf_linelen(buf), + printbuf_remaining(buf)); + ssize_t move = min_t(ssize_t, buf->pos - buf->last_field, + printbuf_remaining(buf) - shift); + + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + if (shift > 0) { + memmove(buf->last_field + shift, + buf->last_field, + move); + memset(buf->last_field, ' ', shift); + buf->pos += shift; + *buf->pos = 0; + } + + buf->last_field = buf->pos; + buf->tabstop++; } void bch2_pr_units(struct printbuf *, s64, s64); #define pr_units(...) bch2_pr_units(__VA_ARGS__) +static inline void pr_sectors(struct printbuf *out, u64 v) +{ + bch2_pr_units(out, v, v << 9); +} + #ifdef __KERNEL__ static inline void pr_time(struct printbuf *out, u64 time) { -- cgit From 8322a9376eb21c47829128684fd900016a0e0169 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Jan 2023 04:34:16 -0500 Subject: bcachefs: Btree key cache optimization This helps with lock contention in the journalling code: instead of updating our journal pin on every write, only get a journal pin if we don't have one. This means we can avoid hammering on journal locks nearly so much, at the cost of carrying around a journal pin for an older entry than the one we actually need. To handle that, if needed we update our journal pin to the correct one when flushed by journal reclaim. 
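In outline (a condensed sketch of the two changes in the diff below, with locking and error handling elided):

	/* commit path: bch2_journal_pin_add() only takes a new pin if we don't
	 * already hold one, so we stop re-pinning on every update; just record
	 * which journal entry the key's latest write belongs to */
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);
	ck->seq = trans->journal_res.seq;

	/* journal reclaim flush: the pin we hold may be for an older entry than
	 * the key's last write - if so, move the pin forward instead of flushing */
	if (ck->seq != seq)
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);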
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 12 ++++++++++-- fs/bcachefs/btree_types.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1841760237ec..70f31b5379e7 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -487,6 +487,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); goto unlock; } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } six_unlock_read(&ck->c.lock); ret = bch2_trans_do(c, NULL, NULL, 0, @@ -547,8 +554,9 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } - bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); + ck->seq = trans->journal_res.seq; if (kick_reclaim) journal_reclaim_kick(&c->journal); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 89c0d2272d91..d5c2a776ee1b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -332,6 +332,7 @@ struct bkey_cached { struct journal_preres res; struct journal_entry_pin journal; + u64 seq; struct bkey_i *k; }; -- cgit From 5838c1702b7d99741273888644a8cd4423b8a440 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Feb 2022 07:00:34 -0500 Subject: bcachefs: Drop journal_write_compact() Long ago it was possible to get a journal reservation and not use it, but that's no longer allowed, which means journal_write_compact() has very little work to do, and isn't really worth the code anymore. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 491300e3c48f..43a60f5c23b3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1298,49 +1298,6 @@ done: return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; } -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. - * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? 
vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { /* we aren't holding j->lock: */ @@ -1578,8 +1535,6 @@ void bch2_journal_write(struct closure *cl) le32_add_cpu(&jset->u64s, u64s); BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); - journal_write_compact(jset); - jset->magic = cpu_to_le64(jset_magic(c)); jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) -- cgit From 6e44568cc311b39613ed292c9dc1dd8cbec86db7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Feb 2022 17:16:45 -0500 Subject: bcachefs: Set BTREE_NODE_SEQ() correctly in merge path BTREE_NODE_SEQ() is supposed to give us a time ordering of btree nodes on disk, so that we can tell which btree node is newer if we ever have to scan the entire device to find btree nodes. The btree node merge path wasn't setting it correctly on the new node - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 07bece908691..644ac4e5d1d1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1697,6 +1697,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, n = bch2_btree_node_alloc(as, b->c.level); bch2_btree_update_add_new_node(as, n); + SET_BTREE_NODE_SEQ(n->data, + max(BTREE_NODE_SEQ(b->data), + BTREE_NODE_SEQ(m->data)) + 1); + btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); n->data->format = new_f; -- cgit From e201f70b116513cb0d17ba32e1f00c234dee9d7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Feb 2022 10:26:10 -0500 Subject: bcachefs: Fix for journal getting stuck The journal can get stuck if we need to get a journal reservation for something we have a pre-reservation for, but aren't able to reclaim space, or if the pin fifo is full - it's impractical to resize the pin fifo at runtime. Previously, we reserved 8 entries in the pin fifo for pre-reservations, but that seems small - we're seeing the journal occasionally get stuck. Let's reserve a quarter of it. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3c7dce3b31c1..1f26d351697a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -416,7 +416,7 @@ unlock: (flags & JOURNAL_RES_GET_RESERVED)) { char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - bch_err(c, "Journal stuck!"); + bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full"); if (journal_debug_buf) { bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); bch_err(c, "%s", journal_debug_buf); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 17f9037b404a..9d4c6d86d5c1 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -372,7 +372,7 @@ static inline bool journal_check_may_get_unreserved(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); bool ret = s.reserved < s.remaining && - fifo_free(&j->pin) > 8; + fifo_free(&j->pin) > j->pin.size / 4; lockdep_assert_held(&j->lock); -- cgit From b66b2bc0f64a57c042ea1fa51dbd5904557bf67f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Feb 2022 06:56:35 -0500 Subject: bcachefs: Revert "Ensure journal doesn't get stuck in nochanges mode" This patch was originally to work around the journal getting stuck in nochanges mode - but that was just a hack, we needed to fix the actual bug. It should be fixed now, so revert it. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 1 - fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 6 ++---- fs/bcachefs/journal_types.h | 1 - fs/bcachefs/super.c | 3 --- 5 files changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 9d4c6d86d5c1..5d263a5b8685 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -431,7 +431,6 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, ret = 0; if ((flags & JOURNAL_RES_GET_RESERVED) || - test_bit(JOURNAL_NOCHANGES, &j->flags) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 43a60f5c23b3..b8fcc801a666 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1616,7 +1616,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (test_bit(JOURNAL_NOCHANGES, &j->flags)) + if (c->opts.nochanges) goto no_io; for_each_rw_member(ca, c, i) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 84cc952a7ac5..c15b18831512 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -34,10 +34,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) - ?
((journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr) - : ja->nr; + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; /* * Don't use the last bucket unless writing the new last_seq diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 73e7fbc4f109..9facd3f128bb 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,7 +151,6 @@ enum { JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NOCHANGES, }; /* Embedded in struct bch_fs */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6a32b9a5dc0e..d9b69c4244d5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -835,9 +835,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - if (c->opts.nochanges) - set_bit(JOURNAL_NOCHANGES, &c->journal.flags); - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -- cgit From 78c8fe20be12d0e4b6427d9149fd1eb9a69e2290 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 02:48:27 -0500 Subject: bcachefs: Normal update/commit path now works before going RW This improves __bch2_trans_commit - early in the recovery process, when we're running btree_gc and before we want to go RW, it now uses bch2_journal_key_insert() to add the update to the list of updates for journal replay to do, instead of btree_gc having to use separate interfaces depending on whether we're running at bringup or, later, runtime. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_gc.c | 71 ++++++++++++++++------------------------- fs/bcachefs/btree_update_leaf.c | 26 +++++++++++++++ fs/bcachefs/buckets.c | 8 +++-- fs/bcachefs/recovery.c | 2 ++ 5 files changed, 61 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c5d972b16b39..378061712c76 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -515,6 +515,7 @@ enum { BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, + BCH_FS_MAY_GO_RW, BCH_FS_RW, BCH_FS_WAS_RW, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ba22c36e30a1..fbd54ac790ba 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -753,7 +753,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } - ret = bch2_mark_key(trans, old, *k, flags); + ret = __bch2_trans_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, old, *k, flags)); fsck_err: err: if (ret) @@ -1259,7 +1260,7 @@ static int bch2_gc_start(struct bch_fs *c, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - bool initial, bool metadata_only) + bool metadata_only) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); @@ -1327,14 +1328,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = initial - ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) - : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; @@ -1356,7 +1355,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW, bch2_alloc_write_key(&trans, &iter, - initial, metadata_only)); + metadata_only)); if (ret) break; } @@ -1373,7 +1372,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only return ret; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -1397,7 +1396,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_onl return bch2_alloc_read(c, true, metadata_only); } -static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) +static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -1418,8 +1417,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_on }; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; @@ -1466,23 +1464,13 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bkey_reassemble(new, k); - if (!r->refcount) { + if (!r->refcount) new->k.type = KEY_TYPE_deleted; - /* - * XXX ugly: bch2_journal_key_insert() queues up - * the key for the journal replay code, which - * doesn't run the extent overwrite pass - */ - if (initial) - new->k.size = 0; - } else { + else *bkey_refcount(new) = cpu_to_le64(r->refcount); - } - ret = initial - ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) - : __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); if (ret) @@ -1496,7 +1484,7 @@ fsck_err: return ret; } -static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, +static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; @@ -1535,8 +1523,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) { struct genradix_iter iter; struct reflink_gc *r; @@ -1545,8 +1532,7 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, r->refcount = 0; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; @@ -1594,10 +1580,8 @@ inconsistent: for (i = 0; i < new->v.nr_blocks; i++) stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - ret = initial - ? 
bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) - : __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); kfree(new); } } @@ -1608,8 +1592,7 @@ fsck_err: return ret; } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) { genradix_free(&c->gc_stripes); } @@ -1649,8 +1632,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_gc_start(c, metadata_only) ?: - bch2_gc_alloc_start(c, initial, metadata_only) ?: - bch2_gc_reflink_start(c, initial, metadata_only); + bch2_gc_alloc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, metadata_only); if (ret) goto out; again: @@ -1705,9 +1688,9 @@ again: clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - bch2_gc_stripes_reset(c, initial, metadata_only); - bch2_gc_alloc_reset(c, initial, metadata_only); - bch2_gc_reflink_reset(c, initial, metadata_only); + bch2_gc_stripes_reset(c, metadata_only); + bch2_gc_alloc_reset(c, metadata_only); + bch2_gc_reflink_reset(c, metadata_only); /* flush fsck errors, reset counters */ bch2_flush_fsck_errs(c); @@ -1717,9 +1700,9 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: - bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_alloc_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only) ?: + bch2_gc_alloc_done(c, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9d41711c4e9b..fcc56235ae90 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -968,6 +968,27 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return 0; } +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. 
+ */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -986,6 +1007,11 @@ int __bch2_trans_commit(struct btree_trans *trans) if (ret) goto out_reset; + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && unlikely(!percpu_ref_tryget(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index edc1918cf140..1c1266fb80df 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1164,9 +1164,11 @@ static int bch2_mark_reservation(struct btree_trans *trans, return 0; } -static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, u64 *idx, unsigned flags, size_t r_idx) { + struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; s64 ret = 0; @@ -1199,7 +1201,7 @@ not_found: new.k.type = KEY_TYPE_error; new.k.p = p.k->p; new.k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); } fsck_err: return ret; @@ -1238,7 +1240,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, } while (idx < end && !ret) - ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); + ret = __bch2_mark_reflink_p(trans, p, &idx, flags, l++); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d33b9e2bb1e3..ae9ae1c7138c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1157,6 +1157,7 @@ use_clean: clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); /* * Skip past versions that might have possibly been used (as nonces), @@ -1317,6 +1318,7 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) -- cgit From b0551285e11edbf86bebe6df0396adf84e032f5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 03:06:28 -0500 Subject: bcachefs: Improve reflink repair code When a reflink pointer points to a missing indirect extent, we replace it with an error key. Instead of replacing the entire reflink pointer with an error key, this patch replaces only the missing range with an error key. 
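A worked example with illustrative numbers: if a reflink pointer has idx = 200 and size = 64, it covers indirect extent space [200, 264). If only the range [216, 232) turns out to be missing, the error key is placed at bkey_start_pos(p.k) plus (216 - 200) = 16 and resized to 232 - 216 = 16 sectors, so just that window of the pointer is replaced and the remaining 48 sectors keep their data.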
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1c1266fb80df..f4403011f626 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1166,18 +1166,22 @@ static int bch2_mark_reservation(struct btree_trans *trans, static s64 __bch2_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + u64 start, u64 end, u64 *idx, unsigned flags, size_t r_idx) { struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; s64 ret = 0; + char buf[200]; if (r_idx >= c->reflink_gc_nr) goto not_found; r = genradix_ptr(&c->reflink_gc_table, r_idx); - if (*idx < r->offset - r->size) + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) goto not_found; BUG_ON((s64) r->refcount + add < 0); @@ -1186,23 +1190,22 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - *idx = U64_MAX; - ret = -EIO; - - /* - * XXX: we're replacing the entire reflink pointer with an error - * key, we should just be replacing the part that was missing: - */ - if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + if (fsck_err(c, "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c), buf), + *idx, next_idx)) { struct bkey_i_error new; bkey_init(&new.k); new.k.type = KEY_TYPE_error; - new.k.p = p.k->p; - new.k.size = p.k->size; + new.k.p = bkey_start_pos(p.k); + new.k.p.offset += *idx - start; + bch2_key_resize(&new.k, next_idx - *idx); ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); } + + *idx = next_idx; fsck_err: return ret; } @@ -1216,7 +1219,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = le64_to_cpu(p.v->idx), start = idx; u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; @@ -1240,7 +1243,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, } while (idx < end && !ret) - ret = __bch2_mark_reflink_p(trans, p, &idx, flags, l++); + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); return ret; } -- cgit From a69e7e6a8732ab336548359020fe865150ae8a5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Feb 2022 03:56:44 -0500 Subject: bcachefs: Use unlikely() in err_on() macros Should be obviously a good thing. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 986938298adc..4ab3cfe1292c 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent_on(cond, c, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_inconsistent(c, __VA_ARGS__); \ @@ -59,7 +59,7 @@ do { \ #define bch2_dev_inconsistent_on(cond, ca, ...) 
\ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_dev_inconsistent(ca, __VA_ARGS__); \ @@ -129,7 +129,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, ...) \ - ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) #define need_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) @@ -164,7 +164,7 @@ do { \ #define bch2_fs_fatal_err_on(cond, c, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_fatal_error(c, __VA_ARGS__); \ -- cgit From 78a8f36280e178df4e78382c82a20e3af1704e65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Feb 2022 10:32:43 -0500 Subject: bcachefs: Improve some btree node read error messages On btree node read error, it's helpful to see what we were trying to read - was it all zeroes? Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a0446df0d3da..c65c640753b6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -880,11 +880,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad magic"); + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); btree_err_on(!b->data->keys.seq, BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad btree header"); + "bad btree header: seq 0"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = -- cgit From 82697a10dd4b9a6f7c6f98a525778d032db2f2fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Feb 2022 11:46:34 -0500 Subject: bcachefs: Fix 32 bit build vstruct_bytes() was returning a u64 - it should be a size_t, the corect type for the size of anything that fits in memory. Also replace a 64 bit divide with div_u64(). 
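As a minimal sketch of the 32-bit-safe division pattern (ns_to_seconds is an illustrative helper name; div_u64() and NSEC_PER_SEC are the usual linux/math64.h and linux/time64.h definitions):

#include <linux/math64.h>
#include <linux/time64.h>
#include <linux/types.h>

static inline u64 ns_to_seconds(u64 ns)
{
	/*
	 * A plain "ns / NSEC_PER_SEC" is a 64 bit divide; on 32 bit
	 * kernels the compiler can emit a call to a libgcc helper
	 * (__udivdi3 and friends) that the kernel does not provide, so
	 * the build breaks at link time.  div_u64() divides a u64 by a
	 * 32 bit divisor using helpers available on every architecture.
	 */
	return div_u64(ns, NSEC_PER_SEC);
}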
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/super-io.c | 10 +++++----- fs/bcachefs/vstructs.h | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b8fcc801a666..302af332b632 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1105,7 +1105,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, struct journal_replay *p = list_prev_entry(i, list); bch2_journal_ptrs_to_text(&out, c, p); - pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + pr_buf(&out, " size %zu", vstruct_sectors(&p->j, c->block_bits)); } else sprintf(buf1, "(none)"); bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index b7ef8fa7bbc9..ca029a00e7b8 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -23,7 +23,7 @@ static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, struct bch_sb_field_quota *q = field_to_type(f, quota); if (vstruct_bytes(&q->field) < sizeof(*q)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + pr_buf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); return -EINVAL; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f95c9d754530..47eeb48c8c60 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1147,7 +1147,7 @@ static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + pr_buf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), sizeof(*crypt)); return -EINVAL; } @@ -1390,7 +1390,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - pr_buf(err, "wrong size (got %llu should be %zu)", + pr_buf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&clean->field), sizeof(*clean)); return -EINVAL; } @@ -1467,7 +1467,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, else pr_buf(out, "(unknown field %u)", type); - pr_buf(out, " (size %llu):", vstruct_bytes(f)); + pr_buf(out, " (size %zu):", vstruct_bytes(f)); pr_newline(out); if (ops && ops->to_text) { @@ -1543,7 +1543,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "Created: "); if (sb->time_base_lo) - pr_time(out, le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC); + pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else pr_buf(out, "(not set)"); pr_newline(out); @@ -1649,7 +1649,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bch2_flags_to_text(out, bch2_sb_fields, fields_have); pr_newline(out); - pr_buf(out, "Superblock size: %llu", vstruct_bytes(sb)); + pr_buf(out, "Superblock size: %zu", vstruct_bytes(sb)); pr_newline(out); if (print_layout) { diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index c099cdc0605f..53a694d71967 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -20,7 +20,7 @@ ({ \ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ \ - (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ + (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ }) #define vstruct_bytes(_s) \ -- cgit From ae94c78fb1d5acc8315b7d17583ddb92df29bd3a Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Fri, 10 Dec 2021 17:04:26 -0500 Subject: bcachefs: bch2_trans_mark_key() now takes a bkey_i * We're now coming up with triggers that modify the update being done. A bkey_s_c is const - bkey_i is the correct type to be using here. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 20 +++-------- fs/bcachefs/btree_update_leaf.c | 11 ++---- fs/bcachefs/buckets.c | 72 +++++++++++++++++++------------------ fs/bcachefs/buckets.h | 27 +++++++++++++- 4 files changed, 71 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 644ac4e5d1d1..255753b2dc0e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -510,19 +510,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(k), - BTREE_TRIGGER_INSERT); + ret = bch2_trans_mark_new(trans, k, 0); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(k), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); if (ret) return ret; } @@ -1891,17 +1885,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(new_key), - BTREE_TRIGGER_INSERT); + ret = bch2_trans_mark_new(trans, new_key, 0); if (ret) return ret; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(&b->key), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fcc56235ae90..5ed0b0296ad4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -860,8 +860,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, bool overwrite) { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; struct bkey unpacked; int ret = 0; @@ -885,19 +883,16 @@ static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry } old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; if (overwrite) { - ret = bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|i->flags); + ret = bch2_trans_mark_old(trans, old, i->flags); } else if (old.k->type == i->k->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + ret = bch2_trans_mark_key(trans, old, i->k, BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|i->flags); + ret = bch2_trans_mark_new(trans, i->k, i->flags); } if (ret == -EINTR) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f4403011f626..ed1632c75e56 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1662,65 +1662,67 @@ err: } static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, + struct bkey_s_c old, struct bkey_i *new, unsigned flags) { - struct bkey_s_c_stripe old_s = { .k = NULL 
}; - struct bkey_s_c_stripe new_s = { .k = NULL }; + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; struct bch_replicas_padded r; unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old); - if (new.k->type == KEY_TYPE_stripe) - new_s = bkey_s_c_to_stripe(new); + old_s = bkey_s_c_to_stripe(old).v; + if (new->k.type == KEY_TYPE_stripe) + new_s = &bkey_i_to_stripe(new)->v; /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s.k && old_s.k && - new_s.v->nr_blocks == old_s.v->nr_blocks && - new_s.v->nr_redundant == old_s.v->nr_redundant && - !memcmp(old_s.v->ptrs, new_s.v->ptrs, - new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s.k && old_s.k && - (new_s.v->nr_blocks != old_s.v->nr_blocks || - new_s.v->nr_redundant != old_s.v->nr_redundant)); + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); - nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks; + nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - if (new_s.k) { - s64 sectors = le16_to_cpu(new_s.v->sectors); + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); - bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); + update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); } - if (old_s.k) { - s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); } for (i = 0; i < nr_blocks; i++) { - if (new_s.k && old_s.k && - !memcmp(&new_s.v->ptrs[i], - &old_s.v->ptrs[i], - sizeof(new_s.v->ptrs[i]))) + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) continue; - if (new_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); + if (new_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_i_to_s_c_stripe(new), i, false); if (ret) break; } - if (old_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (old_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); if (ret) break; } @@ -1731,10 +1733,10 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, static int bch2_trans_mark_inode(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_s_c new, + struct bkey_i *new, unsigned flags) { - int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); if (nr) { struct replicas_delta_list *d = @@ -1869,9 +1871,11 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, } int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_s_c new, unsigned flags) + struct bkey_i *new, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? 
old + : bkey_i_to_s_c(new); switch (k.k->type) { case KEY_TYPE_btree_ptr: diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8a3cea6f94df..a04d15154304 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -235,7 +235,32 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); + struct bkey_i *, unsigned); + +static inline int bch2_trans_mark_old(struct btree_trans *trans, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, -- cgit From 3598c56eb93b9774d3aa06b3e3c0eab0bbbc26f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 11:30:17 -0500 Subject: bcachefs: Consolidate trigger code a bit Upcoming patches are doing more work on the triggers code, this patch just moves code around. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 259 +++++++++++++++++++++++----------------- fs/bcachefs/buckets.c | 33 ----- fs/bcachefs/buckets.h | 3 - 3 files changed, 148 insertions(+), 147 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5ed0b0296ad4..78f538327b2a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -406,7 +406,151 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static noinline int bch2_trans_mark_gc(struct btree_trans *trans) +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; + struct bkey_i *new = i->k; + int ret; + + _deleted.p = i->path->pos; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc(i->path->btree_id)) + return 0; + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + bool overwrite) +{ + struct bkey_s_c old; + struct bkey unpacked; + int ret = 0; + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!overwrite) { + if (i->insert_trigger_run) + return 0; + + BUG_ON(i->overwrite_trigger_run); + i->insert_trigger_run = true; + } else { + if (i->overwrite_trigger_run) + return 0; + + BUG_ON(!i->insert_trigger_run); + 
i->overwrite_trigger_run = true; + } + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + + if (overwrite) { + ret = bch2_trans_mark_old(trans, old, i->flags); + } else if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, i->k, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_new(trans, i->k, i->flags); + } + + if (ret == -EINTR) + trace_trans_restart_mark(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + return ret ?: 1; +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 0; overwrite < 2; overwrite++) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; @@ -420,8 +564,7 @@ static noinline int bch2_trans_mark_gc(struct btree_trans *trans) BUG_ON(i->cached || i->level); if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = bch2_mark_update(trans, i->path, i->k, - i->flags|BTREE_TRIGGER_GC); + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) break; } @@ -527,13 +670,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = bch2_mark_update(trans, i->path, i->k, i->flags); + ret = run_one_mem_trigger(trans, i, i->flags); if (ret) return ret; } if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_mark_gc(trans); + ret = bch2_trans_commit_run_gc_triggers(trans); if (ret) return ret; } @@ -857,112 +1000,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) -{ - struct bkey_s_c old; - struct bkey unpacked; - int ret = 0; - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - - if (!overwrite) { - if 
(i->insert_trigger_run) - return 0; - - BUG_ON(i->overwrite_trigger_run); - i->insert_trigger_run = true; - } else { - if (i->overwrite_trigger_run) - return 0; - - BUG_ON(!i->insert_trigger_run); - i->overwrite_trigger_run = true; - } - - old = bch2_btree_path_peek_slot(i->path, &unpacked); - - if (overwrite) { - ret = bch2_trans_mark_old(trans, old, i->flags); - } else if (old.k->type == i->k->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, i->k, - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); - } else { - ret = bch2_trans_mark_new(trans, i->k, i->flags); - } - - if (ret == -EINTR) - trace_trans_restart_mark(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - return ret ?: 1; -} - -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) -{ - struct btree_insert_entry *i; - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 0; overwrite < 2; overwrite++) { - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - ret = run_one_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - unsigned btree_id = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); - if (ret) - return ret; - } - - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. 
we only add the new key to the list of keys for journal replay to diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ed1632c75e56..136a5727ea20 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1282,39 +1282,6 @@ int bch2_mark_key(struct btree_trans *trans, } } -int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *new, unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret; - - _deleted.p = path->pos; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(path->btree_id)) - return 0; - - old = bch2_btree_path_peek_slot(path, &unpacked); - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - static noinline __cold void fs_usage_apply_warn(struct btree_trans *trans, unsigned disk_res_sectors, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a04d15154304..ca34d5d3b961 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -231,9 +231,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_path *, - struct bkey_i *, unsigned); - int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -- cgit From 96d3a0afe04af0c50243e08f9a889c889f9cd131 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 11:02:58 -0500 Subject: bcachefs: Trigger code uses stashed copy of old key Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 78f538327b2a..ed5be81c3254 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -412,28 +412,26 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct btree_insert_entry *i, unsigned flags) { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; + struct bkey_s_c old = { &i->old_k, i->old_v }; struct bkey_i *new = i->k; int ret; - _deleted.p = i->path->pos; - if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(i->path->btree_id)) + if (!btree_node_type_needs_gc(i->btree_id)) return 0; - old = bch2_btree_path_peek_slot(i->path, &unpacked); - if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: bch2_mark_key(trans, old, deleted, @@ -443,12 +441,16 @@ static int run_one_mem_trigger(struct btree_trans *trans, return ret; } -static int 
run_one_trans_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) { - struct bkey_s_c old; - struct bkey unpacked; + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; int ret = 0; if ((i->flags & BTREE_TRIGGER_NORUN) || @@ -469,8 +471,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, i->overwrite_trigger_run = true; } - old = bch2_btree_path_peek_slot(i->path, &unpacked); - if (overwrite) { ret = bch2_trans_mark_old(trans, old, i->flags); } else if (old.k->type == i->k->k.type && -- cgit From 0c10cf852551edca80a2b711de1c84fd001ffd02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jan 2022 01:20:41 -0500 Subject: bcachefs: Run alloc triggers last Triggers can generate additional btree updates - we need to run alloc triggers after all other triggers have run, because they generate updates for the alloc btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ed5be81c3254..e9e10df8ee95 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -507,6 +507,9 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, for (i = btree_id_start; i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; i++) { + if (i->btree_id != btree_id) + continue; + ret = run_one_trans_trigger(trans, i, overwrite); if (ret < 0) return ret; @@ -533,6 +536,9 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) * they are re-added. */ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + while (btree_id_start < trans->updates + trans->nr_updates && btree_id_start->btree_id < btree_id) btree_id_start++; @@ -542,6 +548,17 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return ret; } + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -- cgit From 25a7723182ee62a8e74b204acbd117e4d6c12341 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 13:27:31 -0500 Subject: bcachefs: Always clear should_be_locked in bch2_trans_begin() bch2_trans_begin() invalidates all iterators, until they're revalidated by calling peek() or traverse(). 
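A rough sketch of the caller-side pattern this implies (frob_one_key is a placeholder and the actual update is omitted; only the restart loop matters here): after a transaction restart, nothing may be assumed locked until the iterator is revalidated.

static int frob_one_key(struct btree_trans *trans, struct btree_iter *iter)
{
	struct bkey_s_c k;
	int ret;
retry:
	/* invalidates every path; should_be_locked is now cleared too */
	bch2_trans_begin(trans);

	/* peek() revalidates and relocks the path before we rely on it */
	k = bch2_btree_iter_peek(iter);
	ret = bkey_err(k) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
	if (ret == -EINTR)
		goto retry;
	return ret;
}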
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1015e89d2d68..8aacaa05fc14 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2988,6 +2988,8 @@ void bch2_trans_begin(struct btree_trans *trans) } trans_for_each_path(trans, path) { + path->should_be_locked = false; + /* * XXX: we probably shouldn't be doing this if the transaction * was restarted, but currently we still overflow transaction @@ -2996,7 +2998,7 @@ void bch2_trans_begin(struct btree_trans *trans) if (!path->ref && !path->preserve) __bch2_path_free(trans, path); else - path->preserve = path->should_be_locked = false; + path->preserve = false; } bch2_trans_cond_resched(trans); -- cgit From eac91bf27f088ecb0676873ff298db2dcd5ff9fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 18:19:32 -0500 Subject: bcachefs: Fix bch2_journal_pins_to_text() When key cache pins were put onto their own list, we neglected to update bch2_journal_pins_to_text() to print them. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1f26d351697a..279e960f2307 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1290,6 +1290,10 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "%llu: count %u\n", i, atomic_read(&pin_list->count)); + list_for_each_entry(pin, &pin_list->key_cache_list, list) + pr_buf(out, "\t%px %ps\n", + pin, pin->flush); + list_for_each_entry(pin, &pin_list->list, list) pr_buf(out, "\t%px %ps\n", pin, pin->flush); -- cgit From eb7bd15fe4e7a4a6cf05d9086722aad49f80c259 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Feb 2022 19:04:11 -0500 Subject: bcachefs: Improve debug assertion We're hitting a strange bug with transaction paths not being sorted correctly - this dumps transaction paths in the order we thought was sorted, which will hopefully shed some light as to what's going on. 
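The general shape of the new check, as a standalone sketch (struct item, item_cmp() and dump_items() are stand-ins rather than bcachefs names): walk the supposedly sorted array and, on the first out-of-order pair, dump the whole set before panicking, so the report carries the full state instead of just the two offending entries.

#include <linux/kernel.h>

struct item {
	int key;
};

static int item_cmp(const struct item *l, const struct item *r)
{
	return l->key - r->key;
}

static void dump_items(const struct item *v, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)
		pr_err("item %zu: key %d\n", i, v[i].key);
}

static void verify_sorted(const struct item *v, size_t nr)
{
	size_t i;

	for (i = 1; i < nr; i++)
		if (item_cmp(&v[i - 1], &v[i]) > 0) {
			/*
			 * Dump everything first, then panic: a bare BUG_ON()
			 * only says that the order was wrong, not what the
			 * surrounding entries looked like.
			 */
			dump_items(v, nr);
			panic("items out of order at index %zu!\n", i);
		}
}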
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8aacaa05fc14..b58219292f34 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1410,7 +1410,7 @@ static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_path *path, *prev = NULL; + struct btree_path *path, *prev; unsigned long trace_ip = _RET_IP_; int i, ret = 0; @@ -1419,6 +1419,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) trans->in_traverse_all = true; retry_all: + prev = NULL; trans->restarted = false; trans_for_each_path(trans, path) @@ -1852,6 +1853,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, int i; BUG_ON(trans->restarted); + btree_trans_sort_paths(trans); btree_trans_sort_paths(trans); @@ -2722,7 +2724,10 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) unsigned i; trans_for_each_path_inorder(trans, path, i) { - BUG_ON(prev && btree_path_cmp(prev, path) > 0); + if (prev && btree_path_cmp(prev, path) > 0) { + bch2_dump_trans_paths_updates(trans); + panic("trans paths out of order!\n"); + } prev = path; } } -- cgit From 2be7b16eee9442f2c45ebde19bd3b50fcd030515 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 13:17:48 -0500 Subject: bcachefs: Convert bch2_pd_controller_print_debug() to a printbuf Fewer random on-stack char arrays. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 68 ++++++++++++++++++++++++++++++------------------------ fs/bcachefs/util.h | 8 ++++--- 2 files changed, 43 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a330fa30cd79..2296658b9f0d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -484,36 +484,44 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd) pd->backpressure = 1; } -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) -{ - /* 2^64 - 1 is 20 digits, plus null byte */ - char rate[21]; - char actual[21]; - char target[21]; - char proportional[21]; - char derivative[21]; - char change[21]; - s64 next_io; - - bch2_hprint(&PBUF(rate), pd->rate.rate); - bch2_hprint(&PBUF(actual), pd->last_actual); - bch2_hprint(&PBUF(target), pd->last_target); - bch2_hprint(&PBUF(proportional), pd->last_proportional); - bch2_hprint(&PBUF(derivative), pd->last_derivative); - bch2_hprint(&PBUF(change), pd->last_change); - - next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); - - return sprintf(buf, - "rate:\t\t%s/sec\n" - "target:\t\t%s\n" - "actual:\t\t%s\n" - "proportional:\t%s\n" - "derivative:\t%s\n" - "change:\t\t%s/sec\n" - "next io:\t%llims\n", - rate, target, actual, proportional, - derivative, change, next_io); +void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) +{ + out->tabstops[0] = 20; + + pr_buf(out, "rate:"); + pr_tab(out); + bch2_hprint(out, pd->rate.rate); + pr_newline(out); + + pr_buf(out, "target:"); + pr_tab(out); + bch2_hprint(out, pd->last_target); + pr_newline(out); + + pr_buf(out, "actual:"); + pr_tab(out); + bch2_hprint(out, pd->last_actual); + pr_newline(out); + + pr_buf(out, "proportional:"); + pr_tab(out); + bch2_hprint(out, pd->last_proportional); + pr_newline(out); + + pr_buf(out, "derivative:"); + pr_tab(out); + bch2_hprint(out, 
pd->last_derivative); + pr_newline(out); + + pr_buf(out, "change:"); + pr_tab(out); + bch2_hprint(out, pd->last_change); + pr_newline(out); + + pr_buf(out, "next io:"); + pr_tab(out); + pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + pr_newline(out); } /* misc: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 426c3009f292..58427edcfaa4 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -582,7 +582,7 @@ struct bch_pd_controller { void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); void bch2_pd_controller_init(struct bch_pd_controller *); -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); +void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); #define sysfs_pd_controller_attribute(name) \ rw_attribute(name##_rate); \ @@ -605,8 +605,10 @@ do { \ sysfs_print(name##_rate_d_term, (var)->d_term); \ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ \ - if (attr == &sysfs_##name##_rate_debug) \ - return bch2_pd_controller_print_debug(var, buf); \ + if (attr == &sysfs_##name##_rate_debug) { \ + bch2_pd_controller_debug_to_text(&out, var); \ + return out.pos - buf; \ + } \ } while (0) #define sysfs_pd_controller_store(name, var) \ -- cgit From fa8e94faeece12c20b541f647059f29867e98bc0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 13:18:19 -0500 Subject: bcachefs: Heap allocate printbufs This patch changes printbufs dynamically allocate and reallocate a buffer as needed. Stack usage has become a bit of a problem, and a major cause of that has been static size string buffers on the stack. The most involved part of this refactoring is that printbufs must now be exited with printbuf_exit(). Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 9 +- fs/bcachefs/bset.c | 51 +++++----- fs/bcachefs/btree_cache.c | 18 ++-- fs/bcachefs/btree_gc.c | 181 +++++++++++++++++++++------------ fs/bcachefs/btree_io.c | 103 ++++++++++--------- fs/bcachefs/btree_iter.c | 76 ++++++++------ fs/bcachefs/btree_update_interior.c | 31 +++--- fs/bcachefs/btree_update_leaf.c | 7 +- fs/bcachefs/buckets.c | 119 +++++++++++++--------- fs/bcachefs/clock.c | 2 + fs/bcachefs/debug.c | 42 ++++---- fs/bcachefs/ec.c | 7 +- fs/bcachefs/fs.c | 13 ++- fs/bcachefs/fsck.c | 178 ++++++++++++++++++++------------- fs/bcachefs/io.c | 14 +-- fs/bcachefs/journal.c | 24 +++-- fs/bcachefs/journal_io.c | 53 +++++----- fs/bcachefs/journal_reclaim.c | 11 +- fs/bcachefs/rebalance.c | 42 +++++--- fs/bcachefs/recovery.c | 22 +++- fs/bcachefs/replicas.c | 7 +- fs/bcachefs/super-io.c | 33 ++---- fs/bcachefs/super.c | 43 +++----- fs/bcachefs/sysfs.c | 193 +++++++++++++++--------------------- fs/bcachefs/tests.c | 14 ++- fs/bcachefs/util.c | 35 ++++++- fs/bcachefs/util.h | 78 ++++++++------- fs/bcachefs/xattr.c | 22 ++-- 28 files changed, 808 insertions(+), 620 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 3e62eeb6774e..a1115abf83bb 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -57,11 +57,12 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, tmp = __bch2_bkey_unpack_key(format, packed); if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - char buf1[160], buf2[160]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; char buf3[160], buf4[160]; - bch2_bkey_to_text(&PBUF(buf1), unpacked); - bch2_bkey_to_text(&PBUF(buf2), &tmp); + bch2_bkey_to_text(&buf1, unpacked); + bch2_bkey_to_text(&buf2, 
&tmp); bch2_to_binary(buf3, (void *) unpacked, 80); bch2_to_binary(buf4, high_word(format, packed), 80); @@ -72,7 +73,7 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, format->bits_per_field[2], format->bits_per_field[3], format->bits_per_field[4], - buf1, buf2, buf3, buf4); + buf1.buf, buf2.buf, buf3, buf4); } } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 58e510fa19bd..adea3cea343b 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -58,7 +58,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, struct bkey_packed *_k, *_n; struct bkey uk, n; struct bkey_s_c k; - char buf[200]; + struct printbuf buf = PRINTBUF; if (!i->u64s) return; @@ -69,12 +69,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); + + printbuf_reset(&buf); if (c) - bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_bkey_val_to_text(&buf, c, k); else - bch2_bkey_to_text(&PBUF(buf), k.k); + bch2_bkey_to_text(&buf, k.k); printk(KERN_ERR "block %u key %5zu: %s\n", set, - _k->_data - i->_data, buf); + _k->_data - i->_data, buf.buf); if (_n == vstruct_last(i)) continue; @@ -90,6 +92,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } + + printbuf_exit(&buf); } void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -106,6 +110,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_node_iter *iter) { struct btree_node_iter_set *set; + struct printbuf buf = PRINTBUF; printk(KERN_ERR "btree node iter with %u/%u sets:\n", __btree_node_iter_used(iter), b->nsets); @@ -114,12 +119,14 @@ void bch2_dump_btree_node_iter(struct btree *b, struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); struct bset_tree *t = bch2_bkey_to_bset(b, k); struct bkey uk = bkey_unpack_key(b, k); - char buf[100]; - bch2_bkey_to_text(&PBUF(buf), &uk); + printbuf_reset(&buf); + bch2_bkey_to_text(&buf, &uk); printk(KERN_ERR "set %zu key %u: %s\n", - t - b->set, set->k, buf); + t - b->set, set->k, buf.buf); } + + printbuf_exit(&buf); } #ifdef CONFIG_BCACHEFS_DEBUG @@ -155,13 +162,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); - char buf1[80], buf2[80]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &ku); - bch2_bkey_to_text(&PBUF(buf2), &nu); + bch2_bkey_to_text(&buf1, &ku); + bch2_bkey_to_text(&buf2, &nu); printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1, buf2); + buf1.buf, buf2.buf); printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { @@ -226,6 +234,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; #if 0 BUG_ON(prev && bkey_iter_cmp(b, prev, insert) > 0); @@ -234,17 +244,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, prev, insert) > 0) { struct bkey k1 = bkey_unpack_key(b, prev); struct bkey k2 = bkey_unpack_key(b, insert); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + 
bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("prev > insert:\n" "prev key %s\n" "insert key %s\n", - buf1, buf2); + buf1.buf, buf2.buf); } #endif #if 0 @@ -255,17 +263,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, insert, next) > 0) { struct bkey k1 = bkey_unpack_key(b, insert); struct bkey k2 = bkey_unpack_key(b, next); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("insert > next:\n" "insert key %s\n" "next key %s\n", - buf1, buf2); + buf1.buf, buf2.buf); } #endif } @@ -1555,9 +1561,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey uk; unsigned j, inorder; - if (out->pos != out->end) - *out->pos = '\0'; - if (!bset_has_ro_aux_tree(t)) return; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c17db1d07187..dbf3b084478f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -742,14 +742,16 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - char buf1[200], buf2[100], buf3[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); - bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); + bch2_bpos_to_text(&buf2, b->data->min_key); + bch2_bpos_to_text(&buf3, b->data->max_key); bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" "btree %s level %u\n" @@ -757,10 +759,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) "header: btree %s level %llu\n" "min %s max %s\n", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, + buf1.buf, bch2_btree_ids[BTREE_NODE_ID(b->data)], BTREE_NODE_LEVEL(b->data), - buf2, buf3); + buf2.buf, buf3.buf); + + printbuf_exit(&buf3); + printbuf_exit(&buf2); + printbuf_exit(&buf1); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index fbd54ac790ba..8eae5fb35c84 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -70,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos expected_start = bkey_deleted(&prev->k->k) ? 
node_start : bpos_successor(prev->k->k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bkey_deleted(&prev->k->k)) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, node_start); - } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); - } - if (bpos_cmp(expected_start, bp->v.min_key)) { bch2_topology_error(c); + if (bkey_deleted(&prev->k->k)) { + pr_buf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, node_start); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); + } + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -95,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c, " prev %s\n" " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && + buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } @@ -109,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (is_last && bpos_cmp(cur.k->k.p, node_end)) { bch2_topology_error(c); + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -117,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c, " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && + buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } } bch2_bkey_buf_copy(prev, c, cur.k); +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -251,18 +260,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, struct bpos expected_start = !prev ? 
b->data->min_key : bpos_successor(prev->key.k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (!prev) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, b->data->min_key); + pr_buf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, b->data->min_key); } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (prev && bpos_cmp(expected_start, cur->data->min_key) > 0 && @@ -275,8 +283,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_PREV_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, @@ -284,7 +294,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) + buf1.buf, buf2.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); } else { @@ -296,39 +306,49 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_THIS_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) + buf1.buf, buf2.buf)) ret = set_node_min(c, cur, expected_start); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, struct btree *child) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); + bch2_bpos_to_text(&buf2, b->key.k.p); + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { + buf1.buf, buf2.buf)) { ret = set_node_max(c, child, b->key.k.p); if (ret) - return ret; + goto err; } +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -339,7 +359,7 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; - char buf[200]; + struct printbuf buf; int ret = 0; if (!b->c.level) @@ -363,12 +383,15 @@ again: false); ret = PTR_ERR_OR_ZERO(cur); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + if (mustfix_fsck_err_on(ret == -EIO, c, "Unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { + buf.buf)) { bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); @@ -468,12 +491,14 @@ 
again: have_child = true; } + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + if (mustfix_fsck_err_on(!have_child, c, "empty interior btree node at btree %s level %u\n" " %s", bch2_btree_ids[b->c.btree_id], - b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) + b->c.level, buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -489,6 +514,7 @@ fsck_err: if (!ret && dropped_children) goto again; + printbuf_exit(&buf); return ret; } @@ -524,7 +550,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; bool do_update = false; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; /* @@ -542,7 +568,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->_mark.gen = p.ptr.gen; g->gen_valid = true; @@ -557,7 +584,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->_mark.gen = p.ptr.gen; g->gen_valid = true; @@ -576,7 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!p.ptr.cached && @@ -586,7 +615,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) @@ -599,7 +629,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[g->mark.data_type], bch2_data_types[data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { g->_mark.data_type = data_type; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); @@ -615,14 +646,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; } } @@ -635,13 +668,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, if (is_root) { bch_err(c, "cannot update btree roots yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } 
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; + ret = -ENOMEM; + goto err; } bkey_reassemble(new, *k); @@ -705,19 +740,25 @@ found: ret = bch2_journal_key_insert_take(c, btree_id, level, new); if (ret) { kfree(new); - return ret; + goto err; } if (level) bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); - bch2_bkey_val_to_text(&PBUF(buf), c, *k); - bch_info(c, "updated %s", buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + *k = bkey_i_to_s_c(new); } +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -852,7 +893,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -913,7 +954,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ret = FSCK_ERR_START_TOPOLOGY_REPAIR; bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -943,6 +985,7 @@ fsck_err: bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); + printbuf_exit(&buf); return ret; } @@ -956,7 +999,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - char buf[100]; + struct printbuf buf = PRINTBUF; int ret = 0; b = c->btree_roots[btree_id].b; @@ -965,17 +1008,19 @@ static int bch2_gc_btree_init(struct btree_trans *trans, return 0; six_lock_read(&b->c.lock, NULL, NULL); + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, - "btree root with incorrect min_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { + "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = FSCK_ERR_EXIT; goto fsck_err; } + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, - "btree root with incorrect max_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = FSCK_ERR_EXIT; goto fsck_err; @@ -995,6 +1040,7 @@ fsck_err: if (ret < 0) bch_err(c, "%s: ret %i", __func__, ret); + printbuf_exit(&buf); return ret; } @@ -1131,6 +1177,7 @@ static int bch2_gc_done(struct bch_fs *c, bool initial, bool metadata_only) { struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; bool verify = !metadata_only && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; @@ -1201,16 +1248,16 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - char buf[80]; if (metadata_only && (e->data_type == BCH_DATA_user || e->data_type == BCH_DATA_cached)) continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); - copy_fs_field(replicas[i], "%s", buf); + copy_fs_field(replicas[i], "%s", buf.buf); } } @@ -1225,6 +1272,7 @@ fsck_err: bch_err(c, "%s: ret %i", __func__, ret); percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -1424,7 +1472,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) struct bkey_s_c k; struct reflink_gc *r; size_t idx = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (metadata_only) @@ -1452,7 +1500,8 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) "reflink key has wrong refcount:\n" " %s\n" " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { struct bkey_i *new; @@ -1481,6 +1530,7 @@ fsck_err: bch2_trans_iter_exit(&trans, &iter); c->reflink_gc_nr = 0; bch2_trans_exit(&trans); + printbuf_exit(&buf); return ret; } @@ -1539,7 +1589,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) struct bkey_s_c k; struct gc_stripe *m; const struct bch_stripe *s; - char buf[200]; + struct printbuf buf = PRINTBUF; unsigned i; int ret = 0; @@ -1565,7 +1615,8 @@ inconsistent: "stripe has wrong block sector count %u:\n" " %s\n" " should be %u", i, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), m ? 
m->block_sectors[i] : 0)) { struct bkey_i_stripe *new; @@ -1589,6 +1640,8 @@ fsck_err: bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c65c640753b6..1dc21b5948ea 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -534,13 +534,7 @@ enum btree_validate_ret { #define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ __label__ out; \ - char _buf[300]; \ - char *_buf2 = _buf; \ - struct printbuf out = PBUF(_buf); \ - \ - _buf2 = kmalloc(4096, GFP_ATOMIC); \ - if (_buf2) \ - out = _PBUF(_buf2, 4986); \ + struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ @@ -548,14 +542,13 @@ enum btree_validate_ret { if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", _buf2); \ + mustfix_fsck_err(c, "%s", out.buf); \ goto out; \ } \ \ switch (write) { \ case READ: \ - if (_buf2) \ - bch_err(c, "%s", _buf2); \ + bch_err(c, "%s", out.buf); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -576,7 +569,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", _buf2); \ + bch_err(c, "corrupt metadata before write: %s", out.buf);\ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -585,8 +578,7 @@ enum btree_validate_ret { break; \ } \ out: \ - if (_buf2 != _buf) \ - kfree(_buf2); \ + printbuf_exit(&out); \ true; \ }) @@ -648,8 +640,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, { unsigned version = le16_to_cpu(i->version); const char *err; - char buf1[100]; - char buf2[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -686,7 +678,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; - return 0; + ret = 0; + goto out; } btree_err_on(offset && !i->u64s, @@ -737,14 +730,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), + (printbuf_reset(&buf2), + bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, @@ -759,7 +755,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -769,6 +768,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; @@ -807,11 +808,10 @@ static int 
validate_bset_keys(struct bch_fs *c, struct btree *b, (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + printbuf_reset(&buf1); + bch2_bkey_val_to_text(&buf1, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey: %s\n%s", invalid, buf); + "invalid bkey: %s\n%s", invalid, buf1.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -825,18 +825,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, &b->format, k); if (prev && bkey_iter_cmp(b, prev, k) > 0) { - char buf1[80]; - char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &up); - bch2_bkey_to_text(&PBUF(buf2), u.k); + printbuf_reset(&buf1); + bch2_bkey_to_text(&buf1, &up); + printbuf_reset(&buf2); + bch2_bkey_to_text(&buf2, u.k); bch2_dump_bset(c, b, i, 0); if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "keys out of order: %s > %s", - buf1, buf2)) { + buf1.buf, buf2.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -848,6 +848,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k = bkey_next(k); } fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -1063,11 +1065,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (invalid || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey %s: %s", buf, invalid); + printbuf_exit(&buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1124,8 +1127,7 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; - char buf[200]; - struct printbuf out; + struct printbuf buf = PRINTBUF; bool saw_error = false; bool can_retry; @@ -1145,10 +1147,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - out = PBUF(buf); - btree_pos_to_text(&out, c, b); + printbuf_reset(&buf); + btree_pos_to_text(&buf, c, b); bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf); + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1174,6 +1176,7 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); + printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) bch2_btree_node_rewrite_async(c, b); @@ -1254,6 +1257,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) container_of(cl, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; + struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; bool have_retry = false; int ret = 0, best = -1, write = READ; @@ -1297,8 +1301,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl) fsck_err: if (dump_bset_maps) { for (i = 0; i < ra->nr; i++) { - char buf[200]; - struct printbuf out = PBUF(buf); struct btree_node *bn = ra->buf[i]; struct btree_node_entry *bne = NULL; unsigned offset = 0, sectors; @@ -1307,6 +1309,8 @@ 
fsck_err: if (ra->err[i]) continue; + printbuf_reset(&buf); + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); @@ -1317,10 +1321,10 @@ fsck_err: sectors = vstruct_sectors(bne, c->block_bits); } - pr_buf(&out, " %u-%u", offset, offset + sectors); + pr_buf(&buf, " %u-%u", offset, offset + sectors); if (bne && bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + pr_buf(&buf, "*"); offset += sectors; } @@ -1328,19 +1332,19 @@ fsck_err: bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) - pr_buf(&out, " GAP"); + pr_buf(&buf, " GAP"); gap = true; sectors = vstruct_sectors(bne, c->block_bits); - pr_buf(&out, " %u-%u", offset, offset + sectors); + pr_buf(&buf, " %u-%u", offset, offset + sectors); if (bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + pr_buf(&buf, "*"); } offset++; } - bch_err(c, "replica %u:%s", i, buf); + bch_err(c, "replica %u:%s", i, buf.buf); } } @@ -1361,6 +1365,7 @@ fsck_err: closure_debug_destroy(&ra->cl); kfree(ra); + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1461,23 +1466,23 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; - btree_pos_to_text(&PBUF(buf), c, b); + btree_pos_to_text(&buf, c, b); trace_btree_read(c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) - return; + goto out; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from\n" - " at %s", buf)) { + " at %s", buf.buf)) { set_btree_node_read_error(b); - return; + goto out; } ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1519,6 +1524,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, else queue_work(c->io_complete_wq, &rb->work); } +out: + printbuf_exit(&buf); } int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b58219292f34..92258281fdc7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -574,7 +574,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; - char buf1[100], buf2[100], buf3[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; const char *msg; if (!bch2_debug_check_iterators) @@ -622,26 +624,27 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, btree_node_unlock(path, level); return; err: - strcpy(buf2, "(none)"); - strcpy(buf3, "(none)"); - - bch2_bpos_to_text(&PBUF(buf1), path->pos); + bch2_bpos_to_text(&buf1, path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); - bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bkey_to_text(&buf2, &uk); + } else { + pr_buf(&buf2, "(none)"); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); - bch2_bkey_to_text(&PBUF(buf3), &uk); + bch2_bkey_to_text(&buf3, &uk); + } else { + pr_buf(&buf3, "(none)"); } panic("path should be %s key at level %u:\n" "path pos %s\n" "prev key %s\n" "cur key %s\n", - msg, level, buf1, buf2, buf3); + msg, level, buf1.buf, buf2.buf, buf3.buf); } static void bch2_btree_path_verify(struct btree_trans *trans, @@ -739,16 +742,16 @@ static 
int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bkey_cmp(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, prev.k->p.snapshot) > 0) { - char buf1[100], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bkey_to_text(&PBUF(buf1), k.k); - bch2_bkey_to_text(&PBUF(buf2), prev.k); + bch2_bkey_to_text(&buf1, k.k); + bch2_bkey_to_text(&buf2, prev.k); panic("iter snap %u\n" "k %s\n" "prev %s\n", iter->snapshot, - buf1, buf2); + buf1.buf, buf2.buf); } out: bch2_trans_iter_exit(trans, ©); @@ -760,7 +763,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, { struct btree_path *path; unsigned idx; - char buf[100]; + struct printbuf buf = PRINTBUF; trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: @@ -786,9 +789,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, } bch2_dump_trans_paths_updates(trans); + bch2_bpos_to_text(&buf, pos); + panic("not locked: %s %s%s\n", - bch2_btree_ids[id], - (bch2_bpos_to_text(&PBUF(buf), pos), buf), + bch2_btree_ids[id], buf.buf, key_cache ? " cached" : ""); } @@ -1071,23 +1075,23 @@ static void btree_path_verify_new_node(struct btree_trans *trans, if (!k || bkey_deleted(k) || bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - char buf1[100]; - char buf2[100]; - char buf3[100]; - char buf4[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; + struct printbuf buf4 = PRINTBUF; struct bkey uk = bkey_unpack_key(b, k); bch2_dump_btree_node(c, l->b); - bch2_bpos_to_text(&PBUF(buf1), path->pos); - bch2_bkey_to_text(&PBUF(buf2), &uk); - bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + bch2_bpos_to_text(&buf1, path->pos); + bch2_bkey_to_text(&buf2, &uk); + bch2_bpos_to_text(&buf3, b->data->min_key); + bch2_bpos_to_text(&buf3, b->data->max_key); panic("parent iter doesn't point to new node:\n" "iter pos %s %s\n" "iter key %s\n" "new node %s-%s\n", - bch2_btree_ids[path->btree_id], buf1, - buf2, buf3, buf4); + bch2_btree_ids[path->btree_id], + buf1.buf, buf2.buf, buf3.buf, buf4.buf); } if (!parent_locked) @@ -1783,18 +1787,22 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) { struct btree_path *path; struct btree_insert_entry *i; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; unsigned idx; - char buf1[300], buf2[300]; btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) + trans_for_each_path_inorder(trans, path, idx) { + printbuf_reset(&buf1); + + bch2_bpos_to_text(&buf1, path->pos); + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, path->should_be_locked ? " S" : "", path->preserve ? 
" P" : "", bch2_btree_ids[path->btree_id], - (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + buf1.buf, path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated @@ -1802,17 +1810,25 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) NULL #endif ); + } trans_for_each_update(trans, i) { struct bkey u; struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + printbuf_reset(&buf1); + printbuf_reset(&buf2); + bch2_bkey_val_to_text(&buf1, trans->c, old); + bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", bch2_btree_ids[i->btree_id], (void *) i->ip_allocated, - (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), - (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + buf1.buf, buf2.buf); } + + printbuf_exit(&buf2); + printbuf_exit(&buf1); } static struct btree_path *btree_path_alloc(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 255753b2dc0e..ed0a70f7ea68 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -41,7 +41,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) struct bkey_s_c k; struct bkey_s_c_btree_ptr_v2 bp; struct bkey unpacked; - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; BUG_ON(!b->c.level); @@ -58,9 +58,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); - panic("expected next min_key %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); + bch2_bpos_to_text(&buf1, next_node); + bch2_bpos_to_text(&buf2, bp.v->min_key); + panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); } bch2_btree_node_iter_advance(&iter, b); @@ -68,9 +68,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bch2_btree_node_iter_end(&iter)) { if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); - panic("expected end %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), - (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); + bch2_bpos_to_text(&buf1, b->key.k.p); + bch2_bpos_to_text(&buf2, k.k->p); + panic("expected end %s got %s\n", buf1.buf, buf2.buf); } break; } @@ -1151,10 +1151,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); - bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); + printbuf_exit(&buf); dump_stack(); } @@ -1636,15 +1637,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); - bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); bch_err(c, "btree topology error in btree merge:\n" " prev ends at %s\n" " next 
starts at %s", - buf1, buf2); + buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); bch2_topology_error(c); ret = -EIO; goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e9e10df8ee95..4b0e00f32a96 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -831,11 +831,12 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, const char *invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type); if (invalid) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", - buf, trans->fn, (void *) i->ip_allocated, invalid); + buf.buf, trans->fn, (void *) i->ip_allocated, invalid); + printbuf_exit(&buf); return -EINVAL; } btree_insert_entry_checks(trans, i); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 136a5727ea20..7d3636e20c81 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -376,22 +376,23 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, { struct bch_fs_usage __percpu *fs_usage; int idx, ret = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; percpu_down_read(&c->mark_lock); + buf.atomic++; idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err(c, "no replicas entry\n" " while marking %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); - if (ret) - return ret; - percpu_down_read(&c->mark_lock); + + if (ret) + goto err; idx = bch2_replicas_entry_idx(c, r); } if (idx < 0) { @@ -407,6 +408,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, err: fsck_err: percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -678,7 +680,8 @@ static int check_bucket_ref(struct bch_fs *c, u16 bucket_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; - char buf[200]; + struct printbuf buf = PRINTBUF; + int ret = 0; if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -687,8 +690,9 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { @@ -698,8 +702,10 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } if (b_gen != ptr->gen && !ptr->cached) { @@ -710,12 +716,16 @@ static int check_bucket_ref(struct bch_fs *c, *bucket_gen(ca, bucket_nr), bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if (b_gen != ptr->gen) - return 1; + if (b_gen != ptr->gen) { + ret = 1; + goto err; + } if (bucket_data_type && ptr_data_type && bucket_data_type != ptr_data_type) { @@ -725,8 +735,10 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { @@ -736,11 +748,14 @@ static int check_bucket_ref(struct bch_fs *c, ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], bucket_sectors, sectors, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - - return 0; +err: + printbuf_exit(&buf); + return ret; } static int mark_stripe_bucket(struct btree_trans *trans, @@ -759,7 +774,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g; struct bucket_mark new, old; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -767,6 +782,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); + buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); if (g->mark.dirty_sectors || @@ -774,7 +790,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EINVAL; goto err; } @@ -799,8 +815,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); - - return 0; + printbuf_exit(&buf); + return ret; } static int __mark_pointer(struct btree_trans *trans, @@ -987,10 +1003,11 @@ static int bch2_mark_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); if (ret) { - char buf[200]; + struct 
printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -1019,13 +1036,16 @@ static int bch2_mark_stripe(struct btree_trans *trans, struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m || (old_s && !m->alive)) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf1), c, old); - bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" "old %s\n" - "new %s", idx, buf1, buf2); + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); bch2_inconsistent_error(c); return -1; } @@ -1090,10 +1110,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, ((s64) m->sectors * m->nr_redundant), journal_seq, gc); if (ret) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -1174,7 +1195,7 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; u64 next_idx = end; s64 ret = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; if (r_idx >= c->reflink_gc_nr) goto not_found; @@ -1193,7 +1214,7 @@ not_found: if (fsck_err(c, "pointer to missing indirect extent\n" " %s\n" " missing range %llu-%llu", - (bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c), buf), + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), *idx, next_idx)) { struct bkey_i_error new; @@ -1207,6 +1228,7 @@ not_found: *idx = next_idx; fsck_err: + printbuf_exit(&buf); return ret; } @@ -1289,7 +1311,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; bch_err(c, "disk usage increased %lli more than %u sectors reserved", should_not_have_added, disk_res_sectors); @@ -1298,13 +1320,17 @@ void fs_usage_apply_warn(struct btree_trans *trans, struct bkey_s_c old = { &i->old_k, i->old_v }; pr_err("while inserting"); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - pr_err(" %s", buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err(" %s", buf.buf); pr_err("overlapping with"); - bch2_bkey_val_to_text(&PBUF(buf), c, old); - pr_err(" %s", buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, old); + pr_err(" %s", buf.buf); } + __WARN(); + printbuf_exit(&buf); } int bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1744,7 +1770,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -1764,19 +1790,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_fs_inconsistent(c, "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_fs_inconsistent(c, "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } @@ -1811,6 +1837,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, *idx = k.k->p.offset; err: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index da91c95e3ffc..342797303415 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -157,6 +157,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned long now; unsigned i; + out->atomic++; spin_lock(&clock->timer_lock); now = atomic64_read(&clock->now); @@ -165,6 +166,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); + --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 02a5ef5ecb3e..c3bfa7f1d77d 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -169,10 +169,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) failed |= bch2_btree_verify_replica(c, b, p); if (failed) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + printbuf_exit(&buf); } out: mutex_unlock(&c->verify_lock); @@ -188,8 +189,7 @@ struct dump_iter { struct bch_fs *c; enum btree_id id; - char buf[1 << 12]; - size_t bytes; /* what's currently in buf */ + struct printbuf buf; char __user *ubuf; /* destination user buffer */ size_t size; /* size of requested read */ @@ -198,9 +198,9 @@ struct dump_iter { static int flush_buf(struct dump_iter *i) { - if (i->bytes) { - size_t bytes = min(i->bytes, i->size); - int err = copy_to_user(i->ubuf, i->buf, bytes); + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); + int err = copy_to_user(i->ubuf, i->buf.buf, bytes); if (err) return err; @@ -208,8 +208,8 @@ static int flush_buf(struct dump_iter *i) i->ret += bytes; i->ubuf += bytes; i->size -= bytes; - i->bytes -= bytes; - memmove(i->buf, i->buf + bytes, i->bytes); + i->buf.pos -= bytes; + memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); } return 0; @@ -228,13 +228,17 @@ static int bch2_dump_open(struct inode *inode, struct file *file) i->from = POS_MIN; i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); i->id = bd->id; + i->buf = PRINTBUF; return 0; } static int bch2_dump_release(struct inode *inode, struct file *file) { - kfree(file->private_data); + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); return 0; } @@ -266,11 +270,8 @@ 
static ssize_t bch2_read_btree(struct file *file, char __user *buf, k = bch2_btree_iter_peek(&iter); while (k.k && !(err = bkey_err(k))) { - bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); - i->bytes = strlen(i->buf); - BUG_ON(i->bytes >= sizeof(i->buf)); - i->buf[i->bytes] = '\n'; - i->bytes++; + bch2_bkey_val_to_text(&i->buf, i->c, k); + pr_char(&i->buf, '\n'); k = bch2_btree_iter_next(&iter); i->from = iter.pos; @@ -319,8 +320,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); - i->bytes = strlen(i->buf); + bch2_btree_node_to_text(&i->buf, i->c, b); err = flush_buf(i); if (err) break; @@ -384,16 +384,14 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_btree_node_iter_peek(&l->iter, l->b); if (l->b != prev_node) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); - i->bytes = strlen(i->buf); + bch2_btree_node_to_text(&i->buf, i->c, l->b); err = flush_buf(i); if (err) break; } prev_node = l->b; - bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); - i->bytes = strlen(i->buf); + bch2_bfloat_to_text(&i->buf, l->b, _k); err = flush_buf(i); if (err) break; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 86421f65d139..b220b523d856 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -286,14 +286,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - char buf2[200]; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); bch_err_ratelimited(c, "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", (void *) _RET_IP_, i, j, v->csum_type, - want.lo, got.lo, buf2); + want.lo, got.lo, buf2.buf); + printbuf_exit(&buf2); clear_bit(i, buf->valid); break; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 9e8b085e36d7..2aaeee585157 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1676,7 +1676,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; enum bch_opt_id i; - char buf[512]; + struct printbuf buf = PRINTBUF; + int ret = 0; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; @@ -1688,13 +1689,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - bch2_opt_to_text(&PBUF(buf), c, opt, v, + printbuf_reset(&buf); + bch2_opt_to_text(&buf, c, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); - seq_puts(seq, buf); + seq_puts(seq, buf.buf); } - return 0; + if (buf.allocation_failure) + ret = -ENOMEM; + printbuf_exit(&buf); + return ret; } static void bch2_put_super(struct super_block *sb) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ced4d671eb8d..8783b950055e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -698,15 +698,16 @@ static int check_key_has_snapshot(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + 
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: + printbuf_exit(&buf); return ret; } @@ -746,7 +747,7 @@ static int hash_check_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; - char buf[200]; + struct printbuf buf = PRINTBUF; struct bkey_s_c k; u64 hash; int ret = 0; @@ -770,8 +771,9 @@ static int hash_check_key(struct btree_trans *trans, if (fsck_err_on(k.k->type == desc.key_type && !desc.cmp_bkey(k, hash_k), c, "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - hash_k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + buf.buf))) { ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } @@ -782,13 +784,16 @@ static int hash_check_key(struct btree_trans *trans, } } +out: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " "hashed to %llu\n%s", desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, - (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) return 0; ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -796,9 +801,9 @@ bad_hash: bch_err(c, "hash_redo_key err %i", ret); return ret; } - return -EINTR; + ret = -EINTR; fsck_err: - return ret; + goto out; } static int check_inode(struct btree_trans *trans, @@ -1166,32 +1171,34 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct bkey_s_c k; struct inode_walker_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; k = bch2_btree_iter_peek(iter); if (!k.k) - return 0; + goto out; ret = bkey_err(k); if (ret) - return ret; + goto err; ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto out; + } ret = snapshots_seen_update(c, s, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (inode->cur_inum != k.k->p.inode) { ret = check_i_sectors(trans, inode); if (ret) - return ret; + goto err; } #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -1201,22 +1208,29 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { + ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + goto out; + } } #endif ret = __walk_inode(trans, inode, k.k->p); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "extent in missing inode:\n %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } i = inode->d + ret; ret = 0; @@ -1225,9 +1239,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !S_ISLNK(i->inode.bi_mode), c, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { @@ -1237,11 +1254,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, k.k->p.snapshot), POS(k.k->p.inode, U64_MAX), 0, NULL) ?: -EINTR; + goto out; } } } @@ -1253,7 +1271,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_reassemble(&prev, c, k); #endif +out: +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1351,7 +1372,7 @@ static int check_dirent_target(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_i_dirent *n; bool backpointer_exists = true; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1377,9 +1398,7 @@ static int check_dirent_target(struct btree_trans *trans, "directory %llu with multiple links", target->bi_inum)) { ret = __remove_dirent(trans, d.k->p); - if (ret) - goto err; - return 0; + goto out; } if (fsck_err_on(backpointer_exists && @@ -1416,18 +1435,19 @@ static int check_dirent_target(struct btree_trans *trans, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), - (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 
ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } @@ -1441,19 +1461,21 @@ static int check_dirent_target(struct btree_trans *trans, n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } +out: err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1467,46 +1489,53 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; struct bkey_s_c_dirent d; struct inode_walker_entry *i; - char buf[200]; - int ret; + struct printbuf buf = PRINTBUF; + int ret = 0; k = bch2_btree_iter_peek(iter); if (!k.k) - return 0; + goto out; ret = bkey_err(k); if (ret) - return ret; + goto err; ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } ret = snapshots_seen_update(c, s, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (dir->cur_inum != k.k->p.inode) { ret = check_subdir_count(trans, dir); if (ret) - return ret; + goto err; } ret = __walk_inode(trans, dir, k.k->p); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } i = dir->d + ret; ret = 0; @@ -1514,8 +1543,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %s:\n%s", bch2_d_type_str(inode_d_type(&i->inode)), - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, 0); + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } if (dir->first_this_inode) *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); @@ -1523,12 +1555,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) - return ret; - if (ret) /* dirent has been deleted */ - return 0; + goto err; + if (ret) { + /* dirent has been deleted */ + ret = 0; + goto out; + } if (k.k->type != KEY_TYPE_dirent) - return 0; + goto out; d = bkey_s_c_to_dirent(k); @@ -1541,24 +1576,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && ret != -ENOENT) - return ret; + goto err; if (fsck_err_on(ret, c, "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_child_subvol))) - return __remove_dirent(trans, d.k->p); + le64_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && ret != -ENOENT) - return ret; + goto 
err; if (fsck_err_on(ret, c, "subvolume %u points to missing subvolume root %llu", target_subvol, target_inum)) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, @@ -1568,32 +1606,33 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, subvol_root.bi_subvol = target_subvol; ret = __write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } else { ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) - return ret; + goto err; if (fsck_err_on(!target->nr, c, "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { ret = __remove_dirent(trans, d.k->p); if (ret) - return ret; + goto err; } for (i = target->d; i < target->d + target->nr; i++) { ret = check_dirent_target(trans, iter, d, &i->inode, i->snapshot); if (ret) - return ret; + goto err; } } @@ -1601,7 +1640,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) i->count++; +out: +err: fsck_err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4b9ff76dd19f..c9204cab055d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2057,11 +2057,11 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), @@ -2069,12 +2069,14 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (ret) - return; + goto out; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch_err(c, "%s", buf); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); bch2_trans_iter_exit(trans, &iter); +out: + printbuf_exit(&buf); } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 279e960f2307..a579e6483d1e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -414,18 +414,18 @@ unlock: !can_discard && j->reservations.idx == j->reservations.unwritten_idx && (flags & JOURNAL_RES_GET_RESERVED)) { - char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + struct printbuf buf = PRINTBUF; bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full"); - if (journal_debug_buf) { - bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "%s", journal_debug_buf); - bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "Journal pins:\n%s", journal_debug_buf); - kfree(journal_debug_buf); - } + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); bch2_fatal_error(c); dump_stack(); } @@ -1186,6 +1186,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; unsigned i; + out->atomic++; + rcu_read_lock(); s = READ_ONCE(j->reservations); @@ -1270,6 +1272,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } rcu_read_unlock(); + + --out->atomic; } void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) @@ -1286,6 +1290,8 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) u64 i; spin_lock(&j->lock); + out->atomic++; + fifo_for_each_entry_ptr(pin_list, &j->pin, i) { pr_buf(out, "%llu: count %u\n", i, atomic_read(&pin_list->count)); @@ -1305,5 +1311,7 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "\t%px %ps\n", pin, pin->flush); } + + --out->atomic; spin_unlock(&j->lock); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 302af332b632..bbec4d85b6bc 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -251,14 +251,15 @@ static int journal_validate_key(struct bch_fs *c, const char *where, invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id)); if (invalid) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s), - invalid, buf); + invalid, buf.buf); + printbuf_exit(&buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -995,6 +996,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; + struct printbuf buf = PRINTBUF; size_t keys = 0, entries = 0; bool degraded = false; u64 seq, last_seq = 0; @@ -1053,7 +1055,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (!last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); - return -1; + ret = -1; + goto err; } /* Drop blacklisted entries and entries older than last_seq: */ @@ -1085,7 +1088,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -1101,14 +1104,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, seq++; if (i->list.prev != list) { - struct printbuf out = PBUF(buf1); struct journal_replay *p = list_prev_entry(i, list); - bch2_journal_ptrs_to_text(&out, c, p); - pr_buf(&out, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + bch2_journal_ptrs_to_text(&buf1, c, p); + pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); } else - 
sprintf(buf1, "(none)"); - bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + pr_buf(&buf1, "(none)"); + bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" @@ -1116,7 +1118,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, " next at %s", missing_start, missing_end, last_seq, *blacklist_seq - 1, - buf1, buf2); + buf1.buf, buf2.buf); + + printbuf_exit(&buf1); + printbuf_exit(&buf2); } seq++; @@ -1130,14 +1135,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, .e.nr_required = 1, }; unsigned ptr; - char buf[80]; if (i->ignore) continue; ret = jset_validate_entries(c, &i->j, READ); if (ret) - goto fsck_err; + goto err; for (ptr = 0; ptr < i->nr_ptrs; ptr++) replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; @@ -1149,15 +1153,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, * the devices - this is wrong: */ + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, "superblock not marked as containing replicas %s", - (bch2_replicas_entry_to_text(&PBUF(buf), - &replicas.e), buf)))) { + buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) - return ret; + goto err; } for_each_jset_key(k, _n, entry, &i->j) @@ -1171,7 +1177,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (*start_seq != *blacklist_seq) bch_info(c, "dropped unflushed entries %llu-%llu", *blacklist_seq, *start_seq - 1); +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1481,7 +1489,7 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - char *journal_debug_buf = NULL; + struct printbuf journal_debug_buf = PRINTBUF; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; @@ -1586,11 +1594,8 @@ retry_alloc: goto retry_alloc; } - if (ret) { - journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - if (journal_debug_buf) - __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - } + if (ret) + __bch2_journal_debug_to_text(&journal_debug_buf, j); /* * write is allocated, no longer need to account for it in @@ -1607,8 +1612,8 @@ retry_alloc: if (ret) { bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf); - kfree(journal_debug_buf); + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); return; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c15b18831512..2d5382a83003 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -216,14 +216,11 @@ void bch2_journal_space_available(struct journal *j) if (!clean_ondisk && j->reservations.idx == j->reservations.unwritten_idx) { - char *buf = kmalloc(4096, GFP_ATOMIC); + struct printbuf buf = PRINTBUF; - bch_err(c, "journal stuck"); - if (buf) { - __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); - pr_err("\n%s", buf); - kfree(buf); - } + __bch2_journal_debug_to_text(&buf, j); + bch_err(c, "journal stuck\n%s", buf.buf); + printbuf_exit(&buf); bch2_fatal_error(c); ret = cur_entry_journal_stuck; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index fe0a1dbac199..babf98894e87 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,35 +257,47 @@ void 
bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); - char h1[21], h2[21]; - bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); - bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(out, "fullest_dev (%i):\t%s/%s\n", - w.dev_most_full_idx, h1, h2); + out->tabstops[0] = 20; - bch2_hprint(&PBUF(h1), w.total_work << 9); - bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); + pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); + pr_tab(out); - pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); + bch2_hprint(out, w.dev_most_full_work << 9); + pr_buf(out, "/"); + bch2_hprint(out, w.dev_most_full_capacity << 9); + pr_newline(out); + + pr_buf(out, "total work:"); + pr_tab(out); + + bch2_hprint(out, w.total_work << 9); + pr_buf(out, "/"); + bch2_hprint(out, c->capacity << 9); + pr_newline(out); + + pr_buf(out, "rate:"); + pr_tab(out); + pr_buf(out, "%u", r->pd.rate.rate); + pr_newline(out); switch (r->state) { case REBALANCE_WAITING: - pr_buf(out, "waiting\n"); + pr_buf(out, "waiting"); break; case REBALANCE_THROTTLED: - bch2_hprint(&PBUF(h1), + pr_buf(out, "throttled for %lu sec or ", + (r->throttled_until_cputime - jiffies) / HZ); + bch2_hprint(out, (r->throttled_until_iotime - atomic64_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(out, "throttled for %lu sec or %s io\n", - (r->throttled_until_cputime - jiffies) / HZ, - h1); + pr_buf(out, " io"); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n"); + pr_buf(out, "running"); break; } + pr_newline(out); } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ae9ae1c7138c..6c4ffc5abdc5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -760,6 +760,8 @@ static int verify_superblock_clean(struct bch_fs *c, { unsigned i; struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -772,7 +774,6 @@ static int verify_superblock_clean(struct bch_fs *c, } for (i = 0; i < BTREE_ID_NR; i++) { - char buf1[200], buf2[200]; struct bkey_i *k1, *k2; unsigned l1 = 0, l2 = 0; @@ -782,6 +783,19 @@ static int verify_superblock_clean(struct bch_fs *c, if (!k1 && !k2) continue; + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + pr_buf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + pr_buf(&buf2, "(none)"); + mustfix_fsck_err_on(!k1 || !k2 || IS_ERR(k1) || IS_ERR(k2) || @@ -791,10 +805,12 @@ static int verify_superblock_clean(struct bch_fs *c, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" "journal: l=%u %s\n", i, - l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), - l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); + l1, buf1.buf, + l2, buf2.buf); } fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 7cc2414893fc..e26642c01fd7 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -997,11 +997,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (dflags & ~flags) { if (print) { - char buf[100]; + struct printbuf buf = PRINTBUF; - bch2_replicas_entry_to_text(&PBUF(buf), e); + 
bch2_replicas_entry_to_text(&buf, e); bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf); + nr_online, buf.buf); + printbuf_exit(&buf); } ret = false; break; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 47eeb48c8c60..c616ce5ed194 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -572,16 +572,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts, { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; - char *_err; - struct printbuf err; + struct printbuf err = PRINTBUF; __le64 *i; int ret; - _err = kmalloc(4096, GFP_KERNEL); - if (!_err) - return -ENOMEM; - err = _PBUF(_err, 4096); - pr_verbose_init(*opts, ""); memset(sb, 0, sizeof(*sb)); @@ -633,8 +627,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts, goto err; printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", - path, _err); - err = _PBUF(_err, 4096); + path, err.buf); + printbuf_reset(&err); /* * Error reading primary superblock - read location of backup @@ -689,16 +683,16 @@ got_super: ret = bch2_sb_validate(sb, &err); if (ret) { printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", - path, _err); + path, err.buf); goto err_no_print; } out: pr_verbose_init(*opts, "ret %i", ret); - kfree(_err); + printbuf_exit(&err); return ret; err: printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", - path, _err); + path, err.buf); err_no_print: bch2_free_super(sb); goto out; @@ -768,6 +762,7 @@ int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; + struct printbuf err = PRINTBUF; unsigned i, sb = 0, nr_wrote; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; @@ -795,18 +790,11 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_from_fs(c, ca); for_each_online_member(ca, c, i) { - struct printbuf buf = { NULL, NULL }; + printbuf_reset(&err); - ret = bch2_sb_validate(&ca->disk_sb, &buf); + ret = bch2_sb_validate(&ca->disk_sb, &err); if (ret) { - char *_buf = kmalloc(4096, GFP_NOFS); - if (_buf) { - buf = _PBUF(_buf, 4096); - bch2_sb_validate(&ca->disk_sb, &buf); - } - - bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); - kfree(_buf); + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); percpu_ref_put(&ca->io_ref); goto out; } @@ -897,6 +885,7 @@ int bch2_write_super(struct bch_fs *c) out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + printbuf_exit(&err); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d9b69c4244d5..27716d6e962d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -870,12 +870,9 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - char buf[512]; - struct printbuf p = PBUF(buf); + struct printbuf p = PRINTBUF; bool first = true; - strcpy(buf, "(null)"); - if (c->opts.read_only) { pr_buf(&p, "ro"); first = false; @@ -897,7 +894,11 @@ static void print_mount_opts(struct bch_fs *c) bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); } - bch_info(c, "mounted with opts: %s", buf); + if (!p.pos) + pr_buf(&p, "(null)"); + + bch_info(c, "mounted with opts: %s", p.buf); + printbuf_exit(&p); } int bch2_fs_start(struct bch_fs *c) @@ -1561,11 +1562,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) data = bch2_dev_has_data(c, ca); if (data) { - char data_has_str[100]; + struct printbuf data_has = PRINTBUF; - bch2_flags_to_text(&PBUF(data_has_str), - 
bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + bch2_flags_to_text(&data_has, bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); + printbuf_exit(&data_has); ret = -EBUSY; goto err; } @@ -1614,16 +1615,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; - char *_errbuf; - struct printbuf errbuf; + struct printbuf errbuf = PRINTBUF; int ret; - _errbuf = kmalloc(4096, GFP_KERNEL); - if (!_errbuf) - return -ENOMEM; - - errbuf = _PBUF(_errbuf, 4096); - ret = bch2_read_super(path, &opts, &sb); if (ret) { bch_err(c, "device add error: error reading super: %i", ret); @@ -1741,7 +1735,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - kfree(_errbuf); + printbuf_exit(&errbuf); return ret; err_late: up_write(&c->state_lock); @@ -1906,8 +1900,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; - char *_errbuf = NULL; - struct printbuf errbuf; + struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -1920,14 +1913,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - _errbuf = kmalloc(4096, GFP_KERNEL); - if (!_errbuf) { - ret = -ENOMEM; - goto err; - } - - errbuf = _PBUF(_errbuf, 4096); - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); if (!sb) { ret = -ENOMEM; @@ -1991,7 +1976,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } out: kfree(sb); - kfree(_errbuf); + printbuf_exit(&errbuf); module_put(THIS_MODULE); pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1a3068f658a1..ce32b9068518 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -46,8 +46,28 @@ struct sysfs_ops type ## _sysfs_ops = { \ } #define SHOW(fn) \ +static ssize_t fn ## _to_text(struct printbuf *, \ + struct kobject *, struct attribute *);\ + \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ +{ \ + struct printbuf out = PRINTBUF; \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (!ret && out.allocation_failure) \ + ret = -ENOMEM; \ + \ + if (!ret) { \ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ + printbuf_exit(&out); \ + return ret; \ +} \ + \ +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ + struct attribute *attr) #define STORE(fn) \ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ @@ -64,22 +84,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ #define sysfs_printf(file, fmt, ...) 
\ do { \ if (attr == &sysfs_ ## file) \ - return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ + pr_buf(out, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ do { \ if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ + snprint(out, var); \ } while (0) #define sysfs_hprint(file, val) \ do { \ - if (attr == &sysfs_ ## file) { \ - bch2_hprint(&out, val); \ - pr_buf(&out, "\n"); \ - return out.pos - buf; \ - } \ + if (attr == &sysfs_ ## file) \ + bch2_hprint(out, val); \ } while (0) #define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -348,7 +365,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -365,10 +381,8 @@ SHOW(bch2_fs) sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) { - bch2_gc_gens_pos_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -378,83 +392,54 @@ SHOW(bch2_fs) max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); - if (attr == &sysfs_rebalance_work) { - bch2_rebalance_work_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_rebalance_work) + bch2_rebalance_work_to_text(out, c); sysfs_print(promote_whole_extents, c->promote_whole_extents); /* Debugging: */ - if (attr == &sysfs_journal_debug) { - bch2_journal_debug_to_text(&out, &c->journal); - return out.pos - buf; - } + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_journal_pins) { - bch2_journal_pins_to_text(&out, &c->journal); - return out.pos - buf; - } + if (attr == &sysfs_journal_pins) + bch2_journal_pins_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) { - bch2_btree_updates_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_dirty_btree_nodes) { - bch2_dirty_btree_nodes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_dirty_btree_nodes) + bch2_dirty_btree_nodes_to_text(out, c); - if (attr == &sysfs_btree_cache) { - bch2_btree_cache_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, c); - if (attr == &sysfs_btree_key_cache) { - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); - return out.pos - buf; - } + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - if (attr == &sysfs_btree_transactions) { - bch2_btree_trans_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_transactions) + bch2_btree_trans_to_text(out, c); - if (attr == &sysfs_stripes_heap) { - bch2_stripes_heap_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_stripes_heap) + bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) { - bch2_open_buckets_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c); - if (attr == &sysfs_compression_stats) { - bch2_compression_stats_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); - if (attr == &sysfs_new_stripes) { - 
bch2_new_stripes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); - if (attr == &sysfs_io_timers_read) { - bch2_io_timers_to_text(&out, &c->io_clock[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_timers_write) { - bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_read) + bch2_io_timers_to_text(out, &c->io_clock[READ]); - if (attr == &sysfs_data_jobs) { - data_progress_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + + if (attr == &sysfs_data_jobs) + data_progress_to_text(out, c); return 0; } @@ -567,7 +552,7 @@ struct attribute *bch2_fs_files[] = { SHOW(bch2_fs_internal) { struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - return bch2_fs_show(&c->kobj, attr, buf); + return bch2_fs_to_text(out, &c->kobj, attr); } STORE(bch2_fs_internal) @@ -617,16 +602,15 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); - pr_buf(&out, "\n"); + bch2_opt_to_text(out, c, opt, v, OPT_SHOW_FULL_LIST); + pr_char(out, '\n'); - return out.pos - buf; + return 0; } STORE(bch2_fs_opts_dir) @@ -690,13 +674,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - struct printbuf out = _PBUF(buf, PAGE_SIZE); #define x(name) \ - if (attr == &sysfs_time_stat_##name) { \ - bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ - return out.pos - buf; \ - } + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); BCH_TIME_STATS() #undef x @@ -812,7 +793,6 @@ SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -825,58 +805,47 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(&out, c->disk_sb.sb, + bch2_disk_path_to_text(out, c->disk_sb.sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); } - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } if (attr == &sysfs_has_data) { - bch2_flags_to_text(&out, bch2_data_types, + bch2_flags_to_text(out, bch2_data_types, bch2_dev_has_data(c, ca)); - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(&out, bch2_member_states, + bch2_string_opt_to_text(out, bch2_member_states, ca->mi.state); - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } - if (attr == &sysfs_iodone) { - dev_iodone_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_iodone) + dev_iodone_to_text(out, ca); sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) { - bch2_time_stats_to_text(&out, &ca->io_latency[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_latency_stats_write) { - bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); - 
return out.pos - buf; - } + if (attr == &sysfs_io_latency_stats_read) + bch2_time_stats_to_text(out, &ca->io_latency[READ]); + + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) { - reserve_stats_to_text(&out, ca); - return out.pos - buf; - } - if (attr == &sysfs_alloc_debug) { - dev_alloc_debug_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_reserve_stats) + reserve_stats_to_text(out, ca); + + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 1f7f2533e544..978d92e0b5eb 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -871,7 +871,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20], nr_buf[20], per_sec_buf[20]; + char name_buf[20]; + struct printbuf nr_buf = PRINTBUF; + struct printbuf per_sec_buf = PRINTBUF; unsigned i; u64 time; @@ -932,13 +934,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, time = j.finish - j.start; scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); + bch2_hprint(&nr_buf, nr); + bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf, nr_threads, + name_buf, nr_buf.buf, nr_threads, div_u64(time, NSEC_PER_SEC), div_u64(time * nr_threads, nr), - per_sec_buf); + per_sec_buf.buf); + printbuf_exit(&per_sec_buf); + printbuf_exit(&nr_buf); return j.ret; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 2296658b9f0d..7a896ddc9a22 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -99,6 +99,38 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) +static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) +{ + unsigned new_size = roundup_pow_of_two(out->size + extra); + char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} + +void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_realloc(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? 
printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + void bch2_hprint(struct printbuf *buf, s64 v) { int u, t = 0; @@ -151,9 +183,6 @@ void bch2_flags_to_text(struct printbuf *out, unsigned bit, nr = 0; bool first = true; - if (out->pos != out->end) - *out->pos = '\0'; - while (list[nr]) nr++; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 58427edcfaa4..7667944f9ae4 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -242,19 +242,39 @@ enum printbuf_units { }; struct printbuf { - char *pos; - char *end; - char *last_newline; - char *last_field; + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; unsigned indent; - enum printbuf_units units; - unsigned tabstop; - unsigned tabstops[4]; + enum printbuf_units units:8; + u8 atomic; + bool allocation_failure:1; + u8 tabstop; + u8 tabstops[4]; }; +#define PRINTBUF ((struct printbuf) { NULL }) + +static inline void printbuf_exit(struct printbuf *buf) +{ + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ +} + +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->last_newline = 0; + buf->last_field = 0; + buf->indent = 0; + buf->tabstop = 0; +} + static inline size_t printbuf_remaining(struct printbuf *buf) { - return buf->end - buf->pos; + return buf->size - buf->pos; } static inline size_t printbuf_linelen(struct printbuf *buf) @@ -262,29 +282,13 @@ static inline size_t printbuf_linelen(struct printbuf *buf) return buf->pos - buf->last_newline; } -#define _PBUF(_buf, _len) \ - ((struct printbuf) { \ - .pos = _buf, \ - .end = _buf + _len, \ - .last_newline = _buf, \ - .last_field = _buf, \ - }) +void bch2_pr_buf(struct printbuf *out, const char *fmt, ...); -#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) - - -#define pr_buf(_out, ...) \ -do { \ - (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ - __VA_ARGS__); \ -} while (0) +#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) static inline void pr_char(struct printbuf *out, char c) { - if (printbuf_remaining(out) > 1) { - *out->pos = c; - out->pos++; - } + bch2_pr_buf(out, "%c", c); } static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) @@ -298,7 +302,7 @@ static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) { if (buf->last_newline + buf->indent == buf->pos) { buf->pos -= spaces; - buf->pos = '\0'; + buf->buf[buf->pos] = '\0'; } buf->indent -= spaces; } @@ -341,12 +345,12 @@ static inline void pr_tab_rjust(struct printbuf *buf) BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); if (shift > 0) { - memmove(buf->last_field + shift, - buf->last_field, + memmove(buf->buf + buf->last_field + shift, + buf->buf + buf->last_field, move); - memset(buf->last_field, ' ', shift); + memset(buf->buf + buf->last_field, ' ', shift); buf->pos += shift; - *buf->pos = 0; + buf->buf[buf->pos] = 0; } buf->last_field = buf->pos; @@ -460,8 +464,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res) _r; \ }) -#define snprint(buf, size, var) \ - snprintf(buf, size, \ +#define snprint(out, var) \ + pr_buf(out, \ type_is(var, int) ? "%i\n" \ : type_is(var, unsigned) ? "%u\n" \ : type_is(var, long) ? 
"%li\n" \ @@ -605,10 +609,8 @@ do { \ sysfs_print(name##_rate_d_term, (var)->d_term); \ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ \ - if (attr == &sysfs_##name##_rate_debug) { \ - bch2_pd_controller_debug_to_text(&out, var); \ - return out.pos - buf; \ - } \ + if (attr == &sysfs_##name##_rate_debug) \ + bch2_pd_controller_debug_to_text(out, var); \ } while (0) #define sysfs_pd_controller_store(name, var) \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9cce3953ee0c..f4e20e796ba0 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -426,9 +426,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; int id, inode_opt_id; - char buf[512]; - struct printbuf out = PBUF(buf); - unsigned val_len; + struct printbuf out = PRINTBUF; + int ret; u64 v; id = bch2_opt_lookup(name); @@ -451,14 +450,19 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, v = bch2_opt_get_by_id(&opts, id); bch2_opt_to_text(&out, c, opt, v, 0); - val_len = out.pos - buf; + ret = out.pos; - if (buffer && val_len > size) - return -ERANGE; + if (out.allocation_failure) { + ret = -ENOMEM; + } else if (buffer) { + if (out.pos > size) + ret = -ERANGE; + else + memcpy(buffer, out.buf, out.pos); + } - if (buffer) - memcpy(buffer, buf, val_len); - return val_len; + printbuf_exit(&out); + return ret; } static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -- cgit From cb598111836fbca03b1353a6238cde8a66e5ddf0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 22:14:35 -0500 Subject: bcachefs: Fix journal_flush_done() journal_flush_done() was overwriting did_work, thus occasionally returning false when it did do work and occasional assertions in the shutdown sequence because we didn't completely flush the key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 2d5382a83003..3cc980b07285 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -759,7 +759,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; + if (journal_flush_pins(j, seq_to_flush, 0, 0)) + *did_work = true; spin_lock(&j->lock); /* -- cgit From a0a07c59f5b4646b9371a1c119feeb6ee52b0012 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 22:33:01 -0500 Subject: bcachefs: Fix btree path sorting In btree_update_interior.c, we were changing a path's level directly - which affects path sort order - without re-sorting paths, leading to assertions when bch2_path_get() verified paths were sorted correctly. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- fs/bcachefs/btree_update_interior.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 92258281fdc7..b65cd3566872 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1797,11 +1797,12 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) bch2_bpos_to_text(&buf1, path->pos); - printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, path->should_be_locked ? " S" : "", path->preserve ? " P" : "", bch2_btree_ids[path->btree_id], + path->level, buf1.buf, path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ed0a70f7ea68..49e475c15451 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1918,6 +1918,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; iter2.path->level++; + trans->paths_sorted = false; + ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); if (ret) -- cgit From 2975cd4701b71d5b28753861a9388bf67db26231 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 22:45:58 -0500 Subject: bcachefs: Don't spin in journal reclaim If we're not able to flush anything, we shouldn't keep looping. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3cc980b07285..449f4fbfa326 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -661,7 +661,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (nr_flushed) wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && !direct); + } while ((min_nr || min_key_cache) && nr_flushed && !direct); memalloc_noreclaim_restore(flags); -- cgit From 55334d78974fa44735bb59229eedde0bcc300ed6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Feb 2022 20:25:15 -0500 Subject: bcachefs: Kill BCH_FS_HOLD_BTREE_WRITES This was just dead code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_cache.c | 3 +-- fs/bcachefs/btree_io.c | 3 --- fs/bcachefs/btree_update_interior.c | 3 +-- 4 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 378061712c76..e5bc09870c57 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -534,7 +534,6 @@ enum { BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, BCH_FS_REBUILD_REPLICAS, - BCH_FS_HOLD_BTREE_WRITES, }; struct btree_debug { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index dbf3b084478f..a8d5c06541d0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -223,8 +223,7 @@ wait_on_io: goto out_unlock; if (btree_node_dirty(b)) { - if (!flush || - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + if (!flush) goto out_unlock; /* * Using the underscore version because we don't want to compact diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1dc21b5948ea..06704299640d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1772,9 +1772,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta if (already_started) goto do_write; - if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - return; - /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against racing with other threads that may be trying to start diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 49e475c15451..6b793c9e95f4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1104,8 +1104,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written && - !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); + BUG_ON(!b->written); old = btree_node_root(c, b); -- cgit From de517c95513d420d465ca26d354a56d9e6ed6e17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Feb 2022 11:10:20 -0500 Subject: bcachefs: Use x-macros for btree node flags This is for adding an array of strings for btree node flag names. 
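For readers unfamiliar with the idiom, a minimal sketch of the x-macro pattern (flag names here are illustrative, not the actual BTREE_FLAGS() list): a single list of flags expands into both an enum and a parallel array of strings, so the two can never drift out of sync.

#define EXAMPLE_FLAGS()		\
	x(dirty)		\
	x(need_write)		\
	x(write_in_flight)

enum example_flags {
#define x(f)	EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
	EXAMPLE_FLAG_NR
};

static const char * const example_flag_names[] = {
#define x(f)	#f,
	EXAMPLE_FLAGS()
#undef x
	NULL
};

The same list can be expanded a third time into the test/set/clear helpers, which is what the patch below does for the btree node flags.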
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 +++++- fs/bcachefs/btree_cache.h | 2 ++ fs/bcachefs/btree_io.h | 9 ++---- fs/bcachefs/btree_types.h | 56 +++++++++++++++++-------------------- fs/bcachefs/btree_update_interior.c | 6 ++-- fs/bcachefs/btree_update_leaf.c | 2 +- 6 files changed, 41 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a8d5c06541d0..a6b8ca85fc94 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -13,6 +13,13 @@ #include #include +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() +#undef x + NULL +}; + void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -413,7 +420,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); btree_node_data_free(c, b); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index a08d12569075..96f8f90e85a1 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -5,6 +5,8 @@ #include "bcachefs.h" #include "btree_types.h" +extern const char * const bch2_btree_node_flags[]; + struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 095ad505338d..a1dea8e85e4d 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -15,18 +15,13 @@ struct btree; struct btree_iter; struct btree_node_read_all; -static inline bool btree_node_dirty(struct btree *b) -{ - return test_bit(BTREE_NODE_dirty, &b->flags); -} - -static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) atomic_inc(&c->btree_cache.dirty); } -static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) +static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) atomic_dec(&c->btree_cache.dirty); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d5c2a776ee1b..165466db222d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -429,7 +429,29 @@ struct btree_trans { struct replicas_delta_list *fs_usage_deltas; }; -#define BTREE_FLAG(flag) \ +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ @@ -439,36 +461,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } -enum btree_flags { - BTREE_NODE_read_in_flight, - BTREE_NODE_read_error, - BTREE_NODE_dirty, - BTREE_NODE_need_write, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_write_in_flight_inner, - BTREE_NODE_just_written, - BTREE_NODE_dying, - BTREE_NODE_fake, - BTREE_NODE_need_rewrite, - 
BTREE_NODE_never_write, -}; - -BTREE_FLAG(read_in_flight); -BTREE_FLAG(read_error); -BTREE_FLAG(need_write); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(write_in_flight_inner); -BTREE_FLAG(just_written); -BTREE_FLAG(dying); -BTREE_FLAG(fake); -BTREE_FLAG(need_rewrite); -BTREE_FLAG(never_write); +BTREE_FLAGS() +#undef x static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6b793c9e95f4..f4ee78e84f71 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -271,7 +271,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev six_lock_write(&b->c.lock, NULL, NULL); set_btree_node_accessed(b); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); @@ -868,7 +868,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); /* @@ -1172,7 +1172,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4b0e00f32a96..94d0b8bd014b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -221,7 +221,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; -- cgit From 734f7141ce45360203ddaa64782279653c1c9588 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Dec 2022 17:13:08 -0500 Subject: bcachefs: Improve struct journal layout This cacheline aligns struct journal, and puts j->reservations and j->prereserved on their own cacheline - we may want to split them up in a separate patch. 
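A rough sketch of the layout idea (an illustrative struct, not the real struct journal): the fields that every journal reservation hammers with atomic operations get a cacheline of their own, so the fast path does not false-share with the read-mostly fields that follow.

#include <linux/cache.h>
#include <linux/types.h>

struct example_journal {
	/* hot: updated by cmpxchg on every reservation */
	struct {
		u64	reservations;
		u64	prereserved;
	} ____cacheline_aligned_in_smp;

	/* read-mostly fields start on the next cacheline */
	unsigned	cur_entry_u64s;
	unsigned	cur_entry_sectors;
} ____cacheline_aligned_in_smp;

Because the anonymous member is cacheline aligned, its size is padded out to a full line, pushing everything after it onto a fresh cacheline.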
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_types.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 9facd3f128bb..3012b374625f 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -156,15 +156,24 @@ enum { /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ - - unsigned long flags; + struct { union journal_res_state reservations; + union journal_preres_state prereserved; + + } __aligned(SMP_CACHE_BYTES); + + unsigned long flags; + /* Max size of current journal entry */ unsigned cur_entry_u64s; unsigned cur_entry_sectors; + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + /* * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: @@ -178,13 +187,7 @@ struct journal { cur_entry_insufficient_devices, } cur_entry_error; - union journal_preres_state prereserved; - - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - unsigned buf_size_want; - /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. @@ -277,7 +280,7 @@ struct journal { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; #endif -}; +} __aligned(SMP_CACHE_BYTES); /* * Embedded in struct bch_dev. First three fields refer to the array of journal -- cgit From 75ef2c59bc2f4d3c3ecd48286ac36ee7b868321c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Feb 2022 11:48:34 -0500 Subject: bcachefs: Start moving debug info from sysfs to debugfs In sysfs, files can only output at most PAGE_SIZE. This is a problem for debug info that needs to list an arbitrary number of times, and because of this limit some of our debug info has been terser and harder to read than we'd like. This patch moves info about journal pins and cached btree nodes to debugfs, and greatly expands and improves the output we return. 
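For context, the PAGE_SIZE cap is inherent to sysfs: a ->show() hook formats into a single preallocated page. debugfs files have no such limit. As a point of comparison only (the patch below rolls its own chunked reads on top of a printbuf rather than using seq_file), a minimal debugfs file with unbounded output usually looks like this, with illustrative names:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int example_pins_show(struct seq_file *m, void *unused)
{
	unsigned i;

	/* output is not limited to a single page */
	for (i = 0; i < 10000; i++)
		seq_printf(m, "pin %u\n", i);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(example_pins);

static void example_debugfs_init(struct dentry *parent)
{
	debugfs_create_file("journal_pins", 0400, parent, NULL,
			    &example_pins_fops);
}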
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 +- fs/bcachefs/btree_io.c | 27 -------- fs/bcachefs/btree_io.h | 1 - fs/bcachefs/debug.c | 176 +++++++++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/journal.c | 56 +++++++++++----- fs/bcachefs/journal.h | 1 + fs/bcachefs/sysfs.c | 10 --- 7 files changed, 206 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e5bc09870c57..6cda77ad4342 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -538,9 +538,6 @@ enum { struct btree_debug { unsigned id; - struct dentry *btree; - struct dentry *btree_format; - struct dentry *failed; }; struct bch_fs_pcpu { @@ -885,7 +882,8 @@ mempool_t bio_bounce_pages; struct bch_memquota_type quotas[QTYP_NR]; /* DEBUG JUNK */ - struct dentry *debug; + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 06704299640d..fd7f2a78473c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2106,30 +2106,3 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) { __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } - -void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - if (!(flags & (1 << BTREE_NODE_dirty))) - continue; - - pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", - b, - (flags & (1 << BTREE_NODE_dirty)) != 0, - (flags & (1 << BTREE_NODE_need_write)) != 0, - b->c.level, - b->written, - !list_empty_careful(&b->write_blocked), - b->will_make_reachable != 0, - b->will_make_reachable & 1); - } - rcu_read_unlock(); -} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index a1dea8e85e4d..638a9b30f0cb 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -177,7 +177,6 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index c3bfa7f1d77d..1fff03d301a9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -185,9 +185,10 @@ out: /* XXX: bch_fs refcounting */ struct dump_iter { - struct bpos from; - struct bch_fs *c; + struct bch_fs *c; enum btree_id id; + struct bpos from; + u64 iter; struct printbuf buf; @@ -226,6 +227,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->from = POS_MIN; + i->iter = 0; i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); i->id = bd->id; i->buf = PRINTBUF; @@ -420,10 +422,148 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch2_read_bfloat_failed, }; +static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + out->tabstops[0] = 32; + + pr_buf(out, "%px btree=%s l=%u ", + b, + bch2_btree_ids[b->c.btree_id], + b->c.level); + pr_newline(out); + + pr_indent_push(out, 2); + + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_newline(out); + + pr_buf(out, "flags: "); + pr_tab(out); + bch2_flags_to_text(out, 
bch2_btree_node_flags, b->flags); + pr_newline(out); + + pr_buf(out, "written:"); + pr_tab(out); + pr_buf(out, "%u", b->written); + pr_newline(out); + + pr_buf(out, "writes blocked:"); + pr_tab(out); + pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); + pr_newline(out); + + pr_buf(out, "will make reachable:"); + pr_tab(out); + pr_buf(out, "%lx", b->will_make_reachable); + pr_newline(out); + + pr_buf(out, "journal pin %px:", &b->writes[0].journal); + pr_tab(out); + pr_buf(out, "%llu", b->writes[0].journal.seq); + pr_newline(out); + + pr_buf(out, "journal pin %px:", &b->writes[1].journal); + pr_tab(out); + pr_buf(out, "%llu", b->writes[1].journal.seq); + pr_newline(out); + + pr_indent_pop(out, 2); +} + +static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + rcu_read_lock(); + i->buf.atomic++; + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++;; + } else { + done = true; + } + --i->buf.atomic; + rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations cached_btree_nodes_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_cached_btree_nodes_read, +}; + +static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); + i->iter++; + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations journal_pins_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_journal_pins_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove_recursive(c->debug); + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) + debugfs_remove_recursive(c->fs_debug_dir); } void bch2_fs_debug_init(struct bch_fs *c) @@ -435,29 +575,39 @@ void bch2_fs_debug_init(struct bch_fs *c) return; snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - c->debug = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->debug)) + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; + + debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, + c->btree_debug, &cached_btree_nodes_ops); + + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; for (bd = c->btree_debug; bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], - 0400, 
c->debug, bd, - &btree_debug_ops); + debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", bch2_btree_ids[bd->id]); - bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, - &btree_format_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", bch2_btree_ids[bd->id]); - bd->failed = debugfs_create_file(name, 0400, c->debug, bd, - &bfloat_failed_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); } } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a579e6483d1e..0cbd86d04636 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1283,35 +1283,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) spin_unlock(&j->lock); } -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - u64 i; spin_lock(&j->lock); + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + out->atomic++; - fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(out, "%llu: count %u\n", - i, atomic_read(&pin_list->count)); + pin_list = journal_seq_pin(j, *seq); - list_for_each_entry(pin, &pin_list->key_cache_list, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + pr_newline(out); + pr_indent_push(out, 2); - list_for_each_entry(pin, &pin_list->list, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + list_for_each_entry(pin, &pin_list->list, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); + } + + list_for_each_entry(pin, &pin_list->key_cache_list, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); + } - if (!list_empty(&pin_list->flushed)) - pr_buf(out, "flushed:\n"); + if (!list_empty(&pin_list->flushed)) { + pr_buf(out, "flushed:"); + pr_newline(out); + } - list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + list_for_each_entry(pin, &pin_list->flushed, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); } + pr_indent_pop(out, 2); + --out->atomic; spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 5d263a5b8685..6c7a38ad2195 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -499,6 +499,7 @@ void bch2_journal_block(struct journal *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ce32b9068518..3018250d421b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -174,9 +174,7 @@ read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); 
read_attribute(journal_debug); -read_attribute(journal_pins); read_attribute(btree_updates); -read_attribute(dirty_btree_nodes); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); @@ -402,15 +400,9 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_journal_pins) - bch2_journal_pins_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_dirty_btree_nodes) - bch2_dirty_btree_nodes_to_text(out, c); - if (attr == &sysfs_btree_cache) bch2_btree_cache_to_text(out, c); @@ -564,9 +556,7 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_debug, - &sysfs_journal_pins, &sysfs_btree_updates, - &sysfs_dirty_btree_nodes, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, -- cgit From 39dcace83889f43d5619d07c2ec76c286c88a85b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Feb 2022 21:35:16 -0500 Subject: bcachefs: Fix locking in btree_node_write_done() There was a rare recursive locking bug, in __bch2_btree_node_write() nowrite path -> btree_node_write_done(), in the path that kicks off another write. This splits out an inner __btree_node_write_done() that expects to be run with the btree node lock held. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fd7f2a78473c..f4d6a6c5096d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1592,29 +1592,13 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, &w->journal); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); unsigned long old, new, v; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); - do { - old = new = v; - - if (old & (1U << BTREE_NODE_need_write)) - goto do_write; - - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - return; - -do_write: - six_lock_read(&b->c.lock, NULL, NULL); v = READ_ONCE(b->flags); do { old = new = v; @@ -1637,7 +1621,12 @@ do_write: if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, true); +} +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + six_lock_read(&b->c.lock, NULL, NULL); + __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); } @@ -1992,7 +1981,7 @@ err: b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - btree_node_write_done(c, b); + __btree_node_write_done(c, b); } /* -- cgit From 82732ef510b8455bbf9e9292b6fd04cb724bdadf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Feb 2022 21:46:41 -0500 Subject: bcachefs: Improve btree_node_write_if_need() btree_node_write_if_need() kicks off a btree node write only if need_write is set; this makes the locking easier to reason about by moving the check into the cmpxchg loop in __bch2_btree_node_write(). 
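A reduced sketch of what moving the check into the cmpxchg loop buys (flag names are illustrative): the decision whether a write is wanted and the state transition that claims the write are made from the same snapshot of the flags word, so need_write cannot change in between.

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/types.h>

enum { EX_dirty, EX_need_write, EX_write_in_flight };

#define EX_WRITE_ONLY_IF_NEED	(1U << 0)

static bool example_claim_write(unsigned long *flags, unsigned how)
{
	unsigned long old, new, v = READ_ONCE(*flags);

	do {
		old = new = v;

		if (!(old & (1UL << EX_dirty)))
			return false;

		if ((how & EX_WRITE_ONLY_IF_NEED) &&
		    !(old & (1UL << EX_need_write)))
			return false;

		if (old & (1UL << EX_write_in_flight))
			return false;

		new &= ~((1UL << EX_dirty)|(1UL << EX_need_write));
		new |= 1UL << EX_write_in_flight;
	} while ((v = cmpxchg(flags, old, new)) != old);

	return true;	/* the caller now owns this write */
}

With that in place, btree_node_write_if_need() reduces to calling the write path with the only-if-need flag set.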
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/btree_io.c | 22 ++++++++++++++-------- fs/bcachefs/btree_io.h | 13 ++++++------- fs/bcachefs/btree_update_interior.c | 12 ++++++------ 4 files changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a6b8ca85fc94..7b264619c276 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -239,9 +239,9 @@ wait_on_io: * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent); + bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); else - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -1064,7 +1064,7 @@ wait_on_io: six_lock_write(&b->c.lock, NULL, NULL); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f4d6a6c5096d..540bfe07c128 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -471,7 +471,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) }; if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write); + bch2_btree_node_write(c, b, SIX_LOCK_write, 0); reinit_iter = true; } } @@ -1620,7 +1620,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, true); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); } static void btree_node_write_done(struct bch_fs *c, struct btree *b) @@ -1741,7 +1741,7 @@ static void btree_write_submit(struct work_struct *work) bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1758,7 +1758,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta void *data; int ret; - if (already_started) + if (flags & BTREE_WRITE_ALREADY_STARTED) goto do_write; /* @@ -1774,13 +1774,18 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta if (!(old & (1 << BTREE_NODE_dirty))) return; + if ((flags & BTREE_WRITE_ONLY_IF_NEED) && + !(old & (1 << BTREE_NODE_need_write))) + return; + if (!btree_node_may_write(b)) return; if (old & (1 << BTREE_NODE_never_write)) return; - BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); + if (old & (1 << BTREE_NODE_write_in_flight)) + return; new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); @@ -2044,12 +2049,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * Use this one if the node is intent locked: */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held) + enum six_lock_type lock_type_held, + unsigned flags) { if (lock_type_held == SIX_LOCK_intent || (lock_type_held == SIX_LOCK_read && six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -2061,7 +2067,7 @@ void 
bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); if (lock_type_held == SIX_LOCK_write && btree_node_just_written(b)) bch2_btree_post_write_cleanup(c, b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 638a9b30f0cb..3dbb518c4da4 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -143,20 +143,19 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); +#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) +#define BTREE_WRITE_ALREADY_STARTED (1U << 1) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type); + enum six_lock_type, unsigned); static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, enum six_lock_type lock_held) { - if (b->written && - btree_node_need_write(b) && - btree_node_may_write(b) && - !btree_node_write_in_flight(b)) - bch2_btree_node_write(c, b, lock_held); + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } #define bch2_btree_node_write_cond(_c, _b, cond) \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f4ee78e84f71..fe0fc5ff1549 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1393,8 +1393,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); - bch2_btree_node_write(c, n2, SIX_LOCK_intent); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); /* * Note that on recursive parent_keys == keys, so we @@ -1413,7 +1413,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); - bch2_btree_node_write(c, n3, SIX_LOCK_intent); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); } } else { trace_btree_compact(c, b); @@ -1421,7 +1421,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1709,7 +1709,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1783,7 +1783,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, trace_btree_gc_rewrite_node(c, b); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); -- cgit From 6f5f747c318be4adf3824ee7716a7886da35f9a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Feb 2022 09:42:46 -0500 Subject: bcachefs: Kill bch2_btree_node_write_cond() bch2_btree_node_write_cond() was only used in one place - this inlines it into 
__btree_node_flush() and makes the cmpxchg loop actually correct. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.h | 16 ---------------- fs/bcachefs/btree_update_leaf.c | 18 ++++++++++++++++-- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 3dbb518c4da4..7ed88089f6f9 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -158,22 +158,6 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } -#define bch2_btree_node_write_cond(_c, _b, cond) \ -do { \ - unsigned long old, new, v = READ_ONCE((_b)->flags); \ - \ - do { \ - old = new = v; \ - \ - if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ - break; \ - \ - new |= (1 << BTREE_NODE_need_write); \ - } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ - \ - btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -} while (0) - void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 94d0b8bd014b..dc4dfcda8f21 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -168,10 +168,24 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); + unsigned long old, new, v; + unsigned idx = w - b->writes; six_lock_read(&b->c.lock, NULL, NULL); - bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && w->journal.seq == seq)); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); return 0; } -- cgit From bf3efff5e4fc2dcd6e6c15578d3f08c301a13229 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Feb 2022 09:56:33 -0500 Subject: bcachefs: Fix race leading to btree node write getting stuck Checking btree_node_may_write() isn't atomic with the other btree flags, dirty and need_write in particular. There was a rare race where we'd unblock a node from writing while __btree_node_flush() was setting need_write, and no thread would notice that the node was now both able to write and needed to be written. Fix this by adding btree node flags for will_make_reachable and write_blocked that can be checked in the cmpxchg loop in __bch2_btree_node_write. 
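A sketch of the shape of the fix (illustrative names, not the real helpers): state that used to live outside the flags word - a non-empty write_blocked list, the will_make_reachable pointer - is mirrored into flag bits at the point where it changes, so the write path can decide from one atomic read of the flags word instead of mixing atomic and non-atomic checks that can interleave with __btree_node_flush() setting need_write.

#include <linux/bitops.h>
#include <linux/list.h>

enum { EX_write_blocked = 4, EX_will_make_reachable = 5 };

struct example_node {
	unsigned long		flags;
	struct list_head	write_blocked;
};

/* callers are assumed to hold the lock protecting the list */
static void example_block_write(struct example_node *b, struct list_head *entry)
{
	list_add(entry, &b->write_blocked);
	set_bit(EX_write_blocked, &b->flags);	/* now visible to the cmpxchg loop */
}

static void example_unblock_write(struct example_node *b, struct list_head *entry)
{
	list_del(entry);
	if (list_empty(&b->write_blocked))
		clear_bit(EX_write_blocked, &b->flags);
}

will_make_reachable is handled the same way: the bit is set and cleared alongside the pointer it mirrors.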
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 7 +++---- fs/bcachefs/btree_io.c | 10 +++++++--- fs/bcachefs/btree_io.h | 6 ------ fs/bcachefs/btree_types.h | 2 ++ fs/bcachefs/btree_update_interior.c | 7 +++++++ 5 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7b264619c276..5f96c5d1a064 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -223,10 +223,9 @@ wait_on_io: goto wait_on_io; } - if (btree_node_noevict(b)) - goto out_unlock; - - if (!btree_node_may_write(b)) + if (btree_node_noevict(b) || + btree_node_write_blocked(b) || + btree_node_will_make_reachable(b)) goto out_unlock; if (btree_node_dirty(b)) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 540bfe07c128..53f83340f69a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1606,7 +1606,8 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && !(old & (1U << BTREE_NODE_never_write)) && - btree_node_may_write(b)) { + !(old & (1U << BTREE_NODE_write_blocked)) && + !(old & (1U << BTREE_NODE_will_make_reachable))) { new &= ~(1U << BTREE_NODE_dirty); new &= ~(1U << BTREE_NODE_need_write); new |= (1U << BTREE_NODE_write_in_flight); @@ -1778,10 +1779,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) !(old & (1 << BTREE_NODE_need_write))) return; - if (!btree_node_may_write(b)) + if (old & + ((1 << BTREE_NODE_never_write)| + (1 << BTREE_NODE_write_blocked))) return; - if (old & (1 << BTREE_NODE_never_write)) + if (b->written && + (old & (1 << BTREE_NODE_will_make_reachable))) return; if (old & (1 << BTREE_NODE_write_in_flight)) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 7ed88089f6f9..d818d87661e8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -62,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *); void bch2_btree_node_wait_on_read(struct btree *); void bch2_btree_node_wait_on_write(struct btree *); -static inline bool btree_node_may_write(struct btree *b) -{ - return list_empty_careful(&b->write_blocked) && - (!b->written || !b->will_make_reachable); -} - enum compact_mode { COMPACT_LAZY, COMPACT_ALL, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 165466db222d..561406b4b7c2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -434,6 +434,8 @@ struct btree_trans { x(read_error) \ x(dirty) \ x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ x(noevict) \ x(write_idx) \ x(accessed) \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fe0fc5ff1549..17d65c9e2bd4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -606,6 +606,8 @@ err: mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); + if (list_empty(&b->write_blocked)) + clear_btree_node_write_blocked(b); /* * Node might have been freed, recheck under @@ -650,6 +652,7 @@ err: BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); } mutex_unlock(&c->btree_interior_update_lock); @@ -716,6 +719,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; + + set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, 
&b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); @@ -781,6 +786,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree as->new_nodes[as->nr_new_nodes++] = b; b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); mutex_unlock(&c->btree_interior_update_lock); @@ -803,6 +809,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) * xchg() is for synchronization with bch2_btree_complete_write: */ v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); as = (struct btree_update *) (v & ~1UL); if (!as) { -- cgit From 7db4cbd0a52554ddec4cabf2ebd69fc7bcd53a31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Feb 2022 11:57:42 -0500 Subject: bcachefs: Fix a memory leak This fixes a regression from "bcachefs: Heap allocate printbufs" - bch2_sb_field_validate() was leaking an error string. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c616ce5ed194..31b175a8fcd0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1423,24 +1423,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { }; static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *orig_err) + struct printbuf *err) { unsigned type = le32_to_cpu(f->type); - struct printbuf err = *orig_err; + struct printbuf field_err = PRINTBUF; int ret; if (type >= BCH_SB_FIELD_NR) return 0; - pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); - - ret = bch2_sb_field_ops[type]->validate(sb, f, &err); + ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); if (ret) { - pr_newline(&err); - bch2_sb_field_to_text(&err, sb, f); - *orig_err = err; + pr_buf(err, "Invalid superblock section %s: %s", + bch2_sb_fields[type], + field_err.buf); + pr_newline(err); + bch2_sb_field_to_text(err, sb, f); } + printbuf_exit(&field_err); return ret; } -- cgit From ddf11d8c60b0d46dd25520d388243b508d6e9016 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Feb 2022 11:34:21 -0500 Subject: bcachefs: Fix a use after free This fixes a regression from "bcachefs: Stash a copy of key being overwritten in btree_insert_entry". In btree_key_can_insert_cached(), we may reallocate the key cache key, invalidating pointers previously returned by peek() - fix it by issuing a transaction restart. 
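A userspace C sketch of the underlying idea may help here: if growing a buffer can move it, a caller still holding pointers obtained earlier must be told to restart rather than keep using them. The struct keybuf, keybuf_grow() and ERESTART names are invented for the example; in the patch the signal is trans->restarted plus -EINTR.

/* Illustrative sketch only - not the bcachefs key cache code */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ERESTART 1	/* stand-in for trans->restarted + -EINTR */

struct keybuf {
	char	*data;
	size_t	 len;
};

/*
 * Growing the buffer may move it.  If it does, any pointer a caller got
 * from an earlier lookup (the analogue of peek()) now dangles, so report
 * "restart" instead of letting the caller keep using it.
 */
static int keybuf_grow(struct keybuf *buf, size_t new_len)
{
	char *new_data = realloc(buf->data, new_len);

	if (!new_data)
		return -1;

	int moved = (new_data != buf->data);

	buf->data = new_data;
	buf->len  = new_len;
	return moved ? ERESTART : 0;
}

int main(void)
{
	struct keybuf buf = { .data = malloc(16), .len = 16 };
	uintptr_t old = (uintptr_t) buf.data;	/* pointer handed out earlier */

	if (keybuf_grow(&buf, 1 << 20) == ERESTART)
		printf("buffer moved (%#lx -> %p): restart, re-derive pointers\n",
		       (unsigned long) old, (void *) buf.data);

	free(buf.data);
	return 0;
}
/* End of illustrative sketch */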
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 13 ++++++++++++- fs/bcachefs/trace.h | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index dc4dfcda8f21..42ee54cf390d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -382,7 +382,18 @@ btree_key_can_insert_cached(struct btree_trans *trans, ck->u64s = new_u64s; ck->k = new_k; - return BTREE_INSERT_OK; + /* + * Keys returned by peek() are no longer valid pointers, so we need a + * transaction restart: + */ + trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + /* + * Not using btree_trans_restart() because we can't unlock here, we have + * write locks held: + */ + trans->restarted = true; + return -EINTR; } static inline void do_btree_insert_one(struct btree_trans *trans, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index b35022dc66c2..af3785254c71 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -918,6 +918,14 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); +DEFINE_EVENT(transaction_restart_iter, trans_restart_key_cache_key_realloced, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From 506bac7e59d93cfd883dab0697ed91850f319be6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 15:51:24 -0500 Subject: bcachefs: Delete some dead journal code Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0cbd86d04636..64875c8150f7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -272,6 +272,12 @@ static int journal_entry_open(struct journal *j) BUG_ON(!j->cur_entry_sectors); + /* We used to add things to the first journal entry before opening it, + * as a way to deal with a chicken-and-the-egg problem, but we shouldn't + * be anymore: + */ + BUG_ON(buf->data->u64s); + buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); @@ -280,7 +286,7 @@ static int journal_entry_open(struct journal *j) journal_entry_overhead(j); u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - if (u64s <= le32_to_cpu(buf->data->u64s)) + if (u64s <= 0) return cur_entry_journal_full; /* @@ -295,11 +301,9 @@ static int journal_entry_open(struct journal *j) if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) return cur_entry_insufficient_devices; - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); + new.cur_entry_offset = 0; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); -- cgit From fbec3b8800ac8244ce751d0ba5b83d94ee48fc76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Feb 2022 10:28:20 -0500 Subject: bcachefs: Kill JOURNAL_NEED_WRITE This replaces the journal flag JOURNAL_NEED_WRITE with per-journal buf state - more explicit, and solving a race in the old code that would lead to entries being opened and written unnecessarily. 
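A standalone C sketch of the per-buffer state this patch switches to: the flush request arms a deadline on the buffer itself, exactly once, and the write path compares the current time against that buffer's own deadline. struct jbuf and the helper names are illustrative, and CLOCK_MONOTONIC stands in for local_clock()/jiffies.

/* Illustrative sketch only - not the bcachefs journal */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct jbuf {
	uint64_t flush_time;	/* 0 = no flush requested for this buffer yet */
	uint64_t expires;	/* absolute deadline for writing this buffer */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/*
 * The "please flush" request is recorded on the buffer itself, once,
 * instead of in a filesystem-wide NEED_WRITE flag, so a later buffer can
 * never be confused with an earlier request.
 */
static void jbuf_want_flush(struct jbuf *buf, uint64_t delay_ns)
{
	if (!buf->flush_time) {
		buf->flush_time = now_ns();
		if (!buf->flush_time)
			buf->flush_time = 1;
		buf->expires = buf->flush_time + delay_ns;
	}
}

/* the write worker only looks at this buffer's own deadline */
static bool jbuf_should_write(const struct jbuf *buf)
{
	return buf->flush_time && now_ns() >= buf->expires;
}

int main(void)
{
	struct jbuf buf = { 0 };

	jbuf_want_flush(&buf, 0);	/* ask for an immediate flush */
	printf("should write: %d\n", jbuf_should_write(&buf));
	return 0;
}
/* End of illustrative sketch */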
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 63 ++++++++++++++++++++++++++++++++------------- fs/bcachefs/journal_io.c | 12 +++++---- fs/bcachefs/journal_types.h | 10 ++----- 3 files changed, 54 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 64875c8150f7..880ca2061012 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -86,6 +86,7 @@ static void bch2_journal_buf_init(struct journal *j) buf->noflush = false; buf->must_flush = false; buf->separate_flush = false; + buf->flush_time = 0; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -152,11 +153,6 @@ static bool __journal_entry_close(struct journal *j) return true; } - if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { - set_bit(JOURNAL_NEED_WRITE, &j->flags); - j->need_write_time = local_clock(); - } - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; @@ -205,7 +201,6 @@ static bool __journal_entry_close(struct journal *j) bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); @@ -216,15 +211,16 @@ static bool __journal_entry_close(struct journal *j) static bool journal_entry_want_write(struct journal *j) { union journal_res_state s = READ_ONCE(j->reservations); + struct journal_buf *buf = journal_cur_buf(j); bool ret = false; - /* - * Don't close it yet if we already have a write in flight, but do set - * NEED_WRITE: - */ - if (s.idx != s.unwritten_idx) - set_bit(JOURNAL_NEED_WRITE, &j->flags); - else + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + /* Don't close it yet if we already have a write in flight: */ + if (s.idx == s.unwritten_idx) ret = __journal_entry_close(j); return ret; @@ -278,6 +274,8 @@ static int journal_entry_open(struct journal *j) */ BUG_ON(buf->data->u64s); + buf->expires = jiffies + + msecs_to_jiffies(c->opts.journal_flush_delay); buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); @@ -337,8 +335,19 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + long delta; - journal_entry_close(j); + spin_lock(&j->lock); + buf = journal_cur_buf(j); + delta = buf->expires - jiffies; + + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j); + spin_unlock(&j->lock); } static int __journal_res_get(struct journal *j, struct journal_res *res, @@ -591,7 +600,11 @@ recheck_need_open: seq = res.seq; buf = j->buf + (seq & JOURNAL_BUF_MASK); buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -657,7 +670,11 @@ int bch2_journal_meta(struct journal *j) buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } bch2_journal_res_put(j, &res); @@ -1233,12 +1250,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "unwritten entry:\tidx %u 
refcount %u sectors %u\n", i, journal_state_count(s, i), j->buf[i].sectors); + pr_indent_push(out, 2); + + pr_buf(out, "refcount %u", journal_state_count(s, i)); + pr_newline(out); + + pr_buf(out, "sectors %u", j->buf[i].sectors); + pr_newline(out); + + pr_buf(out, "expires %li ms", jiffies_to_msecs(j->buf[i].expires - jiffies)); + pr_newline(out); + + pr_indent_pop(out, 2); } pr_buf(out, - "need write:\t\t%i\n" "replay done:\t\t%i\n", - test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); pr_buf(out, "space:\n"); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bbec4d85b6bc..724a8bb69978 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1403,13 +1403,15 @@ static void journal_write_done(struct closure *cl) closure_wake_up(&w->wait); journal_wake(j); - if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(c->io_complete_wq, &j->write_work, 0); - spin_unlock(&j->lock); + if (new.unwritten_idx == new.idx) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; - if (new.unwritten_idx != new.idx && - !journal_state_count(new, new.unwritten_idx)) + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } else if (!journal_state_count(new, new.unwritten_idx)) closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3012b374625f..36843fd0c7da 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -25,6 +25,8 @@ struct journal_buf { struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ + unsigned long expires; + u64 flush_time; unsigned buf_size; /* size in bytes of @data */ unsigned sectors; /* maximum size for current entry */ @@ -139,16 +141,9 @@ enum journal_space_from { journal_space_nr, }; -/* - * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, - * either because something's waiting on the write to complete or because it's - * been dirty too long and the timer's expired. - */ - enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, }; @@ -266,7 +261,6 @@ struct journal { unsigned long last_flush_write; u64 res_get_blocked_start; - u64 need_write_time; u64 write_start_time; u64 nr_flush_writes; -- cgit From dfc0f7ea00a71e12772d174e5f070dd5b1bf8981 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 16:21:07 -0500 Subject: bcachefs: bch2_journal_halt() now takes journal lock This change is prep work for moving some work from __journal_entry_close() to journal_entry_open(): without this change, journal_entry_open() doesn't know if it's going to be able to open a new journal entry until the cmpxchg loop, meaning it can't create the new journal pin entry and update other global state because those have to be done prior to the cmpxchg opening the new journal entry. Fortunately, we don't call bch2_journal_halt() from interrupt context. 
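A minimal pthread/stdatomic sketch of the structural change: the cmpxchg that marks the journal errored and the err_seq bookkeeping that depends on it now happen under one lock, which is safe precisely because halt is never called from interrupt context. struct journal_model and its constants are invented for the illustration.

/* Illustrative sketch only - not the bcachefs journal */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_ERROR_VAL UINT32_MAX

struct journal_model {
	pthread_mutex_t	 lock;			/* stands in for j->lock */
	_Atomic uint32_t cur_entry_offset;
	uint64_t	 err_seq;
	uint64_t	 cur_seq;
};

/*
 * The cmpxchg that puts the journal into the error state and the
 * bookkeeping that depends on it (err_seq) both run under the lock, so
 * they can no longer interleave with a concurrent entry open/close.
 */
static void journal_halt(struct journal_model *j)
{
	uint32_t old;

	pthread_mutex_lock(&j->lock);

	old = atomic_load(&j->cur_entry_offset);
	do {
		if (old == ENTRY_ERROR_VAL)
			goto out;		/* already halted */
	} while (!atomic_compare_exchange_weak(&j->cur_entry_offset,
					       &old, (uint32_t) ENTRY_ERROR_VAL));

	if (!j->err_seq)
		j->err_seq = j->cur_seq;
out:
	pthread_mutex_unlock(&j->lock);
}

int main(void)
{
	struct journal_model j = {
		.lock	 = PTHREAD_MUTEX_INITIALIZER,
		.cur_seq = 42,
	};

	journal_halt(&j);
	printf("err_seq = %llu\n", (unsigned long long) j.err_seq);
	return 0;
}
/* End of illustrative sketch */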
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 880ca2061012..0570cd1cb8cf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -96,12 +96,15 @@ static void bch2_journal_buf_init(struct journal *j) void bch2_journal_halt(struct journal *j) { union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + u64 v; + + spin_lock(&j->lock); + v = atomic64_read(&j->reservations.counter); do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; + goto out; new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -115,6 +118,8 @@ void bch2_journal_halt(struct journal *j) j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); +out: + spin_unlock(&j->lock); } /* journal entry close/open: */ @@ -266,6 +271,9 @@ static int journal_entry_open(struct journal *j) if (j->cur_entry_error) return j->cur_entry_error; + if (bch2_journal_error(j)) + return cur_entry_insufficient_devices; /* -EROFS */ + BUG_ON(!j->cur_entry_sectors); /* We used to add things to the first journal entry before opening it, @@ -296,8 +304,7 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return cur_entry_insufficient_devices; + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); -- cgit From b66fbf33425f30aacbbb95182c22d2df5b1d3b12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Mar 2022 15:31:20 -0500 Subject: bcachefs: Drop unneeded journal pin in bch2_btree_update_start() When we do an interior btree update, we create new btree nodes and link them into the btree in memory, but they don't become reachable on disk until later, when btree_update_nodes_written_trans() runs. Updates to the new nodes can thus happen before they're reachable on disk, and if the updates to those new nodes are written before the nodes become reachable, we would then drop the journal pin for those updates before the btree has them. This is what the journal pin in bch2_btree_update_start() was protecting against. However, it's not actually needed because we don't allow subsequent append writes to btree nodes until the node is reachable on disk. Dropping this unneeded pin also fixes a bug introduced by "bcachefs: Journal seq now incremented at entry open, not close" - in the new code, if the journal is completely empty a journal pin list for journal_cur_seq() won't exist. 
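For readers unfamiliar with journal pins, here is a toy model (plain C, not the bcachefs implementation): a pin is effectively a reference count on a journal sequence number, and the oldest pinned sequence bounds how far journal reclaim may advance - which is why an unneeded pin, like the one removed here, only delays reclaim.

/* Toy model only - not the bcachefs journal pin machinery */
#include <stdio.h>

#define NR_SEQS 8

/* one refcount per journal sequence number; a "pin" is just a held ref */
static unsigned pin_count[NR_SEQS];

static void pin_add(unsigned seq)  { pin_count[seq]++; }
static void pin_drop(unsigned seq) { pin_count[seq]--; }

/* oldest sequence that must still be kept (analogue of journal_last_seq()) */
static unsigned last_seq(void)
{
	for (unsigned seq = 0; seq < NR_SEQS; seq++)
		if (pin_count[seq])
			return seq;
	return NR_SEQS;
}

int main(void)
{
	pin_add(2);		/* e.g. a dirty btree node's first write */
	pin_add(5);

	printf("last_seq = %u\n", last_seq());	/* 2: seqs from 2 on can't be reclaimed */
	pin_drop(2);				/* drop the pin that wasn't needed */
	printf("last_seq = %u\n", last_seq());	/* 5: reclaim can advance */
	return 0;
}
/* End of toy model */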
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 17d65c9e2bd4..7e876a904c10 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -544,8 +544,6 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; - BUG_ON(!journal_pin_active(&as->journal)); - /* * Wait for any in flight writes to finish before we free the old nodes * on disk: @@ -1045,10 +1043,6 @@ retry: if (ret) goto err; - bch2_journal_pin_add(&c->journal, - atomic64_read(&c->journal.seq), - &as->journal, NULL); - return as; err: bch2_btree_update_free(as); -- cgit From f0a3a2ccabc5ae1e2c7d588a6a4f77d216b1d4cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 16:35:42 -0500 Subject: bcachefs: Journal seq now incremented at entry open, not close This patch changes journal_entry_open() to initialize the new journal entry, not __journal_entry_close(). This also means that journal_cur_seq() refers to the sequence number of the last journal entry when we don't have an open journal entry, not the next one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/journal.c | 126 +++++++++++++----------------------------- fs/bcachefs/journal_io.c | 4 +- fs/bcachefs/journal_reclaim.c | 8 +-- fs/bcachefs/super-io.c | 2 +- 5 files changed, 44 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8eae5fb35c84..8ec9c43d98e1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -781,7 +781,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > journal_cur_seq(&c->journal)); + k->k->version.lo > atomic64_read(&c->journal.seq)); ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); if (ret) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0570cd1cb8cf..41616dba982d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -21,16 +21,12 @@ static u64 last_unwritten_seq(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - - lockdep_assert_held(&j->lock); - - return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); + return j->seq_ondisk + 1; } static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { - return seq >= last_unwritten_seq(j); + return seq > j->seq_ondisk; } static bool __journal_entry_is_open(union journal_res_state state) @@ -49,8 +45,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) struct journal_buf *buf = NULL; EBUG_ON(seq > journal_cur_seq(j)); - EBUG_ON(seq == journal_cur_seq(j) && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); if (journal_seq_unwritten(j, seq)) { buf = j->buf + (seq & JOURNAL_BUF_MASK); @@ -68,31 +62,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) p->devs.nr = 0; } -static void journal_pin_new_entry(struct journal *j) -{ - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); -} - -static void bch2_journal_buf_init(struct journal *j) -{ - struct journal_buf *buf = journal_cur_buf(j); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = 
false; - buf->separate_flush = false; - buf->flush_time = 0; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; -} - void bch2_journal_halt(struct journal *j) { union journal_res_state old, new; @@ -200,11 +169,6 @@ static bool __journal_entry_close(struct journal *j) __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - /* Initialize new buffer: */ - journal_pin_new_entry(j); - - bch2_journal_buf_init(j); - cancel_delayed_work(&j->write_work); bch2_journal_space_available(j); @@ -274,27 +238,47 @@ static int journal_entry_open(struct journal *j) if (bch2_journal_error(j)) return cur_entry_insufficient_devices; /* -EROFS */ - BUG_ON(!j->cur_entry_sectors); + if (!fifo_free(&j->pin)) + return cur_entry_journal_pin_full; - /* We used to add things to the first journal entry before opening it, - * as a way to deal with a chicken-and-the-egg problem, but we shouldn't - * be anymore: - */ - BUG_ON(buf->data->u64s); + BUG_ON(!j->cur_entry_sectors); - buf->expires = jiffies + + buf->expires = + (journal_cur_seq(j) == j->flushed_seq_ondisk + ? jiffies + : j->last_flush_write) + msecs_to_jiffies(c->opts.journal_flush_delay); + buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); u64s = (int) (buf->sectors << 9) / sizeof(u64) - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= 0) return cur_entry_journal_full; + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); + + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; + /* * Must be set before marking the journal entry as open: */ @@ -305,8 +289,8 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + BUG_ON(journal_state_count(new, new.idx)); - EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); new.cur_entry_offset = 0; } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -595,9 +579,12 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, seq = max(seq, last_unwritten_seq(j)); recheck_need_open: - if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + if (seq > journal_cur_seq(j)) { struct journal_res res = { 0 }; + if (journal_entry_is_open(j)) + __journal_entry_close(j); + spin_unlock(&j->lock); ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); @@ -694,42 +681,12 @@ int bch2_journal_meta(struct journal *j) */ void bch2_journal_flush_async(struct journal *j, struct closure *parent) { - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); - - bch2_journal_flush_seq_async(j, seq, parent); + bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); } int bch2_journal_flush(struct journal *j) { - u64 seq, 
journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); - - return bch2_journal_flush_seq(j, seq); + return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); } /* @@ -1022,8 +979,7 @@ void bch2_fs_journal_stop(struct journal *j) BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_REPLAY_DONE, &j->flags) && - (journal_entry_is_open(j) || - j->last_empty_seq + 1 != journal_cur_seq(j))); + j->last_empty_seq != journal_cur_seq(j)); cancel_delayed_work_sync(&j->write_work); bch2_journal_reclaim_stop(j); @@ -1093,11 +1049,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - journal_pin_new_entry(j); - - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - - bch2_journal_buf_init(j); + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j) + 1; c->last_bucket_seq_cleanup = journal_cur_seq(j); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 724a8bb69978..0d6bede8abfa 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1368,8 +1368,6 @@ static void journal_write_done(struct closure *cl) journal_seq_pin(j, seq)->devs = w->devs_written; if (!err) { - j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; @@ -1377,6 +1375,8 @@ static void journal_write_done(struct closure *cl) } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; + j->seq_ondisk = seq; + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 449f4fbfa326..213047bb2b00 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -226,8 +226,6 @@ void bch2_journal_space_available(struct journal *j) ret = cur_entry_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; - else if (!fifo_free(&j->pin)) - ret = cur_entry_journal_pin_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -369,9 +367,6 @@ static inline void __journal_pin_drop(struct journal *j, if (atomic_dec_and_test(&pin_list->count) && pin_list == &fifo_peek_front(&j->pin)) bch2_journal_reclaim_fast(j); - else if (fifo_used(&j->pin) == 1 && - atomic_read(&pin_list->count) == 1) - journal_wake(j); } void bch2_journal_pin_drop(struct journal *j, @@ -769,8 +764,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, */ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || - (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1); + !fifo_used(&j->pin); spin_unlock(&j->lock); mutex_unlock(&j->reclaim_lock); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 31b175a8fcd0..03a8ae496668 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1344,7 +1344,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -- cgit From 30ef633a0b46e06860f46bf7df0f5a313e6e1a19 
Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 19:17:27 -0500 Subject: bcachefs: Refactor journal code to not use unwritten_idx It makes the code more readable if we work off of sequence numbers, instead of direct indexes into the array of journal buffers. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 96 +++++++++++++++++++++++++------------------ fs/bcachefs/journal.h | 5 +++ fs/bcachefs/journal_io.c | 7 ++-- fs/bcachefs/journal_reclaim.c | 28 +++++-------- fs/bcachefs/journal_types.h | 2 +- 5 files changed, 76 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 41616dba982d..3de1a7488d5e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -19,11 +19,6 @@ #include "super-io.h" #include "trace.h" -static u64 last_unwritten_seq(struct journal *j) -{ - return j->seq_ondisk + 1; -} - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -34,6 +29,11 @@ static bool __journal_entry_is_open(union journal_res_state state) return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + static bool journal_entry_is_open(struct journal *j) { return __journal_entry_is_open(j->reservations); @@ -166,6 +166,7 @@ static bool __journal_entry_close(struct journal *j) */ buf->last_seq = journal_last_seq(j); buf->data->last_seq = cpu_to_le64(buf->last_seq); + BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); @@ -179,18 +180,19 @@ static bool __journal_entry_close(struct journal *j) static bool journal_entry_want_write(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - struct journal_buf *buf = journal_cur_buf(j); - bool ret = false; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } + bool ret = !journal_entry_is_open(j) || + (journal_cur_seq(j) == journal_last_unwritten_seq(j) && + __journal_entry_close(j)); /* Don't close it yet if we already have a write in flight: */ - if (s.idx == s.unwritten_idx) - ret = __journal_entry_close(j); + if (!ret && nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + } return ret; } @@ -310,8 +312,8 @@ static int journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); + bool ret = atomic64_read(&j->seq) == j->seq_ondisk || + bch2_journal_error(j); if (!ret) journal_entry_close(j); @@ -416,7 +418,7 @@ unlock: if ((ret == cur_entry_journal_full || ret == cur_entry_journal_pin_full) && !can_discard && - j->reservations.idx == j->reservations.unwritten_idx && + !nr_unwritten_journal_entries(j) && (flags & JOURNAL_RES_GET_RESERVED)) { struct printbuf buf = PRINTBUF; @@ -576,7 +578,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, } /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, last_unwritten_seq(j)); + seq = max(seq, journal_last_unwritten_seq(j)); recheck_need_open: if (seq > journal_cur_seq(j)) { @@ -709,13 +711,13 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (seq <= c->journal.flushed_seq_ondisk) goto 
out; - for (unwritten_seq = last_unwritten_seq(j); + for (unwritten_seq = journal_last_unwritten_seq(j); unwritten_seq < seq; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) + if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) goto out; buf->noflush = true; @@ -940,17 +942,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { - union journal_res_state state; bool ret = false; - unsigned i; + u64 seq; spin_lock(&j->lock); - state = READ_ONCE(j->reservations); - i = state.idx; + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j) && !ret; + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); - while (i != state.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; - if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); @@ -1012,6 +1013,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); @@ -1164,15 +1166,18 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) union journal_res_state s; struct bch_dev *ca; unsigned long now = jiffies; + u64 seq; unsigned i; out->atomic++; + out->tabstops[0] = 24; rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "dirty journal entries:\t%llu\n", fifo_used(&j->pin)); pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); @@ -1191,33 +1196,42 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(out, "error\n"); + pr_buf(out, "error"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(out, "closed\n"); + pr_buf(out, "closed"); break; default: - pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); + pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); + pr_newline(out); - i = s.idx; - while (i != s.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; + for (seq = journal_cur_seq(j); + seq >= journal_last_unwritten_seq(j); + --seq) { + i = seq & JOURNAL_BUF_MASK; - pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", - i, journal_state_count(s, i), j->buf[i].sectors); + pr_buf(out, "unwritten entry:"); + pr_tab(out); + pr_buf(out, "%llu", seq); + pr_newline(out); pr_indent_push(out, 2); - pr_buf(out, "refcount %u", journal_state_count(s, i)); + pr_buf(out, "refcount:"); + pr_tab(out); + pr_buf(out, "%u", journal_state_count(s, i)); pr_newline(out); - pr_buf(out, "sectors %u", j->buf[i].sectors); + pr_buf(out, "sectors:"); + pr_tab(out); + pr_buf(out, "%u", j->buf[i].sectors); pr_newline(out); - pr_buf(out, "expires %li ms", jiffies_to_msecs(j->buf[i].expires - jiffies)); + pr_buf(out, 
"expires"); + pr_tab(out); + pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); pr_newline(out); pr_indent_pop(out, 2); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6c7a38ad2195..409d32b784d2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -141,6 +141,11 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } +static inline u64 journal_last_unwritten_seq(struct journal *j) +{ + return j->seq_ondisk + 1; +} + static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0d6bede8abfa..7c8298ddad25 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1331,7 +1331,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) { - return j->buf + j->reservations.unwritten_idx; + return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } static void journal_write_done(struct closure *cl) @@ -1403,12 +1403,13 @@ static void journal_write_done(struct closure *cl) closure_wake_up(&w->wait); journal_wake(j); - if (new.unwritten_idx == new.idx) { + if (journal_last_unwritten_seq(j) == journal_cur_seq(j)) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); - } else if (!journal_state_count(new, new.unwritten_idx)) + } else if (journal_last_unwritten_seq(j) < journal_cur_seq(j) && + !journal_state_count(new, new.unwritten_idx)) closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 213047bb2b00..39f4b2eebac0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -59,25 +59,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } -static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) -{ - unsigned sectors = 0; - - while (!sectors && *idx != j->reservations.idx) { - sectors = j->buf[*idx].sectors; - - *idx = (*idx + 1) & JOURNAL_BUF_MASK; - } - - return sectors; -} - static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; + unsigned sectors, buckets, unwritten; + u64 seq; if (from == journal_space_total) return (struct journal_space) { @@ -92,7 +80,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, * We that we don't allocate the space for a journal entry * until we write it out - thus, account for it here: */ - while ((unwritten = get_unwritten_sectors(j, &idx))) { + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; + + if (!unwritten) + continue; + /* entry won't fit on this device, skip: */ if (unwritten > ca->mi.bucket_size) continue; @@ -214,8 +209,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!clean_ondisk && - j->reservations.idx == - j->reservations.unwritten_idx) { + journal_cur_seq(j) == j->seq_ondisk) { struct printbuf buf = PRINTBUF; __bch2_journal_debug_to_text(&buf, j); diff --git a/fs/bcachefs/journal_types.h 
b/fs/bcachefs/journal_types.h index 36843fd0c7da..3d9810e48e9d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -25,7 +25,7 @@ struct journal_buf { struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ - unsigned long expires; + long expires; u64 flush_time; unsigned buf_size; /* size in bytes of @data */ -- cgit From 24a3d53b28398d2edd4dc717bede21eaf4a3b874 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 19:29:19 -0500 Subject: bcachefs: __journal_entry_close() never fails Previous patch just moved responsibility for incrementing the journal sequence number and initializing the new journal entry from __journal_entry_close() to journal_entry_open(); this patch makes the analagous change for journal reservation state, incrementing the index into array of journal_bufs at open time. This means that __journal_entry_close() never fails to close an open journal entry, which is important for the next patch that will change our emergency shutdown behaviour. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 50 ++++++++++++++++++++------------------------- fs/bcachefs/journal.h | 3 --- fs/bcachefs/journal_io.c | 18 +++++++++++----- fs/bcachefs/journal_types.h | 1 + 4 files changed, 36 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3de1a7488d5e..b427e252ec8e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -106,7 +106,7 @@ void __bch2_journal_buf_put(struct journal *j) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static bool __journal_entry_close(struct journal *j) +static void __journal_entry_close(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -119,21 +119,15 @@ static bool __journal_entry_close(struct journal *j) do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return true; + return; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { /* this entry will never be written: */ closure_wake_up(&buf->wait); - return true; + return; } new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; - new.idx++; - - if (new.idx == new.unwritten_idx) - return false; - - BUG_ON(journal_state_count(new, new.idx)); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -175,17 +169,17 @@ static bool __journal_entry_close(struct journal *j) bch2_journal_space_available(j); bch2_journal_buf_put(j, old.idx); - return true; } static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || - (journal_cur_seq(j) == journal_last_unwritten_seq(j) && - __journal_entry_close(j)); + journal_cur_seq(j) == journal_last_unwritten_seq(j); /* Don't close it yet if we already have a write in flight: */ - if (!ret && nr_unwritten_journal_entries(j)) { + if (ret) + __journal_entry_close(j); + else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); if (!buf->flush_time) { @@ -221,15 +215,15 @@ static bool journal_entry_close(struct journal *j) static int journal_entry_open(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); + struct journal_buf *buf = j->buf + + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; u64 v; - 
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) return cur_entry_blocked; @@ -243,6 +237,9 @@ static int journal_entry_open(struct journal *j) if (!fifo_free(&j->pin)) return cur_entry_journal_pin_full; + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) + return cur_entry_max_in_flight; + BUG_ON(!j->cur_entry_sectors); buf->expires = @@ -291,7 +288,10 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + + new.idx++; BUG_ON(journal_state_count(new, new.idx)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); journal_state_inc(&new); new.cur_entry_offset = 0; @@ -390,18 +390,11 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - if (journal_entry_is_open(j) && - !__journal_entry_close(j)) { - /* - * We failed to get a reservation on the current open journal - * entry because it's full, and we can't close it because - * there's still a previous one in flight: - */ + __journal_entry_close(j); + ret = journal_entry_open(j); + + if (ret == cur_entry_max_in_flight) trace_journal_entry_full(c); - ret = cur_entry_blocked; - } else { - ret = journal_entry_open(j); - } unlock: if ((ret && ret != cur_entry_insufficient_devices) && !j->res_get_blocked_start) { @@ -1051,7 +1044,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j) + 1; + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + j->reservations.unwritten_idx++; c->last_bucket_seq_cleanup = journal_cur_seq(j); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 409d32b784d2..948e8b53dffd 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,9 +264,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf3_count = idx == 3, }).v, &j->reservations.counter); - EBUG_ON(((s.idx - idx) & 3) > - ((s.idx - s.unwritten_idx) & 3)); - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) __bch2_journal_buf_put(j); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7c8298ddad25..90743fa13ff4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1392,7 +1392,7 @@ static void journal_write_done(struct closure *cl) v = atomic64_read(&j->reservations.counter); do { old.v = new.v = v; - BUG_ON(new.idx == new.unwritten_idx); + BUG_ON(journal_state_count(new, new.unwritten_idx)); new.unwritten_idx++; } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -1403,14 +1403,22 @@ static void journal_write_done(struct closure *cl) closure_wake_up(&w->wait); journal_wake(j); - if (journal_last_unwritten_seq(j) == journal_cur_seq(j)) { + if (!journal_state_count(new, new.unwritten_idx) && + journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; + /* + * We don't close a journal entry to write it while there's + * previous entries still in flight - the current journal entry + * might want to be written now: + */ + mod_delayed_work(c->io_complete_wq, 
&j->write_work, max(0L, delta)); - } else if (journal_last_unwritten_seq(j) < journal_cur_seq(j) && - !journal_state_count(new, new.unwritten_idx)) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } spin_unlock(&j->lock); } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3d9810e48e9d..330c5d79e645 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -176,6 +176,7 @@ struct journal { enum { cur_entry_ok, cur_entry_blocked, + cur_entry_max_in_flight, cur_entry_journal_full, cur_entry_journal_pin_full, cur_entry_journal_stuck, -- cgit From e0c014e7e4ccd4d865b637721e3e580505c29b07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Feb 2022 18:48:33 -0500 Subject: bcachefs: Finish writing journal after journal error After emergency shutdown, all journal entries will be written as noflush entries, meaning they will never be used - but they'll still exist for debugging tools to examine. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 70 +++++++++++++++++------------------------------- fs/bcachefs/journal_io.c | 10 +++---- 2 files changed, 30 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b427e252ec8e..9df600d55da0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -62,35 +62,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) p->devs.nr = 0; } -void bch2_journal_halt(struct journal *j) -{ - union journal_res_state old, new; - u64 v; - - spin_lock(&j->lock); - - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - goto out; - - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - /* - * XXX: we're not using j->lock here because this can be called from - * interrupt context, this can race with journal_write_done() - */ - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); -out: - spin_unlock(&j->lock); -} - /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -106,7 +77,7 @@ void __bch2_journal_buf_put(struct journal *j) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j) +static void __journal_entry_close(struct journal *j, unsigned closed_val) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -114,23 +85,24 @@ static void __journal_entry_close(struct journal *j) u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; + BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && + closed_val != JOURNAL_ENTRY_ERROR_VAL); + lockdep_assert_held(&j->lock); do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return; + new.cur_entry_offset = closed_val; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { - /* this entry will never be written: */ - closure_wake_up(&buf->wait); + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || + old.cur_entry_offset == new.cur_entry_offset) return; - } - - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + if (!__journal_entry_is_open(old)) + return; + /* Close out old buffer: 
*/ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -171,6 +143,15 @@ static void __journal_entry_close(struct journal *j) bch2_journal_buf_put(j, old.idx); } +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -178,7 +159,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -312,8 +293,7 @@ static int journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - bool ret = atomic64_read(&j->seq) == j->seq_ondisk || - bch2_journal_error(j); + bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) journal_entry_close(j); @@ -339,7 +319,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); spin_unlock(&j->lock); } @@ -390,7 +370,7 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ret = journal_entry_open(j); if (ret == cur_entry_max_in_flight) @@ -526,7 +506,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -578,7 +558,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 90743fa13ff4..54587ff29771 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1513,11 +1513,11 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - (w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { + if (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; -- cgit From 05a49d22750ec4977b52c9da09039a931c0f2644 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Mar 2022 11:04:01 -0500 Subject: bcachefs: Make bch2_btree_cache_scan() try harder Previously, when bch2_btree_cache_scan() attempted to reclaim a node but failed (because trylock failed, because it was dirty, etc.), it would count that against the number of nodes it was scanning and attempting to free. This patch changes that behaviour, so that now we only count nodes that we then don't free if they have the accessed bit (which we also clear). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5f96c5d1a064..0976b9d7a619 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -327,17 +327,13 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, } restart: list_for_each_entry_safe(b, t, &bc->live, list) { - touched++; - - if (touched >= nr) { - /* Save position */ - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - break; + /* tweak this */ + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + goto touched; } - if (!btree_node_accessed(b) && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { /* can't call bch2_btree_node_hash_remove under lock */ freed++; if (&t->list != &bc->live) @@ -358,8 +354,18 @@ restart: else if (!mutex_trylock(&bc->lock)) goto out; goto restart; - } else - clear_btree_node_accessed(b); + } else { + continue; + } +touched: + touched++; + + if (touched >= nr) { + /* Save position */ + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); + break; + } } mutex_unlock(&bc->lock); -- cgit From ee68105f619b90ef7daef9f9ebab1270209d6151 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Mar 2022 19:15:46 -0500 Subject: bcachefs: Simplify parameters to bch2_btree_update_start() We don't need to pass the number of nodes required to bch2_btree_update_start, just whether we're doing a split at @level. This is prep work for a fix to our usage of six lock's percpu mode, which is going to require us to count up and allocate interior nodes and leaf nodes seperately. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 42 ++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7e876a904c10..523d1146b2e2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -934,7 +934,7 @@ static void bch2_btree_update_done(struct btree_update *as) static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, unsigned nr_nodes, unsigned flags) + unsigned level, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -942,6 +942,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; + unsigned nr_nodes; + unsigned update_level = level; int journal_flags = 0; int ret = 0; @@ -952,11 +954,26 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); retry: + nr_nodes = 0; + + while (1) { + nr_nodes += 1 + split; + update_level++; + + if (!btree_path_node(path, update_level)) + break; + + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + split = update_level + 1 < BTREE_MAX_DEPTH; + } + + /* Might have to allocate a new root: */ + if (update_level < BTREE_MAX_DEPTH) + nr_nodes += 1; - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, path->btree_id, &path->pos); @@ -1559,14 +1576,13 @@ int bch2_btree_split_leaf(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; as = bch2_btree_update_start(trans, path, path->level, - btree_update_reserve_required(c, b), flags); + true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1677,11 +1693,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, goto out; parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, - btree_update_reserve_required(c, parent) + 1, - flags| + as = bch2_btree_update_start(trans, path, level, false, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_USE_RESERVE| + flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -1764,10 +1779,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, parent = btree_node_parent(iter->path, b); as = bch2_btree_update_start(trans, iter->path, b->c.level, - (parent - ? btree_update_reserve_required(c, parent) - : 0) + 1, - flags); + false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) { trace_btree_gc_rewrite_node_fail(c, b); -- cgit From 5b3f780540aa5e39859a0c00ace61713da054a0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Mar 2022 19:50:28 -0500 Subject: bcachefs: Refactor bch2_btree_node_mem_alloc() This is prep work for the next patch, which is going to fix our usage of the percpu mode of six locks by never switching struct btree between the two modes - which means we need separate freed lists. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 84 +++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 0976b9d7a619..42253ca17f04 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -565,52 +565,54 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; + struct btree *b, *b2; u64 start_time = local_clock(); unsigned flags; flags = memalloc_nofs_save(); mutex_lock(&bc->lock); - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. Check if there's any freed nodes there: - */ - list_for_each_entry(b, &bc->freeable, list) - if (!btree_node_reclaim(c, b)) - goto got_node; - /* * We never free struct btree itself, just the memory that holds the on * disk node. 
Check the freed list before allocating a new one: */ list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) + if (!btree_node_reclaim(c, b)) { + list_del_init(&b->list); goto got_node; + } + + b = __btree_node_mem_alloc(c); + if (!b) + goto err_locked; - b = NULL; + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); got_node: - if (b) - list_del_init(&b->list); - mutex_unlock(&bc->lock); - if (!b) { - b = __btree_node_mem_alloc(c); - if (!b) - goto err; + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) + if (!btree_node_reclaim(c, b2)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + list_move(&b2->list, &bc->freed); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; + } - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - } + mutex_unlock(&bc->lock); - if (!b->data) { - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; - mutex_lock(&bc->lock); - bc->used++; - mutex_unlock(&bc->lock); - } + mutex_lock(&bc->lock); + bc->used++; +got_mem: + mutex_unlock(&bc->lock); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); @@ -632,20 +634,24 @@ out: return b; err: mutex_lock(&bc->lock); - - if (b) { - list_add(&b->list, &bc->freed); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - } - +err_locked: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { - b = btree_node_cannibalize(c); - list_del_init(&b->list); - mutex_unlock(&bc->lock); + b2 = btree_node_cannibalize(c); + bch2_btree_node_hash_remove(bc, b2); + + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + list_move(&b2->list, &bc->freed); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { + b = b2; + list_del_init(&b->list); + } - bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); trace_btree_node_cannibalize(c); goto out; -- cgit From 3098553776a16c08446c408005090423d62e6b54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Mar 2022 19:16:04 -0500 Subject: bcachefs: Fix usage of six lock's percpu mode Six locks have a percpu mode, which we use for interior btree nodes, as well as btree key cache keys for the subvolumes btree. We've been switching locks back and forth between percpu and non percpu mode as needed, but it turns out this is racy - when we're reusing an existing node, other threads could be attempting to lock it while we're switching it between modes. This patch fixes this by never switching 'struct btree' between the two modes, and instead segragating them between two different freed lists. 
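To make the race concrete, here is an illustrative pseudocode sketch (not taken from the patch below; the real fix lives in btree_node_to_freedlist() and bch2_btree_node_mem_alloc()):

	/*
	 * thread A, reusing a cached node:         thread B, concurrent locker:
	 *
	 *   six_lock_pcpu_free(&b->c.lock);          six_lock_read(&b->c.lock);
	 *     tears down the lock's percpu             may still be using the
	 *     reader counts                            reader counts being freed
	 *
	 * With the lock mode fixed at allocation time and freed nodes kept on
	 * separate freed_pcpu/freed_nonpcpu lists, a node that is visible to
	 * other threads never changes mode, so the conversion above no longer
	 * happens.
	 */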
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 41 +++++++++------ fs/bcachefs/btree_cache.h | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_key_cache.c | 10 ++-- fs/bcachefs/btree_types.h | 3 +- fs/bcachefs/btree_update_interior.c | 99 ++++++++++++++++++++----------------- fs/bcachefs/btree_update_interior.h | 6 ++- fs/bcachefs/debug.c | 5 ++ 8 files changed, 99 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 42253ca17f04..92a8cc704cab 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -40,6 +40,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) +{ + if (b->c.lock.readers) + list_move(&b->list, &bc->freed_pcpu); + else + list_move(&b->list, &bc->freed_nonpcpu); +} + static void btree_node_data_free(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; @@ -56,7 +64,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) b->aux_data = NULL; bc->used--; - list_move(&b->list, &bc->freed); + + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -162,11 +171,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; - if (level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free_rcu(&b->c.lock); - mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) @@ -432,8 +436,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(atomic_read(&c->btree_cache.dirty)); - while (!list_empty(&bc->freed)) { - b = list_first_entry(&bc->freed, struct btree, list); + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + while (!list_empty(&bc->freed_nonpcpu)) { + b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); six_lock_pcpu_free(&b->c.lock); kfree(b); @@ -487,7 +493,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) mutex_init(&bc->lock); INIT_LIST_HEAD(&bc->live); INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed); + INIT_LIST_HEAD(&bc->freed_pcpu); + INIT_LIST_HEAD(&bc->freed_nonpcpu); } /* @@ -562,9 +569,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) { struct btree_cache *bc = &c->btree_cache; + struct list_head *freed = pcpu_read_locks + ? &bc->freed_pcpu + : &bc->freed_nonpcpu; struct btree *b, *b2; u64 start_time = local_clock(); unsigned flags; @@ -576,7 +586,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) * We never free struct btree itself, just the memory that holds the on * disk node. 
Check the freed list before allocating a new one: */ - list_for_each_entry(b, &bc->freed, list) + list_for_each_entry(b, freed, list) if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; @@ -586,6 +596,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) if (!b) goto err_locked; + if (pcpu_read_locks) + six_lock_pcpu_alloc(&b->c.lock); + BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); got_node: @@ -598,7 +611,7 @@ got_node: if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); - list_move(&b2->list, &bc->freed); + btree_node_to_freedlist(bc, b2); six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -643,7 +656,7 @@ err_locked: if (b) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); - list_move(&b2->list, &bc->freed); + btree_node_to_freedlist(bc, b2); six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); } else { @@ -688,7 +701,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return ERR_PTR(-EINTR); } - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, level != 0); if (trans && b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 96f8f90e85a1..83723805f12a 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -20,7 +20,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 53f83340f69a..3031b566a112 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1542,7 +1542,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 70f31b5379e7..7e41552a57df 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -166,13 +166,13 @@ btree_key_cache_create(struct bch_fs *c, } was_new = false; + } else { + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); } - if (btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); - ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 561406b4b7c2..51eb686331bf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -152,7 +152,8 @@ struct btree_cache { struct mutex lock; struct list_head live; struct list_head freeable; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; /* Number of elements in live + freeable lists */ unsigned used; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 523d1146b2e2..43022b340f4e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -181,6 +181,7 @@ static 
void bch2_btree_node_free_inmem(struct btree_trans *trans, static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct disk_reservation *res, struct closure *cl, + bool interior_node, unsigned flags) { struct write_point *wp; @@ -242,7 +243,7 @@ retry: bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, interior_node); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev { struct bch_fs *c = as->c; struct btree *b; + struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; int ret; BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!as->nr_prealloc_nodes); + BUG_ON(!p->nr); - b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + b = p->b[--p->nr]; six_lock_intent(&b->c.lock, NULL, NULL); six_lock_write(&b->c.lock, NULL, NULL); @@ -377,43 +379,49 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) static void bch2_btree_reserve_put(struct btree_update *as) { struct bch_fs *c = as->c; + struct prealloc_nodes *p; mutex_lock(&c->btree_reserve_cache_lock); - while (as->nr_prealloc_nodes) { - struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + for (p = as->prealloc_nodes; + p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); + p++) { + while (p->nr) { + struct btree *b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_buckets_put(c, &b->ob); + } - __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } } mutex_unlock(&c->btree_reserve_cache_lock); } -static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, +static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2], unsigned flags, struct closure *cl) { struct bch_fs *c = as->c; struct btree *b; + unsigned interior; int ret; - BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node @@ -423,23 +431,28 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, if (ret) return ret; - while (as->nr_prealloc_nodes < nr_nodes) { - b = __bch2_btree_node_alloc(c, &as->disk_res, - flags & BTREE_INSERT_NOWAIT - ? NULL : cl, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err_free; - } + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { + b = __bch2_btree_node_alloc(c, &as->disk_res, + flags & BTREE_INSERT_NOWAIT + ? 
NULL : cl, + interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; + } - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + p->b[p->nr++] = b; + } } bch2_btree_cache_cannibalize_unlock(c); return 0; -err_free: +err: bch2_btree_cache_cannibalize_unlock(c); - trace_btree_reserve_get_fail(c, nr_nodes, cl); + trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl); return ret; } @@ -942,7 +955,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; - unsigned nr_nodes; + unsigned nr_nodes[2]; unsigned update_level = level; int journal_flags = 0; int ret = 0; @@ -954,10 +967,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); retry: - nr_nodes = 0; + nr_nodes[0] = nr_nodes[1] = 0; + update_level = level; while (1) { - nr_nodes += 1 + split; + nr_nodes[!!update_level] += 1 + split; update_level++; if (!btree_path_node(path, update_level)) @@ -972,7 +986,7 @@ retry: /* Might have to allocate a new root: */ if (update_level < BTREE_MAX_DEPTH) - nr_nodes += 1; + nr_nodes[1] += 1; if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, @@ -1050,7 +1064,7 @@ retry: } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * btree_sectors(c), + (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) @@ -1085,11 +1099,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) list_del_init(&b->list); mutex_unlock(&c->btree_cache.lock); - if (b->c.level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free(&b->c.lock); - mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && (b->c.level < btree_node_root(c, b)->c.level || @@ -2015,7 +2024,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite return -EINTR; } - new_hash = bch2_btree_node_mem_alloc(c); + new_hash = bch2_btree_node_mem_alloc(c, false); } path->intent_ref++; @@ -2091,7 +2100,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, false); bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 8dc86fa636d6..e72eb8795616 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -76,8 +76,10 @@ struct btree_update { struct journal_entry_pin journal; /* Preallocated nodes we reserve when we start the update: */ - struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; - unsigned nr_prealloc_nodes; + struct prealloc_nodes { + struct btree *b[BTREE_UPDATE_NODES_MAX]; + unsigned nr; + } prealloc_nodes[2]; /* Nodes being freed: */ struct keylist old_keys; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 1fff03d301a9..457fcee7d8e1 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); pr_newline(out); + pr_buf(out, "pcpu read locks: "); + pr_tab(out); + pr_buf(out, "%u", b->c.lock.readers != NULL); + pr_newline(out); + pr_buf(out, "written:"); pr_tab(out); pr_buf(out, "%u", b->written); -- cgit From 102a6a8f69b06df0e6594af4932ef47804f645c6 Mon 
Sep 17 00:00:00 2001 From: Daniel Hill Date: Sat, 5 Mar 2022 17:45:27 +1300 Subject: bcachefs: respect superblock discard flag. We were accidentally using default mount options and overwriting the discard flag. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 27716d6e962d..0bc78c50150a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1167,9 +1167,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; - if (opt_defined(c->opts, discard)) - ca->mi.discard = opt_get(c->opts, discard); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, -- cgit From 4eea53de8a1882e75d3640dce06c8c2874a77b05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Mar 2022 21:57:11 -0500 Subject: bcachefs: Fix transaction path overflow in fiemap Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2aaeee585157..310e317738b9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -935,7 +935,8 @@ retry: bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while ((k = bch2_btree_iter_peek(&iter)).k && + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter.pos, end) < 0) { enum btree_id data_btree = BTREE_ID_extents; -- cgit From 5521b1dfa20262a9cb8d1214c095c9ca2a4cb127 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Mar 2022 12:01:16 -0500 Subject: bcachefs: Convert bch2_sb_to_text to master option list Options no longer have to be manually added to bch2_sb_to_text() - it now uses the master list of options in opts.h. Also, improve some of the formatting by converting it to tabstops. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 87 +++++++++------------ fs/bcachefs/disk_groups.h | 4 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/opts.c | 45 ++++++++--- fs/bcachefs/opts.h | 62 +++++++-------- fs/bcachefs/super-io.c | 191 ++++++++++++++++++++++------------------------ fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/xattr.c | 2 +- 9 files changed, 198 insertions(+), 199 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index e9ee37f1e07d..97eb21827cb3 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -445,7 +445,10 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } -void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) { struct target t = target_decode(v); @@ -453,60 +456,46 @@ void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) case TARGET_NULL: pr_buf(out, "none"); break; - case TARGET_DEV: { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_member *m = mi->members + t.dev; - - if (bch2_dev_exists(sb, mi, t.dev)) { - pr_buf(out, "Device "); - pr_uuid(out, m->uuid.b); - pr_buf(out, " (%u)", t.dev); + case TARGET_DEV: + if (c) { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? 
rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + pr_buf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + pr_buf(out, "offline device %u", t.dev); + } else { + pr_buf(out, "invalid device %u", t.dev); + } + + rcu_read_unlock(); } else { - pr_buf(out, "Bad device %u", t.dev); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_member *m = mi->members + t.dev; + + if (bch2_dev_exists(sb, mi, t.dev)) { + pr_buf(out, "Device "); + pr_uuid(out, m->uuid.b); + pr_buf(out, " (%u)", t.dev); + } else { + pr_buf(out, "Bad device %u", t.dev); + } } - break; - } case TARGET_GROUP: - bch2_disk_path_to_text(out, sb, t.group); - break; - default: - BUG(); - } -} - -void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - pr_buf(out, "none"); - break; - case TARGET_DEV: { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - pr_buf(out, "/dev/%pg", ca->disk_sb.bdev); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - pr_buf(out, "offline device %u", t.dev); + if (c) { + mutex_lock(&c->sb_lock); + bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); + mutex_unlock(&c->sb_lock); } else { - pr_buf(out, "invalid device %u", t.dev); + bch2_disk_path_to_text(out, sb, t.group); } - - rcu_read_unlock(); - break; - } - case TARGET_GROUP: - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); - mutex_unlock(&c->sb_lock); break; default: BUG(); diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index a274aacbdf92..de915480514b 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -77,10 +77,8 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); -void bch2_sb_target_to_text(struct printbuf *, struct bch_sb *, u64); - int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); int bch2_sb_disk_groups_to_cpu(struct bch_fs *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 310e317738b9..4c68cee013e3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1691,7 +1691,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) continue; printbuf_reset(&buf); - bch2_opt_to_text(&buf, c, opt, v, + bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); seq_puts(seq, buf.buf); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 71bf26eb13d5..e78d3b75f6fb 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -96,6 +96,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; +u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) +{ + BUG(); +} + +void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) +{ + BUG(); +} + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) 
\ @@ -280,7 +290,8 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, return bch2_opt_validate(opt, msg, *res); } -void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, +void bch2_opt_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_sb *sb, const struct bch_option *opt, u64 v, unsigned flags) { @@ -310,7 +321,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, opt->choices[v]); break; case BCH_OPT_FN: - opt->to_text(out, c, v); + opt->to_text(out, c, sb, v); break; default: BUG(); @@ -431,6 +442,22 @@ out: return ret; } +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +{ + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + return v; +} + /* * Initial options from superblock - here we don't want any options undefined, * any options the superblock doesn't specify are set to 0: @@ -444,16 +471,10 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) const struct bch_option *opt = bch2_opt_table + id; u64 v; - if (opt->get_sb == NO_SB_OPT) + if (opt->get_sb == BCH2_NO_SB_OPT) continue; - v = opt->get_sb(sb); - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; + v = bch2_opt_from_sb(sb, id); ret = bch2_opt_validate(opt, "superblock option ", v); if (ret) @@ -467,7 +488,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; if (opt->flags & OPT_SB_FIELD_SECTORS) @@ -481,7 +502,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b03cac016f0b..fffe3e066864 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -42,7 +42,8 @@ static inline const char *bch2_d_type_str(unsigned d_type) */ /* dummy option, for options that aren't stored in the superblock */ -LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); +u64 BCH2_NO_SB_OPT(const struct bch_sb *); +void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { @@ -202,7 +203,7 @@ enum opt_type { x(btree_node_mem_ptr_optimization, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -229,7 +230,7 @@ enum opt_type { x(inline_data, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ @@ -254,22 +255,22 @@ enum opt_type { x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Enable discard/TRIM support") \ x(verbose, 
u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ @@ -291,48 +292,48 @@ enum opt_type { x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ x(rebuild_replicas, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ x(read_journal_only, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Only read the journal, skip the rest of recovery")\ x(journal_transaction_names, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -342,64 +343,64 @@ enum opt_type { x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, BCH_SB_SECTOR, \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ OPT_FS, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nostart, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ x(buckets_nouse, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ x(fs_size, u64, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ OPT_DEVICE, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - NO_SB_OPT, 1, \ + BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be 
considered\n"\ "to have already been replicated n times") @@ -466,7 +467,7 @@ struct bch_option { }; struct { int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, u64); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); }; }; @@ -481,6 +482,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); @@ -492,7 +494,7 @@ int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 03a8ae496668..bb61a288b7fd 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1047,45 +1047,56 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, if (!bch2_member_exists(m)) continue; - pr_buf(out, "Device: %u", i); + pr_buf(out, "Device:"); + pr_tab(out); + pr_buf(out, "%u", i); pr_newline(out); pr_indent_push(out, 2); - pr_buf(out, "UUID: "); + pr_buf(out, "UUID:"); + pr_tab(out); pr_uuid(out, m->uuid.b); pr_newline(out); - pr_buf(out, "Size: "); + pr_buf(out, "Size:"); + pr_tab(out); pr_units(out, device_size, device_size << 9); pr_newline(out); - pr_buf(out, "Bucket size: "); + pr_buf(out, "Bucket size:"); + pr_tab(out); pr_units(out, bucket_size, bucket_size << 9); pr_newline(out); - pr_buf(out, "First bucket: %u", - le16_to_cpu(m->first_bucket)); + pr_buf(out, "First bucket:"); + pr_tab(out); + pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); pr_newline(out); - pr_buf(out, "Buckets: %llu", - le64_to_cpu(m->nbuckets)); + pr_buf(out, "Buckets:"); + pr_tab(out); + pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); pr_newline(out); - pr_buf(out, "Last mount: "); + pr_buf(out, "Last mount:"); + pr_tab(out); if (m->last_mount) pr_time(out, le64_to_cpu(m->last_mount)); else pr_buf(out, "(never)"); pr_newline(out); - pr_buf(out, "State: %s", + pr_buf(out, "State:"); + pr_tab(out); + pr_buf(out, "%s", BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ? 
bch2_member_states[BCH_MEMBER_STATE(m)] : "unknown"); pr_newline(out); - pr_buf(out, "Group: "); + pr_buf(out, "Group:"); + pr_tab(out); if (BCH_MEMBER_GROUP(m)) { unsigned idx = BCH_MEMBER_GROUP(m) - 1; @@ -1099,7 +1110,8 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, } pr_newline(out); - pr_buf(out, "Data allowed: "); + pr_buf(out, "Data allowed:"); + pr_tab(out); if (BCH_MEMBER_DATA_ALLOWED(m)) bch2_flags_to_text(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); @@ -1107,15 +1119,17 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "(none)"); pr_newline(out); - pr_buf(out, "Has data: "); + pr_buf(out, "Has data:"); + pr_tab(out); if (data_have) bch2_flags_to_text(out, bch2_data_types, data_have); else pr_buf(out, "(none)"); pr_newline(out); - pr_buf(out, "Discard: %llu", - BCH_MEMBER_DISCARD(m)); + pr_buf(out, "Discard:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); pr_newline(out); pr_indent_pop(out, 2); @@ -1452,6 +1466,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ? bch2_sb_field_ops[type] : NULL; + if (!out->tabstops[0]) + out->tabstops[0] = 32; + if (ops) pr_buf(out, "%s", bch2_sb_fields[type]); else @@ -1500,6 +1517,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, u64 fields_have = 0; unsigned nr_devices = 0; + if (!out->tabstops[0]) + out->tabstops[0] = 32; + mi = bch2_sb_get_members(sb); if (mi) { struct bch_member *m; @@ -1510,137 +1530,106 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, nr_devices += bch2_member_exists(m); } - pr_buf(out, "External UUID: "); + pr_buf(out, "External UUID:"); + pr_tab(out); pr_uuid(out, sb->user_uuid.b); pr_newline(out); - pr_buf(out, "Internal UUID: "); + pr_buf(out, "Internal UUID:"); + pr_tab(out); pr_uuid(out, sb->uuid.b); pr_newline(out); - pr_buf(out, "Device index: %u", sb->dev_idx); + pr_buf(out, "Device index:"); + pr_tab(out); + pr_buf(out, "%u", sb->dev_idx); pr_newline(out); - pr_buf(out, "Label: "); + pr_buf(out, "Label:"); + pr_tab(out); pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); pr_newline(out); - pr_buf(out, "Version: %u", le16_to_cpu(sb->version)); + pr_buf(out, "Version:"); + pr_tab(out); + pr_buf(out, "%u", le16_to_cpu(sb->version)); pr_newline(out); - pr_buf(out, "Oldest version on disk: %u", le16_to_cpu(sb->version_min)); + pr_buf(out, "Oldest version on disk:"); + pr_tab(out); + pr_buf(out, "%u", le16_to_cpu(sb->version_min)); pr_newline(out); - pr_buf(out, "Created: "); + pr_buf(out, "Created:"); + pr_tab(out); if (sb->time_base_lo) pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else pr_buf(out, "(not set)"); pr_newline(out); - pr_buf(out, "Squence number: %llu", le64_to_cpu(sb->seq)); + pr_buf(out, "Sequence number:"); + pr_tab(out); + pr_buf(out, "%llu", le64_to_cpu(sb->seq)); pr_newline(out); - pr_buf(out, "Block_size: "); - pr_units(out, le16_to_cpu(sb->block_size), - (u32) le16_to_cpu(sb->block_size) << 9); + pr_buf(out, "Superblock size:"); + pr_tab(out); + pr_buf(out, "%zu", vstruct_bytes(sb)); pr_newline(out); - pr_buf(out, "Btree node size: "); - pr_units(out, BCH_SB_BTREE_NODE_SIZE(sb), - BCH_SB_BTREE_NODE_SIZE(sb) << 9); + pr_buf(out, "Clean:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_SB_CLEAN(sb)); pr_newline(out); - pr_buf(out, "Error action: %s", - BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR - ? 
bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] - : "unknown"); + pr_buf(out, "Devices:"); + pr_tab(out); + pr_buf(out, "%u", nr_devices); pr_newline(out); - pr_buf(out, "Clean: %llu", BCH_SB_CLEAN(sb)); + pr_buf(out, "Sections:"); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + pr_tab(out); + bch2_flags_to_text(out, bch2_sb_fields, fields_have); pr_newline(out); - pr_buf(out, "Features: "); + pr_buf(out, "Features:"); + pr_tab(out); bch2_flags_to_text(out, bch2_sb_features, le64_to_cpu(sb->features[0])); pr_newline(out); - pr_buf(out, "Compat features: "); + pr_buf(out, "Compat features:"); + pr_tab(out); bch2_flags_to_text(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); pr_newline(out); - pr_buf(out, "Metadata replicas: %llu", BCH_SB_META_REPLICAS_WANT(sb)); pr_newline(out); - - pr_buf(out, "Data replicas: %llu", BCH_SB_DATA_REPLICAS_WANT(sb)); - pr_newline(out); - - pr_buf(out, "Metadata checksum type: %s (%llu)", - BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR - ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)] - : "unknown", - BCH_SB_META_CSUM_TYPE(sb)); - pr_newline(out); - - pr_buf(out, "Data checksum type: %s (%llu)", - BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR - ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)] - : "unknown", - BCH_SB_DATA_CSUM_TYPE(sb)); - pr_newline(out); - - pr_buf(out, "Compression type: %s (%llu)", - BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR - ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)] - : "unknown", - BCH_SB_COMPRESSION_TYPE(sb)); - pr_newline(out); - - pr_buf(out, "Foreground write target: "); - bch2_sb_target_to_text(out, sb, BCH_SB_FOREGROUND_TARGET(sb)); - pr_newline(out); - - pr_buf(out, "Background write target: "); - bch2_sb_target_to_text(out, sb, BCH_SB_BACKGROUND_TARGET(sb)); - pr_newline(out); - - pr_buf(out, "Promote target: "); - bch2_sb_target_to_text(out, sb, BCH_SB_PROMOTE_TARGET(sb)); - pr_newline(out); - - pr_buf(out, "Metadata target: "); - bch2_sb_target_to_text(out, sb, BCH_SB_METADATA_TARGET(sb)); - pr_newline(out); - - pr_buf(out, "String hash type: %s (%llu)", - BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR - ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] - : "unknown", - BCH_SB_STR_HASH_TYPE(sb)); - pr_newline(out); - - pr_buf(out, "32 bit inodes: %llu", BCH_SB_INODE_32BIT(sb)); - pr_newline(out); - - pr_buf(out, "GC reserve percentage: %llu%%", BCH_SB_GC_RESERVE(sb)); + pr_buf(out, "Options:"); pr_newline(out); + pr_indent_push(out, 2); + { + enum bch_opt_id id; - pr_buf(out, "Root reserve percentage: %llu%%", BCH_SB_ROOT_RESERVE(sb)); - pr_newline(out); + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; - pr_buf(out, "Devices: %u live, %u total", - nr_devices, sb->nr_devices); - pr_newline(out); + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, id); - pr_buf(out, "Sections: "); - vstruct_for_each(sb, f) - fields_have |= 1 << le32_to_cpu(f->type); - bch2_flags_to_text(out, bch2_sb_fields, fields_have); - pr_newline(out); + pr_buf(out, "%s:", opt->attr.name); + pr_tab(out); + bch2_opt_to_text(out, NULL, sb, opt, v, + OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); + pr_newline(out); + } + } + } - pr_buf(out, "Superblock size: %zu", vstruct_bytes(sb)); - pr_newline(out); + pr_indent_pop(out, 2); if (print_layout) { pr_newline(out); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0bc78c50150a..56b01624d5fb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -891,7 +891,7 @@ static void print_mount_opts(struct bch_fs *c) if (!first) pr_buf(&p, ","); first = false; - bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } if (!p.pos) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3018250d421b..49e38859bff8 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -597,7 +597,7 @@ SHOW(bch2_fs_opts_dir) int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - bch2_opt_to_text(out, c, opt, v, OPT_SHOW_FULL_LIST); + bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); pr_char(out, '\n'); return 0; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f4e20e796ba0..08b33ab8489f 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -448,7 +448,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, return -ENODATA; v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, opt, v, 0); + bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); ret = out.pos; -- cgit From b60c380bca5458be9e4c0ff77289f0979fbbb52f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Mar 2022 13:38:54 -0500 Subject: bcachefs: Don't arm journal->write_work when journal entry !open This fixes a shutdown race where we were rearming journal->write_work after the journal has already shut down. 
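The shape of the fix is the usual delayed-work-vs-shutdown pattern: re-check, under j->lock, that there is still an open journal entry before touching the timer. A minimal sketch of that structure (the actual hunk in journal_write_work() follows):

	spin_lock(&j->lock);
	if (!__journal_entry_is_open(j->reservations))
		goto unlock;	/* nothing open, or already shut down: don't re-arm */

	/* ... mod_delayed_work() or __journal_entry_close() ... */
unlock:
	spin_unlock(&j->lock);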
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9df600d55da0..1c43ec1d4f6c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -309,17 +309,19 @@ static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; long delta; spin_lock(&j->lock); - buf = journal_cur_buf(j); - delta = buf->expires - jiffies; + if (!__journal_entry_is_open(j->reservations)) + goto unlock; + + delta = journal_cur_buf(j)->expires - jiffies; if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); +unlock: spin_unlock(&j->lock); } @@ -939,6 +941,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); wait_event(j->wait, journal_entry_close(j)); @@ -956,7 +959,6 @@ void bch2_fs_journal_stop(struct journal *j) j->last_empty_seq != journal_cur_seq(j)); cancel_delayed_work_sync(&j->write_work); - bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, -- cgit From e1f7fa06a8ed48feedd5f538fc4724734c6e1869 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Mar 2022 15:21:07 -0500 Subject: bcachefs: Don't keep around btree_paths unnecessarily When bch2_trans_begin() is called and there hasn't been a transaction restart, we presume that we're now doing something new - iterating over different keys, and we now shouldn't keep aruond paths related to the previous transaction, excepting the subvolumes btree. This should fix some of our "transaction path overflow" bugs. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b65cd3566872..109efa73cd4c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3012,6 +3012,14 @@ void bch2_trans_begin(struct btree_trans *trans) trans_for_each_path(trans, path) { path->should_be_locked = false; + /* + * If the transaction wasn't restarted, we're presuming to be + * doing something new: dont keep iterators excpt the ones that + * are in use - except for the subvolumes btree: + */ + if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) + path->preserve = false; + /* * XXX: we probably shouldn't be doing this if the transaction * was restarted, but currently we still overflow transaction -- cgit From 07b8121f07056480c54fca99046870d84a657d13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Mar 2022 14:04:34 -0500 Subject: bcachefs: Fix pr_tab_rjust() pr_tab_rjust() was broken and leaving a null somewhere in the output string - this patch fixes it and simplifies it a bit. 
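For context on what the helper does: right-justifying to a tabstop means taking the field written since buf->last_field and sliding it right so that it ends at the tabstop, padding with spaces on its left. Schematically, with a tabstop at column 16 (values invented for illustration):

	before:  "capacity:42"          linelen = 11, shift = 16 - 11 = 5
	after:   "capacity:     42"     field moved right, gap filled with spaces

The rewritten bch2_pr_tab_rjust() below grows the buffer with bch2_printbuf_realloc() before doing the shift, where the old inline version clamped the shift and move to printbuf_remaining().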
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 37 +++++++++++++++++++++++++++++++++++-- fs/bcachefs/util.h | 22 +++------------------- 2 files changed, 38 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 7a896ddc9a22..f290c069c683 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -101,8 +101,14 @@ STRTO_H(strtou64, u64) static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) { - unsigned new_size = roundup_pow_of_two(out->size + extra); - char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); + unsigned new_size; + char *buf; + + if (out->pos + extra + 1 < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); if (!buf) { out->allocation_failure = true; @@ -131,6 +137,33 @@ void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) out->pos += len; } +void bch2_pr_tab_rjust(struct printbuf *buf) +{ + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { + unsigned move = buf->pos - buf->last_field; + unsigned shift = buf->tabstops[buf->tabstop] - + printbuf_linelen(buf); + + bch2_printbuf_realloc(buf, shift); + + if (buf->last_field + shift + 1 < buf->size) { + move = min(move, buf->size - 1 - buf->last_field - shift); + + memmove(buf->buf + buf->last_field + shift, + buf->buf + buf->last_field, + move); + memset(buf->buf + buf->last_field, ' ', shift); + buf->pos += shift; + buf->buf[buf->pos] = 0; + } + } + + buf->last_field = buf->pos; + buf->tabstop++; +} + void bch2_hprint(struct printbuf *buf, s64 v) { int u, t = 0; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7667944f9ae4..ba0c4d29c038 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -334,27 +334,11 @@ static inline void pr_tab(struct printbuf *buf) buf->tabstop++; } +void bch2_pr_tab_rjust(struct printbuf *); + static inline void pr_tab_rjust(struct printbuf *buf) { - ssize_t shift = min_t(ssize_t, buf->tabstops[buf->tabstop] - - printbuf_linelen(buf), - printbuf_remaining(buf)); - ssize_t move = min_t(ssize_t, buf->pos - buf->last_field, - printbuf_remaining(buf) - shift); - - BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); - - if (shift > 0) { - memmove(buf->buf + buf->last_field + shift, - buf->buf + buf->last_field, - move); - memset(buf->buf + buf->last_field, ' ', shift); - buf->pos += shift; - buf->buf[buf->pos] = 0; - } - - buf->last_field = buf->pos; - buf->tabstop++; + bch2_pr_tab_rjust(buf); } void bch2_pr_units(struct printbuf *, s64, s64); -- cgit From f0cc5d2931378b7a2a7e797c726a2ab760d4a84d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Mar 2022 15:15:41 -0500 Subject: bcachefs: Check for rw before setting opts via sysfs This isn't a correctness issue, it just eliminates errors in the dmesg log when we're RO. 
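The guard is the standard c->writes pattern: hold a percpu ref across the write-side operation and bail out early once the filesystem has gone read-only. In sketch form - set_the_option() here is just a placeholder for the parse/validate/set steps in the real store path below:

	if (unlikely(!percpu_ref_tryget(&c->writes)))
		return -EROFS;			/* already RO: skip the update quietly */

	ret = set_the_option(c, opt, v);	/* placeholder for the real work */

	percpu_ref_put(&c->writes);
	return ret ?: size;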
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 49e38859bff8..afcb5ad1aa62 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -611,19 +611,28 @@ STORE(bch2_fs_opts_dir) char *tmp; u64 v; + /* + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; + tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; + if (!tmp) { + ret = -ENOMEM; + goto err; + } ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); kfree(tmp); if (ret < 0) - return ret; + goto err; ret = bch2_opt_check_may_set(c, id, v); if (ret < 0) - return ret; + goto err; bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); @@ -634,7 +643,10 @@ STORE(bch2_fs_opts_dir) rebalance_wakeup(c); } - return size; + ret = size; +err: + percpu_ref_put(&c->writes); + return ret; } SYSFS_OPS(bch2_fs_opts_dir); -- cgit From 718ce1eb8a84f47f66d0c89de43c6d0f0b14a20e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Mar 2022 17:20:39 -0500 Subject: bcachefs: Skip periodic wakeup of journal reclaim when journal empty Less system noise. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +++ fs/bcachefs/journal_reclaim.c | 14 +++++++++++--- fs/bcachefs/journal_types.h | 4 ++++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 1c43ec1d4f6c..54a318a841a1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -240,6 +240,9 @@ static int journal_entry_open(struct journal *j) if (u64s <= 0) return cur_entry_journal_full; + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); + /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for journal_last_seq() to be calculated correctly diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 39f4b2eebac0..56b0c018ac26 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -667,6 +667,7 @@ static int bch2_journal_reclaim_thread(void *arg) struct journal *j = arg; struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned long delay, now; + bool journal_empty; int ret = 0; set_freezable(); @@ -693,10 +694,17 @@ static int bch2_journal_reclaim_thread(void *arg) break; if (j->reclaim_kicked) break; - if (time_after_eq(jiffies, j->next_reclaim)) - break; - schedule_timeout(j->next_reclaim - jiffies); + spin_lock(&j->lock); + journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + + if (journal_empty) + schedule(); + else if (time_after(j->next_reclaim, jiffies)) + schedule_timeout(j->next_reclaim - jiffies); + else + break; } __set_current_state(TASK_RUNNING); } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 330c5d79e645..91f829adf862 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -243,6 +243,10 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; -- cgit From 590b91cf3fa419eefc917f4e37152af616c3ba5f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Mar 2022 14:13:22 -0500 Subject: 
bcachefs: Revert UUID format-specifier change "bcachefs: Log & error message improvements" accidentally changed the format specifier we use for converting UUIDs to strings, which broke mounting of encrypted filesystems - this patch reverts that change. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index ba0c4d29c038..465ba030133b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -372,7 +372,7 @@ static inline void pr_time(struct printbuf *out, u64 _time) #ifdef __KERNEL__ static inline void uuid_unparse_lower(u8 *uuid, char *out) { - sprintf(out, "%plU", uuid); + sprintf(out, "%pUb", uuid); } #else #include -- cgit From 4d126dc8b30c2c1c69cbf600d604e7ceb8ca7f8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Mar 2022 13:52:58 -0500 Subject: bcachefs: Use bio_iov_vecs_to_alloc() This fixes a bug in the DIO read path where, when using a loopback device in DIO mode, we'd allocate a biovec that would get overwritten and leaked in bio_iov_iter_get_pages() -> bio_iov_bvec_set(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9161125aec17..8231c29a7534 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1889,7 +1889,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) iter->count -= shorten; bio = bio_alloc_bioset(NULL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->dio_read_bioset); @@ -1926,7 +1926,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { bio = bio_alloc_bioset(NULL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); @@ -2297,9 +2297,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(NULL, - iov_iter_is_bvec(iter) - ? 0 - : iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_WRITE, GFP_KERNEL, &c->dio_write_bioset); -- cgit From 9552e19f6fff86d9907bb088f8b1eb786562f9d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Mar 2022 15:37:42 -0500 Subject: bcachefs: Fix dio write path with loopback dio mode When the iov_iter is a bvec iter, it's possible the IO was submitted from a kthread that didn't have an mm to switch to. 
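Put differently: kthread_use_mm() needs an mm to borrow, and when the iter is a bvec iter (as submitted by the loop driver in DIO mode) dio->mm can be NULL - the submitting kthread has no userspace mm, and with the pages already described by bvecs none is needed. So the switch must be conditional on both; a minimal sketch of the guard (the two real hunks follow):

	if (kthread && dio->mm)
		kthread_use_mm(dio->mm);
	/* ... fault handling disabled, extract pages, issue the write ... */
	if (kthread && dio->mm)
		kthread_unuse_mm(dio->mm);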
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8231c29a7534..77a893260fd8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2110,7 +2110,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) while (1) { iter_count = dio->iter.count; - if (kthread) + if (kthread && dio->mm) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -2120,7 +2120,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread) + if (kthread && dio->mm) kthread_unuse_mm(dio->mm); /* -- cgit From a897ef682781824c9b82f01d107d653f3dbf38e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Mar 2022 22:05:49 -0500 Subject: bcachefs: Fix error handling in traverse_all() In btree_path_traverse_all() we were failing to check for -EIO in the retry loop, and after btree node read error we'd go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 109efa73cd4c..90f5d306566b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1468,8 +1468,10 @@ retry_all: */ if (path->uptodate) { ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret) + if (ret == -EINTR || ret == -ENOMEM) goto retry_all; + if (ret) + goto err; } else { i++; } @@ -1482,7 +1484,7 @@ retry_all: */ trans_for_each_path(trans, path) BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); - +err: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; -- cgit From 61a66469a4bf63a1357b6af36242682ae339ca2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Mar 2022 21:17:43 -0500 Subject: bcachefs: Fix lock ordering under traverse_all() traverse_all() traverses btree paths in sorted order, so it should never see transaction restarts due to lock ordering violations. But some code in __bch2_btree_path_upgrade(), while necessary when not running under traverse_all(), was causing some confusing lock ordering violations - disabling this code under traverse_all() will let us put in some more assertions. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 90f5d306566b..1cfd2e9015b1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -473,14 +473,15 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). 
*/ - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); - } + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } return false; } -- cgit From a9bae40fda067eae70751302cbbc9f362453f310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Mar 2022 18:23:47 -0500 Subject: bcachefs: Change flags param to bch2_btree_delete_range to update_flags It wasn't used as iter_flags (excepting the unit tests, which this patch fixes), and the next patch is going to need to pass in BTREE_TRIGGER_NORUN. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 12 ++++++------ fs/bcachefs/tests.c | 14 ++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 42ee54cf390d..9f1ff5f8635d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1692,14 +1692,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, - unsigned iter_flags, + unsigned update_flags, u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); retry: while ((bch2_trans_begin(trans), (k = bch2_btree_iter_peek(&iter)).k) && @@ -1740,9 +1740,9 @@ retry: break; } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(trans->c, &disk_res); if (ret) break; @@ -1764,10 +1764,10 @@ retry: */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, - unsigned iter_flags, + unsigned update_flags, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, bch2_btree_delete_range_trans(&trans, id, start, end, - iter_flags, journal_seq)); + update_flags, journal_seq)); } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 978d92e0b5eb..8ed28bf5e82c 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -15,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, - NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); } @@ -814,9 +813,8 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS_MIN, SPOS_MAX, - BTREE_ITER_ALL_SNAPSHOTS, - NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); return ret; -- cgit From d5d3be7dc5d09f9cf8d12b3e3cefbcd8020cddae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Mar 2022 
14:25:16 -0500 Subject: bcachefs: bch2_journal_log_msg() This adds bch2_journal_log_msg(), which just logs a message to the journal, and uses it to mark startup and when journal replay finishes. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 83 +++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/journal.h | 1 + fs/bcachefs/recovery.c | 3 ++ 3 files changed, 62 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 54a318a841a1..9d16b9d30ad7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -630,31 +630,6 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) return ret ?: ret2 < 0 ? ret2 : 0; } -int bch2_journal_meta(struct journal *j) -{ - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); - if (ret) - return ret; - - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - /* * bch2_journal_flush_async - if there is an open journal entry, or a journal * still being written, write it and wait for the write to complete @@ -707,6 +682,64 @@ out: return ret; } +int bch2_journal_meta(struct journal *j) +{ + struct journal_buf *buf; + struct journal_res res; + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
+{ + struct jset_entry_log *entry; + struct journal_res res = { 0 }; + unsigned msglen, u64s; + va_list args; + int ret; + + va_start(args, fmt); + msglen = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); + + ret = bch2_journal_res_get(j, &res, u64s, 0); + if (ret) + return ret; + + entry = container_of(journal_res_entry(j, &res), + struct jset_entry_log, entry);; + memset(entry, 0, u64s * sizeof(u64)); + entry->entry.type = BCH_JSET_ENTRY_log; + entry->entry.u64s = u64s - 1; + + va_start(args, fmt); + vsnprintf(entry->d, INT_MAX, fmt, args); + va_end(args); + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 948e8b53dffd..243349f4ac1c 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -478,6 +478,7 @@ int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); +int bch2_journal_log_msg(struct journal *, const char *, ...); void bch2_journal_halt(struct journal *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6c4ffc5abdc5..887971559214 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -578,6 +578,9 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); ret = bch2_journal_error(j); + + if (keys->nr && !ret) + bch2_journal_log_msg(&c->journal, "journal replay finished"); err: kvfree(keys_sorted); return ret; -- cgit From f6c92ebbb8f84ad9b993691b02d5b38736b7a922 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Mar 2022 15:49:03 -0500 Subject: bcachefs: Allocate journal buckets sequentially This tweaks __bch2_set_nr_journal_buckets() so that we aren't reversing their order in the journal anymore - nice for rotating disks. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9d16b9d30ad7..11b44467aeab 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -837,7 +837,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, * superblock before inserting into the journal array */ - pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; + pos = ja->discard_idx ?: ja->nr; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); __array_insert_item(journal_buckets->buckets, ja->nr, pos); -- cgit From cc23255e9a9fcaf7423e0fe7e197605bf10a3f06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Mar 2022 17:35:06 -0500 Subject: bcachefs: Add a missing wakeup This fixes a rare bug with bch2_btree_flush_all_writes() getting stuck.
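The waiter this one-liner pairs with is visible in context in the second hunk of the diff below: the flush path parks on the flag bit with wait_on_bit_io(), so the completion path must issue the matching wake_up_bit() whenever it does not restart the write. A minimal sketch of the clear-and-wake idiom, with simplified stand-in names (illustrative only, not the actual bcachefs structures):

/*
 * Illustrative sketch: "struct node" and NODE_WRITE_IN_FLIGHT are
 * hypothetical stand-ins for the bcachefs btree node and its flag bit.
 */
#include <linux/bitops.h>
#include <linux/atomic.h>
#include <linux/wait_bit.h>
#include <linux/sched.h>

#define NODE_WRITE_IN_FLIGHT	0	/* hypothetical bit number */

struct node {
	unsigned long flags;
};

/* write completion: clear the bit, then wake anyone sleeping on it */
static void node_write_done(struct node *b)
{
	clear_bit(NODE_WRITE_IN_FLIGHT, &b->flags);
	smp_mb__after_atomic();
	wake_up_bit(&b->flags, NODE_WRITE_IN_FLIGHT);
}

/* flush path: checks the bit, then sleeps until a wakeup arrives */
static void node_write_flush(struct node *b)
{
	wait_on_bit_io(&b->flags, NODE_WRITE_IN_FLIGHT, TASK_UNINTERRUPTIBLE);
}

Without a wake on the path that does not restart a write, a flusher already parked in wait_on_bit_io() never re-checks the bit and can sleep forever even though the write has completed - which is the rare stall the one-line addition below fixes.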
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 3031b566a112..887a1b145cdc 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1622,6 +1622,8 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } static void btree_node_write_done(struct bch_fs *c, struct btree *b) @@ -2091,7 +2093,6 @@ restart: rcu_read_unlock(); wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); goto restart; - } rcu_read_unlock(); } -- cgit From d4d24a6509548a6457f185fddd927df7d148464a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Mar 2022 23:22:49 -0500 Subject: bcachefs: Delay setting path->should_be_locked Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1cfd2e9015b1..b18e4fcc46e5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -427,8 +427,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, return true; } -__flatten -static bool bch2_btree_path_relock(struct btree_trans *trans, +noinline __flatten +static bool __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { bool ret = btree_path_get_locks(trans, path, false); @@ -441,6 +441,14 @@ static bool bch2_btree_path_relock(struct btree_trans *trans, return ret; } +static inline bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? true + : __bch2_btree_path_relock(trans, path, trace_ip); +} + bool __bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) @@ -2388,9 +2396,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, iter->flags & BTREE_ITER_INTENT); - - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; } /* @@ -2428,8 +2433,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) BUG_ON(!iter->path->nodes_locked); out: if (iter->update_path) { - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; + if (unlikely(!bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { + k = bkey_s_c_err(-EINTR); + } else { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } } iter->path->should_be_locked = true; -- cgit From 85d8cf161f98993f544c0b2c614873caf7b9c14f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Mar 2022 12:31:52 -0500 Subject: bcachefs: bch2_btree_iter_peek_upto() In BTREE_ITER_FILTER_SNAPSHOTS mode, we skip over keys in unrelated snapshots. When we hit the end of an inode, if the next inode(s) are in a different subvolume, we could potentially have to skip past many keys before finding a key we can return to the caller, so they can terminate the iteration. This adds a peek_upto() variant to solve this problem, to be used when we know the range we're searching within.
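As a rough illustration of the intended calling convention - a hypothetical helper, built only from the iterator calls this patch adds or converts (compare the bch2_empty_dir_trans() hunk in the diff below) - walk a single directory's dirents and stop at the end of that inode instead of peeking into the next one. Transaction-restart handling is elided for brevity:

/* Hypothetical caller: count the dirents of one directory without
 * iterating past it. */
static int count_dirents(struct btree_trans *trans, u64 dir_inum, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int nr = 0, ret = 0;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
			SPOS(dir_inum, 0, snapshot),	/* start of this inode */
			POS(dir_inum, U64_MAX),		/* upper bound: never leave this inode */
			0, k, ret)
		if (k.k->type == KEY_TYPE_dirent)
			nr++;
	bch2_trans_iter_exit(trans, &iter);

	return ret ?: nr;
}

The point of the upper bound is that the iterator returns NULL as soon as the next visible key would lie past it, rather than walking an arbitrary number of keys belonging to other inodes before the caller gets a chance to break out.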
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 ++++++++++++++++++++++++++---------- fs/bcachefs/btree_iter.h | 30 ++++++++++++++++++++++++++++-- fs/bcachefs/btree_update_leaf.c | 5 +++-- fs/bcachefs/dirent.c | 17 ++++++----------- fs/bcachefs/fs.c | 5 ++--- fs/bcachefs/inode.c | 4 ++-- fs/bcachefs/str_hash.h | 21 +++++++-------------- fs/bcachefs/xattr.c | 10 +++------- 8 files changed, 77 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b18e4fcc46e5..317c8066f3fc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2346,11 +2346,12 @@ out: * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; + struct bpos iter_pos; int ret; if (iter->update_path) { @@ -2366,6 +2367,24 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (!k.k || bkey_err(k)) goto out; + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter_pos = bkey_start_pos(k.k); + else + iter_pos = iter->pos; + + if (bkey_cmp(iter_pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out; + } + if (iter->update_path && bkey_cmp(iter->update_path->pos, k.k->p)) { bch2_path_put(trans, iter->update_path, @@ -2419,14 +2438,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) break; } - /* - * iter->pos should be mononotically increasing, and always be equal to - * the key we just returned - except extents can straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->pos = k.k->p; - else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); + iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, iter->flags & BTREE_ITER_INTENT); @@ -2658,9 +2670,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek(&iter2); + k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { iter->k = iter2.k; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1e3172a2885a..27b3b82f7df3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -245,9 +245,14 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_upto(iter, SPOS_MAX); +} + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ 
-342,13 +347,26 @@ static inline int bkey_err(struct bkey_s_c k) } static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, - unsigned flags) + unsigned flags) { return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } +static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + if (!(flags & BTREE_ITER_SLOTS)) + return bch2_btree_iter_peek_upto(iter, end); + + if (bkey_cmp(iter->pos, end) > 0) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + static inline int btree_trans_too_many_iters(struct btree_trans *trans) { return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 @@ -385,6 +403,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ for (; \ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9f1ff5f8635d..c9cddba0f999 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1286,7 +1286,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES| BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -1405,7 +1405,8 @@ int bch2_trans_update_extent(struct btree_trans *trans, goto out; } next: - k = bch2_btree_iter_next(&iter); + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a43a24409d37..760e4f74715f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -470,16 +470,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) if (ret) return ret; - for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir.inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode > dir.inum) - break; - + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), + POS(dir.inum, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; } - } bch2_trans_iter_exit(trans, &iter); return ret; @@ -503,11 +500,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { - if (k.k->p.inode > inum.inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), + POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4c68cee013e3..afaee020e7e3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -936,9 +936,8 @@ retry: SPOS(ei->v.i_ino, start, snapshot), 0); while (!(ret = btree_trans_too_many_iters(&trans)) && - (k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k)) && - 
bkey_cmp(iter.pos, end) < 0) { + (k = bch2_btree_iter_peek_upto(&iter, end)).k && + !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index ee14ba5ee73d..3735397ee9c5 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -586,12 +586,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ret = bkey_err(k); if (ret) goto err; - if (!k.k || iter.pos.inode != inum.inum) + if (!k.k) break; bkey_init(&delete.k); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 57d636740d2f..591bbb9f8beb 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -163,12 +163,10 @@ bch2_hash_lookup(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) return 0; @@ -199,15 +197,12 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inum.inum) - break; - + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) if (!is_visible_key(desc, inum, k)) return 0; - } bch2_trans_iter_exit(trans, iter); return ret ?: -ENOSPC; @@ -260,14 +255,12 @@ int bch2_hash_set(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, SPOS(inum.inum, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), + POS(inum.inum, U64_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 08b33ab8489f..ecce10342126 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -311,13 +311,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), 0, k, ret) { - BUG_ON(k.k->p.inode < inum); - - if (k.k->p.inode > inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), + POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) continue; -- cgit From 0576ba9ae7c7939d2402cdad9614f39785b70d2b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Mar 2022 18:16:42 -0500 Subject: bcachefs: Drop !did_work path from do_btree_insert_one() As we've already reserved space in the journal this optimization doesn't actually buy us anything, and when doing list_journal debugging it deletes information we want. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c9cddba0f999..9925254c5446 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -214,7 +214,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, /** * btree_insert_key - insert a key one key into a leaf node */ -static bool btree_insert_key_leaf(struct btree_trans *trans, +static void btree_insert_key_leaf(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; @@ -227,7 +227,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) - return false; + return; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, le64_to_cpu(i->journal_seq))); @@ -248,8 +248,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); - - return true; } /* Cached btree updates: */ @@ -401,7 +399,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - bool did_work; EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); @@ -409,15 +406,13 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->k->k.needs_whiteout = false; if (!i->cached) - did_work = btree_insert_key_leaf(trans, i); + btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) - did_work = bch2_btree_insert_key_cached(trans, i->path, i->k); + bch2_btree_insert_key_cached(trans, i->path, i->k); else { bch2_btree_key_cache_drop(trans, i->path); - did_work = false; - } - if (!did_work) return; + } if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) && !(i->flags & BTREE_UPDATE_NOJOURNAL)) { -- cgit From 2158fe463b9d78c7cf90f74b8b5e9b81249d4347 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Mar 2022 22:18:56 -0500 Subject: bcachefs: bch2_trans_inconsistent() Add a new error macro that also dumps transaction updates in addition to doing an emergency shutdown - when a transaction update discovers or is causing a fs inconsistency, it's helpful to see what updates it was doing. 
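The intended call-site shape mirrors the buckets.c conversions in the diff below; the wrapper name here is hypothetical, but the macro and the device fields are the ones the patch touches. On failure the macro logs the message, forces an emergency shutdown, dumps the transaction's pending updates, and evaluates true so the caller can bail with -EIO:

/* Hypothetical wrapper showing how the new macro is meant to be used. */
static int check_bucket_in_range(struct btree_trans *trans,
				 struct bch_dev *ca, u64 bucket)
{
	if (bch2_trans_inconsistent_on(bucket < ca->mi.first_bucket ||
				       bucket >= ca->mi.nbuckets, trans,
			"alloc key outside range of device's buckets"))
		return -EIO;

	return 0;
}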
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 53 +++++++++++++++++++++++++++++------------------- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/buckets.c | 32 ++++++++++++++--------------- fs/bcachefs/error.h | 20 ++++++++++++++++++ 4 files changed, 68 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 317c8066f3fc..f33dc4657590 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1794,19 +1794,44 @@ free: } noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) +void bch2_dump_trans_updates(struct btree_trans *trans) { - struct btree_path *path; struct btree_insert_entry *i; struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + bch_err(trans->c, "transaction updates:"); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + bch2_bkey_val_to_text(&buf1, trans->c, old); + bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); + + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + bch2_btree_ids[i->btree_id], + (void *) i->ip_allocated, + buf1.buf, buf2.buf); + } + + printbuf_exit(&buf2); + printbuf_exit(&buf1); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; + struct printbuf buf = PRINTBUF; unsigned idx; btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, idx) { - printbuf_reset(&buf1); + printbuf_reset(&buf); - bch2_bpos_to_text(&buf1, path->pos); + bch2_bpos_to_text(&buf, path->pos); printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, @@ -1814,7 +1839,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) path->preserve ? 
" P" : "", bch2_btree_ids[path->btree_id], path->level, - buf1.buf, + buf.buf, path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated @@ -1824,23 +1849,9 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) ); } - trans_for_each_update(trans, i) { - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + printbuf_exit(&buf); - printbuf_reset(&buf1); - printbuf_reset(&buf2); - bch2_bkey_val_to_text(&buf1, trans->c, old); - bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); - - printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", - bch2_btree_ids[i->btree_id], - (void *) i->ip_allocated, - buf1.buf, buf2.buf); - } - - printbuf_exit(&buf2); - printbuf_exit(&buf1); + bch2_dump_trans_updates(trans); } static struct btree_path *btree_path_alloc(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 27b3b82f7df3..72cb694f76fd 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -425,6 +425,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ +void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t, const char *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7d3636e20c81..2ff64276304f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -510,11 +510,16 @@ static int bch2_mark_alloc(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); - struct bch_dev *ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); struct bucket *g; struct bucket_mark old_m, m; int ret = 0; + if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || + new_u.bucket >= ca->mi.nbuckets, trans, + "alloc key outside range of device's buckets")) + return -EIO; + /* * alloc btree is read in by bch2_alloc_read, not gc: */ @@ -554,11 +559,6 @@ static int bch2_mark_alloc(struct btree_trans *trans, } } - ca = bch_dev_bkey_exists(c, new_u.dev); - - if (new_u.bucket >= ca->mi.nbuckets) - return 0; - percpu_down_read(&c->mark_lock); if (!gc && new_u.gen != old_u.gen) *bucket_gen(ca, new_u.bucket) = new_u.gen; @@ -1466,7 +1466,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; @@ -1482,16 +1481,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto err; if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - bch2_inconsistent_error(c); ret = -EIO; goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; @@ -1605,8 +1603,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe || - u.stripe_redundancy, c, + if (bch2_trans_inconsistent_on(u.stripe || + u.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, u.gen, 
bch2_data_types[u.data_type], @@ -1616,7 +1614,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, goto err; } - if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, + if (bch2_trans_inconsistent_on(data_type && u.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", iter.pos.inode, iter.pos.offset, u.gen, bch2_data_types[u.data_type], @@ -1629,8 +1627,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, u.stripe = s.k->p.offset; u.stripe_redundancy = s.v->nr_redundant; } else { - if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || - u.stripe_redundancy != s.v->nr_redundant, c, + if (bch2_trans_inconsistent_on(u.stripe != s.k->p.offset || + u.stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", iter.pos.inode, iter.pos.offset, u.gen, s.k->p.offset, u.stripe)) { @@ -1791,7 +1789,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "nonexistent indirect extent at %llu while marking\n %s", *idx, buf.buf); ret = -EIO; @@ -1800,7 +1798,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", *idx, buf.buf); ret = -EIO; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 4ab3cfe1292c..6e63c38186f3 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -66,6 +66,26 @@ do { \ _ret; \ }) +/* + * When a transaction update discovers or is causing a fs inconsistency, it's + * helpful to also dump the pending updates: + */ +#define bch2_trans_inconsistent(trans, ...) \ +({ \ + bch_err(trans->c, __VA_ARGS__); \ + bch2_inconsistent_error(trans->c); \ + bch2_dump_trans_updates(trans); \ +}) + +#define bch2_trans_inconsistent_on(cond, trans, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_trans_inconsistent(trans, __VA_ARGS__); \ + _ret; \ +}) + /* * Fsck errors: inconsistency errors we detect at mount time, and should ideally * be able to repair: -- cgit From 8570d775ca90192f8663ddd828a09d0c6698c71c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Mar 2022 18:38:24 -0500 Subject: bcachefs: bch2_trans_updates_to_text() This turns bch2_dump_trans_updates() into a to_text() method - this way it can be used by debug tracing. 
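A sketch of what the refactor enables - a hypothetical consumer, using only the printbuf helpers visible in the diff below: any debugging hook that wants the update list as a string can render it into a printbuf instead of relying on the printk-only dump:

/* Hypothetical debug hook: render the transaction's updates into a
 * buffer and hand the string to whatever sink is convenient. */
static void debug_emit_trans_updates(struct btree_trans *trans)
{
	struct printbuf buf = PRINTBUF;

	bch2_trans_updates_to_text(&buf, trans);
	pr_debug("%s\n", buf.buf);	/* could equally feed a tracepoint */
	printbuf_exit(&buf);
}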
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 42 ++++++++++++++++++++++++++++-------------- fs/bcachefs/btree_iter.h | 1 + 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f33dc4657590..cfaab8cbcad0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1793,30 +1793,44 @@ free: __bch2_path_free(trans, path); } -noinline __cold -void bch2_dump_trans_updates(struct btree_trans *trans) +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch_err(trans->c, "transaction updates:"); + pr_buf(buf, "transaction updates for %s journal seq %llu", + trans->fn, trans->journal_res.seq); + pr_newline(buf); + pr_indent_push(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - printbuf_reset(&buf1); - printbuf_reset(&buf2); - bch2_bkey_val_to_text(&buf1, trans->c, old); - bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); - - printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + pr_buf(buf, "update: btree=%s cached=%u %pS", bch2_btree_ids[i->btree_id], - (void *) i->ip_allocated, - buf1.buf, buf2.buf); + i->cached, + (void *) i->ip_allocated); + pr_newline(buf); + + pr_buf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); + pr_newline(buf); + + pr_buf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); + pr_newline(buf); } - printbuf_exit(&buf2); - printbuf_exit(&buf1); + pr_indent_pop(buf, 2); +} + +noinline __cold +void bch2_dump_trans_updates(struct btree_trans *trans) +{ + struct printbuf buf = PRINTBUF; + + bch2_trans_updates_to_text(&buf, trans); + bch_err(trans->c, "%s", buf.buf); + printbuf_exit(&buf); } noinline __cold diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 72cb694f76fd..30a2a2cef29b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -425,6 +425,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); void __bch2_trans_init(struct btree_trans *, struct bch_fs *, -- cgit From d13f9ee61dac0b5d663844c5778309ec8af1561f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Mar 2022 16:14:55 -0500 Subject: bcachefs: Revalidate pointer to old bkey val before calling mem triggers We recently started stashing a copy of the key being overwritten in btree_insert_entry: this is helpful for avoiding multiple calls to bch2_btree_path_peek_slot() and bch2_journal_keys_peek() in the transaction commit path. But it turns out this has a problem - when we run mem/atomic triggers, we've done a couple things that can invalidate the pointer to the old key's value. This makes the optimization of stashing a pointer to the old value questionable, but for now this patch revalidates that pointer before running mem triggers. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9925254c5446..a789a7cf4c74 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -660,6 +660,32 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (btree_node_type_needs_gc(i->bkey_type)) marking = true; + + /* + * Revalidate before calling mem triggers - XXX, ugly: + * + * - successful btree node splits don't cause transaction + * restarts and will have invalidated the pointer to the bkey + * value + * - btree_node_lock_for_insert() -> btree_node_prep_for_write() + * when it has to resort + * - btree_key_can_insert_cached() when it has to reallocate + * + * Ugly because we currently have no way to tell if the + * pointer's been invalidated, which means it's debatabale + * whether we should be stashing the old key at all. + */ + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + struct bkey_i *j_k = + bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); + + if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } } /* -- cgit From 880e2275f9f1461c87cd113a8da291861cc01400 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Mar 2022 00:26:52 -0500 Subject: bcachefs: Move trigger fns to bkey_ops This replaces the switch statements in bch2_mark_key(), bch2_trans_mark_key() with new bkey methods - prep work for the next patch, which fixes BTREE_TRIGGER_WANTS_OLD_AND_NEW. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 3 + fs/bcachefs/bkey_methods.h | 27 ++++++++ fs/bcachefs/buckets.c | 136 ++++++++++++++--------------------------- fs/bcachefs/buckets.h | 13 ++++ fs/bcachefs/ec.h | 2 + fs/bcachefs/extents.h | 8 +++ fs/bcachefs/inode.h | 4 ++ fs/bcachefs/reflink.h | 6 +- 8 files changed, 107 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 98c7866e20b5..3eaa6d204286 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -65,16 +65,19 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .atomic_trigger = bch2_mark_alloc, \ } static inline bool bkey_is_alloc(const struct bkey *k) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 520f7d93993d..2b1086971bbb 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -6,6 +6,7 @@ struct bch_fs; struct btree; +struct btree_trans; struct bkey; enum btree_node_type; @@ -20,6 +21,10 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + int (*atomic_trigger)(struct btree_trans 
*, struct bkey_s_c, + struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -54,6 +59,28 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +static inline int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; + + return ops->atomic_trigger + ? ops->atomic_trigger(trans, old, new, flags) + : 0; +} + +static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + struct bkey_i *new, unsigned flags) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; + + return ops->trans_trigger + ? ops->trans_trigger(trans, old, new, flags) + : 0; +} + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2ff64276304f..a681a6045dc9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -501,9 +501,9 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } -static int bch2_mark_alloc(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; @@ -933,9 +933,9 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return 0; } -static int bch2_mark_extent(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; @@ -1015,9 +1015,9 @@ static int bch2_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; @@ -1122,9 +1122,9 @@ static int bch2_mark_stripe(struct btree_trans *trans, return 0; } -static int bch2_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; @@ -1153,9 +1153,9 @@ static int bch2_mark_inode(struct btree_trans *trans, return 0; } -static int bch2_mark_reservation(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; @@ -1232,9 +1232,9 @@ fsck_err: return ret; } -static int bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -1271,39 +1271,6 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; - - switch (k.k->type) { - case KEY_TYPE_alloc: - case KEY_TYPE_alloc_v2: - case KEY_TYPE_alloc_v3: - return bch2_mark_alloc(trans, old, new, flags); - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_mark_extent(trans, old, new, flags); - case KEY_TYPE_stripe: - return bch2_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_mark_reservation(trans, old, new, flags); - case KEY_TYPE_reflink_p: - return bch2_mark_reflink_p(trans, old, new, flags); - case KEY_TYPE_snapshot: - return bch2_mark_snapshot(trans, old, new, flags); - default: - return 0; - } -} - static noinline __cold void fs_usage_apply_warn(struct btree_trans *trans, unsigned disk_res_sectors, @@ -1518,10 +1485,14 @@ err: return ret; } -static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1652,9 +1623,9 @@ err: return ret; } -static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { const struct bch_stripe *old_s = NULL; struct bch_stripe *new_s = NULL; @@ -1722,10 +1693,10 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -static int bch2_trans_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); @@ -1738,9 +1709,14 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, return 0; } -static int bch2_trans_mark_reservation(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? 
old + : bkey_i_to_s_c(new); unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; @@ -1839,9 +1815,14 @@ err: return ret; } -static int bch2_trans_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx, end_idx; int ret = 0; @@ -1862,33 +1843,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_i *new, unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_trans_mark_extent(trans, k, flags); - case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_trans_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_trans_mark_reservation(trans, k, flags); - case KEY_TYPE_reflink_p: - return bch2_trans_mark_reflink_p(trans, k, flags); - default: - return 0; - } -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ca34d5d3b961..90f53e677281 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -229,6 +229,19 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); +int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + +int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); + int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 78d468c7680a..9d508a2f3bbc 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ } static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) diff --git 
a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9c2567274a2b..ae650849d98a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -381,6 +381,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ @@ -388,6 +390,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_extent: */ @@ -402,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_reservation: */ @@ -414,6 +420,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ } /* Extent checksum entries: */ diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 77957cc7f9dd..2337ecfc600e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -13,11 +13,15 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } #define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } static inline bool bkey_is_inode(const struct bkey *k) diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 3745873fd88d..4da4330014a8 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -10,7 +10,9 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ } const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); @@ -21,6 +23,8 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, -- cgit From 5d93a842c1eb292e2cde9f5025628269d7d386e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Mar 2022 00:30:16 -0500 Subject: bcachefs: Fix BTREE_TRIGGER_WANTS_OLD_AND_NEW BTREE_TRIGGER_WANTS_OLD_AND_NEW didn't work correctly when the old and new key were both alloc keys, but different versions - it required old and new key type to be identical, and this bug is a problem for the new 
allocator rewrite. This patch fixes it by checking if the old and new key have the same trigger functions - the different versions of alloc (and inode) keys have the same trigger functions. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a789a7cf4c74..cb0cab71b99b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -442,7 +442,8 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (!btree_node_type_needs_gc(i->btree_id)) return 0; - if (old.k->type == new->k.type && + if (bch2_bkey_ops[old.k->type].atomic_trigger == + bch2_bkey_ops[i->k->k.type].atomic_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); @@ -493,7 +494,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (overwrite) { ret = bch2_trans_mark_old(trans, old, i->flags); - } else if (old.k->type == i->k->k.type && + } else if (bch2_bkey_ops[old.k->type].trans_trigger == + bch2_bkey_ops[i->k->k.type].trans_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; ret = bch2_trans_mark_key(trans, old, i->k, -- cgit From 91d961badfd123b6759488bc4aa7a4d014b739f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Mar 2022 15:48:45 -0400 Subject: bcachefs: darrays Inspired by CCAN darray - simple, stupid resizable (dynamic) arrays. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/darray.h | 77 +++++++++++++++++++++ fs/bcachefs/fs.c | 2 +- fs/bcachefs/fs.h | 4 +- fs/bcachefs/fsck.c | 153 ++++++++++++++++-------------------------- fs/bcachefs/move.c | 8 +-- fs/bcachefs/subvolume.c | 41 ++++------- fs/bcachefs/subvolume.h | 38 ++++------- fs/bcachefs/subvolume_types.h | 8 +-- 9 files changed, 170 insertions(+), 163 deletions(-) create mode 100644 fs/bcachefs/darray.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6cda77ad4342..01e9ed5dfc61 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -665,7 +665,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; - struct snapshot_id_list snapshots_unlinked; + snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 index 000000000000..519ab9b96e67 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H + +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room(_d, _more) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + +#define 
darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push(_d, _item) \ +({ \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + (_d)->data[(_d)->nr++] = (_item); \ + _ret; \ +}) + +#define darray_insert_item(_d, _pos, _item) \ +({ \ + size_t pos = (_pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + _ret; \ +}) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d)->data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index afaee020e7e3..d8cd32b5d765 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1478,7 +1478,7 @@ static void bch2_evict_inode(struct inode *vinode) } void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) + snapshot_id_list *s) { struct super_block *sb = c->vfs_sb; struct inode *inode; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index a67ab1ad2a31..73b96d0b5d83 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -190,7 +190,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); -void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -198,7 +198,7 @@ int bch2_vfs_init(void); #else static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) {} + snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8783b950055e..10754b13ec15 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -471,11 +472,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; if (bkey_cmp(s->pos, pos)) - s->nr = 0; + s->ids.nr = 0; s->pos = pos; /* Might get called multiple times due to lock restarts */ - if (s->nr && s->d[s->nr - 1] == pos.snapshot) + if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) return 0; return snapshots_seen_add(c, s, pos.snapshot); @@ -498,7 +499,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ancestor = snapshot_t(c, ancestor)->equiv; /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); BUG_ON(seen->pos.snapshot != ancestor); if (id == ancestor) @@ -507,11 +508,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; - for (i = seen->nr - 2; - i >= 0 && seen->d[i] >= id; + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && - bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) return false; 
return true; @@ -537,26 +538,25 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, } #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; +}; + struct inode_walker { bool first_this_inode; u64 cur_inum; - size_t nr; - size_t size; - struct inode_walker_entry { - struct bch_inode_unpacked inode; - u32 snapshot; - u64 count; - } *d; + DARRAY(struct inode_walker_entry) inodes; }; static void inode_walker_exit(struct inode_walker *w) { - kfree(w->d); - w->d = NULL; + darray_exit(&w->inodes); } static struct inode_walker inode_walker_init(void) @@ -564,43 +564,17 @@ static struct inode_walker inode_walker_init(void) return (struct inode_walker) { 0, }; } -static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) -{ - if (w->nr == w->size) { - size_t new_size = max_t(size_t, 8UL, w->size * 2); - void *d = krealloc(w->d, new_size * sizeof(w->d[0]), - GFP_KERNEL); - if (!d) { - bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", - new_size); - return -ENOMEM; - } - - w->d = d; - w->size = new_size; - } - - return 0; -} - static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { struct bch_inode_unpacked u; - int ret; - - ret = inode_walker_realloc(c, w); - if (ret) - return ret; BUG_ON(bch2_inode_unpack(inode, &u)); - w->d[w->nr++] = (struct inode_walker_entry) { + return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, - }; - - return 0; + })); } static int __walk_inode(struct btree_trans *trans, @@ -619,7 +593,7 @@ static int __walk_inode(struct btree_trans *trans, goto lookup_snapshot; } - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -637,26 +611,25 @@ static int __walk_inode(struct btree_trans *trans, w->cur_inum = pos.inode; w->first_this_inode = true; lookup_snapshot: - for (i = 0; i < w->nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) goto found; return INT_MAX; found: - BUG_ON(pos.snapshot > w->d[i].snapshot); + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); - if (pos.snapshot != w->d[i].snapshot) { + if (pos.snapshot != w->inodes.data[i].snapshot) { ancestor_pos = i; - while (i && w->d[i - 1].snapshot > pos.snapshot) + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = inode_walker_realloc(c, w); + ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); if (ret) return ret; - array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); - w->d[i].snapshot = pos.snapshot; - w->d[i].count = 0; + w->inodes.data[i].snapshot = pos.snapshot; + w->inodes.data[i].count = 0; } return i; @@ -672,7 +645,7 @@ static int __get_visible_inodes(struct btree_trans *trans, struct bkey_s_c k; int ret; - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -1133,7 +1106,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) int 
ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_sectors == i->count) continue; @@ -1232,7 +1205,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - i = inode->d + ret; + i = inode->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && @@ -1333,7 +1306,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) int ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_nlink == i->count) continue; @@ -1537,7 +1510,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - i = dir->d + ret; + i = dir->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, @@ -1550,7 +1523,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); @@ -1618,7 +1591,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (fsck_err_on(!target->nr, c, + if (fsck_err_on(!target->inodes.nr, c, "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -1628,7 +1601,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - for (i = target->d; i < target->d + target->nr; i++) { + darray_for_each(target->inodes, i) { ret = check_dirent_target(trans, iter, d, &i->inode, i->snapshot); if (ret) @@ -1726,7 +1699,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = 0; if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: @@ -1836,21 +1809,18 @@ static int check_root(struct bch_fs *c) check_root_trans(&trans)); } -struct pathbuf { - size_t nr; - size_t size; - - struct pathbuf_entry { - u64 inum; - u32 snapshot; - } *entries; +struct pathbuf_entry { + u64 inum; + u32 snapshot; }; -static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { struct pathbuf_entry *i; - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; @@ -1858,29 +1828,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, struct pathbuf *p, +static int path_down(struct bch_fs *c, pathbuf *p, u64 inum, u32 snapshot) { - if (p->nr == p->size) { - size_t new_size = max_t(size_t, 256UL, p->size * 2); - void *n = krealloc(p->entries, - new_size * sizeof(p->entries[0]), - GFP_KERNEL); - if (!n) { - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - new_size); - return -ENOMEM; - } - - p->entries = n; - p->size = new_size; - }; - - p->entries[p->nr++] = (struct pathbuf_entry) { + int ret = darray_push(p, ((struct pathbuf_entry) { .inum = inum, .snapshot = snapshot, - }; - return 0; + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; } /* @@ -1889,7 
+1848,7 @@ static int path_down(struct bch_fs *c, struct pathbuf *p, * XXX: we should also be verifying that inodes are in the right subvolumes */ static int check_path(struct btree_trans *trans, - struct pathbuf *p, + pathbuf *p, struct bch_inode_unpacked *inode, u32 snapshot) { @@ -1963,7 +1922,7 @@ static int check_path(struct btree_trans *trans, /* XXX print path */ bch_err(c, "directory structure loop"); - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); @@ -2000,7 +1959,7 @@ static int check_directory_structure(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; - struct pathbuf path = { 0, 0, NULL }; + pathbuf path = { 0, }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -2030,7 +1989,7 @@ static int check_directory_structure(struct bch_fs *c) BUG_ON(ret == -EINTR); - kfree(path.entries); + darray_exit(&path); bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2eb192da8e1d..b916ee35ee37 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -91,10 +91,10 @@ next: if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { struct bkey_i *update; - size_t i; + u32 *i; - for (i = 0; i < s.nr; i++) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + darray_for_each(s.ids, i) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) goto next; update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -124,7 +124,7 @@ next: } } bch2_trans_iter_exit(trans, &iter); - kfree(s.d); + darray_exit(&s.ids); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 69603327d93d..2c5f7e7793a7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -544,36 +544,21 @@ err: return ret; } -static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +static int snapshot_id_add(snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); - if (s->nr == s->size) { - size_t new_size = max(8U, s->size * 2); - void *n = krealloc(s->d, - new_size * sizeof(s->d[0]), - GFP_KERNEL); - if (!n) { - pr_err("error allocating snapshot ID list"); - return -ENOMEM; - } - - s->d = n; - s->size = new_size; - }; - - s->d[s->nr++] = id; - return 0; + return darray_push(s, id); } static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, - struct snapshot_id_list *deleted, + snapshot_id_list *deleted, enum btree_id btree_id) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct snapshot_id_list equiv_seen = { 0 }; + snapshot_id_list equiv_seen = { 0 }; struct bpos last_pos = POS_MIN; int ret = 0; @@ -620,7 +605,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - kfree(equiv_seen.d); + darray_exit(&equiv_seen); return ret; } @@ -632,7 +617,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; - struct snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted = { 0 }; u32 i, id, children[2]; int ret = 0; @@ -712,15 +697,15 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) for (i = 0; i < deleted.nr; i++) { ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.d[i])); + bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { bch_err(c, "error deleting snapshot %u: %i", - deleted.d[i], 
ret); + deleted.data[i], ret); goto err; } } err: - kfree(deleted.d); + darray_exit(&deleted); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); } @@ -875,14 +860,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - struct snapshot_id_list s; + snapshot_id_list s; u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); s = c->snapshots_unlinked; - memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); if (!s.nr) @@ -890,7 +875,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); - for (id = s.d; id < s.d + s.nr; id++) { + for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_subvolume_delete(&trans, *id)); if (ret) { @@ -899,7 +884,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) } } - kfree(s.d); + darray_exit(&s); } percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 4abe53df2788..b3d5ae49101d 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "darray.h" #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -58,15 +59,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances struct snapshots_seen { struct bpos pos; - size_t nr; - size_t size; - u32 *d; + DARRAY(u32) ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) { - kfree(s->d); - s->d = NULL; + kfree(s->ids.data); + s->ids.data = NULL; } static inline void snapshots_seen_init(struct snapshots_seen *s) @@ -76,30 +75,19 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - if (s->nr == s->size) { - size_t new_size = max(s->size, (size_t) 128) * 2; - u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "error reallocating snapshots_seen table (new size %zu)", - new_size); - return -ENOMEM; - } - - s->size = new_size; - s->d = d; - } - - s->d[s->nr++] = id; - return 0; + int ret = darray_push(&s->ids, id); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; } -static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - unsigned i; + u32 *i; - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) + darray_for_each(*s, i) + if (*i == id) return true; return false; } diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 9410b9587591..f7562b5d51df 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,10 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -- cgit From 062afcbae3b269a7d01cf5087df92d5bd8732012 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Mar 2022 02:41:21 -0400 Subject: bcachefs: Restore journal write point at startup This patch tweaks the 
journal recovery path so that we start writing right after where we left off, instead of the next empty bucket. This is partly prep work for supporting zoned devices, but it's also good to do in general to avoid the journal completely filling up and getting stuck. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 54587ff29771..e3b3d0b72232 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -907,6 +907,7 @@ static void bch2_journal_read_device(struct closure *cl) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); + struct journal_replay *r; struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; @@ -942,11 +943,29 @@ static void bch2_journal_read_device(struct closure *cl) * allocate */ while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[ja->cur_idx] == ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = 0; + ja->sectors_free = ca->mi.bucket_size; + + mutex_lock(&jlist->lock); + list_for_each_entry(r, jlist->head, list) { + for (i = 0; i < r->nr_ptrs; i++) { + if (r->ptrs[i].dev == ca->dev_idx && + sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + + vstruct_sectors(&r->j, c->block_bits); + + ja->sectors_free = min(ja->sectors_free, + ca->mi.bucket_size - wrote); + } + } + } + mutex_unlock(&jlist->lock); + + BUG_ON(ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size); /* * Set dirty_idx to indicate the entire journal is full and needs to be -- cgit From f8494d253534d3c49e80a483b74469bbeb01367f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Mar 2022 20:31:15 -0400 Subject: bcachefs: Convert some WARN_ONs to WARN_ON_ONCE These warnings are symptomatic of something else going wrong, we don't want them spamming up the logs as that'll make it harder to find the real issue. 
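For readers who haven't hit the distinction before: WARN_ON() prints a full backtrace every time its condition is true, WARN_ON_ONCE() only the first time per call site, and both still evaluate and return the condition, so swapping one for the other doesn't change control flow. A rough userspace sketch of the "once" behaviour (illustrative only - this is not the kernel's implementation, just the idea):

  #include <stdbool.h>
  #include <stdio.h>

  #define WARN_ON_ONCE_DEMO(cond) ({                              \
          static bool __warned;                                   \
          bool __cond = (cond);                                   \
          if (__cond && !__warned) {                               \
                  __warned = true;                                 \
                  fprintf(stderr, "WARNING at %s:%d: %s\n",        \
                          __FILE__, __LINE__, #cond);              \
          }                                                        \
          __cond;                                                  \
  })

  int main(void)
  {
          for (int i = 0; i < 3; i++)
                  WARN_ON_ONCE_DEMO(i >= 0);  /* complains once, stays quiet after */
          return 0;
  }

Because the condition is still evaluated on every call, the conversion only affects how noisy the log gets, not what the code does.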
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 77a893260fd8..14550ac610c6 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1247,7 +1247,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - WARN_ON(io->op.i_sectors_delta > 0); + WARN_ON_ONCE(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up @@ -1434,8 +1434,8 @@ do_io: sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; -- cgit From 3a306f3c2c303febffefea4caf09b2326107507a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Mar 2022 21:35:51 -0400 Subject: bcachefs: Fix large key cache keys Previously, we'd go into an infinite loop when attempting to cache a bkey in the key cache larger than 128 u64s - since we were only using a u8 for the size field, it'd get rounded up to 256 then truncated to 0. Oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_leaf.c | 5 +++-- fs/bcachefs/trace.h | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 51eb686331bf..a8b08955cedc 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -323,7 +323,7 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - u8 u64s; + u16 u64s; bool valid; u32 btree_trans_barrier_seq; struct bkey_cached_key key; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cb0cab71b99b..e482d1b5cdc1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -351,7 +351,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - unsigned new_u64s; + unsigned old_u64s = ck->u64s, new_u64s; struct bkey_i *new_k; EBUG_ON(path->level); @@ -385,7 +385,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, * transaction restart: */ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, - path->btree_id, &path->pos); + path->btree_id, &path->pos, + old_u64s, new_u64s); /* * Not using btree_trans_restart() because we can't unlock here, we have * write locks held: diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index af3785254c71..bb938dd8cdf2 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -918,12 +918,46 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_key_cache_key_realloced, +TRACE_EVENT(trans_restart_key_cache_key_realloced, TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct bpos *pos, + unsigned old_u64s, + unsigned new_u64s), + TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + 
__field(enum btree_id, btree_id ) + __field(u64, inode ) + __field(u64, offset ) + __field(u32, snapshot ) + __field(u32, old_u64s ) + __field(u32, new_u64s ) + ), + + TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->inode = pos->inode; + __entry->offset = pos->offset; + __entry->snapshot = pos->snapshot; + __entry->old_u64s = old_u64s; + __entry->new_u64s = new_u64s; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->inode, + __entry->offset, + __entry->snapshot, + __entry->old_u64s, + __entry->new_u64s) ); #endif /* _TRACE_BCACHEFS_H */ -- cgit From 74b33393db54dd321c807c621fad966eb9ec54e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Mar 2022 23:34:11 -0400 Subject: bcachefs: x-macro metadata version enum Now we've got strings for metadata versions - this changes bch2_sb_to_text() and our mount log message to use it. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 28 ++++++++++++++++------------ fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/opts.c | 7 ++++++- fs/bcachefs/opts.h | 1 + fs/bcachefs/super-io.c | 8 ++++---- fs/bcachefs/super.c | 2 +- 7 files changed, 30 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ace3df19950d..2c9243031dab 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1279,19 +1279,23 @@ struct bch_sb_field_journal_seq_blacklist { #define BCH_JSET_VERSION_OLD 2 #define BCH_BSET_VERSION_OLD 3 +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, 10) \ + x(inode_btree_change, 11) \ + x(snapshot, 12) \ + x(inode_backpointers, 13) \ + x(btree_ptr_sectors_written, 14) \ + x(snapshot_2, 15) \ + x(reflink_p_fix, 16) \ + x(subvol_dirent, 17) \ + x(inode_v2, 18) + enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, - bcachefs_metadata_version_new_versioning = 10, - bcachefs_metadata_version_bkey_renumber = 10, - bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_snapshot = 12, - bcachefs_metadata_version_inode_backpointers = 13, - bcachefs_metadata_version_btree_ptr_sectors_written = 14, - bcachefs_metadata_version_snapshot_2 = 15, - bcachefs_metadata_version_reflink_p_fix = 16, - bcachefs_metadata_version_subvol_dirent = 17, - bcachefs_metadata_version_inode_v2 = 18, - bcachefs_metadata_version_max = 19, + bcachefs_metadata_version_min = 9, +#define x(t, n) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 887a1b145cdc..c8f6e120fb43 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1887,7 +1887,7 @@ do_write: BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = c->sb.version < bcachefs_metadata_version_new_versioning + i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) : cpu_to_le16(c->sb.version); SET_BSET_OFFSET(i, b->written); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e3b3d0b72232..ba43e5771c7c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1574,7 +1574,7 @@ void bch2_journal_write(struct closure *cl) BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e78d3b75f6fb..ce5cb7edcbd3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -9,7 +9,12 @@ #include "super-io.h" #include "util.h" -#define x(t, n) #t, +#define x(t, n) [n] = #t, + +const char * const bch2_metadata_versions[] = { + BCH_METADATA_VERSIONS() + NULL +}; const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index fffe3e066864..eeab4bb22597 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -8,6 +8,7 @@ #include #include "bcachefs_format.h" +extern const char * const bch2_metadata_versions[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bb61a288b7fd..eaa54167d6b3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -263,7 +263,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) int ret; version = le16_to_cpu(sb->version); - version_min = version >= bcachefs_metadata_version_new_versioning + version_min = version >= bcachefs_metadata_version_bkey_renumber ? le16_to_cpu(sb->version_min) : version; @@ -518,7 +518,7 @@ reread: } version = le16_to_cpu(sb->sb->version); - version_min = version >= bcachefs_metadata_version_new_versioning + version_min = version >= bcachefs_metadata_version_bkey_renumber ? le16_to_cpu(sb->sb->version_min) : version; @@ -1552,12 +1552,12 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "Version:"); pr_tab(out); - pr_buf(out, "%u", le16_to_cpu(sb->version)); + pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); pr_newline(out); pr_buf(out, "Oldest version on disk:"); pr_tab(out); - pr_buf(out, "%u", le16_to_cpu(sb->version_min)); + pr_buf(out, "%u", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); pr_newline(out); pr_buf(out, "Created:"); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 56b01624d5fb..e4201aecdba1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -897,7 +897,7 @@ static void print_mount_opts(struct bch_fs *c) if (!p.pos) pr_buf(&p, "(null)"); - bch_info(c, "mounted with opts: %s", p.buf); + bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); printbuf_exit(&p); } -- cgit From 63c4b25453828ee0670162d35f928ab43635e7fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 00:15:38 -0400 Subject: bcachefs: Better superblock opt validation This moves validation of superblock options to bch2_sb_validate(), so they'll be checked in the write path as well. 
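The interface change is the interesting part: instead of printing its own "invalid %s%s" message, bch2_opt_validate() now renders the complaint into a caller-supplied printbuf (or stays silent when it's NULL, which is what the sysfs and xattr callers pass), so the mount path, sysfs and superblock validation can each report the failure in their own style. A minimal sketch of the mount-option path after this patch, using only the calls visible in the diff below:

  struct printbuf err = PRINTBUF;
  u64 v;
  int ret;

  ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
  if (ret < 0) {
          /* err.buf now holds something like "<option>: too small (min 1)" */
          pr_err("Invalid mount option %s", err.buf);
          printbuf_exit(&err);
          return ret;
  }
  printbuf_exit(&err);

(The real bch2_parse_mount_opts() does more than this - the sketch only shows the error-reporting shape.)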
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 50 ++++++++++++++++++++++---------------------------- fs/bcachefs/opts.h | 5 +++-- fs/bcachefs/super-io.c | 16 ++++++++++++++++ fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/xattr.c | 2 +- 5 files changed, 43 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index ce5cb7edcbd3..77fbb7d2194e 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -224,42 +224,43 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { - if (msg) - pr_err("invalid %s%s: too small (min %llu)", - msg, opt->attr.name, opt->min); + if (err) + pr_buf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); return -ERANGE; } if (opt->max && v >= opt->max) { - if (msg) - pr_err("invalid %s%s: too big (max %llu)", - msg, opt->attr.name, opt->max); + if (err) + pr_buf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); return -ERANGE; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (msg) - pr_err("invalid %s %s: not a multiple of 512", - msg, opt->attr.name); + if (err) + pr_buf(err, "%s: not a multiple of 512", + opt->attr.name); return -EINVAL; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (msg) - pr_err("invalid %s%s: must be a power of two", - msg, opt->attr.name); + if (err) + pr_buf(err, "%s: must be a power of two", + opt->attr.name); return -EINVAL; } return 0; } -int bch2_opt_parse(struct bch_fs *c, const char *msg, +int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, - const char *val, u64 *res) + const char *val, u64 *res, + struct printbuf *err) { ssize_t ret; @@ -292,7 +293,7 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, return ret; } - return bch2_opt_validate(opt, msg, *res); + return bch2_opt_validate(opt, *res, err); } void bch2_opt_to_text(struct printbuf *out, @@ -372,6 +373,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, char *copied_opts, *copied_opts_start; char *opt, *name, *val; int ret, id; + struct printbuf err = PRINTBUF; u64 v; if (!options) @@ -391,8 +393,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, "mount option ", - &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); if (ret < 0) goto bad_val; } else { @@ -435,7 +436,7 @@ bad_opt: ret = -1; goto out; bad_val: - pr_err("Invalid value %s for mount option %s", val, name); + pr_err("Invalid mount option %s", err.buf); ret = -1; goto out; no_val: @@ -444,6 +445,7 @@ no_val: goto out; out: kfree(copied_opts_start); + printbuf_exit(&err); return ret; } @@ -470,22 +472,14 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { unsigned id; - int ret; for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - u64 v; if (opt->get_sb == BCH2_NO_SB_OPT) continue; - v = bch2_opt_from_sb(sb, id); - - ret = bch2_opt_validate(opt, "superblock option ", v); - if (ret) - return ret; - - bch2_opt_set_by_id(opts, id, v); + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); } return 0; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index eeab4bb22597..69ca75429943 100644 --- a/fs/bcachefs/opts.h +++ 
b/fs/bcachefs/opts.h @@ -489,8 +489,9 @@ void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, - const char *, u64 *); +int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, + const char *, u64 *, struct printbuf *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index eaa54167d6b3..224653f129f8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -258,6 +258,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *mi; + enum bch_opt_id opt_id; u32 version, version_min; u16 block_size; int ret; @@ -329,6 +330,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) return -EINVAL; } + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, opt_id); + + pr_buf(out, "Invalid option "); + ret = bch2_opt_validate(opt, v, out); + if (ret) + return ret; + + printbuf_reset(out); + } + } + /* validate layout */ ret = validate_sb_layout(&sb->layout, out); if (ret) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index afcb5ad1aa62..dc67506e08d7 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -624,7 +624,7 @@ STORE(bch2_fs_opts_dir) goto err; } - ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); kfree(tmp); if (ret < 0) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ecce10342126..270276a0289f 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, NULL, opt, buf, &v); + ret = bch2_opt_parse(c, opt, buf, &v, NULL); kfree(buf); if (ret < 0) -- cgit From 7a6f4411aeaae888a7b2880f0c046f9efb0d83cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 00:27:10 -0400 Subject: bcachefs: Make minimum journal_flush_delay nonzero We're seeing a very strange bug where journal_flush_delay sometimes gets set to 0 in the superblock. Together with the preceding patch, this should help us track it down. 
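With the validation reworked in the previous patch, OPT_UINT(1, U32_MAX) gives an accepted range of [1, U32_MAX) - bch2_opt_validate() returns -ERANGE for anything below the minimum - so a zeroed journal_flush_delay can no longer slide through quietly. Roughly (the Opt_journal_flush_delay id is assumed here, following the usual Opt_<name> convention; it isn't spelled out in this patch):

  struct printbuf err = PRINTBUF;
  int ret = bch2_opt_validate(&bch2_opt_table[Opt_journal_flush_delay], 0, &err);
  /* ret == -ERANGE, err.buf reads roughly "journal_flush_delay: too small (min 1)" */
  printbuf_exit(&err);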
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 69ca75429943..b45740ec3c67 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -275,7 +275,7 @@ enum opt_type { NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ + OPT_UINT(1, U32_MAX), \ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ -- cgit From b0be2fcfb425022025203c27ed75a20db8c6feda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Mar 2022 20:12:53 -0400 Subject: bcachefs: Change journal_io.c assertion to error message Something funny is going on with the new code for restoring the journal write point, and it's hard to reproduce. We do want to debug this because resuming writing to the journal in the wrong spot could be something serious. For now, replace the assertion with an error message and revert to old behaviour when it happens. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ba43e5771c7c..b2c3ee336c1f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -964,8 +964,16 @@ static void bch2_journal_read_device(struct closure *cl) } mutex_unlock(&jlist->lock); - BUG_ON(ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size); + if (ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size) { + bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; + } /* * Set dirty_idx to indicate the entire journal is full and needs to be -- cgit From b8559f1a212a7035b430b83e0a01e94a872adc23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 03:03:03 -0400 Subject: bcachefs: Reset journal flush delay to default value if zeroed We've been seeing a very strange bug where journal flush & reclaim delay end up getting inexplicably zeroed, in the superblock. We're now validating all the options in bch2_validate_super(), and 0 is no longer a valid value for those options, but we need to be careful not to prevent people's filesystems from mounting because of the new validation. 
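Put differently, the repair is deliberately read-side only: when a superblock is read in, zeroed journal_flush_delay / journal_reclaim_delay fields are quietly reset to 1000 so existing filesystems keep mounting, while bch2_write_super() still validates with rw == WRITE - which skips the repair - so a journal_flush_delay that somehow goes back to zero will fail validation loudly instead of being papered over again.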
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 224653f129f8..6d54319a95e6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -253,7 +253,8 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; @@ -330,6 +331,18 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) return -EINVAL; } + if (rw == READ) { + /* + * Been seeing a bug where these are getting inexplicably + * zeroed, so we'r now validating them, but we have to be + * careful not to preven people's filesystems from mounting: + */ + if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + } + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; @@ -696,7 +709,7 @@ got_super: ret = 0; sb->have_layout = true; - ret = bch2_sb_validate(sb, &err); + ret = bch2_sb_validate(sb, &err, READ); if (ret) { printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", path, err.buf); @@ -808,7 +821,7 @@ int bch2_write_super(struct bch_fs *c) for_each_online_member(ca, c, i) { printbuf_reset(&err); - ret = bch2_sb_validate(&ca->disk_sb, &err); + ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); percpu_ref_put(&ca->io_ref); -- cgit From 3756111d138b6c5983d0c7cc2de12a7ec3d1e3d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 18:05:39 -0400 Subject: bcachefs: Add printf format attribute to bch2_pr_buf() This tells the compiler to check printf format strings, and catches a few bugs. 
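The two fixups in this patch are exactly the kind of mistake the attribute catches: passing a struct printbuf where the format string wants a char * (buf vs buf.buf in btree_io.c), and printing a string table entry with %u instead of %s (super-io.c). A tiny standalone illustration - the names are made up for the demo, only the attribute usage mirrors bch2_pr_buf():

  #include <stdarg.h>
  #include <stdio.h>

  struct printbuf { char *buf; };

  /* format(printf, 2, 3): argument 2 is the format string, varargs start at 3 */
  static void pr_buf_demo(struct printbuf *out, const char *fmt, ...)
          __attribute__((format(printf, 2, 3)));

  static void pr_buf_demo(struct printbuf *out, const char *fmt, ...)
  {
          va_list args;

          (void)out;              /* demo prints instead of appending to out->buf */
          va_start(args, fmt);
          vprintf(fmt, args);
          va_end(args);
  }

  int main(void)
  {
          struct printbuf buf = { "some bkey" };

          pr_buf_demo(&buf, "invalid bkey %s\n", buf.buf);  /* ok */
          /* pr_buf_demo(&buf, "invalid bkey %s\n", buf);      -Wformat: %s expects char *, got struct printbuf */
          /* pr_buf_demo(&buf, "version %u\n", buf.buf);       -Wformat: %u expects unsigned int, got char *    */
          return 0;
  }

With the attribute in place, both commented-out calls draw -Wformat warnings at compile time; without it they compile silently and misbehave at runtime.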
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/util.h | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c8f6e120fb43..b6f0f6dec8e8 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1069,7 +1069,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey %s: %s", buf, invalid); + "invalid bkey %s: %s", buf.buf, invalid); printbuf_exit(&buf); btree_keys_account_key_drop(&b->nr, 0, k); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6d54319a95e6..5c87c7308274 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1586,7 +1586,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "Oldest version on disk:"); pr_tab(out); - pr_buf(out, "%u", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); pr_newline(out); pr_buf(out, "Created:"); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 465ba030133b..f2df4d7fbec9 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -282,7 +282,8 @@ static inline size_t printbuf_linelen(struct printbuf *buf) return buf->pos - buf->last_newline; } -void bch2_pr_buf(struct printbuf *out, const char *fmt, ...); +void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); #define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) -- cgit From fd1e9c69959ec2dd0946f3d2285548c87f9d6ea3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 19:34:48 -0400 Subject: bcachefs: Fix an unitialized var warning in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c9204cab055d..2cc56979fcb3 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1141,7 +1141,7 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; - struct bio *bio; + struct bio *bio = NULL; bool skip_put = true; unsigned nofs_flags; int ret; -- cgit From 30690c441a6b481aadb0284d1b9bf487f8d28bfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Mar 2022 12:31:22 -0400 Subject: bcachefs: Heap code fix When deleting an entry from a heap that was at entry h->used - 1, we'd end up calling heap_sift() on an entry outside the heap - the entry we just removed - which would end up re-adding it to the heap and deleting something we didn't want to delete. Oops... 
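A concrete walk-through of the off-by-one: take a heap with used == 4 and delete the element at index 3, i.e. the last one. used drops to 3, then the old code did heap_swap(h, 3, 3) - a no-op - and called heap_sift_up()/heap_sift_down() on index 3, which now sits just past the end of the live heap. If the just-removed element happens to compare favourably with its parent, sift_up swaps it back under used and pushes a still-live element out to index 3 - precisely the "re-adding it to the heap and deleting something we didn't want to delete" above. With the new (_i) < (h)->used guard, deleting the last element is just the used-- and nothing else; deletions from the middle keep the usual swap-and-sift.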
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index f2df4d7fbec9..fd776fb281b7 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -210,9 +210,11 @@ do { \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used, set_backpointer); \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - heap_sift_down(h, _i, cmp, set_backpointer); \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ + } \ } while (0) #define heap_pop(h, d, cmp, set_backpointer) \ -- cgit From 7fda0f08fa86731f057367ca36054d29d0c0344c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Mar 2022 16:21:26 -0400 Subject: bcachefs: Work around a journal self-deadlock bch2_journal_space_available -> bch2_journal_halt() self deadlocks on journal lock; work around this by dropping/retaking journal lock before we call bch2_fatal_error(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 56b0c018ac26..b30730ce58c5 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -216,7 +216,14 @@ void bch2_journal_space_available(struct journal *j) bch_err(c, "journal stuck\n%s", buf.buf); printbuf_exit(&buf); + /* + * Hack: bch2_fatal_error() calls bch2_journal_halt() which + * takes journal lock: + */ + spin_unlock(&j->lock); bch2_fatal_error(c); + spin_lock(&j->lock); + ret = cur_entry_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; -- cgit From 81cdc8f3070561786a906c66d697666e6a3319b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Mar 2022 16:31:26 -0400 Subject: bcachefs: Fix error path in bch2_snapshot_set_equiv() We weren't properly catching errors from snapshot_live() - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 2c5f7e7793a7..20c6b21e54d3 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -139,7 +139,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { u32 id = k.k->p.offset, child[2]; - unsigned nr_live = 0, live_idx; + unsigned nr_live = 0, live_idx = 0; if (k.k->type != KEY_TYPE_snapshot) continue; @@ -151,7 +151,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) for (i = 0; i < 2; i++) { ret = snapshot_live(trans, child[i]); if (ret < 0) - break; + goto err; if (ret) live_idx = i; @@ -162,6 +162,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) ? snapshot_t(c, child[live_idx])->equiv : id; } +err: bch2_trans_iter_exit(trans, &iter); if (ret) -- cgit From 7071878bab9cbb38b03ac8612af9ea3021c34137 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Mar 2022 13:10:03 -0400 Subject: bcachefs: Add a missing btree_path_set_dirty() calls bch2_btree_iter_next_node() was mucking with other btree_path state without setting path->update to be consistent with the fact that the path is very much no longer uptodate - oops. 
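The invariant both hunks are restoring: any time a path drops node locks or bumps path->level outside the normal traversal code, its state has to be knocked back to BTREE_ITER_NEED_TRAVERSE so the next bch2_btree_path_traverse() re-walks and re-locks from scratch; a path left looking uptodate after that kind of surgery invites later code to trust path->l[] entries that are no longer locked.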
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++++ fs/bcachefs/btree_update_interior.c | 1 + 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cfaab8cbcad0..bfba10b1c127 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1701,6 +1701,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, l = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, l)) { + BUG_ON(!btree_node_locked(path, l)); /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1906,6 +1907,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, int i; BUG_ON(trans->restarted); + bch2_trans_verify_locks(trans); + btree_trans_sort_paths(trans); btree_trans_sort_paths(trans); @@ -2098,6 +2101,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_node_unlock(path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); return NULL; } @@ -2105,6 +2109,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) __bch2_btree_path_unlock(path); path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, path->btree_id, &path->pos); btree_trans_restart(trans); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 43022b340f4e..53e35d878657 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1938,6 +1938,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, btree_node_unlock(iter2.path, iter2.path->level); path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; iter2.path->level++; + btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); trans->paths_sorted = false; -- cgit From d864842581e70d2280f2a51ceb2ad6e7cefeed94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Mar 2022 13:47:07 -0400 Subject: bcachefs: btree_path_make_mut() clears should_be_locked This fixes a bug where __bch2_btree_node_update_key() wasn't clearing should_be_locked, leading to bch2_btree_path_traverse() always failing - all callers of btree_path_make_mut() want should_be_locked cleared, so do it there. 
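Doing the clear in bch2_btree_path_make_mut() itself - the inline wrapper now sets path->should_be_locked = false unconditionally, whether or not the path had to be cloned - means every current and future caller gets the right behaviour for free instead of each call site having to remember it, which is exactly how __bch2_btree_node_update_key() got it wrong.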
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++---- fs/bcachefs/btree_iter.h | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bfba10b1c127..3a4ed2f70cc2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1662,9 +1662,8 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr return new; } -struct btree_path * __must_check -__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent) +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent) { __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); @@ -1672,6 +1671,7 @@ __bch2_btree_path_make_mut(struct btree_trans *trans, #ifdef CONFIG_BCACHEFS_DEBUG path->ip_allocated = _RET_IP_; #endif + path->should_be_locked = false; return path; } @@ -1688,7 +1688,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, path = bch2_btree_path_make_mut(trans, path, intent); path->pos = new_pos; - path->should_be_locked = false; trans->paths_sorted = false; if (unlikely(path->cached)) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 30a2a2cef29b..29c1df83b35e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -149,6 +149,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans, { if (path->ref > 1 || path->preserve) path = __bch2_btree_path_make_mut(trans, path, intent); + path->should_be_locked = false; return path; } -- cgit From 2a6870ada4340c3a72e381143bc67252f4d243a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Mar 2022 16:29:10 -0400 Subject: bcachefs: Use darray for extra_journal_entries Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++-- fs/bcachefs/btree_types.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 28 +++++++++++++++++++--------- fs/bcachefs/btree_update_leaf.c | 17 ++++++++--------- 4 files changed, 32 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3a4ed2f70cc2..1c0560ecd120 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3055,8 +3055,7 @@ void bch2_trans_begin(struct btree_trans *trans) trans->mem_top = 0; trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; + trans->extra_journal_entries.nr = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; @@ -3196,6 +3195,8 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&c->journal, &trans->journal_preres); + kfree(trans->extra_journal_entries.data); + if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a8b08955cedc..e848b153ae93 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -7,6 +7,7 @@ #include "bkey_methods.h" #include "buckets_types.h" +#include "darray.h" #include "journal_types.h" #include "six.h" @@ -416,8 +417,7 @@ struct btree_trans { /* update path: */ struct btree_trans_commit_hook *hooks; - struct jset_entry *extra_journal_entries; - unsigned extra_journal_entry_u64s; + DARRAY(u64) extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 
53e35d878657..e0af39ee4b47 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -518,8 +518,15 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, struct bkey_i *k; int ret; - trans->extra_journal_entries = (void *) &as->journal_entries[0]; - trans->extra_journal_entry_u64s = as->journal_u64s; + ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + if (ret) + return ret; + + memcpy(&darray_top(trans->extra_journal_entries), + as->journal_entries, + as->journal_u64s * sizeof(u64)); + trans->extra_journal_entries.nr += as->journal_u64s; + trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { @@ -1905,7 +1912,6 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter2 = { NULL }; struct btree *parent; - u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; if (!skip_triggers) { @@ -1949,12 +1955,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - trans->extra_journal_entries = (void *) &journal_entries[0]; - trans->extra_journal_entry_u64s = - journal_entry_set((void *) &journal_entries[0], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - new_key, new_key->k.u64s); + ret = darray_make_room(&trans->extra_journal_entries, + jset_u64s(new_key->k.u64s)); + if (ret) + return ret; + + journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e482d1b5cdc1..d6ec3f6c9be8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -707,13 +707,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->extra_journal_entry_u64s)) { + if (unlikely(trans->extra_journal_entries.nr)) { memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries, - trans->extra_journal_entry_u64s); + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); - trans->journal_res.offset += trans->extra_journal_entry_u64s; - trans->journal_res.u64s -= trans->extra_journal_entry_u64s; + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; } /* @@ -1096,7 +1096,7 @@ int __bch2_trans_commit(struct btree_trans *trans) int ret = 0; if (!trans->nr_updates && - !trans->extra_journal_entry_u64s) + !trans->extra_journal_entries.nr) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) @@ -1120,7 +1120,7 @@ int __bch2_trans_commit(struct btree_trans *trans) memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entry_u64s; + trans->journal_u64s = trans->extra_journal_entries.nr; trans->journal_preres_u64s = 0; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); @@ -1180,8 +1180,7 @@ out_reset: trans->extra_journal_res = 0; trans->nr_updates = 0; trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; + trans->extra_journal_entries.nr = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; -- cgit From 
5aabb32442c44c9cfc41a2c29638aebeb3e9cb26 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Mar 2022 15:44:12 -0400 Subject: bcachefs: bch2_trans_log_msg() Add a new helper for logging messages to the journal - a new debugging tool, an alternative to trace_printk(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d9a406a28f47..3cf4cc4f2350 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -83,6 +83,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); +int bch2_trans_log_msg(struct btree_trans *, const char *); + /** * bch2_trans_commit - insert keys at given iterator positions * diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d6ec3f6c9be8..2640d3e38a76 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1795,3 +1795,30 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, bch2_btree_delete_range_trans(&trans, id, start, end, update_flags, journal_seq)); } + +int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) +{ + unsigned len = strlen(msg); + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct jset_entry_log *l; + int ret; + + ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); + if (ret) + return ret; + + l = (void *) &darray_top(trans->extra_journal_entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, msg, len); + while (len & 7) + l->d[len++] = '\0'; + + trans->extra_journal_entries.nr += jset_u64s(u64s); + return 0; +} -- cgit From 1296ab552040e09b35f37b8c523d4f1711ac8435 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Mar 2022 23:40:19 -0400 Subject: bcachefs: Improve bch2_bkey_ptrs_to_text() Print bucket:offset when the filesystem is online; this makes debugging easier when correlating with alloc updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 22 ++++++++++++++-------- fs/bcachefs/super.h | 6 ++++++ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c78e10e8ec2c..01d14645579b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -953,15 +953,21 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - - if (c) { - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, + b, offset, ptr->gen, + ptr->cached ? 
" cached" : ""); if (ca && ptr_stale(ca, ptr)) pr_buf(out, " stale"); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 6414f6a6bb91..359fa1e7fc18 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} + static inline bool bch2_dev_is_online(struct bch_dev *ca) { return !percpu_ref_is_zero(&ca->io_ref); -- cgit From 78668fe0bbd9bd04c0dbc7b9f60dd2c36a9a16a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Mar 2022 00:03:37 -0400 Subject: bcachefs: Move deletion of refcount=0 indirect extents to their triggers For backpointers, we need to switch the order triggers are run in: we need to run triggers for deletions/overwrites before triggers for inserts. To avoid breaking the reflink triggers, this patch moves deleting of indirect extents with refcount=0 to their triggers, instead of doing it when we update those keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 5 ----- fs/bcachefs/reflink.c | 36 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/reflink.h | 8 +++++++- 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a681a6045dc9..0f2dd4b8b47d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1798,11 +1798,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, le64_add_cpu(refcount, add); - if (!*refcount) { - n->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&n->k, 0); - } - bch2_btree_iter_set_pos_to_extent_start(&iter); ret = bch2_trans_update(trans, &iter, n, 0); if (ret) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c8d6d73681e0..6824730945d4 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -98,6 +98,24 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); } +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + return 0; + } + } + + return bch2_trans_mark_extent(trans, old, new, flags); +} + /* indirect inline data */ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -119,6 +137,24 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } +int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_indirect_inline_data *r = + bkey_i_to_indirect_inline_data(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + } + } + + return 0; +} + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *orig) diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4da4330014a8..8eb41c0292eb 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -18,12 +18,14 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); const char 
*bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_extent, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ .atomic_trigger = bch2_mark_extent, \ } @@ -31,10 +33,14 @@ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + struct bkey_s_c, struct bkey_i *, + unsigned); #define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ } static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) -- cgit From f13fd87a39225eae57d4ddf824a09acb1955abd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Mar 2022 23:39:48 -0400 Subject: bcachefs: Run overwrite triggers before insert For backpointers, we'll need to delete old backpointers before adding new backpointers - otherwise we'll run into spurious duplicate backpointer errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 47 ++++++++++++++++------------------------- fs/bcachefs/trace.h | 8 ------- 2 files changed, 18 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2640d3e38a76..f534d7e649fd 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -464,7 +464,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, } static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) + bool overwrite) { /* * Transactional triggers create new btree_insert_entries, so we can't @@ -473,42 +473,31 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ */ struct bkey old_k = i->old_k; struct bkey_s_c old = { &old_k, i->old_v }; - int ret = 0; if ((i->flags & BTREE_TRIGGER_NORUN) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) return 0; - if (!overwrite) { - if (i->insert_trigger_run) - return 0; - - BUG_ON(i->overwrite_trigger_run); - i->insert_trigger_run = true; - } else { - if (i->overwrite_trigger_run) - return 0; - - BUG_ON(!i->insert_trigger_run); - i->overwrite_trigger_run = true; - } - - if (overwrite) { - ret = bch2_trans_mark_old(trans, old, i->flags); - } else if (bch2_bkey_ops[old.k->type].trans_trigger == - bch2_bkey_ops[i->k->k.type].trans_trigger && + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + bch2_bkey_ops[old.k->type].trans_trigger == + bch2_bkey_ops[i->k->k.type].trans_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, i->k, - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return 
bch2_trans_mark_old(trans, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; } else { - ret = bch2_trans_mark_new(trans, i->k, i->flags); + return 0; } - - if (ret == -EINTR) - trace_trans_restart_mark(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); - return ret ?: 1; } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, @@ -518,7 +507,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, bool trans_trigger_run; int ret, overwrite; - for (overwrite = 0; overwrite < 2; overwrite++) { + for (overwrite = 1; overwrite >= 0; --overwrite) { /* * Running triggers will append more updates to the list of updates as diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index bb938dd8cdf2..6a2626a05815 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -714,14 +714,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, TP_PROTO(const char *trans_fn, unsigned long caller_ip, -- cgit From 3e1547116fe70f49c88e1ee400966a1c7b1bec3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Mar 2022 19:27:55 -0400 Subject: bcachefs: x-macroize alloc_reserve enum This makes an array of strings available, like our other enums. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/alloc_foreground.c | 27 +++++++++++++++++---------- fs/bcachefs/alloc_foreground.h | 2 ++ fs/bcachefs/alloc_types.h | 15 ++++++++++----- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/ec.c | 8 ++++---- fs/bcachefs/io.h | 4 ++-- fs/bcachefs/journal.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/movinggc.c | 21 +++------------------ fs/bcachefs/sysfs.c | 4 ++-- fs/bcachefs/trace.h | 22 +++++++++++----------- 13 files changed, 59 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fac040aa0d5a..a53aeb4ee648 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -780,7 +780,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) * Don't strand buckets on the copygc freelist until * after recovery is finished: */ - if (i == RESERVE_MOVINGGC && + if (i == RESERVE_movinggc && !test_bit(BCH_FS_STARTED, &c->flags)) continue; @@ -941,7 +941,7 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. */ - for (j = 0; j < RESERVE_NONE; j++) + for (j = 0; j < RESERVE_none; j++) dev_reserve += ca->free[j].size; dev_reserve += 1; /* btree write point */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index dc2f153f60c6..76a4b8029bdf 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -27,6 +27,13 @@ #include #include +const char * const bch2_alloc_reserves[] = { +#define x(t) #t, + BCH_ALLOC_RESERVES() +#undef x + NULL +}; + /* * Open buckets represent a bucket that's currently being allocated from. 
They * serve two purposes: @@ -168,10 +175,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_BTREE: - case RESERVE_BTREE_MOVINGGC: + case RESERVE_btree: + case RESERVE_btree_movinggc: return 0; - case RESERVE_MOVINGGC: + case RESERVE_movinggc: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; @@ -219,17 +226,17 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, reserve); + trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) + if (likely(fifo_pop(&ca->free[RESERVE_none], b))) goto out; switch (reserve) { - case RESERVE_BTREE_MOVINGGC: - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) + case RESERVE_btree_movinggc: + case RESERVE_movinggc: + if (fifo_pop(&ca->free[RESERVE_movinggc], b)) goto out; break; default: @@ -244,7 +251,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, spin_unlock(&c->freelist_lock); - trace_bucket_alloc_fail(ca, reserve); + trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); return ERR_PTR(-FREELIST_EMPTY); out: verify_not_on_freelist(c, ca, b); @@ -282,7 +289,7 @@ out: bch2_wake_allocator(ca); - trace_bucket_alloc(ca, reserve); + trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); return ob; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index d466bda9afc8..3598c70b93b4 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -12,6 +12,8 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +extern const char * const bch2_alloc_reserves[]; + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 409232e3d998..e3a3eb271158 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -22,12 +22,17 @@ enum allocator_states { #undef x }; +#define BCH_ALLOC_RESERVES() \ + x(btree_movinggc) \ + x(btree) \ + x(movinggc) \ + x(none) + enum alloc_reserve { - RESERVE_BTREE_MOVINGGC = -2, - RESERVE_BTREE = -1, - RESERVE_MOVINGGC = 0, - RESERVE_NONE = 1, - RESERVE_NR = 2, +#define x(name) RESERVE_##name, + BCH_ALLOC_RESERVES() +#undef x + RESERVE_NR }; typedef FIFO(long) alloc_fifo; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e0af39ee4b47..1c53f965539d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -194,10 +194,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_BTREE_MOVINGGC; + alloc_reserve = RESERVE_btree_movinggc; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_btree; } mutex_lock(&c->btree_reserve_cache_lock); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0f2dd4b8b47d..8eeabb5a66bd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2091,9 +2091,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO))) || - !init_fifo(&free[RESERVE_MOVINGGC], + 
!init_fifo(&free[RESERVE_movinggc], copygc_reserve, GFP_KERNEL) || - !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || + !init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) || !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) goto err; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b220b523d856..9dc2f9f822c8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1307,8 +1307,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_parity, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? RESERVE_movinggc + : RESERVE_none, 0, cl); @@ -1336,8 +1336,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_data, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? RESERVE_movinggc + : RESERVE_none, 0, cl); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 8be77561badb..f8ce9543c9e3 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -70,7 +70,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { - return op->alloc_reserve == RESERVE_MOVINGGC + return op->alloc_reserve == RESERVE_movinggc ? op->c->copygc_wq : op->c->btree_update_wq; } @@ -97,7 +97,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_NONE; + op->alloc_reserve = RESERVE_none; op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 11b44467aeab..750509661d79 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -817,7 +817,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } } else { rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, + ob = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl); rcu_read_unlock(); if (IS_ERR(ob)) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b916ee35ee37..3a5c81f3697b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -351,7 +351,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, } if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { - m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.alloc_reserve = RESERVE_movinggc; m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; } else { /* XXX: this should probably be passed in */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index dd71c0ce0a84..b43e54133b15 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -30,21 +30,6 @@ #include #include -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. 
- * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; @@ -124,7 +109,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) bool ret; spin_lock(&ca->fs->freelist_lock); - ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || + ret = fifo_full(&ca->free[RESERVE_movinggc]) || ca->allocator_state != ALLOCATOR_running; spin_unlock(&ca->fs->freelist_lock); @@ -265,7 +250,7 @@ static int bch2_copygc(struct bch_fs *c) closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); spin_lock(&ca->fs->freelist_lock); - sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; + sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size; spin_unlock(&ca->fs->freelist_lock); } @@ -281,7 +266,7 @@ static int bch2_copygc(struct bch_fs *c) } /* - * Our btree node allocations also come out of RESERVE_MOVINGGC: + * Our btree node allocations also come out of RESERVE_movingc: */ sectors_reserved = (sectors_reserved * 3) / 4; if (!sectors_reserved) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index dc67506e08d7..7e10adba5c75 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -758,8 +758,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) stats.buckets_ec, __dev_buckets_available(ca, stats), fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size, + fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size, c->freelist_wait.list.first ? 
"waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6a2626a05815..54260349c07e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -468,37 +468,37 @@ TRACE_EVENT(invalidate, ); DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve), + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), + TP_ARGS(ca, alloc_reserve), TP_STRUCT__entry( __field(dev_t, dev ) - __field(enum alloc_reserve, reserve ) + __array(char, reserve, 16 ) ), TP_fast_assign( __entry->dev = ca->dev; - __entry->reserve = reserve; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ), - TP_printk("%d,%d reserve %d", + TP_printk("%d,%d reserve %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), + TP_ARGS(ca, alloc_reserve) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), + TP_ARGS(ca, alloc_reserve) ); DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), + TP_ARGS(ca, alloc_reserve) ); /* Moving IO */ -- cgit From 70a9953c424ccba616a3b74368780de13a80dabd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jan 2023 10:13:37 -0500 Subject: bcachefs: Fix bch2_journal_pin_set() When bch2_journal_pin_set() is updating an existing pin, we shouldn't call bch2_journal_reclaim_fast() after dropping the old pin and before dropping the new pin - that could reclaim the entry we're trying to pin. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index b30730ce58c5..f55fc0b11977 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -346,13 +346,13 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) } } -static inline void __journal_pin_drop(struct journal *j, +static inline bool __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) - return; + return false; if (j->flush_in_progress == pin) j->flush_in_progress_dropped = true; @@ -365,16 +365,16 @@ static inline void __journal_pin_drop(struct journal *j, * Unpinning a journal entry make make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: */ - if (atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin); } void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { spin_lock(&j->lock); - __journal_pin_drop(j, pin); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); } @@ -383,6 +383,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, journal_pin_flush_fn flush_fn) { struct journal_entry_pin_list *pin_list; + bool reclaim; spin_lock(&j->lock); @@ -399,7 +400,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin_list = journal_seq_pin(j, seq); - __journal_pin_drop(j, pin); + reclaim = __journal_pin_drop(j, pin); atomic_inc(&pin_list->count); pin->seq = seq; @@ -411,6 +412,9 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, list_add(&pin->list, &pin_list->list); else list_add(&pin->list, &pin_list->flushed); + + if (reclaim) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); /* -- cgit From d905f67ec89fda758bcfa70d0b5c3d3006bbdb3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Mar 2022 16:40:55 -0400 Subject: bcachefs: Copygc allocations shouldn't be nowait We don't actually want copygc allocations to be nowait - an allocation for copygc might fail and then later succeed due to a bucket needing to wait on journal commit, or to be discarded. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3a5c81f3697b..a219c10a7135 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -352,7 +352,6 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_movinggc; - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; } else { /* XXX: this should probably be passed in */ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -- cgit From 31f63fd1244d9609265eb5cfc522c142b35cdacc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Mar 2022 21:48:42 -0400 Subject: bcachefs: Introduce a separate journal watermark for copygc Since journal reclaim -> btree key cache flushing may require the allocation of new btree nodes, it has an implicit dependency on copygc in order to make forward progress - so we should avoid blocking copygc unless the journal is really close to full. 
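The gist of the mechanism, previewed here as an illustrative, self-contained sketch (hypothetical simplified names; the real enum, mask and fast path are in the diff below): each journal reservation carries a watermark in the low bits of its flags, and the journal only admits reservations whose watermark is at least as high as its own current watermark, so copygc can still reserve space after ordinary writers are already being throttled:

	/* Sketch only - ordering matters: any < copygc < reserved. */
	enum journal_watermark_sketch { WM_any, WM_copygc, WM_reserved };
	#define WM_MASK 3	/* watermark lives in the low two bits of flags */

	struct journal_sketch { enum journal_watermark_sketch watermark; };

	/* Admit a reservation only if the caller's watermark is >= the journal's. */
	static int may_reserve(const struct journal_sketch *j, unsigned flags)
	{
		return (flags & WM_MASK) >= (unsigned) j->watermark;
	}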
This introduces watermarks to replace our single MAY_GET_UNRESERVED bit in the journal, and adds a watermark for copygc and plumbs it through. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 - fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_update.h | 7 ++--- fs/bcachefs/btree_update_interior.c | 9 +++---- fs/bcachefs/btree_update_leaf.c | 12 ++++----- fs/bcachefs/journal.c | 51 ++++++++++++++++++++++------------- fs/bcachefs/journal.h | 53 ++++++++++++++++++------------------- fs/bcachefs/journal_reclaim.c | 8 +++--- fs/bcachefs/journal_types.h | 41 +++++++++++++++++++++------- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/recovery.c | 5 ++-- 11 files changed, 108 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a53aeb4ee648..33b2e4d7da3b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -670,7 +670,6 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, ret = bch2_trans_do(c, NULL, &commit_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| flags, bucket_invalidate_btree(&trans, ca, b, &u)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7e41552a57df..f856dee0c3aa 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -421,7 +421,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? BTREE_INSERT_JOURNAL_RESERVED + ? JOURNAL_WATERMARK_reserved : 0)| commit_flags); if (ret) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3cf4cc4f2350..ad13b0739a68 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -16,12 +16,12 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOFAIL, + /* First two bits for journal watermark: */ + __BTREE_INSERT_NOFAIL = 2, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -41,9 +41,6 @@ enum btree_insert_flags { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -/* Indicates that we have pre-reserved space in the journal: */ -#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) - /* Insert is being called from journal reclaim path: */ #define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1c53f965539d..cd4332f891dc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -599,7 +599,7 @@ static void btree_update_nodes_written(struct btree_update *as) BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, + JOURNAL_WATERMARK_reserved, btree_update_nodes_written_trans(&trans, as)); bch2_trans_exit(&trans); @@ -964,14 +964,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ? 
BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2]; unsigned update_level = level; - int journal_flags = 0; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; BUG_ON(!path->should_be_locked); - if (flags & BTREE_INSERT_JOURNAL_RESERVED) - journal_flags |= JOURNAL_RES_GET_RESERVED; - closure_init_stack(&cl); retry: nr_nodes[0] = nr_nodes[1] = 0; @@ -1972,7 +1969,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED); + JOURNAL_WATERMARK_reserved); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f534d7e649fd..90e6e5130672 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -296,11 +296,10 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - flags |= JOURNAL_RES_GET_RESERVED; - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, + flags| + (trans->flags & JOURNAL_WATERMARK_MASK)); return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } @@ -902,8 +901,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - ? JOURNAL_RES_GET_RESERVED : 0)); + (trans->flags & JOURNAL_WATERMARK_MASK)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); @@ -988,7 +986,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { + !(trans->flags & JOURNAL_WATERMARK_reserved)) { trans->restarted = true; ret = -EAGAIN; break; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 750509661d79..c7f1674ed596 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -19,6 +19,18 @@ #include "super-io.h" #include "trace.h" +#define x(n) #n, +static const char * const bch2_journal_watermarks[] = { + JOURNAL_WATERMARKS() + NULL +}; + +static const char * const bch2_journal_errors[] = { + JOURNAL_ERRORS() + NULL +}; +#undef x + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -207,19 +219,19 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return cur_entry_blocked; + return JOURNAL_ERR_blocked; if (j->cur_entry_error) return j->cur_entry_error; if (bch2_journal_error(j)) - return cur_entry_insufficient_devices; /* -EROFS */ + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ if (!fifo_free(&j->pin)) - return cur_entry_journal_pin_full; + return JOURNAL_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return cur_entry_max_in_flight; + return JOURNAL_ERR_max_in_flight; BUG_ON(!j->cur_entry_sectors); @@ -238,7 +250,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= 0) - return cur_entry_journal_full; + return JOURNAL_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -354,13 +366,12 @@ retry: return 0; } - if (!(flags & JOURNAL_RES_GET_RESERVED) 
&& - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = cur_entry_journal_full; + ret = JOURNAL_ERR_journal_full; goto unlock; } @@ -378,10 +389,10 @@ retry: __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ret = journal_entry_open(j); - if (ret == cur_entry_max_in_flight) + if (ret == JOURNAL_ERR_max_in_flight) trace_journal_entry_full(c); unlock: - if ((ret && ret != cur_entry_insufficient_devices) && + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; trace_journal_full(c); @@ -393,14 +404,15 @@ unlock: if (!ret) goto retry; - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !can_discard && !nr_unwritten_journal_entries(j) && - (flags & JOURNAL_RES_GET_RESERVED)) { + (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { struct printbuf buf = PRINTBUF; - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full"); + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", + bch2_journal_errors[ret]); bch2_journal_debug_to_text(&buf, j); bch_err(c, "%s", buf.buf); @@ -418,8 +430,8 @@ unlock: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -432,7 +444,7 @@ unlock: } } - return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; } /* @@ -1187,13 +1199,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "dirty journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1203,7 +1216,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); + pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); pr_buf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 243349f4ac1c..c287ecf643aa 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -293,9 +293,9 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -#define JOURNAL_RES_GET_CHECK (1 << 1) -#define JOURNAL_RES_GET_RESERVED (1 << 2) +/* First two bits for JOURNAL_WATERMARK: */ +#define JOURNAL_RES_GET_NONBLOCK (1 << 2) +#define JOURNAL_RES_GET_CHECK (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -316,8 +316,7 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) return 0; new.cur_entry_offset += res->u64s; @@ -370,23 +369,27 @@ out: /* journal_preres: */ -static inline bool journal_check_may_get_unreserved(struct journal *j) +static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved < s.remaining && - fifo_free(&j->pin) > j->pin.size / 4; - - lockdep_assert_held(&j->lock); - - if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - if (ret) { - set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - journal_wake(j); - } else { - clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - } - } - return ret; + unsigned watermark = JOURNAL_WATERMARK_any; + + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (fifo_free(&j->pin) < j->pin.size / 8) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (s.reserved > s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (!s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static inline void bch2_journal_preres_put(struct journal *j, @@ -406,12 +409,8 @@ static inline void bch2_journal_preres_put(struct journal *j, closure_wake_up(&j->preres_wait); } - if (s.reserved <= s.remaining && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - spin_lock(&j->lock); - journal_check_may_get_unreserved(j); - spin_unlock(&j->lock); - } + if (s.reserved <= s.remaining && j->watermark) + journal_set_watermark(j); } int __bch2_journal_preres_get(struct journal *, @@ -432,7 +431,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_RES_GET_RESERVED) || + if ((flags & JOURNAL_WATERMARK_reserved) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f55fc0b11977..e99a01e3b5fb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -195,7 +195,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if 
(nr_online < c->opts.metadata_replicas_required) { - ret = cur_entry_insufficient_devices; + ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -224,9 +224,9 @@ void bch2_journal_space_available(struct journal *j) bch2_fatal_error(c); spin_lock(&j->lock); - ret = cur_entry_journal_stuck; + ret = JOURNAL_ERR_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) - ret = cur_entry_journal_full; + ret = JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -245,7 +245,7 @@ out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); - journal_check_may_get_unreserved(j); + journal_set_watermark(j); if (!ret) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 91f829adf862..a41b915b3ac6 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -144,16 +144,45 @@ enum journal_space_from { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, }; +#define JOURNAL_WATERMARKS() \ + x(any) \ + x(copygc) \ + x(reserved) + +enum journal_watermark { +#define x(n) JOURNAL_WATERMARK_##n, + JOURNAL_WATERMARKS() +#undef x +}; + +#define JOURNAL_WATERMARK_MASK 3 + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ struct { union journal_res_state reservations; + enum journal_watermark watermark; union journal_preres_state prereserved; @@ -173,15 +202,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum { - cur_entry_ok, - cur_entry_blocked, - cur_entry_max_in_flight, - cur_entry_journal_full, - cur_entry_journal_pin_full, - cur_entry_journal_stuck, - cur_entry_insufficient_devices, - } cur_entry_error; + enum journal_errors cur_entry_error; unsigned buf_size_want; /* diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b43e54133b15..a54a83d3247b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -91,7 +91,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->target = io_opts->background_target; data_opts->nr_replicas = 1; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED; + JOURNAL_WATERMARK_copygc; data_opts->rewrite_dev = p.ptr.dev; if (p.has_ec) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 887971559214..93882e6a2ae4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -562,8 +562,9 @@ static int bch2_journal_replay(struct bch_fs *c) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), + (!k->allocated + ? 
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + : 0), bch2_journal_replay_key(&trans, k)); if (ret) { bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", -- cgit From 5f417394033a0d8bfb31d02b3becf7381dc13867 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jan 2022 23:24:43 -0500 Subject: bcachefs: bch2_btree_update_start() refactoring This simplifies the logic in bch2_btree_update_start() a bit, handling the unlock/block logic more locally. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 80 +++++++++++++++++++------------------ fs/bcachefs/trace.h | 23 ++++++----- 2 files changed, 54 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cd4332f891dc..ff56c374ff2e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -413,19 +413,24 @@ static void bch2_btree_reserve_put(struct btree_update *as) mutex_unlock(&c->btree_reserve_cache_lock); } -static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2], - unsigned flags, struct closure *cl) +static int bch2_btree_reserve_get(struct btree_update *as, + unsigned nr_nodes[2], + unsigned flags, + struct closure *cl) { struct bch_fs *c = as->c; struct btree *b; unsigned interior; - int ret; + int ret = 0; BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: */ ret = bch2_btree_cache_cannibalize_lock(c, cl); if (ret) @@ -436,9 +441,8 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2], while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(c, &as->disk_res, - flags & BTREE_INSERT_NOWAIT - ? NULL : cl, - interior, flags); + flags & BTREE_INSERT_NOWAIT ? NULL : cl, + interior, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -447,12 +451,8 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2], p->b[p->nr++] = b; } } - - bch2_btree_cache_cannibalize_unlock(c); - return 0; err: bch2_btree_cache_cannibalize_unlock(c); - trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl); return ret; } @@ -958,21 +958,18 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, { struct bch_fs *c = trans->c; struct btree_update *as; - struct closure cl; u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - unsigned nr_nodes[2]; + unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; BUG_ON(!path->should_be_locked); - closure_init_stack(&cl); -retry: - nr_nodes[0] = nr_nodes[1] = 0; - update_level = level; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; while (1) { nr_nodes[!!update_level] += 1 + split; @@ -1044,27 +1041,21 @@ retry: ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret == -EAGAIN) { + if (ret) { bch2_trans_unlock(trans); - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - bch2_btree_update_free(as); - btree_trans_restart(trans); - return ERR_PTR(ret); - } - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags); + BTREE_UPDATE_JOURNAL_RES, + journal_flags); if (ret) { trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + btree_trans_restart(trans); goto err; } - if (!bch2_trans_relock(trans)) { - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) goto err; - } } ret = bch2_disk_reservation_get(c, &as->disk_res, @@ -1074,23 +1065,34 @@ retry: if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); - if (ret) - goto err; + ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL); + if (ret) { + struct closure cl; - return as; -err: - bch2_btree_update_free(as); + closure_init_stack(&cl); - if (ret == -EAGAIN) { bch2_trans_unlock(trans); - closure_sync(&cl); - ret = -EINTR; + + do { + ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + closure_sync(&cl); + } while (ret == -EAGAIN); } - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; + if (ret) { + trace_btree_reserve_get_fail(trans->fn, _RET_IP_, + nr_nodes[0] + nr_nodes[1]); + goto err; + } + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + goto err; + } + + return as; +err: + bch2_btree_update_free(as); return ERR_PTR(ret); } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 54260349c07e..89207fd7b617 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -278,24 +278,27 @@ DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ); TRACE_EVENT(btree_reserve_get_fail, - TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), - TP_ARGS(c, required, cl), + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + size_t required), + TP_ARGS(trans_fn, caller_ip, required), TP_STRUCT__entry( - __field(dev_t, dev ) + __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) __field(size_t, required ) - __field(struct closure *, cl ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->required = required; - __entry->cl = cl; + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; ), - TP_printk("%d,%d required %zu by %p", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->required, __entry->cl) + TP_printk("%s %pS required %zu", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->required) ); DEFINE_EVENT(btree_node, btree_split, -- cgit From b17d3cec14b487924df709dbeffb900f124a2607 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2022 16:13:05 -0400 Subject: bcachefs: Run btree updates after write out of write_point In the write path, after the write to the block device(s) complete we have to punt to process context to do the btree update. 
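As a preview of the approach (a rough, simplified sketch with a hypothetical helper; the real version, with its state tracking, is in the diff below), completed write ops are queued on their write point and a single work item per write point drains them in process context:

	void write_point_do_index_updates(struct write_point *wp)
	{
		struct bch_write_op *op;

		for (;;) {
			spin_lock(&wp->writes_lock);
			/* first_ready_op(): hypothetical helper, finds an op with btree_update_ready set */
			op = first_ready_op(&wp->writes);
			if (op)
				list_del(&op->wp_list);
			spin_unlock(&wp->writes_lock);

			if (!op)
				break;

			__bch2_write_index(op);	/* the btree update, now in process context */
		}
	}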
Instead of using the work item embedded in op->cl, this patch switches to a per write-point work item. This helps with two different issues: - lock contention: btree updates to the same writepoint will (usually) be updating the same alloc keys - context switch overhead: when we're bottlenecked on btree updates, having a thread (running out of a work item) checking the write point for completed ops is cheaper than queueing up a new work item and waking up a kworker. In an arbitrary benchmark, 4k random writes with fio running inside a VM, this patch resulted in a 10% improvement in total iops. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 67 ++++++++--- fs/bcachefs/alloc_foreground.h | 19 ++-- fs/bcachefs/alloc_types.h | 46 ++++++-- fs/bcachefs/btree_update_interior.c | 9 +- fs/bcachefs/io.c | 215 ++++++++++++++++++++---------------- fs/bcachefs/io.h | 6 +- fs/bcachefs/io_types.h | 4 + fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 5 + fs/bcachefs/util.c | 8 +- fs/bcachefs/util.h | 2 + 11 files changed, 246 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 76a4b8029bdf..c4b4689fdd0f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -762,16 +762,17 @@ out: /* * Get us an open_bucket we can allocate from, return with it locked: */ -struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) +int bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) { struct write_point *wp; struct open_bucket *ob; @@ -792,7 +793,7 @@ retry: write_points_nr = c->write_points_nr; have_cache = false; - wp = writepoint_find(c, write_point.v); + *wp_ret = wp = writepoint_find(c, write_point.v); if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -848,7 +849,7 @@ alloc_done: BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - return wp; + return 0; err: open_bucket_for_each(c, &wp->ptrs, ob, i) if (ptrs.nr < ARRAY_SIZE(ptrs.v)) @@ -866,9 +867,9 @@ err: switch (ret) { case -OPEN_BUCKETS_EMPTY: case -FREELIST_EMPTY: - return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); + return cl ? 
-EAGAIN : -ENOSPC; case -INSUFFICIENT_DEVICES: - return ERR_PTR(-EROFS); + return -EROFS; default: BUG(); } @@ -895,13 +896,13 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) - { struct open_bucket *ob; unsigned i; BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); @@ -942,6 +943,10 @@ static inline void writepoint_init(struct write_point *wp, { mutex_init(&wp->lock); wp->data_type = type; + + INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); + INIT_LIST_HEAD(&wp->writes); + spin_lock_init(&wp->writes_lock); } void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -997,3 +1002,33 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) } } + +static const char * const bch2_write_point_states[] = { +#define x(n) #n, + WRITE_POINT_STATES() +#undef x + NULL +}; + +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; + unsigned i; + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) { + pr_buf(out, "%lu: ", wp->write_point); + bch2_hprint(out, wp->sectors_allocated); + + pr_buf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + pr_buf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + pr_newline(out); + } +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 3598c70b93b4..9b4389b09cbb 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -122,14 +122,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); -struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); +int bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *, + struct write_point **); struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, @@ -156,4 +157,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index e3a3eb271158..9e00afb17559 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -81,18 +81,46 @@ struct dev_stripe_state { u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; +#define WRITE_POINT_STATES() \ + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ + x(running) + +enum write_point_state { +#define x(n) WRITE_POINT_##n, + WRITE_POINT_STATES() +#undef x + WRITE_POINT_STATE_NR +}; + struct write_point { - struct hlist_node node; - struct mutex lock; - u64 last_used; - unsigned long write_point; - enum bch_data_type data_type; + struct 
{ + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; + enum bch_data_type data_type; - /* calculated based on how many pointers we're actually going to use: */ - unsigned sectors_free; + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + + struct open_buckets ptrs; + struct dev_stripe_state stripe; + + u64 sectors_allocated; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); + + struct { + struct work_struct index_update_work; + + struct list_head writes; + spinlock_t writes_lock; - struct open_buckets ptrs; - struct dev_stripe_state stripe; + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); }; struct write_point_specifier { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ff56c374ff2e..d1e3e2c76e30 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -191,6 +191,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; + int ret; if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; @@ -213,7 +214,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, + ret = bch2_alloc_sectors_start(c, c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -221,9 +222,9 @@ retry: &devs_have, res->nr_replicas, c->opts.metadata_replicas_required, - alloc_reserve, 0, cl); - if (IS_ERR(wp)) - return ERR_CAST(wp); + alloc_reserve, 0, cl, &wp); + if (unlikely(ret)) + return ERR_PTR(ret); if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 2cc56979fcb3..6bebbd44ccc8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -589,7 +589,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } } -static void __bch2_write(struct closure *); +static void __bch2_write(struct bch_write_op *); static void bch2_write_done(struct closure *cl) { @@ -686,22 +686,86 @@ err: goto out; } +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? 
WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + static void bch2_write_index(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); - __bch2_write_index(op); + barrier(); - if (!(op->flags & BCH_WRITE_DONE)) { - continue_at(cl, __bch2_write, index_update_wq(op)); - } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { - bch2_journal_flush_seq_async(&c->journal, - *op_journal_seq(op), - cl); - continue_at(cl, bch2_write_done, index_update_wq(op)); - } else { - continue_at_nobarrier(cl, bch2_write_done, NULL); + /* + * We're not using wp->writes_lock here, so this is racey: that's ok, + * because this is just for diagnostic purposes, and we're running out + * of interrupt context here so if we were to take the log we'd have to + * switch to spin_lock_irq()/irqsave(), which is not free: + */ + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + + op->btree_update_ready = true; + queue_work(wq, &wp->index_update_work); +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock(&wp->writes_lock); + list_for_each_entry(op, &wp->writes, wp_list) + if (op->btree_update_ready) { + list_del(&op->wp_list); + goto unlock; + } + op = NULL; +unlock: + wp_update_state(wp, op != NULL); + spin_unlock(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) { + __bch2_write(op); + } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + bch2_journal_flush_seq_async(&op->c->journal, + *op_journal_seq(op), + &op->cl); + continue_at(&op->cl, bch2_write_done, index_update_wq(op)); + } else { + bch2_write_done(&op->cl); + } } } @@ -734,10 +798,8 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) - closure_put(cl); else - continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); + closure_put(cl); } static void init_append_extent(struct bch_write_op *op, @@ -1136,19 +1198,18 @@ err: return ret; } -static void __bch2_write(struct closure *cl) +static void __bch2_write(struct bch_write_op *op) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - struct write_point *wp; + struct write_point *wp = NULL; struct bio *bio = NULL; - bool skip_put = true; unsigned nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); + op->btree_update_ready = false; do { struct bkey_i *key_to_write; @@ -1158,13 +1219,13 @@ again: /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) - goto flush_io; + break; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) - goto flush_io; + break; if ((op->flags & BCH_WRITE_FROM_INTERNAL) && percpu_ref_is_dying(&c->writes)) { @@ -1177,7 +1238,7 @@ again: * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - wp = bch2_alloc_sectors_start(c, + ret = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), 
op->write_point, @@ -1187,53 +1248,34 @@ again: op->alloc_reserve, op->flags, (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); - EBUG_ON(!wp); - - if (unlikely(IS_ERR(wp))) { - if (unlikely(PTR_ERR(wp) != -EAGAIN)) { - ret = PTR_ERR(wp); + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, + &wp); + if (unlikely(ret)) { + if (unlikely(ret != -EAGAIN)) goto err; - } - goto flush_io; + break; } - /* - * It's possible for the allocator to fail, put us on the - * freelist waitlist, and then succeed in one of various retry - * paths: if that happens, we need to disable the skip_put - * optimization because otherwise there won't necessarily be a - * barrier before we free the bch_write_op: - */ - if (atomic_read(&cl->remaining) & CLOSURE_WAITING) - skip_put = false; + EBUG_ON(!wp); bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); + bch2_alloc_sectors_done(c, wp); if (ret < 0) goto err; - if (ret) { - skip_put = false; - } else { - /* - * for the skip_put optimization this has to be set - * before we submit the bio: - */ + if (!ret) op->flags |= BCH_WRITE_DONE; - } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; - if (!skip_put) - closure_get(bio->bi_private); - else - op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + closure_get(bio->bi_private); key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); @@ -1241,55 +1283,49 @@ again: bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write); } while (ret); - - if (!skip_put) - continue_at(cl, bch2_write_index, index_update_wq(op)); out: - memalloc_nofs_restore(nofs_flags); - return; -err: - op->error = ret; - op->flags |= BCH_WRITE_DONE; - - continue_at(cl, bch2_write_index, index_update_wq(op)); - goto out; -flush_io: /* * If the write can't all be submitted at once, we generally want to * block synchronously as that signals backpressure to the caller. 
- * - * However, if we're running out of a workqueue, we can't block here - * because we'll be blocking other work items from completing: */ - if (current->flags & PF_WQ_WORKER) { - continue_at(cl, bch2_write_index, index_update_wq(op)); - goto out; - } - - closure_sync(cl); - - if (!bch2_keylist_empty(&op->insert_keys)) { + if (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER)) { + closure_sync(&op->cl); __bch2_write_index(op); - if (op->error) { - op->flags |= BCH_WRITE_DONE; - continue_at_nobarrier(cl, bch2_write_done, NULL); - goto out; - } + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + spin_lock(&wp->writes_lock); + op->wp = wp; + list_add_tail(&op->wp_list, &wp->writes); + if (wp->state == WRITE_POINT_stopped) + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock(&wp->writes_lock); + + continue_at(&op->cl, bch2_write_index, NULL); } - goto again; + memalloc_nofs_restore(nofs_flags); + return; +err: + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; } static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) { - struct closure *cl = &op->cl; struct bio *bio = &op->wbio.bio; struct bvec_iter iter; struct bkey_i_inline_data *id; unsigned sectors; int ret; + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, @@ -1317,11 +1353,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) set_bkey_val_bytes(&id->k, data_len); bch2_keylist_push(&op->insert_keys); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; - - continue_at_nobarrier(cl, bch2_write_index, NULL); - return; + __bch2_write_index(op); err: bch2_write_done(&op->cl); } @@ -1349,6 +1381,7 @@ void bch2_write(struct closure *cl) struct bch_fs *c = op->c; unsigned data_len; + EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); @@ -1381,18 +1414,14 @@ void bch2_write(struct closure *cl) return; } - continue_at_nobarrier(cl, __bch2_write, NULL); + __bch2_write(op); return; err: bch2_disk_reservation_put(c, &op->res); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); + closure_debug_destroy(&op->cl); + if (op->end_io) op->end_io(op); - } else { - closure_return(cl); - } } /* Cache promotion on read */ diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index f8ce9543c9e3..b484d3387968 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -41,7 +41,7 @@ enum bch_write_flags { __BCH_WRITE_CHECK_ENOSPC, __BCH_WRITE_MOVE, __BCH_WRITE_JOURNAL_SEQ_PTR, - __BCH_WRITE_SKIP_CLOSURE_PUT, + __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, }; @@ -59,7 +59,7 @@ enum bch_write_flags { /* Internal: */ #define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR) -#define BCH_WRITE_SKIP_CLOSURE_PUT (1U << __BCH_WRITE_SKIP_CLOSURE_PUT) +#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -115,6 +115,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, void bch2_write(struct closure *); +void bch2_write_point_do_index_updates(struct work_struct *); + static inline struct bch_write_bio *wbio_init(struct bio *bio) { struct bch_write_bio *wbio = to_wbio(bio); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 
53270f0a08a3..c316a39d381a 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -119,6 +119,7 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:3; unsigned incompressible:1; + unsigned btree_update_ready:1; struct bch_devs_list devs_have; u16 target; @@ -134,6 +135,9 @@ struct bch_write_op { struct write_point_specifier write_point; + struct write_point *wp; + struct list_head wp_list; + struct disk_reservation res; struct open_buckets open_buckets; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e4201aecdba1..208482db3683 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -791,7 +791,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 7e10adba5c75..ec672134cb18 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -180,6 +180,7 @@ read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); read_attribute(open_buckets); +read_attribute(write_points); read_attribute(internal_uuid); @@ -418,6 +419,9 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c); + if (attr == &sysfs_write_points) + bch2_write_points_to_text(out, c); + if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_write_points, &sysfs_io_timers_read, &sysfs_io_timers_write, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index f290c069c683..e1d36d9b092c 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -390,7 +390,7 @@ static const struct time_unit *pick_time_units(u64 ns) return u; } -static void pr_time_units(struct printbuf *out, u64 ns) +void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -410,13 +410,13 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); pr_buf(out, "frequency:\t"); - pr_time_units(out, freq); + bch2_pr_time_units(out, freq); pr_buf(out, "\navg duration:\t"); - pr_time_units(out, stats->average_duration); + bch2_pr_time_units(out, stats->average_duration); pr_buf(out, "\nmax duration:\t"); - pr_time_units(out, stats->max_duration); + bch2_pr_time_units(out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fd776fb281b7..085f1c357383 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -352,6 +352,8 @@ static inline void pr_sectors(struct printbuf *out, u64 v) bch2_pr_units(out, v, v << 9); } +void bch2_pr_time_units(struct printbuf *, u64); + #ifdef __KERNEL__ static inline void pr_time(struct printbuf *out, u64 time) { -- cgit From 25be2e5d4a051ff2408c7ab007394e96798cf559 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Mar 2022 16:43:52 -0500 Subject: bcachefs: bch_sb_field_journal_v2 Add a new superblock field which represents journal buckets as ranges: also move code for the superblock journal fields to journal_sb.c. This also reworks the code for resizing the journal to write the new superblock before using the new journal buckets, and thus be a bit safer. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs_format.h | 38 ++++++-- fs/bcachefs/journal.c | 187 ++++++++++++++++++++++------------- fs/bcachefs/journal_sb.c | 220 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/journal_sb.h | 24 +++++ fs/bcachefs/super-io.c | 82 +--------------- fs/bcachefs/super-io.h | 9 -- 7 files changed, 395 insertions(+), 166 deletions(-) create mode 100644 fs/bcachefs/journal_sb.c create mode 100644 fs/bcachefs/journal_sb.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 65eeab56cb4b..d3808249948c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -38,6 +38,7 @@ bcachefs-y := \ journal.o \ journal_io.o \ journal_reclaim.o \ + journal_sb.o \ journal_seq_blacklist.o \ keylist.o \ migrate.o \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2c9243031dab..838754ad60c3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1027,16 +1027,17 @@ struct bch_sb_field { __le32 type; }; -#define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1045,6 +1046,14 @@ enum bch_sb_field_type { BCH_SB_FIELD_NR }; +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << BCH_SB_FIELD_journal_v2)) + /* BCH_SB_FIELD_journal: */ struct bch_sb_field_journal { @@ -1052,6 +1061,15 @@ struct bch_sb_field_journal { __le64 buckets[0]; }; +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + __le64 nr; + } d[0]; +}; + /* BCH_SB_FIELD_members: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) diff --git a/fs/bcachefs/journal.c 
b/fs/bcachefs/journal.c index c7f1674ed596..cb15d1c8a135 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -15,8 +15,8 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "super-io.h" #include "trace.h" #define x(n) #n, @@ -779,28 +779,55 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, nr_got = 0, nr_want = nr - ja->nr; + unsigned old_nr = ja->nr; + unsigned old_discard_idx = ja->discard_idx; + unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; + unsigned old_dirty_idx = ja->dirty_idx; + unsigned old_cur_idx = ja->cur_idx; int ret = 0; - /* don't handle reducing nr of buckets yet: */ - if (nr <= ja->nr) - return 0; + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + } + bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); + ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) { + if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err; + goto err_unblock; } - journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) { - ret = -ENOSPC; - goto err; + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = -ENOSPC; + break; + } + } else { + rcu_read_lock(); + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, + false, cl); + rcu_read_unlock(); + if (IS_ERR(ob[nr_got])) { + ret = cl ? -EAGAIN : -ENOSPC; + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } } + if (!nr_got) + goto err_unblock; + /* * We may be called from the device add path, before the new device has * actually been added to the running filesystem: @@ -813,51 +840,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (!new_fs) - spin_unlock(&c->journal.lock); - - while (ja->nr < nr) { - struct open_bucket *ob = NULL; - unsigned pos; - long b; - - if (new_fs) { - b = bch2_bucket_alloc_new_fs(ca); - if (b < 0) { - ret = -ENOSPC; - goto err; - } - } else { - rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_none, - false, cl); - rcu_read_unlock(); - if (IS_ERR(ob)) { - ret = cl ? 
-EAGAIN : -ENOSPC; - goto err; - } - - b = ob->bucket; - } + for (i = 0; i < nr_got; i++) { + unsigned pos = ja->discard_idx ?: ja->nr; + long b = bu[i]; - if (c) - spin_lock(&c->journal.lock); - - /* - * XXX - * For resize at runtime, we should be writing the new - * superblock before inserting into the journal array - */ - - pos = ja->discard_idx ?: ja->nr; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); - __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -867,29 +859,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + } - if (c) - spin_unlock(&c->journal.lock); + ret = bch2_journal_buckets_to_sb(c, ca); + if (ret) { + /* Revert: */ + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = old_nr; + ja->discard_idx = old_discard_idx; + ja->dirty_idx_ondisk = old_dirty_idx_ondisk; + ja->dirty_idx = old_dirty_idx; + ja->cur_idx = old_cur_idx; + } + + if (!new_fs) + spin_unlock(&c->journal.lock); - if (!new_fs) { + if (c) + bch2_journal_unblock(&c->journal); + + if (ret) + goto err; + + if (!new_fs) { + for (i = 0; i < nr_got; i++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, - b, BCH_DATA_journal, + bu[i], BCH_DATA_journal, ca->mi.bucket_size)); - - bch2_open_bucket_put(c, ob); - - if (ret) + if (ret) { + bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); goto err; + } } } err: - bch2_sb_resize_journal(&ca->disk_sb, - ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + if (ob && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + kfree(new_bucket_seq); kfree(new_buckets); + kfree(ob); + kfree(bu); return ret; +err_unblock: + if (c) + bch2_journal_unblock(&c->journal); + goto err; } /* @@ -902,11 +921,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct journal_device *ja = &ca->journal; struct closure cl; unsigned current_nr; - int ret; + int ret = 0; + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + return 0; closure_init_stack(&cl); - do { + while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { struct disk_reservation disk_res = { 0, 0 }; closure_sync(&cl); @@ -934,7 +957,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ja->nr != current_nr) bch2_write_super(c); mutex_unlock(&c->sb_lock); - } while (ret == -EAGAIN); + } return ret; } @@ -942,6 +965,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_journal_alloc(struct bch_dev *ca) { unsigned nr; + int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; @@ -958,7 +982,15 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + if (ca->fs) + mutex_lock(&ca->fs->sb_lock); + + ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + + if (ca->fs) + mutex_unlock(&ca->fs->sb_lock); + + return ret; } /* startup/shutdown: */ @@ -1103,9 +1135,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = 
bch2_sb_get_journal(sb); + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_get_journal_v2(sb); unsigned i, nr_bvecs; - ja->nr = bch2_nr_journal_buckets(journal_buckets); + ja->nr = 0; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) @@ -1123,8 +1166,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->buckets) return -ENOMEM; - for (i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } return 0; } diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 index 000000000000..6d984313d4b5 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" +#include "darray.h" + +#include + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + pr_buf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + pr_buf(err, "duplicate journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); + pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, 
r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + struct u64_range *b; + + nr = bch2_sb_field_journal_v2_nr_entries(journal); + if (!nr) + return 0; + + b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + b[i].start = le64_to_cpu(journal->d[i].start); + b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + } + + sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + + if (!b[0].start) { + pr_buf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0].start < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0].start, le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) { + if (b[i].end > b[i + 1].start) { + pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + goto err; + } + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu-%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); + pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { + .validate = bch2_sb_journal_v2_validate, + .to_text = bch2_sb_journal_v2_to_text, +}; + +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr = 1; + + if (c) + lockdep_assert_held(&c->sb_lock); + + if (!ja->nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); + return 0; + } + + for (i = 0; i + 1 < ja->nr; i++) + if (ja->buckets[i] + 1 != ja->buckets[i + 1]) + nr++; + + j = bch2_sb_resize_journal_v2(&ca->disk_sb, + (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + if (!j) + return -ENOSPC; + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + + j->d[dst].start = le64_to_cpu(ja->buckets[0]); + j->d[dst].nr = le64_to_cpu(1); + + for (i = 1; i < ja->nr; i++) { + if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; + j->d[dst].start = le64_to_cpu(ja->buckets[i]); + j->d[dst].nr = le64_to_cpu(1); + } + } + + BUG_ON(dst + 1 != nr); + + return 0; +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 index 000000000000..a39192e9f6f4 --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "super-io.h" +#include "vstructs.h" + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? 
(__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 5c87c7308274..bc845c42768e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -10,6 +10,7 @@ #include "io.h" #include "journal.h" #include "journal_io.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" @@ -459,7 +460,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { - if (i == BCH_SB_FIELD_journal) + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); @@ -929,85 +930,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - u64 l = *((const u64 *) _l), r = *((const u64 *) _r); - - return l < r ? -1 : l > r ? 1 : 0; -} - -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); - if (!b) - return -ENOMEM; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - pr_buf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m->first_bucket)) { - pr_buf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m->first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { - pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m->nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - pr_buf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - pr_buf(out, "Buckets: "); - for (i = 0; i < nr; i++) - pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); - pr_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_journal_validate, - .to_text = bch2_sb_journal_to_text, -}; - /* BCH_SB_FIELD_members: */ static int bch2_sb_members_validate(struct bch_sb *sb, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index ccd6fe7fdf29..bed61a01e4be 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_journal: */ - -static inline unsigned 
bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - /* BCH_SB_FIELD_members: */ static inline bool bch2_member_exists(struct bch_member *m) -- cgit From 179e3434fac14a100bad2edba4fd401bffb67802 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jan 2022 22:13:13 -0500 Subject: bcachefs: KEY_TYPE_set A new empty key type, to be used when using a btree as a set. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 7 ++++++- fs/bcachefs/bkey_methods.c | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 838754ad60c3..350317e8b34f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -351,7 +351,8 @@ static inline void bkey_init(struct bkey *k) x(subvolume, 21) \ x(snapshot, 22) \ x(inode_v2, 23) \ - x(alloc_v3, 24) + x(alloc_v3, 24) \ + x(set, 25) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -381,6 +382,10 @@ struct bch_hash_whiteout { struct bch_val v; }; +struct bch_set { + struct bch_val v; +}; + /* Extents */ /* diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f11b6d9f08d6..d938ee826c75 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -85,6 +85,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } +static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k)) + return "nonempty value"; + return NULL; +} + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set (struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ +} + const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() -- cgit From d326ab2f5de201b9b7e790c653a2b925e7032d3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Dec 2021 00:31:54 -0500 Subject: bcachefs: LRU btree This implements new persistent LRUs, to be used for buckets containing cached data, as well as stripes ordered by time when a block became empty. 
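The model, in rough terms: each LRU is a set of keys ordered by a 64-bit time, the value stored at (lru_id, time) is a backpointer (a bucket index), repositioning an entry means deleting the key at its old time and inserting one at the new time, and a time of zero means "not on the LRU". A minimal userspace sketch of that model follows; it is illustrative only (a fixed-size array instead of a btree, invented sizes and names), not the bcachefs implementation:

/*
 * Illustrative sketch of the persistent-LRU idea: entries keyed by time,
 * value is a backpointer; a "change" is delete-at-old-time + insert-at-new-time.
 * Not the kernel code - plain userspace C with a toy fixed-size table.
 */
#include <stdint.h>
#include <stdio.h>

struct lru_entry {
	uint64_t time;	/* key: position in the LRU; 0 means "not on the LRU" */
	uint64_t idx;	/* value: backpointer, e.g. a bucket number */
	int	 used;
};

#define NR_ENTRIES 16
static struct lru_entry lru[NR_ENTRIES];

static int lru_delete(uint64_t time, uint64_t idx)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		if (lru[i].used && lru[i].time == time) {
			if (lru[i].idx != idx)
				return -1;	/* entry has the wrong backpointer */
			lru[i].used = 0;
			return 0;
		}
	return -1;				/* pointer to nonexistent entry */
}

static int lru_set(uint64_t time, uint64_t idx)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		if (!lru[i].used) {
			lru[i] = (struct lru_entry) { .time = time, .idx = idx, .used = 1 };
			return 0;
		}
	return -1;				/* table full */
}

static int lru_change(uint64_t idx, uint64_t old_time, uint64_t new_time)
{
	if (old_time == new_time)
		return 0;
	if (old_time && lru_delete(old_time, idx))
		return -1;
	return new_time ? lru_set(new_time, idx) : 0;
}

int main(void)
{
	lru_change(42, 0, 100);		/* bucket 42 enters the LRU at time 100 */
	lru_change(42, 100, 250);	/* touched again: move it to time 250 */

	for (int i = 0; i < NR_ENTRIES; i++)
		if (lru[i].used)
			printf("time %llu -> bucket %llu\n",
			       (unsigned long long) lru[i].time,
			       (unsigned long long) lru[i].idx);
	return 0;
}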
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 15 +++++- fs/bcachefs/bkey_methods.c | 4 ++ fs/bcachefs/lru.c | 119 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/lru.h | 15 ++++++ 6 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 fs/bcachefs/lru.c create mode 100644 fs/bcachefs/lru.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d3808249948c..17423584a3f3 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -41,6 +41,7 @@ bcachefs-y := \ journal_sb.o \ journal_seq_blacklist.o \ keylist.o \ + lru.o \ migrate.o \ move.o \ movinggc.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 01e9ed5dfc61..7891ad208a33 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -391,6 +391,7 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_BTREE_subvolumes, GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 350317e8b34f..982409ed940e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -352,7 +352,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot, 22) \ x(inode_v2, 23) \ x(alloc_v3, 24) \ - x(set, 25) + x(set, 25) \ + x(lru, 26) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -1024,6 +1025,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __attribute__((packed, aligned(8))); + +#define LRU_ID_STRIPES (1U << 16) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1838,7 +1848,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(stripes, 6) \ x(reflink, 7) \ x(subvolumes, 8) \ - x(snapshots, 9) + x(snapshots, 9) \ + x(lru, 10) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index d938ee826c75..8757218e571b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -9,6 +9,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "lru.h" #include "quota.h" #include "reflink.h" #include "subvolume.h" @@ -165,6 +166,9 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_snapshots] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_lru), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 index 000000000000..2ababca5efe5 --- /dev/null +++ b/fs/bcachefs/lru.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "error.h" +#include "lru.h" + +const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*lru)) + return "incorrect value size"; + + return NULL; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) +{ + struct bch_fs *c = 
trans->c; + struct btree_iter iter; + struct bkey_s_c k; + u64 existing_idx; + int ret = 0; + + if (!time) + return 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + POS(id, time), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_lru) { + bch2_fs_inconsistent(c, + "pointer to nonexistent lru %llu:%llu", + id, time); + ret = -EIO; + goto err; + } + + existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + if (existing_idx != idx) { + bch2_fs_inconsistent(c, + "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", + id, time, existing_idx, idx); + ret = -EIO; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_lru *lru; + int ret = 0; + + if (!*time) + return 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, + POS(lru_id, *time), + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES, k, ret) + if (bkey_deleted(k.k)) + break; + + if (ret) + goto err; + + BUG_ON(iter.pos.inode != lru_id); + *time = iter.pos.offset; + + lru = bch2_trans_kmalloc(trans, sizeof(*lru)); + ret = PTR_ERR_OR_ZERO(lru); + if (ret) + goto err; + + bkey_lru_init(&lru->k_i); + lru->k.p = iter.pos; + lru->v.idx = cpu_to_le64(idx); + + ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, + u64 old_time, u64 *new_time) +{ + if (old_time == *new_time) + return 0; + + return lru_delete(trans, id, idx, old_time) ?: + lru_set(trans, id, idx, new_time); +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 index 000000000000..c3121cfee285 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_lru (struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ +} + +int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); + +#endif /* _BCACHEFS_LRU_H */ -- cgit From 3d48a7f85f83a51a0eb0d0a6537be26a20691260 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Dec 2021 20:03:29 -0500 Subject: bcachefs: KEY_TYPE_alloc_v4 This introduces a new alloc key which doesn't use varints. Soon we'll be adding backpointers and storing them in alloc keys, which means our pack/unpack workflow for alloc keys won't really work - we'll need to be mutating alloc keys in place. Instead of bch2_alloc_unpack(), we now have bch2_alloc_to_v4() that converts older types of alloc keys to v4 if needed. 
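The reasoning behind dropping varints, roughly: a varint-packed field's encoded size depends on its value, so bumping e.g. dirty_sectors can change the key's length and forces a full unpack/modify/repack cycle, while a fixed-layout struct can simply be overwritten in place. A small userspace sketch of that contrast follows (a generic LEB128-style varint and an invented fixed-width struct, not the bcachefs encodings):

/*
 * Why fixed-layout keys allow in-place mutation: the varint encoding of a
 * value grows with the value, the fixed struct never changes size.
 * Illustrative only - not the bcachefs varint format or bch_alloc_v4 layout.
 */
#include <stdint.h>
#include <stdio.h>

/* simple LEB128-style varint encoder, for illustration */
static unsigned varint_encode(uint8_t *out, uint64_t v)
{
	unsigned bytes = 0;

	do {
		/* low 7 bits, continuation bit set if more bytes follow */
		out[bytes++] = (v & 0x7f) | (v > 0x7f ? 0x80 : 0);
		v >>= 7;
	} while (v);

	return bytes;
}

/* fixed layout: every field has a constant size and offset */
struct alloc_fixed {
	uint8_t	 gen;
	uint8_t	 data_type;
	uint32_t dirty_sectors;
	uint32_t cached_sectors;
};

int main(void)
{
	uint8_t buf[10];

	printf("varint(100)    = %u byte(s)\n", varint_encode(buf, 100));
	printf("varint(100000) = %u byte(s)\n", varint_encode(buf, 100000));

	/* fixed layout: mutate in place, encoded size never changes */
	struct alloc_fixed a = { .gen = 1, .dirty_sectors = 100 };
	a.dirty_sectors = 100000;	/* no repack needed */
	printf("sizeof(a) = %zu\n", sizeof(a));
	return 0;
}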
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 285 ++++++++++++++++++++++++----------------- fs/bcachefs/alloc_background.h | 51 ++------ fs/bcachefs/bcachefs_format.h | 42 +++++- fs/bcachefs/bkey_methods.c | 3 +- fs/bcachefs/btree_gc.c | 76 +++++++---- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/buckets.c | 186 ++++++++++++--------------- fs/bcachefs/buckets.h | 8 ++ fs/bcachefs/buckets_types.h | 2 +- fs/bcachefs/movinggc.c | 26 ++-- 10 files changed, 379 insertions(+), 301 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 33b2e4d7da3b..099be1290b4c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -33,13 +33,27 @@ const char * const bch2_allocator_states[] = { NULL }; +/* Persistent alloc info: */ + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() #undef x }; -/* Persistent alloc info: */ +struct bkey_alloc_unpacked { + u64 journal_seq; + u64 bucket; + u8 dev; + u8 gen; + u8 oldest_gen; + u8 data_type; + bool need_discard:1; + bool need_inc_gen:1; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS_V2() +#undef x +}; static inline u64 alloc_field_v1_get(const struct bch_alloc *a, const void **p, unsigned field) @@ -161,6 +175,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; + out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); + out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); out->journal_seq = le64_to_cpu(a.v->journal_seq); #define x(_name, _bits) \ @@ -182,47 +198,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, return 0; } -static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) -{ - struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - u8 *out = a->v.data; - u8 *end = (void *) &dst[1]; - u8 *last_nonzero_field = out; - unsigned bytes; - - a->k.p = POS(src.dev, src.bucket); - a->v.gen = src.gen; - a->v.oldest_gen = src.oldest_gen; - a->v.data_type = src.data_type; - a->v.journal_seq = cpu_to_le64(src.journal_seq); - -#define x(_name, _bits) \ - nr_fields++; \ - \ - if (src._name) { \ - out += bch2_varint_encode_fast(out, src._name); \ - \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } else { \ - *out++ = 0; \ - } - - BCH_ALLOC_FIELDS_V2() -#undef x - BUG_ON(out > end); - - out = last_nonzero_field; - a->v.nr_fields = last_nonzero_fieldnr; - - bytes = (u8 *) out - (u8 *) &a->v; - set_bkey_val_bytes(&a->k, bytes); - memset_u64s_tail(&a->v, 0, bytes); -} - -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { struct bkey_alloc_unpacked ret = { .dev = k.k->p.inode, @@ -245,32 +221,71 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, - const struct bkey_alloc_unpacked src) +void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { - struct bkey_alloc_buf *dst; + if (k.k->type == KEY_TYPE_alloc_v4) { + *out = *bkey_s_c_to_alloc_v4(k).v; + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = 
u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + } +} - dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (!IS_ERR(dst)) - bch2_alloc_pack_v3(dst, src); +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i_alloc_v4 *ret; - return dst; + if (k.k->type == KEY_TYPE_alloc_v4) { + ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (!IS_ERR(ret)) + bkey_reassemble(&ret->k_i, k); + } else { + ret = bch2_trans_kmalloc(trans, sizeof(*ret)); + if (!IS_ERR(ret)) { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + } + return ret; } -int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_alloc_unpacked *u, unsigned trigger_flags) +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; - /* - * Without BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, we may end up updating - * the btree instead of the key cache - this can casue the allocator to - * self-deadlock, since updating the btree may require allocating new - * btree nodes: - */ - return PTR_ERR_OR_ZERO(a) ?: - bch2_trans_update(trans, iter, &a->k, trigger_flags| - BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + a = bch2_alloc_to_v4_mut(trans, k); + if (IS_ERR(a)) + bch2_trans_iter_exit(trans, iter); + return a; } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -316,28 +331,70 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_alloc_unpacked u; + struct bch_dev *ca; if (k.k->p.inode >= c->sb.nr_devices || !c->devs[k.k->p.inode]) return "invalid device"; + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset < ca->mi.first_bucket || + k.k->p.offset >= ca->mi.nbuckets) + return "invalid bucket"; + if (bch2_alloc_unpack_v3(&u, k)) return "unpack error"; return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + struct bch_dev *ca; - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", - u.gen, u.oldest_gen, bch2_data_types[u.data_type], - u.journal_seq); -#define x(_name, ...) 
pr_buf(out, " " #_name " %llu", (u64) u._name); - BCH_ALLOC_FIELDS_V2() -#undef x + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset < ca->mi.first_bucket || + k.k->p.offset >= ca->mi.nbuckets) + return "invalid bucket"; + + return NULL; +} + +void bch2_alloc_v4_swab(struct bkey_s k) +{ + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + + a->journal_seq = swab64(a->journal_seq); + a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); + a->io_time[0] = swab64(a->io_time[0]); + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 a; + + bch2_alloc_to_v4(k, &a); + + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", + a.gen, a.oldest_gen, bch2_data_types[a.data_type], + a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); + pr_buf(out, " dirty_sectors %u", a.dirty_sectors); + pr_buf(out, " cached_sectors %u", a.cached_sectors); + pr_buf(out, " stripe %u", a.stripe); + pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); + pr_buf(out, " read_time %llu", a.io_time[READ]); + pr_buf(out, " write_time %llu", a.io_time[WRITE]); } int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) @@ -345,9 +402,9 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bch_alloc_v4 a; struct bch_dev *ca; struct bucket *g; - struct bkey_alloc_unpacked u; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -356,28 +413,28 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); g = __bucket(ca, k.k->p.offset, gc); - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); if (!gc) - *bucket_gen(ca, k.k->p.offset) = u.gen; + *bucket_gen(ca, k.k->p.offset) = a.gen; - g->_mark.gen = u.gen; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = !gc ? u.oldest_gen : u.gen; + g->_mark.gen = a.gen; + g->io_time[READ] = a.io_time[READ]; + g->io_time[WRITE] = a.io_time[WRITE]; + g->oldest_gen = !gc ? 
a.oldest_gen : a.gen; g->gen_valid = 1; if (!gc || (metadata_only && - (u.data_type == BCH_DATA_user || - u.data_type == BCH_DATA_cached || - u.data_type == BCH_DATA_parity))) { - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; + (a.data_type == BCH_DATA_user || + a.data_type == BCH_DATA_cached || + a.data_type == BCH_DATA_parity))) { + g->_mark.data_type = a.data_type; + g->_mark.dirty_sectors = a.dirty_sectors; + g->_mark.cached_sectors = a.cached_sectors; + g->_mark.stripe = a.stripe != 0; + g->stripe = a.stripe; + g->stripe_redundancy = a.stripe_redundancy; } } @@ -398,29 +455,22 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_alloc_unpacked u; - u64 *time, now; + struct bkey_i_alloc_v4 *a; + u64 now; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); if (ret) - goto out; - - u = bch2_alloc_unpack(k); + return ret; - time = rw == READ ? &u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); - if (*time == now) + if (a->v.io_time[rw] == now) goto out; - *time = now; + a->v.io_time[rw] = now; - ret = bch2_alloc_write(trans, &iter, &u, 0) ?: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_exit(trans, &iter); @@ -604,7 +654,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) static int bucket_invalidate_btree(struct btree_trans *trans, struct bch_dev *ca, u64 b, - struct bkey_alloc_unpacked *u) + struct bkey_i_alloc_v4 *a) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -621,16 +671,19 @@ static int bucket_invalidate_btree(struct btree_trans *trans, if (ret) goto err; - *u = bch2_alloc_unpack(k); - u->gen++; - u->data_type = 0; - u->dirty_sectors = 0; - u->cached_sectors = 0; - u->read_time = atomic64_read(&c->io_clock[READ].now); - u->write_time = atomic64_read(&c->io_clock[WRITE].now); - - ret = bch2_alloc_write(trans, &iter, u, - BTREE_TRIGGER_BUCKET_INVALIDATE); + bkey_alloc_v4_init(&a->k_i); + a->k.p = iter.pos; + bch2_alloc_to_v4(k, &a->v); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_trans_update(trans, &iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE| + BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -639,7 +692,7 @@ err: static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq, unsigned flags) { - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 a; size_t b; u64 commit_seq = 0; int ret = 0; @@ -671,7 +724,7 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| flags, - bucket_invalidate_btree(&trans, ca, b, &u)); + bucket_invalidate_btree(&trans, ca, b, &a)); if (!ret) { /* remove from alloc_heap: */ @@ -687,14 +740,14 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev 
*ca, * If we invalidating cached data then we need to wait on the * journal commit: */ - if (u.data_type) + if (a.v.data_type) *journal_seq = max(*journal_seq, commit_seq); /* * We already waiting on u.alloc_seq when we filtered out * buckets that need journal commit: */ - BUG_ON(*journal_seq > u.journal_seq); + BUG_ON(*journal_seq > a.v.journal_seq); } else { size_t b2; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 3eaa6d204286..b66c8cf0341e 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -10,48 +10,14 @@ extern const char * const bch2_allocator_states[]; -struct bkey_alloc_unpacked { - u64 journal_seq; - u64 bucket; - u8 dev; - u8 gen; - u8 oldest_gen; - u8 data_type; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U -/* returns true if not equal */ -static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, - struct bkey_alloc_unpacked r) -{ - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type -#define x(_name, ...) || l._name != r._name - BCH_ALLOC_FIELDS_V2() -#undef x - ; -} - -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, - const struct bkey_alloc_unpacked); -int bch2_alloc_write(struct btree_trans *, struct btree_iter *, - struct bkey_alloc_unpacked *, unsigned); +void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); @@ -60,6 +26,8 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); +void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ @@ -80,6 +48,13 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .atomic_trigger = bch2_mark_alloc, \ } +#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .atomic_trigger = bch2_mark_alloc, \ +} + static inline bool bkey_is_alloc(const struct bkey *k) { return k->type == KEY_TYPE_alloc || diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 982409ed940e..a640a45a123a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -82,6 +82,21 @@ typedef uuid_t __uuid_t; #endif +#define BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (k->field >> offset) & ~(~0ULL << (end - offset)); 
\ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << (end - offset)) << offset); \ + k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ +} + #define LE_BITMASK(_bits, name, type, field, offset, end) \ static const unsigned name##_OFFSET = offset; \ static const unsigned name##_BITS = (end - offset); \ @@ -353,7 +368,8 @@ static inline void bkey_init(struct bkey *k) x(inode_v2, 23) \ x(alloc_v3, 24) \ x(set, 25) \ - x(lru, 26) + x(lru, 26) \ + x(alloc_v4, 27) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -903,6 +919,30 @@ struct bch_alloc_v3 { __u8 data[]; } __attribute__((packed, aligned(8))); +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + struct bpos backpointers[0]; +} __attribute__((packed, aligned(8))); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8757218e571b..62774e4f6dbb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -149,7 +149,8 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| (1U << KEY_TYPE_alloc_v2)| - (1U << KEY_TYPE_alloc_v3), + (1U << KEY_TYPE_alloc_v3)| + (1U << KEY_TYPE_alloc_v4), [BKEY_TYPE_quotas] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8ec9c43d98e1..c3d6c62ef062 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1306,6 +1306,19 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } +/* returns true if not equal */ +static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, + struct bch_alloc_v4 r) +{ + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type || + l.dirty_sectors != r.dirty_sectors || + l.cached_sectors != r.cached_sectors || + l.stripe_redundancy != r.stripe_redundancy || + l.stripe != r.stripe; +} + static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, bool metadata_only) @@ -1314,8 +1327,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); struct bucket *g; struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u, gc_u; - struct bkey_alloc_buf *a; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new, gc; int ret; k = bch2_btree_iter_peek_slot(iter); @@ -1323,60 +1336,61 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (ret) return ret; - old_u = new_u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &old); + new = old; percpu_down_read(&c->mark_lock); g = gc_bucket(ca, iter->pos.offset); - gc_u = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, + gc = (struct bch_alloc_v4) { .gen = g->mark.gen, .data_type = g->mark.data_type, .dirty_sectors = 
g->mark.dirty_sectors, .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], + .io_time[READ] = g->io_time[READ], + .io_time[WRITE] = g->io_time[WRITE], .stripe = g->stripe, .stripe_redundancy = g->stripe_redundancy, }; percpu_up_read(&c->mark_lock); if (metadata_only && - gc_u.data_type != BCH_DATA_sb && - gc_u.data_type != BCH_DATA_journal && - gc_u.data_type != BCH_DATA_btree) + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) return 0; - if (gen_after(old_u.gen, gc_u.gen)) + if (gen_after(old.gen, gc.gen)) return 0; #define copy_bucket_field(_f) \ - if (fsck_err_on(new_u._f != gc_u._f, c, \ + if (fsck_err_on(new._f != gc._f, c, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ - new_u.gen, \ - bch2_data_types[new_u.data_type], \ - new_u._f, gc_u._f)) \ - new_u._f = gc_u._f; \ + new.gen, \ + bch2_data_types[new.data_type], \ + new._f, gc._f)) \ + new._f = gc._f; \ copy_bucket_field(gen); copy_bucket_field(data_type); - copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); copy_bucket_field(stripe_redundancy); copy_bucket_field(stripe); #undef copy_bucket_field - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + if (!bch2_alloc_v4_cmp(old, new)) return 0; - a = bch2_alloc_pack(trans, new_u); - if (IS_ERR(a)) - return PTR_ERR(a); + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + a->v = new; - ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); fsck_err: return ret; } @@ -1873,7 +1887,8 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i { struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); struct bkey_s_c k; - struct bkey_alloc_unpacked u; + struct bch_alloc_v4 a; + struct bkey_i_alloc_v4 *a_mut; int ret; k = bch2_btree_iter_peek_slot(iter); @@ -1881,14 +1896,19 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i if (ret) return ret; - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); - if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) return 0; - u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } int bch2_gc_gens(struct bch_fs *c) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e848b153ae93..d5be6004071a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -678,6 +678,7 @@ enum btree_update_flags { ((1U << KEY_TYPE_alloc)| \ (1U << KEY_TYPE_alloc_v2)| \ (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ (1U << KEY_TYPE_stripe)| \ (1U << KEY_TYPE_inode)| \ (1U << KEY_TYPE_inode_v2)| \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8eeabb5a66bd..b4252c2f028a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -508,15 +508,14 @@ int bch2_mark_alloc(struct btree_trans *trans, bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); - 
struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); - struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); + struct bch_alloc_v4 old_a, new_a; + struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); struct bucket *g; struct bucket_mark old_m, m; int ret = 0; - if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || - new_u.bucket >= ca->mi.nbuckets, trans, + if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || + new.k->p.offset >= ca->mi.nbuckets, trans, "alloc key outside range of device's buckets")) return -EIO; @@ -527,11 +526,13 @@ int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; + bch2_alloc_to_v4(old, &old_a); + bch2_alloc_to_v4(new, &new_a); + if ((flags & BTREE_TRIGGER_INSERT) && - !old_u.data_type != !new_u.data_type && - new.k->type == KEY_TYPE_alloc_v3) { - struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; - u64 old_journal_seq = le64_to_cpu(v->journal_seq); + !old_a.data_type != !new_a.data_type && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; BUG_ON(!journal_seq); @@ -540,18 +541,18 @@ int bch2_mark_alloc(struct btree_trans *trans, * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ - new_u.journal_seq = !new_u.data_type && - (journal_seq == old_journal_seq || - bch2_journal_noflush_seq(&c->journal, old_journal_seq)) + new_a.journal_seq = !new_a.data_type && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 0 : journal_seq; - v->journal_seq = cpu_to_le64(new_u.journal_seq); + v->journal_seq = new_a.journal_seq; } - if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { + if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, - new_u.dev, new_u.bucket, - new_u.journal_seq); + new.k->p.inode, new.k->p.offset, + new_a.journal_seq); if (ret) { bch2_fs_fatal_error(c, "error setting bucket_needs_journal_commit: %i", ret); @@ -560,27 +561,27 @@ int bch2_mark_alloc(struct btree_trans *trans, } percpu_down_read(&c->mark_lock); - if (!gc && new_u.gen != old_u.gen) - *bucket_gen(ca, new_u.bucket) = new_u.gen; + if (!gc && new_a.gen != old_a.gen) + *bucket_gen(ca, new.k->p.offset) = new_a.gen; - g = __bucket(ca, new_u.bucket, gc); + g = __bucket(ca, new.k->p.offset, gc); old_m = bucket_cmpxchg(g, m, ({ - m.gen = new_u.gen; - m.data_type = new_u.data_type; - m.dirty_sectors = new_u.dirty_sectors; - m.cached_sectors = new_u.cached_sectors; - m.stripe = new_u.stripe != 0; + m.gen = new_a.gen; + m.data_type = new_a.data_type; + m.dirty_sectors = new_a.dirty_sectors; + m.cached_sectors = new_a.cached_sectors; + m.stripe = new_a.stripe != 0; })); bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - g->io_time[READ] = new_u.read_time; - g->io_time[WRITE] = new_u.write_time; - g->oldest_gen = new_u.oldest_gen; + g->io_time[READ] = new_a.io_time[READ]; + g->io_time[WRITE] = new_a.io_time[WRITE]; + g->oldest_gen = new_a.oldest_gen; g->gen_valid = 1; - g->stripe = new_u.stripe; - g->stripe_redundancy = new_u.stripe_redundancy; + g->stripe = new_a.stripe; + g->stripe_redundancy = new_a.stripe_redundancy; percpu_up_read(&c->mark_lock); /* @@ -598,7 +599,7 @@ int bch2_mark_alloc(struct btree_trans *trans, return ret; } - trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), + trace_invalidate(ca, 
bucket_to_sector(ca, new.k->p.offset), old_m.cached_sectors); } @@ -1378,50 +1379,32 @@ need_mark: /* trans_mark: */ -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, - POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - *u = bch2_alloc_unpack(k); - return 0; -} - static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; + u16 dirty_sectors, cached_sectors; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + dirty_sectors = a->v.dirty_sectors; + cached_sectors = a->v.cached_sectors; ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, - u.gen, &u.data_type, - &u.dirty_sectors, &u.cached_sectors); + a->v.gen, &a->v.data_type, + &dirty_sectors, &cached_sectors); if (ret) goto out; - ret = bch2_alloc_write(trans, &iter, &u, 0); + a->v.dirty_sectors = dirty_sectors; + a->v.cached_sectors = cached_sectors; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; out: @@ -1554,7 +1537,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ? BCH_DATA_parity : 0; s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; @@ -1563,59 +1546,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (deleting) sectors = -sectors; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, - u.gen, u.data_type, - u.dirty_sectors, u.cached_sectors); + a->v.gen, a->v.data_type, + a->v.dirty_sectors, a->v.cached_sectors); if (ret) goto err; if (!deleting) { - if (bch2_trans_inconsistent_on(u.stripe || - u.stripe_redundancy, trans, + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, - u.stripe, s.k->p.offset)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { ret = -EIO; goto err; } - if (bch2_trans_inconsistent_on(data_type && u.dirty_sectors, trans, + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; goto err; } - u.stripe = s.k->p.offset; - u.stripe_redundancy = s.v->nr_redundant; + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; } else { - if (bch2_trans_inconsistent_on(u.stripe != s.k->p.offset || - u.stripe_redundancy != s.v->nr_redundant, trans, + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, u.gen, - s.k->p.offset, u.stripe)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { ret = -EIO; goto err; } - u.stripe = 0; - u.stripe_redundancy = 0; + a->v.stripe = 0; + a->v.stripe_redundancy = 0; } - u.dirty_sectors += sectors; + a->v.dirty_sectors += sectors; if (data_type) - u.data_type = !deleting ? data_type : 0; + a->v.data_type = !deleting ? 
data_type : 0; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto err; err: @@ -1845,11 +1828,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_alloc_unpacked u; - struct bch_extent_ptr ptr = { - .dev = ca->dev_idx, - .offset = bucket_to_sector(ca, b), - }; + struct bkey_i_alloc_v4 *a; int ret = 0; /* @@ -1858,26 +1837,27 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (b >= ca->mi.nbuckets) return 0; - ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); - if (u.data_type && u.data_type != type) { + if (a->v.data_type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], bch2_data_types[type], bch2_data_types[type]); ret = -EIO; goto out; } - u.data_type = type; - u.dirty_sectors = sectors; + a->v.data_type = type; + a->v.dirty_sectors = sectors; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, + BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); if (ret) goto out; out: diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 90f53e677281..4937d7939c2b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -97,6 +97,14 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } +static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, + const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +} + static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 2c73dc60b838..4f7018398385 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -111,7 +111,7 @@ struct copygc_heap_entry { u8 dev; u8 gen; u8 replicas; - u16 fragmentation; + u32 fragmentation; u32 sectors; u64 offset; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a54a83d3247b..aecec55eb421 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -129,7 +129,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_alloc_unpacked u; + struct bch_alloc_v4 a; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -139,20 +139,20 @@ static int walk_buckets_to_copygc(struct bch_fs *c) struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); struct copygc_heap_entry e; - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); - if (u.data_type != BCH_DATA_user || - u.dirty_sectors >= ca->mi.bucket_size || + if (a.data_type != BCH_DATA_user || + a.dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; e = (struct copygc_heap_entry) { .dev = iter.pos.inode, - .gen = u.gen, - .replicas = 1 + u.stripe_redundancy, - .fragmentation = u.dirty_sectors * (1U << 15) - / ca->mi.bucket_size, - .sectors = u.dirty_sectors, + .gen = a.gen, + .replicas = 1 + 
a.stripe_redundancy, + .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), + ca->mi.bucket_size), + .sectors = a.dirty_sectors, .offset = bucket_to_sector(ca, iter.pos.offset), }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -180,7 +180,7 @@ static int check_copygc_was_done(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_alloc_unpacked u; + struct bch_alloc_v4 a; struct copygc_heap_entry *i; int ret = 0; @@ -199,10 +199,10 @@ static int check_copygc_was_done(struct bch_fs *c, if (ret) break; - u = bch2_alloc_unpack(k); + bch2_alloc_to_v4(k, &a); - if (u.gen == i->gen && u.dirty_sectors) { - *sectors_not_moved += u.dirty_sectors; + if (a.gen == i->gen && a.dirty_sectors) { + *sectors_not_moved += a.dirty_sectors; *buckets_not_moved += 1; } } -- cgit From c6b2826cd14c5421bc50a768e923d078a71139c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Dec 2021 17:13:09 -0500 Subject: bcachefs: Freespace, need_discard btrees This adds two new btrees for the upcoming allocator rewrite: an extents btree of free buckets, and a btree for buckets awaiting discards. We also add a new trigger for alloc keys to keep the new btrees up to date, and a compatibility path to initialize them on existing filesystems. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 231 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 53 ++++++++++ fs/bcachefs/alloc_foreground.h | 14 +++ fs/bcachefs/bcachefs.h | 2 + fs/bcachefs/bcachefs_format.h | 11 +- fs/bcachefs/bkey_methods.c | 6 ++ fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_types.h | 27 ++--- fs/bcachefs/buckets.c | 5 + fs/bcachefs/buckets.h | 10 -- fs/bcachefs/extent_update.c | 13 ++- fs/bcachefs/recovery.c | 14 ++- fs/bcachefs/super-io.c | 5 + fs/bcachefs/super-io.h | 1 + fs/bcachefs/super.c | 36 +++---- fs/bcachefs/super_types.h | 1 + 16 files changed, 377 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 099be1290b4c..5d553d9b6151 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "debug.h" #include "ec.h" #include "error.h" +#include "lru.h" #include "recovery.h" #include "trace.h" #include "varint.h" @@ -41,6 +42,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; +const char * const bch2_bucket_states[] = { + "free", + "need gc gens", + "need discard", + "cached", + "dirty", + NULL +}; + struct bkey_alloc_unpacked { u64 journal_seq; u64 bucket; @@ -448,6 +458,217 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) return ret; } +/* Free space/discard btree: */ + +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct bch_alloc_v4 a, + bool set) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; + enum bucket_state state = bucket_state(a); + enum btree_id btree; + enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; + int ret; + + if (state != BUCKET_free && + state != BUCKET_need_discard) + return 0; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.type = new_type; + + switch (state) { + case BUCKET_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(alloc_k.k->p, a); + bch2_key_resize(&k->k, 1); + break; + case BUCKET_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = alloc_k.k->p; + break; + default: + return 0; + } + + bch2_trans_iter_init(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + old = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(old); + if (ret) + goto err; + + if (ca->mi.freespace_initialized && + bch2_fs_inconsistent_on(old.k->type != old_type, c, + "incorrect key when %s %s btree (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", + bch2_btree_ids[btree], + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_trans_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a, *new_a; + u64 old_lru, new_lru; + int ret = 0; + + /* + * Deletion only happens in the device removal path, with + * BTREE_TRIGGER_NORUN: + */ + BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + + bch2_alloc_to_v4(old, &old_a); + new_a = &bkey_i_to_alloc_v4(new)->v; + + if (new_a->dirty_sectors > old_a.dirty_sectors || + new_a->cached_sectors > old_a.cached_sectors) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } + + if (old_a.data_type && !new_a->data_type && + old_a.gen == new_a->gen && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + + if (bucket_state(old_a) != bucket_state(*new_a) || + (bucket_state(*new_a) == BUCKET_free && + alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); + if (ret) + return ret; + } + + old_lru = alloc_lru_idx(old_a); + new_lru = alloc_lru_idx(*new_a); + + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, + old_lru, &new_lru); + if (ret) + return ret; + + if (new_lru && new_a->io_time[READ] != new_lru) + new_a->io_time[READ] = new_lru; + } + + return 0; +} + +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bch_member *m; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (iter.pos.offset >= ca->mi.nbuckets) + break; + + bch2_alloc_to_v4(k, &a); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_bucket_do_index(&trans, k, a, true)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, 
&iter); + + bch2_trans_exit(&trans); + + if (ret) { + bch_err(ca, "error initializing free space: %i", ret); + return ret; + } + + mutex_lock(&c->sb_lock); + m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_fs_freespace_init(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + bool doing_init = false; + + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ + + for_each_member_device(ca, c, i) { + if (ca->mi.freespace_initialized) + continue; + + if (!doing_init) { + bch_info(c, "initializing freespace"); + doing_init = true; + } + + ret = bch2_dev_freespace_init(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + return ret; + } + } + + if (doing_init) { + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch_verbose(c, "done initializing freespace"); + } + + return ret; +} + /* Bucket IO clocks: */ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, @@ -485,6 +706,16 @@ out: * commands to the newly free buckets, then puts them on the various freelists. */ +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree. + */ + +static inline u8 bucket_gc_gen(struct bucket *g) +{ + return g->mark.gen - g->oldest_gen; +} + static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, struct bucket_mark m) { diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b66c8cf0341e..8de109674f0f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,6 +13,51 @@ extern const char * const bch2_allocator_states[]; /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U +static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ + return a.gen - a.oldest_gen; +} + +enum bucket_state { + BUCKET_free, + BUCKET_need_gc_gens, + BUCKET_need_discard, + BUCKET_cached, + BUCKET_dirty, +}; + +extern const char * const bch2_bucket_states[]; + +static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) +{ + if (a.dirty_sectors || a.stripe) + return BUCKET_dirty; + if (a.cached_sectors) + return BUCKET_cached; + BUG_ON(a.data_type); + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BUCKET_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BUCKET_need_gc_gens; + return BUCKET_free; +} + +static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) +{ + return bucket_state(a) == BUCKET_cached ? 
a.io_time[READ] : 0; +} + +static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} + +static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) +{ + pos.offset |= alloc_freespace_genbits(a); + return pos; +} + struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); @@ -33,18 +78,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } @@ -52,6 +100,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .key_invalid = bch2_alloc_v4_invalid, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } @@ -64,6 +113,10 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *, bool, bool); +int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); +int bch2_fs_freespace_init(struct bch_fs *); + static inline void bch2_wake_allocator(struct bch_dev *ca) { struct task_struct *p; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 9b4389b09cbb..efb6d155bd00 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -117,6 +117,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke return false; } +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7891ad208a33..6c11ebee73a9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -392,6 +392,8 @@ enum gc_phase { GC_PHASE_BTREE_subvolumes, GC_PHASE_BTREE_snapshots, GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a640a45a123a..ee683d08e8ae 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -947,7 +947,6 @@ enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() #undef x - BCH_ALLOC_FIELD_NR }; /* Quotas: */ @@ -1146,6 +1145,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) 
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags[0], 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -1361,7 +1362,9 @@ struct bch_sb_field_journal_seq_blacklist { x(snapshot_2, 15) \ x(reflink_p_fix, 16) \ x(subvol_dirent, 17) \ - x(inode_v2, 18) + x(inode_v2, 18) \ + x(freespace, 19) \ + x(alloc_v4, 20) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1889,7 +1892,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(reflink, 7) \ x(subvolumes, 8) \ x(snapshots, 9) \ - x(lru, 10) + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 62774e4f6dbb..9a1819147749 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -170,6 +170,12 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_lru] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_lru), + [BKEY_TYPE_freespace] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_need_discard] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b6f0f6dec8e8..b1099958ed5e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -925,7 +925,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "error decrypting btree node: %i", ret)) goto fsck_err; - btree_err_on(btree_node_is_extents(b) && + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), BTREE_ERR_FATAL, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d5be6004071a..38c9148f608d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -595,24 +595,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - switch (type) { - case BKEY_TYPE_extents: - case BKEY_TYPE_reflink: - return true; - default: - return false; - } -} - -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type_is_extents(btree_node_type(b)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ (1U << BKEY_TYPE_inodes)| \ (1U << BKEY_TYPE_stripes)| \ (1U << BKEY_TYPE_reflink)| \ @@ -628,6 +613,16 @@ static inline bool btree_node_is_extents(struct btree *b) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + #define BTREE_ID_HAS_SNAPSHOTS \ ((1U << BTREE_ID_extents)| \ (1U << BTREE_ID_inodes)| \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b4252c2f028a..5e247235ab69 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -560,6 +560,11 @@ int bch2_mark_alloc(struct btree_trans *trans, } } + if (bucket_state(new_a) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + percpu_down_read(&c->mark_lock); if (!gc && new_a.gen != old_a.gen) *bucket_gen(ca, new.k->p.offset) 
= new_a.gen; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4937d7939c2b..757919d5e20f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -81,16 +81,6 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. - */ - -static inline u8 bucket_gc_gen(struct bucket *g) -{ - return g->mark.gen - g->oldest_gen; -} - static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 58b2c96f450c..2fd5d9672a44 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - unsigned ret = 0; + unsigned ret = 0, lru = 0; bkey_extent_entry_for_each(ptrs, entry) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; case BCH_EXTENT_ENTRY_stripe_ptr: ret++; } } - return ret; + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; } static int count_iters_for_insert(struct btree_trans *trans, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 93882e6a2ae4..690a36ea1383 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1028,8 +1028,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); c->opts.version_upgrade = true; c->opts.fsck = true; - } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { - bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { + bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); c->opts.version_upgrade = true; } } @@ -1197,6 +1197,11 @@ use_clean: if (c->opts.verbose || !c->sb.clean) bch_info(c, "journal replay done"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); @@ -1380,6 +1385,11 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bc845c42768e..c7962266f495 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1083,6 +1083,11 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); pr_newline(out); + pr_buf(out, "Freespace initialized:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + pr_newline(out); + pr_indent_pop(out, 2); } } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index bed61a01e4be..ab0ad3248e8f 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -103,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .durability = BCH_MEMBER_DURABILITY(mi) ? 
BCH_MEMBER_DURABILITY(mi) - 1 : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = bch2_member_exists(mi), }; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 208482db3683..c6585034f4d4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1471,30 +1471,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - size_t i; + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < ca->mi.nbuckets; i++) { - ret = lockrestart_do(&trans, - bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i))); - if (ret) - break; - } - bch2_trans_exit(&trans); - - if (ret) { + ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) bch_err(c, "error %i removing dev alloc info", ret); - return ret; - } - return bch2_btree_delete_range(c, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - POS(ca->dev_idx + 1, 0), - 0, NULL); + return ret; } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1712,6 +1702,12 @@ have_slot: goto err_late; } + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err(c, "device add error: error initializing free space: %i", ret); + goto err_late; + } + ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) { diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 1c0241304f32..08faeedba326 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -33,6 +33,7 @@ struct bch_member_cpu { u8 discard; u8 data_allowed; u8 durability; + u8 freespace_initialized; u8 valid; }; -- cgit From f25d8215f499418c17dfde0b3158a66e03c758dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jan 2022 20:48:31 -0500 Subject: bcachefs: Kill allocator threads & freelists Now that we have new persistent data structures for the allocator, this patch converts the allocator to use them. Now, foreground bucket allocation uses the freespace btree to find buckets to allocate, instead of popping buckets off the freelist. The background allocator threads are no longer needed and are deleted, as well as the allocator freelists. Now we only need background tasks for invalidating buckets containing cached data (when we are low on empty buckets), and for issuing discards. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 560 +---------------------------------- fs/bcachefs/alloc_background.h | 33 --- fs/bcachefs/alloc_foreground.c | 561 ++++++++++++++++++++++++++++++------ fs/bcachefs/alloc_foreground.h | 11 + fs/bcachefs/alloc_types.h | 23 -- fs/bcachefs/bcachefs.h | 23 +- fs/bcachefs/btree_gc.c | 10 +- fs/bcachefs/btree_update_interior.c | 17 +- fs/bcachefs/buckets.c | 72 +---- fs/bcachefs/buckets.h | 62 ++-- fs/bcachefs/buckets_types.h | 2 - fs/bcachefs/ec.c | 13 +- fs/bcachefs/journal.c | 2 - fs/bcachefs/journal_io.c | 4 + fs/bcachefs/movinggc.c | 23 +- fs/bcachefs/recovery.c | 2 + fs/bcachefs/super.c | 82 +----- fs/bcachefs/sysfs.c | 47 +-- fs/bcachefs/trace.h | 67 ++++- 19 files changed, 615 insertions(+), 999 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5d553d9b6151..3ba2b35fad53 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -27,13 +27,6 @@ #include #include -const char * const bch2_allocator_states[] = { -#define x(n) #n, - ALLOC_THREAD_STATES() -#undef x - NULL -}; - /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) g->_mark.gen = a.gen; g->io_time[READ] = a.io_time[READ]; g->io_time[WRITE] = a.io_time[WRITE]; - g->oldest_gen = !gc ? a.oldest_gen : a.gen; g->gen_valid = 1; if (!gc || @@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } if (old_a.data_type && !new_a->data_type && @@ -698,493 +689,6 @@ out: return ret; } -/* Background allocator thread: */ - -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. - */ - -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. - */ - -static inline u8 bucket_gc_gen(struct bucket *g) -{ - return g->mark.gen - g->oldest_gen; -} - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, - struct bucket_mark m) -{ - u8 gc_gen; - - if (!is_available_bucket(m)) - return false; - - if (m.owned_by_allocator) - return false; - - if (ca->buckets_nouse && - test_bit(b, ca->buckets_nouse)) - return false; - - if (ca->new_fs_bucket_idx) { - /* - * Device or filesystem is still being initialized, and we - * haven't fully marked superblocks & journal: - */ - if (is_superblock_bucket(ca, b)) - return false; - - if (b < ca->new_fs_bucket_idx) - return false; - } - - gc_gen = bucket_gc_gen(bucket(ca, b)); - - ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; - ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; - - return gc_gen < BUCKET_GC_GEN_MAX; -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. 
- */ - -static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, - u64 now, u64 last_seq_ondisk) -{ - unsigned used = m.cached_sectors; - - if (used) { - /* - * Prefer to keep buckets that have been read more recently, and - * buckets that have more data in them: - */ - u64 last_read = max_t(s64, 0, now - g->io_time[READ]); - u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - - return -last_read_scaled; - } else { - /* - * Prefer to use buckets with smaller gc_gen so that we don't - * have to walk the btree and recalculate oldest_gen - but shift - * off the low bits so that buckets will still have equal sort - * keys when there's only a small difference, so that we can - * keep sequential buckets together: - */ - return bucket_gc_gen(g) >> 4; - } -} - -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return cmp_int(l.key, r.key) ?: - cmp_int(r.nr, l.nr) ?: - cmp_int(l.bucket, r.bucket); -} - -static inline int bucket_idx_cmp(const void *_l, const void *_r) -{ - const struct alloc_heap_entry *l = _l, *r = _r; - - return cmp_int(l->bucket, r->bucket); -} - -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - u64 now, last_seq_ondisk; - size_t b, i, nr = 0; - - down_read(&ca->bucket_lock); - - buckets = bucket_array(ca); - ca->alloc_heap.used = 0; - now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.flushed_seq_ondisk; - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket *g = &buckets->b[b]; - struct bucket_mark m = READ_ONCE(g->mark); - unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); - - cond_resched(); - - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; - - if (!m.data_type && - bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - last_seq_ondisk, - ca->dev_idx, b)) { - ca->buckets_waiting_on_journal++; - continue; - } - - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - } - - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); - } - - up_read(&ca->bucket_lock); -} - -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - size_t i, nr = 0; - - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; - ca->buckets_waiting_on_journal = 0; - - find_reclaimable_buckets_lru(c, ca); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - return nr; -} - -static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_i_alloc_v4 *a) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - 
BTREE_ITER_INTENT); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - bkey_alloc_v4_init(&a->k_i); - a->k.p = iter.pos; - bch2_alloc_to_v4(k, &a->v); - a->v.gen++; - a->v.data_type = 0; - a->v.dirty_sectors = 0; - a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - - ret = bch2_trans_update(trans, &iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE| - BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, unsigned flags) -{ - struct bkey_i_alloc_v4 a; - size_t b; - u64 commit_seq = 0; - int ret = 0; - - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; - - BUG_ON(!ca->alloc_heap.used || - !ca->alloc_heap.data[0].nr); - b = ca->alloc_heap.data[0].bucket; - - /* first, put on free_inc and mark as owned by allocator: */ - percpu_down_read(&c->mark_lock); - - bch2_mark_alloc_bucket(c, ca, b, true); - - spin_lock(&c->freelist_lock); - verify_not_on_freelist(c, ca, b); - BUG_ON(!fifo_push(&ca->free_inc, b)); - spin_unlock(&c->freelist_lock); - - percpu_up_read(&c->mark_lock); - - ret = bch2_trans_do(c, NULL, &commit_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - flags, - bucket_invalidate_btree(&trans, ca, b, &a)); - - if (!ret) { - /* remove from alloc_heap: */ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - top->bucket++; - top->nr--; - - if (!top->nr) - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - - /* - * If we invalidating cached data then we need to wait on the - * journal commit: - */ - if (a.v.data_type) - *journal_seq = max(*journal_seq, commit_seq); - - /* - * We already waiting on u.alloc_seq when we filtered out - * buckets that need journal commit: - */ - BUG_ON(*journal_seq > a.v.journal_seq); - } else { - size_t b2; - - /* remove from free_inc: */ - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - bch2_mark_alloc_bucket(c, ca, b, false); - - BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); - BUG_ON(b != b2); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); - } - - return ret < 0 ? ret : 0; -} - -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u64 journal_seq = 0; - int ret = 0; - - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) { - if (kthread_should_stop()) { - ret = 1; - break; - } - - ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, - (!fifo_empty(&ca->free_inc) - ? 
BTREE_INSERT_NOWAIT : 0)); - /* - * We only want to batch up invalidates when they're going to - * require flushing the journal: - */ - if (!journal_seq) - break; - } - - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret < 0) - bch_err(ca, "error invalidating buckets: %i", ret); - if (ret) - return ret; - - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } - - return 0; -} - -static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -{ - if (ca->allocator_state != new_state) { - ca->allocator_state = new_state; - closure_wake_up(&ca->fs->freelist_wait); - } -} - -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - unsigned i; - int ret = 0; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (i == RESERVE_movinggc && - !test_bit(BCH_FS_STARTED, &c->flags)) - continue; - - if (fifo_push(&ca->free[i], b)) { - fifo_pop(&ca->free_inc, b); - ret = 1; - break; - } - } - spin_unlock(&c->freelist_lock); - - ca->allocator_state = ret - ? ALLOCATOR_running - : ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - return ret; -} - -static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - if (!c->opts.nochanges && - ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), - ca->mi.bucket_size, GFP_NOFS); -} - -static bool allocator_thread_running(struct bch_dev *ca) -{ - unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) - ? ALLOCATOR_running - : ALLOCATOR_stopped; - alloc_thread_set_state(ca, state); - return state == ALLOCATOR_running; -} - -static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -{ - s64 available = dev_buckets_reclaimable(ca) - - (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); - bool ret = available > 0; - - alloc_thread_set_state(ca, ret - ? ALLOCATOR_running - : ALLOCATOR_blocked); - return ret; -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. 
- */ -static int bch2_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - unsigned long gc_count = c->gc_count; - size_t nr; - int ret; - - set_freezable(); - - while (1) { - ret = kthread_wait_freezable(allocator_thread_running(ca)); - if (ret) - goto stop; - - while (!ca->alloc_heap.used) { - cond_resched(); - - ret = kthread_wait_freezable(buckets_available(ca, gc_count)); - if (ret) - goto stop; - - gc_count = c->gc_count; - nr = find_reclaimable_buckets(c, ca); - - if (!nr && ca->buckets_waiting_on_journal) { - ret = bch2_journal_flush(&c->journal); - if (ret) - goto stop; - } else if (nr < (ca->mi.nbuckets >> 6) && - ca->buckets_waiting_on_journal >= nr / 2) { - bch2_journal_flush_async(&c->journal, NULL); - } - - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); - } - - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; - - while (!fifo_empty(&ca->free_inc)) { - u64 b = fifo_peek(&ca->free_inc); - - discard_one_bucket(c, ca, b); - - ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); - if (ret) - goto stop; - } - } -stop: - alloc_thread_set_state(ca, ALLOCATOR_stopped); - return 0; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i, j; + unsigned i; lockdep_assert_held(&c->state_lock); @@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. */ - for (j = 0; j < RESERVE_none; j++) - dev_reserve += ca->free[j].size; + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ @@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; - BUG_ON(ca->alloc_thread); - /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -{ - if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_running); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? 
- */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch-alloc/%s", ca->name); - if (IS_ERR(p)) { - bch_err(ca->fs, "error creating allocator thread: %li", - PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8de109674f0f..74b23f9b1bd3 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,8 +8,6 @@ #include "debug.h" #include "super.h" -extern const char * const bch2_allocator_states[]; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -117,42 +115,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_fs_freespace_init(struct bch_fs *); -static inline void bch2_wake_allocator(struct bch_dev *ca) -{ - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); -} - -static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (bch2_expensive_debug_checks) { - size_t iter; - long i; - unsigned j; - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - void bch2_recalc_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index c4b4689fdd0f..01abcf43341f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -14,13 +14,18 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "io.h" +#include "journal.h" #include "trace.h" #include @@ -50,6 +55,17 @@ const char * const bch2_alloc_reserves[] = { * reference _after_ doing the index update that makes its allocation reachable. 
*/ +void bch2_reset_alloc_cursors(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + ca->alloc_cursor = 0; + rcu_read_unlock(); +} + static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) { open_bucket_idx_t idx = ob - c->open_buckets; @@ -85,7 +101,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; ob->data_type = 0; @@ -185,39 +200,35 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) } } -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, + enum alloc_reserve reserve, + struct bch_alloc_v4 *a, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) { struct open_bucket *ob; - long b = 0; - spin_lock(&c->freelist_lock); + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + (*skipped_nouse)++; + return NULL; + } - if (may_alloc_partial) { - int i; - - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + (*skipped_open)++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + (*skipped_need_journal_commit)++; + return NULL; } + spin_lock(&c->freelist_lock); + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -226,36 +237,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_none], b))) - goto out; - - switch (reserve) { - case RESERVE_btree_movinggc: - case RESERVE_movinggc: - if (fifo_pop(&ca->free[RESERVE_movinggc], b)) - goto out; - break; - default: - break; + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + (*skipped_open)++; + return NULL; } - if (cl) - closure_wait(&c->freelist_wait, cl); - - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); - return ERR_PTR(-FREELIST_EMPTY); -out: - verify_not_on_freelist(c, ca, b); - ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); @@ -264,8 +255,8 @@ out: ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; - ob->gen = *bucket_gen(ca, b); - ob->bucket = b; + ob->gen = a->gen; + ob->bucket = bucket; spin_unlock(&ob->lock); ca->nr_open_buckets++; @@ -286,10 +277,326 @@ out: } 
spin_unlock(&c->freelist_lock); + return ob; +} + +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum alloc_reserve reserve, u64 free_entry, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct bkey_s_c freespace_k, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct open_bucket *ob; + struct bch_alloc_v4 a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { + pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + " freespace key ", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + bch2_alloc_to_v4(k, &a); + + if (genbits != (alloc_freespace_genbits(a) >> 56)) { + pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(a) >> 56); + bch2_bkey_val_to_text(&buf, c, freespace_k); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + + } + + if (a.data_type != BUCKET_free) { + pr_buf(&buf, "non free bucket in freespace btree\n" + " freespace key "); + bch2_bkey_val_to_text(&buf, c, freespace_k); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + if (!ob) + iter.path->preserve = false; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) +{ + struct open_bucket *ob; + int i; + + spin_lock(&c->freelist_lock); + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } + + spin_unlock(&c->freelist_lock); + return NULL; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_early(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *buckets_seen, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + int ret; +again: + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + 
BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a; + + if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + bch2_alloc_to_v4(k, &a); + + if (bucket_state(a) != BUCKET_free) + continue; + + (*buckets_seen)++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + ca->alloc_cursor = alloc_cursor; + + if (!ob && alloc_cursor > alloc_start) { + alloc_cursor = alloc_start; + goto again; + } + + return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); +} + +static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *buckets_seen, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; + + BUG_ON(ca->new_fs_bucket_idx); +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); + alloc_cursor < k.k->p.offset; + alloc_cursor++) { + if (btree_trans_too_many_iters(trans)) { + ob = ERR_PTR(-EINTR); + break; + } + + (*buckets_seen)++; + + ob = try_alloc_bucket(trans, ca, reserve, + alloc_cursor, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + k, cl); + if (ob) { + iter.path->preserve = false; + break; + } + } + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { + alloc_cursor = alloc_start = ca->mi.first_bucket; + goto again; + } + + return ob; +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + u64 avail = dev_buckets_available(ca, reserve); + u64 buckets_seen = 0; + u64 skipped_open = 0; + u64 skipped_need_journal_commit = 0; + u64 skipped_nouse = 0; + + if (may_alloc_partial) { + ob = try_alloc_partial_bucket(c, ca, reserve); + if (ob) + return ob; + } +again: + if (!avail) { + if (cl) { + closure_wait(&c->freelist_wait, cl); + /* recheck after putting ourself on waitlist */ + avail = dev_buckets_available(ca, reserve); + if (avail) { + closure_wake_up(&c->freelist_wait); + goto again; + } + } + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + ob = ERR_PTR(-FREELIST_EMPTY); + goto err; + } + + ob = likely(ca->mi.freespace_initialized) + ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, + &buckets_seen, + &skipped_open, + &skipped_need_journal_commit, + &skipped_nouse, + cl) + : bch2_bucket_alloc_early(trans, ca, reserve, + &buckets_seen, + &skipped_open, + &skipped_need_journal_commit, + &skipped_nouse, + cl); + + if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); +err: + if (!ob) + ob = ERR_PTR(-FREELIST_EMPTY); + + if (!IS_ERR(ob)) { + trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail, + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, PTR_ERR_OR_ZERO(ob)); + } else { + trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, PTR_ERR_OR_ZERO(ob)); + atomic_long_inc(&c->bucket_alloc_fail); + } + + return ob; +} - bch2_wake_allocator(ca); +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct open_bucket *ob; - trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, + may_alloc_partial, cl))); return ob; } @@ -320,7 +627,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, struct dev_stripe_state *stripe) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca); + u64 free_space = dev_buckets_available(ca, RESERVE_none); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -358,7 +665,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -int bch2_bucket_alloc_set(struct bch_fs *c, +static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -369,10 +676,12 @@ int bch2_bucket_alloc_set(struct bch_fs *c, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; struct bch_dev *ca; - int ret = -INSUFFICIENT_DEVICES; + int ret = 0; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -380,35 +689,68 @@ int bch2_bucket_alloc_set(struct bch_fs *c, for (i = 0; i < devs_sorted.nr; i++) { struct open_bucket *ob; - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + if (!ca) continue; - if (!ca->mi.durability && *have_cache) + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); continue; + } - ob = bch2_bucket_alloc(c, ca, reserve, + ob = bch2_bucket_alloc_trans(trans, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - - if (cl) - return ret; + if (!IS_ERR(ob)) + bch2_dev_stripe_increment(ca, stripe); + percpu_ref_put(&ca->ref); + + ret = PTR_ERR_OR_ZERO(ob); + if (ret) { + if (ret == -EINTR || cl) + break; continue; } add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(ca, stripe); - if (*nr_effective >= nr_replicas) - return 0; + break; } + if (*nr_effective >= nr_replicas) + ret = 0; + else if (!ret) + ret = -INSUFFICIENT_DEVICES; + return ret; } +int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct 
bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, + devs_may_alloc, nr_replicas, + nr_effective, have_cache, reserve, + flags, cl)); +} + /* Allocate from stripes: */ /* @@ -513,7 +855,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static int open_bucket_add_buckets(struct bch_fs *c, +static int open_bucket_add_buckets(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, @@ -526,6 +868,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, unsigned flags, struct closure *_cl) { + struct bch_fs *c = trans->c; struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; @@ -557,7 +900,8 @@ static int open_bucket_add_buckets(struct bch_fs *c, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == -FREELIST_EMPTY || + if (ret == -EINTR || + ret == -FREELIST_EMPTY || ret == -OPEN_BUCKETS_EMPTY) return ret; if (*nr_effective >= nr_replicas) @@ -571,25 +915,22 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { + if (ret && + ret != -EINTR && + ret != -INSUFFICIENT_DEVICES && + !cl && _cl) { cl = _cl; goto retry_blocking; } - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; } @@ -703,15 +1044,25 @@ static bool try_decrease_writepoints(struct bch_fs *c, return true; } -static struct write_point *writepoint_find(struct bch_fs *c, +static void bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + +static struct write_point *writepoint_find(struct btree_trans *trans, unsigned long write_point) { + struct bch_fs *c = trans->c; struct write_point *wp, *oldest; struct hlist_head *head; if (!(write_point & 1UL)) { wp = (struct write_point *) write_point; - mutex_lock(&wp->lock); + bch2_trans_mutex_lock(trans, &wp->lock); return wp; } @@ -720,7 +1071,7 @@ restart_find: wp = __writepoint_find(head, write_point); if (wp) { lock_wp: - mutex_lock(&wp->lock); + bch2_trans_mutex_lock(trans, &wp->lock); if (wp->write_point == write_point) goto out; mutex_unlock(&wp->lock); @@ -733,8 +1084,8 @@ restart_find_oldest: if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - mutex_lock(&oldest->lock); - mutex_lock(&c->write_points_hash_lock); + bch2_trans_mutex_lock(trans, &oldest->lock); + bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); if (oldest >= c->write_points + c->write_points_nr || try_increase_writepoints(c)) { mutex_unlock(&c->write_points_hash_lock); @@ -762,7 +1113,7 @@ out: /* * Get us an open_bucket we can allocate from, return with it locked: */ -int bch2_alloc_sectors_start(struct bch_fs *c, +int bch2_alloc_sectors_start_trans(struct btree_trans *trans, unsigned target, unsigned erasure_code, struct write_point_specifier write_point, 
@@ -774,6 +1125,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c, struct closure *cl, struct write_point **wp_ret) { + struct bch_fs *c = trans->c; struct write_point *wp; struct open_bucket *ob; struct open_buckets ptrs; @@ -793,7 +1145,7 @@ retry: write_points_nr = c->write_points_nr; have_cache = false; - *wp_ret = wp = writepoint_find(c, write_point.v); + *wp_ret = wp = writepoint_find(trans, write_point.v); if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -803,21 +1155,21 @@ retry: have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, ob_flags, cl); } else { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, ob_flags, NULL); - if (!ret) + if (!ret || ret == -EINTR) goto alloc_done; - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, @@ -871,10 +1223,33 @@ err: case -INSUFFICIENT_DEVICES: return -EROFS; default: - BUG(); + return ret; } } +int bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, target, + erasure_code, + write_point, + devs_have, + nr_replicas, + nr_replicas_required, + reserve, + flags, cl, wp_ret)); +} + struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index efb6d155bd00..12583a7e7aa3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -14,6 +14,8 @@ struct bch_devs_List; extern const char * const bch2_alloc_reserves[]; +void bch2_reset_alloc_cursors(struct bch_fs *); + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -136,6 +138,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); +int bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *, + struct write_point **); int bch2_alloc_sectors_start(struct bch_fs *, unsigned, unsigned, struct write_point_specifier, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 9e00afb17559..b3bef7074511 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,18 +10,6 @@ struct ec_bucket_buf; -#define ALLOC_THREAD_STATES() \ - x(stopped) \ - x(running) \ - x(blocked) \ - x(blocked_full) - -enum allocator_states { -#define x(n) ALLOCATOR_##n, - ALLOC_THREAD_STATES() -#undef x -}; - #define BCH_ALLOC_RESERVES() \ x(btree_movinggc) \ x(btree) \ @@ -32,11 +20,8 @@ enum alloc_reserve { #define x(name) RESERVE_##name, BCH_ALLOC_RESERVES() #undef x - RESERVE_NR }; -typedef FIFO(long) 
alloc_fifo; - #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 @@ -127,12 +112,4 @@ struct write_point_specifier { unsigned long v; }; -struct alloc_heap_entry { - size_t bucket; - size_t nr; - unsigned long key; -}; - -typedef HEAP(struct alloc_heap_entry) alloc_heap; - #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6c11ebee73a9..879b2adc8b42 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -462,34 +462,18 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - struct task_struct __rcu *alloc_thread; + u64 alloc_cursor; - /* - * free: Buckets that are ready to be used - * - * free_inc: Incoming buckets - these are buckets that currently have - * cached data in them, and we can't reuse them until after we write - * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) - */ - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; unsigned nr_open_buckets; + unsigned nr_btree_reserve; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; - size_t fifo_last_bucket; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; - enum allocator_states allocator_state; - - alloc_heap alloc_heap; - atomic64_t rebalance_work; struct journal_device journal; @@ -511,8 +495,6 @@ struct bch_dev { enum { /* startup: */ BCH_FS_ALLOC_CLEAN, - BCH_FS_ALLOCATOR_RUNNING, - BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, @@ -914,6 +896,7 @@ mempool_t bio_bounce_pages; atomic_long_t read_realloc_races; atomic_long_t extent_migrate_done; atomic_long_t extent_migrate_raced; + atomic_long_t bucket_alloc_fail; unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index c3d6c62ef062..7078b277e23b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1684,9 +1684,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i, iter = 0; + unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1787,13 +1786,6 @@ out: trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - /* - * Wake up allocator in case it was waiting for buckets - * because of not being able to inc gens - */ - for_each_member_device(ca, c, i) - bch2_wake_allocator(ca); - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d1e3e2c76e30..2e958f88777b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -178,12 +178,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, six_unlock_intent(&b->c.lock); } -static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, unsigned flags) { + struct bch_fs *c = trans->c; struct write_point *wp; struct btree *b; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; @@ -214,7 +215,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs 
*c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - ret = bch2_alloc_sectors_start(c, + ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -414,7 +415,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) mutex_unlock(&c->btree_reserve_cache_lock); } -static int bch2_btree_reserve_get(struct btree_update *as, +static int bch2_btree_reserve_get(struct btree_trans *trans, + struct btree_update *as, unsigned nr_nodes[2], unsigned flags, struct closure *cl) @@ -441,7 +443,7 @@ static int bch2_btree_reserve_get(struct btree_update *as, struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(c, &as->disk_res, + b = __bch2_btree_node_alloc(trans, &as->disk_res, flags & BTREE_INSERT_NOWAIT ? NULL : cl, interior, flags); if (IS_ERR(b)) { @@ -1066,8 +1068,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL); - if (ret) { + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + if (ret == -EAGAIN || + ret == -ENOMEM) { struct closure cl; closure_init_stack(&cl); @@ -1075,7 +1078,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_trans_unlock(trans); do { - ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); closure_sync(&cl); } while (ret == -EAGAIN); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5e247235ab69..2c6fdf385ba3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -296,11 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca, : 0; } -static inline int is_stripe_data_bucket(struct bucket_mark m) -{ - return m.stripe && m.data_type != BCH_DATA_parity; -} - static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -350,9 +345,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); preempt_enable(); - - if (!is_available_bucket(old) && is_available_bucket(new)) - bch2_wake_allocator(ca); } static inline int __update_replicas(struct bch_fs *c, @@ -488,19 +480,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator) -{ - struct bucket *g = bucket(ca, b); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - new.owned_by_allocator = owned_by_allocator; - })); - - BUG_ON(owned_by_allocator == old.owned_by_allocator); -} - int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -560,6 +539,10 @@ int bch2_mark_alloc(struct btree_trans *trans, } } + if (!new_a.data_type && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + if (bucket_state(new_a) == BUCKET_need_gc_gens) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); @@ -583,7 +566,6 @@ int bch2_mark_alloc(struct btree_trans *trans, g->io_time[READ] = new_a.io_time[READ]; g->io_time[WRITE] = new_a.io_time[WRITE]; - g->oldest_gen = new_a.oldest_gen; g->gen_valid = 1; g->stripe = new_a.stripe; g->stripe_redundancy = new_a.stripe_redundancy; @@ -1861,8 +1843,7 @@ static int 
__bch2_trans_mark_metadata_bucket(struct btree_trans *trans, a->v.data_type = type; a->v.dirty_sectors = sectors; - ret = bch2_trans_update(trans, &iter, &a->k_i, - BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; out: @@ -2048,24 +2029,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bucket_array *buckets = NULL, *old_buckets = NULL; struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; - alloc_heap alloc_heap; - - size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); - size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), - btree_reserve * 2); bool resize = ca->buckets[0] != NULL; int ret = -ENOMEM; - unsigned i; - - memset(&free, 0, sizeof(free)); - memset(&free_inc, 0, sizeof(free_inc)); - memset(&alloc_heap, 0, sizeof(alloc_heap)); if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), @@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) (c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO))) || - !init_fifo(&free[RESERVE_movinggc], - copygc_reserve, GFP_KERNEL) || - !init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) + GFP_KERNEL|__GFP_ZERO)))) goto err; buckets->first_bucket = ca->mi.first_bucket; @@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) up_write(&c->gc_lock); } - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - fifo_move(&free[i], &ca->free[i]); - swap(ca->free[i], free[i]); - } - fifo_move(&free_inc, &ca->free_inc); - swap(ca->free_inc, free_inc); - spin_unlock(&c->freelist_lock); - - /* with gc lock held, alloc_heap can't be in use: */ - swap(ca->alloc_heap, alloc_heap); - nbuckets = ca->mi.nbuckets; if (resize) @@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - free_heap(&alloc_heap); - free_fifo(&free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) @@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->alloc_heap); - free_fifo(&ca->free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 757919d5e20f..bcb40f15f82e 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, true); } -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, false); -} - static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, @@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct 
bucket_mark mark) struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats) + struct bch_dev_usage stats, + enum alloc_reserve reserve) { - u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 reserved = 0; + + switch (reserve) { + case RESERVE_none: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case RESERVE_movinggc: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree_movinggc: + break; + default: + BUG(); + } if (WARN_ONCE(stats.buckets_unavailable > total, "buckets_unavailable overflow (%llu > %llu)\n", stats.buckets_unavailable, total)) return 0; - return total - stats.buckets_unavailable; + return max_t(s64, 0, + total - + stats.buckets_unavailable - + ca->nr_open_buckets - + reserved); } -static inline u64 dev_buckets_available(struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum alloc_reserve reserve) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -} - -static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, - struct bch_dev_usage stats) -{ - struct bch_fs *c = ca->fs; - s64 available = __dev_buckets_available(ca, stats); - unsigned i; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - available -= fifo_used(&ca->free[i]); - available -= fifo_used(&ca->free_inc); - available -= ca->nr_open_buckets; - spin_unlock(&c->freelist_lock); - - return max(available, 0LL); -} - -static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) -{ - return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); } /* Filesystem usage: */ static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + READ_ONCE(c->replicas.nr); } @@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 4f7018398385..6ddbea4da7d1 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -14,7 +14,6 @@ struct bucket_mark { struct { u8 gen; u8 data_type:3, - owned_by_allocator:1, stripe:1; u16 dirty_sectors; u16 cached_sectors; @@ -29,7 +28,6 @@ struct bucket { }; u64 io_time[2]; - u8 oldest_gen; unsigned gen_valid:1; u8 stripe_redundancy; u32 stripe; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9dc2f9f822c8..5030a5b831af 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(nr_have_data > h->s->nr_data); BUG_ON(nr_have_parity > h->s->nr_parity); - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - buckets.nr = 0; if (nr_have_parity < h->s->nr_parity) { ret = bch2_bucket_alloc_set(c, &buckets, @@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } buckets.nr = 0; @@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } -err: - 
rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; + + return 0; } /* XXX: doesn't obey target: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index cb15d1c8a135..f87f76553bf4 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - rcu_read_lock(); ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl); - rcu_read_unlock(); if (IS_ERR(ob[nr_got])) { ret = cl ? -EAGAIN : -ENOSPC; break; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b2c3ee336c1f..3e418342ee67 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl) if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; + + closure_wake_up(&c->freelist_wait); + + bch2_reset_alloc_cursors(c); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index aecec55eb421..b9e1bd7b1d05 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, return DATA_SKIP; } -static bool have_copygc_reserve(struct bch_dev *ca) -{ - bool ret; - - spin_lock(&ca->fs->freelist_lock); - ret = fifo_full(&ca->free[RESERVE_movinggc]) || - ca->allocator_state != ALLOCATOR_running; - spin_unlock(&ca->fs->freelist_lock); - - return ret; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c) } for_each_rw_member(ca, c, dev_idx) { - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), + ca->mi.nbuckets >> 6); - spin_lock(&ca->fs->freelist_lock); - sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size; - spin_unlock(&ca->fs->freelist_lock); + sectors_reserved += avail * ca->mi.bucket_size; } ret = walk_buckets_to_copygc(c); @@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * - ca->mi.bucket_size) >> 1); + fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); fragmented = usage.d[BCH_DATA_user].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 690a36ea1383..50e5c5e852f7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1374,6 +1374,7 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ + bch_verbose(c, "marking superblocks"); err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); @@ -1385,6 +1386,7 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } + bch_verbose(c, "initializing freespace"); err = "error initializing freespace"; ret = bch2_fs_freespace_init(c); if (ret) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c6585034f4d4..3a8740fde9de 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -206,17 +206,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ 
bch2_journal_flush_all_pins(&c->journal); - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); do { clean_passes++; @@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); flush_work(&c->btree_interior_update_work); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - bch2_fs_journal_stop(&c->journal); /* @@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). */ percpu_ref_kill(&c->writes); @@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, @@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); 
bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1710,13 +1656,8 @@ have_slot: ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) { - bch_err(c, "device add error: error going RW on new device: %i", ret); - goto err_late; - } - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ec672134cb18..e995b84b6172 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -170,7 +170,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); @@ -186,11 +185,11 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); +read_attribute(bucket_alloc_fail); rw_attribute(discard); rw_attribute(label); @@ -377,6 +376,8 @@ SHOW(bch2_fs) atomic_long_read(&c->extent_migrate_done)); sysfs_print(extent_migrate_raced, atomic_long_read(&c->extent_migrate_raced)); + sysfs_print(bucket_alloc_fail, + atomic_long_read(&c->bucket_alloc_fail)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, + &sysfs_bucket_alloc_fail, &sysfs_gc_gens_pos, @@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "ec\t%16llu\n" "available%15llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets 
allocated\t%u\n" "open buckets this dev\t%u\n" @@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size, - fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size, + __dev_buckets_available(ca, stats, RESERVE_none), c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -848,9 +824,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) - reserve_stats_to_text(out, ca); - if (attr == &sysfs_alloc_debug) dev_alloc_debug_to_text(out, ca); @@ -890,9 +863,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } SYSFS_OPS(bch2_dev); @@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 89207fd7b617..caf59b977e2f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -471,37 +471,74 @@ TRACE_EVENT(invalidate, ); DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve), + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int ret), + TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, reserve, 16 ) + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(u64, avail ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __field(int, ret ) ), TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->avail = avail; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; + __entry->ret = ret; ), - TP_printk("%d,%d reserve %s", + TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve) + __entry->reserve, + __entry->avail, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, + __entry->ret) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int 
ret), + TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve) -); - -DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int ret), + TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret) ); /* Moving IO */ -- cgit From 59cc38b8d43b529d91c249c2eef35c8c3fc9fbd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Feb 2022 04:32:19 -0500 Subject: bcachefs: New discard implementation In the old allocator code, buckets would be discarded just prior to being used - this made sense in bcache where we were discarding buckets just after invalidating the cached data they contain, but in a filesystem where we typically have more free space we want to be discarding buckets when they become empty. This patch implements the new behaviour - it checks the need_discard btree for buckets awaiting discards, and then clears the appropriate bit in the alloc btree, which moves the buckets to the freespace btree. Additionally, discards are now enabled by default. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 140 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 5 ++ fs/bcachefs/journal_io.c | 2 + fs/bcachefs/opts.h | 2 +- fs/bcachefs/super.c | 2 + fs/bcachefs/trace.h | 34 ++++++++++ 8 files changed, 187 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3ba2b35fad53..9514c2e5f01e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -545,6 +545,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } if (old_a.data_type && !new_a->data_type && @@ -579,6 +580,144 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, return 0; } +static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_dev *ca, bool *discard_done) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } + + BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); + + if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, + "%s\n incorrectly set in need_discard btree", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } + + if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the 
need_discard tree + */ + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + *discard_done = true; + + ret = bch2_trans_relock(trans) ? 0 : -EINTR; + if (ret) + goto out; + } + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); +write: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = NULL; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_need_discard, + POS_MIN, 0, k, ret) { + bool discard_done = false; + + if (ca && k.k->p.inode != ca->dev_idx) { + percpu_ref_put(&ca->io_ref); + ca = NULL; + } + + if (!ca) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + ca = NULL; + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + } + + seen++; + + if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { + open++; + continue; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + k.k->p.inode, k.k->p.offset)) { + need_journal_commit++; + continue; + } + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOFAIL, + bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); + if (ret) + break; + + discarded++; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ca) + percpu_ref_put(&ca->io_ref); + + bch2_trans_exit(&trans); + + if (need_journal_commit * 2 > seen) + bch2_journal_flush_async(&c->journal, NULL); + + percpu_ref_put(&c->writes); + + trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); +} + +void bch2_do_discards(struct bch_fs *c) +{ + if (percpu_ref_tryget(&c->writes) && + !queue_work(system_long_wq, &c->discard_work)) + percpu_ref_put(&c->writes); +} + static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; @@ -862,4 +1001,5 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 74b23f9b1bd3..8ba9bf853c2f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -113,6 +113,8 @@ int bch2_alloc_read(struct bch_fs *, bool, bool); int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +void bch2_do_discards(struct bch_fs *); + int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 879b2adc8b42..ca48b3f86304 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -758,6 +758,7 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2c6fdf385ba3..0e86b45b6c55 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -543,6 +543,11 
@@ int bch2_mark_alloc(struct btree_trans *trans, (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) closure_wake_up(&c->freelist_wait); + if ((flags & BTREE_TRIGGER_INSERT) && + BCH_ALLOC_V4_NEED_DISCARD(&new_a) && + !new_a.journal_seq) + bch2_do_discards(c); + if (bucket_state(new_a) == BUCKET_need_gc_gens) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3e418342ee67..3974d043fd8a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" @@ -1399,6 +1400,7 @@ static void journal_write_done(struct closure *cl) j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; + bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b45740ec3c67..ce79e1a12bd0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -266,7 +266,7 @@ enum opt_type { x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT, \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3a8740fde9de..037923bca742 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -401,6 +401,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_do_discards(c); + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index caf59b977e2f..ef2096fd147d 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -182,6 +182,40 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed) ); +/* allocator: */ + +TRACE_EVENT(do_discards, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, int ret), + TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + __entry->ret = ret; + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->ret) +); + /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, -- cgit From caece7fe3f1199f0da42b4537434166f99f0c11f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Feb 2022 18:18:41 -0500 Subject: bcachefs: New bucket invalidate path In the old allocator code, preparing an existing empty bucket was part of the same code path that invalidated buckets containing cached data. In the new allocator code this is no longer the case: the main allocator path finds empty buckets (via the new freespace btree), and can't allocate buckets that contain cached data. We now need a separate code path to invalidate buckets containing cached data when we're low on empty buckets, which this patch implements. 
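As a rough, self-contained sketch of the "low on empty buckets" check this introduces - the struct, the field names and the 1/128th threshold below are illustrative assumptions, not the patch's actual helper:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for the per-device usage counters */
    struct dev_counts {
            uint64_t nbuckets;      /* total buckets on the device */
            uint64_t free;          /* empty buckets ready to allocate */
            uint64_t cached;        /* buckets holding only clean cached data */
    };

    /*
     * Trigger invalidation of cached buckets once the number of free buckets
     * drops below a small fixed fraction of the device (1/128th here, purely
     * illustrative), and only if there is cached data to reclaim.
     */
    static bool low_on_empty_buckets(const struct dev_counts *u)
    {
            uint64_t watermark = u->nbuckets >> 7;

            return u->cached && u->free < watermark;
    }

    int main(void)
    {
            struct dev_counts u = {
                    .nbuckets = 1 << 20,
                    .free     = 1000,
                    .cached   = 50000,
            };

            printf("invalidate? %d\n", low_on_empty_buckets(&u));
            return 0;
    }

The patch's real trigger, should_invalidate_buckets() in the diff below, reads its counters from bch2_dev_usage_read() rather than a hand-filled struct.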
When the number of free buckets decreases that triggers the new invalidate path to run, which uses the LRU btree to pick cached data buckets to invalidate until we're above our watermark. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 81 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 11 ++++++ fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/buckets.c | 5 +++ 4 files changed, 98 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9514c2e5f01e..fac9337dc543 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -718,6 +718,86 @@ void bch2_do_discards(struct bch_fs *c) percpu_ref_put(&c->writes); } +static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) +{ + struct bch_fs *c = trans->c; + struct btree_iter lru_iter, alloc_iter = { NULL }; + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + u64 bucket, idx; + int ret; + + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(ca->dev_idx, 0), 0); + k = bch2_btree_iter_peek(&lru_iter); + ret = bkey_err(k); + if (ret) + goto out; + + if (!k.k || k.k->p.inode != ca->dev_idx) + goto out; + + if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, + "non lru key in lru btree")) + goto out; + + idx = k.k->p.offset; + bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, + POS(ca->dev_idx, bucket)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, + "invalidating bucket with wrong lru idx (got %llu should be %llu", + idx, alloc_lru_idx(a->v))) + goto out; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE); +out: + bch2_trans_iter_exit(trans, &alloc_iter); + bch2_trans_iter_exit(trans, &lru_iter); + return ret; +} + +static void bch2_do_invalidates_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) + while (!ret && should_invalidate_buckets(ca)) + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOFAIL, + invalidate_one_bucket(&trans, ca)); + + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +void bch2_do_invalidates(struct bch_fs *c) +{ + if (percpu_ref_tryget(&c->writes)) + queue_work(system_long_wq, &c->invalidate_work); +} + static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; @@ -1002,4 +1082,5 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8ba9bf853c2f..d4883d3cd642 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -115,6 +115,17 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); void bch2_do_discards(struct bch_fs *); +static inline bool 
should_invalidate_buckets(struct bch_dev *ca) +{ + struct bch_dev_usage u = bch2_dev_usage_read(ca); + + return u.d[BCH_DATA_cached].buckets && + u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < + ca->mi.nbuckets >> 7; +} + +void bch2_do_invalidates(struct bch_fs *); + int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ca48b3f86304..66d9c209252e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -759,6 +759,7 @@ struct bch_fs { struct buckets_waiting_for_journal buckets_waiting_for_journal; struct work_struct discard_work; + struct work_struct invalidate_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0e86b45b6c55..bfab5d88550b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -548,6 +548,11 @@ int bch2_mark_alloc(struct btree_trans *trans, !new_a.journal_seq) bch2_do_discards(c); + if (!old_a.data_type && + new_a.data_type && + should_invalidate_buckets(ca)) + bch2_do_invalidates(c); + if (bucket_state(new_a) == BUCKET_need_gc_gens) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); -- cgit From 5add07d56a5e714a6ac1bedffa9b999d2966708c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Feb 2022 03:11:39 -0500 Subject: bcachefs: Fsck for need_discard & freespace btrees Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 326 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/lru.c | 84 +++++++++++ fs/bcachefs/lru.h | 2 + fs/bcachefs/recovery.c | 29 +++- 5 files changed, 442 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fac9337dc543..e8de96e4adf3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -580,6 +580,332 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, return 0; } +static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter discard_iter, freespace_iter; + struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret; + + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + + ret = bkey_err(alloc_k); + if (ret) + return ret; + + bch2_alloc_to_v4(alloc_k, &a); + discard_key_type = bucket_state(a) == BUCKET_need_discard + ? KEY_TYPE_set : 0; + freespace_key_type = bucket_state(a) == BUCKET_free + ? 
KEY_TYPE_set : 0; + + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, + alloc_k.k->p, 0); + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, + alloc_freespace_pos(alloc_k.k->p, a), 0); + + k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != discard_key_type, c, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = discard_key_type; + update->k.p = discard_iter.pos; + + ret = bch2_trans_update(trans, &discard_iter, update, 0); + if (ret) + goto err; + } + + k = bch2_btree_iter_peek_slot(&freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != freespace_key_type, c, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = freespace_key_type; + update->k.p = freespace_iter.pos; + bch2_key_resize(&update->k, 1); + + ret = bch2_trans_update(trans, &freespace_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + printbuf_exit(&buf2); + printbuf_exit(&buf); + return ret; +} + +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + +static int bch2_check_freespace_key(struct btree_trans *trans, + struct btree_iter *freespace_iter, + bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c k, freespace_k; + struct bch_alloc_v4 a; + u64 genbits; + struct bpos pos; + struct bkey_i *update; + struct printbuf buf = PRINTBUF; + int ret; + + freespace_k = bch2_btree_iter_peek(freespace_iter); + if (!freespace_k.k) + return 1; + + ret = bkey_err(freespace_k); + if (ret) + return ret; + + pos = freespace_iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = freespace_iter->pos.offset & (~0ULL << 56); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + "%llu:%llu set in freespace btree but device or bucket does not exist", + pos.inode, pos.offset)) + goto delete; + + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(bucket_state(a) != BUCKET_free || + genbits != alloc_freespace_genbits(a), c, + "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + bucket_state(a) == BUCKET_free, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) + goto delete; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + update = bch2_trans_kmalloc(trans, 
sizeof(*update)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); + + ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + goto out; +} + +int bch2_check_alloc_info(struct bch_fs *c, bool initial) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0, last_dev = -1; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->p.inode != last_dev) { + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (!ca->mi.freespace_initialized) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + + last_dev = k.k->p.inode; + } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_alloc_key(&trans, &iter)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_freespace_key(&trans, &iter, initial)); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; +} + +static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter lru_iter; + struct bch_alloc_v4 a; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret; + + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + + ret = bkey_err(alloc_k); + if (ret) + return ret; + + bch2_alloc_to_v4(alloc_k, &a); + + if (bucket_state(a) != BUCKET_cached) + return 0; + + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(alloc_k.k->p.inode, a.io_time[READ]), 0); + + k = bch2_btree_iter_peek_slot(&lru_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(!a.io_time[READ], c, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(k.k->type != KEY_TYPE_lru || + le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, + "incorrect/missing lru entry\n" + " %s\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + u64 read_time = a.io_time[READ]; + + if (!a.io_time[READ]) + a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + + ret = bch2_lru_change(trans, + alloc_k.k->p.inode, + alloc_k.k->p.offset, + 0, &a.io_time[READ]); + if (ret) + goto err; + + if (a.io_time[READ] != read_time) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = a.io_time[READ]; + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf2); + printbuf_exit(&buf); + return ret; +} + +int bch2_check_alloc_to_lru_refs(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + 
BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; +} + static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, struct bch_dev *ca, bool *discard_done) { diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d4883d3cd642..d82e80218b8e 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -113,6 +113,8 @@ int bch2_alloc_read(struct bch_fs *, bool, bool); int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *, bool); +int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); static inline bool should_invalidate_buckets(struct bch_dev *ca) diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 2ababca5efe5..4f0e6960e597 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "btree_iter.h" #include "btree_update.h" #include "error.h" #include "lru.h" +#include "recovery.h" const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) { @@ -117,3 +119,85 @@ int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, return lru_delete(trans, id, idx, old_time) ?: lru_set(trans, id, idx, new_time); } + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c lru_k, k; + struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + u64 idx; + int ret; + + lru_k = bch2_btree_iter_peek(lru_iter); + if (!lru_k.k) + return 0; + + ret = bkey_err(lru_k); + if (ret) + return ret; + + idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(lru_k.k->p.inode, idx), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(bucket_state(a) != BUCKET_cached || + a.io_time[READ] != lru_k.k->p.offset, c, + "incorrect lru entry %s\n" + " for %s", + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = lru_iter->pos; + + ret = bch2_trans_update(trans, lru_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c, bool initial) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_lru_key(&trans, &iter, initial)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; + +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index c3121cfee285..4db6a8399332 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -12,4 +12,6 @@ void 
bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); +int bch2_check_lrus(struct bch_fs *, bool); + #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 50e5c5e852f7..14edc0bf5112 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "lru.h" #include "move.h" #include "quota.h" #include "recovery.h" @@ -1166,13 +1167,26 @@ use_clean: bool metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); - err = "error in mark and sweep"; + err = "error checking allocations"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; bch_verbose(c, "done checking allocations"); } + if (c->opts.fsck) { + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c, true); + if (ret) + goto err; + + ret = bch2_check_lrus(c, true); + if (ret) + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + } + bch2_stripes_heap_start(c); clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); @@ -1202,6 +1216,19 @@ use_clean: if (ret) goto err; + if (c->opts.fsck) { + bch_info(c, "checking alloc to lru refs"); + err = "error checking alloc to lru refs"; + ret = bch2_check_alloc_to_lru_refs(c); + if (ret) + goto err; + + ret = bch2_check_lrus(c, true); + if (ret) + goto err; + bch_verbose(c, "done checking alloc to lru refs"); + } + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); -- cgit From 5f43f99c6ef74f592c380b39069ee68dcfe3ee58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Feb 2022 19:09:40 -0500 Subject: bcachefs: bch2_dev_usage_update() no longer depends on bucket_mark This is one of the last steps in getting rid of the main in-memory bucket array. This changes bch2_dev_usage_update() to take bkey_alloc_unpacked instead of bucket_mark, and for the places where we are in fact working with bucket_mark and don't have bkey_alloc_unpacked, we add a wrapper that takes bucket_mark and converts to bkey_alloc_unpacked. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 51 +++++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/buckets.h | 7 ------- 2 files changed, 37 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bfab5d88550b..60ad873da54f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -283,24 +283,24 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -static inline int is_unavailable_bucket(struct bucket_mark m) +static inline int is_unavailable_bucket(struct bch_alloc_v4 a) { - return !is_available_bucket(m); + return a.dirty_sectors || a.stripe; } static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bucket_mark m) + struct bch_alloc_v4 a) { - return m.dirty_sectors - ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) : 0; } -static inline enum bch_data_type bucket_type(struct bucket_mark m) +static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) { - return m.cached_sectors && !m.dirty_sectors + return a.cached_sectors && !a.dirty_sectors ? 
BCH_DATA_cached - : m.data_type; + : a.data_type; } static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -315,7 +315,8 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new, + struct bch_alloc_v4 old, + struct bch_alloc_v4 new, u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; @@ -347,6 +348,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket_mark old, struct bucket_mark new, + u64 journal_seq, bool gc) +{ + struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +} + static inline int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -562,6 +585,8 @@ int bch2_mark_alloc(struct btree_trans *trans, if (!gc && new_a.gen != old_a.gen) *bucket_gen(ca, new.k->p.offset) = new_a.gen; + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); + g = __bucket(ca, new.k->p.offset, gc); old_m = bucket_cmpxchg(g, m, ({ @@ -572,8 +597,6 @@ int bch2_mark_alloc(struct btree_trans *trans, m.stripe = new_a.stripe != 0; })); - bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - g->io_time[READ] = new_a.io_time[READ]; g->io_time[WRITE] = new_a.io_time[WRITE]; g->gen_valid = 1; @@ -651,7 +674,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - bch2_dev_usage_update(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, old, new, 0, true); percpu_up_read(&c->mark_lock); } @@ -810,7 +833,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); printbuf_exit(&buf); @@ -883,7 +906,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index bcb40f15f82e..9cc6c16bcc64 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -134,13 +134,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, return ret; } -/* bucket gc marks */ - -static inline bool is_available_bucket(struct bucket_mark mark) -{ - return !mark.dirty_sectors && !mark.stripe; -} - /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -- cgit From 5735608c14e791c10ebcb6a20fab1c8fa4cf3123 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Feb 2022 19:26:55 -0500 Subject: bcachefs: Kill main in-memory bucket array All code using the in-memory bucket array, excluding GC, has now been converted to use the alloc btree directly - so we can finally delete it. 
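
For reference, a minimal sketch of what the non-GC read path reduces to after this
change - it follows the new bch2_alloc_read() in the diff below, and only the
generation numbers are still cached in memory outside of GC; everything else is
unpacked from the alloc btree on demand (declarations trimmed for brevity):

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);

		/* unpack the alloc key instead of indexing a bucket array */
		bch2_alloc_to_v4(k, &a);
		*bucket_gen(ca, k.k->p.offset) = a.gen;
	}
	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);
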
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 26 ++------------ fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 52 ++++++++++++++++++++++----- fs/bcachefs/buckets.c | 80 +++++++++++++----------------------------- fs/bcachefs/buckets.h | 20 +++-------- fs/bcachefs/buckets_types.h | 1 - fs/bcachefs/recovery.c | 2 +- 8 files changed, 77 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e8de96e4adf3..b0f49044ea24 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -400,14 +400,13 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c pr_buf(out, " write_time %llu", a.io_time[WRITE]); } -int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bch_alloc_v4 a; struct bch_dev *ca; - struct bucket *g; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -415,30 +414,9 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, gc); bch2_alloc_to_v4(k, &a); - if (!gc) - *bucket_gen(ca, k.k->p.offset) = a.gen; - - g->_mark.gen = a.gen; - g->io_time[READ] = a.io_time[READ]; - g->io_time[WRITE] = a.io_time[WRITE]; - g->gen_valid = 1; - - if (!gc || - (metadata_only && - (a.data_type == BCH_DATA_user || - a.data_type == BCH_DATA_cached || - a.data_type == BCH_DATA_parity))) { - g->_mark.data_type = a.data_type; - g->_mark.dirty_sectors = a.dirty_sectors; - g->_mark.cached_sectors = a.cached_sectors; - g->_mark.stripe = a.stripe != 0; - g->stripe = a.stripe; - g->stripe_redundancy = a.stripe_redundancy; - } - + *bucket_gen(ca, k.k->p.offset) = a.gen; } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d82e80218b8e..3b49abf1bbc0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -109,7 +109,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *, bool, bool); +int bch2_alloc_read(struct bch_fs *); int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 66d9c209252e..c06837612bdf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -450,7 +450,7 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets[2]; + struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7078b277e23b..f66b2ef03c3a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1160,10 +1160,10 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { - kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - ca->buckets[1] = NULL; + ca->buckets_gc = NULL; free_percpu(ca->usage_gc); ca->usage_gc = NULL; @@ -1292,7 
+1292,7 @@ static int bch2_gc_start(struct bch_fs *c, } for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets[1]); + BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1346,8 +1346,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, .data_type = g->mark.data_type, .dirty_sectors = g->mark.dirty_sectors, .cached_sectors = g->mark.cached_sectors, - .io_time[READ] = g->io_time[READ], - .io_time[WRITE] = g->io_time[WRITE], .stripe = g->stripe, .stripe_redundancy = g->stripe_redundancy, }; @@ -1437,7 +1435,13 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bch_alloc_v4 a; unsigned i; + int ret; for_each_member_device(ca, c, i) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + @@ -1445,17 +1449,47 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); - percpu_up_write(&c->mark_lock); bch_err(c, "error allocating ca->buckets[gc]"); return -ENOMEM; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; - rcu_assign_pointer(ca->buckets[1], buckets); + rcu_assign_pointer(ca->buckets_gc, buckets); }; - return bch2_alloc_read(c, true, metadata_only); + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + + bch2_alloc_to_v4(k, &a); + + g->_mark.gen = a.gen; + g->gen_valid = 1; + + if (metadata_only && + (a.data_type == BCH_DATA_user || + a.data_type == BCH_DATA_cached || + a.data_type == BCH_DATA_parity)) { + g->_mark.data_type = a.data_type; + g->_mark.dirty_sectors = a.dirty_sectors; + g->_mark.cached_sectors = a.cached_sectors; + g->_mark.stripe = a.stripe != 0; + g->stripe = a.stripe; + g->stripe_redundancy = a.stripe_redundancy; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info at gc start: %i", ret); + + return ret; } static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) @@ -1464,7 +1498,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) unsigned i; for_each_member_device(ca, c, i) { - struct bucket_array *buckets = __bucket_array(ca, true); + struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; for_each_bucket(g, buckets) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 60ad873da54f..572d56676c69 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -512,8 +512,6 @@ int bch2_mark_alloc(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_alloc_v4 old_a, new_a; struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - struct bucket *g; - struct bucket_mark old_m, m; int ret = 0; if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || @@ -587,21 +585,22 @@ int bch2_mark_alloc(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); - g = __bucket(ca, new.k->p.offset, gc); - - old_m = bucket_cmpxchg(g, m, ({ - m.gen = new_a.gen; - m.data_type = new_a.data_type; - m.dirty_sectors = new_a.dirty_sectors; - m.cached_sectors = new_a.cached_sectors; - m.stripe = new_a.stripe != 0; - })); - - g->io_time[READ] = 
new_a.io_time[READ]; - g->io_time[WRITE] = new_a.io_time[WRITE]; - g->gen_valid = 1; - g->stripe = new_a.stripe; - g->stripe_redundancy = new_a.stripe_redundancy; + if (gc) { + struct bucket_mark old_m, m; + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + old_m = bucket_cmpxchg(g, m, ({ + m.gen = new_a.gen; + m.data_type = new_a.data_type; + m.dirty_sectors = new_a.dirty_sectors; + m.cached_sectors = new_a.cached_sectors; + m.stripe = new_a.stripe != 0; + })); + + g->gen_valid = 1; + g->stripe = new_a.stripe; + g->stripe_redundancy = new_a.stripe_redundancy; + } percpu_up_read(&c->mark_lock); /* @@ -610,9 +609,9 @@ int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_m.cached_sectors) { + old_a.cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -old_m.cached_sectors, + -old_a.cached_sectors, journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); @@ -620,7 +619,7 @@ int bch2_mark_alloc(struct btree_trans *trans, } trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), - old_m.cached_sectors); + old_a.cached_sectors); } return 0; @@ -2039,16 +2038,6 @@ recalculate: /* Startup/shutdown: */ -static void buckets_free_rcu(struct rcu_head *rcu) -{ - struct bucket_array *buckets = - container_of(rcu, struct bucket_array, rcu); - - kvpfree(buckets, - sizeof(*buckets) + - buckets->nbuckets * sizeof(struct bucket)); -} - static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -2059,16 +2048,12 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets = NULL; struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; - bool resize = ca->buckets[0] != NULL; + bool resize = ca->bucket_gens != NULL; int ret = -ENOMEM; - if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + - nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO)) || - !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || (c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * @@ -2076,8 +2061,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) GFP_KERNEL|__GFP_ZERO)))) goto err; - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = nbuckets; bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2089,15 +2072,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) percpu_down_write(&c->mark_lock); } - old_buckets = bucket_array(ca); old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - memcpy(buckets->b, - old_buckets->b, - n * sizeof(struct bucket)); memcpy(bucket_gens->b, old_bucket_gens->b, n); @@ -2107,31 +2086,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets[0], buckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); - buckets = old_buckets; bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); + nbuckets = ca->mi.nbuckets; + if (resize) { 
percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); up_write(&c->gc_lock); } - nbuckets = ca->mi.nbuckets; - - if (resize) - up_write(&ca->bucket_lock); - ret = 0; err: kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - if (buckets) - call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2144,9 +2117,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), sizeof(struct bucket_gens) + ca->mi.nbuckets); - kvpfree(rcu_dereference_protected(ca->buckets[0], 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 9cc6c16bcc64..7ae1feadf4c0 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -30,34 +30,23 @@ _old; \ }) -static inline struct bucket_array *__bucket_array(struct bch_dev *ca, - bool gc) +static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) { - return rcu_dereference_check(ca->buckets[gc], + return rcu_dereference_check(ca->buckets_gc, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket_array *bucket_array(struct bch_dev *ca) -{ - return __bucket_array(ca, false); -} - -static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - struct bucket_array *buckets = __bucket_array(ca, gc); + struct bucket_array *buckets = gc_bucket_array(ca); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } -static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, true); -} - static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, @@ -65,7 +54,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); - } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6ddbea4da7d1..f7bf5c1d732f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -27,7 +27,6 @@ struct bucket { const struct bucket_mark mark; }; - u64 io_time[2]; unsigned gen_valid:1; u8 stripe_redundancy; u32 stripe; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 14edc0bf5112..8291e58089fd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1139,7 +1139,7 @@ use_clean: err = "error reading allocation information"; down_read(&c->gc_lock); - ret = bch2_alloc_read(c, false, false); + ret = bch2_alloc_read(c); up_read(&c->gc_lock); if (ret) -- cgit From 66d90823857ed9196ef52361518ab703e468c53b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Feb 2022 00:07:38 -0500 Subject: bcachefs: Kill struct bucket_mark This switches struct bucket to using a lock, instead of cmpxchg. And now that the protected members no longer need to fit into a u64, we can expand the sector counts to 32 bits. 
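
The resulting update pattern, paraphrased from the buckets.c hunks below
(bucket_lock()/bucket_unlock() are the helpers this patch adds to buckets.h):

	static inline void bucket_lock(struct bucket *b)
	{
		while (xchg(&b->lock, 1))
			cpu_relax();
	}

	static inline void bucket_unlock(struct bucket *b)
	{
		smp_store_release(&b->lock, 0);
	}

	/* typical update site: snapshot, modify under the lock, snapshot again */
	bucket_lock(g);
	old = *g;
	g->gen            = new_a.gen;
	g->data_type      = new_a.data_type;
	g->dirty_sectors  = new_a.dirty_sectors;	/* u32 now, was u16 */
	g->cached_sectors = new_a.cached_sectors;
	new = *g;
	bucket_unlock(g);

	bch2_dev_usage_update_m(c, ca, old, new, journal_seq, gc);
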
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/btree_gc.c | 83 ++++++++++++-------------- fs/bcachefs/buckets.c | 135 +++++++++++++++++++----------------------- fs/bcachefs/buckets.h | 24 ++++---- fs/bcachefs/buckets_types.h | 30 +++------- 5 files changed, 117 insertions(+), 159 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ee683d08e8ae..5faa42baeeba 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -903,8 +903,8 @@ struct bch_alloc_v2 { #define BCH_ALLOC_FIELDS_V2() \ x(read_time, 64) \ x(write_time, 64) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ x(stripe, 32) \ x(stripe_redundancy, 8) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f66b2ef03c3a..747667ce131d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -571,37 +571,37 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; + g->gen = p.ptr.gen; } else { do_update = true; } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, + p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; - g->_mark.data_type = 0; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; } } - if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, (printbuf_reset(&buf), @@ -609,30 +609,30 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, do_update = true; if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + gen_cmp(p.ptr.gen, g->gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, + p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->mark.data_type && - g->mark.data_type != data_type, c, + if (fsck_err_on(g->data_type && + g->data_type != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], + bch2_data_types[g->data_type], bch2_data_types[data_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { - g->_mark.data_type = data_type; + g->data_type = data_type; 
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; @@ -692,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); - ptr->gen = g->mark.gen; + ptr->gen = g->gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ @@ -701,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0) || - gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->mark.data_type && - g->mark.data_type != data_type); + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); })); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); @@ -1325,10 +1325,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket *g; + struct bucket gc; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old, new, gc; + struct bch_alloc_v4 old, new; int ret; k = bch2_btree_iter_peek_slot(iter); @@ -1340,15 +1340,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new = old; percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, iter->pos.offset); - gc = (struct bch_alloc_v4) { - .gen = g->mark.gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; + gc = *gc_bucket(ca, iter->pos.offset); percpu_up_read(&c->mark_lock); if (metadata_only && @@ -1365,8 +1357,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ - new.gen, \ - bch2_data_types[new.data_type], \ + gc.gen, \ + bch2_data_types[gc.data_type], \ new._f, gc._f)) \ new._f = gc._f; \ @@ -1467,17 +1459,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) bch2_alloc_to_v4(k, &a); - g->_mark.gen = a.gen; - g->gen_valid = 1; + g->gen_valid = 1; + g->gen = a.gen; if (metadata_only && (a.data_type == BCH_DATA_user || a.data_type == BCH_DATA_cached || a.data_type == BCH_DATA_parity)) { - g->_mark.data_type = a.data_type; - g->_mark.dirty_sectors = a.dirty_sectors; - g->_mark.cached_sectors = a.cached_sectors; - g->_mark.stripe = a.stripe != 0; + g->data_type = a.data_type; + g->dirty_sectors = a.dirty_sectors; + g->cached_sectors = a.cached_sectors; g->stripe = a.stripe; g->stripe_redundancy = a.stripe_redundancy; } @@ -1503,12 +1494,12 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) for_each_bucket(g, buckets) { if (metadata_only && - (g->mark.data_type == BCH_DATA_user || - g->mark.data_type == BCH_DATA_cached || - g->mark.data_type == BCH_DATA_parity)) + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) continue; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; } }; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 572d56676c69..31de8035e86d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c 
@@ -349,7 +349,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, } static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new, + struct bucket old, struct bucket new, u64 journal_seq, bool gc) { struct bch_alloc_v4 old_a = { @@ -586,20 +586,19 @@ int bch2_mark_alloc(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); if (gc) { - struct bucket_mark old_m, m; struct bucket *g = gc_bucket(ca, new.k->p.offset); - old_m = bucket_cmpxchg(g, m, ({ - m.gen = new_a.gen; - m.data_type = new_a.data_type; - m.dirty_sectors = new_a.dirty_sectors; - m.cached_sectors = new_a.cached_sectors; - m.stripe = new_a.stripe != 0; - })); + bucket_lock(g); g->gen_valid = 1; + g->gen = new_a.gen; + g->data_type = new_a.data_type; g->stripe = new_a.stripe; g->stripe_redundancy = new_a.stripe_redundancy; + g->dirty_sectors = new_a.dirty_sectors; + g->cached_sectors = new_a.cached_sectors; + + bucket_unlock(g); } percpu_up_read(&c->mark_lock); @@ -625,23 +624,12 @@ int bch2_mark_alloc(struct btree_trans *trans, return 0; } -#define checked_add(a, b) \ -({ \ - unsigned _res = (unsigned) (a) + (b); \ - bool overflow = _res > U16_MAX; \ - if (overflow) \ - _res = U16_MAX; \ - (a) = _res; \ - overflow; \ -}) - void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type data_type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bucket *g; - struct bucket_mark old, new; + struct bucket old, new, *g; bool overflow; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -656,10 +644,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, percpu_down_read(&c->mark_lock); g = gc_bucket(ca, b); - old = bucket_cmpxchg(g, new, ({ - new.data_type = data_type; - overflow = checked_add(new.dirty_sectors, sectors); - })); + + bucket_lock(g); + old = *g; + + g->data_type = data_type; + g->dirty_sectors += sectors; + overflow = g->dirty_sectors < sectors; + + new = *g; + bucket_unlock(g); bch2_fs_inconsistent_on(old.data_type && old.data_type != data_type, c, @@ -693,7 +687,7 @@ static int check_bucket_ref(struct bch_fs *c, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, - u16 dirty_sectors, u16 cached_sectors) + u32 dirty_sectors, u32 cached_sectors) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); @@ -761,7 +755,7 @@ static int check_bucket_ref(struct bch_fs *c, goto err; } - if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { + if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", @@ -792,8 +786,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; - struct bucket_mark new, old; + struct bucket old, new, *g; struct printbuf buf = PRINTBUF; int ret = 0; @@ -805,33 +798,37 @@ static int mark_stripe_bucket(struct btree_trans *trans, buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); - if (g->mark.dirty_sectors || + if (g->dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EINVAL; goto err; } - old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, sectors, data_type, - new.gen, new.data_type, - new.dirty_sectors, new.cached_sectors); - if (ret) - goto err; + bucket_lock(g); + old = *g; - new.dirty_sectors += sectors; - if (data_type) - new.data_type = data_type; + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } - new.stripe = true; - })); + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; + new = *g; + bucket_unlock(g); + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -844,9 +841,9 @@ static int __mark_pointer(struct btree_trans *trans, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) + u32 *dirty_sectors, u32 *cached_sectors) { - u16 *dst_sectors = !ptr->cached + u32 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -870,11 +867,9 @@ static int bch2_mark_pointer(struct btree_trans *trans, { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g; + struct bucket old, new, *g; u8 bucket_data_type; - u64 v; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -882,28 +877,25 @@ static int bch2_mark_pointer(struct btree_trans *trans, percpu_down_read(&c->mark_lock); g = PTR_GC_BUCKET(ca, &p.ptr); - v = atomic64_read(&g->_mark.v); - do { - new.v.counter = old.v.counter = v; - bucket_data_type = new.data_type; - - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, new.gen, - &bucket_data_type, - &new.dirty_sectors, - &new.cached_sectors); - if (ret) - goto err; + bucket_lock(g); + old = *g; - new.data_type = bucket_data_type; + bucket_data_type = g->data_type; - if (flags & BTREE_TRIGGER_NOATOMIC) { - g->_mark = new; - break; - } - } while ((v = atomic64_cmpxchg(&g->_mark.v, - old.v.counter, - new.v.counter)) != old.v.counter); + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } + + g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: @@ -1404,25 +1396,18 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, { struct btree_iter iter; struct bkey_i_alloc_v4 *a; - u16 dirty_sectors, cached_sectors; int ret; a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); if (IS_ERR(a)) return PTR_ERR(a); - dirty_sectors = a->v.dirty_sectors; - cached_sectors = a->v.cached_sectors; - ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, a->v.gen, &a->v.data_type, - &dirty_sectors, &cached_sectors); + &a->v.dirty_sectors, &a->v.cached_sectors); if (ret) goto out; - a->v.dirty_sectors = dirty_sectors; - a->v.cached_sectors = cached_sectors; - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 7ae1feadf4c0..31a56f1f4fca 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -15,20 +15,16 @@ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -#define bucket_cmpxchg(g, new, expr) \ -({ \ - struct bucket *_g = g; \ - u64 _v = atomic64_read(&(g)->_mark.v); \ - struct bucket_mark _old; \ - \ - do { \ - (new).v.counter = _old.v.counter = _v; \ - expr; \ - } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ - _old.v.counter, \ - (new).v.counter)) != _old.v.counter);\ - _old; \ -}) +static inline void bucket_unlock(struct bucket *b) +{ + smp_store_release(&b->lock, 0); +} + +static inline void bucket_lock(struct bucket *b) +{ + while (xchg(&b->lock, 1)) + cpu_relax(); +} static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index f7bf5c1d732f..e79a33795bf9 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,29 +7,15 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 -struct bucket_mark { - union { - atomic64_t v; - - struct { - u8 gen; - u8 data_type:3, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - }; - }; -}; - struct bucket { - union { - struct bucket_mark _mark; - const struct 
bucket_mark mark; - }; - - unsigned gen_valid:1; - u8 stripe_redundancy; - u32 stripe; + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; }; struct bucket_array { -- cgit From c32fc674d4ca13f3b889693a2b59365d93a77144 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 Apr 2022 16:30:37 -0400 Subject: bcachefs: Fix pr_buf() calls In a few places we were passing a variable to pr_buf() for the format string - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 2 +- fs/bcachefs/util.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 77fbb7d2194e..385451ef865e 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -324,7 +324,7 @@ void bch2_opt_to_text(struct printbuf *out, if (flags & OPT_SHOW_FULL_LIST) bch2_string_opt_to_text(out, opt->choices, v); else - pr_buf(out, opt->choices[v]); + pr_buf(out, "%s", opt->choices[v]); break; case BCH_OPT_FN: opt->to_text(out, c, sb, v); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 085f1c357383..1629d279f494 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -388,7 +388,7 @@ static inline void pr_uuid(struct printbuf *out, u8 *uuid) char uuid_str[40]; uuid_unparse_lower(uuid, uuid_str); - pr_buf(out, uuid_str); + pr_buf(out, "%s", uuid_str); } int bch2_strtoint_h(const char *, int *); -- cgit From 75f02de43f4f7e76b9453096c5f4255d4476eda8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Mar 2022 22:05:33 -0400 Subject: bcachefs: Use crc_is_compressed() Trivial cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 31de8035e86d..dcb2ea3de4b8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -675,10 +675,9 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) { EBUG_ON(sectors < 0); - return p.crc.compression_type && - p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible + return crc_is_compressed(p.crc) ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) + p.crc.uncompressed_size) : sectors; } -- cgit From 64afbbc9096f819ee5d2c0d98bef203612630e1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 Apr 2022 16:57:29 -0400 Subject: bcachefs: Improve read_from_stale_dirty_pointer() message With printbufs, it's now easy to build up multi-line log messages and emit them with one call, which is good because it prevents multiple multi-line log messages from getting Interspersed in the log buffer; this patch also improves the formatting and converts it to latest style. 
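
The pattern this converts to, roughly (all of the helpers are existing printbuf
code; the bkey and iterator variables stand in for whatever is being reported):

	struct printbuf buf = PRINTBUF;

	pr_buf(&buf, "Attempting to read from stale dirty pointer:");
	pr_indent_push(&buf, 2);
	pr_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k);
	pr_newline(&buf);
	pr_buf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));

	/* one call emits the whole report, so lines can't be interleaved */
	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
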
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6bebbd44ccc8..0f80255e59bd 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2089,22 +2089,28 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), + PTR_BUCKET_POS(c, &ptr), BTREE_ITER_CACHED); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - goto out; + pr_buf(&buf, "Attempting to read from stale dirty pointer:"); + pr_indent_push(&buf, 2); + pr_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "%s", buf.buf); - bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + pr_newline(&buf); + + pr_buf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + pr_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_trans_iter_exit(trans, &iter); -out: printbuf_exit(&buf); } -- cgit From 80c80164a5f131307b9b870f7a366ed45b901b88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 Apr 2022 17:24:25 -0400 Subject: bcachefs: Don't write partially-initialized superblocks This neatly avoids bugs where we fail partway through initializing a new filesystem, if we just don't write out partly-initialized state. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c7962266f495..4fb2bede39f6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -833,6 +833,13 @@ int bch2_write_super(struct bch_fs *c) if (c->opts.nochanges) goto out; + /* + * Defer writing the superblock until filesystem initialization is + * complete - don't write out a partly initialized superblock: + */ + if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) + goto out; + for_each_online_member(ca, c, i) { __set_bit(ca->dev_idx, sb_written.d); ca->sb_write_error = 0; -- cgit From c6b6d416126da015e4b6b6a66b4c6fd3eda40f1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 Apr 2022 18:00:04 -0400 Subject: bcachefs: gc mark fn fixes, cleanups mark_stripe_bucket() was busted; it was using @new unitialized. Also, clean up all the gc mark functions, and convert them to the same style. 
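
The shape all of the gc mark functions converge on after this patch looks
roughly like the following (a sketch of the common structure, with surrounding
declarations omitted). The mark_stripe_bucket() bug was that @new was passed to
the usage update without ever having been assigned from *g:

	percpu_down_read(&c->mark_lock);
	g = gc_bucket(ca, b);

	bucket_lock(g);
	old = *g;			/* snapshot the old state */

	ret = check_bucket_ref(c, k, ptr, sectors, data_type,
			       g->gen, g->data_type,
			       g->dirty_sectors, g->cached_sectors);
	if (!ret) {
		/* apply the update while the bucket is locked */
		g->data_type      = data_type;
		g->dirty_sectors += sectors;
	}

	new = *g;			/* snapshot the new state - the missing step */
	bucket_unlock(g);

	if (!ret)
		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
	percpu_up_read(&c->mark_lock);
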
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 85 +++++++++++++++++++++++++-------------------------- fs/bcachefs/buckets.h | 6 ++-- 2 files changed, 44 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index dcb2ea3de4b8..5b78e8f983a1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -624,13 +624,13 @@ int bch2_mark_alloc(struct btree_trans *trans, return 0; } -void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) +int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) { struct bucket old, new, *g; - bool overflow; + int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); BUG_ON(data_type != BCH_DATA_sb && @@ -640,7 +640,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, * Backup superblock might be past the end of our normal usable space: */ if (b >= ca->mi.nbuckets) - return; + return 0; percpu_down_read(&c->mark_lock); g = gc_bucket(ca, b); @@ -648,27 +648,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bucket_lock(g); old = *g; + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[g->data_type], + bch2_data_types[data_type])) { + ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_types[g->data_type ?: data_type], + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + + g->data_type = data_type; g->dirty_sectors += sectors; - overflow = g->dirty_sectors < sectors; - new = *g; +err: bucket_unlock(g); - - bch2_fs_inconsistent_on(old.data_type && - old.data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[data_type]); - - bch2_fs_inconsistent_on(overflow, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", - ca->dev_idx, b, new.gen, - bch2_data_types[old.data_type ?: data_type], - old.dirty_sectors, sectors); - - bch2_dev_usage_update_m(c, ca, old, new, 0, true); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, 0, true); percpu_up_read(&c->mark_lock); + return ret; } static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) @@ -811,25 +818,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, old = *g; ret = check_bucket_ref(c, k, ptr, sectors, data_type, - new.gen, new.data_type, - new.dirty_sectors, new.cached_sectors); - if (ret) { - bucket_unlock(g); + g->gen, g->data_type, + g->dirty_sectors, g->cached_sectors); + if (ret) goto err; - } - new.dirty_sectors += sectors; if (data_type) - new.data_type = data_type; + g->data_type = data_type; + g->dirty_sectors += sectors; g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - new = *g; - bucket_unlock(g); - - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -875,29 +879,22 @@ static int bch2_mark_pointer(struct btree_trans *trans, percpu_down_read(&c->mark_lock); g = 
PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); old = *g; bucket_data_type = g->data_type; - ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, g->gen, &bucket_data_type, &g->dirty_sectors, &g->cached_sectors); - if (ret) { - bucket_unlock(g); - goto err; - } - - g->data_type = bucket_data_type; + if (!ret) + g->data_type = bucket_data_type; new = *g; bucket_unlock(g); - - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); -err: + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); percpu_up_read(&c->mark_lock); return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 31a56f1f4fca..4675a1f5d189 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -194,9 +194,9 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -- cgit From 4254f5bf6e3d62ab7108a556d5afc54188e17041 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Apr 2022 15:13:20 -0400 Subject: bcachefs: Add a tracepoint for superblock writes Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 +++ fs/bcachefs/trace.h | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4fb2bede39f6..7e885b51349e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -16,6 +16,7 @@ #include "quota.h" #include "super-io.h" #include "super.h" +#include "trace.h" #include "vstructs.h" #include @@ -799,6 +800,8 @@ int bch2_write_super(struct bch_fs *c) unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; int ret = 0; + trace_write_super(c, _RET_IP_); + if (c->opts.very_degraded) degraded_flags |= BCH_FORCE_IF_LOST; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index ef2096fd147d..eeacb5291764 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -82,6 +82,26 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* super-io.c: */ +TRACE_EVENT(write_super, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->ip = ip; + ), + + TP_printk("%d,%d for %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + (void *) __entry->ip) +); + /* io.c: */ DEFINE_EVENT(bio, read_split, -- cgit From 7c7e071d90ac278e462640570d739dd165d3acd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Apr 2022 20:36:32 -0400 Subject: bcachefs: Don't normalize to pages in btree cache shrinker This behavior dates from the early, early days of bcache, and upon further delving appears to not make any sense. The shrinker only works in terms of 'objects' of unknown size; normalizing to pages only had the effect of changing the batch size, which we could do directly - if we wanted; we probably don't. Normalizing to pages meant our batch size was very small, which seems to have been keeping us from doing as much shrinking as we should be under heavy memory pressure; this patch appears to alleviate some OOMs we've been seeing. 
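
Concretely, the shrinker callbacks now report and consume plain btree-node
counts, and the explicit batch-size override is dropped; a condensed view of
the btree_cache.c hunks below:

	/* ->count_objects: report nodes directly */
	return btree_cache_can_free(bc);	/* was: * btree_pages(c) */

	/* ->scan_objects: nr_to_scan is consumed as a node count */
	nr = min_t(unsigned long, sc->nr_to_scan, btree_cache_can_free(bc));
	/* ... walk the freeable and live lists, freeing up to nr nodes ... */
	return freed;				/* was: freed * btree_pages(c) */
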
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 13 ++++--------- fs/bcachefs/trace.h | 28 ++++++++++------------------ 2 files changed, 14 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 92a8cc704cab..0e3db9ee65d2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -280,7 +280,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct btree_cache *bc = &c->btree_cache; struct btree *b, *t; unsigned long nr = sc->nr_to_scan; - unsigned long can_free; + unsigned long can_free = 0; unsigned long touched = 0; unsigned long freed = 0; unsigned i, flags; @@ -304,7 +304,6 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - nr /= btree_pages(c); can_free = btree_cache_can_free(bc); nr = min_t(unsigned long, nr, can_free); @@ -374,13 +373,10 @@ touched: mutex_unlock(&bc->lock); out: - ret = (unsigned long) freed * btree_pages(c); + ret = freed; memalloc_nofs_restore(flags); out_norestore: - trace_btree_cache_scan(sc->nr_to_scan, - sc->nr_to_scan / btree_pages(c), - btree_cache_can_free(bc), - ret); + trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); return ret; } @@ -394,7 +390,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, if (bch2_btree_shrinker_disabled) return 0; - return btree_cache_can_free(bc) * btree_pages(c); + return btree_cache_can_free(bc); } void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -481,7 +477,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.count_objects = bch2_btree_cache_count; bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; - bc->shrink.batch = btree_pages(c) * 2; ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); out: pr_verbose_init(c->opts, "ret %i", ret); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index eeacb5291764..de6a17c92f5f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -376,31 +376,23 @@ DEFINE_EVENT(btree_node, btree_set_root, ); TRACE_EVENT(btree_cache_scan, - TP_PROTO(unsigned long nr_to_scan_pages, - unsigned long nr_to_scan_nodes, - unsigned long can_free_nodes, - long ret), - TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), + TP_PROTO(long nr_to_scan, long can_free, long ret), + TP_ARGS(nr_to_scan, can_free, ret), TP_STRUCT__entry( - __field(unsigned long, nr_to_scan_pages ) - __field(unsigned long, nr_to_scan_nodes ) - __field(unsigned long, can_free_nodes ) - __field(long, ret ) + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) ), TP_fast_assign( - __entry->nr_to_scan_pages = nr_to_scan_pages; - __entry->nr_to_scan_nodes = nr_to_scan_nodes; - __entry->can_free_nodes = can_free_nodes; - __entry->ret = ret; + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; ), - TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", - __entry->nr_to_scan_pages, - __entry->nr_to_scan_nodes, - __entry->can_free_nodes, - __entry->ret) + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) ); TRACE_EVENT(btree_node_relock_fail, -- cgit From d1d7737fd9df0cc57cd276b0189faf8c92c1426f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Apr 2022 01:09:26 -0400 Subject: bcachefs: Gap buffer for journal keys Btree updates before we go RW work 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 +++ fs/bcachefs/recovery.c | 144 ++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/recovery.h | 3 -- fs/bcachefs/util.h | 25 +++++++++ 4 files changed, 133 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c06837612bdf..f2bb23162b4a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -548,6 +548,12 @@ struct journal_keys { u32 journal_seq; u32 journal_offset; } *d; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; size_t nr; size_t size; u64 journal_seq_base; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8291e58089fd..f9215cc7cb09 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -72,58 +72,97 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -size_t bch2_journal_key_search(struct journal_keys *journal_keys, +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +size_t bch2_journal_key_search(struct journal_keys *keys, enum btree_id id, unsigned level, struct bpos pos) { - size_t l = 0, r = journal_keys->nr, m; + size_t l = 0, r = keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) l = m + 1; else r = m; } - BUG_ON(l < journal_keys->nr && - __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); BUG_ON(l && - __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - return l; + return idx_to_pos(keys, l); } struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos) { struct journal_keys *keys = &c->journal_keys; - struct journal_key *end = keys->d + keys->nr; - struct journal_key *k = keys->d + - bch2_journal_key_search(keys, btree_id, level, pos); + size_t idx = bch2_journal_key_search(keys, btree_id, level, pos); - while (k < end && k->overwritten) - k++; + while (idx < keys->size && + keys->d[idx].overwritten) { + idx++; + if (idx == keys->gap) + idx += keys->size - keys->nr; + } - if (k < end && - k->btree_id == btree_id && - k->level == level) - return k->k; + if (idx < keys->size && + keys->d[idx].btree_id == btree_id && + keys->d[idx].level == level) + return keys->d[idx].k; return NULL; } -static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +static void journal_iters_fix(struct bch_fs *c) { - struct bkey_i *n =
iter->keys->d[idx].k; - struct btree_and_journal_iter *biter = - container_of(iter, struct btree_and_journal_iter, journal); - - if (iter->idx > idx || - (iter->idx == idx && - biter->last && - bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) - iter->idx++; + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + struct journal_key *n = &keys->d[keys->gap - 1]; + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, + * and the key we just inserted compares >= the iterator's position, + * decrement the iterator so it points at the key we just inserted: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end && + iter->last && + iter->b->c.btree_id == n->btree_id && + iter->b->c.level == n->level && + bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } } int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, @@ -141,12 +180,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); BUG_ON(test_bit(BCH_FS_RW, &c->flags)); - if (idx < keys->nr && + if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { if (keys->d[idx].allocated) kfree(keys->d[idx].k); @@ -154,6 +192,9 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, return 0; } + if (idx > keys->gap) + idx -= keys->size - keys->nr; + if (keys->nr == keys->size) { struct journal_keys new_keys = { .nr = keys->nr, @@ -168,15 +209,24 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, return -ENOMEM; } + /* Since @keys was full, there was no gap: */ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); kvfree(keys->d); *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; } - array_insert_item(keys->d, keys->nr, idx, n); + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; - list_for_each_entry(iter, &c->journal_iters, list) - journal_iter_fix(c, iter, idx); + journal_iters_fix(c); return 0; } @@ -220,36 +270,39 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, btree, level, pos); - if (idx < keys->nr && + if (idx < keys->size && keys->d[idx].btree_id == btree && keys->d[idx].level == level && !bpos_cmp(keys->d[idx].k->k.p, pos)) keys->d[idx].overwritten = true; } -static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = 
iter->keys->d + iter->idx; - while (k < iter->keys->d + iter->keys->nr && + while (k < iter->keys->d + iter->keys->size && k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) return k->k; - iter->idx++; + bch2_journal_iter_advance(iter); k = iter->keys->d + iter->idx; } return NULL; } -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->nr) - iter->idx++; -} - static void bch2_journal_iter_exit(struct journal_iter *iter) { list_del(&iter->list); @@ -409,13 +462,16 @@ void bch2_journal_keys_free(struct journal_keys *keys) { struct journal_key *i; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + for (i = keys->d; i < keys->d + keys->nr; i++) if (i->allocated) kfree(i->k); kvfree(keys->d); keys->d = NULL; - keys->nr = 0; + keys->nr = keys->gap = keys->size = 0; } static struct journal_keys journal_keys_sort(struct list_head *journal_entries) @@ -478,6 +534,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) } keys.nr = dst - keys.d; + keys.gap = keys.nr; err: return keys; } @@ -538,6 +595,9 @@ static int bch2_journal_replay(struct bch_fs *c) size_t i; int ret; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); if (!keys_sorted) return -ENOMEM; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e6927a918df3..30580a8984a1 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -#define for_each_journal_key(keys, i) \ - for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) - struct journal_iter { struct list_head list; enum btree_id btree_id; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 1629d279f494..74bfa5faf470 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -826,6 +826,31 @@ do { \ #define array_remove_item(_array, _nr, _pos) \ array_remove_items(_array, _nr, _pos, 1) +static inline void __move_gap(void *array, size_t element_size, + size_t nr, size_t size, + size_t old_gap, size_t new_gap) +{ + size_t gap_end = old_gap + size - nr; + + if (new_gap < old_gap) { + size_t move = old_gap - new_gap; + + memmove(array + element_size * (gap_end - move), + array + element_size * (old_gap - move), + element_size * move); + } else if (new_gap > old_gap) { + size_t move = new_gap - old_gap; + + memmove(array + element_size * old_gap, + array + element_size * gap_end, + element_size * move); + } +} + +/* Move the gap in a gap buffer: */ +#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ + __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) + #define bubble_sort(_base, _nr, _cmp) \ do { \ ssize_t _i, _end; \ -- cgit From f0ac7df23d04f3c6d4cd82899aad7f06f6a0b1d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Apr 2022 17:50:01 -0400 Subject: bcachefs: Convert .key_invalid methods to printbufs Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 75 +++++++++------ fs/bcachefs/alloc_background.h | 8 +- fs/bcachefs/bkey_methods.c | 147 +++++++++++++++++----------- fs/bcachefs/bkey_methods.h | 16 +-- fs/bcachefs/btree_io.c | 74 ++++++++------ fs/bcachefs/btree_update_interior.c | 20 ++-- fs/bcachefs/btree_update_leaf.c | 20 ++-- fs/bcachefs/buckets.h | 4 +- fs/bcachefs/dirent.c | 56 +++++++---- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/ec.c | 32 +++--- fs/bcachefs/ec.h | 3 +- 
fs/bcachefs/extents.c | 187 +++++++++++++++++++++--------------- fs/bcachefs/extents.h | 17 ++-- fs/bcachefs/inode.c | 130 ++++++++++++++----------- fs/bcachefs/inode.h | 10 +- fs/bcachefs/journal_io.c | 29 +++--- fs/bcachefs/lru.c | 12 ++- fs/bcachefs/lru.h | 2 +- fs/bcachefs/quota.c | 19 ++-- fs/bcachefs/quota.h | 2 +- fs/bcachefs/reflink.c | 45 ++++++--- fs/bcachefs/reflink.h | 8 +- fs/bcachefs/subvolume.c | 66 ++++++++----- fs/bcachefs/subvolume.h | 4 +- fs/bcachefs/xattr.c | 43 ++++++--- fs/bcachefs/xattr.h | 2 +- 27 files changed, 629 insertions(+), 404 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b0f49044ea24..42ef752932eb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -302,71 +302,86 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (!bch2_dev_exists2(c, k.k->p.inode)) { + pr_buf(err, "invalid device (%llu)", k.k->p.inode); + return -EINVAL; + } /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) - return "incorrect value size"; + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { + pr_buf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_alloc_unpacked u; - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (!bch2_dev_exists2(c, k.k->p.inode)) { + pr_buf(err, "invalid device (%llu)", k.k->p.inode); + return -EINVAL; + } - if (bch2_alloc_unpack_v2(&u, k)) - return "unpack error"; + if (bch2_alloc_unpack_v2(&u, k)) { + pr_buf(err, "unpack error"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_alloc_unpacked u; struct bch_dev *ca; - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (!bch2_dev_exists2(c, k.k->p.inode)) { + pr_buf(err, "invalid device (%llu)", k.k->p.inode); + return -EINVAL; + } ca = bch_dev_bkey_exists(c, k.k->p.inode); if (k.k->p.offset < ca->mi.first_bucket || - k.k->p.offset >= ca->mi.nbuckets) - return "invalid bucket"; + k.k->p.offset >= ca->mi.nbuckets) { + pr_buf(err, "invalid bucket"); + return -EINVAL; + } - if (bch2_alloc_unpack_v3(&u, k)) - return "unpack error"; + if (bch2_alloc_unpack_v3(&u, k)) { + pr_buf(err, "unpack error"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bch_dev *ca; - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; + if (!bch2_dev_exists2(c, k.k->p.inode)) { + pr_buf(err, "invalid device (%llu)", k.k->p.inode); + return -EINVAL; + } ca = 
bch_dev_bkey_exists(c, k.k->p.inode); if (k.k->p.offset < ca->mi.first_bucket || - k.k->p.offset >= ca->mi.nbuckets) - return "invalid bucket"; + k.k->p.offset >= ca->mi.nbuckets) { + pr_buf(err, "invalid bucket"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_alloc_v4_swab(struct bkey_s k) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 3b49abf1bbc0..93bd8feb9ebc 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -66,10 +66,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 9a1819147749..0351cbe7d48e 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -22,10 +22,10 @@ const char * const bch2_bkey_types[] = { NULL }; -static const char *deleted_key_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - return NULL; + return 0; } #define bch2_bkey_ops_deleted (struct bkey_ops) { \ @@ -36,25 +36,32 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_val_bytes(k.k)) - return "value size should be zero"; + if (bkey_val_bytes(k.k)) { + pr_buf(err, "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); + return -EINVAL; + } - return NULL; + return 0; } #define bch2_bkey_ops_error (struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ } -static const char *key_type_cookie_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_cookie)); + return -EINVAL; + } - return NULL; + return 0; } #define bch2_bkey_ops_cookie (struct bkey_ops) { \ @@ -65,10 +72,10 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = empty_val_key_invalid, \ } -static const char *key_type_inline_data_invalid(const struct bch_fs *c, - struct bkey_s_c k) +static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - return NULL; + return 0; } static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs 
*c, @@ -86,11 +93,16 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } -static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) +static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_val_bytes(k.k)) - return "nonempty value"; - return NULL; + if (bkey_val_bytes(k.k)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_cookie)); + return -EINVAL; + } + + return 0; } static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -110,12 +122,14 @@ const struct bkey_ops bch2_bkey_ops[] = { #undef x }; -const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) +int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { - if (k.k->type >= KEY_TYPE_MAX) - return "invalid type"; + if (k.k->type >= KEY_TYPE_MAX) { + pr_buf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); + return -EINVAL; + } - return bch2_bkey_ops[k.k->type].key_invalid(c, k); + return bch2_bkey_ops[k.k->type].key_invalid(c, k, err); } static unsigned bch2_key_types_allowed[] = { @@ -182,63 +196,84 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_btree_ptr_v2), }; -const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type) +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + struct printbuf *err) { - if (k.k->u64s < BKEY_U64s) - return "u64s too small"; - - if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) - return "invalid key type for this btree"; + if (k.k->u64s < BKEY_U64s) { + pr_buf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + return -EINVAL; + } - if (type == BKEY_TYPE_btree && - bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { + pr_buf(err, "invalid key type for this btree (%s)", + bch2_bkey_types[type]); + return -EINVAL; + } if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - if (k.k->size == 0) - return "bad size field"; + if (k.k->size == 0) { + pr_buf(err, "size == 0"); + return -EINVAL; + } - if (k.k->size > k.k->p.offset) - return "size greater than offset"; + if (k.k->size > k.k->p.offset) { + pr_buf(err, "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); + return -EINVAL; + } } else { - if (k.k->size) - return "nonzero size field"; + if (k.k->size) { + pr_buf(err, "size != 0"); + return -EINVAL; + } } if (type != BKEY_TYPE_btree && !btree_type_has_snapshots(type) && - k.k->p.snapshot) - return "nonzero snapshot"; + k.k->p.snapshot) { + pr_buf(err, "nonzero snapshot"); + return -EINVAL; + } if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - !k.k->p.snapshot) - return "invalid snapshot field"; + !k.k->p.snapshot) { + pr_buf(err, "snapshot == 0"); + return -EINVAL; + } if (type != BKEY_TYPE_btree && - !bkey_cmp(k.k->p, POS_MAX)) - return "POS_MAX key"; + !bkey_cmp(k.k->p, POS_MAX)) { + pr_buf(err, "key at POS_MAX"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type) +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type) ?: - bch2_bkey_val_invalid(c, k); + return __bch2_bkey_invalid(c, k, type, 
err) ?: + bch2_bkey_val_invalid(c, k, err); } -const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) +int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, + struct printbuf *err) { - if (bpos_cmp(k.k->p, b->data->min_key) < 0) - return "key before start of btree node"; + if (bpos_cmp(k.k->p, b->data->min_key) < 0) { + pr_buf(err, "key before start of btree node"); + return -EINVAL; + } - if (bpos_cmp(k.k->p, b->data->max_key) > 0) - return "key past end of btree node"; + if (bpos_cmp(k.k->p, b->data->max_key) > 0) { + pr_buf(err, "key past end of btree node"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 2b1086971bbb..4b90d0873be6 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,8 +14,8 @@ extern const char * const bch2_bkey_types[]; struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ - const char * (*key_invalid)(const struct bch_fs *, - struct bkey_s_c); + int (*key_invalid)(const struct bch_fs *, struct bkey_s_c, + struct printbuf *); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -32,12 +32,12 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; -const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); -const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type); -const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type); -const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, struct printbuf *); +int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b1099958ed5e..d2b3ff6b9b15 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -762,14 +762,23 @@ fsck_err: return ret; } +static int bset_key_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int write, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, btree_node_type(b), err) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: + (write ? 
bch2_bkey_val_invalid(c, k, err) : 0); +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, unsigned *whiteout_u64s, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; @@ -778,7 +787,6 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k != vstruct_last(i);) { struct bkey_s u; struct bkey tmp; - const char *invalid; if (btree_err_on(bkey_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -804,14 +812,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: - (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: - (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); - if (invalid) { - printbuf_reset(&buf1); - bch2_bkey_val_to_text(&buf1, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey: %s\n%s", invalid, buf1.buf); + printbuf_reset(&buf); + if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "invalid bkey:\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + pr_buf(&buf, " \n"); + bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -827,16 +836,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (prev && bkey_iter_cmp(b, prev, k) > 0) { struct bkey up = bkey_unpack_key(b, prev); - printbuf_reset(&buf1); - bch2_bkey_to_text(&buf1, &up); - printbuf_reset(&buf2); - bch2_bkey_to_text(&buf2, u.k); + printbuf_reset(&buf); + pr_buf(&buf, "keys out of order: "); + bch2_bkey_to_text(&buf, &up); + pr_buf(&buf, " > "); + bch2_bkey_to_text(&buf, u.k); bch2_dump_bset(c, b, i, 0); - if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "keys out of order: %s > %s", - buf1.buf, buf2.buf)) { + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -848,8 +856,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k = bkey_next(k); } fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } @@ -868,6 +875,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned u64s; unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); + struct printbuf buf = PRINTBUF; int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; @@ -1060,17 +1068,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, u.s_c); - if (invalid || + printbuf_reset(&buf); + + if (bch2_bkey_val_invalid(c, u.s_c, &buf) || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - struct printbuf buf = PRINTBUF; + printbuf_reset(&buf); + pr_buf(&buf, "invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, 
b, i, - "invalid bkey %s: %s", buf.buf, invalid); - printbuf_exit(&buf); + pr_buf(&buf, "\n "); + bch2_bkey_val_invalid(c, u.s_c, &buf); + + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1107,6 +1118,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_node_need_rewrite(b); out: mempool_free(iter, &c->fill_iter); + printbuf_exit(&buf); return retry_read; fsck_err: if (ret == BTREE_RETRY_READ) { @@ -1715,10 +1727,16 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { unsigned whiteout_u64s = 0; + struct printbuf buf = PRINTBUF; int ret; - if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) - return -1; + ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree, &buf); + + if (ret) + bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); + printbuf_exit(&buf); + if (ret) + return ret; ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2e958f88777b..0e3b3565be59 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1176,7 +1176,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, { struct bch_fs *c = as->c; struct bkey_packed *k; - const char *invalid; + struct printbuf buf = PRINTBUF; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); @@ -1184,14 +1184,16 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); - if (invalid) { - struct printbuf buf = PRINTBUF; - + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "inserting invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); - printbuf_exit(&buf); + pr_buf(&buf, "\n "); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf); + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); dump_stack(); } @@ -1211,6 +1213,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); + + printbuf_exit(&buf); } static void diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 90e6e5130672..fce93ed65ed9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -862,23 +862,31 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + struct printbuf buf = PRINTBUF; int ret, u64s_delta = 0; trans_for_each_update(trans, i) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - struct printbuf buf = PRINTBUF; + if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "invalid bkey 
on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + pr_newline(&buf); + pr_indent_push(&buf, 2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", - buf.buf, trans->fn, (void *) i->ip_allocated, invalid); + pr_newline(&buf); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf); + + bch2_fs_fatal_error(c, "%s", buf.buf); printbuf_exit(&buf); return -EINVAL; } btree_insert_entry_checks(trans, i); } + printbuf_exit(&buf); + trans_for_each_update(trans, i) { if (i->cached) continue; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4675a1f5d189..053b6dc215b3 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -9,6 +9,7 @@ #define _BUCKETS_H #include "buckets_types.h" +#include "extents.h" #include "super.h" #define for_each_bucket(_b, _buckets) \ @@ -83,8 +84,7 @@ static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, static inline enum bch_data_type ptr_data_type(const struct bkey *k, const struct bch_extent_ptr *ptr) { - if (k->type == KEY_TYPE_btree_ptr || - k->type == KEY_TYPE_btree_ptr_v2) + if (bkey_is_btree_ptr(k)) return BCH_DATA_btree; return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 760e4f74715f..e8a284a69be4 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -83,38 +83,58 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*d.v)); + return -EINVAL; + } len = bch2_dirent_name_bytes(d); - if (!len) - return "empty name"; + if (!len) { + pr_buf(err, "empty name"); + return -EINVAL; + } - if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) - return "value too big"; + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + pr_buf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k),dirent_val_u64s(len)); + return -EINVAL; + } - if (len > BCH_NAME_MAX) - return "dirent name too big"; + if (len > BCH_NAME_MAX) { + pr_buf(err, "dirent name too big (%u > %lu)", + len, BCH_NAME_MAX); + return -EINVAL; + } - if (len == 1 && !memcmp(d.v->d_name, ".", 1)) - return "invalid name"; + if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { + pr_buf(err, "invalid name"); + return -EINVAL; + } - if (len == 2 && !memcmp(d.v->d_name, "..", 2)) - return "invalid name"; + if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { + pr_buf(err, "invalid name"); + return -EINVAL; + } - if (memchr(d.v->d_name, '/', len)) - return "invalid name"; + if (memchr(d.v->d_name, '/', len)) { + pr_buf(err, "invalid name"); + return -EINVAL; + } if (d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode) - return "dirent points to own directory"; + le64_to_cpu(d.v->d_inum) == d.k->p.inode) { + pr_buf(err, "dirent points to own directory"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1bb4d802bc1d..046f297a4eff 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -6,7 +6,7 @@ 
extern const struct bch_hash_desc bch2_dirent_hash_desc; -const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent (struct bkey_ops) { \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5030a5b831af..cf9ecb7711c6 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -102,24 +102,34 @@ struct ec_bio { /* Stripes btree keys: */ -const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - if (!bkey_cmp(k.k->p, POS_MIN)) - return "stripe at pos 0"; + if (!bkey_cmp(k.k->p, POS_MIN)) { + pr_buf(err, "stripe at POS_MIN"); + return -EINVAL; + } - if (k.k->p.inode) - return "invalid stripe key"; + if (k.k->p.inode) { + pr_buf(err, "nonzero inode field"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) < sizeof(*s)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) < sizeof(*s)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*s)); + return -EINVAL; + } - if (bkey_val_bytes(k.k) < sizeof(*s) || - bkey_val_u64s(k.k) < stripe_val_u64s(s)) - return "incorrect value size"; + if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { + pr_buf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 9d508a2f3bbc..8e866460f8a0 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,7 +6,8 @@ #include "buckets_types.h" #include "keylist_types.h" -const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, + struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 01d14645579b..e09636023882 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -155,12 +155,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) - return "value too big"; + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + pr_buf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -169,21 +173,31 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) - return "value too small"; + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { + pr_buf(err, "value too small (%zu <= %zu)", + bkey_val_bytes(k.k), sizeof(*bp.v)); 
+ return -EINVAL; + } - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + pr_buf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -EINVAL; + } if (c->sb.version < bcachefs_metadata_version_snapshot && - bp.v->min_key.snapshot) - return "invalid min_key.snapshot"; + bp.v->min_key.snapshot) { + pr_buf(err, "invalid min_key.snapshot (%u != 0)", + bp.v->min_key.snapshot); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -219,17 +233,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); @@ -362,17 +365,24 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(*r.v)); + return -EINVAL; + } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + pr_buf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -EINVAL; + } - return NULL; + return 0; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -1000,69 +1010,86 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; struct bch_dev *ca; - if (!bch2_dev_exists2(c, ptr->dev)) - return "pointer to invalid device"; + if (!bch2_dev_exists2(c, ptr->dev)) { + pr_buf(err, "pointer to invalid device (%u)", ptr->dev); + return -EINVAL; + } ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + pr_buf(err, "multiple pointers to same device (%u)", ptr->dev); + return -EINVAL; + } - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first 
bucket"; + if (bucket >= ca->mi.nbuckets) { + pr_buf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -EINVAL; + } - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + pr_buf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -EINVAL; + } - return NULL; + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + pr_buf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -EINVAL; + } + + return 0; } -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_devs_list devs; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; - const char *reason; unsigned nonce = UINT_MAX; - unsigned i; + int ret; - if (k.k->type == KEY_TYPE_btree_ptr || - k.k->type == KEY_TYPE_btree_ptr_v2) + if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + pr_buf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -EINVAL; + } - if (k.k->type == KEY_TYPE_btree_ptr && - !extent_entry_is_ptr(entry)) - return "has non ptr field"; + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + pr_buf(err, "has non ptr field"); + return -EINVAL; + } switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - reason = extent_ptr_invalid(c, k, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + false, err); + if (ret) + return ret; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: @@ -1070,22 +1097,30 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); if (crc.offset + crc.live_size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; + crc.uncompressed_size) { + pr_buf(err, "checksum offset + key size > uncompressed size"); + return -EINVAL; + } size_ondisk = crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + pr_buf(err, "invalid checksum type"); + return -EINVAL; + } - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) - return "invalid compression type"; + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + pr_buf(err, "invalid compression type"); + return -EINVAL; + } if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; + else if (nonce != crc.offset + crc.nonce) { + pr_buf(err, "incorrect nonce"); + return -EINVAL; + } } break; case BCH_EXTENT_ENTRY_stripe_ptr: @@ -1093,13 +1128,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } - devs = bch2_bkey_devs(k); - bubble_sort(devs.devs, devs.nr, u8_cmp); - for (i = 0; i + 1 < devs.nr; i++) - if (devs.devs[i] == devs.devs[i + 1]) - return 
"multiple ptrs to same device"; - - return NULL; + return 0; } void bch2_ptr_swab(struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ae650849d98a..21f79e663c74 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -367,13 +367,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -396,13 +395,11 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, /* KEY_TYPE_extent: */ -const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent (struct bkey_ops) { \ - .key_invalid = bch2_extent_invalid, \ - .val_to_text = bch2_extent_to_text, \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ @@ -412,7 +409,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -618,7 +615,7 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3735397ee9c5..2f7bafc7db13 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -293,76 +293,89 @@ int bch2_inode_write(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } -const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) +static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - if (k.k->p.inode) - return "nonzero k.p.inode"; - - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; - - if (k.k->p.offset < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + if (k.k->p.inode) { + pr_buf(err, "nonzero k.p.inode"); + return -EINVAL; + } - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; + if (k.k->p.offset < BLOCKDEV_INODE_MAX) { + pr_buf(err, "fs inode in 
blockdev range"); + return -EINVAL; + } - if (bch2_inode_unpack(k, &unpacked)) - return "invalid variable length fields"; + if (bch2_inode_unpack(k, &unpacked)){ + pr_buf(err, "invalid variable length fields"); + return -EINVAL; + } - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { + pr_buf(err, "invalid data checksum type (%u >= %u", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); + return -EINVAL; + } - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { + pr_buf(err, "invalid data checksum type (%u >= %u)", + unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); + return -EINVAL; + } if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) - return "flagged as unlinked but bi_nlink != 0"; + unpacked.bi_nlink != 0) { + pr_buf(err, "flagged as unlinked but bi_nlink != 0"); + return -EINVAL; + } - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) - return "subvolume root but not a directory"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { + pr_buf(err, "subvolume root but not a directory"); + return -EINVAL; + } - return NULL; + return 0; } -const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - struct bch_inode_unpacked unpacked; - - if (k.k->p.inode) - return "nonzero k.p.inode"; - - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; - - if (k.k->p.offset < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -EINVAL; + } - if (bch2_inode_unpack(k, &unpacked)) - return "invalid variable length fields"; + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + pr_buf(err, "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -EINVAL; + } - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + return __bch2_inode_invalid(k, err); +} - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; +int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) - return "flagged as unlinked but bi_nlink != 0"; + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -EINVAL; + } - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) - return "subvolume root but not a directory"; + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + pr_buf(err, "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -EINVAL; + } - return NULL; + return __bch2_inode_invalid(k, err); } static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -396,16 +409,21 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, 
__bch2_inode_unpacked_to_text(out, &inode); } -const char *bch2_inode_generation_invalid(const struct bch_fs *c, - struct bkey_s_c k) +int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (k.k->p.inode) - return "nonzero k.p.inode"; + if (k.k->p.inode) { + pr_buf(err, "nonzero k.p.inode"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 2337ecfc600e..e3418dc4a1e9 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,8 +6,8 @@ extern const char * const bch2_inode_opts[]; -const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ @@ -30,10 +30,8 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v2; } -const char *bch2_inode_generation_invalid(const struct bch_fs *, - struct bkey_s_c); -void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ .key_invalid = bch2_inode_generation_invalid, \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3974d043fd8a..56221e316ee6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -209,7 +209,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, unsigned version, int big_endian, int write) { void *next = vstruct_next(entry); - const char *invalid; + struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, @@ -249,22 +249,28 @@ static int journal_validate_key(struct bch_fs *c, const char *where, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id)); - if (invalid) { - struct printbuf buf = PRINTBUF; + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s)); + pr_newline(&buf); + pr_indent_push(&buf, 2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", - type, where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s), - invalid, buf.buf); - printbuf_exit(&buf); + pr_newline(&buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), &buf); + + mustfix_fsck_err(c, "%s", buf.buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 
journal_entry_null_range(vstruct_next(entry), next); + + printbuf_exit(&buf); return FSCK_DELETED_KEY; } @@ -272,6 +278,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 4f0e6960e597..c20a3bc2336b 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -8,14 +8,18 @@ #include "lru.h" #include "recovery.h" -const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - if (bkey_val_bytes(k.k) < sizeof(*lru)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) < sizeof(*lru)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*lru)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 4db6a8399332..0af62ecf6638 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_lru (struct bkey_ops) { \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index ca029a00e7b8..5f370da2f3d2 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -57,15 +57,22 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (k.k->p.inode >= QTYP_NR) - return "invalid quota type"; + if (k.k->p.inode >= QTYP_NR) { + pr_buf(err, "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_quota)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 51e4f9713ef0..4ba40fce39a8 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota (struct bkey_ops) { \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6824730945d4..e07f0339d87e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -25,18 +25,25 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - if (bkey_val_bytes(p.k) 
!= sizeof(*p.v)) - return "incorrect value size"; + if (bkey_val_bytes(p.k) != sizeof(*p.v)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(p.k), sizeof(*p.v)); + return -EINVAL; + } if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && - le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) - return "idx < front_pad"; + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { + pr_buf(err, "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, @@ -70,14 +77,18 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ -const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - if (bkey_val_bytes(r.k) < sizeof(*r.v)) - return "incorrect value size"; + if (bkey_val_bytes(r.k) < sizeof(*r.v)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(r.k), sizeof(*r.v)); + return -EINVAL; + } - return bch2_bkey_ptrs_invalid(c, k); + return bch2_bkey_ptrs_invalid(c, k, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -118,12 +129,16 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ -const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, - struct bkey_s_c k) +int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) - return "incorrect value size"; - return NULL; + if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); + return -EINVAL; + } + + return 0; } void bch2_indirect_inline_data_to_text(struct printbuf *out, diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8eb41c0292eb..d292761f8a98 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -15,7 +15,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .atomic_trigger = bch2_mark_reflink_p, \ } -const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, @@ -29,8 +29,8 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, .atomic_trigger = bch2_mark_extent, \ } -const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, - struct bkey_s_c); +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, + struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, 
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 20c6b21e54d3..d3f043f90110 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -26,39 +26,55 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->subvol)); } -const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || - bkey_cmp(k.k->p, POS(0, 1)) < 0) - return "bad pos"; + bkey_cmp(k.k->p, POS(0, 1)) < 0) { + pr_buf(err, "bad pos"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) - return "bad val size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { + pr_buf(err, "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); + return -EINVAL; + } s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - if (id && id <= k.k->p.offset) - return "bad parent node"; + if (id && id <= k.k->p.offset) { + pr_buf(err, "bad parent node (%u <= %llu)", + id, k.k->p.offset); + return -EINVAL; + } - if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) - return "children not normalized"; + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { + pr_buf(err, "children not normalized"); + return -EINVAL; + } if (s.v->children[0] && - s.v->children[0] == s.v->children[1]) - return "duplicate child nodes"; + s.v->children[0] == s.v->children[1]) { + pr_buf(err, "duplicate child nodes"); + return -EINVAL; + } for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - if (id >= k.k->p.offset) - return "bad child node"; + if (id >= k.k->p.offset) { + pr_buf(err, "bad child node (%u >= %llu)", + id, k.k->p.offset); + return -EINVAL; + } } - return NULL; + return 0; } int bch2_mark_snapshot(struct btree_trans *trans, @@ -729,18 +745,22 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, /* Subvolumes: */ -const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) - return "invalid pos"; - - if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) - return "invalid pos"; + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || + bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { + pr_buf(err, "invalid pos"); + return -EINVAL; + } - if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) - return "bad val size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { + pr_buf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); + return -EINVAL; + } - return NULL; + return 0; } void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index b3d5ae49101d..f466bf7e4543 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -6,7 +6,7 @@ #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); #define bch2_bkey_ops_snapshot (struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -96,7 +96,7 @@ int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int 
bch2_fs_snapshots_start(struct bch_fs *); -const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume (struct bkey_ops) { \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 270276a0289f..55c4d48f8b38 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -69,32 +69,51 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) +int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, + struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { + pr_buf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*xattr.v)); + return -EINVAL; + } if (bkey_val_u64s(k.k) < xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))) - return "value too small"; + le16_to_cpu(xattr.v->x_val_len))) { + pr_buf(err, "value too small (%zu < %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))); + return -EINVAL; + } + /* XXX why +4 ? */ if (bkey_val_u64s(k.k) > xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)) - return "value too big"; + le16_to_cpu(xattr.v->x_val_len) + 4)) { + pr_buf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)); + return -EINVAL; + } handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) - return "invalid type"; + if (!handler) { + pr_buf(err, "invalid type (%u)", xattr.v->x_type); + return -EINVAL; + } - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) - return "xattr name has invalid characters"; + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { + pr_buf(err, "xattr name has invalid characters"); + return -EINVAL; + } - return NULL; + return 0; } void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index f4f896545e1c..3fd03018fdd8 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr (struct bkey_ops) { \ -- cgit From afb6f7f61ba38f4d4d96e8d1bf5fb9e7809e6c10 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Apr 2022 23:36:56 -0400 Subject: bcachefs: Silence spurious copygc err when shutting down Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b9e1bd7b1d05..cd7a9d81dfe8 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -290,10 +290,10 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); - if (ret) { + if (ret < 0) bch_err(c, "error %i from bch2_move_data() in copygc", ret); + if (ret) return ret; - } 
ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); if (ret) { -- cgit From e1effd42a1cb40048002f594c12e823b5e33ed5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 5 Apr 2022 13:44:18 -0400 Subject: bcachefs: More improvements for alloc info checks - Move checks for whether the device & bucket are valid from the .key_invalid method to bch2_check_alloc_key(). This is because .key_invalid() is called on keys that may no longer exist (post journal replay), which is a problem when removing/resizing devices. - We weren't checking the need_discard btree to ensure that every set bucket has a corresponding alloc key. This refactors the code for checking the freespace btree, so that it now checks both. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 142 +++++++++++++++++------------------------ fs/bcachefs/alloc_background.h | 14 +++- fs/bcachefs/buckets.c | 13 ++-- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/super.c | 9 --- 5 files changed, 81 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 42ef752932eb..588f43830a36 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -306,11 +306,6 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - if (!bch2_dev_exists2(c, k.k->p.inode)) { - pr_buf(err, "invalid device (%llu)", k.k->p.inode); - return -EINVAL; - } - /* allow for unknown fields */ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { pr_buf(err, "incorrect value size (%zu < %u)", @@ -325,11 +320,6 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin { struct bkey_alloc_unpacked u; - if (!bch2_dev_exists2(c, k.k->p.inode)) { - pr_buf(err, "invalid device (%llu)", k.k->p.inode); - return -EINVAL; - } - if (bch2_alloc_unpack_v2(&u, k)) { pr_buf(err, "unpack error"); return -EINVAL; @@ -341,20 +331,6 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_alloc_unpacked u; - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, k.k->p.inode)) { - pr_buf(err, "invalid device (%llu)", k.k->p.inode); - return -EINVAL; - } - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - - if (k.k->p.offset < ca->mi.first_bucket || - k.k->p.offset >= ca->mi.nbuckets) { - pr_buf(err, "invalid bucket"); - return -EINVAL; - } if (bch2_alloc_unpack_v3(&u, k)) { pr_buf(err, "unpack error"); @@ -366,18 +342,9 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, k.k->p.inode)) { - pr_buf(err, "invalid device (%llu)", k.k->p.inode); - return -EINVAL; - } - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - - if (k.k->p.offset < ca->mi.first_bucket || - k.k->p.offset >= ca->mi.nbuckets) { - pr_buf(err, "invalid bucket"); + if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { + pr_buf(err, "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); return -EINVAL; } @@ -577,6 +544,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, struct btree_iter *alloc_iter) { struct bch_fs *c = trans->c; + struct bch_dev *ca; struct btree_iter discard_iter, freespace_iter; struct bch_alloc_v4 a; unsigned discard_key_type, 
freespace_key_type; @@ -593,7 +561,16 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (ret) return ret; + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + "alloc key for invalid device or bucket")) + return bch2_btree_delete_at(trans, alloc_iter, 0); + + ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + if (!ca->mi.freespace_initialized) + return 0; + bch2_alloc_to_v4(alloc_k, &a); + discard_key_type = bucket_state(a) == BUCKET_need_discard ? KEY_TYPE_set : 0; freespace_key_type = bucket_state(a) == BUCKET_free @@ -668,21 +645,8 @@ fsck_err: return ret; } -static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) -{ - struct bch_dev *ca; - - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) - return false; - - ca = bch_dev_bkey_exists(c, pos.inode); - return pos.offset >= ca->mi.first_bucket && - pos.offset < ca->mi.nbuckets; -} - -static int bch2_check_freespace_key(struct btree_trans *trans, - struct btree_iter *freespace_iter, - bool initial) +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; @@ -691,10 +655,13 @@ static int bch2_check_freespace_key(struct btree_trans *trans, u64 genbits; struct bpos pos; struct bkey_i *update; + enum bucket_state state = iter->btree_id == BTREE_ID_need_discard + ? BUCKET_need_discard + : BUCKET_free; struct printbuf buf = PRINTBUF; int ret; - freespace_k = bch2_btree_iter_peek(freespace_iter); + freespace_k = bch2_btree_iter_peek(iter); if (!freespace_k.k) return 1; @@ -702,15 +669,16 @@ static int bch2_check_freespace_key(struct btree_trans *trans, if (ret) return ret; - pos = freespace_iter->pos; + pos = iter->pos; pos.offset &= ~(~0ULL << 56); - genbits = freespace_iter->pos.offset & (~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, - "%llu:%llu set in freespace btree but device or bucket does not exist", - pos.inode, pos.offset)) + "%llu:%llu set in %s btree but device or bucket does not exist", + pos.inode, pos.offset, + bch2_btree_ids[iter->btree_id])) goto delete; k = bch2_btree_iter_peek_slot(&alloc_iter); @@ -720,11 +688,13 @@ static int bch2_check_freespace_key(struct btree_trans *trans, bch2_alloc_to_v4(k, &a); - if (fsck_err_on(bucket_state(a) != BUCKET_free || - genbits != alloc_freespace_genbits(a), c, - "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", + if (fsck_err_on(bucket_state(a) != state || + (state == BUCKET_free && + genbits != alloc_freespace_genbits(a)), c, + "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - bucket_state(a) == BUCKET_free, + bch2_btree_ids[iter->btree_id], + bucket_state(a) == state, genbits >> 56, alloc_freespace_genbits(a) >> 56)) goto delete; out: @@ -734,46 +704,54 @@ fsck_err: printbuf_exit(&buf); return ret; delete: - update = bch2_trans_kmalloc(trans, sizeof(*update)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; + if (iter->btree_id == BTREE_ID_freespace) { + /* should probably add a helper for deleting extents */ + update = bch2_trans_kmalloc(trans, sizeof(*update)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; - bkey_init(&update->k); - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, 1); + bkey_init(&update->k); + update->k.p = iter->pos; + bch2_key_resize(&update->k, 
1); - ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: - bch2_trans_commit(trans, NULL, NULL, 0); + ret = bch2_trans_update(trans, iter, update, 0); + } else { + ret = bch2_btree_delete_at(trans, iter, 0); + } goto out; } -int bch2_check_alloc_info(struct bch_fs *c, bool initial) +int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int ret = 0, last_dev = -1; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - if (k.k->p.inode != last_dev) { - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - - if (!ca->mi.freespace_initialized) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); - continue; - } + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_alloc_key(&trans, &iter)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); - last_dev = k.k->p.inode; - } + if (ret) + goto err; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_check_alloc_key(&trans, &iter)); + bch2_check_discard_freespace_key(&trans, &iter)); if (ret) break; + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(&trans, &iter); @@ -784,7 +762,7 @@ int bch2_check_alloc_info(struct bch_fs *c, bool initial) BTREE_ITER_PREFETCH); while (1) { ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_check_freespace_key(&trans, &iter, initial)); + bch2_check_discard_freespace_key(&trans, &iter)); if (ret) break; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 93bd8feb9ebc..7ca5bfd37027 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -11,6 +11,18 @@ /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, pos.inode)) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) { return a.gen - a.oldest_gen; @@ -113,7 +125,7 @@ int bch2_alloc_read(struct bch_fs *); int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_check_alloc_info(struct bch_fs *, bool); +int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5b78e8f983a1..31720093de45 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -511,14 +511,9 @@ int bch2_mark_alloc(struct btree_trans *trans, u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bch_alloc_v4 old_a, new_a; - struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); + struct bch_dev *ca; int ret = 0; - if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || - new.k->p.offset >= ca->mi.nbuckets, trans, - "alloc key outside range of device's buckets")) - return -EIO; - /* * alloc btree is read in by bch2_alloc_read, not gc: */ @@ -526,6 +521,12 @@ int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or 
bucket")) + return -EIO; + + ca = bch_dev_bkey_exists(c, new.k->p.inode); + bch2_alloc_to_v4(old, &old_a); bch2_alloc_to_v4(new, &new_a); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f9215cc7cb09..1fe3e81eaa3d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1237,7 +1237,7 @@ use_clean: if (c->opts.fsck) { bch_info(c, "checking need_discard and freespace btrees"); err = "error checking need_discard and freespace btrees"; - ret = bch2_check_alloc_info(c, true); + ret = bch2_check_alloc_info(c); if (ret) goto err; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 037923bca742..3183f49a488f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1474,15 +1474,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - /* - * must flush all existing journal entries, they might have - * (overwritten) keys that point to the device we're removing: - */ - bch2_journal_flush_all_pins(&c->journal); - /* - * hack to ensure bch2_replicas_gc2() clears out entries to this device - */ - bch2_journal_meta(&c->journal); ret = bch2_journal_error(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); -- cgit From 275c8426fb8fd475e9991b3aa1b20f66069e594f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 3 Apr 2022 21:50:25 -0400 Subject: bcachefs: Add rw to .key_invalid() This adds a new parameter to .key_invalid() methods for whether the key is being read or written; the idea being that methods can do more aggressive checks when a key is newly created and being written, when we wouldn't want to delete the key because of those checks. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++++---- fs/bcachefs/alloc_background.h | 8 ++++---- fs/bcachefs/bkey_methods.c | 23 ++++++++++++----------- fs/bcachefs/bkey_methods.h | 18 ++++++++++++------ fs/bcachefs/btree_io.c | 13 +++++++------ fs/bcachefs/btree_update_interior.c | 6 ++++-- fs/bcachefs/btree_update_leaf.c | 6 ++++-- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/ec.h | 2 +- fs/bcachefs/extents.c | 14 +++++++------- fs/bcachefs/extents.h | 10 ++++++---- fs/bcachefs/inode.c | 6 +++--- fs/bcachefs/inode.h | 7 ++++--- fs/bcachefs/journal_io.c | 4 ++-- fs/bcachefs/lru.c | 2 +- fs/bcachefs/lru.h | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/quota.h | 2 +- fs/bcachefs/reflink.c | 8 ++++---- fs/bcachefs/reflink.h | 8 +++++--- fs/bcachefs/subvolume.c | 4 ++-- fs/bcachefs/subvolume.h | 6 ++++-- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 2 +- 26 files changed, 99 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 588f43830a36..cad39119949a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -302,7 +302,8 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -316,7 +317,8 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin return 0; } -int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct 
bkey_alloc_unpacked u; @@ -328,7 +330,8 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin return 0; } -int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -340,7 +343,8 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin return 0; } -int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { pr_buf(err, "bad val size (%zu != %zu)", diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 7ca5bfd37027..9c6a590fa073 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -78,10 +78,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k, struct printbuf *); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 0351cbe7d48e..62ce1264731a 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -23,7 +23,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { return 0; } @@ -37,7 +37,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, } static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_val_bytes(k.k)) { pr_buf(err, "incorrect value size (%zu != 0)", @@ -53,7 +53,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, } static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { pr_buf(err, "incorrect value size (%zu != %zu)", @@ -73,7 +73,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, } static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { return 0; } @@ -94,7 +94,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, } static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_val_bytes(k.k)) { pr_buf(err, "incorrect value size (%zu != %zu)", @@ -122,14 
+122,15 @@ const struct bkey_ops bch2_bkey_ops[] = { #undef x }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) { if (k.k->type >= KEY_TYPE_MAX) { pr_buf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); return -EINVAL; } - return bch2_bkey_ops[k.k->type].key_invalid(c, k, err); + return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); } static unsigned bch2_key_types_allowed[] = { @@ -198,7 +199,7 @@ static unsigned bch2_key_types_allowed[] = { int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - struct printbuf *err) + int rw, struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { pr_buf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); @@ -254,10 +255,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - struct printbuf *err) + int rw, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type, err) ?: - bch2_bkey_val_invalid(c, k, err); + return __bch2_bkey_invalid(c, k, type, rw, err) ?: + bch2_bkey_val_invalid(c, k, rw, err); } int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 4b90d0873be6..5c55e8bfe158 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -12,10 +12,16 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. +*/ struct bkey_ops { - /* Returns reason for being invalid if invalid, else NULL: */ - int (*key_invalid)(const struct bch_fs *, struct bkey_s_c, - struct printbuf *); + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -32,11 +38,11 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, struct printbuf *); + enum btree_node_type, int, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, struct printbuf *); + enum btree_node_type, int, struct printbuf *); int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d2b3ff6b9b15..ba1d775039a3 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -764,12 +764,12 @@ fsck_err: static int bset_key_invalid(struct bch_fs *c, struct btree *b, struct bkey_s_c k, - bool updated_range, int write, + bool updated_range, int rw, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), err) ?: + return __bch2_bkey_invalid(c, k, btree_node_type(b), rw, err) ?: (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: - (write ? bch2_bkey_val_invalid(c, k, err) : 0); + (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, rw, err) : 0); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -1071,7 +1071,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, printbuf_reset(&buf); - if (bch2_bkey_val_invalid(c, u.s_c, &buf) || + if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { printbuf_reset(&buf); @@ -1079,7 +1079,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, pr_buf(&buf, "invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); pr_buf(&buf, "\n "); - bch2_bkey_val_invalid(c, u.s_c, &buf); + bch2_bkey_val_invalid(c, u.s_c, READ, &buf); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -1730,7 +1730,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct printbuf buf = PRINTBUF; int ret; - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree, &buf); + ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE, &buf); if (ret) bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0e3b3565be59..10d22d9b8c0d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1184,13 +1184,15 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf) ?: + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { printbuf_reset(&buf); pr_buf(&buf, "inserting invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); pr_buf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf); bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); bch2_fs_inconsistent(c, "%s", buf.buf); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fce93ed65ed9..2c20ad89ca0a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -866,7 +866,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, int ret, u64s_delta = 0; trans_for_each_update(trans, i) { - if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf)) { + if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, WRITE, &buf)) { printbuf_reset(&buf); pr_buf(&buf, "invalid bkey on insert from %s -> %ps", trans->fn, (void *) i->ip_allocated); @@ -876,7 +877,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); pr_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, WRITE, &buf); bch2_fs_fatal_error(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index e8a284a69be4..281959885bb0 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -84,7 +84,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 
unsigned len; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 046f297a4eff..b1466932c768 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent (struct bkey_ops) { \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index cf9ecb7711c6..7a524f604875 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -103,7 +103,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -129,7 +129,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - return bch2_bkey_ptrs_invalid(c, k, err); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8e866460f8a0..af7f8eee94b0 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -7,7 +7,7 @@ #include "keylist_types.h" int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, - struct printbuf *); + int rw, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e09636023882..c56925d94bfe 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -156,7 +156,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { pr_buf(err, "value too big (%zu > %u)", @@ -164,7 +164,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - return bch2_bkey_ptrs_invalid(c, k, err); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -174,7 +174,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -197,11 +197,11 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - return bch2_bkey_ptrs_invalid(c, k, err); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) + struct bkey_s_c k) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -366,7 +366,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -1059,7 +1059,7 @@ static int extent_ptr_invalid(const struct bch_fs *c, } int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const 
union bch_extent_entry *entry; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 21f79e663c74..4f41f0fd6cb1 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -367,11 +367,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -409,7 +409,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -615,7 +616,8 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2f7bafc7db13..28f4f192772f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -339,7 +339,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) } int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -359,7 +359,7 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -410,7 +410,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (k.k->p.inode) { pr_buf(err, "nonzero k.p.inode"); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index e3418dc4a1e9..9442600a7440 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,8 +6,8 @@ extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ @@ -30,7 +30,8 @@ static inline bool bkey_is_inode(const struct bkey 
*k) k->type == KEY_TYPE_inode_v2; } -int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 56221e316ee6..5ea685fd15e7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -250,7 +250,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, write, NULL, bkey_to_packed(k)); if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), &buf)) { + __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:", type, where, @@ -262,7 +262,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); pr_newline(&buf); bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), &buf); + __btree_node_type(level, btree_id), write, &buf); mustfix_fsck_err(c, "%s", buf.buf); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c20a3bc2336b..c6f433153286 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -9,7 +9,7 @@ #include "recovery.h" int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { const struct bch_lru *lru = bkey_s_c_to_lru(k).v; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 0af62ecf6638..e8f508174b0a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_lru (struct bkey_ops) { \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 5f370da2f3d2..364ef6314651 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -58,7 +58,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { pr_buf(err, "invalid quota type (%llu >= %u)", diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 4ba40fce39a8..8c67ae1da7c7 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota (struct bkey_ops) { \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e07f0339d87e..6a81eb9b41a0 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -26,7 +26,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -78,7 +78,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s 
_l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); @@ -88,7 +88,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - return bch2_bkey_ptrs_invalid(c, k, err); + return bch2_bkey_ptrs_invalid(c, k, rw, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -130,7 +130,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { pr_buf(err, "incorrect value size (%zu < %zu)", diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index d292761f8a98..e0a9d8e4d1ca 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,7 +2,8 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -15,7 +16,8 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .atomic_trigger = bch2_mark_reflink_p, \ } -int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, @@ -30,7 +32,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, } int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, - struct printbuf *); + int, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d3f043f90110..81bdcb7795ae 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -27,7 +27,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; @@ -746,7 +746,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f466bf7e4543..b1739d29c7d4 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -6,7 +6,8 @@ #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); #define bch2_bkey_ops_snapshot (struct bkey_ops) { \ .key_invalid = 
bch2_snapshot_invalid, \ @@ -96,7 +97,8 @@ int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume (struct bkey_ops) { \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 55c4d48f8b38..b5e42ca35dea 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,7 +70,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, - struct printbuf *err) + int rw, struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 3fd03018fdd8..66d7a1e30350 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr (struct bkey_ops) { \ -- cgit From 292dea86dfc974e96a4b4972f4268611c2470d28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Apr 2022 14:35:10 -0400 Subject: bcachefs: fsck: Work around transaction restarts In check_extents() and check_dirents(), we're working towards only handling transaction restarts in one place, at the top level - but we're not there yet. check_i_sectors() and check_subdir_count() handle transaction restarts locally, which means the iterator for the dirent/extent is left unlocked (should_be_locked == 0), leading to asserts popping when we go to do updates. This patch hacks around this for now, until we can delete the offending code. 
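In sketch form, the workaround is simply to re-peek whenever a helper may have handled a restart underneath us (names as in the hunks below; this illustrates the pattern, it is not an exact copy of the patch):

peek:
	k = bch2_btree_iter_peek(iter);
	if (!k.k)
		goto out;

	/*
	 * ... helpers such as check_i_sectors() run here and may handle a
	 * transaction restart themselves ...
	 */

	if (!iter->path->should_be_locked)
		/* the restart left the path unlocked; peek again before updating */
		goto peek;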
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 10754b13ec15..6a89b0694e50 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1146,7 +1146,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; int ret = 0; - +peek: k = bch2_btree_iter_peek(iter); if (!k.k) goto out; @@ -1173,6 +1173,15 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } + + if (!iter->path->should_be_locked) { + /* + * hack: check_i_sectors may have handled a transaction restart, + * it shouldn't be but we need to fix the new i_sectors check + * code and delete the old bch2_count_inode_sectors() first + */ + goto peek; + } #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; @@ -1464,7 +1473,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; int ret = 0; - +peek: k = bch2_btree_iter_peek(iter); if (!k.k) goto out; @@ -1492,6 +1501,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } + if (!iter->path->should_be_locked) { + /* hack: see check_extent() */ + goto peek; + } + ret = __walk_inode(trans, dir, k.k->p); if (ret < 0) goto err; -- cgit From 11c7d3e8176a2e674faefa9d9d14210f5062326c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Apr 2022 17:22:47 -0400 Subject: bcachefs: Check for read_time == 0 in bch2_alloc_v4_invalid() We've been seeing this error in fsck and we weren't able to track down where it came from - but now that .key_invalid methods take a rw argument, we can safely check for this. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index cad39119949a..f030030a8b50 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -346,12 +346,23 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { pr_buf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); return -EINVAL; } + if (rw == WRITE) { + if (a.v->cached_sectors && + !a.v->dirty_sectors && + !a.v->io_time[READ]) { + pr_buf(err, "cached bucket with read_time == 0"); + return -EINVAL; + } + } + return 0; } -- cgit From 1d8a268940045b73f43f86e6332bdfdfe64d0fa0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Apr 2022 17:28:09 -0400 Subject: bcachefs: Improve btree_bad_header() In the future printbufs will be mempool-ified, so we shouldn't be using more than one at a time if we don't have to. This also fixes an extra trailing newline. 
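The shape of the new code, as a rough sketch (one printbuf, built up incrementally, emitted once, then freed):

	struct printbuf buf = PRINTBUF;

	pr_buf(&buf, "btree node header doesn't match ptr\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	pr_buf(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);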
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 0e3db9ee65d2..6557fcb24b21 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -767,31 +767,29 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; + struct printbuf buf = PRINTBUF; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); - bch2_bpos_to_text(&buf2, b->data->min_key); - bch2_bpos_to_text(&buf3, b->data->max_key); - - bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: %s\n" - "header: btree %s level %llu\n" - "min %s max %s\n", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1.buf, - bch2_btree_ids[BTREE_NODE_ID(b->data)], - BTREE_NODE_LEVEL(b->data), - buf2.buf, buf3.buf); - - printbuf_exit(&buf3); - printbuf_exit(&buf2); - printbuf_exit(&buf1); + pr_buf(&buf, + "btree node header doesn't match ptr\n" + "btree %s level %u\n" + "ptr: ", + bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + pr_buf(&buf, "\nheader: btree %s level %llu\n" + "min ", + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data)); + bch2_bpos_to_text(&buf, b->data->min_key); + + pr_buf(&buf, "\nmax "); + bch2_bpos_to_text(&buf, b->data->max_key); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) -- cgit From 62491956f48e1afda98f50250d4690131e87d6ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Apr 2022 17:32:57 -0400 Subject: bcachefs: Move alloc assertion to .key_invalid() .key_invalid is a better place for this assertion. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++++++ fs/bcachefs/alloc_background.h | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f030030a8b50..090fdee58157 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -361,6 +361,14 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, pr_buf(err, "cached bucket with read_time == 0"); return -EINVAL; } + + if (!a.v->dirty_sectors && + !a.v->cached_sectors && + !a.v->stripe && + a.v->data_type) { + pr_buf(err, "empty, but data_type nonzero"); + return -EINVAL; + } } return 0; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 9c6a590fa073..11e0bca3e7f2 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -44,7 +44,6 @@ static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) return BUCKET_dirty; if (a.cached_sectors) return BUCKET_cached; - BUG_ON(a.data_type); if (BCH_ALLOC_V4_NEED_DISCARD(&a)) return BUCKET_need_discard; if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) -- cgit From 5e05d7ed3d128c3d4dadee3260cd5b3f3fa1bb0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Apr 2022 17:34:57 -0400 Subject: bcachefs: Use bch2_trans_inconsistent() more This gets us better error messages. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2c20ad89ca0a..c502e96748d8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -880,7 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, WRITE, &buf); - bch2_fs_fatal_error(c, "%s", buf.buf); + bch2_trans_inconsistent(trans, "%s", buf.buf); printbuf_exit(&buf); return -EINVAL; } -- cgit From 48620e5177ae7cd91722f5c504c5138160b90df4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Apr 2022 17:41:02 -0400 Subject: bcachefs: Topology repair fixes - We were failing to start topology repair, because we hadn't set the superblock flag indicating it needed to run - set_node_min() forget to update the btree node's key - bch2_gc_alloc_reset() didn't reset data type, leading to inserting an invalid key that was empty but had nonzero data type Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 747667ce131d..14b772cd8fe5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -214,7 +214,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) } bch2_btree_node_drop_keys_outside_node(b); - + bkey_copy(&b->key, &new->k_i); return 0; } @@ -359,7 +359,7 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; - struct printbuf buf; + struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) @@ -387,7 +387,7 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(ret == -EIO, c, - "Unreadable btree node at btree %s level %u:\n" + "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, @@ -1498,6 +1498,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) g->data_type == BCH_DATA_cached || g->data_type == BCH_DATA_parity)) continue; + g->data_type = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } @@ -1735,11 +1736,11 @@ again: if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && c->opts.fix_errors != FSCK_OPT_NO) { - bch_info(c, "starting topology repair pass"); + bch_info(c, "Starting topology repair pass"); ret = bch2_repair_topology(c); if (ret) goto out; - bch_info(c, "topology repair pass done"); + bch_info(c, "Topology repair pass done"); set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); } @@ -1750,6 +1751,7 @@ again: !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); ret = 0; } -- cgit From 8058ea64c31c8700eaab48c38a143d1c3817f1de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Apr 2022 18:38:16 -0400 Subject: bcachefs: Add a sysfs attr for triggering discards We're currently debugging an issue with discards not getting run; this patch adds a manual trigger so we can then watch the tracepoint while it runs. 
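Condensed, the change is three small pieces, sketched here with the names used in the diff that follows: a write-only attribute declaration, a hook in the filesystem STORE handler, and an entry in the internal attribute list.

write_attribute(trigger_discards);

	/* in STORE(bch2_fs): writing anything to the file kicks the work */
	if (attr == &sysfs_trigger_discards)
		bch2_do_discards(c);

	/* in bch2_fs_internal_files[]: */
	&sysfs_trigger_discards,

With this in place the background discard work can be kicked on demand from userspace (by writing to the trigger_discards file under the filesystem's internal sysfs directory; the exact path depends on how the sysfs kobjects are registered) while the discard tracepoint is watched.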
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e995b84b6172..872d7bed7b6b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -150,6 +150,7 @@ do { \ } while (0) write_attribute(trigger_gc); +write_attribute(trigger_discards); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); rw_attribute(gc_gens_pos); @@ -505,6 +506,9 @@ STORE(bch2_fs) #endif } + if (attr == &sysfs_trigger_discards) + bch2_do_discards(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -573,6 +577,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_trigger_gc, + &sysfs_trigger_discards, &sysfs_prune_cache, &sysfs_read_realloc_races, -- cgit From 822835ffeae411bbc8af104da9331fdf63a7bc12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Apr 2022 01:29:59 -0400 Subject: bcachefs: Fold bucket_state in to BCH_DATA_TYPES() Previously, we were missing accounting for buckets in need_gc_gens and need_discard states. This matters because buckets in those states need other btree operations done before they can be used, so they can't be conuted when checking current number of free buckets against the allocation watermark. Also, we weren't directly counting free buckets at all. Now, data type 0 == BCH_DATA_free, and free buckets are counted; this means we can get rid of the separate (poorly defined) count of unavailable buckets. This is a new on disk format version, with upgrade and fsck required for the accounting changes. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 143 +++++++++++++++++++++++++++-------------- fs/bcachefs/alloc_background.h | 52 ++++++++------- fs/bcachefs/alloc_foreground.c | 45 ++++++++----- fs/bcachefs/alloc_types.h | 6 +- fs/bcachefs/bcachefs_format.h | 35 ++++++++-- fs/bcachefs/bcachefs_ioctl.h | 11 ++-- fs/bcachefs/btree_gc.c | 32 ++++++++- fs/bcachefs/btree_gc.h | 6 ++ fs/bcachefs/buckets.c | 90 +++++++++++--------------- fs/bcachefs/buckets.h | 23 +++---- fs/bcachefs/buckets_types.h | 1 - fs/bcachefs/chardev.c | 9 ++- fs/bcachefs/journal_io.c | 4 +- fs/bcachefs/lru.c | 2 +- fs/bcachefs/movinggc.c | 11 +++- fs/bcachefs/recovery.c | 15 ++--- fs/bcachefs/super-io.c | 1 - fs/bcachefs/super.c | 2 + fs/bcachefs/sysfs.c | 12 ++-- 19 files changed, 298 insertions(+), 202 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 090fdee58157..3feaac33aaff 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -35,15 +35,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -const char * const bch2_bucket_states[] = { - "free", - "need gc gens", - "need discard", - "cached", - "dirty", - NULL -}; - struct bkey_alloc_unpacked { u64 journal_seq; u64 bucket; @@ -355,19 +346,54 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, } if (rw == WRITE) { - if (a.v->cached_sectors && - !a.v->dirty_sectors && - !a.v->io_time[READ]) { - pr_buf(err, "cached bucket with read_time == 0"); + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + pr_buf(err, "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); return -EINVAL; } - if (!a.v->dirty_sectors && - !a.v->cached_sectors && - !a.v->stripe && - a.v->data_type) { - pr_buf(err, "empty, but data_type nonzero"); - return 
-EINVAL; + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + if (a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe) { + pr_buf(err, "empty data type free but have data"); + return -EINVAL; + } + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + if (!a.v->dirty_sectors) { + pr_buf(err, "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + return -EINVAL; + } + break; + case BCH_DATA_cached: + if (!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe) { + pr_buf(err, "data type inconsistency"); + return -EINVAL; + } + + if (!a.v->io_time[READ]) { + pr_buf(err, "cached bucket with read_time == 0"); + return -EINVAL; + } + break; + case BCH_DATA_stripe: + if (!a.v->stripe) { + pr_buf(err, "data_type %s but stripe==0", + bch2_data_types[a.v->data_type]); + return -EINVAL; + } + break; } } @@ -394,9 +420,11 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_alloc_to_v4(k, &a); - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu need_inc_gen %llu", a.gen, a.oldest_gen, bch2_data_types[a.data_type], - a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); + a.journal_seq, + BCH_ALLOC_V4_NEED_DISCARD(&a), + BCH_ALLOC_V4_NEED_INC_GEN(&a)); pr_buf(out, " dirty_sectors %u", a.dirty_sectors); pr_buf(out, " cached_sectors %u", a.cached_sectors); pr_buf(out, " stripe %u", a.stripe); @@ -437,7 +465,7 @@ int bch2_alloc_read(struct bch_fs *c) static int bch2_bucket_do_index(struct btree_trans *trans, struct bkey_s_c alloc_k, - struct bch_alloc_v4 a, + const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; @@ -445,15 +473,14 @@ static int bch2_bucket_do_index(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; - enum bucket_state state = bucket_state(a); enum btree_id btree; enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; struct printbuf buf = PRINTBUF; int ret; - if (state != BUCKET_free && - state != BUCKET_need_discard) + if (a->data_type != BCH_DATA_free && + a->data_type != BCH_DATA_need_discard) return 0; k = bch2_trans_kmalloc(trans, sizeof(*k)); @@ -463,13 +490,13 @@ static int bch2_bucket_do_index(struct btree_trans *trans, bkey_init(&k->k); k->k.type = new_type; - switch (state) { - case BUCKET_free: + switch (a->data_type) { + case BCH_DATA_free: btree = BTREE_ID_freespace; - k->k.p = alloc_freespace_pos(alloc_k.k->p, a); + k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); bch2_key_resize(&k->k, 1); break; - case BUCKET_need_discard: + case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; k->k.p = alloc_k.k->p; break; @@ -523,6 +550,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, bch2_alloc_to_v4(old, &old_a); new_a = &bkey_i_to_alloc_v4(new)->v; + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + if (new_a->dirty_sectors > old_a.dirty_sectors || new_a->cached_sectors > old_a.cached_sectors) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); @@ -531,18 +560,18 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } - if (old_a.data_type && !new_a->data_type && - old_a.gen == new_a->gen && + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); } - if (bucket_state(old_a) != bucket_state(*new_a) || - (bucket_state(*new_a) == BUCKET_free && + if (old_a.data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); + ret = bch2_bucket_do_index(trans, old, &old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); if (ret) return ret; } @@ -594,9 +623,9 @@ static int bch2_check_alloc_key(struct btree_trans *trans, bch2_alloc_to_v4(alloc_k, &a); - discard_key_type = bucket_state(a) == BUCKET_need_discard + discard_key_type = a.data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; - freespace_key_type = bucket_state(a) == BUCKET_free + freespace_key_type = a.data_type == BCH_DATA_free ? KEY_TYPE_set : 0; bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, @@ -678,9 +707,9 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, u64 genbits; struct bpos pos; struct bkey_i *update; - enum bucket_state state = iter->btree_id == BTREE_ID_need_discard - ? BUCKET_need_discard - : BUCKET_free; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? 
BCH_DATA_need_discard + : BCH_DATA_free; struct printbuf buf = PRINTBUF; int ret; @@ -711,13 +740,13 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, bch2_alloc_to_v4(k, &a); - if (fsck_err_on(bucket_state(a) != state || - (state == BUCKET_free && + if (fsck_err_on(a.data_type != state || + (state == BCH_DATA_free && genbits != alloc_freespace_genbits(a)), c, "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), bch2_btree_ids[iter->btree_id], - bucket_state(a) == state, + a.data_type == state, genbits >> 56, alloc_freespace_genbits(a) >> 56)) goto delete; out: @@ -818,7 +847,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, bch2_alloc_to_v4(alloc_k, &a); - if (bucket_state(a) != BUCKET_cached) + if (a.data_type != BCH_DATA_cached) return 0; bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, @@ -928,10 +957,19 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, goto write; } - BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); + if (bch2_fs_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, c, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } - if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, - "%s\n incorrectly set in need_discard btree", + if (bch2_fs_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, c, + "bucket incorrectly set in need_discard btree\n" + "%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -EIO; goto out; @@ -955,6 +993,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0); out: @@ -1101,12 +1140,16 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); - for_each_member_device(ca, c, i) - while (!ret && should_invalidate_buckets(ca)) + for_each_member_device(ca, c, i) { + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + + while (!ret && nr_to_invalidate-- >= 0) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOFAIL, invalidate_one_bucket(&trans, ca)); + } bch2_trans_exit(&trans); percpu_ref_put(&c->writes); @@ -1139,7 +1182,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) bch2_alloc_to_v4(k, &a); ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_bucket_do_index(&trans, k, a, true)); + bch2_bucket_do_index(&trans, k, &a, true)); if (ret) break; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 11e0bca3e7f2..2bc622b305c2 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -28,32 +28,35 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) return a.gen - a.oldest_gen; } -enum bucket_state { - BUCKET_free, - BUCKET_need_gc_gens, - BUCKET_need_discard, - BUCKET_cached, - BUCKET_dirty, -}; - -extern const char * const bch2_bucket_states[]; - -static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) +static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, + u32 cached_sectors, + u32 stripe, + struct bch_alloc_v4 a, + enum bch_data_type data_type) { - if (a.dirty_sectors || a.stripe) - 
return BUCKET_dirty; - if (a.cached_sectors) - return BUCKET_cached; + if (dirty_sectors) + return data_type; + if (stripe) + return BCH_DATA_stripe; + if (cached_sectors) + return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BUCKET_need_discard; + return BCH_DATA_need_discard; if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BUCKET_need_gc_gens; - return BUCKET_free; + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + return __alloc_data_type(a.dirty_sectors, a.cached_sectors, + a.stripe, a, data_type); } static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) { - return bucket_state(a) == BUCKET_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; } static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) @@ -128,13 +131,14 @@ int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); -static inline bool should_invalidate_buckets(struct bch_dev *ca) +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) { - struct bch_dev_usage u = bch2_dev_usage_read(ca); + u64 free = u.d[BCH_DATA_free].buckets + + u.d[BCH_DATA_need_discard].buckets; - return u.d[BCH_DATA_cached].buckets && - u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < - ca->mi.nbuckets >> 7; + return clamp_t(s64, (ca->mi.nbuckets >> 7) - free, + 0, u.d[BCH_DATA_cached].buckets); } void bch2_do_invalidates(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 01abcf43341f..14162dd4d696 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -331,7 +331,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } - if (a.data_type != BUCKET_free) { + if (a.data_type != BCH_DATA_free) { pr_buf(&buf, "non free bucket in freespace btree\n" " freespace key "); bch2_bkey_val_to_text(&buf, c, freespace_k); @@ -417,7 +417,7 @@ again: bch2_alloc_to_v4(k, &a); - if (bucket_state(a) != BUCKET_free) + if (a.data_type != BCH_DATA_free) continue; (*buckets_seen)++; @@ -517,27 +517,31 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; - u64 avail = dev_buckets_available(ca, reserve); + struct bch_dev_usage usage; + u64 avail; u64 buckets_seen = 0; u64 skipped_open = 0; u64 skipped_need_journal_commit = 0; u64 skipped_nouse = 0; - - if (may_alloc_partial) { - ob = try_alloc_partial_bucket(c, ca, reserve); - if (ob) - return ob; - } + bool waiting = false; again: + usage = bch2_dev_usage_read(ca); + avail = __dev_buckets_available(ca, usage,reserve); + + if (usage.d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + + if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + + if (should_invalidate_buckets(ca, usage)) + bch2_do_invalidates(c); + if (!avail) { - if (cl) { + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); - /* recheck after putting ourself on waitlist */ - avail = dev_buckets_available(ca, reserve); - if (avail) { - closure_wake_up(&c->freelist_wait); - goto again; - } + waiting = true; + goto again; } if (!c->blocked_allocate) @@ -547,6 +551,15 @@ again: goto err; } + if (waiting) + closure_wake_up(&c->freelist_wait); + + if (may_alloc_partial) { + ob = try_alloc_partial_bucket(c, ca, reserve); + if (ob) 
+ return ob; + } + ob = likely(ca->mi.freespace_initialized) ? bch2_bucket_alloc_freelist(trans, ca, reserve, &buckets_seen, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b3bef7074511..5eed5ce67c57 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -43,14 +43,14 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - enum bch_data_type data_type:3; + enum bch_data_type data_type:8; unsigned valid:1; unsigned on_partial_list:1; - int alloc_reserve:3; + unsigned alloc_reserve:3; - unsigned sectors_free; u8 dev; u8 gen; + u32 sectors_free; u64 bucket; struct ec_stripe_new *ec; }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5faa42baeeba..a84a8e088953 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1222,13 +1222,16 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); /* BCH_SB_FIELD_replicas: */ #define BCH_DATA_TYPES() \ - x(none, 0) \ + x(free, 0) \ x(sb, 1) \ x(journal, 2) \ x(btree, 3) \ x(user, 4) \ x(cached, 5) \ - x(parity, 6) + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) enum bch_data_type { #define x(t, n) BCH_DATA_##t, @@ -1237,6 +1240,29 @@ enum bch_data_type { BCH_DATA_NR }; +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; @@ -1364,7 +1390,8 @@ struct bch_sb_field_journal_seq_blacklist { x(subvol_dirent, 17) \ x(inode_v2, 18) \ x(freespace, 19) \ - x(alloc_v4, 20) + x(alloc_v4, 20) \ + x(new_data_types, 21) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1822,7 +1849,7 @@ struct jset_entry_dev_usage { __u32 pad; __le64 buckets_ec; - __le64 buckets_unavailable; + __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 66ab3aea9767..5e0062c6ec5c 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -285,13 +285,14 @@ struct bch_ioctl_dev_usage { __u32 bucket_size; __u64 nr_buckets; - __u64 available_buckets; - __u64 buckets[BCH_DATA_NR]; - __u64 sectors[BCH_DATA_NR]; + __u64 buckets_ec; - __u64 ec_buckets; - __u64 ec_sectors; + struct bch_ioctl_dev_usage_type { + __u64 buckets; + __u64 sectors; + __u64 fragmented; + } d[BCH_DATA_NR]; }; /* diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 14b772cd8fe5..0b1717120cc3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1216,7 +1216,6 @@ static int bch2_gc_done(struct bch_fs *c, dev_usage_u64s()); copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); @@ -1301,6 +1300,9 @@ static int bch2_gc_start(struct bch_fs *c, percpu_ref_put(&ca->ref); return -ENOMEM; } + + this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, + ca->mi.nbuckets - ca->mi.first_bucket); } return 0; @@ -1325,10 +1327,11 @@ static int bch2_alloc_write_key(struct btree_trans *trans, 
{ struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc; + struct bucket gc, *b; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old, new; + enum bch_data_type type; int ret; k = bch2_btree_iter_peek_slot(iter); @@ -1340,7 +1343,29 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new = old; percpu_down_read(&c->mark_lock); - gc = *gc_bucket(ca, iter->pos.offset); + b = gc_bucket(ca, iter->pos.offset); + + /* + * b->data_type doesn't yet include need_discard & need_gc_gen states - + * fix that here: + */ + type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + old, + b->data_type); + if (b->data_type != type) { + struct bch_dev_usage *u; + + preempt_disable(); + u = this_cpu_ptr(ca->usage_gc); + u->d[b->data_type].buckets--; + b->data_type = type; + u->d[b->data_type].buckets++; + preempt_enable(); + } + + gc = *b; percpu_up_read(&c->mark_lock); if (metadata_only && @@ -1926,6 +1951,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 0665f5941fcc..8de54005e4ea 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -102,4 +102,10 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } +static inline void bch2_do_gc_gens(struct bch_fs *c) +{ + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); +} + #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 31720093de45..7fa76e737aa7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -283,9 +283,9 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -static inline int is_unavailable_bucket(struct bch_alloc_v4 a) +void bch2_dev_usage_init(struct bch_dev *ca) { - return a.dirty_sectors || a.stripe; + ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; } static inline int bucket_sectors_fragmented(struct bch_dev *ca, @@ -296,24 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca, : 0; } -static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) -{ - return a.cached_sectors && !a.dirty_sectors - ? 
BCH_DATA_cached - : a.data_type; -} - -static inline void account_bucket(struct bch_fs_usage *fs_usage, - struct bch_dev_usage *dev_usage, - enum bch_data_type type, - int nr, s64 size) -{ - if (type == BCH_DATA_sb || type == BCH_DATA_journal) - fs_usage->hidden += size; - - dev_usage->d[type].buckets += nr; -} - static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 old, struct bch_alloc_v4 new, @@ -324,23 +306,25 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - u = dev_usage_ptr(ca, journal_seq, gc); - if (bucket_type(old)) - account_bucket(fs_usage, u, bucket_type(old), - -1, -ca->mi.bucket_size); + if (data_type_is_hidden(old.data_type)) + fs_usage->hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new.data_type)) + fs_usage->hidden += ca->mi.bucket_size; - if (bucket_type(new)) - account_bucket(fs_usage, u, bucket_type(new), - 1, ca->mi.bucket_size); + u = dev_usage_ptr(ca, journal_seq, gc); - u->buckets_unavailable += - is_unavailable_bucket(new) - is_unavailable_bucket(old); + u->d[old.data_type].buckets--; + u->d[new.data_type].buckets++; + + u->buckets_ec -= (int) !!old.stripe; + u->buckets_ec += (int) !!new.stripe; u->d[old.data_type].sectors -= old.dirty_sectors; u->d[new.data_type].sectors += new.dirty_sectors; - u->d[BCH_DATA_cached].sectors += - (int) new.cached_sectors - (int) old.cached_sectors; + + u->d[BCH_DATA_cached].sectors += new.cached_sectors; + u->d[BCH_DATA_cached].sectors -= old.cached_sectors; u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); @@ -531,7 +515,8 @@ int bch2_mark_alloc(struct btree_trans *trans, bch2_alloc_to_v4(new, &new_a); if ((flags & BTREE_TRIGGER_INSERT) && - !old_a.data_type != !new_a.data_type && + data_type_is_empty(old_a.data_type) != + data_type_is_empty(new_a.data_type) && new.k->type == KEY_TYPE_alloc_v4) { struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; @@ -542,14 +527,16 @@ int bch2_mark_alloc(struct btree_trans *trans, * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ - new_a.journal_seq = !new_a.data_type && + new_a.journal_seq = data_type_is_empty(new_a.data_type) && (journal_seq == v->journal_seq || bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 
0 : journal_seq; v->journal_seq = new_a.journal_seq; } - if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { + if (!data_type_is_empty(old_a.data_type) && + data_type_is_empty(new_a.data_type) && + new_a.journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, @@ -561,24 +548,21 @@ int bch2_mark_alloc(struct btree_trans *trans, } } - if (!new_a.data_type && + if (new_a.data_type == BCH_DATA_free && (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) closure_wake_up(&c->freelist_wait); - if ((flags & BTREE_TRIGGER_INSERT) && - BCH_ALLOC_V4_NEED_DISCARD(&new_a) && - !new_a.journal_seq) + if (new_a.data_type == BCH_DATA_need_discard && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) bch2_do_discards(c); - if (!old_a.data_type && - new_a.data_type && - should_invalidate_buckets(ca)) + if (old_a.data_type != BCH_DATA_cached && + new_a.data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (bucket_state(new_a) == BUCKET_need_gc_gens) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } + if (new_a.data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); percpu_down_read(&c->mark_lock); if (!gc && new_a.gen != old_a.gen) @@ -704,6 +688,9 @@ static int check_bucket_ref(struct bch_fs *c, struct printbuf buf = PRINTBUF; int ret = 0; + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" @@ -748,7 +735,8 @@ static int check_bucket_ref(struct bch_fs *c, goto err; } - if (bucket_data_type && ptr_data_type && + if (!data_type_is_empty(bucket_data_type) && + ptr_data_type && bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" @@ -1401,14 +1389,8 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors); - if (ret) - goto out; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -out: + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 053b6dc215b3..518f5104a2f7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -121,12 +121,10 @@ static inline u8 ptr_stale(struct bch_dev *ca, /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +void bch2_dev_usage_init(struct bch_dev *); -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats, - enum alloc_reserve reserve) +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) { - s64 total = ca->mi.nbuckets - ca->mi.first_bucket; s64 reserved = 0; switch (reserve) { @@ -141,20 +139,19 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, fallthrough; case RESERVE_btree_movinggc: break; - default: - BUG(); } - if (WARN_ONCE(stats.buckets_unavailable > total, - "buckets_unavailable overflow (%llu > %llu)\n", - stats.buckets_unavailable, total)) - return 0; + return reserved; +} +static inline u64 
__dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage usage, + enum alloc_reserve reserve) +{ return max_t(s64, 0, - total - - stats.buckets_unavailable - + usage.d[BCH_DATA_free].buckets - ca->nr_open_buckets - - reserved); + bch2_dev_buckets_reserved(ca, reserve)); } static inline u64 dev_buckets_available(struct bch_dev *ca, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index e79a33795bf9..0a9dd5af3524 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -34,7 +34,6 @@ struct bucket_gens { struct bch_dev_usage { u64 buckets_ec; - u64 buckets_unavailable; struct { u64 buckets; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 6cd0a2739ce5..7b448b9551b6 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -501,13 +501,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; - arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = 0; + arg.buckets_ec = src.buckets_ec; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.d[i].buckets; - arg.sectors[i] = src.d[i].sectors; + arg.d[i].buckets = src.d[i].buckets; + arg.d[i].sectors = src.d[i].sectors; + arg.d[i].fragmented = src.d[i].fragmented; } percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 5ea685fd15e7..fad142196daa 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -585,9 +585,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs le64_to_cpu(u->d[i].fragmented)); } - pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", - le64_to_cpu(u->buckets_ec), - le64_to_cpu(u->buckets_unavailable)); + pr_buf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c6f433153286..267f2f8fb13b 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -155,7 +155,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, bch2_alloc_to_v4(k, &a); - if (fsck_err_on(bucket_state(a) != BUCKET_cached || + if (fsck_err_on(a.data_type != BCH_DATA_cached || a.io_time[READ] != lru_k.k->p.offset, c, "incorrect lru entry %s\n" " for %s", diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index cd7a9d81dfe8..6209cb51efcb 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -235,8 +235,15 @@ static int bch2_copygc(struct bch_fs *c) } for_each_rw_member(ca, c, dev_idx) { - s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), - ca->mi.nbuckets >> 6); + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + + u64 avail = max_t(s64, 0, + usage.d[BCH_DATA_free].buckets + + usage.d[BCH_DATA_need_discard].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, RESERVE_movinggc)); + + avail = min(avail, ca->mi.nbuckets >> 6); sectors_reserved += avail * ca->mi.bucket_size; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1fe3e81eaa3d..fd0c2a203619 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -713,7 +713,6 @@ static int journal_replay_entry_early(struct bch_fs *c, unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); for (i = 0; i < min_t(unsigned, nr_types, 
BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); @@ -1080,18 +1079,11 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { - bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_new_data_types) { + bch_info(c, "version prior to new_data_types, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { - bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); - c->opts.version_upgrade = true; - c->opts.fsck = true; - } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { - bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); - c->opts.version_upgrade = true; } } @@ -1436,6 +1428,9 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); + for_each_online_member(ca, c, i) + bch2_dev_usage_init(ca); + err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7e885b51349e..c3c7043d7426 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1275,7 +1275,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_dev_usage; u->dev = cpu_to_le32(dev); u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); - u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); for (i = 0; i < BCH_DATA_NR; i++) { u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 3183f49a488f..2c3d0546f2b6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1566,6 +1566,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } + bch2_dev_usage_init(ca); + ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) { bch2_dev_free(ca); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 872d7bed7b6b..c0cc6e9a3e05 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -724,18 +724,17 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) nr[c->open_buckets[i].data_type]++; pr_buf(out, - "\t\t buckets\t sectors fragmented\n" - "capacity%16llu\n", + "\t\t\t buckets\t sectors fragmented\n" + "capacity\t%16llu\n", ca->mi.nbuckets - ca->mi.first_bucket); - for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(out, "%-8s%16llu%16llu%16llu\n", + for (i = 0; i < BCH_DATA_NR; i++) + pr_buf(out, "%-16s%16llu%16llu%16llu\n", bch2_data_types[i], stats.d[i].buckets, stats.d[i].sectors, stats.d[i].fragmented); pr_buf(out, - "ec\t%16llu\n" - "available%15llu\n" + "ec\t\t%16llu\n" "\n" "freelist_wait\t\t%s\n" "open buckets allocated\t%u\n" @@ -746,7 +745,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_user\t%u\n" "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats, RESERVE_none), c->freelist_wait.list.first ? 
"waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, -- cgit From 95752a02cb5d38bc97d76625de2607510ac94e69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 00:15:53 -0400 Subject: bcachefs: Refactor journal_keys_sort() to return an error code When there weren't any keys in the journal there's no need to allocate the buffer - but doing that causes a spurious -ENOMEM. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 55 ++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fd0c2a203619..9120ed26250e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -198,7 +198,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (keys->nr == keys->size) { struct journal_keys new_keys = { .nr = keys->nr, - .size = keys->size * 2, + .size = max(keys->size, 8UL) * 2, .journal_seq_base = keys->journal_seq_base, }; @@ -474,57 +474,57 @@ void bch2_journal_keys_free(struct journal_keys *keys) keys->nr = keys->gap = keys->size = 0; } -static struct journal_keys journal_keys_sort(struct list_head *journal_entries) +static int journal_keys_sort(struct bch_fs *c) { struct journal_replay *i; struct jset_entry *entry; struct bkey_i *k, *_n; - struct journal_keys keys = { NULL }; + struct journal_keys *keys = &c->journal_keys; struct journal_key *src, *dst; size_t nr_keys = 0; - if (list_empty(journal_entries)) - return keys; - - list_for_each_entry(i, journal_entries, list) { + list_for_each_entry(i, &c->journal_entries, list) { if (i->ignore) continue; - if (!keys.journal_seq_base) - keys.journal_seq_base = le64_to_cpu(i->j.seq); + if (!keys->journal_seq_base) + keys->journal_seq_base = le64_to_cpu(i->j.seq); for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } - keys.size = roundup_pow_of_two(nr_keys); + if (!nr_keys) + return 0; + + keys->size = roundup_pow_of_two(nr_keys); - keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); - if (!keys.d) - goto err; + keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); + if (!keys->d) + return -ENOMEM; - list_for_each_entry(i, journal_entries, list) { + list_for_each_entry(i, &c->journal_entries, list) { if (i->ignore) continue; - BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX); for_each_jset_key(k, _n, entry, &i->j) - keys.d[keys.nr++] = (struct journal_key) { + keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, .journal_seq = le64_to_cpu(i->j.seq) - - keys.journal_seq_base, + keys->journal_seq_base, .journal_offset = k->_data - i->j._data, }; } - sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - src = dst = keys.d; - while (src < keys.d + keys.nr) { - while (src + 1 < keys.d + keys.nr && + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) @@ -533,10 +533,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) *dst++ = *src++; } - keys.nr = dst - keys.d; - keys.gap = keys.nr; -err: - return keys; + keys->nr = dst - keys->d; + keys->gap = keys->nr; + return 0; } /* journal replay: */ @@ -1122,11 +1121,9 @@ 
int bch2_fs_recovery(struct bch_fs *c) goto use_clean; } - c->journal_keys = journal_keys_sort(&c->journal_entries); - if (!c->journal_keys.d) { - ret = -ENOMEM; + ret = journal_keys_sort(c); + if (ret) goto err; - } if (c->sb.clean && last_journal_entry) { ret = verify_superblock_clean(c, &clean, -- cgit From ce6201c456571d919e722eec3c17f868f0575b05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Mar 2022 00:15:53 -0400 Subject: bcachefs: Use a genradix for reading journal entries Previously, the journal read path used a linked list for storing the journal entries we read from disk. But there's been a bug that's been causing journal_flush_delay to incorrectly be set to 0, leading to far more journal entries than is normal being written out, which then means filesystems are no longer able to start due to the O(n^2) behaviour of inserting into/searching that linked list. Fix this by switching to a radix tree. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/journal.c | 34 +++++++----- fs/bcachefs/journal.h | 2 +- fs/bcachefs/journal_io.c | 138 +++++++++++++++++++++++++++++------------------ fs/bcachefs/journal_io.h | 3 +- fs/bcachefs/recovery.c | 68 ++++++++++++----------- fs/bcachefs/recovery.h | 2 +- fs/bcachefs/super.c | 3 +- 8 files changed, 151 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f2bb23162b4a..43e921b91d85 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -894,7 +894,8 @@ mempool_t bio_bounce_pages; mempool_t btree_bounce_pool; struct journal journal; - struct list_head journal_entries; + GENRADIX(struct journal_replay *) journal_entries; + u64 journal_entries_base_seq; struct journal_keys journal_keys; struct list_head journal_iters; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f87f76553bf4..f8b57de31d93 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1039,17 +1039,25 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->write_work); } -int bch2_fs_journal_start(struct journal *j, u64 cur_seq, - struct list_head *journal_entries) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; - struct journal_replay *i; + struct journal_replay *i, **_i; + struct genradix_iter iter; + bool had_entries = false; + unsigned ptr; u64 last_seq = cur_seq, nr, seq; - if (!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); + genradix_for_each_reverse(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + last_seq = le64_to_cpu(i->j.last_seq); + break; + } nr = cur_seq - last_seq; @@ -1071,14 +1079,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); - if (list_empty(journal_entries)) - j->last_empty_seq = cur_seq - 1; - fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); - list_for_each_entry(i, journal_entries, list) { - unsigned ptr; + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); @@ -1094,9 +1102,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, p->devs.nr = 0; for (ptr = 0; ptr < i->nr_ptrs; ptr++) bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + + had_entries = true; } - if 
(list_empty(journal_entries)) + if (!had_entries) j->last_empty_seq = cur_seq; spin_lock(&j->lock); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index c287ecf643aa..59453dcfa4e9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -510,7 +510,7 @@ int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, struct list_head *); +int bch2_fs_journal_start(struct journal *, u64); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fad142196daa..c84c0b840906 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -16,12 +16,22 @@ #include "replicas.h" #include "trace.h" -static void __journal_replay_free(struct journal_replay *i) +static inline u32 journal_entry_radix_idx(struct bch_fs *c, + struct jset *j) { - list_del(&i->list); + return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1); +} + +static void __journal_replay_free(struct bch_fs *c, + struct journal_replay *i) +{ + struct journal_replay **p = + genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j)); + + BUG_ON(*p != i); + *p = NULL; kvpfree(i, offsetof(struct journal_replay, j) + vstruct_bytes(&i->j)); - } static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) @@ -29,13 +39,12 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) i->ignore = true; if (!c->opts.read_entire_journal) - __journal_replay_free(i); + __journal_replay_free(c, i); } struct journal_list { struct closure cl; struct mutex lock; - struct list_head *head; int ret; }; @@ -51,19 +60,30 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_list *jlist, struct jset *j, bool bad) { - struct journal_replay *i, *pos, *dup = NULL; + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; struct journal_ptr *ptr; - struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; int ret = JOURNAL_ENTRY_ADD_OK; + /* + * Xarrays are indexed by a ulong, not a u64, so we can't index them by + * sequence number directly: + * Assume instead that they will all fall within the range of +-2billion + * of the filrst one we find. + */ + if (!c->journal_entries_base_seq) + c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); + +#if 0 list_for_each_entry_reverse(i, jlist->head, list) { if (!JSET_NO_FLUSH(&i->j)) { last_seq = le64_to_cpu(i->j.last_seq); break; } } +#endif /* Is this entry older than the range we need? */ if (!c->opts.read_entire_journal && @@ -73,29 +93,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } /* Drop entries we don't need anymore */ - if (!JSET_NO_FLUSH(j)) { - list_for_each_entry_safe(i, pos, jlist->head, list) { + if (!JSET_NO_FLUSH(j) && !c->opts.read_entire_journal) { + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i) + continue; + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; journal_replay_free(c, i); } } - list_for_each_entry_reverse(i, jlist->head, list) { - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } - } - - where = jlist->head; -add: - dup = where->next != jlist->head - ? 
container_of(where->next, struct journal_replay, list) - : NULL; - - if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) - dup = NULL; + _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j)); + dup = _i ? *_i : NULL; /* * Duplicate journal entries? If so we want the one that didn't have a @@ -131,10 +143,19 @@ add: if (dup) { i->nr_ptrs = dup->nr_ptrs; memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); - __journal_replay_free(dup); + __journal_replay_free(c, dup); } - list_add(&i->list, where); + _i = genradix_ptr_alloc(&c->journal_entries, + journal_entry_radix_idx(c, &i->j), + GFP_KERNEL); + if (!_i) { + bch_err(c, "failed to allocate c->journal_entries entry"); + ret = -ENOMEM; + goto out; + } + + *_i = i; found: for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { if (ptr->dev == ca->dev_idx) { @@ -913,7 +934,8 @@ static void bch2_journal_read_device(struct closure *cl) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct journal_replay *r; + struct journal_replay *r, **_r; + struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; @@ -956,7 +978,12 @@ static void bch2_journal_read_device(struct closure *cl) ja->sectors_free = ca->mi.bucket_size; mutex_lock(&jlist->lock); - list_for_each_entry(r, jlist->head, list) { + genradix_for_each(&c->journal_entries, iter, _r) { + r = *_r; + + if (!r) + continue; + for (i = 0; i < r->nr_ptrs; i++) { if (r->ptrs[i].dev == ca->dev_idx && sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { @@ -1022,11 +1049,11 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_journal_read(struct bch_fs *c, struct list_head *list, - u64 *blacklist_seq, u64 *start_seq) +int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, *t; + struct journal_replay *i, **_i, *prev = NULL; + struct genradix_iter radix_iter; struct bch_dev *ca; unsigned iter; struct printbuf buf = PRINTBUF; @@ -1037,7 +1064,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); - jlist.head = list; jlist.ret = 0; for_each_member_device(ca, c, iter) { @@ -1061,22 +1087,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (jlist.ret) return jlist.ret; - if (list_empty(list)) { - bch_info(c, "journal read done, but no entries found"); - return 0; - } - - i = list_last_entry(list, struct journal_replay, list); - *start_seq = le64_to_cpu(i->j.seq) + 1; + *start_seq = 0; /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: */ - list_for_each_entry_safe_reverse(i, t, list, list) { - if (i->ignore) + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; + if (!*start_seq) + *start_seq = le64_to_cpu(i->j.seq) + 1; + if (!JSET_NO_FLUSH(&i->j)) { last_seq = le64_to_cpu(i->j.last_seq); *blacklist_seq = le64_to_cpu(i->j.seq) + 1; @@ -1086,6 +1111,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, journal_replay_free(c, i); } + if (!*start_seq) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + if (!last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); ret = -1; @@ -1093,8 +1123,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, } /* Drop 
blacklisted entries and entries older than last_seq: */ - list_for_each_entry_safe(i, t, list, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; seq = le64_to_cpu(i->j.seq); @@ -1113,8 +1145,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, /* Check for missing entries: */ seq = last_seq; - list_for_each_entry(i, list, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); @@ -1136,11 +1170,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !bch2_journal_seq_is_blacklisted(c, seq, false)) seq++; - if (i->list.prev != list) { - struct journal_replay *p = list_prev_entry(i, list); - - bch2_journal_ptrs_to_text(&buf1, c, p); - pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + if (prev) { + bch2_journal_ptrs_to_text(&buf1, c, prev); + pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); } else pr_buf(&buf1, "(none)"); bch2_journal_ptrs_to_text(&buf2, c, i); @@ -1157,10 +1189,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, printbuf_exit(&buf2); } + prev = i; seq++; } - list_for_each_entry(i, list, list) { + genradix_for_each(&c->journal_entries, radix_iter, _i) { struct jset_entry *entry; struct bkey_i *k, *_n; struct bch_replicas_padded replicas = { @@ -1169,7 +1202,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, }; unsigned ptr; - if (i->ignore) + i = *_i; + if (!i || i->ignore) continue; ret = jset_validate_entries(c, &i->j, READ); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index f2001835e43e..30e995c81fc4 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -7,7 +7,6 @@ * during cache_registration */ struct journal_replay { - struct list_head list; struct journal_ptr { u8 dev; u32 bucket; @@ -53,7 +52,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, struct journal_replay *); -int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); +int bch2_journal_read(struct bch_fs *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9120ed26250e..07ce6a540856 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -433,16 +433,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i /* sort and dedup all keys in the journal: */ -void bch2_journal_entries_free(struct list_head *list) +void bch2_journal_entries_free(struct bch_fs *c) { - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); } /* @@ -476,15 +476,18 @@ void bch2_journal_keys_free(struct journal_keys *keys) static int journal_keys_sort(struct bch_fs *c) { - struct journal_replay *i; + struct genradix_iter iter; + struct journal_replay *i, **_i; struct jset_entry *entry; struct bkey_i *k, *_n; struct journal_keys *keys = &c->journal_keys; struct journal_key *src, *dst; size_t nr_keys = 0; 
- list_for_each_entry(i, &c->journal_entries, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; if (!keys->journal_seq_base) @@ -503,8 +506,10 @@ static int journal_keys_sort(struct bch_fs *c) if (!keys->d) return -ENOMEM; - list_for_each_entry(i, &c->journal_entries, list) { - if (i->ignore) + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX); @@ -751,10 +756,8 @@ static int journal_replay_entry_early(struct bch_fs *c, } static int journal_replay_early(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct list_head *journal) + struct bch_sb_field_clean *clean) { - struct journal_replay *i; struct jset_entry *entry; int ret; @@ -767,8 +770,13 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - list_for_each_entry(i, journal, list) { - if (i->ignore) + struct genradix_iter iter; + struct journal_replay *i, **_i; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) continue; vstruct_for_each(&i->j, entry) { @@ -1093,17 +1101,17 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct journal_replay *i; + struct genradix_iter iter; + struct journal_replay **i; bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &c->journal_entries, - &blacklist_seq, &journal_seq); + ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); if (ret) goto err; - list_for_each_entry_reverse(i, &c->journal_entries, list) - if (!i->ignore) { - last_journal_entry = &i->j; + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i && !(*i)->ignore) { + last_journal_entry = &(*i)->j; break; } @@ -1152,7 +1160,7 @@ use_clean: zero_out_btree_mem_ptr(&c->journal_keys); - ret = journal_replay_early(c, clean, &c->journal_entries); + ret = journal_replay_early(c, clean); if (ret) goto err; @@ -1175,8 +1183,7 @@ use_clean: } } - ret = bch2_fs_journal_start(&c->journal, journal_seq, - &c->journal_entries); + ret = bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; @@ -1380,7 +1387,7 @@ out: if (!c->opts.keep_journal) { bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(&c->journal_entries); + bch2_journal_entries_free(c); } kfree(clean); if (ret) @@ -1401,7 +1408,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct bch_dev *ca; - LIST_HEAD(journal); unsigned i; int ret; @@ -1441,7 +1447,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal, 1, &journal); + bch2_fs_journal_start(&c->journal, 1); bch2_journal_set_replay_done(&c->journal); err = "error going read-write"; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 30580a8984a1..ab8b116ac7db 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -55,7 +55,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_free(struct journal_keys *); -void bch2_journal_entries_free(struct list_head *); +void bch2_journal_entries_free(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 
2c3d0546f2b6..689c82f6cb5d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -457,7 +457,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(&c->journal_entries); + bch2_journal_entries_free(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); @@ -676,7 +676,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); -- cgit From 84c72755b9aab31ed43e50eb5c7229d7ef042f7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Apr 2022 01:23:50 -0400 Subject: bcachefs: Initialize ec work structs early We need to ensure that work structs in bch_fs always get initialized - otherwise an error in filesystem initialization can pop a warning in the workqueue code when we try to cancel a work struct that wasn't initialized. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 5 ++++- fs/bcachefs/ec.h | 1 + fs/bcachefs/super.c | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7a524f604875..ae33d3ea8ec1 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1682,11 +1682,14 @@ void bch2_fs_ec_exit(struct bch_fs *c) bioset_exit(&c->ec_bioset); } -int bch2_fs_ec_init(struct bch_fs *c) +void bch2_fs_ec_init_early(struct bch_fs *c) { INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); +} +int bch2_fs_ec_init(struct bch_fs *c) +{ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index af7f8eee94b0..a4c13d61af10 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -224,6 +224,7 @@ void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); +void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); #endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 689c82f6cb5d..d2776efa9985 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -663,6 +663,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); + bch2_fs_ec_init_early(c); INIT_LIST_HEAD(&c->list); -- cgit From 7c4ca54ae68c4ae24dbfb8b209657a5249a5f0b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Apr 2022 23:54:14 -0400 Subject: bcachefs: Don't skip triggers in fcollapse() With backpointers this doesn't work anymore - backpointers always need to be updated to point to the new extent position. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 14550ac610c6..85a4484bec65 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2894,13 +2894,7 @@ reassemble: next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; - if (copy.k->k.size == k.k->size) { - /* - * If we're moving the entire extent, we can skip - * running triggers: - */ - trigger_flags |= BTREE_TRIGGER_NORUN; - } else { + if (copy.k->k.size != k.k->size) { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); -- cgit From aae29082c63a4bfb7b6be5bc22b4727b7da14a7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Apr 2022 15:07:11 -0400 Subject: bcachefs: bch2_btree_delete_extent_at() New helper, for deleting extents. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 18 ++---------------- fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 11 +++++++++-- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3feaac33aaff..6110d4ce4e5f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -706,7 +706,6 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct bch_alloc_v4 a; u64 genbits; struct bpos pos; - struct bkey_i *update; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; @@ -756,21 +755,8 @@ fsck_err: printbuf_exit(&buf); return ret; delete: - if (iter->btree_id == BTREE_ID_freespace) { - /* should probably add a helper for deleting extents */ - update = bch2_trans_kmalloc(trans, sizeof(*update)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.p = iter->pos; - bch2_key_resize(&update->k, 1); - - ret = bch2_trans_update(trans, iter, update, 0); - } else { - ret = bch2_btree_delete_at(trans, iter, 0); - } + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0); goto out; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index ad13b0739a68..a40f3460fd62 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -51,6 +51,8 @@ enum btree_insert_flags { #define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) #define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) +int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, + unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c502e96748d8..449fbae585cb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1697,8 +1697,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) { struct bkey_i *k; @@ -1708,9 +1708,16 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k->k); k->k.p = iter->pos; + bch2_key_resize(&k->k, len); return bch2_trans_update(trans, iter, k, update_flags); } +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, -- cgit From 502f973dba660ed04f295e5ba129f2d369cc1aa6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Apr 2022 20:29:26 -0400 Subject: bcachefs: Fix a few warnings on 32 bit These showed up when building for mips. 
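To illustrate the class of warning being fixed, here is a standalone sketch (not part of the patch): sizeof() yields size_t, which is 'unsigned int' on common 32-bit ABIs but 'unsigned long' on 64-bit ones, so printf format strings and the type-checked kernel min()/max() macros written against the 64-bit type stop matching on a 32-bit build. Pinning the expression to a single type - a cast to unsigned, or max_t(size_t, ...) - fixes both builds.

    /* Standalone illustration only, not from the patch. */
    #include <stdio.h>
    #include <stddef.h>

    /* modelled on the BCH_NAME_MAX change: cast the whole constant, print with %u */
    #define DEMO_NAME_MAX ((unsigned) (255 * sizeof(unsigned long long) - 40))

    int main(void)
    {
            size_t size = 4;

            /* open-coded equivalent of max_t(size_t, size, 8) * 2 */
            size_t new_size = (size > (size_t) 8 ? size : (size_t) 8) * 2;

            /* %u matches the casted macro, %zu matches size_t on every ABI */
            printf("name max %u, new size %zu\n", DEMO_NAME_MAX, new_size);
            return 0;
    }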
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 ++--- fs/bcachefs/btree_key_cache.c | 4 ++-- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/recovery.c | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a84a8e088953..969507c42c55 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -851,10 +851,9 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ +#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ - offsetof(struct bch_dirent, d_name)) - + offsetof(struct bch_dirent, d_name))) /* Xattrs */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f856dee0c3aa..f86d57d1ace0 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -742,8 +742,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); - pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); + pr_buf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); + pr_buf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 281959885bb0..716c85062cea 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -108,7 +108,7 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, } if (len > BCH_NAME_MAX) { - pr_buf(err, "dirent name too big (%u > %lu)", + pr_buf(err, "dirent name too big (%u > %u)", len, BCH_NAME_MAX); return -EINVAL; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c84c0b840906..64341da75963 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -987,7 +987,7 @@ static void bch2_journal_read_device(struct closure *cl) for (i = 0; i < r->nr_ptrs; i++) { if (r->ptrs[i].dev == ca->dev_idx && sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { - unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + + unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + vstruct_sectors(&r->j, c->block_bits); ja->sectors_free = min(ja->sectors_free, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 07ce6a540856..9269cb686e4d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -198,7 +198,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (keys->nr == keys->size) { struct journal_keys new_keys = { .nr = keys->nr, - .size = max(keys->size, 8UL) * 2, + .size = max_t(size_t, keys->size, 8) * 2, .journal_seq_base = keys->journal_seq_base, }; -- cgit From a9c0a4cbf1ceb9842fee5d7084817509a5e962aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Apr 2022 15:15:36 -0400 Subject: bcachefs: Minor device removal fixes - We weren't clearing the LRU btree - bch2_alloc_read() runs before bch2_check_alloc_key() deletes alloc keys for devices/buckets that don't exists, so it needs to check for that - bch2_check_lrus() needs to check that buckets exists - improve some error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 +++++++++++---- fs/bcachefs/lru.c | 13 +++++++++---- fs/bcachefs/super.c | 20 +++++++++++++------- 3 files changed, 33 
insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6110d4ce4e5f..108b98c9fe45 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -446,6 +446,13 @@ int bch2_alloc_read(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + ca = bch_dev_bkey_exists(c, k.k->p.inode); bch2_alloc_to_v4(k, &a); @@ -614,7 +621,8 @@ static int bch2_check_alloc_key(struct btree_trans *trans, return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, - "alloc key for invalid device or bucket")) + "alloc key for invalid device:bucket %llu:%llu", + alloc_k.k->p.inode, alloc_k.k->p.offset)) return bch2_btree_delete_at(trans, alloc_iter, 0); ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); @@ -727,9 +735,8 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, - "%llu:%llu set in %s btree but device or bucket does not exist", - pos.inode, pos.offset, - bch2_btree_ids[iter->btree_id])) + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) goto delete; k = bch2_btree_iter_peek_slot(&alloc_iter); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 267f2f8fb13b..ef4b4a9f0d5f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -133,7 +133,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, struct bch_alloc_v4 a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - u64 idx; + struct bpos alloc_pos; int ret; lru_k = bch2_btree_iter_peek(lru_iter); @@ -144,10 +144,15 @@ static int bch2_check_lru_key(struct btree_trans *trans, if (ret) return ret; - idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); + alloc_pos = POS(lru_k.k->p.inode, + le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(lru_k.k->p.inode, idx), 0); + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + "lru key points to nonexistent device:bucket %llu:%llu", + alloc_pos.inode, alloc_pos.offset)) + return bch2_btree_delete_at(trans, lru_iter, 0); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d2776efa9985..e4ccdc966fdb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1423,11 +1423,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, BTREE_TRIGGER_NORUN, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) bch_err(c, "error %i removing dev alloc 
info", ret); @@ -1462,19 +1468,19 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + ret = bch2_dev_remove_alloc(c, ca); if (ret) { - bch_err(ca, "Remove failed: error %i flushing journal", ret); + bch_err(ca, "Remove failed, error deleting alloc info"); goto err; } - ret = bch2_dev_remove_alloc(c, ca); + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed, error deleting alloc info"); + bch_err(ca, "Remove failed: error %i flushing journal", ret); goto err; } - ret = bch2_journal_error(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; -- cgit From cf0dd697ebc9090d0aeafa933e0e688e291a24fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 00:48:36 -0400 Subject: bcachefs: Don't trigger extra assertions in journal replay We now pass a rw argument to .key_invalid methods so they can trigger assertions for updates but not on existing keys. We shouldn't trigger these extra assertions in journal replay - this patch changes the transaction commit path accordingly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index ba1d775039a3..f847928ab743 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -767,9 +767,9 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, bool updated_range, int rw, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), rw, err) ?: + return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, rw, err) : 0); + (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 449fbae585cb..bc7faf29b3bc 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -864,10 +864,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_insert_entry *i; struct printbuf buf = PRINTBUF; int ret, u64s_delta = 0; + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; trans_for_each_update(trans, i) { if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, WRITE, &buf)) { + i->bkey_type, rw, &buf)) { printbuf_reset(&buf); pr_buf(&buf, "invalid bkey on insert from %s -> %ps", trans->fn, (void *) i->ip_allocated); @@ -878,7 +879,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, pr_newline(&buf); bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, WRITE, &buf); + i->bkey_type, rw, &buf); bch2_trans_inconsistent(trans, "%s", buf.buf); printbuf_exit(&buf); -- cgit From 6e811bbbc2ea3b20854c45fa29811640881bb534 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 14:36:10 -0400 Subject: bcachefs: Fix a null ptr deref We start doing allocations before the GC thread is created, which means we need to check for that to avoid a null ptr deref. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 8de54005e4ea..95d803b5743d 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -105,7 +105,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) static inline void bch2_do_gc_gens(struct bch_fs *c) { atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); + if (c->gc_thread) + wake_up_process(c->gc_thread); } #endif /* _BCACHEFS_BTREE_GC_H */ -- cgit From ec7ccbde6baa50760c2679a5c54b2fccfd5ca64e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 16:26:34 -0400 Subject: bcachefs: Fix CPU usage in journal read path In journal_entry_add(), we were repeatedly scanning the journal entries radix tree to scan for old entries that can be freed, with O(n^2) behaviour. This patch tweaks things to remember the previous last_seq, so we don't have to scan for entries to free from the start. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 73 ++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 64341da75963..9e43914ebd6a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -16,17 +16,17 @@ #include "replicas.h" #include "trace.h" -static inline u32 journal_entry_radix_idx(struct bch_fs *c, - struct jset *j) +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) { - return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1); + return (seq - c->journal_entries_base_seq) & (~0U >> 1); } static void __journal_replay_free(struct bch_fs *c, struct journal_replay *i) { struct journal_replay **p = - genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j)); + genradix_ptr(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); BUG_ON(*p != i); *p = NULL; @@ -44,6 +44,7 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) struct journal_list { struct closure cl; + u64 last_seq; struct mutex lock; int ret; }; @@ -64,55 +65,50 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_replay **_i, *i, *dup; struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); - u64 last_seq = 0; + u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; int ret = JOURNAL_ENTRY_ADD_OK; + /* Is this entry older than the range we need? */ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < jlist->last_seq) + return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + /* - * Xarrays are indexed by a ulong, not a u64, so we can't index them by - * sequence number directly: - * Assume instead that they will all fall within the range of +-2billion - * of the filrst one we find. + * genradixes are indexed by a ulong, not a u64, so we can't index them + * by sequence number directly: Assume instead that they will all fall + * within the range of +-2billion of the filrst one we find. */ if (!c->journal_entries_base_seq) c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); -#if 0 - list_for_each_entry_reverse(i, jlist->head, list) { - if (!JSET_NO_FLUSH(&i->j)) { - last_seq = le64_to_cpu(i->j.last_seq); - break; - } - } -#endif - - /* Is this entry older than the range we need? 
*/ - if (!c->opts.read_entire_journal && - le64_to_cpu(j->seq) < last_seq) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } - /* Drop entries we don't need anymore */ - if (!JSET_NO_FLUSH(j) && !c->opts.read_entire_journal) { - genradix_for_each(&c->journal_entries, iter, _i) { + if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { + genradix_for_each_from(&c->journal_entries, iter, _i, + journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i) + if (!i || i->ignore) continue; - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + if (le64_to_cpu(i->j.seq) >= last_seq) break; journal_replay_free(c, i); } } - _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j)); - dup = _i ? *_i : NULL; + jlist->last_seq = max(jlist->last_seq, last_seq); + + _i = genradix_ptr_alloc(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(j->seq)), + GFP_KERNEL); + if (!_i) + return -ENOMEM; /* * Duplicate journal entries? If so we want the one that didn't have a * checksum error: */ + dup = *_i; if (dup) { if (dup->bad) { /* we'll replace @dup: */ @@ -130,10 +126,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) { - ret = -ENOMEM; - goto out; - } + if (!i) + return -ENOMEM; i->nr_ptrs = 0; i->bad = bad; @@ -146,14 +140,6 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, __journal_replay_free(c, dup); } - _i = genradix_ptr_alloc(&c->journal_entries, - journal_entry_radix_idx(c, &i->j), - GFP_KERNEL); - if (!_i) { - bch_err(c, "failed to allocate c->journal_entries entry"); - ret = -ENOMEM; - goto out; - } *_i = i; found: @@ -1064,6 +1050,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); + jlist.last_seq = 0; jlist.ret = 0; for_each_member_device(ca, c, iter) { -- cgit From 3518e6faeff20d1de1f0c7388d9d9c6f2fe7f5a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 18:04:24 -0400 Subject: bcachefs: Improve bch2_open_buckets_to_text() This patch updates bch2_open_buckets_to_text() to include the device and bucket the open_bucket owns. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 14162dd4d696..d871e1f11f29 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1381,14 +1381,14 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s\n", + pr_buf(out, "%zu ref %u type %s %u:%llu:%u\n", ob - c->open_buckets, atomic_read(&ob->pin), - bch2_data_types[ob->data_type]); + bch2_data_types[ob->data_type], + ob->dev, ob->bucket, ob->gen); } spin_unlock(&ob->lock); } - } static const char * const bch2_write_point_states[] = { -- cgit From 84befe8ef9a07be1cd9bac4e1a1c66c667f71499 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 18:12:04 -0400 Subject: bcachefs: Use bch2_trans_inconsistent_on() in more places This gets us better error messages. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++------ fs/bcachefs/lru.c | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 108b98c9fe45..3be6f0fa89de 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -520,7 +520,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, goto err; if (ca->mi.freespace_initialized && - bch2_fs_inconsistent_on(old.k->type != old_type, c, + bch2_trans_inconsistent_on(old.k->type != old_type, trans, "incorrect key when %s %s btree (got %s should be %s)\n" " for %s", set ? "setting" : "clearing", @@ -950,7 +950,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, goto write; } - if (bch2_fs_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, c, + if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" "%s", a->v.journal_seq, @@ -960,7 +960,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, goto out; } - if (bch2_fs_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, c, + if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, "bucket incorrectly set in need_discard btree\n" "%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1089,8 +1089,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (!k.k || k.k->p.inode != ca->dev_idx) goto out; - if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, - "non lru key in lru btree")) + if (bch2_trans_inconsistent_on(k.k->type != KEY_TYPE_lru, trans, + "non lru key in lru btree")) goto out; idx = k.k->p.offset; @@ -1102,7 +1102,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (ret) goto out; - if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, + if (bch2_trans_inconsistent_on(idx != alloc_lru_idx(a->v), trans, "invalidating bucket with wrong lru idx (got %llu should be %llu", idx, alloc_lru_idx(a->v))) goto out; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ef4b4a9f0d5f..d8180bc1c6b1 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -32,7 +32,6 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; u64 existing_idx; @@ -51,7 +50,7 @@ static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) goto err; if (k.k->type != KEY_TYPE_lru) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "pointer to nonexistent lru %llu:%llu", id, time); ret = -EIO; @@ -60,7 +59,7 @@ static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); if (existing_idx != idx) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", id, time, existing_idx, idx); ret = -EIO; -- cgit From 7003589dabcdfd10345ede31044ce5e13ee65e7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Apr 2022 19:59:26 -0400 Subject: bcachefs: Ensure buckets have io_time[READ] set It's an error if a bucket is in state BCH_DATA_cached but not on the LRU btree - i.e io_time[READ] == 0 - so, make sure it's set before adding it. 
Also, make some of the LRU code a bit clearer and more direct. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 ++++++++++----- fs/bcachefs/btree_gc.c | 7 +++++++ fs/bcachefs/lru.c | 8 ++++---- fs/bcachefs/lru.h | 2 ++ 4 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3be6f0fa89de..17b147d15320 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -583,6 +583,11 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, return ret; } + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + old_lru = alloc_lru_idx(old_a); new_lru = alloc_lru_idx(*new_a); @@ -592,7 +597,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, if (ret) return ret; - if (new_lru && new_a->io_time[READ] != new_lru) + if (new_a->data_type == BCH_DATA_cached) new_a->io_time[READ] = new_lru; } @@ -869,10 +874,10 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (!a.io_time[READ]) a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - ret = bch2_lru_change(trans, - alloc_k.k->p.inode, - alloc_k.k->p.offset, - 0, &a.io_time[READ]); + ret = bch2_lru_set(trans, + alloc_k.k->p.inode, + alloc_k.k->p.offset, + &a.io_time[READ]); if (ret) goto err; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0b1717120cc3..5199f0240fcd 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1405,6 +1405,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; + /* + * The trigger normally makes sure this is set, but we're not running + * triggers: + */ + if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) + a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); fsck_err: return ret; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index d8180bc1c6b1..49a0f0d69664 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -30,7 +30,7 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); } -static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) +int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) { struct btree_iter iter; struct bkey_s_c k; @@ -72,7 +72,7 @@ err: return ret; } -static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) { struct btree_iter iter; struct bkey_s_c k; @@ -119,8 +119,8 @@ int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, if (old_time == *new_time) return 0; - return lru_delete(trans, id, idx, old_time) ?: - lru_set(trans, id, idx, new_time); + return bch2_lru_delete(trans, id, idx, old_time) ?: + bch2_lru_set(trans, id, idx, new_time); } static int bch2_check_lru_key(struct btree_trans *trans, diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index e8f508174b0a..0a01836c07c1 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -10,6 +10,8 @@ void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_lru_to_text, \ } +int bch2_lru_delete(struct btree_trans *, u64, u64, u64); +int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); int bch2_check_lrus(struct bch_fs *, bool); -- 
cgit From 9b93596c33f6c23de96c05dce82b9aead271a286 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Apr 2022 17:23:39 -0400 Subject: bcachefs: Improve error message when alloc key doesn't match lru entry Error messages should always print out the full key when available - this gives us a starting point when looking through the journal to debug what went wrong. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 17b147d15320..1e6283b55e3b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1082,6 +1082,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) struct bkey_s_c k; struct bkey_i_alloc_v4 *a; u64 bucket, idx; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, @@ -1107,10 +1108,16 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (ret) goto out; - if (bch2_trans_inconsistent_on(idx != alloc_lru_idx(a->v), trans, - "invalidating bucket with wrong lru idx (got %llu should be %llu", - idx, alloc_lru_idx(a->v))) + if (idx != alloc_lru_idx(a->v)) { + pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; goto out; + } SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); a->v.gen++; @@ -1125,6 +1132,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) out: bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); return ret; } -- cgit From 5650bb46be89a1254609d47e4c87d1e9cf9121fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Apr 2022 20:28:13 -0400 Subject: bcachefs: Introduce bch2_journal_keys_peek_(upto|slot)() When many journal replay keys have been overwritten, bch2_journal_keys_peek() was taking excessively long to scan before it found a key to return. Fix this by introducing bch2_journal_keys_peek_upto() which takes a parameter for the end of the range we want, so that we can terminate the search much sooner, and replace all uses of bch2_journal_keys_peek() with peek_upto() or peek_slot(). 
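As a simplified standalone model of the idea (not bcachefs code): bound the scan by the end position the caller cares about, so a long run of overwritten keys beyond that range is never walked.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* keys sorted by position, some marked overwritten */
    struct demo_key {
            uint64_t pos;
            bool     overwritten;
    };

    /* first index whose position is >= pos (plain binary search) */
    size_t demo_first_at_or_after(const struct demo_key *keys, size_t nr, uint64_t pos)
    {
            size_t l = 0, r = nr;

            while (l < r) {
                    size_t m = l + (r - l) / 2;

                    if (keys[m].pos < pos)
                            l = m + 1;
                    else
                            r = m;
            }
            return l;
    }

    /* return the first live key in [pos, end_pos], or NULL if there is none */
    const struct demo_key *demo_peek_upto(const struct demo_key *keys, size_t nr,
                                          uint64_t pos, uint64_t end_pos)
    {
            for (size_t i = demo_first_at_or_after(keys, nr, pos);
                 i < nr && keys[i].pos <= end_pos;
                 i++)
                    if (!keys[i].overwritten)
                            return &keys[i];

            return NULL;    /* range exhausted: the caller can stop immediately */
    }

The real helpers additionally match btree_id and level and step over the gap in the keys array, but the bounded termination condition is the point of the change.
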
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 15 +++++++-------- fs/bcachefs/btree_update_leaf.c | 9 +++++---- fs/bcachefs/recovery.c | 22 +++++++++++++++------- fs/bcachefs/recovery.h | 6 ++++-- 4 files changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1c0560ecd120..80ea97e21cf3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2225,10 +2225,10 @@ static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_journal_keys_peek(trans->c, iter->btree_id, 0, - iter->path->pos); + struct bkey_i *k = bch2_journal_keys_peek_slot(trans->c, iter->btree_id, + 0, iter->path->pos); - if (k && !bpos_cmp(k->k.p, iter->path->pos)) { + if (k) { iter->k = k->k; return bkey_i_to_s_c(k); } else { @@ -2242,12 +2242,11 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - bch2_journal_keys_peek(trans->c, iter->btree_id, 0, - iter->path->pos); + bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, + iter->path->pos, + k.k ? k.k->p : iter->path->l[0].b->key.k.p); - if (next_journal && - bpos_cmp(next_journal->k.p, - k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + if (next_journal) { iter->k = next_journal->k; k = bkey_i_to_s_c(next_journal); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index bc7faf29b3bc..a63c2f36bae4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -670,9 +670,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { struct bkey_i *j_k = - bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); - if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + if (j_k) { i->old_k = j_k->k; i->old_v = &j_k->v; } @@ -1550,9 +1551,9 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = - bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + if (j_k) { i->old_k = j_k->k; i->old_v = &j_k->v; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9269cb686e4d..16ba5d24a86d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -109,26 +109,34 @@ size_t bch2_journal_key_search(struct journal_keys *keys, return idx_to_pos(keys, l); } -struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos) { struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, btree_id, level, pos); while (idx < keys->size && - keys->d[idx].overwritten) { + keys->d[idx].btree_id == btree_id && + keys->d[idx].level == level && + bpos_cmp(keys->d[idx].k->k.p, end_pos) <= 0) { + if (!keys->d[idx].overwritten) + return keys->d[idx].k; + idx++; if (idx == keys->gap) idx += keys->size - keys->nr; } - if (idx < keys->size && - keys->d[idx].btree_id == btree_id && - keys->d[idx].level == level) - return keys->d[idx].k; return NULL; } +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, 
enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos); +} + static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index ab8b116ac7db..e05aac64185d 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -30,8 +30,10 @@ struct btree_and_journal_iter { size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, unsigned, struct bpos); -struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, - unsigned, struct bpos pos); +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); -- cgit From 0b090326535c8fe5a1da6ca3d7bd4a3fa9dfd6c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Apr 2022 20:57:01 -0400 Subject: bcachefs: Improve bch2_lru_delete() error messages When we detect a filesystem inconsistency, we should include the relevent keys in the error message. This patch adds a parameter to pass the key with the lru entry to bch2_lru_delete(), so that it can be printed. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/lru.c | 20 +++++++++++++------- fs/bcachefs/lru.h | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1e6283b55e3b..ce8803658369 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -593,7 +593,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, if (old_lru != new_lru) { ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, - old_lru, &new_lru); + old_lru, &new_lru, old); if (ret) return ret; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 49a0f0d69664..fe9d15742947 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -30,11 +30,13 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); } -int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) +int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, + struct bkey_s_c orig_k) { struct btree_iter iter; struct bkey_s_c k; u64 existing_idx; + struct printbuf buf = PRINTBUF; int ret = 0; if (!time) @@ -50,18 +52,20 @@ int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) goto err; if (k.k->type != KEY_TYPE_lru) { + bch2_bkey_val_to_text(&buf, trans->c, orig_k); bch2_trans_inconsistent(trans, - "pointer to nonexistent lru %llu:%llu", - id, time); + "pointer to nonexistent lru %llu:%llu\n%s", + id, time, buf.buf); ret = -EIO; goto err; } existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); if (existing_idx != idx) { + bch2_bkey_val_to_text(&buf, trans->c, orig_k); bch2_trans_inconsistent(trans, - "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", - id, time, existing_idx, idx); + "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", + id, time, existing_idx, idx, buf.buf); ret = -EIO; goto err; } @@ -69,6 +73,7 @@ int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } @@ -114,12 
+119,13 @@ err: } int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, - u64 old_time, u64 *new_time) + u64 old_time, u64 *new_time, + struct bkey_s_c k) { if (old_time == *new_time) return 0; - return bch2_lru_delete(trans, id, idx, old_time) ?: + return bch2_lru_delete(trans, id, idx, old_time, k) ?: bch2_lru_set(trans, id, idx, new_time); } diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 0a01836c07c1..bfe38a67e585 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -10,9 +10,9 @@ void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_lru_to_text, \ } -int bch2_lru_delete(struct btree_trans *, u64, u64, u64); +int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); -int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); +int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); int bch2_check_lrus(struct bch_fs *, bool); -- cgit From e296b1f9cadfc4ee7ebe5933fb98497263ce9999 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Apr 2022 22:36:13 -0400 Subject: bcachefs: Fix inode_backpointer_exists() If the dirent an inode points to doesn't exist, we shouldn't be returning an error - just 0/false. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a89b0694e50..ccbf3ac4b1d6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1092,7 +1092,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ret = bkey_err(d.s_c); if (ret) - return ret; + return ret == -ENOENT ? 0 : ret; ret = dirent_points_to_inode(d, inode); bch2_trans_iter_exit(trans, &iter); -- cgit From e492e7b6f64fe128b83e165ef82f7d4b9fcc12cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Apr 2022 22:36:53 -0400 Subject: bcachefs: Improve error logging in fsck.c This adds error logging to a bunch of functions in fsck.c - in fsck, reduntant error messages is probably better than not enough. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ccbf3ac4b1d6..abdcff172aec 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -304,7 +304,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) - return ret; + goto err; dir_hash_info = bch2_hash_info_init(c, &dir_inode); @@ -313,6 +313,9 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, 0); bch2_trans_iter_exit(trans, &iter); +err: + if (ret && ret != -EINTR) + bch_err(c, "error %i from __remove_dirent()", ret); return ret; } @@ -799,8 +802,10 @@ static int check_inode(struct btree_trans *trans, return ret; ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; if (ret) - return ret < 0 ? 
ret : 0; + return 0; /* * if snapshot id isn't a leaf node, skip it - deletion in @@ -911,7 +916,10 @@ static int check_inode(struct btree_trans *trans, bch_err(c, "error in fsck: error %i " "updating inode", ret); } +err: fsck_err: + if (ret) + bch_err(c, "error %i from check_inode()", ret); return ret; } @@ -941,6 +949,8 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error %i from check_inodes()", ret); return ret; } @@ -1134,6 +1144,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ret2 = -EINTR; } fsck_err: + if (ret) + bch_err(c, "error %i from check_i_sectors()", ret); return ret ?: ret2; } @@ -1257,6 +1269,9 @@ out: err: fsck_err: printbuf_exit(&buf); + + if (ret && ret != -EINTR) + bch_err(c, "error %i from check_extent()", ret); return ret; } @@ -1305,6 +1320,8 @@ static int check_extents(struct bch_fs *c) bch2_trans_exit(&trans); snapshots_seen_exit(&s); + if (ret) + bch_err(c, "error %i from check_extents()", ret); return ret; } @@ -1342,6 +1359,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } } fsck_err: + if (ret) + bch_err(c, "error %i from check_subdir_count()", ret); return ret ?: ret2; } @@ -1458,6 +1477,9 @@ out: err: fsck_err: printbuf_exit(&buf); + + if (ret && ret != -EINTR) + bch_err(c, "error %i from check_target()", ret); return ret; } @@ -1631,6 +1653,9 @@ out: err: fsck_err: printbuf_exit(&buf); + + if (ret && ret != -EINTR) + bch_err(c, "error %i from check_dirent()", ret); return ret; } @@ -1675,6 +1700,9 @@ static int check_dirents(struct bch_fs *c) snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); + + if (ret) + bch_err(c, "error %i from check_dirents()", ret); return ret; } @@ -1717,6 +1745,8 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: + if (ret && ret != -EINTR) + bch_err(c, "error %i from check_xattr()", ret); return ret; } @@ -1754,6 +1784,9 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error %i from check_xattrs()", ret); return ret; } -- cgit From c609947b5eae4bee096fde660a53a719ee65e191 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Apr 2022 01:31:33 -0400 Subject: bcachefs: Fix for getting stuck in journal replay In journal replay, we weren't immediately dropping journal pins when we start doing updates that ewern't from journal replay - leading to journal reclaim getting stuck. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 16ba5d24a86d..dc11eae1bcaa 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -556,7 +556,8 @@ static int journal_keys_sort(struct bch_fs *c) static void replay_now_at(struct journal *j, u64 seq) { BUG_ON(seq < j->replay_journal_seq); - BUG_ON(seq > j->replay_journal_seq_end); + + seq = min(seq, j->replay_journal_seq_end); while (j->replay_journal_seq < seq) bch2_journal_pin_put(j, j->replay_journal_seq++); @@ -629,8 +630,7 @@ static int bch2_journal_replay(struct bch_fs *c) cond_resched(); - if (!k->allocated) - replay_now_at(j, keys->journal_seq_base + k->journal_seq); + replay_now_at(j, keys->journal_seq_base + k->journal_seq); ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW| -- cgit From 41fc86222480c34d8647661b36d3fb1e9312fd33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Apr 2022 13:09:09 -0400 Subject: bcachefs: In fsck, pass BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE when deleting dirents A user reported an error where we hit an assertion due to deleting a key in an internal snapshot node, when deleting a dirent that points to a nonexisting inode. We try to avoid doing updates to keys for internal snapshot nodes, but upon inspection of the places where we remove dirents in fsck it appears BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE is correct for all of them: either the target dirent doesn't exist, or it's a directory with multiple dirents pointing to it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index abdcff172aec..cf9e6f595d53 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -311,7 +311,8 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, 0); + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: if (ret && ret != -EINTR) -- cgit From 75c8d0305a5eecbe84b8ffef20e1c049f30f4123 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Apr 2022 20:03:19 -0400 Subject: bcachefs: Kill old rebuild_replicas option This option was useful when the replicas mechism was new and still being debugged, but hasn't been used in ages - let's delete it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/buckets.c | 7 +++---- fs/bcachefs/journal_io.c | 9 ++++----- fs/bcachefs/opts.h | 5 ----- fs/bcachefs/recovery.c | 19 +++++-------------- 5 files changed, 12 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 43e921b91d85..04d297b1da94 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -518,7 +518,6 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_REBUILD_REPLICAS, }; struct btree_debug { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7fa76e737aa7..14c9c1098522 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -382,10 +382,9 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err(c, "no replicas entry\n" - " while marking %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); percpu_down_read(&c->mark_lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e43914ebd6a..a6a8737e92ad 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1054,7 +1054,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) jlist.ret = 0; for_each_member_device(ca, c, iter) { - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1211,10 +1211,9 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) bch2_replicas_entry_to_text(&buf, &replicas.e); if (!degraded && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, - "superblock not marked as containing replicas %s", - buf.buf))) { + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", + buf.buf)) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) goto err; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index ce79e1a12bd0..863891dcb554 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -316,11 +316,6 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ - x(rebuild_replicas, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dc11eae1bcaa..e2474ff99702 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1087,12 +1087,6 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.fix_errors = FSCK_OPT_YES; } - if (!c->replicas.entries || - c->opts.rebuild_replicas) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - if (!c->opts.nochanges) { if (c->sb.version < bcachefs_metadata_version_new_data_types) { bch_info(c, "version prior to new_data_types, upgrade and fsck required"); @@ -1224,10 +1218,7 @@ use_clean: if (!c->opts.fsck) set_bit(BCH_FS_FSCK_DONE, &c->flags); - if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || - test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + if (c->opts.fsck) { bool 
metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); @@ -1236,24 +1227,24 @@ use_clean: if (ret) goto err; bch_verbose(c, "done checking allocations"); - } - if (c->opts.fsck) { bch_info(c, "checking need_discard and freespace btrees"); err = "error checking need_discard and freespace btrees"; ret = bch2_check_alloc_info(c); if (ret) goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + bch_info(c, "checking lrus"); + err = "error checking lrus"; ret = bch2_check_lrus(c, true); if (ret) goto err; - bch_verbose(c, "done checking need_discard and freespace btrees"); + bch_verbose(c, "done checking lrus"); } bch2_stripes_heap_start(c); - clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); set_bit(BCH_FS_MAY_GO_RW, &c->flags); -- cgit From 42796f74f42ea5b7d9f2d7df661a87d1425968bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Apr 2022 20:30:30 -0400 Subject: bcachefs: Ensure sysfs show fns print a newline Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c0cc6e9a3e05..24180d98fe81 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -55,6 +55,9 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ struct printbuf out = PRINTBUF; \ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ + pr_newline(&out); \ + \ if (!ret && out.allocation_failure) \ ret = -ENOMEM; \ \ -- cgit From b33bf1bc0d1e81d614aad0f73f46f10e02906d20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Apr 2022 16:06:59 -0400 Subject: bcachefs: Go emergency RO when i_blocks underflows This improves some of our warnings and assertions - they imply possible filesystem inconsistencies, so they should be calling bch2_fs_inconsistent(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 85a4484bec65..256b3dd0d4aa 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -231,7 +231,10 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); - BUG_ON((s64) inode->v.i_blocks + sectors < 0); + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -2695,9 +2698,11 @@ int bch2_truncate(struct mnt_idmap *idmap, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); - WARN_ON(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal)); - + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); if (unlikely(ret)) goto err; -- cgit From 0095aa94bca372b411d616a1aa1101ffa38ad09d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Apr 2022 18:59:58 -0400 Subject: bcachefs: Improve some fsck error messages We have string names for d_type; use it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cf9e6f595d53..d507b9fdd32e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1405,8 +1405,8 @@ static int check_dirent_target(struct btree_trans *trans, if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - target->bi_inum)) { + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_UNLINKED; @@ -2254,8 +2254,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, } if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, - "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); -- cgit From e1b8f5f5ca247f65211ca4e3e0e493dd3a54c98e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Mar 2022 21:44:55 -0400 Subject: bcachefs: Plumb btree_id & level to trans_mark For backpointers, we'll need the full key location - that means btree_id and btree level. This patch plumbs it through. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + fs/bcachefs/alloc_background.h | 4 +- fs/bcachefs/bkey_methods.h | 79 ++++++++++++++++++++++++++++++++++--- fs/bcachefs/btree_types.h | 41 ------------------- fs/bcachefs/btree_update_interior.c | 50 +++++++++++++++-------- fs/bcachefs/btree_update_leaf.c | 6 +-- fs/bcachefs/buckets.c | 5 +++ fs/bcachefs/buckets.h | 37 +++-------------- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/reflink.h | 5 ++- 10 files changed, 129 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ce8803658369..a63c1664c3f2 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -540,6 +540,7 @@ err: } int bch2_trans_mark_alloc(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 2bc622b305c2..ff366e61ace5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -125,8 +125,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); +int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 5c55e8bfe158..cff6f6dc44c4 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -27,8 +27,8 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); + int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); 
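For context, a toy illustration (not bcachefs code - the names are made up) of what "full key location" means: a key's position alone is ambiguous, because the same bpos can occur in different btrees and at different levels, so a backpointer needs all three pieces.

    #include <linux/types.h>

    enum demo_btree_id {
            DEMO_BTREE_extents,
            DEMO_BTREE_alloc,
    };

    struct demo_bpos {
            u64 inode;
            u64 offset;
    };

    /* the full location of a key: btree, level within it, and position */
    struct demo_key_location {
            enum demo_btree_id      btree_id;
            unsigned                level;  /* 0 = leaf, > 0 = interior node */
            struct demo_bpos        pos;
    };

The trans_trigger hook and the bch2_trans_mark_* helpers in the diff below now carry the first two of these alongside the keys themselves.
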
int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, @@ -77,16 +77,85 @@ static inline int bch2_mark_key(struct btree_trans *trans, : 0; } -static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_i *new, unsigned flags) +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \ + (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; return ops->trans_trigger - ? ops->trans_trigger(trans, old, new, flags) + ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) : 0; } +static inline int bch2_trans_mark_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 38c9148f608d..a475b1a9467a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -638,47 +638,6 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, - __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, - - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) -#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \ - (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY) - -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) - -#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_alloc)| \ - (1U << KEY_TYPE_alloc_v2)| \ - (1U << KEY_TYPE_alloc_v3)| \ - (1U << KEY_TYPE_alloc_v4)| \ - (1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)| \ - (1U << KEY_TYPE_inode_v2)| \ - (1U << KEY_TYPE_snapshot)) - static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 10d22d9b8c0d..29de7afa9616 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -495,20 +495,30 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_will_delete_key(struct btree_update *as, - struct bkey_i *k) +static void btree_update_add_key(struct btree_update *as, + struct keylist *keys, struct btree *b) { - BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > + struct bkey_i *k = &b->key; + + BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > ARRAY_SIZE(as->_old_keys)); - bch2_keylist_add(&as->old_keys, k); + + bkey_copy(keys->top, k); + bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; + + 
bch2_keylist_push(keys); +} + +static void btree_update_will_delete_key(struct btree_update *as, + struct btree *b) +{ + btree_update_add_key(as, &as->old_keys, b); } static void btree_update_will_add_key(struct btree_update *as, - struct bkey_i *k) + struct btree *b) { - BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > - ARRAY_SIZE(as->_new_keys)); - bch2_keylist_add(&as->new_keys, k); + btree_update_add_key(as, &as->new_keys, b); } /* @@ -533,13 +543,17 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_new(trans, k, 0); + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); if (ret) return ret; } @@ -642,8 +656,8 @@ err: if (!ret) { i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); + max(journal_seq, + le64_to_cpu(i->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -811,7 +825,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree mutex_unlock(&c->btree_interior_update_lock); - btree_update_will_add_key(as, &b->key); + btree_update_will_add_key(as, b); } /* @@ -864,7 +878,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b * btree_updates to point to this btree_update: */ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, - struct btree *b) + struct btree *b) { struct bch_fs *c = as->c; struct btree_update *p, *n; @@ -928,7 +942,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, */ btree_update_drop_new_node(c, b); - btree_update_will_delete_key(as, &b->key); + btree_update_will_delete_key(as, b); as->old_nodes[as->nr_old_nodes] = b; as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; @@ -1924,11 +1938,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_new(trans, new_key, 0); + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); if (ret) return ret; - ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a63c2f36bae4..ba6a9218610a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -484,16 +484,16 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, old, i->k, + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, BTREE_TRIGGER_INSERT| BTREE_TRIGGER_OVERWRITE| i->flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, old, i->flags) ?: 1; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; } else if (!overwrite && !i->insert_trigger_run) { i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->k, 
i->flags) ?: 1; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; } else { return 0; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 14c9c1098522..71e5d893fe6a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1451,6 +1451,7 @@ err: } int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -1589,6 +1590,7 @@ err: } int bch2_trans_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -1659,6 +1661,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, } int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) @@ -1675,6 +1678,7 @@ int bch2_trans_mark_inode(struct btree_trans *trans, } int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) @@ -1776,6 +1780,7 @@ err: } int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 518f5104a2f7..327022cd0f7a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -202,41 +202,14 @@ int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsi int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); - -static inline int bch2_trans_mark_old(struct btree_trans *trans, - struct bkey_s_c old, unsigned flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = old.k->p; - - return bch2_trans_mark_key(trans, old, &deleted, - BTREE_TRIGGER_OVERWRITE|flags); -} - -static inline int bch2_trans_mark_new(struct btree_trans *trans, - struct bkey_i *new, unsigned flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = new->k.p; - - return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, - 
BTREE_TRIGGER_INSERT|flags); -} - int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6a81eb9b41a0..a53a3d53c8da 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -110,6 +110,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -124,7 +125,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, } } - return bch2_trans_mark_extent(trans, old, new, flags); + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ @@ -153,6 +154,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, } int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index e0a9d8e4d1ca..f9848dc3eebb 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -20,8 +20,8 @@ int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ @@ -36,6 +36,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -- cgit From 7419646b254a599d7881e7815fc5e61b18c94b7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Apr 2022 15:37:16 -0400 Subject: bcachefs: btree_update_interior.c prep for backpointers Previously, btree_update_interior.c passed keys to bch2_trans_mark_* that hadn't been fully initialized - they didn't have the key field filled out, just the value. With backpointers, we need to make sure keys are fully initialized before marking them - because the backpointer points back to the original key. This patch tweaks the interior update paths to fix this. 
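As an aside, a minimal sketch of why the trigger needs a fully initialized key - this is not code from the patch, and the struct and helper names are made up for illustration: a backpointer has to record where the pointing key lives, and all of that information comes from the key half of the bkey, not the value.

	/*
	 * Illustrative only: a backpointer records the location of the key
	 * that points at the data, taken from the key half of the bkey.
	 */
	struct bp_sketch {			/* made-up name */
		enum btree_id	btree;		/* btree the key belongs to */
		unsigned	level;		/* level of the node holding it */
		struct bpos	pos;		/* position of the key itself */
	};

	static void bp_fill_sketch(struct bp_sketch *bp, enum btree_id btree,
				   unsigned level, const struct bkey_i *k)
	{
		bp->btree = btree;
		bp->level = level;
		bp->pos   = k->k.p;	/* garbage unless the caller initialized k->k */
	}
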
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 45 ++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29de7afa9616..7a092d852930 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -509,18 +509,6 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } -static void btree_update_will_delete_key(struct btree_update *as, - struct btree *b) -{ - btree_update_add_key(as, &as->old_keys, b); -} - -static void btree_update_will_add_key(struct btree_update *as, - struct btree *b) -{ - btree_update_add_key(as, &as->new_keys, b); -} - /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -542,18 +530,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; - for_each_keylist_key(&as->new_keys, k) { + for_each_keylist_key(&as->old_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); if (ret) return ret; } - for_each_keylist_key(&as->old_keys, k) { + for_each_keylist_key(&as->new_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); if (ret) return ret; } @@ -825,7 +813,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree mutex_unlock(&c->btree_interior_update_lock); - btree_update_will_add_key(as, b); + btree_update_add_key(as, &as->new_keys, b); } /* @@ -942,7 +930,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, */ btree_update_drop_new_node(c, b); - btree_update_will_delete_key(as, b); + btree_update_add_key(as, &as->old_keys, b); as->old_nodes[as->nr_old_nodes] = b; as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; @@ -1272,13 +1260,14 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); - bch2_btree_update_add_new_node(as, n2); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; + bch2_btree_update_add_new_node(as, n2); + set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1435,7 +1424,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); n1 = bch2_btree_node_alloc_replacement(as, b); - bch2_btree_update_add_new_node(as, n1); if (keys) btree_split_insert_keys(as, trans, path, n1, keys); @@ -1450,6 +1438,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + bch2_btree_update_add_new_node(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); @@ -1478,6 +1468,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + bch2_btree_update_add_new_node(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); if (parent) @@ -1746,7 +1738,6 @@ int 
__bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, m); n = bch2_btree_node_alloc(as, b->c.level); - bch2_btree_update_add_new_node(as, n); SET_BTREE_NODE_SEQ(n->data, max(BTREE_NODE_SEQ(b->data), @@ -1754,8 +1745,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); - n->data->format = new_f; + bch2_btree_update_add_new_node(as, n); + + n->data->format = new_f; btree_node_set_format(n, new_f); bch2_btree_sort_into(c, n, prev); @@ -1938,13 +1931,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, - new_key, 0); + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); if (ret) return ret; - ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), 0); + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); if (ret) return ret; } -- cgit From 2ae4573e57384f4df256f52c6ec2e0305136aa8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Apr 2022 15:45:00 -0400 Subject: bcachefs: bch2_btree_iter_peek_slot() now works on interior nodes The new backpointers code will be using bch2_btree_iter_peek_slot() on interior nodes - this patch updates peek_slot() to make that work. - Pass the correct level to bch2_journal_keys_peek_slot() - We should only set BTREE_ITER_CACHED or BTREE_ITER_WITH_KEY_CACHE when using bch2_trans_iter_init(), not bch2_trans_node_iter_init() - Update assertions Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 80ea97e21cf3..7ca4f6d17504 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2226,7 +2226,8 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { struct bkey_i *k = bch2_journal_keys_peek_slot(trans->c, iter->btree_id, - 0, iter->path->pos); + iter->path->level, + iter->path->pos); if (k) { iter->k = k->k; @@ -2649,9 +2650,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2696,6 +2697,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos next; + EBUG_ON(iter->path->level); + if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; struct bpos end = iter->pos; @@ -2934,12 +2937,6 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (trans->journal_replay_not_finished) flags |= BTREE_ITER_WITH_JOURNAL; - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - iter->trans = trans; iter->path = NULL; iter->update_path = NULL; @@ -2962,6 +2959,12 @@ void bch2_trans_iter_init(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) { + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } 
else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + __bch2_trans_iter_init(trans, iter, btree_id, pos, 0, 0, flags); } -- cgit From c4bce586752376b226cff28139cbefdd7346497d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Apr 2022 15:43:37 -0400 Subject: bcachefs: btree_path_set_level_(up|down) This adds two new helpers to btree_iter.c for changing the level of a path up or down - to be used by the new bch2_btree_iter_peek_all_levels(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7ca4f6d17504..74d41fe5c074 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1517,6 +1517,30 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } +static void btree_path_set_level_up(struct btree_path *path) +{ + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); +} + static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, struct btree_path *path, int check_pos) @@ -2084,7 +2108,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_path *path = iter->path; struct btree *b = NULL; - unsigned l; int ret; BUG_ON(trans->restarted); @@ -2097,10 +2120,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? */ if (!btree_path_node(path, path->level + 1)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_level_up(path); return NULL; } @@ -2131,14 +2151,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), iter->flags & BTREE_ITER_INTENT); - path->level = iter->min_depth; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); - - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_iter_verify(iter); + btree_path_set_level_down(trans, path, iter->min_depth); ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) -- cgit From b0babf2a34233c651060e54b68fa3cd0b9e7a6e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Apr 2022 18:04:08 -0400 Subject: bcachefs: bch2_btree_iter_peek_all_levels() This adds bch2_btree_iter_peek_all_levels(), which returns keys from every level of the btree - interior nodes included - in monotonically increasing order, soon to be used by the backpointers check & repair code. 
- BTREE_ITER_ALL_LEVELS can now be passed to for_each_btree_key() to iterate thusly, much like BTREE_ITER_SLOTS - The existing algorithm in bch2_btree_iter_advance() doesn't work with peek_all_levels(): we have to defer the actual advancing until the next time we call peek, where we have the btree path traversed and uptodate. So, we add an advanced bit to btree_iter; when BTREE_ITER_ALL_LEVELS is set bch2_btree_iter_advanced() just marks the iterator as advanced. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 123 +++++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/btree_iter.h | 8 +-- fs/bcachefs/btree_types.h | 15 ++---- 3 files changed, 125 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 74d41fe5c074..cd85c3ad2ab7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2181,15 +2181,23 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; + iter->advanced = true; + return false; + } } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -2396,6 +2404,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + if (iter->update_path) { bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); @@ -2510,6 +2520,99 @@ out: return k; } +/** + * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal + * to iterator's current position, returning keys from every level of the btree. + * For keys at different levels of the btree that compare equal, the key from + * the lower level (leaf) is returned first. + */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + /* Already at end? 
*/ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + bpos_cmp(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + !bpos_cmp(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if (iter->advanced && + iter->path->level == iter->min_depth && + bpos_cmp(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; +out: + iter->path->should_be_locked = true; + bch2_btree_iter_verify(iter); + + return k; +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -2665,6 +2768,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ @@ -2935,6 +3039,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, { EBUG_ON(trans->restarted); + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 29c1df83b35e..dc6f07492bc9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -249,6 +249,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); + static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -350,9 +352,9 @@ static inline int bkey_err(struct bkey_s_c k) static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : + flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a475b1a9467a..4f359ff79334 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -182,22 +182,16 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ #define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_ALL_LEVELS (1 << 1) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 1) +#define BTREE_ITER_INTENT (1 << 2) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 2) -/* - * Indicates that this iterator should not be reused until transaction commit, - * either because a pending update references it or because the update depends - * on that particular key being locked (e.g. by the str_hash code, for hash - * table consistency) - */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos @@ -282,7 +276,8 @@ struct btree_iter { struct btree_path *key_cache_path; enum btree_id btree_id:4; - unsigned min_depth:4; + unsigned min_depth:3; + unsigned advanced:1; /* btree_iter_copy starts here: */ u16 flags; -- cgit From a729e489ab0805fb93047508fb9439cd1464cf70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Apr 2022 17:50:47 -0400 Subject: bcachefs: Allocate some extra room in btree_key_cache_fill() If we allocate a buffer that's a bit bigger than necessary the transaction commit path will be much less likely to have to reallocate - which requires a transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f86d57d1ace0..d316e9b9ae02 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -236,6 +236,13 @@ static int btree_key_cache_fill(struct btree_trans *trans, */ new_u64s = k.k->u64s + 1; + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + new_u64s = min(256U, (new_u64s * 3) / 2); + if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); -- cgit From d8f31407c842331a13c48404bc030f49d60f25aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Apr 2022 19:02:04 -0400 Subject: bcachefs: Fix hash_check_key() hash_check_key() was incorrectly handling transaction restarts - switch it to for_each_btree_key_norestart(). 
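For context, a minimal sketch of the _norestart iteration shape the fix moves to - illustrative only, with a made-up predicate and placeholder btree_id/start_pos arguments; it assumes the usual convention that the _norestart variant reports a needed transaction restart through ret rather than retrying inside the loop:

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_norestart(trans, iter, btree_id, start_pos,
				     BTREE_ITER_SLOTS, k, ret) {
		if (found_what_we_wanted(k))	/* made-up predicate */
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	/* ret is nonzero if iteration stopped because a restart is needed */
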
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index d507b9fdd32e..f1abec95a740 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -740,8 +740,9 @@ static int hash_check_key(struct btree_trans *trans, if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), - BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, desc.btree_id, + POS(hash_k.k->p.inode, hash), + BTREE_ITER_SLOTS, k, ret) { if (!bkey_cmp(k.k->p, hash_k.k->p)) break; @@ -759,16 +760,15 @@ static int hash_check_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); goto bad_hash; } - } out: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " "hashed to %llu\n%s", - desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) return 0; -- cgit From c0960603e2d42d097fea4afd6b720619441061bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Apr 2022 17:30:49 -0400 Subject: bcachefs: Shutdown path improvements We're seeing occasional firings of the assertion in the key cache shutdown code that nr_dirty == 0, which means we must sometimes be doing transaction commits after we've gone read only. Cleanups & changes: - BCH_FS_ALLOC_CLEAN renamed to BCH_FS_CLEAN_SHUTDOWN - new helper bch2_btree_interior_updates_flush(), which returns true if it had to wait - bch2_btree_flush_writes() now also returns true if there were btree writes in flight - __bch2_fs_read_only now checks if btree writes were in flight in the shutdown loop: btree write completion does a transaction update, to update the pointer in the parent node - assert that !BCH_FS_CLEAN_SHUTDOWN in __bch2_trans_commit Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 4 +--- fs/bcachefs/btree_io.c | 14 +++++++---- fs/bcachefs/btree_io.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 18 ++++++++++---- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/btree_update_leaf.c | 2 ++ fs/bcachefs/migrate.c | 5 +--- fs/bcachefs/move.c | 4 +--- fs/bcachefs/super.c | 48 ++++++++++--------------------------- 10 files changed, 43 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 04d297b1da94..e7300a9f427c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -494,7 +494,7 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_ALLOC_CLEAN, + BCH_FS_CLEAN_SHUTDOWN, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5199f0240fcd..21afc7200570 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1751,9 +1751,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) down_write(&c->gc_lock); - /* flush interior btree updates: */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); ret = bch2_gc_start(c, metadata_only) ?: bch2_gc_alloc_start(c, metadata_only) ?: 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f847928ab743..33c54803e0a2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2099,29 +2099,33 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } -static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; unsigned i; + bool ret = false; restart: rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (test_bit(flag, &b->flags)) { rcu_read_unlock(); wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + ret = true; goto restart; } rcu_read_unlock(); + + return ret; } -void bch2_btree_flush_all_reads(struct bch_fs *c) +bool bch2_btree_flush_all_reads(struct bch_fs *c) { - __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); + return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); } -void bch2_btree_flush_all_writes(struct bch_fs *c) +bool bch2_btree_flush_all_writes(struct bch_fs *c) { - __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index d818d87661e8..8af853642123 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -152,8 +152,8 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } -void bch2_btree_flush_all_reads(struct bch_fs *); -void bch2_btree_flush_all_writes(struct bch_fs *); +bool bch2_btree_flush_all_reads(struct bch_fs *); +bool bch2_btree_flush_all_writes(struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7a092d852930..27ab1cde2217 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2175,19 +2175,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +static bool bch2_btree_interior_updates_pending(struct bch_fs *c) { - size_t ret = 0; - struct list_head *i; + bool ret; mutex_lock(&c->btree_interior_update_lock); - list_for_each(i, &c->btree_interior_update_list) - ret++; + ret = !list_empty(&c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); return ret; } +bool bch2_btree_interior_updates_flush(struct bch_fs *c) +{ + bool ret = bch2_btree_interior_updates_pending(c); + + if (ret) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_pending(c)); + return ret; +} + void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) { struct btree_root *r; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e72eb8795616..adfc6c24a7a4 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -309,7 +309,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); +bool bch2_btree_interior_updates_flush(struct bch_fs *); void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); struct jset_entry 
*bch2_btree_roots_to_journal_entries(struct bch_fs *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ba6a9218610a..9a2955f4ae6b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1117,6 +1117,8 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->journal_u64s = trans->extra_journal_entries.nr; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 6defc33322b3..5345697f2712 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -175,10 +175,7 @@ next: goto err; } - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - + bch2_btree_interior_updates_flush(c); ret = 0; err: bch2_trans_exit(&trans); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a219c10a7135..f18d603624c0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -942,9 +942,7 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); progress_list_del(c, stats); return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e4ccdc966fdb..77b7bd61bf43 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -195,57 +195,33 @@ static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; unsigned i, clean_passes = 0; + u64 seq = 0; bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); - /* - * Flush journal before stopping allocators, because flushing journal - * blacklist entries involves allocating new btree nodes: - */ - bch2_journal_flush_all_pins(&c->journal); - bch_verbose(c, "flushing journal and stopping allocators"); - bch2_journal_flush_all_pins(&c->journal); - do { clean_passes++; - if (bch2_journal_flush_all_pins(&c->journal)) - clean_passes = 0; - - /* - * In flight interior btree updates will generate more journal - * updates and btree updates (alloc btree): - */ - if (bch2_btree_interior_updates_nr_pending(c)) { - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + if (bch2_btree_interior_updates_flush(c) || + bch2_journal_flush_all_pins(&c->journal) || + bch2_btree_flush_all_writes(c) || + seq != atomic64_read(&c->journal.seq)) { + seq = atomic64_read(&c->journal.seq); clean_passes = 0; } - flush_work(&c->btree_interior_update_work); - - if (bch2_journal_flush_all_pins(&c->journal)) - clean_passes = 0; } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); - set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); - - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - flush_work(&c->btree_interior_update_work); + bch_verbose(c, "flushing journal and stopping allocators complete"); + if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); bch2_fs_journal_stop(&c->journal); - /* - * the journal kicks off btree writes via reclaim - wait for in flight - * writes after stopping journal: - */ - bch2_btree_flush_all_writes(c); - /* * After stopping journal: */ @@ -304,7 +280,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) 
&& !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && + test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && !c->opts.norecovery) { bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -395,7 +371,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); -- cgit From fd4cecd2583e784ff28b851de2ff5046201c57d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Apr 2022 21:50:19 -0400 Subject: bcachefs: Lock ordering fix Can't take btree node locks while holding btree_reserve_cache_lock - it would be nice if we could check this with lockdep. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 27ab1cde2217..9696eb2b91e7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -383,16 +383,13 @@ static void bch2_btree_reserve_put(struct btree_update *as) struct bch_fs *c = as->c; struct prealloc_nodes *p; - mutex_lock(&c->btree_reserve_cache_lock); - for (p = as->prealloc_nodes; p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); p++) { while (p->nr) { struct btree *b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -406,13 +403,15 @@ static void bch2_btree_reserve_put(struct btree_update *as) bch2_open_buckets_put(c, &b->ob); } + mutex_unlock(&c->btree_reserve_cache_lock); + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } } - - mutex_unlock(&c->btree_reserve_cache_lock); } static int bch2_btree_reserve_get(struct btree_trans *trans, -- cgit From 8cc052db636d5502319d967198d84f64e7e5f65d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Apr 2022 22:03:17 -0400 Subject: bcachefs: Don't kick journal reclaim unless low on space We shouldn't kick journal reclaim unnecessarily, it's got its own timer for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a6a8737e92ad..351d5d9d8225 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1442,7 +1442,8 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - journal_reclaim_kick(&c->journal); + if (j->watermark) + journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ closure_debug_destroy(cl); -- cgit From 1f93726e6347938343190913cb959623e67ecf78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Apr 2022 18:06:31 -0400 Subject: bcachefs: Tracepoint improvements Delete some obsolete tracepoints, organize alloc tracepoints better, make a few tracepoints more consistent. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +- fs/bcachefs/btree_gc.c | 7 +- fs/bcachefs/btree_update_interior.c | 6 +- fs/bcachefs/btree_update_leaf.c | 2 + fs/bcachefs/buckets.c | 3 - fs/bcachefs/journal_reclaim.c | 16 +-- fs/bcachefs/move.c | 2 +- fs/bcachefs/trace.h | 201 ++++++++++++++++-------------------- 8 files changed, 112 insertions(+), 131 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a63c1664c3f2..d9cf676da030 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1066,7 +1066,7 @@ static void bch2_do_discards_work(struct work_struct *work) percpu_ref_put(&c->writes); - trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret); } void bch2_do_discards(struct bch_fs *c) @@ -1130,6 +1130,10 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE); + if (ret) + goto out; + + trace_invalidate_bucket(c, a->k.p.inode, a->k.p.offset); out: bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &lru_iter); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 21afc7200570..187787359316 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1742,12 +1742,10 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - u64 start_time = local_clock(); unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); - trace_gc_start(c); down_write(&c->gc_lock); @@ -1840,9 +1838,6 @@ out: up_write(&c->gc_lock); - trace_gc_end(c); - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: @@ -1979,6 +1974,7 @@ int bch2_gc_gens(struct bch_fs *c) if (!mutex_trylock(&c->gc_gens_lock)) return 0; + trace_gc_gens_start(c); down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); @@ -2030,6 +2026,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + trace_gc_gens_end(c); err: for_each_member_device(ca, c, i) { kvfree(ca->oldest_gen); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9696eb2b91e7..d4308f3c530b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1812,10 +1812,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, as = bch2_btree_update_start(trans, iter->path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret) { - trace_btree_gc_rewrite_node_fail(c, b); + if (ret) goto out; - } bch2_btree_interior_update_will_free_node(as, b); @@ -1825,7 +1823,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - trace_btree_gc_rewrite_node(c, b); + trace_btree_rewrite(c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9a2955f4ae6b..d84769353f65 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1169,6 +1169,8 @@ retry: if (ret) goto err; + + trace_transaction_commit(trans->fn, _RET_IP_); out: bch2_journal_preres_put(&c->journal, 
&trans->journal_preres); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 71e5d893fe6a..230344e0a534 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -600,9 +600,6 @@ int bch2_mark_alloc(struct btree_trans *trans, bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return ret; } - - trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), - old_a.cached_sectors); } return 0; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e99a01e3b5fb..873cc14e2ae9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -593,7 +593,7 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -static int __bch2_journal_reclaim(struct journal *j, bool direct) +static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; @@ -639,8 +639,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; - trace_journal_reclaim_start(c, - min_nr, + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + trace_journal_reclaim_start(c, direct, kicked, + min_nr, min_key_cache, j->prereserved.reserved, j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), @@ -648,8 +650,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); @@ -670,7 +670,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) int bch2_journal_reclaim(struct journal *j) { - return __bch2_journal_reclaim(j, true); + return __bch2_journal_reclaim(j, true, true); } static int bch2_journal_reclaim_thread(void *arg) @@ -686,10 +686,12 @@ static int bch2_journal_reclaim_thread(void *arg) j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { + bool kicked = j->reclaim_kicked; + j->reclaim_kicked = false; mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false); + ret = __bch2_journal_reclaim(j, false, kicked); mutex_unlock(&j->reclaim_lock); now = jiffies; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f18d603624c0..2cb8775b4ed7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -597,7 +597,7 @@ err_free_pages: err_free: kfree(io); err: - trace_move_alloc_fail(k.k); + trace_move_alloc_mem_fail(k.k); return ret; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index de6a17c92f5f..f5aa10762611 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -142,17 +142,21 @@ DEFINE_EVENT(bio, journal_write, ); TRACE_EVENT(journal_reclaim_start, - TP_PROTO(struct bch_fs *c, u64 min_nr, + TP_PROTO(struct bch_fs *c, bool direct, bool kicked, + u64 min_nr, u64 min_key_cache, u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, min_nr, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), TP_STRUCT__entry( __field(dev_t, dev ) + __field(bool, direct ) + __field(bool, 
kicked ) __field(u64, min_nr ) + __field(u64, min_key_cache ) __field(u64, prereserved ) __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) @@ -163,7 +167,10 @@ TRACE_EVENT(journal_reclaim_start, TP_fast_assign( __entry->dev = c->dev; + __entry->direct = direct; + __entry->kicked = kicked; __entry->min_nr = min_nr; + __entry->min_key_cache = min_key_cache; __entry->prereserved = prereserved; __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; @@ -172,9 +179,12 @@ TRACE_EVENT(journal_reclaim_start, __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->direct, + __entry->kicked, __entry->min_nr, + __entry->min_key_cache, __entry->prereserved, __entry->prereserved_total, __entry->btree_cache_dirty, @@ -197,45 +207,13 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed = nr_flushed; ), - TP_printk("%d%d flushed %llu", + TP_printk("%d,%d flushed %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_flushed) ); /* allocator: */ -TRACE_EVENT(do_discards, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, int ret), - TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, discarded ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; - __entry->ret = ret; - ), - - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->discarded, - __entry->ret) -); - /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -370,6 +348,11 @@ DEFINE_EVENT(btree_node, btree_merge, TP_ARGS(c, b) ); +DEFINE_EVENT(btree_node, btree_rewrite, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + DEFINE_EVENT(btree_node, btree_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -443,79 +426,18 @@ TRACE_EVENT(btree_node_relock_fail, /* Garbage collection */ -DEFINE_EVENT(btree_node, btree_gc_rewrite_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, gc_start, +DEFINE_EVENT(bch_fs, gc_gens_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, gc_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, +DEFINE_EVENT(bch_fs, gc_gens_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); /* Allocator */ -TRACE_EVENT(alloc_scan, - TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), - TP_ARGS(ca, found, inc_gen, inc_gen_skipped), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, found ) - __field(u64, inc_gen ) - __field(u64, inc_gen_skipped ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - __entry->found = found; - __entry->inc_gen = inc_gen; - __entry->inc_gen_skipped = inc_gen_skipped; - ), - - TP_printk("%d,%d 
found %llu inc_gen %llu inc_gen_skipped %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) -); - -TRACE_EVENT(invalidate, - TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), - TP_ARGS(ca, offset, sectors), - - TP_STRUCT__entry( - __field(unsigned, sectors ) - __field(dev_t, dev ) - __field(__u64, offset ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - __entry->offset = offset, - __entry->sectors = sectors; - ), - - TP_printk("invalidated %u sectors at %d,%d sector=%llu", - __entry->sectors, - MAJOR(__entry->dev), - MINOR(__entry->dev), - __entry->offset) -); - DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, u64 avail, @@ -587,6 +509,59 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret) ); +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, int ret), + TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + __entry->ret = ret; + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->ret) +); + +TRACE_EVENT(invalidate_bucket, + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket), + TP_ARGS(c, dev, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = dev; + __entry->bucket = bucket; + ), + + TP_printk("%d:%d invalidated %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + /* Moving IO */ DEFINE_EVENT(bkey, move_extent, @@ -594,7 +569,7 @@ DEFINE_EVENT(bkey, move_extent, TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_alloc_fail, +DEFINE_EVENT(bkey, move_alloc_mem_fail, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); @@ -678,7 +653,7 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -DECLARE_EVENT_CLASS(transaction_restart, +DECLARE_EVENT_CLASS(transaction_event, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip), @@ -696,55 +671,61 @@ DECLARE_EVENT_CLASS(transaction_restart, TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ); -DEFINE_EVENT(transaction_restart, transaction_restart_ip, +DEFINE_EVENT(transaction_event, transaction_commit, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_event, transaction_restart_ip, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, TP_PROTO(const char *trans_fn, unsigned 
long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, +DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, +DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, +DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_traverse_all, +DEFINE_EVENT(transaction_event, trans_traverse_all, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, +DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, +DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) -- cgit From 104c69745fdf7e5f8aa022f60bc9d568987bd8b8 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Tue, 15 Mar 2022 21:36:33 +1300 Subject: bcachefs: Add persistent counters This adds a new superblock field for persisting counters and adds a sysfs interface in counters/ exposing these counters. The superblock field is ignored by older versions letting us avoid an on disk version bump. Each sysfs file outputs a counter that tracks since filesystem creation and a counter for the current mount session. 
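The "since mount" figure is simply the running total minus a snapshot taken at mount time. As a minimal, self-contained sketch of that accounting (plain C with made-up names - counters[], counters_on_mount[] and NR_COUNTERS here are illustrative stand-ins, not the bcachefs structures):

  #include <stdio.h>
  #include <stdint.h>

  #define NR_COUNTERS 3

  static uint64_t counters[NR_COUNTERS];          /* running totals, in 512-byte sectors */
  static uint64_t counters_on_mount[NR_COUNTERS]; /* snapshot taken when the fs is mounted */

  static void snapshot_on_mount(void)
  {
          for (int i = 0; i < NR_COUNTERS; i++)
                  counters_on_mount[i] = counters[i];
  }

  static void show_counter(int i)
  {
          uint64_t total       = counters[i];
          uint64_t since_mount = total - counters_on_mount[i];

          /* shifting by 9 converts sectors to bytes, as the sysfs output does */
          printf("since mount:               %llu bytes\n",
                 (unsigned long long)(since_mount << 9));
          printf("since filesystem creation: %llu bytes\n",
                 (unsigned long long)(total << 9));
  }

  int main(void)
  {
          counters[0] = 1000;     /* value read back from the superblock field */
          snapshot_on_mount();
          counters[0] += 250;     /* I/O accounted during this mount */
          show_counter(0);
          return 0;
  }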
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 6 ++- fs/bcachefs/bcachefs_format.h | 22 ++++++++- fs/bcachefs/counters.c | 107 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/counters.h | 17 +++++++ fs/bcachefs/io.c | 2 + fs/bcachefs/move.c | 1 + fs/bcachefs/super-io.c | 3 ++ fs/bcachefs/super.c | 12 ++++- fs/bcachefs/sysfs.c | 47 ++++++++++++++++++- fs/bcachefs/sysfs.h | 14 ++++-- 11 files changed, 223 insertions(+), 9 deletions(-) create mode 100644 fs/bcachefs/counters.c create mode 100644 fs/bcachefs/counters.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 17423584a3f3..76aecdc5df71 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -21,6 +21,7 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ + counters.o \ debug.o \ dirent.o \ disk_groups.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e7300a9f427c..5dda57afa802 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -584,6 +584,7 @@ struct bch_fs { struct list_head list; struct kobject kobj; + struct kobject counters_kobj; struct kobject internal; struct kobject opts_dir; struct kobject time_stats; @@ -900,12 +901,15 @@ mempool_t bio_bounce_pages; u64 last_bucket_seq_cleanup; - /* The rest of this all shows up in sysfs */ + /* TODO rewrite as counters - The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; atomic_long_t extent_migrate_done; atomic_long_t extent_migrate_raced; atomic_long_t bucket_alloc_fail; + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; + unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 969507c42c55..d77a45041ff0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1090,7 +1090,8 @@ struct bch_sb_field { x(clean, 6) \ x(replicas, 7) \ x(journal_seq_blacklist, 8) \ - x(journal_v2, 9) + x(journal_v2, 9) \ + x(counters, 10) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1323,6 +1324,25 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[0]; } __attribute__((packed, aligned(8))); +/* BCH_SB_FIELD_counters */ + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[0]; +}; + /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 100644 index 000000000000..6bf267dfd051 --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "super-io.h" +#include "counters.h" + +/* BCH_SB_FIELD_counters */ + +const char * const bch2_counter_names[] = { +#define x(t, n, ...) 
(#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + +static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) +{ + if (!ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +}; + +static int bch2_sb_counters_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + return 0; +}; + +void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + for (i = 0; i < nr; i++) { + if (i < BCH_COUNTER_NR) + pr_buf(out, "%s", bch2_counter_names[i]); + else + pr_buf(out, "(unknown)"); + + pr_tab(out); + pr_buf(out, "%llu", le64_to_cpu(ctrs->d[i])); + pr_newline(out); + }; +}; + +int bch2_sb_counters_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + u64 val = 0; + + for (i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { + val = le64_to_cpu(ctrs->d[i]); + percpu_u64_set(&c->counters[i], val); + c->counters_on_mount[i] = val; + } + return 0; +}; + +int bch2_sb_counters_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ret; + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_resize_counters(&c->disk_sb, + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) + ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; +} + +void bch2_fs_counters_exit(struct bch_fs *c) +{ + free_percpu(c->counters); +} + +int bch2_fs_counters_init(struct bch_fs *c) +{ + c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); + if (!c->counters) + return -ENOMEM; + + return bch2_sb_counters_to_cpu(c); +} + +const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, +}; diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h new file mode 100644 index 000000000000..4778aa19bf34 --- /dev/null +++ b/fs/bcachefs/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COUNTERS_H +#define _BCACHEFS_COUNTERS_H + +#include "bcachefs.h" +#include "super-io.h" + + +int bch2_sb_counters_to_cpu(struct bch_fs *); +int bch2_sb_counters_from_cpu(struct bch_fs *); + +void bch2_fs_counters_exit(struct bch_fs *); +int bch2_fs_counters_init(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f80255e59bd..f20891d48ca8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1403,6 +1403,7 @@ void bch2_write(struct closure *cl) goto err; } + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -2310,6 +2311,7 @@ get_bio: if (rbio->bounce) trace_read_bounce(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* diff --git 
a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2cb8775b4ed7..a852e07affdc 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -575,6 +575,7 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); trace_move_extent(k.k); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c3c7043d7426..56a6c925543a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -18,6 +18,7 @@ #include "super.h" #include "trace.h" #include "vstructs.h" +#include "counters.h" #include #include @@ -819,6 +820,8 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + bch2_sb_counters_from_cpu(c); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 77b7bd61bf43..159d47d129a2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -21,6 +21,7 @@ #include "checksum.h" #include "clock.h" #include "compress.h" +#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -78,6 +79,9 @@ static const struct kobj_type type ## _ktype = { \ static void bch2_fs_release(struct kobject *); static void bch2_dev_release(struct kobject *); +static void bch2_fs_counters_release(struct kobject *k) +{ +} static void bch2_fs_internal_release(struct kobject *k) { @@ -92,6 +96,7 @@ static void bch2_fs_time_stats_release(struct kobject *k) } KTYPE(bch2_fs); +KTYPE(bch2_fs_counters); KTYPE(bch2_fs_internal); KTYPE(bch2_fs_opts_dir); KTYPE(bch2_fs_time_stats); @@ -416,6 +421,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); @@ -500,6 +506,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); kobject_put(&c->internal); @@ -569,6 +576,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -617,6 +625,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); c->minor = -1; c->disk_sb.fs_sb = true; @@ -777,7 +786,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c); + bch2_fs_fsio_init(c) ?: + bch2_fs_counters_init(c); if (ret) goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 24180d98fe81..6b5b20d18012 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -40,7 +40,7 @@ #include "util.h" #define SYSFS_OPS(type) \ -struct sysfs_ops type ## _sysfs_ops = { \ +const struct sysfs_ops type ## _sysfs_ops = { \ .show = type ## _show, \ .store = type ## _store \ } @@ -195,6 +195,10 @@ read_attribute(extent_migrate_done); 
read_attribute(extent_migrate_raced); read_attribute(bucket_alloc_fail); +#define x(t, n, ...) read_attribute(t); +BCH_PERSISTENT_COUNTERS() +#undef x + rw_attribute(discard); rw_attribute(label); @@ -551,6 +555,47 @@ struct attribute *bch2_fs_files[] = { NULL }; +/* counters dir */ + +SHOW(bch2_fs_counters) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); + u64 counter = 0; + u64 counter_since_mount = 0; + + out->tabstops[0] = 32; + #define x(t, ...) \ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + pr_buf(out, "since mount:"); \ + pr_tab(out); \ + bch2_hprint(out, counter_since_mount << 9); \ + pr_newline(out); \ + \ + pr_buf(out, "since filesystem creation:"); \ + pr_tab(out); \ + bch2_hprint(out, counter << 9); \ + pr_newline(out); \ + } + BCH_PERSISTENT_COUNTERS() + #undef x + return 0; +} + +STORE(bch2_fs_counters) { + return 0; +} + +SYSFS_OPS(bch2_fs_counters); + +struct attribute *bch2_fs_counters_files[] = { +#define x(t, ...) \ + &sysfs_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; /* internal dir - just a wrapper */ SHOW(bch2_fs_internal) diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 525fd05d91f7..222cd5062702 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -10,28 +10,32 @@ struct attribute; struct sysfs_ops; extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_counters_files[]; extern struct attribute *bch2_fs_internal_files[]; extern struct attribute *bch2_fs_opts_dir_files[]; extern struct attribute *bch2_fs_time_stats_files[]; extern struct attribute *bch2_dev_files[]; -extern struct sysfs_ops bch2_fs_sysfs_ops; -extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern struct sysfs_ops bch2_dev_sysfs_ops; +extern const struct sysfs_ops bch2_fs_sysfs_ops; +extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; +extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern const struct sysfs_ops bch2_dev_sysfs_ops; int bch2_opts_create_sysfs_files(struct kobject *); #else static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_counters_files[] = {}; static struct attribute *bch2_fs_internal_files[] = {}; static struct attribute *bch2_fs_opts_dir_files[] = {}; static struct attribute *bch2_fs_time_stats_files[] = {}; static struct attribute *bch2_dev_files[] = {}; static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_counters_sysfs_ops; static const struct sysfs_ops bch2_fs_internal_sysfs_ops; static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -- cgit From 1cab5a82cc67a09705fbe0607e6ab751f6663524 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Apr 2022 13:13:57 -0400 Subject: bcachefs: Go RW before bch2_check_lrus() btree updates before going RW are expensive if they're in random order, since they use the list of keys for journal replay to insert, which is just a gap buffer. This patch improves the bucket invalidate path so that if bch2_check_lrus() hasn't finished it only prints warnings instead of doing an emergency shutdown, which means we can now set BCH_FS_MAY_GO_RW before bch2_check_lrus(). 
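To see why insertion order matters for a gap buffer: an insert that lands where the gap already sits costs nothing extra, while one that lands elsewhere has to memmove everything between the gap and the insertion point. A rough standalone illustration (simplified C, fixed capacity, no resizing - not the bcachefs implementation):

  #include <stdio.h>
  #include <string.h>

  #define CAP 16

  static int buf[CAP];
  static int gap_start = 0, gap_end = CAP;  /* [gap_start, gap_end) is the hole */

  /* insert val at logical position pos (capacity checks omitted for brevity) */
  static void gap_insert(int pos, int val)
  {
          if (pos < gap_start) {
                  /* gap is to the right of pos: move [pos, gap_start) to the far side */
                  memmove(buf + gap_end - (gap_start - pos), buf + pos,
                          (gap_start - pos) * sizeof(int));
                  gap_end  -= gap_start - pos;
                  gap_start = pos;
          } else if (pos > gap_start) {
                  /* gap is to the left of pos: pull elements after the gap leftwards */
                  memmove(buf + gap_start, buf + gap_end,
                          (pos - gap_start) * sizeof(int));
                  gap_end  += pos - gap_start;
                  gap_start = pos;
          }
          buf[gap_start++] = val;  /* in-order inserts hit this path with no memmove */
  }

  int main(void)
  {
          gap_insert(0, 10);
          gap_insert(1, 20);  /* sequential: gap is already in place */
          gap_insert(0, 5);   /* random: everything logically after pos 0 moves */

          for (int i = 0; i < gap_start; i++)
                  printf("%d ", buf[i]);
          for (int i = gap_end; i < CAP; i++)
                  printf("%d ", buf[i]);
          printf("\n");       /* prints: 5 10 20 */
          return 0;
  }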
Also, the filesystem state bits are reorganized a bit. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 36 +++++++++++++----- fs/bcachefs/bcachefs.h | 19 +++++----- fs/bcachefs/lru.c | 4 +- fs/bcachefs/recovery.c | 86 ++++++++++++++++++++++++------------------ 4 files changed, 90 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d9cf676da030..eb03b4135c3d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -382,7 +382,8 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - if (!a.v->io_time[READ]) { + if (!a.v->io_time[READ] && + test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { pr_buf(err, "cached bucket with read_time == 0"); return -EINVAL; } @@ -588,7 +589,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(old_a); new_lru = alloc_lru_idx(*new_a); @@ -1088,6 +1088,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, POS(ca->dev_idx, 0), 0); +next_lru: k = bch2_btree_iter_peek(&lru_iter); ret = bkey_err(k); if (ret) @@ -1096,9 +1097,20 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (!k.k || k.k->p.inode != ca->dev_idx) goto out; - if (bch2_trans_inconsistent_on(k.k->type != KEY_TYPE_lru, trans, - "non lru key in lru btree")) - goto out; + if (k.k->type != KEY_TYPE_lru) { + pr_buf(&buf, "non lru key in lru btree:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + bch2_btree_iter_advance(&lru_iter); + goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; + goto out; + } + } idx = k.k->p.offset; bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); @@ -1111,13 +1123,19 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (idx != alloc_lru_idx(a->v)) { pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); pr_buf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - goto out; + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + bch2_btree_iter_advance(&lru_iter); + goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; + goto out; + } } SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5dda57afa802..127323b677df 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -494,11 +494,6 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_CLEAN_SHUTDOWN, - BCH_FS_INITIAL_GC_DONE, - BCH_FS_INITIAL_GC_UNFIXED, - BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_MAY_GO_RW, BCH_FS_RW, @@ -508,16 +503,22 @@ enum { BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_CLEAN_SHUTDOWN, + + /* fsck passes: */ + BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ + BCH_FS_CHECK_LRUS_DONE, + BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, /* errors: */ 
BCH_FS_ERROR, BCH_FS_TOPOLOGY_ERROR, BCH_FS_ERRORS_FIXED, BCH_FS_ERRORS_NOT_FIXED, - - /* misc: */ - BCH_FS_NEED_ANOTHER_GC, - BCH_FS_DELETED_NODES, }; struct btree_debug { diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index fe9d15742947..ce23b38382f5 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -204,7 +204,9 @@ int bch2_check_lrus(struct bch_fs *c, bool initial) for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, bch2_check_lru_key(&trans, &iter, initial)); if (ret) break; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e2474ff99702..5831ab53a982 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -994,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) if (ret) return ret; - bkey_subvolume_init(&root_volume.k_i); root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; root_volume.v.flags = 0; @@ -1096,6 +1095,12 @@ int bch2_fs_recovery(struct bch_fs *c) } } + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); + ret = -EINVAL; + goto err; + } + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -1189,6 +1194,13 @@ use_clean: if (ret) goto err; + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + ret = read_btree_roots(c); if (ret) goto err; @@ -1211,12 +1223,7 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - /* - * If we're not running fsck, this ensures bch2_fsck_err() calls are - * instead interpreted as bch2_inconsistent_err() calls: - */ - if (!c->opts.fsck) - set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_stripes_heap_start(c); if (c->opts.fsck) { bool metadata_only = c->opts.norecovery; @@ -1228,6 +1235,8 @@ use_clean: goto err; bch_verbose(c, "done checking allocations"); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + bch_info(c, "checking need_discard and freespace btrees"); err = "error checking need_discard and freespace btrees"; ret = bch2_check_alloc_info(c); @@ -1235,55 +1244,60 @@ use_clean: goto err; bch_verbose(c, "done checking need_discard and freespace btrees"); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); + bch_info(c, "checking lrus"); err = "error checking lrus"; ret = bch2_check_lrus(c, true); if (ret) goto err; bch_verbose(c, "done checking lrus"); - } - bch2_stripes_heap_start(c); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); - - if (c->opts.norecovery) - goto out; - - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); - - err = "error initializing 
freespace"; - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - if (c->opts.fsck) { bch_info(c, "checking alloc to lru refs"); err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); if (ret) goto err; + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ret = bch2_check_lrus(c, true); if (ret) goto err; bch_verbose(c, "done checking alloc to lru refs"); + } else { + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + if (c->opts.norecovery) + goto out; + + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); } + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); -- cgit From 099989c1b230e0f36ee7146d1df948822c999f6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Apr 2022 00:34:58 -0400 Subject: bcachefs: Fix journal_iters_fix() journal_iters_fix() was incorrectly rewinding iterators past keys they had already returned, leading to those keys being double counted in the bch2_gc() path - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5831ab53a982..f54859b49416 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -147,7 +147,7 @@ static void journal_iters_fix(struct bch_fs *c) /* * If an iterator points one after the key we just inserted, - * and the key we just inserted compares >= the iterator's position, + * and the key we just inserted compares > the iterator's position, * decrement the iterator so it points at the key we just inserted: */ list_for_each_entry(iter, &c->journal_iters, journal.list) @@ -155,7 +155,7 @@ static void journal_iters_fix(struct bch_fs *c) iter->last && iter->b->c.btree_id == n->btree_id && iter->b->c.level == n->level && - bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0) + bpos_cmp(n->k->k.p, iter->unpacked.p) > 0) iter->journal.idx = keys->gap - 1; } -- cgit From ae21f74e3135efacf73d37919b5cc9ceadda7219 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Apr 2022 23:43:08 -0400 Subject: bcachefs: Improve invalid bkey error message Bkeys have gotten a lot bigger since this code was written and now are often formatted across multiple lines - while the reason a bkey is invalid will still be short and fit on a single line. This patch prints the error bfore the bkey, making it a bit more readable. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 33c54803e0a2..e86285c320ed 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -815,10 +815,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, printbuf_reset(&buf); if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey:\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - pr_buf(&buf, " \n"); + pr_buf(&buf, "invalid bkey: "); bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -1076,10 +1076,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bversion_cmp(u.k->version, MAX_VERSION))) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - pr_buf(&buf, "\n "); + pr_buf(&buf, "invalid bkey: "); bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); -- cgit From e320b42dfeb5797a618edbba5186060b3907ba89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Apr 2022 23:03:02 -0400 Subject: bcachefs: Fix extent merging When merging extents, we have to check that we won't overflow size fields in any CRC entries - but the check for this was wrong, because in the loop it was in we weren't keeping a pointer to the (packed, encoded) CRC field. Fix this by moving it to its own loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c56925d94bfe..d8f429ffe57c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -307,8 +307,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc.uncompressed_size + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; } -- cgit From ee4d17d0325c5806c7ef0f4b3c8604d5ebf65000 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Apr 2022 02:12:03 -0400 Subject: bcachefs: Put btree_trans_verify_sorted() behind debug_check_iterators This is pretty expensive, and we've tested sufficiently with it now that it doesn't need to be on by default. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cd85c3ad2ab7..b840035dca55 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2916,6 +2916,9 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; unsigned i; + if (!bch2_debug_check_iterators) + return; + trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { bch2_dump_trans_paths_updates(trans); -- cgit From 372c11125a2e07485fcd4cb08601f24d8a3bc3c6 Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Tue, 3 May 2022 16:50:57 -0600 Subject: bcachefs: Make bch_option compatible with Rust ffi Rust FFI lacks support for unnamed structs and unions. The space saved in bch_option is not enough to be significant. Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 863891dcb554..e15ffb07416b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -455,17 +455,9 @@ struct bch_option { enum opt_flags flags; u64 min, max; - union { - struct { - }; - struct { - const char * const *choices; - }; - struct { - int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - }; - }; + const char * const *choices; + int (*parse)(struct bch_fs *, const char *, u64 *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); const char *hint; const char *help; -- cgit From facc81479cab081cbcb962bfbe5d61f25230d013 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 May 2022 17:20:41 -0400 Subject: bcachefs: Delete bch_writepage Per Dave Chinner and the xfs folks, .writepage is no longer needed, and it's better not to define it if .writepages is the intended path. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 14 -------------- fs/bcachefs/fs-io.h | 1 - fs/bcachefs/fs.c | 1 - 3 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 256b3dd0d4aa..ad51483ad764 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1469,20 +1469,6 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc return ret; } -int bch2_writepage(struct page *page, struct writeback_control *wbc) -{ - struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); - int ret; - - ret = __bch2_writepage(page_folio(page), wbc, &w); - if (w.io) - bch2_writepage_do_io(&w); - - return ret; -} - /* buffered writes: */ int bch2_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 64b16b44e25a..af905331542d 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -15,7 +15,6 @@ int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, loff_t, unsigned); -int bch2_writepage(struct page *, struct writeback_control *); int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d8cd32b5d765..b2bc28d0cf05 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1113,7 +1113,6 @@ static const struct inode_operations bch_special_inode_operations = { }; static const struct address_space_operations bch_address_space_operations = { - .writepage = bch2_writepage, .read_folio = bch2_read_folio, .writepages = bch2_writepages, .readahead = bch2_readahead, -- cgit From d8a161ad5493016aa6eba8853990456aa78316c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 May 2022 06:58:51 -0400 Subject: bcachefs: LRU repair tweaks - Drop old unneeded parameter for whether we're in initial GC - which was from when btree updates had to be done differently before we went RW. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/lru.c | 6 +++--- fs/bcachefs/lru.h | 2 +- fs/bcachefs/recovery.c | 9 ++------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce23b38382f5..d278331776dd 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -130,7 +130,7 @@ int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, } static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter, bool initial) + struct btree_iter *lru_iter) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -193,7 +193,7 @@ fsck_err: return ret; } -int bch2_check_lrus(struct bch_fs *c, bool initial) +int bch2_check_lrus(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; @@ -207,7 +207,7 @@ int bch2_check_lrus(struct bch_fs *c, bool initial) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, initial)); + bch2_check_lru_key(&trans, &iter)); if (ret) break; } diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index bfe38a67e585..3decb7b1dde2 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -14,6 +14,6 @@ int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); -int bch2_check_lrus(struct bch_fs *, bool); +int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f54859b49416..87a4bced853a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1256,24 +1256,19 @@ use_clean: bch_info(c, "checking lrus"); err = "error checking lrus"; - ret = bch2_check_lrus(c, true); + ret = bch2_check_lrus(c); if (ret) goto err; bch_verbose(c, "done checking lrus"); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); bch_info(c, "checking alloc to lru refs"); err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); - if (ret) - goto err; - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); - - ret = bch2_check_lrus(c, true); if (ret) goto err; bch_verbose(c, "done checking alloc to lru refs"); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); } else { set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -- cgit From 232697ab9ded81d56afa8a2e47edb48deea5f9e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 14 May 2022 07:00:22 -0400 Subject: bcachefs: Switch to key_type_user, not logon The only difference key_type_logon and key_type_user is that key_type_logon keys can't be read by userspace. However, userspace has actually been adding keys to both the logon and user keychains, because userspace fsck requires the keychain interface - so we might as well just use user and drop the logon keychain. 
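For reference, the userspace half of this interface goes through the keyctl syscalls; a minimal sketch of adding a "user"-type key with the keyutils library (link with -lkeyutils; the description string and payload below are made up for illustration):

  #include <keyutils.h>
  #include <stdio.h>

  int main(void)
  {
          const char payload[] = "raw key material goes here";

          /* unlike "logon" keys, "user" keys can be read back from userspace,
           * which is what userspace fsck needs */
          key_serial_t id = add_key("user", "bcachefs:example",
                                    payload, sizeof(payload),
                                    KEY_SPEC_USER_KEYRING);
          if (id < 0) {
                  perror("add_key");
                  return 1;
          }

          printf("added key %d\n", id);
          return 0;
  }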
Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 425582f60d7a..50157b4013a5 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -424,7 +424,7 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) const struct user_key_payload *ukp; int ret; - keyring_key = request_key(&key_type_logon, key_description, NULL); + keyring_key = request_key(&key_type_user, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); -- cgit From c346def9af1d3890ee604905fb08d689e8383855 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 May 2022 22:37:01 -0400 Subject: bcachefs: Fix encryption path on arm flush_dcache_page() is not a noop on arm, but we were using virt_to_page() instead of vmalloc_to_page() for an address on the kernel stack - vmalloc memory, leading to an oops in flush_dcache_page(). Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 50157b4013a5..317efd047a46 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -116,7 +116,12 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, { struct scatterlist sg; - sg_init_one(&sg, buf, len); + sg_init_table(&sg, 1); + sg_set_page(&sg, + is_vmalloc_addr(buf) + ? vmalloc_to_page(buf) + : virt_to_page(buf), + len, offset_in_page(buf)); return do_encrypt_sg(tfm, nonce, &sg, len); } -- cgit From a8dea22703b18822662befa676e5c7bfef1f7759 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Wed, 25 May 2022 14:57:39 +1200 Subject: bcachefs: Rename group to label for remaining strings. 
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 8 ++++---- fs/bcachefs/super-io.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e15ffb07416b..dee75d7e6fe8 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -165,22 +165,22 @@ enum opt_type { OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or disk group for metadata writes") \ + "(target)", "Device or label for metadata writes") \ x(foreground_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ - "(target)", "Device or disk group for foreground writes") \ + "(target)", "Device or label for foreground writes") \ x(background_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ - "(target)", "Device or disk group to move data to in the background")\ + "(target)", "Device or label to move data to in the background")\ x(promote_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ - "(target)", "Device or disk group to promote data to on read")\ + "(target)", "Device or label to promote data to on read") \ x(erasure_code, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 56a6c925543a..54502e392dfc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1059,7 +1059,7 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, : "unknown"); pr_newline(out); - pr_buf(out, "Group:"); + pr_buf(out, "Label:"); pr_tab(out); if (BCH_MEMBER_GROUP(m)) { unsigned idx = BCH_MEMBER_GROUP(m) - 1; -- cgit From 11f5e595bf7cd11c395f0041cdd6448f238a5614 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 May 2022 21:34:11 -0400 Subject: bcachefs: Always print when doing journal replay in fsck This logging improvement helps see when the previous fsck pass has completed. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 87a4bced853a..2e782d5d968e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1246,7 +1246,7 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; ret = bch2_journal_replay(c); if (ret) -- cgit From 30525f68633740e071c0960c11c4380f1f6851af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 May 2022 13:10:39 -0400 Subject: bcachefs: Fix journal_keys_search() overhead Previously, on every btree_iter_peek() operation we were searching the journal keys, doing a full binary search - which was slow. This patch fixes that by saving our position in the journal keys, so that we only do a full binary search when moving our position backwards or a large jump forwards. 
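The pattern is: remember the index where the previous lookup left off, scan forward from it, and only redo the binary search when the position moves backwards (the cached index is reset) or the forward scan drags on too long. A standalone sketch of the same idea over a plain sorted array (simplified C; cursor, peek_ge() and the array are illustrative, not the journal_keys structures; as in the patch, an index of 0 doubles as "no cached position"):

  #include <stdio.h>
  #include <stddef.h>

  static const int keys[] = { 2, 5, 8, 13, 21, 34, 55 };
  static const size_t nr  = sizeof(keys) / sizeof(keys[0]);

  /* index of the first element >= pos (nr if there is none) */
  static size_t binary_search_ge(int pos)
  {
          size_t l = 0, r = nr;

          while (l < r) {
                  size_t m = l + (r - l) / 2;

                  if (keys[m] < pos)
                          l = m + 1;
                  else
                          r = m;
          }
          return l;
  }

  struct cursor {
          size_t idx;  /* cached index; 0 also means "not cached" */
          int    pos;  /* position of the previous lookup */
  };

  static const int *peek_ge(struct cursor *c, int pos)
  {
          unsigned iters = 0;

          /* moving backwards invalidates the cached index */
          if (pos < c->pos)
                  c->idx = 0;
          c->pos = pos;
  search:
          if (!c->idx)
                  c->idx = binary_search_ge(pos);

          while (c->idx < nr) {
                  if (keys[c->idx] >= pos)
                          return &keys[c->idx];

                  c->idx++;
                  /* forward scan is going nowhere: cut over to a fresh search */
                  if (++iters == 10) {
                          c->idx = 0;
                          goto search;
                  }
          }
          return NULL;
  }

  int main(void)
  {
          struct cursor c = { 0, 0 };
          const int *k;

          k = peek_ge(&c, 6);   /* full binary search: finds 8 */
          printf("%d\n", k ? *k : -1);
          k = peek_ge(&c, 20);  /* short forward scan from the cached index: finds 21 */
          printf("%d\n", k ? *k : -1);
          k = peek_ge(&c, 3);   /* moved backwards: falls back to binary search: finds 5 */
          printf("%d\n", k ? *k : -1);
          return 0;
  }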
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 +++++++++++++++++++----- fs/bcachefs/btree_iter.h | 3 +++ fs/bcachefs/btree_types.h | 4 ++++ fs/bcachefs/recovery.c | 54 +++++++++++++++++++++++++++++++---------------- fs/bcachefs/recovery.h | 4 +--- 5 files changed, 67 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b840035dca55..3ce0571651b5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2242,13 +2242,30 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) : NULL; } +struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct bkey_i *k; + + if (bpos_cmp(iter->path->pos, iter->journal_pos) < 0) + iter->journal_idx = 0; + + k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + iter->path->level, + iter->path->pos, + end_pos, + &iter->journal_idx); + + iter->journal_pos = k ? k->k.p : end_pos; + return k; +} + static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_journal_keys_peek_slot(trans->c, iter->btree_id, - iter->path->level, - iter->path->pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); if (k) { iter->k = k->k; @@ -2264,8 +2281,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, - iter->path->pos, + bch2_btree_journal_peek(trans, iter, k.k ? k.k->p : iter->path->l[0].b->key.k.p); if (next_journal) { @@ -3072,6 +3088,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->k.type = KEY_TYPE_deleted; iter->k.p = pos; iter->k.size = 0; + iter->journal_idx = 0; + iter->journal_pos = POS_MIN; iter->path = bch2_path_get(trans, btree_id, iter->pos, locks_want, depth, flags); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dc6f07492bc9..83587383a41f 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -175,6 +175,9 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo unsigned, unsigned, unsigned); inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_trans_verify_locks(struct btree_trans *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4f359ff79334..82c8c148c4bc 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -292,6 +292,10 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. 
*/ struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; }; struct btree_key_cache { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2e782d5d968e..edb04f65a148 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -86,9 +86,9 @@ static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t i return keys->d + idx_to_pos(keys, idx); } -size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = keys->nr, m; @@ -106,26 +106,42 @@ size_t bch2_journal_key_search(struct journal_keys *keys, BUG_ON(l && __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - return idx_to_pos(keys, l); + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); } struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, - struct bpos end_pos) + struct bpos end_pos, size_t *idx) { struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree_id, level, pos); - - while (idx < keys->size && - keys->d[idx].btree_id == btree_id && - keys->d[idx].level == level && - bpos_cmp(keys->d[idx].k->k.p, end_pos) <= 0) { - if (!keys->d[idx].overwritten) - return keys->d[idx].k; - - idx++; - if (idx == keys->gap) - idx += keys->size - keys->nr; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx < keys->nr && + (k = idx_to_key(keys, *idx), + k->btree_id == btree_id && + k->level == level && + bpos_cmp(k->k->k.p, end_pos) <= 0)) { + if (bpos_cmp(k->k->k.p, pos) >= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } } return NULL; @@ -134,7 +150,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos) { - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos); + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); } static void journal_iters_fix(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e05aac64185d..52db06b29310 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -28,10 +28,8 @@ struct btree_and_journal_iter { } last; }; -size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, - unsigned, struct bpos); struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos); + unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); -- cgit From c737267821c15f1678a9575f23fe2fee0c2d9053 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 May 2022 15:19:20 -0400 Subject: bcachefs: Print message on btree node read retry success Right now, we print an error message on btree node read error, and we print that we're retrying, but we don't explicitly say if the retry succeeded - this makes things a little clearer. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e86285c320ed..a1043492dbd9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1141,10 +1141,12 @@ static void btree_node_read_work(struct work_struct *work) struct bch_io_failures failed = { .nr = 0 }; struct printbuf buf = PRINTBUF; bool saw_error = false; + bool retry = false; bool can_retry; goto start; while (1) { + retry = true; bch_info(c, "retrying read"); ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); rb->have_ioref = bch2_dev_get_ioref(ca, READ); @@ -1174,8 +1176,11 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) + !bch2_btree_node_read_done(c, ca, b, can_retry)) { + if (retry) + bch_info(c, "retry success"); break; + } saw_error = true; -- cgit From 9b688da35072910ec205697d2f51226cd3fd9f90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 May 2022 16:21:01 -0400 Subject: bcachefs: Fix error checking in bch2_fs_alloc() One of the init calls had a ; instead of a ?:, and errors after that got dropped - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 159d47d129a2..8d02a8158520 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -780,7 +780,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c); + bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: bch2_fs_encryption_init(c) ?: -- cgit From 0fbf71f80d60c077f491f0ac97000c3a0c9be3aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 May 2022 11:38:48 -0400 Subject: bcachefs: bch2_trans_reset_updates() Factor out a new helper. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 +--------- fs/bcachefs/btree_update.h | 13 +++++++++++++ fs/bcachefs/btree_update_leaf.c | 8 +------- 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3ce0571651b5..0f1efc024878 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3186,20 +3186,12 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) */ void bch2_trans_begin(struct btree_trans *trans) { - struct btree_insert_entry *i; struct btree_path *path; - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); + bch2_trans_reset_updates(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - trans->extra_journal_res = 0; - trans->nr_updates = 0; trans->mem_top = 0; - trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; - if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; memset((void *) trans->fs_usage_deltas + diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a40f3460fd62..28f958577006 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -140,4 +140,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +static inline void bch2_trans_reset_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; + trans->extra_journal_entries.nr = 0; +} + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d84769353f65..1a84bdbfda0b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1177,13 +1177,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); out_reset: - trans_for_each_update(trans, i) - bch2_path_put(trans, i->path, true); - - trans->extra_journal_res = 0; - trans->nr_updates = 0; - trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; + bch2_trans_reset_updates(trans); if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; -- cgit From 636d4eef1eefe447deef134bdf8e34c979ff009e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Jun 2022 02:34:14 -0400 Subject: bcachefs: Fix memory corruption in encryption path When do_encrypt() was passed a vmalloc address and the buffer spanned more than a single page, we were encrypting/decrypting completely different pages than the ones intended. Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 317efd047a46..e9a444f75b93 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -114,15 +114,41 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, - is_vmalloc_addr(buf) - ? vmalloc_to_page(buf) - : virt_to_page(buf), - len, offset_in_page(buf)); - return do_encrypt_sg(tfm, nonce, &sg, len); + if (!is_vmalloc_addr(buf)) { + struct scatterlist sg; + + sg_init_table(&sg, 1); + sg_set_page(&sg, + is_vmalloc_addr(buf) + ? 
vmalloc_to_page(buf) + : virt_to_page(buf), + len, offset_in_page(buf)); + return do_encrypt_sg(tfm, nonce, &sg, len); + } else { + unsigned pages = buf_pages(buf, len); + struct scatterlist *sg; + size_t orig_len = len; + int ret, i; + + sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); + if (!sg) + return -ENOMEM; + + sg_init_table(sg, pages); + + for (i = 0; i < pages; i++) { + unsigned offset = offset_in_page(buf); + unsigned pg_len = min(len, PAGE_SIZE - offset); + + sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); + buf += pg_len; + len -= pg_len; + } + + ret = do_encrypt_sg(tfm, nonce, sg, orig_len); + kfree(sg); + return ret; + } } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -- cgit From 4a7a7ea1f59032e182b9faba06df61d6375a1f97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jun 2022 14:49:02 -0400 Subject: bcachefs: Add some missing error messages bch2_opt_parse() was failing to generate error messages in error path. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 385451ef865e..c4ccb42d7851 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -267,20 +267,32 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: ret = kstrtou64(val, 10, res); - if (ret < 0) + if (ret < 0 || (*res != 0 && *res != 1)) { + if (err) + pr_buf(err, "%s: must be bool", + opt->attr.name); return ret; + } break; case BCH_OPT_UINT: ret = opt->flags & OPT_HUMAN_READABLE ? bch2_strtou64_h(val, res) : kstrtou64(val, 10, res); - if (ret < 0) + if (ret < 0) { + if (err) + pr_buf(err, "%s: must be a number", + opt->attr.name); return ret; + } break; case BCH_OPT_STR: ret = match_string(opt->choices, -1, val); - if (ret < 0) + if (ret < 0) { + if (err) + pr_buf(err, "%s: invalid selection", + opt->attr.name); return ret; + } *res = ret; break; @@ -289,8 +301,12 @@ int bch2_opt_parse(struct bch_fs *c, return 0; ret = opt->parse(c, val, res); - if (ret < 0) + if (ret < 0) { + if (err) + pr_buf(err, "%s: parse error", + opt->attr.name); return ret; + } } return bch2_opt_validate(opt, *res, err); -- cgit From 43ddf4483491b3dff8e050f57b595c34822cb6de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Jun 2022 15:29:00 -0400 Subject: bcachefs: Refactor journal entry adding This takes copying the payload out of bch2_journal_add_entry(), which means we can use it for journal_transaction_name() - also prep work for journalling overwrites. 
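In other words, the helper is reduced to filling in the entry header (type, btree id, level, size in u64s) and returning a pointer to the entry, and each caller copies its own payload - a bkey for ordinary updates, a NUL-padded string for the transaction name. A toy sketch of that split (simplified C; struct entry and entry_init() are made-up stand-ins, not the actual jset_entry layout):

  #include <stdio.h>
  #include <string.h>
  #include <stdint.h>

  struct entry {
          uint16_t u64s;      /* payload length in 64-bit words */
          uint8_t  type;
          uint8_t  btree_id;
          uint8_t  level;
          uint64_t data[];    /* payload follows the header */
  };

  /* helper only initializes the header; the caller owns the payload copy */
  static struct entry *entry_init(void *buf, uint8_t type, uint8_t btree_id,
                                  uint8_t level, uint16_t u64s)
  {
          struct entry *e = buf;

          e->u64s     = u64s;
          e->type     = type;
          e->btree_id = btree_id;
          e->level    = level;
          return e;
  }

  int main(void)
  {
          uint64_t buf[32] = { 0 };

          /* a log-style entry: the payload is a NUL-padded string */
          struct entry *e = entry_init(buf, 1, 0, 0, 4);

          strncpy((char *) e->data, "example_transaction_name",
                  e->u64s * sizeof(uint64_t));

          printf("type %u, %u u64s, \"%s\"\n", e->type, e->u64s, (char *) e->data);
          return 0;
  }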
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 41 +++++++++++++++++------------------------ fs/bcachefs/journal.h | 37 +++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1a84bdbfda0b..df2b21245d00 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -309,25 +309,15 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, static noinline void journal_transaction_name(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned u64s = JSET_ENTRY_LOG_U64s - 1; - unsigned b, buflen = u64s * sizeof(u64); - - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 0; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - b = min_t(unsigned, strlen(trans->fn), buflen); - memcpy(l->d, trans->fn, b); - while (b < buflen) - l->d[b++] = '\0'; - - trans->journal_res.offset += JSET_ENTRY_LOG_U64s; - trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); } static inline enum btree_insert_ret @@ -416,10 +406,13 @@ static inline void do_btree_insert_one(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) && !(i->flags & BTREE_UPDATE_NOJOURNAL)) { - bch2_journal_add_keys(j, &trans->journal_res, - i->btree_id, - i->level, - i->k); + struct jset_entry *entry; + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -1127,7 +1120,7 @@ int __bch2_trans_commit(struct btree_trans *trans) trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) - trans->journal_u64s += JSET_ENTRY_LOG_U64s; + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 59453dcfa4e9..d3caa7ea7ce9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -199,9 +199,9 @@ journal_res_entry(struct journal *j, struct journal_res *res) return vstruct_idx(j->buf[res->idx].data, res->offset); } -static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, +static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, enum btree_id id, unsigned level, - const void *data, unsigned u64s) + unsigned u64s) { entry->u64s = cpu_to_le16(u64s); entry->btree_id = id; @@ -210,32 +210,33 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type entry->pad[0] = 0; entry->pad[1] = 0; entry->pad[2] = 0; - memcpy_u64s_small(entry->_data, data, u64s); - return jset_u64s(u64s); } -static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, - unsigned type, enum btree_id id, - unsigned level, +static inline 
unsigned journal_entry_set(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, const void *data, unsigned u64s) { - unsigned actual = journal_entry_set(journal_res_entry(j, res), - type, id, level, data, u64s); + unsigned ret = journal_entry_init(entry, type, id, level, u64s); + + memcpy_u64s_small(entry->_data, data, u64s); + return ret; +} + +static inline struct jset_entry * +bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, unsigned u64s) +{ + struct jset_entry *entry = journal_res_entry(j, res); + unsigned actual = journal_entry_init(entry, type, id, level, u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); res->offset += actual; res->u64s -= actual; -} - -static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, unsigned level, - const struct bkey_i *k) -{ - bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - id, level, k, k->k.u64s); + return entry; } static inline bool journal_entry_empty(struct jset *j) @@ -283,7 +284,7 @@ static inline void bch2_journal_res_put(struct journal *j, while (res->u64s) bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - 0, 0, NULL, 0); + 0, 0, 0); bch2_journal_buf_put(j, res->idx); -- cgit From cb685ce72cc7e91733d3197346bbfe61e8ad54eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Jun 2022 15:32:57 -0400 Subject: bcachefs: Also log overwrites in journal Lately we've been doing a lot of debugging by looking at the journal to see what was changed, and by what code path. This patch adds a new journal entry type for recording overwrites, so that we don't have to search backwards through the journal to see what was being overwritten in order to work out what the triggers were supposed to be doing. 
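In practice this means that, when journal_transaction_names is enabled, each update journals two entries back to back: a BCH_JSET_ENTRY_overwrite entry carrying the key being replaced, immediately followed by the usual BCH_JSET_ENTRY_btree_keys entry with the new key, with the commit path sizing the reservation for both. A small self-contained sketch of that journal layout, using toy types in place of the real jset_entry format:

    #include <stdio.h>

    enum entry_type { ENTRY_BTREE_KEYS, ENTRY_OVERWRITE };   /* toy tags */

    struct entry { enum entry_type type; int key; };

    /* journal one update: the old key first (when overwrite logging is on),
     * then the new key -- mirroring the order used at commit time */
    static int journal_update(struct entry *j, int n,
                              int old_key, int new_key, int log_overwrites)
    {
        if (log_overwrites)
            j[n++] = (struct entry) { ENTRY_OVERWRITE,  old_key };
        j[n++] = (struct entry) { ENTRY_BTREE_KEYS, new_key };
        return n;
    }

    int main(void)
    {
        struct entry j[8];
        int n = 0;

        n = journal_update(j, n, 10, 11, 1);
        n = journal_update(j, n, 20, 21, 1);

        /* a tool reading the journal now sees what each update replaced,
         * without searching backwards through older entries */
        for (int i = 0; i < n; i++)
            printf("%-10s %d\n",
                   j[i].type == ENTRY_OVERWRITE ? "overwrite" : "key", j[i].key);
        return 0;
    }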
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_update_leaf.c | 92 ++++++++++++++++++++++++----------------- fs/bcachefs/journal_io.c | 35 +++++++++++----- 3 files changed, 79 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d77a45041ff0..079ad93ab34e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1796,7 +1796,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(data_usage, 6) \ x(clock, 7) \ x(dev_usage, 8) \ - x(log, 9) + x(log, 9) \ + x(overwrite, 10) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index df2b21245d00..eac601d6a397 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -384,41 +384,6 @@ btree_key_can_insert_cached(struct btree_trans *trans, return -EINTR; } -static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - - EBUG_ON(trans->journal_res.ref != - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - - i->k->k.needs_whiteout = false; - - if (!i->cached) - btree_insert_key_leaf(trans, i); - else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, i->path, i->k); - else { - bch2_btree_key_cache_drop(trans, i->path); - return; - } - - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) && - !(i->flags & BTREE_UPDATE_NOJOURNAL)) { - struct jset_entry *entry; - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } -} - /* Triggers: */ static int run_one_mem_trigger(struct btree_trans *trans, @@ -729,8 +694,47 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } - trans_for_each_update(trans, i) - do_btree_insert_one(trans, i); + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + trans_for_each_update(trans, i) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) + btree_insert_key_leaf(trans, i); + else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, i->path, i->k); + else + bch2_btree_key_cache_drop(trans, i->path); + } return ret; } @@ -1134,13 +1138,23 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!btree_node_intent_locked(i->path, i->level)); + if (i->key_cache_already_flushed) + continue; + + /* we're going to journal the key being updated: */ u64s = jset_u64s(i->k->k.u64s); if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; - if (!(i->flags & 
BTREE_UPDATE_NOJOURNAL)) - trans->journal_u64s += u64s; + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + trans->journal_u64s += u64s; + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_journal_res) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 351d5d9d8225..163b18340fa1 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -212,7 +212,7 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned level, enum btree_id btree_id, - struct bkey_i *k, const char *type, + struct bkey_i *k, unsigned version, int big_endian, int write) { void *next = vstruct_next(entry); @@ -220,8 +220,8 @@ static int journal_validate_key(struct bch_fs *c, const char *where, int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in %s entry offset %zi/%u: k->u64s 0", - type, where, + "invalid key in %s at %s offset %zi/%u: k->u64s 0", + bch2_jset_entry_types[entry->type], where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); @@ -231,8 +231,8 @@ static int journal_validate_key(struct bch_fs *c, const char *where, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in %s entry offset %zi/%u: extends past end of journal entry", - type, where, + "invalid key in %s at %s offset %zi/%u: extends past end of journal entry", + bch2_jset_entry_types[entry->type], where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); @@ -241,8 +241,8 @@ static int journal_validate_key(struct bch_fs *c, const char *where, } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in %s entry offset %zi/%u: bad format %u", - type, where, + "invalid key in %s at %s offset %zi/%u: bad format %u", + bch2_jset_entry_types[entry->type], where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s), k->k.format)) { @@ -259,8 +259,8 @@ static int journal_validate_key(struct bch_fs *c, const char *where, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:", - type, where, + pr_buf(&buf, "invalid key in %s at %s offset %zi/%u:", + bch2_jset_entry_types[entry->type], where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s)); pr_newline(&buf); @@ -300,7 +300,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, int ret = journal_validate_key(c, where, entry, entry->level, entry->btree_id, - k, "key", version, big_endian, write); + k, version, big_endian, write); if (ret == FSCK_DELETED_KEY) continue; @@ -350,7 +350,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, } return journal_validate_key(c, where, entry, 1, entry->btree_id, k, - "btree root", version, big_endian, write); + version, big_endian, write); fsck_err: return ret; } @@ -612,6 +612,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "%.*s", bytes, l->d); } +static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write); +} + +static void 
journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, const char *, struct jset_entry *, unsigned, int, int); -- cgit From f2aa02657561b0e6d96089eb8ee44e4154f4acad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jun 2022 23:04:33 -0400 Subject: bcachefs: Fix for cmd_list_journal cmd_list_journal wasn't correctly listing the most recent journal entries as blacklisted - because in the recovery path when just reading the journal, we were failing to add those to the blacklist table. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index edb04f65a148..5fe7595d36be 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1175,9 +1175,6 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - if (c->opts.read_journal_only) - goto out; - if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); @@ -1208,6 +1205,13 @@ use_clean: } } + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + ret = bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; -- cgit From 576179021c90bea808ac12c491bd9b239ca80c2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jun 2022 21:59:34 -0400 Subject: bcachefs: Fix btree_and_journal_iter We had a bug where btree_and_journal_iter would return the same key twice - after deleting it (perhaps because it was present in both the btree and the journal?) This reworks btree_and_journal_iter to track the current position, much like btree_paths, which makes the logic considerably simpler and more robust. 
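The reworked iterator keeps an explicit iter->pos: peek advances both the btree and the journal sub-iterators until each is at or past pos, returns whichever key is smaller (with the journal taking precedence on ties, so journal keys shadow btree keys), skips deleted keys by advancing and retrying, and stops once past the node's max_key. A small self-contained sketch of that merge over two sorted integer arrays standing in for btree and journal keys (none of the real bkey machinery):

    #include <stdio.h>

    /* merge two sorted streams the way the reworked iterator does: track a
     * current position, advance each stream to >= pos, return the smaller
     * key (the "journal" stream wins ties), then move pos past it */
    struct iter {
        const int *btree, *journal;
        int nb, nj, ib, ij;
        int pos;
        int at_end;
    };

    static int peek(struct iter *it)
    {
        if (it->at_end)
            return -1;

        while (it->ib < it->nb && it->btree[it->ib] < it->pos)
            it->ib++;
        while (it->ij < it->nj && it->journal[it->ij] < it->pos)
            it->ij++;

        int have_b = it->ib < it->nb, have_j = it->ij < it->nj;

        if (!have_b && !have_j) {
            it->at_end = 1;
            return -1;
        }

        int k = have_j && (!have_b || it->journal[it->ij] <= it->btree[it->ib])
            ? it->journal[it->ij]
            : it->btree[it->ib];

        it->pos = k;
        return k;
    }

    static void advance(struct iter *it)
    {
        it->pos++;          /* stand-in for bpos_successor() */
    }

    int main(void)
    {
        const int btree[]   = { 1, 3, 5, 7 };
        const int journal[] = { 3, 4, 8 };
        struct iter it = { btree, journal, 4, 3, 0, 0, 0, 0 };

        for (int k; (k = peek(&it)) >= 0; advance(&it))
            printf("%d\n", k);  /* prints 1 3 4 5 7 8; journal wins the tie at 3 */
        return 0;
    }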
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 100 +++++++++++++++++++------------------------------ fs/bcachefs/recovery.h | 9 +---- 2 files changed, 40 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5fe7595d36be..d755da42d6c5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -159,21 +159,17 @@ static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ - struct journal_key *n = &keys->d[keys->gap - 1]; size_t gap_end = keys->gap + (keys->size - keys->nr); struct btree_and_journal_iter *iter; /* - * If an iterator points one after the key we just inserted, - * and the key we just inserted compares > the iterator's position, - * decrement the iterator so it points at the key we just inserted: + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: */ list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end && - iter->last && - iter->b->c.btree_id == n->btree_id && - iter->b->c.level == n->level && - bpos_cmp(n->k->k.p, iter->unpacked.p) > 0) + if (iter->journal.idx == gap_end) iter->journal.idx = keys->gap - 1; } @@ -312,7 +308,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) } } -struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->keys->d + iter->idx; @@ -320,13 +316,13 @@ struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) - return k->k; + return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); k = iter->keys->d + iter->idx; } - return NULL; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -358,71 +354,49 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) { - switch (iter->last) { - case none: - break; - case btree: - bch2_journal_iter_advance_btree(iter); - break; - case journal: - bch2_journal_iter_advance(&iter->journal); - break; - } - - iter->last = none; + if (!bpos_cmp(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); } struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c ret; - - while (1) { - struct bkey_s_c btree_k = - bch2_journal_iter_peek_btree(iter); - struct bkey_s_c journal_k = - bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; - if (btree_k.k && journal_k.k) { - int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_cmp(btree_k.k->p, iter->pos) < 0) + bch2_journal_iter_advance_btree(iter); - if (!cmp) - bch2_journal_iter_advance_btree(iter); + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_cmp(journal_k.k->p, iter->pos) < 0) + bch2_journal_iter_advance(&iter->journal); - iter->last = cmp < 0 ? 
btree : journal; - } else if (btree_k.k) { - iter->last = btree; - } else if (journal_k.k) { - iter->last = journal; - } else { - iter->last = none; - return bkey_s_c_null; - } + ret = journal_k.k && + (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) + ? journal_k + : btree_k; - ret = iter->last == journal ? journal_k : btree_k; + if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) + ret = bkey_s_c_null; - if (iter->b && - bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.idx = iter->journal.keys->nr; - iter->last = none; - return bkey_s_c_null; + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; } - - if (!bkey_deleted(ret.k)) - break; - - bch2_btree_and_journal_iter_advance(iter); + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; } return ret; } -struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) -{ - bch2_btree_and_journal_iter_advance(iter); - - return bch2_btree_and_journal_iter_peek(iter); -} - void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { bch2_journal_iter_exit(&iter->journal); @@ -440,6 +414,8 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter iter->node_iter = node_iter; bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; } /* diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 52db06b29310..8c0348e8b84c 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -20,12 +20,8 @@ struct btree_and_journal_iter { struct bkey unpacked; struct journal_iter journal; - - enum last_key_returned { - none, - btree, - journal, - } last; + struct bpos pos; + bool at_end; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, @@ -44,7 +40,6 @@ void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -- cgit From 652018d66190412669a898c2dc3e75073eac8679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jun 2022 22:09:11 -0400 Subject: bcachefs: Fix btree node read error path We were forgetting to clear the read_in_flight flag - oops. This also fixes it to not call bch2_fatal_error() before topology repair has had a chance to do its thing. 
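The corrected error path has to flag the node as errored, clear read_in_flight, and wake anyone waiting on that bit, and it only escalates to bch2_fatal_error() once topology repair has already had its chance. A hedged sketch of that ordering, with plain booleans standing in for the node flag bits and the wake-up left as a comment:

    #include <stdbool.h>
    #include <stdio.h>

    /* illustrative stand-ins for the btree node flags and filesystem state */
    struct node { bool read_error, read_in_flight; };
    struct fs   { bool topology_repair_done, fatal_error; };

    /* shape of the fixed error path: report, maybe go fatal, flag the error,
     * clear in-flight, and wake waiters */
    static void read_failed(struct fs *c, struct node *b)
    {
        fprintf(stderr, "btree node read error: no device to read from\n");

        if (c->topology_repair_done)
            c->fatal_error = true;  /* nothing left that could repair this */

        b->read_error = true;
        b->read_in_flight = false;  /* was being leaked before this fix */
        /* wake_up_bit(&b->flags, BTREE_NODE_read_in_flight) would go here */
    }

    int main(void)
    {
        struct fs c = { .topology_repair_done = false };
        struct node b = { .read_in_flight = true };

        read_failed(&c, &b);
        printf("error=%d in_flight=%d fatal=%d\n",
               b.read_error, b.read_in_flight, c.fatal_error);
        return 0;
    }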
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a1043492dbd9..9d4d6a65e70c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1483,23 +1483,32 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; - struct printbuf buf = PRINTBUF; int ret; - btree_pos_to_text(&buf, c, b); trace_btree_read(c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) - goto out; + return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); - if (bch2_fs_fatal_err_on(ret <= 0, c, - "btree node read error: no device to read from\n" - " at %s", buf.buf)) { + + if (ret <= 0) { + struct printbuf buf = PRINTBUF; + + pr_buf(&buf, "btree node read error: no device to read from\n at "); + btree_pos_to_text(&buf, c, b); + bch_err(c, "%s", buf.buf); + + if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + bch2_fatal_error(c); + set_btree_node_read_error(b); - goto out; + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + printbuf_exit(&buf); + return; } ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1541,8 +1550,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, else queue_work(c->io_complete_wq, &rb->work); } -out: - printbuf_exit(&buf); } int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, -- cgit From 401ec4db630802729f10d53ad995083ced98caca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Feb 2023 21:01:40 -0500 Subject: bcachefs: Printbuf rework This converts bcachefs to the modern printbuf interface/implementation, synced with the version to be submitted upstream. 
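The rename maps the old helpers onto the new interface: pr_buf becomes prt_printf, pr_newline/pr_tab/pr_char become prt_newline/prt_tab/prt_char, pr_indent_push/pr_indent_pop become printbuf_indent_add/printbuf_indent_sub, and the shared implementation moves into printbuf.c/printbuf.h. A toy userspace printbuf with the same calling shape, assuming nothing about the real implementation beyond a heap buffer that grows as it is printed into:

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct printbuf { char *buf; size_t size, pos; };
    #define PRINTBUF ((struct printbuf) { 0 })

    static void prt_printf(struct printbuf *out, const char *fmt, ...)
    {
        va_list args;

        /* measure, grow, then append -- the toy equivalent of the real helper */
        va_start(args, fmt);
        int len = vsnprintf(NULL, 0, fmt, args);
        va_end(args);
        if (len < 0)
            return;

        if (out->pos + len + 1 > out->size) {
            out->size = (out->pos + len + 1) * 2;
            out->buf = realloc(out->buf, out->size);
            if (!out->buf)
                abort();
        }

        va_start(args, fmt);
        vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
        va_end(args);
        out->pos += len;
    }

    static void prt_newline(struct printbuf *out)   { prt_printf(out, "\n"); }
    static void printbuf_exit(struct printbuf *out) { free(out->buf); out->buf = NULL; }

    int main(void)
    {
        struct printbuf buf = PRINTBUF;

        prt_printf(&buf, "capacity:\t%llu", 123ULL);
        prt_newline(&buf);
        printf("%s", buf.buf);
        printbuf_exit(&buf);
        return 0;
    }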
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_background.c | 40 ++-- fs/bcachefs/alloc_foreground.c | 22 +- fs/bcachefs/bkey_methods.c | 66 +++--- fs/bcachefs/bset.c | 4 +- fs/bcachefs/btree_cache.c | 20 +- fs/bcachefs/btree_gc.c | 4 +- fs/bcachefs/btree_io.c | 40 ++-- fs/bcachefs/btree_iter.c | 42 ++-- fs/bcachefs/btree_key_cache.c | 6 +- fs/bcachefs/btree_update_interior.c | 6 +- fs/bcachefs/btree_update_leaf.c | 8 +- fs/bcachefs/buckets.c | 22 +- fs/bcachefs/checksum.c | 12 +- fs/bcachefs/clock.c | 2 +- fs/bcachefs/counters.c | 10 +- fs/bcachefs/debug.c | 87 +++---- fs/bcachefs/dirent.c | 18 +- fs/bcachefs/disk_groups.c | 34 +-- fs/bcachefs/ec.c | 20 +- fs/bcachefs/extents.c | 54 ++--- fs/bcachefs/inode.c | 36 +-- fs/bcachefs/io.c | 12 +- fs/bcachefs/journal.c | 135 +++++------ fs/bcachefs/journal_io.c | 48 ++-- fs/bcachefs/journal_sb.c | 28 +-- fs/bcachefs/journal_seq_blacklist.c | 10 +- fs/bcachefs/lru.c | 4 +- fs/bcachefs/opts.c | 36 +-- fs/bcachefs/printbuf.c | 415 ++++++++++++++++++++++++++++++++++ fs/bcachefs/printbuf.h | 284 +++++++++++++++++++++++ fs/bcachefs/quota.c | 14 +- fs/bcachefs/rebalance.c | 47 ++-- fs/bcachefs/recovery.c | 4 +- fs/bcachefs/reflink.c | 14 +- fs/bcachefs/replicas.c | 40 ++-- fs/bcachefs/subvolume.c | 20 +- fs/bcachefs/super-io.c | 439 ++++++++++++++++++------------------ fs/bcachefs/super.c | 17 +- fs/bcachefs/sysfs.c | 97 ++++---- fs/bcachefs/tests.c | 4 +- fs/bcachefs/util.c | 218 ++++-------------- fs/bcachefs/util.h | 146 +++--------- fs/bcachefs/xattr.c | 18 +- 44 files changed, 1544 insertions(+), 1060 deletions(-) create mode 100644 fs/bcachefs/printbuf.c create mode 100644 fs/bcachefs/printbuf.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 76aecdc5df71..fada601c10db 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -47,6 +47,7 @@ bcachefs-y := \ move.o \ movinggc.o \ opts.o \ + printbuf.o \ quota.o \ rebalance.o \ recovery.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index eb03b4135c3d..bffbddbdacea 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -300,7 +300,7 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, /* allow for unknown fields */ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { - pr_buf(err, "incorrect value size (%zu < %u)", + prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); return -EINVAL; } @@ -314,7 +314,7 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_alloc_unpacked u; if (bch2_alloc_unpack_v2(&u, k)) { - pr_buf(err, "unpack error"); + prt_printf(err, "unpack error"); return -EINVAL; } @@ -327,7 +327,7 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_alloc_unpacked u; if (bch2_alloc_unpack_v3(&u, k)) { - pr_buf(err, "unpack error"); + prt_printf(err, "unpack error"); return -EINVAL; } @@ -340,14 +340,14 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { - pr_buf(err, "bad val size (%zu != %zu)", + prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); return -EINVAL; } if (rw == WRITE) { if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { - pr_buf(err, "invalid data type (got %u should be %u)", + prt_printf(err, "invalid data type (got %u should be 
%u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); return -EINVAL; } @@ -359,7 +359,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (a.v->dirty_sectors || a.v->cached_sectors || a.v->stripe) { - pr_buf(err, "empty data type free but have data"); + prt_printf(err, "empty data type free but have data"); return -EINVAL; } break; @@ -369,7 +369,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_user: case BCH_DATA_parity: if (!a.v->dirty_sectors) { - pr_buf(err, "data_type %s but dirty_sectors==0", + prt_printf(err, "data_type %s but dirty_sectors==0", bch2_data_types[a.v->data_type]); return -EINVAL; } @@ -378,19 +378,19 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (!a.v->cached_sectors || a.v->dirty_sectors || a.v->stripe) { - pr_buf(err, "data type inconsistency"); + prt_printf(err, "data type inconsistency"); return -EINVAL; } if (!a.v->io_time[READ] && test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { - pr_buf(err, "cached bucket with read_time == 0"); + prt_printf(err, "cached bucket with read_time == 0"); return -EINVAL; } break; case BCH_DATA_stripe: if (!a.v->stripe) { - pr_buf(err, "data_type %s but stripe==0", + prt_printf(err, "data_type %s but stripe==0", bch2_data_types[a.v->data_type]); return -EINVAL; } @@ -421,17 +421,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_alloc_to_v4(k, &a); - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu need_inc_gen %llu", + prt_printf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu need_inc_gen %llu", a.gen, a.oldest_gen, bch2_data_types[a.data_type], a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a), BCH_ALLOC_V4_NEED_INC_GEN(&a)); - pr_buf(out, " dirty_sectors %u", a.dirty_sectors); - pr_buf(out, " cached_sectors %u", a.cached_sectors); - pr_buf(out, " stripe %u", a.stripe); - pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); - pr_buf(out, " read_time %llu", a.io_time[READ]); - pr_buf(out, " write_time %llu", a.io_time[WRITE]); + prt_printf(out, " dirty_sectors %u", a.dirty_sectors); + prt_printf(out, " cached_sectors %u", a.cached_sectors); + prt_printf(out, " stripe %u", a.stripe); + prt_printf(out, " stripe_redundancy %u", a.stripe_redundancy); + prt_printf(out, " read_time %llu", a.io_time[READ]); + prt_printf(out, " write_time %llu", a.io_time[WRITE]); } int bch2_alloc_read(struct bch_fs *c) @@ -1098,7 +1098,7 @@ next_lru: goto out; if (k.k->type != KEY_TYPE_lru) { - pr_buf(&buf, "non lru key in lru btree:\n "); + prt_printf(&buf, "non lru key in lru btree:\n "); bch2_bkey_val_to_text(&buf, c, k); if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { @@ -1122,9 +1122,9 @@ next_lru: goto out; if (idx != alloc_lru_idx(a->v)) { - pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); + prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index d871e1f11f29..ef8f10a51489 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -299,7 +299,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc int ret; if (b < ca->mi.first_bucket 
|| b >= ca->mi.nbuckets) { - pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" " freespace key ", ca->mi.first_bucket, ca->mi.nbuckets); bch2_bkey_val_to_text(&buf, c, freespace_k); @@ -319,11 +319,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc bch2_alloc_to_v4(k, &a); if (genbits != (alloc_freespace_genbits(a) >> 56)) { - pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" " freespace key ", genbits, alloc_freespace_genbits(a) >> 56); bch2_bkey_val_to_text(&buf, c, freespace_k); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); @@ -332,10 +332,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } if (a.data_type != BCH_DATA_free) { - pr_buf(&buf, "non free bucket in freespace btree\n" + prt_printf(&buf, "non free bucket in freespace btree\n" " freespace key "); bch2_bkey_val_to_text(&buf, c, freespace_k); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); @@ -1381,7 +1381,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s %u:%llu:%u\n", + prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", ob - c->open_buckets, atomic_read(&ob->pin), bch2_data_types[ob->data_type], @@ -1406,17 +1406,17 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - pr_buf(out, "%lu: ", wp->write_point); - bch2_hprint(out, wp->sectors_allocated); + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); - pr_buf(out, " last wrote: "); + prt_printf(out, " last wrote: "); bch2_pr_time_units(out, sched_clock() - wp->last_used); for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - pr_buf(out, " %s: ", bch2_write_point_states[i]); + prt_printf(out, " %s: ", bch2_write_point_states[i]); bch2_pr_time_units(out, wp->time[i]); } - pr_newline(out); + prt_newline(out); } } diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 62ce1264731a..390ea41414bc 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -40,7 +40,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (bkey_val_bytes(k.k)) { - pr_buf(err, "incorrect value size (%zu != 0)", + prt_printf(err, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); return -EINVAL; } @@ -56,7 +56,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_cookie)); return -EINVAL; } @@ -84,7 +84,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); - pr_buf(out, "datalen %u: %*phN", + prt_printf(out, "datalen %u: %*phN", 
datalen, min(datalen, 32U), d.v->data); } @@ -97,7 +97,7 @@ static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (bkey_val_bytes(k.k)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_cookie)); return -EINVAL; } @@ -126,7 +126,7 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (k.k->type >= KEY_TYPE_MAX) { - pr_buf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); + prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); return -EINVAL; } @@ -202,30 +202,30 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { - pr_buf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); return -EINVAL; } if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { - pr_buf(err, "invalid key type for this btree (%s)", + prt_printf(err, "invalid key type for this btree (%s)", bch2_bkey_types[type]); return -EINVAL; } if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { if (k.k->size == 0) { - pr_buf(err, "size == 0"); + prt_printf(err, "size == 0"); return -EINVAL; } if (k.k->size > k.k->p.offset) { - pr_buf(err, "size greater than offset (%u > %llu)", + prt_printf(err, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); return -EINVAL; } } else { if (k.k->size) { - pr_buf(err, "size != 0"); + prt_printf(err, "size != 0"); return -EINVAL; } } @@ -233,20 +233,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type != BKEY_TYPE_btree && !btree_type_has_snapshots(type) && k.k->p.snapshot) { - pr_buf(err, "nonzero snapshot"); + prt_printf(err, "nonzero snapshot"); return -EINVAL; } if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && !k.k->p.snapshot) { - pr_buf(err, "snapshot == 0"); + prt_printf(err, "snapshot == 0"); return -EINVAL; } if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) { - pr_buf(err, "key at POS_MAX"); + prt_printf(err, "key at POS_MAX"); return -EINVAL; } @@ -265,12 +265,12 @@ int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, struct printbuf *err) { if (bpos_cmp(k.k->p, b->data->min_key) < 0) { - pr_buf(err, "key before start of btree node"); + prt_printf(err, "key before start of btree node"); return -EINVAL; } if (bpos_cmp(k.k->p, b->data->max_key) > 0) { - pr_buf(err, "key past end of btree node"); + prt_printf(err, "key past end of btree node"); return -EINVAL; } @@ -280,44 +280,44 @@ int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (!bpos_cmp(pos, POS_MIN)) - pr_buf(out, "POS_MIN"); + prt_printf(out, "POS_MIN"); else if (!bpos_cmp(pos, POS_MAX)) - pr_buf(out, "POS_MAX"); + prt_printf(out, "POS_MAX"); else if (!bpos_cmp(pos, SPOS_MAX)) - pr_buf(out, "SPOS_MAX"); + prt_printf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) - pr_buf(out, "U64_MAX"); + prt_printf(out, "U64_MAX"); else - pr_buf(out, "%llu", pos.inode); - pr_buf(out, ":"); + prt_printf(out, "%llu", pos.inode); + prt_printf(out, ":"); if (pos.offset == U64_MAX) - pr_buf(out, "U64_MAX"); + prt_printf(out, "U64_MAX"); else - pr_buf(out, "%llu", pos.offset); - pr_buf(out, ":"); + prt_printf(out, "%llu", pos.offset); + prt_printf(out, ":"); if (pos.snapshot == U32_MAX) - pr_buf(out, "U32_MAX"); + 
prt_printf(out, "U32_MAX"); else - pr_buf(out, "%u", pos.snapshot); + prt_printf(out, "%u", pos.snapshot); } } void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { - pr_buf(out, "u64s %u type ", k->u64s); + prt_printf(out, "u64s %u type ", k->u64s); if (k->type < KEY_TYPE_MAX) - pr_buf(out, "%s ", bch2_bkey_types[k->type]); + prt_printf(out, "%s ", bch2_bkey_types[k->type]); else - pr_buf(out, "%u ", k->type); + prt_printf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); - pr_buf(out, " len %u ver %llu", k->size, k->version.lo); + prt_printf(out, " len %u ver %llu", k->size, k->version.lo); } else { - pr_buf(out, "(null)"); + prt_printf(out, "(null)"); } } @@ -330,7 +330,7 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); } else { - pr_buf(out, "(invalid type %u)", k.k->type); + prt_printf(out, "(invalid type %u)", k.k->type); } } @@ -340,7 +340,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_to_text(out, k.k); if (bkey_val_bytes(k.k)) { - pr_buf(out, ": "); + prt_printf(out, ": "); bch2_val_to_text(out, c, k); } } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index adea3cea343b..f29fb9327cf7 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1575,12 +1575,12 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, switch (bkey_float(b, t, j)->exponent) { case BFLOAT_FAILED: uk = bkey_unpack_key(b, k); - pr_buf(out, + prt_printf(out, " failed unpacked at depth %u\n" "\t", ilog2(j)); bch2_bpos_to_text(out, uk.p); - pr_buf(out, "\n"); + prt_printf(out, "\n"); break; } } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6557fcb24b21..00eb69dd16e9 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -772,20 +772,20 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - pr_buf(&buf, + prt_printf(&buf, "btree node header doesn't match ptr\n" "btree %s level %u\n" "ptr: ", bch2_btree_ids[b->c.btree_id], b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - pr_buf(&buf, "\nheader: btree %s level %llu\n" + prt_printf(&buf, "\nheader: btree %s level %llu\n" "min ", bch2_btree_ids[BTREE_NODE_ID(b->data)], BTREE_NODE_LEVEL(b->data)); bch2_bpos_to_text(&buf, b->data->min_key); - pr_buf(&buf, "\nmax "); + prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); bch2_fs_inconsistent(c, "%s", buf.buf); @@ -1108,15 +1108,15 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, bch2_btree_keys_stats(b, &stats); - pr_buf(out, "l %u ", b->c.level); + prt_printf(out, "l %u ", b->c.level); bch2_bpos_to_text(out, b->data->min_key); - pr_buf(out, " - "); + prt_printf(out, " - "); bch2_bpos_to_text(out, b->data->max_key); - pr_buf(out, ":\n" + prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - pr_buf(out, "\n" + prt_printf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" @@ -1146,7 +1146,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) { - pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + 
prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 187787359316..123644ffe93c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -80,7 +80,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_topology_error(c); if (bkey_deleted(&prev->k->k)) { - pr_buf(&buf1, "start of node: "); + prt_printf(&buf1, "start of node: "); bch2_bpos_to_text(&buf1, node_start); } else { bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); @@ -264,7 +264,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, int ret = 0; if (!prev) { - pr_buf(&buf1, "start of node: "); + prt_printf(&buf1, "start of node: "); bch2_bpos_to_text(&buf1, b->data->min_key); } else { bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9d4d6a65e70c..598c30b7ab8b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -495,7 +495,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b) { - pr_buf(out, "%s level %u/%u\n ", + prt_printf(out, "%s level %u/%u\n ", bch2_btree_ids[b->c.btree_id], b->c.level, c->btree_roots[b->c.btree_id].level); @@ -507,17 +507,17 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node "); + prt_printf(out, "error validating btree node "); if (write) - pr_buf(out, "before write "); + prt_printf(out, "before write "); if (ca) - pr_buf(out, "on %s ", ca->name); - pr_buf(out, "at btree "); + prt_printf(out, "on %s ", ca->name); + prt_printf(out, "at btree "); btree_pos_to_text(out, c, b); - pr_buf(out, "\n node offset %u", b->written); + prt_printf(out, "\n node offset %u", b->written); if (i) - pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } enum btree_err_type { @@ -537,7 +537,7 @@ enum btree_validate_ret { struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ - pr_buf(&out, ": " msg, ##__VA_ARGS__); \ + prt_printf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -815,9 +815,9 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, printbuf_reset(&buf); if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey: "); + prt_printf(&buf, "invalid bkey: "); bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -837,9 +837,9 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); - pr_buf(&buf, "keys out of order: "); + prt_printf(&buf, "keys out of order: "); bch2_bkey_to_text(&buf, &up); - pr_buf(&buf, " > "); + prt_printf(&buf, " > "); bch2_bkey_to_text(&buf, u.k); bch2_dump_bset(c, b, i, 0); @@ -1076,9 +1076,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bversion_cmp(u.k->version, MAX_VERSION))) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey: "); + prt_printf(&buf, "invalid bkey: "); bch2_bkey_val_invalid(c, u.s_c, READ, 
&buf); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -1338,10 +1338,10 @@ fsck_err: sectors = vstruct_sectors(bne, c->block_bits); } - pr_buf(&buf, " %u-%u", offset, offset + sectors); + prt_printf(&buf, " %u-%u", offset, offset + sectors); if (bne && bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&buf, "*"); + prt_printf(&buf, "*"); offset += sectors; } @@ -1349,14 +1349,14 @@ fsck_err: bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) - pr_buf(&buf, " GAP"); + prt_printf(&buf, " GAP"); gap = true; sectors = vstruct_sectors(bne, c->block_bits); - pr_buf(&buf, " %u-%u", offset, offset + sectors); + prt_printf(&buf, " %u-%u", offset, offset + sectors); if (bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&buf, "*"); + prt_printf(&buf, "*"); } offset++; } @@ -1497,7 +1497,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (ret <= 0) { struct printbuf buf = PRINTBUF; - pr_buf(&buf, "btree node read error: no device to read from\n at "); + prt_str(&buf, "btree node read error: no device to read from\n at "); btree_pos_to_text(&buf, c, b); bch_err(c, "%s", buf.buf); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0f1efc024878..a2219c13aee5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -639,14 +639,14 @@ err: struct bkey uk = bkey_unpack_key(l->b, p); bch2_bkey_to_text(&buf2, &uk); } else { - pr_buf(&buf2, "(none)"); + prt_printf(&buf2, "(none)"); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); bch2_bkey_to_text(&buf3, &uk); } else { - pr_buf(&buf3, "(none)"); + prt_printf(&buf3, "(none)"); } panic("path should be %s key at level %u:\n" @@ -1821,30 +1821,30 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; - pr_buf(buf, "transaction updates for %s journal seq %llu", + prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); - pr_newline(buf); - pr_indent_push(buf, 2); + prt_newline(buf); + printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - pr_buf(buf, "update: btree=%s cached=%u %pS", + prt_printf(buf, "update: btree=%s cached=%u %pS", bch2_btree_ids[i->btree_id], i->cached, (void *) i->ip_allocated); - pr_newline(buf); + prt_newline(buf); - pr_buf(buf, " old "); + prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); - pr_newline(buf); + prt_newline(buf); - pr_buf(buf, " new "); + prt_printf(buf, " new "); bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); - pr_newline(buf); + prt_newline(buf); } - pr_indent_pop(buf, 2); + printbuf_indent_sub(buf, 2); } noinline __cold @@ -3365,7 +3365,7 @@ bch2_btree_path_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, bool cached) { - pr_buf(out, " l=%u %s:", + prt_printf(out, " l=%u %s:", _b->level, bch2_btree_ids[_b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } @@ -3396,28 +3396,28 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) if (!trans_has_locks(trans)) continue; - pr_buf(out, "%i %s\n", trans->pid, trans->fn); + prt_printf(out, "%i %s\n", trans->pid, trans->fn); trans_for_each_path(trans, path) { if (!path->nodes_locked) continue; - pr_buf(out, " path %u %c l=%u %s:", + prt_printf(out, " path %u %c l=%u %s:", 
path->idx, path->cached ? 'c' : 'b', path->level, bch2_btree_ids[path->btree_id]); bch2_bpos_to_text(out, path->pos); - pr_buf(out, "\n"); + prt_printf(out, "\n"); for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l)) { - pr_buf(out, " %s l=%u ", + prt_printf(out, " %s l=%u ", btree_node_intent_locked(path, l) ? "i" : "r", l); bch2_btree_path_node_to_text(out, (void *) path->l[l].b, path->cached); - pr_buf(out, "\n"); + prt_printf(out, "\n"); } } } @@ -3425,7 +3425,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { path = &trans->paths[trans->locking_path_idx]; - pr_buf(out, " locking path %u %c l=%u %c %s:", + prt_printf(out, " locking path %u %c l=%u %c %s:", trans->locking_path_idx, path->cached ? 'c' : 'b', trans->locking_level, @@ -3433,10 +3433,10 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); - pr_buf(out, " node "); + prt_printf(out, " node "); bch2_btree_path_node_to_text(out, (void *) b, path->cached); - pr_buf(out, "\n"); + prt_printf(out, "\n"); } } mutex_unlock(&c->btree_trans_lock); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d316e9b9ae02..bc0c8386e403 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -748,9 +748,9 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); - pr_buf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "nr_freed:\t%zu\n", c->nr_freed); + prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); + prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d4308f3c530b..a4f66e7cbb45 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1189,9 +1189,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, btree_node_type(b), WRITE, &buf) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "inserting invalid bkey\n "); + prt_printf(&buf, "inserting invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - pr_buf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), WRITE, &buf); bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); @@ -2163,7 +2163,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - pr_buf(out, "%p m %u w %u r %u j %llu\n", + prt_printf(out, "%p m %u w %u r %u j %llu\n", as, as->mode, as->nodes_written, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index eac601d6a397..3425e3c007dd 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -868,13 +868,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, rw, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey on insert from %s -> %ps", + prt_printf(&buf, "invalid bkey on insert from %s -> %ps", trans->fn, (void 
*) i->ip_allocated); - pr_newline(&buf); - pr_indent_push(&buf, 2); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_newline(&buf); + prt_newline(&buf); bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, rw, &buf); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 230344e0a534..eab01cc09337 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -201,26 +201,26 @@ void bch2_fs_usage_to_text(struct printbuf *out, { unsigned i; - pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); - pr_buf(out, "hidden:\t\t\t\t%llu\n", + prt_printf(out, "hidden:\t\t\t\t%llu\n", fs_usage->u.hidden); - pr_buf(out, "data:\t\t\t\t%llu\n", + prt_printf(out, "data:\t\t\t\t%llu\n", fs_usage->u.data); - pr_buf(out, "cached:\t\t\t\t%llu\n", + prt_printf(out, "cached:\t\t\t\t%llu\n", fs_usage->u.cached); - pr_buf(out, "reserved:\t\t\t%llu\n", + prt_printf(out, "reserved:\t\t\t%llu\n", fs_usage->u.reserved); - pr_buf(out, "nr_inodes:\t\t\t%llu\n", + prt_printf(out, "nr_inodes:\t\t\t%llu\n", fs_usage->u.nr_inodes); - pr_buf(out, "online reserved:\t\t%llu\n", + prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); for (i = 0; i < ARRAY_SIZE(fs_usage->u.persistent_reserved); i++) { - pr_buf(out, "%u replicas:\n", i + 1); - pr_buf(out, "\treserved:\t\t%llu\n", + prt_printf(out, "%u replicas:\n", i + 1); + prt_printf(out, "\treserved:\t\t%llu\n", fs_usage->u.persistent_reserved[i]); } @@ -228,9 +228,9 @@ void bch2_fs_usage_to_text(struct printbuf *out, struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - pr_buf(out, "\t"); + prt_printf(out, "\t"); bch2_replicas_entry_to_text(out, e); - pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); } } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index e9a444f75b93..e23b221cd377 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -493,13 +493,15 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { - char key_description[60]; - char uuid[40]; + struct printbuf key_description = PRINTBUF; + int ret; - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcachefs:%s", uuid); + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); - return __bch2_request_key(key_description, key); + ret = __bch2_request_key(key_description.buf, key); + printbuf_exit(&key_description); + return ret; } int bch2_decrypt_sb_key(struct bch_fs *c, diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 342797303415..00d0e6725910 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -162,7 +162,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - pr_buf(out, "%ps:\t%li\n", + prt_printf(out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index 6bf267dfd051..745f856e6d3e 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -36,13 +36,13 @@ void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, for (i = 0; i < nr; i++) { if (i < BCH_COUNTER_NR) - pr_buf(out, "%s", bch2_counter_names[i]); + prt_printf(out, "%s", bch2_counter_names[i]); else - 
pr_buf(out, "(unknown)"); + prt_printf(out, "(unknown)"); - pr_tab(out); - pr_buf(out, "%llu", le64_to_cpu(ctrs->d[i])); - pr_newline(out); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); + prt_newline(out); }; }; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 457fcee7d8e1..878f4e541f83 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -273,7 +273,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&i->buf, i->c, k); - pr_char(&i->buf, '\n'); + prt_char(&i->buf, '\n'); k = bch2_btree_iter_next(&iter); i->from = iter.pos; @@ -425,55 +425,56 @@ static const struct file_operations bfloat_failed_debug_ops = { static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b) { - out->tabstops[0] = 32; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); - pr_buf(out, "%px btree=%s l=%u ", + prt_printf(out, "%px btree=%s l=%u ", b, bch2_btree_ids[b->c.btree_id], b->c.level); - pr_newline(out); + prt_newline(out); - pr_indent_push(out, 2); + printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - pr_newline(out); - - pr_buf(out, "flags: "); - pr_tab(out); - bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); - pr_newline(out); - - pr_buf(out, "pcpu read locks: "); - pr_tab(out); - pr_buf(out, "%u", b->c.lock.readers != NULL); - pr_newline(out); - - pr_buf(out, "written:"); - pr_tab(out); - pr_buf(out, "%u", b->written); - pr_newline(out); - - pr_buf(out, "writes blocked:"); - pr_tab(out); - pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); - pr_newline(out); - - pr_buf(out, "will make reachable:"); - pr_tab(out); - pr_buf(out, "%lx", b->will_make_reachable); - pr_newline(out); - - pr_buf(out, "journal pin %px:", &b->writes[0].journal); - pr_tab(out); - pr_buf(out, "%llu", b->writes[0].journal.seq); - pr_newline(out); - - pr_buf(out, "journal pin %px:", &b->writes[1].journal); - pr_tab(out); - pr_buf(out, "%llu", b->writes[1].journal.seq); - pr_newline(out); - - pr_indent_pop(out, 2); + prt_newline(out); + + prt_printf(out, "flags: "); + prt_tab(out); + prt_bitflags(out, bch2_btree_node_flags, b->flags); + prt_newline(out); + + prt_printf(out, "pcpu read locks: "); + prt_tab(out); + prt_printf(out, "%u", b->c.lock.readers != NULL); + prt_newline(out); + + prt_printf(out, "written:"); + prt_tab(out); + prt_printf(out, "%u", b->written); + prt_newline(out); + + prt_printf(out, "writes blocked:"); + prt_tab(out); + prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); + prt_newline(out); + + prt_printf(out, "will make reachable:"); + prt_tab(out); + prt_printf(out, "%lx", b->will_make_reachable); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[0].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[0].journal.seq); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[1].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[1].journal.seq); + prt_newline(out); + + printbuf_indent_sub(out, 2); } static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 716c85062cea..0cbb765cde54 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -90,47 +90,47 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned len; if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + 
prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*d.v)); return -EINVAL; } len = bch2_dirent_name_bytes(d); if (!len) { - pr_buf(err, "empty name"); + prt_printf(err, "empty name"); return -EINVAL; } if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { - pr_buf(err, "value too big (%zu > %u)", + prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k),dirent_val_u64s(len)); return -EINVAL; } if (len > BCH_NAME_MAX) { - pr_buf(err, "dirent name too big (%u > %u)", + prt_printf(err, "dirent name too big (%u > %u)", len, BCH_NAME_MAX); return -EINVAL; } if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { - pr_buf(err, "invalid name"); + prt_printf(err, "invalid name"); return -EINVAL; } if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { - pr_buf(err, "invalid name"); + prt_printf(err, "invalid name"); return -EINVAL; } if (memchr(d.v->d_name, '/', len)) { - pr_buf(err, "invalid name"); + prt_printf(err, "invalid name"); return -EINVAL; } if (d.v->d_type != DT_SUBVOL && le64_to_cpu(d.v->d_inum) == d.k->p.inode) { - pr_buf(err, "dirent points to own directory"); + prt_printf(err, "dirent points to own directory"); return -EINVAL; } @@ -142,7 +142,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - pr_buf(out, "%.*s -> %llu type %s", + prt_printf(out, "%.*s -> %llu type %s", bch2_dirent_name_bytes(d), d.v->d_name, d.v->d_type != DT_SUBVOL diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 97eb21827cb3..33d2702e6849 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -39,13 +39,13 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, g = BCH_MEMBER_GROUP(m) - 1; if (g >= nr_groups) { - pr_buf(err, "disk %u has invalid label %u (have %u)", + prt_printf(err, "disk %u has invalid label %u (have %u)", i, g, nr_groups); return -EINVAL; } if (BCH_GROUP_DELETED(&groups->entries[g])) { - pr_buf(err, "disk %u has deleted label %u", i, g); + prt_printf(err, "disk %u has deleted label %u", i, g); return -EINVAL; } } @@ -61,7 +61,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, len = strnlen(g->label, sizeof(g->label)); if (!len) { - pr_buf(err, "label %u empty", i); + prt_printf(err, "label %u empty", i); return -EINVAL; } } @@ -76,7 +76,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, for (g = sorted; g + 1 < sorted + nr_groups; g++) if (!BCH_GROUP_DELETED(g) && !group_cmp(&g[0], &g[1])) { - pr_buf(err, "duplicate label %llu.%.*s", + prt_printf(err, "duplicate label %llu.%.*s", BCH_GROUP_PARENT(g), (int) sizeof(g->label), g->label); goto err; @@ -101,12 +101,12 @@ static void bch2_sb_disk_groups_to_text(struct printbuf *out, g < groups->entries + nr_groups; g++) { if (g != groups->entries) - pr_buf(out, " "); + prt_printf(out, " "); if (BCH_GROUP_DELETED(g)) - pr_buf(out, "[deleted]"); + prt_printf(out, "[deleted]"); else - pr_buf(out, "[parent %llu name %s]", + prt_printf(out, "[parent %llu name %s]", BCH_GROUP_PARENT(g), g->label); } } @@ -375,13 +375,13 @@ void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) v = path[--nr]; g = groups->entries + v; - pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) - pr_buf(out, "."); + prt_printf(out, "."); } return; inval: - pr_buf(out, "invalid label %u", v); + prt_printf(out, "invalid label %u", v); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ 
-454,7 +454,7 @@ void bch2_opt_target_to_text(struct printbuf *out, switch (t.type) { case TARGET_NULL: - pr_buf(out, "none"); + prt_printf(out, "none"); break; case TARGET_DEV: if (c) { @@ -466,12 +466,12 @@ void bch2_opt_target_to_text(struct printbuf *out, : NULL; if (ca && percpu_ref_tryget(&ca->io_ref)) { - pr_buf(out, "/dev/%pg", ca->disk_sb.bdev); + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); percpu_ref_put(&ca->io_ref); } else if (ca) { - pr_buf(out, "offline device %u", t.dev); + prt_printf(out, "offline device %u", t.dev); } else { - pr_buf(out, "invalid device %u", t.dev); + prt_printf(out, "invalid device %u", t.dev); } rcu_read_unlock(); @@ -480,11 +480,11 @@ void bch2_opt_target_to_text(struct printbuf *out, struct bch_member *m = mi->members + t.dev; if (bch2_dev_exists(sb, mi, t.dev)) { - pr_buf(out, "Device "); + prt_printf(out, "Device "); pr_uuid(out, m->uuid.b); - pr_buf(out, " (%u)", t.dev); + prt_printf(out, " (%u)", t.dev); } else { - pr_buf(out, "Bad device %u", t.dev); + prt_printf(out, "Bad device %u", t.dev); } } break; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ae33d3ea8ec1..faabaa64dcdb 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -108,23 +108,23 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; if (!bkey_cmp(k.k->p, POS_MIN)) { - pr_buf(err, "stripe at POS_MIN"); + prt_printf(err, "stripe at POS_MIN"); return -EINVAL; } if (k.k->p.inode) { - pr_buf(err, "nonzero inode field"); + prt_printf(err, "nonzero inode field"); return -EINVAL; } if (bkey_val_bytes(k.k) < sizeof(*s)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*s)); return -EINVAL; } if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { - pr_buf(err, "incorrect value size (%zu < %u)", + prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); return -EINVAL; } @@ -138,7 +138,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned i; - pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", s->algorithm, le16_to_cpu(s->sectors), s->nr_blocks - s->nr_redundant, @@ -147,7 +147,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, 1U << s->csum_granularity_bits); for (i = 0; i < s->nr_blocks; i++) - pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, + prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); } @@ -1622,7 +1622,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); - pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, h->data[i].blocks_nonempty, m->nr_blocks - m->nr_redundant, m->nr_redundant); @@ -1637,11 +1637,11 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - pr_buf(out, "target %u algo %u redundancy %u:\n", + prt_printf(out, "target %u algo %u redundancy %u:\n", h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, 
h->s->nr_data)); @@ -1650,7 +1650,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", s->nr_data, s->nr_parity, atomic_read(&s->pin)); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d8f429ffe57c..b0226118077a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -159,7 +159,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { - pr_buf(err, "value too big (%zu > %u)", + prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); return -EINVAL; } @@ -179,20 +179,20 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { - pr_buf(err, "value too small (%zu <= %zu)", + prt_printf(err, "value too small (%zu <= %zu)", bkey_val_bytes(k.k), sizeof(*bp.v)); return -EINVAL; } if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { - pr_buf(err, "value too big (%zu > %zu)", + prt_printf(err, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); return -EINVAL; } if (c->sb.version < bcachefs_metadata_version_snapshot && bp.v->min_key.snapshot) { - pr_buf(err, "invalid min_key.snapshot (%u != 0)", + prt_printf(err, "invalid min_key.snapshot (%u != 0)", bp.v->min_key.snapshot); return -EINVAL; } @@ -205,13 +205,13 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx written %u min_key %s", + prt_printf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors_written), BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); bch2_bpos_to_text(out, bp.v->min_key); - pr_buf(out, " "); + prt_printf(out, " "); bch2_bkey_ptrs_to_text(out, c, k); } @@ -383,13 +383,13 @@ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(*r.v)); return -EINVAL; } if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { - pr_buf(err, "invalid nr_replicas (%u)", + prt_printf(err, "invalid nr_replicas (%u)", r.v->nr_replicas); return -EINVAL; } @@ -402,7 +402,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - pr_buf(out, "generation %u replicas %u", + prt_printf(out, "generation %u replicas %u", le32_to_cpu(r.v->generation), r.v->nr_replicas); } @@ -970,7 +970,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bkey_extent_entry_for_each(ptrs, entry) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: @@ -980,19 +980,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, : NULL; if (!ca) { - pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, ptr->cached ? 
" cached" : ""); } else { u32 offset; u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, + prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, b, offset, ptr->gen, ptr->cached ? " cached" : ""); if (ca && ptr_stale(ca, ptr)) - pr_buf(out, " stale"); + prt_printf(out, " stale"); } break; case BCH_EXTENT_ENTRY_crc32: @@ -1000,7 +1000,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, @@ -1010,11 +1010,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; - pr_buf(out, "ec: idx %llu block %u", + prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; default: - pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } @@ -1036,33 +1036,33 @@ static int extent_ptr_invalid(const struct bch_fs *c, struct bch_dev *ca; if (!bch2_dev_exists2(c, ptr->dev)) { - pr_buf(err, "pointer to invalid device (%u)", ptr->dev); + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); return -EINVAL; } ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) { - pr_buf(err, "multiple pointers to same device (%u)", ptr->dev); + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); return -EINVAL; } bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); if (bucket >= ca->mi.nbuckets) { - pr_buf(err, "pointer past last bucket (%llu > %llu)", + prt_printf(err, "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); return -EINVAL; } if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { - pr_buf(err, "pointer before first bucket (%llu < %u)", + prt_printf(err, "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); return -EINVAL; } if (bucket_offset + size_ondisk > ca->mi.bucket_size) { - pr_buf(err, "pointer spans multiple buckets (%u + %u > %u)", + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); return -EINVAL; } @@ -1085,14 +1085,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { - pr_buf(err, "invalid extent entry type (got %u, max %u)", + prt_printf(err, "invalid extent entry type (got %u, max %u)", __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); return -EINVAL; } if (bkey_is_btree_ptr(k.k) && !extent_entry_is_ptr(entry)) { - pr_buf(err, "has non ptr field"); + prt_printf(err, "has non ptr field"); return -EINVAL; } @@ -1110,19 +1110,19 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (crc.offset + crc.live_size > crc.uncompressed_size) { - pr_buf(err, "checksum offset + key size > uncompressed size"); + prt_printf(err, "checksum offset + key size > uncompressed size"); return -EINVAL; } size_ondisk = crc.compressed_size; if (!bch2_checksum_type_valid(c, crc.csum_type)) { - pr_buf(err, "invalid checksum type"); + prt_printf(err, "invalid checksum type"); return -EINVAL; } if 
(crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { - pr_buf(err, "invalid compression type"); + prt_printf(err, "invalid compression type"); return -EINVAL; } @@ -1130,7 +1130,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) { - pr_buf(err, "incorrect nonce"); + prt_printf(err, "incorrect nonce"); return -EINVAL; } } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 28f4f192772f..6c0547151d50 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -298,40 +298,40 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) struct bch_inode_unpacked unpacked; if (k.k->p.inode) { - pr_buf(err, "nonzero k.p.inode"); + prt_printf(err, "nonzero k.p.inode"); return -EINVAL; } if (k.k->p.offset < BLOCKDEV_INODE_MAX) { - pr_buf(err, "fs inode in blockdev range"); + prt_printf(err, "fs inode in blockdev range"); return -EINVAL; } if (bch2_inode_unpack(k, &unpacked)){ - pr_buf(err, "invalid variable length fields"); + prt_printf(err, "invalid variable length fields"); return -EINVAL; } if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { - pr_buf(err, "invalid data checksum type (%u >= %u", + prt_printf(err, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); return -EINVAL; } if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { - pr_buf(err, "invalid data checksum type (%u >= %u)", + prt_printf(err, "invalid data checksum type (%u >= %u)", unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); return -EINVAL; } if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && unpacked.bi_nlink != 0) { - pr_buf(err, "flagged as unlinked but bi_nlink != 0"); + prt_printf(err, "flagged as unlinked but bi_nlink != 0"); return -EINVAL; } if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { - pr_buf(err, "subvolume root but not a directory"); + prt_printf(err, "subvolume root but not a directory"); return -EINVAL; } @@ -344,13 +344,13 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*inode.v)); return -EINVAL; } if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - pr_buf(err, "invalid str hash type (%llu >= %u)", + prt_printf(err, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); return -EINVAL; } @@ -364,13 +364,13 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*inode.v)); return -EINVAL; } if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - pr_buf(err, "invalid str hash type (%llu >= %u)", + prt_printf(err, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); return -EINVAL; } @@ -380,19 +380,19 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - pr_buf(out, "mode %o flags %x journal_seq %llu", + prt_printf(out, "mode %o flags %x journal_seq %llu", inode->bi_mode, inode->bi_flags, inode->bi_journal_seq); #define x(_name, _bits) \ - 
pr_buf(out, " "#_name " %llu", (u64) inode->_name); + prt_printf(out, " "#_name " %llu", (u64) inode->_name); BCH_INODE_FIELDS() #undef x } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - pr_buf(out, "inum: %llu ", inode->bi_inum); + prt_printf(out, "inum: %llu ", inode->bi_inum); __bch2_inode_unpacked_to_text(out, inode); } @@ -402,7 +402,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bch_inode_unpacked inode; if (bch2_inode_unpack(k, &inode)) { - pr_buf(out, "(unpack error)"); + prt_printf(out, "(unpack error)"); return; } @@ -413,12 +413,12 @@ int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (k.k->p.inode) { - pr_buf(err, "nonzero k.p.inode"); + prt_printf(err, "nonzero k.p.inode"); return -EINVAL; } if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); return -EINVAL; } @@ -431,7 +431,7 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); - pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); + prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } void bch2_inode_init_early(struct bch_fs *c, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f20891d48ca8..f41d7943fb4f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2094,18 +2094,18 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(c, &ptr), BTREE_ITER_CACHED); - pr_buf(&buf, "Attempting to read from stale dirty pointer:"); - pr_indent_push(&buf, 2); - pr_newline(&buf); + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); - pr_newline(&buf); + prt_newline(&buf); - pr_buf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret) { - pr_newline(&buf); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f8b57de31d93..a71bd1bb4066 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1254,90 +1254,91 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) u64 seq; unsigned i; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); out->atomic++; - out->tabstops[0] = 24; rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); - pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); - pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); - pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); - pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); - pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); - pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - pr_buf(out, "nr direct 
reclaim:\t%llu\n", j->nr_direct_reclaim); - pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); - pr_buf(out, "current entry:\t\t"); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(out, "error"); + prt_printf(out, "error"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(out, "closed"); + prt_printf(out, "closed"); break; default: - pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_newline(out); + prt_newline(out); for (seq = journal_cur_seq(j); seq >= journal_last_unwritten_seq(j); --seq) { i = seq & JOURNAL_BUF_MASK; - pr_buf(out, "unwritten entry:"); - pr_tab(out); - pr_buf(out, "%llu", seq); - pr_newline(out); - pr_indent_push(out, 2); + prt_printf(out, "unwritten entry:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); - pr_buf(out, "refcount:"); - pr_tab(out); - pr_buf(out, "%u", journal_state_count(s, i)); - pr_newline(out); + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); - pr_buf(out, "sectors:"); - pr_tab(out); - pr_buf(out, "%u", j->buf[i].sectors); - pr_newline(out); + prt_printf(out, "sectors:"); + prt_tab(out); + prt_printf(out, "%u", j->buf[i].sectors); + prt_newline(out); - pr_buf(out, "expires"); - pr_tab(out); - pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); - pr_newline(out); + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); + prt_newline(out); - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); } - pr_buf(out, + prt_printf(out, "replay done:\t\t%i\n", test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - pr_buf(out, "space:\n"); - pr_buf(out, "\tdiscarded\t%u:%u\n", + prt_printf(out, "space:\n"); + prt_printf(out, "\tdiscarded\t%u:%u\n", 
j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - pr_buf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "\tclean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - pr_buf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "\tclean\t\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - pr_buf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "\ttotal\t\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); @@ -1351,14 +1352,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - pr_buf(out, "dev %u:\n", i); - pr_buf(out, "\tnr\t\t%u\n", ja->nr); - pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", i); + prt_printf(out, "\tnr\t\t%u\n", ja->nr); + prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } rcu_read_unlock(); @@ -1390,31 +1391,31 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 pin_list = journal_seq_pin(j, *seq); - pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); - pr_newline(out); - pr_indent_push(out, 2); + prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + prt_newline(out); + printbuf_indent_add(out, 2); list_for_each_entry(pin, &pin_list->list, list) { - pr_buf(out, "\t%px %ps", pin, pin->flush); - pr_newline(out); + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); } list_for_each_entry(pin, &pin_list->key_cache_list, list) { - pr_buf(out, "\t%px %ps", pin, pin->flush); - pr_newline(out); + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); } if (!list_empty(&pin_list->flushed)) { - pr_buf(out, "flushed:"); - pr_newline(out); + prt_printf(out, "flushed:"); + prt_newline(out); } list_for_each_entry(pin, &pin_list->flushed, list) { - pr_buf(out, "\t%px %ps", pin, pin->flush); - pr_newline(out); + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); } - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); --out->atomic; spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 163b18340fa1..4b4a1d000219 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -259,15 +259,15 @@ static int journal_validate_key(struct bch_fs *c, const char *where, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid key in %s at %s 
offset %zi/%u:", + prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:", bch2_jset_entry_types[entry->type], where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s)); - pr_newline(&buf); - pr_indent_push(&buf, 2); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - pr_newline(&buf); + prt_newline(&buf); bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf); @@ -318,10 +318,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs vstruct_for_each(entry, k) { if (!first) { - pr_newline(out); - pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_newline(out); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); } - pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } @@ -396,7 +396,7 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs struct jset_entry_blacklist *bl = container_of(entry, struct jset_entry_blacklist, entry); - pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); + prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, @@ -431,7 +431,7 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ struct jset_entry_blacklist_v2 *bl = container_of(entry, struct jset_entry_blacklist_v2, entry); - pr_buf(out, "start=%llu end=%llu", + prt_printf(out, "start=%llu end=%llu", le64_to_cpu(bl->start), le64_to_cpu(bl->end)); } @@ -463,7 +463,7 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - pr_buf(out, "type=%s v=%llu", + prt_printf(out, "type=%s v=%llu", bch2_fs_usage_types[u->entry.btree_id], le64_to_cpu(u->v)); } @@ -497,7 +497,7 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_data_usage, entry); bch2_replicas_entry_to_text(out, &u->r); - pr_buf(out, "=%llu", le64_to_cpu(u->v)); + prt_printf(out, "=%llu", le64_to_cpu(u->v)); } static int journal_entry_clock_validate(struct bch_fs *c, @@ -532,7 +532,7 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, @@ -579,20 +579,20 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { if (i < BCH_DATA_NR) - pr_buf(out, " %s", bch2_data_types[i]); + prt_printf(out, " %s", bch2_data_types[i]); else - pr_buf(out, " (unknown data type %u)", i); - pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + prt_printf(out, " (unknown data type %u)", i); + prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - pr_buf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); + prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, @@ -609,7 +609,7 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - pr_buf(out, "%.*s", bytes, l->d); + prt_printf(out, "%.*s", bytes, l->d); } static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, @@ -655,10 +655,10 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { if (entry->type < BCH_JSET_ENTRY_NR) { - pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); } else { - pr_buf(out, "(unknown type %u)", entry->type); + prt_printf(out, "(unknown type %u)", entry->type); } } @@ -1039,8 +1039,8 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); if (i) - pr_buf(out, " "); - pr_buf(out, "%u:%u:%u (sector %llu)", + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", j->ptrs[i].dev, j->ptrs[i].bucket, j->ptrs[i].bucket_offset, @@ -1172,9 +1172,9 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (prev) { bch2_journal_ptrs_to_text(&buf1, c, prev); - pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); } else - pr_buf(&buf1, "(none)"); + prt_printf(&buf1, "(none)"); bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 6d984313d4b5..001cecec1291 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -41,25 +41,25 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, sort(b, nr, sizeof(u64), u64_cmp, NULL); if (!b[0]) { - pr_buf(err, "journal bucket at sector 0"); + prt_printf(err, "journal bucket at sector 0"); goto err; } if (b[0] < le16_to_cpu(m->first_bucket)) { - pr_buf(err, "journal bucket %llu before first bucket %u", + prt_printf(err, "journal bucket %llu before first bucket %u", b[0], le16_to_cpu(m->first_bucket)); goto err; } if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { - pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", b[nr - 1], 
le64_to_cpu(m->nbuckets)); goto err; } for (i = 0; i + 1 < nr; i++) if (b[i] == b[i + 1]) { - pr_buf(err, "duplicate journal buckets %llu", b[i]); + prt_printf(err, "duplicate journal buckets %llu", b[i]); goto err; } @@ -75,10 +75,10 @@ static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_journal *journal = field_to_type(f, journal); unsigned i, nr = bch2_nr_journal_buckets(journal); - pr_buf(out, "Buckets: "); + prt_printf(out, "Buckets: "); for (i = 0; i < nr; i++) - pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); - pr_newline(out); + prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal = { @@ -126,25 +126,25 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, sort(b, nr, sizeof(*b), u64_range_cmp, NULL); if (!b[0].start) { - pr_buf(err, "journal bucket at sector 0"); + prt_printf(err, "journal bucket at sector 0"); goto err; } if (b[0].start < le16_to_cpu(m->first_bucket)) { - pr_buf(err, "journal bucket %llu before first bucket %u", + prt_printf(err, "journal bucket %llu before first bucket %u", b[0].start, le16_to_cpu(m->first_bucket)); goto err; } if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { - pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); goto err; } for (i = 0; i + 1 < nr; i++) { if (b[i].end > b[i + 1].start) { - pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); goto err; } @@ -162,12 +162,12 @@ static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); - pr_buf(out, "Buckets: "); + prt_printf(out, "Buckets: "); for (i = 0; i < nr; i++) - pr_buf(out, " %llu-%llu", + prt_printf(out, " %llu-%llu", le64_to_cpu(journal->d[i].start), le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); - pr_newline(out); + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 3140c8731431..d9b4042a2e4a 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -201,7 +201,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, if (le64_to_cpu(e->start) >= le64_to_cpu(e->end)) { - pr_buf(err, "entry %u start >= end (%llu >= %llu)", + prt_printf(err, "entry %u start >= end (%llu >= %llu)", i, le64_to_cpu(e->start), le64_to_cpu(e->end)); return -EINVAL; } @@ -209,7 +209,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, if (i + 1 < nr && le64_to_cpu(e[0].end) > le64_to_cpu(e[1].start)) { - pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", + prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); return -EINVAL; } @@ -229,13 +229,13 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, for (i = bl->start; i < bl->start + nr; i++) { if (i != bl->start) - pr_buf(out, " "); + prt_printf(out, " "); - pr_buf(out, "%llu-%llu", + prt_printf(out, "%llu-%llu", le64_to_cpu(i->start), le64_to_cpu(i->end)); } - 
pr_newline(out); + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index d278331776dd..5a09b55006ff 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -14,7 +14,7 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, const struct bch_lru *lru = bkey_s_c_to_lru(k).v; if (bkey_val_bytes(k.k) < sizeof(*lru)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*lru)); return -EINVAL; } @@ -27,7 +27,7 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, { const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); + prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); } int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index c4ccb42d7851..5540d5d98d84 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -228,28 +228,28 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { if (err) - pr_buf(err, "%s: too small (min %llu)", + prt_printf(err, "%s: too small (min %llu)", opt->attr.name, opt->min); return -ERANGE; } if (opt->max && v >= opt->max) { if (err) - pr_buf(err, "%s: too big (max %llu)", + prt_printf(err, "%s: too big (max %llu)", opt->attr.name, opt->max); return -ERANGE; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { if (err) - pr_buf(err, "%s: not a multiple of 512", + prt_printf(err, "%s: not a multiple of 512", opt->attr.name); return -EINVAL; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) - pr_buf(err, "%s: must be a power of two", + prt_printf(err, "%s: must be a power of two", opt->attr.name); return -EINVAL; } @@ -269,8 +269,8 @@ int bch2_opt_parse(struct bch_fs *c, ret = kstrtou64(val, 10, res); if (ret < 0 || (*res != 0 && *res != 1)) { if (err) - pr_buf(err, "%s: must be bool", - opt->attr.name); + prt_printf(err, "%s: must be bool", + opt->attr.name); return ret; } break; @@ -280,8 +280,8 @@ int bch2_opt_parse(struct bch_fs *c, : kstrtou64(val, 10, res); if (ret < 0) { if (err) - pr_buf(err, "%s: must be a number", - opt->attr.name); + prt_printf(err, "%s: must be a number", + opt->attr.name); return ret; } break; @@ -289,8 +289,8 @@ int bch2_opt_parse(struct bch_fs *c, ret = match_string(opt->choices, -1, val); if (ret < 0) { if (err) - pr_buf(err, "%s: invalid selection", - opt->attr.name); + prt_printf(err, "%s: invalid selection", + opt->attr.name); return ret; } @@ -303,8 +303,8 @@ int bch2_opt_parse(struct bch_fs *c, ret = opt->parse(c, val, res); if (ret < 0) { if (err) - pr_buf(err, "%s: parse error", - opt->attr.name); + prt_printf(err, "%s: parse error", + opt->attr.name); return ret; } } @@ -319,28 +319,28 @@ void bch2_opt_to_text(struct printbuf *out, { if (flags & OPT_SHOW_MOUNT_STYLE) { if (opt->type == BCH_OPT_BOOL) { - pr_buf(out, "%s%s", + prt_printf(out, "%s%s", v ? 
"" : "no", opt->attr.name); return; } - pr_buf(out, "%s=", opt->attr.name); + prt_printf(out, "%s=", opt->attr.name); } switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: if (opt->flags & OPT_HUMAN_READABLE) - bch2_hprint(out, v); + prt_human_readable_u64(out, v); else - pr_buf(out, "%lli", v); + prt_printf(out, "%lli", v); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) - bch2_string_opt_to_text(out, opt->choices, v); + prt_string_option(out, opt->choices, v); else - pr_buf(out, "%s", opt->choices[v]); + prt_printf(out, "%s", opt->choices[v]); break; case BCH_OPT_FN: opt->to_text(out, c, sb, v); diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c new file mode 100644 index 000000000000..c41daa180682 --- /dev/null +++ b/fs/bcachefs/printbuf.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: LGPL-2.1+ +/* Copyright (C) 2022 Kent Overstreet */ + +#include +#include +#include +#include +#include + +#include "printbuf.h" + +static inline unsigned printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} + +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ + unsigned new_size; + char *buf; + + if (!out->heap_allocated) + return 0; + + /* Reserved space for terminating nul: */ + extra += 1; + + if (out->pos + extra < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + + /* + * Note: output buffer must be freeable with kfree(), it's not required + * that the user use printbuf_exit(). + */ + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} + +void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +/** + * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null + * terminated + */ +const char *bch2_printbuf_str(const struct printbuf *buf) +{ + /* + * If we've written to a printbuf then it's guaranteed to be a null + * terminated string - but if we haven't, then we might not have + * allocated a buffer at all: + */ + return buf->pos + ? buf->buf + : ""; +} + +/** + * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * against accidental use. 
+ */ +void bch2_printbuf_exit(struct printbuf *buf) +{ + if (buf->heap_allocated) { + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ + } +} + +void bch2_printbuf_tabstops_reset(struct printbuf *buf) +{ + buf->nr_tabstops = 0; +} + +void bch2_printbuf_tabstop_pop(struct printbuf *buf) +{ + if (buf->nr_tabstops) + --buf->nr_tabstops; +} + +/* + * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop + * + * @buf: printbuf to control + * @spaces: number of spaces from previous tabpstop + * + * In the future this function may allocate memory if setting more than + * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start + * of line. + */ +int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +{ + unsigned prev_tabstop = buf->nr_tabstops + ? buf->_tabstops[buf->nr_tabstops - 1] + : 0; + + if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) + return -EINVAL; + + buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; + buf->has_indent_or_tabstops = true; + return 0; +} + +/** + * printbuf_indent_add - add to the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces more spaces. + */ +void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + prt_chars(buf, ' ', spaces); + + buf->has_indent_or_tabstops = true; +} + +/** + * printbuf_indent_sub - subtract from the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to subtract from the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces less spaces. + */ +void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(spaces > buf->indent)) + spaces = buf->indent; + + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + printbuf_nul_terminate(buf); + } + buf->indent -= spaces; + + if (!buf->indent && !buf->nr_tabstops) + buf->has_indent_or_tabstops = false; +} + +void bch2_prt_newline(struct printbuf *buf) +{ + unsigned i; + + bch2_printbuf_make_room(buf, 1 + buf->indent); + + __prt_char(buf, '\n'); + + buf->last_newline = buf->pos; + + for (i = 0; i < buf->indent; i++) + __prt_char(buf, ' '); + + printbuf_nul_terminate(buf); + + buf->last_field = buf->pos; + buf->cur_tabstop = 0; +} + +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} + +static void __prt_tab(struct printbuf *out) +{ + int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); + + prt_chars(out, ' ', spaces); + + out->last_field = out->pos; + out->cur_tabstop++; +} + +/** + * prt_tab - Advance printbuf to the next tabstop + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by printing spaces. 
+ */ +void bch2_prt_tab(struct printbuf *out) +{ + if (WARN_ON(!cur_tabstop(out))) + return; + + __prt_tab(out); +} + +static void __prt_tab_rjust(struct printbuf *buf) +{ + unsigned move = buf->pos - buf->last_field; + int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); + + if (pad > 0) { + bch2_printbuf_make_room(buf, pad); + + if (buf->last_field + pad < buf->size) + memmove(buf->buf + buf->last_field + pad, + buf->buf + buf->last_field, + min(move, buf->size - 1 - buf->last_field - pad)); + + if (buf->last_field < buf->size) + memset(buf->buf + buf->last_field, ' ', + min((unsigned) pad, buf->size - buf->last_field)); + + buf->pos += pad; + printbuf_nul_terminate(buf); + } + + buf->last_field = buf->pos; + buf->cur_tabstop++; +} + +/** + * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * previous output + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by inserting spaces immediately after the + * previous tabstop, right justifying previously outputted text. + */ +void bch2_prt_tab_rjust(struct printbuf *buf) +{ + if (WARN_ON(!cur_tabstop(buf))) + return; + + __prt_tab_rjust(buf); +} + +/** + * prt_bytes_indented - Print an array of chars, handling embedded control characters + * + * @out: printbuf to output to + * @str: string to print + * @count: number of bytes to print + * + * The following contol characters are handled as so: + * \n: prt_newline newline that obeys current indent level + * \t: prt_tab advance to next tabstop + * \r: prt_tab_rjust advance to next tabstop, with right justification + */ +void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +{ + const char *unprinted_start = str; + const char *end = str + count; + + if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { + prt_bytes(out, str, count); + return; + } + + while (str != end) { + switch (*str) { + case '\n': + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + bch2_prt_newline(out); + break; + case '\t': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab(out); + } + break; + case '\r': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab_rjust(out); + } + break; + } + + str++; + } + + prt_bytes(out, unprinted_start, str - unprinted_start); +} + +/** + * prt_human_readable_u64 - Print out a u64 in human readable units + * + * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + */ +void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) +{ + bch2_printbuf_make_room(buf, 10); + buf->pos += string_get_size(v, 1, !buf->si_units, + buf->buf + buf->pos, + printbuf_remaining_size(buf)); +} + +/** + * prt_human_readable_s64 - Print out a s64 in human readable units + * + * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + */ +void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) +{ + if (v < 0) + prt_char(buf, '-'); + bch2_prt_human_readable_u64(buf, abs(v)); +} + +/** + * prt_units_u64 - Print out a u64 according to printbuf unit options + * + * Units are either raw (default), or human reabable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_u64(struct printbuf *out, u64 v) +{ + if (out->human_readable_units) + bch2_prt_human_readable_u64(out, v); + else + bch2_prt_printf(out, "%llu", v); +} + +/** + * prt_units_s64 - Print out 
a s64 according to printbuf unit options + * + * Units are either raw (default), or human reabable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_s64(struct printbuf *out, s64 v) +{ + if (v < 0) + prt_char(out, '-'); + bch2_prt_units_u64(out, abs(v)); +} + +void bch2_prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) + bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]); +} + +void bch2_prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); + flags ^= 1 << bit; + } +} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h new file mode 100644 index 000000000000..2e9939957833 --- /dev/null +++ b/fs/bcachefs/printbuf.h @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* Copyright (C) 2022 Kent Overstreet */ + +#ifndef _BCACHEFS_PRINTBUF_H +#define _BCACHEFS_PRINTBUF_H + +/* + * Printbufs: Simple strings for printing to, with optional heap allocation + * + * This code has provisions for use in userspace, to aid in making other code + * portable between kernelspace and userspace. + * + * Basic example: + * struct printbuf buf = PRINTBUF; + * + * prt_printf(&buf, "foo="); + * foo_to_text(&buf, foo); + * printk("%s", buf.buf); + * printbuf_exit(&buf); + * + * Or + * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) + * + * We can now write pretty printers instead of writing code that dumps + * everything to the kernel log buffer, and then those pretty-printers can be + * used by other code that outputs to kernel log, sysfs, debugfs, etc. + * + * Memory allocation: Outputing to a printbuf may allocate memory. This + * allocation is done with GFP_KERNEL, by default: use the newer + * memalloc_*_(save|restore) functions as needed. + * + * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations + * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. + * + * It's allowed to grab the output buffer and free it later with kfree() instead + * of using printbuf_exit(), if the user just needs a heap allocated string at + * the end. + * + * Memory allocation failures: We don't return errors directly, because on + * memory allocation failure we usually don't want to bail out and unwind - we + * want to print what we've got, on a best-effort basis. But code that does want + * to return -ENOMEM may check printbuf.allocation_failure. + * + * Indenting, tabstops: + * + * To aid is writing multi-line pretty printers spread across multiple + * functions, printbufs track the current indent level. + * + * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent + * level, respectively. + * + * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from + * start of line. Once set, prt_tab() will output spaces up to the next tabstop. + * prt_tab_rjust() will also advance the current line of text up to the next + * tabstop, but it does so by shifting text since the previous tabstop up to the + * next tabstop - right justifying it. + * + * Make sure you use prt_newline() instead of \n in the format string for indent + * level and tabstops to work corretly. 
+ * + * Output units: printbuf->units exists to tell pretty-printers how to output + * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as + * human readable bytes. prt_units() obeys it. + */ + +#include +#include + +enum printbuf_si { + PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ + PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ +}; + +#define PRINTBUF_INLINE_TABSTOPS 4 + +struct printbuf { + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; + unsigned indent; + /* + * If nonzero, allocations will be done with GFP_ATOMIC: + */ + u8 atomic; + bool allocation_failure:1; + bool heap_allocated:1; + enum printbuf_si si_units:1; + bool human_readable_units:1; + bool has_indent_or_tabstops:1; + bool suppress_indent_tabstop_handling:1; + u8 nr_tabstops; + + /* + * Do not modify directly: use printbuf_tabstop_add(), + * printbuf_tabstop_get() + */ + u8 cur_tabstop; + u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; +}; + +int bch2_printbuf_make_room(struct printbuf *, unsigned); +__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); +__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); +const char *bch2_printbuf_str(const struct printbuf *); +void bch2_printbuf_exit(struct printbuf *); + +void bch2_printbuf_tabstops_reset(struct printbuf *); +void bch2_printbuf_tabstop_pop(struct printbuf *); +int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); + +void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_sub(struct printbuf *, unsigned); + +void bch2_prt_newline(struct printbuf *); +void bch2_prt_tab(struct printbuf *); +void bch2_prt_tab_rjust(struct printbuf *); + +void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); +void bch2_prt_human_readable_u64(struct printbuf *, u64); +void bch2_prt_human_readable_s64(struct printbuf *, s64); +void bch2_prt_units_u64(struct printbuf *, u64); +void bch2_prt_units_s64(struct printbuf *, s64); +void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); +void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); + +/* Initializer for a heap allocated printbuf: */ +#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) + +/* Initializer a printbuf that points to an external buffer: */ +#define PRINTBUF_EXTERN(_buf, _size) \ +((struct printbuf) { \ + .buf = _buf, \ + .size = _size, \ +}) + +/* + * Returns size remaining of output buffer: + */ +static inline unsigned printbuf_remaining_size(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos : 0; +} + +/* + * Returns number of characters we can print to the output buffer - i.e. + * excluding the terminating nul: + */ +static inline unsigned printbuf_remaining(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos - 1 : 0; +} + +static inline unsigned printbuf_written(struct printbuf *out) +{ + return out->size ? 
min(out->pos, out->size - 1) : 0; +} + +/* + * Returns true if output was truncated: + */ +static inline bool printbuf_overflowed(struct printbuf *out) +{ + return out->pos >= out->size; +} + +static inline void printbuf_nul_terminate(struct printbuf *out) +{ + bch2_printbuf_make_room(out, 1); + + if (out->pos < out->size) + out->buf[out->pos] = 0; + else if (out->size) + out->buf[out->size - 1] = 0; +} + +/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ +static inline void __prt_char_reserved(struct printbuf *out, char c) +{ + if (printbuf_remaining(out)) + out->buf[out->pos] = c; + out->pos++; +} + +/* Doesn't nul terminate: */ +static inline void __prt_char(struct printbuf *out, char c) +{ + bch2_printbuf_make_room(out, 1); + __prt_char_reserved(out, c); +} + +static inline void prt_char(struct printbuf *out, char c) +{ + __prt_char(out, c); + printbuf_nul_terminate(out); +} + +static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) +{ + unsigned i, can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = c; + out->pos += n - can_print; +} + +static inline void prt_chars(struct printbuf *out, char c, unsigned n) +{ + bch2_printbuf_make_room(out, n); + __prt_chars_reserved(out, c, n); + printbuf_nul_terminate(out); +} + +static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) +{ + unsigned i, can_print; + + bch2_printbuf_make_room(out, n); + + can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = ((char *) b)[i]; + out->pos += n - can_print; + + printbuf_nul_terminate(out); +} + +static inline void prt_str(struct printbuf *out, const char *str) +{ + prt_bytes(out, str, strlen(str)); +} + +static inline void prt_str_indented(struct printbuf *out, const char *str) +{ + bch2_prt_bytes_indented(out, str, strlen(str)); +} + +static inline void prt_hex_byte(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_hi(byte)); + __prt_char_reserved(out, hex_asc_lo(byte)); + printbuf_nul_terminate(out); +} + +static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_upper_hi(byte)); + __prt_char_reserved(out, hex_asc_upper_lo(byte)); + printbuf_nul_terminate(out); +} + +/** + * printbuf_reset - re-use a printbuf without freeing and re-initializing it: + */ +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->indent = 0; + buf->nr_tabstops = 0; + buf->cur_tabstop = 0; +} + +/** + * printbuf_atomic_inc - mark as entering an atomic section + */ +static inline void printbuf_atomic_inc(struct printbuf *buf) +{ + buf->atomic++; +} + +/** + * printbuf_atomic_inc - mark as leaving an atomic section + */ +static inline void printbuf_atomic_dec(struct printbuf *buf) +{ + buf->atomic--; +} + +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 364ef6314651..d764dc7abfe8 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -23,7 +23,7 @@ static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, struct bch_sb_field_quota *q = field_to_type(f, quota); if (vstruct_bytes(&q->field) < sizeof(*q)) { - pr_buf(err, "wrong size (got %zu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); return -EINVAL; } @@ -38,17 +38,17 @@ static 
void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, unsigned qtyp, counter; for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { - pr_buf(out, "%s: flags %llx", + prt_printf(out, "%s: flags %llx", bch2_quota_types[qtyp], le64_to_cpu(q->q[qtyp].flags)); for (counter = 0; counter < Q_COUNTERS; counter++) - pr_buf(out, " %s timelimit %u warnlimit %u", + prt_printf(out, " %s timelimit %u warnlimit %u", bch2_quota_counters[counter], le32_to_cpu(q->q[qtyp].c[counter].timelimit), le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); - pr_newline(out); + prt_newline(out); } } @@ -61,13 +61,13 @@ int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { - pr_buf(err, "invalid quota type (%llu >= %u)", + prt_printf(err, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); return -EINVAL; } if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_quota)); return -EINVAL; } @@ -82,7 +82,7 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; for (i = 0; i < Q_COUNTERS; i++) - pr_buf(out, "%s hardlimit %llu softlimit %llu", + prt_printf(out, "%s hardlimit %llu softlimit %llu", bch2_quota_counters[i], le64_to_cpu(dq.v->c[i].hardlimit), le64_to_cpu(dq.v->c[i].softlimit)); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index babf98894e87..1724ae36c0f4 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -258,46 +258,47 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); - out->tabstops[0] = 20; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); - pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); - pr_tab(out); + prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); + prt_tab(out); - bch2_hprint(out, w.dev_most_full_work << 9); - pr_buf(out, "/"); - bch2_hprint(out, w.dev_most_full_capacity << 9); - pr_newline(out); + prt_human_readable_u64(out, w.dev_most_full_work << 9); + prt_printf(out, "/"); + prt_human_readable_u64(out, w.dev_most_full_capacity << 9); + prt_newline(out); - pr_buf(out, "total work:"); - pr_tab(out); + prt_printf(out, "total work:"); + prt_tab(out); - bch2_hprint(out, w.total_work << 9); - pr_buf(out, "/"); - bch2_hprint(out, c->capacity << 9); - pr_newline(out); + prt_human_readable_u64(out, w.total_work << 9); + prt_printf(out, "/"); + prt_human_readable_u64(out, c->capacity << 9); + prt_newline(out); - pr_buf(out, "rate:"); - pr_tab(out); - pr_buf(out, "%u", r->pd.rate.rate); - pr_newline(out); + prt_printf(out, "rate:"); + prt_tab(out); + prt_printf(out, "%u", r->pd.rate.rate); + prt_newline(out); switch (r->state) { case REBALANCE_WAITING: - pr_buf(out, "waiting"); + prt_printf(out, "waiting"); break; case REBALANCE_THROTTLED: - pr_buf(out, "throttled for %lu sec or ", + prt_printf(out, "throttled for %lu sec or ", (r->throttled_until_cputime - jiffies) / HZ); - bch2_hprint(out, + prt_human_readable_u64(out, (r->throttled_until_iotime - atomic64_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(out, " io"); + prt_printf(out, " io"); break; case REBALANCE_RUNNING: - pr_buf(out, "running"); + prt_printf(out, "running"); break; } - pr_newline(out); + prt_newline(out); } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 
d755da42d6c5..e6aed8d79bea 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -862,12 +862,12 @@ static int verify_superblock_clean(struct bch_fs *c, if (k1) bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); else - pr_buf(&buf1, "(none)"); + prt_printf(&buf1, "(none)"); if (k2) bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); else - pr_buf(&buf2, "(none)"); + prt_printf(&buf2, "(none)"); mustfix_fsck_err_on(!k1 || !k2 || IS_ERR(k1) || diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a53a3d53c8da..4e589c02a93b 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -31,14 +31,14 @@ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); if (bkey_val_bytes(p.k) != sizeof(*p.v)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(p.k), sizeof(*p.v)); return -EINVAL; } if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { - pr_buf(err, "idx < front_pad (%llu < %u)", + prt_printf(err, "idx < front_pad (%llu < %u)", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); return -EINVAL; } @@ -51,7 +51,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu front_pad %u back_pad %u", + prt_printf(out, "idx %llu front_pad %u back_pad %u", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); @@ -83,7 +83,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); if (bkey_val_bytes(r.k) < sizeof(*r.v)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(r.k), sizeof(*r.v)); return -EINVAL; } @@ -96,7 +96,7 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); bch2_bkey_ptrs_to_text(out, c, k); } @@ -134,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); return -EINVAL; } @@ -148,7 +148,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); - pr_buf(out, "refcount %llu datalen %u: %*phN", + prt_printf(out, "refcount %llu datalen %u: %*phN", le64_to_cpu(d.v->refcount), datalen, min(datalen, 32U), d.v->data); } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index e26642c01fd7..4ede807e2fb7 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -42,14 +42,14 @@ void bch2_replicas_entry_v0_to_text(struct printbuf *out, unsigned i; if (e->data_type < BCH_DATA_NR) - pr_buf(out, "%s", bch2_data_types[e->data_type]); + prt_printf(out, "%s", bch2_data_types[e->data_type]); else - pr_buf(out, "(invalid data type %u)", e->data_type); + prt_printf(out, "(invalid data type %u)", e->data_type); - pr_buf(out, ": %u [", e->nr_devs); + 
prt_printf(out, ": %u [", e->nr_devs); for (i = 0; i < e->nr_devs; i++) - pr_buf(out, i ? " %u" : "%u", e->devs[i]); - pr_buf(out, "]"); + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); } void bch2_replicas_entry_to_text(struct printbuf *out, @@ -58,14 +58,14 @@ void bch2_replicas_entry_to_text(struct printbuf *out, unsigned i; if (e->data_type < BCH_DATA_NR) - pr_buf(out, "%s", bch2_data_types[e->data_type]); + prt_printf(out, "%s", bch2_data_types[e->data_type]); else - pr_buf(out, "(invalid data type %u)", e->data_type); + prt_printf(out, "(invalid data type %u)", e->data_type); - pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); + prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (i = 0; i < e->nr_devs; i++) - pr_buf(out, i ? " %u" : "%u", e->devs[i]); - pr_buf(out, "]"); + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -76,7 +76,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, for_each_cpu_replicas_entry(r, e) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); @@ -841,27 +841,27 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, cpu_replicas_entry(cpu_r, i); if (e->data_type >= BCH_DATA_NR) { - pr_buf(err, "invalid data type in entry "); + prt_printf(err, "invalid data type in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } if (!e->nr_devs) { - pr_buf(err, "no devices in entry "); + prt_printf(err, "no devices in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } if (e->nr_required > 1 && e->nr_required >= e->nr_devs) { - pr_buf(err, "bad nr_required in entry "); + prt_printf(err, "bad nr_required in entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } for (j = 0; j < e->nr_devs; j++) if (!bch2_dev_exists(sb, mi, e->devs[j])) { - pr_buf(err, "invalid device %u in entry ", e->devs[j]); + prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); return -EINVAL; } @@ -873,7 +873,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); if (!memcmp(e, n, cpu_r->entry_size)) { - pr_buf(err, "duplicate replicas entry "); + prt_printf(err, "duplicate replicas entry "); bch2_replicas_entry_to_text(err, e); return -EINVAL; } @@ -908,12 +908,12 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, for_each_replicas_entry(r, e) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); } - pr_newline(out); + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas = { @@ -946,12 +946,12 @@ static void bch2_sb_replicas_v0_to_text(struct printbuf *out, for_each_replicas_entry(sb_r, e) { if (!first) - pr_buf(out, " "); + prt_printf(out, " "); first = false; bch2_replicas_entry_v0_to_text(out, e); } - pr_newline(out); + prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 81bdcb7795ae..8f41a06c3e11 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -17,7 +17,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + prt_printf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", 
BCH_SNAPSHOT_SUBVOL(s.v), BCH_SNAPSHOT_DELETED(s.v), le32_to_cpu(s.v->parent), @@ -34,12 +34,12 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || bkey_cmp(k.k->p, POS(0, 1)) < 0) { - pr_buf(err, "bad pos"); + prt_printf(err, "bad pos"); return -EINVAL; } if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { - pr_buf(err, "bad val size (%zu != %zu)", + prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); return -EINVAL; } @@ -48,19 +48,19 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, id = le32_to_cpu(s.v->parent); if (id && id <= k.k->p.offset) { - pr_buf(err, "bad parent node (%u <= %llu)", + prt_printf(err, "bad parent node (%u <= %llu)", id, k.k->p.offset); return -EINVAL; } if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { - pr_buf(err, "children not normalized"); + prt_printf(err, "children not normalized"); return -EINVAL; } if (s.v->children[0] && s.v->children[0] == s.v->children[1]) { - pr_buf(err, "duplicate child nodes"); + prt_printf(err, "duplicate child nodes"); return -EINVAL; } @@ -68,7 +68,7 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, id = le32_to_cpu(s.v->children[i]); if (id >= k.k->p.offset) { - pr_buf(err, "bad child node (%u >= %llu)", + prt_printf(err, "bad child node (%u >= %llu)", id, k.k->p.offset); return -EINVAL; } @@ -750,12 +750,12 @@ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, { if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { - pr_buf(err, "invalid pos"); + prt_printf(err, "invalid pos"); return -EINVAL; } if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { - pr_buf(err, "incorrect value size (%zu != %zu)", + prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); return -EINVAL; } @@ -768,7 +768,7 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - pr_buf(out, "root %llu snapshot id %u", + prt_printf(out, "root %llu snapshot id %u", le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 54502e392dfc..48ad158637e5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -217,23 +217,23 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { - pr_buf(out, "Not a bcachefs superblock layout"); + prt_printf(out, "Not a bcachefs superblock layout"); return -EINVAL; } if (layout->layout_type != 0) { - pr_buf(out, "Invalid superblock layout type %u", + prt_printf(out, "Invalid superblock layout type %u", layout->layout_type); return -EINVAL; } if (!layout->nr_superblocks) { - pr_buf(out, "Invalid superblock layout: no superblocks"); + prt_printf(out, "Invalid superblock layout: no superblocks"); return -EINVAL; } if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { - pr_buf(out, "Invalid superblock layout: too many superblocks"); + prt_printf(out, "Invalid superblock layout: too many superblocks"); return -EINVAL; } @@ -245,7 +245,7 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out offset = le64_to_cpu(layout->sb_offset[i]); if (offset < prev_offset + max_sectors) { - pr_buf(out, "Invalid superblock layout: superblocks overlap\n" + prt_printf(out, 
"Invalid superblock layout: superblocks overlap\n" " (sb %u ends at %llu next starts at %llu", i - 1, prev_offset + max_sectors, offset); return -EINVAL; @@ -273,63 +273,63 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, : version; if (version >= bcachefs_metadata_version_max) { - pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min < bcachefs_metadata_version_min) { - pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min > version) { - pr_buf(out, "Bad minimum version %u, greater than version field %u", + prt_printf(out, "Bad minimum version %u, greater than version field %u", version_min, version); return -EINVAL; } if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - pr_buf(out, "Filesystem has incompatible features"); + prt_printf(out, "Filesystem has incompatible features"); return -EINVAL; } block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { - pr_buf(out, "Block size too big (got %u, max %u)", + prt_printf(out, "Block size too big (got %u, max %u)", block_size, PAGE_SECTORS); return -EINVAL; } if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { - pr_buf(out, "Bad user UUID (got zeroes)"); + prt_printf(out, "Bad user UUID (got zeroes)"); return -EINVAL; } if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { - pr_buf(out, "Bad intenal UUID (got zeroes)"); + prt_printf(out, "Bad intenal UUID (got zeroes)"); return -EINVAL; } if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { - pr_buf(out, "Bad number of member devices %u (max %u)", + prt_printf(out, "Bad number of member devices %u (max %u)", sb->nr_devices, BCH_SB_MEMBERS_MAX); return -EINVAL; } if (sb->dev_idx >= sb->nr_devices) { - pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", + prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", sb->dev_idx, sb->nr_devices); return -EINVAL; } if (!sb->time_precision || le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { - pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", + prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", le32_to_cpu(sb->time_precision), NSEC_PER_SEC); return -EINVAL; } @@ -352,7 +352,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, opt_id); - pr_buf(out, "Invalid option "); + prt_printf(out, "Invalid option "); ret = bch2_opt_validate(opt, v, out); if (ret) return ret; @@ -368,13 +368,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, vstruct_for_each(sb, f) { if (!f->u64s) { - pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", + prt_printf(out, "Invalid superblock: optional with size 0 (type %u)", le32_to_cpu(f->type)); return -EINVAL; } if (vstruct_next(f) > vstruct_last(sb)) { - pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", le32_to_cpu(f->type)); return -EINVAL; } @@ -383,7 +383,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, /* members 
must be validated first: */ mi = bch2_sb_get_members(sb); if (!mi) { - pr_buf(out, "Invalid superblock: member info area missing"); + prt_printf(out, "Invalid superblock: member info area missing"); return -EINVAL; } @@ -539,13 +539,13 @@ reread: ret = submit_bio_wait(sb->bio); if (ret) { - pr_buf(err, "IO error: %i", ret); + prt_printf(err, "IO error: %i", ret); return ret; } if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - pr_buf(err, "Not a bcachefs superblock"); + prt_printf(err, "Not a bcachefs superblock"); return -EINVAL; } @@ -555,13 +555,13 @@ reread: : version; if (version >= bcachefs_metadata_version_max) { - pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } if (version_min < bcachefs_metadata_version_min) { - pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); return -EINVAL; } @@ -569,7 +569,7 @@ reread: bytes = vstruct_bytes(sb->sb); if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { - pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", bytes, 512UL << sb->sb->layout.sb_max_size_bits); return -EINVAL; } @@ -581,7 +581,7 @@ reread: } if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { - pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -EINVAL; } @@ -590,7 +590,7 @@ reread: null_nonce(), sb->sb); if (bch2_crc_cmp(csum, sb->sb->csum)) { - pr_buf(err, "bad checksum"); + prt_printf(err, "bad checksum"); return -EINVAL; } @@ -641,12 +641,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts, ret = bch2_sb_realloc(sb, 0); if (ret) { - pr_buf(&err, "error allocating memory for superblock"); + prt_printf(&err, "error allocating memory for superblock"); goto err; } if (bch2_fs_init_fault("read_super")) { - pr_buf(&err, "dynamic fault"); + prt_printf(&err, "dynamic fault"); ret = -EFAULT; goto err; } @@ -676,7 +676,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, ret = submit_bio_wait(sb->bio); if (ret) { - pr_buf(&err, "IO error: %i", ret); + prt_printf(&err, "IO error: %i", ret); goto err; } @@ -702,7 +702,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, got_super: if (le16_to_cpu(sb->sb->block_size) << 9 < bdev_logical_block_size(sb->bdev)) { - pr_buf(&err, "block size (%u) smaller than device block size (%u)", + prt_printf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); ret = -EINVAL; @@ -954,7 +954,7 @@ static int bch2_sb_members_validate(struct bch_sb *sb, if ((void *) (mi->members + sb->nr_devices) > vstruct_end(&mi->field)) { - pr_buf(err, "too many devices for section size"); + prt_printf(err, "too many devices for section size"); return -EINVAL; } @@ -965,28 +965,28 @@ static int bch2_sb_members_validate(struct bch_sb *sb, continue; if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", i, le64_to_cpu(m->nbuckets), LONG_MAX); return -EINVAL; } if 
(le64_to_cpu(m->nbuckets) - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); return -EINVAL; } if (le16_to_cpu(m->bucket_size) < le16_to_cpu(sb->block_size)) { - pr_buf(err, "device %u: bucket size %u smaller than block size %u", + prt_printf(err, "device %u: bucket size %u smaller than block size %u", i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); return -EINVAL; } if (le16_to_cpu(m->bucket_size) < BCH_SB_BTREE_NODE_SIZE(sb)) { - pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); return -EINVAL; } @@ -1011,97 +1011,96 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, if (!bch2_member_exists(m)) continue; - pr_buf(out, "Device:"); - pr_tab(out); - pr_buf(out, "%u", i); - pr_newline(out); + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); - pr_indent_push(out, 2); + printbuf_indent_add(out, 2); - pr_buf(out, "UUID:"); - pr_tab(out); + prt_printf(out, "UUID:"); + prt_tab(out); pr_uuid(out, m->uuid.b); - pr_newline(out); - - pr_buf(out, "Size:"); - pr_tab(out); - pr_units(out, device_size, device_size << 9); - pr_newline(out); - - pr_buf(out, "Bucket size:"); - pr_tab(out); - pr_units(out, bucket_size, bucket_size << 9); - pr_newline(out); - - pr_buf(out, "First bucket:"); - pr_tab(out); - pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); - pr_newline(out); - - pr_buf(out, "Buckets:"); - pr_tab(out); - pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); - pr_newline(out); - - pr_buf(out, "Last mount:"); - pr_tab(out); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); if (m->last_mount) pr_time(out, le64_to_cpu(m->last_mount)); else - pr_buf(out, "(never)"); - pr_newline(out); + prt_printf(out, "(never)"); + prt_newline(out); - pr_buf(out, "State:"); - pr_tab(out); - pr_buf(out, "%s", + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ? 
bch2_member_states[BCH_MEMBER_STATE(m)] : "unknown"); - pr_newline(out); + prt_newline(out); - pr_buf(out, "Label:"); - pr_tab(out); + prt_printf(out, "Label:"); + prt_tab(out); if (BCH_MEMBER_GROUP(m)) { unsigned idx = BCH_MEMBER_GROUP(m) - 1; if (idx < disk_groups_nr(gi)) - pr_buf(out, "%s (%u)", + prt_printf(out, "%s (%u)", gi->entries[idx].label, idx); else - pr_buf(out, "(bad disk labels section)"); + prt_printf(out, "(bad disk labels section)"); } else { - pr_buf(out, "(none)"); + prt_printf(out, "(none)"); } - pr_newline(out); + prt_newline(out); - pr_buf(out, "Data allowed:"); - pr_tab(out); + prt_printf(out, "Data allowed:"); + prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(m)) - bch2_flags_to_text(out, bch2_data_types, - BCH_MEMBER_DATA_ALLOWED(m)); + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); else - pr_buf(out, "(none)"); - pr_newline(out); + prt_printf(out, "(none)"); + prt_newline(out); - pr_buf(out, "Has data:"); - pr_tab(out); + prt_printf(out, "Has data:"); + prt_tab(out); if (data_have) - bch2_flags_to_text(out, bch2_data_types, data_have); + prt_bitflags(out, bch2_data_types, data_have); else - pr_buf(out, "(none)"); - pr_newline(out); + prt_printf(out, "(none)"); + prt_newline(out); - pr_buf(out, "Discard:"); - pr_tab(out); - pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); - pr_newline(out); + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); + prt_newline(out); - pr_buf(out, "Freespace initialized:"); - pr_tab(out); - pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); - pr_newline(out); + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_newline(out); - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); } } @@ -1119,13 +1118,13 @@ static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - pr_buf(err, "wrong size (got %zu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), sizeof(*crypt)); return -EINVAL; } if (BCH_CRYPT_KDF_TYPE(crypt)) { - pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); return -EINVAL; } @@ -1137,14 +1136,14 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - pr_newline(out); - pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - pr_newline(out); - pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - pr_newline(out); - pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - pr_newline(out); + prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { @@ -1361,7 +1360,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - pr_buf(err, "wrong size (got %zu should be %zu)", + prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&clean->field), sizeof(*clean)); return 
-EINVAL; } @@ -1375,10 +1374,10 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); struct jset_entry *entry; - pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); - pr_newline(out); - pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - pr_newline(out); + prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); for (entry = clean->start; entry != vstruct_end(&clean->field); @@ -1388,7 +1387,7 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, continue; bch2_journal_entry_to_text(out, NULL, entry); - pr_newline(out); + prt_newline(out); } } @@ -1416,10 +1415,10 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); if (ret) { - pr_buf(err, "Invalid superblock section %s: %s", + prt_printf(err, "Invalid superblock section %s: %s", bch2_sb_fields[type], field_err.buf); - pr_newline(err); + prt_newline(err); bch2_sb_field_to_text(err, sb, f); } @@ -1434,21 +1433,21 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ? bch2_sb_field_ops[type] : NULL; - if (!out->tabstops[0]) - out->tabstops[0] = 32; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); if (ops) - pr_buf(out, "%s", bch2_sb_fields[type]); + prt_printf(out, "%s", bch2_sb_fields[type]); else - pr_buf(out, "(unknown field %u)", type); + prt_printf(out, "(unknown field %u)", type); - pr_buf(out, " (size %zu):", vstruct_bytes(f)); - pr_newline(out); + prt_printf(out, " (size %zu):", vstruct_bytes(f)); + prt_newline(out); if (ops && ops->to_text) { - pr_indent_push(out, 2); + printbuf_indent_add(out, 2); bch2_sb_field_ops[type]->to_text(out, sb, f); - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); } } @@ -1456,25 +1455,23 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) { unsigned i; - pr_buf(out, "Type: %u", l->layout_type); - pr_newline(out); + prt_printf(out, "Type: %u", l->layout_type); + prt_newline(out); - pr_buf(out, "Superblock max size: "); - pr_units(out, - 1 << l->sb_max_size_bits, - 512 << l->sb_max_size_bits); - pr_newline(out); + prt_str(out, "Superblock max size: "); + prt_units_u64(out, 512 << l->sb_max_size_bits); + prt_newline(out); - pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); - pr_newline(out); + prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); + prt_newline(out); - pr_buf(out, "Offsets: "); + prt_str(out, "Offsets: "); for (i = 0; i < l->nr_superblocks; i++) { if (i) - pr_buf(out, ", "); - pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + prt_str(out, ", "); + prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); } - pr_newline(out); + prt_newline(out); } void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, @@ -1485,8 +1482,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, u64 fields_have = 0; unsigned nr_devices = 0; - if (!out->tabstops[0]) - out->tabstops[0] = 32; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); mi = bch2_sb_get_members(sb); if (mi) { @@ -1498,87 +1495,85 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, nr_devices += bch2_member_exists(m); } - pr_buf(out, "External UUID:"); - pr_tab(out); + prt_printf(out, "External UUID:"); + prt_tab(out); pr_uuid(out, sb->user_uuid.b); - 
pr_newline(out); + prt_newline(out); - pr_buf(out, "Internal UUID:"); - pr_tab(out); + prt_printf(out, "Internal UUID:"); + prt_tab(out); pr_uuid(out, sb->uuid.b); - pr_newline(out); - - pr_buf(out, "Device index:"); - pr_tab(out); - pr_buf(out, "%u", sb->dev_idx); - pr_newline(out); - - pr_buf(out, "Label:"); - pr_tab(out); - pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); - pr_newline(out); - - pr_buf(out, "Version:"); - pr_tab(out); - pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); - pr_newline(out); - - pr_buf(out, "Oldest version on disk:"); - pr_tab(out); - pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); - pr_newline(out); - - pr_buf(out, "Created:"); - pr_tab(out); + prt_newline(out); + + prt_str(out, "Device index:"); + prt_tab(out); + prt_printf(out, "%u", sb->dev_idx); + prt_newline(out); + + prt_str(out, "Label:"); + prt_tab(out); + prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); + prt_newline(out); + + prt_str(out, "Version:"); + prt_tab(out); + prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + prt_newline(out); + + prt_printf(out, "Oldest version on disk:"); + prt_tab(out); + prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + prt_newline(out); + + prt_printf(out, "Created:"); + prt_tab(out); if (sb->time_base_lo) pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else - pr_buf(out, "(not set)"); - pr_newline(out); - - pr_buf(out, "Sequence number:"); - pr_tab(out); - pr_buf(out, "%llu", le64_to_cpu(sb->seq)); - pr_newline(out); - - pr_buf(out, "Superblock size:"); - pr_tab(out); - pr_buf(out, "%zu", vstruct_bytes(sb)); - pr_newline(out); - - pr_buf(out, "Clean:"); - pr_tab(out); - pr_buf(out, "%llu", BCH_SB_CLEAN(sb)); - pr_newline(out); - - pr_buf(out, "Devices:"); - pr_tab(out); - pr_buf(out, "%u", nr_devices); - pr_newline(out); - - pr_buf(out, "Sections:"); + prt_printf(out, "(not set)"); + prt_newline(out); + + prt_printf(out, "Sequence number:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(sb->seq)); + prt_newline(out); + + prt_printf(out, "Superblock size:"); + prt_tab(out); + prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_newline(out); + + prt_printf(out, "Clean:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); + prt_newline(out); + + prt_printf(out, "Devices:"); + prt_tab(out); + prt_printf(out, "%u", nr_devices); + prt_newline(out); + + prt_printf(out, "Sections:"); vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); - pr_tab(out); - bch2_flags_to_text(out, bch2_sb_fields, fields_have); - pr_newline(out); - - pr_buf(out, "Features:"); - pr_tab(out); - bch2_flags_to_text(out, bch2_sb_features, - le64_to_cpu(sb->features[0])); - pr_newline(out); - - pr_buf(out, "Compat features:"); - pr_tab(out); - bch2_flags_to_text(out, bch2_sb_compat, - le64_to_cpu(sb->compat[0])); - pr_newline(out); - - pr_newline(out); - pr_buf(out, "Options:"); - pr_newline(out); - pr_indent_push(out, 2); + prt_tab(out); + prt_bitflags(out, bch2_sb_fields, fields_have); + prt_newline(out); + + prt_printf(out, "Features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); + prt_newline(out); + + prt_printf(out, "Compat features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "Options:"); + prt_newline(out); + printbuf_indent_add(out, 2); { enum bch_opt_id id; @@ -1588,29 +1583,29 @@ void 
bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, id); - pr_buf(out, "%s:", opt->attr.name); - pr_tab(out); + prt_printf(out, "%s:", opt->attr.name); + prt_tab(out); bch2_opt_to_text(out, NULL, sb, opt, v, OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); - pr_newline(out); + prt_newline(out); } } } - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); if (print_layout) { - pr_newline(out); - pr_buf(out, "layout:"); - pr_newline(out); - pr_indent_push(out, 2); + prt_newline(out); + prt_printf(out, "layout:"); + prt_newline(out); + printbuf_indent_add(out, 2); bch2_sb_layout_to_text(out, &sb->layout); - pr_indent_pop(out, 2); + printbuf_indent_sub(out, 2); } vstruct_for_each(sb, f) if (fields & (1 << le32_to_cpu(f->type))) { - pr_newline(out); + prt_newline(out); bch2_sb_field_to_text(out, sb, f); } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8d02a8158520..cc887a1b8c8f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -605,6 +605,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; + struct printbuf name = PRINTBUF; unsigned i, iter_size; int ret = 0; @@ -708,7 +709,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - uuid_unparse_lower(c->sb.user_uuid.b, c->name); + pr_uuid(&name, c->sb.user_uuid.b); + strlcpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? -ENOMEM : 0; + if (ret) + goto err; /* Compat: */ if (sb->version <= bcachefs_metadata_version_inode_v2 && @@ -830,7 +837,7 @@ static void print_mount_opts(struct bch_fs *c) bool first = true; if (c->opts.read_only) { - pr_buf(&p, "ro"); + prt_printf(&p, "ro"); first = false; } @@ -845,13 +852,13 @@ static void print_mount_opts(struct bch_fs *c) continue; if (!first) - pr_buf(&p, ","); + prt_printf(&p, ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } if (!p.pos) - pr_buf(&p, "(null)"); + prt_printf(&p, "(null)"); bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); printbuf_exit(&p); @@ -1482,7 +1489,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (data) { struct printbuf data_has = PRINTBUF; - bch2_flags_to_text(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6b5b20d18012..4a85fffdfa4c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -56,7 +56,7 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ \ if (out.pos && out.buf[out.pos - 1] != '\n') \ - pr_newline(&out); \ + prt_newline(&out); \ \ if (!ret && out.allocation_failure) \ ret = -ENOMEM; \ @@ -87,7 +87,7 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ #define sysfs_printf(file, fmt, ...) 
\ do { \ if (attr == &sysfs_ ## file) \ - pr_buf(out, fmt "\n", __VA_ARGS__); \ + prt_printf(out, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ @@ -99,7 +99,7 @@ do { \ #define sysfs_hprint(file, val) \ do { \ if (attr == &sysfs_ ## file) \ - bch2_hprint(out, val); \ + prt_human_readable_s64(out, val); \ } while (0) #define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -260,12 +260,12 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->data_progress_lock); list_for_each_entry(stats, &c->data_progress_list, list) { - pr_buf(out, "%s: data type %s btree_id %s position: ", + prt_printf(out, "%s: data type %s btree_id %s position: ", stats->name, bch2_data_types[stats->data_type], bch2_btree_ids[stats->btree_id]); bch2_bpos_to_text(out, stats->pos); - pr_buf(out, "%s", "\n"); + prt_printf(out, "%s", "\n"); } mutex_unlock(&c->data_progress_lock); @@ -338,34 +338,34 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - pr_buf(out, "uncompressed:\n"); - pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); - pr_buf(out, " size: "); - bch2_hprint(out, uncompressed_sectors << 9); - pr_buf(out, "\n"); - - pr_buf(out, "compressed:\n"); - pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); - pr_buf(out, " compressed size: "); - bch2_hprint(out, compressed_sectors_compressed << 9); - pr_buf(out, "\n"); - pr_buf(out, " uncompressed size: "); - bch2_hprint(out, compressed_sectors_uncompressed << 9); - pr_buf(out, "\n"); - - pr_buf(out, "incompressible:\n"); - pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); - pr_buf(out, " size: "); - bch2_hprint(out, incompressible_sectors << 9); - pr_buf(out, "\n"); + prt_printf(out, "uncompressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, uncompressed_sectors << 9); + prt_printf(out, "\n"); + + prt_printf(out, "compressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); + prt_printf(out, " compressed size: "); + prt_human_readable_u64(out, compressed_sectors_compressed << 9); + prt_printf(out, "\n"); + prt_printf(out, " uncompressed size: "); + prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); + prt_printf(out, "\n"); + + prt_printf(out, "incompressible:\n"); + prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, incompressible_sectors << 9); + prt_printf(out, "\n"); return 0; } static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); bch2_bpos_to_text(out, c->gc_gens_pos); - pr_buf(out, "\n"); + prt_printf(out, "\n"); } SHOW(bch2_fs) @@ -563,20 +563,21 @@ SHOW(bch2_fs_counters) u64 counter = 0; u64 counter_since_mount = 0; - out->tabstops[0] = 32; + printbuf_tabstop_push(out, 32); + #define x(t, ...) 
\ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - pr_buf(out, "since mount:"); \ - pr_tab(out); \ - bch2_hprint(out, counter_since_mount << 9); \ - pr_newline(out); \ + prt_printf(out, "since mount:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter_since_mount << 9); \ + prt_newline(out); \ \ - pr_buf(out, "since filesystem creation:"); \ - pr_tab(out); \ - bch2_hprint(out, counter << 9); \ - pr_newline(out); \ + prt_printf(out, "since filesystem creation:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter << 9); \ + prt_newline(out); \ } BCH_PERSISTENT_COUNTERS() #undef x @@ -658,7 +659,7 @@ SHOW(bch2_fs_opts_dir) u64 v = bch2_opt_get_by_id(&c->opts, id); bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); - pr_char(out, '\n'); + prt_char(out, '\n'); return 0; } @@ -771,17 +772,17 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; - pr_buf(out, + prt_printf(out, "\t\t\t buckets\t sectors fragmented\n" "capacity\t%16llu\n", ca->mi.nbuckets - ca->mi.first_bucket); for (i = 0; i < BCH_DATA_NR; i++) - pr_buf(out, "%-16s%16llu%16llu%16llu\n", + prt_printf(out, "%-16s%16llu%16llu%16llu\n", bch2_data_types[i], stats.d[i].buckets, stats.d[i].sectors, stats.d[i].fragmented); - pr_buf(out, + prt_printf(out, "ec\t\t%16llu\n" "\n" "freelist_wait\t\t%s\n" @@ -814,10 +815,10 @@ static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) int rw, i; for (rw = 0; rw < 2; rw++) { - pr_buf(out, "%s:\n", bch2_rw[rw]); + prt_printf(out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(out, "%-12s:%12llu\n", + prt_printf(out, "%-12s:%12llu\n", bch2_data_types[i], percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } @@ -844,19 +845,17 @@ SHOW(bch2_dev) mutex_unlock(&c->sb_lock); } - pr_char(out, '\n'); + prt_char(out, '\n'); } if (attr == &sysfs_has_data) { - bch2_flags_to_text(out, bch2_data_types, - bch2_dev_has_data(c, ca)); - pr_char(out, '\n'); + prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_char(out, '\n'); } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(out, bch2_member_states, - ca->mi.state); - pr_char(out, '\n'); + prt_string_option(out, bch2_member_states, ca->mi.state); + prt_char(out, '\n'); } if (attr == &sysfs_iodone) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 8ed28bf5e82c..fa3712a1478c 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -932,8 +932,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, time = j.finish - j.start; scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - bch2_hprint(&nr_buf, nr); - bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + prt_human_readable_u64(&nr_buf, nr); + prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", name_buf, nr_buf.buf, nr_threads, div_u64(time, NSEC_PER_SEC), diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e1d36d9b092c..dac7dfd0b806 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -99,135 +99,6 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) -static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) -{ - unsigned new_size; - char *buf; - - if (out->pos + extra + 1 < 
out->size) - return 0; - - new_size = roundup_pow_of_two(out->size + extra); - buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); - - if (!buf) { - out->allocation_failure = true; - return -ENOMEM; - } - - out->buf = buf; - out->size = new_size; - return 0; -} - -void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) -{ - va_list args; - int len; - - do { - va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); - va_end(args); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_realloc(out, len + 1)); - - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; -} - -void bch2_pr_tab_rjust(struct printbuf *buf) -{ - BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); - - if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { - unsigned move = buf->pos - buf->last_field; - unsigned shift = buf->tabstops[buf->tabstop] - - printbuf_linelen(buf); - - bch2_printbuf_realloc(buf, shift); - - if (buf->last_field + shift + 1 < buf->size) { - move = min(move, buf->size - 1 - buf->last_field - shift); - - memmove(buf->buf + buf->last_field + shift, - buf->buf + buf->last_field, - move); - memset(buf->buf + buf->last_field, ' ', shift); - buf->pos += shift; - buf->buf[buf->pos] = 0; - } - } - - buf->last_field = buf->pos; - buf->tabstop++; -} - -void bch2_hprint(struct printbuf *buf, s64 v) -{ - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0U << 10); - v >>= 10; - } - - pr_buf(buf, "%lli", v); - - /* - * 103 is magic: t is in the range [-1023, 1023] and we want - * to turn it into [-9, 9] - */ - if (u && t && v < 100 && v > -100) - pr_buf(buf, ".%i", t / 103); - if (u) - pr_char(buf, si_units[u]); -} - -void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) -{ - switch (out->units) { - case PRINTBUF_UNITS_RAW: - pr_buf(out, "%llu", raw); - break; - case PRINTBUF_UNITS_BYTES: - pr_buf(out, "%llu", bytes); - break; - case PRINTBUF_UNITS_HUMAN_READABLE: - bch2_hprint(out, bytes); - break; - } -} - -void bch2_string_opt_to_text(struct printbuf *out, - const char * const list[], - size_t selected) -{ - size_t i; - - for (i = 0; list[i]; i++) - pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); -} - -void bch2_flags_to_text(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs(flags)) < nr) { - if (!first) - pr_buf(out, ","); - first = false; - pr_buf(out, "%s", list[bit]); - flags ^= 1 << bit; - } -} - u64 bch2_read_flag_list(char *opt, const char * const list[]) { u64 ret = 0; @@ -394,7 +265,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); - pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) @@ -404,29 +275,29 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 q, last_q = 0; int i; - pr_buf(out, "count:\t\t%llu\n", + prt_printf(out, "count:\t\t%llu\n", stats->count); - pr_buf(out, "rate:\t\t%llu/sec\n", + prt_printf(out, "rate:\t\t%llu/sec\n", freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); - pr_buf(out, "frequency:\t"); + prt_printf(out, "frequency:\t"); bch2_pr_time_units(out, freq); - pr_buf(out, "\navg duration:\t"); + prt_printf(out, "\navg duration:\t"); bch2_pr_time_units(out, stats->average_duration); - pr_buf(out, "\nmax duration:\t"); + prt_printf(out, "\nmax duration:\t"); bch2_pr_time_units(out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - pr_buf(out, "\nquantiles (%s):\t", u->name); + prt_printf(out, "\nquantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - pr_buf(out, "%llu%s", + prt_printf(out, "%llu%s", div_u64(q, u->nsecs), is_last ? "\n" : " "); last_q = q; @@ -548,42 +419,43 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd) void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) { - out->tabstops[0] = 20; - - pr_buf(out, "rate:"); - pr_tab(out); - bch2_hprint(out, pd->rate.rate); - pr_newline(out); - - pr_buf(out, "target:"); - pr_tab(out); - bch2_hprint(out, pd->last_target); - pr_newline(out); - - pr_buf(out, "actual:"); - pr_tab(out); - bch2_hprint(out, pd->last_actual); - pr_newline(out); - - pr_buf(out, "proportional:"); - pr_tab(out); - bch2_hprint(out, pd->last_proportional); - pr_newline(out); - - pr_buf(out, "derivative:"); - pr_tab(out); - bch2_hprint(out, pd->last_derivative); - pr_newline(out); - - pr_buf(out, "change:"); - pr_tab(out); - bch2_hprint(out, pd->last_change); - pr_newline(out); - - pr_buf(out, "next io:"); - pr_tab(out); - pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); - pr_newline(out); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "rate:"); + prt_tab(out); + prt_human_readable_s64(out, pd->rate.rate); + prt_newline(out); + + prt_printf(out, "target:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_target); + prt_newline(out); + + prt_printf(out, "actual:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_actual); + prt_newline(out); + + prt_printf(out, "proportional:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_proportional); + prt_newline(out); + + prt_printf(out, "derivative:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_derivative); + prt_newline(out); + + prt_printf(out, "change:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_change); + prt_newline(out); + + prt_printf(out, "next io:"); + prt_tab(out); + prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + prt_newline(out); } /* misc: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 74bfa5faf470..d5b19b1b2020 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -237,127 +237,39 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -enum printbuf_units { - PRINTBUF_UNITS_RAW, - PRINTBUF_UNITS_BYTES, - PRINTBUF_UNITS_HUMAN_READABLE, -}; - -struct printbuf { - char *buf; - unsigned size; - unsigned pos; - unsigned last_newline; - unsigned last_field; - unsigned indent; - enum printbuf_units units:8; - u8 atomic; - bool allocation_failure:1; - u8 tabstop; - u8 tabstops[4]; -}; - -#define PRINTBUF ((struct printbuf) { NULL }) - -static inline void printbuf_exit(struct printbuf *buf) -{ - kfree(buf->buf); - buf->buf = ERR_PTR(-EINTR); /* poison value */ -} - -static inline void printbuf_reset(struct printbuf 
*buf) -{ - buf->pos = 0; - buf->last_newline = 0; - buf->last_field = 0; - buf->indent = 0; - buf->tabstop = 0; -} - -static inline size_t printbuf_remaining(struct printbuf *buf) -{ - return buf->size - buf->pos; -} - -static inline size_t printbuf_linelen(struct printbuf *buf) -{ - return buf->pos - buf->last_newline; -} - -void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) - -static inline void pr_char(struct printbuf *out, char c) -{ - bch2_pr_buf(out, "%c", c); -} - -static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) -{ - buf->indent += spaces; - while (spaces--) - pr_char(buf, ' '); -} - -static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) -{ - if (buf->last_newline + buf->indent == buf->pos) { - buf->pos -= spaces; - buf->buf[buf->pos] = '\0'; - } - buf->indent -= spaces; -} - -static inline void pr_newline(struct printbuf *buf) -{ - unsigned i; - - pr_char(buf, '\n'); +#include "printbuf.h" - buf->last_newline = buf->pos; +#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) +#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) +#define printbuf_str(_buf) bch2_printbuf_str(_buf) +#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) - for (i = 0; i < buf->indent; i++) - pr_char(buf, ' '); +#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) +#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) +#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) - buf->last_field = buf->pos; - buf->tabstop = 0; -} +#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) -static inline void pr_tab(struct printbuf *buf) -{ - BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); +#define prt_newline(_out) bch2_prt_newline(_out) +#define prt_tab(_out) bch2_prt_tab(_out) +#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) - while (printbuf_remaining(buf) > 1 && - printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) - pr_char(buf, ' '); - - buf->last_field = buf->pos; - buf->tabstop++; -} - -void bch2_pr_tab_rjust(struct printbuf *); - -static inline void pr_tab_rjust(struct printbuf *buf) -{ - bch2_pr_tab_rjust(buf); -} - -void bch2_pr_units(struct printbuf *, s64, s64); -#define pr_units(...) bch2_pr_units(__VA_ARGS__) - -static inline void pr_sectors(struct printbuf *out, u64 v) -{ - bch2_pr_units(out, v, v << 9); -} +#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", _v) +#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) +#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) +#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) +#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) +#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) +#define prt_bitflags(...) 
bch2_prt_bitflags(__VA_ARGS__) void bch2_pr_time_units(struct printbuf *, u64); #ifdef __KERNEL__ static inline void pr_time(struct printbuf *out, u64 time) { - pr_buf(out, "%llu", time); + prt_printf(out, "%llu", time); } #else #include @@ -368,9 +280,9 @@ static inline void pr_time(struct printbuf *out, u64 _time) struct tm *tm = localtime(&time); size_t err = strftime(time_str, sizeof(time_str), "%c", tm); if (!err) - pr_buf(out, "(formatting error)"); + prt_printf(out, "(formatting error)"); else - pr_buf(out, "%s", time_str); + prt_printf(out, "%s", time_str); } #endif @@ -388,7 +300,7 @@ static inline void pr_uuid(struct printbuf *out, u8 *uuid) char uuid_str[40]; uuid_unparse_lower(uuid, uuid_str); - pr_buf(out, "%s", uuid_str); + prt_printf(out, "%s", uuid_str); } int bch2_strtoint_h(const char *, int *); @@ -454,7 +366,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res) }) #define snprint(out, var) \ - pr_buf(out, \ + prt_printf(out, \ type_is(var, int) ? "%i\n" \ : type_is(var, unsigned) ? "%u\n" \ : type_is(var, long) ? "%li\n" \ @@ -464,14 +376,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res) : type_is(var, char *) ? "%s\n" \ : "%i\n", var) -void bch2_hprint(struct printbuf *, s64); - bool bch2_is_zero(const void *, size_t); -void bch2_string_opt_to_text(struct printbuf *, - const char * const [], size_t); - -void bch2_flags_to_text(struct printbuf *, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); #define NR_QUANTILES 15 diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index b5e42ca35dea..5df61b6b4a3c 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -76,7 +76,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { - pr_buf(err, "incorrect value size (%zu < %zu)", + prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*xattr.v)); return -EINVAL; } @@ -84,7 +84,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(k.k) < xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))) { - pr_buf(err, "value too small (%zu < %u)", + prt_printf(err, "value too small (%zu < %u)", bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))); @@ -95,7 +95,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(k.k) > xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4)) { - pr_buf(err, "value too big (%zu > %u)", + prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4)); @@ -104,12 +104,12 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (!handler) { - pr_buf(err, "invalid type (%u)", xattr.v->x_type); + prt_printf(err, "invalid type (%u)", xattr.v->x_type); return -EINVAL; } if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { - pr_buf(err, "xattr name has invalid characters"); + prt_printf(err, "xattr name has invalid characters"); return -EINVAL; } @@ -124,13 +124,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (handler && handler->prefix) - pr_buf(out, "%s", handler->prefix); + prt_printf(out, "%s", handler->prefix); else if (handler) - pr_buf(out, "(type %u)", xattr.v->x_type); + 
prt_printf(out, "(type %u)", xattr.v->x_type); else - pr_buf(out, "(unknown type %u)", xattr.v->x_type); + prt_printf(out, "(unknown type %u)", xattr.v->x_type); - pr_buf(out, "%.*s:%.*s", + prt_printf(out, "%.*s:%.*s", xattr.v->x_name_len, xattr.v->x_name, le16_to_cpu(xattr.v->x_val_len), -- cgit From df8c2ccb9309f342724f3104fd9d68d7e4ce65da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Jun 2022 19:39:00 -0400 Subject: bcachefs: Fix freespace initialization bch2_dev_freespace_init() was using __bch2_trans_do() incorrectly, and calling bch2_bucket_do_index() with a stale alloc key. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bffbddbdacea..982cd185b1af 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1190,12 +1190,26 @@ void bch2_do_invalidates(struct bch_fs *c) queue_work(system_long_wq, &c->invalidate_work); } +static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_alloc_v4 a; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + bch2_alloc_to_v4(k, &a); + return bch2_bucket_do_index(trans, k, &a, true); +} + static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_alloc_v4 a; struct bch_member *m; int ret; @@ -1208,10 +1222,9 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) if (iter.pos.offset >= ca->mi.nbuckets) break; - bch2_alloc_to_v4(k, &a); ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_bucket_do_index(&trans, k, &a, true)); + bucket_freespace_init(&trans, &iter)); if (ret) break; } -- cgit From a5d18f9ec0e2cfce1e0dc50f07d614ba21edaae9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Jun 2022 19:45:19 -0400 Subject: bcachefs: Improved human readable integer parsing Printbufs recently switched to using string_get_size() for printing integers in human readable units. This updates __bch2_strtoh() to parse numbers printed by string_get_size() - we now have to handle floating point numbers, and new unit suffixes. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index dac7dfd0b806..a2e42ae77371 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -27,16 +27,19 @@ static const char si_units[] = "?kMGTPEZY"; -static int __bch2_strtoh(const char *cp, u64 *res, - u64 t_max, bool t_signed) +/* string_get_size units: */ +static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" +}; +static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +}; + +static int parse_u64(const char *cp, u64 *res) { - bool positive = *cp != '-'; - unsigned u; + const char *start = cp; u64 v = 0; - if (*cp == '+' || *cp == '-') - cp++; - if (!isdigit(*cp)) return -EINVAL; @@ -50,22 +53,122 @@ static int __bch2_strtoh(const char *cp, u64 *res, cp++; } while (isdigit(*cp)); + *res = v; + return cp - start; +} + +static int bch2_pow(u64 n, u64 p, u64 *res) +{ + *res = 1; + + while (p--) { + if (*res > div_u64(U64_MAX, n)) + return -ERANGE; + *res *= n; + } + return 0; +} + +static int parse_unit_suffix(const char *cp, u64 *res) +{ + const char *start = cp; + u64 base = 1024; + unsigned u; + int ret; + + if (*cp == ' ') + cp++; + for (u = 1; u < strlen(si_units); u++) if (*cp == si_units[u]) { cp++; goto got_unit; } - u = 0; + + for (u = 0; u < ARRAY_SIZE(units_2); u++) + if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { + cp += strlen(units_2[u]); + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_10); u++) + if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { + cp += strlen(units_10[u]); + base = 1000; + goto got_unit; + } + + *res = 1; + return 0; got_unit: - if (*cp == '\n') + ret = bch2_pow(base, u, res); + if (ret) + return ret; + + return cp - start; +} + +#define parse_or_ret(cp, _f) \ +do { \ + int ret = _f; \ + if (ret < 0) \ + return ret; \ + cp += ret; \ +} while (0) + +static int __bch2_strtou64_h(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0, b, f_n = 0, f_d = 1; + int ret; + + parse_or_ret(cp, parse_u64(cp, &v)); + + if (*cp == '.') { cp++; - if (*cp) - return -EINVAL; + ret = parse_u64(cp, &f_n); + if (ret < 0) + return ret; + cp += ret; + + ret = bch2_pow(10, ret, &f_d); + if (ret) + return ret; + } + + parse_or_ret(cp, parse_unit_suffix(cp, &b)); + + if (v > div_u64(U64_MAX, b)) + return -ERANGE; + v *= b; + + if (f_n > div_u64(U64_MAX, b)) + return -ERANGE; - if (fls64(v) + u * 10 > 64) + f_n = div_u64(f_n * b, f_d); + if (v + f_n < v) return -ERANGE; + v += f_n; - v <<= u * 10; + *res = v; + return cp - start; +} + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); + + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; if (positive) { if (v > t_max) @@ -86,7 +189,7 @@ got_unit: #define STRTO_H(name, type) \ int bch2_ ## name ## _h(const char *cp, type *res) \ { \ - u64 v; \ + u64 v = 0; \ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ANYSINT_MAX(type) != ((type) ~0ULL)); \ *res = v; \ -- cgit From 0e96f5dcd7ff678c5448f64cb957117399754d2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jun 2022 19:34:17 -0400 Subject: bcachefs: Call bch2_do_invalidates() when going read write Like bch2_do_discards(), we should check if this 
needs to be done when going rw. Also, add some sysfs code for debugging bucket invalidation. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 + fs/bcachefs/sysfs.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cc887a1b8c8f..b926fb1b14a9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -383,6 +383,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_recalc_capacity(c); bch2_do_discards(c); + bch2_do_invalidates(c); if (!early) { ret = bch2_fs_read_write_late(c); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4a85fffdfa4c..b2d6a5c49a4d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -154,6 +154,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); +write_attribute(trigger_invalidates); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); rw_attribute(gc_gens_pos); @@ -516,6 +517,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_discards) bch2_do_discards(c); + if (attr == &sysfs_trigger_invalidates) + bch2_do_invalidates(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -627,6 +631,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, + &sysfs_trigger_invalidates, &sysfs_prune_cache, &sysfs_read_realloc_races, @@ -792,6 +797,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" + "buckets_to_invalidate\t%llu\n" "btree reserve cache\t%u\n", stats.buckets_ec, c->freelist_wait.list.first ? "waiting" : "empty", @@ -801,6 +807,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? 
"waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], + should_invalidate_buckets(ca, stats), c->btree_reserve_cache_nr); } -- cgit From 440c15cc912d630320bfb684d799ab225be4178e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jun 2022 19:45:07 -0400 Subject: bcachefs: Add a persistent counter for bucket invalidation Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 7 ++++++- fs/bcachefs/bcachefs_format.h | 9 +++++---- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 982cd185b1af..543ec0c45883 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1173,11 +1173,16 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - while (!ret && nr_to_invalidate-- >= 0) + while (nr_to_invalidate-- >= 0) { ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOFAIL, invalidate_one_bucket(&trans, ca)); + if (ret) + break; + + this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); + } } bch2_trans_exit(&trans); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 079ad93ab34e..944cc6247a84 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1326,10 +1326,11 @@ struct bch_sb_field_disk_groups { /* BCH_SB_FIELD_counters */ -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, -- cgit From c9bd67321e9b9bae0a9ba151a3906086878159b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Jun 2022 01:37:16 -0400 Subject: bcachefs: Fix btree node read retries b->written wasn't being reset to 0 in the btree node read retry path, causing decrypting & validation of previously read bsets to not be re-run - ouch. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 598c30b7ab8b..b7441677dc33 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -879,6 +879,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; + /* We might get called multiple times on read retry: */ + b->written = 0; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); sort_iter_init(iter, b); -- cgit From 6f44a9940c600c5a0663b825bdd89aad0639e3a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jun 2022 23:32:09 -0400 Subject: bcachefs: Add a persistent counter for bucket discards Like the previous patch for bucket invalidates, add another counter for a core allocator path. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 1 + fs/bcachefs/bcachefs_format.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 543ec0c45883..012607cefb6f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1052,6 +1052,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (ret) break; + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); discarded++; } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 944cc6247a84..b9d614f608b5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1330,7 +1330,8 @@ struct bch_sb_field_disk_groups { x(io_read, 0) \ x(io_write, 1) \ x(io_move, 2) \ - x(bucket_invalidate, 3) + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, -- cgit From b7c1104612e4ab46d8e481e323fbe1cca07f5cbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 May 2022 15:29:50 -0400 Subject: bcachefs: Increase max size for btree_trans bump allocator With backpointers, alloc keys have gotten bigger, so we're needing more memory here. We're probably going to need to go with something more sophisticated than a bump allocator, but - let's see if we can avoid doing that just yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 82c8c148c4bc..ede5661b62a5 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -374,7 +374,7 @@ struct btree_trans_commit_hook { struct btree_trans_commit_hook *next; }; -#define BTREE_TRANS_MEM_MAX (1U << 14) +#define BTREE_TRANS_MEM_MAX (1U << 16) struct btree_trans { struct bch_fs *c; -- cgit From 2ed6248ab3a58dbbe5819cbd0e60e4e3a5b72c47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jun 2022 19:15:26 -0400 Subject: bcachefs: Fix assertion in bch2_dev_list_add_dev() We were only allowing 4 devices in a dev_list, not 16. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 359fa1e7fc18..d66de6f589ac 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -89,7 +89,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { BUG_ON(bch2_dev_list_has_dev(*devs, dev)); - BUG_ON(devs->nr >= BCH_REPLICAS_MAX); + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); devs->devs[devs->nr++] = dev; } -- cgit From 50b13beef09f445e1fb8fbf1e1f852df06baf05a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jun 2022 20:12:02 -0400 Subject: bcachefs: Improve an error message When inserting a key type that's not valid for a given btree, we should print out which btree we were inserting into. 
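With invented names, the improved message reads along the lines of

    invalid key type for btree xattrs (inode)

instead of only naming the offending key type.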
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 4 +- fs/bcachefs/btree_iter.c | 109 ++++++++++++++++++++------------------------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/debug.c | 77 ++++++++++++++++++++++++++++++++ fs/bcachefs/opts.c | 1 + fs/bcachefs/sysfs.c | 5 --- 7 files changed, 131 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 390ea41414bc..f2351e5ee7c1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -207,8 +207,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, } if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { - prt_printf(err, "invalid key type for this btree (%s)", - bch2_bkey_types[type]); + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); return -EINVAL; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a2219c13aee5..fc989b46b67e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3257,6 +3257,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; + trans->task = current; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); @@ -3277,9 +3278,17 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - trans->pid = current->pid; + struct btree_trans *pos; + mutex_lock(&c->btree_trans_lock); - list_add(&trans->list, &c->btree_trans_list); + list_for_each_entry(pos, &c->btree_trans_list, list) { + if (trans->task->pid < pos->task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: mutex_unlock(&c->btree_trans_lock); } } @@ -3371,77 +3380,57 @@ bch2_btree_path_node_to_text(struct printbuf *out, } #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS -static bool trans_has_locks(struct btree_trans *trans) +void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { struct btree_path *path; - - trans_for_each_path(trans, path) - if (path->nodes_locked) - return true; - return false; -} -#endif - -void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) -{ -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS - struct btree_trans *trans; - struct btree_path *path; struct btree *b; static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; - mutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_locks(trans)) - continue; - - prt_printf(out, "%i %s\n", trans->pid, trans->fn); + prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); - trans_for_each_path(trans, path) { - if (!path->nodes_locked) - continue; + trans_for_each_path(trans, path) { + if (!path->nodes_locked) + continue; - prt_printf(out, " path %u %c l=%u %s:", - path->idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_ids[path->btree_id]); - bch2_bpos_to_text(out, path->pos); - prt_printf(out, "\n"); - - for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(path, l)) { - prt_printf(out, " %s l=%u ", - btree_node_intent_locked(path, l) ? 
"i" : "r", l); - bch2_btree_path_node_to_text(out, - (void *) path->l[l].b, - path->cached); - prt_printf(out, "\n"); - } + prt_printf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); + prt_printf(out, "\n"); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(path, l)) { + prt_printf(out, " %s l=%u ", + btree_node_intent_locked(path, l) ? "i" : "r", l); + bch2_btree_path_node_to_text(out, + (void *) path->l[l].b, + path->cached); + prt_printf(out, "\n"); } } + } - b = READ_ONCE(trans->locking); - if (b) { - path = &trans->paths[trans->locking_path_idx]; - prt_printf(out, " locking path %u %c l=%u %c %s:", - trans->locking_path_idx, - path->cached ? 'c' : 'b', - trans->locking_level, - lock_types[trans->locking_lock_type], - bch2_btree_ids[trans->locking_btree_id]); - bch2_bpos_to_text(out, trans->locking_pos); - - prt_printf(out, " node "); - bch2_btree_path_node_to_text(out, - (void *) b, path->cached); - prt_printf(out, "\n"); - } + b = READ_ONCE(trans->locking); + if (b) { + path = &trans->paths[trans->locking_path_idx]; + prt_printf(out, " locking path %u %c l=%u %c %s:", + trans->locking_path_idx, + path->cached ? 'c' : 'b', + trans->locking_level, + lock_types[trans->locking_lock_type], + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + + prt_printf(out, " node "); + bch2_btree_path_node_to_text(out, + (void *) b, path->cached); + prt_printf(out, "\n"); } - mutex_unlock(&c->btree_trans_lock); -#endif } +#endif void bch2_fs_btree_iter_exit(struct bch_fs *c) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 83587383a41f..39f241e25881 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -440,7 +440,7 @@ void bch2_trans_exit(struct btree_trans *); #define bch2_trans_init(...) 
__bch2_trans_init(__VA_ARGS__, __func__) -void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); int bch2_fs_btree_iter_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ede5661b62a5..4f3e1086a86b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -386,7 +386,7 @@ struct btree_trans { u8 locking_btree_id; u8 locking_level; u8 locking_lock_type; - pid_t pid; + struct task_struct *task; int srcu_idx; u8 nr_sorted; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 878f4e541f83..0f25b75e3de7 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -529,6 +529,78 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS +static int prt_backtrace(struct printbuf *out, struct task_struct *task) +{ + unsigned long entries[32]; + unsigned i, nr_entries; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; + + nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); + for (i = 0; i < nr_entries; i++) { + prt_printf(out, "[<0>] %pB", (void *)entries[i]); + prt_newline(out); + } + + up_read(&task->signal->exec_update_lock); + return 0; +} + +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->task->pid <= i->iter) + continue; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + bch2_btree_trans_to_text(&i->buf, trans); + + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + prt_backtrace(&i->buf, trans->task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + + i->iter = trans->task->pid; + } + mutex_unlock(&c->btree_trans_lock); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_transactions_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_transactions_read, +}; +#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ + static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -588,6 +660,11 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS + debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, + c->btree_debug, &btree_transactions_ops); +#endif + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 5540d5d98d84..98568f21d6d0 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -33,6 +33,7 @@ const char * const bch2_sb_compat[] = { const char * const bch2_btree_ids[] = { BCH_BTREE_IDS() + "interior btree node", NULL }; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b2d6a5c49a4d..173289c34de0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -181,7 +181,6 @@ 
read_attribute(journal_debug); read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); -read_attribute(btree_transactions); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(write_points); @@ -420,9 +419,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - if (attr == &sysfs_btree_transactions) - bch2_btree_trans_to_text(out, c); - if (attr == &sysfs_stripes_heap) bch2_stripes_heap_to_text(out, c); @@ -621,7 +617,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, - &sysfs_btree_transactions, &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, -- cgit From 23189da9eb6b34fc1ceb077edb32e308d0ad6760 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Jun 2022 19:03:25 -0400 Subject: bcachefs: Improve checksum error messages We're seeing checksum errors in the bch2_rechecksum_bio() path - give it a better error message to help track this down. Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 11 ++++++++++- fs/bcachefs/io.c | 7 +++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index e23b221cd377..7c2af6754aea 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -425,8 +425,17 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, merged = bch2_checksum_bio(c, crc_old.csum_type, extent_nonce(version, crc_old), bio); - if (bch2_crc_cmp(merged, crc_old.csum)) + if (bch2_crc_cmp(merged, crc_old.csum)) { + bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo, + bch2_csum_types[crc_old.csum_type], + bch2_csum_types[new_csum_type]); return -EIO; + } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { if (i->crc) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f41d7943fb4f..91789185c78e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1186,8 +1186,7 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while " - "rewriting existing data (memory corruption?)"); + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1977,9 +1976,9 @@ csum_err: } bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, crc.csum_type); + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: -- cgit From a3d7afa5c1b62140168982747fd15c1999d991f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Jun 2022 19:55:32 -0400 Subject: bcachefs: Always use percpu_ref_tryget_live() on c->writes If we're trying to get a ref and the refcount has been killed, it means we're doing an emergency shutdown - we always want tryget_live(). 
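A minimal sketch of the intended pattern (illustrative, not lifted from the patch): percpu_ref_tryget() can still hand out references after percpu_ref_kill() as long as outstanding refs keep the count nonzero, while percpu_ref_tryget_live() refuses once the ref has been killed, so write paths bail out cleanly during an emergency shutdown:

    if (!percpu_ref_tryget_live(&c->writes))
        return -EROFS;    /* shutdown in progress - don't start new writes */

    /* ... write-side work goes here (hypothetical) ... */

    percpu_ref_put(&c->writes);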
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/io.c | 4 ++-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/subvolume.c | 4 ++-- fs/bcachefs/sysfs.c | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 012607cefb6f..3084081966b6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1072,7 +1072,7 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { - if (percpu_ref_tryget(&c->writes) && + if (percpu_ref_tryget_live(&c->writes) && !queue_work(system_long_wq, &c->discard_work)) percpu_ref_put(&c->writes); } @@ -1192,7 +1192,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget(&c->writes)) + if (percpu_ref_tryget_live(&c->writes)) queue_work(system_long_wq, &c->invalidate_work); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a4f66e7cbb45..eeaea292bd80 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1897,7 +1897,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return; a = kmalloc(sizeof(*a), GFP_NOFS); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3425e3c007dd..c6fe24f424de 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1108,7 +1108,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&c->writes))) { + unlikely(!percpu_ref_tryget_live(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index faabaa64dcdb..6ce352c526f0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -939,7 +939,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(!s->allocated); - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) goto err; ec_generate_ec(&s->new_stripe); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ad51483ad764..c0dda29dabb4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3105,7 +3105,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return -EROFS; inode_lock(&inode->v); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 91789185c78e..ca72a31da502 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1397,7 +1397,7 @@ void bch2_write(struct closure *cl) } if (c->opts.nochanges || - !percpu_ref_tryget(&c->writes)) { + !percpu_ref_tryget_live(&c->writes)) { op->error = -EROFS; goto err; } @@ -1527,7 +1527,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return NULL; op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 
4e589c02a93b..2038e3502d8c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -282,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c, u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; - if (!percpu_ref_tryget(&c->writes)) + if (!percpu_ref_tryget_live(&c->writes)) return -EROFS; bch2_check_set_feature(c, BCH_FEATURE_reflink); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 8f41a06c3e11..60b60de83f3e 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -729,7 +729,7 @@ err: static void bch2_delete_dead_snapshots(struct bch_fs *c) { - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (unlikely(!percpu_ref_tryget_live(&c->writes))) return; if (!queue_work(system_long_wq, &c->snapshot_delete_work)) @@ -931,7 +931,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (ret) return ret; - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (unlikely(!percpu_ref_tryget_live(&c->writes))) return -EROFS; if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 173289c34de0..4e2b6285cf3a 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -676,7 +676,7 @@ STORE(bch2_fs_opts_dir) * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!percpu_ref_tryget(&c->writes))) + if (unlikely(!percpu_ref_tryget_live(&c->writes))) return -EROFS; tmp = kstrdup(buf, GFP_KERNEL); -- cgit From 1c6ff39445553f84f9a46c2c60a4768c7f4ef226 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jun 2022 19:48:16 -0400 Subject: bcachefs: Fix refcount leak in bch2_do_invalidates() If we fail to queue the work item because it's already in process, we need to drop the ref we just took. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3084081966b6..933334fed960 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1192,8 +1192,9 @@ static void bch2_do_invalidates_work(struct work_struct *work) void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes)) - queue_work(system_long_wq, &c->invalidate_work); + if (percpu_ref_tryget_live(&c->writes) && + !queue_work(system_long_wq, &c->invalidate_work)) + percpu_ref_put(&c->writes); } static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter) -- cgit From b5f73fd79f4c710024ef4385626aecbaee6fc3f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Jun 2022 22:38:10 -0400 Subject: bcachefs: Check for extents with too many ptrs We have a hardcoded maximum on number of pointers in an extent that's used by some other data structures - notably bch_devs_list - but we weren't actually checking for it. Oops. 
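To illustrate why the limit matters (hypothetical, simplified types - the real consumer is e.g. bch_devs_list): anything that keeps one fixed-size slot per extent pointer relies on bkey validation rejecting oversized extents:

    struct example_devs_list {            /* hypothetical, mirrors the dev_list idea */
        unsigned    nr;
        unsigned    devs[16];             /* one slot per possible pointer */
    };

    static void example_devs_list_add(struct example_devs_list *d, unsigned dev)
    {
        BUG_ON(d->nr >= ARRAY_SIZE(d->devs));    /* only safe because oversized extents are now rejected */
        d->devs[d->nr++] = dev;
    }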
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b0226118077a..4e44234a2b2c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1078,6 +1078,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1102,6 +1103,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, false, err); if (ret) return ret; + nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: @@ -1140,6 +1142,11 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, } } + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + prt_str(err, "too many ptrs"); + return -EINVAL; + } + return 0; } -- cgit From 7bb61e8c0e37fdf5684bc1fa1f6e0b5644cc7f75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jun 2022 23:03:07 -0400 Subject: bcachefs: Make IO in flight by copygc/rebalance configurable This adds a new option, move_bytes_in_flight, for configuring the amount of IO in flight by copygc/rebalance - users with many devices in their filesystem will want to increase this. In the future we should be smarter about this, but this is an easy improvement. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 6 ++---- fs/bcachefs/opts.h | 5 +++++ 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a852e07affdc..08fb8c71893f 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -22,8 +22,6 @@ #include #include -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 - struct moving_io { struct list_head list; struct closure cl; @@ -693,11 +691,11 @@ static int __bch2_move_data(struct bch_fs *c, move_ctxt_wait_event(ctxt, &trans, atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); + c->opts.move_bytes_in_flight >> 9); move_ctxt_wait_event(ctxt, &trans, atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); + c->opts.move_bytes_in_flight >> 9); bch2_trans_begin(&trans); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dee75d7e6fe8..43b4488b8c6f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -290,6 +290,11 @@ enum opt_type { OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(move_bytes_in_flight, u32, \ + OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1024, U32_MAX), \ + BCH2_NO_SB_OPT, 1U << 20, \ + NULL, "Amount of IO in flight to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From 8f7f566f5774d36196bfa87bc097522fd497d4dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jun 2022 01:07:54 -0400 Subject: bcachefs: btree key cache pcpu freedlist Originally, the btree key cache code would always allocate new entries by reusing from the recently-freed list, if that list wasn't empty. But that behaviour was dropped, for lock contention reasons. But it seems that entries stranded on the freed list have been contributing to some of our oom issues, because long running btree transactions will prevent them from being freed. This patch re-adds allocating from the freed list, but it also adds percpu buffers to solve the lock contention issues - and the new percpu freed lists will improve the evict paths, too. 
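As a rough sketch of the technique (simplified and hypothetical - the real code below manages struct bkey_cached and its locks): each CPU keeps a small stash of freed objects, and the shared mutex-protected list is only touched when the stash runs dry or overflows:

    struct pcpu_stash {
        void        *objs[16];
        unsigned    nr;
    };

    /* alloc fast path: take from this CPU's stash, no shared lock */
    static void *stash_get(struct pcpu_stash __percpu *stash)
    {
        struct pcpu_stash *f;
        void *obj = NULL;

        preempt_disable();
        f = this_cpu_ptr(stash);
        if (f->nr)
            obj = f->objs[--f->nr];
        preempt_enable();

        return obj;    /* NULL: refill from the shared list under its mutex */
    }

    /* free fast path: return to this CPU's stash if there's room */
    static bool stash_put(struct pcpu_stash __percpu *stash, void *obj)
    {
        struct pcpu_stash *f;
        bool stashed = false;

        preempt_disable();
        f = this_cpu_ptr(stash);
        if (f->nr < ARRAY_SIZE(f->objs)) {
            f->objs[f->nr++] = obj;
            stashed = true;
        }
        preempt_enable();

        return stashed;    /* false: caller spills to the shared list under its mutex */
    }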
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 121 +++++++++++++++++++++++++++++++++++------- fs/bcachefs/btree_types.h | 8 ++- 2 files changed, 108 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index bc0c8386e403..97c72f3917ec 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -84,7 +84,7 @@ static void bkey_cached_free(struct btree_key_cache *bc, start_poll_synchronize_srcu(&c->btree_trans_barrier); list_move_tail(&ck->list, &bc->freed); - bc->nr_freed++; + atomic_long_inc(&bc->nr_freed); kfree(ck->k); ck->k = NULL; @@ -94,10 +94,88 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct btree_key_cache_freelist *f; + bool freed = false; + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_del_init(&ck->list); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + list_move_tail(&ck2->list, &bc->freed); + } + preempt_enable(); + + list_move_tail(&ck->list, &bc->freed); + mutex_unlock(&bc->lock); + } + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + static struct bkey_cached * bkey_cached_alloc(struct btree_key_cache *c) { - struct bkey_cached *ck; + struct bkey_cached *ck = NULL; + struct btree_key_cache_freelist *f; + + preempt_disable(); + f = this_cpu_ptr(c->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&c->lock); + preempt_disable(); + f = this_cpu_ptr(c->pcpu_freed); + + while (!list_empty(&c->freed) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&c->freed, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } + + ck = f->nr ? 
f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&c->lock); + } + + if (ck) { + six_lock_intent(&ck->c.lock, NULL, NULL); + six_lock_write(&ck->c.lock, NULL, NULL); + return ck; + } ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { @@ -120,16 +198,6 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; - mutex_lock(&c->lock); - list_for_each_entry_reverse(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) { - c->nr_freed--; - list_del(&ck->list); - mutex_unlock(&c->lock); - return ck; - } - mutex_unlock(&c->lock); - rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) @@ -190,9 +258,7 @@ btree_key_cache_create(struct bch_fs *c, six_unlock_intent(&ck->c.lock); kfree(ck); } else { - mutex_lock(&bc->lock); - bkey_cached_free(bc, ck); - mutex_unlock(&bc->lock); + bkey_cached_free_fast(bc, ck); } return NULL; @@ -465,9 +531,7 @@ evict: bkey_cached_evict(&c->btree_key_cache, ck); - mutex_lock(&c->btree_key_cache.lock); - bkey_cached_free(&c->btree_key_cache, ck); - mutex_unlock(&c->btree_key_cache.lock); + bkey_cached_free_fast(&c->btree_key_cache, ck); } out: bch2_trans_iter_exit(trans, &b_iter); @@ -612,7 +676,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, list_del(&ck->list); kmem_cache_free(bch2_key_cache, ck); - bc->nr_freed--; + atomic_long_dec(&bc->nr_freed); scanned++; freed++; } @@ -685,6 +749,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bkey_cached *ck, *n; struct rhash_head *pos; unsigned i; + int cpu; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); @@ -701,6 +766,16 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } rcu_read_unlock(); + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); + + for (i = 0; i < f->nr; i++) { + ck = f->objs[i]; + list_add(&ck->list, &bc->freed); + } + } + list_for_each_entry_safe(ck, n, &bc->freed, list) { cond_resched(); @@ -721,6 +796,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); + + free_percpu(bc->pcpu_freed); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -734,6 +811,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); int ret; + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -ENOMEM; + ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); if (ret) return ret; @@ -748,7 +829,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu\n", c->nr_freed); + prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4f3e1086a86b..2eb8cc11aec4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -298,6 +298,11 @@ struct btree_iter { struct bpos journal_pos; }; +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + struct btree_key_cache { struct mutex lock; struct rhashtable table; @@ -305,8 +310,9 @@ struct btree_key_cache { struct list_head freed; struct 
shrinker shrink; unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; - size_t nr_freed; + atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; }; -- cgit From 30f0349d62429effd729ae9272c6fb57f47d1436 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Jun 2022 17:51:20 -0400 Subject: bcachefs: Split out dev_buckets_free() Previously, dev_buckets_available() only counted buckets that are eligible to be allocated right now - i.e. buckets that don't have cached data, or need discard, or need gc gens, etc. But most users of this function want to know how many buckets are eligible to be allocated from without moving data around - copygc, allocator striping, which means we should be including cached data buckets etc. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/buckets.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ef8f10a51489..174b3a745ab8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -526,7 +526,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bool waiting = false; again: usage = bch2_dev_usage_read(ca); - avail = __dev_buckets_available(ca, usage,reserve); + avail = dev_buckets_free(ca, usage,reserve); if (usage.d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 327022cd0f7a..080bcb20a5b0 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -144,12 +144,25 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser return reserved; } +static inline u64 dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage usage, + enum alloc_reserve reserve) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, reserve)); +} + static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage usage, enum alloc_reserve reserve) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets - + usage.d[BCH_DATA_cached].buckets - + usage.d[BCH_DATA_need_gc_gens].buckets - + usage.d[BCH_DATA_need_discard].buckets - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, reserve)); } -- cgit From c501fef6deb1de13d45d22a3df32906adf17275b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jun 2022 19:07:19 -0400 Subject: bcachefs: Pull out data_update.c This is the start of reorganizing the data IO paths. The plan is to also break apart io.c into data_read.c and data_write.c, and migrate_write will be renamed to the data_update path. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/data_update.c | 383 ++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/data_update.h | 48 ++++++ fs/bcachefs/ec.h | 1 + fs/bcachefs/io.c | 9 +- fs/bcachefs/move.c | 393 +--------------------------------------------- fs/bcachefs/move.h | 45 ++---- 7 files changed, 452 insertions(+), 428 deletions(-) create mode 100644 fs/bcachefs/data_update.c create mode 100644 fs/bcachefs/data_update.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index fada601c10db..95b990ad0196 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -25,6 +25,7 @@ bcachefs-y := \ debug.o \ dirent.o \ disk_groups.o \ + data_update.o \ ec.o \ error.o \ extents.o \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 index 000000000000..0161b0a9f36e --- /dev/null +++ b/fs/bcachefs/data_update.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "data_update.h" +#include "ec.h" +#include "extents.h" +#include "io.h" +#include "keylist.h" +#include "move.h" +#include "subvolume.h" +#include "trace.h" + +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + struct snapshots_seen s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + snapshots_seen_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { +next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; + u32 *i; + + darray_for_each(s.ids, i) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + darray_exit(&s.ids); + + return ret; +} + +int bch2_data_update_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; + + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + bch2_trans_iter_init(&trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k; + struct bkey_i *insert; + struct bkey_i_extent *new; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bpos next_pos; + bool did_work = false; + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (bversion_cmp(k.k->version, new->k.version) || + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) + goto nomatch; + + bkey_reassemble(_insert.k, k); + insert = _insert.k; + + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); + bch2_cut_front(iter.pos, &new->k_i); + + bch2_cut_front(iter.pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + + if (m->data_cmd == DATA_REWRITE) { + struct bch_extent_ptr *new_ptr, *old_ptr = (void *) + bch2_bkey_has_device(bkey_i_to_s_c(insert), + m->data_opts.rewrite_dev); + if (!old_ptr) + goto nomatch; + + if (old_ptr->cached) + extent_for_each_ptr(extent_i_to_s(new), new_ptr) + new_ptr->cached = true; + + __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + } + + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { + /* + * raced with another move op? extent already + * has a pointer to the device we just wrote + * data to + */ + continue; + } + + bch2_extent_ptr_decoded_append(insert, &p); + did_work = true; + } + + if (!did_work) + goto nomatch; + + bch2_bkey_narrow_crcs(insert, + (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); + + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, + &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; + + if (disk_sectors_delta > (s64) op->res.sectors) { + ret = bch2_disk_reservation_add(c, &op->res, + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto out; + } + + next_pos = insert->k.p; + + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } +err: + if (ret == -EINTR) + ret = 0; + if (ret) + break; +next: + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + continue; +nomatch: + if (m->ctxt) { + BUG_ON(k.k->p.offset <= iter.pos.offset); + atomic64_inc(&m->ctxt->stats->keys_raced); + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->stats->sectors_raced); + } + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); + bch2_btree_iter_advance(&iter); + goto next; + } +out: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); + BUG_ON(ret == -EINTR); + return ret; +} + +void bch2_data_update_read_done(struct data_update *m, struct bch_read_bio *rbio) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->ptr = rbio->pick.ptr; + m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; + m->op.devs_have = rbio->devs_have; + m->op.pos = rbio->data_pos; + m->op.version = rbio->version; + m->op.crc = rbio->pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + if (m->data_cmd == DATA_REWRITE) + bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); +} + +int bch2_data_update_init(struct bch_fs *c, struct data_update *m, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + enum data_cmd data_cmd, + struct data_opts data_opts, + enum btree_id btree_id, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + struct extent_ptr_decoded p; + int ret; + + m->btree_id = btree_id; + m->data_cmd = data_cmd; + m->data_opts = data_opts; + m->nr_ptrs_reserved = 0; + + bch2_write_op_init(&m->op, c, io_opts); + + if (!bch2_bkey_is_incompressible(k)) + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + else + m->op.incompressible = true; + + m->op.target = data_opts.target, + m->op.write_point = wp; + + /* + * op->csum_type is normally initialized from the fs/file's current + * options - but if an extent is encrypted, we require that it stays + * encrypted: + */ + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (bch2_csum_type_is_encryption(crc.csum_type)) { + m->op.nonce = crc.nonce + crc.offset; + m->op.csum_type = crc.csum_type; + break; + } + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { + m->op.alloc_reserve = RESERVE_movinggc; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + } + + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL| + BCH_WRITE_MOVE; + + m->op.nr_replicas = data_opts.nr_replicas; + m->op.nr_replicas_required = data_opts.nr_replicas; + + switch (data_cmd) { + case DATA_ADD_REPLICAS: { + /* + * DATA_ADD_REPLICAS is used for 
moving data to a different + * device in the background, and due to compression the new copy + * might take up more space than the old copy: + */ +#if 0 + int nr = (int) io_opts.data_replicas - + bch2_bkey_nr_ptrs_allocated(k); +#endif + int nr = (int) io_opts.data_replicas; + + if (nr > 0) { + m->op.nr_replicas = m->nr_ptrs_reserved = nr; + + ret = bch2_disk_reservation_get(c, &m->op.res, + k.k->size, m->op.nr_replicas, 0); + if (ret) + return ret; + } + break; + } + case DATA_REWRITE: { + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == data_opts.rewrite_dev) { + if (p.ptr.cached) + m->op.flags |= BCH_WRITE_CACHED; + + if (!p.ptr.cached && + crc_is_compressed(p.crc)) + compressed_sectors += p.crc.compressed_size; + } + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, + k.k->size * m->op.nr_replicas, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + break; + } + case DATA_PROMOTE: + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + m->op.flags |= BCH_WRITE_CACHED; + break; + default: + BUG(); + } + + return 0; +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 index 000000000000..03b4ca5a4ee8 --- /dev/null +++ b/fs/bcachefs/data_update.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H +#define _BCACHEFS_DATA_UPDATE_H + +#include "io_types.h" + +enum data_cmd { + DATA_SKIP, + DATA_SCRUB, + DATA_ADD_REPLICAS, + DATA_REWRITE, + DATA_PROMOTE, +}; + +struct data_opts { + u16 target; + u8 rewrite_dev; + u8 nr_replicas; + int btree_insert_flags; +}; + +struct data_update { + enum btree_id btree_id; + enum data_cmd data_cmd; + struct data_opts data_opts; + + unsigned nr_ptrs_reserved; + + struct moving_context *ctxt; + + /* what we read: */ + struct bch_extent_ptr ptr; + u64 offset; + + struct bch_write_op op; +}; + +int bch2_data_update_index_update(struct bch_write_op *); + +void bch2_data_update_read_done(struct data_update *, struct bch_read_bio *); +int bch2_data_update_init(struct bch_fs *, struct data_update *, + struct write_point_specifier, + struct bch_io_opts, + enum data_cmd, struct data_opts, + enum btree_id, struct bkey_s_c); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index a4c13d61af10..c53187df4651 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -4,6 +4,7 @@ #include "ec_types.h" #include "buckets_types.h" +#include "extents_types.h" #include "keylist_types.h" int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ca72a31da502..743449ed7fae 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -16,6 +16,7 @@ #include "checksum.h" #include "compress.h" #include "clock.h" +#include "data_update.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -660,7 +661,7 @@ static void __bch2_write_index(struct bch_write_op *op) ret = !(op->flags & BCH_WRITE_MOVE) ? 
bch2_write_index_default(op) - : bch2_migrate_index_update(op); + : bch2_data_update_index_update(op); BUG_ON(ret == -EINTR); BUG_ON(keylist_sectors(keys) && !ret); @@ -1433,7 +1434,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; - struct migrate_write write; + struct data_update write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; @@ -1508,7 +1509,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - bch2_migrate_read_done(&op->write, rbio); + bch2_data_update_read_done(&op->write, rbio); closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, NULL); } @@ -1565,7 +1566,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_migrate_write_init(c, &op->write, + ret = bch2_data_update_init(c, &op->write, writepoint_hashed((unsigned long) current), opts, DATA_PROMOTE, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 08fb8c71893f..a3a486cff28e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,7 +6,6 @@ #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" -#include "buckets.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -15,7 +14,6 @@ #include "keylist.h" #include "move.h" #include "replicas.h" -#include "subvolume.h" #include "super-io.h" #include "trace.h" @@ -32,394 +30,11 @@ struct moving_io { struct bch_read_bio rbio; - struct migrate_write write; + struct data_update write; /* Must be last since it is variable size */ struct bio_vec bi_inline_vecs[0]; }; -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - - struct bch_move_stats *stats; - - struct list_head reads; - - /* in flight sectors: */ - atomic_t read_sectors; - atomic_t write_sectors; - - wait_queue_head_t wait; -}; - -static int insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, update_iter; - struct bkey_s_c k; - struct snapshots_seen s; - int ret; - - if (!btree_type_has_snapshots(id)) - return 0; - - snapshots_seen_init(&s); - - if (!bkey_cmp(old_pos, new_pos)) - return 0; - - if (!snapshot_t(c, old_pos.snapshot)->children[0]) - return 0; - - bch2_trans_iter_init(trans, &iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { -next: - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (bkey_cmp(old_pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { - struct bkey_i *update; - u32 *i; - - darray_for_each(s.ids, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) - goto next; - - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = new_pos; - update->k.p.snapshot = k.k->p.snapshot; - - bch2_trans_iter_init(trans, &update_iter, id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) - break; - - ret = snapshots_seen_add(c, &s, k.k->p.snapshot); - if (ret) - break; - } - } - 
bch2_trans_iter_exit(trans, &iter); - darray_exit(&s.ids); - - return ret; -} - -int bch2_migrate_index_update(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct migrate_write *m = - container_of(op, struct migrate_write, op); - struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; - int ret = 0; - - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - bch2_trans_iter_init(&trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - while (1) { - struct bkey_s_c k; - struct bkey_i *insert; - struct bkey_i_extent *new; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bpos next_pos; - bool did_work = false; - bool extending = false, should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - - bch2_trans_begin(&trans); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - new = bkey_i_to_extent(bch2_keylist_front(keys)); - - if (bversion_cmp(k.k->version, new->k.version) || - !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) - goto nomatch; - - bkey_reassemble(_insert.k, k); - insert = _insert.k; - - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); - bch2_cut_front(iter.pos, &new->k_i); - - bch2_cut_front(iter.pos, insert); - bch2_cut_back(new->k.p, insert); - bch2_cut_back(insert->k.p, &new->k_i); - - if (m->data_cmd == DATA_REWRITE) { - struct bch_extent_ptr *new_ptr, *old_ptr = (void *) - bch2_bkey_has_device(bkey_i_to_s_c(insert), - m->data_opts.rewrite_dev); - if (!old_ptr) - goto nomatch; - - if (old_ptr->cached) - extent_for_each_ptr(extent_i_to_s(new), new_ptr) - new_ptr->cached = true; - - __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); - } - - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { - /* - * raced with another move op? extent already - * has a pointer to the device we just wrote - * data to - */ - continue; - } - - bch2_extent_ptr_decoded_append(insert, &p); - did_work = true; - } - - if (!did_work) - goto nomatch; - - bch2_bkey_narrow_crcs(insert, - (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); - bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); - - ret = bch2_sum_sector_overwrites(&trans, &iter, insert, - &extending, - &should_check_enospc, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - goto err; - - if (disk_sectors_delta > (s64) op->res.sectors) { - ret = bch2_disk_reservation_add(c, &op->res, - disk_sectors_delta - op->res.sectors, - !should_check_enospc - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto out; - } - - next_pos = insert->k.p; - - ret = insert_snapshot_whiteouts(&trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_trans_update(&trans, &iter, insert, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(&trans, &op->res, - op_journal_seq(op), - BTREE_INSERT_NOFAIL| - m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); - atomic_long_inc(&c->extent_migrate_done); - if (ec_ob) - bch2_ob_add_backpointer(c, ec_ob, &insert->k); - } -err: - if (ret == -EINTR) - ret = 0; - if (ret) - break; -next: - while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) - goto out; - } - continue; -nomatch: - if (m->ctxt) { - BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->ctxt->stats->keys_raced); - atomic64_add(k.k->p.offset - iter.pos.offset, - &m->ctxt->stats->sectors_raced); - } - atomic_long_inc(&c->extent_migrate_raced); - trace_move_race(&new->k); - bch2_btree_iter_advance(&iter); - goto next; - } -out: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); - BUG_ON(ret == -EINTR); - return ret; -} - -void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) -{ - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - - m->ptr = rbio->pick.ptr; - m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; - m->op.devs_have = rbio->devs_have; - m->op.pos = rbio->data_pos; - m->op.version = rbio->version; - m->op.crc = rbio->pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - - if (m->data_cmd == DATA_REWRITE) - bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); -} - -int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, - struct write_point_specifier wp, - struct bch_io_opts io_opts, - enum data_cmd data_cmd, - struct data_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - struct extent_ptr_decoded p; - int ret; - - m->btree_id = btree_id; - m->data_cmd = data_cmd; - m->data_opts = data_opts; - m->nr_ptrs_reserved = 0; - - bch2_write_op_init(&m->op, c, io_opts); - - if (!bch2_bkey_is_incompressible(k)) - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; - else - m->op.incompressible = true; - - m->op.target = data_opts.target, - m->op.write_point = wp; - - /* - * op->csum_type is normally initialized from the fs/file's current - * options - but if an extent is encrypted, we require that it stays - * encrypted: - */ - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (bch2_csum_type_is_encryption(crc.csum_type)) { - m->op.nonce = crc.nonce + crc.offset; - m->op.csum_type = crc.csum_type; - break; - } - - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { - m->op.alloc_reserve = RESERVE_movinggc; - } else { - /* XXX: this should probably be passed in */ - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - } - - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_FROM_INTERNAL| - BCH_WRITE_MOVE; - - m->op.nr_replicas = data_opts.nr_replicas; - m->op.nr_replicas_required = data_opts.nr_replicas; - - switch (data_cmd) { - case DATA_ADD_REPLICAS: { - /* - * DATA_ADD_REPLICAS is used for 
moving data to a different - * device in the background, and due to compression the new copy - * might take up more space than the old copy: - */ -#if 0 - int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_ptrs_allocated(k); -#endif - int nr = (int) io_opts.data_replicas; - - if (nr > 0) { - m->op.nr_replicas = m->nr_ptrs_reserved = nr; - - ret = bch2_disk_reservation_get(c, &m->op.res, - k.k->size, m->op.nr_replicas, 0); - if (ret) - return ret; - } - break; - } - case DATA_REWRITE: { - unsigned compressed_sectors = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == data_opts.rewrite_dev) { - if (p.ptr.cached) - m->op.flags |= BCH_WRITE_CACHED; - - if (!p.ptr.cached && - crc_is_compressed(p.crc)) - compressed_sectors += p.crc.compressed_size; - } - - if (compressed_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, - k.k->size * m->op.nr_replicas, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - break; - } - case DATA_PROMOTE: - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; - m->op.flags |= BCH_WRITE_CACHED; - break; - default: - BUG(); - } - - return 0; -} - static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; @@ -457,7 +72,7 @@ static void move_write(struct moving_io *io) closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - bch2_migrate_read_done(&io->write, &io->rbio); + bch2_data_update_read_done(&io->write, &io->rbio); closure_call(&io->write.op.cl, bch2_write, NULL, NULL); } @@ -564,8 +179,8 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, btree_id, k); + ret = bch2_data_update_init(c, &io->write, wp, io_opts, + data_cmd, data_opts, btree_id, k); if (ret) goto err_free_pages; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 901d8f875946..6d273f67a82c 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -4,51 +4,26 @@ #include "btree_iter.h" #include "buckets.h" -#include "io_types.h" +#include "data_update.h" #include "move_types.h" struct bch_read_bio; -struct moving_context; -enum data_cmd { - DATA_SKIP, - DATA_SCRUB, - DATA_ADD_REPLICAS, - DATA_REWRITE, - DATA_PROMOTE, -}; - -struct data_opts { - u16 target; - u8 rewrite_dev; - u8 nr_replicas; - int btree_insert_flags; -}; +struct moving_context { + /* Closure for waiting on all reads and writes to complete */ + struct closure cl; -struct migrate_write { - enum btree_id btree_id; - enum data_cmd data_cmd; - struct data_opts data_opts; + struct bch_move_stats *stats; - unsigned nr_ptrs_reserved; + struct list_head reads; - struct moving_context *ctxt; + /* in flight sectors: */ + atomic_t read_sectors; + atomic_t write_sectors; - /* what we read: */ - struct bch_extent_ptr ptr; - u64 offset; - - struct bch_write_op op; + wait_queue_head_t wait; }; -int bch2_migrate_index_update(struct bch_write_op *); -void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); -int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, - struct write_point_specifier, - struct bch_io_opts, - enum data_cmd, struct data_opts, - enum btree_id, struct bkey_s_c); - typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_opts *); -- cgit From 54feff0a7ac5ae44b99e697f0fadf81471b33801 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 
Jun 2022 18:30:17 -0400 Subject: bcachefs: Improve "copygc requested to run" error message This improves the "copygc requested to run but no buckets found" to show the device that requires copygc to be run on - we'll definitely need to improve this more. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6209cb51efcb..1e2de1e818c1 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -255,7 +255,28 @@ static int bch2_copygc(struct bch_fs *c) } if (!h->used) { - bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); + s64 wait = S64_MAX, dev_wait; + u64 dev_min_wait_fragmented = 0; + u64 dev_min_wait_allowed = 0; + int dev_min_wait = -1; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); + s64 fragmented = usage.d[BCH_DATA_user].fragmented; + + dev_wait = max(0LL, allowed - fragmented); + + if (dev_min_wait < 0 || dev_wait < wait) { + dev_min_wait = dev_idx; + dev_min_wait_fragmented = fragmented; + dev_min_wait_allowed = allowed; + } + } + + bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu", + dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); return 0; } -- cgit From 5a3c24714c8bd5e0d01d1547c0848147f2f79f69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jun 2022 17:56:25 -0400 Subject: bcachefs: Make verbose option settable at runtime -o verbose is very useful, and we're starting to use it more for runtime debug statements - making it possible to enable at runtime is a no brainer. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 43b4488b8c6f..30cf9a2d9dc1 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -269,7 +269,7 @@ enum opt_type { BCH2_NO_SB_OPT, true, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ -- cgit From 2817d453819a10654ac72e5f4937d4653dea1648 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jun 2022 21:33:43 -0400 Subject: bcachefs: Fix assertion in topology repair If we were at the end of the node, when breaking out of the loop we'd pop the assertion on line 446 when cur wasn't NULL. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 123644ffe93c..2de5d97a9d2c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -395,6 +395,7 @@ again: bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; @@ -413,6 +414,7 @@ again: bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; -- cgit From 58aaa0836be3be279f17db9ba07f3a58da033a9f Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 23 Jun 2022 10:28:30 +1200 Subject: bcachefs: fix __dev_available(). 
__dev_available() now calculates available buckets correctly. Previously it would almost always return 0 when we have cached data. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 080bcb20a5b0..b4cf10a47c52 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -159,12 +159,12 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, enum alloc_reserve reserve) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - - usage.d[BCH_DATA_cached].buckets - - usage.d[BCH_DATA_need_gc_gens].buckets - - usage.d[BCH_DATA_need_discard].buckets - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); + usage.d[BCH_DATA_free].buckets + + usage.d[BCH_DATA_cached].buckets + + usage.d[BCH_DATA_need_gc_gens].buckets + + usage.d[BCH_DATA_need_discard].buckets + - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); } static inline u64 dev_buckets_available(struct bch_dev *ca, -- cgit From 7a47d0993be95bf68e2e04653f40d311e3c25bed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jun 2022 22:53:43 -0400 Subject: bcachefs: Always descend to leaf nodes in btree_gc If a btree node is unreadable, it's the topology repair that fixes that and it's kicked off by btree_gc, so btree_gc needs to touch every node and verify that they can be read. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2de5d97a9d2c..0447f5a51b5e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -851,10 +851,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; + unsigned depth = metadata_only ? 1 : 0; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -997,10 +994,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; + unsigned target_depth = metadata_only ? 1 : 0; struct printbuf buf = PRINTBUF; int ret = 0; -- cgit From 962ad1a76669443126c6531352380f56d6e5d7d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jun 2022 18:26:01 -0400 Subject: bcachefs: Don't BUG_ON() inode link count underflow This switches that assertion to a bch2_trans_inconsistent() call, as it should be.
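As a rough illustration of the pattern this change applies - a simplified sketch only, not code from the patch; the real bch2_inode_nlink_dec() added in the diff below also catches an inode that is marked unlinked while still having a nonzero link count:

/*
 * Sketch: on link count underflow, report a filesystem inconsistency
 * and bail out instead of BUG_ON()ing, so the error is surfaced
 * through the transaction layer rather than crashing the kernel.
 */
static void nlink_dec_sketch(struct btree_trans *trans,
			     struct bch_inode_unpacked *bi)
{
	if (bi->bi_flags & BCH_INODE_UNLINKED) {
		/* previously: BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); */
		bch2_trans_inconsistent(trans, "inode %llu link count underflow",
					bi->bi_inum);
		return;
	}

	if (bi->bi_nlink)
		bi->bi_nlink--;
	else
		bi->bi_flags |= BCH_INODE_UNLINKED;
}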
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 8 +++++--- fs/bcachefs/inode.c | 33 +++++++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 20 +++----------------- 3 files changed, 41 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index d543480be111..53ffc684223c 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -204,7 +204,9 @@ int bch2_link_trans(struct btree_trans *trans, goto err; inode_u->bi_ctime = now; - bch2_inode_nlink_inc(inode_u); + ret = bch2_inode_nlink_inc(inode_u); + if (ret) + return ret; ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) @@ -297,7 +299,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; } else { - bch2_inode_nlink_dec(inode_u); + bch2_inode_nlink_dec(trans, inode_u); } if (inode_u->bi_dir == dirent_iter.pos.inode && @@ -462,7 +464,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (mode == BCH_RENAME_OVERWRITE) - bch2_inode_nlink_dec(dst_inode_u); + bch2_inode_nlink_dec(trans, dst_inode_u); src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 6c0547151d50..5de66d62028b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -716,3 +716,36 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inum, inode)); } + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else { + if (bi->bi_nlink == U32_MAX) + return -EINVAL; + + bi->bi_nlink++; + } + + return 0; +} + +void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) +{ + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", + bi->bi_inum); + return; + } + + if (bi->bi_flags & BCH_INODE_UNLINKED) { + bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); + return; + } + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9442600a7440..2ac2fc10513b 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -164,23 +164,6 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 
2 : 1; } -static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -{ - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; - else - bi->bi_nlink++; -} - -static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) -{ - BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; -} - static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) { return bi->bi_flags & BCH_INODE_UNLINKED @@ -200,4 +183,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } +int bch2_inode_nlink_inc(struct bch_inode_unpacked *); +void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + #endif /* _BCACHEFS_INODE_H */ -- cgit From 38585367442f09606d7a529be3290f2fd4cbcf84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jun 2022 22:26:41 -0400 Subject: bcachefs: Bucket invalidate path improvements - invalidate_one_bucket() now returns 1 when we don't have any buckets on this device to invalidate, ensuring we don't spin - the tracepoint invocation is moved to after the transaction commit, and we now include the number of cached sectors in the tracepoint Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 26 ++++++++++++++++++-------- fs/bcachefs/trace.h | 11 +++++++---- 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 933334fed960..b784fe7c5b81 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1077,7 +1077,8 @@ void bch2_do_discards(struct bch_fs *c) percpu_ref_put(&c->writes); } -static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) +static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca, + struct bpos *bucket_pos, unsigned *cached_sectors) { struct bch_fs *c = trans->c; struct btree_iter lru_iter, alloc_iter = { NULL }; @@ -1095,8 +1096,10 @@ next_lru: if (ret) goto out; - if (!k.k || k.k->p.inode != ca->dev_idx) + if (!k.k || k.k->p.inode != ca->dev_idx) { + ret = 1; goto out; + } if (k.k->type != KEY_TYPE_lru) { prt_printf(&buf, "non lru key in lru btree:\n "); @@ -1116,8 +1119,9 @@ next_lru: idx = k.k->p.offset; bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); - a = bch2_trans_start_alloc_update(trans, &alloc_iter, - POS(ca->dev_idx, bucket)); + *bucket_pos = POS(ca->dev_idx, bucket); + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, *bucket_pos); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1139,6 +1143,11 @@ next_lru: } } + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + + *cached_sectors = a->v.cached_sectors; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); a->v.gen++; a->v.data_type = 0; @@ -1151,8 +1160,6 @@ next_lru: BTREE_TRIGGER_BUCKET_INVALIDATE); if (ret) goto out; - - trace_invalidate_bucket(c, a->k.p.inode, a->k.p.offset); out: bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &lru_iter); @@ -1165,7 +1172,8 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_dev *ca; struct btree_trans trans; - unsigned i; + struct bpos bucket; + unsigned i, sectors; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -1178,10 +1186,12 @@ static void bch2_do_invalidates_work(struct work_struct *work) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_USE_RESERVE| 
BTREE_INSERT_NOFAIL, - invalidate_one_bucket(&trans, ca)); + invalidate_one_bucket(&trans, ca, &bucket, + §ors)); if (ret) break; + trace_invalidate_bucket(c, bucket.inode, bucket.offset, sectors); this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); } } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index f5aa10762611..b0ecf18fa139 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -542,24 +542,27 @@ TRACE_EVENT(discard_buckets, ); TRACE_EVENT(invalidate_bucket, - TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket), - TP_ARGS(c, dev, bucket), + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), + TP_ARGS(c, dev, bucket, sectors), TP_STRUCT__entry( __field(dev_t, dev ) __field(u32, dev_idx ) + __field(u32, sectors ) __field(u64, bucket ) ), TP_fast_assign( __entry->dev = c->dev; __entry->dev_idx = dev; + __entry->sectors = sectors; __entry->bucket = bucket; ), - TP_printk("%d:%d invalidated %u:%llu", + TP_printk("%d:%d invalidated %u:%llu cached sectors %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) + __entry->dev_idx, __entry->bucket, + __entry->sectors) ); /* Moving IO */ -- cgit From 22add2ec6705cd66977717c49105e0910b2ef144 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Jun 2022 18:31:51 -0400 Subject: bcachefs: Use BTREE_INSERT_LAZY_RW in bch2_check_alloc_info() This runs before we go rw for journal replay, but after we're allowed to go rw. It might be time to consider killing BTREE_INSERT_LAZY_RW, though. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b784fe7c5b81..caaf2ab702e6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -784,7 +784,9 @@ int bch2_check_alloc_info(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, bch2_check_alloc_key(&trans, &iter)); if (ret) break; @@ -797,7 +799,9 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); while (1) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, bch2_check_discard_freespace_key(&trans, &iter)); if (ret) break; @@ -812,7 +816,9 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); while (1) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, bch2_check_discard_freespace_key(&trans, &iter)); if (ret) break; -- cgit From e34da43e33f8c0b42b74bf9aa86042d16884183b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jun 2022 22:43:00 -0400 Subject: bcachefs: Improve bch2_check_alloc_info - In check_alloc_key(), previously we were re-initializing iterators for the need_discard and freespace btrees for every alloc key we checked. But this was causing us to redo lookups into the journal keys every time, since those lookups are cached in struct btree_iter. This initializes the iterators in bch2_check_alloc_info and passes them into check_alloc_key(). 
- Make the looping more consistent/efficient in bch2_check_alloc_info() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 57 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index caaf2ab702e6..9ede98a3dc64 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -606,21 +606,21 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, } static int bch2_check_alloc_key(struct btree_trans *trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; - struct btree_iter discard_iter, freespace_iter; struct bch_alloc_v4 a; unsigned discard_key_type, freespace_key_type; struct bkey_s_c alloc_k, k; struct printbuf buf = PRINTBUF; - struct printbuf buf2 = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); if (!alloc_k.k) - return 0; + return 1; ret = bkey_err(alloc_k); if (ret) @@ -642,12 +642,10 @@ static int bch2_check_alloc_key(struct btree_trans *trans, freespace_key_type = a.data_type == BCH_DATA_free ? KEY_TYPE_set : 0; - bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, - alloc_k.k->p, 0); - bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, - alloc_freespace_pos(alloc_k.k->p, a), 0); + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); - k = bch2_btree_iter_peek_slot(&discard_iter); + k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; @@ -667,14 +665,14 @@ static int bch2_check_alloc_key(struct btree_trans *trans, bkey_init(&update->k); update->k.type = discard_key_type; - update->k.p = discard_iter.pos; + update->k.p = discard_iter->pos; - ret = bch2_trans_update(trans, &discard_iter, update, 0); + ret = bch2_trans_update(trans, discard_iter, update, 0); if (ret) goto err; } - k = bch2_btree_iter_peek_slot(&freespace_iter); + k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -695,18 +693,15 @@ static int bch2_check_alloc_key(struct btree_trans *trans, bkey_init(&update->k); update->k.type = freespace_key_type; - update->k.p = freespace_iter.pos; + update->k.p = freespace_iter->pos; bch2_key_resize(&update->k, 1); - ret = bch2_trans_update(trans, &freespace_iter, update, 0); + ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) goto err; } err: fsck_err: - bch2_trans_iter_exit(trans, &freespace_iter); - bch2_trans_iter_exit(trans, &discard_iter); - printbuf_exit(&buf2); printbuf_exit(&buf); return ret; } @@ -776,24 +771,34 @@ delete: int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, discard_iter, freespace_iter; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_check_alloc_key(&trans, &iter)); + 
bch2_check_alloc_key(&trans, &iter, + &discard_iter, + &freespace_iter)); if (ret) break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(&trans, &freespace_iter); + bch2_trans_iter_exit(&trans, &discard_iter); bch2_trans_iter_exit(&trans, &iter); - if (ret) + if (ret < 0) goto err; bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, @@ -806,11 +811,11 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) break; - bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_advance(&iter); } bch2_trans_iter_exit(&trans, &iter); - if (ret) + if (ret < 0) goto err; bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, @@ -823,7 +828,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) break; - bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_advance(&iter); } bch2_trans_iter_exit(&trans, &iter); err: -- cgit From 47ab0c5f6a1e6ac1e9387181585fb39393fec4ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Jun 2022 20:34:34 -0400 Subject: bcachefs: Fix bch2_check_alloc_key() bch2_check_alloc_key() was failing to check buckets that didn't have alloc keys yet (because they'd never been used) - they still need to be added to the freespace btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9ede98a3dc64..f4457d62d75e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -618,7 +618,9 @@ static int bch2_check_alloc_key(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_btree_iter_peek(alloc_iter); + alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) + ? bch2_btree_iter_peek_slot(alloc_iter) + : bch2_btree_iter_peek(alloc_iter); if (!alloc_k.k) return 1; -- cgit From 7f5c5d20f01483ba53233e3e2c54848e0b2d9ecd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jun 2022 19:17:45 -0400 Subject: bcachefs: Redo data_update interface This patch significantly cleans up and simplifies the data_update interface. Instead of only being able to specify a single pointer by device to rewrite, we're now able to specify any or all of the pointers in the original extent to be rewritten, as a bitmask. data_cmd is no more: the various pred functions now just return true if the extent should be moved/updated. All the data_update path does is rewrite existing replicas, or add new ones. This fixes a bug with background compression on replicated filesystems, where rebalance -> data_update would incorrectly drop the wrong old replica, and keep trying to recompress an extent pointer and each time failing to drop the right replica. Oops. Now, the data update path doesn't look at the io options to decide which pointers to keep and which to drop - it only goes off of the data_update_options passed to it.
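To make the new interface concrete, here is a minimal illustrative sketch (not code from this patch) of a move predicate under the bitmask scheme: it sets one bit in data_update_opts.rewrite_ptrs per pointer that should be rewritten and returns true if there is anything to do. The helper name and the device argument are made up for illustration; the signature follows the bool-returning pred type and the data_update_opts layout introduced in the diff below.

/*
 * Illustrative only: select every pointer on a given device for
 * rewrite by setting the corresponding bit in rewrite_ptrs.
 */
static bool rewrite_dev_pred_sketch(struct bch_fs *c, void *arg,
				    struct bkey_s_c k,
				    struct bch_io_opts *io_opts,
				    struct data_update_opts *data_opts)
{
	unsigned dev = *((unsigned *) arg);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	unsigned i = 0;

	data_opts->rewrite_ptrs = 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	/* true means "move/update this extent", replacing the old data_cmd */
	return data_opts->rewrite_ptrs != 0;
}

bch2_data_update_init() then derives the write's replica count from hweight32(rewrite_ptrs) plus any extra_replicas, as shown in the data_update.c diff below.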
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 209 ++++++++++++++++++++++------------------------ fs/bcachefs/data_update.h | 39 ++++----- fs/bcachefs/extents.c | 83 ++++++++++-------- fs/bcachefs/extents.h | 8 +- fs/bcachefs/io.c | 13 ++- fs/bcachefs/move.c | 140 ++++++++++++------------------- fs/bcachefs/move.h | 5 +- fs/bcachefs/movinggc.c | 63 +++++++------- fs/bcachefs/rebalance.c | 84 ++++++++++--------- 9 files changed, 304 insertions(+), 340 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0161b0a9f36e..f7bce89f84ed 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -89,6 +89,16 @@ next: return ret; } +static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + ptr->cached = true; +} + int bch2_data_update_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -113,6 +123,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) while (1) { struct bkey_s_c k; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); struct bkey_i *insert; struct bkey_i_extent *new; const union bch_extent_entry *entry; @@ -121,6 +132,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bool did_work = false; bool extending = false, should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; + unsigned i; bch2_trans_begin(&trans); @@ -131,8 +143,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) new = bkey_i_to_extent(bch2_keylist_front(keys)); - if (bversion_cmp(k.k->version, new->k.version) || - !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) + if (!bch2_extents_match(k, old)) goto nomatch; bkey_reassemble(_insert.k, k); @@ -146,20 +157,34 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); - if (m->data_cmd == DATA_REWRITE) { - struct bch_extent_ptr *new_ptr, *old_ptr = (void *) - bch2_bkey_has_device(bkey_i_to_s_c(insert), - m->data_opts.rewrite_dev); - if (!old_ptr) - goto nomatch; - - if (old_ptr->cached) - extent_for_each_ptr(extent_i_to_s(new), new_ptr) - new_ptr->cached = true; - - __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + /* + * @old: extent that we read from + * @insert: key that we're going to update, initialized from + * extent currently in btree - same as @old unless we raced with + * other updates + * @new: extent with new pointers that we'll be adding to @insert + * + * Fist, drop rewrite_ptrs from @new: + */ + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { + /* + * If we're going to be adding a pointer to the + * same device, we have to drop the old one - + * otherwise, we can just mark it cached: + */ + if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) + bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); + else + bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); + } + i++; } + + /* Add new ptrs: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* @@ -177,12 +202,8 @@ int bch2_data_update_index_update(struct bch_write_op *op) if (!did_work) goto nomatch; - bch2_bkey_narrow_crcs(insert, - (struct bch_extent_crc_unpacked) { 0 }); + bch2_bkey_narrow_crcs(insert, (struct 
bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, bkey_i_to_s(insert)); - bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); ret = bch2_sum_sector_overwrites(&trans, &iter, insert, &extending, @@ -250,134 +271,100 @@ out: return ret; } -void bch2_data_update_read_done(struct data_update *m, struct bch_read_bio *rbio) +void bch2_data_update_read_done(struct data_update *m, + struct bch_extent_crc_unpacked crc) { /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->ptr = rbio->pick.ptr; - m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; - m->op.devs_have = rbio->devs_have; - m->op.pos = rbio->data_pos; - m->op.version = rbio->version; - m->op.crc = rbio->pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; - if (m->data_cmd == DATA_REWRITE) - bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); + closure_call(&m->op.cl, bch2_write, NULL, NULL); +} + +void bch2_data_update_exit(struct data_update *update) +{ + struct bch_fs *c = update->op.c; + + bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); } int bch2_data_update_init(struct bch_fs *c, struct data_update *m, struct write_point_specifier wp, struct bch_io_opts io_opts, - enum data_cmd data_cmd, - struct data_opts data_opts, + struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; struct extent_ptr_decoded p; + unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; int ret; + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; - m->data_cmd = data_cmd; m->data_opts = data_opts; - m->nr_ptrs_reserved = 0; bch2_write_op_init(&m->op, c, io_opts); - - if (!bch2_bkey_is_incompressible(k)) - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; - else - m->op.incompressible = true; - + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; m->op.target = data_opts.target, m->op.write_point = wp; - - /* - * op->csum_type is normally initialized from the fs/file's current - * options - but if an extent is encrypted, we require that it stays - * encrypted: - */ - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (bch2_csum_type_is_encryption(crc.csum_type)) { - m->op.nonce = crc.nonce + crc.offset; - m->op.csum_type = crc.csum_type; - break; - } - - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { - m->op.alloc_reserve = RESERVE_movinggc; - } else { - /* XXX: this should probably be passed in */ - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - } - - m->op.flags |= BCH_WRITE_PAGES_STABLE| + m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL| - BCH_WRITE_MOVE; + BCH_WRITE_MOVE| + m->data_opts.write_flags; + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + m->op.alloc_reserve = RESERVE_movinggc; - m->op.nr_replicas = data_opts.nr_replicas; - m->op.nr_replicas_required = data_opts.nr_replicas; + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + 
m->data_opts.rewrite_ptrs &= ~(1U << i); - switch (data_cmd) { - case DATA_ADD_REPLICAS: { - /* - * DATA_ADD_REPLICAS is used for moving data to a different - * device in the background, and due to compression the new copy - * might take up more space than the old copy: - */ -#if 0 - int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_ptrs_allocated(k); -#endif - int nr = (int) io_opts.data_replicas; + if (!((1U << i) & m->data_opts.rewrite_ptrs)) + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - if (nr > 0) { - m->op.nr_replicas = m->nr_ptrs_reserved = nr; + if (((1U << i) & m->data_opts.rewrite_ptrs) && + crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; - ret = bch2_disk_reservation_get(c, &m->op.res, - k.k->size, m->op.nr_replicas, 0); - if (ret) - return ret; + /* + * op->csum_type is normally initialized from the fs/file's + * current options - but if an extent is encrypted, we require + * that it stays encrypted: + */ + if (bch2_csum_type_is_encryption(p.crc.csum_type)) { + m->op.nonce = p.crc.nonce + p.crc.offset; + m->op.csum_type = p.crc.csum_type; } - break; - } - case DATA_REWRITE: { - unsigned compressed_sectors = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == data_opts.rewrite_dev) { - if (p.ptr.cached) - m->op.flags |= BCH_WRITE_CACHED; - - if (!p.ptr.cached && - crc_is_compressed(p.crc)) - compressed_sectors += p.crc.compressed_size; - } + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + m->op.incompressible = true; - if (compressed_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, - k.k->size * m->op.nr_replicas, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - break; + i++; } - case DATA_PROMOTE: - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; - m->op.flags |= BCH_WRITE_CACHED; - break; - default: - BUG(); + + if (reserve_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, + m->data_opts.extra_replicas + ? 
0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; } + m->op.nr_replicas = m->op.nr_replicas_required = + hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; return 0; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 03b4ca5a4ee8..ee38bd655af1 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -3,46 +3,37 @@ #ifndef _BCACHEFS_DATA_UPDATE_H #define _BCACHEFS_DATA_UPDATE_H +#include "bkey_buf.h" #include "io_types.h" -enum data_cmd { - DATA_SKIP, - DATA_SCRUB, - DATA_ADD_REPLICAS, - DATA_REWRITE, - DATA_PROMOTE, -}; +struct moving_context; -struct data_opts { +struct data_update_opts { + unsigned rewrite_ptrs; u16 target; - u8 rewrite_dev; - u8 nr_replicas; - int btree_insert_flags; + u8 extra_replicas; + unsigned btree_insert_flags; + unsigned write_flags; }; struct data_update { + /* extent being updated: */ enum btree_id btree_id; - enum data_cmd data_cmd; - struct data_opts data_opts; - - unsigned nr_ptrs_reserved; - + struct bkey_buf k; + struct data_update_opts data_opts; struct moving_context *ctxt; - - /* what we read: */ - struct bch_extent_ptr ptr; - u64 offset; - struct bch_write_op op; }; int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, struct bch_read_bio *); +void bch2_data_update_read_done(struct data_update *, + struct bch_extent_crc_unpacked); + +void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct bch_fs *, struct data_update *, struct write_point_specifier, - struct bch_io_opts, - enum data_cmd, struct data_opts, + struct bch_io_opts, struct data_update_opts, enum btree_id, struct bkey_s_c); #endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4e44234a2b2c..38836c1990aa 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -25,6 +25,8 @@ #include "trace.h" #include "util.h" +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -687,37 +689,6 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) return durability; } -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); @@ -821,8 +792,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct 
bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; @@ -894,6 +865,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + + if (ptr) + __bch2_bkey_drop_ptr(k, ptr); +} + const struct bch_extent_ptr * bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) { @@ -938,6 +917,44 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, return false; } +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; +} + +bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, + struct bkey_s_c k2) +{ + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. 
* diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 4f41f0fd6cb1..3c17b81130bb 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -577,15 +577,10 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, - unsigned, unsigned); - void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -607,11 +602,14 @@ do { \ } while (0) void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); +bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); +bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 743449ed7fae..c22ce1eb6b8b 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1490,13 +1490,12 @@ static void promote_done(struct bch_write_op *wop) bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + bch2_data_update_exit(&op->write); promote_free(c, op); } static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) { - struct bch_fs *c = rbio->c; struct bio *bio = &op->write.op.wbio.bio; trace_promote(&rbio->bio); @@ -1509,9 +1508,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - bch2_data_update_read_done(&op->write, rbio); - - closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, NULL); + bch2_data_update_read_done(&op->write, rbio->pick.crc); } static struct promote_op *__promote_alloc(struct bch_fs *c, @@ -1569,10 +1566,10 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, ret = bch2_data_update_init(c, &op->write, writepoint_hashed((unsigned long) current), opts, - DATA_PROMOTE, - (struct data_opts) { + (struct data_update_opts) { .target = opts.promote_target, - .nr_replicas = 1, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); BUG_ON(ret); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a3a486cff28e..4060678cf716 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,17 +38,9 @@ struct moving_io { static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; - struct bvec_iter_all iter; - struct bio_vec *bv; - - bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) - if (bv->bv_page) - __free_page(bv->bv_page); + bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); - kfree(io); } @@ -72,8 +64,7 @@ static void move_write(struct 
moving_io *io) closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - bch2_data_update_read_done(&io->write, &io->rbio); - closure_call(&io->write.op.cl, bch2_write, NULL, NULL); + bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -135,8 +126,7 @@ static int bch2_move_extent(struct btree_trans *trans, struct bch_io_opts io_opts, enum btree_id btree_id, struct bkey_s_c k, - enum data_cmd data_cmd, - struct data_opts data_opts) + struct data_update_opts data_opts) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -180,10 +170,11 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_data_update_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, btree_id, k); + data_opts, btree_id, k); if (ret) goto err_free_pages; + io->write.ctxt = ctxt; io->write.op.end_io = move_write_done; atomic64_inc(&ctxt->stats->keys_moved); @@ -262,8 +253,7 @@ static int __bch2_move_data(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct data_opts data_opts; - enum data_cmd data_cmd; + struct data_update_opts data_opts; u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; @@ -350,18 +340,9 @@ static int __bch2_move_data(struct bch_fs *c, cur_inum = k.k->p.inode; } - switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { - case DATA_SKIP: + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, k, &io_opts, &data_opts)) goto next; - case DATA_SCRUB: - BUG(); - case DATA_ADD_REPLICAS: - case DATA_REWRITE: - case DATA_PROMOTE: - break; - default: - BUG(); - } /* * The iterator gets unlocked by __bch2_read_extent - need to @@ -370,8 +351,8 @@ static int __bch2_move_data(struct bch_fs *c, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, - data_cmd, data_opts); + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, + btree_id, k, data_opts); if (ret2) { if (ret2 == -EINTR) continue; @@ -476,9 +457,9 @@ int bch2_move_data(struct bch_fs *c, return ret; } -typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_opts *); +typedef bool (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, @@ -492,8 +473,7 @@ static int bch2_move_btree(struct bch_fs *c, struct btree_iter iter; struct btree *b; enum btree_id id; - struct data_opts data_opts; - enum data_cmd cmd; + struct data_update_opts data_opts; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -522,17 +502,8 @@ retry: stats->pos = iter.pos; - switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { - case DATA_SKIP: + if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - case DATA_SCRUB: - BUG(); - case DATA_ADD_REPLICAS: - case DATA_REWRITE: - break; - default: - BUG(); - } ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; if (ret == -EINTR) @@ -562,20 +533,10 @@ next: return ret; } -#if 0 -static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - return DATA_SCRUB; -} -#endif - -static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - 
struct data_opts *data_opts) +static bool rereplicate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); unsigned replicas = bkey_is_btree_ptr(k.k) @@ -583,43 +544,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) - return DATA_SKIP; + return false; data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = replicas - nr_good; data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; + return true; } -static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool migrate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; struct bch_ioctl_data *op = arg; + unsigned i = 0; - if (!bch2_bkey_has_device(k, op->migrate.dev)) - return DATA_SKIP; - + data_opts->rewrite_ptrs = 0; data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = 0; data_opts->btree_insert_flags = 0; - data_opts->rewrite_dev = op->migrate.dev; - return DATA_REWRITE; + + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == op->migrate.dev) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + + return data_opts->rewrite_ptrs != 0;; } -static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } -static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } @@ -648,21 +616,21 @@ static bool bformat_needs_redo(struct bkey_format *f) return false; } -static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (b->version_ondisk != c->sb.version || btree_node_need_rewrite(b) || bformat_needs_redo(&b->format)) { data_opts->target = 0; - data_opts->nr_replicas = 1; + data_opts->extra_replicas = 0; data_opts->btree_insert_flags = 0; - return DATA_REWRITE; + return true; } - return DATA_SKIP; + return false; } int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 6d273f67a82c..fd5562909382 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -24,9 +24,8 @@ struct moving_context { wait_queue_head_t wait; }; -typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, - struct bkey_s_c, - struct bch_io_opts *, struct data_opts *); +typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, + struct bch_io_opts *, struct data_update_opts *); int 
bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 1e2de1e818c1..d63b9fea4f05 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -39,15 +39,32 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) cmp_int(l->offset, r->offset); } -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +static bool copygc_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; + unsigned i = 0; + + /* + * We need to use the journal reserve here, because + * - journal reclaim depends on btree key cache + * flushing to make forward progress, + * - which has to make forward progress when the + * journal is pre-reservation full, + * - and depends on allocation - meaning allocator and + * copygc + */ + + data_opts->rewrite_ptrs = 0; + data_opts->target = io_opts->background_target; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| + JOURNAL_WATERMARK_copygc; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); @@ -55,12 +72,12 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, .dev = p.ptr.dev, .offset = p.ptr.offset, }; - ssize_t i; + ssize_t eytz; if (p.ptr.cached) continue; - i = eytzinger0_find_le(h->data, h->used, + eytz = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, &search); #if 0 @@ -74,34 +91,16 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, BUG_ON(i != j); #endif - if (i >= 0 && - p.ptr.dev == h->data[i].dev && - p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && - p.ptr.gen == h->data[i].gen) { - /* - * We need to use the journal reserve here, because - * - journal reclaim depends on btree key cache - * flushing to make forward progress, - * - which has to make forward progress when the - * journal is pre-reservation full, - * - and depends on allocation - meaning allocator and - * copygc - */ - - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - JOURNAL_WATERMARK_copygc; - data_opts->rewrite_dev = p.ptr.dev; - - if (p.has_ec) - data_opts->nr_replicas += p.ec.redundancy; - - return DATA_REWRITE; - } + if (eytz >= 0 && + p.ptr.dev == h->data[eytz].dev && + p.ptr.offset < h->data[eytz].offset + ca->mi.bucket_size && + p.ptr.gen == h->data[eytz].gen) + data_opts->rewrite_ptrs |= 1U << i; + + i++; } - return DATA_SKIP; + return data_opts->rewrite_ptrs != 0; } static inline int fragmentation_cmp(copygc_heap *heap, diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 1724ae36c0f4..63b24dc9c917 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -22,62 +22,70 @@ * returns -1 if it should not be moved, or * device of pointer that should be moved, if known, or INT_MAX if unknown */ -static int __bch2_rebalance_pred(struct bch_fs *c, - struct bkey_s_c k, - struct bch_io_opts *io_opts) +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union 
bch_extent_entry *entry; - struct extent_ptr_decoded p; + unsigned i; + + data_opts->rewrite_ptrs = 0; + data_opts->target = io_opts->background_target; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; if (io_opts->background_compression && - !bch2_bkey_is_incompressible(k)) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + !bch2_bkey_is_incompressible(k)) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached && p.crc.compression_type != bch2_compression_opt_to_type[io_opts->background_compression]) - return p.ptr.dev; + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } - if (io_opts->background_target) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) - return p.ptr.dev; + if (io_opts->background_target) { + const struct bch_extent_ptr *ptr; - return -1; + i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } + + return data_opts->rewrite_ptrs != 0; } void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - atomic64_t *counter; - int dev; + struct data_update_opts update_opts = { 0 }; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned i; - dev = __bch2_rebalance_pred(c, k, io_opts); - if (dev < 0) + if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) return; - counter = dev < INT_MAX - ? &bch_dev_bkey_exists(c, dev)->rebalance_work - : &c->rebalance.work_unknown_dev; - - if (atomic64_add_return(k.k->size, counter) == k.k->size) - rebalance_wakeup(c); -} - -static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; - } else { - return DATA_SKIP; + i = 0; + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + if ((1U << i) && update_opts.rewrite_ptrs) + if (atomic64_add_return(k.k->size, + &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == + k.k->size) + rebalance_wakeup(c); + i++; } } -- cgit From c91996c50a9ad6569cf9cb52e79c171f0d34814d Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 16 Jun 2022 02:06:43 +1200 Subject: bcachefs: data jobs, including rebalance wait for copygc. move_ratelimit() now has a bool that specifies whether we want to wait for copygc to finish. When copygc is running, we're probably low on free buckets instead of consuming the remaining buckets, we want to wait for copygc to finish. This should help with performance, and run away bucket fragmentation. 
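In other words: before each unit of background move work, a data job (when asked to) sleeps on a wait queue until copygc clears its running flag. A minimal user-space sketch of that wait pattern follows; a pthread mutex and condition variable stand in for the kernel's copygc_running flag and copygc_running_wq wait queue, and all names are illustrative only (the real move_ratelimit() is in the diff below).

	/* build: cc -pthread copygc_wait_demo.c */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Stand-ins for c->copygc_running and c->copygc_running_wq: */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t copygc_done = PTHREAD_COND_INITIALIZER;
	static bool copygc_running;

	static void *copygc_thread(void *arg)
	{
		(void)arg;
		sleep(1);				/* pretend to evacuate buckets */

		pthread_mutex_lock(&lock);
		copygc_running = false;			/* copygc pass finished */
		pthread_cond_broadcast(&copygc_done);	/* wake_up() analogue */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	/* Analogue of move_ratelimit(..., wait_on_copygc): */
	static void move_ratelimit(bool wait_on_copygc)
	{
		if (wait_on_copygc) {
			pthread_mutex_lock(&lock);
			while (copygc_running)
				pthread_cond_wait(&copygc_done, &lock);
			pthread_mutex_unlock(&lock);
		}
		/* ...then apply the usual rate limit and in-flight read/write limits... */
	}

	int main(void)
	{
		pthread_t t;

		copygc_running = true;		/* copygc is about to run */
		pthread_create(&t, NULL, copygc_thread, NULL);

		move_ratelimit(true);		/* rebalance/data jobs pass true; copygc itself passes false */
		printf("copygc finished, data move may proceed\n");

		pthread_join(&t, NULL);
		return 0;
	}
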
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 + fs/bcachefs/move.c | 110 ++++++++++++++++++++++++++++-------------------- fs/bcachefs/move.h | 3 +- fs/bcachefs/movinggc.c | 15 +++++-- fs/bcachefs/rebalance.c | 2 +- 5 files changed, 80 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 127323b677df..c07ea9af561d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -825,6 +825,8 @@ mempool_t bio_bounce_pages; copygc_heap copygc_heap; struct write_point copygc_write_point; s64 copygc_wait; + bool copygc_running; + wait_queue_head_t copygc_running_wq; /* DATA PROGRESS STATS */ struct list_head data_progress_list; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4060678cf716..fad15ba7d239 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -237,24 +237,72 @@ err: return ret; } +static int move_ratelimit(struct btree_trans *trans, + struct moving_context *ctxt, + struct bch_ratelimit *rate, + bool wait_on_copygc) +{ + struct bch_fs *c = trans->c; + u64 delay; + + if (wait_on_copygc) { + bch2_trans_unlock(trans); + wait_event_killable(c->copygc_running_wq, + !c->copygc_running || + kthread_should_stop()); + } + + do { + delay = rate ? bch2_ratelimit_delay(rate) : 0; + + if (delay) { + bch2_trans_unlock(trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 1; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->write_sectors) < + c->opts.move_bytes_in_flight >> 9); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->read_sectors) < + c->opts.move_bytes_in_flight >> 9); + + return 0; +} + static int __bch2_move_data(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats, - enum btree_id btree_id) + struct moving_context *ctxt, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats, + enum btree_id btree_id, + bool wait_on_copygc) { - bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct data_update_opts data_opts; - u64 delay, cur_inum = U64_MAX; + u64 cur_inum = U64_MAX; int ret = 0, ret2; bch2_bkey_buf_init(&sk); @@ -271,37 +319,7 @@ static int __bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_reset(rate); - while (1) { - do { - delay = rate ? 
bch2_ratelimit_delay(rate) : 0; - - if (delay) { - bch2_trans_unlock(&trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if (kthread && (ret = kthread_should_stop())) { - __set_current_state(TASK_RUNNING); - goto out; - } - - if (delay) - schedule_timeout(delay); - - if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); - try_to_freeze(); - } - } while (delay); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->write_sectors) < - c->opts.move_bytes_in_flight >> 9); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->read_sectors) < - c->opts.move_bytes_in_flight >> 9); - + while (!move_ratelimit(&trans, ctxt, rate, wait_on_copygc)) { bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -374,7 +392,6 @@ next: next_nondata: bch2_btree_iter_advance(&iter); } -out: bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -413,7 +430,8 @@ int bch2_move_data(struct bch_fs *c, struct bch_ratelimit *rate, struct write_point_specifier wp, move_pred_fn pred, void *arg, - struct bch_move_stats *stats) + struct bch_move_stats *stats, + bool wait_on_copygc) { struct moving_context ctxt = { .stats = stats }; enum btree_id id; @@ -438,7 +456,7 @@ int bch2_move_data(struct bch_fs *c, ret = __bch2_move_data(c, &ctxt, rate, wp, id == start_btree_id ? start_pos : POS_MIN, id == end_btree_id ? end_pos : POS_MAX, - pred, arg, stats, id); + pred, arg, stats, id, wait_on_copygc); if (ret) break; } @@ -675,7 +693,7 @@ int bch2_data_job(struct bch_fs *c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, NULL, writepoint_hashed((unsigned long) current), - rereplicate_pred, c, stats) ?: ret; + rereplicate_pred, c, stats, true) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: @@ -696,7 +714,7 @@ int bch2_data_job(struct bch_fs *c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, NULL, writepoint_hashed((unsigned long) current), - migrate_pred, &op, stats) ?: ret; + migrate_pred, &op, stats, true) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index fd5562909382..d362cb545c0b 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -35,7 +35,8 @@ int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, struct write_point_specifier, move_pred_fn, void *, - struct bch_move_stats *); + struct bch_move_stats *, + bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d63b9fea4f05..8b6ad9ec72af 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -316,7 +316,8 @@ static int bch2_copygc(struct bch_fs *c) NULL, writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, - &move_stats); + &move_stats, + false); if (ret < 0) bch_err(c, "error %i from bch2_move_data() in copygc", ret); if (ret) @@ -381,10 +382,11 @@ static int bch2_copygc_thread(void *arg) struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; u64 last, wait; + int ret = 0; set_freezable(); - while (!kthread_should_stop()) { + while (!ret && !kthread_should_stop()) { cond_resched(); if (kthread_wait_freezable(c->copy_gc_enabled)) @@ -403,8 +405,11 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; - if (bch2_copygc(c)) - break; + c->copygc_running = true; + ret = bch2_copygc(c); + c->copygc_running = false; + + wake_up(&c->copygc_running_wq); } return 0; @@ -448,4 +453,6 @@ int bch2_copygc_start(struct bch_fs *c) void 
bch2_fs_copygc_init(struct bch_fs *c) { + init_waitqueue_head(&c->copygc_running_wq); + c->copygc_running = false; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 63b24dc9c917..57082260fc00 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -255,7 +255,7 @@ static int bch2_rebalance_thread(void *arg) NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), rebalance_pred, NULL, - &move_stats); + &move_stats, true); } return 0; -- cgit From 0337cc7eeed19e81e50414b5199bb65029ca0ed5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jun 2022 15:40:26 -0400 Subject: bcachefs: move.c refactoring - add bch2_moving_ctxt_(init|exit) - split out __bch2_evacutae_bucket() which takes an existing moving_ctxt, this will be used for improving copygc performance by pipelining across multiple buckets Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 168 +++++++++++++++++++++++++++--------------------- fs/bcachefs/move.h | 22 +++++-- fs/bcachefs/movinggc.c | 6 +- fs/bcachefs/rebalance.c | 5 +- 4 files changed, 114 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fad15ba7d239..a19c3117f9fe 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -20,6 +20,20 @@ #include #include +static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} + struct moving_io { struct list_head list; struct closure cl; @@ -120,9 +134,51 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, atomic_read(&ctxt->write_sectors) != sectors_pending); } +void bch2_moving_ctxt_exit(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + progress_list_del(ctxt->c, ctxt->stats); + + EBUG_ON(atomic_read(&ctxt->write_sectors)); + + trace_move_data(ctxt->c, + atomic64_read(&ctxt->stats->sectors_moved), + atomic64_read(&ctxt->stats->keys_moved)); +} + +void bch2_moving_ctxt_init(struct moving_context *ctxt, + struct bch_fs *c, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->c = c; + ctxt->rate = rate; + ctxt->stats = stats; + ctxt->wp = wp; + ctxt->wait_on_copygc = wait_on_copygc; + + progress_list_add(c, stats); + closure_init_stack(&ctxt->cl); + INIT_LIST_HEAD(&ctxt->reads); + init_waitqueue_head(&ctxt->wait); + + if (stats) + stats->data_type = BCH_DATA_user; +} + +void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + static int bch2_move_extent(struct btree_trans *trans, struct moving_context *ctxt, - struct write_point_specifier wp, struct bch_io_opts io_opts, enum btree_id btree_id, struct bkey_s_c k, @@ -169,7 +225,7 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(c, &io->write, wp, io_opts, + ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, data_opts, btree_id, k); if (ret) goto err_free_pages; @@ -238,14 
+294,12 @@ err: } static int move_ratelimit(struct btree_trans *trans, - struct moving_context *ctxt, - struct bch_ratelimit *rate, - bool wait_on_copygc) + struct moving_context *ctxt) { struct bch_fs *c = trans->c; u64 delay; - if (wait_on_copygc) { + if (ctxt->wait_on_copygc) { bch2_trans_unlock(trans); wait_event_killable(c->copygc_running_wq, !c->copygc_running || @@ -253,7 +307,7 @@ static int move_ratelimit(struct btree_trans *trans, } do { - delay = rate ? bch2_ratelimit_delay(rate) : 0; + delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; if (delay) { bch2_trans_unlock(trans); @@ -285,17 +339,13 @@ static int move_ratelimit(struct btree_trans *trans, return 0; } -static int __bch2_move_data(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_ratelimit *rate, - struct write_point_specifier wp, +static int __bch2_move_data(struct moving_context *ctxt, struct bpos start, struct bpos end, move_pred_fn pred, void *arg, - struct bch_move_stats *stats, - enum btree_id btree_id, - bool wait_on_copygc) + enum btree_id btree_id) { + struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; @@ -308,18 +358,18 @@ static int __bch2_move_data(struct bch_fs *c, bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_user; - stats->btree_id = btree_id; - stats->pos = start; + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->btree_id = btree_id; + ctxt->stats->pos = start; bch2_trans_iter_init(&trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); - if (rate) - bch2_ratelimit_reset(rate); + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); - while (!move_ratelimit(&trans, ctxt, rate, wait_on_copygc)) { + while (!move_ratelimit(&trans, ctxt)) { bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -335,7 +385,7 @@ static int __bch2_move_data(struct bch_fs *c, if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - stats->pos = iter.pos; + ctxt->stats->pos = iter.pos; if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -369,7 +419,7 @@ static int __bch2_move_data(struct bch_fs *c, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, + ret2 = bch2_move_extent(&trans, ctxt, io_opts, btree_id, k, data_opts); if (ret2) { if (ret2 == -EINTR) @@ -385,10 +435,10 @@ static int __bch2_move_data(struct bch_fs *c, goto next; } - if (rate) - bch2_ratelimit_increment(rate, k.k->size); + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); next: - atomic64_add(k.k->size, &stats->sectors_seen); + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); } @@ -400,49 +450,20 @@ next_nondata: return ret; } -inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) -{ - memset(stats, 0, sizeof(*stats)); - - scnprintf(stats->name, sizeof(stats->name), - "%s", name); -} - -static inline void progress_list_add(struct bch_fs *c, - struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_add(&stats->list, &c->data_progress_list); - mutex_unlock(&c->data_progress_lock); -} - -static inline void progress_list_del(struct bch_fs *c, - struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_del(&stats->list); - mutex_unlock(&c->data_progress_lock); -} - int bch2_move_data(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, enum btree_id end_btree_id, struct bpos 
end_pos, struct bch_ratelimit *rate, - struct write_point_specifier wp, - move_pred_fn pred, void *arg, struct bch_move_stats *stats, - bool wait_on_copygc) + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { - struct moving_context ctxt = { .stats = stats }; + struct moving_context ctxt; enum btree_id id; int ret; - progress_list_add(c, stats); - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - - stats->data_type = BCH_DATA_user; + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); for (id = start_btree_id; id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); @@ -453,25 +474,16 @@ int bch2_move_data(struct bch_fs *c, id != BTREE_ID_reflink) continue; - ret = __bch2_move_data(c, &ctxt, rate, wp, + ret = __bch2_move_data(&ctxt, id == start_btree_id ? start_pos : POS_MIN, id == end_btree_id ? end_pos : POS_MAX, - pred, arg, stats, id, wait_on_copygc); + pred, arg, id); if (ret) break; } + bch2_moving_ctxt_exit(&ctxt); - move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); - closure_sync(&ctxt.cl); - - EBUG_ON(atomic_read(&ctxt.write_sectors)); - - trace_move_data(c, - atomic64_read(&stats->sectors_moved), - atomic64_read(&stats->keys_moved)); - - progress_list_del(c, stats); return ret; } @@ -692,8 +704,11 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_data(c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, - NULL, writepoint_hashed((unsigned long) current), - rereplicate_pred, c, stats, true) ?: ret; + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: @@ -713,8 +728,11 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_data(c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, - NULL, writepoint_hashed((unsigned long) current), - migrate_pred, &op, stats, true) ?: ret; + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index d362cb545c0b..6250c75618c4 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -10,11 +10,14 @@ struct bch_read_bio; struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - + struct bch_fs *c; + struct bch_ratelimit *rate; struct bch_move_stats *stats; + struct write_point_specifier wp; + bool wait_on_copygc; + /* For waiting on outstanding reads and writes: */ + struct closure cl; struct list_head reads; /* in flight sectors: */ @@ -25,7 +28,12 @@ struct moving_context { }; typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); + struct bch_io_opts *, struct data_update_opts *); + +void bch2_moving_ctxt_exit(struct moving_context *); +void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); @@ -33,10 +41,10 @@ int bch2_move_data(struct bch_fs *, enum btree_id, struct bpos, enum btree_id, struct bpos, struct bch_ratelimit *, - struct write_point_specifier, - move_pred_fn, void *, struct bch_move_stats *, - bool); + struct write_point_specifier, + bool, + move_pred_fn, void *); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, diff 
--git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 8b6ad9ec72af..49fb405c1430 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -314,10 +314,10 @@ static int bch2_copygc(struct bch_fs *c) 0, POS_MIN, BTREE_ID_NR, POS_MAX, NULL, - writepoint_ptr(&c->copygc_write_point), - copygc_pred, NULL, &move_stats, - false); + writepoint_ptr(&c->copygc_write_point), + false, + copygc_pred, NULL); if (ret < 0) bch_err(c, "error %i from bch2_move_data() in copygc", ret); if (ret) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 57082260fc00..1de8183ea295 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -253,9 +253,10 @@ static int bch2_rebalance_thread(void *arg) BTREE_ID_NR, POS_MAX, /* ratelimiting disabled for now */ NULL, /* &r->pd.rate, */ + &move_stats, writepoint_ptr(&c->rebalance_write_point), - rebalance_pred, NULL, - &move_stats, true); + true, + rebalance_pred, NULL); } return 0; -- cgit From 4081ace307c15fb0c15bcc65fce2e3792c1979d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Jun 2022 19:43:35 -0400 Subject: bcachefs: Get ref on c->writes in move.c There's no point reading an extent in order to move it if the write is going to fail because we're shutting down. This patch changes the move path so that moving_io now owns a ref on c->writes - as a bonus, rebalance and copygc will now notice that we're shutting down and exit quicker. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 ------ fs/bcachefs/move.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c22ce1eb6b8b..f137a8e90f07 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1227,12 +1227,6 @@ again: BKEY_EXTENT_U64s_MAX)) break; - if ((op->flags & BCH_WRITE_FROM_INTERNAL) && - percpu_ref_is_dying(&c->writes)) { - ret = -EROFS; - goto err; - } - /* * The copygc thread is now global, which means it's no longer * freeing up space on specific disks, which means that diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a19c3117f9fe..eae93c65e1c7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -52,9 +52,11 @@ struct moving_io { static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; + struct bch_fs *c = ctxt->c; bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); + percpu_ref_put(&c->writes); kfree(io); } @@ -192,6 +194,9 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + if (!percpu_ref_tryget_live(&c->writes)) + return -EROFS; + /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -258,6 +263,7 @@ err_free_pages: err_free: kfree(io); err: + percpu_ref_put(&c->writes); trace_move_alloc_mem_fail(k.k); return ret; } -- cgit From 7c0732b88dfb87b5f32dcf5fdd7984d2acb992b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Jun 2022 17:14:06 -0400 Subject: bcachefs: Fix move path when move_stats == NULL This isn't done very often, but it is legitimate Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index eae93c65e1c7..8b44d95c32ce 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -140,13 +140,15 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) { 
move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); - progress_list_del(ctxt->c, ctxt->stats); - EBUG_ON(atomic_read(&ctxt->write_sectors)); - trace_move_data(ctxt->c, - atomic64_read(&ctxt->stats->sectors_moved), - atomic64_read(&ctxt->stats->keys_moved)); + if (ctxt->stats) { + progress_list_del(ctxt->c, ctxt->stats); + + trace_move_data(ctxt->c, + atomic64_read(&ctxt->stats->sectors_moved), + atomic64_read(&ctxt->stats->keys_moved)); + } } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -164,13 +166,14 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, ctxt->wp = wp; ctxt->wait_on_copygc = wait_on_copygc; - progress_list_add(c, stats); closure_init_stack(&ctxt->cl); INIT_LIST_HEAD(&ctxt->reads); init_waitqueue_head(&ctxt->wait); - if (stats) + if (stats) { + progress_list_add(c, stats); stats->data_type = BCH_DATA_user; + } } void bch_move_stats_init(struct bch_move_stats *stats, char *name) -- cgit From e28307a10656d90a2d33fbf4dc64b881f81c68cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 5 Jul 2022 16:46:40 -0400 Subject: bcachefs: Silence unimportant tracepoints Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fc989b46b67e..b90aff2ad775 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -181,12 +181,14 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - trace_btree_node_relock_fail(trans->fn, _RET_IP_, - path->btree_id, - &path->pos, - (unsigned long) b, - path->l[level].lock_seq, - is_btree_node(path, level) ? b->c.lock.state.seq : 0); + if (b != BTREE_ITER_NO_NODE_CACHED && + b != BTREE_ITER_NO_NODE_INIT) + trace_btree_node_relock_fail(trans->fn, _RET_IP_, + path->btree_id, + &path->pos, + (unsigned long) b, + path->l[level].lock_seq, + is_btree_node(path, level) ? b->c.lock.state.seq : 0); return false; } -- cgit From 1534ebb706efe9765787c0899dfa5ec52331ba8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Jul 2022 22:32:04 -0400 Subject: bcachefs: Put some repair messages behind opts->verbose These messages log the updates we're doing in bch2_check_fix_ptrs(), which is useful when debugging but not usually needed. 
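The mechanical pattern is simple option-gated logging: the chatty per-key repair messages are only emitted when the filesystem was mounted with the verbose option. A trivial self-contained sketch of that kind of gating (plain C, hypothetical names, not the bcachefs logging helpers):

	#include <stdarg.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct opts { bool verbose; };

	/* Emit chatty repair/debug messages only when verbose was requested: */
	static void log_verbose(const struct opts *opts, const char *fmt, ...)
	{
		va_list ap;

		if (!opts->verbose)
			return;
		va_start(ap, fmt);
		vprintf(fmt, ap);
		va_end(ap);
		putchar('\n');
	}

	int main(void)
	{
		struct opts quiet = { .verbose = false }, chatty = { .verbose = true };

		log_verbose(&quiet,  "updated %s", "u64s 5 type extent ...");	/* suppressed */
		log_verbose(&chatty, "updated %s", "u64s 5 type extent ...");	/* printed */
		return 0;
	}
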
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0447f5a51b5e..1499885d899f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -748,13 +748,15 @@ found: if (level) bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, *k); - bch_info(c, "updated %s", buf.buf); + if (c->opts.verbose) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } *k = bkey_i_to_s_c(new); } -- cgit From 80b3bf33d35e8f5bf4323be71777e9aab66c3a90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Jul 2022 21:06:52 -0400 Subject: bcachefs: Silence some fsck errors when reconstructing alloc info There's no need to print fsck errors for errors that are expected, and the user has already opted to repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 +++++++++++++++------------- fs/bcachefs/btree_gc.c | 37 +++++++++++++++++++------------------ 2 files changed, 34 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f4457d62d75e..a511ab9e4e7c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -652,12 +652,13 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (fsck_err_on(k.k->type != discard_key_type, c, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + if (k.k->type != discard_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -679,13 +680,14 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (fsck_err_on(k.k->type != freespace_key_type, c, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + if (k.k->type != freespace_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1499885d899f..8be1c9f2664d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -564,7 +564,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (fsck_err_on(!g->gen_valid, c, + if (c->opts.reconstruct_alloc || + 
fsck_err_on(!g->gen_valid, c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -1176,29 +1177,28 @@ static int bch2_gc_done(struct bch_fs *c, { struct bch_dev *ca = NULL; struct printbuf buf = PRINTBUF; - bool verify = !metadata_only && (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + bool verify = !metadata_only && + !c->opts.reconstruct_alloc && + (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; percpu_down_write(&c->mark_lock); #define copy_field(_f, _msg, ...) \ - if (dst->_f != src->_f) { \ - if (verify) \ - fsck_err(c, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f); \ - dst->_f = src->_f; \ - } + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f))) \ + dst->_f = src->_f #define copy_stripe_field(_f, _msg, ...) \ - if (dst->_f != src->_f) { \ - if (verify) \ - fsck_err(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u", \ - iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f); \ - dst->_f = src->_f; \ - } + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f))) \ + dst->_f = src->_f #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ @@ -1376,7 +1376,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, return 0; #define copy_bucket_field(_f) \ - if (fsck_err_on(new._f != gc._f, c, \ + if (c->opts.reconstruct_alloc || \ + fsck_err_on(new._f != gc._f, c, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ -- cgit From e68914ca849fa51167e2136ad9f6b43c22956d3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Jul 2022 05:25:29 -0400 Subject: bcachefs: Rename __bch2_trans_do() -> commit_do() Better/more descriptive naming, and prep for adding nested_lockrestart_do() and nested_commit_do(). 
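For readers following along: as the new definition in the diff below shows, commit_do() evaluates a transaction update expression and, if it returned success, commits the transaction, retrying the whole sequence whenever a lock restart is reported. A toy standalone model of that composition (GNU C statement expressions; stub types and simplified arguments, not the kernel macros):

	#include <stdio.h>

	#define RESTART 1	/* stand-in for the -EINTR "transaction restart" code */

	struct trans { int restarts_left; };

	static void trans_begin(struct trans *t) { (void)t; }

	static int trans_commit(struct trans *t)
	{
		/* pretend the commit hits a lock conflict a couple of times */
		return t->restarts_left-- > 0 ? RESTART : 0;
	}

	static int my_update(struct trans *t) { (void)t; return 0; }

	/* Model of lockrestart_do(): retry the body while it reports a restart. */
	#define lockrestart_do(_trans, _do)			\
	({							\
		int _ret;					\
		do {						\
			trans_begin(_trans);			\
			_ret = (_do);				\
		} while (_ret == RESTART);			\
		_ret;						\
	})

	/* Model of commit_do(): run the update, then commit if it succeeded. */
	#define commit_do(_trans, _do)				\
		lockrestart_do(_trans, (_do) ?: trans_commit(_trans))

	int main(void)
	{
		struct trans t = { .restarts_left = 2 };
		int ret = commit_do(&t, my_update(&t));

		printf("commit_do returned %d\n", ret);	/* 0, after two retried commits */
		return 0;
	}
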
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++------- fs/bcachefs/btree_gc.c | 10 +++++----- fs/bcachefs/btree_update.h | 5 ++--- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/fs.c | 6 +++--- fs/bcachefs/fsck.c | 20 ++++++++++---------- fs/bcachefs/lru.c | 2 +- fs/bcachefs/subvolume.c | 6 +++--- fs/bcachefs/tests.c | 22 +++++++++++----------- 10 files changed, 44 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a511ab9e4e7c..f515e679a90c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -787,7 +787,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); while (1) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_check_alloc_key(&trans, &iter, @@ -808,7 +808,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); while (1) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_check_discard_freespace_key(&trans, &iter)); @@ -825,7 +825,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); while (1) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_check_discard_freespace_key(&trans, &iter)); @@ -930,7 +930,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_check_alloc_to_lru_ref(&trans, &iter)); @@ -1060,7 +1060,7 @@ static void bch2_do_discards_work(struct work_struct *work) continue; } - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOFAIL, bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); @@ -1198,7 +1198,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); while (nr_to_invalidate-- >= 0) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOFAIL, invalidate_one_bucket(&trans, ca, &bucket, @@ -1254,7 +1254,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) if (iter.pos.offset >= ca->mi.nbuckets) break; - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW, bucket_freespace_init(&trans, &iter)); if (ret) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8be1c9f2664d..ebb1ad4b8abe 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -799,7 +799,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } - ret = __bch2_trans_do(trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_mark_key(trans, old, *k, flags)); fsck_err: err: @@ -1435,7 +1435,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) break; 
- ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW, bch2_alloc_write_key(&trans, &iter, metadata_only)); @@ -1589,7 +1589,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) else *bkey_refcount(new) = cpu_to_le64(r->refcount); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); @@ -1702,7 +1702,7 @@ inconsistent: for (i = 0; i < new->v.nr_blocks; i++) stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); kfree(new); } @@ -2009,7 +2009,7 @@ int bch2_gc_gens(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter)); if (ret) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 28f958577006..e9127dbf7e24 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -118,7 +118,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, _ret; \ }) -#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ +#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) @@ -128,8 +128,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, int _ret; \ \ bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ - _do); \ + _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ bch2_trans_exit(&trans); \ \ _ret; \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index eeaea292bd80..ee95a79dc13e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -599,7 +599,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. 
*/ bch2_trans_init(&trans, c, 0, 512); - ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, + ret = commit_do(&trans, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eab01cc09337..99c9d5b14d48 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1853,7 +1853,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - return __bch2_trans_do(trans, NULL, NULL, 0, + return commit_do(trans, NULL, NULL, 0, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b2bc28d0cf05..08268fe1074f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -443,7 +443,7 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, @@ -492,7 +492,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, @@ -614,7 +614,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f1abec95a740..bdf0183d5d21 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -220,7 +220,7 @@ static int write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - int ret = __bch2_trans_do(trans, NULL, NULL, + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __write_inode(trans, inode, snapshot)); @@ -434,7 +434,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { - int ret = __bch2_trans_do(trans, NULL, NULL, + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, __reattach_inode(trans, inode, inode_snapshot)); @@ -940,7 +940,7 @@ static int check_inodes(struct bch_fs *c, bool full) BTREE_ITER_ALL_SNAPSHOTS); do { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_inode(&trans, &iter, &prev, full)); @@ -1002,7 +1002,7 @@ static int check_subvols(struct bch_fs *c) BTREE_ITER_PREFETCH); do { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_subvol(&trans, &iter)); @@ -1306,7 +1306,7 @@ static int check_extents(struct bch_fs *c) BTREE_ITER_ALL_SNAPSHOTS); do { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_extent(&trans, &iter, &w, &s)); @@ -1687,7 +1687,7 @@ static int check_dirents(struct bch_fs *c) BTREE_ITER_ALL_SNAPSHOTS); do { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_dirent(&trans, &iter, &hash_info, @@ -1774,7 +1774,7 @@ static int check_xattrs(struct bch_fs *c) 
BTREE_ITER_ALL_SNAPSHOTS); do { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_xattr(&trans, &iter, &hash_info, @@ -1814,7 +1814,7 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = __bch2_trans_do(trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); @@ -1977,7 +1977,7 @@ static int check_path(struct btree_trans *trans, if (!fsck_err(c, "directory structure loop")) return 0; - ret = __bch2_trans_do(trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, remove_backpointer(trans, inode)); @@ -2366,7 +2366,7 @@ static int fix_reflink_p(struct bch_fs *c) BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->type == KEY_TYPE_reflink_p) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, fix_reflink_p_key(&trans, &iter)); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 5a09b55006ff..94ecb3a39760 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -204,7 +204,7 @@ int bch2_check_lrus(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_check_lru_key(&trans, &iter)); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 60b60de83f3e..d74dc9843028 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -605,7 +605,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) continue; - ret = __bch2_trans_do(trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, @@ -664,7 +664,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) if (ret) continue; - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); if (ret) { bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); @@ -713,7 +713,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) } for (i = 0; i < deleted.nr; i++) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { bch_err(c, "error deleting snapshot %u: %i", diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index fa3712a1478c..bfcb133ff483 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -42,7 +42,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { @@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr) } pr_info("deleting once"); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { @@ -60,7 +60,7 @@ static int 
test_delete(struct bch_fs *c, u64 nr) } pr_info("deleting twice"); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { @@ -88,7 +88,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { @@ -99,7 +99,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_unlock(&trans); bch2_journal_flush_all_pins(&c->journal); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { @@ -552,7 +552,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); if (ret) { bch_err(c, "error in rand_insert: %i", ret); @@ -581,7 +581,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) k[j].k.p.snapshot = U32_MAX; } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: @@ -668,7 +668,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { rand = test_rand(); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { bch_err(c, "update error in rand_mixed: %i", ret); @@ -714,7 +714,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); if (ret) { bch_err(c, "error in rand_delete: %i", ret); @@ -743,7 +743,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter.pos; - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &insert.k_i, 0)); if (ret) { @@ -794,7 +794,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &u.k_i, 0)); if (ret) { -- cgit From 416cc426c0d79c65d85de52d3548a32de06ab3e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Jul 2022 09:11:52 -0400 Subject: bcachefs: Fix snapshot deletion Snapshots being deleted won't in general have a corresponding subvolume: this fixes a spurious fsck error where we'd complain about a snapshot pointing to a missing subvolume - but the subvolume had been deleted, and the snapshot was pending deletion as well. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d74dc9843028..083eb4324583 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -17,7 +17,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", BCH_SNAPSHOT_SUBVOL(s.v), BCH_SNAPSHOT_DELETED(s.v), le32_to_cpu(s.v->parent), @@ -196,18 +196,20 @@ static int bch2_snapshot_check(struct btree_trans *trans, u32 i, id; int ret; - id = le32_to_cpu(s.v->subvol); - ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); - if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", - s.k->p.offset, id); - if (ret) - return ret; + if (!BCH_SNAPSHOT_DELETED(s.v)) { + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); + if (ret) + return ret; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { - bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); - return -EINVAL; + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + return -EINVAL; + } } id = le32_to_cpu(s.v->parent); @@ -386,8 +388,10 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) goto err; bkey_reassemble(&s->k_i, k); - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); if (ret) goto err; @@ -830,7 +834,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) struct bkey_s_c k; struct bkey_s_c_subvolume subvol; struct btree_trans_commit_hook *h; - struct bkey_i *delete; u32 snapid; int ret = 0; @@ -852,14 +855,7 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - ret = PTR_ERR_OR_ZERO(delete); - if (ret) - goto err; - - bkey_init(&delete->k); - delete->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, delete, 0); + ret = bch2_btree_delete_at(trans, &iter, 0); if (ret) goto err; -- cgit From 597dee1cd67d591cd5aeba184fdb69d0da0ceb78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 02:34:48 -0400 Subject: bcachefs: Switch data_update path to snapshot_id_list snapshots_seen is becoming private to fsck, and snapshot_id_list is actually what the data update path needs. 
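Roughly, insert_snapshot_whiteouts() now keeps the snapshots it has already covered in a snapshot_id_list, skips any key whose snapshot already has an ancestor in that list, and records each snapshot after queueing its whiteout. This is a condensed view of the data_update.c hunk below, not a standalone implementation:

  snapshot_id_list s;

  darray_init(&s);
  ...
  if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot))
          continue;
  ...
  ret = snapshot_list_add(c, &s, k.k->p.snapshot);
  ...
  darray_exit(&s);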
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 15 ++++++--------- fs/bcachefs/subvolume.c | 13 +++---------- fs/bcachefs/subvolume.h | 21 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index f7bce89f84ed..6726bd6b9b07 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -22,13 +22,13 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter, update_iter; struct bkey_s_c k; - struct snapshots_seen s; + snapshot_id_list s; int ret; if (!btree_type_has_snapshots(id)) return 0; - snapshots_seen_init(&s); + darray_init(&s); if (!bkey_cmp(old_pos, new_pos)) return 0; @@ -40,7 +40,6 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); while (1) { -next: k = bch2_btree_iter_prev(&iter); ret = bkey_err(k); if (ret) @@ -51,11 +50,9 @@ next: if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { struct bkey_i *update; - u32 *i; - darray_for_each(s.ids, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) - goto next; + if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) + continue; update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -78,13 +75,13 @@ next: if (ret) break; - ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + ret = snapshot_list_add(c, &s, k.k->p.snapshot); if (ret) break; } } bch2_trans_iter_exit(trans, &iter); - darray_exit(&s.ids); + darray_exit(&s); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 083eb4324583..1865c5b3a2c5 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -565,13 +565,6 @@ err: return ret; } -static int snapshot_id_add(snapshot_id_list *s, u32 id) -{ - BUG_ON(snapshot_list_has_id(s, id)); - - return darray_push(s, id); -} - static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, snapshot_id_list *deleted, enum btree_id btree_id) @@ -617,7 +610,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, if (ret) break; } else { - ret = snapshot_id_add(&equiv_seen, equiv); + ret = snapshot_list_add(c, &equiv_seen, equiv); if (ret) break; } @@ -693,7 +686,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_id_add(&deleted, k.k->p.offset); + ret = snapshot_list_add(c, &deleted, k.k->p.offset); if (ret) break; } @@ -921,7 +914,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, mutex_lock(&c->snapshots_unlinked_lock); if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) - ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); mutex_unlock(&c->snapshots_unlinked_lock); if (ret) diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index b1739d29c7d4..28dbd0968f3d 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -93,6 +93,27 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) return false; } +static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; +} + +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret; + + 
BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -- cgit From e4085b70f21f0e4b578a50a9fd7e84f2a055010f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 00:44:09 -0400 Subject: bcachefs: fsck_inode_rm() shouldn't delete subvols We should never see an inode marked as unlinked that's a subvolume root (or a directory) in fsck, but even if we do it's not correct for fsck to delete the subvolume: subvolumes are owned by dirents, and if we find a dangling subvolume (not marked as unlinked) we want fsck to reattach it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index bdf0183d5d21..609ac37ff1e9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -231,6 +231,7 @@ static int write_inode(struct btree_trans *trans, static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) { + struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; @@ -263,7 +264,7 @@ retry: goto err; if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(trans->c, + bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); ret = -EIO; @@ -273,11 +274,8 @@ retry: bch2_inode_unpack(k, &inode_u); /* Subvolume root? */ - if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); - if (ret) - goto err; - } + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; -- cgit From 4ab35c34d5ab258fdd7325315fe5d94699e51eb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 01:10:24 -0400 Subject: bcachefs: Fix subvol/snapshot deleting in recovery fsck doesn't want to run while we're cleaning up deleted snapshots - if that work needs to be done, we want it to have finished before fsck runs, otherwise fsck will get confused when it finds multiple keys in the same snapshot ID equivalence class (i.e. the mechanism that snapshot deletion uses for cleaning up redundant keys). 
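In outline (condensed from the hunks below): a new BCH_FS_HAVE_DELETED_SNAPSHOTS flag records that cleanup is pending, full fsck runs the cleanup up front, and normal recovery kicks it off asynchronously once the flag is seen:

  int bch2_fsck_full(struct bch_fs *c)
  {
          return bch2_fs_check_snapshots(c) ?:
                  bch2_fs_check_subvols(c) ?:
                  bch2_delete_dead_snapshots(c) ?:
                  check_inodes(c, true) ?:
                  ...;
  }

  /* recovery path, after journal replay: */
  if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
          bch2_fs_read_write_early(c);
          bch2_delete_dead_snapshots_async(c);
  }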
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 + fs/bcachefs/fsck.c | 65 ++------------------------ fs/bcachefs/recovery.c | 6 +++ fs/bcachefs/subvolume.c | 119 ++++++++++++++++++++++++++++++++++++++---------- fs/bcachefs/subvolume.h | 7 ++- 5 files changed, 111 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c07ea9af561d..7020eee5de21 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -514,6 +514,8 @@ enum { BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, + BCH_FS_HAVE_DELETED_SNAPSHOTS, + /* errors: */ BCH_FS_ERROR, BCH_FS_TOPOLOGY_ERROR, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 609ac37ff1e9..eda6a6ac3c6e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -953,66 +953,6 @@ static int check_inodes(struct bch_fs *c, bool full) return ret; } -static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct bkey_s_c k; - struct bkey_s_c_subvolume subvol; - int ret; - - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol = bkey_s_c_to_subvolume(k); - - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret && ret != -EINTR) - bch_err(trans->c, "error deleting subvolume %llu: %i", - iter->pos.offset, ret); - if (ret) - return ret; - } - - return 0; -} - -noinline_for_stack -static int check_subvols(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, - POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - /* * Checking for overlapping extents needs to be reimplemented */ @@ -2384,9 +2324,10 @@ static int fix_reflink_p(struct bch_fs *c) */ int bch2_fsck_full(struct bch_fs *c) { - return bch2_fs_snapshots_check(c) ?: + return bch2_fs_check_snapshots(c) ?: + bch2_fs_check_subvols(c) ?: + bch2_delete_dead_snapshots(c) ?: check_inodes(c, true) ?: - check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e6aed8d79bea..b7598e26c683 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1396,6 +1396,12 @@ out: bch2_journal_entries_free(c); } kfree(clean); + + if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { + bch2_fs_read_write_early(c); + bch2_delete_dead_snapshots_async(c); + } + if (ret) bch_err(c, "Error in recovery: %s (%i)", err, ret); else diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 1865c5b3a2c5..91133b3de325 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -9,9 +9,6 @@ /* Snapshot tree: */ -static void bch2_delete_dead_snapshots_work(struct work_struct *); -static void bch2_delete_dead_snapshots(struct bch_fs *); - void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -249,7 +246,7 @@ static int bch2_snapshot_check(struct btree_trans *trans, return 0; } -int bch2_fs_snapshots_check(struct bch_fs *c) +int 
bch2_fs_check_snapshots(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; @@ -299,6 +296,66 @@ err: return ret; } +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret && ret != -EINTR) + bch_err(trans->c, "error deleting subvolume %llu: %i", + iter->pos.offset, ret); + if (ret) + return ret; + } + + return 0; +} + +int bch2_fs_check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, + POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); + + do { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + return ret; +} + void bch2_fs_snapshots_exit(struct bch_fs *c) { genradix_free(&c->snapshots); @@ -309,7 +366,6 @@ int bch2_fs_snapshots_start(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - bool have_deleted = false; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -326,7 +382,7 @@ int bch2_fs_snapshots_start(struct bch_fs *c) } if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) - have_deleted = true; + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); if (ret) @@ -342,16 +398,6 @@ int bch2_fs_snapshots_start(struct bch_fs *c) goto err; err: bch2_trans_exit(&trans); - - if (!ret && have_deleted) { - bch_info(c, "restarting deletion of dead snapshots"); - if (c->opts.fsck) { - bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); - } else { - bch2_delete_dead_snapshots(c); - } - } - return ret; } @@ -598,10 +644,6 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, if (snapshot_list_has_id(deleted, k.k->p.snapshot) || snapshot_list_has_id(&equiv_seen, equiv)) { - if (btree_id == BTREE_ID_inodes && - bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) - continue; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_btree_iter_traverse(&iter) ?: @@ -624,9 +666,8 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, return ret; } -static void bch2_delete_dead_snapshots_work(struct work_struct *work) +int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; @@ -635,6 +676,17 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) u32 i, id, children[2]; int ret = 0; + if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + return 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "error deleleting dead snapshots: error going rw: %i", ret); + return ret; + } + } + bch2_trans_init(&trans, c, 0, 0); /* @@ -718,15 +770,25 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) goto err; } } + + clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, 
&c->flags); err: darray_exit(&deleted); bch2_trans_exit(&trans); + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + bch2_delete_dead_snapshots(c); percpu_ref_put(&c->writes); } -static void bch2_delete_dead_snapshots(struct bch_fs *c) +void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (!percpu_ref_tryget_live(&c->writes)) return; if (!queue_work(system_long_wq, &c->snapshot_delete_work)) @@ -736,7 +798,14 @@ static void bch2_delete_dead_snapshots(struct bch_fs *c) static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { - bch2_delete_dead_snapshots(trans->c); + struct bch_fs *c = trans->c; + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return 0; + + bch2_delete_dead_snapshots_async(c); return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 28dbd0968f3d..7823040c9641 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -114,7 +114,9 @@ static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 i return ret; } -int bch2_fs_snapshots_check(struct bch_fs *); +int bch2_fs_check_snapshots(struct bch_fs *); +int bch2_fs_check_subvols(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); @@ -137,6 +139,9 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_snapshot_node_create(struct btree_trans *, u32, u32 *, u32 *, unsigned); +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, -- cgit From 49124d8a7f3bc0f2bd33ba6cdfa2e9514a74b109 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 02:47:36 -0400 Subject: bcachefs: Improve snapshots_seen This makes the snapshots_seen data structure fsck private and improves it; we now also track the equivalence class for each snapshot id we've seen, which means we can detect when snapshot deletion hasn't finished or run correctly (which will otherwise confuse fsck). 
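The reworked data structure, condensed from the patch below: each entry now remembers both the snapshot ID as it appeared in the key and its equivalence class. Two different IDs mapping to the same equivalence class means snapshot deletion left redundant keys behind, which snapshots_seen_update() reports as an error.

  struct snapshots_seen_entry {
          u32     id;     /* snapshot ID as seen in the key */
          u32     equiv;  /* its equivalence class */
  };

  struct snapshots_seen {
          struct bpos                             pos;
          DARRAY(struct snapshots_seen_entry)     ids;
  };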
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 152 ++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/subvolume.h | 35 ++++------- 2 files changed, 118 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eda6a6ac3c6e..1cb5787f5a6c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -469,19 +469,60 @@ out: return ret; } -static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +struct snapshots_seen_entry { + u32 id; + u32 equiv; +}; + +struct snapshots_seen { + struct bpos pos; + DARRAY(struct snapshots_seen_entry) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + darray_exit(&s->ids); +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) { - pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + struct snapshots_seen_entry *i, n = { + .id = pos.snapshot, + .equiv = bch2_snapshot_equiv(c, pos.snapshot), + }; + int ret; if (bkey_cmp(s->pos, pos)) s->ids.nr = 0; + + pos.snapshot = n.equiv; s->pos = pos; - /* Might get called multiple times due to lock restarts */ - if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) - return 0; + darray_for_each(s->ids, i) + if (i->equiv == n.equiv) { + if (i->id != n.id) { + bch_err(c, "snapshot deletion did not run correctly:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_ids[btree_id], + pos.inode, pos.offset, + i->id, n.id, n.equiv); + return -EINVAL; + } - return snapshots_seen_add(c, s, pos.snapshot); + return 0; + } + + ret = darray_push(&s->ids, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; } /** @@ -494,15 +535,15 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see u32 id, u32 ancestor) { ssize_t i; + u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; BUG_ON(id > ancestor); - - id = snapshot_t(c, id)->equiv; - ancestor = snapshot_t(c, ancestor)->equiv; + BUG_ON(!bch2_snapshot_is_equiv(c, id)); + BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); - BUG_ON(seen->pos.snapshot != ancestor); + BUG_ON(ancestor != seen->pos.snapshot); + BUG_ON(ancestor != top); if (id == ancestor) return true; @@ -511,10 +552,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see return false; for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i] >= id; + i >= 0 && seen->ids.data[i].equiv >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && - bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) return false; return true; @@ -539,8 +580,9 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, : bch2_snapshot_is_ancestor(c, src, dst); } -#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ + (_i)->snapshot <= (_snapshot); _i++) \ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) struct inode_walker_entry { @@ -575,7 +617,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, + .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), })); } @@ -585,10 +627,10 @@ static int __walk_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned i, ancestor_pos; + unsigned i; int ret; - pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); if (pos.inode == w->cur_inum) { w->first_this_inode = false; @@ -621,17 +663,20 @@ found: BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); if (pos.snapshot != w->inodes.data[i].snapshot) { - ancestor_pos = i; + struct inode_walker_entry e = w->inodes.data[i]; + + e.snapshot = pos.snapshot; + e.count = 0; + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", + pos.inode, pos.snapshot, w->inodes.data[i].snapshot); while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); + ret = darray_insert_item(&w->inodes, i, e); if (ret) return ret; - - w->inodes.data[i].snapshot = pos.snapshot; - w->inodes.data[i].count = 0; } return i; @@ -651,17 +696,19 @@ static int __get_visible_inodes(struct btree_trans *trans, for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (k.k->p.offset != inum) break; - if (!bkey_is_inode(k.k)) + if (!ref_visible(c, s, s->pos.snapshot, equiv)) continue; - if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { + if (bkey_is_inode(k.k)) add_inode(c, w, k); - if (k.k->p.snapshot >= s->pos.snapshot) - break; - } + + if (equiv >= s->pos.snapshot) + break; } bch2_trans_iter_exit(trans, &iter); @@ -676,7 +723,7 
@@ static int check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, "key in missing snapshot: %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, @@ -784,6 +831,7 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *prev, + struct snapshots_seen *s, bool full) { struct bch_fs *c = trans->c; @@ -806,6 +854,10 @@ static int check_inode(struct btree_trans *trans, if (ret) return 0; + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + /* * if snapshot id isn't a leaf node, skip it - deletion in * particular is not atomic, so on the internal snapshot nodes @@ -928,8 +980,10 @@ static int check_inodes(struct bch_fs *c, bool full) struct btree_trans trans; struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; + struct snapshots_seen s; int ret; + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, @@ -941,13 +995,14 @@ static int check_inodes(struct bch_fs *c, bool full) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, &prev, full)); + check_inode(&trans, &iter, &prev, &s, full)); if (ret) break; } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + snapshots_seen_exit(&s); if (ret) bch_err(c, "error %i from check_inodes()", ret); return ret; @@ -1096,6 +1151,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; + struct bpos equiv; int ret = 0; peek: k = bch2_btree_iter_peek(iter); @@ -1112,7 +1168,10 @@ peek: goto out; } - ret = snapshots_seen_update(c, s, k.k->p); + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -1147,7 +1206,7 @@ peek: } } #endif - ret = __walk_inode(trans, inode, k.k->p); + ret = __walk_inode(trans, inode, equiv); if (ret < 0) goto err; @@ -1179,8 +1238,8 @@ peek: goto out; } - if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { - for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { + if (!bch2_snapshot_internal_node(c, equiv.snapshot)) { + for_each_visible_inode(c, s, inode, equiv.snapshot, i) { if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && k.k->type != KEY_TYPE_reservation && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, @@ -1189,7 +1248,7 @@ peek: bch2_fs_lazy_rw(c); ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, - k.k->p.snapshot), + equiv.snapshot), POS(k.k->p.inode, U64_MAX), 0, NULL) ?: -EINTR; goto out; @@ -1198,7 +1257,7 @@ peek: } if (bkey_extent_is_allocation(k.k)) - for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + for_each_visible_inode(c, s, inode, equiv.snapshot, i) i->count += k.k->size; #if 0 bch2_bkey_buf_reassemble(&prev, c, k); @@ -1433,6 +1492,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; + struct bpos equiv; int ret = 0; peek: k = bch2_btree_iter_peek(iter); @@ -1449,7 
+1509,10 @@ peek: goto out; } - ret = snapshots_seen_update(c, s, k.k->p); + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -1467,7 +1530,7 @@ peek: goto peek; } - ret = __walk_inode(trans, dir, k.k->p); + ret = __walk_inode(trans, dir, equiv); if (ret < 0) goto err; @@ -1567,7 +1630,8 @@ peek: goto err; if (fsck_err_on(!target->inodes.nr, c, - "dirent points to missing inode:\n%s", + "dirent points to missing inode: (equiv %u)\n%s", + equiv.snapshot, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1585,7 +1649,7 @@ peek: } if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) i->count++; out: @@ -1841,7 +1905,7 @@ static int check_path(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret = 0; - snapshot = snapshot_t(c, snapshot)->equiv; + snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; while (!(inode->bi_inum == BCACHEFS_ROOT_INO && @@ -2126,7 +2190,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, k.k->p); + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); if (ret) break; @@ -2138,7 +2202,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, le64_to_cpu(d.v->d_inum), - d.k->p.snapshot); + bch2_snapshot_equiv(c, d.k->p.snapshot)); break; } } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 7823040c9641..02a636644988 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -27,6 +27,16 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) return snapshot_t(c, id)->parent; } +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == snapshot_t(c, id)->equiv; +} + static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) { struct snapshot_t *s = snapshot_t(c, id); @@ -58,31 +68,6 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances return id == ancestor; } -struct snapshots_seen { - struct bpos pos; - DARRAY(u32) ids; -}; - -static inline void snapshots_seen_exit(struct snapshots_seen *s) -{ - kfree(s->ids.data); - s->ids.data = NULL; -} - -static inline void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); -} - -static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - int ret = darray_push(&s->ids, id); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { u32 *i; -- cgit From 35f1a5034d81416ca820032452bed583f78f1f5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 05:44:10 -0400 Subject: bcachefs: Improve fsck for subvols/snapshots - Bunch of refactoring, and move some code out of bch2_snapshots_start() and into bch2_snapshots_check(), for constency with the rest of fsck - Interior snapshot nodes no longer point to a subvolume; this is so we don't end up with dangling subvol references when deleting or require scanning the full snapshots btree. 
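The rule check_snapshot() now enforces, condensed from the hunks below (a sketch, with the error strings and allocation handling elided): only live leaf nodes must reference a subvolume, and stale subvol fields on interior or deleted nodes are cleared in place.

  should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
                       !BCH_SNAPSHOT_DELETED(s.v);

  if (should_have_subvol) {
          /* leaf node: its subvolume must exist and point back at it */
          ...
  } else if (fsck_err_on(s.v->subvol, c, ...)) {
          /* interior or deleted node: clear the stale subvol reference */
          bkey_reassemble(&u->k_i, s.s_c);
          u->v.subvol = 0;
          ret = bch2_trans_update(trans, iter, &u->k_i, 0);
  }

Newly created interior nodes likewise start out with subvol = 0 and BCH_SNAPSHOT_SUBVOL cleared.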
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 5 +- fs/bcachefs/subvolume.c | 228 +++++++++++++++++++++++++++--------------------- 2 files changed, 131 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1cb5787f5a6c..43575d7e050e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2403,5 +2403,8 @@ int bch2_fsck_full(struct bch_fs *c) int bch2_fsck_walk_inodes_only(struct bch_fs *c) { - return check_inodes(c, false); + return bch2_fs_check_snapshots(c) ?: + bch2_fs_check_subvols(c) ?: + bch2_delete_dead_snapshots(c) ?: + check_inodes(c, false); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 91133b3de325..463b5afd3fc7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -140,89 +140,96 @@ static int snapshot_live(struct btree_trans *trans, u32 id) return !BCH_SNAPSHOT_DELETED(&v); } -static int bch2_snapshots_set_equiv(struct btree_trans *trans) +static int bch2_snapshot_set_equiv(struct btree_trans *trans, + struct bkey_s_c_snapshot snap) { struct bch_fs *c = trans->c; + unsigned i, nr_live = 0, live_idx = 0; + u32 id = snap.k->p.offset, child[2] = { + [0] = le32_to_cpu(snap.v->children[0]), + [1] = le32_to_cpu(snap.v->children[1]) + }; + + for (i = 0; i < 2; i++) { + int ret = snapshot_live(trans, child[i]); + if (ret < 0) + return ret; + + if (ret) + live_idx = i; + nr_live += ret; + } + + snapshot_t(c, id)->equiv = nr_live == 1 + ? snapshot_t(c, child[live_idx])->equiv + : id; + return 0; +} + +static int bch2_snapshots_set_equiv(struct btree_trans *trans) +{ struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_snapshot snap; - unsigned i; int ret; for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { - u32 id = k.k->p.offset, child[2]; - unsigned nr_live = 0, live_idx = 0; - if (k.k->type != KEY_TYPE_snapshot) continue; - snap = bkey_s_c_to_snapshot(k); - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - ret = snapshot_live(trans, child[i]); - if (ret < 0) - goto err; - - if (ret) - live_idx = i; - nr_live += ret; - } - - snapshot_t(c, id)->equiv = nr_live == 1 - ? 
snapshot_t(c, child[live_idx])->equiv - : id; + ret = bch2_snapshot_set_equiv(trans, bkey_s_c_to_snapshot(k)); + if (ret) + break; } -err: bch2_trans_iter_exit(trans, &iter); if (ret) - bch_err(c, "error walking snapshots: %i", ret); + bch_err(trans->c, "error in bch2_snapshots_set_equiv: %i", ret); return ret; } /* fsck: */ -static int bch2_snapshot_check(struct btree_trans *trans, - struct bkey_s_c_snapshot s) +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter) { + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s; struct bch_subvolume subvol; struct bch_snapshot v; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; u32 i, id; - int ret; + int ret = 0; - if (!BCH_SNAPSHOT_DELETED(s.v)) { - id = le32_to_cpu(s.v->subvol); - ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); - if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", - s.k->p.offset, id); - if (ret) - return ret; + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { - bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); - return -EINVAL; - } - } + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); if (id) { ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", - s.k->p.offset, id); + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); if (ret) - return ret; + goto err; if (le32_to_cpu(v.children[0]) != s.k->p.offset && le32_to_cpu(v.children[1]) != s.k->p.offset) { - bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + bch_err(c, "snapshot parent %u missing pointer to child %llu", id, s.k->p.offset); - return -EINVAL; + ret = -EINVAL; + goto err; } } @@ -231,67 +238,86 @@ static int bch2_snapshot_check(struct btree_trans *trans, ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); if (ret == -ENOENT) - bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + bch_err(c, "snapshot node %llu has nonexistent child %u", s.k->p.offset, id); if (ret) - return ret; + goto err; if (le32_to_cpu(v.parent) != s.k->p.offset) { - bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", id, le32_to_cpu(v.parent), s.k->p.offset); - return -EINVAL; + ret = -EINVAL; + goto err; } } - return 0; + should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && + !BCH_SNAPSHOT_DELETED(s.v); + + if (should_have_subvol) { + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + if (ret == -ENOENT) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); + + ret = PTR_ERR_OR_ZERO(u); 
+ if (ret) + goto err; + + bkey_reassemble(&u->k_i, s.s_c); + u->v.subvol = 0; + ret = bch2_trans_update(trans, iter, &u->k_i, 0); + if (ret) + goto err; + } + } + + if (BCH_SNAPSHOT_DELETED(s.v)) + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: +fsck_err: + printbuf_exit(&buf); + return ret; } int bch2_fs_check_snapshots(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - struct bch_snapshot s; - unsigned id; int ret; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_PREFETCH); - ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); + do { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter)); if (ret) break; - } + } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); - if (ret) { + if (ret) bch_err(c, "error %i checking snapshots", ret); - goto err; - } - for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; -again_2: - id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); - ret = snapshot_lookup(&trans, id, &s); - - if (ret == -EINTR) { - k = bch2_btree_iter_peek(&iter); - goto again_2; - } else if (ret == -ENOENT) - bch_err(c, "subvolume %llu points to nonexistent snapshot %u", - k.k->p.offset, id); - else if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); -err: bch2_trans_exit(&trans); return ret; } @@ -301,6 +327,8 @@ static int check_subvol(struct btree_trans *trans, { struct bkey_s_c k; struct bkey_s_c_subvolume subvol; + struct bch_snapshot snapshot; + unsigned snapid; int ret; k = bch2_btree_iter_peek(iter); @@ -315,6 +343,14 @@ static int check_subvol(struct btree_trans *trans, return 0; subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + ret = snapshot_lookup(trans, snapid, &snapshot); + + if (ret == -ENOENT) + bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, snapid); + if (ret) + return ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ret = bch2_subvolume_delete(trans, iter->pos.offset); @@ -334,12 +370,10 @@ int bch2_fs_check_subvols(struct bch_fs *c) struct btree_iter iter; int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_init(&trans, c, 0, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, - POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + POS_MIN, BTREE_ITER_PREFETCH); do { ret = commit_do(&trans, NULL, NULL, @@ -375,29 +409,20 @@ int bch2_fs_snapshots_start(struct bch_fs *c) if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) break; - if (k.k->type != KEY_TYPE_snapshot) { - bch_err(c, "found wrong key type %u in snapshot node table", - k.k->type); + if (k.k->type != KEY_TYPE_snapshot) continue; - } - - if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); + ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, bkey_s_c_to_snapshot(k)); if (ret) break; } bch2_trans_iter_exit(&trans, &iter); - if (ret) - goto err; + bch2_trans_exit(&trans); - ret = bch2_snapshots_set_equiv(&trans); if (ret) - goto err; -err: - bch2_trans_exit(&trans); + bch_err(c, "error starting snapshots: %i", ret); return ret; } @@ -601,6 +626,7 @@ int 
bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[0] = cpu_to_le32(new_snapids[0]); n->v.children[1] = cpu_to_le32(new_snapids[1]); + n->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); ret = bch2_trans_update(trans, &iter, &n->k_i, 0); if (ret) -- cgit From c7a09cb1b13995da938f4e1df52adeba44515d7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Jul 2022 23:21:15 -0400 Subject: bcachefs: When fsck finds redundant snapshot keys, trigger snapshots cleanup Fsck now checks for keys in different snapshot IDs that are now redundant due to other snapshots being deleted - it needs to for its own algorithms to not get confused. When it detects this it should re-run the post snapshot deletion cleanup - this patch does that. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fsck.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index f7d12915c1cc..0581f3c7a0d8 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -7,6 +7,7 @@ enum { OPEN_BUCKETS_EMPTY = 2048, FREELIST_EMPTY, /* Allocator thread not keeping up */ INSUFFICIENT_DEVICES, + NEED_SNAPSHOT_CLEANUP, }; #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 43575d7e050e..b401c0913bdc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -512,7 +512,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_ids[btree_id], pos.inode, pos.offset, i->id, n.id, n.equiv); - return -EINVAL; + return -NEED_SNAPSHOT_CLEANUP; } return 0; @@ -2388,7 +2388,9 @@ static int fix_reflink_p(struct bch_fs *c) */ int bch2_fsck_full(struct bch_fs *c) { - return bch2_fs_check_snapshots(c) ?: + int ret; +again: + ret = bch2_fs_check_snapshots(c) ?: bch2_fs_check_subvols(c) ?: bch2_delete_dead_snapshots(c) ?: check_inodes(c, true) ?: @@ -2399,6 +2401,13 @@ int bch2_fsck_full(struct bch_fs *c) check_directory_structure(c) ?: check_nlinks(c) ?: fix_reflink_p(c); + + if (ret == -NEED_SNAPSHOT_CLEANUP) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + goto again; + } + + return ret; } int bch2_fsck_walk_inodes_only(struct bch_fs *c) -- cgit From 0d06b4eca687b3a6a07b62fd4ca83d635103c77f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Jul 2022 23:31:28 -0400 Subject: bcachefs: Fix repair for extent past end of inode When we find an extent past an inode's i_size, we need to do the deletion in the inode's snapshot (which will emit a whiteout if necessary); and we also need to note that we now have an a key at that position and snapshot, so that we don't go into an infinite loop. Also, switch to walking inodes in reverse older, oldest snapshot to newest, so that we emit the fewest whiteouts possible. 
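The repair path, condensed from the hunk below: the inode list is walked oldest snapshot first, the offending extent is deleted at the inode's own snapshot (so a whiteout is emitted when needed), and that snapshot is added to snapshots_seen so the key we just created doesn't retrigger the check. All names come from the patch; intervening checks are elided:

  for (i = inode->inodes.data + inode->inodes.nr - 1;
       i >= inode->inodes.data;
       --i) {
          ...
          bch2_trans_copy_iter(&iter2, iter);
          bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
          ret =   bch2_btree_iter_traverse(&iter2) ?:
                  bch2_btree_delete_at(trans, &iter2,
                          BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
          bch2_trans_iter_exit(trans, &iter2);
          ...
          if (i->snapshot != equiv.snapshot)
                  ret = snapshots_seen_add(c, s, i->snapshot);
  }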
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 68 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b401c0913bdc..5cec55edb483 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -489,6 +489,28 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) memset(s, 0, sizeof(*s)); } +static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct snapshots_seen_entry *i, n = { id, id }; + int ret; + + darray_for_each(s->ids, i) { + if (n.equiv < i->equiv) + break; + + if (i->equiv == n.equiv) { + bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); + return -EINVAL; + } + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { @@ -1238,20 +1260,38 @@ peek: goto out; } - if (!bch2_snapshot_internal_node(c, equiv.snapshot)) { - for_each_visible_inode(c, s, inode, equiv.snapshot, i) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { - bch2_fs_lazy_rw(c); - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, - equiv.snapshot), - POS(k.k->p.inode, U64_MAX), - 0, NULL) ?: -EINTR; - goto out; + /* + * Check inodes in reverse order, from oldest snapshots to newest, so + * that we emit the fewest number of whiteouts necessary: + */ + for (i = inode->inodes.data + inode->inodes.nr - 1; + i >= inode->inodes.data; + --i) { + if (i->snapshot > equiv.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + continue; + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); + if (ret) + goto err; + + if (i->snapshot != equiv.snapshot) { + ret = snapshots_seen_add(c, s, i->snapshot); + if (ret) + goto err; } } } -- cgit From a1783320d46e878ddf5d2bb3380c181d515a5ff3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Jul 2022 20:51:09 -0400 Subject: bcachefs: for_each_btree_key2() This introduces two new macros for iterating through the btree, with transaction restart handling - for_each_btree_key2() - for_each_btree_key_commit() Every iteration is now in an implicit transaction, and - as with lockrestart_do() and commit_do() - returning -EINTR will cause the transaction to be restarted, at the same key. This patch converts a bunch of code that was open coding this to these new macros, saving a substantial amount of code. 
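A typical conversion looks like this (taken from the check_inodes() hunk below): the explicit iterator setup, commit_do() call and advance loop collapse into a single for_each_btree_key_commit() invocation, and a -EINTR from either the body or the commit restarts the iteration at the same key:

  ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
                  POS_MIN,
                  BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                  k,
                  NULL, NULL,
                  BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
                  check_inode(&trans, &iter, k, &prev, &s, full));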
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +--- fs/bcachefs/btree_gc.c | 112 +++++++++------------------ fs/bcachefs/btree_iter.h | 33 ++++++++ fs/bcachefs/fsck.c | 138 +++++++++------------------------ fs/bcachefs/quota.c | 29 +++---- fs/bcachefs/subvolume.c | 169 ++++++++++++++++------------------------- 6 files changed, 191 insertions(+), 304 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f515e679a90c..a01e79aba480 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -928,16 +928,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter)); bch2_trans_exit(&trans); return ret < 0 ? ret : 0; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ebb1ad4b8abe..f72a5ceb130b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1845,10 +1845,15 @@ out: return ret; } -static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) +static int gc_btree_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; + struct bkey_i *u; + int ret; percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { @@ -1856,7 +1861,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); - return true; + goto update; } } @@ -1868,77 +1873,27 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) *gen = ptr->gen; } percpu_up_read(&c->mark_lock); + return 0; +update: + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - return false; -} - -/* - * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree - * node pointers currently never have cached pointers that can become stale: - */ -static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf sk; - int ret = 0, commit_err = 0; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(trans), - k = bch2_btree_iter_peek(&iter)).k) { - ret = bkey_err(k); - - if (ret == -EINTR) - continue; - if (ret) - break; - - c->gc_gens_pos = iter.pos; - - if (gc_btree_gens_key(c, k) && !commit_err) { - bch2_bkey_buf_reassemble(&sk, c, k); - bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - - commit_err = - bch2_trans_update(trans, &iter, sk.k, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); - if (commit_err == -EINTR) { - commit_err = 0; - continue; - } - } - - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); - - bch2_bkey_buf_exit(&sk, c); + bkey_reassemble(u, k); - return ret; + 
bch2_extent_normalize(c, bkey_i_to_s(u)); + return bch2_trans_update(trans, iter, u, 0); } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); - struct bkey_s_c k; struct bch_alloc_v4 a; struct bkey_i_alloc_v4 *a_mut; int ret; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - bch2_alloc_to_v4(k, &a); if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) @@ -1998,26 +1953,35 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { + struct btree_iter iter; + struct bkey_s_c k; + c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(&trans, i); + ret = for_each_btree_key_commit(&trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(&trans, &iter, k)); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; } } - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(&trans, &iter)); - if (ret) { - bch_err(c, "error writing oldest_gen: %i", ret); - break; - } + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter, k)); + if (ret) { + bch_err(c, "error writing oldest_gen: %i", ret); + goto err; } - bch2_trans_iter_exit(&trans, &iter); c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 39f241e25881..9e3a5f94831c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -393,6 +393,39 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } +#define for_each_btree_key2(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + do { \ + bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ + if (!(_k).k) { \ + _ret = 0; \ + break; \ + } \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (!_ret) \ + bch2_btree_iter_advance(&(_iter)); \ + } while (_ret == 0 || _ret == -EINTR); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5cec55edb483..6165878c2ddc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -852,24 +852,16 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_inode_unpacked *prev, struct snapshots_seen *s, bool full) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct bch_inode_unpacked u; bool do_update = false; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - 
return 0; - - ret = bkey_err(k); - if (ret) - return ret; - ret = check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; @@ -984,7 +976,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = write_inode(trans, &u, iter->pos.snapshot); + ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -1003,25 +995,19 @@ static int check_inodes(struct bch_fs *c, bool full) struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; + struct bkey_s_c k; int ret; snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, &prev, &s, full)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, k, &prev, &s, full)); bch2_trans_exit(&trans); snapshots_seen_exit(&s); @@ -1166,23 +1152,15 @@ fsck_err: } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; int ret = 0; -peek: - k = bch2_btree_iter_peek(iter); - if (!k.k) - goto out; - - ret = bkey_err(k); - if (ret) - goto err; ret = check_key_has_snapshot(trans, iter, k); if (ret) { @@ -1212,7 +1190,7 @@ peek: * it shouldn't be but we need to fix the new i_sectors check * code and delete the old bch2_count_inode_sectors() first */ - goto peek; + return -EINTR; } #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -1324,6 +1302,7 @@ static int check_extents(struct bch_fs *c) struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; #if 0 @@ -1336,21 +1315,12 @@ static int check_extents(struct bch_fs *c) bch_verbose(c, "checking extents"); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, &w, &s)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_extent(&trans, &iter, k, &w, &s)); #if 0 bch2_bkey_buf_exit(&prev, c); #endif @@ -1522,26 +1492,18 @@ fsck_err: } static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c k; struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; int ret = 0; -peek: - k = bch2_btree_iter_peek(iter); - if (!k.k) - goto out; - - ret = bkey_err(k); - 
if (ret) - goto err; ret = check_key_has_snapshot(trans, iter, k); if (ret) { @@ -1567,7 +1529,7 @@ peek: if (!iter->path->should_be_locked) { /* hack: see check_extent() */ - goto peek; + return -EINTR; } ret = __walk_inode(trans, dir, equiv); @@ -1715,6 +1677,7 @@ static int check_dirents(struct bch_fs *c) struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking dirents"); @@ -1722,22 +1685,13 @@ static int check_dirents(struct bch_fs *c) snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_dirent(&trans, &iter, &hash_info, - &dir, &target, &s)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); bch2_trans_exit(&trans); snapshots_seen_exit(&s); @@ -1750,21 +1704,13 @@ static int check_dirents(struct bch_fs *c) } static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_hash_info *hash_info, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - ret = check_key_has_snapshot(trans, iter, k); if (ret) return ret; @@ -1803,28 +1749,20 @@ static int check_xattrs(struct bch_fs *c) struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret = 0; bch_verbose(c, "checking xattrs"); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_xattr(&trans, &iter, &hash_info, - &inode)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, k, &hash_info, &inode)); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index d764dc7abfe8..e35a6d1f31e9 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -455,22 +455,14 @@ static void bch2_sb_quota_read(struct bch_fs *c) } static int bch2_fs_quota_read_inode(struct btree_trans *trans, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct bch_subvolume subvolume; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek(iter); - ret = bkey_err(k); - if (ret) - return ret; - - if (!k.k) - return 1; - ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); if (ret) return ret; @@ -503,6 +495,7 @@ int bch2_fs_quota_read(struct bch_fs *c) struct bch_memquota_type *q; struct btree_trans trans; 
struct btree_iter iter; + struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); @@ -517,18 +510,18 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + ret = for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - do { - ret = lockrestart_do(&trans, - bch2_fs_quota_read_inode(&trans, &iter)); - } while (!ret); - bch2_trans_iter_exit(&trans, &iter); + BTREE_ITER_ALL_SNAPSHOTS, + k, + bch2_fs_quota_read_inode(&trans, &iter, k)); + if (ret) + bch_err(c, "err reading inodes in quota init: %i", ret); bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + return ret; } /* Enable/disable/delete quotas for an entire filesystem: */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 463b5afd3fc7..1a212bac2a04 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -131,7 +131,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id) if (!id) return 0; - ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) bch_err(trans->c, "snapshot node %u not found", id); if (ret) @@ -140,15 +140,20 @@ static int snapshot_live(struct btree_trans *trans, u32 id) return !BCH_SNAPSHOT_DELETED(&v); } -static int bch2_snapshot_set_equiv(struct btree_trans *trans, - struct bkey_s_c_snapshot snap) +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; unsigned i, nr_live = 0, live_idx = 0; - u32 id = snap.k->p.offset, child[2] = { - [0] = le32_to_cpu(snap.v->children[0]), - [1] = le32_to_cpu(snap.v->children[1]) - }; + struct bkey_s_c_snapshot snap; + u32 id = k.k->p.offset, child[2]; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); for (i = 0; i < 2; i++) { int ret = snapshot_live(trans, child[i]); @@ -166,58 +171,27 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, return 0; } -static int bch2_snapshots_set_equiv(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - ret = bch2_snapshot_set_equiv(trans, bkey_s_c_to_snapshot(k)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - bch_err(trans->c, "error in bch2_snapshots_set_equiv: %i", ret); - - return ret; -} - /* fsck: */ static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_snapshot s; struct bch_subvolume subvol; struct bch_snapshot v; - struct bkey_s_c k; struct printbuf buf = PRINTBUF; bool should_have_subvol; u32 i, id; int ret = 0; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - if (k.k->type != KEY_TYPE_snapshot) return 0; s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); if (id) { - ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) bch_err(c, "snapshot with nonexistent parent:\n %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); @@ -236,7 +210,7 @@ static int check_snapshot(struct btree_trans *trans, for (i = 0; i < 2 && 
s.v->children[i]; i++) { id = le32_to_cpu(s.v->children[i]); - ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + ret = snapshot_lookup(trans, id, &v); if (ret == -ENOENT) bch_err(c, "snapshot node %llu has nonexistent child %u", s.k->p.offset, id); @@ -256,7 +230,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.v->subvol); - ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); if (ret == -ENOENT) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); @@ -298,22 +272,16 @@ int bch2_fs_check_snapshots(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_PREFETCH); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter, k)); if (ret) bch_err(c, "error %i checking snapshots", ret); @@ -404,20 +372,10 @@ int bch2_fs_snapshots_start(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) - break; - - if (k.k->type != KEY_TYPE_snapshot) - continue; - - ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, bkey_s_c_to_snapshot(k)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k)); bch2_trans_exit(&trans); @@ -692,6 +650,34 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, return ret; } +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(trans, children[0]) ?: + snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + + if (!ret) + return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return 0; +} + int bch2_delete_dead_snapshots(struct bch_fs *c) { struct btree_trans trans; @@ -699,7 +685,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct bkey_s_c k; struct bkey_s_c_snapshot snap; snapshot_id_list deleted = { 0 }; - u32 i, id, children[2]; + u32 i, id; int ret = 0; if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) @@ -719,43 +705,22 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - snap = bkey_s_c_to_snapshot(k); 
- if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - continue; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = snapshot_live(&trans, children[0]) ?: - snapshot_live(&trans, children[1]); - if (ret < 0) - break; - if (ret) - continue; - - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); - if (ret) { - bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(&trans, &iter, k)); if (ret) { - bch_err(c, "error walking snapshots: %i", ret); + bch_err(c, "error deleting redundant snapshots: %i", ret); goto err; } - ret = bch2_snapshots_set_equiv(&trans); - if (ret) + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(&trans, k)); + if (ret) { + bch_err(c, "error in bch2_snapshots_set_equiv: %i", ret); goto err; + } for_each_btree_key(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { -- cgit From 43de721a33b214b253c07672c4c6ba7548f2d3e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Jul 2022 06:03:21 -0400 Subject: bcachefs: Unlock in bch2_trans_begin() if we've held locks more than 10us We try to ensure we never hold btree locks for too long - bcachefs tries to be soft realtime. This adds a check when restarting a transaction, where a transaction restart is cheap - if we've been holding locks for too long, drop and retake them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 +++++++++- fs/bcachefs/btree_types.h | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b90aff2ad775..68d9d8ee6f97 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3224,12 +3224,19 @@ void bch2_trans_begin(struct btree_trans *trans) path->preserve = false; } - bch2_trans_cond_resched(trans); + if (!trans->restarted && + (need_resched() || + ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + bch2_trans_unlock(trans); + cond_resched(); + bch2_trans_relock(trans); + } if (trans->restarted) bch2_btree_path_traverse_all(trans); trans->restarted = false; + trans->last_begin_time = ktime_get_ns(); } static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) @@ -3259,6 +3266,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; + trans->last_begin_time = ktime_get_ns(); trans->task = current; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 2eb8cc11aec4..b184ec512499 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -382,10 +382,13 @@ struct btree_trans_commit_hook { #define BTREE_TRANS_MEM_MAX (1U << 16) +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 + struct btree_trans { struct bch_fs *c; const char *fn; struct list_head list; + u64 last_begin_time; struct btree *locking; unsigned locking_path_idx; struct bpos locking_pos; -- cgit From 8bfe14e86a00a44eb7bfbeff1d7368e44c93bb7c Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 14 Jul 2022 18:58:23 +1200 Subject: bcachefs: lock time stats prep work. 
We need the caller name and a place to store our results, btree_trans provides this. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 58 +++++++++++++++++++------------------ fs/bcachefs/btree_iter.h | 7 +++-- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.h | 8 +++-- fs/bcachefs/btree_update_interior.c | 4 +-- 6 files changed, 43 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 00eb69dd16e9..1f80f08a69b2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -883,7 +883,7 @@ lock_node: * was removed - and we'll bail out: */ if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(path, level + 1); + btree_node_unlock(trans, path, level + 1); if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 68d9d8ee6f97..d708bf32d408 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -224,7 +224,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(path, level); + btree_node_unlock(trans, path, level); goto success; } @@ -259,7 +259,7 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, * the node that we failed to relock: */ if (fail_idx >= 0) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); do { @@ -417,7 +417,7 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, l < path->locks_want && btree_path_node(path, l); l++) { if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, path->btree_id, &path->pos); @@ -496,7 +496,8 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, return false; } -void __bch2_btree_path_downgrade(struct btree_path *path, +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { unsigned l; @@ -508,7 +509,7 @@ void __bch2_btree_path_downgrade(struct btree_path *path, while (path->nodes_locked && (l = __fls(path->nodes_locked)) >= path->locks_want) { if (l > path->level) { - btree_node_unlock(path, l); + btree_node_unlock(trans, path, l); } else { if (btree_node_intent_locked(path, l)) { six_lock_downgrade(&path->l[l].b->c.lock); @@ -526,7 +527,7 @@ void bch2_trans_downgrade(struct btree_trans *trans) struct btree_path *path; trans_for_each_path(trans, path) - bch2_btree_path_downgrade(path); + bch2_btree_path_downgrade(trans, path); } /* Btree transaction locking: */ @@ -554,7 +555,7 @@ void bch2_trans_unlock(struct btree_trans *trans) struct btree_path *path; trans_for_each_path(trans, path) - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); } /* Btree iterator: */ @@ -575,7 +576,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, bkey_cmp(ck->key.pos, path->pos)); if (!locked) - btree_node_unlock(path, 0); + btree_node_unlock(trans, path, 0); } static void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -632,7 +633,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, } if 
(!locked) - btree_node_unlock(path, level); + btree_node_unlock(trans, path, level); return; err: bch2_bpos_to_text(&buf1, path->pos); @@ -1106,7 +1107,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans, } if (!parent_locked) - btree_node_unlock(path, plevel); + btree_node_unlock(trans, path, plevel); } static inline void __btree_path_level_init(struct btree_path *path, @@ -1158,7 +1159,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) if (path->nodes_locked && t != BTREE_NODE_UNLOCKED) { - btree_node_unlock(path, b->c.level); + btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t); } @@ -1277,7 +1278,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1312,7 +1313,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1337,7 +1338,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(path, plevel); + btree_node_unlock(trans, path, plevel); } static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, @@ -1410,7 +1411,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(path, level + 1); + btree_node_unlock(trans, path, level + 1); path->level = level; bch2_btree_path_verify_locks(path); @@ -1519,9 +1520,10 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } -static void btree_path_set_level_up(struct btree_path *path) +static void btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path) { - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; path->level++; btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -1537,7 +1539,7 @@ static void btree_path_set_level_down(struct btree_trans *trans, for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); + btree_node_unlock(trans, path, l); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_path_verify(trans, path); @@ -1551,7 +1553,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, while (btree_path_node(path, l) && !btree_path_good_node(trans, path, l, check_pos)) { - btree_node_unlock(path, l); + btree_node_unlock(trans, path, l); path->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } @@ -1562,7 +1564,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, i++) if (!bch2_btree_node_relock(trans, path, i)) while (l <= i) { - btree_node_unlock(path, l); + btree_node_unlock(trans, path, l); path->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } @@ -1631,7 +1633,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, goto out; } - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); path->level = depth_want; if (ret == -EIO) @@ -1717,7 +1719,7 @@ __bch2_btree_path_set_pos(struct btree_trans 
*trans, trans->paths_sorted = false; if (unlikely(path->cached)) { - btree_node_unlock(path, 0); + btree_node_unlock(trans, path, 0); path->l[0].b = BTREE_ITER_NO_NODE_CACHED; btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); goto out; @@ -1740,7 +1742,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (l != path->level) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); } out: bch2_btree_path_verify(trans, path); @@ -1781,7 +1783,7 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); trans->paths_allocated &= ~(1ULL << path->idx); } @@ -2122,12 +2124,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? */ if (!btree_path_node(path, path->level + 1)) { - btree_path_set_level_up(path); + btree_path_set_level_up(trans, path); return NULL; } if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -2141,7 +2143,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = btree_path_node(path, path->level + 1); if (!bpos_cmp(iter->pos, b->key.k.p)) { - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; path->level++; } else { @@ -2582,7 +2584,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) (iter->advanced && !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { iter->pos = path_l(iter->path)->b->key.k.p; - btree_path_set_level_up(iter->path); + btree_path_set_level_up(trans, iter->path); iter->advanced = false; continue; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9e3a5f94831c..209b89dd1d2b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -228,14 +228,15 @@ static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, : path->uptodate == BTREE_ITER_UPTODATE; } -void __bch2_btree_path_downgrade(struct btree_path *, unsigned); +void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); -static inline void bch2_btree_path_downgrade(struct btree_path *path) +static inline void bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path) { unsigned new_locks_want = path->level + !!path->intent_ref; if (path->locks_want > new_locks_want) - __bch2_btree_path_downgrade(path, new_locks_want); + __bch2_btree_path_downgrade(trans, path, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 97c72f3917ec..baf1f25b91ca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -431,7 +431,7 @@ fill: return ret; err: if (ret != -EINTR) { - btree_node_unlock(path, 0); + btree_node_unlock(trans, path, 0); path->l[0].b = BTREE_ITER_NO_NODE_ERROR; } return ret; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4a87fa625d7a..230f376993ae 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -94,7 +94,8 @@ 
btree_lock_want(struct btree_path *path, int level) return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_path *path, unsigned level) +static inline void btree_node_unlock(struct btree_trans *trans, + struct btree_path *path, unsigned level) { int lock_type = btree_node_locked_type(path, level); @@ -105,12 +106,13 @@ static inline void btree_node_unlock(struct btree_path *path, unsigned level) mark_btree_node_unlocked(path, level); } -static inline void __bch2_btree_path_unlock(struct btree_path *path) +static inline void __bch2_btree_path_unlock(struct btree_trans *trans, + struct btree_path *path) { btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) - btree_node_unlock(path, __ffs(path->nodes_locked)); + btree_node_unlock(trans, path, __ffs(path->nodes_locked)); } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ee95a79dc13e..9f9ab85ec6b8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1844,7 +1844,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_done(as); out: - bch2_btree_path_downgrade(iter->path); + bch2_btree_path_downgrade(trans, iter->path); return ret; } @@ -1956,7 +1956,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(iter2.path->level != b->c.level); BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - btree_node_unlock(iter2.path, iter2.path->level); + btree_node_unlock(trans, iter2.path, iter2.path->level); path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; iter2.path->level++; btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); -- cgit From 25055c690f9ab3d4fb72b8a07323bf952c2682dc Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 14 Jul 2022 20:31:36 +1200 Subject: bcachefs: bch2_time_stats_to_text now indents properly Printbufs indentation feature doesn't yet work with '\n' and '\t'. So we've replaced all instances of '\n' with prt_newline. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a2e42ae77371..cb07ef2ceb59 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -378,31 +378,37 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 q, last_q = 0; int i; - prt_printf(out, "count:\t\t%llu\n", + prt_printf(out, "count:\t\t%llu", stats->count); - prt_printf(out, "rate:\t\t%llu/sec\n", + prt_newline(out); + prt_printf(out, "rate:\t\t%llu/sec", freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + prt_newline(out); prt_printf(out, "frequency:\t"); bch2_pr_time_units(out, freq); - prt_printf(out, "\navg duration:\t"); + prt_newline(out); + prt_printf(out, "avg duration:\t"); bch2_pr_time_units(out, stats->average_duration); - prt_printf(out, "\nmax duration:\t"); + prt_newline(out); + prt_printf(out, "max duration:\t"); bch2_pr_time_units(out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - prt_printf(out, "\nquantiles (%s):\t", u->name); + prt_newline(out); + prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu%s", - div_u64(q, u->nsecs), - is_last ? 
"\n" : " "); + prt_printf(out, "%llu ", + div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); last_q = q; } } -- cgit From c807ca95a6e20bedbbb84287bc7087c2b2b775de Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 14 Jul 2022 20:33:09 +1200 Subject: bcachefs: added lock held time stats We now record the length of time btree locks are held and expose this in debugfs. Enabled via CONFIG_BCACHEFS_LOCK_TIME_STATS. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 6 ++++ fs/bcachefs/bcachefs.h | 9 ++++++ fs/bcachefs/btree_iter.c | 19 ++++++++--- fs/bcachefs/btree_key_cache.c | 4 +-- fs/bcachefs/btree_locking.h | 44 ++++++++++++++++++++----- fs/bcachefs/btree_types.h | 4 +++ fs/bcachefs/debug.c | 74 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 145 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index d2eb65e9032b..7ae85900e5b4 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -59,6 +59,12 @@ config BCACHEFS_TESTS help Include some unit and performance tests for the core btree code +config BCACHEFS_LOCK_TIME_STATS + bool "bcachefs lock time statistics" + depends on BCACHEFS_FS + help + Expose statistics for how long we held a lock in debugfs + config BCACHEFS_NO_LATENCY_ACCT bool "disable latency accounting and time stats" depends on BCACHEFS_FS diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7020eee5de21..9cd6f840b71a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -319,6 +319,8 @@ BCH_DEBUG_PARAMS_DEBUG() #undef BCH_DEBUG_PARAM #endif +#define BCH_LOCK_TIME_NR 128 + #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -527,6 +529,11 @@ struct btree_debug { unsigned id; }; +struct lock_held_stats { + struct bch2_time_stats times[BCH_LOCK_TIME_NR]; + const char *names[BCH_LOCK_TIME_NR]; +}; + struct bch_fs_pcpu { u64 sectors_available; }; @@ -920,6 +927,8 @@ mempool_t bio_bounce_pages; bool promote_whole_extents; struct bch2_time_stats times[BCH_TIME_STAT_NR]; + + struct lock_held_stats lock_held_stats; }; static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d708bf32d408..30958cbb9532 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -177,7 +177,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(path, level, want); + mark_btree_node_locked(trans, path, level, want); return true; } fail: @@ -230,7 +230,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, return false; success: - mark_btree_node_intent_locked(path, level); + mark_btree_node_intent_locked(trans, path, level); return true; } @@ -1161,7 +1161,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); } btree_path_level_init(trans, path, b); @@ -1238,7 +1238,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(path, path->level, 
lock_type); + mark_btree_node_locked(trans, path, path->level, lock_type); btree_path_level_init(trans, path, b); return 0; } @@ -1402,7 +1402,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(path, level, lock_type); + mark_btree_node_locked(trans, path, level, lock_type); btree_path_level_init(trans, path, b); if (likely(!trans->journal_replay_not_finished && @@ -3273,6 +3273,15 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + while (c->lock_held_stats.names[trans->lock_name_idx] != fn + && c->lock_held_stats.names[trans->lock_name_idx] != 0) + trans->lock_name_idx++; + + if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) + pr_warn_once("lock_times array not big enough!"); + else + c->lock_held_stats.names[trans->lock_name_idx] = fn; + bch2_trans_alloc_paths(trans, c); if (expected_mem_bytes) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index baf1f25b91ca..e5a29240bbcc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -382,7 +382,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -403,7 +403,7 @@ retry: goto retry; } - mark_btree_node_locked(path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 230f376993ae..b8708466c4e3 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -57,7 +57,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_path *path, +static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, + struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -69,10 +70,22 @@ static inline void mark_btree_node_locked(struct btree_path *path, path->nodes_intent_locked |= type << level; } -static inline void mark_btree_node_intent_locked(struct btree_path *path, +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum six_lock_type type) +{ + mark_btree_node_locked_noreset(trans, path, level, type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[level].lock_taken_time = ktime_get_ns(); +#endif +} + +static inline void mark_btree_node_intent_locked(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - mark_btree_node_locked(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -101,8 +114,18 @@ static inline void btree_node_unlock(struct btree_trans *trans, EBUG_ON(level >= BTREE_MAX_DEPTH); - if (lock_type != BTREE_NODE_UNLOCKED) + if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { + struct bch_fs *c = trans->c; + + __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], + path->l[level].lock_taken_time, + ktime_get_ns()); + } +#endif + } mark_btree_node_unlocked(path, 
level); } @@ -196,10 +219,17 @@ static inline bool btree_node_lock(struct btree_trans *trans, EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - return likely(six_trylock_type(&b->c.lock, type)) || + if (likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(trans, path, b, pos, level, type, - should_sleep_fn, p, ip); + should_sleep_fn, p, ip)) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[b->c.level].lock_taken_time = ktime_get_ns(); +#endif + return true; + } else { + return false; + } } bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); @@ -252,5 +282,3 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, } #endif /* _BCACHEFS_BTREE_LOCKING_H */ - - diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b184ec512499..8cf3ef749020 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -251,6 +251,9 @@ struct btree_path { struct btree *b; struct btree_node_iter iter; u32 lock_seq; +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + u64 lock_taken_time; +#endif } l[BTREE_MAX_DEPTH]; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; @@ -436,6 +439,7 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; + int lock_name_idx; }; #define BTREE_FLAGS() \ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 0f25b75e3de7..45f5229f20eb 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -638,6 +638,75 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static int lock_held_stats_open(struct inode *inode, struct file *file) +{ + struct bch_fs *c = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + + if (!i) + return -ENOMEM; + + i->iter = 0; + i->c = c; + i->buf = PRINTBUF; + file->private_data = i; + + return 0; +} + +static int lock_held_stats_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + + return 0; +} + +static ssize_t lock_held_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct lock_held_stats *lhs = &i->c->lock_held_stats; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + prt_printf(&i->buf, "%s:", lhs->names[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 8); + bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); + printbuf_indent_sub(&i->buf, 8); + prt_newline(&i->buf); + i->iter++; + } + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations lock_held_stats_op = { + .owner = THIS_MODULE, + .open = lock_held_stats_open, + .release = lock_held_stats_release, + .read = lock_held_stats_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -668,6 +737,11 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { + debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, + c, &lock_held_stats_op); + } + c->btree_debug_dir = 
debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; -- cgit From f501ad2b8108a7910adf494fcc5c59bbbfa886e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 21:33:00 -0400 Subject: bcachefs: bch2_mark_alloc(): Do wakeups after updating usage We have an obvious wake up race if we do the wakeup _before_ updating the counters the thing doing the waiting is reading. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 99c9d5b14d48..71618f5bfcd5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -547,22 +547,6 @@ int bch2_mark_alloc(struct btree_trans *trans, } } - if (new_a.data_type == BCH_DATA_free && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) - closure_wake_up(&c->freelist_wait); - - if (new_a.data_type == BCH_DATA_need_discard && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); - - if (old_a.data_type != BCH_DATA_cached && - new_a.data_type == BCH_DATA_cached && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); - - if (new_a.data_type == BCH_DATA_need_gc_gens) - bch2_do_gc_gens(c); - percpu_down_read(&c->mark_lock); if (!gc && new_a.gen != old_a.gen) *bucket_gen(ca, new.k->p.offset) = new_a.gen; @@ -602,6 +586,22 @@ int bch2_mark_alloc(struct btree_trans *trans, } } + if (new_a.data_type == BCH_DATA_free && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a.data_type == BCH_DATA_need_discard && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a.data_type != BCH_DATA_cached && + new_a.data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a.data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + return 0; } -- cgit From 8ef983139940439b3c169ea90dbb8c21e5dcadd3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 21:40:39 -0400 Subject: bcachefs: Improve bucket_alloc_fail tracepoint We should be printing the number of free buckets, not just the number of available buckets. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 15 ++++++++++++--- fs/bcachefs/movinggc.h | 1 + fs/bcachefs/trace.h | 29 +++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 174b3a745ab8..2d44ce2e11de 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -26,6 +26,7 @@ #include "error.h" #include "io.h" #include "journal.h" +#include "movinggc.h" #include "trace.h" #include @@ -526,7 +527,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bool waiting = false; again: usage = bch2_dev_usage_read(ca); - avail = dev_buckets_free(ca, usage,reserve); + avail = dev_buckets_free(ca, usage, reserve); if (usage.d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); @@ -581,14 +582,22 @@ err: ob = ERR_PTR(-FREELIST_EMPTY); if (!IS_ERR(ob)) { - trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail, + trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], + usage.d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), buckets_seen, skipped_open, skipped_need_journal_commit, skipped_nouse, cl == NULL, PTR_ERR_OR_ZERO(ob)); } else { - trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, + trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], + usage.d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), buckets_seen, skipped_open, skipped_need_journal_commit, diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index 922738247d03..e85c8136a46e 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H +unsigned long bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index b0ecf18fa139..5782952b72a5 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -440,19 +440,26 @@ DEFINE_EVENT(bch_fs, gc_gens_end, DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 free, u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, u64 seen, u64 open, u64 need_journal_commit, u64 nouse, bool nonblocking, int ret), - TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), + TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + seen, open, need_journal_commit, nouse, nonblocking, ret), TP_STRUCT__entry( __field(dev_t, dev ) __array(char, reserve, 16 ) + __field(u64, free ) __field(u64, avail ) + __field(u64, copygc_wait_amount ) + __field(s64, copygc_waiting_for ) __field(u64, seen ) __field(u64, open ) __field(u64, need_journal_commit ) @@ -464,7 +471,10 @@ DECLARE_EVENT_CLASS(bucket_alloc, TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->free = free; __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; + __entry->copygc_waiting_for = copygc_waiting_for; __entry->seen = seen; __entry->open = open; __entry->need_journal_commit = need_journal_commit; @@ -473,10 +483,13 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->ret = ret; ), - TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu 
need_journal_commit %llu nouse %llu nonblocking %u ret %i", + TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, + __entry->free, __entry->avail, + __entry->copygc_wait_amount, + __entry->copygc_waiting_for, __entry->seen, __entry->open, __entry->need_journal_commit, @@ -487,26 +500,34 @@ DECLARE_EVENT_CLASS(bucket_alloc, DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 free, u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, u64 seen, u64 open, u64 need_journal_commit, u64 nouse, bool nonblocking, int ret), - TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret) + TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + seen, open, need_journal_commit, nouse, nonblocking, ret) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 free, u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, u64 seen, u64 open, u64 need_journal_commit, u64 nouse, bool nonblocking, int ret), - TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret) + TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + seen, open, need_journal_commit, nouse, nonblocking, ret) ); TRACE_EVENT(discard_buckets, -- cgit From 4910a9506cff4760d56e8a362619dee3319bee8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:31:40 -0400 Subject: bcachefs: Convert bch2_do_discards_work() to for_each_btree_key2() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 112 +++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a01e79aba480..73e0029c7e34 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -937,17 +937,43 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret < 0 ? 
ret : 0; } -static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, - struct bch_dev *ca, bool *discard_done) +static int bch2_discard_one_bucket(struct btree_trans *trans, + struct btree_iter *need_discard_iter, + struct bpos *discard_pos_done, + u64 *seen, + u64 *open, + u64 *need_journal_commit, + u64 *discarded) { struct bch_fs *c = trans->c; - struct btree_iter iter; + struct bpos pos = need_discard_iter->pos; + struct btree_iter iter = { NULL }; struct bkey_s_c k; + struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; - int ret; + int ret = 0; + + ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); + return 0; + } + + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { + (*open)++; + goto out; + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + pos.inode, pos.offset)) { + (*need_journal_commit)++; + goto out; + } + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -983,7 +1009,8 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, goto out; } - if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { + if (bkey_cmp(*discard_pos_done, iter.pos) && + ca->mi.discard && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree @@ -993,7 +1020,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); - *discard_done = true; + *discard_pos_done = iter.pos; ret = bch2_trans_relock(trans) ? 
0 : -EINTR; if (ret) @@ -1003,9 +1030,18 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; out: + (*seen)++; bch2_trans_iter_exit(trans, &iter); + percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } @@ -1013,61 +1049,27 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct bch_dev *ca = NULL; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct bpos discard_pos_done = POS_MAX; int ret; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_need_discard, - POS_MIN, 0, k, ret) { - bool discard_done = false; - - if (ca && k.k->p.inode != ca->dev_idx) { - percpu_ref_put(&ca->io_ref); - ca = NULL; - } - - if (!ca) { - ca = bch_dev_bkey_exists(c, k.k->p.inode); - if (!percpu_ref_tryget(&ca->io_ref)) { - ca = NULL; - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); - continue; - } - } - - seen++; - - if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { - open++; - continue; - } - - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - k.k->p.inode, k.k->p.offset)) { - need_journal_commit++; - continue; - } - - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOFAIL, - bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); - if (ret) - break; - - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - discarded++; - } - bch2_trans_iter_exit(&trans, &iter); - - if (ca) - percpu_ref_put(&ca->io_ref); + /* + * We're doing the commit in bch2_discard_one_bucket instead of using + * for_each_btree_key_commit() so that we can increment counters after + * successful commit: + */ + ret = for_each_btree_key2(&trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded)); bch2_trans_exit(&trans); -- cgit From ca91f40ff79f432772660b1d10e04cfc71214458 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert bch2_dev_freespace_init() to for_each_btree_key_commit() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
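The conversions in this run of patches all follow the same shape: the body of
the old loop moves into a helper taking (trans, iter, k), and the new macro
drives iteration, retries and - for the _commit variant - the per-key commit.
A minimal sketch of that shape follows; frob_one_key()/frob_btree() and the
btree/flag choices are placeholders for illustration, not helpers from these
patches.

/*
 * Per-key work: updates queued with bch2_trans_update() are
 * committed by the macro after this returns 0.
 */
static int frob_one_key(struct btree_trans *trans,
			struct btree_iter *iter,
			struct bkey_s_c k)
{
	/* inspect k, possibly bch2_trans_update(trans, iter, ...) */
	return 0;
}

static int frob_btree(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
			POS_MIN, BTREE_ITER_PREFETCH, k,
			NULL, NULL,	/* disk reservation, journal seq */
			BTREE_INSERT_NOFAIL,
			frob_one_key(&trans, &iter, k));

	bch2_trans_exit(&trans);
	return ret;
}

for_each_btree_key_commit() expands to for_each_btree_key2() with the per-key
expression chained to bch2_trans_commit(), so each key's update is committed
before the iterator advances.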
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 73e0029c7e34..69cfc73b734f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1218,16 +1218,13 @@ void bch2_do_invalidates(struct bch_fs *c) percpu_ref_put(&c->writes); } -static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter) +static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_dev *ca) { struct bch_alloc_v4 a; - struct bkey_s_c k; - int ret; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; + if (iter->pos.offset >= ca->mi.nbuckets) + return 1; bch2_alloc_to_v4(k, &a); return bch2_bucket_do_index(trans, k, &a, true); @@ -1243,24 +1240,15 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS| - BTREE_ITER_PREFETCH, k, ret) { - if (iter.pos.offset >= ca->mi.nbuckets) - break; - - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bucket_freespace_init(&trans, &iter)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bucket_freespace_init(&trans, &iter, k, ca)); bch2_trans_exit(&trans); - if (ret) { + if (ret < 0) { bch_err(ca, "error initializing free space: %i", ret); return ret; } @@ -1270,7 +1258,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); mutex_unlock(&c->sb_lock); - return ret; + return 0; } int bch2_fs_freespace_init(struct bch_fs *c) -- cgit From 1615505cdf2c681c72ca7ab742c9a3fd39fccfe3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert bch2_check_lrus() to for_each_btree_key_commit() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
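A note on the retry semantics these helpers rely on (again a sketch built on
the placeholder helper above, not code from the patches): returning -EINTR
from the per-key expression makes for_each_btree_key2() restart the
transaction and retry the same position instead of advancing past it, which
is why the fsck helpers earlier in this series return -EINTR where they
previously did "goto peek".

static int frob_one_key(struct btree_trans *trans,
			struct btree_iter *iter,
			struct bkey_s_c k)
{
	/*
	 * If the path lost its locks or the lookup otherwise needs to
	 * be redone, -EINTR sends us back around the loop: the macro
	 * calls bch2_trans_begin() and peeks this position again.
	 */
	if (!iter->path->should_be_locked)
		return -EINTR;

	/* normal per-key work */
	return 0;
}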
Signed-off-by: Kent Overstreet --- fs/bcachefs/lru.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 94ecb3a39760..53e607d72274 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -130,25 +130,18 @@ int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, } static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter) + struct btree_iter *lru_iter, + struct bkey_s_c lru_k) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c lru_k, k; + struct bkey_s_c k; struct bch_alloc_v4 a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; struct bpos alloc_pos; int ret; - lru_k = bch2_btree_iter_peek(lru_iter); - if (!lru_k.k) - return 0; - - ret = bkey_err(lru_k); - if (ret) - return ret; - alloc_pos = POS(lru_k.k->p.inode, le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); @@ -202,16 +195,10 @@ int bch2_check_lrus(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(&trans, &iter, k)); bch2_trans_exit(&trans); return ret; -- cgit From 1329c7ce5651df67e5986e08fd16545c36a029a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert more quota code to for_each_btree_key2() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 48 +++++++++--------------------------------------- 1 file changed, 9 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index e35a6d1f31e9..42c831da70be 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -370,6 +370,9 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) BUG_ON(k.k->p.inode >= QTYP_NR); + if (!((1U << k.k->p.inode) & enabled_qtypes(c))) + return 0; + switch (k.k->type) { case KEY_TYPE_quota: dq = bkey_s_c_to_quota(k); @@ -393,30 +396,6 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) return 0; } -static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->p.inode != type) - break; - - ret = __bch2_quota_set(c, k); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - void bch2_fs_quota_exit(struct bch_fs *c) { unsigned i; @@ -491,8 +470,6 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { - unsigned i, qtypes = enabled_qtypes(c); - struct bch_memquota_type *q; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; @@ -502,23 +479,16 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_init_type(c, i); - if (ret) - return ret; - } - bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, - k, + ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); if (ret) - bch_err(c, "err reading inodes in quota init: %i", ret); + bch_err(c, "err in quota_read: %i", ret); bch2_trans_exit(&trans); return ret; -- cgit From eace11a730b36e7e8ee184675927ce7e658e7616 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert more fsck code to for_each_btree_key2() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 129 +++++++++++++++++++++++++---------------------------- 1 file changed, 60 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6165878c2ddc..8f006b9a4804 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1003,10 +1003,8 @@ static int check_inodes(struct bch_fs *c, bool full) ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_inode(&trans, &iter, k, &prev, &s, full)); bch2_trans_exit(&trans); @@ -2194,6 +2192,47 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links return ret; } +static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct nlink_table *links, + size_t *idx, u64 range_end) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct nlink *link = &links->d[*idx]; + int ret = 0; + + if (k.k->p.offset >= range_end) + return 1; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (S_ISDIR(le16_to_cpu(u.bi_mode))) + return 0; + + if (!u.bi_nlink) + return 0; + + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { + BUG_ON(*idx == links->nr); + link = &links->d[++*idx]; + } + + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + ret = __write_inode(trans, &u, k.k->p.snapshot); + } +fsck_err: + return ret; +} + noinline_for_stack static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, @@ -2202,56 +2241,25 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_inode_unpacked u; - struct nlink *link = links->d; + size_t idx = 0; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->p.offset >= range_end) - break; - - if (!bkey_is_inode(k.k)) - continue; - - BUG_ON(bch2_inode_unpack(k, &u)); - - if (S_ISDIR(le16_to_cpu(u.bi_mode))) - continue; - - if (!u.bi_nlink) - continue; - - while ((cmp_int(link->inum, k.k->p.offset) ?: - cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { - link++; - BUG_ON(link >= links->d + links->nr); - } - - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, - "inode %llu type %s has wrong i_nlink (%u, should be %u)", - u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], - bch2_inode_nlink_get(&u), link->count)) { - bch2_inode_nlink_set(&u, link->count); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); - ret = write_inode(&trans, &u, k.k->p.snapshot); - if (ret) - bch_err(c, "error in fsck: error %i updating inode", ret); - } - } -fsck_err: - bch2_trans_iter_exit(&trans, &iter); 
bch2_trans_exit(&trans); - if (ret) + if (ret < 0) { bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + return ret; + } - return ret; + return 0; } noinline_for_stack @@ -2291,21 +2299,13 @@ static int check_nlinks(struct bch_fs *c) return ret; } -static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { - struct bkey_s_c k; struct bkey_s_c_reflink_p p; struct bkey_i_reflink_p *u; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - if (k.k->type != KEY_TYPE_reflink_p) return 0; @@ -2341,20 +2341,11 @@ static int fix_reflink_p(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->type == KEY_TYPE_reflink_p) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter)); - if (ret) - break; - } - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter, k)); bch2_trans_exit(&trans); return ret; -- cgit From 326568f18cb57ed95a420361da9a64330e122cda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert bch2_gc_done() for_each_btree_key2() This converts bch2_gc_stripes_done() and bch2_gc_reflink_done() to the new for_each_btree_key_commit() macro. The new for_each_btree_key2() and for_each_btree_key_commit() macros handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
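One convention worth noting, judging from the helpers converted in this and the neighbouring patches: the per-key expression can return a positive value to stop the walk early, and callers then treat only negative values as errors. A made-up sketch of that convention (walk_one_key() is an invented name):

	static int walk_one_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				struct bpos end_pos)
	{
		if (bkey_cmp(k.k->p, end_pos) >= 0)
			return 1;	/* past the range: stop, not an error */

		/* ... per-key work ... */
		return 0;
	}

The caller then folds the positive result back into success, e.g. return ret < 0 ? ret : 0;, as the alloc write-back conversion below does.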
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 232 +++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f72a5ceb130b..7a7639e9ee3f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1321,21 +1321,19 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, bool metadata_only) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); struct bucket gc, *b; - struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old, new; enum bch_data_type type; int ret; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; + if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + return 1; bch2_alloc_to_v4(k, &old); new = old; @@ -1428,23 +1426,13 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - for_each_btree_key(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS| - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) - break; + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, k, metadata_only)); - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, - metadata_only)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) { + if (ret < 0) { bch_err(c, "error writing alloc info: %i", ret); percpu_ref_put(&ca->ref); break; @@ -1452,7 +1440,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) } bch2_trans_exit(&trans); - return ret; + return ret < 0 ? 
ret : 0; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) @@ -1536,72 +1524,79 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) }; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - size_t idx = 0; + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); struct printbuf buf = PRINTBUF; + struct reflink_gc *r; int ret = 0; - if (metadata_only) + if (!refcount) return 0; - bch2_trans_init(&trans, c, 0, 0); + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } - if (!refcount) - continue; + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new; - r = genradix_ptr(&c->reflink_gc_table, idx++); - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - ret = -EINVAL; - break; - } + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new; + bkey_reassemble(new, k); - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - break; - } + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); - bkey_reassemble(new, k); + ret = bch2_trans_update(trans, iter, new, 0); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(new) = cpu_to_le64(r->refcount); +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); - kfree(new); + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); - if (ret) - break; - } - } -fsck_err: - bch2_trans_iter_exit(&trans, &iter); c->reflink_gc_nr = 0; bch2_trans_exit(&trans); - printbuf_exit(&buf); return ret; } @@ -1653,66 +1648,73 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) r->refcount = 0; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_write_stripes_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct gc_stripe *m; - const struct 
bch_stripe *s; + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + const struct bch_stripe *s; + struct gc_stripe *m; unsigned i; int ret = 0; - if (metadata_only) + if (k.k->type != KEY_TYPE_stripe) return 0; - bch2_trans_init(&trans, c, 0, 0); + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; - - s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - continue; + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + return 0; inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf), - m ? m->block_sectors[i] : 0)) { - struct bkey_i_stripe *new; - - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - break; - } + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; - bkey_reassemble(&new->k_i, k); + bkey_reassemble(&new->k_i, k); - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); - kfree(new); - } + ret = bch2_trans_update(trans, iter, &new->k_i, 0); } fsck_err: - bch2_trans_iter_exit(&trans, &iter); + printbuf_exit(&buf); + return ret; +} - bch2_trans_exit(&trans); +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; - printbuf_exit(&buf); + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_stripes_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); return ret; } -- cgit From dadecd02c49c7bb14c04445fc514c394e28c1ae3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Jul 2022 02:08:58 -0400 Subject: bcachefs: bch2_trans_run() This adds a new helper, bch2_trans_run(), that runs a function with a btree_transaction context but without handling transaction restarts. We're adding checks for nested transaction restart handling: when an inner transaction handles a transaction restart it will still have to return it to the outer transaction, or else assertions will be popped in the outer transaction. But some places don't need restart handling at the outer scope, so this helper does what they need. 
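A minimal sketch of what that looks like in practice (inner_work() and run_once() are invented names): the macro declares and initializes a local trans, evaluates the expression once, tears the transaction down, and returns the expression's value unchanged - including any -EINTR from a restart:

	static int run_once(struct bch_fs *c)
	{
		/* no retry here: a transaction restart inside inner_work()
		 * is returned to the caller as-is */
		return bch2_trans_run(c, inner_work(&trans));
	}

It also composes with the new iteration macros, as the erasure coding conversion later in this series does.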
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 12 ++++++++++++ fs/bcachefs/buckets.c | 3 +-- fs/bcachefs/journal.c | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e9127dbf7e24..1c3dd012cae8 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -134,6 +134,18 @@ static inline int bch2_trans_commit(struct btree_trans *trans, _ret; \ }) +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = (_do); \ + bch2_trans_exit(&trans); \ + \ + _ret; \ +}) + #define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 71618f5bfcd5..136e116981d7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1931,8 +1931,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - __bch2_trans_mark_dev_sb(&trans, ca)); + return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); } /* Disk reservations: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a71bd1bb4066..26f60db751ca 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -882,7 +882,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!new_fs) { for (i = 0; i < nr_got; i++) { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(&trans, ca, bu[i], BCH_DATA_journal, ca->mi.bucket_size)); -- cgit From d04801a0f452f022fcb278b6428853460db75ab8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:31:40 -0400 Subject: bcachefs: Convert bch2_do_invalidates_work() to for_each_btree_key2() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 80 ++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 69cfc73b734f..99b3f35c42d7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1088,29 +1088,20 @@ void bch2_do_discards(struct bch_fs *c) percpu_ref_put(&c->writes); } -static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca, - struct bpos *bucket_pos, unsigned *cached_sectors) +static int invalidate_one_bucket(struct btree_trans *trans, + struct btree_iter *lru_iter, struct bkey_s_c k, + unsigned dev_idx, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter, alloc_iter = { NULL }; - struct bkey_s_c k; + struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a; - u64 bucket, idx; + struct bpos bucket; struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, - POS(ca->dev_idx, 0), 0); -next_lru: - k = bch2_btree_iter_peek(&lru_iter); - ret = bkey_err(k); - if (ret) - goto out; + unsigned cached_sectors; + int ret = 0; - if (!k.k || k.k->p.inode != ca->dev_idx) { - ret = 1; - goto out; - } + if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) + return 1; if (k.k->type != KEY_TYPE_lru) { prt_printf(&buf, "non lru key in lru btree:\n "); @@ -1118,26 +1109,22 @@ next_lru: if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { bch_err(c, "%s", buf.buf); - bch2_btree_iter_advance(&lru_iter); - goto next_lru; } else { bch2_trans_inconsistent(trans, "%s", buf.buf); ret = -EINVAL; - goto out; } - } - idx = k.k->p.offset; - bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + goto out; + } - *bucket_pos = POS(ca->dev_idx, bucket); + bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); - a = bch2_trans_start_alloc_update(trans, &alloc_iter, *bucket_pos); + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; - if (idx != alloc_lru_idx(a->v)) { + if (k.k->p.offset != alloc_lru_idx(a->v)) { prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); prt_printf(&buf, "\n "); @@ -1145,19 +1132,18 @@ next_lru: if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { bch_err(c, "%s", buf.buf); - bch2_btree_iter_advance(&lru_iter); - goto next_lru; } else { bch2_trans_inconsistent(trans, "%s", buf.buf); ret = -EINVAL; - goto out; } + + goto out; } if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); - *cached_sectors = a->v.cached_sectors; + cached_sectors = a->v.cached_sectors; SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); a->v.gen++; @@ -1167,13 +1153,18 @@ next_lru: a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE); + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); if (ret) goto out; + + trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); + this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); + --*nr_to_invalidate; out: bch2_trans_iter_exit(trans, &alloc_iter); - bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); 
return ret; } @@ -1183,8 +1174,9 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_dev *ca; struct btree_trans trans; - struct bpos bucket; - unsigned i, sectors; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -1193,17 +1185,13 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - while (nr_to_invalidate-- >= 0) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOFAIL, - invalidate_one_bucket(&trans, ca, &bucket, - §ors)); - if (ret) - break; + ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, + POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); - trace_invalidate_bucket(c, bucket.inode, bucket.offset, sectors); - this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); + if (ret < 0) { + percpu_ref_put(&ca->ref); + break; } } -- cgit From 8933315689bcb57a3b282bad262ac584e095a2f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:31:40 -0400 Subject: bcachefs: Convert bch2_dev_usrdata_drop() to for_each_btree_key2() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 5 +++ fs/bcachefs/migrate.c | 111 +++++++++++++++++++++------------------------- 2 files changed, 56 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 8cf3ef749020..64f4bc8913e8 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -650,6 +650,11 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_PTRS; +} + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5345697f2712..be89628702f7 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -35,85 +35,76 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, - enum btree_id btree_id) +static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + int flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + if (!bch2_bkey_has_device(k, dev_idx)) + return 0; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); + if (ret) + return ret; + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the 
extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_buf sk; + enum btree_id id; int ret = 0; - bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(&trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k))) { - if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_advance(&iter); + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) continue; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), - dev_idx, flags, false); - if (ret) - break; - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&sk.k->k)) - sk.k->k.size = 0; - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - - /* - * don't want to leave ret == -EINTR, since if we raced and - * something else overwrote the key we could spuriously return - * -EINTR below: - */ - if (ret == -EINTR) - ret = 0; + ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - BUG_ON(ret == -EINTR); return ret; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -{ - return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: - __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink); -} - static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; -- cgit From 6738dd19db5c96e574af79a3b7c1754fb2ecf2bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert subvol code to for_each_btree_key_commit() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
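Since the final macro argument is just an expression re-evaluated for each key, per-walk state that used to live inside the old loop can be declared in the caller and passed down by pointer - that is how the conversion below threads last_pos and equiv_seen through snapshot_delete_key(). A sketch of the same shape (note_one_key() is an invented name):

	static int note_one_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				struct bpos *last_pos)
	{
		/* plain assignments like this stay correct even if the
		 * expression is re-evaluated after a transaction restart */
		*last_pos = k.k->p;
		return 0;
	}

In the caller, the state lives next to the iterator declarations and the macro call simply reads note_one_key(&trans, &iter, k, &last_pos).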
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 105 ++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 1a212bac2a04..76be8735c700 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -291,22 +291,14 @@ int bch2_fs_check_snapshots(struct bch_fs *c) } static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_s_c k) { - struct bkey_s_c k; struct bkey_s_c_subvolume subvol; struct bch_snapshot snapshot; unsigned snapid; int ret; - k = bch2_btree_iter_peek(iter); - if (!k.k) - return 0; - - ret = bkey_err(k); - if (ret) - return ret; - if (k.k->type != KEY_TYPE_subvolume) return 0; @@ -336,22 +328,15 @@ int bch2_fs_check_subvols(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; int ret; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, - POS_MIN, BTREE_ITER_PREFETCH); - - do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter)); - if (ret) - break; - } while (bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter, k)); bch2_trans_exit(&trans); @@ -595,59 +580,27 @@ err: return ret; } -static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, - snapshot_id_list *deleted, - enum btree_id btree_id) +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - snapshot_id_list equiv_seen = { 0 }; - struct bpos last_pos = POS_MIN; - int ret = 0; + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; - /* - * XXX: We should also delete whiteouts that no longer overwrite - * anything - */ - - bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k))) { - u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; - - if (bkey_cmp(k.k->p, last_pos)) - equiv_seen.nr = 0; - last_pos = k.k->p; - - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(&equiv_seen, equiv)) { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(trans, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); - if (ret) - break; - } else { - ret = snapshot_list_add(c, &equiv_seen, equiv); - if (ret) - break; - } + if (bkey_cmp(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; - bch2_btree_iter_advance(&iter); + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); } - bch2_trans_iter_exit(trans, &iter); - - darray_exit(&equiv_seen); - - return ret; } static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, @@ -742,10 +695,20 @@ int 
bch2_delete_dead_snapshots(struct bch_fs *c) } for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + if (!btree_type_has_snapshots(id)) continue; - ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + ret = for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); + + darray_exit(&equiv_seen); + if (ret) { bch_err(c, "error deleting snapshot keys: %i", ret); goto err; -- cgit From 445d184af25abd70575539f7d7b2c8e25b8c49c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert alloc code to for_each_btree_key_commit() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 59 ++++++++++++------------------------------ 1 file changed, 16 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 99b3f35c42d7..baefd12a3fe8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -715,7 +715,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; - struct bkey_s_c k, freespace_k; + struct bkey_s_c alloc_k; struct bch_alloc_v4 a; u64 genbits; struct bpos pos; @@ -725,14 +725,6 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - freespace_k = bch2_btree_iter_peek(iter); - if (!freespace_k.k) - return 1; - - ret = bkey_err(freespace_k); - if (ret) - return ret; - pos = iter->pos; pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); @@ -744,18 +736,18 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) goto delete; - k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(k); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); if (ret) goto err; - bch2_alloc_to_v4(k, &a); + bch2_alloc_to_v4(alloc_k, &a); if (fsck_err_on(a.data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(a)), c, "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_ids[iter->btree_id], a.data_type == state, genbits >> 56, alloc_freespace_genbits(a) >> 56)) @@ -776,6 +768,7 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter, discard_iter, freespace_iter; + struct bkey_s_c k; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -805,36 +798,16 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret < 0) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH); - while (1) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)); - if (ret) - break; - - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret < 0) - goto err; - - bch2_trans_iter_init(&trans, &iter, 
BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); - while (1) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)); - if (ret) - break; - - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_discard_freespace_key(&trans, &iter)) ?: + for_each_btree_key_commit(&trans, iter, + BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_discard_freespace_key(&trans, &iter)); err: bch2_trans_exit(&trans); return ret < 0 ? ret : 0; -- cgit From e941ae7d3afc68127adef917a2b779dabe83fdfe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 19:35:38 -0400 Subject: bcachefs: Add a counter for btree_trans restarts This will help us improve nested transactions - we need to add assertions that whenever an inner transaction handles a restart, it still returns -EINTR to the outer transaction. This also adds nested_lockrestart_do() and nested_commit_do() which use the new counters to correctly return -EINTR when the transaction was restarted. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 13 +++++++++++-- fs/bcachefs/btree_iter.h | 46 +++++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/btree_types.h | 3 +++ fs/bcachefs/btree_update.h | 16 ++++------------ 4 files changed, 63 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 30958cbb9532..45ecd196bceb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3188,7 +3188,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) * node may return EINTR when the trylock fails. When this occurs * bch2_trans_begin() should be called and the transaction retried. */ -void bch2_trans_begin(struct btree_trans *trans) +u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; @@ -3234,11 +3234,20 @@ void bch2_trans_begin(struct btree_trans *trans) bch2_trans_relock(trans); } + trans->last_restarted_ip = _RET_IP_; if (trans->restarted) bch2_btree_path_traverse_all(trans); - trans->restarted = false; trans->last_begin_time = ktime_get_ns(); + return trans->restart_count; +} + +void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) +{ + bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, + "trans->restart_count %u, should be %u, last restarted by %ps\n", + trans->restart_count, restart_count, + (void *) trans->last_restarted_ip); } static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 209b89dd1d2b..c2f5afc9eeb9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -203,10 +203,18 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) +{ + return restart_count != trans->restart_count ? 
-EINTR : 0; +} + +void bch2_trans_verify_not_restarted(struct btree_trans *, u32); + __always_inline static inline int btree_trans_restart(struct btree_trans *trans) { trans->restarted = true; + trans->restart_count++; bch2_trans_unlock(trans); return -EINTR; } @@ -321,7 +329,7 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) } void *bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_begin(struct btree_trans *); +u32 bch2_trans_begin(struct btree_trans *); static inline struct btree * __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) @@ -394,6 +402,42 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } +#define lockrestart_do(_trans, _do) \ +({ \ + int _ret; \ + \ + do { \ + bch2_trans_begin(_trans); \ + _ret = (_do); \ + } while (_ret == -EINTR); \ + \ + _ret; \ +}) + +/* + * nested_lockrestart_do(), nested_commit_do(): + * + * These are like lockrestart_do() and commit_do(), with two differences: + * + * - We don't call bch2_trans_begin() unless we had a transaction restart + * - We return -EINTR if we succeeded after a transaction restart + */ +#define nested_lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count, _orig_restart_count; \ + int _ret; \ + \ + _restart_count = _orig_restart_count = (_trans)->restart_count; \ + \ + while ((_ret = (_do)) == -EINTR) \ + _restart_count = bch2_trans_begin(_trans); \ + \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + _ret ?: trans_was_restarted(_trans, _orig_restart_count); \ +}) + #define for_each_btree_key2(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 64f4bc8913e8..0650a3558182 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -410,6 +410,9 @@ struct btree_trans { bool memory_allocation_failure:1; bool journal_transaction_names:1; bool journal_replay_not_finished:1; + u32 restart_count; + unsigned long last_restarted_ip; + /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1c3dd012cae8..9b5a8b18b01b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -106,22 +106,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -#define lockrestart_do(_trans, _do) \ -({ \ - int _ret; \ - \ - do { \ - bch2_trans_begin(_trans); \ - _ret = (_do); \ - } while (_ret == -EINTR); \ - \ - _ret; \ -}) - #define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) +#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ -- cgit From 0a5156334c721295928b4c5f42eabb9c625cd73e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 00:44:19 -0400 Subject: bcachefs: Convert erasure coding to for_each_btree_key_commit() The new for_each_btree_key2() macro handles transaction retries, allowing us to avoid nested transactions - which we want to avoid since they're tricky to do completely correctly and upcoming assertions are going to be checking for that. 
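Tying this to the restart counter added just above (outer_op() and inner_work() are invented names): nested_lockrestart_do() retries the inner expression on -EINTR, but if any restart happened at all it still reports -EINTR to its caller, so an outer transaction never keeps using iterators that were invalidated underneath it:

	static int outer_op(struct btree_trans *trans)
	{
		/* inner_work() may restart the transaction; the helper retries
		 * it, then returns -EINTR anyway if a restart occurred so the
		 * outer transaction restarts too instead of continuing with
		 * stale state */
		return nested_lockrestart_do(trans, inner_work(trans));
	}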
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 112 +++++++++++++++++++++++-------------------------------- 1 file changed, 47 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6ce352c526f0..ed33563d6c28 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -822,80 +822,62 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, }; } -static int ec_stripe_update_ptrs(struct bch_fs *c, - struct ec_stripe_buf *s, - struct bkey *pos) +static int ec_stripe_update_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct ec_stripe_buf *s, + struct bpos end) { - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_extent e; - struct bkey_buf sk; - struct bpos next_pos; - int ret = 0, dev, block; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - /* XXX this doesn't support the reflink btree */ - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(pos), - BTREE_ITER_INTENT); -retry: - while (bch2_trans_begin(&trans), - (k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k)) && - bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { - const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; - - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_advance(&iter); - continue; - } + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bkey_i *n; + int ret, dev, block; - ptr_c = bkey_matches_stripe(&s->key.v, k, &block); - /* - * It doesn't generally make sense to erasure code cached ptrs: - * XXX: should we be incrementing a counter? - */ - if (!ptr_c || ptr_c->cached) { - bch2_btree_iter_advance(&iter); - continue; - } + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + return 1; - dev = s->key.v.ptrs[block].dev; + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + return 0; - bch2_bkey_buf_reassemble(&sk, c, k); - e = bkey_i_to_s_extent(sk.k); + ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? 
+ */ + if (!ptr_c || ptr_c->cached) + return 0; - bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); - ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); - BUG_ON(!ec_ptr); + dev = s->key.v.ptrs[block].dev; - extent_stripe_ptr_add(e, s, ec_ptr, block); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; - bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); - next_pos = sk.k->k.p; + bkey_reassemble(n, k); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - if (!ret) - bch2_btree_iter_set_pos(&iter, next_pos); - if (ret) - break; - } - if (ret == -EINTR) - goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); + BUG_ON(!ec_ptr); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); + extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); - return ret; + return bch2_trans_update(trans, iter, n, 0); +} + +static int ec_stripe_update_extents(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, bkey_start_pos(pos), + BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); } /* @@ -966,7 +948,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); + ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); if (ret) { bch_err(c, "error creating stripe: error %i updating pointers", ret); break; -- cgit From 175379db206a3a36a80585b00bb974a6ab6c43c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 20:08:37 -0400 Subject: bcachefs: ec_stripe_bkey_insert() -> for_each_btree_key_norestart() With the upcoming patches to add assertions for incorrect nested transaction restart handling, this code is now bogus. Switch it to for_each_btree_key_norestart() so that transaction restarts are only handled in one place. 
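The _norestart flavour keeps the familiar loop-body form - iterator, key, ret - but leaves any -EINTR in ret for whoever owns the transaction to handle, rather than restarting locally. A rough sketch with invented names:

	static int find_empty_slot(struct btree_trans *trans, struct bpos start,
				   struct bpos *slot)
	{
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret;

		for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start,
				BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
			if (bkey_deleted(k.k))
				break;
		}

		if (!ret)
			*slot = iter.pos;	/* any restart is the caller's to handle */

		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}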
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ed33563d6c28..80e1689765e6 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -726,7 +726,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { @@ -740,12 +740,13 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, } if (bkey_deleted(k.k)) - goto found_slot; + break; } - goto err; -found_slot: - start_pos = iter.pos; + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; ret = ec_stripe_mem_alloc(trans, &iter); if (ret) @@ -754,8 +755,6 @@ found_slot: stripe->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); - - c->ec_stripe_hint = start_pos.offset; err: bch2_trans_iter_exit(trans, &iter); -- cgit From b962552eabd59f0026dcc21c14775b9d78336baf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 22:59:01 -0400 Subject: bcachefs: Fix should_invalidate_buckets() Like bch2_copygc_wait_amount, should_invalidate_buckets() needs to try to ensure that there are always more buckets free than the largest reserve. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ff366e61ace5..488db3211ce4 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -134,11 +134,13 @@ void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, struct bch_dev_usage u) { - u64 free = u.d[BCH_DATA_free].buckets + - u.d[BCH_DATA_need_discard].buckets; + u64 want_free = ca->mi.nbuckets >> 7; + u64 free = max_t(s64, 0, + u.d[BCH_DATA_free].buckets + + u.d[BCH_DATA_need_discard].buckets + - bch2_dev_buckets_reserved(ca, RESERVE_none)); - return clamp_t(s64, (ca->mi.nbuckets >> 7) - free, - 0, u.d[BCH_DATA_cached].buckets); + return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } void bch2_do_invalidates(struct bch_fs *); -- cgit From 3ab25c1b4ef2a57b8bc55e786e90af63f7d06663 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jun 2022 23:06:16 -0400 Subject: bcachefs: We can handle missing btree roots for all alloc btrees We can rebuild alloc info if these btree roots are missing - no need to bail out and say the filesystem is unrecoverable Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b7598e26c683..7fb470e2e7f3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -924,6 +924,18 @@ fsck_err: return ERR_PTR(ret); } +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + return true; + default: + return false; + } +} + static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -935,14 +947,14 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (i == 
BTREE_ID_alloc && + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } if (r->error) { - __fsck_err(c, i == BTREE_ID_alloc + __fsck_err(c, btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, "invalid btree root %s", bch2_btree_ids[i]); @@ -952,11 +964,12 @@ static int read_btree_roots(struct bch_fs *c) ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, i == BTREE_ID_alloc + __fsck_err(c, + btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_alloc) + if (btree_id_is_alloc(i)) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -1192,6 +1205,9 @@ use_clean: if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_journal_log_msg(&c->journal, "dropping alloc info"); + /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: -- cgit From 615f867c14b2d70efb02dafb8e668d984e74d0e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 22:31:21 -0400 Subject: bcachefs: Improved errcodes Instead of overloading standard error codes (EINTR/EAGAIN), and defining short lists of error codes in multiple places that potentially end up overlapping & conflicting, we're now going to have one master list of error codes. Error codes are defined with an x-macro: thus we also have bch2_err_str() now. Also, error codes have a class field. Now, instead of checking for errors with ==, code should use bch2_err_matches(), which returns true if the error is equal to or a sub-error of the error class. This means we can define unique errors for every source location where an error is generated, which will help improve our error messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 1 + fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/alloc_foreground.c | 39 ++++++++++++++++---------------- fs/bcachefs/errcode.c | 51 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/errcode.h | 33 ++++++++++++++++++++++----- fs/bcachefs/fsck.c | 4 ++-- fs/bcachefs/trace.h | 32 +++++++++++++------------- 8 files changed, 120 insertions(+), 44 deletions(-) create mode 100644 fs/bcachefs/errcode.c (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 7ae85900e5b4..76953e05b240 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -21,6 +21,7 @@ config BCACHEFS_FS select XOR_BLOCKS select XXHASH select SRCU + select SYMBOLIC_ERRNAME help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. 
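To make the class relationship concrete (illustrative only, using codes from the table introduced below): no_buckets_found names freelist_empty as its class, so matching on either the specific code or its parent succeeds, and bch2_err_str() returns the symbolic name:

	static void errcode_example(void)
	{
		int ret = -BCH_ERR_no_buckets_found;

		WARN_ON(!bch2_err_matches(ret, BCH_ERR_no_buckets_found)); /* exact code */
		WARN_ON(!bch2_err_matches(ret, BCH_ERR_freelist_empty));   /* parent class */
		WARN_ON(bch2_err_matches(ret, BCH_ERR_open_buckets_empty));/* unrelated */

		pr_info("%s\n", bch2_err_str(ret));	/* "no_buckets_found" */
		pr_info("%s\n", bch2_err_str(-ENOMEM));	/* standard errnos fall back to errname() */
	}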
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 95b990ad0196..2f4bd31c862f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,6 +27,7 @@ bcachefs-y := \ disk_groups.o \ data_update.o \ ec.o \ + errcode.o \ error.o \ extents.o \ extent_update.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index baefd12a3fe8..9ba1fdba4138 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1051,7 +1051,8 @@ static void bch2_do_discards_work(struct work_struct *work) percpu_ref_put(&c->writes); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret); + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + bch2_err_str(ret)); } void bch2_do_discards(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2d44ce2e11de..39e3bb5205ca 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -238,7 +238,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); - return ERR_PTR(-OPEN_BUCKETS_EMPTY); + return ERR_PTR(-BCH_ERR_open_buckets_empty); } /* Recheck under lock: */ @@ -440,7 +440,7 @@ again: goto again; } - return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); + return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, @@ -548,7 +548,7 @@ again: if (!c->blocked_allocate) c->blocked_allocate = local_clock(); - ob = ERR_PTR(-FREELIST_EMPTY); + ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; } @@ -579,7 +579,7 @@ again: bch2_journal_flush_async(&c->journal, NULL); err: if (!ob) - ob = ERR_PTR(-FREELIST_EMPTY); + ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) { trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], @@ -591,7 +591,8 @@ err: skipped_open, skipped_need_journal_commit, skipped_nouse, - cl == NULL, PTR_ERR_OR_ZERO(ob)); + cl == NULL, + ""); } else { trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], usage.d[BCH_DATA_free].buckets, @@ -602,7 +603,8 @@ err: skipped_open, skipped_need_journal_commit, skipped_nouse, - cl == NULL, PTR_ERR_OR_ZERO(ob)); + cl == NULL, + bch2_err_str(PTR_ERR(ob))); atomic_long_inc(&c->bucket_alloc_fail); } @@ -750,7 +752,7 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, if (*nr_effective >= nr_replicas) ret = 0; else if (!ret) - ret = -INSUFFICIENT_DEVICES; + ret = -BCH_ERR_insufficient_devices; return ret; } @@ -923,8 +925,8 @@ static int open_bucket_add_buckets(struct btree_trans *trans, nr_replicas, nr_effective, have_cache, flags, _cl); if (ret == -EINTR || - ret == -FREELIST_EMPTY || - ret == -OPEN_BUCKETS_EMPTY) + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; if (*nr_effective >= nr_replicas) return 0; @@ -947,7 +949,7 @@ retry_blocking: reserve, flags, cl); if (ret && ret != -EINTR && - ret != -INSUFFICIENT_DEVICES && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && !cl && _cl) { cl = _cl; goto retry_blocking; @@ -1203,7 +1205,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == -INSUFFICIENT_DEVICES && + if (ret == -BCH_ERR_insufficient_devices && nr_effective >= nr_replicas_required) ret = 0; @@ -1234,19 +1236,18 @@ err: mutex_unlock(&wp->lock); - if (ret == -FREELIST_EMPTY && + if 
(bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(c, write_points_nr)) goto retry; - switch (ret) { - case -OPEN_BUCKETS_EMPTY: - case -FREELIST_EMPTY: + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) return cl ? -EAGAIN : -ENOSPC; - case -INSUFFICIENT_DEVICES: + + if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) return -EROFS; - default: - return ret; - } + + return ret; } int bch2_alloc_sectors_start(struct bch_fs *c, diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c new file mode 100644 index 000000000000..9da8a5973af0 --- /dev/null +++ b/fs/bcachefs/errcode.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "errcode.h" + +#include + +static const char * const bch2_errcode_strs[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, + BCH_ERRCODES() +#undef x + NULL +}; + +#define BCH_ERR_0 0 + +static unsigned bch2_errcode_parents[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, + BCH_ERRCODES() +#undef x +}; + +const char *bch2_err_str(int err) +{ + const char *errstr; + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); + + if (err >= BCH_ERR_START) + errstr = bch2_errcode_strs[err - BCH_ERR_START]; + else if (err) + errstr = errname(err); + else + errstr = "(No error)"; + return errstr ?: "(Invalid error)"; +} + +bool __bch2_err_matches(int err, int class) +{ + err = abs(err); + class = abs(class); + + BUG_ON(err >= BCH_ERR_MAX); + BUG_ON(class >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && err != class) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return err == class; +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 0581f3c7a0d8..69cc7cdd1c06 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -2,12 +2,33 @@ #ifndef _BCACHEFS_ERRCODE_H #define _BCACHEFS_ERRCODE_H -enum { - /* Bucket allocator: */ - OPEN_BUCKETS_EMPTY = 2048, - FREELIST_EMPTY, /* Allocator thread not keeping up */ - INSUFFICIENT_DEVICES, - NEED_SNAPSHOT_CLEANUP, +#define BCH_ERRCODES() \ + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(freelist_empty, no_buckets_found) \ + x(0, insufficient_devices) \ + x(0, need_snapshot_cleanup) + +enum bch_errcode { + BCH_ERR_START = 2048, +#define x(class, err) BCH_ERR_##err, + BCH_ERRCODES() +#undef x + BCH_ERR_MAX }; +const char *bch2_err_str(int); +bool __bch2_err_matches(int, int); + +static inline bool _bch2_err_matches(int err, int class) +{ + return err && __bch2_err_matches(err, class); +} + +#define bch2_err_matches(_err, _class) \ +({ \ + BUILD_BUG_ON(!__builtin_constant_p(_class)); \ + _bch2_err_matches(_err, _class); \ +}) + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8f006b9a4804..e601a1ee0ee1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -534,7 +534,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_ids[btree_id], pos.inode, pos.offset, i->id, n.id, n.equiv); - return -NEED_SNAPSHOT_CLEANUP; + return -BCH_ERR_need_snapshot_cleanup; } return 0; @@ -2371,7 +2371,7 @@ again: check_nlinks(c) ?: fix_reflink_p(c); - if (ret == -NEED_SNAPSHOT_CLEANUP) { + if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); goto again; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5782952b72a5..65c38aa38359 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -449,9 
+449,9 @@ DECLARE_EVENT_CLASS(bucket_alloc, u64 need_journal_commit, u64 nouse, bool nonblocking, - int ret), + const char *err), TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, ret), + seen, open, need_journal_commit, nouse, nonblocking, err), TP_STRUCT__entry( __field(dev_t, dev ) @@ -465,7 +465,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) - __field(int, ret ) + __array(char, err, 16 ) ), TP_fast_assign( @@ -480,10 +480,10 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->need_journal_commit = need_journal_commit; __entry->nouse = nouse; __entry->nonblocking = nonblocking; - __entry->ret = ret; + strlcpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", + TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, __entry->free, @@ -495,7 +495,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->need_journal_commit, __entry->nouse, __entry->nonblocking, - __entry->ret) + __entry->err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, @@ -509,9 +509,9 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, u64 need_journal_commit, u64 nouse, bool nonblocking, - int ret), + const char *err), TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, ret) + seen, open, need_journal_commit, nouse, nonblocking, err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, @@ -525,15 +525,15 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, u64 need_journal_commit, u64 nouse, bool nonblocking, - int ret), + const char *err), TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, ret) + seen, open, need_journal_commit, nouse, nonblocking, err) ); TRACE_EVENT(discard_buckets, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, int ret), - TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err), TP_STRUCT__entry( __field(dev_t, dev ) @@ -541,7 +541,7 @@ TRACE_EVENT(discard_buckets, __field(u64, open ) __field(u64, need_journal_commit ) __field(u64, discarded ) - __field(int, ret ) + __array(char, err, 16 ) ), TP_fast_assign( @@ -550,16 +550,16 @@ TRACE_EVENT(discard_buckets, __entry->open = open; __entry->need_journal_commit = need_journal_commit; __entry->discarded = discarded; - __entry->ret = ret; + strlcpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->seen, __entry->open, __entry->need_journal_commit, __entry->discarded, - __entry->ret) + __entry->err) ); TRACE_EVENT(invalidate_bucket, -- cgit From d4bf5eecd78a90d019b933929a14c91d6d41af62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Jul 2022 19:42:58 -0400 Subject: bcachefs: Use bch2_err_str() in error messages Signed-off-by: Kent Overstreet --- 
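An illustrative sketch (not part of the diff below) of how the conversions in this patch are meant to read once combined with the errcode.{c,h} helpers introduced earlier in the series: bch2_err_str() maps either a private -BCH_ERR_* code or a standard -errno to a printable name, and bch2_err_matches() walks the parent table, so a specific code such as -BCH_ERR_no_buckets_found still matches its class BCH_ERR_freelist_empty. The function below and its callee do_alloc_thing() are hypothetical, invented only for illustration; the helpers and error codes it uses are the real names from the patches above.

/*
 * Sketch only, assuming the errcode helpers from the previous patch;
 * do_alloc_thing() is a hypothetical callee, not a bcachefs function.
 */
static int alloc_example(struct bch_fs *c)
{
	int ret = do_alloc_thing(c);

	/* true for -BCH_ERR_freelist_empty and its child -BCH_ERR_no_buckets_found */
	if (bch2_err_matches(ret, BCH_ERR_freelist_empty))
		return ret;

	/* anything else (including plain -errno values) gets a readable name */
	if (ret)
		bch_err(c, "allocation error: %s", bch2_err_str(ret));

	return ret;
}

Both bch2_err_str() and __bch2_err_matches() take abs() of their arguments, so callers keep returning negative codes exactly as they would with -errno values.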
fs/bcachefs/alloc_background.c | 4 +-- fs/bcachefs/btree_gc.c | 35 ++++++++++++------------ fs/bcachefs/checksum.c | 31 ++++++++++++--------- fs/bcachefs/ec.c | 3 ++- fs/bcachefs/fs.c | 8 +++--- fs/bcachefs/fsck.c | 61 ++++++++++++++++++++++-------------------- fs/bcachefs/journal_reclaim.c | 9 ++++--- fs/bcachefs/migrate.c | 4 ++- fs/bcachefs/move.c | 3 ++- fs/bcachefs/movinggc.c | 11 +++++--- fs/bcachefs/quota.c | 3 ++- fs/bcachefs/rebalance.c | 9 ++++--- fs/bcachefs/recovery.c | 5 ++-- fs/bcachefs/subvolume.c | 23 ++++++++-------- fs/bcachefs/super.c | 21 ++++++++------- fs/bcachefs/tests.c | 42 ++++++++++++++--------------- 16 files changed, 149 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9ba1fdba4138..eb44a8bc04fe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -464,7 +464,7 @@ int bch2_alloc_read(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info: %i", ret); + bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); return ret; } @@ -1211,7 +1211,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) bch2_trans_exit(&trans); if (ret < 0) { - bch_err(ca, "error initializing free space: %i", ret); + bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); return ret; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7a7639e9ee3f..e7098e910a73 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -402,8 +402,8 @@ again: } if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); break; } @@ -471,8 +471,8 @@ again: ret = PTR_ERR_OR_ZERO(cur); if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); goto err; } @@ -804,7 +804,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, fsck_err: err: if (ret) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); return ret; } @@ -910,7 +910,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, true); if (ret) { - bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + bch_err(c, "%s: error from bch2_gc_mark_key: %s", + __func__, bch2_err_str(ret)); goto fsck_err; } @@ -970,8 +971,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b continue; } } else if (ret) { - bch_err(c, "%s: error %i getting btree node", - __func__, ret); + bch_err(c, "%s: error getting btree node: %s", + __func__, bch2_err_str(ret)); break; } @@ -1038,7 +1039,7 @@ fsck_err: six_unlock_read(&b->c.lock); if (ret < 0) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); printbuf_exit(&buf); return ret; } @@ -1068,7 +1069,7 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(&trans, ids[i], initial, metadata_only); if (ret < 0) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -1266,7 +1267,7 @@ fsck_err: if (ca) percpu_ref_put(&ca->ref); if (ret) - bch_err(c, "%s: ret %i", __func__, ret); + bch_err(c, "error from %s(): %s", 
__func__, bch2_err_str(ret)); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); @@ -1433,7 +1434,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) bch2_alloc_write_key(&trans, &iter, k, metadata_only)); if (ret < 0) { - bch_err(c, "error writing alloc info: %i", ret); + bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); percpu_ref_put(&ca->ref); break; } @@ -1497,7 +1498,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info at gc start: %i", ret); + bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); return ret; } @@ -1968,7 +1969,7 @@ int bch2_gc_gens(struct bch_fs *c) BTREE_INSERT_NOFAIL, gc_btree_gens_key(&trans, &iter, k)); if (ret) { - bch_err(c, "error recalculating oldest_gen: %i", ret); + bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); goto err; } } @@ -1981,7 +1982,7 @@ int bch2_gc_gens(struct bch_fs *c) BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); if (ret) { - bch_err(c, "error writing oldest_gen: %i", ret); + bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); goto err; } @@ -2053,7 +2054,7 @@ static int bch2_gc_thread(void *arg) ret = bch2_gc_gens(c); #endif if (ret < 0) - bch_err(c, "btree gc failed: %i", ret); + bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); debug_check_no_locks_held(); } @@ -2083,7 +2084,7 @@ int bch2_gc_thread_start(struct bch_fs *c) p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { - bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); + bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); return PTR_ERR(p); } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 7c2af6754aea..b5850a761b91 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "checksum.h" +#include "errcode.h" #include "super.h" #include "super-io.h" @@ -527,7 +528,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { - bch_err(c, "error requesting encryption key: %i", ret); + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } @@ -552,20 +553,24 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { + int ret; + if (!c->chacha20) c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - if (IS_ERR(c->chacha20)) { - bch_err(c, "error requesting chacha20 module: %li", - PTR_ERR(c->chacha20)); - return PTR_ERR(c->chacha20); + ret = PTR_ERR_OR_ZERO(c->chacha20); + + if (ret) { + bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); + return ret; } if (!c->poly1305) c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); - if (IS_ERR(c->poly1305)) { - bch_err(c, "error requesting poly1305 module: %li", - PTR_ERR(c->poly1305)); - return PTR_ERR(c->poly1305); + ret = PTR_ERR_OR_ZERO(c->poly1305); + + if (ret) { + bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + return ret; } return 0; @@ -626,7 +631,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (keyed) { ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { - bch_err(c, "error requesting encryption key: %i", ret); + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } @@ -678,9 +683,9 @@ int bch2_fs_encryption_init(struct bch_fs *c) pr_verbose_init(c->opts, ""); c->sha256 = 
crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(c->sha256)) { - bch_err(c, "error requesting sha256 module"); - ret = PTR_ERR(c->sha256); + ret = PTR_ERR_OR_ZERO(c->sha256); + if (ret) { + bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); goto out; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 80e1689765e6..947f2f2b1c09 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -949,7 +949,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); if (ret) { - bch_err(c, "error creating stripe: error %i updating pointers", ret); + bch_err(c, "error creating stripe: error updating pointers: %s", + bch2_err_str(ret)); break; } } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 08268fe1074f..876552a2a83b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -8,6 +8,7 @@ #include "buckets.h" #include "chardev.h" #include "dirent.h" +#include "errcode.h" #include "extents.h" #include "fs.h" #include "fs-common.h" @@ -1871,10 +1872,9 @@ got_sb: sb->s_shrink.seeks = 0; vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - if (IS_ERR(vinode)) { - bch_err(c, "error mounting: error getting root inode %i", - (int) PTR_ERR(vinode)); - ret = PTR_ERR(vinode); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) { + bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); goto err_put_super; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e601a1ee0ee1..021affcc82d4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -137,8 +137,8 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: if (ret && ret != -EINTR) - bch_err(trans->c, "error %i fetching inode %llu", - ret, inode_nr); + bch_err(trans->c, "error fetching inode %llu: %s", + inode_nr, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -165,8 +165,8 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, *snapshot = iter.pos.snapshot; err: if (ret && ret != -EINTR) - bch_err(trans->c, "error %i fetching inode %llu:%u", - ret, inode_nr, *snapshot); + bch_err(trans->c, "error fetching inode %llu:%u: %s", + inode_nr, *snapshot, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -225,7 +225,8 @@ static int write_inode(struct btree_trans *trans, BTREE_INSERT_LAZY_RW, __write_inode(trans, inode, snapshot)); if (ret) - bch_err(trans->c, "error in fsck: error %i updating inode", ret); + bch_err(trans->c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); return ret; } @@ -314,7 +315,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_exit(trans, &iter); err: if (ret && ret != -EINTR) - bch_err(c, "error %i from __remove_dirent()", ret); + bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -350,7 +351,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, } if (ret && ret != -EINTR) - bch_err(c, "error looking up lost+found: %i", ret); + bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); if (ret) return ret; @@ -373,7 +374,7 @@ create_lostfound: 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); if (ret && ret != -EINTR) - bch_err(c, "error creating lost+found: %i", ret); + bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); return ret; } @@ -437,8 +438,8 @@ static int reattach_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL, 
__reattach_inode(trans, inode, inode_snapshot)); if (ret) { - bch_err(trans->c, "error %i reattaching inode %llu", - ret, inode->bi_inum); + bch_err(trans->c, "error reattaching inode %llu: %s", + inode->bi_inum, bch2_err_str(ret)); return ret; } @@ -910,7 +911,8 @@ static int check_inode(struct btree_trans *trans, ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); if (ret) - bch_err(c, "error in fsck: error %i while deleting inode", ret); + bch_err(c, "error in fsck: error while deleting inode: %s", + bch2_err_str(ret)); return ret; } @@ -933,7 +935,8 @@ static int check_inode(struct btree_trans *trans, POS(u.bi_inum, U64_MAX), 0, NULL); if (ret) { - bch_err(c, "error in fsck: error %i truncating inode", ret); + bch_err(c, "error in fsck: error truncating inode: %s", + bch2_err_str(ret)); return ret; } @@ -958,8 +961,8 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err(c, "error in fsck: error %i recounting inode sectors", - (int) sectors); + bch_err(c, "error in fsck: error recounting inode sectors: %s", + bch2_err_str(sectors)); return sectors; } @@ -978,13 +981,13 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) - bch_err(c, "error in fsck: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); } err: fsck_err: if (ret) - bch_err(c, "error %i from check_inode()", ret); + bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); return ret; } @@ -1010,7 +1013,7 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_exit(&trans); snapshots_seen_exit(&s); if (ret) - bch_err(c, "error %i from check_inodes()", ret); + bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); return ret; } @@ -1145,7 +1148,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "error %i from check_i_sectors()", ret); + bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); return ret ?: ret2; } @@ -1327,7 +1330,7 @@ static int check_extents(struct bch_fs *c) snapshots_seen_exit(&s); if (ret) - bch_err(c, "error %i from check_extents()", ret); + bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); return ret; } @@ -1366,7 +1369,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "error %i from check_subdir_count()", ret); + bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); return ret ?: ret2; } @@ -1485,7 +1488,7 @@ fsck_err: printbuf_exit(&buf); if (ret && ret != -EINTR) - bch_err(c, "error %i from check_target()", ret); + bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); return ret; } @@ -1658,7 +1661,7 @@ fsck_err: printbuf_exit(&buf); if (ret && ret != -EINTR) - bch_err(c, "error %i from check_dirent()", ret); + bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -1697,7 +1700,7 @@ static int check_dirents(struct bch_fs *c) inode_walker_exit(&target); if (ret) - bch_err(c, "error %i from check_dirents()", ret); + bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); return ret; } @@ -1733,7 +1736,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && ret != -EINTR) - bch_err(c, "error %i from 
check_xattr()", ret); + bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); return ret; } @@ -1765,7 +1768,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error %i from check_xattrs()", ret); + bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); return ret; } @@ -1797,7 +1800,7 @@ static int check_root_trans(struct btree_trans *trans) BTREE_INSERT_LAZY_RW, __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); if (ret) { - bch_err(c, "error writing root subvol: %i", ret); + bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); goto err; } @@ -1816,7 +1819,7 @@ static int check_root_trans(struct btree_trans *trans) ret = __write_inode(trans, &root_inode, snapshot); if (ret) - bch_err(c, "error writing root inode: %i", ret); + bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); } err: fsck_err: @@ -1969,7 +1972,7 @@ static int check_path(struct btree_trans *trans, } fsck_err: if (ret) - bch_err(c, "%s: err %i", __func__, ret); + bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); return ret; } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 873cc14e2ae9..00d9e3a8e526 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "errcode.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -741,15 +742,17 @@ int bch2_journal_reclaim_start(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct task_struct *p; + int ret; if (j->reclaim_thread) return 0; p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); - if (IS_ERR(p)) { - bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); - return PTR_ERR(p); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(p); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index be89628702f7..baeca0e2a302 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "errcode.h" #include "extents.h" #include "io.h" #include "journal.h" @@ -151,7 +152,8 @@ retry: } if (ret) { - bch_err(c, "Error updating btree node key: %i", ret); + bch_err(c, "Error updating btree node key: %s", + bch2_err_str(ret)); break; } next: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8b44d95c32ce..7fba0f70c409 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -8,6 +8,7 @@ #include "btree_update_interior.h" #include "disk_groups.h" #include "ec.h" +#include "errcode.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -564,7 +565,7 @@ next: bch2_trans_exit(&trans); if (ret) - bch_err(c, "error %i in bch2_move_btree", ret); + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_btree_interior_updates_flush(c); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 49fb405c1430..438ea22ad5bd 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -13,6 +13,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "errcode.h" #include "error.h" #include "extents.h" #include "eytzinger.h" @@ -319,7 +320,7 @@ static int bch2_copygc(struct bch_fs *c) false, copygc_pred, NULL); if (ret < 0) - bch_err(c, "error %i from bch2_move_data() in copygc", ret); + bch_err(c, 
"error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); if (ret) return ret; @@ -427,6 +428,7 @@ void bch2_copygc_stop(struct bch_fs *c) int bch2_copygc_start(struct bch_fs *c) { struct task_struct *t; + int ret; if (c->copygc_thread) return 0; @@ -438,9 +440,10 @@ int bch2_copygc_start(struct bch_fs *c) return -ENOMEM; t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - if (IS_ERR(t)) { - bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); - return PTR_ERR(t); + ret = PTR_ERR_OR_ZERO(t); + if (ret) { + bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(t); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 42c831da70be..454c76e03be9 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" +#include "errcode.h" #include "inode.h" #include "quota.h" #include "subvolume.h" @@ -488,7 +489,7 @@ int bch2_fs_quota_read(struct bch_fs *c) POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); if (ret) - bch_err(c, "err in quota_read: %i", ret); + bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 1de8183ea295..6b9ccc1b3fe3 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "errcode.h" #include "extents.h" #include "io.h" #include "move.h" @@ -332,6 +333,7 @@ void bch2_rebalance_stop(struct bch_fs *c) int bch2_rebalance_start(struct bch_fs *c) { struct task_struct *p; + int ret; if (c->rebalance.thread) return 0; @@ -340,9 +342,10 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - if (IS_ERR(p)) { - bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); - return PTR_ERR(p); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); + return ret; } get_task_struct(p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7fb470e2e7f3..bb04b6f053cc 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "dirent.h" #include "ec.h" +#include "errcode.h" #include "error.h" #include "fs-common.h" #include "fsck.h" @@ -1419,9 +1420,9 @@ out: } if (ret) - bch_err(c, "Error in recovery: %s (%i)", err, ret); + bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); else - bch_verbose(c, "ret %i", ret); + bch_verbose(c, "ret %s", bch2_err_str(ret)); return ret; err: fsck_err: diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 76be8735c700..0469b90064eb 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "errcode.h" #include "error.h" #include "fs.h" #include "subvolume.h" @@ -315,8 +316,8 @@ static int check_subvol(struct btree_trans *trans, if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ret = bch2_subvolume_delete(trans, iter->pos.offset); if (ret && ret != -EINTR) - bch_err(trans->c, "error deleting subvolume %llu: %i", - iter->pos.offset, ret); + bch_err(trans->c, "error deleting subvolume %llu: %s", + iter->pos.offset, bch2_err_str(ret)); if (ret) return ret; } @@ -365,7 +366,7 @@ int 
bch2_fs_snapshots_start(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error starting snapshots: %i", ret); + bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); return ret; } @@ -647,7 +648,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %i", ret); + bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); return ret; } } @@ -663,7 +664,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) NULL, NULL, 0, bch2_delete_redundant_snapshot(&trans, &iter, k)); if (ret) { - bch_err(c, "error deleting redundant snapshots: %i", ret); + bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); goto err; } @@ -671,7 +672,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) POS_MIN, 0, k, bch2_snapshot_set_equiv(&trans, k)); if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %i", ret); + bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); goto err; } @@ -690,7 +691,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) bch2_trans_iter_exit(&trans, &iter); if (ret) { - bch_err(c, "error walking snapshots: %i", ret); + bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); goto err; } @@ -710,7 +711,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_exit(&equiv_seen); if (ret) { - bch_err(c, "error deleting snapshot keys: %i", ret); + bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); goto err; } } @@ -719,8 +720,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, 0, bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { - bch_err(c, "error deleting snapshot %u: %i", - deleted.data[i], ret); + bch_err(c, "error deleting snapshot %u: %s", + deleted.data[i], bch2_err_str(ret)); goto err; } } @@ -912,7 +913,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_subvolume_delete(&trans, *id)); if (ret) { - bch_err(c, "error %i deleting subvolume %u", ret, *id); + bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); break; } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b926fb1b14a9..87742962d6c2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -25,6 +25,7 @@ #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "errcode.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -1430,7 +1431,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) - bch_err(c, "error %i removing dev alloc info", ret); + bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); return ret; } @@ -1458,7 +1459,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed: error %i dropping data", ret); + bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); goto err; } @@ -1470,7 +1471,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed: error %i flushing journal", ret); + bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); goto err; } @@ -1482,7 +1483,7 
@@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_replicas_gc2(c); if (ret) { - bch_err(ca, "Remove failed: error %i from replicas gc", ret); + bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); goto err; } @@ -1546,7 +1547,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_read_super(path, &opts, &sb); if (ret) { - bch_err(c, "device add error: error reading super: %i", ret); + bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); goto err; } @@ -1639,13 +1640,13 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "device add error: error marking new superblock: %i", ret); + bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); goto err_late; } ret = bch2_fs_freespace_init(c); if (ret) { - bch_err(c, "device add error: error initializing free space: %i", ret); + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); goto err_late; } @@ -1707,8 +1708,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", - path, ret); + bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", + path, bch2_err_str(ret)); goto err; } @@ -1777,7 +1778,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = bch2_dev_buckets_resize(c, ca, nbuckets); if (ret) { - bch_err(ca, "Resize error: %i", ret); + bch_err(ca, "Resize error: %s", bch2_err_str(ret)); goto err; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bfcb133ff483..bf0a33c0233d 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -46,7 +46,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete: %i", ret); + bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); goto err; } @@ -55,7 +55,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (first) in test_delete: %i", ret); + bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); goto err; } @@ -64,7 +64,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (second) in test_delete: %i", ret); + bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); goto err; } err: @@ -92,7 +92,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete_written: %i", ret); + bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); goto err; } @@ -103,7 +103,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error in test_delete_written: %i", ret); + bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); goto err; } err: @@ -136,7 +136,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate: %i", ret); + bch_err(c, "insert error in 
test_iterate: %s", bch2_err_str(ret)); goto err; } } @@ -192,7 +192,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_extents: %i", ret); + bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); goto err; } } @@ -247,7 +247,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_slots: %i", ret); + bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); goto err; } } @@ -313,7 +313,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); + bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); goto err; } } @@ -419,7 +419,7 @@ static int insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) - bch_err(c, "insert error in insert_test_extent: %i", ret); + bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); return ret; } @@ -518,7 +518,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) ret = test_snapshot_filter(c, snapids[0], snapids[1]); if (ret) { - bch_err(c, "err %i from test_snapshot_filter", ret); + bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); return ret; } @@ -555,7 +555,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); if (ret) { - bch_err(c, "error in rand_insert: %i", ret); + bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); break; } } @@ -591,7 +591,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); if (ret) { - bch_err(c, "error in rand_insert_multi: %i", ret); + bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); break; } } @@ -618,7 +618,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) { - bch_err(c, "error in rand_lookup: %i", ret); + bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); break; } } @@ -641,7 +641,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret && ret != -EINTR) - bch_err(trans->c, "lookup error in rand_mixed: %i", ret); + bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); if (ret) return ret; @@ -671,7 +671,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "update error in rand_mixed: %i", ret); + bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); break; } } @@ -717,7 +717,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); if (ret) { - bch_err(c, "error in rand_delete: %i", ret); + bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); break; } } @@ -747,7 +747,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &insert.k_i, 0)); if (ret) { - bch_err(c, "error in seq_insert: %i", ret); + bch_err(c, "error 
in seq_insert: %s", bch2_err_str(ret)); break; } @@ -798,7 +798,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &u.k_i, 0)); if (ret) { - bch_err(c, "error in seq_overwrite: %i", ret); + bch_err(c, "error in seq_overwrite: %s", bch2_err_str(ret)); break; } } @@ -816,7 +816,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) SPOS(0, 0, U32_MAX), SPOS_MAX, 0, NULL); if (ret) - bch_err(c, "error in seq_delete: %i", ret); + bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); return ret; } @@ -853,7 +853,7 @@ static int btree_perf_test_thread(void *data) ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); if (ret) { - bch_err(j->c, "%ps: error %i", j->fn, ret); + bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); j->ret = ret; } -- cgit From 90cecb921cfe95858a32995019f11c20b6339607 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Jul 2022 14:51:52 -0400 Subject: bcachefs: Prevent a btree iter overflow in alloc path In bch2_bucket_alloc_trans(), we're iterating over buckets - but not directly with an iterator, since we're iterating over the freespace btree. This means that we need to clear iter->path->preserve, otherwise we'll end up retaining a btree_path for every alloc key we touched - which is not what we want here. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 39e3bb5205ca..a9f893361c73 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -351,6 +351,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (!ob) iter.path->preserve = false; err: + set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; -- cgit From 0990efaeeab14de1e3e3bf2791808afebadd1cc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 5 Jul 2022 17:27:44 -0400 Subject: bcachefs: btree_trans_too_many_iters() is now a transaction restart All transaction restarts need a tracepoint - this is essential for debugging Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 +++++++-- fs/bcachefs/trace.h | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c2f5afc9eeb9..1952a7683610 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -4,6 +4,7 @@ #include "bset.h" #include "btree_types.h" +#include "trace.h" static inline void __btree_path_get(struct btree_path *path, bool intent) { @@ -384,8 +385,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 - ? 
-EINTR : 0; + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) { + trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); + return btree_trans_restart(trans); + } + + return 0; } static inline struct bkey_s_c diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 65c38aa38359..a52da91f279e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -755,6 +755,12 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_ARGS(trans_fn, caller_ip) ); +DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + DECLARE_EVENT_CLASS(transaction_restart_iter, TP_PROTO(const char *trans_fn, unsigned long caller_ip, -- cgit From 549d173c1bd9b58c2ad41217522462e012a6545f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 23:06:38 -0400 Subject: bcachefs: EINTR -> BCH_ERR_transaction_restart Now that we have error codes, with subtypes, we can switch to our own error code for transaction restarts - and even better, a distinct error code for each transaction restart reason: clearer code and better debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 4 +- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/alloc_foreground.c | 17 +++--- fs/bcachefs/btree_cache.c | 54 +++++++++++-------- fs/bcachefs/btree_iter.c | 105 ++++++++++++++++++------------------ fs/bcachefs/btree_iter.h | 54 ++++++++++++------- fs/bcachefs/btree_key_cache.c | 41 +++++++------- fs/bcachefs/btree_locking.h | 38 ++++++------- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update.h | 1 - fs/bcachefs/btree_update_interior.c | 25 +++++---- fs/bcachefs/btree_update_leaf.c | 69 ++++++++++-------------- fs/bcachefs/data_update.c | 4 +- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/ec.c | 10 ++-- fs/bcachefs/errcode.h | 25 ++++++++- fs/bcachefs/fs-io.c | 22 ++++---- fs/bcachefs/fs.c | 10 ++-- fs/bcachefs/fsck.c | 39 +++++++------- fs/bcachefs/inode.c | 4 +- fs/bcachefs/io.c | 25 +++++---- fs/bcachefs/journal_seq_blacklist.c | 2 +- fs/bcachefs/migrate.c | 6 +-- fs/bcachefs/move.c | 10 ++-- fs/bcachefs/reflink.c | 5 +- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/tests.c | 2 +- fs/bcachefs/xattr.c | 2 +- 28 files changed, 314 insertions(+), 270 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 00cd40a8d7fa..7edebeed779e 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -234,7 +234,7 @@ retry: &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret != -ENOENT) acl = ERR_PTR(ret); @@ -334,7 +334,7 @@ retry: btree_err: bch2_trans_iter_exit(&trans, &inode_iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index eb44a8bc04fe..15c3c9a2da7b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -995,7 +995,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, GFP_KERNEL); *discard_pos_done = iter.pos; - ret = bch2_trans_relock(trans) ? 
0 : -EINTR; + ret = bch2_trans_relock(trans); if (ret) goto out; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a9f893361c73..99fbf1d2dee5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -470,8 +470,9 @@ again: for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); alloc_cursor < k.k->p.offset; alloc_cursor++) { - if (btree_trans_too_many_iters(trans)) { - ob = ERR_PTR(-EINTR); + ret = btree_trans_too_many_iters(trans); + if (ret) { + ob = ERR_PTR(ret); break; } @@ -488,7 +489,8 @@ again: break; } } - if (ob) + + if (ob || ret) break; } bch2_trans_iter_exit(trans, &iter); @@ -738,7 +740,7 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, ret = PTR_ERR_OR_ZERO(ob); if (ret) { - if (ret == -EINTR || cl) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) break; continue; } @@ -925,7 +927,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == -EINTR || + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; @@ -949,7 +951,7 @@ retry_blocking: nr_replicas, nr_effective, have_cache, reserve, flags, cl); if (ret && - ret != -EINTR && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && !cl && _cl) { cl = _cl; @@ -1191,7 +1193,8 @@ retry: nr_replicas, &nr_effective, &have_cache, reserve, ob_flags, NULL); - if (!ret || ret == -EINTR) + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1f80f08a69b2..4032c27fcc9c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -7,6 +7,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "errcode.h" #include "error.h" #include "trace.h" @@ -692,8 +693,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { trace_trans_restart_relock_parent_for_fill(trans->fn, _THIS_IP_, btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } b = bch2_btree_node_mem_alloc(c, level != 0); @@ -702,8 +702,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, trans->memory_allocation_failure = true; trace_trans_restart_memory_allocation_failure(trans->fn, _THIS_IP_, btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); } if (IS_ERR(b)) @@ -740,18 +740,19 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!sync) return NULL; - if (trans && - (!bch2_trans_relock(trans) || - !bch2_btree_path_relock_intent(trans, path))) { - BUG_ON(!trans->restarted); - return ERR_PTR(-EINTR); + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } } if (!six_relock_type(&b->c.lock, lock_type, seq)) { trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + 
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } return b; @@ -762,7 +763,9 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) struct btree *b = container_of(lock, struct btree, c.lock); const struct bkey_i *k = p; - return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; + if (b->hash_val != btree_ptr_hash_val(k)) + return BCH_ERR_lock_fail_node_reused; + return 0; } static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) @@ -821,6 +824,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -885,11 +889,14 @@ lock_node: if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, - lock_node_check_fn, (void *) k, trace_ip)) { - if (!trans->restarted) + ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, + lock_node_check_fn, (void *) k, trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) goto retry; - return ERR_PTR(-EINTR); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + BUG(); } if (unlikely(b->hash_val != btree_ptr_hash_val(k) || @@ -903,8 +910,7 @@ lock_node: trace_ip, path->btree_id, &path->pos); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } } @@ -920,11 +926,13 @@ lock_node: * should_be_locked is not set on this path yet, so we need to * relock it specifically: */ - if (trans && - (!bch2_trans_relock(trans) || - !bch2_btree_path_relock_intent(trans, path))) { - BUG_ON(!trans->restarted); - return ERR_PTR(-EINTR); + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } } if (!six_relock_type(&b->c.lock, lock_type, seq)) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 45ecd196bceb..db247c96298f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -34,7 +34,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) if (need_resched() || race_fault()) { bch2_trans_unlock(trans); schedule(); - return bch2_trans_relock(trans) ? 
0 : -EINTR; + return bch2_trans_relock(trans); } else { return 0; } @@ -285,13 +285,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, } /* Slowpath: */ -bool __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) +int __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_path *linked; unsigned reason; @@ -369,8 +369,7 @@ deadlock: path->btree_id, path->cached, &pos); - btree_trans_restart(trans); - return false; + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } /* Btree iterator locking: */ @@ -408,8 +407,8 @@ static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} /* * Only for btree_cache.c - only relocks intent locks */ -bool bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) { unsigned l; @@ -421,16 +420,15 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, path->btree_id, &path->pos); - btree_trans_restart(trans); - return false; + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } } - return true; + return 0; } noinline __flatten -static bool __bch2_btree_path_relock(struct btree_trans *trans, +static int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { bool ret = btree_path_get_locks(trans, path, false); @@ -438,16 +436,17 @@ static bool __bch2_btree_path_relock(struct btree_trans *trans, if (!ret) { trace_trans_restart_relock_path(trans->fn, trace_ip, path->btree_id, &path->pos); - btree_trans_restart(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } - return ret; + + return 0; } -static inline bool bch2_btree_path_relock(struct btree_trans *trans, +static inline int bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { return btree_node_locked(path, path->level) - ? true + ? 0 : __bch2_btree_path_relock(trans, path, trace_ip); } @@ -532,22 +531,22 @@ void bch2_trans_downgrade(struct btree_trans *trans) /* Btree transaction locking: */ -bool bch2_trans_relock(struct btree_trans *trans) +int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; if (unlikely(trans->restarted)) - return false; + return -BCH_ERR_transaction_restart_relock; trans_for_each_path(trans, path) if (path->should_be_locked && - !bch2_btree_path_relock(trans, path, _RET_IP_)) { + bch2_btree_path_relock(trans, path, _RET_IP_)) { trace_trans_restart_relock(trans->fn, _RET_IP_, path->btree_id, &path->pos); BUG_ON(!trans->restarted); - return false; + return -BCH_ERR_transaction_restart_relock; } - return true; + return 0; } void bch2_trans_unlock(struct btree_trans *trans) @@ -1187,7 +1186,9 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) struct btree *b = container_of(lock, struct btree, c.lock); struct btree **rootp = p; - return b == *rootp ? 
0 : -1; + if (b != *rootp) + return BCH_ERR_lock_fail_root_changed; + return 0; } static inline int btree_path_lock_root(struct btree_trans *trans, @@ -1199,6 +1200,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; + int ret; EBUG_ON(path->nodes_locked); @@ -1220,13 +1222,16 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, - path->level, lock_type, - lock_root_check_fn, rootp, - trace_ip))) { - if (trans->restarted) - return -EINTR; - continue; + ret = btree_node_lock(trans, path, b, SPOS_MAX, + path->level, lock_type, + lock_root_check_fn, rootp, + trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); } if (likely(b == READ_ONCE(*rootp) && @@ -1431,12 +1436,12 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) int i, ret = 0; if (trans->in_traverse_all) - return -EINTR; + return -BCH_ERR_transaction_restart_in_traverse_all; trans->in_traverse_all = true; retry_all: prev = NULL; - trans->restarted = false; + trans->restarted = 0; trans_for_each_path(trans, path) path->should_be_locked = false; @@ -1480,7 +1485,8 @@ retry_all: */ if (path->uptodate) { ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret == -EINTR || ret == -ENOMEM) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == -ENOMEM) goto retry_all; if (ret) goto err; @@ -1587,19 +1593,17 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned long trace_ip) { unsigned depth_want = path->level; - int ret = 0; + int ret = trans->restarted; - if (unlikely(trans->restarted)) { - ret = -EINTR; + if (unlikely(ret)) goto out; - } /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: */ if (path->should_be_locked) { - ret = bch2_btree_path_relock(trans, path, trace_ip) ? 
0 : -EINTR; + ret = bch2_btree_path_relock(trans, path, trace_ip); goto out; } @@ -1648,7 +1652,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON((ret == -EINTR) != !!trans->restarted); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_btree_path_verify(trans, path); return ret; } @@ -2135,8 +2139,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, path->btree_id, &path->pos); - btree_trans_restart(trans); - ret = -EINTR; + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } @@ -2517,8 +2520,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e BUG_ON(!iter->path->nodes_locked); out: if (iter->update_path) { - if (unlikely(!bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { - k = bkey_s_c_err(-EINTR); + ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); } else { BUG_ON(!(iter->update_path->nodes_locked & 1)); iter->update_path->should_be_locked = true; @@ -3169,8 +3173,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (old_bytes) { trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); - btree_trans_restart(trans); - return ERR_PTR(-EINTR); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } } @@ -3184,9 +3187,9 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset * - * While iterating over nodes or updating nodes a attempt to lock a btree - * node may return EINTR when the trylock fails. When this occurs - * bch2_trans_begin() should be called and the transaction retried. + * While iterating over nodes or updating nodes a attempt to lock a btree node + * may return BCH_ERR_transaction_restart when the trylock fails. When this + * occurs bch2_trans_begin() should be called and the transaction retried. */ u32 bch2_trans_begin(struct btree_trans *trans) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1952a7683610..79339a6abcd7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -197,27 +197,36 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); +int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); void bch2_path_put(struct btree_trans *, struct btree_path *, bool); -bool bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); -static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) +static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) { - return restart_count != trans->restart_count ? 
-EINTR : 0; + return restart_count != trans->restart_count; } void bch2_trans_verify_not_restarted(struct btree_trans *, u32); __always_inline -static inline int btree_trans_restart(struct btree_trans *trans) +static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { - trans->restarted = true; + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); + + trans->restarted = err; trans->restart_count++; - bch2_trans_unlock(trans); - return -EINTR; + return -err; +} + +__always_inline +static inline int btree_trans_restart(struct btree_trans *trans, int err) +{ + btree_trans_restart_nounlock(trans, err); + return -err; } bool bch2_btree_node_upgrade(struct btree_trans *, @@ -338,7 +347,7 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter struct btree *b; while (b = bch2_btree_iter_peek_node(iter), - PTR_ERR_OR_ZERO(b) == -EINTR) + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) bch2_trans_begin(trans); return b; @@ -387,7 +396,7 @@ static inline int btree_trans_too_many_iters(struct btree_trans *trans) { if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) { trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); - return btree_trans_restart(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } return 0; @@ -401,7 +410,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, while (btree_trans_too_many_iters(trans) || (k = bch2_btree_iter_peek_type(iter, flags), - bkey_err(k) == -EINTR)) + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) bch2_trans_begin(trans); return k; @@ -414,7 +423,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, do { \ bch2_trans_begin(_trans); \ _ret = (_do); \ - } while (_ret == -EINTR); \ + } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ \ _ret; \ }) @@ -425,7 +434,8 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, * These are like lockrestart_do() and commit_do(), with two differences: * * - We don't call bch2_trans_begin() unless we had a transaction restart - * - We return -EINTR if we succeeded after a transaction restart + * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a + * transaction restart */ #define nested_lockrestart_do(_trans, _do) \ ({ \ @@ -434,13 +444,16 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, \ _restart_count = _orig_restart_count = (_trans)->restart_count; \ \ - while ((_ret = (_do)) == -EINTR) \ + while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ _restart_count = bch2_trans_begin(_trans); \ \ if (!_ret) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret ?: trans_was_restarted(_trans, _orig_restart_count); \ + if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ + _ret = -BCH_ERR_transaction_restart_nested; \ + \ + _ret; \ }) #define for_each_btree_key2(_trans, _iter, _btree_id, \ @@ -451,7 +464,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - do { \ + while (1) { \ bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ if (!(_k).k) { \ @@ -460,9 +473,12 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, } \ \ _ret = bkey_err(_k) ?: (_do); \ - if (!_ret) \ - bch2_btree_iter_advance(&(_iter)); \ - } while (_ret == 0 || _ret == -EINTR); \ + if (bch2_err_matches(_ret, 
BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_btree_iter_advance(&(_iter)); \ + } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret; \ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index e5a29240bbcc..549abe607b53 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -5,6 +5,7 @@ #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" +#include "errcode.h" #include "error.h" #include "journal.h" #include "journal_reclaim.h" @@ -292,7 +293,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_trans_restart_relock_key_cache_fill(trans->fn, _THIS_IP_, ck_path->btree_id, &ck_path->pos); - ret = btree_trans_restart(trans); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); goto err; } @@ -347,8 +348,10 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); const struct btree_path *path = p; - return ck->key.btree_id == path->btree_id && - !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; + if (ck->key.btree_id != path->btree_id && + bpos_cmp(ck->key.pos, path->pos)) + return BCH_ERR_lock_fail_node_reused; + return 0; } __flatten @@ -387,14 +390,15 @@ retry: } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, - lock_want, - bkey_cached_check_fn, path, _THIS_IP_)) { - if (!trans->restarted) + ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, + lock_want, + bkey_cached_check_fn, path, _THIS_IP_); + if (ret) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) goto retry; - - ret = -EINTR; - goto err; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + BUG(); } if (ck->key.btree_id != path->btree_id || @@ -413,7 +417,7 @@ fill: if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { trace_transaction_restart_ip(trans->fn, _THIS_IP_); - ret = btree_trans_restart(trans); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); goto err; } @@ -430,7 +434,7 @@ fill: return ret; err: - if (ret != -EINTR) { + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); path->l[0].b = BTREE_ITER_NO_NODE_ERROR; } @@ -497,13 +501,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, ? 
JOURNAL_WATERMARK_reserved : 0)| commit_flags); - if (ret) { - bch2_fs_fatal_err_on(ret != -EINTR && - ret != -EAGAIN && - !bch2_journal_error(j), c, - "error flushing key cache: %i", ret); + + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "error flushing key cache: %s", bch2_err_str(ret)); + if (ret) goto out; - } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b8708466c4e3..33a69e27c39e 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -152,7 +152,7 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) } } -static inline bool btree_node_lock_type(struct btree_trans *trans, +static inline int btree_node_lock_type(struct btree_trans *trans, struct btree_path *path, struct btree *b, struct bpos pos, unsigned level, @@ -161,10 +161,10 @@ static inline bool btree_node_lock_type(struct btree_trans *trans, { struct bch_fs *c = trans->c; u64 start_time; - bool ret; + int ret; if (six_trylock_type(&b->c.lock, type)) - return true; + return 0; start_time = local_clock(); @@ -174,13 +174,14 @@ static inline bool btree_node_lock_type(struct btree_trans *trans, trans->locking_level = level; trans->locking_lock_type = type; trans->locking = b; - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); trans->locking = NULL; if (ret) - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + return ret; - return ret; + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + return 0; } /* @@ -203,33 +204,34 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree *, struct bpos, unsigned, - enum six_lock_type, - six_lock_should_sleep_fn, void *, - unsigned long); +int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, + struct btree *, struct bpos, unsigned, + enum six_lock_type, + six_lock_should_sleep_fn, void *, + unsigned long); -static inline bool btree_node_lock(struct btree_trans *trans, +static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { + int ret = 0; + EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->c.lock, type)) || - btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(trans, path, b, pos, level, type, - should_sleep_fn, p, ip)) { + btree_node_lock_increment(trans, b, level, type) || + !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, + should_sleep_fn, p, ip))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->c.level].lock_taken_time = ktime_get_ns(); #endif - return true; - } else { - return false; } + + return ret; } bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0650a3558182..bc1571fc2f1f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -405,11 +405,11 @@ struct btree_trans { u8 nr_updates; bool used_mempool:1; bool 
in_traverse_all:1; - bool restarted:1; bool paths_sorted:1; bool memory_allocation_failure:1; bool journal_transaction_names:1; bool journal_replay_not_finished:1; + enum bch_errcode restarted:16; u32 restart_count; unsigned long last_restarted_ip; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9b5a8b18b01b..89941fb8caa0 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -90,7 +90,6 @@ int bch2_trans_log_msg(struct btree_trans *, const char *); * This is main entry point for btree updates. * * Return values: - * -EINTR: locking changed, this function should be called again. * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9f9ab85ec6b8..cf02e814c579 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -996,7 +996,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, path->btree_id, &path->pos); - ret = btree_trans_restart(trans); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); return ERR_PTR(ret); } @@ -1005,9 +1005,10 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, else if (!down_read_trylock(&c->gc_lock)) { bch2_trans_unlock(trans); down_read(&c->gc_lock); - if (!bch2_trans_relock(trans)) { + ret = bch2_trans_relock(trans); + if (ret) { up_read(&c->gc_lock); - return ERR_PTR(-EINTR); + return ERR_PTR(ret); } } @@ -1053,7 +1054,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, journal_flags); if (ret) { trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); - btree_trans_restart(trans); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); goto err; } @@ -1090,10 +1091,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, goto err; } - if (!bch2_trans_relock(trans)) { - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) goto err; - } return as; err: @@ -2030,10 +2030,8 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite int ret = 0; if (!btree_node_intent_locked(path, b->c.level) && - !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { - btree_trans_restart(trans); - return -EINTR; - } + !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); closure_init_stack(&cl); @@ -2046,8 +2044,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); - if (!bch2_trans_relock(trans)) - return -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + return ret; } new_hash = bch2_btree_node_mem_alloc(c, false); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c6fe24f424de..541826df50d9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -10,6 +10,7 @@ #include "btree_locking.h" #include "buckets.h" #include "debug.h" +#include "errcode.h" #include "error.h" #include "extent_update.h" #include "journal.h" @@ -282,9 +283,10 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, if (ret) return ret; - if (!bch2_trans_relock(trans)) { + ret = bch2_trans_relock(trans); + if (ret) { trace_trans_restart_journal_preres_get(trans->fn, 
trace_ip); - return -EINTR; + return ret; } return 0; @@ -376,12 +378,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, path->btree_id, &path->pos, old_u64s, new_u64s); - /* - * Not using btree_trans_restart() because we can't unlock here, we have - * write locks held: - */ - trans->restarted = true; - return -EINTR; + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); } /* Triggers: */ @@ -573,8 +570,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (race_fault()) { trace_trans_restart_fault_inject(trans->fn, trace_ip); - trans->restarted = true; - return -EINTR; + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); } /* @@ -812,6 +808,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; + int ret; trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) @@ -821,10 +818,11 @@ static inline int trans_lock_write(struct btree_trans *trans) if (have_conflicting_read_lock(trans, i->path)) goto fail; - btree_node_lock_type(trans, i->path, + ret = btree_node_lock_type(trans, i->path, insert_l(i)->b, i->path->pos, i->level, SIX_LOCK_write, NULL, NULL); + BUG_ON(ret); } bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -840,7 +838,7 @@ fail: } trace_trans_restart_would_deadlock_write(trans->fn); - return btree_trans_restart(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -971,10 +969,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); - if (!ret) - return 0; - - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) trace_trans_restart_btree_node_split(trans->fn, trace_ip, i->btree_id, &i->path->pos); break; @@ -985,19 +980,16 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_mark_replicas(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_trans_restart_mark_replicas(trans->fn, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && !(trans->flags & JOURNAL_WATERMARK_reserved)) { - trans->restarted = true; - ret = -EAGAIN; + ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } @@ -1005,11 +997,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_journal_res_get(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_trans_restart_journal_res_get(trans->fn, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); @@ -1021,18 +1011,16 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret < 0) break; - if (bch2_trans_relock(trans)) - return 0; - - trace_trans_restart_journal_reclaim(trans->fn, trace_ip); - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) + trace_trans_restart_journal_reclaim(trans->fn, trace_ip); break; default: BUG_ON(ret >= 0); break; } - BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); + 
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); BUG_ON(ret == -ENOSPC && !(trans->flags & BTREE_INSERT_NOWAIT) && (trans->flags & BTREE_INSERT_NOFAIL)); @@ -1052,13 +1040,11 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) bch2_trans_unlock(trans); - ret = bch2_fs_read_write_early(c); + ret = bch2_fs_read_write_early(c) ?: + bch2_trans_relock(trans); if (ret) return ret; - if (!bch2_trans_relock(trans)) - return -EINTR; - percpu_ref_get(&c->writes); return 0; } @@ -1132,7 +1118,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { trace_trans_restart_upgrade(trans->fn, _RET_IP_, i->btree_id, &i->path->pos); - ret = btree_trans_restart(trans); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); goto out; } @@ -1654,8 +1640,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); - btree_trans_restart(trans); - return -EINTR; + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } iter->key_cache_path->should_be_locked = true; @@ -1783,7 +1768,7 @@ retry: break; } - if (ret == -EINTR) { + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; goto retry; } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6726bd6b9b07..c0d6a48d3c72 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -236,7 +236,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_ob_add_backpointer(c, ec_ob, &insert->k); } err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; if (ret) break; @@ -264,7 +264,7 @@ out: bch2_trans_exit(&trans); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); - BUG_ON(ret == -EINTR); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 0cbb765cde54..4d942d224a08 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -471,7 +471,7 @@ retry: ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, name, inum, 0); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret) bch2_trans_iter_exit(&trans, &iter); @@ -556,7 +556,7 @@ retry: } bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 947f2f2b1c09..f33acf1af110 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -572,18 +572,14 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { size_t idx = iter->pos.offset; - int ret = 0; if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) - return ret; + return 0; bch2_trans_unlock(trans); - ret = -EINTR; - if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) - return ret; - - return -ENOMEM; + return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: + bch2_trans_relock(trans); } static ssize_t stripe_idx_to_delete(struct bch_fs *c) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 69cc7cdd1c06..7972b018d2d0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -7,7 +7,30 @@ x(0, freelist_empty) \ x(freelist_empty, no_buckets_found) \ x(0, insufficient_devices) \ - x(0, need_snapshot_cleanup) + x(0, 
need_snapshot_cleanup) \ + x(0, transaction_restart) \ + x(transaction_restart, transaction_restart_fault_inject) \ + x(transaction_restart, transaction_restart_relock) \ + x(transaction_restart, transaction_restart_relock_path) \ + x(transaction_restart, transaction_restart_relock_path_intent) \ + x(transaction_restart, transaction_restart_relock_after_fill) \ + x(transaction_restart, transaction_restart_too_many_iters) \ + x(transaction_restart, transaction_restart_lock_node_reused) \ + x(transaction_restart, transaction_restart_fill_relock) \ + x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(transaction_restart, transaction_restart_mem_realloced) \ + x(transaction_restart, transaction_restart_in_traverse_all) \ + x(transaction_restart, transaction_restart_would_deadlock) \ + x(transaction_restart, transaction_restart_would_deadlock_write)\ + x(transaction_restart, transaction_restart_upgrade) \ + x(transaction_restart, transaction_restart_key_cache_fill) \ + x(transaction_restart, transaction_restart_key_cache_raced) \ + x(transaction_restart, transaction_restart_key_cache_realloced)\ + x(transaction_restart, transaction_restart_journal_preres_get) \ + x(transaction_restart, transaction_restart_nested) \ + x(0, lock_fail_node_reused) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c0dda29dabb4..9f1ecb8d7b3b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -408,7 +408,7 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -1018,10 +1018,9 @@ retry: * read_extent -> io_time_reset may cause a transaction restart * without returning an error, we need to check for that here: */ - if (!bch2_trans_relock(trans)) { - ret = -EINTR; + ret = bch2_trans_relock(trans); + if (ret) break; - } bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -1074,7 +1073,7 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret) { @@ -2035,7 +2034,7 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (err == -EINTR) + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -2427,7 +2426,7 @@ retry: start = iter.pos; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -2817,7 +2816,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bch2_trans_copy_iter(&dst, &src); bch2_trans_copy_iter(&del, &src); - while (ret == 0 || ret == -EINTR) { + while (ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -3019,7 +3019,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bkey_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } @@ -3301,7 +3301,7 @@ retry: } bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -3416,7 +3416,7 @@ retry: } 
bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 876552a2a83b..af4941862187 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -154,7 +154,7 @@ retry: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); @@ -324,7 +324,7 @@ retry: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); err_before_quota: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; goto err_trans; } @@ -755,7 +755,7 @@ retry: btree_err: bch2_trans_iter_exit(&trans, &inode_iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; @@ -987,7 +987,7 @@ retry: start = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret && have_extent) @@ -1337,7 +1337,7 @@ found: memcpy(name, d.v->d_name, name_len); name[name_len] = '\0'; err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter1); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 021affcc82d4..29d731a12436 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -136,7 +136,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(trans->c, "error fetching inode %llu: %s", inode_nr, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); @@ -164,7 +164,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(trans->c, "error fetching inode %llu:%u: %s", inode_nr, *snapshot, bch2_err_str(ret)); bch2_trans_iter_exit(trans, &iter); @@ -287,7 +287,7 @@ retry: BTREE_INSERT_NOFAIL); err: bch2_trans_iter_exit(trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; return ret; @@ -314,7 +314,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -350,7 +350,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, goto create_lostfound; } - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); if (ret) return ret; @@ -373,7 +373,7 @@ create_lostfound: lostfound, &lostfound_str, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); return ret; } @@ -843,10 +843,10 @@ bad_hash: ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret) { - bch_err(c, "hash_redo_key err %i", ret); + bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); return ret; } - ret = -EINTR; + ret = 
-BCH_ERR_transaction_restart_nested; fsck_err: goto out; } @@ -1144,7 +1144,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; - ret2 = -EINTR; + ret2 = -BCH_ERR_transaction_restart_nested; } fsck_err: if (ret) @@ -1191,7 +1191,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, * it shouldn't be but we need to fix the new i_sectors check * code and delete the old bch2_count_inode_sectors() first */ - return -EINTR; + return -BCH_ERR_transaction_restart_nested; } #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -1202,7 +1202,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&PBUF(buf2), c, k); if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + ret = fix_overlapping_extent(trans, k, prev.k->k.p) + ?: -BCH_ERR_transaction_restart_nested; goto out; } } @@ -1287,8 +1288,8 @@ err: fsck_err: printbuf_exit(&buf); - if (ret && ret != -EINTR) - bch_err(c, "error %i from check_extent()", ret); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); return ret; } @@ -1364,7 +1365,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; - ret2 = -EINTR; + ret2 = -BCH_ERR_transaction_restart_nested; } } fsck_err: @@ -1487,7 +1488,7 @@ err: fsck_err: printbuf_exit(&buf); - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); return ret; } @@ -1530,7 +1531,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (!iter->path->should_be_locked) { /* hack: see check_extent() */ - return -EINTR; + return -BCH_ERR_transaction_restart_nested; } ret = __walk_inode(trans, dir, equiv); @@ -1660,7 +1661,7 @@ err: fsck_err: printbuf_exit(&buf); - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); return ret; } @@ -1735,7 +1736,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); return ret; } @@ -2016,8 +2017,6 @@ static int check_directory_structure(struct bch_fs *c) } bch2_trans_iter_exit(&trans, &iter); - BUG_ON(ret == -EINTR); - darray_exit(&path); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5de66d62028b..fc0f98074dab 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -619,7 +619,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } @@ -690,7 +690,7 @@ retry: BTREE_INSERT_NOFAIL); err: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f137a8e90f07..dfa708c0a7fc 100644 --- a/fs/bcachefs/io.c +++ 
b/fs/bcachefs/io.c @@ -390,7 +390,7 @@ err: } /* - * Returns -EINTR if we had to drop locks: + * Returns -BCH_ERR_transacton_restart if we had to drop locks: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, subvol_inum inum, u64 end, @@ -403,7 +403,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int ret = 0, ret2 = 0; u32 snapshot; - while (!ret || ret == -EINTR) { + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -462,7 +463,10 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return ret == -EINTR ? 0 : ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; } static int bch2_write_index_default(struct bch_write_op *op) @@ -493,7 +497,7 @@ static int bch2_write_index_default(struct bch_write_op *op) ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &sk.k->k.p.snapshot); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; @@ -508,7 +512,7 @@ static int bch2_write_index_default(struct bch_write_op *op) op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; @@ -663,7 +667,7 @@ static void __bch2_write_index(struct bch_write_op *op) ? bch2_write_index_default(op) : bch2_data_update_index_update(op); - BUG_ON(ret == -EINTR); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); @@ -2429,10 +2433,9 @@ retry: * read_extent -> io_time_reset may cause a transaction restart * without returning an error, we need to check for that here: */ - if (!bch2_trans_relock(&trans)) { - ret = -EINTR; + ret = bch2_trans_relock(&trans); + if (ret) break; - } bch2_btree_iter_set_pos(&iter, POS(inum.inum, bvec_iter.bi_sector)); @@ -2486,7 +2489,9 @@ retry: err: bch2_trans_iter_exit(&trans, &iter); - if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) goto retry; bch2_trans_exit(&trans); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index d9b4042a2e4a..5c555b3703c0 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -272,7 +272,7 @@ retry: !test_bit(BCH_FS_STOPPING, &c->flags)) b = bch2_btree_iter_next_node(&iter); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index baeca0e2a302..8b258d966d04 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -146,7 +146,7 @@ retry: } ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); - if (ret == -EINTR) { + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } @@ -159,7 +159,7 @@ retry: next: bch2_btree_iter_next_node(&iter); } - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); @@ -174,7 +174,7 @@ err: bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); - BUG_ON(ret == -EINTR); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return 
ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7fba0f70c409..ea9ce6d436a2 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -387,7 +387,7 @@ static int __bch2_move_data(struct moving_context *ctxt, break; ret = bkey_err(k); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; @@ -409,7 +409,7 @@ static int __bch2_move_data(struct moving_context *ctxt, ret = lookup_inode(&trans, SPOS(0, k.k->p.inode, k.k->p.snapshot), &inode); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (!ret) @@ -432,7 +432,7 @@ static int __bch2_move_data(struct moving_context *ctxt, ret2 = bch2_move_extent(&trans, ctxt, io_opts, btree_id, k, data_opts); if (ret2) { - if (ret2 == -EINTR) + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; if (ret2 == -ENOMEM) { @@ -546,14 +546,14 @@ retry: goto next; ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; next: bch2_btree_iter_next_node(&iter); } - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 2038e3502d8c..d5c14bb2992d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -299,7 +299,8 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while ((ret == 0 || ret == -EINTR) && + while ((ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) && bkey_cmp(dst_iter.pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; @@ -409,7 +410,7 @@ s64 bch2_remap_range(struct bch_fs *c, } bch2_trans_iter_exit(&trans, &inode_iter); - } while (ret2 == -EINTR); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&new_src, c); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0469b90064eb..b5b0f5e39f97 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -315,7 +315,7 @@ static int check_subvol(struct btree_trans *trans, if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(trans->c, "error deleting subvolume %llu: %s", iter->pos.offset, bch2_err_str(ret)); if (ret) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bf0a33c0233d..c6cac5c79c12 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -640,7 +640,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); - if (ret && ret != -EINTR) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); if (ret) return ret; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5df61b6b4a3c..37793b3357d3 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -344,7 +344,7 @@ retry: offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); err: - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_exit(&trans); -- cgit From a0cb8d784f309d22323974e47b103bf01d0b62c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 20:22:30 -0400 Subject: bcachefs: Inject transaction restarts in debug 
mode In CONFIG_BCACHEFS_DEBUG mode, we'll now randomly issue transaction restarts - with a decaying probability based on the number of restarts we've already had, to ensure that transactions eventually make forward progress. This should help shake out some bugs. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 +++++++++++ fs/bcachefs/trace.h | 6 ++++++ 2 files changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index db247c96298f..777e41a12246 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -17,6 +17,7 @@ #include "subvolume.h" #include "trace.h" +#include #include static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); @@ -1660,6 +1661,16 @@ out: int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); + u64 max = ~(~0ULL << restart_probability_bits); + + if (!get_random_u32_below(max)) { + trace_transaction_restart_injected(trans->fn, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); + } + } + if (path->uptodate < BTREE_ITER_NEED_RELOCK) return 0; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index a52da91f279e..1e9e93161509 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -707,6 +707,12 @@ DEFINE_EVENT(transaction_event, transaction_restart_ip, TP_ARGS(trans_fn, caller_ip) ); +DEFINE_EVENT(transaction_event, transaction_restart_injected, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_PROTO(const char *trans_fn, unsigned long caller_ip), -- cgit From 1ed0a5d280ef7a1183b42b2fcc13d919925f1b6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Jul 2022 17:20:18 -0400 Subject: bcachefs: Convert fsck errors to errcode.h Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 18 +++++++++--------- fs/bcachefs/btree_io.c | 8 ++++---- fs/bcachefs/errcode.h | 11 +++++++++-- fs/bcachefs/error.c | 13 ++++++------- fs/bcachefs/error.h | 27 ++++++--------------------- fs/bcachefs/fsck.c | 30 ++++++++++++++---------------- fs/bcachefs/journal_io.c | 4 ++-- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/super.c | 27 +++------------------------ 9 files changed, 54 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e7098e910a73..4ab59880781a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -98,7 +98,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = -BCH_ERR_need_topology_repair; goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -126,7 +126,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = -BCH_ERR_need_topology_repair; goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -537,7 +537,7 @@ static int bch2_repair_topology(struct bch_fs *c) if (ret == DROP_THIS_NODE) { bch_err(c, "empty 
btree root - repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; } } @@ -960,7 +960,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { - ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = -BCH_ERR_need_topology_repair; bch_info(c, "Halting mark and sweep to start topology repair pass"); goto fsck_err; } else { @@ -1013,7 +1013,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; goto fsck_err; } @@ -1022,7 +1022,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); - ret = FSCK_ERR_EXIT; + ret = -BCH_ERR_fsck_repair_unimplemented; goto fsck_err; } @@ -1777,7 +1777,7 @@ again: ret = bch2_gc_btrees(c, initial, metadata_only); - if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && + if (ret == -BCH_ERR_need_topology_repair && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); @@ -1785,8 +1785,8 @@ again: ret = 0; } - if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) - ret = FSCK_ERR_EXIT; + if (ret == -BCH_ERR_need_topology_repair) + ret = -BCH_ERR_fsck_errors_not_fixed; if (ret) goto out; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b7441677dc33..4254f7c7d85e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -537,7 +537,7 @@ enum btree_validate_ret { struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, ": " msg, ##__VA_ARGS__); \ + prt_printf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -552,7 +552,7 @@ enum btree_validate_ret { \ switch (type) { \ case BTREE_ERR_FIXABLE: \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ case BTREE_ERR_WANT_RETRY: \ if (have_retry) { \ @@ -564,7 +564,7 @@ enum btree_validate_ret { ret = BTREE_RETRY_READ; \ goto fsck_err; \ case BTREE_ERR_FATAL: \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ @@ -572,7 +572,7 @@ enum btree_validate_ret { bch_err(c, "corrupt metadata before write: %s", out.buf);\ \ if (bch2_fs_inconsistent(c)) { \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 7972b018d2d0..95925c8434b3 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -7,7 +7,6 @@ x(0, freelist_empty) \ x(freelist_empty, no_buckets_found) \ x(0, insufficient_devices) \ - x(0, need_snapshot_cleanup) \ x(0, transaction_restart) \ x(transaction_restart, transaction_restart_fault_inject) \ x(transaction_restart, transaction_restart_relock) \ @@ -30,7 +29,15 @@ x(transaction_restart, transaction_restart_nested) \ x(0, lock_fail_node_reused) \ x(0, lock_fail_root_changed) \ - x(0, journal_reclaim_would_deadlock) + x(0, journal_reclaim_would_deadlock) \ + x(0, fsck) \ + x(fsck, fsck_fix) \ + x(fsck, fsck_ignore) \ + 
x(fsck, fsck_errors_not_fixed) \ + x(fsck, fsck_repair_unimplemented) \ + x(fsck, fsck_repair_impossible) \ + x(0, need_snapshot_cleanup) \ + x(0, need_topology_repair) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 8279a9ba76a5..f6a895b2ceb7 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -68,8 +68,7 @@ void bch2_io_error(struct bch_dev *ca) #include "tools-util.h" #endif -enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, - const char *fmt, ...) +int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) { struct fsck_err_state *s = NULL; va_list args; @@ -83,10 +82,10 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, if (c->opts.errors == BCH_ON_ERROR_continue) { bch_err(c, "fixing"); - return FSCK_ERR_FIX; + return -BCH_ERR_fsck_fix; } else { bch2_inconsistent_error(c); - return FSCK_ERR_EXIT; + return -BCH_ERR_fsck_errors_not_fixed; } } @@ -156,14 +155,14 @@ print: if (fix) { set_bit(BCH_FS_ERRORS_FIXED, &c->flags); - return FSCK_ERR_FIX; + return -BCH_ERR_fsck_fix; } else { set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); set_bit(BCH_FS_ERROR, &c->flags); return c->opts.fix_errors == FSCK_OPT_EXIT || !(flags & FSCK_CAN_IGNORE) - ? FSCK_ERR_EXIT - : FSCK_ERR_IGNORE; + ? -BCH_ERR_fsck_errors_not_fixed + : -BCH_ERR_fsck_ignore; } } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 6e63c38186f3..b603d738c549 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -91,14 +91,6 @@ do { \ * be able to repair: */ -enum { - BCH_FSCK_OK = 0, - BCH_FSCK_ERRORS_NOT_FIXED = 1, - BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, - BCH_FSCK_REPAIR_IMPOSSIBLE = 3, - BCH_FSCK_UNKNOWN_VERSION = 4, -}; - enum fsck_err_opts { FSCK_OPT_EXIT, FSCK_OPT_YES, @@ -106,13 +98,6 @@ enum fsck_err_opts { FSCK_OPT_ASK, }; -enum fsck_err_ret { - FSCK_ERR_IGNORE = 0, - FSCK_ERR_FIX = 1, - FSCK_ERR_EXIT = 2, - FSCK_ERR_START_TOPOLOGY_REPAIR = 3, -}; - struct fsck_err_state { struct list_head list; const char *fmt; @@ -127,21 +112,21 @@ struct fsck_err_state { #define FSCK_NO_RATELIMIT (1 << 3) __printf(3, 4) __cold -enum fsck_err_ret bch2_fsck_err(struct bch_fs *, - unsigned, const char *, ...); +int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err(c, _flags, msg, ...) 
\ ({ \ - int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ + int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ \ - if (_fix == FSCK_ERR_EXIT) { \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ bch_err(c, "Unable to continue, halting"); \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = _ret; \ goto fsck_err; \ } \ \ - _fix; \ + _ret == -BCH_ERR_fsck_fix; \ }) /* These macros return true if error should be fixed: */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 29d731a12436..306983811c1b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -838,15 +838,14 @@ bad_hash: "hashed to %llu\n%s", bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) - return 0; - - ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret) { - bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); - return ret; + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); + if (ret) { + bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + return ret; + } + ret = -BCH_ERR_transaction_restart_nested; } - ret = -BCH_ERR_transaction_restart_nested; fsck_err: goto out; } @@ -1137,14 +1136,13 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->cur_inum, i->snapshot, - i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) - continue; - - i->inode.bi_sectors = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); - if (ret) - break; - ret2 = -BCH_ERR_transaction_restart_nested; + i->inode.bi_sectors, i->count)) { + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -BCH_ERR_transaction_restart_nested; + } } fsck_err: if (ret) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 4b4a1d000219..acb2005c3b72 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -196,7 +196,7 @@ static void journal_entry_null_range(void *start, void *end) bch_err(c, "corrupt metadata before write:\n" \ msg, ##__VA_ARGS__); \ if (bch2_fs_inconsistent(c)) { \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ @@ -857,7 +857,7 @@ reread: end - offset, sectors_read, READ); switch (ret) { - case BCH_FSCK_OK: + case 0: sectors = vstruct_sectors(j, c->block_bits); break; case JOURNAL_ENTRY_REREAD: diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bb04b6f053cc..2cf347530b65 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1158,7 +1158,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + ret = -BCH_ERR_fsck_repair_impossible; goto err; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 87742962d6c2..fe7938e7e07b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -927,31 +927,10 @@ out: up_write(&c->state_lock); return ret; err: - switch (ret) { - case BCH_FSCK_ERRORS_NOT_FIXED: - bch_err(c, "filesystem contains errors: please report this to the developers"); - pr_cont("mount with -o fix_errors to repair\n"); - break; - case BCH_FSCK_REPAIR_UNIMPLEMENTED: - bch_err(c, "filesystem contains errors: please report this to the 
developers"); - pr_cont("repair unimplemented: inform the developers so that it can be added\n"); - break; - case BCH_FSCK_REPAIR_IMPOSSIBLE: - bch_err(c, "filesystem contains errors, but repair impossible"); - break; - case BCH_FSCK_UNKNOWN_VERSION: - bch_err(c, "unknown metadata version"); - break; - case -ENOMEM: - bch_err(c, "cannot allocate memory"); - break; - case -EIO: - bch_err(c, "IO error"); - break; - } + bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); - if (ret >= 0) - ret = -EIO; + if (ret < -BCH_ERR_START) + ret = -EINVAL; goto out; } -- cgit From 4f84b7e30b3aa72ce5de032380799a1a5ba044fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jul 2022 16:13:27 -0400 Subject: bcachefs: for_each_btree_key_reverse() This adds a new macro, like for_each_btree_key2(), but for iterating in reverse order. Also, change for_each_btree_key2() to properly check the return value of bch2_btree_iter_advance(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 79339a6abcd7..9a3287da9a12 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -371,6 +371,15 @@ static inline int bkey_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, + unsigned flags) +{ + BUG_ON(flags & BTREE_ITER_ALL_LEVELS); + + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek_prev(iter); +} + static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { @@ -477,7 +486,37 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, continue; \ if (_ret) \ break; \ - bch2_btree_iter_advance(&(_iter)); \ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ + if (!(_k).k) { \ + _ret = 0; \ + break; \ + } \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + if (!bch2_btree_iter_rewind(&(_iter))) \ + break; \ } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ -- cgit From 84ece59ad5c1cd972619cae5a5df8998cc5e779f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jul 2022 16:25:00 -0400 Subject: bcachefs: Unit test updates - Convert to for_each_btree_key2(), for_each_btree_key_commit(), for_each_btree_key_reverse() - No more bare bch2_btree_iter_peek(); we're now fault-injection lock restarts, so we always need a lockrestart_do() or equivalent. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 183 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 106 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c6cac5c79c12..bed830e678bb 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -145,20 +145,30 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) { - if (k.k->p.inode) - break; - + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) - BUG_ON(k.k->p.offset != --i); + ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != --i); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + goto err; + } BUG_ON(i); err: @@ -201,19 +211,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i != nr); pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { - BUG_ON(k.k->p.offset != i); - i = bkey_start_offset(k.k); + ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + goto err; } BUG_ON(i); @@ -256,15 +278,16 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) { - if (k.k->p.inode) - break; - + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr * 2); @@ -272,17 +295,23 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; + BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); i++; - if (i == nr * 2) - break; + 0; + })); + if (ret < 0) { + bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); + ret = 0; err: bch2_trans_exit(&trans); return ret; @@ -322,13 +351,17 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ({ 
BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr); @@ -336,19 +369,23 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS, k, ret) { + ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); BUG_ON(bkey_start_offset(k.k) != i); BUG_ON(k.k->size != 8); i = k.k->p.offset; - - if (i == nr) - break; + 0; + })); + if (ret) { + bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + goto err; } - bch2_trans_iter_exit(&trans, &iter); + ret = 0; err: bch2_trans_exit(&trans); return 0; @@ -368,10 +405,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -389,10 +426,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -482,7 +519,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bch2_trans_init(&trans, c, 0, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -615,7 +652,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) { bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); @@ -689,7 +726,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) goto err; @@ -733,28 +770,23 @@ static int seq_insert(struct bch_fs *c, u64 nr) struct bkey_s_c k; struct bkey_i_cookie insert; int ret = 0; - u64 i = 0; bkey_cookie_init(&insert.k_i); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - insert.k.p = iter.pos; - - ret = commit_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &insert.k_i, 0)); - if (ret) { - bch_err(c, "error in seq_insert: %s", bch2_err_str(ret)); - break; - } - - if (++i == nr) - break; - } - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + NULL, NULL, 0, + ({ + if (iter.pos.offset >= nr) + break; + insert.k.p = iter.pos; + bch2_trans_update(&trans, &iter, &insert.k_i, 0); + })); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -769,10 +801,11 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ret) - ; - bch2_trans_iter_exit(&trans, &iter); + ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, + 0); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -787,22 +820,18 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, ret) { - struct bkey_i_cookie u; - - bkey_reassemble(&u.k_i, k); + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, + NULL, NULL, 0, + ({ + struct bkey_i_cookie u; - ret = commit_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &u.k_i, 0)); - if (ret) { - bch_err(c, "error in seq_overwrite: %s", bch2_err_str(ret)); - break; - } - } - bch2_trans_iter_exit(&trans, &iter); + bkey_reassemble(&u.k_i, k); + bch2_trans_update(&trans, &iter, &u.k_i, 0); + })); + if (ret) + bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; -- cgit From d7228ecc483e1e104179a24a7e1e4568ae9235cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jul 2022 16:50:26 -0400 Subject: bcachefs: Convert debugfs code to for_each_btree_key2() This fixes a bug where we were leaking a transaction restart error to userspace. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 79 +++++++++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 45f5229f20eb..b9b6cad8cd40 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -188,6 +188,7 @@ struct dump_iter { struct bch_fs *c; enum btree_id id; struct bpos from; + struct bpos prev_node; u64 iter; struct printbuf buf; @@ -257,39 +258,30 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; - bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(&iter); - - while (k.k && !(err = bkey_err(k))) { - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_char(&i->buf, '\n'); - - k = bch2_btree_iter_next(&iter); - i->from = iter.pos; - + err = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ err = flush_buf(i); if (err) break; if (!i->size) break; - } - bch2_trans_iter_exit(&trans, &iter); + + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + 0; + })); + i->from = iter.pos; + + if (!err) + err = flush_buf(i); bch2_trans_exit(&trans); - return err < 0 ? 
err : i->ret; + return err ?: i->ret; } static const struct file_operations btree_debug_ops = { @@ -359,7 +351,6 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct btree *prev_node = NULL; int err; i->ubuf = buf; @@ -375,44 +366,36 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - bch2_trans_iter_init(&trans, &iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - - while ((k = bch2_btree_iter_peek(&iter)).k && - !(err = bkey_err(k))) { + err = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (l->b != prev_node) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - err = flush_buf(i); - if (err) - break; - } - prev_node = l->b; - - bch2_bfloat_to_text(&i->buf, l->b, _k); - err = flush_buf(i); - if (err) - break; - - bch2_btree_iter_advance(&iter); - i->from = iter.pos; - err = flush_buf(i); if (err) break; if (!i->size) break; - } - bch2_trans_iter_exit(&trans, &iter); + + if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + 0; + })); + i->from = iter.pos; + + if (!err) + err = flush_buf(i); bch2_trans_exit(&trans); - return err < 0 ? err : i->ret; + return err ?: i->ret; } static const struct file_operations bfloat_failed_debug_ops = { -- cgit From 7903e3d2d7e5266dde40c9db746fafb078f84f4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Jul 2022 17:35:57 -0400 Subject: bcachefs: Fix check_i_sectors() bch2_count_inode_sectors() uses for_each_btree_key() internally, which handles lock restarts - the lockrestart_do() in check_i_sectors() is redundant, and buggy here since the count that bch2_count_inode_sectors() returns was interpreted as an error by lockrestart_do(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 306983811c1b..ea264421fe8f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1122,8 +1122,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_sectors == i->count) continue; - count2 = lockrestart_do(trans, - bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); if (i->count != count2) { bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", -- cgit From db346e7120a6dec1534184ea2abf9d22edbb9b8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Jul 2022 02:46:46 -0400 Subject: bcachefs: bch2_bucket_alloc_trans_early -> for_each_btree_key_norestart Nested btree transactions require special care, and an upcoming patch is going to add assertions to that effect. We don't want to be using them unnecessarily, so this patch switches bch2_bucket_trans_early() to not handle transaction restarts. This patch also adds a cursor so that on transaction restart we can continue scanning from where the previous search for an empty bucket left off. 
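For illustration, the resulting scan has roughly the following shape (a simplified sketch: for_each_btree_key_norestart(), the alloc btree and the per-device cursor are taken from the patch, while the bucket checks, error handling and exact placement of the cursor write-back are elided):

    u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
    struct btree_iter iter;
    struct bkey_s_c k;
    int ret;

    /*
     * No restart handling here: a transaction restart is simply returned
     * to the caller.  Because the cursor is carried across attempts, the
     * next call resumes where this scan stopped rather than rescanning
     * the whole device.
     */
    for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc,
                                 POS(ca->dev_idx, alloc_cursor),
                                 BTREE_ITER_SLOTS, k, ret) {
            alloc_cursor = k.k->p.offset;
            /* ... skip buckets we can't use, break when one is found ... */
    }
    bch2_trans_iter_exit(trans, &iter);
    ca->alloc_cursor = alloc_cursor;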
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 99fbf1d2dee5..bbe74a05a7a2 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -406,7 +406,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); int ret; again: - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { struct bch_alloc_v4 a; -- cgit From 01eed77178049b85ec1850717fc81bcb24e19c70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jul 2022 09:53:28 -0400 Subject: bcachefs: Tighten up btree_path assertions Currently seeing a very rare and difficult to explain btree_path inconsistency - this patch adds assertions to the only place that seems to be missing them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 777e41a12246..01ca27ee6314 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1020,6 +1020,7 @@ static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, path->pos = k.k ? k.k->p : l->b->key.k.p; trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -1033,6 +1034,7 @@ static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, path->pos = k.k ? k.k->p : l->b->data->min_key; trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -1661,7 +1663,7 @@ out: int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); u64 max = ~(~0ULL << restart_probability_bits); -- cgit From 91f1b9fdd2c02a7375e46bb2628870f3c6116072 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Jul 2022 15:41:29 -0400 Subject: bcachefs: Add an O_DIRECT option (for userspace) Sometimes we see IO errors due to O_DIRECT alignment issues - having an option to use buffered IO will be helpful. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 30cf9a2d9dc1..6e2bd6e01f8c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -346,6 +346,11 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ + x(direct_io, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Use O_DIRECT (userspace only)") \ x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ -- cgit From 0763c552e7ef024ce6dbc9cbc828b8715dff251c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Jul 2022 06:57:05 -0400 Subject: bcachefs: fsck: Fix nested transaction handling This uses the new trans->restart count to make sure we always correctly return -BCH_ERR_transaction_restart_nested when we restart a nested transaction - eliminating some other hacks and preparing for new assertions. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ea264421fe8f..bb8cab7cb405 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -650,6 +650,7 @@ static int __walk_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; + u32 restart_count = trans->restart_count; unsigned i; int ret; @@ -677,6 +678,10 @@ static int __walk_inode(struct btree_trans *trans, w->cur_inum = pos.inode; w->first_this_inode = true; + + if (trans_was_restarted(trans, restart_count)) + return -BCH_ERR_transaction_restart_nested; + lookup_snapshot: for (i = 0; i < w->inodes.nr; i++) if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) @@ -1115,7 +1120,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; - int ret = 0, ret2 = 0; + u32 restart_count = trans->restart_count; + int ret = 0; s64 count2; darray_for_each(w->inodes, i) { @@ -1140,13 +1146,16 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; - ret2 = -BCH_ERR_transaction_restart_nested; } } fsck_err: - if (ret) + if (ret) { bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); - return ret ?: ret2; + return ret; + } + if (trans_was_restarted(trans, restart_count)) + return -BCH_ERR_transaction_restart_nested; + return 0; } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, @@ -1182,14 +1191,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - if (!iter->path->should_be_locked) { - /* - * hack: check_i_sectors may have handled a transaction restart, - * it shouldn't be but we need to fix the new i_sectors check - * code and delete the old bch2_count_inode_sectors() first - */ - return -BCH_ERR_transaction_restart_nested; - } + BUG_ON(!iter->path->should_be_locked); #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; @@ -1336,7 +1338,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; - int ret = 0, ret2 = 0; + u32 restart_count = trans->restart_count; + int ret = 0; s64 count2; darray_for_each(w->inodes, i) { @@ -1362,13 +1365,16 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; - ret2 = -BCH_ERR_transaction_restart_nested; } } fsck_err: - if (ret) + if (ret) { bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); - return ret ?: ret2; + return ret; + } + if (trans_was_restarted(trans, restart_count)) + return -BCH_ERR_transaction_restart_nested; + return 0; } static int check_dirent_target(struct btree_trans *trans, @@ -1526,10 +1532,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - if (!iter->path->should_be_locked) { - /* hack: see check_extent() */ - return -BCH_ERR_transaction_restart_nested; - } + BUG_ON(!iter->path->should_be_locked); ret = __walk_inode(trans, dir, equiv); if (ret < 0) -- cgit From 4a7f7e9e4de665d94e1a7610bf5506265df414d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Jul 2022 00:50:25 -0400 Subject: bcachefs: Fix not punting to 
workqueue when promoting Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index dfa708c0a7fc..0348e2ab6422 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2027,6 +2027,7 @@ static void bch2_read_endio(struct bio *bio) } if (rbio->narrow_crcs || + rbio->promote || crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; -- cgit From ae33e7a274abe6863bc4a1f4ea1e4c5f65b533f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Aug 2022 12:46:37 -0400 Subject: bcachefs: Add distinct error code for key_cache_upgrade This aids in debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/errcode.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 549abe607b53..bfd602273e91 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -417,7 +417,7 @@ fill: if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { trace_transaction_restart_ip(trans->fn, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 95925c8434b3..6dd2152e782e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -22,6 +22,7 @@ x(transaction_restart, transaction_restart_would_deadlock) \ x(transaction_restart, transaction_restart_would_deadlock_write)\ x(transaction_restart, transaction_restart_upgrade) \ + x(transaction_restart, transaction_restart_key_cache_upgrade) \ x(transaction_restart, transaction_restart_key_cache_fill) \ x(transaction_restart, transaction_restart_key_cache_raced) \ x(transaction_restart, transaction_restart_key_cache_realloced)\ -- cgit From 86b74451931790eafa018021fe900faea3230189 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Aug 2022 11:36:13 -0400 Subject: bcachefs: Fix bch2_btree_trans_to_text() bch2_btree_trans_to_text() is used to print btree_transactions owned by other threads; thus, it needs to be particularly careful. This fixes a null ptr deref caused by racing with the owning thread changing path->l[].b.
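The core of the fix is to snapshot the pointer once and validate it before use - roughly the following (sketch; the >= 128 test screens out the small sentinel values that path->l[].b can still hold at this point in the series):

    struct btree_bkey_cached_common *b;

    /*
     * path->l[l].b may be rewritten at any moment by the thread that owns
     * the transaction, so read it exactly once and check that it points at
     * a real node before printing it:
     */
    if (btree_node_locked(path, l) &&
        (unsigned long) (b = (void *) READ_ONCE(path->l[l].b)) >= 128)
            bch2_btree_path_node_to_text(out, b, path->cached);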
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 18 ++++++++---------- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_types.h | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 01ca27ee6314..171894d9347d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3417,19 +3417,19 @@ void bch2_trans_exit(struct btree_trans *trans) static void __maybe_unused bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, + struct btree_bkey_cached_common *b, bool cached) { prt_printf(out, " l=%u %s:", - _b->level, bch2_btree_ids[_b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(_b, cached)); + b->level, bch2_btree_ids[b->btree_id]); + bch2_bpos_to_text(out, btree_node_pos(b, cached)); } #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { struct btree_path *path; - struct btree *b; + struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; @@ -3448,12 +3448,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "\n"); for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(path, l)) { + if (btree_node_locked(path, l) && + (unsigned long) (b = (void *) READ_ONCE(path->l[l].b)) >= 128) { prt_printf(out, " %s l=%u ", btree_node_intent_locked(path, l) ? "i" : "r", l); - bch2_btree_path_node_to_text(out, - (void *) path->l[l].b, - path->cached); + bch2_btree_path_node_to_text(out, b, path->cached); prt_printf(out, "\n"); } } @@ -3471,8 +3470,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_bpos_to_text(out, trans->locking_pos); prt_printf(out, " node "); - bch2_btree_path_node_to_text(out, - (void *) b, path->cached); + bch2_btree_path_node_to_text(out, b, path->cached); prt_printf(out, "\n"); } } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 33a69e27c39e..9d4e1a658eef 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -173,7 +173,7 @@ static inline int btree_node_lock_type(struct btree_trans *trans, trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking_lock_type = type; - trans->locking = b; + trans->locking = &b->c; ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); trans->locking = NULL; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bc1571fc2f1f..1c70dff591a2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -392,7 +392,7 @@ struct btree_trans { const char *fn; struct list_head list; u64 last_begin_time; - struct btree *locking; + struct btree_bkey_cached_common *locking; unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; -- cgit From 17047fbced563cf5abe5aa546f6a92af48900b69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Aug 2022 17:08:35 -0400 Subject: bcachefs: Fix incorrectly freeing btree_path in alloc path Clearing path->preserve means the path will be dropping in bch2_trans_begin() - but on transaction restart, we're likely to need that path again. This fixes a livelock in the allocation path. 
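A caller-side sketch of the failure mode (the lookup shown is illustrative, not taken from the patch):

    struct btree_iter iter;
    struct bkey_s_c k;

    bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, 0);
    k = bch2_btree_iter_peek_slot(&iter);

    /*
     * "We won't need this position again" - but if the lookup above just
     * returned a transaction restart, this same path is wanted again right
     * after bch2_trans_begin(), so clearing preserve here only forces it
     * to be torn down and re-created on every retry.  Hence the new
     * !trans->restarted check inside set_btree_iter_dontneed().
     */
    set_btree_iter_dontneed(&iter);
    bch2_trans_iter_exit(trans, &iter);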
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9a3287da9a12..5ca92b6bb397 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -335,7 +335,8 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - iter->path->preserve = false; + if (!iter->trans->restarted) + iter->path->preserve = false; } void *bch2_trans_kmalloc(struct btree_trans *, size_t); -- cgit From 49e401fa55ab128461f84a4eeb1f2d974f3281c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Aug 2022 13:43:32 -0400 Subject: bcachefs: Tracepoint improvements - use strlcpy(), not strncpy() - add tracepoints for btree_path alloc and free - give the tracepoint for key cache upgrade fail a proper name - add a tracepoint for btree_node_upgrade_fail Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 59 +++++++++++++------- fs/bcachefs/btree_key_cache.c | 6 +- fs/bcachefs/trace.h | 125 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 157 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 171894d9347d..74978a50023f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -132,15 +132,37 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, bch2_btree_node_unlock_write_inlined(trans, path, b); } -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) +static struct six_lock_count btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree *b, + unsigned level) { - struct btree_path *linked; - unsigned readers = 0; + struct btree_path *path; + struct six_lock_count ret = { 0, 0 }; + + if ((unsigned long) b < 128) + return ret; - trans_for_each_path(trans, linked) - if (linked->l[b->c.level].b == b && - btree_node_read_locked(linked, b->c.level)) - readers++; + trans_for_each_path(trans, path) + if (path != skip && path->l[level].b == b) { + ret.read += btree_node_read_locked(path, level); + ret.intent += btree_node_intent_locked(path, level); + } + + return ret; +} + +static inline void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (!lock->readers) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + this_cpu_add(*lock->readers, nr); +} + +void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) +{ + int readers = btree_node_lock_counts(trans, NULL, b, b->c.level).read; /* * Must drop our read locks before calling six_lock_write() - @@ -148,19 +170,9 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) * goes to 0, and it's safe because we have the node intent * locked: */ - if (!b->c.lock.readers) - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_sub(*b->c.lock.readers, readers); - + six_lock_readers_add(&b->c.lock, -readers); six_lock_write(&b->c.lock, NULL, NULL); - - if (!b->c.lock.readers) - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_add(*b->c.lock.readers, readers); + six_lock_readers_add(&b->c.lock, readers); } bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -229,6 +241,12 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, goto success; } + trace_btree_node_upgrade_fail(trans->fn, _RET_IP_, + path->btree_id, + 
&path->pos, + btree_node_locked(path, level), + btree_node_lock_counts(trans, NULL, b, level), + six_lock_counts(&b->c.lock)); return false; success: mark_btree_node_intent_locked(trans, path, level); @@ -1800,6 +1818,7 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { + trace_btree_path_free(trans->fn, _RET_IP_, path->btree_id, &path->pos); __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); trans->paths_allocated &= ~(1ULL << path->idx); @@ -1975,6 +1994,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, __btree_path_get(path_pos, intent); path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); } else { + trace_btree_path_alloc(trans->fn, _RET_IP_, btree_id, &pos, locks_want); + path = btree_path_alloc(trans, path_pos); path_pos = NULL; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index bfd602273e91..4ff3ed4de0a9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -414,9 +414,13 @@ retry: path->l[0].b = (void *) ck; fill: if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + /* + * Using the underscore version because we haven't set + * path->uptodate yet: + */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_ip(trans->fn, _THIS_IP_); + trace_transaction_restart_key_cache_upgrade(trans->fn, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 1e9e93161509..5969a049df7d 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -401,7 +401,7 @@ TRACE_EVENT(btree_node_relock_fail, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; @@ -424,6 +424,59 @@ TRACE_EVENT(btree_node_relock_fail, __entry->node_lock_seq) ); +TRACE_EVENT(btree_node_upgrade_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + bool locked, + struct six_lock_count self_lock_count, + struct six_lock_count lock_count), + TP_ARGS(trans_fn, caller_ip, btree_id, pos, + locked, self_lock_count, lock_count), + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + __field(u8, locked ) + __field(u8, self_read_count ) + __field(u8, read_count ) + __field(u8, self_intent_count) + __field(u8, intent_count ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + __entry->locked = locked; + __entry->self_read_count = self_lock_count.read; + __entry->self_intent_count = self_lock_count.intent; + __entry->read_count = lock_count.read; + __entry->intent_count = lock_count.intent; + ), + + TP_printk("%s %pS btree %u pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->locked, + 
__entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count) +); + /* Garbage collection */ DEFINE_EVENT(bch_fs, gc_gens_start, @@ -688,7 +741,7 @@ DECLARE_EVENT_CLASS(transaction_event, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), @@ -701,12 +754,6 @@ DEFINE_EVENT(transaction_event, transaction_commit, TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_event, transaction_restart_ip, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) -); - DEFINE_EVENT(transaction_event, transaction_restart_injected, TP_PROTO(const char *trans_fn, unsigned long caller_ip), @@ -784,7 +831,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; @@ -865,6 +912,12 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); +DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, TP_PROTO(const char *trans_fn, unsigned long caller_ip, @@ -939,7 +992,7 @@ TRACE_EVENT(trans_restart_would_deadlock, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->in_traverse_all = in_traverse_all; __entry->reason = reason; @@ -983,7 +1036,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ), TP_printk("%s", __entry->trans_fn) @@ -1002,7 +1055,7 @@ TRACE_EVENT(trans_restart_mem_realloced, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->bytes = bytes; ), @@ -1034,7 +1087,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, ), TP_fast_assign( - strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->inode = pos->inode; @@ -1055,6 +1108,52 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +TRACE_EVENT(btree_path_alloc, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned locks_want), + TP_ARGS(trans_fn, caller_ip, btree_id, pos, locks_want), + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, locks_want ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->locks_want = locks_want; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot 
= pos->snapshot; + ), + + TP_printk("%s %pS btree %u locks_want %u pos %llu:%llu:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->locks_want, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(transaction_restart_iter, btree_path_free, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From 15f11c1aa8a98f3b6805d0b2a300a87ef0205d34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Aug 2022 23:02:09 -0400 Subject: bcachefs: Improve an error message Update an error message to use bch2_err_str(). Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0348e2ab6422..44fb14a5b5ae 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -674,7 +674,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (ret) { bch_err_inum_ratelimited(c, op->pos.inode, - "write error %i from btree update", ret); + "write error while doing btree update: %s", bch2_err_str(ret)); goto err; } } -- cgit From 31301dd46975b2423fd38fc64bc58728d89dbcac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Aug 2022 13:47:03 -0400 Subject: bcachefs: Fix missing error handling in bch2_subvolume_delete() This fixes an assertion when the transaction has been unexpectedly restarted. Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index b5b0f5e39f97..24244bc3d2fb 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -877,6 +877,8 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) goto err; ret = bch2_snapshot_node_set_deleted(trans, snapid); + if (ret) + goto err; h = bch2_trans_kmalloc(trans, sizeof(*h)); ret = PTR_ERR_OR_ZERO(h); -- cgit From fd211bc71c9b4093ed39d4fc93294f9ff423febc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 18:55:53 -0400 Subject: bcachefs: Don't set should_be_locked on paths that aren't locked It doesn't make any sense to set should_be_locked on btree_paths that aren't locked, and is often a bug - this patch adds assertions and fixes some of those bugs. 
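In practice the rule is: only mark a path should_be_locked once traversal has succeeded and the node it points at is actually locked, and do it through the new helper so the assertions catch violations. A before/after sketch (the traverse call is just one example of where this now happens):

    /* Before - a bare assignment, with nothing guaranteeing the lock: */
    iter->path->should_be_locked = true;

    /* After - only on the success path, via the asserting helper: */
    ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
    if (ret)
            return ret;
    btree_path_set_should_be_locked(iter->path);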
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 57 +++++++++++++++++++------------------ fs/bcachefs/btree_locking.h | 8 ++++++ fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 4 +-- 4 files changed, 40 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 74978a50023f..1d5d7b639241 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2103,7 +2103,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); return 0; } @@ -2133,8 +2133,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT); - iter->path->should_be_locked = true; - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(iter->path); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2206,7 +2205,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT); - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); BUG_ON(iter->path->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -2360,7 +2359,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - iter->key_cache_path->should_be_locked = true; + btree_path_set_should_be_locked(iter->key_cache_path); return bch2_btree_path_peek_slot(iter->key_cache_path, &u); } @@ -2387,7 +2386,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); @@ -2474,7 +2473,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (!k.k || bkey_err(k)) - goto out; + goto out_no_locked; /* * iter->pos should be mononotically increasing, and always be @@ -2491,7 +2490,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (bkey_cmp(iter_pos, end) > 0) { bch2_btree_iter_set_pos(iter, end); k = bkey_s_c_null; - goto out; + goto out_no_locked; } if (iter->update_path && @@ -2551,18 +2550,16 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, iter->flags & BTREE_ITER_INTENT); - BUG_ON(!iter->path->nodes_locked); -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (iter->update_path) { ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); - if (unlikely(ret)) { + if (unlikely(ret)) k = bkey_s_c_err(ret); - } else { - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; - } + else + btree_path_set_should_be_locked(iter->update_path); } - iter->path->should_be_locked = true; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) iter->pos.snapshot = iter->snapshot; @@ -2605,13 +2602,13 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + goto out_no_locked; } /* Already at end? 
*/ if (!btree_path_node(iter->path, iter->path->level)) { k = bkey_s_c_null; - goto out; + goto out_no_locked; } k = btree_path_level_peek_all(trans->c, @@ -2664,8 +2661,8 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) } iter->pos = k.k->p; -out: - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); +out_no_locked: bch2_btree_iter_verify(iter); return k; @@ -2718,7 +2715,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + goto out_no_locked; } k = btree_path_level_peek(trans, iter->path, @@ -2782,7 +2779,7 @@ got_key: /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto out; + goto out_no_locked; } } @@ -2794,10 +2791,11 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (saved_path) bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); - iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2863,9 +2861,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { - if (!bkey_err(k)) + if (bkey_err(k)) { + goto out_no_locked; + } else { iter->k = *k.k; - goto out; + goto out; + } } k = bch2_btree_path_peek_slot(iter->path, &iter->k); @@ -2919,8 +2920,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - iter->path->should_be_locked = true; - + btree_path_set_should_be_locked(iter->path); +out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); ret = bch2_btree_iter_verify_ret(iter, k); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 9d4e1a658eef..90bf5c02f504 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -283,4 +283,12 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, __bch2_btree_node_lock_write(trans, b); } +static inline void btree_path_set_should_be_locked(struct btree_path *path) +{ + EBUG_ON(!btree_node_locked(path, path->level)); + EBUG_ON(path->uptodate); + + path->should_be_locked = true; +} + #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cf02e814c579..1fbf72df9e2f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1664,7 +1664,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - sib_path->should_be_locked = true; + btree_path_set_should_be_locked(sib_path); m = sib_path->l[level].b; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 541826df50d9..9c84eed32007 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1575,7 +1575,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa if (ret) goto err; - btree_path->should_be_locked = true; + btree_path_set_should_be_locked(btree_path); ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); err: bch2_path_put(trans, btree_path, true); @@ -1643,7 +1643,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter return 
btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - iter->key_cache_path->should_be_locked = true; + btree_path_set_should_be_locked(iter->key_cache_path); } path = iter->key_cache_path; -- cgit From 315c9ba6da5b480618a80dcb91a74a86e49366bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 19:08:30 -0400 Subject: bcachefs: BTREE_ITER_NO_NODE -> BCH_ERR codes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 60 ++++++++++++------------------------- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.h | 15 ++++++++++ fs/bcachefs/btree_types.h | 9 ------ fs/bcachefs/btree_update_interior.c | 5 +--- fs/bcachefs/errcode.h | 9 ++++++ 6 files changed, 45 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1d5d7b639241..479e46a26f46 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -90,8 +90,7 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos static inline bool is_btree_node(struct btree_path *path, unsigned l) { - return l < BTREE_MAX_DEPTH && - (unsigned long) path->l[l].b >= 128; + return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); } static inline struct bpos btree_iter_search_key(struct btree_iter *iter) @@ -140,7 +139,7 @@ static struct six_lock_count btree_node_lock_counts(struct btree_trans *trans, struct btree_path *path; struct six_lock_count ret = { 0, 0 }; - if ((unsigned long) b < 128) + if (IS_ERR_OR_NULL(b)) return ret; trans_for_each_path(trans, path) @@ -194,8 +193,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - if (b != BTREE_ITER_NO_NODE_CACHED && - b != BTREE_ITER_NO_NODE_INIT) + if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && + b != ERR_PTR(-BCH_ERR_no_btree_node_init)) trace_btree_node_relock_fail(trans->fn, _RET_IP_, path->btree_id, &path->pos, @@ -282,7 +281,9 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); do { - path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[fail_idx].b = upgrade + ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); --fail_idx; } while (fail_idx >= 0); } @@ -1259,7 +1260,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) - path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); path->l[path->level].b = b; for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; @@ -1547,15 +1548,6 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } -static void btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path) -{ - btree_node_unlock(trans, path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -} - static void btree_path_set_level_down(struct btree_trans *trans, struct btree_path *path, unsigned new_level) @@ -1579,22 +1571,16 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, unsigned i, l = path->level; while (btree_path_node(path, l) && - !btree_path_good_node(trans, path, l, check_pos)) { - btree_node_unlock(trans, path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + !btree_path_good_node(trans, path, l, check_pos)) + __btree_path_set_level_up(trans, path, l++); /* If we need intent locks, take them too: */ for (i = l + 1; i < path->locks_want && btree_path_node(path, i); i++) if (!bch2_btree_node_relock(trans, path, i)) - while (l <= i) { - btree_node_unlock(trans, path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + while (l <= i) + __btree_path_set_level_up(trans, path, l++); return l; } @@ -1660,13 +1646,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, __bch2_btree_path_unlock(trans, path); path->level = depth_want; - - if (ret == -EIO) - path->l[path->level].b = - BTREE_ITER_NO_NODE_ERROR; - else - path->l[path->level].b = - BTREE_ITER_NO_NODE_DOWN; + path->l[path->level].b = ERR_PTR(ret); goto out; } } @@ -1755,7 +1735,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -2010,7 +1990,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->nodes_locked = 0; path->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) - path->l[i].b = BTREE_ITER_NO_NODE_INIT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef CONFIG_BCACHEFS_DEBUG path->ip_allocated = _RET_IP_; #endif @@ -2167,8 +2147,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!bch2_btree_node_relock(trans, path, path->level + 1)) { __bch2_btree_path_unlock(trans, path); - path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; - path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, path->btree_id, &path->pos); @@ -2179,9 +2159,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = btree_path_node(path, path->level + 1); if (!bpos_cmp(iter->pos, b->key.k.p)) { - btree_node_unlock(trans, path, path->level); - path->l[path->level].b 
= BTREE_ITER_NO_NODE_UP; - path->level++; + __btree_path_set_level_up(trans, path, path->level++); } else { /* * Haven't gotten to the end of the parent node: go back down to @@ -3471,7 +3449,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l) && - (unsigned long) (b = (void *) READ_ONCE(path->l[l].b)) >= 128) { + !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { prt_printf(out, " %s l=%u ", btree_node_intent_locked(path, l) ? "i" : "r", l); bch2_btree_path_node_to_text(out, b, path->cached); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4ff3ed4de0a9..b05b40856e63 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -440,7 +440,7 @@ fill: err: if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + path->l[0].b = ERR_PTR(ret); } return ret; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 90bf5c02f504..7dcfe3009b84 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -291,4 +291,19 @@ static inline void btree_path_set_should_be_locked(struct btree_path *path) path->should_be_locked = true; } +static inline void __btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path, + unsigned l) +{ + btree_node_unlock(trans, path, l); + path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); +} + +static inline void btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path) +{ + __btree_path_set_level_up(trans, path, path->level++); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 1c70dff591a2..edeaf843cd3f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -215,15 +215,6 @@ enum btree_path_uptodate { BTREE_ITER_NEED_TRAVERSE = 2, }; -#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) -#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) - struct btree_path { u8 idx; u8 sorted_idx; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1fbf72df9e2f..fe69644b60a9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1956,10 +1956,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(iter2.path->level != b->c.level); BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - btree_node_unlock(trans, iter2.path, iter2.path->level); - path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; - iter2.path->level++; - btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_level_up(trans, iter2.path); trans->paths_sorted = false; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 6dd2152e782e..15a1be2fcc84 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -28,6 +28,15 @@ x(transaction_restart, transaction_restart_key_cache_realloced)\ x(transaction_restart, transaction_restart_journal_preres_get) \ x(transaction_restart, transaction_restart_nested) \ + x(0, 
no_btree_node) \ + x(no_btree_node, no_btree_node_relock) \ + x(no_btree_node, no_btree_node_upgrade) \ + x(no_btree_node, no_btree_node_drop) \ + x(no_btree_node, no_btree_node_lock_root) \ + x(no_btree_node, no_btree_node_up) \ + x(no_btree_node, no_btree_node_down) \ + x(no_btree_node, no_btree_node_init) \ + x(no_btree_node, no_btree_node_cached) \ x(0, lock_fail_node_reused) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ -- cgit From fa3ae3ca4e13d86fe5f97c275748b3820c873091 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 12:34:18 -0400 Subject: bcachefs: six_lock_counts() is now in six.c Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- fs/bcachefs/trace.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 479e46a26f46..2c24faca623a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -131,7 +131,7 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, bch2_btree_node_unlock_write_inlined(trans, path, b); } -static struct six_lock_count btree_node_lock_counts(struct btree_trans *trans, +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, struct btree *b, unsigned level) @@ -161,7 +161,7 @@ static inline void six_lock_readers_add(struct six_lock *lock, int nr) void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - int readers = btree_node_lock_counts(trans, NULL, b, b->c.level).read; + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; /* * Must drop our read locks before calling six_lock_write() - @@ -244,7 +244,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, path->btree_id, &path->pos, btree_node_locked(path, level), - btree_node_lock_counts(trans, NULL, b, level), + bch2_btree_node_lock_counts(trans, NULL, b, level), six_lock_counts(&b->c.lock)); return false; success: diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index b770973faa14..eff1e3dfbcea 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,9 +4,9 @@ #include "buckets.h" #include "btree_types.h" #include "keylist.h" +#include "six.h" #include -#include "keylist.h" #define CREATE_TRACE_POINTS #include "trace.h" -- cgit From c7be3cb546e3bb2704008506bd6c50ad5ea02441 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 20:22:01 -0400 Subject: bcachefs: "Snapshot deletion did not run correctly" should be a fsck err This was noticed when a test hit this error and didn't fail, because fsck wasn't returning that it fixed errors. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index bb8cab7cb405..c93e177a314f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -519,7 +519,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, .id = pos.snapshot, .equiv = bch2_snapshot_equiv(c, pos.snapshot), }; - int ret; + int ret = 0; if (bkey_cmp(s->pos, pos)) s->ids.nr = 0; @@ -529,14 +529,13 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, darray_for_each(s->ids, i) if (i->equiv == n.equiv) { - if (i->id != n.id) { - bch_err(c, "snapshot deletion did not run correctly:\n" + if (fsck_err_on(i->id != n.id, c, + "snapshot deletion did not run correctly:\n" " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", bch2_btree_ids[btree_id], pos.inode, pos.offset, - i->id, n.id, n.equiv); + i->id, n.id, n.equiv)) return -BCH_ERR_need_snapshot_cleanup; - } return 0; } @@ -545,6 +544,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); +fsck_err: return ret; } -- cgit From 9f96568c0ab983fbb0f6eefa36ad799a72bc9358 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 12:42:55 -0400 Subject: bcachefs: Tracepoint improvements Our types are exported to the tracepoint code, so it's not necessary to break things out individually when passing them to tracepoints - we can also call other functions from TP_fast_assign(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 16 +- fs/bcachefs/btree_iter.c | 55 ++--- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_key_cache.c | 5 +- fs/bcachefs/btree_locking.h | 8 + fs/bcachefs/btree_update_interior.c | 5 +- fs/bcachefs/btree_update_leaf.c | 28 +-- fs/bcachefs/trace.c | 4 +- fs/bcachefs/trace.h | 421 ++++++++++++++---------------------- 9 files changed, 213 insertions(+), 331 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4032c27fcc9c..969ecb2fdfad 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -691,8 +691,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_trans_restart_relock_parent_for_fill(trans->fn, - _THIS_IP_, btree_id, &path->pos); + trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } @@ -700,9 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (trans && b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; - trace_trans_restart_memory_allocation_failure(trans->fn, - _THIS_IP_, btree_id, &path->pos); - + trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); } @@ -750,8 +747,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, - btree_id, &path->pos); + if (trans) + trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -906,10 +903,7 @@ lock_node: 
if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans->fn, - trace_ip, - path->btree_id, - &path->pos); + trace_trans_restart_btree_node_reused(trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2c24faca623a..7049077d47bc 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -88,11 +88,6 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos return p; } -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -} - static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; @@ -195,12 +190,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, fail: if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && b != ERR_PTR(-BCH_ERR_no_btree_node_init)) - trace_btree_node_relock_fail(trans->fn, _RET_IP_, - path->btree_id, - &path->pos, - (unsigned long) b, - path->l[level].lock_seq, - is_btree_node(path, level) ? b->c.lock.state.seq : 0); + trace_btree_node_relock_fail(trans, _RET_IP_, path, level); return false; } @@ -240,12 +230,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, goto success; } - trace_btree_node_upgrade_fail(trans->fn, _RET_IP_, - path->btree_id, - &path->pos, - btree_node_locked(path, level), - bch2_btree_node_lock_counts(trans, NULL, b, level), - six_lock_counts(&b->c.lock)); + trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); return false; success: mark_btree_node_intent_locked(trans, path, level); @@ -381,14 +366,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, return btree_node_lock_type(trans, path, b, pos, level, type, should_sleep_fn, p); deadlock: - trace_trans_restart_would_deadlock(trans->fn, ip, - trans->in_traverse_all, reason, - linked->btree_id, - linked->cached, - &linked->pos, - path->btree_id, - path->cached, - &pos); + trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } @@ -438,8 +416,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, - path->btree_id, &path->pos); + trace_trans_restart_relock_path_intent(trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } } @@ -454,8 +431,7 @@ static int __bch2_btree_path_relock(struct btree_trans *trans, bool ret = btree_path_get_locks(trans, path, false); if (!ret) { - trace_trans_restart_relock_path(trans->fn, trace_ip, - path->btree_id, &path->pos); + trace_trans_restart_relock_path(trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -561,8 +537,7 @@ int bch2_trans_relock(struct btree_trans *trans) trans_for_each_path(trans, path) if (path->should_be_locked && bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans->fn, _RET_IP_, - path->btree_id, &path->pos); + trace_trans_restart_relock(trans, _RET_IP_, path); BUG_ON(!trans->restarted); return -BCH_ERR_transaction_restart_relock; } @@ -1529,7 +1504,7 @@ err: trans->in_traverse_all = false; - trace_trans_traverse_all(trans->fn, 
trace_ip); + trace_trans_traverse_all(trans, trace_ip); return ret; } @@ -1666,7 +1641,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, u64 max = ~(~0ULL << restart_probability_bits); if (!get_random_u32_below(max)) { - trace_transaction_restart_injected(trans->fn, _RET_IP_); + trace_transaction_restart_injected(trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } } @@ -1798,7 +1773,6 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { - trace_btree_path_free(trans->fn, _RET_IP_, path->btree_id, &path->pos); __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); trans->paths_allocated &= ~(1ULL << path->idx); @@ -1891,10 +1865,10 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) bch2_bpos_to_text(&buf, path->pos); - printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", + printk(KERN_ERR "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, - path->should_be_locked ? " S" : "", - path->preserve ? " P" : "", + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', bch2_btree_ids[path->btree_id], path->level, buf.buf, @@ -1974,8 +1948,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, __btree_path_get(path_pos, intent); path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); } else { - trace_btree_path_alloc(trans->fn, _RET_IP_, btree_id, &pos, locks_want); - path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -2150,8 +2122,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, - path->btree_id, &path->pos); + trace_trans_restart_relock_next_node(trans, _THIS_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } @@ -3185,7 +3156,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); + trace_trans_restart_mem_realloced(trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 5ca92b6bb397..3e3f35e29182 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -405,7 +405,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * static inline int btree_trans_too_many_iters(struct btree_trans *trans) { if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) { - trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); + trace_trans_restart_too_many_iters(trans, _THIS_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b05b40856e63..6e3d988f2112 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,8 +291,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, k = bch2_btree_path_peek_slot(path, &u); if (!bch2_btree_node_relock(trans, ck_path, 0)) { - 
trace_trans_restart_relock_key_cache_fill(trans->fn, - _THIS_IP_, ck_path->btree_id, &ck_path->pos); + trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); goto err; } @@ -420,7 +419,7 @@ fill: */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_key_cache_upgrade(trans->fn, _THIS_IP_); + trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7dcfe3009b84..5e2cd170aea2 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,6 +13,11 @@ #include "btree_iter.h" #include "six.h" +static inline bool is_btree_node(struct btree_path *path, unsigned l) +{ + return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); +} + /* matches six lock types */ enum btree_node_locked_type { BTREE_NODE_UNLOCKED = -1, @@ -306,4 +311,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, + struct btree_path *, struct btree *, unsigned); + #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fe69644b60a9..2190f288e21f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -994,8 +994,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, nr_nodes[1] += 1; if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, - path->btree_id, &path->pos); + trace_trans_restart_iter_upgrade(trans, _RET_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); return ERR_PTR(ret); } @@ -1053,7 +1052,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + trace_trans_restart_journal_preres_get(trans, _RET_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9c84eed32007..2d824f6a6fab 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -285,7 +285,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ret = bch2_trans_relock(trans); if (ret) { - trace_trans_restart_journal_preres_get(trans->fn, trace_ip); + trace_trans_restart_journal_preres_get(trans, trace_ip); return ret; } @@ -375,9 +375,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, * Keys returned by peek() are no longer valid pointers, so we need a * transaction restart: */ - trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, - path->btree_id, &path->pos, - old_u64s, new_u64s); + trace_trans_restart_key_cache_key_realloced(trans, _RET_IP_, path, old_u64s, new_u64s); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); } @@ -569,7 +567,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->fn, trace_ip); + trace_trans_restart_fault_inject(trans, trace_ip); return 
btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); } @@ -837,7 +835,7 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans->fn); + trace_trans_restart_would_deadlock_write(trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } @@ -970,8 +968,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_BTREE_NODE_FULL: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_trans_restart_btree_node_split(trans->fn, trace_ip, - i->btree_id, &i->path->pos); + trace_trans_restart_btree_node_split(trans, trace_ip, i->path); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -982,7 +979,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_mark_replicas(trans->fn, trace_ip); + trace_trans_restart_mark_replicas(trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); @@ -999,12 +996,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_res_get(trans->fn, trace_ip); + trace_trans_restart_journal_res_get(trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); + trace_trans_blocked_journal_reclaim(trans, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -1013,7 +1010,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + trace_trans_restart_journal_reclaim(trans, trace_ip); break; default: BUG_ON(ret >= 0); @@ -1116,8 +1113,7 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!i->path->should_be_locked); if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans->fn, _RET_IP_, - i->btree_id, &i->path->pos); + trace_trans_restart_upgrade(trans, _RET_IP_, i->path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); goto out; } @@ -1163,7 +1159,7 @@ retry: if (ret) goto err; - trace_transaction_commit(trans->fn, _RET_IP_); + trace_transaction_commit(trans, _RET_IP_); out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -1639,7 +1635,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter ck = (void *) iter->key_cache_path->l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); + trace_trans_restart_key_cache_raced(trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index eff1e3dfbcea..5c1d724cbb55 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -2,8 +2,10 @@ #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" -#include "btree_types.h" +#include "btree_iter.h" +#include "btree_locking.h" #include "keylist.h" +#include "opts.h" #include "six.h" #include diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5969a049df7d..931da79e027f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -7,21 +7,29 @@ #include +#define TRACE_BPOS_entries(name) \ + __field(u64, name##_inode ) \ + __field(u64, name##_offset ) \ + 
__field(u32, name##_snapshot ) + +#define TRACE_BPOS_assign(dst, src) \ + __entry->dst##_inode = (src).inode; \ + __entry->dst##_offset = (src).offset; \ + __entry->dst##_snapshot = (src).snapshot + DECLARE_EVENT_CLASS(bpos, TP_PROTO(struct bpos *p), TP_ARGS(p), TP_STRUCT__entry( - __field(u64, inode ) - __field(u64, offset ) + TRACE_BPOS_entries(p) ), TP_fast_assign( - __entry->inode = p->inode; - __entry->offset = p->offset; + TRACE_BPOS_assign(p, *p); ), - TP_printk("%llu:%llu", __entry->inode, __entry->offset) + TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) ); DECLARE_EVENT_CLASS(bkey, @@ -230,23 +238,22 @@ DECLARE_EVENT_CLASS(btree_node, TP_STRUCT__entry( __field(dev_t, dev ) __field(u8, level ) - __field(u8, id ) - __field(u64, inode ) - __field(u64, offset ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( __entry->dev = c->dev; __entry->level = b->c.level; - __entry->id = b->c.btree_id; - __entry->inode = b->key.k.p.inode; - __entry->offset = b->key.k.p.offset; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); ), - TP_printk("%d,%d %u id %u %llu:%llu", + TP_printk("%d,%d %u %s %llu:%llu:%u", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, __entry->id, - __entry->inode, __entry->offset) + __entry->level, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); DEFINE_EVENT(btree_node, btree_read, @@ -379,43 +386,36 @@ TRACE_EVENT(btree_cache_scan, ); TRACE_EVENT(btree_node_relock_fail, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned long node, - u32 iter_lock_seq, - u32 node_lock_seq), - TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) + TRACE_BPOS_entries(pos) __field(unsigned long, node ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->node = node; - __entry->iter_lock_seq = iter_lock_seq; - __entry->node_lock_seq = node_lock_seq; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->node = (unsigned long) btree_path_node(path, level); + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) ? 
path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->btree_id, + bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -425,48 +425,45 @@ TRACE_EVENT(btree_node_relock_fail, ); TRACE_EVENT(btree_node_upgrade_fail, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - bool locked, - struct six_lock_count self_lock_count, - struct six_lock_count lock_count), - TP_ARGS(trans_fn, caller_ip, btree_id, pos, - locked, self_lock_count, lock_count), + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) + TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) - __field(u8, read_count ) __field(u8, self_intent_count) + __field(u8, read_count ) __field(u8, intent_count ) ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + struct six_lock_count c; + + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->locked = locked; - __entry->self_read_count = self_lock_count.read; - __entry->self_intent_count = self_lock_count.intent; - __entry->read_count = lock_count.read; - __entry->intent_count = lock_count.intent; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->locked = btree_node_locked(path, level); + + c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level), + __entry->self_read_count = c.read; + __entry->self_intent_count = c.intent; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.read; + __entry->intent_count = c.intent; ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->btree_id, + bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -731,9 +728,9 @@ TRACE_EVENT(copygc_wait, ); DECLARE_EVENT_CLASS(transaction_event, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip), + TP_ARGS(trans, caller_ip), TP_STRUCT__entry( __array(char, trans_fn, 24 ) @@ -741,7 +738,7 @@ DECLARE_EVENT_CLASS(transaction_event, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), @@ -749,229 +746,206 @@ DECLARE_EVENT_CLASS(transaction_event, ); DEFINE_EVENT(transaction_event, transaction_commit, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, transaction_restart_injected, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long 
caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_fault_inject, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_traverse_all, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos), + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), TP_STRUCT__entry( __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %u pos %llu:%llu:%u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->btree_id, + bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + 
TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans_fn, caller_ip) + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, 
trans_restart_traverse, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) ); TRACE_EVENT(trans_restart_would_deadlock, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - bool in_traverse_all, unsigned reason, - enum btree_id have_btree_id, - unsigned have_iter_type, - struct bpos *have_pos, - enum btree_id want_btree_id, - unsigned want_iter_type, + struct btree_path *have, + struct btree_path *want, struct bpos *want_pos), - TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, - have_btree_id, have_iter_type, have_pos, - want_btree_id, want_iter_type, want_pos), + TP_ARGS(trans, caller_ip, reason, + have, want, want_pos), TP_STRUCT__entry( __array(char, trans_fn, 24 ) @@ -979,35 +953,24 @@ TRACE_EVENT(trans_restart_would_deadlock, __field(u8, in_traverse_all ) __field(u8, reason ) __field(u8, have_btree_id ) - __field(u8, have_iter_type ) + __field(u8, have_type ) __field(u8, want_btree_id ) - __field(u8, want_iter_type ) - - __field(u64, have_pos_inode ) - __field(u64, have_pos_offset ) - __field(u32, have_pos_snapshot) - __field(u32, want_pos_snapshot) - __field(u64, want_pos_inode ) - __field(u64, want_pos_offset ) + __field(u8, want_type ) + TRACE_BPOS_entries(have_pos) + TRACE_BPOS_entries(want_pos) ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __entry->in_traverse_all = in_traverse_all; + __entry->in_traverse_all = trans->in_traverse_all; __entry->reason = reason; - __entry->have_btree_id = have_btree_id; - __entry->have_iter_type = have_iter_type; - __entry->want_btree_id = want_btree_id; - __entry->want_iter_type = want_iter_type; - - __entry->have_pos_inode = have_pos->inode; - __entry->have_pos_offset = have_pos->offset; - __entry->have_pos_snapshot = have_pos->snapshot; - - __entry->want_pos_inode = want_pos->inode; - __entry->want_pos_offset = want_pos->offset; - __entry->want_pos_snapshot = want_pos->snapshot; + __entry->have_btree_id = have->btree_id; + __entry->have_type = have->cached; + __entry->want_btree_id = want->btree_id; + __entry->want_type = want->cached; + TRACE_BPOS_assign(have_pos, have->pos); + TRACE_BPOS_assign(want_pos, *want_pos); ), TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", @@ -1016,37 +979,37 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->in_traverse_all, __entry->reason, __entry->have_btree_id, - __entry->have_iter_type, + __entry->have_type, __entry->have_pos_inode, __entry->have_pos_offset, __entry->have_pos_snapshot, __entry->want_btree_id, - __entry->want_iter_type, + __entry->want_type, __entry->want_pos_inode, __entry->want_pos_offset, __entry->want_pos_snapshot) ); TRACE_EVENT(trans_restart_would_deadlock_write, - TP_PROTO(const char *trans_fn), - TP_ARGS(trans_fn), + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), TP_STRUCT__entry( __array(char, trans_fn, 24 ) ), 
TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ), TP_printk("%s", __entry->trans_fn) ); TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, unsigned long bytes), - TP_ARGS(trans_fn, caller_ip, bytes), + TP_ARGS(trans, caller_ip, bytes), TP_STRUCT__entry( __array(char, trans_fn, 24 ) @@ -1055,7 +1018,7 @@ TRACE_EVENT(trans_restart_mem_realloced, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->bytes = bytes; ), @@ -1067,32 +1030,28 @@ TRACE_EVENT(trans_restart_mem_realloced, ); TRACE_EVENT(trans_restart_key_cache_key_realloced, - TP_PROTO(const char *trans_fn, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, + struct btree_path *path, unsigned old_u64s, unsigned new_u64s), - TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), + TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), TP_STRUCT__entry( __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(enum btree_id, btree_id ) - __field(u64, inode ) - __field(u64, offset ) - __field(u32, snapshot ) + TRACE_BPOS_entries(pos) __field(u32, old_u64s ) __field(u32, new_u64s ) ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->inode = pos->inode; - __entry->offset = pos->offset; - __entry->snapshot = pos->snapshot; + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); __entry->old_u64s = old_u64s; __entry->new_u64s = new_u64s; ), @@ -1101,57 +1060,11 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], - __entry->inode, - __entry->offset, - __entry->snapshot, - __entry->old_u64s, - __entry->new_u64s) -); - -TRACE_EVENT(btree_path_alloc, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned locks_want), - TP_ARGS(trans_fn, caller_ip, btree_id, pos, locks_want), - - TP_STRUCT__entry( - __array(char, trans_fn, 24 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, locks_want ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - ), - - TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->locks_want = locks_want; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - ), - - TP_printk("%s %pS btree %u locks_want %u pos %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->locks_want, __entry->pos_inode, __entry->pos_offset, - __entry->pos_snapshot) -); - -DEFINE_EVENT(transaction_restart_iter, btree_path_free, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos), - TP_ARGS(trans_fn, caller_ip, btree_id, pos) + __entry->pos_snapshot, + __entry->old_u64s, + __entry->new_u64s) ); #endif /* _TRACE_BCACHEFS_H */ -- cgit From 
6fae65c112d9fb0a9827bad094e5633fdf2bcbda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 12:23:21 -0400 Subject: bcachefs: Kill BTREE_ITER_CACHED_(NOFILL|NOCREATE) These were used more prior to getting rid of the in-memory bucket arrays - they don't serve much purpose anymore, and deleting them lets us write better assertions. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +----- fs/bcachefs/btree_key_cache.c | 10 ++-------- fs/bcachefs/btree_types.h | 16 +++++++--------- 3 files changed, 10 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7049077d47bc..bd6e35fcbdbd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2016,11 +2016,7 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct EBUG_ON(ck && (path->btree_id != ck->key.btree_id || bkey_cmp(path->pos, ck->key.pos))); - - /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ - if (unlikely(!ck || !ck->valid)) - return bkey_s_c_null; - + EBUG_ON(!ck || !ck->valid); EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); *u = ck->k->k; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 6e3d988f2112..f0055f17381d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -372,11 +372,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (flags & BTREE_ITER_CACHED_NOCREATE) { - path->l[0].b = NULL; - return 0; - } - ck = btree_key_cache_create(c, path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) @@ -412,7 +407,7 @@ retry: path->l[0].lock_seq = ck->c.lock.state.seq; path->l[0].b = (void *) ck; fill: - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!ck->valid) { /* * Using the underscore version because we haven't set * path->uptodate yet: @@ -433,6 +428,7 @@ fill: set_bit(BKEY_CACHED_ACCESSED, &ck->flags); path->uptodate = BTREE_ITER_UPTODATE; + BUG_ON(!ck->valid); BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; @@ -462,8 +458,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_ALL_SNAPSHOTS); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index edeaf843cd3f..a8917596d777 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -199,15 +199,13 @@ struct btree_node_iter { #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_NOT_EXTENTS (1 << 5) #define BTREE_ITER_CACHED (1 << 6) -#define BTREE_ITER_CACHED_NOFILL (1 << 7) -#define BTREE_ITER_CACHED_NOCREATE (1 << 8) -#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) -#define BTREE_ITER_WITH_UPDATES (1 << 10) -#define BTREE_ITER_WITH_JOURNAL (1 << 11) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) -#define BTREE_ITER_NOPRESERVE (1 << 15) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 7) +#define BTREE_ITER_WITH_UPDATES (1 << 8) +#define BTREE_ITER_WITH_JOURNAL (1 << 9) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) +#define BTREE_ITER_NOPRESERVE (1 << 13) enum btree_path_uptodate 
{ BTREE_ITER_UPTODATE = 0, -- cgit From a300261ad19d69e080278ec4950d39caef3ffbf1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 20:05:14 -0400 Subject: bcachefs: Fix duplicate paths left by bch2_path_put() bch2_path_put() is supposed to drop paths that aren't needed on transaction restart, or to hold locks that we're supposed to keep until transaction commit: but it was failing to free paths in some cases that it should have, leading to transaction path overflows with lots of duplicate paths. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bd6e35fcbdbd..837cdcc5c77c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -424,13 +424,17 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, return 0; } -noinline __flatten -static int __bch2_btree_path_relock(struct btree_trans *trans, +__flatten +static bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - bool ret = btree_path_get_locks(trans, path, false); + return btree_path_get_locks(trans, path, false); +} - if (!ret) { +static int __bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { trace_trans_restart_relock_path(trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -1743,30 +1747,30 @@ out: static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next = prev_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = prev_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; - next = next_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = next_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; return NULL; } static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next = prev_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = prev_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; - next = next_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = next_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; return NULL; } @@ -1788,26 +1792,23 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte if (!__btree_path_put(path, intent)) return; - /* - * Perhaps instead we should check for duplicate paths in traverse_all: - */ - if (path->preserve && - (dup = have_path_at_pos(trans, path))) { - dup->preserve = true; - path->preserve = false; - goto free; - } + dup = path->preserve + ? 
have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); + + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + return; - if (!path->preserve && - (dup = have_node_at_pos(trans, path))) - goto free; - return; -free: if (path->should_be_locked && - !btree_node_locked(dup, path->level)) + !trans->restarted && + (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) return; - dup->should_be_locked |= path->should_be_locked; + if (dup) { + dup->preserve |= path->preserve; + dup->should_be_locked |= path->should_be_locked; + } + __bch2_path_free(trans, path); } -- cgit From 7c812ab786c4a689989aabf9e865164eb4f8004d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 13:23:04 -0400 Subject: bcachefs: Fix btree_path->uptodate inconsistency This fixes an assertion in bch2_btree_path_peek_slot(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2d824f6a6fab..1371b7c6ff8b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -726,8 +726,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, i->path, i->k); - else + else { bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } } return ret; -- cgit From 11c1a62f3b872d2345c97e72700ed4d1b2511888 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 17:25:25 -0400 Subject: bcachefs: Switch bch2_btree_delete_range() to bch2_trans_run() This fixes an assertion about unexpected transaction restarts - bch2_delete_range_trans() handles transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1371b7c6ff8b..6bf34853f261 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1785,9 +1785,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, unsigned update_flags, u64 *journal_seq) { - return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, - update_flags, journal_seq)); + return bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, update_flags, journal_seq)); } int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) -- cgit From 4aba7d4569f70167edf183055e809a37cd73cdd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 19:36:24 -0400 Subject: bcachefs: Rename lock_held_stats -> btree_transaction_stats Going to be adding more things to this in the next patch. 
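For context, the per-transaction-function bookkeeping behind this rename works roughly as sketched below. This is an illustrative simplification with a made-up helper name, not the code added by the patch itself (see bch2_trans_get_fn_idx() in the diff): each transaction is identified by the function name string it was started from, and that pointer is mapped to a slot in a small fixed table of per-function stats.

/*
 * Illustrative sketch only: map a transaction's fn string to a stats
 * slot, either by finding an existing match or by claiming the first
 * free entry; an out-of-range index means the table is full and no
 * stats are recorded.
 */
static unsigned fn_to_stats_idx(struct bch_fs *c, const char *fn)
{
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++)
                if (!c->btree_transaction_fns[i] ||
                    c->btree_transaction_fns[i] == fn) {
                        c->btree_transaction_fns[i] = fn;
                        return i;
                }

        return i;
}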
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 12 ++++++------ fs/bcachefs/btree_iter.c | 39 ++++++++++++++++++++++++++++++--------- fs/bcachefs/btree_locking.h | 30 +++++++++++++++++++++--------- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/debug.c | 17 ++++++++++------- 5 files changed, 68 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cd6f840b71a..9fe96516c114 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -319,8 +319,6 @@ BCH_DEBUG_PARAMS_DEBUG() #undef BCH_DEBUG_PARAM #endif -#define BCH_LOCK_TIME_NR 128 - #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -529,9 +527,10 @@ struct btree_debug { unsigned id; }; -struct lock_held_stats { - struct bch2_time_stats times[BCH_LOCK_TIME_NR]; - const char *names[BCH_LOCK_TIME_NR]; +#define BCH_TRANSACTIONS_NR 128 + +struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; }; struct bch_fs_pcpu { @@ -928,7 +927,8 @@ mempool_t bio_bounce_pages; struct bch2_time_stats times[BCH_TIME_STAT_NR]; - struct lock_held_stats lock_held_stats; + const char *btree_transaction_fns[BCH_TRANSACTIONS_NR]; + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 837cdcc5c77c..16d8391a5773 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3252,6 +3252,22 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) trans->updates = p; p += updates_bytes; } +static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c, + const char *fn) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++) + if (!c->btree_transaction_fns[i] || + c->btree_transaction_fns[i] == fn) { + c->btree_transaction_fns[i] = fn; + return i; + } + + pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); + return i; +} + void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes, @@ -3262,19 +3278,11 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->c = c; trans->fn = fn; trans->last_begin_time = ktime_get_ns(); + trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); trans->task = current; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); - while (c->lock_held_stats.names[trans->lock_name_idx] != fn - && c->lock_held_stats.names[trans->lock_name_idx] != 0) - trans->lock_name_idx++; - - if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) - pr_warn_once("lock_times array not big enough!"); - else - c->lock_held_stats.names[trans->lock_name_idx] = fn; - bch2_trans_alloc_paths(trans, c); if (expected_mem_bytes) { @@ -3446,6 +3454,13 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) void bch2_fs_btree_iter_exit(struct bch_fs *c) { + struct btree_transaction_stats *s; + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) + bch2_time_stats_exit(&s->lock_hold_times); + if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); @@ -3454,9 +3469,15 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) int bch2_fs_btree_iter_init(struct bch_fs *c) { + struct btree_transaction_stats *s; 
unsigned nr = BTREE_ITER_MAX; int ret; + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) + bch2_time_stats_init(&s->lock_hold_times); + INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 5e2cd170aea2..acc27c3c05d6 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -112,6 +112,26 @@ btree_lock_want(struct btree_path *path, int level) return BTREE_NODE_UNLOCKED; } +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? &trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + +static void btree_trans_lock_hold_time_update(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + struct btree_transaction_stats *s = btree_trans_stats(trans); + + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, + ktime_get_ns()); +#endif +} + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -121,15 +141,7 @@ static inline void btree_node_unlock(struct btree_trans *trans, if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { - struct bch_fs *c = trans->c; - - __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], - path->l[level].lock_taken_time, - ktime_get_ns()); - } -#endif + btree_trans_lock_hold_time_update(trans, path, level); } mark_btree_node_unlocked(path, level); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a8917596d777..a49b3cd3baf8 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -390,6 +390,7 @@ struct btree_trans { struct task_struct *task; int srcu_idx; + u8 fn_idx; u8 nr_sorted; u8 nr_updates; bool used_mempool:1; @@ -431,7 +432,6 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; - int lock_name_idx; }; #define BTREE_FLAGS() \ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index b9b6cad8cd40..d40846f99f52 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -653,14 +653,17 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct lock_held_stats *lhs = &i->c->lock_held_stats; + struct bch_fs *c = i->c; int err; i->ubuf = buf; i->size = size; i->ret = 0; - while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { + while (i->iter < ARRAY_SIZE(c->btree_transaction_fns) && + c->btree_transaction_fns[i->iter]) { + struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; + err = flush_buf(i); if (err) return err; @@ -668,11 +671,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, if (!i->size) break; - prt_printf(&i->buf, "%s:", lhs->names[i->iter]); + prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); prt_newline(&i->buf); - printbuf_indent_add(&i->buf, 8); - bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); - printbuf_indent_sub(&i->buf, 8); + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + printbuf_indent_sub(&i->buf, 2); 
prt_newline(&i->buf); i->iter++; } @@ -721,7 +724,7 @@ void bch2_fs_debug_init(struct bch_fs *c) c->btree_debug, &journal_pins_ops); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &lock_held_stats_op); } -- cgit From 5c0bb66ae341c71e5f62c193ea4d7b0cf278a914 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 20:14:54 -0400 Subject: bcachefs: Track the maximum btree_paths ever allocated by each transaction We need a way to check if the machinery for handling btree_paths within a transaction is behaving reasonably, as it often has not been - we've had bugs with transaction path overflows caused by duplicate paths and plenty of other things. This patch tracks, per transaction fn, the most btree paths ever allocated by that transaction and makes it available in debugfs. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++ fs/bcachefs/btree_iter.c | 117 +++++++++++++++++++++++++++++++++++----------- fs/bcachefs/btree_iter.h | 2 + fs/bcachefs/btree_types.h | 1 + fs/bcachefs/debug.c | 30 ++++++++++-- 5 files changed, 120 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9fe96516c114..f8b7434534eb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -531,6 +531,9 @@ struct btree_debug { struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; + struct mutex lock; + unsigned nr_max_paths; + char *max_paths_text; }; struct bch_fs_pcpu { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 16d8391a5773..ff0834049d94 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -775,6 +775,8 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, unsigned idx; struct printbuf buf = PRINTBUF; + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -1812,6 +1814,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte __bch2_path_free(trans, path); } +noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; @@ -1853,40 +1856,87 @@ void bch2_dump_trans_updates(struct btree_trans *trans) } noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) +void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +{ + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + path->idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ?
'S' : ' ', + bch2_btree_ids[path->btree_id], + path->level); + bch2_bpos_to_text(out, path->pos); + + prt_printf(out, " locks %u", path->nodes_locked); +#ifdef CONFIG_BCACHEFS_DEBUG + prt_printf(out, " %pS", (void *) path->ip_allocated); +#endif + prt_newline(out); +} + +noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) { struct btree_path *path; - struct printbuf buf = PRINTBUF; unsigned idx; - btree_trans_sort_paths(trans); + if (!nosort) + btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) { - printbuf_reset(&buf); + trans_for_each_path_inorder(trans, path, idx) + bch2_btree_path_to_text(out, path); +} - bch2_bpos_to_text(&buf, path->pos); +noinline __cold +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + __bch2_trans_paths_to_text(out, trans, false); +} - printk(KERN_ERR "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos %s locks %u %pS\n", - path->idx, path->ref, path->intent_ref, - path->preserve ? 'P' : ' ', - path->should_be_locked ? 'S' : ' ', - bch2_btree_ids[path->btree_id], - path->level, - buf.buf, - path->nodes_locked, -#ifdef CONFIG_BCACHEFS_DEBUG - (void *) path->ip_allocated -#else - NULL -#endif - ); - } +noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) +{ + struct printbuf buf = PRINTBUF; + + __bch2_trans_paths_to_text(&buf, trans, nosort); + printk(KERN_ERR "%s", buf.buf); printbuf_exit(&buf); bch2_dump_trans_updates(trans); } +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + __bch2_dump_trans_paths_updates(trans, false); +} + +noinline __cold +static void bch2_trans_update_max_paths(struct btree_trans *trans) +{ + struct btree_transaction_stats *s = btree_trans_stats(trans); + struct printbuf buf = PRINTBUF; + + if (!s) + return; + + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { + mutex_lock(&s->lock); + if (s->nr_max_paths < hweight64(trans->paths_allocated)) { + s->nr_max_paths = hweight64(trans->paths_allocated); + swap(s->max_paths_text, buf.buf); + } + mutex_unlock(&s->lock); + } + + printbuf_exit(&buf); + + trans->nr_max_paths = hweight64(trans->paths_allocated); +} + static struct btree_path *btree_path_alloc(struct btree_trans *trans, struct btree_path *pos) { @@ -1903,7 +1953,6 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, trans->paths_allocated |= 1ULL << idx; path = &trans->paths[idx]; - path->idx = idx; path->ref = 0; path->intent_ref = 0; @@ -1911,6 +1960,10 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, path->nodes_intent_locked = 0; btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; + + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); return path; } @@ -1929,8 +1982,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, btree_trans_sort_paths(trans); - btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, i) { if (__btree_path_cmp(path, btree_id, @@ -2926,7 +2977,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { - bch2_dump_trans_paths_updates(trans); + __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); } prev = path; @@ -2949,7 +3000,7 @@ void __bch2_btree_trans_sort_paths(struct btree_trans *trans) /* * Cocktail shaker sort: this is efficient because 
iterators will be - * mostly sorteda. + * mostly sorted. */ do { swapped = false; @@ -3274,6 +3325,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) __acquires(&c->btree_trans_barrier) { + struct btree_transaction_stats *s; + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; @@ -3297,6 +3350,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, } } + s = btree_trans_stats(trans); + if (s) + trans->nr_max_paths = s->nr_max_paths; + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { @@ -3458,8 +3515,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) + s++) { + kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); + } if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); @@ -3475,8 +3534,10 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) + s++) { bch2_time_stats_init(&s->lock_hold_times); + mutex_init(&s->lock); + } INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3e3f35e29182..aa4d2a5df34e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -571,6 +571,8 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); void __bch2_trans_init(struct btree_trans *, struct bch_fs *, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a49b3cd3baf8..0a5803a3a75d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -408,6 +408,7 @@ struct btree_trans { * extent: */ unsigned extra_journal_res; + unsigned nr_max_paths; u64 paths_allocated; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d40846f99f52..c982b0d80c91 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -674,7 +674,29 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + + mutex_lock(&s->lock); + + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { + prt_printf(&i->buf, "Lock hold times:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + printbuf_indent_sub(&i->buf, 2); + } + + if (s->max_paths_text) { + prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + prt_str_indented(&i->buf, s->max_paths_text); + printbuf_indent_sub(&i->buf, 2); + } + + mutex_unlock(&s->lock); + printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); i->iter++; @@ -723,10 +745,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); - if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { 
- debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &lock_held_stats_op); - } + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, + c, &lock_held_stats_op); c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) -- cgit From ff7dc3651d5bdcc9d9fe4ace3da21f0f5c2bd778 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 21:06:02 -0400 Subject: bcachefs: Print last line in debugfs/btree_transaction_stats We need to turn the flush_buf() thing into a proper API, to replace seq_file. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index c982b0d80c91..183e9f099ca9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -660,8 +660,7 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - while (i->iter < ARRAY_SIZE(c->btree_transaction_fns) && - c->btree_transaction_fns[i->iter]) { + while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; err = flush_buf(i); @@ -671,6 +670,10 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, if (!i->size) break; + if (i->iter == ARRAY_SIZE(c->btree_transaction_fns) || + !c->btree_transaction_fns[i->iter]) + break; + prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); -- cgit From 45b033fa1afd35a8eab0af003ffac9413548f476 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Aug 2022 21:06:43 -0400 Subject: bcachefs: Fix assertion in bch2_btree_key_cache_drop() Turns out this assertion was something we could legitimately hit - add a comment describing what's going on, and handle it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f0055f17381d..0e87c19effeb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -639,11 +639,22 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - ck->valid = false; + BUG_ON(!ck->valid); - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + /* + * We just did an update to the btree, bypassing the key cache: the key + * cache key is now stale and must be dropped, even if dirty: + */ + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + bch2_journal_pin_drop(&c->journal, &ck->journal); + } + + ck->valid = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, -- cgit From c497df8b85a7be22373d3d2e57e067285ebcd731 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Aug 2022 12:45:01 -0400 Subject: bcachefs: Increment restart count in bch2_trans_begin() Instead of counting transaction restarts, count when the transaction is restarted: if bch2_trans_begin() was called when the transaction wasn't restarted we need to ensure restart_count is still incremented. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 1 - fs/bcachefs/fsck.c | 10 ++++------ 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ff0834049d94..08f39687e964 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3229,6 +3229,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_reset_updates(trans); + trans->restart_count++; trans->mem_top = 0; if (trans->fs_usage_deltas) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index aa4d2a5df34e..c0b3c9d06505 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -218,7 +218,6 @@ static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int er BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); trans->restarted = err; - trans->restart_count++; return -err; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c93e177a314f..ef2e32864580 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1149,13 +1149,11 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } } fsck_err: - if (ret) { + if (ret) bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); - return ret; - } - if (trans_was_restarted(trans, restart_count)) - return -BCH_ERR_transaction_restart_nested; - return 0; + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, -- cgit From c59d66b51b11064f17d87d5b9695e0216b8ade99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Aug 2022 14:01:56 -0400 Subject: bcachefs: Fix bch2_fs_check_snapshots() We were iterating starting at BCACHEFS_ROOT_INO, but snapshots start at POS_MIN - meaning this code was never getting run. Signed-off-by: Kent Overstreet Reported-by: Olexa Bilaniuk --- fs/bcachefs/subvolume.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 24244bc3d2fb..fb3f8e4074c7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -278,8 +278,8 @@ int bch2_fs_check_snapshots(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, - POS(BCACHEFS_ROOT_INO, 0), + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshots, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_snapshot(&trans, &iter, k)); -- cgit From 9375fbc20079ed0c8a10d8d387b2c173a3dc04d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Aug 2022 16:11:35 -0400 Subject: bcachefs: Debugfs cleanup This improves flush_buf() so that it always returns nonzero when we're done reading and ready to return to userspace, and so that it returns the value we want to return to userspace (number of bytes read, if there wasn't an error). In the future we'll be better abstracting this mechanism and pulling it out of bcachefs, and using it to replace seq_file. 
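With that convention a debugfs read handler reduces to a simple loop. A minimal sketch of the shape (not taken from this patch; example_read() and the two helpers marked hypothetical are made up, while struct dump_iter and flush_buf() are the ones in debug.c):

    static ssize_t example_read(struct file *file, char __user *buf,
                                size_t size, loff_t *ppos)
    {
        struct dump_iter *i = file->private_data;
        ssize_t ret = 0;

        i->ubuf = buf;
        i->size = size;
        i->ret  = 0;

        while (example_have_more(i)) {          /* hypothetical */
            ret = flush_buf(i);
            if (ret)        /* fault, or userspace buffer now full */
                break;

            example_print_next(&i->buf, i);     /* hypothetical */
        }

        if (!ret)
            ret = flush_buf(i);

        return ret ?: i->ret;   /* error, or bytes copied so far */
    }
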
Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 115 +++++++++++++++++++++++----------------------------- 1 file changed, 51 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 183e9f099ca9..86c4b023ac7c 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -198,7 +198,7 @@ struct dump_iter { ssize_t ret; /* bytes read so far */ }; -static int flush_buf(struct dump_iter *i) +static ssize_t flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); @@ -214,7 +214,7 @@ static int flush_buf(struct dump_iter *i) memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); } - return 0; + return i->size ? 0 : i->ret; } static int bch2_dump_open(struct inode *inode, struct file *file) @@ -252,7 +252,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; @@ -260,14 +260,11 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - err = for_each_btree_key2(&trans, iter, i->id, i->from, + ret = for_each_btree_key2(&trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ - err = flush_buf(i); - if (err) - break; - - if (!i->size) + ret = flush_buf(i); + if (ret) break; bch2_bkey_val_to_text(&i->buf, i->c, k); @@ -276,12 +273,12 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, })); i->from = iter.pos; - if (!err) - err = flush_buf(i); + if (!ret) + ret = flush_buf(i); bch2_trans_exit(&trans); - return err ?: i->ret; + return ret ?: i->ret; } static const struct file_operations btree_debug_ops = { @@ -298,43 +295,39 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct btree *b; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; + ret = flush_buf(i); + if (ret) + return ret; - if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) + if (!bpos_cmp(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { - bch2_btree_node_to_text(&i->buf, i->c, b); - err = flush_buf(i); - if (err) + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + ret = flush_buf(i); + if (ret) break; - /* - * can't easily correctly restart a btree node traversal across - * all nodes, meh - */ + bch2_btree_node_to_text(&i->buf, i->c, b); i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; - - if (!i->size) - break; } bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return err < 0 ? 
err : i->ret; + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; } static const struct file_operations btree_format_debug_ops = { @@ -351,33 +344,27 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int err; + ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; + ret = flush_buf(i); + if (ret) + return ret; bch2_trans_init(&trans, i->c, 0, 0); - err = for_each_btree_key2(&trans, iter, i->id, i->from, + ret = for_each_btree_key2(&trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); - err = flush_buf(i); - if (err) - break; - - if (!i->size) + ret = flush_buf(i); + if (ret) break; if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { @@ -390,12 +377,12 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, })); i->from = iter.pos; - if (!err) - err = flush_buf(i); - bch2_trans_exit(&trans); - return err ?: i->ret; + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; } static const struct file_operations bfloat_failed_debug_ops = { @@ -466,7 +453,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; bool done = false; - int err; + ssize_t ret = 0; i->ubuf = buf; i->size = size; @@ -477,12 +464,9 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, struct rhash_head *pos; struct btree *b; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - break; + ret = flush_buf(i); + if (ret) + return ret; rcu_read_lock(); i->buf.atomic++; @@ -500,9 +484,12 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, } while (!done); if (i->buf.allocation_failure) - return -ENOMEM; + ret = -ENOMEM; - return i->ret; + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; } static const struct file_operations cached_btree_nodes_ops = { @@ -539,7 +526,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; struct btree_trans *trans; - int err; + ssize_t ret = 0; i->ubuf = buf; i->size = size; @@ -550,12 +537,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, if (trans->task->pid <= i->iter) continue; - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - break; + ret = flush_buf(i); + if (ret) + return ret; bch2_btree_trans_to_text(&i->buf, trans); @@ -571,9 +555,12 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, mutex_unlock(&c->btree_trans_lock); if (i->buf.allocation_failure) - return -ENOMEM; + ret = -ENOMEM; - return i->ret; + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; } static const struct file_operations btree_transactions_ops = { -- cgit From efa8a7014d288b713781404367b54ef10aa2477f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 Aug 2022 14:20:22 -0400 Subject: bcachefs: remove dead whiteout_u64s argument. 
Signed-off-by: Olexa Bilaniuk Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 4254f7c7d85e..6e39c1641b90 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -773,8 +773,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, } static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned *whiteout_u64s, - int write, bool have_retry) + struct bset *i, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -910,7 +909,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, } while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors, whiteout_u64s = 0; + unsigned sectors; struct nonce nonce; struct bch_csum csum; bool first = !b->written; @@ -979,8 +978,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, &whiteout_u64s, - READ, have_retry); + ret = validate_bset_keys(c, b, i, READ, have_retry); if (ret) goto fsck_err; @@ -1006,11 +1004,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (blacklisted && !first) continue; - sort_iter_add(iter, i->start, - vstruct_idx(i, whiteout_u64s)); - sort_iter_add(iter, - vstruct_idx(i, whiteout_u64s), + vstruct_idx(i, 0), vstruct_last(i)); nonblacklisted_written = b->written; @@ -1740,7 +1735,6 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - unsigned whiteout_u64s = 0; struct printbuf buf = PRINTBUF; int ret; @@ -1753,7 +1747,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (ret) return ret; - ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + ret = validate_bset_keys(c, b, i, WRITE, false) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); -- cgit From 15bc0948e73d9a858a6b69fc4eb34d176436044c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Aug 2022 14:05:44 -0400 Subject: bcachefs: Add an overflow check in set_bkey_val_u64s() For now this is just a BUG_ON() - we may want to change this to return an error in the future. 
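The overflow is possible because bkey.u64s is only a u8: an oversized value would be silently truncated when assigned to k->u64s and the key would advertise the wrong size. If this does get turned into an error later, a checked variant might look something like this (sketch only, not part of this patch):

    static inline int set_bkey_val_u64s_checked(struct bkey *k, unsigned val_u64s)
    {
        unsigned u64s = BKEY_U64s + val_u64s;

        if (u64s > U8_MAX)      /* would overflow the u8 k->u64s field */
            return -EOVERFLOW;

        k->u64s = u64s;
        return 0;
    }
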
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 6a637a408a9f..8e9405f89537 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -50,12 +50,15 @@ static inline size_t bkey_val_bytes(const struct bkey *k) static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) { - k->u64s = BKEY_U64s + val_u64s; + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; } static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) { - k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); } #define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) -- cgit From bbf4288401519a7554201caf9b945c79f29753b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Aug 2022 14:20:48 -0400 Subject: bcachefs: Always rebuild aux search trees when node boundaries change Topology repair may change btree node min/max keys: when it does so, we need to always rebuild eytzinger search trees because nodes directly depend on those values. This fixes a bug found by the 'kill_btree_node' test, where we'd pop an assertion in bch2_bset_search_linear(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6e39c1641b90..bd74bd31dd1f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -611,7 +611,6 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) (u64 *) vstruct_end(i) - (u64 *) k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); } for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) @@ -621,10 +620,14 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) if (k != vstruct_last(i)) { i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); } } + /* + * Always rebuild search trees: eytzinger search tree nodes directly + * depend on the values of min/max key: + */ + bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { -- cgit From 223b560e02098502b4e1c87aa9767620852d1bfd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Aug 2022 18:55:20 -0400 Subject: bcachefs: btree_path_down() optimization We should be calling btree_node_mem_ptr_set() before path_level_init(), since we already touched the key that btree_node_mem_ptr_set() will modify and path_level_init() will be doing the lookup in the child btree node we're recursing to. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 08f39687e964..a464327d7024 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1410,9 +1410,6 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(trans, path, level, lock_type); - btree_path_level_init(trans, path, b); - if (likely(!trans->journal_replay_not_finished && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) @@ -1420,7 +1417,10 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); + + mark_btree_node_locked(trans, path, level, lock_type); path->level = level; + btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(path); err: -- cgit From f0d2e9f2e511c137b75f15d0d13abd0217239253 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Jul 2022 00:37:46 -0400 Subject: bcachefs: Add assertions for unexpected transaction restarts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 12 +++++++++--- fs/bcachefs/btree_update_interior.c | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c0b3c9d06505..51beeddcd45e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -427,13 +427,17 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define lockrestart_do(_trans, _do) \ ({ \ + u32 _restart_count; \ int _ret; \ \ do { \ - bch2_trans_begin(_trans); \ + _restart_count = bch2_trans_begin(_trans); \ _ret = (_do); \ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ _ret; \ }) @@ -474,7 +478,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, (_start), (_flags)); \ \ while (1) { \ - bch2_trans_begin(_trans); \ + u32 _restart_count = bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ if (!(_k).k) { \ _ret = 0; \ @@ -486,6 +490,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, continue; \ if (_ret) \ break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ break; \ } \ @@ -503,7 +508,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, (_start), (_flags)); \ \ while (1) { \ - bch2_trans_begin(_trans); \ + u32 _restart_count = bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ if (!(_k).k) { \ _ret = 0; \ @@ -515,6 +520,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, continue; \ if (_ret) \ break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_rewind(&(_iter))) \ break; \ } \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2190f288e21f..e10c159ec079 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -969,6 +969,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned update_level = level; int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; + u32 restart_count = trans->restart_count; BUG_ON(!path->should_be_locked); @@ -1094,6 +1095,7 @@ 
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; + bch2_trans_verify_not_restarted(trans, restart_count); return as; err: bch2_btree_update_free(as); -- cgit From d0b50524f1d9b60318b92830546b45cd3325cfe2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Aug 2022 14:44:17 -0400 Subject: bcachefs: bch2_bkey_packed_to_binary_text() For debugging the eytzinger search tree code, and low level bkey packing code, it can be helpful to see things in binary: this patch improves our helpers for doing so. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 80 ++++++++++++++++++++++++++++++++++++------------------ fs/bcachefs/bkey.h | 4 ++- fs/bcachefs/util.c | 6 ++++ fs/bcachefs/util.h | 2 ++ 4 files changed, 65 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index a1115abf83bb..52af6f370eb9 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -19,33 +19,49 @@ const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, const struct bkey_packed *); -void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) { - unsigned bit = high_bit_offset, done = 0; + const u64 *p = high_word(f, k); + unsigned word_bits = 64 - high_bit_offset; + unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; + u64 v = *p & (~0ULL >> high_bit_offset); + + if (!nr_key_bits) { + prt_str(out, "(empty)"); + return; + } while (1) { - while (bit < 64) { - if (done && !(done % 8)) - *out++ = ' '; - *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; - bit++; - done++; - if (done == nr_bits) { - *out++ = '\0'; - return; - } + unsigned next_key_bits = nr_key_bits; + + if (nr_key_bits < 64) { + v >>= 64 - nr_key_bits; + next_key_bits = 0; + } else { + next_key_bits -= 64; } + bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + + if (!next_key_bits) + break; + + prt_char(out, ' '); + p = next_word(p); - bit = 0; + v = *p; + word_bits = 64; + nr_key_bits = next_key_bits; } } #ifdef CONFIG_BCACHEFS_DEBUG static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) + const struct bkey *unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -57,23 +73,35 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, tmp = __bch2_bkey_unpack_key(format, packed); if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - char buf3[160], buf4[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_to_text(&buf1, unpacked); - bch2_bkey_to_text(&buf2, &tmp); - bch2_to_binary(buf3, (void *) unpacked, 80); - bch2_to_binary(buf4, high_word(format, packed), 80); - - panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", + prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", format->key_u64s, format->bits_per_field[0], format->bits_per_field[1], format->bits_per_field[2], format->bits_per_field[3], - format->bits_per_field[4], - buf1.buf, buf2.buf, buf3, buf4); + format->bits_per_field[4]); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_to_text(&buf, unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_to_text(&buf, &tmp); + prt_newline(&buf); + + prt_printf(&buf, "compiled unpack: "); + 
bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) &tmp); + prt_newline(&buf); + + panic("%s", buf.buf); } } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 8e9405f89537..2e7e6b6b4af7 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -20,7 +20,9 @@ #endif #endif -void bch2_to_binary(char *, const u64 *, unsigned); +void bch2_bkey_packed_to_binary_text(struct printbuf *, + const struct bkey_format *, + const struct bkey_packed *); /* bkey with split value, const */ struct bkey_s_c { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index cb07ef2ceb59..61cd44c5a6b4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -238,6 +238,12 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } +void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +{ + while (nr_bits) + prt_char(out, '0' + ((v >> --nr_bits) & 1)); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index d5b19b1b2020..192d8b53f2ca 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -380,6 +380,8 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -- cgit From 8192f8a58626ca3903a20f942ce86a6beb959a71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Aug 2022 03:08:15 -0400 Subject: bcachefs: Another should_be_locked fixup When returning a key from the key cache, in BTREE_ITER_WITH_KEY_CACHE mode, we don't want to set should_be_locked on iter->path; we're not returning a key from that path, so we don't need to, and also since we traversed the key cache iterator before setting should_be_locked on that path it might be unlocked (if we unlocked, bch2_trans_relock() won't have relocked it). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a464327d7024..0c6e215f46c8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2334,7 +2334,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; @@ -2361,6 +2361,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos return bch2_btree_path_peek_slot(iter->key_cache_path, &u); } +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); + int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); + + return err ?
bkey_s_c_err(err) : ret; +} + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; @@ -2390,15 +2399,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { - ret = bkey_err(k2); + k = k2; + ret = bkey_err(k); if (ret) { - k = k2; bch2_btree_iter_set_pos(iter, iter->pos); goto out; } - - k = k2; - iter->k = *k.k; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) @@ -2857,13 +2863,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { - if (bkey_err(k)) { - goto out_no_locked; - } else { + (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) iter->k = *k.k; - goto out; - } + /* We're not returning a key from iter->path: */ + goto out_no_locked; } k = bch2_btree_path_peek_slot(iter->path, &iter->k); -- cgit From 23dfb3a2f7757593249b745f2a17b56b756d2874 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Aug 2022 17:49:12 -0400 Subject: bcachefs: Fix bch2_btree_iter_peek_slot() error path iter->k needs to be consistent with iter->pos - required for bch2_btree_iter_(rewind|advance) to work correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0c6e215f46c8..88b2ed1d508a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2844,8 +2844,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { @@ -2895,7 +2897,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos pos = iter->pos; k = bch2_btree_iter_peek(iter); - iter->pos = pos; + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; } if (unlikely(bkey_err(k))) -- cgit From efd0d03816dd0d5127217220f85f1f0e621974d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Aug 2022 22:17:08 -0400 Subject: bcachefs: Minor transaction restart handling fix - fsck_inode_rm() wasn't returning BCH_ERR_transaction_restart_nested - change bch2_trans_verify_not_restarted() to call panic() - we don't want these errors to be missed Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/fsck.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 88b2ed1d508a..95bc71dd87af 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3289,10 +3289,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) { - bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, - "trans->restart_count %u, should be %u, last restarted by %ps\n", - trans->restart_count, restart_count, - (void *) trans->last_restarted_ip); + if 
(trans_was_restarted(trans, restart_count)) + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_restarted_ip); } static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ef2e32864580..1a841146e379 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -290,7 +290,7 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret; + return ret ?: -BCH_ERR_transaction_restart_nested; } static int __remove_dirent(struct btree_trans *trans, struct bpos pos) @@ -914,7 +914,7 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); - if (ret) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error while deleting inode: %s", bch2_err_str(ret)); return ret; -- cgit From 42590b53fef427f96fc50da4974923564e9033cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Aug 2022 13:00:26 -0400 Subject: bcachefs: bch2_btree_delete_range_trans() now returns -BCH_ERR_transaction_restart_nested The new convention is that functions that handle transaction restarts within an existing transaction context should return -BCH_ERR_transaction_restart_nested when they did so, since they invalidated the outer transaction context. This also means bch2_btree_delete_range_trans() is changed to only call bch2_trans_begin() after a transaction restart, not on every loop iteration. This is to fix a bug in fsck, in check_inode() when we truncate an inode with BCH_INODE_I_SIZE_DIRTY set. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 19 ++++++++++++++----- fs/bcachefs/fsck.c | 30 ++++++++++++++++++------------ 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6bf34853f261..11c3767896aa 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1713,15 +1713,16 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, unsigned update_flags, u64 *journal_seq) { + u32 restart_count = trans->restart_count; struct btree_iter iter; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); retry: - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(&iter)).k) && - !(ret = bkey_err(k)) && + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k) ?: + btree_trans_too_many_iters(trans)) && bkey_cmp(iter.pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); @@ -1767,11 +1768,15 @@ retry: } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); ret = 0; goto retry; } bch2_trans_iter_exit(trans, &iter); + + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; return ret; } @@ -1785,8 +1790,12 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, unsigned update_flags, u64 *journal_seq) { - return bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, update_flags, journal_seq)); + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; } int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) diff 
--git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1a841146e379..ff10f09eee56 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -19,6 +19,10 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* + * XXX: this is handling transaction restarts without returning + * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: + */ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u32 snapshot) { @@ -239,18 +243,20 @@ static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) struct bkey_s_c k; int ret; - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); if (ret) goto err; retry: -- cgit From 12043cf1511420ecf38f4925a0089c1ae1aa058b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Aug 2022 17:00:12 -0400 Subject: bcachefs: fsck: Another transaction restart handling fix Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 ++ fs/bcachefs/fsck.c | 12 +++++------- fs/bcachefs/trace.h | 11 +++++++++-- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c0d6a48d3c72..f9eb147fe229 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -248,6 +248,8 @@ next: } continue; nomatch: + trace_data_update_fail(&old.k->p); + if (m->ctxt) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ff10f09eee56..9f768d774ba6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -728,7 +728,7 @@ static int __get_visible_inodes(struct btree_trans *trans, w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); @@ -1372,13 +1372,11 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } } fsck_err: - if (ret) { + if (ret) bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); - return ret; - } - if (trans_was_restarted(trans, restart_count)) - return -BCH_ERR_transaction_restart_nested; - return 0; + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; } static int check_dirent_target(struct btree_trans *trans, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 931da79e027f..9353191c4fc8 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -18,7 +18,7 @@ __entry->dst##_snapshot = (src).snapshot DECLARE_EVENT_CLASS(bpos, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p), 
TP_STRUCT__entry( @@ -225,7 +225,7 @@ TRACE_EVENT(journal_reclaim_finish, /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p) ); @@ -727,6 +727,13 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); +DEFINE_EVENT(bpos, data_update_fail, + TP_PROTO(const struct bpos *p), + TP_ARGS(p) +); + +/* btree transactions: */ + DECLARE_EVENT_CLASS(transaction_event, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), -- cgit From 02afcb8c26b14ae317754d8c79339f41b3dfeaae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Aug 2022 17:57:24 -0400 Subject: bcachefs: Fix adding a device with a label Device labels are represented as pointers in the member info section: we need to get and then set the label for it to be kept correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 28 +++++++++++++++------------- fs/bcachefs/disk_groups.h | 1 + fs/bcachefs/super.c | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 33d2702e6849..5f405d38b3de 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -384,32 +384,34 @@ inval: prt_printf(out, "invalid label %u", v); } -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; - int v = -1; - int ret = 0; - - mutex_lock(&c->sb_lock); + int ret, v = -1; if (!strlen(name) || !strcmp(name, "none")) - goto write_sb; + return 0; v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) { - mutex_unlock(&c->sb_lock); + if (v < 0) return v; - } ret = bch2_sb_disk_groups_to_cpu(c); if (ret) - goto unlock; -write_sb: + return ret; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); + return 0; +} - bch2_write_super(c); -unlock: +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + int ret; + + mutex_lock(&c->sb_lock); + ret = __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index de915480514b..e4470c357a66 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -82,6 +82,7 @@ void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb * int bch2_sb_disk_groups_to_cpu(struct bch_fs *); +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fe7938e7e07b..1c8fac603644 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1522,6 +1522,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; + struct printbuf label = PRINTBUF; int ret; ret = bch2_read_super(path, &opts, &sb); @@ -1532,6 +1533,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + if (BCH_MEMBER_GROUP(&dev_mi)) { + bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; + } + } + err = bch2_dev_may_add(sb.sb, c); if (err) { bch_err(c, 
"device add error: %s", err); @@ -1612,6 +1621,14 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + if (ret) { + bch_err(c, "device add error: error setting label"); + goto err_unlock; + } + } + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1644,6 +1661,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); + printbuf_exit(&label); printbuf_exit(&errbuf); return ret; err_late: -- cgit From cd5afabea1acd2bc351ec08d59511302b397f150 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Aug 2022 15:35:34 -0400 Subject: bcachefs: btree_locking.c Start to centralize some of the locking code in a new file; more locking code will be moving here in the future. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/btree_iter.c | 444 +------------------------------------------- fs/bcachefs/btree_iter.h | 16 -- fs/bcachefs/btree_locking.c | 442 +++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_locking.h | 41 ++++ fs/bcachefs/btree_types.h | 8 + 6 files changed, 494 insertions(+), 458 deletions(-) create mode 100644 fs/bcachefs/btree_locking.c (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 2f4bd31c862f..e23667548e09 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -13,6 +13,7 @@ bcachefs-y := \ btree_io.o \ btree_iter.o \ btree_key_cache.o \ + btree_locking.o \ btree_update_interior.o \ btree_update_leaf.o \ buckets.o \ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 95bc71dd87af..488b56a209e3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -118,444 +118,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree node locking: */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, - struct btree_path *skip, - struct btree *b, - unsigned level) -{ - struct btree_path *path; - struct six_lock_count ret = { 0, 0 }; - - if (IS_ERR_OR_NULL(b)) - return ret; - - trans_for_each_path(trans, path) - if (path != skip && path->l[level].b == b) { - ret.read += btree_node_read_locked(path, level); - ret.intent += btree_node_intent_locked(path, level); - } - - return ret; -} - -static inline void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (!lock->readers) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - this_cpu_add(*lock->readers, nr); -} - -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) -{ - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - six_lock_readers_add(&b->c.lock, -readers); - six_lock_write(&b->c.lock, NULL, NULL); - six_lock_readers_add(&b->c.lock, readers); -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (!is_btree_node(path, level)) - goto fail; - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, 
path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && - b != ERR_PTR(-BCH_ERR_no_btree_node_init)) - trace_btree_node_relock_fail(trans, _RET_IP_, path, level); - return false; -} - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_intent_locked(trans, path, level); - return true; -} - -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade) -{ - unsigned l = path->level; - int fail_idx = -1; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; - - l++; - } while (l < path->locks_want); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - bch2_trans_verify_locks(trans); - - return path->uptodate < BTREE_ITER_NEED_RELOCK; -} - -static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - bool cached) -{ - return !cached - ? 
container_of(_b, struct btree, c)->key.k.p - : container_of(_b, struct bkey_cached, c)->key.pos; -} - -/* Slowpath: */ -int __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { - reason = 1; - goto deadlock; - } - - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > __fls(linked->nodes_locked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - linked->cached)) <= 0) { - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); -} - -/* Btree iterator locking: */ - -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_locks(struct btree_path *path) -{ - unsigned l; - - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); - return; - } - - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); -} - -void bch2_trans_verify_locks(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_verify_locks(path); -} -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -#endif - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -int bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans, _RET_IP_, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); - } - } - - return 0; -} - -__flatten -static bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned 
long trace_ip) -{ - return btree_path_get_locks(trans, path, false); -} - -static int __bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { - trace_trans_restart_relock_path(trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_node_locked(path, path->level) - ? 0 - : __bch2_btree_path_relock(trans, path, trace_ip); -} - -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - struct btree_path *linked; - - EBUG_ON(path->locks_want >= new_locks_want); - - path->locks_want = new_locks_want; - - if (btree_path_get_locks(trans, path, true)) - return true; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). - */ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); - } - - return false; -} - -void __bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = __fls(path->nodes_locked)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(trans, path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - path->nodes_intent_locked ^= 1 << l; - } - break; - } - } - - bch2_btree_path_verify_locks(path); -} - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_downgrade(trans, path); -} - -/* Btree transaction locking: */ - -int bch2_trans_relock(struct btree_trans *trans) -{ - struct btree_path *path; - - if (unlikely(trans->restarted)) - return -BCH_ERR_transaction_restart_relock; - - trans_for_each_path(trans, path) - if (path->should_be_locked && - bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans, _RET_IP_, path); - BUG_ON(!trans->restarted); - return -BCH_ERR_transaction_restart_relock; - } - return 0; -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - __bch2_btree_path_unlock(trans, path); -} - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -2036,10 +1598,8 @@ struct btree_path *bch2_path_get(struct 
btree_trans *trans, */ locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) { - path->locks_want = locks_want; - btree_path_get_locks(trans, path, true); - } + if (locks_want > path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); return path; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 51beeddcd45e..c083e49475d1 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -181,12 +181,10 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_trans_verify_locks(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) {} #endif @@ -231,20 +229,6 @@ static inline int btree_trans_restart(struct btree_trans *trans, int err) bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - return path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want) - : path->uptodate == BTREE_ITER_UPTODATE; -} - void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); static inline void bch2_btree_path_downgrade(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 index 000000000000..3f20fbcb8389 --- /dev/null +++ b/fs/bcachefs/btree_locking.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_types.h" + +struct lock_class_key bch2_btree_node_lock_key; + +/* Btree node locking: */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ + bch2_btree_node_unlock_write_inlined(trans, path, b); +} + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree *b, + unsigned level) +{ + struct btree_path *path; + struct six_lock_count ret = { 0, 0 }; + + if (IS_ERR_OR_NULL(b)) + return ret; + + trans_for_each_path(trans, path) + if (path != skip && path->l[level].b == b) { + ret.read += btree_node_read_locked(path, level); + ret.intent += btree_node_intent_locked(path, level); + } + + return ret; +} + +static inline void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (!lock->readers) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + this_cpu_add(*lock->readers, nr); +} + +void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) +{ + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + six_lock_readers_add(&b->c.lock, -readers); + six_lock_write(&b->c.lock, NULL, NULL); + six_lock_readers_add(&b->c.lock, 
readers); +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); + + if (!is_btree_node(path, level)) + goto fail; + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && + b != ERR_PTR(-BCH_ERR_no_btree_node_init)) + trace_btree_node_relock_fail(trans, _RET_IP_, path, level); + return false; +} + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level) + ? six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_intent_locked(trans, path, level); + return true; +} + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; + + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; +} + +/* Slowpath: */ +int __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct btree_path *linked; + unsigned reason; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + /* + * Can't block taking an intent lock if we have _any_ nodes read + * locked: + * + * - Our read lock blocks another thread with an intent lock on + * the same node from getting a write lock, and thus from + * dropping its intent lock + * + * - And the other thread may have multiple nodes intent locked: + * both the node we want to intent lock, and the node we + * already have read locked - deadlock: + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { + reason = 1; + goto deadlock; + } + + if (linked->btree_id != path->btree_id) { + if (linked->btree_id < path->btree_id) + continue; + + reason = 3; + goto deadlock; + } + + /* + * Within the same btree, non-cached paths come before cached + * paths: + */ + if (linked->cached != path->cached) { + if (!linked->cached) + continue; + + reason = 4; + goto deadlock; + } + + /* + * Interior nodes must be locked before their descendants: if + * another path has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { + reason = 5; + goto deadlock; + } + + /* Must lock btree nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { + reason = 7; + goto deadlock; + } + } + + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); +deadlock: + trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); +} + +/* Btree iterator locking: */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; btree_path_node(path, l); l++) + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif + +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + + return 0; +} + 
+__flatten +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, false); +} + +int __bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_trans_restart_relock_path(trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } + + return 0; +} + +__flatten +bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, true); +} + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + + return btree_path_get_locks(trans, path, true); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + struct btree_path *linked; + + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). 
+ */ + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } + + return false; +} + +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; + + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; + + while (path->nodes_locked && + (l = __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + path->nodes_intent_locked ^= 1 << l; + } + break; + } + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} + +/* Btree transaction locking: */ + +int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -BCH_ERR_transaction_restart_relock; + + trans_for_each_path(trans, path) + if (path->should_be_locked && + bch2_btree_path_relock(trans, path, _RET_IP_)) { + trace_trans_restart_relock(trans, _RET_IP_, path); + BUG_ON(!trans->restarted); + return -BCH_ERR_transaction_restart_relock; + } + return 0; +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index acc27c3c05d6..5b5fa47844f7 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,6 +13,8 @@ #include "btree_iter.h" #include "six.h" +extern struct lock_class_key bch2_btree_node_lock_key; + static inline bool is_btree_node(struct btree_path *path, unsigned l) { return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); @@ -300,6 +302,22 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, __bch2_btree_node_lock_write(trans, b); } +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, + struct btree_path *, unsigned); +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + return path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE; +} + static inline void btree_path_set_should_be_locked(struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); @@ -326,4 +344,27 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, struct btree_path *, struct btree *, unsigned); +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 
0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + +int bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_path_verify_locks(struct btree_path *); +void bch2_trans_verify_locks(struct btree_trans *); +#else +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +#endif + #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0a5803a3a75d..73aaa1196faf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -336,6 +336,14 @@ struct bkey_cached { struct bkey_i *k; }; +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b, + bool cached) +{ + return !cached + ? container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + struct btree_insert_entry { unsigned flags; u8 bkey_type; -- cgit From 8e5696698d140f599586426fb9a897abb0eaa576 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 19 Aug 2022 19:50:18 -0400 Subject: bcachefs: Reorganize btree_locking.[ch] Tidy things up a bit before doing more work in this file. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 286 +++++++++++++++++++++++--------------------- fs/bcachefs/btree_locking.h | 145 +++++++++++----------- 2 files changed, 225 insertions(+), 206 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 3f20fbcb8389..535232a240dc 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -8,10 +8,12 @@ struct lock_class_key bch2_btree_node_lock_key; /* Btree node locking: */ -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) +static inline void six_lock_readers_add(struct six_lock *lock, int nr) { - bch2_btree_node_unlock_write_inlined(trans, path, b); + if (!lock->readers) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + this_cpu_add(*lock->readers, nr); } struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, @@ -34,14 +36,16 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, return ret; } -static inline void six_lock_readers_add(struct six_lock *lock, int nr) +/* unlock */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - if (!lock->readers) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - this_cpu_add(*lock->readers, nr); + bch2_btree_node_unlock_write_inlined(trans, path, b); } +/* lock */ + void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; @@ -57,118 +61,6 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) six_lock_readers_add(&b->c.lock, readers); } -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (!is_btree_node(path, level)) - goto fail; - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - 
if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && - b != ERR_PTR(-BCH_ERR_no_btree_node_init)) - trace_btree_node_relock_fail(trans, _RET_IP_, path, level); - return false; -} - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_intent_locked(trans, path, level); - return true; -} - -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade) -{ - unsigned l = path->level; - int fail_idx = -1; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; - - l++; - } while (l < path->locks_want); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - bch2_trans_verify_locks(trans); - - return path->uptodate < BTREE_ITER_NEED_RELOCK; -} - /* Slowpath: */ int __bch2_btree_node_lock(struct btree_trans *trans, struct btree_path *path, @@ -250,34 +142,121 @@ deadlock: return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } -/* Btree iterator locking: */ - -#ifdef CONFIG_BCACHEFS_DEBUG +/* relock */ -void bch2_btree_path_verify_locks(struct btree_path *path) +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) { - unsigned l; + unsigned l = path->level; + int fail_idx = -1; - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); - return; + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? 
bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); } - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; } -void bch2_trans_verify_locks(struct btree_trans *trans) +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree_path *path; + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); - trans_for_each_path(trans, path) - bch2_btree_path_verify_locks(path); + if (!is_btree_node(path, level)) + goto fail; + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && + b != ERR_PTR(-BCH_ERR_no_btree_node_init)) + trace_btree_node_relock_fail(trans, _RET_IP_, path, level); + return false; } -#endif +/* upgrade */ + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level) + ? 
six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_intent_locked(trans, path, level); + return true; +} /* Btree path locking: */ @@ -406,6 +385,8 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, bch2_btree_path_verify_locks(path); } +/* Btree transaction locking: */ + void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; @@ -414,8 +395,6 @@ void bch2_trans_downgrade(struct btree_trans *trans) bch2_btree_path_downgrade(trans, path); } -/* Btree transaction locking: */ - int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; @@ -440,3 +419,32 @@ void bch2_trans_unlock(struct btree_trans *trans) trans_for_each_path(trans, path) __bch2_btree_path_unlock(trans, path); } + +/* Debug */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; btree_path_node(path, l); l++) + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 5b5fa47844f7..ea00c190dea8 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -20,6 +20,13 @@ static inline bool is_btree_node(struct btree_path *path, unsigned l) return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); } +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? &trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + /* matches six lock types */ enum btree_node_locked_type { BTREE_NODE_UNLOCKED = -1, @@ -114,13 +121,6 @@ btree_lock_want(struct btree_path *path, int level) return BTREE_NODE_UNLOCKED; } -static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -{ - return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) - ? 
&trans->c->btree_transaction_stats[trans->fn_idx] - : NULL; -} - static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -134,6 +134,22 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, #endif } +static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +{ + switch (type) { + case SIX_LOCK_read: + return BCH_TIME_btree_lock_contended_read; + case SIX_LOCK_intent: + return BCH_TIME_btree_lock_contended_intent; + case SIX_LOCK_write: + return BCH_TIME_btree_lock_contended_write; + default: + BUG(); + } +} + +/* unlock: */ + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -157,20 +173,30 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, btree_node_unlock(trans, path, __ffs(path->nodes_locked)); } -static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) { - switch (type) { - case SIX_LOCK_read: - return BCH_TIME_btree_lock_contended_read; - case SIX_LOCK_intent: - return BCH_TIME_btree_lock_contended_intent; - case SIX_LOCK_write: - return BCH_TIME_btree_lock_contended_write; - default: - BUG(); - } + struct btree_path *linked; + + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); } +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + +/* lock: */ + static inline int btree_node_lock_type(struct btree_trans *trans, struct btree_path *path, struct btree *b, @@ -253,41 +279,6 @@ static inline int btree_node_lock(struct btree_trans *trans, return ret; } -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); - -static inline bool bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type(path, level) != - __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - __bch2_btree_node_relock(trans, path, level); -} - -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) -{ - struct btree_path *linked; - - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - - trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; - - six_unlock_write(&b->c.lock); -} - -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); static inline void bch2_btree_node_lock_write(struct btree_trans *trans, @@ -302,6 +293,36 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, __bch2_btree_node_lock_write(trans, b); } +/* relock: */ + +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, 
unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); + +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + btree_node_locked_type(path, level) != + __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + __bch2_btree_node_relock(trans, path, level); +} + +/* upgrade */ + bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned); bool __bch2_btree_path_upgrade(struct btree_trans *, @@ -318,6 +339,8 @@ static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, : path->uptodate == BTREE_ITER_UPTODATE; } +/* misc: */ + static inline void btree_path_set_should_be_locked(struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); @@ -341,23 +364,11 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } +/* debug */ + struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, struct btree_path *, struct btree *, unsigned); -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); -int __bch2_btree_path_relock(struct btree_trans *, - struct btree_path *, unsigned long); - -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_node_locked(path, path->level) - ? 0 - : __bch2_btree_path_relock(trans, path, trace_ip); -} - -int bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_path_verify_locks(struct btree_path *); -- cgit From d4263e563879f6dda86052881fbbc9e21e6e07f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Aug 2022 18:17:51 -0400 Subject: bcachefs: Better use of locking helpers Held btree locks are tracked in btree_path->nodes_locked and btree_path->nodes_intent_locked. Upcoming patches are going to change the representation in struct btree_path, so this patch switches to proper helpers instead of direct access to these fields. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++--- fs/bcachefs/btree_locking.c | 20 ++++++++++++----- fs/bcachefs/btree_locking.h | 50 +++++++++++++++++++++++------------------ fs/bcachefs/btree_update_leaf.c | 21 +++++++++-------- 4 files changed, 58 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 488b56a209e3..147250ce3af8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -348,7 +348,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, if (cmp < 0) continue; - if (!(path->nodes_locked & 1) || + if (!btree_node_locked(path, 0) || !path->should_be_locked) continue; @@ -3053,8 +3053,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l) && !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { - prt_printf(out, " %s l=%u ", - btree_node_intent_locked(path, l) ? 
"i" : "r", l); + prt_printf(out, " %c l=%u ", + lock_types[btree_node_locked_type(path, l)], l); bch2_btree_path_node_to_text(out, b, path->cached); prt_printf(out, "\n"); } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 535232a240dc..aac07e5e6854 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -61,6 +61,16 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) six_lock_readers_add(&b->c.lock, readers); } +static inline bool path_has_read_locks(struct btree_path *path) +{ + unsigned l; + + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + return true; + return false; +} + /* Slowpath: */ int __bch2_btree_node_lock(struct btree_trans *trans, struct btree_path *path, @@ -91,7 +101,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, * already have read locked - deadlock: */ if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { + path_has_read_locks(linked)) { reason = 1; goto deadlock; } @@ -121,7 +131,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, * another path has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ - if (level > __fls(linked->nodes_locked)) { + if (level > btree_path_highest_level_locked(linked)) { reason = 5; goto deadlock; } @@ -254,7 +264,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); return false; success: - mark_btree_node_intent_locked(trans, path, level); + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); return true; } @@ -370,13 +380,13 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, path->locks_want = new_locks_want; while (path->nodes_locked && - (l = __fls(path->nodes_locked)) >= path->locks_want) { + (l = btree_path_highest_level_locked(path)) >= path->locks_want) { if (l > path->level) { btree_node_unlock(trans, path, l); } else { if (btree_node_intent_locked(path, l)) { six_lock_downgrade(&path->l[l].b->c.lock); - path->nodes_intent_locked ^= 1 << l; + mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); } break; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ea00c190dea8..f00abaaa0ab5 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -61,27 +61,30 @@ static inline bool btree_node_read_locked(struct btree_path *path, static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - return path->nodes_locked & (1 << level); + return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) -{ - path->nodes_locked &= ~(1 << level); - path->nodes_intent_locked &= ~(1 << level); -} - -static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, - struct btree_path *path, +static inline void mark_btree_node_locked_noreset(struct btree_path *path, unsigned level, - enum six_lock_type type) + enum btree_node_locked_type type) { /* relying on this to avoid a branch */ BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - path->nodes_locked |= 1 << level; - path->nodes_intent_locked |= type << level; + path->nodes_locked &= ~(1 << level); + path->nodes_intent_locked &= ~(1 << level); + + if (type != BTREE_NODE_UNLOCKED) { + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; + } +} + +static inline void 
mark_btree_node_unlocked(struct btree_path *path, + unsigned level) +{ + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } static inline void mark_btree_node_locked(struct btree_trans *trans, @@ -89,19 +92,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, unsigned level, enum six_lock_type type) { - mark_btree_node_locked_noreset(trans, path, level, type); + mark_btree_node_locked_noreset(path, level, type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[level].lock_taken_time = ktime_get_ns(); #endif } -static inline void mark_btree_node_intent_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level) -{ - mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); -} - static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) { return level < path->locks_want @@ -164,13 +160,23 @@ static inline void btree_node_unlock(struct btree_trans *trans, mark_btree_node_unlocked(path, level); } +static inline int btree_path_lowest_level_locked(struct btree_path *path) +{ + return __ffs(path->nodes_locked); +} + +static inline int btree_path_highest_level_locked(struct btree_path *path) +{ + return __fls(path->nodes_locked); +} + static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) - btree_node_unlock(trans, path, __ffs(path->nodes_locked)); + btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 11c3767896aa..f8641b9f4abf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -747,11 +747,12 @@ static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_ static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) { struct btree *b = path_l(path)->b; + unsigned l; do { - if (path->nodes_locked && - path->nodes_locked != path->nodes_intent_locked) - path_upgrade_readers(trans, path); + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + path_upgrade_readers(trans, path); } while ((path = prev_btree_path(trans, path)) && path_l(path)->b == b); } @@ -770,11 +771,13 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans) ? trans->paths + trans->sorted[i + 1] : NULL; - if (path->nodes_locked) { - if (path->nodes_intent_locked) - nr_intent++; - else - nr_read++; + switch (btree_node_locked_type(path, path->level)) { + case BTREE_NODE_READ_LOCKED: + nr_read++; + break; + case BTREE_NODE_INTENT_LOCKED: + nr_intent++; + break; } if (!next || path_l(path)->b != path_l(next)->b) { @@ -797,7 +800,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct //if (path == pos) // break; - if (path->nodes_locked != path->nodes_intent_locked && + if (btree_node_read_locked(path, path->level) && !bch2_btree_path_upgrade(trans, path, path->level + 1)) return true; } -- cgit From 2e27f6567b2662a2f7440a651e007ebc77cdcc7a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Aug 2022 17:20:42 -0400 Subject: bcachefs: Kill nodes_intent_locked Previously, we used two different bit arrays for tracking held btree node locks. This patch switches to an array of two bit integers, which will let us track, in a future patch, when we hold a write lock. 
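As a rough illustration (not part of the patch; the sketch_ helpers below are made-up names), the new scheme packs one two-bit field per level into nodes_locked: the field is 0 when that level is unlocked, and otherwise holds the six lock type plus one, so the held lock type can be decoded with a single shift and mask:

/*
 * Sketch of the new encoding: two bits per btree level in nodes_locked.
 * Stored value 0 = unlocked, 1 = read locked, 2 = intent locked.
 */
static inline int sketch_node_locked_type(unsigned nodes_locked, unsigned level)
{
	/* returns -1 (unlocked), 0 (SIX_LOCK_read) or 1 (SIX_LOCK_intent) */
	return -1 + ((nodes_locked >> (level << 1)) & 3);
}

static inline unsigned sketch_mark_locked(unsigned nodes_locked, unsigned level, int type)
{
	/* type is -1 for unlocked, so (type + 1) stores 0 in that case */
	nodes_locked &= ~(3U << (level << 1));
	return nodes_locked | ((unsigned) (type + 1) << (level << 1));
}
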
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 -- fs/bcachefs/btree_locking.h | 26 +++++++------------------- fs/bcachefs/btree_types.h | 5 ++--- 3 files changed, 9 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 147250ce3af8..1dc243f63b2d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1519,7 +1519,6 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->nodes_intent_locked = 0; btree_path_list_add(trans, pos, path); trans->paths_sorted = false; @@ -1574,7 +1573,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - path->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index f00abaaa0ab5..2253a15d61c9 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -37,14 +37,7 @@ enum btree_node_locked_type { static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { - /* - * We're relying on the fact that if nodes_intent_locked is set - * nodes_locked must be set as well, so that we can compute without - * branches: - */ - return BTREE_NODE_UNLOCKED + - ((path->nodes_locked >> level) & 1) + - ((path->nodes_intent_locked >> level) & 1); + return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } static inline bool btree_node_intent_locked(struct btree_path *path, @@ -65,20 +58,15 @@ static inline bool btree_node_locked(struct btree_path *path, unsigned level) } static inline void mark_btree_node_locked_noreset(struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) + unsigned level, + enum btree_node_locked_type type) { /* relying on this to avoid a branch */ BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - path->nodes_locked &= ~(1 << level); - path->nodes_intent_locked &= ~(1 << level); - - if (type != BTREE_NODE_UNLOCKED) { - path->nodes_locked |= 1 << level; - path->nodes_intent_locked |= type << level; - } + path->nodes_locked &= ~(3U << (level << 1)); + path->nodes_locked |= (type + 1) << (level << 1); } static inline void mark_btree_node_unlocked(struct btree_path *path, @@ -162,12 +150,12 @@ static inline void btree_node_unlock(struct btree_trans *trans, static inline int btree_path_lowest_level_locked(struct btree_path *path) { - return __ffs(path->nodes_locked); + return __ffs(path->nodes_locked) >> 1; } static inline int btree_path_highest_level_locked(struct btree_path *path) { - return __fls(path->nodes_locked); + return __fls(path->nodes_locked) >> 1; } static inline void __bch2_btree_path_unlock(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 73aaa1196faf..ce148c21fd3b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -232,9 +232,8 @@ struct btree_path { */ bool should_be_locked:1; unsigned level:3, - locks_want:4, - nodes_locked:4, - nodes_intent_locked:4; + locks_want:4; + u8 nodes_locked; struct btree_path_level { struct btree *b; -- cgit From e3738c6909d69e980d8b56d33df2e438a2c1c798 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Aug 2022 23:08:53 -0400 Subject: six locks: Improve six_lock_count six_lock_count now counts up 
whether a write lock held, and this patch now also correctly counts six_lock->intent_lock_recurse. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 12 ++++++++---- fs/bcachefs/six.c | 10 +++++++--- fs/bcachefs/six.h | 3 +-- fs/bcachefs/trace.h | 8 ++++---- 4 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index aac07e5e6854..d46109320957 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -22,15 +22,19 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, unsigned level) { struct btree_path *path; - struct six_lock_count ret = { 0, 0 }; + struct six_lock_count ret; + + memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; trans_for_each_path(trans, path) if (path != skip && path->l[level].b == b) { - ret.read += btree_node_read_locked(path, level); - ret.intent += btree_node_intent_locked(path, level); + int t = btree_node_locked_type(path, level); + + if (t != BTREE_NODE_UNLOCKED) + ret.n[t]++; } return ret; @@ -48,7 +52,7 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).n[SIX_LOCK_read]; /* * Must drop our read locks before calling six_lock_write() - diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 9dd4b71e63ab..464b1313d358 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -764,15 +764,19 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); */ struct six_lock_count six_lock_counts(struct six_lock *lock) { - struct six_lock_count ret = { 0, lock->state.intent_lock }; + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = 0; + ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = lock->state.seq & 1; if (!lock->readers) - ret.read += lock->state.read_lock; + ret.n[SIX_LOCK_read] += lock->state.read_lock; else { int cpu; for_each_possible_cpu(cpu) - ret.read += *per_cpu_ptr(lock->readers, cpu); + ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu); } return ret; diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 08d0e0c7f2b4..59d796cfde43 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -206,8 +206,7 @@ void six_lock_pcpu_free(struct six_lock *); void six_lock_pcpu_alloc(struct six_lock *); struct six_lock_count { - unsigned read; - unsigned intent; + unsigned n[3]; }; struct six_lock_count six_lock_counts(struct six_lock *); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9353191c4fc8..db05be59fa35 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -453,11 +453,11 @@ TRACE_EVENT(btree_node_upgrade_fail, __entry->locked = btree_node_locked(path, level); c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level), - __entry->self_read_count = c.read; - __entry->self_intent_count = c.intent; + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.read; - __entry->intent_count = c.intent; + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_read]; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", -- cgit From 616928c30f594775953ca75eb7ccc312a8abeb73 Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Mon, 22 Aug 2022 21:49:55 -0400 Subject: bcachefs: Track maximum transaction memory This patch - tracks maximum bch2_trans_kmalloc() memory used in btree_transaction_stats - makes it available in debugfs - switches bch2_trans_init() to using that for the amount of memory to preallocate, instead of the parameter passed in This drastically reduces transaction restarts, and means we no longer need to track this in the source code. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_iter.c | 23 +++++++++++++---------- fs/bcachefs/btree_iter.h | 5 ++--- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/debug.c | 3 +++ 5 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f8b7434534eb..9e6c10dfa443 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -533,6 +533,7 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; + unsigned max_mem; char *max_paths_text; }; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1dc243f63b2d..f62f75ff82b2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2747,9 +2747,11 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size_t new_top = trans->mem_top + size; + unsigned new_top = trans->mem_top + size; void *p; + trans->mem_max = max(trans->mem_max, new_top); + if (new_top > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(new_top); @@ -2887,10 +2889,7 @@ static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct b return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes, - const char *fn) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) __acquires(&c->btree_trans_barrier) { struct btree_transaction_stats *s; @@ -2906,8 +2905,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, bch2_trans_alloc_paths(trans, c); - if (expected_mem_bytes) { - expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes); + s = btree_trans_stats(trans); + if (s) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); if (!unlikely(trans->mem)) { @@ -2916,11 +2917,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, } else { trans->mem_bytes = expected_mem_bytes; } - } - s = btree_trans_stats(trans); - if (s) trans->nr_max_paths = s->nr_max_paths; + } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -2967,9 +2966,13 @@ void bch2_trans_exit(struct btree_trans *trans) { struct btree_insert_entry *i; struct bch_fs *c = trans->c; + struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); + if (s) + s->max_mem = max(s->max_mem, trans->mem_max); + trans_for_each_update(trans, i) __btree_path_put(i->path, true); trans->nr_updates = 0; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c083e49475d1..87b456998ef4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -564,11 +564,10 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void 
bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, - unsigned, size_t, const char *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *); void bch2_trans_exit(struct btree_trans *); -#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) +#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ce148c21fd3b..42459a5bf035 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -420,6 +420,7 @@ struct btree_trans { u64 paths_allocated; unsigned mem_top; + unsigned mem_max; unsigned mem_bytes; void *mem; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 86c4b023ac7c..4fe20d36212e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -667,6 +667,9 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); + prt_printf(&i->buf, "Max mem used: %u", s->max_mem); + prt_newline(&i->buf); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:"); prt_newline(&i->buf); -- cgit From 14599cce443323ce23b4b266068b7018e42bd30c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 13:21:10 -0400 Subject: bcachefs: Switch btree locking code to struct btree_bkey_cached_common This is just some type safety cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_locking.c | 14 +++++++------- fs/bcachefs/btree_locking.h | 29 +++++++++++++++++------------ fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/trace.h | 2 +- 6 files changed, 28 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 969ecb2fdfad..7ffa88b74236 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -886,7 +886,7 @@ lock_node: if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, + ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f62f75ff82b2..ce9437916cf2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -787,7 +787,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, b, SPOS_MAX, + ret = btree_node_lock(trans, path, &b->c, SPOS_MAX, path->level, lock_type, lock_root_check_fn, rootp, trace_ip); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d46109320957..84d1e37a0741 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -18,7 +18,7 @@ static inline void six_lock_readers_add(struct six_lock *lock, int nr) struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, - struct btree *b, + struct btree_bkey_cached_common *b, unsigned level) { struct btree_path *path; @@ -30,7 +30,7 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, return ret; trans_for_each_path(trans, path) - if (path != skip && path->l[level].b == b) { + if 
(path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); if (t != BTREE_NODE_UNLOCKED) @@ -52,7 +52,7 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).n[SIX_LOCK_read]; + int readers = bch2_btree_node_lock_counts(trans, NULL, &b->c, b->c.level).n[SIX_LOCK_read]; /* * Must drop our read locks before calling six_lock_write() - @@ -78,7 +78,7 @@ static inline bool path_has_read_locks(struct btree_path *path) /* Slowpath: */ int __bch2_btree_node_lock(struct btree_trans *trans, struct btree_path *path, - struct btree *b, + struct btree_bkey_cached_common *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, @@ -142,7 +142,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c, linked->cached)) <= 0) { reason = 7; goto deadlock; @@ -216,7 +216,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { + btree_node_lock_increment(trans, &b->c, level, want))) { mark_btree_node_locked(trans, path, level, want); return true; } @@ -260,7 +260,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, goto success; if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); goto success; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 2253a15d61c9..3a9a4a0d61c4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -193,7 +193,7 @@ void bch2_btree_node_unlock_write(struct btree_trans *, static inline int btree_node_lock_type(struct btree_trans *trans, struct btree_path *path, - struct btree *b, + struct btree_bkey_cached_common *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) @@ -202,7 +202,7 @@ static inline int btree_node_lock_type(struct btree_trans *trans, u64 start_time; int ret; - if (six_trylock_type(&b->c.lock, type)) + if (six_trylock_type(&b->lock, type)) return 0; start_time = local_clock(); @@ -212,8 +212,8 @@ static inline int btree_node_lock_type(struct btree_trans *trans, trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking_lock_type = type; - trans->locking = &b->c; - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); + trans->locking = b; + ret = six_lock_type(&b->lock, type, should_sleep_fn, p); trans->locking = NULL; if (ret) @@ -228,15 +228,16 @@ static inline int btree_node_lock_type(struct btree_trans *trans, * iterators: */ static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree *b, unsigned level, + struct btree_bkey_cached_common *b, + unsigned level, enum btree_node_locked_type want) { struct btree_path *path; trans_for_each_path(trans, path) - if (path->l[level].b == b && + if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->c.lock, 
want); + six_lock_increment(&b->lock, want); return true; } @@ -244,14 +245,16 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, } int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree *, struct bpos, unsigned, + struct btree_bkey_cached_common *, + struct bpos, unsigned, enum six_lock_type, six_lock_should_sleep_fn, void *, unsigned long); static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, - struct btree *b, struct bpos pos, unsigned level, + struct btree_bkey_cached_common *b, + struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) @@ -261,12 +264,12 @@ static inline int btree_node_lock(struct btree_trans *trans, EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - if (likely(six_trylock_type(&b->c.lock, type)) || + if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, should_sleep_fn, p, ip))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->c.level].lock_taken_time = ktime_get_ns(); + path->l[b->level].lock_taken_time = ktime_get_ns(); #endif } @@ -361,7 +364,9 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, /* debug */ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - struct btree_path *, struct btree *, unsigned); + struct btree_path *, + struct btree_bkey_cached_common *b, + unsigned); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f8641b9f4abf..291c1a3ff8c9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -822,7 +822,7 @@ static inline int trans_lock_write(struct btree_trans *trans) goto fail; ret = btree_node_lock_type(trans, i->path, - insert_l(i)->b, + &insert_l(i)->b->c, i->path->pos, i->level, SIX_LOCK_write, NULL, NULL); BUG_ON(ret); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index db05be59fa35..3da096354a19 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -452,7 +452,7 @@ TRACE_EVENT(btree_node_upgrade_fail, TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); - c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); -- cgit From c240c3a94427346f27a7ff48f02cbe03f2c2ebd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Aug 2022 01:20:24 -0400 Subject: bcachefs: Print lock counts in debugs btree_transactions Improve our debugfs output, to help in debugging deadlocks: this shows, for every btree node we print, the current number of readers/intent locks/write locks held. 
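For readers of that debugfs output, the three numbers in the "locks %u:%u:%u" triple are the read, intent and write hold counts, in that order: with the six_lock_count change earlier in this series, indices 0/1/2 of the n[] array correspond to SIX_LOCK_read, SIX_LOCK_intent and SIX_LOCK_write. A minimal sketch of the decode (illustrative only, given a struct btree_bkey_cached_common *b, mirroring the call added in the diff below):

	/* "locks r:i:w" - read, intent and write hold counts for this node's lock */
	struct six_lock_count c = six_lock_counts(&b->lock);

	unsigned nr_read   = c.n[SIX_LOCK_read];	/* first number  */
	unsigned nr_intent = c.n[SIX_LOCK_intent];	/* second number */
	unsigned nr_write  = c.n[SIX_LOCK_write];	/* third number  */
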
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ce9437916cf2..99422e29c704 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3024,9 +3024,21 @@ bch2_btree_path_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *b, bool cached) { + struct six_lock_count c = six_lock_counts(&b->lock); + struct task_struct *owner; + pid_t pid; + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0;; + rcu_read_unlock(); + prt_printf(out, " l=%u %s:", b->level, bch2_btree_ids[b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(b, cached)); + + prt_printf(out, " locks %u:%u:%u held by pid %u", + c.n[0], c.n[1], c.n[2], pid); } #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS -- cgit From 131dcd5af7e2f1b13c2c0baf3095d7e449eb9859 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 21:05:31 -0400 Subject: bcachefs: Track held write locks The upcoming lock cycle detection code will need to know precisely which locks every btree_trans is holding, including write locks - this patch updates btree_node_locked_type to include write locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 16 +++++++++++++--- fs/bcachefs/btree_locking.h | 26 ++++++++++++++++++-------- fs/bcachefs/btree_update_leaf.c | 2 ++ 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 84d1e37a0741..be288fb96ea2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -246,6 +246,8 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, return bch2_btree_node_relock(trans, path, level); case BTREE_NODE_INTENT_LOCKED: break; + case BTREE_NODE_WRITE_LOCKED: + BUG(); } if (btree_node_intent_locked(path, level)) @@ -448,9 +450,17 @@ void bch2_btree_path_verify_locks(struct btree_path *path) return; } - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + int want = btree_lock_want(path, l); + int have = btree_node_locked_type(path, l); + + BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); + + BUG_ON(is_btree_node(path, l) && + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); + } } void bch2_trans_verify_locks(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3a9a4a0d61c4..a221c4fd1bf9 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -32,6 +32,7 @@ enum btree_node_locked_type { BTREE_NODE_UNLOCKED = -1, BTREE_NODE_READ_LOCKED = SIX_LOCK_read, BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, }; static inline int btree_node_locked_type(struct btree_path *path, @@ -40,16 +41,19 @@ static inline int btree_node_locked_type(struct btree_path *path, return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } -static inline bool btree_node_intent_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; +} + +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; + 
return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_read_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; } static inline bool btree_node_locked(struct btree_path *path, unsigned level) @@ -72,6 +76,7 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, static inline void mark_btree_node_unlocked(struct btree_path *path, unsigned level) { + EBUG_ON(btree_node_write_locked(path, level)); mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } @@ -179,6 +184,9 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); trans_for_each_path_with_node(trans, b, linked) linked->l[b->c.level].lock_seq += 2; @@ -288,6 +296,8 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, if (unlikely(!six_trylock_write(&b->c.lock))) __bch2_btree_node_lock_write(trans, b); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_write); } /* relock: */ @@ -311,8 +321,8 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type(path, level) != - __btree_lock_want(path, level)); + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || __bch2_btree_node_relock(trans, path, level); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 291c1a3ff8c9..6ae4755cfd24 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -828,6 +828,8 @@ static inline int trans_lock_write(struct btree_trans *trans) BUG_ON(ret); } + mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } -- cgit From 06a53943222be722e5f85782721e4701bcd424e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Aug 2022 21:42:46 -0400 Subject: bcachefs: Correctly initialize bkey_cached->lock We need to use the right class for some assertions to work correctly. 
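For context, a rough sketch of why the lock class matters here: six_lock_init() is assumed to be a wrapper that gives every call site its own lockdep class, so the key cache has to pass the shared btree node key explicitly (as the hunk below does with __six_lock_init() and bch2_btree_node_lock_key):

	/* assumed shape of the wrapper; not part of this patch */
	#define six_lock_init(lock)					\
	do {								\
		static struct lock_class_key __key;			\
									\
		__six_lock_init((lock), #lock, &__key);			\
	} while (0)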
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0e87c19effeb..cf41926b7f8e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -181,7 +181,7 @@ bkey_cached_alloc(struct btree_key_cache *c) ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); + __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); lockdep_set_novalidate_class(&ck->c.lock); BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); -- cgit From b1cdc398ae36689300b4108ce9c90c58cac1ba34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 10:30:36 -0400 Subject: bcachefs: Make more btree_paths available - Don't decrease BTREE_ITER_MAX when building with CONFIG_LOCKDEP anymore. The lockdep table sizes are configurable now, we don't need this anymore. - btree_trans_too_many_iters() is less conservative now. Previously it was causing a transaction restart if we had used more than BTREE_ITER_MAX / 2 paths, change this to BTREE_ITER_MAX - 8. This helps with excessive transaction restarts/livelocks in the bucket allocator path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 87b456998ef4..1081ea753be6 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -387,7 +387,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) { + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { trace_trans_restart_too_many_iters(trans, _THIS_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } -- cgit From 5f1dd9a633dd0aa8429742cdba08d9566f49177b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 12:11:18 -0400 Subject: bcachefs: Improve btree_node_relock_fail tracepoint It now prints the error name when the btree node is an error pointer; also, don't trace failures when the the btree node is BCH_ERR_no_btree_node_up. 
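In short, the tracepoint's node field becomes a small string; a minimal sketch of the new assignment logic (mirroring the trace.h hunk below):

	struct btree *b = btree_path_node(path, level);

	if (IS_ERR(b))
		/* error pointer: record the error name instead of an address */
		strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
	else
		scnprintf(__entry->node, sizeof(__entry->node), "%px", b);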
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 3 ++- fs/bcachefs/trace.h | 29 +++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index be288fb96ea2..8376067280ea 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -222,7 +222,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, } fail: if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && - b != ERR_PTR(-BCH_ERR_no_btree_node_init)) + b != ERR_PTR(-BCH_ERR_no_btree_node_init) && + b != ERR_PTR(-BCH_ERR_no_btree_node_up)) trace_btree_node_relock_fail(trans, _RET_IP_, path, level); return false; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 3da096354a19..472175e8c2e3 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -323,7 +323,7 @@ TRACE_EVENT(btree_reserve_get_fail, TP_ARGS(trans_fn, caller_ip, required), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(size_t, required ) ), @@ -393,26 +393,31 @@ TRACE_EVENT(btree_node_relock_fail, TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) - __field(unsigned long, node ) + __array(char, node, 24 ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( + struct btree *b = btree_path_node(path, level); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; TRACE_BPOS_assign(pos, path->pos); - __entry->node = (unsigned long) btree_path_node(path, level); + if (IS_ERR(b)) + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); + else + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? 
path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], @@ -432,7 +437,7 @@ TRACE_EVENT(btree_node_upgrade_fail, TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) @@ -740,7 +745,7 @@ DECLARE_EVENT_CLASS(transaction_event, TP_ARGS(trans, caller_ip), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) ), @@ -825,7 +830,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_ARGS(trans, caller_ip, path), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) @@ -955,7 +960,7 @@ TRACE_EVENT(trans_restart_would_deadlock, have, want, want_pos), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, in_traverse_all ) __field(u8, reason ) @@ -1002,7 +1007,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write, TP_ARGS(trans), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) ), TP_fast_assign( @@ -1019,7 +1024,7 @@ TRACE_EVENT(trans_restart_mem_realloced, TP_ARGS(trans, caller_ip, bytes), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(unsigned long, bytes ) ), @@ -1045,7 +1050,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(enum btree_id, btree_id ) TRACE_BPOS_entries(pos) -- cgit From ce56bf7fc23b6c2cf6edfbdfba1805c1842641ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 12:23:38 -0400 Subject: bcachefs: Improve trans_restart_journal_preres_get tracepoint It now includes journal_flags. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/trace.h | 24 +++++++++++++++++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e10c159ec079..fc768195be54 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1053,7 +1053,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans, _RET_IP_); + trace_trans_restart_journal_preres_get(trans, _RET_IP_, journal_flags); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6ae4755cfd24..e3501623931a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -285,7 +285,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ret = bch2_trans_relock(trans); if (ret) { - trace_trans_restart_journal_preres_get(trans, trace_ip); + trace_trans_restart_journal_preres_get(trans, trace_ip, 0); return ret; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 472175e8c2e3..2c1661ab807b 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -781,10 +781,28 @@ DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, + +TRACE_EVENT(trans_restart_journal_preres_get, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + unsigned flags), + TP_ARGS(trans, caller_ip, flags), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, flags ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + + TP_printk("%s %pS %x", __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->flags) ); DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, -- cgit From 8a9c1b1cb0edacdf4ac9c378c4ec4fc376fc8bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 12:28:09 -0400 Subject: bcachefs: Improve bch2_btree_node_relock() This moves the IS_ERR_OR_NULL() check to the inline part, since that's a fast path event. 
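A minimal sketch of the resulting inline fast path (mirroring the btree_locking.h hunk below): the cheap IS_ERR_OR_NULL() test stays inline, and only real nodes reach the out-of-line slow path:

	static inline bool bch2_btree_node_relock(struct btree_trans *trans,
						  struct btree_path *path, unsigned level)
	{
		return likely(btree_node_locked(path, level)) ||
			/* only call the slow path for real (non-error) nodes */
			(!IS_ERR_OR_NULL(path->l[level].b) &&
			 __bch2_btree_node_relock(trans, path, level));
	}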
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 8 +------- fs/bcachefs/btree_locking.h | 3 ++- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 8376067280ea..76d99c694948 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -208,9 +208,6 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, struct btree *b = btree_path_node(path, level); int want = __btree_lock_want(path, level); - if (!is_btree_node(path, level)) - goto fail; - if (race_fault()) goto fail; @@ -221,10 +218,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && - b != ERR_PTR(-BCH_ERR_no_btree_node_init) && - b != ERR_PTR(-BCH_ERR_no_btree_node_up)) - trace_btree_node_relock_fail(trans, _RET_IP_, path, level); + trace_btree_node_relock_fail(trans, _RET_IP_, path, level); return false; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index a221c4fd1bf9..3bc5df4263f8 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -325,7 +325,8 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, btree_node_locked_type(path, level) != __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || - __bch2_btree_node_relock(trans, path, level); + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level)); } /* upgrade */ -- cgit From d97e6aaed60a9c2c727cce2979ca311fe232163f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 12:37:05 -0400 Subject: bcachefs: Fix bch2_btree_update_start() to return -BCH_ERR_journal_reclaim_would_deadlock On failure to get a journal pre-reservation because we're called from journal reclaim we're not supposed to return a transaction restart error - this fixes a livelock. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fc768195be54..dd9405c631f5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1049,6 +1049,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) { bch2_trans_unlock(trans); + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); -- cgit From 674cfc26240b7807f078a23a4f04681ccae49b02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 12:48:36 -0400 Subject: bcachefs: Add persistent counters for all tracepoints Also, do some reorganizing/renaming, convert atomic counters in bch_fs to persistent counters, and add a few missing counters. 
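The central mechanism is the trace_and_count() wrapper added below; a short sketch of the pattern and of how a typical call site changes:

	/* bump the persistent per-cpu counter and emit the tracepoint of the
	 * same name in one call (macro as added in bcachefs.h below): */
	#define trace_and_count(_c, _name, ...)				\
	do {								\
		this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);	\
		trace_##_name(__VA_ARGS__);				\
	} while (0)

	/* call sites change from, e.g.: */
	trace_gc_gens_start(c);
	/* to: */
	trace_and_count(c, gc_gens_start, c);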
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/alloc_foreground.c | 52 +++++------ fs/bcachefs/bcachefs.h | 12 +-- fs/bcachefs/bcachefs_format.h | 81 ++++++++++++++-- fs/bcachefs/btree_cache.c | 22 ++--- fs/bcachefs/btree_gc.c | 4 +- fs/bcachefs/btree_io.c | 4 +- fs/bcachefs/btree_iter.c | 8 +- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_key_cache.c | 4 +- fs/bcachefs/btree_locking.c | 12 +-- fs/bcachefs/btree_update_interior.c | 21 ++--- fs/bcachefs/btree_update_leaf.c | 24 ++--- fs/bcachefs/data_update.c | 13 ++- fs/bcachefs/io.c | 10 +- fs/bcachefs/journal.c | 4 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 5 +- fs/bcachefs/move.c | 6 +- fs/bcachefs/movinggc.c | 4 +- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/sysfs.c | 19 ---- fs/bcachefs/trace.h | 182 +++++++++++++++++++----------------- 23 files changed, 279 insertions(+), 217 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 15c3c9a2da7b..ffcfb9f1916e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1134,8 +1134,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (ret) goto out; - trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); - this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: bch2_trans_iter_exit(trans, &alloc_iter); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index bbe74a05a7a2..f60fe159916e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -584,32 +584,32 @@ err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); - if (!IS_ERR(ob)) { - trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], - usage.d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl == NULL, - ""); - } else { - trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], - usage.d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); - atomic_long_inc(&c->bucket_alloc_fail); - } + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, + bch2_alloc_reserves[reserve], + usage.d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, + ""); + else + trace_and_count(c, bucket_alloc_fail, ca, + bch2_alloc_reserves[reserve], + usage.d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, + bch2_err_str(PTR_ERR(ob))); return ob; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9e6c10dfa443..bca61af71652 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -212,6 +212,12 @@ #define dynamic_fault(...) 0 #define race_fault(...) 0 +#define trace_and_count(_c, _name, ...) 
\ +do { \ + this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + trace_##_name(__VA_ARGS__); \ +} while (0) + #define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ @@ -916,12 +922,6 @@ mempool_t bio_bounce_pages; u64 last_bucket_seq_cleanup; - /* TODO rewrite as counters - The rest of this all shows up in sysfs */ - atomic_long_t read_realloc_races; - atomic_long_t extent_migrate_done; - atomic_long_t extent_migrate_raced; - atomic_long_t bucket_alloc_fail; - u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b9d614f608b5..0e80fe2568f2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1326,12 +1326,81 @@ struct bch_sb_field_disk_groups { /* BCH_SB_FIELD_counters */ -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_alloc_mem_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7ffa88b74236..e09fbf36ebc2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -253,7 +253,7 @@ wait_on_io: } out: if (b->hash_val && !ret) - trace_btree_node_reap(c, b); + trace_and_count(c, btree_cache_reap, c, b); return ret; out_unlock: six_unlock_write(&b->c.lock); @@ -377,7 +377,7 @@ out: ret = freed; memalloc_nofs_restore(flags); out_norestore: - trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); + trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); return ret; } @@ -504,7 +504,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_btree_node_cannibalize_unlock(c); + trace_and_count(c, btree_cache_cannibalize_unlock, c); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } @@ -520,7 +520,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -ENOMEM; } @@ -534,11 +534,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -EAGAIN; success: - trace_btree_node_cannibalize_lock(c); + trace_and_count(c, btree_cache_cannibalize_lock, c); return 0; } @@ -662,7 +662,7 @@ err_locked: mutex_unlock(&bc->lock); - trace_btree_node_cannibalize(c); + trace_and_count(c, btree_cache_cannibalize, c); goto out; } @@ -691,7 +691,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } @@ -699,7 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (trans && b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; - trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); } @@ -748,7 +748,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!six_relock_type(&b->c.lock, lock_type, seq)) { if (trans) - trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -903,7 +903,7 @@ lock_node: if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans, trace_ip, path); + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4ab59880781a..239eda57bf02 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1931,7 +1931,7 @@ int bch2_gc_gens(struct bch_fs *c) if (!mutex_trylock(&c->gc_gens_lock)) return 0; - trace_gc_gens_start(c); + trace_and_count(c, gc_gens_start, c); 
down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); @@ -1992,7 +1992,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - trace_gc_gens_end(c); + trace_and_count(c, gc_gens_end, c); err: for_each_member_device(ca, c, i) { kvfree(ca->oldest_gen); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index bd74bd31dd1f..b3dc8b43298e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1485,7 +1485,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct bio *bio; int ret; - trace_btree_read(c, b); + trace_and_count(c, btree_node_read, c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1974,7 +1974,7 @@ do_write: c->opts.nochanges) goto err; - trace_btree_write(b, bytes_to_write, sectors_to_write); + trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 99422e29c704..e76907af09f1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1072,7 +1072,7 @@ err: trans->in_traverse_all = false; - trace_trans_traverse_all(trans, trace_ip); + trace_and_count(c, trans_traverse_all, trans, trace_ip); return ret; } @@ -1209,7 +1209,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, u64 max = ~(~0ULL << restart_probability_bits); if (!get_random_u32_below(max)) { - trace_transaction_restart_injected(trans, _RET_IP_); + trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } } @@ -1728,7 +1728,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_next_node(trans, _THIS_IP_, path); + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } @@ -2773,7 +2773,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans, _RET_IP_, new_bytes); + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1081ea753be6..bdc703324b9a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -388,7 +388,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * static inline int btree_trans_too_many_iters(struct btree_trans *trans) { if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { - trace_trans_restart_too_many_iters(trans, _THIS_IP_); + trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index cf41926b7f8e..127cb6edaff5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,7 +291,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, k = bch2_btree_path_peek_slot(path, &u); if 
(!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); goto err; } @@ -414,7 +414,7 @@ fill: */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_); + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 76d99c694948..301311763d59 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -152,7 +152,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, return btree_node_lock_type(trans, path, b, pos, level, type, should_sleep_fn, p); deadlock: - trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos); + trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } @@ -218,7 +218,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - trace_btree_node_relock_fail(trans, _RET_IP_, path, level); + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } @@ -262,7 +262,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, goto success; } - trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); return false; success: mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); @@ -285,7 +285,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans, _RET_IP_, path); + trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } } @@ -304,7 +304,7 @@ int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { - trace_trans_restart_relock_path(trans, trace_ip, path); + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -416,7 +416,7 @@ int bch2_trans_relock(struct btree_trans *trans) trans_for_each_path(trans, path) if (path->should_be_locked && bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans, _RET_IP_, path); + trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); BUG_ON(!trans->restarted); return -BCH_ERR_transaction_restart_relock; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index dd9405c631f5..1f5b98a3d0a2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -143,7 +143,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, static void __btree_node_free(struct bch_fs *c, struct btree *b) { - trace_btree_node_free(c, b); + trace_and_count(c, btree_node_free, c, b); BUG_ON(btree_node_dirty(b)); 
BUG_ON(btree_node_need_write(b)); @@ -305,7 +305,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_btree_node_alloc(c, b); + trace_and_count(c, btree_node_alloc, c, b); return b; } @@ -995,7 +995,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, nr_nodes[1] += 1; if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans, _RET_IP_, path); + trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); return ERR_PTR(ret); } @@ -1058,7 +1058,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans, _RET_IP_, journal_flags); + trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); goto err; } @@ -1091,8 +1091,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } if (ret) { - trace_btree_reserve_get_fail(trans->fn, _RET_IP_, - nr_nodes[0] + nr_nodes[1]); + trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); goto err; } @@ -1147,7 +1146,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_btree_set_root(c, b); + trace_and_count(c, btree_node_set_root, c, b); BUG_ON(!b->written); old = btree_node_root(c, b); @@ -1434,7 +1433,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { - trace_btree_split(c, b); + trace_and_count(c, btree_node_split, c, b); n2 = __btree_split_node(as, n1); @@ -1468,7 +1467,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); } } else { - trace_btree_compact(c, b); + trace_and_count(c, btree_node_compact, c, b); bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); @@ -1737,7 +1736,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - trace_btree_merge(c, b); + trace_and_count(c, btree_node_merge, c, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1829,7 +1828,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - trace_btree_rewrite(c, b); + trace_and_count(c, btree_node_rewrite, c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e3501623931a..732d09d45041 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -285,7 +285,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ret = bch2_trans_relock(trans); if (ret) { - trace_trans_restart_journal_preres_get(trans, trace_ip, 0); + trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0); return ret; } @@ -375,7 +375,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, * Keys returned by peek() are no longer valid pointers, so we need a * transaction restart: */ - trace_trans_restart_key_cache_key_realloced(trans, _RET_IP_, path, old_u64s, 
new_u64s); + trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); } @@ -567,7 +567,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans, trace_ip); + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); } @@ -842,7 +842,7 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans); + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } @@ -975,7 +975,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_BTREE_NODE_FULL: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_trans_restart_btree_node_split(trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -986,7 +986,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_mark_replicas(trans, trace_ip); + trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); @@ -1003,12 +1003,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_res_get(trans, trace_ip); + trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans, trace_ip); + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -1017,7 +1017,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_reclaim(trans, trace_ip); + trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; default: BUG_ON(ret >= 0); @@ -1120,7 +1120,7 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!i->path->should_be_locked); if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans, _RET_IP_, i->path); + trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); goto out; } @@ -1166,7 +1166,7 @@ retry: if (ret) goto err; - trace_transaction_commit(trans, _RET_IP_); + trace_and_count(c, transaction_commit, trans, _RET_IP_); out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -1642,7 +1642,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter ck = (void *) iter->key_cache_path->l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_trans_restart_key_cache_raced(trans, _RET_IP_); + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index f9eb147fe229..0b6f765bcad9 100644 --- a/fs/bcachefs/data_update.c +++ 
b/fs/bcachefs/data_update.c @@ -231,9 +231,12 @@ int bch2_data_update_index_update(struct bch_write_op *op) m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) bch2_ob_add_backpointer(c, ec_ob, &insert->k); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish(&new->k); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -248,16 +251,16 @@ next: } continue; nomatch: - trace_data_update_fail(&old.k->p); - if (m->ctxt) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } - atomic_long_inc(&c->extent_migrate_raced); - trace_move_race(&new->k); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size); + trace_move_extent_fail(&new->k); + bch2_btree_iter_advance(&iter); goto next; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 44fb14a5b5ae..ed78cb8d90a2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1496,7 +1496,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) { struct bio *bio = &op->write.op.wbio.bio; - trace_promote(&rbio->bio); + trace_and_count(op->write.op.c, read_promote, &rbio->bio); /* we now own pages: */ BUG_ON(!rbio->bounce); @@ -1761,7 +1761,7 @@ static void bch2_rbio_retry(struct work_struct *work) }; struct bch_io_failures failed = { .nr = 0 }; - trace_read_retry(&rbio->bio); + trace_and_count(c, read_retry, &rbio->bio); if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); @@ -2017,7 +2017,7 @@ static void bch2_read_endio(struct bio *bio) if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ptr_stale(ca, &rbio->pick.ptr)) { - atomic_long_inc(&c->read_realloc_races); + trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); @@ -2305,7 +2305,7 @@ get_bio: rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_read_bounce(&rbio->bio); + trace_and_count(c, read_bounce, &rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); @@ -2320,7 +2320,7 @@ get_bio: if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + trace_and_count(c, read_split, &orig->bio); } if (!rbio->pick.idx) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 26f60db751ca..9961cc674ad7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -390,12 +390,12 @@ retry: ret = journal_entry_open(j); if (ret == JOURNAL_ERR_max_in_flight) - trace_journal_entry_full(c); + trace_and_count(c, journal_entry_full, c); unlock: if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; - trace_journal_full(c); + trace_and_count(c, journal_full, c); } can_discard = j->can_discard; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index acb2005c3b72..090a718b917f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1552,7 +1552,7 @@ static void do_journal_write(struct closure *cl) bch2_bio_map(bio, w->data, sectors << 9); - trace_journal_write(bio); + trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); ca->journal.bucket_seq[ca->journal.cur_idx] = diff --git 
a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 00d9e3a8e526..a4f9d01d33cc 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -642,7 +642,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - trace_journal_reclaim_start(c, direct, kicked, + trace_and_count(c, journal_reclaim_start, c, + direct, kicked, min_nr, min_key_cache, j->prereserved.reserved, j->prereserved.remaining, @@ -658,7 +659,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) j->nr_direct_reclaim += nr_flushed; else j->nr_background_reclaim += nr_flushed; - trace_journal_reclaim_finish(c, nr_flushed); + trace_and_count(c, journal_reclaim_finish, c, nr_flushed); if (nr_flushed) wake_up(&j->reclaim_wait); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ea9ce6d436a2..0486c7e14c56 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -245,8 +245,8 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - trace_move_extent(k.k); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); + trace_move_extent_read(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); @@ -268,7 +268,7 @@ err_free: kfree(io); err: percpu_ref_put(&c->writes); - trace_move_alloc_mem_fail(k.k); + trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 438ea22ad5bd..dca8d4a3a89c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -339,7 +339,7 @@ static int bch2_copygc(struct bch_fs *c) atomic64_read(&move_stats.keys_raced), atomic64_read(&move_stats.sectors_raced)); - trace_copygc(c, + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); return 0; @@ -397,7 +397,7 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { - trace_copygc_wait(c, wait, last + wait); + trace_and_count(c, copygc_wait, c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 48ad158637e5..4953f54e94d6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -801,7 +801,7 @@ int bch2_write_super(struct bch_fs *c) unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; int ret = 0; - trace_write_super(c, _RET_IP_); + trace_and_count(c, write_super, c, _RET_IP_); if (c->opts.very_degraded) degraded_flags |= BCH_FORCE_IF_LOST; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4e2b6285cf3a..d10ac84c10ce 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -190,11 +190,6 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(read_realloc_races); -read_attribute(extent_migrate_done); -read_attribute(extent_migrate_raced); -read_attribute(bucket_alloc_fail); - #define x(t, n, ...) 
read_attribute(t); BCH_PERSISTENT_COUNTERS() #undef x @@ -378,15 +373,6 @@ SHOW(bch2_fs) sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); - sysfs_print(read_realloc_races, - atomic_long_read(&c->read_realloc_races)); - sysfs_print(extent_migrate_done, - atomic_long_read(&c->extent_migrate_done)); - sysfs_print(extent_migrate_raced, - atomic_long_read(&c->extent_migrate_raced)); - sysfs_print(bucket_alloc_fail, - atomic_long_read(&c->bucket_alloc_fail)); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); if (attr == &sysfs_gc_gens_pos) @@ -629,11 +615,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_invalidates, &sysfs_prune_cache, - &sysfs_read_realloc_races, - &sysfs_extent_migrate_done, - &sysfs_extent_migrate_raced, - &sysfs_bucket_alloc_fail, - &sysfs_gc_gens_pos, &sysfs_copy_gc_enabled, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2c1661ab807b..1ef99af5cd03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -52,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -112,7 +137,7 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -122,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce, TP_ARGS(bio) ); +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + DEFINE_EVENT(bio, read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, promote, +DEFINE_EVENT(bio, read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -220,8 +250,6 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed) ); -/* allocator: */ - /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -229,39 +257,61 @@ DEFINE_EVENT(bpos, bkey_pack_pos_fail, TP_ARGS(p) ); -/* Btree */ +/* Btree cache: */ -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), +TRACE_EVENT(btree_cache_scan, + TP_PROTO(long nr_to_scan, long can_free, long ret), + TP_ARGS(nr_to_scan, can_free, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; ), - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_ids[__entry->btree_id], - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) +); + 
+DEFINE_EVENT(btree_node, btree_cache_reap, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) ); -DEFINE_EVENT(btree_node, btree_read, +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Btree */ + +DEFINE_EVENT(btree_node, btree_node_read, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_write, +TRACE_EVENT(btree_node_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), @@ -291,31 +341,6 @@ DEFINE_EVENT(btree_node, btree_node_free, TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_node_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - TRACE_EVENT(btree_reserve_get_fail, TP_PROTO(const char *trans_fn, unsigned long caller_ip, @@ -340,52 +365,32 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->required) ); -DEFINE_EVENT(btree_node, btree_split, +DEFINE_EVENT(btree_node, btree_node_compact, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_compact, +DEFINE_EVENT(btree_node, btree_node_merge, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_merge, +DEFINE_EVENT(btree_node, btree_node_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_rewrite, +DEFINE_EVENT(btree_node, btree_node_rewrite, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_set_root, +DEFINE_EVENT(btree_node, btree_node_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_cache_scan, - TP_PROTO(long nr_to_scan, long can_free, long ret), - TP_ARGS(nr_to_scan, can_free, ret), - - TP_STRUCT__entry( - __field(long, nr_to_scan ) - __field(long, can_free ) - __field(long, ret ) - ), - - TP_fast_assign( - __entry->nr_to_scan = nr_to_scan; - __entry->can_free = can_free; - __entry->ret = ret; - ), - - TP_printk("scanned for %li nodes, can free %li, ret %li", - __entry->nr_to_scan, __entry->can_free, __entry->ret) -); - -TRACE_EVENT(btree_node_relock_fail, +TRACE_EVENT(btree_path_relock_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, @@ -429,7 +434,7 @@ TRACE_EVENT(btree_node_relock_fail, __entry->node_lock_seq) ); -TRACE_EVENT(btree_node_upgrade_fail, +TRACE_EVENT(btree_path_upgrade_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, @@ -617,7 +622,7 @@ TRACE_EVENT(discard_buckets, __entry->err) ); -TRACE_EVENT(invalidate_bucket, +TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), @@ -643,17 +648,27 @@ TRACE_EVENT(invalidate_bucket, /* Moving IO */ -DEFINE_EVENT(bkey, move_extent, +DEFINE_EVENT(bkey, 
move_extent_read, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_extent_write, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_alloc_mem_fail, +DEFINE_EVENT(bkey, move_extent_finish, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_race, +DEFINE_EVENT(bkey, move_extent_fail, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); @@ -732,11 +747,6 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -DEFINE_EVENT(bpos, data_update_fail, - TP_PROTO(const struct bpos *p), - TP_ARGS(p) -); - /* btree transactions: */ DECLARE_EVENT_CLASS(transaction_event, @@ -763,7 +773,7 @@ DEFINE_EVENT(transaction_event, transaction_commit, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, transaction_restart_injected, +DEFINE_EVENT(transaction_event, trans_restart_injected, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) @@ -926,7 +936,7 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade, +DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) -- cgit From f5178b34b9f1b53d2a97a2a210d3c284966428e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 15:00:59 -0400 Subject: six locks: Delete six_lock_pcpu_free_rcu() Didn't have any users, and wasn't a good idea to begin with - delete it. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 28 ---------------------------- fs/bcachefs/six.h | 1 - 2 files changed, 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 464b1313d358..0ab72f59d23b 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -712,34 +712,6 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -struct free_pcpu_rcu { - struct rcu_head rcu; - void __percpu *p; -}; - -static void free_pcpu_rcu_fn(struct rcu_head *_rcu) -{ - struct free_pcpu_rcu *rcu = - container_of(_rcu, struct free_pcpu_rcu, rcu); - - free_percpu(rcu->p); - kfree(rcu); -} - -void six_lock_pcpu_free_rcu(struct six_lock *lock) -{ - struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); - - if (!rcu) - return; - - rcu->p = lock->readers; - lock->readers = NULL; - - call_rcu(&rcu->rcu, free_pcpu_rcu_fn); -} -EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); - void six_lock_pcpu_free(struct six_lock *lock) { BUG_ON(lock->readers && pcpu_read_count(lock)); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 59d796cfde43..6c9ac82d146d 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -201,7 +201,6 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free_rcu(struct six_lock *); void six_lock_pcpu_free(struct six_lock *); void six_lock_pcpu_alloc(struct six_lock *); -- cgit From c919f53f3bcba3598fc6ce1ee5c5aed75d0834b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Aug 2022 11:40:03 -0400 Subject: bcachefs: Don't leak lock pcpu counts memory This fixes a small memory leak. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 127cb6edaff5..7349c70f8445 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -238,8 +238,6 @@ btree_key_cache_create(struct bch_fs *c, } else { if (btree_id == BTREE_ID_subvolumes) six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); } ck->c.level = 0; @@ -688,6 +686,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; -- cgit From 534a591e4cf98d036e478b93de4a95ff126fb018 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 17:47:27 -0400 Subject: bcachefs: Delete time_stats for lock contended times Since we've now got time_stats for lock hold times (per btree transaction), we don't need this anymore. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 --- fs/bcachefs/btree_locking.h | 25 +------------------------ 2 files changed, 1 insertion(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bca61af71652..c1d96222f4c3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -335,9 +335,6 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ - x(btree_lock_contended_read) \ - x(btree_lock_contended_intent) \ - x(btree_lock_contended_write) \ x(data_write) \ x(data_read) \ x(data_promote) \ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3bc5df4263f8..1e4c81d8084b 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -123,20 +123,6 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, #endif } -static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) -{ - switch (type) { - case SIX_LOCK_read: - return BCH_TIME_btree_lock_contended_read; - case SIX_LOCK_intent: - return BCH_TIME_btree_lock_contended_intent; - case SIX_LOCK_write: - return BCH_TIME_btree_lock_contended_write; - default: - BUG(); - } -} - /* unlock: */ static inline void btree_node_unlock(struct btree_trans *trans, @@ -206,15 +192,11 @@ static inline int btree_node_lock_type(struct btree_trans *trans, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { - struct bch_fs *c = trans->c; - u64 start_time; int ret; if (six_trylock_type(&b->lock, type)) return 0; - start_time = local_clock(); - trans->locking_path_idx = path->idx; trans->locking_pos = pos; trans->locking_btree_id = path->btree_id; @@ -223,12 +205,7 @@ static inline int btree_node_lock_type(struct btree_trans *trans, trans->locking = b; ret = six_lock_type(&b->lock, type, should_sleep_fn, p); trans->locking = NULL; - - if (ret) - return ret; - - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); - return 0; + return ret; } /* -- cgit From 546180874ade7225676bc0cd5ea4e2388e2374bc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Aug 2022 14:55:00 -0400 Subject: bcachefs: Mark write locks before taking lock six locks are unfair: while a thread is blocked trying to take a write lock, new read locks will fail. 
The new deadlock cycle detector makes use of our existing lock tracing, so we need to tell it we're holding a write lock before we take the lock for it to work correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 9 +++++++-- fs/bcachefs/btree_update_leaf.c | 11 +++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 1e4c81d8084b..ab3161c1b1f4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -271,10 +271,15 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); EBUG_ON(!btree_node_intent_locked(path, b->c.level)); + /* + * six locks are unfair, and read locks block while a thread wants a + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_write); + if (unlikely(!six_trylock_write(&b->c.lock))) __bch2_btree_node_lock_write(trans, b); - - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_write); } /* relock: */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 732d09d45041..a8306b16956d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -817,6 +817,13 @@ static inline int trans_lock_write(struct btree_trans *trans) if (same_leaf_as_prev(trans, i)) continue; + /* + * six locks are unfair, and read locks block while a thread + * wants a write lock: thus, we need to tell the cycle detector + * we have a write lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); + if (!six_trylock_write(&insert_l(i)->b->c.lock)) { if (have_conflicting_read_lock(trans, i->path)) goto fail; @@ -828,13 +835,13 @@ static inline int trans_lock_write(struct btree_trans *trans) BUG_ON(ret); } - mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; fail: + mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent); + while (--i >= trans->updates) { if (same_leaf_as_prev(trans, i)) continue; -- cgit From ca7d8fcabf29fae627babb72bda9b51763f9a145 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Aug 2022 14:29:43 -0400 Subject: bcachefs: New locking functions In the future, with the new deadlock cycle detector, we won't be using bare six_lock_* anymore: lock wait entries will all be embedded in btree_trans, and we will need a btree_trans context whenever locking a btree node. This patch plumbs a btree_trans to the few places that need it, and adds two new locking functions - btree_node_lock_nopath, which may fail returning a transaction restart, and - btree_node_lock_nopath_nofail, to be used in places where we know we cannot deadlock (i.e. because we're holding no other locks). 
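As a rough sketch of the calling convention this introduces (the two helpers are the ones added below; the surrounding example functions and their names are hypothetical, for illustration only):

static int example_read_from_node(struct btree_trans *trans,
				  struct btree_bkey_cached_common *b)
{
	/* May fail with a transaction restart, which the caller propagates: */
	int ret = btree_node_lock_nopath(trans, b, SIX_LOCK_read);
	if (ret)
		return ret;

	/* ... use the node ... */
	six_unlock_read(&b->lock);
	return 0;
}

static void example_evict_node(struct btree_trans *trans,
			       struct btree_bkey_cached_common *b)
{
	/*
	 * Known deadlock-free context (no other locks held), so a failure
	 * would be a bug - use the _nofail variant:
	 */
	btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);

	/* ... tear the node down ... */
	six_unlock_write(&b->lock);
	six_unlock_intent(&b->lock);
}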
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 21 +++++--- fs/bcachefs/btree_cache.h | 4 +- fs/bcachefs/btree_gc.c | 40 +++++++++------- fs/bcachefs/btree_io.c | 8 +++- fs/bcachefs/btree_key_cache.c | 82 ++++++++++++++++++++++---------- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_locking.h | 18 +++++++ fs/bcachefs/btree_update_interior.c | 95 ++++++++++++++++++++----------------- fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/btree_update_leaf.c | 7 ++- 10 files changed, 182 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e09fbf36ebc2..a0e9e14e3fa5 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -959,12 +959,13 @@ lock_node: return b; } -struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, const struct bkey_i *k, enum btree_id btree_id, unsigned level, bool nofill) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; @@ -998,9 +999,14 @@ retry: goto out; } else { lock_node: - ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); - if (ret) - goto retry; + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + BUG(); + } if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || @@ -1062,8 +1068,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c, return PTR_ERR_OR_ZERO(b); } -void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -1079,8 +1086,8 @@ wait_on_io: __bch2_btree_node_wait_on_read(b); __bch2_btree_node_wait_on_write(b); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, 0); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 83723805f12a..a4df3e866bb8 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -26,13 +26,13 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); -struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, enum btree_id, unsigned, bool); int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); -void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); +void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 239eda57bf02..77a1fe81ac35 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -165,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct bch_fs *c, +static void 
bch2_btree_node_update_key_early(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_i *new) { + struct bch_fs *c = trans->c; struct btree *b; struct bkey_buf tmp; int ret; @@ -176,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c, bch2_bkey_buf_init(&tmp); bch2_bkey_buf_reassemble(&tmp, c, old); - b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); @@ -352,8 +353,9 @@ fsck_err: return ret; } -static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; @@ -378,7 +380,7 @@ again: bch2_btree_and_journal_iter_advance(&iter); bch2_bkey_buf_reassemble(&cur_k, c, k); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -392,7 +394,7 @@ again: bch2_btree_ids[b->c.btree_id], b->c.level - 1, buf.buf)) { - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); cur = NULL; @@ -411,7 +413,7 @@ again: if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); cur = NULL; @@ -425,7 +427,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { - bch2_btree_node_evict(c, prev_k.k); + bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); if (ret) @@ -465,7 +467,7 @@ again: bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -476,12 +478,12 @@ again: goto err; } - ret = bch2_btree_repair_topology_recurse(c, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); cur = NULL; if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); dropped_children = true; @@ -522,17 +524,20 @@ fsck_err: static int bch2_repair_topology(struct bch_fs *c) { + struct btree_trans trans; struct btree *b; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < BTREE_ID_NR && !ret; i++) { b = c->btree_roots[i].b; if (btree_node_fake(b)) continue; - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_repair_topology_recurse(c, b); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(&trans, b); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { @@ -541,13 +546,16 @@ static int bch2_repair_topology(struct bch_fs *c) } } + bch2_trans_exit(&trans); + return ret; } -static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, +static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); 
const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; @@ -747,7 +755,7 @@ found: } if (level) - bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); if (c->opts.verbose) { printbuf_reset(&buf); @@ -788,7 +796,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); if (ret) goto err; @@ -941,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b bch2_bkey_buf_reassemble(&cur, c, k); bch2_btree_and_journal_iter_advance(&iter); - child = bch2_btree_node_get_noiter(c, cur.k, + child = bch2_btree_node_get_noiter(trans, cur.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(child); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b3dc8b43298e..c63cb70836cc 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1652,9 +1652,15 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) static void btree_node_write_done(struct bch_fs *c, struct btree *b) { - six_lock_read(&b->c.lock, NULL, NULL); + struct btree_trans trans; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); } static void btree_node_write_work(struct work_struct *work) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7349c70f8445..38a66302d6e9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -95,25 +95,14 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct btree_key_cache_freelist *f; bool freed = false; BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - list_del_init(&ck->list); - atomic_long_inc(&bc->nr_freed); - - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); @@ -138,13 +127,32 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, list_move_tail(&ck->list, &bc->freed); mutex_unlock(&bc->lock); } +} + +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_del_init(&ck->list); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + bkey_cached_move_to_freelist(bc, ck); six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } static struct bkey_cached * -bkey_cached_alloc(struct btree_key_cache *c) +bkey_cached_alloc(struct btree_trans *trans, + struct btree_key_cache *c) { struct bkey_cached *ck = NULL; struct btree_key_cache_freelist *f; @@ -173,8 +181,21 @@ bkey_cached_alloc(struct btree_key_cache *c) } if (ck) { - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, 
NULL); + int ret; + + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(c, ck); + return ERR_PTR(ret); + } + + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_write); + if (unlikely(ret)) { + six_unlock_intent(&ck->c.lock); + bkey_cached_move_to_freelist(c, ck); + return ERR_PTR(ret); + } + return ck; } @@ -216,15 +237,18 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct bch_fs *c, +btree_key_cache_create(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos) { + struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(bc); + ck = bkey_cached_alloc(trans, bc); + if (unlikely(IS_ERR(ck))) + return ck; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); @@ -370,7 +394,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - ck = btree_key_cache_create(c, path->btree_id, path->pos); + ck = btree_key_cache_create(trans, path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -519,10 +543,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, evict: BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - mark_btree_node_unlocked(c_iter.path, 0); - c_iter.path->l[0].b = NULL; + /* + * XXX: holding a lock that is not marked in btree_trans, not + * ideal: + */ + six_lock_increment(&ck->c.lock, SIX_LOCK_intent); + bch2_trans_unlock(trans); - six_lock_write(&ck->c.lock, NULL, NULL); + /* Will not fail because we are holding no other locks: */ + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_write); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); @@ -546,11 +575,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; + struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + bch2_trans_init(&trans, c, 0, 0); - six_lock_read(&ck->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -567,12 +598,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, btree_key_cache_flush_pos(&trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 301311763d59..24d0ea903380 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -61,7 +61,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) * locked: */ six_lock_readers_add(&b->c.lock, -readers); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); six_lock_readers_add(&b->c.lock, readers); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ab3161c1b1f4..32c28c1341e9 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -185,6 +185,24 @@ void bch2_btree_node_unlock_write(struct btree_trans *, /* lock: */ +static inline 
int __must_check +btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + six_lock_type(&b->lock, type, NULL, NULL); + return 0; +} + +static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + int ret = btree_node_lock_nopath(trans, b, type); + + BUG_ON(ret); +} + static inline int btree_node_lock_type(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1f5b98a3d0a2..6fe49766c6c8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -169,7 +169,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, BUG_ON(path->l[b->c.level].b == b && path->l[b->c.level].lock_seq == b->c.lock.state.seq); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); @@ -259,7 +259,9 @@ mem_alloc: return b; } -static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) +static struct btree *bch2_btree_node_alloc(struct btree_update *as, + struct btree_trans *trans, + unsigned level) { struct bch_fs *c = as->c; struct btree *b; @@ -271,8 +273,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); set_btree_node_accessed(b); set_btree_node_dirty_acct(c, b); @@ -323,12 +325,13 @@ static void btree_set_max(struct btree *b, struct bpos pos) } struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b, struct bkey_format format) { struct btree *n; - n = bch2_btree_node_alloc(as, b->c.level); + n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); @@ -347,6 +350,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, } static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b) { struct bkey_format new_f = bch2_btree_calc_format(b); @@ -358,12 +362,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, if (!bch2_btree_node_format_fits(as->c, b, &new_f)) new_f = b->format; - return __bch2_btree_node_alloc_replacement(as, b, new_f); + return __bch2_btree_node_alloc_replacement(as, trans, b, new_f); } -static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) +static struct btree *__btree_root_alloc(struct btree_update *as, + struct btree_trans *trans, unsigned level) { - struct btree *b = bch2_btree_node_alloc(as, level); + struct btree *b = bch2_btree_node_alloc(as, trans, level); btree_set_min(b, POS_MIN); btree_set_max(b, SPOS_MAX); @@ -378,7 +383,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) return b; } -static void bch2_btree_reserve_put(struct btree_update *as) +static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; struct prealloc_nodes *p; @@ -405,8 +410,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) 
mutex_unlock(&c->btree_reserve_cache_lock); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -460,7 +465,7 @@ err: /* Asynchronous interior node update machinery */ -static void bch2_btree_update_free(struct btree_update *as) +static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; @@ -473,7 +478,7 @@ static void bch2_btree_update_free(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); @@ -551,12 +556,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; - struct btree *b = as->b; + struct btree *b; struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; + bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -573,15 +579,16 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - struct btree *old = as->old_nodes[i]; __le64 seq; - six_lock_read(&old->c.lock, NULL, NULL); - seq = old->data ? old->data->keys.seq : 0; - six_unlock_read(&old->c.lock); + b = as->old_nodes[i]; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + seq = b->data ? b->data->keys.seq : 0; + six_unlock_read(&b->c.lock); if (seq == as->old_nodes_seq[i]) - wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -598,19 +605,19 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. 
*/ - bch2_trans_init(&trans, c, 0, 512); ret = commit_do(&trans, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_exit(&trans); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_unlock(&trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "error %i in btree_update_nodes_written()", ret); err: - if (b) { + if (as->b) { + b = as->b; /* * @b is the node we did the final insert into: * @@ -623,8 +630,8 @@ err: * we're in journal error state: */ - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write); mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -681,7 +688,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - six_lock_read(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -689,7 +696,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as); + bch2_btree_update_free(as, &trans); + bch2_trans_exit(&trans); } static void btree_interior_update_work(struct work_struct *work) @@ -936,7 +944,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, as->nr_old_nodes++; } -static void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; u64 start_time = as->start_time; @@ -947,7 +955,7 @@ static void bch2_btree_update_done(struct btree_update *as) up_read(&as->c->gc_lock); as->took_gc_lock = false; - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); @@ -1102,7 +1110,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_trans_verify_not_restarted(trans, restart_count); return as; err: - bch2_btree_update_free(as); + bch2_btree_update_free(as, trans); return ERR_PTR(ret); } @@ -1254,6 +1262,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, * node) */ static struct btree *__btree_split_node(struct btree_update *as, + struct btree_trans *trans, struct btree *n1) { struct bkey_format_state s; @@ -1263,7 +1272,7 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; struct bpos n1_pos; - n2 = bch2_btree_node_alloc(as, n1->c.level); + n2 = bch2_btree_node_alloc(as, trans, n1->c.level); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1427,7 +1436,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); - n1 = bch2_btree_node_alloc_replacement(as, b); + n1 = bch2_btree_node_alloc_replacement(as, trans, b); if (keys) btree_split_insert_keys(as, trans, path, n1, keys); @@ -1435,7 +1444,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_and_count(c, 
btree_node_split, c, b); - n2 = __btree_split_node(as, n1); + n2 = __btree_split_node(as, trans, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1457,7 +1466,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (!parent) { /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, b->c.level + 1); + n3 = __btree_root_alloc(as, trans, b->c.level + 1); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1622,7 +1631,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return PTR_ERR(as); btree_split(as, trans, path, b, NULL, flags); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); @@ -1741,7 +1750,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, b->c.level); + n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, max(BTREE_NODE_SEQ(b->data), @@ -1788,7 +1797,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: @@ -1822,7 +1831,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, b); + n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_update_add_new_node(as, n); bch2_btree_build_aux_trees(n); @@ -1847,7 +1856,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); out: bch2_btree_path_downgrade(trans, iter->path); return ret; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index adfc6c24a7a4..7af810df8348 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -117,6 +117,7 @@ struct btree_update { }; struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree_trans *, struct btree *, struct bkey_format); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a8306b16956d..d414cbefa3c9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -169,10 +169,13 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; unsigned long old, new, v; unsigned idx = w - b->writes; - six_lock_read(&b->c.lock, NULL, NULL); + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); v = READ_ONCE(b->flags); do { @@ -188,6 +191,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); return 0; } -- cgit From d5024b011cb37b03aeeddd4b38857db427a04f11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 23:39:23 -0400 Subject: bcachefs: bch2_btree_node_lock_write_nofail() Taking a write lock will be 
able to fail, with the new cycle detector - unless we pass it nofail, which is possible but not preferred. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 11 ++++++----- fs/bcachefs/btree_locking.h | 11 ++++++++++- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 2 +- 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 38a66302d6e9..94979b1a4912 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -342,11 +342,12 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } - /* - * XXX: not allowed to be holding read locks when we take a write lock, - * currently - */ - bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + ret = bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + if (ret) { + kfree(new_k); + goto err; + } + if (new_k) { kfree(ck->k); ck->u64s = new_u64s; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 32c28c1341e9..6eaf44fd3f37 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -281,7 +281,7 @@ static inline int btree_node_lock(struct btree_trans *trans, void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); -static inline void bch2_btree_node_lock_write(struct btree_trans *trans, +static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree *b) { @@ -300,6 +300,15 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, __bch2_btree_node_lock_write(trans, b); } +static inline int __must_check +bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + bch2_btree_node_lock_write_nofail(trans, path, b); + return 0; +} + /* relock: */ bool bch2_btree_path_relock_norestart(struct btree_trans *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6fe49766c6c8..db45883d27ce 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1163,7 +1163,7 @@ static void bch2_btree_set_root(struct btree_update *as, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(trans, path, old); + bch2_btree_node_lock_write_nofail(trans, path, old); bch2_btree_set_root_inmem(c, b); @@ -2002,7 +2002,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (ret) goto err; - bch2_btree_node_lock_write(trans, iter->path, b); + bch2_btree_node_lock_write_nofail(trans, iter->path, b); if (new_hash) { mutex_lock(&c->btree_cache.lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d414cbefa3c9..3efec0b30466 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -81,7 +81,7 @@ void bch2_btree_node_lock_for_insert(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - bch2_btree_node_lock_write(trans, path, b); + bch2_btree_node_lock_write_nofail(trans, path, b); bch2_btree_node_prep_for_write(trans, path, b); } -- cgit From 6b81f194f345d15dd15601ee7b604a0640445895 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Sep 2022 22:05:16 -0400 Subject: bcachefs: Fix six_lock_readers_add() Have to be careful with bit fields - when subtracting, this was overflowing into the write_locking bit. 
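Concretely, the old code expressed "subtract n readers" as an atomic add of the field's truncated negative value; the carry out of the read count field then lands in the bit above it. A minimal userspace illustration of the failure mode and the fix (the field layout here is invented for the example - the real one is defined in six.h):

#include <stdint.h>
#include <stdio.h>

/* Pretend layout: low 8 bits = reader count, bit 8 = "write_locking" flag. */
#define READERS_MASK	0xffULL
#define WRITE_LOCKING	(1ULL << 8)

int main(void)
{
	uint64_t state = 3;	/* three readers, flag clear */

	/* Buggy: "subtract 2" done by adding the truncated negative value;
	 * the carry out of the reader field sets the write_locking bit: */
	uint64_t bad = state + ((uint64_t)-2 & READERS_MASK);
	printf("bad:  readers=%llu write_locking=%d\n",
	       (unsigned long long)(bad & READERS_MASK),
	       !!(bad & WRITE_LOCKING));	/* readers=1, flag wrongly set */

	/* Fixed: subtract the positive magnitude instead, as the patch does
	 * with atomic64_sub(): */
	uint64_t good = state - 2;
	printf("good: readers=%llu write_locking=%d\n",
	       (unsigned long long)(good & READERS_MASK),
	       !!(good & WRITE_LOCKING));	/* readers=1, flag still clear */
	return 0;
}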
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 24d0ea903380..158cb7ac64f2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -10,10 +10,12 @@ struct lock_class_key bch2_btree_node_lock_key; static inline void six_lock_readers_add(struct six_lock *lock, int nr) { - if (!lock->readers) + if (lock->readers) + this_cpu_add(*lock->readers, nr); + else if (nr > 0) atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); else - this_cpu_add(*lock->readers, nr); + atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); } struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, -- cgit From 4e6defd106b69c3a78da380d694fd43275125dda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Aug 2022 18:53:42 -0400 Subject: bcachefs: btree_bkey_cached_common->cached Add a type descriptor to btree_bkey_cached_common - there's no reason not to since we've got padding that was otherwise unused, and this is a nice cleanup (and helpful in later patches). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++----- fs/bcachefs/btree_key_cache.c | 1 + fs/bcachefs/btree_locking.c | 3 +-- fs/bcachefs/btree_types.h | 6 +++--- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e76907af09f1..9c39027513b0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3021,8 +3021,7 @@ void bch2_trans_exit(struct btree_trans *trans) static void __maybe_unused bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b, - bool cached) + struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); struct task_struct *owner; @@ -3035,7 +3034,7 @@ bch2_btree_path_node_to_text(struct printbuf *out, prt_printf(out, " l=%u %s:", b->level, bch2_btree_ids[b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(b, cached)); + bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, " locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); @@ -3068,7 +3067,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { prt_printf(out, " %c l=%u ", lock_types[btree_node_locked_type(path, l)], l); - bch2_btree_path_node_to_text(out, b, path->cached); + bch2_btree_path_node_to_text(out, b); prt_printf(out, "\n"); } } @@ -3086,7 +3085,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_bpos_to_text(out, trans->locking_pos); prt_printf(out, " node "); - bch2_btree_path_node_to_text(out, b, path->cached); + bch2_btree_path_node_to_text(out, b); prt_printf(out, "\n"); } } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 94979b1a4912..517b9861c01c 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -204,6 +204,7 @@ bkey_cached_alloc(struct btree_trans *trans, INIT_LIST_HEAD(&ck->list); __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); lockdep_set_novalidate_class(&ck->c.lock); + ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); return ck; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 158cb7ac64f2..c73902c170d4 100644 --- a/fs/bcachefs/btree_locking.c +++ 
b/fs/bcachefs/btree_locking.c @@ -144,8 +144,7 @@ int __bch2_btree_node_lock(struct btree_trans *trans, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c, - linked->cached)) <= 0) { + bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) { reason = 7; goto deadlock; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 42459a5bf035..6d9888e3a96a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -63,6 +63,7 @@ struct btree_bkey_cached_common { struct six_lock lock; u8 level; u8 btree_id; + bool cached; }; struct btree { @@ -335,10 +336,9 @@ struct bkey_cached { struct bkey_i *k; }; -static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b, - bool cached) +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) { - return !cached + return !b->cached ? container_of(b, struct btree, c)->key.k.p : container_of(b, struct bkey_cached, c)->key.pos; } -- cgit From da4474f20961f995a1d54f82b4c462c94ea03552 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Sep 2022 21:09:54 -0400 Subject: bcachefs: Convert more locking code to btree_bkey_cached_common Ideally, all the code in btree_locking.c should be converted, but then we'd want to convert btree_path to point to btree_key_cached_common too, and then we'd be in for a much bigger cleanup - but a bit of incremental cleanup will still be helpful for the next patches. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.c | 11 ++++++----- fs/bcachefs/btree_locking.h | 16 ++++++++-------- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 2 +- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 517b9861c01c..2de9a0cc17b6 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -343,7 +343,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } - ret = bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); if (ret) { kfree(new_k); goto err; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c73902c170d4..bfe9780aea3a 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -52,9 +52,10 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, /* lock */ -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) +void __bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_bkey_cached_common *b) { - int readers = bch2_btree_node_lock_counts(trans, NULL, &b->c, b->c.level).n[SIX_LOCK_read]; + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; /* * Must drop our read locks before calling six_lock_write() - @@ -62,9 +63,9 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) * goes to 0, and it's safe because we have the node intent * locked: */ - six_lock_readers_add(&b->c.lock, -readers); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - six_lock_readers_add(&b->c.lock, readers); + six_lock_readers_add(&b->lock, -readers); + btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write); + six_lock_readers_add(&b->lock, readers); } static inline bool path_has_read_locks(struct btree_path *path) diff --git a/fs/bcachefs/btree_locking.h 
b/fs/bcachefs/btree_locking.h index 6eaf44fd3f37..9758a0c05d25 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -279,31 +279,31 @@ static inline int btree_node_lock(struct btree_trans *trans, return ret; } -void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); +void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *); static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, - struct btree *b) + struct btree_bkey_cached_common *b) { - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); - EBUG_ON(!btree_node_intent_locked(path, b->c.level)); + EBUG_ON(&path->l[b->level].b->c != b); + EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->level)); /* * six locks are unfair, and read locks block while a thread wants a * write lock: thus, we need to tell the cycle detector we have a write * lock _before_ taking the lock: */ - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_write); + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); - if (unlikely(!six_trylock_write(&b->c.lock))) + if (unlikely(!six_trylock_write(&b->lock))) __bch2_btree_node_lock_write(trans, b); } static inline int __must_check bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, - struct btree *b) + struct btree_bkey_cached_common *b) { bch2_btree_node_lock_write_nofail(trans, path, b); return 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index db45883d27ce..d4e2ebe263a3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1163,7 +1163,7 @@ static void bch2_btree_set_root(struct btree_update *as, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, old); + bch2_btree_node_lock_write_nofail(trans, path, &old->c); bch2_btree_set_root_inmem(c, b); @@ -2002,7 +2002,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (ret) goto err; - bch2_btree_node_lock_write_nofail(trans, iter->path, b); + bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3efec0b30466..7f60f9f81f42 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -81,7 +81,7 @@ void bch2_btree_node_lock_for_insert(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - bch2_btree_node_lock_write_nofail(trans, path, b); + bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_prep_for_write(trans, path, b); } -- cgit From 0242130fb67fdcc617229fb9112c50f4caabab3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Sep 2022 21:14:53 -0400 Subject: bcachefs: Refactor bkey_cached_alloc() path Clean up the arguments passed and make them more consistent. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 2de9a0cc17b6..b3d383ff3d5b 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -151,33 +151,34 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, } static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, - struct btree_key_cache *c) +bkey_cached_alloc(struct btree_trans *trans) { + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; struct btree_key_cache_freelist *f; preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); + f = this_cpu_ptr(bc->pcpu_freed); if (f->nr) ck = f->objs[--f->nr]; preempt_enable(); if (!ck) { - mutex_lock(&c->lock); + mutex_lock(&bc->lock); preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); + f = this_cpu_ptr(bc->pcpu_freed); - while (!list_empty(&c->freed) && + while (!list_empty(&bc->freed) && f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&c->freed, struct bkey_cached, list); + ck = list_last_entry(&bc->freed, struct bkey_cached, list); list_del_init(&ck->list); f->objs[f->nr++] = ck; } ck = f->nr ? f->objs[--f->nr] : NULL; preempt_enable(); - mutex_unlock(&c->lock); + mutex_unlock(&bc->lock); } if (ck) { @@ -185,14 +186,14 @@ bkey_cached_alloc(struct btree_trans *trans, ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); if (unlikely(ret)) { - bkey_cached_move_to_freelist(c, ck); + bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); } ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_write); if (unlikely(ret)) { six_unlock_intent(&ck->c.lock); - bkey_cached_move_to_freelist(c, ck); + bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); } @@ -239,15 +240,14 @@ bkey_cached_reuse(struct btree_key_cache *c) static struct bkey_cached * btree_key_cache_create(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) + struct btree_path *path) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(trans, bc); + ck = bkey_cached_alloc(trans); if (unlikely(IS_ERR(ck))) return ck; @@ -255,20 +255,20 @@ btree_key_cache_create(struct btree_trans *trans, ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[btree_id]); + bch2_btree_ids[path->btree_id]); return ERR_PTR(-ENOMEM); } was_new = false; } else { - if (btree_id == BTREE_ID_subvolumes) + if (path->btree_id == BTREE_ID_subvolumes) six_lock_pcpu_alloc(&ck->c.lock); } ck->c.level = 0; - ck->c.btree_id = btree_id; - ck->key.btree_id = btree_id; - ck->key.pos = pos; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; @@ -396,7 +396,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - ck = btree_key_cache_create(trans, path->btree_id, path->pos); + ck = btree_key_cache_create(trans, path); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; -- cgit From 3d21d48e898a2eadc9055c44e0fd51e6087c9e9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Sep 2022 22:07:31 -0400 Subject: bcachefs: Fix usage of six 
lock's percpu mode, key cache version Similar to "bcachefs: Fix usage of six lock's percpu mode", six locks have a percpu mode, but we can't switch between percpu and non percpu modes while a lock is in use: threads attempting to take a read lock may race, and we'll end up with the read count permanently off. Fixing this the "correct" way, in six_lock_pcpu_(alloc|free) would require an RCU barrier, and we don't want to do that - instead, we have to permanently segragate percpu and non percpu objects, including when on freelists. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 130 ++++++++++++++++++++++++++++-------------- fs/bcachefs/btree_types.h | 3 +- 2 files changed, 90 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b3d383ff3d5b..89e540c5a244 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -13,6 +13,11 @@ #include +static inline bool btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} + static struct kmem_cache *bch2_key_cache; static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -84,7 +89,10 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move_tail(&ck->list, &bc->freed); + if (ck->c.lock.readers) + list_move_tail(&ck->list, &bc->freed_pcpu); + else + list_move_tail(&ck->list, &bc->freed_nonpcpu); atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -96,35 +104,41 @@ static void bkey_cached_free(struct btree_key_cache *bc, } static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, - struct bkey_cached *ck) + struct bkey_cached *ck) { struct btree_key_cache_freelist *f; bool freed = false; BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); + if (!ck->c.lock.readers) { preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - list_move_tail(&ck2->list, &bc->freed); + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; } preempt_enable(); - list_move_tail(&ck->list, &bc->freed); + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + list_move_tail(&ck2->list, &bc->freed_nonpcpu); + } + preempt_enable(); + + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); + } + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); mutex_unlock(&bc->lock); } } @@ -151,33 +165,43 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, } static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&bc->lock); + if (!pcpu_readers) { preempt_disable(); f = 
this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); - while (!list_empty(&bc->freed) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&bc->freed, struct bkey_cached, list); + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } + + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); - f->objs[f->nr++] = ck; } - - ck = f->nr ? f->objs[--f->nr] : NULL; - preempt_enable(); mutex_unlock(&bc->lock); } @@ -205,6 +229,9 @@ bkey_cached_alloc(struct btree_trans *trans) INIT_LIST_HEAD(&ck->list); __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); lockdep_set_novalidate_class(&ck->c.lock); + if (pcpu_readers) + six_lock_pcpu_alloc(&ck->c.lock); + ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); @@ -239,15 +266,14 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, - struct btree_path *path) +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(trans); + ck = bkey_cached_alloc(trans, path); if (unlikely(IS_ERR(ck))) return ck; @@ -714,7 +740,23 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ - list_for_each_entry_safe(ck, t, &bc->freed, list) { + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) break; @@ -808,7 +850,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); + list_add(&ck->list, &bc->freed_nonpcpu); } rcu_read_unlock(); @@ -818,11 +860,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < f->nr; i++) { ck = f->objs[i]; - list_add(&ck->list, &bc->freed); + list_add(&ck->list, &bc->freed_nonpcpu); } } - list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -830,6 +874,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } @@ -849,7 +894,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) void 
bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6d9888e3a96a..0a3854b614e0 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -299,7 +299,8 @@ struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; struct shrinker shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; -- cgit From 38474c264252475196a5e3c555b2625a5bc36a00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Sep 2022 22:59:39 -0400 Subject: bcachefs: Avoid using btree_node_lock_nopath() With the upcoming cycle detector, we have to be careful about using btree_node_lock_nopath - in particular, using it to take write locks can cause deadlocks. All held locks need to be tracked in a btree_path, so that the cycle detector knows about them - unless we know that we cannot cause deadlocks for other reasons: e.g. we are only taking read locks, or we're in very early fsck (topology repair). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++--- fs/bcachefs/btree_key_cache.c | 17 +++++--------- fs/bcachefs/btree_update_interior.c | 47 +++++++++++++++++-------------------- 3 files changed, 30 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9c39027513b0..b1c81278ad75 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -716,13 +716,13 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) struct btree_path *path; trans_for_each_path(trans, path) - if (!path->cached && + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && btree_path_pos_in_node(path, b)) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); - if (path->nodes_locked && - t != BTREE_NODE_UNLOCKED) { + if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 89e540c5a244..0f54db0c1b8a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -568,26 +568,21 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, atomic_long_dec(&c->btree_key_cache.nr_dirty); } } else { + struct btree_path *path2; evict: - BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); - /* - * XXX: holding a lock that is not marked in btree_trans, not - * ideal: - */ - six_lock_increment(&ck->c.lock, SIX_LOCK_intent); - bch2_trans_unlock(trans); - - /* Will not fail because we are holding no other locks: */ - btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_write); + bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); } out: diff 
--git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d4e2ebe263a3..d3e3f9466af1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -160,22 +160,23 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_path *path; - - trans_for_each_path(trans, path) - BUG_ON(path->l[b->c.level].b == b && - path->l[b->c.level].lock_seq == b->c.lock.state.seq); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + unsigned level = b->c.level; + bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } } static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, @@ -1507,22 +1508,19 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (n3) bch2_btree_update_get_open_buckets(as, n3); - /* Successful split, update the path to point to the new nodes: */ - - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - if (n3) - bch2_trans_node_add(trans, n3); - if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); - /* * The old node must be freed (in memory) _before_ unlocking the new * nodes - else another thread could re-acquire a read lock on the old * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, path, b); + + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1785,16 +1783,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - six_lock_increment(&m->c.lock, SIX_LOCK_intent); + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); bch2_trans_node_add(trans, n); bch2_trans_verify_paths(trans); - bch2_btree_node_free_inmem(trans, b); - bch2_btree_node_free_inmem(trans, m); - six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); @@ -1851,9 +1846,9 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_trans_node_add(trans, n); - bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); -- cgit From 1bb9123301834fbeb81de9e52181ba71b06a011a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Sep 2022 22:24:16 -0400 Subject: bcachefs: Ensure intent locks are marked before taking write locks Locks must be correctly marked for the cycle detector to work. 
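As a rough illustration of that rule (a toy model in plain C, not the bcachefs API - every name below is made up): record the intent lock somewhere the cycle detector can see it before taking the write lock, and un-record it if the attempt fails.

#include <stdbool.h>
#include <stdio.h>

enum lock_state { LK_UNLOCKED, LK_INTENT, LK_WRITE };

struct toy_lock { int holder; };				/* -1 == free */
struct toy_path { struct toy_lock *l; enum lock_state state; };

/* Record (or clear) what this transaction holds, so anything walking
 * the path sees it. */
static void path_mark(struct toy_path *p, struct toy_lock *l, enum lock_state s)
{
	p->l = l;
	p->state = s;
}

static bool take_write_lock(struct toy_path *p, struct toy_lock *l, int who)
{
	path_mark(p, l, LK_INTENT);		/* 1: mark the intent lock first */
	if (l->holder != -1 && l->holder != who) {
		path_mark(p, NULL, LK_UNLOCKED); /* 2: failed - unmark again */
		return false;
	}
	l->holder = who;
	path_mark(p, l, LK_WRITE);		/* 3: success - upgrade the recorded state */
	return true;
}

int main(void)
{
	struct toy_lock lock = { -1 };
	struct toy_path path = { NULL, LK_UNLOCKED };

	printf("write lock taken: %d, recorded state: %d\n",
	       take_write_lock(&path, &lock, 0), (int) path.state);
	return 0;
}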
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0f54db0c1b8a..977c523359a5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -214,9 +214,13 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) return ERR_PTR(ret); } - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_write); + path->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { - six_unlock_intent(&ck->c.lock); + btree_node_unlock(trans, path, 0); bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); } @@ -285,6 +289,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) return ERR_PTR(-ENOMEM); } + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); was_new = false; } else { if (path->btree_id == BTREE_ID_subvolumes) @@ -311,6 +316,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) bkey_cached_free_fast(bc, ck); } + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); return NULL; } -- cgit From e87b0e4a7120eeca1850666351b75bf8ceb9d5c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Sep 2022 01:28:51 -0400 Subject: bcachefs: Fix redundant transaction restart A little bit of tidying up; this makes the counters a little clearer as to what's happening. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index bfe9780aea3a..08dbc799bb35 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -413,14 +413,13 @@ int bch2_trans_relock(struct btree_trans *trans) struct btree_path *path; if (unlikely(trans->restarted)) - return -BCH_ERR_transaction_restart_relock; + return - ((int) trans->restarted); trans_for_each_path(trans, path) if (path->should_be_locked && - bch2_btree_path_relock(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); - BUG_ON(!trans->restarted); - return -BCH_ERR_transaction_restart_relock; + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; } -- cgit From 1ffb876fb0f31632b761ee721f633e0d7491ca7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Sep 2022 02:22:47 -0400 Subject: bcachefs: Kill journal_keys->journal_seq_base This removes an optimization that didn't actually save us any memory, due to alignment, but did make the code more complicated than it needed to be. We were also seeing a bug where journal_seq_base wasn't getting correctly initialized, so hopefully it'll fix that too.
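A quick worked example of the alignment argument (hypothetical field names, not the real struct journal_key layout): once a pointer member forces 8-byte alignment, widening the 32-bit sequence number to 64 bits typically does not grow the struct at all, because the difference is absorbed by padding.

#include <stdint.h>
#include <stdio.h>

struct key_old {		/* relative u32 seq, needs a separate seq base */
	uint32_t journal_seq;
	uint32_t journal_offset;
	uint8_t  btree_id, level;
	void    *k;		/* pointer forces 8-byte struct alignment */
};

struct key_new {		/* absolute u64 seq, no base needed */
	uint64_t journal_seq;
	uint32_t journal_offset;
	uint8_t  btree_id, level;
	void    *k;
};

int main(void)
{
	/* On a typical LP64 target both print 24: padding absorbs the extra u32. */
	printf("old: %zu bytes, new: %zu bytes\n",
	       sizeof(struct key_old), sizeof(struct key_new));
	return 0;
}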
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 ++--- fs/bcachefs/recovery.c | 14 ++------------ 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c1d96222f4c3..74da688d994b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,13 +555,13 @@ struct journal_seq_blacklist_table { struct journal_keys { struct journal_key { + u64 journal_seq; + u32 journal_offset; enum btree_id btree_id:8; unsigned level:8; bool allocated; bool overwritten; struct bkey_i *k; - u32 journal_seq; - u32 journal_offset; } *d; /* * Gap buffer: instead of all the empty space in the array being at the @@ -571,7 +571,6 @@ struct journal_keys { size_t gap; size_t nr; size_t size; - u64 journal_seq_base; }; struct btree_path_buf { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2cf347530b65..ea8cc636a9e0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -222,7 +222,6 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, - .journal_seq_base = keys->journal_seq_base, }; new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); @@ -493,9 +492,6 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - if (!keys->journal_seq_base) - keys->journal_seq_base = le64_to_cpu(i->j.seq); - for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } @@ -515,15 +511,12 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX); - for_each_jset_key(k, _n, entry, &i->j) keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(i->j.seq) - - keys->journal_seq_base, + .journal_seq = le64_to_cpu(i->j.seq), .journal_offset = k->_data - i->j._data, }; } @@ -617,15 +610,12 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); - if (keys->nr) - replay_now_at(j, keys->journal_seq_base); - for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; cond_resched(); - replay_now_at(j, keys->journal_seq_base + k->journal_seq); + replay_now_at(j, k->journal_seq); ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW| -- cgit From 5877d8876afe1c5843731244f39d1739eba2665f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Sep 2022 14:10:12 -0400 Subject: bcachefs: Re-enable hash_redo_key() When subvolumes & snapshots were rolled out, hash_redo_key() was disabled due to some new complications - namely, bch2_hash_set() works at the subvolume level, and fsck does not run in a defined subvolume, instead working at the snapshot ID level. This patch splits out bch2_hash_set_snapshot() from bch2_hash_set(), and makes one small tweak for fsck: - Normally, bch2_hash_set() (and other dirent code) needs to know what subvolume we're in, because dirents that point to other subvolumes should only be visible in the subvolume they were created in, not other snapshots. We can't check that in fsck, so we just assume that all dirents are visible. 
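A minimal sketch of the visibility rule described above, with toy types rather than the real dirent/bkey machinery (names are illustrative only): a zero inum acts as "no subvolume context", so fsck-style lookups treat every key as visible while normal lookups still filter by subvolume.

#include <stdbool.h>
#include <stdio.h>

struct subvol_inum { unsigned subvol, inum; };
struct toy_key     { unsigned target_subvol; };

static bool dirent_visible(struct subvol_inum inum, struct toy_key k)
{
	return k.target_subvol == inum.subvol;
}

static bool is_visible_key(struct subvol_inum inum, struct toy_key k)
{
	/* fsck path: no subvolume context (inum == 0) => assume visible */
	return !inum.inum || dirent_visible(inum, k);
}

int main(void)
{
	struct toy_key k = { .target_subvol = 2 };

	printf("normal lookup from subvol 1: %d\n",
	       is_visible_key((struct subvol_inum){ 1, 42 }, k));	/* 0 */
	printf("fsck (no subvolume context): %d\n",
	       is_visible_key((struct subvol_inum){ 0, 0 }, k));	/* 1 */
	return 0;
}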
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 ++++++++----- fs/bcachefs/str_hash.h | 45 ++++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f768d774ba6..12f2ef4417cb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -772,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { - bch_err(trans->c, "hash_redo_key() not implemented yet"); - return -EINVAL; -#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -792,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); -#endif + bch2_hash_set_snapshot(trans, desc, hash_info, + (subvol_inum) { 0, k.k->p.inode }, + k.k->p.snapshot, tmp, + BCH_HASH_SET_MUST_CREATE, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } static int hash_check_key(struct btree_trans *trans, diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 591bbb9f8beb..560983df13f0 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -144,7 +144,9 @@ struct bch_hash_desc { static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) { return k.k->type == desc.key_type && - (!desc.is_visible || desc.is_visible(inum, k)); + (!desc.is_visible || + !inum.inum || + desc.is_visible(inum, k)); } static __always_inline int @@ -239,27 +241,24 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, - struct bkey_i *insert, int flags) +int bch2_hash_set_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + int flags, + int update_flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, - SPOS(inum.inum, + SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), - POS(inum.inum, U64_MAX), + POS(insert->k.p.inode, U64_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) @@ -303,6 +302,26 @@ not_found: goto out; } +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, + struct bkey_i *insert, int flags) +{ + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + insert->k.p.inode = inum.inum; + + return bch2_hash_set_snapshot(trans, desc, info, inum, + snapshot, insert, flags, 0); +} + static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, -- cgit From 5a82c7c7d1925f6f060a427f38ea17b53c6945f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Sep 2022 18:39:01 -0400 Subject: bcachefs: Fix sb_field_counters 
formatting We have counters with longer names now, so adjust the tabstop - also, make sure there's always a space printed between the name and the number. Signed-off-by: Kent Overstreet --- fs/bcachefs/counters.c | 2 +- fs/bcachefs/super-io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index 745f856e6d3e..edd1b2537f48 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -36,7 +36,7 @@ void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, for (i = 0; i < nr; i++) { if (i < BCH_COUNTER_NR) - prt_printf(out, "%s", bch2_counter_names[i]); + prt_printf(out, "%s ", bch2_counter_names[i]); else prt_printf(out, "(unknown)"); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4953f54e94d6..220fda28c865 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1483,7 +1483,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, unsigned nr_devices = 0; if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); + printbuf_tabstop_push(out, 44); mi = bch2_sb_get_members(sb); if (mi) { -- cgit From b8eec675912ecb7e53dabe35d2869c2da60c45c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Sep 2022 15:20:13 -0400 Subject: bcachefs: Add a manual trigger for lock wakeups Spotted a lockup once that appeared to be a lost wakeup. Adding a manual trigger for lock wakeups will make it easy to tell if that's what it is next time it occurs. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index d10ac84c10ce..96c107e0508e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -156,6 +156,7 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(prune_cache); +write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); rw_attribute(gc_gens_pos); @@ -363,6 +364,21 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } +static void bch2_btree_wakeup_all(struct bch_fs *c) +{ + struct btree_trans *trans; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + + if (b) + six_lock_wakeup_all(&b->lock); + + } + mutex_unlock(&c->btree_trans_lock); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -483,6 +499,9 @@ STORE(bch2_fs) c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } + if (attr == &sysfs_btree_wakeup) + bch2_btree_wakeup_all(c); + if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -614,6 +633,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_prune_cache, + &sysfs_btree_wakeup, &sysfs_gc_gens_pos, -- cgit From 367d72dd5fd5b9e0b87633cbcb11b58b91d6bcc5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Sep 2022 14:36:24 -0400 Subject: bcachefs: bch2_btree_path_upgrade() now emits transaction restart Centralizing the transaction restart/tracepoint in bch2_btree_path_upgrade() lets us improve the tracepoint - now it emits old and new locks_want. 
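A small sketch of the refactoring pattern (toy names and a made-up error value, not the real helpers): the upgrade function itself returns 0 or a restart error and records the old and new locks_want in one place, instead of every caller checking a bool and issuing its own restart.

#include <stdio.h>

#define ERR_RESTART_UPGRADE (-2048)	/* stand-in for a private restart code */

struct toy_path { unsigned locks_want; int uptodate; };

static int raw_upgrade(struct toy_path *p, unsigned want)
{
	/* Pretend the upgrade fails when asking for more than 4 levels. */
	if (want > 4)
		return 0;
	p->locks_want = want;
	return 1;
}

static int path_upgrade(struct toy_path *p, unsigned new_want)
{
	unsigned old_want = p->locks_want;

	if (p->locks_want < new_want
	    ? raw_upgrade(p, new_want)	/* need more locks: try to get them */
	    : p->uptodate)		/* already have enough: just check state */
		return 0;

	/* One place to trace and restart, instead of N call sites: */
	printf("trace: upgrade restart, locks_want %u -> %u\n", old_want, new_want);
	return ERR_RESTART_UPGRADE;
}

int main(void)
{
	struct toy_path p = { .locks_want = 1, .uptodate = 1 };

	printf("upgrade to 3: %d\n", path_upgrade(&p, 3));	/* 0 */
	printf("upgrade to 7: %d\n", path_upgrade(&p, 7));	/* restart error */
	return 0;
}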
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 19 ++++++++---- fs/bcachefs/btree_update_interior.c | 12 ++++---- fs/bcachefs/btree_update_leaf.c | 8 ++--- fs/bcachefs/trace.h | 59 +++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 9758a0c05d25..aea2ebafffd8 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -345,15 +345,22 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, bool __bch2_btree_path_upgrade(struct btree_trans *, struct btree_path *, unsigned); -static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { + unsigned old_locks_want = path->locks_want; + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - return path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want) - : path->uptodate == BTREE_ITER_UPTODATE; + if (path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); } /* misc: */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d3e3f9466af1..783b63bcce2f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1003,11 +1003,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (update_level < BTREE_MAX_DEPTH) nr_nodes[1] += 1; - if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = bch2_btree_path_upgrade(trans, path, U8_MAX); + if (ret) return ERR_PTR(ret); - } if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); @@ -2035,9 +2033,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite struct closure cl; int ret = 0; - if (!btree_node_intent_locked(path, b->c.level) && - !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); + if (ret) + return ret; closure_init_stack(&cl); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7f60f9f81f42..d262a9e16b95 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -806,7 +806,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct // break; if (btree_node_read_locked(path, path->level) && - !bch2_btree_path_upgrade(trans, path, path->level + 1)) + !bch2_btree_path_upgrade_noupgrade_sibs(trans, path, path->level + 1)) return true; } @@ -1131,11 +1131,9 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = 
bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) goto out; - } BUG_ON(!btree_node_intent_locked(i->path, i->level)); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 1ef99af5cd03..62de89fcb74b 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -401,6 +401,7 @@ TRACE_EVENT(btree_path_relock_fail, __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) + __field(u8, level ) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u32, iter_lock_seq ) @@ -413,6 +414,7 @@ TRACE_EVENT(btree_path_relock_fail, strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; + __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); if (IS_ERR(b)) strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); @@ -422,13 +424,14 @@ TRACE_EVENT(btree_path_relock_fail, __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, + __entry->level, __entry->node, __entry->iter_lock_seq, __entry->node_lock_seq) @@ -445,12 +448,15 @@ TRACE_EVENT(btree_path_upgrade_fail, __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) + __field(u8, level ) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) __field(u8, self_intent_count) __field(u8, read_count ) __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) ), TP_fast_assign( @@ -459,6 +465,7 @@ TRACE_EVENT(btree_path_upgrade_fail, strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; + __entry->level = level; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); @@ -468,20 +475,25 @@ TRACE_EVENT(btree_path_upgrade_fail, c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_read]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) ? 
path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, + __entry->level, __entry->locked, __entry->self_read_count, __entry->self_intent_count, __entry->read_count, - __entry->intent_count) + __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) ); /* Garbage collection */ @@ -894,18 +906,41 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, +TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); + struct btree_path *path, + unsigned old_locks_want, + unsigned new_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), -DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, + __entry->new_locks_want) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, -- cgit From e4215d0fec777e6516306f5f1b69a45a4205dce0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Sep 2022 14:42:38 -0400 Subject: bcachefs: All held locks must be in a btree path With the new deadlock cycle detector, it's critical that all held locks be marked in a btree_path, because that's what the cycle detector traverses - any locks that aren't correctly marked will cause deadlocks. This changes the btree_path to allocate some btree_paths for the new nodes, since until the final update is done we otherwise don't have a path referencing them. 
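To see why unmarked locks defeat the cycle detector, here is a toy model (illustrative names only, unrelated to the real six lock or btree_path code): the detector walks only the locks recorded in each transaction's paths, so a lock held "off the books" makes a genuine deadlock invisible.

#include <stdbool.h>
#include <stdio.h>

#define MAX_PATHS 4

struct toy_lock  { int id; };			/* identity only */
struct toy_trans {
	int id;
	struct toy_lock *held[MAX_PATHS];	/* locks recorded in paths */
	int nr_held;
	struct toy_lock *waiting_for;
};

/* Does trans "a" transitively wait for a lock recorded as held by "start_id"?
 * (Simplified: no visited set - fine for this tiny example.) */
static bool cycle(struct toy_trans *all[], int n, struct toy_trans *a, int start_id)
{
	if (!a->waiting_for)
		return false;
	for (int i = 0; i < n; i++)
		for (int j = 0; j < all[i]->nr_held; j++)
			if (all[i]->held[j] == a->waiting_for)
				return all[i]->id == start_id ||
					cycle(all, n, all[i], start_id);
	return false;	/* lock held but recorded nowhere: cycle is invisible */
}

int main(void)
{
	struct toy_lock A = { 0 }, B = { 1 };
	struct toy_trans t0 = { .id = 0, .held = { &A }, .nr_held = 1, .waiting_for = &B };
	struct toy_trans t1 = { .id = 1, .held = { &B }, .nr_held = 1, .waiting_for = &A };
	struct toy_trans *all[] = { &t0, &t1 };

	printf("cycle found: %d\n", cycle(all, 2, &t0, 0));	/* 1 */

	t1.nr_held = 0;		/* t1 still holds B, but it is not in a path */
	printf("cycle found: %d\n", cycle(all, 2, &t0, 0));	/* 0: deadlock missed */
	return 0;
}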
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 69 +++----------------------------- fs/bcachefs/btree_iter.h | 3 ++ fs/bcachefs/btree_update_interior.c | 78 +++++++++++++++++++++++++++++++++++-- 3 files changed, 84 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b1c81278ad75..e65c300ffe40 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -619,61 +619,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, return true; } -/* - * Verify that iterator for parent node points to child node: - */ -static void btree_path_verify_new_node(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l; - unsigned plevel; - bool parent_locked; - struct bkey_packed *k; - - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - return; - - if (trans->journal_replay_not_finished) - return; - - plevel = b->c.level + 1; - if (!btree_path_node(path, plevel)) - return; - - parent_locked = btree_node_locked(path, plevel); - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - l = &path->l[plevel]; - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (!k || - bkey_deleted(k) || - bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - struct printbuf buf4 = PRINTBUF; - struct bkey uk = bkey_unpack_key(b, k); - - bch2_dump_btree_node(c, l->b); - bch2_bpos_to_text(&buf1, path->pos); - bch2_bkey_to_text(&buf2, &uk); - bch2_bpos_to_text(&buf3, b->data->min_key); - bch2_bpos_to_text(&buf3, b->data->max_key); - panic("parent iter doesn't point to new node:\n" - "iter pos %s %s\n" - "iter key %s\n" - "new node %s-%s\n", - bch2_btree_ids[path->btree_id], - buf1.buf, buf2.buf, buf3.buf, buf4.buf); - } - - if (!parent_locked) - btree_node_unlock(trans, path, plevel); -} - static inline void __btree_path_level_init(struct btree_path *path, unsigned level) { @@ -689,14 +634,12 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -static inline void btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { BUG_ON(path->cached); - btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); @@ -728,7 +671,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); } - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); } } @@ -809,7 +752,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, path->l[i].b = NULL; mark_btree_node_locked(trans, path, path->level, lock_type); - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); return 0; } @@ -982,7 +925,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, mark_btree_node_locked(trans, path, level, lock_type); path->level = level; - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(path); err: diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bdc703324b9a..2f47889c688a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ 
-179,6 +179,9 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); +inline void bch2_btree_path_level_init(struct btree_trans *, + struct btree_path *, struct btree *); + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 783b63bcce2f..7028597358d5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -28,6 +28,22 @@ static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + struct btree_path *path; + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_INTENT); + path = bch2_btree_path_make_mut(trans, path, true); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; +} + /* Debug code: */ /* @@ -618,7 +634,10 @@ static void btree_update_nodes_written(struct btree_update *as) "error %i in btree_update_nodes_written()", ret); err: if (as->b) { + struct btree_path *path; + b = as->b; + path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -632,7 +651,12 @@ err: */ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write); + mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].b = b; + + bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -666,10 +690,13 @@ err: } mutex_unlock(&c->btree_interior_update_lock); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - six_unlock_intent(&b->c.lock); + btree_node_unlock(&trans, path, b->c.level); + bch2_path_put(&trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -1428,6 +1455,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); @@ -1450,6 +1478,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + + path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + six_lock_increment(&n2->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); + bch2_btree_update_add_new_node(as, n1); 
bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); @@ -1467,6 +1505,12 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, /* Depth increases, make a new root */ n3 = __btree_root_alloc(as, trans, b->c.level + 1); + path2->locks_want++; + BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n3); + n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1480,6 +1524,11 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + bch2_btree_update_add_new_node(as, n1); bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); @@ -1526,6 +1575,15 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); + } + if (path1) { + __bch2_btree_path_unlock(trans, path1); + bch2_path_put(trans, path1, true); + } + bch2_trans_verify_locks(trans); bch2_time_stats_update(&c->times[n2 @@ -1642,7 +1700,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL; + struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1766,6 +1824,11 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bkey_init(&delete.k); @@ -1795,6 +1858,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: + if (new_path) + bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); return ret; @@ -1809,6 +1874,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; + struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; int ret; @@ -1830,6 +1896,11 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + trace_and_count(c, btree_node_rewrite, c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -1850,6 +1921,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); + bch2_path_put(trans, new_path, true); out: bch2_btree_path_downgrade(trans, iter->path); return ret; -- cgit From 
57ce827442c4e7b0f38b14b91c97413c5d779697 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Sep 2022 13:37:34 -0400 Subject: bcachefs: Make an assertion more informative Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e65c300ffe40..1650ba87ef03 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2606,7 +2606,10 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - EBUG_ON(trans->restarted); + if (trans->restarted) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); if (flags & BTREE_ITER_ALL_LEVELS) flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; -- cgit From 5c1ef830f6786059f85bebe7501b63dffed0b633 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Sep 2022 15:43:50 -0400 Subject: bcachefs: Errcodes can now subtype standard error codes The next patch is going to be adding private error codes for all the places we return -ENOSPC. Additionally, this patch updates return paths at all module boundaries to call bch2_err_class(), to return the standard error code. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.c | 13 ++++++- fs/bcachefs/errcode.h | 97 +++++++++++++++++++++++++++----------------------- fs/bcachefs/fs-io.c | 49 +++++++++++++++---------- fs/bcachefs/fs-ioctl.c | 50 +++++++++++++++++--------- fs/bcachefs/fs.c | 23 ++++++------ fs/bcachefs/sysfs.c | 15 ++++++-- fs/bcachefs/xattr.c | 16 ++++++--- 7 files changed, 164 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index 9da8a5973af0..cc9ce0be356e 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -15,7 +15,7 @@ static const char * const bch2_errcode_strs[] = { #define BCH_ERR_0 0 static unsigned bch2_errcode_parents[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() #undef x }; @@ -49,3 +49,14 @@ bool __bch2_err_matches(int err, int class) return err == class; } + +int __bch2_err_class(int err) +{ + err = -err; + BUG_ON((unsigned) err >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return -err; +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 15a1be2fcc84..2088cc5a4f3c 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -3,51 +3,51 @@ #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ - x(0, open_buckets_empty) \ - x(0, freelist_empty) \ - x(freelist_empty, no_buckets_found) \ - x(0, insufficient_devices) \ - x(0, transaction_restart) \ - x(transaction_restart, transaction_restart_fault_inject) \ - x(transaction_restart, transaction_restart_relock) \ - x(transaction_restart, transaction_restart_relock_path) \ - x(transaction_restart, transaction_restart_relock_path_intent) \ - x(transaction_restart, transaction_restart_relock_after_fill) \ - x(transaction_restart, transaction_restart_too_many_iters) \ - x(transaction_restart, transaction_restart_lock_node_reused) \ - x(transaction_restart, transaction_restart_fill_relock) \ - x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ - x(transaction_restart, transaction_restart_mem_realloced) \ - 
x(transaction_restart, transaction_restart_in_traverse_all) \ - x(transaction_restart, transaction_restart_would_deadlock) \ - x(transaction_restart, transaction_restart_would_deadlock_write)\ - x(transaction_restart, transaction_restart_upgrade) \ - x(transaction_restart, transaction_restart_key_cache_upgrade) \ - x(transaction_restart, transaction_restart_key_cache_fill) \ - x(transaction_restart, transaction_restart_key_cache_raced) \ - x(transaction_restart, transaction_restart_key_cache_realloced)\ - x(transaction_restart, transaction_restart_journal_preres_get) \ - x(transaction_restart, transaction_restart_nested) \ - x(0, no_btree_node) \ - x(no_btree_node, no_btree_node_relock) \ - x(no_btree_node, no_btree_node_upgrade) \ - x(no_btree_node, no_btree_node_drop) \ - x(no_btree_node, no_btree_node_lock_root) \ - x(no_btree_node, no_btree_node_up) \ - x(no_btree_node, no_btree_node_down) \ - x(no_btree_node, no_btree_node_init) \ - x(no_btree_node, no_btree_node_cached) \ - x(0, lock_fail_node_reused) \ - x(0, lock_fail_root_changed) \ - x(0, journal_reclaim_would_deadlock) \ - x(0, fsck) \ - x(fsck, fsck_fix) \ - x(fsck, fsck_ignore) \ - x(fsck, fsck_errors_not_fixed) \ - x(fsck, fsck_repair_unimplemented) \ - x(fsck, fsck_repair_impossible) \ - x(0, need_snapshot_cleanup) \ - x(0, need_topology_repair) + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(BCH_ERR_freelist_empty, no_buckets_found) \ + x(0, insufficient_devices) \ + x(0, transaction_restart) \ + x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ + x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ + x(BCH_ERR_no_btree_node, no_btree_node_drop) \ + x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ + x(BCH_ERR_no_btree_node, no_btree_node_up) \ + x(BCH_ERR_no_btree_node, no_btree_node_down) \ + x(BCH_ERR_no_btree_node, no_btree_node_init) \ + x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(0, lock_fail_node_reused) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(0, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, 
fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ + x(0, need_snapshot_cleanup) \ + x(0, need_topology_repair) enum bch_errcode { BCH_ERR_START = 2048, @@ -71,4 +71,11 @@ static inline bool _bch2_err_matches(int err, int class) _bch2_err_matches(_err, _class); \ }) +int __bch2_err_class(int); + +static inline long bch2_err_class(long err) +{ + return err < 0 ? __bch2_err_class(err) : err; +} + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9f1ecb8d7b3b..c83e1de9a39a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1186,7 +1186,7 @@ int bch2_read_folio(struct file *file, struct folio *folio) ret = bch2_read_single_page(page, page->mapping); folio_unlock(folio); - return ret; + return bch2_err_class(ret); } /* writepages: */ @@ -1465,7 +1465,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); - return ret; + return bch2_err_class(ret); } /* buffered writes: */ @@ -1550,7 +1550,7 @@ err_unlock: bch2_pagecache_add_put(&inode->ei_pagecache_lock); kfree(res); *fsdata = NULL; - return ret; + return bch2_err_class(ret); } int bch2_write_end(struct file *file, struct address_space *mapping, @@ -1975,7 +1975,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (ret < 0) - return ret; + goto out; } file_accessed(file); @@ -1991,8 +1991,8 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = generic_file_read_iter(iocb, iter); bch2_pagecache_add_put(&inode->ei_pagecache_lock); } - - return ret; +out: + return bch2_err_class(ret); } /* O_DIRECT writes */ @@ -2224,6 +2224,9 @@ err: /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); + if (ret < 0) + ret = bch2_err_class(ret); + if (!sync) { req->ki_complete(req, ret); ret = -EIOCBQUEUED; @@ -2332,8 +2335,10 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) struct bch_inode_info *inode = file_bch_inode(file); ssize_t ret; - if (iocb->ki_flags & IOCB_DIRECT) - return bch2_direct_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } inode_lock(&inode->v); @@ -2357,8 +2362,8 @@ unlock: if (ret > 0) ret = generic_write_sync(iocb, ret); - - return ret; +out: + return bch2_err_class(ret); } /* fsync: */ @@ -2392,7 +2397,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret2 = sync_inode_metadata(&inode->v, 1); ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2 ?: ret3; + return bch2_err_class(ret ?: ret2 ?: ret3); } /* truncate: */ @@ -2698,7 +2703,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_setattr_nonsize(idmap, inode, iattr); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - return ret; + return bch2_err_class(ret); } /* fallocate: */ @@ -3128,7 +3133,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_unlock(&inode->v); percpu_ref_put(&c->writes); - return ret; + return bch2_err_class(ret); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, @@ -3206,7 +3211,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); - return ret; + return bch2_err_class(ret); } /* fseek: */ @@ -3431,18 +3436,26 @@ err: loff_t bch2_llseek(struct file *file, loff_t offset, int 
whence) { + loff_t ret; + switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: - return generic_file_llseek(file, offset, whence); + ret = generic_file_llseek(file, offset, whence); + break; case SEEK_DATA: - return bch2_seek_data(file, offset); + ret = bch2_seek_data(file, offset); + break; case SEEK_HOLE: - return bch2_seek_hole(file, offset); + ret = bch2_seek_hole(file, offset); + break; + default: + ret = -EINVAL; + break; } - return -EINVAL; + return bch2_err_class(ret); } void bch2_fs_fsio_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index de94895ace9f..3df2f5f3d1ea 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -455,51 +455,67 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; switch (cmd) { case FS_IOC_GETFLAGS: - return bch2_ioc_getflags(inode, (int __user *) arg); + ret = bch2_ioc_getflags(inode, (int __user *) arg); + break; case FS_IOC_SETFLAGS: - return bch2_ioc_setflags(c, file, inode, (int __user *) arg); + ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); + break; case FS_IOC_FSGETXATTR: - return bch2_ioc_fsgetxattr(inode, (void __user *) arg); + ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); + break; + case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + break; case BCHFS_IOC_REINHERIT_ATTRS: - return bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); + break; case FS_IOC_GETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; + case FS_IOC_SETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; case FS_IOC_GOINGDOWN: - return bch2_ioc_goingdown(c, (u32 __user *) arg); + ret = bch2_ioc_goingdown(c, (u32 __user *) arg); + break; case BCH_IOCTL_SUBVOLUME_CREATE: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_create(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_create(c, file, i); + break; } case BCH_IOCTL_SUBVOLUME_DESTROY: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_destroy(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? 
-EFAULT + : bch2_ioctl_subvolume_destroy(c, file, i); + break; } default: - return bch2_fs_ioctl(c, cmd, (void __user *) arg); + ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); + break; } + + return bch2_err_class(ret); } #ifdef CONFIG_COMPAT diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index af4941862187..66fcd3e28e0c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -419,7 +419,7 @@ static int bch2_mknod(struct mnt_idmap *idmap, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_instantiate(dentry, &inode->v); return 0; @@ -529,7 +529,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); ret = page_symlink(&inode->v, symname, strlen(symname) + 1); @@ -769,7 +769,7 @@ err_trans: err: mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static int bch2_getattr(struct mnt_idmap *idmap, @@ -839,7 +839,7 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_mark_tmpfile(file, &inode->v); d_instantiate(file->f_path.dentry, &inode->v); @@ -1454,7 +1454,7 @@ static int bch2_vfs_write_inode(struct inode *vinode, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static void bch2_evict_inode(struct inode *vinode) @@ -1558,6 +1558,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; + int ret; if (c->opts.journal_flush_disabled) return 0; @@ -1567,7 +1568,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait) return 0; } - return bch2_journal_flush(&c->journal); + ret = bch2_journal_flush(&c->journal); + return bch2_err_class(ret); } static struct bch_fs *bch2_path_to_fs(const char *path) @@ -1623,7 +1625,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) ret = bch2_parse_mount_opts(c, &opts, data); if (ret) - return ret; + goto err; if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1637,7 +1639,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (ret) { bch_err(c, "error going rw: %i", ret); up_write(&c->state_lock); - return -EINVAL; + ret = -EINVAL; + goto err; } sb->s_flags &= ~SB_RDONLY; @@ -1650,8 +1653,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; - - return ret; +err: + return bch2_err_class(ret); } static int bch2_show_devname(struct seq_file *seq, struct dentry *root) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 96c107e0508e..50b3ba92c5ae 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -40,14 +40,14 @@ #include "util.h" #define SYSFS_OPS(type) \ -const struct sysfs_ops type ## _sysfs_ops = { \ +const struct sysfs_ops type ## _sysfs_ops = { \ .show = type ## _show, \ .store = type ## _store \ } #define SHOW(fn) \ static ssize_t fn ## _to_text(struct printbuf *, \ - struct kobject *, struct attribute *);\ + struct kobject *, struct attribute *); \ \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ @@ -66,15 +66,24 @@ static ssize_t 
fn ## _show(struct kobject *kobj, struct attribute *attr,\ memcpy(buf, out.buf, ret); \ } \ printbuf_exit(&out); \ - return ret; \ + return bch2_err_class(ret); \ } \ \ static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ struct attribute *attr) #define STORE(fn) \ +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ + const char *, size_t); \ + \ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ const char *buf, size_t size) \ +{ \ + return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ +} \ + \ +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) #define __sysfs_attribute(_name, _mode) \ static struct attribute sysfs_##_name = \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 37793b3357d3..2b9fb4941e9f 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -350,17 +350,19 @@ err: bch2_trans_exit(&trans); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) - return ret; + goto out; return buf.used; +out: + return bch2_err_class(ret); } static int bch2_xattr_get_handler(const struct xattr_handler *handler, @@ -369,8 +371,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; - return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); + ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); + return bch2_err_class(ret); } static int bch2_xattr_set_handler(const struct xattr_handler *handler, @@ -382,11 +386,13 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + int ret; - return bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); + return bch2_err_class(ret); } static const struct xattr_handler bch_xattr_user_handler = { -- cgit From 098ef98d5bff461c66c3798fbebca7b1c06fdf79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Sep 2022 17:10:33 -0400 Subject: bcachefs: Add private error codes for ENOSPC Continuing the saga of introducing private dedicated error codes for each error path, this patch converts ENOSPC to error codes that are subtypes of ENOSPC. We've recently had a test failure where we got -ENOSPC where we shouldn't have, and didn't have enough information to tell where it came from, so this patch will solve that problem. 
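The mechanism is easiest to see in isolation: each private code names the exact failure site but records a generic parent errno, and callers test the class rather than the literal value. Below is a minimal, self-contained userspace sketch of that pattern (MY_ERRCODES, err_matches() and ERR_BASE are invented for illustration; the real table and helpers are BCH_ERRCODES() and bch2_err_matches() in errcode.h):

#include <errno.h>
#include <stdio.h>

#define ERR_BASE	2048			/* assumed: above any real errno */

#define MY_ERRCODES()				\
	x(ENOSPC, ENOSPC_disk_reservation)	\
	x(ENOSPC, ENOSPC_bucket_alloc)		\
	x(ENOSPC, ENOSPC_sb_journal)

enum my_errcode {
	MY_ERR_START = ERR_BASE,
#define x(parent, name)	MY_ERR_##name,
	MY_ERRCODES()
#undef x
	MY_ERR_MAX
};

/* Parent errno for each private code, generated from the same table: */
static const int err_parent[] = {
#define x(parent, name)	[MY_ERR_##name - ERR_BASE] = parent,
	MY_ERRCODES()
#undef x
};

/* Human-readable name, so logs can say which path ran out of space: */
static const char *err_name[] = {
#define x(parent, name)	[MY_ERR_##name - ERR_BASE] = #name,
	MY_ERRCODES()
#undef x
};

/* Does @err (negative; private code or plain errno) belong to class @class? */
static int err_matches(int err, int class)
{
	err = -err;
	if (err >= ERR_BASE)
		err = err_parent[err - ERR_BASE];
	return err == class;
}

int main(void)
{
	int ret = -MY_ERR_ENOSPC_bucket_alloc;

	/* Generic ENOSPC checks keep working, but the origin is preserved: */
	if (err_matches(ret, ENOSPC))
		printf("out of space in %s\n", err_name[-ret - ERR_BASE]);
	return 0;
}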
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 +++- fs/bcachefs/btree_update_leaf.c | 8 +++++--- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/disk_groups.c | 2 +- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/errcode.h | 16 +++++++++++++++- fs/bcachefs/fs-io.c | 4 ++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/journal.c | 13 ++++++++----- fs/bcachefs/journal_sb.c | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/replicas.c | 16 +++++++--------- fs/bcachefs/str_hash.h | 4 ++-- fs/bcachefs/subvolume.c | 4 ++-- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 6 +++--- 16 files changed, 55 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f60fe159916e..e890b09f80c6 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1246,7 +1246,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || bch2_err_matches(ret, BCH_ERR_freelist_empty)) - return cl ? -EAGAIN : -ENOSPC; + return cl + ? -EAGAIN + : -BCH_ERR_ENOSPC_bucket_alloc; if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) return -EROFS; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d262a9e16b95..bf3177a3a420 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1037,9 +1037,11 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - BUG_ON(ret == -ENOSPC && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL)); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 136e116981d7..f01b8171cb92 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1991,7 +1991,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_disk_reservation; } mutex_unlock(&c->sectors_available_lock); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 5f405d38b3de..6b81f35861ac 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -276,7 +276,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, groups = bch2_sb_resize_disk_groups(sb, u64s); if (!groups) - return -ENOSPC; + return -BCH_ERR_ENOSPC_disk_label_add; nr_groups = disk_groups_nr(groups); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f33acf1af110..aa8301146382 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -731,7 +731,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, continue; } - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_stripe_create; break; } @@ -1388,7 +1388,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, idx = get_existing_stripe(c, h); if (idx < 0) { bch_err(c, "failed to find an existing stripe"); - return -ENOSPC; + return -BCH_ERR_ENOSPC_stripe_reuse; } h->s->have_existing_stripe = true; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 2088cc5a4f3c..3dc477eb3600 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -2,7 +2,21 @@ #ifndef _BCACHEFS_ERRCODE_H #define _BCACHEFS_ERRCODE_H -#define BCH_ERRCODES() \ +#define BCH_ERRCODES() \ + x(ENOSPC, ENOSPC_disk_reservation) \ + x(ENOSPC, ENOSPC_bucket_alloc) \ + x(ENOSPC, 
ENOSPC_disk_label_add) \ + x(ENOSPC, ENOSPC_stripe_create) \ + x(ENOSPC, ENOSPC_stripe_reuse) \ + x(ENOSPC, ENOSPC_inode_create) \ + x(ENOSPC, ENOSPC_str_hash_create) \ + x(ENOSPC, ENOSPC_snapshot_create) \ + x(ENOSPC, ENOSPC_subvolume_create) \ + x(ENOSPC, ENOSPC_sb) \ + x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_quota) \ + x(ENOSPC, ENOSPC_sb_replicas) \ + x(ENOSPC, ENOSPC_sb_members) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c83e1de9a39a..73f5677cadce 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3031,7 +3031,7 @@ bkey_err: bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); - if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3082,7 +3082,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, * so that the VFS cache i_size is consistent with the btree i_size: */ if (ret && - !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) return ret; if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fc0f98074dab..99987db87ab6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -552,7 +552,7 @@ again: goto found_slot; if (!ret && start == min) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_inode_create; if (ret) { bch2_trans_iter_exit(trans, iter); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9961cc674ad7..97c1ecb65dbd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -808,14 +808,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (new_fs) { bu[nr_got] = bch2_bucket_alloc_new_fs(ca); if (bu[nr_got] < 0) { - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_bucket_alloc; break; } } else { ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl); if (IS_ERR(ob[nr_got])) { - ret = cl ? -EAGAIN : -ENOSPC; + ret = cl + ? 
-EAGAIN + : -BCH_ERR_ENOSPC_bucket_alloc; break; } @@ -942,10 +944,11 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * reservation to ensure we'll actually be able to allocate: */ - if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) { mutex_unlock(&c->sb_lock); - return -ENOSPC; + return ret; } ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 001cecec1291..cfdbd92d2164 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -197,7 +197,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) j = bch2_sb_resize_journal_v2(&ca->disk_sb, (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); if (!j) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_journal; bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 454c76e03be9..c12d715fb758 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -665,7 +665,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, sb_quota = bch2_sb_resize_quota(&c->disk_sb, sizeof(*sb_quota) / sizeof(u64)); if (!sb_quota) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_quota; } if (info->i_fieldmask & QC_SPC_TIMER) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 4ede807e2fb7..e540c1aa91ba 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -485,7 +485,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { - ret = -ENOSPC; + ret = -ENOMEM; goto err; } @@ -494,10 +494,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) } } - if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (ret) goto err; - } ret = replicas_table_update(c, &c->replicas_gc); err: @@ -600,10 +599,9 @@ retry: bch2_cpu_replicas_sort(&new); - if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + if (ret) goto err; - } ret = replicas_table_update(c, &new); err: @@ -758,7 +756,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); @@ -803,7 +801,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_resize_replicas(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_get_replicas(c->disk_sb.sb); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 560983df13f0..6178ae620ff1 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -207,7 +207,7 @@ bch2_hash_hole(struct btree_trans *trans, return 0; bch2_trans_iter_exit(trans, iter); - return ret ?: -ENOSPC; + return ret ?: -BCH_ERR_ENOSPC_str_hash_create; } static __always_inline @@ -277,7 +277,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_str_hash_create; out: 
bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index fb3f8e4074c7..8c98bacca290 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -517,7 +517,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; if (!k.k || !k.k->p.offset) { - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_snapshot_create; goto err; } @@ -1031,7 +1031,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_subvolume_create; goto err; found_slot: snapshot_subvols[0] = dst_iter.pos.offset; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 220fda28c865..12edd4b9a44b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -132,7 +132,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (new_bytes > max_bytes) { pr_err("%pg: superblock too big: want %zu but have %llu", sb->bdev, new_bytes, max_bytes); - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb; } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1c8fac603644..8dc87c103216 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1584,7 +1584,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { bch_err(c, "device add error: new device superblock too small"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } @@ -1597,7 +1597,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto have_slot; no_slot: bch_err(c, "device add error: already have maximum number of devices"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; have_slot: @@ -1608,7 +1608,7 @@ have_slot: mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) { bch_err(c, "device add error: no room in superblock for member info"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } -- cgit From ebc6f76a667f5fc599a5f76515f6881dfb82af2f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Aug 2022 10:49:52 -0400 Subject: six locks: Simplify wait lists This switches to a single list of waiters, instead of separate lists for read and intent, and switches write locks to also use the wait lists instead of being handled differently. Also, removal from the wait list is now done by the process waiting on the lock, not the process doing the wakeup. This is needed for the new deadlock cycle detector - we need tasks to stay on the waitlist until they've successfully acquired the lock. 
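To make the structural change concrete, here is a small, self-contained userspace model (illustrative only, not the six-lock code): one wait list shared by all lock types, each entry carrying the type its owner wants, and wakeup scanning that single list - waking every reader it finds but at most one non-read waiter. Nothing is removed during wakeup; in the new scheme the waiter takes itself off the list only after it has acquired the lock.

#include <stdio.h>

enum lock_type { LOCK_read, LOCK_intent, LOCK_write };

struct waiter {
	struct waiter	*next;
	const char	*task;		/* stand-in for struct task_struct * */
	enum lock_type	want;
};

/* Wake every waiter that wants @type; only readers are woken in batches. */
static void lock_wakeup(struct waiter *head, enum lock_type type)
{
	for (struct waiter *w = head; w; w = w->next) {
		if (w->want != type)
			continue;
		printf("waking %s\n", w->task);
		/*
		 * The entry stays on the list: the waiter only removes itself
		 * once it has actually taken the lock.
		 */
		if (type != LOCK_read)
			break;
	}
}

int main(void)
{
	struct waiter c = { NULL, "task-c", LOCK_read };
	struct waiter b = { &c,  "task-b", LOCK_intent };
	struct waiter a = { &b,  "task-a", LOCK_read };

	lock_wakeup(&a, LOCK_read);	/* wakes task-a and task-c */
	lock_wakeup(&a, LOCK_intent);	/* wakes only task-b */
	return 0;
}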
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 107 ++++++++++++++++++++---------------------------------- fs/bcachefs/six.h | 11 ++++-- 2 files changed, 48 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 0ab72f59d23b..d5e09fae1538 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -89,50 +89,38 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; - /* This is probably up there with the more evil things I've done */ #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) static inline void six_lock_wakeup(struct six_lock *lock, union six_lock_state state, - unsigned waitlist_id) + enum six_lock_type lock_type) { - if (waitlist_id == SIX_LOCK_write) { - if (state.write_locking && !state.read_lock) { - struct task_struct *p = READ_ONCE(lock->owner); - if (p) - wake_up_process(p); - } - } else { - struct list_head *wait_list = &lock->wait_list[waitlist_id]; - struct six_lock_waiter *w, *next; - - if (!(state.waiters & (1 << waitlist_id))) - return; + struct six_lock_waiter *w; + bool found = false; - clear_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); + if (lock_type == SIX_LOCK_write && state.read_lock) + return; - raw_spin_lock(&lock->wait_lock); + if (!(state.waiters & (1 << lock_type))) + return; - list_for_each_entry_safe(w, next, wait_list, list) { - list_del_init(&w->list); + raw_spin_lock(&lock->wait_lock); - if (wake_up_process(w->task) && - waitlist_id != SIX_LOCK_read) { - if (!list_empty(wait_list)) - set_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - break; - } - } + list_for_each_entry(w, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; - raw_spin_unlock(&lock->wait_lock); + found = true; + wake_up_process(w->task); + if (lock_type != SIX_LOCK_read) + break; } + + if (!found) + clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); + + raw_spin_unlock(&lock->wait_lock); } static __always_inline bool do_six_trylock_type(struct six_lock *lock, @@ -146,7 +134,6 @@ static __always_inline bool do_six_trylock_type(struct six_lock *lock, EBUG_ON(type == SIX_LOCK_write && lock->owner != current); EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); - EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); /* @@ -182,12 +169,8 @@ retry: * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - } + if (old.write_locking) + six_lock_wakeup(lock, old, SIX_LOCK_write); /* * If we failed from the lock path and the waiting bit wasn't @@ -228,6 +211,9 @@ retry: if (ret || try) v -= __SIX_VAL(write_locking, 1); + if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write))) + v += __SIX_VAL(waiters, 1 << SIX_LOCK_write); + if (try && !ret) { old.v = atomic64_add_return(v, &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); @@ -244,8 +230,7 @@ retry: if (type == SIX_LOCK_write) new.write_locking = 0; - } else if (!try && type != SIX_LOCK_write && - !(new.waiters & (1 << type))) + } else if (!try && !(new.waiters & (1 << type))) new.waiters |= 1 << type; else break; /* waiting bit already set */ @@ -305,12 +290,8 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, * Similar to the lock 
path, we may have caused a spurious write * lock fail and need to issue a wakeup: */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - } + if (old.write_locking) + six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) six_acquire(&lock->dep_map, 1); @@ -479,19 +460,17 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty lock_contended(&lock->dep_map, _RET_IP_); - INIT_LIST_HEAD(&wait.list); - wait.task = current; + wait.task = current; + wait.lock_want = type; + + raw_spin_lock(&lock->wait_lock); + if (!(lock->state.waiters & (1 << type))) + set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); + list_add_tail(&wait.list, &lock->wait_list); + raw_spin_unlock(&lock->wait_lock); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (type == SIX_LOCK_write) - EBUG_ON(lock->owner != current); - else if (list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_add_tail(&wait.list, &lock->wait_list[type]); - raw_spin_unlock(&lock->wait_lock); - } - if (do_six_trylock_type(lock, type, false)) break; @@ -504,11 +483,9 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty __set_current_state(TASK_RUNNING); - if (!list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_del_init(&wait.list); - raw_spin_unlock(&lock->wait_lock); - } + raw_spin_lock(&lock->wait_lock); + list_del(&wait.list); + raw_spin_unlock(&lock->wait_lock); out_before_sleep: if (ret && type == SIX_LOCK_write) { old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), @@ -702,12 +679,8 @@ void six_lock_wakeup_all(struct six_lock *lock) struct six_lock_waiter *w; raw_spin_lock(&lock->wait_lock); - - list_for_each_entry(w, &lock->wait_list[0], list) + list_for_each_entry(w, &lock->wait_list, list) wake_up_process(w->task); - list_for_each_entry(w, &lock->wait_list[1], list) - wake_up_process(w->task); - raw_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 6c9ac82d146d..0e55845195d9 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -116,12 +116,18 @@ struct six_lock { unsigned __percpu *readers; raw_spinlock_t wait_lock; - struct list_head wait_list[2]; + struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; +}; + typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); static __always_inline void __six_lock_init(struct six_lock *lock, @@ -130,8 +136,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock, { atomic64_set(&lock->state.counter, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); + INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *) lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); -- cgit From 0bfb9f42b7b16aa11a7b5d283b0b7b98d11476b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Aug 2022 16:22:51 -0400 Subject: six locks: six_lock_waiter() This allows passing in the wait list entry - to be used for a deadlock cycle detector. 
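A sketch of the intended calling convention (hypothetical caller code, assuming the six.h declarations added in this patch): instead of relying on the on-stack entry hidden inside six_lock_intent(), the caller supplies a wait entry it owns - typically embedded in a longer-lived structure - so that while it sleeps, other threads walking the lock's wait list can see who is waiting and for what.

/* Hypothetical caller - not from the bcachefs tree. */
struct my_lock_ctx {
	struct six_lock_waiter	wait;	/* caller-owned, embedded wait entry */
	/* ... whatever else the caller tracks about its locking state ... */
};

static int my_lock_node_intent(struct six_lock *lock, struct my_lock_ctx *ctx,
			       six_lock_should_sleep_fn should_sleep_fn, void *p)
{
	/*
	 * ctx->wait stays valid for as long as we block, so a cycle detector
	 * (added later in this series) can inspect it while we sleep.
	 */
	return six_lock_waiter_intent(lock, &ctx->wait, should_sleep_fn, p);
}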
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 36 +++++++++++++++++++++++++++--------- fs/bcachefs/six.h | 9 +++++++++ 2 files changed, 36 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index d5e09fae1538..82e8d77c3082 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -439,10 +439,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) { union six_lock_state old; - struct six_lock_waiter wait; int ret = 0; if (type == SIX_LOCK_write) { @@ -460,13 +460,13 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty lock_contended(&lock->dep_map, _RET_IP_); - wait.task = current; - wait.lock_want = type; + wait->task = current; + wait->lock_want = type; raw_spin_lock(&lock->wait_lock); if (!(lock->state.waiters & (1 << type))) set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); - list_add_tail(&wait.list, &lock->wait_list); + list_add_tail(&wait->list, &lock->wait_list); raw_spin_unlock(&lock->wait_lock); while (1) { @@ -484,7 +484,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty __set_current_state(TASK_RUNNING); raw_spin_lock(&lock->wait_lock); - list_del(&wait.list); + list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); out_before_sleep: if (ret && type == SIX_LOCK_write) { @@ -496,9 +496,10 @@ out_before_sleep: return ret; } -__always_inline -static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) +__always_inline __flatten +static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) { int ret; @@ -506,7 +507,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, six_acquire(&lock->dep_map, 0); ret = do_six_trylock_type(lock, type, true) ? 
0 - : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); + : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map); @@ -516,6 +517,15 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, return ret; } +__always_inline +static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); +} + __always_inline __flatten static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) { @@ -574,6 +584,14 @@ int six_lock_##type(struct six_lock *lock, \ } \ EXPORT_SYMBOL_GPL(six_lock_##type); \ \ +int six_lock_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p)\ +{ \ + return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ +} \ +EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ + \ void six_unlock_##type(struct six_lock *lock) \ { \ __six_unlock_type(lock, SIX_LOCK_##type); \ diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 0e55845195d9..ab06773e8094 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -156,6 +156,8 @@ do { \ bool six_trylock_##type(struct six_lock *); \ bool six_relock_##type(struct six_lock *, u32); \ int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ +int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ + six_lock_should_sleep_fn, void *); \ void six_unlock_##type(struct six_lock *); __SIX_LOCK(read) @@ -192,6 +194,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); } +static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); +} + static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { SIX_LOCK_DISPATCH(type, six_unlock, lock); -- cgit From f6ea2d575d70ab0e1aaa9f9fced1d04e6dd6ef4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Sep 2022 01:33:13 -0400 Subject: six locks: Add start_time to six_lock_waiter This is needed by the cycle detector in bcachefs - we need a way to iterater over waitlist entries while dropping and retaking the waitlist lock. 
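Why a timestamp rather than a pointer: if every waiter's start_time is strictly greater than that of the entry queued before it, an iterator can remember the last timestamp it processed, drop the wait_lock, and later resume by skipping everything at or before that timestamp, regardless of how the list changed in between. A minimal userspace illustration of that resumable walk (invented names, not the six-lock code):

#include <stdint.h>
#include <stdio.h>

struct waiter {
	struct waiter	*next;
	uint64_t	start_time;	/* strictly increasing along the list */
	const char	*name;
};

/* Visit the first entry newer than *last_seen, updating *last_seen. */
static const struct waiter *next_newer(const struct waiter *head,
				       uint64_t *last_seen)
{
	for (const struct waiter *w = head; w; w = w->next)
		if (w->start_time > *last_seen) {
			*last_seen = w->start_time;
			return w;
		}
	return NULL;
}

int main(void)
{
	struct waiter c = { NULL, 30, "c" };
	struct waiter b = { &c,   20, "b" };
	struct waiter a = { &b,   10, "a" };
	uint64_t last = 0;
	const struct waiter *w;

	/* Pretend the list lock is dropped and retaken between calls: */
	while ((w = next_newer(&a, &last)))
		printf("visited %s at %llu\n", w->name,
		       (unsigned long long)w->start_time);
	return 0;
}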
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 14 ++++++++++++++ fs/bcachefs/six.h | 1 + 2 files changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 82e8d77c3082..e2cebd3ba5fe 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -466,6 +467,17 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty raw_spin_lock(&lock->wait_lock); if (!(lock->state.waiters & (1 << type))) set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); + wait->start_time = local_clock(); + + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + list_add_tail(&wait->list, &lock->wait_list); raw_spin_unlock(&lock->wait_lock); @@ -503,6 +515,8 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type { int ret; + wait->start_time = 0; + if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 0); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index ab06773e8094..757f8aa4d339 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -126,6 +126,7 @@ struct six_lock_waiter { struct list_head list; struct task_struct *task; enum six_lock_type lock_want; + u64 start_time; }; typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -- cgit From 5b254da5733d9b8c6a13073fecc506c2861aaeb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Sep 2022 00:13:56 -0400 Subject: six locks: Enable lockdep Now that we have lockdep_set_no_check_recursion(), we can enable lockdep checking. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index e2cebd3ba5fe..01ff210ff18c 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -18,7 +18,7 @@ #define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) +#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) #define six_release(l) lock_release(l, _RET_IP_) struct six_lock_vals { @@ -258,7 +258,7 @@ static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } @@ -295,7 +295,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return ret; } @@ -312,7 +312,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_set_owner(lock, type, old); if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } @@ -518,7 +518,7 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type wait->start_time = 0; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); ret = do_six_trylock_type(lock, type, true) ? 
0 : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); @@ -681,7 +681,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); /* XXX: assert already locked, and that we don't overflow: */ -- cgit From e4b7254c754b676a6f4d607fd92cd71d221ff130 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Oct 2022 00:34:38 -0400 Subject: six locks: Fix a lost wakeup There was a lost wakeup between a read unlock in percpu mode and a write lock. The unlock path unlocks, then executes a barrier, then checks for waiters; correspondingly, the lock side should set the wait bit and execute a barrier, then attempt to take the lock. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 01ff210ff18c..abdc2414f58b 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -198,6 +198,14 @@ retry: atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); smp_mb__after_atomic(); + } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { + atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), + &lock->state.counter); + /* + * pairs with barrier after unlock and before checking + * for readers in unlock path + */ + smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); @@ -212,9 +220,6 @@ retry: if (ret || try) v -= __SIX_VAL(write_locking, 1); - if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write))) - v += __SIX_VAL(waiters, 1 << SIX_LOCK_write); - if (try && !ret) { old.v = atomic64_add_return(v, &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); -- cgit From 84a37cbf62e04480607ddd1940e3d8ce65b3828d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Aug 2022 19:22:24 -0400 Subject: six locks: Wakeup now takes lock on behalf of waiter This brings back an important optimization, to avoid touching the wait lists an extra time, while preserving the property that a thread is on a lock waitlist iff it is waiting - it is never removed from the waitlist until it has the lock. 
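The shape of the handoff is easy to show with ordinary pthreads (a rough userspace analogy, not the kernel implementation): the waker takes the contended lock on the sleeping thread's behalf, marks that thread's wait entry as lock_acquired, and only then wakes it, so the woken thread never re-enters a trylock loop or touches the wait list again.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct waiter {
	pthread_mutex_t	lock;
	pthread_cond_t	wake;
	bool		lock_acquired;	/* only the waker writes this */
};

static struct waiter w = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
};

static void *waker(void *arg)
{
	pthread_mutex_lock(&w.lock);
	/* "take" the contended lock for the waiter here, then: */
	w.lock_acquired = true;
	pthread_cond_signal(&w.wake);
	pthread_mutex_unlock(&w.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);

	pthread_mutex_lock(&w.lock);
	while (!w.lock_acquired)		/* no trylock retry loop needed */
		pthread_cond_wait(&w.wake, &w.lock);
	pthread_mutex_unlock(&w.lock);

	printf("lock handed off to waiter\n");
	pthread_join(t, NULL);
	return 0;
}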
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 263 ++++++++++++++++++++++++++++++++---------------------- fs/bcachefs/six.h | 4 +- 2 files changed, 160 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index abdc2414f58b..3f9d4ff2edf4 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -21,6 +21,8 @@ #define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) #define six_release(l) lock_release(l, _RET_IP_) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ u64 lock_val; @@ -67,14 +69,15 @@ struct six_lock_vals { } static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - union six_lock_state old) + union six_lock_state old, + struct task_struct *owner) { if (type != SIX_LOCK_intent) return; if (!old.intent_lock) { EBUG_ON(lock->owner); - lock->owner = current; + lock->owner = owner; } else { EBUG_ON(lock->owner != current); } @@ -93,47 +96,17 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) /* This is probably up there with the more evil things I've done */ #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - enum six_lock_type lock_type) -{ - struct six_lock_waiter *w; - bool found = false; - - if (lock_type == SIX_LOCK_write && state.read_lock) - return; - - if (!(state.waiters & (1 << lock_type))) - return; - - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry(w, &lock->wait_list, list) { - if (w->lock_want != lock_type) - continue; - - found = true; - wake_up_process(w->task); - if (lock_type != SIX_LOCK_read) - break; - } - - if (!found) - clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); - - raw_spin_unlock(&lock->wait_lock); -} - -static __always_inline bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - bool try) +static int __do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + struct task_struct *task, + bool try) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; - bool ret; + int ret; u64 v; - EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); @@ -153,7 +126,6 @@ static __always_inline bool do_six_trylock_type(struct six_lock *lock, */ if (type == SIX_LOCK_read && lock->readers) { -retry: preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ @@ -171,28 +143,7 @@ retry: * spurious trylock failure: */ if (old.write_locking) - six_lock_wakeup(lock, old, SIX_LOCK_write); - - /* - * If we failed from the lock path and the waiting bit wasn't - * set, set it: - */ - if (!try && !ret) { - v = old.v; - - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - goto retry; - - if (new.waiters & (1 << type)) - break; - - new.waiters |= 1 << type; - } while ((v = atomic64_cmpxchg(&lock->state.counter, - old.v, new.v)) != old.v); - } + ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), @@ -222,7 +173,8 @@ retry: if (try && !ret) { old.v = atomic64_add_return(v, &lock->state.counter); - six_lock_wakeup(lock, old, SIX_LOCK_read); + if 
(old.waiters & (1 << SIX_LOCK_read)) + ret = -1 - SIX_LOCK_read; } else { atomic64_add(v, &lock->state.counter); } @@ -248,14 +200,84 @@ retry: EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); } - if (ret) - six_set_owner(lock, type, old); + if (ret > 0) + six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); return ret; } +static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock_type(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + __list_del(w->list.prev, w->list.next); + task = w->task; + /* + * Do no writes to @w besides setting lock_acquired - otherwise + * we would need a memory barrier: + */ + barrier(); + w->lock_acquired = true; + wake_up_process(task); + } + + clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && state.read_lock) + return; + + if (!(state.waiters & (1 << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +static bool do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + bool try) +{ + int ret; + + ret = __do_six_trylock_type(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + __always_inline __flatten static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) { @@ -315,7 +337,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, old.v, old.v + l[type].lock_val)) != old.v); - six_set_owner(lock, type, old); + six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; @@ -457,54 +479,73 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty smp_mb__after_atomic(); } - ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; - if (ret) - goto out_before_sleep; - if (six_optimistic_spin(lock, type)) - goto out_before_sleep; + goto out; lock_contended(&lock->dep_map, _RET_IP_); wait->task = current; wait->lock_want = type; + wait->lock_acquired = false; raw_spin_lock(&lock->wait_lock); if (!(lock->state.waiters & (1 << type))) set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); - wait->start_time = local_clock(); + /* + * Retry taking the lock after taking waitlist lock, have raced with an + * unlock: + */ + ret = __do_six_trylock_type(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); - if (!list_empty(&lock->wait_list)) { - struct six_lock_waiter *last = - list_last_entry(&lock->wait_list, - struct six_lock_waiter, list); + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); - if (time_before_eq64(wait->start_time, last->start_time)) - wait->start_time = last->start_time + 1; - } + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } - list_add_tail(&wait->list, &lock->wait_list); + list_add_tail(&wait->list, &lock->wait_list); + } raw_spin_unlock(&lock->wait_lock); + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (do_six_trylock_type(lock, type, false)) + + if (wait->lock_acquired) break; ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (ret) + if (unlikely(ret)) { + raw_spin_lock(&lock->wait_lock); + if (!wait->lock_acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (wait->lock_acquired) + do_six_unlock_type(lock, type); break; + } schedule(); } __set_current_state(TASK_RUNNING); - - raw_spin_lock(&lock->wait_lock); - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); -out_before_sleep: - if (ret && type == SIX_LOCK_write) { +out: + if (ret && type == SIX_LOCK_write && lock->state.write_locking) { old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); @@ -546,27 +587,13 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, } __always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state state; - EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map); - - if (type == SIX_LOCK_intent) { - EBUG_ON(lock->owner != current); - - if (lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - + if (type == SIX_LOCK_intent) lock->owner = NULL; - } if (type == SIX_LOCK_read && lock->readers) { @@ -583,6 +610,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) six_lock_wakeup(lock, state, l[type].unlock_wakeup); } +__always_inline __flatten +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + EBUG_ON(type == SIX_LOCK_write && + !(lock->state.v & __SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map); + + if (type == SIX_LOCK_intent && + 
lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} + #define __SIX_LOCK(type) \ bool six_trylock_##type(struct six_lock *lock) \ { \ @@ -654,7 +702,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) if (lock->readers) this_cpu_dec(*lock->readers); - six_set_owner(lock, SIX_LOCK_intent, old); + six_set_owner(lock, SIX_LOCK_intent, old, current); return true; } @@ -713,8 +761,13 @@ EXPORT_SYMBOL_GPL(six_lock_increment); void six_lock_wakeup_all(struct six_lock *lock) { + union six_lock_state state = lock->state; struct six_lock_waiter *w; + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); + raw_spin_lock(&lock->wait_lock); list_for_each_entry(w, &lock->wait_list, list) wake_up_process(w->task); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 757f8aa4d339..9ebbf8095573 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -110,11 +110,10 @@ struct six_lock { union six_lock_state state; unsigned intent_lock_recurse; struct task_struct *owner; + unsigned __percpu *readers; #ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER struct optimistic_spin_queue osq; #endif - unsigned __percpu *readers; - raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -126,6 +125,7 @@ struct six_lock_waiter { struct list_head list; struct task_struct *task; enum six_lock_type lock_want; + bool lock_acquired; u64 start_time; }; -- cgit From 845cffed0d343ecea9f6ff3883cac9a6872d9920 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Sep 2022 14:14:01 -0400 Subject: bcachefs: Add a debug assert Chasing down a strange locking bug. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1650ba87ef03..df9949cef907 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1110,6 +1110,9 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->level = btree_path_up_until_good_node(trans, path, 0); + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + /* * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, -- cgit From 62448afee714354a26db8a0f3c644f58628f0792 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Aug 2022 13:06:44 -0400 Subject: bcachefs: Fix bch2_btree_node_upgrade() Previously, if we were trying to upgrade from a read to an intent lock but we held an additional read lock via another btree_path, bch2_btree_node_upgrade() would always fail, in six_lock_tryupgrade(). This patch factors out the code that __bch2_btree_node_lock_write() uses to temporarily drop extra read locks, so that six_lock_tryupgrade() can succeed. 
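The fix is easier to see with a toy model (illustrative only; the real calls are six_lock_readers_add() and six_lock_tryupgrade() in the diff below): an upgrade can only succeed while the shared reader count is zero, so a transaction that itself holds extra read references through other paths subtracts its own contribution for the duration of the attempt and adds it back afterwards.

#include <stdbool.h>
#include <stdio.h>

struct mini_lock {
	int	readers;	/* shared readers, all holders combined */
	bool	intent_held;
};

static bool try_upgrade(struct mini_lock *l)
{
	if (l->readers || l->intent_held)
		return false;
	l->intent_held = true;
	return true;
}

/* our_readers: read references this transaction holds via other paths */
static bool upgrade_dropping_own_readers(struct mini_lock *l, int our_readers)
{
	bool ret;

	l->readers -= our_readers;	/* temporarily hide our own read locks */
	ret = try_upgrade(l);
	l->readers += our_readers;	/* restore the count either way */
	return ret;
}

int main(void)
{
	struct mini_lock l = { .readers = 2, .intent_held = false };

	printf("plain upgrade: %d\n", try_upgrade(&l));		/* fails */
	printf("upgrade after hiding our 2 readers: %d\n",
	       upgrade_dropping_own_readers(&l, 2));		/* succeeds */
	return 0;
}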
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 08dbc799bb35..5b6d8184ea45 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -230,6 +230,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; + struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); if (!is_btree_node(path, level)) return false; @@ -253,11 +254,24 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, if (race_fault()) return false; - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; + if (btree_node_locked(path, level)) { + bool ret; + + six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); + ret = six_lock_tryupgrade(&b->c.lock); + six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); + + if (ret) + goto success; + } else { + if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + } + /* + * Do we already have an intent lock via another path? If so, just bump + * lock count: + */ if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); -- cgit From 33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 13:23:47 -0400 Subject: bcachefs: Deadlock cycle detector We've outgrown our own deadlock avoidance strategy. The btree iterator API provides an interface where the user doesn't need to concern themselves with lock ordering - different btree iterators can be traversed in any order. Without special care, this will lead to deadlocks. Our previous strategy was to define a lock ordering internally, and whenever we attempt to take a lock and trylock() fails, we'd check if the current btree transaction is holding any locks that cause a lock ordering violation. If so, we'd issue a transaction restart, and then bch2_trans_begin() would re-traverse all previously used iterators, but in the correct order. That approach had some issues, though. - Sometimes we'd issue transaction restarts unnecessarily, when no deadlock would have actually occured. Lock ordering restarts have become our primary cause of transaction restarts, on some workloads totally 20% of actual transaction commits. - To avoid deadlock or livelock, we'd often have to take intent locks when we only wanted a read lock: with the lock ordering approach, it is actually illegal to hold _any_ read lock while blocking on an intent lock, and this has been causing us unnecessary lock contention. - It was getting fragile - the various lock ordering rules are not trivial, and we'd been seeing occasional livelock issues related to this machinery. So, since bcachefs is already a relational database masquerading as a filesystem, we're stealing the next traditional database technique and switching to a cycle detector for avoiding deadlocks. When we block taking a btree lock, after adding ourself to the waitlist but before sleeping, we do a DFS of btree transactions waiting on other btree transactions, starting with the current transaction and walking our held locks, and transactions blocking on our held locks. 
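In outline (a generic sketch of the technique, not the bcachefs code - the real walk is lock_graph_descend()/check_for_deadlock() in the btree_locking.c diff below), this is cycle detection in a wait-for graph: an edge A -> B means transaction A is blocked on a lock held by transaction B, and revisiting a node already on the current DFS path means a deadlock.

#include <stdbool.h>
#include <stdio.h>

#define NR_TRANS 4

/* waits_on[a][b]: transaction a is blocked on a lock held by transaction b */
static bool waits_on[NR_TRANS][NR_TRANS];

static bool dfs(int t, bool on_path[NR_TRANS])
{
	if (on_path[t])
		return true;		/* found a cycle */

	on_path[t] = true;
	for (int o = 0; o < NR_TRANS; o++)
		if (waits_on[t][o] && dfs(o, on_path))
			return true;
	on_path[t] = false;
	return false;
}

int main(void)
{
	bool on_path[NR_TRANS] = { false };

	waits_on[0][1] = true;		/* 0 waits on 1 */
	waits_on[1][2] = true;		/* 1 waits on 2 */
	waits_on[2][0] = true;		/* 2 waits on 0: cycle */

	printf("deadlock cycle: %s\n", dfs(0, on_path) ? "yes" : "no");
	return 0;
}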
If we find a cycle, we emit a transaction restart. Occasionally (e.g. the btree split path) we can not allow the lock() operation to fail, so if necessary we'll tell another transaction that it has to fail. Result: trans_restart_would_deadlock events are reduced by a factor of 10 to 100, and we'll be able to delete a whole bunch of grotty, fragile code. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_iter.c | 11 +- fs/bcachefs/btree_iter.h | 7 +- fs/bcachefs/btree_locking.c | 246 +++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/btree_locking.h | 65 ++++++++--- fs/bcachefs/btree_types.h | 10 +- fs/bcachefs/debug.c | 6 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/trace.h | 6 ++ 9 files changed, 322 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0e80fe2568f2..5471b797be93 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1400,7 +1400,8 @@ struct bch_sb_field_disk_groups { x(trans_restart_key_cache_upgrade, 70) \ x(trans_traverse_all, 71) \ x(transaction_commit, 72) \ - x(write_super, 73) + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index df9949cef907..5773b00e69ac 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2848,9 +2848,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * trans->fn = fn; trans->last_begin_time = ktime_get_ns(); trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); - trans->task = current; + trans->locking_wait.task = current; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + closure_init_stack(&trans->ref); bch2_trans_alloc_paths(trans, c); @@ -2877,7 +2878,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * mutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { - if (trans->task->pid < pos->task->pid) { + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } @@ -2919,6 +2920,8 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); + closure_sync(&trans->ref); + if (s) s->max_mem = max(s->max_mem, trans->mem_max); @@ -2997,7 +3000,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; - prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); trans_for_each_path(trans, path) { if (!path->nodes_locked) @@ -3029,7 +3032,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) trans->locking_path_idx, path->cached ? 
'c' : 'b', trans->locking_level, - lock_types[trans->locking_lock_type], + lock_types[trans->locking_wait.lock_want], bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2f47889c688a..04b6773d6e10 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -81,11 +81,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) return &trans->paths[idx]; } -#define trans_for_each_path(_trans, _path) \ - for (_path = __trans_next_path((_trans), 0); \ +#define trans_for_each_path_from(_trans, _path, _start) \ + for (_path = __trans_next_path((_trans), _start); \ (_path); \ _path = __trans_next_path((_trans), (_path)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + trans_for_each_path_from(_trans, _path, 0) + static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { unsigned idx = path ? path->sorted_idx + 1 : 0; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 5b6d8184ea45..869f4163a3c6 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, /* lock */ -void __bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_bkey_cached_common *b) +/* + * @trans wants to lock @b with type @type + */ +struct trans_waiting_for_lock { + struct btree_trans *trans; + struct btree_bkey_cached_common *node_want; + enum six_lock_type lock_want; + + /* for iterating over held locks :*/ + u8 path_idx; + u8 level; + u64 lock_start_time; +}; + +struct lock_graph { + struct trans_waiting_for_lock g[8]; + unsigned nr; +}; + +static void lock_graph_pop(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + int ret; + + if (i == g->g) { + ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + } else { + i->trans->lock_must_abort = true; + ret = 0; + } + + for (i = g->g + 1; i < g->g + g->nr; i++) + wake_up_process(i->trans->locking_wait.task); + return ret; +} + +static noinline int break_cycle(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail || + i->trans->locking_wait.lock_want == SIX_LOCK_write) + continue; + + return abort_lock(g, i); + } + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail || + !i->trans->in_traverse_all) + continue; + + return abort_lock(g, i); + } + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail) + continue; + + return abort_lock(g, i); + } + + BUG(); +} + +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans) +{ + struct btree_trans *orig_trans = g->g->trans; + struct trans_waiting_for_lock *i; + int ret = 0; + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->locking != i->node_want) + while (g->g + g->nr >= i) { + lock_graph_pop(g); + return 0; + } + + if (i->trans == trans) { + ret = break_cycle(g); + if (ret) + goto deadlock; + /* + * If we didn't abort (instead telling another + * transaction to abort), keep checking: + */ + } + } + + if (g->nr == ARRAY_SIZE(g->g)) { + if (orig_trans->lock_may_not_fail) + return 0; + + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); + ret = btree_trans_restart(orig_trans, 
BCH_ERR_transaction_restart_deadlock_recursion_limit); + goto deadlock; + } + + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; + + return 0; +deadlock: + while (g->nr) + lock_graph_pop(g); + return ret; +} + +#if 0 +static void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_str(out, "Found lock cycle:"); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} +#endif + +static noinline void lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr >= i) + lock_graph_pop(g); + return; + } + BUG(); +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) +{ + return t1 + t2 > 1; +} + +static int check_for_deadlock(struct btree_trans *trans) +{ + struct lock_graph g; + struct trans_waiting_for_lock *top; + struct btree_bkey_cached_common *b; + struct btree_path *path; + int ret; + + if (trans->lock_must_abort) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + + g.nr = 0; + ret = lock_graph_descend(&g, trans); + BUG_ON(ret); +next: + if (!g.nr) + return 0; + + top = &g.g[g.nr - 1]; + + trans_for_each_path_from(top->trans, path, top->path_idx) { + if (!path->nodes_locked) + continue; + + if (top->path_idx != path->idx) { + top->path_idx = path->idx; + top->level = 0; + top->lock_start_time = 0; + } + + for (; + top->level < BTREE_MAX_DEPTH; + top->level++, top->lock_start_time = 0) { + int lock_held = btree_node_locked_type(path, top->level); + + if (lock_held == BTREE_NODE_UNLOCKED) + continue; + + b = &READ_ONCE(path->l[top->level].b)->c; + + if (unlikely(IS_ERR_OR_NULL(b))) { + lock_graph_remove_non_waiters(&g); + goto next; + } + + if (list_empty_careful(&b->lock.wait_list)) + continue; + + raw_spin_lock(&b->lock.wait_lock); + list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { + BUG_ON(b != trans->locking); + + if (top->lock_start_time && + time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) + continue; + + top->lock_start_time = trans->locking_wait.start_time; + + /* Don't check for self deadlock: */ + if (trans == top->trans || + !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) + continue; + + ret = lock_graph_descend(&g, trans); + raw_spin_unlock(&b->lock.wait_lock); + + if (ret) + return ret < 0 ? 
ret : 0; + goto next; + + } + raw_spin_unlock(&b->lock.wait_lock); + } + } + + lock_graph_pop(&g); + goto next; +} + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) +{ + struct btree_trans *trans = p; + + return check_for_deadlock(trans); +} + +int __bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) { int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + int ret; /* * Must drop our read locks before calling six_lock_write() - @@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, * locked: */ six_lock_readers_add(&b->lock, -readers); - btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); six_lock_readers_add(&b->lock, readers); + + return ret; } static inline bool path_has_read_locks(struct btree_path *path) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index aea2ebafffd8..874dd4428b3a 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat void bch2_btree_node_unlock_write(struct btree_trans *, struct btree_path *, struct btree *); +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); + /* lock: */ +static inline int __btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + bool lock_may_not_fail) +{ + int ret; + + trans->lock_may_not_fail = lock_may_not_fail; + trans->locking = b; + trans->lock_must_abort = false; + + ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans); + WRITE_ONCE(trans->locking, NULL); + WRITE_ONCE(trans->locking_wait.start_time, 0); + return ret; +} + static inline int __must_check btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - six_lock_type(&b->lock, type, NULL, NULL); - return 0; + return __btree_node_lock_nopath(trans, b, type, false); } static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - int ret = btree_node_lock_nopath(trans, b, type); + int ret = __btree_node_lock_nopath(trans, b, type, true); BUG_ON(ret); } @@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { - int ret; - if (six_trylock_type(&b->lock, type)) return 0; @@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans, trans->locking_pos = pos; trans->locking_btree_id = path->btree_id; trans->locking_level = level; - trans->locking_lock_type = type; + trans->lock_may_not_fail = false; trans->locking = b; - ret = six_lock_type(&b->lock, type, should_sleep_fn, p); - trans->locking = NULL; - return ret; + return six_lock_type_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans); } /* @@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans, return ret; } -void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *); +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool); -static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct 
btree_bkey_cached_common *b) +static inline int __btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) { + int ret; + EBUG_ON(&path->l[b->level].b->c != b); EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); EBUG_ON(!btree_node_intent_locked(path, b->level)); @@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, */ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); - if (unlikely(!six_trylock_write(&b->lock))) - __bch2_btree_node_lock_write(trans, b); + ret = likely(six_trylock_write(&b->lock)) + ? 0 + : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail); + if (ret) + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); + + return ret; +} + +static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + int ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); } static inline int __must_check @@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - bch2_btree_node_lock_write_nofail(trans, path, b); - return 0; + return __btree_node_lock_write(trans, path, b, false); } /* relock: */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 0a3854b614e0..578cf8fa3d2f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -387,15 +387,19 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; const char *fn; + struct closure ref; struct list_head list; u64 last_begin_time; - struct btree_bkey_cached_common *locking; + unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; - u8 locking_lock_type; - struct task_struct *task; + u8 lock_may_not_fail; + u8 lock_must_abort; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; u8 fn_idx; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4fe20d36212e..6944dfef5bcb 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->task->pid <= i->iter) + if (trans->locking_wait.task->pid <= i->iter) continue; ret = flush_buf(i); @@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - prt_backtrace(&i->buf, trans->task); + prt_backtrace(&i->buf, trans->locking_wait.task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->task->pid; + i->iter = trans->locking_wait.task->pid; } mutex_unlock(&c->btree_trans_lock); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3dc477eb3600..1ea004f1adbb 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -35,6 +35,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ 
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 62de89fcb74b..35c40678f4b5 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1065,6 +1065,12 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot) ); +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(trans_restart_would_deadlock_write, TP_PROTO(struct btree_trans *trans), TP_ARGS(trans), -- cgit From 96d994b37cfcf468bf1d71527ae95ad93a311e38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 23:12:11 -0400 Subject: bcachefs: Print deadlock cycle in debugfs In the event that we're not finished debugging the cycle detector, this adds a new file to debugfs that shows what the cycle detector finds, if anything. By comparing this with btree_transactions, which shows held locks for every btree_transaction, we'll be able to determine if it's the cycle detector that's buggy or something else. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 -- fs/bcachefs/btree_locking.c | 44 +++++++++++++++++++++++++------------------- fs/bcachefs/btree_locking.h | 1 + fs/bcachefs/debug.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5773b00e69ac..ece80d7914b2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2992,7 +2992,6 @@ bch2_btree_path_node_to_text(struct printbuf *out, c.n[0], c.n[1], c.n[2], pid); } -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { struct btree_path *path; @@ -3041,7 +3040,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "\n"); } } -#endif void bch2_fs_btree_iter_exit(struct bch_fs *c) { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 869f4163a3c6..e270579d3622 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -76,6 +76,17 @@ static void lock_graph_pop(struct lock_graph *g) closure_put(&g->g[--g->nr].trans->ref); } +static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_printf(out, "Found lock cycle (%u entries):", g->nr); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { int ret; @@ -122,7 +133,8 @@ static noinline int break_cycle(struct lock_graph *g) BUG(); } -static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans) +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; struct trans_waiting_for_lock *i; @@ -136,7 +148,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans) } if (i->trans == trans) { - ret = break_cycle(g); + if (cycle) { + /* Only checking: */ + print_cycle(cycle, g); + ret = -1; + } else { + ret = break_cycle(g); + } + if (ret) goto deadlock; /* @@ -170,19 +189,6 @@ deadlock: return ret; } -#if 0 -static void print_cycle(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - prt_str(out, "Found lock cycle:"); - 
prt_newline(out); - - for (i = g->g; i < g->g + g->nr; i++) - bch2_btree_trans_to_text(out, i->trans); -} -#endif - static noinline void lock_graph_remove_non_waiters(struct lock_graph *g) { struct trans_waiting_for_lock *i; @@ -202,7 +208,7 @@ static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) return t1 + t2 > 1; } -static int check_for_deadlock(struct btree_trans *trans) +int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) { struct lock_graph g; struct trans_waiting_for_lock *top; @@ -214,7 +220,7 @@ static int check_for_deadlock(struct btree_trans *trans) return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); g.nr = 0; - ret = lock_graph_descend(&g, trans); + ret = lock_graph_descend(&g, trans, cycle); BUG_ON(ret); next: if (!g.nr) @@ -265,7 +271,7 @@ next: !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) continue; - ret = lock_graph_descend(&g, trans); + ret = lock_graph_descend(&g, trans, cycle); raw_spin_unlock(&b->lock.wait_lock); if (ret) @@ -285,7 +291,7 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) { struct btree_trans *trans = p; - return check_for_deadlock(trans); + return bch2_check_for_deadlock(trans, NULL); } int __bch2_btree_node_lock_write(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 874dd4428b3a..86f68b26cc94 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -426,6 +426,7 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, struct btree_bkey_cached_common *b, unsigned); +int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_path_verify_locks(struct btree_path *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 6944dfef5bcb..41b2772afef9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -11,6 +11,7 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_locking.h" #include "btree_update.h" #include "buckets.h" #include "debug.h" @@ -708,6 +709,45 @@ static const struct file_operations lock_held_stats_op = { .read = lock_held_stats_read, }; +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (i->iter) + goto out; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) + if (bch2_check_for_deadlock(trans, &i->buf)) { + i->iter = 1; + break; + } + mutex_unlock(&c->btree_trans_lock); +out: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_deadlock_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_deadlock_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -741,6 +781,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &lock_held_stats_op); + debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; -- cgit From 
0d7009d7ca99ad9261a7cffcecd515108377a6ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 15:29:53 -0400 Subject: bcachefs: Delete old deadlock avoidance code This deletes our old lock ordering based deadlock avoidance code. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 37 ++++----------- fs/bcachefs/btree_iter.c | 42 ++--------------- fs/bcachefs/btree_key_cache.c | 27 +++-------- fs/bcachefs/btree_locking.c | 100 ++++------------------------------------ fs/bcachefs/btree_locking.h | 48 +++---------------- fs/bcachefs/btree_types.h | 4 -- fs/bcachefs/btree_update_leaf.c | 39 ++-------------- fs/bcachefs/errcode.h | 1 - fs/bcachefs/trace.h | 53 ++------------------- 9 files changed, 40 insertions(+), 311 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0e9e14e3fa5..aeb058c800cd 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -151,8 +151,6 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; - - six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -755,16 +753,6 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return b; } -static int lock_node_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - const struct bkey_i *k = p; - - if (b->hash_val != btree_ptr_hash_val(k)) - return BCH_ERR_lock_fail_node_reused; - return 0; -} - static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; @@ -886,15 +874,11 @@ lock_node: if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type, - lock_node_check_fn, (void *) k, trace_ip); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - BUG(); - } + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || @@ -1000,13 +984,10 @@ retry: } else { lock_node: ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - BUG(); - } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ece80d7914b2..6f4af13cf9e4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -689,16 +689,6 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) /* Btree path: traverse, set_pos: */ -static int lock_root_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - struct btree **rootp = p; - - if (b != *rootp) - return BCH_ERR_lock_fail_root_changed; - return 0; -} - static inline int btree_path_lock_root(struct btree_trans *trans, struct btree_path *path, unsigned depth_want, @@ -730,10 +720,8 @@ static inline int 
btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, &b->c, SPOS_MAX, - path->level, lock_type, - lock_root_check_fn, rootp, - trace_ip); + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) continue; @@ -939,7 +927,7 @@ static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_path *path, *prev; + struct btree_path *path; unsigned long trace_ip = _RET_IP_; int i, ret = 0; @@ -948,7 +936,6 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) trans->in_traverse_all = true; retry_all: - prev = NULL; trans->restarted = 0; trans_for_each_path(trans, path) @@ -956,18 +943,6 @@ retry_all: btree_trans_sort_paths(trans); - trans_for_each_path_inorder_reverse(trans, path, i) { - if (prev) { - if (path->btree_id == prev->btree_id && - path->locks_want < prev->locks_want) - __bch2_btree_path_upgrade(trans, path, prev->locks_want); - else if (!path->locks_want && prev->locks_want) - __bch2_btree_path_upgrade(trans, path, 1); - } - - prev = path; - } - bch2_trans_unlock(trans); cond_resched(); @@ -3026,16 +3001,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - path = &trans->paths[trans->locking_path_idx]; - prt_printf(out, " locking path %u %c l=%u %c %s:", - trans->locking_path_idx, - path->cached ? 'c' : 'b', - trans->locking_level, - lock_types[trans->locking_wait.lock_want], - bch2_btree_ids[trans->locking_btree_id]); - bch2_bpos_to_text(out, trans->locking_pos); - - prt_printf(out, " node "); + prt_printf(out, " locking node "); bch2_btree_path_node_to_text(out, b); prt_printf(out, "\n"); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 977c523359a5..1a88d1d79699 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -398,17 +398,6 @@ err: return ret; } -static int bkey_cached_check_fn(struct six_lock *lock, void *p) -{ - struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_path *path = p; - - if (ck->key.btree_id != path->btree_id && - bpos_cmp(ck->key.pos, path->pos)) - return BCH_ERR_lock_fail_node_reused; - return 0; -} - __flatten int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, unsigned flags) @@ -440,16 +429,12 @@ retry: } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, - lock_want, - bkey_cached_check_fn, path, _THIS_IP_); - if (ret) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - BUG(); - } + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + BUG_ON(ret); if (ck->key.btree_id != path->btree_id || bpos_cmp(ck->key.pos, path->pos)) { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index e270579d3622..11f83a936dc7 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -92,6 +92,7 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) int ret; if (i == g->g) { + trace_and_count(i->trans->c, 
trans_restart_would_deadlock, i->trans, _RET_IP_); ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); } else { i->trans->lock_must_abort = true; @@ -216,8 +217,10 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct btree_path *path; int ret; - if (trans->lock_must_abort) + if (trans->lock_must_abort) { + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + } g.nr = 0; ret = lock_graph_descend(&g, trans, cycle); @@ -294,7 +297,7 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) return bch2_check_for_deadlock(trans, NULL); } -int __bch2_btree_node_lock_write(struct btree_trans *trans, +int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, bool lock_may_not_fail) { @@ -311,97 +314,10 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); six_lock_readers_add(&b->lock, readers); - return ret; -} - -static inline bool path_has_read_locks(struct btree_path *path) -{ - unsigned l; - - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - return true; - return false; -} - -/* Slowpath: */ -int __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - path_has_read_locks(linked)) { - reason = 1; - goto deadlock; - } + if (ret) + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > btree_path_highest_level_locked(linked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) { - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + return ret; } /* relock */ diff --git a/fs/bcachefs/btree_locking.h 
b/fs/bcachefs/btree_locking.h index 86f68b26cc94..6d8df25bf076 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -195,8 +195,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, int ret; trans->lock_may_not_fail = lock_may_not_fail; - trans->locking = b; trans->lock_must_abort = false; + trans->locking = b; ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, bch2_six_check_for_deadlock, trans); @@ -222,26 +222,6 @@ static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, BUG_ON(ret); } -static inline int btree_node_lock_type(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - if (six_trylock_type(&b->lock, type)) - return 0; - - trans->locking_path_idx = path->idx; - trans->locking_pos = pos; - trans->locking_btree_id = path->btree_id; - trans->locking_level = level; - trans->lock_may_not_fail = false; - trans->locking = b; - return six_lock_type_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans); -} - /* * Lock a btree node if we already have it locked on one of our linked * iterators: @@ -263,19 +243,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree_bkey_cached_common *, - struct bpos, unsigned, - enum six_lock_type, - six_lock_should_sleep_fn, void *, - unsigned long); - static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, + unsigned level, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { int ret = 0; @@ -285,8 +257,7 @@ static inline int btree_node_lock(struct btree_trans *trans, if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || - !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, - should_sleep_fn, p, ip))) { + !(ret = btree_node_lock_nopath(trans, b, type))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = ktime_get_ns(); #endif @@ -295,15 +266,14 @@ static inline int btree_node_lock(struct btree_trans *trans, return ret; } -int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool); +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, + struct btree_bkey_cached_common *b, bool); static inline int __btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, bool lock_may_not_fail) { - int ret; - EBUG_ON(&path->l[b->level].b->c != b); EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); EBUG_ON(!btree_node_intent_locked(path, b->level)); @@ -315,13 +285,9 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, */ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); - ret = likely(six_trylock_write(&b->lock)) + return likely(six_trylock_write(&b->lock)) ? 
0 - : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail); - if (ret) - mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); - - return ret; + : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); } static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 578cf8fa3d2f..2b57e6d6ed31 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -391,10 +391,6 @@ struct btree_trans { struct list_head list; u64 last_begin_time; - unsigned locking_path_idx; - struct bpos locking_pos; - u8 locking_btree_id; - u8 locking_level; u8 lock_may_not_fail; u8 lock_must_abort; struct btree_bkey_cached_common *locking; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index bf3177a3a420..7b21971fa13d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -796,23 +796,6 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans) bch2_trans_verify_locks(trans); } -static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_inorder(trans, path, i) { - //if (path == pos) - // break; - - if (btree_node_read_locked(path, path->level) && - !bch2_btree_path_upgrade_noupgrade_sibs(trans, path, path->level + 1)) - return true; - } - - return false; -} - static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -822,31 +805,15 @@ static inline int trans_lock_write(struct btree_trans *trans) if (same_leaf_as_prev(trans, i)) continue; - /* - * six locks are unfair, and read locks block while a thread - * wants a write lock: thus, we need to tell the cycle detector - * we have a write lock _before_ taking the lock: - */ - mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); - - if (!six_trylock_write(&insert_l(i)->b->c.lock)) { - if (have_conflicting_read_lock(trans, i->path)) - goto fail; - - ret = btree_node_lock_type(trans, i->path, - &insert_l(i)->b->c, - i->path->pos, i->level, - SIX_LOCK_write, NULL, NULL); - BUG_ON(ret); - } + ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); + if (ret) + goto fail; bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; fail: - mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent); - while (--i >= trans->updates) { if (same_leaf_as_prev(trans, i)) continue; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 1ea004f1adbb..bf7ae99d9cce 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -52,7 +52,6 @@ x(BCH_ERR_no_btree_node, no_btree_node_down) \ x(BCH_ERR_no_btree_node, no_btree_node_init) \ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ - x(0, lock_fail_node_reused) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(0, fsck) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 35c40678f4b5..69e142d8b651 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1012,57 +1012,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_would_deadlock, +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned reason, - struct btree_path *have, - struct btree_path *want, - struct bpos *want_pos), - TP_ARGS(trans, caller_ip, reason, 
- have, want, want_pos), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, in_traverse_all ) - __field(u8, reason ) - __field(u8, have_btree_id ) - __field(u8, have_type ) - __field(u8, want_btree_id ) - __field(u8, want_type ) - TRACE_BPOS_entries(have_pos) - TRACE_BPOS_entries(want_pos) - ), - - TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->in_traverse_all = trans->in_traverse_all; - __entry->reason = reason; - __entry->have_btree_id = have->btree_id; - __entry->have_type = have->cached; - __entry->want_btree_id = want->btree_id; - __entry->want_type = want->cached; - TRACE_BPOS_assign(have_pos, have->pos); - TRACE_BPOS_assign(want_pos, *want_pos); - ), - - TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->in_traverse_all, - __entry->reason, - __entry->have_btree_id, - __entry->have_type, - __entry->have_pos_inode, - __entry->have_pos_offset, - __entry->have_pos_snapshot, - __entry->want_btree_id, - __entry->want_type, - __entry->want_pos_inode, - __entry->want_pos_offset, - __entry->want_pos_snapshot) + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) ); DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, -- cgit From 2ec254c098da677295c2487ae36e75a26d557222 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 08:58:02 -0500 Subject: bcachefs: Ensure bch2_btree_node_lock_write_nofail() never fails In order for bch2_btree_node_lock_write_nofail() to never produce a deadlock, we must ensure we're never holding read locks when using it. Fortunately, it's only used from code paths where any read locks may be safely dropped. 
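To see why the rule matters — a write lock can never be granted while a read lock on the same node is still held, even by the same task — the following is a small, self-contained userspace sketch. It uses POSIX rwlocks purely as an analogue; it is not bcachefs code, and the six-lock behaviour differs in detail, but the ordering hazard is the same one the nofail path avoids by dropping read locks first.

/* rwlock_order.c — illustrative only; not bcachefs code. */
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

	pthread_rwlock_rdlock(&lock);

	/* With a reader still present, the write lock cannot be taken: */
	if (pthread_rwlock_trywrlock(&lock) != 0)
		printf("write lock unavailable while read lock held\n");

	/* Drop the read lock first, as the nofail path now does: */
	pthread_rwlock_unlock(&lock);

	if (pthread_rwlock_trywrlock(&lock) == 0)
		printf("write lock acquired after dropping read lock\n");

	pthread_rwlock_unlock(&lock);
	return 0;
}
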
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 34 ++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_locking.h | 12 ++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 11f83a936dc7..6793d7dd18d7 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -320,6 +320,40 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p return ret; } +void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + struct btree_path *linked; + unsigned i; + int ret; + + /* + * XXX BIG FAT NOTICE + * + * Drop all read locks before taking a write lock: + * + * This is a hack, because bch2_btree_node_lock_write_nofail() is a + * hack - but by dropping read locks first, this should never fail, and + * we only use this in code paths where whatever read locks we've + * already taken are no longer needed: + */ + + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_read_locked(linked, i)) { + btree_node_unlock(trans, linked, i); + btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); + } + } + + ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + /* relock */ static inline bool btree_path_get_locks(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 6d8df25bf076..95089693a420 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -290,14 +290,6 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); } -static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - BUG_ON(ret); -} - static inline int __must_check bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, @@ -306,6 +298,10 @@ bch2_btree_node_lock_write(struct btree_trans *trans, return __btree_node_lock_write(trans, path, b, false); } +void bch2_btree_node_lock_write_nofail(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *); + /* relock: */ bool bch2_btree_path_relock_norestart(struct btree_trans *, -- cgit From 8b31e4fc7d9e27fd5dc56bb063acf9e4ea575973 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Aug 2022 15:29:53 -0400 Subject: bcachefs: Kill normalize_read_intent_locks() Before we had the deadlock cycle detector, we didn't want to be holding read locks when taking intent locks, because blocking on an intent lock while holding a read lock was a lock ordering violation that could cause a deadlock. With the cycle detector this is no longer an issue, so this code can be deleted. 
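For readers who have not followed the earlier patches in this series: the cycle detector referred to here is the lock_graph/bch2_check_for_deadlock machinery, which follows the chain of transactions each waiter is blocked on and restarts one of them if the chain loops back on itself. The sketch below is a deliberately simplified userspace model of that idea — each transaction waits on at most one other, unlike the real per-lock wait lists — and all names in it are made up for illustration.

/* waitfor_cycle.c — toy model of deadlock-cycle detection; not the
 * bcachefs lock_graph code.
 */
#include <stdio.h>

#define NR_TRANS 4

/* waits_for[i] == j means transaction i is blocked on a lock held by j;
 * -1 means not blocked. 0 -> 1 -> 2 -> 0 forms a deadlock cycle.
 */
static int waits_for[NR_TRANS] = { 1, 2, 0, -1 };

static int check_for_deadlock(int start)
{
	int seen[NR_TRANS] = { 0 };
	int i = start;

	while (i != -1) {
		if (seen[i])
			return 1;	/* found a cycle: would deadlock */
		seen[i] = 1;
		i = waits_for[i];
	}
	return 0;
}

int main(void)
{
	printf("trans 0: %s\n", check_for_deadlock(0) ? "deadlock cycle" : "ok");
	printf("trans 3: %s\n", check_for_deadlock(3) ? "deadlock cycle" : "ok");
	return 0;
}
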
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 58 ----------------------------------------- 1 file changed, 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7b21971fa13d..ad46c887185b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -740,62 +740,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } -static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - unsigned l; - - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); -} - -static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - struct btree *b = path_l(path)->b; - unsigned l; - - do { - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - path_upgrade_readers(trans, path); - } while ((path = prev_btree_path(trans, path)) && - path_l(path)->b == b); -} - -/* - * Check for nodes that we have both read and intent locks on, and upgrade the - * readers to intent: - */ -static inline void normalize_read_intent_locks(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i, nr_read = 0, nr_intent = 0; - - trans_for_each_path_inorder(trans, path, i) { - struct btree_path *next = i + 1 < trans->nr_sorted - ? trans->paths + trans->sorted[i + 1] - : NULL; - - switch (btree_node_locked_type(path, path->level)) { - case BTREE_NODE_READ_LOCKED: - nr_read++; - break; - case BTREE_NODE_INTENT_LOCKED: - nr_intent++; - break; - } - - if (!next || path_l(path)->b != path_l(next)->b) { - if (nr_read && nr_intent) - upgrade_readers(trans, path); - - nr_read = nr_intent = 0; - } - } - - bch2_trans_verify_locks(trans); -} - static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -899,8 +843,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - normalize_read_intent_locks(trans); - ret = trans_lock_write(trans); if (unlikely(ret)) return ret; -- cgit From afbc71946861902cfee7bee2b16c8e1570375e0e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Sep 2022 22:56:27 -0400 Subject: bcachefs: Improve bch2_btree_trans_to_text() This is just a formatting/readability improvement. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6f4af13cf9e4..962010230c41 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2947,8 +2947,8 @@ void bch2_trans_exit(struct btree_trans *trans) } static void __maybe_unused -bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b) +bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); struct task_struct *owner; @@ -2959,11 +2959,13 @@ bch2_btree_path_node_to_text(struct printbuf *out, pid = owner ? owner->pid : 0;; rcu_read_unlock(); - prt_printf(out, " l=%u %s:", - b->level, bch2_btree_ids[b->btree_id]); + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 
'c' : 'b', + b->level, bch2_btree_ids[b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_tab(out); + prt_printf(out, " locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -2974,6 +2976,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; + if (!out->nr_tabstops) { + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 32); + } + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); trans_for_each_path(trans, path) { @@ -2986,24 +2993,26 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) path->level, bch2_btree_ids[path->btree_id]); bch2_bpos_to_text(out, path->pos); - prt_printf(out, "\n"); + prt_newline(out); for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l) && !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { prt_printf(out, " %c l=%u ", lock_types[btree_node_locked_type(path, l)], l); - bch2_btree_path_node_to_text(out, b); - prt_printf(out, "\n"); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } } } b = READ_ONCE(trans->locking); if (b) { - prt_printf(out, " locking node "); - bch2_btree_path_node_to_text(out, b); - prt_printf(out, "\n"); + prt_str(out, " want"); + prt_newline(out); + prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } } -- cgit From d602657cd185784bf4b227ecd1a88ed90735eae5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Sep 2022 21:27:42 -0400 Subject: bcachefs: Fix error handling in bch2_btree_update_start() We were checking for -EAGAIN, but we're not returned that when we didn't pass a closure to wait with - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7028597358d5..0d87b1c88ed4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1110,8 +1110,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, goto err; ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); - if (ret == -EAGAIN || - ret == -ENOMEM) { + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { struct closure cl; closure_init_stack(&cl); -- cgit From 01ed3359b2705d7b862ad0644dabc616774f8779 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Sep 2022 00:20:21 -0400 Subject: bcachefs: btree_update_nodes_written() needs BTREE_INSERT_USE_RESERVE This fixes an obvious deadlock - whoops. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0d87b1c88ed4..7a682c1751b3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -625,6 +625,7 @@ static void btree_update_nodes_written(struct btree_update *as) ret = commit_do(&trans, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| JOURNAL_WATERMARK_reserved, btree_update_nodes_written_trans(&trans, as)); -- cgit From c6cf49a95ac7aea3450cacb66b7de1d46671deeb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Sep 2022 21:00:24 -0400 Subject: bcachefs: Fix blocking with locks held This is a major oopsy - we should always be unlocking before calling closure_sync(), else we'll cause a deadlock. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7a682c1751b3..797efa738dc4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1117,10 +1117,10 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); - bch2_trans_unlock(trans); - do { ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + + bch2_trans_unlock(trans); closure_sync(&cl); } while (ret == -EAGAIN); } -- cgit From c36ff038fd3af6092e356cb4ed1c79a041b77b19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 14:49:14 -0400 Subject: bcachefs: bch2_btree_cache_scan() improvement We're still seeing OOM issues caused by the btree node cache shrinker not sufficiently freeing memory: thus, this patch changes the shrinker to not exit if __GFP_FS was not supplied. Instead, tweak btree node memory allocation so that we never invoke memory reclaim while holding the btree node cache lock. 
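The allocation pattern described above — only non-blocking allocations under the cache lock, with the blocking fallback done after dropping it — can be sketched in userspace as follows. This is illustrative only: the helper names and the simulated allocation failure are made up, and the real code uses kzalloc with __GFP_NOWARN versus GFP_KERNEL around bc->lock.

/* alloc_outside_lock.c — illustrative only; not bcachefs code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for a non-blocking allocation failing under memory pressure: */
static void *alloc_nonblocking(size_t size)
{
	(void) size;
	return NULL;		/* force the slow path for the demo */
}

/* Stand-in for an allocation that may block in memory reclaim: */
static void *alloc_blocking(size_t size)
{
	return calloc(1, size);
}

int main(void)
{
	void *node;

	pthread_mutex_lock(&cache_lock);

	node = alloc_nonblocking(128);
	if (!node) {
		/* Drop the lock before an allocation that may block: */
		pthread_mutex_unlock(&cache_lock);
		node = alloc_blocking(128);
		pthread_mutex_lock(&cache_lock);
	}

	/* ... insert node into the cache ... */
	pthread_mutex_unlock(&cache_lock);

	printf("node %p allocated without blocking under cache_lock\n", node);
	free(node);
	return 0;
}
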
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 84 ++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index aeb058c800cd..db786df19318 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -110,9 +110,9 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static struct btree *__btree_node_mem_alloc(struct bch_fs *c) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) { - struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); + struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) return NULL; @@ -128,7 +128,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b = __btree_node_mem_alloc(c); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -280,20 +280,17 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; - unsigned long touched = 0; unsigned long freed = 0; + unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_read(&bc->dirty) + nr >= + bc->used * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - goto out_norestore; - + mutex_lock(&bc->lock); flags = memalloc_nofs_save(); /* @@ -318,7 +315,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, touched++; if (touched >= nr) - break; + goto out; if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); @@ -329,52 +326,46 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, } restart: list_for_each_entry_safe(b, t, &bc->live, list) { - /* tweak this */ + touched++; + if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - goto touched; - } - - if (!btree_node_reclaim(c, b)) { - /* can't call bch2_btree_node_hash_remove under lock */ + } else if (!btree_node_reclaim(c, b)) { freed++; - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - btree_node_data_free(c, b); - mutex_unlock(&bc->lock); bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - if (freed >= nr) - goto out; - - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - goto out; + if (freed == nr) + goto out_rotate; + } else if (trigger_writes && + btree_node_dirty(b) && + !btree_node_will_make_reachable(b) && + !btree_node_write_blocked(b) && + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); + __bch2_btree_node_write(c, b, 0); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; + mutex_lock(&bc->lock); goto restart; - } else { - continue; } -touched: - touched++; - if (touched >= nr) { - /* Save position */ - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (touched >= nr) break; - } } - - mutex_unlock(&bc->lock); +out_rotate: + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); out: + mutex_unlock(&bc->lock); +out_nounlock: ret = freed; memalloc_nofs_restore(flags); -out_norestore: trace_and_count(c, btree_cache_scan, 
sc->nr_to_scan, can_free, ret); return ret; } @@ -586,9 +577,14 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) goto got_node; } - b = __btree_node_mem_alloc(c); - if (!b) - goto err_locked; + b = __btree_node_mem_alloc(c, __GFP_NOWARN); + if (!b) { + mutex_unlock(&bc->lock); + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + goto err; + mutex_lock(&bc->lock); + } if (pcpu_read_locks) six_lock_pcpu_alloc(&b->c.lock); @@ -641,7 +637,7 @@ out: return b; err: mutex_lock(&bc->lock); -err_locked: + /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); -- cgit From e9174370d0522b466ea770576230b487941101f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 16:42:53 -0400 Subject: bcachefs: bch2_btree_node_relock_notrace() Most of the node_relock_fail trace events are generated from bch2_btree_path_verify_level(), when debugcheck_iterators is enabled - but we're not interested in these trace events, they don't indicate that we're in a slowpath. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_locking.c | 6 ++++-- fs/bcachefs/btree_locking.h | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 962010230c41..237e5c0afffa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -167,7 +167,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(trans, path, level)) + if (!bch2_btree_node_relock_notrace(trans, path, level)) return; BUG_ON(!btree_path_pos_in_node(path, l->b)); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 6793d7dd18d7..19062cea8774 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -401,7 +401,8 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, } bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) + struct btree_path *path, unsigned level, + bool trace) { struct btree *b = btree_path_node(path, level); int want = __btree_lock_want(path, level); @@ -416,7 +417,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + if (trace) + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 95089693a420..c74a5fd4d908 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -317,7 +317,7 @@ static inline int bch2_btree_path_relock(struct btree_trans *trans, : __bch2_btree_path_relock(trans, path, trace_ip); } -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); static inline bool bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level) @@ -328,7 +328,19 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level)); + __bch2_btree_node_relock(trans, path, level, true)); +} + +static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, + struct 
btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, false)); } /* upgrade */ -- cgit From a8f35428430446d8c9e871b36ab2b49c0a9daec7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 16:43:55 -0400 Subject: bcachefs: bch2_print_string_as_lines() This adds a helper for printing a large buffer one line at a time, to avoid the 1k printk limit. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++---- fs/bcachefs/util.c | 21 +++++++++++++++++++++ fs/bcachefs/util.h | 2 ++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 237e5c0afffa..d58e29acfda3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1334,7 +1334,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch_err(trans->c, "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -1382,11 +1382,10 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) struct printbuf buf = PRINTBUF; __bch2_trans_paths_to_text(&buf, trans, nosort); + bch2_trans_updates_to_text(&buf, trans); - printk(KERN_ERR "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - - bch2_dump_trans_updates(trans); } noinline __cold diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 61cd44c5a6b4..477c260de50b 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -244,6 +245,26 @@ void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + const char *p; + + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + console_lock(); + while (1) { + p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + } + console_unlock(); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 192d8b53f2ca..a16f8bb9d415 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -382,6 +382,8 @@ u64 bch2_read_flag_list(char *, const char * const[]); void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_print_string_as_lines(const char *prefix, const char *lines); + #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -- cgit From dbb9936b0dc905657db6e5289be18e425f1b60d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 18:18:48 -0400 Subject: bcachefs: Improve bch2_fsck_err() - factor out fsck_err_get() - if the "bcachefs (%s):" prefix has already been applied, don't duplicate it - convert to printbufs instead of static char arrays - tidy up control flow a bit - use bch2_print_string_as_lines(), to avoid messages getting truncated Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +- fs/bcachefs/error.c | 152 ++++++++++++++++++++++++++++++------------------- fs/bcachefs/error.h | 3 +- 3 files changed, 97 
insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 74da688d994b..08fd899d8837 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -226,9 +226,11 @@ do { \ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else +#define bch2_log_msg(_c, fmt) fmt #define bch2_fmt(_c, fmt) fmt "\n" #define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index f6a895b2ceb7..762abdf2f283 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -68,102 +68,135 @@ void bch2_io_error(struct bch_dev *ca) #include "tools-util.h" #endif -int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { - struct fsck_err_state *s = NULL; - va_list args; - bool fix = false, print = true, suppressing = false; - char _buf[sizeof(s->buf)], *buf = _buf; - - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); + struct fsck_err_state *s; - if (c->opts.errors == BCH_ON_ERROR_continue) { - bch_err(c, "fixing"); - return -BCH_ERR_fsck_fix; - } else { - bch2_inconsistent_error(c); - return -BCH_ERR_fsck_errors_not_fixed; - } - } - - mutex_lock(&c->fsck_error_lock); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; list_for_each_entry(s, &c->fsck_errors, list) - if (s->fmt == fmt) - goto found; + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ + list_move(&s->list, &c->fsck_errors); + return s; + } s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { if (!c->fsck_alloc_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); c->fsck_alloc_err = true; - buf = _buf; - goto print; + return NULL; } INIT_LIST_HEAD(&s->list); s->fmt = fmt; -found: - list_move(&s->list, &c->fsck_errors); - s->nr++; - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; + s->buf = PRINTBUF; + list_add(&s->list, &c->fsck_errors); + return s; +} + +int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
+{ + struct fsck_err_state *s = NULL; + va_list args; + bool print = true, suppressing = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + + mutex_lock(&c->fsck_error_lock); + s = fsck_err_get(c, fmt); + if (s) { + if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } + + printbuf_reset(&s->buf); + out = &s->buf; + s->nr++; } - buf = s->buf; -print: + + if (!strncmp(fmt, "bcachefs:", 9)) + prt_printf(out, bch2_log_msg(c, "")); + va_start(args, fmt); - vscnprintf(buf, sizeof(_buf), fmt, args); + prt_vprintf(out, fmt, args); va_end(args); - if (c->opts.fix_errors == FSCK_OPT_EXIT) { - bch_err(c, "%s, exiting", buf); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); + bch2_inconsistent_error(c); + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { if (c->opts.fix_errors == FSCK_OPT_ASK) { - printk(KERN_ERR "%s: fix?", buf); - fix = ask_yn(); + prt_str(out, ": fix?"); + bch2_print_string_as_lines(KERN_ERR, out->buf); + print = false; + ret = ask_yn() + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; } else if (c->opts.fix_errors == FSCK_OPT_YES || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - if (print) - bch_err(c, "%s, fixing", buf); - fix = true; + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; } else { - if (print) - bch_err(c, "%s, not fixing", buf); - fix = false; + prt_str(out, ", not fixing"); } } else if (flags & FSCK_NEED_FSCK) { - if (print) - bch_err(c, "%s (run fsck to correct)", buf); + prt_str(out, " (run fsck to correct)"); } else { - if (print) - bch_err(c, "%s (repair unimplemented)", buf); + prt_str(out, " (repair unimplemented)"); } - if (suppressing) + if (ret == -BCH_ERR_fsck_ignore && + (c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (print) + bch2_print_string_as_lines(KERN_ERR, out->buf); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) + bch_err(c, "Unable to continue, halting"); + else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); mutex_unlock(&c->fsck_error_lock); - if (fix) { + printbuf_exit(&buf); + + if (ret == -BCH_ERR_fsck_fix) { set_bit(BCH_FS_ERRORS_FIXED, &c->flags); - return -BCH_ERR_fsck_fix; } else { set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); set_bit(BCH_FS_ERROR, &c->flags); - return c->opts.fix_errors == FSCK_OPT_EXIT || - !(flags & FSCK_CAN_IGNORE) - ? 
-BCH_ERR_fsck_errors_not_fixed - : -BCH_ERR_fsck_ignore; } + + return ret; } void bch2_flush_fsck_errs(struct bch_fs *c) @@ -174,9 +207,10 @@ void bch2_flush_fsck_errs(struct bch_fs *c) list_for_each_entry_safe(s, n, &c->fsck_errors, list) { if (s->ratelimited) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf); list_del(&s->list); + printbuf_exit(&s->buf); kfree(s); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index b603d738c549..bbf9b6d85b4d 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -103,7 +103,7 @@ struct fsck_err_state { const char *fmt; u64 nr; bool ratelimited; - char buf[512]; + struct printbuf buf; }; #define FSCK_CAN_FIX (1 << 0) @@ -121,7 +121,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ - bch_err(c, "Unable to continue, halting"); \ ret = _ret; \ goto fsck_err; \ } \ -- cgit From d704d62355b76e3f1f7efbe9b3072627fd4b4a3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 18:22:54 -0400 Subject: bcachefs: btree_err() now uses bch2_print_string_as_lines() We've seen long error messages get truncated here, so convert to the new bch2_print_string_as_lines(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c63cb70836cc..603b825ed6fe 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -507,9 +507,11 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, "error validating btree node "); - if (write) - prt_printf(out, "before write "); + prt_printf(out, bch2_log_msg(c, "")); + if (!write) + prt_str(out, "error validating btree node "); + else + prt_str(out, "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -518,6 +520,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "\n node offset %u", b->written); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_str(out, ": "); } enum btree_err_type { @@ -537,7 +540,7 @@ enum btree_validate_ret { struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, ": " msg, ##__VA_ARGS__); \ + prt_printf(&out, msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -546,10 +549,10 @@ enum btree_validate_ret { goto out; \ } \ \ + bch2_print_string_as_lines(KERN_ERR, out.buf); \ + \ switch (write) { \ case READ: \ - bch_err(c, "%s", out.buf); \ - \ switch (type) { \ case BTREE_ERR_FIXABLE: \ ret = -BCH_ERR_fsck_errors_not_fixed; \ @@ -569,8 +572,6 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", out.buf);\ - \ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ -- cgit From f3b8403ee70e5dcce5a16d3517b411bd8839319b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Sep 2022 22:26:48 -0400 Subject: bcachefs: Run bch2_fs_counters_init() earlier We need counters to be initialized before initializing shrinkers - the shrinker callbacks will update those counters. This fixes a segfault in userspace. 
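The fix leans on the GNU C "a ?: b" error-chaining used for the init calls in the patch below: each initializer returns 0 on success and the chain stops at the first nonzero error, so running counters first only requires moving that call to the head of the chain. A rough standalone sketch of the idiom (plain C, hypothetical names, not bcachefs code):

#include <stdio.h>

static long nr_counters;	/* stand-in for the counters a shrinker callback updates */

static int counters_init(void)	{ nr_counters = 0; return 0; }
static int io_clock_init(void)	{ return 0; }
static int shrinker_init(void)	{ nr_counters++; return 0; }	/* touches a counter */

static int fs_alloc(void)
{
	/*
	 * GNU C "a ?: b" evaluates b only when a is zero, so the chain runs
	 * the initializers in order and stops at the first failure; putting
	 * counters_init() first guarantees the counters exist before
	 * shrinker_init() uses them.
	 */
	return counters_init() ?:
	       io_clock_init() ?:
	       shrinker_init();
}

int main(void)
{
	int ret = fs_alloc();

	printf("init %s, counters %ld\n", ret ? "failed" : "ok", nr_counters);
	return ret;
}
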
Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8dc87c103216..29e2b76322d7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -781,7 +781,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + ret = bch2_fs_counters_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: @@ -795,8 +796,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c) ?: - bch2_fs_counters_init(c); + bch2_fs_fsio_init(c); if (ret) goto err; -- cgit From 14d8f26ad09d05676e90dc4fe1e6728e13d84607 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 16:15:17 -0400 Subject: bcachefs: Inline bch2_trans_kmalloc() fast path Small performance optimization. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 39 ++++++++++++++++++--------------------- fs/bcachefs/btree_iter.h | 18 +++++++++++++++++- 2 files changed, 35 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d58e29acfda3..30e7f07e3fa8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2668,37 +2668,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = NULL; } -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { unsigned new_top = trans->mem_top + size; + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(new_top); + void *new_mem; void *p; trans->mem_max = max(trans->mem_max, new_top); - if (new_top > trans->mem_bytes) { - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); - void *new_mem; + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - - new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - kfree(trans->mem); - } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } - if (!new_mem) - return ERR_PTR(-ENOMEM); + if (!new_mem) + return ERR_PTR(-ENOMEM); - trans->mem = new_mem; - trans->mem_bytes = new_bytes; + trans->mem = new_mem; + trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); - } + if (old_bytes) { + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } p = trans->mem + trans->mem_top; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 04b6773d6e10..1be7c8883101 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -328,7 +328,23 @@ static inline void 
set_btree_iter_dontneed(struct btree_iter *iter) iter->path->preserve = false; } -void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void *__bch2_trans_kmalloc(struct btree_trans *, size_t); + +static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + unsigned new_top = trans->mem_top + size; + void *p = trans->mem + trans->mem_top; + + if (likely(new_top <= trans->mem_bytes)) { + trans->mem_top += size; + memset(p, 0, size); + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + + } +} + u32 bch2_trans_begin(struct btree_trans *); static inline struct btree * -- cgit From 3f3bc66ef0601e425a3c2901b34a825d4166da63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 16:19:56 -0400 Subject: bcachefs: Optimize btree_path_alloc() - move slowpath code to a separate function, btree_path_overflow() - no need to use hweight64 - copy nr_max_paths from btree_transaction_stats to btree_trans, avoiding a data dependency in the fast path Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 30e7f07e3fa8..366be8015b97 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1408,7 +1408,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) if (!buf.allocation_failure) { mutex_lock(&s->lock); if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = hweight64(trans->paths_allocated); + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1419,17 +1420,21 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) trans->nr_max_paths = hweight64(trans->paths_allocated); } -static struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); +} + +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) { struct btree_path *path; unsigned idx; if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { - bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); - } + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); trans->paths_allocated |= 1ULL << idx; -- cgit From c23a9e088259193dc883371cf49c99fed675e951 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 16:23:19 -0400 Subject: bcachefs: Improve jset_validate() Previously, jset_validate() was formatting the initial part of an error string for every entry it validating - expensive. This moves that code to journal_entry_err_msg(), which is now only called if there's an actual error. 
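A rough sketch of the pattern being described, with the expensive message formatting deferred to a helper that only runs once validation has actually failed (standalone C, hypothetical names, not the bcachefs code itself):

#include <stdio.h>

struct entry { unsigned type; unsigned u64s; };

/* only reached on the error path, so the common case pays nothing for it */
static void entry_err_msg(char *buf, size_t len, const struct entry *e)
{
	snprintf(buf, len, "invalid entry type %u (%u u64s): ", e->type, e->u64s);
}

static int validate_entry(const struct entry *e)
{
	if (e->u64s != 0)
		return 0;			/* fast path: no formatting at all */

	char buf[128];
	entry_err_msg(buf, sizeof(buf), e);	/* slow path only */
	fprintf(stderr, "%swrong number of keys\n", buf);
	return -1;
}

int main(void)
{
	struct entry good = { .type = 1, .u64s = 4 };
	struct entry bad  = { .type = 1, .u64s = 0 };

	validate_entry(&good);
	validate_entry(&bad);
	return 0;
}
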
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 160 +++++++++++++++++++++++++++-------------------- fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/super-io.c | 2 +- 3 files changed, 94 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 090a718b917f..0c82f1048e21 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -186,30 +186,57 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_NONE 6 #define JOURNAL_ENTRY_BAD 7 -#define journal_entry_err(c, msg, ...) \ +static void journal_entry_err_msg(struct printbuf *out, + struct jset *jset, + struct jset_entry *entry) +{ + prt_str(out, "invalid journal entry "); + if (entry) + prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); + + if (!jset) + prt_printf(out, "in superblock"); + else if (!entry) + prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); + else + prt_printf(out, "at offset %zi/%u seq %llu", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + +#define journal_entry_err(c, jset, entry, msg, ...) \ ({ \ + struct printbuf buf = PRINTBUF; \ + \ + journal_entry_err_msg(&buf, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ switch (write) { \ case READ: \ - mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + mustfix_fsck_err(c, "%s", buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write:\n" \ - msg, ##__VA_ARGS__); \ + bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ } \ + \ + printbuf_exit(&buf); \ true; \ }) -#define journal_entry_err_on(cond, c, msg, ...) \ - ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ + ((cond) ? 
journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 -static int journal_validate_key(struct bch_fs *c, const char *where, +static int journal_validate_key(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned level, enum btree_id btree_id, struct bkey_i *k, @@ -219,33 +246,24 @@ static int journal_validate_key(struct bch_fs *c, const char *where, struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, - "invalid key in %s at %s offset %zi/%u: k->u64s 0", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid key in %s at %s offset %zi/%u: extends past end of journal entry", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + (void *) vstruct_next(entry), + c, jset, entry, + "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid key in %s at %s offset %zi/%u: bad format %u", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s), - k->k.format)) { + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, jset, entry, + "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); @@ -259,10 +277,11 @@ static int journal_validate_key(struct bch_fs *c, const char *where, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s)); + prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", + bch2_jset_entry_types[entry->type], + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -290,14 +309,14 @@ fsck_err: } static int journal_entry_btree_keys_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, where, entry, + int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, k, version, big_endian, write); @@ -328,7 +347,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_btree_root_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -336,7 +355,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, + le16_to_cpu(entry->u64s) != k->k.u64s, + c, jset, entry, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -349,7 
+369,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - return journal_validate_key(c, where, entry, 1, entry->btree_id, k, + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, version, big_endian, write); fsck_err: return ret; @@ -362,7 +382,7 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -376,13 +396,14 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -400,14 +421,15 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -416,7 +438,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), c, + le64_to_cpu(bl_entry->end), + c, jset, entry, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -437,7 +460,7 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ } static int journal_entry_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -447,7 +470,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, + c, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -469,7 +492,7 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_data_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -480,7 +503,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, + c, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -501,7 +524,7 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_clock_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -511,13 +534,13 @@ static int 
journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, "invalid journal entry clock: bad size")) { + c, jset, entry, "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, "invalid journal entry clock: bad rw")) { + c, jset, entry, "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -536,7 +559,7 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_dev_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -548,7 +571,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, "invalid journal entry dev usage: bad size (%u < %u)", + c, jset, entry, "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -557,13 +580,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, "invalid journal entry dev usage: bad dev")) { + c, jset, entry, "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, "invalid journal entry dev usage: bad pad")) { + c, jset, entry, "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -596,7 +619,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_log_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -612,11 +635,12 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "%.*s", bytes, l->d); } -static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, +static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write); + return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -626,7 +650,7 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs } struct jset_entry_ops { - int (*validate)(struct bch_fs *, const char *, + int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -641,12 +665,13 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { #undef x }; -int bch2_journal_entry_validate(struct bch_fs *c, const char *where, +int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, where, entry, + ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, version, big_endian, write) : 0; } @@ -665,24 +690,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { - char buf[100]; struct jset_entry *entry; int ret = 0; vstruct_for_each(jset, entry) { - scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", - le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, + vstruct_last(jset), c, jset, entry, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } - ret = bch2_journal_entry_validate(c, buf, entry, + ret = bch2_journal_entry_validate(c, jset, entry, le32_to_cpu(jset->version), JSET_BIG_ENDIAN(jset), write); if (ret) @@ -710,7 +729,8 @@ static int jset_validate(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, c, + version >= bcachefs_metadata_version_max, + c, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -723,7 +743,8 @@ static int jset_validate(struct bch_fs *c, sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) { @@ -732,7 +753,8 @@ static int jset_validate(struct bch_fs *c, -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), + c, jset, NULL, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -745,7 +767,8 @@ static int jset_validate(struct bch_fs *c, goto csum_done; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), + c, jset, NULL, "%s sector %llu seq %llu: journal checksum bad", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq))) @@ -759,7 +782,8 @@ static int jset_validate(struct bch_fs *c, csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, jset, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 30e995c81fc4..1a91f2c0a26c 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -44,7 +44,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 12edd4b9a44b..2a347efdbd83 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1161,7 +1161,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, "superblock", entry, + ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), write); -- cgit From 25b4b3308e3a0dd45677b6eeeea4431ee08a32c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 18:13:29 -0400 Subject: bcachefs: Inline fast path of check_pos_snapshot_overwritten() This moves the slowpath of check_pos_snapshot_overwritten() to a separate function, and inlines the fast path - helping performance on btrees that don't use snapshot and for users that aren't using snapshots. 
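A minimal standalone sketch of that split, with the cheap checks in a small inline wrapper and the expensive work pushed out of line (hypothetical names; an illustration of the shape, not the bcachefs functions themselves):

#include <stdbool.h>
#include <stdio.h>

static bool type_has_snapshots(int id)
{
	return id == 2;			/* stand-in for btree_type_has_snapshots() */
}

static __attribute__((noinline)) int check_overwritten_slowpath(int id, unsigned snapshot)
{
	/* pretend this walks the tree looking for overwrites in child snapshots */
	printf("slow path for id %d snapshot %u\n", id, snapshot);
	return 0;
}

static inline int check_overwritten(int id, unsigned snapshot)
{
	/* fast path: callers that don't use snapshots never leave this wrapper */
	if (!type_has_snapshots(id) || snapshot == ~0U)
		return 0;

	return check_overwritten_slowpath(id, snapshot);
}

int main(void)
{
	check_overwritten(1, ~0U);	/* fast path */
	check_overwritten(2, 7);	/* slow path */
	return 0;
}
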
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ad46c887185b..31b60864b6da 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1113,7 +1113,7 @@ err: goto retry; } -static int check_pos_snapshot_overwritten(struct btree_trans *trans, +static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, enum btree_id id, struct bpos pos) { @@ -1122,12 +1122,6 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bkey_s_c k; int ret; - if (!btree_type_has_snapshots(id)) - return 0; - - if (!snapshot_t(c, pos.snapshot)->children[0]) - return 0; - bch2_trans_iter_init(trans, &iter, id, pos, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); @@ -1153,6 +1147,18 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, return ret; } +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + pos.snapshot == U32_MAX || + !snapshot_t(trans->c, pos.snapshot)->children[0]) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, -- cgit From c298fd7d346f5a7c55c4f1d01f97fbd39da9ff53 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 18:15:33 -0400 Subject: bcachefs; Mark __bch2_trans_iter_init as inline This function is fairly small and only used in two places: one very hot, the other cold, so it should definitely be inlined. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++------ fs/bcachefs/btree_update_interior.c | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 366be8015b97..67a1e0d70ed9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2581,12 +2581,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +static inline void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) { if (trans->restarted) panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 797efa738dc4..fd44492150b0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -36,8 +36,7 @@ static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, struct btree_path *path; path = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_INTENT); + BTREE_ITER_NOPRESERVE|BTREE_ITER_INTENT); path = bch2_btree_path_make_mut(trans, path, true); bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); -- cgit From 68b6cd194ab23d0696a9d7adb024eabca95d4920 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 18:18:00 -0400 Subject: bcachefs: Improve bucket_alloc tracepoint It now 
includes more info - whether the bucket was for metadata or data - and also call it in the same place as the bucket_alloc_fail tracepoint. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ++++ fs/bcachefs/trace.h | 25 ++++++++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e890b09f80c6..0525a4ee0dd1 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -587,6 +587,8 @@ err: if (!IS_ERR(ob)) trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, + ob->bucket, usage.d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), @@ -600,6 +602,8 @@ err: else trace_and_count(c, bucket_alloc_fail, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, + 0, usage.d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 69e142d8b651..24d089507a21 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -512,6 +512,8 @@ DEFINE_EVENT(bch_fs, gc_gens_end, DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, + u64 bucket, u64 free, u64 avail, u64 copygc_wait_amount, @@ -522,12 +524,15 @@ DECLARE_EVENT_CLASS(bucket_alloc, u64 nouse, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, seen, open, need_journal_commit, nouse, nonblocking, err), TP_STRUCT__entry( __field(dev_t, dev ) __array(char, reserve, 16 ) + __field(bool, user ) + __field(u64, bucket ) __field(u64, free ) __field(u64, avail ) __field(u64, copygc_wait_amount ) @@ -537,12 +542,14 @@ DECLARE_EVENT_CLASS(bucket_alloc, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) - __array(char, err, 16 ) + __array(char, err, 32 ) ), TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket = bucket; __entry->free = free; __entry->avail = avail; __entry->copygc_wait_amount = copygc_wait_amount; @@ -555,9 +562,11 @@ DECLARE_EVENT_CLASS(bucket_alloc, strlcpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, + __entry->user, + __entry->bucket, __entry->free, __entry->avail, __entry->copygc_wait_amount, @@ -572,6 +581,8 @@ DECLARE_EVENT_CLASS(bucket_alloc, DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, + u64 bucket, u64 free, u64 avail, u64 copygc_wait_amount, @@ -582,12 +593,15 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, u64 nouse, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, seen, open, need_journal_commit, nouse, nonblocking, err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, 
+ u64 bucket, u64 free, u64 avail, u64 copygc_wait_amount, @@ -598,7 +612,8 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, u64 nouse, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, seen, open, need_journal_commit, nouse, nonblocking, err) ); -- cgit From 2d848dacb2a7b7b6766c43b1945351ef360f4344 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 18:21:07 -0400 Subject: bcachefs: Kill io_in_flight semaphore This used to be needed more for buffered IO, but now the block layer has writeback throttling - we can delete this now. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 73f5677cadce..3cb542f0d8c7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1214,8 +1214,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) struct bio_vec *bvec; unsigned i; - up(&io->op.c->io_in_flight); - if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); @@ -1278,8 +1276,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; - down(&io->op.c->io_in_flight); - w->io = NULL; closure_call(&io->op.cl, bch2_write, NULL, NULL); } -- cgit From 99e2146bea04d092d9fe2825c4dcd1fb19994bce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Sep 2022 22:34:49 -0400 Subject: bcachefs: Break out bch2_btree_path_traverse_cached_slowpath() Prep work for further refactoring. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 60 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1a88d1d79699..b26d4ffe2a11 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -398,9 +398,9 @@ err: return ret; } -__flatten -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +noinline static int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_cached *ck; @@ -481,6 +481,60 @@ err: return ret; } +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if 
(!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, -- cgit From e0eaf8625974d91b4e50a0911b11af5d46c811c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Sep 2022 17:17:23 -0400 Subject: bcachefs: Factor out bch2_write_drop_io_error_ptrs() Move slowpath code to a separate, non-inline function. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 39 ++++++++++++++++++++++++++------------- fs/bcachefs/io.h | 2 ++ 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ed78cb8d90a2..648e4a0a21a9 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -615,17 +615,11 @@ static void bch2_write_done(struct closure *cl) op->end_io(op); } -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { - struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n, *k; - unsigned dev; - int ret = 0; + struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); @@ -634,10 +628,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { - ret = -EIO; - goto err; - } + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; } if (dst != src) @@ -646,6 +638,25 @@ static void __bch2_write_index(struct bch_write_op *op) } keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } /* * probably not the ideal place to hook this in, but I don't @@ -787,8 +798,10 @@ static void bch2_write_endio(struct bio *bio) op->pos.inode, op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index b484d3387968..a3505762b68d 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -43,6 +43,7 @@ enum bch_write_flags { __BCH_WRITE_JOURNAL_SEQ_PTR, __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, + __BCH_WRITE_IO_ERROR, }; #define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) @@ -61,6 +62,7 @@ enum bch_write_flags { #define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR) #define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) +#define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) static inline u64 *op_journal_seq(struct bch_write_op *op) { -- cgit From 29cea6f4834b36f6a51832e08218e03b60002a9a Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Sep 2022 18:56:57 -0400 Subject: bcachefs: Fix bch2_btree_path_up_until_good_node() There was a rare bug when path->locks_want was nonzero, but not BTREE_MAX_DEPTH, where we'd return on a valid node that wasn't locked - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 67a1e0d70ed9..7ea297249850 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1030,7 +1030,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, int check_pos) { unsigned i, l = path->level; - +again: while (btree_path_node(path, l) && !btree_path_good_node(trans, path, l, check_pos)) __btree_path_set_level_up(trans, path, l++); @@ -1039,9 +1039,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, for (i = l + 1; i < path->locks_want && btree_path_node(path, i); i++) - if (!bch2_btree_node_relock(trans, path, i)) + if (!bch2_btree_node_relock(trans, path, i)) { while (l <= i) __btree_path_set_level_up(trans, path, l++); + goto again; + } return l; } -- cgit From ae10fe017bf54653a61a93e49fac1c3e2b474e20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Nov 2022 16:06:55 -0400 Subject: bcachefs: bucket_alloc_state This refactoring puts our various allocation path counters into a dedicated struct - the upcoming nocow patch is going to add another counter. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 81 +++++++++++------------------------------- fs/bcachefs/alloc_types.h | 7 ++++ fs/bcachefs/trace.h | 29 ++++++--------- 3 files changed, 37 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0525a4ee0dd1..2318d08ab70f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -205,26 +205,24 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * u64 bucket, enum alloc_reserve reserve, struct bch_alloc_v4 *a, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct open_bucket *ob; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - (*skipped_nouse)++; + s->skipped_nouse++; return NULL; } if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - (*skipped_open)++; + s->skipped_open++; return NULL; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { - (*skipped_need_journal_commit)++; + s->skipped_need_journal_commit++; return NULL; } @@ -244,7 +242,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - (*skipped_open)++; + s->skipped_open++; return NULL; } @@ -283,9 +281,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, u64 free_entry, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct bkey_s_c freespace_k, struct closure *cl) { @@ -343,11 +339,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto err; } - 
ob = __try_alloc_bucket(c, ca, b, reserve, &a, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl); + ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl); if (!ob) iter.path->preserve = false; err: @@ -393,10 +385,7 @@ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - u64 *buckets_seen, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; @@ -422,13 +411,9 @@ again: if (a.data_type != BCH_DATA_free) continue; - (*buckets_seen)++; + s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl); if (ob) break; } @@ -447,10 +432,7 @@ again: static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - u64 *buckets_seen, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; @@ -476,14 +458,10 @@ again: break; } - (*buckets_seen)++; + s->buckets_seen++; ob = try_alloc_bucket(trans, ca, reserve, - alloc_cursor, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - k, cl); + alloc_cursor, s, k, cl); if (ob) { iter.path->preserve = false; break; @@ -523,10 +501,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct open_bucket *ob = NULL; struct bch_dev_usage usage; u64 avail; - u64 buckets_seen = 0; - u64 skipped_open = 0; - u64 skipped_need_journal_commit = 0; - u64 skipped_nouse = 0; + struct bucket_alloc_state s = { 0 }; bool waiting = false; again: usage = bch2_dev_usage_read(ca); @@ -565,20 +540,10 @@ again: } ob = likely(ca->mi.freespace_initialized) - ? bch2_bucket_alloc_freelist(trans, ca, reserve, - &buckets_seen, - &skipped_open, - &skipped_need_journal_commit, - &skipped_nouse, - cl) - : bch2_bucket_alloc_early(trans, ca, reserve, - &buckets_seen, - &skipped_open, - &skipped_need_journal_commit, - &skipped_nouse, - cl); - - if (skipped_need_journal_commit * 2 > avail) + ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) + : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl); + + if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); err: if (!ob) @@ -593,10 +558,7 @@ err: avail, bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, + &s, cl == NULL, ""); else @@ -608,10 +570,7 @@ err: avail, bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, + &s, cl == NULL, bch2_err_str(PTR_ERR(ob))); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 5eed5ce67c57..2c96794d1993 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,6 +8,13 @@ #include "clock_types.h" #include "fifo.h" +struct bucket_alloc_state { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 skipped_nouse; +}; + struct ec_bucket_buf; #define BCH_ALLOC_RESERVES() \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 24d089507a21..b5f44c4e80d1 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -518,15 +518,12 @@ DECLARE_EVENT_CLASS(bucket_alloc, u64 avail, u64 copygc_wait_amount, s64 copygc_waiting_for, - u64 seen, - u64 open, - u64 need_journal_commit, - u64 nouse, + struct bucket_alloc_state *s, bool nonblocking, const char *err), TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, err), + s, nonblocking, err), TP_STRUCT__entry( __field(dev_t, dev ) @@ -554,10 +551,10 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->avail = avail; __entry->copygc_wait_amount = copygc_wait_amount; __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->nouse = nouse; + __entry->seen = s->buckets_seen; + __entry->open = s->skipped_open; + __entry->need_journal_commit = s->skipped_need_journal_commit; + __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; strlcpy(__entry->err, err, sizeof(__entry->err)); ), @@ -587,15 +584,12 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, u64 avail, u64 copygc_wait_amount, s64 copygc_waiting_for, - u64 seen, - u64 open, - u64 need_journal_commit, - u64 nouse, + struct bucket_alloc_state *s, bool nonblocking, const char *err), TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, err) + s, nonblocking, err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, @@ -606,15 +600,12 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, u64 avail, u64 copygc_wait_amount, s64 copygc_waiting_for, - u64 seen, - u64 open, - u64 need_journal_commit, - u64 nouse, + struct bucket_alloc_state *s, bool nonblocking, const char *err), TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, err) + s, nonblocking, err) ); TRACE_EVENT(discard_buckets, -- cgit From 685e0f0c477dfc2b2147a20137a349f25b0a1f62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Sep 2022 10:16:57 -0400 Subject: bcachefs: Fix a trans path overflow in bch2_btree_delete_range_trans() bch2_btree_delete_range_trans() was using btree_trans_too_many_iters() to avoid path overflow, but 
this was buggy here (and also btree_trans_too_many_iters() is suspect in general). btree_trans_too_many_iters() only returns true when we're close to the maximum number of paths - within 8 - but extent insert/delete assumes that it can use more paths than that. Instead, we need to call bch2_trans_begin() on every loop iteration. Since we don't want to call bch2_trans_begin() (restarting the outer transaction) if the call was a no-op - if we had no work to do - we have to structure things a bit oddly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 31b60864b6da..98030f22ee05 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1651,15 +1651,18 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); -retry: - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k) ?: - btree_trans_too_many_iters(trans)) && - bkey_cmp(iter.pos, end) < 0) { + while ((k = bch2_btree_iter_peek(&iter)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(iter.pos, end) >= 0) + break; + bkey_init(&delete.k); /* @@ -1688,23 +1691,27 @@ retry: ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) - break; + goto err; } ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; if (ret) break; } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - ret = 0; - goto retry; - } - bch2_trans_iter_exit(trans, &iter); if (!ret && trans_was_restarted(trans, restart_count)) -- cgit From 943f9946a6cc58e2c15ae39970547cddbe845190 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Oct 2022 23:54:46 -0400 Subject: bcachefs: Don't quash error in bch2_bucket_alloc_set_trans() We were incorrectly returning -BCH_ERR_insufficient_devices when we'd received a different error from bch2_bucket_alloc_trans(), which (erronously) turns into -EROFS further up the call chain. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2318d08ab70f..0a7657541b8c 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -671,7 +671,7 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, bch2_dev_alloc_list(c, stripe, devs_may_alloc); unsigned dev; struct bch_dev *ca; - int ret = 0; + int ret = -BCH_ERR_insufficient_devices; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -701,8 +701,8 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, bch2_dev_stripe_increment(ca, stripe); percpu_ref_put(&ca->ref); - ret = PTR_ERR_OR_ZERO(ob); - if (ret) { + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) break; continue; @@ -711,15 +711,12 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - if (*nr_effective >= nr_replicas) + if (*nr_effective >= nr_replicas) { + ret = 0; break; + } } - if (*nr_effective >= nr_replicas) - ret = 0; - else if (!ret) - ret = -BCH_ERR_insufficient_devices; - return ret; } -- cgit From 40a44873a5ca9843532344d12583e6a3a78ea848 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Oct 2022 01:41:08 -0400 Subject: bcachefs: Improve btree_deadlock debugfs output This changes bch2_check_for_deadlock() to print the longest chains it finds - when we have a deadlock because the cycle detector isn't finding something, this will let us see what it's missing. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 54 +++++++++++++++++++++++++++++++-------------- fs/bcachefs/debug.c | 17 +++++++++----- 2 files changed, 50 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 19062cea8774..b79543ae5eeb 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -71,11 +71,6 @@ struct lock_graph { unsigned nr; }; -static void lock_graph_pop(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; @@ -87,6 +82,18 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) bch2_btree_trans_to_text(out, i->trans); } +static noinline void print_chain(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i != g->g + g->nr; i++) { + if (i != g->g) + prt_str(out, "<- "); + prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + } + prt_newline(out); +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { int ret; @@ -134,6 +141,21 @@ static noinline int break_cycle(struct lock_graph *g) BUG(); } +static void lock_graph_pop(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above, + struct printbuf *cycle) +{ + if (g->nr > 1 && cycle) + print_chain(cycle, g); + + while (g->g + g->nr > above) + lock_graph_pop(g); +} + static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { @@ -142,11 +164,10 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, int ret = 0; for (i = g->g; i < g->g 
+ g->nr; i++) { - if (i->trans->locking != i->node_want) - while (g->g + g->nr >= i) { - lock_graph_pop(g); - return 0; - } + if (i->trans->locking != i->node_want) { + lock_graph_pop_above(g, i - 1, cycle); + return 0; + } if (i->trans == trans) { if (cycle) { @@ -185,20 +206,19 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, return 0; deadlock: - while (g->nr) - lock_graph_pop(g); + lock_graph_pop_above(g, g->g, cycle); return ret; } -static noinline void lock_graph_remove_non_waiters(struct lock_graph *g) +static noinline void lock_graph_remove_non_waiters(struct lock_graph *g, + struct printbuf *cycle) { struct trans_waiting_for_lock *i; for (i = g->g + 1; i < g->g + g->nr; i++) if (i->trans->locking != i->node_want || i->trans->locking_wait.start_time != i[-1].lock_start_time) { - while (g->g + g->nr >= i) - lock_graph_pop(g); + lock_graph_pop_above(g, i - 1, cycle); return; } BUG(); @@ -252,7 +272,7 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; if (unlikely(IS_ERR_OR_NULL(b))) { - lock_graph_remove_non_waiters(&g); + lock_graph_remove_non_waiters(&g, cycle); goto next; } @@ -286,6 +306,8 @@ next: } } + if (g.nr > 1 && cycle) + print_chain(cycle, &g); lock_graph_pop(&g); goto next; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 41b2772afef9..c7d558381388 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -725,11 +725,18 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, goto out; mutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) - if (bch2_check_for_deadlock(trans, &i->buf)) { - i->iter = 1; - break; - } + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + ret = flush_buf(i); + if (ret) + return ret; + + bch2_check_for_deadlock(trans, &i->buf); + + i->iter = trans->locking_wait.task->pid; + } mutex_unlock(&c->btree_trans_lock); out: if (i->buf.allocation_failure) -- cgit From 13bc41a7151a6af26107240fbdd2562d95adad44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Oct 2022 16:39:49 -0400 Subject: bcachefs: bch2_trans_locked() Useful debugging function. 
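As a rough illustration (not part of this patch), a helper like this is mainly useful in debug assertions at points where a transaction must not be holding any btree node locks; the function name below is hypothetical, only bch2_trans_locked() and closure_sync() are real:

    /* sketch only: a debug assertion at a point that must not hold node locks */
    static void example_do_blocking_work(struct btree_trans *trans, struct closure *cl)
    {
            BUG_ON(bch2_trans_locked(trans));   /* caller must have unlocked */
            closure_sync(cl);                   /* now safe to block */
    }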
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1be7c8883101..2e94cd2657e9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -207,6 +207,7 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +bool bch2_trans_locked(struct btree_trans *); static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b79543ae5eeb..ad6e364980f3 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -666,6 +666,16 @@ void bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } +bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->nodes_locked) + return true; + return false; +} + /* Debug */ #ifdef CONFIG_BCACHEFS_DEBUG -- cgit From 8aaee94d463f781fbd5377b7d96234342de9c6eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Oct 2022 16:41:17 -0400 Subject: bcachefs: Fix a deadlock in btree_update_nodes_written() btree_node_lock_nopath() is something we'd like to get rid of: it's always prone to deadlocks if we're accidentally holding other locks, because it doesn't mark the lock it's taking in a path. We'll want to get rid of it in the future, but for now this patch works around the problem by calling bch2_trans_unlock(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fd44492150b0..247555dffa3f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -650,6 +650,17 @@ err: * we're in journal error state: */ + /* + * Ensure transaction is unlocked before using + * btree_node_lock_nopath() (the use of which is always suspect, + * we need to work on removing this in the future) + * + * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so + * we may rarely end up with a locked path besides the one we + * have here: + */ + bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); path->l[b->c.level].lock_seq = b->c.lock.state.seq; -- cgit From 22f516213358379732d63367432d334157cbbe4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 00:54:36 -0400 Subject: bcachefs: Ensure fsck error is printed before panic When errors=panic, we want to make sure we print the error before calling bch2_inconsistent_error(). Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 762abdf2f283..2fb5102ee31d 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -104,7 +104,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
{ struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false; + bool print = true, suppressing = false, inconsistent = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; @@ -136,7 +136,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); - bch2_inconsistent_error(c); + inconsistent = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", fixing"); @@ -189,6 +189,9 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) printbuf_exit(&buf); + if (inconsistent) + bch2_inconsistent_error(c); + if (ret == -BCH_ERR_fsck_fix) { set_bit(BCH_FS_ERRORS_FIXED, &c->flags); } else { -- cgit From 6c22eb7085d3ee055a178ed0a4e8d0e5d18800f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 01:08:51 -0400 Subject: bcachefs: Fix "multiple types of data in same bucket" with ec Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f01b8171cb92..4b041707cd54 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -687,6 +687,10 @@ static int check_bucket_ref(struct bch_fs *c, if (bucket_data_type == BCH_DATA_cached) bucket_data_type = BCH_DATA_user; + if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || + (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) + bucket_data_type = ptr_data_type = BCH_DATA_stripe; + if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -- cgit From 2da671dc4a62da6de4ce0de529fc3e80f1f8f603 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 02:25:53 -0400 Subject: bcachefs: Use btree_type_has_ptrs() more consistently Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/sysfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 77a1fe81ac35..5d19029477cf 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1963,7 +1963,7 @@ int bch2_gc_gens(struct bch_fs *c) } for (i = 0; i < BTREE_ID_NR; i++) - if ((1 << i) & BTREE_ID_HAS_PTRS) { + if (btree_type_has_ptrs(i)) { struct btree_iter iter; struct bkey_s_c k; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 50b3ba92c5ae..a27ceabd5e49 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -298,7 +298,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); for (id = 0; id < BTREE_ID_NR; id++) { - if (!((1U << id) & BTREE_ID_HAS_PTRS)) + if (!btree_type_has_ptrs(id)) continue; for_each_btree_key(&trans, iter, id, POS_MIN, -- cgit From 160dff6dad43d9428b1250f927721a9a8756cfd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 02:30:50 -0400 Subject: bcachefs: Ratelimit ec error message We should fix this, but for now this makes this more usable. 
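For context, bch_err_ratelimited() used in this patch is bcachefs's ratelimited variant of bch_err(); conceptually it follows the standard kernel ratelimit pattern, sketched here under a hypothetical macro name (this is not the actual bcachefs definition):

    #include <linux/ratelimit.h>

    /* sketch of the usual kernel ratelimit pattern */
    #define example_err_ratelimited(c, fmt, ...)                            \
    do {                                                                    \
            static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,  \
                                          DEFAULT_RATELIMIT_BURST);         \
            if (__ratelimit(&_rs))                                          \
                    bch_err(c, fmt, ##__VA_ARGS__);                         \
    } while (0)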
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index aa8301146382..2dcca5c7fcec 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1386,10 +1386,8 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, int ret; idx = get_existing_stripe(c, h); - if (idx < 0) { - bch_err(c, "failed to find an existing stripe"); + if (idx < 0) return -BCH_ERR_ENOSPC_stripe_reuse; - } h->s->have_existing_stripe = true; ret = get_stripe_key(c, idx, &h->s->existing_stripe); @@ -1427,21 +1425,9 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, struct ec_stripe_head *h) { - int ret; - - ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); - - if (ret) { - /* - * This means we need to wait for copygc to - * empty out buckets from existing stripes: - */ - bch_err(c, "failed to reserve stripe"); - } - - return ret; + return bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); } struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, @@ -1483,8 +1469,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ret = __bch2_ec_stripe_head_reserve(c, h); if (ret && needs_stripe_new) ret = __bch2_ec_stripe_head_reuse(c, h); - if (ret) + if (ret) { + bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); goto err; + } if (!h->s->allocated) { ret = new_stripe_alloc_buckets(c, h, cl); -- cgit From 1be887979bc12a6c88b33b0d53dfdc369bfa9d49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 03:32:17 -0400 Subject: bcachefs: Handle dropping pointers in data_update path Cached pointers are generally dropped, not moved: this led to an assertion firing in the data update path when there were no new replicas being written. This patch adds a field to struct data_update_opts for pointers to be dropped, and tweaks move_extent() to check if we're only dropping pointers, not writing new ones, before kicking off a data update operation.
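A small worked example of the new bookkeeping (values hypothetical; k stands for the extent key being moved), using only identifiers added in this patch:

    /* the caller asked to rewrite pointer 1, but that pointer is cached */
    struct data_update_opts opts = { .rewrite_ptrs = BIT(1) };

    bch2_data_update_opts_normalize(k, &opts);
    /*
     * now opts.rewrite_ptrs == 0 and opts.kill_ptrs == BIT(1), so
     * move_extent() skips the full data update and just drops the
     * cached pointer via bch2_extent_drop_ptrs()
     */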
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 23 +++++++++++++++++-- fs/bcachefs/data_update.h | 2 ++ fs/bcachefs/move.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0b6f765bcad9..c606f075688f 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -331,8 +331,9 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - m->data_opts.rewrite_ptrs &= ~(1U << i); + if (((1U << i) & m->data_opts.rewrite_ptrs) && + p.ptr.cached) + BUG(); if (!((1U << i) & m->data_opts.rewrite_ptrs)) bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); @@ -368,5 +369,23 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.nr_replicas = m->op.nr_replicas_required = hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; + + BUG_ON(!m->op.nr_replicas); return 0; } + +void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { + opts->kill_ptrs |= 1U << i; + opts->rewrite_ptrs ^= 1U << i; + } + + i++; + } +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index ee38bd655af1..5d8690795959 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -10,6 +10,7 @@ struct moving_context; struct data_update_opts { unsigned rewrite_ptrs; + unsigned kill_ptrs; u16 target; u8 extra_replicas; unsigned btree_insert_flags; @@ -35,5 +36,6 @@ int bch2_data_update_init(struct bch_fs *, struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, enum btree_id, struct bkey_s_c); +void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); #endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0486c7e14c56..f00c57c8e7a3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -183,7 +183,52 @@ void bch_move_stats_init(struct bch_move_stats *stats, char *name) scnprintf(stats->name, sizeof(stats->name), "%s", name); } +static int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_update(trans, iter, n, 
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + static int bch2_move_extent(struct btree_trans *trans, + struct btree_iter *iter, struct moving_context *ctxt, struct bch_io_opts io_opts, enum btree_id btree_id, @@ -198,6 +243,15 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + bch2_data_update_opts_normalize(k, &data_opts); + + if (!data_opts.rewrite_ptrs && + !data_opts.extra_replicas) { + if (data_opts.kill_ptrs) + return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return 0; + } + if (!percpu_ref_tryget_live(&c->writes)) return -EROFS; @@ -429,7 +483,7 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, ctxt, io_opts, + ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, btree_id, k, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) -- cgit From 1148a97f1fb9b80ef5355021f0c2dfc7b8f003a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 04:29:04 -0400 Subject: bcachefs: Print cycle on unrecoverable deadlock Some lock operations can't fail; a cycle of nofail locks is impossible to recover from. So we want to get rid of these nofail locking operations, but as this is tricky it'll be done incrementally. If such a cycle happens, this patch prints out which codepaths are involved so we know what to work on next. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 24 +++++++++++++++++++++++- fs/bcachefs/debug.c | 22 +--------------------- fs/bcachefs/util.c | 20 ++++++++++++++++++++ fs/bcachefs/util.h | 1 + 4 files changed, 45 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index ad6e364980f3..3973e8d7e6da 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -138,7 +138,29 @@ static noinline int break_cycle(struct lock_graph *g) return abort_lock(g, i); } - BUG(); + { + struct bch_fs *c = g->g->trans->c; + struct printbuf buf = PRINTBUF; + + bch_err(c, "cycle of nofail locks"); + + for (i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + bch2_prt_backtrace(&buf, trans->locking_wait.task); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); + } } static void lock_graph_pop(struct lock_graph *g) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index c7d558381388..7abc707d2f38 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -501,26 +501,6 @@ static const struct file_operations cached_btree_nodes_ops = { }; #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS -static int prt_backtrace(struct printbuf *out, struct task_struct *task) -{ - unsigned long entries[32]; - unsigned i, nr_entries; - int ret; - - ret = down_read_killable(&task->signal->exec_update_lock); - if (ret) - return ret; - - nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); - for (i = 0; i < nr_entries; i++) { - prt_printf(out, "[<0>] %pB", (void *)entries[i]); - prt_newline(out); - } - - up_read(&task->signal->exec_update_lock); - return 0; -} - static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -547,7 +527,7 
@@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - prt_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_backtrace(&i->buf, trans->locking_wait.task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 477c260de50b..bf529bb137ed 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -265,6 +265,26 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } +int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) +{ + unsigned long entries[32]; + unsigned i, nr_entries; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; + + nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); + for (i = 0; i < nr_entries; i++) { + prt_printf(out, "[<0>] %pB", (void *)entries[i]); + prt_newline(out); + } + + up_read(&task->signal->exec_update_lock); + return 0; +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index a16f8bb9d415..3b0090faef4d 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -383,6 +383,7 @@ u64 bch2_read_flag_list(char *, const char * const[]); void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); void bch2_print_string_as_lines(const char *prefix, const char *lines); +int bch2_prt_backtrace(struct printbuf *, struct task_struct *); #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -- cgit From fe2de9a8dc8312a0992d91c1d63d93bf28574bcf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 04:55:02 -0400 Subject: bcachefs: Simplify break_cycle() We'd like to prioritize aborting transactions that have done less work - however, it appears breaking cycles by telling other threads to abort may still be buggy, so disable that for now. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 3973e8d7e6da..4940b3069a76 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -96,25 +96,26 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { - int ret; - if (i == g->g) { trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); - ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); } else { i->trans->lock_must_abort = true; - ret = 0; - } - - for (i = g->g + 1; i < g->g + g->nr; i++) wake_up_process(i->trans->locking_wait.task); - return ret; + return 0; + } } static noinline int break_cycle(struct lock_graph *g) { struct trans_waiting_for_lock *i; + /* + * We'd like to prioritize aborting transactions that have done less + * work - but it appears breaking cycles by telling other transactions + * to abort may still be buggy: + */ +#if 0 for (i = g->g; i < g->g + g->nr; i++) { if (i->trans->lock_may_not_fail || i->trans->locking_wait.lock_want == SIX_LOCK_write) @@ -130,7 +131,7 @@ static noinline int break_cycle(struct lock_graph *g) return abort_lock(g, i); } - +#endif for (i = g->g; i < g->g + g->nr; i++) { if (i->trans->lock_may_not_fail) continue; -- cgit From 8cbb0002509a605972781c0e747ae68112f94f54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Oct 2022 00:34:02 -0400 Subject: bcachefs: Write new btree nodes after parent update In order to avoid locking all btree nodes up to the root for btree node splits, we're going to have to introduce a new error path into bch2_btree_insert_node(); this means we can't have done any writes or modified global state before that point. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 15 +++++------- fs/bcachefs/btree_update_interior.c | 46 +++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 603b825ed6fe..a322a8367688 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1908,6 +1908,8 @@ do_write: u64s = bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); + set_needs_whiteout(i, false); /* do we have data to write?
*/ @@ -1917,6 +1919,10 @@ do_write: bytes_to_write = vstruct_end(i) - data; sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + if (!b->written && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); @@ -2005,11 +2011,6 @@ do_write: b->written += sectors_to_write; - if (wbio->wbio.first_btree_write && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(b->written); - if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); @@ -2022,10 +2023,6 @@ do_write: return; err: set_btree_node_noevict(b); - if (!b->written && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(sectors_to_write); b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 247555dffa3f..ac1e6d7286aa 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -392,8 +392,6 @@ static struct btree *__btree_root_alloc(struct btree_update *as, btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); - - bch2_btree_update_add_new_node(as, b); six_unlock_write(&b->c.lock); return b; @@ -860,6 +858,14 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree mutex_unlock(&c->btree_interior_update_lock); btree_update_add_key(as, &as->new_keys, b); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; + unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; + + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(sectors); + } } /* @@ -1192,7 +1198,6 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *old; trace_and_count(c, btree_node_set_root, c, b); - BUG_ON(!b->written); old = btree_node_root(c, b); @@ -1316,8 +1321,6 @@ static struct btree *__btree_split_node(struct btree_update *as, SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; - bch2_btree_update_add_new_node(as, n2); - set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1500,9 +1503,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_path_level_init(trans, path2, n2); bch2_btree_update_add_new_node(as, n1); - - bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); - bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + bch2_btree_update_add_new_node(as, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1525,9 +1526,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + bch2_btree_update_add_new_node(as, n3); - bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { trace_and_count(c, btree_node_compact, c, b); @@ -1542,8 +1543,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); - bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); - if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); } @@ -1560,11 +1559,16 @@ static void btree_split(struct btree_update *as, struct 
btree_trans *trans, bch2_btree_set_root(as, trans, path, n1); } - bch2_btree_update_get_open_buckets(as, n1); - if (n2) - bch2_btree_update_get_open_buckets(as, n2); - if (n3) + if (n3) { bch2_btree_update_get_open_buckets(as, n3); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + if (n2) { + bch2_btree_update_get_open_buckets(as, n2); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + } + bch2_btree_update_get_open_buckets(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1824,8 +1828,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); - bch2_btree_update_add_new_node(as, n); - n->data->format = new_f; btree_node_set_format(n, new_f); @@ -1835,13 +1837,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + bch2_btree_update_add_new_node(as, n); + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); bch2_btree_path_level_init(trans, new_path, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bkey_init(&delete.k); delete.k.p = prev->key.k.p; bch2_keylist_add(&as->parent_keys, &delete); @@ -1854,6 +1856,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, path, b); bch2_btree_node_free_inmem(trans, sib_path, m); @@ -1914,8 +1917,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, trace_and_count(c, btree_node_rewrite, c, b); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); bch2_btree_insert_node(as, trans, iter->path, parent, @@ -1925,6 +1926,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, } bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, iter->path, b); -- cgit From a8eefbd324cd40fab57ab8eef88347d4f745db93 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Oct 2022 22:15:30 -0400 Subject: bcachefs: Add error path to btree_split() The next patch in the series is (finally!) going to change btree splits (and interior updates in general) to not take intent locks all the way up to the root - instead only locking the nodes they'll need to modify. However, this will be introducing a race since if we're not holding a write lock on a btree node it can be written out by another thread, and then we might not have enough space for a new bset entry. We can handle this by retrying - we just need to introduce a new error path. 
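A simplified sketch of the shape of that error path (the actual patch routes this through btree_split()/bch2_btree_insert_node(); all identifiers below appear in this series):

    /* after taking the write lock, re-check that the insert still fits;
     * another thread may have written the node out in the meantime */
    if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
            bch2_btree_node_unlock_write(trans, path, b);
            return btree_trans_restart(trans,
                            BCH_ERR_transaction_restart_split_race);
    }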
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 105 +++++++++++++++++++++++++++++------- fs/bcachefs/errcode.h | 1 + 2 files changed, 88 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ac1e6d7286aa..b0a15757aaea 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -23,9 +23,9 @@ #include -static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, - struct keylist *, unsigned); +static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_path *, struct btree *, + struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, @@ -194,6 +194,43 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, } } +static void bch2_btree_node_free_never_used(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; + struct btree_path *path; + unsigned level = b->c.level; + + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); + + b->will_make_reachable = 0; + closure_put(&as->cl); + + clear_btree_node_will_make_reachable(b); + clear_btree_node_accessed(b); + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + + BUG_ON(p->nr >= ARRAY_SIZE(p->b)); + p->b[p->nr++] = b; + + six_unlock_intent(&b->c.lock); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, @@ -1462,15 +1499,16 @@ static void btree_split_insert_keys(struct btree_update *as, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, - struct keylist *keys, unsigned flags) +static int btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; struct btree_path *path1 = NULL, *path2 = NULL; u64 start_time = local_clock(); + int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); @@ -1551,7 +1589,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err; } else if (n3) { bch2_btree_set_root(as, trans, path, n3); } else { @@ -1589,7 +1629,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (n2) six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); - +out: if (path2) { __bch2_btree_path_unlock(trans, path2); 
bch2_path_put(trans, path2, true); @@ -1605,6 +1645,14 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, ? BCH_TIME_btree_node_split : BCH_TIME_btree_node_compact], start_time); + return ret; +err: + if (n3) + bch2_btree_node_free_never_used(as, trans, n3); + if (n2) + bch2_btree_node_free_never_used(as, trans, n2); + bch2_btree_node_free_never_used(as, trans, n1); + goto out; } static void @@ -1639,9 +1687,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. */ -static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, - struct keylist *keys, unsigned flags) +static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1654,6 +1702,9 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans * BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); + if (!(local_clock() & 63)) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + bch2_btree_node_lock_for_insert(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { @@ -1680,9 +1731,9 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans * bch2_btree_node_unlock_write(trans, path, b); btree_node_interior_verify(c, b); - return; + return 0; split: - btree_split(as, trans, path, b, keys, flags); + return btree_split(as, trans, path, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1699,7 +1750,12 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL, flags); + if (ret) { + bch2_btree_update_free(as, trans); + return ret; + } + bch2_btree_update_done(as, trans); for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) @@ -1851,7 +1907,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err_free_update; bch2_trans_verify_paths(trans); @@ -1877,6 +1935,10 @@ err: bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); return ret; +err_free_update: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; } /** @@ -1919,8 +1981,10 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); + if (ret) + goto err; } else { bch2_btree_set_root(as, trans, iter->path, n); } @@ -1934,10 +1998,15 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); - bch2_path_put(trans, new_path, true); out: + if (new_path) + bch2_path_put(trans, new_path, true); bch2_btree_path_downgrade(trans, iter->path); return ret; +err: + bch2_btree_node_free_never_used(as, trans, n); + 
bch2_btree_update_free(as, trans); + goto out; } struct async_btree_rewrite { diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index bf7ae99d9cce..fb1e1cd0f864 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -42,6 +42,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ -- cgit From 1ff7849f3b2478a4b4ec8abf77ce5e35acac70be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 05:04:38 -0400 Subject: bcachefs: bch2_btree_insert_node() no longer uses lock_write_nofail Now that we have an error path plumbed through, there's no need to be using bch2_btree_node_lock_write_nofail(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 7 ++++++- fs/bcachefs/btree_update_leaf.c | 14 +++----------- 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 89941fb8caa0..1c2e7b2b4ed5 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -8,8 +8,8 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, - struct btree *); +void bch2_btree_node_prep_for_write(struct btree_trans *, + struct btree_path *, struct btree *); bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b0a15757aaea..7619890d9df1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1695,6 +1695,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + int ret; lockdep_assert_held(&c->gc_lock); BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); @@ -1705,7 +1706,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t if (!(local_clock() & 63)) return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); - bch2_btree_node_lock_for_insert(trans, path, b); + ret = bch2_btree_node_lock_write(trans, path, &b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 98030f22ee05..cf4a7093f1e9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -56,9 +56,9 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, insert_l(&i[0])->b == insert_l(&i[1])->b; } -static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { struct bch_fs *c = trans->c; @@ -77,14 +77,6 @@ static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, 
bch2_btree_init_next(trans, b); } -void bch2_btree_node_lock_for_insert(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - bch2_btree_node_prep_for_write(trans, path, b); -} - /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ -- cgit From 969576ecaeb9b36250f0e099424713e95ca6d730 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 22:25:19 -0400 Subject: bcachefs: bch2_btree_iter_peek() now works with interior nodes Needed by the next patch, which will be iterating over keys in nodes at level 1. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7ea297249850..baf8ed40280c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1862,10 +1862,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); while (1) { + struct btree_path_level *l; + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT); @@ -1877,9 +1879,18 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } + l = path_l(iter->path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + btree_path_set_should_be_locked(iter->path); - k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + k = btree_path_level_peek_all(trans->c, l, &iter->k); if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && k.k && @@ -1899,7 +1910,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (next_update && bpos_cmp(next_update->k.p, - k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + k.k ? k.k->p : l->b->key.k.p) <= 0) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -1920,9 +1931,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (likely(k.k)) { break; - } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ - search_key = bpos_successor(iter->path->l[0].b->key.k.p); + search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); -- cgit From 1f0f731ffef13bde3b2cd5a439c886d94d2bb3cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Sep 2022 18:57:34 -0400 Subject: bcachefs: Btree splits now only take the locks they need Previously, bch2_btree_update_start() would always take all intent locks, all the way up to the root. We've finally got data from users where this became a scalability issue - so, this patch fixes bch2_btree_update_start() to only take the locks we need. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 42 ++++++++++++++++++++++--------------- fs/bcachefs/btree_update_interior.h | 1 + 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7619890d9df1..84a1cd0a0a4f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1070,23 +1070,23 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, nr_nodes[!!update_level] += 1 + split; update_level++; - if (!btree_path_node(path, update_level)) - break; + ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + if (ret) + return ERR_PTR(ret); - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - split = update_level + 1 < BTREE_MAX_DEPTH; - } + if (!btree_path_node(path, update_level)) { + /* Allocating new root? */ + nr_nodes[1] += split; + update_level = BTREE_MAX_DEPTH; + break; + } - /* Might have to allocate a new root: */ - if (update_level < BTREE_MAX_DEPTH) - nr_nodes[1] += 1; + if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + break; - ret = bch2_btree_path_upgrade(trans, path, U8_MAX); - if (ret) - return ERR_PTR(ret); + split = true; + } if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); @@ -1108,6 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); as->btree_id = path->btree_id; + as->update_level = update_level; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1511,7 +1512,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); @@ -1698,7 +1699,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int ret; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); @@ -1738,6 +1739,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t btree_node_interior_verify(c, b); return 0; split: + /* + * We could attempt to avoid the transaction restart, by calling + * bch2_btree_path_upgrade() and allocating more nodes: + */ + if (b->c.level >= as->update_level) + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + return btree_split(as, trans, path, b, keys, flags); } @@ -1763,7 +1771,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) + for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7af810df8348..dabe81596544 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -52,6 +52,7 @@ struct btree_update 
{ unsigned took_gc_lock:1; enum btree_id btree_id; + unsigned update_level; struct disk_reservation disk_res; struct journal_preres journal_preres; -- cgit From 5b3243cb528f96e5d90d65f56f96ad179c666ff5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Oct 2022 04:49:23 -0400 Subject: bcachefs: Fix cached data accounting Negating without casting to a signed integer means the value wasn't getting sign extended properly - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4b041707cd54..5cb4a00166f9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -578,7 +578,7 @@ int bch2_mark_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_a.cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -old_a.cached_sectors, + -((s64) old_a.cached_sectors), journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); -- cgit From 7dcbdbd85cc3af14c0b9b5b80eb87cca8a322285 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Oct 2022 06:37:56 -0400 Subject: bcachefs: bch2_path_put_nokeep() The btree iterator code may allocate extra btree paths, temporarily, that do not refer to keys being returned: we don't need to wait until transaction restart to drop these, when they're not referenced they should be deleted right away. This fixes a transaction path overflow bug. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index baf8ed40280c..283764225d13 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1299,6 +1299,18 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte __bch2_path_free(trans, path); } +static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, + bool intent) +{ + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + __bch2_path_free(trans, path); +} + noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { @@ -1962,8 +1974,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); if (iter->update_path) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -1994,8 +2006,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (iter->update_path && bkey_cmp(iter->update_path->pos, k.k->p)) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -2237,7 +2249,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) * that candidate */ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { - bch2_path_put(trans, iter->path, + bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; saved_path = NULL; @@ -2250,7 +2262,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->snapshot, k.k->p.snapshot)) { if 
(saved_path) - bch2_path_put(trans, saved_path, + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); @@ -2294,7 +2306,7 @@ got_key: btree_path_set_should_be_locked(iter->path); out_no_locked: if (saved_path) - bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2584,7 +2596,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_INTENT); if (iter->update_path) - bch2_path_put(trans, iter->update_path, + bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, -- cgit From f42238b5cde2f1624b2be5f64c813e6127a8012a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 07:58:50 -0400 Subject: bcachefs: Fix a rare path in bch2_btree_path_peek_slot() In the drop_alloc tests, we may end up calling bch2_btree_iter_peek_slot() on an interior level that doesn't exist. Previously, this would hit the path->uptodate assertion in bch2_btree_path_peek_slot(); this path first checks a NULL btree node, which is how we know we're at the end of the btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 283764225d13..0bb156e6152a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1544,14 +1544,17 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k; struct bkey_s_c k; - if (!path->cached) { - struct btree_path_level *l = path_l(path); - struct bkey_packed *_k; + if (unlikely(!l->b)) + return bkey_s_c_null; - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + if (!path->cached) { _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; @@ -1566,7 +1569,6 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct (path->btree_id != ck->key.btree_id || bkey_cmp(path->pos, ck->key.pos))); EBUG_ON(!ck || !ck->valid); - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -2381,6 +2383,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } k = bch2_btree_path_peek_slot(iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; } else { struct bpos next; @@ -2412,7 +2416,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } if (unlikely(bkey_err(k))) - return k; + goto out_no_locked; next = k.k ? bkey_start_pos(k.k) : POS_MAX; -- cgit From e8540e56812360d4253b6a30e46452ce7448b24b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Oct 2022 04:32:14 -0400 Subject: bcachefs: Reflink now respects quotas This adds a new helper, quota_reserve_range(), which takes a quota reservation for unallocated blocks in a given file range, and uses it in bch2_remap_file_range(). 
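The reservation lifecycle in the remap path, per the diff below, is: reserve the worst case for the destination range up front, account the actual sector delta against that reservation, then release whatever is left. Roughly (start/end in sectors, error handling elided):

    struct quota_res quota_res = { 0 };

    ret = quota_reserve_range(dst, &quota_res, start, end);  /* worst case */
    if (!ret) {
            /* perform the remap, computing i_sectors_delta */
            i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
    }
    bch2_quota_reservation_put(c, dst, &quota_res);          /* drop remainder */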
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 3cb542f0d8c7..f5517c31f120 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -150,7 +150,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c, static int bch2_quota_reservation_add(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *res, - unsigned sectors, + u64 sectors, bool check_enospc) { int ret; @@ -3132,6 +3132,55 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return bch2_err_class(ret); } +static int quota_reserve_range(struct bch_inode_info *inode, + struct quota_res *res, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot; + u64 sectors = end - start; + u64 pos = start; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, pos, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && + !(ret = bkey_err(k))) { + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } + bch2_btree_iter_advance(&iter); + } + pos = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + + if (ret) + return ret; + + return bch2_quota_reservation_add(c, inode, res, sectors, true); +} + loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3139,6 +3188,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *src = file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; + struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; u64 aligned_len; loff_t ret = 0; @@ -3159,8 +3209,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); - file_update_time(file_dst); - inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -3177,6 +3225,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; + ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, + (pos_dst + aligned_len) >> 9); + if (ret) + goto err; + + file_update_time(file_dst); + mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); @@ -3193,8 +3248,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, */ ret = min((u64) ret << 9, (u64) len); - /* XXX get a quota reservation */ - i_sectors_acct(c, dst, NULL, i_sectors_delta); + i_sectors_acct(c, dst, "a_res, i_sectors_delta); spin_lock(&dst->v.i_lock); if (pos_dst + ret > dst->v.i_size) @@ -3205,6 +3259,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, IS_SYNC(file_inode(file_dst))) ret = bch2_flush_inode(c, inode_inum(dst)); err: + bch2_quota_reservation_put(c, dst, "a_res); bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); 
return bch2_err_class(ret); -- cgit From de107dc8008fe559ac39e89cabc4d21d0129684e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 11:04:28 -0400 Subject: bcachefs: Call bch2_btree_update_add_new_node() before dropping write lock btree nodes can be written by other threads (shrinker, journal reclaim) with only a read lock, but brand new nodes should only be written by the thread doing the split/interior update. bch2_btree_update_add_new_node() sets btree node flags to indicate that this is a new node and should not be written out by other threads, thus we need to call it before dropping our write lock. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 84a1cd0a0a4f..9680d83f9036 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -429,7 +429,6 @@ static struct btree *__btree_root_alloc(struct btree_update *as, btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); - six_unlock_write(&b->c.lock); return b; } @@ -1528,6 +1527,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); + + bch2_btree_update_add_new_node(as, n1); + bch2_btree_update_add_new_node(as, n2); six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); @@ -1541,9 +1543,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); bch2_btree_path_level_init(trans, path2, n2); - bch2_btree_update_add_new_node(as, n1); - bch2_btree_update_add_new_node(as, n2); - /* * Note that on recursive parent_keys == keys, so we * can't start adding new keys to parent_keys before emptying it @@ -1556,6 +1555,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, /* Depth increases, make a new root */ n3 = __btree_root_alloc(as, trans, b->c.level + 1); + bch2_btree_update_add_new_node(as, n3); + six_unlock_write(&n3->c.lock); + path2->locks_want++; BUG_ON(btree_node_locked(path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); @@ -1565,14 +1567,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - bch2_btree_update_add_new_node(as, n3); - btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { trace_and_count(c, btree_node_compact, c, b); bch2_btree_build_aux_trees(n1); + bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); @@ -1580,8 +1581,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); bch2_btree_path_level_init(trans, path1, n1); - bch2_btree_update_add_new_node(as, n1); - if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); } @@ -1904,9 +1903,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_sort_into(c, n, next); bch2_btree_build_aux_trees(n); - six_unlock_write(&n->c.lock); - bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); @@ -1980,9 +1978,9 @@ int 
bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); n = bch2_btree_node_alloc_replacement(as, trans, b); - bch2_btree_update_add_new_node(as, n); bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); -- cgit From f866870f5dbe9c9fb745f5a24bb30b6477ec619a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 14:47:58 -0400 Subject: bcachefs: Initialize sb_quota with default 1 week timer For compliance with other quota implementations, we should be initializing quota information with a default 1 week timelimit: this fixes fstests generic/235. Also, this adds to_text() functions for some quota structs - useful debugging aids. Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 166 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index c12d715fb758..ad7130a14691 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -95,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, #include #include +static void qc_info_to_text(struct printbuf *out, struct qc_info *i) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "i_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", i->i_fieldmask); + prt_newline(out); + + prt_str(out, "i_flags"); + prt_tab(out); + prt_printf(out, "%u", i->i_flags); + prt_newline(out); + + prt_str(out, "i_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_ino_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_timelimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_warnlimit); + prt_newline(out); + + prt_str(out, "i_ino_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_warnlimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_warnlimit); + prt_newline(out); +} + +static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "d_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", q->d_fieldmask); + prt_newline(out); + + prt_str(out, "d_spc_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_hardlimit); + prt_newline(out); + + prt_str(out, "d_spc_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_softlimit); + prt_newline(out); + + prt_str(out, "d_ino_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_hardlimit); + prt_newline(out); + + prt_str(out, "d_ino_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_softlimit); + prt_newline(out); + + prt_str(out, "d_space"); + prt_tab(out); + prt_printf(out, "%llu", q->d_space); + prt_newline(out); + + prt_str(out, "d_ino_count"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_count); + prt_newline(out); + + prt_str(out, "d_ino_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_timer); + prt_newline(out); + + prt_str(out, "d_spc_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_timer); + prt_newline(out); + + prt_str(out, "d_ino_warns"); + prt_tab(out); + 
prt_printf(out, "%i", q->d_ino_warns); + prt_newline(out); + + prt_str(out, "d_spc_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_spc_warns); + prt_newline(out); +} + static inline unsigned __next_qtype(unsigned i, unsigned qtypes) { qtypes >>= i; @@ -413,6 +520,26 @@ void bch2_fs_quota_init(struct bch_fs *c) mutex_init(&c->quotas[i].lock); } +static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) +{ + struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + + if (sb_quota) + return sb_quota; + + sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + if (sb_quota) { + unsigned qtype, qc; + + for (qtype = 0; qtype < QTYP_NR; qtype++) + for (qc = 0; qc < Q_COUNTERS; qc++) + sb_quota->q[qtype].c[qc].timelimit = + cpu_to_le32(7 * 24 * 60 * 60); + } + + return sb_quota; +} + static void bch2_sb_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; @@ -471,12 +598,19 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { + struct bch_sb_field_quota *sb_quota; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + mutex_unlock(&c->sb_lock); + return -BCH_ERR_ENOSPC_sb_quota; + } + bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); @@ -500,6 +634,8 @@ int bch2_fs_quota_read(struct bch_fs *c) static int bch2_quota_enable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -519,6 +655,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) return -EINVAL; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + if (uflags & FS_QUOTA_UDQ_ENFD) SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); @@ -529,9 +671,10 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -643,6 +786,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type, struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; struct bch_memquota_type *q; + int ret = 0; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -660,12 +812,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, q = &c->quotas[type]; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - sb_quota = bch2_sb_resize_quota(&c->disk_sb, - sizeof(*sb_quota) / sizeof(u64)); - if (!sb_quota) - return -BCH_ERR_ENOSPC_sb_quota; + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; } if (info->i_fieldmask & QC_SPC_TIMER) @@ -687,9 +837,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, bch2_sb_quota_read(c); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } /* Get/set individual quotas: */ @@ -794,6 +945,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct bkey_i_quota new_quota; int ret; + if (0) { + struct printbuf buf = PRINTBUF; + + 
qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + if (sb->s_flags & SB_RDONLY) return -EROFS; -- cgit From bf9cb250edf776454d0600b4341c6667974bedb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 16:12:27 -0400 Subject: bcachefs: Don't allow hardlinks when inherited attrs would change This is the right thing to do, and conforms with our own behaviour on rename and xfs's behaviour on hardlink. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 53ffc684223c..e9dd1d13ec7e 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -212,6 +212,11 @@ int bch2_link_trans(struct btree_trans *trans, if (ret) goto err; + if (bch2_reinherit_attrs(inode_u, dir_u)) { + ret = -EXDEV; + goto err; + } + dir_u->bi_mtime = dir_u->bi_ctime = now; dir_hash = bch2_hash_info_init(c, dir_u); -- cgit From 65ff2d3a7abb9f1cc41dc824a9fc374a2c81eaf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 16:21:08 -0400 Subject: bcachefs: Support FS_XFLAG_PROJINHERIT We already have support for the flag's semantics: inode options are inherited by children if they were explicitly set on the parent. This patch just maps the FS_XFLAG_PROJINHERIT flag to the "this option was epxlicitly set" bit. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3df2f5f3d1ea..b51053130f28 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -26,6 +26,9 @@ struct flags_set { unsigned flags; unsigned projid; + + bool set_projinherit; + bool projinherit; }; static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; + if (s->set_projinherit) { + bi->bi_fields_set &= ~(1 << Inode_opt_project); + bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); + } + bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; @@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, struct fsxattr fa = { 0 }; fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; return copy_to_user(arg, &fa, sizeof(fa)); @@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; + s.set_projinherit = true; + s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; + fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); if (fa.fsx_xflags) return -EOPNOTSUPP; -- cgit From 896f1b316f8e8f51f83095ab4b0e319471d93803 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 18:17:49 -0400 Subject: bcachefs: Fix lock_graph_remove_non_waiters() We were removing 1 more entry than we were supposed to - oops. Also some other simplifications and cleanups, and bring back the abort preference code in a better fashion. 
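For illustration, here is a condensed, commented sketch of the restored victim-selection logic (it mirrors the code added in the diff below; the comments are editorial, not from the patch):

    /* Higher value = better candidate to abort when a lock cycle is found: */
    static int abort_preference_sketch(struct btree_trans *trans)
    {
            if (trans->lock_may_not_fail)
                    return 0;       /* must never be aborted */
            if (trans->locking_wait.lock_want == SIX_LOCK_write)
                    return 1;       /* wants a write lock - costly to restart */
            if (!trans->in_traverse_all)
                    return 2;
            return 3;               /* in traverse_all - cheapest to restart */
    }

break_cycle() scores every transaction in the lock graph with this preference and aborts the highest-scoring one; if every participant scores 0 the cycle cannot legally be broken, which is treated as a bug.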
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 172 ++++++++++++++++++++------------------------ 1 file changed, 76 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 4940b3069a76..922cfc7f5450 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -94,6 +94,37 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) prt_newline(out); } +static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; +} + +static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr > i) + lock_graph_up(g); + return true; + } + + return false; +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { @@ -106,40 +137,42 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) } } -static noinline int break_cycle(struct lock_graph *g) +static int btree_trans_abort_preference(struct btree_trans *trans) { - struct trans_waiting_for_lock *i; - - /* - * We'd like to prioritize aborting transactions that have done less - * work - but it appears breaking cycles by telling other transactions - * to abort may still be buggy: - */ -#if 0 - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail || - i->trans->locking_wait.lock_want == SIX_LOCK_write) - continue; + if (trans->lock_may_not_fail) + return 0; + if (trans->locking_wait.lock_want == SIX_LOCK_write) + return 1; + if (!trans->in_traverse_all) + return 2; + return 3; +} - return abort_lock(g, i); - } +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +{ + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail || - !i->trans->in_traverse_all) - continue; + if (lock_graph_remove_non_waiters(g)) + return 0; - return abort_lock(g, i); + /* Only checking, for debugfs: */ + if (cycle) { + print_cycle(cycle, g); + ret = -1; + goto out; } -#endif - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail) - continue; - return abort_lock(g, i); + for (i = g->g; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; + best = pref; + } } - { + if (unlikely(!best)) { struct bch_fs *c = g->g->trans->c; struct printbuf buf = PRINTBUF; @@ -162,21 +195,13 @@ static noinline int break_cycle(struct lock_graph *g) printbuf_exit(&buf); BUG(); } -} - -static void lock_graph_pop(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - -static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above, - struct printbuf *cycle) -{ - if (g->nr > 1 && cycle) - print_chain(cycle, g); - while (g->g + g->nr > above) - lock_graph_pop(g); + ret = abort_lock(g, abort); +out: + if (ret) + while (g->nr) + lock_graph_up(g); + return ret; } static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, @@ 
-184,67 +209,23 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, { struct btree_trans *orig_trans = g->g->trans; struct trans_waiting_for_lock *i; - int ret = 0; - - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->locking != i->node_want) { - lock_graph_pop_above(g, i - 1, cycle); - return 0; - } - - if (i->trans == trans) { - if (cycle) { - /* Only checking: */ - print_cycle(cycle, g); - ret = -1; - } else { - ret = break_cycle(g); - } - if (ret) - goto deadlock; - /* - * If we didn't abort (instead telling another - * transaction to abort), keep checking: - */ - } - } + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) + return break_cycle(g, cycle); if (g->nr == ARRAY_SIZE(g->g)) { if (orig_trans->lock_may_not_fail) return 0; + while (g->nr) + lock_graph_up(g); trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); - ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); - goto deadlock; + return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } - closure_get(&trans->ref); - - g->g[g->nr++] = (struct trans_waiting_for_lock) { - .trans = trans, - .node_want = trans->locking, - .lock_want = trans->locking_wait.lock_want, - }; - + lock_graph_down(g, trans); return 0; -deadlock: - lock_graph_pop_above(g, g->g, cycle); - return ret; -} - -static noinline void lock_graph_remove_non_waiters(struct lock_graph *g, - struct printbuf *cycle) -{ - struct trans_waiting_for_lock *i; - - for (i = g->g + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { - lock_graph_pop_above(g, i - 1, cycle); - return; - } - BUG(); } static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) @@ -266,8 +247,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) } g.nr = 0; - ret = lock_graph_descend(&g, trans, cycle); - BUG_ON(ret); + lock_graph_down(&g, trans); next: if (!g.nr) return 0; @@ -295,7 +275,7 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; if (unlikely(IS_ERR_OR_NULL(b))) { - lock_graph_remove_non_waiters(&g, cycle); + BUG_ON(!lock_graph_remove_non_waiters(&g)); goto next; } @@ -321,7 +301,7 @@ next: raw_spin_unlock(&b->lock.wait_lock); if (ret) - return ret < 0 ? 
ret : 0; + return ret; goto next; } @@ -331,7 +311,7 @@ next: if (g.nr > 1 && cycle) print_chain(cycle, &g); - lock_graph_pop(&g); + lock_graph_up(&g); goto next; } -- cgit From 80df5b8cacceb25962621ccf4cf555413bdfbdbb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jan 2023 15:35:07 -0500 Subject: fixup bcachefs: Deadlock cycle detector Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 922cfc7f5450..76a532f98c72 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -99,6 +99,12 @@ static void lock_graph_up(struct lock_graph *g) closure_put(&g->g[--g->nr].trans->ref); } +static noinline void lock_graph_pop_all(struct lock_graph *g) +{ + while (g->nr) + lock_graph_up(g); +} + static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { closure_get(&trans->ref); @@ -274,8 +280,26 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; - if (unlikely(IS_ERR_OR_NULL(b))) { - BUG_ON(!lock_graph_remove_non_waiters(&g)); + if (IS_ERR_OR_NULL(b)) { + /* + * If we get here, it means we raced with the + * other thread updating its btree_path + * structures - which means it can't be blocked + * waiting on a lock: + */ + if (!lock_graph_remove_non_waiters(&g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be + * because we're being called by debugfs + * checking for lock cycles, which + * invokes us on btree_transactions that + * aren't actually waiting on anything. + * Just bail out: + */ + lock_graph_pop_all(&g); + } + goto next; } -- cgit From 40405557b92dfe9cd581f914a5fa5f2c0e82d797 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jan 2023 15:35:07 -0500 Subject: fixup bcachefs: Deadlock cycle detector Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 76a532f98c72..5e9424fbc3be 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -226,6 +226,10 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, while (g->nr) lock_graph_up(g); + + if (cycle) + return 0; + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } @@ -248,6 +252,9 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) int ret; if (trans->lock_must_abort) { + if (cycle) + return -1; + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } -- cgit From 3a4d3656e5620df8323448c7d33f4b1401ff14ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Oct 2022 00:24:17 -0400 Subject: bcachefs: Fix bch2_write_begin() An error case was jumping to the wrong label, creating an infinite loop - oops. This fixes fstests generic/648. 
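In miniature, the control-flow bug looked like this (a hand-wavy sketch with made-up helper names, not the real fs-io.c code):

    static int write_begin_sketch(struct page *page)
    {
            int ret = 0;
    out:    /* reached from several earlier paths in the real function */
            if (!page_ready(page)) {
                    ret = make_page_ready(page);
                    if (ret)
                            goto err;       /* was "goto out", which re-entered this
                                             * block forever while the call kept failing */
            }
            return ret;
    err:
            unlock_page(page);
            put_page(page);
            return ret;
    }

Jumping to the unwind label lets the error propagate instead of spinning on the same failing call.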
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f5517c31f120..cca764cdb4f4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1517,7 +1517,7 @@ out: if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); if (ret) - goto out; + goto err; } ret = bch2_page_reservation_get(c, inode, page, res, -- cgit From 07bfcc0b4c92a569b7f613a3202f45c89f983b4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Oct 2022 00:44:34 -0400 Subject: bcachefs: Fix for not dropping privs in fallocate When modifying a file, we may be required to drop the suid/sgid bits - we were missing a file_modified() call to do this. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index cca764cdb4f4..9a0751a140e4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3113,6 +3113,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); + ret = file_modified(file); + if (ret) + goto err; + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -3123,8 +3127,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, ret = bchfs_fcollapse_finsert(inode, offset, len, false); else ret = -EOPNOTSUPP; - - +err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); percpu_ref_put(&c->writes); -- cgit From 92095781e0f607e735971c1a6462ca6dad8826d2 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Sat, 6 Aug 2022 14:48:49 +1200 Subject: bcachefs: Mean and variance This module provides a fast 64bit implementation of basic statistics functions, including mean, variance and standard deviation in both weighted and unweighted variants, the unweighted variant has a 32bit limitation per sample to prevent overflow when squaring. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 9 ++ fs/bcachefs/Makefile | 2 + fs/bcachefs/mean_and_variance.c | 159 ++++++++++++++++++++++++++++ fs/bcachefs/mean_and_variance.h | 199 +++++++++++++++++++++++++++++++++++ fs/bcachefs/mean_and_variance_test.c | 153 +++++++++++++++++++++++++++ 5 files changed, 522 insertions(+) create mode 100644 fs/bcachefs/mean_and_variance.c create mode 100644 fs/bcachefs/mean_and_variance.h create mode 100644 fs/bcachefs/mean_and_variance_test.c (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 76953e05b240..f8e208826997 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -71,3 +71,12 @@ config BCACHEFS_NO_LATENCY_ACCT depends on BCACHEFS_FS help This disables device latency tracking and time stats, only for performance testing + +config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT + select MEAN_AND_VARIANCE + default KUNIT_ALL_TESTS + help + This option enables the kunit tests for mean_and_variance module. + If unsure, say N. 
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index e23667548e09..444e79c62b50 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -46,6 +46,7 @@ bcachefs-y := \ journal_seq_blacklist.o \ keylist.o \ lru.o \ + mean_and_variance.o \ migrate.o \ move.o \ movinggc.o \ @@ -69,3 +70,4 @@ bcachefs-y := \ xattr.o bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c new file mode 100644 index 000000000000..1f0801e2e565 --- /dev/null +++ b/fs/bcachefs/mean_and_variance.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions for incremental mean and variance. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Copyright © 2022 Daniel B. Hill + * + * Author: Daniel B. Hill + * + * Description: + * + * This is includes some incremental algorithms for mean and variance calculation + * + * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + * + * Create a struct and if it's the weighted variant set the w field (weight = 2^k). + * + * Use mean_and_variance[_weighted]_update() on the struct to update it's state. + * + * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation + * is deferred to these functions for performance reasons. + * + * see lib/math/mean_and_variance_test.c for examples of usage. + * + * DO NOT access the mean and variance fields of the weighted variants directly. + * DO NOT change the weight after calling update. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "mean_and_variance.h" + +u128_u u128_div(u128_u n, u64 d) +{ + u128_u r; + u64 rem; + u64 hi = u128_hi(n); + u64 lo = u128_lo(n); + u64 h = hi & ((u64) U32_MAX << 32); + u64 l = (hi & (u64) U32_MAX) << 32; + + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; +} +EXPORT_SYMBOL_GPL(u128_div); + +/** + * mean_and_variance_get_mean() - get mean from @s + */ +s64 mean_and_variance_get_mean(struct mean_and_variance s) +{ + return s.n ? div64_u64(s.sum, s.n) : 0; +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); + +/** + * mean_and_variance_get_variance() - get variance from @s1 + * + * see linked pdf equation 12. 
+ */ +u64 mean_and_variance_get_variance(struct mean_and_variance s1) +{ + if (s1.n) { + u128_u s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); + + return u128_lo(u128_sub(s2, u128_square(s3))); + } else { + return 0; + } +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); + +/** + * mean_and_variance_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_get_stddev(struct mean_and_variance s) +{ + return int_sqrt64(mean_and_variance_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); + +/** + * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() + * @s1: .. + * @s2: .. + * + * see linked pdf: function derived from equations 140-143 where alpha = 2^w. + * values are stored bitshifted for performance and added precision. + */ +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +{ + // previous weighted variance. + u8 w = s->weight; + u64 var_w0 = s->variance; + // new value weighted. + s64 x_w = x << w; + s64 diff_w = x_w - s->mean; + s64 diff = fast_divpow2(diff_w, w); + // new mean weighted. + s64 u_w1 = s->mean + diff; + + if (!s->init) { + s->mean = x_w; + s->variance = 0; + } else { + s->mean = u_w1; + s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + } + s->init = true; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); + +/** + * mean_and_variance_weighted_get_mean() - get mean from @s + */ +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +{ + return fast_divpow2(s.mean, s.weight); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); + +/** + * mean_and_variance_weighted_get_variance() -- get variance from @s + */ +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +{ + // always positive don't need fast divpow2 + return s.variance >> s.weight; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); + +/** + * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +{ + return int_sqrt64(mean_and_variance_weighted_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); + +MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h new file mode 100644 index 000000000000..880e9501c614 --- /dev/null +++ b/fs/bcachefs/mean_and_variance.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MEAN_AND_VARIANCE_H_ +#define MEAN_AND_VARIANCE_H_ + +#include +#include +#include + +#define SQRT_U64_MAX 4294967295ULL + +/* + * u128_u: u128 user mode, because not all architectures support a real int128 + * type + */ + +#ifdef __SIZEOF_INT128__ + +typedef struct { + unsigned __int128 v; +} __aligned(16) u128_u; + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .v = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.v; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.v >> 64; +} + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + a.v += b.v; + return a; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + a.v -= b.v; + return a; +} + +static inline u128_u u128_shl(u128_u a, s8 shift) +{ + a.v <<= shift; + return a; +} + +static inline u128_u u128_square(u64 a) +{ + u128_u b = u64_to_u128(a); + + b.v *= b.v; + return b; +} + +#else + +typedef struct { + u64 hi, lo; +} __aligned(16) u128_u; + +/* conversions */ + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .lo = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.lo; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.hi; +} + +/* arithmetic */ + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo + b.lo; + c.hi = a.hi + b.hi + (c.lo < a.lo); + return c; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo - b.lo; + c.hi = a.hi - b.hi - (c.lo > a.lo); + return c; +} + +static inline u128_u u128_shl(u128_u i, s8 shift) +{ + u128_u r; + + r.lo = i.lo << shift; + if (shift < 64) + r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); + else { + r.hi = i.lo << (shift - 64); + r.lo = 0; + } + return r; +} + +static inline u128_u u128_square(u64 i) +{ + u128_u r; + u64 h = i >> 32, l = i & U32_MAX; + + r = u128_shl(u64_to_u128(h*h), 64); + r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); + r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); + r = u128_add(r, u64_to_u128(l*l)); + return r; +} + +#endif + +static inline u128_u u64s_to_u128(u64 hi, u64 lo) +{ + u128_u c = u64_to_u128(hi); + + c = u128_shl(c, 64); + c = u128_add(c, u64_to_u128(lo)); + return c; +} + +u128_u u128_div(u128_u n, u64 d); + +struct mean_and_variance { + s64 n; + s64 sum; + u128_u sum_squares; +}; + +/* expontentially weighted variant */ +struct mean_and_variance_weighted { + bool init; + u8 weight; /* base 2 logarithim */ + s64 mean; + u64 variance; +}; + +/** + * fast_divpow2() - fast approximation for n / (1 << d) + * @n: numerator + * @d: the power of 2 denominator. + * + * note: this rounds towards 0. + */ +static inline s64 fast_divpow2(s64 n, u8 d) +{ + return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; +} + +/** + * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 + * and return it. + * @s1: the mean_and_variance to update. + * @v1: the new sample. + * + * see linked pdf equation 12. 
+ */ +static inline struct mean_and_variance +mean_and_variance_update(struct mean_and_variance s, s64 v) +{ + return (struct mean_and_variance) { + .n = s.n + 1, + .sum = s.sum + v, + .sum_squares = u128_add(s.sum_squares, u128_square(abs(v))), + }; +} + +s64 mean_and_variance_get_mean(struct mean_and_variance s); +u64 mean_and_variance_get_variance(struct mean_and_variance s1); +u32 mean_and_variance_get_stddev(struct mean_and_variance s); + +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); + +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c new file mode 100644 index 000000000000..2b4cf9b1781b --- /dev/null +++ b/fs/bcachefs/mean_and_variance_test.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "mean_and_variance.h" + +#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) + +static void mean_and_variance_basic_test(struct kunit *test) +{ + struct mean_and_variance s = {}; + + s = mean_and_variance_update(s, 2); + s = mean_and_variance_update(s, 2); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); + KUNIT_EXPECT_EQ(test, s.n, 2); + + s = mean_and_variance_update(s, 4); + s = mean_and_variance_update(s, 4); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); + KUNIT_EXPECT_EQ(test, s.n, 4); +} + +/* + * Test values computed using a spreadsheet from the psuedocode at the bottom: + * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + */ + +static void mean_and_variance_weighted_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 2 }; + + s.weight = 2; + + mean_and_variance_weighted_update(&s, 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, 20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, 30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + + s = (struct mean_and_variance_weighted) { .weight = 2 }; + + mean_and_variance_weighted_update(&s, -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, -20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, -30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + +} + +static void mean_and_variance_weighted_advanced_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 8 }; + s64 i; + + for (i = 10; i <= 100; i += 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); + 
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + + s = (struct mean_and_variance_weighted) { .weight = 8 }; + + for (i = -10; i >= -100; i -= 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + +} + +static void mean_and_variance_fast_divpow2(struct kunit *test) +{ + s64 i; + u8 d; + + for (i = 0; i < 100; i++) { + d = 0; + KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); + KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); + for (d = 1; d < 32; d++) { + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), + div_u64(i, 1 << d), "%lld %u", i, d); + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), + div_u64(i, 1 << d), "%lld %u", -i, d); + } + } +} + +static void mean_and_variance_u128_basic_test(struct kunit *test) +{ + u128_u a = u64s_to_u128(0, U64_MAX); + u128_u a1 = u64s_to_u128(0, 1); + u128_u b = u64s_to_u128(1, 0); + u128_u c = u64s_to_u128(0, 1LLU << 63); + u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); + KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); +} + +static struct kunit_case mean_and_variance_test_cases[] = { + KUNIT_CASE(mean_and_variance_fast_divpow2), + KUNIT_CASE(mean_and_variance_u128_basic_test), + KUNIT_CASE(mean_and_variance_basic_test), + KUNIT_CASE(mean_and_variance_weighted_test), + KUNIT_CASE(mean_and_variance_weighted_advanced_test), + {} +}; + +static struct kunit_suite mean_and_variance_test_suite = { + .name = "mean and variance tests", + .test_cases = mean_and_variance_test_cases +}; + +kunit_test_suite(mean_and_variance_test_suite); + +MODULE_AUTHOR("Daniel B. Hill"); +MODULE_LICENSE("GPL"); -- cgit From bf8f8b20a1e729170493d99a2014c90c5cf5b84b Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 12 Aug 2022 09:03:28 +1200 Subject: bcachefs: time stats now uses the mean_and_variance module. 
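This commit replaces the hand-rolled EWMA fields in bch2_time_stats with the mean_and_variance API introduced in the previous commit. For context, a minimal usage sketch of that API (sample values are made up; see mean_and_variance.h above for the real declarations):

    struct mean_and_variance          all    = {};
    struct mean_and_variance_weighted recent = { .weight = 8 };
    u64 samples[] = { 120, 80, 200, 95 };   /* e.g. durations in ns */
    unsigned i;

    for (i = 0; i < ARRAY_SIZE(samples); i++) {
            all = mean_and_variance_update(all, samples[i]);
            mean_and_variance_weighted_update(&recent, samples[i]);
    }

    pr_info("mean %lld stddev %u (recent mean %lld)",
            mean_and_variance_get_mean(all),
            mean_and_variance_get_stddev(all),
            mean_and_variance_weighted_get_mean(recent));

The unweighted struct keeps exact running sums; the weighted variant decays old samples by 2^weight, which is what the "recent" column in the new time_stats output reports.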
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 1 + fs/bcachefs/super.c | 6 ++ fs/bcachefs/util.c | 173 ++++++++++++++++++++++++++++++++++++++++------------ fs/bcachefs/util.h | 12 +++- 4 files changed, 150 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index f8e208826997..bc56c6bf37d7 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -22,6 +22,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MEAN_AND_VARIANCE help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 29e2b76322d7..c69d64555339 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -893,6 +893,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { + mutex_lock(&c->btree_transaction_stats[i].lock); + bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); + mutex_unlock(&c->btree_transaction_stats[i].lock); + } + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index bf529bb137ed..ee85bb27e231 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -24,6 +24,7 @@ #include #include "eytzinger.h" +#include "mean_and_variance.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; @@ -323,38 +324,39 @@ static void bch2_time_stats_update_one(struct bch2_time_stats *stats, { u64 duration, freq; - duration = time_after64(end, start) - ? end - start : 0; - freq = time_after64(end, stats->last_event) - ? end - stats->last_event : 0; - - stats->count++; - - stats->average_duration = stats->average_duration - ? ewma_add(stats->average_duration, duration, 6) - : duration; - - stats->average_frequency = stats->average_frequency - ? 
ewma_add(stats->average_frequency, freq, 6) - : freq; - - stats->max_duration = max(stats->max_duration, duration); - - stats->last_event = end; + if (time_after64(end, start)) { + duration = end - start; + stats->duration_stats = mean_and_variance_update(stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + bch2_quantiles_update(&stats->quantiles, duration); + } - bch2_quantiles_update(&stats->quantiles, duration); + if (time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + stats->last_event = end; + } } void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; + WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, + "time_stats: min_duration = %llu, min_freq = %llu", + stats->min_duration, stats->min_freq); + if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); bch2_time_stats_update_one(stats, start, end); - if (stats->average_frequency < 32 && - stats->count > 1024) + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct bch2_time_stat_buffer, GFP_ATOMIC); @@ -390,12 +392,15 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) static const struct time_unit { const char *name; - u32 nsecs; + u64 nsecs; } time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "sec", NSEC_PER_SEC }, + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", NSEC_PER_SEC * 60}, + { "h", NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, }; static const struct time_unit *pick_time_units(u64 ns) @@ -418,35 +423,121 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } +static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); + prt_tab_rjust(out); + prt_printf(out, "%s", u->name); +} + +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) +{ + prt_printf(out, name); + prt_tab(out); + bch2_pr_time_units_aligned(out, ns); + prt_newline(out); +} + void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; - u64 freq = READ_ONCE(stats->average_frequency); - u64 q, last_q = 0; + s64 f_mean = 0, d_mean = 0; + u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } + + printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); + prt_printf(out, "count:"); + prt_tab(out); + prt_printf(out, "%llu ", + stats->duration_stats.n); + printbuf_tabstop_pop(out); + prt_newline(out); + + printbuf_tabstops_reset(out); - prt_printf(out, 
"count:\t\t%llu", - stats->count); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + printbuf_tabstop_push(out, 0); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + + prt_tab(out); + prt_printf(out, "since mount"); + prt_tab_rjust(out); + prt_tab(out); + prt_printf(out, "recent"); + prt_tab_rjust(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE); + printbuf_tabstop_push(out, 2); + printbuf_tabstop_push(out, TABSTOP_SIZE); + + prt_printf(out, "duration of events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + + printbuf_indent_sub(out, 2); prt_newline(out); - prt_printf(out, "rate:\t\t%llu/sec", - freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + + prt_printf(out, "time between events"); prt_newline(out); + printbuf_indent_add(out, 2); - prt_printf(out, "frequency:\t"); - bch2_pr_time_units(out, freq); + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); prt_newline(out); - prt_printf(out, "avg duration:\t"); - bch2_pr_time_units(out, stats->average_duration); + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + + printbuf_indent_sub(out, 2); prt_newline(out); - prt_printf(out, "max duration:\t"); - bch2_pr_time_units(out, stats->max_duration); + + printbuf_tabstops_reset(out); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - prt_newline(out); prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; @@ -468,6 +559,10 @@ void bch2_time_stats_exit(struct bch2_time_stats *stats) void bch2_time_stats_init(struct bch2_time_stats *stats) { memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3b0090faef4d..4243a22c766c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -18,6 +18,8 @@ #include #include +#include "mean_and_variance.h" + struct closure; #ifdef CONFIG_BCACHEFS_DEBUG @@ -407,14 +409,18 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; - u64 count; /* all fields are in nanoseconds */ - u64 average_duration; - u64 average_frequency; u64 max_duration; + u64 min_duration; + u64 max_freq; + u64 min_freq; u64 last_event; struct bch2_quantiles quantiles; + struct mean_and_variance duration_stats; + struct 
mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; struct bch2_time_stat_buffer __percpu *buffer; }; -- cgit From 17fe3b6452f62c0ee353f3b4f0107685cfd6847d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Oct 2022 01:14:15 -0400 Subject: bcachefs: Improve journal_entry_add() Prep work for the next patch, to defer journal entry validation: we now track for each replica whether we had a good checksum. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 82 +++++++++++++++++++++++++++++++----------------- fs/bcachefs/journal_io.h | 4 +-- 2 files changed, 56 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0c82f1048e21..bc6582114003 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -16,6 +16,23 @@ #include "replicas.h" #include "trace.h" +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static bool jset_csum_good(struct bch_fs *c, struct jset *j) +{ + return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && + !bch2_crc_cmp(j->csum, + csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); +} + static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) { return (seq - c->journal_entries_base_seq) & (~0U >> 1); @@ -58,8 +75,7 @@ struct journal_list { */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_ptr entry_ptr, - struct journal_list *jlist, struct jset *j, - bool bad) + struct journal_list *jlist, struct jset *j) { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; @@ -110,38 +126,53 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (dup->bad) { - /* we'll replace @dup: */ - } else if (bad) { + if (bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes)) { i = dup; goto found; - } else { - fsck_err_on(bytes != vstruct_bytes(&dup->j) || - memcmp(j, &dup->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); + } + + if (!entry_ptr.csum_good) { i = dup; goto found; } - } + if (!dup->csum_good) + goto replace; + + fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; + } +replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -ENOMEM; - i->nr_ptrs = 0; - i->bad = bad; + i->nr_ptrs = 0; + i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); + i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - i->nr_ptrs = dup->nr_ptrs; - memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; + } + + /* The first ptr should represent the jset we kept: */ + memcpy(i->ptrs + i->nr_ptrs, + dup->ptrs, + sizeof(dup->ptrs[0]) * dup->nr_ptrs); + i->nr_ptrs += dup->nr_ptrs; __journal_replay_free(c, dup); } - *_i = i; + return 0; found: for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { if (ptr->dev == ca->dev_idx) { @@ -163,16 +194,6 @@ fsck_err: return ret; } -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) 
{{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - /* this fills in a range with empty jset_entries: */ static void journal_entry_null_range(void *start, void *end) { @@ -838,7 +859,7 @@ static int journal_read_bucket(struct bch_dev *ca, unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; - bool saw_bad = false; + bool saw_bad = false, csum_good; int ret = 0; pr_debug("reading %u", bucket); @@ -921,14 +942,19 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + csum_good = jset_csum_good(c, j); + if (!csum_good) + saw_bad = true; + mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, .dev = ca->dev_idx, .bucket = bucket, .bucket_offset = offset - bucket_to_sector(ca, ja->buckets[bucket]), .sector = offset, - }, jlist, j, ret != 0); + }, jlist, j); mutex_unlock(&jlist->lock); switch (ret) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 1a91f2c0a26c..2f8bbf06b289 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -8,6 +8,7 @@ */ struct journal_replay { struct journal_ptr { + bool csum_good; u8 dev; u32 bucket; u32 bucket_offset; @@ -15,8 +16,7 @@ struct journal_replay { } ptrs[BCH_REPLICAS_MAX]; unsigned nr_ptrs; - /* checksum error, but we may want to try using it anyways: */ - bool bad; + bool csum_good; bool ignore; /* must be last: */ struct jset j; -- cgit From d1b2c864e001c4a709ab040d299c553284bcdb2b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Oct 2022 22:52:40 -0400 Subject: bcachefs: Defer full journal entry validation On journal read, previously we would do full journal entry validation immediately after reading a journal entry. However, this would lead to errors for journal entries we weren't actually going to use, either because they were too old or too new (newer than the most recent flush). We've observed write tearing on journal entries newer than the newest flush - which makes sense, prior to a flush there's no guarantees about write persistence. This patch defers full journal entry validation until the end of the journal read path, when we know which journal entries we'll want to use. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 130 +++++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bc6582114003..1db2ccf2627a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -735,12 +735,8 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read, int write) { - size_t bytes = vstruct_bytes(jset); - struct bch_csum csum; unsigned version; int ret = 0; @@ -757,21 +753,7 @@ static int jset_validate(struct bch_fs *c, sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ - return EINVAL; - } - - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, jset, NULL, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) { - ret = JOURNAL_ENTRY_BAD; - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); + return -EINVAL; } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), @@ -779,28 +761,9 @@ static int jset_validate(struct bch_fs *c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) { - ret = JOURNAL_ENTRY_BAD; - goto csum_done; - } - - if (write) - goto csum_done; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), - c, jset, NULL, - "%s sector %llu seq %llu: journal checksum bad", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq))) + JSET_CSUM_TYPE(jset))) ret = JOURNAL_ENTRY_BAD; - ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret); -csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), @@ -811,16 +774,52 @@ csum_done: jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } + + ret = jset_validate_entries(c, jset, write); fsck_err: return ret; } -static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +static int jset_validate_early(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) { - unsigned sectors = vstruct_sectors(jset, c->block_bits); + size_t bytes = vstruct_bytes(jset); + unsigned version; + int write = READ; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + c, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ + return -EINVAL; + } + + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; - return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: - jset_validate_entries(c, jset, WRITE); + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); +fsck_err: + return ret; } struct journal_read_buf { @@ -898,9 +897,8 @@ reread: j = buf->data; } - ret = jset_validate(c, ca, j, offset, - end - offset, sectors_read, - READ); + ret = jset_validate_early(c, ca, j, offset, + end - offset, sectors_read); switch (ret) { case 0: sectors = vstruct_sectors(j, c->block_bits); @@ -916,17 +914,13 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = block_sectors(c); - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading * again at next block boundary: */ sectors = block_sectors(c); - break; + goto next_block; default: return ret; } @@ -946,6 +940,12 @@ reread: if (!csum_good) saw_bad = true; + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); + mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { .csum_good = csum_good, @@ -1153,6 +1153,14 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) *start_seq = le64_to_cpu(i->j.seq) + 1; if (!JSET_NO_FLUSH(&i->j)) { + int write = READ; + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + last_seq = le64_to_cpu(i->j.last_seq); *blacklist_seq = le64_to_cpu(i->j.seq) + 1; break; @@ -1256,7 +1264,21 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (!i || i->ignore) continue; - ret = jset_validate_entries(c, &i->j, READ); + for (ptr = 0; ptr < i->nr_ptrs; ptr++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + + if (!i->ptrs[ptr].csum_good) + printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", + ca->name, i->ptrs[ptr].sector, + le64_to_cpu(i->j.seq), + i->csum_good ? " (had good copy on another device)" : ""); + } + + ret = jset_validate(c, + bch_dev_bkey_exists(c, i->ptrs[0].dev), + &i->j, + i->ptrs[0].sector, + READ); if (ret) goto err; @@ -1694,7 +1716,7 @@ void bch2_journal_write(struct closure *cl) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -1708,7 +1730,7 @@ void bch2_journal_write(struct closure *cl) journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; sectors = vstruct_sectors(jset, c->block_bits); -- cgit From 0196eb89abb9a794d1350684de8e73484f32a19a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Oct 2022 06:48:23 -0400 Subject: bcachefs: bch2_btree_key_cache_scan() doesn't need trylock We don't actually allocate memory under the btree key cache lock - so there's no recursion concerns, and the shrinker can just use mutex_lock(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b26d4ffe2a11..be9431dde458 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -228,6 +228,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) return ck; } + /* GFP_NOFS because we're holding btree locks: */ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); @@ -767,12 +768,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, unsigned start, flags; int srcu_idx; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - return -1; - + mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); -- cgit From dccedaaa5262cfbf537b740d83aabf52e94c3143 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Oct 2022 07:20:05 -0400 Subject: bcachefs: Fix btree node prefetchig We were forgetting to count down the number of nodes to prefetch, firing off _way_ more than intended - whoops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0bb156e6152a..c87129a11640 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -764,7 +764,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -799,7 +799,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; -- cgit From fe5b37f699c02f90505933959797f70645ba95fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Oct 2022 00:47:21 -0400 Subject: bcachefs: Btree key cache improvements - In userspace, we don't have real percpu variables; this patch disables the percpu freelists in userspace - add some error messages for the asserts in bch2_fs_btree_key_cache_exit(); we've been hitting this (only in userspace, oddly), perhaps this will help us track down the error. 
- bkey_cached_reuse() should likely be taking the key cache lock, and it's a slowpath so it doesn't hurt to Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 67 ++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index be9431dde458..419317bc6bec 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -112,6 +112,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); if (!ck->c.lock.readers) { +#ifdef __KERNEL__ preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); @@ -136,6 +137,11 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, list_move_tail(&ck->list, &bc->freed_nonpcpu); mutex_unlock(&bc->lock); } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); +#endif } else { mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_pcpu); @@ -174,6 +180,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); if (!pcpu_readers) { +#ifdef __KERNEL__ preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); if (f->nr) @@ -196,6 +203,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) preempt_enable(); mutex_unlock(&bc->lock); } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); +#endif } else { mutex_lock(&bc->lock); if (!list_empty(&bc->freed_pcpu)) { @@ -254,6 +269,7 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; + mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) @@ -261,13 +277,14 @@ bkey_cached_reuse(struct btree_key_cache *c) if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(c, ck); - rcu_read_unlock(); - return ck; + goto out; } } + ck = NULL; +out: rcu_read_unlock(); - - return NULL; + mutex_unlock(&c->lock); + return ck; } static struct bkey_cached * @@ -873,23 +890,31 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bkey_cached *ck, *n; struct rhash_head *pos; unsigned i; +#ifdef __KERNEL__ int cpu; +#endif if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed_nonpcpu); - } - rcu_read_unlock(); + /* + * The loop is needed to guard against racing with rehash: + */ + while (atomic_long_read(&bc->nr_keys)) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed_nonpcpu); + } + rcu_read_unlock(); + } +#ifdef __KERNEL__ for_each_possible_cpu(cpu) { struct btree_key_cache_freelist *f = per_cpu_ptr(bc->pcpu_freed, cpu); @@ -899,6 +924,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_add(&ck->list, &bc->freed_nonpcpu); } } +#endif list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); @@ -914,10 +940,15 
@@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) kmem_cache_free(bch2_key_cache, ck); } - BUG_ON(atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)); - BUG_ON(atomic_long_read(&bc->nr_keys)); + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); + + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); mutex_unlock(&bc->lock); @@ -939,9 +970,11 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); int ret; +#ifdef __KERNEL__ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); if (!bc->pcpu_freed) return -ENOMEM; +#endif ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); if (ret) -- cgit From d7e4e51370ef62776ea4af22f83047640425efda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Oct 2022 01:03:14 -0400 Subject: bcachefs: Switch to local_clock() for fastpath time source local_clock() isn't always completely accurate - e.g. on machines with TSC drift - but ktime_get_ns() overhead is too high, unfortunately. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- fs/bcachefs/btree_locking.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c87129a11640..f928de6692ae 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2787,7 +2787,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || - ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { bch2_trans_unlock(trans); cond_resched(); bch2_trans_relock(trans); @@ -2797,7 +2797,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (trans->restarted) bch2_btree_path_traverse_all(trans); - trans->last_begin_time = ktime_get_ns(); + trans->last_begin_time = local_clock(); return trans->restart_count; } @@ -2851,7 +2851,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; - trans->last_begin_time = ktime_get_ns(); + trans->last_begin_time = local_clock(); trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); trans->locking_wait.task = current; trans->journal_replay_not_finished = diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index c74a5fd4d908..3356f089e268 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -87,7 +87,7 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, { mark_btree_node_locked_noreset(path, level, type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[level].lock_taken_time = ktime_get_ns(); + path->l[level].lock_taken_time = local_clock(); #endif } @@ -119,7 +119,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, if (s) __bch2_time_stats_update(&s->lock_hold_times, path->l[level].lock_taken_time, - ktime_get_ns()); + local_clock()); #endif } @@ -259,7 +259,7 @@ static inline int btree_node_lock(struct btree_trans *trans, btree_node_lock_increment(trans, b, level, type) || !(ret = btree_node_lock_nopath(trans, b, type))) { 
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->level].lock_taken_time = ktime_get_ns(); + path->l[b->level].lock_taken_time = local_clock(); #endif } -- cgit From bd954215cad1dbe4304736e6968d8a0c10d1e048 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Oct 2022 03:52:28 -0400 Subject: bcachefs: Quota fixes - We now correctly allow soft limits to be exceeded, instead of always returning -EDQUOT - Disk quota grate times/warnings can now be set, not just the systemwide defaults Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 18 ++++++------------ fs/bcachefs/quota.c | 48 ++++++++++++++++++++++-------------------------- 2 files changed, 28 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9a0751a140e4..274dc78916f8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -605,7 +605,7 @@ static void bch2_page_reservation_put(struct bch_fs *c, static int bch2_page_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, struct bch2_page_reservation *res, - unsigned offset, unsigned len, bool check_enospc) + unsigned offset, unsigned len) { struct bch_page_state *s = bch2_page_state_create(page, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -625,19 +625,14 @@ static int bch2_page_reservation_get(struct bch_fs *c, } if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, - disk_sectors, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); if (unlikely(ret)) return ret; } if (quota_sectors) { ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, - check_enospc); + quota_sectors, true); if (unlikely(ret)) { struct disk_reservation tmp = { .sectors = disk_sectors @@ -821,7 +816,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } } - if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { + if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; @@ -1520,8 +1515,7 @@ out: goto err; } - ret = bch2_page_reservation_get(c, inode, page, res, - offset, len, true); + ret = bch2_page_reservation_get(c, inode, page, res, offset, len); if (ret) { if (!PageUptodate(page)) { /* @@ -1663,7 +1657,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); + pg_offset, pg_len); if (ret) goto out; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index ad7130a14691..db8172736527 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -332,34 +332,20 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->hardlimit && qc->hardlimit < n && !ignore_hardlimit(q)) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) - return -EDQUOT; - prepare_warning(qc, qtype, counter, msgs, HARDWARN); + return -EDQUOT; } if (qc->softlimit && - qc->softlimit < n && - qc->timer && - ktime_get_real_seconds() >= qc->timer && - !ignore_hardlimit(q)) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) - return -EDQUOT; - - prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); - } - - if (qc->softlimit && - qc->softlimit < n && - qc->timer == 0) { - if (mode == KEY_TYPE_QUOTA_PREALLOC) + qc->softlimit < n) { + if (qc->timer == 0) { + qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + } else if (ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + 
prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); return -EDQUOT; - - prepare_warning(qc, qtype, counter, msgs, SOFTWARN); - - /* XXX is this the right one? */ - qc->timer = ktime_get_real_seconds() + - q->limits[counter].warnlimit; + } } return 0; @@ -469,7 +455,8 @@ err: return ret; } -static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + struct qc_dqblk *qdq) { struct bkey_s_c_quota dq; struct bch_memquota_type *q; @@ -498,6 +485,15 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); } + if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) + mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); + if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) + mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); + if (qdq && qdq->d_fieldmask & QC_INO_TIMER) + mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) + mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); + mutex_unlock(&q->lock); } @@ -618,7 +614,7 @@ int bch2_fs_quota_read(struct bch_fs *c) ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, POS_MIN, BTREE_ITER_PREFETCH, k, - __bch2_quota_set(c, k)) ?: + __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key2(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); @@ -961,7 +957,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: - __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); return ret; } -- cgit From b5ac23c465c4ef8e94f6f2c9f2333193dccf9fc3 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 6 Oct 2022 15:53:36 +1300 Subject: bcachefs: improve behaviour of btree_cache_scan() Appending new nodes to the end of the list means we're more likely to evict old entries when btree_cache_scan() is started. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index db786df19318..75bc18466e75 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -173,7 +173,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) - list_add(&b->list, &bc->live); + list_add_tail(&b->list, &bc->live); mutex_unlock(&bc->lock); return ret; -- cgit From 597c6d17b18e2d53e7ab30c5626f38422fe4848b Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Wed, 25 May 2022 16:11:56 +1200 Subject: bcachefs: make durability a read-write sysfs option Sometimes the user may need to change durability after formatting to match current hardware setup, this option provides a quick and flexible alternative to removing then adding the device. It is HIGHLY ADVISED TO RUN REREPLICATE after changing this value so the system doesn't remain degraded. 
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a27ceabd5e49..76301209898f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -174,7 +174,7 @@ read_attribute(minor); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -read_attribute(durability); +rw_attribute(durability); read_attribute(iodone); read_attribute(io_latency_read); @@ -911,6 +911,19 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_durability) { + u64 v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DURABILITY(mi)) { + SET_BCH_MEMBER_DURABILITY(mi, v + 1); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + if (attr == &sysfs_label) { char *tmp; int ret; -- cgit From be75bb7a0e0565c0c409842048567e8d07f28675 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Sun, 16 Oct 2022 02:25:54 +1300 Subject: bcachefs: __bio_compress() fix up. A single block can't be compressed, so it's incompressible. This stops rebalance repeatably marking extents as uncompressed. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index f692f35a6a98..2b7080b67eca 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -377,7 +377,7 @@ static unsigned __bio_compress(struct bch_fs *c, /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) - return 0; + return BCH_COMPRESSION_TYPE_incompressible; dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); -- cgit From b2f83e769f607409753888c95a9b46dc927dc856 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 02:08:07 -0400 Subject: bcachefs: Btree key cache shrinker fix The shrinker assumes freed key cache items are ordered by age, so that it doesn't have to scan the full list to find items that are old enough (according to the srcu code) to be freed. But percpu freelists broke this ordering; this patch fixes this by ensuring we insert items into the proper position. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 419317bc6bec..9a5729309b8f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -103,16 +103,34 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +#ifdef __KERNEL__ +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bkey_cached *pos; + + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { + if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, + pos->btree_trans_barrier_seq)) { + list_move(&ck->list, &pos->list); + return; + } + } + + list_move(&ck->list, &bc->freed_nonpcpu); +} +#endif + static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, struct bkey_cached *ck) { - struct btree_key_cache_freelist *f; - bool freed = false; - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); if (!ck->c.lock.readers) { #ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + bool freed = false; + preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); @@ -130,11 +148,11 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, while (f->nr > ARRAY_SIZE(f->objs) / 2) { struct bkey_cached *ck2 = f->objs[--f->nr]; - list_move_tail(&ck2->list, &bc->freed_nonpcpu); + __bkey_cached_move_to_freelist_ordered(bc, ck2); } preempt_enable(); - list_move_tail(&ck->list, &bc->freed_nonpcpu); + __bkey_cached_move_to_freelist_ordered(bc, ck); mutex_unlock(&bc->lock); } #else @@ -176,11 +194,12 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; - struct btree_key_cache_freelist *f; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); if (!pcpu_readers) { #ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); if (f->nr) @@ -982,7 +1001,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->table_init_done = true; - bc->shrink.seeks = 1; + bc->shrink.seeks = 0; bc->shrink.count_objects = bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name); @@ -990,9 +1009,12 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); - prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); - prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_newline(out); + prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + prt_newline(out); + prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_newline(out); } void bch2_btree_key_cache_exit(void) -- cgit From 29aa78f15e1bbd984cc14f395544d62b6f0a2a33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 02:04:31 -0400 Subject: bcachefs: Split out __btree_path_up_until_good_node() This breaks up btree_path_up_until_good_node() so that only the fastpath gets inlined. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f928de6692ae..0bd67600f0db 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -994,14 +994,9 @@ err: return ret; } -static inline bool btree_path_good_node(struct btree_trans *trans, - struct btree_path *path, - unsigned l, int check_pos) +static inline bool btree_path_check_pos_in_node(struct btree_path *path, + unsigned l, int check_pos) { - if (!is_btree_node(path, l) || - !bch2_btree_node_relock(trans, path, l)) - return false; - if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) return false; if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) @@ -1009,6 +1004,15 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) +{ + return is_btree_node(path, l) && + bch2_btree_node_relock(trans, path, l) && + btree_path_check_pos_in_node(path, l, check_pos); +} + static void btree_path_set_level_down(struct btree_trans *trans, struct btree_path *path, unsigned new_level) @@ -1025,9 +1029,9 @@ static void btree_path_set_level_down(struct btree_trans *trans, bch2_btree_path_verify(trans, path); } -static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) +static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) { unsigned i, l = path->level; again: @@ -1048,6 +1052,16 @@ again: return l; } +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + return likely(btree_node_locked(path, path->level) && + btree_path_check_pos_in_node(path, path->level, check_pos)) + ? path->level + : __btree_path_up_until_good_node(trans, path, check_pos); +} + /* * This is the main state machine for walking down the btree - walks down to a * specified depth -- cgit From 307e3c13192002f684bdfc23865a57274e6bb4ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:03:11 -0400 Subject: bcachefs: Optimize bch2_trans_init() Now we store the transaction's fn idx in a local variable, instead of redoing the lookup every time we call bch2_trans_init(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_iter.c | 20 +++++++++++--------- fs/bcachefs/btree_iter.h | 15 +++++++++++++-- fs/bcachefs/debug.c | 6 +++--- 4 files changed, 27 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 08fd899d8837..7ca1aa3a847f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -929,7 +929,6 @@ mempool_t bio_bounce_pages; struct bch2_time_stats times[BCH_TIME_STAT_NR]; - const char *btree_transaction_fns[BCH_TRANSACTIONS_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0bd67600f0db..4402fcee26e3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2841,15 +2841,16 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) trans->updates = p; p += updates_bytes; } -static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c, - const char *fn) +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; + +unsigned bch2_trans_get_fn_idx(const char *fn) { unsigned i; - for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++) - if (!c->btree_transaction_fns[i] || - c->btree_transaction_fns[i] == fn) { - c->btree_transaction_fns[i] = fn; + for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + if (!bch2_btree_transaction_fns[i] || + bch2_btree_transaction_fns[i] == fn) { + bch2_btree_transaction_fns[i] = fn; return i; } @@ -2857,16 +2858,17 @@ static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct b return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_transaction_stats *s; memset(trans, 0, sizeof(*trans)); trans->c = c; - trans->fn = fn; + trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) + ? 
bch2_btree_transaction_fns[fn_idx] : NULL; trans->last_begin_time = local_clock(); - trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); + trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2e94cd2657e9..cfbd07bc9366 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -587,10 +587,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); void bch2_trans_exit(struct btree_trans *); -#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__) +extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +unsigned bch2_trans_get_fn_idx(const char *); + +#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ +do { \ + static unsigned trans_fn_idx; \ + \ + if (unlikely(!trans_fn_idx)) \ + trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ + \ + __bch2_trans_init(_trans, _c, trans_fn_idx); \ +} while (0) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7abc707d2f38..a9e4180d6a80 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -638,11 +638,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, if (!i->size) break; - if (i->iter == ARRAY_SIZE(c->btree_transaction_fns) || - !c->btree_transaction_fns[i->iter]) + if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || + !bch2_btree_transaction_fns[i->iter]) break; - prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); + prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); -- cgit From adf16c6dfa279fee088a85bac9d602f282699915 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:07:28 -0400 Subject: bcachefs: bucket_alloc_fail tracepoint should only fire when we have to block We don't want to fire the bucket_alloc_fail tracepoint on transaction restart, when we can retry immediately - only when we the allocation actually has to block. Also, switch from sched_clock() to local_clock(), as we've been doing elsewhere. 
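Put another way: a transaction-restart error is expected and retried immediately, so it shouldn't be counted as an allocation failure; only errors that actually make the caller block or give up are worth tracing. A small sketch of that filtering, with invented error codes and trace function standing in for bch2_err_matches(..., BCH_ERR_transaction_restart) and the real tracepoint:

/*
 * Sketch: only emit the "allocation failed" event for real failures,
 * not for transient restart errors the caller retries at once.
 */
#include <stdio.h>

enum { ERR_FREELIST_EMPTY = 1, ERR_TRANSACTION_RESTART = 2 };

static void trace_bucket_alloc_fail(int err)
{
	printf("bucket_alloc_fail: err=%d\n", err);
}

static void bucket_alloc_done(int err)
{
	if (!err)
		return;				/* success: nothing to trace */
	if (err == ERR_TRANSACTION_RESTART)
		return;				/* transient: retried immediately */
	trace_bucket_alloc_fail(err);		/* we really had to block or fail */
}

int main(void)
{
	bucket_alloc_done(0);
	bucket_alloc_done(ERR_TRANSACTION_RESTART);
	bucket_alloc_done(ERR_FREELIST_EMPTY);
	return 0;
}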
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0a7657541b8c..ab288176695e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -561,7 +561,7 @@ err: &s, cl == NULL, ""); - else + else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) trace_and_count(c, bucket_alloc_fail, ca, bch2_alloc_reserves[reserve], may_alloc_partial, @@ -1093,7 +1093,7 @@ restart_find_oldest: hlist_add_head_rcu(&wp->node, head); mutex_unlock(&c->write_points_hash_lock); out: - wp->last_used = sched_clock(); + wp->last_used = local_clock(); return wp; } @@ -1341,7 +1341,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) wp < c->write_points + c->write_points_nr; wp++) { writepoint_init(wp, BCH_DATA_user); - wp->last_used = sched_clock(); + wp->last_used = local_clock(); wp->write_point = (unsigned long) wp; hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); -- cgit From 3e8b4b3afedc4757c2d8aaad9a900e98a453d110 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:09:02 -0400 Subject: bcachefs: Inline bch2_inode_pack() It's mainly used from bch2_inode_write(), so inline it there. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 99987db87ab6..18cfad860ddf 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -60,9 +60,9 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +static inline void bch2_inode_pack_inlined(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { struct bkey_i_inode_v2 *k = &packed->inode; u8 *out = k->v.fields; @@ -130,6 +130,13 @@ void bch2_inode_pack(struct bch_fs *c, } } +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bch2_inode_pack_inlined(c, packed, inode); +} + static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, struct bch_inode_unpacked *unpacked) { @@ -288,7 +295,7 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(trans->c, inode_p, inode); + bch2_inode_pack_inlined(trans->c, inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } -- cgit From b0c5b15cc8969f79b410a825efe9894cdec85738 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:19:34 -0400 Subject: bcachefs: Optimize __bkey_unpack_key_format_checked() Delete some code when CONFIG_BCACHEFS_DEBUG=n Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 9708b9ffa4df..e458d1acdef4 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -212,20 +212,19 @@ __bkey_unpack_key_format_checked(const struct btree *b, struct bkey *dst, const struct bkey_packed *src) { -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - { + if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if 
(bch2_expensive_debug_checks) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); } + } else { + *dst = __bch2_bkey_unpack_key(&b->format, src); } -#else - *dst = __bch2_bkey_unpack_key(&b->format, src); -#endif } static inline struct bkey -- cgit From ef035f42a03888da62cbe29cd2132d78e5ba393a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:31:37 -0400 Subject: bcachefs: Separate out flush_new_cached_update() This separates out the slowpath of bch2_trans_update_by_path_trace() into a new non-inlined helper. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 54 ++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cf4a7093f1e9..b60786c20ccf 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1406,6 +1406,37 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip); + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + int ret; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT); + + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto err; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); +err: + bch2_path_put(trans, btree_path, true); + return ret; +} + static int __must_check bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, struct bkey_i *k, enum btree_update_flags flags, @@ -1413,7 +1444,6 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - int ret = 0; BUG_ON(!path->should_be_locked); @@ -1484,26 +1514,10 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa */ if (path->cached && bkey_deleted(&i->old_k) && - !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) { - struct btree_path *btree_path; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; + !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) + return flush_new_cached_update(trans, path, i, flags, ip); - btree_path = bch2_path_get(trans, path->btree_id, path->pos, - 1, 0, BTREE_ITER_INTENT); - - ret = bch2_btree_path_traverse(trans, btree_path, 0); - if (ret) - goto err; - - btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); -err: - bch2_path_put(trans, btree_path, true); - } - - return ret; + return 0; } static int __must_check -- cgit From f83009cda309ca1012cc206102284802b86da70b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 07:32:57 -0400 Subject: bcachefs: Don't issue transaction restart on key cache realloc This shouldn't be needed anymore, since we don't rely on the pointer validity that this was guarding against anymore - we get a new good 
reference and save it right after this function. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b60786c20ccf..fc53958e5619 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -339,7 +339,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - unsigned old_u64s = ck->u64s, new_u64s; + unsigned new_u64s; struct bkey_i *new_k; EBUG_ON(path->level); @@ -368,12 +368,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, ck->u64s = new_u64s; ck->k = new_k; - /* - * Keys returned by peek() are no longer valid pointers, so we need a - * transaction restart: - */ - trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); + return 0; } /* Triggers: */ -- cgit From 55b8550d304a1c0884e98d0bb7126d490a96128f Mon Sep 17 00:00:00 2001 From: "Daniel B. Hill" Date: Tue, 18 Oct 2022 09:54:32 +1300 Subject: bcachefs: fix security warning in pr_name_and_units Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index ee85bb27e231..a58239fb2a6d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -436,7 +436,7 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { - prt_printf(out, name); + prt_str(out, name); prt_tab(out); bch2_pr_time_units_aligned(out, ns); prt_newline(out); -- cgit From 2d485df3da368193dafc78be933669d427b7ddf7 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Tue, 11 Oct 2022 21:33:56 +1300 Subject: bcachefs: fix bch2_write_extent() crc corruption. crc.compression_type & nouce gets reset to inside bch2_rechecksum_bio(), we set it back to the previous values calculated. This fixes incompressible extents being marked as uncompressed. 
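The fix amounts to saving the two fields the rechecksum call clobbers and restoring them on the recomputed crc. A minimal standalone sketch of that save/restore pattern with a made-up crc struct and rechecksum() stand-in (the real types are struct bch_extent_crc_unpacked and bch2_rechecksum_bio()):

/*
 * Sketch: rechecksum() recomputes the checksum but also rewrites
 * compression_type and nonce from its source crc; the caller wants the
 * new checksum while keeping its own compression_type/nonce, so it
 * saves them before the call and restores them afterwards.
 */
#include <stdint.h>
#include <stdio.h>

struct crc {
	uint64_t	csum;
	uint16_t	nonce;
	uint8_t		compression_type;
};

/* Stand-in for the rechecksum helper: fills @dst from @src wholesale. */
static void rechecksum(struct crc *dst, const struct crc *src, uint64_t new_csum)
{
	*dst = *src;
	dst->csum = new_csum;
}

int main(void)
{
	struct crc op_crc = { .csum = 1, .nonce = 7, .compression_type = 0 };
	struct crc crc = { .nonce = 9, .compression_type = 3 /* incompressible */ };

	uint8_t compression_type = crc.compression_type;
	uint16_t nonce = crc.nonce;

	rechecksum(&crc, &op_crc, 42);
	crc.compression_type = compression_type;	/* restore clobbered fields */
	crc.nonce = nonce;

	printf("csum %llu type %u nonce %u\n",
	       (unsigned long long) crc.csum,
	       (unsigned) crc.compression_type,
	       (unsigned) crc.nonce);
	return 0;
}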
Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 648e4a0a21a9..616407fa08ae 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1074,8 +1074,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, saved_iter = dst->bi_iter; do { - struct bch_extent_crc_unpacked crc = - (struct bch_extent_crc_unpacked) { 0 }; + struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; size_t dst_len, src_len; @@ -1127,6 +1126,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; /* * Note: when we're using rechecksum(), we need to be * checksumming @src because it has all the data our @@ -1145,6 +1146,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bio_sectors(src) - (src_len >> 9), op->csum_type)) goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. + */ + crc.compression_type = compression_type; + crc.nonce = nonce; } else { if ((op->flags & BCH_WRITE_DATA_ENCODED) && bch2_rechecksum_bio(c, src, version, op->crc, -- cgit From ed80c5699a23c4005ba8e81d4b8fb3e1b922fa40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 14:01:19 -0400 Subject: bcachefs: Optimize bch2_dev_usage_read() - add bch2_dev_usage_read_fast(), which doesn't return by value - bch_dev_usage is big enough that we don't want the silent memcpy - tweak the allocation path to only call bch2_dev_usage_read() once per bucket allocated Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 40 ++++++++++++++++++++++++++-------------- fs/bcachefs/buckets.c | 9 +++------ fs/bcachefs/buckets.h | 10 +++++++++- 3 files changed, 38 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ab288176695e..5d7231979024 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -495,25 +495,25 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, bool may_alloc_partial, - struct closure *cl) + struct closure *cl, + struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; - struct bch_dev_usage usage; u64 avail; struct bucket_alloc_state s = { 0 }; bool waiting = false; again: - usage = bch2_dev_usage_read(ca); - avail = dev_buckets_free(ca, usage, reserve); + bch2_dev_usage_read_fast(ca, usage); + avail = dev_buckets_free(ca, *usage, reserve); - if (usage.d[BCH_DATA_need_discard].buckets > avail) + if (usage->d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); - if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) + if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_do_gc_gens(c); - if (should_invalidate_buckets(ca, usage)) + if (should_invalidate_buckets(ca, *usage)) bch2_do_invalidates(c); if (!avail) { @@ -554,7 +554,7 @@ err: bch2_alloc_reserves[reserve], may_alloc_partial, ob->bucket, - usage.d[BCH_DATA_free].buckets, + usage->d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), 
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), @@ -566,7 +566,7 @@ err: bch2_alloc_reserves[reserve], may_alloc_partial, 0, - usage.d[BCH_DATA_free].buckets, + usage->d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), @@ -582,11 +582,12 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { + struct bch_dev_usage usage; struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, - may_alloc_partial, cl))); + may_alloc_partial, cl, &usage))); return ob; } @@ -613,8 +614,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } -void bch2_dev_stripe_increment(struct bch_dev *ca, - struct dev_stripe_state *stripe) +static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, + struct dev_stripe_state *stripe, + struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = dev_buckets_available(ca, RESERVE_none); @@ -633,6 +635,15 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, *v = *v < scale ? 0 : *v - scale; } +void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) +{ + struct bch_dev_usage usage; + + bch2_dev_usage_read_fast(ca, &usage); + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); +} + #define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) #define BUCKET_ALLOC_USE_DURABILITY (1 << 1) @@ -677,6 +688,7 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, BUG_ON(*nr_effective >= nr_replicas); for (i = 0; i < devs_sorted.nr; i++) { + struct bch_dev_usage usage; struct open_bucket *ob; dev = devs_sorted.devs[i]; @@ -696,9 +708,9 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, } ob = bch2_bucket_alloc_trans(trans, ca, reserve, - flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); if (!IS_ERR(ob)) - bch2_dev_stripe_increment(ca, stripe); + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); percpu_ref_put(&ca->ref); if (IS_ERR(ob)) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5cb4a00166f9..1a1790ac01ae 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -88,20 +88,17 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, : ca->usage[journal_seq & JOURNAL_BUF_MASK]); } -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { struct bch_fs *c = ca->fs; - struct bch_dev_usage ret; unsigned seq, i, u64s = dev_usage_u64s(); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + memcpy(usage, ca->usage_base, u64s * sizeof(u64)); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; } static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index b4cf10a47c52..a43622193355 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -120,7 +120,15 @@ static inline u8 ptr_stale(struct bch_dev *ca, /* Device usage: */ -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); +static inline struct 
bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +{ + struct bch_dev_usage ret; + + bch2_dev_usage_read_fast(ca, &ret); + return ret; +} + void bch2_dev_usage_init(struct bch_dev *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) -- cgit From 3e3e02e6bce627ed9e3a5d9fd3118e6569dc2548 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Oct 2022 18:31:33 -0400 Subject: bcachefs: Assorted checkpatch fixes checkpatch.pl gives lots of warnings that we don't want - suggested ignore list: ASSIGN_IN_IF UNSPECIFIED_INT - bcachefs coding style prefers single token type names NEW_TYPEDEFS - typedefs are occasionally good FUNCTION_ARGUMENTS - we prefer to look at functions in .c files (hopefully with docbook documentation), not .h file prototypes MULTISTATEMENT_MACRO_USE_DO_WHILE - we have _many_ x-macros and other macros where we can't do this Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/bcachefs_format.h | 22 +++++++++++----------- fs/bcachefs/bkey.c | 4 ++-- fs/bcachefs/bkey.h | 5 +++-- fs/bcachefs/bset.c | 4 ++-- fs/bcachefs/btree_cache.c | 4 ++-- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_key_cache.c | 10 +++++----- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 8 ++++---- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/debug.c | 2 +- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/extents.c | 4 ++-- fs/bcachefs/fs-common.c | 4 ++-- fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/fs.c | 7 +++---- fs/bcachefs/fsck.c | 3 ++- fs/bcachefs/inode.c | 2 +- fs/bcachefs/journal.c | 12 ++++++------ fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/journal_sb.c | 4 ++-- fs/bcachefs/move.c | 4 ++-- fs/bcachefs/recovery.c | 4 ++-- fs/bcachefs/siphash.c | 2 +- fs/bcachefs/super-io.c | 6 ++---- fs/bcachefs/super.c | 9 ++++----- fs/bcachefs/util.c | 2 -- 33 files changed, 72 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 7edebeed779e..2bf58aa89f71 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -173,7 +173,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, bkey_xattr_init(&xattr->k_i); xattr->k.u64s = u64s; xattr->v.x_type = acl_to_xattr_type(type); - xattr->v.x_name_len = 0, + xattr->v.x_name_len = 0; xattr->v.x_val_len = cpu_to_le16(acl_len); acl_header = xattr_val(&xattr->v); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5d7231979024..ccc6be5a002f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -490,7 +490,7 @@ again: * bch_bucket_alloc - allocate a single bucket from a specific device * * Returns index of bucket on success, 0 on failure - * */ + */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7ca1aa3a847f..544621dd4af4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -107,7 +107,7 @@ * * BTREE NODES: * - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. 
* * (If buckets are really big we'll only use part of the bucket for a btree node diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5471b797be93..2047484ebe4b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -340,7 +340,7 @@ static inline void bkey_init(struct bkey *k) * number. * * - WHITEOUT: for hash table btrees -*/ + */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ x(whiteout, 1) \ @@ -783,16 +783,16 @@ enum { * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL * flags) */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, - - __BCH_INODE_I_SIZE_DIRTY= 5, - __BCH_INODE_I_SECTORS_DIRTY= 6, - __BCH_INODE_UNLINKED = 7, - __BCH_INODE_BACKPTR_UNTRUSTED = 8, + __BCH_INODE_SYNC = 0, + __BCH_INODE_IMMUTABLE = 1, + __BCH_INODE_APPEND = 2, + __BCH_INODE_NODUMP = 3, + __BCH_INODE_NOATIME = 4, + + __BCH_INODE_I_SIZE_DIRTY = 5, + __BCH_INODE_I_SECTORS_DIRTY = 6, + __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 52af6f370eb9..e09a5e3fd709 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1113,10 +1113,10 @@ int bch2_bkey_cmp_packed(const struct btree *b, if (bkey_packed(l)) { __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void*) &unpacked; + l = (void *) &unpacked; } else if (bkey_packed(r)) { __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void*) &unpacked; + r = (void *) &unpacked; } return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 2e7e6b6b4af7..d1d9b5d7e2c9 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -142,8 +142,9 @@ int bkey_cmp_left_packed(const struct btree *b, } /* - * we prefer to pass bpos by ref, but it's often enough terribly convenient to - * pass it by by val... as much as I hate c++, const ref would be nice here: + * The compiler generates better code when we pass bpos by ref, but it's often + * enough terribly convenient to pass it by val... 
as much as I hate c++, const + * ref would be nice here: */ __pure __flatten static inline int bkey_cmp_left_packed_byval(const struct btree *b, diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index f29fb9327cf7..e92737eb34e6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -953,7 +953,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, t->size -= j - l; for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; + rw_aux_tree(b, t)[j].offset += shift; EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset == @@ -1254,7 +1254,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, bch2_btree_node_iter_sort(iter, b); } -noinline __flatten __attribute__((cold)) +noinline __flatten __cold static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 75bc18466e75..135c3ea1377d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -819,7 +819,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (likely(c->opts.btree_node_mem_ptr_optimization && b && b->hash_val == btree_ptr_hash_val(k))) - goto lock_node; + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -1059,7 +1059,7 @@ wait_on_io: /* XXX we're called from btree_gc which will be holding other btree * nodes locked - * */ + */ __bch2_btree_node_wait_on_read(b); __bch2_btree_node_wait_on_write(b); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5d19029477cf..1bc5bded0546 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -318,7 +318,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); + ret = set_node_min(c, cur, expected_start); } out: fsck_err: diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4402fcee26e3..51eac08e9eea 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2832,7 +2832,7 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) BUG_ON(trans->used_mempool); #ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); + p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); #endif if (!p) p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); @@ -3002,7 +3002,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, rcu_read_lock(); owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0;; + pid = owner ? 
owner->pid : 0; rcu_read_unlock(); prt_tab(out); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9a5729309b8f..179669dbd688 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_cache.h" @@ -315,7 +316,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) bool was_new = true; ck = bkey_cached_alloc(trans, path); - if (unlikely(IS_ERR(ck))) + if (IS_ERR(ck)) return ck; if (unlikely(!ck)) { @@ -435,7 +436,7 @@ err: return ret; } -noinline static int +static noinline int bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, unsigned flags) { @@ -616,7 +617,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: - * */ + */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| @@ -1019,8 +1020,7 @@ void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache * void bch2_btree_key_cache_exit(void) { - if (bch2_key_cache) - kmem_cache_destroy(bch2_key_cache); + kmem_cache_destroy(bch2_key_cache); } int __init bch2_btree_key_cache_init(void) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 5e9424fbc3be..1530457f0e69 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -681,7 +681,7 @@ int bch2_trans_relock(struct btree_trans *trans) struct btree_path *path; if (unlikely(trans->restarted)) - return - ((int) trans->restarted); + return -((int) trans->restarted); trans_for_each_path(trans, path) if (path->should_be_locked && diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9680d83f9036..30b7c46cb86b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2047,7 +2047,7 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans, goto out; ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -out : +out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1a1790ac01ae..17a1e4767077 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -933,7 +933,7 @@ int bch2_mark_extent(struct btree_trans *trans, { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1152,7 +1152,7 @@ int bch2_mark_reservation(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bch_fs_usage __percpu *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1231,7 +1231,7 @@ int bch2_mark_reflink_p(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; @@ -2102,5 +2102,5 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) return -ENOMEM; } - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index b5850a761b91..3268e8d48603 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -131,7 +131,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, size_t orig_len = len; int ret, i; - sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); + sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); if (!sg) return -ENOMEM; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c606f075688f..927deb3943b5 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -315,7 +315,7 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); m->op.version = k.k->version; - m->op.target = data_opts.target, + m->op.target = data_opts.target; m->op.write_point = wp; m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index a9e4180d6a80..16be8d3db2ad 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -476,7 +476,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, if (i->iter < tbl->size) { rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++;; + i->iter++; } else { done = true; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4d942d224a08..288f46b55876 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -103,7 +103,7 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k),dirent_val_u64s(len)); + bkey_val_u64s(k.k), dirent_val_u64s(len)); return -EINVAL; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 38836c1990aa..bb1b862bfa65 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -291,7 +291,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= lp.crc.uncompressed_size) { /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset ) { + } else if (lp.crc.live_size <= rp.crc.offset) { /* can use right extent's crc entry */ } else { /* check if checksums can be merged: */ @@ -350,7 +350,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (crc_l.offset + crc_l.live_size + crc_r.live_size <= crc_l.uncompressed_size) { /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset ) { + } else if (crc_l.live_size <= crc_r.offset) { /* can use right extent's crc entry */ crc_r.offset -= crc_l.live_size; bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index e9dd1d13ec7e..1f2e1fc4f6b2 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -487,11 +487,11 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: (src_dir.inum != dst_dir.inum ? 
bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) - : 0 ) ?: + : 0) ?: bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: (dst_inum.inum ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) - : 0 ); + : 0); err: bch2_trans_iter_exit(trans, &dst_inode_iter); bch2_trans_iter_exit(trans, &src_inode_iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 274dc78916f8..b1d53290f6ba 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2724,7 +2724,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len truncate_pagecache_range(&inode->v, offset, end - 1); - if (block_start < block_end ) { + if (block_start < block_end) { s64 i_sectors_delta = 0; ret = bch2_fpunch(c, inode_inum(inode), diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 66fcd3e28e0c..485cb9cbcd51 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -528,7 +528,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - if (unlikely(IS_ERR(inode))) + if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); @@ -1847,7 +1847,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); c->vfs_sb = sb; - strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); ret = super_setup_bdi(sb); if (ret) @@ -1918,8 +1918,7 @@ MODULE_ALIAS_FS("bcachefs"); void bch2_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); - if (bch2_inode_cache) - kmem_cache_destroy(bch2_inode_cache); + kmem_cache_destroy(bch2_inode_cache); } int __init bch2_vfs_init(void) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 12f2ef4417cb..ca95d85b7348 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2044,7 +2044,8 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, { if (t->nr == t->size) { size_t new_size = max_t(size_t, 128UL, t->size * 2); - void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); + if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 18cfad860ddf..4161cd850eb8 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -314,7 +314,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) return -EINVAL; } - if (bch2_inode_unpack(k, &unpacked)){ + if (bch2_inode_unpack(k, &unpacked)) { prt_printf(err, "invalid variable length fields"); return -EINVAL; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 97c1ecb65dbd..ed3ed3072db1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -738,7 +738,7 @@ int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
return ret; entry = container_of(journal_res_entry(j, &res), - struct jset_entry_log, entry);; + struct jset_entry_log, entry); memset(entry, 0, u64s * sizeof(u64)); entry->entry.type = BCH_JSET_ENTRY_log; entry->entry.u64s = u64s - 1; @@ -795,10 +795,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch2_journal_block(&c->journal); } - bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); - ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); - new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); - new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); + ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); + new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); + new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; goto err_unblock; @@ -1264,7 +1264,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) rcu_read_lock(); s = READ_ONCE(j->reservations); - prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); + prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a4f9d01d33cc..b683a13dbf87 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -232,7 +232,7 @@ void bch2_journal_space_available(struct journal *j) if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && - (clean_ondisk * 2 > clean )) + (clean_ondisk * 2 > clean)) set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index cfdbd92d2164..c19db0425dd7 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -31,7 +31,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, if (!nr) return 0; - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); if (!b) return -ENOMEM; @@ -114,7 +114,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, if (!nr) return 0; - b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); + b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); if (!b) return -ENOMEM; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f00c57c8e7a3..7a9d1e4466c5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -479,7 +479,7 @@ static int __bch2_move_data(struct moving_context *ctxt, /* * The iterator gets unlocked by __bch2_read_extent - need to * save a copy of @k elsewhere: - */ + */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); @@ -667,7 +667,7 @@ static bool migrate_pred(struct bch_fs *c, void *arg, i++; } - return data_opts->rewrite_ptrs != 0;; + return data_opts->rewrite_ptrs != 0; } static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ea8cc636a9e0..580ff915d0e6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -224,7 +224,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + new_keys.d = 
kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); if (!new_keys.d) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); @@ -501,7 +501,7 @@ static int journal_keys_sort(struct bch_fs *c) keys->size = roundup_pow_of_two(nr_keys); - keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); if (!keys->d) return -ENOMEM; diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c index c062edb3fbc2..dc1a27cc31cd 100644 --- a/fs/bcachefs/siphash.c +++ b/fs/bcachefs/siphash.c @@ -160,7 +160,7 @@ u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); memset(ctx, 0, sizeof(*ctx)); - return (r); + return r; } u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2a347efdbd83..42e3ce7c0f8c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -101,8 +101,7 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { - if (sb->bio) - kfree(sb->bio); + kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) blkdev_put(sb->bdev, sb->holder); kfree(sb->holder); @@ -151,8 +150,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); - if (sb->bio) - kfree(sb->bio); + kfree(sb->bio); sb->bio = bio; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c69d64555339..8ee0783a1e78 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -461,8 +461,8 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->io_complete_wq ) - destroy_workqueue(c->io_complete_wq ); + if (c->io_complete_wq) + destroy_workqueue(c->io_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -712,7 +712,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strlcpy(c->name, name.buf, sizeof(c->name)); + strscpy(c->name, name.buf, sizeof(c->name)); printbuf_exit(&name); ret = name.allocation_failure ? -ENOMEM : 0; @@ -1786,9 +1786,8 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { + if (ret) goto err; - } mutex_lock(&c->sb_lock); mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a58239fb2a6d..8b2eef24498e 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -789,8 +789,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -#include "eytzinger.h" - static int alignment_ok(const void *base, size_t align) { return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || -- cgit From c81f5836a41fc796f37a5ff2bb39f7c76d07d35d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 17:26:49 -0400 Subject: bcachefs: Don't touch c->flags in bch2_trans_iter_init() This moves the JOURNAL_REPLAY_DONE flag check from bch2_trans_iter_init() to bch2_trans_init(), where we stash a copy in btree_trans - gaining us a small performance improvement. 
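The pattern is simply to read the shared flag once when the transaction is created and to test a transaction-local copy on the per-iterator fast path, so iterator setup no longer touches the frequently-accessed c->flags word. A minimal, self-contained C sketch of that idea follows; the type, field and flag names (example_fs, journal_replay_done, EX_JOURNAL_REPLAY_DONE) are illustrative stand-ins, not the exact bcachefs identifiers:

#include <stdbool.h>

/* illustrative stand-ins for the real bcachefs types and flags: */
struct example_fs {
	unsigned long flags;		/* shared, widely-read filesystem state */
};

#define EX_JOURNAL_REPLAY_DONE	(1UL << 0)

struct example_trans {
	struct example_fs *c;
	bool journal_replay_done;	/* copy stashed at transaction init */
};

static void example_trans_init(struct example_trans *trans, struct example_fs *c)
{
	trans->c = c;
	/* read the shared flag once, outside the hot path: */
	trans->journal_replay_done = !!(c->flags & EX_JOURNAL_REPLAY_DONE);
}

static bool example_iter_wants_journal_keys(struct example_trans *trans)
{
	/* per-iterator fast path: only transaction-local state is touched */
	return !trans->journal_replay_done;
}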
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51eac08e9eea..15bf079e17ab 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1167,10 +1167,12 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, (void *) src + offset, sizeof(struct btree_path) - offset); - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->c.lock, - __btree_lock_want(dst, i)); + for (i = 0; i < BTREE_MAX_DEPTH; i++) { + unsigned t = btree_node_locked_type(dst, i); + + if (t != BTREE_NODE_UNLOCKED) + six_lock_increment(&dst->l[i].b->c.lock, t); + } trans->paths_sorted = false; } @@ -2631,7 +2633,7 @@ static inline void __bch2_trans_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - if (trans->restarted) + if (unlikely(trans->restarted)) panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", bch2_err_str(trans->restarted), (void *) trans->last_restarted_ip); -- cgit From 005def8ff16885743b9d711fe19fc039c0254eed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 17:37:42 -0400 Subject: bcachefs: Optimize __bch2_btree_node_iter_advance() This replaces an expensive memmove() call with an open-coded version. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index e92737eb34e6..aa8508efca00 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1429,7 +1429,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); if (unlikely(__btree_node_iter_set_end(iter, 0))) { - bch2_btree_node_iter_set_drop(iter, iter->data); + /* avoid an expensive memmove call: */ + iter->data[0] = iter->data[1]; + iter->data[1] = iter->data[2]; + iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; return; } -- cgit From 77671e8fffdd09f37de2ed0cdcdc92069f3597e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 19:15:07 -0400 Subject: bcachefs: Move bkey bkey_unpack_key() to bkey.h Long ago, bkey_unpack_key() was added to bset.h instead of bkey.h because bkey.h didn't include btree_types.h, which it needs for the compiled unpack function. This patch finally moves it to the proper location. 
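The reason the helper could not live in bkey.h before is that it dereferences struct btree (b->aux_data, b->format), and an inline function in a header needs the full definition of any type it looks inside, not just a forward declaration. A small generic C sketch of that constraint, with a hypothetical widget type standing in for struct btree:

struct widget;			/* forward declaration only: incomplete type */

/*
 * static inline int peek(const struct widget *w) { return w->x; }
 * ^ would fail to compile at this point: member access needs the full type.
 */

struct widget {			/* full definition (what btree_types.h provides) */
	int x;
};

static inline int peek(const struct widget *w)
{
	return w->x;		/* fine once the definition is visible */
}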
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 94 +++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/bkey_buf.h | 1 + fs/bcachefs/bset.h | 93 ---------------------------------------------- fs/bcachefs/btree_cache.h | 1 + fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/buckets.h | 2 - fs/bcachefs/inode.h | 1 + fs/bcachefs/keylist.c | 1 + fs/bcachefs/replicas.h | 1 + 9 files changed, 100 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index d1d9b5d7e2c9..137b2d8bdb49 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -5,6 +5,7 @@ #include #include "bcachefs_format.h" +#include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -365,6 +366,99 @@ void bch2_bkey_unpack(const struct btree *, struct bkey_i *, bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, const struct bkey_format *); +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } else { + *dst = __bch2_bkey_unpack_key(&b->format, src); + } +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); + return dst; +} + +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? 
bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + static inline u64 bkey_field_max(const struct bkey_format *f, enum bch_bkey_fields nr) { diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h index 0d7c67a959af..a30c4ae8eb36 100644 --- a/fs/bcachefs/bkey_buf.h +++ b/fs/bcachefs/bkey_buf.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BKEY_BUF_H #include "bcachefs.h" +#include "bkey.h" struct bkey_buf { struct bkey_i *k; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index e458d1acdef4..b352d5a40de0 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -205,99 +205,6 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) return btree_aux_data_bytes(b) / sizeof(u64); } -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline void -__bkey_unpack_key_format_checked(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - bch2_expensive_debug_checks) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); - } - } else { - *dst = __bch2_bkey_unpack_key(&b->format, src); - } -} - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - - __bkey_unpack_key_format_checked(b, &dst, src); - return dst; -} - -static inline void __bkey_unpack_key(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (likely(bkey_packed(src))) - __bkey_unpack_key_format_checked(b, dst, src); - else - *dst = *packed_to_bkey_c(src); -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - #define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index a4df3e866bb8..238da8dbc5da 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "btree_types.h" +#include "bkey_methods.h" extern const char * const bch2_btree_node_flags[]; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 2b57e6d6ed31..c2bb6b656f4e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include #include -#include "bkey_methods.h" +//#include "bkey_methods.h" #include "buckets_types.h" #include "darray.h" #include "journal_types.h" diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a43622193355..ff61a0054eaa 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -229,8 +229,6 @@ int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); - int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 2ac2fc10513b..717a0bc95d93 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_INODE_H #define _BCACHEFS_INODE_H +#include "bkey.h" #include "opts.h" extern const char * const bch2_inode_opts[]; diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index cda77835b9ea..5e85055b0f93 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey.h" #include "keylist.h" int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 87820b2e1ad3..cc34b3809206 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H +#include "bkey.h" #include "eytzinger.h" #include "replicas_types.h" -- cgit From e5baf3dad91a6561ab81e2514217876d58648c1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 19:20:09 -0400 Subject: bcachefs: bch2_bkey_cmp_packed_inlined() This adds an inlined version of bch2_bkey_cmp_packed(), and uses it in bch2_sort_keys(), where it's part of the inner loop. 
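The shape is the usual one for hot-path comparators: the full implementation moves into a header as an inline function, the existing out-of-line symbol becomes a thin wrapper around it, and only the inner loop that measurably benefits calls the inlined variant directly. A reduced sketch of that shape, where the comparator body is a placeholder rather than the real packed-key comparison:

/* in a header, available for inlining into hot loops: */
static inline int cmp_inlined(unsigned long l, unsigned long r)
{
	return (l > r) - (l < r);	/* stand-in for the packed-key compare */
}

/* in one .c file, the out-of-line version other callers keep using,
 * so the comparator isn't duplicated at every ordinary call site: */
int cmp_outofline(unsigned long l, unsigned long r)
{
	return cmp_inlined(l, r);
}

/* a toy hot loop (one bubble pass) calling the inlined variant directly: */
static void hot_loop_pass(unsigned long *v, int n)
{
	for (int i = 0; i + 1 < n; i++)
		if (cmp_inlined(v[i], v[i + 1]) > 0) {
			unsigned long t = v[i];
			v[i] = v[i + 1];
			v[i + 1] = t;
		}
}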
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 108 ++-------------------------------------- fs/bcachefs/bkey_cmp.h | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/bkey_sort.c | 5 +- 3 files changed, 135 insertions(+), 107 deletions(-) create mode 100644 fs/bcachefs/bkey_cmp.h (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index e09a5e3fd709..161b5bd60a63 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey.h" +#include "bkey_cmp.h" #include "bkey_methods.h" #include "bset.h" #include "util.h" @@ -763,50 +764,6 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) #ifdef HAVE_BCACHEFS_COMPILED_UNPACK -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} - #define I(_x) (*(out)++ = (_x)) #define I1(i0) I(i0) #define I2(i0, i1) (I1(i0), I(i1)) @@ -1037,40 +994,6 @@ int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) } #else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (!nr_key_bits || l_v != r_v) - break; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } - - return cmp_int(l_v, r_v); -} #endif __pure @@ -1078,19 +1001,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, const struct bkey_packed *r, const struct btree *b) { - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), - bkey_unpack_pos(b, r))); - return ret; + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); } __pure __flatten @@ -1106,20 +1017,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - struct bkey unpacked; - - if (likely(bkey_packed(l) && bkey_packed(r))) - return __bch2_bkey_cmp_packed_format_checked(l, r, b); - - if (bkey_packed(l)) { - __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void *) &unpacked; - } else if (bkey_packed(r)) { - __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void *) &unpacked; - } - - return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bch2_bkey_cmp_packed_inlined(b, l, r); } __pure __flatten diff --git a/fs/bcachefs/bkey_cmp.h 
b/fs/bcachefs/bkey_cmp.h new file mode 100644 index 000000000000..5f42a6e69360 --- /dev/null +++ b/fs/bcachefs/bkey_cmp.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_CMP_H +#define _BCACHEFS_BKEY_CMP_H + +#include "bkey.h" + +#ifdef CONFIG_X86_64 +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + "1:;" + "mov r8, [rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (!nr_key_bits || l_v != r_v) + break; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } + + return cmp_int(l_v, r_v); +} +#endif + +static inline __pure __flatten +int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; +} + +static inline __pure __flatten +int bch2_bkey_cmp_packed_inlined(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + struct bkey unpacked; + + if (likely(bkey_packed(l) && bkey_packed(r))) + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); + + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void *) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void *) &unpacked; + } + + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); +} + +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index b1385a77da11..be0d4bc1afd3 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_buf.h" +#include "bkey_cmp.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -155,7 +156,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bch2_bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -177,7 +178,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, continue; while ((next = 
sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; -- cgit From fd0c767966327c1b938b489de0f6d2d2036b9055 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Oct 2022 15:00:16 -0400 Subject: bcachefs: Convert to __packed and __aligned Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 92 +++++++++++++++++++++---------------------- fs/bcachefs/bcachefs_ioctl.h | 8 ++-- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/inode.h | 2 +- 4 files changed, 52 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2047484ebe4b..35fe7002b37d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -151,7 +151,7 @@ struct bpos { #else #error edit for your odd byteorder. #endif -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); #define KEY_INODE_MAX ((__u64)~0ULL) #define KEY_OFFSET_MAX ((__u64)~0ULL) @@ -185,7 +185,7 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); struct bkey { /* Size of combined key and value, in u64s */ @@ -218,7 +218,7 @@ struct bkey { __u8 pad[1]; #endif -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bkey_packed { __u64 _data[0]; @@ -252,7 +252,7 @@ struct bkey_packed { * to the same size as struct bkey should hopefully be safest. */ __u8 pad[sizeof(struct bkey) - 3]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) #define BKEY_U64s_MAX U8_MAX @@ -480,7 +480,7 @@ struct bch_set { struct bch_csum { __le64 lo; __le64 hi; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define BCH_EXTENT_ENTRY_TYPES() \ x(ptr, 0) \ @@ -517,7 +517,7 @@ struct bch_extent_crc32 { _compressed_size:7, type:2; #endif -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define CRC32_SIZE_MAX (1U << 7) #define CRC32_NONCE_MAX 0 @@ -543,7 +543,7 @@ struct bch_extent_crc64 { type:3; #endif __u64 csum_lo; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define CRC64_SIZE_MAX (1U << 9) #define CRC64_NONCE_MAX ((1U << 10) - 1) @@ -567,7 +567,7 @@ struct bch_extent_crc128 { type:4; #endif struct bch_csum csum; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define CRC128_SIZE_MAX (1U << 13) #define CRC128_NONCE_MAX ((1U << 13) - 1) @@ -593,7 +593,7 @@ struct bch_extent_ptr { cached:1, type:1; #endif -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -645,7 +645,7 @@ struct bch_btree_ptr { __u64 _data[0]; struct bch_extent_ptr start[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_btree_ptr_v2 { struct bch_val v; @@ -657,7 +657,7 @@ struct bch_btree_ptr_v2 { struct bpos min_key; __u64 _data[0]; struct bch_extent_ptr start[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); @@ -666,7 +666,7 @@ struct bch_extent { __u64 _data[0]; union bch_extent_entry start[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_reservation { struct bch_val v; @@ -674,7 +674,7 @@ struct bch_reservation { __le32 generation; __u8 nr_replicas; __u8 pad[3]; -} 
__attribute__((packed, aligned(8))); +} __packed __aligned(8); /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ @@ -708,7 +708,7 @@ struct bch_inode { __le32 bi_flags; __le16 bi_mode; __u8 fields[0]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_inode_v2 { struct bch_val v; @@ -718,14 +718,14 @@ struct bch_inode_v2 { __le64 bi_flags; __le16 bi_mode; __u8 fields[0]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_inode_generation { struct bch_val v; __le32 bi_generation; __le32 pad; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* * bi_subvol and bi_parent_subvol are only set for subvolume roots: @@ -846,7 +846,7 @@ struct bch_dirent { __u8 d_type; __u8 d_name[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define DT_SUBVOL 16 #define BCH_DT_MAX 17 @@ -869,7 +869,7 @@ struct bch_xattr { __u8 x_name_len; __le16 x_val_len; __u8 x_name[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* Bucket/allocation information: */ @@ -878,7 +878,7 @@ struct bch_alloc { __u8 fields; __u8 gen; __u8 data[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define BCH_ALLOC_FIELDS_V1() \ x(read_time, 16) \ @@ -897,7 +897,7 @@ struct bch_alloc_v2 { __u8 oldest_gen; __u8 data_type; __u8 data[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define BCH_ALLOC_FIELDS_V2() \ x(read_time, 64) \ @@ -916,7 +916,7 @@ struct bch_alloc_v3 { __u8 oldest_gen; __u8 data_type; __u8 data[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_alloc_v4 { struct bch_val v; @@ -932,7 +932,7 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; struct bpos backpointers[0]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) @@ -971,7 +971,7 @@ struct bch_quota_counter { struct bch_quota { struct bch_val v; struct bch_quota_counter c[Q_COUNTERS]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* Erasure coding */ @@ -987,7 +987,7 @@ struct bch_stripe { __u8 pad; struct bch_extent_ptr ptrs[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* Reflink: */ @@ -1004,14 +1004,14 @@ struct bch_reflink_p { */ __le32 front_pad; __le32 back_pad; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; __u64 _data[0]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_indirect_inline_data { struct bch_val v; @@ -1068,7 +1068,7 @@ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) struct bch_lru { struct bch_val v; __le64 idx; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define LRU_ID_STRIPES (1U << 16) @@ -1267,19 +1267,19 @@ struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; __u8 devs[]; -} __attribute__((packed)); +} __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; struct bch_replicas_entry_v0 entries[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; __u8 devs[]; -} __attribute__((packed)); +} __packed; #define replicas_entry_bytes(_i) \ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) @@ -1287,7 +1287,7 @@ struct 
bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; struct bch_replicas_entry entries[]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1304,7 +1304,7 @@ struct bch_sb_quota_type { struct bch_sb_field_quota { struct bch_sb_field field; struct bch_sb_quota_type q[QTYP_NR]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* BCH_SB_FIELD_disk_groups: */ @@ -1313,7 +1313,7 @@ struct bch_sb_field_quota { struct bch_disk_group { __u8 label[BCH_SB_LABEL_SIZE]; __le64 flags[2]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -1322,7 +1322,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; struct bch_disk_group entries[0]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* BCH_SB_FIELD_counters */ @@ -1504,7 +1504,7 @@ struct bch_sb_layout { __u8 nr_superblocks; __u8 pad[5]; __le64 sb_offset[61]; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #define BCH_SB_LAYOUT_SECTOR 7 @@ -1555,7 +1555,7 @@ struct bch_sb { struct bch_sb_field start[0]; __le64 _data[0]; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); /* * Flags: @@ -1914,26 +1914,26 @@ enum { struct jset_entry_usage { struct jset_entry entry; __le64 v; -} __attribute__((packed)); +} __packed; struct jset_entry_data_usage { struct jset_entry entry; __le64 v; struct bch_replicas_entry r; -} __attribute__((packed)); +} __packed; struct jset_entry_clock { struct jset_entry entry; __u8 rw; __u8 pad[7]; __le64 time; -} __attribute__((packed)); +} __packed; struct jset_entry_dev_usage_type { __le64 buckets; __le64 sectors; __le64 fragmented; -} __attribute__((packed)); +} __packed; struct jset_entry_dev_usage { struct jset_entry entry; @@ -1944,7 +1944,7 @@ struct jset_entry_dev_usage { __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; -} __attribute__((packed)); +} __packed; static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) { @@ -1955,7 +1955,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage struct jset_entry_log { struct jset_entry entry; u8 d[]; -} __attribute__((packed)); +} __packed; /* * On disk format for a journal entry: @@ -1990,7 +1990,7 @@ struct jset { struct jset_entry start[0]; __u64 _data[0]; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); @@ -2052,7 +2052,7 @@ struct bset { struct bkey_packed start[0]; __u64 _data[0]; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); @@ -2085,7 +2085,7 @@ struct btree_node { }; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); @@ -2106,6 +2106,6 @@ struct btree_node_entry { }; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); #endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 5e0062c6ec5c..f05881f7e113 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ 
b/fs/bcachefs/bcachefs_ioctl.h @@ -208,7 +208,7 @@ struct bch_ioctl_data { __u64 pad[8]; }; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); enum bch_data_event { BCH_DATA_EVENT_PROGRESS = 0, @@ -224,7 +224,7 @@ struct bch_ioctl_data_progress { __u64 sectors_done; __u64 sectors_total; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_ioctl_data_event { __u8 type; @@ -233,12 +233,12 @@ struct bch_ioctl_data_event { struct bch_ioctl_data_progress p; __u64 pad2[15]; }; -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); struct bch_replicas_usage { __u64 sectors; struct bch_replicas_entry r; -} __attribute__((packed)); +} __packed; static inline struct bch_replicas_usage * replicas_usage_next(struct bch_replicas_usage *u) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c2bb6b656f4e..ea844dd7a16b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -313,7 +313,7 @@ struct btree_key_cache { struct bkey_cached_key { u32 btree_id; struct bpos pos; -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 717a0bc95d93..5c80bdf587f9 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -66,7 +66,7 @@ struct bkey_inode_buf { #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; #undef x -} __attribute__((packed, aligned(8))); +} __packed __aligned(8); void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); -- cgit From df6a24f81aa29a0e844afb53f7d5bc8989cdbac3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Oct 2022 15:10:28 -0400 Subject: bcachefs: Make error messages more uniform Use __func__ in error messages that refer to function name, and do so more uniformly - suggested by checkpatch.pl Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 ++--- fs/bcachefs/buckets.c | 8 ++++--- fs/bcachefs/fsck.c | 26 ++++++++++---------- fs/bcachefs/tests.c | 47 +++++++++++++++++-------------------- 4 files changed, 43 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 30b7c46cb86b..0150943074fa 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -665,7 +665,7 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_unlock(&trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "error %i in btree_update_nodes_written()", ret); + "%s(): error %s", __func__, bch2_err_str(ret)); err: if (as->b) { struct btree_path *path; @@ -1839,10 +1839,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_bpos_to_text(&buf1, prev->data->max_key); bch2_bpos_to_text(&buf2, next->data->min_key); bch_err(c, - "btree topology error in btree merge:\n" + "%s(): btree topology error:\n" " prev ends at %s\n" " next starts at %s", - buf1.buf, buf2.buf); + __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); bch2_topology_error(c); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 17a1e4767077..fffe59f0e89b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -578,7 +578,8 @@ int bch2_mark_alloc(struct btree_trans *trans, -((s64) old_a.cached_sectors), journal_seq, gc); if (ret) { - bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached 
sectors"); + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); return ret; } } @@ -972,7 +973,8 @@ int bch2_mark_extent(struct btree_trans *trans, ret = update_cached_sectors(c, k, p.ptr.dev, disk_sectors, journal_seq, true); if (ret) { - bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); return ret; } } @@ -1000,7 +1002,7 @@ int bch2_mark_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ca95d85b7348..6f7310f010b9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -321,7 +321,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_exit(trans, &iter); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -506,7 +506,7 @@ static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id break; if (i->equiv == n.equiv) { - bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); + bch_err(c, "%s(): adding duplicate snapshot", __func__); return -EINVAL; } } @@ -1000,7 +1000,7 @@ static int check_inode(struct btree_trans *trans, err: fsck_err: if (ret) - bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1026,7 +1026,7 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_exit(&trans); snapshots_seen_exit(&s); if (ret) - bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1159,7 +1159,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; @@ -1295,7 +1295,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1337,7 +1337,7 @@ static int check_extents(struct bch_fs *c) snapshots_seen_exit(&s); if (ret) - bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1376,7 +1376,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; @@ -1497,7 +1497,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return 
ret; } @@ -1667,7 +1667,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1706,7 +1706,7 @@ static int check_dirents(struct bch_fs *c) inode_walker_exit(&target); if (ret) - bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1742,7 +1742,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } @@ -1774,7 +1774,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bed830e678bb..72364313126b 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -46,7 +46,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); + bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret)); goto err; } @@ -55,7 +55,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); + bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret)); goto err; } @@ -64,7 +64,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); + bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret)); goto err; } err: @@ -92,7 +92,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); + bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); goto err; } @@ -103,7 +103,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); + bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret)); goto err; } err: @@ -136,7 +136,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); + bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); goto err; } } @@ -202,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); + bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); goto err; } } @@ -269,7 +269,7 @@ static int 
test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); + bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); goto err; } } @@ -342,7 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); + bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); goto err; } } @@ -456,7 +456,7 @@ static int insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) - bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); + bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); return ret; } @@ -555,7 +555,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) ret = test_snapshot_filter(c, snapids[0], snapids[1]); if (ret) { - bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); + bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret)); return ret; } @@ -567,11 +567,8 @@ static int test_snapshots(struct bch_fs *c, u64 nr) static u64 test_rand(void) { u64 v; -#if 0 - v = prandom_u32_max(U32_MAX); -#else + get_random_bytes(&v, sizeof(v)); -#endif return v; } @@ -592,7 +589,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); if (ret) { - bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; } } @@ -628,7 +625,7 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); if (ret) { - bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; } } @@ -655,7 +652,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) { - bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; } } @@ -678,7 +675,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); + bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret)); if (ret) return ret; @@ -708,7 +705,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); + bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); break; } } @@ -754,7 +751,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); if (ret) { - bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; } } @@ -786,7 +783,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_trans_update(&trans, &iter, &insert.k_i, 0); })); if (ret) - bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + bch_err(c, "%s(): 
error %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -805,7 +802,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) SPOS(0, 0, U32_MAX), 0, k, 0); if (ret) - bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -831,7 +828,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_update(&trans, &iter, &u.k_i, 0); })); if (ret) - bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; @@ -845,7 +842,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) SPOS(0, 0, U32_MAX), SPOS_MAX, 0, NULL); if (ret) - bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); + bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); return ret; } -- cgit From 1f69368c5cfce6770d101aaeff46ef22d22de07c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Oct 2022 16:19:27 -0400 Subject: bcachefs: Fix an out-of-bounds shift roundup_pow_of_two() is undefined for 0 - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 15bf079e17ab..611f7b0ca014 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2879,7 +2879,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ bch2_trans_alloc_paths(trans, c); s = btree_trans_stats(trans); - if (s) { + if (s && s->max_mem) { unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); @@ -2890,9 +2890,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ } else { trans->mem_bytes = expected_mem_bytes; } - - trans->nr_max_paths = s->nr_max_paths; } + if (s) + trans->nr_max_paths = s->nr_max_paths; trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -- cgit From c167f9e54100179a009051ad6eac1dfb0bcd21f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Oct 2022 17:37:23 -0400 Subject: bcachefs: Journal keys overlay fixes - In the btree iterator code that overlays keys from the journal, we were incorrectly specifying level=0 instead of the btree_path's current level in a few places - When we didn't do journal replay, we shouldn't free the journal keys: this fixes cmd_list and cmd_dump, which run in norecovery mode Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/recovery.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 611f7b0ca014..5af295317cee 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1834,7 +1834,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, { struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : iter->path->l[0].b->key.k.p); + k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 580ff915d0e6..b2379adcf8ae 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1398,7 +1398,8 @@ out: set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal) { + if (!c->opts.keep_journal && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { bch2_journal_keys_free(&c->journal_keys); bch2_journal_entries_free(c); } -- cgit From 353448f3ea42e5deec298d6d2c577ade7028b7fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Oct 2022 22:01:50 -0400 Subject: bcachefs: Fix buffered write path for generic/275 Per fstests generic/275, on -ENOSPC we're supposed write until the filesystem is full - i.e. do a partial write instead of failing the full write. This is a partial fix for the buffered write path: we'll still fail on a page boundary. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b1d53290f6ba..49b0fb6522e7 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1656,10 +1656,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, goto out; } + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ ret = bch2_page_reservation_get(c, inode, page, &res, pg_offset, pg_len); - if (ret) - goto out; + if (unlikely(ret)) { + if (!reserved) + goto out; + break; + } reserved += pg_len; } @@ -1668,10 +1679,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, for (i = 0; i < nr_pages; i++) flush_dcache_page(pages[i]); - while (copied < len) { + while (copied < reserved) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, len - copied, + unsigned pg_len = min_t(unsigned, reserved - copied, PAGE_SIZE - pg_offset); unsigned pg_copied = copy_page_from_iter_atomic(page, pg_offset, pg_len, iter); -- cgit From 80fe580c8db02059d833d2ded6143e90641184ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Oct 2022 15:10:14 -0400 Subject: bcachefs: Fix a spurious warning Fixes fstests generic/648 Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 49b0fb6522e7..9e9ada8f007a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1428,7 +1428,8 @@ do_io: /* Check for writing past i_size: */ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; -- cgit From 8852501fe570c4956c0e29246e1e5636f09b58fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Oct 2022 13:34:17 -0400 Subject: bcachefs: Improve fs_usage_apply_warn() message Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 
fffe59f0e89b..2e657ded03ff 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1276,23 +1276,24 @@ void fs_usage_apply_warn(struct btree_trans *trans, struct btree_insert_entry *i; struct printbuf buf = PRINTBUF; - bch_err(c, "disk usage increased %lli more than %u sectors reserved", - should_not_have_added, disk_res_sectors); + prt_printf(&buf, + bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), + should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - pr_err("while inserting"); - printbuf_reset(&buf); + prt_str(&buf, "new "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err(" %s", buf.buf); - pr_err("overlapping with"); - printbuf_reset(&buf); + prt_newline(&buf); + + prt_str(&buf, "old "); bch2_bkey_val_to_text(&buf, c, old); - pr_err(" %s", buf.buf); + prt_newline(&buf); } __WARN(); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } -- cgit From 46fee692eebb850b8478531e185fb5a5f942d3ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Oct 2022 17:08:41 -0400 Subject: bcachefs: Improved btree write statistics This replaces sysfs btree_avg_write_size with btree_write_stats, which now breaks out statistics by the source of the btree write. Btree writes that are too small are a source of inefficiency, and excessive btree resort overhead - this will let us see what's causing them. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 29 +++++++++++++++++++---- fs/bcachefs/btree_cache.c | 11 +++++---- fs/bcachefs/btree_io.c | 46 +++++++++++++++++++++++++++++++++---- fs/bcachefs/btree_io.h | 10 ++++++-- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 1 + fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/btree_update_leaf.c | 2 ++ fs/bcachefs/sysfs.c | 16 ++++--------- 9 files changed, 91 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 544621dd4af4..18fe09cdae4d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -596,6 +596,23 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) + struct bch_fs { struct closure cl; @@ -705,6 +722,13 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + /* btree_io.c: */ + spinlock_t btree_write_error_lock; + struct btree_write_stats { + atomic64_t nr; + atomic64_t bytes; + } btree_write_stats[BTREE_WRITE_TYPE_NR]; + /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head btree_trans_list; @@ -880,11 +904,6 @@ mempool_t bio_bounce_pages; struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; - - atomic64_t btree_writes_nr; - atomic64_t btree_writes_sectors; - spinlock_t btree_write_error_lock; - /* ERRORS */ struct list_head fsck_errors; struct mutex fsck_error_lock; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 135c3ea1377d..709453a909fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -241,9 
+241,11 @@ wait_on_io: * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); else - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -347,7 +349,7 @@ restart: six_trylock_read(&b->c.lock)) { list_move(&bc->live, &b->list); mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); if (touched >= nr) goto out_nounlock; @@ -624,6 +626,7 @@ out: b->flags = 0; b->written = 0; b->nsets = 0; + b->write_type = 0; b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; @@ -1067,7 +1070,7 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a322a8367688..56f9637d2ca6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -471,7 +471,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) }; if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write, 0); + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); reinit_iter = true; } } @@ -1646,7 +1647,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); else wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1795,6 +1796,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool used_mempool; unsigned long old, new; bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; int ret; @@ -1841,6 +1843,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (new & (1U << BTREE_NODE_need_write)) return; do_write: + if ((flags & BTREE_WRITE_ONLY_IF_NEED)) + type = b->write_type; + b->write_type = 0; + + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2015,8 +2023,8 @@ do_write: bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); - atomic64_inc(&c->btree_writes_nr); - atomic64_add(sectors_to_write, &c->btree_writes_sectors); + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->io_complete_wq, &wbio->work); @@ -2144,3 +2152,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) { return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } + +const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < 
BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); + prt_newline(out); + } +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 8af853642123..4b1810ad7d91 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) -#define BTREE_WRITE_ALREADY_STARTED (1U << 1) +enum btree_write_flags { + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; +#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) +#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, bn->min_key = bpos_nosnap_successor(bn->min_key); } +void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ea844dd7a16b..38c4754dbd7e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -77,6 +77,7 @@ struct btree { u8 nsets; u8 nr_key_bits; u16 version_ondisk; + u8 write_type; struct bkey_format format; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0150943074fa..e0483abadd72 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1308,6 +1308,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); + b->write_type = BTREE_WRITE_interior; printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index dabe81596544..2e6d220c3bcd 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, struct bkey_packed k; BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { struct bkey *u = (void *) &k; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fc53958e5619..8cc271030be6 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new |= 1 << BTREE_NODE_need_write; } while ((v = cmpxchg(&b->flags, old, new)) != old); + b->write_type = BTREE_WRITE_journal_reclaim; + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 76301209898f..db3d377ba10c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -183,7 +183,7 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); -read_attribute(btree_avg_write_size); +read_attribute(btree_write_stats); read_attribute(btree_cache_size); 
read_attribute(compression_stats); @@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static size_t bch2_btree_avg_write_size(struct bch_fs *c) -{ - u64 nr = atomic64_read(&c->btree_writes_nr); - u64 sectors = atomic64_read(&c->btree_writes_sectors); - - return nr ? div64_u64(sectors, nr) : 0; -} - static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { long ret = 0; @@ -396,7 +388,9 @@ SHOW(bch2_fs) sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + + if (attr == &sysfs_btree_write_stats) + bch2_btree_write_stats_to_text(out, c); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -557,7 +551,7 @@ SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { &sysfs_minor, &sysfs_btree_cache_size, - &sysfs_btree_avg_write_size, + &sysfs_btree_write_stats, &sysfs_promote_whole_extents, -- cgit From 2cb75179694a646e192247cd56b62cf375af3ae9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Oct 2022 18:56:31 -0400 Subject: bcachefs: should_compact_all() This factors out a properly-documented helper for deciding when we want to sort a btree node with MAX_BSETS bsets down to a single bset. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 56f9637d2ca6..dd149de8d31d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -444,6 +444,24 @@ void bch2_btree_build_aux_trees(struct btree *b) t == bset_tree_last(b)); } +/* + * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? + * + * The first bset is going to be of similar order to the size of the node, the + * last bset is bounded by btree_write_set_buffer(), which is set to keep the + * memmove on insert from being too expensive: the middle bset should, ideally, + * be the geometric mean of the first and the last. + * + * Returns true if the middle bset is greater than that geometric mean: + */ +static inline bool should_compact_all(struct bch_fs *c, struct btree *b) +{ + unsigned mid_u64s_bits = + (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; + + return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; +} + /* * @bch_btree_init_next - initialize a new (unwritten) bset that can then be * inserted into @@ -461,20 +479,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) EBUG_ON(!(b->c.lock.state.seq & 1)); BUG_ON(bset_written(b, bset(b, &b->set[1]))); + BUG_ON(btree_node_just_written(b)); if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b)) { - unsigned log_u64s[] = { - ilog2(bset_u64s(&b->set[0])), - ilog2(bset_u64s(&b->set[1])), - ilog2(bset_u64s(&b->set[2])), - }; - - if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); - reinit_iter = true; - } + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); + reinit_iter = true; } if (b->nsets == MAX_BSETS && -- cgit From d4bce63636ab81ca4aed03d6641ad70c8416e921 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Oct 2022 22:21:11 -0400 Subject: bcachefs: Kill BCH_WRITE_JOURNAL_SEQ_PTR Dead code, delete. 
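To make the dead code concrete: op_journal_seq() selected between a caller-supplied sequence pointer and the op's own field based on BCH_WRITE_JOURNAL_SEQ_PTR, but nothing sets that flag any more, so the helper always resolved to &op->journal_seq. A condensed, standalone sketch of the pattern being removed (the demo_ types and the flag's bit value here are illustrative stand-ins, not the real bcachefs definitions):

#include <stdio.h>

/* Illustrative bit value only; the real flag lived in io.h until this patch */
#define BCH_WRITE_JOURNAL_SEQ_PTR	(1U << 4)

struct demo_write_op {
	unsigned flags;
	union {
		unsigned long long *journal_seq_p;	/* caller-owned sequence */
		unsigned long long  journal_seq;	/* op-owned sequence */
	};
};

/* The helper being deleted: with no caller ever setting the flag, this
 * always returns &op->journal_seq, so the union and flag are dead weight. */
static unsigned long long *op_journal_seq(struct demo_write_op *op)
{
	return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
		? op->journal_seq_p : &op->journal_seq;
}

int main(void)
{
	struct demo_write_op op = { .flags = 0 };

	*op_journal_seq(&op) = 42;	/* old call sites went through the helper */
	op.journal_seq = 42;		/* what they become after this patch */

	printf("journal_seq = %llu\n", op.journal_seq);
	return 0;
}

With the pointer form gone, the union in io_types.h collapses to a plain journal_seq field, which is what the hunks below do.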
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/io.c | 4 ++-- fs/bcachefs/io.h | 8 -------- fs/bcachefs/io_types.h | 9 +-------- 4 files changed, 4 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 927deb3943b5..658868048c22 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -226,7 +226,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_trans_update(&trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, - op_journal_seq(op), + &op->journal_seq, BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 616407fa08ae..97427487aa79 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -507,7 +507,7 @@ static int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = bch2_extent_update(&trans, inum, &iter, sk.k, - &op->res, op_journal_seq(op), + &op->res, &op->journal_seq, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(&trans, &iter); @@ -776,7 +776,7 @@ unlock: __bch2_write(op); } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { bch2_journal_flush_seq_async(&op->c->journal, - *op_journal_seq(op), + op->journal_seq, &op->cl); continue_at(&op->cl, bch2_write_done, index_update_wq(op)); } else { diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index a3505762b68d..730c7a5bcce8 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -40,7 +40,6 @@ enum bch_write_flags { __BCH_WRITE_FROM_INTERNAL, __BCH_WRITE_CHECK_ENOSPC, __BCH_WRITE_MOVE, - __BCH_WRITE_JOURNAL_SEQ_PTR, __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, __BCH_WRITE_IO_ERROR, @@ -59,17 +58,10 @@ enum bch_write_flags { #define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) /* Internal: */ -#define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR) #define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) #define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) -static inline u64 *op_journal_seq(struct bch_write_op *op) -{ - return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) - ? op->journal_seq_p : &op->journal_seq; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_movinggc diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index c316a39d381a..685fb1183399 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -142,14 +142,7 @@ struct bch_write_op { struct open_buckets open_buckets; - /* - * If caller wants to flush but hasn't passed us a journal_seq ptr, we - * still need to stash the journal_seq somewhere: - */ - union { - u64 *journal_seq_p; - u64 journal_seq; - }; + u64 journal_seq; u64 new_i_size; s64 i_sectors_delta; -- cgit From a10195764901e0a41e64d596de57a957e7f982f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Oct 2022 15:59:53 -0400 Subject: bcachefs: More style fixes Fixes for various checkpatch errors. 
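Most of the churn below comes from one checkpatch rule: a macro whose value is a compound literal should expand to a single parenthesized expression, i.e. ((struct bkey_ops) { ... }) rather than (struct bkey_ops) { ... }. A standalone sketch of the before/after shape, using hypothetical demo_ names rather than the real bkey_ops fields, plus the designated-initializer usage that mirrors how bch2_bkey_ops[] consumes these macros:

#include <stdio.h>

struct demo_bkey_ops {
	int  (*key_invalid)(int key);
	void (*val_to_text)(int key);
};

static int  demo_key_invalid(int key) { return key < 0; }
static void demo_val_to_text(int key) { printf("key %d\n", key); }

/* Old style, flagged by checkpatch: the value is a bare compound literal */
#define demo_bkey_ops_old (struct demo_bkey_ops) {	\
	.key_invalid	= demo_key_invalid,		\
	.val_to_text	= demo_val_to_text,		\
}

/* New style from this patch: the whole value is one parenthesized expression */
#define demo_bkey_ops_new ((struct demo_bkey_ops) {	\
	.key_invalid	= demo_key_invalid,		\
	.val_to_text	= demo_val_to_text,		\
})

int main(void)
{
	/* Usable as a plain initializer... */
	struct demo_bkey_ops ops = demo_bkey_ops_new;

	/* ...and as table entries, the way bkey_methods.c builds its ops array */
	const struct demo_bkey_ops table[2] = {
		[0] = demo_bkey_ops_old,
		[1] = demo_bkey_ops_new,
	};

	printf("key_invalid(-1) = %d\n", ops.key_invalid(-1));
	table[1].val_to_text(42);
	return 0;
}

Both forms compile in these positions; the parenthesized form is preferred kernel style because the expansion is kept self-contained as a single expression.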
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 16 ++++++++-------- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/bkey.c | 3 --- fs/bcachefs/bkey_methods.c | 29 +++++++++++++++-------------- fs/bcachefs/bkey_methods.h | 2 +- fs/bcachefs/btree_cache.c | 9 +++++++-- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_io.c | 2 ++ fs/bcachefs/btree_iter.c | 2 ++ fs/bcachefs/btree_key_cache.h | 1 + fs/bcachefs/checksum.h | 10 +++++----- fs/bcachefs/dirent.h | 4 ++-- fs/bcachefs/ec.h | 4 ++-- fs/bcachefs/errcode.c | 1 + fs/bcachefs/extents.h | 22 +++++++++++++--------- fs/bcachefs/fifo.h | 2 +- fs/bcachefs/fsck.c | 3 +-- fs/bcachefs/inode.h | 12 ++++++------ fs/bcachefs/journal.h | 4 ++-- fs/bcachefs/lru.h | 4 ++-- fs/bcachefs/quota.h | 4 ++-- fs/bcachefs/recovery.c | 3 +-- fs/bcachefs/reflink.h | 12 ++++++------ fs/bcachefs/replicas_types.h | 1 + fs/bcachefs/subvolume.c | 1 + fs/bcachefs/subvolume.h | 8 ++++---- fs/bcachefs/sysfs.c | 12 +++++++----- fs/bcachefs/trace.h | 26 +++++++++++++------------- fs/bcachefs/xattr.h | 4 ++-- 30 files changed, 112 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 488db3211ce4..318beb588aa9 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -87,34 +87,34 @@ int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct pr void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_alloc (struct bkey_ops) { \ +#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ -} +}) -#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ +#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ -} +}) -#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ +#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ -} +}) -#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ +#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v4_invalid, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ -} +}) static inline bool bkey_is_alloc(const struct bkey *k) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 18fe09cdae4d..c0416258985b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -282,7 +282,7 @@ do { \ "When reading btree nodes, read all replicas and " \ "compare them") -/* Parameters that should only be compiled in in debug mode: */ +/* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ BCH_DEBUG_PARAM(expensive_debug_checks, \ "Enables various runtime debugging checks that " \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 35fe7002b37d..6e01fd81e3f0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1840,6 +1840,7 @@ enum bch_compression_opts { static inline __le64 __bch2_sb_magic(struct bch_sb *sb) { __le64 
ret; + memcpy(&ret, &sb->uuid, sizeof(ret)); return ret; } diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 161b5bd60a63..1c9c02deffbe 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -17,9 +17,6 @@ const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - void bch2_bkey_packed_to_binary_text(struct printbuf *out, const struct bkey_format *f, const struct bkey_packed *k) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f2351e5ee7c1..141754db5fa1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -28,13 +28,13 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } -#define bch2_bkey_ops_deleted (struct bkey_ops) { \ +#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ -} +}) -#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ +#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ -} +}) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) @@ -48,9 +48,9 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } -#define bch2_bkey_ops_error (struct bkey_ops) { \ +#define bch2_bkey_ops_error ((struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ -} +}) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) @@ -64,13 +64,13 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } -#define bch2_bkey_ops_cookie (struct bkey_ops) { \ +#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ -} +}) -#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ +#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ .key_invalid = empty_val_key_invalid, \ -} +}) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) @@ -88,10 +88,10 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ .key_invalid = key_type_inline_data_invalid, \ .val_to_text = key_type_inline_data_to_text, \ -} +}) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) @@ -111,10 +111,10 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ return true; } -#define bch2_bkey_ops_set (struct bkey_ops) { \ +#define bch2_bkey_ops_set ((struct bkey_ops) { \ .key_invalid = key_type_set_invalid, \ .key_merge = key_type_set_merge, \ -} +}) const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, @@ -439,6 +439,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, btree_id == BTREE_ID_inodes) { if (!bkey_packed(k)) { struct bkey_i *u = packed_to_bkey(k); + swap(u->k.p.inode, u->k.p.offset); } else if (f->bits_per_field[BKEY_FIELD_INODE] && f->bits_per_field[BKEY_FIELD_OFFSET]) { diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index cff6f6dc44c4..0c74ba335e64 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -18,7 +18,7 @@ extern const char * const bch2_bkey_types[]; * * When invalid, error string is 
returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. -*/ + */ struct bkey_ops { int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 709453a909fc..5adfdc5afbea 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -112,7 +112,9 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) { - struct btree *b = kzalloc(sizeof(struct btree), gfp); + struct btree *b; + + b = kzalloc(sizeof(struct btree), gfp); if (!b) return NULL; @@ -128,7 +130,9 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); + struct btree *b; + + b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -147,6 +151,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + BUG_ON(ret); /* Cause future lookups for this node to fail: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1bc5bded0546..3395fa56c724 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -199,7 +199,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) struct bkey_i_btree_ptr_v2 *new; int ret; - new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -ENOMEM; @@ -228,7 +228,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) if (ret) return ret; - new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -ENOMEM; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index dd149de8d31d..5d750f447241 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1224,6 +1224,7 @@ static void btree_node_read_endio(struct bio *bio) if (rb->have_ioref) { struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); } @@ -1411,6 +1412,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) if (rb->have_ioref) { struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5af295317cee..cbba0b79fdb8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -200,6 +200,7 @@ err: if (p) { struct bkey uk = bkey_unpack_key(l->b, p); + bch2_bkey_to_text(&buf2, &uk); } else { prt_printf(&buf2, "(none)"); @@ -207,6 +208,7 @@ err: if (k) { struct bkey uk = bkey_unpack_key(l->b, k); + bch2_bkey_to_text(&buf3, &uk); } else { prt_printf(&buf3, "(none)"); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 670746e72dab..eccea15fca79 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H diff --git a/fs/bcachefs/checksum.h 
b/fs/bcachefs/checksum.h index c86c3c05d620..3d6d13bcfd72 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -78,15 +78,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, { switch (type) { case BCH_CSUM_OPT_none: - return BCH_CSUM_none; + return BCH_CSUM_none; case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_xxhash; + return BCH_CSUM_xxhash; default: - BUG(); + BUG(); } } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index b1466932c768..1a2c9108f864 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -9,10 +9,10 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_dirent (struct bkey_ops) { \ +#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ .val_to_text = bch2_dirent_to_text, \ -} +}) struct qstr; struct file; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index c53187df4651..8596fa763b4c 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,13 +12,13 @@ int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_stripe (struct bkey_ops) { \ +#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_stripe, \ .atomic_trigger = bch2_mark_stripe, \ -} +}) static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) { diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index cc9ce0be356e..dc906fc9176f 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -23,6 +23,7 @@ static unsigned bch2_errcode_parents[] = { const char *bch2_err_str(int err) { const char *errstr; + err = abs(err); BUG_ON(err >= BCH_ERR_MAX); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3c17b81130bb..224df17206cb 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -198,6 +198,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) switch (k.k->type) { case KEY_TYPE_btree_ptr: { struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), to_entry(extent_entry_last(e)) @@ -205,6 +206,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) } case KEY_TYPE_extent: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct bkey_ptrs_c) { e.v->start, extent_entry_last(e) @@ -212,6 +214,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) } case KEY_TYPE_stripe: { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { to_entry(&s.v->ptrs[0]), to_entry(&s.v->ptrs[s.v->nr_blocks]), @@ -227,6 +230,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) } case KEY_TYPE_btree_ptr_v2: { struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); + return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), to_entry(extent_entry_last(e)) @@ -342,7 +346,7 @@ out: \ #define 
extent_for_each_entry_from(_e, _entry, _start) \ __bkey_extent_entry_for_each_from(_start, \ - extent_entry_last(_e),_entry) + extent_entry_last(_e), _entry) #define extent_for_each_entry(_e, _entry) \ extent_for_each_entry_from(_e, _entry, (_e).v->start) @@ -376,28 +380,28 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); -#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ +#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_extent, \ .atomic_trigger = bch2_mark_extent, \ -} +}) -#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ +#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_v2_invalid, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ .trans_trigger = bch2_trans_mark_extent, \ .atomic_trigger = bch2_mark_extent, \ -} +}) /* KEY_TYPE_extent: */ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -#define bch2_bkey_ops_extent (struct bkey_ops) { \ +#define bch2_bkey_ops_extent ((struct bkey_ops) { \ .key_invalid = bch2_bkey_ptrs_invalid, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ @@ -405,7 +409,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_merge = bch2_extent_merge, \ .trans_trigger = bch2_trans_mark_extent, \ .atomic_trigger = bch2_mark_extent, \ -} +}) /* KEY_TYPE_reservation: */ @@ -414,13 +418,13 @@ int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -#define bch2_bkey_ops_reservation (struct bkey_ops) { \ +#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trans_trigger = bch2_trans_mark_reservation, \ .atomic_trigger = bch2_mark_reservation, \ -} +}) /* Extent checksum entries: */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index cdb272708a4b..66b945be10c2 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -65,7 +65,7 @@ do { \ (((p) - (fifo)->data))) #define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] +#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) #define fifo_push_back_ref(f) \ (fifo_full((f)) ? 
NULL : &(f)->data[(f)->back++ & (f)->mask]) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6f7310f010b9..f4f0e0cec85d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -848,8 +848,7 @@ out: printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " - "hashed to %llu\n%s", + if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5c80bdf587f9..9ea0d575a183 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -11,19 +11,19 @@ int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct print int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_inode (struct bkey_ops) { \ +#define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ -} +}) -#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ +#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ -} +}) static inline bool bkey_is_inode(const struct bkey *k) { @@ -35,10 +35,10 @@ int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ +#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ .key_invalid = bch2_inode_generation_invalid, \ .val_to_text = bch2_inode_generation_to_text, \ -} +}) #if 0 typedef struct { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index d3caa7ea7ce9..3e93f0d67c09 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -29,8 +29,8 @@ * * Synchronous updates are specified by passing a closure (@flush_cl) to * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter - * down to the journalling code. That closure will will wait on the journal - * write to complete (via closure_wait()). + * down to the journalling code. That closure will wait on the journal write to + * complete (via closure_wait()). 
* * If the index update wasn't synchronous, the journal entry will be * written out after 10 ms have elapsed, by default (the delay_ms field diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 3decb7b1dde2..925c29b49b86 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -5,10 +5,10 @@ int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_lru (struct bkey_ops) { \ +#define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_invalid = bch2_lru_invalid, \ .val_to_text = bch2_lru_to_text, \ -} +}) int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 8c67ae1da7c7..59bed1148201 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -10,10 +10,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_quota (struct bkey_ops) { \ +#define bch2_bkey_ops_quota ((struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ .val_to_text = bch2_quota_to_text, \ -} +}) static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b2379adcf8ae..7eaced534a5b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1509,8 +1509,7 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; bch_verbose(c, "reading snapshots done"); - bch2_inode_init(c, &root_inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; bch2_inode_pack(c, &packed_inode, &root_inode); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index f9848dc3eebb..ce0012aa99c6 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -8,13 +8,13 @@ void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ +#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trans_trigger = bch2_trans_mark_reflink_p, \ .atomic_trigger = bch2_mark_reflink_p, \ -} +}) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); @@ -23,13 +23,13 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ +#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_reflink_v, \ .atomic_trigger = bch2_mark_extent, \ -} +}) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); @@ -40,11 +40,11 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ +#define 
bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ -} +}) static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) { diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index 0535b1d3760e..f12a35b3dbcf 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_REPLICAS_TYPES_H #define _BCACHEFS_REPLICAS_TYPES_H diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 8c98bacca290..1133783477e1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -158,6 +158,7 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) for (i = 0; i < 2; i++) { int ret = snapshot_live(trans, child[i]); + if (ret < 0) return ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 02a636644988..c694c1c24483 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -9,10 +9,10 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, int rw, struct printbuf *); -#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ +#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ -} +}) int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); @@ -109,10 +109,10 @@ int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, int rw, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ +#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ -} +}) int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index db3d377ba10c..7ccdf3197d51 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -89,9 +89,9 @@ static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ static struct attribute sysfs_##_name = \ { .name = #_name, .mode = _mode } -#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) #define sysfs_printf(file, fmt, ...) 
\ do { \ @@ -228,13 +228,13 @@ write_attribute(perf_test); #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = S_IRUGO }; + { .name = #_name, .mode = 0444 }; BCH_TIME_STATS() #undef x static struct attribute sysfs_state_rw = { .name = "state", - .mode = S_IRUGO + .mode = 0444, }; static size_t bch2_btree_cache_size(struct bch_fs *c) @@ -610,12 +610,14 @@ struct attribute *bch2_fs_counters_files[] = { SHOW(bch2_fs_internal) { struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_to_text(out, &c->kobj, attr); } STORE(bch2_fs_internal) { struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_store(&c->kobj, attr, buf, size); } SYSFS_OPS(bch2_fs_internal); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index b5f44c4e80d1..7004da8d341f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -354,7 +354,7 @@ TRACE_EVENT(btree_reserve_get_fail, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->required = required; ), @@ -411,7 +411,7 @@ TRACE_EVENT(btree_path_relock_fail, TP_fast_assign( struct btree *b = btree_path_node(path, level); - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = path->level; @@ -462,7 +462,7 @@ TRACE_EVENT(btree_path_upgrade_fail, TP_fast_assign( struct six_lock_count c; - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; @@ -544,7 +544,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, TP_fast_assign( __entry->dev = ca->dev; - strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); __entry->user = user; __entry->bucket = bucket; __entry->free = free; @@ -556,7 +556,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->need_journal_commit = s->skipped_need_journal_commit; __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; - strlcpy(__entry->err, err, sizeof(__entry->err)); + strscpy(__entry->err, err, sizeof(__entry->err)); ), TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", @@ -628,7 +628,7 @@ TRACE_EVENT(discard_buckets, __entry->open = open; __entry->need_journal_commit = need_journal_commit; __entry->discarded = discarded; - strlcpy(__entry->err, err, sizeof(__entry->err)); + strscpy(__entry->err, err, sizeof(__entry->err)); ), TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", @@ -778,7 +778,7 @@ DECLARE_EVENT_CLASS(transaction_event, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), @@ -823,7 +823,7 @@ TRACE_EVENT(trans_restart_journal_preres_get, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->flags = flags; ), @@ -883,7 +883,7 @@ 
DECLARE_EVENT_CLASS(transaction_restart_iter, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; TRACE_BPOS_assign(pos, path->pos) @@ -930,7 +930,7 @@ TRACE_EVENT(trans_restart_upgrade, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->old_locks_want = old_locks_want; @@ -1039,7 +1039,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ), TP_printk("%s", __entry->trans_fn) @@ -1058,7 +1058,7 @@ TRACE_EVENT(trans_restart_mem_realloced, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->bytes = bytes; ), @@ -1087,7 +1087,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, ), TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 66d7a1e30350..03f1b73fc926 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -9,10 +9,10 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_ops_xattr (struct bkey_ops) { \ +#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ .val_to_text = bch2_xattr_to_text, \ -} +}) static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { -- cgit From 1df3e19996a3b29ed82315bf03cb02ac4e4e70ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Oct 2022 15:54:17 -0400 Subject: bcachefs: BCH_WRITE_SYNC This adds a new flag for the write path, BCH_WRITE_SYNC, and switches the O_DIRECT write path to use it when we're not running asynchronously. It runs the btree update after the write in the original thread's context instead of a kworker, cutting context switches in half. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 ++ fs/bcachefs/io.c | 81 +++++++++++++++++++++++++++++++++-------------------- fs/bcachefs/io.h | 2 ++ 3 files changed, 55 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9e9ada8f007a..dbad24f5f2ea 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2156,6 +2156,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) dio->op.flags |= BCH_WRITE_FLUSH; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 97427487aa79..4818c78e5213 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -596,7 +596,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, static void __bch2_write(struct bch_write_op *); -static void bch2_write_done(struct closure *cl) +static void __bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; @@ -612,7 +612,23 @@ static void bch2_write_done(struct closure *cl) EBUG_ON(cl->parent); closure_debug_destroy(cl); - op->end_io(op); + if (op->end_io) + op->end_io(op); +} + +static __always_inline void bch2_write_done(struct bch_write_op *op) +{ + if (likely(!(op->flags & BCH_WRITE_FLUSH) || op->error)) { + __bch2_write_done(&op->cl); + } else if (!(op->flags & BCH_WRITE_SYNC)) { + bch2_journal_flush_seq_async(&op->c->journal, + op->journal_seq, + &op->cl); + continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); + } else { + bch2_journal_flush_seq(&op->c->journal, op->journal_seq); + __bch2_write_done(&op->cl); + } } static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) @@ -699,6 +715,7 @@ out: err: keys->top = keys->keys; op->error = ret; + op->flags |= BCH_WRITE_DONE; goto out; } @@ -778,9 +795,9 @@ unlock: bch2_journal_flush_seq_async(&op->c->journal, op->journal_seq, &op->cl); - continue_at(&op->cl, bch2_write_done, index_update_wq(op)); + continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); } else { - bch2_write_done(&op->cl); + __bch2_write_done(&op->cl); } } } @@ -1271,10 +1288,10 @@ again: ? NULL : &op->cl, &wp); if (unlikely(ret)) { - if (unlikely(ret != -EAGAIN)) - goto err; + if (ret == -EAGAIN) + break; - break; + goto err; } EBUG_ON(!wp); @@ -1283,13 +1300,25 @@ again: ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); +err: + if (ret <= 0) { + if (!(op->flags & BCH_WRITE_SYNC)) { + spin_lock(&wp->writes_lock); + op->wp = wp; + list_add_tail(&op->wp_list, &wp->writes); + if (wp->state == WRITE_POINT_stopped) + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock(&wp->writes_lock); + } - if (ret < 0) - goto err; - - if (!ret) op->flags |= BCH_WRITE_DONE; + if (ret < 0) { + op->error = ret; + break; + } + } + bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; @@ -1302,36 +1331,28 @@ again: bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write); } while (ret); -out: + /* - * If the write can't all be submitted at once, we generally want to - * block synchronously as that signals backpressure to the caller. + * Sync or no? 
+ * + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. */ - if (!(op->flags & BCH_WRITE_DONE) && - !(op->flags & BCH_WRITE_IN_WORKER)) { + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { closure_sync(&op->cl); __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) goto again; - bch2_write_done(&op->cl); + bch2_write_done(op); } else { - spin_lock(&wp->writes_lock); - op->wp = wp; - list_add_tail(&op->wp_list, &wp->writes); - if (wp->state == WRITE_POINT_stopped) - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock(&wp->writes_lock); - continue_at(&op->cl, bch2_write_index, NULL); } memalloc_nofs_restore(nofs_flags); - return; -err: - op->error = ret; - op->flags |= BCH_WRITE_DONE; - goto out; } static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) @@ -1374,7 +1395,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) __bch2_write_index(op); err: - bch2_write_done(&op->cl); + bch2_write_done(op); } /** diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 730c7a5bcce8..9322484135f9 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -39,6 +39,7 @@ enum bch_write_flags { __BCH_WRITE_WROTE_DATA_INLINE, __BCH_WRITE_FROM_INTERNAL, __BCH_WRITE_CHECK_ENOSPC, + __BCH_WRITE_SYNC, __BCH_WRITE_MOVE, __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, @@ -55,6 +56,7 @@ enum bch_write_flags { #define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE) #define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL) #define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC) +#define BCH_WRITE_SYNC (1U << __BCH_WRITE_SYNC) #define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) /* Internal: */ -- cgit From 182c7bbfbfe8d435672b8cb9730b07e88e103670 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2022 20:30:27 -0400 Subject: bcachefs: DIO write path optimization - With BCH_WRITE_SYNC, we no longer need the completion in struct dio_write - Pull out bch2_dio_write_copy_iov() into a separate non-inline function, it's code that doesn't run in the common case - Copy mapping and inode pointers into dio_write, avoiding pointer chasing at the start of bch2_dio_write_loop() - kthread_use_mm() is not needed in the common case; move it into bch2_dio_write_loop_async() - factor out various helpers from bch2_dio_write_loop() and rework control flow for better icache utilization Other small optimizations: - bch2_keylist_free() is only used in one place, at the end of the bch2_write() path - drop the reinit - in bch2_disk_reservation_put(), check if res->sectors is nonzero before touching c->online_reserved, since that will likely be a cache miss Signed-off-by: Kent Overstreet bcachefs: More DIO write path optimization Better code prefetching (?) 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 175 +++++++++++++++++++++++++++++--------------------- fs/bcachefs/keylist.h | 1 - 2 files changed, 103 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index dbad24f5f2ea..dff103a66780 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -72,8 +72,9 @@ struct bch_writepage_io { }; struct dio_write { - struct completion done; struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; struct mm_struct *mm; unsigned loop:1, sync:1, @@ -2043,6 +2044,18 @@ err: return err ? false : ret; } +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + /* * We're going to return -EIOCBQUEUED, but we haven't finished consuming the * iov_iter yet, so we need to stash a copy of the iovec: it might be on the @@ -2082,27 +2095,71 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret = dio->op.error ?: ((long) dio->written << 9); + + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + + if (dio->free_iov) + kfree(dio->iter.__iov); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + static long bch2_dio_write_loop(struct dio_write *dio) { - bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; - struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = file_bch_inode(req->ki_filp); - struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; - if (dio->loop) - goto loop; - while (1) { iter_count = dio->iter.count; - if (kthread && dio->mm) - kthread_use_mm(dio->mm); - BUG_ON(current->faults_disabled_mapping); + EBUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); @@ -2110,8 +2167,6 @@ static long bch2_dio_write_loop(struct 
dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread && dio->mm) - kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -2149,7 +2204,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) } bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); - dio->op.end_io = bch2_dio_write_loop_async; + dio->op.end_io = sync + ? NULL + : bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; @@ -2166,86 +2223,58 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0)) + !bch2_dio_write_check_allocated(dio)) goto err; task_io_account_write(bio->bi_iter.bi_size); - if (!dio->sync && !dio->loop && dio->iter.count) { - if (bch2_dio_write_copy_iov(dio)) { - dio->sync = sync = true; - goto do_io; - } - } -do_io: + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (sync) - wait_for_completion(&dio->done); - else + if (!sync) return -EIOCBQUEUED; -loop: - i_sectors_acct(c, inode, &dio->quota_res, - dio->op.i_sectors_delta); - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); + bch2_dio_write_end(dio); - bio_release_pages(bio, false); - bio->bi_vcnt = 0; - - if (dio->op.error) { - set_bit(EI_INODE_ERROR, &inode->ei_flags); - break; - } - - if (!dio->iter.count) + if (likely(!dio->iter.count) || dio->op.error) break; bio_reset(bio, NULL, REQ_OP_WRITE); - reinit_completion(&dio->done); } - - ret = dio->op.error ?: ((long) dio->written << 9); +out: + return bch2_dio_write_done(dio); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_quota_reservation_put(c, inode, &dio->quota_res); - - if (dio->free_iov) - kfree(dio->iter.__iov); + dio->op.error = ret; bio_release_pages(bio, false); - bio_put(bio); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; - } - return ret; + goto out; } static void bch2_dio_write_loop_async(struct bch_write_op *op) { struct dio_write *dio = container_of(op, struct dio_write, op); + struct mm_struct *mm = dio->mm; - if (dio->sync) - complete(&dio->done); - else - bch2_dio_write_loop(dio); + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) { + bch2_dio_write_done(dio); + return; + } + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); } static noinline @@ -2297,8 +2326,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); - init_completion(&dio->done); dio->req = req; + dio->mapping = mapping; + dio->inode = inode; dio->mm = current->mm; dio->loop = false; 
dio->sync = is_sync_kiocb(req) || extending; @@ -2306,6 +2336,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->quota_res.sectors = 0; dio->written = 0; dio->iter = *iter; + dio->op.c = c; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index 195799bb20bc..635efb7e8228 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -17,7 +17,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) { if (l->keys_p != inline_keys) kfree(l->keys_p); - bch2_keylist_init(l, inline_keys); } static inline void bch2_keylist_push(struct keylist *l) -- cgit From 0cc455b3ca5ffc9b0d5e9b1a21a9f3cd7fde8247 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Nov 2022 03:37:53 -0400 Subject: bcachefs: Inlining improvements - Don't call into bch2_encrypt_bio() when we're not encrypting - Pull slowpath out of trans_lock_write() - Make sure bc2h_trans_journal_res_get() gets inlined. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 ++- fs/bcachefs/btree_update_leaf.c | 31 ++++++++++++++++--------------- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/checksum.h | 12 ++++++++++-- 4 files changed, 29 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cbba0b79fdb8..4a9476e8399e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1160,7 +1160,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_traverse_one(trans, path, flags, _RET_IP_); } -static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, +static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { unsigned i, offset = offsetof(struct btree_path, pos); @@ -1189,6 +1189,7 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr return new; } +__flatten struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, struct btree_path *path, bool intent) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8cc271030be6..7cd3d56a8d7b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -291,7 +291,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, return 0; } -static inline int bch2_trans_journal_res_get(struct btree_trans *trans, +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; @@ -729,33 +729,34 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; - int ret; trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); - if (ret) - goto fail; + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); 
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; -fail: - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3268e8d48603..43d22fe8131b 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -316,7 +316,7 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, return __bch2_checksum_bio(c, type, nonce, bio, &iter); } -int bch2_encrypt_bio(struct bch_fs *c, unsigned type, +int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { struct bio_vec bv; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 3d6d13bcfd72..f7ccef7a5520 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,8 +61,16 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, struct bch_extent_crc_unpacked *, unsigned, unsigned, unsigned); -int bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); +int __bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + return bch2_csum_type_is_encryption(type) + ? __bch2_encrypt_bio(c, type, nonce, bio) + : 0; +} int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); -- cgit From 984dc67e3bceb8871444961df0d2c2a45d5a3f4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Nov 2022 04:23:24 -0400 Subject: bcachefs: Improve __bch2_btree_path_make_mut() btree_path_copy() doesn't need to call bch2_btree_path_check_sort_fast() - the newly allocated path will always be in the correct position, post copy; also delete some redundant branches from __bch2_btree_path_make_mut(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4a9476e8399e..d30fb32bb683 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1175,8 +1175,6 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path if (t != BTREE_NODE_UNLOCKED) six_lock_increment(&dst->l[i].b->c.lock, t); } - - trans->paths_sorted = false; } static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -1196,10 +1194,6 @@ struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); path->preserve = false; -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = _RET_IP_; -#endif - path->should_be_locked = false; return path; } -- cgit From 07de1803b888131ef1675b17a0260b50d684175e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Nov 2022 15:41:32 -0400 Subject: bcachefs: Kill bch2_alloc_sectors_start() Only used in one place, just inline it there. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 23 ----------------------- fs/bcachefs/alloc_foreground.h | 9 --------- fs/bcachefs/io.c | 26 +++++++++++++------------- 3 files changed, 13 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ccc6be5a002f..9a4a62211755 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1228,29 +1228,6 @@ err: return ret; } -int bch2_alloc_sectors_start(struct bch_fs *c, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl, - struct write_point **wp_ret) -{ - return bch2_trans_do(c, NULL, NULL, 0, - bch2_alloc_sectors_start_trans(&trans, target, - erasure_code, - write_point, - devs_have, - nr_replicas, - nr_replicas_required, - reserve, - flags, cl, wp_ret)); -} - struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 12583a7e7aa3..b784a59d67e7 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -147,15 +147,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, struct closure *, struct write_point **); -int bch2_alloc_sectors_start(struct bch_fs *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *, - struct write_point **); struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 4818c78e5213..e754f57c1342 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1274,19 +1274,19 @@ again: * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - ret = bch2_alloc_sectors_start(c, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->alloc_reserve, - op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? NULL : &op->cl, - &wp); + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->alloc_reserve, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? 
NULL : &op->cl, &wp)); if (unlikely(ret)) { if (ret == -EAGAIN) break; -- cgit From 03e83f630223261978e23875299e87ae61403548 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Nov 2022 11:14:04 -0400 Subject: bcachefs: bch2_trans_commit_bkey_invalid() - factor out more slowpath code into non-inline function - use bch2_print_string_as_lines(), so our error message doesn't get truncated Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 48 +++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7cd3d56a8d7b..1405ad4eda02 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -767,6 +767,33 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + printbuf_exit(err); + + return -EINVAL; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -781,24 +808,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; trans_for_each_update(trans, i) { - if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - prt_newline(&buf); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf); - - bch2_trans_inconsistent(trans, "%s", buf.buf); - printbuf_exit(&buf); - return -EINVAL; - } + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, &buf))) + return bch2_trans_commit_bkey_invalid(trans, i, &buf); btree_insert_entry_checks(trans, i); } -- cgit From a1ee777bfcceeb916d837321144c782e12082588 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Nov 2022 00:29:43 -0400 Subject: bcachefs: Kill BCH_WRITE_FLUSH BCH_WRITE_FLUSH is a write flag that causes a journal flush. It's only used in the direct IO path, and this will allow for some consolidation with the regular fsync path, which will help with the upcoming nocow mode. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +-- fs/bcachefs/fs-io.c | 59 +++++++++++++++++++++++++++++++++++++++++------ fs/bcachefs/io.c | 44 ++++++++--------------------------- fs/bcachefs/io.h | 5 +--- fs/bcachefs/io_types.h | 1 - fs/bcachefs/reflink.c | 2 +- 6 files changed, 64 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 658868048c22..9d1290ff179a 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -226,7 +226,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_trans_update(&trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, - &op->journal_seq, + NULL, BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { @@ -320,7 +320,6 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| - BCH_WRITE_FROM_INTERNAL| BCH_WRITE_MOVE| m->data_opts.write_flags; m->op.compression_type = diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index dff103a66780..3c3fa95215ac 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -78,6 +78,7 @@ struct dio_write { struct mm_struct *mm; unsigned loop:1, sync:1, + flush:1, free_iov:1; struct quota_res quota_res; u64 written; @@ -2056,6 +2057,9 @@ static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) dio->op.opts.compression != 0); } +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + /* * We're going to return -EIOCBQUEUED, but we haven't finished consuming the * iov_iter yet, so we need to stash a copy of the iovec: it might be on the @@ -2093,7 +2097,43 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } -static void bch2_dio_write_loop_async(struct bch_write_op *); +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) + dio->op.error = ret; + else + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} static __always_inline long bch2_dio_write_done(struct dio_write *dio) { @@ -2101,13 +2141,21 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; - long ret = dio->op.error ?: ((long) dio->written << 9); + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); /* inode->i_dio_count is our ref on inode and 
thus bch_fs */ @@ -2215,9 +2263,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (sync) dio->op.flags |= BCH_WRITE_SYNC; - if ((req->ki_flags & IOCB_DSYNC) && - !c->opts.journal_flush_disabled) - dio->op.flags |= BCH_WRITE_FLUSH; dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), @@ -2332,6 +2377,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; dio->free_iov = false; dio->quota_res.sectors = 0; dio->written = 0; @@ -3050,8 +3096,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, } ret = bch2_extent_update(&trans, inode_inum(inode), &iter, - &reservation.k_i, - &disk_res, NULL, + &reservation.k_i, &disk_res, 0, &i_sectors_delta, true); if (ret) goto bkey_err; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index e754f57c1342..701bfc8ce0e4 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -273,7 +273,6 @@ int bch2_extent_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, - u64 *journal_seq, u64 new_i_size, s64 *i_sectors_delta_total, bool check_enospc) @@ -374,7 +373,7 @@ int bch2_extent_update(struct btree_trans *trans, } ret = bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, journal_seq, + bch2_trans_commit(trans, disk_res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); err: @@ -438,8 +437,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, NULL, - 0, i_sectors_delta, false); + &disk_res, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); } @@ -507,7 +505,7 @@ static int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = bch2_extent_update(&trans, inum, &iter, sk.k, - &op->res, &op->journal_seq, + &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(&trans, &iter); @@ -596,14 +594,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, static void __bch2_write(struct bch_write_op *); -static void __bch2_write_done(struct closure *cl) +static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&c->journal); - bch2_disk_reservation_put(c, &op->res); percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -616,21 +611,6 @@ static void __bch2_write_done(struct closure *cl) op->end_io(op); } -static __always_inline void bch2_write_done(struct bch_write_op *op) -{ - if (likely(!(op->flags & BCH_WRITE_FLUSH) || op->error)) { - __bch2_write_done(&op->cl); - } else if (!(op->flags & BCH_WRITE_SYNC)) { - bch2_journal_flush_seq_async(&op->c->journal, - op->journal_seq, - &op->cl); - continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); - } else { - bch2_journal_flush_seq(&op->c->journal, op->journal_seq); - __bch2_write_done(&op->cl); - } -} - static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; @@ -789,16 +769,10 @@ unlock: __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_DONE)) { + if (!(op->flags & 
BCH_WRITE_DONE)) __bch2_write(op); - } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { - bch2_journal_flush_seq_async(&op->c->journal, - op->journal_seq, - &op->cl); - continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); - } else { - __bch2_write_done(&op->cl); - } + else + bch2_write_done(&op->cl); } } @@ -1347,7 +1321,7 @@ err: if (!(op->flags & BCH_WRITE_DONE)) goto again; - bch2_write_done(op); + bch2_write_done(&op->cl); } else { continue_at(&op->cl, bch2_write_index, NULL); } @@ -1395,7 +1369,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) __bch2_write_index(op); err: - bch2_write_done(op); + bch2_write_done(&op->cl); } /** diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 9322484135f9..faf2c2057828 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -31,7 +31,6 @@ const char *bch2_blk_status_to_str(blk_status_t); enum bch_write_flags { __BCH_WRITE_ALLOC_NOWAIT, __BCH_WRITE_CACHED, - __BCH_WRITE_FLUSH, __BCH_WRITE_DATA_ENCODED, __BCH_WRITE_PAGES_STABLE, __BCH_WRITE_PAGES_OWNED, @@ -48,7 +47,6 @@ enum bch_write_flags { #define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) #define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED) -#define BCH_WRITE_FLUSH (1U << __BCH_WRITE_FLUSH) #define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED) #define BCH_WRITE_PAGES_STABLE (1U << __BCH_WRITE_PAGES_STABLE) #define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED) @@ -75,7 +73,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64 *, u64, s64 *, bool); + struct disk_reservation *, u64, s64 *, bool); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); @@ -104,7 +102,6 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; - op->journal_seq = 0; op->new_i_size = U64_MAX; op->i_sectors_delta = 0; } diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 685fb1183399..b31f2a22f098 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -142,7 +142,6 @@ struct bch_write_op { struct open_buckets open_buckets; - u64 journal_seq; u64 new_i_size; s64 i_sectors_delta; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d5c14bb2992d..0d4c004d7f9d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -378,7 +378,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset - dst_iter.pos.offset)); ret = bch2_extent_update(&trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, NULL, + new_dst.k, &disk_res, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); -- cgit From a7ecd30c8300624448c4e66cd7a7e7209b96ea61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Nov 2022 13:25:57 -0400 Subject: bcachefs: Factor out two_state_shared_lock We have a unique lock used for controlling adding to the pagecache: the lock has two states, where both states are shared - the lock may be held multiple times for either state - but not both states at the same time. This is exactly what we need for nocow mode locking, so this patch pulls it out of fs.c into its own file. 
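As an illustration of those semantics (an analogue only, not the code added by this patch), a minimal userspace sketch of the same idea follows: a mutex/condvar pair stands in for atomic_long_t plus wait_event(), the counter goes positive for one state and negative for the other, and a caller blocks only while the opposite state is held.

#include <pthread.h>
#include <stdbool.h>

typedef struct {
	long		v;	/* > 0: held for one state, < 0: held for the other */
	pthread_mutex_t	mtx;
	pthread_cond_t	wait;
} two_state_lock;

#define TWO_STATE_LOCK_INIT \
	{ 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static bool two_state_trylock(two_state_lock *l, int s)
{
	long i = s ? 1 : -1;
	bool ret = false;

	pthread_mutex_lock(&l->mtx);
	if (!(i > 0 ? l->v < 0 : l->v > 0)) {	/* opposite state not held */
		l->v += i;
		ret = true;
	}
	pthread_mutex_unlock(&l->mtx);
	return ret;
}

static void two_state_lock_acquire(two_state_lock *l, int s)
{
	long i = s ? 1 : -1;

	pthread_mutex_lock(&l->mtx);
	while (i > 0 ? l->v < 0 : l->v > 0)	/* wait out the other state */
		pthread_cond_wait(&l->wait, &l->mtx);
	l->v += i;
	pthread_mutex_unlock(&l->mtx);
}

static void two_state_unlock(two_state_lock *l, int s)
{
	pthread_mutex_lock(&l->mtx);
	l->v -= s ? 1 : -1;
	if (l->v == 0)		/* last holder: wake waiters for the other state */
		pthread_cond_broadcast(&l->wait);
	pthread_mutex_unlock(&l->mtx);
}

Any number of callers may hold one state concurrently, but an acquirer of the other state waits until the counter returns to zero - the behaviour the kernel version below implements locklessly with atomic_long_cmpxchg_acquire() and wait_event().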
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/fs-io.c | 50 +++++++++++++++++----------------- fs/bcachefs/fs.c | 54 +------------------------------------ fs/bcachefs/fs.h | 35 ++++++++---------------- fs/bcachefs/two_state_shared_lock.c | 33 +++++++++++++++++++++++ fs/bcachefs/two_state_shared_lock.h | 28 +++++++++++++++++++ 6 files changed, 99 insertions(+), 102 deletions(-) create mode 100644 fs/bcachefs/two_state_shared_lock.c create mode 100644 fs/bcachefs/two_state_shared_lock.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 444e79c62b50..966c9b9a74fc 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -65,6 +65,7 @@ bcachefs-y := \ sysfs.o \ tests.o \ trace.o \ + two_state_shared_lock.o \ util.o \ varint.o \ xattr.o diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 3c3fa95215ac..ab5b4e086e0a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -751,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) if (fdm > mapping) { struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + if (bch2_pagecache_add_tryget(inode)) goto got_lock; - bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_put(fdm_host); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); - bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_get(fdm_host); /* Signal that lock has been dropped: */ set_fdm_dropped_locks(); return VM_FAULT_SIGBUS; } - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); got_lock: ret = filemap_fault(vmf); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return ret; } @@ -797,7 +797,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); lock_page(page); isize = i_size_read(&inode->v); @@ -830,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); ret = VM_FAULT_LOCKED; out: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); sb_end_pagefault(inode->v.i_sb); return ret; @@ -1098,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -1121,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); } - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1483,7 +1483,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); page = grab_cache_page_write_begin(mapping, index); if (!page) @@ -1540,7 +1540,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); kfree(res); *fsdata = NULL; return bch2_err_class(ret); @@ -1584,7 +1584,7 @@ int 
bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1753,7 +1753,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1811,7 +1811,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return written ? written : ret; } @@ -1991,9 +1991,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret >= 0) iocb->ki_pos += ret; } else { - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); } out: return bch2_err_class(ret); @@ -2149,7 +2149,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) return -EIOCBQUEUED; } - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) @@ -2357,7 +2357,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err; inode_dio_begin(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); extending = req->ki_pos + iter->count > inode->v.i_size; if (!extending) { @@ -2403,7 +2403,7 @@ err: inode_unlock(&inode->v); return ret; err_put_bio: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); @@ -2704,7 +2704,7 @@ int bch2_truncate(struct mnt_idmap *idmap, } inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) @@ -2783,7 +2783,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_setattr_nonsize(idmap, inode, iattr); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); return bch2_err_class(ret); } @@ -3195,7 +3195,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_lock(&inode->v); inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = file_modified(file); if (ret) @@ -3212,7 +3212,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); inode_unlock(&inode->v); percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 485cb9cbcd51..90297cfc7934 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -43,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_subvolume *); -static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -{ - BUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -{ - long v = atomic_long_read(&lock->v), old; - 
- do { - old = v; - - if (i > 0 ? v < 0 : v > 0) - return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); - return true; -} - -static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -{ - wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -} - -void bch2_pagecache_add_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, 1); -} - -bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) -{ - return __pagecache_lock_tryget(lock, 1); -} - -void bch2_pagecache_add_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, 1); -} - -void bch2_pagecache_block_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, -1); -} - -void bch2_pagecache_block_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, -1); -} - void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -1410,7 +1358,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); - pagecache_lock_init(&inode->ei_pagecache_lock); + two_state_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); return &inode->v; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 73b96d0b5d83..4164d0669d70 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -6,31 +6,11 @@ #include "opts.h" #include "str_hash.h" #include "quota_types.h" +#include "two_state_shared_lock.h" #include #include -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -struct pagecache_lock { - atomic_long_t v; - wait_queue_head_t wait; -}; - -static inline void pagecache_lock_init(struct pagecache_lock *lock) -{ - atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -void bch2_pagecache_add_put(struct pagecache_lock *); -bool bch2_pagecache_add_tryget(struct pagecache_lock *); -void bch2_pagecache_add_get(struct pagecache_lock *); -void bch2_pagecache_block_put(struct pagecache_lock *); -void bch2_pagecache_block_get(struct pagecache_lock *); - struct bch_inode_info { struct inode v; unsigned long ei_flags; @@ -38,7 +18,7 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_quota_reserved; unsigned long ei_last_dirtied; - struct pagecache_lock ei_pagecache_lock; + two_state_lock_t ei_pagecache_lock; struct mutex ei_quota_lock; struct bch_qid ei_qid; @@ -49,6 +29,13 @@ struct bch_inode_info { struct bch_inode_unpacked ei_inode; }; +#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) + +#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) +#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) + static inline subvol_inum inode_inum(struct bch_inode_info *inode) { return (subvol_inum) { @@ -95,7 +82,7 @@ do { \ if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ @@ -113,7 +100,7 @@ do { \ if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - 
bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c new file mode 100644 index 000000000000..dc508d545de0 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "two_state_shared_lock.h" + +void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + wait_event(lock->wait, bch2_two_state_trylock(lock, s)); +} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h new file mode 100644 index 000000000000..1b4f108908a1 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TWO_STATE_LOCK_H +#define _BCACHEFS_TWO_STATE_LOCK_H + +#include +#include +#include + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +typedef struct { + atomic_long_t v; + wait_queue_head_t wait; +} two_state_lock_t; + +static inline void two_state_lock_init(two_state_lock_t *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_two_state_unlock(two_state_lock_t *, int); +bool bch2_two_state_trylock(two_state_lock_t *, int); +void bch2_two_state_lock(two_state_lock_t *, int); + +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ -- cgit From b2d1d56b1d34bcfb6da77eb74a2fbcdea92514f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 20:01:42 -0500 Subject: bcachefs: Fixes for building in userspace - Marking a non-static function as inline doesn't actually work and is now causing problems - drop that - Introduce BCACHEFS_LOG_PREFIX for when we want to prefix log messages with bcachefs (filesystem name) - Userspace doesn't have real percpu variables (maybe we can get this fixed someday), put an #ifdef around bch2_disk_reservation_add() fastpath Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++++ fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/btree_iter.h | 5 ++--- fs/bcachefs/btree_locking.c | 3 +-- fs/bcachefs/buckets.h | 4 ++++ fs/bcachefs/error.c | 2 ++ fs/bcachefs/move.c | 8 ++++---- fs/bcachefs/move.h | 3 +-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/recovery.c | 2 +- 11 files changed, 25 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c0416258985b..90a1bd1f9d63 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -226,6 +226,10 @@ do { \ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ +#define BCACHEFS_LOG_PREFIX +#endif + +#ifdef BCACHEFS_LOG_PREFIX #define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), 
(_inum) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d30fb32bb683..fc057ba11190 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -636,9 +636,9 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -inline void bch2_btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { BUG_ON(path->cached); @@ -1554,7 +1554,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, return path; } -inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { struct btree_path_level *l = path_l(path); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index cfbd07bc9366..38bea61ed7f8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -177,13 +177,12 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); -inline void bch2_btree_path_level_init(struct btree_trans *, - struct btree_path *, struct btree *); +void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 1530457f0e69..9d4be3c9cfd7 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -179,10 +179,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) } if (unlikely(!best)) { - struct bch_fs *c = g->g->trans->c; struct printbuf buf = PRINTBUF; - bch_err(c, "cycle of nofail locks"); + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); for (i = g->g; i < g->g + g->nr; i++) { struct btree_trans *trans = i->trans; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ff61a0054eaa..0fc101b9aaf1 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -255,6 +255,7 @@ int __bch2_disk_reservation_add(struct bch_fs *, static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, int flags) { +#ifdef __KERNEL__ u64 old, new; do { @@ -268,6 +269,9 @@ static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reserv this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; return 0; +#else + return __bch2_disk_reservation_add(c, res, sectors, flags); +#endif } static inline struct disk_reservation diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2fb5102ee31d..3e49d72d65b5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -125,8 +125,10 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
s->nr++; } +#ifdef BCACHEFS_LOG_PREFIX if (!strncmp(fmt, "bcachefs:", 9)) prt_printf(out, bch2_log_msg(c, "")); +#endif va_start(args, fmt); prt_vprintf(out, fmt, args); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a9d1e4466c5..a66fbc1faa7b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -177,7 +177,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, } } -void bch_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, char *name) { memset(stats, 0, sizeof(*stats)); scnprintf(stats->name, sizeof(stats->name), "%s", name); @@ -755,7 +755,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: - bch_move_stats_init(stats, "rereplicate"); + bch2_move_stats_init(stats, "rereplicate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -779,7 +779,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch_move_stats_init(stats, "migrate"); + bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); @@ -800,7 +800,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: - bch_move_stats_init(stats, "rewrite_old_nodes"); + bch2_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); break; default: diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 6250c75618c4..2eb6a15542e0 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -50,8 +50,7 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); -inline void bch_move_stats_init(struct bch_move_stats *stats, - char *name); +void bch2_move_stats_init(struct bch_move_stats *stats, char *name); #endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index dca8d4a3a89c..09f4303de1f6 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -213,7 +213,7 @@ static int bch2_copygc(struct bch_fs *c) size_t heap_size = 0; int ret; - bch_move_stats_init(&move_stats, "copygc"); + bch2_move_stats_init(&move_stats, "copygc"); /* * Find buckets with lowest sector counts, skipping completely diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 6b9ccc1b3fe3..66c40999163d 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -189,7 +189,7 @@ static int bch2_rebalance_thread(void *arg) prev_start = jiffies; prev_cputime = curr_cputime(); - bch_move_stats_init(&move_stats, "rebalance"); + bch2_move_stats_init(&move_stats, "rebalance"); while (!kthread_wait_freezable(r->enabled)) { cond_resched(); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7eaced534a5b..8d767e787d6b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1376,7 +1376,7 @@ use_clean: le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; - bch_move_stats_init(&stats, "recovery"); + bch2_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); -- cgit From 961cbdef3c270266e17aa831cf22eb14b900af65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 20:53:24 -0500 Subject: bcachefs: Delete atomic_inc_bug() These were wrappers around atomic operations that verified that the counter wasn't negative, but they're dead code - delete. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4243a22c766c..67b0d3de24cc 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -23,33 +23,9 @@ struct closure; #ifdef CONFIG_BCACHEFS_DEBUG - #define EBUG_ON(cond) BUG_ON(cond) -#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) -#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) -#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) -#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) -#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) -#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) -#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) -#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) -#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) - -#else /* DEBUG */ - +#else #define EBUG_ON(cond) -#define atomic_dec_bug(v) atomic_dec(v) -#define atomic_inc_bug(v, i) atomic_inc(v) -#define atomic_sub_bug(i, v) atomic_sub(i, v) -#define atomic_add_bug(i, v) atomic_add(i, v) -#define atomic_long_dec_bug(v) atomic_long_dec(v) -#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) -#define atomic64_dec_bug(v) atomic64_dec(v) -#define atomic64_inc_bug(v, i) atomic64_inc(v) -#define atomic64_sub_bug(i, v) atomic64_sub(i, v) -#define atomic64_add_bug(i, v) atomic64_add(i, v) - #endif #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -- cgit From 061f7999a6322c639dd6616dc6d3785957de2bc3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Nov 2022 02:22:30 -0500 Subject: bcachefs: Fix a use after free This fixes a regression from percpu freedlists in the btree key cache code: in a rare error path, we were immediately freeing a bkey_cached that had been used before and should've waited for an SRCU barrier. 
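The rule being enforced (a sketch only - the full error path isn't shown in the hunk below) is that only an object which was never published to other threads may be freed immediately; anything that may still be reached by SRCU readers has to take the deferred path:

    if (was_new)		/* never visible to other threads */
        kfree(ck);		/* illustrative; whichever free matches the allocation */
    else			/* may still be dereferenced under SRCU */
        bkey_cached_free_fast(bc, ck);

Hence bkey_cached_alloc() now reports via *was_new whether it handed back a freshly allocated entry or one recycled from a freelist.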
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 179669dbd688..1ac91221cc95 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -190,7 +190,8 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, } static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, + bool *was_new) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; @@ -275,6 +276,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; return ck; } @@ -313,9 +315,9 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; - bool was_new = true; + bool was_new = false; - ck = bkey_cached_alloc(trans, path); + ck = bkey_cached_alloc(trans, path, &was_new); if (IS_ERR(ck)) return ck; @@ -328,7 +330,6 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) } mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); - was_new = false; } else { if (path->btree_id == BTREE_ID_subvolumes) six_lock_pcpu_alloc(&ck->c.lock); -- cgit From 84fea8e5b3abc9147a20211e608ba8844c479998 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 22:35:55 -0500 Subject: bcachefs: Quota: Don't allocate memory under lock The genradix code can handle multiple threads trying to allocate at the same time - we don't need the genradix_ptr_alloc() call to happen under a lock. 
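The resulting shape of both call sites (as in the hunks below) is: do the GFP_KERNEL allocations first, fail early with -ENOMEM, and only then take the per-type locks:

    for_each_set_qtype(c, i, q, qtypes) {
        mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
        if (!mq[i])
            return -ENOMEM;
    }

    for_each_set_qtype(c, i, q, qtypes)
        mutex_lock_nested(&q->lock, i);

This also lets the allocation-failure case return directly instead of unwinding through the error label.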
Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index db8172736527..7f74c026e9da 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -364,16 +364,16 @@ int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, memset(&msgs, 0, sizeof(msgs)); + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); + if (!mq[i]) + return -ENOMEM; + } + for_each_set_qtype(c, i, q, qtypes) mutex_lock_nested(&q->lock, i); for_each_set_qtype(c, i, q, qtypes) { - mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); - if (!mq[i]) { - ret = -ENOMEM; - goto err; - } - ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); if (ret) goto err; @@ -416,18 +416,17 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, memset(&msgs, 0, sizeof(msgs)); + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); + if (!src_q[i] || !dst_q[i]) + return -ENOMEM; + } + for_each_set_qtype(c, i, q, qtypes) mutex_lock_nested(&q->lock, i); for_each_set_qtype(c, i, q, qtypes) { - src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); - dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); - - if (!src_q[i] || !dst_q[i]) { - ret = -ENOMEM; - goto err; - } - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, dst_q[i]->c[Q_SPC].v + space, mode); -- cgit From 6b1b186a5a8e9cf4770e9546c3606fef40666830 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 22:43:37 -0500 Subject: bcachefs: Minor dio write path improvements This switches where we take quota reservations to be per bch_wirte_op instead of per dio_write, so we can drop the quota reservation in the same place as we call i_sectors_acct(), and only take/release ei_quota_lock once. In the future we'd like ei_quota_lock to not be a mutex, so that we can avoid punting to process context before deliving write completions in nocow mode. 
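Concretely, the completion path (see the bch2_dio_write_end() hunk below) now does both adjustments under a single ei_quota_lock acquisition:

    if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
        mutex_lock(&inode->ei_quota_lock);
        __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
        __bch2_quota_reservation_put(c, inode, &dio->quota_res);
        mutex_unlock(&inode->ei_quota_lock);
    }

and the quota reservation itself is taken per bio in the write loop, next to the disk reservation, rather than once up front for the whole iov_iter.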
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 87 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ab5b4e086e0a..968e670229d3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -77,6 +77,7 @@ struct dio_write { struct bch_inode_info *inode; struct mm_struct *mm; unsigned loop:1, + extending:1, sync:1, flush:1, free_iov:1; @@ -131,24 +132,29 @@ static noinline int write_invalidate_inode_pages_range(struct address_space *map #ifdef CONFIG_BCACHEFS_QUOTA -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) +static void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) { - if (!res->sectors) - return; - - mutex_lock(&inode->ei_quota_lock); BUG_ON(res->sectors > inode->ei_quota_reserved); bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); inode->ei_quota_reserved -= res->sectors; - mutex_unlock(&inode->ei_quota_lock); - res->sectors = 0; } +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + static int bch2_quota_reservation_add(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *res, @@ -171,11 +177,13 @@ static int bch2_quota_reservation_add(struct bch_fs *c, #else +static void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} + static void bch2_quota_reservation_put(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *res) -{ -} + struct quota_res *res) {} static int bch2_quota_reservation_add(struct bch_fs *c, struct bch_inode_info *inode, @@ -226,13 +234,9 @@ int __must_check bch2_write_inode_size(struct bch_fs *c, return bch2_write_inode(c, inode, inode_set_size, &s, fields); } -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, +static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *quota_res, s64 sectors) { - if (!sectors) - return; - - mutex_lock(&inode->ei_quota_lock); bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, sectors, @@ -250,7 +254,16 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif - mutex_unlock(&inode->ei_quota_lock); +} + +static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } } /* page state: */ @@ -2137,7 +2150,6 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { - struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -2150,7 +2162,6 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) } bch2_pagecache_block_put(inode); - bch2_quota_reservation_put(c, inode, &dio->quota_res); if 
(dio->free_iov) kfree(dio->iter.__iov); @@ -2178,14 +2189,22 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; - i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } bio_release_pages(bio, false); @@ -2265,6 +2284,11 @@ static long bch2_dio_write_loop(struct dio_write *dio) dio->op.flags |= BCH_WRITE_SYNC; dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && @@ -2298,6 +2322,8 @@ err: dio->op.error = ret; bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); goto out; } @@ -2376,6 +2402,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->inode = inode; dio->mm = current->mm; dio->loop = false; + dio->extending = extending; dio->sync = is_sync_kiocb(req) || extending; dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; dio->free_iov = false; @@ -2384,11 +2411,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->iter = *iter; dio->op.c = c; - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - iter->count >> 9, true); - if (unlikely(ret)) - goto err_put_bio; - if (unlikely(mapping->nrpages)) { ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, @@ -2404,7 +2426,6 @@ err: return ret; err_put_bio: bch2_pagecache_block_put(inode); - bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); goto err; -- cgit From 0f35e0860a73a35e8c3aa1afebc45e75eb2fbae6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Nov 2022 22:48:03 -0500 Subject: bcachefs: Fix return code from btree_path_traverse_one() trans->restarted is a positive error code, not the usual negative Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_iter.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fc057ba11190..e4ea77d7c1d8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1079,7 +1079,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned long trace_ip) { unsigned depth_want = path->level; - int ret = trans->restarted; + int ret = -((int) trans->restarted); if (unlikely(ret)) goto out; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 38bea61ed7f8..635cf97f5692 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -359,6 +359,11 @@ __btree_iter_peek_node_and_restart(struct 
btree_trans *trans, struct btree_iter return b; } +/* + * XXX + * this does not handle transaction restarts from bch2_btree_iter_next_node() + * correctly + */ #define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ _locks_want, _depth, _flags, _b, _ret) \ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ -- cgit From 4fcdd6ec345e1aeed23fbdbe53d62965c1a79b99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Nov 2022 21:52:12 -0500 Subject: bcachefs: Btree split improvement This improves the bkey_format calculation when splitting btree nodes. Previously, we'd use a format calculated for the original node for the lower of the two new nodes. This was particularly bad on sequential insertions, where we iteratively split the last btree node, whos format has to include KEY_MAX. Now, we calculate formats precisely for the keys the two new nodes will contain. This also should make splitting a bit more efficient, since we're only copying keys once (from the original node to the new node, instead of new node, replacement node, then upper split). Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_sort.c | 2 + fs/bcachefs/bset.h | 5 + fs/bcachefs/btree_update_interior.c | 250 +++++++++++++++--------------------- 3 files changed, 113 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index be0d4bc1afd3..557a79cad986 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -144,6 +144,8 @@ bch2_sort_repack(struct bset *dst, struct btree *src, else bch2_bkey_unpack(src, (void *) out, in); + out->needs_whiteout = false; + btree_keys_account_key_add(&nr, 0, out); out = bkey_next(out); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index b352d5a40de0..fd2915a15070 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -447,6 +447,11 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, struct bkey *); +#define for_each_btree_node_key(b, k, iter) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ + (k = bch2_btree_node_iter_peek((iter), (b))); \ + bch2_btree_node_iter_advance(iter, b)) + #define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e0483abadd72..ac3a5ef1b1af 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -377,14 +377,19 @@ static void btree_set_max(struct btree *b, struct bpos pos) b->data->max_key = pos; } -struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, - struct btree_trans *trans, - struct btree *b, - struct bkey_format format) +static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) { - struct btree *n; + struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); + struct bkey_format format = bch2_btree_calc_format(b); - n = bch2_btree_node_alloc(as, trans, b->c.level); + /* + * The keys might expand with the new format - if they wouldn't fit in + * the btree node anymore, use the old format for now: + */ + if (!bch2_btree_node_format_fits(as->c, b, &format)) + format = b->format; SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); @@ -397,27 +402,9 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, 
bch2_btree_sort_into(as->c, n, b); btree_node_reset_sib_u64s(n); - - n->key.k.p = b->key.k.p; return n; } -static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct bkey_format new_f = bch2_btree_calc_format(b); - - /* - * The keys might expand with the new format - if they wouldn't fit in - * the btree node anymore, use the old format for now: - */ - if (!bch2_btree_node_format_fits(as->c, b, &new_f)) - new_f = b->format; - - return __bch2_btree_node_alloc_replacement(as, trans, b, new_f); -} - static struct btree *__btree_root_alloc(struct btree_update *as, struct btree_trans *trans, unsigned level) { @@ -1331,8 +1318,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, ; while (!bch2_keylist_empty(keys)) { - bch2_insert_fixup_btree_ptr(as, trans, path, b, - &node_iter, bch2_keylist_front(keys)); + struct bkey_i *k = bch2_keylist_front(keys); + + if (bpos_cmp(k->k.p, b->key.k.p) > 0) + break; + + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); bch2_keylist_pop_front(keys); } } @@ -1341,109 +1332,91 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) */ -static struct btree *__btree_split_node(struct btree_update *as, - struct btree_trans *trans, - struct btree *n1) +static void __btree_split_node(struct btree_update *as, + struct btree_trans *trans, + struct btree *b, + struct btree *n[2]) { - struct bkey_format_state s; - size_t nr_packed = 0, nr_unpacked = 0; - struct btree *n2; - struct bset *set1, *set2; - struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; - struct bpos n1_pos; + struct bkey_packed *k; + struct bpos n1_pos = POS_MIN; + struct btree_node_iter iter; + struct bset *bsets[2]; + struct bkey_format_state format[2]; + struct bkey_packed *out[2]; + struct bkey uk; + unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; + int i; - n2 = bch2_btree_node_alloc(as, trans, n1->c.level); + for (i = 0; i < 2; i++) { + BUG_ON(n[i]->nsets != 1); - n2->data->max_key = n1->data->max_key; - n2->data->format = n1->format; - SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); - n2->key.k.p = n1->key.k.p; + bsets[i] = btree_bset_first(n[i]); + out[i] = bsets[i]->start; - set1 = btree_bset_first(n1); - set2 = btree_bset_first(n2); + SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); + bch2_bkey_format_init(&format[i]); + } - /* - * Has to be a linear search because we don't have an auxiliary - * search tree yet - */ - k = set1->start; - while (1) { - struct bkey_packed *n = bkey_next(k); + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; + + i = u64s >= n1_u64s; + u64s += k->u64s; + uk = bkey_unpack_key(b, k); + if (!i) + n1_pos = uk.p; + bch2_bkey_format_add_key(&format[i], &uk); + } - if (n == vstruct_last(set1)) - break; - if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) - break; + btree_set_min(n[0], b->data->min_key); + btree_set_max(n[0], n1_pos); + btree_set_min(n[1], bpos_successor(n1_pos)); + btree_set_max(n[1], b->data->max_key); - if (bkey_packed(k)) - nr_packed++; - else - nr_unpacked++; + for (i = 0; i < 2; i++) { + bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); + bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); - prev = k; - k = n; + n[i]->data->format = bch2_bkey_format_done(&format[i]); + btree_node_set_format(n[i], n[i]->data->format); } - BUG_ON(!prev); - set2_start = k; 
- set2_end = vstruct_last(set1); - - set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); - set_btree_bset_end(n1, n1->set); + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; - n1->nr.live_u64s = le16_to_cpu(set1->u64s); - n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); - n1->nr.packed_keys = nr_packed; - n1->nr.unpacked_keys = nr_unpacked; + i = u64s >= n1_u64s; + u64s += k->u64s; - n1_pos = bkey_unpack_pos(n1, prev); - if (as->c->sb.version < bcachefs_metadata_version_snapshot) - n1_pos.snapshot = U32_MAX; - - btree_set_max(n1, n1_pos); - btree_set_min(n2, bpos_successor(n1->key.k.p)); + if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) + ? &b->format: &bch2_bkey_format_current, k)) + out[i]->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(b, (void *) out[i], k); - bch2_bkey_format_init(&s); - bch2_bkey_format_add_pos(&s, n2->data->min_key); - bch2_bkey_format_add_pos(&s, n2->data->max_key); + out[i]->needs_whiteout = false; - for (k = set2_start; k != set2_end; k = bkey_next(k)) { - struct bkey uk = bkey_unpack_key(n1, k); - bch2_bkey_format_add_key(&s, &uk); + btree_keys_account_key_add(&n[i]->nr, 0, out[i]); + out[i] = bkey_next(out[i]); } - n2->data->format = bch2_bkey_format_done(&s); - btree_node_set_format(n2, n2->data->format); + for (i = 0; i < 2; i++) { + bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); - out = set2->start; - memset(&n2->nr, 0, sizeof(n2->nr)); + BUG_ON(!bsets[i]->u64s); - for (k = set2_start; k != set2_end; k = bkey_next(k)) { - BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) - ? &n1->format : &bch2_bkey_format_current, k)); - out->format = KEY_FORMAT_LOCAL_BTREE; - btree_keys_account_key_add(&n2->nr, 0, out); - out = bkey_next(out); - } + set_btree_bset_end(n[i], n[i]->set); - set2->u64s = cpu_to_le16((u64 *) out - set2->_data); - set_btree_bset_end(n2, n2->set); + btree_node_reset_sib_u64s(n[i]); - BUG_ON(!set1->u64s); - BUG_ON(!set2->u64s); + bch2_verify_btree_nr_keys(n[i]); - btree_node_reset_sib_u64s(n1); - btree_node_reset_sib_u64s(n2); - - bch2_verify_btree_nr_keys(n1); - bch2_verify_btree_nr_keys(n2); - - if (n1->c.level) { - btree_node_interior_verify(as->c, n1); - btree_node_interior_verify(as->c, n2); + if (b->c.level) + btree_node_interior_verify(as->c, n[i]); } - - return n2; } /* @@ -1463,41 +1436,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct keylist *keys) { - struct btree_node_iter node_iter; - struct bkey_i *k = bch2_keylist_front(keys); - struct bkey_packed *src, *dst, *n; - struct bset *i; + if (!bch2_keylist_empty(keys) && + bpos_cmp(bch2_keylist_front(keys)->k.p, + b->data->max_key) <= 0) { + struct btree_node_iter node_iter; - bch2_btree_node_iter_init(&node_iter, b, &k->k.p); + bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - /* - * We can't tolerate whiteouts here - with whiteouts there can be - * duplicate keys, and it would be rather bad if we picked a duplicate - * for the pivot: - */ - i = btree_bset_first(b); - src = dst = i->start; - while (src != vstruct_last(i)) { - n = bkey_next(src); - if (!bkey_deleted(src)) { - memmove_u64s_down(dst, src, src->u64s); - dst = bkey_next(dst); - } - src = n; + btree_node_interior_verify(as->c, b); } - - /* Also clear out the unwritten whiteouts area: */ - b->whiteout_u64s = 0; - - 
i->u64s = cpu_to_le16((u64 *) dst - i->_data); - set_btree_bset_end(b, b->set); - - BUG_ON(b->nsets != 1 || - b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); - - btree_node_interior_verify(as->c, b); } static int btree_split(struct btree_update *as, struct btree_trans *trans, @@ -1516,15 +1465,21 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); - n1 = bch2_btree_node_alloc_replacement(as, trans, b); + if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { + struct btree *n[2]; - if (keys) - btree_split_insert_keys(as, trans, path, n1, keys); - - if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_and_count(c, btree_node_split, c, b); - n2 = __btree_split_node(as, trans, n1); + n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); + n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); + + __btree_split_node(as, trans, b, n); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + btree_split_insert_keys(as, trans, path, n2, keys); + BUG_ON(!bch2_keylist_empty(keys)); + } bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1573,6 +1528,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, } else { trace_and_count(c, btree_node_compact, c, b); + n1 = bch2_btree_node_alloc_replacement(as, trans, b); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + BUG_ON(!bch2_keylist_empty(keys)); + } + bch2_btree_build_aux_trees(n1); bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); -- cgit From 3bce13837395f1af2fd585ef6dd4b54d80c2d1fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Nov 2022 23:17:55 -0500 Subject: bcachefs: Fix for_each_btree_key2() Previously, when we exited from the loop body with a break statement _ret wouldn't have been assigned to yet, and we could spuriously return a transaction restart error. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 635cf97f5692..1c60122c5ea5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -490,11 +490,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, \ while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret = 0; \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ - if (!(_k).k) { \ - _ret = 0; \ + if (!(_k).k) \ break; \ - } \ \ _ret = bkey_err(_k) ?: (_do); \ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ -- cgit From 8eb71e9e1af8aaec3c70673560931a328fe840bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Nov 2022 15:57:07 -0500 Subject: bcachefs: Improve a few warnings Warnings ought to always have a format string/log message - makes them considerably more useful. 
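In other words, prefer WARN_ONCE() with the offending values over a bare WARN_ON_ONCE() on the condition. Illustrative shape (variable names here are made up, not the actual fs-io.c code):

    /* before: condition only, nothing useful in the log */
    WARN_ON_ONCE(end > i_size_rounded);

    /* after: say what went wrong and with which values */
    WARN_ONCE(end > i_size_rounded,
              "writing past i_size: %llu > %llu\n", end, i_size_rounded);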
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 968e670229d3..36c42231b7ed 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1442,9 +1442,13 @@ do_io: sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -2740,8 +2744,10 @@ int bch2_truncate(struct mnt_idmap *idmap, if (ret) goto err; - WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && - inode->v.i_size < inode_u.bi_size); + WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size, + "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", + (u64) inode->v.i_size, inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(idmap, inode, &inode_u, iattr); -- cgit From 7fec8266af12b655e98978050e716e12e8544fe6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Nov 2022 20:25:08 -0500 Subject: bcachefs: Error message improvement - Centralize format strings in bcachefs.h - Add bch2_fmt_inum_offset() and related helpers - Switch error messages for inodes to also print out the offset, in bytes Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 39 ++++++++++++++++++++++++++++++------ fs/bcachefs/error.h | 29 +++++++++------------------ fs/bcachefs/fs-io.c | 4 +++- fs/bcachefs/io.c | 52 ++++++++++++++++++++++++++++++++---------------- fs/bcachefs/io_types.h | 1 + fs/bcachefs/journal_io.c | 8 ++++---- 6 files changed, 85 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 90a1bd1f9d63..c8ab0e2029df 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -230,15 +230,27 @@ do { \ #endif #ifdef BCACHEFS_LOG_PREFIX -#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) -#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) + +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) + #else -#define bch2_log_msg(_c, fmt) fmt -#define bch2_fmt(_c, fmt) fmt "\n" -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) + +#define bch2_log_msg(_c, fmt) fmt +#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "inum %llu 
offset %llu: " fmt "\n", (_inum), (_offset) + #endif +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") + #define bch_info(c, fmt, ...) \ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ @@ -247,13 +259,28 @@ do { \ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn_ratelimited(c, fmt, ...) \ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev(ca, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset(ca, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) +#define bch_err_inum(c, _inum, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev_ratelimited(ca, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) \ do { \ diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index bbf9b6d85b4d..dae72620dae3 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -186,36 +186,25 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *); -/* Logs message and handles the error: */ -#define bch2_dev_io_error(ca, fmt, ...) \ -do { \ - printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ - (ca)->name, ##__VA_ARGS__); \ - bch2_io_error(ca); \ -} while (0) - -#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ -do { \ - printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ - (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ - bch2_io_error(ca); \ -} while (0) - #define bch2_dev_io_err_on(cond, ca, ...) \ ({ \ bool _ret = (cond); \ \ - if (_ret) \ - bch2_dev_io_error(ca, __VA_ARGS__); \ + if (_ret) { \ + bch_err_dev_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca); \ + } \ _ret; \ }) -#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \ +#define bch2_dev_inum_io_err_on(cond, ca, ...) 
\ ({ \ bool _ret = (cond); \ \ - if (_ret) \ - bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ + if (_ret) { \ + bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca); \ + } \ _ret; \ }) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 36c42231b7ed..8deb476a17c8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1087,7 +1087,9 @@ err: goto retry; if (ret) { - bch_err_inum_ratelimited(c, inum.inum, + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 701bfc8ce0e4..821c14763c66 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -571,6 +571,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->have_ioref = bch2_dev_get_ioref(ca, type == BCH_DATA_btree ? READ : WRITE); n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { @@ -680,8 +681,12 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret) { - bch_err_inum_ratelimited(c, op->pos.inode, - "write error while doing btree update: %s", bch2_err_str(ret)); + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); goto err; } } @@ -787,7 +792,7 @@ static void bch2_write_endio(struct bio *bio) if (bch2_dev_inum_io_err_on(bio->bi_status, ca, op->pos.inode, - op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + wbio->inode_offset << 9, "data write error: %s", bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); @@ -1405,8 +1410,10 @@ void bch2_write(struct closure *cl) wbio_init(bio)->put_bio = false; if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_ratelimited(c, op->pos.inode, - "misaligned write"); + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); op->error = -EIO; goto err; } @@ -1987,20 +1994,25 @@ csum_err: goto out; } - bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: - bch_err_inum_ratelimited(c, rbio->read_pos.inode, - "decompression error"); + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); goto out; decrypt_err: - bch_err_inum_ratelimited(c, rbio->read_pos.inode, - "decrypt error"); + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); goto out; } @@ -2075,7 +2087,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_reflink_v && k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, 
"%llu len %u points to nonexistent indirect extent %llu", orig_k->k->k.p.offset, orig_k->k->k.size, @@ -2161,8 +2175,9 @@ retry_pick: goto hole; if (pick_ret < 0) { - bch_err_inum_ratelimited(c, k.k->p.inode, - "no device to read from"); + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); goto err; } @@ -2341,8 +2356,10 @@ get_bio: if (!rbio->pick.idx) { if (!rbio->have_ioref) { - bch_err_inum_ratelimited(c, k.k->p.inode, - "no device to read from"); + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -2515,8 +2532,9 @@ err: bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_ratelimited(c, inum.inum, - "read error %i from btree lookup", ret); + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index b31f2a22f098..8e83ce5bc805 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -88,6 +88,7 @@ struct bch_write_bio { struct bch_write_bio *parent; u64 submit_time; + u64 inode_offset; struct bch_devs_list failed; u8 dev; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1db2ccf2627a..6b9bd1f55fe3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1268,10 +1268,10 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) - printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", - ca->name, i->ptrs[ptr].sector, - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); + bch_err_dev_offset(ca, i->ptrs[ptr].sector, + "invalid journal checksum, seq %llu%s", + le64_to_cpu(i->j.seq), + i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, -- cgit From 42af0ad569edbfcd252e9abf0badd97b895c34be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Nov 2022 16:03:15 -0500 Subject: bcachefs: Fix a race with b->write_type b->write_type needs to be set atomically with setting the btree_node_need_write flag, so move it into b->flags. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 17 ----------------- fs/bcachefs/btree_cache.c | 1 - fs/bcachefs/btree_io.c | 14 +++++++++----- fs/bcachefs/btree_types.h | 20 +++++++++++++++++++- fs/bcachefs/btree_update_interior.c | 12 ++++++++++-- fs/bcachefs/btree_update_leaf.c | 4 ++-- 6 files changed, 40 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c8ab0e2029df..e61dc1e6da06 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -627,23 +627,6 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) -#define BCH_BTREE_WRITE_TYPES() \ - x(initial, 0) \ - x(init_next_bset, 1) \ - x(cache_reclaim, 2) \ - x(journal_reclaim, 3) \ - x(interior, 4) - -enum btree_write_type { -#define x(t, n) BTREE_WRITE_##t, - BCH_BTREE_WRITE_TYPES() -#undef x - BTREE_WRITE_TYPE_NR, -}; - -#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) -#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) - struct bch_fs { struct closure cl; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5adfdc5afbea..c9d287f38d63 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -631,7 +631,6 @@ out: b->flags = 0; b->written = 0; b->nsets = 0; - b->write_type = 0; b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5d750f447241..8dbe930c1eb2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1636,6 +1636,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); unsigned long old, new, v; + unsigned type = 0; bch2_btree_complete_write(c, b, w); @@ -1654,6 +1655,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new |= (1U << BTREE_NODE_write_in_flight_inner); new |= (1U << BTREE_NODE_just_written); new ^= (1U << BTREE_NODE_write_idx); + + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; } else { new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); @@ -1661,7 +1665,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); else wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1846,6 +1850,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (old & (1 << BTREE_NODE_write_in_flight)) return; + if (flags & BTREE_WRITE_ONLY_IF_NEED) + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); new |= (1 << BTREE_NODE_write_in_flight); @@ -1857,10 +1865,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (new & (1U << BTREE_NODE_need_write)) return; do_write: - if ((flags & BTREE_WRITE_ONLY_IF_NEED)) - type = b->write_type; - b->write_type = 0; - BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); atomic_dec(&c->btree_cache.dirty); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 38c4754dbd7e..72e6a214b89a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -77,7 +77,6 @@ struct btree { u8 nsets; u8 
nr_key_bits; u16 version_ondisk; - u8 write_type; struct bkey_format format; @@ -445,6 +444,23 @@ struct btree_trans { struct replicas_delta_list *fs_usage_deltas; }; +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) + #define BTREE_FLAGS() \ x(read_in_flight) \ x(read_error) \ @@ -464,6 +480,8 @@ struct btree_trans { x(never_write) enum btree_flags { + /* First bits for btree node write type */ + BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, #define x(flag) BTREE_NODE_##flag, BTREE_FLAGS() #undef x diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ac3a5ef1b1af..03e016758af3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1257,6 +1257,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct bch_fs *c = as->c; struct bkey_packed *k; struct printbuf buf = PRINTBUF; + unsigned long old, new, v; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); @@ -1294,8 +1295,15 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); - set_btree_node_need_write(b); - b->write_type = BTREE_WRITE_interior; + + v = READ_ONCE(b->flags); + do { + old = new = v; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_interior; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1405ad4eda02..445f8f57ef0c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -178,11 +178,11 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, w->journal.seq != seq) break; + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; new |= 1 << BTREE_NODE_need_write; } while ((v = cmpxchg(&b->flags, old, new)) != old); - b->write_type = BTREE_WRITE_journal_reclaim; - btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); -- cgit From 96c2e01083f19c75421002bebb819a668839184e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Nov 2022 19:07:40 -0500 Subject: bcachefs: Fix a transaction path overflow It turns out we need bch2_extent_trim_atomi() even when we're deleting extents one at a time because it's possible for one reflink_p to reference arbitrarily many reflink_v extents. This doesn't normally happen, but the data move path can fragment existing extents in the background. 
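The deletion loop therefore clamps each delete to an atomic unit when iterating in extents mode (as in the hunk below):

    if (iter.flags & BTREE_ITER_IS_EXTENTS) {
        bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset);

        ret = bch2_extent_trim_atomic(trans, &iter, &delete);
        if (ret)
            goto err;
    }

which bounds how many btree paths a single commit can pin, even when one reflink_p covers many reflink_v extents.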
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4161cd850eb8..f026e2f70dcd 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -599,7 +599,6 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); while (1) { @@ -622,6 +621,14 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; + if (iter.flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset); + + ret = bch2_extent_trim_atomic(trans, &iter, &delete); + if (ret) + goto err; + } + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -- cgit From ff56d68cf9ea04504be94eb7a476efcb92028a42 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Nov 2022 21:20:58 -0500 Subject: bcachefs: Improve journal_read() logging Print out the journal entries we read and will replay as soon as possible - if we get an error walidating keys it's helpful to know where it was in the journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 6b9bd1f55fe3..485fd6f3003b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1106,7 +1106,6 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) struct bch_dev *ca; unsigned iter; struct printbuf buf = PRINTBUF; - size_t keys = 0, entries = 0; bool degraded = false; u64 seq, last_seq = 0; int ret = 0; @@ -1137,7 +1136,8 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (jlist.ret) return jlist.ret; - *start_seq = 0; + *start_seq = 0; + *blacklist_seq = 0; /* * Find most recent flush entry, and ignore newer non flush entries - @@ -1150,7 +1150,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) continue; if (!*start_seq) - *start_seq = le64_to_cpu(i->j.seq) + 1; + *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (!JSET_NO_FLUSH(&i->j)) { int write = READ; @@ -1180,6 +1180,13 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) goto err; } + bch_info(c, "journal read done, replaying entries %llu-%llu", + last_seq, *blacklist_seq - 1); + + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); + /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1252,8 +1259,6 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) } genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct jset_entry *entry; - struct bkey_i *k, *_n; struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_required = 1, @@ -1303,18 +1308,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (ret) goto err; } - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - entries++; } - - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, *start_seq); - - if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - 
*blacklist_seq, *start_seq - 1); err: fsck_err: printbuf_exit(&buf); -- cgit From dab1e24867f0e694c8ab73c075d10676c2699d85 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Nov 2022 21:40:35 -0500 Subject: bcachefs: Handle last journal write being torn If the last journal write didn't complete sucessfully due to a torn write, we'll detect it as a checksum error. In that case, we should just pretend that journal entry was never written. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 485fd6f3003b..d1deb0573ffd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1106,7 +1106,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) struct bch_dev *ca; unsigned iter; struct printbuf buf = PRINTBUF; - bool degraded = false; + bool degraded = false, last_write_torn = false; u64 seq, last_seq = 0; int ret = 0; @@ -1142,8 +1142,13 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: + * + * + * XXX check for torn write on last journal entry */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + int write = READ; + i = *_i; if (!i || i->ignore) @@ -1152,21 +1157,27 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; - if (!JSET_NO_FLUSH(&i->j)) { - int write = READ; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, &i->j, NULL, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq))) - i->j.last_seq = i->j.seq; - - last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; - break; + if (JSET_NO_FLUSH(&i->j)) { + journal_replay_free(c, i); + continue; } - journal_replay_free(c, i); + if (!last_write_torn && !i->csum_good) { + last_write_torn = true; + journal_replay_free(c, i); + continue; + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; } if (!*start_seq) { -- cgit From 001783e2614ea333267e443a9b38ac25644f839b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Nov 2022 22:05:45 -0500 Subject: bcachefs: Split out __bch2_btree_node_get() Standard splitting out of the slow path from the fast path of a function. We may follow this up in another patch with inlining the fast path into btree_iter.c. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 162 ++++++++++++++++++++++++++++++---------------- 1 file changed, 108 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c9d287f38d63..91ddbc7b8489 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -794,19 +794,10 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) btree_bad_header(c, b); } -/** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. 
- * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. - */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; @@ -815,18 +806,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: - */ - if (likely(c->opts.btree_node_mem_ptr_optimization && - b && - b->hash_val == btree_ptr_hash_val(k))) - goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -845,35 +824,6 @@ retry: if (IS_ERR(b)) return b; } else { -lock_node: - /* - * There's a potential deadlock with splits and insertions into - * interior nodes we have to avoid: - * - * The other thread might be holding an intent lock on the node - * we want, and they want to update its parent node so they're - * going to upgrade their intent lock on the parent node to a - * write lock. - * - * But if we're holding a read lock on the parent, and we're - * trying to get the intent lock they're holding, we deadlock. - * - * So to avoid this we drop the read locks on parent nodes when - * we're starting to take intent locks - and handle the race. - * - * The race is that they might be about to free the node we - * want, and dropping our read lock on the parent node lets them - * update the parent marking the node we want as freed, and then - * free it: - * - * To guard against this, btree nodes are evicted from the cache - * when they're freed - and b->hash_val is zeroed out, which we - * check for after we lock the node. - * - * Then, bch2_btree_node_relock() on the parent will fail - because - * the parent was modified, when the pointer to the node we want - * was removed - and we'll bail out: - */ if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); @@ -946,6 +896,110 @@ lock_node: return b; } +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. 
+ */ +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (unlikely(!c->opts.btree_node_mem_ptr_optimization || + !b || + b->hash_val != btree_ptr_hash_val(k))) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = b->c.lock.state.seq; + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, const struct bkey_i *k, enum btree_id btree_id, -- cgit From c9ee99ad8c52a9d7f93e2e9f786a172a849622fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Nov 2022 22:06:04 -0500 Subject: bcachefs: Move some asserts behind CONFIG_BCACHEFS_DEBUG Convert some non-critical asserts in long-stable code to debug asserts. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_update_leaf.c | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e4ea77d7c1d8..331479f34a0c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1490,7 +1490,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool intent = flags & BTREE_ITER_INTENT; int i; - BUG_ON(trans->restarted); + EBUG_ON(trans->restarted); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 445f8f57ef0c..7029391496cb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -517,11 +517,12 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) } } +#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); - +#endif return 0; } @@ -767,6 +768,7 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } +#ifdef CONFIG_BCACHEFS_DEBUG static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, struct btree_insert_entry *i, struct printbuf *err) @@ -793,6 +795,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, return -EINVAL; } +#endif /* * Get journal reservation, take write locks, and attempt to do btree update(s): @@ -805,15 +808,17 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_insert_entry *i; struct printbuf buf = PRINTBUF; int ret, u64s_delta = 0; - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; +#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, rw, &buf))) return bch2_trans_commit_bkey_invalid(trans, i, &buf); btree_insert_entry_checks(trans, i); } - +#endif printbuf_exit(&buf); trans_for_each_update(trans, i) { @@ -1042,13 +1047,13 @@ int __bch2_trans_commit(struct btree_trans *trans) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - BUG_ON(!i->path->should_be_locked); + EBUG_ON(!i->path->should_be_locked); ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); if (unlikely(ret)) goto out; - BUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1078,7 +1083,7 @@ int __bch2_trans_commit(struct btree_trans *trans) goto err; } retry: - BUG_ON(trans->restarted); + EBUG_ON(trans->restarted); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ret = do_bch2_trans_commit(trans, &i, _RET_IP_); @@ -1455,10 +1460,9 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - BUG_ON(!path->should_be_locked); - - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - BUG_ON(bpos_cmp(k->k.p, path->pos)); + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(bpos_cmp(k->k.p, path->pos)); n = (struct btree_insert_entry) { .flags = flags, @@ -1487,7 +1491,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { - BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); i->flags = n.flags; @@ -1530,7 +1534,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa return 0; } -static int __must_check +static inline int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, struct bkey_i *k, enum btree_update_flags flags) { -- cgit From a2519a9688d3eeb6c4b2df3ab80b70e62458528d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Nov 2022 23:47:22 -0500 Subject: bcachefs: Tiny bch2_trans_update_by_path_trace() optimization This just removes a redundant comparison - there's more work we could do here to remove some redundant copying. 
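The pattern in the diff below is simply to remember the last comparison result from the search loop instead of calling btree_insert_entry_cmp() a second time to test for an exact match. A generic sketch of the same upsert-position idiom, with hypothetical names rather than the bcachefs code:

struct entry { int key, val; };

/* Find where n would be inserted in a sorted array; report exact matches. */
static struct entry *upsert_pos(struct entry *arr, unsigned nr,
				const struct entry *n, bool *exact)
{
	struct entry *i;
	int cmp = -1;

	for (i = arr; i < arr + nr; i++) {
		cmp = (n->key > i->key) - (n->key < i->key);
		if (cmp <= 0)
			break;
	}

	/* Reuse the remembered result rather than comparing again: */
	*exact = i < arr + nr && !cmp;
	return i;
}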
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7029391496cb..3782dd56088f 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1459,6 +1459,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; + int cmp; EBUG_ON(!path->should_be_locked); EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); @@ -1485,12 +1486,13 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) break; + } - if (i < trans->updates + trans->nr_updates && - !btree_insert_entry_cmp(&n, i)) { + if (!cmp && i < trans->updates + trans->nr_updates) { EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); -- cgit From 0aba9eba76442d6887dc98924bb8c0396a79c984 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Nov 2022 03:38:31 -0500 Subject: bcachefs: Inline bch2_bkey_format_add_key() This is only called in two places, and when it's used we use it in a tight loop - it's definitely worth inlining. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 26 -------------------------- fs/bcachefs/bkey.h | 47 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 1c9c02deffbe..0291d216e5dd 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -262,14 +262,6 @@ bool bch2_bkey_transform(const struct bkey_format *out_f, return true; } -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) - struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, const struct bkey_packed *in) { @@ -553,24 +545,6 @@ void bch2_bkey_format_init(struct bkey_format_state *s) s->field_min[BKEY_FIELD_SIZE] = 0; } -static void __bkey_format_add(struct bkey_format_state *s, - unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x - __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -} - void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) { unsigned field = 0; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 137b2d8bdb49..0ce020bcc55f 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -97,17 +97,6 @@ do { \ struct btree; -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch2_bkey_format_init(struct bkey_format_state *); -void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -void bch2_bkey_format_add_pos(struct 
bkey_format_state *, struct bpos); -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -const char *bch2_bkey_format_validate(struct bkey_format *); - __pure unsigned bch2_bkey_greatest_differing_bit(const struct btree *, const struct bkey_packed *, @@ -671,4 +660,40 @@ void bch2_bkey_pack_test(void); static inline void bch2_bkey_pack_test(void) {} #endif +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey_format_state { + u64 field_min[BKEY_NR_FIELDS]; + u64 field_max[BKEY_NR_FIELDS]; +}; + +void bch2_bkey_format_init(struct bkey_format_state *); + +static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x + __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); +const char *bch2_bkey_format_validate(struct bkey_format *); + #endif /* _BCACHEFS_BKEY_H */ -- cgit From 98638ffa1d914e780a527c0bd92323f0b7307f09 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 18:22:59 -0500 Subject: bcachefs: Better inlining in bch2_subvolume_get_snapshot() This provides an inlined version of bch2_subvolume_get() and uses it in bch2_subvolume_get_snapshot(), since this is the version that's used all over the place and in fast paths (e.g. IO paths). 
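The structure of the change: keep a single __always_inline implementation in the .c file, call it directly from the hot path in that same file, and export a thin out-of-line wrapper so other callers keep using the old symbol. A rough sketch of that shape, with hypothetical names (u32 and __always_inline as provided by the usual kernel headers):

struct ctx;
struct result { u32 snapshot; };

/* Single implementation; always inlined into callers in this file: */
static __always_inline int subvol_lookup_inlined(struct ctx *c, unsigned subvol,
						 struct result *out)
{
	/* ... iterate the btree and fill *out ... */
	return 0;
}

/* Out-of-line wrapper for callers in other files (keeps code size in check): */
int subvol_lookup(struct ctx *c, unsigned subvol, struct result *out)
{
	return subvol_lookup_inlined(c, subvol, out);
}

/* Hot path in the same file gets the inlined copy: */
int subvol_get_snapshot(struct ctx *c, unsigned subvol, u32 *snapshot)
{
	struct result r;
	int ret = subvol_lookup_inlined(c, subvol, &r);

	if (!ret)
		*snapshot = r.snapshot;
	return ret;
}

Note that the diff also stops writing *snapid when the lookup fails; the sketch mirrors that by assigning only on success.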
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 1133783477e1..0e3b6ae3835a 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -795,10 +795,11 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->snapshot)); } -int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - int iter_flags, - struct bch_subvolume *s) +static __always_inline int +bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) { struct btree_iter iter; struct bkey_s_c k; @@ -818,6 +819,14 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, return ret; } +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); +} + int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_subvolume *subvol) { @@ -833,12 +842,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, struct bch_subvolume s; int ret; - ret = bch2_subvolume_get(trans, subvol, true, - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES, - &s); - - *snapid = le32_to_cpu(s.snapshot); + ret = bch2_subvolume_get_inlined(trans, subvol, true, + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES, + &s); + if (!ret) + *snapid = le32_to_cpu(s.snapshot); return ret; } -- cgit From abb936fb9f2ab4a447a266477d65e50d476277a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 20:28:15 -0500 Subject: bcachefs: Improve bch2_inode_opts_to_opts() It turns out the *_defined entries of bch_io_opts are only used in one place - in the xattr get path - and there we immediately convert to a bch_opts struct, which also has the *_defined entries. This patch changes bch2_inode_opts_to_opts() to go directly from bch_inode_unpacked to bch_opts, which is a minor simplification and will also let us slim down struct bch_io_opts in another patch. 
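The new helper is generated with the BCH_INODE_OPTS() x-macro; judging by the "- 1" in the diff, the per-inode option fields appear to be stored with a +1 bias so that zero means "not set". A condensed sketch of the x-macro technique, with made-up option names rather than the real bcachefs list:

typedef unsigned long long u64;	/* stand-in for the kernel's u64 */

/* Name every option once; expand the list wherever it's needed. */
#define MY_OPTS()			\
	x(compression,        8)	\
	x(background_target, 16)

struct my_opts {
#define x(_name, _bits)	u64 _name;
	MY_OPTS()
#undef x
};

struct my_inode {
#define x(_name, _bits)	u64 bi_##_name;
	MY_OPTS()
#undef x
};

static struct my_opts inode_opts_to_opts(const struct my_inode *inode)
{
	struct my_opts opts = { 0 };

#define x(_name, _bits)					\
	if (inode->bi_##_name)				\
		opts._name = inode->bi_##_name - 1;	/* undo the +1 bias */
	MY_OPTS()
#undef x
	return opts;
}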
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 11 +++++++++++ fs/bcachefs/inode.h | 2 ++ fs/bcachefs/opts.c | 11 ----------- fs/bcachefs/opts.h | 1 - fs/bcachefs/xattr.c | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f026e2f70dcd..827a0b04b00f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -763,3 +763,14 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked * else bi->bi_flags |= BCH_INODE_UNLINKED; } + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) +{ + struct bch_opts ret = { 0 }; +#define x(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef x + return ret; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9ea0d575a183..27744f78ae96 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -187,4 +187,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); + #endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 98568f21d6d0..9c49d543b062 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -540,17 +540,6 @@ struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) return ret; } -struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) -{ - struct bch_opts ret = { 0 }; -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(ret, _name, src._name); - BCH_INODE_OPTS() -#undef x - return ret; -} - void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) { #define x(_name, _bits) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6e2bd6e01f8c..a32a7ab73bd5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -513,7 +513,6 @@ struct bch_io_opts { }; struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 2b9fb4941e9f..bd118f6ea08b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -444,7 +444,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_opts opts = - bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + bch2_inode_opts_to_opts(&inode->ei_inode); const struct bch_option *opt; int id, inode_opt_id; struct printbuf out = PRINTBUF; -- cgit From 4a390fec24a3d6f88678b43f2baa8ad9c2f0716c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 18:51:27 -0500 Subject: bcachefs: Kill some unneeded references to c->flags This drops some unneeded references to JOURNAL_REPLAY_DONE in c->flags: we're already mirroring it in btree_trans, we just weren't using it consistently. 
We may want to do this with more flags: btree_iter.c: unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) btree_update_leaf.c: if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3782dd56088f..8db474c6146e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -615,7 +615,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); -- cgit From 4d868d18e569e1f74c2a59d70ee7f0f0f099f677 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Nov 2022 23:52:28 -0500 Subject: bcachefs: More dio inlining Eliminate another function call in the O_DIRECT write path. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8deb476a17c8..4dd5ebafe742 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2218,7 +2218,7 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) set_bit(EI_INODE_ERROR, &inode->ei_flags); } -static long bch2_dio_write_loop(struct dio_write *dio) +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) { struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; @@ -2333,18 +2333,10 @@ err: goto out; } -static void bch2_dio_write_loop_async(struct bch_write_op *op) +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) { - struct dio_write *dio = container_of(op, struct dio_write, op); struct mm_struct *mm = dio->mm; - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) { - bch2_dio_write_done(dio); - return; - } - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); if (mm) @@ -2354,6 +2346,18 @@ static void bch2_dio_write_loop_async(struct bch_write_op *op) kthread_unuse_mm(mm); } +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + static noinline ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) { -- cgit From c96f108b053b394d622f56f2bcefeccb32d0394c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 00:40:27 -0500 Subject: bcachefs: Optimize bch2_trans_iter_init() When flags & btree_id are constants, we can constant fold the entire calculation of the actual iterator flags - and the whole thing becomes small enough to inline. 
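The mechanism, visible in the diff below, is a __builtin_constant_p() dispatch: when the compiler can prove btree_id and flags are compile-time constants, the flag computation constant-folds and the whole init inlines; otherwise a small out-of-line helper is called. A stripped-down sketch of that pattern, with hypothetical names:

struct iter { unsigned btree_id, flags; };

static inline unsigned compute_flags(unsigned btree_id, unsigned flags)
{
	/* stand-in for the real flag fixups; folds away for constant inputs */
	if (btree_id == 0)
		flags |= 1U << 0;
	return flags;
}

static inline void iter_init_common(struct iter *iter, unsigned btree_id,
				    unsigned flags)
{
	iter->btree_id = btree_id;
	iter->flags    = flags;
}

/* Out-of-line fallback, defined once in a .c file: */
void iter_init_outlined(struct iter *iter, unsigned btree_id, unsigned flags);

static inline void iter_init(struct iter *iter, unsigned btree_id, unsigned flags)
{
	if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags))
		iter_init_common(iter, btree_id, compute_flags(btree_id, flags));
	else
		iter_init_outlined(iter, btree_id, flags);
}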
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 82 ++++++++++++----------------------------------- fs/bcachefs/btree_iter.h | 76 +++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/btree_types.h | 1 - 3 files changed, 94 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 331479f34a0c..238ba10d34e4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2623,68 +2623,22 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static inline void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) -{ - if (unlikely(trans->restarted)) - panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", - bch2_err_str(trans->restarted), - (void *) trans->last_restarted_ip); - - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && - btree_node_type_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; - - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; - - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && - btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; - - if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; - - iter->trans = trans; - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; - iter->btree_id = btree_id; - iter->min_depth = depth; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k.type = KEY_TYPE_deleted; - iter->k.p = pos; - iter->k.size = 0; - iter->journal_idx = 0; - iter->journal_pos = POS_MIN; - - iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags); -} - -void bch2_trans_iter_init(struct btree_trans *trans, +static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, unsigned flags) { - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags)); +} - __bch2_trans_iter_init(trans, iter, btree_id, pos, - 0, 0, flags); +void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags)); } void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -2695,11 +2649,15 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, - BTREE_ITER_NOT_EXTENTS| - __BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags); + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags)); + + iter->min_depth = depth; + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); 
BUG_ON(iter->path->level != depth); BUG_ON(iter->min_depth != depth); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1c60122c5ea5..3f46c60b748b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -315,8 +315,80 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna } void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, - unsigned, struct bpos, unsigned); + +static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + return flags; +} + +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + + return __bch2_btree_iter_flags(trans, btree_id, flags); +} + +static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) +{ + memset(iter, 0, sizeof(*iter)); + iter->trans = trans; + iter->btree_id = btree_id; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.p = pos; + + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags); +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + if (__builtin_constant_p(btree_id) && + __builtin_constant_p(flags)) + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags)); + else + bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); +} + void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 72e6a214b89a..ae5a692d1924 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -278,7 +278,6 @@ struct btree_iter { unsigned snapshot; struct bpos pos; - struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. -- cgit From 30c92ffe4752b10059cfe00cea775d4af2f5196c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 18:23:48 -0500 Subject: bcachefs: Better inlining in bch2_time_stats_update() Move the actual slowpath off into a new function - bch2_time_stats_clear_buffer() - and inline bch2_time_stats_update_one(). Alo, use the new inlined update functions from mean_and_variance. 
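The shape of the optimization: the per-event fast path stays tiny and inlinable, while the rare flush of a full buffer is pushed into a separate noinline function so its locking and loop are not duplicated into every caller. A rough sketch of that fast-path/slow-path split, with hypothetical names (u64, noinline and unlikely as provided by the usual kernel headers):

#define N_ENTRIES 32

struct stats;				/* opaque accumulator */
void account_one(struct stats *s, u64 v);

struct buf {
	unsigned nr;
	u64 entries[N_ENTRIES];
};

/* Rare, heavyweight path: drain the whole buffer under whatever lock is needed. */
static noinline void flush_buffer(struct stats *s, struct buf *b)
{
	for (unsigned i = 0; i < b->nr; i++)
		account_one(s, b->entries[i]);
	b->nr = 0;
}

/* Hot path: usually a single store plus a predicted-not-taken branch. */
static inline void record(struct stats *s, struct buf *b, u64 v)
{
	b->entries[b->nr++] = v;
	if (unlikely(b->nr == N_ENTRIES))
		flush_buffer(s, b);
}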
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 8b2eef24498e..31934f7a6436 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -319,8 +319,8 @@ static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) } } -static void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) +static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) { u64 duration, freq; @@ -343,6 +343,22 @@ static void bch2_time_stats_update_one(struct bch2_time_stats *stats, } } +static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + struct bch2_time_stat_buffer_entry *i; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; +} + void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; @@ -362,7 +378,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) GFP_ATOMIC); spin_unlock_irqrestore(&stats->lock, flags); } else { - struct bch2_time_stat_buffer_entry *i; struct bch2_time_stat_buffer *b; preempt_disable(); @@ -374,17 +389,8 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) .end = end }; - if (b->nr == ARRAY_SIZE(b->entries)) { - spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; - } - + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + bch2_time_stats_clear_buffer(stats, b); preempt_enable(); } } -- cgit From 42fab2695bfea686dc5eef92da0b4ce3277484a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 12:56:49 -0500 Subject: bcachefs: Kill BCH_FEATURE_incompressible This isn't needed anymore, we only support metadata versions that have this. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 821c14763c66..5fe049d64e27 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -660,14 +660,9 @@ static void __bch2_write_index(struct bch_write_op *op) * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) { + for_each_keylist_key(keys, k) bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) - bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); - - } - if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); -- cgit From b6804b6103263417994602482e9186fb3697a3e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 16:15:14 -0500 Subject: bcachefs: Fix an include Signed-off-by: Kent Overstreet --- fs/bcachefs/ec_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index edd93da663c1..2bf26d254b2c 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_EC_TYPES_H #define _BCACHEFS_EC_TYPES_H -#include +#include "bcachefs_format.h" struct bch_replicas_padded { struct bch_replicas_entry e; -- cgit From 447e92274af6c7e8dcdc7921a6af238afcc87a0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 16:04:42 -0500 Subject: bcachefs: Don't set accessed bit on btree node fill Btree nodes shouldn't have their accessed bit set when entering the btree cache by being read in from disk - this fixes linear scans thrashing the cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 91ddbc7b8489..90be4c7325f7 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -707,6 +707,12 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (IS_ERR(b)) return b; + /* + * Btree nodes read in from disk should not have the accessed bit set + * initially, so that linear scans don't thrash the cache: + */ + clear_btree_node_accessed(b); + bkey_copy(&b->key, k); if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ @@ -843,6 +849,10 @@ retry: trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } if (unlikely(btree_node_read_in_flight(b))) { @@ -880,10 +890,6 @@ retry: prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); return ERR_PTR(-EIO); -- cgit From e2fcf7f6306f12c50ad2ec655fdb8d1918c23bdd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 18:29:36 -0500 Subject: bcachefs: Fix BCH_IOCTL_DISK_SET_STATE - Ensure we print an error message if necessary. Ideally we'd return the precise error code to userspace and leave printing the error message to the userspace tool, but we haven't decided to make our private error codes ABI-stable yet. 
- Return standard error code to userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 7b448b9551b6..28854a6c31b9 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -284,6 +284,8 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, return PTR_ERR(ca); ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + if (ret) + bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); percpu_ref_put(&ca->ref); return ret; @@ -631,11 +633,14 @@ do { \ \ if (copy_from_user(&i, arg, sizeof(i))) \ return -EFAULT; \ - return bch2_ioctl_##_name(c, i); \ + ret = bch2_ioctl_##_name(c, i); \ + goto out; \ } while (0) long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); @@ -679,6 +684,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) default: return -ENOTTY; } +out: + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static DEFINE_IDR(bch_chardev_minor); -- cgit From 6530d89e0a598592badde77930269740b44ea2ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Nov 2022 04:36:56 -0500 Subject: bcachefs: extents no longer require special handling for packing Extent overwrite used to be handled differently, underneath the journaling layer and within the core btree code. This imposed restrictions on bkey packing/packed formats, which no longer apply. This patch deletes those restrictions. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 9 --------- fs/bcachefs/bkey.h | 1 - 2 files changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 0291d216e5dd..a10046ae01f2 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -323,15 +323,6 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, #define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; bkey_fields() #undef x - - /* - * Extents - we have to guarantee that if an extent is packed, a trimmed - * version will also pack: - */ - if (bkey_start_offset(in) < - le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) - return false; - pack_state_finish(&state, out); out->u64s = format->key_u64s + in->u64s - BKEY_U64s; out->format = KEY_FORMAT_LOCAL_BTREE; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 0ce020bcc55f..df8189476016 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -689,7 +689,6 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s #define x(id, field) __bkey_format_add(s, id, k->field); bkey_fields() #undef x - __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); } void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -- cgit From e15382125948523cd5c887c5fe4fa4303e9a9dc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Dec 2022 19:46:49 -0500 Subject: bcachefs: New magic number Add a new bcachefs-specific magic number for the superblock, instead of continuing to use the old bcache magic number3 Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/super-io.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6e01fd81e3f0..6ee9321e7d21 100644 --- a/fs/bcachefs/bcachefs_format.h +++ 
b/fs/bcachefs/bcachefs_format.h @@ -1514,7 +1514,7 @@ struct bch_sb_layout { * @version_min - Oldest metadata version this filesystem contains; so we can * safely drop compatibility code and refuse to mount filesystems * we'd need it for - * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) + * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying * member devices, never changes diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 42e3ce7c0f8c..e27b301432b1 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -809,6 +809,11 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + if (c->opts.version_upgrade) { + c->disk_sb.sb->magic = BCHFS_MAGIC; + c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + } + le64_add_cpu(&c->disk_sb.sb->seq, 1); if (test_bit(BCH_FS_ERROR, &c->flags)) -- cgit From e88a75ebe86c1df42f0ca9ab6e8fa50db26e7cef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Nov 2022 03:12:22 -0500 Subject: bcachefs: New bpos_cmp(), bkey_cmp() replacements This patch introduces - bpos_eq() - bpos_lt() - bpos_le() - bpos_gt() - bpos_ge() and equivalent replacements for bkey_cmp(). Looking at the generated assembly these could probably be improved further, but we already see a significant code size improvement with this patch. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bkey.h | 65 +++++++++++++++++++++++++++- fs/bcachefs/bkey_methods.c | 12 +++--- fs/bcachefs/bkey_methods.h | 2 +- fs/bcachefs/bset.c | 13 +++--- fs/bcachefs/btree_cache.c | 4 +- fs/bcachefs/btree_gc.c | 34 +++++++-------- fs/bcachefs/btree_io.c | 10 ++--- fs/bcachefs/btree_io.h | 4 +- fs/bcachefs/btree_iter.c | 86 ++++++++++++++++++------------------- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_key_cache.c | 8 ++-- fs/bcachefs/btree_update_interior.c | 17 ++++---- fs/bcachefs/btree_update_leaf.c | 26 +++++------ fs/bcachefs/data_update.c | 6 +-- fs/bcachefs/debug.c | 6 +-- fs/bcachefs/dirent.c | 4 +- fs/bcachefs/ec.c | 4 +- fs/bcachefs/extent_update.c | 8 ++-- fs/bcachefs/extents.c | 10 ++--- fs/bcachefs/extents.h | 5 +-- fs/bcachefs/fs-io.c | 12 +++--- fs/bcachefs/fsck.c | 8 ++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/io.c | 8 ++-- fs/bcachefs/keylist.c | 4 +- fs/bcachefs/move.c | 4 +- fs/bcachefs/recovery.c | 19 ++++---- fs/bcachefs/reflink.c | 12 +++--- fs/bcachefs/subvolume.c | 12 +++--- 31 files changed, 233 insertions(+), 178 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ffcfb9f1916e..a0b9fa30260a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -982,7 +982,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (bkey_cmp(*discard_pos_done, iter.pos) && + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* * This works without any other locks because this is the only diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 9a4a62211755..dd47eeb1efc5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -399,7 +399,7 @@ again: BTREE_ITER_SLOTS, k, ret) { struct bch_alloc_v4 a; - if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + if (bkey_ge(k.k->p, POS(ca->dev_idx, 
ca->mi.nbuckets))) break; if (ca->new_fs_bucket_idx && diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index df8189476016..dc2b91bc67f3 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -144,6 +144,37 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } +static __always_inline bool bpos_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset) | + (l.snapshot ^ r.snapshot)); +} + +static __always_inline bool bpos_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; +} + +static __always_inline bool bpos_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; +} + +static __always_inline bool bpos_gt(struct bpos l, struct bpos r) +{ + return bpos_lt(r, l); +} + +static __always_inline bool bpos_ge(struct bpos l, struct bpos r) +{ + return bpos_le(r, l); +} + static __always_inline int bpos_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: @@ -151,6 +182,36 @@ static __always_inline int bpos_cmp(struct bpos l, struct bpos r) cmp_int(l.snapshot, r.snapshot); } +static __always_inline bool bkey_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset)); +} + +static __always_inline bool bkey_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset < r.offset; +} + +static __always_inline bool bkey_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset <= r.offset; +} + +static __always_inline bool bkey_gt(struct bpos l, struct bpos r) +{ + return bkey_lt(r, l); +} + +static __always_inline bool bkey_ge(struct bpos l, struct bpos r) +{ + return bkey_le(r, l); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: @@ -159,12 +220,12 @@ static __always_inline int bkey_cmp(struct bpos l, struct bpos r) static inline struct bpos bpos_min(struct bpos l, struct bpos r) { - return bpos_cmp(l, r) < 0 ? l : r; + return bpos_lt(l, r) ? l : r; } static inline struct bpos bpos_max(struct bpos l, struct bpos r) { - return bpos_cmp(l, r) > 0 ? l : r; + return bpos_gt(l, r) ? 
l : r; } void bch2_bpos_swab(struct bpos *); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 141754db5fa1..7fcd6ca40b93 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -245,7 +245,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, } if (type != BKEY_TYPE_btree && - !bkey_cmp(k.k->p, POS_MAX)) { + bkey_eq(k.k->p, POS_MAX)) { prt_printf(err, "key at POS_MAX"); return -EINVAL; } @@ -264,12 +264,12 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, struct printbuf *err) { - if (bpos_cmp(k.k->p, b->data->min_key) < 0) { + if (bpos_lt(k.k->p, b->data->min_key)) { prt_printf(err, "key before start of btree node"); return -EINVAL; } - if (bpos_cmp(k.k->p, b->data->max_key) > 0) { + if (bpos_gt(k.k->p, b->data->max_key)) { prt_printf(err, "key past end of btree node"); return -EINVAL; } @@ -279,11 +279,11 @@ int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bpos_cmp(pos, POS_MIN)) + if (bpos_eq(pos, POS_MIN)) prt_printf(out, "POS_MIN"); - else if (!bpos_cmp(pos, POS_MAX)) + else if (bpos_eq(pos, POS_MAX)) prt_printf(out, "POS_MAX"); - else if (!bpos_cmp(pos, SPOS_MAX)) + else if (bpos_eq(pos, SPOS_MAX)) prt_printf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 0c74ba335e64..7c907b7fd0d7 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -60,7 +60,7 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b { return l->type == r->type && !bversion_cmp(l->version, r->version) && - !bpos_cmp(l->p, bkey_start_pos(r)); + bpos_eq(l->p, bkey_start_pos(r)); } bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index aa8508efca00..50a1c9d8ebab 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -83,13 +83,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bpos_cmp(n.p, k.k->p) < 0) { + if (bpos_lt(n.p, k.k->p)) { printk(KERN_ERR "Key skipped backwards\n"); continue; } - if (!bkey_deleted(k.k) && - !bpos_cmp(n.p, k.k->p)) + if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } @@ -530,7 +529,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -1065,7 +1064,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) l = m; else r = m; @@ -1318,8 +1317,8 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); - EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); + EBUG_ON(bpos_lt(*search, b->data->min_key)); + EBUG_ON(bpos_gt(*search, b->data->max_key)); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 90be4c7325f7..0ac8636edba2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -793,9 +793,9 @@ static inline void 
btree_check_header(struct bch_fs *c, struct btree *b) { if (b->c.btree_id != BTREE_NODE_ID(b->data) || b->c.level != BTREE_NODE_LEVEL(b->data) || - bpos_cmp(b->data->max_key, b->key.k.p) || + !bpos_eq(b->data->max_key, b->key.k.p) || (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, + !bpos_eq(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) btree_bad_header(c, b); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3395fa56c724..f5b46f382340 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -76,7 +76,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bpos_cmp(expected_start, bp->v.min_key)) { + if (!bpos_eq(expected_start, bp->v.min_key)) { bch2_topology_error(c); if (bkey_deleted(&prev->k->k)) { @@ -106,7 +106,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, } } - if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + if (is_last && !bpos_eq(cur.k->k.p, node_end)) { bch2_topology_error(c); printbuf_reset(&buf1); @@ -274,12 +274,12 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (prev && - bpos_cmp(expected_start, cur->data->min_key) > 0 && + bpos_gt(expected_start, cur->data->min_key) && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev: */ - if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, - cur->data->min_key) >= 0, c, + if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, + cur->data->min_key), c, "btree node overwritten by next node at btree %s level %u:\n" " node %s\n" " next %s", @@ -289,7 +289,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, goto out; } - if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, "btree node with incorrect max_key at btree %s level %u:\n" " node %s\n" @@ -301,8 +301,8 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, } else { /* prev overwrites cur: */ - if (mustfix_fsck_err_on(bpos_cmp(expected_start, - cur->data->max_key) >= 0, c, + if (mustfix_fsck_err_on(bpos_ge(expected_start, + cur->data->max_key), c, "btree node overwritten by prev node at btree %s level %u:\n" " prev %s\n" " node %s", @@ -312,7 +312,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, goto out; } - if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", @@ -336,7 +336,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); bch2_bpos_to_text(&buf2, b->key.k.p); - if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", @@ -374,8 +374,8 @@ again: bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); 
bch2_btree_and_journal_iter_advance(&iter); bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -912,8 +912,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, true); @@ -1018,7 +1018,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1027,7 +1027,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1341,7 +1341,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, enum bch_data_type type; int ret; - if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) return 1; bch2_alloc_to_v4(k, &old); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8dbe930c1eb2..9dedac2c7885 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -77,7 +77,7 @@ static void verify_no_dups(struct btree *b, struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); } #endif } @@ -645,8 +645,8 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_btree_build_aux_trees(b); for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); } } @@ -744,7 +744,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -753,7 +753,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } - btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (printbuf_reset(&buf1), diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 4b1810ad7d91..a720dd74139b 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -201,7 +201,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, { if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bpos_cmp(bn->min_key, POS_MIN) && + !bpos_eq(bn->min_key, POS_MIN) && write) bn->min_key = 
bpos_nosnap_predecessor(bn->min_key); @@ -218,7 +218,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bpos_cmp(bn->min_key, POS_MIN) && + !bpos_eq(bn->min_key, POS_MIN) && !write) bn->min_key = bpos_nosnap_successor(bn->min_key); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 238ba10d34e4..8a18b55cab26 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -93,7 +93,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) struct bpos pos = iter->pos; if ((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(pos, POS_MAX)) + !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; } @@ -101,13 +101,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_path_pos_before_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(path->pos, b->data->min_key) < 0; + return bpos_lt(path->pos, b->data->min_key); } static inline bool btree_path_pos_after_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(b->key.k.p, path->pos) < 0; + return bpos_gt(path->pos, b->key.k.p); } static inline bool btree_path_pos_in_node(struct btree_path *path, @@ -133,7 +133,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, ck = (void *) path->l[0].b; BUG_ON(ck->key.btree_id != path->btree_id || - bkey_cmp(ck->key.pos, path->pos)); + !bkey_eq(ck->key.pos, path->pos)); if (!locked) btree_node_unlock(trans, path, 0); @@ -278,8 +278,8 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0); + BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p)); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -313,7 +313,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (ret) goto out; - if (!bkey_cmp(prev.k->p, k.k->p) && + if (bkey_eq(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, prev.k->p.snapshot) > 0) { struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; @@ -355,11 +355,11 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, continue; if (!key_cache) { - if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && - bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + if (bkey_ge(pos, path->l[0].b->data->min_key) && + bkey_le(pos, path->l[0].b->key.k.p)) return; } else { - if (!bkey_cmp(pos, path->pos)) + if (bkey_eq(pos, path->pos)) return; } } @@ -1571,16 +1571,16 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); - if (!k.k || bpos_cmp(path->pos, k.k->p)) + if (!k.k || !bpos_eq(path->pos, k.k->p)) goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; EBUG_ON(ck && (path->btree_id != ck->key.btree_id || - bkey_cmp(path->pos, ck->key.pos))); + !bkey_eq(path->pos, ck->key.pos))); EBUG_ON(!ck || !ck->valid); *u = ck->k->k; @@ -1638,7 +1638,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) goto out; - BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_lt(b->key.k.p, iter->pos)); bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; @@ -1689,7 +1689,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = btree_path_node(path, path->level + 1); - if (!bpos_cmp(iter->pos, b->key.k.p)) { + if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { /* @@ -1732,9 +1732,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) { if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); @@ -1752,9 +1752,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, POS_MIN) - : bkey_cmp(pos, POS_MIN)) != 0; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, POS_MIN) + : bkey_eq(pos, POS_MIN)); if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(iter, pos); @@ -1773,11 +1773,11 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) continue; if (i->btree_id > iter->btree_id) break; - if (bpos_cmp(i->k->k.p, iter->path->pos) < 0) + if (bpos_lt(i->k->k.p, iter->path->pos)) continue; if (i->key_cache_already_flushed) continue; - if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) + if (!ret || bpos_lt(i->k->k.p, ret->k.p)) ret = i->k; } @@ -1797,7 +1797,7 @@ struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, { struct bkey_i *k; - if (bpos_cmp(iter->path->pos, iter->journal_pos) < 0) + if (bpos_lt(iter->path->pos, iter->journal_pos)) iter->journal_idx = 0; k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, @@ -1936,8 +1936,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp next_update = btree_trans_peek_updates(iter); if (next_update && - bpos_cmp(next_update->k.p, - k.k ? k.k->p : l->b->key.k.p) <= 0) { + bpos_le(next_update->k.p, + k.k ? k.k->p : l->b->key.k.p)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -1950,7 +1950,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp * whiteout, with a real key at the same position, since * in the btree deleted keys sort before non deleted. */ - search_key = bpos_cmp(search_key, k.k->p) + search_key = !bpos_eq(search_key, k.k->p) ? 
k.k->p : bpos_successor(k.k->p); continue; @@ -1958,7 +1958,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (likely(k.k)) { break; - } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ search_key = bpos_successor(l->b->key.k.p); } else { @@ -2008,19 +2008,19 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e */ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) iter_pos = k.k->p; - else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + else if (bkey_gt(bkey_start_pos(k.k), iter->pos)) iter_pos = bkey_start_pos(k.k); else iter_pos = iter->pos; - if (bkey_cmp(iter_pos, end) > 0) { + if (bkey_gt(iter_pos, end)) { bch2_btree_iter_set_pos(iter, end); k = bkey_s_c_null; goto out_no_locked; } if (iter->update_path && - bkey_cmp(iter->update_path->pos, k.k->p)) { + !bkey_eq(iter->update_path->pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; @@ -2143,7 +2143,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) /* Check if we should go up to the parent node: */ if (!k.k || (iter->advanced && - !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { iter->pos = path_l(iter->path)->b->key.k.p; btree_path_set_level_up(trans, iter->path); iter->advanced = false; @@ -2159,7 +2159,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->path->level != iter->min_depth && (iter->advanced || !k.k || - bpos_cmp(iter->pos, k.k->p))) { + !bpos_eq(iter->pos, k.k->p))) { btree_path_set_level_down(trans, iter->path, iter->min_depth); iter->pos = bpos_successor(iter->pos); iter->advanced = false; @@ -2170,7 +2170,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->path->level == iter->min_depth && iter->advanced && k.k && - !bpos_cmp(iter->pos, k.k->p)) { + bpos_eq(iter->pos, k.k->p)) { iter->pos = bpos_successor(iter->pos); iter->advanced = false; continue; @@ -2178,7 +2178,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->advanced && iter->path->level == iter->min_depth && - bpos_cmp(k.k->p, iter->pos)) + !bpos_eq(k.k->p, iter->pos)) iter->advanced = false; BUG_ON(iter->advanced); @@ -2248,8 +2248,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 - : bpos_cmp(k.k->p, search_key) > 0)) + ? 
bpos_ge(bkey_start_pos(k.k), search_key) + : bpos_gt(k.k->p, search_key))) k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); @@ -2263,7 +2263,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) * longer at the same _key_ (not pos), return * that candidate */ - if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; @@ -2298,7 +2298,7 @@ got_key: } break; - } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); } else { @@ -2309,10 +2309,10 @@ got_key: } } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); /* Extents can straddle iter->pos: */ - if (bkey_cmp(k.k->p, iter->pos) < 0) + if (bkey_lt(k.k->p, iter->pos)) iter->pos = k.k->p; if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) @@ -2377,7 +2377,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_i *next_update; if ((next_update = btree_trans_peek_updates(iter)) && - !bpos_cmp(next_update->k.p, iter->pos)) { + bpos_eq(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); goto out; @@ -2433,7 +2433,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) next = k.k ? bkey_start_pos(k.k) : POS_MAX; - if (bkey_cmp(iter->pos, next) < 0) { + if (bkey_lt(iter->pos, next)) { bkey_init(&iter->k); iter->k.p = iter->pos; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3f46c60b748b..3cf0b453a4c0 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -478,7 +478,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * if (!(flags & BTREE_ITER_SLOTS)) return bch2_btree_iter_peek_upto(iter, end); - if (bkey_cmp(iter->pos, end) > 0) + if (bkey_gt(iter->pos, end)) return bkey_s_c_null; return bch2_btree_iter_peek_slot(iter); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1ac91221cc95..0ae5d893a4f7 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -27,8 +27,8 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached *ck = obj; const struct bkey_cached_key *key = arg->key; - return cmp_int(ck->key.btree_id, key->btree_id) ?: - bpos_cmp(ck->key.pos, key->pos); + return ck->key.btree_id != key->btree_id || + !bpos_eq(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -476,7 +476,7 @@ retry: BUG_ON(ret); if (ck->key.btree_id != path->btree_id || - bpos_cmp(ck->key.pos, path->pos)) { + !bpos_eq(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -550,7 +550,7 @@ retry: return ret; if (ck->key.btree_id != path->btree_id || - bpos_cmp(ck->key.pos, path->pos)) { + !bpos_eq(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 03e016758af3..e184b857c4c4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -71,7 +71,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = 
bkey_s_c_to_btree_ptr_v2(k); - if (bpos_cmp(next_node, bp.v->min_key)) { + if (!bpos_eq(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); bch2_bpos_to_text(&buf1, next_node); bch2_bpos_to_text(&buf2, bp.v->min_key); @@ -81,7 +81,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bpos_cmp(k.k->p, b->key.k.p)) { + if (!bpos_eq(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); bch2_bpos_to_text(&buf1, b->key.k.p); bch2_bpos_to_text(&buf2, k.k->p); @@ -1328,7 +1328,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, while (!bch2_keylist_empty(keys)) { struct bkey_i *k = bch2_keylist_front(keys); - if (bpos_cmp(k->k.p, b->key.k.p) > 0) + if (bpos_gt(k->k.p, b->key.k.p)) break; bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); @@ -1445,8 +1445,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct keylist *keys) { if (!bch2_keylist_empty(keys) && - bpos_cmp(bch2_keylist_front(keys)->k.p, - b->data->max_key) <= 0) { + bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); @@ -1770,8 +1769,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, b = path->l[level].b; - if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { + if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; return 0; } @@ -1804,7 +1803,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, next = m; } - if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { + if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; bch2_bpos_to_text(&buf1, prev->data->max_key); @@ -2097,7 +2096,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, iter2.flags & BTREE_ITER_INTENT); BUG_ON(iter2.path->level != b->c.level); - BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); + BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); btree_path_set_level_up(trans, iter2.path); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8db474c6146e..323f2942b11d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -92,8 +92,8 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); - EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, b)); @@ -257,7 +257,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); BUG_ON(i->cached != i->path->cached); BUG_ON(i->level != i->path->level); BUG_ON(i->btree_id != i->path->btree_id); @@ -1141,7 +1141,7 @@ static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, if (!k.k) break; - 
if (bkey_cmp(pos, k.k->p)) + if (!bkey_eq(pos, k.k->p)) break; if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { @@ -1242,7 +1242,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, if (!k.k) goto out; - if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) { + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) @@ -1252,9 +1252,9 @@ int bch2_trans_update_extent(struct btree_trans *trans, goto next; } - while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { - bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; - bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool front_split = bkey_lt(bkey_start_pos(k.k), start); + bool back_split = bkey_gt(k.k->p, insert->k.p); /* * If we're going to be splitting a compressed extent, note it @@ -1313,7 +1313,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, goto err; } - if (bkey_cmp(k.k->p, insert->k.p) <= 0) { + if (bkey_le(k.k->p, insert->k.p)) { update = bch2_trans_kmalloc(trans, sizeof(*update)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1407,7 +1407,7 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, btree_id, pos, BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_NOPRESERVE, k, ret) { - if (bkey_cmp(k.k->p, pos)) + if (!bkey_eq(k.k->p, pos)) break; if (bch2_snapshot_is_ancestor(trans->c, snapshot, @@ -1463,7 +1463,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa EBUG_ON(!path->should_be_locked); EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - EBUG_ON(bpos_cmp(k->k.p, path->pos)); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); n = (struct btree_insert_entry) { .flags = flags, @@ -1573,7 +1573,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter btree_id_cached(trans->c, path->btree_id)) { if (!iter->key_cache_path || !iter->key_cache_path->should_be_locked || - bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + !bpos_eq(iter->key_cache_path->pos, k->k.p)) { if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, @@ -1682,7 +1682,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - if (bkey_cmp(iter.pos, end) >= 0) + if (bkey_ge(iter.pos, end)) break; bkey_init(&delete.k); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 9d1290ff179a..b4480852e935 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -30,7 +30,7 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, darray_init(&s); - if (!bkey_cmp(old_pos, new_pos)) + if (bkey_eq(old_pos, new_pos)) return 0; if (!snapshot_t(c, old_pos.snapshot)->children[0]) @@ -45,7 +45,7 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, if (ret) break; - if (bkey_cmp(old_pos, k.k->p)) + if (!bkey_eq(old_pos, k.k->p)) break; if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { @@ -244,7 +244,7 @@ err: if (ret) break; next: - while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 16be8d3db2ad..d3e769b1eb3e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -306,7 +306,7 @@ static ssize_t 
bch2_read_btree_formats(struct file *file, char __user *buf, if (ret) return ret; - if (!bpos_cmp(SPOS_MAX, i->from)) + if (bpos_eq(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); @@ -317,7 +317,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, break; bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = bpos_cmp(SPOS_MAX, b->key.k.p) + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; } @@ -368,7 +368,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (ret) break; - if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { + if (bpos_gt(l->b->key.k.p, i->prev_node)) { bch2_btree_node_to_text(&i->buf, i->c, l->b); i->prev_node = l->b->key.k.p; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 288f46b55876..c2126f39369b 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -350,8 +350,8 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_init(&new_src->k); new_src->k.p = src_iter.pos; - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + if (bkey_le(dst_pos, src_iter.pos) && + bkey_lt(src_iter.pos, dst_iter.pos)) { /* * We have a hash collision for the new dst key, * and new_src - the key we're deleting - is between diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2dcca5c7fcec..503a47b39ad1 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,7 +107,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - if (!bkey_cmp(k.k->p, POS_MIN)) { + if (bkey_eq(k.k->p, POS_MIN)) { prt_printf(err, "stripe at POS_MIN"); return -EINVAL; } @@ -724,7 +724,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; bch2_btree_iter_set_pos(&iter, start_pos); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 2fd5d9672a44..21d6f88c7397 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -73,8 +73,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) + if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; /* extent_update_to_keys(), for the reflink_v update */ @@ -132,11 +131,10 @@ int bch2_extent_atomic_end(struct btree_trans *trans, for_each_btree_key_continue_norestart(copy, 0, k, ret) { unsigned offset = 0; - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) + if (bkey_ge(bkey_start_pos(k.k), *end)) break; - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) + if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index bb1b862bfa65..e3bc39bee197 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -227,7 +227,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bp.v->min_key, POS_MIN)) + !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write 
? bpos_nosnap_predecessor(bp.v->min_key) : bpos_nosnap_successor(bp.v->min_key); @@ -1211,10 +1211,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 sub; - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + if (bkey_le(where, bkey_start_pos(k.k))) return 0; - EBUG_ON(bkey_cmp(where, k.k->p) > 0); + EBUG_ON(bkey_gt(where, k.k->p)); sub = where.offset - bkey_start_offset(k.k); @@ -1291,10 +1291,10 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k.k->p) >= 0) + if (bkey_ge(where, k.k->p)) return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); len = where.offset - bkey_start_offset(k.k); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 224df17206cb..21dbdf96bd59 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -636,9 +636,8 @@ enum bch_extent_overlap { static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, const struct bkey *m) { - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; + int cmp1 = bkey_lt(k->p, m->p); + int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); return (cmp1 << 1) + cmp2; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4dd5ebafe742..0bb8b39140ec 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2043,7 +2043,7 @@ retry: for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; if (k.k->p.snapshot != snapshot || @@ -2532,7 +2532,7 @@ retry: goto err; for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + if (bkey_ge(bkey_start_pos(k.k), end)) break; if (bkey_extent_is_data(k.k)) { @@ -2970,13 +2970,13 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, break; if (insert && - bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) break; reassemble: bch2_bkey_buf_reassemble(©, c, k); if (insert && - bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) + bkey_lt(bkey_start_pos(k.k), move_pos)) bch2_cut_front(move_pos, copy.k); copy.k->k.p.offset += shift >> 9; @@ -2986,7 +2986,7 @@ reassemble: if (ret) continue; - if (bkey_cmp(atomic_end, copy.k->k.p)) { + if (!bkey_eq(atomic_end, copy.k->k.p)) { if (insert) { move_pos = atomic_end; move_pos.offset -= shift >> 9; @@ -3064,7 +3064,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { + while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f4f0e0cec85d..7db1486a1143 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -133,7 +133,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, if (ret) goto err; - if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { + if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { ret = -ENOENT; goto err; } @@ -527,7 +527,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, }; int ret = 0; - if (bkey_cmp(s->pos, pos)) + if 
(!bkey_eq(s->pos, pos)) s->ids.nr = 0; pos.snapshot = n.equiv; @@ -825,7 +825,7 @@ static int hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), BTREE_ITER_SLOTS, k, ret) { - if (!bkey_cmp(k.k->p, hash_k.k->p)) + if (bkey_eq(k.k->p, hash_k.k->p)) break; if (fsck_err_on(k.k->type == desc.key_type && @@ -1199,7 +1199,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!iter->path->should_be_locked); #if 0 - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + if (bkey_gt(prev.k->k.p, bkey_start_pos(k.k))) { char buf1[200]; char buf2[200]; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 827a0b04b00f..b4f09d77148d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -543,7 +543,7 @@ int bch2_inode_create(struct btree_trans *trans, again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && - bkey_cmp(k.k->p, POS(0, max)) < 0) { + bkey_lt(k.k->p, POS(0, max))) { if (pos < iter->pos.offset) goto found_slot; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5fe049d64e27..491fad4dfb28 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -237,7 +237,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_cmp(old.k->p, new->k.p) >= 0) { + if (bkey_ge(old.k->p, new->k.p)) { /* * Check if there's already data above where we're * going to be writing to - this means we're definitely @@ -420,7 +420,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); k = bch2_btree_iter_peek(iter); - if (bkey_cmp(iter->pos, end_pos) >= 0) { + if (bkey_ge(iter->pos, end_pos)) { bch2_btree_iter_set_pos(iter, end_pos); break; } @@ -518,7 +518,7 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ec_ob) bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); - if (bkey_cmp(iter.pos, k->k.p) >= 0) + if (bkey_ge(iter.pos, k->k.p)) bch2_keylist_pop_front(&op->insert_keys); else bch2_cut_front(iter.pos, k); @@ -1398,7 +1398,7 @@ void bch2_write(struct closure *cl) EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); - BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bkey_eq(op->pos, POS_MAX)); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 5e85055b0f93..29e51bde8313 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -36,7 +36,7 @@ void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) struct bkey_i *where; for_each_keylist_key(l, where) - if (bkey_cmp(insert->k.p, where->k.p) < 0) + if (bpos_lt(insert->k.p, where->k.p)) break; memmove_u64s_up((u64 *) where + insert->k.u64s, @@ -63,6 +63,6 @@ void bch2_verify_keylist_sorted(struct keylist *l) for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && - bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); + bpos_ge(k->k.p, bkey_next(k)->k.p)); } #endif diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a66fbc1faa7b..9125cea080bd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -340,7 +340,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto err; - if (!k.k || bkey_cmp(k.k->p, pos)) { + if (!k.k || !bkey_eq(k.k->p, pos)) { ret = -ENOENT; goto err; } @@ -446,7 +446,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ret) break; - if 
(bkey_cmp(bkey_start_pos(k.k), end) >= 0) + if (bkey_ge(bkey_start_pos(k.k), end)) break; ctxt->stats->pos = iter.pos; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8d767e787d6b..7c9f4a97bc03 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -132,9 +132,8 @@ search: (k = idx_to_key(keys, *idx), k->btree_id == btree_id && k->level == level && - bpos_cmp(k->k->k.p, end_pos) <= 0)) { - if (bpos_cmp(k->k->k.p, pos) >= 0 && - !k->overwritten) + bpos_le(k->k->k.p, end_pos))) { + if (bpos_ge(k->k->k.p, pos) && !k->overwritten) return k->k; (*idx)++; @@ -295,7 +294,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->d[idx].btree_id == btree && keys->d[idx].level == level && - !bpos_cmp(keys->d[idx].k->k.p, pos)) + bpos_eq(keys->d[idx].k->k.p, pos)) keys->d[idx].overwritten = true; } @@ -354,7 +353,7 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) { - if (!bpos_cmp(iter->pos, SPOS_MAX)) + if (bpos_eq(iter->pos, SPOS_MAX)) iter->at_end = true; else iter->pos = bpos_successor(iter->pos); @@ -368,19 +367,19 @@ again: return bkey_s_c_null; while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_cmp(btree_k.k->p, iter->pos) < 0) + bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_cmp(journal_k.k->p, iter->pos) < 0) + bpos_lt(journal_k.k->p, iter->pos)) bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && - (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) ? journal_k : btree_k; - if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) ret = bkey_s_c_null; if (ret.k) { @@ -528,7 +527,7 @@ static int journal_keys_sort(struct bch_fs *c) while (src + 1 < keys->d + keys->nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && - !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) + bpos_eq(src[0].k->k.p, src[1].k->k.p)) src++; *dst++ = *src++; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 0d4c004d7f9d..aebed671c43a 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -252,14 +252,14 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) int ret; for_each_btree_key_continue_norestart(*iter, 0, k, ret) { - if (bkey_cmp(iter->pos, end) >= 0) + if (bkey_ge(iter->pos, end)) break; if (bkey_extent_is_data(k.k)) return k; } - if (bkey_cmp(iter->pos, end) >= 0) + if (bkey_ge(iter->pos, end)) bch2_btree_iter_set_pos(iter, end); return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } @@ -301,7 +301,7 @@ s64 bch2_remap_range(struct bch_fs *c, while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && - bkey_cmp(dst_iter.pos, dst_end) < 0) { + bkey_lt(dst_iter.pos, dst_end)) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -334,7 +334,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (ret) continue; - if (bkey_cmp(src_want, src_iter.pos) < 0) { + if (bkey_lt(src_want, src_iter.pos)) { ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, min(dst_end.offset, dst_iter.pos.offset + @@ -386,8 +386,8 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(&trans, &dst_iter); bch2_trans_iter_exit(&trans, &src_iter); - BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); - BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); + BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); + BUG_ON(bkey_gt(dst_iter.pos, dst_end)); dst_done = dst_iter.pos.offset - dst_start.offset; new_i_size = min(dst_iter.pos.offset << 9, new_i_size); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0e3b6ae3835a..e37ffaad5883 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -30,8 +30,8 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_snapshot s; u32 i, id; - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || - bkey_cmp(k.k->p, POS(0, 1)) < 0) { + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { prt_printf(err, "bad pos"); return -EINVAL; } @@ -592,7 +592,7 @@ static int snapshot_delete_key(struct btree_trans *trans, struct bch_fs *c = trans->c; u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; - if (bkey_cmp(k.k->p, *last_pos)) + if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; *last_pos = k.k->p; @@ -770,8 +770,8 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || - bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { + if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX)) { prt_printf(err, "invalid pos"); return -EINVAL; } @@ -1028,7 +1028,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + if (bkey_gt(k.k->p, SUBVOL_POS_MAX)) break; /* -- cgit From 52bf51b91f5d19ab0555b901023def61d60f1a97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 10:51:20 -0500 Subject: bcachefs: Fix __btree_trans_peek_key_cache() We were returning a pointer to a variable on the stack - oops. 
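For readers unfamiliar with this bug class, here is a minimal, self-contained userspace sketch — not bcachefs code; the struct and function names are invented for illustration — of why returning a bkey_s_c whose .k points at a wrapper's stack variable is broken, and what the fix in the patch below amounts to: the key is copied into storage owned by the iterator (iter->k) and the returned pointer is repointed at that copy.

#include <stdio.h>

struct key      { unsigned long long offset; };
struct key_ref  { const struct key *k; };      /* like bkey_s_c: a borrowed pointer */
struct iterator { struct key k; };             /* like btree_iter: owns a key copy */

/* Callee fills *u and returns a reference to it - fine while *u is alive. */
static struct key_ref peek_slot(struct key *u)
{
	u->offset = 42;
	return (struct key_ref) { .k = u };
}

/* BUGGY: the returned reference points at 'u', which dies on return. */
static struct key_ref peek_buggy(struct iterator *iter)
{
	struct key u;
	(void) iter;
	return peek_slot(&u);
}

/* FIXED: copy into iterator-owned storage before handing out a pointer. */
static struct key_ref peek_fixed(struct iterator *iter)
{
	struct key u;
	struct key_ref k = peek_slot(&u);

	if (k.k) {
		iter->k = u;       /* copy into storage that outlives this call */
		k.k = &iter->k;    /* and return a pointer to that copy */
	}
	return k;
}

int main(void)
{
	struct iterator iter;
	struct key_ref k = peek_fixed(&iter);

	printf("offset %llu\n", k.k->offset);  /* safe: points into iter */
	(void) peek_buggy;                     /* dereferencing its result would be UB */
	return 0;
}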
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8a18b55cab26..c95dbeaaceab 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1851,6 +1851,7 @@ struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpo struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; + struct bkey_s_c k; int ret; if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) @@ -1870,7 +1871,12 @@ struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpo btree_path_set_should_be_locked(iter->key_cache_path); - return bch2_btree_path_peek_slot(iter->key_cache_path, &u); + k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + if (k.k && !bkey_err(k)) { + iter->k = u; + k.k = &iter->k; + } + return k; } static noinline -- cgit From a52a4da4fd07ad32f7beceaa61672e74efd7f03a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 11:13:19 -0500 Subject: bcachefs: bch2_btree_path_peek_slot_exact() When we start using the key cache for inodes again, it'll be possible for bch2_btree_path_peek_slot() to return a key in a different snapshot with a key cache path. This isn't what we want when triggers are checking what they're overwriting, so introduce a new helper for the commit path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 323f2942b11d..62b0aa55c752 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -24,6 +24,22 @@ #include #include +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + static int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, struct bkey_i *, enum btree_update_flags); @@ -1505,7 +1521,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; if (unlikely(trans->journal_replay_not_finished)) { -- cgit From dcced069421c2e1f8c2d2bff2263a48c319b6166 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 16:02:09 -0500 Subject: bcachefs: Kill __btree_trans_peek_key_cache() There was no reason for this to be a separate helper - we always want the relock call that btree_trans_peek_key_cache() did. 
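A side note on the idiom used in the fold below: bcachefs chains int-returning operations with the GNU C binary ?: extension, so "a ?: b" evaluates to a when a is non-zero (an error) and to b otherwise. A tiny illustrative sketch with invented names — gcc and clang both accept the extension:

#include <stdio.h>

static int step_one(void) { return 0; }    /* success */
static int step_two(void) { return -5; }   /* pretend this fails with -EIO */

int main(void)
{
	/* Equivalent to: ret = step_one(); if (!ret) ret = step_two(); */
	int ret = step_one() ?: step_two();

	printf("ret = %d\n", ret);             /* prints -5 */
	return 0;
}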
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c95dbeaaceab..5883bb42bc09 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1846,7 +1846,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; @@ -1865,7 +1865,9 @@ struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpo iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, iter->flags & BTREE_ITER_INTENT); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + iter->flags|BTREE_ITER_CACHED) ?: + bch2_btree_path_relock(trans, iter->path, _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1879,15 +1881,6 @@ struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpo return k; } -static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) -{ - struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); - int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); - - return err ? bkey_s_c_err(err) : ret; -} - static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; @@ -2394,7 +2387,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ -- cgit From 087e53c255b6fe8ec6b573acbdf12a555aae493b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 11:26:57 -0500 Subject: bcachefs: Bring back BTREE_ITER_CACHED_NOFILL Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++-- fs/bcachefs/btree_key_cache.c | 3 +-- fs/bcachefs/btree_types.h | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5883bb42bc09..b1580f6efb0f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1581,7 +1581,8 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * EBUG_ON(ck && (path->btree_id != ck->key.btree_id || !bkey_eq(path->pos, ck->key.pos))); - EBUG_ON(!ck || !ck->valid); + if (!ck || !ck->valid) + return bkey_s_c_null; *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -1860,7 +1861,8 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED); + iter->flags|BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, iter->flags & BTREE_ITER_INTENT); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0ae5d893a4f7..fc924fd24274 100644 
--- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -487,7 +487,7 @@ retry: path->l[0].lock_seq = ck->c.lock.state.seq; path->l[0].b = (void *) ck; fill: - if (!ck->valid) { + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { /* * Using the underscore version because we haven't set * path->uptodate yet: @@ -508,7 +508,6 @@ fill: set_bit(BKEY_CACHED_ACCESSED, &ck->flags); path->uptodate = BTREE_ITER_UPTODATE; - BUG_ON(!ck->valid); BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ae5a692d1924..cdb887abcfe1 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -207,6 +207,7 @@ struct btree_node_iter { #define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) #define BTREE_ITER_NOPRESERVE (1 << 13) +#define BTREE_ITER_CACHED_NOFILL (1 << 14) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, -- cgit From 1617d56dc9bc3d9fd56824e8e488e88acbba152f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Nov 2022 20:15:33 -0500 Subject: bcachefs: Key cache now works for snapshots btrees This switches btree_key_cache_fill() to use a btree iterator, not a btree path, so that it can search for keys in previous snapshots. We also add another iterator flag, BTREE_ITER_KEY_CACHE_FILL, to avoid recursion back into the key cache. This will allow us to re-enable the key cache for inodes in the next patch. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++++ fs/bcachefs/btree_key_cache.c | 16 ++++++++-------- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/inode.c | 4 ++-- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b1580f6efb0f..ecb6f27e4917 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1855,6 +1855,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; + if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bpos_eq(iter->pos, pos)) + return bkey_s_c_null; + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) return bkey_s_c_null; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index fc924fd24274..c118d1b8241f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -370,20 +370,20 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_path *path; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; - struct bkey u; int ret; - path = bch2_path_get(trans, ck->key.btree_id, ck->key.pos, 0, 0, 0); - ret = bch2_btree_path_traverse(trans, path, 0); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto err; - k = bch2_btree_path_peek_slot(path, &u); - if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); @@ -431,9 +431,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - path->preserve = false; + set_btree_iter_dontneed(&iter); err: 
- bch2_path_put(trans, path, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index cdb887abcfe1..3f6ca40b52f6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -208,6 +208,7 @@ struct btree_node_iter { #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) #define BTREE_ITER_NOPRESERVE (1 << 13) #define BTREE_ITER_CACHED_NOFILL (1 << 14) +#define BTREE_ITER_KEY_CACHE_FILL (1 << 15) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b4f09d77148d..4ca70c6c3a4f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -684,8 +684,8 @@ retry: if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans.c, - "inode %llu not found when deleting", - inum.inum); + "inode %llu:%u not found when deleting", + inum.inum, snapshot); ret = -EIO; goto err; } -- cgit From 6f90e6b28180cb567b0abdb753ccac4c7d840cb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jan 2023 10:15:39 -0500 Subject: bcachefs: Fix a livelock in key cache fill path We weren't setting path->uptodate before calling bch2_btree_key_cache_fill() - which causes __bch2_btree_path_upgrade() to fail. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index c118d1b8241f..4833cb4c7cf5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -487,6 +487,8 @@ retry: path->l[0].lock_seq = ck->c.lock.state.seq; path->l[0].b = (void *) ck; fill: + path->uptodate = BTREE_ITER_UPTODATE; + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { /* * Using the underscore version because we haven't set @@ -502,16 +504,23 @@ fill: ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - path->uptodate = BTREE_ITER_UPTODATE; BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); return ret; err: + path->uptodate = BTREE_ITER_NEED_TRAVERSE; if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); -- cgit From e0de429a3ab5f9485ca781d6d4d7368a2e12d835 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Dec 2022 11:17:18 -0500 Subject: bcachefs: Don't error out when just reading the journal This tweaks the recovery and journal paths so that we don't error out before we need to: the list_journal command should work, even if we wouldn't be able to replay successfully. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 +-- fs/bcachefs/recovery.c | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d1deb0573ffd..cd48ba11e771 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1187,8 +1187,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (!last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); - ret = -1; - goto err; + return 0; } bch_info(c, "journal read done, replaying entries %llu-%llu", diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7c9f4a97bc03..aff813e3e360 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1113,6 +1113,13 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i && !(*i)->ignore) { last_journal_entry = &(*i)->j; @@ -1184,13 +1191,6 @@ use_clean: } } - /* - * note: cmd_list_journal needs the blacklist table fully up to date so - * it can asterisk ignored journal entries: - */ - if (c->opts.read_journal_only) - goto out; - ret = bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; -- cgit From b9004e8576b1c2803ee7d7b3f28fbcc779f05ffb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Dec 2022 11:45:58 -0500 Subject: bcachefs: Fix a "no journal entries found" bug On startup, we need to ensure the first journal entry written is a flush write: after a clean shutdown we generally don't read the journal, which means we might be overwriting whatever was there previously, and there must always be at least one flush entry in the journal or recovery will fail. Found by fstests generic/388. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 34 ++++++++++++++++++++++++++++------ fs/bcachefs/journal_types.h | 3 ++- fs/bcachefs/super.c | 8 ++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cd48ba11e771..a5c9524aa6e7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1661,20 +1661,42 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (bch2_journal_error(j) || - w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. 
+ */ + if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && + (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; w->last_seq = 0; j->nr_noflush_writes++; - } else { + } else if (!bch2_journal_error(j)) { j->last_flush_write = jiffies; j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } else { + spin_unlock(&j->lock); + goto err; } spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a41b915b3ac6..4c3065dceeea 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -141,10 +141,11 @@ enum journal_space_from { journal_space_nr, }; -enum { +enum journal_flags { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, }; #define JOURNAL_WATERMARKS() \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8ee0783a1e78..234dab15fa63 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -379,6 +379,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + /* + * First journal write must be a flush write: after a clean shutdown we + * don't read the journal, so the first journal write may end up + * overwriting whatever was there previously, and there must always be + * at least one non-flush write in the journal or recovery will fail: + */ + set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); -- cgit From 230fa1c735496a2f89eb8bcc1471a46f0d917975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Dec 2022 21:59:25 -0500 Subject: bcachefs: Simplify journal read path This just cleans up and simplifies the code that decides where to resume writing in the journal - when the code was originally written we weren't saving the precise location of every journal write found. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a5c9524aa6e7..854a0685db09 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -986,7 +986,6 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_replay *r, **_r; struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; - u64 min_seq = U64_MAX; unsigned i; int ret = 0; @@ -1005,45 +1004,27 @@ static void bch2_journal_read_device(struct closure *cl) goto err; } - /* Find the journal bucket with the highest sequence number: */ - for (i = 0; i < ja->nr; i++) { - if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) - ja->cur_idx = i; - - min_seq = min(ja->bucket_seq[i], min_seq); - } - - /* - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] == - ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; mutex_lock(&jlist->lock); - genradix_for_each(&c->journal_entries, iter, _r) { + genradix_for_each_reverse(&c->journal_entries, iter, _r) { r = *_r; if (!r) continue; for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx && - sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + if (r->ptrs[i].dev == ca->dev_idx) { unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + vstruct_sectors(&r->j, c->block_bits); - ja->sectors_free = min(ja->sectors_free, - ca->mi.bucket_size - wrote); + ja->cur_idx = r->ptrs[i].bucket; + ja->sectors_free = ca->mi.bucket_size - wrote; + goto found; } } } +found: mutex_unlock(&jlist->lock); if (ja->bucket_seq[ja->cur_idx] && -- cgit From 5c792e1b64cb5b87129f7226d610fd63465039bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Dec 2022 12:45:37 -0500 Subject: bcachefs: Fix a btree iter assertion pop This fixes a (harmless) broken invariant in __bch2_btree_path_set_pos(): iterators to interior nodes should point to the first non whiteout. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ecb6f27e4917..ff796a74fd2f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1202,7 +1202,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, int cmp) { - unsigned l = path->level; + unsigned level = path->level; EBUG_ON(trans->restarted); EBUG_ON(!path->ref); @@ -1219,10 +1219,12 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - l = btree_path_up_until_good_node(trans, path, cmp); + level = btree_path_up_until_good_node(trans, path, cmp); - if (btree_path_node(path, l)) { - BUG_ON(!btree_node_locked(path, l)); + if (btree_path_node(path, level)) { + struct btree_path_level *l = &path->l[level]; + + BUG_ON(!btree_node_locked(path, level)); /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1230,11 +1232,18 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, * is expensive). */ if (cmp < 0 || - !btree_path_advance_to_pos(path, &path->l[l], 8)) - __btree_path_level_init(path, l); + !btree_path_advance_to_pos(path, l, 8)) + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (unlikely(level)) + bch2_btree_node_iter_peek(&l->iter, l->b); } - if (l != path->level) { + if (unlikely(level != path->level)) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } -- cgit From ac9fa4bdc79bb54e639f5dc262f9f5976a5ccd21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Dec 2022 11:39:34 -0500 Subject: bcachefs: Kill btree_insert_ret enum Replace with standard bcachefs-private error codes. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 9 --------- fs/bcachefs/btree_update_leaf.c | 33 +++++++++++++++------------------ fs/bcachefs/errcode.h | 5 +++++ 3 files changed, 20 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3f6ca40b52f6..6a852f7fbf70 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -692,15 +692,6 @@ struct btree_root { s8 error; }; -enum btree_insert_ret { - BTREE_INSERT_OK, - /* leaf node needs to be split */ - BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_NEED_MARK_REPLICAS, - BTREE_INSERT_NEED_JOURNAL_RES, - BTREE_INSERT_NEED_JOURNAL_RECLAIM, -}; - enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 62b0aa55c752..c7d8d2a55551 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -318,7 +318,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, flags| (trans->flags & JOURNAL_WATERMARK_MASK)); - return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; + return ret == -EAGAIN ? -BCH_ERR_btree_insert_need_journal_res : ret; } #define JSET_ENTRY_LOG_U64s 4 @@ -337,23 +337,20 @@ static noinline void journal_transaction_name(struct btree_trans *trans) strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); } -static inline enum btree_insert_ret -btree_key_can_insert(struct btree_trans *trans, - struct btree *b, - unsigned u64s) +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) { struct bch_fs *c = trans->c; if (!bch2_btree_node_insert_fits(c, b, u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; + return -BCH_ERR_btree_insert_btree_node_full; - return BTREE_INSERT_OK; + return 0; } -static enum btree_insert_ret -btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_path *path, - unsigned u64s) +static int btree_key_can_insert_cached(struct btree_trans *trans, + struct btree_path *path, + unsigned u64s) { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; @@ -365,7 +362,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) - return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + return -BCH_ERR_btree_insert_need_journal_reclaim; /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -374,7 +371,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, u64s += 1; if (u64s <= ck->u64s) - return BTREE_INSERT_OK; + return 0; new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); @@ -684,7 +681,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (trans->fs_usage_deltas && bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return BTREE_INSERT_NEED_MARK_REPLICAS; + return -BCH_ERR_btree_insert_need_mark_replicas; trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { @@ -916,12 +913,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, struct bch_fs *c = trans->c; switch (ret) { - case BTREE_INSERT_BTREE_NODE_FULL: + case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) trace_and_count(c, 
trans_restart_btree_node_split, trans, trace_ip, i->path); break; - case BTREE_INSERT_NEED_MARK_REPLICAS: + case -BCH_ERR_btree_insert_need_mark_replicas: bch2_trans_unlock(trans); ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); @@ -932,7 +929,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); break; - case BTREE_INSERT_NEED_JOURNAL_RES: + case -BCH_ERR_btree_insert_need_journal_res: bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && @@ -949,7 +946,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); break; - case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index fb1e1cd0f864..3ec5808dcbd9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -53,6 +53,11 @@ x(BCH_ERR_no_btree_node, no_btree_node_down) \ x(BCH_ERR_no_btree_node, no_btree_node_init) \ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(0, btree_insert_fail) \ + x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(0, fsck) \ -- cgit From 4f948723eda1ca44e470c31fcab4a453ed53aa13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Dec 2022 14:09:14 -0500 Subject: bcachefs: Fix bch2_journal_keys_peek_upto() bch2_journal_keys_peek_upto() was comparing against btree_id & level incorrectly - fix this by using __journal_key_cmp(). Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index aff813e3e360..15a676196e2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -128,12 +128,12 @@ search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - while (*idx < keys->nr && - (k = idx_to_key(keys, *idx), - k->btree_id == btree_id && - k->level == level && - bpos_le(k->k->k.p, end_pos))) { - if (bpos_ge(k->k->k.p, pos) && !k->overwritten) + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) return k->k; (*idx)++; -- cgit From ef0732861a3af5bd1c5b08b6f64ca5b2cbee04bc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Dec 2022 16:22:36 -0500 Subject: bcachefs: Add a missing bch2_btree_path_traverse() call bch2_btree_iter_peek_upto() in snapshots mode may need to keep a btree_path for the insert position, not just the position of the key we're returning. The code was incorrectly assuming this would be in the same btree node - we were missing a bch2_btree_path_traverse() call. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ff796a74fd2f..04051e45f4e0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2063,6 +2063,11 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, iter->flags & BTREE_ITER_INTENT); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } } /* -- cgit From 5f659376fc1b9ad23b00a35242179b8961e0bc2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Oct 2022 16:11:31 -0400 Subject: bcachefs: Suppress -EROFS messages when shutting down This isn't actually an error condition, this just indicates a normal shutdown - no reason for these to be in the log. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 ++++---- fs/bcachefs/io.c | 6 ++++-- fs/bcachefs/movinggc.c | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f5b46f382340..2defa811f48f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1976,10 +1976,10 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, gc_btree_gens_key(&trans, &iter, k)); - if (ret) { + if (ret && ret != -EROFS) bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + if (ret) goto err; - } } ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, @@ -1989,10 +1989,10 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); - if (ret) { + if (ret && ret != -EROFS) bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + if (ret) goto err; - } c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 491fad4dfb28..27265ba35fac 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -675,15 +675,17 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret) { + if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *k = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, k->k.p.inode, k->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); - goto err; } + + if (ret) + goto err; } out: /* If some a bucket wasn't written, we can't erasure code it: */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 09f4303de1f6..66f18f711d53 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -319,7 +319,7 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), false, copygc_pred, NULL); - if (ret < 0) + if (ret < 0 && ret != -EROFS) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); if (ret) return ret; -- cgit From 78c0b75c34209c471616566b3978eac4c1c53e99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Nov 2022 22:39:08 -0500 Subject: bcachefs: More errcode cleanup We shouldn't be overloading standard error codes now that we have provisions for bcachefs-specific errorcodes: this patch converts super.c and super-io.c to per error site errcodes, with a bit of cleanup. 
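For context on the errcode mechanism these conversions lean on: every x(class, name) entry in errcode.h names a parent for the new code (another BCH_ERR_ code, a standard errno such as EINVAL, or 0), which is what lets callers test either a specific error site or a whole family with bch2_err_matches(), format it with bch2_err_str(), and fold it back to a standard errno with bch2_err_class(). Below is a minimal, self-contained sketch of that x-macro pattern; the names, values and helper are simplified stand-ins for illustration, not the kernel's actual definitions, and real bcachefs codes are returned negated (-BCH_ERR_...).

#include <stdio.h>
#include <stdbool.h>

#define ERR_START	2048

/* one table defines both the codes and their parent classes */
#define ERRCODES()					\
	x(0,			invalid)		\
	x(ERR_invalid,		invalid_sb)		\
	x(ERR_invalid_sb,	invalid_sb_csum)	\
	x(ERR_invalid,		invalid_bkey)

enum err_code {
	ERR_CODE_START = ERR_START,
#define x(class, name)	ERR_##name,
	ERRCODES()
#undef x
};

/* parent class of each code, generated from the same table */
static const int err_class[] = {
#define x(class, name)	[ERR_##name - ERR_START] = class,
	ERRCODES()
#undef x
};

/* walk up the hierarchy: does err equal class, or descend from it? */
static bool err_matches(int err, int class)
{
	while (err >= ERR_START) {
		if (err == class)
			return true;
		err = err_class[err - ERR_START];
	}
	return err == class;
}

int main(void)
{
	printf("%d\n", err_matches(ERR_invalid_sb_csum, ERR_invalid));	/* 1 */
	printf("%d\n", err_matches(ERR_invalid_bkey, ERR_invalid_sb));	/* 0 */
	return 0;
}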
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 20 +++++----- fs/bcachefs/bkey_methods.c | 28 +++++++------- fs/bcachefs/dirent.c | 16 ++++---- fs/bcachefs/disk_groups.c | 13 +++---- fs/bcachefs/ec.c | 8 ++-- fs/bcachefs/errcode.h | 40 ++++++++++++++++++- fs/bcachefs/extents.c | 36 +++++++++--------- fs/bcachefs/fs.c | 7 +++- fs/bcachefs/inode.c | 26 ++++++------- fs/bcachefs/journal_sb.c | 4 +- fs/bcachefs/journal_seq_blacklist.c | 4 +- fs/bcachefs/lru.c | 2 +- fs/bcachefs/quota.c | 6 +-- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/replicas.c | 10 ++--- fs/bcachefs/subvolume.c | 16 ++++---- fs/bcachefs/super-io.c | 68 ++++++++++++++++----------------- fs/bcachefs/super.c | 76 ++++++++++++++++--------------------- fs/bcachefs/xattr.c | 10 ++--- 19 files changed, 211 insertions(+), 183 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a0b9fa30260a..cef5de13a6e4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -302,7 +302,7 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -315,7 +315,7 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bch2_alloc_unpack_v2(&u, k)) { prt_printf(err, "unpack error"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -328,7 +328,7 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bch2_alloc_unpack_v3(&u, k)) { prt_printf(err, "unpack error"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -342,14 +342,14 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (rw == WRITE) { if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { prt_printf(err, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } switch (a.v->data_type) { @@ -360,7 +360,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, a.v->cached_sectors || a.v->stripe) { prt_printf(err, "empty data type free but have data"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_sb: @@ -371,7 +371,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (!a.v->dirty_sectors) { prt_printf(err, "data_type %s but dirty_sectors==0", bch2_data_types[a.v->data_type]); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_cached: @@ -379,20 +379,20 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || a.v->stripe) { prt_printf(err, "data type inconsistency"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (!a.v->io_time[READ] && test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { prt_printf(err, "cached bucket with read_time == 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_stripe: if (!a.v->stripe) { prt_printf(err, "data_type %s but stripe==0", bch2_data_types[a.v->data_type]); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; } diff --git a/fs/bcachefs/bkey_methods.c 
b/fs/bcachefs/bkey_methods.c index 7fcd6ca40b93..29809da5e9cf 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -42,7 +42,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -58,7 +58,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -99,7 +99,7 @@ static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -127,7 +127,7 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, { if (k.k->type >= KEY_TYPE_MAX) { prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); @@ -203,30 +203,30 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, { if (k.k->u64s < BKEY_U64s) { prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { prt_printf(err, "invalid key type for btree %s (%s)", bch2_btree_ids[type], bch2_bkey_types[k.k->type]); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { if (k.k->size == 0) { prt_printf(err, "size == 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->size > k.k->p.offset) { prt_printf(err, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } else { if (k.k->size) { prt_printf(err, "size != 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } @@ -234,20 +234,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, !btree_type_has_snapshots(type) && k.k->p.snapshot) { prt_printf(err, "nonzero snapshot"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && !k.k->p.snapshot) { prt_printf(err, "snapshot == 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (type != BKEY_TYPE_btree && bkey_eq(k.k->p, POS_MAX)) { prt_printf(err, "key at POS_MAX"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -266,12 +266,12 @@ int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, { if (bpos_lt(k.k->p, b->data->min_key)) { prt_printf(err, "key before start of btree node"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bpos_gt(k.k->p, b->data->max_key)) { prt_printf(err, "key past end of btree node"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index c2126f39369b..f1838b7c45ee 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -92,46 +92,46 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*d.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } 
len = bch2_dirent_name_bytes(d); if (!len) { prt_printf(err, "empty name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(len)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len > BCH_NAME_MAX) { prt_printf(err, "dirent name too big (%u > %u)", len, BCH_NAME_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (memchr(d.v->d_name, '/', len)) { prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (d.v->d_type != DT_SUBVOL && le64_to_cpu(d.v->d_inum) == d.k->p.inode) { prt_printf(err, "dirent points to own directory"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 6b81f35861ac..fcd5dbff248d 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -27,7 +27,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned nr_groups = disk_groups_nr(groups); unsigned i, len; - int ret = -EINVAL; + int ret = 0; for (i = 0; i < sb->nr_devices; i++) { struct bch_member *m = mi->members + i; @@ -41,12 +41,12 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, if (g >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", i, g, nr_groups); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } if (BCH_GROUP_DELETED(&groups->entries[g])) { prt_printf(err, "disk %u has deleted label %u", i, g); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } } @@ -62,7 +62,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, len = strnlen(g->label, sizeof(g->label)); if (!len) { prt_printf(err, "label %u empty", i); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } } @@ -79,13 +79,12 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, prt_printf(err, "duplicate label %llu.%.*s", BCH_GROUP_PARENT(g), (int) sizeof(g->label), g->label); + ret = -BCH_ERR_invalid_sb_disk_groups; goto err; } - - ret = 0; err: kfree(sorted); - return 0; + return ret; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 503a47b39ad1..c855ea025f0e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -109,24 +109,24 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_eq(k.k->p, POS_MIN)) { prt_printf(err, "stripe at POS_MIN"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->p.inode) { prt_printf(err, "nonzero inode field"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_bytes(k.k) < sizeof(*s)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*s)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return bch2_bkey_ptrs_invalid(c, k, rw, err); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3ec5808dcbd9..dc388864be6f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ 
-67,7 +67,45 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, need_snapshot_cleanup) \ - x(0, need_topology_repair) + x(0, need_topology_repair) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ + x(EINVAL, block_size_too_small) \ + x(EINVAL, bucket_size_too_small) \ + x(EINVAL, device_size_too_small) \ + x(EINVAL, device_not_a_member_of_filesystem) \ + x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_already_online) \ + x(EINVAL, insufficient_devices_to_start) \ + x(EINVAL, invalid) \ + x(BCH_ERR_invalid, invalid_sb) \ + x(BCH_ERR_invalid_sb, invalid_sb_magic) \ + x(BCH_ERR_invalid_sb, invalid_sb_version) \ + x(BCH_ERR_invalid_sb, invalid_sb_features) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum) \ + x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ + x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ + x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_layout) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ + x(BCH_ERR_invalid_sb, invalid_sb_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ + x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ + x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ + x(BCH_ERR_invalid_sb, invalid_sb_clean) \ + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid, invalid_bkey) \ enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e3bc39bee197..422adca7230b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -163,7 +163,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return bch2_bkey_ptrs_invalid(c, k, rw, err); @@ -183,20 +183,20 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { prt_printf(err, "value too small (%zu <= %zu)", bkey_val_bytes(k.k), sizeof(*bp.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { prt_printf(err, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (c->sb.version < bcachefs_metadata_version_snapshot && bp.v->min_key.snapshot) { prt_printf(err, "invalid min_key.snapshot (%u != 0)", bp.v->min_key.snapshot); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return bch2_bkey_ptrs_invalid(c, k, rw, err); @@ -387,13 +387,13 @@ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(*r.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { prt_printf(err, 
"invalid nr_replicas (%u)", r.v->nr_replicas); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -1054,14 +1054,14 @@ static int extent_ptr_invalid(const struct bch_fs *c, if (!bch2_dev_exists2(c, ptr->dev)) { prt_printf(err, "pointer to invalid device (%u)", ptr->dev); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) { prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); @@ -1069,19 +1069,19 @@ static int extent_ptr_invalid(const struct bch_fs *c, if (bucket >= ca->mi.nbuckets) { prt_printf(err, "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { prt_printf(err, "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bucket_offset + size_ondisk > ca->mi.bucket_size) { prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -1105,13 +1105,13 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { prt_printf(err, "invalid extent entry type (got %u, max %u)", __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_is_btree_ptr(k.k) && !extent_entry_is_ptr(entry)) { prt_printf(err, "has non ptr field"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } switch (extent_entry_type(entry)) { @@ -1130,19 +1130,19 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (crc.offset + crc.live_size > crc.uncompressed_size) { prt_printf(err, "checksum offset + key size > uncompressed size"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } size_ondisk = crc.compressed_size; if (!bch2_checksum_type_valid(c, crc.csum_type)) { prt_printf(err, "invalid checksum type"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { prt_printf(err, "invalid compression type"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bch2_csum_type_is_encryption(crc.csum_type)) { @@ -1150,7 +1150,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) { prt_printf(err, "incorrect nonce"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } break; @@ -1161,7 +1161,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { prt_str(err, "too many ptrs"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 90297cfc7934..cc41472a335e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1767,8 +1767,11 @@ got_sb: kfree(devs[0]); kfree(devs); - if (IS_ERR(sb)) - return ERR_CAST(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + ret = bch2_err_class(ret); + return ERR_PTR(ret); + } c = sb->s_fs_info; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4ca70c6c3a4f..cf453edcb5ab 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -306,40 +306,40 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct 
printbuf *err) if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->p.offset < BLOCKDEV_INODE_MAX) { prt_printf(err, "fs inode in blockdev range"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bch2_inode_unpack(k, &unpacked)) { prt_printf(err, "invalid variable length fields"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { prt_printf(err, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { prt_printf(err, "invalid data checksum type (%u >= %u)", unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && unpacked.bi_nlink != 0) { prt_printf(err, "flagged as unlinked but bi_nlink != 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { prt_printf(err, "subvolume root but not a directory"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -353,13 +353,13 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*inode.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return __bch2_inode_invalid(k, err); @@ -373,13 +373,13 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*inode.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return __bch2_inode_invalid(k, err); @@ -421,13 +421,13 @@ int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, { if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index c19db0425dd7..9b933330a4c3 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -22,7 +22,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; + int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; u64 *b; @@ -105,7 +105,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; + int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; struct u64_range *b; diff --git a/fs/bcachefs/journal_seq_blacklist.c 
b/fs/bcachefs/journal_seq_blacklist.c index 5c555b3703c0..012c870acce0 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -203,7 +203,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, le64_to_cpu(e->end)) { prt_printf(err, "entry %u start >= end (%llu >= %llu)", i, le64_to_cpu(e->start), le64_to_cpu(e->end)); - return -EINVAL; + return -BCH_ERR_invalid_sb_journal_seq_blacklist; } if (i + 1 < nr && @@ -211,7 +211,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, le64_to_cpu(e[1].start)) { prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); - return -EINVAL; + return -BCH_ERR_invalid_sb_journal_seq_blacklist; } } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 53e607d72274..db1674ef1d22 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -16,7 +16,7 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(*lru)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*lru)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7f74c026e9da..ededc826e9a0 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -26,7 +26,7 @@ static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, if (vstruct_bytes(&q->field) < sizeof(*q)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); - return -EINVAL; + return -BCH_ERR_invalid_sb_quota; } return 0; @@ -64,13 +64,13 @@ int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, if (k.k->p.inode >= QTYP_NR) { prt_printf(err, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_quota)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index aebed671c43a..8c426d6440c9 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -85,7 +85,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(r.k) < sizeof(*r.v)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(r.k), sizeof(*r.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return bch2_bkey_ptrs_invalid(c, k, rw, err); @@ -136,7 +136,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index e540c1aa91ba..482bedf4be8b 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -841,27 +841,27 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, if (e->data_type >= BCH_DATA_NR) { prt_printf(err, "invalid data type in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (!e->nr_devs) { prt_printf(err, "no devices in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (e->nr_required > 1 && e->nr_required >= 
e->nr_devs) { prt_printf(err, "bad nr_required in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } for (j = 0; j < e->nr_devs; j++) if (!bch2_dev_exists(sb, mi, e->devs[j])) { prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (i + 1 < cpu_r->nr) { @@ -873,7 +873,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, if (!memcmp(e, n, cpu_r->entry_size)) { prt_printf(err, "duplicate replicas entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } } } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e37ffaad5883..f19f6f8d3233 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -33,13 +33,13 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_gt(k.k->p, POS(0, U32_MAX)) || bkey_lt(k.k->p, POS(0, 1))) { prt_printf(err, "bad pos"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } s = bkey_s_c_to_snapshot(k); @@ -48,18 +48,18 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (id && id <= k.k->p.offset) { prt_printf(err, "bad parent node (%u <= %llu)", id, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { prt_printf(err, "children not normalized"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (s.v->children[0] && s.v->children[0] == s.v->children[1]) { prt_printf(err, "duplicate child nodes"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } for (i = 0; i < 2; i++) { @@ -68,7 +68,7 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (id >= k.k->p.offset) { prt_printf(err, "bad child node (%u >= %llu)", id, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } @@ -773,13 +773,13 @@ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, SUBVOL_POS_MAX)) { prt_printf(err, "invalid pos"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e27b301432b1..8dfe92d7eb77 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -216,23 +216,23 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { prt_printf(out, "Not a bcachefs superblock layout"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout; } if (layout->layout_type != 0) { prt_printf(out, "Invalid superblock layout type %u", layout->layout_type); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_type; } if (!layout->nr_superblocks) { prt_printf(out, "Invalid superblock layout: no superblocks"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_nr_superblocks; } if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { prt_printf(out, "Invalid superblock layout: too many 
superblocks"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_nr_superblocks; } max_sectors = 1 << layout->sb_max_size_bits; @@ -246,7 +246,7 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out prt_printf(out, "Invalid superblock layout: superblocks overlap\n" " (sb %u ends at %llu next starts at %llu", i - 1, prev_offset + max_sectors, offset); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_superblocks_overlap; } prev_offset = offset; } @@ -273,25 +273,25 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (version >= bcachefs_metadata_version_max) { prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; + return -BCH_ERR_invalid_sb_version; } if (version_min < bcachefs_metadata_version_min) { prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; + return -BCH_ERR_invalid_sb_version; } if (version_min > version) { prt_printf(out, "Bad minimum version %u, greater than version field %u", version_min, version); - return -EINVAL; + return -BCH_ERR_invalid_sb_version; } if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { prt_printf(out, "Filesystem has incompatible features"); - return -EINVAL; + return -BCH_ERR_invalid_sb_features; } block_size = le16_to_cpu(sb->block_size); @@ -299,37 +299,37 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (block_size > PAGE_SECTORS) { prt_printf(out, "Block size too big (got %u, max %u)", block_size, PAGE_SECTORS); - return -EINVAL; + return -BCH_ERR_invalid_sb_block_size; } if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); - return -EINVAL; + return -BCH_ERR_invalid_sb_uuid; } if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { prt_printf(out, "Bad intenal UUID (got zeroes)"); - return -EINVAL; + return -BCH_ERR_invalid_sb_uuid; } if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", sb->nr_devices, BCH_SB_MEMBERS_MAX); - return -EINVAL; + return -BCH_ERR_invalid_sb_too_many_members; } if (sb->dev_idx >= sb->nr_devices) { prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", sb->dev_idx, sb->nr_devices); - return -EINVAL; + return -BCH_ERR_invalid_sb_dev_idx; } if (!sb->time_precision || le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", le32_to_cpu(sb->time_precision), NSEC_PER_SEC); - return -EINVAL; + return -BCH_ERR_invalid_sb_time_precision; } if (rw == READ) { @@ -366,15 +366,15 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, vstruct_for_each(sb, f) { if (!f->u64s) { - prt_printf(out, "Invalid superblock: optional with size 0 (type %u)", + prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", le32_to_cpu(f->type)); - return -EINVAL; + return -BCH_ERR_invalid_sb_field_size; } if (vstruct_next(f) > vstruct_last(sb)) { prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", le32_to_cpu(f->type)); - return -EINVAL; + return -BCH_ERR_invalid_sb_field_size; } } @@ -382,7 +382,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, mi = bch2_sb_get_members(sb); if (!mi) { prt_printf(out, 
"Invalid superblock: member info area missing"); - return -EINVAL; + return -BCH_ERR_invalid_sb_members_missing; } ret = bch2_sb_field_validate(sb, &mi->field, out); @@ -544,7 +544,7 @@ reread: if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { prt_printf(err, "Not a bcachefs superblock"); - return -EINVAL; + return -BCH_ERR_invalid_sb_magic; } version = le16_to_cpu(sb->sb->version); @@ -555,13 +555,13 @@ reread: if (version >= bcachefs_metadata_version_max) { prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; + return -BCH_ERR_invalid_sb_version; } if (version_min < bcachefs_metadata_version_min) { prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; + return -BCH_ERR_invalid_sb_version; } bytes = vstruct_bytes(sb->sb); @@ -569,7 +569,7 @@ reread: if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", bytes, 512UL << sb->sb->layout.sb_max_size_bits); - return -EINVAL; + return -BCH_ERR_invalid_sb_too_big; } if (bytes > sb->buffer_size) { @@ -580,7 +580,7 @@ reread: if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -EINVAL; + return -BCH_ERR_invalid_sb_csum_type; } /* XXX: verify MACs */ @@ -589,7 +589,7 @@ reread: if (bch2_crc_cmp(csum, sb->sb->csum)) { prt_printf(err, "bad checksum"); - return -EINVAL; + return -BCH_ERR_invalid_sb_csum; } sb->seq = le64_to_cpu(sb->sb->seq); @@ -703,7 +703,7 @@ got_super: prt_printf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); - ret = -EINVAL; + ret = -BCH_ERR_block_size_too_small; goto err; } @@ -958,7 +958,7 @@ static int bch2_sb_members_validate(struct bch_sb *sb, if ((void *) (mi->members + sb->nr_devices) > vstruct_end(&mi->field)) { prt_printf(err, "too many devices for section size"); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } for (i = 0; i < sb->nr_devices; i++) { @@ -970,28 +970,28 @@ static int bch2_sb_members_validate(struct bch_sb *sb, if (le64_to_cpu(m->nbuckets) > LONG_MAX) { prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le64_to_cpu(m->nbuckets) - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le16_to_cpu(m->bucket_size) < le16_to_cpu(sb->block_size)) { prt_printf(err, "device %u: bucket size %u smaller than block size %u", i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le16_to_cpu(m->bucket_size) < BCH_SB_BTREE_NODE_SIZE(sb)) { prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } } @@ -1123,12 +1123,12 @@ static int bch2_sb_crypt_validate(struct bch_sb *sb, if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), 
sizeof(*crypt)); - return -EINVAL; + return -BCH_ERR_invalid_sb_crypt; } if (BCH_CRYPT_KDF_TYPE(crypt)) { prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -EINVAL; + return -BCH_ERR_invalid_sb_crypt; } return 0; @@ -1365,7 +1365,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, if (vstruct_bytes(&clean->field) < sizeof(*clean)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&clean->field), sizeof(*clean)); - return -EINVAL; + return -BCH_ERR_invalid_sb_clean; } return 0; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 234dab15fa63..37dce3e3cccb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -880,7 +880,7 @@ int bch2_fs_start(struct bch_fs *c) struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); unsigned i; - int ret = -EINVAL; + int ret; down_write(&c->state_lock); @@ -917,9 +917,9 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = -EINVAL; if (bch2_fs_init_fault("fs_start")) { bch_err(c, "fs_start fault injected"); + ret = -EINVAL; goto err; } @@ -942,46 +942,43 @@ out: return ret; err: bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); - - if (ret < -BCH_ERR_START) - ret = -EINVAL; goto out; } -static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_sb_field_members *sb_mi; sb_mi = bch2_sb_get_members(sb); if (!sb_mi) - return "Invalid superblock: member info area missing"; + return -BCH_ERR_member_info_missing; if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return "mismatched block size"; + return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return "new cache bucket size is too small"; + return -BCH_ERR_bucket_size_too_small; - return NULL; + return 0; } -static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) { struct bch_sb *newest = le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; struct bch_sb_field_members *mi = bch2_sb_get_members(newest); if (!uuid_equal(&fs->uuid, &sb->uuid)) - return "device not a member of filesystem"; + return -BCH_ERR_device_not_a_member_of_filesystem; if (!bch2_dev_exists(newest, mi, sb->dev_idx)) - return "device has been removed"; + return -BCH_ERR_device_has_been_removed; if (fs->block_size != sb->block_size) - return "mismatched block size"; + return -BCH_ERR_mismatched_block_size; - return NULL; + return 0; } /* Device startup/shutdown: */ @@ -1179,23 +1176,17 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -EINVAL; + return -BCH_ERR_device_already_online; } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); - return -EINVAL; + return -BCH_ERR_device_size_too_small; } BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); - if (get_capacity(sb->bdev->bd_disk) < - ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, "device too small"); - return -EINVAL; - } - ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; @@ -1370,7 +1361,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -EINVAL; + return -BCH_ERR_device_state_not_allowed; if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); @@ -1433,7 +1424,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_sb_field_members *mi; unsigned dev_idx = ca->dev_idx, data; - int ret = -EINVAL; + int ret; down_write(&c->state_lock); @@ -1445,6 +1436,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); + ret = -BCH_ERR_device_state_not_allowed; goto err; } @@ -1530,7 +1522,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - const char *err; struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; @@ -1555,10 +1546,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } - err = bch2_dev_may_add(sb.sb, c); - if (err) { - bch_err(c, "device add error: %s", err); - ret = -EINVAL; + ret = bch2_dev_may_add(sb.sb, c); + if (ret) { + bch_err(c, "device add error: %s", bch2_err_str(ret)); goto err; } @@ -1692,7 +1682,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; - const char *err; int ret; down_write(&c->state_lock); @@ -1705,9 +1694,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (err) { - bch_err(c, "error bringing %s online: %s", path, err); + ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + if (ret) { + bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); goto err; } @@ -1741,7 +1730,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) err: up_write(&c->state_lock); bch2_free_super(&sb); - return -EINVAL; + return ret; } int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1757,7 +1746,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); 
up_write(&c->state_lock); - return -EINVAL; + return -BCH_ERR_device_state_not_allowed; } __bch2_dev_offline(c, ca); @@ -1783,7 +1772,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -EINVAL; + ret = -BCH_ERR_device_size_too_small; goto err; } @@ -1836,7 +1825,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_fs *c = NULL; struct bch_sb_field_members *mi; unsigned i, best_sb = 0; - const char *err; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1880,8 +1868,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, continue; } - err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); - if (err) + ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (ret) goto err_print; i++; } @@ -1902,9 +1890,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } up_write(&c->state_lock); - err = "insufficient devices"; - if (!bch2_fs_may_start(c)) + if (!bch2_fs_may_start(c)) { + ret = -BCH_ERR_insufficient_devices_to_start; goto err_print; + } if (!c->opts.nostart) { ret = bch2_fs_start(c); @@ -1919,8 +1908,7 @@ out: return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], err); - ret = -EINVAL; + devices[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index bd118f6ea08b..448737be045c 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -78,7 +78,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { prt_printf(err, "incorrect value size (%zu < %zu)", bkey_val_bytes(k.k), sizeof(*xattr.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) < @@ -88,7 +88,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } /* XXX why +4 ? */ @@ -99,18 +99,18 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (!handler) { prt_printf(err, "invalid type (%u)", xattr.v->x_type); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { prt_printf(err, "xattr name has invalid characters"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; -- cgit From 321bdc73f3aaba5acb9ed7082cf222444541eb74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 15:01:36 -0500 Subject: bcachefs: bkey_min(), bkey_max() Parallel to bpos_min(), bpos_max() - trivial refactoring. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 18 ++++++++++++++---- fs/bcachefs/btree_iter.c | 4 +--- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index dc2b91bc67f3..28a70ad5a25d 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -182,6 +182,16 @@ static __always_inline int bpos_cmp(struct bpos l, struct bpos r) cmp_int(l.snapshot, r.snapshot); } +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bpos_lt(l, r) ? 
l : r; +} + +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bpos_gt(l, r) ? l : r; +} + static __always_inline bool bkey_eq(struct bpos l, struct bpos r) { return !((l.inode ^ r.inode) | @@ -218,14 +228,14 @@ static __always_inline int bkey_cmp(struct bpos l, struct bpos r) cmp_int(l.offset, r.offset); } -static inline struct bpos bpos_min(struct bpos l, struct bpos r) +static inline struct bpos bkey_min(struct bpos l, struct bpos r) { - return bpos_lt(l, r) ? l : r; + return bkey_lt(l, r) ? l : r; } -static inline struct bpos bpos_max(struct bpos l, struct bpos r) +static inline struct bpos bkey_max(struct bpos l, struct bpos r) { - return bpos_gt(l, r) ? l : r; + return bkey_gt(l, r) ? l : r; } void bch2_bpos_swab(struct bpos *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 04051e45f4e0..8c951cfa74ae 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2022,10 +2022,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e */ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) iter_pos = k.k->p; - else if (bkey_gt(bkey_start_pos(k.k), iter->pos)) - iter_pos = bkey_start_pos(k.k); else - iter_pos = iter->pos; + iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); if (bkey_gt(iter_pos, end)) { bch2_btree_iter_set_pos(iter, end); -- cgit From 08f7803159f63e0ce5660acca061cbd6bac06166 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 18:46:03 -0500 Subject: bcachefs: bch2_trans_revalidate_updates_in_node() When we started stashing the key being overwritten in btree_insert_entry, this introduced a typical iterator invalidation problem, triggered by btree node splits or resorts. Previously, we dealt with this by unconditionally re-validating those stashed pointers in the transaction commit path. This patch gets rid of that by doing it only when needed, in bch2_trans_node_add() or bch2_trans_node_reinit_iter(). 
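An aside on the "iterator invalidation problem" referred to above: it is the generic hazard of stashing a raw pointer into storage that can move or be rewritten underneath you. The toy program below is an analogy only, not bcachefs code: a pointer saved across a realloc goes stale and must be recomputed from a stable index, just as the stashed old-key pointers must be recomputed after a node split, resort or key-cache reallocation.

#include <assert.h>
#include <stdlib.h>

struct buf {
	int	*data;
	size_t	nr;
};

static void buf_push(struct buf *b, int v)
{
	/* may move the whole array: any saved element pointer is now stale */
	b->data = realloc(b->data, (b->nr + 1) * sizeof(*b->data));
	assert(b->data);
	b->data[b->nr++] = v;
}

int main(void)
{
	struct buf b = { NULL, 0 };
	size_t saved = 0;		/* stable index, analogous to (btree, pos) */
	int *stale, *fresh;

	buf_push(&b, 1);
	stale = &b.data[saved];		/* pointer stashed across a mutation... */

	buf_push(&b, 2);		/* ...which may have moved b.data */

	fresh = &b.data[saved];		/* "revalidate": recompute from the index */
	assert(*fresh == 1);
	(void) stale;			/* the old pointer must not be used again */

	free(b.data);
	return 0;
}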
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 30 +++++++++++++++++++++ fs/bcachefs/btree_update_leaf.c | 60 ++++++++++++++++++++++------------------- 2 files changed, 63 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8c951cfa74ae..c6ccf3add733 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -652,6 +652,32 @@ void bch2_btree_path_level_init(struct btree_trans *trans, /* Btree path: fixups after btree node updates: */ +static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!i->cached && + i->level == b->c.level && + i->btree_id == b->c.btree_id && + bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && + bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } +} + /* * A btree node is being replaced - update the iterator to point to the new * node: @@ -675,6 +701,8 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) bch2_btree_path_level_init(trans, path, b); } + + bch2_trans_revalidate_updates_in_node(trans, b); } /* @@ -687,6 +715,8 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) trans_for_each_path_with_node(trans, b, path) __btree_path_level_init(path, b->c.level); + + bch2_trans_revalidate_updates_in_node(trans, b); } /* Btree path: traverse, set_pos: */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c7d8d2a55551..bdd703289ecb 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -40,6 +40,28 @@ struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct return (struct bkey_s_c) { u, NULL }; } +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + i->old_k.needs_whiteout = k.k->needs_whiteout; + + BUG_ON(memcmp(&i->old_k, k.k, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + static int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, struct bkey_i *, enum btree_update_flags); @@ -354,6 +376,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; unsigned new_u64s; struct bkey_i *new_k; @@ -381,6 +404,10 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, return -ENOMEM; } + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + ck->u64s = new_u64s; ck->k = new_k; return 0; @@ -396,6 +423,8 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct bkey_i *new = i->k; int ret; + verify_update_old_key(trans, i); + if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; @@ -433,6 +462,8 @@ static int run_one_trans_trigger(struct btree_trans 
*trans, struct btree_insert_ struct bkey old_k = i->old_k; struct bkey_s_c old = { &old_k, i->old_v }; + verify_update_old_key(trans, i); + if ((i->flags & BTREE_TRIGGER_NORUN) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) return 0; @@ -611,33 +642,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (btree_node_type_needs_gc(i->bkey_type)) marking = true; - - /* - * Revalidate before calling mem triggers - XXX, ugly: - * - * - successful btree node splits don't cause transaction - * restarts and will have invalidated the pointer to the bkey - * value - * - btree_node_lock_for_insert() -> btree_node_prep_for_write() - * when it has to resort - * - btree_key_can_insert_cached() when it has to reallocate - * - * Ugly because we currently have no way to tell if the - * pointer's been invalidated, which means it's debatabale - * whether we should be stashing the old key at all. - */ - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, - i->k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } } /* @@ -707,6 +711,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; + verify_update_old_key(trans, i); + if (trans->journal_transaction_names) { entry = bch2_journal_add_entry(j, &trans->journal_res, BCH_JSET_ENTRY_overwrite, -- cgit From 1ae40fd816ca6f52b46a8d74f799f8a85ecb92ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Dec 2022 18:19:30 -0500 Subject: bcachefs: Fix error path in bch2_trans_commit_write_locked() Previously, we were journalling extra_journal_entries (which is used for new btree roots, and irreversibly mutates system state) before calling bch2_trans_fs_usage_apply(), which can fail - whoops. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index bdd703289ecb..f986f1774b51 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -660,21 +660,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - /* * Not allowed to fail after we've gotten our journal reservation - we * have to use it: */ - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -700,6 +692,15 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { trans_for_each_update(trans, i) { struct journal *j = &c->journal; -- cgit From 14d7d61fac9c151a270c6ef6f969993eae9f1bbf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Dec 2022 14:43:03 -0500 Subject: bcachefs: Fix btree_gc when multiple passes required We weren't resetting filesystem & device usage when restarting gc, which was spotted when free bucket counters overflowed - whoops. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2defa811f48f..d4a0e0a716c5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1282,8 +1282,7 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca = NULL; unsigned i; @@ -1298,7 +1297,6 @@ static int bch2_gc_start(struct bch_fs *c, } for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1315,6 +1313,22 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } +static int bch2_gc_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; + + return bch2_gc_start(c); +} + /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1761,7 +1775,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c, metadata_only) ?: + ret = bch2_gc_start(c) ?: bch2_gc_alloc_start(c, metadata_only) ?: bch2_gc_reflink_start(c, metadata_only); if (ret) @@ -1822,6 +1836,9 @@ again: bch2_gc_stripes_reset(c, metadata_only); bch2_gc_alloc_reset(c, metadata_only); bch2_gc_reflink_reset(c, metadata_only); + ret = bch2_gc_reset(c); + if (ret) + goto out; /* flush fsck errors, reset counters */ bch2_flush_fsck_errs(c); -- cgit From 1ba8a796b427d312aa68c2e04e00b42ec742883e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Dec 2022 14:47:42 -0500 Subject: bcachefs: Recover from blacklisted journal entries If it so happens that we crash while dirty, meaning we don't have the superblock clean section, and we erroneously mark a journal entry we wrote as blacklisted, we won't be able to recover. This patch fixes this by adding a fallback: if we've got no superblock clean section, and no non-ignored journal entries, we try the most recent ignored journal entry. 
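The fallback amounts to one more reverse scan: if nothing usable was found, take the newest entry that was read at all and clear its ignore flag. A reduced sketch with a plain array standing in for the journal_entries genradix (stand-in types, not the real ones):

struct jentry_sketch { unsigned long long seq; int ignore; };

static struct jentry_sketch *pick_fallback(struct jentry_sketch *e, unsigned nr)
{
	while (nr--)
		if (e[nr].seq) {	/* newest entry we actually read */
			e[nr].ignore = 0;	/* un-ignore it, recover from it */
			return &e[nr];
		}
	return 0;			/* nothing at all to recover from */
}

This also relies on the other change in the patch: blacklisted and non-flush entries are now only marked ignore rather than freed outright, so they are still around to fall back on.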
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 10 +++------- fs/bcachefs/recovery.c | 10 +++++++++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 854a0685db09..a54c06064647 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1123,9 +1123,6 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: - * - * - * XXX check for torn write on last journal entry */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { int write = READ; @@ -1139,13 +1136,13 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - journal_replay_free(c, i); + i->ignore = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - journal_replay_free(c, i); + i->ignore = true; continue; } @@ -1194,8 +1191,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (bch2_journal_seq_is_blacklisted(c, seq, true)) { fsck_err_on(!JSET_NO_FLUSH(&i->j), c, "found blacklisted journal entry %llu", seq); - - journal_replay_free(c, i); + i->ignore = true; } } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 15a676196e2f..976c336f294a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1137,7 +1137,15 @@ int bch2_fs_recovery(struct bch_fs *c) if (!last_journal_entry) { fsck_err_on(!c->sb.clean, c, "no journal entries found"); - goto use_clean; + if (clean) + goto use_clean; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i) { + last_journal_entry = &(*i)->j; + (*i)->ignore = false; + break; + } } ret = journal_keys_sort(c); -- cgit From a16b19cd1d91366e38c06f1195437b200b32e980 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Dec 2022 20:36:06 -0500 Subject: bcachefs: Allow for more btrees Expand some bitfields so we can keep adding more btrees. 
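An n-bit bitfield silently truncates ids at 1 << n, so a 4-bit btree_id field tops out at 16 btrees; widening it to 5 bits (and to a full 8 in btree_iter) is what makes room. A hypothetical guard of this shape - not part of the patch - shows the constraint:

#define BTREE_ID_NR_SKETCH	17		/* stand-in for the real btree count */

struct path_sketch {
	unsigned btree_id:5;			/* was :4, i.e. at most 16 btrees */
	unsigned locks_want:3;			/* narrowed by the same patch */
};

_Static_assert(BTREE_ID_NR_SKETCH <= (1U << 5), "btree_id bitfield too narrow");

(locks_want shrinking from 4 to 3 bits in btree_path presumably keeps the bitfield storage from growing, though the patch doesn't say.)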
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6a852f7fbf70..e47fd252c3fb 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -225,7 +225,7 @@ struct btree_path { /* btree_iter_copy starts here: */ struct bpos pos; - enum btree_id btree_id:4; + enum btree_id btree_id:5; bool cached:1; bool preserve:1; enum btree_path_uptodate uptodate:2; @@ -235,7 +235,7 @@ struct btree_path { */ bool should_be_locked:1; unsigned level:3, - locks_want:4; + locks_want:3; u8 nodes_locked; struct btree_path_level { @@ -269,7 +269,7 @@ struct btree_iter { struct btree_path *update_path; struct btree_path *key_cache_path; - enum btree_id btree_id:4; + enum btree_id btree_id:8; unsigned min_depth:3; unsigned advanced:1; -- cgit From 994ba475433a9395cb71e2ffb1928ce9fdb98e80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 22:13:19 -0500 Subject: bcachefs: New btree helpers This introduces some new conveniences, to help cut down on boilerplate: - bch2_trans_kmalloc_nomemzero() - performance optimiation - bch2_bkey_make_mut() - bch2_bkey_get_mut() - bch2_bkey_get_mut_typed() - bch2_bkey_alloc() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_gc.c | 9 +--- fs/bcachefs/btree_iter.h | 63 +++++++++++++++++++++-- fs/bcachefs/btree_update_leaf.c | 15 ++---- fs/bcachefs/buckets.c | 45 +++++------------ fs/bcachefs/ec.c | 4 +- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/lru.c | 18 ++----- fs/bcachefs/migrate.c | 4 +- fs/bcachefs/move.c | 4 +- fs/bcachefs/subvolume.c | 108 ++++++++++------------------------------ 11 files changed, 114 insertions(+), 162 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index cef5de13a6e4..860ac8fc5833 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -491,7 +491,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc(trans, sizeof(*k)); + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d4a0e0a716c5..fdc9de6e9908 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1578,15 +1578,12 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new; + struct bkey_i *new = bch2_bkey_make_mut(trans, k); - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - bkey_reassemble(new, k); - if (!r->refcount) new->k.type = KEY_TYPE_deleted; else @@ -1903,13 +1900,11 @@ static int gc_btree_gens_key(struct btree_trans *trans, percpu_up_read(&c->mark_lock); return 0; update: - u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + u = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; - bkey_reassemble(u, k); - bch2_extent_normalize(c, bkey_i_to_s(u)); return bch2_trans_update(trans, iter, u, 0); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3cf0b453a4c0..98ff39bcd8f2 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -404,19 +404,76 @@ void *__bch2_trans_kmalloc(struct btree_trans *, size_t); static inline void *bch2_trans_kmalloc(struct btree_trans *trans, 
size_t size) { - unsigned new_top = trans->mem_top + size; - void *p = trans->mem + trans->mem_top; + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; - if (likely(new_top <= trans->mem_bytes)) { trans->mem_top += size; memset(p, 0, size); return p; } else { return __bch2_trans_kmalloc(trans, size); + } +} +static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size); } } +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); + + if (!IS_ERR(mut)) + bkey_reassemble(mut, k); + return mut; +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + return unlikely(IS_ERR(k.k)) + ? ERR_CAST(k.k) + : bch2_bkey_make_mut(trans, k); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _type) \ +({ \ + struct bkey_i *_k = bch2_bkey_get_mut(_trans, _iter); \ + struct bkey_i_##_type *_ret; \ + \ + if (IS_ERR(_k)) \ + _ret = ERR_CAST(_k); \ + else if (unlikely(_k->k.type != KEY_TYPE_##_type)) \ + _ret = ERR_PTR(-ENOENT); \ + else \ + _ret = bkey_i_to_##_type(_k); \ + _ret; \ +}) + +#define bch2_bkey_alloc(_trans, _iter, _type) \ +({ \ + struct bkey_i_##_type *_k = bch2_trans_kmalloc_nomemzero(_trans, sizeof(*_k));\ + if (!IS_ERR(_k)) { \ + bkey_##_type##_init(&_k->k_i); \ + _k->k.p = (_iter)->pos; \ + } \ + _k; \ +}) + u32 bch2_trans_begin(struct btree_trans *); static inline struct btree * diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f986f1774b51..ca15ed33a204 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1196,13 +1196,11 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + update = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) return ret; - bkey_reassemble(update, k); - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) return 0; @@ -1287,12 +1285,10 @@ int bch2_trans_update_extent(struct btree_trans *trans, trans->extra_journal_res += compressed_sectors; if (front_split) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + update = bch2_bkey_make_mut(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; - bkey_reassemble(update, k); - bch2_cut_back(start, update); bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, @@ -1311,12 +1307,10 @@ int bch2_trans_update_extent(struct btree_trans *trans, if (k.k->p.snapshot != insert->k.p.snapshot && (front_split || back_split)) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + update = bch2_bkey_make_mut(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; - bkey_reassemble(update, k); - bch2_cut_front(start, update); bch2_cut_back(insert->k.p, update); @@ -1360,11 +1354,10 @@ int bch2_trans_update_extent(struct btree_trans *trans, } if (back_split) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + update = bch2_bkey_make_mut(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; - bkey_reassemble(update, k); 
bch2_cut_front(insert->k.p, update); ret = bch2_trans_update_by_path(trans, iter.path, update, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2e657ded03ff..49cb2589f47c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1400,7 +1400,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_stripe *s; struct bch_replicas_padded r; int ret = 0; @@ -1408,20 +1407,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_stripe) { - bch2_trans_inconsistent(trans, + s = bch2_bkey_get_mut_typed(trans, &iter, stripe); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(ret == -ENOENT, trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - ret = -EIO; goto err; } - if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { + if (!bch2_ptr_matches_stripe(&s->v, p)) { bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); @@ -1429,12 +1424,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto err; } - s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - bkey_reassemble(&s->k_i, k); stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); @@ -1710,8 +1699,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i *n; + struct bkey_i *k; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; struct printbuf buf = PRINTBUF; @@ -1720,19 +1708,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + k = bch2_bkey_get_mut(trans, &iter); + ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(n, k); - - refcount = bkey_refcount(n); + refcount = bkey_refcount(k); if (!refcount) { bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_trans_inconsistent(trans, @@ -1756,12 +1737,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(k.k)); + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } @@ -1769,11 +1750,11 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, le64_add_cpu(refcount, add); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, n, 0); + ret = bch2_trans_update(trans, &iter, k, 0); if (ret) goto err; - *idx = k.k->p.offset; + *idx = k->k.p.offset; err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c855ea025f0e..65da4a185bbb 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -844,13 +844,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans, dev = s->key.v.ptrs[block].dev; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; - bkey_reassemble(n, k); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); BUG_ON(!ec_ptr); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7db1486a1143..0124aa0b14c5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -779,12 +779,10 @@ static int hash_redo_key(struct btree_trans *trans, if (IS_ERR(delete)) return PTR_ERR(delete); - tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + tmp = bch2_bkey_make_mut(trans, k); if (IS_ERR(tmp)) return PTR_ERR(tmp); - bkey_reassemble(tmp, k); - bkey_init(&delete->k); delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index db1674ef1d22..e16686b3b45a 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -101,14 +101,12 @@ int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) BUG_ON(iter.pos.inode != lru_id); *time = iter.pos.offset; - lru = bch2_trans_kmalloc(trans, sizeof(*lru)); + lru = bch2_bkey_alloc(trans, &iter, lru); ret = PTR_ERR_OR_ZERO(lru); if (ret) goto err; - bkey_lru_init(&lru->k_i); - lru->k.p = iter.pos; - lru->v.idx = cpu_to_le64(idx); + lru->v.idx = cpu_to_le64(idx); ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); if (ret) @@ -164,17 +162,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, " for %s", (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto 
err; - - bkey_init(&update->k); - update->k.p = lru_iter->pos; - - ret = bch2_trans_update(trans, lru_iter, update, 0); + ret = bch2_btree_delete_at(trans, lru_iter, 0); if (ret) goto err; } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 8b258d966d04..e3e39127b40a 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,13 +49,11 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device(k, dev_idx)) return 0; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; - bkey_reassemble(n, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); if (ret) return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9125cea080bd..4d6fd3025e40 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -192,13 +192,11 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, struct bkey_i *n; int ret; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; - bkey_reassemble(n, k); - while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); struct bch_extent_ptr *ptr; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f19f6f8d3233..d090a74bd052 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -377,33 +377,22 @@ int bch2_fs_snapshots_start(struct bch_fs *c) static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_snapshot *s; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); - ret = -ENOENT; + s = bch2_bkey_get_mut_typed(trans, &iter, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id); goto err; } /* already deleted? 
*/ - if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + if (BCH_SNAPSHOT_DELETED(&s->v)) goto err; - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - bkey_reassemble(&s->k_i, k); SET_BCH_SNAPSHOT_DELETED(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; @@ -421,7 +410,6 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; struct bkey_s_c k; struct bkey_s_c_snapshot s; - struct bkey_i_snapshot *parent; u32 parent_id; unsigned i; int ret = 0; @@ -445,26 +433,17 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) parent_id = le32_to_cpu(s.v->parent); if (parent_id) { + struct bkey_i_snapshot *parent; + bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, POS(0, parent_id), BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&p_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); - ret = -ENOENT; - goto err; - } - - parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + parent = bch2_bkey_get_mut_typed(trans, &p_iter, snapshot); ret = PTR_ERR_OR_ZERO(parent); - if (ret) + if (unlikely(ret)) { + bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", parent_id); goto err; - - bkey_reassemble(&parent->k_i, k); + } for (i = 0; i < 2; i++) if (le32_to_cpu(parent->v.children[i]) == id) @@ -522,13 +501,11 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; } - n = bch2_trans_kmalloc(trans, sizeof(*n)); + n = bch2_bkey_alloc(trans, &iter, snapshot); ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - bkey_snapshot_init(&n->k_i); - n->k.p = iter.pos; n->v.flags = 0; n->v.parent = cpu_to_le32(parent); n->v.subvol = cpu_to_le32(snapshot_subvols[i]); @@ -545,23 +522,13 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, if (parent) { bch2_btree_iter_set_pos(&iter, POS(0, parent)); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { - bch_err(trans->c, "snapshot %u not found", parent); - ret = -ENOENT; - goto err; - } - - n = bch2_trans_kmalloc(trans, sizeof(*n)); + n = bch2_bkey_get_mut_typed(trans, &iter, snapshot); ret = PTR_ERR_OR_ZERO(n); - if (ret) + if (unlikely(ret)) { + if (ret == -ENOENT) + bch_err(trans->c, "snapshot %u not found", parent); goto err; - - bkey_reassemble(&n->k_i, k); + } if (n->v.children[0] || n->v.children[1]) { bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); @@ -967,7 +934,6 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_subvolume *n; struct subvolume_unlink_hook *h; int ret = 0; @@ -976,23 +942,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) POS(0, subvolid), BTREE_ITER_CACHED| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); - ret = -EIO; - goto err; - } - - n = bch2_trans_kmalloc(trans, sizeof(*n)); + n = bch2_bkey_get_mut_typed(trans, &iter, subvolume); ret = PTR_ERR_OR_ZERO(n); - if (ret) + if (unlikely(ret)) { + bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, 
"missing subvolume %u", subvolid); goto err; + } - bkey_reassemble(&n->k_i, k); SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); ret = bch2_trans_update(trans, &iter, &n->k_i, 0); @@ -1049,27 +1005,19 @@ found_slot: if (src_subvolid) { /* Creating a snapshot: */ - src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); - ret = PTR_ERR_OR_ZERO(src_subvol); - if (ret) - goto err; bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, POS(0, src_subvolid), BTREE_ITER_CACHED| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&src_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch_err(c, "subvolume %u not found", src_subvolid); - ret = -ENOENT; + src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, subvolume); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, + "subvolume %u not found", src_subvolid); goto err; } - bkey_reassemble(&src_subvol->k_i, k); parent = le32_to_cpu(src_subvol->v.snapshot); } @@ -1086,18 +1034,16 @@ found_slot: goto err; } - new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + new_subvol = bch2_bkey_alloc(trans, &dst_iter, subvolume); ret = PTR_ERR_OR_ZERO(new_subvol); if (ret) goto err; - bkey_subvolume_init(&new_subvol->k_i); new_subvol->v.flags = 0; new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); new_subvol->v.inode = cpu_to_le64(inode); SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - new_subvol->k.p = dst_iter.pos; ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); if (ret) goto err; -- cgit From 47a0ea6abeef60115e3ca60ed0a640bba376a7d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 20:11:46 -0500 Subject: bcachefs: Add some unlikely() annotations Add a few easy unlikely() optimizations. These are mainly worthwhile because the compiler will (usually) put the branch-not-taken path at the end of the function, meaning better icache utilization. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index a10046ae01f2..abb444192749 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -488,18 +488,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) return BKEY_PACK_POS_FAIL; - if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { in.offset = KEY_OFFSET_MAX; in.snapshot = KEY_SNAPSHOT_MAX; exact = false; } - if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { in.snapshot = KEY_SNAPSHOT_MAX; exact = false; } - if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) exact = false; pack_state_finish(&state, out); -- cgit From 67ace2724603378fec0c5321736e2772d459fd8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Dec 2022 20:51:02 -0500 Subject: bcachefs: Add a missing bch2_err_str() call Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 976c336f294a..55819378fd4a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1564,6 +1564,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - pr_err("Error initializing new filesystem: %s (%i)", err, ret); + pr_err("Error initializing new filesystem: %s (%s)", err, bch2_err_str(ret)); return ret; } -- cgit From e242b92af5ef74cdf40b237c9e904034c081b144 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Dec 2022 21:44:32 -0500 Subject: bcachefs: Fix for long running btree transactions & key cache While a btree transaction is running, we hold a SRCU read lock on the btree key cache that prevents btree key cache keys from being freed - this is so that relock() operations won't access freed memory. The downside of this is that long running btree transactions prevent memory from being freed from the key cache. This adds a check in bch2_trans_begin() - if the transaction has been running longer than 1 second, drop and retake the SRCU read lock and zero out pointers to unlock key cache paths. 
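For context, the reason a long-lived transaction pins key cache memory is the basic SRCU contract: anything freed via call_srcu()/synchronize_srcu() on a barrier must wait for every reader that locked it earlier. srcu_read_lock()/srcu_read_unlock() below are the real kernel API and btree_trans_barrier is the real field; the wrapper function itself is just a toy:

static void srcu_contract_sketch(struct bch_fs *c)
{
	int idx = srcu_read_lock(&c->btree_trans_barrier);

	/* key cache keys freed through this barrier after the lock is taken
	 * stay allocated until we unlock - for a long-running transaction
	 * that can be a lot of pinned memory */
	srcu_read_unlock(&c->btree_trans_barrier, idx);
}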
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 18 ++++++++++++++++++ fs/bcachefs/btree_types.h | 1 + fs/bcachefs/errcode.h | 1 + 3 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c6ccf3add733..669d2b0b384a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2756,6 +2756,20 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } +static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; +} + /** * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset @@ -2811,6 +2825,9 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_relock(trans); } + if (unlikely(time_after(jiffies, trans->srcu_lock_time + HZ))) + bch2_trans_reset_srcu_lock(trans); + trans->last_restarted_ip = _RET_IP_; if (trans->restarted) bch2_btree_path_traverse_all(trans); @@ -2897,6 +2914,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ trans->nr_max_paths = s->nr_max_paths; trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { struct btree_trans *pos; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e47fd252c3fb..390cfe63fbe8 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -411,6 +411,7 @@ struct btree_trans { enum bch_errcode restarted:16; u32 restart_count; unsigned long last_restarted_ip; + unsigned long srcu_lock_time; /* * For when bch2_trans_update notices we'll be splitting a compressed diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index dc388864be6f..5f0f75726784 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -53,6 +53,7 @@ x(BCH_ERR_no_btree_node, no_btree_node_down) \ x(BCH_ERR_no_btree_node, no_btree_node_init) \ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ x(0, btree_insert_fail) \ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ -- cgit From a2b9a5b272f2bbfc3988f4b5c8e92d9951885bcb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Mar 2022 01:35:55 -0500 Subject: bcachefs: Fix bch2_journal_flush_device_pins() It's now legal for the pin fifo to be empty, which means this code needs to be updated in order to not hit an assert. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index b683a13dbf87..e8b7ed8e1333 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -831,10 +831,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); - while (!ret && seq < j->pin.back) { + while (!ret) { struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; -- cgit From 84464e57529b45e235c24bc7db24b60ffba7f540 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 19:43:41 -0500 Subject: bcachefs: Be less restrictive when validating journal overwrite entries Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a54c06064647..dbbf2a03bd38 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -661,7 +661,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); + return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From 2cc9c0db89f7b6e3c3218ddaa7964055af8d95af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Dec 2022 15:17:07 -0500 Subject: bcachefs: Fix some memcpy() warnings With CONFIG_FORTIFY_SOURCE, the compiler attempts to warn about mempcys that extend past struct field boundaries. This results in some spurious warnings where we use embedded variable length structs, this patch switches to unsafe_mecpy() to fix the warnings. 
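What the fortified memcpy() flags is a copy sized for inline variable-length data rather than for the declared type, so it crosses the last named member. unsafe_memcpy() is the real kernel helper for that case: the same copy, plus a string recording why it's intentional. A reduced sketch with a stand-in type:

struct usage_sketch {
	unsigned long long d[1];	/* really u64s entries laid out inline */
};

static void copy_usage(struct usage_sketch *dst, const struct usage_sketch *src,
		       unsigned int u64s)
{
	/* a plain memcpy() of u64s * 8 bytes here is what CONFIG_FORTIFY_SOURCE
	 * warns about, since it extends past the declared field */
	unsafe_memcpy(dst, src, u64s * sizeof(dst->d[0]),
		      "embedded variable length struct");
}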
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 49cb2589f47c..137a9aef6987 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -156,7 +156,8 @@ retry: do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64), + "embedded variable length struct"); for (i = 0; i < ARRAY_SIZE(c->usage); i++) acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); -- cgit From 9d7f2a4111be34eac6b23ed62271efb12f36815f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Jan 2023 21:34:41 -0500 Subject: bcachefs: bch2_btree_trans_to_text(): print blocked time Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 669d2b0b384a..326742e00159 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3076,7 +3076,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_str(out, " want"); + prt_printf(out, " blocked for %lluus on", + div_u64(local_clock() - trans->locking_wait.start_time, + 1000)); prt_newline(out); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); -- cgit From 5bbe3f2d0e1e52c03f32cb40cc749e1ace6453d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Dec 2022 10:39:04 -0500 Subject: bcachefs: Log more messages in the journal This patch - Adds a mechanism for queuing up journal entries prior to the journal being started, which will be used for early journal log messages - Adds bch2_fs_log_msg() and improves bch2_trans_log_msg(), which now take format strings. bch2_fs_log_msg() can be used before or after the journal has been started, and will use the appropriate mechanism. - Deletes the now obsolete bch2_journal_log_msg() - And adds more log messages to the recovery path - messages for journal/filesystem started, journal entries being blacklisted, and journal replay starting/finishing. 
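Usage-wise there is a single call shape; the routing is internal. Both calls below are taken from this patch's recovery changes - the first runs before the journal is started, so the message is stashed in early_journal_entries and copied into the first journal entry that opens; the second runs after, so it is committed through a btree transaction:

bch2_fs_log_msg(c, "blacklisting entries %llu-%llu",
		blacklist_seq, journal_seq);

bch2_fs_log_msg(c, "journal replay finished");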
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update.h | 3 +- fs/bcachefs/btree_update_leaf.c | 63 ++++++++++++++++++++++++++++++++++------- fs/bcachefs/journal.c | 50 ++++++++++---------------------- fs/bcachefs/journal.h | 1 - fs/bcachefs/journal_io.c | 20 +++++++------ fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/journal_types.h | 8 ++++++ fs/bcachefs/recovery.c | 29 +++++++++++++------ 9 files changed, 111 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 390cfe63fbe8..3cf10b3f3788 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -433,7 +433,7 @@ struct btree_trans { /* update path: */ struct btree_trans_commit_hook *hooks; - DARRAY(u64) extra_journal_entries; + darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1c2e7b2b4ed5..7e9f1f170d5f 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -82,7 +82,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); -int bch2_trans_log_msg(struct btree_trans *, const char *); +int bch2_trans_log_msg(struct btree_trans *, const char *, ...); +int bch2_fs_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ca15ed33a204..d333d0e46d0d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1772,18 +1772,25 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) { - unsigned len = strlen(msg); - unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct printbuf buf = PRINTBUF; struct jset_entry_log *l; + unsigned u64s; int ret; - ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -ENOMEM : 0; if (ret) - return ret; + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; - l = (void *) &darray_top(trans->extra_journal_entries); + l = (void *) &darray_top(*entries); l->entry.u64s = cpu_to_le16(u64s); l->entry.btree_id = 0; l->entry.level = 1; @@ -1791,10 +1798,44 @@ int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) l->entry.pad[0] = 0; l->entry.pad[1] = 0; l->entry.pad[2] = 0; - memcpy(l->d, msg, len); - while (len & 7) - l->d[len++] = '\0'; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args); + va_end(args); + + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) 
+{ + va_list args; + int ret; + + va_start(args, fmt); + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } + + va_end(args); + + return ret; - trans->extra_journal_entries.nr += jset_u64s(u64s); - return 0; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ed3ed3072db1..c7a7b9cd20f0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -249,7 +249,7 @@ static int journal_entry_open(struct journal *j) journal_entry_overhead(j); u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - if (u64s <= 0) + if (u64s <= (ssize_t) j->early_journal_entries.nr) return JOURNAL_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) @@ -274,6 +274,12 @@ static int journal_entry_open(struct journal *j) buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; + if (j->early_journal_entries.nr) { + memcpy(buf->data->_data, j->early_journal_entries.data, + j->early_journal_entries.nr * sizeof(u64)); + le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); + } + /* * Must be set before marking the journal entry as open: */ @@ -290,7 +296,9 @@ static int journal_entry_open(struct journal *j) BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); journal_state_inc(&new); - new.cur_entry_offset = 0; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -303,6 +311,9 @@ static int journal_entry_open(struct journal *j) &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); + + if (j->early_journal_entries.nr) + darray_exit(&j->early_journal_entries); return 0; } @@ -719,39 +730,6 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq); } -int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
-{ - struct jset_entry_log *entry; - struct journal_res res = { 0 }; - unsigned msglen, u64s; - va_list args; - int ret; - - va_start(args, fmt); - msglen = vsnprintf(NULL, 0, fmt, args) + 1; - va_end(args); - - u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); - - ret = bch2_journal_res_get(j, &res, u64s, 0); - if (ret) - return ret; - - entry = container_of(journal_res_entry(j, &res), - struct jset_entry_log, entry); - memset(entry, 0, u64s * sizeof(u64)); - entry->entry.type = BCH_JSET_ENTRY_log; - entry->entry.u64s = u64s - 1; - - va_start(args, fmt); - vsnprintf(entry->d, INT_MAX, fmt, args); - va_end(args); - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -1197,6 +1175,8 @@ void bch2_fs_journal_exit(struct journal *j) { unsigned i; + darray_exit(&j->early_journal_entries); + for (i = 0; i < ARRAY_SIZE(j->buf); i++) kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 3e93f0d67c09..1c7508333432 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -478,7 +478,6 @@ int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); -int bch2_journal_log_msg(struct journal *, const char *, ...); void bch2_journal_halt(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index dbbf2a03bd38..a9744924d619 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1079,7 +1079,10 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) +int bch2_journal_read(struct bch_fs *c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) { struct journal_list jlist; struct journal_replay *i, **_i, *prev = NULL; @@ -1088,7 +1091,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) unsigned iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; - u64 seq, last_seq = 0; + u64 seq; int ret = 0; closure_init_stack(&jlist.cl); @@ -1117,6 +1120,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (jlist.ret) return jlist.ret; + *last_seq = 0; *start_seq = 0; *blacklist_seq = 0; @@ -1153,7 +1157,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) le64_to_cpu(i->j.seq))) i->j.last_seq = i->j.seq; - last_seq = le64_to_cpu(i->j.last_seq); + *last_seq = le64_to_cpu(i->j.last_seq); *blacklist_seq = le64_to_cpu(i->j.seq) + 1; break; } @@ -1163,13 +1167,13 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) return 0; } - if (!last_seq) { + if (!*last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); return 0; } bch_info(c, "journal read done, replaying entries %llu-%llu", - last_seq, *blacklist_seq - 1); + *last_seq, *blacklist_seq - 1); if (*start_seq != *blacklist_seq) bch_info(c, "dropped unflushed entries %llu-%llu", @@ -1183,7 +1187,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) continue; seq = le64_to_cpu(i->j.seq); - if (seq < last_seq) { + if (seq < *last_seq) { journal_replay_free(c, i); continue; } @@ -1196,7 +1200,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) } /* Check for missing 
entries: */ - seq = last_seq; + seq = *last_seq; genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1234,7 +1238,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) " prev at %s\n" " next at %s", missing_start, missing_end, - last_seq, *blacklist_seq - 1, + *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); printbuf_exit(&buf1); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 2f8bbf06b289..a32c2876f2a6 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -52,7 +52,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, struct journal_replay *); -int bch2_journal_read(struct bch_fs *, u64 *, u64 *); +int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 4c3065dceeea..0e6bde669b3e 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -177,6 +177,8 @@ enum journal_errors { #undef x }; +typedef DARRAY(u64) darray_u64; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -206,6 +208,12 @@ struct journal { enum journal_errors cur_entry_error; unsigned buf_size_want; + /* + * We may queue up some things to be journalled (log messages) before + * the journal has actually started - stash them here: + */ + darray_u64 early_journal_entries; + /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 55819378fd4a..d054e83d86ef 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -587,7 +587,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c) +static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) { struct journal_keys *keys = &c->journal_keys; struct journal_key **keys_sorted, *k; @@ -609,6 +609,13 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); + if (keys->nr) { + ret = bch2_fs_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); + if (ret) + goto err; + } + for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; @@ -638,7 +645,7 @@ static int bch2_journal_replay(struct bch_fs *c) ret = bch2_journal_error(j); if (keys->nr && !ret) - bch2_journal_log_msg(&c->journal, "journal replay finished"); + bch2_fs_log_msg(c, "journal replay finished"); err: kvfree(keys_sorted); return ret; @@ -1042,7 +1049,7 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; - u64 blacklist_seq, journal_seq; + u64 last_seq, blacklist_seq, journal_seq; bool write_sb = false; int ret = 0; @@ -1109,7 +1116,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct journal_replay **i; bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); + ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); if (ret) goto err; @@ -1191,7 +1198,9 @@ use_clean: journal_seq += 8; if (blacklist_seq != journal_seq) { - ret = bch2_journal_seq_blacklist_add(c, + ret = bch2_fs_log_msg(c, "blacklisting entries %llu-%llu", + blacklist_seq, journal_seq) ?: + 
bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); @@ -1199,12 +1208,14 @@ use_clean: } } - ret = bch2_fs_journal_start(&c->journal, journal_seq); + ret = bch2_fs_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: + bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(&c->journal, "dropping alloc info"); + bch2_fs_log_msg(c, "dropping alloc info"); /* * Skip past versions that might have possibly been used (as nonces), @@ -1260,7 +1271,7 @@ use_clean: bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; - ret = bch2_journal_replay(c); + ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); if (ret) goto err; if (c->opts.verbose || !c->sb.clean) @@ -1293,7 +1304,7 @@ use_clean: bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; - ret = bch2_journal_replay(c); + ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); if (ret) goto err; if (c->opts.verbose || !c->sb.clean) -- cgit From 60573ff5d0de3f54a8af397f5ba9d3ab443f274e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 19:27:02 -0500 Subject: bcachefs: Make log message at startup a bit cleaner Don't print out opts= if no options have been specified. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 37dce3e3cccb..c911a07f8e8e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -846,9 +846,12 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; + prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]); + if (c->opts.read_only) { - prt_printf(&p, "ro"); + prt_str(&p, " opts="); first = false; + prt_printf(&p, "ro"); } for (i = 0; i < bch2_opts_nr; i++) { @@ -861,16 +864,12 @@ static void print_mount_opts(struct bch_fs *c) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - if (!first) - prt_printf(&p, ","); + prt_str(&p, first ? " opts=" : ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } - if (!p.pos) - prt_printf(&p, "(null)"); - - bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + bch_info(c, "%s", p.buf); printbuf_exit(&p); } -- cgit From 149651dc6c1250b3b51c38391b3677261ac94075 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Dec 2022 15:31:27 -0500 Subject: bcachefs: fix fsck error Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9dedac2c7885..81f513c7a1a0 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -557,7 +557,7 @@ enum btree_validate_ret { \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ + !test_bit(BCH_FS_FSCK_DONE, &c->flags)) { \ mustfix_fsck_err(c, "%s", out.buf); \ goto out; \ } \ -- cgit From b8fe1b1dfecc10e571f82327d61c693720d39b19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Jan 2023 17:14:07 -0500 Subject: bcachefs: Convert btree_err() to a function This makes the code more readable, and reduces text size by 8 kb. 
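The conversion follows the usual pattern for large error macros: everything that can live in a function moves into one out-of-line copy, and the macro keeps only what a function cannot express - the jump to the caller's fsck_err label. Illustrative names below, not the bcachefs ones; the caller is assumed to have an fsck_err label, just as in the real code:

enum { ERR_FIXED = 0 };

static int __report_err(int type, const char *fmt, ...);	/* one shared copy of the body */

#define report_err(type, fmt, ...)					\
({									\
	int _ret = __report_err(type, fmt, ##__VA_ARGS__);		\
	if (_ret != ERR_FIXED)						\
		goto fsck_err;	/* control flow stays in the macro */	\
	true;								\
})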
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 113 ++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 81f513c7a1a0..61603b3a4a5d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -520,11 +520,10 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, bch2_log_msg(c, "")); - if (!write) - prt_str(out, "error validating btree node "); - else - prt_str(out, "corrupt btree node before write "); + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -547,52 +546,68 @@ enum btree_validate_ret { BTREE_RETRY_READ = 64, }; +static int __btree_err(enum btree_err_type type, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + const char *fmt, ...) +{ + struct printbuf out = PRINTBUF; + va_list args; + int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == READ && + type == BTREE_ERR_FIXABLE && + !test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + mustfix_fsck_err(c, "%s", out.buf); + goto out; + } + + bch2_print_string_as_lines(KERN_ERR, out.buf); + + if (write == WRITE) { + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + switch (type) { + case BTREE_ERR_FIXABLE: + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + case BTREE_ERR_WANT_RETRY: + if (have_retry) + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_MUST_RETRY: + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_FATAL: + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + #define btree_err(type, c, ca, b, i, msg, ...) 
\ ({ \ - __label__ out; \ - struct printbuf out = PRINTBUF; \ - \ - btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, msg, ##__VA_ARGS__); \ - \ - if (type == BTREE_ERR_FIXABLE && \ - write == READ && \ - !test_bit(BCH_FS_FSCK_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", out.buf); \ - goto out; \ - } \ - \ - bch2_print_string_as_lines(KERN_ERR, out.buf); \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - switch (write) { \ - case READ: \ - switch (type) { \ - case BTREE_ERR_FIXABLE: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - case BTREE_ERR_WANT_RETRY: \ - if (have_retry) { \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - } \ - break; \ - case BTREE_ERR_MUST_RETRY: \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - case BTREE_ERR_FATAL: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - case WRITE: \ - if (bch2_fs_inconsistent(c)) { \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - } \ -out: \ - printbuf_exit(&out); \ + if (_ret != -BCH_ERR_fsck_fix) \ + goto fsck_err; \ true; \ }) @@ -892,7 +907,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; - int ret, retry_read = 0, write = READ; + int ret = 0, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ -- cgit From 494dcc57a7bf639c39364b5f84c1b6db39a0f83a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Jan 2023 17:32:16 -0500 Subject: bcachefs: Plumb saw_error through to btree_err() The btree node read path has the ability to kick off an asynchronous btree node rewrite if we saw and corrected an error. Previously this was only used for errors that caused one of the replicas to be unusable - this patch plumbs it through to all error paths, so that normal fsck errors can be corrected. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 25 +++++++++++++++---------- fs/bcachefs/btree_io.h | 2 +- fs/bcachefs/debug.c | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 61603b3a4a5d..700ce14baa24 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -608,7 +608,7 @@ fsck_err: \ if (_ret != -BCH_ERR_fsck_fix) \ goto fsck_err; \ - true; \ + *saw_error = true; \ }) #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) @@ -668,7 +668,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, unsigned sectors, - int write, bool have_retry) + int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -805,7 +805,8 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, } static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, bool have_retry) + struct bset *i, int write, + bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -892,7 +893,7 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry) + struct btree *b, bool have_retry, bool *saw_error) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -1003,14 +1004,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le16_to_cpu(i->version)); ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry); + READ, have_retry, saw_error); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry); + ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); if (ret) goto fsck_err; @@ -1205,7 +1206,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) { + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { if (retry) bch_info(c, "retry success"); break; @@ -1311,6 +1312,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1397,13 +1399,15 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_bytes(c)); - ret = bch2_btree_node_read_done(c, NULL, b, false); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; } if (ret) set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { mempool_free(ra->buf[i], &c->btree_bounce_pool); @@ -1780,6 +1784,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { struct printbuf buf = PRINTBUF; + bool saw_error; int ret; ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), @@ -1791,8 +1796,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (ret) return ret; - ret = validate_bset_keys(c, b, i, WRITE, false) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); if (ret) { bch2_inconsistent_error(c); dump_stack(); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index a720dd74139b..c43fb60b8c82 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -129,7 +129,7 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool); + struct btree *, bool, bool *); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d3e769b1eb3e..ab210296223b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -39,7 +39,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct bset *sorted, *inmemory = &b->data->keys; struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; - bool failed = false; + bool failed = false, saw_error = false; if (!bch2_dev_get_ioref(ca, READ)) return false; @@ -60,7 +60,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, memcpy(n_ondisk, n_sorted, btree_bytes(c)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) return false; n_sorted = c->verify_data->data; -- cgit From c515e3f019fe0ab60ae6f5343d211f52b8a2c759 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Jan 2023 06:29:04 -0500 Subject: bcachefs: Kill bch2_extent_trim_atomic() usage Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 16 ++++------------ fs/bcachefs/inode.c | 14 +++----------- 2 files changed, 7 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d333d0e46d0d..3e57722007a9 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1716,18 +1716,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - unsigned max_sectors = - KEY_SIZE_MAX & (~0 << trans->c->block_bits); - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - 
bch2_cut_back(end, &delete); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index cf453edcb5ab..141cf21e2951 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -595,11 +595,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, int ret = 0; /* - * We're never going to be deleting extents, no need to use an extent - * iterator: + * We're never going to be deleting partial extents, no need to use an + * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); while (1) { bch2_trans_begin(trans); @@ -621,14 +621,6 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -- cgit From ee94c413a7ef5f10a2768826b2e576981990c4b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Jan 2023 22:58:19 -0500 Subject: bcachefs: Delete a faulty assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 326742e00159..398db0faa816 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1011,12 +1011,11 @@ retry_all: } /* - * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() - * and relock(), relock() won't relock since path->should_be_locked - * isn't set yet, which is all fine + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since + * path->should_be_locked is not set yet, we might have unlocked and + * then failed to relock a path - that's fine. */ - trans_for_each_path(trans, path) - BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); err: bch2_btree_cache_cannibalize_unlock(c); -- cgit From c82ed3047b8875b07b19e6e287c48f27a37b756f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jan 2023 00:04:30 -0500 Subject: bcachefs: Fix bch2_btree_path_traverse_all() We need to take a ref on a path while we're traversing it: this fixes a bug with paths getting reused while being traversed, in the key cache fill code.
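A minimal sketch of the pattern behind this fix, in plain C with made-up names (illustrative only, not the bcachefs code in the diff that follows): take a reference on the object before an operation that can drop locks and allow the object to be reused, and drop the reference afterwards.

    #include <stdatomic.h>

    struct path_obj {
            atomic_int ref;         /* reference count pinning the object */
            int state;
    };

    static void path_get(struct path_obj *p) { atomic_fetch_add(&p->ref, 1); }
    static void path_put(struct path_obj *p) { atomic_fetch_sub(&p->ref, 1); }

    /* Stand-in for a traversal that may drop locks and trigger path reuse. */
    static int traverse_one(struct path_obj *p) { return p->state; }

    static int traverse_pinned(struct path_obj *p)
    {
            int ret;

            path_get(p);            /* keep p from being reused while we work on it */
            ret = traverse_one(p);
            path_put(p);            /* p may be reused or freed again */
            return ret;
    }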
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 398db0faa816..31733c239746 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -999,7 +999,10 @@ retry_all: * the same position: */ if (path->uptodate) { + __btree_path_get(path, false); ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_put(path, false); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ret == -ENOMEM) goto retry_all; -- cgit From 7af365eb3694b7ef7ce2b90b6de4b830a49cdda4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jan 2023 00:05:30 -0500 Subject: bcachefs: Improve bkey_cached_lock_for_evict() We don't need a write lock to check if a key is dirty. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4833cb4c7cf5..53b9f0825ec5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -56,13 +56,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) if (!six_trylock_intent(&ck->c.lock)) return false; - if (!six_trylock_write(&ck->c.lock)) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_intent(&ck->c.lock); return false; } - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_write(&ck->c.lock); + if (!six_trylock_write(&ck->c.lock)) { six_unlock_intent(&ck->c.lock); return false; } -- cgit From 6c36318cc702f05d302fb98a99636e320392bdf1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jan 2023 05:46:52 -0500 Subject: bcachefs: key cache: Don't hold btree locks while using GFP_RECLAIM This is something we need to do more widely: instead of bothering with GFP_NOIO/GFP_NOFS, if we need to allocate memory while holding locks: - first attempt the allocation with GFP_NOWAIT - if that fails, drop btree locks with bch2_trans_unlock(), then retry with GFP_KERNEL. 
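A rough sketch of that allocation pattern, with a plain mutex standing in for the btree locks (a hypothetical helper, not the actual bcachefs code in the diff that follows):

    #include <linux/slab.h>
    #include <linux/mutex.h>

    /* Allocate @size bytes while @lock is held, dropping it only if we must block. */
    static void *alloc_dropping_lock(struct mutex *lock, size_t size)
    {
            /* 1) Non-blocking attempt while still holding the lock. */
            void *p = kmalloc(size, GFP_NOWAIT|__GFP_NOWARN);

            if (p)
                    return p;

            /* 2) Drop the lock so the allocator is free to block and reclaim. */
            mutex_unlock(lock);
            p = kmalloc(size, GFP_KERNEL);
            mutex_lock(lock);

            /*
             * 3) The caller must revalidate whatever the lock protected, since
             * it was released; in bcachefs this is the bch2_trans_relock()
             * step, which can fail and force a transaction restart.
             */
            return p;
    }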
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_key_cache.c | 70 ++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 31733c239746..0a0d3aa05395 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2827,7 +2827,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_relock(trans); } - if (unlikely(time_after(jiffies, trans->srcu_lock_time + HZ))) + if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) bch2_trans_reset_srcu_lock(trans); trans->last_restarted_ip = _RET_IP_; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 53b9f0825ec5..d432d26cc68b 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -196,6 +196,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; if (!pcpu_readers) { #ifdef __KERNEL__ @@ -263,23 +264,34 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ck; } - /* GFP_NOFS because we're holding btree locks: */ - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); - if (likely(ck)) { - INIT_LIST_HEAD(&ck->list); - __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); - lockdep_set_novalidate_class(&ck->c.lock); - if (pcpu_readers) - six_lock_pcpu_alloc(&ck->c.lock); + ck = kmem_cache_zalloc(bch2_key_cache, GFP_NOWAIT|__GFP_NOWARN); + if (likely(ck)) + goto init; - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; - return ck; + bch2_trans_unlock(trans); + + ck = kmem_cache_zalloc(bch2_key_cache, GFP_KERNEL); + + ret = bch2_trans_relock(trans); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); } - return NULL; + if (!ck) + return NULL; +init: + INIT_LIST_HEAD(&ck->list); + __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); + lockdep_set_novalidate_class(&ck->c.lock); + if (pcpu_readers) + six_lock_pcpu_alloc(&ck->c.lock); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; } static struct bkey_cached * @@ -385,7 +397,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } @@ -404,12 +416,30 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); - ret = -ENOMEM; - goto err; + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; 
+ goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } } } -- cgit From ee2c6ea7760eceee3051ef2f2046d16dc5ab06ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jan 2023 01:11:18 -0500 Subject: bcachefs: btree_iter->ip_allocated In debug mode, we now track where btree iterators and paths are initialized/allocated - helpful in tracking down btree path overflows. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 69 +++++++++++++++++++++++++------------ fs/bcachefs/btree_iter.h | 28 +++++++++------ fs/bcachefs/btree_types.h | 3 ++ fs/bcachefs/btree_update_interior.c | 10 +++--- fs/bcachefs/btree_update_leaf.c | 8 +++-- 5 files changed, 78 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0a0d3aa05395..5034f8ebfb04 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -24,6 +24,15 @@ static inline void btree_path_list_remove(struct btree_trans *, struct btree_pat static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, struct btree_path *); +static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return iter->ip_allocated; +#else + return 0; +#endif +} + static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); /* @@ -1221,7 +1230,8 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr __flatten struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent) + struct btree_path *path, bool intent, + unsigned long ip) { __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); @@ -1231,15 +1241,15 @@ struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, struct btree_path * __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, int cmp) + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip, int cmp) { unsigned level = path->level; EBUG_ON(trans->restarted); EBUG_ON(!path->ref); - path = bch2_btree_path_make_mut(trans, path, intent); + path = bch2_btree_path_make_mut(trans, path, intent, ip); path->pos = new_pos; trans->paths_sorted = false; @@ -1524,7 +1534,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, struct btree_path *bch2_path_get(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned locks_want, unsigned level, - unsigned flags) + unsigned flags, unsigned long ip) { struct btree_path *path, *path_pos = NULL; bool cached = flags & BTREE_ITER_CACHED; @@ -1552,7 +1562,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path_pos->btree_id == btree_id && path_pos->level == level) { __btree_path_get(path_pos, intent); - path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -1569,7 +1579,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef 
CONFIG_BCACHEFS_DEBUG - path->ip_allocated = _RET_IP_; + path->ip_allocated = ip; #endif trans->paths_sorted = false; } @@ -1651,7 +1661,8 @@ bch2_btree_iter_traverse(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, btree_iter_search_key(iter), - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) @@ -1686,7 +1697,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(iter->path); out: bch2_btree_iter_verify_entry_exit(iter); @@ -1740,7 +1752,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) */ path = iter->path = bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); btree_path_set_level_down(trans, path, iter->min_depth); @@ -1755,7 +1768,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(iter->path); BUG_ON(iter->path->uptodate); out: @@ -1907,10 +1921,12 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, iter->flags & BTREE_ITER_INTENT, 0, iter->flags|BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL); + BTREE_ITER_CACHED_NOFILL, + _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED) ?: @@ -1942,7 +1958,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct btree_path_level *l; iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2092,7 +2109,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); if (unlikely(ret)) { k = bkey_s_c_err(ret); @@ -2124,7 +2142,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(iter->path); out_no_locked: @@ -2170,7 +2189,8 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); 
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2283,7 +2303,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2413,7 +2434,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2678,7 +2700,8 @@ static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags)); + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2687,7 +2710,8 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags)); + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); } void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -2703,7 +2727,8 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_ALL_SNAPSHOTS; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags)); + __bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); iter->min_depth = depth; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 98ff39bcd8f2..6814c87c2359 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -144,39 +144,40 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, _path = __trans_next_path_with_node((_trans), (_b), \ (_path)->idx + 1)) -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, - struct btree_path *, bool); +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); static inline struct btree_path * __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent) + struct btree_path *path, bool intent, + unsigned long ip) { if (path->ref > 1 || path->preserve) - path = __bch2_btree_path_make_mut(trans, path, intent); + path = __bch2_btree_path_make_mut(trans, path, intent, ip); path->should_be_locked = false; return path; } struct btree_path * __must_check __bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, - struct bpos, bool, int); + struct bpos, bool, unsigned long, int); static inline struct btree_path * __must_check bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, - bool intent) + bool intent, unsigned long ip) { int cmp = bpos_cmp(new_pos, path->pos); return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, cmp) + ? 
__bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) : path; } int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, - unsigned, unsigned, unsigned); + unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, @@ -359,7 +360,8 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned locks_want, unsigned depth, - unsigned flags) + unsigned flags, + unsigned long ip) { memset(iter, 0, sizeof(*iter)); iter->trans = trans; @@ -369,8 +371,11 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, iter->pos = pos; iter->k.p = pos; +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags); + locks_want, depth, flags, ip); } void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, @@ -384,7 +389,8 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags)); + bch2_btree_iter_flags(trans, btree_id, flags), + _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3cf10b3f3788..7c664186f3c3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -289,6 +289,9 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; struct bpos journal_pos; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif }; struct btree_key_cache_freelist { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e184b857c4c4..cb1e7dbf5440 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -36,8 +36,9 @@ static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, struct btree_path *path; path = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE|BTREE_ITER_INTENT); - path = bch2_btree_path_make_mut(trans, path, true); + BTREE_ITER_NOPRESERVE| + BTREE_ITER_INTENT, _RET_IP_); + path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); return path; @@ -1780,7 +1781,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, : bpos_successor(b->data->max_key); sib_path = bch2_path_get(trans, path->btree_id, sib_pos, - U8_MAX, level, BTREE_ITER_INTENT); + U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; @@ -2093,7 +2094,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT); + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); BUG_ON(iter2.path->level != b->c.level); BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3e57722007a9..7a95649dd71e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1452,7 +1452,7 @@ 
static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT); + BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, btree_path, 0); if (ret) @@ -1590,11 +1590,13 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT|BTREE_ITER_CACHED); + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); -- cgit From 313816363a843f1b812ae9190f6dcb4c49145057 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jan 2023 00:26:48 -0500 Subject: bcachefs: bch2_trans_relock_notrace() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.c | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 860ac8fc5833..cbde19bafbba 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -995,7 +995,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, GFP_KERNEL); *discard_pos_done = iter.pos; - ret = bch2_trans_relock(trans); + ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6814c87c2359..b889d1c03c7b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -206,6 +206,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 9d4be3c9cfd7..e7659b4cf9e4 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -691,6 +691,21 @@ int bch2_trans_relock(struct btree_trans *trans) return 0; } +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; -- cgit From e9a1da97377f89f09e6b0b484554fe7a0e2dbe3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Dec 2022 15:44:54 -0500 Subject: bcachefs: Fix compat path for old inode formats Old inode formats don't have all the fields of the current inode format: when unpacking inodes in the current format we can thus skip zeroing out the destination buffer, but that doesn't work on for the old formats. 
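A hedged illustration of why the zeroing matters, using made-up format versions and field names (not the real bch_inode_unpacked layout): older encodings write only a subset of the output fields, so the destination has to be zeroed unless the newest, full-width format is guaranteed.

    #include <string.h>
    #include <stdint.h>

    /* Hypothetical on-disk versions: v2 carries more fields than v1. */
    struct unpacked {
            uint64_t size;
            uint64_t sectors;
            uint32_t flags;         /* only present in v2 */
    };

    static void unpack_v1(struct unpacked *out, const uint64_t *in)
    {
            out->size    = in[0];
            out->sectors = in[1];
            /* v1 has no flags field: out->flags is left untouched here */
    }

    static void unpack_v2(struct unpacked *out, const uint64_t *in)
    {
            out->size    = in[0];
            out->sectors = in[1];
            out->flags   = (uint32_t) in[2];        /* every field written */
    }

    static void unpack(struct unpacked *out, const uint64_t *in, int version)
    {
            /*
             * Zero the destination up front: older formats don't write every
             * field, so skipping the memset is only safe when the newest
             * format is guaranteed.
             */
            memset(out, 0, sizeof(*out));

            if (version >= 2)
                    unpack_v2(out, in);
            else
                    unpack_v1(out, in);
    }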
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 141cf21e2951..6e7ba2e6fe33 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -213,6 +213,8 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { + memset(unpacked, 0, sizeof(*unpacked)); + switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -- cgit From 858536c7cea8bb86511501768ef797d103642498 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Dec 2022 20:37:11 -0500 Subject: bcachefs: Convert EROFS errors to private error codes More error code improvements - this gets us more useful error messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 3 --- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/errcode.h | 6 +++++- fs/bcachefs/io.c | 2 +- fs/bcachefs/journal.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/recovery.c | 4 ++-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/super-io.c | 4 ++-- 12 files changed, 18 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index dd47eeb1efc5..c4aee0022fab 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1222,9 +1222,6 @@ err: ? -EAGAIN : -BCH_ERR_ENOSPC_bucket_alloc; - if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) - return -EROFS; - return ret; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index fdc9de6e9908..e43ccf896e8e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1988,7 +1988,7 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, gc_btree_gens_key(&trans, &iter, k)); - if (ret && ret != -EROFS) + if (ret && !bch2_err_matches(ret, EROFS)) bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); if (ret) goto err; @@ -2001,7 +2001,7 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); - if (ret && ret != -EROFS) + if (ret && !bch2_err_matches(ret, EROFS)) bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 7a95649dd71e..f44abb3fe469 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -990,7 +990,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || test_bit(BCH_FS_STARTED, &c->flags)) - return -EROFS; + return -BCH_ERR_erofs_trans_commit; bch2_trans_unlock(trans); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 65da4a185bbb..0d33dee1aed4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -891,7 +891,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); if (s->err) { - if (s->err != -EROFS) + if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); goto err; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5f0f75726784..96ee72c125d6 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -20,7 +20,6 @@ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ - x(0, insufficient_devices) \ x(0, 
transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ @@ -80,6 +79,11 @@ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ + x(EROFS, erofs_sb_err) \ + x(EROFS, insufficient_devices) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 27265ba35fac..44f3719d4b71 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1417,7 +1417,7 @@ void bch2_write(struct closure *cl) if (c->opts.nochanges || !percpu_ref_tryget_live(&c->writes)) { - op->error = -EROFS; + op->error = -BCH_ERR_erofs_no_writes; goto err; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c7a7b9cd20f0..9daa99f3732a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -363,7 +363,7 @@ retry: return 0; if (bch2_journal_error(j)) - return -EROFS; + return -BCH_ERR_erofs_journal_err; spin_lock(&j->lock); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4d6fd3025e40..848a415b6797 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -251,7 +251,7 @@ static int bch2_move_extent(struct btree_trans *trans, } if (!percpu_ref_tryget_live(&c->writes)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 66f18f711d53..9c55a88a2b08 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -319,7 +319,7 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), false, copygc_pred, NULL); - if (ret < 0 && ret != -EROFS) + if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); if (ret) return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d054e83d86ef..80736be21b9f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -631,8 +631,8 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) : 0), bch2_journal_replay_key(&trans, k)); if (ret) { - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[k->btree_id], k->level); + bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", + bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); goto err; } } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8c426d6440c9..08c98ac03c13 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -283,7 +283,7 @@ s64 bch2_remap_range(struct bch_fs *c, int ret = 0, ret2 = 0; if (!percpu_ref_tryget_live(&c->writes)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8dfe92d7eb77..ff27ae1839a8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -868,7 +868,7 @@ int bch2_write_super(struct bch_fs *c) le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); - ret = -EROFS; + ret = -BCH_ERR_erofs_sb_err; goto out; } @@ -878,7 +878,7 @@ int bch2_write_super(struct bch_fs *c) le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); - ret = -EROFS; + ret = 
-BCH_ERR_erofs_sb_err; goto out; } } -- cgit From 87ced107f37fc017d34b8f56afeb7daa06c87310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Dec 2022 15:17:40 -0500 Subject: bcachefs: Convert EAGAIN errors to private error codes More error code cleanup, for better error messages and debugability. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/btree_cache.c | 4 +--- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 17 ++++++----------- fs/bcachefs/errcode.h | 5 +++++ fs/bcachefs/io.c | 2 +- fs/bcachefs/journal.c | 22 ++++++++-------------- fs/bcachefs/journal.h | 2 +- 8 files changed, 24 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index c4aee0022fab..3219c37d9262 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1219,7 +1219,7 @@ err: if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || bch2_err_matches(ret, BCH_ERR_freelist_empty)) return cl - ? -EAGAIN + ? -BCH_ERR_bucket_alloc_blocked : -BCH_ERR_ENOSPC_bucket_alloc; return ret; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 0ac8636edba2..7868536d7581 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -531,7 +531,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) } trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -EAGAIN; + return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: trace_and_count(c, btree_cache_cannibalize_lock, c); @@ -906,8 +906,6 @@ retry: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. - * * The btree node will have either a read or a write lock held, depending on * the @write parameter. */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cb1e7dbf5440..d2e785b9c835 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1163,7 +1163,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_trans_unlock(trans); closure_sync(&cl); - } while (ret == -EAGAIN); + } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } if (ret) { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f44abb3fe469..61b61acef7a8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -332,15 +332,10 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { - struct bch_fs *c = trans->c; - int ret; - - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, - flags| - (trans->flags & JOURNAL_WATERMARK_MASK)); - - return ret == -EAGAIN ? 
-BCH_ERR_btree_insert_need_journal_res : ret; + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, + flags| + (trans->flags & JOURNAL_WATERMARK_MASK)); } #define JSET_ENTRY_LOG_U64s 4 @@ -864,7 +859,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| (trans->flags & JOURNAL_WATERMARK_MASK)); - if (unlikely(ret == -EAGAIN)) + if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); if (unlikely(ret)) @@ -936,7 +931,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (ret) trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); break; - case -BCH_ERR_btree_insert_need_journal_res: + case -BCH_ERR_journal_res_get_blocked: bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 96ee72c125d6..bb296edcf4f7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -84,6 +84,11 @@ x(EROFS, erofs_journal_err) \ x(EROFS, erofs_sb_err) \ x(EROFS, insufficient_devices) \ + x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ + x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 44f3719d4b71..c39f00f9ebd8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1264,7 +1264,7 @@ again: BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : &op->cl, &wp)); if (unlikely(ret)) { - if (ret == -EAGAIN) + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; goto err; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9daa99f3732a..66bd0a72c774 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -198,12 +198,6 @@ static bool journal_entry_close(struct journal *j) /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: - * - * returns: - * 0: success - * -ENOSPC: journal currently full, must invoke reclaim - * -EAGAIN: journal blocked, must wait - * -EROFS: insufficient rw devices or journal error */ static int journal_entry_open(struct journal *j) { @@ -455,7 +449,9 @@ unlock: } } - return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices + ? -EROFS + : -BCH_ERR_journal_res_get_blocked; } /* @@ -474,7 +470,8 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -EAGAIN || + (ret = __journal_res_get(j, res, flags)) != + -BCH_ERR_journal_res_get_blocked|| (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -792,12 +789,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } else { ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl); - if (IS_ERR(ob[nr_got])) { - ret = cl - ? 
-EAGAIN - : -BCH_ERR_ENOSPC_bucket_alloc; + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) break; - } bu[nr_got] = ob[nr_got]->bucket; } @@ -907,7 +901,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, closure_init_stack(&cl); - while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { + while (ja->nr != nr && (ret == 0 || ret == -BCH_ERR_bucket_alloc_blocked)) { struct disk_reservation disk_res = { 0, 0 }; closure_sync(&cl); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1c7508333432..024cea9f5902 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -460,7 +460,7 @@ static inline int bch2_journal_preres_get(struct journal *j, return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) - return -EAGAIN; + return -BCH_ERR_journal_preres_get_blocked; return __bch2_journal_preres_get(j, res, new_u64s, flags); } -- cgit From 834dc29d521d34718602bfb8d93d370093a5d430 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jan 2023 19:42:04 -0500 Subject: bcachefs: debug: Fix some locking bugs This fixes a few error paths in debug code that lead to locks failing to be dropped. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ab210296223b..fcefd55a5322 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -520,7 +520,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, ret = flush_buf(i); if (ret) - return ret; + break; bch2_btree_trans_to_text(&i->buf, trans); @@ -711,7 +711,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, ret = flush_buf(i); if (ret) - return ret; + break; bch2_check_for_deadlock(trans, &i->buf); -- cgit From ad5d3d820a97500d8af0d5f337f8f523c6099ac6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jan 2023 10:07:52 -0500 Subject: bcachefs: Kill fs_usage_apply_warn() We now have bch2_trans_inconsistent() which generically does the same thing - dumps pending btree transaction updates. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 137a9aef6987..4fd396cb1dad 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1268,36 +1268,6 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -static noinline __cold -void fs_usage_apply_warn(struct btree_trans *trans, - unsigned disk_res_sectors, - s64 should_not_have_added) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, - bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), - should_not_have_added, disk_res_sectors); - - trans_for_each_update(trans, i) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - - prt_str(&buf, "new "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - prt_newline(&buf); - - prt_str(&buf, "old "); - bch2_bkey_val_to_text(&buf, c, old); - prt_newline(&buf); - } - - __WARN(); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - int bch2_trans_fs_usage_apply(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1362,7 +1332,9 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + bch2_trans_inconsistent(trans, + "disk usage increased %lli more than %u sectors reserved)", + should_not_have_added, disk_res_sectors); return 0; need_mark: /* revert changes: */ -- cgit From 0329631c9165d2dddd5a89da5f72f3175011b49f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jan 2023 10:08:50 -0500 Subject: bcachefs: Dump transaction updates before panicing When errors=panic, we need to dump transaction updates before calling bch2_inconsistent_error(). Signed-off-by: Kent Overstreet --- fs/bcachefs/error.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index dae72620dae3..9991879dfbff 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -73,8 +73,8 @@ do { \ #define bch2_trans_inconsistent(trans, ...) \ ({ \ bch_err(trans->c, __VA_ARGS__); \ - bch2_inconsistent_error(trans->c); \ bch2_dump_trans_updates(trans); \ + bch2_inconsistent_error(trans->c); \ }) #define bch2_trans_inconsistent_on(cond, trans, ...) 
\ -- cgit From 7c909f654bae57083a0965f105e52ac8737a0785 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jan 2023 17:02:56 -0500 Subject: bcachefs: Fix repair path in bch2_mark_reflink_p() Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 4fd396cb1dad..6c9dcfd54be6 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1213,17 +1213,23 @@ not_found: " missing range %llu-%llu", (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), *idx, next_idx)) { - struct bkey_i_error new; - - bkey_init(&new.k); - new.k.type = KEY_TYPE_error; - new.k.p = bkey_start_pos(p.k); - new.k.p.offset += *idx - start; - bch2_key_resize(&new.k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + struct bkey_i_error *new; + + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i); } *idx = next_idx; +err: fsck_err: printbuf_exit(&buf); return ret; -- cgit From d7dd3fb84f05a0d221be3979929706a4828fb252 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jan 2023 22:55:42 -0500 Subject: bcachefs: Fix rereplicate when we already have a cached pointer When we need to add more replicas to an extent, it might be the case that we already have a replica on every device, but some of them are cached. This patch fixes a bug where we'd spin on that extent because the write path fails to find a device we can allocate from: we allow allocating from devices that already have cached replicas on them, and change bch2_data_update_index_update() to drop the cached replica if needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b4480852e935..acb634b3480b 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -183,7 +183,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) /* Add new ptrs: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { + const struct bch_extent_ptr *existing_ptr = + bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev); + + if (existing_ptr && existing_ptr->cached) { + /* + * We're replacing a cached pointer with a non + * cached pointer: + */ + bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), + existing_ptr->dev); + } else if (existing_ptr) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -334,7 +344,8 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, p.ptr.cached) BUG(); - if (!((1U << i) & m->data_opts.rewrite_ptrs)) + if (!((1U << i) & m->data_opts.rewrite_ptrs) && + !p.ptr.cached) bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); if (((1U << i) & m->data_opts.rewrite_ptrs) && -- cgit From 9fea089a9502784f42868b2649a732724f4c1d0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Jan 2023 00:00:55 -0500 Subject: bcachefs: Check for lru entries with time=0 These are invalid. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/lru.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index e16686b3b45a..9b4ce27d12f3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -19,6 +19,12 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (!k.k->p.offset) { + prt_printf(err, "lru entry at time=0"); + return -BCH_ERR_invalid_bkey; + + } + return 0; } -- cgit From db36c1477d1753f4d5bebaed074ca4e4477df3ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jan 2023 20:28:59 -0500 Subject: bcachefs: Fix bch2_bucket_alloc_early() We were incorrectly retrying after a transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3219c37d9262..471ae15caa75 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -421,12 +421,15 @@ again: ca->alloc_cursor = alloc_cursor; + if (!ob && ret) + ob = ERR_PTR(ret); + if (!ob && alloc_cursor > alloc_start) { alloc_cursor = alloc_start; goto again; } - return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); + return ob; } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, -- cgit From adf6360b5d6071ea268fa6f5f03befba4909ffaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Feb 2023 22:51:51 -0500 Subject: bcachefs: Improve btree_reserve_get_fail tracepoint Now we include the return code. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 ++- fs/bcachefs/trace.h | 12 ++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d2e785b9c835..566838317845 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1167,7 +1167,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } if (ret) { - trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); + trace_and_count(c, btree_reserve_get_fail, trans->fn, + _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); goto err; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7004da8d341f..17fc58e73702 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -344,25 +344,29 @@ DEFINE_EVENT(btree_node, btree_node_free, TRACE_EVENT(btree_reserve_get_fail, TP_PROTO(const char *trans_fn, unsigned long caller_ip, - size_t required), - TP_ARGS(trans_fn, caller_ip, required), + size_t required, + int ret), + TP_ARGS(trans_fn, caller_ip, required, ret), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(size_t, required ) + __array(char, ret, 32 ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->required = required; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); ), - TP_printk("%s %pS required %zu", + TP_printk("%s %pS required %zu ret %s", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->required) + __entry->required, + __entry->ret) ); DEFINE_EVENT(btree_node, btree_node_compact, -- cgit From 19a614d2e4beed7faf52ab95cb48ce38a3c38c04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Jan 2023 20:58:43 -0500 Subject: bcachefs: Better inlining for 
bch2_alloc_to_v4_mut This separates out the slowpath into a separate function, and inlines bch2_alloc_v4_mut into bch2_trans_start_alloc_update(), the main place it's called. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 310 ++++++++++++++++++++++++----------------- fs/bcachefs/alloc_background.h | 36 ++++- fs/bcachefs/alloc_foreground.c | 24 ++-- fs/bcachefs/bcachefs_format.h | 3 + fs/bcachefs/btree_gc.c | 43 +++--- fs/bcachefs/buckets.c | 64 +++++---- fs/bcachefs/lru.c | 9 +- fs/bcachefs/movinggc.c | 17 +-- 8 files changed, 305 insertions(+), 201 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index cbde19bafbba..d75738134f94 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -37,8 +37,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { struct bkey_alloc_unpacked { u64 journal_seq; - u64 bucket; - u8 dev; u8 gen; u8 oldest_gen; u8 data_type; @@ -194,11 +192,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { - .dev = k.k->p.inode, - .bucket = k.k->p.offset, - .gen = 0, - }; + struct bkey_alloc_unpacked ret = { .gen = 0 }; switch (k.k->type) { case KEY_TYPE_alloc: @@ -215,73 +209,6 @@ static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -{ - if (k.k->type == KEY_TYPE_alloc_v4) { - *out = *bkey_s_c_to_alloc_v4(k).v; - } else { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, - .data_type = u.data_type, - .stripe_redundancy = u.stripe_redundancy, - .dirty_sectors = u.dirty_sectors, - .cached_sectors = u.cached_sectors, - .io_time[READ] = u.read_time, - .io_time[WRITE] = u.write_time, - .stripe = u.stripe, - }; - } -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i_alloc_v4 *ret; - - if (k.k->type == KEY_TYPE_alloc_v4) { - ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if (!IS_ERR(ret)) - bkey_reassemble(&ret->k_i, k); - } else { - ret = bch2_trans_kmalloc(trans, sizeof(*ret)); - if (!IS_ERR(ret)) { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } - } - return ret; -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); - } - - a = bch2_alloc_to_v4_mut(trans, k); - if (IS_ERR(a)) - bch2_trans_iter_exit(trans, iter); - return a; -} - static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -417,21 +344,154 @@ void bch2_alloc_v4_swab(struct bkey_s k) void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bch_alloc_v4 a; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "gen %u oldest_gen %u 
data_type %s", + a->gen, a->oldest_gen, bch2_data_types[a->data_type]); + prt_newline(out); + prt_printf(out, "journal_seq %llu", a->journal_seq); + prt_newline(out); + prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_newline(out); + prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_newline(out); + prt_printf(out, "dirty_sectors %u", a->dirty_sectors); + prt_newline(out); + prt_printf(out, "cached_sectors %u", a->cached_sectors); + prt_newline(out); + prt_printf(out, "stripe %u", a->stripe); + prt_newline(out); + prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); + prt_newline(out); + prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); + prt_newline(out); + prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); + prt_newline(out); + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); + + printbuf_indent_sub(out, 2); +} + +static inline void *alloc_v4_backpointers(struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + +void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + + *out = *bkey_s_c_to_alloc_v4(k).v; + + src = alloc_v4_backpointers(out); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(out); + + if (src < dst) + memset(src, 0, dst - src); + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - bch2_alloc_to_v4(k, &a); - - prt_printf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu need_inc_gen %llu", - a.gen, a.oldest_gen, bch2_data_types[a.data_type], - a.journal_seq, - BCH_ALLOC_V4_NEED_DISCARD(&a), - BCH_ALLOC_V4_NEED_INC_GEN(&a)); - prt_printf(out, " dirty_sectors %u", a.dirty_sectors); - prt_printf(out, " cached_sectors %u", a.cached_sectors); - prt_printf(out, " stripe %u", a.stripe); - prt_printf(out, " stripe_redundancy %u", a.stripe_redundancy); - prt_printf(out, " read_time %llu", a.io_time[READ]); - prt_printf(out, " write_time %llu", a.io_time[WRITE]); + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } +} + +static noinline struct bkey_i_alloc_v4 * +__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i_alloc_v4 *ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { + unsigned bytes = min(sizeof(struct bkey_i_alloc_v4), bkey_bytes(k.k)); + void *src, *dst; + + ret = bch2_trans_kmalloc(trans, bytes); + if (IS_ERR(ret)) + return ret; + + bkey_reassemble(&ret->k_i, k); + + src = alloc_v4_backpointers(&ret->v); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(&ret->v); + + if (src < dst) + memset(src, 0, dst - src); + set_alloc_v4_u64s(ret); + } else { + ret = bch2_trans_kmalloc(trans, sizeof(*ret)); + if (!IS_ERR(ret)) { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + } + return ret; +} + +static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) +{ + if (likely(k.k->type == KEY_TYPE_alloc_v4) && + 
BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { + struct bkey_i_alloc_v4 *ret = + bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); + if (!IS_ERR(ret)) + bkey_reassemble(&ret->k_i, k); + return ret; + } + + return __bch2_alloc_to_v4_mut(trans, k); +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + return bch2_alloc_to_v4_mut_inlined(trans, k); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (unlikely(ret)) + goto err; + + a = bch2_alloc_to_v4_mut_inlined(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (unlikely(ret)) + goto err; + return a; +err: + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); } int bch2_alloc_read(struct bch_fs *c) @@ -455,9 +515,8 @@ int bch2_alloc_read(struct bch_fs *c) continue; ca = bch_dev_bkey_exists(c, k.k->p.inode); - bch2_alloc_to_v4(k, &a); - *bucket_gen(ca, k.k->p.offset) = a.gen; + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; } bch2_trans_iter_exit(&trans, &iter); @@ -546,7 +605,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a, *new_a; + struct bch_alloc_v4 old_a_convert, *new_a; + const struct bch_alloc_v4 *old_a; u64 old_lru, new_lru; int ret = 0; @@ -556,13 +616,13 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, */ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); - bch2_alloc_to_v4(old, &old_a); + old_a = bch2_alloc_to_v4(old, &old_a_convert); new_a = &bkey_i_to_alloc_v4(new)->v; new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (new_a->dirty_sectors > old_a.dirty_sectors || - new_a->cached_sectors > old_a.cached_sectors) { + if (new_a->dirty_sectors > old_a->dirty_sectors || + new_a->cached_sectors > old_a->cached_sectors) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -576,10 +636,10 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); } - if (old_a.data_type != new_a->data_type || + if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, &old_a, false) ?: + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); if (ret) return ret; @@ -589,7 +649,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(old_a); + old_lru = alloc_lru_idx(*old_a); new_lru = alloc_lru_idx(*new_a); if (old_lru != new_lru) { @@ -612,7 +672,8 @@ static int bch2_check_alloc_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; struct bkey_s_c alloc_k, k; struct printbuf 
buf = PRINTBUF; @@ -637,15 +698,15 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (!ca->mi.freespace_initialized) return 0; - bch2_alloc_to_v4(alloc_k, &a); + a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a.data_type == BCH_DATA_need_discard + discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; - freespace_key_type = a.data_type == BCH_DATA_free + freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); @@ -716,7 +777,8 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter alloc_iter; struct bkey_s_c alloc_k; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; u64 genbits; struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard @@ -741,16 +803,16 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, if (ret) goto err; - bch2_alloc_to_v4(alloc_k, &a); + a = bch2_alloc_to_v4(alloc_k, &a_convert); - if (fsck_err_on(a.data_type != state || + if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(a)), c, + genbits != alloc_freespace_genbits(*a)), c, "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_ids[iter->btree_id], - a.data_type == state, - genbits >> 56, alloc_freespace_genbits(a) >> 56)) + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: err: @@ -818,7 +880,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter lru_iter; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; struct bkey_s_c alloc_k, k; struct printbuf buf = PRINTBUF; struct printbuf buf2 = PRINTBUF; @@ -832,20 +895,20 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - bch2_alloc_to_v4(alloc_k, &a); + a = bch2_alloc_to_v4(alloc_k, &a_convert); - if (a.data_type != BCH_DATA_cached) + if (a->data_type != BCH_DATA_cached) return 0; bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, - POS(alloc_k.k->p.inode, a.io_time[READ]), 0); + POS(alloc_k.k->p.inode, a->io_time[READ]), 0); k = bch2_btree_iter_peek_slot(&lru_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(!a.io_time[READ], c, + if (fsck_err_on(!a->io_time[READ], c, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), @@ -858,26 +921,24 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - u64 read_time = a.io_time[READ]; - - if (!a.io_time[READ]) - a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + u64 read_time = a->io_time[READ] ?: + atomic64_read(&c->io_clock[READ].now); ret = bch2_lru_set(trans, alloc_k.k->p.inode, alloc_k.k->p.offset, - &a.io_time[READ]); + &read_time); if (ret) goto err; - if (a.io_time[READ] != read_time) { + if (a->io_time[READ] != read_time) { struct bkey_i_alloc_v4 *a_mut = bch2_alloc_to_v4_mut(trans, alloc_k); ret = PTR_ERR_OR_ZERO(a_mut); if (ret) goto 
err; - a_mut->v.io_time[READ] = a.io_time[READ]; + a_mut->v.io_time[READ] = read_time; ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_NORUN); if (ret) @@ -1182,13 +1243,14 @@ void bch2_do_invalidates(struct bch_fs *c) static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_dev *ca) { - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; if (iter->pos.offset >= ca->mi.nbuckets) return 1; - bch2_alloc_to_v4(k, &a); - return bch2_bucket_do_index(trans, k, &a, true); + a = bch2_alloc_to_v4(k, &a_convert); + return bch2_bucket_do_index(trans, k, a, true); } static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 318beb588aa9..c562aff3ac33 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -70,16 +70,46 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ return pos; } +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0); + + BUG_ON(ret > U8_MAX - BKEY_U64s); + return ret; +} + +static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) +{ + set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); +} + struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); -void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); +void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); + +static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) +{ + const struct bch_alloc_v4 *ret; + + if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) + goto slowpath; + + ret = bkey_s_c_to_alloc_v4(k).v; + if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) + goto slowpath; + + return ret; +slowpath: + __bch2_alloc_to_v4(k, convert); + return convert; +} + struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) - int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 471ae15caa75..02a61df6705f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -204,7 +204,7 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, enum alloc_reserve reserve, - struct bch_alloc_v4 *a, + const struct bch_alloc_v4 *a, struct bucket_alloc_state *s, struct closure *cl) { @@ -289,7 +289,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct btree_iter iter = { NULL }; struct bkey_s_c k; struct open_bucket *ob; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; u64 b = free_entry & ~(~0ULL << 56); unsigned genbits = free_entry >> 56; struct printbuf buf = PRINTBUF; @@ -313,12 +314,12 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto 
err; } - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (genbits != (alloc_freespace_genbits(a) >> 56)) { + if (genbits != (alloc_freespace_genbits(*a) >> 56)) { prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" " freespace key ", - genbits, alloc_freespace_genbits(a) >> 56); + genbits, alloc_freespace_genbits(*a) >> 56); bch2_bkey_val_to_text(&buf, c, freespace_k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); @@ -328,7 +329,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } - if (a.data_type != BCH_DATA_free) { + if (a->data_type != BCH_DATA_free) { prt_printf(&buf, "non free bucket in freespace btree\n" " freespace key "); bch2_bkey_val_to_text(&buf, c, freespace_k); @@ -339,7 +340,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto err; } - ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl); + ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl); if (!ob) iter.path->preserve = false; err: @@ -397,7 +398,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; @@ -406,14 +408,14 @@ again: is_superblock_bucket(ca, k.k->p.offset)) continue; - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (a.data_type != BCH_DATA_free) + if (a->data_type != BCH_DATA_free) continue; s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, a, s, cl); if (ob) break; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6ee9321e7d21..d96efc8338d5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -934,6 +934,9 @@ struct bch_alloc_v4 { struct bpos backpointers[0]; } __packed __aligned(8); +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) + LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e43ccf896e8e..959f4081b42f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1351,15 +1351,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); struct bucket gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old, new; + struct bch_alloc_v4 old_convert, new; + const struct bch_alloc_v4 *old; enum bch_data_type type; int ret; if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) return 1; - bch2_alloc_to_v4(k, &old); - new = old; + old = bch2_alloc_to_v4(k, &old_convert); + new = *old; percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); @@ -1371,7 +1372,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, type = __alloc_data_type(b->dirty_sectors, b->cached_sectors, b->stripe, - old, + *old, b->data_type); if (b->data_type != type) { struct bch_dev_usage *u; @@ -1393,7 +1394,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type != BCH_DATA_btree) return 0; - if (gen_after(old.gen, gc.gen)) + if (gen_after(old->gen, gc.gen)) return 0; #define 
copy_bucket_field(_f) \ @@ -1415,7 +1416,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, copy_bucket_field(stripe); #undef copy_bucket_field - if (!bch2_alloc_v4_cmp(old, new)) + if (!bch2_alloc_v4_cmp(*old, new)) return 0; a = bch2_alloc_to_v4_mut(trans, k); @@ -1473,7 +1474,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) struct btree_iter iter; struct bkey_s_c k; struct bucket *g; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; unsigned i; int ret; @@ -1499,20 +1501,20 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); g->gen_valid = 1; - g->gen = a.gen; + g->gen = a->gen; if (metadata_only && - (a.data_type == BCH_DATA_user || - a.data_type == BCH_DATA_cached || - a.data_type == BCH_DATA_parity)) { - g->data_type = a.data_type; - g->dirty_sectors = a.dirty_sectors; - g->cached_sectors = a.cached_sectors; - g->stripe = a.stripe; - g->stripe_redundancy = a.stripe_redundancy; + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; } } bch2_trans_iter_exit(&trans, &iter); @@ -1913,13 +1915,12 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i struct bkey_s_c k) { struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; int ret; - bch2_alloc_to_v4(k, &a); - - if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) + if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) return 0; a_mut = bch2_alloc_to_v4_mut(trans, k); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6c9dcfd54be6..153987376b89 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -490,8 +490,10 @@ int bch2_mark_alloc(struct btree_trans *trans, { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq; struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a, new_a; + struct bch_alloc_v4 old_a_convert, new_a_convert; + const struct bch_alloc_v4 *old_a, *new_a; struct bch_dev *ca; int ret = 0; @@ -508,36 +510,38 @@ int bch2_mark_alloc(struct btree_trans *trans, ca = bch_dev_bkey_exists(c, new.k->p.inode); - bch2_alloc_to_v4(old, &old_a); - bch2_alloc_to_v4(new, &new_a); + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = bch2_alloc_to_v4(new, &new_a_convert); + + bucket_journal_seq = new_a->journal_seq; if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a.data_type) != - data_type_is_empty(new_a.data_type) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; - BUG_ON(!journal_seq); + EBUG_ON(!journal_seq); /* * If the btree updates referring to a bucket weren't flushed * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ - new_a.journal_seq = data_type_is_empty(new_a.data_type) && + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && 
(journal_seq == v->journal_seq || bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 0 : journal_seq; - v->journal_seq = new_a.journal_seq; } - if (!data_type_is_empty(old_a.data_type) && - data_type_is_empty(new_a.data_type) && - new_a.journal_seq) { + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, - new_a.journal_seq); + bucket_journal_seq); if (ret) { bch2_fs_fatal_error(c, "error setting bucket_needs_journal_commit: %i", ret); @@ -546,10 +550,10 @@ int bch2_mark_alloc(struct btree_trans *trans, } percpu_down_read(&c->mark_lock); - if (!gc && new_a.gen != old_a.gen) - *bucket_gen(ca, new.k->p.offset) = new_a.gen; + if (!gc && new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); + bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); if (gc) { struct bucket *g = gc_bucket(ca, new.k->p.offset); @@ -557,12 +561,12 @@ int bch2_mark_alloc(struct btree_trans *trans, bucket_lock(g); g->gen_valid = 1; - g->gen = new_a.gen; - g->data_type = new_a.data_type; - g->stripe = new_a.stripe; - g->stripe_redundancy = new_a.stripe_redundancy; - g->dirty_sectors = new_a.dirty_sectors; - g->cached_sectors = new_a.cached_sectors; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; bucket_unlock(g); } @@ -574,9 +578,9 @@ int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_a.cached_sectors) { + old_a->cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -((s64) old_a.cached_sectors), + -((s64) old_a->cached_sectors), journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", @@ -585,20 +589,20 @@ int bch2_mark_alloc(struct btree_trans *trans, } } - if (new_a.data_type == BCH_DATA_free && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) closure_wake_up(&c->freelist_wait); - if (new_a.data_type == BCH_DATA_need_discard && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) bch2_do_discards(c); - if (old_a.data_type != BCH_DATA_cached && - new_a.data_type == BCH_DATA_cached && + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a.data_type == BCH_DATA_need_gc_gens) + if (new_a->data_type == BCH_DATA_need_gc_gens) bch2_do_gc_gens(c); return 0; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 9b4ce27d12f3..12821868df71 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -140,7 +140,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; struct bpos alloc_pos; @@ 
-160,10 +161,10 @@ static int bch2_check_lru_key(struct btree_trans *trans, if (ret) goto err; - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (fsck_err_on(a.data_type != BCH_DATA_cached || - a.io_time[READ] != lru_k.k->p.offset, c, + if (fsck_err_on(a->data_type != BCH_DATA_cached || + a->io_time[READ] != lru_k.k->p.offset, c, "incorrect lru entry %s\n" " for %s", (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 9c55a88a2b08..a04e2330d0e6 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -117,7 +117,6 @@ static int walk_buckets_to_copygc(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_alloc_v4 a; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -126,21 +125,23 @@ static int walk_buckets_to_copygc(struct bch_fs *c) BTREE_ITER_PREFETCH, k, ret) { struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); struct copygc_heap_entry e; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (a.data_type != BCH_DATA_user || - a.dirty_sectors >= ca->mi.bucket_size || + if (a->data_type != BCH_DATA_user || + a->dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; e = (struct copygc_heap_entry) { .dev = iter.pos.inode, - .gen = a.gen, - .replicas = 1 + a.stripe_redundancy, - .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), + .gen = a->gen, + .replicas = 1 + a->stripe_redundancy, + .fragmentation = div_u64((u64) a->dirty_sectors * (1ULL << 31), ca->mi.bucket_size), - .sectors = a.dirty_sectors, + .sectors = a->dirty_sectors, .offset = bucket_to_sector(ca, iter.pos.offset), }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); -- cgit From 393a1f6863790fddf8b53bfb81f2c984cdbc1990 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Nov 2022 18:03:55 -0500 Subject: bcachefs: Better inlining in core write path Provide inline versions of some allocation functions - bch2_alloc_sectors_done_inlined() - bch2_alloc_sectors_append_ptrs_inlined() and use them in the core IO path. Also, inline bch2_extent_update_i_size_sectors() and bch2_bkey_append_ptr(). In the core write path, function call overhead matters - every function call is a jump to a new location and a potential cache miss. 
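The pattern is worth spelling out: the body lives in the header as a static inline so the hot write path gets it expanded at the call site, while the .c file keeps a plain out-of-line wrapper for callers that don't care. A minimal standalone sketch of that arrangement follows, with made-up names rather than the real bcachefs symbols:

#include <stdio.h>

/* header: hot-path callers get the body expanded at the call site */
static inline int take_sectors_inlined(int *sectors_free, int sectors)
{
	*sectors_free -= sectors;	/* no call/return overhead here */
	return *sectors_free;
}

/* .c file: ordinary out-of-line symbol kept for everyone else */
static int take_sectors(int *sectors_free, int sectors)
{
	return take_sectors_inlined(sectors_free, sectors);
}

int main(void)
{
	int sectors_free = 128;

	take_sectors_inlined(&sectors_free, 8);	/* core write path style call */
	take_sectors(&sectors_free, 8);		/* cold-path style call */
	printf("%d sectors left\n", sectors_free);
	return 0;
}
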
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 37 ++----------------------------- fs/bcachefs/alloc_foreground.h | 49 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/extents.c | 23 -------------------- fs/bcachefs/extents.h | 34 +++++++++++++++++++++++------ fs/bcachefs/io.c | 4 ++-- 5 files changed, 81 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 02a61df6705f..534dbf197d58 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1244,34 +1244,11 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) }; } -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) { - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - wp->sectors_allocated += sectors; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - - ptr.cached = cached || - (!ca->mi.durability && - wp->data_type == BCH_DATA_user); - - bch2_bkey_append_ptr(k, ptr); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } + bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); } /* @@ -1280,17 +1257,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); - - bch2_open_buckets_put(c, &ptrs); + bch2_alloc_sectors_done_inlined(c, wp); } static inline void writepoint_init(struct write_point *wp, diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index b784a59d67e7..26e986f2385b 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -4,6 +4,8 @@ #include "bcachefs.h" #include "alloc_types.h" +#include "extents.h" +#include "super.h" #include @@ -81,6 +83,21 @@ static inline void bch2_open_buckets_put(struct bch_fs *c, ptrs->nr = 0; } +static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); + wp->ptrs = keep; + + mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); +} + static inline void bch2_open_bucket_get(struct bch_fs *c, struct write_point *wp, struct open_buckets *ptrs) @@ -149,6 +166,38 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct write_point **); struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +static inline void +bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); + + bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 422adca7230b..3d124dc5bbef 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -698,29 +698,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry k->k.u64s -= extent_entry_u64s(entry); } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) -{ - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); - } -} - static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union bch_extent_entry *new) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 21dbdf96bd59..f640254004e7 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -581,8 +581,35 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); + +static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } +} + void bch2_extent_ptr_decoded_append(struct bkey_i *, 
struct extent_ptr_decoded *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, @@ -605,11 +632,6 @@ do { \ } \ } while (0) -void bch2_bkey_drop_device(struct bkey_s, unsigned); -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c39f00f9ebd8..0c264266f466 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -832,7 +832,7 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size, + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); @@ -1275,7 +1275,7 @@ again: bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); - bch2_alloc_sectors_done(c, wp); + bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { if (!(op->flags & BCH_WRITE_SYNC)) { -- cgit From f52dd1ae20ac8094eb881f816e7274d3f7910a84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Dec 2022 15:55:38 -0500 Subject: bcachefs: Fix bch_alloc_to_text() We weren't guarding against the alloc key having an invalid data type. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d75738134f94..388a44858097 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -351,7 +351,10 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c printbuf_indent_add(out, 2); prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, bch2_data_types[a->data_type]); + a->gen, a->oldest_gen, + a->data_type < BCH_DATA_NR + ? bch2_data_types[a->data_type] + : "(invalid data type)"); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); -- cgit From 01ad673727b59664c9d12df4e5b6f5bad1ea2825 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 20:14:55 -0500 Subject: bcachefs: bch2_inode_opts_get() This improves io_opts() and makes it a non-inline function - it's big enough that it probably shouldn't be. Also, bch_io_opts no longer needs fields for whether options are defined, so we can slim it down a bit. We'd like to stop passing around the full bch_io_opts, but that'll be tricky because of bch2_rebalance_add_key(). 
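The slimming referred to is dropping the per-option "_defined" bitfields that the x-macro used to generate alongside the values. A toy sketch of the before/after layout, using invented option names (the real struct is generated from BCH_INODE_OPTS() with sized integer fields, so take this only as an illustration of the shape):

#include <stdio.h>

#define MY_OPTS()		\
	x(compression, 8)	\
	x(data_replicas, 8)

/* old layout: an "is this option set?" bit per option, plus the value */
struct opts_old {
#define x(_name, _bits) unsigned _name##_defined:1;
	MY_OPTS()
#undef x
#define x(_name, _bits) unsigned _name:_bits;
	MY_OPTS()
#undef x
};

/* new layout: values only - defaults are resolved when the opts are filled in */
struct opts_new {
#define x(_name, _bits) unsigned _name:_bits;
	MY_OPTS()
#undef x
};

int main(void)
{
	printf("old %zu bytes, new %zu bytes\n",
	       sizeof(struct opts_old), sizeof(struct opts_new));
	return 0;
}
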
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 34 +++++++++++++++++++++++----------- fs/bcachefs/inode.c | 8 ++++++++ fs/bcachefs/inode.h | 24 ++++-------------------- fs/bcachefs/move.c | 2 +- fs/bcachefs/opts.c | 17 +++-------------- fs/bcachefs/opts.h | 5 ----- 6 files changed, 39 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0bb8b39140ec..25094e0406da 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1102,12 +1102,14 @@ void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct bch_io_opts opts; struct btree_trans trans; struct page *page; struct readpages_iter readpages_iter; int ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); @@ -1170,11 +1172,14 @@ static int bch2_read_single_page(struct page *page, struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; + struct bch_io_opts opts; int ret; DECLARE_COMPLETION_ONSTACK(done); + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), - io_opts(c, &inode->ei_inode)); + opts); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1211,9 +1216,10 @@ struct bch_writepage_state { static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { - return (struct bch_writepage_state) { - .opts = io_opts(c, &inode->ei_inode) - }; + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; } static void bch2_writepage_io_done(struct bch_write_op *op) @@ -1879,7 +1885,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; loff_t offset = req->ki_pos; @@ -1887,6 +1893,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) size_t shorten; ssize_t ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + if ((offset|iter->count) & (block_bytes(c) - 1)) return -EINVAL; @@ -2224,11 +2232,14 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) struct kiocb *req = dio->req; struct address_space *mapping = dio->mapping; struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + while (1) { iter_count = dio->iter.count; @@ -2276,7 +2287,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) goto err; } - bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); + bch2_write_op_init(&dio->op, c, opts); dio->op.end_io = sync ? 
NULL : bch2_dio_write_loop_async; @@ -3055,9 +3066,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct btree_trans trans; struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); - unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; + struct bch_io_opts opts; int ret = 0; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, @@ -3088,7 +3100,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (k.k->type == KEY_TYPE_reservation && - bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { + bkey_s_c_to_reservation(k).v->nr_replicas >= opts.data_replicas) { bch2_btree_iter_advance(&iter); continue; } @@ -3118,10 +3130,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; } - if (reservation.v.nr_replicas < replicas || + if (reservation.v.nr_replicas < opts.data_replicas || bch2_bkey_sectors_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, - replicas, 0); + opts.data_replicas, 0); if (unlikely(ret)) goto bkey_err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 6e7ba2e6fe33..9eeabe70aec1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -768,3 +768,11 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) #undef x return ret; } + +void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, + struct bch_inode_unpacked *inode) +{ +#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); + BCH_INODE_OPTS() +#undef x +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 27744f78ae96..da78ed023a30 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -98,17 +98,8 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); -static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -{ - struct bch_io_opts ret = { 0 }; - -#define x(_name, _bits) \ - if (inode->bi_##_name) \ - opt_set(ret, _name, inode->bi_##_name - 1); - BCH_INODE_OPTS() -#undef x - return ret; -} +#define inode_opt_get(_c, _inode, _name) \ + ((_inode)->bi_##_name ? 
(_inode)->bi_##_name - 1 : (_c)->opts._name) static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, enum inode_opt_id id, u64 v) @@ -139,15 +130,6 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } -static inline struct bch_io_opts -io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) -{ - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - - bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); - return opts; -} - static inline u8 mode_to_type(umode_t mode) { return (mode >> 12) & 15; @@ -188,5 +170,7 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); #endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 848a415b6797..65c3af1b2e11 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -465,7 +465,7 @@ static int __bch2_move_data(struct moving_context *ctxt, continue; if (!ret) - bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); + bch2_inode_opts_get(&io_opts, c, &inode); cur_inum = k.k->p.inode; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 9c49d543b062..04e2989cd6b3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -531,22 +531,11 @@ void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - struct bch_io_opts ret = { 0 }; -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(ret, _name, src._name); - BCH_INODE_OPTS() -#undef x - return ret; -} - -void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) -{ -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(*dst, _name, src._name); + return (struct bch_io_opts) { +#define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x + }; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index a32a7ab73bd5..c6025172f32e 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -503,17 +503,12 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); /* inode opts: */ struct bch_io_opts { -#define x(_name, _bits) unsigned _name##_defined:1; - BCH_INODE_OPTS() -#undef x - #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x }; struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); #endif /* _BCACHEFS_OPTS_H */ -- cgit From 3e57db65cdd93bee55d1ecf53ff4ab895e05c40e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jan 2023 10:18:32 -0500 Subject: bcachefs: Use trylock in bch2_prt_backtrace() Easy workaround for a lockdep splat - and since bch2_prt_backtrace() is only used in debug code this is fine. 
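The workaround is the usual trylock-and-bail shape: if the lock is contended, return an empty result instead of sleeping under it, which is acceptable for debug-only output. A userspace sketch of the same shape, with a pthread rwlock standing in for the kernel rwsem used by the patch:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t exec_update_lock = PTHREAD_RWLOCK_INITIALIZER;

static int print_backtrace(void)
{
	/* tryrdlock returns nonzero when contended: emit nothing, don't block */
	if (pthread_rwlock_tryrdlock(&exec_update_lock))
		return 0;

	printf("...backtrace would be captured here...\n");
	pthread_rwlock_unlock(&exec_update_lock);
	return 0;
}

int main(void)
{
	return print_backtrace();	/* build with: cc -pthread */
}
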
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 31934f7a6436..e6672b67ae32 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -270,11 +270,9 @@ int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) { unsigned long entries[32]; unsigned i, nr_entries; - int ret; - ret = down_read_killable(&task->signal->exec_update_lock); - if (ret) - return ret; + if (!down_read_trylock(&task->signal->exec_update_lock)) + return 0; nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); for (i = 0; i < nr_entries; i++) { -- cgit From b8c5b16f970b32a5b8e8d75a9e4b96041db73d43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jan 2023 12:16:23 -0500 Subject: bcachefs: Don't emit tracepoints for expected events Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d432d26cc68b..d4cfb48d2013 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -478,7 +478,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } @@ -568,7 +568,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } -- cgit From 419fc65f8cfbadb29b2024457bf914787af8db91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Feb 2023 16:46:42 -0500 Subject: bcachefs: Fix hash_check_key() On hash collision when we have to check for duplicates or incorrect hash value, we weren't specifying a snapshot ID to iterate with. 
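In other words, POS() leaves the snapshot field zeroed, so the collision scan started at snapshot 0 rather than at the snapshot of the key being checked; SPOS() carries that snapshot through. A toy illustration of the two position constructors (field names mirror struct bpos, but this is not the real definition):

#include <stdio.h>
#include <stdint.h>

struct bpos { uint64_t inode, offset; uint32_t snapshot; };

#define SPOS(_i, _o, _s) ((struct bpos) { .inode = (_i), .offset = (_o), .snapshot = (_s) })
#define POS(_i, _o)	 SPOS(_i, _o, 0)

int main(void)
{
	uint32_t snapshot = 7;			/* snapshot of the key being checked */
	struct bpos wrong = POS(42, 1234);	/* iterates from snapshot 0 */
	struct bpos right = SPOS(42, 1234, snapshot);

	printf("wrong snapshot %u, right snapshot %u\n",
	       (unsigned) wrong.snapshot, (unsigned) right.snapshot);
	return 0;
}
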
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0124aa0b14c5..e0d7ab67ca35 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -821,7 +821,7 @@ static int hash_check_key(struct btree_trans *trans, goto bad_hash; for_each_btree_key_norestart(trans, iter, desc.btree_id, - POS(hash_k.k->p.inode, hash), + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_SLOTS, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; -- cgit From 4e3d18991a7d1138604f0975e7849d9a2d82c524 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Feb 2023 20:40:29 -0500 Subject: bcachefs: Inline bch2_btree_path_traverse() fastpath Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 32 +++++--------------------------- fs/bcachefs/btree_iter.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5034f8ebfb04..3774ee8577a9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -962,8 +962,6 @@ err: return ret; } -static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, - unsigned, unsigned long); static int bch2_btree_path_traverse_all(struct btree_trans *trans) { @@ -1009,7 +1007,7 @@ retry_all: */ if (path->uptodate) { __btree_path_get(path, false); - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); __btree_path_put(path, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -1114,10 +1112,10 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). 
*/ -static int btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, - unsigned flags, - unsigned long trace_ip) +int bch2_btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) { unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1181,26 +1179,6 @@ out: return ret; } -int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) -{ - if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); - u64 max = ~(~0ULL << restart_probability_bits); - - if (!get_random_u32_below(max)) { - trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } - } - - if (path->uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - return bch2_trans_cond_resched(trans) ?: - btree_path_traverse_one(trans, path, flags, _RET_IP_); -} - static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b889d1c03c7b..df87d88982ae 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -174,6 +174,18 @@ bch2_btree_path_set_pos(struct btree_trans *trans, : path; } +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, -- cgit From 464b415539cc543addb45b85e76c44da145b114c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Feb 2023 14:07:34 -0500 Subject: bcachefs: Fix bch2_trans_reset_updates() This should have been resetting trans->fs_usage_deltas as well. 
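The reset relies on the memset_start/memset_end marker trick: two empty struct members bracket the fields that should be zeroed, so one memset covers them all and any field later added between the markers is cleared automatically. A standalone sketch of that trick (the zero-size struct markers are a GNU C idiom the kernel uses; names here are illustrative, not the real replicas_delta_list):

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdint.h>

struct delta_list {
	unsigned size;			/* preserved across a reset */
	unsigned used;

	struct {} memset_start;		/* zero-size marker (GNU C extension) */
	uint64_t nr_inodes;
	uint64_t reserved[4];
	struct {} memset_end;
};

static void reset_deltas(struct delta_list *d)
{
	d->used = 0;
	memset((char *) d + offsetof(struct delta_list, memset_start), 0,
	       offsetof(struct delta_list, memset_end) -
	       offsetof(struct delta_list, memset_start));
}

int main(void)
{
	struct delta_list d = { .size = 16, .used = 3, .nr_inodes = 9 };

	reset_deltas(&d);
	printf("size %u used %u nr_inodes %llu\n",
	       d.size, d.used, (unsigned long long) d.nr_inodes);
	return 0;
}
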
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 -------- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 8 ++++++++ fs/bcachefs/btree_update_leaf.c | 8 -------- fs/bcachefs/replicas.h | 16 ---------------- fs/bcachefs/replicas_types.h | 16 ++++++++++++++++ 6 files changed, 25 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3774ee8577a9..1a71e8af52d0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2792,14 +2792,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - trans_for_each_path(trans, path) { path->should_be_locked = false; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7c664186f3c3..5cf03ec52051 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -9,6 +9,7 @@ #include "buckets_types.h" #include "darray.h" #include "journal_types.h" +#include "replicas_types.h" #include "six.h" struct open_bucket; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7e9f1f170d5f..9a3c859ea572 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -154,6 +154,14 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->nr_updates = 0; trans->hooks = NULL; trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } } #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 61b61acef7a8..1dc86ac6f837 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1118,14 +1118,6 @@ out: out_reset: bch2_trans_reset_updates(trans); - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - return ret; err: ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index cc34b3809206..4887675a86f0 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -27,22 +27,6 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) { diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index f12a35b3dbcf..5cfff489bbc3 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -8,4 +8,20 @@ struct bch_replicas_cpu { struct bch_replicas_entry *entries; }; 
+struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + #endif /* _BCACHEFS_REPLICAS_TYPES_H */ -- cgit From 2e98404000e9c48c235c234bd179bb1acbf4c4e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Feb 2023 15:45:45 -0500 Subject: bcachefs: Improve btree node read error path This ensures that failure to read a btree node error is treated as a topology error, and returns the correct error so that the topology repair pass will be run. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 61 +++++++++++++++++++++++++++++++------------------- fs/bcachefs/error.c | 5 ++++- 2 files changed, 42 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 700ce14baa24..dfa45cf4021f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -536,10 +536,23 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, } enum btree_err_type { + /* + * We can repair this locally, and we're after the checksum check so + * there's no need to try another replica: + */ BTREE_ERR_FIXABLE, + /* + * We can repair this if we have to, but we should try reading another + * replica if we can: + */ BTREE_ERR_WANT_RETRY, + /* + * Read another replica if we have one, otherwise consider the whole + * node bad: + */ BTREE_ERR_MUST_RETRY, - BTREE_ERR_FATAL, + BTREE_ERR_BAD_NODE, + BTREE_ERR_INCOMPATIBLE, }; enum btree_validate_ret { @@ -565,36 +578,40 @@ static int __btree_err(enum btree_err_type type, prt_vprintf(&out, fmt, args); va_end(args); - if (write == READ && - type == BTREE_ERR_FIXABLE && - !test_bit(BCH_FS_FSCK_DONE, &c->flags)) { - mustfix_fsck_err(c, "%s", out.buf); - goto out; - } - - bch2_print_string_as_lines(KERN_ERR, out.buf); - if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = c->opts.errors == BCH_ON_ERROR_continue ? 
0 : -BCH_ERR_fsck_errors_not_fixed; goto out; } + if (!have_retry && type == BTREE_ERR_WANT_RETRY) + type = BTREE_ERR_FIXABLE; + if (!have_retry && type == BTREE_ERR_MUST_RETRY) + type = BTREE_ERR_BAD_NODE; + switch (type) { case BTREE_ERR_FIXABLE: - ret = -BCH_ERR_fsck_errors_not_fixed; + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; break; case BTREE_ERR_WANT_RETRY: - if (have_retry) - ret = BTREE_RETRY_READ; - break; case BTREE_ERR_MUST_RETRY: + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = BTREE_RETRY_READ; break; - case BTREE_ERR_FATAL: + case BTREE_ERR_BAD_NODE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = -BCH_ERR_need_topology_repair; + break; + case BTREE_ERR_INCOMPATIBLE: + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; + default: + BUG(); } out: fsck_err: @@ -679,7 +696,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "unsupported bset version"); if (btree_err_on(version < c->sb.version_min, @@ -703,7 +720,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), @@ -780,7 +797,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_BAD_NODE, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -969,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_FATAL, c, NULL, b, NULL, + BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -1151,12 +1168,10 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) { + if (ret == BTREE_RETRY_READ) retry_read = 1; - } else { - bch2_inconsistent_error(c); + else set_btree_node_read_error(b); - } goto out; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 3e49d72d65b5..c2882c599896 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -27,8 +27,11 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { + if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + return; + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) bch2_inconsistent_error(c); } -- cgit From 12344c7cb966e1dbcb213ad1507c2ef3932790a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Feb 2023 16:15:51 -0500 Subject: bcachefs: bch2_trans_in_restart_error() This replaces various BUG_ON() assertions with panics that tell us where the restart was done and the restart type. 
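The shape of the change: the cheap trans->restarted test stays inline, and the failure path moves into a separate cold, noreturn function that can afford to format a useful message (restart reason plus the IP that requested the restart) instead of a bare BUG_ON(). A standalone sketch of that split, with toy types standing in for btree_trans:

#include <stdio.h>
#include <stdlib.h>

struct trans {
	int restarted;			/* nonzero: restart reason code */
	void *last_restarted_ip;	/* who asked for the restart */
};

static void __attribute__((noreturn, cold))
trans_in_restart_error(struct trans *trans)
{
	fprintf(stderr, "in transaction restart: %d, last restarted by %p\n",
		trans->restarted, trans->last_restarted_ip);
	abort();
}

static inline void trans_verify_not_in_restart(struct trans *trans)
{
	if (trans->restarted)		/* fast path: a single test */
		trans_in_restart_error(trans);
}

int main(void)
{
	struct trans t = { 0 };

	trans_verify_not_in_restart(&t);	/* fine: not restarted */
	puts("ok");
	return 0;
}
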
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 +++++++++++++++++----------- fs/bcachefs/btree_iter.h | 17 ++++++++++++++++- fs/bcachefs/btree_update_leaf.c | 2 +- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1a71e8af52d0..25345ed11076 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1224,7 +1224,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { unsigned level = path->level; - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(!path->ref); path = bch2_btree_path_make_mut(trans, path, intent, ip); @@ -1353,6 +1353,20 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p __bch2_path_free(trans, path); } +void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_restarted_ip); +} + +void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); +} + noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { @@ -1519,7 +1533,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool intent = flags & BTREE_ITER_INTENT; int i; - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); @@ -1695,7 +1709,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - BUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); @@ -2833,14 +2847,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) -{ - if (trans_was_restarted(trans, restart_count)) - panic("trans->restart_count %u, should be %u, last restarted by %pS\n", - trans->restart_count, restart_count, - (void *) trans->last_restarted_ip); -} - static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index df87d88982ae..2a57da036f27 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -227,7 +227,22 @@ static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_co return restart_count != trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *, u32); +void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + bch2_trans_restart_error(trans, restart_count); +} + +void __noreturn bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); +} __always_inline static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1dc86ac6f837..656de4f59d82 100644 --- 
a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1098,7 +1098,7 @@ int __bch2_trans_commit(struct btree_trans *trans) goto err; } retry: - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ret = do_bch2_trans_commit(trans, &i, _RET_IP_); -- cgit From f746c62ca5d02f43d92c9666ffd3dab01f1972ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Feb 2023 19:38:43 -0500 Subject: six locks: Expose tracepoint IP This adds _ip variations of the various lock functions that allow an IP to be passed in, which is used by lockstat. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 79 ++++++++++++++++++++++++++++++------------------------- fs/bcachefs/six.h | 45 ++++++++++++++++++++++++++----- 2 files changed, 82 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 3f9d4ff2edf4..40b7fdf2dbb0 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -18,8 +18,8 @@ #define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) -#define six_release(l) lock_release(l, _RET_IP_) +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); @@ -279,19 +279,20 @@ static bool do_six_trylock_type(struct six_lock *lock, } __always_inline __flatten -static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) +static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip) { if (!do_six_trylock_type(lock, type, true)) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } __always_inline __flatten static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) + unsigned seq, unsigned long ip) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; @@ -322,7 +323,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return ret; } @@ -339,7 +340,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } @@ -468,7 +469,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { union six_lock_state old; int ret = 0; @@ -482,7 +484,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty if (six_optimistic_spin(lock, type)) goto out; - lock_contended(&lock->dep_map, _RET_IP_); + lock_contended(&lock->dep_map, ip); wait->task = current; wait->lock_want = type; @@ -557,33 +559,35 @@ out: __always_inline __flatten static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, - 
six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { int ret; wait->start_time = 0; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); ret = do_six_trylock_type(lock, type, true) ? 0 - : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); + : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip); if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); if (!ret) - lock_acquired(&lock->dep_map, _RET_IP_); + lock_acquired(&lock->dep_map, ip); return ret; } __always_inline static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct six_lock_waiter wait; - return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); + return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip); } __always_inline __flatten @@ -611,7 +615,8 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) } __always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); @@ -620,7 +625,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) lock->owner != current); if (type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -632,38 +637,40 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) } #define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *lock) \ +bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ - return __six_trylock_type(lock, SIX_LOCK_##type); \ + return __six_trylock_type(lock, SIX_LOCK_##type, ip); \ } \ -EXPORT_SYMBOL_GPL(six_trylock_##type); \ +EXPORT_SYMBOL_GPL(six_trylock_ip_##type); \ \ -bool six_relock_##type(struct six_lock *lock, u32 seq) \ +bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ { \ - return __six_relock_type(lock, SIX_LOCK_##type, seq); \ + return __six_relock_type(lock, SIX_LOCK_##type, seq, ip); \ } \ -EXPORT_SYMBOL_GPL(six_relock_##type); \ +EXPORT_SYMBOL_GPL(six_relock_ip_##type); \ \ -int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p) \ +int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ { \ - return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ + return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ } \ -EXPORT_SYMBOL_GPL(six_lock_##type); \ +EXPORT_SYMBOL_GPL(six_lock_ip_##type); \ \ -int six_lock_waiter_##type(struct six_lock *lock, \ +int six_lock_ip_waiter_##type(struct six_lock *lock, \ struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p)\ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ { \ - return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ + return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ } \ -EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ 
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type); \ \ -void six_unlock_##type(struct six_lock *lock) \ +void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ - __six_unlock_type(lock, SIX_LOCK_##type); \ + __six_unlock_type(lock, SIX_LOCK_##type, ip); \ } \ -EXPORT_SYMBOL_GPL(six_unlock_##type); +EXPORT_SYMBOL_GPL(six_unlock_ip_##type); __SIX_LOCK(read) __SIX_LOCK(intent) @@ -734,7 +741,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); /* XXX: assert already locked, and that we don't overflow: */ diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 9ebbf8095573..c9159cd51d20 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -154,12 +154,37 @@ do { \ #define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) #define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *); \ -bool six_relock_##type(struct six_lock *, u32); \ -int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ - six_lock_should_sleep_fn, void *); \ -void six_unlock_##type(struct six_lock *); +bool six_trylock_ip_##type(struct six_lock *, unsigned long); \ +bool six_relock_ip_##type(struct six_lock *, u32, unsigned long); \ +int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn, \ + void *, unsigned long); \ +int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\ + six_lock_should_sleep_fn, void *, unsigned long);\ +void six_unlock_ip_##type(struct six_lock *, unsigned long); \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip_##type(lock, _THIS_IP_); \ +} \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip_##type(lock, seq, _THIS_IP_); \ +} \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ +static inline int six_lock_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn fn, void *p) \ +{ \ + return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \ +} \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + return six_unlock_ip_##type(lock, _THIS_IP_); \ +} __SIX_LOCK(read) __SIX_LOCK(intent) @@ -195,6 +220,14 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); } +static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip); +} + static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) -- cgit From 94c69fafa7081d84be89ba1067558be39b4ea44b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Feb 2023 19:39:59 -0500 Subject: bcachefs: Use six_lock_ip() This uses the new _ip() interface to six locks and hooks it up to btree_path->ip_allocated, when available. 
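Minimal sketch of the resulting locking calls (the real call sites are in the
hunks below): where a btree_path is available, its allocation IP is threaded
through to the six lock code, e.g. in btree_node_lock():

	ret = btree_node_lock_nopath(trans, b, type,
				     btree_path_ip_allocated(path));

btree_path_ip_allocated() returns path->ip_allocated when TRACK_PATH_ALLOCATED
is defined and falls back to _THIS_IP_ otherwise, so lock wait/hold statistics
can be attributed to the code that created the path instead of to the generic
locking helpers.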
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_iter.c | 6 +++--- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.c | 3 ++- fs/bcachefs/btree_locking.h | 16 +++++++++------- fs/bcachefs/btree_types.h | 17 +++++++++++++++-- 6 files changed, 31 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7868536d7581..769f17b67fcf 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1044,7 +1044,7 @@ retry: goto out; } else { lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 25345ed11076..f41c4416fe1c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -26,7 +26,7 @@ static inline void btree_path_list_add(struct btree_trans *, struct btree_path * static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED return iter->ip_allocated; #else return 0; @@ -1420,7 +1420,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) bch2_bpos_to_text(out, path->pos); prt_printf(out, " locks %u", path->nodes_locked); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif prt_newline(out); @@ -1570,7 +1570,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->nodes_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; #endif trans->paths_sorted = false; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index d4cfb48d2013..743ebeba12b1 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -244,7 +244,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (ck) { int ret; - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index e7659b4cf9e4..56489e7b0ac2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -366,7 +366,8 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p * locked: */ six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, + lock_may_not_fail, _RET_IP_); six_lock_readers_add(&b->lock, readers); if (ret) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3356f089e268..30c89daa5009 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -190,7 +190,8 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, - bool lock_may_not_fail) + bool lock_may_not_fail, + unsigned long ip) { int ret; @@ -198,8 +199,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, trans->lock_must_abort = false; 
trans->locking = b; - ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans); + ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; @@ -208,16 +209,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, static inline int __must_check btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, - enum six_lock_type type) + enum six_lock_type type, + unsigned long ip) { - return __btree_node_lock_nopath(trans, b, type, false); + return __btree_node_lock_nopath(trans, b, type, false, ip); } static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - int ret = __btree_node_lock_nopath(trans, b, type, true); + int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); BUG_ON(ret); } @@ -257,7 +259,7 @@ static inline int btree_node_lock(struct btree_trans *trans, if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || - !(ret = btree_node_lock_nopath(trans, b, type))) { + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = local_clock(); #endif diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5cf03ec52051..5660d076c678 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -217,6 +217,10 @@ enum btree_path_uptodate { BTREE_ITER_NEED_TRAVERSE = 2, }; +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + struct btree_path { u8 idx; u8 sorted_idx; @@ -247,7 +251,7 @@ struct btree_path { u64 lock_taken_time; #endif } l[BTREE_MAX_DEPTH]; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; @@ -257,6 +261,15 @@ static inline struct btree_path_level *path_l(struct btree_path *path) return path->l + path->level; } +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} + /* * @pos - iterator's current position * @level - current btree depth @@ -290,7 +303,7 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; struct bpos journal_pos; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; -- cgit From 91db80668149a4eb19ab3bfcfecf9f09ad1f2c8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Feb 2023 14:09:30 -0500 Subject: six locks: Improved optimistic spinning This adds a threshold for the maximum spin time, similar to the rwsem code, and a flag to the lock itself indicating when we've spun too long so other threads also refrain from spinning. 
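The budget is about 10 microseconds, and the owner-spin loop only checks the
clock every 16 iterations to keep the common case cheap; condensed sketch of
the new logic around six_spin_on_owner() (see the hunk below):

	u64 end_time = sched_clock() + 10 * NSEC_PER_USEC;
	unsigned loop = 0;
	...
	if (!(++loop & 0xf) && time_after64(sched_clock(), end_time)) {
		six_set_nospin(lock);	/* other would-be spinners back off too */
		ret = false;
		break;
	}

The nospin bit lives in the lock state word and is dropped again when the
intent/write lock is released, so one slow owner doesn't disable spinning on
the lock permanently.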
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 52 +++++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/six.h | 3 ++- 2 files changed, 39 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 40b7fdf2dbb0..5d003e41ae43 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -346,30 +346,39 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, #ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER -static inline int six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_can_spin_on_owner(struct six_lock *lock) { struct task_struct *owner; - int retval = 1; + bool ret; if (need_resched()) - return 0; + return false; rcu_read_lock(); owner = READ_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; + ret = !owner || owner_on_cpu(owner); rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. - */ - return retval; + + return ret; +} + +static inline void six_set_nospin(struct six_lock *lock) +{ + union six_lock_state old, new; + u64 v = READ_ONCE(lock->state.v); + + do { + new.v = old.v = v; + new.nospin = true; + } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v); } static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner) + struct task_struct *owner, + u64 end_time) { bool ret = true; + unsigned loop = 0; rcu_read_lock(); while (lock->owner == owner) { @@ -381,7 +390,13 @@ static inline bool six_spin_on_owner(struct six_lock *lock, */ barrier(); - if (!owner->on_cpu || need_resched()) { + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; + break; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_nospin(lock); ret = false; break; } @@ -396,6 +411,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock, static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { struct task_struct *task = current; + u64 end_time; if (type == SIX_LOCK_write) return false; @@ -407,6 +423,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type if (!osq_lock(&lock->osq)) goto fail; + end_time = sched_clock() + 10 * NSEC_PER_USEC; + while (1) { struct task_struct *owner; @@ -415,7 +433,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type * release the lock or go to sleep. 
*/ owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner)) + if (owner && !six_spin_on_owner(lock, owner, end_time)) break; if (do_six_trylock_type(lock, type, false)) { @@ -606,9 +624,13 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* between unlocking and checking for waiters */ state.v = READ_ONCE(lock->state.v); } else { + u64 v = l[type].unlock_val; + + if (type != SIX_LOCK_read) + v -= lock->state.v & __SIX_VAL(nospin, 1); + EBUG_ON(!(lock->state.v & l[type].held_mask)); - state.v = atomic64_add_return_release(l[type].unlock_val, - &lock->state.counter); + state.v = atomic64_add_return_release(v, &lock->state.counter); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index c9159cd51d20..09abea29a021 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -83,9 +83,10 @@ union six_lock_state { }; struct { - unsigned read_lock:27; + unsigned read_lock:26; unsigned write_locking:1; unsigned intent_lock:1; + unsigned nospin:1; unsigned waiters:3; /* * seq works much like in seqlocks: it's incremented every time -- cgit From 5b3008bc6182e56fdd5ba36fdf324430d0792e0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 23:51:47 -0500 Subject: bcachefs: Don't call bch2_journal_pin_drop() under key cache lock This fixes a (harmless) lockdep splat, due to a lock order violation in the key cache exit path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 743ebeba12b1..867f063f22d1 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -947,6 +947,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bucket_table *tbl; struct bkey_cached *ck, *n; struct rhash_head *pos; + LIST_HEAD(items); unsigned i; #ifdef __KERNEL__ int cpu; @@ -967,7 +968,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed_nonpcpu); + list_add(&ck->list, &items); } rcu_read_unlock(); } @@ -979,14 +980,17 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < f->nr; i++) { ck = f->objs[i]; - list_add(&ck->list, &bc->freed_nonpcpu); + list_add(&ck->list, &items); } } #endif - list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); + list_splice(&bc->freed_nonpcpu, &items); - list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) { + mutex_unlock(&bc->lock); + + list_for_each_entry_safe(ck, n, &items, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -1008,8 +1012,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", atomic_long_read(&bc->nr_keys)); - mutex_unlock(&bc->lock); - if (bc->table_init_done) rhashtable_destroy(&bc->table); -- cgit From c72f687a1ff1801b404fab804fdddcaf034e6ef4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Oct 2022 04:32:41 -0400 Subject: bcachefs: Use for_each_btree_key_upto() more consistently It's important that in BTREE_ITER_FILTER_SNAPSHOTS mode we always use peek_upto() and provide an end for the interval we're searching for - otherwise, when we hit the end of the inode the next inode be in a 
different subvolume and not have any keys in the current snapshot, and we'd iterate over arbitrarily many keys before returning one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 28 ++++++++++------- fs/bcachefs/btree_iter.h | 69 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_update_leaf.c | 5 +-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extent_update.c | 5 +-- fs/bcachefs/fs-io.c | 26 ++++++++-------- fs/bcachefs/fsck.c | 20 +++++------- fs/bcachefs/io.c | 9 +++--- fs/bcachefs/quota.c | 6 ++-- fs/bcachefs/reflink.c | 6 +--- fs/bcachefs/tests.c | 55 ++++++++++++++++++-------------- 11 files changed, 150 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f41c4416fe1c..e5c82aa9bfeb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2042,6 +2042,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e int ret; EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, @@ -2053,7 +2054,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e while (1) { k = __bch2_btree_iter_peek(iter, search_key); - if (!k.k || bkey_err(k)) + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) goto out_no_locked; /* @@ -2066,11 +2069,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (bkey_gt(iter_pos, end)) { - bch2_btree_iter_set_pos(iter, end); - k = bkey_s_c_null; - goto out_no_locked; - } + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(iter_pos, end) + : bkey_ge(iter_pos, end))) + goto end; if (iter->update_path && !bkey_eq(iter->update_path->pos, k.k->p)) { @@ -2159,6 +2161,10 @@ out_no_locked: bch2_btree_iter_verify_entry_exit(iter); return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; } /** @@ -2463,15 +2469,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } else { struct bpos next; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; EBUG_ON(iter->path->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; - struct bpos end = iter->pos; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - end.offset = U64_MAX; bch2_trans_copy_iter(&iter2, iter); k = bch2_btree_iter_peek_upto(&iter2, end); @@ -2484,7 +2490,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek_upto(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2a57da036f27..4b1f993ea3fb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -599,6 +599,22 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } +static inline struct bkey_s_c +__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_upto_type(iter, end, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + #define 
lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count; \ @@ -673,6 +689,36 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret; \ }) +#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret = 0; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ + if (!(_k).k) \ + break; \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ @@ -711,6 +757,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) +#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -719,6 +773,15 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ + &(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -747,6 +810,12 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ + for (; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 656de4f59d82..4584bc8c94b3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1675,7 +1675,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek(&iter)).k) { + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -1684,9 +1684,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - if (bkey_ge(iter.pos, end)) - break; - bkey_init(&delete.k); /* diff --git 
a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0d33dee1aed4..e320868a8b8b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -683,7 +683,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx) { return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), - POS(0, idx + 1), + POS(0, idx), 0, NULL); } diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 21d6f88c7397..21af6fb8cecf 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -128,12 +128,9 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_continue_norestart(copy, 0, k, ret) { + for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; - if (bkey_ge(bkey_start_pos(k.k), *end)) - break; - if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 25094e0406da..378cca413c75 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2542,15 +2542,11 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { - if (bkey_ge(bkey_start_pos(k.k), end)) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) if (bkey_extent_is_data(k.k)) { ret = 1; break; } - } start = iter.pos; bch2_trans_iter_exit(&trans, &iter); err: @@ -2590,8 +2586,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * page */ ret = range_has_data(c, inode->ei_subvol, - POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) return ret; @@ -2973,7 +2969,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, k = insert ? 
bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek(&src); + : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); if ((ret = bkey_err(k))) continue; @@ -3264,6 +3260,10 @@ err: return bch2_err_class(ret); } +/* + * Take a quota reservation for unallocated blocks in a given file range + * Does not check pagecache + */ static int quota_reserve_range(struct bch_inode_info *inode, struct quota_res *res, u64 start, u64 end) @@ -3477,11 +3477,11 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - break; - } else if (bkey_extent_is_data(k.k)) { + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + POS(inode->v.i_ino, U64_MAX), + 0, k, ret) { + if (bkey_extent_is_data(k.k)) { next_data = max(offset, bkey_start_offset(k.k) << 9); break; } else if (k.k->p.offset >> 9 > isize) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0d7ab67ca35..24365b9260f6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -31,14 +31,12 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u64 sectors = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode != inum) - break; - + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) if (bkey_extent_is_allocation(k.k)) sectors += k.k->size; - } bch2_trans_iter_exit(trans, &iter); @@ -54,11 +52,10 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, u64 subdirs = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode != inum) - break; - + for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -66,7 +63,6 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, if (d.v->d_type == DT_DIR) subdirs++; } - bch2_trans_iter_exit(trans, &iter); return ret ?: subdirs; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0c264266f466..70e05fcf643a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -419,11 +419,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); - k = bch2_btree_iter_peek(iter); - if (bkey_ge(iter->pos, end_pos)) { - bch2_btree_iter_set_pos(iter, end_pos); + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) break; - } ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index ededc826e9a0..4b663f320bfc 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -709,7 +709,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), - POS(QTYP_USR + 1, 0), + POS(QTYP_USR, U64_MAX), 0, NULL); if (ret) return ret; @@ -721,7 +721,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), - POS(QTYP_GRP + 1, 0), + POS(QTYP_GRP, U64_MAX), 0, NULL); if (ret) return ret; @@ -733,7 +733,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), - 
POS(QTYP_PRJ + 1, 0), + POS(QTYP_PRJ, U64_MAX), 0, NULL); if (ret) return ret; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 08c98ac03c13..130ecc3a05c6 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -251,13 +251,9 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue_norestart(*iter, 0, k, ret) { - if (bkey_ge(iter->pos, end)) - break; - + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) if (bkey_extent_is_data(k.k)) return k; - } if (bkey_ge(iter->pos, end)) bch2_btree_iter_set_pos(iter, end); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 72364313126b..80fce1c95470 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -15,13 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), SPOS_MAX, - 0, - NULL); + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), SPOS_MAX, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), 0, NULL); BUG_ON(ret); } @@ -145,8 +146,9 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i++); 0; })); @@ -211,8 +213,9 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; 0; @@ -278,8 +281,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; 0; @@ -295,8 +299,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i >= nr * 2) break; @@ -351,8 +355,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; @@ -369,8 +374,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i == nr) break; @@ -405,10 +410,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = 
bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -426,10 +431,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -519,7 +524,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bch2_trans_init(&trans, c, 0, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -798,8 +803,9 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, 0); if (ret) bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); @@ -839,7 +845,8 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), SPOS_MAX, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), 0, NULL); if (ret) bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); -- cgit From d7afe651ffa29fca79725a3cf5580a3fd2421fed Mon Sep 17 00:00:00 2001 From: Brett Holman Date: Fri, 10 Feb 2023 16:36:55 -0700 Subject: bcachefs: Fix memleak in replicas_table_update() Signed-off-by: Brett Holman Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 482bedf4be8b..3bff21959d98 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -299,13 +299,6 @@ static int replicas_table_update(struct bch_fs *c, memset(new_usage, 0, sizeof(new_usage)); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_KERNEL))) - goto err; - - memset(new_usage, 0, sizeof(new_usage)); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))) -- cgit From 992fa4e62020d257197efa4ec567499d52e9c381 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Feb 2023 18:04:22 -0500 Subject: bcachefs: Fix btree_path_alloc() We need to call bch2_trans_update_max_paths() before marking the new path as allocated, since we're not initializing it yet. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e5c82aa9bfeb..3d138ae19469 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1507,6 +1507,14 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); + + /* + * Do this before marking the new path as allocated, since it won't be + * initialized yet: + */ + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); + trans->paths_allocated |= 1ULL << idx; path = &trans->paths[idx]; @@ -1517,9 +1525,6 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, btree_path_list_add(trans, pos, path); trans->paths_sorted = false; - - if (unlikely(idx > trans->nr_max_paths)) - bch2_trans_update_max_paths(trans); return path; } -- cgit From 434b1c75a4e79ee63cd58225567f752311cf3cd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 15:49:25 -0500 Subject: bcachefs: Switch a BUG_ON() to a panic() This assert is popping - rarely - in the CI, this will help us track it down from the logs. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3d138ae19469..7bb7b5ffe001 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1174,7 +1174,10 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) + panic("ret %s (%i) trans->restarted %s (%i)\n", + bch2_err_str(ret), ret, + bch2_err_str(trans->restarted), trans->restarted); bch2_btree_path_verify(trans, path); return ret; } -- cgit From 06ab86d596170b9f3b88ce3f8e9fea7e9c1ea0c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 21:13:37 -0500 Subject: bcachefs: Fix btree_node_write_blocked() not being cleared The btree_node_write_blocked bit was a later addition to this code, it only mirrors the state of the b->write_blocked list (empty or nonempty) - unfortunately, when it was added it wasn't correctly kept in sync - oops. 
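The fix keeps the flag in sync when a node is about to be freed; roughly (see
the hunk in bch2_btree_interior_update_will_free_node() below):

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);
	clear_btree_node_write_blocked(b);	/* flag must mirror the write_blocked list */

and __btree_node_free() now asserts the invariant with
BUG_ON(btree_node_write_blocked(b)).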
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 566838317845..4e9c963dbd23 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -161,6 +161,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) { trace_and_count(c, btree_node_free, c, b); + BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); BUG_ON(b == btree_node_root(c, b)); @@ -807,6 +808,7 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(!btree_node_dirty(b)); + BUG_ON(!b->c.level); as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; @@ -976,6 +978,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); + clear_btree_node_write_blocked(b); /* * Does this node have unwritten data that has a pin on the journal? -- cgit From dd81a060eb0680e09d133b81db54b90442c32b5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 12:22:58 -0500 Subject: bcachefs: ec_stripe_delete_work() now takes ref on c->writes Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 14 +++++++++++--- fs/bcachefs/ec.h | 2 ++ fs/bcachefs/super.c | 9 ++++----- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e320868a8b8b..f4b903f2fd22 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -672,9 +672,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); + if (stripe_idx_to_delete(c) >= 0) + bch2_do_stripe_deletes(c); } /* stripe deletion */ @@ -707,6 +706,15 @@ static void ec_stripe_delete_work(struct work_struct *work) if (ec_stripe_delete(c, idx)) break; } + + percpu_ref_put(&c->writes); +} + +void bch2_do_stripe_deletes(struct bch_fs *c) +{ + if (percpu_ref_tryget_live(&c->writes) && + !schedule_work(&c->ec_stripe_delete_work)) + percpu_ref_put(&c->writes); } /* stripe creation: */ diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8596fa763b4c..4d4e3756dd59 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -213,6 +213,8 @@ void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); +void bch2_do_stripe_deletes(struct bch_fs *); + void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c911a07f8e8e..7dfe9050a006 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -258,8 +258,6 @@ void bch2_fs_read_only(struct bch_fs *c) */ percpu_ref_kill(&c->writes); - cancel_work_sync(&c->ec_stripe_delete_work); - /* * If we're not doing an emergency shutdown, we want to wait on * outstanding writes to complete so they don't see spurious errors due @@ -391,9 +389,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_do_discards(c); - bch2_do_invalidates(c); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -403,6 +398,10 @@ 
static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); set_bit(BCH_FS_WAS_RW, &c->flags); + + bch2_do_discards(c); + bch2_do_invalidates(c); + bch2_do_stripe_deletes(c); return 0; err: __bch2_fs_read_only(c); -- cgit From d94189ad568f6cbd80d372cf7aa6e4898b6c5c17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 12:21:45 -0500 Subject: bcachefs: Debug mode for c->writes references This adds a debug mode where we split up the c->writes refcount into distinct refcounts for every codepath that takes a reference, and adds sysfs code to print the value of each ref. This will make it easier to debug shutdown hangs due to refcount leaks. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 +++---- fs/bcachefs/bcachefs.h | 72 +++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_update_interior.c | 6 ++-- fs/bcachefs/btree_update_leaf.c | 6 ++-- fs/bcachefs/ec.c | 10 +++--- fs/bcachefs/fs-io.c | 4 +-- fs/bcachefs/io.c | 10 +++--- fs/bcachefs/move.c | 6 ++-- fs/bcachefs/reflink.c | 4 +-- fs/bcachefs/subvolume.c | 16 ++++----- fs/bcachefs/super.c | 30 +++++++++++++--- fs/bcachefs/super.h | 3 +- fs/bcachefs/sysfs.c | 35 ++++++++++++++++-- 13 files changed, 168 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 388a44858097..1db0b6253661 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1113,7 +1113,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); trace_discard_buckets(c, seen, open, need_journal_commit, discarded, bch2_err_str(ret)); @@ -1121,9 +1121,9 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && !queue_work(system_long_wq, &c->discard_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -1233,14 +1233,14 @@ static void bch2_do_invalidates_work(struct work_struct *work) } bch2_trans_exit(&trans); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && !queue_work(system_long_wq, &c->invalidate_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e61dc1e6da06..56bc58a7bfcf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,6 +209,10 @@ #include "opts.h" #include "util.h" +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_WRITE_REF_DEBUG +#endif + #define dynamic_fault(...) 0 #define race_fault(...) 
0 @@ -538,6 +542,7 @@ enum { /* shutdown: */ BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, + BCH_FS_GOING_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_CLEAN_SHUTDOWN, @@ -627,6 +632,29 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_WRITE_REFS() \ + x(trans) \ + x(write) \ + x(promote) \ + x(node_rewrite) \ + x(stripe_create) \ + x(stripe_delete) \ + x(reflink) \ + x(fallocate) \ + x(discard) \ + x(invalidate) \ + x(move) \ + x(delete_dead_snapshots) \ + x(snapshot_delete_pagecache) \ + x(sysfs) + +enum bch_write_ref { +#define x(n) BCH_WRITE_REF_##n, + BCH_WRITE_REFS() +#undef x + BCH_WRITE_REF_NR, +}; + struct bch_fs { struct closure cl; @@ -648,7 +676,11 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_t writes[BCH_WRITE_REF_NR]; +#else struct percpu_ref writes; +#endif struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -965,6 +997,46 @@ mempool_t bio_bounce_pages; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; +extern struct wait_queue_head bch2_read_only_wait; + +static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_inc(&c->writes[ref]); +#else + percpu_ref_get(&c->writes); +#endif +} + +static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_GOING_RO, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget_live(&c->writes); +#endif +} + +static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + long v = atomic_long_dec_return(&c->writes[ref]); + + BUG_ON(v < 0); + if (v) + return; + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + if (atomic_long_read(&c->writes[i])) + return; + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +#else + percpu_ref_put(&c->writes); +#endif +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4e9c963dbd23..6287e926f605 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2036,7 +2036,7 @@ void async_btree_node_rewrite_work(struct work_struct *work) bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(&trans, a)); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2044,12 +2044,12 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) return; a = kmalloc(sizeof(*a), GFP_NOFS); if (!a) { - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); return; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4584bc8c94b3..60ebe0606d96 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -994,7 +994,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (ret) return ret; - percpu_ref_get(&c->writes); + bch2_write_ref_get(c, BCH_WRITE_REF_trans); return 0; } @@ -1043,7 +1043,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } if 
(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget_live(&c->writes))) { + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; @@ -1114,7 +1114,7 @@ out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: bch2_trans_reset_updates(trans); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f4b903f2fd22..af6a23021381 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -707,14 +707,14 @@ static void ec_stripe_delete_work(struct work_struct *work) break; } - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && !schedule_work(&c->ec_stripe_delete_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -922,7 +922,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(!s->allocated); - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_create)) goto err; ec_generate_ec(&s->new_stripe); @@ -964,7 +964,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); err: bch2_disk_reservation_put(c, &s->res); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 378cca413c75..944fffd9f7b5 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3231,7 +3231,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -3255,7 +3255,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 70e05fcf643a..bd55c4b41d7c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -602,7 +602,7 @@ static void bch2_write_done(struct closure *cl) struct bch_fs *c = op->c; bch2_disk_reservation_put(c, &op->res); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); @@ -1417,7 +1417,7 @@ void bch2_write(struct closure *cl) } if (c->opts.nochanges || - !percpu_ref_tryget_live(&c->writes)) { + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } @@ -1496,7 +1496,7 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -1544,7 +1544,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, 
BCH_WRITE_REF_promote)) return NULL; op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); @@ -1601,7 +1601,7 @@ err: kfree(*rbio); *rbio = NULL; kfree(op); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); return NULL; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 65c3af1b2e11..46677ad911cd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -57,7 +57,7 @@ static void move_free(struct moving_io *io) bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_move); kfree(io); } @@ -250,7 +250,7 @@ static int bch2_move_extent(struct btree_trans *trans, return 0; } - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) return -BCH_ERR_erofs_no_writes; /* write path might have to decompress data: */ @@ -319,7 +319,7 @@ err_free_pages: err_free: kfree(io); err: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_move); trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 130ecc3a05c6..aae924dc81f7 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -278,7 +278,7 @@ s64 bch2_remap_range(struct bch_fs *c, u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -412,7 +412,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d090a74bd052..3f5893f317d1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -706,16 +706,14 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); bch2_delete_dead_snapshots(c); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!percpu_ref_tryget_live(&c->writes)) - return; - - if (!queue_work(system_long_wq, &c->snapshot_delete_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(system_long_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, @@ -900,7 +898,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) darray_exit(&s); } - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -923,11 +921,11 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (ret) return ret; - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7dfe9050a006..872b82a24505 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -107,7 
+107,7 @@ static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); static DEFINE_MUTEX(bch_fs_list_lock); -static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); +DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); @@ -235,13 +235,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_dev_allocator_remove(c, ca); } +#ifndef BCH_WRITE_REF_DEBUG static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); } +#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -256,7 +258,13 @@ void bch2_fs_read_only(struct bch_fs *c) * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ + set_bit(BCH_FS_GOING_RO, &c->flags); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + bch2_write_ref_put(c, i); +#endif /* * If we're not doing an emergency shutdown, we want to wait on @@ -269,16 +277,17 @@ void bch2_fs_read_only(struct bch_fs *c) * we do need to wait on them before returning and signalling * that going RO is complete: */ - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); __bch2_fs_read_only(c); - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + clear_bit(BCH_FS_GOING_RO, &c->flags); if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && @@ -315,7 +324,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); return ret; } @@ -395,7 +404,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) goto err; } +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif set_bit(BCH_FS_RW, &c->flags); set_bit(BCH_FS_WAS_RW, &c->flags); @@ -462,7 +478,9 @@ static void __bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_exit(&c->writes); +#endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); @@ -769,8 +787,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +#endif mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index d66de6f589ac..5e6fbbfd2d43 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -250,7 +250,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (percpu_ref_is_zero(&c->writes)) + if 
(!test_bit(BCH_FS_RW, &c->flags) && + !test_bit(BCH_FS_WAS_RW, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 7ccdf3197d51..20484f67c3bc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -195,6 +195,29 @@ read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(write_points); +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + +const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x + NULL +}; + +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_tabstop_push(out, 24); + + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { + prt_str(out, bch2_write_refs[i]); + prt_tab(out); + prt_printf(out, "%li", atomic_long_read(&c->writes[i])); + prt_newline(out); + } +} +#endif + read_attribute(internal_uuid); read_attribute(has_data); @@ -448,6 +471,11 @@ SHOW(bch2_fs) if (attr == &sysfs_data_jobs) data_progress_to_text(out, c); +#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) + bch2_write_refs_to_text(out, c); +#endif + return 0; } @@ -631,6 +659,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_write_points, +#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +#endif &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -682,7 +713,7 @@ STORE(bch2_fs_opts_dir) * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; tmp = kstrdup(buf, GFP_KERNEL); @@ -712,7 +743,7 @@ STORE(bch2_fs_opts_dir) ret = size; err: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } SYSFS_OPS(bch2_fs_opts_dir); -- cgit From 60b5538877a2d34396280615484b995911e09b69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 14:48:54 -0500 Subject: bcachefs: trans->notrace_relock_fail When we unlock in order to submit IO, the next relock event is likely to fail if submit_bio() blocked - we shouldn't count those events in our _fail stats, since those are expected events and shouldn't cause test failures.
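In outline, the fix has three pieces (a simplified sketch of the diff that follows, not the verbatim hunks): the read path sets a flag on the transaction right after submitting IO that may block, the relock path skips the tracepoint while the flag is set, and the flag is cleared on the next transaction restart:

	/* io.c: we just submitted IO that may block, relock failures are expected */
	trans->notrace_relock_fail = true;

	/* btree_locking.c: don't trace/count the expected failures */
	if (trace && !trans->notrace_relock_fail)
		trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);

	/* btree_iter.c: reset the flag once the transaction restarts */
	if (trans->restarted) {
		bch2_btree_path_traverse_all(trans);
		trans->notrace_relock_fail = false;
	}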
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/io.c | 6 ++++++ 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7bb7b5ffe001..21f12e522360 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2854,8 +2854,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_reset_srcu_lock(trans); trans->last_restarted_ip = _RET_IP_; - if (trans->restarted) + if (trans->restarted) { bch2_btree_path_traverse_all(trans); + trans->notrace_relock_fail = false; + } trans->last_begin_time = local_clock(); return trans->restart_count; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 56489e7b0ac2..cf138cd9d431 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -473,7 +473,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - if (trace) + if (trace && !trans->notrace_relock_fail) trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 5660d076c678..a815cd5a072e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -425,6 +425,7 @@ struct btree_trans { bool memory_allocation_failure:1; bool journal_transaction_names:1; bool journal_replay_not_finished:1; + bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_restarted_ip; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bd55c4b41d7c..b57187ce1f65 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2375,6 +2375,12 @@ get_bio: else submit_bio_wait(&rbio->bio); } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(c, rbio)) { -- cgit From 30ca6ece88f2d11647c3854faf0dce528c32d5cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Feb 2023 13:22:12 -0500 Subject: bcachefs: Kill trans->flags Recursive transaction commits are occasionally necessary - in particular, for the upcoming btree write buffer's flush path. This avoids bugs due to trans->flags being accidentally mutated mid-commit, which can cause c->writes refcount leaks. 
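Concretely, the commit flags become an explicit argument threaded through do_bch2_trans_commit() and its helpers rather than state stashed on the transaction; the public interface ends up looking roughly like this (sketch, with the wrapper's parameters inferred from its body in the diff below):

	int __bch2_trans_commit(struct btree_trans *, unsigned flags);

	static inline int bch2_trans_commit(struct btree_trans *trans,
					    struct disk_reservation *disk_res,
					    u64 *journal_seq, unsigned flags)
	{
		trans->disk_res		= disk_res;
		trans->journal_seq	= journal_seq;

		/* flags are passed down, never stored in the transaction: */
		return __bch2_trans_commit(trans, flags);
	}

With that, a nested commit can run with its own flags without clobbering the flags of the commit that invoked it.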
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 3 +- fs/bcachefs/btree_key_cache.h | 2 +- fs/bcachefs/btree_types.h | 1 - fs/bcachefs/btree_update.h | 5 +-- fs/bcachefs/btree_update_leaf.c | 90 ++++++++++++++++++++--------------------- 5 files changed, 50 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 867f063f22d1..67db6b9d8e10 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -769,6 +769,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, + unsigned flags, struct btree_path *path, struct bkey_i *insert) { @@ -778,7 +779,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->u64s > ck->u64s); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { int difference; BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index eccea15fca79..c86d5e48f6e3 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -29,7 +29,7 @@ bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, unsigned); -bool bch2_btree_insert_key_cached(struct btree_trans *, +bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a815cd5a072e..93c928a93dca 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -458,7 +458,6 @@ struct btree_trans { struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; - unsigned flags; unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9a3c859ea572..673c3a78aae2 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -80,7 +80,7 @@ int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *); +int __bch2_trans_commit(struct btree_trans *, unsigned); int bch2_trans_log_msg(struct btree_trans *, const char *, ...); int bch2_fs_log_msg(struct bch_fs *, const char *, ...); @@ -101,9 +101,8 @@ static inline int bch2_trans_commit(struct btree_trans *trans, { trans->disk_res = disk_res; trans->journal_seq = journal_seq; - trans->flags = flags; - return __bch2_trans_commit(trans); + return __bch2_trans_commit(trans, flags); } #define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 60ebe0606d96..84f79affbe07 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -307,7 +307,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, } static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; @@ -316,7 +316,9 @@ bch2_trans_journal_preres_get_cold(struct 
btree_trans *trans, unsigned u64s, bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, 0); + &trans->journal_preres, + trans->journal_preres_u64s, + (flags & JOURNAL_WATERMARK_MASK)); if (ret) return ret; @@ -330,12 +332,10 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) + unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, - flags| - (trans->flags & JOURNAL_WATERMARK_MASK)); + trans->journal_u64s, flags); } #define JSET_ENTRY_LOG_U64s 4 @@ -365,9 +365,8 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } -static int btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_path *path, - unsigned u64s) +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; @@ -379,7 +378,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -589,7 +588,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) } static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -629,7 +628,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = !i->cached ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, i->path, u64s); + : btree_key_can_insert_cached(trans, flags, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -643,8 +642,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ret = bch2_trans_journal_res_get(trans, + (flags & JOURNAL_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; @@ -661,7 +661,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -696,7 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans->journal_res.u64s -= trans->extra_journal_entries.nr; } - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { trans_for_each_update(trans, i) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -735,7 +735,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!i->cached) btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, i->path, i->k); + bch2_btree_insert_key_cached(trans, flags, i->path, i->k); else { bch2_btree_key_cache_drop(trans, i->path); btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); @@ -784,12 +784,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans } #ifdef CONFIG_BCACHEFS_DEBUG -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; printbuf_reset(err); prt_printf(err, "invalid bkey on insert from %s -> %ps", @@ -815,7 +815,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -826,11 +826,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, rw, &buf))) - return bch2_trans_commit_bkey_invalid(trans, i, &buf); + return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); } #endif @@ -846,7 +846,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, trans->flags); + i->level, flags); if (unlikely(ret)) return ret; } @@ -857,11 +857,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, - JOURNAL_RES_GET_NONBLOCK| - (trans->flags & JOURNAL_WATERMARK_MASK)); + (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, - trans->journal_preres_u64s, trace_ip); + ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); if (unlikely(ret)) return ret; @@ -869,7 +867,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); @@ -908,7 +906,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c) } static noinline -int bch2_trans_commit_error(struct btree_trans *trans, +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, int ret, unsigned long trace_ip) { @@ -916,7 +914,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, trans->flags); + ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; @@ -934,13 +932,15 @@ int bch2_trans_commit_error(struct btree_trans *trans, case -BCH_ERR_journal_res_get_blocked: bch2_trans_unlock(trans); - if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & JOURNAL_WATERMARK_reserved)) { + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(flags & JOURNAL_WATERMARK_reserved)) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); + ret = bch2_trans_journal_res_get(trans, + (flags & JOURNAL_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK); if (ret) break; @@ -970,20 +970,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL), c, + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans) +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; int ret; - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || test_bit(BCH_FS_STARTED, &c->flags)) return -BCH_ERR_erofs_trans_commit; @@ -1019,7 +1019,7 
@@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } -int __bch2_trans_commit(struct btree_trans *trans) +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; @@ -1030,7 +1030,7 @@ int __bch2_trans_commit(struct btree_trans *trans) !trans->extra_journal_entries.nr) goto out_reset; - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); ret = bch2_trans_commit_run_triggers(trans); @@ -1042,9 +1042,9 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } - if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + if (!(flags & BTREE_INSERT_NOCHECK_RW) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans); + ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } @@ -1076,7 +1076,7 @@ int __bch2_trans_commit(struct btree_trans *trans) /* we're going to journal the key being updated: */ u64s = jset_u64s(i->k->k.u64s); if (i->cached && - likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; if (i->flags & BTREE_UPDATE_NOJOURNAL) @@ -1092,7 +1092,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (trans->extra_journal_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, - (trans->flags & BTREE_INSERT_NOFAIL) + (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; @@ -1101,7 +1101,7 @@ retry: bch2_trans_verify_not_in_restart(trans); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1113,14 +1113,14 @@ retry: out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: bch2_trans_reset_updates(trans); return ret; err: - ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); if (ret) goto out; -- cgit From 5f5c74661713327309f124e247de61db6729bc3d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Oct 2022 04:51:58 -0400 Subject: bcachefs: Start copygc when first going read-write In the distant past, it wasn't possible to start copygc until after journal replay had finished. Now, the btree iterator code overlays keys from the journal, so there's no reason not to start it earlier - and it solves a rare deadlock. 
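Mechanically, the gc and copygc thread starts move out of bch2_fs_read_write_late() and into __bch2_fs_read_write(), right after capacity is recalculated, so they are started as soon as the filesystem first goes read-write (abridged from the diff below):

	/* __bch2_fs_read_write(), after bch2_recalc_capacity(): */
	ret = bch2_gc_thread_start(c);
	if (ret) {
		bch_err(c, "error starting gc thread");
		return ret;
	}

	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err(c, "error starting copygc thread");
		return ret;
	}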
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 47 ++++++++++++++++++++++++++++++----------------- fs/bcachefs/super.c | 26 ++++++++++++-------------- 2 files changed, 42 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 80736be21b9f..ebdf9f754e08 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1267,6 +1267,20 @@ use_clean: goto err; bch_verbose(c, "done checking need_discard and freespace btrees"); + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -1293,7 +1307,6 @@ use_clean: bch_verbose(c, "done checking alloc to lru refs"); set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); } else { - set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); @@ -1302,6 +1315,22 @@ use_clean: if (c->opts.norecovery) goto out; + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); @@ -1316,22 +1345,6 @@ use_clean: if (ret) goto err; - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - bch2_fs_lazy_rw(c); - - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { /* set bi_subvol on root inode */ err = "error upgrade root inode for subvolumes"; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 872b82a24505..e7e3dcbe2339 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -332,26 +332,12 @@ static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } - ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); return ret; } - schedule_work(&c->ec_stripe_delete_work); - return 0; } @@ -398,6 +384,18 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; + } + + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } + if (!early) { ret = 
bch2_fs_read_write_late(c); if (ret) -- cgit From f2b542ba42a8b35d9dc43f5eab9791fea76bfd3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Dec 2022 19:14:30 -0500 Subject: bcachefs: Go RW before check_alloc_info() It's possible to do btree updates before going RW by adding them to the list of updates for journal replay to do, but this is limited by what fits in RAM. This patch switches the second alloc info phase to run after going RW - btree_gc has already ensured the alloc btree itself is correct - and tweaks the allocation path to deal with the potential small inconsistencies. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 32 ++++++++++++++++++++------------ fs/bcachefs/alloc_foreground.c | 32 ++++++++++++++++++++++---------- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 17 ++++++++++------- 4 files changed, 53 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1db0b6253661..f75d05beaf31 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -583,6 +583,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, goto err; if (ca->mi.freespace_initialized && + test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && bch2_trans_inconsistent_on(old.k->type != old_type, trans, "incorrect key when %s %s btree (got %s should be %s)\n" " for %s", @@ -1028,21 +1029,28 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto write; } - if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + bch2_trans_inconsistent(trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } goto out; } - if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + if (a->v.data_type != BCH_DATA_need_discard) { + if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + bch2_trans_inconsistent(trans, + "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + goto out; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 534dbf197d58..ba14cfe06515 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -316,28 +316,34 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc a = bch2_alloc_to_v4(k, &a_convert); - if (genbits != (alloc_freespace_genbits(*a) >> 56)) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(*a) >> 56); + if (a->data_type != BCH_DATA_free) { + if (!test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + ob = NULL; + goto err; + } + + prt_printf(&buf, "non free bucket in freespace btree\n" + " freespace key "); bch2_bkey_val_to_text(&buf, c, freespace_k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); 
goto err; - } - if (a->data_type != BCH_DATA_free) { - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); + if (genbits != (alloc_freespace_genbits(*a) >> 56) && + test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(*a) >> 56); bch2_bkey_val_to_text(&buf, c, freespace_k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; + } ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl); @@ -505,6 +511,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; + bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; struct bucket_alloc_state s = { 0 }; bool waiting = false; @@ -543,13 +550,18 @@ again: if (ob) return ob; } - - ob = likely(ca->mi.freespace_initialized) +alloc: + ob = likely(freespace) ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl); if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); + + if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + freespace = false; + goto alloc; + } err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 56bc58a7bfcf..ad3bf019487e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -549,6 +549,7 @@ enum { /* fsck passes: */ BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ + BCH_FS_CHECK_ALLOC_DONE, BCH_FS_CHECK_LRUS_DONE, BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, BCH_FS_FSCK_DONE, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index ebdf9f754e08..61890755d335 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1260,13 +1260,6 @@ use_clean: set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - bch_info(c, "checking need_discard and freespace btrees"); - err = "error checking need_discard and freespace btrees"; - ret = bch2_check_alloc_info(c); - if (ret) - goto err; - bch_verbose(c, "done checking need_discard and freespace btrees"); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); @@ -1291,6 +1284,15 @@ use_clean: if (c->opts.verbose || !c->sb.clean) bch_info(c, "journal replay done"); + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c); + if (ret) + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + + set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); + bch_info(c, "checking lrus"); err = "error checking lrus"; ret = bch2_check_lrus(c); @@ -1308,6 +1310,7 @@ use_clean: set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); } else { set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); -- cgit From 920e69bc3db88d3825c69190cafd43f0a1918d3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Jan 2023 00:00:50 -0500 Subject: bcachefs: Btree write buffer This adds a new method of doing btree updates - a straight write buffer, implemented as a 
flat fixed size array. This is only useful when we don't need to read from the btree in order to do the update, and when reading is infrequent - perfect for the LRU btree. This will make LRU btree updates fast enough that we'll be able to use it for persistently indexing buckets by fragmentation, which will be a massive boost to copygc performance. Changes: - A new btree_insert_type enum, for btree_insert_entries. Specifies btree, btree key cache, or btree write buffer. - bch2_trans_update_buffered(): updates via the btree write buffer don't need a btree path, so we need a new update path. - Transaction commit path changes: The update to the btree write buffer both mutates global, and can fail if there isn't currently room. Therefore we do all write buffer updates in the transaction all at once, and also if it fails we have to revert filesystem usage counter changes. If there isn't room we flush the write buffer in the transaction commit error path and retry. - A new persistent option, for specifying the number of entries in the write buffer. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/btree_iter.c | 17 +- fs/bcachefs/btree_types.h | 3 + fs/bcachefs/btree_update.h | 12 ++ fs/bcachefs/btree_update_leaf.c | 172 +++++++++++++++-- fs/bcachefs/btree_write_buffer.c | 330 +++++++++++++++++++++++++++++++++ fs/bcachefs/btree_write_buffer.h | 14 ++ fs/bcachefs/btree_write_buffer_types.h | 44 +++++ fs/bcachefs/buckets.c | 41 ++++ fs/bcachefs/buckets.h | 1 + fs/bcachefs/errcode.h | 2 + fs/bcachefs/opts.h | 5 + fs/bcachefs/super.c | 3 + fs/bcachefs/trace.h | 45 +++++ 16 files changed, 677 insertions(+), 21 deletions(-) create mode 100644 fs/bcachefs/btree_write_buffer.c create mode 100644 fs/bcachefs/btree_write_buffer.h create mode 100644 fs/bcachefs/btree_write_buffer_types.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 966c9b9a74fc..c0e715760c8b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -16,6 +16,7 @@ bcachefs-y := \ btree_locking.o \ btree_update_interior.o \ btree_update_leaf.o \ + btree_write_buffer.o \ buckets.o \ buckets_waiting_for_journal.o \ chardev.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ad3bf019487e..91f635faccb0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -391,6 +391,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" #include "clock_types.h" @@ -575,6 +576,7 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; + unsigned wb_updates_size; unsigned max_mem; char *max_paths_text; }; @@ -789,6 +791,8 @@ struct bch_fs { struct btree_key_cache btree_key_cache; unsigned btree_key_cache_btrees; + struct btree_write_buffer btree_write_buffer; + struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. 
*/ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d96efc8338d5..8e070402e73f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1404,7 +1404,8 @@ struct bch_sb_field_disk_groups { x(trans_traverse_all, 71) \ x(transaction_commit, 72) \ x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, @@ -1633,6 +1634,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); /* * Features: diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 21f12e522360..4ac1364acc8b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1374,6 +1374,7 @@ noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); @@ -1398,6 +1399,17 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } + trans_for_each_wb_update(trans, wb) { + prt_printf(buf, "update: btree=%s wb=1 %pS", + bch2_btree_ids[wb->btree], + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); + prt_newline(buf); + } + printbuf_indent_sub(buf, 2); } @@ -2929,8 +2941,11 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ trans->mem_bytes = expected_mem_bytes; } } - if (s) + + if (s) { trans->nr_max_paths = s->nr_max_paths; + trans->wb_updates_size = s->wb_updates_size; + } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 93c928a93dca..153ae548a89a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -419,6 +419,8 @@ struct btree_trans { u8 fn_idx; u8 nr_sorted; u8 nr_updates; + u8 nr_wb_updates; + u8 wb_updates_size; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -448,6 +450,7 @@ struct btree_trans { u8 sorted[BTREE_ITER_MAX + 8]; struct btree_path *paths; struct btree_insert_entry *updates; + struct btree_write_buffered_key *wb_updates; /* update path: */ struct btree_trans_commit_hook *hooks; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 673c3a78aae2..96d27e34d5b1 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,6 +15,9 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); +void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, + struct bkey_i *, u64); + enum btree_insert_flags { /* First two bits for journal watermark: */ __BTREE_INSERT_NOFAIL = 2, @@ -77,6 +80,8 @@ int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum 
btree_update_flags); +int __must_check bch2_trans_update_buffered(struct btree_trans *, + enum btree_id, struct bkey_i *); void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); @@ -142,6 +147,11 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_wb_update(_trans, _i) \ + for ((_i) = (_trans)->wb_updates; \ + (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ + (_i)++) + static inline void bch2_trans_reset_updates(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -151,6 +161,8 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->extra_journal_res = 0; trans->nr_updates = 0; + trans->nr_wb_updates = 0; + trans->wb_updates = NULL; trans->hooks = NULL; trans->extra_journal_entries.nr = 0; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 84f79affbe07..8169f2b89848 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -8,6 +8,7 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "debug.h" #include "errcode.h" @@ -100,9 +101,6 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, { struct bch_fs *c = trans->c; - if (path->cached) - return; - if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_trans_node_reinit_iter(trans, b); @@ -252,25 +250,26 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, /** * btree_insert_key - insert a key one key into a leaf node */ -static void btree_insert_key_leaf(struct btree_trans *trans, - struct btree_insert_entry *insert) +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) { struct bch_fs *c = trans->c; - struct btree *b = insert_l(insert)->b; + struct btree *b = path_l(path)->b; struct bset_tree *t = bset_tree_last(b); struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, - &insert_l(insert)->iter, insert->k))) + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) return; - i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, - le64_to_cpu(i->journal_seq))); + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + bch2_btree_add_journal_pin(c, b, journal_seq); if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty_acct(c, b); @@ -288,6 +287,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_trans_node_reinit_iter(trans, b); } +static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) +{ + bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq); +} + /* Cached btree updates: */ /* Normal update interface: */ @@ -594,6 +599,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; bool marking = false; @@ -638,6 +644,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, marking = true; } + if (trans->nr_wb_updates && + trans->nr_wb_updates 
+ c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + /* * Don't get journal reservation until after we know insert will * succeed: @@ -674,17 +684,25 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ret = run_one_mem_trigger(trans, i, i->flags); if (ret) - return ret; + goto fatal_err; } if (unlikely(c->gc_pos.phase)) { ret = bch2_trans_commit_run_gc_triggers(trans); if (ret) - return ret; + goto fatal_err; } if (unlikely(trans->extra_journal_entries.nr)) { @@ -697,10 +715,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans_for_each_update(trans, i) { - struct journal *j = &c->journal; - struct jset_entry *entry; + struct journal *j = &c->journal; + struct jset_entry *entry; + trans_for_each_update(trans, i) { if (i->key_cache_already_flushed) continue; @@ -725,6 +743,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bkey_copy(&entry->start[0], i->k); } + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy(&entry->start[0], &wb->k); + } + if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } @@ -742,6 +768,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } } + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); return ret; } @@ -769,7 +801,8 @@ static inline int trans_lock_write(struct btree_trans *trans) if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) return trans_lock_write_fail(trans, i); - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; @@ -778,9 +811,13 @@ static inline int trans_lock_write(struct btree_trans *trans) static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } #ifdef CONFIG_BCACHEFS_DEBUG @@ -821,10 +858,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct printbuf buf = PRINTBUF; int ret, u64s_delta = 0; #ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + trans_for_each_update(trans, i) { int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; @@ -833,8 +871,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); } -#endif printbuf_exit(&buf); +#endif trans_for_each_update(trans, i) { if (i->cached) @@ -962,6 +1000,30 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, if (ret) trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_reset_updates(trans); + bch2_trans_unlock(trans); + + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + else + mutex_unlock(&wb->flush_lock); + + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } + break; + } default: BUG_ON(ret >= 0); break; @@ -1023,10 +1085,12 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; unsigned u64s; int ret = 0; if (!trans->nr_updates && + !trans->nr_wb_updates && !trans->extra_journal_entries.nr) goto out_reset; @@ -1049,6 +1113,20 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; } + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); @@ -1089,6 +1167,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans->journal_u64s += jset_u64s(i->old_k.u64s); } + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + if (trans->extra_journal_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, @@ -1606,6 +1687,59 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter return bch2_trans_update_by_path(trans, path, k, flags); } +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + 
trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + void bch2_trans_commit_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 index 000000000000..84c3e6ddb38e --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" + +#include + +static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: + bpos_cmp(l->k.k.p, r->k.k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + +static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb, + unsigned commit_flags, + bool *write_locked, + size_t *fast) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + path = iter->path; + + if (!*write_locked) { + ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, path->l[0].b); + *write_locked = true; + } + + if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + goto trans_commit; + } + + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); + (*fast)++; + return 0; +trans_commit: + return bch2_trans_update(trans, iter, &wb->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM); +} + +static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) +{ + union btree_write_buffer_state old, new; + u64 v = READ_ONCE(wb->state.v); + + do { + old.v = new.v = v; + + new.nr = 0; + new.idx++; + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + while (old.idx == 0 ? 
wb->state.ref0 : wb->state.ref1) + cpu_relax(); + + smp_mb(); + + return old; +} + +int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, + bool locked) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct journal_entry_pin pin; + struct btree_write_buffered_key *i, *dst, *keys; + struct btree_iter iter = { NULL }; + size_t nr = 0, skipped = 0, fast = 0; + bool write_locked = false; + union btree_write_buffer_state s; + int ret = 0; + + memset(&pin, 0, sizeof(pin)); + + if (!locked && !mutex_trylock(&wb->flush_lock)) + return 0; + + bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); + bch2_journal_pin_drop(j, &wb->journal_pin); + + s = btree_write_buffer_switch(wb); + keys = wb->keys[s.idx]; + nr = s.nr; + + /* + * We first sort so that we can detect and skip redundant updates, and + * then we attempt to flush in sorted btree order, as this is most + * efficient. + * + * However, since we're not flushing in the order they appear in the + * journal we won't be able to drop our journal pin until everything is + * flushed - which means this could deadlock the journal, if we weren't + * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail + * if it would block taking a journal reservation. + * + * If that happens, we sort them by the order they appeared in the + * journal - after dropping redundant entries - and then restart + * flushing, this time dropping journal pins as we go. + */ + + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_key_cmp, NULL); + + for (i = keys; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) { + skipped++; + continue; + } + + if (write_locked && + (iter.path->btree_id != i->btree || + bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + write_locked = false; + } + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); + iter.path->preserve = false; + + do { + ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, + commit_flags, &write_locked, &fast); + if (!write_locked) + bch2_trans_begin(trans); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + break; + } + + if (write_locked) + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + bch2_trans_iter_exit(trans, &iter); + + trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); + + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + goto slowpath; + + bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); +out: + bch2_journal_pin_drop(j, &pin); + mutex_unlock(&wb->flush_lock); + return ret; +slowpath: + trace_write_buffer_flush_slowpath(trans, i - keys, nr); + + dst = keys; + for (; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) + continue; + + *dst = *i; + dst++; + } + nr = dst - keys; + + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_journal_cmp, + NULL); + + for (i = keys; i < keys + nr; i++) { + if (i->journal_seq > pin.seq) { + struct journal_entry_pin pin2; + + memset(&pin2, 0, sizeof(pin2)); + + bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); + bch2_journal_pin_drop(j, &pin); + bch2_journal_pin_copy(j, &pin, &pin2, 
NULL); + bch2_journal_pin_drop(j, &pin2); + } + + ret = commit_do(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM| + JOURNAL_WATERMARK_reserved, + __bch2_btree_insert(trans, i->btree, &i->k)); + if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) + break; + } + + goto out; +} + +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + mutex_lock(&trans->c->btree_write_buffer.flush_lock); + return __bch2_btree_write_buffer_flush(trans, 0, true); +} + +int bch2_btree_write_buffer_flush(struct btree_trans *trans) +{ + return __bch2_btree_write_buffer_flush(trans, 0, false); +} + +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_lock(&wb->flush_lock); + + return bch2_trans_run(c, + __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); +} + +static inline u64 btree_write_buffer_ref(int idx) +{ + return ((union btree_write_buffer_state) { + .ref0 = idx == 0, + .ref1 = idx == 1, + }).v; +} + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; + int ret = 0; + u64 v; + + trans_for_each_wb_update(trans, i) { + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + i->journal_seq = trans->journal_res.seq; + i->journal_offset = trans->journal_res.offset; + } + + preempt_disable(); + v = READ_ONCE(wb->state.v); + do { + old.v = new.v = v; + + new.v += btree_write_buffer_ref(new.idx); + new.nr += trans->nr_wb_updates; + if (new.nr > wb->size) { + ret = -BCH_ERR_btree_insert_need_flush_buffer; + goto out; + } + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + memcpy(wb->keys[new.idx] + old.nr, + trans->wb_updates, + sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_btree_write_buffer_journal_flush); + + atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +out: + preempt_enable(); + return ret; +} + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + + kvfree(wb->keys[1]); + kvfree(wb->keys[0]); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_init(&wb->flush_lock); + wb->size = c->opts.btree_write_buffer_size; + + wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); + wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); + if (!wb->keys[0] || !wb->keys[1]) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h new file mode 100644 index 000000000000..322df1c8304e --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_H + +int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int 
bch2_btree_write_buffer_flush(struct btree_trans *); + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *); + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +int bch2_fs_btree_write_buffer_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h new file mode 100644 index 000000000000..99993ba77aea --- /dev/null +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H + +#include "journal_types.h" + +#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 +#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) + +struct btree_write_buffered_key { + u64 journal_seq; + unsigned journal_offset; + enum btree_id btree; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; + +union btree_write_buffer_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 nr:23; + u64 idx:1; + u64 ref0:20; + u64 ref1:20; + }; +}; + +struct btree_write_buffer { + struct mutex flush_lock; + struct journal_entry_pin journal_pin; + + union btree_write_buffer_state state; + size_t size; + + struct btree_write_buffered_key *keys[2]; +}; + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 153987376b89..86f48f5762dd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1278,6 +1278,47 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *dst; + struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; + s64 added = 0; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + /* revert changes: */ + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); + } + + dst->nr_inodes -= deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added -= deltas->persistent_reserved[i]; + dst->reserved -= deltas->persistent_reserved[i]; + dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; + } + + if (added > 0) { + trans->disk_res->sectors += added; + this_cpu_add(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); +} + int bch2_trans_fs_usage_apply(struct btree_trans *trans, struct replicas_delta_list *deltas) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 0fc101b9aaf1..e8e3a3b09714 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -229,6 +229,7 @@ int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git 
a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index bb296edcf4f7..7a6448f48fca 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -42,6 +42,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ @@ -58,6 +59,7 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(0, fsck) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c6025172f32e..85927b306014 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -206,6 +206,11 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(btree_write_buffer_size, u32, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(16, (1U << 20) - 1), \ + BCH2_NO_SB_OPT, 1U << 13, \ + NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e7e3dcbe2339..ade8d074e887 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -16,6 +16,7 @@ #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "btree_write_buffer.h" #include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" @@ -463,6 +464,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_compress_exit(c); bch2_journal_keys_free(&c->journal_keys); bch2_journal_entries_free(c); + bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); @@ -816,6 +818,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: bch2_fs_encryption_init(c) ?: diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 17fc58e73702..937fd132bfd2 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1111,6 +1111,51 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(write_buffer_flush, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), + TP_ARGS(trans, nr, skipped, fast, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, skipped ) + __field(size_t, fast ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->skipped = skipped; + __entry->fast = fast; + __entry->size = size; + ), + + TP_printk("%zu/%zu skipped %zu fast %zu", + __entry->nr, __entry->size, __entry->skipped, __entry->fast) +); + +TRACE_EVENT(write_buffer_flush_slowpath, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), + TP_ARGS(trans, nr, size), + + TP_STRUCT__entry( + 
__field(size_t, nr ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->size = size; + ), + + TP_printk("%zu/%zu", __entry->nr, __entry->size) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit From a8c752bb1d93a24a0de753e209d4f4d58d65c878 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Mar 2022 20:51:27 -0400 Subject: bcachefs: New on disk format: Backpointers This patch adds backpointers: we now have a reverse index from device and offset on that device (specifically, offset within a bucket) back to btree nodes and (non cached) data extents. The first 40 backpointers within a bucket are stored in the alloc key; after that backpointers spill over to the next backpointers btree. This is to help avoid performance regressions from additional btree updates on large streaming workloads. This patch adds all the code for creating, checking and repairing backpointers. The next patch in the series is going to use backpointers for copygc - finally getting rid of the need to scan all extents to do copygc. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_background.c | 104 +++++- fs/bcachefs/alloc_background.h | 16 +- fs/bcachefs/alloc_foreground.c | 23 ++ fs/bcachefs/backpointers.c | 799 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/backpointers.h | 131 +++++++ fs/bcachefs/bcachefs.h | 7 +- fs/bcachefs/bcachefs_format.h | 62 ++-- fs/bcachefs/bkey_methods.c | 4 + fs/bcachefs/buckets.c | 48 ++- fs/bcachefs/buckets.h | 19 + fs/bcachefs/errcode.h | 1 + fs/bcachefs/recovery.c | 35 +- fs/bcachefs/super.c | 2 + 14 files changed, 1186 insertions(+), 66 deletions(-) create mode 100644 fs/bcachefs/backpointers.c create mode 100644 fs/bcachefs/backpointers.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index c0e715760c8b..456d540441ce 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o bcachefs-y := \ alloc_background.o \ alloc_foreground.o \ + backpointers.o \ bkey.o \ bkey_methods.o \ bkey_sort.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f75d05beaf31..58ec650a512c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -266,12 +267,34 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { - prt_printf(err, "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); + if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%lu != %u)", + bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); return -BCH_ERR_invalid_bkey; } + if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { + prt_printf(err, "invalid backpointers_start"); + return -BCH_ERR_invalid_bkey; + } + + /* + * XXX this is wrong, we'll be checking updates that happened from + * before BCH_FS_CHECK_BACKPOINTERS_DONE + */ + if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + unsigned i, bp_len = 0; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) + bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + + if (bp_len > a.v->dirty_sectors) { + 
prt_printf(err, "too many backpointers"); + return -BCH_ERR_invalid_bkey; + } + } + if (rw == WRITE) { if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { prt_printf(err, "invalid data type (got %u should be %u)", @@ -328,9 +351,19 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); @@ -340,12 +373,20 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { + bp->bucket_offset = swab40(bp->bucket_offset); + bp->bucket_len = swab32(bp->bucket_len); + bch2_bpos_swab(&bp->pos); + } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + unsigned i; prt_newline(out); printbuf_indent_add(out, 2); @@ -374,14 +415,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_newline(out); prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); prt_newline(out); - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); + prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_newline(out); - printbuf_indent_sub(out, 2); -} + if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { + struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); -static inline void *alloc_v4_backpointers(struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); + printbuf_indent_add(out, 2); + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } + + printbuf_indent_sub(out, 2); + } + + printbuf_indent_sub(out, 2); } void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) @@ -422,12 +474,18 @@ static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; - if (k.k->type == KEY_TYPE_alloc_v4) { - unsigned bytes = min(sizeof(struct bkey_i_alloc_v4), bkey_bytes(k.k)); + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + unsigned bytes = sizeof(struct bkey_i_alloc_v4) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) * + sizeof(struct bch_backpointer); void *src, *dst; - ret = bch2_trans_kmalloc(trans, bytes); + /* + * Reserve space for one more backpointer here: + * Not sketchy at doing it this way, nope... 
+ */ + ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); if (IS_ERR(ret)) return ret; @@ -437,16 +495,20 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); + memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * + sizeof(struct bch_backpointer)); if (src < dst) memset(src, 0, dst - src); set_alloc_v4_u64s(ret); } else { - ret = bch2_trans_kmalloc(trans, sizeof(*ret)); - if (!IS_ERR(ret)) { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } + ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) + + sizeof(struct bch_backpointer)); + if (IS_ERR(ret)) + return ret; + + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); } return ret; } @@ -455,8 +517,12 @@ static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_ { if (likely(k.k->type == KEY_TYPE_alloc_v4) && BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { + /* + * Reserve space for one more backpointer here: + * Not sketchy at doing it this way, nope... + */ struct bkey_i_alloc_v4 *ret = - bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); + bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); if (!IS_ERR(ret)) bkey_reassemble(&ret->k_i, k); return ret; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c562aff3ac33..b843316d3846 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -73,7 +73,9 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) { unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0); + BCH_ALLOC_V4_U64s_V0) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * + (sizeof(struct bch_backpointer) / sizeof(u64)); BUG_ON(ret > U8_MAX - BKEY_U64s); return ret; @@ -175,6 +177,18 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, void bch2_do_invalidates(struct bch_fs *); +static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + + (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0)); +} + +static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ba14cfe06515..5988aa288c98 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -14,6 +14,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_gc.h" @@ -346,6 +347,28 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + struct bch_backpointer bp; + u64 bp_offset = 0; + + ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + &bp_offset, &bp); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + if (bp_offset != U64_MAX) { + /* + * Bucket may have data in it - we don't call + * bc2h_trans_inconnsistent() because fsck hasn't + * finished yet + */ + ob = NULL; + goto 
err; + } + } + ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl); if (!ob) iter.path->preserve = false; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 index 000000000000..6efc286cd6ba --- /dev/null +++ b/fs/bcachefs/backpointers.c @@ -0,0 +1,799 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "error.h" + +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bpos bucket, + struct bch_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bch_backpointer bp2; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, + &bucket2, &bp2); + if (bpos_eq(bucket, bucket2) && + !memcmp(&bp, &bp2, sizeof(bp))) + return true; + } + + return false; +} + +int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + + if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { + prt_str(err, "incorrect value size"); + return -BCH_ERR_invalid_bkey; + } + + if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { + prt_str(err, "backpointer at wrong pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", + bch2_btree_ids[bp->btree_id], + bp->level, + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); + bch2_bpos_to_text(out, bp->pos); +} + +void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); +} + +void bch2_backpointer_swab(struct bkey_s k) +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + + bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} + +#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) + +static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) +{ + return cmp_int(l.bucket_offset, r.bucket_offset); +} + +static int bch2_backpointer_del_by_offset(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (bp_offset < BACKPOINTER_OFFSET_MAX) { + struct bch_backpointer *bps; + struct bkey_i_alloc_v4 *a; + unsigned i, nr; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_alloc_v4) { + ret = -ENOENT; + goto err; + } + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + bps = alloc_v4_backpointers(&a->v); + nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + + for (i = 0; i < nr; i++) { + if (bps[i].bucket_offset == bp_offset) + goto found; + if (bps[i].bucket_offset > 
bp_offset) + break; + } + + ret = -ENOENT; + goto err; +found: + if (memcmp(&bps[i], &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + array_remove_item(bps, nr, i); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } else { + bp_offset -= BACKPOINTER_OFFSET_MAX; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +bool bch2_bucket_backpointer_del(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp) +{ + struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) { + array_remove_item(bps, nr, i); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + return true; + } + if (cmp >= 0) + break; + } + + return false; +} + +static noinline int backpointer_mod_err(struct btree_trans *trans, + struct bch_backpointer bp, + struct bkey_s_c bp_k, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + if (insert) { + prt_printf(&buf, "existing backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + + prt_printf(&buf, "got "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + + if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + bch2_inconsistent_error(c); + return -EIO; + } else { + return 0; + } +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(k); + if (ret) + goto 
err; + + if (insert + ? k.k->type + : (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { + ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +/* + * Find the next backpointer >= *bp_offset: + */ +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, + struct bch_backpointer *dst) +{ + struct bch_fs *c = trans->c; + struct bpos bp_pos, bp_end_pos; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bkey_s_c k; + struct bkey_s_c_alloc_v4 a; + size_t i; + int ret; + + if (*bp_offset == U64_MAX) + return 0; + + bp_pos = bucket_pos_to_bp(c, bucket, + max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); + bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto out; + + if (k.k->type != KEY_TYPE_alloc_v4) + goto done; + + a = bkey_s_c_to_alloc_v4(k); + if (gen >= 0 && a.v->gen != gen) + goto done; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { + if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) + continue; + + *dst = alloc_v4_backpointers_c(a.v)[i]; + *bp_offset = dst->bucket_offset; + goto out; + } + + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, + bp_pos, 0, k, ret) { + if (bpos_ge(k.k->p, bp_end_pos)) + break; + + if (k.k->type != KEY_TYPE_backpointer) + continue; + + *dst = *bkey_s_c_to_backpointer(k).v; + *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; + goto out; + } +done: + *bp_offset = U64_MAX; +out: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp, + struct bkey_s_c k, + const char *thing_it_points_to) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + if (likely(!bch2_backpointers_no_use_write_buffer)) + return; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + thing_it_points_to); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); + + if (bp_offset >= BACKPOINTER_OFFSET_MAX) { + struct bpos bp_pos = + bucket_pos_to_bp(c, bucket, + bp_offset - BACKPOINTER_OFFSET_MAX); + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); + } + + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err_ratelimited(c, "%s", buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); + + printbuf_exit(&buf); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + min(bp.level, c->btree_roots[bp.btree_id].level), + 0); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bp.level == c->btree_roots[bp.btree_id].level + 1) + k = 
bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (unlikely(bch2_backpointers_no_use_write_buffer)) { + if (bp.level) { + struct btree *b; + + /* + * If a backpointer for a btree node wasn't found, it may be + * because it was overwritten by a new btree node that hasn't + * been written out yet - backpointer_get_node() checks for + * this: + */ + b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + if (!IS_ERR_OR_NULL(b)) + return bkey_i_to_s_c(&b->key); + + bch2_trans_iter_exit(trans, iter); + + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); + return bkey_s_c_null; + } + + backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); + } + + return bkey_s_c_null; +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree *b; + + BUG_ON(!bp.level); + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + bp.level - 1, + 0); + b = bch2_btree_iter_peek_node(iter); + if (IS_ERR(b)) + goto err; + + if (b && extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) + return b; + + if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bucket, bp_offset, bp, + bkey_i_to_s_c(&b->key), "btree node"); + b = NULL; + } +err: + bch2_trans_iter_exit(trans, iter); + return b; +} + +static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; + struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for mising device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); + + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +} + +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + bch2_check_btree_backpointer(&trans, &iter, k))); +} + +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket_pos, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c alloc_k, bp_k; + int ret; + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 
0); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto err; + + if (alloc_k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); + + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) + goto out; + if (cmp >= 0) + break; + } + } else { + goto missing; + } + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), + 0); + bp_k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) + goto missing; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", + bch2_btree_ids[bp.btree_id], bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nbp pos "); + bch2_bpos_to_text(&buf, bp_iter.pos); + + if (c->sb.version < bcachefs_metadata_version_backpointers || + c->opts.reconstruct_alloc || + fsck_err(c, "%s", buf.buf)) { + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); + + ret = PTR_ERR_OR_ZERO(a) ?: + bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true); + } + + goto out; +} + +static int check_extent_to_backpointers(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek_all_levels(iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k); + if (ret) + return ret; + } + + return 0; +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + int ret; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + c->btree_roots[btree_id].level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + BUG_ON(b != btree_node_root(c, b)); + + k = bkey_i_to_s_c(&b->key); + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + enum btree_id btree_id; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + unsigned depth = 
btree_type_has_ptrs(btree_id) ? 0 : 1; + + bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, + depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(&trans, &iter)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + break; + + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(&trans, btree_id)); + if (ret) + break; + } + bch2_trans_exit(&trans); + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bpos bucket, + u64 *bp_offset) +{ + struct btree_iter iter; + struct bch_backpointer bp; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ret = bch2_get_next_backpointer(trans, bucket, -1, + bp_offset, &bp); + if (ret || *bp_offset == U64_MAX) + return ret; + + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (fsck_err_on(!k.k, trans->c, + "%s backpointer points to missing extent\n%s", + *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree", + (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { + ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); + if (ret == -ENOENT) + bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + } + + bch2_trans_iter_exit(trans, &iter); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + + while (!(ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(&trans, iter.pos, &bp_offset))) && + bp_offset < U64_MAX) + bp_offset++; + + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 index 000000000000..e1506492f022 --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, + int, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + +#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, \ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ +}) + +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bpos ret; + + ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + + BUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + + return ret; +} + +bool bch2_bucket_backpointer_del(struct btree_trans *, + struct bkey_i_alloc_v4 *, + struct bch_backpointer); + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, + struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c, bool); + +static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + int ret; + + if (!insert && + unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) && + bch2_bucket_backpointer_del(trans, a, bp)) + return 0; + + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert); + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); +} + +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + s64 sectors = level ? 
btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} + +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, + u64 *, struct bch_backpointer *); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); + +int bch2_check_btree_backpointers(struct bch_fs *); +int bch2_check_extents_to_backpointers(struct bch_fs *); +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91f635faccb0..6d048e5d8843 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -315,7 +315,10 @@ do { \ "done in memory") \ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ "When reading btree nodes, read all replicas and " \ - "compare them") + "compare them") \ + BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ + "Don't use the write buffer for backpointers, enabling "\ + "extra runtime checks") /* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ @@ -435,6 +438,7 @@ enum gc_phase { GC_PHASE_BTREE_lru, GC_PHASE_BTREE_freespace, GC_PHASE_BTREE_need_discard, + GC_PHASE_BTREE_backpointers, GC_PHASE_PENDING_DELETE, }; @@ -552,6 +556,7 @@ enum { BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ BCH_FS_CHECK_ALLOC_DONE, BCH_FS_CHECK_LRUS_DONE, + BCH_FS_CHECK_BACKPOINTERS_DONE, BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, BCH_FS_FSCK_DONE, BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8e070402e73f..66c885186160 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -369,7 +369,8 @@ static inline void bkey_init(struct bkey *k) x(alloc_v3, 24) \ x(set, 25) \ x(lru, 26) \ - x(alloc_v4, 27) + x(alloc_v4, 27) \ + x(backpointer, 28) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -890,6 +891,12 @@ struct bch_alloc { x(stripe, 32) \ x(stripe_redundancy, 8) +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + struct bch_alloc_v2 { struct bch_val v; __u8 nr_fields; @@ -918,6 +925,9 @@ struct bch_alloc_v3 { __u8 data[]; } __packed __aligned(8); +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + struct bch_alloc_v4 { struct bch_val v; __u64 journal_seq; @@ -931,25 +941,27 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; - struct bpos backpointers[0]; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 #define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) 
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; +#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 + +struct bch_backpointer { + struct bch_val v; + __u8 btree_id; + __u8 level; + __u8 data_type; + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; +} __packed __aligned(8); /* Quotas: */ @@ -1486,7 +1498,8 @@ struct bch_sb_field_journal_seq_blacklist { x(inode_v2, 18) \ x(freespace, 19) \ x(alloc_v4, 20) \ - x(new_data_types, 21) + x(new_data_types, 21) \ + x(backpointers, 22) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -2007,19 +2020,20 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ #define BCH_BTREE_IDS() \ - x(extents, 0) \ - x(inodes, 1) \ - x(dirents, 2) \ - x(xattrs, 3) \ - x(alloc, 4) \ - x(quotas, 5) \ - x(stripes, 6) \ - x(reflink, 7) \ - x(subvolumes, 8) \ - x(snapshots, 9) \ - x(lru, 10) \ - x(freespace, 11) \ - x(need_discard, 12) + x(extents, 0) \ + x(inodes, 1) \ + x(dirents, 2) \ + x(xattrs, 3) \ + x(alloc, 4) \ + x(quotas, 5) \ + x(stripes, 6) \ + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) \ + x(backpointers, 13) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 29809da5e9cf..45c8b2c61c5b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_methods.h" #include "btree_types.h" #include "alloc_background.h" @@ -191,6 +192,9 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_need_discard] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_set), + [BKEY_TYPE_backpointers] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_backpointer), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 86f48f5762dd..b657f8545a3b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "backpointers.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -662,16 +663,6 @@ err: return ret; } -static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); - - return crc_is_compressed(p.crc) - ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; -} - static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, const struct bch_extent_ptr *ptr, @@ -1399,22 +1390,42 @@ need_mark: /* trans_mark: */ -static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static inline int bch2_trans_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + unsigned flags) { + bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; struct bkey_i_alloc_v4 *a; + struct bpos bucket_pos; + struct bch_backpointer bp; + s64 sectors; int ret; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + sectors = bp.bucket_len; + if (!insert) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); if (IS_ERR(a)) return PTR_ERR(a); - ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, + ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); + &a->v.dirty_sectors, &a->v.cached_sectors); + if (ret) + goto err; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1497,8 +1508,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_trans_mark_pointer(trans, k, p, - disk_sectors, data_type); + ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); if (ret < 0) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e8e3a3b09714..3398c9c3a81b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -75,6 +75,15 @@ static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u32 *bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); +} + static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { @@ -90,6 +99,16 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } +static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +{ + EBUG_ON(sectors < 0); + + return crc_is_compressed(p.crc) + ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +} + static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 7a6448f48fca..804bc15dce31 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -60,6 +60,7 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ + x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(0, fsck) \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 61890755d335..55356c117737 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" @@ -925,6 +926,7 @@ static bool btree_id_is_alloc(enum btree_id id) { switch (id) { case BTREE_ID_alloc: + case BTREE_ID_backpointers: case BTREE_ID_need_discard: case BTREE_ID_freespace: return true; @@ -1091,8 +1093,8 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_new_data_types) { - bch_info(c, "version prior to new_data_types, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_backpointers) { + bch_info(c, "version prior to backpointers, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; @@ -1301,6 +1303,28 @@ use_clean: bch_verbose(c, "done checking lrus"); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + bch_info(c, "checking backpointers to alloc keys"); + err = "error checking backpointers to alloc keys"; + ret = bch2_check_btree_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to alloc keys"); + + bch_info(c, "checking backpointers to extents"); + err = "error checking backpointers to extents"; + ret = bch2_check_backpointers_to_extents(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to extents"); + + bch_info(c, "checking extents to backpointers"); + err = "error checking extents to backpointers"; + ret = bch2_check_extents_to_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking extents to backpointers"); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + bch_info(c, "checking alloc to lru refs"); err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); @@ -1312,6 +1336,7 @@ use_clean: set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); @@ -1471,6 +1496,9 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + if (c->sb.version < bcachefs_metadata_version_backpointers) + c->opts.version_upgrade = true; + if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1479,6 +1507,9 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); 
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ade8d074e887..c5efaa7d38a8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1431,6 +1431,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) -- cgit From 8e3f913e2ab6ac2cb9e75a0a8635d0b44f838c33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Mar 2022 00:42:09 -0400 Subject: bcachefs: Copygc now uses backpointers Previously, copygc needed to walk the entire extents & reflink btrees to find extents that needed to be moved. Now that we have backpointers, this patch implements bch2_evacuate_bucket() in the move code, which copygc now uses for evacuating mostly empty buckets. Also, thanks to the new backpointers code, copygc can now move btree nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_types.h | 2 +- fs/bcachefs/move.c | 291 +++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/move.h | 10 ++ fs/bcachefs/movinggc.c | 236 +++++------------------------------ fs/bcachefs/trace.h | 31 +++++ 5 files changed, 346 insertions(+), 224 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 0a9dd5af3524..1dbba7d906dd 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -95,7 +95,7 @@ struct copygc_heap_entry { u8 replicas; u32 fragmentation; u32 sectors; - u64 offset; + u64 bucket; }; typedef HEAP(struct copygc_heap_entry) copygc_heap; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 46677ad911cd..690c3128c5e1 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,14 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" +#include "error.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -66,6 +70,9 @@ static void move_write_done(struct bch_write_op *op) struct moving_io *io = container_of(op, struct moving_io, write.op); struct moving_context *ctxt = io->write.ctxt; + if (io->write.op.error) + ctxt->write_error = true; + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); move_free(io); closure_put(&ctxt->cl); @@ -401,6 +408,30 @@ static int move_ratelimit(struct btree_trans *trans, return 0; } +static int move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c k, u64 *cur_inum) +{ + struct bch_inode_unpacked inode; + int ret; + + if (*cur_inum == k.k->p.inode) + return 0; + + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret) + bch2_inode_opts_get(io_opts, trans->c, &inode); + else + *io_opts = 
bch2_opts_to_inode_opts(trans->c->opts); + *cur_inum = k.k->p.inode; + return 0; +} + static int __bch2_move_data(struct moving_context *ctxt, struct bpos start, struct bpos end, @@ -452,23 +483,9 @@ static int __bch2_move_data(struct moving_context *ctxt, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_extents && - cur_inum != k.k->p.inode) { - struct bch_inode_unpacked inode; - - io_opts = bch2_opts_to_inode_opts(c->opts); - - ret = lookup_inode(&trans, - SPOS(0, k.k->p.inode, k.k->p.snapshot), - &inode); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - - if (!ret) - bch2_inode_opts_get(&io_opts, c, &inode); - - cur_inum = k.k->p.inode; - } + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; memset(&data_opts, 0, sizeof(data_opts)); if (!pred(c, arg, k, &io_opts, &data_opts)) @@ -549,6 +566,246 @@ int bch2_move_data(struct bch_fs *c, return ret; } +static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + struct bch_backpointer bp; + u64 bp_offset = 0; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); +again: + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + + if (!ret && k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + + if (a.v->gen == gen && + a.v->dirty_sectors) { + if (a.v->data_type == BCH_DATA_btree) { + bch2_trans_unlock(trans); + if (bch2_btree_interior_updates_flush(c)) + goto again; + goto failed_to_evacuate; + } + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +failed_to_evacuate: + bch2_trans_iter_exit(trans, &iter); + + prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket ")); + bch2_bkey_val_to_text(&buf, c, k); + + while (1) { + bch2_trans_begin(trans); + + ret = bch2_get_next_backpointer(trans, bucket, gen, + &bp_offset, &bp); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + if (bp_offset == U64_MAX) + break; + + k = bch2_backpointer_get_key(trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + if (!k.k) + continue; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_iter_exit(trans, &iter); + } + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + return 0; +} + +int __bch2_evacuate_bucket(struct moving_context *ctxt, + struct bpos bucket, int gen, + struct data_update_opts _data_opts) +{ + struct bch_fs *c = ctxt->c; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c k; + struct data_update_opts data_opts; + unsigned dirty_sectors, bucket_size; + u64 bp_offset = 0, cur_inum = U64_MAX; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + ret = lockrestart_do(&trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret)); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + dirty_sectors = 
a->dirty_sectors; + bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) { + bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret)); + goto err; + } + + while (!(ret = move_ratelimit(&trans, ctxt))) { + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (bp_offset == U64_MAX) + break; + + if (!bp.level) { + const struct bch_extent_ptr *ptr; + struct bkey_s_c k; + unsigned i = 0; + + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) { + bch2_trans_iter_exit(&trans, &iter); + continue; + } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; + data_opts.rewrite_ptrs = 0; + + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == bucket.inode) + data_opts.rewrite_ptrs |= 1U << i; + i++; + } + + ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, + bp.btree_id, k, data_opts); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; + } + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!b) + goto next; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, + c->opts.btree_node_size >> 9); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } +next: + bp_offset++; + } + + trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, ret); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { + bch2_trans_unlock(&trans); + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + if (!ctxt->write_error) + lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); + } +err: + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct data_update_opts data_opts, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct 
data_update_opts *); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 2eb6a15542e0..b14f679f6904 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -15,6 +15,7 @@ struct moving_context { struct bch_move_stats *stats; struct write_point_specifier wp; bool wait_on_copygc; + bool write_error; /* For waiting on outstanding reads and writes: */ struct closure cl; @@ -46,6 +47,15 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); +int __bch2_evacuate_bucket(struct moving_context *, + struct bpos, int, + struct data_update_opts); +int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct data_update_opts, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a04e2330d0e6..b420b79edb36 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -31,79 +31,6 @@ #include #include -static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: - cmp_int(l->offset, r->offset); -} - -static bool copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - copygc_heap *h = &c->copygc_heap; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - unsigned i = 0; - - /* - * We need to use the journal reserve here, because - * - journal reclaim depends on btree key cache - * flushing to make forward progress, - * - which has to make forward progress when the - * journal is pre-reservation full, - * - and depends on allocation - meaning allocator and - * copygc - */ - - data_opts->rewrite_ptrs = 0; - data_opts->target = io_opts->background_target; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - JOURNAL_WATERMARK_copygc; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct copygc_heap_entry search = { - .dev = p.ptr.dev, - .offset = p.ptr.offset, - }; - ssize_t eytz; - - if (p.ptr.cached) - continue; - - eytz = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); -#if 0 - /* eytzinger search verify code: */ - ssize_t j = -1, k; - - for (k = 0; k < h->used; k++) - if (h->data[k].offset <= ptr->offset && - (j < 0 || h->data[k].offset > h->data[j].offset)) - j = k; - - BUG_ON(i != j); -#endif - if (eytz >= 0 && - p.ptr.dev == h->data[eytz].dev && - p.ptr.offset < h->data[eytz].offset + ca->mi.bucket_size && - p.ptr.gen == h->data[eytz].gen) - data_opts->rewrite_ptrs |= 1U << i; - - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -111,7 +38,7 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } -static int walk_buckets_to_copygc(struct bch_fs *c) +static int find_buckets_to_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct btree_trans trans; @@ -121,6 +48,14 @@ static int walk_buckets_to_copygc(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap 
sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have been visited. + */ + h->used = 0; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); @@ -130,7 +65,8 @@ static int walk_buckets_to_copygc(struct bch_fs *c) a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_user || + if ((a->data_type != BCH_DATA_btree && + a->data_type != BCH_DATA_user) || a->dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; @@ -142,7 +78,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) .fragmentation = div_u64((u64) a->dirty_sectors * (1ULL << 31), ca->mi.bucket_size), .sectors = a->dirty_sectors, - .offset = bucket_to_sector(ca, iter.pos.offset), + .bucket = iter.pos.offset, }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -153,77 +89,22 @@ static int walk_buckets_to_copygc(struct bch_fs *c) return ret; } -static int bucket_inorder_cmp(const void *_l, const void *_r) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); -} - -static int check_copygc_was_done(struct bch_fs *c, - u64 *sectors_not_moved, - u64 *buckets_not_moved) -{ - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; - struct copygc_heap_entry *i; - int ret = 0; - - sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); - - for (i = h->data; i < h->data + h->used; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - - bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); - - ret = lockrestart_do(&trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - break; - - bch2_alloc_to_v4(k, &a); - - if (a.gen == i->gen && a.dirty_sectors) { - *sectors_not_moved += a.dirty_sectors; - *buckets_not_moved += 1; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e, *i; + struct copygc_heap_entry e; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; - u64 sectors_reserved = 0; - u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; size_t heap_size = 0; - int ret; + struct moving_context ctxt; + struct data_update_opts data_opts = { + .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, + }; + int ret = 0; bch2_move_stats_init(&move_stats, "copygc"); - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. 
- */ - h->used = 0; - for_each_rw_member(ca, c, dev_idx) heap_size += ca->mi.nbuckets >> 7; @@ -235,21 +116,7 @@ static int bch2_copygc(struct bch_fs *c) } } - for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - - u64 avail = max_t(s64, 0, - usage.d[BCH_DATA_free].buckets + - usage.d[BCH_DATA_need_discard].buckets - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, RESERVE_movinggc)); - - avail = min(avail, ca->mi.nbuckets >> 6); - - sectors_reserved += avail * ca->mi.bucket_size; - } - - ret = walk_buckets_to_copygc(c); + ret = find_buckets_to_copygc(c); if (ret) { bch2_fs_fatal_error(c, "error walking buckets to copygc!"); return ret; @@ -281,69 +148,26 @@ static int bch2_copygc(struct bch_fs *c) return 0; } - /* - * Our btree node allocations also come out of RESERVE_movingc: - */ - sectors_reserved = (sectors_reserved * 3) / 4; - if (!sectors_reserved) { - bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); - return -1; - } + heap_resort(h, fragmentation_cmp, NULL); - for (i = h->data; i < h->data + h->used; i++) { - sectors_to_move += i->sectors; - sectors_to_write += i->sectors * i->replicas; - } + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); - while (sectors_to_write > sectors_reserved) { + /* not correct w.r.t. device removal */ + while (h->used && !ret) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_write -= e.sectors * e.replicas; + ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, + data_opts); } - buckets_to_move = h->used; + bch2_moving_ctxt_exit(&ctxt); - if (!buckets_to_move) { - bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", - sectors_reserved); - return 0; - } - - eytzinger0_sort(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, NULL); - - ret = bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - NULL, - &move_stats, - writepoint_ptr(&c->copygc_write_point), - false, - copygc_pred, NULL); if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - if (ret) - return ret; - - ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); - if (ret) { - bch_err(c, "error %i from check_copygc_was_done()", ret); - return ret; - } - if (sectors_not_moved) - bch_warn_ratelimited(c, - "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move, - atomic64_read(&move_stats.sectors_moved), - atomic64_read(&move_stats.keys_raced), - atomic64_read(&move_stats.sectors_raced)); - - trace_and_count(c, copygc, c, - atomic64_read(&move_stats.sectors_moved), sectors_not_moved, - buckets_to_move, buckets_not_moved); - return 0; + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; } /* diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 937fd132bfd2..fabee8302afa 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -717,6 +717,37 @@ TRACE_EVENT(move_data, __entry->sectors_moved, __entry->keys_moved) ); +TRACE_EVENT(evacuate_bucket, + TP_PROTO(struct bch_fs *c, struct bpos *bucket, + unsigned sectors, unsigned bucket_size, + int ret), + TP_ARGS(c, bucket, sectors, bucket_size, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, member ) + __field(u64, bucket ) + __field(u32, sectors ) + __field(u32, bucket_size ) + 
__field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->member = bucket->inode; + __entry->bucket = bucket->offset; + __entry->sectors = sectors; + __entry->bucket_size = bucket_size; + __entry->ret = ret; + ), + + TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->member, __entry->bucket, + __entry->sectors, __entry->bucket_size, + __entry->ret) +); + TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 sectors_not_moved, -- cgit From dea5647e16d15ee7c47dbe11b1f68ec221dc51be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 00:29:51 -0400 Subject: bcachefs: Erasure coding now uses backpointers This is only a start to updating erasure coding for backpointers - it's still not working yet. The subsequent patch will delete our old in memory backpointers for copygc, and this fixes a spurious EPERM bug/error message. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 115 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index af6a23021381..68f5314b51e6 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -4,10 +4,12 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "disk_groups.h" #include "ec.h" @@ -826,21 +828,42 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, } static int ec_stripe_update_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, + struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bpos end) + u64 *bp_offset) { + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; struct bkey_i *n; int ret, dev, block; - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - return 1; + ret = bch2_get_next_backpointer(trans, bucket, gen, bp_offset, &bp); + if (ret) + return ret; + if (*bp_offset == U64_MAX) + return 0; - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) + return -EIO; + + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) { + /* + * extent no longer exists - we could flush the btree + * write buffer and retry to verify, but no need: + */ return 0; + } + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; ptr_c = bkey_matches_stripe(&s->key.v, k, &block); /* @@ -848,14 +871,14 @@ static int ec_stripe_update_extent(struct btree_trans *trans, * XXX: should we be incrementing a counter? 
*/ if (!ptr_c || ptr_c->cached) - return 0; + goto out; dev = s->key.v.ptrs[block].dev; n = bch2_bkey_make_mut(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto out; bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); @@ -863,22 +886,59 @@ static int ec_stripe_update_extent(struct btree_trans *trans, extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); - return bch2_trans_update(trans, iter, n, 0); + ret = bch2_trans_update(trans, &iter, n, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; } -static int ec_stripe_update_extents(struct bch_fs *c, - struct ec_stripe_buf *s, - struct bkey *pos) +static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, + unsigned block) { - struct btree_iter iter; - struct bkey_s_c k; + struct bch_fs *c = trans->c; + struct bch_extent_ptr bucket = s->key.v.ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + u64 bp_offset = 0; + int ret = 0; + + while (1) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + ec_stripe_update_extent(trans, bucket_pos, bucket.gen, + s, &bp_offset)); + if (ret) + break; + if (bp_offset == U64_MAX) + break; + + bp_offset++; + } + + return ret; +} + +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans trans; + struct bch_stripe *v = &s->key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); - return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, - BTREE_ID_extents, bkey_start_pos(pos), - BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + + for (i = 0; i < nr_data; i++) { + ret = ec_stripe_update_bucket(&trans, s, i); + if (ret) + break; + } +err: + bch2_trans_exit(&trans); + + return ret; } /* @@ -888,7 +948,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; - struct bkey_i *k; struct stripe *m; struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; @@ -948,14 +1007,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); - if (ret) { - bch_err(c, "error creating stripe: error updating pointers: %s", - bch2_err_str(ret)); - break; - } - } + ret = ec_stripe_update_extents(c, &s->new_stripe); + if (ret) + bch_err(c, "error creating stripe: error updating pointers: %s", + bch2_err_str(ret)); spin_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); @@ -1423,7 +1478,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, } bkey_copy(&h->s->new_stripe.key.k_i, - &h->s->existing_stripe.key.k_i); + &h->s->existing_stripe.key.k_i); return 0; } -- cgit From c9828cea312e83f3f17f4a80990f91739ff33d06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 00:47:18 -0400 Subject: bcachefs: Delete in memory ec backpointers Post btree backpointers, these aren't needed anymore. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 4 ---- fs/bcachefs/ec.c | 28 ---------------------------- fs/bcachefs/ec.h | 6 ------ fs/bcachefs/io.c | 4 ---- 4 files changed, 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index acb634b3480b..d2b1296c4c49 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -103,7 +103,6 @@ int bch2_data_update_index_update(struct bch_write_op *op) struct btree_iter iter; struct data_update *m = container_of(op, struct data_update, op); - struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_buf _new, _insert; int ret = 0; @@ -242,9 +241,6 @@ int bch2_data_update_index_update(struct bch_write_op *op) if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - if (ec_ob) - bch2_ob_add_backpointer(c, ec_ob, &insert->k); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); trace_move_extent_finish(&new->k); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 68f5314b51e6..c7ac2894db2e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1035,8 +1035,6 @@ err: } } - bch2_keylist_free(&s->keys, s->inline_keys); - ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); @@ -1119,30 +1117,6 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } -void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, - struct bkey *k) -{ - struct ec_stripe_new *ec = ob->ec; - - if (!ec) - return; - - mutex_lock(&ec->lock); - - if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, - ARRAY_SIZE(ec->inline_keys), - BKEY_U64s)) { - BUG(); - } - - bkey_init(&ec->keys.top->k); - ec->keys.top->k.p = k->p; - ec->keys.top->k.size = k->size; - bch2_keylist_push(&ec->keys); - - mutex_unlock(&ec->lock); -} - static int unsigned_cmp(const void *_l, const void *_r) { unsigned l = *((const unsigned *) _l); @@ -1235,8 +1209,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; - bch2_keylist_init(&s->keys, s->inline_keys); - ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 4d4e3756dd59..5587c9467fb5 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -5,7 +5,6 @@ #include "ec_types.h" #include "buckets_types.h" #include "extents_types.h" -#include "keylist_types.h" int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, int rw, struct printbuf *); @@ -167,9 +166,6 @@ struct ec_stripe_new { open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; - struct keylist keys; - u64 inline_keys[BKEY_U64s * 8]; - struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; @@ -197,8 +193,6 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, - struct bkey *); void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index b57187ce1f65..a937940f5096 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -472,7 +472,6 @@ static int bch2_write_index_default(struct 
bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; - struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; @@ -516,9 +515,6 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (ec_ob) - bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); - if (bkey_ge(iter.pos, k->k.p)) bch2_keylist_pop_front(&op->insert_keys); else -- cgit From 15949c549993a2383ebacf6c563b85722278fba3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 04:26:06 -0400 Subject: bcachefs: Don't stop copygc while removing devices With the new backpointer based copygc we don't need an explicit copygc reserve, we're always evacuating buckets one at a time - so this is no longer needed, and in fact removing it fixes a deadlock in bch2_dev_allocator_remove(). Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c5efaa7d38a8..2fb7e6300ea5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1345,19 +1345,11 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - /* - * Device going read only means the copygc reserve get smaller, so we - * don't want that happening while copygc is in progress: - */ - bch2_copygc_stop(c); - /* * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); - - bch2_copygc_start(c); } static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -- cgit From 23792a712d29ee8adc6f5c165e61e8624838169d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Oct 2022 22:18:06 -0400 Subject: bcachefs: Run bch2_check_backpointers_to_extents() in multiple passes if necessary When the extents + reflink btrees don't fit into memory this fsck pass becomes _much_ slower, since we're doing random lookups. This patch changes this pass to check how much of the relevant btrees will fit into memory, and run in multiple passes if needed. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 145 +++++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/bbpos.h | 48 +++++++++++++++ 2 files changed, 180 insertions(+), 13 deletions(-) create mode 100644 fs/bcachefs/bbpos.h (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 6efc286cd6ba..63a0f329cbd6 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "alloc_background.h" #include "backpointers.h" #include "btree_cache.h" #include "btree_update.h" #include "error.h" +#include + static bool extent_matches_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, @@ -693,6 +696,71 @@ err: return ret; } +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + struct sysinfo i; + u64 mem_bytes; + + si_meminfo(&i); + mem_bytes = i.totalram * i.mem_unit; + return (mem_bytes >> 1) / btree_bytes(c); +} + +int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, + unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + enum btree_id btree; + int ret = 0; + + for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + + if (!((1U << btree) & btree_leaf_mask) && + !((1U << btree) & btree_interior_mask)) + continue; + + bch2_trans_node_iter_init(trans, &iter, btree, + btree == start.btree ? 
start.pos : POS_MIN, + 0, depth, 0); + /* + * for_each_btree_key_contineu() doesn't check the return value + * from bch2_btree_iter_advance(), which is needed when + * iterating over interior nodes where we'll see keys at + * SPOS_MAX: + */ + do { + k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); + ret = bkey_err(k); + if (!k.k || ret) + break; + + --btree_nodes; + if (!btree_nodes) { + *end = BBPOS(btree, k.k->p); + bch2_trans_iter_exit(trans, &iter); + return 0; + } + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(trans, &iter); + } + + *end = BBPOS_MAX; + return ret; +} + int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans trans; @@ -736,19 +804,26 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) static int check_one_backpointer(struct btree_trans *trans, struct bpos bucket, - u64 *bp_offset) + u64 *bp_offset, + struct bbpos start, + struct bbpos end) { struct btree_iter iter; struct bch_backpointer bp; + struct bbpos pos; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ret = bch2_get_next_backpointer(trans, bucket, -1, - bp_offset, &bp); + ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp); if (ret || *bp_offset == U64_MAX) return ret; + pos = bp_to_bbpos(bp); + if (bbpos_cmp(pos, start) < 0 || + bbpos_cmp(pos, end) > 0) + return 0; + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) @@ -771,29 +846,73 @@ fsck_err: return ret; } -int bch2_check_backpointers_to_extents(struct bch_fs *c) +static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + struct bbpos start, + struct bbpos end) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { u64 bp_offset = 0; - while (!(ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_one_backpointer(&trans, iter.pos, &bp_offset))) && + while (!(ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && bp_offset < U64_MAX) bp_offset++; if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); return ret < 0 ? 
ret : 0; } + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans trans; + struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_btree_in_memory_pos(&trans, + (1U << BTREE_ID_extents)| + (1U << BTREE_ID_reflink), + ~0, + start, &end); + if (ret) + break; + + if (!bbpos_cmp(start, BBPOS_MIN) && + bbpos_cmp(end, BBPOS_MAX)) + bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (bbpos_cmp(start, BBPOS_MIN) || + bbpos_cmp(end, BBPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_backpointers_to_extents(): "); + bch2_bbpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bbpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + if (ret || !bbpos_cmp(end, BBPOS_MAX)) + break; + + start = bbpos_successor(end); + } + bch2_trans_exit(&trans); + + return ret; +} diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h new file mode 100644 index 000000000000..1fbed1f8378d --- /dev/null +++ b/fs/bcachefs/bbpos.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_H +#define _BCACHEFS_BBPOS_H + +#include "bkey_methods.h" + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +static inline int bbpos_cmp(struct bbpos l, struct bbpos r) +{ + return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); +} + +static inline struct bbpos bbpos_successor(struct bbpos pos) +{ + if (bpos_cmp(pos.pos, SPOS_MAX)) { + pos.pos = bpos_successor(pos.pos); + return pos; + } + + if (pos.btree != BTREE_ID_NR) { + pos.btree++; + pos.pos = POS_MIN; + return pos; + } + + BUG(); +} + +static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) +{ + prt_str(out, bch2_btree_ids[pos.btree]); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); +} + +#endif /* _BCACHEFS_BBPOS_H */ -- cgit From b32f9a577b8d532d31ee7d71e58d1ec512a25a9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Sep 2022 10:06:10 -0400 Subject: bcachefs: Run check_extents_to_backpointers() in multiple passes Similer to the previous patch for check_backpointers_to_extents(), if the alloc + backpointers btrees do not fit in ram we need to run into multiple passes. The counting of btree nodes that fit in memory is different here, because we have to walk the alloc and backpointers btrees at the same time, since a backpointer could reside in either of them and we don't know which without checking both. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 142 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 124 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 63a0f329cbd6..20cd5bf349c4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -550,7 +550,9 @@ int bch2_check_btree_backpointers(struct bch_fs *c) static int check_bp_exists(struct btree_trans *trans, struct bpos bucket_pos, struct bch_backpointer bp, - struct bkey_s_c orig_k) + struct bkey_s_c orig_k, + struct bpos bucket_start, + struct bpos bucket_end) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -558,6 +560,10 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c alloc_k, bp_k; int ret; + if (bpos_lt(bucket_pos, bucket_start) || + bpos_gt(bucket_pos, bucket_end)) + return 0; + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ret = bkey_err(alloc_k); @@ -619,7 +625,9 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, - struct btree_iter *iter) + struct btree_iter *iter, + struct bpos bucket_start, + struct bpos bucket_end) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; @@ -646,7 +654,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k); + ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); if (ret) return ret; } @@ -655,7 +663,9 @@ static int check_extent_to_backpointers(struct btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, - enum btree_id btree_id) + enum btree_id btree_id, + struct bpos bucket_start, + struct bpos bucket_end) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -687,7 +697,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k); + ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); if (ret) goto err; } @@ -761,44 +771,140 @@ int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } -int bch2_check_extents_to_backpointers(struct bch_fs *c) +static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct bpos bucket_start, + struct bpos bucket_end) { - struct btree_trans trans; struct btree_iter iter; enum btree_id btree_id; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; - bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_ALL_LEVELS| BTREE_ITER_PREFETCH); do { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent_to_backpointers(&trans, &iter)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(trans, &iter, + bucket_start, bucket_end)); if (ret) break; } while (!bch2_btree_iter_advance(&iter)); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) break; - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_btree_root_to_backpointers(&trans, btree_id)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(trans, btree_id, + bucket_start, bucket_end)); if (ret) break; } + return ret; +} + +static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, + struct bpos bucket) +{ + return bch2_dev_exists2(c, bucket.inode) + ? bucket_pos_to_bp(c, bucket, 0) + : bucket; +} + +int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct btree_iter alloc_iter; + struct btree_iter bp_iter; + struct bkey_s_c alloc_k, bp_k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + bool alloc_end = false, bp_end = false; + int ret = 0; + + bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + start, 0, 1, 0); + bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); + while (1) { + alloc_k = !alloc_end + ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) + : bkey_s_c_null; + bp_k = !bp_end + ? 
__bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) + : bkey_s_c_null; + + ret = bkey_err(alloc_k) ?: bkey_err(bp_k); + if ((!alloc_k.k && !bp_k.k) || ret) { + *end = SPOS_MAX; + break; + } + + --btree_nodes; + if (!btree_nodes) { + *end = alloc_k.k->p; + break; + } + + if (bpos_lt(alloc_iter.pos, SPOS_MAX) && + bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { + if (!bch2_btree_iter_advance(&alloc_iter)) + alloc_end = true; + } else { + if (!bch2_btree_iter_advance(&bp_iter)) + bp_end = true; + } + } + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct bpos start = POS_MIN, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + if (ret) + break; + + if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_extents_to_backpointers(): "); + bch2_bpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + if (ret || bpos_eq(end, SPOS_MAX)) + break; + + start = bpos_successor(end); + } bch2_trans_exit(&trans); + return ret; } -- cgit From 53b1c6f44b1a98ea6def11b74c1fde9710f2a0b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Oct 2022 07:02:36 -0400 Subject: bcachefs: Don't use key cache during fsck The btree key cache mainly helps with lock contention, at the cost of additional memory overhead. During some fsck passes the memory overhead really matters, but fsck is single threaded so lock contention is an issue - so skipping the key cache during fsck will help with performance. 
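As a purely illustrative aside (not part of the change), the flag-plumbing pattern is roughly the userspace sketch below: the shared lookup helper takes iterator flags from its caller, so hot runtime paths keep going through a cache while the single-threaded fsck passes skip it. Flag names and values in the sketch are invented.

  /*
   * Standalone sketch (invented names/values, not bcachefs code): a lookup
   * helper whose caller decides, via flags, whether results go through a
   * cache (saves lock traffic) or straight to the backing store (saves
   * memory, which is what the fsck paths want).
   */
  #include <stdio.h>

  #define ITER_CACHED (1U << 0)	/* stand-in for BTREE_ITER_CACHED */

  static int cache[16];		/* pretend key cache, indexed by key */

  static int backing_lookup(unsigned key)
  {
  	return (int) key * 2;	/* pretend btree lookup */
  }

  static int lookup(unsigned key, unsigned iter_flags)
  {
  	if (!(iter_flags & ITER_CACHED))
  		return backing_lookup(key);	/* fsck: no cache footprint */

  	if (!cache[key])
  		cache[key] = backing_lookup(key);
  	return cache[key];
  }

  int main(void)
  {
  	printf("runtime path (cached):  %d\n", lookup(3, ITER_CACHED));
  	printf("fsck path (uncached):   %d\n", lookup(3, 0));
  	return 0;
  }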
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 3 ++- fs/bcachefs/backpointers.c | 5 +++-- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/ec.c | 3 ++- fs/bcachefs/move.c | 6 ++++-- 5 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5988aa288c98..a179bbe23c93 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -352,7 +352,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc u64 bp_offset = 0; ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_offset, &bp); + &bp_offset, &bp, + BTREE_ITER_NOPRESERVE); if (ret) { ob = ERR_PTR(ret); goto err; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 20cd5bf349c4..3978c0b50f20 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -300,7 +300,8 @@ err: int bch2_get_next_backpointer(struct btree_trans *trans, struct bpos bucket, int gen, u64 *bp_offset, - struct bch_backpointer *dst) + struct bch_backpointer *dst, + unsigned iter_flags) { struct bch_fs *c = trans->c; struct bpos bp_pos, bp_end_pos; @@ -921,7 +922,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp); + ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0); if (ret || *bp_offset == U64_MAX) return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index e1506492f022..153870d4e9a0 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -118,7 +118,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, } int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, - u64 *, struct bch_backpointer *); + u64 *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, u64, struct bch_backpointer); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c7ac2894db2e..7028fb718ebf 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -841,7 +841,8 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, bucket, gen, bp_offset, &bp); + ret = bch2_get_next_backpointer(trans, bucket, gen, + bp_offset, &bp, BTREE_ITER_CACHED); if (ret) return ret; if (*bp_offset == U64_MAX) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 690c3128c5e1..8eb4978cc043 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -608,7 +608,8 @@ failed_to_evacuate: bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp); + &bp_offset, &bp, + BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -681,7 +682,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(&trans); ret = bch2_get_next_backpointer(&trans, bucket, gen, - &bp_offset, &bp); + &bp_offset, &bp, + BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) -- cgit From 7c057d35098613b2936c361aa8289590fef987ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Feb 2023 14:25:59 -0500 Subject: fixup bcachefs: New on disk format: Backpointers Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 35 
+++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3978c0b50f20..3e862da6f15f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -5,6 +5,7 @@ #include "backpointers.h" #include "btree_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "error.h" #include @@ -553,7 +554,8 @@ static int check_bp_exists(struct btree_trans *trans, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -597,8 +599,15 @@ static int check_bp_exists(struct btree_trans *trans, goto err; if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + if (!bpos_eq(*last_flushed_pos, orig_k.k->p)) { + *last_flushed_pos = orig_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } goto missing; + } out: err: fsck_err: @@ -607,6 +616,7 @@ fsck_err: printbuf_exit(&buf); return ret; missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_ids[bp.btree_id], bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); @@ -628,7 +638,8 @@ missing: static int check_extent_to_backpointers(struct btree_trans *trans, struct btree_iter *iter, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; @@ -655,7 +666,9 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed_pos); if (ret) return ret; } @@ -666,7 +679,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans, static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -698,7 +712,9 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed_pos); if (ret) goto err; } @@ -778,6 +794,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, { struct btree_iter iter; enum btree_id btree_id; + struct bpos last_flushed_pos = SPOS_MAX; int ret = 0; for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { @@ -793,7 +810,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end)); + bucket_start, bucket_end, + &last_flushed_pos)); if (ret) break; } while (!bch2_btree_iter_advance(&iter)); @@ -807,7 +825,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_LAZY_RW| 
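For illustration only (not part of the change), the hole-coalescing idea is roughly the userspace sketch below: live alloc keys are still handled one at a time, but every run of missing keys is covered by a single sized freespace entry. The bucket numbers and names in the sketch are invented.

  /*
   * Standalone sketch (invented data, not bcachefs code): index live keys
   * individually, but emit one sized "freespace" record per hole instead
   * of one record per empty bucket.
   */
  #include <stdio.h>

  static const int live_buckets[] = { 3, 4, 9, 15 };	/* buckets with alloc keys */
  #define NR_LIVE  (sizeof(live_buckets) / sizeof(live_buckets[0]))
  #define NBUCKETS 20

  int main(void)
  {
  	unsigned i = 0, bucket = 0;

  	while (bucket < NBUCKETS) {
  		if (i < NR_LIVE && live_buckets[i] == (int) bucket) {
  			printf("live key at bucket %u: index it on its own\n", bucket);
  			i++;
  			bucket++;
  		} else {
  			/* hole: cover the whole gap with one sized key */
  			unsigned end = i < NR_LIVE ? (unsigned) live_buckets[i] : NBUCKETS;

  			printf("hole: one freespace key for [%u, %u)\n", bucket, end);
  			bucket = end;
  		}
  	}
  	return 0;
  }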
BTREE_INSERT_NOFAIL, check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end)); + bucket_start, bucket_end, + &last_flushed_pos)); if (ret) break; } -- cgit From cc65f5659941a4d30608b47c3edfadb5e0e7b02e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Nov 2022 04:37:11 -0500 Subject: bcachefs: Improve bch2_dev_freespace_init() This makes bch2_dev_freespace_init() much faster: instead of processing every bucket on the device one at a time, we handle ranges of missing keys all at once: the freespace btree is an extents style btree, so we only have to insert one freespace key for every range of missing keys in the alloc btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 111 ++++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 58ec650a512c..0f4c92c0d66f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1317,35 +1317,110 @@ void bch2_do_invalidates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, struct bch_dev *ca) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - - if (iter->pos.offset >= ca->mi.nbuckets) - return 1; - - a = bch2_alloc_to_v4(k, &a_convert); - return bch2_bucket_do_index(trans, k, a, true); -} - static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); struct bch_member *m; int ret; bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bucket_freespace_init(&trans, &iter, k, ca)); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_PREFETCH); + /* + * Scan the alloc btree for every bucket on @ca, and add buckets to the + * freespace/need_discard/need_gc_gens btrees as needed: + */ + while (1) { + bch2_trans_begin(&trans); + ret = 0; + + if (bkey_ge(iter.pos, end)) + break; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (k.k->type) { + /* + * We process live keys in the alloc btree one at a + * time: + */ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + ret = bch2_bucket_do_index(&trans, k, a, true) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_advance(&iter); + } else { + /* + * When there's a hole, process a whole range of keys + * all at once: + * + * This is similar to how extent btree iterators in + * slots mode will synthesize a whole range - a + * KEY_TYPE_deleted extent. 
+ * + * But alloc keys aren't extents (they have zero size), + * so we're open coding it here: + */ + struct btree_iter iter2; + struct bkey_i *freespace; + struct bpos next; + + bch2_trans_copy_iter(&iter2, &iter); + k = bch2_btree_iter_peek_upto(&iter2, + bkey_min(bkey_min(end, + iter.path->l[0].b->key.k.p), + POS(iter.pos.inode, iter.pos.offset + U32_MAX - 1))); + next = iter2.pos; + ret = bkey_err(k); + bch2_trans_iter_exit(&trans, &iter2); + + BUG_ON(next.offset >= iter.pos.offset + U32_MAX); + if (ret) + goto bkey_err; + + freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + ret = PTR_ERR_OR_ZERO(freespace); + if (ret) + goto bkey_err; + + bkey_init(&freespace->k); + freespace->k.type = KEY_TYPE_set; + freespace->k.p = iter.pos; + + bch2_key_resize(&freespace->k, next.offset - iter.pos.offset); + + ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, next); + } +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret < 0) { -- cgit From d23124c757490641c2ec8281d54079aea3f3a7ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Nov 2022 13:25:17 -0500 Subject: bcachefs: Improve bch2_check_alloc_info() This factors out a new helper from bch2_dev_freespace_init(), bch2_get_key_or_hole(), and uses it in bch2_check_alloc_info(): we're now able to process holes in the alloc btree as ranges, instead of one bucket at a time. This will improve fsck performance on new filesystems, or filesystems where not every bucket has been used yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 260 ++++++++++++++++++++++++++++++++--------- 1 file changed, 207 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0f4c92c0d66f..f4c3effe2f4e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -735,7 +735,106 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, return 0; } +/* + * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * extents style btrees, but works on non-extents btrees: + */ +struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (bkey_err(k)) + return k; + + if (k.k->type) { + return k; + } else { + struct btree_iter iter2; + struct bpos next; + + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek_upto(&iter2, + bkey_min(bkey_min(end, + iter->path->l[0].b->key.k.p), + POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1))); + next = iter2.pos; + bch2_trans_iter_exit(iter->trans, &iter2); + + BUG_ON(next.offset >= iter->pos.offset + U32_MAX); + + if (bkey_err(k)) + return k; + + bkey_init(hole); + hole->p = iter->pos; + + bch2_key_resize(hole, next.offset - iter->pos.offset); + return (struct bkey_s_c) { hole, NULL }; + } +} + +static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +{ + struct bch_dev *ca; + unsigned iter; + + if (bch2_dev_bucket_exists(c, *bucket)) + return true; + + if (bch2_dev_exists2(c, bucket->inode)) { + ca = bch_dev_bkey_exists(c, bucket->inode); + + if (bucket->offset < ca->mi.first_bucket) { + bucket->offset = ca->mi.first_bucket; + return true; + } + + bucket->inode++; + 
bucket->offset = 0; + } + + rcu_read_lock(); + iter = bucket->inode; + ca = __bch2_next_dev(c, &iter, NULL); + if (ca) + bucket->offset = ca->mi.first_bucket; + rcu_read_unlock(); + + return ca != NULL; +} + +struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +{ + struct bch_fs *c = iter->trans->c; + struct bkey_s_c k; +again: + k = bch2_get_key_or_hole(iter, POS_MAX, hole); + if (bkey_err(k)) + return k; + + if (!k.k->type) { + struct bpos bucket = bkey_start_pos(k.k); + + if (!bch2_dev_bucket_exists(c, bucket)) { + if (!next_bucket(c, &bucket)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bucket); + goto again; + } + + if (!bch2_dev_bucket_exists(c, k.k->p)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); + } + } + + return k; +} + static int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, struct btree_iter *alloc_iter, struct btree_iter *discard_iter, struct btree_iter *freespace_iter) @@ -745,20 +844,10 @@ static int bch2_check_alloc_key(struct btree_trans *trans, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; - struct bkey_s_c alloc_k, k; + struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) - ? bch2_btree_iter_peek_slot(alloc_iter) - : bch2_btree_iter_peek(alloc_iter); - if (!alloc_k.k) - return 1; - - ret = bkey_err(alloc_k); - if (ret) - return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) @@ -841,6 +930,61 @@ fsck_err: return ret; } +static int bch2_check_alloc_hole(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ca = bch_dev_bkey_exists(c, start.inode); + if (!ca->mi.freespace_initialized) + return 0; + + bch2_btree_iter_set_pos(freespace_iter, start); + + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + *end = bkey_min(k.k->p, *end); + + if (k.k->type != KEY_TYPE_set && + (c->opts.reconstruct_alloc || + fsck_err(c, "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = KEY_TYPE_set; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, + min_t(u64, U32_MAX, end->offset - + freespace_iter->pos.offset)); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { @@ -900,6 +1044,7 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter, discard_iter, freespace_iter; + struct bkey hole; struct bkey_s_c k; int ret = 0; @@ -911,17 +1056,52 @@ int bch2_check_alloc_info(struct bch_fs *c) BTREE_ITER_PREFETCH); bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); + while (1) { - ret = commit_do(&trans, NULL, NULL, - 
BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_alloc_key(&trans, &iter, - &discard_iter, - &freespace_iter)); + struct bpos next; + + bch2_trans_begin(&trans); + + k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + ret = bkey_err(k); if (ret) + goto bkey_err; + + if (!k.k) break; - bch2_btree_iter_advance(&iter); + if (k.k->type) { + next = bpos_nosnap_successor(k.k->p); + + ret = bch2_check_alloc_key(&trans, + k, &iter, + &discard_iter, + &freespace_iter); + if (ret) + break; + } else { + next = k.k->p; + + ret = bch2_check_alloc_hole(&trans, + bkey_start_pos(k.k), + &next, + &freespace_iter); + if (ret) + goto bkey_err; + } + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, next); +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; } bch2_trans_iter_exit(&trans, &freespace_iter); bch2_trans_iter_exit(&trans, &discard_iter); @@ -1322,6 +1502,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bkey hole; struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); struct bch_member *m; int ret; @@ -1337,12 +1518,13 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) */ while (1) { bch2_trans_begin(&trans); - ret = 0; - if (bkey_ge(iter.pos, end)) + if (bkey_ge(iter.pos, end)) { + ret = 0; break; + } - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_get_key_or_hole(&iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1364,34 +1546,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) bch2_btree_iter_advance(&iter); } else { - /* - * When there's a hole, process a whole range of keys - * all at once: - * - * This is similar to how extent btree iterators in - * slots mode will synthesize a whole range - a - * KEY_TYPE_deleted extent. 
- * - * But alloc keys aren't extents (they have zero size), - * so we're open coding it here: - */ - struct btree_iter iter2; struct bkey_i *freespace; - struct bpos next; - - bch2_trans_copy_iter(&iter2, &iter); - k = bch2_btree_iter_peek_upto(&iter2, - bkey_min(bkey_min(end, - iter.path->l[0].b->key.k.p), - POS(iter.pos.inode, iter.pos.offset + U32_MAX - 1))); - next = iter2.pos; - ret = bkey_err(k); - bch2_trans_iter_exit(&trans, &iter2); - - BUG_ON(next.offset >= iter.pos.offset + U32_MAX); - - if (ret) - goto bkey_err; freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); @@ -1399,10 +1554,9 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) goto bkey_err; bkey_init(&freespace->k); - freespace->k.type = KEY_TYPE_set; - freespace->k.p = iter.pos; - - bch2_key_resize(&freespace->k, next.offset - iter.pos.offset); + freespace->k.type = KEY_TYPE_set; + freespace->k.p = k.k->p; + freespace->k.size = k.k->size; ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace) ?: bch2_trans_commit(&trans, NULL, NULL, @@ -1411,7 +1565,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, next); + bch2_btree_iter_set_pos(&iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- cgit From 47b323a0b0612c5310c35935a40012125a3e18b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jan 2023 03:37:44 -0500 Subject: bcachefs: Start snapshots before bch2_gc() bch2_gc may require snapshots to be started - the repair path when checking the reflink btree may do updates to the extents btree. This moves bch2_fs_initialize_subvolumes() and bch2_fs_snapshots_start() to before bch2_gc() - since we haven't gone RW yet, the updates in bch2_fs_initialize_subvolumes() are done via the journal replay keys list, so it's fine to do this before bch2_gc(). 
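For context, a condensed sketch of the recovery ordering this change produces, based on the bch2_fs_recovery() hunks below. This is not the literal patch: error labels, log messages and the surrounding recovery steps are trimmed, the wrapper name is made up, and the bch2_gc() call with its arguments is assumed from the surrounding tree rather than shown in this diff.

static int recovery_order_sketch(struct bch_fs *c)
{
	int ret;

	bch2_stripes_heap_start(c);

	/*
	 * Snapshots are now set up before bch2_gc(): we have not gone RW yet,
	 * so the updates made by bch2_fs_initialize_subvolumes() go through
	 * the journal replay keys list rather than direct btree writes.
	 */
	if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
		ret = bch2_fs_initialize_subvolumes(c);
		if (ret)
			return ret;
	}

	ret = bch2_fs_snapshots_start(c);
	if (ret)
		return ret;

	if (c->opts.fsck) {
		/* gc repair of the reflink btree may now update extents safely */
		ret = bch2_gc(c, true, c->opts.norecovery);
		if (ret)
			return ret;
	}

	return 0;
}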
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 55356c117737..2df1a541cb40 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1250,6 +1250,20 @@ use_clean: bch2_stripes_heap_start(c); + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + if (c->opts.fsck) { bool metadata_only = c->opts.norecovery; @@ -1262,20 +1276,6 @@ use_clean: set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -1343,20 +1343,6 @@ use_clean: if (c->opts.norecovery) goto out; - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); -- cgit From 8dd69d9f64e92529037550c97a07b1b78296e92c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 13:21:03 -0400 Subject: bcachefs: KEY_TYPE_inode_v3, metadata_version_inode_v3 Move bi_size and bi_sectors into the non-varint portion of the inode, so that the write path can update them without going through the relatively expensive unpack/pack operations. 
Other changes: - Add a field for the offset of the varint section, so we can add new non-varint fields without needing a new inode type, like alloc_v3 - Move bi_mode into the flags field, so that the varint section can be u64 aligned Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 55 +++++++++++++- fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/buckets.c | 4 +- fs/bcachefs/inode.c | 163 ++++++++++++++++++++++++++++++++++++------ fs/bcachefs/inode.h | 24 +++++-- fs/bcachefs/io.c | 2 +- fs/bcachefs/recovery.c | 7 +- 7 files changed, 219 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 66c885186160..e0e2219fb1cc 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -370,7 +370,8 @@ static inline void bkey_init(struct bkey *k) x(set, 25) \ x(lru, 26) \ x(alloc_v4, 27) \ - x(backpointer, 28) + x(backpointer, 28) \ + x(inode_v3, 29) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -721,6 +722,21 @@ struct bch_inode_v2 { __u8 fields[0]; } __packed __aligned(8); +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[0]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) + struct bch_inode_generation { struct bch_val v; @@ -732,7 +748,7 @@ struct bch_inode_generation { * bi_subvol and bi_parent_subvol are only set for subvolume roots: */ -#define BCH_INODE_FIELDS() \ +#define BCH_INODE_FIELDS_v2() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ x(bi_mtime, 96) \ @@ -759,6 +775,31 @@ struct bch_inode_generation { x(bi_subvol, 32) \ x(bi_parent_subvol, 32) +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ x(data_checksum, 8) \ @@ -815,6 +856,13 @@ LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + /* Dirents */ /* @@ -1499,7 +1547,8 @@ struct bch_sb_field_journal_seq_blacklist { x(freespace, 19) \ x(alloc_v4, 20) \ x(new_data_types, 21) \ - x(backpointers, 22) + x(backpointers, 22) \ + x(inode_v3, 23) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 45c8b2c61c5b..c7c0a9781a35 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -149,6 +149,7 @@ static unsigned bch2_key_types_allowed[] = { (1U 
<< KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| (1U << KEY_TYPE_inode_v2)| + (1U << KEY_TYPE_inode_v3)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = (1U << KEY_TYPE_deleted)| diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b657f8545a3b..9dcdfca19d52 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1123,10 +1123,10 @@ int bch2_mark_inode(struct btree_trans *trans, u64 journal_seq = trans->journal_res.seq; if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v2); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); v->bi_journal_seq = cpu_to_le64(journal_seq); } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9eeabe70aec1..f338cf6fd8b7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -60,11 +60,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -static inline void bch2_inode_pack_inlined(struct bch_fs *c, - struct bkey_inode_buf *packed, +static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, const struct bch_inode_unpacked *inode) { - struct bkey_i_inode_v2 *k = &packed->inode; + struct bkey_i_inode_v3 *k = &packed->inode; u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; @@ -72,13 +71,17 @@ static inline void bch2_inode_pack_inlined(struct bch_fs *c, unsigned bytes; int ret; - bkey_inode_v2_init(&packed->inode.k_i); + bkey_inode_v3_init(&packed->inode.k_i); packed->inode.k.p.offset = inode->bi_inum; packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); packed->inode.v.bi_hash_seed = inode->bi_hash_seed; packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); + packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); + packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); + SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); + SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); + #define x(_name, _bits) \ nr_fields++; \ @@ -99,7 +102,7 @@ static inline void bch2_inode_pack_inlined(struct bch_fs *c, *out++ = 0; \ } - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x BUG_ON(out > end); @@ -110,7 +113,7 @@ static inline void bch2_inode_pack_inlined(struct bch_fs *c, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODEv2_NR_FIELDS(&k->v, nr_fields); + SET_INODEv3_NR_FIELDS(&k->v, nr_fields); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -120,21 +123,23 @@ static inline void bch2_inode_pack_inlined(struct bch_fs *c, BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_sectors != inode->bi_sectors); + BUG_ON(unpacked.bi_size != inode->bi_size); + BUG_ON(unpacked.bi_version != inode->bi_version); BUG_ON(unpacked.bi_mode != inode->bi_mode); #define x(_name, _bits) if (unpacked._name != inode->_name) \ panic("unpacked %llu should be %llu", \ (u64) unpacked._name, (u64) inode->_name); - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x } } -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, +void bch2_inode_pack(struct bkey_inode_buf *packed, const struct bch_inode_unpacked *inode) { - 
bch2_inode_pack_inlined(c, packed, inode); + bch2_inode_pack_inlined(packed, inode); } static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, @@ -164,7 +169,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, unpacked->_name = field[1]; \ in += ret; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v2() #undef x /* XXX: signal if there were more fields than expected? */ @@ -203,15 +208,66 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, return -1; \ fieldnr++; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v2() #undef x /* XXX: signal if there were more fields than expected? */ return 0; } -int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) +static int bch2_inode_unpack_v3(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); + unpacked->bi_size = le64_to_cpu(inode.v->bi_size); + unpacked->bi_version = le64_to_cpu(inode.v->bi_version); + unpacked->bi_mode = INODEv3_MODE(inode.v); + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v3() +#undef x + + /* XXX: signal if there were more fields than expected? 
*/ + return 0; +} + +static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) { memset(unpacked, 0, sizeof(*unpacked)); @@ -252,6 +308,14 @@ int bch2_inode_unpack(struct bkey_s_c k, } } +int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + if (likely(k.k->type == KEY_TYPE_inode_v3)) + return bch2_inode_unpack_v3(k, unpacked); + return bch2_inode_unpack_slowpath(k, unpacked); +} + int bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -297,11 +361,32 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack_inlined(trans->c, inode_p, inode); + bch2_inode_pack_inlined(inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } +struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) +{ + struct bch_inode_unpacked u; + struct bkey_inode_buf *inode_p; + int ret; + + if (!bkey_is_inode(&k->k)) + return ERR_PTR(-ENOENT); + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return ERR_CAST(inode_p); + + ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); + if (ret) + return ERR_PTR(ret); + + bch2_inode_pack(inode_p, &u); + return &inode_p->inode.k_i; +} + static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) { struct bch_inode_unpacked unpacked; @@ -387,15 +472,48 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, return __bch2_inode_invalid(k, err); } -static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + + if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { + prt_printf(err, "incorrect value size (%zu < %zu)", + bkey_val_bytes(k.k), sizeof(*inode.v)); + return -BCH_ERR_invalid_bkey; + } + + if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { + prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); + return -BCH_ERR_invalid_bkey; + } + + if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -BCH_ERR_invalid_bkey; + } + + return __bch2_inode_invalid(k, err); +} + +static void __bch2_inode_unpacked_to_text(struct printbuf *out, + struct bch_inode_unpacked *inode) { - prt_printf(out, "mode %o flags %x journal_seq %llu", + prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", inode->bi_mode, inode->bi_flags, - inode->bi_journal_seq); + inode->bi_journal_seq, + inode->bi_size, + inode->bi_sectors, + inode->bi_version); #define x(_name, _bits) \ prt_printf(out, " "#_name " %llu", (u64) inode->_name); - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x } @@ -405,8 +523,7 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked __bch2_inode_unpacked_to_text(out, inode); } -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_inode_unpacked inode; diff --git a/fs/bcachefs/inode.h 
b/fs/bcachefs/inode.h index da78ed023a30..b753e1b254e4 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,6 +9,7 @@ extern const char * const bch2_inode_opts[]; int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ @@ -25,10 +26,18 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .atomic_trigger = bch2_mark_inode, \ }) +#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v3_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ +}) + static inline bool bkey_is_inode(const struct bkey *k) { return k->type == KEY_TYPE_inode || - k->type == KEY_TYPE_inode_v2; + k->type == KEY_TYPE_inode_v2 || + k->type == KEY_TYPE_inode_v3; } int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, @@ -52,25 +61,28 @@ struct bch_inode_unpacked { u64 bi_inum; u64 bi_journal_seq; __le64 bi_hash_seed; + u64 bi_size; + u64 bi_sectors; + u64 bi_version; u32 bi_flags; u16 bi_mode; #define x(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS() + BCH_INODE_FIELDS_v3() #undef x }; struct bkey_inode_buf { - struct bkey_i_inode_v2 inode; + struct bkey_i_inode_v3 inode; #define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS()]; + u8 _pad[0 + BCH_INODE_FIELDS_v3()]; #undef x } __packed __aligned(8); -void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, - const struct bch_inode_unpacked *); +void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); +struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a937940f5096..fb85c2bfd569 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -356,7 +356,7 @@ int bch2_extent_update(struct btree_trans *trans, } if (i_sectors_delta || new_i_size) { - bch2_inode_pack(trans->c, &inode_p, &inode_u); + bch2_inode_pack(&inode_p, &inode_u); inode_p.inode.k.p.snapshot = iter->snapshot; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2df1a541cb40..b35590226037 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1098,6 +1098,9 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; + } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { + bch_info(c, "version prior to inode_v3, upgrade required"); + c->opts.version_upgrade = true; } } @@ -1482,7 +1485,7 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - if (c->sb.version < bcachefs_metadata_version_backpointers) + if (c->sb.version < bcachefs_metadata_version_inode_v3) c->opts.version_upgrade = true; if (c->opts.version_upgrade) { @@ -1563,7 +1566,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode.bi_inum = 
BCACHEFS_ROOT_INO; root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - bch2_inode_pack(c, &packed_inode, &root_inode); + bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; err = "error creating root directory"; -- cgit From b08b492ed3068b80122bb58476baeacad2d0fa4c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Nov 2021 12:30:47 -0500 Subject: bcachefs: Drop old maybe_extending optimization The extend update path had an optimization to avoid updating the inode if we knew we were definitely not extending the file. But now that we're updating inodes on every extent update - for fsync - that code can be deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +-- fs/bcachefs/io.c | 34 ++-------------------------------- fs/bcachefs/io.h | 2 +- 3 files changed, 4 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index d2b1296c4c49..700936ac44d9 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -126,7 +126,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) struct extent_ptr_decoded p; struct bpos next_pos; bool did_work = false; - bool extending = false, should_check_enospc; + bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; unsigned i; @@ -212,7 +212,6 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_extent_normalize(c, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(&trans, &iter, insert, - &extending, &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index fb85c2bfd569..9cd91180d890 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -198,7 +198,6 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, - bool *maybe_extending, bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) @@ -210,7 +209,6 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; @@ -237,31 +235,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_ge(old.k->p, new->k.p)) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) { - old = bch2_btree_iter_next(&iter); - ret = bkey_err(old); - if (ret) - break; - } - - if (old.k && !bkey_err(old) && - old.k->p.inode == extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; - + if (bkey_ge(old.k->p, new->k.p)) break; - } } bch2_trans_iter_exit(trans, &iter); @@ -283,7 +258,7 @@ int bch2_extent_update(struct btree_trans *trans, struct bch_inode_unpacked inode_u; struct bpos next_pos; struct bkey_s_c inode; - bool extending = false, usage_increasing; + bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -305,7 +280,6 @@ int bch2_extent_update(struct btree_trans *trans, next_pos = k->k.p; ret = 
bch2_sum_sector_overwrites(trans, iter, k, - &extending, &usage_increasing, &i_sectors_delta, &disk_sectors_delta); @@ -322,10 +296,6 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } - new_i_size = extending - ? min(k->k.p.offset << 9, new_i_size) - : 0; - bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inum.inum, iter->snapshot), BTREE_ITER_INTENT| diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index faf2c2057828..39f2aabf4f59 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -70,7 +70,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, bool *, s64 *, s64 *); + struct bkey_i *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64, s64 *, bool); -- cgit From 9bcbc0307d9cbaae5836a8051c91b468fe1571c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Oct 2022 13:42:38 -0400 Subject: bcachefs: Skip inode unpack/pack in bch2_extent_update() This takes advantage of the new inode type to skip the expensive pack/unpack when inode updates are required in the extent update path. Additionally, we now skip the inode update entirely when i_sectors and i_size aren't changing. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 88 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 45 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 9cd91180d890..af6b9a7456f5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -252,13 +252,13 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - /* this must live until after bch2_trans_commit(): */ - struct bkey_inode_buf inode_p; struct btree_iter inode_iter = { NULL }; - struct bch_inode_unpacked inode_u; + struct bkey_s_c inode_k; + struct bkey_s_c_inode_v3 inode; + struct bkey_i_inode_v3 *new_inode; struct bpos next_pos; - struct bkey_s_c inode; bool usage_increasing; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -298,64 +298,66 @@ int bch2_extent_update(struct btree_trans *trans, bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inum.inum, iter->snapshot), - BTREE_ITER_INTENT| - (trans->c->opts.inodes_use_key_cache - ? BTREE_ITER_CACHED - : 0)); - inode = bch2_btree_iter_peek_slot(&inode_iter); - ret = bkey_err(inode); - if (ret) + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + inode_k = bch2_btree_iter_peek_slot(&inode_iter); + ret = bkey_err(inode_k); + if (unlikely(ret)) goto err; - ret = bkey_is_inode(inode.k) ? 0 : -ENOENT; - if (ret) + ret = bkey_is_inode(inode_k.k) ? 
0 : -ENOENT; + if (unlikely(ret)) goto err; - if (i_sectors_delta || new_i_size) { - ret = bch2_inode_unpack(inode, &inode_u); - if (ret) + if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { + inode_k = bch2_inode_to_v3(trans, inode_k); + ret = bkey_err(inode_k); + if (unlikely(ret)) goto err; - - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; - else - new_i_size = 0; - - inode_u.bi_sectors += i_sectors_delta; } - if (i_sectors_delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); + inode = bkey_s_c_to_inode_v3(inode_k); - inode_p.inode.k.p.snapshot = iter->snapshot; + new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); + ret = PTR_ERR_OR_ZERO(new_inode); + if (unlikely(ret)) + goto err; - ret = bch2_trans_update(trans, &inode_iter, - &inode_p.inode.k_i, 0); - } else { - bkey_reassemble(&inode_p.inode.k_i, inode); + bkey_reassemble(&new_inode->k_i, inode.s_c); - ret = bch2_trans_update(trans, &inode_iter, - &inode_p.inode.k_i, - BTREE_UPDATE_NOJOURNAL); - if (ret) - goto err; + if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode.v->bi_size)) { + new_inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; } - ret = bch2_trans_update(trans, iter, k, 0) ?: + if (i_sectors_delta) { + le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + new_inode->k.p.snapshot = iter->snapshot; + + /* + * Note: + * We always have to do an inode updated - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, + inode_update_flags) ?: + bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); -err: - bch2_trans_iter_exit(trans, &inode_iter); - if (ret) - return ret; + if (unlikely(ret)) + goto err; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; bch2_btree_iter_set_pos(iter, next_pos); - - return 0; +err: + bch2_trans_iter_exit(trans, &inode_iter); + return ret; } /* -- cgit From 70de7a47e2c56adbd76c24c80e95cf2203a9e74f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 18:54:37 -0500 Subject: bcachefs: bch2_extent_fallocate() This factors out part of __bchfs_fallocate() in fs-io.c into an new, lower level io.c helper, which creates a single extent reservation. This is prep work for nocow support - the new helper will shortly gain the ability to create unwritten extents. 
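As a usage illustration, a hedged sketch of how the __bchfs_fallocate() loop drives the new helper. It is heavily simplified from the fs-io.c hunk below: quota reservations, the snapshot lookup, the already-reserved check and i_sectors accounting are omitted, and unlike the real loop it always reserves up to the end of the range instead of stopping at the next existing extent. The helper's name and signature come from this patch; the wrapper name is hypothetical.

static int fallocate_range_sketch(struct btree_trans *trans,
				  subvol_inum inum,
				  struct btree_iter *iter,
				  struct bpos end_pos,
				  struct bch_io_opts opts,
				  s64 *i_sectors_delta)
{
	int ret = 0;

	while (!ret && bkey_lt(iter->pos, end_pos)) {
		/* one extent reservation per iteration, capped at U32_MAX sectors */
		unsigned sectors = min_t(u64, end_pos.offset - iter->pos.offset,
					 U32_MAX);

		ret = bch2_extent_fallocate(trans, inum, iter, sectors, opts,
					    i_sectors_delta,
					    writepoint_hashed((unsigned long) current));

		/* bch2_extent_update() advances @iter past the new reservation */
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	return ret;
}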
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 31 +++++-------------------------- fs/bcachefs/io.c | 30 ++++++++++++++++++++++++++++++ fs/bcachefs/io.h | 3 +++ 3 files changed, 38 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 944fffd9f7b5..77037574cb0d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3074,9 +3074,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; - struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; - struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; u32 snapshot; @@ -3107,16 +3105,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, continue; } - bkey_reservation_init(&reservation.k_i); - reservation.k.type = KEY_TYPE_reservation; - reservation.k.p = k.k->p; - reservation.k.size = k.k->size; - - bch2_cut_front(iter.pos, &reservation.k_i); - bch2_cut_back(end_pos, &reservation.k_i); - - sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); + sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, @@ -3126,25 +3115,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; } - if (reservation.v.nr_replicas < opts.data_replicas || - bch2_bkey_sectors_compressed(k)) { - ret = bch2_disk_reservation_get(c, &disk_res, sectors, - opts.data_replicas, 0); - if (unlikely(ret)) - goto bkey_err; - - reservation.v.nr_replicas = disk_res.nr_replicas; - } - - ret = bch2_extent_update(&trans, inode_inum(inode), &iter, - &reservation.k_i, &disk_res, - 0, &i_sectors_delta, true); + ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, + sectors, opts, &i_sectors_delta, + writepoint_hashed((unsigned long) current)); if (ret) goto bkey_err; + i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &disk_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index af6b9a7456f5..bde3a4c42189 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -360,6 +360,36 @@ err: return ret; } +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + int ret; + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct bkey_i_reservation *reservation = + bch2_trans_kmalloc(trans, sizeof(*reservation)); + + ret = PTR_ERR_OR_ZERO(reservation); + if (ret) + return ret; + + bkey_reservation_init(&reservation->k_i); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + + ret = bch2_extent_update(trans, inum, iter, &reservation->k_i, &disk_res, + 0, i_sectors_delta, true); + bch2_disk_reservation_put(c, &disk_res); + return ret; +} + /* * Returns -BCH_ERR_transacton_restart if we had to drop locks: */ diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 39f2aabf4f59..aafe1bf993bb 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -74,6 +74,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, int 
bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64, s64 *, bool); +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + unsigned, struct bch_io_opts, s64 *, + struct write_point_specifier); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); -- cgit From 2f1f7fe98d1da65c5ef646a90770b17cb012f1ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Nov 2022 23:41:18 -0500 Subject: bcachefs: bch2_extent_update_i_size_sectors() In the io path, when we do the extent update we also have to update the inode - for i_size and i_sectors updates, as well as for bi_journal_seq for fsync. This factors that out into a new helper which will be used in the new nocow mode, in the unwritten extent conversion path. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 114 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bde3a4c42189..c51381daf1c5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -243,6 +243,60 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, return ret; } +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_bkey_get_mut(trans, &iter); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_extent_update(struct btree_trans *trans, subvol_inum inum, struct btree_iter *iter, @@ -252,13 +306,8 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - struct btree_iter inode_iter = { NULL }; - struct bkey_s_c inode_k; - struct bkey_s_c_inode_v3 inode; - struct bkey_i_inode_v3 *new_inode; struct bpos next_pos; bool usage_increasing; - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -276,7 +325,6 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - new_i_size = min(k->k.p.offset << 9, new_i_size); next_pos = k->k.p; ret = bch2_sum_sector_overwrites(trans, iter, k, @@ -296,68 +344,26 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } - bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inum.inum, iter->snapshot), - 
BTREE_ITER_INTENT|BTREE_ITER_CACHED); - inode_k = bch2_btree_iter_peek_slot(&inode_iter); - ret = bkey_err(inode_k); - if (unlikely(ret)) - goto err; - - ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT; - if (unlikely(ret)) - goto err; - - if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { - inode_k = bch2_inode_to_v3(trans, inode_k); - ret = bkey_err(inode_k); - if (unlikely(ret)) - goto err; - } - - inode = bkey_s_c_to_inode_v3(inode_k); - - new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); - ret = PTR_ERR_OR_ZERO(new_inode); - if (unlikely(ret)) - goto err; - - bkey_reassemble(&new_inode->k_i, inode.s_c); - - if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode.v->bi_size)) { - new_inode->v.bi_size = cpu_to_le64(new_i_size); - inode_update_flags = 0; - } - - if (i_sectors_delta) { - le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - - new_inode->k.p.snapshot = iter->snapshot; - /* * Note: - * We always have to do an inode updated - even when i_size/i_sectors + * We always have to do an inode update - even when i_size/i_sectors * aren't changing - for fsync to work properly; fsync relies on * inode->bi_journal_seq which is updated by the trigger code: */ - ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, - inode_update_flags) ?: + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); if (unlikely(ret)) - goto err; + return ret; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; bch2_btree_iter_set_pos(iter, next_pos); -err: - bch2_trans_iter_exit(trans, &inode_iter); - return ret; + return 0; } /* Overwrites whatever was present with zeroes: */ -- cgit From 792031116bee35e13be7c8ae8cf1b8eec141b136 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Nov 2022 18:59:01 -0500 Subject: bcachefs: Unwritten extents support - bch2_extent_merge checks unwritten bit - read path returns 0s for unwritten extents without actually reading - reflink path skips over unwritten extents - bch2_bkey_ptrs_invalid() checks for extents with both written and unwritten extents, and non-normal extents (stripes, btree ptrs) with unwritten ptrs - fiemap checks for unwritten extents and returns FIEMAP_EXTENT_UNWRITTEN Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/extents.c | 34 ++++++++++++++++++++++++++++++---- fs/bcachefs/extents.h | 17 +++++++++++++++++ fs/bcachefs/fs-io.c | 14 +++++++------- fs/bcachefs/fs.c | 3 +++ fs/bcachefs/fsck.c | 4 ++-- fs/bcachefs/io.c | 3 +++ fs/bcachefs/reflink.c | 6 +++++- 8 files changed, 69 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e0e2219fb1cc..57327c4dc9b4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -582,7 +582,7 @@ struct bch_extent_ptr { __u64 type:1, cached:1, unused:1, - reservation:1, + unwritten:1, offset:44, /* 8 petabytes */ dev:8, gen:8; @@ -590,7 +590,7 @@ struct bch_extent_ptr { __u64 gen:8, dev:8, offset:44, - reservation:1, + unwritten:1, unused:1, cached:1, type:1; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3d124dc5bbef..627edba24900 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -116,6 +116,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, 
struct bkey_s_c k, return -EIO; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + ca = bch_dev_bkey_exists(c, p.ptr.dev); /* @@ -269,6 +276,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) rp.ptr.offset + rp.crc.offset || lp.ptr.dev != rp.ptr.dev || lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || lp.has_ec != rp.has_ec) return false; @@ -904,6 +912,9 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) const union bch_extent_entry *entry1, *entry2; struct extent_ptr_decoded p1, p2; + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && @@ -981,10 +992,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, u32 offset; u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, - b, offset, ptr->gen, - ptr->cached ? " cached" : ""); - + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); if (ca && ptr_stale(ca, ptr)) prt_printf(out, " stale"); } @@ -1073,6 +1086,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; + bool unwritten = false; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1097,6 +1111,18 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, false, err); if (ret) return ret; + + if (nr_ptrs && unwritten != entry->ptr.unwritten) { + prt_printf(err, "extent with unwritten and written ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { + prt_printf(err, "has unwritten ptrs"); + return -BCH_ERR_invalid_bkey; + } + + unwritten = entry->ptr.unwritten; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index f640254004e7..659ab76ea62c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -510,6 +510,23 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) } } +static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->unwritten) + return true; + return false; +} + +static inline bool bkey_extent_is_reservation(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation || + bkey_extent_is_unwritten(k); +} + static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 77037574cb0d..b5cf0a3218ea 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -341,11 +341,11 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } -static unsigned bkey_to_sector_state(const struct bkey *k) +static unsigned bkey_to_sector_state(struct bkey_s_c k) { - if (k->type == KEY_TYPE_reservation) + if (bkey_extent_is_reservation(k)) return SECTOR_RESERVED; - if (bkey_extent_is_allocation(k)) + if (bkey_extent_is_allocation(k.k)) return 
SECTOR_ALLOCATED; return SECTOR_UNALLOCATED; } @@ -396,7 +396,7 @@ retry: SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k.k); + unsigned state = bkey_to_sector_state(k); while (pg_idx < nr_pages) { struct page *page = pages[pg_idx]; @@ -436,7 +436,7 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) struct bio_vec bv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k.k); + unsigned state = bkey_to_sector_state(k); bio_for_each_segment(bv, bio, iter) __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, @@ -3093,8 +3093,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; /* already reserved */ - if (k.k->type == KEY_TYPE_reservation && - bkey_s_c_to_reservation(k).v->nr_replicas >= opts.data_replicas) { + if (bkey_extent_is_reservation(k) && + bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { bch2_btree_iter_advance(&iter); continue; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cc41472a335e..15ab77ebb8c6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -811,6 +811,9 @@ static int bch2_fill_extent(struct bch_fs *c, int flags2 = 0; u64 offset = p.ptr.offset; + if (p.ptr.unwritten) + flags2 |= FIEMAP_EXTENT_UNWRITTEN; + if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 24365b9260f6..5887d78190eb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1251,8 +1251,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, continue; if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + !bkey_extent_is_reservation(k), c, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c51381daf1c5..1d0ec638f645 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1481,6 +1481,9 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, if (bch2_bkey_has_target(c, k, opts.promote_target)) return false; + if (bkey_extent_is_unwritten(k)) + return false; + if (bch2_target_congested(c, opts.promote_target)) { /* XXX trace this */ return false; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index aae924dc81f7..faf75bcf9ee7 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -251,9 +251,13 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + if (bkey_extent_is_unwritten(k)) + continue; + if (bkey_extent_is_data(k.k)) return k; + } if (bkey_ge(iter->pos, end)) bch2_btree_iter_set_pos(iter, end); -- cgit From 4dcd1cae72912ab08d313ee5a730608022b211d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Nov 2022 01:31:10 -0500 Subject: bcachefs: Data update support for unwritten extents The data update path requires special support for unwritten extents - we still need to be able to move them, but there's no need to read or write anything. 
This patch adds a new error code to tell bch2_move_extent() that we're short circuiting the read, and adds bch2_update_unwritten_extent() to create a reservation then call __bch2_data_update_index_update(). Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 118 +++++++++++++++++++++++++++++++++++++++++----- fs/bcachefs/data_update.h | 1 + fs/bcachefs/errcode.h | 1 + fs/bcachefs/move.c | 11 ++++- 4 files changed, 118 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 700936ac44d9..82d7e13e61a5 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -96,10 +96,10 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) ptr->cached = true; } -int bch2_data_update_index_update(struct bch_write_op *op) +static int __bch2_data_update_index_update(struct btree_trans *trans, + struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_trans trans; struct btree_iter iter; struct data_update *m = container_of(op, struct data_update, op); @@ -111,9 +111,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_bkey_buf_init(&_insert); bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - bch2_trans_iter_init(&trans, &iter, m->btree_id, + bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -130,7 +128,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) s64 i_sectors_delta = 0, disk_sectors_delta = 0; unsigned i; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -211,7 +209,7 @@ int bch2_data_update_index_update(struct bch_write_op *op) bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, bkey_i_to_s(insert)); - ret = bch2_sum_sector_overwrites(&trans, &iter, insert, + ret = bch2_sum_sector_overwrites(trans, &iter, insert, &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); @@ -229,11 +227,11 @@ int bch2_data_update_index_update(struct bch_write_op *op) next_pos = insert->k.p; - ret = insert_snapshot_whiteouts(&trans, m->btree_id, + ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_trans_update(&trans, &iter, insert, + bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(&trans, &op->res, + bch2_trans_commit(trans, &op->res, NULL, BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); @@ -270,14 +268,26 @@ nomatch: goto next; } out: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } +int bch2_data_update_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + ret = __bch2_data_update_index_update(&trans, op); + bch2_trans_exit(&trans); + + return ret; +} + void bch2_data_update_read_done(struct data_update *m, struct bch_extent_crc_unpacked crc) { @@ -299,6 +309,86 @@ void bch2_data_update_exit(struct data_update *update) bch2_bio_free_pages_pool(c, &update->op.wbio.bio); } +void bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) +{ + struct bch_fs *c = update->op.c; + struct bio *bio = &update->op.wbio.bio; + 
struct bkey_i_extent *e; + struct write_point *wp; + struct bch_extent_ptr *ptr; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + + while (bio_sectors(bio)) { + unsigned sectors = bio_sectors(bio); + + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, + BTREE_ITER_SLOTS); + ret = lockrestart_do(trans, ({ + k = bch2_btree_iter_peek_slot(&iter); + bkey_err(k); + })); + bch2_trans_iter_exit(trans, &iter); + + if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) + break; + + e = bkey_extent_init(update->op.insert_keys.top); + e->k.p = update->op.pos; + + ret = bch2_alloc_sectors_start_trans(trans, + update->op.target, + false, + update->op.write_point, + &update->op.devs_have, + update->op.nr_replicas, + update->op.nr_replicas, + update->op.alloc_reserve, + 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + bch2_trans_unlock(trans); + closure_sync(&cl); + continue; + } + + if (ret) + return; + + sectors = min(sectors, wp->sectors_free); + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &update->op.open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + bch2_keylist_push(&update->op.insert_keys); + + ret = __bch2_data_update_index_update(trans, &update->op); + + bch2_open_buckets_put(c, &update->op.open_buckets); + + if (ret) + break; + } + + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } +} + int bch2_data_update_init(struct bch_fs *c, struct data_update *m, struct write_point_specifier wp, struct bch_io_opts io_opts, @@ -376,6 +466,10 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; BUG_ON(!m->op.nr_replicas); + + /* Special handling required: */ + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_unwritten_extent_update; return 0; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 5d8690795959..f304c3366226 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -32,6 +32,7 @@ void bch2_data_update_read_done(struct data_update *, struct bch_extent_crc_unpacked); void bch2_data_update_exit(struct data_update *); +void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); int bch2_data_update_init(struct bch_fs *, struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 804bc15dce31..57f1d0a6a490 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -71,6 +71,7 @@ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, need_snapshot_cleanup) \ x(0, need_topology_repair) \ + x(0, unwritten_extent_update) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8eb4978cc043..52f126a0bb73 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -295,7 +295,7 @@ static int bch2_move_extent(struct btree_trans *trans, ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, data_opts, btree_id, k); - if (ret) + if (ret && ret != 
-BCH_ERR_unwritten_extent_update) goto err_free_pages; io->write.ctxt = ctxt; @@ -303,6 +303,15 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + + if (ret == -BCH_ERR_unwritten_extent_update) { + bch2_update_unwritten_extent(trans, &io->write); + move_free(io); + return 0; + } + + BUG_ON(ret); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); trace_move_extent_read(k.k); -- cgit From a8b3a677e786fa869d220a6a78b5532a36dc2f4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Nov 2022 17:12:00 -0400 Subject: bcachefs: Nocow support This adds support for nocow mode, where we do writes in-place when possible. Patch components: - New boolean filesystem and inode option, nocow: note that when nocow is enabled, data checksumming and compression are implicitly disabled - To prevent in-place writes from racing with data moves (data_update.c) or bucket reuse (i.e. a bucket being reused and re-allocated while a nocow write is in flight, we have a new locking mechanism. Buckets can be locked for either data update or data move, using a fixed size hash table of two_state_shared locks. We don't have any chaining, meaning updates and moves to different buckets that hash to the same lock will wait unnecessarily - we'll want to watch for this becoming an issue. - The allocator path also needs to check for in-place writes in flight to a given bucket before giving it out: thus we add another counter to bucket_alloc_state so we can track this. - Fsync now may need to issue cache flushes to block devices instead of flushing the journal. We add a device bitmask to bch_inode_info, ei_devs_need_flush, which tracks devices that need to have flushes issued - note that this will lead to unnecessary flushes when other codepaths have already issued flushes, we may want to replace this with a sequence number. - New nocow write path: look up extents, and if they're writable write to them - otherwise fall back to the normal COW write path. 
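For the bucket locking mechanism above, the bucket-to-lock mapping is, in sketch form (this mirrors the bucket_nocow_lock() helper added in nocow_locking.h below; colliding buckets simply share a lock, there is no chaining):

        static inline two_state_lock_t *
        bucket_nocow_lock(struct bucket_nocow_lock_table *t, struct bpos bucket)
        {
                u64 dev_bucket = bucket.inode << 56 | bucket.offset;
                unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);

                /* fixed table of BUCKET_NOCOW_LOCKS two_state_shared locks */
                return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
        }
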
XXX: switch to sequence numbers instead of bitmask for devs needing journal flush XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to run in process context - see if we can improve this Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_foreground.c | 5 + fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/bcachefs.h | 10 +- fs/bcachefs/bcachefs_format.h | 10 +- fs/bcachefs/btree_io.c | 3 +- fs/bcachefs/checksum.h | 7 +- fs/bcachefs/data_update.c | 10 + fs/bcachefs/extents.c | 39 ++-- fs/bcachefs/extents.h | 1 + fs/bcachefs/fs-io.c | 98 ++++++++- fs/bcachefs/fs.h | 11 + fs/bcachefs/inode.c | 3 + fs/bcachefs/io.c | 452 +++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/io.h | 7 +- fs/bcachefs/io_types.h | 7 + fs/bcachefs/move.c | 7 + fs/bcachefs/nocow_locking.c | 15 ++ fs/bcachefs/nocow_locking.h | 55 +++++ fs/bcachefs/opts.h | 7 + fs/bcachefs/super.h | 7 +- fs/bcachefs/trace.h | 5 +- 22 files changed, 709 insertions(+), 52 deletions(-) create mode 100644 fs/bcachefs/nocow_locking.c create mode 100644 fs/bcachefs/nocow_locking.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 456d540441ce..55b6d85d55c3 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -52,6 +52,7 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + nocow_locking.o \ opts.o \ printbuf.o \ quota.o \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a179bbe23c93..f78eaa52c11f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -227,6 +227,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + s->skipped_nocow++; + return NULL; + } + spin_lock(&c->freelist_lock); if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 2c96794d1993..2e6f48069258 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -12,6 +12,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 skipped_nocow; u64 skipped_nouse; }; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6d048e5d8843..74632105fb45 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -206,6 +206,7 @@ #include "bcachefs_format.h" #include "errcode.h" #include "fifo.h" +#include "nocow_locking.h" #include "opts.h" #include "util.h" @@ -383,7 +384,8 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_seq) \ x(blocked_journal) \ x(blocked_allocate) \ - x(blocked_allocate_open_bucket) + x(blocked_allocate_open_bucket) \ + x(nocow_lock_contended) enum bch_time_stats { #define x(name) BCH_TIME_##name, @@ -483,6 +485,7 @@ struct bch_dev { struct bch_sb *sb_read_scratch; int sb_write_error; dev_t dev; + atomic_t flush_seq; struct bch_devs_mask self; @@ -897,7 +900,9 @@ struct bch_fs { struct bio_set bio_read_split; struct bio_set bio_write; struct mutex bio_bounce_pages_lock; -mempool_t bio_bounce_pages; + mempool_t bio_bounce_pages; + struct bucket_nocow_lock_table + nocow_locks; struct rhashtable promote_table; mempool_t compression_bounce[2]; @@ -959,6 +964,7 @@ mempool_t bio_bounce_pages; struct bio_set writepage_bioset; struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; + struct bio_set nocow_flush_bioset; /* ERRORS */ struct list_head fsck_errors; diff --git a/fs/bcachefs/bcachefs_format.h 
b/fs/bcachefs/bcachefs_format.h index 57327c4dc9b4..024a714955f2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -798,7 +798,8 @@ struct bch_inode_generation { x(bi_dir, 64) \ x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -810,7 +811,8 @@ struct bch_inode_generation { x(promote_target, 16) \ x(foreground_target, 16) \ x(background_target, 16) \ - x(erasure_code, 16) + x(erasure_code, 16) \ + x(nocow, 8) enum inode_opt_id { #define x(name, ...) \ @@ -1548,7 +1550,8 @@ struct bch_sb_field_journal_seq_blacklist { x(alloc_v4, 20) \ x(new_data_types, 21) \ x(backpointers, 22) \ - x(inode_v3, 23) + x(inode_v3, 23) \ + x(unwritten_extents, 24) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1696,6 +1699,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); /* diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index dfa45cf4021f..87d80a59dd7e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1832,7 +1832,8 @@ static void btree_write_submit(struct work_struct *work) bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ptr->offset += wbio->sector_offset; - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, + &tmp.k, false); } void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index f7ccef7a5520..409ad534d9f4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -99,14 +99,17 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, } static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - unsigned opt) + struct bch_io_opts opts) { + if (opts.nocow) + return 0; + if (c->sb.encryption_type) return c->opts.wide_macs ? 
BCH_CSUM_chacha20_poly1305_128 : BCH_CSUM_chacha20_poly1305_80; - return bch2_csum_opt_to_type(opt, true); + return bch2_csum_opt_to_type(opts.data_checksum, true); } static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 82d7e13e61a5..c3f12b3adb14 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -303,6 +303,13 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; + struct bkey_ptrs_c ptrs = + bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); @@ -451,6 +458,9 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.incompressible = true; i++; + + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); } if (reserve_sectors) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 627edba24900..55a8879dc4fe 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -664,22 +664,21 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { unsigned durability = 0; struct bch_dev *ca; - if (p.ptr.cached) + if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p.ptr.dev); + ca = bch_dev_bkey_exists(c, p->ptr.dev); if (ca->mi.state != BCH_MEMBER_STATE_failed) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) - durability += p.ec.redundancy; + if (p->has_ec) + durability += p->ec.redundancy; return durability; } @@ -692,7 +691,7 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); + durability += bch2_extent_ptr_durability(c,& p); return durability; } @@ -907,23 +906,31 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, */ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) { - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry1, *entry2; - struct extent_ptr_decoded p1, p2; - - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + if (k1.k->type != k2.k->type) return false; - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) return true; - return false; + return false; + } else { + /* KEY_TYPE_deleted, etc. 
*/ + return true; + } } bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 659ab76ea62c..e27d39b728b3 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -596,6 +596,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_drop_device(struct bkey_s, unsigned); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b5cf0a3218ea..ec575b27eedb 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,72 @@ #include +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; + +static void nocow_flush_endio(struct bio *_bio) +{ + + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); + + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); +} + +static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) +{ + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; + + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; + + devs = inode->ei_devs_need_flush; + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (!ca) + continue; + + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); + } +} + +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct closure cl; + + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); + + return 0; +} + static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) @@ -1327,6 +1393,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } @@ -2148,10 +2215,12 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) if (!dio->op.error) { ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) + if (ret) { dio->op.error = ret; - else + } else { bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } } if (dio->sync) { @@ -2296,6 +2365,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) dio->op.flags |= BCH_WRITE_SYNC; @@ -2495,19 +2565,21 @@ out: * inode->ei_inode.bi_journal_seq won't be up to date since 
it's set in an * insert trigger: look up the btree inode instead */ -static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) +static int bch2_flush_inode(struct bch_fs *c, + struct bch_inode_info *inode) { - struct bch_inode_unpacked inode; + struct bch_inode_unpacked u; int ret; if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inum, &inode); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); if (ret) return ret; - return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); + return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -2518,7 +2590,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = file_write_and_wait_range(file, start, end); ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode_inum(inode)); + ret3 = bch2_flush_inode(c, inode); return bch2_err_class(ret ?: ret2 ?: ret3); } @@ -3105,6 +3177,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, continue; } + /* + * XXX: for nocow mode, we should promote shared extents to + * unshared here + */ + sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; if (!bkey_extent_is_allocation(k.k)) { @@ -3368,7 +3445,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || IS_SYNC(file_inode(file_dst))) - ret = bch2_flush_inode(c, inode_inum(dst)); + ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, "a_res); bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); @@ -3622,6 +3699,7 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) void bch2_fs_fsio_exit(struct bch_fs *c) { + bioset_exit(&c->nocow_flush_bioset); bioset_exit(&c->dio_write_bioset); bioset_exit(&c->dio_read_bioset); bioset_exit(&c->writepage_bioset); @@ -3641,7 +3719,9 @@ int bch2_fs_fsio_init(struct bch_fs *c) BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) + BIOSET_NEED_BVECS) || + bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) ret = -ENOMEM; pr_verbose_init(c->opts, "ret %i", ret); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4164d0669d70..e1c73a38c607 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -25,6 +25,17 @@ struct bch_inode_info { u32 ei_subvol; + /* + * When we've been doing nocow writes we'll need to issue flushes to the + * underlying block devices + * + * XXX: a device may have had a flush issued by some other codepath. It + * would be better to keep for each device a sequence number that's + * incremented when we isusue a cache flush, and track here the sequence + * number that needs flushing. 
+ */ + struct bch_devs_mask ei_devs_need_flush; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f338cf6fd8b7..a98e40065122 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -892,4 +892,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, #define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); BCH_INODE_OPTS() #undef x + + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1d0ec638f645..d511bd664953 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -34,6 +34,7 @@ #include "trace.h" #include +#include #include #include @@ -375,24 +376,118 @@ int bch2_extent_fallocate(struct btree_trans *trans, s64 *i_sectors_delta, struct write_point_specifier write_point) { - int ret; struct bch_fs *c = trans->c; struct disk_reservation disk_res = { 0 }; - struct bkey_i_reservation *reservation = - bch2_trans_kmalloc(trans, sizeof(*reservation)); + struct closure cl; + struct open_buckets open_buckets; + struct bkey_s_c k; + struct bkey_buf old, new; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; - ret = PTR_ERR_OR_ZERO(reservation); + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + open_buckets.nr = 0; +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); if (ret) return ret; - bkey_reservation_init(&reservation->k_i); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto out; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto out; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + RESERVE_none, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + bch2_trans_unlock(trans); + closure_sync(&cl); + goto retry; + } + if (ret) + return ret; + + sectors = min(sectors, wp->sectors_free); + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = 
true; + } + + have_reservation = true; - ret = bch2_extent_update(trans, inum, iter, &reservation->k_i, &disk_res, + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); +out: + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto retry; + } + + bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + return ret; } @@ -539,7 +634,8 @@ static int bch2_write_index_default(struct bch_write_op *op) void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, - const struct bkey_i *k) + const struct bkey_i *k, + bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; @@ -573,8 +669,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, + n->have_ioref = nocow || bch2_dev_get_ioref(ca, type == BCH_DATA_btree ? READ : WRITE); + n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); n->bio.bi_iter.bi_sector = ptr->offset; @@ -801,6 +898,9 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); @@ -1221,6 +1321,321 @@ err: return ret; } +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.csum_type || + crc_is_compressed(p.crc) || + p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has + * since been created. The write is still outstanding, so we're ok + * w.r.t. 
snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_exit(&trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr, *ptr2; + struct { + struct bpos b; + unsigned gen; + two_state_lock_t *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? 
*/ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, buckets[nr_buckets].b); + + prefetch(buckets[nr_buckets].l); + nr_buckets++; + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(&trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + two_state_lock_t *l = buckets[i].l; + bool stale; + + if (!bch2_two_state_trylock(l, BUCKET_NOCOW_LOCK_UPDATE)) + __bch2_bucket_nocow_lock(&c->nocow_locks, l, BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_exit(&trans); + + /* fallback to cow write path? 
*/ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + } + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (--i >= 0) + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + + bkey_for_each_ptr(ptrs, ptr2) + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + + /* We can retry this: */ + ret = BCH_ERR_transaction_restart; + goto out; +} + static void __bch2_write(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -1230,6 +1645,12 @@ static void __bch2_write(struct bch_write_op *op) int ret; nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } again: memset(&op->failed, 0, sizeof(op->failed)); op->btree_update_ready = false; @@ -1310,7 +1731,7 @@ err: key_to_write_offset); bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write); + key_to_write, false); } while (ret); /* @@ -1332,7 +1753,7 @@ err: } else { continue_at(&op->cl, bch2_write_index, NULL); } - +out_nofs_restore: memalloc_nofs_restore(nofs_flags); } @@ -2563,6 +2984,11 @@ void bch2_fs_io_exit(struct bch_fs *c) int bch2_fs_io_init(struct bch_fs *c) { + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) + two_state_lock_init(&c->nocow_locks.l[i]); + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index aafe1bf993bb..77a4a1cef71c 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -22,7 +22,7 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw #endif void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *); + enum bch_data_type, const struct bkey_i *, bool); #define BLK_STS_REMOVED ((__force blk_status_t)128) @@ -43,6 +43,7 @@ enum bch_write_flags { __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, __BCH_WRITE_IO_ERROR, + __BCH_WRITE_CONVERT_UNWRITTEN, }; #define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) @@ -61,6 +62,7 @@ enum bch_write_flags { #define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) #define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) +#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { @@ -90,7 +92,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->flags = 0; op->written = 0; op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); + op->csum_type = bch2_data_checksum_type(c, opts); op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; @@ -107,6 +109,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct 
bch_fs *c, op->res = (struct disk_reservation) { 0 }; op->new_i_size = U64_MAX; op->i_sectors_delta = 0; + op->devs_need_flush = NULL; } void bch2_write(struct closure *); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 8e83ce5bc805..200af9e3e6b0 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -97,6 +97,7 @@ struct bch_write_bio { bounce:1, put_bio:1, have_ioref:1, + nocow:1, used_mempool:1, first_btree_write:1; ); @@ -151,6 +152,12 @@ struct bch_write_op { struct keylist insert_keys; u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + /* Must be last: */ struct bch_write_bio wbio; }; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 52f126a0bb73..9e453b8495e8 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -260,6 +260,12 @@ static int bch2_move_extent(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) return -BCH_ERR_erofs_no_writes; + /* + * Before memory allocations & taking nocow locks in + * bch2_data_update_init(): + */ + bch2_trans_unlock(trans); + /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -506,6 +512,7 @@ static int __bch2_move_data(struct moving_context *ctxt, */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, btree_id, k, data_opts); diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c new file mode 100644 index 000000000000..8f06e08370a2 --- /dev/null +++ b/fs/bcachefs/nocow_locking.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "nocow_locking.h" +#include "util.h" + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + two_state_lock_t *l, int flags) +{ + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); +} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h new file mode 100644 index 000000000000..2a7a9f44e88e --- /dev/null +++ b/fs/bcachefs/nocow_locking.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_H +#define _BCACHEFS_NOCOW_LOCKING_H + +#include "bcachefs_format.h" +#include "two_state_shared_lock.h" + +#include + +#define BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct bucket_nocow_lock_table { + two_state_lock_t l[BUCKET_NOCOW_LOCKS]; +}; + +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) + +static inline two_state_lock_t *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket) +{ + u64 dev_bucket = bucket.inode << 56 | bucket.offset; + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); + + return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); +} + +static inline bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, + struct bpos bucket) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + return atomic_long_read(&l->v) != 0; +} + +static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + bch2_two_state_unlock(l, flags & 
BUCKET_NOCOW_LOCK_UPDATE); +} + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, two_state_lock_t *, int); + +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + if (!bch2_two_state_trylock(l, flags & BUCKET_NOCOW_LOCK_UPDATE)) + __bch2_bucket_nocow_lock(t, l, flags); +} + +#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 85927b306014..ef1b8a03f149 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -392,6 +392,13 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ + x(nocow, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_NOCOW, false, \ + NULL, "Nocow mode: Writes will be done in place when possible.\n"\ + "Snapshots and reflink will still caused writes to be COW\n"\ + "Implicitly disables data checksumming, compression and encryption")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 5e6fbbfd2d43..36bcb9ec2b3a 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -88,9 +88,10 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { - BUG_ON(bch2_dev_list_has_dev(*devs, dev)); - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } } static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index fabee8302afa..24dd2defe7c7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -543,6 +543,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) + __field(u64, nocow ) __array(char, err, 32 ) ), @@ -560,10 +561,11 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->need_journal_commit = s->skipped_need_journal_commit; __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; + __entry->nocow = s->skipped_nocow; strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, __entry->user, @@ -576,6 +578,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->open, __entry->need_journal_commit, __entry->nouse, + __entry->nocow, __entry->nonblocking, __entry->err) ); -- cgit From 19fe87e00b6a601b2ec8251d0231f4c9b3bb5002 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Nov 2022 12:22:39 -0500 Subject: bcachefs: Inline bch2_two_state_(trylock|unlock) Standard inlining of fast paths - these locks are now used by our new nocow mode. 
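For reference, typical usage of the now-inlined fast paths - a minimal illustrative sketch, using the nocow-write vs. data-move pairing as an example of the two conflicting states:

        two_state_lock_t l;

        two_state_lock_init(&l);

        bch2_two_state_lock(&l, 0);             /* shared with other state-0 users */
        bch2_two_state_unlock(&l, 0);

        if (bch2_two_state_trylock(&l, 1))      /* conflicts with state 0 */
                bch2_two_state_unlock(&l, 1);
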
Signed-off-by: Kent Overstreet --- fs/bcachefs/nocow_locking.c | 2 +- fs/bcachefs/two_state_shared_lock.c | 29 ++--------------------------- fs/bcachefs/two_state_shared_lock.h | 37 ++++++++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 8f06e08370a2..b325fb105322 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -10,6 +10,6 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); u64 start_time = local_clock(); - bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); + __bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); } diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c index dc508d545de0..9764c2e6a910 100644 --- a/fs/bcachefs/two_state_shared_lock.c +++ b/fs/bcachefs/two_state_shared_lock.c @@ -2,32 +2,7 @@ #include "two_state_shared_lock.h" -void bch2_two_state_unlock(two_state_lock_t *lock, int s) +void __bch2_two_state_lock(two_state_lock_t *lock, int s) { - long i = s ? 1 : -1; - - BUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - long v = atomic_long_read(&lock->v), old; - - do { - old = v; - - if (i > 0 ? v < 0 : v > 0) - return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); - return true; -} - -void bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - wait_event(lock->wait, bch2_two_state_trylock(lock, s)); + __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); } diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h index 1b4f108908a1..905801772002 100644 --- a/fs/bcachefs/two_state_shared_lock.h +++ b/fs/bcachefs/two_state_shared_lock.h @@ -6,6 +6,8 @@ #include #include +#include "util.h" + /* * Two-state lock - can be taken for add or block - both states are shared, * like read side of rwsem, but conflict with other state: @@ -21,8 +23,37 @@ static inline void two_state_lock_init(two_state_lock_t *lock) init_waitqueue_head(&lock->wait); } -void bch2_two_state_unlock(two_state_lock_t *, int); -bool bch2_two_state_trylock(two_state_lock_t *, int); -void bch2_two_state_lock(two_state_lock_t *, int); +static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + EBUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? 
v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void __bch2_two_state_lock(two_state_lock_t *, int); + +static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + if (!bch2_two_state_trylock(lock, s)) + __bch2_two_state_lock(lock, s); +} #endif /* _BCACHEFS_TWO_STATE_LOCK_H */ -- cgit From 5250b74d55e16246a47bdcf1182b7469e28e0652 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Nov 2022 23:14:30 -0500 Subject: bcachefs: bucket_gens btree To improve mount times, add a btree for just bucket gens, 256 of them per key: this means we'll have to scan drastically less metadata at startup. This adds - trigger for keeping it in sync with the all btree - initialization code, for filesystems from previous versions - new path for reading bucket gens - new fsck code And a new on disk format version. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 406 +++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/alloc_background.h | 11 ++ fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 18 +- fs/bcachefs/bkey_methods.c | 3 + fs/bcachefs/recovery.c | 15 +- 6 files changed, 435 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f4c3effe2f4e..f515b038c14e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -597,6 +597,159 @@ int bch2_alloc_read(struct bch_fs *c) return ret; } +static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) +{ + *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; + + pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; + return pos; +} + +static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) +{ + pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; + pos.offset += offset; + return pos; +} + +static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) +{ + return k.k->type == KEY_TYPE_bucket_gens + ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] + : 0; +} + +int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { + prt_printf(err, "bad val size (%lu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { + if (i) + prt_char(out, ' '); + prt_printf(out, "%u", g.v->gens[i]); + } +} + +int bch2_bucket_gens_init(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_bucket_gens g; + bool have_bucket_gens_key = false; + unsigned offset; + struct bpos pos; + u8 gen; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + gen = bch2_alloc_to_v4(k, &a)->gen; + pos = alloc_gens_pos(iter.pos, &offset); + + if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + if (ret) + break; + have_bucket_gens_key = false; + } + + if (!have_bucket_gens_key) { + bkey_bucket_gens_init(&g.k_i); + g.k.p = pos; + have_bucket_gens_key = true; + } + + g.v.gens[offset] = gen; + } + bch2_trans_iter_exit(&trans, &iter); + + if (have_bucket_gens_key && !ret) + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + + return ret; +} + +int bch2_bucket_gens_read(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_bucket_gens *g; + struct bch_dev *ca; + u64 b; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + + if (k.k->type != KEY_TYPE_bucket_gens) + continue; + + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + + return ret; +} + /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, @@ -669,6 +822,44 @@ err: return ret; } +static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + struct bpos bucket, u8 gen) +{ + struct btree_iter iter; + unsigned offset; + struct bpos pos = alloc_gens_pos(bucket, &offset); + struct 
bkey_i_bucket_gens *g; + struct bkey_s_c k; + int ret; + + g = bch2_trans_kmalloc(trans, sizeof(*g)); + ret = PTR_ERR_OR_ZERO(g); + if (ret) + return ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_bucket_gens) { + bkey_bucket_gens_init(&g->k_i); + g->k.p = iter.pos; + } else { + bkey_reassemble(&g->k_i, k); + } + + g->v.gens[offset] = gen; + + ret = bch2_trans_update(trans, &iter, &g->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_trans_mark_alloc(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, @@ -732,6 +923,12 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, new_a->io_time[READ] = new_lru; } + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); + if (ret) + return ret; + } + return 0; } @@ -837,13 +1034,15 @@ static int bch2_check_alloc_key(struct btree_trans *trans, struct bkey_s_c alloc_k, struct btree_iter *alloc_iter, struct btree_iter *discard_iter, - struct btree_iter *freespace_iter) + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; + unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; @@ -859,14 +1058,8 @@ static int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a->data_type == BCH_DATA_need_discard - ? KEY_TYPE_set : 0; - freespace_key_type = a->data_type == BCH_DATA_free - ? KEY_TYPE_set : 0; - + discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) @@ -895,6 +1088,8 @@ static int bch2_check_alloc_key(struct btree_trans *trans, goto err; } + freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) @@ -924,13 +1119,47 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (a->gen != alloc_gen(k, gens_offset) && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i_bucket_gens *g = + bch2_trans_kmalloc(trans, sizeof(*g)); + + ret = PTR_ERR_OR_ZERO(g); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_bucket_gens) { + bkey_reassemble(&g->k_i, k); + } else { + bkey_bucket_gens_init(&g->k_i); + g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); + } + + g->v.gens[gens_offset] = a->gen; + + ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); + if (ret) + goto err; + } err: fsck_err: printbuf_exit(&buf); return ret; } -static int bch2_check_alloc_hole(struct btree_trans *trans, +static int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) @@ -985,6 +1214,71 @@ fsck_err: return ret; } +static int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + unsigned i, gens_offset, gens_end_offset; + int ret; + + if (c->sb.version < bcachefs_metadata_version_bucket_gens && + !c->opts.version_upgrade) + return 0; + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(alloc_gens_pos(start, &gens_offset), + alloc_gens_pos(*end, &gens_end_offset))) + gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; + + if (k.k->type == KEY_TYPE_bucket_gens) { + struct bkey_i_bucket_gens g; + bool need_update = false; + + bkey_reassemble(&g.k_i, k); + + for (i = gens_offset; i < gens_end_offset; i++) { + if (fsck_err_on(g.v.gens[i], c, + "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", + bucket_gens_pos_to_alloc(k.k->p, i).inode, + bucket_gens_pos_to_alloc(k.k->p, i).offset, + g.v.gens[i])) { + g.v.gens[i] = 0; + need_update = true; + } + } + + if (need_update) { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + memcpy(k, &g, sizeof(g)); + + ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + if (ret) + goto err; + } + } + + *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { @@ -1040,10 +1334,79 @@ delete: goto out; } +/* + * We've already checked that generation numbers in the bucket_gens btree are + * valid for buckets that exist; this just checks for keys for nonexistent + * buckets. 
+ */ +static int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_i_bucket_gens g; + struct bch_dev *ca; + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + u64 b; + bool need_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(k.k->type != KEY_TYPE_bucket_gens); + bkey_reassemble(&g.k_i, k); + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (fsck_err_on(end <= ca->mi.first_bucket || + start >= ca->mi.nbuckets, c, + "bucket_gens key for invalid buckets:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + for (b = start; b < ca->mi.first_bucket; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + for (b = ca->mi.nbuckets; b < end; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + if (need_update) { + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(g)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto out; + + memcpy(k, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, k, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter iter, discard_iter, freespace_iter; + struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bkey hole; struct bkey_s_c k; int ret = 0; @@ -1056,6 +1419,8 @@ int bch2_check_alloc_info(struct bch_fs *c) BTREE_ITER_PREFETCH); bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH); while (1) { struct bpos next; @@ -1076,16 +1441,21 @@ int bch2_check_alloc_info(struct bch_fs *c) ret = bch2_check_alloc_key(&trans, k, &iter, &discard_iter, - &freespace_iter); + &freespace_iter, + &bucket_gens_iter); if (ret) - break; + goto bkey_err; } else { next = k.k->p; - ret = bch2_check_alloc_hole(&trans, + ret = bch2_check_alloc_hole_freespace(&trans, bkey_start_pos(k.k), &next, - &freespace_iter); + &freespace_iter) ?: + bch2_check_alloc_hole_bucket_gens(&trans, + bkey_start_pos(k.k), + &next, + &bucket_gens_iter); if (ret) goto bkey_err; } @@ -1103,6 +1473,7 @@ bkey_err: if (ret) break; } + bch2_trans_iter_exit(&trans, &bucket_gens_iter); bch2_trans_iter_exit(&trans, &freespace_iter); bch2_trans_iter_exit(&trans, &discard_iter); bch2_trans_iter_exit(&trans, &iter); @@ -1119,7 +1490,12 @@ bkey_err: BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)); + bch2_check_discard_freespace_key(&trans, &iter)) ?: + for_each_btree_key_commit(&trans, iter, + BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_bucket_gens_key(&trans, &iter, k)); 
err: bch2_trans_exit(&trans); return ret < 0 ? ret : 0; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b843316d3846..d4957b4557bf 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -148,6 +148,16 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .atomic_trigger = bch2_mark_alloc, \ }) +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ + .key_invalid = bch2_bucket_gens_invalid, \ + .val_to_text = bch2_bucket_gens_to_text, \ +}) + +int bch2_bucket_gens_init(struct bch_fs *); + static inline bool bkey_is_alloc(const struct bkey *k) { return k->type == KEY_TYPE_alloc || @@ -156,6 +166,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) } int bch2_alloc_read(struct bch_fs *); +int bch2_bucket_gens_read(struct bch_fs *); int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 74632105fb45..acd4adaf475a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -441,6 +441,7 @@ enum gc_phase { GC_PHASE_BTREE_freespace, GC_PHASE_BTREE_need_discard, GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 024a714955f2..7e67d2e94a29 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -371,7 +371,8 @@ static inline void bkey_init(struct bkey *k) x(lru, 26) \ x(alloc_v4, 27) \ x(backpointer, 28) \ - x(inode_v3, 29) + x(inode_v3, 29) \ + x(bucket_gens, 30) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -1013,6 +1014,15 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + /* Quotas: */ enum quota_types { @@ -1551,7 +1561,8 @@ struct bch_sb_field_journal_seq_blacklist { x(new_data_types, 21) \ x(backpointers, 22) \ x(inode_v3, 23) \ - x(unwritten_extents, 24) + x(unwritten_extents, 24) \ + x(bucket_gens, 25) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -2086,7 +2097,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(lru, 10) \ x(freespace, 11) \ x(need_discard, 12) \ - x(backpointers, 13) + x(backpointers, 13) \ + x(bucket_gens, 14) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c7c0a9781a35..293188f47e8a 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -196,6 +196,9 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_backpointers] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_backpointer), + [BKEY_TYPE_bucket_gens] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_bucket_gens), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b35590226037..b10ba8963350 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -929,6 +929,7 @@ static bool btree_id_is_alloc(enum btree_id id) case 
BTREE_ID_backpointers: case BTREE_ID_need_discard: case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: return true; default: return false; @@ -1237,7 +1238,9 @@ use_clean: err = "error reading allocation information"; down_read(&c->gc_lock); - ret = bch2_alloc_read(c); + ret = c->sb.version < bcachefs_metadata_version_bucket_gens + ? bch2_alloc_read(c) + : bch2_bucket_gens_read(c); up_read(&c->gc_lock); if (ret) @@ -1362,6 +1365,16 @@ use_clean: if (ret) goto err; + if (c->sb.version < bcachefs_metadata_version_bucket_gens && + c->opts.version_upgrade) { + bch_info(c, "initializing bucket_gens"); + err = "error initializing bucket gens"; + ret = bch2_bucket_gens_init(c); + if (ret) + goto err; + bch_verbose(c, "bucket_gens init done"); + } + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { /* set bi_subvol on root inode */ err = "error upgrade root inode for subvolumes"; -- cgit From 71fe14655f49f717b06d92192f2492c22da6b3af Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Wed, 7 Dec 2022 18:41:21 +1300 Subject: bcachefs: expose nocow_lock table in sysfs Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 20484f67c3bc..5b1f792243cd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -194,6 +194,7 @@ read_attribute(btree_key_cache); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(write_points); +read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG read_attribute(write_refs); @@ -476,6 +477,23 @@ SHOW(bch2_fs) bch2_write_refs_to_text(out, c); #endif + if (attr == &sysfs_nocow_lock_table) { + int i, count = 1; + long last, curr = 0; + + last = atomic_long_read(&c->nocow_locks.l[0].v); + for (i = 1; i < BUCKET_NOCOW_LOCKS; i++) { + curr = atomic_long_read(&c->nocow_locks.l[i].v); + if (last != curr) { + prt_printf(out, "%li: %d\n", last, count); + count = 1; + last = curr; + } else + count++; + } + prt_printf(out, "%li: %d\n", last, count); + } + return 0; } @@ -662,6 +680,7 @@ struct attribute *bch2_fs_internal_files[] = { #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, #endif + &sysfs_nocow_lock_table, &sysfs_io_timers_read, &sysfs_io_timers_write, -- cgit From f3a37e76cade1469871c4309584ebbc358becf40 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 9 Dec 2022 12:37:56 +1300 Subject: bcachefs: handle failed data_update_init cleanup data_update_init allocates several resources, but we forget to clean these up when it fails. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c3f12b3adb14..190ad03910af 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -469,7 +469,7 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, ? 
0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - return ret; + goto err; } m->op.nr_replicas = m->op.nr_replicas_required = @@ -481,6 +481,14 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, if (bkey_extent_is_unwritten(k)) return -BCH_ERR_unwritten_extent_update; return 0; +err: + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + + bch2_bkey_buf_exit(&m->k, c); + bch2_bio_free_pages_pool(c, &m->op.wbio.bio); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -- cgit From 350175bf9b0fe5da12a2fd8bfd453a49f038ceb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 Dec 2022 20:52:11 -0500 Subject: bcachefs: Improved nocow locking This improves the nocow lock table so that hash table entries have multiple locks, and locks specify which bucket they're for - i.e. we can now resolve hash collisions. This is important because the allocator has to skip buckets that are locked in the nocow lock table, and previously hash collisions would cause it to spuriously skip unlocked buckets. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 10 ++++ fs/bcachefs/alloc_foreground.c | 1 + fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/data_update.c | 1 + fs/bcachefs/io.c | 18 +++--- fs/bcachefs/nocow_locking.c | 114 ++++++++++++++++++++++++++++++++++++-- fs/bcachefs/nocow_locking.h | 54 ++++++++---------- fs/bcachefs/nocow_locking_types.h | 20 +++++++ fs/bcachefs/super.c | 2 + fs/bcachefs/sysfs.c | 19 +------ 10 files changed, 179 insertions(+), 62 deletions(-) create mode 100644 fs/bcachefs/nocow_locking_types.h (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d4957b4557bf..a0c3c47b49b5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -23,6 +23,16 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) pos.offset < ca->mi.nbuckets; } +static inline u64 bucket_to_u64(struct bpos bucket) +{ + return (bucket.inode << 48) | bucket.offset; +} + +static inline struct bpos u64_to_bucket(u64 bucket) +{ + return POS(bucket >> 48, bucket & ~(~0ULL << 48)); +} + static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) { return a.gen - a.oldest_gen; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f78eaa52c11f..9e1c236d57b8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -28,6 +28,7 @@ #include "io.h" #include "journal.h" #include "movinggc.h" +#include "nocow_locking.h" #include "trace.h" #include diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index acd4adaf475a..6089d9ed6c27 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -206,7 +206,7 @@ #include "bcachefs_format.h" #include "errcode.h" #include "fifo.h" -#include "nocow_locking.h" +#include "nocow_locking_types.h" #include "opts.h" #include "util.h" diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 190ad03910af..eb248968de48 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -11,6 +11,7 @@ #include "io.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" #include "subvolume.h" #include "trace.h" diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d511bd664953..fe0c4b58e525 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -27,6 +27,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" 
#include "rebalance.h" #include "subvolume.h" #include "super.h" @@ -1469,7 +1470,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct { struct bpos b; unsigned gen; - two_state_lock_t *l; + struct nocow_lock_bucket *l; } buckets[BCH_REPLICAS_MAX]; unsigned nr_buckets = 0; u32 snapshot; @@ -1516,7 +1517,8 @@ retry: buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); buckets[nr_buckets].gen = ptr->gen; buckets[nr_buckets].l = - bucket_nocow_lock(&c->nocow_locks, buckets[nr_buckets].b); + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); prefetch(buckets[nr_buckets].l); nr_buckets++; @@ -1538,11 +1540,12 @@ retry: for (i = 0; i < nr_buckets; i++) { struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); - two_state_lock_t *l = buckets[i].l; + struct nocow_lock_bucket *l = buckets[i].l; bool stale; - if (!bch2_two_state_trylock(l, BUCKET_NOCOW_LOCK_UPDATE)) - __bch2_bucket_nocow_lock(&c->nocow_locks, l, BUCKET_NOCOW_LOCK_UPDATE); + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); @@ -2984,11 +2987,6 @@ void bch2_fs_io_exit(struct bch_fs *c) int bch2_fs_io_init(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) - two_state_lock_init(&c->nocow_locks.l[i]); - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index b325fb105322..53e5bc9fd585 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -4,12 +4,116 @@ #include "nocow_locking.h" #include "util.h" +#include + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) + return true; + return false; +} + +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + int lock_val = flags ? 1 : -1; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) { + if (!atomic_sub_return(lock_val, &l->l[i])) + closure_wake_up(&l->wait); + return; + } + + BUG(); +} + +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + int v, lock_val = flags ? 1 : -1; + unsigned i; + + spin_lock(&l->lock); + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) + goto got_entry; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (!atomic_read(&l->l[i])) { + l->b[i] = dev_bucket; + goto take_lock; + } +fail: + spin_unlock(&l->lock); + return false; +got_entry: + v = atomic_read(&l->l[i]); + if (lock_val > 0 ? 
v < 0 : v > 0) + goto fail; +take_lock: + atomic_add(lock_val, &l->l[i]); + spin_unlock(&l->lock); + return true; +} + void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - two_state_lock_t *l, int flags) + struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + } +} + +void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); - u64 start_time = local_clock(); + unsigned i, nr_zero = 0; + struct nocow_lock_bucket *l; + + for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { + unsigned v = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + v |= atomic_read(&l->l[i]); + + if (!v) { + nr_zero++; + continue; + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); + nr_zero = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + if (atomic_read(&l->l[i])) + prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); + prt_newline(out); + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); +} + +int bch2_fs_nocow_locking_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) + spin_lock_init(&c->nocow_locks.l[i].lock); - __bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index 2a7a9f44e88e..ff8e4af52edc 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -2,54 +2,48 @@ #ifndef _BCACHEFS_NOCOW_LOCKING_H #define _BCACHEFS_NOCOW_LOCKING_H -#include "bcachefs_format.h" -#include "two_state_shared_lock.h" +#include "bcachefs.h" +#include "alloc_background.h" +#include "nocow_locking_types.h" #include -#define BUCKET_NOCOW_LOCKS_BITS 10 -#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) - -struct bucket_nocow_lock_table { - two_state_lock_t l[BUCKET_NOCOW_LOCKS]; -}; - -#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - -static inline two_state_lock_t *bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket) +static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + u64 dev_bucket) { - u64 dev_bucket = bucket.inode << 56 | bucket.offset; unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); } -static inline bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, - struct bpos bucket) -{ - two_state_lock_t *l = bucket_nocow_lock(t, bucket); +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - return atomic_long_read(&l->v) != 0; -} +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, + struct nocow_lock_bucket *, u64, int); -static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) { - 
two_state_lock_t *l = bucket_nocow_lock(t, bucket); + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - bch2_two_state_unlock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); + __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); } -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, two_state_lock_t *, int); - -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, +static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) { - two_state_lock_t *l = bucket_nocow_lock(t, bucket); + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - if (!bch2_two_state_trylock(l, flags & BUCKET_NOCOW_LOCK_UPDATE)) - __bch2_bucket_nocow_lock(t, l, flags); + return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); } +void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); + +int bch2_fs_nocow_locking_init(struct bch_fs *); + #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h new file mode 100644 index 000000000000..bd12bf677924 --- /dev/null +++ b/fs/bcachefs/nocow_locking_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H +#define _BCACHEFS_NOCOW_LOCKING_TYPES_H + +#define BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct nocow_lock_bucket { + struct closure_waitlist wait; + spinlock_t lock; + u64 b[4]; + atomic_t l[4]; +} __aligned(SMP_CACHE_BYTES); + +struct bucket_nocow_lock_table { + struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; +}; + +#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ + diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2fb7e6300ea5..e142de2a5527 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -39,6 +39,7 @@ #include "move.h" #include "migrate.h" #include "movinggc.h" +#include "nocow_locking.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" @@ -821,6 +822,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: + bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5b1f792243cd..6cbdf70f36bd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -27,6 +27,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" #include "opts.h" #include "rebalance.h" #include "replicas.h" @@ -477,22 +478,8 @@ SHOW(bch2_fs) bch2_write_refs_to_text(out, c); #endif - if (attr == &sysfs_nocow_lock_table) { - int i, count = 1; - long last, curr = 0; - - last = atomic_long_read(&c->nocow_locks.l[0].v); - for (i = 1; i < BUCKET_NOCOW_LOCKS; i++) { - curr = atomic_long_read(&c->nocow_locks.l[i].v); - if (last != curr) { - prt_printf(out, "%li: %d\n", last, count); - count = 1; - last = curr; - } else - count++; - } - prt_printf(out, "%li: %d\n", last, count); - } + if (attr == &sysfs_nocow_lock_table) + bch2_nocow_locks_to_text(out, &c->nocow_locks); return 0; } -- cgit From 83f33d686553c5105ff36da4dd554c34125094e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Dec 2022 16:49:13 -0500 Subject: bcachefs: Rework lru btree This patch changes how the LRU index works: Instead of using KEY_TYPE_lru where the 
bucket the lru entry points to is part of the value, this switches to KEY_TYPE_set and encoding the bucket we refer to in the low bits of the key. This means that we no longer have to check for collisions when inserting LRU entries. We'll be making using of this in the next patch, which adds a btree write buffer - a pure write buffer for btree updates, where updates are appended to a simple array and then periodically sorted and batch inserted. This is a new on disk format version, and a forced upgrade. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 75 ++++++++++---------------- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/lru.c | 120 +++++++++++++---------------------------- fs/bcachefs/lru.h | 26 +++++++-- fs/bcachefs/recovery.c | 5 +- 6 files changed, 93 insertions(+), 138 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f515b038c14e..e81c04bc2327 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -914,13 +914,11 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, new_lru = alloc_lru_idx(*new_a); if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, - old_lru, &new_lru, old); + ret = bch2_lru_change(trans, new->k.p.inode, + bucket_to_u64(new->k.p), + old_lru, new_lru); if (ret) return ret; - - if (new_a->data_type == BCH_DATA_cached) - new_a->io_time[READ] = new_lru; } if (old_a->gen != new_a->gen) { @@ -1510,7 +1508,6 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, const struct bch_alloc_v4 *a; struct bkey_s_c alloc_k, k; struct printbuf buf = PRINTBUF; - struct printbuf buf2 = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); @@ -1527,8 +1524,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, return 0; bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, - POS(alloc_k.k->p.inode, a->io_time[READ]), 0); - + lru_pos(alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ]), 0); k = bch2_btree_iter_peek_slot(&lru_iter); ret = bkey_err(k); if (ret) @@ -1539,21 +1537,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(k.k->type != KEY_TYPE_lru || - le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, - "incorrect/missing lru entry\n" - " %s\n" + fsck_err_on(k.k->type != KEY_TYPE_set, c, + "missing lru entry\n" " %s", (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { u64 read_time = a->io_time[READ] ?: atomic64_read(&c->io_clock[READ].now); ret = bch2_lru_set(trans, alloc_k.k->p.inode, - alloc_k.k->p.offset, - &read_time); + bucket_to_u64(alloc_k.k->p), + read_time); if (ret) goto err; @@ -1574,7 +1569,6 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf2); printbuf_exit(&buf); return ret; } @@ -1757,51 +1751,34 @@ void bch2_do_discards(struct bch_fs *c) } static int invalidate_one_bucket(struct btree_trans *trans, - struct btree_iter *lru_iter, struct bkey_s_c k, - unsigned dev_idx, s64 *nr_to_invalidate) + struct btree_iter *lru_iter, + struct bpos bucket, + s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 
*a; - struct bpos bucket; struct printbuf buf = PRINTBUF; unsigned cached_sectors; int ret = 0; - if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) + if (*nr_to_invalidate <= 0) return 1; - if (k.k->type != KEY_TYPE_lru) { - prt_printf(&buf, "non lru key in lru btree:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - } - - goto out; - } - - bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); - a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; - if (k.k->p.offset != alloc_lru_idx(a->v)) { + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) { prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + bch2_bpos_to_text(&buf, lru_iter->pos); prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); + bch_err(c, "%s", buf.buf); + if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch2_inconsistent_error(c); ret = -EINVAL; } @@ -1852,9 +1829,13 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, - POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, 0), + lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), + BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, + u64_to_bucket(k.k->p.offset), + &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7e67d2e94a29..99f9fbd1401f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1562,7 +1562,8 @@ struct bch_sb_field_journal_seq_blacklist { x(backpointers, 22) \ x(inode_v3, 23) \ x(unwritten_extents, 24) \ - x(bucket_gens, 25) + x(bucket_gens, 25) \ + x(lru_v2, 26) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 293188f47e8a..f40a3ea3f79b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -186,7 +186,7 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_snapshot), [BKEY_TYPE_lru] = (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_lru), + (1U << KEY_TYPE_set), [BKEY_TYPE_freespace] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_set), diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 12821868df71..6f7becb051bc 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -8,6 +8,7 @@ #include "lru.h" #include "recovery.h" +/* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, int rw, struct printbuf *err) { @@ -19,7 +20,7 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (!k.k->p.offset) { + if (!lru_pos_time(k.k->p)) { prt_printf(err, "lru entry at time=0"); return -BCH_ERR_invalid_bkey; @@ -36,101 +37,57 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "idx 
%llu", le64_to_cpu(lru->idx)); } -int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, - struct bkey_s_c orig_k) +static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, + u64 dev_bucket, u64 time, unsigned key_type) { struct btree_iter iter; - struct bkey_s_c k; - u64 existing_idx; - struct printbuf buf = PRINTBUF; + struct bkey_i *k; int ret = 0; if (!time) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - POS(id, time), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; - if (k.k->type != KEY_TYPE_lru) { - bch2_bkey_val_to_text(&buf, trans->c, orig_k); - bch2_trans_inconsistent(trans, - "pointer to nonexistent lru %llu:%llu\n%s", - id, time, buf.buf); - ret = -EIO; - goto err; - } + bkey_init(&k->k); + k->k.type = key_type; + k->k.p = lru_pos(lru_id, dev_bucket, time); - existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); - if (existing_idx != idx) { - bch2_bkey_val_to_text(&buf, trans->c, orig_k); - bch2_trans_inconsistent(trans, - "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", - id, time, existing_idx, idx, buf.buf); - ret = -EIO; - goto err; - } + EBUG_ON(lru_pos_id(k->k.p) != lru_id); + EBUG_ON(lru_pos_time(k->k.p) != time); + EBUG_ON(k->k.p.offset != dev_bucket); - ret = bch2_btree_delete_at(trans, &iter, 0); -err: + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + k->k.p, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } -int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_lru *lru; - int ret = 0; - - if (!*time) - return 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, - POS(lru_id, *time), - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES, k, ret) - if (bkey_deleted(k.k)) - break; - - if (ret) - goto err; - - BUG_ON(iter.pos.inode != lru_id); - *time = iter.pos.offset; - - lru = bch2_bkey_alloc(trans, &iter, lru); - ret = PTR_ERR_OR_ZERO(lru); - if (ret) - goto err; - - lru->v.idx = cpu_to_le64(idx); + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); +} - ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; +int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, - u64 old_time, u64 *new_time, - struct bkey_s_c k) +int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { - if (old_time == *new_time) + if (old_time == new_time) return 0; - return bch2_lru_delete(trans, id, idx, old_time, k) ?: - bch2_lru_set(trans, id, idx, new_time); + return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: + bch2_lru_set(trans, lru_id, dev_bucket, new_time); } static int bch2_check_lru_key(struct btree_trans *trans, @@ -144,12 +101,9 @@ static int bch2_check_lru_key(struct btree_trans *trans, const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = 
PRINTBUF; - struct bpos alloc_pos; + struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); int ret; - alloc_pos = POS(lru_k.k->p.inode, - le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) @@ -163,10 +117,12 @@ static int bch2_check_lru_key(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &a_convert); - if (fsck_err_on(a->data_type != BCH_DATA_cached || - a->io_time[READ] != lru_k.k->p.offset, c, - "incorrect lru entry %s\n" + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set || + a->data_type != BCH_DATA_cached || + a->io_time[READ] != lru_pos_time(lru_k.k->p), c, + "incorrect lru entry (time %llu) %s\n" " for %s", + lru_pos_time(lru_k.k->p), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ret = bch2_btree_delete_at(trans, lru_iter, 0); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 925c29b49b86..2e22f139848a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,6 +2,26 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) +{ + EBUG_ON(time > LRU_TIME_MAX); + + return POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); +} + +static inline u64 lru_pos_id(struct bpos pos) +{ + return pos.inode >> LRU_TIME_BITS; +} + +static inline u64 lru_pos_time(struct bpos pos) +{ + return pos.inode & ~(~0ULL << LRU_TIME_BITS); +} + int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -10,9 +30,9 @@ void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_lru_to_text, \ }) -int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); -int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); -int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); +int bch2_lru_del(struct btree_trans *, u16, u64, u64); +int bch2_lru_set(struct btree_trans *, u16, u64, u64); +int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); int bch2_check_lrus(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b10ba8963350..8a78377bf9c5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1094,14 +1094,11 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_backpointers) { + if (c->sb.version < bcachefs_metadata_version_lru_v2) { bch_info(c, "version prior to backpointers, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { - bch_info(c, "version prior to inode_v3, upgrade required"); - c->opts.version_upgrade = true; } } -- cgit From facafdcbc157686311dbe58649ef9d29fcf8e610 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 19:58:16 -0500 Subject: bcachefs: Change bkey_invalid() rw param to flags Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 11 ++++++----- fs/bcachefs/alloc_background.h | 10 +++++----- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/bkey_methods.c | 22 +++++++++++----------- fs/bcachefs/bkey_methods.h | 8 ++++---- fs/bcachefs/dirent.c | 2 
+- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/ec.h | 2 +- fs/bcachefs/extents.c | 12 ++++++------ fs/bcachefs/extents.h | 8 ++++---- fs/bcachefs/inode.c | 8 ++++---- fs/bcachefs/inode.h | 8 ++++---- fs/bcachefs/lru.c | 2 +- fs/bcachefs/lru.h | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/quota.h | 2 +- fs/bcachefs/reflink.c | 8 ++++---- fs/bcachefs/reflink.h | 6 +++--- fs/bcachefs/subvolume.c | 4 ++-- fs/bcachefs/subvolume.h | 4 ++-- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 2 +- 24 files changed, 68 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e81c04bc2327..8b15d7a78933 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -222,7 +222,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -237,7 +237,7 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -250,7 +250,7 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -263,9 +263,10 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int rw = flags & WRITE; if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { prt_printf(err, "bad val size (%lu != %u)", @@ -620,7 +621,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { prt_printf(err, "bad val size (%lu != %zu)", diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index a0c3c47b49b5..b3c2f1e0deb6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -122,10 +122,10 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, 
struct bkey_s_c); @@ -158,7 +158,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .atomic_trigger = bch2_mark_alloc, \ }) -int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3e862da6f15f..c269fc73a41d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -38,7 +38,7 @@ static bool extent_matches_bp(struct bch_fs *c, } int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 153870d4e9a0..ded1ab7fb0bc 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -8,7 +8,7 @@ #include "super.h" int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f40a3ea3f79b..72d95831d65d 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -24,7 +24,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -38,7 +38,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", @@ -54,7 +54,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { prt_printf(err, "incorrect value size (%zu != %zu)", @@ -74,7 +74,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -95,7 +95,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", @@ -124,14 +124,14 @@ const struct bkey_ops bch2_bkey_ops[] = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->type >= KEY_TYPE_MAX) { prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); return 
-BCH_ERR_invalid_bkey; } - return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); + return bch2_bkey_ops[k.k->type].key_invalid(c, k, flags, err); } static unsigned bch2_key_types_allowed[] = { @@ -207,7 +207,7 @@ static unsigned bch2_key_types_allowed[] = { int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); @@ -263,10 +263,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type, rw, err) ?: - bch2_bkey_val_invalid(c, k, rw, err); + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); } int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 7c907b7fd0d7..8f60e1df678e 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -21,7 +21,7 @@ extern const char * const bch2_bkey_types[]; */ struct bkey_ops { int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err); + unsigned flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -38,11 +38,11 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); + enum btree_node_type, unsigned, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); + enum btree_node_type, unsigned, struct printbuf *); int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f1838b7c45ee..4c85d3399fb4 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -84,7 +84,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1a2c9108f864..ad131e8edc29 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7028fb718ebf..879df8bd1f51 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,7 +105,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const 
struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -131,7 +131,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 5587c9467fb5..d47da7d86fe7 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -7,7 +7,7 @@ #include "extents_types.h" int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 55a8879dc4fe..4b865949768f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -165,7 +165,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { prt_printf(err, "value too big (%zu > %u)", @@ -173,7 +173,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -183,7 +183,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -206,7 +206,7 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -388,7 +388,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -1085,7 +1085,7 @@ static int extent_ptr_invalid(const struct bch_fs *c, } int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index e27d39b728b3..1d8f3b309b07 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -371,11 +371,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct 
bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -414,7 +414,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -659,7 +659,7 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a98e40065122..560545a7ea03 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -433,7 +433,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) } int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -453,7 +453,7 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -473,7 +473,7 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); @@ -536,7 +536,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c } int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b753e1b254e4..f5066afb4886 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,9 +7,9 @@ extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ @@ -41,7 +41,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 6f7becb051bc..f1d6368dda07 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c 
@@ -10,7 +10,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const struct bch_lru *lru = bkey_s_c_to_lru(k).v; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 2e22f139848a..0cfc7459711b 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -22,7 +22,7 @@ static inline u64 lru_pos_time(struct bpos pos) return pos.inode & ~(~0ULL << LRU_TIME_BITS); } -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 4b663f320bfc..331f22835d18 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,7 +59,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { prt_printf(err, "invalid quota type (%llu >= %u)", diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 59bed1148201..146264fd16ce 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index faf75bcf9ee7..87446f7bad4f 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -26,7 +26,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -78,7 +78,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); @@ -88,7 +88,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -131,7 +131,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { prt_printf(err, "incorrect value size (%zu < %zu)", diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index ce0012aa99c6..2391037c2ece 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -3,7 +3,7 @@ #define _BCACHEFS_REFLINK_H int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf 
*); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -17,7 +17,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); }) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, @@ -32,7 +32,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 3f5893f317d1..1805c8542d65 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -25,7 +25,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; @@ -733,7 +733,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, SUBVOL_POS_MAX)) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index c694c1c24483..b6740eab78d3 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -7,7 +7,7 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -106,7 +106,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 448737be045c..4c86878b3df2 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,7 +70,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 03f1b73fc926..1a4cff3a9d96 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ -- cgit From 
dbe17f18838df6d0facf51b43cdc5efd372c28d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Dec 2022 20:00:34 -0500 Subject: bcachefs: BKEY_INVALID_FROM_JOURNAL Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 +++----- fs/bcachefs/bkey_methods.h | 2 ++ fs/bcachefs/journal_io.c | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8b15d7a78933..e8412fb1547e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -280,11 +280,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - /* - * XXX this is wrong, we'll be checking updates that happened from - * before BCH_FS_CHECK_BACKPOINTERS_DONE - */ - if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (rw == WRITE && + !(flags & BKEY_INVALID_FROM_JOURNAL) && + test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { unsigned i, bp_len = 0; for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 8f60e1df678e..9a6afab87f6c 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -38,6 +38,8 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; +#define BKEY_INVALID_FROM_JOURNAL (1 << 1) + int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, unsigned, struct printbuf *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a9744924d619..c6bb78d2a07f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -340,7 +340,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, - k, version, big_endian, write); + k, version, big_endian, write|BKEY_INVALID_FROM_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -661,7 +661,8 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, -- cgit From 7ffb6a7ec6712eb1ba84a80137c2f712e67c4171 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Jan 2023 17:53:02 -0500 Subject: bcachefs: Fix deadlock on nocow locks in data move path The recent nocow locking rework introduced a deadlock in the data move path: the new nocow locking scheme uses a hash table with a fixed size array for chaining, meaning on hash collision we may have to wait for other locks to be released before we can lock a bucket. And since the data move path needs to submit writes from the same thread that's taking nocow locks and submitting reads, this introduces a deadlock. This shouldn't happen often in practice, but since the data move path can keep large numbers of IOs in flight simultaneously, it's something we have to handle. This patch makes move_ctxt_wait_event() available to bch2_data_update_init() and uses it when appropriate, which is our normal solution to this kind of thing. 
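In outline, the fix below teaches bch2_data_update_init() to first flush its own pending writes and try-lock the nocow bucket lock, falling back to a blocking lock only once it has no reads in flight that could feed the deadlock. The snippet here is condensed from the hunk that follows (the non-ctxt path and error handling are omitted):

	if (ctxt) {
		bool locked;

		/* flush pending writes, then wait until we either take the
		 * lock or have no reads outstanding: */
		move_ctxt_wait_event(ctxt, trans,
			(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
						PTR_BUCKET_POS(c, &p.ptr), 0)) ||
			!atomic_read(&ctxt->read_sectors));

		if (!locked)
			bch2_bucket_nocow_lock(&c->nocow_locks,
					       PTR_BUCKET_POS(c, &p.ptr), 0);
	}
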
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 22 +++++++++++++++++++--- fs/bcachefs/data_update.h | 3 ++- fs/bcachefs/io.c | 28 +++++++++++++++------------- fs/bcachefs/move.c | 21 ++++++--------------- fs/bcachefs/move.h | 13 +++++++++++++ 5 files changed, 55 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index eb248968de48..91bc95b8ecb9 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -397,13 +397,16 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, } } -int bch2_data_update_init(struct bch_fs *c, struct data_update *m, +int bch2_data_update_init(struct btree_trans *trans, + struct moving_context *ctxt, + struct data_update *m, struct write_point_specifier wp, struct bch_io_opts io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -460,8 +463,21 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, i++; - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } } if (reserve_sectors) { diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index f304c3366226..49e9055cbb52 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -33,7 +33,8 @@ void bch2_data_update_read_done(struct data_update *, void bch2_data_update_exit(struct data_update *); void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -int bch2_data_update_init(struct bch_fs *, struct data_update *, +int bch2_data_update_init(struct btree_trans *, struct moving_context *, + struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, enum btree_id, struct bkey_s_c); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index fe0c4b58e525..93d300812b4b 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1961,7 +1961,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_data_update_read_done(&op->write, rbio->pick.crc); } -static struct promote_op *__promote_alloc(struct bch_fs *c, +static struct promote_op *__promote_alloc(struct btree_trans *trans, enum btree_id btree_id, struct bkey_s_c k, struct bpos pos, @@ -1970,6 +1970,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned sectors, struct bch_read_bio **rbio) { + struct bch_fs *c = trans->c; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); @@ -2013,7 +2014,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_data_update_init(c, &op->write, + ret = bch2_data_update_init(trans, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, (struct data_update_opts) { @@ -2037,16 +2038,17 @@ err: } noinline -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts 
opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { + struct bch_fs *c = trans->c; bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full @@ -2060,7 +2062,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, + promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, @@ -2667,7 +2669,7 @@ retry_pick: } if (orig->opts.promote_target) - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, &rbio, &bounce, &read_full); if (!read_full) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9e453b8495e8..d0ce656755d7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -91,7 +91,7 @@ static void move_write(struct moving_io *io) bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } -static inline struct moving_io *next_pending_write(struct moving_context *ctxt) +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) { struct moving_io *io = list_first_entry_or_null(&ctxt->reads, struct moving_io, list); @@ -111,29 +111,20 @@ static void move_read_endio(struct bio *bio) closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, + struct btree_trans *trans) { struct moving_io *io; if (trans) bch2_trans_unlock(trans); - while ((io = next_pending_write(ctxt))) { + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { list_del(&io->list); move_write(io); } } -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ -do { \ - do_pending_writes(_ctxt, _trans); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - next_pending_write(_ctxt) || (_cond)); \ -} while (1) - static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, struct btree_trans *trans) { @@ -299,8 +290,8 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, - data_opts, btree_id, k); + ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, btree_id, k); if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index b14f679f6904..a2822d4a4afb 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -28,6 +28,16 @@ struct moving_context { wait_queue_head_t wait; }; +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ + bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || (_cond));\ +} while (1) + typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); @@ -35,6 +45,9 @@ void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, 
struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *, + struct btree_trans *); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); -- cgit From 46eea9cb9ec14684d1b6e90c17db1ac896f26a5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Jan 2023 23:39:42 -0500 Subject: bcachefs: Fix move_ctxt_wait_event() We shouldn't be evaluating cond again if it already returned true. This fixes a bug when this helper is used for taking nocow locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index a2822d4a4afb..34b9cadef6b5 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -28,14 +28,18 @@ struct moving_context { wait_queue_head_t wait; }; -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ -do { \ - bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - bch2_moving_ctxt_next_pending_write(_ctxt) || (_cond));\ +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ + if (cond_finished) \ + break; \ } while (1) typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, -- cgit From 629a21b621c466deac6e7ce20242308091f09735 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Jan 2023 23:54:10 -0500 Subject: bcachefs: Improve invalidate_one_bucket() error messages Make sure to check for lru entries that point to buckets that don't exist as well as buckets in the wrong state, and improve the error message we print out. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 51 +++++++++++++++++++++++++++++------------- fs/bcachefs/lru.c | 9 ++++++++ fs/bcachefs/lru.h | 2 ++ 3 files changed, 46 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e8412fb1547e..83c750e049dc 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1751,37 +1751,38 @@ void bch2_do_discards(struct bch_fs *c) static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, - struct bpos bucket, + struct bkey_s_c lru_k, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bkey_i_alloc_v4 *a; + struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; if (*nr_to_invalidate <= 0) return 1; + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; + } + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) { - prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bpos_to_text(&buf, lru_iter->pos); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - - bch_err(c, "%s", buf.buf); - if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch2_inconsistent_error(c); - ret = -EINVAL; - } + prt_str(&buf, "alloc key does not point back to lru entry when invalidating bucket:"); + goto err; + } - goto out; + if (a->v.data_type != BCH_DATA_cached) { + prt_str(&buf, "lru entry points to non cached bucket:"); + goto err; } if (!a->v.cached_sectors) @@ -1810,6 +1811,26 @@ out: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; } static void bch2_do_invalidates_work(struct work_struct *work) @@ -1832,9 +1853,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, - u64_to_bucket(k.k->p.offset), - &nr_to_invalidate)); + invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index f1d6368dda07..07fb41ca8c6b 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -37,6 +37,15 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); } +void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) +{ + prt_printf(out, "%llu:%llu -> %llu:%llu", + lru_pos_id(lru), + lru_pos_time(lru), + u64_to_bucket(lru.offset).inode, + u64_to_bucket(lru.offset).offset); +} + static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, unsigned key_type) { diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 
0cfc7459711b..b8d9848cdb1a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -25,6 +25,8 @@ static inline u64 lru_pos_time(struct bpos pos) int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); + #define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_invalid = bch2_lru_invalid, \ .val_to_text = bch2_lru_to_text, \ -- cgit From 0093b9e9704cc932363c66b2b072b762771ffe1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jan 2023 03:55:23 -0500 Subject: bcachefs: Fix promote path leak Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 93d300812b4b..1436863fe418 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1924,6 +1924,8 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) { int ret; + bch2_data_update_exit(&op->write); + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); @@ -1939,8 +1941,6 @@ static void promote_done(struct bch_write_op *wop) bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - - bch2_data_update_exit(&op->write); promote_free(c, op); } -- cgit From 3482dd6a250397fe1dc088a16bc9e50d72f217e6 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 6 Jan 2023 21:11:07 +1300 Subject: bcachefs: don't block reads if we're promoting The promote path calls data_update_init() and now that we take locks here, there's potential for promote to block our read path, just error when we can't take the lock instead of blocking. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 26 +++++++++++++++++--------- fs/bcachefs/errcode.h | 1 + fs/bcachefs/io.c | 7 +++++++ 3 files changed, 25 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 91bc95b8ecb9..8ff20a4587d9 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -411,6 +411,7 @@ int bch2_data_update_init(struct btree_trans *trans, const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned int ptrs_locked = 0; int ret; bch2_bkey_buf_init(&m->k); @@ -436,6 +437,8 @@ int bch2_data_update_init(struct btree_trans *trans, i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + bool locked; + if (((1U << i) & m->data_opts.rewrite_ptrs) && p.ptr.cached) BUG(); @@ -461,11 +464,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; - if (ctxt) { - bool locked; - move_ctxt_wait_event(ctxt, trans, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || @@ -475,9 +474,14 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_bucket_nocow_lock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0); } else { - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } } + ptrs_locked |= (1U << i); + i++; } if (reserve_sectors) { @@ -499,9 +503,13 @@ int bch2_data_update_init(struct btree_trans *trans, return -BCH_ERR_unwritten_extent_update; return 0; err: - 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((1U << i) & ptrs_locked) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + i++; + } bch2_bkey_buf_exit(&m->k, c); bch2_bio_free_pages_pool(c, &m->op.wbio.bio); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 57f1d0a6a490..6129af6129c3 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -120,6 +120,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1436863fe418..6f7e4dac4268 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2023,6 +2023,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); + if (ret == -BCH_ERR_nocow_lock_blocked) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + BUG_ON(ret); op->write.op.end_io = promote_done; -- cgit From 01efebd8f13c41341754a2f0b431aa81209f8f30 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Jan 2023 13:36:30 -0500 Subject: bcachefs: Add an assert to bch2_bucket_nocow_unlock() Signed-off-by: Kent Overstreet --- fs/bcachefs/nocow_locking.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 53e5bc9fd585..396357cd8f2f 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -18,6 +18,8 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos return false; } +#define sign(v) (v < 0 ? -1 : v > 0 ? 
1 : 0) + void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) { u64 dev_bucket = bucket_to_u64(bucket); @@ -27,6 +29,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc for (i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) { + BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + if (!atomic_sub_return(lock_val, &l->l[i])) closure_wake_up(&l->wait); return; -- cgit From c782c5832e9251ab6f4df837932d959f3e02ab25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jan 2023 01:45:18 -0500 Subject: bcachefs: Add max nr of IOs in flight to the move path Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 22 ++++++++++++++++------ fs/bcachefs/move.h | 2 ++ fs/bcachefs/opts.h | 7 ++++++- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d0ce656755d7..c2226353c775 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -74,6 +74,7 @@ static void move_write_done(struct bch_write_op *op) ctxt->write_error = true; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_dec(&io->write.ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } @@ -87,6 +88,7 @@ static void move_write(struct moving_io *io) closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } @@ -105,6 +107,7 @@ static void move_read_endio(struct bio *bio) struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); + atomic_dec(&ctxt->read_ios); io->read_completed = true; wake_up(&ctxt->wait); @@ -139,7 +142,11 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); + EBUG_ON(atomic_read(&ctxt->write_sectors)); + EBUG_ON(atomic_read(&ctxt->write_ios)); + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); if (ctxt->stats) { progress_list_del(ctxt->c, ctxt->stats); @@ -314,6 +321,7 @@ static int bch2_move_extent(struct btree_trans *trans, trace_move_extent_read(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); + atomic_inc(&ctxt->read_ios); list_add_tail(&io->list, &ctxt->reads); /* @@ -403,13 +411,15 @@ static int move_ratelimit(struct btree_trans *trans, } } while (delay); + /* + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->write_sectors) < - c->opts.move_bytes_in_flight >> 9); - - move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->read_sectors) < - c->opts.move_bytes_in_flight >> 9); + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && + atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); return 0; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 34b9cadef6b5..aef613802935 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -24,6 +24,8 @@ struct moving_context { /* in flight sectors: */ atomic_t read_sectors; atomic_t write_sectors; + atomic_t read_ios; + atomic_t write_ios; wait_queue_head_t wait; }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index ef1b8a03f149..fc444c68025c 100644 --- a/fs/bcachefs/opts.h +++ 
b/fs/bcachefs/opts.h @@ -299,7 +299,12 @@ enum opt_type { OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1024, U32_MAX), \ BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Amount of IO in flight to keep in flight by the move path")\ + NULL, "Maximum Amount of IO to keep in flight by the move path")\ + x(move_ios_in_flight, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 1024), \ + BCH2_NO_SB_OPT, 32, \ + NULL, "Maximum number of IOs to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From 637de729fc86effe021bd067cccd68efd07f59c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Nov 2021 15:50:22 -0500 Subject: bcachefs: Ensure btree node cache is not more than half dirty Tweak journal reclaim to ensure the btree node cache isn't more than half dirty so that memory reclaim can always make progress - the same as we do for the btree key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index e8b7ed8e1333..0f3c103e63ee 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -640,6 +640,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (fifo_free(&j->pin) <= 32) min_nr = 1; + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); trace_and_count(c, journal_reclaim_start, c, -- cgit From 12795a1937460020af999e3cf54c146598402455 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Feb 2023 15:47:46 -0500 Subject: bcachefs: Add some logging for btree node rewrites due to errors Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 +++++++++- fs/bcachefs/btree_update_interior.c | 23 ++++++++++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 87d80a59dd7e..66747fe21323 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1240,8 +1240,16 @@ start: bio_put(&rb->bio); printbuf_exit(&buf); - if (saw_error && !btree_node_read_error(b)) + if (saw_error && !btree_node_read_error(b)) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); + printbuf_exit(&buf); + bch2_btree_node_rewrite_async(c, b); + } clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6287e926f605..612d0007fb23 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2007,6 +2007,7 @@ struct async_btree_rewrite { static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; int ret; @@ -2018,8 +2019,18 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans, if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) + if (!b || b->data->keys.seq != a->seq) { + struct printbuf buf = PRINTBUF; + + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null"); + bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", + __func__, a->seq, 
buf.buf); + printbuf_exit(&buf); goto out; + } ret = bch2_btree_node_rewrite(trans, &iter, b, 0); out: @@ -2033,9 +2044,12 @@ void async_btree_node_rewrite_work(struct work_struct *work) struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; + int ret; - bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(&trans, a)); + if (ret) + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2044,12 +2058,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + bch_err(c, "%s: error getting c->writes ref", __func__); return; + } a = kmalloc(sizeof(*a), GFP_NOFS); if (!a) { bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + bch_err(c, "%s: error allocating memory", __func__); return; } -- cgit From 09d70d0be1d5670a9df24656c5e429ab4f239c16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Feb 2023 12:38:28 -0500 Subject: bcachefs: Nocow locking fixup Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 8ff20a4587d9..04d56e960872 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -308,9 +308,11 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); const struct bch_extent_ptr *ptr; - bkey_for_each_ptr(ptrs, ptr) + bkey_for_each_ptr(ptrs, ptr) { bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); + PTR_BUCKET_POS(c, ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + } bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); @@ -410,6 +412,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; + const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned int ptrs_locked = 0; int ret; @@ -435,6 +438,9 @@ int bch2_data_update_init(struct btree_trans *trans, if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) m->op.alloc_reserve = RESERVE_movinggc; + bkey_for_each_ptr(ptrs, ptr) + percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bool locked; @@ -507,7 +513,8 @@ err: bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if ((1U << i) & ptrs_locked) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + PTR_BUCKET_POS(c, &p.ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); i++; } -- cgit From a1f26d700aa51fc942ca07ee501b9117075c84e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Feb 2023 12:57:04 -0500 Subject: bcachefs: Handle btree node rewrites before going RW Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++ fs/bcachefs/btree_update_interior.c | 65 +++++++++++++++++++++++++++++++++---- fs/bcachefs/btree_update_interior.h | 3 ++ fs/bcachefs/super.c | 2 ++ 4 files changed, 66 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6089d9ed6c27..84b30adf56c9 100644 --- 
a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -780,6 +780,9 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + struct list_head pending_node_rewrites; + struct mutex pending_node_rewrites_lock; + /* btree_io.c: */ spinlock_t btree_write_error_lock; struct btree_write_stats { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 612d0007fb23..45004f17d51d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1998,6 +1998,7 @@ err: struct async_btree_rewrite { struct bch_fs *c; struct work_struct work; + struct list_head list; enum btree_id btree_id; unsigned level; struct bpos pos; @@ -2057,15 +2058,10 @@ void async_btree_node_rewrite_work(struct work_struct *work) void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - bch_err(c, "%s: error getting c->writes ref", __func__); - return; - } + int ret; a = kmalloc(sizeof(*a), GFP_NOFS); if (!a) { - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); bch_err(c, "%s: error allocating memory", __func__); return; } @@ -2075,11 +2071,63 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->level = b->c.level; a->pos = b->key.k.p; a->seq = b->data->keys.seq; - INIT_WORK(&a->work, async_btree_node_rewrite_work); + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + mutex_lock(&c->pending_node_rewrites_lock); + list_add(&a->list, &c->pending_node_rewrites); + mutex_unlock(&c->pending_node_rewrites_lock); + return; + } + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (test_bit(BCH_FS_STARTED, &c->flags)) { + bch_err(c, "%s: error getting c->writes ref", __func__); + kfree(a); + return; + } + + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "%s: error going read-write: %s", + __func__, bch2_err_str(ret)); + kfree(a); + return; + } + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + } + queue_work(c->btree_interior_update_worker, &a->work); } +void bch2_do_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_interior_update_worker, &a->work); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +void bch2_free_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + kfree(a); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + static int __bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct btree *new_hash, @@ -2417,6 +2465,9 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + INIT_LIST_HEAD(&c->pending_node_rewrites); + mutex_init(&c->pending_node_rewrites_lock); + c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!c->btree_interior_update_worker) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 2e6d220c3bcd..30e9c137b0e2 100644 --- 
a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -318,6 +318,9 @@ void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, struct jset_entry *); +void bch2_do_pending_node_rewrites(struct bch_fs *); +void bch2_free_pending_node_rewrites(struct bch_fs *); + void bch2_fs_btree_interior_update_exit(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e142de2a5527..58517f6d128f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -418,6 +418,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_discards(c); bch2_do_invalidates(c); bch2_do_stripe_deletes(c); + bch2_do_pending_node_rewrites(c); return 0; err: __bch2_fs_read_only(c); @@ -446,6 +447,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_free_pending_node_rewrites(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); -- cgit From 930c0c4cefede8532765cc4f74ec3ff05dc1db15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Feb 2023 19:30:41 -0500 Subject: bcachefs: Add missing include Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 153ae548a89a..9fdddfb15782 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -8,6 +8,7 @@ //#include "bkey_methods.h" #include "buckets_types.h" #include "darray.h" +#include "errcode.h" #include "journal_types.h" #include "replicas_types.h" #include "six.h" -- cgit From c1f59ef6d00940f22fa5e88d1d7705c85ec6d118 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Feb 2023 19:31:03 -0500 Subject: bcachefs: More info on check_bucket_ref() error Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9dcdfca19d52..abbd28bf9a45 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -663,13 +663,14 @@ err: return ret; } -static int check_bucket_ref(struct bch_fs *c, +static int check_bucket_ref(struct btree_trans *trans, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, u32 dirty_sectors, u32 cached_sectors) { + struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached @@ -726,7 +727,7 @@ static int check_bucket_ref(struct bch_fs *c, if (b_gen != ptr->gen) { ret = 1; - goto err; + goto out; } if (!data_type_is_empty(bucket_data_type) && @@ -756,9 +757,12 @@ static int check_bucket_ref(struct bch_fs *c, ret = -EIO; goto err; } -err: +out: printbuf_exit(&buf); return ret; +err: + bch2_dump_trans_updates(trans); + goto out; } static int mark_stripe_bucket(struct btree_trans *trans, @@ -800,7 +804,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, bucket_lock(g); old = *g; - ret = check_bucket_ref(c, k, ptr, sectors, data_type, + ret = check_bucket_ref(trans, k, ptr, sectors, data_type, g->gen, g->data_type, g->dirty_sectors, g->cached_sectors); if (ret) @@ -832,7 +836,7 @@ static int __mark_pointer(struct btree_trans *trans, u32 
*dst_sectors = !ptr->cached ? dirty_sectors : cached_sectors; - int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, + int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, bucket_gen, *bucket_data_type, *dirty_sectors, *cached_sectors); @@ -1557,7 +1561,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, a->v.gen, a->v.data_type, a->v.dirty_sectors, a->v.cached_sectors); if (ret) -- cgit From 76966dbfa9eb4a723cb899ba07f55448e5b21bbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Feb 2023 16:53:59 -0500 Subject: bcachefs: Improve locking in __bch2_set_nr_journal_buckets() This refactors to not call bch2_journal_block() with c->sb_lock held. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 66bd0a72c774..00e806a64247 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -768,6 +768,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); } bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); @@ -848,6 +849,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!new_fs) spin_unlock(&c->journal.lock); + if (ja->nr != old_nr && !new_fs) + bch2_write_super(c); + if (c) bch2_journal_unblock(&c->journal); @@ -867,6 +871,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } } err: + if (c) + mutex_unlock(&c->sb_lock); + if (ob && !new_fs) for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); @@ -892,7 +899,6 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, { struct journal_device *ja = &ca->journal; struct closure cl; - unsigned current_nr; int ret = 0; /* don't handle reducing nr of buckets yet: */ @@ -901,44 +907,44 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, closure_init_stack(&cl); - while (ja->nr != nr && (ret == 0 || ret == -BCH_ERR_bucket_alloc_blocked)) { + while (ja->nr != nr) { struct disk_reservation disk_res = { 0, 0 }; - closure_sync(&cl); - - mutex_lock(&c->sb_lock); - current_nr = ja->nr; - /* * note: journal buckets aren't really counted as _sectors_ used yet, so * we don't need the disk reservation to avoid the BUG_ON() in buckets.c * when space used goes up without a reservation - but we do need the * reservation to ensure we'll actually be able to allocate: + * + * XXX: that's not right, disk reservations only ensure a + * filesystem-wide allocation will succeed, this is a device + * specific allocation - we can hang here: */ ret = bch2_disk_reservation_get(c, &disk_res, bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } + if (ret) + break; ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); bch2_disk_reservation_put(c, &disk_res); - if (ja->nr != current_nr) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + closure_sync(&cl); + + if (ret && ret != -BCH_ERR_bucket_alloc_blocked) + break; } + if (ret) + bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + return ret; } int bch2_dev_journal_alloc(struct bch_dev *ca) { unsigned nr; - int ret; if 
(dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; @@ -955,15 +961,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - - return ret; + return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } /* startup/shutdown: */ -- cgit From 8ffa11a2c523b49836ca05f8755e22a4607d86a7 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 20 Jan 2023 01:27:30 +1300 Subject: bcachefs: let __bch2_btree_insert() pass in flags This patch is prep work for the following patch. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +++--- fs/bcachefs/btree_update.h | 3 ++- fs/bcachefs/btree_update_leaf.c | 7 ++++--- fs/bcachefs/btree_write_buffer.c | 2 +- fs/bcachefs/buckets.c | 3 ++- fs/bcachefs/fsck.c | 3 ++- fs/bcachefs/tests.c | 18 +++++++++--------- 7 files changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 83c750e049dc..81bd56152fd0 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -673,7 +673,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -693,7 +693,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_exit(&trans); @@ -1933,7 +1933,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace) ?: + ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 96d27e34d5b1..771e4b239c66 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -58,7 +58,8 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); +int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, + enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8169f2b89848..cefe62d28cc0 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1748,7 +1748,8 @@ void bch2_trans_commit_hook(struct btree_trans *trans, } int __bch2_btree_insert(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) + enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_iter iter; int ret; @@ -1756,7 +1757,7 @@ int __bch2_btree_insert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); ret = 
bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1774,7 +1775,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, u64 *journal_seq, int flags) { return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k)); + __bch2_btree_insert(&trans, id, k, 0)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 84c3e6ddb38e..6285532e7790 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -222,7 +222,7 @@ slowpath: BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RECLAIM| JOURNAL_WATERMARK_reserved, - __bch2_btree_insert(trans, i->btree, &i->k)); + __bch2_btree_insert(trans, i->btree, &i->k, 0)); if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) break; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index abbd28bf9a45..32750a65d37a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1224,7 +1224,8 @@ not_found: new->k.p = bkey_start_pos(p.k); new->k.p.offset += *idx - start; bch2_key_resize(&new->k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + 0); } *idx = next_idx; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5887d78190eb..52bb00b52b90 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1797,7 +1797,8 @@ static int check_root_trans(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + __bch2_btree_insert(trans, BTREE_ID_subvolumes, + &root_subvol.k_i, 0)); if (ret) { bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); goto err; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 80fce1c95470..d352821d5614 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -592,7 +592,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) { bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; @@ -621,14 +621,14 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) } ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + __bch2_btree_insert(&trans, 
BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) { bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); break; -- cgit From 3277081522d8620f7410b173881d4b0267ce58f9 Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Sun, 12 Feb 2023 15:51:45 +1300 Subject: bcachefs: Don't run triggers when repairing in __bch2_mark_reflink_p() Triggers currently trip up on the faulty reflink we're trying to repair. Disabling them lets us fix the broken reflink and continue. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 32750a65d37a..2e1751eeaef4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1225,7 +1225,7 @@ not_found: new->k.p.offset += *idx - start; bch2_key_resize(&new->k, next_idx - *idx); ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, - 0); + BTREE_TRIGGER_NORUN); } *idx = next_idx; -- cgit From 806c8a6aa83410cf78dc13fc63bb5df6352670f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Feb 2023 19:24:34 -0500 Subject: bcachefs: Fix failure to read btree roots If we failed to read a btree root - or if we're not using a btree root because of the reconstruct_alloc option - make sure we update the corresponding key/level info for the root on disk. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8a78377bf9c5..178f06424460 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -974,9 +974,15 @@ static int read_btree_roots(struct bch_fs *c) } } - for (i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b) + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (!r->b) { + r->alive = false; + r->level = 0; bch2_btree_root_alloc(c, i); + } + } fsck_err: return ret; } -- cgit From 4bd4035e64c2a90b9c939135e95be0106205b370 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Feb 2023 22:08:39 -0500 Subject: bcachefs: Handle sb buffer resizing in __copy_super() This fixes a rare buffer overrun when one field is growing and another field is shrinking - and is a nice simplification as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 55 ++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ff27ae1839a8..e311b1b4595a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -435,7 +435,7 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; struct bch_sb *dst = dst_handle->sb; @@ -460,42 +460,45 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { + int d; + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); dst_f = bch2_sb_field_get(dst, i); + + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ?
le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { + int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); + if (ret) + return ret; + + dst = dst_handle->sb; + dst_f = bch2_sb_field_get(dst, i); + } + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, src_f ? le32_to_cpu(src_f->u64s) : 0); if (src_f) memcpy(dst_f, src_f, vstruct_bytes(src_f)); } + + return 0; } int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) { - struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(src); - unsigned journal_u64s = journal_buckets - ? le32_to_cpu(journal_buckets->field.u64s) - : 0; int ret; lockdep_assert_held(&c->sb_lock); - ret = bch2_sb_realloc(&c->disk_sb, - le32_to_cpu(src->u64s) - journal_u64s); - if (ret) - return ret; - - __copy_super(&c->disk_sb, src); - - ret = bch2_sb_replicas_to_cpu_replicas(c); - if (ret) - return ret; - - ret = bch2_sb_disk_groups_to_cpu(c); + ret = bch2_sb_realloc(&c->disk_sb, 0) ?: + __copy_super(&c->disk_sb, src) ?: + bch2_sb_replicas_to_cpu_replicas(c) ?: + bch2_sb_disk_groups_to_cpu(c); if (ret) return ret; @@ -505,21 +508,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) { - struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; - struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(dst); - unsigned journal_u64s = journal_buckets - ? le32_to_cpu(journal_buckets->field.u64s) - : 0; - unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; - int ret; - - ret = bch2_sb_realloc(&ca->disk_sb, u64s); - if (ret) - return ret; - - __copy_super(&ca->disk_sb, src); - return 0; + return __copy_super(&ca->disk_sb, c->disk_sb.sb); } /* read superblock: */ -- cgit From 429dd4270fab3c88a8bfcb5b2b8c6d60ec6a1f2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Feb 2023 22:42:31 -0500 Subject: bcachefs: Fix verify_bucket_evacuated() This fixes an incorrectly handled transaction restart. 
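For context, the fix moves restart handling inside the helper: the iterator peek is retried with lockrestart_do() and the function becomes void, so a transaction restart is never propagated to the caller. A rough sketch of the retry this relies on (an illustration of the usual bcachefs convention, not the exact macro definition):

	ret = lockrestart_do(trans,
		bkey_err(k = bch2_btree_iter_peek_slot(&iter)));

	/* roughly equivalent to: */
	do {
		bch2_trans_begin(trans);
		k   = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));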
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c2226353c775..67f861eb597a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -583,7 +583,7 @@ int bch2_move_data(struct bch_fs *c, return ret; } -static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +static noinline void verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -596,8 +596,8 @@ static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); again: - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret && k.k->type == KEY_TYPE_alloc_v4) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); @@ -614,7 +614,7 @@ again: } bch2_trans_iter_exit(trans, &iter); - return ret; + return; failed_to_evacuate: bch2_trans_iter_exit(trans, &iter); @@ -650,7 +650,6 @@ failed_to_evacuate: bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - return 0; } int __bch2_evacuate_bucket(struct moving_context *ctxt, @@ -799,7 +798,7 @@ next: move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); if (!ctxt->write_error) - lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); + verify_bucket_evacuated(&trans, bucket, gen); } err: bch2_trans_exit(&trans); -- cgit From 3ea4219d9894130008d723fb9e9c24290d4a42b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Feb 2023 23:15:53 -0500 Subject: bcachefs: New backtrace utility code Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/debug.c | 2 +- fs/bcachefs/util.c | 44 +++++++++++++++++++++++++++++++++++--------- fs/bcachefs/util.h | 8 +++++++- 4 files changed, 44 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index cf138cd9d431..49c7e94573c9 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -191,7 +191,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_backtrace(&buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index fcefd55a5322..8f43581f3972 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -527,7 +527,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e6672b67ae32..12f4107662fc 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -266,22 +266,48 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) { - unsigned long 
entries[32]; - unsigned i, nr_entries; + unsigned nr_entries = 0; + int ret = 0; + + stack->nr = 0; + ret = darray_make_room(stack, 32); + if (ret) + return ret; if (!down_read_trylock(&task->signal->exec_update_lock)) - return 0; + return -1; + + do { + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + } while (nr_entries == stack->size && + !(ret = darray_make_room(stack, stack->size * 2))); + + stack->nr = nr_entries; + up_read(&task->signal->exec_update_lock); - nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); - for (i = 0; i < nr_entries; i++) { - prt_printf(out, "[<0>] %pB", (void *)entries[i]); + return ret; +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) +{ + unsigned long *i; + + darray_for_each(*stack, i) { + prt_printf(out, "[<0>] %pB", (void *) *i); prt_newline(out); } +} - up_read(&task->signal->exec_update_lock); - return 0; +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +{ + bch_stacktrace stack = { 0 }; + int ret = bch2_save_backtrace(&stack, task); + + bch2_prt_backtrace(out, &stack); + darray_exit(&stack); + return ret; } /* time stats: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 67b0d3de24cc..4188f380f54f 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -20,6 +20,8 @@ #include "mean_and_variance.h" +#include "darray.h" + struct closure; #ifdef CONFIG_BCACHEFS_DEBUG @@ -361,7 +363,11 @@ u64 bch2_read_flag_list(char *, const char * const[]); void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); void bch2_print_string_as_lines(const char *prefix, const char *lines); -int bch2_prt_backtrace(struct printbuf *, struct task_struct *); + +typedef DARRAY(unsigned long) bch_stacktrace; +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -- cgit From 70f0b0fd7e9b85a14fdb8b3f229572b7439d8915 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Feb 2023 18:21:40 -0500 Subject: bcachefs: Fix verify_update_old_key() This fixes a very-rare race in our assertion, with needs_whiteout being modified in the btree key. 
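The assertion compares our cached copy of the old key against the live key with memcmp(); since needs_whiteout can be modified concurrently in the btree key, the old code copied the live needs_whiteout into the cached key first, which itself raced. The fix (mirrored in the hunk below) compares against a stack copy whose racy field is normalized instead, so a concurrent flip cannot trip the BUG_ON:

	struct bkey u = *k.k;	/* local copy; the declaration sits outside the hunk below */

	u.needs_whiteout = i->old_k.needs_whiteout;
	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));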
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cefe62d28cc0..3773439d031a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -56,9 +56,10 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert k = bkey_i_to_s_c(j_k); } - i->old_k.needs_whiteout = k.k->needs_whiteout; + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; - BUG_ON(memcmp(&i->old_k, k.k, sizeof(struct bkey))); + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); BUG_ON(i->old_v != k.v); #endif } -- cgit From 0cc1bc84d69138531d8a5e163caa57dfacd58cf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Feb 2023 18:29:16 -0500 Subject: six locks: Simplify six_lock_counts() Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 5d003e41ae43..e1e9df0368b6 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -830,19 +830,12 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) { struct six_lock_count ret; - ret.n[SIX_LOCK_read] = 0; + ret.n[SIX_LOCK_read] = !lock->readers + ? lock->state.read_lock + : pcpu_read_count(lock); ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; ret.n[SIX_LOCK_write] = lock->state.seq & 1; - if (!lock->readers) - ret.n[SIX_LOCK_read] += lock->state.read_lock; - else { - int cpu; - - for_each_possible_cpu(cpu) - ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu); - } - return ret; } EXPORT_SYMBOL_GPL(six_lock_counts); -- cgit From 564fbd9dd6329abde8aca143da26ffbada13ff7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 15:36:46 -0500 Subject: bcachefs: Fix a 64 bit divide This fixes a build failure on 32 bit Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c269fc73a41d..e001f4191671 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -738,7 +738,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return (mem_bytes >> 1) / btree_bytes(c); + return div_u64(mem_bytes >> 1, btree_bytes(c)); } int bch2_get_btree_in_memory_pos(struct btree_trans *trans, -- cgit From 2798143aa8eb796be19775dcb5ae3927bf983730 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Feb 2023 23:09:27 -0500 Subject: bcachefs: bch2_btree_insert_nonextent() This adds a new helper to delete some redundant code in bch2_trans_update_extent(). 
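For reference, the call sites change from an open-coded iterator sequence to a single call; the before/after shape (taken from the hunks below) is:

	/* before, open-coded at each call site: */
	bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
			     BTREE_ITER_NOT_EXTENTS|
			     BTREE_ITER_ALL_SNAPSHOTS|
			     BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&update_iter) ?:
	      bch2_trans_update(trans, &update_iter, update,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
	bch2_trans_iter_exit(trans, &update_iter);

	/* after: */
	ret = bch2_btree_insert_nonextent(trans, btree_id, update,
					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);

Note that the helper itself initializes its iterator with BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT only.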
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 3 +++ fs/bcachefs/btree_update_leaf.c | 54 ++++++++++++++++++----------------------- fs/bcachefs/data_update.c | 10 ++------ 3 files changed, 29 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 771e4b239c66..ee1d15931022 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -58,6 +58,9 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, + struct bkey_i *, enum btree_update_flags); + int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3773439d031a..0fc98b43a073 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1312,7 +1312,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, struct bkey_i *insert, enum btree_update_flags flags) { - struct btree_iter iter, update_iter; + struct btree_iter iter; struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; struct bkey_s_c k; @@ -1360,16 +1360,8 @@ int bch2_trans_update_extent(struct btree_trans *trans, bch2_cut_back(start, update); - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); - + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); if (ret) goto err; } @@ -1383,15 +1375,8 @@ int bch2_trans_update_extent(struct btree_trans *trans, bch2_cut_front(start, update); bch2_cut_back(insert->k.p, update); - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); if (ret) goto err; } @@ -1409,15 +1394,9 @@ int bch2_trans_update_extent(struct btree_trans *trans, update->k.type = KEY_TYPE_whiteout; } - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); if (ret) goto err; } @@ -1748,8 +1727,23 @@ void bch2_trans_commit_hook(struct btree_trans *trans, trans->hooks = h; } -int __bch2_btree_insert(struct btree_trans *trans, - enum btree_id id, +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + 
BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k, enum btree_update_flags flags) { struct btree_iter iter; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 04d56e960872..09a5fff339fe 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -21,7 +21,7 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter iter, update_iter; + struct btree_iter iter; struct bkey_s_c k; snapshot_id_list s; int ret; @@ -65,14 +65,8 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, update->k.p = new_pos; update->k.p.snapshot = k.k->p.snapshot; - bch2_trans_iter_init(trans, &update_iter, id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, + ret = bch2_btree_insert_nonextent(trans, id, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &update_iter); if (ret) break; -- cgit From 9c5d38bba034253dace198e5801dc7bad6fb8c7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Feb 2023 21:02:14 -0500 Subject: bcachefs: Don't print out duplicate fsck errors Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 30 ++++++++++++++++++++---------- fs/bcachefs/error.h | 3 ++- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c2882c599896..1dae649ff0e2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -98,7 +98,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) INIT_LIST_HEAD(&s->list); s->fmt = fmt; - s->buf = PRINTBUF; list_add(&s->list, &c->fsck_errors); return s; } @@ -111,9 +110,23 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + mutex_lock(&c->fsck_error_lock); s = fsck_err_get(c, fmt); if (s) { + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; + mutex_unlock(&c->fsck_error_lock); + printbuf_exit(&buf); + return ret; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && s->nr >= FSCK_ERR_RATELIMIT_NR) { @@ -123,8 +136,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) print = false; } - printbuf_reset(&s->buf); - out = &s->buf; s->nr++; } @@ -133,10 +144,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) prt_printf(out, bch2_log_msg(c, "")); #endif - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { @@ -190,6 +197,9 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); + if (s) + s->ret = ret; + mutex_unlock(&c->fsck_error_lock); printbuf_exit(&buf); @@ -214,11 +224,11 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_lock(&c->fsck_error_lock); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->ratelimited) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf); + if (s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); list_del(&s->list); - printbuf_exit(&s->buf); + kfree(s->last_msg); kfree(s); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 9991879dfbff..91c7e4ee8f72 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -103,7 +103,8 @@ struct fsck_err_state { const char *fmt; u64 nr; bool ratelimited; - struct printbuf buf; + int ret; + char *last_msg; }; #define FSCK_CAN_FIX (1 << 0) -- cgit From c58029ec807594856ae69dd7864eb7b3afb92f4a Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Sun, 8 May 2022 15:03:28 +1200 Subject: bcachefs: Reimplement repair for overlapping extents Repair now checks if overlapping extents exist in the same snapshot and calls update_trans_update_extent to do the repair work. Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 144 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 121 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 52bb00b52b90..e232f331ae9a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -605,6 +605,20 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, : bch2_snapshot_is_ancestor(c, src, dst); } +static int ref_visible2(struct bch_fs *c, + u32 src, struct snapshots_seen *src_seen, + u32 dst, struct snapshots_seen *dst_seen) +{ + src = bch2_snapshot_equiv(c, src); + dst = bch2_snapshot_equiv(c, dst); + + if (dst > src) { + swap(dst, src); + swap(dst_seen, src_seen); + } + return key_visible_in_snapshot(c, src_seen, dst, src); +} + #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ (_i)->snapshot <= (_snapshot); _i++) \ @@ -1158,10 +1172,102 @@ fsck_err: return ret; } +struct extent_end { + u32 snapshot; + u64 offset; + struct snapshots_seen seen; +}; + +typedef DARRAY(struct extent_end) extent_ends; + +static int check_overlapping_extents(struct btree_trans *trans, + struct snapshots_seen *seen, + extent_ends *extent_ends, + struct bkey_s_c k, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; + struct printbuf buf = PRINTBUF; + int ret = 0; + + darray_for_each(*extent_ends, i) { + /* duplicate, due to transaction restart: */ + if (i->offset == k.k->p.offset && + i->snapshot == k.k->p.snapshot) + continue; + + if (!ref_visible2(c, + k.k->p.snapshot, seen, + i->snapshot, &i->seen)) + continue; + + if (fsck_err_on(i->offset > bkey_start_offset(k.k), c, + "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s", + i->snapshot, + i->offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + bkey_reassemble(update, k); + ret = bch2_trans_update_extent(trans, iter, update, 0); + if (!ret) + goto err; + } + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int extent_ends_at(extent_ends *extent_ends, + struct 
snapshots_seen *seen, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct extent_end) { + .snapshot = k.k->p.snapshot, + .offset = k.k->p.offset, + .seen = *seen, + }; + + n.seen.ids.data = kmemdup(seen->ids.data, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) + return -ENOMEM; + + darray_for_each(*extent_ends, i) { + if (i->snapshot == k.k->p.snapshot) { + snapshots_seen_exit(&i->seen); + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; + } + + return darray_insert_item(extent_ends, i - extent_ends->data, n); +} + +static void extent_ends_reset(extent_ends *extent_ends) +{ + struct extent_end *i; + + darray_for_each(*extent_ends, i) + snapshots_seen_exit(&i->seen); + + extent_ends->nr = 0; +} + static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, - struct snapshots_seen *s) + struct snapshots_seen *s, + extent_ends *extent_ends) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; @@ -1189,24 +1295,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ret = check_i_sectors(trans, inode); if (ret) goto err; + + extent_ends_reset(extent_ends); } BUG_ON(!iter->path->should_be_locked); -#if 0 - if (bkey_gt(prev.k->k.p, bkey_start_pos(k.k))) { - char buf1[200]; - char buf2[200]; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); + ret = check_overlapping_extents(trans, s, extent_ends, k, iter); + if (ret) + goto err; + + ret = extent_ends_at(extent_ends, s, k); + if (ret) + goto err; - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = fix_overlapping_extent(trans, k, prev.k->k.p) - ?: -BCH_ERR_transaction_restart_nested; - goto out; - } - } -#endif ret = __walk_inode(trans, inode, equiv); if (ret < 0) goto err; @@ -1304,13 +1406,9 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + extent_ends extent_ends = { 0 }; int ret = 0; -#if 0 - struct bkey_buf prev; - bch2_bkey_buf_init(&prev); - prev.k->k = KEY(0, 0, 0); -#endif snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1321,10 +1419,10 @@ static int check_extents(struct bch_fs *c) BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, k, &w, &s)); -#if 0 - bch2_bkey_buf_exit(&prev, c); -#endif + check_extent(&trans, &iter, k, &w, &s, &extent_ends)); + + extent_ends_reset(&extent_ends); + darray_exit(&extent_ends); inode_walker_exit(&w); bch2_trans_exit(&trans); snapshots_seen_exit(&s); -- cgit From 2ffe3ad62dafac036c523204c6e2e2f39b23cb6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Feb 2023 23:36:41 -0500 Subject: bcachefs: Snapshot whiteout fix When fully overwriting an existing extent, we may need to generate a whiteout - not just if the extent being overwritten was in an older snapshot, but also if it was overwriting an extent in an older snapshot. 
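A concrete example (snapshot numbers are hypothetical): snapshot 2 is a child of snapshot 1, an extent exists at a given position in snapshot 1, and an extent at the same position in snapshot 2 fully covers it. When that snapshot-2 extent is itself fully overwritten, the deletion emitted for it must be a whiteout; a plain deletion would make the snapshot-1 extent visible in snapshot 2 again. Hence the updated condition in bch2_trans_update_extent() (from the hunk below), which also consults need_whiteout_for_snapshot() when the two keys are in the same snapshot:

	if (insert->k.p.snapshot != k.k->p.snapshot ||
	    (btree_type_has_snapshots(btree_id) &&
	     need_whiteout_for_snapshot(trans, btree_id, update->k.p)))
		update->k.type = KEY_TYPE_whiteout;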
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 75 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0fc98b43a073..544b90b15260 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1307,6 +1307,39 @@ static noinline int extent_back_merge(struct btree_trans *trans, return 0; } +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, @@ -1388,12 +1421,12 @@ int bch2_trans_update_extent(struct btree_trans *trans, bkey_init(&update->k); update->k.p = k.k->p; + update->k.p.snapshot = insert->k.p.snapshot; - if (insert->k.p.snapshot != k.k->p.snapshot) { - update->k.p.snapshot = insert->k.p.snapshot; + if (insert->k.p.snapshot != k.k->p.snapshot || + (btree_type_has_snapshots(btree_id) && + need_whiteout_for_snapshot(trans, btree_id, update->k.p))) update->k.type = KEY_TYPE_whiteout; - } - ret = bch2_btree_insert_nonextent(trans, btree_id, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); @@ -1448,40 +1481,6 @@ err: return ret; } -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = pos.snapshot; - int ret; - - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - static int __must_check bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, struct bkey_i *k, enum btree_update_flags flags, -- cgit From 19d6521964ed0439a7a03776d8cf0451afb63c1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 00:39:12 -0500 Subject: bcachefs: bch2_mark_snapshot() now called like other triggers This fixes a bug where bch2_mark_snapshot() wasn't called for existing snapshot nodes being updated when child nodes were added. This led to the data update path thinking the key being updated was for a snapshot that didn't have children, causing it to fail to insert whiteouts when splitting existing extents. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 3 +-- fs/bcachefs/subvolume.h | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 1805c8542d65..ba281104eb30 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -512,8 +512,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.pad = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: - bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); if (ret) goto err; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index b6740eab78d3..65f108a83835 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -8,15 +8,15 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ }) -int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); - static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) { return genradix_ptr(&c->snapshots, U32_MAX - id); -- cgit From f2a53270c7d6bceae5441ed180516d3b76799680 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Feb 2023 23:42:09 -0500 Subject: bcachefs: Fix insert_snapshot_whiteouts() - We were failing to set the key type on the whiteouts it was creating, oops. - Also, we need to create whiteouts when generating front splits, not just back splits. 
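To illustrate the second point: when a data move rewrites only the middle of an existing extent, the index update produces both a front split and a back split of the old extent, and descendant snapshots may need whiteouts at both new boundaries. __bch2_data_update_index_update() therefore now calls the helper for each boundary that actually moved (simplified from the hunk below, error handling omitted):

	if (!bkey_eq(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
		ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p,
						bkey_start_pos(&insert->k));

	if (!bkey_eq(insert->k.p, k.k->p))
		ret = insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, insert->k.p);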
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 66 +++++++++++++++++++++++++++++++---------------- fs/bcachefs/subvolume.h | 7 +++++ 2 files changed, 51 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 09a5fff339fe..c3adc7b32e19 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -21,9 +21,10 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, iter2; + struct bkey_s_c k, k2; snapshot_id_list s; + struct bkey_i *update; int ret; if (!btree_type_has_snapshots(id)) @@ -31,10 +32,7 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, darray_init(&s); - if (bkey_eq(old_pos, new_pos)) - return 0; - - if (!snapshot_t(c, old_pos.snapshot)->children[0]) + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) return 0; bch2_trans_iter_init(trans, &iter, id, old_pos, @@ -46,27 +44,39 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, if (ret) break; + if (!k.k) + break; + if (!bkey_eq(old_pos, k.k->p)) break; - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { - struct bkey_i *update; + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot) && + !snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) { + struct bpos whiteout_pos = new_pos; - if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) - continue; + whiteout_pos.snapshot = k.k->p.snapshot; - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + bch2_trans_iter_init(trans, &iter2, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + k2 = bch2_btree_iter_peek_slot(&iter2); + ret = bkey_err(k2); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; + if (!ret && k2.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; - bkey_init(&update->k); - update->k.p = new_pos; - update->k.p.snapshot = k.k->p.snapshot; + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &iter2, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &iter2); - ret = bch2_btree_insert_nonextent(trans, id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); if (ret) break; @@ -222,9 +232,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, next_pos = insert->k.p; - ret = insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_trans_update(trans, &iter, insert, + if (!bkey_eq(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, + bkey_start_pos(&insert->k)); + if (ret) + goto err; + } + + if (!bkey_eq(insert->k.p, k.k->p)) { + ret = insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 65f108a83835..7c488c3d78e0 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -68,6 +68,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances return id == ancestor; } +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + struct snapshot_t *t = snapshot_t(c, id); 
+ + return (t->children[0]|t->children[1]) != 0; +} + static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { u32 *i; -- cgit From f2c6e4b3621778103805fe4d5f384db3fcd96159 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 16:06:51 -0500 Subject: bcachefs: Fix integer overflow warnings on 32 bit Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 12f4107662fc..e0c93da2523f 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -428,8 +428,8 @@ static const struct time_unit { { "us", NSEC_PER_USEC }, { "ms", NSEC_PER_MSEC }, { "s", NSEC_PER_SEC }, - { "m", NSEC_PER_SEC * 60}, - { "h", NSEC_PER_SEC * 3600}, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, { "eon", U64_MAX }, }; -- cgit From 1b30ed5fd87828b5e29647510eefb18a363e4d19 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Feb 2023 18:51:42 -0500 Subject: bcachefs: Use btree write buffer for LRU btree Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 19 ++++++++-------- fs/bcachefs/lru.c | 49 ++++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 81bd56152fd0..af3e55fdd54a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -9,6 +9,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" #include "clock.h" @@ -1775,15 +1776,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) { - prt_str(&buf, "alloc key does not point back to lru entry when invalidating bucket:"); - goto err; - } + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) + goto out; - if (a->v.data_type != BCH_DATA_cached) { - prt_str(&buf, "lru entry points to non cached bucket:"); - goto err; - } + BUG_ON(a->v.data_type != BCH_DATA_cached); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); @@ -1845,6 +1842,10 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + for_each_member_device(ca, c, i) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); @@ -1860,7 +1861,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) break; } } - +err: bch2_trans_exit(&trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 07fb41ca8c6b..c121a7cc3acd 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "btree_iter.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -49,7 +50,6 @@ void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, unsigned key_type) { - struct btree_iter iter; struct bkey_i *k; int ret = 0; @@ -69,13 +69,7 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, EBUG_ON(lru_pos_time(k->k.p) != time); 
EBUG_ON(k->k.p.offset != dev_bucket); - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - k->k.p, BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update_buffered(trans, BTREE_ID_lru, k); } int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) @@ -101,7 +95,8 @@ int bch2_lru_change(struct btree_trans *trans, static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, - struct bkey_s_c lru_k) + struct bkey_s_c lru_k, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -126,18 +121,29 @@ static int bch2_check_lru_key(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &a_convert); - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set || - a->data_type != BCH_DATA_cached || - a->io_time[READ] != lru_pos_time(lru_k.k->p), c, - "incorrect lru entry (time %llu) %s\n" - " for %s", - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - ret = bch2_btree_delete_at(trans, lru_iter, 0); - if (ret) - goto err; + if (lru_k.k->type != KEY_TYPE_set || + a->data_type != BCH_DATA_cached || + a->io_time[READ] != lru_pos_time(lru_k.k->p)) {} + if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { + *last_flushed_pos = lru_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set || + a->data_type != BCH_DATA_cached || + a->io_time[READ] != lru_pos_time(lru_k.k->p), c, + "incorrect lru entry (time %llu) %s\n" + " for %s", + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + ret = bch2_btree_delete_at(trans, lru_iter, 0); + if (ret) + goto err; } +out: err: fsck_err: bch2_trans_iter_exit(trans, &iter); @@ -151,6 +157,7 @@ int bch2_check_lrus(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bpos last_flushed_pos = POS_MIN; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -158,7 +165,7 @@ int bch2_check_lrus(struct bch_fs *c) ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k)); + bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)); bch2_trans_exit(&trans); return ret; -- cgit From 80c33085783656617d0d07e1bc9fba70a592ce5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Dec 2022 10:24:19 -0500 Subject: bcachefs: Fragmentation LRU Now that we have much more efficient updates to the LRU btree, this patch adds a new LRU that indexes buckets by fragmentation. This means copygc no longer has to scan every bucket to find buckets that need to be evacuated. Changes: - A new field in bch_alloc_v4, fragmentation_lru - this corresponds to the bucket's position in the fragmentation LRU. We add a new field for this instead of calculating it as needed because we may make the fragmentation LRU optional; this field indicates whether a bucket is on the fragmentation LRU. Also, zoned devices will introduce variable bucket sizes; explicitly recording the LRU position will be safer for them. - A new copygc path for using the fragmentation LRU instead of scanning every bucket and building up an in-memory heap. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 20 ++++- fs/bcachefs/alloc_background.h | 15 +++- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/buckets_types.h | 11 --- fs/bcachefs/lru.c | 38 ++++++--- fs/bcachefs/lru.h | 21 +++++ fs/bcachefs/move.c | 51 ++++++------ fs/bcachefs/move.h | 3 +- fs/bcachefs/movinggc.c | 171 +++++++++++++++++------------------------ fs/bcachefs/recovery.c | 3 + fs/bcachefs/super.c | 1 - fs/bcachefs/trace.h | 10 ++- 13 files changed, 189 insertions(+), 160 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index af3e55fdd54a..aefe72d34c5b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -415,6 +415,8 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_newline(out); prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); prt_newline(out); + prt_printf(out, "fragmentation %llu", a->fragmentation_lru); + prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); prt_newline(out); @@ -910,8 +912,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(*old_a); - new_lru = alloc_lru_idx(*new_a); + old_lru = alloc_lru_idx_read(*old_a); + new_lru = alloc_lru_idx_read(*new_a); if (old_lru != new_lru) { ret = bch2_lru_change(trans, new->k.p.inode, @@ -921,6 +923,18 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, return ret; } + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new->k.p.inode)); + + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new->k.p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); if (ret) @@ -1777,7 +1791,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index b3c2f1e0deb6..96ac8f396d46 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -64,11 +64,24 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, a.stripe, a, data_type); } -static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) +static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? 
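To make the index concrete: a bucket's position on the fragmentation LRU is its dirty-sector fraction scaled to 31 bits (see alloc_lru_idx_fragmentation() below). With example numbers, for a 1024-sector bucket:

	dirty_sectors =  256  ->  256 * 2^31 / 1024 = 2^29
	dirty_sectors =  512  ->  512 * 2^31 / 1024 = 2^30
	dirty_sectors = 1024  ->  0 (bucket is full, not on the fragmentation LRU)

Buckets holding neither btree nor user data also get 0. Copygc walks this LRU in ascending order, so the movable buckets with the least dirty data are evacuated first, without scanning every bucket or building a heap.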
a.io_time[READ] : 0; } +static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, + struct bch_dev *ca) +{ + if (a.data_type != BCH_DATA_btree && + a.data_type != BCH_DATA_user) + return 0; + + if (a.dirty_sectors >= ca->mi.bucket_size) + return 0; + + return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); +} + static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) { return ((u64) alloc_gc_gen(a) >> 4) << 56; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 84b30adf56c9..5dc4b0c133ad 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -927,7 +927,6 @@ struct bch_fs { /* COPYGC */ struct task_struct *copygc_thread; - copygc_heap copygc_heap; struct write_point copygc_write_point; s64 copygc_wait; bool copygc_running; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 99f9fbd1401f..9524ff02f2d7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -992,6 +992,7 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + __u64 fragmentation_lru; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 @@ -1563,7 +1564,8 @@ struct bch_sb_field_journal_seq_blacklist { x(inode_v3, 23) \ x(unwritten_extents, 24) \ x(bucket_gens, 25) \ - x(lru_v2, 26) + x(lru_v2, 26) \ + x(fragmentation_lru, 27) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 1dbba7d906dd..2a9dab9006ef 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -89,15 +89,4 @@ struct disk_reservation { unsigned nr_replicas; }; -struct copygc_heap_entry { - u8 dev; - u8 gen; - u8 replicas; - u32 fragmentation; - u32 sectors; - u64 bucket; -}; - -typedef HEAP(struct copygc_heap_entry) copygc_heap; - #endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c121a7cc3acd..e913b90f37b7 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -93,6 +93,13 @@ int bch2_lru_change(struct btree_trans *trans, bch2_lru_set(trans, lru_id, dev_bucket, new_time); } +static const char * const bch2_lru_types[] = { +#define x(n) #n, + BCH_LRU_TYPES() +#undef x + NULL +}; + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -105,7 +112,9 @@ static int bch2_check_lru_key(struct btree_trans *trans, const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; + enum bch_lru_type type = lru_type(lru_k); struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); + u64 idx; int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, @@ -121,9 +130,17 @@ static int bch2_check_lru_key(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &a_convert); + switch (type) { + case BCH_LRU_read: + idx = alloc_lru_idx_read(*a); + break; + case BCH_LRU_fragmentation: + idx = a->fragmentation_lru; + break; + } + if (lru_k.k->type != KEY_TYPE_set || - a->data_type != BCH_DATA_cached || - a->io_time[READ] != lru_pos_time(lru_k.k->p)) {} + lru_pos_time(lru_k.k->p) != idx) { if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { *last_flushed_pos = lru_k.k->p; ret = bch2_btree_write_buffer_flush_sync(trans) ?: @@ -131,17 +148,14 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set || - a->data_type != BCH_DATA_cached || - a->io_time[READ] != lru_pos_time(lru_k.k->p), c, - "incorrect lru entry (time %llu) %s\n" - " 
for %s", - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) ret = bch2_btree_delete_at(trans, lru_iter, 0); - if (ret) - goto err; } out: err: diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index b8d9848cdb1a..78a6076999ed 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -22,6 +22,27 @@ static inline u64 lru_pos_time(struct bpos pos) return pos.inode & ~(~0ULL << LRU_TIME_BITS); } +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +static inline enum bch_lru_type lru_type(struct bkey_s_c l) +{ + u16 lru_id = l.k->p.inode >> 48; + + if (lru_id == BCH_LRU_FRAGMENTATION_START) + return BCH_LRU_fragmentation; + return BCH_LRU_read; +} + int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 67f861eb597a..c964643e7ebf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -652,13 +652,13 @@ failed_to_evacuate: printbuf_exit(&buf); } -int __bch2_evacuate_bucket(struct moving_context *ctxt, +int __bch2_evacuate_bucket(struct btree_trans *trans, + struct moving_context *ctxt, struct bpos bucket, int gen, struct data_update_opts _data_opts) { struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_trans trans; struct btree_iter iter; struct bkey_buf sk; struct bch_backpointer bp; @@ -667,17 +667,17 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c k; struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; + u64 fragmentation; u64 bp_offset = 0, cur_inum = U64_MAX; int ret = 0; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); - ret = lockrestart_do(&trans, + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) { bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret)); @@ -687,17 +687,18 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, a = bch2_alloc_to_v4(k, &a_convert); dirty_sectors = a->dirty_sectors; bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) { bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret)); goto err; } - while (!(ret = move_ratelimit(&trans, ctxt))) { - bch2_trans_begin(&trans); + while (!(ret = move_ratelimit(trans, ctxt))) { + bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(&trans, bucket, gen, + ret = bch2_get_next_backpointer(trans, bucket, gen, &bp_offset, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -712,7 +713,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c k; unsigned i = 0; - k = 
bch2_backpointer_get_key(&trans, &iter, + k = bch2_backpointer_get_key(trans, &iter, bucket, bp_offset, bp); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -725,9 +726,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); if (ret) { - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); continue; } @@ -741,15 +742,15 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, i++; } - ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, + ret = bch2_move_extent(trans, &iter, ctxt, io_opts, bp.btree_id, k, data_opts); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, &trans); + bch2_move_ctxt_wait_for_io(ctxt, trans); continue; } if (ret) @@ -761,7 +762,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, } else { struct btree *b; - b = bch2_backpointer_get_node(&trans, &iter, + b = bch2_backpointer_get_node(trans, &iter, bucket, bp_offset, bp); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) @@ -773,8 +774,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; - ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); - bch2_trans_iter_exit(&trans, &iter); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -791,17 +792,16 @@ next: bp_offset++; } - trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, ret); + trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); if (!ctxt->write_error) - verify_bucket_evacuated(&trans, bucket, gen); + verify_bucket_evacuated(trans, bucket, gen); } err: - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -814,12 +814,15 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { + struct btree_trans trans; struct moving_context ctxt; int ret; + bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index aef613802935..c5a7c0add1d6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -66,7 +66,8 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct moving_context *, +int __bch2_evacuate_bucket(struct btree_trans *, + struct moving_context *, struct bpos, int, struct data_update_opts); int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b420b79edb36..74e57f6ea148 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -10,6 +10,7 @@ #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" +#include 
"btree_write_buffer.h" #include "buckets.h" #include "clock.h" #include "disk_groups.h" @@ -19,6 +20,7 @@ #include "eytzinger.h" #include "io.h" #include "keylist.h" +#include "lru.h" #include "move.h" #include "movinggc.h" #include "super-io.h" @@ -31,138 +33,105 @@ #include #include -static inline int fragmentation_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) +static int bch2_bucket_is_movable(struct btree_trans *trans, + struct bpos bucket, u64 time, u8 *gen) { - return cmp_int(l.fragmentation, r.fragmentation); -} - -static int find_buckets_to_copygc(struct bch_fs *c) -{ - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a; int ret; - bch2_trans_init(&trans, c, 0, 0); + if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) + return 0; - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - h->used = 0; - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); - struct copygc_heap_entry e; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - - a = bch2_alloc_to_v4(k, &a_convert); - - if ((a->data_type != BCH_DATA_btree && - a->data_type != BCH_DATA_user) || - a->dirty_sectors >= ca->mi.bucket_size || - bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) - continue; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; - e = (struct copygc_heap_entry) { - .dev = iter.pos.inode, - .gen = a->gen, - .replicas = 1 + a->stripe_redundancy, - .fragmentation = div_u64((u64) a->dirty_sectors * (1ULL << 31), - ca->mi.bucket_size), - .sectors = a->dirty_sectors, - .bucket = iter.pos.offset, - }; - heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + a = bch2_alloc_to_v4(k, &_a); + *gen = a->gen; + ret = (a->data_type == BCH_DATA_btree || + a->data_type == BCH_DATA_user) && + a->fragmentation_lru && + a->fragmentation_lru <= time; + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, trans->c, k); + pr_debug("%s", buf.buf); + printbuf_exit(&buf); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); return ret; } +static int bch2_copygc_next_bucket(struct btree_trans *trans, + struct bpos *bucket, u8 *gen, struct bpos *pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + bpos_max(*pos, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0)), + lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + 0, k, ({ + *bucket = u64_to_bucket(k.k->p.offset); + + bch2_bucket_is_movable(trans, *bucket, lru_pos_time(k.k->p), gen); + })); + + *pos = iter.pos; + if (ret < 0) + return ret; + return ret ? 
0 : -ENOENT; +} + static int bch2_copygc(struct bch_fs *c) { - copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e; struct bch_move_stats move_stats; - struct bch_dev *ca; - unsigned dev_idx; - size_t heap_size = 0; + struct btree_trans trans; struct moving_context ctxt; struct data_update_opts data_opts = { .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, }; + struct bpos bucket; + struct bpos pos; + u8 gen = 0; + unsigned nr_evacuated; int ret = 0; bch2_move_stats_init(&move_stats, "copygc"); - - for_each_rw_member(ca, c, dev_idx) - heap_size += ca->mi.nbuckets >> 7; - - if (h->size < heap_size) { - free_heap(&c->copygc_heap); - if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { - bch_err(c, "error allocating copygc heap"); - return 0; - } - } - - ret = find_buckets_to_copygc(c); - if (ret) { - bch2_fs_fatal_error(c, "error walking buckets to copygc!"); - return ret; - } - - if (!h->used) { - s64 wait = S64_MAX, dev_wait; - u64 dev_min_wait_fragmented = 0; - u64 dev_min_wait_allowed = 0; - int dev_min_wait = -1; - - for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * - ca->mi.bucket_size) >> 1); - s64 fragmented = usage.d[BCH_DATA_user].fragmented; - - dev_wait = max(0LL, allowed - fragmented); - - if (dev_min_wait < 0 || dev_wait < wait) { - dev_min_wait = dev_idx; - dev_min_wait_fragmented = fragmented; - dev_min_wait_allowed = allowed; - } - } - - bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu", - dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); - return 0; - } - - heap_resort(h, fragmentation_cmp, NULL); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, writepoint_ptr(&c->copygc_write_point), false); + bch2_trans_init(&trans, c, 0, 0); + + ret = bch2_btree_write_buffer_flush(&trans); + BUG_ON(ret); - /* not correct w.r.t. 
device removal */ - while (h->used && !ret) { - BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, - data_opts); + for (nr_evacuated = 0, pos = POS_MIN; + nr_evacuated < 32 && !ret; + nr_evacuated++, pos = bpos_nosnap_successor(pos)) { + ret = bch2_copygc_next_bucket(&trans, &bucket, &gen, &pos) ?: + __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts); + if (bkey_eq(pos, POS_MAX)) + break; } + bch2_trans_exit(&trans); bch2_moving_ctxt_exit(&ctxt); + /* no entries in LRU btree found, or got to end: */ + if (ret == -ENOENT) + ret = 0; + if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 178f06424460..1976d5fa3427 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1105,6 +1105,9 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; + } else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) { + bch_info(c, "version prior to backpointers, upgrade required"); + c->opts.version_upgrade = true; } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 58517f6d128f..f703e41c7560 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -487,7 +487,6 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); - free_heap(&c->copygc_heap); if (c->io_complete_wq) destroy_workqueue(c->io_complete_wq); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 24dd2defe7c7..30b10908ced0 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -723,8 +723,8 @@ TRACE_EVENT(move_data, TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), + u64 fragmentation, int ret), + TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), TP_STRUCT__entry( __field(dev_t, dev ) @@ -732,6 +732,7 @@ TRACE_EVENT(evacuate_bucket, __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) + __field(u64, fragmentation ) __field(int, ret ) ), @@ -741,14 +742,15 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; + __entry->fragmentation = fragmentation; __entry->ret = ret; ), - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", + TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, - __entry->ret) + __entry->fragmentation, __entry->ret) ); TRACE_EVENT(copygc, -- cgit From 84ddb8b98e674c0d052dd56a406efc5275c9508a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 20:33:12 -0500 Subject: bcachefs: Don't invalidate open buckets Like bch2_trans_mark_bucket(), we shouldn't be incrementing a bucket gen while it's still open - erasure coding was hitting this. 
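To show the guard in isolation: the sketch below is a standalone userspace model with invented toy_* names, not the bcachefs code (the actual patch checks bch2_bucket_is_open_safe() and returns early); it only illustrates the rule that invalidation must not touch the generation of a bucket that still has writes in flight.

/* Standalone sketch: never bump a bucket's generation while it is open. */
#include <stdbool.h>
#include <stdio.h>

struct toy_bucket {
    int  gen;    /* incremented when the bucket is invalidated */
    bool open;   /* writes still in flight to this bucket */
};

static int toy_invalidate_one_bucket(struct toy_bucket *b)
{
    if (b->open)
        return 0;   /* skip: invalidating now would race with the writer */

    b->gen++;
    return 1;
}

int main(void)
{
    struct toy_bucket open_b = { .gen = 3, .open = true };
    struct toy_bucket idle_b = { .gen = 3, .open = false };

    printf("open bucket: invalidated=%d gen=%d\n",
           toy_invalidate_one_bucket(&open_b), open_b.gen);
    printf("idle bucket: invalidated=%d gen=%d\n",
           toy_invalidate_one_bucket(&idle_b), idle_b.gen);
    return 0;
}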
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index aefe72d34c5b..472466d16b68 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1785,6 +1785,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto err; } + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) -- cgit From 2c7dd446d91681e90396c82e20c703b93f8daa2f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 20:50:55 -0500 Subject: bcachefs: Erasure coding now uses bch2_bucket_alloc_trans This code predates plumbing btree_trans through the bucket allocation path: switching to it fixes a deadlock due to using multiple btree_trans at the same time, which we never want to do. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 29 ++++++----------------------- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/ec.c | 13 +++++++------ fs/bcachefs/ec.h | 2 +- 4 files changed, 15 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 9e1c236d57b8..2eab63b90664 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -713,7 +713,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, +int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -779,24 +779,6 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, return ret; } -int bch2_bucket_alloc_set(struct bch_fs *c, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - return bch2_trans_do(c, NULL, NULL, 0, - bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, - devs_may_alloc, nr_replicas, - nr_effective, have_cache, reserve, - flags, cl)); -} - /* Allocate from stripes: */ /* @@ -805,7 +787,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static int bucket_alloc_from_stripe(struct bch_fs *c, +static int bucket_alloc_from_stripe(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, @@ -817,6 +799,7 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; @@ -832,11 +815,11 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, wp == &c->copygc_write_point, cl); if (IS_ERR(h)) - return -PTR_ERR(h); + return PTR_ERR(h); if (!h) return 0; @@ -942,7 +925,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, } if (!ec_open_bucket(c, ptrs)) { - ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); diff 
--git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 26e986f2385b..ba7a87afda0e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -150,7 +150,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 return ret; } -int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 879df8bd1f51..ca3e4a18e28a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1294,9 +1294,10 @@ found: return h; } -static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, +static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, struct closure *cl) { + struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; @@ -1319,7 +1320,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, buckets.nr = 0; if (nr_have_parity < h->s->nr_parity) { - ret = bch2_bucket_alloc_set(c, &buckets, + ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, h->s->nr_parity, @@ -1348,7 +1349,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, buckets.nr = 0; if (nr_have_data < h->s->nr_data) { - ret = bch2_bucket_alloc_set(c, &buckets, + ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, h->s->nr_data, @@ -1464,13 +1465,14 @@ static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, h->s->nr_parity, 0); } -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, bool copygc, struct closure *cl) { + struct bch_fs *c = trans->c; struct ec_stripe_head *h; int ret; bool needs_stripe_new; @@ -1509,7 +1511,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, } if (!h->s->allocated) { - ret = new_stripe_alloc_buckets(c, h, cl); + ret = new_stripe_alloc_buckets(trans, h, cl); if (ret) goto err; @@ -1517,7 +1519,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, } return h; - err: bch2_ec_stripe_head_put(c, h); return ERR_PTR(ret); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index d47da7d86fe7..37d42e2a4505 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -200,7 +200,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, bool, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -- cgit From 6623c0fcdffe22db466ec38c5f9f4b3a44c33003 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 20:51:52 -0500 Subject: bcachefs: Add an assertion for using multiple btree_trans A thread should never be using more than one btree_trans - doing so is an invitation for deadlocks. 
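The invariant can be modelled outside the kernel: the sketch below is a userspace illustration with invented toy_* names and a simple thread-local counter, whereas the actual assertion walks c->btree_trans_list and only fires when the other transaction in the same thread still holds locks.

/* Userspace model: at most one transaction context per thread. */
#include <assert.h>
#include <stdio.h>

static _Thread_local int trans_in_use;  /* live transactions in this thread */

struct toy_trans { int nr_updates; };

static void toy_trans_init(struct toy_trans *trans)
{
    /* Starting a second transaction in the same thread trips the assert:
     * two transactions in one thread is an invitation for deadlocks. */
    assert(trans_in_use == 0);
    trans_in_use++;
    trans->nr_updates = 0;
}

static void toy_trans_exit(struct toy_trans *trans)
{
    (void) trans;
    trans_in_use--;
}

int main(void)
{
    struct toy_trans a;

    toy_trans_init(&a);
    /* toy_trans_init(&b) here would abort the program. */
    toy_trans_exit(&a);
    printf("single transaction per thread: ok\n");
    return 0;
}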
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4ac1364acc8b..5ab22c4c2b2e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2955,6 +2955,15 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ mutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { + /* + * We'd much prefer to be stricter here and completely + * disallow multiple btree_trans in the same thread - + * but the data move path calls bch2_write when we + * already have a btree_trans initialized. + */ + BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + bch2_trans_locked(pos)); + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; -- cgit From af0ee5bcf3012be753ab15ce9c27971e5b34bd74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 21:04:46 -0500 Subject: bcachefs: Don't block on ec_stripe_head_lock with btree locks held Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ca3e4a18e28a..236e1bef5f02 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1267,18 +1267,30 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) mutex_unlock(&h->lock); } -struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, bool copygc) { + struct bch_fs *c = trans->c; struct ec_stripe_head *h; + int ret; if (!redundancy) return NULL; - mutex_lock(&c->ec_stripe_head_lock); + if (!mutex_trylock(&c->ec_stripe_head_lock)) { + bch2_trans_unlock(trans); + mutex_lock(&c->ec_stripe_head_lock); + + ret = bch2_trans_relock(trans); + if (ret) { + mutex_unlock(&c->ec_stripe_head_lock); + return ERR_PTR(ret); + } + } + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && @@ -1477,11 +1489,11 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; bool needs_stripe_new; - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); - if (!h) { + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, copygc); + if (!h) bch_err(c, "no stripe head"); - return NULL; - } + if (IS_ERR_OR_NULL(h)) + return h; needs_stripe_new = !h->s; if (needs_stripe_new) { -- cgit From 73d86dfd888541fd85f7e4d03c898f2ad8486196 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Feb 2023 22:43:47 -0500 Subject: bcachefs: Fix erasure coding locking This adds a new helper, bch2_trans_mutex_lock(), for locking a mutex - dropping and retaking btree locks as needed. 
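The lock-ordering problem and its fix can be sketched in plain pthreads. The code below is an illustrative userspace model (toy_* names invented), not the helper added by this patch, but it follows the same shape: trylock first, otherwise drop transaction locks, block on the mutex, then relock and back out of the mutex if the relock fails.

/* Userspace model of "drop btree locks before blocking on a mutex". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_trans {
    bool locked;   /* stands in for held btree node locks */
};

static void toy_trans_unlock(struct toy_trans *t) { t->locked = false; }

static int toy_trans_relock(struct toy_trans *t)
{
    /* In the real code this may fail with a transaction restart error. */
    t->locked = true;
    return 0;
}

static int toy_trans_mutex_lock(struct toy_trans *t, pthread_mutex_t *lock)
{
    int ret;

    if (!pthread_mutex_trylock(lock))
        return 0;   /* uncontended: keep the btree locks we hold */

    /* Contended: drop btree locks so we cannot deadlock against a thread
     * that holds this mutex and is waiting on one of our nodes. */
    toy_trans_unlock(t);
    pthread_mutex_lock(lock);

    ret = toy_trans_relock(t);
    if (ret)
        pthread_mutex_unlock(lock);   /* caller handles the restart */
    return ret;
}

int main(void)
{
    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    struct toy_trans t = { .locked = true };

    if (!toy_trans_mutex_lock(&t, &m)) {
        printf("mutex held, trans locked=%d\n", t.locked);
        pthread_mutex_unlock(&m);
    }
    return 0;
}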
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 10 +++++----- fs/bcachefs/btree_iter.h | 9 +++++++++ fs/bcachefs/btree_locking.c | 13 +++++++++++++ fs/bcachefs/ec.c | 19 +++++++------------ 4 files changed, 34 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2eab63b90664..0b0fe4fea6cc 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1073,7 +1073,7 @@ static bool try_decrease_writepoints(struct bch_fs *c, return true; } -static void bch2_trans_mutex_lock(struct btree_trans *trans, +static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, struct mutex *lock) { if (!mutex_trylock(lock)) { @@ -1091,7 +1091,7 @@ static struct write_point *writepoint_find(struct btree_trans *trans, if (!(write_point & 1UL)) { wp = (struct write_point *) write_point; - bch2_trans_mutex_lock(trans, &wp->lock); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); return wp; } @@ -1100,7 +1100,7 @@ restart_find: wp = __writepoint_find(head, write_point); if (wp) { lock_wp: - bch2_trans_mutex_lock(trans, &wp->lock); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); if (wp->write_point == write_point) goto out; mutex_unlock(&wp->lock); @@ -1113,8 +1113,8 @@ restart_find_oldest: if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - bch2_trans_mutex_lock(trans, &oldest->lock); - bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); + bch2_trans_mutex_lock_norelock(trans, &oldest->lock); + bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); if (oldest >= c->write_points + c->write_points_nr || try_increase_writepoints(c)) { mutex_unlock(&c->write_points_hash_lock); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4b1f993ea3fb..458c7f7dc5b7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -197,6 +197,15 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); +int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); + +static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) +{ + return mutex_trylock(lock) + ? 
0 + : __bch2_trans_mutex_lock(trans, lock); +} + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 49c7e94573c9..14a0614af436 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -725,6 +725,19 @@ bool bch2_trans_locked(struct btree_trans *trans) return false; } +int __bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + int ret; + + bch2_trans_unlock(trans); + mutex_lock(lock); + ret = bch2_trans_relock(trans); + if (ret) + mutex_unlock(lock); + return ret; +} + /* Debug */ #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 236e1bef5f02..6d0a49000bef 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1231,7 +1231,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, return NULL; mutex_init(&h->lock); - mutex_lock(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); h->target = target; h->algo = algo; @@ -1280,23 +1280,18 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, if (!redundancy) return NULL; - if (!mutex_trylock(&c->ec_stripe_head_lock)) { - bch2_trans_unlock(trans); - mutex_lock(&c->ec_stripe_head_lock); - - ret = bch2_trans_relock(trans); - if (ret) { - mutex_unlock(&c->ec_stripe_head_lock); - return ERR_PTR(ret); - } - } + ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); + if (ret) + return ERR_PTR(ret); list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && h->redundancy == redundancy && h->copygc == copygc) { - mutex_lock(&h->lock); + ret = bch2_trans_mutex_lock(trans, &h->lock); + if (ret) + h = ERR_PTR(ret); goto found; } -- cgit From 5e2d8be8bd7985aa590dcccec14fb38fb529b3f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 21:20:18 -0500 Subject: bcachefs: Split trans->last_begin_ip and trans->last_restarted_ip These are two different things - this improves our debug assert messages. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++-- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_types.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5ab22c4c2b2e..63213205a043 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -976,6 +976,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) trans->in_traverse_all = true; retry_all: trans->restarted = 0; + trans->last_restarted_ip = 0; trans_for_each_path(trans, path) path->should_be_locked = false; @@ -1360,7 +1361,7 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ { panic("trans->restart_count %u, should be %u, last restarted by %pS\n", trans->restart_count, restart_count, - (void *) trans->last_restarted_ip); + (void *) trans->last_begin_ip); } void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) @@ -2865,7 +2866,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) bch2_trans_reset_srcu_lock(trans); - trans->last_restarted_ip = _RET_IP_; + trans->last_begin_ip = _RET_IP_; if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 458c7f7dc5b7..50b39704c56f 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -260,6 +260,7 @@ static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int er BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); trans->restarted = err; + trans->last_restarted_ip = _THIS_IP_; return -err; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9fdddfb15782..38bc2a1d198b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -431,6 +431,7 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; + unsigned long last_begin_ip; unsigned long last_restarted_ip; unsigned long srcu_lock_time; -- cgit From 627a231239e050e70cf55a9eec316a8270a2fd63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 20:49:37 -0500 Subject: bcachefs: Switch ec_stripes_heap_lock to a mutex Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/buckets.c | 14 +++++++------- fs/bcachefs/ec.c | 33 ++++++++++++++++----------------- fs/bcachefs/super.c | 2 +- 4 files changed, 25 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5dc4b0c133ad..c9c7ffa9fa71 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -941,7 +941,7 @@ struct bch_fs { GENRADIX(struct gc_stripe) gc_stripes; ec_stripes_heap ec_stripes_heap; - spinlock_t ec_stripes_heap_lock; + struct mutex ec_stripes_heap_lock; /* ERASURE CODING */ struct list_head ec_stripe_head_list; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2e1751eeaef4..ddbf88a759f9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -907,10 +907,10 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return -ENOMEM; } - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); if (!m || !m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); bch2_inconsistent_error(c); @@ -920,7 +920,7 @@ static int 
bch2_mark_stripe_ptr(struct btree_trans *trans, m->block_sectors[p.block] += sectors; r = m->r; - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); @@ -1047,9 +1047,9 @@ int bch2_mark_stripe(struct btree_trans *trans, } if (!new_s) { - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { @@ -1063,9 +1063,9 @@ int bch2_mark_stripe(struct btree_trans *trans, for (i = 0; i < new_s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); } } else { struct gc_stripe *m = diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6d0a49000bef..7a6b962ae1fc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -549,13 +549,13 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) return -ENOMEM; - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); if (n.size > h->size) { memcpy(n.data, h->data, h->used * sizeof(h->data[0])); n.used = h->used; swap(*h, n); } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); free_heap(&n); } @@ -695,15 +695,15 @@ static void ec_stripe_delete_work(struct work_struct *work) ssize_t idx; while (1) { - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); if (idx < 0) { - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); break; } bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) break; @@ -1013,12 +1013,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) bch_err(c, "error creating stripe: error updating pointers: %s", bch2_err_str(ret)); - spin_lock(&c->ec_stripes_heap_lock); + + mutex_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); err_put_writes: bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); err: @@ -1398,7 +1399,7 @@ static s64 get_existing_stripe(struct bch_fs *c, if (may_create_new_stripe(c)) return -1; - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) @@ -1416,12 +1417,11 @@ static s64 get_existing_stripe(struct bch_fs *c, break; } } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); return ret; } -static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, - struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head *h) { unsigned i; s64 idx; @@ -1464,8 +1464,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, return 0; } -static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, - struct ec_stripe_head *h) +static int 
__bch2_ec_stripe_head_reserve(struct bch_fs *c, struct ec_stripe_head *h) { return bch2_disk_reservation_get(c, &h->s->res, h->blocksize, @@ -1606,9 +1605,9 @@ int bch2_stripes_read(struct bch_fs *c) for (i = 0; i < s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(s, i); - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); } bch2_trans_iter_exit(&trans, &iter); @@ -1626,7 +1625,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) struct stripe *m; size_t i; - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); @@ -1635,7 +1634,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) m->nr_blocks - m->nr_redundant, m->nr_redundant); } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f703e41c7560..b030d0bb26e7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -706,7 +706,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->data_progress_list); mutex_init(&c->data_progress_lock); - spin_lock_init(&c->ec_stripes_heap_lock); + mutex_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); -- cgit From ebe8bd75a073c303e695589c11f298fcc3a1fb1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 21:10:13 -0500 Subject: bcachefs: Improve c->writes refcounting for stripe create path This makes our handling of c->writes more consistent with other asynchronous work items. 
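The refcounting pattern - take a writes reference when queueing the work, drop it if the work was already queued, and drop it again when the work function finishes - can be shown with a small standalone model; all toy_* names below are invented, and the work runs synchronously only to keep the sketch self-contained.

/* Userspace model: hold a "writes" reference while async work is pending. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int writes_ref;

static void write_ref_get(void) { atomic_fetch_add(&writes_ref, 1); }
static void write_ref_put(void) { atomic_fetch_sub(&writes_ref, 1); }

/* Stand-in for queue_work(): returns false if the item was already queued. */
static bool toy_queue_work(void (*fn)(void))
{
    fn();   /* run synchronously in this sketch */
    return true;
}

static void toy_stripe_create_work(void)
{
    /* ... create pending stripes ... */
    write_ref_put();   /* balances the get taken when we were queued */
}

static void toy_do_stripe_creates(void)
{
    write_ref_get();
    if (!toy_queue_work(toy_stripe_create_work))
        write_ref_put();   /* work already queued: drop the extra ref */
}

int main(void)
{
    toy_do_stripe_creates();
    printf("outstanding writes refs: %d\n", atomic_load(&writes_ref));
    return 0;
}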
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 54 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7a6b962ae1fc..44e7b6584713 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -982,9 +982,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(!s->allocated); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_create)) - goto err; - ec_generate_ec(&s->new_stripe); ec_generate_checksums(&s->new_stripe); @@ -996,7 +993,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - goto err_put_writes; + goto err; } ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, @@ -1005,7 +1002,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_put_writes; + goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); @@ -1013,15 +1010,12 @@ static void ec_stripe_create(struct ec_stripe_new *s) bch_err(c, "error creating stripe: error updating pointers: %s", bch2_err_str(ret)); - mutex_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); mutex_unlock(&c->ec_stripes_heap_lock); -err_put_writes: - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); err: bch2_disk_reservation_put(c, &s->res); @@ -1043,31 +1037,49 @@ err: kfree(s); } -static void ec_stripe_create_work(struct work_struct *work) +static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) { - struct bch_fs *c = container_of(work, - struct bch_fs, ec_stripe_create_work); - struct ec_stripe_new *s, *n; -restart: + struct ec_stripe_new *s; + mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) + list_for_each_entry(s, &c->ec_stripe_new_list, list) if (!atomic_read(&s->pin)) { list_del(&s->list); - mutex_unlock(&c->ec_stripe_new_lock); - ec_stripe_create(s); - goto restart; + goto out; } + s = NULL; +out: mutex_unlock(&c->ec_stripe_new_lock); + + return s; +} + +static void ec_stripe_create_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s; + + while ((s = get_pending_stripe(c))) + ec_stripe_create(s); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); +} + +void bch2_ec_do_stripe_creates(struct bch_fs *c) +{ + bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + + if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) { BUG_ON(atomic_read(&s->pin) <= 0); - if (atomic_dec_and_test(&s->pin)) { - BUG_ON(!s->pending); - queue_work(system_long_wq, &c->ec_stripe_create_work); - } + if (atomic_dec_and_test(&s->pin)) + bch2_ec_do_stripe_creates(c); } static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -- cgit From ba7c37d330816bcc10c55c8eaab268afca2447e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 21:31:07 -0500 Subject: bcachefs: Stripe deletion now checks what it's deleting Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 72 +++++++++++++++++++++++++++++++++++++++++++------------- 1 
file changed, 56 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 44e7b6584713..17284349ae2e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -584,12 +584,12 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, bch2_trans_relock(trans); } -static ssize_t stripe_idx_to_delete(struct bch_fs *c) +static u64 stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; return h->used && h->data[0].blocks_nonempty == 0 - ? h->data[0].idx : -1; + ? h->data[0].idx : 0; } static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, @@ -674,41 +674,81 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - if (stripe_idx_to_delete(c) >= 0) + if (stripe_idx_to_delete(c)) bch2_do_stripe_deletes(c); } /* stripe deletion */ -static int ec_stripe_delete(struct bch_fs *c, size_t idx) +static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - return bch2_btree_delete_range(c, BTREE_ID_stripes, - POS(0, idx), - POS(0, idx), - 0, NULL); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_stripe s; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); + ret = -EINVAL; + goto err; + } + + s = bkey_s_c_to_stripe(k); + for (unsigned i = 0; i < s.v->nr_blocks; i++) + if (stripe_blockcount_get(s.v, i)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - ssize_t idx; + struct btree_trans trans; + int ret; + u64 idx; + + bch2_trans_init(&trans, c, 0, 0); while (1) { mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - if (idx < 0) { - mutex_unlock(&c->ec_stripes_heap_lock); - break; - } - - bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); + if (idx) + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); mutex_unlock(&c->ec_stripes_heap_lock); - if (ec_stripe_delete(c, idx)) + if (!idx) + break; + + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(&trans, idx)); + if (ret) { + bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); break; + } } + bch2_trans_exit(&trans); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } -- cgit From 4b1e669995a6c19f1e1cc8a600101edf7fe9277e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 21:07:25 -0500 Subject: bcachefs: Erasure coding: Track open stripes This adds a new hash table for stripes being created or updated, instead of hackily relying on the stripes heap. This lets us reserve the slot for the new stripe up front, at the same time as we would pick an existing stripe - if we were updating an existing stripe - making the overall code more consistent. 
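A minimal standalone model of such a table is below - the fixed-size buckets and toy_* names are inventions of the sketch, not the kernel hash_64()/hlist implementation - but the operations match the description: a creator reserves a stripe index only if nobody else has it open, and deletion skips any index found in the table.

/* Userspace model of an "open stripes" table keyed by stripe index. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_BUCKETS 32
#define MAX_OPEN   8

struct toy_open_stripe {
    uint64_t idx;
    bool     in_use;
};

static struct toy_open_stripe table[NR_BUCKETS][MAX_OPEN];

static unsigned toy_bucket(uint64_t idx) { return (unsigned) (idx % NR_BUCKETS); }

static bool toy_stripe_is_open(uint64_t idx)
{
    struct toy_open_stripe *b = table[toy_bucket(idx)];

    for (unsigned i = 0; i < MAX_OPEN; i++)
        if (b[i].in_use && b[i].idx == idx)
            return true;
    return false;
}

static bool toy_try_open_stripe(uint64_t idx)
{
    struct toy_open_stripe *b = table[toy_bucket(idx)];

    if (toy_stripe_is_open(idx))
        return false;   /* someone else is creating/updating this stripe */

    for (unsigned i = 0; i < MAX_OPEN; i++)
        if (!b[i].in_use) {
            b[i].in_use = true;
            b[i].idx = idx;
            return true;
        }
    return false;       /* bucket full; the real code has no such limit */
}

static void toy_close_stripe(uint64_t idx)
{
    struct toy_open_stripe *b = table[toy_bucket(idx)];

    for (unsigned i = 0; i < MAX_OPEN; i++)
        if (b[i].in_use && b[i].idx == idx)
            b[i].in_use = false;
}

int main(void)
{
    printf("open 7: %d\n", toy_try_open_stripe(7));   /* 1 */
    printf("open 7: %d\n", toy_try_open_stripe(7));   /* 0: already open */
    toy_close_stripe(7);
    printf("open 7: %d\n", toy_try_open_stripe(7));   /* 1 again */
    return 0;
}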
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/ec.c | 231 +++++++++++++++++++++++++++++++++---------------- fs/bcachefs/ec.h | 4 + 3 files changed, 165 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c9c7ffa9fa71..85a815cdf586 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -940,6 +940,9 @@ struct bch_fs { GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; + struct hlist_head ec_stripes_new[32]; + spinlock_t ec_stripes_new_lock; + ec_stripes_heap ec_stripes_heap; struct mutex ec_stripes_heap_lock; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 17284349ae2e..eb8ce55e6fd4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -584,12 +584,79 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, bch2_trans_relock(trans); } +/* + * Hash table of open stripes: + * Stripes that are being created or modified are kept in a hash table, so that + * stripe deletion can skip them. + */ + +static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + struct ec_stripe_new *s; + + hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) + if (s->idx == idx) + return true; + return false; +} + +static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + bool ret = false; + + spin_lock(&c->ec_stripes_new_lock); + ret = __bch2_stripe_is_open(c, idx); + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static bool bch2_try_open_stripe(struct bch_fs *c, + struct ec_stripe_new *s, + u64 idx) +{ + bool ret; + + spin_lock(&c->ec_stripes_new_lock); + ret = !__bch2_stripe_is_open(c, idx); + if (ret) { + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + + s->idx = idx; + hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); + } + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(!s->idx); + + spin_lock(&c->ec_stripes_new_lock); + hlist_del_init(&s->hash); + spin_unlock(&c->ec_stripes_new_lock); + + s->idx = 0; +} + +/* Heap of all existing stripes, ordered by blocks_nonempty */ + static u64 stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; + size_t heap_idx; + + lockdep_assert_held(&c->ec_stripes_heap_lock); - return h->used && h->data[0].blocks_nonempty == 0 - ? 
h->data[0].idx : 0; + for (heap_idx = 0; heap_idx < h->used; heap_idx++) + if (h->data[heap_idx].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[heap_idx].idx)) + return h->data[heap_idx].idx; + + return 0; } static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, @@ -761,60 +828,13 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ -static int ec_stripe_bkey_insert(struct btree_trans *trans, - struct bkey_i_stripe *stripe, - struct disk_reservation *res) +static int ec_stripe_key_update(struct btree_trans *trans, + struct bkey_i_stripe *new, + bool create) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bpos min_pos = POS(0, 1); - struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_gt(k.k->p, POS(0, U32_MAX))) { - if (start_pos.offset) { - start_pos = min_pos; - bch2_btree_iter_set_pos(&iter, start_pos); - continue; - } - - ret = -BCH_ERR_ENOSPC_stripe_create; - break; - } - - if (bkey_deleted(k.k)) - break; - } - - c->ec_stripe_hint = iter.pos.offset; - - if (ret) - goto err; - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) - goto err; - - stripe->k.p = iter.pos; - - ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static int ec_stripe_bkey_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - struct disk_reservation *res) -{ - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_stripe *existing; - unsigned i; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, @@ -824,23 +844,27 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, if (ret) goto err; - if (!k.k || k.k->type != KEY_TYPE_stripe) { - bch_err(trans->c, "error updating stripe: not found"); - ret = -ENOENT; + if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { + bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type]); + ret = -EINVAL; goto err; } - existing = bkey_s_c_to_stripe(k).v; + if (k.k->type == KEY_TYPE_stripe) { + const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; + unsigned i; - if (existing->nr_blocks != new->v.nr_blocks) { - bch_err(trans->c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + if (old->nr_blocks != new->v.nr_blocks) { + bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, - stripe_blockcount_get(existing, i)); + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, stripe_blockcount_get(old, i)); + } ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: @@ -1037,18 +1061,21 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, - s->have_existing_stripe - ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) - : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); + ec_stripe_key_update(&trans, &s->new_stripe.key, + !s->have_existing_stripe)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); - if (ret) + if (ret) { bch_err(c, "error creating stripe: error updating pointers: %s", bch2_err_str(ret)); + goto err; + } + + bch2_stripe_close(c, s); mutex_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); @@ -1458,12 +1485,16 @@ static s64 get_existing_stripe(struct bch_fs *c, continue; stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes, stripe_idx); if (m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + if (!bch2_try_open_stripe(c, head->s, stripe_idx)) + continue; + bch2_stripes_heap_del(c, m, stripe_idx); ret = stripe_idx; break; @@ -1516,11 +1547,59 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head * return 0; } -static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) { - return bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + + BUG_ON(h->s->res.sectors); + + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { + if (start_pos.offset) { + start_pos = min_pos; + bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + + ret = -BCH_ERR_ENOSPC_stripe_create; + break; + } + + if (bkey_deleted(k.k) && + bch2_try_open_stripe(c, h->s, k.k->p.offset)) + break; + } + + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) { + bch2_stripe_close(c, h->s); + goto err; + } + + h->s->new_stripe.key.k.p = iter.pos; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +err: + bch2_disk_reservation_put(c, &h->s->res); + goto out; } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, @@ -1560,7 +1639,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, */ ret = 0; if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) - ret = __bch2_ec_stripe_head_reserve(c, h); + ret = __bch2_ec_stripe_head_reserve(trans, h); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + if (ret && needs_stripe_new) ret = __bch2_ec_stripe_head_reuse(c, h); if (ret) { @@ -1576,6 +1658,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, h->s->allocated = true; } + BUG_ON(trans->restarted); return h; err: bch2_ec_stripe_head_put(c, h); @@ -1749,6 +1832,8 @@ void bch2_fs_ec_init_early(struct bch_fs *c) int bch2_fs_ec_init(struct bch_fs *c) { + spin_lock_init(&c->ec_stripes_new_lock); + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 
37d42e2a4505..0a69114bb160 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -148,6 +148,10 @@ struct ec_stripe_new { struct ec_stripe_head *h; struct mutex lock; struct list_head list; + + struct hlist_node hash; + u64 idx; + struct closure iodone; /* counts in flight writes, stripe is created when pin == 0 */ -- cgit From 27616a31241e0625ee063f2cacd8c8e339b2de65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Feb 2023 22:11:50 -0500 Subject: bcachefs: Simplify ec stripes heap Now that we have a separate data structure for tracking open stripes, the stripes heap can track all existing stripes, which is a nice simplification. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 12 +++++------ fs/bcachefs/ec.c | 57 ++++++++++++-------------------------------------- fs/bcachefs/ec.h | 2 -- fs/bcachefs/ec_types.h | 5 ----- fs/bcachefs/recovery.c | 2 -- 5 files changed, 18 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ddbf88a759f9..7c4780b3ceb5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1031,7 +1031,7 @@ int bch2_mark_stripe(struct btree_trans *trans, if (!gc) { struct stripe *m = genradix_ptr(&c->stripes, idx); - if (!m || (old_s && !m->alive)) { + if (!m) { struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; @@ -1047,13 +1047,10 @@ int bch2_mark_stripe(struct btree_trans *trans, } if (!new_s) { - mutex_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_del(c, m, idx); - mutex_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; @@ -1063,9 +1060,10 @@ int bch2_mark_stripe(struct btree_trans *trans, for (i = 0; i < new_s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - mutex_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, idx); - mutex_unlock(&c->ec_stripes_heap_lock); + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); } } else { struct gc_stripe *m = diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index eb8ce55e6fd4..924494868102 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -680,7 +680,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m = genradix_ptr(&c->stripes, idx); - BUG_ON(!m->alive); BUG_ON(m->heap_idx >= h->used); BUG_ON(h->data[m->heap_idx].idx != idx); } @@ -688,28 +687,21 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { - if (!m->on_heap) - return; - - m->on_heap = false; - + mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - if (m->on_heap) - return; - + mutex_lock(&c->ec_stripes_heap_lock); BUG_ON(heap_full(&c->ec_stripes_heap)); - m->on_heap = true; - heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, @@ -718,17 +710,17 @@ void bch2_stripes_heap_insert(struct bch_fs *c, ec_stripes_heap_set_backpointer); heap_verify_backpointer(c, idx); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_update(struct bch_fs *c, struct 
stripe *m, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; + bool do_deletes; size_t i; - if (!m->on_heap) - return; - + mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; @@ -741,7 +733,10 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - if (stripe_idx_to_delete(c)) + do_deletes = stripe_idx_to_delete(c) != 0; + mutex_unlock(&c->ec_stripes_heap_lock); + + if (do_deletes) bch2_do_stripe_deletes(c); } @@ -799,8 +794,6 @@ static void ec_stripe_delete_work(struct work_struct *work) while (1) { mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - if (idx) - bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); mutex_unlock(&c->ec_stripes_heap_lock); if (!idx) @@ -1013,7 +1006,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; - struct stripe *m; struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret; @@ -1076,13 +1068,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) } bch2_stripe_close(c, s); - - mutex_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); - - BUG_ON(m->on_heap); - bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); - mutex_unlock(&c->ec_stripes_heap_lock); err: bch2_disk_reservation_put(c, &s->res); @@ -1491,11 +1476,8 @@ static s64 get_existing_stripe(struct bch_fs *c, if (m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { - if (!bch2_try_open_stripe(c, head->s, stripe_idx)) - continue; - - bch2_stripes_heap_del(c, m, stripe_idx); + m->blocks_nonempty < m->nr_blocks - m->nr_redundant && + bch2_try_open_stripe(c, head->s, stripe_idx)) { ret = stripe_idx; break; } @@ -1696,16 +1678,6 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } -void bch2_stripes_heap_start(struct bch_fs *c) -{ - struct genradix_iter iter; - struct stripe *m; - - genradix_for_each(&c->stripes, iter, m) - if (m->alive) - bch2_stripes_heap_insert(c, m, iter.pos); -} - int bch2_stripes_read(struct bch_fs *c) { struct btree_trans trans; @@ -1730,7 +1702,6 @@ int bch2_stripes_read(struct bch_fs *c) s = bkey_s_c_to_stripe(k).v; m = genradix_ptr(&c->stripes, k.k->p.offset); - m->alive = true; m->sectors = le16_to_cpu(s->sectors); m->algorithm = s->algorithm; m->nr_blocks = s->nr_blocks; @@ -1740,9 +1711,7 @@ int bch2_stripes_read(struct bch_fs *c) for (i = 0; i < s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(s, i); - mutex_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, k.k->p.offset); - mutex_unlock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_insert(c, m, k.k->p.offset); } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 0a69114bb160..c9e4fb214649 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -217,8 +217,6 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); -void bch2_stripes_heap_start(struct bch_fs *); - int bch2_stripes_read(struct bch_fs *); void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 2bf26d254b2c..e2b02a82de32 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -11,15 +11,10 @@ struct bch_replicas_padded { struct stripe { size_t 
heap_idx; - u16 sectors; u8 algorithm; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? */ - unsigned on_heap:1; u8 blocks_nonempty; }; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1976d5fa3427..f5946b4dbce2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1260,8 +1260,6 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - bch2_stripes_heap_start(c); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); -- cgit From 7546c78df1963c60b2b6b80265daa26462b87ec5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Feb 2023 00:43:10 -0500 Subject: bcachefs: Fix ec repair code check Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 959f4081b42f..8ae838acd853 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -661,7 +661,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, -- cgit From 9f6db1276c0c80b017f9278d6f081f20cecbeb33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Feb 2023 00:49:51 -0500 Subject: bcachefs: bch2_journal_entries_postprocess() This brings back journal_entries_compact(), but in a more efficient form - we need to do multiple postprocess steps, so iterate over the journal entries being written just once to make it more efficient. 
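The single-pass compaction can be sketched with plain arrays; the toy_entry layout below is invented (the real jset entries are variable-length, and the merge applies only to BCH_JSET_ENTRY_btree_keys entries whose combined size still fits in a u16), but the flow is the same: skip empty reservations and fold an entry into the previous kept one when it targets the same btree at the same level.

/* Userspace model: one pass that drops empty entries and merges neighbours. */
#include <stdio.h>

struct toy_entry {
    int type;       /* 0 = btree_keys in this sketch */
    int btree_id;
    int level;
    int u64s;       /* payload size; 0 means "unused reservation" */
};

static int toy_compact(struct toy_entry *e, int nr)
{
    int out = -1;   /* index of last kept entry */

    for (int i = 0; i < nr; i++) {
        if (!e[i].u64s)
            continue;   /* drop empty reservations */

        /* Merge with the previous kept entry when it targets the same
         * btree at the same level. */
        if (out >= 0 &&
            e[out].type     == e[i].type &&
            e[out].btree_id == e[i].btree_id &&
            e[out].level    == e[i].level) {
            e[out].u64s += e[i].u64s;
            continue;
        }

        e[++out] = e[i];
    }
    return out + 1;
}

int main(void)
{
    struct toy_entry e[] = {
        { 0, 1, 0, 4 }, { 0, 1, 0, 0 }, { 0, 1, 0, 3 }, { 0, 2, 0, 2 },
    };
    int nr = toy_compact(e, 4);

    printf("%d entries after compaction (first has %d u64s)\n",
           nr, e[0].u64s);   /* prints: 2 entries ... first has 7 u64s */
    return 0;
}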
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 15 ++++-------- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/journal_io.c | 48 ++++++++++++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 45004f17d51d..ad86c0b9e42e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2402,20 +2402,15 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *c) return ret; } -void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) +void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) { - struct btree_root *r; - struct jset_entry *entry; + struct btree_root *r = &c->btree_roots[entry->btree_id]; mutex_lock(&c->btree_root_lock); - vstruct_for_each(jset, entry) - if (entry->type == BCH_JSET_ENTRY_btree_root) { - r = &c->btree_roots[entry->btree_id]; - r->level = entry->level; - r->alive = true; - bkey_copy(&r->key, &entry->start[0]); - } + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, &entry->start[0]); mutex_unlock(&c->btree_root_lock); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 30e9c137b0e2..dcfd7ceacc59 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -314,7 +314,7 @@ void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); bool bch2_btree_interior_updates_flush(struct bch_fs *); -void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); +void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, struct jset_entry *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c6bb78d2a07f..377c07125183 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1621,6 +1621,52 @@ static void do_journal_write(struct closure *cl) return; } +static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + if (i->type == BCH_JSET_ENTRY_btree_root) + bch2_journal_entry_to_btree_root(c, i); + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? 
vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); @@ -1692,7 +1738,7 @@ void bch2_journal_write(struct closure *cl) * entry: */ - bch2_journal_entries_to_btree_roots(c, jset); + bch2_journal_entries_postprocess(c, jset); start = end = vstruct_last(jset); -- cgit From 93bd2f877f221f05a1b51dcfac09d196c8bcda5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Feb 2023 14:34:38 -0500 Subject: bcachefs: Improve a verbose log message We should be using bch2_err_str() where applicable. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b030d0bb26e7..cc27f19960f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1920,7 +1920,8 @@ out: kfree(sb); printbuf_exit(&errbuf); module_put(THIS_MODULE); - pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + pr_verbose_init(opts, "ret %s (%i)", bch2_err_str(PTR_ERR_OR_ZERO(c)), + PTR_ERR_OR_ZERO(c)); return c; err_print: pr_err("bch_fs_open err opening %s: %s", -- cgit From 930256d4fbe7b8dee8f1a1658630655d8c6043f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Feb 2023 14:33:46 -0500 Subject: bcachefs: __bch2_btree_insert uses BTREE_INSERT_CACHED Cached btrees should be doing cached updates by default: this fixes a bug in the migrate tool. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 544b90b15260..0bf4116442f4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1749,6 +1749,7 @@ int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, int ret; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); -- cgit From e151580d3072e7326732edcaf2a77ea423c695f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Feb 2023 16:41:03 -0500 Subject: bcachefs: Add tracepoint & counter for btree split race Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/btree_update_interior.c | 4 +++- fs/bcachefs/trace.h | 6 ++++++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9524ff02f2d7..4d3c5e2f7ea6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1478,7 +1478,8 @@ struct bch_sb_field_disk_groups { x(transaction_commit, 72) \ x(write_super, 73) \ x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index ad86c0b9e42e..79be89006403 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1717,8 +1717,10 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) + if (b->c.level >= as->update_level) { + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + } return btree_split(as, trans, path, b, keys, flags); } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 30b10908ced0..f608baec294a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -837,6 +837,12 @@ DEFINE_EVENT(transaction_event, trans_restart_injected, TP_ARGS(trans, caller_ip) ); +DEFINE_EVENT(transaction_event, trans_restart_split_race, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), -- cgit From e094beccc139b1d2975563cdba0b661d14e6e06b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Feb 2023 00:56:41 -0500 Subject: bcachefs: Kill bch2_keylist_add_in_order() Dead code, so delete Signed-off-by: Kent Overstreet --- fs/bcachefs/keylist.c | 16 ---------------- fs/bcachefs/keylist.h | 1 - 2 files changed, 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 29e51bde8313..cf5998e519e7 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -31,22 +31,6 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, return 0; } -void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -{ - struct bkey_i *where; - - for_each_keylist_key(l, where) - if (bpos_lt(insert->k.p, where->k.p)) - break; - - memmove_u64s_up((u64 *) where + insert->k.u64s, - where, - ((u64 *) l->top) - ((u64 *) where)); - - l->top_p += insert->k.u64s; - bkey_copy(where, insert); -} - void bch2_keylist_pop_front(struct keylist *l) { l->top_p -= bch2_keylist_front(l)->k.u64s; diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index 635efb7e8228..fe759c7031e0 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -5,7 +5,6 @@ #include "keylist_types.h" int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); void bch2_keylist_pop_front(struct keylist *); static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -- cgit From c9163bb03b81e465019cf56d2edf47b70798e3ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Feb 2023 19:22:44 -0500 Subject: bcachefs: Cached pointers should not be erasure coded There's no reason to erasure code cached pointers: we'll always have another copy, and it'll be cheaper to read the other copy than do a reconstruct read. And erasure coded cached pointers would add complications that we'd rather not have to deal with, so let's make sure to disallow them. 
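The rule is enforced in two places, sketched here in condensed form from the hunks that follow: a new helper drops the stripe_ptr entry whenever a pointer is marked cached, and bch2_bkey_ptrs_invalid() rejects the combination outright.

	/* marking a pointer cached also drops the stripe_ptr covering it: */
	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->dev == dev) {
			bch2_extent_ptr_set_cached(k, ptr);
			return;
		}

	/* and in bch2_bkey_ptrs_invalid(): */
	if (entry->ptr.cached && have_ec) {
		prt_printf(err, "cached, erasure coded ptr");
		return -BCH_ERR_invalid_bkey;
	}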
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 6 ++++-- fs/bcachefs/extents.c | 32 +++++++++++++++++++++++++++++++- fs/bcachefs/extents.h | 2 ++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c3adc7b32e19..c98a393f4916 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -97,8 +97,10 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - ptr->cached = true; + if (ptr->dev == dev) { + bch2_extent_ptr_set_cached(k, ptr); + return; + } } static int __bch2_data_update_index_update(struct btree_trans *trans, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4b865949768f..2e41545dc1e9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -950,6 +950,29 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, return false; } +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * @@ -1093,7 +1116,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false; + bool unwritten = false, have_ec = false; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1129,7 +1152,13 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; + } + unwritten = entry->ptr.unwritten; + have_ec = false; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: @@ -1165,6 +1194,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, } break; case BCH_EXTENT_ENTRY_stripe_ptr: + have_ec = true; break; } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1d8f3b309b07..c52a09832857 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -655,6 +655,8 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); +void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); + bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -- cgit From 43b0e8787ecb92760202c0682172141ed4b22c62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Feb 2023 23:51:19 -0500 Subject: bcachefs: Check for redundant ec entries/stripe ptrs Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2e41545dc1e9..a55e0ed75548 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1116,7 +1116,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c 
k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false, have_ec = false; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1159,6 +1159,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unwritten = entry->ptr.unwritten; have_ec = false; + crc_since_last_ptr = false; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: @@ -1192,18 +1193,43 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; break; case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } have_ec = true; break; } } + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; + } + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { prt_str(err, "too many ptrs"); return -BCH_ERR_invalid_bkey; } + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + return 0; } -- cgit From 64784ade4fd75cdd47d7ddfbfcbed3506fe10523 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Feb 2023 17:57:59 -0500 Subject: bcachefs: Fix buffer overrun in ec_stripe_update_extent() Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 35 ++++++++++++++--------------------- fs/bcachefs/extents.c | 12 ------------ fs/bcachefs/extents.h | 12 ++++++++++++ 3 files changed, 26 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 924494868102..4b054f7c4d4e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -865,25 +865,6 @@ err: return ret; } -static void extent_stripe_ptr_add(struct bkey_s_extent e, - struct ec_stripe_buf *s, - struct bch_extent_ptr *ptr, - unsigned block) -{ - struct bch_extent_stripe_ptr *dst = (void *) ptr; - union bch_extent_entry *end = extent_entry_last(e); - - memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); - e.k->u64s += sizeof(*dst) / sizeof(u64); - - *dst = (struct bch_extent_stripe_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, - .block = block, - .redundancy = s->key.v.nr_redundant, - .idx = s->key.k.p.offset, - }; -} - static int ec_stripe_update_extent(struct btree_trans *trans, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, @@ -895,6 +876,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; @@ -933,16 +915,27 @@ static int ec_stripe_update_extent(struct btree_trans *trans, dev = s->key.v.ptrs[block].dev; - n = bch2_bkey_make_mut(trans, k); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); ret = PTR_ERR_OR_ZERO(n); if (ret) goto out; + bkey_reassemble(n, k); + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .redundancy = s->key.v.nr_redundant, + .idx = s->key.k.p.offset, + }; + + __extent_entry_insert(n, + (union 
bch_extent_entry *) ec_ptr, + (union bch_extent_entry *) &stripe_ptr); ret = bch2_trans_update(trans, &iter, n, 0); out: diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a55e0ed75548..38be9bf91264 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -705,18 +705,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry k->k.u64s -= extent_entry_u64s(entry); } -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -} - void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c52a09832857..2e37543a6229 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -76,6 +76,18 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) return extent_entry_bytes(entry) / sizeof(u64); } +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -- cgit From e3877382fb9d0a8ae43c7e012742622bbd6e6d17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Feb 2023 18:35:51 -0500 Subject: bcachefs: Fix erasure coding shutdown path It's possible when shutting down for a stripe head to have a new stripe that doesn't yet have any blocks allocated - we just need to free it.
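In condensed form, the shutdown path now handles that case instead of asserting it can't happen (full hunk to bch2_fs_ec_exit() below):

	if (h->s) {
		/* no blocks allocated yet: nothing to tear down, just free it */
		for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++)
			BUG_ON(h->s->blocks[i]);

		kfree(h->s);
	}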
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4b054f7c4d4e..123ca0b0b43f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1764,6 +1764,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; + unsigned i; while (1) { mutex_lock(&c->ec_stripe_head_lock); @@ -1775,7 +1776,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) if (!h) break; - BUG_ON(h->s); + if (h->s) { + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); + } kfree(h); } -- cgit From 70ded998c5abef9907b8d42a5118dddd3de67160 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Feb 2023 19:28:58 -0500 Subject: bcachefs: get_stripe_key_trans() Another nested btree_trans fix Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 123ca0b0b43f..b711f33f0e03 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -442,15 +442,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -462,11 +461,15 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip } bkey_reassemble(&stripe->key.k_i, k); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); +} + /* recovery read path: */ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) { @@ -1479,8 +1482,9 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) { + struct bch_fs *c = trans->c; unsigned i; s64 idx; int ret; @@ -1490,7 +1494,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head * return -BCH_ERR_ENOSPC_stripe_reuse; h->s->have_existing_stripe = true; - ret = get_stripe_key(c, idx, &h->s->existing_stripe); + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); if (ret) { bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); return ret; @@ -1619,7 +1623,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto err; if (ret && needs_stripe_new) - ret = __bch2_ec_stripe_head_reuse(c, h); + ret = __bch2_ec_stripe_head_reuse(trans, h); if (ret) { bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); goto err; -- cgit From e9b70146545120f7d4bdba87eb8ec9a8f1669c7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Feb 2023 19:39:02 -0500 Subject: bcachefs: Don't call 
bch2_trans_update() unlocked Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c964643e7ebf..814ca33a15f6 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -226,7 +226,8 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, if (bkey_deleted(&n->k)) n->k.size = 0; - return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } -- cgit From 1a14e255100cb17cface9ca179ca7ddba87fd8b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Feb 2023 19:06:32 -0500 Subject: bcachefs: Make bucket_alloc tracepoint more readable Print bucket in dev:bucket notation, to be consistent with how we refer to buckets elsewhere. Signed-off-by: Kent Overstreet --- fs/bcachefs/trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index f608baec294a..aae04d0619be 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -530,7 +530,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, s, nonblocking, err), TP_STRUCT__entry( - __field(dev_t, dev ) + __field(u8, dev ) __array(char, reserve, 16 ) __field(bool, user ) __field(u64, bucket ) @@ -548,7 +548,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, ), TP_fast_assign( - __entry->dev = ca->dev; + __entry->dev = ca->dev_idx; strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); __entry->user = user; __entry->bucket = bucket; @@ -565,10 +565,10 @@ DECLARE_EVENT_CLASS(bucket_alloc, strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - MAJOR(__entry->dev), MINOR(__entry->dev), + TP_printk("reserve %s user %u bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", __entry->reserve, __entry->user, + __entry->dev, __entry->bucket, __entry->free, __entry->avail, -- cgit From 33669e0cc94e9554cf162cbe2e63155887a10231 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Feb 2023 19:07:21 -0500 Subject: bcachefs: Add option for completely disabling nocow This adds an option for completely disabling nocow mode, including the locking in the data move path. 
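At runtime the new option simply gates the nocow paths; sketched from the io.c hunk below, a nocow write now requires both the per-write nocow option and the new filesystem-wide switch:

	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
		bch2_nocow_write(op);
		if (op->flags & BCH_WRITE_DONE)
			goto out_nofs_restore;
	}

The data update path takes the bucket nocow locks under the same c->opts.nocow_enabled check, so a filesystem mounted with the option off skips that locking entirely.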
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 38 +++++++++++++++++++++----------------- fs/bcachefs/io.c | 2 +- fs/bcachefs/opts.h | 6 ++++++ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c98a393f4916..dacea5e04000 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -327,8 +327,9 @@ void bch2_data_update_exit(struct data_update *update) const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); + if (c->opts.nocow_enabled) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); } @@ -488,23 +489,26 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (ctxt) { - move_ctxt_wait_event(ctxt, trans, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) || - !atomic_read(&ctxt->read_sectors)); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; + if (c->opts.nocow_enabled) { + if (ctxt) { + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } } + ptrs_locked |= (1U << i); } - ptrs_locked |= (1U << i); + i++; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6f7e4dac4268..ede2f3116935 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1649,7 +1649,7 @@ static void __bch2_write(struct bch_write_op *op) nofs_flags = memalloc_nofs_save(); - if (unlikely(op->opts.nocow)) { + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); if (op->flags & BCH_WRITE_DONE) goto out_nofs_restore; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index fc444c68025c..afbf82d62977 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -404,6 +404,12 @@ enum opt_type { NULL, "Nocow mode: Writes will be done in place when possible.\n"\ "Snapshots and reflink will still caused writes to be COW\n"\ "Implicitly disables data checksumming, compression and encryption")\ + x(nocow_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ -- cgit From 0d763863af0b1d70dcb64e515df4b9242bc9c28d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Feb 2023 19:26:03 -0500 Subject: bcachefs: Improve bch2_stripe_to_text() We now print pointers as bucket:offset, the same as how we print extent pointers. 
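For illustration, with this change a stripe renders along these lines (made-up values; data blocks get a trailing #blockcount, and stale pointers are flagged):

	algo 0 sectors 512 blocks 2:1 csum 1 gran 8 0:1024:0#512 1:2057:0#512 2:977:0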
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b711f33f0e03..473f1c09e106 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -138,20 +138,28 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", s->algorithm, le16_to_cpu(s->sectors), - s->nr_blocks - s->nr_redundant, + nr_data, s->nr_redundant, s->csum_type, 1U << s->csum_granularity_bits); - for (i = 0; i < s->nr_blocks; i++) - prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, - (u64) s->ptrs[i].offset, - stripe_blockcount_get(s, i)); + for (i = 0; i < s->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = s->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } } /* returns blocknr in stripe that we matched: */ -- cgit From 39a1ea129a6906a0d6127036222bdb68ec01a277 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Feb 2023 00:32:34 -0500 Subject: bcachefs: Single open_bucket_partial list Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 ++++++++++++++++------------ fs/bcachefs/alloc_foreground.c | 22 ++++++++++------------ fs/bcachefs/bcachefs.h | 6 +++--- 3 files changed, 29 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 472466d16b68..ce0ea4886288 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2175,21 +2175,25 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); - while (1) { - struct open_bucket *ob; - - spin_lock(&c->freelist_lock); - if (!ca->open_buckets_partial_nr) { + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (ob->dev == ca->dev_idx) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; spin_unlock(&c->freelist_lock); - break; + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; } - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - - bch2_open_bucket_put(c, ob); } + spin_unlock(&c->freelist_lock); bch2_ec_stop_dev(c, ca); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0b0fe4fea6cc..1405d6b8cbc5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,18 +154,15 @@ static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); bool may_realloc = wp->data_type == BCH_DATA_user; - BUG_ON(ca->open_buckets_partial_nr > - ARRAY_SIZE(ca->open_buckets_partial)); + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); - if 
(ca->open_buckets_partial_nr < - ARRAY_SIZE(ca->open_buckets_partial) && - may_realloc) { + if (may_realloc) { spin_lock(&c->freelist_lock); ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + c->open_buckets_partial[c->open_buckets_partial_nr++] = ob - c->open_buckets; spin_unlock(&c->freelist_lock); @@ -394,12 +391,13 @@ static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch spin_lock(&c->freelist_lock); - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + c->open_buckets_partial[i]; - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, + if (ob->dev == ca->dev_idx && + reserve <= ob->alloc_reserve) { + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, i); ob->on_partial_list = false; ob->alloc_reserve = reserve; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 85a815cdf586..509a16469613 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -516,9 +516,6 @@ struct bch_dev { unsigned nr_open_buckets; unsigned nr_btree_reserve; - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; @@ -859,6 +856,9 @@ struct bch_fs { struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + struct write_point btree_write_point; struct write_point rebalance_write_point; -- cgit From 747ded6ddfe88eb9644ee0512c061e46fe2fb09d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Feb 2023 15:48:39 -0500 Subject: bcachefs: Fix for shared paths in write buffer flush It's possible for bch2_write_buffer_flush_one() to end up with a shared path, if called from a context that already has a btree iterator pointing to a key being flushed. We have to be careful when that happens, since we can't clone a path that holds write locks. 
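The fix, in condensed form (full btree_write_buffer.c hunk below): after the fast-path insert, a path that turns out to be shared has its node write lock dropped before the next set_pos()/traverse(), since a path holding write locks must not be cloned.

	if (path->ref > 1) {
		bch2_btree_node_unlock_write(trans, path, path->l[0].b);
		*write_locked = false;
	}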
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 6285532e7790..026c249a3f44 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -64,6 +64,15 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; + + if (path->ref > 1) { + /* + * We can't clone a path that has write locks: if the path is + * shared, unlock before set_pos(), traverse(): + */ + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + } return 0; trans_commit: return bch2_trans_update(trans, iter, &wb->k, 0) ?: -- cgit From e07cb97460b9dd057c510e6d2294a29e72e60797 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Feb 2023 05:22:37 -0500 Subject: bcachefs: Flush write buffer as needed in backpointers repair Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e001f4191671..a40c26125d2a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -932,11 +932,14 @@ static int check_one_backpointer(struct btree_trans *trans, struct bpos bucket, u64 *bp_offset, struct bbpos start, - struct bbpos end) + struct bbpos end, + struct bpos *last_flushed_pos) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bch_backpointer bp; struct bbpos pos; + struct bpos bp_pos; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; @@ -957,17 +960,31 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(!k.k, trans->c, + bp_pos = bucket_pos_to_bp(c, bucket, + max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); + + if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { + *last_flushed_pos = bp_pos; + pr_info("flushing at %llu:%llu", + last_flushed_pos->inode, + last_flushed_pos->offset); + + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err_on(!k.k, c, "%s backpointer points to missing extent\n%s", *bp_offset < BACKPOINTER_OFFSET_MAX ? 
"alloc" : "btree", (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); if (ret == -ENOENT) - bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + bch_err(c, "backpointer at %llu not found", *bp_offset); } - - bch2_trans_iter_exit(trans, &iter); +out: fsck_err: + bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } @@ -978,6 +995,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, { struct btree_iter iter; struct bkey_s_c k; + struct bpos last_flushed_pos = SPOS_MAX; int ret = 0; for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, @@ -987,7 +1005,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, while (!(ret = commit_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, - check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && + check_one_backpointer(trans, iter.pos, &bp_offset, + start, end, &last_flushed_pos))) && bp_offset < U64_MAX) bp_offset++; -- cgit From 039c45feef4f9a46aa0ee0b5ecfafdfd4c0dde76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Feb 2023 17:12:05 -0500 Subject: bcachefs: bch2_data_update_index_update() -> bch2_trans_run() Convert to use the standard helper Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index dacea5e04000..84c2963d4069 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -296,15 +296,7 @@ out: int bch2_data_update_index_update(struct bch_write_op *op) { - struct bch_fs *c = op->c; - struct btree_trans trans; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - ret = __bch2_data_update_index_update(&trans, op); - bch2_trans_exit(&trans); - - return ret; + return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); } void bch2_data_update_read_done(struct data_update *m, -- cgit From 94bc95c468344d6a329dd87ab4461532584a7b71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Feb 2023 17:12:36 -0500 Subject: bcachefs: ec: zero_out_rest_of_ec_bucket() Occasionally, we won't write to an entire bucket. This fixes the EC code to handle this case, zeroing out the rest of the bucket as needed. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 473f1c09e106..88f319992c37 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1003,6 +1003,35 @@ err: return ret; } +static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + struct ec_stripe_new *s, + unsigned block, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { + s->err = -EROFS; + return; + } + + memset(s->new_stripe.data[block] + (offset << 9), + 0, + ob->sectors_free << 9); + + ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + ob->bucket * ca->mi.bucket_size + offset, + ob->sectors_free, + GFP_KERNEL, 0); + + percpu_ref_put(&ca->io_ref); + + if (ret) + s->err = ret; +} + /* * data buckets of new stripe all written: create the stripe */ @@ -1018,6 +1047,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); @@ -1159,9 +1196,6 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; - if (ob->sectors_free) - s->err = -1; - ec_stripe_new_put(c, s); } -- cgit From 0f2ea6550f1fbfabbe2db276107246c1236018ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Feb 2023 21:26:07 -0500 Subject: bcachefs: bch2_btree_iter_peek_and_restart_outlined() Needed for interfacing with Rust - bindgen can't handle inline functions, alas. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++++++++ fs/bcachefs/btree_iter.h | 2 ++ 2 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 63213205a043..eebab7534f62 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2568,6 +2568,18 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(iter->trans); + + return k; +} + /* new transactional stuff: */ #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 50b39704c56f..8e1f754e641b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -595,6 +595,8 @@ static inline int btree_trans_too_many_iters(struct btree_trans *trans) return 0; } +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + static inline struct bkey_s_c __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) -- cgit From f3a65bb98b4612745cf2505734c78404344e2c9e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Feb 2023 22:12:06 -0500 Subject: bcachefs: Convert constants to consts Rust bindgen doesn't handle macros, but it does handle integer constants: this conversion aids in implementing safe Rust wrapper interfaces. 
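The pattern applied throughout btree_types.h is simply (full diff below):

	/* before: a macro, invisible to bindgen */
	#define BTREE_ITER_SLOTS		(1 << 0)

	/* after: a typed constant bindgen can generate a binding for */
	static const u16 BTREE_ITER_SLOTS	= 1 << 0;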
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 38bc2a1d198b..bc4aa26b9486 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -184,33 +184,33 @@ struct btree_node_iter { /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 0) -#define BTREE_ITER_ALL_LEVELS (1 << 1) +static const u16 BTREE_ITER_SLOTS = 1 << 0; +static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 2) +static const u16 BTREE_ITER_INTENT = 1 << 2; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 3) +static const u16 BTREE_ITER_PREFETCH = 1 << 3; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 4) -#define BTREE_ITER_NOT_EXTENTS (1 << 5) -#define BTREE_ITER_CACHED (1 << 6) -#define BTREE_ITER_WITH_KEY_CACHE (1 << 7) -#define BTREE_ITER_WITH_UPDATES (1 << 8) -#define BTREE_ITER_WITH_JOURNAL (1 << 9) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) -#define BTREE_ITER_NOPRESERVE (1 << 13) -#define BTREE_ITER_CACHED_NOFILL (1 << 14) -#define BTREE_ITER_KEY_CACHE_FILL (1 << 15) +static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const u16 BTREE_ITER_CACHED = 1 << 6; +static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, -- cgit From a64adedb8663ded044e535159b2e723a04396845 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Feb 2023 22:30:54 -0500 Subject: bcachefs: ec: Ensure new stripe is closed in error path This fixes a use-after-free bug. 
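The use-after-free came from closing the stripe only on the success path; condensed from the ec.c hunk below, bch2_stripe_close() now runs after the error-path cleanup as well, before the stripe buffers are freed:

	err:
		bch2_disk_reservation_put(c, &s->res);
		/* ... release open buckets ... */

		bch2_stripe_close(c, s);	/* now reached on the error path too */

		ec_stripe_buf_exit(&s->existing_stripe);
		ec_stripe_buf_exit(&s->new_stripe);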
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 88f319992c37..cb291b2a4dd8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1107,8 +1107,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) bch2_err_str(ret)); goto err; } - - bch2_stripe_close(c, s); err: bch2_disk_reservation_put(c, &s->res); @@ -1124,6 +1122,8 @@ err: } } + bch2_stripe_close(c, s); + ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); -- cgit From 11bb67a4a31d9581a3148a75e023e680197f25fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Feb 2023 23:16:37 -0500 Subject: bcachefs: bch2_data_update_init() considers ptr durability Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 84c2963d4069..b3216f50bb3c 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -425,7 +425,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct extent_ptr_decoded p; const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned int ptrs_locked = 0; + unsigned ptrs_locked = 0; int ret; bch2_bkey_buf_init(&m->k); @@ -438,6 +438,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.version = k.k->version; m->op.target = data_opts.target; m->op.write_point = wp; + m->op.nr_replicas = 0; m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| @@ -456,17 +457,16 @@ int bch2_data_update_init(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bool locked; - if (((1U << i) & m->data_opts.rewrite_ptrs) && - p.ptr.cached) - BUG(); + if (((1U << i) & m->data_opts.rewrite_ptrs)) { + BUG_ON(p.ptr.cached); - if (!((1U << i) & m->data_opts.rewrite_ptrs) && - !p.ptr.cached) - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; - if (((1U << i) & m->data_opts.rewrite_ptrs) && - crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; + m->op.nr_replicas += bch2_extent_ptr_durability(c, &p); + } else if (!p.ptr.cached) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + } /* * op->csum_type is normally initialized from the fs/file's @@ -513,8 +513,8 @@ int bch2_data_update_init(struct btree_trans *trans, goto err; } - m->op.nr_replicas = m->op.nr_replicas_required = - hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; + m->op.nr_replicas += m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; BUG_ON(!m->op.nr_replicas); -- cgit From 2f4e9472fa67ff528973f71729175c4c715b1912 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Feb 2023 23:08:48 -0500 Subject: bcachefs: bch2_open_bucket_to_text() Factor out a common helper Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 47 +++++++++++++++++++++++++++++++++++------- fs/bcachefs/alloc_foreground.h | 1 + fs/bcachefs/sysfs.c | 5 +++++ 3 files changed, 46 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1405d6b8cbc5..161585de70c8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1329,23 +1329,56 @@ void 
bch2_fs_allocator_foreground_init(struct bch_fs *c) } } +static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned data_type = ob->data_type; + barrier(); /* READ_ONCE() doesn't work on bitfields */ + + prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + ob - c->open_buckets, + atomic_read(&ob->pin), + data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + ob->dev, ob->bucket, ob->gen, + ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); + if (ob->ec) + prt_printf(out, " ec idx %llu", ob->ec->idx); + if (ob->on_partial_list) + prt_str(out, " partial"); + prt_newline(out); +} + void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) { struct open_bucket *ob; + out->atomic++; + for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) { - prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", - ob - c->open_buckets, - atomic_read(&ob->pin), - bch2_data_types[ob->data_type], - ob->dev, ob->bucket, ob->gen); - } + if (ob->valid && !ob->on_partial_list) + bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } + + --out->atomic; +} + +void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned i; + + out->atomic++; + spin_lock(&c->freelist_lock); + + for (i = 0; i < c->open_buckets_partial_nr; i++) + bch2_open_bucket_to_text(out, c, + c->open_buckets + c->open_buckets_partial[i]); + + spin_unlock(&c->freelist_lock); + --out->atomic; } static const char * const bch2_write_point_states[] = { diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index ba7a87afda0e..4f492e278493 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -221,6 +221,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6cbdf70f36bd..8108e1d81345 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -194,6 +194,7 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); read_attribute(open_buckets); +read_attribute(open_buckets_partial); read_attribute(write_points); read_attribute(nocow_lock_table); @@ -455,6 +456,9 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c); + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + if (attr == &sysfs_write_points) bch2_write_points_to_text(out, c); @@ -663,6 +667,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_open_buckets_partial, &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, -- cgit From 8f2bbcdd9bc8f9c0a2d4b6801d1dfeb20e28e954 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Feb 2023 23:11:36 -0500 Subject: bcachefs: ec: Improve error message for btree node in stripe Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 
cb291b2a4dd8..88c4873e1833 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -898,8 +898,21 @@ static int ec_stripe_update_extent(struct btree_trans *trans, if (*bp_offset == U64_MAX) return 0; - if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) + if (bp.level) { + struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; + struct btree *b; + + b = bch2_backpointer_get_node(trans, &node_iter, bucket, *bp_offset, bp); + bch2_trans_iter_exit(trans, &node_iter); + + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); + bch2_backpointer_to_text(&buf, &bp); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; + } k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); ret = bkey_err(k); -- cgit From e902095868819c23433c6eba6636e2f1122da126 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Feb 2023 23:08:04 -0500 Subject: bcachefs: bch2_write_queue() Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ede2f3116935..dac0dad16f0a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -849,6 +849,18 @@ static void bch2_write_index(struct closure *cl) queue_work(wq, &wp->index_update_work); } +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->btree_update_ready = false; + op->wp = wp; + + spin_lock(&wp->writes_lock); + list_add_tail(&op->wp_list, &wp->writes); + if (wp->state == WRITE_POINT_stopped) + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock(&wp->writes_lock); +} + void bch2_write_point_do_index_updates(struct work_struct *work) { struct write_point *wp = @@ -1707,15 +1719,6 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - if (!(op->flags & BCH_WRITE_SYNC)) { - spin_lock(&wp->writes_lock); - op->wp = wp; - list_add_tail(&op->wp_list, &wp->writes); - if (wp->state == WRITE_POINT_stopped) - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock(&wp->writes_lock); - } - op->flags |= BCH_WRITE_DONE; if (ret < 0) { @@ -1754,6 +1757,7 @@ err: goto again; bch2_write_done(&op->cl); } else { + bch2_write_queue(op, wp); continue_at(&op->cl, bch2_write_index, NULL); } out_nofs_restore: -- cgit From 2611a041ae9c35b1bf4e7e1462c77c4096490a4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Mar 2023 22:14:31 -0500 Subject: bcachefs: bch2_mark_key() now takes btree_id & level btree & level are passed to trans_mark - for backpointers - bch2_mark_key() should take them as well. 
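In brief, the marking hook changes shape as follows (full diff below); the ->atomic_trigger method and all its implementations gain the same two arguments:

	/* before: */
	int bch2_mark_key(struct btree_trans *, struct bkey_s_c old,
			  struct bkey_s_c new, unsigned flags);

	/* after: btree id and level are threaded through, as trans_mark already does */
	int bch2_mark_key(struct btree_trans *, enum btree_id btree, unsigned level,
			  struct bkey_s_c old, struct bkey_s_c new, unsigned flags);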
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 12 ++++++------ fs/bcachefs/btree_gc.c | 6 +++--- fs/bcachefs/btree_update_leaf.c | 9 ++++++--- fs/bcachefs/buckets.c | 6 ++++++ fs/bcachefs/buckets.h | 18 ++++++++++++------ fs/bcachefs/subvolume.c | 3 ++- fs/bcachefs/subvolume.h | 4 ++-- 7 files changed, 37 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 9a6afab87f6c..6ae517884a37 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -29,8 +29,8 @@ struct bkey_ops { bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); + int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -68,14 +68,14 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); static inline int bch2_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; return ops->atomic_trigger - ? ops->atomic_trigger(trans, old, new, flags) + ? ops->atomic_trigger(trans, btree, level, old, new, flags) : 0; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8ae838acd853..6a0eaa661002 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -808,7 +808,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = commit_do(trans, NULL, NULL, 0, - bch2_mark_key(trans, old, *k, flags)); + bch2_mark_key(trans, btree_id, level, old, *k, flags)); fsck_err: err: if (ret) @@ -887,7 +887,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, &k, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); @@ -1040,7 +1040,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, if (!ret) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, &k, true); } fsck_err: diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0bf4116442f4..de6fedf4bdd0 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -434,7 +434,8 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (bch2_bkey_ops[old.k->type].atomic_trigger == bch2_bkey_ops[i->k->k.type].atomic_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { struct bkey _deleted = KEY(0, 0, 0); @@ -442,9 +443,11 @@ static int run_one_mem_trigger(struct btree_trans *trans, _deleted.p = i->path->pos; - ret = 
bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, old, deleted, + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7c4780b3ceb5..40e3d649a05e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -486,6 +486,7 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, } int bch2_mark_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -929,6 +930,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, } int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1012,6 +1014,7 @@ int bch2_mark_extent(struct btree_trans *trans, } int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1117,6 +1120,7 @@ int bch2_mark_stripe(struct btree_trans *trans, } int bch2_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1148,6 +1152,7 @@ int bch2_mark_inode(struct btree_trans *trans, } int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1234,6 +1239,7 @@ fsck_err: } int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3398c9c3a81b..e6d476f387a1 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -235,12 +235,18 @@ int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, 
unsigned); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ba281104eb30..d76239654a89 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -76,6 +76,7 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -361,7 +362,7 @@ int bch2_fs_snapshots_start(struct bch_fs *c) for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: + bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(&trans, k)); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 7c488c3d78e0..df6657952e2f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -8,8 +8,8 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ -- cgit From c85d7796090741fe6a75f953afae964344066448 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Mar 2023 23:10:39 -0500 Subject: bcachefs: bch2_copygc_wait_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 12 ++++++++++++ fs/bcachefs/movinggc.h | 2 ++ fs/bcachefs/sysfs.c | 7 ++++--- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 74e57f6ea148..63b358c95282 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -172,6 +172,18 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) return wait; } +void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "Currently waiting for: "); + prt_human_readable_u64(out, max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + prt_newline(out); + + prt_printf(out, "Currently calculated wait: "); + prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); + prt_newline(out); +} + static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index e85c8136a46e..ea181fef5bc9 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -3,6 +3,8 @@ #define _BCACHEFS_MOVINGGC_H unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); + void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8108e1d81345..4abba2c9ccfa 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -27,6 +27,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "movinggc.h" #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" @@ -427,9 +428,9 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - sysfs_hprint(copy_gc_wait, - max(0LL, c->copygc_wait - - atomic64_read(&c->io_clock[WRITE].now)) << 9); + + if (attr == 
&sysfs_copy_gc_wait) + bch2_copygc_wait_to_text(out, c); if (attr == &sysfs_rebalance_work) bch2_rebalance_work_to_text(out, c); -- cgit From b1cfe5ed2b5d5dbd2d8bcb2a4c1131513a1b3e1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 01:08:46 -0500 Subject: bcachefs: Improve dev_alloc_debug_to_text() Now we also print the number of buckets reserved for each watermark. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/buckets.h | 2 + fs/bcachefs/printbuf.h | 2 +- fs/bcachefs/sysfs.c | 126 ++++++++++++++++++++++++++++++++++------------ fs/bcachefs/util.h | 2 +- 5 files changed, 99 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 2e6f48069258..c8a45ea9d661 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -28,6 +28,7 @@ enum alloc_reserve { #define x(name) RESERVE_##name, BCH_ALLOC_RESERVES() #undef x + RESERVE_NR, }; #define OPEN_BUCKETS_COUNT 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e6d476f387a1..22721bfea414 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -155,6 +155,8 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser s64 reserved = 0; switch (reserve) { + case RESERVE_NR: + unreachable(); case RESERVE_none: reserved += ca->mi.nbuckets >> 6; fallthrough; diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 2e9939957833..2191423d9f22 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -71,7 +71,7 @@ enum printbuf_si { PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ }; -#define PRINTBUF_INLINE_TABSTOPS 4 +#define PRINTBUF_INLINE_TABSTOPS 6 struct printbuf { char *buf; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4abba2c9ccfa..e3a166f79cb6 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -821,38 +821,100 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; - prt_printf(out, - "\t\t\t buckets\t sectors fragmented\n" - "capacity\t%16llu\n", - ca->mi.nbuckets - ca->mi.first_bucket); - - for (i = 0; i < BCH_DATA_NR; i++) - prt_printf(out, "%-16s%16llu%16llu%16llu\n", - bch2_data_types[i], stats.d[i].buckets, - stats.d[i].sectors, stats.d[i].fragmented); - - prt_printf(out, - "ec\t\t%16llu\n" - "\n" - "freelist_wait\t\t%s\n" - "open buckets allocated\t%u\n" - "open buckets this dev\t%u\n" - "open buckets total\t%u\n" - "open_buckets_wait\t%s\n" - "open_buckets_btree\t%u\n" - "open_buckets_user\t%u\n" - "buckets_to_invalidate\t%llu\n" - "btree reserve cache\t%u\n", - stats.buckets_ec, - c->freelist_wait.list.first ? "waiting" : "empty", - OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, - ca->nr_open_buckets, - OPEN_BUCKETS_COUNT, - c->open_buckets_wait.list.first ? 
"waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - should_invalidate_buckets(ca, stats), - c->btree_reserve_cache_nr); + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (i = 0; i < BCH_DATA_NR; i++) { + prt_str(out, bch2_data_types[i]); + prt_tab(out); + prt_u64(out, stats.d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_str(out, "ec"); + prt_tab(out); + prt_u64(out, stats.buckets_ec); + prt_tab_rjust(out); + prt_newline(out); + + prt_newline(out); + + prt_printf(out, "reserves:"); + prt_newline(out); + for (i = 0; i < RESERVE_NR; i++) { + prt_str(out, bch2_alloc_reserves[i]); + prt_tab(out); + prt_u64(out, bch2_dev_buckets_reserved(ca, i)); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + prt_str(out, "freelist_wait"); + prt_tab(out); + prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open buckets allocated"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_newline(out); + + prt_str(out, "open buckets this dev"); + prt_tab(out); + prt_u64(out, ca->nr_open_buckets); + prt_newline(out); + + prt_str(out, "open buckets total"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT); + prt_newline(out); + + prt_str(out, "open_buckets_wait"); + prt_tab(out); + prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open_buckets_btree"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_btree]); + prt_newline(out); + + prt_str(out, "open_buckets_user"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_user]); + prt_newline(out); + + prt_str(out, "buckets_to_invalidate"); + prt_tab(out); + prt_u64(out, should_invalidate_buckets(ca, stats)); + prt_newline(out); + + prt_str(out, "btree reserve cache"); + prt_tab(out); + prt_u64(out, c->btree_reserve_cache_nr); + prt_newline(out); } static const char * const bch2_rw[] = { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4188f380f54f..44c6a2a10f35 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -236,7 +236,7 @@ do { \ #define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) #define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) -#define prt_u64(_out, _v) prt_printf(_out, "%llu", _v) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) #define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) #define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) #define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) -- cgit From 1306f87de399a0c791f03d68b50e03bdb3f409ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 02:12:18 -0500 Subject: bcachefs: Plumb btree_trans through btree cache code Soon, __bch2_btree_node_write() is going to require a btree_trans: zoned device support is going to require a new allocation for every btree node write. This is a bit of prep work. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 27 ++++++++++++++------------- fs/bcachefs/btree_cache.h | 4 ++-- fs/bcachefs/btree_io.c | 14 +++++++++++--- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_update_interior.c | 15 +++++++++++---- 5 files changed, 40 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 769f17b67fcf..76cad6109297 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -561,8 +561,9 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct list_head *freed = pcpu_read_locks ? &bc->freed_pcpu @@ -673,8 +674,7 @@ err: } /* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_trans *trans, +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, @@ -682,6 +682,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, enum six_lock_type lock_type, bool sync) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; u32 seq; @@ -691,14 +692,14 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + if (path && !bch2_btree_node_relock(trans, path, level + 1)) { trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); - if (trans && b == ERR_PTR(-ENOMEM)) { + if (b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -744,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!sync) return NULL; - if (trans) { + if (path) { int ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { @@ -754,7 +755,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (trans) + if (path) trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -820,7 +821,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, + b = bch2_btree_node_fill(trans, path, k, path->btree_id, level, lock_type, true); /* We raced and found the btree node in the cache */ @@ -1029,7 +1030,7 @@ retry: if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, + b = bch2_btree_node_fill(trans, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ @@ -1089,12 +1090,12 @@ out: return b; } -int bch2_btree_node_prefetch(struct bch_fs *c, - struct 
btree_trans *trans, +int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -1105,7 +1106,7 @@ int bch2_btree_node_prefetch(struct bch_fs *c, if (b) return 0; - b = bch2_btree_node_fill(c, trans, path, k, btree_id, + b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 238da8dbc5da..56ea27340771 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -21,7 +21,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, @@ -30,7 +30,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, enum btree_id, unsigned, bool); -int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, +int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 66747fe21323..7ffdce97214e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1610,9 +1610,10 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, } } -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) +static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, + const struct bkey_i *k, unsigned level) { + struct bch_fs *c = trans->c; struct closure cl; struct btree *b; int ret; @@ -1624,7 +1625,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); @@ -1655,6 +1656,13 @@ err: return ret; } +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); + +} + void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eebab7534f62..e6dcca59e31a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -815,7 +815,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); } @@ -850,7 +850,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p break; bch2_bkey_buf_reassemble(&tmp, c, k); - ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, 
path->level - 1); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 79be89006403..bce6a9d7795c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -300,7 +300,7 @@ retry: bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: - b = bch2_btree_node_mem_alloc(c, interior_node); + b = bch2_btree_node_mem_alloc(trans, interior_node); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -2261,7 +2261,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite return ret; } - new_hash = bch2_btree_node_mem_alloc(c, false); + new_hash = bch2_btree_node_mem_alloc(trans, false); } path->intent_ref++; @@ -2324,8 +2324,9 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) { + struct bch_fs *c = trans->c; struct closure cl; struct btree *b; int ret; @@ -2337,7 +2338,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c, false); + b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); @@ -2366,6 +2367,12 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); + return 0; +} + +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +{ + bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -- cgit From 3329cf1bb91d6293a96cf35ad72b2a2e1e1c0e3d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Mar 2023 00:03:01 -0500 Subject: bcachefs: Centralize btree node lock initialization This fixes some confusion in the lockdep code due to initializing btree node/key cache locks with the same lockdep key, but different names. 
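For reference, the shared helper introduced by this patch (copied from the btree_locking.c hunk below), which both the btree node cache and the key cache now call so that every node/key-cache lock is initialized with the same lockdep class key and the same name:

void bch2_btree_lock_init(struct btree_bkey_cached_common *b)
{
	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key);
	lockdep_set_novalidate_class(&b->lock);
}

Previously the node cache used six_lock_init() while the key cache open-coded __six_lock_init() with this key; after this patch both call the helper above.
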
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +-- fs/bcachefs/btree_io.c | 6 ++++++ fs/bcachefs/btree_key_cache.c | 3 +-- fs/bcachefs/btree_locking.c | 18 +++++++++++++++++- fs/bcachefs/btree_locking.h | 8 +++++++- 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 76cad6109297..9b331c319acc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -119,8 +119,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - six_lock_init(&b->c.lock); - lockdep_set_novalidate_class(&b->c.lock); + bch2_btree_lock_init(&b->c); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7ffdce97214e..358a285c3bcf 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -33,6 +33,8 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { + bch2_assert_btree_nodes_not_locked(); + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -51,12 +53,16 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { + bch2_assert_btree_nodes_not_locked(); + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); } void bch2_btree_node_wait_on_write(struct btree *b) { + bch2_assert_btree_nodes_not_locked(); + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 67db6b9d8e10..1e692c0a2f3a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -282,8 +282,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return NULL; init: INIT_LIST_HEAD(&ck->list); - __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); - lockdep_set_novalidate_class(&ck->c.lock); + bch2_btree_lock_init(&ck->c); if (pcpu_readers) six_lock_pcpu_alloc(&ck->c.lock); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 14a0614af436..9e097ab668a6 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -4,7 +4,23 @@ #include "btree_locking.h" #include "btree_types.h" -struct lock_class_key bch2_btree_node_lock_key; +static struct lock_class_key bch2_btree_node_lock_key; + +void bch2_btree_lock_init(struct btree_bkey_cached_common *b) +{ + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key); + lockdep_set_novalidate_class(&b->lock); +} + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void) +{ +#if 0 + //Re-enable when lock_class_is_held() is merged: + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); +#endif +} +#endif /* Btree node locking: */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 30c89daa5009..76aac49966fe 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,7 +13,13 @@ #include "btree_iter.h" #include "six.h" -extern struct lock_class_key bch2_btree_node_lock_key; +void bch2_btree_lock_init(struct btree_bkey_cached_common *); + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void); +#else +static inline void bch2_assert_btree_nodes_not_locked(void) {} +#endif static inline bool is_btree_node(struct btree_path *path, unsigned l) { -- cgit From 
910659763edaba58aa04bd5924dd933bc08cc56f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 1 Mar 2023 21:47:07 -0500 Subject: bcachefs: Mark stripe buckets with correct data type Currently, we don't use bucket data type for tracking whether buckets are part of a stripe; parity buckets are BCH_DATA_parity, but data buckets in a stripe are BCH_DATA_user. There's a separate counter, buckets_ec, outside the BCH_DATA_TYPES system for tracking number of buckets on a device that are part of a stripe. The trouble with this approach is that it's too coarse grained, and we need better information on fragmentation for debugging copygc. With this patch, data buckets in a stripe are now tracked as BCH_DATA_stripe buckets. This doesn't yet differentiate between erasure coded and non-erasure coded data in a stripe bucket, nor do we yet track empty data buckets in stripes. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 26 +++++++++++++++++++------- fs/bcachefs/backpointers.h | 10 +++++++++- fs/bcachefs/btree_gc.c | 15 ++++++++++++--- fs/bcachefs/buckets.c | 14 ++++++++------ fs/bcachefs/movinggc.c | 10 +++++++--- 5 files changed, 55 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 96ac8f396d46..c9ff590ef978 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -44,10 +44,10 @@ static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, struct bch_alloc_v4 a, enum bch_data_type data_type) { + if (stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; if (dirty_sectors) return data_type; - if (stripe) - return BCH_DATA_stripe; if (cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) @@ -64,19 +64,31 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, a.stripe, a, data_type); } +static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) +{ + return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; } +#define DATA_TYPES_MOVABLE \ + ((1U << BCH_DATA_btree)| \ + (1U << BCH_DATA_user)| \ + (1U << BCH_DATA_stripe)) + +static inline bool data_type_movable(enum bch_data_type type) +{ + return (1U << type) & DATA_TYPES_MOVABLE; +} + static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { - if (a.data_type != BCH_DATA_btree && - a.data_type != BCH_DATA_user) - return 0; - - if (a.dirty_sectors >= ca->mi.bucket_size) + if (!data_type_movable(a.data_type) || + a.dirty_sectors >= ca->mi.bucket_size) return 0; return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index ded1ab7fb0bc..314fee21dc27 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -96,12 +96,20 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); } +static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p) +{ + return level ? BCH_DATA_btree : + p.has_ec ? 
BCH_DATA_stripe : + BCH_DATA_user; +} + static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6a0eaa661002..df4cdd16c08d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -633,8 +633,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->data_type && - g->data_type != data_type, c, + if (fsck_err_on(bucket_data_type(g->data_type) && + bucket_data_type(g->data_type) != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -1397,6 +1397,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gen_after(old->gen, gc.gen)) return 0; + if (c->opts.reconstruct_alloc || + fsck_err_on(new.data_type != gc.data_type, c, + "bucket %llu:%llu gen %u has wrong data_type" + ": got %s, should be %s", + iter->pos.inode, iter->pos.offset, + gc.gen, + bch2_data_types[new.data_type], + bch2_data_types[gc.data_type])) + new.data_type = gc.data_type; + #define copy_bucket_field(_f) \ if (c->opts.reconstruct_alloc || \ fsck_err_on(new._f != gc._f, c, \ @@ -1409,7 +1419,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new._f = gc._f; \ copy_bucket_field(gen); - copy_bucket_field(data_type); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); copy_bucket_field(stripe_redundancy); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 40e3d649a05e..c7139dd8e1dc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -776,7 +776,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -811,8 +811,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (ret) goto err; - if (data_type) - g->data_type = data_type; + g->data_type = data_type; g->dirty_sectors += sectors; g->stripe = k.k->p.offset; @@ -851,15 +850,17 @@ static int __mark_pointer(struct btree_trans *trans, } static int bch2_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, + s64 sectors, unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket old, new, *g; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); u8 bucket_data_type; int ret = 0; @@ -963,8 +964,7 @@ int bch2_mark_extent(struct btree_trans *trans, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_mark_pointer(trans, k, p, disk_sectors, - data_type, flags); + ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); if (ret < 0) return ret; @@ -1596,6 +1596,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe = s.k->p.offset; a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; } else { if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || a->v.stripe_redundancy != s.v->nr_redundant, trans, @@ -1608,6 +1609,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe = 0; a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); } a->v.dirty_sectors += sectors; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 63b358c95282..bfa0463b4ffe 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -55,8 +55,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &_a); *gen = a->gen; - ret = (a->data_type == BCH_DATA_btree || - a->data_type == BCH_DATA_user) && + ret = data_type_movable(a->data_type) && a->fragmentation_lru && a->fragmentation_lru <= time; @@ -158,13 +157,18 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) struct bch_dev *ca; unsigned dev_idx; s64 wait = S64_MAX, fragmented_allowed, fragmented; + unsigned i; for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ca->mi.bucket_size) >> 1); - fragmented = usage.d[BCH_DATA_user].fragmented; + fragmented = 0; + + for (i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage.d[i].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } -- cgit From a1fb08f5df6a8b2995d9daf8c2997cd478b51c55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Mar 2023 02:43:39 -0500 Subject: bcachefs: Plumb alloc_reserve through stripe create path Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 7 +++---- fs/bcachefs/ec.c | 40 +++++++++++++++++----------------------- fs/bcachefs/ec.h | 5 +++-- 3 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 161585de70c8..5aab85f1a835 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -794,6 +794,7 @@ static int bucket_alloc_from_stripe(struct 
btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, + enum alloc_reserve reserve, unsigned flags, struct closure *cl) { @@ -813,9 +814,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, - wp == &c->copygc_write_point, - cl); + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, reserve, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) @@ -926,7 +925,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, - have_cache, flags, _cl); + have_cache, reserve, flags, _cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 88c4873e1833..4bf655b0de0a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1336,7 +1336,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy, - bool copygc) + enum alloc_reserve reserve) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1352,7 +1352,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->target = target; h->algo = algo; h->redundancy = redundancy; - h->copygc = copygc; + h->reserve = reserve; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1387,7 +1387,7 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, - bool copygc) + enum alloc_reserve reserve) { struct bch_fs *c = trans->c; struct ec_stripe_head *h; @@ -1404,21 +1404,21 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, if (h->target == target && h->algo == algo && h->redundancy == redundancy && - h->copygc == copygc) { + h->reserve == reserve) { ret = bch2_trans_mutex_lock(trans, &h->lock); if (ret) h = ERR_PTR(ret); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, reserve); found: mutex_unlock(&c->ec_stripe_head_lock); return h; } static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, - struct closure *cl) + enum alloc_reserve reserve, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; @@ -1428,14 +1428,12 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bool have_cache = true; int ret = 0; - for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { - if (test_bit(i, h->s->blocks_gotten)) { - __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); - if (i < h->s->nr_data) - nr_have_data++; - else - nr_have_parity++; - } + for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; } BUG_ON(nr_have_data > h->s->nr_data); @@ -1449,9 +1447,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ h->s->nr_parity, &nr_have_parity, &have_cache, - h->copygc - ? 
RESERVE_movinggc - : RESERVE_none, + reserve, 0, cl); @@ -1478,9 +1474,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ h->s->nr_data, &nr_have_data, &have_cache, - h->copygc - ? RESERVE_movinggc - : RESERVE_none, + reserve, 0, cl); @@ -1640,7 +1634,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, - bool copygc, + enum alloc_reserve reserve, struct closure *cl) { struct bch_fs *c = trans->c; @@ -1648,7 +1642,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; bool needs_stripe_new; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, copygc); + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, reserve); if (!h) bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) @@ -1685,7 +1679,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, } if (!h->s->allocated) { - ret = new_stripe_alloc_buckets(trans, h, cl); + ret = new_stripe_alloc_buckets(trans, h, reserve, cl); if (ret) goto err; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index c9e4fb214649..56d1b5e7d797 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -181,7 +181,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; - bool copygc; + enum alloc_reserve reserve; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -205,7 +205,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, bool, struct closure *); + unsigned, unsigned, unsigned, + enum alloc_reserve, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -- cgit From 9d32097f3b6617680c75ccbb5cd36d89bcff0dfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Mar 2023 03:11:06 -0500 Subject: bcachefs: More stripe create cleanup/fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +++-- fs/bcachefs/ec.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index df4cdd16c08d..f6db56765f38 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1702,11 +1702,12 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, return 0; inconsistent: if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" + "stripe block %u has wrong sector count:\n" " %s\n" - " should be %u", i, + " got %u, should be %u", i, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf), + stripe_blockcount_get(s, i), m ? 
m->block_sectors[i] : 0)) { struct bkey_i_stripe *new; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4bf655b0de0a..d206da686da8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -866,8 +866,16 @@ static int ec_stripe_key_update(struct btree_trans *trans, goto err; } - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, stripe_blockcount_get(old, i)); + for (i = 0; i < new->v.nr_blocks; i++) { + unsigned v = stripe_blockcount_get(old, i); + + BUG_ON(v && + (old->ptrs[i].dev != new->v.ptrs[i].dev || + old->ptrs[i].gen != new->v.ptrs[i].gen || + old->ptrs[i].offset != new->v.ptrs[i].offset)); + + stripe_blockcount_set(&new->v, i, v); + } } ret = bch2_trans_update(trans, &iter, &new->k_i, 0); @@ -1542,10 +1550,11 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri if (idx < 0) return -BCH_ERR_ENOSPC_stripe_reuse; - h->s->have_existing_stripe = true; ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); if (ret) { - bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + bch2_stripe_close(c, h->s); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); return ret; } @@ -1569,8 +1578,8 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); } - bkey_copy(&h->s->new_stripe.key.k_i, - &h->s->existing_stripe.key.k_i); + bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i); + h->s->have_existing_stripe = true; return 0; } @@ -1584,13 +1593,14 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - BUG_ON(h->s->res.sectors); - - ret = bch2_disk_reservation_get(c, &h->s->res, + if (!h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, h->blocksize, - h->s->nr_parity, 0); - if (ret) - return ret; + h->s->nr_parity, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { @@ -1673,10 +1683,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret && needs_stripe_new) ret = __bch2_ec_stripe_head_reuse(trans, h); - if (ret) { - bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); + if (ret) goto err; - } if (!h->s->allocated) { ret = new_stripe_alloc_buckets(trans, h, reserve, cl); -- cgit From d57c9add59b187a6fcd76cb80d60f36234ca8033 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Mar 2023 23:08:11 -0500 Subject: bcachefs: Improve error message for stripe block sector counts wrong Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f6db56765f38..37017eea2323 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1687,6 +1687,7 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct printbuf buf = PRINTBUF; const struct bch_stripe *s; struct gc_stripe *m; + bool bad = false; unsigned i; int ret = 0; @@ -1696,19 +1697,21 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, s = bkey_s_c_to_stripe(k).v; m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, 
i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - return 0; -inconsistent: - if (fsck_err_on(true, c, - "stripe block %u has wrong sector count:\n" - " %s\n" - " got %u, should be %u", i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf), - stripe_blockcount_get(s, i), - m ? m->block_sectors[i] : 0)) { + for (i = 0; i < s->nr_blocks; i++) { + u32 old = stripe_blockcount_get(s, i); + u32 new = (m ? m->block_sectors[i] : 0); + + if (old != new) { + prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", + i, old, new); + bad = true; + } + } + + if (bad) + bch2_bkey_val_to_text(&buf, c, k); + + if (fsck_err_on(bad, c, "%s", buf.buf)) { struct bkey_i_stripe *new; new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -- cgit From e84face6f0c9512d896eb1bf6c8238ea2fa7edd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 01:54:17 -0500 Subject: bcachefs: RESERVE_stripe Rework stripe creation path - new algorithm for deciding when to create new stripes or reuse existing stripes. We add a new allocation watermark, RESERVE_stripe, above RESERVE_none. Then we always try to create a new stripe by doing RESERVE_stripe allocations; if this fails, we reuse an existing stripe and allocate buckets for it with the reserve watermark for the given write (RESERVE_none or RESERVE_movinggc). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_types.h | 3 +- fs/bcachefs/buckets.h | 3 ++ fs/bcachefs/ec.c | 73 ++++++++++++++++++++++++++++++------------ fs/bcachefs/errcode.h | 1 + 5 files changed, 60 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c9ff590ef978..324798396fc6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -216,7 +216,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, u64 free = max_t(s64, 0, u.d[BCH_DATA_free].buckets + u.d[BCH_DATA_need_discard].buckets - - bch2_dev_buckets_reserved(ca, RESERVE_none)); + - bch2_dev_buckets_reserved(ca, RESERVE_stripe)); return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index c8a45ea9d661..4d09bd20d8ec 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -22,7 +22,8 @@ struct ec_bucket_buf; x(btree_movinggc) \ x(btree) \ x(movinggc) \ - x(none) + x(none) \ + x(stripe) enum alloc_reserve { #define x(name) RESERVE_##name, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 22721bfea414..d677b0225c52 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -157,6 +157,9 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser switch (reserve) { case RESERVE_NR: unreachable(); + case RESERVE_stripe: + reserved += ca->mi.nbuckets >> 6; + fallthrough; case RESERVE_none: reserved += ca->mi.nbuckets >> 6; fallthrough; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d206da686da8..6bf14f975d93 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1569,6 +1569,17 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri BUG_ON(h->s->existing_stripe.size != h->blocksize); BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ + for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { 
+ bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { __set_bit(i, h->s->blocks_gotten); @@ -1649,8 +1660,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct ec_stripe_head *h; + bool waiting = false; int ret; - bool needs_stripe_new; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, reserve); if (!h) @@ -1658,8 +1669,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (IS_ERR_OR_NULL(h)) return h; - needs_stripe_new = !h->s; - if (needs_stripe_new) { + if (!h->s) { if (ec_new_stripe_alloc(c, h)) { ret = -ENOMEM; bch_err(c, "failed to allocate new stripe"); @@ -1670,30 +1680,53 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, BUG(); } - /* - * Try reserve a new stripe before reusing an - * existing stripe. This will prevent unnecessary - * read amplification during write oriented workloads. - */ - ret = 0; - if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) - ret = __bch2_ec_stripe_head_reserve(trans, h); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; + if (h->s->allocated) + goto allocated; - if (ret && needs_stripe_new) - ret = __bch2_ec_stripe_head_reuse(trans, h); - if (ret) + if (h->s->have_existing_stripe) + goto alloc_existing; + + /* First, try to allocate a full stripe: */ + ret = new_stripe_alloc_buckets(trans, h, RESERVE_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (!ret) + goto allocated; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) goto err; - if (!h->s->allocated) { - ret = new_stripe_alloc_buckets(trans, h, reserve, cl); - if (ret) + /* + * Not enough buckets available for a full stripe: we must reuse an + * existing stripe: + */ + while (1) { + ret = __bch2_ec_stripe_head_reuse(trans, h); + if (!ret) + break; + if (ret == -BCH_ERR_ENOSPC_stripe_reuse && cl) + ret = -BCH_ERR_stripe_alloc_blocked; + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - h->s->allocated = true; + /* XXX freelist_wait? 
*/ + closure_wait(&c->freelist_wait, cl); + waiting = true; } + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc_existing: + /* + * Retry allocating buckets, with the reserve watermark for this + * particular write: + */ + ret = new_stripe_alloc_buckets(trans, h, reserve, cl); + if (ret) + goto err; +allocated: + h->s->allocated = true; + BUG_ON(!h->s->idx); + BUG_ON(trans->restarted); return h; err: diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 6129af6129c3..283303db7dfd 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -93,6 +93,7 @@ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ -- cgit From 2f528663c5abf101aae90b2adcce715bda424bfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Mar 2023 02:51:12 -0500 Subject: bcachefs: moving_context->stats is allowed to be NULL Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/move.c | 37 +++++++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b3216f50bb3c..26b351d48940 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -273,7 +273,7 @@ next: } continue; nomatch: - if (m->ctxt) { + if (m->ctxt && m->ctxt->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 814ca33a15f6..89d6635e2169 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -303,12 +303,6 @@ static int bch2_move_extent(struct btree_trans *trans, if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; - io->write.ctxt = ctxt; - io->write.op.end_io = move_write_done; - - atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - if (ret == -BCH_ERR_unwritten_extent_update) { bch2_update_unwritten_extent(trans, &io->write); move_free(io); @@ -317,6 +311,14 @@ static int bch2_move_extent(struct btree_trans *trans, BUG_ON(ret); + io->write.ctxt = ctxt; + io->write.op.end_io = move_write_done; + + if (ctxt->stats) { + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + } + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); trace_move_extent_read(k.k); @@ -468,9 +470,11 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->btree_id = btree_id; - ctxt->stats->pos = start; + if (ctxt->stats) { + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->btree_id = btree_id; + ctxt->stats->pos = start; + } bch2_trans_iter_init(&trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| @@ -495,7 +499,8 @@ static int __bch2_move_data(struct moving_context *ctxt, if (bkey_ge(bkey_start_pos(k.k), end)) break; - ctxt->stats->pos = iter.pos; + if (ctxt->stats) + ctxt->stats->pos = iter.pos; if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -535,7 +540,8 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ctxt->rate) 
bch2_ratelimit_increment(ctxt->rate, k.k->size); next: - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); } @@ -759,7 +765,8 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); } else { struct btree *b; @@ -786,8 +793,10 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, c->opts.btree_node_size >> 9); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + if (ctxt->stats) { + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } } next: bp_offset++; -- cgit From 45dd05b3ecc371560f9e36e4b57295ee338ee879 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Mar 2023 22:36:02 -0500 Subject: bcachefs: BKEY_PADDED_ONSTACK() Rust bindgen doesn't do anonymous structs very nicely: BKEY_PADDED() only needs the anonymous struct when it's used on the stack, to guarantee layout, not when it's embedded in another struct. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 509a16469613..baaa4cd3caa7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1123,4 +1123,7 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4d3c5e2f7ea6..f5587ed7dfd2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -322,7 +322,7 @@ static inline void bkey_init(struct bkey *k) #define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) #define __BKEY_PADDED(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } + struct bkey_i key; __u64 key ## _pad[pad] /* * - DELETED keys are used internally to mark keys that should be ignored but diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 358a285c3bcf..159a97596052 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1847,7 +1847,7 @@ static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); struct bch_extent_ptr *ptr; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bce6a9d7795c..9e77d2e3b421 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -242,7 +242,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct bch_fs *c = trans->c; struct write_point *wp; struct btree *b; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + BKEY_PADDED_ONSTACK(k, 
BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; -- cgit From ac2ccddc2689d5889fd1520383738b60dbafc1d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Mar 2023 23:05:55 -0500 Subject: bcachefs: Drop some anonymous structs, unions Rust bindgen doesn't cope well with anonymous structs and unions. This patch drops the fancy anonymous structs & unions in bkey_i that let us use the same helpers for bkey_i and bkey_packed; since bkey_packed is an internal type that's never exposed to outside code, it's only a minor inconvenienc. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 12 ++---------- fs/bcachefs/bkey.h | 7 ++++++- fs/bcachefs/bkey_sort.c | 8 ++++---- fs/bcachefs/bset.c | 20 ++++++++++---------- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_io.c | 32 ++++++++++++++++---------------- fs/bcachefs/btree_key_cache.c | 6 +++--- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/debug.c | 2 +- fs/bcachefs/extents.h | 2 +- fs/bcachefs/io.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_io.h | 11 ++++++++--- fs/bcachefs/recovery.c | 8 ++++---- fs/bcachefs/reflink.c | 6 ++++++ 15 files changed, 66 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f5587ed7dfd2..aae658d17797 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -294,16 +294,8 @@ enum bch_bkey_fields { struct bkey_i { __u64 _data[0]; - union { - struct { - /* Size of combined key and value, in u64s */ - __u8 u64s; - }; - struct { - struct bkey k; - struct bch_val v; - }; - }; + struct bkey k; + struct bch_val v; }; #define KEY(_inode, _offset, _size) \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 28a70ad5a25d..dbe4873cad02 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -42,7 +42,12 @@ struct bkey_s { }; }; -#define bkey_next(_k) vstruct_next(_k) +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) (k->_data + k->k.u64s); +} #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 557a79cad986..cdef41db7692 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -46,7 +46,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(!iter->used); - i->k = bkey_next(i->k); + i->k = bkey_p_next(i->k); BUG_ON(i->k > i->end); @@ -108,7 +108,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); + out = bkey_p_next(out); } sort_iter_advance(iter, key_sort_fix_overlapping_cmp); @@ -147,7 +147,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, out->needs_whiteout = false; btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); + out = bkey_p_next(out); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -194,7 +194,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, bkey_copy(out, in); } out->needs_whiteout |= needs_whiteout; - out = bkey_next(out); + out = bkey_p_next(out); } return (u64 *) out - (u64 *) dst; diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 50a1c9d8ebab..a7cb12ea2a04 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -66,7 +66,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for 
(_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next(_k); + _n = bkey_p_next(_k); k = bkey_disassemble(b, _k, &uk); @@ -539,7 +539,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next(k); + k = bkey_p_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -730,7 +730,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); + prev = k, k = bkey_p_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -747,7 +747,7 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next(k); + prev = k, k = bkey_p_next(k); if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); @@ -885,7 +885,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next(i)) + for (i = p; i != k; i = bkey_p_next(i)) if (i->type >= min_key_type) ret = i; @@ -896,10 +896,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next(ret) + ? bkey_p_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next(i)) + i = bkey_p_next(i)) BUG_ON(i->type >= min_key_type); } @@ -971,7 +971,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next(k); + k = bkey_p_next(k); if (k == end) break; @@ -1205,12 +1205,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next(m); + m = bkey_p_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next(m); + m = bkey_p_next(m); if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index fd2915a15070..2105e7836557 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -211,7 +211,7 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next(_k)) + _k = bkey_p_next(_k)) static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 159a97596052..5dc2b3ecb319 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -77,9 +77,9 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next(start); + for (p = start, k = bkey_p_next(start); k != end; - p = k, k = bkey_next(k)) { + p = k, k = bkey_p_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); @@ -92,7 +92,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) k->needs_whiteout = v; } @@ -175,7 +175,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); - k = bkey_next(k)) + k = bkey_p_next(k)) *--ptrs = k; sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); @@ -184,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) while (ptrs != ptrs_end) { 
bkey_copy(k, *ptrs); - k = bkey_next(k); + k = bkey_p_next(k); ptrs++; } @@ -256,11 +256,11 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_p_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); - out = bkey_next(out); + out = bkey_p_next(out); } else { BUG_ON(k->needs_whiteout); } @@ -652,7 +652,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) struct bset *i = bset(b, t); struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) break; @@ -665,7 +665,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) set_btree_bset_end(b, t); } - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) break; @@ -843,7 +843,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey_s u; struct bkey tmp; - if (btree_err_on(bkey_next(k) > vstruct_last(i), + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -854,7 +854,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -878,7 +878,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -901,14 +901,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } } prev = k; - k = bkey_next(k); + k = bkey_p_next(k); } fsck_err: printbuf_exit(&buf); @@ -1139,7 +1139,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); continue; @@ -1151,7 +1151,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next(k); + k = bkey_p_next(k); } bch2_bset_build_aux_tree(b, b->set, false); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1e692c0a2f3a..074c61f271d3 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -776,14 +776,14 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, struct bkey_cached *ck = (void *) path->l[0].b; bool kick_reclaim = false; - BUG_ON(insert->u64s > ck->u64s); + BUG_ON(insert->k.u64s > ck->u64s); if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { int difference; - BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); + BUG_ON(jset_u64s(insert->k.u64s) 
> trans->journal_preres.u64s); - difference = jset_u64s(insert->u64s) - ck->res.u64s; + difference = jset_u64s(insert->k.u64s) - ck->res.u64s; if (difference > 0) { trans->journal_preres.u64s -= difference; ck->res.u64s += difference; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9e77d2e3b421..1db5ef4f2257 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1413,7 +1413,7 @@ static void __btree_split_node(struct btree_update *as, out[i]->needs_whiteout = false; btree_keys_account_key_add(&n[i]->nr, 0, out[i]); - out[i] = bkey_next(out[i]); + out[i] = bkey_p_next(out[i]); } for (i = 0; i < 2; i++) { @@ -2445,7 +2445,7 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, BCH_JSET_ENTRY_btree_root, i, c->btree_roots[i].level, &c->btree_roots[i].key, - c->btree_roots[i].key.u64s); + c->btree_roots[i].key.k.u64s); end = vstruct_next(end); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8f43581f3972..0035fe875a47 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -153,7 +153,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) BUG_ON(b->nsets != 1); - for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) if (k->type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); v->mem_ptr = 0; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 2e37543a6229..bac6a1ed2c59 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -633,7 +633,7 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr memcpy((void *) &k->v + bkey_val_bytes(&k->k), &ptr, sizeof(ptr)); - k->u64s++; + k->k.u64s++; break; default: BUG(); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index dac0dad16f0a..a7e09772895e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -733,7 +733,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) } if (dst != src) - memmove_u64s_down(dst, src, src->u64s); + memmove_u64s_down(dst, src, src->k.u64s); dst = bkey_next(dst); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 377c07125183..51c26f9857d9 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -356,7 +356,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs struct bkey_i *k; bool first = true; - vstruct_for_each(entry, k) { + jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a32c2876f2a6..8801e98104bd 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -40,9 +40,14 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ - vstruct_for_each_safe(entry, k, _n) +#define jset_entry_for_each_key(_e, _k) \ + for (_k = (_e)->start; \ + _k < vstruct_last(_e); \ + _k = bkey_next(_k)) + +#define for_each_jset_key(k, entry, jset) \ + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ + jset_entry_for_each_key(entry, k) int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f5946b4dbce2..aafe4054d25d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -481,7 +481,7 @@ static int journal_keys_sort(struct bch_fs *c) struct genradix_iter iter; struct journal_replay *i, **_i; struct jset_entry *entry; - struct bkey_i *k, *_n; + struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; struct journal_key *src, *dst; size_t nr_keys = 0; @@ -492,7 +492,7 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - for_each_jset_key(k, _n, entry, &i->j) + for_each_jset_key(k, entry, &i->j) nr_keys++; } @@ -511,7 +511,7 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - for_each_jset_key(k, _n, entry, &i->j) + for_each_jset_key(k, entry, &i->j) keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, @@ -871,7 +871,7 @@ static int verify_superblock_clean(struct bch_fs *c, IS_ERR(k1) || IS_ERR(k2) || k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || + memcmp(k1, k2, bkey_bytes(&k1->k)) || l1 != l2, c, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 87446f7bad4f..d2e6adc13fb1 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -233,7 +233,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + + /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); +#else memset(&r_p->v, 0, sizeof(r_p->v)); +#endif r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -- cgit From 5be6a274ff7a7cd9640555db63d60127c6434e1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Mar 2023 02:52:40 -0500 Subject: bcachefs: Fix stripe reuse path It's possible that we reuse a stripe that doesn't have quite the same configuration as the stripe_head we're allocating from. In that case, we have to make sure that the new stripe uses the settings from the stripe we reuse, not the stripe head, and make sure the buffer is allocated correctly. This fixes the ec_mixed_tiers test.
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 52 +++++++++++++++++++++++++++++++++------------------ fs/bcachefs/errcode.h | 2 +- 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6bf14f975d93..a2facb2f9fc1 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -213,8 +213,9 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) } } +/* XXX: this is a non-mempoolified memory allocation: */ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, - unsigned offset, unsigned size) + unsigned offset, unsigned size) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1U << v->csum_granularity_bits; @@ -241,7 +242,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -ENOMEM; + return -BCH_ERR_ENOMEM_stripe_buf; } /* Checksumming: */ @@ -1099,6 +1100,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } BUG_ON(!s->allocated); + BUG_ON(!s->idx); ec_generate_ec(&s->new_stripe); @@ -1143,7 +1145,8 @@ err: } } - bch2_stripe_close(c, s); + if (s->idx) + bch2_stripe_close(c, s); ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); @@ -1191,6 +1194,7 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) { BUG_ON(atomic_read(&s->pin) <= 0); + BUG_ON(!s->err && !s->idx); if (atomic_dec_and_test(&s->pin)) bch2_ec_do_stripe_creates(c); @@ -1236,6 +1240,8 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) if (!ob) return NULL; + BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); + ca = bch_dev_bkey_exists(c, ob->dev); offset = ca->mi.bucket_size - ob->sectors_free; @@ -1436,6 +1442,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bool have_cache = true; int ret = 0; + BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity); + BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity); + for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); if (i < h->s->nr_data) @@ -1546,9 +1555,13 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri s64 idx; int ret; + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ idx = get_existing_stripe(c, h); if (idx < 0) - return -BCH_ERR_ENOSPC_stripe_reuse; + return -BCH_ERR_stripe_alloc_blocked; ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); if (ret) { @@ -1558,12 +1571,14 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri return ret; } - if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { - /* - * this is a problem: we have deleted from the - * stripes heap already - */ - BUG(); + BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity); + h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks - + h->s->existing_stripe.key.v.nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { + bch2_stripe_close(c, h->s); + return ret; } BUG_ON(h->s->existing_stripe.size != h->blocksize); @@ -1675,9 +1690,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, bch_err(c, "failed to allocate new stripe"); goto err; } - - if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) - BUG(); } if (h->s->allocated) @@ 
-1690,7 +1702,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, ret = new_stripe_alloc_buckets(trans, h, RESERVE_stripe, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h); if (!ret) - goto allocated; + goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) goto err; @@ -1703,8 +1715,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, ret = __bch2_ec_stripe_head_reuse(trans, h); if (!ret) break; - if (ret == -BCH_ERR_ENOSPC_stripe_reuse && cl) - ret = -BCH_ERR_stripe_alloc_blocked; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; @@ -1723,10 +1733,16 @@ alloc_existing: ret = new_stripe_alloc_buckets(trans, h, reserve, cl); if (ret) goto err; -allocated: + +allocate_buf: + ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + h->s->allocated = true; +allocated: BUG_ON(!h->s->idx); - + BUG_ON(!h->s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 283303db7dfd..162e315601f9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -3,11 +3,11 @@ #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ + x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ x(ENOSPC, ENOSPC_stripe_create) \ - x(ENOSPC, ENOSPC_stripe_reuse) \ x(ENOSPC, ENOSPC_inode_create) \ x(ENOSPC, ENOSPC_str_hash_create) \ x(ENOSPC, ENOSPC_snapshot_create) \ -- cgit From 0b943b973c67c2780cb991c3011ad3279f670fa4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Mar 2023 03:11:00 -0500 Subject: bcachefs: Free move buffers as early as possible Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a7e09772895e..393629e52d03 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -834,6 +834,10 @@ static void bch2_write_index(struct closure *cl) struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + barrier(); /* -- cgit From 8fcdf81418f16ef878de7a7535e90599e396fd69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Feb 2023 22:58:01 -0500 Subject: bcachefs: Improved copygc pipelining This improves copygc pipelining across multiple buckets: we now track each in flight bucket we're evacuating, with separate moving_contexts. This means that whereas previously we had to wait for outstanding moves to complete to ensure we didn't try to evacuate the same bucket twice, we can now just check buckets we want to evacuate against the pending list. This also means we can run the verify_bucket_evacuated() check without killing pipelining - meaning it can now always be enabled, not just on debug builds. This is going to be important for the upcoming erasure coding work, where moving IOs that are being erasure coded will now skip the initial replication step; instead the IOs will wait on the stripe to complete.
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 45 +++++++---- fs/bcachefs/move.h | 3 + fs/bcachefs/move_types.h | 6 ++ fs/bcachefs/movinggc.c | 191 +++++++++++++++++++++++++++++++++++++---------- fs/bcachefs/util.h | 20 +++++ 5 files changed, 212 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 89d6635e2169..3a650bc4173a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -40,18 +40,19 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) } struct moving_io { - struct list_head list; - struct closure cl; - bool read_completed; + struct list_head list; + struct move_bucket_in_flight *b; + struct closure cl; + bool read_completed; - unsigned read_sectors; - unsigned write_sectors; + unsigned read_sectors; + unsigned write_sectors; - struct bch_read_bio rbio; + struct bch_read_bio rbio; - struct data_update write; + struct data_update write; /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; + struct bio_vec bi_inline_vecs[0]; }; static void move_free(struct moving_io *io) @@ -59,6 +60,9 @@ static void move_free(struct moving_io *io) struct moving_context *ctxt = io->write.ctxt; struct bch_fs *c = ctxt->c; + if (io->b) + atomic_dec(&io->b->count); + bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); bch2_write_ref_put(c, BCH_WRITE_REF_move); @@ -234,6 +238,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, static int bch2_move_extent(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, struct bch_io_opts io_opts, enum btree_id btree_id, struct bkey_s_c k, @@ -319,6 +324,11 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_add(k.k->size, &ctxt->stats->sectors_moved); } + if (bucket_in_flight) { + io->b = bucket_in_flight; + atomic_inc(&io->b->count); + } + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); trace_move_extent_read(k.k); @@ -521,8 +531,8 @@ static int __bch2_move_data(struct moving_context *ctxt, k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, - btree_id, k, data_opts); + ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, + io_opts, btree_id, k, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -590,7 +600,7 @@ int bch2_move_data(struct bch_fs *c, return ret; } -static noinline void verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -625,6 +635,9 @@ again: failed_to_evacuate: bch2_trans_iter_exit(trans, &iter); + if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + return; + prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket ")); bch2_bkey_val_to_text(&buf, c, k); @@ -661,6 +674,7 @@ failed_to_evacuate: int __bch2_evacuate_bucket(struct btree_trans *trans, struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) { @@ -749,8 +763,9 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, i++; } - ret = bch2_move_extent(trans, &iter, ctxt, io_opts, - bp.btree_id, k, data_opts); + ret = bch2_move_extent(trans, &iter, ctxt, + bucket_in_flight, + io_opts, bp.btree_id, k, data_opts); 
bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -809,7 +824,7 @@ next: move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); if (!ctxt->write_error) - verify_bucket_evacuated(trans, bucket, gen); + bch2_verify_bucket_evacuated(trans, bucket, gen); } err: bch2_bkey_buf_exit(&sk, c); @@ -830,7 +845,7 @@ int bch2_evacuate_bucket(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index c5a7c0add1d6..4c0013872347 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -30,6 +30,8 @@ struct moving_context { wait_queue_head_t wait; }; +void bch2_verify_bucket_evacuated(struct btree_trans *, struct bpos, int); + #define move_ctxt_wait_event(_ctxt, _trans, _cond) \ do { \ bool cond_finished = false; \ @@ -68,6 +70,7 @@ int bch2_move_data(struct bch_fs *, int __bch2_evacuate_bucket(struct btree_trans *, struct moving_context *, + struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 9df6d18137a5..285ffdb762ac 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -16,4 +16,10 @@ struct bch_move_stats { atomic64_t sectors_raced; }; +struct move_bucket_in_flight { + struct bpos bucket; + u8 gen; + atomic_t count; +}; + #endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index bfa0463b4ffe..6755310f5ebc 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -26,6 +26,7 @@ #include "super-io.h" #include "trace.h" +#include #include #include #include @@ -70,62 +71,146 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } -static int bch2_copygc_next_bucket(struct btree_trans *trans, - struct bpos *bucket, u8 *gen, struct bpos *pos) +typedef FIFO(struct move_bucket_in_flight) move_buckets_in_flight; + +struct move_bucket { + struct bpos bucket; + u8 gen; +}; + +typedef DARRAY(struct move_bucket) move_buckets; + +static int move_bucket_cmp(const void *_l, const void *_r) +{ + const struct move_bucket *l = _l; + const struct move_bucket *r = _r; + + return bkey_cmp(l->bucket, r->bucket); +} + +static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b) +{ + return bsearch(&b, + buckets_sorted->data, + buckets_sorted->nr, + sizeof(buckets_sorted->data[0]), + move_bucket_cmp) != NULL; +} + +static void move_buckets_wait(struct btree_trans *trans, + struct moving_context *ctxt, + move_buckets_in_flight *buckets_in_flight, + size_t nr, bool verify_evacuated) +{ + while (!fifo_empty(buckets_in_flight)) { + struct move_bucket_in_flight *i = &fifo_peek_front(buckets_in_flight); + + if (fifo_used(buckets_in_flight) > nr) + move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); + + if (atomic_read(&i->count)) + break; + + /* + * moving_ctxt_exit calls bch2_write as it flushes pending + * reads, which inits another btree_trans; this one must be + * unlocked: + */ + if (verify_evacuated) + bch2_verify_bucket_evacuated(trans, i->bucket, i->gen); + buckets_in_flight->front++; + } + + bch2_trans_unlock(trans); +} + +static int 
bch2_copygc_get_buckets(struct btree_trans *trans, + struct moving_context *ctxt, + move_buckets_in_flight *buckets_in_flight, + move_buckets *buckets) { struct btree_iter iter; + move_buckets buckets_sorted = { 0 }; + struct move_bucket_in_flight *i; struct bkey_s_c k; + size_t fifo_iter, nr_to_get; int ret; + move_buckets_wait(trans, ctxt, buckets_in_flight, buckets_in_flight->size / 2, true); + + nr_to_get = max(16UL, fifo_used(buckets_in_flight) / 4); + + fifo_for_each_entry_ptr(i, buckets_in_flight, fifo_iter) { + ret = darray_push(&buckets_sorted, ((struct move_bucket) {i->bucket, i->gen})); + if (ret) { + bch_err(trans->c, "error allocating move_buckets_sorted"); + goto err; + } + } + + sort(buckets_sorted.data, + buckets_sorted.nr, + sizeof(buckets_sorted.data[0]), + move_bucket_cmp, + NULL); + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, - bpos_max(*pos, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0)), + lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ - *bucket = u64_to_bucket(k.k->p.offset); + struct move_bucket b = { .bucket = u64_to_bucket(k.k->p.offset) }; + int ret = 0; + + if (!bucket_in_flight(&buckets_sorted, b) && + bch2_bucket_is_movable(trans, b.bucket, lru_pos_time(k.k->p), &b.gen)) + ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - bch2_bucket_is_movable(trans, *bucket, lru_pos_time(k.k->p), gen); + ret; })); +err: + darray_exit(&buckets_sorted); - *pos = iter.pos; - if (ret < 0) - return ret; - return ret ? 0 : -ENOENT; + return ret < 0 ? ret : 0; } -static int bch2_copygc(struct bch_fs *c) +static int bch2_copygc(struct btree_trans *trans, + struct moving_context *ctxt, + move_buckets_in_flight *buckets_in_flight) { - struct bch_move_stats move_stats; - struct btree_trans trans; - struct moving_context ctxt; + struct bch_fs *c = trans->c; struct data_update_opts data_opts = { .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, }; - struct bpos bucket; - struct bpos pos; - u8 gen = 0; - unsigned nr_evacuated; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; + struct move_bucket *i; + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - bch2_move_stats_init(&move_stats, "copygc"); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), - false); - bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) + return ret; - ret = bch2_btree_write_buffer_flush(&trans); - BUG_ON(ret); + ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); + if (ret) + goto err; - for (nr_evacuated = 0, pos = POS_MIN; - nr_evacuated < 32 && !ret; - nr_evacuated++, pos = bpos_nosnap_successor(pos)) { - ret = bch2_copygc_next_bucket(&trans, &bucket, &gen, &pos) ?: - __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts); - if (bkey_eq(pos, POS_MAX)) + darray_for_each(buckets, i) { + if (unlikely(freezing(current))) break; - } - bch2_trans_exit(&trans); - bch2_moving_ctxt_exit(&ctxt); + f = fifo_push_ref(buckets_in_flight); + f->bucket = i->bucket; + f->gen = i->gen; + atomic_set(&f->count, 0); + + ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket, f->gen, data_opts); + if (ret) + goto err; + } +err: + darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (ret == -ENOENT) @@ -134,7 +219,8 @@ static int bch2_copygc(struct 
bch_fs *c) if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; + trace_and_count(c, copygc, c, moved, 0, 0, 0); return ret; } @@ -162,7 +248,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_stripe) * ca->mi.bucket_size) >> 1); fragmented = 0; @@ -191,22 +277,47 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; + struct btree_trans trans; + struct moving_context ctxt; + struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; + move_buckets_in_flight move_buckets; u64 last, wait; int ret = 0; + if (!init_fifo(&move_buckets, 1 << 14, GFP_KERNEL)) { + bch_err(c, "error allocating copygc buckets in flight"); + return -ENOMEM; + } + set_freezable(); + bch2_trans_init(&trans, c, 0, 0); + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); while (!ret && !kthread_should_stop()) { + bch2_trans_unlock(&trans); cond_resched(); - if (kthread_wait_freezable(c->copy_gc_enabled)) - break; + if (!c->copy_gc_enabled) { + move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + kthread_wait_freezable(c->copy_gc_enabled); + } + + if (unlikely(freezing(current))) { + move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + __refrigerator(false); + continue; + } last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); trace_and_count(c, copygc_wait, c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, @@ -217,12 +328,16 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(c); + ret = bch2_copygc(&trans, &ctxt, &move_buckets); c->copygc_running = false; wake_up(&c->copygc_running_wq); } + bch2_moving_ctxt_exit(&ctxt); + bch2_trans_exit(&trans); + free_fifo(&move_buckets); + return 0; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 44c6a2a10f35..a57accc592db 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -544,6 +544,26 @@ do { \ submit_bio(bio); \ } while (0) +#define kthread_wait(cond) \ +({ \ + int _ret = 0; \ + \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + #define kthread_wait_freezable(cond) \ ({ \ int _ret = 0; \ -- cgit From 81c771b26639112603bb558bf66441ed3d229eed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Mar 2023 03:57:32 -0500 Subject: bcachefs: Improve bch2_new_stripes_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a2facb2f9fc1..4a9a9a05058a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1145,6 +1145,10 @@ err: } } + 
mutex_lock(&c->ec_stripe_new_lock); + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + if (s->idx) bch2_stripe_close(c, s); @@ -1160,10 +1164,8 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->pin)) { - list_del(&s->list); + if (!atomic_read(&s->pin)) goto out; - } s = NULL; out: mutex_unlock(&c->ec_stripe_new_lock); @@ -1855,8 +1857,8 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) h->target, h->algo, h->redundancy); if (h->s) - prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", - h->s->nr_data, h->s->nr_parity, + prt_printf(out, "\tpending: idx %llu blocks %u+%u allocated %u\n", + h->s->idx, h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, h->s->nr_data)); } @@ -1864,9 +1866,9 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", - s->nr_data, s->nr_parity, - atomic_read(&s->pin)); + prt_printf(out, "\tin flight: idx %llu blocks %u+%u pin %u\n", + s->idx, s->nr_data, s->nr_parity, + atomic_read(&s->pin)); } mutex_unlock(&c->ec_stripe_new_lock); } -- cgit From 2a912a9a39bf0ce7709d53f00d1d341d2478c96e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Mar 2023 23:52:49 -0500 Subject: bcachefs: Kill bch2_ec_bucket_written() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ++-- fs/bcachefs/ec.c | 17 ----------------- fs/bcachefs/ec.h | 16 +++++++++++++++- 3 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5aab85f1a835..7b048ef99b97 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -97,7 +97,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { - bch2_ec_bucket_written(c, ob); + ec_stripe_new_put(c, ob->ec); return; } @@ -838,10 +838,10 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; + ec_stripe_new_get(h->s); add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(c, h); return 0; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 4a9a9a05058a..f131a70f282f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1193,15 +1193,6 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) -{ - BUG_ON(atomic_read(&s->pin) <= 0); - BUG_ON(!s->err && !s->idx); - - if (atomic_dec_and_test(&s->pin)) - bch2_ec_do_stripe_creates(c); -} - static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; @@ -1218,14 +1209,6 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_new_put(c, s); } -/* have a full bucket - hand it off to be erasure coded: */ -void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) -{ - struct ec_stripe_new *s = ob->ec; - - ec_stripe_new_put(c, s); -} - void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 56d1b5e7d797..d112aea9ec56 100644 --- 
a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -198,7 +198,6 @@ int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); @@ -213,6 +212,21 @@ void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); void bch2_do_stripe_deletes(struct bch_fs *); +void bch2_ec_do_stripe_creates(struct bch_fs *); + +static inline void ec_stripe_new_get(struct ec_stripe_new *s) +{ + atomic_inc(&s->pin); +} + +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); + BUG_ON(!s->err && !s->idx); + + if (atomic_dec_and_test(&s->pin)) + bch2_ec_do_stripe_creates(c); +} void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -- cgit From 26bab33b6974d42f5db6d2b7c2198e3af4993cf8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 00:10:14 -0500 Subject: bcachefs: Fix "btree node in stripe" error Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f131a70f282f..e9470c06b1fa 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -915,6 +915,9 @@ static int ec_stripe_update_extent(struct btree_trans *trans, b = bch2_backpointer_get_node(trans, &node_iter, bucket, *bp_offset, bp); bch2_trans_iter_exit(trans, &node_iter); + if (!b) + return 0; + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); bch2_backpointer_to_text(&buf, &bp); -- cgit From a345b0f393da49be9d1110ec9e43066191f0e466 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 02:34:59 -0500 Subject: bcachefs: bch2_btree_node_to_text() const correctness This is for the Rust interface - Rust cares more about const than C does. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 4 ++-- fs/bcachefs/bset.h | 4 ++-- fs/bcachefs/btree_cache.c | 4 ++-- fs/bcachefs/btree_cache.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index a7cb12ea2a04..a4c06e856c2e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1533,9 +1533,9 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, /* Mergesort */ -void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) +void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 2105e7836557..632c2b8c5460 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -213,7 +213,7 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) _k != btree_bkey_last(_b, _t); \ _k = bkey_p_next(_k)) -static inline bool bset_has_ro_aux_tree(struct bset_tree *t) +static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; } @@ -504,7 +504,7 @@ struct bset_stats { size_t failed; }; -void bch2_btree_keys_stats(struct btree *, struct bset_stats *); +void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); void bch2_bfloat_to_text(struct printbuf *, struct btree *, struct bkey_packed *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b331c319acc..6218a00ccb27 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1150,7 +1150,7 @@ wait_on_io: } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) + const struct btree *b) { const struct bkey_format *f = &b->format; struct bset_stats stats; @@ -1195,7 +1195,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) { prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 56ea27340771..ea375ae25a70 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -100,7 +100,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, - struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); + const struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ -- cgit From b65499b7b16b99575f1e7921da402b3b59c47de6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 02:53:25 -0500 Subject: bcachefs: bch2_btree_node_ondisk_to_text() Pulling out a helper from cmd_list.c, as the rest is being rewritten in Rust but we're not ready to rewrite lower-level btree code in Rust. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 4 +- fs/bcachefs/debug.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/debug.h | 2 + 3 files changed, 123 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index dbe4873cad02..29f44d0060d8 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -505,7 +505,7 @@ static inline struct bpos bkey_unpack_pos(const struct btree *b, /* Disassembled bkeys */ -static inline struct bkey_s_c bkey_disassemble(struct btree *b, +static inline struct bkey_s_c bkey_disassemble(const struct btree *b, const struct bkey_packed *k, struct bkey *u) { @@ -515,7 +515,7 @@ static inline struct bkey_s_c bkey_disassemble(struct btree *b, } /* non const version: */ -static inline struct bkey_s __bkey_disassemble(struct btree *b, +static inline struct bkey_s __bkey_disassemble(const struct btree *b, struct bkey_packed *k, struct bkey *u) { diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 0035fe875a47..d1563caf7fb7 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -181,6 +181,125 @@ out: bch2_btree_node_io_unlock(b); } +void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ + struct btree_node *n_ondisk = NULL; + struct extent_ptr_decoded pick; + struct bch_dev *ca; + struct bio *bio = NULL; + unsigned offset = 0; + int ret; + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + prt_printf(out, "error getting device to read from: invalid device\n"); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) { + prt_printf(out, "error getting device to read from: not online\n"); + return; + } + + n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!n_ondisk) { + prt_printf(out, "memory allocation failure\n"); + goto out; + } + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_ondisk, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOIO, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + + ret = submit_bio_wait(bio); + if (ret) { + prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); + goto out; + } + + while (offset < btree_sectors(c)) { + struct bset *i; + struct nonce nonce; + struct bch_csum csum; + struct bkey_packed *k; + unsigned sectors; + + if (!offset) { + i = &n_ondisk->keys; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); + + if (bch2_crc_cmp(csum, n_ondisk->csum)) { + prt_printf(out, "invalid checksum\n"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(n_ondisk, c->block_bits); + } else { + struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); + + i = &bne->keys; + + if (i->seq != n_ondisk->keys.seq) + break; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + if (bch2_crc_cmp(csum, bne->csum)) { + prt_printf(out, "invalid checksum"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(out, " 
offset %u version %u, journal seq %llu\n", + offset, + le16_to_cpu(i->version), + le64_to_cpu(i->journal_seq)); + offset += sectors; + + printbuf_indent_add(out, 4); + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { + struct bkey u; + + bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); + prt_newline(out); + } + + printbuf_indent_sub(out, 4); + } +out: + if (bio) + bio_put(bio); + kvpfree(n_ondisk, btree_bytes(c)); + percpu_ref_put(&ca->io_ref); +} + #ifdef CONFIG_DEBUG_FS /* XXX: bch_fs refcounting */ diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 0b86736e5e1b..2c37143b5fd1 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -9,6 +9,8 @@ struct btree; struct bch_fs; void __bch2_btree_verify(struct bch_fs *, struct btree *); +void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { -- cgit From 511b629aca6007a8784a6005a023dd166ffaa787 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 04:01:22 -0500 Subject: bcachefs: bch2_btree_iter_peek_node_and_restart() Minor refactoring for the Rust interface. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 +++++++++++ fs/bcachefs/btree_iter.h | 15 ++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e6dcca59e31a..a1be6c81c3be 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1723,6 +1723,17 @@ err: goto out; } +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +{ + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(iter->trans); + + return b; +} + struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 8e1f754e641b..c3682332e653 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -294,6 +294,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); @@ -520,18 +521,6 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, u32 bch2_trans_begin(struct btree_trans *); -static inline struct btree * -__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree *b; - - while (b = bch2_btree_iter_peek_node(iter), - bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(trans); - - return b; -} - /* * XXX * this does not handle transaction restarts from bch2_btree_iter_next_node() @@ -541,7 +530,7 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter _locks_want, _depth, _flags, _b, _ret) \ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ _start, _locks_want, _depth, _flags); \ - (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ + (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ (_b) = 
bch2_btree_iter_next_node(&(_iter))) -- cgit From 2640faeb1741db94b717d2678f6cd644f90e3061 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 05:29:12 -0500 Subject: bcachefs: Journal resize fixes - Fix a sleeping-in-atomic bug due to calling bch2_journal_buckets_to_sb() under the journal lock. - Additionally, now we mark buckets as journal buckets before adding them to the journal in memory and the superblock. This ensures that if we crash part way through we'll never be writing to journal buckets that aren't marked correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/journal.c | 154 +++++++++++++++++++++++------------------------ fs/bcachefs/journal_sb.c | 27 ++++----- fs/bcachefs/journal_sb.h | 2 +- 4 files changed, 89 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c7139dd8e1dc..19b4e2bde399 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1855,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - if (a->v.data_type && a->v.data_type != type) { + if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 00e806a64247..3cc93caf563a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -757,19 +757,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, u64 *new_bucket_seq = NULL, *new_buckets = NULL; struct open_bucket **ob = NULL; long *bu = NULL; - unsigned i, nr_got = 0, nr_want = nr - ja->nr; - unsigned old_nr = ja->nr; - unsigned old_discard_idx = ja->discard_idx; - unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; - unsigned old_dirty_idx = ja->dirty_idx; - unsigned old_cur_idx = ja->cur_idx; + unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; int ret = 0; - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - mutex_lock(&c->sb_lock); - } + BUG_ON(nr <= ja->nr); bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); @@ -777,7 +768,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err_unblock; + goto err_free; } for (nr_got = 0; nr_got < nr_want; nr_got++) { @@ -794,87 +785,92 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) break; + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + break; + } + bu[nr_got] = ob[nr_got]->bucket; } } if (!nr_got) - goto err_unblock; + goto err_free; - /* - * We may be called from the device add path, before the new device has - * actually been added to the running filesystem: - */ - if (!new_fs) - spin_lock(&c->journal.lock); + /* Don't return an error if we successfully allocated some buckets: */ + ret = 0; + + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); + } memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * 
sizeof(u64)); - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); + + BUG_ON(ja->discard_idx > ja->nr); + + pos = ja->discard_idx ?: ja->nr; + + memmove(new_buckets + pos + nr_got, + new_buckets + pos, + sizeof(new_buckets[0]) * (ja->nr - pos)); + memmove(new_bucket_seq + pos + nr_got, + new_bucket_seq + pos, + sizeof(new_bucket_seq[0]) * (ja->nr - pos)); for (i = 0; i < nr_got; i++) { - unsigned pos = ja->discard_idx ?: ja->nr; - long b = bu[i]; - - __array_insert_item(ja->buckets, ja->nr, pos); - __array_insert_item(ja->bucket_seq, ja->nr, pos); - ja->nr++; - - ja->buckets[pos] = b; - ja->bucket_seq[pos] = 0; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + new_buckets[pos + i] = bu[i]; + new_bucket_seq[pos + i] = 0; } - ret = bch2_journal_buckets_to_sb(c, ca); - if (ret) { - /* Revert: */ - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = old_nr; - ja->discard_idx = old_discard_idx; - ja->dirty_idx_ondisk = old_dirty_idx_ondisk; - ja->dirty_idx = old_dirty_idx; - ja->cur_idx = old_cur_idx; - } + nr = ja->nr + nr_got; - if (!new_fs) - spin_unlock(&c->journal.lock); + ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); + if (ret) + goto err_unblock; - if (ja->nr != old_nr && !new_fs) + if (!new_fs) bch2_write_super(c); + /* Commit: */ if (c) - bch2_journal_unblock(&c->journal); + spin_lock(&c->journal.lock); - if (ret) - goto err; + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = nr; + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; - if (!new_fs) { - for (i = 0; i < nr_got; i++) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, - bu[i], BCH_DATA_journal, - ca->mi.bucket_size)); - if (ret) { - bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); - goto err; - } - } - } -err: if (c) + spin_unlock(&c->journal.lock); +err_unblock: + if (c) { + bch2_journal_unblock(&c->journal); mutex_unlock(&c->sb_lock); + } - if (ob && !new_fs) + if (ret && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_free, 0)); +err_free: + if (!new_fs) for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); @@ -882,12 +878,7 @@ err: kfree(new_buckets); kfree(ob); kfree(bu); - return ret; -err_unblock: - if (c) - bch2_journal_unblock(&c->journal); - goto err; } /* @@ -901,13 +892,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct closure cl; int ret = 0; + closure_init_stack(&cl); + + down_write(&c->state_lock); + /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - return 0; - - closure_init_stack(&cl); + goto unlock; - while (ja->nr != nr) { + while (ja->nr < nr) { struct disk_reservation disk_res = { 0, 0 }; /* @@ -938,7 +931,8 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ret) bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); - +unlock: + 
up_write(&c->state_lock); return ret; } diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 9b933330a4c3..5be7882342e0 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { .to_text = bch2_sb_journal_v2_to_text, }; -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + u64 *buckets, unsigned nr) { - struct journal_device *ja = &ca->journal; struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr = 1; + unsigned i, dst = 0, nr_compacted = 1; if (c) lockdep_assert_held(&c->sb_lock); - if (!ja->nr) { + if (!nr) { bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); return 0; } - for (i = 0; i + 1 < ja->nr; i++) - if (ja->buckets[i] + 1 != ja->buckets[i + 1]) - nr++; + for (i = 0; i + 1 < nr; i++) + if (buckets[i] + 1 != buckets[i + 1]) + nr_compacted++; j = bch2_sb_resize_journal_v2(&ca->disk_sb, - (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return -BCH_ERR_ENOSPC_sb_journal; bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - j->d[dst].start = le64_to_cpu(ja->buckets[0]); + j->d[dst].start = le64_to_cpu(buckets[0]); j->d[dst].nr = le64_to_cpu(1); - for (i = 1; i < ja->nr; i++) { - if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { le64_add_cpu(&j->d[dst].nr, 1); } else { dst++; - j->d[dst].start = le64_to_cpu(ja->buckets[i]); + j->d[dst].start = le64_to_cpu(buckets[i]); j->d[dst].nr = le64_to_cpu(1); } } - BUG_ON(dst + 1 != nr); - + BUG_ON(dst + 1 != nr_compacted); return 0; } diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h index a39192e9f6f4..ba40a7e8d90a 100644 --- a/fs/bcachefs/journal_sb.h +++ b/fs/bcachefs/journal_sb.h @@ -21,4 +21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j extern const struct bch_sb_field_ops bch_sb_field_ops_journal; extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); -- cgit From 4b5b13da527b1ce02ee9a96382684496e8d83696 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 07:57:51 -0500 Subject: six locks: be more careful about lost wakeups This is a workaround for a lost wakeup bug we've been seeing - we still need to discover the actual bug. 
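[Editor's note, not part of the patch: the hunks below trade precision for safety - instead of issuing the write-lock wakeup only when the old state's write_locking flag was observed, a wakeup is issued on every failed attempt, so a wakeup can no longer be lost at the cost of occasional spurious ones. A minimal standalone stand-in for that trade-off, using plain C11 atomics and a POSIX semaphore rather than the six-lock fast path; all names here are illustrative only.]

#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>

struct waitq {
	atomic_bool waiter_flagged;	/* set by the waiter before it sleeps */
	sem_t	    sem;		/* what the waiter sleeps on */
};

/* precise: wake only if the waiter's flag was observed - this is where a
 * lost wakeup can hide if the flag is set just after we sample the state */
static void wake_precise(struct waitq *wq)
{
	if (atomic_load(&wq->waiter_flagged))
		sem_post(&wq->sem);
}

/* conservative (what the workaround does): always wake on a failed attempt -
 * occasionally spurious, never lost */
static void wake_conservative(struct waitq *wq)
{
	sem_post(&wq->sem);
}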
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index e1e9df0368b6..b54a2ac480c8 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -142,8 +142,17 @@ static int __do_six_trylock_type(struct six_lock *lock, * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ +#if 0 + /* + * This code should be sufficient, but we're seeing unexplained + * lost wakeups: + */ if (old.write_locking) ret = -1 - SIX_LOCK_write; +#else + if (!ret) + ret = -1 - SIX_LOCK_write; +#endif } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), @@ -319,11 +328,10 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, * Similar to the lock path, we may have caused a spurious write * lock fail and need to issue a wakeup: */ - if (old.write_locking) - six_lock_wakeup(lock, old, SIX_LOCK_write); - if (ret) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + else + six_lock_wakeup(lock, old, SIX_LOCK_write); return ret; } -- cgit From db64a8e8a1688f70bdb254aed704f48a76dc7a7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Mar 2023 10:20:36 -0500 Subject: fixup bcachefs: Use for_each_btree_key_upto() more consistently Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 393629e52d03..6bcc91e8ac96 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -217,7 +217,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); -- cgit From adac06fad3333f9b9638ad9271346065d69e9c79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Mar 2023 07:25:12 -0500 Subject: bcachefs: Verbose on by default when CONFIG_BCACHEFS_DEBUG=y Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index afbf82d62977..719693b333da 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -92,6 +92,12 @@ enum opt_type { #define RATELIMIT_ERRORS_DEFAULT false #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCACHEFS_VERBOSE_DEFAULT true +#else +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FS|OPT_FORMAT| \ @@ -276,7 +282,7 @@ enum opt_type { x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -- cgit From 83ec519aea326beed20dbac1c5a4432215adf35d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Mar 2023 07:28:20 -0500 Subject: bcachefs: When shutting down, flush btree node writes last Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 3 ++ fs/bcachefs/btree_update_leaf.c | 8 ++-- fs/bcachefs/journal.c | 20 +++++----- fs/bcachefs/journal_reclaim.c | 83 ++++++++++++++++++++++++++--------------- fs/bcachefs/journal_types.h | 10 
++++- fs/bcachefs/super.c | 6 ++- 6 files changed, 81 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index ee1d15931022..46fb4a9ed295 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -13,6 +13,9 @@ void bch2_btree_node_prep_for_write(struct btree_trans *, bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); + +int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); +int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index de6fedf4bdd0..9e52dff7cf2c 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -227,12 +227,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, return 0; } -static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -244,8 +244,8 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, bch2_journal_pin_add(&c->journal, seq, &w->journal, btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); } /** diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3cc93caf563a..5dde208b4801 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -67,8 +67,9 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->key_cache_list); + unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); INIT_LIST_HEAD(&p->flushed); atomic_set(&p->count, count); p->devs.nr = 0; @@ -1347,6 +1348,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; + unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1364,15 +1366,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - list_for_each_entry(pin, &pin_list->list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } - - list_for_each_entry(pin, &pin_list->key_cache_list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + list_for_each_entry(pin, &pin_list->list[i], list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } if (!list_empty(&pin_list->flushed)) { prt_printf(out, "flushed:"); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0f3c103e63ee..a484a62f9cbd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "btree_update.h" #include "errcode.h" 
#include "error.h" #include "journal.h" @@ -318,9 +319,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); - BUG_ON(!fifo_pop(&j->pin, temp)); + fifo_pop(&j->pin, temp); popped = true; } @@ -379,6 +378,17 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } +enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) + return JOURNAL_PIN_btree; + else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_key_cache; + else + return JOURNAL_PIN_other; +} + void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) @@ -407,10 +417,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin->seq = seq; pin->flush = flush_fn; - if (flush_fn == bch2_btree_key_cache_journal_flush) - list_add(&pin->list, &pin_list->key_cache_list); - else if (flush_fn) - list_add(&pin->list, &pin_list->list); + if (flush_fn) + list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); else list_add(&pin->list, &pin_list->flushed); @@ -446,37 +454,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) static struct journal_entry_pin * journal_get_next_pin(struct journal *j, - bool get_any, - bool get_key_cache, - u64 max_seq, u64 *seq) + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; + unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > max_seq && !get_any && !get_key_cache) + if (*seq > seq_to_flush && !allowed_above_seq) break; - if (*seq <= max_seq || get_any) { - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) - return ret; - } - - if (*seq <= max_seq || get_any || get_key_cache) { - ret = list_first_entry_or_null(&pin_list->key_cache_list, - struct journal_entry_pin, list); - if (ret) - return ret; - } + for (i = 0; i < JOURNAL_PIN_NR; i++) + if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || + ((1U << i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->list[i], + struct journal_entry_pin, list); + if (ret) + return ret; + } } return NULL; } /* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, +static size_t journal_flush_pins(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, unsigned min_any, unsigned min_key_cache) { @@ -489,15 +497,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, lockdep_assert_held(&j->reclaim_lock); while (1) { + unsigned allowed_above = allowed_above_seq; + unsigned allowed_below = allowed_below_seq; + + if (min_any) { + allowed_above |= ~0; + allowed_below |= ~0; + } + + if (min_key_cache) { + allowed_above |= 1U << JOURNAL_PIN_key_cache; + allowed_below |= 1U << JOURNAL_PIN_key_cache; + } + cond_resched(); j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, - min_any != 0, - min_key_cache != 0, - seq_to_flush, &seq); + pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -656,6 +674,7 @@ static int 
__bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) atomic_long_read(&c->btree_key_cache.nr_keys)); nr_flushed = journal_flush_pins(j, seq_to_flush, + ~0, 0, min_nr, min_key_cache); if (direct) @@ -776,7 +795,11 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, 0, 0)) + if (journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_key_cache)| + (1U << JOURNAL_PIN_other), 0, 0, 0) || + journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_btree), 0, 0, 0)) *did_work = true; spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 0e6bde669b3e..8d8c0b3d5a30 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -43,9 +43,15 @@ struct journal_buf { * flushed: */ +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + struct journal_entry_pin_list { - struct list_head list; - struct list_head key_cache_list; + struct list_head list[JOURNAL_PIN_NR]; struct list_head flushed; atomic_t count; struct bch_devs_list devs; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cc27f19960f1..46dae5ab0db7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -209,7 +209,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch_verbose(c, "flushing journal and stopping allocators"); + bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", + journal_cur_seq(&c->journal)); do { clean_passes++; @@ -223,7 +224,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) } } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); + bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", + journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -- cgit From e53d03fe39f1458065ddb5f7309ade066ba6fb95 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 2 Mar 2023 09:03:37 -0500 Subject: bcachefs: don't bump key cache journal seq on nojournal commits fstest generic/388 occasionally reproduces corruptions where an inode has extents beyond i_size. This is a deliberate crash and recovery test, and the post crash+recovery characteristics are usually the same: the inode exists on disk in an early (i.e. just allocated) state based on the journal sequence number associated with the inode. Subsequent inode updates exist in the journal at higher sequence numbers, but the inode hadn't been written back before the associated crash and the post-crash recovery processes a set of journal sequence numbers that doesn't include updates to the inode. In fact, the sequence with the most recent inode key update always happens to be the sequence just before the front of the journal processed by recovery. This last bit is a significant hint that the problem relates to an on-disk journal update of the front of the journal. The root cause of this problem is basically that the inode is updated (multiple times) in-core and in the key cache, each time bumping the key cache sequence number used to control the cache flush. The cache flush skips one or more times, bumping the associated key cache journal pin to the key cache seq value. 
This has a side effect of holding the inode in memory a bit longer than normal, which helps exacerbate this problem, but is also unsafe in certain cases where the key cache seq may have been updated by a transaction commit that didn't journal the associated key. For example, consider an inode that has been allocated, updated several times in the key cache, journaled, but not yet written back. At this stage, everything should be consistent if the fs happens to crash because the latest update has been journaled. Now consider a key update via bch2_extent_update_i_size_sectors() that uses the BTREE_UPDATE_NOJOURNAL flag. While this update may not change inode state, it can have the side effect of bumping ck->seq in bch2_btree_insert_key_cached(). In turn, if a subsequent key cache flush skips due to seq not matching the former, the ck->journal pin is updated to ck->seq even though the most recent key update was not journaled. If this pin happens to reside at the front (tail) of the journal, this means a subsequent journal write can update last_seq to a value beyond that which includes the most recent update to the inode. If this occurs and the fs happens to crash before the inode happens to flush, recovery will see the latest last_seq, fail to recover the inode and leave the inode in the inconsistent state described above. To avoid this problem, skip the key cache seq update on NOJOURNAL commits, except on initial pin add. Pass the insert entry directly to bch2_btree_insert_key_cached() to make the associated flag available and be consistent with btree_insert_key_leaf(). Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 23 +++++++++++++++++++---- fs/bcachefs/btree_key_cache.h | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 074c61f271d3..21e139e391e0 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -769,11 +769,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, - struct bkey_i *insert) + struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; BUG_ON(insert->k.u64s > ck->u64s); @@ -801,9 +801,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value.
+ */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &ck->journal, bch2_btree_key_cache_journal_flush); - ck->seq = trans->journal_res.seq; if (kick_reclaim) journal_reclaim_kick(&c->journal); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index c86d5e48f6e3..be3acde2caa0 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -30,7 +30,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_path *, struct bkey_i *); + struct btree_insert_entry *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 9e52dff7cf2c..e9073d441b83 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -765,7 +765,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!i->cached) btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i->path, i->k); + bch2_btree_insert_key_cached(trans, flags, i); else { bch2_btree_key_cache_drop(trans, i->path); btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); -- cgit From 7635e1a6d6740ce76e1c2204f9237f01c98153b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Feb 2023 02:22:49 -0500 Subject: bcachefs: Rework open bucket partial list allocation Now, any open_bucket can go on the partial list: allocating from the partial list has been moved to its own dedicated function, open_bucket_add_buckets() -> bucket_alloc_set_partial(). In particular, this means that erasure coded buckets can safely go on the partial list; the new location works with the "allocate an ec bucket first, then the rest" logic.
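[Editor's note, not part of the patch: a minimal standalone model of the shape the new want_bucket()/bucket_alloc_set_partial() pair below takes - walk the partial list, take only buckets that match the write point's device mask, data type and EC-ness, and compact the list. All names and types here are simplified stand-ins rather than the bcachefs API; the real code additionally checks device free space, preserves list order, and feeds matches through add_new_bucket() to track replicas.]

#include <stdbool.h>
#include <stddef.h>

struct pl_bucket {
	unsigned dev;
	unsigned data_type;
	bool	 ec;
};

struct pl_want {
	bool	 dev_allowed[64];	/* devices the write point may allocate from */
	unsigned data_type;
	bool	 ec;			/* erasure coded write? */
};

static bool pl_want_bucket(const struct pl_want *w, const struct pl_bucket *b)
{
	return w->dev_allowed[b->dev] &&	/* device still allowed */
	       w->data_type == b->data_type &&	/* same data type as the write point */
	       w->ec == b->ec;			/* EC buckets only for EC writes */
}

/* take up to nr_wanted matching buckets off the partial list (swap-remove) */
static size_t pl_alloc_partial(struct pl_bucket *partial, size_t *nr_partial,
			       const struct pl_want *w,
			       struct pl_bucket *out, size_t nr_wanted)
{
	size_t got = 0;

	for (size_t i = *nr_partial; i-- > 0 && got < nr_wanted; ) {
		if (!pl_want_bucket(w, &partial[i]))
			continue;

		out[got++] = partial[i];
		partial[i] = partial[--(*nr_partial)];
	}
	return got;
}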
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 359 ++++++++++++++++++++++++----------------- fs/bcachefs/alloc_foreground.h | 8 +- fs/bcachefs/alloc_types.h | 3 +- fs/bcachefs/ec.c | 8 +- fs/bcachefs/journal.c | 3 +- fs/bcachefs/trace.h | 14 +- 6 files changed, 228 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7b048ef99b97..4621ef7f1e50 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,23 +154,17 @@ static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - bool may_realloc = wp->data_type == BCH_DATA_user; - BUG_ON(c->open_buckets_partial_nr >= ARRAY_SIZE(c->open_buckets_partial)); - if (may_realloc) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - c->open_buckets_partial[c->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); } /* _only_ for allocating the journal on a new device: */ @@ -256,7 +250,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ob->valid = true; ob->sectors_free = ca->mi.bucket_size; - ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; ob->gen = a->gen; ob->bucket = bucket; @@ -383,33 +376,6 @@ err: return ob; } -static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve) -{ - struct open_bucket *ob; - int i; - - spin_lock(&c->freelist_lock); - - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + c->open_buckets_partial[i]; - - if (ob->dev == ca->dev_idx && - reserve <= ob->alloc_reserve) { - array_remove_item(c->open_buckets_partial, - c->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } - - spin_unlock(&c->freelist_lock); - return NULL; -} - /* * This path is for before the freespace btree is initialized: * @@ -533,7 +499,6 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - bool may_alloc_partial, struct closure *cl, struct bch_dev_usage *usage) { @@ -572,12 +537,6 @@ again: if (waiting) closure_wake_up(&c->freelist_wait); - - if (may_alloc_partial) { - ob = try_alloc_partial_bucket(c, ca, reserve); - if (ob) - return ob; - } alloc: ob = likely(freespace) ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) @@ -597,7 +556,6 @@ err: if (!IS_ERR(ob)) trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, ob->bucket, usage->d[BCH_DATA_free].buckets, avail, @@ -609,7 +567,6 @@ err: else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) trace_and_count(c, bucket_alloc_fail, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, 0, usage->d[BCH_DATA_free].buckets, avail, @@ -624,7 +581,6 @@ err: struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum alloc_reserve reserve, - bool may_alloc_partial, struct closure *cl) { struct bch_dev_usage usage; @@ -632,7 +588,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, - may_alloc_partial, cl, &usage))); + cl, &usage))); return ob; } @@ -689,12 +645,10 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, bch2_dev_stripe_increment_inlined(ca, stripe, &usage); } -#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) - -static void add_new_bucket(struct bch_fs *c, +static int add_new_bucket(struct bch_fs *c, struct open_buckets *ptrs, struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, unsigned flags, @@ -703,12 +657,21 @@ static void add_new_bucket(struct bch_fs *c, unsigned durability = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? 
durability : 1; *have_cache |= !durability; ob_push(c, ptrs, ob); + + if (*nr_effective >= nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, @@ -718,8 +681,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, unsigned flags, + enum bch_data_type data_type, + enum alloc_reserve reserve, struct closure *cl) { struct bch_fs *c = trans->c; @@ -752,8 +716,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, reserve, - flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); percpu_ref_put(&ca->ref); @@ -765,10 +728,11 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); + ob->data_type = data_type; - if (*nr_effective >= nr_replicas) { + if (add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob)) { ret = 0; break; } @@ -790,7 +754,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, u16 target, - unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, @@ -804,9 +767,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct open_bucket *ob; struct bch_dev *ca; unsigned i, ec_idx; - - if (!erasure_code) - return 0; + int ret = 0; if (nr_replicas < 2) return 0; @@ -840,53 +801,124 @@ got_bucket: ob->ec = h->s; ec_stripe_new_get(h->s); - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); out_put_head: bch2_ec_stripe_head_put(c, h); - return 0; + return ret; } /* Sector allocator */ -static void get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags, - bool need_ec) +static bool want_bucket(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + bool *have_cache, bool ec, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (!test_bit(ob->dev, devs_may_alloc->d)) + return false; + + if (ob->data_type != wp->data_type) + return false; + + if (!ca->mi.durability && + (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + return false; + + if (ec != (ob->ec != NULL)) + return false; + + return true; +} + +static int bucket_alloc_set_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool ec, unsigned flags) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; + int ret = 0; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - - if (*nr_effective < nr_replicas && - test_bit(ob->dev, devs_may_alloc->d) && - (ca->mi.durability || - (wp->data_type == BCH_DATA_user && !*have_cache)) && - (ob->ec || !need_ec)) { - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, - flags, ob); 
- } else { + if (!ret && want_bucket(c, wp, devs_may_alloc, + have_cache, ec, ob)) + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + else ob_push(c, &ptrs_skip, ob); - } } wp->ptrs = ptrs_skip; + + return ret; } -static int open_bucket_add_buckets(struct btree_trans *trans, +static int bucket_alloc_set_partial(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, bool ec, + enum alloc_reserve reserve, + unsigned flags) +{ + int i, ret = 0; + + if (!c->open_buckets_partial_nr) + return 0; + + spin_lock(&c->freelist_lock); + + if (!c->open_buckets_partial_nr) + goto unlock; + + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev_usage usage; + u64 avail; + + bch2_dev_usage_read_fast(ca, &usage); + avail = dev_buckets_free(ca, usage, reserve); + if (!avail) + continue; + + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + if (ret) + break; + } + } +unlock: + spin_unlock(&c->freelist_lock); + return ret; +} + +static int __open_bucket_add_buckets(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, u16 target, - unsigned erasure_code, + bool erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, @@ -898,8 +930,8 @@ static int open_bucket_add_buckets(struct btree_trans *trans, struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - int ret; unsigned i; + int ret; rcu_read_lock(); devs = target_rw_devs(c, wp->data_type, target); @@ -912,52 +944,83 @@ static int open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); + if (erasure_code && ec_open_bucket(c, ptrs)) + return 0; + + ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, flags); + if (ret) + return ret; + + ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, reserve, flags); + if (ret) + return ret; + if (erasure_code) { - if (!ec_open_bucket(c, ptrs)) { - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, true); - if (*nr_effective >= nr_replicas) - return 0; + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, + target, + nr_replicas, nr_effective, + have_cache, + reserve, flags, _cl); + } else { +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + flags, wp->data_type, reserve, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { + cl = _cl; + goto retry_blocking; } - if (!ec_open_bucket(c, ptrs)) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, erasure_code, - nr_replicas, nr_effective, - have_cache, reserve, flags, _cl); - if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (*nr_effective >= nr_replicas) - return 0; - } } - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, false); - if (*nr_effective >= nr_replicas) - return 0; + return ret; +} -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, +static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + int ret; + + if (erasure_code) { + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; } - return ret; + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, false, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); + return ret < 0 ? ret : 0; } void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, @@ -1156,13 +1219,11 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct open_bucket *ob; struct open_buckets ptrs; unsigned nr_effective, write_points_nr; - unsigned ob_flags = 0; bool have_cache; int ret; int i; - if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ob_flags |= BUCKET_ALLOC_USE_DURABILITY; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(!nr_replicas || !nr_replicas_required); retry: @@ -1173,34 +1234,42 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); - if (wp->data_type == BCH_DATA_user) - ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; - /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; - if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, reserve, - ob_flags, cl); - } else { + if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, - ob_flags, NULL); + flags, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto allocate_blocking; + + /* + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ + have_cache = true; + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, - ob_flags, cl); + flags, cl); + } else { +allocate_blocking: + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + 
target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + flags, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4f492e278493..1fa96f8c6879 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -31,8 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum alloc_reserve, bool, - struct closure *); + enum alloc_reserve, struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -152,8 +151,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum alloc_reserve, - unsigned, struct closure *); + unsigned, unsigned *, bool *, unsigned, + enum bch_data_type, enum alloc_reserve, + struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4d09bd20d8ec..cd0c50aae416 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -53,10 +53,9 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - enum bch_data_type data_type:8; + enum bch_data_type data_type:6; unsigned valid:1; unsigned on_partial_list:1; - unsigned alloc_reserve:3; u8 dev; u8 gen; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e9470c06b1fa..62bfde035f78 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1451,9 +1451,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &devs, h->s->nr_parity, &nr_have_parity, - &have_cache, + &have_cache, 0, + BCH_DATA_parity, reserve, - 0, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1478,9 +1478,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &devs, h->s->nr_data, &nr_have_data, - &have_cache, + &have_cache, 0, + BCH_DATA_user, reserve, - 0, cl); open_bucket_for_each(c, &buckets, ob, i) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5dde208b4801..3b07982c2330 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -780,8 +780,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, - false, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index aae04d0619be..7e48e7676980 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -516,7 +516,6 @@ DEFINE_EVENT(bch_fs, gc_gens_end, DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -525,14 +524,13 @@ DECLARE_EVENT_CLASS(bucket_alloc, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err), TP_STRUCT__entry( __field(u8, dev ) __array(char, reserve, 16 ) - __field(bool, user ) __field(u64, bucket ) __field(u64, free ) __field(u64, avail ) @@ -550,7 
+548,6 @@ DECLARE_EVENT_CLASS(bucket_alloc, TP_fast_assign( __entry->dev = ca->dev_idx; strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->user = user; __entry->bucket = bucket; __entry->free = free; __entry->avail = avail; @@ -565,9 +562,8 @@ DECLARE_EVENT_CLASS(bucket_alloc, strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("reserve %s user %u bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", + TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", __entry->reserve, - __entry->user, __entry->dev, __entry->bucket, __entry->free, @@ -585,7 +581,6 @@ DECLARE_EVENT_CLASS(bucket_alloc, DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -594,14 +589,13 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -610,7 +604,7 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err) ); -- cgit From 51fe0332b1e39822a6d67a0da656fcfc0db03e99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 14:34:30 -0500 Subject: bcachefs: Suppress transaction restart err message This isn't a real error, and doesn't need to be printed. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e232f331ae9a..5e6dc6c316d1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -954,11 +954,11 @@ static int check_inode(struct btree_trans *trans, iter->pos.snapshot), POS(u.bi_inum, U64_MAX), 0, NULL); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error truncating inode: %s", bch2_err_str(ret)); + if (ret) return ret; - } /* * We truncated without our normal sector accounting hook, just -- cgit From 5bf9db0179c3eb1b7d9f9b3c3fe0d30f1364bb1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 17:40:21 -0500 Subject: bcachefs: evacuate_bucket() no longer calls verify_bucket_evacuated() The copygc code itself now calls this when all moves from a given bucket are complete. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3a650bc4173a..681d134f2e43 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -818,14 +818,6 @@ next: } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { - bch2_trans_unlock(trans); - move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); - if (!ctxt->write_error) - bch2_verify_bucket_evacuated(trans, bucket, gen); - } err: bch2_bkey_buf_exit(&sk, c); return ret; -- cgit From 3f5d3fb4025a7196e75250ecee8b6478f086a145 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 18:00:10 -0500 Subject: bcachefs: evacuate_bucket() no longer moves cached ptrs Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 681d134f2e43..de10f388b8d3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -758,8 +758,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, data_opts.rewrite_ptrs = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) + if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; + if (ptr->cached) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + } i++; } -- cgit From 702ffea204840455e4f2d918538c39cc5c59666b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 16:28:37 -0500 Subject: bcachefs: Extent helper improvements - __bch2_bkey_drop_ptr() -> bch2_bkey_drop_ptr_noerror(), now available outside extents. - Split bch2_bkey_has_device() and bch2_bkey_has_device_c(), const and non const versions - bch2_extent_has_ptr() now returns the pointer it found Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 6 +++--- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extents.c | 48 ++++++++++++++++++++++++++++++----------------- fs/bcachefs/extents.h | 16 +++++++++++++--- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 3 +-- fs/bcachefs/migrate.c | 5 ++--- 7 files changed, 52 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 26b351d48940..8332e8a0b05a 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -170,13 +170,13 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, i = 0; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { if (((1U << i) & m->data_opts.rewrite_ptrs) && - bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { + bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) { /* * If we're going to be adding a pointer to the * same device, we have to drop the old one - * otherwise, we can just mark it cached: */ - if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); else bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); @@ -188,7 +188,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, /* Add new ptrs: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { const struct bch_extent_ptr *existing_ptr = - bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev); + bch2_bkey_has_device_c(bkey_i_to_s_c(insert), p.ptr.dev); if (existing_ptr && existing_ptr->cached) { 
/* diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 62bfde035f78..4adbfd4855f7 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -959,7 +959,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); - ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); + ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); stripe_ptr = (struct bch_extent_stripe_ptr) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 38be9bf91264..17e9c434619b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -25,8 +25,6 @@ #include "trace.h" #include "util.h" -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -511,7 +509,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -691,7 +689,21 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c,& p); + durability += bch2_extent_ptr_durability(c, &p); + + return durability; +} + +static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); return durability; } @@ -764,8 +776,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) /* * Returns pointer to the next entry after the one being dropped: */ -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; @@ -808,7 +820,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, { bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; union bch_extent_entry *ret = - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -839,14 +851,13 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); if (ptr) - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -921,11 +932,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) } } -bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, - struct bkey_s_c k2) +struct bch_extent_ptr * +bch2_extent_has_ptr(struct 
bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) { - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry2; + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; struct extent_ptr_decoded p2; bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) @@ -933,9 +944,9 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return true; + return &entry2->ptr; - return false; + return NULL; } void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) @@ -991,6 +1002,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca; bool first = true; + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + bkey_extent_entry_for_each(ptrs, entry) { if (!first) prt_printf(out, " "); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index bac6a1ed2c59..9b026ae95932 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -613,14 +613,21 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_drop_device(struct bkey_s, unsigned); void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); + +static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) +{ + return (void *) bch2_bkey_has_device_c(k.s_c, dev); +} + bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); switch (k->k.type) { case KEY_TYPE_btree_ptr: @@ -642,6 +649,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -665,7 +674,8 @@ do { \ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3b07982c2330..410521f11ec2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -971,7 +971,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) seq++) { struct journal_buf *buf = journal_seq_to_buf(j, seq); - if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 51c26f9857d9..97b131fd72e6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1338,8 +1338,7 @@ static 
void __journal_write_alloc(struct journal *j, if (!ca->mi.durability || ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), - ca->dev_idx) || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || sectors > ja->sectors_free) continue; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index e3e39127b40a..d93db07f0c87 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -46,7 +46,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bkey_i *n; int ret; - if (!bch2_bkey_has_device(k, dev_idx)) + if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; n = bch2_bkey_make_mut(trans, k); @@ -130,8 +130,7 @@ retry: while (bch2_trans_begin(&trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { - if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), - dev_idx)) + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; bch2_bkey_buf_copy(&k, c, &b->key); -- cgit From 57c723de7d4c592ab3a38e77d414d0021e8483ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Mar 2023 03:21:34 -0500 Subject: bcachefs: Rework __bch2_data_update_index_update() This makes some improvements to the logic for adding/removing replicas, as part of the larger erasure coding improvements. We now directly consider number of replicas desired for the given inode, and extent/pointer durability: this ensures that the extent ends up with the desired number of replicas when we're replacing multiple pointers with one that has higher durability (e.g. erasure coded). Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 104 +++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 8332e8a0b05a..447863825a89 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -91,18 +91,6 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } -static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) { - bch2_extent_ptr_set_cached(k, ptr); - return; - } -} - static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { @@ -125,15 +113,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, while (1) { struct bkey_s_c k; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert; + struct bkey_i *insert = NULL; struct bkey_i_extent *new; - const union bch_extent_entry *entry; + const union bch_extent_entry *entry_c; + union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr_c; struct bpos next_pos; - bool did_work = false; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned i; + unsigned rewrites_found = 0, durability, i; bch2_trans_begin(trans); @@ -145,7 +135,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); if (!bch2_extents_match(k, old)) - goto nomatch; + goto nowork; bkey_reassemble(_insert.k, k); insert = _insert.k; @@ -168,50 +158,60 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * Fist, drop rewrite_ptrs from @new: */ i = 0; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + 
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { if (((1U << i) & m->data_opts.rewrite_ptrs) && - bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) { - /* - * If we're going to be adding a pointer to the - * same device, we have to drop the old one - - * otherwise, we can just mark it cached: - */ - if (bch2_bkey_has_device_c(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) - bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); - else - bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + rewrites_found |= 1U << i; } i++; } + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) + goto nowork; - /* Add new ptrs: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - const struct bch_extent_ptr *existing_ptr = - bch2_bkey_has_device_c(bkey_i_to_s_c(insert), p.ptr.dev); - - if (existing_ptr && existing_ptr->cached) { - /* - * We're replacing a cached pointer with a non - * cached pointer: - */ - bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), - existing_ptr->dev); - } else if (existing_ptr) { - /* - * raced with another move op? extent already - * has a pointer to the device we just wrote - * data to - */ - continue; + /* + * A replica that we just wrote might conflict with a replica + * that we want to keep, due to racing with another move: + */ +restart_drop_conflicting_replicas: + extent_for_each_ptr(extent_i_to_s(new), ptr) + if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && + !ptr_c->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); + goto restart_drop_conflicting_replicas; } - bch2_extent_ptr_decoded_append(insert, &p); - did_work = true; + if (!bkey_val_u64s(&new->k)) + goto nowork; + + /* Now, drop pointers that conflict with what we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + + durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +restart_drop_extra_replicas: + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } - if (!did_work) - goto nomatch; + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, bkey_i_to_s(insert)); @@ -272,7 +272,7 @@ next: goto out; } continue; -nomatch: +nowork: if (m->ctxt && m->ctxt->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); -- cgit From 10d9f7d2853d5e4c6f21a0dc96f6e98c2d0828e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 16:46:24 -0500 Subject: bcachefs: ec: fall back to creating new stripes for copygc Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c 
b/fs/bcachefs/ec.c index 4adbfd4855f7..1fd68d44b90f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1706,6 +1706,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; + if (reserve == RESERVE_movinggc) { + ret = new_stripe_alloc_buckets(trans, h, reserve, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (ret) + goto err; + goto allocate_buf; + } + /* XXX freelist_wait? */ closure_wait(&c->freelist_wait, cl); waiting = true; -- cgit From fba053d2aaca8f9a4486e865452d80245a8cc215 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 Mar 2023 10:18:09 -0500 Subject: bcachefs: Second layer of refcounting for new stripes This will be used for move writes, which will be waiting until the stripe is created to do the index update. They need to prevent the stripe from being reclaimed until their index update is done, so we need another refcount that just keeps the stripe open. Signed-off-by: Kent Overstreet # Conflicts: # fs/bcachefs/ec.c # fs/bcachefs/io.c --- fs/bcachefs/alloc_foreground.c | 4 ++-- fs/bcachefs/ec.c | 32 +++++++++++++++++++++----------- fs/bcachefs/ec.h | 36 ++++++++++++++++++++++++++---------- 3 files changed, 49 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4621ef7f1e50..7c81189bcd62 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -97,7 +97,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { - ec_stripe_new_put(c, ob->ec); + ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); return; } @@ -799,7 +799,7 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; - ec_stripe_new_get(h->s); + ec_stripe_new_get(h->s, STRIPE_REF_io); ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1fd68d44b90f..0a2e7db6906f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1057,6 +1057,13 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, s->err = ret; } +void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) +{ + if (s->idx) + bch2_stripe_close(c, s); + kfree(s); +} + /* * data buckets of new stripe all written: create the stripe */ @@ -1152,13 +1159,11 @@ err: list_del(&s->list); mutex_unlock(&c->ec_stripe_new_lock); - if (s->idx) - bch2_stripe_close(c, s); - ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); - kfree(s); + + ec_stripe_new_put(c, s, STRIPE_REF_stripe); } static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) @@ -1167,7 +1172,7 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->pin)) + if (!atomic_read(&s->ref[STRIPE_REF_io])) goto out; s = NULL; out: @@ -1209,7 +1214,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) list_add(&s->list, &c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); - ec_stripe_new_put(c, s); + ec_stripe_new_put(c, s, STRIPE_REF_io); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -1321,7 +1326,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) mutex_init(&s->lock); closure_init(&s->iodone, NULL); - atomic_set(&s->pin, 1); + 
atomic_set(&s->ref[STRIPE_REF_stripe], 1); + atomic_set(&s->ref[STRIPE_REF_io], 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, @@ -1829,13 +1835,16 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->used, 20); i++) { + for (i = 0; i < min_t(size_t, h->used, 50); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); - prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, + prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, h->data[i].blocks_nonempty, m->nr_blocks - m->nr_redundant, m->nr_redundant); + if (bch2_stripe_is_open(c, h->data[i].idx)) + prt_str(out, " open"); + prt_newline(out); } mutex_unlock(&c->ec_stripes_heap_lock); } @@ -1860,9 +1869,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tin flight: idx %llu blocks %u+%u pin %u\n", + prt_printf(out, "\tin flight: idx %llu blocks %u+%u ref %u %u\n", s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->pin)); + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe])); } mutex_unlock(&c->ec_stripe_new_lock); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index d112aea9ec56..8f777a37e43d 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -143,6 +143,12 @@ struct ec_stripe_buf { struct ec_stripe_head; +enum ec_stripe_ref { + STRIPE_REF_io, + STRIPE_REF_stripe, + STRIPE_REF_NR +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; @@ -154,8 +160,7 @@ struct ec_stripe_new { struct closure iodone; - /* counts in flight writes, stripe is created when pin == 0 */ - atomic_t pin; + atomic_t ref[STRIPE_REF_NR]; int err; @@ -213,19 +218,30 @@ void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); +void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); -static inline void ec_stripe_new_get(struct ec_stripe_new *s) +static inline void ec_stripe_new_get(struct ec_stripe_new *s, + enum ec_stripe_ref ref) { - atomic_inc(&s->pin); + atomic_inc(&s->ref[ref]); } -static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, + enum ec_stripe_ref ref) { - BUG_ON(atomic_read(&s->pin) <= 0); - BUG_ON(!s->err && !s->idx); - - if (atomic_dec_and_test(&s->pin)) - bch2_ec_do_stripe_creates(c); + BUG_ON(atomic_read(&s->ref[ref]) <= 0); + + if (atomic_dec_and_test(&s->ref[ref])) + switch (ref) { + case STRIPE_REF_stripe: + bch2_ec_stripe_new_free(c, s); + break; + case STRIPE_REF_io: + bch2_ec_do_stripe_creates(c); + break; + default: + unreachable(); + } } void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -- cgit From 46e14854fca4a262a823079c1958a204f983fa4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Mar 2023 15:52:37 -0500 Subject: bcachefs: Fix next_bucket() This fixes an infinite loop in bch2_get_key_or_real_bucket_hole(). 
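The fix below replaces the offset-only assignment with a full position reset: when iteration moves on to the next device, both the device index and the offset of the bucket position have to be updated, otherwise the position keeps pointing at the old device and the caller never makes progress. A minimal sketch of the pitfall, using generic names rather than the real bcachefs types:

  struct pos { unsigned dev; unsigned long offset; };

  /* Buggy: moving to the next device resets only the offset, leaving
   * ->dev stale, so the caller re-walks the same device forever: */
  static void advance_buggy(struct pos *p, unsigned next_dev, unsigned long first_bucket)
  {
          p->offset = first_bucket;       /* forgot: p->dev = next_dev */
  }

  /* Fixed: reset the whole position, mirroring
   * *bucket = POS(ca->dev_idx, ca->mi.first_bucket) in the patch: */
  static void advance_fixed(struct pos *p, unsigned next_dev, unsigned long first_bucket)
  {
          *p = (struct pos) { .dev = next_dev, .offset = first_bucket };
  }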
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ce0ea4886288..e5abe6406afe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1006,7 +1006,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) iter = bucket->inode; ca = __bch2_next_dev(c, &iter, NULL); if (ca) - bucket->offset = ca->mi.first_bucket; + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); return ca != NULL; -- cgit From e28ef07e0ef47c03f773571d85bc82fcce831376 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 23:37:19 -0500 Subject: bcachefs: Simplify stripe_idx_to_delete This is not technically correct - it's subject to a race if we ever end up with a stripe with all empty blocks (that needs to be deleted) being held open. But the "correct" version was much too inefficient, and soon we'll be adding a stripes LRU. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0a2e7db6906f..c747ae2d4046 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -659,14 +659,13 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) static u64 stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; - size_t heap_idx; lockdep_assert_held(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->used; heap_idx++) - if (h->data[heap_idx].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[heap_idx].idx)) - return h->data[heap_idx].idx; + if (h->used && + h->data[0].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[0].idx)) + return h->data[0].idx; return 0; } -- cgit From 751c025f0de7ca55ad5f77099645b5247623de98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Mar 2023 17:21:30 -0500 Subject: bcachefs: Kill bch_write_op->btree_update_ready This changes the write path to not add write ops to to the write_point's list of pending work items until it's ready; this means we have to change the lock protecting it to an irq-safe lock, but means bch2_write_point_do_index_updates() no longer has to iterate over the list, which is beneficial with the way the new BCH_WRITE_WAIT_FOR_EC code works. 
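The queueing discipline is the interesting part: instead of adding every write op to the write point's list up front and having the worker scan the list for entries flagged ready, an op is only put on the list once it is actually ready to be processed, and the worker simply pops the head in FIFO order. The list is now also updated from the write-completion path (bch2_write_index()), which is why the lock protecting it becomes irq-safe. A compact sketch of the pattern, with generic names standing in for the real bcachefs structures:

  #include <linux/list.h>
  #include <linux/spinlock.h>

  struct work_queue {
          spinlock_t              lock;
          struct list_head        items;
  };

  struct work_item {
          struct list_head        list;
  };

  /* Producer side - called only once the item is ready, possibly from
   * completion context, hence the irq-saving lock: */
  static void queue_ready_item(struct work_queue *q, struct work_item *i)
  {
          unsigned long flags;

          spin_lock_irqsave(&q->lock, flags);
          list_add_tail(&i->list, &q->items);
          spin_unlock_irqrestore(&q->lock, flags);
  }

  /* Worker side - everything on the list is ready, so there is no
   * per-item "ready" flag to scan for; just pop in FIFO order: */
  static struct work_item *queue_pop(struct work_queue *q)
  {
          struct work_item *i;

          spin_lock_irq(&q->lock);
          i = list_first_entry_or_null(&q->items, struct work_item, list);
          if (i)
                  list_del(&i->list);
          spin_unlock_irq(&q->lock);

          return i;
  }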
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 37 +++++++++++++------------------------ fs/bcachefs/io_types.h | 2 +- 2 files changed, 14 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6bcc91e8ac96..6fd29966c1db 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -834,36 +834,30 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; if ((op->flags & BCH_WRITE_DONE) && (op->flags & BCH_WRITE_MOVE)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - barrier(); - - /* - * We're not using wp->writes_lock here, so this is racey: that's ok, - * because this is just for diagnostic purposes, and we're running out - * of interrupt context here so if we were to take the log we'd have to - * switch to spin_lock_irq()/irqsave(), which is not free: - */ + spin_lock_irqsave(&wp->writes_lock, flags); if (wp->state == WRITE_POINT_waiting_io) __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); - op->btree_update_ready = true; queue_work(wq, &wp->index_update_work); } static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) { - op->btree_update_ready = false; op->wp = wp; - spin_lock(&wp->writes_lock); - list_add_tail(&op->wp_list, &wp->writes); - if (wp->state == WRITE_POINT_stopped) + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock(&wp->writes_lock); + spin_unlock_irq(&wp->writes_lock); + } } void bch2_write_point_do_index_updates(struct work_struct *work) @@ -873,16 +867,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work) struct bch_write_op *op; while (1) { - spin_lock(&wp->writes_lock); - list_for_each_entry(op, &wp->writes, wp_list) - if (op->btree_update_ready) { - list_del(&op->wp_list); - goto unlock; - } - op = NULL; -unlock: + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); wp_update_state(wp, op != NULL); - spin_unlock(&wp->writes_lock); + spin_unlock_irq(&wp->writes_lock); if (!op) break; @@ -1673,7 +1663,6 @@ static void __bch2_write(struct bch_write_op *op) } again: memset(&op->failed, 0, sizeof(op->failed)); - op->btree_update_ready = false; do { struct bkey_i *key_to_write; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 200af9e3e6b0..4149291c0df6 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -121,7 +121,7 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:3; unsigned incompressible:1; - unsigned btree_update_ready:1; + unsigned stripe_waited:1; struct bch_devs_list devs_have; u16 target; -- cgit From e6539b0aebd384549aef4dc3b917658f24bb6b83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Mar 2023 17:23:08 -0500 Subject: bcachefs: Improve bch2_new_stripes_to_text() Print out the alloc reserve, and format it a bit more nicely. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c747ae2d4046..9ee29dac48ba 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1855,23 +1855,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u:\n", - h->target, h->algo, h->redundancy); + prt_printf(out, "target %u algo %u redundancy %u %s:\n", + h->target, h->algo, h->redundancy, + bch2_alloc_reserves[h->reserve]); if (h->s) - prt_printf(out, "\tpending: idx %llu blocks %u+%u allocated %u\n", + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", h->s->idx, h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); + prt_printf(out, "in flight:\n"); + mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tin flight: idx %llu blocks %u+%u ref %u %u\n", + prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", s->idx, s->nr_data, s->nr_parity, atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe])); + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_alloc_reserves[s->h->reserve]); } mutex_unlock(&c->ec_stripe_new_lock); } -- cgit From ae1f56238d55ad6d16564455559d1fa9a8cea2f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Mar 2023 07:09:33 -0400 Subject: bcachefs: Mark new snapshots earlier in create path This fixes a null ptr deref when creating new snapshots: bch2_create_trans() will lookup the subvolume and find the _new_ snapshot in the BCH_CREATE_SUBVOL path that's being created in that transaction. We have to call bch2_mark_snapshot() earlier so that it's properly initialized, instead of leaving it for transaction commit. Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d76239654a89..bcc67c0f5dfc 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -513,7 +513,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.pad = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -540,7 +542,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[1] = cpu_to_le32(new_snapids[1]); n->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); if (ret) goto err; } -- cgit From aebe7a679cbdd827769e85f632562a66d2a2b9f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Mar 2023 09:53:04 -0400 Subject: bcachefs: Fix stripe create error path If we errored out on a new stripe before fully allocating it, we shouldn't be zeroing out unwritten data. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9ee29dac48ba..74cfd9edd680 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1078,13 +1078,15 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); - for (i = 0; i < nr_data; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; + if (!s->err) { + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; - if (ob->sectors_free) - zero_out_rest_of_ec_bucket(c, s, i, ob); - } + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + } if (s->err) { if (!bch2_err_matches(s->err, EROFS)) -- cgit From 3997989ae1541dea4bb144bd2bf8b7dc6cae743f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Mar 2023 21:58:14 -0400 Subject: bcachefs: Don't use BTREE_ITER_INTENT in make_extent_indirect() This is a workaround for a btree path overflow - searching with BTREE_ITER_INTENT periodically saves the iterator position for updates, which eventually overflows. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d2e6adc13fb1..d8426e754cdf 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -189,7 +189,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), - BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_SLOTS, k, ret) { if (reflink_iter.pos.inode) { bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); continue; -- cgit From 2d004446c8044e1660adc53e55d151c607a472d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 08:35:04 -0400 Subject: bcachefs: bch2_bucket_is_movable() -> BTREE_ITER_CACHED BTREE_ITER_CACHED should really be the default for cached btrees - this is an easy mistake to make. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6755310f5ebc..4d8d013fe4fc 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -46,7 +46,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); bch2_trans_iter_exit(trans, &iter); -- cgit From c639c29ce6882f4f77a81d778ef4741d5a5979d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 11:48:07 -0400 Subject: bcachefs: Fix an assert in copygc thread shutdown path We're not supposed to have nested (locked) btree_trans on the stack: this means copygc shutdown needs to exit our btree_trans before exiting the move_ctxt, which calls bch2_write(). 
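The ordering matters because bch2_moving_ctxt_exit() flushes pending work through bch2_write(), which runs its own btree_trans for the index updates; if the copygc thread's btree_trans were still live (and possibly holding locks) at that point, two btree_trans would end up nested on the stack. A minimal sketch of the required teardown ordering, with hypothetical names rather than the real bcachefs API:

  /* Ordering sketch only - hypothetical types and helpers: */
  struct xtrans { int live; };
  struct move_ctxt { int pending_writes; };

  static void xtrans_exit(struct xtrans *t)
  {
          t->live = 0;
  }

  static void move_ctxt_exit(struct move_ctxt *c)
  {
          /* Flushing pending writes starts a fresh transaction internally: */
          struct xtrans tmp = { .live = 1 };

          c->pending_writes = 0;
          xtrans_exit(&tmp);
  }

  static void copygc_shutdown(struct xtrans *trans, struct move_ctxt *ctxt)
  {
          xtrans_exit(trans);     /* our (possibly locked) transaction goes first */
          move_ctxt_exit(ctxt);   /* then the context, which may transact on its own */
  }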
Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4d8d013fe4fc..4762594f6287 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -334,8 +334,8 @@ static int bch2_copygc_thread(void *arg) wake_up(&c->copygc_running_wq); } - bch2_moving_ctxt_exit(&ctxt); bch2_trans_exit(&trans); + bch2_moving_ctxt_exit(&ctxt); free_fifo(&move_buckets); return 0; -- cgit From 872c0311675bdb73b29ee74c7f27afc82d4918e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 12:54:21 -0400 Subject: bcachefs: Fix bch2_check_extents_to_backpointers() In rare cases, bch2_check_extents_to_backpointers() would incorrectly flag an extent has having a missing backpointer when we just needed to flush the btree write buffer - we weren't tracking the last flushed position correctly. This adds a level field to the last_flushed pos, fixing a bug where we'd sometimes fail on a new root node. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a40c26125d2a..8517c5635226 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -549,13 +549,18 @@ int bch2_check_btree_backpointers(struct bch_fs *c) bch2_check_btree_backpointer(&trans, &iter, k))); } +struct bpos_level { + unsigned level; + struct bpos pos; +}; + static int check_bp_exists(struct btree_trans *trans, struct bpos bucket_pos, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -600,8 +605,11 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bpos_eq(*last_flushed_pos, orig_k.k->p)) { - *last_flushed_pos = orig_k.k->p; + if (last_flushed->level != bp.level || + !bpos_eq(last_flushed->pos, orig_k.k->p)) { + last_flushed->level = bp.level; + last_flushed->pos = orig_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: -BCH_ERR_transaction_restart_write_buffer_flush; goto out; @@ -639,7 +647,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct btree_iter *iter, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; @@ -668,7 +676,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end, - last_flushed_pos); + last_flushed); if (ret) return ret; } @@ -680,7 +688,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -709,12 +717,12 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, k, p, &bucket_pos, &bp); ret = check_bp_exists(trans, bucket_pos, bp, k, 
bucket_start, bucket_end, - last_flushed_pos); + last_flushed); if (ret) goto err; } @@ -794,7 +802,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, { struct btree_iter iter; enum btree_id btree_id; - struct bpos last_flushed_pos = SPOS_MAX; + struct bpos_level last_flushed = { UINT_MAX }; int ret = 0; for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { @@ -811,7 +819,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_NOFAIL, check_extent_to_backpointers(trans, &iter, bucket_start, bucket_end, - &last_flushed_pos)); + &last_flushed)); if (ret) break; } while (!bch2_btree_iter_advance(&iter)); @@ -826,7 +834,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_NOFAIL, check_btree_root_to_backpointers(trans, btree_id, bucket_start, bucket_end, - &last_flushed_pos)); + &last_flushed)); if (ret) break; } -- cgit From 65d48e35250fe46a560dffa13876830336b152c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 15:35:57 -0400 Subject: bcachefs: Private error codes: ENOMEM This adds private error codes for most (but not all) of our ENOMEM uses, which makes it easier to track down assorted allocation failures. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 18 ++++---- fs/bcachefs/btree_gc.c | 16 +++---- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 4 +- fs/bcachefs/btree_key_cache.c | 16 +++---- fs/bcachefs/btree_update_interior.c | 9 ++-- fs/bcachefs/btree_update_leaf.c | 4 +- fs/bcachefs/btree_write_buffer.c | 2 +- fs/bcachefs/buckets.c | 22 +++++---- fs/bcachefs/buckets_waiting_for_journal.c | 4 +- fs/bcachefs/checksum.c | 4 +- fs/bcachefs/clock.c | 4 +- fs/bcachefs/compress.c | 65 +++++++++++++-------------- fs/bcachefs/counters.c | 2 +- fs/bcachefs/disk_groups.c | 4 +- fs/bcachefs/ec.c | 14 +++--- fs/bcachefs/errcode.h | 75 +++++++++++++++++++++++++++++++ fs/bcachefs/fs-io.c | 20 ++++++--- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/io.c | 26 +++++++---- fs/bcachefs/journal.c | 16 +++---- fs/bcachefs/journal_io.c | 8 ++-- fs/bcachefs/journal_sb.c | 4 +- fs/bcachefs/journal_seq_blacklist.c | 4 +- fs/bcachefs/recovery.c | 10 ++--- fs/bcachefs/replicas.c | 38 +++++++++------- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/super-io.c | 11 ++--- fs/bcachefs/super.c | 8 ++-- 29 files changed, 259 insertions(+), 157 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6218a00ccb27..46a8a29ddef7 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -91,7 +91,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -104,7 +104,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } return 0; @@ -207,7 +207,7 @@ wait_on_io: (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { if (!flush) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_reclaim; /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); @@ -215,7 +215,7 @@ wait_on_io: } if (!six_trylock_intent(&b->c.lock)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_reclaim; if 
(!six_trylock_write(&b->c.lock)) goto out_unlock_intent; @@ -263,7 +263,7 @@ out_unlock: six_unlock_write(&b->c.lock); out_unlock_intent: six_unlock_intent(&b->c.lock); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_btree_node_reclaim; goto out; } @@ -462,7 +462,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) for (i = 0; i < bc->reserve; i++) if (!__bch2_btree_node_mem_alloc(c)) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_fs_btree_cache_init; goto out; } @@ -516,7 +516,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } closure_wait(&bc->alloc_wait, cl); @@ -669,7 +669,7 @@ err: mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } /* Slowpath, don't want it inlined into btree_iter_traverse() */ @@ -698,7 +698,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); - if (b == ERR_PTR(-ENOMEM)) { + if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 37017eea2323..e2fd4c2cfbd0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -201,7 +201,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_repair_key; btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -230,7 +230,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_repair_key; btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -686,7 +686,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { bch_err(c, "%s: error allocating new key", __func__); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_gc_repair_key; goto err; } @@ -1293,7 +1293,7 @@ static int bch2_gc_start(struct bch_fs *c) sizeof(u64), GFP_KERNEL); if (!c->usage_gc) { bch_err(c, "error allocating c->usage_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_start; } for_each_member_device(ca, c, i) { @@ -1303,7 +1303,7 @@ static int bch2_gc_start(struct bch_fs *c) if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_start; } this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, @@ -1495,7 +1495,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; @@ -1656,7 +1656,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, GFP_KERNEL); if (!r) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_gc_reflink_start; break; } @@ -1977,7 +1977,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = 
kvmalloc(ca->mi.nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { percpu_ref_put(&ca->ref); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5dc2b3ecb319..0489d07a087f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1483,7 +1483,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; closure_init(&ra->cl, NULL); ra->c = c; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a1be6c81c3be..7b3e7f9368d1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1012,7 +1012,7 @@ retry_all: __btree_path_put(path, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == -ENOMEM) + bch2_err_matches(ret, ENOMEM)) goto retry_all; if (ret) goto err; @@ -2809,7 +2809,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) } if (!new_mem) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); trans->mem = new_mem; trans->mem_bytes = new_bytes; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 21e139e391e0..a483bd23a336 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -336,7 +336,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_ids[path->btree_id]); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); @@ -423,7 +423,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (!new_k) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_ids[ck->key.btree_id], new_u64s); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; goto err; } @@ -1043,24 +1043,24 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - int ret; #ifdef __KERNEL__ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); if (!bc->pcpu_freed) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fs_btree_cache_init; #endif - ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); - if (ret) - return ret; + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->table_init_done = true; bc->shrink.seeks = 0; bc->shrink.count_objects = bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; - return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name); + if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1db5ef4f2257..d64a86f39595 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2475,8 +2475,11 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!c->btree_interior_update_worker) - return -ENOMEM; + return 
-BCH_ERR_ENOMEM_btree_interior_update_worker_init; - return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update)); + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update))) + return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + + return 0; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e9073d441b83..19efd484fc9d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -401,7 +401,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (!new_k) { bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_ids[path->btree_id], new_u64s); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_key_cache_insert; } trans_for_each_update(trans, i) @@ -1891,7 +1891,7 @@ static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list ar int ret; prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -ENOMEM : 0; + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) goto err; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 026c249a3f44..80f4b9839bc2 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -333,7 +333,7 @@ int bch2_fs_btree_write_buffer_init(struct bch_fs *c) wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); if (!wb->keys[0] || !wb->keys[1]) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; return 0; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 19b4e2bde399..6e2e2ed72f65 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -906,7 +906,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.idx); - return -ENOMEM; + return -BCH_ERR_ENOMEM_mark_stripe_ptr; } mutex_lock(&c->ec_stripes_heap_lock); @@ -1075,7 +1075,7 @@ int bch2_mark_stripe(struct btree_trans *trans, if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -ENOMEM; + return -BCH_ERR_ENOMEM_mark_stripe; } /* * This will be wrong when we bring back runtime gc: we should @@ -2045,15 +2045,21 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; - int ret = -ENOMEM; + int ret; if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO)) || - (c->opts.buckets_nouse && + GFP_KERNEL|__GFP_ZERO))) { + ret = -BCH_ERR_ENOMEM_bucket_gens; + goto err; + } + + if ((c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) + GFP_KERNEL|__GFP_ZERO)))) { + ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; + } bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2123,12 +2129,12 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) - return -ENOMEM; + return -BCH_ERR_ENOMEM_usage_init; for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) - return -ENOMEM; + return -BCH_ERR_ENOMEM_usage_init; 
} return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f3774e30b5cd..81ab685cdef9 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -110,7 +110,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; goto out; } @@ -159,7 +159,7 @@ int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) b->t = kvmalloc(sizeof(*b->t) + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); if (!b->t) - return -ENOMEM; + return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; bucket_table_init(b->t, INITIAL_TABLE_BITS); return 0; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 43d22fe8131b..843e138862f6 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -133,7 +133,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); if (!sg) - return -ENOMEM; + return -BCH_ERR_ENOMEM_do_encrypt; sg_init_table(sg, pages); @@ -648,7 +648,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ + ret = -BCH_ERR_ENOSPC_sb_crypt; goto err; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 00d0e6725910..f41889093a2c 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -184,10 +184,10 @@ int bch2_io_clock_init(struct io_clock *clock) clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) - return -ENOMEM; + return -BCH_ERR_ENOMEM_io_clock_init; if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_io_clock_init; return 0; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 2b7080b67eca..6bec38440249 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -270,7 +270,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, { struct bbuf dst_data = { NULL }; size_t dst_len = crc.uncompressed_size << 9; - int ret = -ENOMEM; + int ret; if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) @@ -542,7 +542,7 @@ void bch2_fs_compress_exit(struct bch_fs *c) mempool_exit(&c->compression_bounce[READ]); } -static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +static int _bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; @@ -561,34 +561,27 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zstd_cctx_workspace_bound(¶ms.cParams), zstd_dctx_workspace_bound() }, }, *i; - int ret = 0; - - pr_verbose_init(c->opts, ""); + bool have_compressed = false; c->zstd_params = params; for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) - if (features & (1 << i->feature)) - goto have_compressed; + have_compressed |= (features & (1 << i->feature)) != 0; - goto out; -have_compressed: + if (!have_compressed) + return 0; - if (!mempool_initialized(&c->compression_bounce[READ])) { - ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max); - if (ret) - goto out; - } + if 
(!mempool_initialized(&c->compression_bounce[READ]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_read_init; - if (!mempool_initialized(&c->compression_bounce[WRITE])) { - ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max); - if (ret) - goto out; - } + if (!mempool_initialized(&c->compression_bounce[WRITE]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -605,22 +598,28 @@ have_compressed: if (mempool_initialized(&c->compress_workspace[i->type])) continue; - ret = mempool_init_kvpmalloc_pool( + if (mempool_init_kvpmalloc_pool( &c->compress_workspace[i->type], - 1, i->compress_workspace); - if (ret) - goto out; + 1, i->compress_workspace)) + return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace)) { - ret = mempool_init_kvpmalloc_pool( - &c->decompress_workspace, - 1, decompress_workspace_size); - if (ret) - goto out; - } -out: + if (!mempool_initialized(&c->decompress_workspace) && + mempool_init_kvpmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) + return -BCH_ERR_ENOMEM_decompression_workspace_init; + + return 0; +} + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ + int ret; + + pr_verbose_init(c->opts, ""); + ret = _bch2_fs_compress_init(c, features); pr_verbose_init(c->opts, "ret %i", ret); + return ret; } diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index edd1b2537f48..e5587bc5a2b7 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -96,7 +96,7 @@ int bch2_fs_counters_init(struct bch_fs *c) { c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); if (!c->counters) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fs_counters_init; return bch2_sb_counters_to_cpu(c); } diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index fcd5dbff248d..1a8f8b3750da 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -68,7 +68,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) - return -ENOMEM; + return -BCH_ERR_ENOMEM_disk_groups_validate; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); @@ -134,7 +134,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(sizeof(*cpu_g) + sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); if (!cpu_g) - return -ENOMEM; + return -BCH_ERR_ENOMEM_disk_groups_to_cpu; cpu_g->nr = nr_groups; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 74cfd9edd680..af3a72acc67f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -494,7 +494,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_read_extent; ret = get_stripe_key(c, rbio->pick.ec.idx, buf); if (ret) { @@ -559,7 +559,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (idx >= h->size) { if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; mutex_lock(&c->ec_stripes_heap_lock); if (n.size > h->size) { @@ -573,11 +573,11 
@@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) } if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; return 0; } @@ -1323,7 +1323,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; mutex_init(&s->lock); closure_init(&s->iodone, NULL); @@ -1680,8 +1680,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return h; if (!h->s) { - if (ec_new_stripe_alloc(c, h)) { - ret = -ENOMEM; + ret = ec_new_stripe_alloc(c, h); + if (ret) { bch_err(c, "failed to allocate new stripe"); goto err; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 162e315601f9..4304e25a6b24 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -4,6 +4,79 @@ #define BCH_ERRCODES() \ x(ENOMEM, ENOMEM_stripe_buf) \ + x(ENOMEM, ENOMEM_replicas_table) \ + x(ENOMEM, ENOMEM_cpu_replicas) \ + x(ENOMEM, ENOMEM_replicas_gc) \ + x(ENOMEM, ENOMEM_disk_groups_validate) \ + x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ + x(ENOMEM, ENOMEM_mark_snapshot) \ + x(ENOMEM, ENOMEM_mark_stripe) \ + x(ENOMEM, ENOMEM_mark_stripe_ptr) \ + x(ENOMEM, ENOMEM_btree_key_cache_create) \ + x(ENOMEM, ENOMEM_btree_key_cache_fill) \ + x(ENOMEM, ENOMEM_btree_key_cache_insert) \ + x(ENOMEM, ENOMEM_trans_kmalloc) \ + x(ENOMEM, ENOMEM_trans_log_msg) \ + x(ENOMEM, ENOMEM_do_encrypt) \ + x(ENOMEM, ENOMEM_ec_read_extent) \ + x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ + x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ + x(ENOMEM, ENOMEM_fs_btree_cache_init) \ + x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ + x(ENOMEM, ENOMEM_fs_counters_init) \ + x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ + x(ENOMEM, ENOMEM_io_clock_init) \ + x(ENOMEM, ENOMEM_blacklist_table_init) \ + x(ENOMEM, ENOMEM_sb_realloc_injected) \ + x(ENOMEM, ENOMEM_sb_bio_realloc) \ + x(ENOMEM, ENOMEM_sb_buf_realloc) \ + x(ENOMEM, ENOMEM_sb_journal_validate) \ + x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ + x(ENOMEM, ENOMEM_journal_entry_add) \ + x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ + x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ + x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ + x(ENOMEM, ENOMEM_bio_read_init) \ + x(ENOMEM, ENOMEM_bio_read_split_init) \ + x(ENOMEM, ENOMEM_bio_write_init) \ + x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ + x(ENOMEM, ENOMEM_writepage_bioset_init) \ + x(ENOMEM, ENOMEM_dio_read_bioset_init) \ + x(ENOMEM, ENOMEM_dio_write_bioset_init) \ + x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ + x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_compression_bounce_read_init) \ + x(ENOMEM, ENOMEM_compression_bounce_write_init) \ + x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_bucket_gens) \ + x(ENOMEM, ENOMEM_buckets_nouse) \ + x(ENOMEM, ENOMEM_usage_init) \ + x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ + x(ENOMEM, ENOMEM_btree_node_reclaim) \ + x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ + x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ + x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ + x(ENOMEM, ENOMEM_dev_journal_init) \ + x(ENOMEM, ENOMEM_journal_pin_fifo) \ + x(ENOMEM, 
ENOMEM_journal_buf) \ + x(ENOMEM, ENOMEM_gc_start) \ + x(ENOMEM, ENOMEM_gc_alloc_start) \ + x(ENOMEM, ENOMEM_gc_reflink_start) \ + x(ENOMEM, ENOMEM_gc_gens) \ + x(ENOMEM, ENOMEM_gc_repair_key) \ + x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ + x(ENOMEM, ENOMEM_fsck_add_nlink) \ + x(ENOMEM, ENOMEM_journal_key_insert) \ + x(ENOMEM, ENOMEM_journal_keys_sort) \ + x(ENOMEM, ENOMEM_journal_replay) \ + x(ENOMEM, ENOMEM_read_superblock_clean) \ + x(ENOMEM, ENOMEM_fs_alloc) \ + x(ENOMEM, ENOMEM_fs_name_alloc) \ + x(ENOMEM, ENOMEM_fs_other_alloc) \ + x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -14,9 +87,11 @@ x(ENOSPC, ENOSPC_subvolume_create) \ x(ENOSPC, ENOSPC_sb) \ x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ x(ENOSPC, ENOSPC_sb_quota) \ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_crypt) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ec575b27eedb..d98b654c92b1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3713,16 +3713,22 @@ int bch2_fs_fsio_init(struct bch_fs *c) if (bioset_init(&c->writepage_bioset, 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->dio_read_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + if (bioset_init(&c->dio_read_bioset, 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->dio_write_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->nocow_flush_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) - ret = -ENOMEM; + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; pr_verbose_init(c->opts, "ret %i", ret); return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5e6dc6c316d1..ed2523ac2249 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1237,7 +1237,7 @@ static int extent_ends_at(extent_ends *extent_ends, sizeof(seen->ids.data[0]) * seen->ids.size, GFP_KERNEL); if (!n.seen.ids.data) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fsck_extent_ends_at; darray_for_each(*extent_ends, i) { if (i->snapshot == k.k->p.snapshot) { @@ -2141,7 +2141,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -ENOMEM; + return -BCH_ERR_ENOMEM_fsck_add_nlink; } if (t->d) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6fd29966c1db..6daf5f4a905c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2995,18 +2995,26 @@ void bch2_fs_io_exit(struct bch_fs *c) int bch2_fs_io_init(struct bch_fs *c) { if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS) || - mempool_init_page_pool(&c->bio_bounce_pages, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return 
-BCH_ERR_ENOMEM_bio_read_split_init; + + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->opts.btree_node_size, c->opts.encoded_extent_max) / - PAGE_SIZE, 0) || - rhashtable_init(&c->promote_table, &bch_promote_params)) - return -ENOMEM; + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; return 0; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 410521f11ec2..801f09593e6b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -768,7 +768,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; goto err_free; } @@ -941,7 +941,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) unsigned nr; if (dynamic_fault("bcachefs:add:journal_alloc")) - return -ENOMEM; + return -BCH_ERR_ENOMEM_set_nr_journal_buckets; /* 1/128th of the device by default: */ nr = ca->mi.nbuckets >> 7; @@ -1033,7 +1033,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); if (!j->pin.data) { bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_pin_fifo; } } @@ -1127,19 +1127,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return -ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!ca->journal.bio) - return -ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1193,7 +1193,7 @@ int bch2_fs_journal_init(struct journal *j) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_journal_pin_fifo; goto out; } @@ -1201,7 +1201,7 @@ int bch2_fs_journal_init(struct journal *j) j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_journal_buf; goto out; } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 97b131fd72e6..38458ab0013d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -118,7 +118,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_entry_add; /* * Duplicate journal entries? 
If so we want the one that didn't have a @@ -148,7 +148,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_entry_add; i->nr_ptrs = 0; i->csum_good = entry_ptr.csum_good; @@ -835,12 +835,12 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); n = kvpmalloc(new_size, GFP_KERNEL); if (!n) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; kvpfree(b->data, b->size); b->data = n; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 5be7882342e0..fcefbbe7eda8 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -33,7 +33,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); if (!b) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_journal_validate; for (i = 0; i < nr; i++) b[i] = le64_to_cpu(journal->buckets[i]); @@ -116,7 +116,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); if (!b) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_journal_v2_validate; for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 012c870acce0..d6b9f2cdf8e7 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -103,7 +103,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, sb_blacklist_u64s(nr + 1)); if (!bl) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; goto out; } @@ -168,7 +168,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, GFP_KERNEL); if (!t) - return -ENOMEM; + return -BCH_ERR_ENOMEM_blacklist_table_init; t->nr = nr; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index aafe4054d25d..137e523bb7ea 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -228,7 +228,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.d) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ @@ -266,7 +266,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_key_insert; bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -503,7 +503,7 @@ static int journal_keys_sort(struct bch_fs *c) keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); if (!keys->d) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_keys_sort; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -601,7 +601,7 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); if (!keys_sorted) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_replay; for (i = 0; i < keys->nr; i++) keys_sorted[i] = &keys->d[i]; @@ -905,7 +905,7 @@ static struct bch_sb_field_clean 
*read_superblock_clean(struct bch_fs *c) GFP_KERNEL); if (!clean) { mutex_unlock(&c->sb_lock); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); } ret = bch2_sb_clean_validate_late(c, clean, READ); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 3bff21959d98..8935ff5899c9 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -336,7 +336,7 @@ out: return ret; err: bch_err(c, "error updating replicas table: memory allocation failure"); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_replicas_table; goto out; } @@ -383,14 +383,18 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (c->replicas_gc.entries && !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); - if (!new_gc.entries) + if (!new_gc.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; + } } if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(&c->replicas, new_entry); - if (!new_r.entries) + if (!new_r.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; + } ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) @@ -425,8 +429,7 @@ out: return ret; err: - bch_err(c, "error adding replicas entry: memory allocation failure"); - ret = -ENOMEM; + bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); goto out; } @@ -478,7 +481,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; } @@ -533,7 +536,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_replicas_gc; } for_each_cpu_replicas_entry(&c->replicas, e) @@ -562,7 +565,7 @@ retry: new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_replicas_gc; } mutex_lock(&c->sb_lock); @@ -621,7 +624,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, n = cpu_replicas_add_entry(&c->replicas, r); if (!n.entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; ret = replicas_table_update(c, &n); if (ret) @@ -655,7 +658,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -687,7 +690,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -717,9 +720,8 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (ret) - return -ENOMEM; + return ret; bch2_cpu_replicas_sort(&new_r); @@ -881,8 +883,9 @@ static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, struct bch_replicas_cpu cpu_r; int ret; - if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) - return -ENOMEM; + ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + 
return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); @@ -919,8 +922,9 @@ static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field * struct bch_replicas_cpu cpu_r; int ret; - if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) - return -ENOMEM; + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index bcc67c0f5dfc..43d83705a7ae 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -87,7 +87,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, U32_MAX - new.k->p.offset, GFP_KERNEL); if (!t) - return -ENOMEM; + return -BCH_ERR_ENOMEM_mark_snapshot; if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e311b1b4595a..d23ed9ec30f1 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -139,14 +139,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) return 0; if (dynamic_fault("bcachefs:add:super_realloc")) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_realloc_injected; if (sb->have_bio) { unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_bio_realloc; bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); @@ -156,7 +156,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); if (!new_sb) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_buf_realloc; sb->sb = new_sb; sb->buffer_size = new_buffer_size; @@ -562,8 +562,9 @@ reread: } if (bytes > sb->buffer_size) { - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) - return -ENOMEM; + ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); + if (ret) + return ret; goto reread; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 46dae5ab0db7..7f7beed1e062 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -644,7 +644,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { - c = ERR_PTR(-ENOMEM); + c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; } @@ -744,7 +744,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) strscpy(c->name, name.buf, sizeof(c->name)); printbuf_exit(&name); - ret = name.allocation_failure ? -ENOMEM : 0; + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; @@ -808,7 +808,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } @@ -1189,7 +1189,7 @@ out: err: if (ca) bch2_dev_free(ca); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_dev_alloc; goto out; } -- cgit From b9fa375bab2786d0d2c5435b5e3fceaf6594aaf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Mar 2023 20:38:46 -0500 Subject: bcachefs: bch2_fs_moving_ctxts_to_text() This also adds bch2_write_op_to_text(): now we can see outstand moves, useful for debugging shutdown with the upcoming BCH_WRITE_WAIT_FOR_EC and likely for other things in the future. 
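As an illustrative aside (not part of this patch; names are invented for the example), the io.h change below converts the write flags to an x-macro list so that the same list can generate the bit-number enum, the mask enum, and the string table consumed by prt_bitflags(). A minimal sketch of that pattern:

/*
 * Sketch of the x-macro pattern: one flag list, three expansions.
 * All identifiers here are illustrative stand-ins.
 */
#define EXAMPLE_FLAGS()		\
	x(ALLOC_NOWAIT)		\
	x(CACHED)		\
	x(SYNC)

enum __example_flags {
#define x(f)	__EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
};

enum example_flags {
#define x(f)	EXAMPLE_##f = 1U << __EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
};

static const char * const example_flag_names[] = {
#define x(f)	#f,
	EXAMPLE_FLAGS()
#undef x
	NULL
};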
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 16 +++++--- fs/bcachefs/io.c | 28 ++++++++++++++ fs/bcachefs/io.h | 61 ++++++++++++++--------------- fs/bcachefs/move.c | 103 +++++++++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/move.h | 9 +++++ fs/bcachefs/super.c | 3 +- fs/bcachefs/sysfs.c | 26 +++---------- 7 files changed, 179 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index baaa4cd3caa7..8be65ebb34ad 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -214,8 +214,11 @@ #define BCH_WRITE_REF_DEBUG #endif +#ifndef dynamic_fault #define dynamic_fault(...) 0 -#define race_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") #define trace_and_count(_c, _name, ...) \ do { \ @@ -922,6 +925,13 @@ struct bch_fs { mempool_t large_bkey_pool; + /* MOVE.C */ + struct list_head moving_context_list; + struct mutex moving_context_lock; + + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* REBALANCE */ struct bch_fs_rebalance rebalance; @@ -932,10 +942,6 @@ struct bch_fs { bool copygc_running; wait_queue_head_t copygc_running_wq; - /* DATA PROGRESS STATS */ - struct list_head data_progress_list; - struct mutex data_progress_lock; - /* STRIPES: */ GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 6daf5f4a905c..1b093650ff9a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1870,6 +1870,34 @@ err: op->end_io(op); } +const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + /* Cache promotion on read */ struct promote_op { diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 77a4a1cef71c..87d80fb28c05 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -28,41 +28,34 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, const char *bch2_blk_status_to_str(blk_status_t); -enum bch_write_flags { - __BCH_WRITE_ALLOC_NOWAIT, - __BCH_WRITE_CACHED, - __BCH_WRITE_DATA_ENCODED, - __BCH_WRITE_PAGES_STABLE, - __BCH_WRITE_PAGES_OWNED, - __BCH_WRITE_ONLY_SPECIFIED_DEVS, - __BCH_WRITE_WROTE_DATA_INLINE, - __BCH_WRITE_FROM_INTERNAL, - __BCH_WRITE_CHECK_ENOSPC, - __BCH_WRITE_SYNC, - __BCH_WRITE_MOVE, - __BCH_WRITE_IN_WORKER, - __BCH_WRITE_DONE, - __BCH_WRITE_IO_ERROR, - __BCH_WRITE_CONVERT_UNWRITTEN, +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x }; -#define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) -#define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED) -#define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED) -#define BCH_WRITE_PAGES_STABLE (1U 
<< __BCH_WRITE_PAGES_STABLE) -#define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED) -#define BCH_WRITE_ONLY_SPECIFIED_DEVS (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS) -#define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE) -#define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL) -#define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC) -#define BCH_WRITE_SYNC (1U << __BCH_WRITE_SYNC) -#define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) - -/* Internal: */ -#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) -#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) -#define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) -#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { @@ -124,6 +117,8 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) return wbio; } +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index de10f388b8d3..f74ef947cac5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -40,7 +40,8 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) } struct moving_io { - struct list_head list; + struct list_head read_list; + struct list_head io_list; struct move_bucket_in_flight *b; struct closure cl; bool read_completed; @@ -64,7 +65,12 @@ static void move_free(struct moving_io *io) atomic_dec(&io->b->count); bch2_data_update_exit(&io->write); + + mutex_lock(&ctxt->lock); + list_del(&io->io_list); wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + bch2_write_ref_put(c, BCH_WRITE_REF_move); kfree(io); } @@ -100,7 +106,7 @@ static void move_write(struct moving_io *io) struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) { struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, list); + list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); return io && io->read_completed ? 
io : NULL; } @@ -127,7 +133,7 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, bch2_trans_unlock(trans); while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { - list_del(&io->list); + list_del(&io->read_list); move_write(io); } } @@ -144,6 +150,8 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, void bch2_moving_ctxt_exit(struct moving_context *ctxt) { + struct bch_fs *c = ctxt->c; + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); @@ -153,12 +161,15 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->read_ios)); if (ctxt->stats) { - progress_list_del(ctxt->c, ctxt->stats); - - trace_move_data(ctxt->c, + progress_list_del(c, ctxt->stats); + trace_move_data(c, atomic64_read(&ctxt->stats->sectors_moved), atomic64_read(&ctxt->stats->keys_moved)); } + + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -171,15 +182,23 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, memset(ctxt, 0, sizeof(*ctxt)); ctxt->c = c; + ctxt->fn = (void *) _RET_IP_; ctxt->rate = rate; ctxt->stats = stats; ctxt->wp = wp; ctxt->wait_on_copygc = wait_on_copygc; closure_init_stack(&ctxt->cl); + + mutex_init(&ctxt->lock); INIT_LIST_HEAD(&ctxt->reads); + INIT_LIST_HEAD(&ctxt->ios); init_waitqueue_head(&ctxt->wait); + mutex_lock(&c->moving_context_lock); + list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); + if (stats) { progress_list_add(c, stats); stats->data_type = BCH_DATA_user; @@ -280,6 +299,7 @@ static int bch2_move_extent(struct btree_trans *trans, if (!io) goto err; + INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; io->read_sectors = k.k->size; io->write_sectors = k.k->size; @@ -333,9 +353,14 @@ static int bch2_move_extent(struct btree_trans *trans, this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); trace_move_extent_read(k.k); + + mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); atomic_inc(&ctxt->read_ios); - list_add_tail(&io->list, &ctxt->reads); + + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + mutex_unlock(&ctxt->lock); /* * dropped by move_read_endio() - guards against use after free of @@ -1107,3 +1132,67 @@ int bch2_data_job(struct bch_fs *c, return ret; } + +void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); + list_for_each_entry(stats, &c->data_progress_list, list) { + prt_printf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + prt_printf(out, "%s", "\n"); + } + mutex_unlock(&c->data_progress_lock); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +{ + struct moving_io *io; + + prt_printf(out, "%ps:", ctxt->fn); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: %u sectors %u", + atomic_read(&ctxt->read_ios), + atomic_read(&ctxt->read_sectors)); + prt_newline(out); + + prt_printf(out, "writes: %u sectors %u", + atomic_read(&ctxt->write_ios), + atomic_read(&ctxt->write_sectors)); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) { + bch2_write_op_to_text(out, &io->write.op); + 
} + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +} + +void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct moving_context *ctxt; + + mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, ctxt); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_fs_move_init(struct bch_fs *c) +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); + + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 4c0013872347..50a6f7d7a292 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -11,6 +11,9 @@ struct bch_read_bio; struct moving_context { struct bch_fs *c; + struct list_head list; + void *fn; + struct bch_ratelimit *rate; struct bch_move_stats *stats; struct write_point_specifier wp; @@ -19,7 +22,10 @@ struct moving_context { /* For waiting on outstanding reads and writes: */ struct closure cl; + + struct mutex lock; struct list_head reads; + struct list_head ios; /* in flight sectors: */ atomic_t read_sectors; @@ -84,6 +90,9 @@ int bch2_data_job(struct bch_fs *, struct bch_ioctl_data); void bch2_move_stats_init(struct bch_move_stats *stats, char *name); +void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_move_init(struct bch_fs *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7f7beed1e062..bf3aabdb0fc9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -681,6 +681,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); INIT_LIST_HEAD(&c->list); @@ -705,8 +706,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_stripe_new_list); mutex_init(&c->ec_stripe_new_lock); - INIT_LIST_HEAD(&c->data_progress_list); - mutex_init(&c->data_progress_lock); mutex_init(&c->ec_stripes_heap_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e3a166f79cb6..6be6be881dbd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -248,6 +248,7 @@ read_attribute(io_timers_read); read_attribute(io_timers_write); read_attribute(data_jobs); +read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); @@ -277,25 +278,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) -{ - long ret = 0; - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, "%s", "\n"); - } - - mutex_unlock(&c->data_progress_lock); - return ret; -} - static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; @@ -476,7 +458,10 @@ SHOW(bch2_fs) bch2_io_timers_to_text(out, &c->io_clock[WRITE]); if (attr == &sysfs_data_jobs) - data_progress_to_text(out, c); + bch2_data_jobs_to_text(out, c); + + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); #ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) @@ 
-693,6 +678,7 @@ struct attribute *bch2_fs_internal_files[] = { sysfs_pd_controller_files(rebalance), &sysfs_data_jobs, + &sysfs_moving_ctxts, &sysfs_internal_uuid, NULL -- cgit From b40901b0f7182557851c8e9af31bacfbbd76b1ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Mar 2023 22:01:47 -0400 Subject: bcachefs: New erasure coding shutdown path This implements a new shutdown path for erasure coding, which is needed for the upcoming BCH_WRITE_WAIT_FOR_EC write path. The process is: - Cancel new stripes being built up - Close out/cancel open buckets on write points or the partial list that are for stripes - Shutdown rebalance/copygc - Then wait for in flight new stripes to finish With BCH_WRITE_WAIT_FOR_EC, move ops will be waiting on stripes to fill up before they complete; the new ec shutdown path is needed for shutting down copygc/rebalance without deadlocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 39 +---------------- fs/bcachefs/alloc_foreground.c | 96 ++++++++++++++++++++++++++++++++---------- fs/bcachefs/alloc_foreground.h | 6 +-- fs/bcachefs/bcachefs.h | 7 ++- fs/bcachefs/data_update.c | 1 + fs/bcachefs/ec.c | 54 +++++++++++++++++++++--- fs/bcachefs/ec.h | 4 +- fs/bcachefs/io.c | 10 ++++- fs/bcachefs/move.c | 6 --- fs/bcachefs/super.c | 12 ++---- 10 files changed, 141 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e5abe6406afe..17bcebbd1f2a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2158,44 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) */ bch2_recalc_capacity(c); - /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - spin_lock(&c->freelist_lock); - i = 0; - while (i < c->open_buckets_partial_nr) { - struct open_bucket *ob = - c->open_buckets + c->open_buckets_partial[i]; - - if (ob->dev == ca->dev_idx) { - --c->open_buckets_partial_nr; - swap(c->open_buckets_partial[i], - c->open_buckets_partial[c->open_buckets_partial_nr]); - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - bch2_open_bucket_put(c, ob); - spin_lock(&c->freelist_lock); - } else { - i++; - } - } - spin_unlock(&c->freelist_lock); - - bch2_ec_stop_dev(c, ca); + bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7c81189bcd62..20c64882104e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1023,45 +1023,96 @@ static int open_bucket_add_buckets(struct btree_trans *trans, return ret < 0 ? 
ret : 0; } -void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs) +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) { - struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob, *ob2; - unsigned i, j; - - open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->dev == ca->dev_idx; + if (ec) { + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; + struct open_bucket *ob2; + unsigned i; if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { - if (!ob->ec->blocks[j]) + for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { + if (!ob->ec->blocks[i]) continue; - ob2 = c->open_buckets + ob->ec->blocks[j]; + ob2 = c->open_buckets + ob->ec->blocks[i]; drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } - if (drop) - bch2_open_bucket_put(c, ob); - else - ob_push(c, &ptrs, ob); + return drop; + } else { + return true; } - - *obs = ptrs; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec, struct write_point *wp) { + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; mutex_unlock(&wp->lock); } +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec) +{ + unsigned i; + + /* Next, close write points that point to this device... */ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } + } + spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); +} + static inline struct hlist_head *writepoint_hash(struct bch_fs *c, unsigned long write_point) { @@ -1107,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct bch_fs *c, - unsigned old_nr) +static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr) { struct write_point *wp; @@ -1129,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c, hlist_del_rcu(&wp->node); mutex_unlock(&c->write_points_hash_lock); - bch2_writepoint_stop(c, NULL, wp); + bch2_writepoint_stop(c, NULL, false, wp); return true; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 
1fa96f8c6879..8a1cf425091b 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *); - -void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, - struct write_point *); +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); static inline struct write_point_specifier writepoint_hashed(unsigned long v) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8be65ebb34ad..05fc0f7434dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -655,7 +655,6 @@ typedef struct { x(fallocate) \ x(discard) \ x(invalidate) \ - x(move) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ x(sysfs) @@ -958,14 +957,14 @@ struct bch_fs { struct list_head ec_stripe_new_list; struct mutex ec_stripe_new_lock; + wait_queue_head_t ec_stripe_new_wait; struct work_struct ec_stripe_create_work; u64 ec_stripe_hint; - struct bio_set ec_bioset; - struct work_struct ec_stripe_delete_work; - struct llist_head ec_stripe_delete_list; + + struct bio_set ec_bioset; /* REFLINK */ u64 reflink_hint; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 447863825a89..5ec884a222f8 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -252,6 +252,7 @@ restart_drop_extra_replicas: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index af3a72acc67f..1e621dcc1d37 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -989,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_offset)); @@ -1127,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } - ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ret = bch2_trans_do(c, &s->res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, ec_stripe_key_update(&trans, &s->new_stripe.key, !s->have_existing_stripe)); if (ret) { @@ -1409,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + h = ERR_PTR(-EROFS); + goto found; + } + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && @@ -1753,7 +1761,7 @@ err: return ERR_PTR(ret); } -void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; struct open_bucket *ob; @@ -1761,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); if (!h->s) goto unlock; + if (!ca) + goto found; + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; @@ -1784,6 +1794,32 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + __bch2_ec_stop(c, ca); +} + +void 
bch2_fs_ec_stop(struct bch_fs *c) +{ + __bch2_ec_stop(c, NULL); +} + +static bool bch2_fs_ec_flush_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->ec_stripe_new_lock); + ret = list_empty(&c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + return ret; +} + +void bch2_fs_ec_flush(struct bch_fs *c) +{ + wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); +} + int bch2_stripes_read(struct bch_fs *c) { struct btree_trans trans; @@ -1915,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c) void bch2_fs_ec_init_early(struct bch_fs *c) { + spin_lock_init(&c->ec_stripes_new_lock); + mutex_init(&c->ec_stripes_heap_lock); + + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + init_waitqueue_head(&c->ec_stripe_new_wait); + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); } int bch2_fs_ec_init(struct bch_fs *c) { - spin_lock_init(&c->ec_stripes_new_lock); - return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8f777a37e43d..7c08a49d7419 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -245,8 +245,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); - -void bch2_ec_flush_new_stripes(struct bch_fs *); +void bch2_fs_ec_stop(struct bch_fs *); +void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1b093650ff9a..e82da496b3f8 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -705,7 +705,8 @@ static void bch2_write_done(struct closure *cl) struct bch_fs *c = op->c; bch2_disk_reservation_put(c, &op->res); - bch2_write_ref_put(c, BCH_WRITE_REF_write); + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); @@ -1842,7 +1843,12 @@ void bch2_write(struct closure *cl) goto err; } - if (c->opts.nochanges || + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f74ef947cac5..4a9ffca7be62 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -59,7 +59,6 @@ struct moving_io { static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; - struct bch_fs *c = ctxt->c; if (io->b) atomic_dec(&io->b->count); @@ -71,7 +70,6 @@ static void move_free(struct moving_io *io) wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); - bch2_write_ref_put(c, BCH_WRITE_REF_move); kfree(io); } @@ -280,9 +278,6 @@ static int bch2_move_extent(struct btree_trans *trans, return 0; } - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) - return -BCH_ERR_erofs_no_writes; - /* * Before memory allocations & taking nocow locks in * bch2_data_update_init(): @@ -378,7 +373,6 @@ err_free_pages: err_free: kfree(io); err: - bch2_write_ref_put(c, BCH_WRITE_REF_move); trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bf3aabdb0fc9..278f8f19a230 100644 --- a/fs/bcachefs/super.c +++ 
b/fs/bcachefs/super.c @@ -205,9 +205,12 @@ static void __bch2_fs_read_only(struct bch_fs *c) unsigned i, clean_passes = 0; u64 seq = 0; + bch2_fs_ec_stop(c); + bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); + bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", journal_cur_seq(&c->journal)); @@ -700,15 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); - - INIT_LIST_HEAD(&c->ec_stripe_new_list); - mutex_init(&c->ec_stripe_new_lock); - - - mutex_init(&c->ec_stripes_heap_lock); - seqcount_init(&c->gc_pos_lock); seqcount_init(&c->usage_lock); -- cgit From 40a18fe27335706789b1322934f4d8b458f302e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 16:21:16 -0400 Subject: bcachefs: Add error message for failing to allocate sorted journal keys Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 137e523bb7ea..73f7663cbd3f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -502,8 +502,11 @@ static int journal_keys_sort(struct bch_fs *c) keys->size = roundup_pow_of_two(nr_keys); keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys)", + nr_keys); return -BCH_ERR_ENOMEM_journal_keys_sort; + } genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; -- cgit From 2f0815840c80075bc35f210a7acfa8b48717be5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 14:39:54 -0400 Subject: bcachefs: Improve the backpointer to missing extent message We now print the pos where the backpointer was found in the btree, as well as the exact bucket:bucket_offset of the data, to aid in grepping through logs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 39 ++++++++++++++++++++++++--------------- fs/bcachefs/backpointers.h | 2 +- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8517c5635226..740084b3ff12 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -298,11 +298,12 @@ err: /* * Find the next backpointer >= *bp_offset: */ -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - u64 *bp_offset, - struct bch_backpointer *dst, - unsigned iter_flags) +int __bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, + struct bpos *bp_pos_ret, + struct bch_backpointer *dst, + unsigned iter_flags) { struct bch_fs *c = trans->c; struct bpos bp_pos, bp_end_pos; @@ -352,6 +353,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, *dst = *bkey_s_c_to_backpointer(k).v; *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; + *bp_pos_ret = k.k->p; goto out; } done: @@ -362,6 +364,19 @@ out: return ret; } +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, + struct bch_backpointer *dst, + unsigned iter_flags) +{ + struct bpos bp_pos; + + return __bch2_get_next_backpointer(trans, bucket, gen, + bp_offset, &bp_pos, + dst, iter_flags); +} + static void backpointer_not_found(struct btree_trans *trans, struct bpos bucket, u64 bp_offset, @@ -952,7 +967,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0); + ret = __bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp_pos, &bp, 0); if (ret || *bp_offset == U64_MAX) return ret; @@ -968,23 +983,17 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - bp_pos = bucket_pos_to_bp(c, bucket, - max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); - if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { *last_flushed_pos = bp_pos; - pr_info("flushing at %llu:%llu", - last_flushed_pos->inode, - last_flushed_pos->offset); - ret = bch2_btree_write_buffer_flush_sync(trans) ?: -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } if (fsck_err_on(!k.k, c, - "%s backpointer points to missing extent\n%s", - *bp_offset < BACKPOINTER_OFFSET_MAX ? 
"alloc" : "btree", + "backpointer for %llu:%llu:%llu (btree pos %llu:%llu) points to missing extent\n %s", + bucket.inode, bucket.offset, (u64) bp.bucket_offset, + bp_pos.inode, bp_pos.offset, (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); if (ret == -ENOENT) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 314fee21dc27..d0ba5d8596c5 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -48,7 +48,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, (bucket_to_sector(ca, bucket.offset) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); - BUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); return ret; } -- cgit From 26559553e47c9f2a14b5254cb307fc755ac316c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Mar 2023 11:02:00 -0400 Subject: bcachefs: Add a fallback when journal_keys doesn't fit in ram We may end up in a situation where allocating the buffer for the sorted journal_keys fails - but it would likely succeed, post compaction where we drop duplicates. We've had reports of this allocation failing, so this adds a slowpath to do the compaction incrementally. This is only a band-aid fix; we need to look at limiting the number of keys in the journal based on the amount of system RAM. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 67 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 73f7663cbd3f..6aa99f57a001 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -476,6 +476,26 @@ void bch2_journal_keys_free(struct journal_keys *keys) keys->nr = keys->gap = keys->size = 0; } +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + static int journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; @@ -483,8 +503,7 @@ static int journal_keys_sort(struct bch_fs *c) struct jset_entry *entry; struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - struct journal_key *src, *dst; - size_t nr_keys = 0; + size_t nr_keys = 0, nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -503,9 +522,19 @@ static int journal_keys_sort(struct bch_fs *c) keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys)", + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } } genradix_for_each(&c->journal_entries, iter, _i) { @@ -514,7 +543,17 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) 
continue; - for_each_jset_key(k, entry, &i->j) + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, @@ -522,23 +561,15 @@ static int journal_keys_sort(struct bch_fs *c) .journal_seq = le64_to_cpu(i->j.seq), .journal_offset = k->_data - i->j._data, }; - } - - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; + nr_read++; + } } - keys->nr = dst - keys->d; + __journal_keys_sort(keys); keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); return 0; } -- cgit From 56cc033dfcf002eb8a957097fe7290546829b7c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Mar 2023 14:41:07 -0400 Subject: bcachefs: Don't run transaction hooks multiple times Transaction hooks aren't supposed to run unless we know the transaction is going to commit successfully: this fixes a bug with attempting to delete a subvolume multiple times. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 19efd484fc9d..f608e1c92285 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -622,14 +622,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, prefetch(&trans->c->journal.flags); - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -696,6 +688,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto revert_fs_usage; } + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ret = run_one_mem_trigger(trans, i, i->flags); -- cgit From 2d33036ca9360bacef23ba32e7768ff9ea87f2be Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Mar 2023 11:04:28 -0400 Subject: bcachefs: Fix for 'missing subvolume' error Subvolumes, including their root inodes, get deleted asynchronously after an unlink. But we still need to ensure that we tell the VFS the inode has been deleted, otherwise VFS writeback could fire after asynchronous deletion has finished, and try to write to an inode/subvolume that no longer exists.
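As an illustrative aside (a minimal sketch using only stock VFS helpers, not the patch itself), telling the VFS about an asynchronous on-disk deletion amounts to clearing the inode's link count, which is what the fs.c hunk below does for subvolume root inodes:

#include <linux/fs.h>

/*
 * Sketch: mark a VFS inode as unlinked even though the on-disk deletion
 * completes later.  With i_nlink == 0 the VFS treats the inode as deleted,
 * so writeback won't try to persist it after the asynchronous deletion
 * has finished.
 */
static void example_mark_deleted(struct inode *inode)
{
	set_nlink(inode, 0);
}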
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 15ab77ebb8c6..828887abc261 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -442,19 +442,27 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_trans_init(&trans, c, 4, 1024); ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); + if (unlikely(ret)) + goto err; - if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, - ATTR_MTIME); - } + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_MTIME); + if (inode_u.bi_subvol) { + /* + * Subvolume deletion is asynchronous, but we still want to tell + * the VFS that it's been deleted here: + */ + set_nlink(&inode->v, 0); + } +err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -- cgit From e1e7ecafe6482464ccc510afb38e1b9b306ce5dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Mar 2023 12:47:35 -0400 Subject: bcachefs: Improve error handling in bch2_ioctl_subvolume_destroy() Pure style fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index b51053130f28..269af9393824 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -451,19 +451,20 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, return ret; if (path.dentry->d_sb->s_fs_info != c) { - path_put(&path); - return -EXDEV; + ret = -EXDEV; + goto err; } dir = path.dentry->d_parent->d_inode; ret = __bch2_unlink(dir, path.dentry, true); - if (!ret) { - fsnotify_rmdir(dir, path.dentry); - d_delete(path.dentry); - } - path_put(&path); + if (ret) + goto err; + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +err: + path_put(&path); return ret; } -- cgit From 9edbcc72f6987bbb58f113d04e7704b7a84106a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Mar 2023 11:53:51 -0400 Subject: bcachefs: Fix bch2_evict_subvolume_inodes() This fixes a bug in bch2_evict_subvolume_inodes(): d_mark_dontcache() doesn't handle the case where i_count is already 0, we need to grab and put the inode in order for it to be dropped. 
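As an illustrative aside (a hedged sketch with invented names; the real patch below also batches the grabbed inodes into a darray and allocates with GFP_ATOMIC), the grab-and-put idiom being described looks like this:

#include <linux/fs.h>
#include <linux/dcache.h>

/*
 * Sketch: force an otherwise-unreferenced inode to actually be dropped.
 * d_mark_dontcache() only takes effect when the last reference goes away,
 * so if i_count is already 0 we take and release a reference ourselves.
 */
static void example_force_drop(struct inode *inode)
{
	d_mark_dontcache(inode);
	if (!atomic_read(&inode->i_count) && igrab(inode))
		iput(inode);
}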
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +++ fs/bcachefs/darray.h | 15 +++++--- fs/bcachefs/fs.c | 93 ++++++++++++++++++++++++++++++++++---------------- fs/bcachefs/fs.h | 1 + fs/bcachefs/inode.c | 3 -- fs/bcachefs/super.c | 3 ++ 6 files changed, 81 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 05fc0f7434dd..c1f27b4910a0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -971,6 +971,10 @@ struct bch_fs { reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 519ab9b96e67..978ab7961f1b 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -19,11 +19,11 @@ struct { \ typedef DARRAY(void) darray_void; -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) { if (d->nr + more > d->size) { size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + void *data = krealloc_array(d->data, new_size, t_size, gfp); if (!data) return -ENOMEM; @@ -35,20 +35,25 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) return 0; } +#define darray_make_room_gfp(_d, _more, _gfp) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + #define darray_make_room(_d, _more) \ - __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + darray_make_room_gfp(_d, _more, GFP_KERNEL) #define darray_top(_d) ((_d).data[(_d).nr]) -#define darray_push(_d, _item) \ +#define darray_push_gfp(_d, _item, _gfp) \ ({ \ - int _ret = darray_make_room((_d), 1); \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ \ if (!_ret) \ (_d)->data[(_d)->nr++] = (_item); \ _ret; \ }) +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + #define darray_insert_item(_d, _pos, _item) \ ({ \ size_t pos = (_pos); \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 828887abc261..129924dfaf69 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -201,6 +201,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return ERR_PTR(ret); } + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + unlock_new_inode(&inode->v); return &inode->v; @@ -314,6 +318,9 @@ err_before_quota: inode = old; } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); /* * we really don't want insert_inode_locked2() to be setting * I_NEW... 
@@ -1370,6 +1377,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); mutex_init(&inode->ei_quota_lock); return &inode->v; @@ -1434,53 +1442,78 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); } -void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct super_block *sb = c->vfs_sb; - struct inode *inode; + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). + * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); again: cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) continue; - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING)) { + this_pass_clean = false; + + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + + /* + * If i_count was zero, we have to take and release a + * ref in order for I_DONTCACHE to be noticed and the + * inode to be dropped; + */ + + if (!atomic_read(&inode->v.i_count) && + igrab(&inode->v) && + darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) + break; + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + mutex_unlock(&c->vfs_inodes_lock); + schedule(); finish_wait(wq, &wait.wq_entry); goto again; } + } + mutex_unlock(&c->vfs_inodes_lock); - spin_unlock(&inode->i_lock); + darray_for_each(grabbed, i) + iput(&(*i)->v); + grabbed.nr = 0; + + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; } - spin_unlock(&sb->s_inode_list_lock); + + darray_exit(&grabbed); } static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e1c73a38c607..2e63cb6603bd 100644 --- 
a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -13,6 +13,7 @@ struct bch_inode_info { struct inode v; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; struct mutex ei_update_lock; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 560545a7ea03..7ccbc00b7156 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -803,9 +803,6 @@ retry: bch2_inode_unpack(k, &inode_u); - /* Subvolume root? */ - BUG_ON(inode_u.bi_subvol); - bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 278f8f19a230..d6f2f453c027 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -709,6 +709,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) sema_init(&c->io_in_flight, 128); + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; -- cgit From 711bf946d55d28336dcc4f87209c8b74e6279481 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Mar 2023 19:04:05 -0400 Subject: bcachefs: Add an assert in inode_write for -ENOENT Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 129924dfaf69..a57ab773dd27 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -105,6 +105,11 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_fs_fatal_err_on(ret == -ENOENT, c, + "inode %u:%llu not found when updating", + inode_inum(inode).subvol, + inode_inum(inode).inum); + bch2_trans_exit(&trans); return ret < 0 ? ret : 0; } -- cgit From abab7609de92c973bfa3ad069a622c0a107b6386 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 10:56:44 -0400 Subject: bcachefs: Fix bch2_extent_fallocate() in nocow mode When we allocate disk space, we need to be incrementing the WRITE io clock, which perhaps should be renamed to sectors allocated - copygc uses this io clock to know when to run. Also, we should be incrementing the same clock when allocating btree nodes. 
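To see why this matters: copygc paces itself off the WRITE io clock, so any path that allocates sectors has to advance that clock or copygc never wakes up. A sketch of the relationship, pieced together from code elsewhere in this series (not part of the patch below):

        /* copygc thread: sleep until enough has been written/allocated */
        wait = bch2_copygc_wait_amount(c);
        if (wait > clock->max_slop) {
                bch2_kthread_io_clock_wait(clock, last + wait,
                                           MAX_SCHEDULE_TIMEOUT);
                continue;
        }

        /* allocating paths: feed that same clock by the amount allocated */
        bch2_increment_clock(c, sectors_allocated, WRITE);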
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 ++ fs/bcachefs/io.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d64a86f39595..a58d2a142b67 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "clock.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -363,6 +364,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, BUG_ON(ret); trace_and_count(c, btree_node_alloc, c, b); + bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index e82da496b3f8..0c2d42eaba56 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -384,6 +384,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets; struct bkey_s_c k; struct bkey_buf old, new; + unsigned sectors_allocated; bool have_reservation = false; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -394,6 +395,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, closure_init_stack(&cl); open_buckets.nr = 0; retry: + sectors_allocated = 0; + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -459,6 +462,7 @@ retry: return ret; sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; bch2_key_resize(&e->k, sectors); @@ -485,6 +489,9 @@ out: goto retry; } + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); -- cgit From ac77810cb4ffd16976487d787e2f81ba9cb5fd0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 12:50:05 -0400 Subject: bcachefs: Nocow write error path fix The nocow write error path was iterating over pointers in an extent, aftre we'd dropped btree locks - oops. Fortunately we'd already stashed what we need in nocow_lock_bucket, so use that instead. 
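The rule the fix follows: anything we'll still need after dropping btree locks has to be copied out while we hold them. A minimal sketch - put_ref() is a placeholder, not a real helper:

        /* unsafe: ptrs points into a btree node we no longer have locked */
        bkey_for_each_ptr(ptrs, ptr)
                put_ref(ptr->dev);

        /* safe: walk the copy we stashed while the node was still locked */
        for (i = 0; i < nr_buckets; i++)
                put_ref(buckets[i].b.inode);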
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0c2d42eaba56..ad22557197a6 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1481,7 +1481,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_iter iter; struct bkey_s_c k; struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr, *ptr2; + const struct bch_extent_ptr *ptr; struct { struct bpos b; unsigned gen; @@ -1536,11 +1536,12 @@ retry: bucket_to_u64(buckets[nr_buckets].b)); prefetch(buckets[nr_buckets].l); - nr_buckets++; if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) goto err_get_ioref; + nr_buckets++; + if (ptr->unwritten) op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; } @@ -1631,12 +1632,8 @@ err: } return; err_get_ioref: - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); - } + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1645,9 +1642,8 @@ err_bucket_stale: bch2_bucket_nocow_unlock(&c->nocow_locks, buckets[i].b, BUCKET_NOCOW_LOCK_UPDATE); - - bkey_for_each_ptr(ptrs, ptr2) - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* We can retry this: */ ret = BCH_ERR_transaction_restart; -- cgit From dc6274bcb87dbf50c62d9dbacba770bae2f10279 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 13:01:06 -0400 Subject: bcachefs: Fix nocow write path closure bug With regular waitlists, we need to ensure we always call finish_wait(). With closures, the equivalent is that we need to call closure_sync() before returning with a stack-allocated closure. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ad22557197a6..cc2dfcf16dee 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -453,13 +453,13 @@ retry: opts.data_replicas, opts.data_replicas, RESERVE_none, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); - goto retry; - } - if (ret) + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + goto retry; return ret; + } sectors = min(sectors, wp->sectors_free); sectors_allocated = sectors; -- cgit From 3e36e572f14bfa2bac80c3ba07af67e204eb7820 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 14:13:17 -0400 Subject: bcachefs: Fix an unhandled transaction restart error This is a bit awkward: we're passing around a btree_trans, but we're not in a context where transaction restarts are handled - we should try to come up with a better way to denote situations like this. 
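For reference, the idiom for contexts that do handle restarts - every iteration begins the transaction again and restart errors simply loop; do_work() is a placeholder for the actual work:

        do {
                bch2_trans_begin(trans);
                ret = do_work(trans);
        } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

__bch2_evacuate_bucket() isn't called from such a loop, hence the explicit bch2_trans_begin() added below.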
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4a9ffca7be62..d94cefec9880 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -713,6 +713,11 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_bkey_buf_init(&sk); + /* + * We're not run in a context that handles transaction restarts: + */ + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); ret = lockrestart_do(trans, -- cgit From 330970c2c61686b43cfef47ab99a84f659271ede Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 14:29:51 -0400 Subject: bcachefs: Make reconstruct_alloc quieter We shouldn't be printing out fsck errors for expected errors - this helps make test logs more readable, and makes it easier to see what the actual failure was. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 72 ++++++++++++++++++++++++++------------------------ fs/bcachefs/lru.c | 3 ++- 2 files changed, 39 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e2fd4c2cfbd0..d9f1e011ed71 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -572,15 +572,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (c->opts.reconstruct_alloc || - fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!g->gen_valid && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -589,14 +589,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -609,25 +610,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (gen_cmp(g->gen, 
p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; - if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0, c, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -757,7 +759,7 @@ found: if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); - if (c->opts.verbose) { + if (0) { printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, *k); bch_info(c, "updated %s", buf.buf); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index e913b90f37b7..c2dece27da2d 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -148,7 +148,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + if (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", bch2_lru_types[type], -- cgit From d59ca7e8c072b4587113ced8e00358368f315626 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 14:32:23 -0400 Subject: bcachefs: verify_bucket_evacuated() -> set_btree_iter_dontneed() This should help with excessive 'would deadlock' transaction restarts. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d94cefec9880..ff4a74de728d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -629,6 +629,8 @@ void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, u64 bp_offset = 0; int ret; + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); again: @@ -649,6 +651,7 @@ again: } } + set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); return; failed_to_evacuate: -- cgit From ffc76edbbea3a55876fcd91d10db4ce38b27cac6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Mar 2023 16:25:15 -0400 Subject: bcachefs: Fix bch2_verify_bucket_evacuated() We were going into an infinite loop when printing out backpointers, due to never incrementing bp_offset - whoops. Also limit the number of backpointers we print to 10; this is debug code and we only need to print a sample, not all of them. 
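Reduced to a sketch, the fixed loop looks like this (next_backpointer() and print_bp() stand in for the real backpointer lookup and printbuf code):

        u64 bp_offset = 0;
        unsigned nr_bps = 0;

        while (!next_backpointer(trans, bucket, &bp_offset, &bp)) {
                print_bp(&buf, &bp);

                if (++nr_bps > 10)      /* debug output - a sample is plenty */
                        break;

                bp_offset++;            /* forgetting this was the infinite loop */
        }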
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ff4a74de728d..9717fdce3ba5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -627,6 +627,7 @@ void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, struct printbuf buf = PRINTBUF; struct bch_backpointer bp; u64 bp_offset = 0; + unsigned nr_bps = 0; int ret; bch2_trans_begin(trans); @@ -688,6 +689,10 @@ failed_to_evacuate: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_iter_exit(trans, &iter); + + if (++nr_bps > 10) + break; + bp_offset++; } bch2_print_string_as_lines(KERN_ERR, buf.buf); -- cgit From 76c70c57f093d26fcb5a1aac75db12a5caa5614d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 22 Mar 2023 09:17:26 -0400 Subject: bcachefs: remove unused bch2_trans_log_msg() Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 1 - fs/bcachefs/btree_update_leaf.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 46fb4a9ed295..63ff824a72da 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -94,7 +94,6 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_trans_log_msg(struct btree_trans *, const char *, ...); int bch2_fs_log_msg(struct bch_fs *, const char *, ...); /** diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f608e1c92285..ba3191016575 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1919,18 +1919,6 @@ err: return ret; } -int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args); - va_end(args); - - return ret; -} - int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) { va_list args; -- cgit From 8bff9875a695ce9c6635693ff45fb3196688c1c6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 23 Mar 2023 14:09:05 -0400 Subject: bcachefs: use dedicated workqueue for tasks holding write refs A workqueue resource deadlock has been observed when running fsck on a filesystem with a full/stuck journal. fsck is not currently able to repair the fs due to fairly rapid emergency shutdown, but rather than exit gracefully the fsck process hangs during the shutdown sequence. Fortunately this is easily recoverable from userspace, but the root cause involves code shared between the kernel and userspace and so should be addressed. The deadlock scenario involves the main task in the bch2_fs_stop() -> bch2_fs_read_only() path waiting on write references to drain with the fs state lock held. A bch2_read_only_work() workqueue task is scheduled on the system_long_wq, blocked on the state lock. Finally, various other write ref holding workqueue tasks are scheduled to run on the same workqueue and must complete in order to release references that the initial task is waiting on. To avoid this problem, we can split the dependent workqueue tasks across different workqueues. It's a bit of a waste to create a dedicated wq for the read-only worker, but there are several tasks throughout the fs that follow the pattern of acquiring a write reference and then scheduling to the system wq. 
Use a local wq for such tasks to break the subtle dependency between these and the read-only worker. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/bcachefs.h | 6 ++++++ fs/bcachefs/ec.c | 2 +- fs/bcachefs/subvolume.c | 4 ++-- fs/bcachefs/super.c | 4 ++++ 5 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 17bcebbd1f2a..23de3ecc6a1e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1760,7 +1760,7 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(system_long_wq, &c->discard_work)) + !queue_work(c->write_ref_wq, &c->discard_work)) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -1886,7 +1886,7 @@ err: void bch2_do_invalidates(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(system_long_wq, &c->invalidate_work)) + !queue_work(c->write_ref_wq, &c->invalidate_work)) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c1f27b4910a0..fcbbc88d77c2 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -808,6 +808,12 @@ struct bch_fs { struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; + /* + * Use a dedicated wq for write ref holder tasks. Required to avoid + * dependency problems with other wq tasks that can block on ref + * draining, such as read-only transition. + */ + struct workqueue_struct *write_ref_wq; /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1e621dcc1d37..a444f6d513e5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -826,7 +826,7 @@ static void ec_stripe_delete_work(struct work_struct *work) void bch2_do_stripe_deletes(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && - !schedule_work(&c->ec_stripe_delete_work)) + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 43d83705a7ae..6407d19edc0e 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -714,7 +714,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(system_long_wq, &c->snapshot_delete_work)) + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -926,7 +926,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; - if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d6f2f453c027..a209de24064c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -493,6 +493,8 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->journal_seq_blacklist_table); 
kfree(c->unused_inode_hints); + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); if (c->io_complete_wq) destroy_workqueue(c->io_complete_wq); if (c->copygc_wq) @@ -787,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -- cgit From 873555f04d81b49a96ea03b37dcd499c13e67742 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 17 Mar 2023 08:54:01 -0400 Subject: bcachefs: more aggressive fast path write buffer key flushing The btree write buffer flush code is prone to causing journal deadlock due to inefficient use and release of reservation space. Reservation is not pre-reserved for write buffered keys (as is done for key cache keys, for example), because the write buffer flush side uses a fast path that attempts insertion without need for any reservation at all. The write buffer flush attempts to deal with this by inserting keys using the BTREE_INSERT_JOURNAL_RECLAIM flag to return an error on journal reservations that require blocking. Upon first error, it falls back to a slow path that inserts in journal order and supports moving the associated journal pin forward. The problem is that under pathological conditions (i.e. smaller log, larger write buffer and journal reservation pressure), we've seen instances where the fast path fails fairly quickly without having completed many insertions, and then the slow path is unable to push the journal pin forward enough to free up the space it needs to completely flush the buffer. This problem is occasionally reproduced by fstest generic/333. To avoid this problem, update the fast path algorithm to skip key inserts that fail due to inability to acquire needed journal reservation without immediately breaking out of the loop. Instead, insert as many keys as possible, zap the sequence numbers to mark them as processed, and then fall back to the slow path to process the remaining set in journal order. This reduces the amount of journal reservation that might be required to flush the entire buffer and increases the odds that the slow path is able to move the journal pin forward and free up space as keys are processed. 
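In outline, the flush now works like this (a sketch with placeholder insert helpers, not the literal patch):

        /* fast path: keys sorted by btree position, never block on the journal */
        for (i = keys; i < keys + nr; i++) {
                ret = insert_nonblocking(trans, i);     /* BTREE_INSERT_JOURNAL_RECLAIM */
                if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
                        slowpath++;
                        continue;               /* journal_seq left set for the slow path */
                }
                if (ret)
                        break;
                i->journal_seq = 0;             /* flushed - slow path skips it */
        }

        if (!slowpath)
                return ret;

        /* slow path: whatever remains, in journal order, moving the pin forward */
        sort(keys, nr, sizeof(keys[0]),
             btree_write_buffered_journal_cmp, NULL);

        for (i = keys; i < keys + nr; i++) {
                if (!i->journal_seq)
                        continue;
                insert_blocking(trans, &pin, i);        /* may move the journal pin */
        }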
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 43 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 80f4b9839bc2..9983a47853b9 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -109,9 +109,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *dst, *keys; + struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; union btree_write_buffer_state s; int ret = 0; @@ -135,15 +135,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f * * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is - * flushed - which means this could deadlock the journal, if we weren't - * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail + * flushed - which means this could deadlock the journal if we weren't + * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail * if it would block taking a journal reservation. * - * If that happens, we sort them by the order they appeared in the - * journal - after dropping redundant entries - and then restart - * flushing, this time dropping journal pins as we go. + * If that happens, simply skip the key so we can optimistically insert + * as many keys as possible in the fast path. */ - sort(keys, nr, sizeof(keys[0]), btree_write_buffered_key_cmp, NULL); @@ -152,6 +150,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f i[0].btree == i[1].btree && bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; + i->journal_seq = 0; continue; } @@ -177,8 +176,14 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + slowpath++; + continue; + } if (ret) break; + + i->journal_seq = 0; } if (write_locked) @@ -187,7 +192,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + if (slowpath) goto slowpath; bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); @@ -198,23 +203,19 @@ out: slowpath: trace_write_buffer_flush_slowpath(trans, i - keys, nr); - dst = keys; - for (; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) - continue; - - *dst = *i; - dst++; - } - nr = dst - keys; - + /* + * Now sort the rest by journal seq and bump the journal pin as we go. + * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. 
+ */ sort(keys, nr, sizeof(keys[0]), btree_write_buffered_journal_cmp, NULL); for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + if (i->journal_seq > pin.seq) { struct journal_entry_pin pin2; -- cgit From 23fd4f4dc622c36124515401d223607baec01a0d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 20 Mar 2023 13:21:19 -0400 Subject: bcachefs: gracefully unwind journal res slowpath on shutdown bcachefs detects journal stuck conditions in a couple different places. If the logic in the journal reservation slow path happens to detect the problem, I've seen instances where the filesystem remains deadlocked even though it has been shut down. This is occasionally reproduced by generic/333, and usually manifests as one or more tasks stuck in the journal reservation slow path. To help avoid this problem, repeat the journal error check in __journal_res_get() once under spinlock to cover the case where the previous lock holder might have triggered shutdown. This also helps avoid spurious/duplicate stuck reports. Also, wake the journal from the halt code to make sure blocked callers of the journal res slowpath have a chance to wake up and observe the pending error. This survives an overnight looping run of generic/333 without the aforementioned lockups. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 801f09593e6b..43bb1d4002bd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -162,6 +162,7 @@ void bch2_journal_halt(struct journal *j) __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); if (!j->err_seq) j->err_seq = journal_cur_seq(j); + journal_wake(j); spin_unlock(&j->lock); } @@ -362,6 +363,12 @@ retry: spin_lock(&j->lock); + /* check once more in case somebody else shut things down... */ + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return -BCH_ERR_erofs_journal_err; + } + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() -- cgit From db1bf7290591115e4d37428ff822c20a1694a69b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 21 Mar 2023 08:03:18 -0400 Subject: bcachefs: refactor journal stuck checking into standalone helper bcachefs checks for journal stuck conditions both in the journal space calculation code and the journal reservation slow path. The logic in both places is rather tricky and can result in non-deterministic failure characteristics and debug output. In preparation to condense journal stuck handling to a single place, refactor the __journal_res_get() logic into a standalone helper. Since multiple callers into the reservation code can result in duplicate reports, use the ->err_seq field as a serialization mechanism for the debug dump. Finally, add some comments to help explain the logic and hopefully facilitate further improvements in the future. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 85 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 43bb1d4002bd..433c97844f36 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -75,6 +75,67 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) p->devs.nr = 0; } +/* + * Detect stuck journal conditions and trigger shutdown. 
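Both halves of the fix close the same race: a reservation can observe "no error" before taking the lock, lose a race with a task that halts the journal, and then sleep forever. Roughly:

        spin_lock(&j->lock);

        /*
         * Re-check under the lock - the previous lock holder may have just
         * triggered shutdown, and nobody will poke us again if we miss it:
         */
        if (bch2_journal_error(j)) {
                spin_unlock(&j->lock);
                return -BCH_ERR_erofs_journal_err;
        }

and correspondingly, bch2_journal_halt() now calls journal_wake() so that anyone already blocked re-evaluates the error.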
Technically the journal + * can end up stuck for a variety of reasons, such as a blocked I/O, journal + * reservation lockup, etc. Since this is a fatal error with potentially + * unpredictable characteristics, we want to be fairly conservative before we + * decide to shut things down. + * + * Consider the journal stuck when it appears full with no ability to commit + * btree transactions, to discard journal buckets, nor acquire priority + * (reserved watermark) reservation. + */ +static inline bool +journal_error_check_stuck(struct journal *j, int error, unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; + struct printbuf buf = PRINTBUF; + + if (!(error == JOURNAL_ERR_journal_full || + error == JOURNAL_ERR_journal_pin_full) || + nr_unwritten_journal_entries(j) || + (flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved) + return stuck; + + spin_lock(&j->lock); + + if (j->can_discard) { + spin_unlock(&j->lock); + return stuck; + } + + stuck = true; + + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) { + spin_unlock(&j->lock); + return stuck; + } + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", + bch2_journal_errors[error]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); + + return stuck; +} + /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -416,28 +477,8 @@ unlock: if (!ret) goto retry; - - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && - !can_discard && - !nr_unwritten_journal_entries(j) && - (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { - struct printbuf buf = PRINTBUF; - - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", - bch2_journal_errors[ret]); - - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - - printbuf_exit(&buf); - bch2_fatal_error(c); - dump_stack(); - } + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; /* * Journal is full - can't rely on reclaim from work item due to -- cgit From 030e9f9264a9d6bbbdb29ed20429bf943ed34315 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 21 Mar 2023 08:09:16 -0400 Subject: bcachefs: drop unnecessary journal stuck check from space calculation The journal stucking check in bch2_journal_space_available() is particularly aggressive and can lead to premature shutdown in some rare cases. This is difficult to reproduce, but also comes along with a fatal error and so is worthwhile to be cautious. For example, we've seen instances where the journal is under heavy reservation pressure, the journal allocation path transitions into the final available journal bucket, the journal write path immediately consumes that bucket and calls into bch2_journal_space_available(), which then in turn flags the journal as stuck because there is no available space and shuts down the filesystem instead of submitting the journal write (that would have otherwise succeeded). 
To avoid this problem, simplify the journal stuck checking by just relying on the higher level logic in the journal reservation path. This produces more useful debug output and is a more reliable indicator that things have bogged down. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a484a62f9cbd..29d843e6d6d4 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -210,24 +210,7 @@ void bch2_journal_space_available(struct journal *j) clean = j->space[journal_space_clean].total; total = j->space[journal_space_total].total; - if (!clean_ondisk && - journal_cur_seq(j) == j->seq_ondisk) { - struct printbuf buf = PRINTBUF; - - __bch2_journal_debug_to_text(&buf, j); - bch_err(c, "journal stuck\n%s", buf.buf); - printbuf_exit(&buf); - - /* - * Hack: bch2_fatal_error() calls bch2_journal_halt() which - * takes journal lock: - */ - spin_unlock(&j->lock); - bch2_fatal_error(c); - spin_lock(&j->lock); - - ret = JOURNAL_ERR_journal_stuck; - } else if (!j->space[journal_space_discarded].next_entry) + if (!j->space[journal_space_discarded].next_entry) ret = JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < -- cgit From 11f117374a2a353c378f8eccff8904d209643695 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Mar 2023 12:18:10 -0400 Subject: bcachefs: Call bch2_path_put_nokeep() before bch2_path_put() bch2_path_put_nokeep() is sketchy, and we should consider removing it: it unconditionally frees btree_paths once their ref hits 0. The assumption is that we only use it for paths that have never been visible outside the btree core btree code; i.e. higher level code will never be making assumptions about locking based on these paths. However, there's subtle brokenness with this approach: - If we call bch2_path_put(), then bch2_path_put_nokeep(), bch2_path_put() may free the first path on the assumption that we we have another path keeping a node locked - but then bch2_path_put_nokeep() just unconditionally frees it. The same bug may arise if we're calling bch2_path_put() and bch2_path_put_nokeep() on the same (refcounted) path, or two adjacent paths that point to the same btree node. This patch hacks around one of these bugs by calling bch2_path_put_nokeep() first in bch2_trans_iter_exit. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 7b3e7f9368d1..0a62f55a3aa8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2722,12 +2722,12 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { - if (iter->path) - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); -- cgit From 0fb11e0801962007adef5b628fec0f334e5358f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 09:59:17 -0400 Subject: bcachefs: Improved copygc wait debugging This just adds a line for how long copygc has been waiting to sysfs copygc_wait, helpful for debugging why copygc isn't running. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/movinggc.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index fcbbc88d77c2..db8c0e4fc5a3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -943,6 +943,7 @@ struct bch_fs { /* COPYGC */ struct task_struct *copygc_thread; struct write_point copygc_write_point; + s64 copygc_wait_at; s64 copygc_wait; bool copygc_running; wait_queue_head_t copygc_running_wq; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4762594f6287..e91067b428cd 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -269,6 +269,12 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) atomic64_read(&c->io_clock[WRITE].now)) << 9); prt_newline(out); + prt_printf(out, "Currently waiting since: "); + prt_human_readable_u64(out, max(0LL, + atomic64_read(&c->io_clock[WRITE].now) - + c->copygc_wait_at) << 9); + prt_newline(out); + prt_printf(out, "Currently calculated wait: "); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); @@ -317,9 +323,11 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + c->copygc_wait_at = last; + c->copygc_wait = last + wait; + move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); trace_and_count(c, copygc_wait, c, wait, last + wait); - c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; -- cgit From b1c945b3fd926ea121172eedc271f4b816b44387 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Mar 2023 20:48:37 -0400 Subject: bcachefs: Run freespace init in device hot add path Like in the recovery, and device add, we have to check if devices don't have the freespace btree initialized - this was missed in the device hot add path. 
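Spelled out with the two paths from bch2_trans_iter_exit(), the hazard looks like this (simplified; intent stands for iter->flags & BTREE_ITER_INTENT):

        /* before: */
        bch2_path_put(trans, path, intent);             /* may free 'path', trusting that
                                                         * 'update_path' keeps the node locked */
        bch2_path_put_nokeep(trans, update_path, intent); /* frees unconditionally - oops */

        /* after: drop the nokeep path first, so bch2_path_put() never leans on it */
        bch2_path_put_nokeep(trans, update_path, intent);
        bch2_path_put(trans, path, intent);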
Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a209de24064c..265ffa9bfd4c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1743,6 +1743,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); + ret = bch2_fs_freespace_init(c); + if (ret) + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + up_write(&c->state_lock); return 0; err: -- cgit From e9b9e475eaef1c50e89072e4efc3910ff25a0552 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Mar 2023 21:22:51 -0400 Subject: bcachefs: bch2_dev_freespace_init() Print out status every 10 seconds It appears freespace init can still take awhile, and we've had a report or two of it getting stuck - let's have it print out where it's at every 10 seconds. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 23de3ecc6a1e..aea6caa791ea 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1890,7 +1890,8 @@ void bch2_do_invalidates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + unsigned long *last_updated) { struct btree_trans trans; struct btree_iter iter; @@ -1910,6 +1911,12 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { + if (*last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + *last_updated = jiffies; + } + bch2_trans_begin(&trans); if (bkey_ge(iter.pos, end)) { @@ -1989,6 +1996,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; + unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -2004,7 +2012,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca); + ret = bch2_dev_freespace_init(c, ca, &last_updated); if (ret) { percpu_ref_put(&ca->ref); return ret; -- cgit From 2a6c302fb3a00bd74dfc3882698d290482bc5382 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Mar 2023 19:15:53 -0400 Subject: bcachefs: Check return code from need_whiteout_for_snapshot() This could return a transaction restart; we need to check for that. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ba3191016575..da9840edc023 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1426,10 +1426,15 @@ int bch2_trans_update_extent(struct btree_trans *trans, update->k.p = k.k->p; update->k.p.snapshot = insert->k.p.snapshot; - if (insert->k.p.snapshot != k.k->p.snapshot || - (btree_type_has_snapshots(btree_id) && - need_whiteout_for_snapshot(trans, btree_id, update->k.p))) + if (insert->k.p.snapshot != k.k->p.snapshot) { update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + goto err; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } ret = bch2_btree_insert_nonextent(trans, btree_id, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -- cgit From 1546cf972709d9b10c39fdb3bff9da3f22a57226 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Mar 2023 19:37:25 -0400 Subject: bcachefs: Fix bch2_get_key_or_hole() This fixes an off by one error, due to confusing closed vs. half open intervals. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index aea6caa791ea..e5cbb4bce1ee 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -962,10 +962,17 @@ struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, s struct bpos next; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, - bkey_min(bkey_min(end, - iter->path->l[0].b->key.k.p), - POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1))); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); -- cgit From 4f77dcde2841e43e4c5fe9f178a64ec40be72e17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Mar 2023 11:01:12 -0400 Subject: bcachefs: move snapshot_t to subvolume_types.h this doesn't need to be in bcachefs.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 12 ------------ fs/bcachefs/subvolume_types.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index db8c0e4fc5a3..e5834729b52a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -629,18 +629,6 @@ struct btree_path_buf { #define REPLICAS_DELTA_LIST_MAX (1U << 16) -struct snapshot_t { - u32 parent; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 equiv; -}; - -typedef struct { - u32 subvol; - u64 inum; -} subvol_inum; - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index f7562b5d51df..aa49c45a35ab 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -6,4 +6,16 @@ typedef 
DARRAY(u32) snapshot_id_list; +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -- cgit From 6bdefe9c3900b3bb7a028486423520cdc975d9a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Mar 2023 13:10:36 -0400 Subject: bcachefs: Use BTREE_ITER_INTENT in ec_stripe_update_extent() This adds a flags param to bch2_backpointer_get_key() so that we can pass BTREE_ITER_INTENT, since ec_stripe_update_extent() is updating the extent immediately. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 7 ++++--- fs/bcachefs/backpointers.h | 3 ++- fs/bcachefs/ec.c | 3 ++- fs/bcachefs/move.c | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 740084b3ff12..9121fae0fd6c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -420,7 +420,8 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, struct bpos bucket, u64 bp_offset, - struct bch_backpointer bp) + struct bch_backpointer bp, + unsigned iter_flags) { struct bch_fs *c = trans->c; struct bkey_s_c k; @@ -430,7 +431,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, bp.pos, 0, min(bp.level, c->btree_roots[bp.btree_id].level), - 0); + iter_flags); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); @@ -976,7 +977,7 @@ static int check_one_backpointer(struct btree_trans *trans, bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp, 0); ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index d0ba5d8596c5..ce75b8f50241 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -128,7 +128,8 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, u64 *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, u64, struct bch_backpointer, + unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, struct bpos, u64, struct bch_backpointer); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a444f6d513e5..d295e5401c7a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -925,7 +925,8 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp, + BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9717fdce3ba5..be14d3737027 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -678,7 +678,7 @@ failed_to_evacuate: break; k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp); + bucket, bp_offset, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -767,7 +767,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, unsigned i = 0; k = bch2_backpointer_get_key(trans, &iter, 
- bucket, bp_offset, bp); + bucket, bp_offset, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; -- cgit From 32de2ea0d5b7e2bc2a4eeac47e38aceb0ff25cc9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Mar 2023 14:44:41 -0500 Subject: bcachefs: Rhashtable based buckets_in_flight for copygc Previously, copygc used a fifo for tracking buckets in flight - this had the disadvantage of being fixed size, since we pass references to elements into the move code. This restructures it to be a hash table and linked list, since with erasure coding we need to be able to pipeline across an arbitrary number of buckets. Signed-off-by: Kent Overstreet --- fs/bcachefs/move_types.h | 13 ++- fs/bcachefs/movinggc.c | 212 ++++++++++++++++++++++++++++------------------- 2 files changed, 139 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 285ffdb762ac..baf1f8570b3f 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -16,9 +16,20 @@ struct bch_move_stats { atomic64_t sectors_raced; }; -struct move_bucket_in_flight { +struct move_bucket_key { struct bpos bucket; u8 gen; +}; + +struct move_bucket { + struct move_bucket_key k; + unsigned sectors; +}; + +struct move_bucket_in_flight { + struct move_bucket_in_flight *next; + struct rhash_head hash; + struct move_bucket bucket; atomic_t count; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e91067b428cd..2d75334c541d 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -34,8 +34,51 @@ #include #include +struct buckets_in_flight { + struct rhashtable table; + struct move_bucket_in_flight *first; + struct move_bucket_in_flight *last; + size_t nr; + size_t sectors; +}; + +static const struct rhashtable_params bch_move_bucket_params = { + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), +}; + +static struct move_bucket_in_flight * +move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +{ + struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); + int ret; + + if (!new) + return ERR_PTR(-ENOMEM); + + new->bucket = b; + + ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, + bch_move_bucket_params); + if (ret) { + kfree(new); + return ERR_PTR(ret); + } + + if (!list->first) + list->first = new; + else + list->last->next = new; + + list->last = new; + list->nr++; + list->sectors += b.sectors; + return new; +} + static int bch2_bucket_is_movable(struct btree_trans *trans, - struct bpos bucket, u64 time, u8 *gen) + struct move_bucket *b, u64 time) { struct btree_iter iter; struct bkey_s_c k; @@ -43,10 +86,13 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, const struct bch_alloc_v4 *a; int ret; - if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) + if (bch2_bucket_is_open(trans->c, + b->k.bucket.inode, + b->k.bucket.offset)) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); bch2_trans_iter_exit(trans, &iter); @@ -55,12 +101,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; a = bch2_alloc_to_v4(k, &_a); - *gen = a->gen; + b->k.gen = a->gen; + b->sectors = a->dirty_sectors; + ret = 
data_type_movable(a->data_type) && a->fragmentation_lru && a->fragmentation_lru <= time; - if (ret) { + if (!ret) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, trans->c, k); @@ -71,41 +119,16 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } -typedef FIFO(struct move_bucket_in_flight) move_buckets_in_flight; - -struct move_bucket { - struct bpos bucket; - u8 gen; -}; - -typedef DARRAY(struct move_bucket) move_buckets; - -static int move_bucket_cmp(const void *_l, const void *_r) -{ - const struct move_bucket *l = _l; - const struct move_bucket *r = _r; - - return bkey_cmp(l->bucket, r->bucket); -} - -static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b) -{ - return bsearch(&b, - buckets_sorted->data, - buckets_sorted->nr, - sizeof(buckets_sorted->data[0]), - move_bucket_cmp) != NULL; -} - static void move_buckets_wait(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight, - size_t nr, bool verify_evacuated) + struct buckets_in_flight *list, + bool flush) { - while (!fifo_empty(buckets_in_flight)) { - struct move_bucket_in_flight *i = &fifo_peek_front(buckets_in_flight); + struct move_bucket_in_flight *i; + int ret; - if (fifo_used(buckets_in_flight) > nr) + while ((i = list->first)) { + if (flush) move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); if (atomic_read(&i->count)) @@ -116,66 +139,82 @@ static void move_buckets_wait(struct btree_trans *trans, * reads, which inits another btree_trans; this one must be * unlocked: */ - if (verify_evacuated) - bch2_verify_bucket_evacuated(trans, i->bucket, i->gen); - buckets_in_flight->front++; + bch2_verify_bucket_evacuated(trans, i->bucket.k.bucket, i->bucket.k.gen); + + list->first = i->next; + if (!list->first) + list->last = NULL; + + list->nr--; + list->sectors -= i->bucket.sectors; + + ret = rhashtable_remove_fast(&list->table, &i->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(i); } bch2_trans_unlock(trans); } +static bool bucket_in_flight(struct buckets_in_flight *list, + struct move_bucket_key k) +{ + return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); +} + +typedef DARRAY(struct move_bucket) move_buckets; + static int bch2_copygc_get_buckets(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight, + struct buckets_in_flight *buckets_in_flight, move_buckets *buckets) { + struct bch_fs *c = trans->c; struct btree_iter iter; - move_buckets buckets_sorted = { 0 }; - struct move_bucket_in_flight *i; struct bkey_s_c k; - size_t fifo_iter, nr_to_get; + size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; - move_buckets_wait(trans, ctxt, buckets_in_flight, buckets_in_flight->size / 2, true); + move_buckets_wait(trans, ctxt, buckets_in_flight, false); - nr_to_get = max(16UL, fifo_used(buckets_in_flight) / 4); - - fifo_for_each_entry_ptr(i, buckets_in_flight, fifo_iter) { - ret = darray_push(&buckets_sorted, ((struct move_bucket) {i->bucket, i->gen})); - if (ret) { - bch_err(trans->c, "error allocating move_buckets_sorted"); - goto err; - } - } - - sort(buckets_sorted.data, - buckets_sorted.nr, - sizeof(buckets_sorted.data[0]), - move_bucket_cmp, - NULL); + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) + return ret; ret = for_each_btree_key2_upto(trans, iter, 
BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ - struct move_bucket b = { .bucket = u64_to_bucket(k.k->p.offset) }; + struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret = 0; - if (!bucket_in_flight(&buckets_sorted, b) && - bch2_bucket_is_movable(trans, b.bucket, lru_pos_time(k.k->p), &b.gen)) - ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + saw++; + if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + not_movable++; + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { + ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret >= 0) + sectors += b.sectors; + } ret; })); -err: - darray_exit(&buckets_sorted); + + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, + saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); return ret < 0 ? ret : 0; } static int bch2_copygc(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight) + struct buckets_in_flight *buckets_in_flight) { struct bch_fs *c = trans->c; struct data_update_opts data_opts = { @@ -187,11 +226,6 @@ static int bch2_copygc(struct btree_trans *trans, u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", - __func__, bch2_err_str(ret))) - return ret; - ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); if (ret) goto err; @@ -200,12 +234,17 @@ static int bch2_copygc(struct btree_trans *trans, if (unlikely(freezing(current))) break; - f = fifo_push_ref(buckets_in_flight); - f->bucket = i->bucket; - f->gen = i->gen; - atomic_set(&f->count, 0); + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); + if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + continue; + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; + } - ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket, f->gen, data_opts); + ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); if (ret) goto err; } @@ -287,13 +326,17 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - move_buckets_in_flight move_buckets; + struct buckets_in_flight move_buckets; u64 last, wait; int ret = 0; - if (!init_fifo(&move_buckets, 1 << 14, GFP_KERNEL)) { - bch_err(c, "error allocating copygc buckets in flight"); - return -ENOMEM; + memset(&move_buckets, 0, sizeof(move_buckets)); + + ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + if (ret) { + bch_err(c, "error allocating copygc buckets in flight: %s", + bch2_err_str(ret)); + return ret; } set_freezable(); @@ -309,12 +352,12 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + move_buckets_wait(&trans, &ctxt, &move_buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + move_buckets_wait(&trans, &ctxt, &move_buckets, true); __refrigerator(false); continue; } @@ -325,8 +368,7 @@ static int 
bch2_copygc_thread(void *arg)
 	if (wait > clock->max_slop) {
 		c->copygc_wait_at = last;
 		c->copygc_wait = last + wait;
-
-		move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true);
+		move_buckets_wait(&trans, &ctxt, &move_buckets, true);
 		trace_and_count(c, copygc_wait, c, wait, last + wait);
 		bch2_kthread_io_clock_wait(clock, last + wait,
 					MAX_SCHEDULE_TIMEOUT);
@@ -342,9 +384,9 @@ static int bch2_copygc_thread(void *arg)
 		wake_up(&c->copygc_running_wq);
 	}
+	move_buckets_wait(&trans, &ctxt, &move_buckets, true);
 	bch2_trans_exit(&trans);
 	bch2_moving_ctxt_exit(&ctxt);
-	free_fifo(&move_buckets);
 	return 0;
 }
-- 
cgit 

From 25d8f40560e75aea107b0e773f8712931173ded6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet 
Date: Wed, 29 Mar 2023 22:47:30 -0400
Subject: bcachefs: Data update path no longer leaves cached replicas

It turns out that it's currently impossible to invalidate buckets containing only
cached data if they're part of a stripe. The normal bucket invalidate path can't
do it because we have to be able to increment the bucket's gen, which isn't
correct because it's still a member of the stripe - and the bucket invalidate path
makes the bucket available for reuse right away, which also isn't correct for
buckets in stripes.

What would work is invalidating cached data by following backpointers, except that
cached replicas don't currently get backpointers - because they would be awkward
for the existing bucket invalidate path to delete and they haven't been needed
elsewhere.

So for the time being, to prevent running out of space in stripes, switch the data
update path to not leave cached replicas; we may revisit this in the future.

Signed-off-by: Kent Overstreet 
---
 fs/bcachefs/data_update.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 5ec884a222f8..865514dd2aa9 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -162,7 +162,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		if (((1U << i) & m->data_opts.rewrite_ptrs) &&
 		    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
 		    !ptr->cached) {
+			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+			/*
+			 * See comment below:
 			bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+			*/
 			rewrites_found |= 1U << i;
 		}
 		i++;
@@ -204,7 +208,14 @@ restart_drop_extra_replicas:
 		if (!p.ptr.cached &&
 		    durability - ptr_durability >= m->op.opts.data_replicas) {
 			durability -= ptr_durability;
+			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
+			/*
+			 * Currently, we're dropping unneeded replicas
+			 * instead of marking them as cached, since
+			 * cached data in stripe buckets prevents them
+			 * from being reused:
 			bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+			*/
 			goto restart_drop_extra_replicas;
 		}
 	}
-- 
cgit 

From 3d86f13df67b554a7b27e28a4b144425710409bf Mon Sep 17 00:00:00 2001
From: Kent Overstreet 
Date: Thu, 30 Mar 2023 16:04:02 -0400
Subject: bcachefs: Improve trans_restart_split_race tracepoint

Seeing occasional test failures where we get stuck in a livelock that involves
this event - this will help track it down.
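For context, the event fires through bcachefs's trace_and_count() helper, which
also bumps a persistent event counter. A rough sketch of that pattern (not the
exact macro, which is defined outside this patch) shows why the livelock is
visible in the counters even when tracing is disabled:

#define trace_and_count(_c, _event_name, ...)				\
do {									\
	/* bump the always-on persistent counter for this event */	\
	this_cpu_inc((_c)->counters[BCH_COUNTER_##_event_name]);	\
	/* emit the tracepoint, if tracing is enabled */		\
	trace_##_event_name(__VA_ARGS__);				\
} while (0)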
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/trace.c | 2 ++ fs/bcachefs/trace.h | 31 ++++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a58d2a142b67..6ba0954e648e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1680,7 +1680,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (!(local_clock() & 63)) + if ((local_clock() & 63) == 63) return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ret = bch2_btree_node_lock_write(trans, path, &b->c); @@ -1720,7 +1720,7 @@ split: * bch2_btree_path_upgrade() and allocating more nodes: */ if (b->c.level >= as->update_level) { - trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_); + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index 5c1d724cbb55..33efa6005c6f 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -2,8 +2,10 @@ #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" +#include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update_interior.h" #include "keylist.h" #include "opts.h" #include "six.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7e48e7676980..65521c046254 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -831,10 +831,35 @@ DEFINE_EVENT(transaction_event, trans_restart_injected, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_split_race, +TRACE_EVENT(trans_restart_split_race, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + struct btree *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, level ) + __field(u16, written ) + __field(u16, blocks ) + __field(u16, u64s_remaining ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->level = b->c.level; + __entry->written = b->written; + __entry->blocks = btree_blocks(trans->c); + __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + ), + + TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->level, + __entry->written, __entry->blocks, + __entry->u64s_remaining) ); DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, -- cgit From 349b1d832ba534b802a28f316e40128c54643c32 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 22 Mar 2023 08:27:58 -0400 Subject: bcachefs: use reservation for log messages during recovery If we block on journal reservation attempting to log journal messages during recovery, particularly for the first message(s) before we start doing actual work, chances are the filesystem ends up deadlocked. Allow logged messages to use reserved journal space to mitigate this problem. In the worst case where no space is available whatsoever, this at least allows the fs to recognize that the journal is stuck and fail the mount gracefully. 
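A minimal usage sketch of the intended split between the two entry points (the
example function is illustrative only; the helper bodies are in the diff below):

/*
 * Sketch only: ordinary callers keep using bch2_fs_log_msg() and may wait
 * for normal journal space, while recovery-path callers use
 * bch2_journal_log_msg(), which commits with JOURNAL_WATERMARK_reserved and
 * so can proceed even when the journal is nearly full.
 */
static void example_recovery_logging(struct bch_fs *c)
{
	bch2_fs_log_msg(c, "routine message, may wait for journal space");
	bch2_journal_log_msg(c, "recovery message, may use reserved space");
}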
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 1 + fs/bcachefs/btree_update_leaf.c | 35 +++++++++++++++++++++++++++++------ fs/bcachefs/recovery.c | 16 ++++++++-------- 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 63ff824a72da..5d5321dd42e8 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -95,6 +95,7 @@ void bch2_trans_commit_hook(struct btree_trans *, int __bch2_trans_commit(struct btree_trans *, unsigned); int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index da9840edc023..02d264b858ab 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1924,22 +1924,45 @@ err: return ret; } -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) { - va_list args; int ret; - va_start(args, fmt); - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); } else { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); } - va_end(args); + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
+{ + va_list args; + int ret; + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args); + va_end(args); + return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6aa99f57a001..1b9a8329654e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -645,8 +645,8 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) journal_sort_seq_cmp, NULL); if (keys->nr) { - ret = bch2_fs_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", - keys->nr, start_seq, end_seq); + ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); if (ret) goto err; } @@ -680,7 +680,7 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) ret = bch2_journal_error(j); if (keys->nr && !ret) - bch2_fs_log_msg(c, "journal replay finished"); + bch2_journal_log_msg(c, "journal replay finished"); err: kvfree(keys_sorted); return ret; @@ -1244,8 +1244,8 @@ use_clean: journal_seq += 8; if (blacklist_seq != journal_seq) { - ret = bch2_fs_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: + ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", + blacklist_seq, journal_seq) ?: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { @@ -1254,14 +1254,14 @@ use_clean: } } - ret = bch2_fs_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: + ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; if (c->opts.reconstruct_alloc) - bch2_fs_log_msg(c, "dropping alloc info"); + bch2_journal_log_msg(c, "dropping alloc info"); /* * Skip past versions that might have possibly been used (as nonces), -- cgit From 62a03559d667c1b0552a692ac01c576358847318 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Mar 2023 16:24:45 -0400 Subject: bcachefs: Rip out code for storing backpointers in alloc keys We don't store backpointers in alloc keys anymore, since we gained the btree write buffer. This patch drops support for backpointers in alloc keys, and revs the on disk format version so that we know a fsck is required. 
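For context, a sketch of the iteration pattern the rest of this patch converts
callers to (the example function is hypothetical; the real callers are in ec.c
and move.c below): backpointers now live solely in BTREE_ID_backpointers, so
walking a bucket's backpointers is an iteration by btree position rather than
decoding entries embedded in the alloc key.

static int example_walk_backpointers(struct btree_trans *trans,
				     struct bpos bucket, int gen)
{
	struct bpos bp_pos = POS_MIN;
	struct bch_backpointer bp;
	int ret;

	while (true) {
		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp, 0);
		if (ret)
			return ret;
		if (bkey_eq(bp_pos, POS_MAX))
			return 0;	/* no more backpointers for this bucket */

		/* ... look up the extent or btree node this backpointer
		 * references via bch2_backpointer_get_key()/get_node() ... */

		bp_pos = bpos_nosnap_successor(bp_pos);
	}
}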
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 34 ++--- fs/bcachefs/alloc_foreground.c | 6 +- fs/bcachefs/backpointers.c | 329 ++++++++-------------------------------- fs/bcachefs/backpointers.h | 24 +-- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/btree_update.h | 1 + fs/bcachefs/btree_update_leaf.c | 14 ++ fs/bcachefs/buckets.c | 8 +- fs/bcachefs/ec.c | 19 ++- fs/bcachefs/move.c | 26 ++-- fs/bcachefs/recovery.c | 7 +- 11 files changed, 135 insertions(+), 336 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e5cbb4bce1ee..06032556d5c4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -451,6 +451,8 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); @@ -476,38 +478,26 @@ static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; + + ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4)); + if (IS_ERR(ret)) + return ret; + if (k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - unsigned bytes = sizeof(struct bkey_i_alloc_v4) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) * - sizeof(struct bch_backpointer); void *src, *dst; - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... - */ - ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; - bkey_reassemble(&ret->k_i, k); src = alloc_v4_backpointers(&ret->v); SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); - memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * - sizeof(struct bch_backpointer)); if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { - ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) + - sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; - bkey_alloc_v4_init(&ret->k_i); ret->k.p = k.k->p; bch2_alloc_to_v4(k, &ret->v); @@ -517,8 +507,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { + struct bkey_s_c_alloc_v4 a; + if (likely(k.k->type == KEY_TYPE_alloc_v4) && - BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_BACKPOINTERS_START(a.v) == BCH_ALLOC_V4_U64s && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) { /* * Reserve space for one more backpointer here: * Not sketchy at doing it this way, nope... 
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 20c64882104e..368355de5f26 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -345,17 +345,17 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { struct bch_backpointer bp; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_NOPRESERVE); if (ret) { ob = ERR_PTR(ret); goto err; } - if (bp_offset != U64_MAX) { + if (!bkey_eq(bp_pos, POS_MAX)) { /* * Bucket may have data in it - we don't call * bc2h_trans_inconnsistent() because fsck hasn't diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 9121fae0fd6c..a3a1ed6e5968 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -69,6 +69,10 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } @@ -81,117 +85,6 @@ void bch2_backpointer_swab(struct bkey_s k) bch2_bpos_swab(&bp.v->pos); } -#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) - -static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) -{ - return cmp_int(l.bucket_offset, r.bucket_offset); -} - -static int bch2_backpointer_del_by_offset(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, - struct bch_backpointer bp) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - if (bp_offset < BACKPOINTER_OFFSET_MAX) { - struct bch_backpointer *bps; - struct bkey_i_alloc_v4 *a; - unsigned i, nr; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_alloc_v4) { - ret = -ENOENT; - goto err; - } - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; - bps = alloc_v4_backpointers(&a->v); - nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - - for (i = 0; i < nr; i++) { - if (bps[i].bucket_offset == bp_offset) - goto found; - if (bps[i].bucket_offset > bp_offset) - break; - } - - ret = -ENOENT; - goto err; -found: - if (memcmp(&bps[i], &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } else { - bp_offset -= BACKPOINTER_OFFSET_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp_offset), - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -bool bch2_bucket_backpointer_del(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, - struct bch_backpointer bp) -{ - struct bch_backpointer *bps = 
alloc_v4_backpointers(&a->v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - - for (i = 0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) { - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - return true; - } - if (cmp >= 0) - break; - } - - return false; -} - static noinline int backpointer_mod_err(struct btree_trans *trans, struct bch_backpointer bp, struct bkey_s_c bp_k, @@ -245,7 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) @@ -262,7 +155,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -271,7 +164,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, } bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), + bp_k->k.p, BTREE_ITER_INTENT| BTREE_ITER_SLOTS| BTREE_ITER_WITH_UPDATES); @@ -298,94 +191,62 @@ err: /* * Find the next backpointer >= *bp_offset: */ -int __bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - u64 *bp_offset, - struct bpos *bp_pos_ret, - struct bch_backpointer *dst, - unsigned iter_flags) +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + struct bpos *bp_pos, + struct bch_backpointer *bp, + unsigned iter_flags) { struct bch_fs *c = trans->c; - struct bpos bp_pos, bp_end_pos; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; - struct bkey_s_c_alloc_v4 a; - size_t i; - int ret; - - if (*bp_offset == U64_MAX) - return 0; - - bp_pos = bucket_pos_to_bp(c, bucket, - max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); - bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); - - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4) - goto done; + int ret = 0; - a = bkey_s_c_to_alloc_v4(k); - if (gen >= 0 && a.v->gen != gen) + if (bpos_ge(*bp_pos, bp_end_pos)) goto done; - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { - if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) - continue; + if (gen >= 0) { + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto out; - *dst = alloc_v4_backpointers_c(a.v)[i]; - *bp_offset = dst->bucket_offset; - goto out; + if (k.k->type != KEY_TYPE_alloc_v4 || + bkey_s_c_to_alloc_v4(k).v->gen != gen) + goto done; } + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - bp_pos, 0, k, ret) { + *bp_pos, iter_flags, k, ret) { if (bpos_ge(k.k->p, bp_end_pos)) break; - if (k.k->type != KEY_TYPE_backpointer) - continue; - - *dst = *bkey_s_c_to_backpointer(k).v; 
- *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; - *bp_pos_ret = k.k->p; + *bp_pos = k.k->p; + *bp = *bkey_s_c_to_backpointer(k).v; goto out; } done: - *bp_offset = U64_MAX; + *bp_pos = SPOS_MAX; out: bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &alloc_iter); return ret; } -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - u64 *bp_offset, - struct bch_backpointer *dst, - unsigned iter_flags) -{ - struct bpos bp_pos; - - return __bch2_get_next_backpointer(trans, bucket, gen, - bp_offset, &bp_pos, - dst, iter_flags); -} - static void backpointer_not_found(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp, struct bkey_s_c k, const char *thing_it_points_to) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); if (likely(!bch2_backpointers_no_use_write_buffer)) return; @@ -396,14 +257,9 @@ static void backpointer_not_found(struct btree_trans *trans, bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); - if (bp_offset >= BACKPOINTER_OFFSET_MAX) { - struct bpos bp_pos = - bucket_pos_to_bp(c, bucket, - bp_offset - BACKPOINTER_OFFSET_MAX); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); - prt_printf(&buf, "\n "); - } + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); bch2_backpointer_to_text(&buf, &bp); prt_printf(&buf, "\n "); @@ -418,12 +274,12 @@ static void backpointer_not_found(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp, unsigned iter_flags) { struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct bkey_s_c k; bch2_trans_node_iter_init(trans, iter, @@ -456,7 +312,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, * been written out yet - backpointer_get_node() checks for * this: */ - b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); if (!IS_ERR_OR_NULL(b)) return bkey_i_to_s_c(&b->key); @@ -467,7 +323,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return bkey_s_c_null; } - backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); + backpointer_not_found(trans, bp_pos, bp, k, "extent"); } return bkey_s_c_null; @@ -475,11 +331,11 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp) { struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct btree *b; BUG_ON(!bp.level); @@ -502,7 +358,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (b && btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bucket, bp_offset, bp, + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key), "btree node"); b = NULL; } @@ -571,7 +427,7 @@ struct bpos_level { }; static int check_bp_exists(struct btree_trans *trans, - struct bpos bucket_pos, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, @@ -579,40 +435,20 @@ static int 
check_bp_exists(struct btree_trans *trans, struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; - struct bkey_s_c alloc_k, bp_k; + struct bkey_s_c bp_k; int ret; - if (bpos_lt(bucket_pos, bucket_start) || - bpos_gt(bucket_pos, bucket_end)) + if (bpos_lt(bucket, bucket_start) || + bpos_gt(bucket, bucket_end)) return 0; - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - - if (alloc_k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); - - for (i = 0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) - goto out; - if (cmp >= 0) - break; - } - } else { + if (!bch2_dev_bucket_exists(c, bucket)) goto missing; - } bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), + bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); bp_k = bch2_btree_iter_peek_slot(&bp_iter); ret = bkey_err(bp_k); @@ -636,11 +472,9 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; missing: - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_ids[bp.btree_id], bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); @@ -649,12 +483,8 @@ missing: if (c->sb.version < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || - fsck_err(c, "%s", buf.buf)) { - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); - - ret = PTR_ERR_OR_ZERO(a) ?: - bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true); - } + fsck_err(c, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; } @@ -953,53 +783,40 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) } static int check_one_backpointer(struct btree_trans *trans, - struct bpos bucket, - u64 *bp_offset, struct bbpos start, struct bbpos end, + struct bkey_s_c_backpointer bp, struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bch_backpointer bp; - struct bbpos pos; - struct bpos bp_pos; + struct bbpos pos = bp_to_bbpos(*bp.v); struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ret = __bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp_pos, &bp, 0); - if (ret || *bp_offset == U64_MAX) - return ret; - - pos = bp_to_bbpos(bp); if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp, 0); + k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { - *last_flushed_pos = bp_pos; + if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { + *last_flushed_pos = bp.k->p; ret = bch2_btree_write_buffer_flush_sync(trans) ?: -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } if (fsck_err_on(!k.k, c, - "backpointer for %llu:%llu:%llu (btree pos %llu:%llu) points to missing extent\n %s", - bucket.inode, bucket.offset, (u64) bp.bucket_offset, - bp_pos.inode, bp_pos.offset, - 
(bch2_backpointer_to_text(&buf, &bp), buf.buf))) { - ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); - if (ret == -ENOENT) - bch_err(c, "backpointer at %llu not found", *bp_offset); - } + "backpointer for missing extent\n %s", + (bch2_backpointer_k_to_text(&buf, c, bp.s_c), buf.buf))) + return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); out: fsck_err: bch2_trans_iter_exit(trans, &iter); @@ -1014,25 +831,13 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct bpos last_flushed_pos = SPOS_MAX; - int ret = 0; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - u64 bp_offset = 0; - - while (!(ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_one_backpointer(trans, iter.pos, &bp_offset, - start, end, &last_flushed_pos))) && - bp_offset < U64_MAX) - bp_offset++; - - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret < 0 ? ret : 0; + return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed_pos)); } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index ce75b8f50241..9c03709ade50 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,16 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -bool bch2_bucket_backpointer_del(struct btree_trans *, - struct bkey_i_alloc_v4 *, - struct bch_backpointer); - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, - struct bkey_i_alloc_v4 *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) @@ -71,13 +66,8 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp_k; int ret; - if (!insert && - unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) && - bch2_bucket_backpointer_del(trans, a, bp)) - return 0; - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert); + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); ret = PTR_ERR_OR_ZERO(bp_k); @@ -85,7 +75,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -126,12 +116,12 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, } int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, - u64 *, struct bch_backpointer *, unsigned); + struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer, + struct bpos, struct bch_backpointer, unsigned); struct btree *bch2_backpointer_get_node(struct 
btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, struct bch_backpointer); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index aae658d17797..013d5e185d97 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1558,7 +1558,8 @@ struct bch_sb_field_journal_seq_blacklist { x(unwritten_extents, 24) \ x(bucket_gens, 25) \ x(lru_v2, 26) \ - x(fragmentation_lru, 27) + x(fragmentation_lru, 27) \ + x(no_bps_in_alloc_keys, 28) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 5d5321dd42e8..4adb6f646655 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -60,6 +60,7 @@ enum btree_insert_flags { int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 02d264b858ab..b8299914a0ab 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1802,6 +1802,20 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6e2e2ed72f65..f3cee8f2b793 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1407,17 +1407,17 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans, bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; struct bkey_i_alloc_v4 *a; - struct bpos bucket_pos; + struct bpos bucket; struct bch_backpointer bp; s64 sectors; int ret; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); sectors = bp.bucket_len; if (!insert) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); + a = bch2_trans_start_alloc_update(trans, &iter, bucket); if (IS_ERR(a)) return PTR_ERR(a); @@ -1428,7 +1428,7 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans, goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) goto err; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d295e5401c7a..1855d08efd4b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -887,7 +887,7 @@ err: static int ec_stripe_update_extent(struct btree_trans *trans, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - u64 *bp_offset) + struct bpos *bp_pos) { struct bch_fs *c = trans->c; struct bch_backpointer bp; @@ -900,10 +900,10 @@ 
static int ec_stripe_update_extent(struct btree_trans *trans, int ret, dev, block; ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_offset, &bp, BTREE_ITER_CACHED); + bp_pos, &bp, BTREE_ITER_CACHED); if (ret) return ret; - if (*bp_offset == U64_MAX) + if (bpos_eq(*bp_pos, SPOS_MAX)) return 0; if (bp.level) { @@ -911,7 +911,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, bucket, *bp_offset, bp); + b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); bch2_trans_iter_exit(trans, &node_iter); if (!b) @@ -925,8 +925,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp, - BTREE_ITER_INTENT); + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) return ret; @@ -985,7 +984,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_fs *c = trans->c; struct bch_extent_ptr bucket = s->key.v.ptrs[block]; struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; int ret = 0; while (1) { @@ -993,13 +992,13 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_offset)); + s, &bp_pos)); if (ret) break; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index be14d3737027..9c8af0872b29 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -626,7 +626,7 @@ void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, struct bkey_s_c k; struct printbuf buf = PRINTBUF; struct bch_backpointer bp; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; unsigned nr_bps = 0; int ret; @@ -668,17 +668,16 @@ failed_to_evacuate: bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; - k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp, 0); + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -692,7 +691,7 @@ failed_to_evacuate: if (++nr_bps > 10) break; - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } bch2_print_string_as_lines(KERN_ERR, buf.buf); @@ -716,7 +715,8 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - u64 bp_offset = 0, cur_inum = U64_MAX; + u64 cur_inum = U64_MAX; + struct bpos bp_pos = POS_MIN; int ret = 0; bch2_bkey_buf_init(&sk); @@ -752,13 +752,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; if (!bp.level) { @@ -766,8 +766,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct bkey_s_c k; unsigned i = 
0; - k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp, 0); + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -822,8 +821,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, - bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) continue; @@ -851,7 +849,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, } } next: - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1b9a8329654e..8cc8af6d29ef 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1134,14 +1134,11 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_lru_v2) { - bch_info(c, "version prior to backpointers, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_no_bps_in_alloc_keys) { + bch_info(c, "version prior to no_bps_in_alloc_keys, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) { - bch_info(c, "version prior to backpointers, upgrade required"); - c->opts.version_upgrade = true; } } -- cgit From dde72e182758e455891ff61c11746085db8c27c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Mar 2023 20:16:06 -0400 Subject: bcachefs: Add missing bch2_err_class() call We're not supposed to return our private error codes to userspace. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a57ab773dd27..58a89c36cf0e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1894,7 +1894,7 @@ out: err_put_super: deactivate_locked_super(sb); - return ERR_PTR(ret); + return ERR_PTR(bch2_err_class(ret)); } static void bch2_kill_sb(struct super_block *sb) -- cgit From 8669199438aeb5daf8b17f76bc853286b93f058e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Mar 2023 18:49:02 -0400 Subject: bcachefs: Print out counters correctly Most counters aren't in units of sectors, and the ones that are should just be switched to bytes, for simplicity. 
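A small illustration of the factor-of-512 error this fixes (made-up counter
value; prt_human_readable_u64() is the existing printbuf helper):

/*
 * Illustration only: counters hold raw event counts, so printing them
 * shifted left by 9 - as if they were 512-byte sectors - inflated every
 * reported value by a factor of 512.
 */
static void example_print_counter(struct printbuf *out)
{
	u64 counter = 1000;

	prt_human_readable_u64(out, counter << 9);	/* old: printed 512x too large */
	prt_human_readable_u64(out, counter);		/* new: prints the raw count */
}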
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6be6be881dbd..b2274cb35172 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -602,12 +602,12 @@ SHOW(bch2_fs_counters) counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ prt_printf(out, "since mount:"); \ prt_tab(out); \ - prt_human_readable_u64(out, counter_since_mount << 9); \ + prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:"); \ prt_tab(out); \ - prt_human_readable_u64(out, counter << 9); \ + prt_human_readable_u64(out, counter); \ prt_newline(out); \ } BCH_PERSISTENT_COUNTERS() -- cgit From bb6c4b92fd8566aeb92e56f464ff809ae7e60155 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 10 Mar 2023 17:34:29 -0500 Subject: bcachefs: Improve trace_move_extent_fail() This greatly expands the move_extent_fail tracepoint - now it includes all the information we have available, including exactly why the extent wasn't updated. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 79 ++++++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/trace.h | 18 +++++++++-- 2 files changed, 90 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 865514dd2aa9..7a5850679f16 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -7,6 +7,7 @@ #include "buckets.h" #include "data_update.h" #include "ec.h" +#include "error.h" #include "extents.h" #include "io.h" #include "keylist.h" @@ -91,6 +92,70 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } +static void trace_move_extent_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, + struct bkey_i *insert, + const char *msg) +{ + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + unsigned i, rewrites_found = 0; + + if (!trace_move_extent_fail_enabled()) + return; + + prt_str(&buf, msg); + + if (insert) { + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + struct bkey_s new_s; + new_s.k = (void *) new.k; + new_s.v = (void *) new.v; + + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) + rewrites_found |= 1U << i; + i++; + } + } + + prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", + (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + + prt_printf(&buf, "\nrewrites found: %u%u%u%u", + (rewrites_found & (1 << 0)) != 0, + (rewrites_found & (1 << 1)) != 0, + (rewrites_found & (1 << 2)) != 0, + (rewrites_found & (1 << 3)) != 0); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, new); + + prt_str(&buf, "\nwrote: "); + bch2_bkey_val_to_text(&buf, c, wrote); + + if (insert) { + prt_str(&buf, "\ninsert: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + } + + trace_move_extent_fail(c, buf.buf); + printbuf_exit(&buf); +} + static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { @@ -134,8 +199,11 @@ static int 
__bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); - if (!bch2_extents_match(k, old)) + if (!bch2_extents_match(k, old)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); goto nowork; + } bkey_reassemble(_insert.k, k); insert = _insert.k; @@ -174,8 +242,10 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (m->data_opts.rewrite_ptrs && !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); goto nowork; + } /* * A replica that we just wrote might conflict with a replica @@ -189,8 +259,10 @@ restart_drop_conflicting_replicas: goto restart_drop_conflicting_replicas; } - if (!bkey_val_u64s(&new->k)) + if (!bkey_val_u64s(&new->k)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); goto nowork; + } /* Now, drop pointers that conflict with what we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -293,7 +365,6 @@ nowork: } this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size); - trace_move_extent_fail(&new->k); bch2_btree_iter_advance(&iter); goto next; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 65521c046254..bbe8eb7a29eb 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -682,9 +682,21 @@ DEFINE_EVENT(bkey, move_extent_finish, TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_extent_fail, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) +TRACE_EVENT(move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *msg), + TP_ARGS(c, msg), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __string(msg, msg ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __assign_str(msg, msg); + ), + + TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, -- cgit From 2776369266ad04095204128d72cf0d429b5fb678 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Apr 2023 13:25:06 -0400 Subject: bcachefs: Add a cond_resched() call to journal_keys_sort() We're just doing cpu work here and it could take awhile, a cond_resched() is definitely needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8cc8af6d29ef..91a66b5916eb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -543,6 +543,8 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; + cond_resched(); + for_each_jset_key(k, entry, &i->j) { if (keys->nr == keys->size) { __journal_keys_sort(keys); -- cgit From c437e1537916e3c0fcc04387888e402a5940e49c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Mar 2023 15:16:24 -0400 Subject: bcachefs: Add a bch_page_state assert Seeing an odd bug with page/folio state not being properly initialized, this is to help track it down. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index d98b654c92b1..e25dbbf46107 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2682,6 +2682,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, goto unlock; } + BUG_ON(!s->uptodate); + if (index != start >> PAGE_SHIFT) start_offset = 0; if (index != end >> PAGE_SHIFT) -- cgit From 3342ac134d403d4451e533ab7eb3aee19b01f478 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 12:53:15 -0400 Subject: bcachefs: Rename bch_page_state -> bch_folio Start of the large folio conversion. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 124 ++++++++++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e25dbbf46107..aef2a094315a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -336,7 +336,7 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ -struct bch_page_sector { +struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ unsigned nr_replicas:4; @@ -353,44 +353,48 @@ struct bch_page_sector { } state:8; }; -struct bch_page_state { +struct bch_folio { spinlock_t lock; atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ bool uptodate; - struct bch_page_sector s[PAGE_SECTORS]; + struct bch_folio_sector s[PAGE_SECTORS]; }; -static inline struct bch_page_state *__bch2_page_state(struct page *page) +static inline struct bch_folio *__bch2_folio(struct page *page) { return page_has_private(page) - ? (struct bch_page_state *) page_private(page) + ? 
(struct bch_folio *) page_private(page) : NULL; } -static inline struct bch_page_state *bch2_page_state(struct page *page) +static inline struct bch_folio *bch2_folio(struct page *page) { EBUG_ON(!PageLocked(page)); - return __bch2_page_state(page); + return __bch2_folio(page); } /* for newly allocated pages: */ -static void __bch2_page_state_release(struct page *page) +static void __bch2_folio_release(struct page *page) { kfree(detach_page_private(page)); } -static void bch2_page_state_release(struct page *page) +static void bch2_folio_release(struct page *page) { EBUG_ON(!PageLocked(page)); - __bch2_page_state_release(page); + __bch2_folio_release(page); } /* for newly allocated pages: */ -static struct bch_page_state *__bch2_page_state_create(struct page *page, +static struct bch_folio *__bch2_folio_create(struct page *page, gfp_t gfp) { - struct bch_page_state *s; + struct bch_folio *s; s = kzalloc(sizeof(*s), GFP_NOFS|gfp); if (!s) @@ -401,10 +405,10 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return s; } -static struct bch_page_state *bch2_page_state_create(struct page *page, +static struct bch_folio *bch2_folio_create(struct page *page, gfp_t gfp) { - return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); + return bch2_folio(page) ?: __bch2_folio_create(page, gfp); } static unsigned bkey_to_sector_state(struct bkey_s_c k) @@ -416,11 +420,11 @@ static unsigned bkey_to_sector_state(struct bkey_s_c k) return SECTOR_UNALLOCATED; } -static void __bch2_page_state_set(struct page *page, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) +static void __bch2_folio_set(struct page *page, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) { - struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); + struct bch_folio *s = bch2_folio_create(page, __GFP_NOFAIL); unsigned i; BUG_ON(pg_offset >= PAGE_SECTORS); @@ -429,8 +433,8 @@ static void __bch2_page_state_set(struct page *page, spin_lock(&s->lock); for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; } if (i == PAGE_SECTORS) @@ -439,8 +443,12 @@ static void __bch2_page_state_set(struct page *page, spin_unlock(&s->lock); } -static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, - struct page **pages, unsigned nr_pages) +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct page **pages, unsigned nr_pages) { struct btree_trans trans; struct btree_iter iter; @@ -474,8 +482,8 @@ retry: BUG_ON(k.k->p.offset < pg_start); BUG_ON(bkey_start_offset(k.k) > pg_end); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) - __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) + __bch2_folio_set(page, pg_offset, pg_len, nr_ptrs, state); if (k.k->p.offset < pg_end) break; @@ -505,7 +513,7 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) unsigned state = bkey_to_sector_state(k); bio_for_each_segment(bv, bio, iter) - __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, + __bch2_folio_set(bv.bv_page, bv.bv_offset >> 9, bv.bv_len >> 9, nr_ptrs, state); } @@ -530,14 +538,14 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; unsigned 
pg_offset = max(start, pg_start) - pg_start; unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + struct bch_folio *s; BUG_ON(end <= pg_start); BUG_ON(pg_offset >= PAGE_SECTORS); BUG_ON(pg_offset + pg_len > PAGE_SECTORS); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(&folio->page); if (s) { spin_lock(&s->lock); @@ -576,14 +584,14 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; unsigned pg_offset = max(start, pg_start) - pg_start; unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + struct bch_folio *s; BUG_ON(end <= pg_start); BUG_ON(pg_offset >= PAGE_SECTORS); BUG_ON(pg_offset + pg_len > PAGE_SECTORS); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(&folio->page); if (s) { spin_lock(&s->lock); @@ -619,8 +627,8 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info : c->opts.data_replicas; } -static inline unsigned sectors_to_reserve(struct bch_page_sector *s, - unsigned nr_replicas) +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) { return max(0, (int) nr_replicas - s->nr_replicas - @@ -631,7 +639,7 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(page, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res = { 0 }; unsigned i, disk_res_sectors = 0; @@ -688,7 +696,7 @@ static int bch2_page_reservation_get(struct bch_fs *c, struct bch2_page_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(page, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; @@ -732,7 +740,7 @@ static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(page); struct disk_reservation disk_res = { 0 }; int i, dirty_sectors = 0; @@ -763,7 +771,7 @@ static void bch2_clear_page_bits(struct page *page) i_sectors_acct(c, inode, NULL, dirty_sectors); - bch2_page_state_release(page); + bch2_folio_release(page); } static void bch2_set_page_dirty(struct bch_fs *c, @@ -771,7 +779,7 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch2_page_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(page); unsigned i, dirty_sectors = 0; WARN_ON((u64) page_offset(page) + offset + len > @@ -889,8 +897,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { + if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { + if (bch2_folio_set(c, inode_inum(inode), &page, 1)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; @@ -979,7 +987,7 @@ static int readpages_iter_init(struct readpages_iter *iter, nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); for (i = 0; i < nr_pages; i++) { - __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + 
__bch2_folio_create(iter->pages[i], __GFP_NOFAIL); put_page(iter->pages[i]); } @@ -1036,7 +1044,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!page) break; - if (!__bch2_page_state_create(page, 0)) { + if (!__bch2_folio_create(page, 0)) { put_page(page); break; } @@ -1044,7 +1052,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, ret = add_to_page_cache_lru(page, iter->mapping, page_offset, GFP_NOFS); if (ret) { - __bch2_page_state_release(page); + __bch2_folio_release(page); put_page(page); break; } @@ -1215,7 +1223,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; - bch2_page_state_create(page, __GFP_NOFAIL); + bch2_folio_create(page, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = @@ -1302,12 +1310,12 @@ static void bch2_writepage_io_done(struct bch_write_op *op) set_bit(EI_INODE_ERROR, &io->inode->ei_flags); bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + struct bch_folio *s; SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(bvec->bv_page); spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; @@ -1317,9 +1325,9 @@ static void bch2_writepage_io_done(struct bch_write_op *op) if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + struct bch_folio *s; - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(bvec->bv_page); spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; @@ -1347,7 +1355,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + struct bch_folio *s = __bch2_folio(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) end_page_writeback(bvec->bv_page); @@ -1406,7 +1414,7 @@ static int __bch2_writepage(struct folio *folio, struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state *s, orig; + struct bch_folio *s, orig; unsigned i, offset, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -1434,7 +1442,7 @@ static int __bch2_writepage(struct folio *folio, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: - s = bch2_page_state_create(page, __GFP_NOFAIL); + s = bch2_folio_create(page, __GFP_NOFAIL); /* * Things get really hairy with errors during writeback: @@ -1604,8 +1612,8 @@ readpage: if (ret) goto err; out: - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), &page, 1); if (ret) goto err; } @@ -1744,9 +1752,9 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, unsigned pg_len = min_t(unsigned, len - reserved, PAGE_SIZE - pg_offset); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), - pages + i, nr_pages - i); + if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), + pages + i, nr_pages - i); if (ret) goto out; } @@ -2634,7 +2642,7 @@ static int 
__bch2_truncate_page(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bch_page_state *s; + struct bch_folio *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; @@ -2670,7 +2678,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, } } - s = bch2_page_state_create(page, 0); + s = bch2_folio_create(page, 0); if (!s) { ret = -ENOMEM; goto unlock; @@ -3459,7 +3467,7 @@ err: static int folio_data_offset(struct folio *folio, unsigned offset) { - struct bch_page_state *s = bch2_page_state(&folio->page); + struct bch_folio *s = bch2_folio(&folio->page); unsigned i; if (s) @@ -3566,7 +3574,7 @@ err: static int __page_hole_offset(struct page *page, unsigned offset) { - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(page); unsigned i; if (!s) -- cgit From 30bff5944eb51a1d28b286c766599845939d2a47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 14:55:53 -0400 Subject: bcachefs: Initial folio conversion This converts fs-io.c to pass folios, not pages. We're not handling large folios yet, there's no functional changes in this patch - just a lot of churn doing the initial type conversions. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 613 +++++++++++++++++++++++++++------------------------- 1 file changed, 317 insertions(+), 296 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index aef2a094315a..c688adc2527f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,26 @@ #include +static inline loff_t folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline loff_t folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + struct nocow_flush { struct closure *cl; struct bch_dev *ca; @@ -364,35 +384,34 @@ struct bch_folio { struct bch_folio_sector s[PAGE_SECTORS]; }; -static inline struct bch_folio *__bch2_folio(struct page *page) +static inline struct bch_folio *__bch2_folio(struct folio *folio) { - return page_has_private(page) - ? (struct bch_folio *) page_private(page) + return folio_has_private(folio) + ? 
(struct bch_folio *) folio_get_private(folio) : NULL; } -static inline struct bch_folio *bch2_folio(struct page *page) +static inline struct bch_folio *bch2_folio(struct folio *folio) { - EBUG_ON(!PageLocked(page)); + EBUG_ON(!folio_test_locked(folio)); - return __bch2_folio(page); + return __bch2_folio(folio); } -/* for newly allocated pages: */ -static void __bch2_folio_release(struct page *page) +/* for newly allocated folios: */ +static void __bch2_folio_release(struct folio *folio) { - kfree(detach_page_private(page)); + kfree(folio_detach_private(folio)); } -static void bch2_folio_release(struct page *page) +static void bch2_folio_release(struct folio *folio) { - EBUG_ON(!PageLocked(page)); - __bch2_folio_release(page); + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); } -/* for newly allocated pages: */ -static struct bch_folio *__bch2_folio_create(struct page *page, - gfp_t gfp) +/* for newly allocated folios: */ +static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) { struct bch_folio *s; @@ -401,14 +420,13 @@ static struct bch_folio *__bch2_folio_create(struct page *page, return NULL; spin_lock_init(&s->lock); - attach_page_private(page, s); + folio_attach_private(folio, s); return s; } -static struct bch_folio *bch2_folio_create(struct page *page, - gfp_t gfp) +static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) { - return bch2_folio(page) ?: __bch2_folio_create(page, gfp); + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); } static unsigned bkey_to_sector_state(struct bkey_s_c k) @@ -420,11 +438,11 @@ static unsigned bkey_to_sector_state(struct bkey_s_c k) return SECTOR_UNALLOCATED; } -static void __bch2_folio_set(struct page *page, +static void __bch2_folio_set(struct folio *folio, unsigned pg_offset, unsigned pg_len, unsigned nr_ptrs, unsigned state) { - struct bch_folio *s = bch2_folio_create(page, __GFP_NOFAIL); + struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL); unsigned i; BUG_ON(pg_offset >= PAGE_SECTORS); @@ -448,13 +466,13 @@ static void __bch2_folio_set(struct page *page, * extents btree: */ static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct page **pages, unsigned nr_pages) + struct folio **folios, unsigned nr_folios) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; - unsigned pg_idx = 0; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx = 0; u32 snapshot; int ret; @@ -472,25 +490,25 @@ retry: unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); - while (pg_idx < nr_pages) { - struct page *page = pages[pg_idx]; - u64 pg_start = page->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; - unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - BUG_ON(k.k->p.offset < pg_start); - BUG_ON(bkey_start_offset(k.k) > pg_end); + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) - __bch2_folio_set(page, pg_offset, 
pg_len, nr_ptrs, state); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (k.k->p.offset < pg_end) + if (k.k->p.offset < folio_end) break; - pg_idx++; + folio_idx++; } - if (pg_idx == nr_pages) + if (folio_idx == nr_folios) break; } @@ -513,8 +531,8 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) unsigned state = bkey_to_sector_state(k); bio_for_each_segment(bv, bio, iter) - __bch2_folio_set(bv.bv_page, bv.bv_offset >> 9, - bv.bv_len >> 9, nr_ptrs, state); + __bch2_folio_set(page_folio(bv.bv_page), bv.bv_offset >> 9, + bv.bv_len >> 9, nr_ptrs, state); } static void mark_pagecache_unallocated(struct bch_inode_info *inode, @@ -534,22 +552,22 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; + u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); + BUG_ON(folio_offset >= PAGE_SECTORS); + BUG_ON(folio_offset + folio_len > PAGE_SECTORS); folio_lock(folio); - s = bch2_folio(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) + for (j = folio_offset; j < folio_offset + folio_len; j++) s->s[j].nr_replicas = 0; spin_unlock(&s->lock); } @@ -580,22 +598,22 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; + u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); + BUG_ON(folio_offset >= PAGE_SECTORS); + BUG_ON(folio_offset + folio_len > PAGE_SECTORS); folio_lock(folio); - s = bch2_folio(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) + for (j = folio_offset; j < folio_offset + folio_len; j++) switch (s->s[j].state) { case SECTOR_UNALLOCATED: s->s[j].state = SECTOR_RESERVED; @@ -635,11 +653,11 @@ static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, s->replicas_reserved); } -static int bch2_get_page_disk_reservation(struct bch_fs *c, +static int bch2_get_folio_disk_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) + struct folio *folio, bool check_enospc) { - struct 
bch_folio *s = bch2_folio_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res = { 0 }; unsigned i, disk_res_sectors = 0; @@ -669,34 +687,35 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, return 0; } -struct bch2_page_reservation { +struct bch2_folio_reservation { struct disk_reservation disk; struct quota_res quota; }; -static void bch2_page_reservation_init(struct bch_fs *c, +static void bch2_folio_reservation_init(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { memset(res, 0, sizeof(*res)); res->disk.nr_replicas = inode_nr_replicas(c, inode); } -static void bch2_page_reservation_put(struct bch_fs *c, +static void bch2_folio_reservation_put(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { bch2_disk_reservation_put(c, &res->disk); bch2_quota_reservation_put(c, inode, &res->quota); } -static int bch2_page_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_folio *s = bch2_folio_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; @@ -736,19 +755,19 @@ static int bch2_page_reservation_get(struct bch_fs *c, return 0; } -static void bch2_clear_page_bits(struct page *page) +static void bch2_clear_folio_bits(struct folio *folio) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(page); + struct bch_folio *s = bch2_folio(folio); struct disk_reservation disk_res = { 0 }; int i, dirty_sectors = 0; if (!s) return; - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); for (i = 0; i < ARRAY_SIZE(s->s); i++) { disk_res.sectors += s->s[i].replicas_reserved; @@ -771,18 +790,19 @@ static void bch2_clear_page_bits(struct page *page) i_sectors_acct(c, inode, NULL, dirty_sectors); - bch2_folio_release(page); + bch2_folio_release(folio); } -static void bch2_set_page_dirty(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_folio *s = bch2_folio(page); + struct bch_folio *s = bch2_folio(folio); unsigned i, dirty_sectors = 0; - WARN_ON((u64) page_offset(page) + offset + len > + WARN_ON((u64) folio_pos(folio) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); spin_lock(&s->lock); @@ -819,8 +839,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, i_sectors_acct(c, inode, &res->quota, dirty_sectors); - if (!PageDirty(page)) - filemap_dirty_folio(inode->v.i_mapping, page_folio(page)); + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -863,17 +883,17 @@ got_lock: vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { - 
struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation res; + struct bch2_folio_reservation res; unsigned len; loff_t isize; int ret; - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -886,35 +906,35 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) */ bch2_pagecache_add_get(inode); - lock_page(page); + folio_lock(folio); isize = i_size_read(&inode->v); - if (page->mapping != mapping || page_offset(page) >= isize) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + len = min_t(loff_t, PAGE_SIZE, isize - folio_pos(folio)); - if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { - if (bch2_folio_set(c, inode_inum(inode), &page, 1)) { - unlock_page(page); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { + if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) { + folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; } } - if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { - unlock_page(page); + if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; } - bch2_set_page_dirty(c, inode, page, &res, 0, len); - bch2_page_reservation_put(c, inode, &res); + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(inode); @@ -928,7 +948,7 @@ void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (offset || length < folio_size(folio)) return; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); } bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) @@ -936,7 +956,7 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) if (folio_test_dirty(folio) || folio_test_writeback(folio)) return false; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); return true; } @@ -944,19 +964,16 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) static void bch2_readpages_end_io(struct bio *bio) { - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) { - struct page *page = bv->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (!bio->bi_status) { - SetPageUptodate(page); + folio_mark_uptodate(fi.folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); } - unlock_page(page); + folio_unlock(fi.folio); } bio_put(bio); @@ -987,21 +1004,21 @@ static int readpages_iter_init(struct readpages_iter *iter, nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); for (i = 0; i < nr_pages; i++) { - __bch2_folio_create(iter->pages[i], __GFP_NOFAIL); + __bch2_folio_create(page_folio(iter->pages[i]), __GFP_NOFAIL); put_page(iter->pages[i]); } return 0; } -static inline struct page *readpage_iter_next(struct readpages_iter *iter) +static inline struct folio *readpage_iter_next(struct readpages_iter *iter) { if (iter->idx >= iter->nr_pages) return NULL; 
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - return iter->pages[iter->idx]; + return page_folio(iter->pages[iter->idx]); } static bool extent_partial_reads_expensive(struct bkey_s_c k) @@ -1023,12 +1040,12 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - struct page *page = readpage_iter_next(iter); + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + struct folio *folio = readpage_iter_next(iter); int ret; - if (page) { - if (iter->offset + iter->idx != page_offset) + if (folio) { + if (iter->offset + iter->idx != folio_offset) break; iter->idx++; @@ -1036,31 +1053,30 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - page = xa_load(&iter->mapping->i_pages, page_offset); - if (page && !xa_is_value(page)) + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) break; - page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); - if (!page) + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) break; - if (!__bch2_folio_create(page, 0)) { - put_page(page); + if (!__bch2_folio_create(folio, 0)) { + folio_put(folio); break; } - ret = add_to_page_cache_lru(page, iter->mapping, - page_offset, GFP_NOFS); + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS); if (ret) { - __bch2_folio_release(page); - put_page(page); + __bch2_folio_release(folio); + folio_put(folio); break; } - put_page(page); + folio_put(folio); } - BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } } @@ -1178,7 +1194,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; struct btree_trans trans; - struct page *page; + struct folio *folio; struct readpages_iter readpages_iter; int ret; @@ -1191,7 +1207,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_get(inode); - while ((page = readpage_iter_next(&readpages_iter))) { + while ((folio = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; unsigned n = min_t(unsigned, readpages_iter.nr_pages - @@ -1206,7 +1222,7 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); @@ -1218,30 +1234,29 @@ void bch2_readahead(struct readahead_control *ractl) kfree(readpages_iter.pages); } -static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct page *page) +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) { struct btree_trans trans; - bch2_folio_create(page, __GFP_NOFAIL); + bch2_folio_create(folio, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTORS_SHIFT; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bch2_trans_init(&trans, c, 0, 0); 
bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } -static void bch2_read_single_page_end_io(struct bio *bio) +static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); } -static int bch2_read_single_page(struct page *page, - struct address_space *mapping) +static int bch2_read_single_folio(struct folio *folio, + struct address_space *mapping) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1255,9 +1270,9 @@ static int bch2_read_single_page(struct page *page, rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), opts); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_page_end_io; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readpage(c, rbio, inode_inum(inode), page); + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1266,16 +1281,15 @@ static int bch2_read_single_page(struct page *page, if (ret < 0) return ret; - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } int bch2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret; - ret = bch2_read_single_page(page, page->mapping); + ret = bch2_read_single_folio(folio, folio->mapping); folio_unlock(folio); return bch2_err_class(ret); } @@ -1315,7 +1329,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); - s = __bch2_folio(bvec->bv_page); + s = __bch2_folio(page_folio(bvec->bv_page)); spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; @@ -1327,7 +1341,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_segment_all(bvec, bio, iter) { struct bch_folio *s; - s = __bch2_folio(bvec->bv_page); + s = __bch2_folio(page_folio(bvec->bv_page)); spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; @@ -1355,10 +1369,11 @@ static void bch2_writepage_io_done(struct bch_write_op *op) i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); bio_for_each_segment_all(bvec, bio, iter) { - struct bch_folio *s = __bch2_folio(bvec->bv_page); + struct folio *folio = page_folio(bvec->bv_page); + struct bch_folio *s = __bch2_folio(folio); if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(bvec->bv_page); + folio_end_writeback(folio); } bio_put(&io->op.wbio.bio); @@ -1410,44 +1425,44 @@ static int __bch2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_folio *s, orig; - unsigned i, offset, nr_replicas_this_write = U32_MAX; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; int ret; - EBUG_ON(!PageUptodate(page)); + EBUG_ON(!folio_test_uptodate(folio)); - /* Is the page fully inside i_size? */ - if (page->index < end_index) + /* Is the folio fully inside i_size? */ + if (folio->index < end_index) goto do_io; - /* Is the page fully outside i_size? (truncate in progress) */ + /* Is the folio fully outside i_size? 
(truncate in progress) */ offset = i_size & (PAGE_SIZE - 1); - if (page->index > end_index || !offset) { - unlock_page(page); + if (folio->index > end_index || !offset) { + folio_unlock(folio); return 0; } /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_SIZE); + folio_zero_segment(folio, offset, folio_size(folio)); do_io: - s = bch2_folio_create(page, __GFP_NOFAIL); + f_sectors = folio_sectors(folio); + s = bch2_folio_create(folio, __GFP_NOFAIL); /* * Things get really hairy with errors during writeback: */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ @@ -1455,7 +1470,7 @@ do_io: orig = *s; spin_unlock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) { + for (i = 0; i < f_sectors; i++) { if (s->s[i].state < SECTOR_DIRTY) continue; @@ -1465,7 +1480,7 @@ do_io: s->s[i].replicas_reserved); } - for (i = 0; i < PAGE_SECTORS; i++) { + for (i = 0; i < f_sectors; i++) { if (s->s[i].state < SECTOR_DIRTY) continue; @@ -1479,24 +1494,24 @@ do_io: BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); - unlock_page(page); + folio_unlock(folio); offset = 0; while (1) { unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; - while (offset < PAGE_SECTORS && + while (offset < f_sectors && orig.s[offset].state < SECTOR_DIRTY) offset++; - if (offset == PAGE_SECTORS) + if (offset == f_sectors) break; - while (offset + sectors < PAGE_SECTORS && + while (offset + sectors < f_sectors && orig.s[offset + sectors].state >= SECTOR_DIRTY) { reserved_sectors += orig.s[offset + sectors].replicas_reserved; dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; @@ -1504,7 +1519,7 @@ do_io: } BUG_ON(!sectors); - sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; + sector = folio_sector(folio) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || @@ -1521,7 +1536,7 @@ do_io: atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); /* Check for writing past i_size: */ @@ -1541,7 +1556,7 @@ do_io: } if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(page); + folio_end_writeback(folio); return 0; } @@ -1570,61 +1585,63 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res; + struct bch2_folio_reservation *res; pgoff_t index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); - struct page *page; + struct folio *folio; int ret = -ENOMEM; res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; - 
bch2_page_reservation_init(c, inode, res); + bch2_folio_reservation_init(c, inode, res); *fsdata = res; bch2_pagecache_add_get(inode); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (!folio) goto err_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* If we're writing entire page, don't need to read it in first: */ - if (len == PAGE_SIZE) + /* If we're writing entire folio, don't need to read it in first: */ + if (len == folio_size(folio)) goto out; if (!offset && pos + len >= inode->v.i_size) { - zero_user_segment(page, len, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } if (index > inode->v.i_size >> PAGE_SHIFT) { - zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } readpage: - ret = bch2_read_single_page(page, mapping); + ret = bch2_read_single_folio(folio, mapping); if (ret) goto err; out: - if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_folio_set(c, inode_inum(inode), &page, 1); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); if (ret) goto err; } - ret = bch2_page_reservation_get(c, inode, page, res, offset, len); + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); if (ret) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* - * If the page hasn't been read in, we won't know if we + * If the folio hasn't been read in, we won't know if we * actually need a reservation - we don't actually need - * to read here, we just need to check if the page is + * to read here, we just need to check if the folio is * fully backed by uncompressed data: */ goto readpage; @@ -1633,11 +1650,11 @@ out: goto err; } - *pagep = page; + *pagep = &folio->page; return 0; err: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); @@ -1652,19 +1669,20 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res = fsdata; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); unsigned offset = pos & (PAGE_SIZE - 1); lockdep_assert_held(&inode->v.i_rwsem); - if (unlikely(copied < len && !PageUptodate(page))) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * The page needs to be read in, but that would destroy + * The folio needs to be read in, but that would destroy * our partial write - simplest thing is to just force * userspace to redo the write: */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); copied = 0; } @@ -1674,19 +1692,19 @@ int bch2_write_end(struct file *file, struct address_space *mapping, spin_unlock(&inode->v.i_lock); if (copied) { - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); - bch2_set_page_dirty(c, inode, page, res, offset, copied); + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); inode->ei_last_dirtied = 
(unsigned long) current; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); bch2_pagecache_add_put(inode); - bch2_page_reservation_put(c, inode, res); + bch2_folio_reservation_put(c, inode, res); kfree(res); return copied; @@ -1700,46 +1718,49 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct page *pages[WRITE_BATCH_PAGES]; - struct bch2_page_reservation res; + struct folio *folios[WRITE_BATCH_PAGES]; + struct bch2_folio_reservation res; unsigned long index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); - unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + unsigned nr_folios = DIV_ROUND_UP(offset + len, PAGE_SIZE); unsigned i, reserved = 0, set_dirty = 0; - unsigned copied = 0, nr_pages_copied = 0; + unsigned copied = 0, nr_folios_copied = 0; int ret = 0; BUG_ON(!len); - BUG_ON(nr_pages > ARRAY_SIZE(pages)); + BUG_ON(nr_folios > ARRAY_SIZE(folios)); - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); - for (i = 0; i < nr_pages; i++) { - pages[i] = grab_cache_page_write_begin(mapping, index + i); - if (!pages[i]) { - nr_pages = i; + for (i = 0; i < nr_folios; i++) { + folios[i] = __filemap_get_folio(mapping, index + i, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (!folios[i]) { + nr_folios = i; if (!i) { ret = -ENOMEM; goto out; } len = min_t(unsigned, len, - nr_pages * PAGE_SIZE - offset); + nr_folios * PAGE_SIZE - offset); break; } } - if (offset && !PageUptodate(pages[0])) { - ret = bch2_read_single_page(pages[0], mapping); + if (offset && !folio_test_uptodate(folios[0])) { + ret = bch2_read_single_folio(folios[0], mapping); if (ret) goto out; } if ((pos + len) & (PAGE_SIZE - 1) && - !PageUptodate(pages[nr_pages - 1])) { - if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { - zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + !folio_test_uptodate(folios[nr_folios - 1])) { + if ((index + nr_folios - 1) << PAGE_SHIFT >= inode->v.i_size) { + folio_zero_range(folios[nr_folios - 1], 0, + folio_size(folios[nr_folios - 1])); } else { - ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + ret = bch2_read_single_folio(folios[nr_folios - 1], mapping); if (ret) goto out; } @@ -1747,14 +1768,14 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, while (reserved < len) { unsigned i = (offset + reserved) >> PAGE_SHIFT; - struct page *page = pages[i]; - unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, len - reserved, - PAGE_SIZE - pg_offset); + struct folio *folio = folios[i]; + unsigned folio_offset = (offset + reserved) & (PAGE_SIZE - 1); + unsigned folio_len = min_t(unsigned, len - reserved, + PAGE_SIZE - folio_offset); - if (!bch2_folio_create(page, __GFP_NOFAIL)->uptodate) { + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { ret = bch2_folio_set(c, inode_inum(inode), - pages + i, nr_pages - i); + folios + i, nr_folios - i); if (ret) goto out; } @@ -1767,43 +1788,43 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, * we aren't completely out of disk space - we don't do that * yet: */ - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len); + ret = bch2_folio_reservation_get(c, inode, folio, &res, + folio_offset, folio_len); if (unlikely(ret)) { if (!reserved) goto out; break; } - reserved += pg_len; + reserved += folio_len; } if 
(mapping_writably_mapped(mapping)) - for (i = 0; i < nr_pages; i++) - flush_dcache_page(pages[i]); + for (i = 0; i < nr_folios; i++) + flush_dcache_folio(folios[i]); while (copied < reserved) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, reserved - copied, - PAGE_SIZE - pg_offset); - unsigned pg_copied = copy_page_from_iter_atomic(page, - pg_offset, pg_len, iter); - - if (!pg_copied) + struct folio *folio = folios[(offset + copied) >> PAGE_SHIFT]; + unsigned folio_offset = (offset + copied) & (PAGE_SIZE - 1); + unsigned folio_len = min_t(unsigned, reserved - copied, + PAGE_SIZE - folio_offset); + unsigned folio_copied = copy_page_from_iter_atomic(&folio->page, + folio_offset, folio_len, iter); + + if (!folio_copied) break; - if (!PageUptodate(page) && - pg_copied != PAGE_SIZE && - pos + copied + pg_copied < inode->v.i_size) { - zero_user(page, 0, PAGE_SIZE); + if (!folio_test_uptodate(folio) && + folio_copied != PAGE_SIZE && + pos + copied + folio_copied < inode->v.i_size) { + folio_zero_range(folio, 0, folio_size(folio)); break; } - flush_dcache_page(page); - copied += pg_copied; + flush_dcache_folio(folio); + copied += folio_copied; - if (pg_copied != pg_len) + if (folio_copied != folio_len) break; } @@ -1816,30 +1837,30 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, spin_unlock(&inode->v.i_lock); while (set_dirty < copied) { - struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, copied - set_dirty, - PAGE_SIZE - pg_offset); + struct folio *folio = folios[(offset + set_dirty) >> PAGE_SHIFT]; + unsigned folio_offset = (offset + set_dirty) & (PAGE_SIZE - 1); + unsigned folio_len = min_t(unsigned, copied - set_dirty, + PAGE_SIZE - folio_offset); - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); - bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); - unlock_page(page); - put_page(page); + bch2_set_folio_dirty(c, inode, folio, &res, folio_offset, folio_len); + folio_unlock(folio); + folio_put(folio); - set_dirty += pg_len; + set_dirty += folio_len; } - nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + nr_folios_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); inode->ei_last_dirtied = (unsigned long) current; out: - for (i = nr_pages_copied; i < nr_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + for (i = nr_folios_copied; i < nr_folios; i++) { + folio_unlock(folios[i]); + folio_put(folios[i]); } - bch2_page_reservation_put(c, inode, &res); + bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; } @@ -2646,7 +2667,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; - struct page *page; + struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; @@ -2659,11 +2680,11 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, if (index << PAGE_SHIFT >= inode->v.i_size) return 0; - page = find_lock_page(mapping, index); - if (!page) { + folio = filemap_lock_folio(mapping, index); + if (!folio) { /* * XXX: we're doing two index lookups when we end up reading the - * page + * folio */ ret = range_has_data(c, inode->ei_subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), @@ -2671,21 +2692,22 @@ static 
int __bch2_truncate_page(struct bch_inode_info *inode, if (ret <= 0) return ret; - page = find_or_create_page(mapping, index, GFP_KERNEL); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); + if (unlikely(!folio)) { ret = -ENOMEM; goto out; } } - s = bch2_folio_create(page, 0); + s = bch2_folio_create(folio, 0); if (!s) { ret = -ENOMEM; goto unlock; } - if (!PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); + if (!folio_test_uptodate(folio)) { + ret = bch2_read_single_folio(folio, mapping); if (ret) goto unlock; } @@ -2709,33 +2731,33 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, i_sectors_acct(c, inode, NULL, i_sectors_delta); /* - * Caller needs to know whether this page will be written out by + * Caller needs to know whether this folio will be written out by * writeback - doing an i_size update if necessary - or whether it will * be responsible for the i_size update: */ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; - zero_user_segment(page, start_offset, end_offset); + folio_zero_segment(folio, start_offset, end_offset); /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. * - * XXX: because we aren't currently tracking whether the page has actual + * XXX: because we aren't currently tracking whether the folio has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); + BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); /* * This removes any writeable userspace mappings; we need to force * .page_mkwrite to be called again before any mmapped writes, to * redirty the full page: */ - page_mkclean(page); - filemap_dirty_folio(mapping, page_folio(page)); + folio_mkclean(folio); + filemap_dirty_folio(mapping, folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: return ret; } @@ -3467,7 +3489,7 @@ err: static int folio_data_offset(struct folio *folio, unsigned offset) { - struct bch_folio *s = bch2_folio(&folio->page); + struct bch_folio *s = bch2_folio(folio); unsigned i; if (s) @@ -3572,9 +3594,9 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int __page_hole_offset(struct page *page, unsigned offset) +static int __folio_hole_offset(struct folio *folio, unsigned offset) { - struct bch_folio *s = bch2_folio(page); + struct bch_folio *s = bch2_folio(folio); unsigned i; if (!s) @@ -3587,22 +3609,21 @@ static int __page_hole_offset(struct page *page, unsigned offset) return -1; } -static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) +static loff_t folio_hole_offset(struct address_space *mapping, loff_t offset) { pgoff_t index = offset >> PAGE_SHIFT; - struct page *page; - int pg_offset; + struct folio *folio; + int folio_offset; loff_t ret = -1; - page = find_lock_page(mapping, index); - if (!page) + folio = filemap_lock_folio(mapping, index); + if (!folio) return offset; - pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); - if (pg_offset >= 0) - ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; - - unlock_page(page); + folio_offset = __folio_hole_offset(folio, offset & (folio_size(folio) - 1)); + if (folio_offset >= 0) + ret = folio_pos(folio) + folio_offset; + folio_unlock(folio); return ret; } @@ -3615,7 +3636,7 @@ static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t offset = start_offset, hole; 
while (offset < end_offset) { - hole = page_hole_offset(mapping, offset); + hole = folio_hole_offset(mapping, offset); if (hole >= 0 && hole <= end_offset) return max(start_offset, hole); -- cgit From ff9c301f287657c445136d9168261b5fa7f7ae91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 18:59:21 -0400 Subject: bcachefs: bio_for_each_segment_all() -> bio_for_each_folio_all() This converts the writepage end_io path to folios. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c688adc2527f..018ada1a0136 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1316,34 +1316,33 @@ static void bch2_writepage_io_done(struct bch_write_op *op) container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; - struct bvec_iter_all iter; - struct bio_vec *bvec; + struct folio_iter fi; unsigned i; if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, iter) { + bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - SetPageError(bvec->bv_page); - mapping_set_error(bvec->bv_page->mapping, -EIO); + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); - s = __bch2_folio(page_folio(bvec->bv_page)); + s = __bch2_folio(fi.folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, iter) { + bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - s = __bch2_folio(page_folio(bvec->bv_page)); + s = __bch2_folio(fi.folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } @@ -1368,12 +1367,11 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, iter) { - struct folio *folio = page_folio(bvec->bv_page); - struct bch_folio *s = __bch2_folio(folio); + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); + folio_end_writeback(fi.folio); } bio_put(&io->op.wbio.bio); -- cgit From e8d28c3e47d25f6a9bf83a6548395078c851d532 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 19:06:42 -0400 Subject: bcachefs: bch2_seek_pagecache_hole() folio conversion This converts bch2_seek_pagecache_hole() to handle large folios. 
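The conversion replaces __page_hole_offset()/page_hole_offset() with a single folio_hole_offset() helper that locks the folio covering *offset, scans its per-sector bch_folio state, and reports back through its return value and *offset. A minimal caller sketch follows; folio_hole_offset() and folio_end_pos() are names from this patch, while the wrapper name and everything else are illustrative assumptions rather than code from this series:

	/*
	 * Illustrative only: walk the pagecache one folio at a time.
	 * folio_hole_offset() returns true when it finds a hole, including
	 * when nothing is cached or no per-sector state exists at *offset,
	 * leaving *offset pointing at the hole; it returns false when the
	 * folio is entirely data, after advancing *offset to folio_end_pos(),
	 * so the loop simply continues with the next folio.
	 */
	static loff_t seek_pagecache_hole_sketch(struct address_space *mapping,
						 loff_t offset, loff_t end_offset)
	{
		while (offset < end_offset &&
		       !folio_hole_offset(mapping, &offset))
			;

		/* Never report a hole beyond the requested range: */
		return min(offset, end_offset);
	}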
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 60 +++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 018ada1a0136..0cb76238f487 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3592,37 +3592,34 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int __folio_hole_offset(struct folio *folio, unsigned offset) +static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) { - struct bch_folio *s = bch2_folio(folio); - unsigned i; - - if (!s) - return 0; + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors, f_offset; + bool ret = true; - for (i = offset >> 9; i < PAGE_SECTORS; i++) - if (s->s[i].state < SECTOR_DIRTY) - return i << 9; + folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); + if (!folio) + return true; - return -1; -} + s = bch2_folio(folio); + if (!s) + goto unlock; -static loff_t folio_hole_offset(struct address_space *mapping, loff_t offset) -{ - pgoff_t index = offset >> PAGE_SHIFT; - struct folio *folio; - int folio_offset; - loff_t ret = -1; + sectors = folio_sectors(folio); + f_offset = *offset - folio_pos(folio); - folio = filemap_lock_folio(mapping, index); - if (!folio) - return offset; + for (i = f_offset >> 9; i < sectors; i++) + if (s->s[i].state < SECTOR_DIRTY) { + *offset = max(*offset, folio_pos(folio) + (i << 9)); + goto unlock; + } - folio_offset = __folio_hole_offset(folio, offset & (folio_size(folio) - 1)); - if (folio_offset >= 0) - ret = folio_pos(folio) + folio_offset; + *offset = folio_end_pos(folio); + ret = false; +unlock: folio_unlock(folio); - return ret; } @@ -3631,18 +3628,13 @@ static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset, hole; + loff_t offset = start_offset; - while (offset < end_offset) { - hole = folio_hole_offset(mapping, offset); - if (hole >= 0 && hole <= end_offset) - return max(start_offset, hole); + while (offset < end_offset && + !folio_hole_offset(mapping, &offset)) + ; - offset += PAGE_SIZE; - offset &= PAGE_MASK; - } - - return end_offset; + return min(offset, end_offset); } static loff_t bch2_seek_hole(struct file *file, u64 offset) -- cgit From a86a92cb0d7837b2e23f6b503242810db5ec93c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 19:07:28 -0400 Subject: bcachefs: bch2_seek_pagecache_data() folio conversion This converts bch2_seek_pagecache_data() to handle large folios. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0cb76238f487..de4e5effca06 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3488,10 +3488,10 @@ err: static int folio_data_offset(struct folio *folio, unsigned offset) { struct bch_folio *s = bch2_folio(folio); - unsigned i; + unsigned i, sectors = folio_sectors(folio); if (s) - for (i = offset >> 9; i < PAGE_SECTORS; i++) + for (i = offset >> 9; i < sectors; i++) if (s->s[i].state >= SECTOR_DIRTY) return i << 9; @@ -3519,12 +3519,10 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, folio_lock(folio); offset = folio_data_offset(folio, - folio->index == start_index - ? 
start_offset & (PAGE_SIZE - 1) - : 0); + max(folio_pos(folio), start_offset) - + folio_pos(folio)); if (offset >= 0) { - ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + - offset, + ret = clamp(folio_pos(folio) + offset, start_offset, end_offset); folio_unlock(folio); folio_batch_release(&fbatch); -- cgit From 33e2eb9677054f6e79fa188788f3027152cca8ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 15:37:34 -0400 Subject: bcachefs: More assorted large folio conversion Various misc small conversions in fs-io.c for large folios. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 59 ++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index de4e5effca06..eaee546c0fb9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -443,10 +443,10 @@ static void __bch2_folio_set(struct folio *folio, unsigned nr_ptrs, unsigned state) { struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL); - unsigned i; + unsigned i, sectors = folio_sectors(folio); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); spin_lock(&s->lock); @@ -455,7 +455,7 @@ static void __bch2_folio_set(struct folio *folio, s->s[i].state = state; } - if (i == PAGE_SECTORS) + if (i == sectors) s->uptodate = true; spin_unlock(&s->lock); @@ -552,15 +552,13 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; - u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); unsigned folio_offset = max(start, folio_start) - folio_start; unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; struct bch_folio *s; BUG_ON(end <= folio_start); - BUG_ON(folio_offset >= PAGE_SECTORS); - BUG_ON(folio_offset + folio_len > PAGE_SECTORS); folio_lock(folio); s = bch2_folio(folio); @@ -598,15 +596,13 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; - u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); unsigned folio_offset = max(start, folio_start) - folio_start; unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; struct bch_folio *s; BUG_ON(end <= folio_start); - BUG_ON(folio_offset >= PAGE_SECTORS); - BUG_ON(folio_offset + folio_len > PAGE_SECTORS); folio_lock(folio); s = bch2_folio(folio); @@ -660,13 +656,13 @@ static int bch2_get_folio_disk_reservation(struct bch_fs *c, struct bch_folio *s = bch2_folio_create(folio, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res = { 0 }; - unsigned i, disk_res_sectors = 0; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; int ret; if (!s) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); if (!disk_res_sectors) @@ -680,7 +676,7 @@ static int bch2_get_folio_disk_reservation(struct bch_fs *c, if (unlikely(ret)) 
return ret; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) s->s[i].replicas_reserved += sectors_to_reserve(&s->s[i], nr_replicas); @@ -761,7 +757,7 @@ static void bch2_clear_folio_bits(struct folio *folio) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_folio *s = bch2_folio(folio); struct disk_reservation disk_res = { 0 }; - int i, dirty_sectors = 0; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; if (!s) return; @@ -769,7 +765,7 @@ static void bch2_clear_folio_bits(struct folio *folio) EBUG_ON(!folio_test_locked(folio)); EBUG_ON(folio_test_writeback(folio)); - for (i = 0; i < ARRAY_SIZE(s->s); i++) { + for (i = 0; i < sectors; i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; @@ -915,7 +911,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) goto out; } - len = min_t(loff_t, PAGE_SIZE, isize - folio_pos(folio)); + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) { @@ -1429,18 +1425,16 @@ static int __bch2_writepage(struct folio *folio, struct bch_folio *s, orig; unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); - pgoff_t end_index = i_size >> PAGE_SHIFT; int ret; EBUG_ON(!folio_test_uptodate(folio)); /* Is the folio fully inside i_size? */ - if (folio->index < end_index) + if (folio_end_pos(folio) <= i_size) goto do_io; /* Is the folio fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE - 1); - if (folio->index > end_index || !offset) { + if (folio_pos(folio) >= i_size) { folio_unlock(folio); return 0; } @@ -1452,7 +1446,9 @@ static int __bch2_writepage(struct folio *folio, * the folio size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ - folio_zero_segment(folio, offset, folio_size(folio)); + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); do_io: f_sectors = folio_sectors(folio); s = bch2_folio_create(folio, __GFP_NOFAIL); @@ -1521,7 +1517,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + bio_full(&w->io->op.wbio.bio, sectors << 9) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1584,9 +1580,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); struct folio *folio; + unsigned offset; int ret = -ENOMEM; res = kmalloc(sizeof(*res), GFP_KERNEL); @@ -1598,7 +1593,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); - folio = __filemap_get_folio(mapping, index, + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, mapping_gfp_mask(mapping)); if (!folio) @@ -1607,8 +1602,11 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, if (folio_test_uptodate(folio)) goto out; + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + /* If we're writing entire folio, don't need to read it in first: */ - if (len == folio_size(folio)) + if (!offset && len == folio_size(folio)) goto out; if (!offset && pos + len >= inode->v.i_size) { @@ -1617,7 +1615,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, goto out; } - if (index > inode->v.i_size >> PAGE_SHIFT) { + if (folio_pos(folio) >= inode->v.i_size) { folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); flush_dcache_folio(folio); goto out; @@ -1669,9 +1667,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; struct folio *folio = page_folio(page); - unsigned offset = pos & (PAGE_SIZE - 1); + unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* -- cgit From 49fe78ff33de4319aff7085aadda0e2c699ffd97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 19:46:25 -0400 Subject: bcachefs: bch_folio can now handle multi-order folios Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index eaee546c0fb9..ad0b0c136e65 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -381,7 +381,7 @@ struct bch_folio { * (Not the data itself) */ bool uptodate; - struct bch_folio_sector s[PAGE_SECTORS]; + struct bch_folio_sector s[]; }; static inline struct bch_folio *__bch2_folio(struct folio *folio) @@ -415,7 +415,9 @@ static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) { struct bch_folio *s; - s = kzalloc(sizeof(*s), GFP_NOFS|gfp); + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), GFP_NOFS|gfp); if (!s) return NULL; @@ -1295,6 +1297,8 @@ int bch2_read_folio(struct file *file, struct folio 
*folio) struct bch_writepage_state { struct bch_writepage_io *io; struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; }; static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, @@ -1422,7 +1426,7 @@ static int __bch2_writepage(struct folio *folio, struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_folio *s, orig; + struct bch_folio *s; unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); int ret; @@ -1453,6 +1457,13 @@ do_io: f_sectors = folio_sectors(folio); s = bch2_folio_create(folio, __GFP_NOFAIL); + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kzalloc(sizeof(struct bch_folio_sector) * + f_sectors, __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + /* * Things get really hairy with errors during writeback: */ @@ -1461,7 +1472,7 @@ do_io: /* Before unlocking the page, get copy of reservations: */ spin_lock(&s->lock); - orig = *s; + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); spin_unlock(&s->lock); for (i = 0; i < f_sectors; i++) { @@ -1499,16 +1510,16 @@ do_io: u64 sector; while (offset < f_sectors && - orig.s[offset].state < SECTOR_DIRTY) + w->tmp[offset].state < SECTOR_DIRTY) offset++; if (offset == f_sectors) break; while (offset + sectors < f_sectors && - orig.s[offset + sectors].state >= SECTOR_DIRTY) { - reserved_sectors += orig.s[offset + sectors].replicas_reserved; - dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; + w->tmp[offset + sectors].state >= SECTOR_DIRTY) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_DIRTY; sectors++; } BUG_ON(!sectors); @@ -1568,6 +1579,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); + kfree(w.tmp); return bch2_err_class(ret); } -- cgit From c42b57c451abe522f9eb499b2c421fa8c56eb8bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Mar 2023 21:37:43 -0400 Subject: bcachefs: bch2_buffered_write large folio conversion Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 5 ++ fs/bcachefs/fs-io.c | 189 +++++++++++++++++++++++++++++---------------------- 2 files changed, 114 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 978ab7961f1b..d4485fa01b2a 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -54,6 +54,11 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, #define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) +#define darray_pop(_d) ((_d)->data[--(_d)->nr]) + +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + #define darray_insert_item(_d, _pos, _item) \ ({ \ size_t pos = (_pos); \ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ad0b0c136e65..14abb7aad18f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1719,7 +1719,17 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } -#define WRITE_BATCH_PAGES 32 +typedef DARRAY(struct folio *) folios; + +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} static int 
__bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, @@ -1727,64 +1737,73 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct folio *folios[WRITE_BATCH_PAGES]; struct bch2_folio_reservation res; - unsigned long index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned nr_folios = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, reserved = 0, set_dirty = 0; - unsigned copied = 0, nr_folios_copied = 0; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + loff_t end = pos + len, f_pos; int ret = 0; BUG_ON(!len); - BUG_ON(nr_folios > ARRAY_SIZE(folios)); bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); - for (i = 0; i < nr_folios; i++) { - folios[i] = __filemap_get_folio(mapping, index + i, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (!folios[i]) { - nr_folios = i; - if (!i) { - ret = -ENOMEM; - goto out; - } - len = min_t(unsigned, len, - nr_folios * PAGE_SIZE - offset); + f_pos = pos; + while (f_pos < end) { + unsigned fgp_flags = FGP_LOCK|FGP_WRITE|FGP_STABLE; + + if ((u64) f_pos < (u64) pos + (1U << 20)) + fgp_flags |= FGP_CREAT; + + if (darray_make_room_gfp(&folios, 1, + mapping_gfp_mask(mapping) & GFP_KERNEL)) break; - } + + f = __filemap_get_folio(mapping, f_pos >> PAGE_SHIFT, + fgp_flags, mapping_gfp_mask(mapping)); + if (!f) + break; + + BUG_ON(folios.nr && folio_pos(f) != f_pos); + + f_pos = folio_end_pos(f); + darray_push(&folios, f); + } + + end = min(end, f_pos); + if (end == pos) { + ret = -ENOMEM; + goto out; } - if (offset && !folio_test_uptodate(folios[0])) { - ret = bch2_read_single_folio(folios[0], mapping); + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } - if ((pos + len) & (PAGE_SIZE - 1) && - !folio_test_uptodate(folios[nr_folios - 1])) { - if ((index + nr_folios - 1) << PAGE_SHIFT >= inode->v.i_size) { - folio_zero_range(folios[nr_folios - 1], 0, - folio_size(folios[nr_folios - 1])); + f = darray_last(folios); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); } else { - ret = bch2_read_single_folio(folios[nr_folios - 1], mapping); + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } } - while (reserved < len) { - unsigned i = (offset + reserved) >> PAGE_SHIFT; - struct folio *folio = folios[i]; - unsigned folio_offset = (offset + reserved) & (PAGE_SIZE - 1); - unsigned folio_len = min_t(unsigned, len - reserved, - PAGE_SIZE - folio_offset); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; - if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { - ret = bch2_folio_set(c, inode_inum(inode), - folios + i, nr_folios - i); + if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), fi, + folios.data + folios.nr - fi); if (ret) goto out; } @@ -1797,78 +1816,89 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, * we aren't completely out of disk space - we don't do that * yet: */ - ret = bch2_folio_reservation_get(c, inode, folio, &res, - folio_offset, folio_len); + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); if (unlikely(ret)) { - if (!reserved) + 
folios_trunc(&folios, fi); + if (!folios.nr) goto out; + + end = min(end, folio_end_pos(darray_last(folios))); break; } - reserved += folio_len; + f_pos = folio_end_pos(f); + f_offset = 0; } if (mapping_writably_mapped(mapping)) - for (i = 0; i < nr_folios; i++) - flush_dcache_folio(folios[i]); - - while (copied < reserved) { - struct folio *folio = folios[(offset + copied) >> PAGE_SHIFT]; - unsigned folio_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned folio_len = min_t(unsigned, reserved - copied, - PAGE_SIZE - folio_offset); - unsigned folio_copied = copy_page_from_iter_atomic(&folio->page, - folio_offset, folio_len, iter); - - if (!folio_copied) + darray_for_each(folios, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); break; + } - if (!folio_test_uptodate(folio) && - folio_copied != PAGE_SIZE && - pos + copied + folio_copied < inode->v.i_size) { - folio_zero_range(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); break; } - flush_dcache_folio(folio); - copied += folio_copied; + flush_dcache_folio(f); + copied += f_copied; - if (folio_copied != folio_len) + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; } if (!copied) goto out; + end = pos + copied; + spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); spin_unlock(&inode->v.i_lock); - while (set_dirty < copied) { - struct folio *folio = folios[(offset + set_dirty) >> PAGE_SHIFT]; - unsigned folio_offset = (offset + set_dirty) & (PAGE_SIZE - 1); - unsigned folio_len = min_t(unsigned, copied - set_dirty, - PAGE_SIZE - folio_offset); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); - bch2_set_folio_dirty(c, inode, folio, &res, folio_offset, folio_len); - folio_unlock(folio); - folio_put(folio); + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - set_dirty += folio_len; + f_pos = folio_end_pos(f); + f_offset = 0; } - nr_folios_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); inode->ei_last_dirtied = (unsigned long) current; out: - for (i = nr_folios_copied; i < nr_folios; i++) { - folio_unlock(folios[i]); - folio_put(folios[i]); + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); } + darray_exit(&folios); bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; @@ -1887,8 +1917,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) do { unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE * WRITE_BATCH_PAGES - offset); + unsigned bytes = iov_iter_count(iter); again: /* * Bring in the user page that we will copy from _first_. 
-- cgit From 959f7368d60c89513ce44184bdfcb7304fea17ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 18:03:22 -0400 Subject: bcachefs: bch2_truncate_page() large folio conversion Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 61 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 14abb7aad18f..4addc0b77136 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2696,8 +2696,8 @@ err: return ret; } -static int __bch2_truncate_page(struct bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) +static int __bch2_truncate_folio(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; @@ -2709,15 +2709,6 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, s64 i_sectors_delta = 0; int ret = 0; - /* Page boundary? Nothing to do */ - if (!((index == start >> PAGE_SHIFT && start_offset) || - (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) - return 0; - - /* Above i_size? */ - if (index << PAGE_SHIFT >= inode->v.i_size) - return 0; - folio = filemap_lock_folio(mapping, index); if (!folio) { /* @@ -2738,6 +2729,19 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, } } + BUG_ON(start >= folio_end_pos(folio)); + BUG_ON(end <= folio_pos(folio)); + + start_offset = max(start, folio_pos(folio)) - folio_pos(folio); + end_offset = min(end, folio_end_pos(folio)) - folio_pos(folio); + + /* Folio boundary? Nothing to do */ + if (start_offset == 0 && + end_offset == folio_size(folio)) { + ret = 0; + goto unlock; + } + s = bch2_folio_create(folio, 0); if (!s) { ret = -ENOMEM; @@ -2752,11 +2756,6 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, BUG_ON(!s->uptodate); - if (index != start >> PAGE_SHIFT) - start_offset = 0; - if (index != end >> PAGE_SHIFT) - end_offset = PAGE_SIZE; - for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; i++) { @@ -2773,8 +2772,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * writeback - doing an i_size update if necessary - or whether it will * be responsible for the i_size update: */ - ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), - PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + ret = s->s[(min(inode->v.i_size, folio_end_pos(folio)) - + folio_pos(folio) - 1) >> 9].state >= SECTOR_DIRTY; folio_zero_segment(folio, start_offset, end_offset); @@ -2800,23 +2799,23 @@ out: return ret; } -static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) +static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) { - return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, round_up(from, PAGE_SIZE)); + return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, + from, ANYSINT_MAX(loff_t)); } -static int bch2_truncate_pages(struct bch_inode_info *inode, - loff_t start, loff_t end) +static int bch2_truncate_folios(struct bch_inode_info *inode, + loff_t start, loff_t end) { - int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, - start, end); + int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, + start, end); if (ret >= 0 && start >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - start, end); + ret = __bch2_truncate_folio(inode, + (end - 1) >> PAGE_SHIFT, + start, end); return 
ret; } @@ -2911,7 +2910,7 @@ int bch2_truncate(struct mnt_idmap *idmap, iattr->ia_valid &= ~ATTR_SIZE; - ret = bch2_truncate_page(inode, iattr->ia_size); + ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; @@ -2989,7 +2988,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len bool truncated_last_page; int ret = 0; - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) goto err; @@ -3310,7 +3309,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) return ret; -- cgit From a1774a05641cebd4c42c4b5e14ca20319b32711d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Mar 2023 11:08:04 -0400 Subject: bcachefs: bch_folio_sector_state improvements - X-macro-ize the bch_folio_sector_state enum: this means we can easily generate strings, which is helpful for debugging. - Add helpers for state transitions: folio_sector_dirty(), folio_sector_undirty(), folio_sector_reserve() - Add folio_sector_set(), a single helper for changing folio sector state just so that we have a single place to instrument when we're debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 153 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4addc0b77136..02262519aae9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -356,6 +356,65 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ unsigned nr_replicas:4; @@ -364,13 +423,7 @@ struct bch_folio_sector { unsigned replicas_reserved:4; /* i_sectors: */ - enum { - SECTOR_UNALLOCATED, - SECTOR_RESERVED, - SECTOR_DIRTY, - SECTOR_DIRTY_RESERVED, - SECTOR_ALLOCATED, - } state:8; + enum bch_folio_sector_state state:8; }; struct bch_folio { @@ -384,6 +437,13 @@ struct bch_folio { struct bch_folio_sector s[]; }; +static inline void folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + static inline 
struct bch_folio *__bch2_folio(struct folio *folio) { return folio_has_private(folio) @@ -434,10 +494,10 @@ static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) static unsigned bkey_to_sector_state(struct bkey_s_c k) { if (bkey_extent_is_reservation(k)) - return SECTOR_RESERVED; + return SECTOR_reserved; if (bkey_extent_is_allocation(k.k)) - return SECTOR_ALLOCATED; - return SECTOR_UNALLOCATED; + return SECTOR_allocated; + return SECTOR_unallocated; } static void __bch2_folio_set(struct folio *folio, @@ -454,7 +514,7 @@ static void __bch2_folio_set(struct folio *folio, for (i = pg_offset; i < pg_offset + pg_len; i++) { s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; + folio_sector_set(folio, s, i, state); } if (i == sectors) @@ -611,18 +671,10 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, if (s) { spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - switch (s->s[j].state) { - case SECTOR_UNALLOCATED: - s->s[j].state = SECTOR_RESERVED; - break; - case SECTOR_DIRTY: - s->s[j].state = SECTOR_DIRTY_RESERVED; - i_sectors_delta--; - break; - default: - break; - } + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); + } spin_unlock(&s->lock); } @@ -727,7 +779,7 @@ static int bch2_folio_reservation_get(struct bch_fs *c, i++) { disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; + quota_sectors += s->s[i].state == SECTOR_unallocated; } if (disk_sectors) { @@ -771,17 +823,8 @@ static void bch2_clear_folio_bits(struct folio *folio) disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - switch (s->s[i].state) { - case SECTOR_DIRTY: - s->s[i].state = SECTOR_UNALLOCATED; - --dirty_sectors; - break; - case SECTOR_DIRTY_RESERVED: - s->s[i].state = SECTOR_RESERVED; - break; - default: - break; - } + dirty_sectors -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); } bch2_disk_reservation_put(c, &disk_res); @@ -820,17 +863,9 @@ static void bch2_set_folio_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - switch (s->s[i].state) { - case SECTOR_UNALLOCATED: - s->s[i].state = SECTOR_DIRTY; - dirty_sectors++; - break; - case SECTOR_RESERVED: - s->s[i].state = SECTOR_DIRTY_RESERVED; - break; - default: - break; - } + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); } spin_unlock(&s->lock); @@ -1473,10 +1508,9 @@ do_io: /* Before unlocking the page, get copy of reservations: */ spin_lock(&s->lock); memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - spin_unlock(&s->lock); for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_DIRTY) + if (s->s[i].state < SECTOR_dirty) continue; nr_replicas_this_write = @@ -1486,15 +1520,16 @@ do_io: } for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_DIRTY) + if (s->s[i].state < SECTOR_dirty) continue; s->s[i].nr_replicas = w->opts.compression ? 
0 : nr_replicas_this_write; s->s[i].replicas_reserved = 0; - s->s[i].state = SECTOR_ALLOCATED; + folio_sector_set(folio, s, i, SECTOR_allocated); } + spin_unlock(&s->lock); BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); @@ -1510,16 +1545,16 @@ do_io: u64 sector; while (offset < f_sectors && - w->tmp[offset].state < SECTOR_DIRTY) + w->tmp[offset].state < SECTOR_dirty) offset++; if (offset == f_sectors) break; while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_DIRTY) { + w->tmp[offset + sectors].state >= SECTOR_dirty) { reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_DIRTY; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; sectors++; } BUG_ON(!sectors); @@ -2760,9 +2795,9 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; - if (s->s[i].state == SECTOR_DIRTY) - i_sectors_delta--; - s->s[i].state = SECTOR_UNALLOCATED; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, SECTOR_unallocated); } i_sectors_acct(c, inode, NULL, i_sectors_delta); @@ -2773,7 +2808,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, * be responsible for the i_size update: */ ret = s->s[(min(inode->v.i_size, folio_end_pos(folio)) - - folio_pos(folio) - 1) >> 9].state >= SECTOR_DIRTY; + folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; folio_zero_segment(folio, start_offset, end_offset); @@ -3531,7 +3566,7 @@ static int folio_data_offset(struct folio *folio, unsigned offset) if (s) for (i = offset >> 9; i < sectors; i++) - if (s->s[i].state >= SECTOR_DIRTY) + if (s->s[i].state >= SECTOR_dirty) return i << 9; return -1; @@ -3648,7 +3683,7 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) f_offset = *offset - folio_pos(folio); for (i = f_offset >> 9; i < sectors; i++) - if (s->s[i].state < SECTOR_DIRTY) { + if (s->s[i].state < SECTOR_dirty) { *offset = max(*offset, folio_pos(folio) + (i << 9)); goto unlock; } -- cgit From 40022c0115d29da11da262ced6ca8d1d5426a8b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Mar 2023 12:51:47 -0400 Subject: bcachefs: filemap_get_contig_folios_d() Add a new helper for getting a range of contiguous folios and returning them in a darray. 
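A rough caller-side sketch of the calling convention - a fragment, not part of the patch, with illustrative variable names; it mirrors how the buffered-write conversion later in this series uses the helper:

	folios folios;
	struct folio **fi;
	int ret;

	darray_init(&folios);

	/* grab every folio covering [pos, end), creating and locking as needed */
	ret = filemap_get_contig_folios_d(mapping, pos, end,
					  FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
					  mapping_gfp_mask(mapping), &folios);
	if (ret)
		goto out;

	/* the run may stop short of 'end': clamp to the folios actually held */
	end = min(end, folio_end_pos(darray_last(folios)));

	darray_for_each(folios, fi) {
		/* operate on *fi, which is locked and referenced */
	}
out:
	darray_for_each(folios, fi) {
		folio_unlock(*fi);
		folio_put(*fi);
	}
	darray_exit(&folios);

Because the helper stops at the first gap or allocation failure, callers are expected to clamp their working range to what was actually obtained rather than assume the whole request was satisfied.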
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 70 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 02262519aae9..6584a64e5631 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -55,6 +55,41 @@ static inline loff_t folio_end_sector(struct folio *folio) return folio_end_pos(folio) >> 9; } +typedef DARRAY(struct folio *) folios; + +static int filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, loff_t end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + loff_t pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (!f) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 0 : ret; +} + struct nocow_flush { struct closure *cl; struct bch_dev *ca; @@ -1754,8 +1789,6 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } -typedef DARRAY(struct folio *) folios; - static noinline void folios_trunc(folios *folios, struct folio **fi) { while (folios->data + folios->nr > fi) { @@ -1784,33 +1817,16 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, bch2_folio_reservation_init(c, inode, &res); darray_init(&folios); - f_pos = pos; - while (f_pos < end) { - unsigned fgp_flags = FGP_LOCK|FGP_WRITE|FGP_STABLE; - - if ((u64) f_pos < (u64) pos + (1U << 20)) - fgp_flags |= FGP_CREAT; - - if (darray_make_room_gfp(&folios, 1, - mapping_gfp_mask(mapping) & GFP_KERNEL)) - break; - - f = __filemap_get_folio(mapping, f_pos >> PAGE_SHIFT, - fgp_flags, mapping_gfp_mask(mapping)); - if (!f) - break; - - BUG_ON(folios.nr && folio_pos(f) != f_pos); + ret = filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; - f_pos = folio_end_pos(f); - darray_push(&folios, f); - } + BUG_ON(!folios.nr); - end = min(end, f_pos); - if (end == pos) { - ret = -ENOMEM; - goto out; - } + end = min(end, folio_end_pos(darray_last(folios))); f = darray_first(folios); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { -- cgit From 9567413c82d9dbad24ff6edd0dd160da8b6d9d8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Mar 2023 19:24:44 -0400 Subject: bcachefs: bch2_readahead() large folio conversion Readahead now uses the new filemap_get_contig_folios_d() helper. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 118 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 83 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 6584a64e5631..af5f21a7a6d0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,49 @@ #include +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + static inline loff_t folio_end_pos(struct folio *folio) { return folio_pos(folio) + folio_size(folio); @@ -622,14 +665,16 @@ err: static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; - struct bio_vec bv; + struct folio_vec fv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); - bio_for_each_segment(bv, bio, iter) - __bch2_folio_set(page_folio(bv.bv_page), bv.bv_offset >> 9, - bv.bv_len >> 9, nr_ptrs, state); + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); } static void mark_pagecache_unallocated(struct bch_inode_info *inode, @@ -1049,44 +1094,48 @@ static void bch2_readpages_end_io(struct bio *bio) struct readpages_iter { struct address_space *mapping; - struct page **pages; - unsigned nr_pages; unsigned idx; - pgoff_t offset; + folios folios; }; static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - unsigned i, nr_pages = readahead_count(ractl); + struct folio **fi; + int ret; memset(iter, 0, sizeof(*iter)); - iter->mapping = ractl->mapping; - iter->offset = readahead_index(ractl); - iter->nr_pages = nr_pages; + iter->mapping = ractl->mapping; - iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!iter->pages) - return -ENOMEM; + ret = filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; - nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - __bch2_folio_create(page_folio(iter->pages[i]), __GFP_NOFAIL); - put_page(iter->pages[i]); + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL); + folio_put(*fi); + folio_put(*fi); } return 0; } -static inline struct folio *readpage_iter_next(struct readpages_iter *iter) +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) { - if (iter->idx >= iter->nr_pages) + if (iter->idx >= iter->folios.nr) return NULL; + return iter->folios.data[iter->idx]; +} - EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - - return page_folio(iter->pages[iter->idx]); +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; } static bool extent_partial_reads_expensive(struct bkey_s_c k) @@ -1108,16 +1157,14 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - struct folio *folio = readpage_iter_next(iter); + struct folio *folio = readpage_iter_peek(iter); int ret; if (folio) { - if (iter->offset + iter->idx != folio_offset) - break; - - iter->idx++; + readpage_iter_advance(iter); } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + if (!get_more) break; @@ -1144,6 +1191,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, folio_put(folio); } + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } } @@ -1275,10 +1324,9 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_get(inode); - while ((folio = readpage_iter_next(&readpages_iter))) { - pgoff_t index = readpages_iter.offset + readpages_iter.idx; + while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, - readpages_iter.nr_pages - + readpages_iter.folios.nr - readpages_iter.idx, BIO_MAX_VECS); struct bch_read_bio *rbio = @@ -1286,9 +1334,9 @@ void bch2_readahead(struct readahead_control *ractl) GFP_NOFS, &c->bio_read), 
opts); - readpages_iter.idx++; + readpage_iter_advance(&readpages_iter); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -1299,7 +1347,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); - kfree(readpages_iter.pages); + darray_exit(&readpages_iter.folios); } static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, -- cgit From 34fdcf0632a5e549c5bfed1847a6d6995606ec17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Mar 2023 16:55:27 -0400 Subject: bcachefs: Check for folios that don't have bch_folio attached With large folios, it's now incidentally possible to end up with a clean, uptodate folio in the page cache that doesn't have a bch_folio attached, if a folio has to be split. This patch fixes __bch2_truncate_folio() to check for this; other code paths appear to handle it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index af5f21a7a6d0..e805d8ce7bba 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -926,6 +926,8 @@ static void bch2_set_folio_dirty(struct bch_fs *c, WARN_ON((u64) folio_pos(folio) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); + BUG_ON(!s->uptodate); + spin_lock(&s->lock); for (i = round_down(offset, block_bytes(c)) >> 9; @@ -2853,7 +2855,11 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, goto unlock; } - BUG_ON(!s->uptodate); + if (!s->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; + } for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; -- cgit From 550a6a496d33034878172ed789e03feaee6cee43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Mar 2023 16:47:30 -0400 Subject: bcachefs: Enable large folios Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 58a89c36cf0e..99082820e30b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1369,6 +1369,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->v.i_op = &bch_special_inode_operations; break; } + + mapping_set_large_folios(inode->v.i_mapping); } static struct inode *bch2_alloc_inode(struct super_block *sb) -- cgit From 4ad6aa46e1c19c04e5542e8cca859d9aaa9e70b6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 29 Mar 2023 09:49:04 -0400 Subject: bcachefs: fix truncate overflow if folio is beyond EOF generic/083 occasionally reproduces a panic caused by an overflow when accessing the bch_folio_sector array of the folio being processed by __bch2_truncate_folio(). The immediate cause of the overflow is that the folio offset is beyond i_size, and therefore the sector index calculation underflows on subtraction of the folio offset. One cause of this is mainly observed on nocow mounts. When nocow is enabled, fallocate performs physical block allocation (as opposed to block reservation in cow mode), which range_has_data() then interprets as valid data that requires partial zeroing on truncate. 
Therefore, if a post-eof zero range request lands across post-eof preallocated blocks, __bch2_truncate_folio() may actually create a post-eof folio in order to perform zeroing. To avoid this problem, update range_has_data() to filter out unwritten blocks from folio creation and partial zeroing. Even though we should never create folios beyond EOF like this, the mere existence of such folios is not necessarily a fatal error. Fix up the truncate code to warn about this condition and not overflow the sector array and possibly crash the system. The addition of this warning without the corresponding unwritten extent fix has shown that various other fstests are able to reproduce this problem fairly frequently, but often in ways that doesn't necessarily result in a kernel panic or a change in user observable behavior, and therefore the problem goes undetected. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e805d8ce7bba..8477e1df3397 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2783,7 +2783,7 @@ retry: goto err; for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } @@ -2809,6 +2809,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; + loff_t end_pos; folio = filemap_lock_folio(mapping, index); if (!folio) { @@ -2875,10 +2876,18 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, /* * Caller needs to know whether this folio will be written out by * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update: + * be responsible for the i_size update. + * + * Note that we shouldn't ever see a folio beyond EOF, but check and + * warn if so. This has been observed by failure to clean up folios + * after a short write and there's still a chance reclaim will fix + * things up. */ - ret = s->s[(min(inode->v.i_size, folio_end_pos(folio)) - - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; + WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); + end_pos = folio_end_pos(folio); + if (inode->v.i_size > folio_pos(folio)) + end_pos = min(inode->v.i_size, end_pos); + ret = s->s[(end_pos - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; folio_zero_segment(folio, start_offset, end_offset); -- cgit From 335f7d4f22fd27ea86398a3617ce41ab3d478ae6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 29 Mar 2023 11:23:15 -0400 Subject: bcachefs: clean up post-eof folios on -ENOSPC The buffered write path batches folio creations in the file mapping based on the requested size of the write. Under low free space conditions, it is possible to add a bunch of folios to the mapping and then return a short write or -ENOSPC due to lack of space. If this occurs on an extending write, the file size is updated based on the amount of data successfully written to the file. If folios were added beyond the final i_size, they may hang around until reclaimed, truncated or encountered unexpectedly by another operation. 
For example, generic/083 reproduces a sequence of events where a short write leaves around one or more post-EOF folios on an inode, a subsequent zero range request extends beyond i_size and overlaps with an aforementioned folio, and __bch2_truncate_folio() happens across it and complains. Update __bch2_buffered_write() to keep track of the start offset of the last folio added to the mapping for a prospective write. After i_size is updated, check whether this offset starts beyond EOF. If so, truncate pagecache beyond the latest EOF to clean up any folios that don't reside at least partially within EOF upon completion of the write. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8477e1df3397..acb2135a3235 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1860,6 +1860,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, struct folio **fi, *f; unsigned copied = 0, f_offset; loff_t end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); @@ -1876,8 +1877,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!folios.nr); - end = min(end, folio_end_pos(darray_last(folios))); - f = darray_first(folios); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -1886,6 +1885,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { if (end >= inode->v.i_size) { folio_zero_range(f, 0, folio_size(f)); @@ -1999,6 +2000,14 @@ out: folio_put(*fi); } + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. + */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + darray_exit(&folios); bch2_folio_reservation_put(c, inode, &res); -- cgit From 6b9857b208d7566d8bfd332a543b1dca92202c2b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 29 Mar 2023 10:43:23 -0400 Subject: bcachefs: use u64 for folio end pos to avoid overflows Some of the folio_end_*() helpers are prone to overflow of signed 64-bit types because the mapping is only limited by the max value of loff_t and the associated helpers return the start offset of the next folio. Therefore, a folio_end_pos() of the max allowable folio in a mapping returns a value that overflows loff_t. This makes it hard to rely on such values when doing folio processing across a range of a file, as bcachefs attempts to do with the recent folio changes. For example, generic/564 causes problems in the buffered write path when testing writes at max boundary conditions. The current understanding is that the pagecache historically limited the mapping to one less page to avoid this problem and this was dropped with some of the folio conversions, but may be reinstated to properly address the problem. In the meantime, update the internal folio_end_*() helpers in bcachefs to return a u64, and all of the associated code to use or cast to u64 to avoid overflow problems. This allows generic/564 to pass and can be reverted back to using loff_t if at any point the pagecache subsystem can guarantee these boundary conditions will not overflow. 
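To make the boundary condition concrete, a standalone sketch in plain userspace C - not kernel code, and the 1 MiB folio size is purely illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* a 1 MiB folio placed as high in the mapping as alignment allows */
	int64_t fsize = 1LL << 20;
	int64_t fpos  = (INT64_MAX / fsize) * fsize;

	/* folio_end_pos() as pos + size: the sum no longer fits in a signed loff_t */
	if (fpos > INT64_MAX - fsize)
		printf("pos + size would overflow loff_t\n");

	/* the same end position is representable, and compares correctly, as u64 */
	uint64_t end = (uint64_t) fpos + (uint64_t) fsize;
	printf("end as u64: %llu\n", (unsigned long long) end);
	return 0;
}

Returning u64 keeps the end position representable; the cost is the min_t(u64, ...) casts visible in the diff below.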
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index acb2135a3235..7823141ea98b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -78,7 +78,13 @@ static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, #define bio_for_each_folio(bvl, bio, iter) \ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) -static inline loff_t folio_end_pos(struct folio *folio) +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) { return folio_pos(folio) + folio_size(folio); } @@ -93,7 +99,7 @@ static inline loff_t folio_sector(struct folio *folio) return folio_pos(folio) >> 9; } -static inline loff_t folio_end_sector(struct folio *folio) +static inline u64 folio_end_sector(struct folio *folio) { return folio_end_pos(folio) >> 9; } @@ -101,12 +107,12 @@ static inline loff_t folio_end_sector(struct folio *folio) typedef DARRAY(struct folio *) folios; static int filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, loff_t end, + loff_t start, u64 end, int fgp_flags, gfp_t gfp, folios *folios) { struct folio *f; - loff_t pos = start; + u64 pos = start; int ret = 0; while (pos < end) { @@ -1859,7 +1865,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, folios folios; struct folio **fi, *f; unsigned copied = 0, f_offset; - loff_t end = pos + len, f_pos; + u64 end = pos + len, f_pos; loff_t last_folio_pos = inode->v.i_size; int ret = 0; @@ -1901,7 +1907,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_offset = pos - folio_pos(darray_first(folios)); darray_for_each(folios, fi) { struct folio *f = *fi; - unsigned f_len = min(end, folio_end_pos(f)) - f_pos; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) { ret = bch2_folio_set(c, inode_inum(inode), fi, @@ -1940,7 +1946,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_offset = pos - folio_pos(darray_first(folios)); darray_for_each(folios, fi) { struct folio *f = *fi; - unsigned f_len = min(end, folio_end_pos(f)) - f_pos; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); if (!f_copied) { @@ -1982,7 +1988,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_offset = pos - folio_pos(darray_first(folios)); darray_for_each(folios, fi) { struct folio *f = *fi; - unsigned f_len = min(end, folio_end_pos(f)) - f_pos; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; if (!folio_test_uptodate(f)) folio_mark_uptodate(f); @@ -2818,7 +2824,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; - loff_t end_pos; + u64 end_pos; folio = filemap_lock_folio(mapping, index); if (!folio) { @@ -2844,7 +2850,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, BUG_ON(end <= folio_pos(folio)); start_offset = max(start, folio_pos(folio)) - folio_pos(folio); - end_offset = min(end, folio_end_pos(folio)) - folio_pos(folio); + end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); /* Folio 
boundary? Nothing to do */ if (start_offset == 0 && @@ -2895,7 +2901,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); end_pos = folio_end_pos(folio); if (inode->v.i_size > folio_pos(folio)) - end_pos = min(inode->v.i_size, end_pos); + end_pos = min_t(u64, inode->v.i_size, end_pos); ret = s->s[(end_pos - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; folio_zero_segment(folio, start_offset, end_offset); -- cgit From 853b7393c20d5e129f2b16719102a05bbb5dc36f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Apr 2023 14:26:14 -0400 Subject: bcachefs: Allow answering y or n to all fsck errors of given type This changes the ask_yn() function used by fsck to accept Y or N, meaning yes or no for all errors of a given type. With this, the user can be prompted only for distinct error types - useful when a filesystem has lots of errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/error.h | 1 + 2 files changed, 60 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 1dae649ff0e2..aa640284ed19 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -65,10 +65,51 @@ void bch2_io_error(struct bch_dev *ca) //queue_work(system_long_wq, &ca->io_error_work); } +enum ask_yn { + YN_NO, + YN_YES, + YN_ALLNO, + YN_ALLYES, +}; + #ifdef __KERNEL__ -#define ask_yn() false +#define bch2_fsck_ask_yn() YN_NO #else + #include "tools-util.h" + +enum ask_yn bch2_fsck_ask_yn(void) +{ + char *buf = NULL; + size_t buflen = 0; + bool ret; + + while (true) { + fputs(" (y,n,Y,N) ", stdout); + fflush(stdout); + + if (getline(&buf, &buflen, stdin) < 0) + die("error reading from standard input"); + + if (strlen(buf) != 1) + continue; + + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + } + + free(buf); + return ret; +} + #endif static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) @@ -161,14 +202,28 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) prt_str(out, ", exiting"); ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - if (c->opts.fix_errors == FSCK_OPT_ASK) { + int fix = s && s->fix + ? s->fix + : c->opts.fix_errors; + + if (fix == FSCK_OPT_ASK) { + int ask; + prt_str(out, ": fix?"); bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ret = ask_yn() + + ask = bch2_fsck_ask_yn(); + + if (ask >= YN_ALLNO && s) + s->fix = ask == YN_ALLNO + ? FSCK_OPT_NO + : FSCK_OPT_YES; + + ret = ask & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - } else if (c->opts.fix_errors == FSCK_OPT_YES || + } else if (fix == FSCK_OPT_YES || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", fixing"); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 91c7e4ee8f72..edf12443822c 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -104,6 +104,7 @@ struct fsck_err_state { u64 nr; bool ratelimited; int ret; + int fix; char *last_msg; }; -- cgit From 615fccada50247abbc61c6c0a0d9c717b3fb6290 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Apr 2023 07:10:46 -0400 Subject: bcachefs: Fix a slab-out-of-bounds In __bch2_alloc_to_v4_mut(), we overrun the buffer we allocate if the alloc key had backpointers stored in it (which we no longer support). Fix this with a max() call. 
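A standalone sketch of the bug class - illustrative types and sizes, not the bcachefs API:

#include <stdlib.h>
#include <string.h>

/*
 * Copying a variable-length source key into a buffer sized only for the fixed
 * destination type overruns whenever the source is larger, e.g. an old alloc
 * key that still carries backpointers.
 */
struct fixed_v4 { char bytes[48]; };

void *convert_key(const void *src, size_t src_bytes)
{
	/* buggy sizing would be malloc(sizeof(struct fixed_v4)): overruns if src_bytes > 48 */
	/* correct sizing is whichever is larger, which is what the fix's max() does */
	size_t n = src_bytes > sizeof(struct fixed_v4) ? src_bytes : sizeof(struct fixed_v4);
	void *dst = malloc(n);

	if (dst)
		memcpy(dst, src, src_bytes);
	return dst;
}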
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 06032556d5c4..4032d1940884 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -479,7 +479,7 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; - ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4)); + ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); if (IS_ERR(ret)) return ret; -- cgit From e3dc75eb551599c356a9a3f8c00ae6396164457d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Apr 2023 21:49:12 -0400 Subject: bcachefs: Fix a null ptr deref in fsck check_extents() It turns out, in rare situations we need to be passing in a disk reservation, which will be used internally by the transaction commit path when needed. Pass one in... Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ed2523ac2249..6319f2f7b16f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "buckets.h" #include "darray.h" #include "dirent.h" #include "error.h" @@ -1407,6 +1408,7 @@ static int check_extents(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; extent_ends extent_ends = { 0 }; + struct disk_reservation res = { 0 }; int ret = 0; snapshots_seen_init(&s); @@ -1417,10 +1419,13 @@ static int check_extents(struct bch_fs *c) ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, k, &w, &s, &extent_ends)); + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent(&trans, &iter, k, &w, &s, &extent_ends); + })); + bch2_disk_reservation_put(c, &res); extent_ends_reset(&extent_ends); darray_exit(&extent_ends); inode_walker_exit(&w); -- cgit From bf98ee10d45af8e97c0802e39cc77ee607072633 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 3 Apr 2023 08:17:26 -0400 Subject: bcachefs: folio pos to bch_folio_sector index helper Create a small helper to translate from file offset to the associated bch_folio_sector index in the underlying bch_folio. The helper assumes the file offset is covered by the passed folio. 
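A standalone model of the translation with illustrative numbers - plain C, not the bcachefs helper itself:

#include <assert.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

/* fpos and fsize stand in for folio_pos() and folio_size() */
static long long pos_to_sector_idx(long long fpos, long long fsize, long long pos)
{
	assert(pos >= fpos && pos < fpos + fsize);	/* mirrors the helper's BUG_ON */
	return (pos - fpos) >> SECTOR_SHIFT;
}

int main(void)
{
	/* a 16 KiB folio at file offset 16384: offset 20992 lands in sector index 9 */
	printf("%lld\n", pos_to_sector_idx(16384, 16384, 20992));
	return 0;
}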
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7823141ea98b..ea5039254609 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -528,6 +528,14 @@ static inline void folio_sector_set(struct folio *folio, s->s[i].state = n; } +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + static inline struct bch_folio *__bch2_folio(struct folio *folio) { return folio_has_private(folio) @@ -2902,7 +2910,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, end_pos = folio_end_pos(folio); if (inode->v.i_size > folio_pos(folio)) end_pos = min_t(u64, inode->v.i_size, end_pos); - ret = s->s[(end_pos - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; + ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; folio_zero_segment(folio, start_offset, end_offset); @@ -3653,15 +3661,15 @@ err: /* fseek: */ -static int folio_data_offset(struct folio *folio, unsigned offset) +static int folio_data_offset(struct folio *folio, loff_t pos) { struct bch_folio *s = bch2_folio(folio); unsigned i, sectors = folio_sectors(folio); if (s) - for (i = offset >> 9; i < sectors; i++) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) if (s->s[i].state >= SECTOR_dirty) - return i << 9; + return i << SECTOR_SHIFT; return -1; } @@ -3687,8 +3695,7 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, folio_lock(folio); offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset) - - folio_pos(folio)); + max(folio_pos(folio), start_offset)); if (offset >= 0) { ret = clamp(folio_pos(folio) + offset, start_offset, end_offset); @@ -3762,7 +3769,7 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) { struct folio *folio; struct bch_folio *s; - unsigned i, sectors, f_offset; + unsigned i, sectors; bool ret = true; folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); @@ -3774,11 +3781,10 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) goto unlock; sectors = folio_sectors(folio); - f_offset = *offset - folio_pos(folio); - - for (i = f_offset >> 9; i < sectors; i++) + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) if (s->s[i].state < SECTOR_dirty) { - *offset = max(*offset, folio_pos(folio) + (i << 9)); + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); goto unlock; } -- cgit From 251babb55d53d79bba9568d6516fd11128c34606 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 18 Apr 2023 13:05:47 -0400 Subject: bcachefs: fix NULL bch_dev deref when checking bucket_gens keys fsck removes bucket_gens keys for devices that do not exist in the volume (i.e., if the device was removed). In 'fsck -n' mode, the associated fsck_err_on() wrapper returns false to skip the key removal. This proceeds on to the rest of the function, which eventually segfaults on a NULL bch_dev because the device does not exist. Update bch2_check_bucket_gens_key() to skip out of the rest of the function when the associated device does not exist, regardless of running fsck in check or repair mode. 
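A standalone sketch of the bug class - illustrative names, no bcachefs API: a report-and-maybe-repair helper that returns false in check-only mode cannot double as the sole guard in front of code that needs the object whose absence was just reported.

#include <stdbool.h>
#include <stdio.h>

static bool report_and_maybe_repair(bool repair_mode, const char *msg)
{
	fprintf(stderr, "fsck: %s\n", msg);
	return repair_mode;			/* false in check-only (-n) mode */
}

void check_key(void *dev, bool repair_mode)	/* dev is NULL if the device was removed */
{
	if (!dev) {
		if (report_and_maybe_repair(repair_mode, "key for invalid device")) {
			/* repair mode: delete the offending key here */
		}
		return;				/* either way, skip the code that dereferences dev */
	}
	/* ... rest of the check, which uses dev ... */
}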
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4032d1940884..968b6103e979 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1362,17 +1362,21 @@ static int bch2_check_bucket_gens_key(struct btree_trans *trans, u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false; + bool need_update = false, dev_exists; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); + /* if no bch_dev, skip out whether we repair or not */ + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + } goto out; } -- cgit From 02d51bb9a7315eb569a160363058ca2cd140faa1 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Apr 2023 11:47:03 -0400 Subject: bcachefs: remove bucket_gens btree keys on device removal If a device has keys in the bucket_gens btree associated with its buckets and is removed from a bcachefs volume, fsck will complain about the presence of keys associated with an invalid device index. A repair removes the associated keys and restores correctness. Update bch2_dev_remove_alloc() to remove device related keys at device removal time to avoid the problem. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 265ffa9bfd4c..af6cc73d9356 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1431,6 +1431,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); -- cgit From 09ebfa61130edaa990c0f1865fe2fa536d67c313 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Apr 2023 03:33:45 -0400 Subject: bcachefs: Drop a redundant error message When we're already read-only, we don't need to print out errors from writing btree nodes. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0489d07a087f..decbbaace1ee 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1746,7 +1746,7 @@ static void btree_node_write_work(struct work_struct *work) struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; struct bch_extent_ptr *ptr; - int ret; + int ret = 0; btree_bounce_free(c, wbio->data_bytes, @@ -1776,7 +1776,8 @@ out: return; err: set_btree_node_noevict(b); - bch2_fs_fatal_error(c, "fatal error writing btree node"); + if (!bch2_err_matches(ret, EROFS)) + bch2_fs_fatal_error(c, "fatal error writing btree node"); goto out; } -- cgit From 5a21764db13877eb1166baf12d2782ebb38b196e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Apr 2023 15:24:07 -0400 Subject: bcachefs: Improve move path tracepoints Move path tracepoints now include the key being moved. Also, add new tracepoints for the start of move_extent, and evacuate_bucket. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 13 ++++++++++- fs/bcachefs/move.c | 43 ++++++++++++++++++++++++++++++++--- fs/bcachefs/trace.h | 57 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 91 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 7a5850679f16..cffb3c14d539 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -92,6 +92,17 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } +static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_finish_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); + } +} + static void trace_move_extent_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, @@ -342,7 +353,7 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish(&new->k); + trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 9c8af0872b29..498b3f9778de 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -25,6 +25,39 @@ #include #include +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_read_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_read(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_alloc_mem_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_alloc_mem_fail(c, buf.buf); + printbuf_exit(&buf); + } +} + static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) { mutex_lock(&c->data_progress_lock); @@ -269,6 +302,8 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int 
ret = -ENOMEM; + trace_move_extent2(c, k); + bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && @@ -346,8 +381,7 @@ static int bch2_move_extent(struct btree_trans *trans, this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read(k.k); - + trace_move_extent_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -373,7 +407,8 @@ err_free_pages: err_free: kfree(io); err: - trace_and_count(c, move_extent_alloc_mem_fail, k.k); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); + trace_move_extent_alloc_mem_fail2(c, k); return ret; } @@ -719,6 +754,8 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct bpos bp_pos = POS_MIN; int ret = 0; + trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&sk); /* diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index bbe8eb7a29eb..8027c2a14199 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -33,23 +33,18 @@ DECLARE_EVENT_CLASS(bpos, ); DECLARE_EVENT_CLASS(bkey, - TP_PROTO(const struct bkey *k), - TP_ARGS(k), + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k), TP_STRUCT__entry( - __field(u64, inode ) - __field(u64, offset ) - __field(u32, size ) + __string(k, k ) ), TP_fast_assign( - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->size = k->size; + __assign_str(k, k); ), - TP_printk("%llu:%llu len %u", __entry->inode, - __entry->offset, __entry->size) + TP_printk("%s", __get_str(k)) ); DECLARE_EVENT_CLASS(btree_node, @@ -667,19 +662,45 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ +TRACE_EVENT(bucket_evacuate, + TP_PROTO(struct bch_fs *c, struct bpos *bucket), + TP_ARGS(c, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = bucket->inode; + __entry->bucket = bucket->offset; + ), + + TP_printk("%d:%d %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + DEFINE_EVENT(bkey, move_extent_read, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); DEFINE_EVENT(bkey, move_extent_write, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); DEFINE_EVENT(bkey, move_extent_finish, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); TRACE_EVENT(move_extent_fail, @@ -700,8 +721,8 @@ TRACE_EVENT(move_extent_fail, ); DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); TRACE_EVENT(move_data, -- cgit From 1af5227c1d6b3513106f82808fe163bf0bd70df8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Apr 2023 03:42:41 -0400 Subject: bcachefs: Kill bch2_verify_bucket_evacuated() With backpointers, it's now impossible for bch2_evacuate_bucket() to be completely reliable: it can race with an extent being partially overwritten or split, which needs a new write buffer flush for the backpointer to be seen. This shouldn't be a real issue in practice; the previous patch added a new tracepoint so we'll be able to see more easily if it is. 
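For context, the tracepoint helpers added by the previous patch all follow the same guard-and-format pattern: the printbuf formatting only runs when the tracepoint is actually enabled, so nothing is paid on the fast path while tracing is off. A minimal sketch, taken from the move_extent helper in that patch (trace_move_extent_enabled() is generated by the kernel's TRACE_EVENT machinery):

    static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
    {
            /* Only pay for formatting when the tracepoint is enabled: */
            if (trace_move_extent_enabled()) {
                    struct printbuf buf = PRINTBUF;

                    /* Render the key being moved as text... */
                    bch2_bkey_val_to_text(&buf, c, k);
                    /* ...and emit it through the string-based bkey event class */
                    trace_move_extent(c, buf.buf);
                    printbuf_exit(&buf);
            }
    }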
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 79 -------------------------------------------------- fs/bcachefs/move.h | 2 -- fs/bcachefs/movinggc.c | 7 ----- 3 files changed, 88 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 498b3f9778de..642c076216ea 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -654,85 +654,6 @@ int bch2_move_data(struct bch_fs *c, return ret; } -void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; - unsigned nr_bps = 0; - int ret; - - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); -again: - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - - if (!ret && k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - - if (a.v->gen == gen && - a.v->dirty_sectors) { - if (a.v->data_type == BCH_DATA_btree) { - bch2_trans_unlock(trans); - if (bch2_btree_interior_updates_flush(c)) - goto again; - goto failed_to_evacuate; - } - } - } - - set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - return; -failed_to_evacuate: - bch2_trans_iter_exit(trans, &iter); - - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - return; - - prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket ")); - bch2_bkey_val_to_text(&buf, c, k); - - while (1) { - bch2_trans_begin(trans); - - ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_pos, &bp, - BTREE_ITER_CACHED); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (bkey_eq(bp_pos, POS_MAX)) - break; - - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - continue; - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_iter_exit(trans, &iter); - - if (++nr_bps > 10) - break; - bp_pos = bpos_nosnap_successor(bp_pos); - } - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - int __bch2_evacuate_bucket(struct btree_trans *trans, struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 50a6f7d7a292..547ee7b72c16 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -36,8 +36,6 @@ struct moving_context { wait_queue_head_t wait; }; -void bch2_verify_bucket_evacuated(struct btree_trans *, struct bpos, int); - #define move_ctxt_wait_event(_ctxt, _trans, _cond) \ do { \ bool cond_finished = false; \ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 2d75334c541d..02081ee1a114 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -134,13 +134,6 @@ static void move_buckets_wait(struct btree_trans *trans, if (atomic_read(&i->count)) break; - /* - * moving_ctxt_exit calls bch2_write as it flushes pending - * reads, which inits another btree_trans; this one must be - * unlocked: - */ - bch2_verify_bucket_evacuated(trans, i->bucket.k.bucket, i->bucket.k.gen); - list->first = i->next; if (!list->first) list->last = NULL; -- cgit From c8d5b71411473187db4fbc6ca419496b716778b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Apr 2023 14:32:39 -0400 Subject: bcachefs: Make sure hash info gets initialized in 
fsck We had some bugs with setting/using first_this_inode in the inode walker in the dirents/xattr code. This patch changes to not clear first_this_inode until after initializing the new hash info. Also, we fix an error message to not print on transaction restart, and add a comment to related fsck error code. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 5 +++++ fs/bcachefs/fsck.c | 25 ++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index aa640284ed19..545c55dabc27 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -158,6 +158,11 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) mutex_lock(&c->fsck_error_lock); s = fsck_err_get(c, fmt); if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; mutex_unlock(&c->fsck_error_lock); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6319f2f7b16f..4e7100577734 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -673,10 +673,8 @@ static int __walk_inode(struct btree_trans *trans, pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); - if (pos.inode == w->cur_inum) { - w->first_this_inode = false; + if (pos.inode == w->cur_inum) goto lookup_snapshot; - } w->inodes.nr = 0; @@ -862,10 +860,10 @@ bad_hash: (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + if (ret) return ret; - } ret = -BCH_ERR_transaction_restart_nested; } fsck_err: @@ -1639,6 +1637,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret < 0) goto err; + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + dir->first_this_inode = false; + if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), @@ -1665,11 +1667,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); - - ret = hash_check_key(trans, bch2_dirent_hash_desc, - hash_info, iter, k); + ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -1822,6 +1820,10 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret < 0) return ret; + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + inode->first_this_inode = false; + if (fsck_err_on(ret == INT_MAX, c, "xattr for missing inode %llu", k.k->p.inode)) @@ -1832,9 +1834,6 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = 0; - if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- cgit From a0668d77f04dd95a394cf421125a2cfd6ab68fad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Apr 2023 17:47:00 -0400 Subject: bcachefs: Fix a userspace build error Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 
2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index a57accc592db..8e37ce01a728 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -838,4 +838,6 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +#include + #endif /* _BCACHEFS_UTIL_H */ -- cgit From 6b52bcde4a2f3c073151356bf960596d2da9716e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Apr 2023 17:47:09 -0400 Subject: bcachefs: Always run topology error when CONFIG_BCACHEFS_DEBUG=y Improved test coverage. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d9f1e011ed71..eedcc09bacff 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1799,9 +1799,10 @@ again: bch2_mark_superblocks(c); - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && - c->opts.fix_errors != FSCK_OPT_NO) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && + c->opts.fix_errors != FSCK_OPT_NO)) { bch_info(c, "Starting topology repair pass"); ret = bch2_repair_topology(c); if (ret) -- cgit From 3140a3d0e990f5528707b676787faa7e29545f32 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Apr 2023 14:02:31 -0400 Subject: bcachefs: Delete obsolete btree ptr check This patch deletes a .key_invalid check for btree pointers that only applies to _very_ old on disk format versions, and potentially complicates the upgrade process. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 17e9c434619b..ccefacd0bda7 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -197,13 +197,6 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (c->sb.version < bcachefs_metadata_version_snapshot && - bp.v->min_key.snapshot) { - prt_printf(err, "invalid min_key.snapshot (%u != 0)", - bp.v->min_key.snapshot); - return -BCH_ERR_invalid_bkey; - } - return bch2_bkey_ptrs_invalid(c, k, flags, err); } -- cgit From 958c347b4b16dd3883c3765f5f99cacfe1b862b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Apr 2023 16:21:51 -0400 Subject: bcachefs: Mark bch2_copygc() noinline This works around a "stack frame too large" error. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 02081ee1a114..bd5d5221788a 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -205,6 +205,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, return ret < 0 ? ret : 0; } +noinline static int bch2_copygc(struct btree_trans *trans, struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight) -- cgit From 3c434cdff0a47df50779bd55a023c363f658c69a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 1 May 2023 07:09:33 -0400 Subject: bcachefs: fix accounting corruption race between reclaim and dev add When a device is removed from a bcachefs volume, the associated content is removed from the various btrees.
The alloc tree uses the key cache, so when keys are removed the deletes exist in cache for a period of time until reclaim comes along and flushes outstanding updates. When a device is re-added to the bcachefs volume, the add process re-adds some of these previously deleted keys. When marking device superblock locations on device add, the keys will likely refer to some of the same alloc keys that were just removed. The memory triggers for these key updates are responsible for further updates, such as bch2_mark_alloc() calling into bch2_dev_usage_update() to update per-device usage accounting. When a new key is added to key cache, the trans update path also flushes the key to the backing btree for coherency reasons during tree walks. With all of this context, if a device is removed and re-added quickly enough such that some key deletes from the remove are still pending a key cache flush, the trans update path can view this as addition of a new key because the old key in the insert entry refers to a deleted key. However, the deleted cached key has not been filled by absence of a btree key, but rather refers to an explicit deletion of an existing key that occurred during device removal. The trans update path adds a new update to flush the key and tags the original (cached) update to skip running the memory triggers. This results in running triggers on the non-cached update instead, which in turn will perform accounting updates based on incoherent values. For example, bch2_dev_usage_update() subtracts the old alloc key dirty sector count in the non-cached btree key from the newly initialized (i.e. zeroed) per device counters, leading to underflow and accounting corruption. There are at least a few ways to avoid this problem, the simplest of which may be to run triggers against the cached update rather than the non-cached update. If the key only needs to be flushed when the key is not present in the tree, however, then this still performs an unnecessary update. We could potentially use the cached key dirty state to determine whether the delete is a dirty, cached update vs. a clean cache fill, but this may require transmitting key cache dirty state across layers, which adds complexity and seems to be of limited value. Instead, update flush_new_cached_update() to handle this by simply checking for the key in the btree and only performing the flush when a backing key is not present. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b8299914a0ab..be4c5df42be8 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1501,21 +1501,31 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, unsigned long ip) { struct btree_path *btree_path; + struct bkey k; int ret; - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); if (ret) - goto err; + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key.
+ */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); -err: +out: bch2_path_put(trans, btree_path, true); return ret; } -- cgit From 0a23574ebb48844a19cf46672b28e7d439e56454 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 1 May 2023 09:08:26 -0400 Subject: bcachefs: remove unused key cache coherency flag Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 3 --- fs/bcachefs/btree_update_leaf.c | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 6ae517884a37..346b3ceb136d 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -83,7 +83,6 @@ enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, __BTREE_UPDATE_NOJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, - __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -98,8 +97,6 @@ enum btree_update_flags { #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) -#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \ - (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index be4c5df42be8..c511541bb5f4 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1606,9 +1606,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && - bkey_deleted(&i->old_k) && - !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) + if (path->cached && bkey_deleted(&i->old_k)) return flush_new_cached_update(trans, path, i, flags, ip); return 0; -- cgit From 95b595a5fc4832305fe1323d934bdcae5f2b9439 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 18:04:43 -0400 Subject: bcachefs: Btree iterator, update flags no longer conflict Change btree_update_flags to start after the last btree iterator flag, so that we can pass both in the same flags argument. This is needed for the upcoming bch2_bkey_get_mut() helper. 
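A condensed sketch of the resulting flag layout, abridged from the hunks below (only a handful of the flags are shown):

    /* Iterator flags occupy the low bits; the last one is bit 15: */
    static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
    #define __BTREE_ITER_FLAGS_END              16

    /* Update flags are renumbered to start where the iterator flags end: */
    enum btree_update_flags {
            __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
            __BTREE_UPDATE_NOJOURNAL,
            __BTREE_UPDATE_KEY_CACHE_RECLAIM,
            __BTREE_TRIGGER_NORUN,
    };
    #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)

Since the two sets no longer share bit positions, a single unsigned flags argument can carry BTREE_ITER_* and BTREE_UPDATE_* bits at the same time, which is what the upcoming helper relies on.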
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 2 +- fs/bcachefs/btree_types.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 346b3ceb136d..ec4638150eae 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -80,7 +80,7 @@ static inline int bch2_mark_key(struct btree_trans *trans, } enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bc4aa26b9486..fc8a3326451f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -211,6 +211,7 @@ static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, -- cgit From ab158fce47220df20f0fe2360767227328f6765b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 13:02:05 -0400 Subject: bcachefs: Converting to typed bkeys is now allowed for err, null ptrs Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 12 ++++++------ fs/bcachefs/btree_iter.h | 12 +++++++----- fs/bcachefs/fsck.c | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 29f44d0060d8..e81fb3e00c60 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -619,20 +619,20 @@ struct bkey_s_##name { \ \ static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline const struct bkey_i_##name * \ bkey_i_to_##name##_c(const struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ { \ - EBUG_ON(k.k->type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -641,7 +641,7 @@ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ \ static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ { \ - EBUG_ON(k.k->type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -667,7 +667,7 @@ name##_i_to_s_c(const struct bkey_i_##name *k) \ \ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ @@ -677,7 +677,7 @@ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ static inline struct bkey_s_c_##name \ bkey_i_to_s_c_##name(const struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return 
(struct bkey_s_c_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c3682332e653..4790472a3d0d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -6,6 +6,13 @@ #include "btree_types.h" #include "trace.h" +static inline int __bkey_err(const struct bkey *k) +{ + return PTR_ERR_OR_ZERO(k); +} + +#define bkey_err(_k) __bkey_err((_k).k) + static inline void __btree_path_get(struct btree_path *path, bool intent) { path->ref++; @@ -539,11 +546,6 @@ u32 bch2_trans_begin(struct btree_trans *); __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _ret) -static inline int bkey_err(struct bkey_s_c k) -{ - return PTR_ERR_OR_ZERO(k.k); -} - static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4e7100577734..eda1f030f784 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1122,7 +1122,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, d = dirent_get_by_pos(trans, &iter, SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d.s_c); + ret = bkey_err(d); if (ret) return ret == -ENOENT ? 0 : ret; -- cgit From 174f930b8e1cad3915819a46bb20da214f68f2b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Apr 2023 13:24:18 -0400 Subject: bcachefs: bkey_ops.min_val_size This adds a new field to bkey_ops for the minimum size of the value, which standardizes that check and also enforces the new rule (previously done somewhat ad-hoc) that we can extend value types by adding new fields on to the end. To make that work we do _not_ initialize min_val_size with sizeof, instead we initialize it to the size of the first version of those values. 
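A condensed sketch of how the new field is used, abridged from the hunks below (the per-type .key_invalid and trigger callbacks are omitted here for brevity):

    /* Each bkey_ops now records the value size when the type was first created: */
    #define bch2_bkey_ops_alloc ((struct bkey_ops) {       \
            .val_to_text    = bch2_alloc_to_text,           \
            .trans_trigger  = bch2_trans_mark_alloc,        \
            .atomic_trigger = bch2_mark_alloc,              \
            .min_val_size   = 8,                            \
    })

    /* ...so bch2_bkey_val_invalid() can enforce the minimum in one place: */
    ops = &bch2_bkey_ops[k.k->type];
    if (bkey_val_bytes(k.k) < ops->min_val_size) {
            prt_printf(err, "bad val size (%zu < %u)",
                       bkey_val_bytes(k.k), ops->min_val_size);
            return -BCH_ERR_invalid_bkey;
    }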
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 ++++ fs/bcachefs/backpointers.c | 5 ----- fs/bcachefs/backpointers.h | 1 + fs/bcachefs/bkey_methods.c | 21 +++++++++++++-------- fs/bcachefs/bkey_methods.h | 3 +++ fs/bcachefs/dirent.c | 6 ------ fs/bcachefs/dirent.h | 1 + fs/bcachefs/ec.c | 6 ------ fs/bcachefs/ec.h | 1 + fs/bcachefs/extents.c | 14 -------------- fs/bcachefs/extents.h | 2 ++ fs/bcachefs/inode.c | 24 ------------------------ fs/bcachefs/inode.h | 4 ++++ fs/bcachefs/lru.c | 8 -------- fs/bcachefs/lru.h | 1 + fs/bcachefs/quota.c | 6 ------ fs/bcachefs/quota.h | 1 + fs/bcachefs/reflink.c | 20 -------------------- fs/bcachefs/reflink.h | 3 +++ fs/bcachefs/subvolume.c | 12 ------------ fs/bcachefs/subvolume.h | 2 ++ fs/bcachefs/xattr.c | 6 ------ fs/bcachefs/xattr.h | 1 + 23 files changed, 37 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 324798396fc6..94b3c057cd62 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -159,6 +159,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ @@ -166,6 +167,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ @@ -173,6 +175,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ @@ -181,6 +184,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .swab = bch2_alloc_v4_swab, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 48, \ }) int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a3a1ed6e5968..e8666c3ed465 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -43,11 +43,6 @@ int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); - if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { - prt_str(err, "incorrect value size"); - return -BCH_ERR_invalid_bkey; - } - if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { prt_str(err, "backpointer at wrong pos"); return -BCH_ERR_invalid_bkey; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 9c03709ade50..3994bc83d69d 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -17,6 +17,7 @@ void bch2_backpointer_swab(struct bkey_s); .key_invalid = bch2_backpointer_invalid, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ + .min_val_size = 32, \ }) #define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 72d95831d65d..47f0ab023d64 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -56,17 
+56,12 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -BCH_ERR_invalid_bkey; - } - return 0; } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_invalid = key_type_cookie_invalid, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ @@ -126,12 +121,22 @@ const struct bkey_ops bch2_bkey_ops[] = { int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { + const struct bkey_ops *ops; + if (k.k->type >= KEY_TYPE_MAX) { prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ops[k.k->type].key_invalid(c, k, flags, err); + ops = &bch2_bkey_ops[k.k->type]; + + if (bkey_val_bytes(k.k) < ops->min_val_size) { + prt_printf(err, "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); + return -BCH_ERR_invalid_bkey; + } + + return ops->key_invalid(c, k, flags, err); } static unsigned bch2_key_types_allowed[] = { diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index ec4638150eae..a65756e306b0 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -34,6 +34,9 @@ struct bkey_ops { void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); + + /* Size of value type when first created: */ + unsigned min_val_size; }; extern const struct bkey_ops bch2_bkey_ops[]; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4c85d3399fb4..1544fc56974f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -89,12 +89,6 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*d.v)); - return -BCH_ERR_invalid_bkey; - } - len = bch2_dirent_name_bytes(d); if (!len) { prt_printf(err, "empty name"); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index ad131e8edc29..bf9ea2e35fae 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -12,6 +12,7 @@ void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ .val_to_text = bch2_dirent_to_text, \ + .min_val_size = 16, \ }) struct qstr; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1855d08efd4b..cf9f8c026034 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -119,12 +119,6 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (bkey_val_bytes(k.k) < sizeof(*s)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*s)); - return -BCH_ERR_invalid_bkey; - } - if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 7c08a49d7419..3995b7076427 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -17,6 +17,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .swab = 
bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_stripe, \ .atomic_trigger = bch2_mark_stripe, \ + .min_val_size = 8, \ }) static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ccefacd0bda7..e2b126ad2bab 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -183,14 +183,6 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { - prt_printf(err, "value too small (%zu <= %zu)", - bkey_val_bytes(k.k), sizeof(*bp.v)); - return -BCH_ERR_invalid_bkey; - } - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { prt_printf(err, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); @@ -383,12 +375,6 @@ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(*r.v)); - return -BCH_ERR_invalid_bkey; - } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { prt_printf(err, "invalid nr_replicas (%u)", r.v->nr_replicas); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9b026ae95932..31c8140950e0 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -407,6 +407,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .compat = bch2_btree_ptr_v2_compat, \ .trans_trigger = bch2_trans_mark_extent, \ .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 40, \ }) /* KEY_TYPE_extent: */ @@ -436,6 +437,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_merge = bch2_reservation_merge, \ .trans_trigger = bch2_trans_mark_reservation, \ .atomic_trigger = bch2_mark_reservation, \ + .min_val_size = 8, \ }) /* Extent checksum entries: */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7ccbc00b7156..107210dd9c9a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -437,12 +437,6 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -BCH_ERR_invalid_bkey; - } - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); @@ -457,12 +451,6 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -BCH_ERR_invalid_bkey; - } - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); @@ -477,12 +465,6 @@ int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -BCH_ERR_invalid_bkey; - } - if 
(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", @@ -543,12 +525,6 @@ int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); - return -BCH_ERR_invalid_bkey; - } - return 0; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f5066afb4886..0c3022d3f995 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -17,6 +17,7 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ @@ -24,6 +25,7 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ @@ -31,6 +33,7 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 48, \ }) static inline bool bkey_is_inode(const struct bkey *k) @@ -47,6 +50,7 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ .key_invalid = bch2_inode_generation_invalid, \ .val_to_text = bch2_inode_generation_to_text, \ + .min_val_size = 8, \ }) #if 0 diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c2dece27da2d..0e6193d6b5f8 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -13,14 +13,6 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - - if (bkey_val_bytes(k.k) < sizeof(*lru)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*lru)); - return -BCH_ERR_invalid_bkey; - } - if (!lru_pos_time(k.k->p)) { prt_printf(err, "lru entry at time=0"); return -BCH_ERR_invalid_bkey; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 78a6076999ed..adb98429248e 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -51,6 +51,7 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_invalid = bch2_lru_invalid, \ .val_to_text = bch2_lru_to_text, \ + .min_val_size = 8, \ }) int bch2_lru_del(struct btree_trans *, u16, u64, u64); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 331f22835d18..22cd662429d3 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -67,12 +67,6 @@ int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_quota)); - return -BCH_ERR_invalid_bkey; - } - return 0; } diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 146264fd16ce..b0f7d4ee775e 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -13,6 +13,7 @@ void bch2_quota_to_text(struct 
printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ .val_to_text = bch2_quota_to_text, \ + .min_val_size = 32, \ }) static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d8426e754cdf..9430899a5a31 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -30,12 +30,6 @@ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - if (bkey_val_bytes(p.k) != sizeof(*p.v)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(p.k), sizeof(*p.v)); - return -EINVAL; - } - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { prt_printf(err, "idx < front_pad (%llu < %u)", @@ -80,14 +74,6 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - if (bkey_val_bytes(r.k) < sizeof(*r.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(r.k), sizeof(*r.v)); - return -BCH_ERR_invalid_bkey; - } - return bch2_bkey_ptrs_invalid(c, k, flags, err); } @@ -133,12 +119,6 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); - return -BCH_ERR_invalid_bkey; - } - return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 2391037c2ece..ba400188f5be 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -14,6 +14,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_merge = bch2_reflink_p_merge, \ .trans_trigger = bch2_trans_mark_reflink_p, \ .atomic_trigger = bch2_mark_reflink_p, \ + .min_val_size = 16, \ }) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, @@ -29,6 +30,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_reflink_v, \ .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 8, \ }) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, @@ -44,6 +46,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .min_val_size = 8, \ }) static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 6407d19edc0e..31af41df09e3 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -36,12 +36,6 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { - prt_printf(err, "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); - return -BCH_ERR_invalid_bkey; - } - s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); @@ -743,12 +737,6 @@ int bch2_subvolume_invalid(const struct bch_fs *c, 
struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); - return -BCH_ERR_invalid_bkey; - } - return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index df6657952e2f..1f6f7862e48f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -15,6 +15,7 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ }) static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) @@ -119,6 +120,7 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .min_val_size = 16, \ }) int bch2_subvolume_get(struct btree_trans *, unsigned, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 4c86878b3df2..448eb446946b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -75,12 +75,6 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*xattr.v)); - return -BCH_ERR_invalid_bkey; - } - if (bkey_val_u64s(k.k) < xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))) { diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1a4cff3a9d96..b3e16729bcbb 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -12,6 +12,7 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ .val_to_text = bch2_xattr_to_text, \ + .min_val_size = 8, \ }) static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -- cgit From bcb79a51cb52033bb12c5ed2eb46770e984b5542 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Apr 2023 19:33:09 -0400 Subject: bcachefs: bch2_bkey_get_iter() helpers Introduce new helpers for a common pattern: bch2_trans_iter_init(); bch2_btree_iter_peek_slot(); - bch2_bkey_get_iter_type() returns -ENOENT if it doesn't find a key of the correct type - bch2_bkey_get_val_typed() copies the val out of the btree to a (typically stack allocated) variable; it handles the case where the value in the btree is smaller than the current version of the type, zeroing out the remainder. 
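A condensed usage sketch based on the conversions in this patch (the alloc and snapshot call sites are adapted from the hunks below; pos, snapshot and subvol are the caller's locals):

    struct btree_iter iter;
    struct bkey_s_c k;
    struct bch_snapshot s;
    int ret;

    /* Replaces the open-coded bch2_trans_iter_init() + bch2_btree_iter_peek_slot(): */
    k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, BTREE_ITER_CACHED);
    ret = bkey_err(k);      /* on error the iterator has already been exited */

    /* Copy a value straight out of the btree; short on-disk values are zero-padded: */
    ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, snapshot),
                                  0, snapshot, &s);
    if (!ret)
            *subvol = le32_to_cpu(s.subvol);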
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 49 +++++++++++--------------- fs/bcachefs/alloc_foreground.c | 5 +-- fs/bcachefs/backpointers.c | 29 +++++++-------- fs/bcachefs/btree_iter.h | 56 +++++++++++++++++++++++++++++ fs/bcachefs/btree_key_cache.c | 7 ++-- fs/bcachefs/data_update.c | 7 ++-- fs/bcachefs/ec.c | 15 ++++---- fs/bcachefs/fsck.c | 72 ++++++++++--------------------------- fs/bcachefs/inode.c | 17 ++++----- fs/bcachefs/io.c | 11 +++--- fs/bcachefs/lru.c | 3 +- fs/bcachefs/movinggc.c | 16 ++------- fs/bcachefs/quota.c | 6 ++-- fs/bcachefs/recovery.c | 7 ++-- fs/bcachefs/subvolume.c | 80 +++++++++++------------------------------- 15 files changed, 162 insertions(+), 218 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 968b6103e979..b938c37cb0fe 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -540,14 +540,13 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter struct bkey_i_alloc_v4 *a; int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (unlikely(ret)) - goto err; + return ERR_PTR(ret); a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); @@ -789,13 +788,12 @@ static int bch2_bucket_do_index(struct btree_trans *trans, return 0; } - bch2_trans_iter_init(trans, &iter, btree, + old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - old = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(old); if (ret) - goto err; + return ret; if (ca->mi.freespace_initialized && test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && @@ -833,13 +831,12 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, if (ret) return ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); ret = bkey_err(k); if (ret) - goto err; + return ret; if (k.k->type != KEY_TYPE_bucket_gens) { bkey_bucket_gens_init(&g->k_i); @@ -851,7 +848,6 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, g->v.gens[offset] = gen; ret = bch2_trans_update(trans, &iter, &g->k_i, 0); -err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1312,18 +1308,16 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + ret = bkey_err(alloc_k); + if (ret) + return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) goto delete; - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - a = bch2_alloc_to_v4(alloc_k, &a_convert); if (fsck_err_on(a->data_type != state || @@ -1336,7 +1330,6 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: -err: fsck_err: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); @@ 
-1525,7 +1518,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, k; + struct bkey_s_c alloc_k, lru_k; struct printbuf buf = PRINTBUF; int ret; @@ -1542,21 +1535,20 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; - bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_pos(alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), a->io_time[READ]), 0); - k = bch2_btree_iter_peek_slot(&lru_iter); - ret = bkey_err(k); + ret = bkey_err(lru_k); if (ret) - goto err; + return ret; if (fsck_err_on(!a->io_time[READ], c, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(k.k->type != KEY_TYPE_set, c, + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, "missing lru entry\n" " %s", (printbuf_reset(&buf), @@ -1645,10 +1637,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto out; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 368355de5f26..ec77601ebd0c 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -303,8 +303,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto err; } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_alloc, POS(ca->dev_idx, b), + BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) { ob = ERR_PTR(ret); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e8666c3ed465..e9ae623cf4a8 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -158,12 +158,11 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, set_bkey_val_u64s(&bp_k->k, 0); } - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&bp_iter); + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); ret = bkey_err(k); if (ret) goto err; @@ -202,9 +201,8 @@ int bch2_get_next_backpointer(struct btree_trans *trans, goto done; if (gen >= 0) { - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); - k = bch2_btree_iter_peek_slot(&alloc_iter); + k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); ret = bkey_err(k); if (ret) goto out; @@ -381,10 +379,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ ca = bch_dev_bkey_exists(c, k.k->p.inode); - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); - - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -442,10 +438,9 @@ static int 
check_bp_exists(struct btree_trans *trans, if (!bch2_dev_bucket_exists(c, bucket)) goto missing; - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), - 0); - bp_k = bch2_btree_iter_peek_slot(&bp_iter); + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp.bucket_offset), + 0); ret = bkey_err(bp_k); if (ret) goto err; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4790472a3d0d..ab86d9d39a18 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -483,6 +483,62 @@ static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size } } +static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type) +{ + struct bkey_s_c k; + + bch2_trans_iter_init(trans, iter, btree_id, pos, flags); + k = bch2_btree_iter_peek_slot(iter); + + if (!bkey_err(k) && type && k.k->type != type) + k = bkey_s_c_err(-ENOENT); + if (unlikely(bkey_err(k))) + bch2_trans_iter_exit(trans, iter); + return k; +} + +static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); +} + +#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ + _btree_id, _pos, _flags, KEY_TYPE_##_type)) + +static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, + unsigned val_size, void *val) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + ret = bkey_err(k); + if (!ret) { + unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); + + memcpy(val, k.v, b); + if (unlikely(b < sizeof(*val))) + memset((void *) val + b, 0, sizeof(*val) - b); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ + __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(*_val), _val) + static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index a483bd23a336..727ea2d0e58d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -386,10 +386,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct bkey_i *new_k = NULL; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index cffb3c14d539..c709538ce9c2 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -57,10 +57,9 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, whiteout_pos.snapshot = k.k->p.snapshot; - bch2_trans_iter_init(trans, &iter2, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - 
BTREE_ITER_INTENT); - k2 = bch2_btree_iter_peek_slot(&iter2); + k2 = bch2_bkey_get_iter(trans, &iter2, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); ret = bkey_err(k2); if (!ret && k2.k->type == KEY_TYPE_deleted) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index cf9f8c026034..439fa540323f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -452,9 +452,8 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); ret = bkey_err(k); if (ret) goto err; @@ -755,9 +754,8 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) struct bkey_s_c_stripe s; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) goto err; @@ -835,9 +833,8 @@ static int ec_stripe_key_update(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eda1f030f784..142e64922d8f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -72,26 +72,14 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvol) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS(0, snapshot), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { + struct bch_snapshot s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, + POS(0, snapshot), 0, + snapshot, &s); + if (!ret) + *subvol = le32_to_cpu(s.subvol); + else if (ret == -ENOENT) bch_err(trans->c, "snapshot %u not fonud", snapshot); - ret = -ENOENT; - goto err; - } - - *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); -err: - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -152,9 +140,8 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, *snapshot), 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); ret = bkey_err(k); if (ret) goto err; @@ -259,10 +246,8 @@ static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) retry: bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) goto err; @@ -453,22 +438,14 @@ static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct btree_iter iter; - struct bkey_s_c k; + struct bkey_s_c_dirent d; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, 
inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) { - ret = -ENOENT; - goto out; - } - - ret = __remove_dirent(trans, k.k->p); -out: + d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0, + dirent); + ret = bkey_err(d) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1081,20 +1058,7 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (!ret && k.k->type != KEY_TYPE_dirent) - ret = -ENOENT; - if (ret) { - bch2_trans_iter_exit(trans, iter); - return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; - } - - return bkey_s_c_to_dirent(k); + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); } static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 107210dd9c9a..ddcd7b125f32 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -329,13 +329,12 @@ int bch2_inode_peek(struct btree_trans *trans, if (ret) return ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) - goto err; + return ret; ret = bkey_is_inode(k.k) ? 0 : -ENOENT; if (ret) @@ -760,11 +759,9 @@ retry: if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); - + k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index cc2dfcf16dee..01911db786f5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2312,9 +2312,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if ((ret = bkey_err(k))) goto out; @@ -2550,10 +2549,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 0e6193d6b5f8..4f23e88f6ae1 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -114,8 +114,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, alloc_pos.inode, alloc_pos.offset)) return bch2_btree_delete_at(trans, lru_iter, 0); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, 
&iter, BTREE_ID_alloc, alloc_pos, 0); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index bd5d5221788a..0d96346d5040 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -91,12 +91,9 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, b->k.bucket.offset)) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); ret = bkey_err(k); - bch2_trans_iter_exit(trans, &iter); - if (ret) return ret; @@ -108,14 +105,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, a->fragmentation_lru && a->fragmentation_lru <= time; - if (!ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, trans->c, k); - pr_debug("%s", buf.buf); - printbuf_exit(&buf); - } - + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 22cd662429d3..7734e0dfe523 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -901,10 +901,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = bkey_err(k); if (unlikely(ret)) return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 91a66b5916eb..6214691fa441 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1065,12 +1065,11 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) struct bch_inode_unpacked inode; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); ret = bkey_err(k); if (ret) - goto err; + return ret; if (!bkey_is_inode(k.k)) { bch_err(trans->c, "root inode not found"); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 31af41df09e3..376444860ac2 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -103,20 +103,8 @@ int bch2_mark_snapshot(struct btree_trans *trans, static int snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 
0 : -ENOENT; - - if (!ret) - *s = *bkey_s_c_to_snapshot(k).v; - - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); } static int snapshot_live(struct btree_trans *trans, u32 id) @@ -402,27 +390,20 @@ err: static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { + struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct bkey_s_c k; struct bkey_s_c_snapshot s; u32 parent_id; unsigned i; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(ret == -ENOENT, c, "missing snapshot %u", id); - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); - ret = -ENOENT; + if (ret) goto err; - } - - s = bkey_s_c_to_snapshot(k); BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); parent_id = le32_to_cpu(s.v->parent); @@ -436,7 +417,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) parent = bch2_bkey_get_mut_typed(trans, &p_iter, snapshot); ret = PTR_ERR_OR_ZERO(parent); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", parent_id); + bch2_fs_inconsistent_on(ret == -ENOENT, c, "missing snapshot %u", parent_id); goto err; } @@ -445,7 +426,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) break; if (i == 2) - bch_err(trans->c, "snapshot %u missing child pointer to %u", + bch_err(c, "snapshot %u missing child pointer to %u", parent_id, id); else parent->v.children[i] = 0; @@ -756,21 +737,10 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int iter_flags, struct bch_subvolume *s) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; - - if (ret == -ENOENT && inconsistent_if_not_found) - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); - if (!ret) - *s = *bkey_s_c_to_subvolume(k).v; - - bch2_trans_iter_exit(trans, &iter); + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags, subvolume, s); + bch2_fs_inconsistent_on(ret == -ENOENT && inconsistent_if_not_found, + trans->c, "missing subvolume %u", subvol); return ret; } @@ -813,28 +783,20 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_s_c_subvolume subvol; struct btree_trans_commit_hook *h; u32 snapid; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_INTENT, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid); if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); - ret = -EIO; - goto err; - } + return ret; - subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); ret = bch2_btree_delete_at(trans, &iter, 0); -- cgit From d67a16df9c5e03e3e4a672bd6547812baad0bf2c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 20:58:59 -0400 Subject: bcachefs: Move bch2_bkey_make_mut() to btree_update.h It's for doing updates - this is where it belongs, and next pathes will be changing these helpers to use items from btree_update.h. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 43 ------------------------------------------- fs/bcachefs/btree_update.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index ab86d9d39a18..188a6cd483f8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -539,49 +539,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ KEY_TYPE_##_type, sizeof(*_val), _val) -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); - - if (!IS_ERR(mut)) - bkey_reassemble(mut, k); - return mut; -} - -static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - - return unlikely(IS_ERR(k.k)) - ? 
ERR_CAST(k.k) - : bch2_bkey_make_mut(trans, k); -} - -#define bch2_bkey_get_mut_typed(_trans, _iter, _type) \ -({ \ - struct bkey_i *_k = bch2_bkey_get_mut(_trans, _iter); \ - struct bkey_i_##_type *_ret; \ - \ - if (IS_ERR(_k)) \ - _ret = ERR_CAST(_k); \ - else if (unlikely(_k->k.type != KEY_TYPE_##_type)) \ - _ret = ERR_PTR(-ENOENT); \ - else \ - _ret = bkey_i_to_##_type(_k); \ - _ret; \ -}) - -#define bch2_bkey_alloc(_trans, _iter, _type) \ -({ \ - struct bkey_i_##_type *_k = bch2_trans_kmalloc_nomemzero(_trans, sizeof(*_k));\ - if (!IS_ERR(_k)) { \ - bkey_##_type##_init(&_k->k_i); \ - _k->k.p = (_iter)->pos; \ - } \ - _k; \ -}) - u32 bch2_trans_begin(struct btree_trans *); /* diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4adb6f646655..0b320dfaed43 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -183,4 +183,47 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) } } +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); + + if (!IS_ERR(mut)) + bkey_reassemble(mut, k); + return mut; +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + return unlikely(IS_ERR(k.k)) + ? ERR_CAST(k.k) + : bch2_bkey_make_mut(trans, k); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _type) \ +({ \ + struct bkey_i *_k = bch2_bkey_get_mut(_trans, _iter); \ + struct bkey_i_##_type *_ret; \ + \ + if (IS_ERR(_k)) \ + _ret = ERR_CAST(_k); \ + else if (unlikely(_k->k.type != KEY_TYPE_##_type)) \ + _ret = ERR_PTR(-ENOENT); \ + else \ + _ret = bkey_i_to_##_type(_k); \ + _ret; \ +}) + +#define bch2_bkey_alloc(_trans, _iter, _type) \ +({ \ + struct bkey_i_##_type *_k = bch2_trans_kmalloc_nomemzero(_trans, sizeof(*_k));\ + if (!IS_ERR(_k)) { \ + bkey_##_type##_init(&_k->k_i); \ + _k->k.p = (_iter)->pos; \ + } \ + _k; \ +}) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ -- cgit From 34dfa5db191fe227c0c413624b7387f1f1804029 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Apr 2023 23:48:33 -0400 Subject: bcachefs: bch2_bkey_get_mut() improvements - bch2_bkey_get_mut() now handles types increasing in size, allocating a buffer for the type's current size when necessary - bch2_bkey_make_mut_typed() - bch2_bkey_get_mut() now initializes the iterator, like bch2_bkey_get_iter() Also, refactor so that most of the code is in functions - now macros are only used for wrappers. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +----- fs/bcachefs/btree_update.h | 101 +++++++++++++++++++++++++++++------------ fs/bcachefs/buckets.c | 14 +++--- fs/bcachefs/io.c | 13 +++--- fs/bcachefs/subvolume.c | 39 ++++++++-------- 5 files changed, 104 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b938c37cb0fe..7b6225fe3443 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -511,18 +511,8 @@ static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_ if (likely(k.k->type == KEY_TYPE_alloc_v4) && ((a = bkey_s_c_to_alloc_v4(k), true) && - BCH_ALLOC_V4_BACKPOINTERS_START(a.v) == BCH_ALLOC_V4_U64s && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) { - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... 
- */ - struct bkey_i_alloc_v4 *ret = - bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); - if (!IS_ERR(ret)) - bkey_reassemble(&ret->k_i, k); - return ret; - } + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) + return bch2_bkey_make_mut_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 0b320dfaed43..5e2aa21caf55 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -183,47 +183,90 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) } } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) +static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k, + unsigned type, unsigned min_bytes) { - struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k)); + unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); + struct bkey_i *mut; + + if (type && k.k->type != type) + return ERR_PTR(-ENOENT); - if (!IS_ERR(mut)) + mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + if (!IS_ERR(mut)) { bkey_reassemble(mut, k); + + if (unlikely(bytes > bkey_bytes(k.k))) { + memset((void *) mut + bkey_bytes(k.k), 0, + bytes - bkey_bytes(k.k)); + mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); + } + } return mut; } -static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter) +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + return __bch2_bkey_make_mut(trans, k, 0, 0); +} - return unlikely(IS_ERR(k.k)) +#define bch2_bkey_make_mut_typed(_trans, _k, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _k, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); + struct bkey_i *ret = unlikely(IS_ERR(k.k)) ? 
ERR_CAST(k.k) - : bch2_bkey_make_mut(trans, k); + : __bch2_bkey_make_mut(trans, k, 0, min_bytes); + if (unlikely(IS_ERR(ret))) + bch2_trans_iter_exit(trans, iter); + return ret; } -#define bch2_bkey_get_mut_typed(_trans, _iter, _type) \ -({ \ - struct bkey_i *_k = bch2_bkey_get_mut(_trans, _iter); \ - struct bkey_i_##_type *_ret; \ - \ - if (IS_ERR(_k)) \ - _ret = ERR_CAST(_k); \ - else if (unlikely(_k->k.type != KEY_TYPE_##_type)) \ - _ret = ERR_PTR(-ENOENT); \ - else \ - _ret = bkey_i_to_##_type(_k); \ - _ret; \ -}) +static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned min_bytes) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return bch2_bkey_get_mut_minsize(trans, iter, btree_id, pos, flags, 0); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ + _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, + unsigned type, unsigned val_size) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + + if (!IS_ERR(k)) { + bkey_init(&k->k); + k->k.p = iter->pos; + k->k.type = type; + set_bkey_val_bytes(&k->k, val_size); + } + + return k; +} #define bch2_bkey_alloc(_trans, _iter, _type) \ -({ \ - struct bkey_i_##_type *_k = bch2_trans_kmalloc_nomemzero(_trans, sizeof(*_k));\ - if (!IS_ERR(_k)) { \ - bkey_##_type##_init(&_k->k_i); \ - _k->k.p = (_iter)->pos; \ - } \ - _k; \ -}) + bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, \ + KEY_TYPE_##_type, sizeof(struct bch_##_type))) #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f3cee8f2b793..00b60749daf8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1448,10 +1448,9 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - s = bch2_bkey_get_mut_typed(trans, &iter, stripe); + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(ret == -ENOENT, trans, @@ -1750,10 +1749,9 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_bkey_get_mut(trans, &iter); + k = bch2_bkey_get_mut(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 01911db786f5..c0471a4144ff 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -257,15 +257,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - 
BTREE_ITER_INTENT|BTREE_ITER_CACHED); - k = bch2_bkey_get_mut(trans, &iter); + k = bch2_bkey_get_mut(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); ret = PTR_ERR_OR_ZERO(k); if (unlikely(ret)) - goto err; + return ret; if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { k = bch2_inode_to_v3(trans, k); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 376444860ac2..eea4c2558998 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -363,9 +363,9 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) struct bkey_i_snapshot *s; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT); - s = bch2_bkey_get_mut_typed(trans, &iter, snapshot); + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id); @@ -411,10 +411,9 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) if (parent_id) { struct bkey_i_snapshot *parent; - bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, - POS(0, parent_id), - BTREE_ITER_INTENT); - parent = bch2_bkey_get_mut_typed(trans, &p_iter, snapshot); + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); ret = PTR_ERR_OR_ZERO(parent); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, c, "missing snapshot %u", parent_id); @@ -453,7 +452,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, u32 *snapshot_subvols, unsigned nr_snapids) { - struct btree_iter iter; + struct btree_iter iter, parent_iter = { NULL }; struct bkey_i_snapshot *n; struct bkey_s_c k; unsigned i; @@ -498,8 +497,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, } if (parent) { - bch2_btree_iter_set_pos(&iter, POS(0, parent)); - n = bch2_bkey_get_mut_typed(trans, &iter, snapshot); + n = bch2_bkey_get_mut_typed(trans, &parent_iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { if (ret == -ENOENT) @@ -517,11 +517,12 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[1] = cpu_to_le32(new_snapids[1]); n->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &parent_iter, &n->k_i, 0); if (ret) goto err; } err: + bch2_trans_iter_exit(trans, &parent_iter); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -888,11 +889,9 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) struct subvolume_unlink_hook *h; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - n = bch2_bkey_get_mut_typed(trans, &iter, subvolume); + n = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid); @@ -956,11 +955,9 @@ found_slot: if (src_subvolid) { /* Creating a snapshot: */ - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, - POS(0, src_subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, subvolume); + src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, + 
BTREE_ID_subvolumes, POS(0, src_subvolid), + BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, -- cgit From f8cb35fda161715e384df340f0bae4de37c5576f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 18:59:28 -0400 Subject: bcachefs: bch2_bkey_alloc() now calls bch2_trans_update() It's safe to call bch2_trans_update with a k/v pair where the value hasn't been filled out, as long as the key part has been and the value is filled out by transaction commit time. This patch folds the bch2_trans_update() call into bch2_bkey_alloc(), eliminating a bit of boilerplate. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 23 ++++++++++++++--------- fs/bcachefs/subvolume.c | 12 ++++-------- 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 5e2aa21caf55..5743a6e0f474 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -251,22 +251,27 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - unsigned type, unsigned val_size) + unsigned flags, unsigned type, unsigned val_size) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + int ret; - if (!IS_ERR(k)) { - bkey_init(&k->k); - k->k.p = iter->pos; - k->k.type = type; - set_bkey_val_bytes(&k->k, val_size); - } + if (IS_ERR(k)) + return k; + + bkey_init(&k->k); + k->k.p = iter->pos; + k->k.type = type; + set_bkey_val_bytes(&k->k, val_size); + ret = bch2_trans_update(trans, iter, k, flags); + if (unlikely(ret)) + return ERR_PTR(ret); return k; } -#define bch2_bkey_alloc(_trans, _iter, _type) \ - bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, \ +#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ KEY_TYPE_##_type, sizeof(struct bch_##_type))) #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index eea4c2558998..48956453340d 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -476,7 +476,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; } - n = bch2_bkey_alloc(trans, &iter, snapshot); + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; @@ -487,9 +487,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.pad = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -981,7 +980,7 @@ found_slot: goto err; } - new_subvol = bch2_bkey_alloc(trans, &dst_iter, subvolume); + new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); ret = PTR_ERR_OR_ZERO(new_subvol); if (ret) goto err; @@ -991,9 +990,6 @@ found_slot: new_subvol->v.inode = cpu_to_le64(inode); SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); - if (ret) - goto err; *new_subvolid = new_subvol->k.p.offset; *new_snapshotid = new_nodes[0]; -- cgit From 
f12a798a898dec36de9705d40a1b03e2418aabe0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 18:46:24 -0400 Subject: bcachefs: bch2_bkey_get_mut() now calls bch2_trans_update() It's safe to call bch2_trans_update with a k/v pair where the value hasn't been filled out, as long as the key part has been and the value is filled out by transaction commit time. This patch folds the bch2_trans_update() call into bch2_bkey_get_mut(), eliminating a bit of boilerplate. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 33 +++++++++++++++++++++++++++++++-- fs/bcachefs/buckets.c | 6 +----- fs/bcachefs/io.c | 2 +- fs/bcachefs/subvolume.c | 35 +++++++++++------------------------ 4 files changed, 44 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 5743a6e0f474..d823334033f9 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -214,7 +214,7 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _k, \ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) -static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, +static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, unsigned min_bytes) @@ -229,6 +229,35 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, return ret; } +static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); +} + +static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + return mut; +} + static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -242,7 +271,7 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) { - return bch2_bkey_get_mut_minsize(trans, iter, btree_id, pos, flags, 0); + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); } #define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 00b60749daf8..bce42eef6f57 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1471,10 +1471,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - ret = bch2_trans_update(trans, &iter, &s->k_i, 0); - if (ret) - goto err; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, &r.e, sectors); @@ -1749,7 +1745,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - k = bch2_bkey_get_mut(trans, &iter, + k = bch2_bkey_get_mut_noupdate(trans, &iter, 
BTREE_ID_reflink, POS(0, *idx), BTREE_ITER_WITH_UPDATES); ret = PTR_ERR_OR_ZERO(k); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index c0471a4144ff..46dc166d23d5 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -257,7 +257,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; int ret; - k = bch2_bkey_get_mut(trans, &iter, BTREE_ID_inodes, + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 48956453340d..cac295afc75f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -369,7 +369,7 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id); - goto err; + return ret; } /* already deleted? */ @@ -379,10 +379,6 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) SET_BCH_SNAPSHOT_DELETED(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; - - ret = bch2_trans_update(trans, &iter, &s->k_i, 0); - if (ret) - goto err; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -434,10 +430,6 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) le32_to_cpu(parent->v.children[1])) swap(parent->v.children[0], parent->v.children[1]); - - ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); - if (ret) - goto err; } ret = bch2_btree_delete_at(trans, &iter, 0); @@ -888,30 +880,25 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) struct subvolume_unlink_hook *h; int ret = 0; + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + return ret; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); + n = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid); - goto err; + return ret; } SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); - - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); - if (ret) - goto err; - - h = bch2_trans_kmalloc(trans, sizeof(*h)); - ret = PTR_ERR_OR_ZERO(h); - if (ret) - goto err; - - h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; - h->subvol = subvolid; - bch2_trans_commit_hook(trans, &h->h); -err: bch2_trans_iter_exit(trans, &iter); return ret; } -- cgit From dbda63bbb0dbce070f22132339a07146bf1af850 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Apr 2023 19:21:06 -0400 Subject: bcachefs: bch2_bkey_make_mut() now calls bch2_trans_update() It's safe to call bch2_trans_update with a k/v pair where the value hasn't been filled out, as long as the key part has been and the value is filled out by transaction commit time. This patch folds the bch2_trans_update() call into bch2_bkey_make_mut(), eliminating a bit of boilerplate. 
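
As a rough before/after sketch of what this saves callers (illustrative only, not lifted from the hunks below; the KEY_TYPE_deleted tweak stands in for whatever modification a real caller makes):

	/* before: reassemble, modify, then queue the update by hand */
	struct bkey_i *n = bch2_bkey_make_mut(trans, k);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	n->k.type = KEY_TYPE_deleted;

	return bch2_trans_update(trans, iter, n, 0);

	/* after: bch2_bkey_make_mut() takes the iter + flags and queues the
	 * update itself; the value can still be filled in afterwards, since
	 * it only has to be complete by transaction commit time
	 */
	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	n->k.type = KEY_TYPE_deleted;
	return 0;

This mirrors the bch2_bkey_get_mut() and bch2_bkey_alloc() changes; callers that do not want the update queued (e.g. extent_front_merge()) switch to the new _noupdate variants instead.
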
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_gc.c | 8 +++----- fs/bcachefs/btree_update.h | 38 ++++++++++++++++++++++++++++++++------ fs/bcachefs/btree_update_leaf.c | 8 ++++---- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/io.c | 2 +- fs/bcachefs/migrate.c | 5 ++--- fs/bcachefs/move.c | 2 +- 8 files changed, 45 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7b6225fe3443..dcdef3bcd4c4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -512,7 +512,7 @@ static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_ if (likely(k.k->type == KEY_TYPE_alloc_v4) && ((a = bkey_s_c_to_alloc_v4(k), true) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) - return bch2_bkey_make_mut_typed(trans, k, alloc_v4); + return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index eedcc09bacff..8477e721b63c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1591,7 +1591,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, k); + struct bkey_i *new = bch2_bkey_make_mut(trans, iter, k, 0); ret = PTR_ERR_OR_ZERO(new); if (ret) @@ -1601,8 +1601,6 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, new->k.type = KEY_TYPE_deleted; else *bkey_refcount(new) = cpu_to_le64(r->refcount); - - ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); @@ -1918,13 +1916,13 @@ static int gc_btree_gens_key(struct btree_trans *trans, percpu_up_read(&c->mark_lock); return 0; update: - u = bch2_bkey_make_mut(trans, k); + u = bch2_bkey_make_mut(trans, iter, k, 0); ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; bch2_extent_normalize(c, bkey_i_to_s(u)); - return bch2_trans_update(trans, iter, u, 0); + return 0; } static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d823334033f9..34ca2c43a8ca 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -183,7 +183,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) } } -static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k, +static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -205,13 +205,39 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str return mut; } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k) +static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { - return __bch2_bkey_make_mut(trans, k, 0, 0); + return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } -#define bch2_bkey_make_mut_typed(_trans, _k, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _k, \ +#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, 
+ struct bkey_s_c k, unsigned flags, + unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, k, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) + return ERR_PTR(ret); + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, unsigned flags) +{ + return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); +} + +#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, @@ -223,7 +249,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr btree_id, pos, flags|BTREE_ITER_INTENT, type); struct bkey_i *ret = unlikely(IS_ERR(k.k)) ? ERR_CAST(k.k) - : __bch2_bkey_make_mut(trans, k, 0, min_bytes); + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); if (unlikely(IS_ERR(ret))) bch2_trans_iter_exit(trans, iter); return ret; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index c511541bb5f4..0952885f3caa 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1268,7 +1268,7 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; - update = bch2_bkey_make_mut(trans, k); + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) return ret; @@ -1390,7 +1390,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, trans->extra_journal_res += compressed_sectors; if (front_split) { - update = bch2_bkey_make_mut(trans, k); + update = bch2_bkey_make_mut_noupdate(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1404,7 +1404,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, if (k.k->p.snapshot != insert->k.p.snapshot && (front_split || back_split)) { - update = bch2_bkey_make_mut(trans, k); + update = bch2_bkey_make_mut_noupdate(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1443,7 +1443,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, } if (back_split) { - update = bch2_bkey_make_mut(trans, k); + update = bch2_bkey_make_mut_noupdate(trans, k); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 142e64922d8f..4b28fc4f77c6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -765,7 +765,7 @@ static int hash_redo_key(struct btree_trans *trans, if (IS_ERR(delete)) return PTR_ERR(delete); - tmp = bch2_bkey_make_mut(trans, k); + tmp = bch2_bkey_make_mut_noupdate(trans, k); if (IS_ERR(tmp)) return PTR_ERR(tmp); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 46dc166d23d5..11ed86453d66 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1393,7 +1393,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } - new = bch2_bkey_make_mut(trans, k); + new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index d93db07f0c87..0898fa49b3cd 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, k); + n = 
bch2_bkey_make_mut(trans, iter, k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -73,8 +73,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, */ if (bkey_deleted(&n->k)) n->k.size = 0; - - return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + return 0; } static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 642c076216ea..7e22176a5c7e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -251,7 +251,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, struct bkey_i *n; int ret; - n = bch2_bkey_make_mut(trans, k); + n = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; -- cgit From 51e84d3bbff55f5ac79fef0d1bbf515d6d397289 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Apr 2023 23:20:18 -0400 Subject: bcachefs: bch2_bkey_get_empty_slot() Add a new helper for allocating a new slot in a btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 3 +++ fs/bcachefs/btree_update_leaf.c | 31 +++++++++++++++++++++++++++++++ fs/bcachefs/errcode.h | 1 + fs/bcachefs/subvolume.c | 25 ++++++------------------- 4 files changed, 41 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 34ca2c43a8ca..1ac3a81e0af6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -86,6 +86,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); + int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); int __must_check bch2_trans_update_buffered(struct btree_trans *, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0952885f3caa..33693467810b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1735,6 +1735,37 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans, return 0; } +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + void bch2_trans_commit_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4304e25a6b24..c73a5e78e260 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -92,6 +92,7 @@ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ x(ENOSPC, ENOSPC_sb_crypt) \ + x(ENOSPC, ENOSPC_btree_slot) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index cac295afc75f..8d87f90a0ac6 100644 --- a/fs/bcachefs/subvolume.c +++ 
b/fs/bcachefs/subvolume.c @@ -909,32 +909,19 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, u32 *new_snapshotid, bool ro) { - struct bch_fs *c = trans->c; struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; - struct bkey_s_c k; u32 parent = 0, new_nodes[2], snapshot_subvols[2]; int ret = 0; - for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_gt(k.k->p, SUBVOL_POS_MAX)) - break; - - /* - * bch2_subvolume_delete() doesn't flush the btree key cache - - * ideally it would but that's tricky - */ - if (bkey_deleted(k.k) && - !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) - goto found_slot; - } - - if (!ret) + ret = bch2_bkey_get_empty_slot(trans, &dst_iter, + BTREE_ID_subvolumes, POS(0, U32_MAX)); + if (ret == -BCH_ERR_ENOSPC_btree_slot) ret = -BCH_ERR_ENOSPC_subvolume_create; - goto err; -found_slot: + if (ret) + return ret; + snapshot_subvols[0] = dst_iter.pos.offset; snapshot_subvols[1] = src_subvolid; -- cgit From 1c59b483a3d249e08f0dcff43d9b78851d216fc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Mar 2023 11:18:52 -0400 Subject: bcachefs: BTREE_ID_snapshot_tree This adds a new btree which gets us a persistent per-snapshot-tree identifier. - BTREE_ID_snapshot_trees - KEY_TYPE_snapshot_tree - bch_snapshot now has a field that points to a snapshot_tree This is going to be used to designate one snapshot ID/subvolume out of a given tree of snapshots as the "main" subvolume, so that we can do quota accounting in that subvolume and not the rest. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 26 +- fs/bcachefs/bkey_methods.c | 3 + fs/bcachefs/errcode.h | 1 + fs/bcachefs/fsck.c | 3 +- fs/bcachefs/recovery.c | 23 +- fs/bcachefs/subvolume.c | 590 +++++++++++++++++++++++++++++++++++++----- fs/bcachefs/subvolume.h | 20 ++ fs/bcachefs/subvolume_types.h | 1 + 9 files changed, 588 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e5834729b52a..39fd15447753 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -445,6 +445,7 @@ enum gc_phase { GC_PHASE_BTREE_need_discard, GC_PHASE_BTREE_backpointers, GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 013d5e185d97..e9ac3aa6d91c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -364,7 +364,8 @@ static inline void bkey_init(struct bkey *k) x(alloc_v4, 27) \ x(backpointer, 28) \ x(inode_v3, 29) \ - x(bucket_gens, 30) + x(bucket_gens, 30) \ + x(snapshot_tree, 31) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -1123,7 +1124,7 @@ struct bch_snapshot { __le32 parent; __le32 children[2]; __le32 subvol; - __le32 pad; + __le32 tree; }; LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) @@ -1131,6 +1132,19 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were 
created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + /* LRU btree: */ struct bch_lru { @@ -1559,7 +1573,8 @@ struct bch_sb_field_journal_seq_blacklist { x(bucket_gens, 25) \ x(lru_v2, 26) \ x(fragmentation_lru, 27) \ - x(no_bps_in_alloc_keys, 28) + x(no_bps_in_alloc_keys, 28) \ + x(snapshot_trees, 29) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1569,6 +1584,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_max }; +static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_snapshot_trees; + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 @@ -2095,7 +2112,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(freespace, 11) \ x(need_discard, 12) \ x(backpointers, 13) \ - x(bucket_gens, 14) + x(bucket_gens, 14) \ + x(snapshot_trees, 15) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 47f0ab023d64..79f3fbe925d5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -204,6 +204,9 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_bucket_gens] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_bucket_gens), + [BKEY_TYPE_snapshot_trees] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot_tree), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c73a5e78e260..c8ac08e5548b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -93,6 +93,7 @@ x(ENOSPC, ENOSPC_sb_members) \ x(ENOSPC, ENOSPC_sb_crypt) \ x(ENOSPC, ENOSPC_btree_slot) \ + x(ENOSPC, ENOSPC_snapshot_tree) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4b28fc4f77c6..eb3609aa4593 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2427,7 +2427,8 @@ int bch2_fsck_full(struct bch_fs *c) { int ret; again: - ret = bch2_fs_check_snapshots(c) ?: + ret = bch2_fs_check_snapshot_trees(c); + bch2_fs_check_snapshots(c) ?: bch2_fs_check_subvols(c) ?: bch2_delete_dead_snapshots(c) ?: check_inodes(c, true) ?: diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6214691fa441..af76c029fb6a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1025,16 +1025,25 @@ fsck_err: static int bch2_fs_initialize_subvolumes(struct bch_fs *c) { - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; int ret; + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, + &root_tree.k_i, + NULL, NULL, 0); + bkey_snapshot_init(&root_snapshot.k_i); root_snapshot.k.p.offset = U32_MAX; root_snapshot.v.flags = 0; root_snapshot.v.parent = 0; root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; - root_snapshot.v.pad = 0; + root_snapshot.v.tree = cpu_to_le32(1); SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); ret = bch2_btree_insert(c, BTREE_ID_snapshots, @@ -1135,8 +1144,12 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_no_bps_in_alloc_keys) { - bch_info(c, 
"version prior to no_bps_in_alloc_keys, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_required_upgrade_below) { + bch_info(c, "version %s (%u) prior to %s (%u), upgrade and fsck required", + bch2_metadata_versions[c->sb.version], + c->sb.version, + bch2_metadata_versions[bcachefs_metadata_required_upgrade_below], + bcachefs_metadata_required_upgrade_below); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 8d87f90a0ac6..b14da196e7fd 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -10,6 +10,71 @@ /* Snapshot tree: */ +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +static int snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); +} + +static struct bkey_i_snapshot_tree * +__snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : s_t; +} + +static int snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -90,11 +155,13 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); } else { t->parent = 0; t->children[0] = 0; t->children[1] = 0; t->subvol = 0; + t->tree = 0; } return 0; @@ -116,7 +183,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id) return 0; ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot node %u not found", id); if (ret) return ret; @@ -157,6 +224,274 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) } /* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + goto found; + } + } + ret = ret ?: -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, ENOENT)) { + struct bkey_i_subvolume *s; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, 
+ "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor(c, + le32_to_cpu(subvol.snapshot), + root_id), c, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. + */ +int bch2_fs_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot_tree(&trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, s->k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = bch2_bkey_make_mut_typed(trans, &root_iter, root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (s->k->p.snapshot == root_id) + *s = snapshot_i_to_s_c(u); + } + + if (s->k->p.snapshot != root_id) { + u = bch2_bkey_make_mut_typed(trans, 
iter, s->s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = snapshot_i_to_s_c(u); + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + static int check_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -177,7 +512,7 @@ static int check_snapshot(struct btree_trans *trans, id = le32_to_cpu(s.v->parent); if (id) { ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot with nonexistent parent:\n %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); if (ret) @@ -196,7 +531,7 @@ static int check_snapshot(struct btree_trans *trans, id = le32_to_cpu(s.v->children[i]); ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot node %llu has nonexistent child %u", s.k->p.offset, id); if (ret) @@ -216,7 +551,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.v->subvol); ret = bch2_subvolume_get(trans, id, 0, false, &subvol); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); if (ret) @@ -242,9 +577,23 @@ static int check_snapshot(struct btree_trans *trans, ret = bch2_trans_update(trans, iter, &u->k_i, 0); if (ret) goto err; + + s = snapshot_i_to_s_c(u); } } + ret = snapshot_tree_ptr_good(trans, s.k->p.offset, le32_to_cpu(s.v->tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, &s); + if (ret) + goto err; + } + ret = 0; + if (BCH_SNAPSHOT_DELETED(s.v)) set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); err: @@ -255,23 +604,18 @@ fsck_err: int bch2_fs_check_snapshots(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k)); - + check_snapshot(&trans, &iter, k))); if (ret) - bch_err(c, "error %i checking snapshots", ret); - - bch2_trans_exit(&trans); + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); return ret; } @@ -279,10 +623,11 @@ static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; struct bch_snapshot snapshot; unsigned snapid; - int ret; + int ret = 0; if (k.k->type != KEY_TYPE_subvolume) return 0; @@ -291,8 +636,8 @@ static int check_subvol(struct btree_trans *trans, snapid = le32_to_cpu(subvol.v->snapshot); ret = snapshot_lookup(trans, snapid, &snapshot); - if (ret == -ENOENT) - bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", k.k->p.offset, snapid); if (ret) return ret; @@ -300,30 +645,55 @@ static int check_subvol(struct btree_trans *trans, if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ret = bch2_subvolume_delete(trans, iter->pos.offset); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error deleting subvolume %llu: %s", + 
bch_err(c, "error deleting subvolume %llu: %s", iter->pos.offset, bch2_err_str(ret)); if (ret) return ret; } - return 0; + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { + u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); + u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree; + struct bch_snapshot_tree st; + + ret = snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, snapshot_tree); + + if (ret) + return ret; + + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = + bch2_bkey_make_mut_typed(trans, iter, subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, true); + } + } + +fsck_err: + return ret; } int bch2_fs_check_subvols(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter, k)); - - bch2_trans_exit(&trans); + check_subvol(&trans, &iter, k))); + if (ret) + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); return ret; } @@ -335,20 +705,15 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) int bch2_fs_snapshots_start(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + ret = bch2_trans_run(c, + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k)); - - bch2_trans_exit(&trans); - + bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k))); if (ret) bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); return ret; @@ -368,7 +733,8 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) 0, snapshot); ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); return ret; } @@ -388,6 +754,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; struct bkey_s_c_snapshot s; u32 parent_id; unsigned i; @@ -396,7 +763,8 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), BTREE_ITER_INTENT, snapshot); ret = bkey_err(s); - bch2_fs_inconsistent_on(ret == -ENOENT, c, "missing snapshot %u", id); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); if (ret) goto err; @@ -412,7 +780,8 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) 0, snapshot); ret = PTR_ERR_OR_ZERO(parent); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, c, "missing snapshot %u", parent_id); + 
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); goto err; } @@ -430,25 +799,49 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) le32_to_cpu(parent->v.children[1])) swap(parent->v.children[0], parent->v.children[1]); + } else { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = cpu_to_le32(s.v->children[0]); + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } } ret = bch2_btree_delete_at(trans, &iter, 0); err: + bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) { - struct btree_iter iter, parent_iter = { NULL }; + struct btree_iter iter; struct bkey_i_snapshot *n; struct bkey_s_c k; unsigned i; - int ret = 0; + int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT); @@ -476,7 +869,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.flags = 0; n->v.parent = cpu_to_le32(parent); n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.pad = 0; + n->v.tree = cpu_to_le32(tree); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, @@ -486,38 +879,92 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, new_snapids[i] = iter.pos.offset; } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - if (parent) { - n = bch2_bkey_get_mut_typed(trans, &parent_iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (unlikely(ret)) { - if (ret == -ENOENT) - bch_err(trans->c, "snapshot %u not found", parent); - goto err; - } +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; - if (n->v.children[0] || n->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } - n->v.children[0] = cpu_to_le32(new_snapids[0]); - n->v.children[1] = cpu_to_le32(new_snapids[1]); - n->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - ret = bch2_trans_update(trans, &parent_iter, &n->k_i, 0); - if (ret) - goto err; + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret 
= -EINVAL; + goto err; } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); err: - bch2_trans_iter_exit(trans, &parent_iter); bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + static int snapshot_delete_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -731,7 +1178,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), iter_flags, subvolume, s); - bch2_fs_inconsistent_on(ret == -ENOENT && inconsistent_if_not_found, + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && + inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); return ret; } @@ -785,7 +1233,8 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) BTREE_ITER_CACHED|BTREE_ITER_INTENT, subvolume); ret = bkey_err(subvol); - bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); if (ret) return ret; @@ -894,7 +1343,8 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); return ret; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 1f6f7862e48f..dcd9f5f95535 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,6 +5,16 @@ #include "darray.h" #include "subvolume_types.h" +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); @@ -28,6 +38,15 
@@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) return snapshot_t(c, id)->parent; } +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + while ((parent = bch2_snapshot_parent(c, id))) + id = parent; + return id; +} + static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) { return snapshot_t(c, id)->equiv; @@ -107,6 +126,7 @@ static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 i return ret; } +int bch2_fs_check_snapshot_trees(struct bch_fs *); int bch2_fs_check_snapshots(struct bch_fs *); int bch2_fs_check_subvols(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index aa49c45a35ab..c6c1cbad9781 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -10,6 +10,7 @@ struct snapshot_t { u32 parent; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; u32 equiv; }; -- cgit From 653693beea8ac93e57fc17afc7353bd158bcd5ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Mar 2023 11:18:59 -0400 Subject: bcachefs: Add otime, parent to bch_subvolume Add two new fields to bch_subvolume: - otime: creation time - parent: For snapshots, this is the id of the subvolume the snapshot was created from Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 8 ++++ fs/bcachefs/subvolume.c | 97 ++++++++++++++++++++++++++++++++++++------- fs/bcachefs/subvolume.h | 1 - 3 files changed, 89 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e9ac3aa6d91c..ad87cdff8544 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -254,6 +254,11 @@ struct bkey_packed { __u8 pad[sizeof(struct bkey) - 3]; } __packed __aligned(8); +typedef struct { + __le64 lo; + __le64 hi; +} bch_le128; + #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) #define BKEY_U64s_MAX U8_MAX #define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) @@ -1106,6 +1111,9 @@ struct bch_subvolume { __le32 flags; __le32 snapshot; __le64 inode; + __le32 parent; + __le32 pad; + bch_le128 otime; }; LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index b14da196e7fd..922360dec627 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -8,6 +8,8 @@ #include "fs.h" #include "subvolume.h" +static int bch2_subvolume_delete(struct btree_trans *, u32); + /* Snapshot tree: */ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, @@ -643,12 +645,13 @@ static int check_subvol(struct btree_trans *trans, return ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + bch2_fs_lazy_rw(c); + ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (ret) bch_err(c, "error deleting subvolume %llu: %s", iter->pos.offset, bch2_err_str(ret)); - if (ret) - return ret; + return ret ?: -BCH_ERR_transaction_restart_nested; } if (!BCH_SUBVOLUME_SNAP(subvol.v)) { @@ -1166,8 +1169,11 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); prt_printf(out, "root %llu snapshot id %u", - le64_to_cpu(s.v->inode), - le32_to_cpu(s.v->snapshot)); + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) + prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); } 
static __always_inline int @@ -1204,23 +1210,71 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, u32 *snapid) { - struct bch_subvolume s; + struct btree_iter iter; + struct bkey_s_c k; int ret; - ret = bch2_subvolume_get_inlined(trans, subvol, true, - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES, - &s); - if (!ret) - *snapid = le32_to_cpu(s.snapshot); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; + + if (likely(!ret)) + *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + else if (bch2_err_matches(ret, ENOENT)) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int bch2_subvolume_reparent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + u32 old_parent, u32 new_parent) +{ + struct bkey_i_subvolume *s; + int ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + return 0; + + s = bch2_bkey_make_mut_typed(trans, iter, k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.parent = cpu_to_le32(new_parent); + return 0; +} + +/* + * Scan for subvolumes with parent @subvolid_to_delete, reparent: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_subvolume s; + + return lockrestart_do(trans, + bch2_subvolume_get(trans, subvolid_to_delete, true, + BTREE_ITER_CACHED, &s)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_reparent(trans, &iter, k, + subvolid_to_delete, le32_to_cpu(s.parent))); +} + /* * Delete subvolume, mark snapshot ID as deleted, queue up snapshot * deletion/cleanup: */ -int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; struct bkey_s_c_subvolume subvol; @@ -1260,6 +1314,13 @@ err: return ret; } +static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + return bch2_subvolumes_reparent(trans, subvolid) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_subvolume_delete(trans, subvolid)); +} + void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, @@ -1280,8 +1341,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); for (id = s.data; id < s.data + s.nr; id++) { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_subvolume_delete(&trans, *id)); + ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); if (ret) { bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); break; @@ -1359,6 +1419,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, u32 *new_snapshotid, bool ro) { + struct bch_fs *c = trans->c; struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; @@ -1383,7 +1444,7 @@ int bch2_subvolume_create(struct btree_trans *trans, 
u64 inode, BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, + bch2_fs_inconsistent_on(ret == -ENOENT, c, "subvolume %u not found", src_subvolid); goto err; } @@ -1412,6 +1473,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, new_subvol->v.flags = 0; new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index dcd9f5f95535..1ee4562198a6 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -156,7 +156,6 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); -int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32 *, u32 *, bool); -- cgit From cb1b479dc1c78d1d224e4aa6aba212a7bd3263a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Apr 2023 03:50:57 -0400 Subject: bcachefs: Fix quotas + snapshots Now that we can reliably designate and find the master subvolume out of a tree of snapshots, we can finally make quotas work with snapshots: That is - quotas will now _ignore_ snapshot subvolumes, and only be in effect for the master (non snapshot) subvolume. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 ++++++- fs/bcachefs/quota.c | 25 ++++++++++++++----------- fs/bcachefs/subvolume.c | 10 +++++----- fs/bcachefs/subvolume.h | 2 ++ 4 files changed, 27 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ea5039254609..64897cee8494 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -333,6 +333,9 @@ static int bch2_quota_reservation_add(struct bch_fs *c, { int ret; + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); @@ -414,7 +417,9 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && sectors > 0) { + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { BUG_ON(sectors > quota_res->sectors); BUG_ON(sectors > inode->ei_quota_reserved); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7734e0dfe523..310eb9d26571 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "errcode.h" +#include "error.h" #include "inode.h" #include "quota.h" #include "subvolume.h" @@ -556,23 +557,25 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_inode_unpacked u; - struct bch_subvolume subvolume; + struct bch_snapshot_tree s_t; int ret; - ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + ret = bch2_snapshot_tree_lookup(trans, + snapshot_t(c, k.k->p.snapshot)->tree, &s_t); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, + snapshot_t(c, k.k->p.snapshot)->tree); if (ret) return ret; - /* - * We don't do quota accounting in snapshots: - */ - if (BCH_SUBVOLUME_SNAP(&subvolume)) + if (!s_t.master_subvol) goto advance; - if (!bkey_is_inode(k.k)) - goto advance; - - ret = bch2_inode_unpack(k, &u); + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { + le32_to_cpu(s_t.master_subvol), + k.k->p.offset, + }, &u); if (ret) return ret; @@ -581,7 +584,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); return 0; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 922360dec627..388fa12bbd8b 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -34,8 +34,8 @@ int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } -static int snapshot_tree_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot_tree *s) +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) { return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), BTREE_ITER_WITH_UPDATES, snapshot_tree, s); @@ -426,7 +426,7 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, u32 snap_id, u32 tree_id) { struct bch_snapshot_tree s_t; - int ret = snapshot_tree_lookup(trans, tree_id, &s_t); + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); if (bch2_err_matches(ret, ENOENT)) return 0; @@ -462,7 +462,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, tree_id = le32_to_cpu(root.v->tree); - ret = snapshot_tree_lookup(trans, tree_id, &s_t); + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -659,7 +659,7 @@ static int check_subvol(struct btree_trans *trans, u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree; struct bch_snapshot_tree st; - ret = snapshot_tree_lookup(trans, snapshot_tree, &st); + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "%s: snapshot tree %u not found", __func__, snapshot_tree); diff --git a/fs/bcachefs/subvolume.h 
b/fs/bcachefs/subvolume.h index 1ee4562198a6..1a39f713db87 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -15,6 +15,8 @@ int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, .min_val_size = 8, \ }) +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -- cgit From 38e3d93fa1da7e3f0bc61b240a65cee7fb024400 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 May 2023 18:22:12 -0400 Subject: bcachefs: Improved comment for bch2_replicas_gc2() Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 8935ff5899c9..8ae50dfd8c8c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -550,8 +550,14 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } -/* New much simpler mechanism for clearing out unneeded replicas entries: */ - +/* + * New much simpler mechanism for clearing out unneeded replicas entries - drop + * replicas entries that have 0 sectors used. + * + * However, we don't track sector counts for journal usage, so this doesn't drop + * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism + * is retained for that. + */ int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; -- cgit From a7b29b8d9a17297499a409274e75d674e7930ff9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 4 May 2023 12:44:15 -0400 Subject: bcachefs: mark journal replicas before journal write submission The journal write submission path marks the associated replica entries for journal data in journal_write_done(), which is just after journal write bio submission. This creates a small window where journal entries might have been written out, but the associated replica is not marked such that recovery does not know that the associated device contains journal data. Move the replica marking a bit earlier in the write path such that recovery is guaranteed to recognize that the device contains journal data in the event of a crash. 
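Schematically, the ordering change is the following (a minimal sketch; submit_journal_bios() is a hypothetical stand-in for the actual bio submission code, while the replicas calls are the ones used in the patch below):

    /* mark the journal replicas entry first, while the write can still be aborted */
    bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written);
    ret = bch2_mark_replicas(c, &replicas.e);
    if (ret)
            goto err;

    /* only then is the journal write actually issued */
    submit_journal_bios(w);

With this ordering, any journal entry that makes it to disk is already covered by a marked replicas entry when recovery runs.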
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 38458ab0013d..ede9d198bb85 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1462,7 +1462,6 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; int err = 0; @@ -1474,13 +1473,7 @@ static void journal_write_done(struct closure *cl) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; } - if (err) bch2_fatal_error(c); @@ -1672,6 +1665,7 @@ void bch2_journal_write(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; struct jset_entry *start, *end; struct jset *jset; struct bio *bio; @@ -1822,9 +1816,7 @@ retry_alloc: bch_err(c, "Unable to allocate journal write:\n%s", journal_debug_buf.buf); printbuf_exit(&journal_debug_buf); - bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); - return; + goto err; } w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1838,6 +1830,16 @@ retry_alloc: if (nr_rw_members > 1) w->separate_flush = true; + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. + */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { for_each_rw_member(ca, c, i) { percpu_ref_get(&ca->io_ref); -- cgit From 92e637cef4fc9380363b425de740827d7c492219 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 May 2023 14:23:08 -0400 Subject: bcachefs: Delete some dead code in bch2_replicas_gc_end() bch2_replicas_gc_(start|end) is now only used for journal replicas entries, which don't have bucket sector counts - so this code is entirely dead and can be deleted. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 8ae50dfd8c8c..76efbfce7683 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -460,36 +460,11 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c, int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - unsigned i; - lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); - /* - * this is kind of crappy; the replicas gc mechanism needs to be ripped - * out - */ - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - struct bch_replicas_cpu n; - - if (!__replicas_has_entry(&c->replicas_gc, e) && - bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { - n = cpu_replicas_add_entry(&c->replicas_gc, e); - if (!n.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; - goto err; - } - - swap(n, c->replicas_gc); - kfree(n.entries); - } - } - ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); if (ret) goto err; -- cgit From 4a2e5d7ba5b8208ea5a20eeb274b2b0333ab5dcf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 May 2023 20:28:54 -0400 Subject: bcachefs: Replace a BUG_ON() with fatal error A user hit this BUG_ON() - it's unclear how it happened, so replace it with a fatal error that will cause us to go read only, and print out more information. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ede9d198bb85..b455ef041dfe 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1743,7 +1743,16 @@ void bch2_journal_write(struct closure *cl) BUG_ON(u64s > j->entry_u64s_reserved); le32_add_cpu(&jset->u64s, u64s); - BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + + sectors = vstruct_sectors(jset, c->block_bits); + bytes = vstruct_bytes(jset); + + if (sectors > w->sectors) { + bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + vstruct_bytes(jset), w->sectors << 9, + u64s, w->u64s_reserved, j->entry_u64s_reserved); + goto err; + } jset->magic = cpu_to_le64(jset_magic(c)); jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber @@ -1780,10 +1789,6 @@ void bch2_journal_write(struct closure *cl) jset_validate(c, NULL, jset, 0, WRITE)) goto err; - sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > w->sectors); - - bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); retry_alloc: -- cgit From 73da30e8e0f8ffcc91691934f202ab6e2f985604 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 May 2023 00:11:14 -0400 Subject: bcachefs: Fix check_overlapping_extents() A error check had a flipped conditional - whoops. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eb3609aa4593..1b3ee66265c9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1176,7 +1176,7 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; bkey_reassemble(update, k); ret = bch2_trans_update_extent(trans, iter, update, 0); - if (!ret) + if (ret) goto err; } } -- cgit From d598a9b7e27158d3b6972077e2f7296f279c2e8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 May 2023 17:21:55 -0400 Subject: bcachefs: Use memcpy_u64s_small() for copying keys Small performance optimization; an open coded loop is better than rep ; movsq for small copies. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey_sort.c | 2 +- fs/bcachefs/bset.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abb444192749..ee7ba700e75f 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -360,7 +360,7 @@ bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, memmove_u64s((u64 *) out + format->key_u64s, &in->v, bkey_val_u64s(&in->k)); - memcpy_u64s(out, &tmp, format->key_u64s); + memcpy_u64s_small(out, &tmp, format->key_u64s); return true; } diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index cdef41db7692..b9aa027c881b 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -188,7 +188,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, } if (bkey_deleted(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { bkey_copy(out, in); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index a4c06e856c2e..4d55011551e0 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1022,7 +1022,7 @@ void bch2_bset_insert(struct btree *b, set_btree_bset_end(b, t); } - memcpy_u64s(where, src, + memcpy_u64s_small(where, src, bkeyp_key_u64s(f, src)); memcpy_u64s(bkeyp_val(f, where), &insert->v, bkeyp_val_u64s(f, src)); -- cgit From a49bd8c007e4f4840f8c4d7fe7d62c7bdc7fffca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 May 2023 23:01:14 -0400 Subject: bcachefs: Delete an incorrect bch2_trans_unlock() These deletes a bch2_trans_unlock() call from __bch2_move_data(). It was redundant; bch2_move_extent() has the correct unlock call, and it was buggy because when move_extent calls bch2_extent_drop_ptrs() we don't want the transaction to be unlocked yet - this fixes a btree_iter.c assertion. Fixes https://github.com/koverstreet/bcachefs/issues/511. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7e22176a5c7e..2ec30a3fd193 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -583,7 +583,6 @@ static int __bch2_move_data(struct moving_context *ctxt, */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, io_opts, btree_id, k, data_opts); -- cgit From faa62a2036a491a919deffd980abc867be51b6f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 02:20:28 -0400 Subject: bcachefs: alloc_v4_u64s() fix With the recent bkey_ops.min_val_size addition, bkey values are automatically extended to the size of the current version. 
The check in bch2_alloc_v4_invalid() needs to be updated to take this into account. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index dcdef3bcd4c4..f774a660a681 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -269,9 +269,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int rw = flags & WRITE; - if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%lu != %u)", - bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); return -BCH_ERR_invalid_bkey; } -- cgit From 0b438c5bfaebda3fdf6edc35d9572d4e2f66aef1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 May 2023 00:49:06 -0400 Subject: bcachefs: Clear btree_node_just_written() when node reused or evicted This fixes the following bug: Journal reclaim attempts to flush a node, but races with the node being evicted from the btree node cache; when we lock the node, the data buffers have already been freed. We don't evict a node that's dirty, so calling btree_node_write() is fine - it's a noop - except that the btree_node_just_written bit causes bch2_btree_post_write_cleanup() to run (resorting the node), which then causes a null ptr deref. 00078 Unable to handle kernel NULL pointer dereference at virtual address 000000000000009e 00078 Mem abort info: 00078 ESR = 0x0000000096000005 00078 EC = 0x25: DABT (current EL), IL = 32 bits 00078 SET = 0, FnV = 0 00078 EA = 0, S1PTW = 0 00078 FSC = 0x05: level 1 translation fault 00078 Data abort info: 00078 ISV = 0, ISS = 0x00000005 00078 CM = 0, WnR = 0 00078 user pgtable: 4k pages, 39-bit VAs, pgdp=000000007ed64000 00078 [000000000000009e] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 00078 Internal error: Oops: 0000000096000005 [#1] SMP 00078 Modules linked in: 00078 CPU: 75 PID: 1170 Comm: stress-ng-utime Not tainted 6.3.0-ktest-g5ef5b466e77e #2078 00078 Hardware name: linux,dummy-virt (DT) 00078 pstate: 80001005 (Nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00078 pc : btree_node_sort+0xc4/0x568 00078 lr : bch2_btree_post_write_cleanup+0x6c/0x1c0 00078 sp : ffffff803e30b350 00078 x29: ffffff803e30b350 x28: 0000000000000001 x27: ffffff80076e52a8 00078 x26: 0000000000000002 x25: 0000000000000000 x24: ffffffc00912e000 00078 x23: ffffff80076e52a8 x22: 0000000000000000 x21: ffffff80076e52bc 00078 x20: ffffff80076e5200 x19: 0000000000000000 x18: 0000000000000000 00078 x17: fffffffff8000000 x16: 0000000008000000 x15: 0000000008000000 00078 x14: 0000000000000002 x13: 0000000000000000 x12: 00000000000000a0 00078 x11: ffffff803e30b400 x10: ffffff803e30b408 x9 : 0000000000000001 00078 x8 : 0000000000000000 x7 : ffffff803e480000 x6 : 00000000000000a0 00078 x5 : 0000000000000088 x4 : 0000000000000000 x3 : 0000000000000010 00078 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff80076e52a8 00078 Call trace: 00078 btree_node_sort+0xc4/0x568 00078 bch2_btree_post_write_cleanup+0x6c/0x1c0 00078 bch2_btree_node_write+0x108/0x148 00078 __btree_node_flush+0x104/0x160 00078 bch2_btree_node_flush0+0x1c/0x30 00078 journal_flush_pins.constprop.0+0x184/0x2d0 00078 __bch2_journal_reclaim+0x4d4/0x508 00078 bch2_journal_reclaim+0x1c/0x30 00078 
__bch2_journal_preres_get+0x244/0x268 00078 bch2_trans_journal_preres_get_cold+0xa4/0x180 00078 __bch2_trans_commit+0x61c/0x1bb0 00078 bch2_setattr_nonsize+0x254/0x318 00078 bch2_setattr+0x5c/0x78 00078 notify_change+0x2bc/0x408 00078 vfs_utimes+0x11c/0x218 00078 do_utimes+0x84/0x140 00078 __arm64_sys_utimensat+0x68/0xa8 00078 invoke_syscall.constprop.0+0x54/0xf0 00078 do_el0_svc+0x48/0xd8 00078 el0_svc+0x14/0x48 00078 el0t_64_sync_handler+0xb0/0xb8 00078 el0t_64_sync+0x14c/0x150 00078 Code: 8b050265 910020c6 8b060266 910060ac (79402cad) 00078 ---[ end trace 0000000000000000 ]--- Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 46a8a29ddef7..76e08f2f6689 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -55,6 +55,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) EBUG_ON(btree_node_write_in_flight(b)); + clear_btree_node_just_written(b); + kvpfree(b->data, btree_bytes(c)); b->data = NULL; #ifdef __KERNEL__ @@ -648,6 +650,7 @@ err: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); bch2_btree_node_hash_remove(bc, b2); if (b) { -- cgit From 962210b281b327b236215c736b9f648369f0d39d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 May 2023 14:39:44 -0400 Subject: bcachefs: Fix a buffer overrun in bch2_fs_usage_read() We were copying the size of a struct bch_fs_usage_online to a struct bch_fs_usage, which is 8 bytes smaller. This adds some new helpers so we can do this correctly, and get rid of some magic +1s too. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 16 +++++++++------- fs/bcachefs/buckets.h | 18 ++++++++++++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bce42eef6f57..bd144182c1e1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -137,17 +137,17 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1; + unsigned nr_replicas = READ_ONCE(c->replicas.nr); + unsigned seq, i; retry: - ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) return NULL; percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c) + 1; - if (unlikely(u64s != v)) { - u64s = v; + if (nr_replicas != c->replicas.nr) { + nr_replicas = c->replicas.nr; percpu_up_read(&c->mark_lock); kfree(ret); goto retry; @@ -157,10 +157,12 @@ retry: do { seq = read_seqcount_begin(&c->usage_lock); - unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64), + unsafe_memcpy(&ret->u, c->usage_base, + __fs_usage_u64s(nr_replicas) * sizeof(u64), "embedded variable length struct"); for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], + __fs_usage_u64s(nr_replicas)); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d677b0225c52..bdf4fff9cb8a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -207,10 +207,24 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* 
Filesystem usage: */ +static inline unsigned __fs_usage_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; +} + static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + - READ_ONCE(c->replicas.nr); + return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_online_u64s(struct bch_fs *c) +{ + return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); } static inline unsigned dev_usage_u64s(void) -- cgit From f375d6ca58d5f28b9c0a3af449a0dd640ddcc6a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 18:55:07 -0400 Subject: bcachefs: Don't call local_clock() twice in trans_begin() local_clock() is not as cheap as we'd like it to be, alas Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0a62f55a3aa8..d906bfb6754d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2850,6 +2850,7 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + u64 now; bch2_trans_reset_updates(trans); @@ -2878,13 +2879,16 @@ u32 bch2_trans_begin(struct btree_trans *trans) path->preserve = false; } + now = local_clock(); if (!trans->restarted && (need_resched() || - local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { bch2_trans_unlock(trans); cond_resched(); bch2_trans_relock(trans); + now = local_clock(); } + trans->last_begin_time = now; if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) bch2_trans_reset_srcu_lock(trans); @@ -2895,7 +2899,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->notrace_relock_fail = false; } - trans->last_begin_time = local_clock(); return trans->restart_count; } -- cgit From 01bf56a9771466147d94a013bc5678d0ed1b1382 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 20:40:08 -0400 Subject: six locks: six_lock_readers_add() This moves a helper out of the bcachefs code that shouldn't have been there, since it touches six lock internals. 
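six_lock_readers_add() adjusts a lock's reader count directly, either in the percpu read counters or in the atomic state word. A conceptual caller (hypothetical, for illustration only) looks like:

    /* temporarily forget the read locks this path already holds ... */
    six_lock_readers_add(&b->c.lock, -readers_held);

    /* ... take the write lock without waiting on our own readers ... */

    /* ... and restore the count afterwards */
    six_lock_readers_add(&b->c.lock, readers_held);

Because this pokes directly at the lock's internal counters, it belongs with the six lock implementation rather than in btree_locking.c.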
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 10 ---------- fs/bcachefs/six.c | 13 +++++++++++++ fs/bcachefs/six.h | 3 +-- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 9e097ab668a6..d7b0c4436caf 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -24,16 +24,6 @@ void bch2_assert_btree_nodes_not_locked(void) /* Btree node locking: */ -static inline void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (lock->readers) - this_cpu_add(*lock->readers, nr); - else if (nr > 0) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); -} - struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, struct btree_bkey_cached_common *b, diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b54a2ac480c8..0f9e1bf31008 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -40,6 +40,8 @@ struct six_lock_vals { enum six_lock_type unlock_wakeup; }; +#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) + #define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) #define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) #define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) @@ -847,3 +849,14 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) return ret; } EXPORT_SYMBOL_GPL(six_lock_counts); + +void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) + this_cpu_add(*lock->readers, nr); + else if (nr > 0) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); +} +EXPORT_SYMBOL_GPL(six_lock_readers_add); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 09abea29a021..6b53818ae97a 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -152,8 +152,6 @@ do { \ __six_lock_init((lock), #lock, &__key); \ } while (0) -#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) - #define __SIX_LOCK(type) \ bool six_trylock_ip_##type(struct six_lock *, unsigned long); \ bool six_relock_ip_##type(struct six_lock *, u32, unsigned long); \ @@ -258,5 +256,6 @@ struct six_lock_count { }; struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); #endif /* _LINUX_SIX_H */ -- cgit From 0d2234a79e877b1bfa71b2c8c712a155be419827 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 20:57:55 -0400 Subject: six locks: Kill six_lock_pcpu_(alloc|free) six_lock_pcpu_alloc() is an unsafe interface: it's not safe to allocate or free the percpu reader count on an existing lock that's in use, the only safe time to allocate percpu readers is when the lock is first being initialized. This patch adds a flags parameter to six_lock_init(), and instead of six_lock_pcpu_free() we now expose six_lock_exit(), which does the same thing but is less likely to be misused. 
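In caller terms the change looks roughly like this (a sketch of the API shapes only; the surrounding code is illustrative):

    /* before: percpu readers bolted on after initialization */
    six_lock_init(&b->c.lock);
    six_lock_pcpu_alloc(&b->c.lock);        /* unsafe once the lock is in use */
    ...
    six_lock_pcpu_free(&b->c.lock);

    /* after: percpu mode is chosen once, when the lock is initialized */
    six_lock_init(&b->c.lock, SIX_LOCK_INIT_PCPU);
    ...
    six_lock_exit(&b->c.lock);

Callers that don't want percpu readers pass 0 for the flags, as the btree cache and key cache conversions below do.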
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 +++---- fs/bcachefs/btree_key_cache.c | 13 ++++------- fs/bcachefs/btree_locking.c | 5 ++-- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/six.c | 53 +++++++++++++++++++++++++++---------------- fs/bcachefs/six.h | 27 ++++++++-------------- 6 files changed, 56 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 76e08f2f6689..5801f4ff9097 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -121,7 +121,6 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - bch2_btree_lock_init(&b->c); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); @@ -142,6 +141,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } + bch2_btree_lock_init(&b->c, 0); + bc->used++; list_add(&b->list, &bc->freeable); return b; @@ -435,7 +436,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) while (!list_empty(&bc->freed_nonpcpu)) { b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); - six_lock_pcpu_free(&b->c.lock); + six_lock_exit(&b->c.lock); kfree(b); } @@ -595,8 +596,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea mutex_lock(&bc->lock); } - if (pcpu_read_locks) - six_lock_pcpu_alloc(&b->c.lock); + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 727ea2d0e58d..9725d85b99b3 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -282,9 +282,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return NULL; init: INIT_LIST_HEAD(&ck->list); - bch2_btree_lock_init(&ck->c); - if (pcpu_readers) - six_lock_pcpu_alloc(&ck->c.lock); + bch2_btree_lock_init(&ck->c, pcpu_readers ? 
SIX_LOCK_INIT_PCPU : 0); ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); @@ -340,9 +338,6 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) } mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); - } else { - if (path->btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); } ck->c.level = 0; @@ -871,7 +866,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -887,7 +882,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -1012,7 +1007,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d7b0c4436caf..6e1306add443 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -6,9 +6,10 @@ static struct lock_class_key bch2_btree_node_lock_key; -void bch2_btree_lock_init(struct btree_bkey_cached_common *b) +void bch2_btree_lock_init(struct btree_bkey_cached_common *b, + enum six_lock_init_flags flags) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); lockdep_set_novalidate_class(&b->lock); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 76aac49966fe..660975839c89 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,7 +13,7 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); #ifdef CONFIG_LOCKDEP void bch2_assert_btree_nodes_not_locked(void); diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 0f9e1bf31008..f75387b9da88 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -814,25 +814,6 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -void six_lock_pcpu_free(struct six_lock *lock) -{ - BUG_ON(lock->readers && pcpu_read_count(lock)); - BUG_ON(lock->state.read_lock); - - free_percpu(lock->readers); - lock->readers = NULL; -} -EXPORT_SYMBOL_GPL(six_lock_pcpu_free); - -void six_lock_pcpu_alloc(struct six_lock *lock) -{ -#ifdef __KERNEL__ - if (!lock->readers) - lock->readers = alloc_percpu(unsigned); -#endif -} -EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); - /* * Returns lock held counts, for both read and intent */ @@ -860,3 +841,37 @@ void six_lock_readers_add(struct six_lock *lock, int nr) atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); } EXPORT_SYMBOL_GPL(six_lock_readers_add); + +void six_lock_exit(struct six_lock *lock) +{ + WARN_ON(lock->readers && pcpu_read_count(lock)); + WARN_ON(lock->state.read_lock); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_exit); + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags) +{ + atomic64_set(&lock->state.counter, 0); + raw_spin_lock_init(&lock->wait_lock); + 
INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + + if (flags & SIX_LOCK_INIT_PCPU) { + /* + * We don't return an error here on memory allocation failure + * since percpu is an optimization, and locks will work with the + * same semantics in non-percpu mode: callers can check for + * failure if they wish by checking lock->readers, but generally + * will not want to treat it as an error. + */ + lock->readers = alloc_percpu(unsigned); + } +} +EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 6b53818ae97a..2c8424bd7d2f 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -132,24 +132,20 @@ struct six_lock_waiter { typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -static __always_inline void __six_lock_init(struct six_lock *lock, - const char *name, - struct lock_class_key *key) -{ - atomic64_set(&lock->state.counter, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} +void six_lock_exit(struct six_lock *lock); -#define six_lock_init(lock) \ +enum six_lock_init_flags { + SIX_LOCK_INIT_PCPU = 1U << 0, +}; + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags); + +#define six_lock_init(lock, flags) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key); \ + __six_lock_init((lock), #lock, &__key, flags); \ } while (0) #define __SIX_LOCK(type) \ @@ -248,9 +244,6 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free(struct six_lock *); -void six_lock_pcpu_alloc(struct six_lock *); - struct six_lock_count { unsigned n[3]; }; -- cgit From 0157f9c5a7c77b1cb89756351929dba4b28d5f75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 May 2023 16:38:09 -0400 Subject: six locks: Remove hacks for percpu mode lost wakeup The lost wakeup bug hasn't been observed in awhile, and we're trying to provoke it and determine if it still exists. This patch removes some defenses that were added to attempt to track it down; if it still exists, this should make it easier to see it. 
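For reference, a condensed, illustrative sketch of the percpu-reader trylock failure path this touches (trylock_read_percpu_sketch() is not a real function, and field/helper use is abbreviated from the real __do_six_trylock_type()): with the extra defense removed, a failed read trylock only signals a writer wakeup when a writer was actually observed mid-acquisition, rather than unconditionally on every failure.

static int trylock_read_percpu_sketch(struct six_lock *lock)
{
	union six_lock_state old;
	int ret;

	preempt_disable();
	this_cpu_inc(*lock->readers);		/* optimistic read acquisition */
	smp_mb();

	old.v = READ_ONCE(lock->state.v);
	ret = !(old.v & __SIX_LOCK_HELD_write) && !old.write_locking;

	this_cpu_sub(*lock->readers, !ret);	/* back out on failure */
	preempt_enable();

	/*
	 * Our percpu increment may have made a concurrent writer's
	 * pcpu_read_count() check fail spuriously; after this patch the
	 * caller is asked to issue a wakeup only when a writer really was
	 * write_locking, not on every failed trylock:
	 */
	if (!ret && old.write_locking)
		return -1 - SIX_LOCK_write;

	return ret;
}
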
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index f75387b9da88..32ad545ba570 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -144,17 +144,8 @@ static int __do_six_trylock_type(struct six_lock *lock, * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ -#if 0 - /* - * This code should be sufficient, but we're seeing unexplained - * lost wakeups: - */ if (old.write_locking) ret = -1 - SIX_LOCK_write; -#else - if (!ret) - ret = -1 - SIX_LOCK_write; -#endif } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), @@ -332,7 +323,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, */ if (ret) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - else + else if (old.write_locking) six_lock_wakeup(lock, old, SIX_LOCK_write); return ret; -- cgit From d2c86b77de5894bbe26ecbf5214227f61855aed7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 20:37:53 -0400 Subject: six locks: Centralize setting of waiting bit Originally, the waiting bit was always set by trylock() on failure: however, it's now set by __six_lock_type_slowpath(), with wait_lock held - which is the more correct place to do it. That made setting the waiting bit in trylock redundant, so this patch deletes that. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 32ad545ba570..d8f1d20f5ece 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -151,14 +151,6 @@ static int __do_six_trylock_type(struct six_lock *lock, atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); smp_mb__after_atomic(); - } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { - atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), - &lock->state.counter); - /* - * pairs with barrier after unlock and before checking - * for readers in unlock path - */ - smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); @@ -190,10 +182,9 @@ static int __do_six_trylock_type(struct six_lock *lock, if (type == SIX_LOCK_write) new.write_locking = 0; - } else if (!try && !(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ + } else { + break; + } } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, old.v, new.v)) != old.v); -- cgit From c4bd3491b1c0b335f63599ec96d1d4ab0d37a3c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 21:44:30 -0400 Subject: six locks: Simplify dispatch Originally, we used inlining/flattening to cause the compiler to generate different versions of lock/trylock/relock/unlock for each lock type - read, intent, and write. This made the individual functions smaller and let the compiler eliminate table lookups: however, as the code has gotten more complicated these optimizations have gotten less worthwhile, and all the tricky inlining and dispatching made the code less readable. Text size: 11015 bytes -> 7467 bytes, and benchmarks show no loss of performance. 
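To illustrate the new shape (a condensed sketch showing only the trylock entry point; the real __SIX_LOCK() macro in six.h also generates the relock, lock, lock_waiter and unlock wrappers): the lock type becomes a runtime argument to a single out-of-line function, and the per-type entry points collapse into trivial static inline wrappers.

bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type,
			 unsigned long ip);

#define __SIX_LOCK(type)						\
static inline bool six_trylock_##type(struct six_lock *lock)		\
{									\
	return six_trylock_ip_type(lock, SIX_LOCK_##type, _THIS_IP_);	\
}

__SIX_LOCK(read)
__SIX_LOCK(intent)
__SIX_LOCK(write)
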
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 86 +++++++------------------------ fs/bcachefs/six.h | 148 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 106 insertions(+), 128 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index d8f1d20f5ece..1a64b8a027a7 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -245,9 +245,10 @@ unlock: } } -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - enum six_lock_type lock_type) +__always_inline +static void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + enum six_lock_type lock_type) { if (lock_type == SIX_LOCK_write && state.read_lock) return; @@ -258,6 +259,7 @@ static inline void six_lock_wakeup(struct six_lock *lock, __six_lock_wakeup(lock, lock_type); } +__always_inline static bool do_six_trylock_type(struct six_lock *lock, enum six_lock_type type, bool try) @@ -271,9 +273,8 @@ static bool do_six_trylock_type(struct six_lock *lock, return ret > 0; } -__always_inline __flatten -static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type, - unsigned long ip) +bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip) { if (!do_six_trylock_type(lock, type, true)) return false; @@ -283,9 +284,8 @@ static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type, return true; } -__always_inline __flatten -static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) +bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; @@ -335,6 +335,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } +EXPORT_SYMBOL_GPL(six_relock_ip_type); #ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER @@ -566,11 +567,10 @@ out: return ret; } -__always_inline __flatten -static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) +int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { int ret; @@ -589,18 +589,9 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type return ret; } +EXPORT_SYMBOL_GPL(six_lock_type_ip_waiter); __always_inline -static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct six_lock_waiter wait; - - return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip); -} - -__always_inline __flatten static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; @@ -628,9 +619,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) six_lock_wakeup(lock, state, l[type].unlock_wakeup); } -__always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type, - unsigned long ip) +void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); @@ -649,48 +638,7 @@ static void 
__six_unlock_type(struct six_lock *lock, enum six_lock_type type, do_six_unlock_type(lock, type); } - -#define __SIX_LOCK(type) \ -bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip) \ -{ \ - return __six_trylock_type(lock, SIX_LOCK_##type, ip); \ -} \ -EXPORT_SYMBOL_GPL(six_trylock_ip_##type); \ - \ -bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -{ \ - return __six_relock_type(lock, SIX_LOCK_##type, seq, ip); \ -} \ -EXPORT_SYMBOL_GPL(six_relock_ip_##type); \ - \ -int six_lock_ip_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p, \ - unsigned long ip) \ -{ \ - return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -} \ -EXPORT_SYMBOL_GPL(six_lock_ip_##type); \ - \ -int six_lock_ip_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p,\ - unsigned long ip) \ -{ \ - return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -} \ -EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type); \ - \ -void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -{ \ - __six_unlock_type(lock, SIX_LOCK_##type, ip); \ -} \ -EXPORT_SYMBOL_GPL(six_unlock_ip_##type); - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) - -#undef __SIX_LOCK +EXPORT_SYMBOL_GPL(six_unlock_ip_type); /* Convert from intent to read: */ void six_lock_downgrade(struct six_lock *lock) diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 2c8424bd7d2f..5ddabbfb8aba 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -148,37 +148,116 @@ do { \ __six_lock_init((lock), #lock, &__key, flags); \ } while (0) +bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip); + +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return six_trylock_ip_type(lock, type, _THIS_IP_); +} + +int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); + +static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + return six_lock_type_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); +} + +static inline int six_lock_ip_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct six_lock_waiter wait; + + return six_lock_type_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); +} + +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return six_lock_type_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); +} + +bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); + +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + return six_relock_ip_type(lock, type, seq, _THIS_IP_); +} + +void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + six_unlock_ip_type(lock, type, _THIS_IP_); +} + #define __SIX_LOCK(type) \ -bool six_trylock_ip_##type(struct six_lock *, unsigned long); \ -bool six_relock_ip_##type(struct 
six_lock *, u32, unsigned long); \ -int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn, \ - void *, unsigned long); \ -int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\ - six_lock_should_sleep_fn, void *, unsigned long);\ -void six_unlock_ip_##type(struct six_lock *, unsigned long); \ +static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ +{ \ + return six_trylock_ip_type(lock, SIX_LOCK_##type, ip); \ +} \ \ static inline bool six_trylock_##type(struct six_lock *lock) \ { \ - return six_trylock_ip_##type(lock, _THIS_IP_); \ + return six_trylock_ip_type(lock, SIX_LOCK_##type, _THIS_IP_); \ +} \ + \ +static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ +{ \ + return six_lock_type_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ +} \ + \ +static inline int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ +{ \ + return six_lock_ip_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ +} \ + \ +static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ +{ \ + return six_relock_ip_type(lock, SIX_LOCK_##type, seq, ip); \ } \ + \ static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ { \ - return six_relock_ip_##type(lock, seq, _THIS_IP_); \ + return six_relock_ip_type(lock, SIX_LOCK_##type, seq, _THIS_IP_);\ } \ + \ static inline int six_lock_##type(struct six_lock *lock, \ six_lock_should_sleep_fn fn, void *p)\ { \ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ } \ + \ static inline int six_lock_waiter_##type(struct six_lock *lock, \ struct six_lock_waiter *wait, \ six_lock_should_sleep_fn fn, void *p) \ { \ return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \ } \ + \ +static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ +{ \ + six_unlock_ip_type(lock, SIX_LOCK_##type, ip); \ +} \ + \ static inline void six_unlock_##type(struct six_lock *lock) \ { \ - return six_unlock_ip_##type(lock, _THIS_IP_); \ + six_unlock_ip_type(lock, SIX_LOCK_##type, _THIS_IP_); \ } __SIX_LOCK(read) @@ -186,55 +265,6 @@ __SIX_LOCK(intent) __SIX_LOCK(write) #undef __SIX_LOCK -#define SIX_LOCK_DISPATCH(type, fn, ...) 
\ - switch (type) { \ - case SIX_LOCK_read: \ - return fn##_read(__VA_ARGS__); \ - case SIX_LOCK_intent: \ - return fn##_intent(__VA_ARGS__); \ - case SIX_LOCK_write: \ - return fn##_write(__VA_ARGS__); \ - default: \ - BUG(); \ - } - -static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - SIX_LOCK_DISPATCH(type, six_trylock, lock); -} - -static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - SIX_LOCK_DISPATCH(type, six_relock, lock, seq); -} - -static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); -} - -static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip); -} - -static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); -} - -static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - SIX_LOCK_DISPATCH(type, six_unlock, lock); -} - void six_lock_downgrade(struct six_lock *); bool six_lock_tryupgrade(struct six_lock *); bool six_trylock_convert(struct six_lock *, enum six_lock_type, -- cgit From 1fb4fe63178881a0ac043a5c05288d9fff85d6b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 May 2023 23:57:48 -0400 Subject: six locks: Kill six_lock_state union As suggested by Linus, this drops the six_lock_state union in favor of raw bitmasks. On the one hand, bitfields give more type-level structure to the code. However, a significant amount of the code was working with six_lock_state as a u64/atomic64_t, and the conversions from the bitfields to the u64 were deemed a bit too out-there. More significantly, because bitfield order is poorly defined (#ifdef __LITTLE_ENDIAN_BITFIELD can be used, but is gross), incrementing the sequence number would overflow into the rest of the bitfield if the compiler didn't put the sequence number at the high end of the word. The new code is a bit saner when we're on an architecture without real atomic64_t support - all accesses to lock->state now go through atomic64_*() operations. On architectures with real atomic64_t support, we additionally use atomic bit ops for setting/clearing individual bits. Text size: 7467 bytes -> 4649 bytes - compilers still suck at bitfields. 
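For reference, a condensed sketch of the new representation (the masks are taken from the patch below; the *_sketch() helpers are illustrative names, the real accessor being six_lock_seq()): lock->state is a bare atomic64_t carved up with explicit masks and shifts, so the sequence number is guaranteed to occupy the high 32 bits regardless of how a compiler would have laid out a bitfield.

#define SIX_STATE_READ_LOCK		(~(~0ULL << 26))	/* reader count */
#define SIX_STATE_WRITE_LOCKING	(1ULL << 26)
#define SIX_STATE_INTENT_HELD		(1ULL << 27)
#define SIX_STATE_NOSPIN		(1ULL << 28)
#define SIX_STATE_SEQ_OFFSET		32

static inline u32 six_lock_seq_sketch(const struct six_lock *lock)
{
	/* seq is incremented on write lock and on write unlock: odd == held for write */
	return atomic64_read(&lock->state) >> SIX_STATE_SEQ_OFFSET;
}

static inline bool six_held_for_write_sketch(const struct six_lock *lock)
{
	/* the low sequence bit doubles as SIX_LOCK_HELD_write */
	return atomic64_read(&lock->state) & (1ULL << SIX_STATE_SEQ_OFFSET);
}
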
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 4 +- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_key_cache.c | 6 +- fs/bcachefs/btree_locking.h | 4 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/six.c | 312 ++++++++++++++++++++++-------------- fs/bcachefs/six.h | 40 +---- fs/bcachefs/trace.h | 8 +- 10 files changed, 221 insertions(+), 165 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5801f4ff9097..58ef9e7b4bdf 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -735,7 +735,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, set_btree_node_read_in_flight(b); six_unlock_write(&b->c.lock); - seq = b->c.lock.state.seq; + seq = six_lock_seq(&b->c.lock); six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ @@ -859,7 +859,7 @@ retry: } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); @@ -957,7 +957,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index decbbaace1ee..0a7a18eca397 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -483,7 +483,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) struct btree_node_entry *bne; bool reinit_iter = false; - EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); BUG_ON(bset_written(b, bset(b, &b->set[1]))); BUG_ON(btree_node_just_written(b)); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d906bfb6754d..3e65e6876ec7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -652,9 +652,9 @@ void bch2_btree_path_level_init(struct btree_trans *trans, BUG_ON(path->cached); EBUG_ON(!btree_path_pos_in_node(path, b)); - EBUG_ON(b->c.lock.state.seq & 1); + EBUG_ON(six_lock_seq(&b->c.lock) & 1); - path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; __btree_path_level_init(path, b->c.level); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 188a6cd483f8..7d3564d72a7d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -49,7 +49,7 @@ static inline bool btree_node_lock_seq_matches(const struct btree_path *path, * write lock. 
The lock sequence number is incremented by taking and * releasing write locks and is even when unlocked: */ - return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + return path->l[level].lock_seq >> 1 == six_lock_seq(&b->c.lock) >> 1; } static inline struct btree *btree_node_parent(struct btree_path *path, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9725d85b99b3..37977b774d61 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -251,7 +251,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } path->l[0].b = (void *) ck; - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ret = bch2_btree_node_lock_write(trans, path, &ck->c); @@ -506,7 +506,7 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: path->uptodate = BTREE_ITER_UPTODATE; @@ -588,7 +588,7 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: if (!ck->valid) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 660975839c89..a897bdc123c3 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -175,7 +175,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree_path *linked; EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); @@ -283,7 +283,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, bool lock_may_not_fail) { EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); EBUG_ON(!btree_node_intent_locked(path, b->level)); /* diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6ba0954e648e..1319337c5382 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -688,7 +688,7 @@ err: bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); - path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; bch2_btree_node_lock_write_nofail(&trans, path, &b->c); diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 1a64b8a027a7..2e222eb2a907 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -13,9 +13,9 @@ #include "six.h" #ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) +#define EBUG_ON(cond) BUG_ON(cond) #else -#define EBUG_ON(cond) do {} while (0) +#define EBUG_ON(cond) do {} while (0) #endif #define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) @@ -23,6 +23,39 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); +/* + * bits 0-26 reader count + * bits 26-27 write_locking (a thread is trying to get a write lock, + * but does not have one yet) + * bits 27-28 held for intent + * bits 28-29 
nospin - optimistic spinning has timed out + * bits 29-30 has read waiters + * bits 30-31 has intent waiters + * bits 31-32 has write waiters + * bits 32-64 sequence number: incremented on every write lock or + * unlock, thus bit 33 (sequence number odd) indicates + * lock is currently held for write + */ + +#define SIX_STATE_READ_OFFSET 0 +#define SIX_STATE_READ_BITS 26 + +#define SIX_STATE_READ_LOCK ~(~0ULL << 26) +#define SIX_STATE_WRITE_LOCKING (1ULL << 26) +#define SIX_STATE_INTENT_HELD (1ULL << 27) +#define SIX_STATE_NOSPIN (1ULL << 28) +#define SIX_STATE_WAITING_READ (1ULL << (29 + SIX_LOCK_read)) +#define SIX_STATE_WAITING_INTENT (1ULL << (29 + SIX_LOCK_intent)) +#define SIX_STATE_WAITING_WRITE (1ULL << (29 + SIX_LOCK_write)) + +#define SIX_STATE_SEQ_OFFSET 32 +#define SIX_STATE_SEQ_BITS 32 +#define SIX_STATE_SEQ (~0ULL << 32) + +#define SIX_LOCK_HELD_read SIX_STATE_READ_LOCK +#define SIX_LOCK_HELD_intent SIX_STATE_INTENT_HELD +#define SIX_LOCK_HELD_write (1ULL << SIX_STATE_SEQ_OFFSET) + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ u64 lock_val; @@ -40,44 +73,109 @@ struct six_lock_vals { enum six_lock_type unlock_wakeup; }; -#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) - -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) - #define LOCK_VALS { \ [SIX_LOCK_read] = { \ - .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ - .unlock_val = -__SIX_VAL(read_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_read, \ + .lock_val = 1ULL << SIX_STATE_READ_OFFSET, \ + .lock_fail = SIX_LOCK_HELD_write|SIX_STATE_WRITE_LOCKING,\ + .unlock_val = -(1ULL << SIX_STATE_READ_OFFSET), \ + .held_mask = SIX_LOCK_HELD_read, \ .unlock_wakeup = SIX_LOCK_write, \ }, \ [SIX_LOCK_intent] = { \ - .lock_val = __SIX_VAL(intent_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_intent, \ - .unlock_val = -__SIX_VAL(intent_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_intent, \ + .lock_val = SIX_STATE_INTENT_HELD, \ + .lock_fail = SIX_LOCK_HELD_intent, \ + .unlock_val = -SIX_STATE_INTENT_HELD, \ + .held_mask = SIX_LOCK_HELD_intent, \ .unlock_wakeup = SIX_LOCK_intent, \ }, \ [SIX_LOCK_write] = { \ - .lock_val = __SIX_VAL(seq, 1), \ - .lock_fail = __SIX_LOCK_HELD_read, \ - .unlock_val = __SIX_VAL(seq, 1), \ - .held_mask = __SIX_LOCK_HELD_write, \ + .lock_val = SIX_LOCK_HELD_write, \ + .lock_fail = SIX_LOCK_HELD_read, \ + .unlock_val = SIX_LOCK_HELD_write, \ + .held_mask = SIX_LOCK_HELD_write, \ .unlock_wakeup = SIX_LOCK_read, \ }, \ } +static inline u32 six_state_seq(u64 state) +{ + return state >> SIX_STATE_SEQ_OFFSET; +} + +#ifdef CONFIG_GENERIC_ATOMIC64 + +static inline void six_set_bitmask(struct six_lock *lock, u64 mask) +{ + u64 old, new, v = atomic64_read(&lock->state); + + do { + old = new = v; + if ((old & mask) == mask) + break; + new |= mask; + } while ((v = atomic64_cmpxchg(&lock->state, old, new)) != old); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u64 mask) +{ + u64 old, new, v = atomic64_read(&lock->state); + + do { + old = new = v; + if (!(old & mask)) + break; + new &= ~mask; + } while ((v = atomic64_cmpxchg(&lock->state, old, new)) != old); +} + +#else + +/* + * Returns the index of the first set bit, treating @mask as an array of ulongs: + * that is, a bit index that can be passed to test_bit()/set_bit(). 
+ * + * Assumes the set bit we want is in the low 4 bytes: + */ +static inline unsigned u64_mask_to_ulong_bitnr(u64 mask) +{ +#if BITS_PER_LONG == 64 + return ilog2(mask); +#else +#if defined(__LITTLE_ENDIAN) + return ilog2((u32) mask); +#elif defined(__BIG_ENDIAN) + return ilog2((u32) mask) + 32; +#else +#error Unknown byteorder +#endif +#endif +} + +static inline void six_set_bitmask(struct six_lock *lock, u64 mask) +{ + unsigned bitnr = u64_mask_to_ulong_bitnr(mask); + + if (!test_bit(bitnr, (unsigned long *) &lock->state)) + set_bit(bitnr, (unsigned long *) &lock->state); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u64 mask) +{ + unsigned bitnr = u64_mask_to_ulong_bitnr(mask); + + if (test_bit(bitnr, (unsigned long *) &lock->state)) + clear_bit(bitnr, (unsigned long *) &lock->state); +} + +#endif + static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - union six_lock_state old, - struct task_struct *owner) + u64 old, struct task_struct *owner) { if (type != SIX_LOCK_intent) return; - if (!old.intent_lock) { + if (!(old & SIX_LOCK_HELD_intent)) { EBUG_ON(lock->owner); lock->owner = owner; } else { @@ -95,22 +193,20 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -/* This is probably up there with the more evil things I've done */ -#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) - static int __do_six_trylock_type(struct six_lock *lock, enum six_lock_type type, struct task_struct *task, bool try) { const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; int ret; - u64 v; + u64 old, new, v; EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); - EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); + EBUG_ON(type == SIX_LOCK_write && + (atomic64_read(&lock->state) & SIX_LOCK_HELD_write)); + EBUG_ON(type == SIX_LOCK_write && + (try != !(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING))); /* * Percpu reader mode: @@ -133,8 +229,8 @@ static int __do_six_trylock_type(struct six_lock *lock, smp_mb(); - old.v = READ_ONCE(lock->state.v); - ret = !(old.v & l[type].lock_fail); + old = atomic64_read(&lock->state); + ret = !(old & l[type].lock_fail); this_cpu_sub(*lock->readers, !ret); preempt_enable(); @@ -144,12 +240,12 @@ static int __do_six_trylock_type(struct six_lock *lock, * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ - if (old.write_locking) + if (old & SIX_STATE_WRITE_LOCKING) ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(__SIX_VAL(write_locking, 1), - &lock->state.counter); + atomic64_add(SIX_STATE_WRITE_LOCKING, + &lock->state); smp_mb__after_atomic(); } @@ -161,47 +257,47 @@ static int __do_six_trylock_type(struct six_lock *lock, */ v = 0; if (ret) - v += __SIX_VAL(seq, 1); + v += SIX_LOCK_HELD_write; if (ret || try) - v -= __SIX_VAL(write_locking, 1); + v -= SIX_STATE_WRITE_LOCKING; if (try && !ret) { - old.v = atomic64_add_return(v, &lock->state.counter); - if (old.waiters & (1 << SIX_LOCK_read)) + old = atomic64_add_return(v, &lock->state); + if (old & SIX_STATE_WAITING_READ) ret = -1 - SIX_LOCK_read; } else { - atomic64_add(v, &lock->state.counter); + atomic64_add(v, &lock->state); } } else { - v = READ_ONCE(lock->state.v); + v = atomic64_read(&lock->state); do { - new.v = old.v = v; + new = old = v; - if (!(old.v & l[type].lock_fail)) { - new.v += 
l[type].lock_val; + if (!(old & l[type].lock_fail)) { + new += l[type].lock_val; if (type == SIX_LOCK_write) - new.write_locking = 0; + new &= ~SIX_STATE_WRITE_LOCKING; } else { break; } - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); + } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); - ret = !(old.v & l[type].lock_fail); + ret = !(old & l[type].lock_fail); - EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); + EBUG_ON(ret && !(atomic64_read(&lock->state) & l[type].held_mask)); } if (ret > 0) six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && + (atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING)); return ret; } -static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) { struct six_lock_waiter *w, *next; struct task_struct *task; @@ -235,7 +331,7 @@ again: wake_up_process(task); } - clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); + six_clear_bitmask(lock, SIX_STATE_WAITING_READ << lock_type); unlock: raw_spin_unlock(&lock->wait_lock); @@ -246,14 +342,13 @@ unlock: } __always_inline -static void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, +static void six_lock_wakeup(struct six_lock *lock, u64 state, enum six_lock_type lock_type) { - if (lock_type == SIX_LOCK_write && state.read_lock) + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) return; - if (!(state.waiters & (1 << lock_type))) + if (!(state & (SIX_STATE_WAITING_READ << lock_type))) return; __six_lock_wakeup(lock, lock_type); @@ -288,8 +383,7 @@ bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned seq, unsigned long ip) { const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v; + u64 old, v; EBUG_ON(type == SIX_LOCK_write); @@ -302,8 +396,8 @@ bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, smp_mb(); - old.v = READ_ONCE(lock->state.v); - ret = !(old.v & l[type].lock_fail) && old.seq == seq; + old = atomic64_read(&lock->state); + ret = !(old & l[type].lock_fail) && six_state_seq(old) == seq; this_cpu_sub(*lock->readers, !ret); preempt_enable(); @@ -314,21 +408,21 @@ bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, */ if (ret) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - else if (old.write_locking) + else if (old & SIX_STATE_WRITE_LOCKING) six_lock_wakeup(lock, old, SIX_LOCK_write); return ret; } - v = READ_ONCE(lock->state.v); + v = atomic64_read(&lock->state); do { - old.v = v; + old = v; - if (old.seq != seq || old.v & l[type].lock_fail) + if ((old & l[type].lock_fail) || six_state_seq(old) != seq) return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); + } while ((v = atomic64_cmpxchg_acquire(&lock->state, + old, + old + l[type].lock_val)) != old); six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) @@ -355,17 +449,6 @@ static inline bool six_can_spin_on_owner(struct six_lock *lock) return ret; } -static inline void six_set_nospin(struct six_lock *lock) -{ - union six_lock_state old, new; - u64 v = READ_ONCE(lock->state.v); - - do { - new.v = old.v = v; - new.nospin = true; - } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v); -} 
- static inline bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner, u64 end_time) @@ -389,7 +472,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock, } if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_nospin(lock); + six_set_bitmask(lock, SIX_STATE_NOSPIN); ret = false; break; } @@ -483,12 +566,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - union six_lock_state old; + u64 old; int ret = 0; if (type == SIX_LOCK_write) { - EBUG_ON(lock->state.write_locking); - atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); + EBUG_ON(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING); + atomic64_add(SIX_STATE_WRITE_LOCKING, &lock->state); smp_mb__after_atomic(); } @@ -502,8 +585,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty wait->lock_acquired = false; raw_spin_lock(&lock->wait_lock); - if (!(lock->state.waiters & (1 << type))) - set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); + six_set_bitmask(lock, SIX_STATE_WAITING_READ << type); /* * Retry taking the lock after taking waitlist lock, have raced with an * unlock: @@ -558,9 +640,8 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty __set_current_state(TASK_RUNNING); out: - if (ret && type == SIX_LOCK_write && lock->state.write_locking) { - old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), - &lock->state.counter); + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_STATE_WRITE_LOCKING); six_lock_wakeup(lock, old, SIX_LOCK_read); } @@ -595,7 +676,7 @@ __always_inline static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state state; + u64 state; if (type == SIX_LOCK_intent) lock->owner = NULL; @@ -605,15 +686,15 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* unlock barrier */ this_cpu_dec(*lock->readers); smp_mb(); /* between unlocking and checking for waiters */ - state.v = READ_ONCE(lock->state.v); + state = atomic64_read(&lock->state); } else { u64 v = l[type].unlock_val; if (type != SIX_LOCK_read) - v -= lock->state.v & __SIX_VAL(nospin, 1); + v -= atomic64_read(&lock->state) & SIX_STATE_NOSPIN; - EBUG_ON(!(lock->state.v & l[type].held_mask)); - state.v = atomic64_add_return_release(v, &lock->state.counter); + EBUG_ON(!(atomic64_read(&lock->state) & l[type].held_mask)); + state = atomic64_add_return_release(v, &lock->state); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); @@ -622,7 +703,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); + !(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent)); EBUG_ON((type == SIX_LOCK_write || type == SIX_LOCK_intent) && lock->owner != current); @@ -650,23 +731,22 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade); bool six_lock_tryupgrade(struct six_lock *lock) { - union six_lock_state old, new; - u64 v = READ_ONCE(lock->state.v); + const struct six_lock_vals l[] = LOCK_VALS; + u64 old, new, v = atomic64_read(&lock->state); do { - new.v = old.v = v; + new = old = v; - if (new.intent_lock) + if (new & SIX_LOCK_HELD_intent) return false; if (!lock->readers) { - EBUG_ON(!new.read_lock); - 
new.read_lock--; + EBUG_ON(!(new & SIX_LOCK_HELD_read)); + new += l[SIX_LOCK_read].unlock_val; } - new.intent_lock = 1; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); + new |= SIX_LOCK_HELD_intent; + } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); if (lock->readers) this_cpu_dec(*lock->readers); @@ -712,13 +792,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) if (lock->readers) { this_cpu_inc(*lock->readers); } else { - EBUG_ON(!lock->state.read_lock && - !lock->state.intent_lock); - atomic64_add(l[type].lock_val, &lock->state.counter); + EBUG_ON(!(atomic64_read(&lock->state) & + (SIX_LOCK_HELD_read| + SIX_LOCK_HELD_intent))); + atomic64_add(l[type].lock_val, &lock->state); } break; case SIX_LOCK_intent: - EBUG_ON(!lock->state.intent_lock); + EBUG_ON(!(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; case SIX_LOCK_write: @@ -730,7 +811,7 @@ EXPORT_SYMBOL_GPL(six_lock_increment); void six_lock_wakeup_all(struct six_lock *lock) { - union six_lock_state state = lock->state; + u64 state = atomic64_read(&lock->state); struct six_lock_waiter *w; six_lock_wakeup(lock, state, SIX_LOCK_read); @@ -752,10 +833,11 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) struct six_lock_count ret; ret.n[SIX_LOCK_read] = !lock->readers - ? lock->state.read_lock + ? atomic64_read(&lock->state) & SIX_STATE_READ_LOCK : pcpu_read_count(lock); - ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; - ret.n[SIX_LOCK_write] = lock->state.seq & 1; + ret.n[SIX_LOCK_intent] = !!(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent) + + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = !!(atomic64_read(&lock->state) & SIX_LOCK_HELD_write); return ret; } @@ -765,17 +847,15 @@ void six_lock_readers_add(struct six_lock *lock, int nr) { if (lock->readers) this_cpu_add(*lock->readers, nr); - else if (nr > 0) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); + else /* reader count starts at bit 0 */ + atomic64_add(nr, &lock->state); } EXPORT_SYMBOL_GPL(six_lock_readers_add); void six_lock_exit(struct six_lock *lock) { WARN_ON(lock->readers && pcpu_read_count(lock)); - WARN_ON(lock->state.read_lock); + WARN_ON(atomic64_read(&lock->state) & SIX_LOCK_HELD_read); free_percpu(lock->readers); lock->readers = NULL; @@ -785,7 +865,7 @@ EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, struct lock_class_key *key, enum six_lock_init_flags flags) { - atomic64_set(&lock->state.counter, 0); + atomic64_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 5ddabbfb8aba..449589f76628 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -68,39 +68,6 @@ #define SIX_LOCK_SEPARATE_LOCKFNS -union six_lock_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - /* for waitlist_bitnr() */ - unsigned long l; - }; - - struct { - unsigned read_lock:26; - unsigned write_locking:1; - unsigned intent_lock:1; - unsigned nospin:1; - unsigned waiters:3; - /* - * seq works much like in seqlocks: it's incremented every time - * we lock and unlock for write. - * - * If it's odd write lock is held, even unlocked. 
- * - * Thus readers can unlock, and then lock again later iff it - * hasn't been modified in the meantime. - */ - u32 seq; - }; -}; - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -108,7 +75,7 @@ enum six_lock_type { }; struct six_lock { - union six_lock_state state; + atomic64_t state; unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; @@ -148,6 +115,11 @@ do { \ __six_lock_init((lock), #lock, &__key, flags); \ } while (0) +static inline u32 six_lock_seq(const struct six_lock *lock) +{ + return atomic64_read(&lock->state) >> 32; +} + bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 8027c2a14199..cfb1779d712a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -420,7 +420,9 @@ TRACE_EVENT(btree_path_relock_fail, else scnprintf(__entry->node, sizeof(__entry->node), "%px", b); __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", @@ -475,7 +477,9 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_read]; __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", -- cgit From 91d16f16d0fd4b6eb8503068ea7f6ad8305e32db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 May 2023 15:40:40 -0400 Subject: six locks: Documentation, renaming - Expanded and revamped overview documentation in six.h, giving an overview of all features - docbook-comments for all external interfaces - Rename some functions for simplicity, i.e. 
six_lock_ip_type() -> six_lock_ip() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 4 +- fs/bcachefs/six.c | 218 +++++++++++++++++++++++++------- fs/bcachefs/six.h | 298 ++++++++++++++++++++++++++++++++------------ 3 files changed, 395 insertions(+), 125 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index a897bdc123c3..f9bb8736c061 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -205,8 +205,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); + ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 2e222eb2a907..a1f007095ec9 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -193,10 +193,8 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -static int __do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - struct task_struct *task, - bool try) +static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + struct task_struct *task, bool try) { const struct six_lock_vals l[] = LOCK_VALS; int ret; @@ -316,7 +314,7 @@ again: goto unlock; saw_one = true; - ret = __do_six_trylock_type(lock, lock_type, w->task, false); + ret = __do_six_trylock(lock, lock_type, w->task, false); if (ret <= 0) goto unlock; @@ -355,32 +353,48 @@ static void six_lock_wakeup(struct six_lock *lock, u64 state, } __always_inline -static bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - bool try) +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) { int ret; - ret = __do_six_trylock_type(lock, type, current, try); + ret = __do_six_trylock(lock, type, current, try); if (ret < 0) __six_lock_wakeup(lock, -ret - 1); return ret > 0; } -bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type, - unsigned long ip) +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { - if (!do_six_trylock_type(lock, type, true)) + if (!do_six_trylock(lock, type, true)) return false; if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } +EXPORT_SYMBOL_GPL(six_trylock_ip); -bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. 
+ */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) { const struct six_lock_vals l[] = LOCK_VALS; u64 old, v; @@ -421,15 +435,15 @@ bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, if ((old & l[type].lock_fail) || six_state_seq(old) != seq) return false; } while ((v = atomic64_cmpxchg_acquire(&lock->state, - old, - old + l[type].lock_val)) != old); + old, + old + l[type].lock_val)) != old); six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } -EXPORT_SYMBOL_GPL(six_relock_ip_type); +EXPORT_SYMBOL_GPL(six_relock_ip); #ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER @@ -512,7 +526,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type if (owner && !six_spin_on_owner(lock, owner, end_time)) break; - if (do_six_trylock_type(lock, type, false)) { + if (do_six_trylock(lock, type, false)) { osq_unlock(&lock->osq); preempt_enable(); return true; @@ -561,10 +575,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type #endif noinline -static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { u64 old; int ret = 0; @@ -587,10 +601,10 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty raw_spin_lock(&lock->wait_lock); six_set_bitmask(lock, SIX_STATE_WAITING_READ << type); /* - * Retry taking the lock after taking waitlist lock, have raced with an - * unlock: + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: */ - ret = __do_six_trylock_type(lock, type, current, false); + ret = __do_six_trylock(lock, type, current, false); if (ret <= 0) { wait->start_time = local_clock(); @@ -648,10 +662,40 @@ out: return ret; } -int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. 
+ * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { int ret; @@ -660,8 +704,8 @@ int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - ret = do_six_trylock_type(lock, type, true) ? 0 - : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip); + ret = do_six_trylock(lock, type, true) ? 0 + : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map, ip); @@ -670,7 +714,7 @@ int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, return ret; } -EXPORT_SYMBOL_GPL(six_lock_type_ip_waiter); +EXPORT_SYMBOL_GPL(six_lock_ip_waiter); __always_inline static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) @@ -700,7 +744,22 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) six_lock_wakeup(lock, state, l[type].unlock_wakeup); } -void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +/** + * six_unlock_ip - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && !(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent)); @@ -719,9 +778,14 @@ void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned do_six_unlock_type(lock, type); } -EXPORT_SYMBOL_GPL(six_unlock_ip_type); +EXPORT_SYMBOL_GPL(six_unlock_ip); -/* Convert from intent to read: */ +/** + * six_lock_downgrade - convert an intent lock to a read lock + * @lock: lock to dowgrade + * + * @lock will have read count incremented and intent count decremented + */ void six_lock_downgrade(struct six_lock *lock) { six_lock_increment(lock, SIX_LOCK_read); @@ -729,6 +793,15 @@ void six_lock_downgrade(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_downgrade); +/** + * six_lock_tryupgrade - attempt to convert read lock to an intent lock + * @lock: lock to upgrade + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ bool six_lock_tryupgrade(struct six_lock *lock) { const struct six_lock_vals l[] = LOCK_VALS; @@ -757,6 +830,17 @@ bool six_lock_tryupgrade(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_tryupgrade); +/** + * six_trylock_convert - attempt to convert a held lock from one type to another + * @lock: lock to upgrade + * @from: SIX_LOCK_read or SIX_LOCK_intent + * @to: SIX_LOCK_read or SIX_LOCK_intent + * + * On success, @lock will have intent count incremented and read count + * decremented + * + 
* Return: true on success, false on failure + */ bool six_trylock_convert(struct six_lock *lock, enum six_lock_type from, enum six_lock_type to) @@ -775,9 +859,16 @@ bool six_trylock_convert(struct six_lock *lock, } EXPORT_SYMBOL_GPL(six_trylock_convert); -/* - * Increment read/intent lock count, assuming we already have it read or intent - * locked: +/** + * six_lock_increment - increase held lock count on a lock that is already held + * @lock: lock to increment + * @type: SIX_LOCK_read or SIX_LOCK_intent + * + * @lock must already be held, with a lock type that is greater than or equal to + * @type + * + * A corresponding six_unlock_type() call will be required for @lock to be fully + * unlocked. */ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { @@ -809,6 +900,16 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) } EXPORT_SYMBOL_GPL(six_lock_increment); +/** + * six_lock_wakeup_all - wake up all waiters on @lock + * @lock: lock to wake up waiters for + * + * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * abort the lock operation. + * + * This function is never needed in a bug-free program; it's only useful in + * debug code, e.g. to determine if a cycle detector is at fault. + */ void six_lock_wakeup_all(struct six_lock *lock) { u64 state = atomic64_read(&lock->state); @@ -825,8 +926,11 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -/* - * Returns lock held counts, for both read and intent +/** + * six_lock_counts - return held lock counts, for each lock type + * @lock: lock to return counters for + * + * Return: the number of times a lock is held for read, intent and write. */ struct six_lock_count six_lock_counts(struct six_lock *lock) { @@ -843,15 +947,45 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_counts); +/** + * six_lock_readers_add - directly manipulate reader count of a lock + * @lock: lock to add/subtract readers for + * @nr: reader count to add/subtract + * + * When an upper layer is implementing lock reentrency, we may have both read + * and intent locks on the same lock. + * + * When we need to take a write lock, the read locks will cause self-deadlock, + * because six locks themselves do not track which read locks are held by the + * current thread and which are held by a different thread - it does no + * per-thread tracking of held locks. + * + * The upper layer that is tracking held locks may however, if trylock() has + * failed, count up its own read locks, subtract them, take the write lock, and + * then re-add them. + * + * As in any other situation when taking a write lock, @lock must be held for + * intent one (or more) times, so @lock will never be left unlocked. + */ void six_lock_readers_add(struct six_lock *lock, int nr) { - if (lock->readers) + if (lock->readers) { this_cpu_add(*lock->readers, nr); - else /* reader count starts at bit 0 */ + } else { + EBUG_ON((int) (atomic64_read(&lock->state) & SIX_STATE_READ_LOCK) + nr < 0); + /* reader count starts at bit 0 */ atomic64_add(nr, &lock->state); + } } EXPORT_SYMBOL_GPL(six_lock_readers_add); +/** + * six_lock_exit - release resources held by a lock prior to freeing + * @lock: lock to exit + * + * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * required to free the percpu read counts. 
+ */ void six_lock_exit(struct six_lock *lock) { WARN_ON(lock->readers && pcpu_read_count(lock)); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 449589f76628..82bf9de72490 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -3,59 +3,124 @@ #ifndef _LINUX_SIX_H #define _LINUX_SIX_H -/* - * Shared/intent/exclusive locks: sleepable read/write locks, much like rw - * semaphores, except with a third intermediate state, intent. Basic operations - * are: +/** + * DOC: SIX locks overview * - * six_lock_read(&foo->lock); - * six_unlock_read(&foo->lock); + * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores + * but with an additional state: read/shared, intent, exclusive/write * - * six_lock_intent(&foo->lock); - * six_unlock_intent(&foo->lock); + * The purpose of the intent state is to allow for greater concurrency on tree + * structures without deadlocking. In general, a read can't be upgraded to a + * write lock without deadlocking, so an operation that updates multiple nodes + * will have to take write locks for the full duration of the operation. * - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); + * But by adding an intent state, which is exclusive with other intent locks but + * not with readers, we can take intent locks at thte start of the operation, + * and then take write locks only for the actual update to each individual + * nodes, without deadlocking. * - * Intent locks block other intent locks, but do not block read locks, and you - * must have an intent lock held before taking a write lock, like so: + * Example usage: + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); * - * six_lock_intent(&foo->lock); - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); - * six_unlock_intent(&foo->lock); + * An intent lock must be held before taking a write lock: + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); * * Other operations: - * * six_trylock_read() * six_trylock_intent() * six_trylock_write() * - * six_lock_downgrade(): convert from intent to read - * six_lock_tryupgrade(): attempt to convert from read to intent - * - * Locks also embed a sequence number, which is incremented when the lock is - * locked or unlocked for write. The current sequence number can be grabbed - * while a lock is held from lock->state.seq; then, if you drop the lock you can - * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock - * iff it hasn't been locked for write in the meantime. - * - * There are also operations that take the lock type as a parameter, where the - * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: - * - * six_lock_type(lock, type) - * six_unlock_type(lock, type) - * six_relock(lock, type, seq) - * six_trylock_type(lock, type) - * six_trylock_convert(lock, from, to) - * - * A lock may be held multiple times by the same thread (for read or intent, - * not write). However, the six locks code does _not_ implement the actual - * recursive checks itself though - rather, if your code (e.g. btree iterator - * code) knows that the current thread already has a lock held, and for the - * correct type, six_lock_increment() may be used to bump up the counter for - * that type - the only effect is that one more call to unlock will be required - * before the lock is unlocked. 
+ * six_lock_downgrade() convert from intent to read + * six_lock_tryupgrade() attempt to convert from read to intent, may fail + * + * There are also interfaces that take the lock type as an enum: + * + * six_lock_type(&foo->lock, SIX_LOCK_read); + * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) + * six_lock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_intent); + * + * Lock sequence numbers - unlock(), relock(): + * + * Locks embed sequences numbers, which are incremented on write lock/unlock. + * This allows locks to be dropped and the retaken iff the state they protect + * hasn't changed; this makes it much easier to avoid holding locks while e.g. + * doing IO or allocating memory. + * + * Example usage: + * six_lock_read(&foo->lock); + * u32 seq = six_lock_seq(&foo->lock); + * six_unlock_read(&foo->lock); + * + * some_operation_that_may_block(); + * + * if (six_relock_read(&foo->lock, seq)) { ... } + * + * If the relock operation succeeds, it is as if the lock was never unlocked. + * + * Reentrancy: + * + * Six locks are not by themselves reentrent, but have counters for both the + * read and intent states that can be used to provide reentrency by an upper + * layer that tracks held locks. If a lock is known to already be held in the + * read or intent state, six_lock_increment() can be used to bump the "lock + * held in this state" counter, increasing the number of unlock calls that + * will be required to fully unlock it. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_lock_increment(&foo->lock, SIX_LOCK_read); + * six_unlock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * foo->lock is now fully unlocked. + * + * Since the intent state supercedes read, it's legal to increment the read + * counter when holding an intent lock, but not the reverse. + * + * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) + * is not legal. + * + * should_sleep_fn: + * + * There is a six_lock() variant that takes a function pointer that is called + * immediately prior to schedule() when blocking, and may return an error to + * abort. + * + * One possible use for this feature is when objects being locked are part of + * a cache and may reused, and lock ordering is based on a property of the + * object that will change when the object is reused - i.e. logical key order. + * + * If looking up an object in the cache may race with object reuse, and lock + * ordering is required to prevent deadlock, object reuse may change the + * correct lock order for that object and cause a deadlock. should_sleep_fn + * can be used to check if the object is still the object we want and avoid + * this deadlock. + * + * Wait list entry interface: + * + * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a + * wait list entry. By embedding six_lock_waiter into another object, and by + * traversing lock waitlists, it is then possible for an upper layer to + * implement full cycle detection for deadlock avoidance. + * + * should_sleep_fn should be used for invoking the cycle detector, walking the + * graph of held locks to check for a deadlock. The upper layer must track + * held locks for each thread, and each thread's held locks must be reachable + * from its six_lock_waiter object. 
+ * + * six_lock_waiter() will add the wait object to the waitlist re-trying taking + * the lock, and before calling should_sleep_fn, and the wait object will not + * be removed from the waitlist until either the lock has been successfully + * acquired, or we aborted because should_sleep_fn returned an error. + * + * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will + * have timestamps in strictly ascending order - this is so the timestamp can + * be used as a cursor for lock graph traverse. */ #include @@ -66,8 +131,6 @@ #include #endif -#define SIX_LOCK_SEPARATE_LOCKFNS - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -108,6 +171,11 @@ enum six_lock_init_flags { void __six_lock_init(struct six_lock *lock, const char *name, struct lock_class_key *key, enum six_lock_init_flags flags); +/** + * six_lock_init - initialize a six lock + * @lock: lock to initialize + * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU + */ #define six_lock_init(lock, flags) \ do { \ static struct lock_class_key __key; \ @@ -115,73 +183,148 @@ do { \ __six_lock_init((lock), #lock, &__key, flags); \ } while (0) +/** + * six_lock_seq - obtain current lock sequence number + * @lock: six_lock to obtain sequence number for + * + * @lock should be held for read or intent, and not write + * + * By saving the lock sequence number, we can unlock @lock and then (typically + * after some blocking operation) attempt to relock it: the relock will succeed + * if the sequence number hasn't changed, meaning no write locks have been taken + * and state corresponding to what @lock protects is still valid. + */ static inline u32 six_lock_seq(const struct six_lock *lock) { return atomic64_read(&lock->state) >> 32; } -bool six_trylock_ip_type(struct six_lock *lock, enum six_lock_type type, - unsigned long ip); +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); +/** + * six_trylock_type - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * Return: true on success, false on failure. + */ static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) { - return six_trylock_ip_type(lock, type, _THIS_IP_); + return six_trylock_ip(lock, type, _THIS_IP_); } -int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip); - -static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); + +/** + * six_lock_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * This is a convenience wrapper around six_lock_ip_waiter(), see that function + * for full documentation. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. 
+ */ +static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) { - return six_lock_type_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); + return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); } -static inline int six_lock_ip_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) +/** + * six_lock_ip - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct six_lock_waiter wait; - return six_lock_type_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); } +/** + * six_lock_type - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { struct six_lock_waiter wait; - return six_lock_type_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); } -bool six_relock_ip_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip); +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); +/** + * six_relock_type - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * + * Return: true on success, false on failure. + */ static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, unsigned seq) { - return six_relock_ip_type(lock, type, seq, _THIS_IP_); + return six_relock_ip(lock, type, seq, _THIS_IP_); } -void six_unlock_ip_type(struct six_lock *lock, enum six_lock_type type, unsigned long ip); +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); +/** + * six_unlock_type - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. 
+ * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { - six_unlock_ip_type(lock, type, _THIS_IP_); + six_unlock_ip(lock, type, _THIS_IP_); } #define __SIX_LOCK(type) \ static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ { \ - return six_trylock_ip_type(lock, SIX_LOCK_##type, ip); \ + return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ } \ \ static inline bool six_trylock_##type(struct six_lock *lock) \ { \ - return six_trylock_ip_type(lock, SIX_LOCK_##type, _THIS_IP_); \ + return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ } \ \ static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ @@ -189,24 +332,24 @@ static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ six_lock_should_sleep_fn should_sleep_fn, void *p,\ unsigned long ip) \ { \ - return six_lock_type_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ + return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ } \ \ static inline int six_lock_ip_##type(struct six_lock *lock, \ six_lock_should_sleep_fn should_sleep_fn, void *p, \ unsigned long ip) \ { \ - return six_lock_ip_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ + return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ } \ \ static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ { \ - return six_relock_ip_type(lock, SIX_LOCK_##type, seq, ip); \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ } \ \ static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ { \ - return six_relock_ip_type(lock, SIX_LOCK_##type, seq, _THIS_IP_);\ + return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ } \ \ static inline int six_lock_##type(struct six_lock *lock, \ @@ -215,21 +358,14 @@ static inline int six_lock_##type(struct six_lock *lock, \ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ } \ \ -static inline int six_lock_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn fn, void *p) \ -{ \ - return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \ -} \ - \ static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ - six_unlock_ip_type(lock, SIX_LOCK_##type, ip); \ + six_unlock_ip(lock, SIX_LOCK_##type, ip); \ } \ \ static inline void six_unlock_##type(struct six_lock *lock) \ { \ - six_unlock_ip_type(lock, SIX_LOCK_##type, _THIS_IP_); \ + six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ } __SIX_LOCK(read) -- cgit From 37f612bea5bd921e71537df3559a117dffb0956d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 May 2023 23:41:56 -0400 Subject: six locks: Improve spurious wakeup handling in pcpu reader mode Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index a1f007095ec9..fcc74e626db0 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -193,6 +193,15 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } +/* + * __do_six_trylock() - main trylock routine + * + * Returns 1 on success, 0 on failure + * + * In percpu reader 
mode, a failed trylock may cause a spurious trylock failure + * for anoter thread taking the competing lock type, and we may havve to do a + * wakeup: when a wakeup is required, we return -1 - wakeup_type. + */ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, struct task_struct *task, bool try) { @@ -219,8 +228,20 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, * the lock, then issues a full memory barrier, then reads from the * other thread's variable to check if the other thread thinks it has * the lock. If we raced, we backoff and retry/sleep. + * + * Failure to take the lock may cause a spurious trylock failure in + * another thread, because we temporarily set the lock to indicate that + * we held it. This would be a problem for a thread in six_lock(), when + * they are calling trylock after adding themself to the waitlist and + * prior to sleeping. + * + * Therefore, if we fail to get the lock, and there were waiters of the + * type we conflict with, we will have to issue a wakeup. + * + * Since we may be called under wait_lock (and by the wakeup code + * itself), we return that the wakeup has to be done instead of doing it + * here. */ - if (type == SIX_LOCK_read && lock->readers) { preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ @@ -233,17 +254,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - /* - * If we failed because a writer was trying to take the - * lock, issue a wakeup because we might have caused a - * spurious trylock failure: - */ - if (old & SIX_STATE_WRITE_LOCKING) + if (!ret && (old & SIX_STATE_WAITING_WRITE)) ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(SIX_STATE_WRITE_LOCKING, - &lock->state); + atomic64_add(SIX_STATE_WRITE_LOCKING, &lock->state); smp_mb__after_atomic(); } @@ -259,12 +274,10 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, if (ret || try) v -= SIX_STATE_WRITE_LOCKING; - if (try && !ret) { + if (v) { old = atomic64_add_return(v, &lock->state); - if (old & SIX_STATE_WAITING_READ) + if (!ret && try && (old & SIX_STATE_WAITING_READ)) ret = -1 - SIX_LOCK_read; - } else { - atomic64_add(v, &lock->state); } } else { v = atomic64_read(&lock->state); @@ -422,7 +435,7 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, */ if (ret) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - else if (old & SIX_STATE_WRITE_LOCKING) + else if (old & SIX_STATE_WAITING_WRITE) six_lock_wakeup(lock, old, SIX_LOCK_write); return ret; -- cgit From dc88b65f3e54b5f25dcfe1259ae21c19a6e69d7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 15:00:48 -0400 Subject: six locks: Simplify six_relock() The next patch is going to move lock->seq out of lock->state. This replaces six_relock() with a much simpler implementation based on trylock. 
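Roughly, the relock now reduces to "check the sequence number, trylock, then
re-check" - sketched below for the read-lock case with a hypothetical helper
name (the actual change is in the diff that follows):

	static bool relock_read_sketch(struct six_lock *lock, u32 seq)
	{
		/* bail out early if the lock was written since @seq was sampled */
		if (six_lock_seq(lock) != seq || !six_trylock_read(lock))
			return false;

		/* re-check: a write lock/unlock may have raced with the trylock */
		if (six_lock_seq(lock) != seq) {
			six_unlock_read(lock);
			return false;
		}

		return true;
	}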
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 47 +++++------------------------------------------ 1 file changed, 5 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index fcc74e626db0..3fb5959fe40f 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -409,51 +409,14 @@ EXPORT_SYMBOL_GPL(six_trylock_ip); bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, unsigned seq, unsigned long ip) { - const struct six_lock_vals l[] = LOCK_VALS; - u64 old, v; - - EBUG_ON(type == SIX_LOCK_write); - - if (type == SIX_LOCK_read && - lock->readers) { - bool ret; - - preempt_disable(); - this_cpu_inc(*lock->readers); - - smp_mb(); - - old = atomic64_read(&lock->state); - ret = !(old & l[type].lock_fail) && six_state_seq(old) == seq; - - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); - - /* - * Similar to the lock path, we may have caused a spurious write - * lock fail and need to issue a wakeup: - */ - if (ret) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - else if (old & SIX_STATE_WAITING_WRITE) - six_lock_wakeup(lock, old, SIX_LOCK_write); + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; - return ret; + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; } - v = atomic64_read(&lock->state); - do { - old = v; - - if ((old & l[type].lock_fail) || six_state_seq(old) != seq) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state, - old, - old + l[type].lock_val)) != old); - - six_set_owner(lock, type, old, current); - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } EXPORT_SYMBOL_GPL(six_relock_ip); -- cgit From b60c8e9e7b082abac290ebdb9166b806e7d83fb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 18:24:05 -0400 Subject: six locks: lock->state.seq no longer used for write lock held lock->state.seq is shortly being moved out of lock->state, to kill the depedency on atomic64; in preparation for that, we change the write locking bit to write locked. 
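With this change, "held for write" is tested via a dedicated state bit rather
than the low bit of the sequence number; e.g. a hypothetical helper (not added
by this patch) would now read:

	static inline bool six_held_for_write(struct six_lock *lock)
	{
		/* dedicated bit, no longer derived from the sequence number */
		return atomic64_read(&lock->state) & SIX_LOCK_HELD_write;
	}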
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 73 +++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 3fb5959fe40f..e566c429607b 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -41,8 +41,8 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); #define SIX_STATE_READ_BITS 26 #define SIX_STATE_READ_LOCK ~(~0ULL << 26) -#define SIX_STATE_WRITE_LOCKING (1ULL << 26) -#define SIX_STATE_INTENT_HELD (1ULL << 27) +#define SIX_STATE_INTENT_HELD (1ULL << 26) +#define SIX_STATE_WRITE_LOCK (1ULL << 27) #define SIX_STATE_NOSPIN (1ULL << 28) #define SIX_STATE_WAITING_READ (1ULL << (29 + SIX_LOCK_read)) #define SIX_STATE_WAITING_INTENT (1ULL << (29 + SIX_LOCK_intent)) @@ -54,7 +54,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); #define SIX_LOCK_HELD_read SIX_STATE_READ_LOCK #define SIX_LOCK_HELD_intent SIX_STATE_INTENT_HELD -#define SIX_LOCK_HELD_write (1ULL << SIX_STATE_SEQ_OFFSET) +#define SIX_LOCK_HELD_write SIX_STATE_WRITE_LOCK struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ @@ -63,9 +63,6 @@ struct six_lock_vals { /* If the lock has this value (used as a mask), taking the lock fails: */ u64 lock_fail; - /* Value we add to the lock in order to release the lock: */ - u64 unlock_val; - /* Mask that indicates lock is held for this type: */ u64 held_mask; @@ -76,22 +73,19 @@ struct six_lock_vals { #define LOCK_VALS { \ [SIX_LOCK_read] = { \ .lock_val = 1ULL << SIX_STATE_READ_OFFSET, \ - .lock_fail = SIX_LOCK_HELD_write|SIX_STATE_WRITE_LOCKING,\ - .unlock_val = -(1ULL << SIX_STATE_READ_OFFSET), \ + .lock_fail = SIX_LOCK_HELD_write, \ .held_mask = SIX_LOCK_HELD_read, \ .unlock_wakeup = SIX_LOCK_write, \ }, \ [SIX_LOCK_intent] = { \ .lock_val = SIX_STATE_INTENT_HELD, \ .lock_fail = SIX_LOCK_HELD_intent, \ - .unlock_val = -SIX_STATE_INTENT_HELD, \ .held_mask = SIX_LOCK_HELD_intent, \ .unlock_wakeup = SIX_LOCK_intent, \ }, \ [SIX_LOCK_write] = { \ .lock_val = SIX_LOCK_HELD_write, \ .lock_fail = SIX_LOCK_HELD_read, \ - .unlock_val = SIX_LOCK_HELD_write, \ .held_mask = SIX_LOCK_HELD_write, \ .unlock_wakeup = SIX_LOCK_read, \ }, \ @@ -211,9 +205,9 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && - (atomic64_read(&lock->state) & SIX_LOCK_HELD_write)); + (try != !(atomic64_read(&lock->state) & SIX_LOCK_HELD_write))); EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING))); + (try != !(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK))); /* * Percpu reader mode: @@ -258,25 +252,15 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(SIX_STATE_WRITE_LOCKING, &lock->state); + atomic64_add(SIX_STATE_WRITE_LOCK, &lock->state); smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); - /* - * On success, we increment lock->seq; also we clear - * write_locking unless we failed from the lock path: - */ - v = 0; - if (ret) - v += SIX_LOCK_HELD_write; - if (ret || try) - v -= SIX_STATE_WRITE_LOCKING; - - if (v) { - old = atomic64_add_return(v, &lock->state); - if (!ret && try && (old & SIX_STATE_WAITING_READ)) + if (try && !ret) { + old = atomic64_sub_return(SIX_STATE_WRITE_LOCK, 
&lock->state); + if (old & SIX_STATE_WAITING_READ) ret = -1 - SIX_LOCK_read; } } else { @@ -284,17 +268,13 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, do { new = old = v; - if (!(old & l[type].lock_fail)) { - new += l[type].lock_val; + ret = !(old & l[type].lock_fail); - if (type == SIX_LOCK_write) - new &= ~SIX_STATE_WRITE_LOCKING; - } else { + if (!ret || (type == SIX_LOCK_write && !try)) break; - } - } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); - ret = !(old & l[type].lock_fail); + new += l[type].lock_val; + } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); EBUG_ON(ret && !(atomic64_read(&lock->state) & l[type].held_mask)); } @@ -302,8 +282,8 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, if (ret > 0) six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && - (atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING)); + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK)); return ret; } @@ -392,6 +372,8 @@ bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned lon if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + else + atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); return true; } EXPORT_SYMBOL_GPL(six_trylock_ip); @@ -560,8 +542,8 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, int ret = 0; if (type == SIX_LOCK_write) { - EBUG_ON(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCKING); - atomic64_add(SIX_STATE_WRITE_LOCKING, &lock->state); + EBUG_ON(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK); + atomic64_add(SIX_STATE_WRITE_LOCK, &lock->state); smp_mb__after_atomic(); } @@ -631,7 +613,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, __set_current_state(TASK_RUNNING); out: if (ret && type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_STATE_WRITE_LOCKING); + six_clear_bitmask(lock, SIX_STATE_WRITE_LOCK); six_lock_wakeup(lock, old, SIX_LOCK_read); } @@ -683,6 +665,9 @@ int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ret = do_six_trylock(lock, type, true) ? 
0 : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); + if (!ret && type == SIX_LOCK_write) + atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); + if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map, ip); if (!ret) @@ -708,13 +693,13 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* between unlocking and checking for waiters */ state = atomic64_read(&lock->state); } else { - u64 v = l[type].unlock_val; + u64 v = l[type].lock_val; if (type != SIX_LOCK_read) - v -= atomic64_read(&lock->state) & SIX_STATE_NOSPIN; + v += atomic64_read(&lock->state) & SIX_STATE_NOSPIN; EBUG_ON(!(atomic64_read(&lock->state) & l[type].held_mask)); - state = atomic64_add_return_release(v, &lock->state); + state = atomic64_sub_return_release(v, &lock->state); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); @@ -745,6 +730,8 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) six_release(&lock->dep_map, ip); + else + atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -791,7 +778,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) if (!lock->readers) { EBUG_ON(!(new & SIX_LOCK_HELD_read)); - new += l[SIX_LOCK_read].unlock_val; + new -= l[SIX_LOCK_read].lock_val; } new |= SIX_LOCK_HELD_intent; -- cgit From 357c1261526db604dd4593638620a801c44d02bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 May 2023 17:54:19 -0400 Subject: six_locks: Kill test_bit()/set_bit() usage This deletes the crazy cast-atomic-to-unsigned-long, and replaces them with atomic_and() and atomic_or(). Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 63 ++++--------------------------------------------------- 1 file changed, 4 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index e566c429607b..266ee5d95479 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -96,73 +96,18 @@ static inline u32 six_state_seq(u64 state) return state >> SIX_STATE_SEQ_OFFSET; } -#ifdef CONFIG_GENERIC_ATOMIC64 - static inline void six_set_bitmask(struct six_lock *lock, u64 mask) { - u64 old, new, v = atomic64_read(&lock->state); - - do { - old = new = v; - if ((old & mask) == mask) - break; - new |= mask; - } while ((v = atomic64_cmpxchg(&lock->state, old, new)) != old); + if ((atomic64_read(&lock->state) & mask) != mask) + atomic64_or(mask, &lock->state); } static inline void six_clear_bitmask(struct six_lock *lock, u64 mask) { - u64 old, new, v = atomic64_read(&lock->state); - - do { - old = new = v; - if (!(old & mask)) - break; - new &= ~mask; - } while ((v = atomic64_cmpxchg(&lock->state, old, new)) != old); + if (atomic64_read(&lock->state) & mask) + atomic64_and(~mask, &lock->state); } -#else - -/* - * Returns the index of the first set bit, treating @mask as an array of ulongs: - * that is, a bit index that can be passed to test_bit()/set_bit(). 
- * - * Assumes the set bit we want is in the low 4 bytes: - */ -static inline unsigned u64_mask_to_ulong_bitnr(u64 mask) -{ -#if BITS_PER_LONG == 64 - return ilog2(mask); -#else -#if defined(__LITTLE_ENDIAN) - return ilog2((u32) mask); -#elif defined(__BIG_ENDIAN) - return ilog2((u32) mask) + 32; -#else -#error Unknown byteorder -#endif -#endif -} - -static inline void six_set_bitmask(struct six_lock *lock, u64 mask) -{ - unsigned bitnr = u64_mask_to_ulong_bitnr(mask); - - if (!test_bit(bitnr, (unsigned long *) &lock->state)) - set_bit(bitnr, (unsigned long *) &lock->state); -} - -static inline void six_clear_bitmask(struct six_lock *lock, u64 mask) -{ - unsigned bitnr = u64_mask_to_ulong_bitnr(mask); - - if (test_bit(bitnr, (unsigned long *) &lock->state)) - clear_bit(bitnr, (unsigned long *) &lock->state); -} - -#endif - static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, u64 old, struct task_struct *owner) { -- cgit From a4e9e1f0cb71dfceaca0d2088465af05a747c710 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 19:21:21 -0400 Subject: six locks: Single instance of six_lock_vals Since we're not generating different versions of the lock functions for each lock type, the constant propagation we were trying to do before is no longer useful - this is now a small code size decrease. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 266ee5d95479..ff5d0506662e 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -70,26 +70,26 @@ struct six_lock_vals { enum six_lock_type unlock_wakeup; }; -#define LOCK_VALS { \ - [SIX_LOCK_read] = { \ - .lock_val = 1ULL << SIX_STATE_READ_OFFSET, \ - .lock_fail = SIX_LOCK_HELD_write, \ - .held_mask = SIX_LOCK_HELD_read, \ - .unlock_wakeup = SIX_LOCK_write, \ - }, \ - [SIX_LOCK_intent] = { \ - .lock_val = SIX_STATE_INTENT_HELD, \ - .lock_fail = SIX_LOCK_HELD_intent, \ - .held_mask = SIX_LOCK_HELD_intent, \ - .unlock_wakeup = SIX_LOCK_intent, \ - }, \ - [SIX_LOCK_write] = { \ - .lock_val = SIX_LOCK_HELD_write, \ - .lock_fail = SIX_LOCK_HELD_read, \ - .held_mask = SIX_LOCK_HELD_write, \ - .unlock_wakeup = SIX_LOCK_read, \ - }, \ -} +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1ULL << SIX_STATE_READ_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_STATE_INTENT_HELD, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + .held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; static inline u32 six_state_seq(u64 state) { @@ -144,7 +144,6 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, struct task_struct *task, bool try) { - const struct six_lock_vals l[] = LOCK_VALS; int ret; u64 old, new, v; @@ -625,7 +624,6 @@ EXPORT_SYMBOL_GPL(six_lock_ip_waiter); __always_inline static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { - const struct six_lock_vals l[] = LOCK_VALS; u64 state; if (type == SIX_LOCK_intent) @@ -712,7 +710,6 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade); */ bool six_lock_tryupgrade(struct six_lock 
*lock) { - const struct six_lock_vals l[] = LOCK_VALS; u64 old, new, v = atomic64_read(&lock->state); do { @@ -780,8 +777,6 @@ EXPORT_SYMBOL_GPL(six_trylock_convert); */ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { - const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); /* XXX: assert already locked, and that we don't overflow: */ -- cgit From 2804d0f15bda386fa86c7b6b432edb758e785bce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 May 2023 00:17:40 -0400 Subject: six locks: Split out seq, use atomic_t instead of atomic64_t Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 124 ++++++++++++++++++++++++------------------------------ fs/bcachefs/six.h | 5 ++- 2 files changed, 58 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index ff5d0506662e..54e4aa35a350 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -32,25 +32,18 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); * bits 29-30 has read waiters * bits 30-31 has intent waiters * bits 31-32 has write waiters - * bits 32-64 sequence number: incremented on every write lock or - * unlock, thus bit 33 (sequence number odd) indicates - * lock is currently held for write */ #define SIX_STATE_READ_OFFSET 0 #define SIX_STATE_READ_BITS 26 -#define SIX_STATE_READ_LOCK ~(~0ULL << 26) -#define SIX_STATE_INTENT_HELD (1ULL << 26) -#define SIX_STATE_WRITE_LOCK (1ULL << 27) -#define SIX_STATE_NOSPIN (1ULL << 28) -#define SIX_STATE_WAITING_READ (1ULL << (29 + SIX_LOCK_read)) -#define SIX_STATE_WAITING_INTENT (1ULL << (29 + SIX_LOCK_intent)) -#define SIX_STATE_WAITING_WRITE (1ULL << (29 + SIX_LOCK_write)) - -#define SIX_STATE_SEQ_OFFSET 32 -#define SIX_STATE_SEQ_BITS 32 -#define SIX_STATE_SEQ (~0ULL << 32) +#define SIX_STATE_READ_LOCK ~(~0U << 26) +#define SIX_STATE_INTENT_HELD (1U << 26) +#define SIX_STATE_WRITE_LOCK (1U << 27) +#define SIX_STATE_NOSPIN (1U << 28) +#define SIX_STATE_WAITING_READ (1U << (29 + SIX_LOCK_read)) +#define SIX_STATE_WAITING_INTENT (1U << (29 + SIX_LOCK_intent)) +#define SIX_STATE_WAITING_WRITE (1U << (29 + SIX_LOCK_write)) #define SIX_LOCK_HELD_read SIX_STATE_READ_LOCK #define SIX_LOCK_HELD_intent SIX_STATE_INTENT_HELD @@ -58,13 +51,13 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ - u64 lock_val; + u32 lock_val; /* If the lock has this value (used as a mask), taking the lock fails: */ - u64 lock_fail; + u32 lock_fail; /* Mask that indicates lock is held for this type: */ - u64 held_mask; + u32 held_mask; /* Waitlist we wakeup when releasing the lock: */ enum six_lock_type unlock_wakeup; @@ -72,7 +65,7 @@ struct six_lock_vals { static const struct six_lock_vals l[] = { [SIX_LOCK_read] = { - .lock_val = 1ULL << SIX_STATE_READ_OFFSET, + .lock_val = 1U << SIX_STATE_READ_OFFSET, .lock_fail = SIX_LOCK_HELD_write, .held_mask = SIX_LOCK_HELD_read, .unlock_wakeup = SIX_LOCK_write, @@ -91,25 +84,20 @@ static const struct six_lock_vals l[] = { }, }; -static inline u32 six_state_seq(u64 state) -{ - return state >> SIX_STATE_SEQ_OFFSET; -} - -static inline void six_set_bitmask(struct six_lock *lock, u64 mask) +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) { - if ((atomic64_read(&lock->state) & mask) != mask) - atomic64_or(mask, &lock->state); + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, 
&lock->state); } -static inline void six_clear_bitmask(struct six_lock *lock, u64 mask) +static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) { - if (atomic64_read(&lock->state) & mask) - atomic64_and(~mask, &lock->state); + if (atomic_read(&lock->state) & mask) + atomic_and(~mask, &lock->state); } static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - u64 old, struct task_struct *owner) + u32 old, struct task_struct *owner) { if (type != SIX_LOCK_intent) return; @@ -145,13 +133,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, struct task_struct *task, bool try) { int ret; - u64 old, new, v; + u32 old, new, v; EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic64_read(&lock->state) & SIX_LOCK_HELD_write))); - EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK))); + (try != !(atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK))); /* * Percpu reader mode: @@ -186,7 +172,7 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, smp_mb(); - old = atomic64_read(&lock->state); + old = atomic_read(&lock->state); ret = !(old & l[type].lock_fail); this_cpu_sub(*lock->readers, !ret); @@ -196,19 +182,19 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(SIX_STATE_WRITE_LOCK, &lock->state); + atomic_add(SIX_STATE_WRITE_LOCK, &lock->state); smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); if (try && !ret) { - old = atomic64_sub_return(SIX_STATE_WRITE_LOCK, &lock->state); + old = atomic_sub_return(SIX_STATE_WRITE_LOCK, &lock->state); if (old & SIX_STATE_WAITING_READ) ret = -1 - SIX_LOCK_read; } } else { - v = atomic64_read(&lock->state); + v = atomic_read(&lock->state); do { new = old = v; @@ -218,16 +204,16 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, break; new += l[type].lock_val; - } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); + } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); - EBUG_ON(ret && !(atomic64_read(&lock->state) & l[type].held_mask)); + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); } if (ret > 0) six_set_owner(lock, type, old, task); EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK)); + (atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK)); return ret; } @@ -277,7 +263,7 @@ unlock: } __always_inline -static void six_lock_wakeup(struct six_lock *lock, u64 state, +static void six_lock_wakeup(struct six_lock *lock, u32 state, enum six_lock_type lock_type) { if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) @@ -317,7 +303,8 @@ bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned lon if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); else - atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); + lock->seq++; + return true; } EXPORT_SYMBOL_GPL(six_trylock_ip); @@ -482,12 +469,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - u64 old; + u32 old; int ret = 0; if (type == SIX_LOCK_write) { - EBUG_ON(atomic64_read(&lock->state) & SIX_STATE_WRITE_LOCK); - atomic64_add(SIX_STATE_WRITE_LOCK, &lock->state); + 
EBUG_ON(atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK); + atomic_add(SIX_STATE_WRITE_LOCK, &lock->state); smp_mb__after_atomic(); } @@ -609,8 +596,7 @@ int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ret = do_six_trylock(lock, type, true) ? 0 : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - if (!ret && type == SIX_LOCK_write) - atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); + lock->seq += !ret && type == SIX_LOCK_write; if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map, ip); @@ -624,7 +610,7 @@ EXPORT_SYMBOL_GPL(six_lock_ip_waiter); __always_inline static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { - u64 state; + u32 state; if (type == SIX_LOCK_intent) lock->owner = NULL; @@ -634,15 +620,15 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* unlock barrier */ this_cpu_dec(*lock->readers); smp_mb(); /* between unlocking and checking for waiters */ - state = atomic64_read(&lock->state); + state = atomic_read(&lock->state); } else { - u64 v = l[type].lock_val; + u32 v = l[type].lock_val; if (type != SIX_LOCK_read) - v += atomic64_read(&lock->state) & SIX_STATE_NOSPIN; + v += atomic_read(&lock->state) & SIX_STATE_NOSPIN; - EBUG_ON(!(atomic64_read(&lock->state) & l[type].held_mask)); - state = atomic64_sub_return_release(v, &lock->state); + EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); + state = atomic_sub_return_release(v, &lock->state); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); @@ -666,7 +652,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && - !(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent)); + !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); EBUG_ON((type == SIX_LOCK_write || type == SIX_LOCK_intent) && lock->owner != current); @@ -674,7 +660,7 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) six_release(&lock->dep_map, ip); else - atomic64_add(1ULL << SIX_STATE_SEQ_OFFSET, &lock->state); + lock->seq++; if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -710,7 +696,7 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade); */ bool six_lock_tryupgrade(struct six_lock *lock) { - u64 old, new, v = atomic64_read(&lock->state); + u32 old, new, v = atomic_read(&lock->state); do { new = old = v; @@ -724,7 +710,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) } new |= SIX_LOCK_HELD_intent; - } while ((v = atomic64_cmpxchg_acquire(&lock->state, old, new)) != old); + } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); if (lock->readers) this_cpu_dec(*lock->readers); @@ -786,14 +772,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) if (lock->readers) { this_cpu_inc(*lock->readers); } else { - EBUG_ON(!(atomic64_read(&lock->state) & + EBUG_ON(!(atomic_read(&lock->state) & (SIX_LOCK_HELD_read| SIX_LOCK_HELD_intent))); - atomic64_add(l[type].lock_val, &lock->state); + atomic_add(l[type].lock_val, &lock->state); } break; case SIX_LOCK_intent: - EBUG_ON(!(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent)); + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; case SIX_LOCK_write: @@ -815,7 +801,7 @@ EXPORT_SYMBOL_GPL(six_lock_increment); */ void six_lock_wakeup_all(struct six_lock *lock) { - u64 state = 
atomic64_read(&lock->state); + u32 state = atomic_read(&lock->state); struct six_lock_waiter *w; six_lock_wakeup(lock, state, SIX_LOCK_read); @@ -840,11 +826,11 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) struct six_lock_count ret; ret.n[SIX_LOCK_read] = !lock->readers - ? atomic64_read(&lock->state) & SIX_STATE_READ_LOCK + ? atomic_read(&lock->state) & SIX_STATE_READ_LOCK : pcpu_read_count(lock); - ret.n[SIX_LOCK_intent] = !!(atomic64_read(&lock->state) & SIX_LOCK_HELD_intent) + + ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + lock->intent_lock_recurse; - ret.n[SIX_LOCK_write] = !!(atomic64_read(&lock->state) & SIX_LOCK_HELD_write); + ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); return ret; } @@ -875,9 +861,9 @@ void six_lock_readers_add(struct six_lock *lock, int nr) if (lock->readers) { this_cpu_add(*lock->readers, nr); } else { - EBUG_ON((int) (atomic64_read(&lock->state) & SIX_STATE_READ_LOCK) + nr < 0); + EBUG_ON((int) (atomic_read(&lock->state) & SIX_STATE_READ_LOCK) + nr < 0); /* reader count starts at bit 0 */ - atomic64_add(nr, &lock->state); + atomic_add(nr, &lock->state); } } EXPORT_SYMBOL_GPL(six_lock_readers_add); @@ -892,7 +878,7 @@ EXPORT_SYMBOL_GPL(six_lock_readers_add); void six_lock_exit(struct six_lock *lock) { WARN_ON(lock->readers && pcpu_read_count(lock)); - WARN_ON(atomic64_read(&lock->state) & SIX_LOCK_HELD_read); + WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); free_percpu(lock->readers); lock->readers = NULL; @@ -902,7 +888,7 @@ EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, struct lock_class_key *key, enum six_lock_init_flags flags) { - atomic64_set(&lock->state, 0); + atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 82bf9de72490..4c268b0b8316 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -138,7 +138,8 @@ enum six_lock_type { }; struct six_lock { - atomic64_t state; + atomic_t state; + u32 seq; unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; @@ -196,7 +197,7 @@ do { \ */ static inline u32 six_lock_seq(const struct six_lock *lock) { - return atomic64_read(&lock->state) >> 32; + return lock->seq; } bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -- cgit From 32913f49f54f0cf9ccf581e3abd2d1fc6ba4debf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 15:56:42 -0400 Subject: six locks: Seq now only incremented on unlock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 - fs/bcachefs/btree_iter.h | 9 +-------- fs/bcachefs/btree_locking.h | 4 ++-- fs/bcachefs/six.c | 5 ----- 4 files changed, 3 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3e65e6876ec7..f0d0b64a55a4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -652,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans, BUG_ON(path->cached); EBUG_ON(!btree_path_pos_in_node(path, b)); - EBUG_ON(six_lock_seq(&b->c.lock) & 1); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7d3564d72a7d..0cfb8af3d0e1 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -42,14 +42,7 @@ static inline struct 
btree *btree_path_node(struct btree_path *path, static inline bool btree_node_lock_seq_matches(const struct btree_path *path, const struct btree *b, unsigned level) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @path might have taken a write lock on @b, and we don't want to skip - * the linked path if the sequence numbers were equal before taking that - * write lock. The lock sequence number is incremented by taking and - * releasing write locks and is even when unlocked: - */ - return path->l[level].lock_seq >> 1 == six_lock_seq(&b->c.lock) >> 1; + return path->l[level].lock_seq == six_lock_seq(&b->c.lock); } static inline struct btree *btree_node_parent(struct btree_path *path, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index f9bb8736c061..d3837c25f110 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -175,13 +175,13 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree_path *linked; EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != six_lock_seq(&b->c.lock)); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; + linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 54e4aa35a350..8ce0998b9775 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -302,9 +302,6 @@ bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned lon if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - else - lock->seq++; - return true; } EXPORT_SYMBOL_GPL(six_trylock_ip); @@ -596,8 +593,6 @@ int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ret = do_six_trylock(lock, type, true) ? 
0 : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - lock->seq += !ret && type == SIX_LOCK_write; - if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map, ip); if (!ret) -- cgit From 2ab62310fd1c723bd8ab8e8242e31fa494c9681f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 May 2023 12:11:13 -0400 Subject: six locks: Tiny bit more tidying Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 64 ++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 8ce0998b9775..6b6c64031994 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -34,20 +34,14 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); * bits 31-32 has write waiters */ -#define SIX_STATE_READ_OFFSET 0 -#define SIX_STATE_READ_BITS 26 - -#define SIX_STATE_READ_LOCK ~(~0U << 26) -#define SIX_STATE_INTENT_HELD (1U << 26) -#define SIX_STATE_WRITE_LOCK (1U << 27) -#define SIX_STATE_NOSPIN (1U << 28) -#define SIX_STATE_WAITING_READ (1U << (29 + SIX_LOCK_read)) -#define SIX_STATE_WAITING_INTENT (1U << (29 + SIX_LOCK_intent)) -#define SIX_STATE_WAITING_WRITE (1U << (29 + SIX_LOCK_write)) - -#define SIX_LOCK_HELD_read SIX_STATE_READ_LOCK -#define SIX_LOCK_HELD_intent SIX_STATE_INTENT_HELD -#define SIX_LOCK_HELD_write SIX_STATE_WRITE_LOCK +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ @@ -65,13 +59,13 @@ struct six_lock_vals { static const struct six_lock_vals l[] = { [SIX_LOCK_read] = { - .lock_val = 1U << SIX_STATE_READ_OFFSET, + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, .lock_fail = SIX_LOCK_HELD_write, .held_mask = SIX_LOCK_HELD_read, .unlock_wakeup = SIX_LOCK_write, }, [SIX_LOCK_intent] = { - .lock_val = SIX_STATE_INTENT_HELD, + .lock_val = SIX_LOCK_HELD_intent, .lock_fail = SIX_LOCK_HELD_intent, .held_mask = SIX_LOCK_HELD_intent, .unlock_wakeup = SIX_LOCK_intent, @@ -137,7 +131,7 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK))); + (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); /* * Percpu reader mode: @@ -178,19 +172,19 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_STATE_WAITING_WRITE)) + if (!ret && (old & SIX_LOCK_WAITING_write)) ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic_add(SIX_STATE_WRITE_LOCK, &lock->state); + atomic_add(SIX_LOCK_HELD_write, &lock->state); smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); if (try && !ret) { - old = atomic_sub_return(SIX_STATE_WRITE_LOCK, &lock->state); - if (old & SIX_STATE_WAITING_READ) + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) ret = -1 - SIX_LOCK_read; } } else { @@ -200,8 +194,10 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = !(old & 
l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); break; + } new += l[type].lock_val; } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); @@ -213,7 +209,7 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, six_set_owner(lock, type, old, task); EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK)); + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); return ret; } @@ -252,7 +248,7 @@ again: wake_up_process(task); } - six_clear_bitmask(lock, SIX_STATE_WAITING_READ << lock_type); + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); unlock: raw_spin_unlock(&lock->wait_lock); @@ -269,7 +265,7 @@ static void six_lock_wakeup(struct six_lock *lock, u32 state, if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) return; - if (!(state & (SIX_STATE_WAITING_READ << lock_type))) + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) return; __six_lock_wakeup(lock, lock_type); @@ -372,7 +368,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock, } if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_STATE_NOSPIN); + six_set_bitmask(lock, SIX_LOCK_NOSPIN); ret = false; break; } @@ -470,8 +466,8 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, int ret = 0; if (type == SIX_LOCK_write) { - EBUG_ON(atomic_read(&lock->state) & SIX_STATE_WRITE_LOCK); - atomic_add(SIX_STATE_WRITE_LOCK, &lock->state); + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); smp_mb__after_atomic(); } @@ -485,7 +481,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, wait->lock_acquired = false; raw_spin_lock(&lock->wait_lock); - six_set_bitmask(lock, SIX_STATE_WAITING_READ << type); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); /* * Retry taking the lock after taking waitlist lock, in case we raced * with an unlock: @@ -530,7 +526,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); - if (wait->lock_acquired) + if (unlikely(wait->lock_acquired)) do_six_unlock_type(lock, type); break; } @@ -541,7 +537,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, __set_current_state(TASK_RUNNING); out: if (ret && type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_STATE_WRITE_LOCK); + six_clear_bitmask(lock, SIX_LOCK_HELD_write); six_lock_wakeup(lock, old, SIX_LOCK_read); } @@ -620,7 +616,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) u32 v = l[type].lock_val; if (type != SIX_LOCK_read) - v += atomic_read(&lock->state) & SIX_STATE_NOSPIN; + v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); state = atomic_sub_return_release(v, &lock->state); @@ -821,7 +817,7 @@ struct six_lock_count six_lock_counts(struct six_lock *lock) struct six_lock_count ret; ret.n[SIX_LOCK_read] = !lock->readers - ? atomic_read(&lock->state) & SIX_STATE_READ_LOCK + ? 
atomic_read(&lock->state) & SIX_LOCK_HELD_read : pcpu_read_count(lock); ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + lock->intent_lock_recurse; @@ -856,7 +852,7 @@ void six_lock_readers_add(struct six_lock *lock, int nr) if (lock->readers) { this_cpu_add(*lock->readers, nr); } else { - EBUG_ON((int) (atomic_read(&lock->state) & SIX_STATE_READ_LOCK) + nr < 0); + EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); /* reader count starts at bit 0 */ atomic_add(nr, &lock->state); } -- cgit From 96e53e909d0433f73831315b106f16895a74b843 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 May 2023 00:21:22 -0400 Subject: six locks: Delete redundant comment Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 6b6c64031994..fa508ab2108c 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -23,17 +23,6 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); -/* - * bits 0-26 reader count - * bits 26-27 write_locking (a thread is trying to get a write lock, - * but does not have one yet) - * bits 27-28 held for intent - * bits 28-29 nospin - optimistic spinning has timed out - * bits 29-30 has read waiters - * bits 30-31 has intent waiters - * bits 31-32 has write waiters - */ - #define SIX_LOCK_HELD_read_OFFSET 0 #define SIX_LOCK_HELD_read ~(~0U << 26) #define SIX_LOCK_HELD_intent (1U << 26) -- cgit From c4687a4a7536c3b2139faa63e66afd1d3da5bf15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 May 2023 18:10:04 -0400 Subject: six locks: Fix an uninitialized var In the conversion to atomic_t, six_lock_slowpath() ended up calling six_lock_wakeup() in the failure path with a state variable that was never initialized - whoops. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index fa508ab2108c..9a5fcd7d3566 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -451,7 +451,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - u32 old; int ret = 0; if (type == SIX_LOCK_write) { @@ -527,7 +526,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, out: if (ret && type == SIX_LOCK_write) { six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, old, SIX_LOCK_read); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); } return ret; -- cgit From 2d9200cfe004b8208a04ea15b3967f9a14135be6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 May 2023 14:35:06 -0400 Subject: six locks: Use atomic_try_cmpxchg_acquire() This switches to a newer cmpxchg variant which updates @old for us on failure, simplifying the cmpxchg loops a bit and supposedly generating better code.
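For orientation, a rough sketch of the two idioms (illustrative only, not code from this patch; it assumes a plain atomic_t and a made-up FLAG bit). With atomic_cmpxchg_acquire() the caller has to reload the old value by hand whenever the exchange fails; atomic_try_cmpxchg_acquire() returns a bool and writes the value it actually observed back through the @old pointer, so the retry loop loses a step:

	#include <linux/atomic.h>

	#define FLAG	(1U << 0)	/* made-up example bit */

	static void set_flag_cmpxchg(atomic_t *state)
	{
		int old, new, v = atomic_read(state);

		do {
			old = new = v;
			new |= FLAG;
		} while ((v = atomic_cmpxchg_acquire(state, old, new)) != old);
	}

	static void set_flag_try_cmpxchg(atomic_t *state)
	{
		int old = atomic_read(state);

		/* on failure, 'old' is refreshed with the current value for us */
		do {
		} while (!atomic_try_cmpxchg_acquire(state, &old, old | FLAG));
	}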
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 9a5fcd7d3566..00580f8ff1c5 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -116,7 +116,7 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, struct task_struct *task, bool try) { int ret; - u32 old, new, v; + u32 old; EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == SIX_LOCK_write && @@ -177,19 +177,14 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_read; } } else { - v = atomic_read(&lock->state); + old = atomic_read(&lock->state); do { - new = old = v; - ret = !(old & l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) { smp_mb(); break; } - - new += l[type].lock_val; - } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); } @@ -675,10 +670,10 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade); */ bool six_lock_tryupgrade(struct six_lock *lock) { - u32 old, new, v = atomic_read(&lock->state); + u32 old = atomic_read(&lock->state), new; do { - new = old = v; + new = old; if (new & SIX_LOCK_HELD_intent) return false; @@ -689,7 +684,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) } new |= SIX_LOCK_HELD_intent; - } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); if (lock->readers) this_cpu_dec(*lock->readers); -- cgit From aab5e0972a32790c0dbfac64929529820114c674 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Jun 2023 10:57:23 -0400 Subject: six locks: Disable percpu read lock mode in userspace When running in userspace, we currently don't have a real percpu implementation available - at least in bcachefs-tools, which is where this code is currently used in userspace. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 00580f8ff1c5..7ce45aeaee8d 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -870,6 +870,11 @@ void __six_lock_init(struct six_lock *lock, const char *name, lockdep_init_map(&lock->dep_map, name, key, 0); #endif + /* + * Don't assume that we have real percpu variables available in + * userspace: + */ +#ifdef __KERNEL__ if (flags & SIX_LOCK_INIT_PCPU) { /* * We don't return an error here on memory allocation failure @@ -880,5 +885,6 @@ void __six_lock_init(struct six_lock *lock, const char *name, */ lock->readers = alloc_percpu(unsigned); } +#endif } EXPORT_SYMBOL_GPL(__six_lock_init); -- cgit From 65bc41090720cdc249c1b0b9b9b8a8f062b41268 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 May 2023 22:22:25 -0400 Subject: mean and variance: More tests Add some more tests that test conventional and weighted mean simultaneously, and with a table of values that represents events that we'll be using this to look for so we can verify-by-eyeball that the output looks sane. 
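A worked check of the numbers the basic test below expects, using the n/sum/sum_squares bookkeeping visible in mean_and_variance_update() - an illustration, not part of the patch. For the inputs 2, 2, 4, 4: n = 4, sum = 12, sum_squares = 40, so mean = 12/4 = 3 and, by the usual E[x^2] - E[x]^2 identity, variance = 40/4 - 3*3 = 1, matching the KUNIT_EXPECT_EQ values. A minimal userspace sketch of that bookkeeping follows; it is simplified in that it uses a long long where the kernel code uses a u128 for sum_squares:

	#include <stdio.h>

	struct mv { long long n, sum, sum_squares; };

	static void mv_update(struct mv *s, long long v)
	{
		s->n++;
		s->sum += v;
		s->sum_squares += v * v;
	}

	int main(void)
	{
		struct mv s = { 0, 0, 0 };
		long long data[] = { 2, 2, 4, 4 }, mean;

		for (int i = 0; i < 4; i++)
			mv_update(&s, data[i]);

		mean = s.sum / s.n;
		/* prints: mean 3 variance 1 */
		printf("mean %lld variance %lld\n",
		       mean, s.sum_squares / s.n - mean * mean);
		return 0;
	}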
Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance.h | 14 +++-- fs/bcachefs/mean_and_variance_test.c | 101 ++++++++++++++++++++++++++++++++--- fs/bcachefs/util.c | 4 +- 3 files changed, 102 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 880e9501c614..6dd4c050e78a 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -176,14 +176,12 @@ static inline s64 fast_divpow2(s64 n, u8 d) * * see linked pdf equation 12. */ -static inline struct mean_and_variance -mean_and_variance_update(struct mean_and_variance s, s64 v) -{ - return (struct mean_and_variance) { - .n = s.n + 1, - .sum = s.sum + v, - .sum_squares = u128_add(s.sum_squares, u128_square(abs(v))), - }; +static inline void +mean_and_variance_update(struct mean_and_variance *s, s64 v) +{ + s->n++; + s->sum += v; + s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); } s64 mean_and_variance_get_mean(struct mean_and_variance s); diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 2b4cf9b1781b..019583c3ca0e 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -9,15 +9,15 @@ static void mean_and_variance_basic_test(struct kunit *test) { struct mean_and_variance s = {}; - s = mean_and_variance_update(s, 2); - s = mean_and_variance_update(s, 2); + mean_and_variance_update(&s, 2); + mean_and_variance_update(&s, 2); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); KUNIT_EXPECT_EQ(test, s.n, 2); - s = mean_and_variance_update(s, 4); - s = mean_and_variance_update(s, 4); + mean_and_variance_update(&s, 4); + mean_and_variance_update(&s, 4); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); @@ -33,8 +33,6 @@ static void mean_and_variance_weighted_test(struct kunit *test) { struct mean_and_variance_weighted s = { .weight = 2 }; - s.weight = 2; - mean_and_variance_weighted_update(&s, 10); KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); @@ -60,7 +58,6 @@ static void mean_and_variance_weighted_test(struct kunit *test) mean_and_variance_weighted_update(&s, -30); KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); - } static void mean_and_variance_weighted_advanced_test(struct kunit *test) @@ -81,7 +78,93 @@ static void mean_and_variance_weighted_advanced_test(struct kunit *test) KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); +} + +static void do_mean_and_variance_test(struct kunit *test, + s64 initial_value, + s64 initial_n, + s64 n, + unsigned weight, + s64 *data, + s64 *mean, + s64 *stddev, + s64 *weighted_mean, + s64 *weighted_stddev) +{ + struct mean_and_variance mv = {}; + struct mean_and_variance_weighted vw = { .weight = weight }; + + for (unsigned i = 0; i < initial_n; i++) { + mean_and_variance_update(&mv, initial_value); + mean_and_variance_weighted_update(&vw, initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); + KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_stddev(vw),0); + } + + for (unsigned i = 0; i < n; i++) { + mean_and_variance_update(&mv, data[i]); + mean_and_variance_weighted_update(&vw, data[i]); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + } + + KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); +} + +/* Test behaviour with a single outlier, then back to steady state: */ +static void mean_and_variance_test_1(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; + s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_2(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; + s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +/* Test behaviour where we switch from one steady state to another: */ +static void mean_and_variance_test_3(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 22, 32, 40, 46, 50 }; + s64 stddev[] = { 32, 39, 42, 44, 45 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_4(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 10, 11, 12, 13, 14 }; + s64 stddev[] = { 9, 13, 15, 17, 19 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); } static void mean_and_variance_fast_divpow2(struct kunit *test) @@ -139,6 +222,10 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_basic_test), KUNIT_CASE(mean_and_variance_weighted_test), KUNIT_CASE(mean_and_variance_weighted_advanced_test), + KUNIT_CASE(mean_and_variance_test_1), + KUNIT_CASE(mean_and_variance_test_2), + KUNIT_CASE(mean_and_variance_test_3), + KUNIT_CASE(mean_and_variance_test_4), {} }; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e0c93da2523f..6374d8aa9afc 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -350,7 +350,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, if (time_after64(end, start)) { duration = end - start; - stats->duration_stats = mean_and_variance_update(stats->duration_stats, duration); + mean_and_variance_update(&stats->duration_stats, duration); mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); @@ -359,7 +359,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, if (time_after64(end, 
stats->last_event)) { freq = end - stats->last_event; - stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); + mean_and_variance_update(&stats->freq_stats, freq); mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = min(stats->min_freq, freq); -- cgit From db32bb9a5fd6bd7c7031b4b9d6c9a5e27b651e5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Jun 2023 17:58:56 -0400 Subject: mean and variance: Add a missing include abs() is in math.h Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 6dd4c050e78a..647505010b39 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -4,6 +4,7 @@ #include #include +#include #include #define SQRT_U64_MAX 4294967295ULL -- cgit From fc0ee376bb5b08844198fba13fb809102afd0b29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 May 2023 23:37:06 -0400 Subject: bcachefs: Don't reuse reflink btree keyspace We've been seeing difficult to debug "missing indirect extent" bugs, that fsck doesn't seem to find. One possibility is that there was a missing indirect extent, but then a new indirect extent was created at the location of the previous indirect extent. This patch eliminates that possibility by always creating new indirect extents right after the last one, at the end of the reflink btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/reflink.c | 20 ++++---------------- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 39fd15447753..0dfa42e297e0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -963,7 +963,6 @@ struct bch_fs { struct bio_set ec_bioset; /* REFLINK */ - u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 9430899a5a31..26f0275ff0af 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -167,24 +167,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, - POS(0, c->reflink_hint), - BTREE_ITER_SLOTS, k, ret) { - if (reflink_iter.pos.inode) { - bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); - continue; - } - - if (bkey_deleted(k.k) && orig->k.size <= k.k->size) - break; - } - + bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_prev(&reflink_iter); + ret = bkey_err(k); if (ret) goto err; - /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -226,7 +215,6 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); err: - c->reflink_hint = reflink_iter.pos.offset; bch2_trans_iter_exit(trans, &reflink_iter); return ret; -- cgit From c26463ce9940d150dfeaac0a1a0747db9b1ca600 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 May 2023 18:12:55 -0400 Subject: bcachefs: Fix move_extent_fail counter fail 
counters need to be events, not numbers of sectors - or the calculations the tests use for determining if we've had too many slowpath events don't work. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c709538ce9c2..ae7e60d6e583 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -374,7 +374,7 @@ nowork: &m->ctxt->stats->sectors_raced); } - this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); bch2_btree_iter_advance(&iter); goto next; -- cgit From e7ffda565a762a6bdf782b4978af5ccfe4ab5d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 May 2023 20:00:13 -0400 Subject: bcachefs: Fix a quota read bug bch2_fs_quota_read() could see an inode that's been deleted (KEY_TYPE_inode_generation) - bch2_fs_quota_read_inode() needs to check for that instead of erroring. Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 310eb9d26571..56ba82cae19d 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -576,6 +576,13 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, le32_to_cpu(s_t.master_subvol), k.k->p.offset, }, &u); + /* + * Inode might be deleted in this snapshot - the easiest way to handle + * that is to just skip it here: + */ + if (bch2_err_matches(ret, ENOENT)) + goto advance; + if (ret) return ret; @@ -615,7 +622,7 @@ int bch2_fs_quota_read(struct bch_fs *c) POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); if (ret) - bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); + bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); return ret; -- cgit From f154c3eb429a340d66a06e8f8d2221d28d25ab45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 May 2023 19:55:54 -0400 Subject: bcachefs: trans_for_each_path_safe() bch2_btree_trans_to_text() is used on btree_trans objects that are owned by different threads - when printing out deadlock cycles - so we need a safe version of trans_for_each_path(), else we race with seeing a btree_path that was just allocated and not fully initialized: Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++++-- fs/bcachefs/btree_iter.h | 29 +++++++++++++++++++++++++++++ fs/bcachefs/btree_locking.c | 7 ++++--- 3 files changed, 39 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f0d0b64a55a4..4830d203b37b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2914,6 +2914,10 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) #endif if (!p) p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at paths in + * other threads + */ trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; @@ -3111,7 +3115,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; - unsigned l; + unsigned l, idx; if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); @@ -3120,7 +3124,7 @@ void bch2_btree_trans_to_text(struct printbuf 
*out, struct btree_trans *trans) prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); - trans_for_each_path(trans, path) { + trans_for_each_path_safe(trans, path, idx) { if (!path->nodes_locked) continue; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0cfb8af3d0e1..9a4dbf358fe5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -89,6 +89,35 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) #define trans_for_each_path(_trans, _path) \ trans_for_each_path_from(_trans, _path, 0) +static inline struct btree_path * +__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +{ + u64 l; + + if (*idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> *idx; + if (!l) + return NULL; + + *idx += __ffs64(l); + EBUG_ON(*idx >= BTREE_ITER_MAX); + return &trans->paths[*idx]; +} + +/* + * This version is intended to be safe for use on a btree_trans that is owned by + * another thread, for bch2_btree_trans_to_text(); + */ +#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ + for (_idx = _start; \ + (_path = __trans_next_path_safe((_trans), &_idx)); \ + _idx++) + +#define trans_for_each_path_safe(_trans, _path, _idx) \ + trans_for_each_path_safe_from(_trans, _path, _idx, 0) + static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { unsigned idx = path ? path->sorted_idx + 1 : 0; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 6e1306add443..1f4eca898ab7 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -255,6 +255,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; struct btree_path *path; + unsigned path_idx; int ret; if (trans->lock_must_abort) { @@ -273,12 +274,12 @@ next: top = &g.g[g.nr - 1]; - trans_for_each_path_from(top->trans, path, top->path_idx) { + trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { if (!path->nodes_locked) continue; - if (top->path_idx != path->idx) { - top->path_idx = path->idx; + if (path_idx != top->path_idx) { + top->path_idx = path_idx; top->level = 0; top->lock_start_time = 0; } -- cgit From e47a390aa5946e3c5bea7a4a350a88d3bb3ba5b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 May 2023 19:59:59 -0400 Subject: bcachefs: Convert -ENOENT to private error codes As with previous conversions, replace -ENOENT uses with more informative private error codes. 
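A rough mock-up of the idea, for orientation only - this is not the actual bcachefs errcode machinery. Private codes live outside the range of the standard errnos and each one remembers which errno it is a flavour of, so a call site can return something precise like -BCH_ERR_ENOENT_inode while callers test bch2_err_matches(ret, ENOENT) instead of comparing against -ENOENT directly:

	#include <errno.h>
	#include <stdbool.h>

	/* hypothetical private codes, well above the standard errno values */
	enum {
		ERR_START = 2048,
		ERR_ENOENT_inode,
		ERR_ENOENT_subvolume,
	};

	/* map a private code back to the standard errno "class" it refines */
	static int err_class(int err)
	{
		switch (err) {
		case ERR_ENOENT_inode:
		case ERR_ENOENT_subvolume:
			return ENOENT;
		default:
			return err;
		}
	}

	/* true for both -ENOENT and any private ENOENT flavour */
	static bool err_matches(int ret, int class)
	{
		return ret < 0 && err_class(-ret) == class;
	}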
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 6 +++--- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/errcode.h | 11 +++++++++++ fs/bcachefs/fs-common.c | 2 +- fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/fs.c | 6 +++--- fs/bcachefs/fsck.c | 24 ++++++++++++------------ fs/bcachefs/inode.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/quota.c | 4 ++-- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/str_hash.h | 4 ++-- fs/bcachefs/subvolume.c | 21 +++++++++++++-------- fs/bcachefs/super.c | 2 +- fs/bcachefs/xattr.c | 6 +++--- 18 files changed, 59 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 2bf58aa89f71..3fe108bc2f08 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -236,7 +236,7 @@ retry: if (ret) { if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - if (ret != -ENOENT) + if (!bch2_err_matches(ret, ENOENT)) acl = ERR_PTR(ret); goto out; } @@ -287,7 +287,7 @@ int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, inum, &search); } - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 0 : ret; } int bch2_set_acl(struct mnt_idmap *idmap, @@ -368,7 +368,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (ret) - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 0 : ret; k = bch2_btree_iter_peek_slot(&iter); xattr = bkey_s_c_to_xattr(k); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9a4dbf358fe5..5e5e2a5c715c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -516,7 +516,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, k = bch2_btree_iter_peek_slot(iter); if (!bkey_err(k) && type && k.k->type != type) - k = bkey_s_c_err(-ENOENT); + k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); if (unlikely(bkey_err(k))) bch2_trans_iter_exit(trans, iter); return k; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bd144182c1e1..adf3bd0e4a8f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1455,7 +1455,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, BTREE_ITER_WITH_UPDATES, stripe); ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { - bch2_trans_inconsistent_on(ret == -ENOENT, trans, + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); goto err; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 28854a6c31b9..fb603df099a5 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -578,7 +578,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, return i; } - return -ENOENT; + return -BCH_ERR_ENOENT_dev_idx_not_found; } static long bch2_ioctl_disk_resize(struct bch_fs *c, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c8ac08e5548b..acf9b92f9ab0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -94,6 +94,17 @@ x(ENOSPC, ENOSPC_sb_crypt) \ x(ENOSPC, ENOSPC_btree_slot) \ x(ENOSPC, ENOSPC_snapshot_tree) \ + x(ENOENT, ENOENT_bkey_type_mismatch) \ + x(ENOENT, ENOENT_str_hash_lookup) \ + x(ENOENT, ENOENT_str_hash_set_must_replace) \ + x(ENOENT, ENOENT_inode) \ + x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_directory_dead) \ + x(ENOENT, ENOENT_subvolume) \ + x(ENOENT, ENOENT_snapshot_tree) \ + x(ENOENT, 
ENOENT_dirent_doesnt_match_inode) \ + x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_idx_not_found) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1f2e1fc4f6b2..bb5305441f27 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -281,7 +281,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_snapshot && !inode_u->bi_subvol) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_not_subvol; goto err; } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 269af9393824..dfa1bf73c854 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -382,7 +382,7 @@ retry: dir = dst_path.dentry->d_inode; if (IS_DEADDIR(dir)) { - error = -ENOENT; + error = -BCH_ERR_ENOENT_directory_dead; goto err3; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 99082820e30b..ba7aff6b8a51 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -105,7 +105,7 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_fs_fatal_err_on(ret == -ENOENT, c, + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "inode %u:%llu not found when updating", inode_inum(inode).subvol, inode_inum(inode).inum); @@ -1261,14 +1261,14 @@ retry: goto err; if (k.k->type != KEY_TYPE_dirent) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); if (ret > 0) - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) goto err; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1b3ee66265c9..dcc55cbd3808 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -78,7 +78,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, snapshot, &s); if (!ret) *subvol = le32_to_cpu(s.subvol); - else if (ret == -ENOENT) + else if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot %u not fonud", snapshot); return ret; @@ -119,7 +119,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, goto err; if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -148,7 +148,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, ret = bkey_is_inode(k.k) ? bch2_inode_unpack(k, inode) - : -ENOENT; + : -BCH_ERR_ENOENT_inode; if (!ret) *snapshot = iter.pos.snapshot; err: @@ -333,7 +333,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, ret = __lookup_dirent(trans, root_hash_info, root_inum, &lostfound_str, &inum, &d_type); - if (ret == -ENOENT) { + if (bch2_err_matches(ret, ENOENT)) { bch_notice(c, "creating lost+found"); goto create_lostfound; } @@ -1088,7 +1088,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ret = bkey_err(d); if (ret) - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 
0 : ret; ret = dirent_points_to_inode(d, inode); bch2_trans_iter_exit(trans, &iter); @@ -1653,7 +1653,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (fsck_err_on(ret, c, @@ -1665,7 +1665,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (fsck_err_on(ret, c, @@ -1846,7 +1846,7 @@ static int check_root_trans(struct btree_trans *trans) int ret; ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { @@ -1873,7 +1873,7 @@ static int check_root_trans(struct btree_trans *trans) } ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (mustfix_fsck_err_on(ret, c, "root directory missing") || @@ -1972,15 +1972,15 @@ static int check_path(struct btree_trans *trans, PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, SPOS(inode->bi_dir, inode->bi_dir_offset, parent_snapshot))).k)); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) break; if (!ret && !dirent_points_to_inode(d, inode)) { bch2_trans_iter_exit(trans, &dirent_iter); - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } - if (ret == -ENOENT) { + if (bch2_err_matches(ret, ENOENT)) { if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", inode->bi_inum, snapshot, bch2_d_type_str(inode_d_type(inode)), diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index ddcd7b125f32..64e8d1f8a2fa 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -336,7 +336,7 @@ int bch2_inode_peek(struct btree_trans *trans, if (ret) return ret; - ret = bkey_is_inode(k.k) ? 0 : -ENOENT; + ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; if (ret) goto err; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2ec30a3fd193..fd629136824b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -427,7 +427,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, goto err; if (!k.k || !bkey_eq(k.k->p, pos)) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0d96346d5040..6750767276f2 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -236,7 +236,7 @@ err: darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) ret = 0; if (ret < 0 && !bch2_err_matches(ret, EROFS)) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 56ba82cae19d..d20ec9764108 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -900,7 +900,7 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ret = -ENOENT; found: mutex_unlock(&q->lock); - return ret; + return bch2_err_class(ret); } static int bch2_set_quota_trans(struct btree_trans *trans, @@ -960,7 +960,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); - return ret; + return bch2_err_class(ret); } const struct quotactl_ops bch2_quotactl_operations = { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index af76c029fb6a..e4983d144483 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1082,7 +1082,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) if (!bkey_is_inode(k.k)) { bch_err(trans->c, "root inode not found"); - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6178ae620ff1..ae21a8cca1b4 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -181,7 +181,7 @@ bch2_hash_lookup(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -ENOENT; + return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; } static __always_inline int @@ -288,7 +288,7 @@ found: not_found: if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 388fa12bbd8b..f07b3e2b3226 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -37,8 +37,12 @@ int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot_tree *s) { - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; } static struct bkey_i_snapshot_tree * @@ -284,6 +288,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_subvolume s; + bool found = false; int ret; for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, @@ -296,14 +301,14 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, continue; if 
(!BCH_SUBVOLUME_SNAP(s.v)) { *subvol_id = s.k->p.offset; - goto found; + found = true; + break; } } - ret = ret ?: -ENOENT; -found: + bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, ENOENT)) { + if (!ret && !found) { struct bkey_i_subvolume *s; *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); @@ -1217,7 +1222,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), BTREE_ITER_CACHED| BTREE_ITER_WITH_UPDATES); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume; if (likely(!ret)) *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); @@ -1444,7 +1449,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, BTREE_ITER_CACHED, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { - bch2_fs_inconsistent_on(ret == -ENOENT, c, + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "subvolume %u not found", src_subvolid); goto err; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index af6cc73d9356..8f0cbd7ada82 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1833,7 +1833,7 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) for_each_member_device_rcu(ca, c, i, NULL) if (!strcmp(name, ca->name)) goto found; - ca = ERR_PTR(-ENOENT); + ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); found: rcu_read_unlock(); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 448eb446946b..05c65d94c00f 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -163,7 +163,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info err2: bch2_trans_iter_exit(trans, &iter); err1: - return ret == -ENOENT ? -ENODATA : ret; + return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, @@ -229,7 +229,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, hash_info, inum, &search); } - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) ret = flags & XATTR_REPLACE ? -ENODATA : 0; return ret; @@ -589,7 +589,7 @@ err: opt_id == Opt_background_target)) bch2_rebalance_add_work(c, inode->v.i_blocks); - return ret; + return bch2_err_class(ret); } static const struct xattr_handler bch_xattr_bcachefs_handler = { -- cgit From ad520141b155786800261cc7e02ec02f0afe2643 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 May 2023 23:19:13 -0400 Subject: bcachefs: Fix corruption with writeable snapshots When partially overwriting an extent in an older snapshot, the existing extent has to be split. If the existing extent was overwritten in a different (sibling) snapshot, we have to ensure that the split won't be visible in the sibling snapshot. data_update.c already has code for this, bch2_insert_snapshot_writeouts() - we just need to move it into btree_update_leaf.c and change bch2_trans_update_extent() to use it as well. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 23 ++++++++++ fs/bcachefs/btree_update_leaf.c | 73 ++++++++++++++++++++++++++++++-- fs/bcachefs/data_update.c | 94 +++-------------------------------------- 3 files changed, 99 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 1ac3a81e0af6..e90cf292f80b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "journal.h" struct bch_fs; struct btree; @@ -83,6 +84,28 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, bool); +int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, + struct bpos, struct bpos); + +/* + * For use when splitting extents in existing snapshots: + * + * If @old_pos is an interior snapshot node, iterate over descendent snapshot + * nodes: for every descendent snapshot in whiche @old_pos is overwritten and + * not visible, emit a whiteout at @new_pos. + */ +static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id btree, + struct bpos old_pos, + struct bpos new_pos) +{ + if (!btree_type_has_snapshots(btree) || + bkey_eq(old_pos, new_pos)) + return 0; + + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 33693467810b..25d73db9adc6 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1343,6 +1343,69 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i 
*insert, @@ -1396,8 +1459,10 @@ int bch2_trans_update_extent(struct btree_trans *trans, bch2_cut_back(start, update); - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + k.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); if (ret) goto err; } @@ -1411,7 +1476,9 @@ int bch2_trans_update_extent(struct btree_trans *trans, bch2_cut_front(start, update); bch2_cut_back(insert->k.p, update); - ret = bch2_btree_insert_nonextent(trans, btree_id, update, + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + k.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); if (ret) goto err; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index ae7e60d6e583..c89ee14f8b6b 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -16,81 +16,6 @@ #include "subvolume.h" #include "trace.h" -static int insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, iter2; - struct bkey_s_c k, k2; - snapshot_id_list s; - struct bkey_i *update; - int ret; - - if (!btree_type_has_snapshots(id)) - return 0; - - darray_init(&s); - - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; - - bch2_trans_iter_init(trans, &iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(old_pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot) && - !snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) { - struct bpos whiteout_pos = new_pos; - - whiteout_pos.snapshot = k.k->p.snapshot; - - k2 = bch2_bkey_get_iter(trans, &iter2, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bkey_err(k2); - - if (!ret && k2.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = whiteout_pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &iter2, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } - bch2_trans_iter_exit(trans, &iter2); - - if (ret) - break; - - ret = snapshot_list_add(c, &s, k.k->p.snapshot); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - darray_exit(&s); - - return ret; -} - static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_finish_enabled()) { @@ -327,19 +252,12 @@ restart_drop_extra_replicas: next_pos = insert->k.p; - if (!bkey_eq(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, - bkey_start_pos(&insert->k)); - if (ret) - goto err; - } - - if (!bkey_eq(insert->k.p, k.k->p)) { - ret = insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p); - if (ret) - goto err; - } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p); + if (ret) + goto err; ret = bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -- cgit From 70d41c9e276c5e7b130d328a6ece92c9130a6572 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 00:35:35 -0400 Subject: bcachefs: Avoid __GFP_NOFAIL We've been using __GFP_NOFAIL for allocating struct bch_folio, our private per-folio state. However, that struct is variable size - it holds state for each sector in the folio, and folios can be quite large now, which means it's possible for bch_folio to be larger than PAGE_SIZE now. __GFP_NOFAIL allocations are undesirable in normal circumstances, but particularly so at >= PAGE_SIZE, and warnings are emitted for that. So, this patch adds proper error paths and eliminates most uses of __GFP_NOFAIL. Also, do some more cleanup of gfp flags w.r.t. btree node locks: we can use GFP_KERNEL, but only if we're not holding btree locks, and if we are holding btree locks we should be using GFP_NOWAIT. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 91 +++++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 64897cee8494..cf48f9a0d4e1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -574,7 +574,7 @@ static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) s = kzalloc(sizeof(*s) + sizeof(struct bch_folio_sector) * - folio_sectors(folio), GFP_NOFS|gfp); + folio_sectors(folio), gfp); if (!s) return NULL; @@ -601,7 +601,7 @@ static void __bch2_folio_set(struct folio *folio, unsigned pg_offset, unsigned pg_len, unsigned nr_ptrs, unsigned state) { - struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL); + struct bch_folio *s = bch2_folio(folio); unsigned i, sectors = folio_sectors(folio); BUG_ON(pg_offset >= sectors); @@ -630,11 +630,25 @@ static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bch_folio *s; u64 offset = folio_sector(folios[0]); - unsigned folio_idx = 0; + unsigned folio_idx; u32 snapshot; + bool need_set = false; int ret; + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -659,7 +673,7 @@ retry: BUG_ON(k.k->p.offset < folio_start); BUG_ON(bkey_start_offset(k.k) > folio_end); - if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) + if (!bch2_folio(folio)->uptodate) __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); if (k.k->p.offset < folio_end) @@ -1051,15 +1065,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { - if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - } - - if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; @@ -1139,7 +1146,7 @@ static int readpages_iter_init(struct readpages_iter *iter, darray_for_each(iter->folios, fi) { ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); folio_put(*fi); folio_put(*fi); } @@ -1171,11 +1178,15 @@ static bool extent_partial_reads_expensive(struct bkey_s_c 
k) return false; } -static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) { + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { struct folio *folio = readpage_iter_peek(iter); @@ -1197,12 +1208,12 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!folio) break; - if (!__bch2_folio_create(folio, 0)) { + if (!__bch2_folio_create(folio, GFP_KERNEL)) { folio_put(folio); break; } - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS); + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); if (ret) { __bch2_folio_release(folio); folio_put(folio); @@ -1216,6 +1227,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } + + return bch2_trans_relock(trans); } static void bchfs_read(struct btree_trans *trans, @@ -1283,9 +1296,12 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - if (readpages_iter) - readpage_bio_extend(readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -1594,7 +1610,7 @@ static int __bch2_writepage(struct folio *folio, folio_size(folio)); do_io: f_sectors = folio_sectors(folio); - s = bch2_folio_create(folio, __GFP_NOFAIL); + s = bch2_folio(folio); if (f_sectors > w->tmp_sectors) { kfree(w->tmp); @@ -1776,11 +1792,9 @@ readpage: if (ret) goto err; out: - if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - } + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); if (ret) { @@ -1916,19 +1930,16 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + if (ret) + goto out; + f_pos = pos; f_offset = pos - folio_pos(darray_first(folios)); darray_for_each(folios, fi) { struct folio *f = *fi; u64 f_len = min(end, folio_end_pos(f)) - f_pos; - if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) { - ret = bch2_folio_set(c, inode_inum(inode), fi, - folios.data + folios.nr - fi); - if (ret) - goto out; - } - /* * XXX: per POSIX and fstests generic/275, on -ENOSPC we're * supposed to write as much as we have disk space for. 
@@ -2884,11 +2895,9 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, goto unlock; } - if (!s->uptodate) { - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto unlock; - } + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; -- cgit From e1d29c5fa1205c75dd96303f76d7aa4991555aaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 02:35:34 -0400 Subject: bcachefs: Ensure bch2_btree_node_get() calls relock() after unlock() Fix a bug where bch2_btree_node_get() might call bch2_trans_unlock() (in fill) without calling bch2_trans_relock(); this is a bug when it's done in the core btree code. Also, tweak bch2_btree_node_mem_alloc() to drop btree locks before doing a blocking memory allocation. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 58ef9e7b4bdf..681a47b70a65 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -587,9 +587,10 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea goto got_node; } - b = __btree_node_mem_alloc(c, __GFP_NOWARN); + b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); if (!b) { mutex_unlock(&bc->lock); + bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; @@ -618,8 +619,11 @@ got_node: mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + bch2_trans_unlock(trans); + if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) + goto err; + } mutex_lock(&bc->lock); bc->used++; @@ -812,6 +816,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + bool need_relock = false; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -825,6 +830,7 @@ retry: */ b = bch2_btree_node_fill(trans, path, k, path->btree_id, level, lock_type, true); + need_relock = true; /* We raced and found the btree node in the cache */ if (!b) @@ -863,6 +869,7 @@ retry: six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); + need_relock = true; bch2_btree_node_wait_on_read(b); @@ -870,19 +877,19 @@ retry: * should_be_locked is not set on this path yet, so we need to * relock it specifically: */ - if (trans) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } - if (!six_relock_type(&b->c.lock, lock_type, seq)) goto retry; } + if (unlikely(need_relock)) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(ret); + } + } + prefetch(b->aux_data); for_each_bset(b, t) { -- cgit From 19c304bebda4d8815a20c8d3330459a112c329f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 18:02:38 -0400 Subject: bcachefs: GFP_NOIO -> GFP_NOFS GFP_NOIO dates from the bcache days, when we operated under the block layer. Now, GFP_NOFS is more appropriate, so switch all GFP_NOIO uses to GFP_NOFS.
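For readers less familiar with the gfp flags: GFP_NOIO forbids the allocator from starting any I/O at all during reclaim, which mattered when bcache sat underneath the block layer, while GFP_NOFS only forbids recursing back into filesystem code - the constraint that actually applies here, and a less restrictive one. The sketch below shows the mechanical change plus the related scoped API; it is illustrative only, the scoped helpers are shown as context and are not used by this patch:

	#include <linux/gfp.h>
	#include <linux/sched/mm.h>
	#include <linux/slab.h>

	/* hypothetical call site: the change is just the gfp mask */
	static void *fs_alloc(size_t size)
	{
		return kmalloc(size, GFP_NOFS);		/* was: GFP_NOIO */
	}

	/* the scoped API makes GFP_KERNEL allocations behave as GFP_NOFS
	 * for a whole section of code */
	static void *fs_alloc_scoped(size_t size)
	{
		unsigned int flags = memalloc_nofs_save();
		void *p = kmalloc(size, GFP_KERNEL);

		memalloc_nofs_restore(flags);
		return p;
	}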
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 8 ++++---- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/compress.c | 12 ++++++------ fs/bcachefs/debug.c | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/io.c | 20 ++++++++++---------- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/keylist.c | 2 +- 10 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0a7a18eca397..27a2a7b31f37 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -117,7 +117,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); } memalloc_nofs_restore(flags); return p; @@ -937,7 +937,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, /* We might get called multiple times on read retry: */ b->written = 0; - iter = mempool_alloc(&c->fill_iter, GFP_NOIO); + iter = mempool_alloc(&c->fill_iter, GFP_NOFS); sort_iter_init(iter, b); iter->size = (btree_blocks(c) + 1) * 2; @@ -1580,7 +1580,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_bytes(c)), REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; @@ -2077,7 +2077,7 @@ do_write: wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), REQ_OP_WRITE|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio), struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1319337c5382..db0d09b59f2f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1092,7 +1092,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } } - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index adf3bd0e4a8f..405c5323f247 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -433,12 +433,12 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); + d = krealloc(d, alloc_size, GFP_NOFS|__GFP_ZERO); BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d) { - d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); + d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOFS); memset(d, 0, REPLICAS_DELTA_LIST_MAX); if (trans->fs_usage_deltas) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6bec38440249..38a3475b1897 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -28,11 +28,11 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) BUG_ON(size > c->opts.encoded_extent_max); - b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); if (b) return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); if (b) return 
(struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; @@ -94,7 +94,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); pages = nr_pages > ARRAY_SIZE(stack_pages) - ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) + ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) : stack_pages; if (!pages) goto bounce; @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); @@ -196,7 +196,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, @@ -382,7 +382,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); + workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d1563caf7fb7..8981acc15098 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -47,7 +47,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_sorted, btree_bytes(c)), REQ_OP_READ|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_sorted, btree_bytes(c)); @@ -211,7 +211,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_ondisk, btree_bytes(c)), REQ_OP_READ|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_ondisk, btree_bytes(c)); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 439fa540323f..1c35fa1fedd3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -485,7 +485,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) BUG_ON(!rbio->pick.has_ec); - buf = kzalloc(sizeof(*buf), GFP_NOIO); + buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) return -BCH_ERR_ENOMEM_ec_read_extent; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 11ed86453d66..0f8d52912c2d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -163,7 +163,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) struct page *page; if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOIO); + page = alloc_page(GFP_NOFS); if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; @@ -172,7 +172,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) } } else { pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); } return page; @@ -660,7 +660,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOIO, &ca->replica_set)); + GFP_NOFS, 
&ca->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -976,7 +976,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, pages = min(pages, BIO_MAX_VECS); bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOIO, &c->bio_write); + GFP_NOFS, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ @@ -1314,7 +1314,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, BUG_ON(total_output != total_input); dst = bio_split(src, total_input >> 9, - GFP_NOIO, &c->bio_write); + GFP_NOFS, &c->bio_write); wbio_init(dst)->put_bio = true; /* copy WRITE_SYNC flag */ dst->bi_opf = src->bi_opf; @@ -2013,7 +2013,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); if (!op) goto err; @@ -2026,7 +2026,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, */ *rbio = kzalloc(sizeof(struct bch_read_bio) + sizeof(struct bio_vec) * pages, - GFP_NOIO); + GFP_NOFS); if (!*rbio) goto err; @@ -2034,7 +2034,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOIO)) + GFP_NOFS)) goto err; (*rbio)->bounce = true; @@ -2746,7 +2746,7 @@ get_bio: rbio = rbio_init(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, - GFP_NOIO, + GFP_NOFS, &c->bio_read_split), orig->opts); @@ -2762,7 +2762,7 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b455ef041dfe..8dc378674919 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1438,7 +1438,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; - new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 29d843e6d6d4..2c7f8aca9319 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -271,7 +271,7 @@ void bch2_journal_do_discards(struct journal *j) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOIO); + ca->mi.bucket_size, GFP_NOFS); spin_lock(&j->lock); ja->discard_idx = (ja->discard_idx + 1) % ja->nr; diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index cf5998e519e7..5699cd4873c8 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -18,7 +18,7 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, (old_buf && roundup_pow_of_two(oldsize) == newsize)) return 0; - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); if (!new_keys) return -ENOMEM; -- cgit From b5fd75669ab1283e7a9caf6288c425108bd382b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 18:06:27 -0400 Subject: bcachefs: drop_locks_do() Add a new 
helper for the common pattern of: - trans_unlock() - do something - trans_relock() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 +++------ fs/bcachefs/btree_iter.h | 5 +++++ fs/bcachefs/btree_locking.c | 5 +---- fs/bcachefs/btree_update_interior.c | 8 ++------ fs/bcachefs/btree_update_leaf.c | 25 ++++--------------------- 5 files changed, 15 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4830d203b37b..1cc53b37f78e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -41,13 +41,10 @@ static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_pa */ static inline int bch2_trans_cond_resched(struct btree_trans *trans) { - if (need_resched() || race_fault()) { - bch2_trans_unlock(trans); - schedule(); - return bch2_trans_relock(trans); - } else { + if (need_resched() || race_fault()) + return drop_locks_do(trans, (schedule(), 0)); + else return 0; - } } static inline int __btree_path_cmp(const struct btree_path *l, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 5e5e2a5c715c..9676aa335b89 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -854,6 +854,11 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define drop_locks_do(_trans, _do) \ +({ \ + bch2_trans_unlock(_trans); \ + _do ?: bch2_trans_relock(_trans); \ +}) /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 1f4eca898ab7..23a6d63223af 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -736,11 +736,8 @@ bool bch2_trans_locked(struct btree_trans *trans) int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { - int ret; + int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); - bch2_trans_unlock(trans); - mutex_lock(lock); - ret = bch2_trans_relock(trans); if (ret) mutex_unlock(lock); return ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index db0d09b59f2f..4d6c286edb04 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1083,9 +1083,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); else if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); return ERR_PTR(ret); @@ -2256,9 +2254,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite if (btree_ptr_hash_val(new_key) != b->hash_val) { ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) return ret; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 25d73db9adc6..ea7e32e7d2fd 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -316,25 +316,11 @@ static noinline int bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { - struct bch_fs *c = trans->c; - int ret; - - bch2_trans_unlock(trans); - - ret = 
bch2_journal_preres_get(&c->journal, + return drop_locks_do(trans, + bch2_journal_preres_get(&trans->c->journal, &trans->journal_preres, trans->journal_preres_u64s, - (flags & JOURNAL_WATERMARK_MASK)); - if (ret) - return ret; - - ret = bch2_trans_relock(trans); - if (ret) { - trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0); - return ret; - } - - return 0; + (flags & JOURNAL_WATERMARK_MASK))); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, @@ -1053,10 +1039,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) test_bit(BCH_FS_STARTED, &c->flags)) return -BCH_ERR_erofs_trans_commit; - bch2_trans_unlock(trans); - - ret = bch2_fs_read_write_early(c) ?: - bch2_trans_relock(trans); + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); if (ret) return ret; -- cgit From 78367aaa5af322b64d44a3a3354f4c75c499fef0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 01:09:50 -0400 Subject: bcachefs: bch2_trans_kmalloc no longer allocates memory with btree locks held When allocating memory, gfp flags should generally be - GFP_NOWAIT|__GFP_NOWARN if btree locks are held - GFP_NOFS if in the IO path or otherwise holding resources needed for IO submission - GFP_KERNEL otherwise Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1cc53b37f78e..9072819176de 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2790,6 +2790,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) unsigned new_top = trans->mem_top + size; size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(new_top); + int ret; void *new_mem; void *p; @@ -2797,15 +2798,27 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - kfree(trans->mem); - } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } + + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } trans->mem = new_mem; trans->mem_bytes = new_bytes; -- cgit From 5718fda0b5ef777ef56edbe53ef6a830b845c0fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 00:59:26 -0400 Subject: bcachefs: fs-io: Eliminate GFP_NOFS usage GFP_NOFS doesn't ever make sense. If we're allocatingc memory it should be GFP_NOWAIT if btree locks are held, GFP_KERNEL otherwise. 
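As a rough sketch of that policy (the helper below is hypothetical, not bcachefs API): try the allocation non-blocking while btree locks are held, and only fall back to a sleeping GFP_KERNEL allocation after dropping the transaction's locks, relocking afterwards:

/* Hypothetical helper illustrating the gfp policy above; not part of this patch. */
static void *example_trans_alloc(struct btree_trans *trans, size_t size, int *ret)
{
	void *p;

	*ret = 0;

	/* btree locks held: non-blocking attempt only, no warning on failure */
	p = kmalloc(size, GFP_NOWAIT|__GFP_NOWARN);
	if (p)
		return p;

	/* drop locks before a sleeping allocation, then take them back */
	bch2_trans_unlock(trans);

	p = kmalloc(size, GFP_KERNEL);
	if (!p) {
		*ret = -ENOMEM;
		return NULL;
	}

	*ret = bch2_trans_relock(trans);
	return p;
}

Later commits in this series package the same shape up as allocate_dropping_locks() and allocate_dropping_locks_errcode().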
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index cf48f9a0d4e1..c864c271b7c2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1368,7 +1368,7 @@ void bch2_readahead(struct readahead_control *ractl) BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_NOFS, &c->bio_read), + GFP_KERNEL, &c->bio_read), opts); readpage_iter_advance(&readpages_iter); @@ -1379,6 +1379,7 @@ void bch2_readahead(struct readahead_control *ractl) bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); + bch2_trans_unlock(&trans); } bch2_pagecache_add_put(inode); @@ -1420,7 +1421,7 @@ static int bch2_read_single_folio(struct folio *folio, bch2_inode_opts_get(&opts, c, &inode->ei_inode); - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), opts); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_folio_end_io; @@ -1555,7 +1556,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, REQ_OP_WRITE, - GFP_NOFS, + GFP_KERNEL, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); -- cgit From 4c4a8f20d1767b2ed927d25ccc363de72d48d28f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 May 2023 02:26:04 -0400 Subject: bcachefs: Fix error handling in promote path The promote path had a BUG_ON() for unknown error type, which we're now seeing: change it to a WARN_ON() - because we're curious what this is - and otherwise handle it in the normal error path. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f8d52912c2d..199a8348355a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2057,14 +2057,16 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); - if (ret == -BCH_ERR_nocow_lock_blocked) { + if (ret) { + WARN_ONCE(ret != -BCH_ERR_nocow_lock_blocked, + "%s: saw unknown error %s\n", __func__, bch2_err_str(ret)); + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); goto err; } - BUG_ON(ret); op->write.op.end_io = promote_done; return op; -- cgit From 3ebfc8fe95c5ec560d2d5c7e7bef62ebaa33a9c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 May 2023 16:27:11 -0400 Subject: bcachefs: Use unlikely() in bch2_err_matches() Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index acf9b92f9ab0..12c0c44eb6b0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -230,7 +230,7 @@ static inline bool _bch2_err_matches(int err, int class) #define bch2_err_matches(_err, _class) \ ({ \ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ - _bch2_err_matches(_err, _class); \ + unlikely(_bch2_err_matches(_err, _class)); \ }) int __bch2_err_class(int); -- cgit From d95dd378c207ddec7551cce2e047e6067c3c27ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 03:44:38 -0400 Subject: bcachefs: allocate_dropping_locks() Add two new helpers for allocating memory with btree locks held: The idea is to first 
try the allocation with GFP_NOWAIT|__GFP_NOWARN, then if that fails - unlock, retry with GFP_KERNEL, and then call trans_relock(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 26 ++++++++++++++++++++++++++ fs/bcachefs/btree_key_cache.c | 13 +++---------- fs/bcachefs/ec.c | 11 ++--------- 3 files changed, 31 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9676aa335b89..d2af3f38e6f5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -859,6 +859,32 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, bch2_trans_unlock(_trans); \ _do ?: bch2_trans_relock(_trans); \ }) + +#define allocate_dropping_locks_errcode(_trans, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + int _ret = _do; \ + \ + if (bch2_err_matches(_ret, ENOMEM)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, _do); \ + } \ + _ret; \ +}) + +#define allocate_dropping_locks(_trans, _ret, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + \ + _ret = 0; \ + if (unlikely(!_p)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + } \ + _p; \ +}) + /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 37977b774d61..37beb75e2571 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -264,15 +264,8 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ck; } - ck = kmem_cache_zalloc(bch2_key_cache, GFP_NOWAIT|__GFP_NOWARN); - if (likely(ck)) - goto init; - - bch2_trans_unlock(trans); - - ck = kmem_cache_zalloc(bch2_key_cache, GFP_KERNEL); - - ret = bch2_trans_relock(trans); + ck = allocate_dropping_locks(trans, ret, + kmem_cache_zalloc(bch2_key_cache, _gfp)); if (ret) { kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); @@ -280,7 +273,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!ck) return NULL; -init: + INIT_LIST_HEAD(&ck->list); bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1c35fa1fedd3..dfc0a61afa51 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -578,15 +578,8 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { - size_t idx = iter->pos.offset; - - if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) - return 0; - - bch2_trans_unlock(trans); - - return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: - bch2_trans_relock(trans); + return allocate_dropping_locks_errcode(trans, + __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); } /* -- cgit From 5ff10c0a04c4217b24997d9b127c50602d717ab3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 03:44:38 -0400 Subject: bcachefs: Convert acl.c to allocate_dropping_locks() More work to avoid allocating memory with btree locks held. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3fe108bc2f08..ce7a460fb308 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -35,12 +35,14 @@ static inline int acl_to_xattr_type(int type) /* * Convert from filesystem to in-memory representation. 
*/ -static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) +static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + const void *value, size_t size) { const void *p, *end = value + size; struct posix_acl *acl; struct posix_acl_entry *out; unsigned count = 0; + int ret; if (!value) return NULL; @@ -81,9 +83,14 @@ static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) if (!count) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = allocate_dropping_locks(trans, ret, + posix_acl_alloc(count, _gfp)); if (!acl) return ERR_PTR(-ENOMEM); + if (ret) { + kfree(acl); + return ERR_PTR(ret); + } out = acl->a_entries; @@ -234,8 +241,6 @@ retry: &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!bch2_err_matches(ret, ENOENT)) acl = ERR_PTR(ret); goto out; @@ -249,12 +254,15 @@ retry: } xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(xattr_val(xattr.v), + acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); out: + if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return acl; @@ -375,13 +383,14 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, if (ret) goto err; - acl = bch2_acl_from_disk(xattr_val(xattr.v), + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); if (IS_ERR_OR_NULL(acl)) goto err; - ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; -- cgit From 21da6101bd947a08104dbf7d7cb22b40be7730e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 May 2023 19:23:35 -0400 Subject: bcachefs: replicas_deltas_realloc() uses allocate_dropping_locks() Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 81 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 405c5323f247..fbe0cd0a7de3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -140,7 +140,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) unsigned nr_replicas = READ_ONCE(c->replicas.nr); unsigned seq, i; retry: - ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_NOFS); + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); if (unlikely(!ret)) return NULL; @@ -423,8 +423,8 @@ static inline int update_cached_sectors(struct bch_fs *c, return update_replicas(c, k, &r.e, sectors, journal_seq, gc); } -static struct replicas_delta_list * -replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, + gfp_t gfp) { struct replicas_delta_list *d = trans->fs_usage_deltas; unsigned new_size = d ? 
(d->size + more) * 2 : 128; @@ -433,12 +433,16 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, GFP_NOFS|__GFP_ZERO); + d = krealloc(d, alloc_size, gfp|__GFP_ZERO); - BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); + if (unlikely(!d)) { + if (alloc_size > REPLICAS_DELTA_LIST_MAX) + return -ENOMEM; + + d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); + if (!d) + return -ENOMEM; - if (!d) { - d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOFS); memset(d, 0, REPLICAS_DELTA_LIST_MAX); if (trans->fs_usage_deltas) @@ -452,39 +456,51 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) d->size = new_size; trans->fs_usage_deltas = d; } - return d; + + return 0; +} + +static int replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + return allocate_dropping_locks_errcode(trans, + __replicas_deltas_realloc(trans, more, _gfp)); } -static inline void update_replicas_list(struct btree_trans *trans, +static inline int update_replicas_list(struct btree_trans *trans, struct bch_replicas_entry *r, s64 sectors) { struct replicas_delta_list *d; struct replicas_delta *n; unsigned b; + int ret; if (!sectors) - return; + return 0; b = replicas_entry_bytes(r) + 8; - d = replicas_deltas_realloc(trans, b); + ret = replicas_deltas_realloc(trans, b); + if (ret) + return ret; + d = trans->fs_usage_deltas; n = (void *) d->d + d->used; n->delta = sectors; memcpy((void *) n + offsetof(struct replicas_delta, r), r, replicas_entry_bytes(r)); bch2_replicas_entry_sort(&n->r); d->used += b; + return 0; } -static inline void update_cached_sectors_list(struct btree_trans *trans, +static inline int update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - update_replicas_list(trans, &r.e, sectors); + return update_replicas_list(trans, &r.e, sectors); } int bch2_mark_alloc(struct btree_trans *trans, @@ -1475,7 +1491,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; - update_replicas_list(trans, &r.e, sectors); + ret = update_replicas_list(trans, &r.e, sectors); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1502,7 +1518,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans, : k.k->size; s64 dirty_sectors = 0; bool stale; - int ret; + int ret = 0; r.e.data_type = data_type; r.e.nr_devs = 0; @@ -1521,9 +1537,12 @@ int bch2_trans_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (!stale) - update_cached_sectors_list(trans, p.ptr.dev, - disk_sectors); + if (!stale) { + ret = update_cached_sectors_list(trans, p.ptr.dev, + disk_sectors); + if (ret) + return ret; + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -1538,9 +1557,9 @@ int bch2_trans_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) - update_replicas_list(trans, &r.e, dirty_sectors); + ret = update_replicas_list(trans, &r.e, dirty_sectors); - return 0; + return ret; } static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, @@ -1657,14 +1676,18 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, s64 sectors = le16_to_cpu(new_s->sectors); bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); - update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + ret = 
update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; } if (old_s) { s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; } for (i = 0; i < nr_blocks; i++) { @@ -1701,8 +1724,12 @@ int bch2_trans_mark_inode(struct btree_trans *trans, int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); if (nr) { - struct replicas_delta_list *d = - replicas_deltas_realloc(trans, 0); + int ret = replicas_deltas_realloc(trans, 0); + struct replicas_delta_list *d = trans->fs_usage_deltas; + + if (ret) + return ret; + d->nr_inodes += nr; } @@ -1721,13 +1748,17 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; + int ret; if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; - d = replicas_deltas_realloc(trans, 0); + ret = replicas_deltas_realloc(trans, 0); + if (ret) + return ret; + d = trans->fs_usage_deltas; replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(d->persistent_reserved)); -- cgit From 4f2c166ebe5baef20e28866b27dd97b0caa585f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Jun 2023 18:08:56 -0400 Subject: bcachefs: Fix bch2_fsck_ask_yn() - getline() output includes a newline, without stripping that we were just looping - Make the prompt clearer Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 545c55dabc27..b08cd23dee00 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -85,12 +85,13 @@ enum ask_yn bch2_fsck_ask_yn(void) bool ret; while (true) { - fputs(" (y,n,Y,N) ", stdout); + fputs(" (y,n, or Y,N for all errors of this type) ", stdout); fflush(stdout); if (getline(&buf, &buflen, stdin) < 0) die("error reading from standard input"); + strim(buf); if (strlen(buf) != 1) continue; -- cgit From bb125baf512bffef19c510f1c53353a378537070 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Jun 2023 19:40:35 -0400 Subject: bcachefs: Delete warning from promote_alloc() It's possible to see a -BCH_ERR_ENOSPC_disk_reservation here, and that's fine. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 199a8348355a..25a9f657910c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2057,10 +2057,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ if (ret) { - WARN_ONCE(ret != -BCH_ERR_nocow_lock_blocked, - "%s: saw unknown error %s\n", __func__, bch2_err_str(ret)); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); -- cgit From 49c7cd9d8d5545cf45ef91f548d25b6dd23c72c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 May 2023 04:59:30 -0400 Subject: bcachefs: More drop_locks_do() conversions Using drop_locks_do() ensures that every unlock() is paired with a relock(), with proper error checking. 
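For reference, with the definition added earlier in this series (unlock, evaluate the expression, relock only if the expression returned 0), a typical conversion looks like this sketch, where the surrounding function is invented for illustration:

/* Usage sketch; the function and its arguments are hypothetical. */
static int example_wait_on_closure(struct btree_trans *trans, struct closure *cl)
{
	/*
	 * Open-coded form this series keeps removing:
	 *
	 *	bch2_trans_unlock(trans);
	 *	closure_sync(cl);
	 *	return bch2_trans_relock(trans);
	 */
	return drop_locks_do(trans, (closure_sync(cl), 0));
}

A nonzero error from the expression is returned as-is without relocking, so the caller sees a single error path instead of handling the unlock, the operation, and the relock separately.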
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +--- fs/bcachefs/btree_update_interior.c | 12 ++++------- fs/bcachefs/btree_update_leaf.c | 42 ++++++++++++------------------------- fs/bcachefs/trace.h | 19 ----------------- 4 files changed, 18 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9072819176de..e8fec59dac02 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2892,9 +2892,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { - bch2_trans_unlock(trans); - cond_resched(); - bch2_trans_relock(trans); + drop_locks_do(trans, (cond_resched(), 0)); now = local_clock(); } trans->last_begin_time = now; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4d6c286edb04..66da1da2f075 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1126,23 +1126,19 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret) { - bch2_trans_unlock(trans); - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + ret = drop_locks_do(trans, + bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags); - if (ret) { + journal_flags)); + if (ret == -BCH_ERR_journal_preres_get_blocked) { trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - goto err; } - - ret = bch2_trans_relock(trans); if (ret) goto err; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ea7e32e7d2fd..b42b83c55c5b 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -947,34 +947,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; case -BCH_ERR_btree_insert_need_mark_replicas: - bch2_trans_unlock(trans); - - ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); - if (ret) - break; - - ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); break; case -BCH_ERR_journal_res_get_blocked: - bch2_trans_unlock(trans); - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && !(flags & JOURNAL_WATERMARK_reserved)) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = bch2_trans_journal_res_get(trans, + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, (flags & JOURNAL_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK); - if (ret) - break; - - ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); + JOURNAL_RES_GET_CHECK)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); @@ -987,8 +973,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, break; ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; case -BCH_ERR_btree_insert_need_flush_buffer: { struct btree_write_buffer *wb = 
&c->btree_write_buffer; @@ -996,20 +980,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = 0; if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_reset_updates(trans); bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - if (wb->state.nr > wb->size * 3 / 4) + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); ret = __bch2_btree_write_buffer_flush(trans, flags|BTREE_INSERT_NOCHECK_RW, true); - else + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { mutex_unlock(&wb->flush_lock); - - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + ret = bch2_trans_relock(trans); } } break; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index cfb1779d712a..a743ab477966 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -905,13 +905,6 @@ DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - - TRACE_EVENT(trans_restart_journal_preres_get, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -935,12 +928,6 @@ TRACE_EVENT(trans_restart_journal_preres_get, __entry->flags) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ -953,12 +940,6 @@ DEFINE_EVENT(transaction_event, trans_traverse_all, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), -- cgit From bc166d711d4eeff64c101fd4c57bfaaa3a4e7e9e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jun 2023 01:15:33 -0400 Subject: bcachefs: Improve backpointers error message the error message here dated from when backpointers could be stored in alloc keys; now, we should always print the full key. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e9ae623cf4a8..11201064d9a4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -805,7 +805,7 @@ static int check_one_backpointer(struct btree_trans *trans, if (fsck_err_on(!k.k, c, "backpointer for missing extent\n %s", - (bch2_backpointer_k_to_text(&buf, c, bp.s_c), buf.buf))) + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); out: fsck_err: -- cgit From 28551613b725c28552210121499f4e2f6c6a9054 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Jun 2023 01:16:00 -0400 Subject: bcachefs: Clean up tests code - delete redundant error messages - convert various code to bch2_trans_run Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 77 +++++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index d352821d5614..35df3f940542 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -593,10 +593,8 @@ static int rand_insert(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -629,10 +627,8 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -656,10 +652,8 @@ static int rand_lookup(struct bch_fs *c, u64 nr) lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); @@ -709,10 +703,8 @@ static int rand_mixed(struct bch_fs *c, u64 nr) rand = test_rand(); ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); - if (ret) { - bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); @@ -728,7 +720,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -755,10 +747,8 @@ static int rand_delete(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -767,90 +757,59 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie insert; - int ret = 0; bkey_cookie_init(&insert.k_i); - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, SPOS(0, 
0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, - NULL, NULL, 0, - ({ + NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; insert.k.p = iter.pos; bch2_trans_update(&trans, &iter, &insert.k_i, 0); - })); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + }))); } static int seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + return bch2_trans_run(c, + for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, - 0); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + 0)); } static int seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, - NULL, NULL, 0, - ({ + NULL, NULL, 0, ({ struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); bch2_trans_update(&trans, &iter, &u.k_i, 0); - })); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + }))); } static int seq_delete(struct bch_fs *c, u64 nr) { - int ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, + return bch2_btree_delete_range(c, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, NULL); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - return ret; } typedef int (*perf_test_fn)(struct bch_fs *, u64); -- cgit From fec4fc82b531beb2cc67b734140ffe776af33f7c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 30 May 2023 14:41:50 -0400 Subject: bcachefs: create internal disk_groups sysfs file We have bch2_sb_disk_groups_to_text() to dump disk group labels, but no good information on device group membership at runtime. Add bch2_disk_groups_to_text() and an associated 'disk_groups' sysfs file to print group and device relationships. 
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 34 ++++++++++++++++++++++++++++++++++ fs/bcachefs/disk_groups.h | 2 ++ fs/bcachefs/sysfs.c | 6 ++++++ 3 files changed, 42 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1a8f8b3750da..aa3a4e5a8b2e 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -87,6 +87,40 @@ err: return ret; } +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_disk_groups_cpu *g; + struct bch_dev *ca; + int i; + unsigned iter; + + out->atomic++; + rcu_read_lock(); + + g = rcu_dereference(c->disk_groups); + if (!g) + goto out; + + for (i = 0; i < g->nr; i++) { + if (i) + prt_printf(out, " "); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + continue; + } + + prt_printf(out, "[parent %d devs", g->entries[i].parent); + for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); + prt_printf(out, "]"); + } + +out: + rcu_read_unlock(); + out->atomic--; +} + static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index e4470c357a66..bf39db3868bf 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -88,4 +88,6 @@ int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, struct bch_sb_field *); +void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b2274cb35172..77f92d537af6 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -223,6 +223,7 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) #endif read_attribute(internal_uuid); +read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); @@ -471,6 +472,9 @@ SHOW(bch2_fs) if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); + if (attr == &sysfs_disk_groups) + bch2_disk_groups_to_text(out, c); + return 0; } @@ -681,6 +685,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_moving_ctxts, &sysfs_internal_uuid, + + &sysfs_disk_groups, NULL }; -- cgit From a1dd428b8bb78a03f210e18b05b0d73cac86fb7d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 30 May 2023 14:48:58 -0400 Subject: bcachefs: push rcu lock down into bch2_target_to_mask() We have one caller that cycles the rcu lock solely for this call (via target_rw_devs()), and we'd like to add another. Simplify things by pushing the rcu lock down into bch2_target_to_mask(), similar to how bch2_dev_in_target() works. 
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 -- fs/bcachefs/disk_groups.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ec77601ebd0c..a7e6852271d2 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -934,9 +934,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, unsigned i; int ret; - rcu_read_lock(); devs = target_rw_devs(c, wp->data_type, target); - rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index aa3a4e5a8b2e..52b640077970 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -208,26 +208,36 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); + struct bch_devs_mask *devs; + + rcu_read_lock(); switch (t.type) { case TARGET_NULL: - return NULL; + devs = NULL; + break; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - return ca ? &ca->self : NULL; + devs = ca ? &ca->self : NULL; + break; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - return g && t.group < g->nr && !g->entries[t.group].deleted + devs = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; + break; } default: BUG(); } + + rcu_read_unlock(); + + return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -- cgit From 5bc740820e7ae01b26a4dbb612df086f41f79785 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 30 May 2023 14:51:12 -0400 Subject: bcachefs: don't spin in rebalance when background target is not usable If a bcachefs filesystem is configured with a background device (disk group), rebalance will relocate data to this device in the background by checking extent keys for whether they currently reside in the specified target. For keys that do not, rebalance performs a read/write cycle to allow the write path to properly relocate data. If the background target is not usable (read-only, for example), however, the write path doesn't actually move data to another device. Instead, rebalance spins indefinitely reading and rewriting the same data over and over to the same device. If the background target is made available again, the rebalance picks this up, relocates the data, and eventually terminates. To avoid this spinning behavior, update the rebalance background target logic to not only check whether the extent is not in the target, but whether the target is actually usable as well. If not, then don't mark the key for rewrite. 
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.h | 8 ++++++++ fs/bcachefs/rebalance.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index bf39db3868bf..ec12584ceee7 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -68,6 +68,14 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, return devs; } +static inline bool bch2_target_accepts_data(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); + return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 66c40999163d..989f37a3b46a 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -57,7 +57,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, i = 0; bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && + bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) data_opts->rewrite_ptrs |= 1U << i; i++; } -- cgit From b0e8c75e40a863dd40ecdf8fd6f8cdceacb965e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Jun 2023 15:41:41 -0400 Subject: bcachefs: Fix subvol deletion deadlock d_prune_aliases() may call bch2_evict_inode(), which needs c->vfs_inodes_list_lock. Fix this by always calling igrab() before putting the inodes onto our disposal list, and then calling d_prune_aliases() with c->vfs_inodes_lock dropped. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ba7aff6b8a51..f417889eba08 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1481,22 +1481,14 @@ again: continue; if (!(inode->v.i_state & I_DONTCACHE) && - !(inode->v.i_state & I_FREEING)) { + !(inode->v.i_state & I_FREEING) && + igrab(&inode->v)) { this_pass_clean = false; - d_mark_dontcache(&inode->v); - d_prune_aliases(&inode->v); - - /* - * If i_count was zero, we have to take and release a - * ref in order for I_DONTCACHE to be noticed and the - * inode to be dropped; - */ - - if (!atomic_read(&inode->v.i_count) && - igrab(&inode->v) && - darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) + if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { + iput(&inode->v); break; + } } else if (clean_pass && this_pass_clean) { wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); @@ -1511,8 +1503,12 @@ again: } mutex_unlock(&c->vfs_inodes_lock); - darray_for_each(grabbed, i) - iput(&(*i)->v); + darray_for_each(grabbed, i) { + inode = *i; + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + iput(&inode->v); + } grabbed.nr = 0; if (!clean_pass || !this_pass_clean) { -- cgit From 954ed17e029fbf810826739aa190cd559b6e4036 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 30 May 2023 08:15:41 -0400 Subject: bcachefs: fix NULL pointer dereference in try_alloc_bucket On Mon, 29 May 2023, Mikulas Patocka wrote: > The oops happens in set_btree_iter_dontneed and it is caused by the fact > that iter->path is NULL. 
The code in try_alloc_bucket is buggy because it > sets "struct btree_iter iter = { NULL };" and then jumps to the "err" > label that tries to dereference values in "iter". Here I'm sending a patch for it. From: Mikulas Patocka The function try_alloc_bucket sets the variable "iter" to NULL and then (on various error conditions) jumps to the label "err". On the "err" label, it calls "set_btree_iter_dontneed" that tries to dereference "iter->trans" and "iter->path". So, we get an oops on error condition. This patch fixes the crash by testing that iter.trans and iter.path is non-zero before calling set_btree_iter_dontneed. Signed-off-by: Mikulas Patocka Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a7e6852271d2..95829bbfe033 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -371,7 +371,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (!ob) iter.path->preserve = false; err: - set_btree_iter_dontneed(&iter); + if (iter.trans && iter.path) + set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; -- cgit From 99a3d39893615ac107a4f82e86d4c26792131b91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Jun 2023 01:37:16 -0400 Subject: bcachefs: ec: Fix a lost wakeup Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dfc0a61afa51..b7e3889b114b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1146,6 +1146,7 @@ err: mutex_lock(&c->ec_stripe_new_lock); list_del(&s->list); mutex_unlock(&c->ec_stripe_new_lock); + wake_up(&c->ec_stripe_new_wait); ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); -- cgit From 7724664f0ee4520f1c7fd3c0cc26223ba532986f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Jun 2023 19:21:16 -0400 Subject: bcachefs: New assertions when marking filesystem clean Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8f0cbd7ada82..9f1cca7d6c8e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -302,6 +302,11 @@ void bch2_fs_read_only(struct bch_fs *c) test_bit(BCH_FS_STARTED, &c->flags) && test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && !c->opts.norecovery) { + BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); + BUG_ON(c->btree_write_buffer.state.nr); + bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } -- cgit From 8e5b1115f1dd88125cbb06c344ba1f4214265042 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Jun 2023 19:45:21 -0400 Subject: bcachefs: Write buffer flush needs BTREE_INSERT_NOCHECK_RW btree write buffer flush is only invoked from contexts that already hold a write ref, and checking if we're still RW could cause us to fail to completely flush the write buffer when shutting down. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 9983a47853b9..88c4b50dd70f 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -78,6 +78,7 @@ trans_commit: return bch2_trans_update(trans, iter, &wb->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, commit_flags| + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RECLAIM); } -- cgit From 25c70097a65a22a5799442c8935fa927d91bddaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 11 Jun 2023 18:24:04 -0400 Subject: bcachefs: Delete weird hacky transaction restart injection since we currently don't have a good fault injection library, bch2_btree_insert_node() was randomly injecting faults based on local_clock(). At the very least this should have been a debug mode only thing, but this is a brittle method so let's just delete it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 66da1da2f075..d2811c4756b7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1674,9 +1674,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if ((local_clock() & 63) == 63) - return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); - ret = bch2_btree_node_lock_write(trans, path, &b->c); if (ret) return ret; -- cgit From 995f9128e03f769e4cdbcf6dbea8f0db5ce75c69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Mar 2023 18:05:00 -0400 Subject: bcachefs: Fix try_decrease_writepoints() - We may need to drop btree locks before taking the writepoint_lock, as is done in other places. - We should be using open_bucket_free_unused(), so that we don't waste space. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 95829bbfe033..3c5100c26916 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -35,6 +35,15 @@ #include #include +static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + const char * const bch2_alloc_reserves[] = { #define x(t) #t, BCH_ALLOC_RESERVES() @@ -150,9 +159,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } -static void open_bucket_free_unused(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob) +static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= ARRAY_SIZE(c->open_buckets_partial)); @@ -1158,9 +1165,12 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr) +static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { + struct bch_fs *c = trans->c; struct write_point *wp; + struct open_bucket *ob; + unsigned i; mutex_lock(&c->write_points_hash_lock); if (c->write_points_nr < old_nr) { @@ -1179,19 +1189,13 @@ static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr) hlist_del_rcu(&wp->node); mutex_unlock(&c->write_points_hash_lock); - bch2_writepoint_stop(c, NULL, false, wp); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + mutex_unlock(&wp->lock); return true; } -static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, - struct mutex *lock) -{ - if (!mutex_trylock(lock)) { - bch2_trans_unlock(trans); - mutex_lock(lock); - } -} - static struct write_point *writepoint_find(struct btree_trans *trans, unsigned long write_point) { @@ -1336,7 +1340,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob); wp->ptrs = ptrs; @@ -1353,13 +1357,13 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob); wp->ptrs = ptrs; mutex_unlock(&wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && - try_decrease_writepoints(c, write_points_nr)) + try_decrease_writepoints(trans, write_points_nr)) goto retry; if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || -- cgit From 253748a26a14ae22123f3ab670ae04eb15fccc2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Jun 2023 15:05:40 -0400 Subject: bcachefs: snapshot_to_text() includes snapshot tree Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f07b3e2b3226..4b6631c229ee 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -86,13 +86,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u 
subvol %u tree %u", BCH_SNAPSHOT_SUBVOL(s.v), BCH_SNAPSHOT_DELETED(s.v), le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol)); + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, -- cgit From 91ecd41b7f02b95279dddcb2193af454efd39497 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Jun 2023 15:12:04 -0400 Subject: bcachefs: bch2_extent_ptr_desired_durability() This adds a new helper for getting a pointer's durability irrespective of the device state, and uses it in the the data update path. This fixes a bug where we do a data update but request 0 replicas to be allocated, because the replica being rewritten is on a device marked as failed. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/extents.c | 28 +++++++++++++++++++++------- fs/bcachefs/extents.h | 1 + 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c89ee14f8b6b..9f7a30c7ad36 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -474,7 +474,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; - m->op.nr_replicas += bch2_extent_ptr_durability(c, &p); + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); } else if (!p.ptr.cached) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e2b126ad2bab..7e00550980de 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -641,9 +641,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - unsigned durability = 0; struct bch_dev *ca; if (p->ptr.cached) @@ -651,13 +650,28 @@ unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (ca->mi.state != BCH_MEMBER_STATE_failed) - durability = max_t(unsigned, durability, ca->mi.durability); + return ca->mi.durability + + (p->has_ec + ? p->ec.redundancy + : 0); +} - if (p->has_ec) - durability += p->ec.redundancy; +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca; - return durability; + if (p->ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p->ptr.dev); + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + return 0; + + return ca->mi.durability + + (p->has_ec + ? 
p->ec.redundancy + : 0); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 31c8140950e0..3ba41e37d864 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -610,6 +610,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -- cgit From 45a1ab57ddb53a8b392baab2142c909154a8d37c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Jun 2023 23:30:02 -0400 Subject: bcachefs: Fix bch2_btree_update_start() The calculation for number of nodes to allocate in bch2_btree_update_start() was incorrect - this fixes a BUG_ON() on the small nodes test. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d2811c4756b7..e95e48857bb8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1077,7 +1077,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BKEY_BTREE_PTR_U64s_MAX * (1 + split))) break; - split = true; + split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (flags & BTREE_INSERT_GC_LOCK_HELD) -- cgit From 25aa8c2167306a3919fb6503494cd6078e33a71e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Jun 2023 13:25:09 -0400 Subject: bcachefs: bch2_trans_unlock_noassert() This fixes a spurious assert in the btree node read path. 
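Stepping back to the durability split introduced a few commits above: the distinction is easy to restate in isolation. A simplified sketch with a hypothetical, flattened pointer descriptor (not the real bch2 structs):

    #include <stdbool.h>

    /* Hypothetical view of an extent pointer, for illustration only */
    struct ptr_info {
            unsigned dev_durability;   /* durability of the device it lives on */
            unsigned ec_redundancy;    /* erasure-coding redundancy, if striped */
            bool     cached;
            bool     has_ec;
            bool     dev_failed;       /* device marked failed by the admin */
    };

    /* What the data update path now uses when sizing an allocation: how much
     * durability this pointer is supposed to provide, failed device or not. */
    static unsigned desired_durability(const struct ptr_info *p)
    {
            if (p->cached)
                    return 0;
            return p->dev_durability + (p->has_ec ? p->ec_redundancy : 0);
    }

    /* What the pointer actually provides right now: nothing if its device
     * has been marked failed. */
    static unsigned actual_durability(const struct ptr_info *p)
    {
            if (p->cached || p->dev_failed)
                    return 0;
            return p->dev_durability + (p->has_ec ? p->ec_redundancy : 0);
    }
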
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_locking.c | 8 ++++++++ fs/bcachefs/btree_locking.h | 2 ++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 681a47b70a65..435e68888918 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -744,7 +744,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, /* Unlock before doing IO: */ if (trans && sync) - bch2_trans_unlock(trans); + bch2_trans_unlock_noassert(trans); bch2_btree_node_read(c, b, sync); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 23a6d63223af..dc3aa70a0380 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -715,6 +715,14 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) return 0; } +void bch2_trans_unlock_noassert(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} + void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index d3837c25f110..3b537e451d2c 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -21,6 +21,8 @@ void bch2_assert_btree_nodes_not_locked(void); static inline void bch2_assert_btree_nodes_not_locked(void) {} #endif +void bch2_trans_unlock_noassert(struct btree_trans *); + static inline bool is_btree_node(struct btree_path *path, unsigned l) { return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -- cgit From e96f5a61cb8008a1acae9a14cc9d382554da60b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Jun 2023 13:25:35 -0400 Subject: bcachefs: Fix bch2_check_discard_freespace_key() We weren't correctly checking the freespace btree - it's an extents btree, which means we need to iterate over each bucket in a freespace extent. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 47 +++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f774a660a681..6c8bcb210ad9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -788,10 +788,12 @@ static int bch2_bucket_do_index(struct btree_trans *trans, if (ca->mi.freespace_initialized && test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s btree (got %s should be %s)\n" + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? 
"setting" : "clearing", bch2_btree_ids[btree], + iter.pos.inode, + iter.pos.offset, bch2_bkey_types[old.k->type], bch2_bkey_types[old_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -1278,8 +1280,8 @@ fsck_err: return ret; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +static int __bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; @@ -1313,23 +1315,46 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), c, - "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, a->data_type == state, genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: fsck_err: + set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0); + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); goto out; } +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end) +{ + if (!btree_node_type_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret = 0; + + while (!bkey_eq(iter->pos, end) && + !(ret = btree_trans_too_many_iters(trans) ?: + __bch2_check_discard_freespace_key(trans, iter))) + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + + return ret; + } +} + /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1481,16 +1506,14 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key2(&trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)) ?: + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: for_each_btree_key_commit(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, -- cgit From 6547ebabdaac4407ccc978f63f4dc4d9f8936783 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Jun 2023 00:07:40 -0400 Subject: bcachefs: Don't call lock_graph_descend() with wait lock held This fixes a deadlock: 01305 WARNING: possible circular locking dependency detected 01305 6.3.0-ktest-gf4de9bee61af #5305 Tainted: G W 01305 ------------------------------------------------------ 01305 cat/14658 is trying to acquire lock: 01305 ffffffc00982f460 (fs_reclaim){+.+.}-{0:0}, at: __kmem_cache_alloc_node+0x48/0x278 01305 01305 but task is already holding lock: 01305 ffffff8011aaf040 
(&lock->wait_lock){+.+.}-{2:2}, at: bch2_check_for_deadlock+0x4b8/0xa58 01305 01305 which lock already depends on the new lock. 01305 01305 01305 the existing dependency chain (in reverse order) is: 01305 01305 -> #2 (&lock->wait_lock){+.+.}-{2:2}: 01305 _raw_spin_lock+0x54/0x70 01305 __six_lock_wakeup+0x40/0x1b0 01305 six_unlock_ip+0xe8/0x248 01305 bch2_btree_key_cache_scan+0x720/0x940 01305 shrink_slab.constprop.0+0x284/0x770 01305 shrink_node+0x390/0x828 01305 balance_pgdat+0x390/0x6d0 01305 kswapd+0x2e4/0x718 01305 kthread+0x184/0x1a8 01305 ret_from_fork+0x10/0x20 01305 01305 -> #1 (&c->lock#2){+.+.}-{3:3}: 01305 __mutex_lock+0x104/0x14a0 01305 mutex_lock_nested+0x30/0x40 01305 bch2_btree_key_cache_scan+0x5c/0x940 01305 shrink_slab.constprop.0+0x284/0x770 01305 shrink_node+0x390/0x828 01305 balance_pgdat+0x390/0x6d0 01305 kswapd+0x2e4/0x718 01305 kthread+0x184/0x1a8 01305 ret_from_fork+0x10/0x20 01305 01305 -> #0 (fs_reclaim){+.+.}-{0:0}: 01305 __lock_acquire+0x19d0/0x2930 01305 lock_acquire+0x1dc/0x458 01305 fs_reclaim_acquire+0x9c/0xe0 01305 __kmem_cache_alloc_node+0x48/0x278 01305 __kmalloc_node_track_caller+0x5c/0x278 01305 krealloc+0x94/0x180 01305 bch2_printbuf_make_room.part.0+0xac/0x118 01305 bch2_prt_printf+0x150/0x1e8 01305 bch2_btree_bkey_cached_common_to_text+0x170/0x298 01305 bch2_btree_trans_to_text+0x244/0x348 01305 print_cycle+0x7c/0xb0 01305 break_cycle+0x254/0x528 01305 bch2_check_for_deadlock+0x59c/0xa58 01305 bch2_btree_deadlock_read+0x174/0x200 01305 full_proxy_read+0x94/0xf0 01305 vfs_read+0x15c/0x3a8 01305 ksys_read+0xb8/0x148 01305 __arm64_sys_read+0x48/0x60 01305 invoke_syscall.constprop.0+0x64/0x138 01305 do_el0_svc+0x84/0x138 01305 el0_svc+0x34/0x80 01305 el0t_64_sync_handler+0xb0/0xb8 01305 el0t_64_sync+0x14c/0x150 01305 01305 other info that might help us debug this: 01305 01305 Chain exists of: 01305 fs_reclaim --> &c->lock#2 --> &lock->wait_lock 01305 01305 Possible unsafe locking scenario: 01305 01305 CPU0 CPU1 01305 ---- ---- 01305 lock(&lock->wait_lock); 01305 lock(&c->lock#2); 01305 lock(&lock->wait_lock); 01305 lock(fs_reclaim); 01305 01305 *** DEADLOCK *** Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index dc3aa70a0380..e6fe2a987574 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -112,10 +112,8 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) lock_graph_up(g); } -static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { - closure_get(&trans->ref); - g->g[g->nr++] = (struct trans_waiting_for_lock) { .trans = trans, .node_want = trans->locking, @@ -123,6 +121,12 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) }; } +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + __lock_graph_down(g, trans); +} + static bool lock_graph_remove_non_waiters(struct lock_graph *g) { struct trans_waiting_for_lock *i; @@ -223,10 +227,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct trans_waiting_for_lock *i; for (i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) + if (i->trans == trans) { + closure_put(&trans->ref); return break_cycle(g, cycle); + } if (g->nr == ARRAY_SIZE(g->g)) { + closure_put(&trans->ref); 
+ if (orig_trans->lock_may_not_fail) return 0; @@ -240,7 +248,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } - lock_graph_down(g, trans); + __lock_graph_down(g, trans); return 0; } @@ -335,9 +343,10 @@ next: !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) continue; - ret = lock_graph_descend(&g, trans, cycle); + closure_get(&trans->ref); raw_spin_unlock(&b->lock.wait_lock); + ret = lock_graph_descend(&g, trans, cycle); if (ret) return ret; goto next; -- cgit From a5b696ee6e10103def82ea9abc18958912e81b00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Jun 2023 21:01:13 -0400 Subject: bcachefs: seqmutex; fix a lockdep splat We can't be holding btree_trans_lock while copying to user space, which might incur a page fault. To fix this, convert it to a seqmutex so we can unlock/relock. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/btree_iter.c | 18 +++++++++--------- fs/bcachefs/debug.c | 46 +++++++++++++++++++++++++++++++++++----------- fs/bcachefs/seqmutex.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/sysfs.c | 4 ++-- 5 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 fs/bcachefs/seqmutex.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0dfa42e297e0..4199b42db640 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -208,6 +208,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" +#include "seqmutex.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -779,7 +780,7 @@ struct bch_fs { } btree_write_stats[BTREE_WRITE_TYPE_NR]; /* btree_iter.c: */ - struct mutex btree_trans_lock; + struct seqmutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_paths_pool; mempool_t btree_trans_mem_pool; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e8fec59dac02..8335387d3397 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2991,7 +2991,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { struct btree_trans *pos; - mutex_lock(&c->btree_trans_lock); + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { /* * We'd much prefer to be stricter here and completely @@ -3009,7 +3009,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ } list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); } } @@ -3044,6 +3044,12 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + closure_sync(&trans->ref); if (s) @@ -3055,12 +3061,6 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - mutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - mutex_unlock(&c->btree_trans_lock); - } - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -3198,7 +3198,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) } INIT_LIST_HEAD(&c->btree_trans_list); - 
mutex_init(&c->btree_trans_lock); + seqmutex_init(&c->btree_trans_lock); ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, sizeof(struct btree_path) * nr + diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8981acc15098..df0e14dc96e6 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -627,19 +627,26 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; + u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; - - mutex_lock(&c->btree_trans_lock); +restart: + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { if (trans->locking_wait.task->pid <= i->iter) continue; + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + ret = flush_buf(i); - if (ret) - break; + if (ret) { + closure_put(&trans->ref); + goto unlocked; + } bch2_btree_trans_to_text(&i->buf, trans); @@ -651,9 +658,14 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_newline(&i->buf); i->iter = trans->locking_wait.task->pid; - } - mutex_unlock(&c->btree_trans_lock); + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +unlocked: if (i->buf.allocation_failure) ret = -ENOMEM; @@ -815,6 +827,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; + u32 seq; i->ubuf = buf; i->size = size; @@ -822,21 +835,32 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, if (i->iter) goto out; - - mutex_lock(&c->btree_trans_lock); +restart: + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { if (trans->locking_wait.task->pid <= i->iter) continue; + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + ret = flush_buf(i); - if (ret) - break; + if (ret) { + closure_put(&trans->ref); + goto out; + } bch2_check_for_deadlock(trans, &i->buf); i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; } - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); out: if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 index 000000000000..c1860d8163fb --- /dev/null +++ b/fs/bcachefs/seqmutex.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SEQMUTEX_H +#define _BCACHEFS_SEQMUTEX_H + +#include + +struct seqmutex { + struct mutex lock; + u32 seq; +}; + +#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) + +static inline bool seqmutex_trylock(struct seqmutex *lock) +{ + return mutex_trylock(&lock->lock); +} + +static inline void seqmutex_lock(struct seqmutex *lock) +{ + mutex_lock(&lock->lock); +} + +static inline void seqmutex_unlock(struct seqmutex *lock) +{ + lock->seq++; + mutex_unlock(&lock->lock); +} + +static inline u32 seqmutex_seq(struct seqmutex *lock) +{ + return lock->seq; +} + +static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) +{ + if (lock->seq != seq || !mutex_trylock(&lock->lock)) + return false; + + if (lock->seq != seq) { + mutex_unlock(&lock->lock); + return false; + } + + return true; +} + +#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git 
a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 77f92d537af6..54e1071ecfeb 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -379,7 +379,7 @@ static void bch2_btree_wakeup_all(struct bch_fs *c) { struct btree_trans *trans; - mutex_lock(&c->btree_trans_lock); + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); @@ -387,7 +387,7 @@ static void bch2_btree_wakeup_all(struct bch_fs *c) six_lock_wakeup_all(&b->lock); } - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); } SHOW(bch2_fs) -- cgit From a83e108fc1964b8273c6f51cc62588ee774a5a48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Jun 2023 21:12:05 -0400 Subject: bcachefs: fiemap: Fix a lockdep splat As with the previous patch, we generally can't hold btree locks while copying to userspace, as that may incur a page fault and require mmap_lock. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f417889eba08..9280f514bc9f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -943,6 +943,7 @@ retry: cur.k->k.p.offset += cur.k->k.size; if (have_extent) { + bch2_trans_unlock(&trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) @@ -961,9 +962,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - if (!ret && have_extent) + if (!ret && have_extent) { + bch2_trans_unlock(&trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); + } bch2_trans_exit(&trans); bch2_bkey_buf_exit(&cur, c); -- cgit From 1bb3c2a9747c404d23012088fbefb4499b884415 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Jun 2023 13:49:25 -0400 Subject: bcachefs: New error message helpers Add two new helpers for printing error messages with __func__ and bch2_err_str(): - bch_err_fn - bch_err_msg Also kill the old error strings in the recovery path, which were causing us to incorrectly report memory allocation failures - they're not needed anymore. 
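For reference, the two helpers just prepend the calling function's name and the decoded error code. A tiny userspace analogue (strerror() standing in for bch2_err_str(), names hypothetical) shows the shape of the output:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Userspace stand-ins mirroring the format of bch_err_fn()/bch_err_msg() */
    #define err_fn(_ret) \
            fprintf(stderr, "%s(): error %s\n", __func__, strerror(-(_ret)))
    #define err_msg(_ret, _msg) \
            fprintf(stderr, "%s(): error " _msg " %s\n", __func__, strerror(-(_ret)))

    static int read_alloc_info(void)
    {
            int ret = -ENOMEM;      /* pretend the btree walk failed */

            if (ret)
                    err_fn(ret);    /* "read_alloc_info(): error Cannot allocate memory" */
            return ret;
    }

    int main(void)
    {
            int ret = read_alloc_info();

            if (ret)
                    err_msg(ret, "reading alloc info");
                    /* "main(): error reading alloc info Cannot allocate memory" */
            return ret ? 1 : 0;
    }
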
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 33 +++++++------ fs/bcachefs/backpointers.c | 10 +++- fs/bcachefs/bcachefs.h | 5 ++ fs/bcachefs/btree_gc.c | 27 +++++------ fs/bcachefs/buckets.c | 5 +- fs/bcachefs/ec.c | 4 +- fs/bcachefs/fsck.c | 59 +++++++++++++---------- fs/bcachefs/journal.c | 15 ++++-- fs/bcachefs/lru.c | 16 +++---- fs/bcachefs/move.c | 8 ++-- fs/bcachefs/quota.c | 5 +- fs/bcachefs/recovery.c | 106 +++++++++++++++-------------------------- fs/bcachefs/subvolume.c | 9 ++-- fs/bcachefs/tests.c | 40 ++++++++-------- 14 files changed, 171 insertions(+), 171 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6c8bcb210ad9..f68330b48847 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -577,7 +577,7 @@ int bch2_alloc_read(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -684,8 +684,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); - + bch_err_fn(c, ret); return ret; } @@ -730,7 +729,7 @@ int bch2_bucket_gens_read(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1521,7 +1520,9 @@ bkey_err: bch2_check_bucket_gens_key(&trans, &iter, k)); err: bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, @@ -1599,20 +1600,18 @@ fsck_err: int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter)); - - bch2_trans_exit(&trans); - return ret < 0 ? 
ret : 0; + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter))); + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -2024,6 +2023,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) ret = bch2_dev_freespace_init(c, ca, &last_updated); if (ret) { percpu_ref_put(&ca->ref); + bch_err_fn(c, ret); return ret; } } @@ -2032,11 +2032,10 @@ int bch2_fs_freespace_init(struct bch_fs *c) mutex_lock(&c->sb_lock); bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); } - return ret; + return 0; } /* Bucket IO clocks: */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 11201064d9a4..2641ebef6ae4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -404,12 +404,16 @@ int bch2_check_btree_backpointers(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; + int ret; - return bch2_trans_run(c, + ret = bch2_trans_run(c, for_each_btree_key_commit(&trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, bch2_check_btree_backpointer(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; } struct bpos_level { @@ -769,6 +773,8 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) } bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } @@ -872,5 +878,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4199b42db640..b8d50fe64b3c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -291,6 +291,11 @@ do { \ #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) +#define bch_err_fn(_c, _ret) \ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) +#define bch_err_msg(_c, _ret, _msg) \ + bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) + #define bch_verbose(c, fmt, ...) 
\ do { \ if ((c)->opts.verbose) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8477e721b63c..4fbd2e545ac2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -404,8 +404,7 @@ again: } if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); break; } @@ -473,8 +472,7 @@ again: ret = PTR_ERR_OR_ZERO(cur); if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); goto err; } @@ -687,7 +685,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { - bch_err(c, "%s: error allocating new key", __func__); + bch_err_msg(c, ret, "allocating new key"); ret = -BCH_ERR_ENOMEM_gc_repair_key; goto err; } @@ -814,7 +812,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, fsck_err: err: if (ret) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -919,11 +917,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, true); - if (ret) { - bch_err(c, "%s: error from bch2_gc_mark_key: %s", - __func__, bch2_err_str(ret)); + if (ret) goto fsck_err; - } if (b->c.level) { bch2_bkey_buf_reassemble(&cur, c, k); @@ -981,8 +976,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b continue; } } else if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); break; } @@ -1049,7 +1043,7 @@ fsck_err: six_unlock_read(&b->c.lock); if (ret < 0) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); printbuf_exit(&buf); return ret; } @@ -1079,7 +1073,7 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(&trans, ids[i], initial, metadata_only); if (ret < 0) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_trans_exit(&trans); return ret; @@ -1277,7 +1271,7 @@ fsck_err: if (ca) percpu_ref_put(&ca->ref); if (ret) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); @@ -1883,6 +1877,9 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fbe0cd0a7de3..d770dc949661 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1988,7 +1988,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + if (ret) + bch_err_fn(c, ret); + return ret; } /* Disk reservations: */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b7e3889b114b..0c5c291e844e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -798,7 +798,7 @@ static void ec_stripe_delete_work(struct work_struct *work) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, ec_stripe_delete(&trans, idx)); if (ret) { - bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); break; } } @@ -1845,7 
+1845,7 @@ int bch2_stripes_read(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading stripes: %i", ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index dcc55cbd3808..194e8d474e86 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -303,7 +303,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_exit(trans, &iter); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -983,7 +983,7 @@ static int check_inode(struct btree_trans *trans, err: fsck_err: if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1009,7 +1009,7 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_exit(&trans); snapshots_seen_exit(&s); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1129,7 +1129,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; @@ -1353,7 +1353,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1395,7 +1395,7 @@ static int check_extents(struct bch_fs *c) snapshots_seen_exit(&s); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1434,7 +1434,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; @@ -1555,7 +1555,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1725,7 +1725,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1764,7 +1764,7 @@ static int check_dirents(struct bch_fs *c) inode_walker_exit(&target); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1801,7 +1801,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1833,7 +1833,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1896,12 +1896,18 @@ fsck_err: noinline_for_stack static int check_root(struct bch_fs *c) { + int ret; + bch_verbose(c, "checking root directory"); - return bch2_trans_do(c, NULL, NULL, + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, check_root_trans(&trans)); + + if (ret) + bch_err_fn(c, 
ret); + return ret; } struct pathbuf_entry { @@ -2038,7 +2044,7 @@ static int check_path(struct btree_trans *trans, } fsck_err: if (ret) - bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -2081,10 +2087,11 @@ static int check_directory_structure(struct bch_fs *c) break; } bch2_trans_iter_exit(&trans, &iter); - + bch2_trans_exit(&trans); darray_exit(&path); - bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } @@ -2364,6 +2371,8 @@ static int check_nlinks(struct bch_fs *c) kvfree(links.d); + if (ret) + bch_err_fn(c, ret); return ret; } @@ -2397,7 +2406,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, noinline_for_stack static int fix_reflink_p(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -2407,15 +2415,16 @@ static int fix_reflink_p(struct bch_fs *c) bch_verbose(c, "fixing reflink_p keys"); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter, k)); + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter, k))); - bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 433c97844f36..64332c78a6bb 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -978,7 +978,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, } if (ret) - bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); unlock: up_write(&c->state_lock); return ret; @@ -987,9 +987,12 @@ unlock: int bch2_dev_journal_alloc(struct bch_dev *ca) { unsigned nr; + int ret; - if (dynamic_fault("bcachefs:add:journal_alloc")) - return -BCH_ERR_ENOMEM_set_nr_journal_buckets; + if (dynamic_fault("bcachefs:add:journal_alloc")) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err; + } /* 1/128th of the device by default: */ nr = ca->mi.nbuckets >> 7; @@ -1003,7 +1006,11 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); +err: + if (ret) + bch_err_fn(ca, ret); + return ret; } /* startup/shutdown: */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 4f23e88f6ae1..e04c037f0c01 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -160,20 +160,18 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bpos last_flushed_pos = POS_MIN; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)); - - bch2_trans_exit(&trans); + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + if (ret) + bch_err_fn(c, 
ret); return ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fd629136824b..37fb3784a2f9 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -690,7 +690,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (ret) { - bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "looking up alloc key"); goto err; } @@ -701,7 +701,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, ret = bch2_btree_write_buffer_flush(trans); if (ret) { - bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "flushing btree write buffer"); goto err; } @@ -904,7 +904,7 @@ next: bch2_trans_exit(&trans); if (ret) - bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_btree_interior_updates_flush(c); @@ -1029,6 +1029,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) mutex_unlock(&c->sb_lock); } + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index d20ec9764108..7e1f1828ab20 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -621,10 +621,11 @@ int bch2_fs_quota_read(struct bch_fs *c) for_each_btree_key2(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); - if (ret) - bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e4983d144483..09c9d4058f82 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -685,6 +685,9 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) bch2_journal_log_msg(c, "journal replay finished"); err: kvfree(keys_sorted); + + if (ret) + bch_err_fn(c, ret); return ret; } @@ -1034,9 +1037,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) root_tree.k.p.offset = 1; root_tree.v.master_subvol = cpu_to_le32(1); root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, - &root_tree.k_i, - NULL, NULL, 0); bkey_snapshot_init(&root_snapshot.k_i); root_snapshot.k.p.offset = U32_MAX; @@ -1046,28 +1046,27 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) root_snapshot.v.tree = cpu_to_le32(1); SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - ret = bch2_btree_insert(c, BTREE_ID_snapshots, - &root_snapshot.k_i, - NULL, NULL, 0); - if (ret) - return ret; - bkey_subvolume_init(&root_volume.k_i); root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; root_volume.v.flags = 0; root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_subvolumes, - &root_volume.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, + &root_tree.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); if (ret) - return ret; - - return 0; + bch_err_fn(c, ret); + return ret; } -static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) { struct btree_iter iter; struct bkey_s_c k; @@ -1097,9 +1096,19 @@ err: return ret; } +/* set bi_subvol on root inode */ +noinline_for_stack +static int 
bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { - const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; u64 last_seq, blacklist_seq, journal_seq; @@ -1137,12 +1146,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { - bch_info(c, "alloc_v2 feature bit not set, fsck required"); - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } - if (!c->opts.nochanges) { if (c->sb.version < bcachefs_metadata_required_upgrade_below) { bch_info(c, "version %s (%u) prior to %s (%u), upgrade and fsck required", @@ -1286,34 +1289,28 @@ use_clean: goto err; bch_verbose(c, "starting alloc read"); - err = "error reading allocation information"; - down_read(&c->gc_lock); ret = c->sb.version < bcachefs_metadata_version_bucket_gens ? bch2_alloc_read(c) : bch2_bucket_gens_read(c); up_read(&c->gc_lock); - if (ret) goto err; bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); - err = "error reading stripes"; ret = bch2_stripes_read(c); if (ret) goto err; bch_verbose(c, "stripes_read done"); if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) goto err; } bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; ret = bch2_fs_snapshots_start(c); if (ret) goto err; @@ -1323,7 +1320,6 @@ use_clean: bool metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); - err = "error checking allocations"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; @@ -1334,7 +1330,6 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); if (ret) goto err; @@ -1342,7 +1337,6 @@ use_clean: bch_info(c, "journal replay done"); bch_info(c, "checking need_discard and freespace btrees"); - err = "error checking need_discard and freespace btrees"; ret = bch2_check_alloc_info(c); if (ret) goto err; @@ -1351,7 +1345,6 @@ use_clean: set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); bch_info(c, "checking lrus"); - err = "error checking lrus"; ret = bch2_check_lrus(c); if (ret) goto err; @@ -1359,21 +1352,18 @@ use_clean: set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); bch_info(c, "checking backpointers to alloc keys"); - err = "error checking backpointers to alloc keys"; ret = bch2_check_btree_backpointers(c); if (ret) goto err; bch_verbose(c, "done checking backpointers to alloc keys"); bch_info(c, "checking backpointers to extents"); - err = "error checking backpointers to extents"; ret = bch2_check_backpointers_to_extents(c); if (ret) goto err; bch_verbose(c, "done checking backpointers to extents"); bch_info(c, "checking extents to backpointers"); - err = "error checking extents to backpointers"; ret = bch2_check_extents_to_backpointers(c); if (ret) goto err; @@ -1381,7 +1371,6 @@ use_clean: set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); bch_info(c, "checking alloc to lru refs"); - err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); if (ret) goto err; @@ -1401,7 +1390,6 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_verbose(c, "starting 
journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); if (ret) goto err; @@ -1409,7 +1397,6 @@ use_clean: bch_info(c, "journal replay done"); } - err = "error initializing freespace"; ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1417,7 +1404,6 @@ use_clean: if (c->sb.version < bcachefs_metadata_version_bucket_gens && c->opts.version_upgrade) { bch_info(c, "initializing bucket_gens"); - err = "error initializing bucket gens"; ret = bch2_bucket_gens_init(c); if (ret) goto err; @@ -1425,24 +1411,18 @@ use_clean: } if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - /* set bi_subvol on root inode */ - err = "error upgrade root inode for subvolumes"; - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_fs_upgrade_for_subvolumes(&trans)); + ret = bch2_fs_upgrade_for_subvolumes(c); if (ret) goto err; } if (c->opts.fsck) { - bch_info(c, "starting fsck"); - err = "error in fsck"; ret = bch2_fsck_full(c); if (ret) goto err; bch_verbose(c, "fsck done"); } else if (!c->sb.clean) { bch_verbose(c, "checking for deleted inodes"); - err = "error in recovery"; ret = bch2_fsck_walk_inodes_only(c); if (ret) goto err; @@ -1489,11 +1469,8 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); + ret = bch2_fs_read_write(c) ?: + bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; bch_info(c, "scanning for old btree nodes done"); @@ -1521,7 +1498,7 @@ out: } if (ret) - bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); + bch_err_fn(c, ret); else bch_verbose(c, "ret %s", bch2_err_str(ret)); return ret; @@ -1536,7 +1513,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - const char *err = "cannot allocate memory"; struct bch_dev *ca; unsigned i; int ret; @@ -1570,7 +1546,6 @@ int bch2_fs_initialize(struct bch_fs *c) for_each_online_member(ca, c, i) bch2_dev_usage_init(ca); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); if (ret) { @@ -1586,7 +1561,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1); bch2_journal_set_replay_done(&c->journal); - err = "error going read-write"; ret = bch2_fs_read_write_early(c); if (ret) goto err; @@ -1596,7 +1570,6 @@ int bch2_fs_initialize(struct bch_fs *c) * btree updates */ bch_verbose(c, "marking superblocks"); - err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { @@ -1607,19 +1580,15 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } - bch_verbose(c, "initializing freespace"); - err = "error initializing freespace"; ret = bch2_fs_freespace_init(c); if (ret) goto err; - err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) goto err; bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; ret = bch2_fs_snapshots_start(c); if (ret) goto err; @@ -1631,16 +1600,16 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, NULL, 
0); - if (ret) + if (ret) { + bch_err_msg(c, ret, "creating root directory"); goto err; + } bch2_inode_init_early(c, &lostfound_inode); - err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, BCACHEFS_ROOT_SUBVOL_INUM, @@ -1649,7 +1618,7 @@ int bch2_fs_initialize(struct bch_fs *c) 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { - bch_err(c, "error creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); goto err; } @@ -1659,10 +1628,11 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; } - err = "error writing first journal entry"; ret = bch2_journal_flush(&c->journal); - if (ret) + if (ret) { + bch_err_msg(c, ret, "writing first journal entry"); goto err; + } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1673,6 +1643,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - pr_err("Error initializing new filesystem: %s (%s)", err, bch2_err_str(ret)); + bch_err_fn(ca, ret); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 4b6631c229ee..828644e6c714 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -623,7 +623,7 @@ int bch2_fs_check_snapshots(struct bch_fs *c) NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_snapshot(&trans, &iter, k))); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -702,8 +702,7 @@ int bch2_fs_check_subvols(struct bch_fs *c) NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_subvol(&trans, &iter, k))); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); - + bch_err_fn(c, ret); return ret; } @@ -724,7 +723,7 @@ int bch2_fs_snapshots_start(struct bch_fs *c) bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(&trans, k))); if (ret) - bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1123,6 +1122,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) err: darray_exit(&deleted); bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 35df3f940542..50d69a5634bd 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -47,7 +47,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "update error"); goto err; } @@ -56,7 +56,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error (first)"); goto err; } @@ -65,7 +65,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error (second)"); goto err; } err: @@ -93,7 +93,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "update error"); goto err; } @@ -104,7 +104,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) 
bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error"); goto err; } err: @@ -137,7 +137,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -153,7 +153,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -168,7 +168,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating backwards"); goto err; } @@ -204,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -221,7 +221,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -237,7 +237,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating backwards"); goto err; } @@ -272,7 +272,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -289,7 +289,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -312,7 +312,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) 0; })); if (ret < 0) { - bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards by slots"); goto err; } ret = 0; @@ -346,7 +346,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -364,7 +364,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -387,7 +387,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards by slots"); goto err; } ret = 0; @@ -461,7 +461,7 @@ static int insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return 
ret; } @@ -560,7 +560,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) ret = test_snapshot_filter(c, snapids[0], snapids[1]); if (ret) { - bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "from test_snapshot_filter"); return ret; } @@ -674,7 +674,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "lookup error"); if (ret) return ret; -- cgit From b6898917f2b5532ca7ad9b16131a5a6b513285e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Jun 2023 00:31:49 -0400 Subject: bcachefs: Check for ERR_PTR() from filemap_lock_folio() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c864c271b7c2..103f426c88e8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -124,7 +124,7 @@ static int filemap_get_contig_folios_d(struct address_space *mapping, break; f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (!f) + if (IS_ERR_OR_NULL(f)) break; BUG_ON(folios->nr && folio_pos(f) != pos); @@ -1764,7 +1764,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, mapping_gfp_mask(mapping)); - if (!folio) + if (IS_ERR_OR_NULL(folio)) goto err_unlock; if (folio_test_uptodate(folio)) @@ -2852,7 +2852,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, u64 end_pos; folio = filemap_lock_folio(mapping, index); - if (!folio) { + if (IS_ERR_OR_NULL(folio)) { /* * XXX: we're doing two index lookups when we end up reading the * folio @@ -2865,7 +2865,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (unlikely(!folio)) { + if (unlikely(IS_ERR_OR_NULL(folio))) { ret = -ENOMEM; goto out; } @@ -3788,7 +3788,7 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) bool ret = true; folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); - if (!folio) + if (IS_ERR_OR_NULL(folio)) return true; s = bch2_folio(folio); -- cgit From 462f494bc56052e3d17c9ae48a6e407b3f9d2c0c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Jun 2023 06:00:04 -0400 Subject: bcachefs: Fix lockdep splat in bch2_readdir dir_emit() can fault (taking mmap_lock); thus we can't be holding btree locks. 
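The fix follows the usual pattern for calling something that may fault from inside btree iteration: copy the key out of the btree into a private buffer, drop the transaction's btree locks, and only then call dir_emit(). A condensed sketch of that pattern, using the names from the diff below (surrounding loop and error handling omitted):

    struct bkey_buf sk;

    bch2_bkey_buf_init(&sk);
    ...
    /* copy the dirent out of the btree so it stays valid after unlocking: */
    bch2_bkey_buf_reassemble(&sk, c, k);
    dirent = bkey_i_to_s_c_dirent(sk.k);

    /* dir_emit() can fault and take mmap_lock, so drop btree locks first: */
    bch2_trans_unlock(&trans);

    if (!dir_emit(ctx, dirent.v->d_name,
                  bch2_dirent_name_bytes(dirent), ...))
            break;

The copy is what makes the unlock safe: once the transaction is unlocked, pointers into the btree node the iterator was on are no longer valid.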
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1544fc56974f..610dd7425fb4 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_update.h" #include "extents.h" @@ -504,8 +505,10 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; + struct bkey_buf sk; int ret; + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -528,10 +531,11 @@ retry: if (ret) continue; - /* - * XXX: dir_emit() can fault and block, while we're holding - * locks - */ + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(&trans); + ctx->pos = dirent.k->p.offset; if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), @@ -554,6 +558,7 @@ err: goto retry; bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); return ret; } -- cgit From 9473cff989c8519d01c6a285bd94d2ed35d30251 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Jun 2023 06:44:44 -0400 Subject: bcachefs: Fix more lockdep splats in debug.c Similar to previous fixes, we can't incur page faults while holding btree locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/debug.c | 34 +++++++++++++++++----------------- fs/bcachefs/errcode.h | 2 +- fs/bcachefs/io.c | 2 +- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index d2af3f38e6f5..9ef9527dda6b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -286,7 +286,7 @@ __always_inline static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = _THIS_IP_; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index df0e14dc96e6..ae47e1854b80 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -378,26 +378,25 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - bch2_trans_init(&trans, i->c, 0, 0); + ret = flush_buf(i); + if (ret) + return ret; + bch2_trans_init(&trans, i->c, 0, 0); ret = for_each_btree_key2(&trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ - ret = flush_buf(i); - if (ret) - break; - bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - 0; + drop_locks_do(&trans, flush_buf(i)); })); i->from = iter.pos; + bch2_trans_exit(&trans); + if (!ret) ret = flush_buf(i); - bch2_trans_exit(&trans); - return ret ?: i->ret; } @@ -429,19 +428,24 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, return i->ret; bch2_trans_init(&trans, i->c, 0, 0); +retry: + bch2_trans_begin(&trans); for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { - ret = flush_buf(i); - if (ret) - break; - bch2_btree_node_to_text(&i->buf, i->c, b); i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? 
bpos_successor(b->key.k.p) : b->key.k.p; + + ret = drop_locks_do(&trans, flush_buf(i)); + if (ret) + break; } bch2_trans_iter_exit(&trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); if (!ret) @@ -483,17 +487,13 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); - ret = flush_buf(i); - if (ret) - break; - if (bpos_gt(l->b->key.k.p, i->prev_node)) { bch2_btree_node_to_text(&i->buf, i->c, l->b); i->prev_node = l->b->key.k.p; } bch2_bfloat_to_text(&i->buf, l->b, _k); - 0; + drop_locks_do(&trans, flush_buf(i)); })); i->from = iter.pos; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 12c0c44eb6b0..621ff4647205 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -224,7 +224,7 @@ bool __bch2_err_matches(int, int); static inline bool _bch2_err_matches(int err, int class) { - return err && __bch2_err_matches(err, class); + return err < 0 && __bch2_err_matches(err, class); } #define bch2_err_matches(_err, _class) \ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 25a9f657910c..0f5cbfa78b71 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1645,7 +1645,7 @@ err_bucket_stale: percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* We can retry this: */ - ret = BCH_ERR_transaction_restart; + ret = -BCH_ERR_transaction_restart; goto out; } -- cgit From 3a63b32f121262f0566f8b89e98dd0c10f610325 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jun 2023 12:17:57 -0400 Subject: bcachefs: bch2_trans_mark_pointer() refactoring bch2_bucket_backpointer_mod() doesn't need to update the alloc key, we can exit the alloc iter earlier. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d770dc949661..48fdd5f96a3b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1441,20 +1441,20 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans, ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors); + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + if (ret) - goto err; + return ret; if (!p.ptr.cached) { ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) - goto err; + return ret; } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, -- cgit From e9d017234ff96ed9820dc7cd3a4c940af44330bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jun 2023 15:59:03 -0400 Subject: bcachefs: BCH_ERR_fsck -> EINVAL When we return errors outside of bcachefs, we need to return a standard error code - fix this for BCH_ERR_fsck. 
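The mapping lives in the error-code x-macro table in errcode.h, where the first column is the parent a private code resolves to; chains of private codes are meant to bottom out in a standard errno before an error escapes the filesystem. After this change the fsck family decays to EINVAL instead of 0. Sketch of the relevant rows (as in the diff below):

    x(EINVAL,           fsck)                           \
    x(BCH_ERR_fsck,     fsck_fix)                       \
    x(BCH_ERR_fsck,     fsck_ignore)                    \
    x(BCH_ERR_fsck,     fsck_errors_not_fixed)          \

so resolving an error to its outermost class (e.g. with bch2_err_class(), assuming that helper) now yields -EINVAL for fsck errors rather than a private value userspace can't interpret.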
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 621ff4647205..1e06d95f3484 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -151,7 +151,7 @@ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ - x(0, fsck) \ + x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ -- cgit From e53a961c6b1ced2ac1ab69fdf56706cf21e6f7a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jun 2023 19:30:10 -0400 Subject: bcachefs: Rename enum alloc_reserve -> bch_watermark This is prep work for consolidating with JOURNAL_WATERMARK. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_foreground.c | 82 ++++++++++++++++++------------------- fs/bcachefs/alloc_foreground.h | 8 ++-- fs/bcachefs/alloc_types.h | 18 ++++---- fs/bcachefs/btree_update_interior.c | 6 +-- fs/bcachefs/buckets.h | 28 ++++++------- fs/bcachefs/data_update.c | 4 +- fs/bcachefs/ec.c | 34 +++++++-------- fs/bcachefs/ec.h | 4 +- fs/bcachefs/io.c | 4 +- fs/bcachefs/io.h | 4 +- fs/bcachefs/io_types.h | 2 +- fs/bcachefs/journal.c | 2 +- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/sysfs.c | 4 +- 15 files changed, 101 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 94b3c057cd62..3c4d6d40b120 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -220,7 +220,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, u64 free = max_t(s64, 0, u.d[BCH_DATA_free].buckets + u.d[BCH_DATA_need_discard].buckets - - bch2_dev_buckets_reserved(ca, RESERVE_stripe)); + - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3c5100c26916..c7db89b92dbf 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -44,9 +44,9 @@ static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, } } -const char * const bch2_alloc_reserves[] = { +const char * const bch2_watermarks[] = { #define x(t) #t, - BCH_ALLOC_RESERVES() + BCH_WATERMARKS() #undef x NULL }; @@ -188,13 +188,13 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) return -1; } -static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { - switch (reserve) { - case RESERVE_btree: - case RESERVE_btree_movinggc: + switch (watermark) { + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: return 0; - case RESERVE_movinggc: + case BCH_WATERMARK_copygc: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; @@ -203,7 +203,7 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, - enum alloc_reserve reserve, + enum bch_watermark watermark, const struct bch_alloc_v4 *a, struct bucket_alloc_state *s, struct closure *cl) @@ -233,7 +233,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if 
(unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -284,7 +284,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, u64 free_entry, + enum bch_watermark watermark, u64 free_entry, struct bucket_alloc_state *s, struct bkey_s_c freespace_k, struct closure *cl) @@ -374,7 +374,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } } - ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl); + ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) iter.path->preserve = false; err: @@ -394,7 +394,7 @@ err: static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { @@ -424,7 +424,7 @@ again: s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, a, s, cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); if (ob) break; } @@ -445,7 +445,7 @@ again: static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { @@ -474,7 +474,7 @@ again: s->buckets_seen++; - ob = try_alloc_bucket(trans, ca, reserve, + ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { iter.path->preserve = false; @@ -507,7 +507,7 @@ again: */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct closure *cl, struct bch_dev_usage *usage) { @@ -519,7 +519,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, reserve); + avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); @@ -548,8 +548,8 @@ again: closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) - : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); @@ -564,7 +564,7 @@ err: if (!IS_ERR(ob)) trace_and_count(c, bucket_alloc, ca, - bch2_alloc_reserves[reserve], + bch2_watermarks[watermark], ob->bucket, usage->d[BCH_DATA_free].buckets, avail, @@ -575,7 +575,7 @@ err: ""); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) trace_and_count(c, bucket_alloc_fail, ca, - bch2_alloc_reserves[reserve], + bch2_watermarks[watermark], 0, usage->d[BCH_DATA_free].buckets, avail, @@ -589,14 +589,14 @@ err: } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct closure *cl) { struct bch_dev_usage usage; struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, cl, &usage))); return ob; } @@ -629,7 +629,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, RESERVE_none); + u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -692,7 +692,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, bool *have_cache, unsigned flags, enum bch_data_type data_type, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; @@ -725,7 +725,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); percpu_ref_put(&ca->ref); @@ -766,7 +766,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags, struct closure *cl) { @@ -784,7 +784,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, reserve, cl); + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) @@ -879,7 +879,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, bool ec, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags) { int i, ret = 0; @@ -901,7 +901,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, reserve); + avail = dev_buckets_free(ca, usage, watermark); if (!avail) continue; @@ -931,7 +931,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags, struct closure *_cl) { @@ -962,7 +962,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, reserve, flags); + 
have_cache, erasure_code, watermark, flags); if (ret) return ret; @@ -971,7 +971,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, target, nr_replicas, nr_effective, have_cache, - reserve, flags, _cl); + watermark, flags, _cl); } else { retry_blocking: /* @@ -980,7 +980,7 @@ retry_blocking: */ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, - flags, wp->data_type, reserve, cl); + flags, wp->data_type, watermark, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1003,7 +1003,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags, struct closure *cl) { @@ -1013,7 +1013,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, - reserve, flags, cl); + watermark, flags, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || @@ -1026,7 +1026,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, false, nr_replicas, nr_effective, have_cache, - reserve, flags, cl); + watermark, flags, cl); return ret < 0 ? ret : 0; } @@ -1263,7 +1263,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct bch_devs_list *devs_have, unsigned nr_replicas, unsigned nr_replicas_required, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags, struct closure *cl, struct write_point **wp_ret) @@ -1296,7 +1296,7 @@ retry: ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, + &have_cache, watermark, flags, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1315,14 +1315,14 @@ retry: ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, + &have_cache, watermark, flags, cl); } else { allocate_blocking: ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, + &have_cache, watermark, flags, cl); } alloc_done: diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 8a1cf425091b..fee195f7eabf 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -14,7 +14,7 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; -extern const char * const bch2_alloc_reserves[]; +extern const char * const bch2_watermarks[]; void bch2_reset_alloc_cursors(struct bch_fs *); @@ -31,7 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum alloc_reserve, struct closure *); + enum bch_watermark, struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -152,7 +152,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, 
unsigned, unsigned *, bool *, unsigned, - enum bch_data_type, enum alloc_reserve, + enum bch_data_type, enum bch_watermark, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, @@ -160,7 +160,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, - enum alloc_reserve, + enum bch_watermark, unsigned, struct closure *, struct write_point **); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index cd0c50aae416..a01fddfba004 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -16,20 +16,18 @@ struct bucket_alloc_state { u64 skipped_nouse; }; -struct ec_bucket_buf; - -#define BCH_ALLOC_RESERVES() \ - x(btree_movinggc) \ +#define BCH_WATERMARKS() \ + x(btree_copygc) \ x(btree) \ - x(movinggc) \ - x(none) \ + x(copygc) \ + x(normal) \ x(stripe) -enum alloc_reserve { -#define x(name) RESERVE_##name, - BCH_ALLOC_RESERVES() +enum bch_watermark { +#define x(name) BCH_WATERMARK_##name, + BCH_WATERMARKS() #undef x - RESERVE_NR, + BCH_WATERMARK_NR, }; #define OPEN_BUCKETS_COUNT 1024 diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e95e48857bb8..eb3319f97296 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -247,15 +247,15 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; - enum alloc_reserve alloc_reserve; + enum bch_watermark alloc_reserve; int ret; if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_btree_movinggc; + alloc_reserve = BCH_WATERMARK_btree_copygc; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_btree; + alloc_reserve = BCH_WATERMARK_btree; } mutex_lock(&c->btree_reserve_cache_lock); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index bdf4fff9cb8a..803780d9b8b7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -150,26 +150,26 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) void bch2_dev_usage_init(struct bch_dev *); -static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { s64 reserved = 0; - switch (reserve) { - case RESERVE_NR: + switch (watermark) { + case BCH_WATERMARK_NR: unreachable(); - case RESERVE_stripe: + case BCH_WATERMARK_stripe: reserved += ca->mi.nbuckets >> 6; fallthrough; - case RESERVE_none: + case BCH_WATERMARK_normal: reserved += ca->mi.nbuckets >> 6; fallthrough; - case RESERVE_movinggc: + case BCH_WATERMARK_copygc: reserved += ca->nr_btree_reserve; fallthrough; - case RESERVE_btree: + case BCH_WATERMARK_btree: reserved += ca->nr_btree_reserve; fallthrough; - case RESERVE_btree_movinggc: + case BCH_WATERMARK_btree_copygc: break; } @@ -178,17 +178,17 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser static inline u64 dev_buckets_free(struct bch_dev *ca, struct bch_dev_usage usage, - enum alloc_reserve reserve) + enum bch_watermark watermark) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); + bch2_dev_buckets_reserved(ca, watermark)); } static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage usage, - enum alloc_reserve reserve) + 
enum bch_watermark watermark) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets @@ -196,13 +196,13 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, + usage.d[BCH_DATA_need_gc_gens].buckets + usage.d[BCH_DATA_need_discard].buckets - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); + - bch2_dev_buckets_reserved(ca, watermark)); } static inline u64 dev_buckets_available(struct bch_dev *ca, - enum alloc_reserve reserve) + enum bch_watermark watermark) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); } /* Filesystem usage: */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 9f7a30c7ad36..cc79bcb6726c 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -381,7 +381,7 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, &update->op.devs_have, update->op.nr_replicas, update->op.nr_replicas, - update->op.alloc_reserve, + update->op.watermark, 0, &cl, &wp); if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { bch2_trans_unlock(trans); @@ -459,7 +459,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_compression_opt_to_type[io_opts.background_compression ?: io_opts.compression]; if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) - m->op.alloc_reserve = RESERVE_movinggc; + m->op.watermark = BCH_WATERMARK_copygc; bkey_for_each_ptr(ptrs, ptr) percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0c5c291e844e..8d091c4a0173 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1333,7 +1333,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy, - enum alloc_reserve reserve) + enum bch_watermark watermark) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1349,7 +1349,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->target = target; h->algo = algo; h->redundancy = redundancy; - h->reserve = reserve; + h->watermark = watermark; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1384,7 +1384,7 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, - enum alloc_reserve reserve) + enum bch_watermark watermark) { struct bch_fs *c = trans->c; struct ec_stripe_head *h; @@ -1406,21 +1406,21 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, if (h->target == target && h->algo == algo && h->redundancy == redundancy && - h->reserve == reserve) { + h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); if (ret) h = ERR_PTR(ret); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, reserve); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: mutex_unlock(&c->ec_stripe_head_lock); return h; } static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, - enum alloc_reserve reserve, struct closure *cl) + enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; @@ -1453,7 +1453,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &nr_have_parity, &have_cache, 0, BCH_DATA_parity, - reserve, + watermark, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1480,7 +1480,7 
@@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &nr_have_data, &have_cache, 0, BCH_DATA_user, - reserve, + watermark, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1658,7 +1658,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; @@ -1666,7 +1666,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, bool waiting = false; int ret; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, reserve); + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); if (!h) bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) @@ -1687,7 +1687,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, RESERVE_stripe, NULL) ?: + ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h); if (!ret) goto allocate_buf; @@ -1706,8 +1706,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (reserve == RESERVE_movinggc) { - ret = new_stripe_alloc_buckets(trans, h, reserve, NULL) ?: + if (watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h); if (ret) goto err; @@ -1723,10 +1723,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); alloc_existing: /* - * Retry allocating buckets, with the reserve watermark for this + * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, reserve, cl); + ret = new_stripe_alloc_buckets(trans, h, watermark, cl); if (ret) goto err; @@ -1880,7 +1880,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) list_for_each_entry(h, &c->ec_stripe_head_list, list) { prt_printf(out, "target %u algo %u redundancy %u %s:\n", h->target, h->algo, h->redundancy, - bch2_alloc_reserves[h->reserve]); + bch2_watermarks[h->watermark]); if (h->s) prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", @@ -1898,7 +1898,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) s->idx, s->nr_data, s->nr_parity, atomic_read(&s->ref[STRIPE_REF_io]), atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_alloc_reserves[s->h->reserve]); + bch2_watermarks[s->h->watermark]); } mutex_unlock(&c->ec_stripe_new_lock); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 3995b7076427..64ca277ca1a6 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -187,7 +187,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; - enum alloc_reserve reserve; + enum bch_watermark watermark; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -211,7 +211,7 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, - enum alloc_reserve, struct closure *); + enum bch_watermark, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, 
struct stripe *, size_t); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0f5cbfa78b71..7db94a8cb7ff 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -451,7 +451,7 @@ retry: &devs_have, opts.data_replicas, opts.data_replicas, - RESERVE_none, 0, &cl, &wp); + BCH_WATERMARK_normal, 0, &cl, &wp); if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); @@ -1696,7 +1696,7 @@ again: &op->devs_have, op->nr_replicas, op->nr_replicas_required, - op->alloc_reserve, + op->watermark, op->flags, (op->flags & (BCH_WRITE_ALLOC_NOWAIT| BCH_WRITE_ONLY_SPECIFIED_DEVS)) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 87d80fb28c05..7a243a5f3f89 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -59,7 +59,7 @@ enum bch_write_flags { static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { - return op->alloc_reserve == RESERVE_movinggc + return op->watermark == BCH_WATERMARK_copygc ? op->c->copygc_wq : op->c->btree_update_wq; } @@ -89,7 +89,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_none; + op->watermark = BCH_WATERMARK_normal; op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 4149291c0df6..0fbdfbf90ad8 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -119,7 +119,7 @@ struct bch_write_op { unsigned compression_type:4; unsigned nr_replicas:4; unsigned nr_replicas_required:4; - unsigned alloc_reserve:3; + unsigned watermark:3; unsigned incompressible:1; unsigned stripe_waited:1; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 64332c78a6bb..dc34aba2d259 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -828,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6750767276f2..7ef78cccc65c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -271,7 +271,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_stripe) * + fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); fragmented = 0; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 54e1071ecfeb..364cbcd2654e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -850,8 +850,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) prt_printf(out, "reserves:"); prt_newline(out); - for (i = 0; i < RESERVE_NR; i++) { - prt_str(out, bch2_alloc_reserves[i]); + for (i = 0; i < BCH_WATERMARK_NR; i++) { + prt_str(out, bch2_watermarks[i]); prt_tab(out); prt_u64(out, bch2_dev_buckets_reserved(ca, i)); prt_tab_rjust(out); -- cgit From 8f507f89b8b87d2ee4adc990b96388001444967e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jun 2023 23:20:39 -0400 Subject: bcachefs: Fix check_pos_snapshot_overwritten() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 1 - 1 file changed, 1 
deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index b42b83c55c5b..259e5e47d2a7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1218,7 +1218,6 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - pos.snapshot == U32_MAX || !snapshot_t(trans->c, pos.snapshot)->children[0]) return 0; -- cgit From 454377d8432fdfaa5ebfca05a9bfa3af311d5b9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Jun 2023 23:22:20 -0400 Subject: bcachefs: Improve error message for overlapping extents We now print out the full previous extent we overlapping with, to aid in debugging and searching through the journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 194e8d474e86..40804012a990 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1143,6 +1143,25 @@ struct extent_end { typedef DARRAY(struct extent_end) extent_ends; +static int get_print_extent(struct btree_trans *trans, struct bpos pos, struct printbuf *out) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_extents, pos, + BTREE_ITER_SLOTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOT_EXTENTS); + ret = bkey_err(k); + if (ret) + return ret; + + bch2_bkey_val_to_text(out, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, extent_ends *extent_ends, @@ -1165,12 +1184,19 @@ static int check_overlapping_extents(struct btree_trans *trans, i->snapshot, &i->seen)) continue; - if (fsck_err_on(i->offset > bkey_start_offset(k.k), c, - "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s", - i->snapshot, - i->offset, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (i->offset <= bkey_start_offset(k.k)) + continue; + + printbuf_reset(&buf); + prt_str(&buf, "overlapping extents:\n "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + + ret = get_print_extent(trans, SPOS(k.k->p.inode, i->offset, i->snapshot), &buf); + if (ret) + break; + + if (fsck_err(c, buf.buf)) { struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; -- cgit From 75da97640a75878cd197f6dd9c50b46cac6cb9a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jun 2023 01:34:45 -0400 Subject: bcachefs: fsck needs BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE A few fsck paths weren't using BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE - oops. 
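For reference, the flag marks an update as one that may be writing to an interior snapshot node (something only fsck repair paths should be doing), so those paths have to pass it explicitly. The two call sites being fixed, sketched from the diff below:

    return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
                                &inode_p->inode.k_i,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);

    ret = bch2_trans_update_extent(trans, iter, update,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);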
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 40804012a990..3503dabe3871 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -191,17 +191,18 @@ static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - struct btree_iter iter; - int ret; + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode->bi_inum, snapshot), - BTREE_ITER_INTENT); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_inode_write(trans, &iter, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); } static int write_inode(struct btree_trans *trans, @@ -1201,7 +1202,8 @@ static int check_overlapping_extents(struct btree_trans *trans, if ((ret = PTR_ERR_OR_ZERO(update))) goto err; bkey_reassemble(update, k); - ret = bch2_trans_update_extent(trans, iter, update, 0); + ret = bch2_trans_update_extent(trans, iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); if (ret) goto err; } -- cgit From 298ac24e6346b517148a6645c7c5686565868753 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Jun 2023 22:26:04 -0400 Subject: bcachefs: Reduce stack frame size of bch2_check_alloc_info() Excessive inlining may (on some versions of gcc?) cause excessive stack usage; this turns off some inlining in bch2_check_alloc_info. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f68330b48847..4486ce0b7920 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1030,12 +1030,13 @@ again: return k; } -static int bch2_check_alloc_key(struct btree_trans *trans, - struct bkey_s_c alloc_k, - struct btree_iter *alloc_iter, - struct btree_iter *discard_iter, - struct btree_iter *freespace_iter, - struct btree_iter *bucket_gens_iter) +static noinline_for_stack +int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; @@ -1159,10 +1160,11 @@ fsck_err: return ret; } -static int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *freespace_iter) +static noinline_for_stack +int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; @@ -1214,10 +1216,11 @@ fsck_err: return ret; } -static int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *bucket_gens_iter) +static noinline_for_stack +int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bkey_s_c k; 
@@ -1279,7 +1282,7 @@ fsck_err: return ret; } -static int __bch2_check_discard_freespace_key(struct btree_trans *trans, +static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; @@ -1359,9 +1362,10 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, * valid for buckets that exist; this just checks for keys for nonexistent * buckets. */ -static int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +static noinline_for_stack +int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; -- cgit From 0fb3355d0a3b055af8735fa25b5af63f4dd9a034 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Jun 2023 18:36:24 -0400 Subject: bcachefs: Improve bch2_bkey_make_mut() bch2_bkey_make_mut() now takes the bkey_s_c by reference and points it at the new, mutable key. This helps in some fsck paths that may have multiple repair operations on the same key. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_update.h | 8 +++++--- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/subvolume.c | 10 +++++----- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4fbd2e545ac2..0c9cba56105c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1585,7 +1585,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, k, 0); + struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); ret = PTR_ERR_OR_ZERO(new); if (ret) @@ -1913,7 +1913,7 @@ static int gc_btree_gens_key(struct btree_trans *trans, percpu_up_read(&c->mark_lock); return 0; update: - u = bch2_bkey_make_mut(trans, iter, k, 0); + u = bch2_bkey_make_mut(trans, iter, &k, 0); ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e90cf292f80b..8911a7f79f1c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -241,10 +241,10 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, unsigned flags, + struct bkey_s_c *k, unsigned flags, unsigned type, unsigned min_bytes) { - struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, k, type, min_bytes); + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); int ret; if (IS_ERR(mut)) @@ -253,11 +253,13 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str ret = bch2_trans_update(trans, iter, mut, flags); if (ret) return ERR_PTR(ret); + + *k = bkey_i_to_s_c(mut); return mut; } static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, unsigned flags) + struct bkey_s_c *k, unsigned flags) { return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 0898fa49b3cd..783e9442b302 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans 
*trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, iter, k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 828644e6c714..89c7c83c50e8 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -385,7 +385,7 @@ static int check_snapshot_tree(struct btree_trans *trans, if (ret) goto err; - u = bch2_bkey_make_mut_typed(trans, iter, k, 0, snapshot_tree); + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; @@ -473,7 +473,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, return ret; if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = bch2_bkey_make_mut_typed(trans, &root_iter, root.s_c, 0, snapshot); + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u) ?: snapshot_tree_create(trans, root_id, bch2_snapshot_tree_oldest_subvol(c, root_id), @@ -487,7 +487,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, } if (s->k->p.snapshot != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, s->s_c, 0, snapshot); + u = bch2_bkey_make_mut_typed(trans, iter, &s->s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; @@ -677,7 +677,7 @@ static int check_subvol(struct btree_trans *trans, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = - bch2_bkey_make_mut_typed(trans, iter, subvol.s_c, 0, subvolume); + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -1249,7 +1249,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) return 0; - s = bch2_bkey_make_mut_typed(trans, iter, k, 0, subvolume); + s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; -- cgit From 0ce4e0e759614ec19b140e8f19a67305c01cca78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Jun 2023 23:10:21 -0400 Subject: bcachefs: Add a missing rhashtable_destroy() call Fixes https://lore.kernel.org/linux-bcachefs/784c3e6a-75bd-e6ca-535a-43b3e1daf643@kernel.dk/T/#mbf7caf005f960018eba23b58795d06c06c947411 Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 7ef78cccc65c..5ea512968f4d 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -369,6 +369,7 @@ static int bch2_copygc_thread(void *arg) } move_buckets_wait(&trans, &ctxt, &move_buckets, true); + rhashtable_destroy(&move_buckets.table); bch2_trans_exit(&trans); bch2_moving_ctxt_exit(&ctxt); -- cgit From b3591acc3bc25d120dd9cb72f462e009e046a254 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Jun 2023 23:31:49 -0400 Subject: bcachefs: unregister_shrinker() now safe on not-registered shrinker Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +-- fs/bcachefs/btree_key_cache.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 435e68888918..8b27b7e98f7d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -398,8 +398,7 @@ void bch2_fs_btree_cache_exit(struct 
bch_fs *c) struct btree *b; unsigned i, flags; - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + unregister_shrinker(&bc->shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 37beb75e2571..5f00688d46f1 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -955,8 +955,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) int cpu; #endif - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); -- cgit From 1fa3e87ac50a24a4f6a71986a4d9bc2f16d0667e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 16:20:05 -0400 Subject: bcachefs: Fix leak in backpointers fsck We were forgetting to exit a printbuf - whoops. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 2641ebef6ae4..a270ff96e9b4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -811,8 +811,10 @@ static int check_one_backpointer(struct btree_trans *trans, if (fsck_err_on(!k.k, c, "backpointer for missing extent\n %s", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) - return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } out: fsck_err: bch2_trans_iter_exit(trans, &iter); -- cgit From 06dcca5191dcae948fa8ffd9369deb832881ffcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jun 2023 16:35:49 -0400 Subject: bcachefs: fsck: Break walk_inode() up into multiple functions Some refactoring, prep work for algorithm improvements related to snapshots. we need to add a bitmap to the list of inodes for "seen this snapshot"; for this bitmap to correctly be available, we'll need to gather the list of inodes first, and later look up the inode for a given snapshot. 
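The resulting shape: walk_inode() becomes a thin wrapper that first gathers every version of an inode across snapshots, then resolves which version is visible from the snapshot of the key being checked. Condensed sketch (matching the diff below):

    static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
                                                 struct inode_walker *w,
                                                 struct bpos pos)
    {
            int ret = get_inodes_all_snapshots(trans, w, pos.inode);
            if (ret)
                    return ERR_PTR(ret);

            return lookup_inode_for_snapshot(trans->c, w, pos.snapshot);
    }

Callers now get back a pointer (NULL if no visible inode, ERR_PTR on error) instead of the old INT_MAX index sentinel, which also simplifies the checks and is prep for the per-snapshot bitmap mentioned above.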
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 103 +++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3503dabe3871..7af65030f9d8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -639,26 +639,23 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, })); } -static int __walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos) +static int get_inodes_all_snapshots(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; u32 restart_count = trans->restart_count; - unsigned i; int ret; - pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); - - if (pos.inode == w->cur_inum) - goto lookup_snapshot; + if (w->cur_inum == inum) + return 0; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->p.offset != pos.inode) + if (k.k->p.offset != inum) break; if (bkey_is_inode(k.k)) @@ -669,40 +666,62 @@ static int __walk_inode(struct btree_trans *trans, if (ret) return ret; - w->cur_inum = pos.inode; + w->cur_inum = inum; w->first_this_inode = true; if (trans_was_restarted(trans, restart_count)) return -BCH_ERR_transaction_restart_nested; -lookup_snapshot: - for (i = 0; i < w->inodes.nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) + return 0; +} + +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct bch_fs *c, + struct inode_walker *w, u32 snapshot) +{ + struct inode_walker_entry *i; + + snapshot = bch2_snapshot_equiv(c, snapshot); + + darray_for_each(w->inodes, i) + if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; - return INT_MAX; + + return NULL; found: - BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + BUG_ON(snapshot > i->snapshot); - if (pos.snapshot != w->inodes.data[i].snapshot) { - struct inode_walker_entry e = w->inodes.data[i]; + if (snapshot != i->snapshot) { + struct inode_walker_entry new = *i; + int ret; - e.snapshot = pos.snapshot; - e.count = 0; + new.snapshot = snapshot; + new.count = 0; bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - pos.inode, pos.snapshot, w->inodes.data[i].snapshot); + w->cur_inum, snapshot, i->snapshot); - while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - ret = darray_insert_item(&w->inodes, i, e); + ret = darray_insert_item(&w->inodes, i - w->inodes.data, new); if (ret) - return ret; + return ERR_PTR(ret); } return i; } +static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos) +{ + int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (ret) + return ERR_PTR(ret); + + return lookup_inode_for_snapshot(trans->c, w, pos.snapshot); +} + static int __get_visible_inodes(struct btree_trans *trans, struct inode_walker *w, struct snapshots_seen *s, @@ -1300,11 +1319,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - ret = __walk_inode(trans, inode, equiv); - if (ret < 0) + i = walk_inode(trans, inode, equiv); + ret = PTR_ERR_OR_ZERO(i); + if (ret) goto err; - if (fsck_err_on(ret == INT_MAX, c, + if (fsck_err_on(!i, c, "extent in missing inode:\n %s", (printbuf_reset(&buf), 
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1313,13 +1333,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (ret == INT_MAX) { - ret = 0; + if (!i) goto out; - } - - i = inode->inodes.data + ret; - ret = 0; if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && !S_ISLNK(i->inode.bi_mode), c, @@ -1625,7 +1640,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!iter->path->should_be_locked); - ret = __walk_inode(trans, dir, equiv); + i = walk_inode(trans, dir, equiv); + ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1633,7 +1649,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; - if (fsck_err_on(ret == INT_MAX, c, + if (fsck_err_on(!i, c, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1642,13 +1658,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (ret == INT_MAX) { - ret = 0; + if (!i) goto out; - } - - i = dir->inodes.data + ret; - ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %s:\n%s", @@ -1802,30 +1813,30 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; + struct inode_walker_entry *i; int ret; ret = check_key_has_snapshot(trans, iter, k); if (ret) return ret; - ret = __walk_inode(trans, inode, k.k->p); - if (ret < 0) + i = walk_inode(trans, inode, k.k->p); + ret = PTR_ERR_OR_ZERO(i); + if (ret) return ret; if (inode->first_this_inode) *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); inode->first_this_inode = false; - if (fsck_err_on(ret == INT_MAX, c, + if (fsck_err_on(!i, c, "xattr for missing inode %llu", k.k->p.inode)) return bch2_btree_delete_at(trans, iter, 0); - if (ret == INT_MAX) + if (!i) return 0; - ret = 0; - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- cgit From e4eb661d3a5764273cb854d7c441819943692971 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 19:10:24 -0400 Subject: bcachefs: Fix btree node write error message Error messages should include the error code, when available. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 27a2a7b31f37..19aca1c0f2a2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1777,7 +1777,7 @@ out: err: set_btree_node_noevict(b); if (!bch2_err_matches(ret, EROFS)) - bch2_fs_fatal_error(c, "fatal error writing btree node"); + bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); goto out; } -- cgit From 4e1430a728499ce8088e1bdd0dd6467ce3447ca0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 18:01:09 -0400 Subject: bcachefs: Expand BTREE_NODE_ID We now have 20 bits for the btree ID in the on disk format - sufficient for 1 million distinct btrees. 
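The on-disk flags layout can't be rearranged, so the extra bits live in a second bitfield carved out of the previously unused range, and accessor helpers stitch the two halves together (4 low bits + 16 high bits = 20 bits). Sketch from the diff below:

    LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags,  0,  4);
    LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags,  9, 25);

    static inline __u64 BTREE_NODE_ID(struct btree_node *n)
    {
            return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
    }

    static inline void SET_BTREE_NODE_ID(struct btree_node *n, u64 v)
    {
            SET_BTREE_NODE_ID_LO(n, v);
            SET_BTREE_NODE_ID_HI(n, v >> 4);
    }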
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ad87cdff8544..a73f1de8e872 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -2195,13 +2195,25 @@ struct btree_node { }; } __packed __aligned(8); -LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, struct btree_node, flags, 8, 9); -/* 9-32 unused */ +LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); +/* 25-32 unused */ LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); +static inline __u64 BTREE_NODE_ID(struct btree_node *n) +{ + return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); +} + +static inline void SET_BTREE_NODE_ID(struct btree_node *n, u64 v) +{ + SET_BTREE_NODE_ID_LO(n, v); + SET_BTREE_NODE_ID_HI(n, v >> 4); +} + struct btree_node_entry { struct bch_csum csum; @@ -2211,7 +2223,6 @@ struct btree_node_entry { __u8 pad[22]; __le16 u64s; __u64 _data[0]; - }; }; } __packed __aligned(8); -- cgit From 2766876d5d118abd59d14fbe5b31e7e208ea11f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 19:02:17 -0400 Subject: bcachefs: struct bch_extent_rebalance This adds the extent entry for extents that rebalance needs to do something with. We're adding this ahead of the main rebalance_work patchset, because adding new extent entries can't be done in a forwards-compatible way. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 19 +++++++++++++++++-- fs/bcachefs/extents.c | 6 ++++++ fs/bcachefs/extents.h | 3 +++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a73f1de8e872..158cefb87684 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -488,8 +488,9 @@ struct bch_csum { x(crc32, 1) \ x(crc64, 2) \ x(crc128, 3) \ - x(stripe_ptr, 4) -#define BCH_EXTENT_ENTRY_MAX 5 + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -624,6 +625,20 @@ struct bch_extent_reservation { #endif }; +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + unused:33, + compression:8, + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:33, + type:7; +#endif +}; + union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 unsigned long type; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7e00550980de..753a846eaf81 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1202,6 +1202,8 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, } have_ec = true; break; + case BCH_EXTENT_ENTRY_rebalance: + break; } } @@ -1260,6 +1262,8 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; } } } @@ -1310,6 +1314,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; } if (extent_entry_is_crc(entry)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 
3ba41e37d864..c573a40d366a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -318,6 +318,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).ec = _entry->stripe_ptr; \ (_ptr).has_ec = true; \ break; \ + default: \ + /* nothing */ \ + break; \ } \ out: \ _entry < (_end); \ -- cgit From 494036d862dfff1de9782492692da225479b7146 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 17:29:20 -0400 Subject: bcachefs: BCH_WATERMARK_reclaim Add another watermark for journal reclaim - this is needed for the next patches, that unify BCH_WATERMARK with JOURNAL_WATERMARK. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 ++++-- fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/buckets.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index c7db89b92dbf..0cc5e9f8d461 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -191,11 +191,13 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { + case BCH_WATERMARK_reclaim: + return 0; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: - return 0; - case BCH_WATERMARK_copygc: return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; default: return OPEN_BUCKETS_COUNT / 2; } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index a01fddfba004..ddcaf0631a8b 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -17,6 +17,7 @@ struct bucket_alloc_state { }; #define BCH_WATERMARKS() \ + x(reclaim) \ x(btree_copygc) \ x(btree) \ x(copygc) \ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 803780d9b8b7..f9d7dda07ea6 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -170,6 +170,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma reserved += ca->nr_btree_reserve; fallthrough; case BCH_WATERMARK_btree_copygc: + case BCH_WATERMARK_reclaim: break; } -- cgit From ec14fc6010fdcc40e54e289afc657a676ce93e72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 17:32:38 -0400 Subject: bcachefs: Kill JOURNAL_WATERMARK This unifies JOURNAL_WATERMARK with BCH_WATERMARK; we're working towards specifying watermarks once in the transaction commit path. 
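Concretely, the watermark now travels in the low bits of the same flags word the transaction commit and journal reservation paths already pass around, and the other flag bits are defined above it. A sketch of the packing (illustrative names; the real constants are BCH_WATERMARK_BITS and BCH_WATERMARK_MASK from the diff below):

	/* sketch: watermark lives in flag bits [0, 3), other flags start above */
	#define SKETCH_WATERMARK_BITS	3
	#define SKETCH_WATERMARK_MASK	(~(~0U << SKETCH_WATERMARK_BITS))

	static inline unsigned sketch_flags_to_watermark(unsigned flags)
	{
		return flags & SKETCH_WATERMARK_MASK;	/* yields a BCH_WATERMARK_* value */
	}

The enum order is also reversed so that a numerically higher watermark means a more reserved allocation, with BCH_WATERMARK_reclaim highest; callers then OR a BCH_WATERMARK_* value directly into their BTREE_INSERT_*/JOURNAL_RES_GET_* flags instead of carrying a separate JOURNAL_WATERMARK_* value.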
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_types.h | 13 ++++++++----- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 6 +++--- fs/bcachefs/btree_update_leaf.c | 12 ++++++------ fs/bcachefs/btree_write_buffer.c | 2 +- fs/bcachefs/journal.c | 15 +++++---------- fs/bcachefs/journal.h | 26 ++++++++++++++++---------- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_types.h | 15 +-------------- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/recovery.c | 2 +- 12 files changed, 46 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index ddcaf0631a8b..c33a29954e59 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -17,12 +17,12 @@ struct bucket_alloc_state { }; #define BCH_WATERMARKS() \ - x(reclaim) \ - x(btree_copygc) \ - x(btree) \ - x(copygc) \ + x(stripe) \ x(normal) \ - x(stripe) + x(copygc) \ + x(btree) \ + x(btree_copygc) \ + x(reclaim) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, @@ -31,6 +31,9 @@ enum bch_watermark { BCH_WATERMARK_NR, }; +#define BCH_WATERMARK_BITS 3 +#define BCH_WATERMARK_MASK ~(~0 << BCH_WATERMARK_BITS) + #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 5f00688d46f1..40847ec1e56e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -652,7 +652,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? JOURNAL_WATERMARK_reserved + ? BCH_WATERMARK_reclaim : 0)| commit_flags); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 8911a7f79f1c..8ecb87533cf3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -23,8 +23,8 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, struct bkey_i *, u64); enum btree_insert_flags { - /* First two bits for journal watermark: */ - __BTREE_INSERT_NOFAIL = 2, + /* First bits for bch_watermark: */ + __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index eb3319f97296..914cb80fc6d9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -651,7 +651,7 @@ static void btree_update_nodes_written(struct btree_update *as) BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved, + BCH_WATERMARK_reclaim, btree_update_nodes_written_trans(&trans, as)); bch2_trans_unlock(&trans); @@ -1049,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ? 
BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; - int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int journal_flags = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -2192,7 +2192,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved); + BCH_WATERMARK_reclaim); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 259e5e47d2a7..ce6ec28d8f60 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -320,7 +320,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, bch2_journal_preres_get(&trans->c->journal, &trans->journal_preres, trans->journal_preres_u64s, - (flags & JOURNAL_WATERMARK_MASK))); + (flags & BCH_WATERMARK_MASK))); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, @@ -636,7 +636,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, */ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ret = bch2_trans_journal_res_get(trans, - (flags & JOURNAL_WATERMARK_MASK)| + (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; @@ -885,7 +885,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, - (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); + (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); if (unlikely(ret)) @@ -952,14 +952,14 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, break; case -BCH_ERR_journal_res_get_blocked: if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(flags & JOURNAL_WATERMARK_reserved)) { + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } ret = drop_locks_do(trans, bch2_trans_journal_res_get(trans, - (flags & JOURNAL_WATERMARK_MASK)| + (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_CHECK)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: @@ -2048,7 +2048,7 @@ int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
int ret; va_start(args, fmt); - ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); va_end(args); return ret; } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 88c4b50dd70f..4003fa1f0319 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -232,7 +232,7 @@ slowpath: commit_flags| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved, + BCH_WATERMARK_reclaim, __bch2_btree_insert(trans, i->btree, &i->k, 0)); if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) break; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dc34aba2d259..f33ab45b7944 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -19,17 +19,12 @@ #include "journal_seq_blacklist.h" #include "trace.h" -#define x(n) #n, -static const char * const bch2_journal_watermarks[] = { - JOURNAL_WATERMARKS() - NULL -}; - static const char * const bch2_journal_errors[] = { +#define x(n) #n, JOURNAL_ERRORS() +#undef x NULL }; -#undef x static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { @@ -96,7 +91,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) if (!(error == JOURNAL_ERR_journal_full || error == JOURNAL_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || - (flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved) + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; spin_lock(&j->lock); @@ -440,7 +435,7 @@ retry: return 0; } - if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: @@ -1292,7 +1287,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); - prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 024cea9f5902..008a2e25a4fa 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -294,9 +294,14 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -/* First two bits for JOURNAL_WATERMARK: */ -#define JOURNAL_RES_GET_NONBLOCK (1 << 2) -#define JOURNAL_RES_GET_CHECK (1 << 3) +/* First bits for BCH_WATERMARK: */ +enum journal_res_flags { + __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, + __JOURNAL_RES_GET_CHECK, +}; + +#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) +#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -317,7 +322,7 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); - if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + if ((flags & BCH_WATERMARK_MASK) < j->watermark) return 0; 
new.cur_entry_offset += res->u64s; @@ -373,17 +378,17 @@ out: static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = JOURNAL_WATERMARK_any; + unsigned watermark = BCH_WATERMARK_stripe; if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); if (!s.remaining) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); if (watermark == j->watermark) return; @@ -426,13 +431,14 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, int d = new_u64s - res->u64s; union journal_preres_state old, new; u64 v = atomic64_read(&j->prereserved.counter); + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret; do { old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_WATERMARK_reserved) || + if (watermark == BCH_WATERMARK_reclaim || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8dc378674919..7d0dd1b1d5cf 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1505,7 +1505,7 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - if (j->watermark) + if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8d8c0b3d5a30..42504e16acb6 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -154,19 +154,6 @@ enum journal_flags { JOURNAL_NEED_FLUSH_WRITE, }; -#define JOURNAL_WATERMARKS() \ - x(any) \ - x(copygc) \ - x(reserved) - -enum journal_watermark { -#define x(n) JOURNAL_WATERMARK_##n, - JOURNAL_WATERMARKS() -#undef x -}; - -#define JOURNAL_WATERMARK_MASK 3 - /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ @@ -191,7 +178,7 @@ struct journal { struct { union journal_res_state reservations; - enum journal_watermark watermark; + enum bch_watermark watermark; union journal_preres_state prereserved; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 5ea512968f4d..e1ca414047b1 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -202,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct data_update_opts data_opts = { - .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, + .btree_insert_flags = BTREE_INSERT_USE_RESERVE|BCH_WATERMARK_copygc, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 09c9d4058f82..16a99edb2ea8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -664,7 +664,7 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL| (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + ? 
BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(&trans, k)); if (ret) { -- cgit From 0b9fbce235c3ae545b6f31b8f2de2de030689595 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 23:28:17 -0400 Subject: bcachefs: Fix a format string warning Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7af65030f9d8..674018a58de5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1216,7 +1216,7 @@ static int check_overlapping_extents(struct btree_trans *trans, if (ret) break; - if (fsck_err(c, buf.buf)) { + if (fsck_err(c, "%s", buf.buf)) { struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; -- cgit From 65db60490a36cbfc0500cb86bf539614c89501d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jun 2023 00:01:19 -0400 Subject: bcachefs: Fix a null ptr deref in bch2_fs_alloc() error path This fixes a null ptr deref in bch2_free_pending_node_rewrites() when the list head wasn't initialized. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 5 ++++- fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/super.c | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 914cb80fc6d9..5d3a09039fab 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2452,7 +2452,7 @@ void bch2_fs_btree_interior_update_exit(struct bch_fs *c) mempool_exit(&c->btree_interior_update_pool); } -int bch2_fs_btree_interior_update_init(struct bch_fs *c) +void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) { mutex_init(&c->btree_reserve_cache_lock); INIT_LIST_HEAD(&c->btree_interior_update_list); @@ -2462,7 +2462,10 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) INIT_LIST_HEAD(&c->pending_node_rewrites); mutex_init(&c->pending_node_rewrites_lock); +} +int bch2_fs_btree_interior_update_init(struct bch_fs *c) +{ c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!c->btree_interior_update_worker) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index dcfd7ceacc59..221b7ad5d053 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -322,6 +322,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); void bch2_fs_btree_interior_update_exit(struct bch_fs *); +void bch2_fs_btree_interior_update_init_early(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9f1cca7d6c8e..5b0c7dafae2d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -686,6 +686,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); -- cgit From f33c58fc46a9c5bd6cbf90edb6ce17fa3fd912d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 17:32:48 -0400 Subject: bcachefs: Kill BTREE_INSERT_USE_RESERVE Now that we have journal watermarks 
and alloc watermarks unified, BTREE_INSERT_USE_RESERVE is redundant and can be deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 ++-- fs/bcachefs/btree_io.c | 6 +++- fs/bcachefs/btree_key_cache.c | 1 - fs/bcachefs/btree_update.h | 29 +++++++++---------- fs/bcachefs/btree_update_interior.c | 56 ++++++++++++++++++------------------- fs/bcachefs/btree_write_buffer.c | 6 ++-- fs/bcachefs/data_update.c | 3 +- fs/bcachefs/migrate.c | 2 +- fs/bcachefs/movinggc.c | 2 +- 9 files changed, 56 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4486ce0b7920..81e80f36af43 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1719,7 +1719,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; @@ -1827,7 +1828,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 19aca1c0f2a2..990c2fa28114 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1766,7 +1766,11 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, - !wbio->wbio.failed.nr)); + BCH_WATERMARK_reclaim| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW, + !wbio->wbio.failed.nr)); if (ret) goto err; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 40847ec1e56e..1d702efaf074 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -650,7 +650,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) ? 
BCH_WATERMARK_reclaim : 0)| diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 8ecb87533cf3..f794c9d108b8 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -27,7 +27,6 @@ enum btree_insert_flags { __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, @@ -37,26 +36,23 @@ enum btree_insert_flags { }; /* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) +#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) -#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) - -/* for copygc, or when merging btree nodes */ -#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) /* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) /* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) +#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) /* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) +#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) -#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) +#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); @@ -80,9 +76,10 @@ int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, bool); -int bch2_btree_node_update_key_get_iter(struct btree_trans *, - struct btree *, struct bkey_i *, bool); + struct btree *, struct bkey_i *, + unsigned, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, + struct bkey_i *, unsigned, bool); int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 5d3a09039fab..4c8effa4b8d9 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -246,18 +246,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; - unsigned nr_reserve; - enum bch_watermark alloc_reserve; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + ? 
BTREE_NODE_RESERVE + : 0; int ret; - if (flags & BTREE_INSERT_USE_RESERVE) { - nr_reserve = 0; - alloc_reserve = BCH_WATERMARK_btree_copygc; - } else { - nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = BCH_WATERMARK_btree; - } - mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -279,7 +273,7 @@ retry: &devs_have, res->nr_replicas, c->opts.metadata_replicas_required, - alloc_reserve, 0, cl, &wp); + watermark, 0, cl, &wp); if (unlikely(ret)) return ERR_PTR(ret); @@ -647,11 +641,10 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(&trans, &as->disk_res, &journal_seq, + BCH_WATERMARK_reclaim| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RECLAIM| - BCH_WATERMARK_reclaim, + BTREE_INSERT_JOURNAL_RECLAIM, btree_update_nodes_written_trans(&trans, as)); bch2_trans_unlock(&trans); @@ -1049,14 +1042,24 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; - int journal_flags = flags & BCH_WATERMARK_MASK; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; BUG_ON(!path->should_be_locked); + if (watermark == BCH_WATERMARK_copygc) + watermark = BCH_WATERMARK_btree_copygc; + if (watermark < BCH_WATERMARK_btree) + watermark = BCH_WATERMARK_btree; + + flags &= ~BCH_WATERMARK_MASK; + flags |= watermark; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) journal_flags |= JOURNAL_RES_GET_NONBLOCK; + journal_flags |= watermark; while (1) { nr_nodes[!!update_level] += 1 + split; @@ -1845,9 +1848,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(path, b); as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - flags); + BTREE_INSERT_NOFAIL|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -2127,6 +2128,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct btree *new_hash, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) { struct bch_fs *c = trans->c; @@ -2187,12 +2189,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RECLAIM| - BCH_WATERMARK_reclaim); + ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; @@ -2226,7 +2223,7 @@ err: int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct bkey_i *new_key, - bool skip_triggers) + unsigned commit_flags, bool skip_triggers) { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; @@ -2256,8 +2253,8 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } path->intent_ref++; - ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, - new_key, skip_triggers); + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, + commit_flags, skip_triggers); --path->intent_ref; if (new_hash) { @@ -2275,7 +2272,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite int bch2_btree_node_update_key_get_iter(struct 
btree_trans *trans, struct btree *b, struct bkey_i *new_key, - bool skip_triggers) + unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; int ret; @@ -2296,7 +2293,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, + commit_flags, skip_triggers); out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4003fa1f0319..b50226313a47 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -213,6 +213,9 @@ slowpath: btree_write_buffered_journal_cmp, NULL); + commit_flags &= ~BCH_WATERMARK_MASK; + commit_flags |= BCH_WATERMARK_reclaim; + for (i = keys; i < keys + nr; i++) { if (!i->journal_seq) continue; @@ -231,8 +234,7 @@ slowpath: ret = commit_do(trans, NULL, NULL, commit_flags| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM| - BCH_WATERMARK_reclaim, + BTREE_INSERT_JOURNAL_RECLAIM, __bch2_btree_insert(trans, i->btree, &i->k, 0)); if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) break; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index cc79bcb6726c..3c918368b2ec 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -458,8 +458,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_type = bch2_compression_opt_to_type[io_opts.background_compression ?: io_opts.compression]; - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) - m->op.watermark = BCH_WATERMARK_copygc; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 783e9442b302..81c8cdbac285 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -141,7 +141,7 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e1ca414047b1..5242f20bb680 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -202,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct data_update_opts data_opts = { - .btree_insert_flags = BTREE_INSERT_USE_RESERVE|BCH_WATERMARK_copygc, + .btree_insert_flags = BCH_WATERMARK_copygc, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; -- cgit From e3804b55e4358cf5a235fa1ba32204af9f7046dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jun 2023 19:53:05 -0400 Subject: bcachefs: bch2_version_to_text() Add a new helper for printing out metadata versions in a standard format. 
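Usage is a printbuf in, formatted text out: known versions render as "number: name" and unrecognized ones as "number: (unknown version)". An illustrative call (the message text here is made up; the printbuf idiom matches its use elsewhere in this series):

	struct printbuf buf = PRINTBUF;

	bch2_version_to_text(&buf, c->sb.version);	/* e.g. "10: bkey_renumber" */
	bch_info(c, "running version %s", buf.buf);
	printbuf_exit(&buf);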
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 5 ----- fs/bcachefs/opts.h | 1 - fs/bcachefs/recovery.c | 27 ++++++++++++++++----------- fs/bcachefs/super-io.c | 19 +++++++++++++++++-- fs/bcachefs/super-io.h | 2 ++ fs/bcachefs/super.c | 3 ++- 6 files changed, 37 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 04e2989cd6b3..a05c389830dc 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -11,11 +11,6 @@ #define x(t, n) [n] = #t, -const char * const bch2_metadata_versions[] = { - BCH_METADATA_VERSIONS() - NULL -}; - const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() NULL diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 719693b333da..e7cf7e92f3db 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -8,7 +8,6 @@ #include #include "bcachefs_format.h" -extern const char * const bch2_metadata_versions[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 16a99edb2ea8..b86442c7c912 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1146,17 +1146,22 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_required_upgrade_below) { - bch_info(c, "version %s (%u) prior to %s (%u), upgrade and fsck required", - bch2_metadata_versions[c->sb.version], - c->sb.version, - bch2_metadata_versions[bcachefs_metadata_required_upgrade_below], - bcachefs_metadata_required_upgrade_below); - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } + if (!c->opts.nochanges && + c->sb.version < bcachefs_metadata_required_upgrade_below) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "version "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " prior to "); + bch2_version_to_text(&buf, bcachefs_metadata_required_upgrade_below); + prt_str(&buf, ", upgrade and fsck required"); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; } if (c->opts.fsck && c->opts.norecovery) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d23ed9ec30f1..2237b1b94bbc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -26,6 +26,21 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; +static const char * const bch2_metadata_versions[] = { +#define x(t, n) [n] = #t, + BCH_METADATA_VERSIONS() +#undef x +}; + +void bch2_version_to_text(struct printbuf *out, unsigned v) +{ + const char *str = v < ARRAY_SIZE(bch2_metadata_versions) + ? 
bch2_metadata_versions[v] + : "(unknown version)"; + + prt_printf(out, "%u: %s", v, str); +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -1510,12 +1525,12 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_str(out, "Version:"); prt_tab(out); - prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); prt_printf(out, "Oldest version on disk:"); prt_tab(out); - prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); prt_printf(out, "Created:"); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index ab0ad3248e8f..4a193add3447 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -9,6 +9,8 @@ #include +void bch2_version_to_text(struct printbuf *, unsigned); + struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5b0c7dafae2d..62da48863ffa 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -877,7 +877,8 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; - prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]); + prt_str(&p, "mounted version "); + bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { prt_str(&p, " opts="); -- cgit From a02a0121b3de81f985d6c751f1557c7aea832b9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jun 2023 20:27:07 -0400 Subject: bcachefs: bch2_version_compatible() This adds a new helper for checking if an on-disk version is compatible with the running version of bcachefs - prep work for introducing major:minor version numbers. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 - fs/bcachefs/btree_io.c | 10 ++--- fs/bcachefs/journal_io.c | 22 +++-------- fs/bcachefs/super-io.c | 86 ++++++++++++++++++++++--------------------- fs/bcachefs/super-io.h | 6 +++ 5 files changed, 60 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 158cefb87684..4401d27675ed 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1574,8 +1574,6 @@ struct bch_sb_field_journal_seq_blacklist { * One common version number for all on disk data structures - superblock, btree * nodes, journal entries */ -#define BCH_JSET_VERSION_OLD 2 -#define BCH_BSET_VERSION_OLD 3 #define BCH_METADATA_VERSIONS() \ x(bkey_renumber, 10) \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 990c2fa28114..9985ecd7265d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -699,11 +699,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf2 = PRINTBUF; int ret = 0; - btree_err_on((version != BCH_BSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, + btree_err_on(!bch2_version_compatible(version), BTREE_ERR_INCOMPATIBLE, c, ca, b, i, - "unsupported bset version"); + "unsupported bset version %u", version); if (btree_err_on(version < c->sb.version_min, BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -2019,9 +2017,7 @@ do_write: BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber - ? cpu_to_le16(BCH_BSET_VERSION_OLD) - : cpu_to_le16(c->sb.version); + i->version = cpu_to_le16(c->sb.version); SET_BSET_OFFSET(i, b->written); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7d0dd1b1d5cf..a084c6d0fe23 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -745,14 +745,10 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - c, jset, NULL, - "%s sector %llu seq %llu: unknown journal entry version %u", + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + "%s sector %llu seq %llu: incompatible journal entry version %u", ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - version)) { + sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ return -EINVAL; } @@ -796,14 +792,10 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u", ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - version)) { + sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ return -EINVAL; } @@ -1755,9 +1747,7 @@ void bch2_journal_write(struct closure *cl) } jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber - ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) - : cpu_to_le32(c->sb.version); + jset->version = cpu_to_le32(c->sb.version); SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2237b1b94bbc..55a6c64de09c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -269,40 +269,58 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, - int rw) +static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field *f; - struct bch_sb_field_members *mi; - enum bch_opt_id opt_id; - u32 version, version_min; - u16 block_size; - int ret; - - version = le16_to_cpu(sb->version); - version_min = version >= bcachefs_metadata_version_bkey_renumber - ? le16_to_cpu(sb->version_min) - : version; - - if (version >= bcachefs_metadata_version_max) { - prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", - version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + u16 version = le16_to_cpu(sb->version); + u16 version_min = le16_to_cpu(sb->version_min); + + if (!bch2_version_compatible(version)) { + prt_str(out, "Unsupported superblock version "); + bch2_version_to_text(out, version); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); return -BCH_ERR_invalid_sb_version; } - if (version_min < bcachefs_metadata_version_min) { - prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", - version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + if (!bch2_version_compatible(version_min)) { + prt_str(out, "Unsupported superblock version_min "); + bch2_version_to_text(out, version_min); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); return -BCH_ERR_invalid_sb_version; } if (version_min > version) { - prt_printf(out, "Bad minimum version %u, greater than version field %u", - version_min, version); + prt_str(out, "Bad minimum version "); + bch2_version_to_text(out, version_min); + prt_str(out, ", greater than version field "); + bch2_version_to_text(out, version); return -BCH_ERR_invalid_sb_version; } + return 0; +} + +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; + enum bch_opt_id opt_id; + u16 block_size; + int ret; + + ret = bch2_sb_compatible(sb, out); + if (ret) + return ret; + if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { prt_printf(out, "Filesystem has incompatible features"); @@ -350,7 +368,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (rw == READ) { /* * Been seeing a bug where these are getting inexplicably - * zeroed, so we'r now validating them, but we have to be + * zeroed, so we're now validating them, but we have to be * careful not to preven people's filesystems from mounting: */ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -531,7 +549,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) static int read_one_super(struct bch_sb_handle 
*sb, u64 offset, struct printbuf *err) { struct bch_csum csum; - u32 version, version_min; size_t bytes; int ret; reread: @@ -551,22 +568,9 @@ reread: return -BCH_ERR_invalid_sb_magic; } - version = le16_to_cpu(sb->sb->version); - version_min = version >= bcachefs_metadata_version_bkey_renumber - ? le16_to_cpu(sb->sb->version_min) - : version; - - if (version >= bcachefs_metadata_version_max) { - prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", - version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -BCH_ERR_invalid_sb_version; - } - - if (version_min < bcachefs_metadata_version_min) { - prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", - version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -BCH_ERR_invalid_sb_version; - } + ret = bch2_sb_compatible(sb->sb, err); + if (ret) + return ret; bytes = vstruct_bytes(sb->sb); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 4a193add3447..cda71ec845a5 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -9,6 +9,12 @@ #include +static inline bool bch2_version_compatible(u16 version) +{ + return version <= bcachefs_metadata_version_current && + version >= bcachefs_metadata_version_min; +} + void bch2_version_to_text(struct printbuf *, unsigned); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -- cgit From d14bfd1010c4ce8bede5bd98d0b332e3b34b8bd5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 30 Jun 2023 10:51:46 -0400 Subject: bcachefs: mark active journal devices on journal replicas gc A simple device evacuate, remove, add test loop with concurrent shutdowns occasionally reproduces a problem where the filesystem fails to mount. The mount failure occurs because the filesystem was uncleanly shut down, yet no member device is marked for journal data in the superblock. An fsck detects the problem, restores the mark and allows the mount to proceed without further consistency issues. The reason for the lack of journal data marks is the gc mechanism invoked via bch2_journal_flush_device_pins() runs while the journal happens to be empty. This results in garbage collection of all journal replicas entries. Once the updated replicas table is written to the superblock, the filesystem is put in a transiently unrecoverable state until further journal data is written, because journal recovery expects to find at least one marked journal device whenever the filesystem is not otherwise marked clean (i.e. as on clean unmount). To fix this problem, update the journal replicas gc algorithm to always mark currently active journal replicas entries by writing to the journal. This ensures that only entries for devices that are no longer used for journaling are garbage collected, not just those that don't happen to currently hold journal data. This preserves the journal recovery invariant above and avoids putting the fs into a transiently unrecoverable state. 
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 2c7f8aca9319..5174b9497721 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -837,8 +837,20 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - seq = 0; + /* + * Now that we've populated replicas_gc, write to the journal to mark + * active journal devices. This handles the case where the journal might + * be empty. Otherwise we could clear all journal replicas and + * temporarily put the fs into an unrecoverable state. Journal recovery + * expects to find devices marked for journal data on unclean mount. + */ + ret = bch2_journal_meta(&c->journal); + if (ret) { + mutex_unlock(&c->replicas_gc_lock); + return ret; + } + seq = 0; spin_lock(&j->lock); while (!ret) { struct bch_replicas_padded replicas; -- cgit From bc652905c60b504ded266448b2810242d24c8d88 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 30 Jun 2023 13:09:46 -0400 Subject: bcachefs: flush journal to avoid invalid dev usage entries on recovery A crash immediately after device removal can result in an unmountable filesystem due to recovery failure. The following command reliably reproduces on a multi-device fs: bcachefs device remove && xfs_io -xc shutdown The post-crash mount fails with an error similar to the following, reported by fsck: invalid journal entry dev_usage at offset 7994/8034 seq 12: bad dev, fixing This refers to a device usage entry in the journal that refers to the index of the just removed device. Recovery considers this an invalid entry and fails to proceed. Device usage entries are added to journal buffer writes via bch_journal_write() -> bch2_journal_super_entries_add_common(), which means any journal buffer write has content that refers to member devices at the time of the journal write. The device remove sequence already removes metadata references to the device being removed. It then flushes any pins that refer to the device, clears replica entries, removes the in-memory device object and lastly updates the superblock to reflect that the device is no longer present. The problem is that any journal writes that occur during this sequence will include a dev usage entry so long as the device is present. To avoid this problem, we can flush the journal once more after the device entry is removed from the in-core structures, but before the superblock is updated to fully remove the device on-disk. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 62da48863ffa..fe37a0464af5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1521,6 +1521,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_dev_free(ca); + /* + * At this point the device object has been removed in-core, but the + * on-disk journal might still refer to the device index via sb device + * usage entries. Recovery fails if it sees usage information for an + * invalid device. 
Flush journal pins to push the back of the journal + * past now invalid device index references before we update the + * superblock, but after the device object has been removed so any + * further journal writes elide usage info for the device. + */ + bch2_journal_flush_all_pins(&c->journal); + /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: -- cgit From faa6cb6c13c7223240366ebbf0217a6191fbfc32 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jun 2023 22:09:13 -0400 Subject: bcachefs: Allow for unknown btree IDs We need to allow filesystems with metadata from newer versions to be mountable and usable by older versions. This patch enables us to roll out new btrees without a new major version number; we can now handle btree roots for unknown btree types. The unknown btree roots will be retained, and fsck (including backpointers) will check them, the same as other btree types. We add a dynamic array for the extra, unknown btree roots, in addition to the fixed size btree root array, and add new helpers for looking up btree roots. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 14 ++++++++------ fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/btree_cache.c | 21 +++++++++++++-------- fs/bcachefs/btree_cache.h | 22 +++++++++++++++++++++- fs/bcachefs/btree_gc.c | 22 ++++++++++++++++++---- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_update_interior.c | 18 +++++++++--------- fs/bcachefs/move.c | 10 ++++++++-- fs/bcachefs/recovery.c | 16 ++++++++-------- fs/bcachefs/super.c | 1 + 11 files changed, 91 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a270ff96e9b4..20a4c3f071bf 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -272,6 +272,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, unsigned iter_flags) { struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct bkey_s_c k; @@ -279,7 +280,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, bp.btree_id, bp.pos, 0, - min(bp.level, c->btree_roots[bp.btree_id].level), + min(bp.level, r->level), iter_flags); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { @@ -287,8 +288,8 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; } - if (bp.level == c->btree_roots[bp.btree_id].level + 1) - k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + if (bp.level == r->level + 1) + k = bkey_i_to_s_c(&r->key); if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) return k; @@ -531,6 +532,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; @@ -539,8 +541,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, const union bch_extent_entry *entry; int ret; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - c->btree_roots[btree_id].level, 0); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) @@ -640,12 +641,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bpos bucket_start, struct bpos bucket_end) { + 
struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; struct bpos_level last_flushed = { UINT_MAX }; int ret = 0; - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b8d50fe64b3c..a8488d4e18e8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -749,7 +749,8 @@ struct bch_fs { struct bio_set btree_bio; struct workqueue_struct *io_complete_wq; - struct btree_root btree_roots[BTREE_ID_NR]; + struct btree_root btree_roots_known[BTREE_ID_NR]; + DARRAY(struct btree_root) btree_roots_extra; struct mutex btree_root_lock; struct btree_cache btree_cache; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8b27b7e98f7d..b36bed3e1348 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -25,13 +25,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; - if (!c->btree_roots[0].b) + if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - reserve += min_t(unsigned, 1, - c->btree_roots[i].b->c.level) * 8; + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + reserve += min_t(unsigned, 1, r->b->c.level) * 8; + } c->btree_cache.reserve = reserve; } @@ -409,9 +411,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) kvpfree(c->verify_ondisk, btree_bytes(c)); - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - list_add(&c->btree_roots[i].b->list, &bc->live); + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + list_add(&r->b->list, &bc->live); + } list_splice(&bc->freeable, &bc->live); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index ea375ae25a70..4c11975208b3 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -97,7 +97,27 @@ static inline unsigned btree_blocks(struct bch_fs *c) (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) -#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) +static inline unsigned btree_id_nr_alive(struct bch_fs *c) +{ + return BTREE_ID_NR + c->btree_roots_extra.nr; +} + +static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) +{ + if (likely(id < BTREE_ID_NR)) { + return &c->btree_roots_known[id]; + } else { + unsigned idx = id - BTREE_ID_NR; + + EBUG_ON(idx >= c->btree_roots_extra.nr); + return &c->btree_roots_extra.data[idx]; + } +} + +static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) +{ + return bch2_btree_id_root(c, b->c.btree_id)->b; +} void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0c9cba56105c..9018a2a15212 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -529,8 +529,13 @@ static int bch2_repair_topology(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR && !ret; i++) { - b = c->btree_roots[i].b; + for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + b = r->b; if (btree_node_fake(b)) continue; @@ -883,7 +888,7 @@ static int 
bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, return ret; mutex_lock(&c->btree_root_lock); - b = c->btree_roots[btree_id].b; + b = bch2_btree_id_root(c, btree_id)->b; if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); @@ -1006,7 +1011,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - b = c->btree_roots[btree_id].b; + b = bch2_btree_id_root(c, btree_id)->b; if (btree_node_fake(b)) return 0; @@ -1072,6 +1077,15 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ? bch2_gc_btree_init(&trans, ids[i], metadata_only) : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { + if (!bch2_btree_id_root(c, i)->alive) + continue; + + ret = initial + ? bch2_gc_btree_init(&trans, i, metadata_only) + : bch2_gc_btree(&trans, i, initial, metadata_only); + } + if (ret < 0) bch_err_fn(c, ret); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9985ecd7265d..4ece3f684ef0 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -517,7 +517,7 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "%s level %u/%u\n ", bch2_btree_ids[b->c.btree_id], b->c.level, - c->btree_roots[b->c.btree_id].level); + bch2_btree_id_root(c, b->c.btree_id)->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8335387d3397..c7b20baa1fd6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -238,7 +238,7 @@ static void bch2_btree_path_verify(struct btree_trans *trans, for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && - c->btree_roots[path->btree_id].b->c.level > i); + bch2_btree_id_root(c, path->btree_id)->b->c.level > i); break; } @@ -732,7 +732,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; + struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; enum six_lock_type lock_type; unsigned i; int ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4c8effa4b8d9..490c41e8f8fe 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1199,7 +1199,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) (b->c.level < btree_node_root(c, b)->c.level || !btree_node_dying(btree_node_root(c, b)))); - btree_node_root(c, b) = b; + bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); bch2_recalc_btree_reserve(c); @@ -2402,7 +2402,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *c) void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) { - struct btree_root *r = &c->btree_roots[entry->btree_id]; + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); mutex_lock(&c->btree_root_lock); @@ -2428,15 +2428,15 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].alive && !test_bit(i, &have)) { - journal_entry_set(end, - BCH_JSET_ENTRY_btree_root, - i, c->btree_roots[i].level, - &c->btree_roots[i].key, - c->btree_roots[i].key.k.u64s); + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, 
i); + + if (r->alive && !test_bit(i, &have)) { + journal_entry_set(end, BCH_JSET_ENTRY_btree_root, + i, r->level, &r->key, r->key.k.u64s); end = vstruct_next(end); } + } mutex_unlock(&c->btree_root_lock); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 37fb3784a2f9..05272673901d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -632,7 +632,7 @@ int bch2_move_data(struct bch_fs *c, bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { stats->btree_id = id; @@ -640,6 +640,9 @@ int bch2_move_data(struct bch_fs *c, id != BTREE_ID_reflink) continue; + if (!bch2_btree_id_root(c, id)->b) + continue; + ret = __bch2_move_data(&ctxt, id == start_btree_id ? start_pos : POS_MIN, id == end_btree_id ? end_pos : POS_MAX, @@ -861,10 +864,13 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { stats->btree_id = id; + if (!bch2_btree_id_root(c, id)->b) + continue; + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b86442c7c912..268fae9e7bf9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -702,13 +702,13 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; - if (entry->btree_id >= BTREE_ID_NR) { - bch_err(c, "filesystem has unknown btree type %u", - entry->btree_id); - return -EINVAL; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { + ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); + if (ret) + return ret; } - r = &c->btree_roots[entry->btree_id]; + r = bch2_btree_id_root(c, entry->btree_id); if (entry->u64s) { r->level = entry->level; @@ -980,8 +980,8 @@ static int read_btree_roots(struct bch_fs *c) unsigned i; int ret = 0; - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) continue; @@ -1014,7 +1014,7 @@ static int read_btree_roots(struct bch_fs *c) } for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; + struct btree_root *r = bch2_btree_id_root(c, i); if (!r->b) { r->alive = false; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fe37a0464af5..7ecbc23af1a1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -485,6 +485,7 @@ static void __bch2_fs_free(struct bch_fs *c) for_each_possible_cpu(cpu) kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); + darray_exit(&c->btree_roots_extra); free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); -- cgit From 183e9c430ea9775fdd1f7097f309ef61471562fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jul 2023 19:23:27 -0400 Subject: bcachefs: Allow for unknown key types This adds a new helper for lookups bkey_ops for a given key type, which returns a null bkey_ops for unknown key types; various bkey_ops users are tweaked as well to handle unknown key types. 
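For context, here is a minimal standalone sketch of the lookup-with-fallback pattern this commit introduces. It is illustrative only: the names bkey_ops_table, bkey_null_ops, bkey_type_ops and print_known are made-up stand-ins for the real bch2_bkey_ops[], bch2_bkey_null_ops and bch2_bkey_type_ops() added in the diff that follows.

	#include <stdio.h>

	struct bkey_ops {
		void (*val_to_text)(unsigned type);
	};

	#define KEY_TYPE_MAX 2u

	static void print_known(unsigned type)
	{
		printf("known key type %u\n", type);
	}

	/* Known key types get real callbacks; the fallback entry is all NULL: */
	static const struct bkey_ops bkey_ops_table[KEY_TYPE_MAX] = {
		{ .val_to_text = print_known },
		{ .val_to_text = print_known },
	};
	static const struct bkey_ops bkey_null_ops;

	static const struct bkey_ops *bkey_type_ops(unsigned type)
	{
		return type < KEY_TYPE_MAX ? &bkey_ops_table[type] : &bkey_null_ops;
	}

	int main(void)
	{
		for (unsigned t = 0; t < 4; t++) {
			const struct bkey_ops *ops = bkey_type_ops(t);

			/* Test each callback instead of comparing t against KEY_TYPE_MAX: */
			if (ops->val_to_text)
				ops->val_to_text(t);
			else
				printf("unknown key type %u: skipped\n", t);
		}
		return 0;
	}

The point of the empty fallback table is that call sites stay uniform: they check whether the callback they need is populated rather than special-casing unknown type numbers.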
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 38 +++++++++++++++++--------------------- fs/bcachefs/bkey_methods.h | 12 ++++++++++-- fs/bcachefs/btree_update_leaf.c | 10 ++++++---- 3 files changed, 33 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 79f3fbe925d5..59a4f4802ee9 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -118,17 +118,14 @@ const struct bkey_ops bch2_bkey_ops[] = { #undef x }; +const struct bkey_ops bch2_bkey_null_ops = { + .min_val_size = U8_MAX, +}; + int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, unsigned flags, struct printbuf *err) { - const struct bkey_ops *ops; - - if (k.k->type >= KEY_TYPE_MAX) { - prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); - return -BCH_ERR_invalid_bkey; - } - - ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); if (bkey_val_bytes(k.k) < ops->min_val_size) { prt_printf(err, "bad val size (%zu < %u)", @@ -136,6 +133,9 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (!ops->key_invalid) + return 0; + return ops->key_invalid(c, k, flags, err); } @@ -340,14 +340,10 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (k.k->type < KEY_TYPE_MAX) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); - } else { - prt_printf(out, "(invalid type %u)", k.k->type); - } + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -363,7 +359,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, void bch2_bkey_swab_val(struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); if (ops->swab) ops->swab(k); @@ -371,7 +367,7 @@ void bch2_bkey_swab_val(struct bkey_s k) bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); return ops->key_normalize ? 
ops->key_normalize(c, k) @@ -380,11 +376,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); - return bch2_bkey_maybe_mergable(l.k, r.k) && + return ops->key_merge && + bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - bch2_bkey_ops[l.k->type].key_merge && !bch2_key_merging_disabled && ops->key_merge(c, l, r); } @@ -509,7 +505,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_val(u); - ops = &bch2_bkey_ops[k->type]; + ops = bch2_bkey_type_ops(k->type); if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index a65756e306b0..32b86c74cc9a 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -11,6 +11,7 @@ struct bkey; enum btree_node_type; extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If @@ -41,6 +42,13 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; +static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) +{ + return likely(type < KEY_TYPE_MAX) + ? &bch2_bkey_ops[type] + : &bch2_bkey_null_ops; +} + #define BKEY_INVALID_FROM_JOURNAL (1 << 1) int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); @@ -75,7 +83,7 @@ static inline int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); return ops->atomic_trigger ? ops->atomic_trigger(trans, btree, level, old, new, flags) @@ -125,7 +133,7 @@ static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { - const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); return ops->trans_trigger ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ce6ec28d8f60..ad058b9252e1 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -407,6 +407,8 @@ static int run_one_mem_trigger(struct btree_trans *trans, { struct bkey_s_c old = { &i->old_k, i->old_v }; struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); int ret; verify_update_old_key(trans, i); @@ -417,8 +419,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (!btree_node_type_needs_gc(i->btree_id)) return 0; - if (bch2_bkey_ops[old.k->type].atomic_trigger == - bch2_bkey_ops[i->k->k.type].atomic_trigger && + if (old_ops->atomic_trigger == new_ops->atomic_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ret = bch2_mark_key(trans, i->btree_id, i->level, old, bkey_i_to_s_c(new), @@ -450,6 +451,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ */ struct bkey old_k = i->old_k; struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); verify_update_old_key(trans, i); @@ -459,8 +462,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - bch2_bkey_ops[old.k->type].trans_trigger == - bch2_bkey_ops[i->k->k.type].trans_trigger && + old_ops->trans_trigger == new_ops->trans_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; -- cgit From 236b68da5017b5336b332f941323a5bc450594b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jul 2023 20:11:36 -0400 Subject: bcachefs: Refactor bch_sb_field_ops handling This changes bch_sb_field_ops lookup to match how bkey_ops now works; for an unknown field type we return an empty ops struct. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 55a6c64de09c..71764e008e3e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1415,21 +1415,29 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; +static const struct bch_sb_field_ops bch2_sb_field_null_ops = { + NULL +}; + +static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) +{ + return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) + ? bch2_sb_field_ops[type] + : &bch2_sb_field_null_ops; +} + static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int ret; - if (type >= BCH_SB_FIELD_NR) - return 0; - - ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); + ret = ops->validate ? 
ops->validate(sb, f, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", - bch2_sb_fields[type], - field_err.buf); + bch2_sb_fields[type], field_err.buf); prt_newline(err); bch2_sb_field_to_text(err, sb, f); } @@ -1442,13 +1450,12 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); - const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type] : NULL; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - if (ops) + if (type < BCH_SB_FIELD_NR) prt_printf(out, "%s", bch2_sb_fields[type]); else prt_printf(out, "(unknown field %u)", type); @@ -1456,9 +1463,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, " (size %zu):", vstruct_bytes(f)); prt_newline(out); - if (ops && ops->to_text) { + if (ops->to_text) { printbuf_indent_add(out, 2); - bch2_sb_field_ops[type]->to_text(out, sb, f); + ops->to_text(out, sb, f); printbuf_indent_sub(out, 2); } } -- cgit From 73bd774d28d2b2e6a05c31bf7afb9247e02a8e49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jul 2023 22:47:42 -0400 Subject: bcachefs: Assorted sparse fixes - endianness fixes - mark some things static - fix a few __percpu annotations - fix silent enum conversions Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 10 ++++------ fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/backpointers.c | 12 ++++++------ fs/bcachefs/bcachefs_format.h | 8 ++++---- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/bset.c | 9 ++++----- fs/bcachefs/btree_cache.h | 6 +++++- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/btree_io.h | 6 +++--- fs/bcachefs/btree_iter.c | 10 +++++----- fs/bcachefs/btree_iter.h | 4 ++-- fs/bcachefs/btree_locking.c | 7 ------- fs/bcachefs/btree_locking.h | 6 +++--- fs/bcachefs/btree_types.h | 5 +++++ fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/checksum.c | 8 ++++---- fs/bcachefs/counters.c | 4 ++-- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/ec.c | 18 ++++++++++-------- fs/bcachefs/extents.c | 8 ++++---- fs/bcachefs/extents.h | 8 ++------ fs/bcachefs/fs-io.c | 6 +++--- fs/bcachefs/fsck.c | 8 ++++---- fs/bcachefs/io.c | 2 +- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/journal_sb.c | 8 ++++---- fs/bcachefs/quota.c | 8 ++++---- fs/bcachefs/recovery.c | 6 +++--- fs/bcachefs/replicas.c | 6 +++--- fs/bcachefs/subvolume.c | 6 +++--- fs/bcachefs/super.c | 6 +++--- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/tests.c | 2 +- fs/bcachefs/varint.c | 11 ++++++----- fs/bcachefs/xattr.c | 5 ++--- 38 files changed, 115 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index ce7a460fb308..b1a488860678 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -225,6 +225,7 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; @@ -237,9 +238,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), - 
&X_SEARCH(acl_to_xattr_type(type), "", 0), - 0); + &hash, inode_inum(inode), &search, 0); if (ret) { if (!bch2_err_matches(ret, ENOENT)) acl = ERR_PTR(ret); @@ -364,6 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); + struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; @@ -372,9 +372,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, - &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), - BTREE_ITER_INTENT); + &hash_info, inum, &search, BTREE_ITER_INTENT); if (ret) return bch2_err_matches(ret, ENOENT) ? 0 : ret; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 81e80f36af43..782086afde54 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -929,7 +929,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for * extents style btrees, but works on non-extents btrees: */ -struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -1000,7 +1000,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) return ca != NULL; } -struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 20a4c3f071bf..760c4cc16a50 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -590,10 +590,10 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) return div_u64(mem_bytes >> 1, btree_bytes(c)); } -int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, - struct bbpos start, struct bbpos *end) +static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, + unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) { struct btree_iter iter; struct bkey_s_c k; @@ -691,8 +691,8 @@ static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, : bucket; } -int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) +static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) { struct btree_iter alloc_iter; struct btree_iter bp_iter; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4401d27675ed..49b86bfda76b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1371,19 +1371,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type) struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[]; + __u8 devs[0]; } __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; + struct bch_replicas_entry_v0 entries[0]; } __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 
nr_required; - __u8 devs[]; + __u8 devs[0]; } __packed; #define replicas_entry_bytes(_i) \ @@ -1391,7 +1391,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[]; + struct bch_replicas_entry entries[0]; } __packed __aligned(8); /* BCH_SB_FIELD_quota: */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 59a4f4802ee9..985ea2daa886 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -480,7 +480,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, u->k.p.snapshot = write ? 0 : U32_MAX; } else { - u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); u64 max_packed = min_packed + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 4d55011551e0..bcdf28f39b9c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -599,11 +599,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, return (u16) v; } -__always_inline -static inline void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4c11975208b3..1e562b6efa62 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -45,7 +45,11 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) case KEY_TYPE_btree_ptr: return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); case KEY_TYPE_btree_ptr_v2: - return bkey_i_to_btree_ptr_v2_c(k)->v.seq; + /* + * The cast/deref is only necessary to avoid sparse endianness + * warnings: + */ + return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); default: return 0; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9018a2a15212..ac6c748e0f7c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1229,7 +1229,7 @@ static int bch2_gc_done(struct bch_fs *c, for_each_member_device(ca, c, dev) { struct bch_dev_usage *dst = ca->usage_base; struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, + bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, dev_usage_u64s()); copy_dev_field(buckets_ec, "buckets_ec"); @@ -1245,7 +1245,7 @@ static int bch2_gc_done(struct bch_fs *c, unsigned nr = fs_usage_u64s(c); struct bch_fs_usage *dst = c->usage_base; struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage_gc, nr); + bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 4ece3f684ef0..a8197c500894 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1281,7 +1281,7 @@ struct btree_node_read_all { unsigned nr; void *buf[BCH_REPLICAS_MAX]; struct bio *bio[BCH_REPLICAS_MAX]; - int err[BCH_REPLICAS_MAX]; + blk_status_t err[BCH_REPLICAS_MAX]; }; static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) @@ -2230,7 +2230,7 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -const char * const bch2_btree_write_types[] = { +static const 
char * const bch2_btree_write_types[] = { #define x(t, n) [n] = #t, BCH_BTREE_WRITE_TYPES() NULL diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index c43fb60b8c82..0cadf651e7cf 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -178,7 +178,7 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, f->field_offset[BKEY_FIELD_SNAPSHOT] = write ? 0 - : U32_MAX - max_packed; + : cpu_to_le64(U32_MAX - max_packed); } } @@ -200,7 +200,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, struct btree_node *bn) { if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && + btree_id_is_extents(btree_id) && !bpos_eq(bn->min_key, POS_MIN) && write) bn->min_key = bpos_nosnap_predecessor(bn->min_key); @@ -217,7 +217,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, bn->max_key.snapshot = U32_MAX; if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && + btree_id_is_extents(btree_id) && !bpos_eq(bn->min_key, POS_MIN) && !write) bn->min_key = bpos_nosnap_successor(bn->min_key); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c7b20baa1fd6..d16331620ab9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1438,7 +1438,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) prt_newline(out); } -noinline __cold +static noinline __cold void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, bool nosort) { @@ -1458,7 +1458,7 @@ void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) __bch2_trans_paths_to_text(out, trans, false); } -noinline __cold +static noinline __cold void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) { struct printbuf buf = PRINTBUF; @@ -1867,9 +1867,9 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) : NULL; } -struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end_pos) +static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) { struct bkey_i *k; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9ef9527dda6b..63260f68bc67 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -283,7 +283,7 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) } __always_inline -static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); @@ -294,7 +294,7 @@ static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int er } __always_inline -static inline int btree_trans_restart(struct btree_trans *trans, int err) +static int btree_trans_restart(struct btree_trans *trans, int err) { btree_trans_restart_nounlock(trans, err); return -err; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index e6fe2a987574..dfdf46eb3e6d 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -598,13 +598,6 @@ int __bch2_btree_path_relock(struct btree_trans *trans, return 0; } -__flatten -bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return 
btree_path_get_locks(trans, path, true); -} - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 3b537e451d2c..78daa494c914 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -93,7 +93,7 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, unsigned level, enum six_lock_type type) { - mark_btree_node_locked_noreset(path, level, type); + mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[level].lock_taken_time = local_clock(); #endif @@ -246,7 +246,7 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, trans_for_each_path(trans, path) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->lock, want); + six_lock_increment(&b->lock, (enum six_lock_type) want); return true; } @@ -266,7 +266,7 @@ static inline int btree_node_lock(struct btree_trans *trans, EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, type) || + btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = local_clock(); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index fc8a3326451f..937f9c2b63ed 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -681,6 +681,11 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type) return (1U << type) & BTREE_ID_IS_EXTENTS; } +static inline bool btree_id_is_extents(enum btree_id btree) +{ + return btree_node_type_is_extents((enum btree_node_type) btree); +} + #define BTREE_ID_HAS_SNAPSHOTS \ ((1U << BTREE_ID_extents)| \ (1U << BTREE_ID_inodes)| \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 490c41e8f8fe..5592feff79d1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2036,7 +2036,7 @@ out: return ret; } -void async_btree_node_rewrite_work(struct work_struct *work) +static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index ad058b9252e1..1474dca26dde 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -29,7 +29,7 @@ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a * different snapshot: */ -struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) { struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 48fdd5f96a3b..797ef5eceb3f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -374,7 +374,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct bch_replicas_entry *r, s64 sectors, unsigned journal_seq, bool gc) { - struct bch_fs_usage __percpu *fs_usage; + struct bch_fs_usage *fs_usage; int idx, ret = 0; struct printbuf buf = PRINTBUF; @@ -1143,7 +1143,7 @@ 
int bch2_mark_inode(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct bch_fs_usage __percpu *fs_usage; + struct bch_fs_usage *fs_usage; u64 journal_seq = trans->journal_res.seq; if (flags & BTREE_TRIGGER_INSERT) { @@ -1176,7 +1176,7 @@ int bch2_mark_reservation(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; - struct bch_fs_usage __percpu *fs_usage; + struct bch_fs_usage *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 843e138862f6..f2a56d786024 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -360,7 +360,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, state.type = type; bch2_checksum_init(&state); - state.seed = a.lo; + state.seed = (u64 __force) a.lo; BUG_ON(!bch2_checksum_mergeable(type)); @@ -371,7 +371,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, page_address(ZERO_PAGE(0)), b); b_len -= b; } - a.lo = bch2_checksum_final(&state); + a.lo = (__le64 __force) bch2_checksum_final(&state); a.lo ^= b.lo; a.hi ^= b.hi; return a; @@ -597,7 +597,7 @@ int bch2_disable_encryption(struct bch_fs *c) if (ret) goto out; - crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); crypt->key.key = key; SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); @@ -625,7 +625,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (ret) goto err; - key.magic = BCH_KEY_MAGIC; + key.magic = cpu_to_le64(BCH_KEY_MAGIC); get_random_bytes(&key.key, sizeof(key.key)); if (keyed) { diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index e5587bc5a2b7..442a9b806a3c 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -5,7 +5,7 @@ /* BCH_SB_FIELD_counters */ -const char * const bch2_counter_names[] = { +static const char * const bch2_counter_names[] = { #define x(t, n, ...) (#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -27,7 +27,7 @@ static int bch2_sb_counters_validate(struct bch_sb *sb, return 0; }; -void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, +static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 610dd7425fb4..ef3f1f9b7e8d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -219,7 +219,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, int ret = 0; if (d.v->d_type == DT_SUBVOL && - d.v->d_parent_subvol != dir.subvol) + le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) return 1; if (likely(d.v->d_type != DT_SUBVOL)) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8d091c4a0173..e0d49fe49310 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -385,7 +385,7 @@ static void ec_block_endio(struct bio *bio) } static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - unsigned rw, unsigned idx, struct closure *cl) + blk_opf_t opf, unsigned idx, struct closure *cl) { struct bch_stripe *v = &buf->key.v; unsigned offset = 0, bytes = buf->size << 9; @@ -394,6 +394,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ? 
BCH_DATA_user : BCH_DATA_parity; + int rw = op_is_write(opf); if (ptr_stale(ca, ptr)) { bch_err_ratelimited(c, @@ -419,7 +420,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, nr_iovecs, - rw, + opf, GFP_KERNEL, &c->ec_bioset), struct ec_bio, bio); @@ -1380,11 +1381,12 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) mutex_unlock(&h->lock); } -struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, - unsigned algo, - unsigned redundancy, - enum bch_watermark watermark) +static struct ec_stripe_head * +__bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark) { struct bch_fs *c = trans->c; struct ec_stripe_head *h; @@ -1570,7 +1572,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri } BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); /* * Free buckets we initially allocated - they might conflict with diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 753a846eaf81..4e89a8be5cb7 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -216,7 +216,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && + btree_id_is_extents(btree_id) && !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write ? bpos_nosnap_predecessor(bp.v->min_key) @@ -514,13 +514,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); + dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); break; case BCH_EXTENT_ENTRY_crc64: set_common_fields(dst->crc64, src); dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + dst->crc64.csum_lo = (u64 __force) src.csum.lo; + dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); break; case BCH_EXTENT_ENTRY_crc128: set_common_fields(dst->crc128, src); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c573a40d366a..f6411d63f298 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -154,11 +154,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) common_fields(crc->crc32), }; - *((__le32 *) &ret.csum.lo) = crc->crc32.csum; - - memcpy(&ret.csum.lo, &crc->crc32.csum, - sizeof(crc->crc32.csum)); - + *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; return ret; } case BCH_EXTENT_ENTRY_crc64: { @@ -168,7 +164,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .csum.lo = (__force __le64) crc->crc64.csum_lo, }; - *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; + *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 103f426c88e8..513ffb5d968b 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -458,7 +458,7 @@ enum bch_folio_sector_state { #undef x }; -const char * const bch2_folio_sector_states[] = { +static const char * const 
bch2_folio_sector_states[] = { #define x(n) #n, BCH_FOLIO_SECTOR_STATE() #undef x @@ -997,7 +997,7 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); - int ret; + vm_fault_t ret; if (fdm == mapping) return VM_FAULT_SIGBUS; @@ -1039,7 +1039,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch2_folio_reservation res; unsigned len; loff_t isize; - int ret; + vm_fault_t ret; bch2_folio_reservation_init(c, inode, &res); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 674018a58de5..98fde0bf6edc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1696,8 +1696,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (fsck_err_on(ret, c, - "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_child_subvol))) { + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) { ret = __remove_dirent(trans, d.k->p); goto err; } @@ -2238,7 +2238,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, * Backpointer and directory structure checks are sufficient for * directories, since they can't have hardlinks: */ - if (S_ISDIR(le16_to_cpu(u.bi_mode))) + if (S_ISDIR(u.bi_mode)) continue; if (!u.bi_nlink) @@ -2324,7 +2324,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite BUG_ON(bch2_inode_unpack(k, &u)); - if (S_ISDIR(le16_to_cpu(u.bi_mode))) + if (S_ISDIR(u.bi_mode)) return 0; if (!u.bi_nlink) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 7db94a8cb7ff..33762e4a0f05 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1878,7 +1878,7 @@ err: op->end_io(op); } -const char * const bch2_write_flags[] = { +static const char * const bch2_write_flags[] = { #define x(f) #f, BCH_WRITE_FLAGS() #undef x diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5174b9497721..72486f1f8a8e 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -361,7 +361,7 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || fn == bch2_btree_node_flush1) diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index fcefbbe7eda8..cc41bff86d6b 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -201,16 +201,16 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - j->d[dst].start = le64_to_cpu(buckets[0]); - j->d[dst].nr = le64_to_cpu(1); + j->d[dst].start = cpu_to_le64(buckets[0]); + j->d[dst].nr = cpu_to_le64(1); for (i = 1; i < nr; i++) { if (buckets[i] == buckets[i - 1] + 1) { le64_add_cpu(&j->d[dst].nr, 1); } else { dst++; - j->d[dst].start = le64_to_cpu(buckets[i]); - j->d[dst].nr = le64_to_cpu(1); + j->d[dst].start = cpu_to_le64(buckets[i]); + j->d[dst].nr = cpu_to_le64(1); } } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7e1f1828ab20..1decb7191da2 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -480,13 +480,13 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, } if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) - mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); + mq->c[Q_SPC].timer = qdq->d_spc_timer; if (qdq && qdq->d_fieldmask & 
QC_SPC_WARNS) - mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); + mq->c[Q_SPC].warns = qdq->d_spc_warns; if (qdq && qdq->d_fieldmask & QC_INO_TIMER) - mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); + mq->c[Q_INO].timer = qdq->d_ino_timer; if (qdq && qdq->d_fieldmask & QC_INO_WARNS) - mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); + mq->c[Q_INO].warns = qdq->d_ino_warns; mutex_unlock(&q->lock); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 268fae9e7bf9..9b49a6bc6702 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -308,7 +308,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) } } -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->keys->d + iter->idx; @@ -1042,7 +1042,7 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) root_snapshot.k.p.offset = U32_MAX; root_snapshot.v.flags = 0; root_snapshot.v.parent = 0; - root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); root_snapshot.v.tree = cpu_to_le32(1); SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); @@ -1468,7 +1468,7 @@ use_clean: if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || - le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; bch2_move_stats_init(&stats, "recovery"); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 76efbfce7683..d4c1d43e8c41 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -36,8 +36,8 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -void bch2_replicas_entry_v0_to_text(struct printbuf *out, - struct bch_replicas_entry_v0 *e) +static void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) { unsigned i; @@ -272,7 +272,7 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, { unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((void *) src_p, src_nr); + bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); preempt_disable(); dst = this_cpu_ptr(dst_p); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 89c7c83c50e8..341c0d1b81d3 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -825,7 +825,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) goto err; if (s.v->children[0]) { - s_t->v.root_snapshot = cpu_to_le32(s.v->children[0]); + s_t->v.root_snapshot = s.v->children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); @@ -1328,7 +1328,7 @@ static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) __bch2_subvolume_delete(trans, subvolid)); } -void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); @@ -1366,7 +1366,7 @@ struct subvolume_unlink_hook { u32 subvol; }; -int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, +static int 
bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, struct btree_trans_commit_hook *_h) { struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7ecbc23af1a1..061a1518f28c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -754,11 +754,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; /* Compat: */ - if (sb->version <= bcachefs_metadata_version_inode_v2 && + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - if (sb->version <= bcachefs_metadata_version_inode_v2 && + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); @@ -1999,7 +1999,7 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -unsigned bch2_metadata_version = bcachefs_metadata_version_current; +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); module_exit(bcachefs_exit); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 364cbcd2654e..ef02e346e334 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -202,7 +202,7 @@ read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG read_attribute(write_refs); -const char * const bch2_write_refs[] = { +static const char * const bch2_write_refs[] = { #define x(n) #n, BCH_WRITE_REFS() #undef x diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 50d69a5634bd..cef23d2ccc5f 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -444,7 +444,7 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) /* extent unit tests */ -u64 test_version; +static u64 test_version; static int insert_test_extent(struct bch_fs *c, u64 start, u64 end) diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 5143b603bf67..ef030fc02448 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -22,12 +22,13 @@ int bch2_varint_encode(u8 *out, u64 v) { unsigned bits = fls64(v|1); unsigned bytes = DIV_ROUND_UP(bits, 7); + __le64 v_le; if (likely(bytes < 9)) { v <<= bytes; v |= ~(~0 << (bytes - 1)); - v = cpu_to_le64(v); - memcpy(out, &v, bytes); + v_le = cpu_to_le64(v); + memcpy(out, &v_le, bytes); } else { *out++ = 255; bytes = 9; @@ -57,9 +58,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) return -1; if (likely(bytes < 9)) { - v = 0; - memcpy(&v, in, bytes); - v = le64_to_cpu(v); + __le64 v_le = 0; + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); v >>= bytes; } else { v = get_unaligned_le64(++in); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 05c65d94c00f..f47a085d1434 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -135,15 +135,14 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info const char *name, void *buffer, size_t size, int type) { struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_s_c k; int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), - &X_SEARCH(type, name, strlen(name)), - 0); + inode_inum(inode), &search, 0); if (ret) goto err1; -- cgit From 8726dc936fb79fda4a0280033cdd180f7f343cdd Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Thu, 6 Jul 2023 21:16:10 -0400 Subject: bcachefs: Change check for invalid key types As part of the forward compatibility patch series, we need to allow for new key types without complaining loudly when running an old version. This patch changes the flags parameter of bkey_invalid to an enum, and adds a new flag to indicate we're being called from the transaction commit path. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 +++++++++------ fs/bcachefs/alloc_background.h | 17 +++++++++++----- fs/bcachefs/backpointers.c | 3 ++- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/bkey_methods.c | 43 ++++++++++++++++++++++------------------- fs/bcachefs/bkey_methods.h | 21 ++++++++++++-------- fs/bcachefs/btree_update_leaf.c | 7 +++++-- fs/bcachefs/dirent.c | 3 ++- fs/bcachefs/dirent.h | 4 +++- fs/bcachefs/ec.c | 3 ++- fs/bcachefs/ec.h | 4 +++- fs/bcachefs/extents.c | 12 ++++++++---- fs/bcachefs/extents.h | 11 +++++++---- fs/bcachefs/inode.c | 12 ++++++++---- fs/bcachefs/inode.h | 12 ++++++++---- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/lru.c | 3 ++- fs/bcachefs/lru.h | 3 ++- fs/bcachefs/quota.c | 3 ++- fs/bcachefs/quota.h | 4 +++- fs/bcachefs/reflink.c | 9 ++++++--- fs/bcachefs/reflink.h | 8 +++++--- fs/bcachefs/subvolume.c | 6 ++++-- fs/bcachefs/subvolume.h | 6 ++++-- fs/bcachefs/xattr.c | 3 ++- fs/bcachefs/xattr.h | 3 ++- 26 files changed, 141 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 782086afde54..b07c09a77089 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -223,7 +223,8 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -238,7 +239,8 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -251,7 +253,8 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -264,7 +267,7 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int rw = flags & WRITE; @@ -282,7 +285,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, } if (rw == WRITE && - !(flags & BKEY_INVALID_FROM_JOURNAL) && + !(flags & BKEY_INVALID_JOURNAL) && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { unsigned i, bp_len = 0; @@ -605,7 +608,8 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { prt_printf(err, "bad val size (%lu != %zu)", diff --git a/fs/bcachefs/alloc_background.h 
b/fs/bcachefs/alloc_background.h index 3c4d6d40b120..d1bf45a4b404 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,6 +8,8 @@ #include "debug.h" #include "super.h" +enum bkey_invalid_flags; + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -147,10 +149,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -187,7 +193,8 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 760c4cc16a50..571a7d19bea7 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -38,7 +38,8 @@ static bool extent_matches_bp(struct bch_fs *c, } int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 3994bc83d69d..87e31aa1975c 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -8,7 +8,7 @@ #include "super.h" int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 985ea2daa886..1381166bfc55 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -123,7 +123,8 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); @@ -215,14 +216,16 @@ static unsigned 
bch2_key_types_allowed[] = { int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); return -BCH_ERR_invalid_bkey; } - if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { + if (flags & BKEY_INVALID_COMMIT && + !(bch2_key_types_allowed[type] & (1U << k.k->type))) { prt_printf(err, "invalid key type for btree %s (%s)", bch2_btree_ids[type], bch2_bkey_types[k.k->type]); return -BCH_ERR_invalid_bkey; @@ -246,24 +249,23 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, } } - if (type != BKEY_TYPE_btree && - !btree_type_has_snapshots(type) && - k.k->p.snapshot) { - prt_printf(err, "nonzero snapshot"); - return -BCH_ERR_invalid_bkey; - } + if (type != BKEY_TYPE_btree) { + if (!btree_type_has_snapshots((enum btree_id) type) && + k.k->p.snapshot) { + prt_printf(err, "nonzero snapshot"); + return -BCH_ERR_invalid_bkey; + } - if (type != BKEY_TYPE_btree && - btree_type_has_snapshots(type) && - !k.k->p.snapshot) { - prt_printf(err, "snapshot == 0"); - return -BCH_ERR_invalid_bkey; - } + if (btree_type_has_snapshots((enum btree_id) type) && + !k.k->p.snapshot) { + prt_printf(err, "snapshot == 0"); + return -BCH_ERR_invalid_bkey; + } - if (type != BKEY_TYPE_btree && - bkey_eq(k.k->p, POS_MAX)) { - prt_printf(err, "key at POS_MAX"); - return -BCH_ERR_invalid_bkey; + if (bkey_eq(k.k->p, POS_MAX)) { + prt_printf(err, "key at POS_MAX"); + return -BCH_ERR_invalid_bkey; + } } return 0; @@ -271,7 +273,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { return __bch2_bkey_invalid(c, k, type, flags, err) ?: bch2_bkey_val_invalid(c, k, flags, err); diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 32b86c74cc9a..d1ff83a73511 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -13,6 +13,12 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. 
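For illustration, a minimal standalone sketch of how these flags are meant to compose: the transaction commit path passes BKEY_INVALID_COMMIT (plus BKEY_INVALID_WRITE outside journal replay), and the unknown-key-type check only fires when that bit is set, so an older binary reading a newer filesystem stays quiet. The enum values mirror the ones added above; the allowed-types table and the check itself are simplified stand-ins for __bch2_bkey_invalid(), not the real code.

#include <stdio.h>

enum bkey_invalid_flags {		/* mirrors the enum added above */
	BKEY_INVALID_WRITE	= (1U << 0),
	BKEY_INVALID_COMMIT	= (1U << 1),
	BKEY_INVALID_JOURNAL	= (1U << 2),
};

/* Stand-in for bch2_key_types_allowed[]: pretend only types 0 and 1 are known */
static const unsigned key_types_allowed = (1U << 0) | (1U << 1);

static int bkey_invalid(unsigned key_type, enum bkey_invalid_flags flags)
{
	/*
	 * Only reject an unknown key type when we generated the key ourselves
	 * (the commit path sets BKEY_INVALID_COMMIT); on plain reads a newer
	 * version may have written types this version doesn't know about yet.
	 */
	if ((flags & BKEY_INVALID_COMMIT) &&
	    !(key_types_allowed & (1U << key_type)))
		return -1;
	return 0;
}

int main(void)
{
	unsigned commit_flags = BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;

	printf("unknown type, read path:   %d\n", bkey_invalid(7, 0));
	printf("unknown type, commit path: %d\n", bkey_invalid(7, commit_flags));
	return 0;
}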
@@ -22,7 +28,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err); + enum bkey_invalid_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -49,13 +55,12 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -#define BKEY_INVALID_FROM_JOURNAL (1 << 1) - -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, unsigned, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, unsigned, struct printbuf *); +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1474dca26dde..2b43f02fc455 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -856,10 +856,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct printbuf buf = PRINTBUF; trans_for_each_update(trans, i) { - int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf))) + i->bkey_type, invalid_flags, &buf))) return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index ef3f1f9b7e8d..065ea59ee9fa 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -85,7 +85,8 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bf9ea2e35fae..b42f4a13bc55 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,9 +4,11 @@ #include "str_hash.h" +enum bkey_invalid_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e0d49fe49310..d35a59e2d0e9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,7 +105,8 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct 
printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 64ca277ca1a6..1b1848e5fa5e 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,8 +6,10 @@ #include "buckets_types.h" #include "extents_types.h" +enum bkey_invalid_flags; + int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4e89a8be5cb7..7a3f42f3bc5b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -163,7 +163,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { prt_printf(err, "value too big (%zu > %u)", @@ -181,7 +182,8 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { prt_printf(err, "value too big (%zu > %zu)", @@ -371,7 +373,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -1103,7 +1106,8 @@ static int extent_ptr_invalid(const struct bch_fs *c, } int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index f6411d63f298..7ee8d031bb6c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,6 +8,7 @@ struct bch_fs; struct btree_trans; +enum bkey_invalid_flags; /* extent entries: */ @@ -382,11 +383,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -426,7 +429,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, 
struct bkey_s, struct bkey_s_c); @@ -685,7 +688,7 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 64e8d1f8a2fa..fa435d8655de 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -432,7 +432,8 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) } int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -446,7 +447,8 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -460,7 +462,8 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); @@ -517,7 +520,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c } int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 0c3022d3f995..8f9be5e58381 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,11 +5,15 @@ #include "bkey.h" #include "opts.h" +enum bkey_invalid_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ @@ -44,7 +48,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a084c6d0fe23..c7c2ae326ff7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -340,7 +340,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, - k, version, 
big_endian, write|BKEY_INVALID_FROM_JOURNAL); + k, version, big_endian, + write|BKEY_INVALID_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index e04c037f0c01..07d1929535b8 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -11,7 +11,8 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (!lru_pos_time(k.k->p)) { prt_printf(err, "lru entry at time=0"); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index adb98429248e..7a3be20a8523 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -43,7 +43,8 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 1decb7191da2..d90db3fb823e 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -60,7 +60,8 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { prt_printf(err, "invalid quota type (%llu >= %u)", diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index b0f7d4ee775e..2f463874a362 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,9 +5,11 @@ #include "inode.h" #include "quota_types.h" +enum bkey_invalid_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 26f0275ff0af..39f711d5069e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -26,7 +26,8 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -72,7 +73,8 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { return bch2_bkey_ptrs_invalid(c, k, flags, err); } @@ -117,7 +119,8 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index ba400188f5be..fe52538efb52 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,8 +2,10 @@ #ifndef 
_BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H +enum bkey_invalid_flags; + int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -18,7 +20,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); }) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, @@ -34,7 +36,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 341c0d1b81d3..f26397aa2b31 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -23,7 +23,8 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_gt(k.k->p, POS(0, U32_MAX)) || bkey_lt(k.k->p, POS(0, 1))) { @@ -97,7 +98,8 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 1a39f713db87..105410e080e0 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,9 +5,11 @@ #include "darray.h" #include "subvolume_types.h" +enum bkey_invalid_flags; + void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_tree_invalid, \ @@ -19,7 +21,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f47a085d1434..867cc6878248 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,7 +70,8 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h 
index b3e16729bcbb..214cbbaac304 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,8 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ -- cgit From c8b4534d820f47480e7d5efb38d13e10919ccc7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jul 2023 04:38:29 -0400 Subject: bcachefs: Delete redundant log messages Now that we have distinct error codes for different memory allocation failures, the early init log messages are no longer needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 19 +++++++++---------- fs/bcachefs/checksum.c | 3 --- fs/bcachefs/compress.c | 13 +------------ fs/bcachefs/fs-io.c | 7 +------ fs/bcachefs/journal.c | 20 +++++--------------- fs/bcachefs/recovery.c | 2 -- fs/bcachefs/super-io.c | 3 --- fs/bcachefs/super.c | 14 +------------- 8 files changed, 17 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index b36bed3e1348..f1494bd3c4ee 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -457,21 +457,17 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) unsigned i; int ret = 0; - pr_verbose_init(c->opts, ""); - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); if (ret) - goto out; + goto err; bc->table_init_done = true; bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) { - ret = -BCH_ERR_ENOMEM_fs_btree_cache_init; - goto out; - } + if (!__bch2_btree_node_mem_alloc(c)) + goto err; list_splice_init(&bc->live, &bc->freeable); @@ -481,9 +477,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + if (ret) + goto err; + + return 0; +err: + return -BCH_ERR_ENOMEM_fs_btree_cache_init; } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index f2a56d786024..20d0e8afc1ad 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -680,8 +680,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - pr_verbose_init(c->opts, ""); - c->sha256 = crypto_alloc_shash("sha256", 0, 0); ret = PTR_ERR_OR_ZERO(c->sha256); if (ret) { @@ -707,6 +705,5 @@ int bch2_fs_encryption_init(struct bch_fs *c) goto out; out: memzero_explicit(&key, sizeof(key)); - pr_verbose_init(c->opts, "ret %i", ret); return ret; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 38a3475b1897..48427a270840 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -542,7 +542,7 @@ void bch2_fs_compress_exit(struct bch_fs *c) mempool_exit(&c->compression_bounce[READ]); } -static int _bch2_fs_compress_init(struct bch_fs *c, u64 features) +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; @@ -612,17 +612,6 @@ static int _bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; } -static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -{ - int ret; - - 
pr_verbose_init(c->opts, ""); - ret = _bch2_fs_compress_init(c, features); - pr_verbose_init(c->opts, "ret %i", ret); - - return ret; -} - int bch2_fs_compress_init(struct bch_fs *c) { u64 f = c->sb.features; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 513ffb5d968b..cb654cfecfb9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3915,10 +3915,6 @@ void bch2_fs_fsio_exit(struct bch_fs *c) int bch2_fs_fsio_init(struct bch_fs *c) { - int ret = 0; - - pr_verbose_init(c->opts, ""); - if (bioset_init(&c->writepage_bioset, 4, offsetof(struct bch_writepage_io, op.wbio.bio), BIOSET_NEED_BVECS)) @@ -3938,8 +3934,7 @@ int bch2_fs_fsio_init(struct bch_fs *c) 1, offsetof(struct nocow_flush, bio), 0)) return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + return 0; } #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f33ab45b7944..9c4f61ab33e4 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1220,12 +1220,8 @@ void bch2_fs_journal_exit(struct journal *j) int bch2_fs_journal_init(struct journal *j) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); static struct lock_class_key res_key; unsigned i; - int ret = 0; - - pr_verbose_init(c->opts, ""); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); @@ -1242,24 +1238,18 @@ int bch2_fs_journal_init(struct journal *j) ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { - ret = -BCH_ERR_ENOMEM_journal_pin_fifo; - goto out; - } + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) + return -BCH_ERR_ENOMEM_journal_pin_fifo; for (i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) { - ret = -BCH_ERR_ENOMEM_journal_buf; - goto out; - } + if (!j->buf[i].data) + return -BCH_ERR_ENOMEM_journal_buf; } j->pin.front = j->pin.back = 1; -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + return 0; } /* debug: */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9b49a6bc6702..9ea85b097e8d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1504,8 +1504,6 @@ out: if (ret) bch_err_fn(c, ret); - else - bch_verbose(c, "ret %s", bch2_err_str(ret)); return ret; err: fsck_err: diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 71764e008e3e..7f3e358d6203 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -615,8 +615,6 @@ int bch2_read_super(const char *path, struct bch_opts *opts, __le64 *i; int ret; - pr_verbose_init(*opts, ""); - memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; @@ -726,7 +724,6 @@ got_super: goto err_no_print; } out: - pr_verbose_init(*opts, "ret %i", ret); printbuf_exit(&err); return ret; err: diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 061a1518f28c..06d461423da5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -651,8 +651,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - pr_verbose_init(opts, ""); - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); @@ -863,7 +861,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; out: - pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err: 
bch2_fs_free(c); @@ -1179,8 +1176,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) struct bch_dev *ca = NULL; int ret = 0; - pr_verbose_init(c->opts, ""); - if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1191,14 +1186,11 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); -out: - pr_verbose_init(c->opts, "ret %i", ret); return ret; err: if (ca) bch2_dev_free(ca); - ret = -BCH_ERR_ENOMEM_dev_alloc; - goto out; + return -BCH_ERR_ENOMEM_dev_alloc; } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1874,8 +1866,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - pr_verbose_init(opts, ""); - if (!nr_devices) { ret = -EINVAL; goto err; @@ -1947,8 +1937,6 @@ out: kfree(sb); printbuf_exit(&errbuf); module_put(THIS_MODULE); - pr_verbose_init(opts, "ret %s (%i)", bch2_err_str(PTR_ERR_OR_ZERO(c)), - PTR_ERR_OR_ZERO(c)); return c; err_print: pr_err("bch_fs_open err opening %s: %s", -- cgit From 7c50140fce00120b1dcf674759393267689ca2d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jul 2023 17:09:26 -0400 Subject: bcachefs: Convert more -EROFS to private error codes Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 6 +++--- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/journal.c | 2 +- fs/bcachefs/super.c | 11 ++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d35a59e2d0e9..efbb7cf7a5d0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1024,7 +1024,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, int ret; if (!bch2_dev_get_ioref(ca, WRITE)) { - s->err = -EROFS; + s->err = -BCH_ERR_erofs_no_writes; return; } @@ -1401,7 +1401,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, return ERR_PTR(ret); if (test_bit(BCH_FS_GOING_RO, &c->flags)) { - h = ERR_PTR(-EROFS); + h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto found; } @@ -1774,7 +1774,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -EROFS; + h->s->err = -BCH_ERR_erofs_no_writes; ec_stripe_set_pending(c, h); unlock: mutex_unlock(&h->lock); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 1e06d95f3484..d5277ec7372f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -175,6 +175,9 @@ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ x(EROFS, erofs_sb_err) \ + x(EROFS, erofs_unfixed_errors) \ + x(EROFS, erofs_norecovery) \ + x(EROFS, erofs_nochanges) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9c4f61ab33e4..80a612c0577f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -494,7 +494,7 @@ unlock: } return ret == JOURNAL_ERR_insufficient_devices - ? -EROFS + ? 
-BCH_ERR_erofs_journal_err : -BCH_ERR_journal_res_get_blocked; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 06d461423da5..6ab98c2299dd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -361,20 +361,21 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); - return -EROFS; + return -BCH_ERR_erofs_unfixed_errors; } if (test_bit(BCH_FS_RW, &c->flags)) return 0; + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + /* * nochanges is used for fsck -n mode - we have to allow going rw * during recovery for that to work: */ - if (c->opts.norecovery || - (c->opts.nochanges && - (!early || c->opts.read_only))) - return -EROFS; + if (c->opts.nochanges && (!early || c->opts.read_only)) + return -BCH_ERR_erofs_nochanges; bch_info(c, "going read-write"); -- cgit From 24964e1c5c89e00ca55909ab24ce8d28e2b46406 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Jun 2023 19:59:56 -0400 Subject: bcachefs: BCH_SB_VERSION_UPGRADE_COMPLETE() Version upgrades are not atomic operations: when we do a version upgrade we need to update the superblock before we start using new features, and then when the upgrade completes we need to update the superblock again. This adds a new superblock field so we can detect and handle incomplete version upgrades. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +-- fs/bcachefs/bcachefs.h | 7 ++++++ fs/bcachefs/bcachefs_format.h | 5 ++++ fs/bcachefs/recovery.c | 54 ++++++++++++++++++++++++------------------ fs/bcachefs/super-io.c | 18 ++++++++++++++ 5 files changed, 62 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b07c09a77089..0fc810ca5b6b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1232,8 +1232,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - if (c->sb.version < bcachefs_metadata_version_bucket_gens && - !c->opts.version_upgrade) + if (c->sb.version < bcachefs_metadata_version_bucket_gens) return 0; bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a8488d4e18e8..d7f030aa3039 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -712,6 +712,7 @@ struct bch_fs { u16 version; u16 version_min; + u16 version_upgrade_complete; u8 nr_devices; u8 clean; @@ -1134,6 +1135,12 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +static inline bool bch2_version_upgrading_to(const struct bch_fs *c, unsigned new_version) +{ + return c->sb.version_upgrade_complete < new_version && + c->sb.version >= new_version; +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 49b86bfda76b..c397a3b96bd1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1748,6 +1748,11 @@ LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); +/* flags[4] 56-64 unused: */ + +LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, + struct bch_sb, flags[5], 0, 16); 
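As a rough model of the accessor pair an LE64_BITMASK() line like the one above generates - a getter and a SET_ variant over bits [0,16) of flags[5] - here is a standalone sketch. The real macro operates on __le64 superblock fields with the appropriate byte swapping, which is elided here; only the bit-slicing is shown.

#include <stdint.h>
#include <stdio.h>

/* Extract bits [offset, end) of a 64-bit word: */
static uint64_t get_bits(uint64_t w, unsigned offset, unsigned end)
{
	return (w << (64 - end)) >> (64 - end + offset);
}

/* Replace bits [offset, end) of a 64-bit word with v: */
static uint64_t set_bits(uint64_t w, unsigned offset, unsigned end, uint64_t v)
{
	uint64_t mask = (~0ULL >> (64 - end)) & (~0ULL << offset);

	return (w & ~mask) | ((v << offset) & mask);
}

int main(void)
{
	uint64_t flags5 = 0;

	/* SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, v) ~ writes bits 0..16 of flags[5]: */
	flags5 = set_bits(flags5, 0, 16, 24 /* e.g. an on-disk version number */);
	printf("version_upgrade_complete = %llu\n",
	       (unsigned long long)get_bits(flags5, 0, 16));
	return 0;
}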
+ /* * Features: * diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9ea85b097e8d..0173707cfd2e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1107,6 +1107,31 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } +static void check_version_upgrade(struct bch_fs *c) +{ + unsigned version = c->sb.version_upgrade_complete ?: c->sb.version; + + if (version < bcachefs_metadata_required_upgrade_below) { + struct printbuf buf = PRINTBUF; + + if (version != c->sb.version) + prt_str(&buf, "version upgrade incomplete:\n"); + + prt_str(&buf, "version "); + bch2_version_to_text(&buf, version); + prt_str(&buf, " prior to "); + bch2_version_to_text(&buf, bcachefs_metadata_required_upgrade_below); + prt_str(&buf, ", upgrade and fsck required"); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } +} + int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -1146,23 +1171,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->opts.nochanges && - c->sb.version < bcachefs_metadata_required_upgrade_below) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "version "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, " prior to "); - bch2_version_to_text(&buf, bcachefs_metadata_required_upgrade_below); - prt_str(&buf, ", upgrade and fsck required"); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } + if (!c->opts.nochanges) + check_version_upgrade(c); if (c->opts.fsck && c->opts.norecovery) { bch_err(c, "cannot select both norecovery and fsck"); @@ -1406,8 +1416,7 @@ use_clean: if (ret) goto err; - if (c->sb.version < bcachefs_metadata_version_bucket_gens && - c->opts.version_upgrade) { + if (bch2_version_upgrading_to(c, bcachefs_metadata_version_bucket_gens)) { bch_info(c, "initializing bucket_gens"); ret = bch2_bucket_gens_init(c); if (ret) @@ -1415,7 +1424,7 @@ use_clean: bch_verbose(c, "bucket_gens init done"); } - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + if (bch2_version_upgrading_to(c, bcachefs_metadata_version_snapshot_2)) { ret = bch2_fs_upgrade_for_subvolumes(c); if (ret) goto err; @@ -1443,9 +1452,8 @@ use_clean: } mutex_lock(&c->sb_lock); - if (c->opts.version_upgrade) { - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); write_sb = true; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7f3e358d6203..71a1e2d76a15 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -449,6 +449,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src) ?: c->sb.version; c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -1192,7 +1193,19 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + 
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + + if (c->opts.version_upgrade || + c->sb.version > bcachefs_metadata_version_current) + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + + if (c->opts.version_upgrade) + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1536,6 +1549,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); + prt_str(out, "Version upgrade complete:"); + prt_tab(out); + bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); + prt_newline(out); + prt_printf(out, "Oldest version on disk:"); prt_tab(out); bch2_version_to_text(out, le16_to_cpu(sb->version_min)); -- cgit From 3045bb958acf5d3bc799c791d14e40676477bd16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 23:34:02 -0400 Subject: bcachefs: version_upgrade is now an enum The version_upgrade parameter is now an enum, not a bool, and it's persistent in the superblock: - compatible (default): upgrade to the latest compatible version - incompatible: upgrade to latest incompatible version - none Currently all upgrades are incompatible upgrades, but the next release will introduce major:minor versions. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 12 ++++++++++++ fs/bcachefs/opts.c | 5 +++++ fs/bcachefs/opts.h | 5 +++-- fs/bcachefs/recovery.c | 19 +++++++++++-------- fs/bcachefs/super-io.c | 6 +++--- 6 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d7f030aa3039..1ab32b61f060 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -573,6 +573,7 @@ enum { BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, + BCH_FS_VERSION_UPGRADE, BCH_FS_HAVE_DELETED_SNAPSHOTS, /* errors: */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c397a3b96bd1..8a0f90a83da9 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1747,6 +1747,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); +LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); /* flags[4] 56-64 unused: */ @@ -1819,6 +1820,17 @@ enum bch_sb_compat { /* options: */ +#define BCH_VERSION_UPGRADE_OPTS() \ + x(compatible, 0) \ + x(incompatible, 1) \ + x(none, 2) + +enum bch_version_upgrade_opts { +#define x(t, n) BCH_VERSION_UPGRADE_##t = n, + BCH_VERSION_UPGRADE_OPTS() +#undef x +}; + #define BCH_REPLICAS_MAX 4U #define BCH_BKEY_PTRS_MAX 16U diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index a05c389830dc..0c0c83fa4264 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -16,6 +16,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_version_upgrade_opts[] = { + BCH_VERSION_UPGRADE_OPTS() + NULL +}; + const char * const bch2_sb_features[] = { BCH_SB_FEATURES() NULL diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e7cf7e92f3db..e105a742fd44 
100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -9,6 +9,7 @@ #include "bcachefs_format.h" extern const char * const bch2_error_actions[]; +extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; @@ -388,8 +389,8 @@ enum opt_type { NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_version_upgrade_opts), \ + BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ x(buckets_nouse, u8, \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0173707cfd2e..c90205aa223e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1111,11 +1111,16 @@ static void check_version_upgrade(struct bch_fs *c) { unsigned version = c->sb.version_upgrade_complete ?: c->sb.version; - if (version < bcachefs_metadata_required_upgrade_below) { + if (version < bcachefs_metadata_required_upgrade_below || + (version < bcachefs_metadata_version_current && + c->opts.version_upgrade != BCH_VERSION_UPGRADE_none)) { struct printbuf buf = PRINTBUF; - if (version != c->sb.version) - prt_str(&buf, "version upgrade incomplete:\n"); + if (version != c->sb.version) { + prt_str(&buf, "version upgrade to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " incomplete:\n"); + } prt_str(&buf, "version "); bch2_version_to_text(&buf, version); @@ -1126,9 +1131,9 @@ static void check_version_upgrade(struct bch_fs *c) bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; + set_bit(BCH_FS_VERSION_UPGRADE, &c->flags); } } @@ -1534,11 +1539,9 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - if (c->sb.version < bcachefs_metadata_version_inode_v3) - c->opts.version_upgrade = true; - - if (c->opts.version_upgrade) { + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 71a1e2d76a15..1437c363a4ab 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -816,7 +816,7 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); - if (c->opts.version_upgrade) { + if (test_bit(BCH_FS_VERSION_UPGRADE, &c->flags)) { c->disk_sb.sb->magic = BCHFS_MAGIC; c->disk_sb.sb->layout.magic = BCHFS_MAGIC; } @@ -1197,11 +1197,11 @@ int bch2_fs_mark_dirty(struct bch_fs *c) if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->opts.version_upgrade || + if (test_bit(BCH_FS_VERSION_UPGRADE, &c->flags) || c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->opts.version_upgrade) + if (test_bit(BCH_FS_VERSION_UPGRADE, &c->flags)) c->disk_sb.sb->features[0] |= 
cpu_to_le64(BCH_SB_FEATURES_ALL); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); -- cgit From 9a644843c4701f86594fe0386f00af6b847f747d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jul 2023 22:21:45 -0400 Subject: bcachefs: Fix error path in bch2_journal_flush_device_pins() We need to always call bch2_replicas_gc_end() after we've called bch2_replicas_gc_start(), else we leave state around that needs to be cleaned up. Partial fix for: https://github.com/koverstreet/bcachefs/issues/560 Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 8 +++----- fs/bcachefs/replicas.c | 3 +++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 72486f1f8a8e..8de83e103751 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -845,10 +845,8 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) * expects to find devices marked for journal data on unclean mount. */ ret = bch2_journal_meta(&c->journal); - if (ret) { - mutex_unlock(&c->replicas_gc_lock); - return ret; - } + if (ret) + goto err; seq = 0; spin_lock(&j->lock); @@ -867,7 +865,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) spin_lock(&j->lock); } spin_unlock(&j->lock); - +err: ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index d4c1d43e8c41..5b591c59bc3e 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -462,6 +462,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) { lockdep_assert_held(&c->replicas_gc_lock); + if (ret) + goto err; + mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); -- cgit From 10a6ced2da8e4b53a14ab172b0ce70b83543bbfb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jul 2023 22:27:03 -0400 Subject: bcachefs: Kill bch2_bucket_gens_read() This folds bch2_bucket_gens_read() into bch2_alloc_read(), doing the version check there. This is prep work for enumarating all recovery passes: we need some cleanup first to make calling all the recovery passes consistent. 
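In outline, the change below keys the alloc read path off how far the version upgrade has progressed, roughly as in this standalone sketch; the version constant, struct and helper names here are placeholders rather than the real bcachefs ones.

#include <stdio.h>

enum { METADATA_VERSION_BUCKET_GENS = 25 };	/* placeholder value */

struct fs { unsigned version_upgrade_complete; };

static int read_gens_from_bucket_gens_btree(struct fs *c) { return 0; }
static int read_gens_from_alloc_keys(struct fs *c)        { return 0; }

/*
 * One entry point: the on-disk format check lives inside the read pass
 * itself, so a future recovery-pass table can simply call alloc_read(c).
 */
static int alloc_read(struct fs *c)
{
	return c->version_upgrade_complete >= METADATA_VERSION_BUCKET_GENS
		? read_gens_from_bucket_gens_btree(c)
		: read_gens_from_alloc_keys(c);
}

int main(void)
{
	struct fs old_fs = { .version_upgrade_complete = 10 };
	struct fs new_fs = { .version_upgrade_complete = 30 };

	printf("%d %d\n", alloc_read(&old_fs), alloc_read(&new_fs));
	return 0;
}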
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 100 ++++++++++++++++++----------------------- fs/bcachefs/alloc_background.h | 1 - fs/bcachefs/recovery.c | 6 +-- 3 files changed, 45 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0fc810ca5b6b..9b444bb8683c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -551,40 +551,6 @@ err: return ERR_PTR(ret); } -int bch2_alloc_read(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; - struct bch_dev *ca; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_bucket_exists(c, k.k->p)) - continue; - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - - *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - if (ret) - bch_err_fn(c, ret); - - return ret; -} - static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -692,45 +658,67 @@ int bch2_bucket_gens_init(struct bch_fs *c) return ret; } -int bch2_bucket_gens_read(struct bch_fs *c) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - const struct bch_bucket_gens *g; struct bch_dev *ca; - u64 b; int ret; + down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { + const struct bch_bucket_gens *g; + u64 b; - if (k.k->type != KEY_TYPE_bucket_gens) - continue; + for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - g = bkey_s_c_to_bucket_gens(k).v; + if (k.k->type != KEY_TYPE_bucket_gens) + continue; - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_exists2(c, k.k->p.inode)) - continue; + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(&trans, &iter); + } else { + struct bch_alloc_v4 a; - for (b = max_t(u64, ca->mi.first_bucket, start); - b < min_t(u64, ca->mi.nbuckets, end); - b++) - *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + 
continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; + } + bch2_trans_iter_exit(&trans, &iter); } - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + up_read(&c->gc_lock); if (ret) bch_err_fn(c, ret); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d1bf45a4b404..c0914feb54b5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -212,7 +212,6 @@ static inline bool bkey_is_alloc(const struct bkey *k) } int bch2_alloc_read(struct bch_fs *); -int bch2_bucket_gens_read(struct bch_fs *); int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c90205aa223e..4c61a28e492c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1309,11 +1309,7 @@ use_clean: goto err; bch_verbose(c, "starting alloc read"); - down_read(&c->gc_lock); - ret = c->sb.version < bcachefs_metadata_version_bucket_gens - ? bch2_alloc_read(c) - : bch2_bucket_gens_read(c); - up_read(&c->gc_lock); + ret = bch2_alloc_read(c); if (ret) goto err; bch_verbose(c, "alloc read done"); -- cgit From 78328fec704e316b36142a9a13af8665cd46da47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jul 2023 22:33:29 -0400 Subject: bcachefs: Stash journal replay params in bch_fs For the upcoming enumeration of recovery passes, we need all recovery passes to be called the same way - including journal replay. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/recovery.c | 11 ++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1ab32b61f060..67ed55761aec 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -993,6 +993,9 @@ struct bch_fs { /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; + /* RECOVERY */ + u64 journal_replay_seq_start; + u64 journal_replay_seq_end; /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4c61a28e492c..1499efc9d2a0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -624,11 +624,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) +static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; + u64 start_seq = c->journal_replay_seq_start; + u64 end_seq = c->journal_replay_seq_start; size_t i; int ret; @@ -1256,6 +1258,9 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1;; + if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); @@ -1346,7 +1351,7 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); - ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); + ret = bch2_journal_replay(c); if (ret) goto err; if (c->opts.verbose || !c->sb.clean) @@ -1406,7 +1411,7 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - 
ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); + ret = bch2_journal_replay(c); if (ret) goto err; if (c->opts.verbose || !c->sb.clean) -- cgit From 067d228bb0c40542620398ef1d79f00f47c05cbb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jul 2023 02:42:28 -0400 Subject: bcachefs: Enumerate recovery passes Recovery and fsck have many different passes/jobs to do, which always run in the same order - but not all of them run all the time. Some are for fsck, some for unclean shutdown, some for version upgrades. This adds some new structure: a defined list of recovery passes that we can run in a loop, as well as consolidating the log messages. The main benefit is consolidating the "should run this recovery pass" logic, as well as cleaning up the "this recovery pass has finished" state; instead of having a bunch of ad-hoc state bits in c->flags, we've now got c->curr_recovery_pass. By consolidating the "should run this recovery pass" logic, in the future on disk format upgrades will be able to say "upgrading to this version requires x passes to run", instead of forcing all of fsck to run. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 +-- fs/bcachefs/alloc_foreground.c | 9 +- fs/bcachefs/backpointers.c | 6 +- fs/bcachefs/bcachefs.h | 49 ++++++++- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_gc.c | 4 +- fs/bcachefs/fsck.c | 77 ++------------ fs/bcachefs/fsck.h | 10 +- fs/bcachefs/recovery.c | 227 ++++++++++++++++------------------------- fs/bcachefs/subvolume.c | 10 +- fs/bcachefs/subvolume.h | 8 +- 11 files changed, 175 insertions(+), 239 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9b444bb8683c..1f6a518cbe36 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -286,7 +286,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (rw == WRITE && !(flags & BKEY_INVALID_JOURNAL) && - test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { unsigned i, bp_len = 0; for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) @@ -336,7 +336,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, } if (!a.v->io_time[READ] && - test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { prt_printf(err, "cached bucket with read_time == 0"); return -BCH_ERR_invalid_bkey; } @@ -777,7 +777,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, return ret; if (ca->mi.freespace_initialized && - test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && bch2_trans_inconsistent_on(old.k->type != old_type, trans, "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", @@ -1663,7 +1663,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { bch2_trans_inconsistent(trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" "%s", @@ -1676,7 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (a->v.data_type != BCH_DATA_need_discard) { - if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { bch2_trans_inconsistent(trans, 
"bucket incorrectly set in need_discard btree\n" "%s", @@ -1844,7 +1844,7 @@ err: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); bch_err(c, "%s", buf.buf); - if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { bch2_inconsistent_error(c); ret = -EINVAL; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0cc5e9f8d461..06bfcc5a498a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -324,7 +324,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc a = bch2_alloc_to_v4(k, &a_convert); if (a->data_type != BCH_DATA_free) { - if (!test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { ob = NULL; goto err; } @@ -340,7 +340,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } if (genbits != (alloc_freespace_genbits(*a) >> 56) && - test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" " freespace key ", genbits, alloc_freespace_genbits(*a) >> 56); @@ -350,10 +350,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; - } - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; @@ -556,7 +555,7 @@ alloc: if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 571a7d19bea7..d412bae553a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -104,7 +104,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { prt_printf(&buf, "backpointer not found when deleting"); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -125,7 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); - if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { bch2_inconsistent_error(c); return -EIO; } else { @@ -258,7 +258,7 @@ static void backpointer_not_found(struct btree_trans *trans, bch2_backpointer_to_text(&buf, &bp); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); else bch2_trans_inconsistent(trans, "%s", buf.buf); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 67ed55761aec..cfd4a7b9e894 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -564,11 +564,6 @@ enum { /* fsck passes: */ BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck 
passes */ - BCH_FS_CHECK_ALLOC_DONE, - BCH_FS_CHECK_LRUS_DONE, - BCH_FS_CHECK_BACKPOINTERS_DONE, - BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, BCH_FS_FSCK_DONE, BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, @@ -662,6 +657,48 @@ enum bch_write_ref { BCH_WRITE_REF_NR, }; +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) +#define PASS_UPGRADE(v) ((v) << 4) + +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, PASS_UPGRADE(bcachefs_metadata_version_snapshot_2)) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_allocations, PASS_FSCK) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ + x(check_lrus, PASS_FSCK) \ + x(check_btree_backpointers, PASS_FSCK) \ + x(check_backpointers_to_extents,PASS_FSCK) \ + x(check_extents_to_backpointers,PASS_FSCK) \ + x(check_alloc_to_lru_refs, PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, PASS_UPGRADE(bcachefs_metadata_version_bucket_gens)) \ + x(fs_upgrade_for_subvolumes, PASS_UPGRADE(bcachefs_metadata_version_snapshot_2)) \ + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ + x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN|PASS_SILENT) \ + x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(check_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(fix_reflink_p, PASS_UPGRADE(bcachefs_metadata_version_reflink_p_fix)) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + struct bch_fs { struct closure cl; @@ -996,6 +1033,8 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; + enum bch_recovery_pass curr_recovery_pass; + /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f1494bd3c4ee..346bfaf99460 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -776,7 +776,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ac6c748e0f7c..6000b09dec26 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1807,7 +1807,7 @@ again: if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations && c->opts.fix_errors != FSCK_OPT_NO)) { bch_info(c, "Starting topology repair pass"); ret = bch2_repair_topology(c); @@ -1822,7 +1822,7 @@ again: if (ret == -BCH_ERR_need_topology_repair && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) { set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); ret = 0; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 98fde0bf6edc..ddc2782fc5b1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c 
@@ -350,7 +350,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, } /* - * The check_dirents pass has already run, dangling dirents + * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ return __lookup_inode(trans, inum, lostfound, &snapshot); @@ -1008,8 +1008,9 @@ fsck_err: } noinline_for_stack -static int check_inodes(struct bch_fs *c, bool full) +int bch2_check_inodes(struct bch_fs *c) { + bool full = c->opts.fsck; struct btree_trans trans; struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; @@ -1404,8 +1405,7 @@ fsck_err: * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent */ -noinline_for_stack -static int check_extents(struct bch_fs *c) +int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; @@ -1419,8 +1419,6 @@ static int check_extents(struct bch_fs *c) snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch_verbose(c, "checking extents"); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, @@ -1772,8 +1770,7 @@ fsck_err: * Walk dirents: verify that they all have a corresponding S_ISDIR inode, * validate d_type */ -noinline_for_stack -static int check_dirents(struct bch_fs *c) +int bch2_check_dirents(struct bch_fs *c) { struct inode_walker dir = inode_walker_init(); struct inode_walker target = inode_walker_init(); @@ -1784,8 +1781,6 @@ static int check_dirents(struct bch_fs *c) struct bkey_s_c k; int ret = 0; - bch_verbose(c, "checking dirents"); - snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1847,8 +1842,7 @@ fsck_err: /* * Walk xattrs: verify that they all have a corresponding inode */ -noinline_for_stack -static int check_xattrs(struct bch_fs *c) +int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; @@ -1857,8 +1851,6 @@ static int check_xattrs(struct bch_fs *c) struct bkey_s_c k; int ret = 0; - bch_verbose(c, "checking xattrs"); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, @@ -1932,13 +1924,10 @@ fsck_err: } /* Get root directory, create if it doesn't exist: */ -noinline_for_stack -static int check_root(struct bch_fs *c) +int bch2_check_root(struct bch_fs *c) { int ret; - bch_verbose(c, "checking root directory"); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, @@ -2089,11 +2078,10 @@ fsck_err: /* * Check for unreachable inodes, as well as loops in the directory structure: - * After check_dirents(), if an inode backpointer doesn't exist that means it's + * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ -noinline_for_stack -static int check_directory_structure(struct bch_fs *c) +int bch2_check_directory_structure(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; @@ -2376,15 +2364,12 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, return 0; } -noinline_for_stack -static int check_nlinks(struct bch_fs *c) +int bch2_check_nlinks(struct bch_fs *c) { struct nlink_table links = { 0 }; u64 this_iter_range_start, next_iter_range_start = 0; int ret = 0; - bch_verbose(c, "checking inode nlinks"); - do { this_iter_range_start = next_iter_range_start; next_iter_range_start = U64_MAX; @@ -2442,8 +2427,7 @@ 
static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); } -noinline_for_stack -static int fix_reflink_p(struct bch_fs *c) +int bch2_fix_reflink_p(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; @@ -2452,8 +2436,6 @@ static int fix_reflink_p(struct bch_fs *c) if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - bch_verbose(c, "fixing reflink_p keys"); - ret = bch2_trans_run(c, for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, POS_MIN, @@ -2466,40 +2448,3 @@ static int fix_reflink_p(struct bch_fs *c) bch_err_fn(c, ret); return ret; } - -/* - * Checks for inconsistencies that shouldn't happen, unless we have a bug. - * Doesn't fix them yet, mainly because they haven't yet been observed: - */ -int bch2_fsck_full(struct bch_fs *c) -{ - int ret; -again: - ret = bch2_fs_check_snapshot_trees(c); - bch2_fs_check_snapshots(c) ?: - bch2_fs_check_subvols(c) ?: - bch2_delete_dead_snapshots(c) ?: - check_inodes(c, true) ?: - check_extents(c) ?: - check_dirents(c) ?: - check_xattrs(c) ?: - check_root(c) ?: - check_directory_structure(c) ?: - check_nlinks(c) ?: - fix_reflink_p(c); - - if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - goto again; - } - - return ret; -} - -int bch2_fsck_walk_inodes_only(struct bch_fs *c) -{ - return bch2_fs_check_snapshots(c) ?: - bch2_fs_check_subvols(c) ?: - bch2_delete_dead_snapshots(c) ?: - check_inodes(c, false); -} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 264f2706b12d..90c87b5089a0 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,7 +2,13 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -int bch2_fsck_full(struct bch_fs *); -int bch2_fsck_walk_inodes_only(struct bch_fs *); +int bch2_check_inodes(struct bch_fs *); +int bch2_check_extents(struct bch_fs *); +int bch2_check_dirents(struct bch_fs *); +int bch2_check_xattrs(struct bch_fs *); +int bch2_check_root(struct bch_fs *); +int bch2_check_directory_structure(struct bch_fs *); +int bch2_check_nlinks(struct bch_fs *); +int bch2_fix_reflink_p(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1499efc9d2a0..3b9120bd3603 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1028,7 +1028,7 @@ fsck_err: return ret; } -static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +static int bch2_initialize_subvolumes(struct bch_fs *c) { struct bkey_i_snapshot_tree root_tree; struct bkey_i_snapshot root_snapshot; @@ -1139,6 +1139,88 @@ static void check_version_upgrade(struct bch_fs *c) } } +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + const char *name; + unsigned when; +}; + +static struct recovery_pass_fn recovery_passes[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + 
return true; + if (p->when & PASS_ALWAYS) + return true; + if (p->when >= PASS_UPGRADE(0) && + bch2_version_upgrading_to(c, p->when >> 4)) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + int ret; + + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { + struct recovery_pass_fn *p = recovery_passes + pass; + + if (!(p->when & PASS_SILENT)) + printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); + } + + return 0; +} + +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; +again: + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (ret) + break; + c->curr_recovery_pass++; + } + + if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + c->curr_recovery_pass = BCH_RECOVERY_PASS_delete_dead_snapshots; + goto again; + } + + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -1313,141 +1395,9 @@ use_clean: if (ret) goto err; - bch_verbose(c, "starting alloc read"); - ret = bch2_alloc_read(c); - if (ret) - goto err; - bch_verbose(c, "alloc read done"); - - bch_verbose(c, "starting stripes_read"); - ret = bch2_stripes_read(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; - bch_verbose(c, "stripes_read done"); - - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - if (c->opts.fsck) { - bool metadata_only = c->opts.norecovery; - - bch_info(c, "checking allocations"); - ret = bch2_gc(c, true, metadata_only); - if (ret) - goto err; - bch_verbose(c, "done checking allocations"); - - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - - bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); - - bch_info(c, "checking need_discard and freespace btrees"); - ret = bch2_check_alloc_info(c); - if (ret) - goto err; - bch_verbose(c, "done checking need_discard and freespace btrees"); - - set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); - - bch_info(c, "checking lrus"); - ret = bch2_check_lrus(c); - if (ret) - goto err; - bch_verbose(c, "done checking lrus"); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - - bch_info(c, "checking backpointers to alloc keys"); - ret = bch2_check_btree_backpointers(c); - if (ret) - goto err; - bch_verbose(c, "done checking backpointers to alloc keys"); - - bch_info(c, "checking backpointers to extents"); - ret = bch2_check_backpointers_to_extents(c); - if (ret) - goto err; - bch_verbose(c, "done checking backpointers to extents"); - - bch_info(c, "checking extents to backpointers"); - ret = bch2_check_extents_to_backpointers(c); - if (ret) - goto err; - bch_verbose(c, "done checking extents to backpointers"); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - - bch_info(c, "checking alloc to lru refs"); - ret = bch2_check_alloc_to_lru_refs(c); - if (ret) - goto err; - bch_verbose(c, "done checking alloc to lru refs"); - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, 
&c->flags); - } else { - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); - - if (c->opts.norecovery) - goto out; - - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); - } - - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - if (bch2_version_upgrading_to(c, bcachefs_metadata_version_bucket_gens)) { - bch_info(c, "initializing bucket_gens"); - ret = bch2_bucket_gens_init(c); - if (ret) - goto err; - bch_verbose(c, "bucket_gens init done"); - } - - if (bch2_version_upgrading_to(c, bcachefs_metadata_version_snapshot_2)) { - ret = bch2_fs_upgrade_for_subvolumes(c); - if (ret) - goto err; - } - - if (c->opts.fsck) { - ret = bch2_fsck_full(c); - if (ret) - goto err; - bch_verbose(c, "fsck done"); - } else if (!c->sb.clean) { - bch_verbose(c, "checking for deleted inodes"); - ret = bch2_fsck_walk_inodes_only(c); - if (ret) - goto err; - bch_verbose(c, "check inodes done"); - } if (enabled_qtypes(c)) { bch_verbose(c, "reading quotas"); @@ -1548,10 +1498,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); @@ -1599,12 +1546,12 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - ret = bch2_fs_initialize_subvolumes(c); + ret = bch2_initialize_subvolumes(c); if (ret) goto err; bch_verbose(c, "reading snapshots table"); - ret = bch2_fs_snapshots_start(c); + ret = bch2_snapshots_read(c); if (ret) goto err; bch_verbose(c, "reading snapshots done"); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f26397aa2b31..f3852c433ca9 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -408,7 +408,7 @@ fsck_err: * And, make sure it points to a subvolume within that snapshot tree, or correct * it to point to the oldest subvolume within that snapshot tree. 
*/ -int bch2_fs_check_snapshot_trees(struct bch_fs *c) +int bch2_check_snapshot_trees(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; @@ -612,7 +612,7 @@ fsck_err: return ret; } -int bch2_fs_check_snapshots(struct bch_fs *c) +int bch2_check_snapshots(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; @@ -692,7 +692,7 @@ fsck_err: return ret; } -int bch2_fs_check_subvols(struct bch_fs *c) +int bch2_check_subvols(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; @@ -713,7 +713,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) genradix_free(&c->snapshots); } -int bch2_fs_snapshots_start(struct bch_fs *c) +int bch2_snapshots_read(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; @@ -1151,7 +1151,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) return 0; bch2_delete_dead_snapshots_async(c); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 105410e080e0..daa9a6b0819b 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -130,12 +130,12 @@ static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 i return ret; } -int bch2_fs_check_snapshot_trees(struct bch_fs *); -int bch2_fs_check_snapshots(struct bch_fs *); -int bch2_fs_check_subvols(struct bch_fs *); +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); +int bch2_check_subvols(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); -int bch2_fs_snapshots_start(struct bch_fs *); +int bch2_snapshots_read(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -- cgit From dbc7deb2afc71aa466c9ed4c3cefd838d823bff7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 13:20:29 -0400 Subject: bcachefs: Mark as EXPERIMENTAL As discussed on list, bcachefs is going to be marked as experimental for a few releases, until the inevitable tide of new bug reports subsides. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index bc56c6bf37d7..49776ba0a031 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -1,6 +1,6 @@ config BCACHEFS_FS - tristate "bcachefs filesystem support" + tristate "bcachefs filesystem support (EXPERIMENTAL)" depends on BLOCK select EXPORTFS select CLOSURES -- cgit From e8ee5cc733319496cbe8a97bc75ccdb6058d2da7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 13:49:34 -0400 Subject: bcachefs: Fix try_decrease_writepoints() We were freeing open buckets on the writepoint list, but forgetting to take them off the writepoint list - whoops Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 06bfcc5a498a..fcb7311b1844 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1193,6 +1193,7 @@ static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) bch2_trans_mutex_lock_norelock(trans, &wp->lock); open_bucket_for_each(c, &wp->ptrs, ob, i) open_bucket_free_unused(c, ob); + wp->ptrs.nr = 0; mutex_unlock(&wp->lock); return true; } -- cgit From ca630f1d3767c20026b33ba1603017e5c8e9da5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 14:12:58 -0400 Subject: bcachefs: Kill bch2_xattr_get() Inline it into the only caller Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 11 ++--------- fs/bcachefs/xattr.h | 4 +--- 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 867cc6878248..43904c0ec9ba 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -166,13 +166,6 @@ err1: return ret < 0 && bch2_err_matches(ret, ENOENT) ? 
-ENODATA : ret; } -int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) -{ - return bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); -} - int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, @@ -365,9 +358,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); - ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 214cbbaac304..ad568c06e1f8 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -38,9 +38,7 @@ struct xattr_handler; struct bch_hash_info; struct bch_inode_info; -int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, - const char *, void *, size_t, int); - +/* Exported for cmd_migrate.c in tools: */ int bch2_xattr_set(struct btree_trans *, subvol_inum, const struct bch_hash_info *, const char *, const void *, size_t, int, int); -- cgit From 07f293c8630d5bdae1615e6add90c76fed333d20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 14:18:28 -0400 Subject: bcachefs: bch2_xattr_set() now updates ctime Fixes fstests generic/728 Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.h | 2 ++ fs/bcachefs/xattr.c | 31 +++++++++++++++++++------------ fs/bcachefs/xattr.h | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 2e63cb6603bd..6170d214d648 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -196,6 +196,8 @@ int bch2_vfs_init(void); #else +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 43904c0ec9ba..70f78006daf2 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -167,23 +167,22 @@ err1: } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) { + struct bch_fs *c = trans->c; struct btree_iter inode_iter = { NULL }; - struct bch_inode_unpacked inode_u; int ret; - /* - * We need to do an inode update so that bi_journal_sync gets updated - * and fsync works: - * - * Perhaps we should be updating bi_mtime too? 
- */ + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: - bch2_inode_write(trans, &inode_iter, &inode_u); + inode_u->bi_ctime = bch2_current_time(c); + + ret = bch2_inode_write(trans, &inode_iter, inode_u); bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -373,12 +372,20 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct bch_inode_unpacked inode_u; + struct btree_trans trans; int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &hash, - name, value, size, + bch2_trans_init(&trans, c, 0, 0); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + &hash, name, value, size, handler->flags, flags)); + if (!ret) + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_trans_exit(&trans); + return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index ad568c06e1f8..f5a52e3a6016 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -40,7 +40,7 @@ struct bch_inode_info; /* Exported for cmd_migrate.c in tools: */ int bch2_xattr_set(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, + struct bch_inode_unpacked *, const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -- cgit From 30a8278a1e2f627b1f28ab521e40eecacb223efc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 15:13:30 -0400 Subject: bcachefs: Add new assertions for shutdown path We've been seeing assertions pop that indicate the btree node cache or key cache have dirty items when we just did a clean shutdown. Add some more assertions so we can catch this when we're dirtying items. 
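To illustrate the idea with a generic, self-contained sketch (this is not bcachefs code; all names here are invented): the invariant is asserted at the moment an item transitions to dirty, so the offending call path is caught on the spot instead of a stale dirty count surfacing later in the shutdown path.

  #include <assert.h>
  #include <stdbool.h>

  struct fs {
  	bool clean_shutdown;	/* set once the final flush has completed */
  };

  struct cached_item {
  	bool dirty;
  };

  /* Nothing may become dirty after the fs has been marked clean for
   * shutdown - assert at the transition so the culprit is on the stack: */
  static void mark_dirty(struct fs *fs, struct cached_item *item)
  {
  	if (!item->dirty) {
  		assert(!fs->clean_shutdown);
  		item->dirty = true;
  	}
  }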
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 + fs/bcachefs/btree_update_leaf.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1d702efaf074..34d959c4e640 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -780,6 +780,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 2b43f02fc455..6e12e8e7c301 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -272,8 +272,10 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, journal_seq); - if (unlikely(!btree_node_dirty(b))) + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); set_btree_node_dirty_acct(c, b); + } live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; -- cgit From ba8eeae8eee0aae03eb6be0372519b72057d312c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Jun 2023 22:09:35 -0400 Subject: bcachefs: bcachefs_metadata_version_major_minor This introduces major/minor versioning to the superblock version number. Major version number changes indicate incompatible releases; we can move forward to a new major version number, but not backwards. Minor version numbers indicate compatible changes - these add features, but can still be mounted and used by old versions. With the recent patches that make it possible to roll out new btrees and key types without breaking compatibility, we should be able to roll out most new features without incompatible changes. 
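To make the packing concrete, here is a small userspace sketch of the scheme described above (names here are invented for illustration; the real macros are BCH_VERSION_MAJOR(), BCH_VERSION_MINOR() and BCH_VERSION() added to bcachefs_format.h below). The minor number lives in the low 10 bits, the major number above it, and only the major number decides whether an on-disk version is readable:

  #include <stdint.h>
  #include <stdio.h>

  #define VERSION_MAJOR(v)	((unsigned) ((v) >> 10))
  #define VERSION_MINOR(v)	((unsigned) ((v) & ((1U << 10) - 1)))
  #define VERSION(maj, min)	(((maj) << 10) | (min))

  /* Older code can still read a newer minor within the same major: */
  static int can_read(uint16_t ondisk, uint16_t supported)
  {
  	return VERSION_MAJOR(ondisk) <= VERSION_MAJOR(supported);
  }

  int main(void)
  {
  	uint16_t v = VERSION(1, 1);

  	/* prints "on disk 1.1, supported 1.0 -> readable: 1" */
  	printf("on disk %u.%u, supported 1.0 -> readable: %d\n",
  	       VERSION_MAJOR(v), VERSION_MINOR(v),
  	       can_read(v, VERSION(1, 0)));
  	return 0;
  }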
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/bcachefs_format.h | 46 ++++++++++++++++--------------- fs/bcachefs/btree_io.c | 7 +++-- fs/bcachefs/journal_io.c | 12 ++++++--- fs/bcachefs/recovery.c | 63 +++++++++++++++++++++++++++++++++---------- fs/bcachefs/super-io.c | 58 ++++++++++++++++++++++++++------------- fs/bcachefs/super-io.h | 3 ++- 7 files changed, 129 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cfd4a7b9e894..88a1782b2a0e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -568,7 +568,6 @@ enum { BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, - BCH_FS_VERSION_UPGRADE, BCH_FS_HAVE_DELETED_SNAPSHOTS, /* errors: */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8a0f90a83da9..78771d8d8a62 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1574,28 +1574,32 @@ struct bch_sb_field_journal_seq_blacklist { * One common version number for all on disk data structures - superblock, btree * nodes, journal entries */ +#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) +#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) +#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) #define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, 10) \ - x(inode_btree_change, 11) \ - x(snapshot, 12) \ - x(inode_backpointers, 13) \ - x(btree_ptr_sectors_written, 14) \ - x(snapshot_2, 15) \ - x(reflink_p_fix, 16) \ - x(subvol_dirent, 17) \ - x(inode_v2, 18) \ - x(freespace, 19) \ - x(alloc_v4, 20) \ - x(new_data_types, 21) \ - x(backpointers, 22) \ - x(inode_v3, 23) \ - x(unwritten_extents, 24) \ - x(bucket_gens, 25) \ - x(lru_v2, 26) \ - x(fragmentation_lru, 27) \ - x(no_bps_in_alloc_keys, 28) \ - x(snapshot_trees, 29) + x(bkey_renumber, BCH_VERSION(0, 10)) \ + x(inode_btree_change, BCH_VERSION(0, 11)) \ + x(snapshot, BCH_VERSION(0, 12)) \ + x(inode_backpointers, BCH_VERSION(0, 13)) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ + x(snapshot_2, BCH_VERSION(0, 15)) \ + x(reflink_p_fix, BCH_VERSION(0, 16)) \ + x(subvol_dirent, BCH_VERSION(0, 17)) \ + x(inode_v2, BCH_VERSION(0, 18)) \ + x(freespace, BCH_VERSION(0, 19)) \ + x(alloc_v4, BCH_VERSION(0, 20)) \ + x(new_data_types, BCH_VERSION(0, 21)) \ + x(backpointers, BCH_VERSION(0, 22)) \ + x(inode_v3, BCH_VERSION(0, 23)) \ + x(unwritten_extents, BCH_VERSION(0, 24)) \ + x(bucket_gens, BCH_VERSION(0, 25)) \ + x(lru_v2, BCH_VERSION(0, 26)) \ + x(fragmentation_lru, BCH_VERSION(0, 27)) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ + x(snapshot_trees, BCH_VERSION(0, 29)) \ + x(major_minor, BCH_VERSION(1, 0)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1605,7 +1609,7 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_max }; -static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_snapshot_trees; +static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a8197c500894..a8f7b71139a6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -701,7 +701,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_version_compatible(version), BTREE_ERR_INCOMPATIBLE, c, ca, b, i, - "unsupported bset version %u", version); + 
"unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -713,7 +715,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); } - if (btree_err_on(version > c->sb.version, + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), BTREE_ERR_FIXABLE, c, NULL, b, i, "bset version %u newer than superblock version %u", version, c->sb.version)) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c7c2ae326ff7..f861ae2f176a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -747,9 +747,11 @@ static int jset_validate(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, - "%s sector %llu seq %llu: incompatible journal entry version %u", + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), version)) { + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { /* don't try to continue: */ return -EINVAL; } @@ -794,9 +796,11 @@ static int jset_validate_early(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, - "%s sector %llu seq %llu: unknown journal entry version %u", + "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), version)) { + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { /* don't try to continue: */ return -EINVAL; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3b9120bd3603..17ffac089a5d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1111,31 +1111,66 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) static void check_version_upgrade(struct bch_fs *c) { - unsigned version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned latest_compatible = bch2_version_compatible(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; + + if (old_version < bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || + latest_compatible < bcachefs_metadata_required_upgrade_below) + new_version = latest_version; + else + new_version = latest_compatible; + } else { + switch (c->opts.version_upgrade) { + case BCH_VERSION_UPGRADE_compatible: + new_version = latest_compatible; + break; + case BCH_VERSION_UPGRADE_incompatible: + new_version = latest_version; + break; + case BCH_VERSION_UPGRADE_none: + new_version = old_version; + break; + } + } - if (version < bcachefs_metadata_required_upgrade_below || - (version < bcachefs_metadata_version_current && - c->opts.version_upgrade != BCH_VERSION_UPGRADE_none)) { + if (new_version > old_version) { struct printbuf buf = PRINTBUF; - if (version != c->sb.version) { - prt_str(&buf, "version upgrade to "); + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); + + if (old_version != c->sb.version) { + prt_str(&buf, "Version upgrade from "); + bch2_version_to_text(&buf, c->sb.version_upgrade_complete); + prt_str(&buf, " to "); bch2_version_to_text(&buf, c->sb.version); - 
prt_str(&buf, " incomplete:\n"); + prt_str(&buf, " incomplete\n"); } - prt_str(&buf, "version "); - bch2_version_to_text(&buf, version); - prt_str(&buf, " prior to "); - bch2_version_to_text(&buf, bcachefs_metadata_required_upgrade_below); - prt_str(&buf, ", upgrade and fsck required"); + prt_printf(&buf, "Doing %s version upgrade from ", + BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) + ? "incompatible" : "compatible"); + bch2_version_to_text(&buf, old_version); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, new_version); + prt_newline(&buf); + + prt_str(&buf, "fsck required"); bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - set_bit(BCH_FS_VERSION_UPGRADE, &c->flags); + + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + mutex_unlock(&c->sb_lock); + + printbuf_exit(&buf); } } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 1437c363a4ab..23e25af001f0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -26,19 +26,42 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; -static const char * const bch2_metadata_versions[] = { -#define x(t, n) [n] = #t, +struct bch2_metadata_version_str { + u16 version; + const char *name; +}; + +static const struct bch2_metadata_version_str bch2_metadata_versions[] = { +#define x(n, v) { .version = v, .name = #n }, BCH_METADATA_VERSIONS() #undef x }; void bch2_version_to_text(struct printbuf *out, unsigned v) { - const char *str = v < ARRAY_SIZE(bch2_metadata_versions) - ? bch2_metadata_versions[v] - : "(unknown version)"; + const char *str = "(unknown version)"; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version == v) { + str = bch2_metadata_versions[i].name; + break; + } + + prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); +} + +unsigned bch2_latest_compatible_version(unsigned v) +{ + if (!BCH_VERSION_MAJOR(v)) + return v; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version > v && + BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == + BCH_VERSION_MAJOR(v)) + v = bch2_metadata_versions[i].version; - prt_printf(out, "%u: %s", v, str); + return v; } const char * const bch2_sb_fields[] = { @@ -816,10 +839,9 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); - if (test_bit(BCH_FS_VERSION_UPGRADE, &c->flags)) { - c->disk_sb.sb->magic = BCHFS_MAGIC; - c->disk_sb.sb->layout.magic = BCHFS_MAGIC; - } + /* Make sure we're using the new magic numbers: */ + c->disk_sb.sb->magic = BCHFS_MAGIC; + c->disk_sb.sb->layout.magic = BCHFS_MAGIC; le64_add_cpu(&c->disk_sb.sb->seq, 1); @@ -1194,19 +1216,19 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + /* + * Downgrade, if superblock is at a higher version than currently + * supported: + */ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - - if (test_bit(BCH_FS_VERSION_UPGRADE, &c->flags) || - c->sb.version > bcachefs_metadata_version_current) + if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - - if 
(test_bit(BCH_FS_VERSION_UPGRADE, &c->flags)) - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + if (c->sb.version_min > bcachefs_metadata_version_current) + c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index cda71ec845a5..a850cc4ae6c7 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -11,11 +11,12 @@ static inline bool bch2_version_compatible(u16 version) { - return version <= bcachefs_metadata_version_current && + return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && version >= bcachefs_metadata_version_min; } void bch2_version_to_text(struct printbuf *, unsigned); +unsigned bch2_latest_compatible_version(unsigned); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, -- cgit From 01e691e830edae9a145eeb70f8983223d606e2ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 11:17:56 -0400 Subject: bcachefs: Fix a write buffer flush deadlock We're not supposed to block if BTREE_INSERT_JOURNAL_RECLAIM && watermark != BCH_WATERMARK_reclaim. This should really be a separate BTREE_INSERT_NONBLOCK flag - add some comments to that effect, it's not important for this patch. btree write buffer flush depends on this behaviour though - the first loop tries to flush sequentially, which doesn't free up space in the journal optimally. If that can't proceed we bail out and flush in journal order - that won't work if we're blocked instead of returning an error. 
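The control flow being protected looks roughly like this (a simplified, self-contained sketch with invented names, not the actual bcachefs write buffer code): the fast path must get an error back rather than block, so that the slow path - which frees journal space as it goes - can run.

  #include <errno.h>

  struct wb { int journal_full; };

  /* Fast path: flush keys in key order. Must *return* an error rather
   * than block when making room in the journal would require reclaim: */
  static int flush_in_key_order(struct wb *wb)
  {
  	return wb->journal_full ? -EAGAIN : 0;
  }

  /* Slow path: flush in journal-sequence order, freeing journal space
   * as it goes, so it can always make forward progress: */
  static int flush_in_journal_order(struct wb *wb)
  {
  	return 0;
  }

  static int write_buffer_flush(struct wb *wb)
  {
  	int ret = flush_in_key_order(wb);

  	return ret == -EAGAIN ? flush_in_journal_order(wb) : ret;
  }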
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 11 +++++++++++ fs/bcachefs/btree_update_leaf.c | 4 ++++ 2 files changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 5592feff79d1..3659b2c08109 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1158,6 +1158,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_err_matches(ret, ENOMEM)) { struct closure cl; + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if (bch2_err_matches(ret, ENOSPC) && + (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + closure_init_stack(&cl); do { diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e12e8e7c301..53219fdcff66 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -958,6 +958,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); break; case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; -- cgit From 6619d84626ff266721f4c3c24339c60ca8cb12e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 12:23:01 -0400 Subject: bcachefs: bch2_sb_maybe_downgrade(), bch2_sb_upgrade() Add some new helpers, and fix upgrade/downgrade in bch2_fs_initialize(). Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 ++++---- fs/bcachefs/super-io.c | 36 ++++++++++++++++++++++++++---------- fs/bcachefs/super-io.h | 3 +++ 3 files changed, 33 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 17ffac089a5d..9ca6c236f508 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1166,8 +1166,7 @@ static void check_version_upgrade(struct bch_fs *c) c->opts.fix_errors = FSCK_OPT_YES; mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + bch2_sb_upgrade(c, new_version); mutex_unlock(&c->sb_lock); printbuf_exit(&buf); @@ -1525,10 +1524,11 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + bch2_sb_maybe_downgrade(c); + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + bch2_sb_upgrade(c, bcachefs_metadata_version_current); SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 23e25af001f0..a06310492e79 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1204,17 +1204,10 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle return 0; } -int bch2_fs_mark_dirty(struct bch_fs *c) +/* Downgrade if superblock is at a higher version than currently supported: */ +void 
bch2_sb_maybe_downgrade(struct bch_fs *c) { - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + lockdep_assert_held(&c->sb_lock); /* * Downgrade, if superblock is at a higher version than currently @@ -1227,8 +1220,31 @@ int bch2_fs_mark_dirty(struct bch_fs *c) if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); +} + +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index a850cc4ae6c7..b365f088ba41 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -124,6 +124,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); -- cgit From 065bd3356ce490ae9454d8b3c98ff298e13d09ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 13:42:26 -0400 Subject: bcachefs: Version table now lists required recovery passes Now that we've got forward compatibility sorted out, we should be doing more frequent version upgrades in the future. To avoid having to run a full fsck for every version upgrade, this improves the BCH_METADATA_VERSIONS() table to explicitly specify a bitmask of recovery passes to run when upgrading to or past a given version. This means we can also delete PASS_UPGRADE(). 
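A self-contained sketch of the lookup this enables (version numbers and pass bits below are placeholders; the real table is BCH_METADATA_VERSIONS() and the real helper is bch2_upgrade_recovery_passes(), both in the diff that follows): upgrading across several versions simply runs the union of the pass bitmasks of every version stepped over, so a small upgrade no longer implies a full fsck.

  #include <stdint.h>

  struct version_entry {
  	unsigned	version;
  	uint64_t	recovery_passes;	/* bitmask of required passes */
  };

  /* placeholder table - one entry per on-disk version: */
  static const struct version_entry versions[] = {
  	{ .version = 25, .recovery_passes = 1ULL << 3 },
  	{ .version = 29, .recovery_passes = 0 },
  	{ .version = 33, .recovery_passes = 1ULL << 7 },
  };

  static uint64_t upgrade_recovery_passes(unsigned old_v, unsigned new_v)
  {
  	uint64_t ret = 0;
  	unsigned i;

  	for (i = 0; i < sizeof(versions) / sizeof(versions[0]); i++)
  		if (versions[i].version > old_v && versions[i].version <= new_v)
  			ret |= versions[i].recovery_passes;
  	return ret;
  }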
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 17 ++++------ fs/bcachefs/bcachefs_format.h | 72 +++++++++++++++++++++++++++++-------------- fs/bcachefs/recovery.c | 29 ++++++++++++----- fs/bcachefs/recovery.h | 2 ++ fs/bcachefs/super-io.c | 32 ++++++++++++++++--- fs/bcachefs/super-io.h | 4 +++ 6 files changed, 110 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 88a1782b2a0e..d8c020644f54 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -660,12 +660,11 @@ enum bch_write_ref { #define PASS_FSCK BIT(1) #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) -#define PASS_UPGRADE(v) ((v) << 4) #define BCH_RECOVERY_PASSES() \ x(alloc_read, PASS_ALWAYS) \ x(stripes_read, PASS_ALWAYS) \ - x(initialize_subvolumes, PASS_UPGRADE(bcachefs_metadata_version_snapshot_2)) \ + x(initialize_subvolumes, 0) \ x(snapshots_read, PASS_ALWAYS) \ x(check_allocations, PASS_FSCK) \ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ @@ -677,8 +676,8 @@ enum bch_write_ref { x(check_extents_to_backpointers,PASS_FSCK) \ x(check_alloc_to_lru_refs, PASS_FSCK) \ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, PASS_UPGRADE(bcachefs_metadata_version_bucket_gens)) \ - x(fs_upgrade_for_subvolumes, PASS_UPGRADE(bcachefs_metadata_version_snapshot_2)) \ + x(bucket_gens_init, 0) \ + x(fs_upgrade_for_subvolumes, 0) \ x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ @@ -690,7 +689,7 @@ enum bch_write_ref { x(check_root, PASS_FSCK) \ x(check_directory_structure, PASS_FSCK) \ x(check_nlinks, PASS_FSCK) \ - x(fix_reflink_p, PASS_UPGRADE(bcachefs_metadata_version_reflink_p_fix)) \ + x(fix_reflink_p, 0) \ enum bch_recovery_pass { #define x(n, when) BCH_RECOVERY_PASS_##n, @@ -1033,6 +1032,8 @@ struct bch_fs { u64 journal_replay_seq_start; u64 journal_replay_seq_end; enum bch_recovery_pass curr_recovery_pass; + /* bitmap of explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1177,12 +1178,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } -static inline bool bch2_version_upgrading_to(const struct bch_fs *c, unsigned new_version) -{ - return c->sb.version_upgrade_complete < new_version && - c->sb.version >= new_version; -} - #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 78771d8d8a62..274e57740d74 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1578,32 +1578,58 @@ struct bch_sb_field_journal_seq_blacklist { #define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) #define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) -#define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, BCH_VERSION(0, 10)) \ - x(inode_btree_change, BCH_VERSION(0, 11)) \ - x(snapshot, BCH_VERSION(0, 12)) \ - x(inode_backpointers, BCH_VERSION(0, 13)) \ - x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ - x(snapshot_2, BCH_VERSION(0, 15)) \ - x(reflink_p_fix, BCH_VERSION(0, 16)) \ - x(subvol_dirent, BCH_VERSION(0, 17)) \ - x(inode_v2, BCH_VERSION(0, 18)) \ - x(freespace, BCH_VERSION(0, 19)) \ - x(alloc_v4, BCH_VERSION(0, 20)) \ - x(new_data_types, BCH_VERSION(0, 21)) \ - x(backpointers, BCH_VERSION(0, 22)) \ - x(inode_v3, BCH_VERSION(0, 23)) \ - x(unwritten_extents, BCH_VERSION(0, 24)) \ - 
x(bucket_gens, BCH_VERSION(0, 25)) \ - x(lru_v2, BCH_VERSION(0, 26)) \ - x(fragmentation_lru, BCH_VERSION(0, 27)) \ - x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ - x(snapshot_trees, BCH_VERSION(0, 29)) \ - x(major_minor, BCH_VERSION(1, 0)) +#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) + +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, BCH_VERSION(0, 10), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_btree_change, BCH_VERSION(0, 11), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot, BCH_VERSION(0, 12), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_backpointers, BCH_VERSION(0, 13), \ + RECOVERY_PASS_ALL_FSCK) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_2, BCH_VERSION(0, 15), \ + BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ + BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(reflink_p_fix, BCH_VERSION(0, 16), \ + BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ + x(subvol_dirent, BCH_VERSION(0, 17), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v2, BCH_VERSION(0, 18), \ + RECOVERY_PASS_ALL_FSCK) \ + x(freespace, BCH_VERSION(0, 19), \ + RECOVERY_PASS_ALL_FSCK) \ + x(alloc_v4, BCH_VERSION(0, 20), \ + RECOVERY_PASS_ALL_FSCK) \ + x(new_data_types, BCH_VERSION(0, 21), \ + RECOVERY_PASS_ALL_FSCK) \ + x(backpointers, BCH_VERSION(0, 22), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, BCH_VERSION(0, 23), \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, BCH_VERSION(0, 24), \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, BCH_VERSION(0, 25), \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, BCH_VERSION(0, 26), \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, BCH_VERSION(0, 27), \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, BCH_VERSION(0, 29), \ + RECOVERY_PASS_ALL_FSCK) \ + x(major_minor, BCH_VERSION(1, 0), \ + 0) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -#define x(t, n) bcachefs_metadata_version_##t = n, +#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, BCH_METADATA_VERSIONS() #undef x bcachefs_metadata_version_max diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9ca6c236f508..0486ec9d281c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1115,6 +1115,7 @@ static void check_version_upgrade(struct bch_fs *c) unsigned latest_version = bcachefs_metadata_version_current; unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; + u64 recovery_passes; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -1158,12 +1159,15 @@ static void check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - prt_str(&buf, "fsck required"); + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { + prt_str(&buf, "fsck required"); - bch_info(c, "%s", buf.buf); + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_OPT_YES; + } - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; + bch_info(c, "%s", buf.buf); mutex_lock(&c->sb_lock); bch2_sb_upgrade(c, new_version); @@ -1196,21 +1200,30 @@ static struct recovery_pass_fn recovery_passes[] = { #undef x }; +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) + if (recovery_passes[i].when & PASS_FSCK) + 
ret |= BIT_ULL(i); + return ret; +} + static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; if ((p->when & PASS_FSCK) && c->opts.fsck) return true; if ((p->when & PASS_UNCLEAN) && !c->sb.clean) return true; if (p->when & PASS_ALWAYS) return true; - if (p->when >= PASS_UPGRADE(0) && - bch2_version_upgrading_to(c, p->when >> 4)) - return true; return false; } @@ -1294,7 +1307,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->opts.nochanges) + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) check_version_upgrade(c); if (c->opts.fsck && c->opts.norecovery) { diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 8c0348e8b84c..f8e796c0f8c8 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -52,6 +52,8 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct bch_fs *); +u64 bch2_fsck_recovery_passes(void); + int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a06310492e79..6a97af0f5896 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -4,6 +4,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" +#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -12,13 +13,13 @@ #include "journal_io.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" +#include "recovery.h" #include "replicas.h" #include "quota.h" #include "super-io.h" #include "super.h" #include "trace.h" #include "vstructs.h" -#include "counters.h" #include #include @@ -26,13 +27,18 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; -struct bch2_metadata_version_str { +struct bch2_metadata_version { u16 version; const char *name; + u64 recovery_passes; }; -static const struct bch2_metadata_version_str bch2_metadata_versions[] = { -#define x(n, v) { .version = v, .name = #n }, +static const struct bch2_metadata_version bch2_metadata_versions[] = { +#define x(n, v, _recovery_passes) { \ + .version = v, \ + .name = #n, \ + .recovery_passes = _recovery_passes, \ +}, BCH_METADATA_VERSIONS() #undef x }; @@ -64,6 +70,24 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + u64 ret = 0; + + for (const struct bch2_metadata_version *i = bch2_metadata_versions; + i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); + i++) + if (i->version > old_version && i->version <= new_version) { + if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) + ret |= bch2_fsck_recovery_passes(); + ret |= i->recovery_passes; + } + + return ret &= ~RECOVERY_PASS_ALL_FSCK; +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b365f088ba41..904adea6a0da 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -18,6 +18,10 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, unsigned); unsigned bch2_latest_compatible_version(unsigned); +u64 bch2_upgrade_recovery_passes(struct 
bch_fs *c, + unsigned, + unsigned); + struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); -- cgit From f26c67f4a7c4951a312547790b11066bc510822e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jun 2023 18:04:46 -0400 Subject: bcachefs: Snapshot depth, skiplist fields This extents KEY_TYPE_snapshot to include some new fields: - depth, to indicate depth of this particular node from the root - skip[3], skiplist entries for quickly walking back up to the root These are to improve bch2_snapshot_is_ancestor(), making it O(ln(n)) instead of O(n) in the snapshot tree depth. Skiplist nodes are picked at random from the set of ancestor nodes, not some fixed fraction. This introduces bcachefs_metadata_version 1.1, snapshot_skiplists. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 6 +- fs/bcachefs/btree_iter.h | 8 ++ fs/bcachefs/recovery.c | 13 ++- fs/bcachefs/subvolume.c | 261 ++++++++++++++++++++++++++++++++++-------- fs/bcachefs/subvolume.h | 33 ++++-- fs/bcachefs/subvolume_types.h | 2 + 6 files changed, 267 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 274e57740d74..6d693e4def5d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1148,6 +1148,8 @@ struct bch_snapshot { __le32 children[2]; __le32 subvol; __le32 tree; + __le32 depth; + __le32 skip[3]; }; LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) @@ -1625,7 +1627,9 @@ struct bch_sb_field_journal_seq_blacklist { x(snapshot_trees, BCH_VERSION(0, 29), \ RECOVERY_PASS_ALL_FSCK) \ x(major_minor, BCH_VERSION(1, 0), \ - 0) + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 63260f68bc67..13e92452270e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -795,6 +795,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) +#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + #define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0486ec9d281c..c46297bd1cf9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -594,10 +594,21 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned iter_flags = BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS; + unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + /* + * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * keep the key cache coherent with the underlying btree. Nothing + * besides the allocator is doing updates yet so we don't need key cache + * coherency for non-alloc btrees, and key cache fills for snapshots + * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * the snapshots recovery pass runs. 
+ */ if (!k->level && k->btree_id == BTREE_ID_alloc) iter_flags |= BTREE_ITER_CACHED; + else + update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -610,7 +621,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; - ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f3852c433ca9..cdaaf49d3b3e 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -8,8 +8,41 @@ #include "fs.h" #include "subvolume.h" +#include + static int bch2_subvolume_delete(struct btree_trans *, u32); +static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_t *s = snapshot_t(c, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + while (id && id < ancestor) + id = get_ancestor_below(c, id, ancestor); + + return id == ancestor; +} + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = snapshot_t(c, id)->parent; + + return id == ancestor; +} + /* Snapshot tree: */ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, @@ -95,6 +128,13 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->children[1]), le32_to_cpu(s.v->subvol), le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), + le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, @@ -140,6 +180,25 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, } } + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { + prt_printf(err, "skiplist not normalized"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + if (!id != !s.v->parent || + (s.v->parent && + id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u)", id); + return -BCH_ERR_invalid_bkey; + } + } + } + return 0; } @@ -165,6 +224,21 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->children[1] = le32_to_cpu(s.v->children[1]); t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + if (BCH_SNAPSHOT_DELETED(s.v)) + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); } else { t->parent = 0; t->children[0] = 0; @@ -370,9 +444,9 @@ static int check_snapshot_tree(struct btree_trans *trans, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), c, + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), c, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || @@ -441,7 +515,47 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, if (ret) return ret; - return bch2_snapshot_is_ancestor(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + + if (!id) + return 0; + + s = snapshot_t(c, id); + if (!s->parent) + return id; + + return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); +} + +static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +{ + struct bch_snapshot a; + unsigned i; + int ret; + + for (i = 0; i < 3; i++) { + if (!s.parent != !s.skip[i]) + return false; + + if (!s.parent) + continue; + + ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); + if (bch2_err_matches(ret, ENOENT)) + return false; + if (ret) + return ret; + + if (a.tree != s.tree) + return false; + } + + return true; } /* @@ -451,14 +565,15 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, */ static int snapshot_tree_ptr_repair(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_snapshot *s) + struct bkey_s_c k, + struct bch_snapshot *s) { struct bch_fs *c = trans->c; struct btree_iter root_iter; struct bch_snapshot_tree s_t; struct bkey_s_c_snapshot root; struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, s->k->p.offset), tree_id; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; int ret; root = bch2_bkey_get_iter_typed(trans, &root_iter, @@ -484,32 +599,43 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, goto err; u->v.tree = cpu_to_le32(tree_id); - if (s->k->p.snapshot == root_id) - *s = snapshot_i_to_s_c(u); + if (k.k->p.offset == root_id) + *s = u->v; } - if (s->k->p.snapshot != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &s->s_c, 0, snapshot); + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; u->v.tree = cpu_to_le32(tree_id); - *s = snapshot_i_to_s_c(u); + *s = u->v; } err: bch2_trans_iter_exit(trans, &root_iter); return ret; } +static int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + static int check_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot s; + struct bch_snapshot 
s; struct bch_subvolume subvol; struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + struct snapshot_t *parent = parent_id + ? snapshot_t(c, parent_id) + : NULL; struct printbuf buf = PRINTBUF; bool should_have_subvol; u32 i, id; @@ -518,94 +644,123 @@ static int check_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - s = bkey_s_c_to_snapshot(k); - id = le32_to_cpu(s.v->parent); + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); if (id) { ret = snapshot_lookup(trans, id, &v); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (le32_to_cpu(v.children[0]) != s.k->p.offset && - le32_to_cpu(v.children[1]) != s.k->p.offset) { + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, s.k->p.offset); + id, k.k->p.offset); ret = -EINVAL; goto err; } } - for (i = 0; i < 2 && s.v->children[i]; i++) { - id = le32_to_cpu(s.v->children[i]); + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); ret = snapshot_lookup(trans, id, &v); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot node %llu has nonexistent child %u", - s.k->p.offset, id); + k.k->p.offset, id); if (ret) goto err; - if (le32_to_cpu(v.parent) != s.k->p.offset) { + if (le32_to_cpu(v.parent) != k.k->p.offset) { bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), s.k->p.offset); + id, le32_to_cpu(v.parent), k.k->p.offset); ret = -EINVAL; goto err; } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && - !BCH_SNAPSHOT_DELETED(s.v); + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { - id = le32_to_cpu(s.v->subvol); + id = le32_to_cpu(s.subvol); ret = bch2_subvolume_get(trans, id, 0, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); + k.k->p.offset); ret = -EINVAL; goto err; } } else { - if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); - + if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - bkey_reassemble(&u->k_i, s.s_c); u->v.subvol = 0; - ret = bch2_trans_update(trans, iter, &u->k_i, 0); - if (ret) - goto err; - - s = snapshot_i_to_s_c(u); + s = u->v; } } - ret = snapshot_tree_ptr_good(trans, s.k->p.offset, le32_to_cpu(s.v->tree)); + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); if (ret < 0) goto err; if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", - 
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, &s); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); if (ret) goto err; } ret = 0; - if (BCH_SNAPSHOT_DELETED(s.v)) - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + real_depth = parent ? parent->depth + 1 : 0; + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; err: fsck_err: printbuf_exit(&buf); @@ -618,9 +773,13 @@ int bch2_check_snapshots(struct bch_fs *c) struct bkey_s_c k; int ret; + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, - BTREE_ID_snapshots, POS_MIN, + for_each_btree_key_reverse_commit(&trans, iter, + BTREE_ID_snapshots, POS_MAX, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_snapshot(&trans, &iter, k))); @@ -847,10 +1006,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, u32 *snapshot_subvols, unsigned nr_snapids) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i_snapshot *n; struct bkey_s_c k; - unsigned i; + unsigned i, j; + u32 depth = parent ? 
snapshot_t(c, parent)->depth + 1 : 0; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, @@ -880,6 +1041,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.parent = cpu_to_le32(parent); n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index daa9a6b0819b..ab0b4a6de255 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -37,9 +37,34 @@ static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) return genradix_ptr(&c->snapshots, U32_MAX - id); } +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) { +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + while (n--) + id = bch2_snapshot_parent(c, id); + + return id; } static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) @@ -84,13 +109,7 @@ static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) return 0; } -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - while (id && id < ancestor) - id = bch2_snapshot_parent(c, id); - - return id == ancestor; -} +bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index c6c1cbad9781..750d975ac468 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -8,6 +8,8 @@ typedef DARRAY(u32) snapshot_id_list; struct snapshot_t { u32 parent; + u32 skip[3]; + u32 depth; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; -- cgit From b912913613a1195ba86cbb1bc9b1c12b4c6c4a14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 15:56:05 -0400 Subject: bcachefs: Fix build error on weird gcc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixes ./include/linux/stddef.h:8:14: error: positional initialization of field in ‘struct’ declared with ‘designated_init’ attribute [-Werror=designated-init] Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6a97af0f5896..6ee1e7bb5eba 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1487,9 +1487,7 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; -static const struct bch_sb_field_ops bch2_sb_field_null_ops = { - NULL -}; +static const struct bch_sb_field_ops bch2_sb_field_null_ops; static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) { -- cgit From 
ea28c86722954a58e3bab24eec3e7624e8d4956b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 17:23:59 -0400 Subject: bcachefs: Don't start copygc until recovery is finished With "bcachefs: Snapshot depth, skiplist fields", we now can't run data move operations until after bch2_check_snapshots() is complete. Ideally we'd have the copygc (and rebalance) threads wait until c->curr_recovery_pass has advanced, but the waitlist handling is tricky - so for now, move starting copygc back to read_write_late(). Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ab98c2299dd..63e9dafa8395 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -344,6 +344,19 @@ static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. + * + * Ideally we'd start copygc/rebalance earlier instead of waiting for + * all of recovery/fsck to complete: + */ + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } + ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); @@ -403,12 +416,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) -- cgit From a09818c7e78633ee8a6d147ea5bf074d60ea66cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 22:28:08 -0400 Subject: bcachefs: Fallocate now checks page cache Previously, fallocate would only check the state of the extents btree when determining if we need to create a reservation. But the page cache might already have dirty data or a disk reservation. This changes __bchfs_fallocate() to call bch2_seek_pagecache_hole() to check for this. 
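As a rough illustration (not part of the patch), the clamping idea can be modelled in plain C: a candidate fallocate range is first advanced past sectors the page cache already has dirty or reserved, then cut off at the next sector that has data again, so only the true hole gets a new reservation. The sector-state array and helper below are invented for the example; the real code walks bch_folio sector state via bch2_seek_pagecache_hole() and bch2_seek_pagecache_data().

/*
 * Illustrative sketch, not kernel code: clamp a candidate fallocate
 * range [*start, *end) to the part that is still a hole in a toy model
 * of the page cache.  SECTOR_DIRTY/SECTOR_RESERVED stand in for
 * bch_folio sector state; those sectors need no new reservation.
 */
#include <stdio.h>

enum sector_state { SECTOR_HOLE, SECTOR_DIRTY, SECTOR_RESERVED };

static void clamp_data_hole(const enum sector_state *s, size_t nr,
			    size_t *start, size_t *end)
{
	size_t i = *start;

	/* like seek_pagecache_hole(): skip sectors that already have data */
	while (i < *end && i < nr && s[i] != SECTOR_HOLE)
		i++;
	*start = i;

	/* like seek_pagecache_data(): stop at the next sector with data */
	while (i < *end && i < nr && s[i] == SECTOR_HOLE)
		i++;
	*end = i;
}

int main(void)
{
	enum sector_state s[] = {
		SECTOR_DIRTY, SECTOR_DIRTY, SECTOR_HOLE, SECTOR_HOLE,
		SECTOR_RESERVED, SECTOR_HOLE,
	};
	size_t start = 0, end = 6;

	clamp_data_hole(s, 6, &start, &end);
	/* prints "reserve sectors [2, 4)": only the real hole is reserved */
	printf("reserve sectors [%zu, %zu)\n", start, end);
	return 0;
}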
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 83 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index cb654cfecfb9..0661dfd9a8d0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,8 @@ #include +static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); + struct folio_vec { struct folio *fv_folio; size_t fv_offset; @@ -3370,6 +3372,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct quota_res quota_res = { 0 }; struct bkey_s_c k; unsigned sectors; + bool is_allocation; + u64 hole_start, hole_end; u32 snapshot; bch2_trans_begin(&trans); @@ -3385,6 +3389,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if ((ret = bkey_err(k))) goto bkey_err; + hole_start = iter.pos.offset; + hole_end = bpos_min(k.k->p, end_pos).offset; + is_allocation = bkey_extent_is_allocation(k.k); + /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { @@ -3398,17 +3406,26 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, continue; } - /* - * XXX: for nocow mode, we should promote shared extents to - * unshared here - */ + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = drop_locks_do(&trans, + (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) + goto bkey_err; + + if (hole_start == hole_end) + continue; + } - sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; + sectors = hole_end - hole_start; - if (!bkey_extent_is_allocation(k.k)) { + if (!is_allocation) { ret = bch2_quota_reservation_add(c, inode, - "a_res, - sectors, true); + "a_res, sectors, true); if (unlikely(ret)) goto bkey_err; } @@ -3420,15 +3437,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; i_sectors_acct(c, inode, "a_res, i_sectors_delta); + + drop_locks_do(&trans, + (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } - bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ - mark_pagecache_reserved(inode, start_sector, iter.pos.offset); - if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3676,14 +3693,16 @@ err: /* fseek: */ -static int folio_data_offset(struct folio *folio, loff_t pos) +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) { struct bch_folio *s = bch2_folio(folio); unsigned i, sectors = folio_sectors(folio); if (s) for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) return i << SECTOR_SHIFT; return -1; @@ -3691,7 +3710,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos) static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, - loff_t end_offset) + loff_t end_offset, + unsigned min_replicas) { struct folio_batch fbatch; pgoff_t start_index = start_offset >> PAGE_SHIFT; @@ -3710,7 +3730,8 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, folio_lock(folio); offset = 
folio_data_offset(folio, - max(folio_pos(folio), start_offset)); + max(folio_pos(folio), start_offset), + min_replicas); if (offset >= 0) { ret = clamp(folio_pos(folio) + offset, start_offset, end_offset); @@ -3772,7 +3793,7 @@ err: if (next_data > offset) next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data); + offset, next_data, 0); if (next_data >= isize) return -ENXIO; @@ -3780,7 +3801,8 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) +static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas) { struct folio *folio; struct bch_folio *s; @@ -3797,7 +3819,8 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) sectors = folio_sectors(folio); for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty) { + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { *offset = max(*offset, folio_pos(folio) + (i << SECTOR_SHIFT)); goto unlock; @@ -3812,18 +3835,34 @@ unlock: static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, - loff_t end_offset) + loff_t end_offset, + unsigned min_replicas) { struct address_space *mapping = vinode->i_mapping; loff_t offset = start_offset; while (offset < end_offset && - !folio_hole_offset(mapping, &offset)) + !folio_hole_offset(mapping, &offset, min_replicas)) ; return min(offset, end_offset); } +static void bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas) +{ + *hole_start = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas) >> 9; + + if (*hole_start == *hole_end) + return; + + *hole_end = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas) >> 9; +} + static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); @@ -3853,12 +3892,12 @@ retry: BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE); + offset, MAX_LFS_FILESIZE, 0); break; } else if (!bkey_extent_is_data(k.k)) { next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9); + k.k->p.offset << 9, 0); if (next_hole < k.k->p.offset << 9) break; -- cgit From f39d1aca4de011949b1b1c636de3146f3b7c1384 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jul 2023 20:30:04 -0400 Subject: bcachefs: Add buffered IO fallback for userspace In userspace, we want to be able to switch to buffered IO when we're dealing with an image on a filesystem/device that doesn't support the blocksize the filesystem was formatted with. This plumbs through !opts.direct_io -> FMODE_BUFFERED, which will be supported by the shim version of blkdev_get_by_path() in -tools, and it adds a fallback to disable direct IO and retry for userspace. 
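As a hedged sketch (this is not the bcachefs-tools blkdev_get_by_path() shim), the userspace retry pattern looks roughly like the following: try to open the image with O_DIRECT, and if the host filesystem rejects it, reopen in buffered mode and remember that direct IO is off.

/*
 * Sketch of the "retry without direct IO" idea for userspace tools.
 * Shows only the fallback pattern: drop O_DIRECT and reopen when the
 * host filesystem refuses it (e.g. EINVAL for an unsupported block
 * size or alignment).
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int open_image(const char *path, int *buffered)
{
	int fd = open(path, O_RDWR | O_DIRECT);

	if (fd < 0 && errno == EINVAL) {
		/* direct IO not usable here: fall back to buffered IO */
		fd = open(path, O_RDWR);
		*buffered = 1;
	}
	return fd;
}

int main(int argc, char **argv)
{
	int buffered = 0;

	if (argc < 2)
		return 1;

	int fd = open_image(argv[1], &buffered);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	printf("opened %s (%s IO)\n", argv[1], buffered ? "buffered" : "direct");
	close(fd);
	return 0;
}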
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6ee1e7bb5eba..deef31a617c4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -662,7 +662,9 @@ int bch2_read_super(const char *path, struct bch_opts *opts, struct printbuf err = PRINTBUF; __le64 *i; int ret; - +#ifndef __KERNEL__ +retry: +#endif memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; @@ -670,6 +672,11 @@ int bch2_read_super(const char *path, struct bch_opts *opts, if (!sb->holder) return -ENOMEM; +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) + sb->mode |= FMODE_BUFFERED; +#endif + if (!opt_get(*opts, noexcl)) sb->mode |= BLK_OPEN_EXCL; @@ -754,7 +761,13 @@ int bch2_read_super(const char *path, struct bch_opts *opts, got_super: if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) { + bdev_logical_block_size(sb->bdev) && + opt_get(*opts, direct_io)) { +#ifndef __KERNEL__ + opt_set(*opts, direct_io, false); + bch2_free_super(sb); + goto retry; +#endif prt_printf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); -- cgit From d82978ca1593890a1b41eab6d06fe6e5950e4722 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 11:43:03 -0400 Subject: bcachefs: Add a race_fault() for write buffer slowpath We haven't hooked up dynamic fault injection quite yet, but we will soon Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index b50226313a47..6c30a72e6eee 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -129,6 +129,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f keys = wb->keys[s.idx]; nr = s.nr; + if (race_fault()) + goto slowpath; + /* * We first sort so that we can detect and skip redundant updates, and * then we attempt to flush in sorted btree order, as this is most -- cgit From 8479938d7a0f6c6cf6362c72880e753b3d7a707a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 13:55:03 -0400 Subject: bcachefs: Convert snapshot table to RCU array This switches the generic radix tree for the in-memory table of snapshot nodes to a simple rcu array. This means we have to add new locking to deal with reallocations, but is faster than traversing the radix tree. 
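A rough user-space model of the pattern, using liburcu rather than the kernel RCU API: lookups dereference the table inside an RCU read-side section with no locking, while a mutex-serialized writer grows the array by allocate-and-copy, publishes the new pointer, and frees the old table only after a grace period. Structure names and sizing below are illustrative, not the kernel implementation.

/*
 * Toy RCU-protected, grow-on-demand snapshot table (userspace, liburcu).
 * Build with: cc demo.c -lurcu
 */
#include <urcu.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct snap_t { unsigned parent; unsigned depth; };

struct snap_table {
	size_t size;
	struct snap_t s[];		/* flexible array, like snapshot_table */
};

static struct snap_table *table;	/* RCU-published pointer */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Readers: no locks, just an RCU read-side critical section. */
static struct snap_t read_node(size_t idx)
{
	struct snap_t ret = { 0 };
	struct snap_table *t;

	rcu_read_lock();
	t = rcu_dereference(table);
	if (t && idx < t->size)
		ret = t->s[idx];
	rcu_read_unlock();
	return ret;
}

/* Writers: hold table_lock, reallocate by copy, publish, free the old. */
static struct snap_t *node_mut(size_t idx)
{
	struct snap_table *old = table;	/* safe: table_lock is held */
	struct snap_table *new;
	size_t new_size;

	if (old && idx < old->size)
		return &old->s[idx];

	new_size = old ? old->size : 16;
	while (new_size <= idx)
		new_size *= 2;

	new = calloc(1, sizeof(*new) + new_size * sizeof(new->s[0]));
	if (!new)
		return NULL;
	new->size = new_size;
	if (old)
		memcpy(new->s, old->s, old->size * sizeof(old->s[0]));

	rcu_assign_pointer(table, new);
	synchronize_rcu();		/* wait out readers of the old table */
	free(old);
	return &new->s[idx];
}

int main(void)
{
	rcu_register_thread();

	pthread_mutex_lock(&table_lock);
	struct snap_t *n = node_mut(100);
	if (n) { n->parent = 1; n->depth = 2; }
	pthread_mutex_unlock(&table_lock);

	struct snap_t got = read_node(100);
	printf("parent=%u depth=%u\n", got.parent, got.depth);

	rcu_unregister_thread();
	return 0;
}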
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 5 +- fs/bcachefs/btree_update_leaf.c | 4 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/subvolume.c | 142 +++++++++++++++++++++++++++++++++------- fs/bcachefs/subvolume.h | 95 ++++++++++++++++++++++----- fs/bcachefs/subvolume_types.h | 4 ++ 7 files changed, 207 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d8c020644f54..445d010c83b3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -774,9 +774,10 @@ struct bch_fs { struct mutex sb_lock; /* snapshot.c: */ - GENRADIX(struct snapshot_t) snapshots; - struct bch_snapshot_table __rcu *snapshot_table; + struct snapshot_table __rcu *snapshots; + size_t snapshot_table_size; struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 53219fdcff66..3638cef211b2 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -311,7 +311,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int @@ -1229,7 +1229,7 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - !snapshot_t(trans->c, pos.snapshot)->children[0]) + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) return 0; return __check_pos_snapshot_overwritten(trans, id, pos); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ddc2782fc5b1..bc769b3e932a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -894,7 +894,7 @@ static int check_inode(struct btree_trans *trans, * particular is not atomic, so on the internal snapshot nodes * we can see inodes marked for deletion after a clean shutdown */ - if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) return 0; if (!bkey_is_inode(k.k)) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index d90db3fb823e..4f0654ff816f 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -562,7 +562,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, int ret; ret = bch2_snapshot_tree_lookup(trans, - snapshot_t(c, k.k->p.snapshot)->tree, &s_t); + bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "%s: snapshot tree %u not found", __func__, snapshot_t(c, k.k->p.snapshot)->tree); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index cdaaf49d3b3e..c2c2cfd74e71 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -12,9 +12,9 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); -static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { - struct snapshot_t *s = snapshot_t(c, id); + const struct snapshot_t *s = __snapshot_t(t, id); if (s->skip[2] <= ancestor) return s->skip[2]; @@ -27,22 +27,102 @@ static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) bool bch2_snapshot_is_ancestor(struct 
bch_fs *c, u32 id, u32 ancestor) { + struct snapshot_table *t; + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + while (id && id < ancestor) - id = get_ancestor_below(c, id, ancestor); + id = get_ancestor_below(t, id, ancestor); + rcu_read_unlock(); return id == ancestor; } static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + while (id && id < ancestor) - id = snapshot_t(c, id)->parent; + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); return id == ancestor; } +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +struct snapshot_t_free_rcu { + struct rcu_head rcu; + struct snapshot_table *t; +}; + +static void snapshot_t_free_rcu(struct rcu_head *rcu) +{ + struct snapshot_t_free_rcu *free_rcu = + container_of(rcu, struct snapshot_t_free_rcu, rcu); + + kvfree(free_rcu->t); + kfree(free_rcu); +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = rcu_dereference_protected(c->snapshots, true); + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + if (old) { + struct snapshot_t_free_rcu *rcu = + kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); + + rcu->t = old; + call_rcu(&rcu->rcu, snapshot_t_free_rcu); + } + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + /* Snapshot tree: */ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, @@ -209,12 +289,15 @@ int bch2_mark_snapshot(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct snapshot_t *t; + int ret = 0; - t = genradix_ptr_alloc(&c->snapshots, - U32_MAX - new.k->p.offset, - GFP_KERNEL); - if (!t) - return -BCH_ERR_ENOMEM_mark_snapshot; + mutex_lock(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, new.k->p.offset); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); @@ -246,8 +329,9 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->subvol = 0; t->tree = 0; } - - return 0; +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; } static int snapshot_lookup(struct btree_trans *trans, u32 id, @@ -300,9 +384,14 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) nr_live += ret; } - snapshot_t(c, id)->equiv = nr_live == 1 - ? snapshot_t(c, child[live_idx])->equiv + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? 
snapshot_t_mut(c, child[live_idx])->equiv : id; + + mutex_unlock(&c->snapshot_table_lock); + return 0; } @@ -520,16 +609,18 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) { - struct snapshot_t *s; + const struct snapshot_t *s; if (!id) return 0; + rcu_read_lock(); s = snapshot_t(c, id); - if (!s->parent) - return id; + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); - return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + return id; } static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) @@ -633,9 +724,6 @@ static int check_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *u; u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; - struct snapshot_t *parent = parent_id - ? snapshot_t(c, parent_id) - : NULL; struct printbuf buf = PRINTBUF; bool should_have_subvol; u32 i, id; @@ -726,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans, } ret = 0; - real_depth = parent ? parent->depth + 1 : 0; + real_depth = bch2_snapshot_depth(c, parent_id); if (le32_to_cpu(s.depth) != real_depth && (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || @@ -823,9 +911,13 @@ static int check_subvol(struct btree_trans *trans, if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree; + u32 snapshot_tree; struct bch_snapshot_tree st; + rcu_read_lock(); + snapshot_tree = snapshot_t(c, snapshot_root)->tree; + rcu_read_unlock(); + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -869,7 +961,7 @@ int bch2_check_subvols(struct bch_fs *c) void bch2_fs_snapshots_exit(struct bch_fs *c) { - genradix_free(&c->snapshots); + kfree(rcu_dereference_protected(c->snapshots, true)); } int bch2_snapshots_read(struct bch_fs *c) @@ -1011,7 +1103,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, struct bkey_i_snapshot *n; struct bkey_s_c k; unsigned i, j; - u32 depth = parent ? 
snapshot_t(c, parent)->depth + 1 : 0; + u32 depth = bch2_snapshot_depth(c, parent); int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, @@ -1150,7 +1242,7 @@ static int snapshot_delete_key(struct btree_trans *trans, struct bpos *last_pos) { struct bch_fs *c = trans->c; - u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index ab0b4a6de255..12a08a34e9bb 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -32,17 +32,40 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 24, \ }) -static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return genradix_ptr(&c->snapshots, U32_MAX - id); + return &t->s[U32_MAX - id]; } -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} + +static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { return snapshot_t(c, id)->parent; } -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { #ifdef CONFIG_BCACHEFS_DEBUG u32 parent = snapshot_t(c, id)->parent; @@ -59,10 +82,21 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) #endif } +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) { + rcu_read_lock(); while (n--) - id = bch2_snapshot_parent(c, id); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); return id; } @@ -71,37 +105,60 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) { u32 parent; - while ((parent = bch2_snapshot_parent(c, id))) + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) id = parent; + rcu_read_unlock(); + return id; } -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { return snapshot_t(c, id)->equiv; } +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) { - return id == snapshot_t(c, id)->equiv; + return id == bch2_snapshot_equiv(c, id); } -static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - struct snapshot_t *s = snapshot_t(c, id); + const struct snapshot_t *s; + bool ret; + + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); - return s->children[0] || s->children[1]; + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return 
!bch2_snapshot_is_internal_node(c, id); } static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) { - struct snapshot_t *s; - u32 parent = bch2_snapshot_parent(c, id); + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); if (!parent) return 0; - s = snapshot_t(c, bch2_snapshot_parent(c, id)); + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); if (id == s->children[0]) return s->children[1]; if (id == s->children[1]) @@ -113,9 +170,15 @@ bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - struct snapshot_t *t = snapshot_t(c, id); + const struct snapshot_t *t; + bool ret; - return (t->children[0]|t->children[1]) != 0; + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); + + return ret; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 750d975ac468..c596e4270690 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -16,6 +16,10 @@ struct snapshot_t { u32 equiv; }; +struct snapshot_table { + struct snapshot_t s[0]; +}; + typedef struct { u32 subvol; u64 inum; -- cgit From 9f343e24f541bef3d5f081925eae5734c2c39c28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 21:48:32 -0400 Subject: bcachefs: bch_opt_fn Minor refactoring to get rid of some unneeded token pasting. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 21 ++++++++++++++------- fs/bcachefs/disk_groups.h | 7 ++++++- fs/bcachefs/opts.c | 13 ++++--------- fs/bcachefs/opts.h | 11 +++++++++-- 4 files changed, 33 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 52b640077970..de14ca3a9895 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -460,30 +460,37 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) return ret; } -int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) { struct bch_dev *ca; int g; - if (!strlen(buf) || !strcmp(buf, "none")) { - *v = 0; + if (!val) + return -EINVAL; + + if (!c) + return 0; + + if (!strlen(val) || !strcmp(val, "none")) { + *res = 0; return 0; } /* Is it a device? 
*/ - ca = bch2_dev_lookup(c, buf); + ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { - *v = dev_to_target(ca->dev_idx); + *res = dev_to_target(ca->dev_idx); percpu_ref_put(&ca->ref); return 0; } mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, buf); + g = bch2_disk_path_find(&c->disk_sb, val); mutex_unlock(&c->sb_lock); if (g >= 0) { - *v = group_to_target(g); + *res = group_to_target(g); return 0; } diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index ec12584ceee7..bd7711767fd4 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -85,9 +85,14 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +#define bch2_opt_target (struct bch_opt_fn) { \ + .parse = bch2_opt_target_parse, \ + .to_text = bch2_opt_target_to_text, \ +} + int bch2_sb_disk_groups_to_cpu(struct bch_fs *); int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0c0c83fa4264..96c2f3c2fbce 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -167,11 +167,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices),\ + .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices -#define OPT_FN(_fn) .type = BCH_OPT_FN, \ - .parse = _fn##_parse, \ - .to_text = _fn##_to_text +#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ @@ -298,10 +296,7 @@ int bch2_opt_parse(struct bch_fs *c, *res = ret; break; case BCH_OPT_FN: - if (!c) - return 0; - - ret = opt->parse(c, val, res); + ret = opt->fn.parse(c, val, res, err); if (ret < 0) { if (err) prt_printf(err, "%s: parse error", @@ -344,7 +339,7 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%s", opt->choices[v]); break; case BCH_OPT_FN: - opt->to_text(out, c, sb, v); + opt->fn.to_text(out, c, sb, v); break; default: BUG(); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e105a742fd44..3be5095aa472 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -8,6 +8,8 @@ #include #include "bcachefs_format.h" +struct bch_fs; + extern const char * const bch2_error_actions[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; @@ -67,6 +69,11 @@ enum opt_type { BCH_OPT_FN, }; +struct bch_opt_fn { + int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +}; + /** * x(name, shortopt, type, in mem type, mode, sb_opt) * @@ -495,8 +502,8 @@ struct bch_option { u64 min, max; const char * const *choices; - int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + + struct bch_opt_fn fn; const char *hint; const char *help; -- cgit From a0f8faea5f47d6e18253225e8f2f88cdc49d27d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jul 2023 23:47:29 -0400 Subject: bcachefs: fix_errors option is now a proper enum Before, it was parsed as a bool 
but internally it was really an enum: this lets us pass in all the possible values. But we special case the option parsing: no supplied value is parsed as FSCK_FIX_yes, to match the previous behaviour. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/error.c | 12 +++--- fs/bcachefs/error.h | 7 ---- fs/bcachefs/opts.c | 102 ++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/opts.h | 17 ++++++++- fs/bcachefs/recovery.c | 2 +- 6 files changed, 95 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6000b09dec26..edea6bb66253 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1808,7 +1808,7 @@ again: if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations && - c->opts.fix_errors != FSCK_OPT_NO)) { + c->opts.fix_errors != FSCK_FIX_no)) { bch_info(c, "Starting topology repair pass"); ret = bch2_repair_topology(c); if (ret) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b08cd23dee00..685464b8cce3 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -204,7 +204,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) prt_str(out, ", continuing"); ret = -BCH_ERR_fsck_ignore; } - } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { + } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { @@ -212,7 +212,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) ? s->fix : c->opts.fix_errors; - if (fix == FSCK_OPT_ASK) { + if (fix == FSCK_FIX_ask) { int ask; prt_str(out, ": fix?"); @@ -223,13 +223,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO - ? FSCK_OPT_NO - : FSCK_OPT_YES; + ? FSCK_FIX_no + : FSCK_FIX_yes; ret = ask & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - } else if (fix == FSCK_OPT_YES || + } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", fixing"); @@ -244,7 +244,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
} if (ret == -BCH_ERR_fsck_ignore && - (c->opts.fix_errors == FSCK_OPT_EXIT || + (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index edf12443822c..7ce9540052e5 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -91,13 +91,6 @@ do { \ * be able to repair: */ -enum fsck_err_opts { - FSCK_OPT_EXIT, - FSCK_OPT_YES, - FSCK_OPT_NO, - FSCK_OPT_ASK, -}; - struct fsck_err_state { struct list_head list; const char *fmt; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 96c2f3c2fbce..56f586f8d25b 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -5,6 +5,7 @@ #include "bcachefs.h" #include "compress.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" #include "super-io.h" #include "util.h" @@ -16,6 +17,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_fsck_fix_opts[] = { + BCH_FIX_ERRORS_OPTS() + NULL +}; + const char * const bch2_version_upgrade_opts[] = { BCH_VERSION_UPGRADE_OPTS() NULL @@ -89,6 +95,37 @@ const char * const bch2_fs_usage_types[] = { #undef x +static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; + } else { + int ret = match_string(bch2_fsck_fix_opts, -1, val); + + if (ret < 0 && err) + prt_str(err, "fix_errors: invalid selection"); + if (ret < 0) + return ret; + *res = ret; + } + + return 0; +} + +static void bch2_opt_fix_errors_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + +static const struct bch_opt_fn bch2_opt_fix_errors = { + .parse = bch2_opt_fix_errors_parse, + .to_text = bch2_opt_fix_errors_to_text, +}; + const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", @@ -265,15 +302,26 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: - ret = kstrtou64(val, 10, res); + if (val) { + ret = kstrtou64(val, 10, res); + } else { + ret = 0; + *res = 1; + } + if (ret < 0 || (*res != 0 && *res != 1)) { if (err) - prt_printf(err, "%s: must be bool", - opt->attr.name); + prt_printf(err, "%s: must be bool", opt->attr.name); return ret; } break; case BCH_OPT_UINT: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + ret = opt->flags & OPT_HUMAN_READABLE ? 
bch2_strtou64_h(val, res) : kstrtou64(val, 10, res); @@ -285,6 +333,12 @@ int bch2_opt_parse(struct bch_fs *c, } break; case BCH_OPT_STR: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + ret = match_string(opt->choices, -1, val); if (ret < 0) { if (err) @@ -336,7 +390,7 @@ void bch2_opt_to_text(struct printbuf *out, if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else - prt_printf(out, "%s", opt->choices[v]); + prt_str(out, opt->choices[v]); break; case BCH_OPT_FN: opt->fn.to_text(out, c, sb, v); @@ -400,31 +454,19 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, name = strsep(&opt, "="); val = opt; - if (val) { - id = bch2_mount_opt_lookup(name); - if (id < 0) - goto bad_opt; + id = bch2_mount_opt_lookup(name); - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret < 0) - goto bad_val; - } else { - id = bch2_mount_opt_lookup(name); - v = 1; - - if (id < 0 && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - v = 0; - } - - if (id < 0) - goto bad_opt; - - if (bch2_opt_table[id].type != BCH_OPT_BOOL) - goto no_val; + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; } + if (id < 0) + goto bad_opt; + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; @@ -437,6 +479,10 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) goto bad_opt; + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + bch2_opt_set_by_id(opts, id, v); } @@ -451,10 +497,6 @@ bad_val: pr_err("Invalid mount option %s", err.buf); ret = -1; goto out; -no_val: - pr_err("Mount option %s requires a value", name); - ret = -1; - goto out; out: kfree(copied_opts_start); printbuf_exit(&err); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 3be5095aa472..92e2e5e759d9 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,6 +11,7 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; @@ -105,6 +106,18 @@ struct bch_opt_fn { #define BCACHEFS_VERBOSE_DEFAULT false #endif +#define BCH_FIX_ERRORS_OPTS() \ + x(exit, 0) \ + x(yes, 1) \ + x(no, 2) \ + x(ask, 3) + +enum fsck_err_opts { +#define x(t, n) FSCK_FIX_##t, + BCH_FIX_ERRORS_OPTS() +#undef x +}; + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FS|OPT_FORMAT| \ @@ -325,8 +338,8 @@ struct bch_opt_fn { NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_FN(bch2_opt_fix_errors), \ + BCH2_NO_SB_OPT, FSCK_FIX_exit, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c46297bd1cf9..63b385d8886a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1175,7 +1175,7 @@ static void check_version_upgrade(struct bch_fs *c) prt_str(&buf, "fsck required"); c->recovery_passes_explicit |= recovery_passes; - c->opts.fix_errors = FSCK_OPT_YES; + c->opts.fix_errors = FSCK_FIX_yes; } bch_info(c, "%s", buf.buf); -- cgit From a5cf5a4b41f3ca1ef7eb2d8fede50b2fa791b928 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 22:06:11 -0400 
Subject: bcachefs: bcachefs_format.h should be using __u64 Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6d693e4def5d..1dc178f3d72f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -695,7 +695,7 @@ struct bch_reservation { /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(u64)) + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) /* Maximum possible size of an entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ @@ -707,7 +707,7 @@ struct bch_reservation { /* Btree pointers don't carry around checksums: */ #define BKEY_BTREE_PTR_VAL_U64s_MAX \ ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) @@ -749,7 +749,7 @@ struct bch_inode_v3 { } __packed __aligned(8); #define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) struct bch_inode_generation { struct bch_val v; @@ -916,7 +916,7 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ +#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name))) @@ -1009,7 +1009,7 @@ struct bch_alloc_v4 { } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) @@ -1289,10 +1289,10 @@ struct bch_key { }; #define BCH_KEY_MAGIC \ - (((u64) 'b' << 0)|((u64) 'c' << 8)| \ - ((u64) 'h' << 16)|((u64) '*' << 24)| \ - ((u64) '*' << 32)|((u64) 'k' << 40)| \ - ((u64) 'e' << 48)|((u64) 'y' << 56)) + (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ + ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ + ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ + ((__u64) 'e' << 48)|((__u64) 'y' << 56)) struct bch_encrypted_key { __le64 magic; @@ -2272,7 +2272,7 @@ static inline __u64 BTREE_NODE_ID(struct btree_node *n) return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); } -static inline void SET_BTREE_NODE_ID(struct btree_node *n, u64 v) +static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) { SET_BTREE_NODE_ID_LO(n, v); SET_BTREE_NODE_ID_HI(n, v >> 4); -- cgit From e86e9124ca6c762f02cc412ce71feb9ed2e4890d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 22:06:37 -0400 Subject: bcachefs: Extent sb compression type fields to 8 bits The upper 4 bits are for compression level. 
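To make the layout concrete, here is a minimal standalone sketch of the split described above - an 8-bit value kept as two 4-bit superblock fields, with the new high nibble living in previously unused flag bits. The names and helpers below are illustrative only, not the in-tree LE64_BITMASK accessors:

#include <stdint.h>

/* illustrative only: low nibble = the original 4-bit field, high nibble = the new one */
static inline uint64_t sb_compression_type(uint64_t lo4, uint64_t hi4)
{
	return (lo4 & 0xf) | ((hi4 & 0xf) << 4);
}

static inline void sb_compression_type_split(uint64_t v, uint64_t *lo4, uint64_t *hi4)
{
	*lo4 = v & 0xf;		/* stays in the original flags[1] slot */
	*hi4 = (v >> 4) & 0xf;	/* goes in the formerly unused flags[4] bits */
}

Keeping the low nibble where it was means old superblocks read back unchanged: their high bits were never written, so they decode as zero and the combined value equals the old 4-bit one.
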
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1dc178f3d72f..5c308f8421c5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1747,7 +1747,7 @@ LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); @@ -1767,7 +1767,7 @@ LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, struct bch_sb, flags[2], 0, 4); LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); @@ -1783,11 +1783,36 @@ LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); -/* flags[4] 56-64 unused: */ +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, + struct bch_sb, flags[4], 60, 64); LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | + (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); +} + /* * Features: * -- cgit From 986e9842fb6825f65918ed400b29c8c878359b7a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jul 2023 22:27:16 -0400 Subject: bcachefs: Compression levels This allows including a compression level when specifying a compression type, e.g. compression=zstd:15 Values from 1 through 15 indicate compression levels, 0 or unspecified indicates the default. For LZ4, values 3-15 specify that the HC algorithm should be used. Note that for compatibility, extents themselves only include the compression type, not the compression level. This means that specifying the same compression algorithm but different compression levels for the compression and background_compression options will have no effect. 
XXX: perhaps we could add a warning for this Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 + fs/bcachefs/checksum.h | 6 -- fs/bcachefs/compress.c | 157 +++++++++++++++++++++++++++++++++++----------- fs/bcachefs/compress.h | 37 +++++++++++ fs/bcachefs/data_update.c | 4 +- fs/bcachefs/io.c | 14 ++--- fs/bcachefs/io.h | 2 +- fs/bcachefs/io_types.h | 2 +- fs/bcachefs/opts.h | 4 +- fs/bcachefs/rebalance.c | 3 +- 10 files changed, 174 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 49776ba0a031..df13a4f9a6e3 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -9,6 +9,8 @@ config BCACHEFS_FS select FS_POSIX_ACL select LZ4_COMPRESS select LZ4_DECOMPRESS + select LZ4HC_COMPRESS + select LZ4HC_DECOMPRESS select ZLIB_DEFLATE select ZLIB_INFLATE select ZSTD_COMPRESS diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 409ad534d9f4..1ad1d5f03939 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -120,12 +120,6 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } -static const unsigned bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - static inline bool bch2_checksum_type_valid(const struct bch_fs *c, unsigned type) { diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 48427a270840..560214c15da3 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -296,21 +296,32 @@ static int attempt_compress(struct bch_fs *c, void *workspace, void *dst, size_t dst_len, void *src, size_t src_len, - enum bch_compression_type compression_type) + struct bch_compression_opt compression) { - switch (compression_type) { - case BCH_COMPRESSION_TYPE_lz4: { - int len = src_len; - int ret = LZ4_compress_destSize( - src, dst, - &len, dst_len, - workspace); - - if (len < src_len) - return -len; + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; - return ret; - } + switch (compression_type) { + case BCH_COMPRESSION_TYPE_lz4: + if (compression.level < LZ4HC_MIN_CLEVEL) { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, + &len, dst_len, + workspace); + if (len < src_len) + return -len; + + return ret; + } else { + int ret = LZ4_compress_HC( + src, dst, + src_len, dst_len, + compression.level, + workspace); + + return ret ?: -1; + } case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src, @@ -320,7 +331,11 @@ static int attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + zlib_deflateInit2(&strm, + compression.level + ? 
clamp_t(unsigned, compression.level, + Z_BEST_SPEED, Z_BEST_COMPRESSION) + : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); @@ -333,8 +348,14 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } case BCH_COMPRESSION_TYPE_zstd: { + /* + * rescale: + * zstd max compression level is 22, our max level is 15 + */ + unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); + ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); ZSTD_CCtx *ctx = zstd_init_cctx(workspace, - zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + zstd_cctx_workspace_bound(¶ms.cParams)); /* * ZSTD requires that when we decompress we pass in the exact @@ -365,10 +386,12 @@ static int attempt_compress(struct bch_fs *c, static unsigned __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - enum bch_compression_type compression_type) + struct bch_compression_opt compression) { struct bbuf src_data = { NULL }, dst_data = { NULL }; void *workspace; + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; unsigned pad; int ret = 0; @@ -400,7 +423,7 @@ static unsigned __bio_compress(struct bch_fs *c, ret = attempt_compress(c, workspace, dst_data.b, *dst_len, src_data.b, *src_len, - compression_type); + compression); if (ret > 0) { *dst_len = ret; ret = 0; @@ -447,22 +470,24 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); BUG_ON(*dst_len & (block_bytes(c) - 1)); BUG_ON(*src_len & (block_bytes(c) - 1)); + ret = compression_type; out: bio_unmap_or_unbounce(c, src_data); bio_unmap_or_unbounce(c, dst_data); - return compression_type; + return ret; err: - compression_type = BCH_COMPRESSION_TYPE_incompressible; + ret = BCH_COMPRESSION_TYPE_incompressible; goto out; } unsigned bch2_bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - unsigned compression_type) + unsigned compression_opt) { unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; + unsigned compression_type; /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, @@ -470,11 +495,9 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) - compression_type = BCH_COMPRESSION_TYPE_lz4; - compression_type = - __bio_compress(c, dst, dst_len, src, src_len, compression_type); + __bio_compress(c, dst, dst_len, src, src_len, + bch2_compression_decode(compression_opt)); dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; @@ -521,8 +544,10 @@ static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) } int bch2_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_type) + unsigned compression_opt) { + unsigned compression_type = bch2_compression_decode(compression_opt).type; + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); return compression_type @@ -546,14 +571,16 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); + ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), + 
c->opts.encoded_extent_max); struct { - unsigned feature; - unsigned type; - size_t compress_workspace; - size_t decompress_workspace; + unsigned feature; + enum bch_compression_type type; + size_t compress_workspace; + size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, @@ -612,16 +639,74 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; } +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; + return 1ULL << bch2_compression_opt_to_feature[type]; +} + int bch2_fs_compress_init(struct bch_fs *c) { u64 f = c->sb.features; - if (c->opts.compression) - f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; - - if (c->opts.background_compression) - f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; + f |= compression_opt_to_feature(c->opts.compression); + f |= compression_opt_to_feature(c->opts.background_compression); return __bch2_fs_compress_init(c, f); +} + +int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + struct printbuf *err) +{ + char *val = kstrdup(_val, GFP_KERNEL); + char *p = val, *type_str, *level_str; + struct bch_compression_opt opt = { 0 }; + int ret; + + if (!val) + return -ENOMEM; + + type_str = strsep(&p, ":"); + level_str = p; + + ret = match_string(bch2_compression_opts, -1, type_str); + if (ret < 0 && err) + prt_str(err, "invalid compression type"); + if (ret < 0) + goto err; + + opt.type = ret; + + if (level_str) { + unsigned level; + + ret = kstrtouint(level_str, 10, &level); + if (!ret && !opt.type && level) + ret = -EINVAL; + if (!ret && level > 15) + ret = -EINVAL; + if (ret < 0 && err) + prt_str(err, "invalid compression level"); + if (ret < 0) + goto err; + + opt.level = level; + } + + *res = bch2_compression_encode(opt); +err: + kfree(val); + return ret; +} + +void bch2_opt_compression_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + prt_str(out, bch2_compression_opts[opt.type]); + if (opt.level) + prt_printf(out, ":%u", opt.level); } diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 4bab1f61b3b5..052ea303241f 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -4,6 +4,35 @@ #include "extents_types.h" +struct bch_compression_opt { + u8 type:4, + level:4; +}; + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return (struct bch_compression_opt) { + .type = v & 15, + .level = v >> 4, + }; +} + +static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +{ + return opt.type|(opt.level << 4); +} + +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) +{ + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, @@ -15,4 +44,12 @@ int 
bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); int bch2_fs_compress_init(struct bch_fs *); +int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +#define bch2_opt_compression (struct bch_opt_fn) { \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ +} + #endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 3c918368b2ec..cfc624463700 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -455,9 +455,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; + m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 33762e4a0f05..8604df80a3e2 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1078,7 +1078,7 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? */ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == op->compression_type || + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && @@ -1126,7 +1126,7 @@ static enum prep_encoded_ret { /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_type || + if ((op->compression_opt || bch2_csum_type_is_encryption(op->crc.csum_type) != bch2_csum_type_is_encryption(op->csum_type)) && bch2_write_decrypt(op)) @@ -1173,7 +1173,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } if (ec_buf || - op->compression_type || + op->compression_opt || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && @@ -1196,16 +1196,16 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; - BUG_ON(op->compression_type && + BUG_ON(op->compression_opt && (op->flags & BCH_WRITE_DATA_ENCODED) && bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_type && !bounce); + BUG_ON(op->compression_opt && !bounce); crc.compression_type = op->incompressible ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_type + : op->compression_opt ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + op->compression_opt) : 0; if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 7a243a5f3f89..1476380d5fbf 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -86,7 +86,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->written = 0; op->error = 0; op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->compression_opt = opts.compression; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->watermark = BCH_WATERMARK_normal; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 0fbdfbf90ad8..737f16d78c48 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -115,8 +115,8 @@ struct bch_write_op { u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + unsigned compression_opt:8; unsigned csum_type:4; - unsigned compression_type:4; unsigned nr_replicas:4; unsigned nr_replicas_required:4; unsigned watermark:3; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 92e2e5e759d9..8a9db110d64f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -174,12 +174,12 @@ enum fsck_err_opts { NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_compression_opts), \ + OPT_FN(bch2_opt_compression), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_compression_opts), \ + OPT_FN(bch2_opt_compression), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 989f37a3b46a..c3d577236ce2 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -5,6 +5,7 @@ #include "btree_iter.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "errcode.h" #include "extents.h" @@ -45,7 +46,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached && p.crc.compression_type != - bch2_compression_opt_to_type[io_opts->background_compression]) + bch2_compression_opt_to_type(io_opts->background_compression)) data_opts->rewrite_ptrs |= 1U << i; i++; } -- cgit From 5eaa76d813d48a524a1ce040539048b851a0a20c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 13 Jul 2023 18:00:28 +0200 Subject: bcachefs: mark bch_inode_info and bkey_cached as reclaimable Mark these caches as reclaimable, so that available memory is correctly reported when there is a lot of cached inodes. Note that more work is needed - you should add __GFP_RECLAIMABLE to some of the kmalloc calls, so that they are allocated from the "kmalloc-rcl-*" caches. 
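For reference, a minimal sketch of the two patterns that note refers to - a dedicated slab cache created with SLAB_RECLAIM_ACCOUNT, and a plain kmalloc() tagged __GFP_RECLAIMABLE so it is served from the kmalloc-rcl-* caches. The struct and cache below are hypothetical, not bcachefs objects:

#include <linux/init.h>
#include <linux/slab.h>

struct demo_object {
	int payload;
};

static struct kmem_cache *demo_cache;

static int __init demo_init(void)
{
	void *buf;

	/* dedicated cache, accounted as reclaimable memory */
	demo_cache = KMEM_CACHE(demo_object, SLAB_RECLAIM_ACCOUNT);
	if (!demo_cache)
		return -ENOMEM;

	/* generic allocation routed to the kmalloc-rcl-* caches */
	buf = kmalloc(4096, GFP_KERNEL | __GFP_RECLAIMABLE);
	kfree(buf);

	return 0;
}
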
Signed-off-by: Mikulas Patocka Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/fs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 34d959c4e640..badb541f493f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -1066,7 +1066,7 @@ void bch2_btree_key_cache_exit(void) int __init bch2_btree_key_cache_init(void) { - bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); if (!bch2_key_cache) return -ENOMEM; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 9280f514bc9f..e1824bdffdf8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1926,7 +1926,7 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); if (!bch2_inode_cache) goto err; -- cgit From 6132c84cacbff39e7b060abffc4175244347885d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jul 2023 02:43:29 -0400 Subject: bcachefs: is_ancestor bitmap Further optimization for bch2_snapshot_is_ancestor(). We add a small inline bitmap to snapshot_t, which indicates which of the next 128 snapshot IDs are ancestors of the current id - eliminating the last few iterations of the loop in bch2_snapshot_is_ancestor(). Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 23 +++++++++++++++-------- fs/bcachefs/subvolume_types.h | 3 +++ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index c2c2cfd74e71..cf8af617ac00 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -28,17 +28,22 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { struct snapshot_table *t; + bool ret; EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); rcu_read_lock(); t = rcu_dereference(c->snapshots); - while (id && id < ancestor) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); + + ret = id && id < ancestor + ? 
test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) + : id == ancestor; rcu_read_unlock(); - return id == ancestor; + return ret; } static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) @@ -289,11 +294,12 @@ int bch2_mark_snapshot(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct snapshot_t *t; + u32 id = new.k->p.offset; int ret = 0; mutex_lock(&c->snapshot_table_lock); - t = snapshot_t_mut(c, new.k->p.offset); + t = snapshot_t_mut(c, id); if (!t) { ret = -BCH_ERR_ENOMEM_mark_snapshot; goto err; @@ -301,6 +307,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + u32 parent = id; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); @@ -320,14 +327,14 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); + if (BCH_SNAPSHOT_DELETED(s.v)) set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); } else { - t->parent = 0; - t->children[0] = 0; - t->children[1] = 0; - t->subvol = 0; - t->tree = 0; + memset(t, 0, sizeof(*t)); } err: mutex_unlock(&c->snapshot_table_lock); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index c596e4270690..86833445af20 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -6,6 +6,8 @@ typedef DARRAY(u32) snapshot_id_list; +#define IS_ANCESTOR_BITMAP 128 + struct snapshot_t { u32 parent; u32 skip[3]; @@ -14,6 +16,7 @@ struct snapshot_t { u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; u32 equiv; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; struct snapshot_table { -- cgit From 813e0cecd1473d7328376f9ca56f9239bc5b8146 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Jul 2023 21:03:26 -0400 Subject: bcachefs: Upgrade path fixes Some minor fixes to not print errors that are actually due to a verson upgrade. 
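The underlying idea, as a small self-contained sketch (placeholder version number and helper name, not the real ones): a structure that only exists once a feature upgrade has finished should be judged against the version the upgrade has completed to, not the on-disk version field, otherwise a half-finished upgrade looks like corruption.

#include <stdbool.h>
#include <stdint.h>

#define VERSION_BACKPOINTERS 20	/* placeholder, not the real version number */

static bool missing_backpointer_is_error(uint16_t version_upgrade_complete)
{
	/* only report an error once the upgrade that adds backpointers has finished */
	return version_upgrade_complete >= VERSION_BACKPOINTERS;
}
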
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/super-io.c | 5 ++++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index d412bae553a0..7c1e6546d054 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -477,7 +477,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->sb.version < bcachefs_metadata_version_backpointers || + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || fsck_err(c, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 445d010c83b3..6e8dc25d41db 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -677,11 +677,11 @@ enum bch_write_ref { x(check_alloc_to_lru_refs, PASS_FSCK) \ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 0) \ - x(fs_upgrade_for_subvolumes, 0) \ x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN|PASS_SILENT) \ + x(fs_upgrade_for_subvolumes, 0) \ x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ x(check_extents, PASS_FSCK) \ x(check_dirents, PASS_FSCK) \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 797ef5eceb3f..eb486467df17 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1272,7 +1272,7 @@ int bch2_mark_reflink_p(struct btree_trans *trans, BUG_ON(!(flags & BTREE_TRIGGER_GC)); - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { idx -= le32_to_cpu(p.v->front_pad); end += le32_to_cpu(p.v->back_pad); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index deef31a617c4..cea7c7caa1c0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -422,6 +422,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + + if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { @@ -496,7 +499,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); c->sb.version_min = le16_to_cpu(src->version_min); - c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src) ?: c->sb.version; + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -- cgit From 93de9e92c32c73d25bccbd82b1dabff3e3c2b7d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 15:59:40 -0400 Subject: bcachefs: Inline bch2_snapshot_is_ancestor() fast path Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/subvolume.h | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index cf8af617ac00..287492c29bcc 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -25,7 +25,7 @@ static inline u32 get_ancestor_below(struct snapshot_table 
*t, u32 id, u32 ances return s->parent; } -bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { struct snapshot_table *t; bool ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 12a08a34e9bb..6905e91a9470 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -166,7 +166,14 @@ static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) return 0; } -bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); +bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + return id == ancestor + ? true + : __bch2_snapshot_is_ancestor(c, id, ancestor); +} static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { -- cgit From 5897505e67f542452b4f26b6eba949c71e3da7d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 14:33:57 -0400 Subject: bcachefs: check_extents(): make sure to check i_sectors for last inode Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index bc769b3e932a..f0a760a1701d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1426,7 +1426,8 @@ int bch2_check_extents(struct bch_fs *c) BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ bch2_disk_reservation_put(c, &res); check_extent(&trans, &iter, k, &w, &s, &extent_ends); - })); + })) ?: + check_i_sectors(&trans, &w); bch2_disk_reservation_put(c, &res); extent_ends_reset(&extent_ends); -- cgit From f9f52bc4f0342ddc68690ebdc581eedfbd4a9c6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 14:24:36 -0400 Subject: bcachefs: fsck: inode_walker: last_pos, seen_this_pos Prep work for changing check_extent() to avoid key_visible_in_snapshot() - this adds the state to track whether an inode has seen an extent at this pos. 
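A stripped-down sketch of that state, using stand-in types rather than the real bcachefs ones: the walker remembers the last key position it saw, and clears each entry's per-position flag whenever the walk advances within the same inode.

#include <stdbool.h>
#include <stdint.h>

struct pos { uint64_t inode; uint64_t offset; uint32_t snapshot; };

struct walker_entry {
	uint32_t snapshot;
	bool     seen_this_pos;
};

struct walker {
	struct pos          last_pos;
	unsigned            nr;
	struct walker_entry entries[16];	/* fixed size only for the sketch */
};

static bool pos_eq(struct pos a, struct pos b)
{
	return a.inode == b.inode && a.offset == b.offset && a.snapshot == b.snapshot;
}

static void walker_advance(struct walker *w, struct pos pos)
{
	if (w->last_pos.inode != pos.inode) {
		w->nr = 0;		/* new inode: caller reloads the entries */
	} else if (!pos_eq(w->last_pos, pos)) {
		for (unsigned i = 0; i < w->nr; i++)
			w->entries[i].seen_this_pos = false;
	}

	w->last_pos = pos;
}
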
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f0a760a1701d..45e5a52a106d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -606,12 +606,13 @@ static int ref_visible2(struct bch_fs *c, struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; + bool seen_this_pos; u64 count; }; struct inode_walker { bool first_this_inode; - u64 cur_inum; + struct bpos last_pos; DARRAY(struct inode_walker_entry) inodes; }; @@ -648,9 +649,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, u32 restart_count = trans->restart_count; int ret; - if (w->cur_inum == inum) - return 0; - w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), @@ -666,8 +664,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, if (ret) return ret; - w->cur_inum = inum; - w->first_this_inode = true; + w->first_this_inode = true; if (trans_was_restarted(trans, restart_count)) return -BCH_ERR_transaction_restart_nested; @@ -699,7 +696,7 @@ found: new.count = 0; bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->cur_inum, snapshot, i->snapshot); + w->last_pos.inode, snapshot, i->snapshot); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; @@ -715,9 +712,19 @@ found: static struct inode_walker_entry *walk_inode(struct btree_trans *trans, struct inode_walker *w, struct bpos pos) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); - if (ret) - return ERR_PTR(ret); + if (w->last_pos.inode != pos.inode) { + int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (ret) + return ERR_PTR(ret); + } else if (bkey_cmp(w->last_pos, pos)) { + struct inode_walker_entry *i; + + darray_for_each(w->inodes, i) + i->seen_this_pos = false; + + } + + w->last_pos = pos; return lookup_inode_for_snapshot(trans->c, w, pos.snapshot); } @@ -1128,7 +1135,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); if (i->count != count2) { bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", @@ -1140,7 +1147,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->cur_inum, i->snapshot, + w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = write_inode(trans, &i->inode, i->snapshot); @@ -1302,7 +1309,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (inode->cur_inum != k.k->p.inode) { + if (inode->last_pos.inode != k.k->p.inode) { ret = check_i_sectors(trans, inode); if (ret) goto err; @@ -1453,7 +1460,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); if (count2 < 0) return count2; @@ -1467,7 +1474,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) if (fsck_err_on(i->inode.bi_nlink 
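The condition being re-verified in the btree, reduced to a toy model (real extents also carry snapshot IDs, and the check only matters when one snapshot is visible from the other): two extents in the same inode, each covering a half-open [start, end) range of sectors, overlap iff each one starts before the other ends.

#include <stdbool.h>
#include <stdint.h>

struct toy_extent { uint64_t start, end; };	/* half-open [start, end) range */

static bool toy_extents_overlap(struct toy_extent a, struct toy_extent b)
{
	return a.start < b.end && b.start < a.end;
}
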
!= i->count, c, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = write_inode(trans, &i->inode, i->snapshot); if (ret) @@ -1631,7 +1638,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (dir->cur_inum != k.k->p.inode) { + if (dir->last_pos.inode != k.k->p.inode) { ret = check_subdir_count(trans, dir); if (ret) goto err; -- cgit From 43b81a4eaca01ef13293d139670981cc4da18d3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jul 2023 03:11:16 -0400 Subject: bcachefs: overlapping_extents_found() This improves the repair path for overlapping extents - we now verify that we find in the btree the overlapping extents that the algorithm detected, and fail the fsck run with a more useful error if it doesn't match. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fsck.c | 120 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 84 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index d5277ec7372f..a759493fee44 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -171,6 +171,7 @@ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ + x(EINVAL, internal_fsck_err) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 45e5a52a106d..e14e950fcdbc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -612,6 +612,7 @@ struct inode_walker_entry { struct inode_walker { bool first_this_inode; + bool recalculate_sums; struct bpos last_pos; DARRAY(struct inode_walker_entry) inodes; @@ -649,6 +650,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, u32 restart_count = trans->restart_count; int ret; + w->recalculate_sums = false; w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), @@ -1137,12 +1139,13 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); - if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", - i->count, count2); + if (w->recalculate_sums) i->count = count2; - if (i->inode.bi_sectors == i->count) - continue; + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); + return -BCH_ERR_internal_fsck_err; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, @@ -1171,23 +1174,78 @@ struct extent_end { typedef DARRAY(struct extent_end) extent_ends; -static int get_print_extent(struct btree_trans *trans, struct bpos pos, struct printbuf *out) +static int overlapping_extents_found(struct btree_trans *trans, + enum btree_id btree, + struct bpos pos1, struct bkey pos2, + bool *fixed) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; struct btree_iter iter; struct bkey_s_c k; + u32 snapshot = min(pos1.snapshot, pos2.p.snapshot); int ret; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOT_EXTENTS); + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + + bch2_trans_iter_init(trans, &iter, 
btree, SPOS(pos1.inode, pos1.offset - 1, snapshot), 0); + k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); ret = bkey_err(k); if (ret) - return ret; + goto err; + + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + + if (!bpos_eq(pos1, k.k->p)) { + bch_err(c, "%s: error finding first overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + + while (1) { + bch2_btree_iter_advance(&iter); + + k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_ge(k.k->p, pos2.p)) + break; + + } + + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + + if (bkey_gt(k.k->p, pos2.p) || + pos2.size != k.k->size) { + bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } - bch2_bkey_val_to_text(out, trans->c, k); + if (fsck_err(c, "overlapping extents%s", buf.buf)) { + struct bpos update_pos = pos1.snapshot < pos2.p.snapshot ? pos1 : pos2.p; + struct btree_iter update_iter; + + struct bkey_i *update = bch2_bkey_get_mut(trans, &update_iter, + btree, update_pos, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + *fixed = true; + } +fsck_err: +err: bch2_trans_iter_exit(trans, &iter); - return 0; + printbuf_exit(&buf); + return ret; } static int check_overlapping_extents(struct btree_trans *trans, @@ -1198,7 +1256,7 @@ static int check_overlapping_extents(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct extent_end *i; - struct printbuf buf = PRINTBUF; + bool fixed = false; int ret = 0; darray_for_each(*extent_ends, i) { @@ -1212,33 +1270,18 @@ static int check_overlapping_extents(struct btree_trans *trans, i->snapshot, &i->seen)) continue; - if (i->offset <= bkey_start_offset(k.k)) - continue; - - printbuf_reset(&buf); - prt_str(&buf, "overlapping extents:\n "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\n "); - - ret = get_print_extent(trans, SPOS(k.k->p.inode, i->offset, i->snapshot), &buf); - if (ret) - break; - - if (fsck_err(c, "%s", buf.buf)) { - struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; - bkey_reassemble(update, k); - ret = bch2_trans_update_extent(trans, iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (i->offset > bkey_start_offset(k.k)) { + ret = overlapping_extents_found(trans, iter->btree_id, + SPOS(iter->pos.inode, + i->offset, + i->snapshot), + *k.k, &fixed); if (ret) goto err; } } err: -fsck_err: - printbuf_exit(&buf); - return ret; + return ret ?: fixed; } static int extent_ends_at(extent_ends *extent_ends, @@ -1320,9 +1363,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!iter->path->should_be_locked); ret = check_overlapping_extents(trans, s, extent_ends, k, iter); - if (ret) + if (ret < 0) goto err; + if (ret) + inode->recalculate_sums = true; + ret = extent_ends_at(extent_ends, s, k); if (ret) goto err; -- cgit From 0d8f320dd7e8fb4f1ac125f168e939b952ca3f8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jul 2023 01:41:02 -0400 Subject: bcachefs: Simplify check_extent() Minor refactoring/dead code deletion, prep work for reworking check_extent() to avoid key_visible_in_snapshot(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e14e950fcdbc..aa11f33c2384 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1333,18 +1333,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv; + struct bpos equiv = k.k->p; int ret = 0; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + ret = check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -1360,8 +1359,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, extent_ends_reset(extent_ends); } - BUG_ON(!iter->path->should_be_locked); - ret = check_overlapping_extents(trans, s, extent_ends, k, iter); if (ret < 0) goto err; @@ -1381,11 +1378,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(!i, c, "extent in missing inode:\n %s", (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - goto out; - } + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; if (!i) goto out; @@ -1395,11 +1389,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - goto out; - } + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; /* * Check inodes in reverse order, from oldest snapshots to newest, so @@ -1440,10 +1431,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (bkey_extent_is_allocation(k.k)) for_each_visible_inode(c, s, inode, equiv.snapshot, i) i->count += k.k->size; -#if 0 - bch2_bkey_buf_reassemble(&prev, c, k); -#endif - out: err: fsck_err: @@ -1452,6 +1439,9 @@ fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_fn(c, ret); return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; } /* -- cgit From a57f4d611320a38b1985b2e5ba342d923b35964f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 14:19:08 -0400 Subject: bcachefs: fsck: walk_inode() now takes is_whiteout We only want to synthesize an inode for the current snapshot ID for non whiteouts - this refactoring lets us call walk_inode() earlier and clean up some control flow. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aa11f33c2384..f3e3cf475c23 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -675,8 +675,8 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, - struct inode_walker *w, u32 snapshot) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, + u32 snapshot, bool is_whiteout) { struct inode_walker_entry *i; @@ -690,7 +690,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, found: BUG_ON(snapshot > i->snapshot); - if (snapshot != i->snapshot) { + if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; int ret; @@ -712,7 +712,8 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos) + struct inode_walker *w, struct bpos pos, + bool is_whiteout) { if (w->last_pos.inode != pos.inode) { int ret = get_inodes_all_snapshots(trans, w, pos.inode); @@ -728,7 +729,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, w->last_pos = pos; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot); + return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); } static int __get_visible_inodes(struct btree_trans *trans, @@ -1359,6 +1360,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, extent_ends_reset(extent_ends); } + i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + goto err; + ret = check_overlapping_extents(trans, s, extent_ends, k, iter); if (ret < 0) goto err; @@ -1370,11 +1376,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - i = walk_inode(trans, inode, equiv); - ret = PTR_ERR_OR_ZERO(i); - if (ret) - goto err; - if (fsck_err_on(!i, c, "extent in missing inode:\n %s", (printbuf_reset(&buf), @@ -1682,7 +1683,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!iter->path->should_be_locked); - i = walk_inode(trans, dir, equiv); + i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1859,7 +1860,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p); + i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; -- cgit From 650eb16b45183c7ea7c17e56ac862dc1ad1ec849 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 14:45:23 -0400 Subject: bcachefs: check_extent() refactoring More prep work for reducing key_visible_in_snapshot() usage - this rearranges how KEY_TYPE_whitout keys are handled, so that they can be marked off in inode_warker->inode->seen_this_pos. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 99 +++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f3e3cf475c23..3a3d89bdf1c9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1345,13 +1345,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - if (inode->last_pos.inode != k.k->p.inode) { ret = check_i_sectors(trans, inode); if (ret) @@ -1365,66 +1358,74 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - ret = check_overlapping_extents(trans, s, extent_ends, k, iter); - if (ret < 0) - goto err; - - if (ret) - inode->recalculate_sums = true; - - ret = extent_ends_at(extent_ends, s, k); + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; - if (fsck_err_on(!i, c, - "extent in missing inode:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!i, c, + "extent in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && + !S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; - if (!i) - goto out; + ret = check_overlapping_extents(trans, s, extent_ends, k, iter); + if (ret < 0) + goto err; - if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), c, - "extent in non regular inode mode %o:\n %s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (ret) + inode->recalculate_sums = true; + + ret = extent_ends_at(extent_ends, s, k); + if (ret) + goto err; + } /* * Check inodes in reverse order, from oldest snapshots to newest, so * that we emit the fewest number of whiteouts necessary: */ for (i = inode->inodes.data + inode->inodes.nr - 1; - i >= inode->inodes.data; + inode->inodes.data && i >= inode->inodes.data; --i) { if (i->snapshot > equiv.snapshot || !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && - !bkey_extent_is_reservation(k), c, - "extent type past end of inode %llu:%u, i_size %llu\n %s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; - - bch2_trans_copy_iter(&iter2, iter); - bch2_btree_iter_set_snapshot(&iter2, i->snapshot); - ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &iter2); - if (ret) - goto err; - - if (i->snapshot != equiv.snapshot) { - ret = snapshots_seen_add(c, s, i->snapshot); + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + !bkey_extent_is_reservation(k), c, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + 
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; + + if (i->snapshot != equiv.snapshot) { + ret = snapshots_seen_add(c, s, i->snapshot); + if (ret) + goto err; + } + + iter->k.type = KEY_TYPE_whiteout; } } } -- cgit From a0076086da73297228665957c3b3bfac2492d67d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 14:55:33 -0400 Subject: bcachefs: check_extent(): don't use key_visible_in_snapshot() This changes the main part of check_extents(), that checks the extent against the corresponding inode, to not use key_visible_in_snapshot(). key_visible_in_snapshot() has to iterate over the list of ancestor overwrites repeatedly calling bch2_snapshot_is_ancestor(), so this is a significant performance improvement. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3a3d89bdf1c9..784e55166df2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -471,28 +471,6 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) memset(s, 0, sizeof(*s)); } -static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - struct snapshots_seen_entry *i, n = { id, id }; - int ret; - - darray_for_each(s->ids, i) { - if (n.equiv < i->equiv) - break; - - if (i->equiv == n.equiv) { - bch_err(c, "%s(): adding duplicate snapshot", __func__); - return -EINVAL; - } - } - - ret = darray_insert_item(&s->ids, i - s->ids.data, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { @@ -1391,10 +1369,14 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, } /* - * Check inodes in reverse order, from oldest snapshots to newest, so - * that we emit the fewest number of whiteouts necessary: + * Check inodes in reverse order, from oldest snapshots to newest, + * starting from the inode that matches this extent's snapshot. If we + * didn't have one, iterate over all inodes: */ - for (i = inode->inodes.data + inode->inodes.nr - 1; + if (!i) + i = inode->inodes.data + inode->inodes.nr - 1; + + for (; inode->inodes.data && i >= inode->inodes.data; --i) { if (i->snapshot > equiv.snapshot || @@ -1419,20 +1401,15 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (i->snapshot != equiv.snapshot) { - ret = snapshots_seen_add(c, s, i->snapshot); - if (ret) - goto err; - } - iter->k.type = KEY_TYPE_whiteout; } + + if (bkey_extent_is_allocation(k.k)) + i->count += k.k->size; } - } - if (bkey_extent_is_allocation(k.k)) - for_each_visible_inode(c, s, inode, equiv.snapshot, i) - i->count += k.k->size; + i->seen_this_pos = true; + } out: err: fsck_err: -- cgit From a397b8df5e2f2981427f2609252f37066a0cf780 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 15:12:25 -0400 Subject: bcachefs: Refactor overlapping extent checks Make the overlapping extent check/repair code more self contained. 
This is prep work for hopefully reducing key_visible_in_snapshot() usage here as well, and also includes a nice performance optimization to not check ref_visible2() unless the extents potentially overlap. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 153 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 784e55166df2..e40040001ac1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1151,7 +1151,62 @@ struct extent_end { struct snapshots_seen seen; }; -typedef DARRAY(struct extent_end) extent_ends; +struct extent_ends { + struct bpos last_pos; + DARRAY(struct extent_end) e; +}; + +static void extent_ends_reset(struct extent_ends *extent_ends) +{ + struct extent_end *i; + + darray_for_each(extent_ends->e, i) + snapshots_seen_exit(&i->seen); + + extent_ends->e.nr = 0; +} + +static void extent_ends_exit(struct extent_ends *extent_ends) +{ + extent_ends_reset(extent_ends); + darray_exit(&extent_ends->e); +} + +static void extent_ends_init(struct extent_ends *extent_ends) +{ + memset(extent_ends, 0, sizeof(*extent_ends)); +} + +static int extent_ends_at(struct bch_fs *c, + struct extent_ends *extent_ends, + struct snapshots_seen *seen, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct extent_end) { + .offset = k.k->p.offset, + .snapshot = k.k->p.snapshot, + .seen = *seen, + }; + + n.seen.ids.data = kmemdup(seen->ids.data, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) + return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + + darray_for_each(extent_ends->e, i) { + if (i->snapshot == k.k->p.snapshot) { + snapshots_seen_exit(&i->seen); + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; + } + + return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); +} static int overlapping_extents_found(struct btree_trans *trans, enum btree_id btree, @@ -1229,8 +1284,9 @@ err: static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, - extent_ends *extent_ends, + struct extent_ends *extent_ends, struct bkey_s_c k, + u32 equiv, struct btree_iter *iter) { struct bch_fs *c = trans->c; @@ -1238,10 +1294,15 @@ static int check_overlapping_extents(struct btree_trans *trans, bool fixed = false; int ret = 0; - darray_for_each(*extent_ends, i) { - /* duplicate, due to transaction restart: */ - if (i->offset == k.k->p.offset && - i->snapshot == k.k->p.snapshot) + /* transaction restart, running again */ + if (bpos_eq(extent_ends->last_pos, k.k->p)) + return 0; + + if (extent_ends->last_pos.inode != k.k->p.inode) + extent_ends_reset(extent_ends); + + darray_for_each(extent_ends->e, i) { + if (i->offset <= bkey_start_offset(k.k)) continue; if (!ref_visible2(c, @@ -1249,65 +1310,29 @@ static int check_overlapping_extents(struct btree_trans *trans, i->snapshot, &i->seen)) continue; - if (i->offset > bkey_start_offset(k.k)) { - ret = overlapping_extents_found(trans, iter->btree_id, - SPOS(iter->pos.inode, - i->offset, - i->snapshot), - *k.k, &fixed); - if (ret) - goto err; - } - } -err: - return ret ?: fixed; -} - -static int extent_ends_at(extent_ends *extent_ends, - struct snapshots_seen *seen, - struct bkey_s_c k) -{ - struct extent_end *i, n = (struct extent_end) { - .snapshot = k.k->p.snapshot, - .offset = k.k->p.offset, - .seen = *seen, - }; - - n.seen.ids.data = kmemdup(seen->ids.data, - sizeof(seen->ids.data[0]) * seen->ids.size, - GFP_KERNEL); - if 
(!n.seen.ids.data) - return -BCH_ERR_ENOMEM_fsck_extent_ends_at; - - darray_for_each(*extent_ends, i) { - if (i->snapshot == k.k->p.snapshot) { - snapshots_seen_exit(&i->seen); - *i = n; - return 0; - } - - if (i->snapshot >= k.k->p.snapshot) - break; + ret = overlapping_extents_found(trans, iter->btree_id, + SPOS(iter->pos.inode, + i->offset, + i->snapshot), + *k.k, &fixed); + if (ret) + goto err; } - return darray_insert_item(extent_ends, i - extent_ends->data, n); -} - -static void extent_ends_reset(extent_ends *extent_ends) -{ - struct extent_end *i; - - darray_for_each(*extent_ends, i) - snapshots_seen_exit(&i->seen); + ret = extent_ends_at(c, extent_ends, seen, k); + if (ret) + goto err; - extent_ends->nr = 0; + extent_ends->last_pos = k.k->p; +err: + return ret ?: fixed; } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s, - extent_ends *extent_ends) + struct extent_ends *extent_ends) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; @@ -1327,8 +1352,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ret = check_i_sectors(trans, inode); if (ret) goto err; - - extent_ends_reset(extent_ends); } i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); @@ -1356,16 +1379,14 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, iter); + ret = check_overlapping_extents(trans, s, extent_ends, k, + equiv.snapshot, iter); if (ret < 0) goto err; if (ret) inode->recalculate_sums = true; - - ret = extent_ends_at(extent_ends, s, k); - if (ret) - goto err; + ret = 0; } /* @@ -1434,11 +1455,12 @@ int bch2_check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - extent_ends extent_ends = { 0 }; + struct extent_ends extent_ends; struct disk_reservation res = { 0 }; int ret = 0; snapshots_seen_init(&s); + extent_ends_init(&extent_ends); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, @@ -1452,8 +1474,7 @@ int bch2_check_extents(struct bch_fs *c) check_i_sectors(&trans, &w); bch2_disk_reservation_put(c, &res); - extent_ends_reset(&extent_ends); - darray_exit(&extent_ends); + extent_ends_exit(&extent_ends); inode_walker_exit(&w); bch2_trans_exit(&trans); snapshots_seen_exit(&s); -- cgit From 464ee1929b7761d2939ad76573e6679b4246dc82 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 18:15:01 -0400 Subject: bcachefs: Improve key_visible_in_snapshot() Delete a redundant bch2_snapshot_is_ancestor() check, and convert some assertions to debug assertions. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e40040001ac1..93281b701473 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -517,15 +517,14 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see u32 id, u32 ancestor) { ssize_t i; - u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; - BUG_ON(id > ancestor); - BUG_ON(!bch2_snapshot_is_equiv(c, id)); - BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + EBUG_ON(id > ancestor); + EBUG_ON(!bch2_snapshot_is_equiv(c, id)); + EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(ancestor != seen->pos.snapshot); - BUG_ON(ancestor != top); + EBUG_ON(ancestor != seen->pos.snapshot); + EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); if (id == ancestor) return true; @@ -533,11 +532,20 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; + /* + * We know that @id is a descendant of @ancestor, we're checking if + * we've seen a key that overwrote @ancestor - i.e. also a descendant of + * @ancestor and with @id as a descendant. + * + * But we already know that we're scanning IDs between @id and @ancestor + * numerically, since snapshot ID lists are kept sorted, so if we find + * an id that's an ancestor of @id we're done: + */ + for (i = seen->ids.nr - 2; i >= 0 && seen->ids.data[i].equiv >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && - bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) return false; return true; -- cgit From 6b20d746adb7d7e662f8fc6b93fab8513654d51e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 21:09:37 -0400 Subject: bcachefs: need_snapshot_cleanup shouldn't be a fsck error We currently don't track whether snapshot cleanup still needs to finish (aside from running a full fsck), so it shouldn't be a fsck error yet - fsck -n after fsck has successfully completed shouldn't error. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 93281b701473..c8599978ae46 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -483,27 +483,31 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - pos.snapshot = n.equiv; s->pos = pos; + s->pos.snapshot = n.equiv; - darray_for_each(s->ids, i) - if (i->equiv == n.equiv) { - if (fsck_err_on(i->id != n.id, c, - "snapshot deletion did not run correctly:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_ids[btree_id], - pos.inode, pos.offset, - i->id, n.id, n.equiv)) - return -BCH_ERR_need_snapshot_cleanup; - + darray_for_each(s->ids, i) { + if (i->id == n.id) return 0; + + /* + * We currently don't rigorously track for snapshot cleanup + * needing to be run, so it shouldn't be a fsck error yet: + */ + if (i->equiv == n.equiv) { + bch_err(c, "snapshot deletion did not finish:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_ids[btree_id], + pos.inode, pos.offset, + i->id, n.id, n.equiv); + return -BCH_ERR_need_snapshot_cleanup; } + } ret = darray_push(&s->ids, n); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); -fsck_err: return ret; } -- cgit From 20e6d9a8d4050220f4e0a0195d102abaf2c8439b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 21:56:18 -0400 Subject: bcachefs: Fix lookup_inode_for_snapshot() This fixes a use-after-free.
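The bug is the usual dynamic-array pitfall: darray_insert_item() can grow, and therefore reallocate, the backing array, so the entry pointer computed before the insert may dangle afterwards; the fix below captures the index first and re-derives the pointer after the insert. A minimal sketch of the pattern in generic C (hypothetical vec/entry types for illustration, not the bcachefs darray API):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct entry { unsigned snapshot; };

struct vec { struct entry *data; size_t nr, size; };

/* Insert @e at @pos; may realloc() @v->data, invalidating old pointers into it. */
static int vec_insert(struct vec *v, size_t pos, struct entry e)
{
        if (v->nr == v->size) {
                size_t new_size = v->size ? v->size * 2 : 8;
                struct entry *d = realloc(v->data, new_size * sizeof(*d));

                if (!d)
                        return -ENOMEM;
                v->data = d;
                v->size = new_size;
        }

        memmove(&v->data[pos + 1], &v->data[pos],
                (v->nr - pos) * sizeof(struct entry));
        v->data[pos] = e;
        v->nr++;
        return 0;
}

/*
 * Wrong: returning the caller's @i, which may point into memory freed by
 * realloc(). Right: capture the index before inserting, then re-derive the
 * pointer from the (possibly new) base address.
 */
static struct entry *insert_before(struct vec *v, struct entry *i, struct entry e)
{
        size_t pos = i - v->data;

        if (vec_insert(v, pos, e))
                return NULL;

        return v->data + pos;
}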
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c8599978ae46..0d7344ff64c0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -682,6 +682,7 @@ found: if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; + size_t pos; int ret; new.snapshot = snapshot; @@ -693,9 +694,12 @@ found: while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - ret = darray_insert_item(&w->inodes, i - w->inodes.data, new); + pos = i - w->inodes.data; + ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); + + i = w->inodes.data + pos; } return i; -- cgit From 970a5096acc223e90c9fad853575c87ee85b02ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 22:31:19 -0400 Subject: bcachefs: Suppress various error messages in no_data_io mode We commonly use no_data_io mode when debugging filesystem metadata dumps, where data checksum/compression errors are expected and unimportant - this patch suppresses these. Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/compress.c | 3 ++- fs/bcachefs/io.c | 11 +++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 20d0e8afc1ad..bf03d42c6138 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -426,7 +426,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, merged = bch2_checksum_bio(c, crc_old.csum_type, extent_nonce(version, crc_old), bio); - if (bch2_crc_cmp(merged, crc_old.csum)) { + if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", crc_old.csum.hi, diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 560214c15da3..c9ca7cce55f8 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -240,7 +240,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - bch_err(c, "error rewriting existing data: decompression error"); + if (!c->opts.no_data_io) + bch_err(c, "error rewriting existing data: decompression error"); bio_unmap_or_unbounce(c, data); return -EIO; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 8604df80a3e2..5bacc6a9dd8f 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1082,7 +1082,8 @@ static enum prep_encoded_ret { op->incompressible)) { if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type)) + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; return PREP_ENCODED_DO_WRITE; @@ -1102,7 +1103,7 @@ static enum prep_encoded_ret { csum = bch2_checksum_bio(c, op->crc.csum_type, extent_nonce(op->version, op->crc), bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) @@ -1120,7 +1121,8 @@ static enum prep_encoded_ret { */ if ((op->crc.live_size != op->crc.uncompressed_size || op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type)) + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) return
PREP_ENCODED_CHECKSUM_ERR; /* @@ -2416,7 +2418,8 @@ static void __bch2_read_endio(struct work_struct *work) if (ret) goto decrypt_err; - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) goto decompression_err; } else { /* don't need to decrypt the entire bio: */ -- cgit From 2110f21ec0a294360f2a510ff33d38605af45d88 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Jul 2023 08:53:02 -0400 Subject: bcachefs: remove duplicate code between backpointer update paths Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 18 +----------------- fs/bcachefs/backpointers.h | 8 ++++---- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 7c1e6546d054..8747c5e19f99 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -134,31 +134,15 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bpos bucket, + struct bkey_i_backpointer *bp_k, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { - struct bch_fs *c = trans->c; - struct bkey_i_backpointer *bp_k; struct btree_iter bp_iter; struct bkey_s_c k; int ret; - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; - - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); - bp_k->v = bp; - - if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); - } - k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, BTREE_ITER_INTENT| diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 87e31aa1975c..547e0617602a 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -54,7 +54,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, @@ -67,9 +67,6 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp_k; int ret; - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); ret = PTR_ERR_OR_ZERO(bp_k); if (ret) @@ -84,6 +81,9 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, set_bkey_val_u64s(&bp_k->k, 0); } + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); } -- cgit From a2437bba0561e94abe2f9254297b25ca13c92c23 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Jul 2023 08:53:03 -0400 Subject: bcachefs: remove unnecessary btree_insert_key_leaf() wrapper This is in preparation to support prejournaled keys. We want the ability to optionally pass a seq stored in the btree update rather than the seq of the committing transaction. 
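Sketched below is the intended call shape, as a simplified standalone model rather than the actual bcachefs interfaces (the real flag and per-update field, BTREE_UPDATE_PREJOURNAL and the update's seq, arrive in the "support btree updates of prejournaled keys" patch later in this series): the leaf insert pins the journal entry holding the key, so it must be given the seq the key was actually journaled at, either the committing transaction's journal reservation or a seq carried by the update itself.

#include <stdbool.h>
#include <stdint.h>

struct update {
        bool     prejournaled; /* key was already written to the journal */
        uint64_t seq;          /* journal seq it was written at, if so */
};

/* Seq used to pin the btree leaf when this update is inserted at commit time. */
static uint64_t leaf_insert_seq(const struct update *u, uint64_t trans_res_seq)
{
        return u->prejournaled ? u->seq : trans_res_seq;
}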
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 3638cef211b2..11b992282032 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -290,12 +290,6 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_trans_node_reinit_iter(trans, b); } -static void btree_insert_key_leaf(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq); -} - /* Cached btree updates: */ /* Normal update interface: */ @@ -753,7 +747,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.needs_whiteout = false; if (!i->cached) - btree_insert_key_leaf(trans, i); + bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq); else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { -- cgit From 78623ee0d0d72c497967be41277f022c0052631c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Jul 2023 08:53:04 -0400 Subject: bcachefs: fold bch2_trans_update_by_path_trace() into callers There is only one other caller so eliminate some boilerplate. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 11b992282032..319286294d6a 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -66,7 +66,8 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert static int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_update_flags, + unsigned long ip); static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) @@ -1489,7 +1490,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, ret = bch2_trans_update_by_path(trans, iter.path, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); + flags, _RET_IP_); if (ret) goto err; goto out; @@ -1527,11 +1528,6 @@ err: return ret; } -static int __must_check -bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip); - static noinline int flush_new_cached_update(struct btree_trans *trans, struct btree_path *path, struct btree_insert_entry *i, @@ -1562,16 +1558,16 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); out: bch2_path_put(trans, btree_path, true); return ret; } static int __must_check -bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip) +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; @@ -1650,13 +1646,6 @@ 
bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa return 0; } -static inline int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags) -{ - return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); -} - int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1717,7 +1706,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags); + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); } int __must_check bch2_trans_update_buffered(struct btree_trans *trans, -- cgit From eabb10dc9561525661d4fda229134a6f8716e007 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Jul 2023 08:53:05 -0400 Subject: bcachefs: support btree updates of prejournaled keys Introduce support for prejournaled key updates. This allows a transaction to commit an update for a key that already exists (and is pinned) in the journal. This is required for btree write buffer updates as the current scheme of journaling both on write buffer insertion and write buffer (slow path) flush is unsafe in certain crash recovery scenarios. Create a small trans update wrapper to pass along the seq where the key resides into the btree_insert_entry. From there, trans commit passes the seq into the btree insert path where it is used to manage the journal pin for the associated btree leaf. Note that this patch only introduces the underlying mechanism and otherwise includes no functional changes. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.h | 2 ++ fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 34 +++++++++++++++++++++++++++++++--- 4 files changed, 36 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index d1ff83a73511..d7b63769068c 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -98,6 +98,7 @@ static inline int bch2_mark_key(struct btree_trans *trans, enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -112,6 +113,7 @@ enum btree_update_flags { #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 937f9c2b63ed..9bfaa15d5ad4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -380,6 +380,7 @@ struct btree_insert_entry { u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; + u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index f794c9d108b8..256da97f721c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -111,6 +111,8 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, int 
__must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); int __must_check bch2_trans_update_buffered(struct btree_trans *, enum btree_id, struct bkey_i *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 319286294d6a..609780f0ce8e 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -747,9 +747,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { i->k->k.needs_whiteout = false; - if (!i->cached) - bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq); - else if (!i->key_cache_already_flushed) + if (!i->cached) { + u64 seq = trans->journal_res.seq; + + if (i->flags & BTREE_UPDATE_PREJOURNAL) + seq = i->seq; + + bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + } else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { bch2_btree_key_cache_drop(trans, i->path); @@ -1571,12 +1576,21 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; + u64 seq = 0; int cmp; EBUG_ON(!path->should_be_locked); EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); EBUG_ON(!bpos_eq(k->k.p, path->pos)); + /* + * The transaction journal res hasn't been allocated at this point. + * That occurs at commit time. Reuse the seq field to pass in the seq + * of a prejournaled key. + */ + if (flags & BTREE_UPDATE_PREJOURNAL) + seq = trans->journal_res.seq; + n = (struct btree_insert_entry) { .flags = flags, .bkey_type = __btree_node_type(path->level, path->btree_id), @@ -1585,6 +1599,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, .cached = path->cached, .path = path, .k = k, + .seq = seq, .ip_allocated = ip, }; @@ -1612,6 +1627,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, i->cached = n.cached; i->k = n.k; i->path = n.path; + i->seq = n.seq; i->ip_allocated = n.ip_allocated; } else { array_insert_item(trans->updates, trans->nr_updates, @@ -1709,6 +1725,18 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); } +/* + * Add a transaction update for a key that has already been journaled. + */ +int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, + struct btree_iter *iter, struct bkey_i *k, + enum btree_update_flags flags) +{ + trans->journal_res.seq = seq; + return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| + BTREE_UPDATE_PREJOURNAL); +} + int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) -- cgit From 60a5b898007d766d6180ca101634bf7cad98d82f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Jul 2023 08:53:06 -0400 Subject: bcachefs: use prejournaled key updates for write buffer flushes The write buffer mechanism journals keys twice in certain situations. A key is always journaled on write buffer insertion, and is potentially journaled again if a write buffer flush falls into either of the slow btree insert paths. This has shown to cause journal recovery ordering problems in the event of an untimely crash. 
For example, consider if a key is inserted into index 0 of a write buffer, the active write buffer switches to index 1, the key is deleted in index 1, and then index 0 is flushed. If the original key is rejournaled in the btree update from the index 0 flush, the (now deleted) key is journaled in a seq buffer ahead of the latest version of key (which was journaled when the key was deleted in index 1). If the fs crashes while this is still observable in the log, recovery sees the key from the btree update after the delete key from the write buffer insert, which is the incorrect order. This problem is occasionally reproduced by generic/388 and generally manifests as one or more backpointer entry inconsistencies. To avoid this problem, never rejournal write buffered key updates to the associated btree. Instead, use prejournaled key updates to pass the journal seq of the write buffer insert down to the btree insert, which updates the btree leaf pin to reflect the seq of the key. Note that tracking the seq is required instead of just using NOJOURNAL here because otherwise we lose protection of the write buffer pin when the buffer is flushed, which means the key can fall off the tail of the on-disk journal before the btree leaf is flushed and lead to similar recovery inconsistencies. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 6c30a72e6eee..5f96db539fd7 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -75,7 +75,7 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, } return 0; trans_commit: - return bch2_trans_update(trans, iter, &wb->k, 0) ?: + return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, commit_flags| BTREE_INSERT_NOCHECK_RW| @@ -103,6 +103,32 @@ static union btree_write_buffer_state btree_write_buffer_switch(struct btree_wri return old; } +/* + * Update a btree with a write buffered key using the journal seq of the + * original write buffer insert. + * + * It is not safe to rejournal the key once it has been inserted into the write + * buffer because that may break recovery ordering. For example, the key may + * have already been modified in the active write buffer in a seq that comes + * before the current transaction. If we were to journal this key again and + * crash, recovery would process updates in the wrong order. 
+ */ +static int +btree_write_buffered_insert(struct btree_trans *trans, + struct btree_write_buffered_key *wb) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, bool locked) { @@ -238,7 +264,7 @@ slowpath: commit_flags| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RECLAIM, - __bch2_btree_insert(trans, i->btree, &i->k, 0)); + btree_write_buffered_insert(trans, i)); if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) break; } -- cgit From ef1634f0f19d676483888c2a05d7e406b366d2db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 18:09:26 -0400 Subject: bcachefs: Print version, options earlier in startup path Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 63e9dafa8395..c17fdcd08390 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -883,7 +883,7 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; - prt_str(&p, "mounted version "); + prt_str(&p, "mounting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { @@ -919,6 +919,8 @@ int bch2_fs_start(struct bch_fs *c) unsigned i; int ret; + print_mount_opts(c); + down_write(&c->state_lock); BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); @@ -972,7 +974,6 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - print_mount_opts(c); ret = 0; out: up_write(&c->state_lock); -- cgit From ae2e13d7809d79ea4d7c0cd8ee060b7911012e37 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 23:21:17 -0400 Subject: bcachefs: bch2_run_explicit_recovery_pass() This introduces bch2_run_explicit_recovery_pass() and uses it for when fsck detects that we need to re-run dead snapshots cleanup, and makes dead snapshot cleanup more like a normal recovery pass.
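Sketched below, as a simplified standalone model with hypothetical names (not the bcachefs code; the conditional rewind matches the refinement in the follow-up "Make topology repair a normal recovery pass" patch), is the rewind-and-rerun pattern being introduced: a pass that discovers it needs another pass marks that pass as explicitly requested, rewinds the current-pass cursor if recovery has already gone past it, and returns a restart sentinel that the driver loop handles by resuming from the rewound pass.

#include <stdint.h>

#define RESTART_RECOVERY 1000
#define NR_PASSES        8

struct fs {
        unsigned curr_pass;
        uint64_t passes_explicit; /* bitmask of explicitly requested passes */
};

/* Request @pass; if recovery already ran past it, rewind and ask for a restart. */
static int run_explicit_recovery_pass(struct fs *c, unsigned pass)
{
        c->passes_explicit |= 1ULL << pass;

        if (c->curr_pass >= pass) {
                c->curr_pass = pass;
                return -RESTART_RECOVERY;
        }
        return 0;
}

static int run_recovery_passes(struct fs *c,
                               int (*run_pass)(struct fs *, unsigned))
{
        int ret = 0;

        while (c->curr_pass < NR_PASSES) {
                ret = run_pass(c, c->curr_pass);
                if (ret == -RESTART_RECOVERY)
                        continue; /* curr_pass was rewound; rerun from there */
                if (ret)
                        break;
                c->curr_pass++;
        }
        return ret;
}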
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 15 ++++++++++++++- fs/bcachefs/errcode.h | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 5 +---- fs/bcachefs/subvolume.c | 10 +++++----- 5 files changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6e8dc25d41db..d9e36f16af2e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -680,7 +680,7 @@ enum bch_write_ref { x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN|PASS_SILENT) \ + x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ x(fs_upgrade_for_subvolumes, 0) \ x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ x(check_extents, PASS_FSCK) \ @@ -1179,6 +1179,19 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + BUG_ON(c->curr_recovery_pass < pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + c->curr_recovery_pass = pass; + return -BCH_ERR_restart_recovery; +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a759493fee44..adc25c73e715 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -157,7 +157,7 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, need_snapshot_cleanup) \ + x(0, restart_recovery) \ x(0, need_topology_repair) \ x(0, unwritten_extent_update) \ x(EINVAL, device_state_not_allowed) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0d7344ff64c0..23cc49ab5ad5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -500,7 +500,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_ids[btree_id], pos.inode, pos.offset, i->id, n.id, n.equiv); - return -BCH_ERR_need_snapshot_cleanup; + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 63b385d8886a..433e7745799b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1270,11 +1270,8 @@ again: c->curr_recovery_pass++; } - if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_delete_dead_snapshots; + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) goto again; - } return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 287492c29bcc..7de6fe0cdd43 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -331,8 +331,10 @@ int bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) + if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); + } } else { memset(t, 0, sizeof(*t)); } @@ -1302,9 +1304,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) u32 i, id; int ret = 0; - if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = 
bch2_fs_read_write_early(c); if (ret) { @@ -1399,7 +1398,8 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - bch2_delete_dead_snapshots(c); + if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } -- cgit From 922bc5a037ecf82d3cbad1c9976274f441328274 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Jul 2023 23:19:49 -0400 Subject: bcachefs: Make topology repair a normal recovery pass This adds bch2_run_explicit_recovery_pass(), for rewinding recovery and explicitly running a specific recovery pass - this is a more general replacement for how we were running topology repair before. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 13 ++++++++----- fs/bcachefs/btree_gc.c | 44 +++++++++++++------------------------------- fs/bcachefs/btree_gc.h | 1 + fs/bcachefs/btree_io.c | 5 +++-- fs/bcachefs/errcode.h | 1 - fs/bcachefs/error.c | 3 --- fs/bcachefs/recovery.c | 12 ++++++++---- 7 files changed, 33 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d9e36f16af2e..82b0706a8936 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -563,7 +563,6 @@ enum { BCH_FS_CLEAN_SHUTDOWN, /* fsck passes: */ - BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_FSCK_DONE, BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, @@ -666,6 +665,7 @@ enum bch_write_ref { x(stripes_read, PASS_ALWAYS) \ x(initialize_subvolumes, 0) \ x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ x(check_allocations, PASS_FSCK) \ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, PASS_ALWAYS) \ @@ -1185,11 +1185,14 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - BUG_ON(c->curr_recovery_pass < pass); - c->recovery_passes_explicit |= BIT_ULL(pass); - c->curr_recovery_pass = pass; - return -BCH_ERR_restart_recovery; + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } } #define BKEY_PADDED_ONSTACK(key, pad) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index edea6bb66253..43e6222f124d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -40,6 +40,12 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +static bool should_restart_for_topology_repair(struct bch_fs *c) +{ + return c->opts.fix_errors != FSCK_FIX_no && + !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); +} + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -96,9 +102,9 @@ static int bch2_gc_check_topology(struct bch_fs *c, " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1.buf, buf2.buf) && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = -BCH_ERR_need_topology_repair; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -124,9 +130,9 @@ static int bch2_gc_check_topology(struct bch_fs *c, " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1.buf, buf2.buf) && - 
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = -BCH_ERR_need_topology_repair; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -520,7 +526,7 @@ fsck_err: return ret; } -static int bch2_repair_topology(struct bch_fs *c) +int bch2_check_topology(struct bch_fs *c) { struct btree_trans trans; struct btree *b; @@ -969,9 +975,9 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b b->c.level - 1, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { - ret = -BCH_ERR_need_topology_repair; + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto fsck_err; } else { /* Continue marking when opted to not @@ -1805,32 +1811,8 @@ again: bch2_mark_superblocks(c); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || - (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations && - c->opts.fix_errors != FSCK_FIX_no)) { - bch_info(c, "Starting topology repair pass"); - ret = bch2_repair_topology(c); - if (ret) - goto out; - bch_info(c, "Topology repair pass done"); - - set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); - } - ret = bch2_gc_btrees(c, initial, metadata_only); - if (ret == -BCH_ERR_need_topology_repair && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) { - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); - ret = 0; - } - - if (ret == -BCH_ERR_need_topology_repair) - ret = -BCH_ERR_fsck_errors_not_fixed; - if (ret) goto out; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 95d803b5743d..402c69184656 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -4,6 +4,7 @@ #include "btree_types.h" +int bch2_check_topology(struct bch_fs *); int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a8f7b71139a6..7bfb8b8d4cb5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -610,7 +610,7 @@ static int __btree_err(enum btree_err_type type, case BTREE_ERR_BAD_NODE: bch2_print_string_as_lines(KERN_ERR, out.buf); bch2_topology_error(c); - ret = -BCH_ERR_need_topology_repair; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; break; case BTREE_ERR_INCOMPATIBLE: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -1566,7 +1566,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&buf, c, b); bch_err(c, "%s", buf.buf); - if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); set_btree_node_read_error(b); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index adc25c73e715..735eb2416113 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -158,7 +158,6 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, restart_recovery) \ 
- x(0, need_topology_repair) \ x(0, unwritten_extent_update) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 685464b8cce3..39009cf0c448 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -27,9 +27,6 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { - if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) - return; - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) bch2_inconsistent_error(c); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 433e7745799b..dcd4f9f410ae 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1262,17 +1262,16 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) static int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; -again: + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; if (ret) break; c->curr_recovery_pass++; } - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) - goto again; - return ret; } @@ -1450,6 +1449,11 @@ use_clean: if (ret) goto err; + if (c->opts.fsck && + (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))) + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + ret = bch2_run_recovery_passes(c); if (ret) goto err; -- cgit From 9d8a3c95dce626d4d792228dead1edbf3251dda6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jul 2023 00:12:58 -0400 Subject: bcachefs: fsck: delete dead code Delete the old, now reimplemented overlapping extent check/repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 23cc49ab5ad5..022af0270692 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1038,47 +1038,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -/* - * Checking for overlapping extents needs to be reimplemented - */ -#if 0 -static int fix_overlapping_extent(struct btree_trans *trans, - struct bkey_s_c k, struct bpos cut_at) -{ - struct btree_iter iter; - struct bkey_i *u; - int ret; - - u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(u, k); - bch2_cut_front(cut_at, u); - - - /* - * We don't want to go through the extent_handle_overwrites path: - * - * XXX: this is going to screw up disk accounting, extent triggers - * assume things about extent overwrites - we should be running the - * triggers manually here - */ - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - - BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - bch2_trans_iter_exit(trans, &iter); - return ret; -} -#endif - static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) -- cgit From 4dc5bb9adf3035246e77d5e9cf7931b8af0fcaac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jul 2023 00:41:48 -0400 Subject: bcachefs: move inode triggers to inode.c bit of reorg Signed-off-by: Kent 
Overstreet --- fs/bcachefs/buckets.c | 71 +++------------------------------------------------ fs/bcachefs/buckets.h | 17 +++++++++--- fs/bcachefs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 5 ++++ 4 files changed, 75 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eb486467df17..7bb7f0caee45 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -102,18 +102,6 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) } while (read_seqcount_retry(&c->usage_lock, seq)); } -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { ssize_t offset = v - (u64 *) c->usage_base; @@ -460,7 +448,7 @@ static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, return 0; } -static int replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) { return allocate_dropping_locks_errcode(trans, __replicas_deltas_realloc(trans, more, _gfp)); @@ -479,7 +467,7 @@ static inline int update_replicas_list(struct btree_trans *trans, return 0; b = replicas_entry_bytes(r) + 8; - ret = replicas_deltas_realloc(trans, b); + ret = bch2_replicas_deltas_realloc(trans, b); if (ret) return ret; @@ -1137,38 +1125,6 @@ int bch2_mark_stripe(struct btree_trans *trans, return 0; } -int bch2_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - u64 journal_seq = trans->journal_res.seq; - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; - - BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v3); - - v->bi_journal_seq = cpu_to_le64(journal_seq); - } - - if (flags & BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += bkey_is_inode(new.k); - fs_usage->nr_inodes -= bkey_is_inode(old.k); - - preempt_enable(); - percpu_up_read(&c->mark_lock); - } - return 0; -} - int bch2_mark_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -1715,27 +1671,6 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -int bch2_trans_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); - - if (nr) { - int ret = replicas_deltas_realloc(trans, 0); - struct replicas_delta_list *d = trans->fs_usage_deltas; - - if (ret) - return ret; - - d->nr_inodes += nr; - } - - return 0; -} - int bch2_trans_mark_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, @@ -1754,7 +1689,7 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, sectors = -sectors; sectors *= replicas; - ret = replicas_deltas_realloc(trans, 0); + ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 
f9d7dda07ea6..a418f664896d 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -249,6 +249,20 @@ bch2_fs_usage_read_short(struct bch_fs *); /* key/bucket marking: */ +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) +{ + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); + void bch2_fs_usage_initialize(struct bch_fs *); int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, @@ -261,8 +275,6 @@ int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, @@ -270,7 +282,6 @@ int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fa435d8655de..8834809d4a67 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -519,6 +519,59 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c __bch2_inode_unpacked_to_text(out, &inode); } +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + struct replicas_delta_list *d = trans->fs_usage_deltas; + + if (ret) + return ret; + + d->nr_inodes += nr; + } + + return 0; +} + +int bch2_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + } + return 0; +} + int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) diff --git 
a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 8f9be5e58381..7809d1b6d7a2 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -16,6 +16,11 @@ int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ -- cgit From 8e992c6c1f3869d0b33c754d4b6730603d3586c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jul 2023 00:56:07 -0400 Subject: bcachefs: bch2_btree_bit_mod() New helper for bitset btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_update_leaf.c | 18 ++++++++++++++++++ fs/bcachefs/lru.c | 26 +++++--------------------- fs/bcachefs/lru.h | 19 ++++++++++++------- 4 files changed, 37 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 256da97f721c..f5700c286b9e 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -72,6 +72,8 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 609780f0ce8e..336a91e4fb20 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1996,6 +1996,24 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 07d1929535b8..3e8b8f2f38a3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -41,28 +41,12 @@ void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) } static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, - u64 dev_bucket, u64 time, unsigned key_type) + u64 dev_bucket, u64 time, bool set) { - struct bkey_i *k; - int ret = 0; - - if (!time) - return 0; - - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - bkey_init(&k->k); - k->k.type = key_type; - k->k.p = lru_pos(lru_id, dev_bucket, time); - - EBUG_ON(lru_pos_id(k->k.p) != lru_id); - EBUG_ON(lru_pos_time(k->k.p) != time); - EBUG_ON(k->k.p.offset != dev_bucket); - - return bch2_trans_update_buffered(trans, BTREE_ID_lru, k); + return time + ? 
bch2_btree_bit_mod(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) + : 0; } int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 7a3be20a8523..be66bf9ad809 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -5,13 +5,6 @@ #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) -static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) -{ - EBUG_ON(time > LRU_TIME_MAX); - - return POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); -} - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; @@ -22,6 +15,18 @@ static inline u64 lru_pos_time(struct bpos pos) return pos.inode & ~(~0ULL << LRU_TIME_BITS); } +static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) +{ + struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); + + EBUG_ON(time > LRU_TIME_MAX); + EBUG_ON(lru_pos_id(pos) != lru_id); + EBUG_ON(lru_pos_time(pos) != time); + EBUG_ON(pos.offset != dev_bucket); + + return pos; +} + #define BCH_LRU_TYPES() \ x(read) \ x(fragmentation) -- cgit From 2a89a3e9682b127c1978ac31eb38ef73a39a416e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 19:30:53 -0400 Subject: bcachefs: Fix a null ptr deref in check_xattr() We were attempting to initialize inode hash info when no inodes were found. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 022af0270692..5338765b3e68 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1662,7 +1662,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret < 0) goto err; - if (dir->first_this_inode) + if (dir->first_this_inode && dir->inodes.nr) *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; @@ -1839,7 +1839,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (inode->first_this_inode) + if (inode->first_this_inode && inode->inodes.nr) *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); inode->first_this_inode = false; -- cgit From 4437590d5f1a44078d54c0b959f38e22a8703fc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 22:47:59 -0400 Subject: bcachefs: Fix btree iter leak in __bch2_insert_snapshot_whiteouts() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 336a91e4fb20..947c28b76ff0 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1327,7 +1327,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter; + struct btree_iter old_iter, new_iter = { NULL }; struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; @@ -1377,6 +1377,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, if (ret) break; } + bch2_trans_iter_exit(trans, &new_iter); bch2_trans_iter_exit(trans, &old_iter); darray_exit(&s); -- cgit From ac319b4f89b9ff1377d294013aef59129a88b2d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 23:14:05 -0400 Subject: bcachefs: Move some declarations to the correct header 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 9 --------- fs/bcachefs/btree_update_interior.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index f5700c286b9e..d6aec9341c68 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -74,15 +74,6 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); -int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); -void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, - unsigned, bool); -int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, - struct bkey_i *, unsigned, bool); - int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 221b7ad5d053..5e0a467fe905 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -154,6 +154,15 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, btree_next_sib); } +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); +void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, + struct bkey_i *, unsigned, bool); + void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); -- cgit From 3f4ab4c1e63ed2713e237b2af0d442380d8cdb49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 23:13:43 -0400 Subject: bcachefs: Fix minor memory leak on invalid bkey Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_leaf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 947c28b76ff0..368972a00f55 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -852,12 +852,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - int ret, u64s_delta = 0; + int ret = 0, u64s_delta = 0; #ifdef CONFIG_BCACHEFS_DEBUG - struct printbuf buf = PRINTBUF; - trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) @@ -865,10 +864,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, invalid_flags, &buf))) - return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; } - printbuf_exit(&buf); #endif trans_for_each_update(trans, i) { -- cgit From 85beefefd2f8c0d8fa0a93660b0a1676073c57dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Jul 2023 00:27:19 -0400 Subject: bcachefs: 
bch2_trans_update_extent_overwrite() Factor out a new helper, to be used when fsck has to repair overlapping extents. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 5 +- fs/bcachefs/btree_update_leaf.c | 212 +++++++++++++++++++++------------------- 2 files changed, 114 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d6aec9341c68..2281140a288c 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -96,8 +96,9 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); } -int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); +int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, + enum btree_update_flags, + struct bkey_s_c, struct bkey_s_c); int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 368972a00f55..afe89324dfef 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1386,19 +1386,115 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } -int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) { - struct btree_iter iter; - struct bpos start = bkey_start_pos(&insert->k); + enum btree_id btree_id = iter->btree_id; struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; 
+ + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; struct bkey_s_c k; enum btree_id btree_id = orig_iter->btree_id; - int ret = 0, compressed_sectors; + int ret = 0; - bch2_trans_iter_init(trans, &iter, btree_id, start, + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES| BTREE_ITER_NOT_EXTENTS); @@ -1419,90 +1515,14 @@ int bch2_trans_update_extent(struct btree_trans *trans, } while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool front_split = bkey_lt(bkey_start_pos(k.k), start); - bool back_split = bkey_gt(k.k->p, insert->k.p); - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (((front_split && back_split) || - ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && - (compressed_sectors = bch2_bkey_sectors_compressed(k))) - trans->extra_journal_res += compressed_sectors; - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; - - bch2_cut_back(start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - k.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - goto err; - } - - if (k.k->p.snapshot != insert->k.p.snapshot && - (front_split || back_split)) { - update = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; - - bch2_cut_front(start, update); - bch2_cut_back(insert->k.p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - k.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - goto err; - } - - if (bkey_le(k.k->p, insert->k.p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + bool done = bkey_lt(insert->k.p, k.k->p); - bkey_init(&update->k); - update->k.p = k.k->p; - update->k.p.snapshot = insert->k.p.snapshot; - - if (insert->k.p.snapshot != k.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 0) - goto err; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - goto err; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; - - 
bch2_cut_front(insert->k.p, update); + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; - ret = bch2_trans_update_by_path(trans, iter.path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags, _RET_IP_); - if (ret) - goto err; + if (done) goto out; - } next: bch2_btree_iter_advance(&iter); k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); @@ -1518,18 +1538,8 @@ next: goto err; } out: - if (!bkey_deleted(&insert->k)) { - /* - * Rewinding iterators is expensive: get a new one and the one - * that points to the start of insert will be cloned from: - */ - bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, insert, flags); - } + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); err: bch2_trans_iter_exit(trans, &iter); -- cgit From e8d2fe3b4bfad1c902f9b0acea1edc8d5291deb9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Jul 2023 05:38:45 -0400 Subject: bcachefs: Consolidate btree id properties This refactoring centralizes defining per-btree properties. bch2_key_types_allowed was also about to overflow a u32, so expand that to a u64. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 77 +++++++++++++++++++++++++++++++---------- fs/bcachefs/bkey_methods.c | 80 +++++-------------------------------------- fs/bcachefs/btree_gc.h | 2 +- fs/bcachefs/btree_types.h | 67 +++++++++++++++++++----------------- fs/bcachefs/opts.c | 2 +- 5 files changed, 104 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5c308f8421c5..4e0673c668d3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -2194,26 +2194,67 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ -#define BCH_BTREE_IDS() \ - x(extents, 0) \ - x(inodes, 1) \ - x(dirents, 2) \ - x(xattrs, 3) \ - x(alloc, 4) \ - x(quotas, 5) \ - x(stripes, 6) \ - x(reflink, 7) \ - x(subvolumes, 8) \ - x(snapshots, 9) \ - x(lru, 10) \ - x(freespace, 11) \ - x(need_discard, 12) \ - x(backpointers, 13) \ - x(bucket_gens, 14) \ - x(snapshot_trees, 15) +enum btree_id_flags { + BTREE_ID_EXTENTS = BIT(0), + BTREE_ID_SNAPSHOTS = BIT(1), + BTREE_ID_DATA = BIT(2), +}; + +#define BCH_BTREE_IDS() \ + x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_error)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_extent)| \ + BIT_ULL(KEY_TYPE_reservation)| \ + BIT_ULL(KEY_TYPE_reflink_p)| \ + BIT_ULL(KEY_TYPE_inline_data)) \ + x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_inode)| \ + BIT_ULL(KEY_TYPE_inode_v2)| \ + BIT_ULL(KEY_TYPE_inode_v3)| \ + BIT_ULL(KEY_TYPE_inode_generation)) \ + x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_dirent)) \ + x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_xattr)) \ + x(alloc, 4, 0, \ + BIT_ULL(KEY_TYPE_alloc)| \ + BIT_ULL(KEY_TYPE_alloc_v2)| \ + BIT_ULL(KEY_TYPE_alloc_v3)| \ + BIT_ULL(KEY_TYPE_alloc_v4)) \ + x(quotas, 5, 0, \ + BIT_ULL(KEY_TYPE_quota)) \ + x(stripes, 6, 0, \ + BIT_ULL(KEY_TYPE_stripe)) \ + 
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + BIT_ULL(KEY_TYPE_reflink_v)| \ + BIT_ULL(KEY_TYPE_indirect_inline_data)) \ + x(subvolumes, 8, 0, \ + BIT_ULL(KEY_TYPE_subvolume)) \ + x(snapshots, 9, 0, \ + BIT_ULL(KEY_TYPE_snapshot)) \ + x(lru, 10, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(freespace, 11, BTREE_ID_EXTENTS, \ + BIT_ULL(KEY_TYPE_set)) \ + x(need_discard, 12, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(backpointers, 13, 0, \ + BIT_ULL(KEY_TYPE_backpointer)) \ + x(bucket_gens, 14, 0, \ + BIT_ULL(KEY_TYPE_bucket_gens)) \ + x(snapshot_trees, 15, 0, \ + BIT_ULL(KEY_TYPE_snapshot_tree)) enum btree_id { -#define x(kwd, val) BTREE_ID_##kwd = val, +#define x(name, nr, ...) BTREE_ID_##name = nr, BCH_BTREE_IDS() #undef x BTREE_ID_NR diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 1381166bfc55..90557f4c156d 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -140,78 +140,14 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, return ops->key_invalid(c, k, flags, err); } -static unsigned bch2_key_types_allowed[] = { - [BKEY_TYPE_extents] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_error)| - (1U << KEY_TYPE_cookie)| - (1U << KEY_TYPE_extent)| - (1U << KEY_TYPE_reservation)| - (1U << KEY_TYPE_reflink_p)| - (1U << KEY_TYPE_inline_data), - [BKEY_TYPE_inodes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_inode)| - (1U << KEY_TYPE_inode_v2)| - (1U << KEY_TYPE_inode_v3)| - (1U << KEY_TYPE_inode_generation), - [BKEY_TYPE_dirents] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_hash_whiteout)| - (1U << KEY_TYPE_dirent), - [BKEY_TYPE_xattrs] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_cookie)| - (1U << KEY_TYPE_hash_whiteout)| - (1U << KEY_TYPE_xattr), - [BKEY_TYPE_alloc] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_alloc)| - (1U << KEY_TYPE_alloc_v2)| - (1U << KEY_TYPE_alloc_v3)| - (1U << KEY_TYPE_alloc_v4), - [BKEY_TYPE_quotas] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_quota), - [BKEY_TYPE_stripes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_stripe), - [BKEY_TYPE_reflink] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_reflink_v)| - (1U << KEY_TYPE_indirect_inline_data), - [BKEY_TYPE_subvolumes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_subvolume), - [BKEY_TYPE_snapshots] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_snapshot), - [BKEY_TYPE_lru] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_set), - [BKEY_TYPE_freespace] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_set), - [BKEY_TYPE_need_discard] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_set), - [BKEY_TYPE_backpointers] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_backpointer), - [BKEY_TYPE_bucket_gens] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_bucket_gens), - [BKEY_TYPE_snapshot_trees] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_snapshot_tree), +static u64 bch2_key_types_allowed[] = { +#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, + BCH_BTREE_IDS() +#undef x [BKEY_TYPE_btree] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_btree_ptr)| - (1U << KEY_TYPE_btree_ptr_v2), + BIT_ULL(KEY_TYPE_deleted)| + BIT_ULL(KEY_TYPE_btree_ptr)| + BIT_ULL(KEY_TYPE_btree_ptr_v2), }; int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -225,7 +161,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, } if (flags & BKEY_INVALID_COMMIT && - !(bch2_key_types_allowed[type] & (1U << 
k.k->type))) { + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { prt_printf(err, "invalid key type for btree %s (%s)", bch2_btree_ids[type], bch2_bkey_types[k.k->type]); return -BCH_ERR_invalid_bkey; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 402c69184656..b45e382f7055 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -51,7 +51,7 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) { switch (id) { -#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; +#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; BCH_BTREE_IDS() #undef x default: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9bfaa15d5ad4..70398aaa095e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -636,7 +636,7 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val) BKEY_TYPE_##kwd = val, +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x BKEY_TYPE_btree, @@ -655,31 +655,37 @@ static inline enum btree_node_type btree_node_type(struct btree *b) } #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_extents)| \ - (1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_stripes)| \ - (1U << BKEY_TYPE_reflink)| \ - (1U << BKEY_TYPE_btree)) + (BIT(BKEY_TYPE_extents)| \ + BIT(BKEY_TYPE_alloc)| \ + BIT(BKEY_TYPE_inodes)| \ + BIT(BKEY_TYPE_stripes)| \ + BIT(BKEY_TYPE_reflink)| \ + BIT(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ - ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_stripes)| \ - (1U << BKEY_TYPE_snapshots)) + (BIT(BKEY_TYPE_alloc)| \ + BIT(BKEY_TYPE_inodes)| \ + BIT(BKEY_TYPE_stripes)| \ + BIT(BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) -#define BTREE_ID_IS_EXTENTS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_reflink)| \ - (1U << BTREE_ID_freespace)) +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); +} static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return (1U << type) & BTREE_ID_IS_EXTENTS; + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << type) & mask; } static inline bool btree_id_is_extents(enum btree_id btree) @@ -687,29 +693,26 @@ static inline bool btree_id_is_extents(enum btree_id btree) return btree_node_type_is_extents((enum btree_node_type) btree); } -#define BTREE_ID_HAS_SNAPSHOTS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_inodes)| \ - (1U << BTREE_ID_dirents)| \ - (1U << BTREE_ID_xattrs)) - -#define BTREE_ID_HAS_PTRS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_reflink)) - static inline bool btree_type_has_snapshots(enum btree_id id) { - return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - return (1 << id) & BTREE_ID_HAS_PTRS; -} + const unsigned mask = 0 +#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_DATA)) << nr) + BCH_BTREE_IDS() +#undef x + ; -static inline bool btree_node_type_needs_gc(enum btree_node_type type) -{ - return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); + return (1U << id) & mask; } struct btree_root { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 56f586f8d25b..3a6fd1d96ed2 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -10,7 +10,7 @@ #include "super-io.h" #include "util.h" -#define x(t, n) [n] = #t, +#define x(t, n, ...) [n] = #t, const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() -- cgit From 7904c82ceae963b0f89e96a49dc714adffe3adc6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Jul 2023 03:20:08 -0400 Subject: bcachefs: Move fsck_inode_rm() to inode.c Prep work for the new deleted inodes btree Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 65 +---------------------------------------------------- fs/bcachefs/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 2 ++ 3 files changed, 66 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5338765b3e68..31eb9176eb39 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -219,69 +219,6 @@ static int write_inode(struct btree_trans *trans, return ret; } -static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - int ret; - - do { - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); - if (ret) - goto err; -retry: - bch2_trans_begin(trans); - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); - ret = -EIO; - goto err; - } - - bch2_inode_unpack(k, &inode_u); - - /* Subvolume root? 
*/ - if (inode_u.bi_subvol) - bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - return ret ?: -BCH_ERR_transaction_restart_nested; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -929,7 +866,7 @@ static int check_inode(struct btree_trans *trans, bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error while deleting inode: %s", bch2_err_str(ret)); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8834809d4a67..755cf7d177cd 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -923,3 +923,66 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, if (opts->nocow) opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; } + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret ?: -BCH_ERR_transaction_restart_nested; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 7809d1b6d7a2..1b9dc27e82bd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -198,4 +198,6 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); +int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); + #endif /* _BCACHEFS_INODE_H */ -- cgit From bf5a261c7af80a2ac10bcc3ce0382cb238eccb8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Aug 2023 20:06:45 -0400 Subject: bcachefs: Assorted fixes for clang clang had a few more warnings about enum conversion, and also didn't like the opts.c initializer. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 32 +---------- fs/bcachefs/alloc_foreground.c | 6 +- fs/bcachefs/alloc_types.h | 2 +- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/btree_iter.c | 22 -------- fs/bcachefs/btree_locking.c | 6 +- fs/bcachefs/btree_locking.h | 4 +- fs/bcachefs/btree_update_interior.c | 4 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/darray.h | 6 +- fs/bcachefs/ec.c | 108 ++++++++++++++++++++---------------- fs/bcachefs/ec.h | 5 +- fs/bcachefs/opts.c | 8 +-- fs/bcachefs/super-io.c | 7 +-- 14 files changed, 86 insertions(+), 128 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1f6a518cbe36..18b97416750f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -79,36 +79,6 @@ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, return v; } -static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) -{ - unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; - - if (!v) - return; - - a->v.fields |= 1 << field; - - switch (bytes) { - case 1: - *((u8 *) *p) = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - case 8: - *((__le64 *) *p) = cpu_to_le64(v); - break; - default: - BUG(); - } - - *p += bytes; -} - static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { @@ -1334,7 +1304,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, struct bpos end) { - if (!btree_node_type_is_extents(iter->btree_id)) { + if (!btree_id_is_extents(iter->btree_id)) { return __bch2_check_discard_freespace_key(trans, iter); } else { int ret = 0; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index fcb7311b1844..1f4c5b38562d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1042,8 +1042,12 @@ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, unsigned i; if (!drop && ob->ec) { + unsigned nr_blocks; + mutex_lock(&ob->ec->lock); - for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { + nr_blocks = 
bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + + for (i = 0; i < nr_blocks; i++) { if (!ob->ec->blocks[i]) continue; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index c33a29954e59..804a843f23c1 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -32,7 +32,7 @@ enum bch_watermark { }; #define BCH_WATERMARK_BITS 3 -#define BCH_WATERMARK_MASK ~(~0 << BCH_WATERMARK_BITS) +#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) #define OPEN_BUCKETS_COUNT 1024 diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4e0673c668d3..b771d80f6361 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -2138,7 +2138,7 @@ struct jset_entry_dev_usage { __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; -} __packed; +}; static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d16331620ab9..ecc123b2d1b3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -35,18 +35,6 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline int bch2_trans_cond_resched(struct btree_trans *trans) -{ - if (need_resched() || race_fault()) - return drop_locks_do(trans, (schedule(), 0)); - else - return 0; -} - static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, bool r_cached, @@ -2732,16 +2720,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) -{ - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), - _RET_IP_); -} - void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree_id, struct bpos pos, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index dfdf46eb3e6d..40c8ed8f7bf1 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -388,7 +388,7 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p six_lock_readers_add(&b->lock, readers); if (ret) - mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); return ret; } @@ -552,7 +552,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); return false; success: - mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); return true; } @@ -667,7 +667,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } else { if (btree_node_intent_locked(path, l)) { six_lock_downgrade(&path->l[l].b->c.lock); - mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); + mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); } break; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 78daa494c914..22e2cd3914a5 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -180,7 +180,7 @@ 
bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); trans_for_each_path_with_node(trans, b, linked) linked->l[b->c.level].lock_seq++; @@ -293,7 +293,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, * write lock: thus, we need to tell the cycle detector we have a write * lock _before_ taking the lock: */ - mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); return likely(six_trylock_write(&b->lock)) ? 0 diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3659b2c08109..f42ef46c59df 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -188,7 +188,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); trans_for_each_path(trans, path) if (path->l[level].b == b) { @@ -720,7 +720,7 @@ err: mutex_unlock(&c->btree_interior_update_lock); - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index afe89324dfef..369e37a415f3 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -413,7 +413,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(i->btree_id)) + if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) return 0; if (old_ops->atomic_trigger == new_ops->atomic_trigger && diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index d4485fa01b2a..114f86b45fd5 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -59,13 +59,13 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, #define darray_first(_d) ((_d).data[0]) #define darray_last(_d) ((_d).data[(_d).nr - 1]) -#define darray_insert_item(_d, _pos, _item) \ +#define darray_insert_item(_d, pos, _item) \ ({ \ - size_t pos = (_pos); \ + size_t _pos = (pos); \ int _ret = darray_make_room((_d), 1); \ \ if (!_ret) \ - array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ _ret; \ }) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index efbb7cf7a5d0..f58e84a2bf88 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -200,11 +200,14 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) { - unsigned i; + if (buf->key.k.type == KEY_TYPE_stripe) { + struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); + unsigned i; - for (i = 0; i < buf->key.v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); - buf->data[i] = NULL; + for (i = 0; i < s->v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } } } @@ -212,7 
+215,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) static int ec_stripe_buf_init(struct ec_stripe_buf *buf, unsigned offset, unsigned size) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1U << v->csum_granularity_bits; unsigned end = offset + size; unsigned i; @@ -228,7 +231,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); - for (i = 0; i < buf->key.v.nr_blocks; i++) { + for (i = 0; i < v->nr_blocks; i++) { buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; @@ -245,7 +248,7 @@ err: static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, unsigned block, unsigned offset) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned end = buf->offset + buf->size; unsigned len = min(csum_granularity, end - offset); @@ -264,7 +267,7 @@ static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, static void ec_generate_checksums(struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, j, csums_per_device = stripe_csums_per_device(v); if (!v->csum_type) @@ -281,7 +284,7 @@ static void ec_generate_checksums(struct ec_stripe_buf *buf) static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned i; @@ -304,7 +307,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) if (bch2_crc_cmp(want, got)) { struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(c, "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", @@ -324,7 +327,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_generate_ec(struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = le16_to_cpu(v->sectors) << 9; @@ -333,13 +336,14 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { - return buf->key.v.nr_blocks - - bitmap_weight(buf->valid, buf->key.v.nr_blocks); + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + + return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; @@ -363,7 +367,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); - struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; @@ -388,11 +392,11 @@ static void 
ec_block_endio(struct bio *bio) static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, blk_opf_t opf, unsigned idx, struct closure *cl) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); @@ -463,7 +467,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, ret = -ENOENT; goto err; } - bkey_reassemble(&stripe->key.k_i, k); + bkey_reassemble(&stripe->key, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -499,7 +503,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) return -EIO; } - v = &buf->key.v; + v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { bch_err_ratelimited(c, @@ -875,6 +879,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct ec_stripe_buf *s, struct bpos *bp_pos) { + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; struct bch_backpointer bp; struct btree_iter iter; @@ -926,7 +931,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, if (extent_has_stripe_ptr(k, s->key.k.p.offset)) goto out; - ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + ptr_c = bkey_matches_stripe(v, k, &block); /* * It doesn't generally make sense to erasure code cached ptrs: * XXX: should we be incrementing a counter? @@ -934,7 +939,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, if (!ptr_c || ptr_c->cached) goto out; - dev = s->key.v.ptrs[block].dev; + dev = v->ptrs[block].dev; n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); ret = PTR_ERR_OR_ZERO(n); @@ -950,7 +955,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, stripe_ptr = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, - .redundancy = s->key.v.nr_redundant, + .redundancy = v->nr_redundant, .idx = s->key.k.p.offset, }; @@ -968,7 +973,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b unsigned block) { struct bch_fs *c = trans->c; - struct bch_extent_ptr bucket = s->key.v.ptrs[block]; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_extent_ptr bucket = v->ptrs[block]; struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); struct bpos bp_pos = POS_MIN; int ret = 0; @@ -993,7 +999,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans trans; - struct bch_stripe *v = &s->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; @@ -1057,7 +1063,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; - struct bch_stripe *v = &s->new_stripe.key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret; @@ -1090,7 +1096,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } for (i = 0; i < nr_data; i++) - if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + if 
(stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) swap(s->new_stripe.data[i], s->existing_stripe.data[i]); @@ -1117,8 +1123,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, - ec_stripe_key_update(&trans, &s->new_stripe.key, - !s->have_existing_stripe)); + ec_stripe_key_update(&trans, + bkey_i_to_stripe(&s->new_stripe.key), + !s->have_existing_stripe)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err; @@ -1279,14 +1286,14 @@ static bool may_create_new_stripe(struct bch_fs *c) } static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i_stripe *s, + struct bkey_i *k, unsigned nr_data, unsigned nr_parity, unsigned stripe_size) { + struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; - bkey_stripe_init(&s->k_i); s->v.sectors = cpu_to_le16(stripe_size); s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; @@ -1325,8 +1332,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; - ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, - s->nr_parity, h->blocksize); + ec_stripe_key_init(c, &s->new_stripe.key, + s->nr_data, s->nr_parity, h->blocksize); h->s = s; return 0; @@ -1429,15 +1436,16 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; + struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); + BUG_ON(v->nr_redundant != h->s->nr_parity); - for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { - __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + __clear_bit(v->ptrs[i].dev, devs.d); if (i < h->s->nr_data) nr_have_data++; else @@ -1466,7 +1474,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(j >= h->s->nr_data + h->s->nr_parity); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1492,7 +1500,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(j >= h->s->nr_data); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1542,6 +1550,8 @@ static s64 get_existing_stripe(struct bch_fs *c, static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) { struct bch_fs *c = trans->c; + struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *existing_v; unsigned i; s64 idx; int ret; @@ -1562,9 +1572,11 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri return ret; } - BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity); - h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks - - h->s->existing_stripe.key.v.nr_redundant; + existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; + + BUG_ON(existing_v->nr_redundant != 
h->s->nr_parity); + h->s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); if (ret) { @@ -1573,21 +1585,21 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri } BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); + BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict with * blocks from the stripe we're reusing: */ - for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { + for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); h->s->blocks[i] = 0; } memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); - for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { - if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + for (i = 0; i < existing_v->nr_blocks; i++) { + if (stripe_blockcount_get(existing_v, i)) { __set_bit(i, h->s->blocks_gotten); __set_bit(i, h->s->blocks_allocated); } @@ -1595,7 +1607,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); } - bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i); + bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); h->s->have_existing_stripe = true; return 0; @@ -1764,7 +1776,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) if (!ca) goto found; - for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; @@ -1922,7 +1934,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) break; if (h->s) { - for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) BUG_ON(h->s->blocks[i]); kfree(h->s); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 1b1848e5fa5e..885ae5d51655 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -138,10 +138,7 @@ struct ec_stripe_buf { void *data[BCH_BKEY_PTRS_MAX]; - union { - struct bkey_i_stripe key; - u64 pad[255]; - }; + __BKEY_PADDED(key, 255); }; struct ec_stripe_head; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 3a6fd1d96ed2..4d0daeba6f59 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -121,10 +121,10 @@ static void bch2_opt_fix_errors_to_text(struct printbuf *out, prt_str(out, bch2_fsck_fix_opts[v]); } -static const struct bch_opt_fn bch2_opt_fix_errors = { - .parse = bch2_opt_fix_errors_parse, - .to_text = bch2_opt_fix_errors_to_text, -}; +#define bch2_opt_fix_errors (struct bch_opt_fn) { \ + .parse = bch2_opt_fix_errors_parse, \ + .to_text = bch2_opt_fix_errors_to_text, \ +} const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index cea7c7caa1c0..beb00f799fe4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -265,16 +265,13 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, /* Superblock validate: */ -static inline void __bch2_sb_layout_size_assert(void) -{ - BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -} - static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) { u64 
offset, prev_offset, max_sectors; unsigned i; + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { prt_printf(out, "Not a bcachefs superblock layout"); -- cgit From cf416e7ae6ea9699524cdce1a51a7cd427193203 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Aug 2023 12:51:51 -0400 Subject: bcachefs: Handle weird opt string from sys_fsconfig() Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 4d0daeba6f59..960bb247f3a0 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -445,6 +445,13 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (!options) return 0; + /* + * sys_fsconfig() is now occasionally providing us with option lists + * starting with a comma - weird. + */ + if (*options == ',') + options++; + copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) return -1; -- cgit From 1074a21c8858d088261387d449d3a64e188c906c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Aug 2023 19:49:24 -0400 Subject: bcachefs: recovery_types.h Move some code out of bcachefs.h, which is too much of an everything header. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 43 +--------------------------------------- fs/bcachefs/recovery_types.h | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 42 deletions(-) create mode 100644 fs/bcachefs/recovery_types.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 82b0706a8936..87be62c27414 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -208,6 +208,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" +#include "recovery_types.h" #include "seqmutex.h" #include "util.h" @@ -655,48 +656,6 @@ enum bch_write_ref { BCH_WRITE_REF_NR, }; -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) - -#define BCH_RECOVERY_PASSES() \ - x(alloc_read, PASS_ALWAYS) \ - x(stripes_read, PASS_ALWAYS) \ - x(initialize_subvolumes, 0) \ - x(snapshots_read, PASS_ALWAYS) \ - x(check_topology, 0) \ - x(check_allocations, PASS_FSCK) \ - x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, PASS_ALWAYS) \ - x(check_alloc_info, PASS_FSCK) \ - x(check_lrus, PASS_FSCK) \ - x(check_btree_backpointers, PASS_FSCK) \ - x(check_backpointers_to_extents,PASS_FSCK) \ - x(check_extents_to_backpointers,PASS_FSCK) \ - x(check_alloc_to_lru_refs, PASS_FSCK) \ - x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 0) \ - x(check_snapshot_trees, PASS_FSCK) \ - x(check_snapshots, PASS_FSCK) \ - x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ - x(fs_upgrade_for_subvolumes, 0) \ - x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ - x(check_extents, PASS_FSCK) \ - x(check_dirents, PASS_FSCK) \ - x(check_xattrs, PASS_FSCK) \ - x(check_root, PASS_FSCK) \ - x(check_directory_structure, PASS_FSCK) \ - x(check_nlinks, PASS_FSCK) \ - x(fix_reflink_p, 0) \ - -enum bch_recovery_pass { -#define x(n, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - struct bch_fs { struct closure cl; diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h new file mode 100644 index 000000000000..377f51188a20 --- /dev/null +++ b/fs/bcachefs/recovery_types.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 
*/ +#ifndef _BCACHEFS_RECOVERY_TYPES_H +#define _BCACHEFS_RECOVERY_TYPES_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) + +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, 0) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ + x(check_allocations, PASS_FSCK) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ + x(check_lrus, PASS_FSCK) \ + x(check_btree_backpointers, PASS_FSCK) \ + x(check_backpointers_to_extents,PASS_FSCK) \ + x(check_extents_to_backpointers,PASS_FSCK) \ + x(check_alloc_to_lru_refs, PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 0) \ + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ + x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(check_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(fix_reflink_p, 0) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +#endif /* _BCACHEFS_RECOVERY_TYPES_H */ -- cgit From b56b787c7d5e8300dba41d80ddd41472a39ca86f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Aug 2023 20:19:58 -0400 Subject: bcachefs: In debug mode, run fsck again after fixing errors We want to ensure that fsck actually fixed all the errors it found - the second fsck run should be clean. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dcd4f9f410ae..55a233c2c7cc 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1458,6 +1458,29 @@ use_clean: if (ret) goto err; + /* If we fixed errors, verify that fs is actually clean now: */ + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && + !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + + ret = bch2_run_recovery_passes(c); + if (ret) + goto err; + + if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || + test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + bch_err(c, "Second fsck run was not clean"); + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + } + + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } + if (enabled_qtypes(c)) { bch_verbose(c, "reading quotas"); ret = bch2_fs_quota_read(c); -- cgit From e2bd06178c458d0178265f75d7210e6c090bbfdb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jul 2023 22:42:26 -0400 Subject: bcachefs: Fix overlapping extent repair A number of smallish fixes for overlapping extent repair, and (part of) a new unit test. This fixes all the issues turned up by bhzhu203, in his filesystem image from running mongodb + snapshots. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 138 +++++++++++++++++++++++++++++++++++++--------------- fs/bcachefs/tests.c | 31 ++++++++++++ 2 files changed, 131 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 31eb9176eb39..28dc8b4e3ed7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -408,6 +408,28 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) memset(s, 0, sizeof(*s)); } +static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct snapshots_seen_entry *i, n = { + .id = id, + .equiv = bch2_snapshot_equiv(c, id), + }; + int ret = 0; + + darray_for_each(s->ids, i) { + if (i->id == id) + return 0; + if (i->id > id) + break; + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { @@ -1122,74 +1144,116 @@ static int extent_ends_at(struct bch_fs *c, static int overlapping_extents_found(struct btree_trans *trans, enum btree_id btree, - struct bpos pos1, struct bkey pos2, - bool *fixed) + struct bpos pos1, struct snapshots_seen *pos1_seen, + struct bkey pos2, + bool *fixed, + struct extent_end *extent_end) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = min(pos1.snapshot, pos2.p.snapshot); + struct btree_iter iter1, iter2 = { NULL }; + struct bkey_s_c k1, k2; int ret; BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); - bch2_trans_iter_init(trans, &iter, btree, SPOS(pos1.inode, pos1.offset - 1, snapshot), 0); - k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k); + bch2_trans_iter_init(trans, &iter1, btree, pos1, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOT_EXTENTS); + k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k1); if (ret) goto err; prt_str(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); + bch2_bkey_val_to_text(&buf, c, k1); + + if (!bpos_eq(pos1, k1.k->p)) { + prt_str(&buf, "\n wanted\n "); + bch2_bpos_to_text(&buf, pos1); + prt_str(&buf, "\n "); + bch2_bkey_to_text(&buf, &pos2); - if (!bpos_eq(pos1, k.k->p)) { - bch_err(c, "%s: error finding first overlapping extent when repairing%s", + bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", __func__, buf.buf); ret = -BCH_ERR_internal_fsck_err; goto err; } + bch2_trans_copy_iter(&iter2, &iter1); + while (1) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(&iter2); - k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k); + k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k2); if (ret) goto err; - if (bkey_ge(k.k->p, pos2.p)) + if (bpos_ge(k2.k->p, pos2.p)) break; - } prt_str(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); + bch2_bkey_val_to_text(&buf, c, k2); - if (bkey_gt(k.k->p, pos2.p) || - pos2.size != k.k->size) { + if (bpos_gt(k2.k->p, pos2.p) || + pos2.size != k2.k->size) { bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", __func__, buf.buf); ret = -BCH_ERR_internal_fsck_err; goto err; } + prt_printf(&buf, "\n overwriting %s extent", + pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); + if (fsck_err(c, "overlapping extents%s", buf.buf)) { - struct bpos update_pos = pos1.snapshot < pos2.p.snapshot ? pos1 : pos2.p; - struct btree_iter update_iter; + struct btree_iter *old_iter = &iter1; + struct disk_reservation res = { 0 }; - struct bkey_i *update = bch2_bkey_get_mut(trans, &update_iter, - btree, update_pos, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &update_iter); - if ((ret = PTR_ERR_OR_ZERO(update))) + if (pos1.snapshot < pos2.p.snapshot) { + old_iter = &iter2; + swap(k1, k2); + } + + trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + + ret = bch2_trans_update_extent_overwrite(trans, old_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + k1, k2) ?: + bch2_trans_commit(trans, &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &res); + + if (ret) goto err; *fixed = true; + + if (pos1.snapshot == pos2.p.snapshot) { + /* + * We overwrote the first extent, and did the overwrite + * in the same snapshot: + */ + extent_end->offset = bkey_start_offset(&pos2); + } else if (pos1.snapshot > pos2.p.snapshot) { + /* + * We overwrote the first extent in pos2's snapshot: + */ + ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); + } else { + /* + * We overwrote the second extent - restart + * check_extent() from the top: + */ + ret = -BCH_ERR_transaction_restart_nested; + } } fsck_err: err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_iter_exit(trans, &iter1); printbuf_exit(&buf); return ret; } @@ -1199,11 +1263,11 @@ static int check_overlapping_extents(struct btree_trans *trans, struct extent_ends *extent_ends, struct bkey_s_c k, u32 equiv, - struct btree_iter *iter) + struct btree_iter *iter, + bool *fixed) { struct bch_fs *c = trans->c; struct extent_end *i; - bool fixed = false; int ret = 0; /* transaction restart, running again */ @@ -1226,7 +1290,8 @@ static int check_overlapping_extents(struct btree_trans *trans, SPOS(iter->pos.inode, i->offset, i->snapshot), - *k.k, &fixed); + &i->seen, + *k.k, fixed, i); if (ret) goto err; } @@ -1237,7 +1302,7 @@ static int check_overlapping_extents(struct btree_trans *trans, extent_ends->last_pos = k.k->p; err: - return ret ?: fixed; + return ret; } static int check_extent(struct btree_trans *trans, struct btree_iter *iter, @@ -1292,13 +1357,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto delete; ret = check_overlapping_extents(trans, s, extent_ends, k, - equiv.snapshot, iter); - if (ret < 0) - goto err; - + equiv.snapshot, iter, + &inode->recalculate_sums); if (ret) - inode->recalculate_sums = true; - ret = 0; + goto err; } /* @@ -1373,7 +1435,7 @@ int bch2_check_extents(struct bch_fs *c) snapshots_seen_init(&s); extent_ends_init(&extent_ends); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index cef23d2ccc5f..1d4b0a583586 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -503,6 +503,36 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) __test_extent_overwrite(c, 32, 64, 32, 128); } +static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) +{ + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.inode = inum; + k.k_i.k.p.offset = start + len; + 
k.k_i.k.p.snapshot = snapid; + k.k_i.k.size = len; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) +{ + return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ + insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ + insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); +} + /* snapshot unit tests */ /* Test skipping over keys in unrelated snapshots: */ @@ -901,6 +931,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_back); perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_extent_create_overlapping); perf_test(test_snapshots); -- cgit From 0a6d694584aeb739b976bf69646fa3c23ee117c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 03:29:42 -0400 Subject: bcachefs: Fix folio leak in folio_hole_offset() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0661dfd9a8d0..c461b65ab57a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3830,6 +3830,7 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, ret = false; unlock: folio_unlock(folio); + folio_put(folio); return ret; } -- cgit From dde8cb11645b7d95766dfd13f273facde27923a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jul 2023 00:56:29 -0400 Subject: bcachefs: bcachefs_metadata_version_deleted_inodes Add a new bitset btree for inodes pending deletion; this means we no longer have to scan the full inodes btree after an unclean shutdown. Specifically, this adds: - a trigger to update the deleted_inodes btree based on changes to the inodes btree - a new recovery pass - and check_inodes is now only a fsck pass. 
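The core of the trigger, condensed (the full version is in the bch2_trans_mark_inode() hunk below): whenever an inode update changes the unlinked state, flip the matching KEY_TYPE_set bit in the new deleted_inodes btree, so unclean-shutdown recovery only has to walk that btree:

	/* condensed sketch of the trigger below; old/new are the inode keys being updated */
	bool was_unlinked = bkey_is_deleted_inode(old);
	bool now_unlinked = bkey_is_deleted_inode(bkey_i_to_s_c(new));

	if (was_unlinked != now_unlinked)
		ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes,
					 new->k.p, now_unlinked);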
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 8 ++- fs/bcachefs/inode.c | 115 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 1 + fs/bcachefs/recovery_types.h | 3 +- 5 files changed, 125 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 87be62c27414..e1f1e8e871a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -453,6 +453,7 @@ enum gc_phase { GC_PHASE_BTREE_backpointers, GC_PHASE_BTREE_bucket_gens, GC_PHASE_BTREE_snapshot_trees, + GC_PHASE_BTREE_deleted_inodes, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b771d80f6361..5ec218ee3569 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1629,7 +1629,9 @@ struct bch_sb_field_journal_seq_blacklist { x(major_minor, BCH_VERSION(1, 0), \ 0) \ x(snapshot_skiplists, BCH_VERSION(1, 1), \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ + x(deleted_inodes, BCH_VERSION(1, 2), \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -2251,7 +2253,9 @@ enum btree_id_flags { x(bucket_gens, 14, 0, \ BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ - BIT_ULL(KEY_TYPE_snapshot_tree)) + BIT_ULL(KEY_TYPE_snapshot_tree)) \ + x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 755cf7d177cd..294966e42850 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "btree_write_buffer.h" #include "bkey_methods.h" #include "btree_update.h" #include "buckets.h" @@ -519,6 +520,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c __bch2_inode_unpacked_to_text(out, &inode); } +static inline u64 bkey_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ + return bkey_inode_flags(k) & BCH_INODE_UNLINKED; +} + int bch2_trans_mark_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, @@ -526,6 +546,8 @@ int bch2_trans_mark_inode(struct btree_trans *trans, unsigned flags) { int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); if (nr) { int ret = bch2_replicas_deltas_realloc(trans, 0); @@ -537,6 +559,12 @@ int bch2_trans_mark_inode(struct btree_trans *trans, d->nr_inodes += nr; } + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); + if (ret) + return ret; + } + return 0; } @@ -986,3 +1014,90 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } + +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + if (bch2_snapshot_is_internal_node(c, pos.snapshot)) 
+ return 0; + + if (!fsck_err_on(c->sb.clean, c, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; + if (fsck_err_on(!bkey_is_inode(k.k), c, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + return 1; +err: +fsck_err: + return ret; +delete: + return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); +} + +int bch2_delete_dead_inodes(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + ret = bch2_btree_write_buffer_flush_sync(&trans); + if (ret) + goto err; + + /* + * Weird transaction restart handling here because on successful delete, + * bch2_inode_rm_snapshot() will return a nested transaction restart, + * but we can't retry because the btree write buffer won't have been + * flushed and we'd spin: + */ + for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); + if (ret < 0) + break; + + if (ret) { + if (!test_bit(BCH_FS_RW, &c->flags)) { + bch2_trans_unlock(&trans); + bch2_fs_lazy_rw(c); + } + + ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + + return ret; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 1b9dc27e82bd..22b24405649f 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -199,5 +199,6 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); +int bch2_delete_dead_inodes(struct bch_fs *); #endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index 377f51188a20..abf1f834ec7a 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -29,13 +29,14 @@ x(check_subvols, PASS_FSCK) \ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ x(fs_upgrade_for_subvolumes, 0) \ - x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(check_inodes, PASS_FSCK) \ x(check_extents, PASS_FSCK) \ x(check_dirents, PASS_FSCK) \ x(check_xattrs, PASS_FSCK) \ x(check_root, PASS_FSCK) \ x(check_directory_structure, PASS_FSCK) \ x(check_nlinks, PASS_FSCK) \ + x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 0) \ enum bch_recovery_pass { -- cgit From 6c6439650ec913c83d48055da63b8f204075afb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 14:42:37 -0400 Subject: bcachefs: bkey_format helper improvements - add a to_text() method for bkey_format - convert bch2_bkey_format_validate() to modern error message style, where we pass a printbuf for the error string instead of returning a static string Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 38 ++++++++++++++++++++++++++++++-------- fs/bcachefs/bkey.h | 3 ++- fs/bcachefs/btree_cache.c | 15 
++++++--------- fs/bcachefs/btree_io.c | 9 +++++---- 4 files changed, 43 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index ee7ba700e75f..b7b77f459724 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -608,12 +608,15 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) return ret; } -const char *bch2_bkey_format_validate(struct bkey_format *f) +int bch2_bkey_format_validate(struct bkey_format *f, struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; - if (f->nr_fields != BKEY_NR_FIELDS) - return "incorrect number of fields"; + if (f->nr_fields != BKEY_NR_FIELDS) { + prt_printf(err, "incorrect number of fields: got %u, should be %u", + f->nr_fields, BKEY_NR_FIELDS); + return -BCH_ERR_invalid; + } /* * Verify that the packed format can't represent fields larger than the @@ -628,16 +631,35 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) u64 field_offset = le64_to_cpu(f->field_offset[i]); if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return "field too large"; + packed_max + field_offset > unpacked_max) { + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, field_offset, unpacked_max); + return -BCH_ERR_invalid; + } bits += f->bits_per_field[i]; } - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) - return "incorrect key_u64s"; + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { + prt_printf(err, "incorrect key_u64s: got %u, should be %u", + f->key_u64s, DIV_ROUND_UP(bits, 64)); + return -BCH_ERR_invalid; + } + + return 0; +} - return NULL; +void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) +{ + prt_printf(out, "u64s %u fields ", f->key_u64s); + + for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%u:%llu", + f->bits_per_field[i], + le64_to_cpu(f->field_offset[i])); + } } /* diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index e81fb3e00c60..644caa2b2b25 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -769,6 +769,7 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -const char *bch2_bkey_format_validate(struct bkey_format *); +int bch2_bkey_format_validate(struct bkey_format *, struct printbuf *); +void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 346bfaf99460..245ddd92b2d1 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1165,7 +1165,6 @@ wait_on_io: void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - const struct bkey_format *f = &b->format; struct bset_stats stats; memset(&stats, 0, sizeof(stats)); @@ -1179,9 +1178,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); - prt_printf(out, "\n" - " format: u64s %u fields %u %u %u %u %u\n" + prt_printf(out, + " format: "); + bch2_bkey_format_to_text(out, &b->format); + + prt_printf(out, " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" " sib u64s: %u, %u (merge threshold %u)\n" @@ -1189,12 +1192,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs 
*c, " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n", - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), btree_bytes(c) - sizeof(struct btree_node), diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7bfb8b8d4cb5..2339395e0bd2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -694,7 +694,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); - const char *err; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -802,10 +801,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - err = bch2_bkey_format_validate(&bn->format); - btree_err_on(err, + btree_err_on(bch2_bkey_format_validate(&bn->format, &buf1), BTREE_ERR_BAD_NODE, c, ca, b, i, - "invalid bkey format: %s", err); + "invalid bkey format: %s\n %s", buf1.buf, + (printbuf_reset(&buf2), + bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); + printbuf_reset(&buf1); compat_bformat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, -- cgit From 77212d3a76a4becabeac8736b686a533dd75913b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 16:38:36 -0400 Subject: bcachefs: Fix shift by 64 in set_inc_field() UBSAN was complaining about a shift by 64 in set_inc_field(). This only happened when the value being shifted was 0, so in theory should be harmless - a shift by 64 (or register width) should logically give a result of 0, but CPUs will in practice leave the input unchanged when the number of bits to shift by wraps - and since our input here is 0, the output is still what we want. But, it's still undefined behaviour and we need our UBSAN output to be clean, so it needs to be fixed. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index b7b77f459724..34a8fe48581c 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -184,6 +184,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field) return v + offset; } +__always_inline +static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + + if (bits) { + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + } +} + __always_inline static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) { @@ -198,20 +220,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) if (fls64(v) > bits) return false; - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return true; } @@ -380,19 +389,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ret = false; } - if (bits > state->bits) { - bits -= state->bits; - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return ret; } -- cgit From a1d1072fe75a52786b74f8a6093db3fa6a5d15d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 17:33:20 -0400 Subject: bcachefs: Print out required recovery passes on version upgrade Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 55a233c2c7cc..bcc1ee0b4ef2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1120,6 +1120,35 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } +static const char * const recovery_pass_names[] = { +#define x(_fn, _when) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_passes[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + static void check_version_upgrade(struct bch_fs *c) { unsigned latest_compatible = bch2_version_compatible(c->sb.version); @@ -1172,7 +1201,12 @@ static void check_version_upgrade(struct bch_fs *c) recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); if (recovery_passes) { - prt_str(&buf, "fsck required"); + if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == 
RECOVERY_PASS_ALL_FSCK) + prt_str(&buf, "fsck required"); + else { + prt_str(&buf, "running recovery passses: "); + prt_bitflags(&buf, recovery_pass_names, recovery_passes); + } c->recovery_passes_explicit |= recovery_passes; c->opts.fix_errors = FSCK_FIX_yes; @@ -1188,29 +1222,6 @@ static void check_version_upgrade(struct bch_fs *c) } } -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - const char *name; - unsigned when; -}; - -static struct recovery_pass_fn recovery_passes[] = { -#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -1248,7 +1259,8 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) struct recovery_pass_fn *p = recovery_passes + pass; if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + printk(KERN_INFO bch2_log_msg(c, "%s..."), + recovery_pass_names[pass]); ret = p->fn(c); if (ret) return ret; -- cgit From ad52bac251589cdcd206bfec46d670ba00e6edab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 20:37:32 -0400 Subject: bcachefs: Log a message when running an explicit recovery pass Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 16 ---------------- fs/bcachefs/btree_io.c | 1 + fs/bcachefs/fsck.c | 1 + fs/bcachefs/recovery.c | 20 ++++++++++---------- fs/bcachefs/recovery.h | 23 +++++++++++++++++++++++ 5 files changed, 35 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e1f1e8e871a8..019d534b7325 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1139,22 +1139,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2339395e0bd2..224ed5216b0a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -17,6 +17,7 @@ #include "io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "recovery.h" #include "super-io.h" #include "trace.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 28dc8b4e3ed7..0b4ddf650a97 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -11,6 +11,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "recovery.h" #include "subvolume.h" #include "super.h" #include "xattr.h" diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bcc1ee0b4ef2..3fd7f966acd7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1120,7 +1120,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } -static const char * const recovery_pass_names[] = { +const char * const bch2_recovery_passes[] = { #define x(_fn, _when) #_fn, BCH_RECOVERY_PASSES() #undef x 
@@ -1143,7 +1143,7 @@ struct recovery_pass_fn { unsigned when; }; -static struct recovery_pass_fn recovery_passes[] = { +static struct recovery_pass_fn recovery_pass_fns[] = { #define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, BCH_RECOVERY_PASSES() #undef x @@ -1205,7 +1205,7 @@ static void check_version_upgrade(struct bch_fs *c) prt_str(&buf, "fsck required"); else { prt_str(&buf, "running recovery passses: "); - prt_bitflags(&buf, recovery_pass_names, recovery_passes); + prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); } c->recovery_passes_explicit |= recovery_passes; @@ -1226,15 +1226,15 @@ u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) - if (recovery_passes[i].when & PASS_FSCK) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) ret |= BIT_ULL(i); return ret; } static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -1256,11 +1256,11 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) c->curr_recovery_pass = pass; if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_passes + pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) printk(KERN_INFO bch2_log_msg(c, "%s..."), - recovery_pass_names[pass]); + bch2_recovery_passes[pass]); ret = p->fn(c); if (ret) return ret; @@ -1275,7 +1275,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) continue; @@ -1593,7 +1593,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); + c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index f8e796c0f8c8..26cd6a230ac0 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -52,9 +52,32 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct bch_fs *); +extern const char * const bch2_recovery_passes[]; + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + u64 bch2_fsck_recovery_passes(void); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_H */ + -- cgit From 0ed4ca146efda60057782ccda6dc056bf538cc75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 
2023 20:57:06 -0400 Subject: bcachefs: Ensure topology repair runs This fixes should_restart_for_topology_repair() - previously it was returning false if the btree io path had already seleceted topology repair to run, even if it hadn't run yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/recovery.c | 2 ++ fs/bcachefs/recovery.h | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 019d534b7325..3b39597a677a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -995,6 +995,7 @@ struct bch_fs { enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + u64 recovery_passes_complete; /* DEBUG JUNK */ struct dentry *fs_debug_dir; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 43e6222f124d..3f5b4d8ee4a5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -43,7 +43,7 @@ static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); + !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); } static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3fd7f966acd7..edc9830d8163 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1266,6 +1266,8 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) return ret; if (!(p->when & PASS_SILENT)) printk(KERN_CONT " done\n"); + + c->recovery_passes_complete |= BIT_ULL(pass); } return 0; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 26cd6a230ac0..0cd6b8a13c8c 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -68,6 +68,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, if (c->curr_recovery_pass >= pass) { c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { return 0; -- cgit From 73adfcaf54c8fae86d80da9acad828ee2001cd60 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 20:32:46 -0400 Subject: bcachefs: Fix btree_err() macro Error code wasn't being propagated correctly, change it to match fsck_err() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 224ed5216b0a..80bd84f90dbf 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -630,8 +630,11 @@ fsck_err: ({ \ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - if (_ret != -BCH_ERR_fsck_fix) \ + if (_ret != -BCH_ERR_fsck_fix) { \ + ret = _ret; \ goto fsck_err; \ + } \ + \ *saw_error = true; \ }) -- cgit From 56046e3ecce5e9acf543884577d7fbad454de856 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 19:36:28 -0400 Subject: bcachefs: Convert btree_err_type to normal error codes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 119 +++++++++++++++++++------------------------------ fs/bcachefs/errcode.h | 6 +++ 2 files changed, 53 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 80bd84f90dbf..a3da5b4bcd21 100644 --- a/fs/bcachefs/btree_io.c +++ 
b/fs/bcachefs/btree_io.c @@ -542,31 +542,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -enum btree_err_type { - /* - * We can repair this locally, and we're after the checksum check so - * there's no need to try another replica: - */ - BTREE_ERR_FIXABLE, - /* - * We can repair this if we have to, but we should try reading another - * replica if we can: - */ - BTREE_ERR_WANT_RETRY, - /* - * Read another replica if we have one, otherwise consider the whole - * node bad: - */ - BTREE_ERR_MUST_RETRY, - BTREE_ERR_BAD_NODE, - BTREE_ERR_INCOMPATIBLE, -}; - -enum btree_validate_ret { - BTREE_RETRY_READ = 64, -}; - -static int __btree_err(enum btree_err_type type, +static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, @@ -577,7 +553,6 @@ static int __btree_err(enum btree_err_type type, { struct printbuf out = PRINTBUF; va_list args; - int ret = -BCH_ERR_fsck_fix; btree_err_msg(&out, c, ca, b, i, b->written, write); @@ -593,27 +568,26 @@ static int __btree_err(enum btree_err_type type, goto out; } - if (!have_retry && type == BTREE_ERR_WANT_RETRY) - type = BTREE_ERR_FIXABLE; - if (!have_retry && type == BTREE_ERR_MUST_RETRY) - type = BTREE_ERR_BAD_NODE; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) + ret = -BCH_ERR_btree_node_read_err_fixable; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; - switch (type) { - case BTREE_ERR_FIXABLE: + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: mustfix_fsck_err(c, "%s", out.buf); ret = -BCH_ERR_fsck_fix; break; - case BTREE_ERR_WANT_RETRY: - case BTREE_ERR_MUST_RETRY: + case -BCH_ERR_btree_node_read_err_want_retry: + case -BCH_ERR_btree_node_read_err_must_retry: bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = BTREE_RETRY_READ; break; - case BTREE_ERR_BAD_NODE: + case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); bch2_topology_error(c); ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; break; - case BTREE_ERR_INCOMPATIBLE: + case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; @@ -703,13 +677,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int ret = 0; btree_err_on(!bch2_version_compatible(version), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { mutex_lock(&c->sb_lock); @@ -720,7 +694,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "bset version %u newer than superblock version %u", version, c->sb.version)) { mutex_lock(&c->sb_lock); @@ -730,11 +704,11 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > 
btree_sectors(c), - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; ret = 0; @@ -742,12 +716,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(offset && !i->u64s, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "bset at wrong sector offset"); if (!offset) { @@ -761,16 +735,16 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect level"); if (!write) @@ -787,7 +761,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -796,7 +770,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect max key %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -806,7 +780,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, bn); btree_err_on(bch2_bkey_format_validate(&bn->format, &buf1), - BTREE_ERR_BAD_NODE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); @@ -850,14 +824,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey tmp; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -881,7 +855,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -905,7 +879,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_dump_bset(c, b, i, 0); - if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { + if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { i->u64s = 
cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -948,16 +922,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "bad btree header: seq 0"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -965,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); } @@ -980,7 +954,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -988,7 +962,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -998,7 +972,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -1010,7 +984,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1018,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -1051,12 +1025,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); @@ -1075,7 +1049,7 @@ int 
bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); } else { @@ -1086,7 +1060,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), true), - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "found bset signature after last bset"); /* @@ -1140,7 +1114,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1180,7 +1154,8 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) retry_read = 1; else set_btree_node_read_error(b); @@ -1366,14 +1341,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl) } written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "btree node replicas content mismatch")) dump_bset_maps = true; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 735eb2416113..f7fa87442e98 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -213,6 +213,12 @@ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) enum bch_errcode { BCH_ERR_START = 2048, -- cgit From f6e6f42bbbe5e421b57182bb0c92a237701ca889 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Aug 2023 10:51:02 -0400 Subject: bcachefs: Fix for bch2_copygc() spuriously returning -EEXIST Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 5242f20bb680..256431a6dc0c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -220,8 +220,10 @@ static int bch2_copygc(struct btree_trans *trans, f = move_bucket_in_flight_add(buckets_in_flight, *i); ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + if (ret == -EEXIST) { /* rare race: 
copygc_get_buckets returned same bucket more than once */ + ret = 0; continue; + } if (ret == -ENOMEM) { /* flush IO, continue later */ ret = 0; break; -- cgit From 4198bf03bed27aa758de36595416beb604600912 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 03:39:49 -0400 Subject: bcachefs: Fix lock thrashing in __bchfs_fallocate() We've observed significant lock thrashing on fstests generic/083 in fallocate, due to dropping and retaking btree locks when checking the pagecache for data. This adds a nonblocking mode to bch2_clamp_data_hole(), where we only use folio_trylock(), and can thus be used safely while btree locks are held - thus we only have to drop btree locks as a fallback, on actual lock contention. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 81 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c461b65ab57a..917ad1c8f46d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,7 +35,7 @@ #include -static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); +static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); struct folio_vec { struct folio *fv_folio; @@ -3407,11 +3407,19 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (!(mode & FALLOC_FL_ZERO_RANGE)) { - ret = drop_locks_do(&trans, - (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas), 0)); + /* + * Lock ordering - can't be holding btree locks while + * blocking on a folio lock: + */ + if (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, true)) + ret = drop_locks_do(&trans, + (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, false), 0)); bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) @@ -3711,7 +3719,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos, static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, loff_t end_offset, - unsigned min_replicas) + unsigned min_replicas, + bool nonblock) { struct folio_batch fbatch; pgoff_t start_index = start_offset >> PAGE_SHIFT; @@ -3728,7 +3737,13 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - folio_lock(folio); + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } + offset = folio_data_offset(folio, max(folio_pos(folio), start_offset), min_replicas); @@ -3793,7 +3808,7 @@ err: if (next_data > offset) next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data, 0); + offset, next_data, 0, false); if (next_data >= isize) return -ENXIO; @@ -3801,15 +3816,18 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas) +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) { struct folio *folio; struct bch_folio *s; unsigned i, sectors; bool ret = true; - folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); + if (folio == ERR_PTR(-EAGAIN)) + return -EAGAIN; if (IS_ERR_OR_NULL(folio)) return true; @@ -3837,31 +3855,44 @@ unlock: static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, loff_t end_offset, - unsigned min_replicas) + unsigned min_replicas, + bool nonblock) { struct address_space *mapping = vinode->i_mapping; loff_t offset = start_offset; while (offset < end_offset && - !folio_hole_offset(mapping, &offset, min_replicas)) + !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) ; return min(offset, end_offset); } -static void bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas) +static int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) { - *hole_start = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas) >> 9; + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; if (*hole_start == *hole_end) - return; + return 0; - *hole_end = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas) >> 9; + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; } static loff_t bch2_seek_hole(struct file *file, u64 offset) @@ -3893,12 +3924,12 @@ retry: BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0); + offset, MAX_LFS_FILESIZE, 0, false); break; } else if (!bkey_extent_is_data(k.k)) { next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0); + k.k->p.offset << 9, 0, false); if (next_hole < k.k->p.offset << 9) break; -- cgit From e691b391f02b2ddef1a784ea2d4cd3f46bb6a62a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Aug 2023 10:04:05 -0400 Subject: bcachefs: Add logging to bch2_inode_peek() & related Add error messages when we fail to lookup an inode, and also add a few missing bch2_err_class() calls. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/fs.c | 9 +++++++-- fs/bcachefs/inode.c | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3b39597a677a..30b3d7b9f9dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -294,8 +294,8 @@ do { \ #define bch_err_fn(_c, _ret) \ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -#define bch_err_msg(_c, _ret, _msg) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) +#define bch_err_msg(_c, _ret, _msg, ...) \ + bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) #define bch_verbose(c, fmt, ...) 
\ do { \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e1824bdffdf8..695b8bc55590 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -203,7 +203,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (ret) { iget_failed(&inode->v); - return ERR_PTR(ret); + return ERR_PTR(bch2_err_class(ret)); } mutex_lock(&c->vfs_inodes_lock); @@ -1000,11 +1000,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode_inum(inode), ctx); + ret = bch2_readdir(c, inode_inum(inode), ctx); + if (ret) + bch_err_fn(c, ret); + + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 294966e42850..e0d416553bf0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -348,6 +348,8 @@ int bch2_inode_peek(struct btree_trans *trans, return 0; err: bch2_trans_iter_exit(trans, iter); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); return ret; } -- cgit From 58705da67aedc3f68db322d2529ff52a5cdec0fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Aug 2023 10:02:41 -0400 Subject: bcachefs: kill EBUG_ON() redefinition in bkey.c our debug mode assertions in bkey.c haven't been getting run, whoops Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 34a8fe48581c..67f205992629 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -7,14 +7,6 @@ #include "bset.h" #include "util.h" -#undef EBUG_ON - -#ifdef DEBUG_BKEYS -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; void bch2_bkey_packed_to_binary_text(struct printbuf *out, @@ -601,7 +593,14 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } - EBUG_ON(bch2_bkey_format_validate(&ret)); +#ifdef CONFIG_BCACHEFS_DEBUG + { + struct printbuf buf = PRINTBUF; + + BUG_ON(bch2_bkey_format_validate(&ret, &buf)); + printbuf_exit(&buf); + } +#endif return ret; } -- cgit From e08e63e44e65761e6eb53fcd568d194f607daa61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Aug 2023 10:04:37 -0400 Subject: bcachefs: BCH_COMPAT_bformat_overflow_done no longer required Awhile back, we changed bkey_format generation to ensure that the packed representation could never represent fields larger than the unpacked representation. This was to ensure that bkey_packed_successor() always gave a sensible result, but in the current code bkey_packed_successor() is only used in a debug assertion - not for anything important. This kills the requirement that we've gotten rid of those weird bkey formats, and instead changes the assertion to check if we're dealing with an old weird bkey format. 
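For context, a worked example of the per-field check this boils down to (the numbers are made up, purely to show the arithmetic): a field packed in some number of bits on top of field_offset can represent field_offset .. field_offset + 2^bits - 1, and the format is only sane if that range fits in the unpacked field width:

	/* illustrative numbers: unpacked width 32 bits, packed width 8 bits */
	unsigned unpacked_bits = 32, bits = 8;
	u64 field_offset = (1ULL << 32) - 100;

	u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));   /* 2^32 - 1 */
	u64 packed_max   = bits ? ~((~0ULL << 1) << (bits - 1)) : 0; /* 255 */

	/* 255 + (2^32 - 100) = 2^32 + 155 > 2^32 - 1, so this format fails the check */
	bool too_big = packed_max + field_offset < packed_max ||     /* addition wrapped */
		       packed_max + field_offset > unpacked_max;

With this patch the check is only enforced as an error when the superblock's version_min is at least bcachefs_metadata_version_snapshot (or when there is no filesystem to consult), per the bch2_bkey_format_invalid() hunk below; older formats that fail it are tolerated, and the debug assertion merely notes them.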
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 54 +++++++++++++++++++++++++++++++++------------- fs/bcachefs/bkey.h | 9 +++++++- fs/bcachefs/bkey_methods.h | 6 ------ fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/recovery.c | 7 ------ fs/bcachefs/super.c | 1 + 6 files changed, 49 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 67f205992629..d6960e259c80 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -424,6 +424,24 @@ static bool bkey_packed_successor(struct bkey_packed *out, return false; } + +static bool bkey_format_has_too_big_fields(const struct bkey_format *f) +{ + for (unsigned i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) + return true; + } + + return false; +} #endif /* @@ -504,7 +522,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0); + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); } #endif @@ -597,14 +616,17 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) { struct printbuf buf = PRINTBUF; - BUG_ON(bch2_bkey_format_validate(&ret, &buf)); + BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); printbuf_exit(&buf); } #endif return ret; } -int bch2_bkey_format_validate(struct bkey_format *f, struct printbuf *err) +int bch2_bkey_format_invalid(struct bch_fs *c, + struct bkey_format *f, + enum bkey_invalid_flags flags, + struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; @@ -619,18 +641,20 @@ int bch2_bkey_format_validate(struct bkey_format *f, struct printbuf *err) * unpacked format: */ for (i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) { - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, field_offset, unpacked_max); - return -BCH_ERR_invalid; + if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) { + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, field_offset, unpacked_max); + return -BCH_ERR_invalid; + } } bits += f->bits_per_field[i]; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 644caa2b2b25..51969a46265e 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,6 +9,12 @@ #include "util.h" #include "vstructs.h" +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + #if 0 /* @@ -769,7 +775,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -int bch2_bkey_format_validate(struct bkey_format *, struct printbuf *); +int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, + enum bkey_invalid_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index d7b63769068c..668f595e2fcf 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -13,12 +13,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), -}; - /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a3da5b4bcd21..cba3c081b1d0 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -779,7 +779,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - btree_err_on(bch2_bkey_format_validate(&bn->format, &buf1), + btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index edc9830d8163..35b67c544a6a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1322,12 +1322,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); - ret = -EINVAL; - goto err; - } - if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) check_version_upgrade(c); @@ -1527,7 +1521,6 @@ use_clean: mutex_unlock(&c->sb_lock); if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c17fdcd08390..7ec22631cdd3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1996,6 +1996,7 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM +__maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); -- cgit From a8712967bf07dace4d6660bdcfe939b9043f1f16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Aug 2023 10:57:25 -0400 Subject: bcachefs: Improve journal_entry_err_msg() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 98 +++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f861ae2f176a..86ca2c2d9f72 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -208,30 +208,38 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_BAD 7 static void journal_entry_err_msg(struct printbuf *out, + u32 version, struct jset *jset, struct jset_entry *entry) { - prt_str(out, "invalid journal entry "); - if (entry) - prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); - - if (!jset) - prt_printf(out, "in superblock"); - else if (!entry) - prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); - else - prt_printf(out, "at offset %zi/%u seq %llu", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + prt_str(out, "invalid journal entry, version="); + bch2_version_to_text(out, version); + + if (entry) { + prt_str(out, " type="); + prt_str(out, bch2_jset_entry_types[entry->type]); + } + + if (!jset) { + prt_printf(out, " in superblock"); + } else { + + prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); + + if (entry) + prt_printf(out, " offset=%zi/%u", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + } + prt_str(out, ": "); } -#define journal_entry_err(c, jset, entry, msg, ...) \ +#define journal_entry_err(c, version, jset, entry, msg, ...) 
\ ({ \ struct printbuf buf = PRINTBUF; \ \ - journal_entry_err_msg(&buf, jset, entry); \ + journal_entry_err_msg(&buf, version, jset, entry); \ prt_printf(&buf, msg, ##__VA_ARGS__); \ \ switch (write) { \ @@ -251,8 +259,8 @@ static void journal_entry_err_msg(struct printbuf *out, true; \ }) -#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ - ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ + ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 @@ -267,7 +275,7 @@ static int journal_validate_key(struct bch_fs *c, struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { + if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; @@ -275,7 +283,7 @@ static int journal_validate_key(struct bch_fs *c, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), - c, jset, entry, + c, version, jset, entry, "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); @@ -283,7 +291,7 @@ static int journal_validate_key(struct bch_fs *c, } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, jset, entry, + c, version, jset, entry, "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -298,11 +306,7 @@ static int journal_validate_key(struct bch_fs *c, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", - bch2_jset_entry_types[entry->type], - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + journal_entry_err_msg(&buf, version, jset, entry); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -378,7 +382,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, - c, jset, entry, + c, version, jset, entry, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -425,7 +429,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -451,7 +455,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -461,7 +465,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > le64_to_cpu(bl_entry->end), - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -492,7 +496,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, jset, 
entry, + c, version, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -525,7 +529,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, jset, entry, + c, version, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -556,13 +560,13 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, jset, entry, "bad size")) { + c, version, jset, entry, "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, jset, entry, "bad rw")) { + c, version, jset, entry, "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -593,7 +597,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, jset, entry, "bad size (%u < %u)", + c, version, jset, entry, "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -602,13 +606,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, jset, entry, "bad dev")) { + c, version, jset, entry, "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, jset, entry, "bad pad")) { + c, version, jset, entry, "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -714,19 +718,19 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { struct jset_entry *entry; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, jset, entry, + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), + c, version, jset, entry, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } ret = bch2_journal_entry_validate(c, jset, entry, - le32_to_cpu(jset->version), - JSET_BIG_ENDIAN(jset), write); + version, JSET_BIG_ENDIAN(jset), write); if (ret) break; } @@ -746,7 +750,8 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -757,7 +762,7 @@ static int jset_validate(struct bch_fs *c, } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, jset, NULL, + c, version, jset, NULL, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -767,7 +772,7 @@ static int jset_validate(struct bch_fs *c, /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, jset, NULL, + c, version, jset, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { @@ -795,7 +800,8 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -810,7 +816,7 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, jset, NULL, + c, version, jset, NULL, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) @@ -1149,7 +1155,7 @@ int bch2_journal_read(struct bch_fs *c, } if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, &i->j, NULL, + c, le32_to_cpu(i->j.version), &i->j, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(i->j.last_seq), le64_to_cpu(i->j.seq))) -- cgit From c4e382e234778197c95c5553024e0b3f93103382 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Aug 2023 12:43:31 -0400 Subject: bcachefs: Convert journal validation to bkey_invalid_flags This fixes a bug where we were already passing bkey_invalid_flags around, but treating the parameter as just read/write - so the compat code wasn't being run correctly. 
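The shape of the fix, in miniature (handle_invalid_key() is an illustrative
name, not a function in this patch; READ, WRITE and the flag bits are the real
constants): the error macros used to switch directly on their int parameter,
so once callers started passing a full flag word, a value such as
READ|BKEY_INVALID_JOURNAL matched neither READ nor WRITE and the read-side
handling could be skipped. The validate hooks now take the whole
enum bkey_invalid_flags and derive the direction from its low bit:

  static void handle_invalid_key(enum bkey_invalid_flags flags)
  {
          int write = flags & BKEY_INVALID_WRITE;        /* READ is 0, WRITE is 1 */

          switch (write) {
          case READ:
                  /* reading the journal: log it and fix the entry up, fsck-style */
                  break;
          case WRITE:
                  /* about to write: this is a bug, fail hard */
                  break;
          }
  }

journal_entry_err() now does the same extraction internally
(switch (flags & BKEY_INVALID_WRITE)), so the fsck-style fixups run on reads
again.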
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 108 +++++++++++++++++++++++++++-------------------- fs/bcachefs/journal_io.h | 3 +- 2 files changed, 64 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 86ca2c2d9f72..798d4c8c7200 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -242,7 +242,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&buf, version, jset, entry); \ prt_printf(&buf, msg, ##__VA_ARGS__); \ \ - switch (write) { \ + switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ mustfix_fsck_err(c, "%s", buf.buf); \ break; \ @@ -269,8 +269,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset_entry *entry, unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { + int write = flags & BKEY_INVALID_WRITE; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; @@ -316,6 +318,7 @@ static int journal_validate_key(struct bch_fs *c, __btree_node_type(level, btree_id), write, &buf); mustfix_fsck_err(c, "%s", buf.buf); + BUG(); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -334,9 +337,10 @@ fsck_err: } static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; @@ -345,7 +349,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - write|BKEY_INVALID_JOURNAL); + flags|BKEY_INVALID_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -373,9 +377,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; int ret = 0; @@ -396,7 +401,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, } return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, write); + version, big_endian, flags); fsck_err: return ret; } @@ -408,9 +413,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { /* obsolete, don't care: */ return 0; @@ -422,9 +428,10 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { int ret = 0; @@ -447,9 +454,10 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs } static int 
journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -486,9 +494,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ } static int journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -518,9 +527,10 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -550,9 +560,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -585,9 +596,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -645,9 +657,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return 0; } @@ -662,9 +675,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -678,7 +692,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -695,11 +710,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { int bch2_journal_entry_validate(struct bch_fs *c, struct jset 
*jset, struct jset_entry *entry, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, write) + version, big_endian, flags) : 0; } @@ -715,7 +731,7 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - int write) + enum bkey_invalid_flags flags) { struct jset_entry *entry; unsigned version = le32_to_cpu(jset->version); @@ -730,7 +746,7 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, } ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), write); + version, JSET_BIG_ENDIAN(jset), flags); if (ret) break; } @@ -741,7 +757,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - int write) + enum bkey_invalid_flags flags) { unsigned version; int ret = 0; @@ -780,7 +796,7 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_BAD; } - ret = jset_validate_entries(c, jset, write); + ret = jset_validate_entries(c, jset, flags); fsck_err: return ret; } @@ -793,7 +809,7 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) @@ -1133,7 +1149,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; i = *_i; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 8801e98104bd..a88d097b13f1 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, jset_entry_for_each_key(entry, k) int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); -- cgit From 6fe893eade864665c0956a2ac2eff78b86dc8145 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Aug 2023 20:44:56 -0400 Subject: bcachefs: Fix for sb buffer being misaligned On old kernels, kmalloc() may return an allocation that's not naturally aligned - this resulted in a bug where we allocated a bio with not enough biovecs. Fix this by using buf_pages(). 
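The arithmetic behind the fix, for reference: a buffer that is not page-aligned
touches one more page than its length alone suggests, so sizing the bio with
DIV_ROUND_UP(new_buffer_size, PAGE_SIZE) can come up one biovec short (e.g. an
8 KiB buffer starting mid-page spans three pages, not two). buf_pages() folds
the in-page offset into the count; roughly (sb_nr_bvecs() is an illustrative
wrapper, not part of the patch):

  static inline unsigned sb_nr_bvecs(void *buf, unsigned long len)
  {
          /* pages actually touched by [buf, buf + len) */
          return DIV_ROUND_UP(offset_in_page(buf) + len, PAGE_SIZE);
  }

The patch also reallocates the superblock buffer before allocating the bio, so
the page count is computed against the buffer the bio will actually describe.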
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index beb00f799fe4..a58b9750b6ce 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -203,8 +203,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (dynamic_fault("bcachefs:add:super_realloc")) return -BCH_ERR_ENOMEM_sb_realloc_injected; + new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); + if (!new_sb) + return -BCH_ERR_ENOMEM_sb_buf_realloc; + + sb->sb = new_sb; + if (sb->have_bio) { - unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); + unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) @@ -216,11 +222,6 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); - if (!new_sb) - return -BCH_ERR_ENOMEM_sb_buf_realloc; - - sb->sb = new_sb; sb->buffer_size = new_buffer_size; return 0; -- cgit From 1e81f89b020758fb424f8bb0f13405706d29dfc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Aug 2023 12:04:05 -0400 Subject: bcachefs: Fix assorted checkpatch nits Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_types.h | 4 ++-- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_io.h | 4 ++-- fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/buckets.c | 1 + fs/bcachefs/chardev.h | 2 +- fs/bcachefs/checksum.c | 12 +++++++----- fs/bcachefs/compress.c | 3 ++- fs/bcachefs/extents.c | 12 ++++++------ fs/bcachefs/fs-io.c | 2 +- fs/bcachefs/io.c | 1 + fs/bcachefs/io.h | 2 +- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/recovery.c | 4 ++-- fs/bcachefs/super-io.c | 4 +++- fs/bcachefs/super-io.h | 1 + fs/bcachefs/util.c | 11 +++++++---- fs/bcachefs/util.h | 8 ++++++-- fs/bcachefs/varint.c | 1 + 22 files changed, 55 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 804a843f23c1..b91b7a461056 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -105,7 +105,7 @@ struct write_point { struct dev_stripe_state stripe; u64 sectors_allocated; - } __attribute__((__aligned__(SMP_CACHE_BYTES))); + } __aligned(SMP_CACHE_BYTES); struct { struct work_struct index_update_work; @@ -116,7 +116,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; - } __attribute__((__aligned__(SMP_CACHE_BYTES))); + } __aligned(SMP_CACHE_BYTES); }; struct write_point_specifier { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3f5b4d8ee4a5..1a749d4be5b9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -535,7 +535,7 @@ int bch2_check_topology(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 0cadf651e7cf..cd99bbb00a5a 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -143,8 +143,8 @@ enum btree_write_flags { __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, __BTREE_WRITE_ALREADY_STARTED, }; -#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) -#define 
BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) +#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) +#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ecc123b2d1b3..feb23e9c2a1a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1008,7 +1008,7 @@ retry_all: /* * We used to assert that all paths had been traversed here * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since - * path->Should_be_locked is not set yet, we we might have unlocked and + * path->should_be_locked is not set yet, we might have unlocked and * then failed to relock a path - that's fine. */ err: @@ -2738,9 +2738,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, __bch2_btree_iter_flags(trans, btree_id, flags), diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 2281140a288c..901c42b57c35 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -268,10 +268,10 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags|BTREE_ITER_INTENT, type); - struct bkey_i *ret = unlikely(IS_ERR(k.k)) + struct bkey_i *ret = IS_ERR(k.k) ? 
ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (unlikely(IS_ERR(ret))) + if (IS_ERR(ret)) bch2_trans_iter_exit(trans, iter); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7bb7f0caee45..c02c8c917a29 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1924,6 +1924,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + if (ret) bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h index 3a4890d39ff9..0f563ca53c36 100644 --- a/fs/bcachefs/chardev.h +++ b/fs/bcachefs/chardev.h @@ -17,7 +17,7 @@ int __init bch2_chardev_init(void); static inline long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user * arg) { - return -ENOSYS; + return -ENOTTY; } static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index bf03d42c6138..76cf2e70f019 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -265,9 +265,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + bch2_checksum_update(&state, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -287,10 +288,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; crypto_shash_update(desc, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -427,8 +428,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" + bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", + __func__, crc_old.csum.hi, crc_old.csum.lo, merged.hi, diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index c9ca7cce55f8..6b17f7cc5860 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -643,7 +643,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) static u64 compression_opt_to_feature(unsigned v) { unsigned type = bch2_compression_decode(v).type; - return 1ULL << bch2_compression_opt_to_feature[type]; + + return BIT_ULL(bch2_compression_opt_to_feature[type]); } int bch2_fs_compress_init(struct bch_fs *c) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7a3f42f3bc5b..d7f74db4c83e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -517,7 +517,7 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); break; case BCH_EXTENT_ENTRY_crc64: set_common_fields(dst->crc64, src); @@ -915,11 +915,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k1.k, 
ptrs1, p1, entry1) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return true; + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; return false; } else { diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 917ad1c8f46d..40bfd0b25d9d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2867,7 +2867,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (unlikely(IS_ERR_OR_NULL(folio))) { + if (IS_ERR_OR_NULL(folio)) { ret = -ENOMEM; goto out; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 5bacc6a9dd8f..f42d9da2e16e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -2435,6 +2435,7 @@ static void __bch2_read_endio(struct work_struct *work) if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 1476380d5fbf..831e3f1b7e41 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -52,7 +52,7 @@ enum __bch_write_flags { }; enum bch_write_flags { -#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), BCH_WRITE_FLAGS() #undef x }; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 80a612c0577f..055920c26da6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -63,6 +63,7 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) INIT_LIST_HEAD(&p->list[i]); INIT_LIST_HEAD(&p->flushed); @@ -514,8 +515,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != - -BCH_ERR_journal_res_get_blocked|| + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 798d4c8c7200..42c9700e6d26 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1053,6 +1053,7 @@ found: bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } ja->sectors_free = 0; @@ -1629,7 +1630,6 @@ static void do_journal_write(struct closure *cl) } continue_at(cl, journal_write_done, c->io_complete_wq); - return; } static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 8de83e103751..9a2a534915dd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -345,7 +345,7 @@ static inline bool __journal_pin_drop(struct journal *j, list_del_init(&pin->list); /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if + * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: */ return 
atomic_dec_and_test(&pin_list->count) && diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 35b67c544a6a..5dbe1b273b71 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -648,7 +648,7 @@ static int bch2_journal_replay(struct bch_fs *c) move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; - keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); if (!keys_sorted) return -BCH_ERR_ENOMEM_journal_replay; @@ -1403,7 +1403,7 @@ use_clean: } c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1;; + c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a58b9750b6ce..405ea74d0b83 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -553,7 +553,9 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - (dst_f ? le32_to_cpu(dst_f->u64s) : 0); if (d > 0) { - int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); + int ret = bch2_sb_realloc(dst_handle, + le32_to_cpu(dst_handle->sb->u64s) + d); + if (ret) return ret; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 904adea6a0da..6e59b0148f8d 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -58,6 +58,7 @@ struct bch_sb_field_ops { static inline __le64 bch2_sb_magic(struct bch_fs *c) { __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); return ret; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 6374d8aa9afc..e4f21fcae944 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -216,6 +216,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) while ((p = strsep(&s, ","))) { int flag = match_string(list, -1, p); + if (flag < 0) { ret = -1; break; @@ -797,9 +798,10 @@ void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) struct bvec_iter iter; __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_atomic(bv.bv_page); + void *dstp = kmap_local_page(bv.bv_page); + memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_atomic(dstp); + kunmap_local(dstp); src += bv.bv_len; } @@ -811,9 +813,10 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) struct bvec_iter iter; __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_atomic(bv.bv_page); + void *srcp = kmap_local_page(bv.bv_page); + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_atomic(srcp); + kunmap_local(srcp); dst += bv.bv_len; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 8e37ce01a728..3cec6171c58f 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -467,8 +467,10 @@ struct bch_pd_controller { s64 last_change; s64 last_target; - /* If true, the rate will not increase if bch2_ratelimit_delay() - * is not being called often enough. */ + /* + * If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. 
+ */ bool backpressure; }; @@ -604,6 +606,7 @@ static inline void __memcpy_u64s(void *dst, const void *src, { #ifdef CONFIG_X86_64 long d0, d1, d2; + asm volatile("rep ; movsq" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (u64s), "1" (dst), "2" (src) @@ -680,6 +683,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, #ifdef CONFIG_X86_64 long d0, d1, d2; + asm volatile("std ;\n" "rep ; movsq\n" "cld ;\n" diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index ef030fc02448..2a2ab86ed6e1 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -59,6 +59,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) if (likely(bytes < 9)) { __le64 v_le = 0; + memcpy(&v_le, in, bytes); v = le64_to_cpu(v_le); v >>= bytes; -- cgit From dbbfca9f41e86903501dded3fd494e1a56f3c310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Aug 2023 18:18:21 -0400 Subject: bcachefs: Split up fs-io.[ch] fs-io.c is too big - time for some reorganization - fs-dio.c: direct io - fs-pagecache.c: pagecache data structures (bch_folio), utility code Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 + fs/bcachefs/fs-io-buffered.c | 1098 +++++++++++++++ fs/bcachefs/fs-io-buffered.h | 27 + fs/bcachefs/fs-io-direct.c | 679 ++++++++++ fs/bcachefs/fs-io-direct.h | 16 + fs/bcachefs/fs-io-pagecache.c | 780 +++++++++++ fs/bcachefs/fs-io-pagecache.h | 176 +++ fs/bcachefs/fs-io.c | 2975 ++--------------------------------------- fs/bcachefs/fs-io.h | 166 ++- fs/bcachefs/fs.c | 3 + fs/bcachefs/super.c | 8 +- 11 files changed, 3042 insertions(+), 2889 deletions(-) create mode 100644 fs/bcachefs/fs-io-buffered.c create mode 100644 fs/bcachefs/fs-io-buffered.h create mode 100644 fs/bcachefs/fs-io-direct.c create mode 100644 fs/bcachefs/fs-io-direct.h create mode 100644 fs/bcachefs/fs-io-pagecache.c create mode 100644 fs/bcachefs/fs-io-pagecache.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 55b6d85d55c3..13cacf2d8bfb 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -38,6 +38,9 @@ bcachefs-y := \ fs-common.o \ fs-ioctl.o \ fs-io.o \ + fs-io-buffered.o \ + fs-io-direct.o \ + fs-io-pagecache.o \ fsck.o \ inode.o \ io.o \ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c new file mode 100644 index 000000000000..102c70887f76 --- /dev/null +++ b/fs/bcachefs/fs-io-buffered.c @@ -0,0 +1,1098 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io.h" + +#include +#include +#include + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + if (!bio->bi_status) { + folio_mark_uptodate(fi.folio); + } else { + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); + } + folio_unlock(fi.folio); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + unsigned idx; + folios folios; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + struct folio **fi; + int ret; + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + + ret = 
bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; + + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); + } + + return 0; +} + +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) +{ + if (iter->idx >= iter->folios.nr) + return NULL; + return iter->folios.data[iter->idx]; +} + +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; +} + +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) +{ + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { + struct folio *folio = readpage_iter_peek(iter); + int ret; + + if (folio) { + readpage_iter_advance(iter); + } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + + if (!get_more) + break; + + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) + break; + + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) + break; + + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); + break; + } + + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); + if (ret) { + __bch2_folio_release(folio); + folio_put(folio); + break; + } + + folio_put(folio); + } + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + } + + return bch2_trans_relock(trans); +} + +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, + struct readpages_iter *readpages_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = 
bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + + bch2_bkey_buf_exit(&sk, c); +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct btree_trans trans; + struct folio *folio; + struct readpages_iter readpages_iter; + int ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); + + bch2_pagecache_add_get(inode); + + while ((folio = readpage_iter_peek(&readpages_iter))) { + unsigned n = min_t(unsigned, + readpages_iter.folios.nr - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), + opts); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(&trans, rbio, inode_inum(inode), + &readpages_iter); + bch2_trans_unlock(&trans); + } + + bch2_pagecache_add_put(inode); + + bch2_trans_exit(&trans); + darray_exit(&readpages_iter.folios); +} + +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) +{ + struct btree_trans trans; + + bch2_folio_create(folio, __GFP_NOFAIL); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_init(&trans, c, 0, 0); + bchfs_read(&trans, rbio, inum, NULL); + bch2_trans_exit(&trans); +} + +static void bch2_read_single_folio_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; + + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); 
+ wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + folio_mark_uptodate(folio); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + int ret; + + ret = bch2_read_single_folio(folio, folio->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + +/* writepages: */ + +struct bch_writepage_io { + struct bch_inode_info *inode; + + /* must be last: */ + struct bch_write_op op; +}; + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; +} + +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct folio_iter fi; + unsigned i; + + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? 
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(fi.folio); + } + + bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, + unsigned nr_replicas) +{ + struct bch_write_op *op; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_KERNEL, + &c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + + w->io->inode = inode; + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; + loff_t i_size = i_size_read(&inode->v); + int ret; + + EBUG_ON(!folio_test_uptodate(folio)); + + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) + goto do_io; + + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); + return 0; + } + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); +do_io: + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + + folio_unlock(folio); + + offset = 0; + while (1) { + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) + offset++; + + if (offset == f_sectors) + break; + + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; + sectors++; + } + BUG_ON(!sectors); + + sector = folio_sector(folio) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, sectors << 9) || + w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= + (BIO_MAX_VECS * PAGE_SIZE) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(folio); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + kfree(w.tmp); + return bch2_err_class(ret); +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + 
struct page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; + int ret = -ENOMEM; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_folio_reservation_init(c, inode, res); + *fsdata = res; + + bch2_pagecache_add_get(inode); + + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) + goto err_unlock; + + if (folio_test_uptodate(folio)) + goto out; + + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } + + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } +readpage: + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto err; +out: + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; + + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); + if (ret) { + if (!folio_test_uptodate(folio)) { + /* + * If the folio hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the folio is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = &folio->page; + return 0; +err: + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; +err_unlock: + bch2_pagecache_add_put(inode); + kfree(res); + *fsdata = NULL; + return bch2_err_class(ret); +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); + + lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); + + if (unlikely(copied < len && !folio_test_uptodate(folio))) { + /* + * The folio needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); + + inode->ei_last_dirtied = (unsigned long) current; + } + + folio_unlock(folio); + folio_put(folio); + bch2_pagecache_add_put(inode); + + bch2_folio_reservation_put(c, inode, res); + kfree(res); + + return copied; +} + +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct 
address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + u64 end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; + int ret = 0; + + BUG_ON(!len); + + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); + + ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; + + BUG_ON(!folios.nr); + + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + } else { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + } + + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + if (ret) + goto out; + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); + if (unlikely(ret)) { + folios_trunc(&folios, fi); + if (!folios.nr) + goto out; + + end = min(end, folio_end_pos(darray_last(folios))); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (mapping_writably_mapped(mapping)) + darray_for_each(folios, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); + break; + } + + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); + break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (!copied) + goto out; + + end = pos + copied; + + spin_lock(&inode->v.i_lock); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); + + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + inode->ei_last_dirtied = (unsigned long) current; +out: + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); + } + + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. 
+ */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&folios); + bch2_folio_reservation_put(c, inode, &res); + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(inode); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(inode); + + return written ? 
written : ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: + return bch2_err_class(ret); +} + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) +{ + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fs_io_buffered_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h new file mode 100644 index 000000000000..a6126ff790e6 --- /dev/null +++ b/fs/bcachefs/fs-io-buffered.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_BUFFERED_H +#define _BCACHEFS_FS_IO_BUFFERED_H + +#ifndef NO_BCACHEFS_FS + +int bch2_read_single_folio(struct folio *, struct address_space *); +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *); +int bch2_fs_fs_io_buffered_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c new file mode 100644 index 000000000000..2b29abd24d56 --- /dev/null +++ b/fs/bcachefs/fs-io-direct.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io.h" + +#include +#include +#include + +/* O_DIRECT reads */ + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + bool should_dirty; + struct bch_read_bio rbio; +}; + +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void 
bch2_direct_IO_read_split_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + bch2_direct_IO_read_endio(bio); + bio_check_or_release(bio, should_dirty); +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ + dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + + if (dio->should_dirty) + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(inode); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(inode); + } +out: + return bch2_err_class(ret); +} + +/* O_DIRECT writes */ + +struct dio_write { + struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + extending:1, + sync:1, + flush:1, + free_iov:1; + struct quota_res quota_res; + u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bch_write_op op; +}; + +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return err ? 
false : ret; +} + +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) { + dio->op.error = ret; + } else { + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, + &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + 
i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; + struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + while (1) { + iter_count = dio->iter.count; + + EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + + if (unlikely(ret < 0)) + goto err; + + if (unlikely(dropped_locks)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + ret = -EFAULT; + goto err; + } + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = sync + ? 
NULL + : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + + if (!sync) + return -EIOCBQUEUED; + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); + } +out: + return bch2_dio_write_done(dio); +err: + dio->op.error = ret; + + bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); + goto out; +} + +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +{ + struct mm_struct *mm = dio->mm; + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; + ssize_t ret; + + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; + + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(inode); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + dio->req = req; + dio->mapping = mapping; + dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->extending = extending; + dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { + ret = 
bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } + + ret = bch2_dio_write_loop(dio); +err: + if (locked) + inode_unlock(&inode->v); + return ret; +err_put_bio: + bch2_pagecache_block_put(inode); + bio_put(bio); + inode_dio_end(&inode->v); + goto err; +} + +void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); +} + +int bch2_fs_fs_io_direct_init(struct bch_fs *c) +{ + if (bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h new file mode 100644 index 000000000000..814621ec7f81 --- /dev/null +++ b/fs/bcachefs/fs-io-direct.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_DIRECT_H +#define _BCACHEFS_FS_IO_DIRECT_H + +#ifndef NO_BCACHEFS_FS +ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_direct_exit(struct bch_fs *); +int bch2_fs_fs_io_direct_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c new file mode 100644 index 000000000000..2c1ef13d9bcd --- /dev/null +++ b/fs/bcachefs/fs-io-pagecache.c @@ -0,0 +1,780 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "btree_iter.h" +#include "extents.h" +#include "fs-io.h" +#include "fs-io-pagecache.h" +#include "subvolume.h" + +#include +#include + +int bch2_filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 
0 : ret; +} + +/* pagecache_block must be held */ +int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +/* for newly allocated folios: */ +struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + struct bch_folio *s; + + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); + if (!s) + return NULL; + + spin_lock_init(&s->lock); + folio_attach_private(folio, s); + return s; +} + +struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); +} + +static unsigned bkey_to_sector_state(struct bkey_s_c k) +{ + if (bkey_extent_is_reservation(k)) + return SECTOR_reserved; + if (bkey_extent_is_allocation(k.k)) + return SECTOR_allocated; + return SECTOR_unallocated; +} + +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + bch2_folio_sector_set(folio, s, i, state); + } + + if (i == sectors) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **folios, unsigned nr_folios) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_folio *s; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx; + u32 snapshot; + bool need_set = false; + int ret; + + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + 
goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } + + if (folio_idx == nr_folios) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct folio_vec fv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); +} + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + 
bch2_folio_sector_set(folio, s, j, + folio_sector_reserve(s->s[j].state)); + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +int bch2_get_folio_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, bool check_enospc) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < sectors; i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +void bch2_folio_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + BUG_ON(!s->uptodate); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } + + return 0; +} + +static void bch2_clear_folio_bits(struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_folio *s = bch2_folio(folio); + struct disk_reservation disk_res = { 0 }; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; + + if (!s) + return; + + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); + + for (i = 0; i < sectors; i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors -= s->s[i].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); + } + + bch2_disk_reservation_put(c, &disk_res); + + bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_folio_release(folio); +} + +void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio(folio); + 
unsigned i, dirty_sectors = 0; + + WARN_ON((u64) folio_pos(folio) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + BUG_ON(!s->uptodate); + + spin_lock(&s->lock); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); + } + + spin_unlock(&s->lock); + + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(inode)) + goto got_lock; + + bch2_pagecache_block_put(fdm_host); + + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); + + bch2_pagecache_block_get(fdm_host); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + + bch2_pagecache_add_get(inode); +got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(inode); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; + vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get + * a bch2_write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(inode); + + folio_lock(folio); + isize = i_size_read(&inode->v); + + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); + ret = VM_FAULT_SIGBUS; + goto out; + } + + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); + + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + bch2_pagecache_add_put(inode); + sb_end_pagefault(inode->v.i_sb); + + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + if (offset || length < folio_size(folio)) + return; + + bch2_clear_folio_bits(folio); +} + +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + if 
(folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; + + bch2_clear_folio_bits(folio); + return true; +} + +/* fseek: */ + +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; +} + +loff_t bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + loff_t ret; + int offset; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } + + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); + folio_unlock(folio); + folio_batch_release(&fbatch); + return ret; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) +{ + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + bool ret = true; + + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); + if (folio == ERR_PTR(-EAGAIN)) + return -EAGAIN; + if (IS_ERR_OR_NULL(folio)) + return true; + + s = bch2_folio(folio); + if (!s) + goto unlock; + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + + *offset = folio_end_pos(folio); + ret = false; +unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + + while (offset < end_offset && + !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) + ; + + return min(offset, end_offset); +} + +int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) +{ + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; + + if (*hole_start == *hole_end) + return 0; + + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h new file mode 100644 index 000000000000..a2222ad586e9 --- /dev/null +++ b/fs/bcachefs/fs-io-pagecache.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_PAGECACHE_H +#define _BCACHEFS_FS_IO_PAGECACHE_H + +#include + +typedef DARRAY(struct folio *) folios; + +int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, + u64, int, gfp_t, folios *); +int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +struct bch_folio_sector { + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; + + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; + + /* i_sectors: */ + enum bch_folio_sector_state state:8; +}; + +struct bch_folio { + spinlock_t lock; + atomic_t write_count; + /* + * Is the sector state up to date with the btree? 
+ * (Not the data itself) + */ + bool uptodate; + struct bch_folio_sector s[]; +}; + +/* Helper for when we need to add debug instrumentation: */ +static inline void bch2_folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +/* for newly allocated folios: */ +static inline void __bch2_folio_release(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static inline void bch2_folio_release(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) + : NULL; +} + +static inline struct bch_folio *bch2_folio(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + + return __bch2_folio(folio); +} + +struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); +struct bch_folio *bch2_folio_create(struct folio *, gfp_t); + +struct bch2_folio_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + +static inline void bch2_folio_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); +void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); +void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); + +int bch2_get_folio_disk_reservation(struct bch_fs *, + struct bch_inode_info *, + struct folio *, bool); + +void bch2_folio_reservation_put(struct bch_fs *, + struct bch_inode_info *, + struct bch2_folio_reservation *); +int bch2_folio_reservation_get(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +void bch2_set_folio_dirty(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); +loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); +int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); + +#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 40bfd0b25d9d..11a4919f30cd 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,7 +3,6 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -12,6 +11,8 @@ #include "extent_update.h" #include "fs.h" #include "fs-io.h" 
+#include "fs-io-buffered.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "journal.h" @@ -31,2742 +32,135 @@ #include #include #include -#include #include -static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define __bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. This breaks much of the range based processing in the - * buffered write path. - */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -typedef DARRAY(struct folio *) folios; - -static int filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - int fgp_flags, gfp_t gfp, - folios *folios) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) - break; - - BUG_ON(folios->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(folios, f); - } - - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return folios->nr ? 
0 : ret; -} - -struct nocow_flush { - struct closure *cl; - struct bch_dev *ca; - struct bio bio; -}; - -static void nocow_flush_endio(struct bio *_bio) -{ - - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); - bio_put(&bio->bio); -} - -static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) -{ - struct nocow_flush *bio; - struct bch_dev *ca; - struct bch_devs_mask devs; - unsigned dev; - - dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); - if (dev == BCH_SB_MEMBERS_MAX) - return; - - devs = inode->ei_devs_need_flush; - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) - ca = NULL; - rcu_read_unlock(); - - if (!ca) - continue; - - bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_FLUSH, - GFP_KERNEL, - &c->nocow_flush_bioset), - struct nocow_flush, bio); - bio->cl = cl; - bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -struct quota_res { - u64 sectors; -}; - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - unsigned loop:1, - extending:1, - sync:1, - flush:1, - free_iov:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -/* pagecache_block must be held */ -static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -/* quotas */ - -#ifdef CONFIG_BCACHEFS_QUOTA - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors 
> inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -/* page state: */ - -/* stored in page->private: */ - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -static const char * const 
bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -static inline void folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_has_private(folio) - ? 
(struct bch_folio *) folio_get_private(folio) - : NULL; -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -/* for newly allocated folios: */ -static void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -/* for newly allocated folios: */ -static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **folios, unsigned nr_folios) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; - u64 offset = folio_sector(folios[0]); - unsigned folio_idx; - u32 snapshot; - bool need_set = false; - int ret; - - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - folio_idx = 0; - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = folios[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - 
bch2_trans_exit(&trans); - - return ret; -} - -static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -static void mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -static void mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - i_sectors_acct(c, inode, NULL, i_sectors_delta); -} - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -static int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -static void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); - if (unlikely(ret)) - return ret; - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); - if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; - return ret; - } - } - - return 0; -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -static void bch2_set_folio_dirty(struct bch_fs *c, - struct 
bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - unsigned len; - loff_t isize; - vm_fault_t ret; - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * write_invalidate_inode_pages_range() - can drop this if/when we get - * a write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || folio_pos(folio) >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, 0, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < 
folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); - - iter->mapping = ractl->mapping; - - ret = filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; - - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; - u32 snapshot; - int ret = 0; - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - iter = (struct 
btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - sectors = min(sectors, k.k->size - offset_into_extent); - - if (readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - break; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct btree_trans trans; - struct folio *folio; - struct readpages_iter readpages_iter; - int ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); - - bch2_trans_init(&trans, c, 0, 0); - - bch2_pagecache_add_get(inode); - - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - opts); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(&trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(&trans); - } - - bch2_pagecache_add_put(inode); - - bch2_trans_exit(&trans); - darray_exit(&readpages_iter.folios); -} - -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - struct btree_trans trans; - - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - 
rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bch2_trans_init(&trans, c, 0, 0); - bchfs_read(&trans, rbio, inum, NULL); - bch2_trans_exit(&trans); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int bch2_read_single_folio(struct folio *folio, - struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); - rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; -}; - -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - folio_set_error(fi.folio); - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? 
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: - */ - i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? */ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
- */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kzalloc(sizeof(struct bch_folio_sector) * - f_sectors, __GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, sectors << 9) || - w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_VECS * PAGE_SIZE) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct 
page **pagep, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) - goto err_unlock; - - if (folio_test_uptodate(folio)) - goto out; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *pagep = &folio->page; - return 0; -err: - folio_unlock(folio); - folio_put(folio); - *pagep = NULL; -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *folios, struct folio **fi) -{ - while (folios->data + folios->nr > fi) { - struct folio *f = darray_pop(folios); - - folio_unlock(f); - folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space 
*mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios folios; - struct folio **fi, *f; - unsigned copied = 0, f_offset; - u64 end = pos + len, f_pos; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&folios); - - ret = filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &folios); - if (ret) - goto out; - - BUG_ON(!folios.nr); - - f = darray_first(folios); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(folios); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&folios, fi); - if (!folios.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(folios))); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(folios, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); - - if (!f_copied) { - folios_trunc(&folios, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&folios, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&folios, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(folios, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. - * Clean up the mapping before we return. 
- */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&folios); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -/* O_DIRECT reads */ - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static void bch2_dio_read_complete(struct closure *cl) -{ - struct dio_read *dio = container_of(cl, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - if ((offset|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - bio->bi_end_io = bch2_direct_IO_read_endio; - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. 
- */ - dio->should_dirty = iter_is_iovec(iter); - - goto start; - while (iter->count) { - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); - } - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_exit(&trans); - - return err ? 
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - - dio->free_iov = true; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static void bch2_dio_write_flush_done(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; -static noinline void bch2_dio_write_flush(struct dio_write *dio) +static void nocow_flush_endio(struct bio *_bio) { - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - - dio->flush = 0; - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); } -static __always_inline long bch2_dio_write_done(struct dio_write *dio) +void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) { - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; - if (dio->free_iov) - kfree(dio->iter.__iov); + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); + devs = inode->ei_devs_need_flush; + 
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); - if (ret < 0) - ret = bch2_err_class(ret); + if (!ca) + continue; - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); } - return ret; } -static __always_inline void bch2_dio_write_end(struct dio_write *dio) +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) { - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } + struct closure cl; - bio_release_pages(bio, false); + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); + return 0; } -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE); - } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); +/* i_size updates: */ - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; -} +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +static int inode_set_size(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct mm_struct *mm = dio->mm; + struct inode_new_size *s = p; - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); + return 0; } -static void bch2_dio_write_loop_async(struct bch_write_op *op) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } -static noinline -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) { - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err; - - if (unlikely((req->ki_pos|iter->count) & 
(block_bytes(c) - 1))) - goto err; - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); - - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->free_iov = false; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; - } - - ret = bch2_dio_write_loop(dio); -err: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); - goto err; -} + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + inode->v.i_blocks += sectors; -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); +#endif } /* fsync: */ @@ -2908,10 +302,10 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, s->s[i].nr_replicas = 0; i_sectors_delta -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, SECTOR_unallocated); + bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); } - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); /* * Caller needs to know whether this folio will be written out by @@ -3102,7 +496,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, U64_MAX, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, @@ -3156,7 +550,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len ret = bch2_fpunch(c, inode_inum(inode), block_start >> 9, block_end >> 9, 
&i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); @@ -3207,7 +601,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, new_size = inode->v.i_size + shift; - ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -3223,7 +617,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) return ret; @@ -3444,10 +838,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); drop_locks_do(&trans, - (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -3460,7 +854,7 @@ bkey_err: bch2_fpunch_at(&trans, &iter, inode_inum(inode), end_sector, &i_sectors_delta); - i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); bch2_quota_reservation_put(c, inode, "a_res); } @@ -3654,7 +1048,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, aligned_len = round_up((u64) len, block_bytes(c)); - ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, pos_dst, pos_dst + len - 1); if (ret) goto err; @@ -3666,7 +1060,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, file_update_time(file_dst); - mark_pagecache_unallocated(src, pos_src >> 9, + bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, @@ -3682,7 +1076,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, */ ret = min((u64) ret << 9, (u64) len); - i_sectors_acct(c, dst, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); spin_lock(&dst->v.i_lock); if (pos_dst + ret > dst->v.i_size) @@ -3701,68 +1095,6 @@ err: /* fseek: */ -static int folio_data_offset(struct folio *folio, loff_t pos, - unsigned min_replicas) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - if (s) - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty && - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) - return i << SECTOR_SHIFT; - - return -1; -} - -static loff_t bch2_seek_pagecache_data(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct folio_batch fbatch; - pgoff_t start_index = start_offset >> PAGE_SHIFT; - pgoff_t end_index = end_offset >> PAGE_SHIFT; - pgoff_t index = start_index; - unsigned i; - loff_t ret; - int offset; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(vinode->i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblock) { - folio_lock(folio); - } else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - 
return -EAGAIN; - } - - offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset), - min_replicas); - if (offset >= 0) { - ret = clamp(folio_pos(folio) + offset, - start_offset, end_offset); - folio_unlock(folio); - folio_batch_release(&fbatch); - return ret; - } - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - return end_offset; -} - static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); @@ -3816,85 +1148,6 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas, bool nonblock) -{ - struct folio *folio; - struct bch_folio *s; - unsigned i, sectors; - bool ret = true; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); - if (folio == ERR_PTR(-EAGAIN)) - return -EAGAIN; - if (IS_ERR_OR_NULL(folio)) - return true; - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = false; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -static loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - - while (offset < end_offset && - !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) - ; - - return min(offset, end_offset); -} - -static int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); @@ -3979,28 +1232,10 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) void bch2_fs_fsio_exit(struct bch_fs *c) { bioset_exit(&c->nocow_flush_bioset); - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); - bioset_exit(&c->writepage_bioset); } int bch2_fs_fsio_init(struct bch_fs *c) { - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_read_bioset_init; - - if (bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index af905331542d..bb5b709fa8cf 100644 --- a/fs/bcachefs/fs-io.h +++ 
b/fs/bcachefs/fs-io.h @@ -5,28 +5,163 @@ #ifndef NO_BCACHEFS_FS #include "buckets.h" +#include "fs.h" #include "io_types.h" +#include "quota.h" #include -struct quota_res; +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +struct quota_res { + u64 sectors; +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + res->sectors = 0; +} + +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + u64 sectors, + bool check_enospc) +{ + int ret; + + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); +#else + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_read_folio(struct file *, struct folio *); +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); +#endif -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); +void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, + struct quota_res *, s64); + +static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + +void bch2_inode_flush_nocow_writes_async(struct bch_fs *, + struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); int bch2_fsync(struct file *, loff_t, loff_t, int); @@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t bch2_llseek(struct file *, loff_t, int); -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - void bch2_fs_fsio_exit(struct bch_fs *); int bch2_fs_fsio_init(struct bch_fs *); #else diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 695b8bc55590..aa7ec5dc9ff1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -14,6 +14,9 @@ #include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "io.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7ec22631cdd3..de7bc0192c3d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -30,6 +30,8 @@ #include "error.h" #include "fs.h" #include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" #include "fsck.h" #include "inode.h" #include "io.h" @@ -469,6 +471,8 @@ static void __bch2_fs_free(struct bch_fs *c) 
bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); + bch2_fs_fs_io_direct_exit(c); + bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); @@ -842,7 +846,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c); + bch2_fs_fsio_init(c) ?: + bch2_fs_fs_io_buffered_init(c); + bch2_fs_fs_io_direct_init(c); if (ret) goto err; -- cgit From 8079aab085f2203960b352a153f07245d0507bf8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 12:55:08 -0400 Subject: bcachefs: Split up btree_update_leaf.c We now have btree_trans_commit.c btree_update.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 +- fs/bcachefs/btree_gc.h | 1 + fs/bcachefs/btree_iter.h | 16 + fs/bcachefs/btree_trans_commit.c | 1156 +++++++++++++++++++++ fs/bcachefs/btree_update.c | 943 +++++++++++++++++ fs/bcachefs/btree_update_leaf.c | 2107 -------------------------------------- 6 files changed, 2118 insertions(+), 2108 deletions(-) create mode 100644 fs/bcachefs/btree_trans_commit.c create mode 100644 fs/bcachefs/btree_update.c delete mode 100644 fs/bcachefs/btree_update_leaf.c (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 13cacf2d8bfb..1bb4d159630b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -15,8 +15,9 @@ bcachefs-y := \ btree_iter.o \ btree_key_cache.o \ btree_locking.o \ + btree_trans_commit.o \ + btree_update.o \ btree_update_interior.o \ - btree_update_leaf.o \ btree_write_buffer.o \ buckets.o \ buckets_waiting_for_journal.o \ diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index b45e382f7055..607575f83a00 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_GC_H #define _BCACHEFS_BTREE_GC_H +#include "bkey.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 13e92452270e..4469b2e166eb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c new file mode 100644 index 000000000000..2fa123ff953e --- /dev/null +++ b/fs/bcachefs/btree_trans_commit.c @@ -0,0 +1,1156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" + 
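/*
 * A recurring idiom in the new btree_trans_commit.c below (see
 * __btree_node_flush(), the journal-pin flush callback for btree nodes) is
 * the lock-free read-modify-write loop: derive the new flags word from a
 * snapshot of the old one and retry if another thread changed it in the
 * meantime.  Minimal generic sketch of that pattern, independent of the
 * bcachefs types (hypothetical helper, illustrative only, not part of the
 * patch):
 */
#include <linux/atomic.h>
#include <linux/compiler.h>

static void set_bit_cmpxchg(unsigned long *word, unsigned bit)
{
	unsigned long old, new, v = READ_ONCE(*word);

	do {
		old = new = v;
		new |= 1UL << bit;	/* recompute the new value from the snapshot */
	} while ((v = cmpxchg(word, old, new)) != old);
}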
+#include + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; +} + +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(trans, b); +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bkey_packed *k; + unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ + EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? 
nothing to do: */ + if (bkey_deleted(&insert->k) && !k) + return false; + + if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) + push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; + } + + if (k) { + /* Overwriting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; +} + +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; + unsigned long old, new, v; + unsigned idx = w - b->writes; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); + return 0; +} + +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? 
bch2_btree_node_flush0 + : bch2_btree_node_flush1); +} + +/** + * btree_insert_key - insert a key one key into a leaf node + */ +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) + return; + + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); + } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +} + +/* Cached btree updates: */ + +/* Normal update interface: */ + +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, + unsigned long trace_ip) +{ + return drop_locks_do(trans, + bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, + trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK))); +} + +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); +} + +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); +} + +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) +{ + struct bch_fs *c = trans->c; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return -BCH_ERR_btree_insert_btree_node_full; + + return 0; +} + +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; + unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + 
bch2_btree_key_cache_must_wait(c) && + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return -BCH_ERR_btree_insert_need_journal_reclaim; + + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + + if (u64s <= ck->u64s) + return 0; + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) + return 0; + + if (old_ops->atomic_trigger == new_ops->atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + old_ops->trans_trigger == new_ops->trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; 
overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); +#endif + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } + } + + return ret; +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; + bool marking = false; + int ret; + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + + trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = !i->cached + ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; + } + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; + } + + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (bch2_journal_seq_verify) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + goto fatal_err; + } + + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + goto fatal_err; + } + + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); + } + + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy(&entry->start[0], &wb->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) { + u64 seq = trans->journal_res.seq; + + if (i->flags & BTREE_UPDATE_PREJOURNAL) + seq = i->seq; + + bch2_btree_insert_key_leaf(trans, 
i->path, i->k, seq); + } else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } + } + + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + return ret; +} + +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + printbuf_exit(err); + + return -EINVAL; +} +#endif + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0, u64s_delta = 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } +#endif + + trans_for_each_update(trans, i) { + if (i->cached) + continue; + + u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; + u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; + } + } + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) + ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); + if (unlikely(ret)) + return ret; + + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; + + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); + + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + bch2_trans_downgrade(trans); + + return 0; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); + + if (!ret) + journal_reclaim_kick(&c->journal); + return ret; +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + int ret, unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + break; + case -BCH_ERR_btree_insert_need_journal_reclaim: + bch2_trans_unlock(trans); + + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + break; + + ret = bch2_trans_relock(trans); + break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + 
ret = bch2_trans_relock(trans); + } + } + break; + } + default: + BUG_ON(ret >= 0); + break; + } + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); + + return ret; +} + +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) + return -BCH_ERR_erofs_trans_commit; + + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); + if (ret) + return ret; + + bch2_write_ref_get(c, BCH_WRITE_REF_trans); + return 0; +} + +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; + unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && + !trans->nr_wb_updates && + !trans->extra_journal_entries.nr) + goto out_reset; + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); + if (ret) + goto out_reset; + } + + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + trans_for_each_update(trans, i) { + EBUG_ON(!i->path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) + goto out; + + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + + if (i->key_cache_already_flushed) + continue; + + /* we're going to journal the key being updated: */ + u64s = jset_u64s(i->k->k.u64s); + if (i->cached && + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) + trans->journal_preres_u64s += u64s; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + trans->journal_u64s += 
u64s; + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); + } + + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } +retry: + bch2_trans_verify_not_in_restart(trans); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_trans_verify_locks(trans); + + if (ret) + goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); +out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: + bch2_trans_reset_updates(trans); + + return ret; +err: + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + if (ret) + goto out; + + goto retry; +} diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c new file mode 100644 index 000000000000..366929da58a0 --- /dev/null +++ b/fs/bcachefs/btree_update.c @@ -0,0 +1,943 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "keylist.h" +#include "recovery.h" +#include "subvolume.h" +#include "trace.h" + +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags, + unsigned long ip); + +static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + 
check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter = { NULL }; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) +{ + enum btree_id btree_id = iter->btree_id; + struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be 
splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; + + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); + if (ret) + goto err; + } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool done = bkey_lt(insert->k.p, k.k->p); + + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; + + if (done) + goto out; +next: + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, 
&iter, insert, k); + if (ret) + goto err; + } +out: + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + struct bkey k; + int ret; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. + */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); +out: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + u64 seq = 0; + int cmp; + + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); + + /* + * The transaction journal res hasn't been allocated at this point. + * That occurs at commit time. Reuse the seq field to pass in the seq + * of a prejournaled key. + */ + if (flags & BTREE_UPDATE_PREJOURNAL) + seq = trans->journal_res.seq; + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .seq = seq, + .ip_allocated = ip, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) + break; + } + + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->seq = n.seq; + i->ip_allocated = n.ip_allocated; + } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. 
When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && bkey_deleted(&i->old_k)) + return flush_new_cached_update(trans, path, i, flags, ip); + + return 0; +} + +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); +} + +/* + * Add a transaction update for a key that has already been journaled. 
+ */ +int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, + struct btree_iter *iter, struct bkey_i *k, + enum btree_update_flags flags) +{ + trans->journal_res.seq = seq; + return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| + BTREE_UPDATE_PREJOURNAL); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/** + * bch2_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @insert_keys: list of keys to insert + * @hook: insert callback + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + struct 
bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, int flags) +{ + return bch2_trans_do(c, disk_res, journal_seq, flags, + __bch2_btree_insert(&trans, id, k, 0)); +} + +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = iter->pos; + bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + u32 restart_count = trans->restart_count; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + + /* + * This could probably be more efficient for extents: + */ + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). 
+ */ + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + struct jset_entry_log *l; + unsigned u64s; + int ret; + + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } + + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
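Single-key helpers like bch2_btree_bit_mod() above can be driven through the bch2_trans_do() wrapper already used for bch2_btree_insert(); a minimal sketch, illustrative only (the btree id and position are whatever the caller is operating on, and error handling is elided):

	/*
	 * Sketch only: bch2_trans_do() supplies the transaction plus the
	 * commit/retry-on-restart loop; bch2_btree_bit_mod() stages the
	 * buffered update that the commit path then journals and applies.
	 */
	ret = bch2_trans_do(c, NULL, NULL, 0,
			    bch2_btree_bit_mod(&trans, btree, pos, true));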
+{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; +} diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c deleted file mode 100644 index 369e37a415f3..000000000000 --- a/fs/bcachefs/btree_update_leaf.c +++ /dev/null @@ -1,2107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extent_update.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery.h" -#include "subvolume.h" -#include "replicas.h" -#include "trace.h" - -#include -#include - -/* - * bch2_btree_path_peek_slot() for a cached iterator might return a key in a - * different snapshot: - */ -static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -{ - struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); - - if (k.k && bpos_eq(path->pos, k.k->p)) - return k; - - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags, - unsigned long ip); - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -{ - return i->path->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - 
struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans trans; - unsigned long old, new, v; - unsigned idx = w - b->writes; - - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); - - do { - old = new = v; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - btree_node_write_if_need(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_exit(&trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? 
bch2_btree_node_flush0 - : bch2_btree_node_flush1); -} - -/** - * btree_insert_key - insert a key one key into a leaf node - */ -inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert, - u64 journal_seq) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(path)->b; - struct bset_tree *t = bset_tree_last(b); - struct bset *i = bset(b, t); - int old_u64s = bset_u64s(t); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, - &path_l(path)->iter, insert))) - return; - - i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - - if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - set_btree_node_dirty_acct(c, b); - } - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) bset_u64s(t) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); -} - -/* Cached btree updates: */ - -/* Normal update interface: */ - -static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); - EBUG_ON(!i->level && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && - i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -} - -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - -static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) -{ - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); -} - -#define JSET_ENTRY_LOG_U64s 4 - -static noinline void journal_transaction_name(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct jset_entry *entry = - bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_log, 0, 0, - JSET_ENTRY_LOG_U64s); - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -} - -static inline int btree_key_can_insert(struct btree_trans *trans, - struct btree *b, unsigned u64s) -{ - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; - - return 0; -} - -static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; - unsigned new_u64s; - struct bkey_i *new_k; - - EBUG_ON(path->level); - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - 
bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) - return -BCH_ERR_btree_insert_need_journal_reclaim; - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[path->btree_id], new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) - return 0; - - if (old_ops->atomic_trigger == new_ops->atomic_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) -{ - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; - } else { - return 0; - } -} - -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) -{ - struct btree_insert_entry *i; - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 1; 
overwrite >= 0; --overwrite) { - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->btree_id != btree_id) - continue; - - ret = run_one_trans_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - unsigned btree_id = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); - if (ret) - return ret; - } - - trans_for_each_update(trans, i) { - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); - if (ret) - return ret; - break; - } - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); - if (ret) - break; - } - } - - return ret; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - bool marking = false; - int ret; - - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); - } - - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = !i->cached - ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - if (btree_node_type_needs_gc(i->bkey_type)) - marking = true; - } - - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (bch2_journal_seq_verify) - trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) - trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; - } - - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - goto revert_fs_usage; - h = h->next; - } - - trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble(&entry->start[0], - (struct bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); - } - - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy(&entry->start[0], &wb->k); - } - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; - - if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - bch2_btree_insert_key_leaf(trans, 
i->path, i->k, seq); - } else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); - } - } - - return 0; -fatal_err: - bch2_fatal_error(c); -revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); - return ret; -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - - trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(err); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - printbuf_exit(err); - - return -EINVAL; -} -#endif - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0, u64s_delta = 0; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } -#endif - - trans_for_each_update(trans, i) { - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; - u64s_delta -= i->old_btree_u64s; - - if (!same_leaf_as_next(trans, i)) { - if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, flags); - if (unlikely(ret)) - return ret; - } - - u64s_delta = 0; - } - } - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - - ret = trans_lock_write(trans); - if (unlikely(ret)) - return ret; - - ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); - - /* - * Drop journal reservation after dropping write locks, since dropping - * the journal reservation may kick off a journal write: - */ - bch2_journal_res_put(&c->journal, &trans->journal_res); - - if (unlikely(ret)) - return ret; - - bch2_trans_downgrade(trans); - - return 0; -} - -static int journal_reclaim_wait_done(struct bch_fs *c) -{ - int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); - - if (!ret) - journal_reclaim_kick(&c->journal); - return ret; -} - -static noinline -int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - int ret, unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); - break; - case -BCH_ERR_journal_res_get_blocked: - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); - - trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); - - wait_event_freezable(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); - if (ret < 0) - break; - - ret = bch2_trans_relock(trans); - break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - 
ret = bch2_trans_relock(trans); - } - } - break; - } - default: - BUG_ON(ret >= 0); - break; - } - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, - "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); - - return ret; -} - -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - -/* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to - * do. - */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - break; - } - - return ret; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; - unsigned u64s; - int ret = 0; - - if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) - goto out_reset; - - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - - if (!(flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; - } - - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - trans->journal_u64s += 
u64s; - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); - } - - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); -out_reset: - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); - if (ret) - goto out; - - goto retry; -} - -static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) - return 0; - - return __check_pos_snapshot_overwritten(trans, id, pos); -} - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_update_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; 
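For orientation, the retry:/err: flow in __bch2_trans_commit() above (present in both the old and new copies of the commit path) is what lets callers treat transaction restarts as retryable. A hand-rolled sketch of the caller-side pattern, using only helpers visible in this patch; do_stage_updates() is a made-up stand-in for code that stages keys with bch2_trans_update():

	static int example_commit_loop(struct bch_fs *c)
	{
		struct btree_trans trans;
		int ret;

		bch2_trans_init(&trans, c, 0, 0);
		do {
			bch2_trans_begin(&trans);

			ret = do_stage_updates(&trans) ?:
			      bch2_trans_commit(&trans, NULL, NULL,
						BTREE_INSERT_NOFAIL);
		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

		bch2_trans_exit(&trans);
		return ret;
	}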
- u32 snapshot = pos.snapshot; - int ret; - - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = { NULL }; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; - int ret; - - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; - - darray_init(&s); - - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while ((old_k = bch2_btree_iter_prev(&old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bkey_err(new_k); - if (ret) - break; - - if (new_k.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = whiteout_pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } - bch2_trans_iter_exit(trans, &new_iter); - - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); - - return ret; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_update_flags flags, - struct bkey_s_c old, - struct bkey_s_c new) -{ - enum btree_id btree_id = iter->btree_id; - struct bkey_i *update; - struct bpos new_start = bkey_start_pos(new.k); - bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); - bool back_split = bkey_gt(old.k->p, new.k->p); - int ret = 0, compressed_sectors; - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (((front_split && back_split) || - ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && - (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors; - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_back(new_start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - /* If we're overwriting in a different snapshot - middle split: */ - if (old.k->p.snapshot != new.k->p.snapshot && - (front_split || back_split)) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; 
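To make the front/back/middle split cases concrete, a small worked example using the same bkey helpers as the surrounding code (offsets are arbitrary and the snippet is illustrative, not part of the patch):

	/*
	 * Say the existing extent @old covers [32..96) and the incoming
	 * extent @new covers [48..80), in the same snapshot:
	 *
	 *	bkey_start_pos(old.k).offset == 32, old.k->p.offset == 96
	 *	bkey_start_pos(new.k).offset == 48, new.k->p.offset == 80
	 */
	bool front_split = bkey_lt(bkey_start_pos(old.k), bkey_start_pos(new.k));	/* 32 < 48: true */
	bool back_split  = bkey_gt(old.k->p, new.k->p);					/* 96 > 80: true */

	/*
	 * The front split is a copy of @old trimmed with bch2_cut_back() to
	 * [32..48); the back split is a copy trimmed with bch2_cut_front()
	 * to [80..96); the middle, [48..80), is covered by @new when the
	 * caller inserts it.
	 */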
- - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, - struct btree_insert_entry *i, - enum btree_update_flags flags, - unsigned long ip) -{ - struct btree_path *btree_path; - struct bkey k; - int ret; - - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); - if (ret) - goto out; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; - - btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -out: - bch2_path_put(trans, btree_path, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - u64 seq = 0; - int cmp; - - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. - */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - - n = (struct btree_insert_entry) { - .flags = flags, - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path, - .k = k, - .seq = seq, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - trans_for_each_update(trans, i) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - if (!cmp && i < trans->updates + trans->nr_updates) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->seq = n.seq; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(i->path, true); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); - - return 0; -} - -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_path *path = iter->update_path ?: iter->path; - struct bkey_cached *ck; - int ret; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, k->k.p)) { - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); - if (unlikely(ret)) - return ret; - - ck = (void *) iter->key_cache_path->l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(iter->key_cache_path); - } - - path = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -} - -/* - * Add a transaction update for a key that has already been journaled. 
- */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } - - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; - - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; - } - - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; - - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; - - return 0; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct 
bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k, 0)); -} - -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = iter->pos; - bch2_key_resize(&k->k, len); - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). 
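The delete-range loop below sizes its deletion key so that it starts at iter.pos and stops at whichever comes first, the end of the extent or the end of the range. A small stand-alone sketch of that calculation, using bare offsets instead of bpos/bkey types:

/* Illustrative sketch, not part of the patch: offsets stand in for bpos */
#include <stdio.h>

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

/*
 * iter_pos:    current iterator position (>= start of the found extent)
 * extent_end:  end of the extent the iterator returned
 * range_end:   end of the range being deleted
 *
 * The deletion key starts at iter_pos and must not extend past either the
 * extent or the range, so its size is:
 */
static unsigned long long delete_sectors(unsigned long long iter_pos,
					 unsigned long long extent_end,
					 unsigned long long range_end)
{
	return min_u64(range_end, extent_end) - iter_pos;
}

int main(void)
{
	/* extent [10, 50), deleting [20, 40): deletion covers [20, 40), 20 sectors */
	printf("%llu\n", delete_sectors(20, 50, 40));
	/* extent [10, 50), deleting [20, 100): deletion covers [20, 50), 30 sectors */
	printf("%llu\n", delete_sectors(20, 50, 100));
	return 0;
}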
- */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_IS_EXTENTS) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i *k; - int ret = 0; - - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); -} - -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -{ - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); - if (ret) - goto err; - - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; -} - -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - int ret; - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); - } else { - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); - } - - return ret; -} - -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
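__bch2_trans_log_msg() above stores a formatted message as a BCH_JSET_ENTRY_log journal entry, so it rounds the message length up to a whole number of u64s and NUL-pads the tail. A stand-alone sketch of that padding, with the printbuf machinery replaced by a plain buffer:

/* Illustrative sketch, not part of the patch: plain buffer instead of printbuf */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	const char *msg = "hello journal";
	size_t len = strlen(msg);
	unsigned u64s = DIV_ROUND_UP(len, sizeof(uint64_t));
	char data[64] = { 0 };

	memcpy(data, msg, len);
	/* NUL-pad up to the next 8 byte boundary, like the while (buf.pos & 7) loop */
	while (len & 7)
		data[len++] = '\0';

	printf("message uses %u u64s (%zu bytes after padding)\n", u64s, len);
	return 0;
}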
-{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); - va_end(args); - return ret; -} -- cgit From fb8e5b4cae91b1837006df3b6f81424e71ee8572 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 15:40:21 -0400 Subject: bcachefs: sb-members.c Split out a new file for bch_sb_field_members - we'll likely want to move more code here in the future. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/buckets.h | 47 ++++++++- fs/bcachefs/disk_groups.c | 1 + fs/bcachefs/journal_reclaim.c | 3 +- fs/bcachefs/sb-members.c | 173 +++++++++++++++++++++++++++++++++ fs/bcachefs/sb-members.h | 176 +++++++++++++++++++++++++++++++++ fs/bcachefs/super-io.c | 167 +------------------------------- fs/bcachefs/super.h | 214 ----------------------------------------- 9 files changed, 401 insertions(+), 383 deletions(-) create mode 100644 fs/bcachefs/sb-members.c create mode 100644 fs/bcachefs/sb-members.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1bb4d159630b..7baf27595343 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -64,6 +64,7 @@ bcachefs-y := \ recovery.o \ reflink.o \ replicas.o \ + sb-members.o \ siphash.o \ six.o \ subvolume.o \ diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index fee195f7eabf..7aaeec44c746 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -5,7 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" -#include "super.h" +#include "sb-members.h" #include diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a418f664896d..f192809f50cf 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -10,7 +10,31 @@ #include "buckets_types.h" #include "extents.h" -#include "super.h" +#include "sb-members.h" + +static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +{ + return div_u64(s, ca->mi.bucket_size); +} + +static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) +{ + return ((sector_t) b) * ca->mi.bucket_size; +} + +static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) +{ + u32 remainder; + + div_u64_rem(s, ca->mi.bucket_size, &remainder); + return remainder; +} + +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} #define for_each_bucket(_b, _buckets) \ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ @@ -292,6 +316,27 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index de14ca3a9895..f36472c4a781 100644 --- a/fs/bcachefs/disk_groups.c +++ 
b/fs/bcachefs/disk_groups.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "disk_groups.h" +#include "sb-members.h" #include "super-io.h" #include diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a2a534915dd..10e1860dad79 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -3,13 +3,14 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "buckets.h" #include "errcode.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" -#include "super.h" +#include "sb-members.h" #include "trace.h" #include diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c new file mode 100644 index 000000000000..16a2b3389525 --- /dev/null +++ b/fs/bcachefs/sb-members.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "disk_groups.h" +#include "replicas.h" +#include "sb-members.h" +#include "super-io.h" + +/* Code for bch_sb_field_members: */ + +static int bch2_sb_members_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > + vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + + if (!bch2_member_exists(m)) + continue; + + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m->nbuckets) - + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + } + + return 0; +} + +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + if (!bch2_member_exists(m)) + continue; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m->uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + 
prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, +}; diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h new file mode 100644 index 000000000000..34e1cf6046e3 --- /dev/null +++ b/fs/bcachefs/sb-members.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_H +#define _BCACHEFS_SB_MEMBERS_H + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_failed; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static 
inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_members; + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 405ea74d0b83..bea922e720a2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -16,6 +16,7 @@ #include "recovery.h" #include "replicas.h" #include "quota.h" +#include "sb-members.h" #include "super-io.h" #include "super.h" #include "trace.h" @@ -1015,172 +1016,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_members: */ - -static int bch2_sb_members_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - unsigned i; - - if ((void *) (mi->members + sb->nr_devices) > - 
vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - - if (!bch2_member_exists(m)) - continue; - - if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - } - - return 0; -} - -static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); - unsigned i; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m->bucket_size); - u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; - - if (!bch2_member_exists(m)) - continue; - - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); - - printbuf_indent_add(out, 2); - - prt_printf(out, "UUID:"); - prt_tab(out); - pr_uuid(out, m->uuid.b); - prt_newline(out); - - prt_printf(out, "Size:"); - prt_tab(out); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - prt_printf(out, "Bucket size:"); - prt_tab(out); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); - prt_newline(out); - - prt_printf(out, "Last mount:"); - prt_tab(out); - if (m->last_mount) - pr_time(out, le64_to_cpu(m->last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", - BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(m)] - : "unknown"); - prt_newline(out); - - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(m)) { - unsigned idx = BCH_MEMBER_GROUP(m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { - prt_printf(out, "(none)"); - } - prt_newline(out); - - prt_printf(out, "Data allowed:"); - prt_tab(out); - if (BCH_MEMBER_DATA_ALLOWED(m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:"); - prt_tab(out); - if (data_have) - prt_bitflags(out, bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); - prt_newline(out); - - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_members_validate, - .to_text = bch2_sb_members_to_text, -}; - /* BCH_SB_FIELD_crypt: */ static int bch2_sb_crypt_validate(struct bch_sb *sb, diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 36bcb9ec2b3a..bf762df18012 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -8,220 +8,6 @@ #include -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !percpu_ref_is_zero(&ca->io_ref); -} - -static inline bool bch2_dev_is_readable(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) - return true; - - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, 
.devs[0] = dev }; -} - -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((*iter = mask - ? find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], - lockdep_is_held(&c->state_lock)))) - (*iter)++; - - return ca; -} - -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca; - - rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) -{ - struct bch_dev *ca; - - rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; - rcu_read_unlock(); - - return ca; -} - -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) - -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) - -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) - -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) - -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_check(c->devs[idx], 1); -} - -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_protected(c->devs[idx], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} - struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); -- cgit From 0ec3985694802bebbeb087165328810062e73130 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 15:43:00 -0400 Subject: bcachefs: Move bch_sb_field_crypt code to checksum.c 
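This commit, like the sb-members and sb-clean commits around it, can move a superblock field's code into its own file because each field type is reached only through a small ops table of validate/to_text callbacks. A stand-alone sketch of that pattern, with illustrative names rather than the real bcachefs types:

/* Illustrative sketch, not part of any patch here: simplified field + ops table */
#include <stdio.h>

struct sb_field { int type; };

struct sb_field_ops {
	int  (*validate)(const struct sb_field *f, char *err, size_t errlen);
	void (*to_text)(const struct sb_field *f);
};

static int example_validate(const struct sb_field *f, char *err, size_t errlen)
{
	if (f->type < 0) {
		snprintf(err, errlen, "bad field type %d", f->type);
		return -1;
	}
	return 0;
}

static void example_to_text(const struct sb_field *f)
{
	printf("field type: %d\n", f->type);
}

/* one table per field type, looked up by the generic superblock code */
const struct sb_field_ops sb_field_ops_example = {
	.validate	= example_validate,
	.to_text	= example_to_text,
};

int main(void)
{
	struct sb_field f = { .type = 3 };
	char err[80];

	if (!sb_field_ops_example.validate(&f, err, sizeof(err)))
		sb_field_ops_example.to_text(&f);
	return 0;
}

Only the extern ops table needs to be visible to the generic code, which is why each of these moves touches little more than a header declaration and the Makefile.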
Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/checksum.h | 2 ++ fs/bcachefs/super-io.c | 42 ------------------------------------------ 3 files changed, 44 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 76cf2e70f019..36939020f67d 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -460,6 +460,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, return 0; } +/* BCH_SB_FIELD_crypt: */ + +static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + if (BCH_CRYPT_KDF_TYPE(crypt)) { + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + return 0; +} + +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, +}; + #ifdef __KERNEL__ static int __bch2_request_key(char *key_description, struct bch_key *key) { diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1ad1d5f03939..c7b1a8fca685 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -72,6 +72,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, : 0; } +extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; + int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bea922e720a2..ff86088022fc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1016,48 +1016,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - 
prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - /* BCH_SB_FIELD_clean: */ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -- cgit From a37ad1a3aba957b20c8c434a88c1724e49409eb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 15:54:38 -0400 Subject: bcachefs: sb-clean.c Pull code for bch_sb_field_clean out into its own file. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/journal_io.c | 1 + fs/bcachefs/recovery.c | 145 ++--------------- fs/bcachefs/sb-clean.c | 395 +++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/sb-clean.h | 16 ++ fs/bcachefs/super-io.c | 251 +----------------------------- fs/bcachefs/super-io.h | 10 -- fs/bcachefs/super.c | 1 + 8 files changed, 424 insertions(+), 396 deletions(-) create mode 100644 fs/bcachefs/sb-clean.c create mode 100644 fs/bcachefs/sb-clean.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 7baf27595343..0ee755864e68 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -64,6 +64,7 @@ bcachefs-y := \ recovery.o \ reflink.o \ replicas.o \ + sb-clean.o \ sb-members.o \ siphash.o \ six.o \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 42c9700e6d26..378b3f9170d4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -14,6 +14,7 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "replicas.h" +#include "sb-clean.h" #include "trace.h" static struct nonce journal_nonce(const struct jset *jset) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5dbe1b273b71..048a62f90b06 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -23,6 +23,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" #include "subvolume.h" #include "super-io.h" @@ -846,134 +847,6 @@ static int journal_replay_early(struct bch_fs *c, /* sb clean section: */ -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, 
bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -1297,17 +1170,17 @@ int bch2_fs_recovery(struct bch_fs *c) bool write_sb = false; int ret = 0; - if (c->sb.clean) - clean = read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; + if (c->sb.clean) { + clean = bch2_read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; - if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - else + } else { bch_info(c, "recovering from unclean shutdown"); + } if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -1386,7 +1259,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; if (c->sb.clean && last_journal_entry) { - ret = verify_superblock_clean(c, &clean, + ret = bch2_verify_superblock_clean(c, &clean, last_journal_entry); if (ret) goto err; diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c new file mode 100644 index 000000000000..a3695e56a155 --- /dev/null +++ b/fs/bcachefs/sb-clean.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" +#include "sb-clean.h" +#include "super-io.h" + +/* + * BCH_SB_FIELD_clean: + * + * Btree roots, and a few other things, are recovered from the journal after an + * unclean shutdown - but after a clean shutdown, to avoid having to read the + * journal, we can store them in the superblock. 
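As the comment goes on to say, the clean section is simply a packed list of journal entries, walked with vstruct_next(), where each entry's size is derived from its u64s count. A stand-alone sketch of walking such a list, with a simplified stand-in for struct jset_entry:

/* Illustrative sketch, not part of the patch: simplified stand-in for jset_entry */
#include <stdio.h>
#include <stdint.h>

struct entry {
	uint16_t u64s;		/* payload size in u64s, not counting this header */
	uint16_t type;
	uint32_t pad;
	uint64_t data[];
};

/* the next entry starts one header plus u64s payload words later */
static struct entry *entry_next(struct entry *e)
{
	return (struct entry *) ((uint64_t *) (e + 1) + e->u64s);
}

int main(void)
{
	/* a small buffer holding two entries back to back */
	uint64_t buf[8] = { 0 };
	struct entry *e = (struct entry *) buf;
	struct entry *end;

	e->u64s = 2;			/* first entry: 2 payload words */
	e->type = 1;
	entry_next(e)->u64s = 0;	/* second entry: header only */
	entry_next(e)->type = 2;
	end = entry_next(entry_next(e));

	for (e = (struct entry *) buf; e < end; e = entry_next(e))
		printf("entry type %d, %d payload u64s\n", e->type, e->u64s);
	return 0;
}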
+ * + * bch_sb_field_clean simply contains a list of journal entries, stored exactly + * as they would be in the journal: + */ + +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, + int write) +{ + struct jset_entry *entry; + int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; +} + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +int bch2_verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, buf1.buf, + l2, buf2.buf); + } +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); + } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry 
= *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) +{ + struct bch_dev *ca; + unsigned i, dev; + + percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + } + + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + + percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } +} + +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -BCH_ERR_invalid_sb_clean; + } + + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", 
le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, +}; + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_mark_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } + + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h new file mode 100644 index 000000000000..71caef281239 --- /dev/null +++ b/fs/bcachefs/sb-clean.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_CLEAN_H +#define _BCACHEFS_SB_CLEAN_H + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, + struct jset *); +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); +void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); + +extern const struct bch_sb_field_ops bch_sb_field_ops_clean; + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ff86088022fc..f01883e785a5 100644 --- a/fs/bcachefs/super-io.c +++ 
b/fs/bcachefs/super-io.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" #include "checksum.h" #include "counters.h" #include "disk_groups.h" @@ -10,12 +8,12 @@ #include "error.h" #include "io.h" #include "journal.h" -#include "journal_io.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" #include "recovery.h" #include "replicas.h" #include "quota.h" +#include "sb-clean.h" #include "sb-members.h" #include "super-io.h" #include "super.h" @@ -1016,27 +1014,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_clean: */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -{ - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); - if (ret) - return ret; - } - - return 0; -} - /* Downgrade if superblock is at a higher version than currently supported: */ void bch2_sb_maybe_downgrade(struct bch_fs *c) { @@ -1063,232 +1040,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); } -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_sb_maybe_downgrade(c); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. 
- */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - struct bch_dev *ca; - unsigned i, dev; - - percpu_down_read(&c->mark_lock); - - if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - for_each_member_device(ca, c, dev) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); - - for (i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - - percpu_up_read(&c->mark_lock); - - for (i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); - - u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - - sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); - if (!sb_clean) { - bch_err(c, "error resizing superblock while setting 
filesystem clean"); - goto out; - } - - sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); - - /* Trying to catch outstanding bug: */ - BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); - - entry = sb_clean->start; - bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, entry); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - - memset(entry, 0, - vstruct_end(&sb_clean->field) - (void *) entry); - - /* - * this should be in the write path, and we should be validating every - * superblock section: - */ - ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); - if (ret) { - bch_err(c, "error writing marking filesystem clean: validate error"); - goto out; - } - - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 6e59b0148f8d..d51c0a19586f 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -122,19 +122,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } -/* BCH_SB_FIELD_clean: */ - -void bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry **, u64); - -int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); - void bch2_sb_maybe_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); -int bch2_fs_mark_dirty(struct bch_fs *); -void bch2_fs_mark_clean(struct bch_fs *); - void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index de7bc0192c3d..46f0be3d0cef 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -46,6 +46,7 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" #include "subvolume.h" #include "super.h" #include "super-io.h" -- cgit From 401585fe87c2359f7aa24a309ec90e2589da9b46 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 16:08:44 -0400 Subject: bcachefs: btree_journal_iter.c Split out a new file from recovery.c for managing the list of keys we read from the journal: before journal replay 
finishes the btree iterator code needs to be able to iterate over and return keys from the journal as well, so there's a fair bit of code here. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/btree_gc.c | 1 + fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_journal_iter.c | 531 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 57 ++++ fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/recovery.c | 521 +---------------------------------- fs/bcachefs/recovery.h | 51 ---- fs/bcachefs/super.c | 1 + 11 files changed, 597 insertions(+), 574 deletions(-) create mode 100644 fs/bcachefs/btree_journal_iter.c create mode 100644 fs/bcachefs/btree_journal_iter.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0ee755864e68..4b7f384f703f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -13,6 +13,7 @@ bcachefs-y := \ btree_gc.o \ btree_io.o \ btree_iter.o \ + btree_journal_iter.o \ btree_key_cache.o \ btree_locking.o \ btree_trans_commit.o \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1a749d4be5b9..dac2eb76c985 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index feb23e9c2a1a..ad95849845a5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" @@ -12,7 +13,6 @@ #include "error.h" #include "extents.h" #include "journal.h" -#include "recovery.h" #include "replicas.h" #include "subvolume.h" #include "trace.h" diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c new file mode 100644 index 000000000000..58a981bcf3aa --- /dev/null +++ b/fs/bcachefs/btree_journal_iter.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bset.h" +#include "btree_journal_iter.h" +#include "journal_io.h" + +#include + +/* + * For managing keys we read from the journal: until journal replay works normal + * btree lookups need to be able to find and return keys from the journal where + * they overwrite what's in the btree, so we have a special iterator and + * operations for the regular btree iter code to use: + */ + +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r 
= keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return 
-BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + bpos_eq(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->size && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + k = iter->keys->d + iter->idx; + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + while ((journal_k = 
bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + + kvfree(keys->d); + keys->d = NULL; + keys->nr = keys->gap = keys->size = 0; +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + +int bch2_journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct jset_entry *entry; + struct bkey_i *k; + struct journal_keys *keys = &c->journal_keys; + size_t nr_keys = 0, nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + for_each_jset_key(k, entry, &i->j) + nr_keys++; + } + + if (!nr_keys) + return 0; + + keys->size = 
roundup_pow_of_two(nr_keys); + + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + keys->d[keys->nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + return 0; +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h new file mode 100644 index 000000000000..5d64e7e22f26 --- /dev/null +++ b/fs/bcachefs/btree_journal_iter.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_H + +struct journal_iter { + struct list_head list; + enum btree_id btree_id; + unsigned level; + size_t idx; + struct journal_keys *keys; +}; + +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + struct bpos pos; + bool at_end; +}; + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct bch_fs *); + +int bch2_journal_keys_sort(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 
2fa123ff953e..78a09aa050c7 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -4,6 +4,7 @@ #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" @@ -12,7 +13,6 @@ #include "error.h" #include "journal.h" #include "journal_reclaim.h" -#include "recovery.h" #include "replicas.h" #include "subvolume.h" diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 366929da58a0..612fba60be14 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "btree_iter.h" +#include "btree_journal_iter.h" #include "btree_locking.h" #include "buckets.h" #include "debug.h" @@ -10,7 +11,6 @@ #include "error.h" #include "extents.h" #include "keylist.h" -#include "recovery.h" #include "subvolume.h" #include "trace.h" diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f42ef46c59df..986dd541435a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -5,6 +5,7 @@ #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -17,7 +18,6 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" -#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 048a62f90b06..bd4a99c0d21f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -58,524 +59,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } -/* iterate over keys read from the journal: */ - -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} - -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->d + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -struct 
bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) - return k->k; - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - return NULL; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; -} - -static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U32_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - if (keys->nr == keys->size) { - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - *keys = new_keys; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, keys->gap, idx); - - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; - - keys->nr++; - keys->d[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as 
journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, n); - if (ret) - kfree(n); - return ret; -} - -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) -{ - struct bkey_i whiteout; - - bkey_init(&whiteout.k); - whiteout.k.p = pos; - - return bch2_journal_key_insert(c, id, level, &whiteout); -} - -void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->size) { - iter->idx++; - if (iter->idx == iter->keys->gap) - iter->idx += iter->keys->size - iter->keys->nr; - } -} - -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -{ - struct journal_key *k = iter->keys->d + iter->idx; - - while (k < iter->keys->d + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { - if (!k->overwritten) - return bkey_i_to_s_c(k->k); - - bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; - } - - return bkey_s_c_null; -} - -static void bch2_journal_iter_exit(struct journal_iter *iter) -{ - list_del(&iter->list); -} - -static void bch2_journal_iter_init(struct bch_fs *c, - struct journal_iter *iter, - enum btree_id id, unsigned level, - struct bpos pos) -{ - iter->btree_id = id; - iter->level = level; - iter->keys = &c->journal_keys; - iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); -} - -static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -{ - return bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); -} - -static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -{ - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -} - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -{ - if (bpos_eq(iter->pos, SPOS_MAX)) - iter->at_end = true; - else - iter->pos = bpos_successor(iter->pos); -} - -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -{ - struct bkey_s_c btree_k, journal_k, ret; -again: - if (iter->at_end) - return bkey_s_c_null; - - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); - - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); - - ret = journal_k.k && - (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) - ? 
journal_k - : btree_k; - - if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) - ret = bkey_s_c_null; - - if (ret.k) { - iter->pos = ret.k->p; - if (bkey_deleted(ret.k)) { - bch2_btree_and_journal_iter_advance(iter); - goto again; - } - } else { - iter->pos = SPOS_MAX; - iter->at_end = true; - } - - return ret; -} - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -{ - bch2_journal_iter_exit(&iter->journal); -} - -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b, - struct btree_node_iter node_iter, - struct bpos pos) -{ - memset(iter, 0, sizeof(*iter)); - - iter->b = b; - iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); - iter->pos = b->data->min_key; - iter->at_end = false; -} - -/* - * this version is used by btree_gc before filesystem has gone RW and - * multithreaded, so uses the journal_iters list: - */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) -{ - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); -} - -/* sort and dedup all keys in the journal: */ - -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); - genradix_free(&c->journal_entries); -} - -/* - * When keys compare equal, oldest compares first: - */ -static int journal_sort_key_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - - return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); -} - -void bch2_journal_keys_free(struct journal_keys *keys) -{ - struct journal_key *i; - - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - for (i = keys->d; i < keys->d + keys->nr; i++) - if (i->allocated) - kfree(i->k); - - kvfree(keys->d); - keys->d = NULL; - keys->nr = keys->gap = keys->size = 0; -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - struct journal_key *src, *dst; - - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; - } - - keys->nr = dst - keys->d; -} - -static int journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; - struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - 
nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - cond_resched(); - - for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - nr_read++; - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); - return 0; -} - /* journal replay: */ static void replay_now_at(struct journal *j, u64 seq) @@ -1254,7 +737,7 @@ int bch2_fs_recovery(struct bch_fs *c) } } - ret = journal_keys_sort(c); + ret = bch2_journal_keys_sort(c); if (ret) goto err; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 0cd6b8a13c8c..852d30567da9 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,56 +2,6 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; - -/* - * Iterate over keys in the btree, with keys from the journal overlaid on top: - */ - -struct btree_and_journal_iter { - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; -}; - -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_insert(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_delete(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, - struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); - -void bch2_journal_keys_free(struct journal_keys *); -void bch2_journal_entries_free(struct bch_fs *); - extern const char * const bch2_recovery_passes[]; /* @@ -81,4 +31,3 @@ int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_H */ - diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 
46f0be3d0cef..d9dbcd0bdbf5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -13,6 +13,7 @@ #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" -- cgit From 83b3d9598a650d28071dbda0b97c7a9a6abd6e4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Aug 2023 19:30:38 -0400 Subject: bcachefs: Fix 'journal not marked as containing replicas' This fixes the replicas_write_errors test: the patch bcachefs: mark journal replicas before journal write submission partially fixed replicas marking for the journal, but it broke the case where one replica failed - this patch re-adds marking after the journal write completes, when we know how many replicas succeeded. Additionally, we do not consider it a fsck error when the very last journal entry is not correctly marked, since there is an inherent race there. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 378b3f9170d4..50a7c3330807 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1305,18 +1305,14 @@ int bch2_journal_read(struct bch_fs *c, bch2_replicas_entry_sort(&replicas.e); - /* - * If we're mounting in degraded mode - if we didn't read all - * the devices - this is wrong: - */ - printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, &replicas.e); if (!degraded && - fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, - "superblock not marked as containing replicas %s", - buf.buf)) { + !bch2_replicas_marked(c, &replicas.e) && + (le64_to_cpu(i->j.seq) == *last_seq || + fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", + le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) goto err; @@ -1483,6 +1479,7 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; int err = 0; @@ -1494,7 +1491,13 @@ static void journal_write_done(struct closure *cl) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; } + if (err) bch2_fatal_error(c); -- cgit From e0a2b00a42e347a1be596fd9d1c071f0cd8c3f1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Aug 2023 22:22:31 -0400 Subject: bcachefs: Fix check_version_upgrade() We were failing to upgrade to the latest compatible version - whoops. 
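As a rough illustration of the bug class (made-up helper names and version numbers, not the bcachefs ones): a yes/no compatibility predicate cannot select an upgrade target, so a newest-compatible lookup has to be used instead:

  #include <stdio.h>
  #include <stdbool.h>

  #define CURRENT_VERSION 25u	/* assumed value, for illustration only */

  /* predicate: answers "can we read v?" but selects no upgrade target */
  static bool version_compatible(unsigned v)
  {
          return v <= CURRENT_VERSION;
  }

  /* returns the newest version we know how to upgrade v to (simplified) */
  static unsigned latest_compatible_version(unsigned v)
  {
          return v <= CURRENT_VERSION ? CURRENT_VERSION : v;
  }

  int main(void)
  {
          unsigned on_disk = 20;

          printf("compatible: %d\n", version_compatible(on_disk));
          printf("upgrade target: %u\n", latest_compatible_version(on_disk));
          return 0;
  }
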
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bd4a99c0d21f..33a68a335be6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -507,7 +507,7 @@ static struct recovery_pass_fn recovery_pass_fns[] = { static void check_version_upgrade(struct bch_fs *c) { - unsigned latest_compatible = bch2_version_compatible(c->sb.version); + unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version = bcachefs_metadata_version_current; unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; -- cgit From e6375481c9efb765687cc4d6c1396b335c3d5ef1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 12:13:19 -0400 Subject: bcachefs: Improve bch2_write_points_to_text() Now we also print the open_buckets owned by each write_point - this is to help with debugging a shutdown hang. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 46 +++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1f4c5b38562d..089185a661ee 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1516,25 +1516,47 @@ static const char * const bch2_write_point_states[] = { NULL }; +static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + unsigned i; + + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + + printbuf_indent_add(out, 2); + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); + printbuf_indent_sub(out, 2); +} + void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) { struct write_point *wp; - unsigned i; + prt_str(out, "Foreground write points\n"); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) { - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + wp++) + bch2_write_point_to_text(out, c, wp); - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); + prt_str(out, "Copygc write point\n"); + bch2_write_point_to_text(out, c, &c->copygc_write_point); - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } + prt_str(out, "Rebalance write point\n"); + bch2_write_point_to_text(out, c, &c->rebalance_write_point); - prt_newline(out); - } + prt_str(out, "Btree write point\n"); + bch2_write_point_to_text(out, c, &c->btree_write_point); } -- cgit From 791236b85c2dfd3bc6b857431658efb49de83343 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sat, 12 Aug 2023 15:47:45 +0100 Subject: bcachefs: Add btree_trans* to inode_set_fn This will be used when we need to re-hash a directory tree when setting flags. It is not possible to have concurrent btree_trans on a thread. 
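Roughly, the pattern being adopted looks like this (hypothetical types, not the bcachefs ones): the caller threads its transaction into the callback, so the callback can issue further updates in the same transaction instead of opening a nested one on the same thread:

  #include <stdio.h>

  struct trans   { int id; };		/* stand-in for a btree transaction */
  struct inode_s { unsigned flags; };

  /* the callback now receives the transaction it runs inside */
  typedef int (*inode_set_fn)(struct trans *, struct inode_s *, void *);

  static int set_flags(struct trans *trans, struct inode_s *inode, void *p)
  {
          inode->flags = *(unsigned *) p;
          /* with trans in hand, this could also queue e.g. a re-hash update */
          printf("updated flags in trans %d\n", trans->id);
          return 0;
  }

  static int inode_update(struct trans *trans, struct inode_s *inode,
                          inode_set_fn set, void *p)
  {
          return set(trans, inode, p);	/* one transaction, no nesting */
  }

  int main(void)
  {
          struct trans t = { .id = 1 };
          struct inode_s ino = { .flags = 0 };
          unsigned new_flags = 0x4;

          return inode_update(&t, &ino, set_flags, &new_flags);
  }
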
Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 12 ++++++++---- fs/bcachefs/fs-ioctl.c | 11 +++++++---- fs/bcachefs/fs.c | 5 +++-- fs/bcachefs/fs.h | 3 ++- fs/bcachefs/xattr.c | 3 ++- 5 files changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 11a4919f30cd..ceab12fb8a8f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -109,7 +109,8 @@ struct inode_new_size { unsigned fields; }; -static int inode_set_size(struct bch_inode_info *inode, +static int inode_set_size(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -389,7 +390,8 @@ static int bch2_extend(struct mnt_idmap *idmap, return bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_truncate_finish_fn(struct bch_inode_info *inode, +static int bch2_truncate_finish_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -397,7 +399,8 @@ static int bch2_truncate_finish_fn(struct bch_inode_info *inode, return 0; } -static int bch2_truncate_start_fn(struct bch_inode_info *inode, +static int bch2_truncate_start_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { u64 *new_i_size = p; @@ -518,7 +521,8 @@ err: /* fallocate: */ -static int inode_update_times_fn(struct bch_inode_info *inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index dfa1bf73c854..141bcced031e 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -31,7 +31,8 @@ struct flags_set { bool projinherit; }; -static int bch2_inode_flags_set(struct bch_inode_info *inode, +static int bch2_inode_flags_set(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -124,7 +125,8 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, return copy_to_user(arg, &fa, sizeof(fa)); } -static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -135,7 +137,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, bi->bi_project = s->projid; } - return bch2_inode_flags_set(inode, bi, p); + return bch2_inode_flags_set(trans, inode, bi, p); } static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -192,7 +194,8 @@ err: return ret; } -static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, +static int bch2_reinherit_attrs_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index aa7ec5dc9ff1..113518ebd095 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -92,7 +92,7 @@ retry: ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: - (set ? set(inode, &inode_u, p) : 0) ?: + (set ? 
set(&trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -1414,7 +1414,8 @@ static void bch2_destroy_inode(struct inode *vinode) call_rcu(&vinode->i_rcu, bch2_i_callback); } -static int inode_update_times_fn(struct bch_inode_info *inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6170d214d648..10e11119ded2 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -174,7 +174,8 @@ static inline int bch2_set_projid(struct bch_fs *c, struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct bch_inode_info *, +typedef int (*inode_set_fn)(struct btree_trans *, + struct bch_inode_info *, struct bch_inode_unpacked *, void *); void bch2_inode_update_after_write(struct btree_trans *, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 70f78006daf2..6f6b3caf0607 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -494,7 +494,8 @@ struct inode_opt_set { bool defined; }; -static int inode_opt_set_fn(struct bch_inode_info *inode, +static int inode_opt_set_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { -- cgit From c036359197705e5b133b30154771c3ebad5698b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 12:34:47 -0400 Subject: bcachefs: Check for directories in deleted inodes btree Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e0d416553bf0..46c4012facb0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1048,6 +1048,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) if (ret) goto err; + if (fsck_err_on(S_ISDIR(inode.bi_mode), c, + "directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) -- cgit From c294ea50da4b1a0ee84253f46391aa87a6efe91c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 15:05:06 -0400 Subject: bcachefs: six locks: Fix missing barrier on wait->lock_acquired Six locks do lock handoff via the wakeup path: the thread doing the wakeup also takes the lock on behalf of the waiter, which means the waiter only has to look at its waitlist entry, and doesn't have to touch the lock cacheline while another thread is using it. Linus noticed that this needs a real barrier, which this patch fixes. Also add a comment for the should_sleep_fn() error path. 
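To illustrate the ordering requirement, here is a rough, self-contained sketch of the handoff pattern using C11 atomics as stand-ins for smp_store_release()/smp_load_acquire(); the struct and field names are made up, this is not the six lock code itself:

  #include <stdatomic.h>
  #include <stdbool.h>

  struct waiter {
          int             granted_state;	/* written by the waker before handoff */
          atomic_bool     lock_acquired;
  };

  /* waker side: make all prior writes to *w visible, then hand off */
  static void hand_off(struct waiter *w, int state)
  {
          w->granted_state = state;
          atomic_store_explicit(&w->lock_acquired, true, memory_order_release);
  }

  /* waiter side: once the acquire load observes true, granted_state
   * (and the earlier list removal) are guaranteed to be visible too */
  static bool lock_was_handed_off(struct waiter *w, int *state)
  {
          if (!atomic_load_explicit(&w->lock_acquired, memory_order_acquire))
                  return false;
          *state = w->granted_state;
          return true;
  }

A plain compiler barrier only prevents compiler reordering; on weakly ordered architectures it is the paired release store and acquire load that makes the handoff safe.
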
Signed-off-by: Kent Overstreet Cc: Linus Torvalds Cc: Boqun Feng Cc: linux-bcachefs@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- fs/bcachefs/six.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7ce45aeaee8d..0473aa4dd18a 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -221,14 +221,16 @@ again: if (ret <= 0) goto unlock; - __list_del(w->list.prev, w->list.next); task = w->task; + __list_del(w->list.prev, w->list.next); /* - * Do no writes to @w besides setting lock_acquired - otherwise - * we would need a memory barrier: + * The release barrier here ensures the ordering of the + * __list_del before setting w->lock_acquired; @w is on the + * stack of the thread doing the waiting and will be reused + * after it sees w->lock_acquired with no other locking: + * pairs with smp_load_acquire() in six_lock_slowpath() */ - barrier(); - w->lock_acquired = true; + smp_store_release(&w->lock_acquired, true); wake_up_process(task); } @@ -499,17 +501,32 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (wait->lock_acquired) + /* + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ + if (smp_load_acquire(&wait->lock_acquired)) break; ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; if (unlikely(ret)) { + bool acquired; + + /* + * If should_sleep_fn() returns an error, we are + * required to return that error even if we already + * acquired the lock - should_sleep_fn() might have + * modified external state (e.g. when the deadlock cycle + * detector in bcachefs issued a transaction restart) + */ raw_spin_lock(&lock->wait_lock); - if (!wait->lock_acquired) + acquired = wait->lock_acquired; + if (!acquired) list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); - if (unlikely(wait->lock_acquired)) + if (unlikely(acquired)) do_six_unlock_type(lock, type); break; } -- cgit From 73ded163e5ec47d229683b32c501e548b745d032 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 16:46:54 -0400 Subject: bcachefs: Add a comment for should_drop_open_bucket() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 089185a661ee..e02749ddc362 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -989,7 +989,6 @@ retry_blocking: cl = _cl; goto retry_blocking; } - } return ret; @@ -1031,6 +1030,16 @@ static int open_bucket_add_buckets(struct btree_trans *trans, return ret < 0 ? 
ret : 0; } +/** + * should_drop_bucket - check if this is open_bucket should go away + * @ca: if set, we're killing buckets for a particular device + * @ec: if true, we're shutting down erasure coding and killing all ec + * open_buckets + * otherwise, return true + * + * We're killing open_buckets because we're shutting down a device, erasure + * coding, or the entire filesystem - check if this open_bucket matches: + */ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, struct bch_dev *ca, bool ec) { -- cgit From df5a915a15a7343170f6ce707d807daa9efc245c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 16:52:33 -0400 Subject: bcachefs: Fix lifetime in bch2_write_done(), add assertion We're hunting for an open_bucket leak, add an assertion to help track it down: also, we can't use the bch_fs after dropping our write ref to it. Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f42d9da2e16e..499585d7cc5d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -710,13 +710,15 @@ static void bch2_write_done(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; + EBUG_ON(op->open_buckets.nr); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); + if (!(op->flags & BCH_WRITE_MOVE)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - EBUG_ON(cl->parent); closure_debug_destroy(cl); if (op->end_io) -- cgit From 93ee2c4b211b6fcfa0d796801372732a8214617c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Aug 2023 16:51:45 -0400 Subject: bcachefs: Don't open code closure_nr_remaining() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 986dd541435a..c741150e68af 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2385,7 +2385,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) as, as->mode, as->nodes_written, - atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + closure_nr_remaining(&as->cl), as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index cfc624463700..81518f20d37d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -415,7 +415,7 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, break; } - if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock(trans); closure_sync(&cl); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 499585d7cc5d..a3dc944d63cf 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -478,7 +478,7 @@ retry: ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); out: - if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock(trans); closure_sync(&cl); } -- cgit From f854ce4d0a3f7281b5b99b28dd028abe21c1b0c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet 
Date: Sat, 12 Aug 2023 17:10:42 -0400 Subject: bcachefs: six locks: Guard against wakee exiting in __six_lock_wakeup() Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 0473aa4dd18a..7faa27310de4 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "six.h" @@ -221,7 +222,12 @@ again: if (ret <= 0) goto unlock; - task = w->task; + /* + * Similar to percpu_rwsem_wake_function(), we need to guard + * against the wakee noticing w->lock_acquired, returning, and + * then exiting before we do the wakeup: + */ + task = get_task_struct(w->task); __list_del(w->list.prev, w->list.next); /* * The release barrier here ensures the ordering of the @@ -232,6 +238,7 @@ again: */ smp_store_release(&w->lock_acquired, true); wake_up_process(task); + put_task_struct(task); } six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -- cgit From 01a7e74fe14179cba90bf3f52ad3188a1d6819d2 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sat, 12 Aug 2023 22:26:29 +0100 Subject: bcachefs: Introduce bch2_dirent_get_name A nice cleanup that avoids a bunch of open-coding name/string usage around dirent usage. Will be used by casefolding impl in future commits. Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 52 ++++++++++++++++++++++++++++++---------------------- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fs.c | 8 +++++--- 3 files changed, 36 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 065ea59ee9fa..a87c4e5f089d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,7 +13,7 @@ #include -unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { unsigned len = bkey_val_bytes(d.k) - offsetof(struct bch_dirent, d_name); @@ -21,6 +21,11 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) return strnlen(d.v->d_name, len); } +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) +{ + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); +} + static u64 bch2_dirent_hash(const struct bch_hash_info *info, const struct qstr *name) { @@ -41,7 +46,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + struct qstr name = bch2_dirent_get_name(d); return bch2_dirent_hash(info, &name); } @@ -49,20 +54,20 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - int len = bch2_dirent_name_bytes(l); - const struct qstr *r = _r; + const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr *r_name = _r; - return len - r->len ?: memcmp(l.v->d_name, r->name, len); + return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); } static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - int l_len = bch2_dirent_name_bytes(l); - int r_len = bch2_dirent_name_bytes(r); + const struct qstr l_name = 
bch2_dirent_get_name(l); + const struct qstr r_name = bch2_dirent_get_name(r); - return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); + return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); } static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -89,37 +94,36 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - unsigned len; + struct qstr d_name = bch2_dirent_get_name(d); - len = bch2_dirent_name_bytes(d); - if (!len) { + if (!d_name.len) { prt_printf(err, "empty name"); return -BCH_ERR_invalid_bkey; } - if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(len)); + bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); return -BCH_ERR_invalid_bkey; } - if (len > BCH_NAME_MAX) { + if (d_name.len > BCH_NAME_MAX) { prt_printf(err, "dirent name too big (%u > %u)", - len, BCH_NAME_MAX); + d_name.len, BCH_NAME_MAX); return -BCH_ERR_invalid_bkey; } - if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { + if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { prt_printf(err, "invalid name"); return -BCH_ERR_invalid_bkey; } - if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { + if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { prt_printf(err, "invalid name"); return -BCH_ERR_invalid_bkey; } - if (memchr(d.v->d_name, '/', len)) { + if (memchr(d_name.name, '/', d_name.len)) { prt_printf(err, "invalid name"); return -BCH_ERR_invalid_bkey; } @@ -137,10 +141,11 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); prt_printf(out, "%.*s -> %llu type %s", - bch2_dirent_name_bytes(d), - d.v->d_name, + d_name.len, + d_name.name, d.v->d_type != DT_SUBVOL ? 
le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), @@ -507,6 +512,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) subvol_inum target; u32 snapshot; struct bkey_buf sk; + struct qstr name; int ret; bch2_bkey_buf_init(&sk); @@ -537,9 +543,11 @@ retry: dirent = bkey_i_to_s_c_dirent(sk.k); bch2_trans_unlock(&trans); + name = bch2_dirent_get_name(dirent); + ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, dirent.v->d_name, - bch2_dirent_name_bytes(dirent), + if (!dir_emit(ctx, name.name, + name.len, target.inum, vfs_d_type(dirent.v->d_type))) break; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index b42f4a13bc55..e9fa1df38232 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -24,7 +24,7 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; -unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); static inline unsigned dirent_val_u64s(unsigned len) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 113518ebd095..0e1b31707d80 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1237,7 +1237,8 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child struct bch_inode_unpacked inode_u; subvol_inum target; u32 snapshot; - unsigned name_len; + struct qstr dirent_name; + unsigned name_len = 0; int ret; if (!S_ISDIR(dir->v.i_mode)) @@ -1314,9 +1315,10 @@ retry: ret = -ENOENT; goto err; found: - name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + dirent_name = bch2_dirent_get_name(d); - memcpy(name, d.v->d_name, name_len); + name_len = min_t(unsigned, dirent_name.len, NAME_MAX); + memcpy(name, dirent_name.name, name_len); name[name_len] = '\0'; err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- cgit From 29c336afebb9013706334c1b451f31fe5ff37f34 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sat, 12 Aug 2023 22:26:30 +0100 Subject: bcachefs: Optimize bch2_dirent_name_bytes Avoids doing a full strnlen for getting the length of the name of a dirent entry. Given the fact that the name of dirents is stored at the end of the bkey's value, and we know the length of that in u64s, we can find the last u64 and figure out how many NUL bytes are at the end of the string. On little endian systems this ends up being the leading zeros of the last u64, whereas on big endian systems this ends up being the trailing zeros of the last u64. We can take that value in bits and divide it by 8 to get the number of NUL bytes at the end. There is no endian-fixup or other compatibility here as this is string data interpreted as a u64. Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a87c4e5f089d..6f9eb88c7dba 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -15,10 +15,18 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { - unsigned len = bkey_val_bytes(d.k) - - offsetof(struct bch_dirent, d_name); - - return strnlen(d.v->d_name, len); + unsigned bkey_u64s = bkey_val_u64s(d.k); + unsigned bkey_bytes = bkey_u64s * sizeof(u64); + u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; +#if CPU_BIG_ENDIAN + unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; +#else + unsigned trailing_nuls = last_u64 ? 
__builtin_clzll(last_u64) / 8 : 64 / 8; +#endif + + return bkey_bytes - + offsetof(struct bch_dirent, d_name) - + trailing_nuls; } struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) @@ -113,6 +121,11 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (d_name.len != strnlen(d_name.name, d_name.len)) { + prt_printf(err, "dirent has stray data after name's NUL"); + return -BCH_ERR_invalid_bkey; + } + if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { prt_printf(err, "invalid name"); return -BCH_ERR_invalid_bkey; -- cgit From a125c0742ccb0b5f2bc84f3f1a8bcee173c1130d Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sun, 13 Aug 2023 16:53:45 +0100 Subject: bcachefs: Lower BCH_NAME_MAX to 512 To ensure we aren't shooting ourselves in the foot after merge for potentially doing future revisions for dirent or for storing multiple names for casefolding, limit this to 512 for now. Previously this define was linked to the max size a d_name in bch_dirent could be. Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 +--- fs/bcachefs/dirent.c | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5ec218ee3569..23bae622309c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -916,9 +916,7 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ - sizeof(struct bkey) - \ - offsetof(struct bch_dirent, d_name))) +#define BCH_NAME_MAX 512 /* Xattrs */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 6f9eb88c7dba..a7559ab03802 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -115,7 +115,11 @@ int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (d_name.len > BCH_NAME_MAX) { + /* + * Check new keys don't exceed the max length + * (older keys may be larger.) + */ + if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { prt_printf(err, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); return -BCH_ERR_invalid_bkey; -- cgit From e9679b4a0618b0b55d22ec555bc9c6b2dab39809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 13:04:08 -0400 Subject: bcachefs: Fix 'pointer to invalid device' check This fixes the device removal tests, which have been failing at random due to the fact that when we're running the .key_invalid checks in the write path the key may actually no longer exist - we might be racing with the keys being deleted. 
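The shape of the fix is a validity check that is relaxed when the caller says it is running in the write path. A rough sketch of that pattern, under stated assumptions: the flag and helper names below are illustrative and not the exact bcachefs API, while bch2_dev_exists2() and prt_printf() are the helpers the real patch uses.

enum validate_flags {
	VALIDATE_COMMIT	= BIT(0),
	VALIDATE_WRITE	= BIT(1),	/* called from the write path */
};

static int check_dev_ptr(struct bch_fs *c, unsigned dev,
			 unsigned flags, struct printbuf *err)
{
	if (!bch2_dev_exists2(c, dev)) {
		/*
		 * In the write path the key may already have been
		 * overwritten, and the device it points to may have been
		 * removed concurrently - not an error:
		 */
		if (flags & VALIDATE_WRITE)
			return 0;

		prt_printf(err, "pointer to invalid device (%u)", dev);
		return -EINVAL;
	}

	return 0;
}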
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d7f74db4c83e..1b25f84e4b9c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1059,6 +1059,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata, @@ -1071,6 +1072,14 @@ static int extent_ptr_invalid(const struct bch_fs *c, struct bch_dev *ca; if (!bch2_dev_exists2(c, ptr->dev)) { + /* + * If we're in the write path this key might have already been + * overwritten, and we could be seeing a device that doesn't + * exist anymore due to racing with device removal: + */ + if (flags & BKEY_INVALID_WRITE) + return 0; + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); return -BCH_ERR_invalid_bkey; } @@ -1136,8 +1145,8 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, - false, err); + ret = extent_ptr_invalid(c, k, flags, &entry->ptr, + size_ondisk, false, err); if (ret) return ret; -- cgit From ff5b741c25fb9546d876ca4c0c1d8720f6a2471c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 18:15:53 -0400 Subject: bcachefs: Zero btree_paths on allocation This fixes a bug in the cycle detector, bch2_check_for_deadlock() - we have to make sure the node pointers in the btree paths array are set to something not-garbage before another thread may see them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ad95849845a5..d22412dc5b46 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2898,12 +2898,14 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) #ifdef __KERNEL__ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); #endif - if (!p) + if (!p) { p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at paths in - * other threads - */ + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at + * paths in other threads + */ + memset(p, 0, paths_bytes); + } trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; -- cgit From c8ef8c3eb573ae2d7f7f41d93f0e34b8f6dfcd6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 18:04:32 -0400 Subject: bcachefs: Fix bch2_extent_fallocate() - There was no need for a retry loop in bch2_extent_fallocate(); if we have to retry we may be overwriting something different and we need to return an error and let the caller retry. - The bch2_alloc_sectors_start() error path was wrong, and wasn't running our cleanup at the end of the function This also fixes a very rare open bucket leak due to the missing cleanup. 
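The leak came from an early return that skipped the function's teardown; the fix funnels every exit through one cleanup block and leaves retries to the caller (returning a transaction-restart error instead of looping). A generic sketch of that idiom under stated assumptions: acquire_resources(), do_work() and release_resources() are hypothetical helpers, and the cleanup must be safe to run on zero-initialized state, which is why the real patch initializes open_buckets and the disk reservation to { 0 }.

static int do_op_with_cleanup(struct bch_fs *c)
{
	struct closure cl;
	int ret;

	closure_init_stack(&cl);

	ret = acquire_resources(c);	/* hypothetical */
	if (ret)
		goto err;		/* no bare returns after this point */

	ret = do_work(c);		/* hypothetical */
err:
	release_resources(c);		/* must tolerate nothing-acquired state */

	if (closure_nr_remaining(&cl) != 1)
		closure_sync(&cl);

	return ret;
}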
Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index a3dc944d63cf..3c614c864b6e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -380,10 +380,10 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct bch_fs *c = trans->c; struct disk_reservation disk_res = { 0 }; struct closure cl; - struct open_buckets open_buckets; + struct open_buckets open_buckets = { 0 }; struct bkey_s_c k; struct bkey_buf old, new; - unsigned sectors_allocated; + unsigned sectors_allocated = 0; bool have_reservation = false; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -392,9 +392,6 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_bkey_buf_init(&old); bch2_bkey_buf_init(&new); closure_init_stack(&cl); - open_buckets.nr = 0; -retry: - sectors_allocated = 0; k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); @@ -413,14 +410,14 @@ retry: */ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); if (unlikely(ret)) - goto out; + goto err; bch2_bkey_buf_reassemble(&old, c, k); } if (have_reservation) { if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto out; + goto err; bch2_key_resize(&new.k->k, sectors); } else if (!unwritten) { @@ -452,13 +449,10 @@ retry: opts.data_replicas, opts.data_replicas, BCH_WATERMARK_normal, 0, &cl, &wp); - if (ret) { - bch2_trans_unlock(trans); - closure_sync(&cl); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - goto retry; - return ret; - } + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; sectors = min(sectors, wp->sectors_free); sectors_allocated = sectors; @@ -477,17 +471,7 @@ retry: ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); -out: - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto retry; - } - +err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); @@ -496,6 +480,11 @@ out: bch2_bkey_buf_exit(&new, c); bch2_bkey_buf_exit(&old, c); + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + return ret; } -- cgit From 029b85fe417c36b5689093dd4861c7980299a50f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 19:34:02 -0400 Subject: bcachefs: Fix bkey format calculation For extents, we increase the number of bits of the size field to allow extents to get bigger due to merging - but this code didn't check for overflow. 
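The overflow is easy to see with concrete numbers: if BKEY_FIELD_SIZE already needs 30 bits, the old unconditional "+= 4" asks for 34 bits in a field that can hold at most 32. A tiny standalone sketch of the clamping arithmetic (not the real bkey code):

#include <linux/minmax.h>

/* extra bits granted for extent merging, clamped to the 32-bit field width */
static unsigned size_field_extra_bits(unsigned bits_per_field)
{
	return min(4U, 32U - bits_per_field);
}

/*
 * size_field_extra_bits(10) == 4	- plenty of headroom, grant all 4
 * size_field_extra_bits(30) == 2	- only 2 bits left before overflow
 * size_field_extra_bits(32) == 0	- field is already at its maximum width
 */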
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index d6960e259c80..0a5bfe6e9a2d 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -591,8 +591,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) /* allow for extent merging: */ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - ret.bits_per_field[BKEY_FIELD_SIZE] += 4; - bits += 4; + unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); + + ret.bits_per_field[BKEY_FIELD_SIZE] += b; + bits += b; } ret.key_u64s = DIV_ROUND_UP(bits, 64); -- cgit From 8c9b0f7bdc4183573d57f0442693d99cc9758617 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 14 Aug 2023 10:49:42 -0400 Subject: bcachefs: fix up wonky error handling in bch2_seek_pagecache_hole() The folio_hole_offset() helper returns a mix of bool and int types. The latter is to support a possible -EAGAIN error code when using nonblocking locks. This is not only confusing, but the only caller also essentially ignores errors outside of stopping the range iteration. This means an -EAGAIN error can't return directly from folio_hole_offset() and may be lost via bch2_clamp_data_hole(). Fix up the error handling and make it more readable. __filemap_get_folio() returns -ENOENT instead of NULL when no folio exists, so reuse the same error code in folio_hole_offset(). Fix up bch2_seek_pagecache_hole() to return the current offset on -ENOENT, but otherwise return unexpected error code up to the caller. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 2c1ef13d9bcd..1e60eead2981 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -698,20 +698,26 @@ loff_t bch2_seek_pagecache_data(struct inode *vinode, return end_offset; } +/* + * Search for a hole in a folio. + * + * The filemap layer returns -ENOENT if no folio exists, so reuse the same error + * code to indicate a pagecache hole exists at the returned offset. Otherwise + * return 0 if the folio is filled with data, or an error code. This function + * can return -EAGAIN if nonblock is specified. + */ static int folio_hole_offset(struct address_space *mapping, loff_t *offset, unsigned min_replicas, bool nonblock) { struct folio *folio; struct bch_folio *s; unsigned i, sectors; - bool ret = true; + int ret = -ENOENT; folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); - if (folio == ERR_PTR(-EAGAIN)) - return -EAGAIN; - if (IS_ERR_OR_NULL(folio)) - return true; + if (IS_ERR(folio)) + return PTR_ERR(folio); s = bch2_folio(folio); if (!s) @@ -727,7 +733,7 @@ static int folio_hole_offset(struct address_space *mapping, loff_t *offset, } *offset = folio_end_pos(folio); - ret = false; + ret = 0; unlock: folio_unlock(folio); folio_put(folio); @@ -742,11 +748,13 @@ loff_t bch2_seek_pagecache_hole(struct inode *vinode, { struct address_space *mapping = vinode->i_mapping; loff_t offset = start_offset; + loff_t ret = 0; - while (offset < end_offset && - !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) - ; + while (!ret && offset < end_offset) + ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); + if (ret && ret != -ENOENT) + return ret; return min(offset, end_offset); } -- cgit From 62898dd12ba017c56c39a74ef891845118b0c190 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Aug 2023 22:29:41 -0400 Subject: bcachefs: Fix swallowing of data in buffered write path In __bch2_buffered_write, if we fail to write to an entire !uptodate folio, we have to back out the write, bail out and retry. But we were missing an iov_iter_revert() call, so the data written to the folio was lost and the rest of the write shifted to the wrong offset. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 102c70887f76..dc22182d532f 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -909,6 +909,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, if (!folio_test_uptodate(f) && f_copied != folio_size(f) && pos + copied + f_copied < inode->v.i_size) { + iov_iter_revert(iter, f_copied); folio_zero_range(f, 0, folio_size(f)); folios_trunc(&folios, fi); break; -- cgit From e5570df295fc951546fec782bc087f72a47277e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Aug 2023 15:05:18 -0400 Subject: bcachefs: stack_trace_save_tsk() depends on CONFIG_STACKTRACE Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e4f21fcae944..1ab7e247cca6 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -269,6 +269,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) { +#ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; int ret = 0; @@ -289,6 +290,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) up_read(&task->signal->exec_update_lock); return ret; +#else + return 0; +#endif } void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) -- cgit From 8e877caaad818595ecb6754355cea2058fd9848e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Aug 2023 16:54:33 -0400 Subject: bcachefs: Split out snapshot.c subvolume.c has gotten a bit large, this splits out a separate file just for managing snapshot trees - BTREE_ID_snapshots. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs_format.h | 5 + fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/fs.c | 1 + fs/bcachefs/fsck.c | 2 +- fs/bcachefs/inode.c | 1 + fs/bcachefs/quota.c | 2 +- fs/bcachefs/recovery.c | 1 + fs/bcachefs/snapshot.c | 1358 +++++++++++++++++++++++++++++++++++++ fs/bcachefs/snapshot.h | 256 +++++++ fs/bcachefs/subvolume.c | 1360 +------------------------------------- fs/bcachefs/subvolume.h | 223 ------- fs/bcachefs/super.c | 1 + fs/bcachefs/tests.c | 2 +- fs/bcachefs/util.h | 5 + 18 files changed, 1655 insertions(+), 1570 deletions(-) create mode 100644 fs/bcachefs/snapshot.c create mode 100644 fs/bcachefs/snapshot.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 4b7f384f703f..b4fa88dfd484 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -69,6 +69,7 @@ bcachefs-y := \ sb-members.o \ siphash.o \ six.o \ + snapshot.o \ subvolume.o \ super.o \ super-io.o \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 23bae622309c..20e96daf9ca1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1124,6 +1124,11 @@ struct bch_subvolume { __le32 flags; __le32 snapshot; __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + */ __le32 parent; __le32 pad; bch_le128 otime; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 90557f4c156d..6547142db428 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -13,6 +13,7 @@ #include "lru.h" #include "quota.h" #include "reflink.h" +#include "snapshot.h" #include "subvolume.h" #include "xattr.h" diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d22412dc5b46..98cf52c5e132 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -14,7 +14,7 @@ #include "extents.h" #include "journal.h" #include "replicas.h" -#include "subvolume.h" +#include "snapshot.h" #include "trace.h" #include diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 78a09aa050c7..83cc7f64c57c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -14,7 +14,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "replicas.h" -#include "subvolume.h" +#include "snapshot.h" #include diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 612fba60be14..a7fa20727d4b 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -11,7 +11,7 @@ #include "error.h" #include "extents.h" #include "keylist.h" -#include "subvolume.h" +#include "snapshot.h" #include "trace.h" static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0e1b31707d80..d2f93a8af4ac 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,6 +23,7 @@ #include "journal.h" #include "keylist.h" #include "quota.h" +#include "snapshot.h" #include "super.h" #include "xattr.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0b4ddf650a97..9524bd621b2c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,7 +12,7 @@ #include "inode.h" #include "keylist.h" #include "recovery.h" -#include "subvolume.h" +#include "snapshot.h" #include "super.h" #include 
"xattr.h" diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 46c4012facb0..8114b6e4f202 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -11,6 +11,7 @@ #include "extent_update.h" #include "inode.h" #include "str_hash.h" +#include "snapshot.h" #include "subvolume.h" #include "varint.h" diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 4f0654ff816f..ca99772aedc6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -5,7 +5,7 @@ #include "error.h" #include "inode.h" #include "quota.h" -#include "subvolume.h" +#include "snapshot.h" #include "super-io.h" static const char * const bch2_quota_types[] = { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 33a68a335be6..30efb3c90560 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -25,6 +25,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "snapshot.h" #include "subvolume.h" #include "super-io.h" diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c new file mode 100644 index 000000000000..ad7991ad87a9 --- /dev/null +++ b/fs/bcachefs/snapshot.c @@ -0,0 +1,1358 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "snapshot.h" + +#include + +/* + * Snapshot trees: + * + * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they + * exist to provide a stable identifier for the whole lifetime of a snapshot + * tree. + */ + +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; +} + +struct bkey_i_snapshot_tree * +__bch2_snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? 
ERR_PTR(ret) : s_t; +} + +static int bch2_snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __bch2_snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + bool ret; + + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + + ret = id && id < ancestor + ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) + : id == ancestor; + rcu_read_unlock(); + + return ret; +} + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + +struct snapshot_t_free_rcu { + struct rcu_head rcu; + struct snapshot_table *t; +}; + +static void snapshot_t_free_rcu(struct rcu_head *rcu) +{ + struct snapshot_t_free_rcu *free_rcu = + container_of(rcu, struct snapshot_t_free_rcu, rcu); + + kvfree(free_rcu->t); + kfree(free_rcu); +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = rcu_dereference_protected(c->snapshots, true); + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + if (old) { + struct snapshot_t_free_rcu *rcu = + kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); + + rcu->t = old; + call_rcu(&rcu->rcu, snapshot_t_free_rcu); + } + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + 
le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), + le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); +} + +int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) { + prt_printf(err, "bad parent node (%u <= %llu)", + id, k.k->p.offset); + return -BCH_ERR_invalid_bkey; + } + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { + prt_printf(err, "children not normalized"); + return -BCH_ERR_invalid_bkey; + } + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) { + prt_printf(err, "duplicate child nodes"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) { + prt_printf(err, "bad child node (%u >= %llu)", + id, k.k->p.offset); + return -BCH_ERR_invalid_bkey; + } + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { + prt_printf(err, "skiplist not normalized"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + if (!id != !s.v->parent || + (s.v->parent && + id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u)", id); + return -BCH_ERR_invalid_bkey; + } + } + } + + return 0; +} + +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + u32 id = new.k->p.offset; + int ret = 0; + + mutex_lock(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, id); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + u32 parent = id; + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); + + if (BCH_SNAPSHOT_DELETED(s.v)) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); + } + } else { + memset(t, 0, sizeof(*t)); + } +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); +} + +int bch2_snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +/* + * If @k is a snapshot with just one live child, it's part of a linear chain, + * which we consider to be an equivalence class: and then after snapshot + * deletion cleanup, there should only be a single key at a given position in + * this equivalence class. + * + * This sets the equivalence class of @k to be the child's equivalence class, if + * it's part of such a linear chain: this correctly sets equivalence classes on + * startup if we run leaf to root (i.e. in natural key order). + */ +int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; + u32 id = k.k->p.offset, child[2]; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + int ret = bch2_snapshot_live(trans, child[i]); + + if (ret < 0) + return ret; + + if (ret) + live_idx = i; + nr_live += ret; + } + + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? 
snapshot_t_mut(c, child[live_idx])->equiv + : id; + + mutex_unlock(&c->snapshot_table_lock); + + return 0; +} + +/* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + found = true; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { + struct bkey_i_subvolume *s; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = bch2_snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), c, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + 
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. + */ +int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot_tree(&trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + + if (!id) + return 0; + + rcu_read_lock(); + s = snapshot_t(c, id); + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); + + return id; +} + +static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +{ + struct bch_snapshot a; + unsigned i; + int ret; + + for (i = 0; i < 3; i++) { + if (!s.parent != !s.skip[i]) + return false; + + if (!s.parent) + continue; + + ret = bch2_snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); + if (bch2_err_matches(ret, ENOENT)) + return false; + if (ret) + return ret; + + if (a.tree != s.tree) + return false; + } + + return true; +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = 
bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + bch2_snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) + *s = u->v; + } + + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; + u32 i, id; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); + if (id) { + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { + bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot node %llu has nonexistent child %u", + k.k->p.offset, id); + if (ret) + goto err; + + if (le32_to_cpu(v.parent) != k.k->p.offset) { + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + k.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.subvol = 0; + s = u->v; + } + } + + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + + real_depth = bch2_snapshot_depth(c, parent_id); + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, 
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_snapshots(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ + ret = bch2_trans_run(c, + for_each_btree_key_reverse_commit(&trans, iter, + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_i_snapshot *s; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; + } + + /* already deleted? 
*/ + if (BCH_SNAPSHOT_DELETED(&s->v)) + goto err; + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct bkey_s_c_snapshot s; + u32 parent_id; + unsigned i; + int ret = 0; + + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) + goto err; + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + struct bkey_i_snapshot *parent; + + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + goto err; + } + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + } else { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &tree_iter); + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -BCH_ERR_ENOSPC_snapshot_create; + goto err; + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = 
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; + + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __bch2_snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + +/* + * If we have an unlinked inode in an internal snapshot node, and the inode + * really has been deleted in all child snapshots, how does this get cleaned up? + * + * first there is the problem of how keys that have been overwritten in all + * child snapshots get deleted (unimplemented?), but inodes may perhaps be + * special? 
+ * + * also: unlinked inode in internal snapshot appears to not be getting deleted + * correctly if inode doesn't exist in leaf snapshots + */ + +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } +} + +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = bch2_snapshot_live(trans, children[0]) ?: + bch2_snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + + if (!ret) + return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return 0; +} + +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; + u32 i, id; + int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + return ret; + } + } + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(&trans, &iter, k)); + if (ret) { + bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + goto err; + } + + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(&trans, k)); + if (ret) { + bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "walking snapshots"); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + + if (!btree_type_has_snapshots(id)) + continue; + + ret = for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); + + darray_exit(&equiv_seen); + + if (ret) { + bch_err_msg(c, ret, 
"deleting keys from dying snapshots"); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { + u32 node_to_delete = deleted.data[i]; + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, node_to_delete)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", node_to_delete); + goto err; + } + } + + clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: + darray_exit(&deleted); + bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + struct bch_fs *c = trans->c; + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) + return 0; + + bch2_delete_dead_snapshots_async(c); + return 0; +} + +int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + kfree(rcu_dereference_protected(c->snapshots, true)); +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h new file mode 100644 index 000000000000..826bff2ff7be --- /dev/null +++ b/fs/bcachefs/snapshot.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_H +#define _BCACHEFS_SNAPSHOT_H + +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + +struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); + +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ +}) + +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ + return &t->s[U32_MAX - id]; +} + +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} + +static inline 
u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else + return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + rcu_read_lock(); + while (n--) + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == bch2_snapshot_equiv(c, id); +} + +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + bool ret; + + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); + + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return !bch2_snapshot_is_internal_node(c, id); +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + return id == ancestor + ? 
true + : __bch2_snapshot_is_ancestor(c, id, ancestor); +} + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *t; + bool ret; + + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); + + return ret; +} + +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (*i == id) + return true; + return false; +} + +static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; +} + +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret; + + BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); +int bch2_snapshot_live(struct btree_trans *trans, u32 id); +int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); + +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); + +int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); +int bch2_delete_dead_snapshots_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +void bch2_delete_dead_snapshots_work(struct work_struct *); + +int bch2_snapshots_read(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); + +#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7de6fe0cdd43..0214a98deb4f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -6,885 +6,13 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "snapshot.h" #include "subvolume.h" #include static int bch2_subvolume_delete(struct btree_trans *, u32); -static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - - if (s->skip[2] <= ancestor) - return s->skip[2]; - if (s->skip[1] <= ancestor) - return s->skip[1]; - if (s->skip[0] <= ancestor) - return s->skip[0]; - return s->parent; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - bool ret; - - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); - - ret = id && id < ancestor - ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) - : id == ancestor; - rcu_read_unlock(); - - return ret; -} - -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; - rcu_read_unlock(); - - return id == ancestor; -} - -static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -{ - u32 depth; - - rcu_read_lock(); - depth = parent ? 
snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; -} - -struct snapshot_t_free_rcu { - struct rcu_head rcu; - struct snapshot_table *t; -}; - -static void snapshot_t_free_rcu(struct rcu_head *rcu) -{ - struct snapshot_t_free_rcu *free_rcu = - container_of(rcu, struct snapshot_t_free_rcu, rcu); - - kvfree(free_rcu->t); - kfree(free_rcu); -} - -static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - size_t new_size; - struct snapshot_table *new, *old; - - new_size = max(16UL, roundup_pow_of_two(idx + 1)); - - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); - if (!new) - return NULL; - - old = rcu_dereference_protected(c->snapshots, true); - if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); - - rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - if (old) { - struct snapshot_t_free_rcu *rcu = - kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); - - rcu->t = old; - call_rcu(&rcu->rcu, snapshot_t_free_rcu); - } - - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -} - -static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - - lockdep_assert_held(&c->snapshot_table_lock); - - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; - - return __snapshot_t_mut(c, id); -} - -/* Snapshot tree: */ - -void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); - - prt_printf(out, "subvol %u root snapshot %u", - le32_to_cpu(t.v->master_subvol), - le32_to_cpu(t.v->root_snapshot)); -} - -int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) -{ - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } - - return 0; -} - -int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot_tree *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); - - if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; - return ret; -} - -static struct bkey_i_snapshot_tree * -__snapshot_tree_create(struct btree_trans *trans) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, U32_MAX)); - struct bkey_i_snapshot_tree *s_t; - - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; - if (ret) - return ERR_PTR(ret); - - s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - bch2_trans_iter_exit(trans, &iter); - return ret ? 
ERR_PTR(ret) : s_t; -} - -static int snapshot_tree_create(struct btree_trans *trans, - u32 root_id, u32 subvol_id, u32 *tree_id) -{ - struct bkey_i_snapshot_tree *n_tree = - __snapshot_tree_create(trans); - - if (IS_ERR(n_tree)) - return PTR_ERR(n_tree); - - n_tree->v.master_subvol = cpu_to_le32(subvol_id); - n_tree->v.root_snapshot = cpu_to_le32(root_id); - *tree_id = n_tree->k.p.offset; - return 0; -} - -/* Snapshot nodes: */ - -void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol), - le32_to_cpu(s.v->tree)); - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) - prt_printf(out, " depth %u skiplist %u %u %u", - le32_to_cpu(s.v->depth), - le32_to_cpu(s.v->skip[0]), - le32_to_cpu(s.v->skip[1]), - le32_to_cpu(s.v->skip[2])); -} - -int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) -{ - struct bkey_s_c_snapshot s; - u32 i, id; - - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } - - s = bkey_s_c_to_snapshot(k); - - id = le32_to_cpu(s.v->parent); - if (id && id <= k.k->p.offset) { - prt_printf(err, "bad parent node (%u <= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } - - if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { - prt_printf(err, "children not normalized"); - return -BCH_ERR_invalid_bkey; - } - - if (s.v->children[0] && - s.v->children[0] == s.v->children[1]) { - prt_printf(err, "duplicate child nodes"); - return -BCH_ERR_invalid_bkey; - } - - for (i = 0; i < 2; i++) { - id = le32_to_cpu(s.v->children[i]); - - if (id >= k.k->p.offset) { - prt_printf(err, "bad child node (%u >= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } - } - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { - prt_printf(err, "skiplist not normalized"); - return -BCH_ERR_invalid_bkey; - } - - for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { - id = le32_to_cpu(s.v->skip[i]); - - if (!id != !s.v->parent || - (s.v->parent && - id <= k.k->p.offset)) { - prt_printf(err, "bad skiplist node %u)", id); - return -BCH_ERR_invalid_bkey; - } - } - } - - return 0; -} - -int bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct snapshot_t *t; - u32 id = new.k->p.offset; - int ret = 0; - - mutex_lock(&c->snapshot_table_lock); - - t = snapshot_t_mut(c, id); - if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; - goto err; - } - - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - u32 parent = id; - - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); - t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; - t->tree = le32_to_cpu(s.v->tree); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { - t->depth = le32_to_cpu(s.v->depth); - t->skip[0] = le32_to_cpu(s.v->skip[0]); - t->skip[1] = le32_to_cpu(s.v->skip[1]); - t->skip[2] = le32_to_cpu(s.v->skip[2]); - } else { - t->depth = 0; - t->skip[0] = 0; - t->skip[1] = 0; - t->skip[2] = 0; - } - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } else { - memset(t, 0, sizeof(*t)); - } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -static int snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s) -{ - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); -} - -static int snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? 
snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - -/* fsck: */ - -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -{ - return snapshot_t(c, id)->children[child]; -} - -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 0); -} - -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 1); -} - -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -{ - u32 n, parent; - - n = bch2_snapshot_left_child(c, id); - if (n) - return n; - - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); - if (n && n != id) - return n; - id = parent; - } - - return 0; -} - -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -{ - u32 id = snapshot_root; - u32 subvol = 0, s; - - while (id) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; - - id = bch2_snapshot_tree_next(c, id); - } - - return subvol; -} - -static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - u32 snapshot_root, u32 *subvol_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_subvolume s; - bool found = false; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - s = bkey_s_c_to_subvolume(k); - if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) - continue; - if (!BCH_SUBVOLUME_SNAP(s.v)) { - *subvol_id = s.k->p.offset; - found = true; - break; - } - } - - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *s; - - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); - - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - SET_BCH_SUBVOLUME_SNAP(&s->v, false); - } - - return ret; -} - -static int check_snapshot_tree(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - u32 root_id; - int ret; - - if (k.k->type != KEY_TYPE_snapshot_tree) - return 0; - - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - - ret = snapshot_lookup(trans, root_id, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - c, - "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, - "snapshot tree points to missing subvolume:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor_early(c, - le32_to_cpu(subvol.snapshot), - root_id), c, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - 
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, - "snapshot tree points to snapshot subvolume:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; - u32 subvol_id; - - ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); - if (ret) - goto err; - - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * For each snapshot_tree, make sure it points to the root of a snapshot tree - * and that snapshot entry points back to it, or delete it. - * - * And, make sure it points to a subvolume within that snapshot tree, or correct - * it to point to the oldest subvolume within that snapshot tree. - */ -int bch2_check_snapshot_trees(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, - BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot_tree(&trans, &iter, k))); - - if (ret) - bch_err(c, "error %i checking snapshot trees", ret); - return ret; -} - -/* - * Look up snapshot tree for @tree_id and find root, - * make sure @snap_id is a descendent: - */ -static int snapshot_tree_ptr_good(struct btree_trans *trans, - u32 snap_id, u32 tree_id) -{ - struct bch_snapshot_tree s_t; - int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - - if (bch2_err_matches(ret, ENOENT)) - return 0; - if (ret) - return ret; - - return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -} - -static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - - if (!id) - return 0; - - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; -} - -static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) -{ - struct bch_snapshot a; - unsigned i; - int ret; - - for (i = 0; i < 3; i++) { - if (!s.parent != !s.skip[i]) - return false; - - if (!s.parent) - continue; - - ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); - if (bch2_err_matches(ret, ENOENT)) - return false; - if (ret) - return ret; - - if (a.tree != s.tree) - return false; - } - - return true; -} - -/* - * snapshot_tree pointer was incorrect: look up root snapshot node, make sure - * its snapshot_tree pointer is correct (allocate new one if necessary), then - * update this node's pointer to root node's pointer: - */ -static int snapshot_tree_ptr_repair(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_snapshot *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; - struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; - - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); - ret = bkey_err(root); - if (ret) - goto err; - - tree_id = le32_to_cpu(root.v->tree); - - ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = 
bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), - &tree_id); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - if (k.k->p.offset == root_id) - *s = u->v; - } - - if (k.k->p.offset != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - *s = u->v; - } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; -} - -static int cmp_le32(__le32 l, __le32 r) -{ - return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -} - -static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct bch_snapshot v; - struct bkey_i_snapshot *u; - u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); - u32 real_depth; - struct printbuf buf = PRINTBUF; - bool should_have_subvol; - u32 i, id; - int ret = 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, bkey_val_bytes(k.k)); - - id = le32_to_cpu(s.parent); - if (id) { - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (le32_to_cpu(v.children[0]) != k.k->p.offset && - le32_to_cpu(v.children[1]) != k.k->p.offset) { - bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - for (i = 0; i < 2 && s.children[i]; i++) { - id = le32_to_cpu(s.children[i]); - - ret = snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot node %llu has nonexistent child %u", - k.k->p.offset, id); - if (ret) - goto err; - - if (le32_to_cpu(v.parent) != k.k->p.offset) { - bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { - bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - k.k->p.offset); - ret = -EINVAL; - goto err; - } - } else { - if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.subvol = 0; - s = u->v; - } - } - - ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) - goto err; - } - ret = 0; - - real_depth = bch2_snapshot_depth(c, parent_id); - - if (le32_to_cpu(s.depth) != real_depth && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - 
fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.depth = cpu_to_le32(real_depth); - s = u->v; - } - - ret = snapshot_skiplist_good(trans, s); - if (ret < 0) - goto err; - - if (!ret && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, "snapshot with bad skiplist field:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) - u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); - - bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); - s = u->v; - } - ret = 0; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_snapshots(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - /* - * We iterate backwards as checking/fixing the depth field requires that - * the parent's depth already be correct: - */ - ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(&trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); - return ret; -} - static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -900,7 +28,7 @@ static int check_subvol(struct btree_trans *trans, subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - ret = snapshot_lookup(trans, snapid, &snapshot); + ret = bch2_snapshot_lookup(trans, snapid, &snapshot); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "subvolume %llu points to nonexistent snapshot %u", @@ -968,462 +96,6 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } -void bch2_fs_snapshots_exit(struct bch_fs *c) -{ - kfree(rcu_dereference_protected(c->snapshots, true)); -} - -int bch2_snapshots_read(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k))); - if (ret) - bch_err_fn(c, ret); - return ret; -} - -/* - * Mark a snapshot as deleted, for future cleanup: - */ -static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? 
*/ - if (BCH_SNAPSHOT_DELETED(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; - struct bkey_s_c_snapshot s; - u32 parent_id; - unsigned i; - int ret = 0; - - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); - ret = bkey_err(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); - parent_id = le32_to_cpu(s.v->parent); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - goto err; - } - - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (i == 2) - bch_err(c, "snapshot %u missing child pointer to %u", - parent_id, id); - else - parent->v.children[i] = 0; - - if (le32_to_cpu(parent->v.children[0]) < - le32_to_cpu(parent->v.children[1])) - swap(parent->v.children[0], - parent->v.children[1]); - } else { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s.v->children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = 
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? 
bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -static int snapshot_delete_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - *last_pos = k.k->p; - - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} - -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - return 0; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = snapshot_live(trans, children[0]) ?: - snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - - if (!ret) - return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); - return 0; -} - -int bch2_delete_dead_snapshots(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; - snapshot_id_list deleted = { 0 }; - u32 i, id; - int ret = 0; - - if (!test_bit(BCH_FS_STARTED, &c->flags)) { - ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); - return ret; - } - } - - bch2_trans_init(&trans, c, 0, 0); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(&trans, &iter, k)); - if (ret) { - bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(&trans, k)); - if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) { - bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for (id = 0; id < BTREE_ID_NR; id++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; - - if (!btree_type_has_snapshots(id)) - continue; - - ret = for_each_btree_key_commit(&trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); - - darray_exit(&equiv_seen); - - if (ret) { - bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); - goto err; - } - } - - for (i = 0; i < deleted.nr; i++) { - ret = commit_do(&trans, 
NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.data[i])); - if (ret) { - bch_err(c, "error deleting snapshot %u: %s", - deleted.data[i], bch2_err_str(ret)); - goto err; - } - } - - clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -err: - darray_exit(&deleted); - bch2_trans_exit(&trans); - if (ret) - bch_err_fn(c, ret); - return ret; -} - -static void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - - if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - bch2_delete_dead_snapshots(c); - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - struct bch_fs *c = trans->c; - - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) - return 0; - - bch2_delete_dead_snapshots_async(c); - return 0; -} - /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, @@ -1478,26 +150,27 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, { struct bch_snapshot snap; - return snapshot_lookup(trans, snapshot, &snap) ?: + return bch2_snapshot_lookup(trans, snapshot, &snap) ?: bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); } -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, u32 *snapid) { struct btree_iter iter; - struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; int ret; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume; + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); if (likely(!ret)) - *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); - else if (bch2_err_matches(ret, ENOENT)) - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + *snapid = le32_to_cpu(subvol.v->snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1527,7 +200,12 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, } /* - * Scan for subvolumes with parent @subvolid_to_delete, reparent: + * Separate from the snapshot tree in the snapshots btree, we record the tree + * structure of how snapshot subvolumes were created - the parent subvolume of + * each snapshot subvolume. 
+ * + * When a subvolume is deleted, we scan for child subvolumes and reparant them, + * to avoid dangling references: */ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 6905e91a9470..8d4c50f4cd05 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -7,225 +7,8 @@ enum bkey_invalid_flags; -void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); - -#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ - .val_to_text = bch2_snapshot_tree_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - -void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); -int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); - -#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ - .val_to_text = bch2_snapshot_to_text, \ - .atomic_trigger = bch2_mark_snapshot, \ - .min_val_size = 24, \ -}) - -static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -{ - return &t->s[U32_MAX - id]; -} - -static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -{ - return __snapshot_t(rcu_dereference(c->snapshots), id); -} - -static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = snapshot_t(c, id)->tree; - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - return snapshot_t(c, id)->parent; -} - -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_parent_early(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; - - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) - panic("id %u depth=%u parent %u depth=%u\n", - id, snapshot_t(c, id)->depth, - parent, snapshot_t(c, parent)->depth); - - return parent; -#else - return snapshot_t(c, id)->parent; -#endif -} - -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -{ - rcu_read_lock(); - while (n--) - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; -} - -static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -{ - u32 parent; - - rcu_read_lock(); - while ((parent = __bch2_snapshot_parent(c, id))) - id = parent; - rcu_read_unlock(); - - return id; -} - -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - return snapshot_t(c, id)->equiv; -} - -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); - rcu_read_unlock(); - - return id; -} - -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); -} - -static inline bool 
bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - bool ret; - - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; - rcu_read_unlock(); - - return ret; -} - -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); - -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - return id == ancestor - ? true - : __bch2_snapshot_is_ancestor(c, id, ancestor); -} - -static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *t; - bool ret; - - rcu_read_lock(); - t = snapshot_t(c, id); - ret = (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; -} - -static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -{ - u32 *i; - - darray_for_each(*s, i) - if (*i == id) - return true; - return false; -} - -static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - u32 *i; - - darray_for_each(*s, i) - if (bch2_snapshot_is_ancestor(c, id, *i)) - return true; - return false; -} - -static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - int ret; - - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -int bch2_check_snapshot_trees(struct bch_fs *); -int bch2_check_snapshots(struct bch_fs *); int bch2_check_subvols(struct bch_fs *); -void bch2_fs_snapshots_exit(struct bch_fs *); -int bch2_snapshots_read(struct bch_fs *); - int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -238,14 +21,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); -int bch2_snapshot_get_subvol(struct btree_trans *, u32, - struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -/* only exported for tests: */ -int bch2_snapshot_node_create(struct btree_trans *, u32, - u32 *, u32 *, unsigned); - int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d9dbcd0bdbf5..8e2ec3b6c9b3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -48,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "snapshot.h" #include "subvolume.h" #include "super.h" #include "super-io.h" diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 1d4b0a583586..72389c7376d6 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -4,7 +4,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "journal_reclaim.h" -#include "subvolume.h" +#include "snapshot.h" #include "tests.h" #include "linux/kthread.h" diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3cec6171c58f..d34423352f60 100644 --- 
a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -842,6 +842,11 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +static inline int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + #include #endif /* _BCACHEFS_UTIL_H */ -- cgit From d0445e131e3ff1a68492f7f6e754cd672db4774d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Aug 2023 16:35:58 -0400 Subject: bcachefs: Fix divide by zero in rebalance_work() This fixes https://github.com/koverstreet/bcachefs-tools/issues/159 Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c3d577236ce2..15ce3ecba0ba 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -113,6 +113,10 @@ static void rebalance_work_accumulate(struct rebalance_work *w, unsigned percent_full; u64 work = dev_work + unknown_dev; + /* avoid divide by 0 */ + if (!capacity) + return; + if (work < dev_work || work < unknown_dev) work = U64_MAX; work = min(work, capacity); -- cgit From 55d5276d2e7b7ac88a21a7ab89efd308734bb775 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Aug 2023 22:04:20 -0400 Subject: bcachefs: Improve btree_path_relock_fail tracepoint In https://github.com/koverstreet/bcachefs/issues/450, we're seeing unexplained btree_path_relock_fail events - according to the information currently in the tracepoint, it appears the relock should be succeeding. This adds lock counts to the tracepoint to help track it down. Signed-off-by: Kent Overstreet --- fs/bcachefs/trace.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index a743ab477966..97fe774237d0 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -403,29 +403,43 @@ TRACE_EVENT(btree_path_relock_fail, __field(u8, level ) TRACE_BPOS_entries(pos) __array(char, node, 24 ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( struct btree *b = btree_path_node(path, level); + struct six_lock_count c; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - if (IS_ERR(b)) + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + + if (IS_ERR(b)) { strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); - else + } else { + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? 
six_lock_seq(&path->l[level].b->c.lock) : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], @@ -434,6 +448,10 @@ TRACE_EVENT(btree_path_relock_fail, __entry->pos_snapshot, __entry->level, __entry->node, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, __entry->iter_lock_seq, __entry->node_lock_seq) ); @@ -475,7 +493,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? six_lock_seq(&path->l[level].b->c.lock) -- cgit From adc0e9509184da7204397f4d0410b61f75bac706 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Aug 2023 00:05:35 -0400 Subject: bcachefs: Delete a faulty assertion Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 50a7c3330807..34740dca4b15 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -319,7 +319,6 @@ static int journal_validate_key(struct bch_fs *c, __btree_node_type(level, btree_id), write, &buf); mustfix_fsck_err(c, "%s", buf.buf); - BUG(); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -- cgit From 7573041ab958a14407621ef9756be49548f937e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Aug 2023 17:44:21 -0400 Subject: bcachefs: Fix bch2_mount error path In the bch2_mount() error path, we were calling deactivate_locked_super(), which calls ->kill_sb(), which in our case was calling bch2_fs_free() without __bch2_fs_stop(). This changes bch2_mount() to just call bch2_fs_stop() directly. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +++++++- fs/bcachefs/super.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d2f93a8af4ac..48431700b83e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1906,7 +1906,10 @@ out: return dget(sb->s_root); err_put_super: + sb->s_fs_info = NULL; + c->vfs_sb = NULL; deactivate_locked_super(sb); + bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1914,8 +1917,11 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; + if (c) + c->vfs_sb = NULL; generic_shutdown_super(sb); - bch2_fs_free(c); + if (c) + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8e2ec3b6c9b3..60424865980d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -581,6 +581,8 @@ void bch2_fs_free(struct bch_fs *c) { unsigned i; + BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); + mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); -- cgit From fa5bed376a184f2dcb48dba5c076583ed6c61340 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Aug 2023 21:13:44 -0400 Subject: bcachefs: move check_pos_snapshot_overwritten() to snapshot.c Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 53 ++++------------------------------------------ fs/bcachefs/snapshot.c | 34 +++++++++++++++++++++++++++++ fs/bcachefs/snapshot.h | 13 ++++++++++++ 3 files changed, 51 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a7fa20727d4b..880ce7431894 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -28,51 +28,6 @@ bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, struct bkey_i *, enum btree_update_flags, unsigned long ip); -static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) - return 0; - - return __check_pos_snapshot_overwritten(trans, id, pos); -} - static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -91,8 +46,8 @@ static noinline int extent_front_merge(struct btree_trans *trans, if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) return 0; - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); if (ret < 0) return ret; if (ret) @@ -114,8 +69,8 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; 
int ret; - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) return ret; if (ret) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ad7991ad87a9..0284250ac6cc 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1336,6 +1336,40 @@ int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, return 0; } +int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + int bch2_snapshots_read(struct bch_fs *c) { struct btree_iter iter; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 826bff2ff7be..624a42d1c8b7 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -250,6 +250,19 @@ int bch2_delete_dead_snapshots_hook(struct btree_trans *, struct btree_trans_commit_hook *); void bch2_delete_dead_snapshots_work(struct work_struct *); +int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); + +static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __bch2_key_has_snapshot_overwrites(trans, id, pos); +} + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); -- cgit From 66487c54ad8207ef1c029df45ffaeaafbcc11f77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jul 2023 02:43:29 -0400 Subject: bcachefs: Fix is_ancestor bitmap The is_ancestor bitmap is at optimization for bch2_snapshot_is_ancestor; once we get sufficiently close to the ancestor ID we're searching for we test a bitmap. But initialization of the is_ancestor bitmap was broken; we do it by using bch2_snapshot_parent(), but we call that on nodes that haven't been initialized yet with bch2_mark_snapshot(). Fix this by adding a separate loop in bch2_snapshots_read() for initializing the is_ancestor bitmap, and also add some new debug asserts for checking this sort of breakage in the future. 
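To make the invariant concrete, here is a minimal userspace sketch of the idea (toy types and a plain parent walk only - not the kernel's snapshot_t, and it ignores the skip lists the real lookup also uses): for node id, bit (a - id - 1) of its is_ancestor bitmap is set iff a is an ancestor of id, for ancestors at most BITMAP_BITS IDs above id, so an ancestry query only walks upward until the target falls inside the bitmap window and then answers with a single bit test.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BITMAP_BITS	128
#define MAX_ID		1024

struct toy_snapshot {
	uint32_t parent;			/* 0 == no parent (root); parent IDs are always larger */
	uint64_t is_ancestor[BITMAP_BITS / 64];
};

static struct toy_snapshot table[MAX_ID];

/*
 * Mirrors the separate init pass the patch adds: walk parents, recording
 * every ancestor that lands inside the bitmap window above this node.
 */
static void set_is_ancestor_bits(uint32_t id)
{
	uint32_t a = id;

	while ((a = table[a].parent) && a - id - 1 < BITMAP_BITS)
		table[id].is_ancestor[(a - id - 1) / 64] |= 1ULL << ((a - id - 1) % 64);
}

static bool is_ancestor(uint32_t id, uint32_t ancestor)
{
	/* walk up only until the target is within the bitmap window */
	while (id && id + BITMAP_BITS < ancestor)
		id = table[id].parent;

	if (id && id < ancestor) {
		uint32_t bit = ancestor - id - 1;

		return table[id].is_ancestor[bit / 64] & (1ULL << (bit % 64));
	}
	return id == ancestor;
}

int main(void)
{
	/* tiny chain: 10 -> 40 -> 300 */
	table[10].parent = 40;
	table[40].parent = 300;
	set_is_ancestor_bits(10);
	set_is_ancestor_bits(40);

	assert(is_ancestor(10, 40));
	assert(is_ancestor(10, 300));
	assert(is_ancestor(40, 300));
	assert(!is_ancestor(40, 10));
	return 0;
}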
Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 64 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0284250ac6cc..56961c074674 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -88,6 +88,20 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, /* Snapshot nodes: */ +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); @@ -114,26 +128,17 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - ret = id && id < ancestor - ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) - : id == ancestor; - rcu_read_unlock(); - - return ret; -} + if (id && id < ancestor) { + ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); + EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); + } else { + ret = id == ancestor; + } - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; rcu_read_unlock(); - return id == ancestor; + return ret; } struct snapshot_t_free_rcu { @@ -280,6 +285,23 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } +static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + struct snapshot_t *t = snapshot_t_mut(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); +} + +static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + __set_is_ancestor_bitmap(c, id); + mutex_unlock(&c->snapshot_table_lock); +} + int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -300,7 +322,6 @@ int bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - u32 parent = id; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); @@ -320,9 +341,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); + __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); @@ -1380,7 +1399,10 @@ int bch2_snapshots_read(struct bch_fs *c) for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k))); + bch2_snapshot_set_equiv(&trans, k)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); if (ret) bch_err_fn(c, ret); return ret; -- cgit From 
da525760802b9f18cd9eb9ecdb23952f41723de2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Aug 2023 19:57:34 -0400 Subject: bcachefs: Fix btree write buffer with snapshots btrees Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 1 + fs/bcachefs/btree_write_buffer.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 83cc7f64c57c..e099fe570962 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -97,6 +97,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5f96db539fd7..6d2d43b6ff6a 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -75,7 +75,8 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, } return 0; trans_commit: - return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: + return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, commit_flags| BTREE_INSERT_NOCHECK_RW| @@ -124,7 +125,8 @@ btree_write_buffered_insert(struct btree_trans *trans, BTREE_ITER_CACHED|BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); + bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -193,7 +195,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f if (!iter.path || iter.path->btree_id != i->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } bch2_btree_iter_set_pos(&iter, i->k.k.p); -- cgit From f55d6e07bc6c9b90f58586daf9c432adb5f5ce25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Aug 2023 22:10:02 -0400 Subject: bcachefs: Cleanup redundant snapshot nodes After deleting snapshots, we may be left with a snapshot tree where some nodes only have one child, and we have a linear chain. Interior snapshot nodes are never used directly (i.e. they never have subvolumes that point to them), they are only referred to by child snapshot nodes - hence, they are redundant. The existing code talks about redundant snapshot nodes as forming an equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a given equivalence class, we only ever need a single key at a given position - i.e. multiple versions with different snapshot fields are redundant. The existing snapshot cleanup code deletes these redundant keys, but not redundant nodes.
It turns out this is buggy, because we assume that after snapshot deletion finishes we should only have a single key per equivalence class, but the btree update path doesn't preserve this - overwriting keys in old snapshots doesn't check for the equivalence class being equal, and thus we can end up with duplicate keys in the same equivalence class and fsck complaining about snapshot deletion not having run correctly. The equivalence class notion has been leaking out of the core snapshots code and into too much other code, i.e. fsck, so this patch takes a different approach: snapshot deletion now moves keys to the node in an equivalence class being kept (the leafiest node) and then deletes the redundant nodes in the equivalance class. Some work has to be done to correctly delete interior snapshot nodes; snapshot node depth and skiplist fields for descendent nodes have to be fixed. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/snapshot.c | 250 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 223 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 20e96daf9ca1..f17238be494c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1150,6 +1150,7 @@ struct bch_snapshot { __le32 parent; __le32 children[2]; __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ __le32 tree; __le32 depth; __le32 skip[3]; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 56961c074674..25c888051ca4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "buckets.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -273,10 +274,9 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - if (!id != !s.v->parent || - (s.v->parent && - id <= k.k->p.offset)) { - prt_printf(err, "bad skiplist node %u)", id); + if ((id && !s.v->parent) || + (id && id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u", id); return -BCH_ERR_invalid_bkey; } } @@ -933,13 +933,20 @@ err: return ret; } -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) +{ + if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) + swap(s->children[0], s->children[1]); +} + +int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter c_iter = (struct btree_iter) { NULL }; struct btree_iter tree_iter = (struct btree_iter) { NULL }; struct bkey_s_c_snapshot s; - u32 parent_id; + u32 parent_id, child_id; unsigned i; int ret = 0; @@ -952,8 +959,10 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) if (ret) goto err; - BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + BUG_ON(s.v->children[1]); + parent_id = le32_to_cpu(s.v->parent); + child_id = le32_to_cpu(s.v->children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -962,27 +971,48 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) BTREE_ID_snapshots, POS(0, parent_id), 0, snapshot); ret = PTR_ERR_OR_ZERO(parent); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); + 
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + if (unlikely(ret)) goto err; - } + /* find entry in parent->children for node being deleted */ for (i = 0; i < 2; i++) if (le32_to_cpu(parent->v.children[i]) == id) break; - if (i == 2) - bch_err(c, "snapshot %u missing child pointer to %u", - parent_id, id); - else - parent->v.children[i] = 0; + if (bch2_fs_inconsistent_on(i == 2, c, + "snapshot %u missing child pointer to %u", + parent_id, id)) + goto err; + + parent->v.children[i] = le32_to_cpu(child_id); - if (le32_to_cpu(parent->v.children[0]) < - le32_to_cpu(parent->v.children[1])) - swap(parent->v.children[0], - parent->v.children[1]); - } else { + normalize_snapshot_child_pointers(&parent->v); + } + + if (child_id) { + struct bkey_i_snapshot *child; + + child = bch2_bkey_get_mut_typed(trans, &c_iter, + BTREE_ID_snapshots, POS(0, child_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(child); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", child_id); + if (unlikely(ret)) + goto err; + + child->v.parent = cpu_to_le32(parent_id); + + if (!child->v.parent) { + child->v.skip[0] = 0; + child->v.skip[1] = 0; + child->v.skip[2] = 0; + } + } + + if (!parent_id) { /* * We're deleting the root of a snapshot tree: update the * snapshot_tree entry to point to the new root, or delete it if @@ -1011,6 +1041,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &c_iter); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1166,6 +1197,12 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * * also: unlinked inode in internal snapshot appears to not be getting deleted * correctly if inode doesn't exist in leaf snapshots + * + * solution: + * + * for a key in an interior snapshot node that needs work to be done that + * requires it to be mutated: iterate over all descendent leaf nodes and copy + * that key to snapshot leaf nodes, where we can mutate it */ static int snapshot_delete_key(struct btree_trans *trans, @@ -1191,6 +1228,54 @@ static int snapshot_delete_key(struct btree_trans *trans, } } +static int move_key_to_correct_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + /* + * When we have a linear chain of snapshot nodes, we consider + * those to form an equivalence class: we're going to collapse + * them all down to a single node, and keep the leaf-most node - + * which has the same id as the equivalence class id. + * + * If there are multiple keys in different snapshots at the same + * position, we're only going to keep the one in the newest + * snapshot - the rest have been overwritten and are redundant, + * and for the key we're going to keep we need to move it to the + * equivalance class ID if it's not there already. 
+ */ + if (equiv != k.k->p.snapshot) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + struct btree_iter new_iter; + int ret; + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + new->k.p.snapshot = equiv; + + bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&new_iter) ?: + bch2_trans_update(trans, &new_iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &new_iter); + if (ret) + return ret; + } + + return 0; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it @@ -1224,6 +1309,77 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre return 0; } +static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + snapshot_id_list *skip) +{ + rcu_read_lock(); + while (n--) { + do { + id = __bch2_snapshot_parent(c, id); + } while (snapshot_list_has_id(skip, id)); + } + rcu_read_unlock(); + + return id; +} + +static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, + snapshot_id_list *deleted) +{ + struct bch_fs *c = trans->c; + u32 nr_deleted_ancestors = 0; + struct bkey_i_snapshot *s; + u32 *i; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + if (snapshot_list_has_id(deleted, k.k->p.offset)) + return 0; + + s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + darray_for_each(*deleted, i) + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + + if (!nr_deleted_ancestors) + return 0; + + le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); + + if (!s->v.depth) { + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + u32 depth = le32_to_cpu(s->v.depth); + u32 parent = bch2_snapshot_parent(c, s->k.p.offset); + + for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { + u32 id = le32_to_cpu(s->v.skip[j]); + + if (snapshot_list_has_id(deleted, id)) { + id = depth > 1 + ? 
bch2_snapshot_nth_parent_skip(c, + parent, + get_random_u32_below(depth - 1), + deleted) + : parent; + s->v.skip[j] = cpu_to_le32(id); + } + } + + bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); + } + + return bch2_trans_update(trans, iter, &s->k_i, 0); +} + int bch2_delete_dead_snapshots(struct bch_fs *c) { struct btree_trans trans; @@ -1231,7 +1387,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct bkey_s_c k; struct bkey_s_c_snapshot snap; snapshot_id_list deleted = { 0 }; - u32 i, id; + snapshot_id_list deleted_interior = { 0 }; + u32 *i, id; int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) { @@ -1287,6 +1444,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) for (id = 0; id < BTREE_ID_NR; id++) { struct bpos last_pos = POS_MIN; snapshot_id_list equiv_seen = { 0 }; + struct disk_reservation res = { 0 }; if (!btree_type_has_snapshots(id)) continue; @@ -1294,9 +1452,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); + &res, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BTREE_INSERT_NOFAIL, + move_key_to_correct_snapshot(&trans, &iter, k)); + bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); if (ret) { @@ -1305,19 +1469,49 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } } - for (i = 0; i < deleted.nr; i++) { - u32 node_to_delete = deleted.data[i]; + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 snapshot = k.k->p.offset; + u32 equiv = bch2_snapshot_equiv(c, snapshot); + + if (equiv != snapshot) + snapshot_list_add(c, &deleted_interior, snapshot); + } + bch2_trans_iter_exit(&trans, &iter); + + /* + * Fixing children of deleted snapshots can't be done completely + * atomically, if we crash between here and when we delete the interior + * nodes some depth fields will be off: + */ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); + if (ret) + goto err; + + darray_for_each(deleted, i) { + ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err; + } + } + darray_for_each(deleted_interior, i) { ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, node_to_delete)); + bch2_snapshot_node_delete(&trans, *i)); if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", node_to_delete); + bch_err_msg(c, ret, "deleting snapshot %u", *i); goto err; } } clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); err: + darray_exit(&deleted_interior); darray_exit(&deleted); bch2_trans_exit(&trans); if (ret) -- cgit From a111901f52140f7f6f7ff0034c5ffa15448c784b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Aug 2023 21:14:33 -0400 Subject: bcachefs: bch2_propagate_key_to_snapshot_leaves() If fsck finds a key that needs work done, the primary example being an unlinked inode that needs to be deleted, and the key is in an internal snapshot node, we have a bit of a conundrum. 
The conundrum is that internal snapshot nodes are shared, and we in general do updates in internal snapshot nodes because there may be overwrites in some snapshots and not others, and this may affect other keys referenced by this key (i.e. extents). For example, we might be seeing an unlinked inode in an internal snapshot node, but then in one child snapshot the inode might have been reattached and might not be unlinked. Deleting the inode in the internal snapshot node would be wrong, because then we'll delete all the extents that the child snapshot references. But if an unlinked inode does not have any overwrites in child snapshots, we're fine: the inode is overwritten in all child snapshots, so we can do the deletion at the point of commonality in the snapshot tree, i.e. the node where we found it. This patch adds a new helper, bch2_propagate_key_to_snapshot_leaves(), to handle the case where we need to update a key that does have overwrites in child snapshots: we copy the key to leaf snapshot nodes, and then rewind fsck and process the needed updates there. With this, fsck can now always correctly handle unlinked inodes found in internal snapshot nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 36 ++++++++++++++------ fs/bcachefs/snapshot.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/snapshot.h | 3 ++ 3 files changed, 118 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9524bd621b2c..238caeeaf06c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -853,14 +853,6 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - /* - * if snapshot id isn't a leaf node, skip it - deletion in - * particular is not atomic, so on the internal snapshot nodes - * we can see inodes marked for deletion after a clean shutdown - */ - if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) - return 0; - if (!bkey_is_inode(k.k)) return 0; @@ -882,6 +874,27 @@ static int check_inode(struct btree_trans *trans, return -EINVAL; } + if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && + bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); + if (ret) + goto err; + + u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; + + ret = __write_inode(trans, &u, iter->pos.snapshot); + if (ret) { + bch_err_msg(c, ret, "in fsck: error updating inode"); + return ret; + } + + if (!bpos_eq(new_min_pos, POS_MIN)) + bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); + return 0; + } + if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", @@ -960,9 +973,10 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); - if (ret) - bch_err(c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + if (ret) { + bch_err_msg(c, ret, "in fsck: error updating inode"); + return ret; + } } err: fsck_err: diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 25c888051ca4..07e5c1b44b06 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" @@ -1536,7 +1537,7 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) } int 
bch2_delete_dead_snapshots_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) + struct btree_trans_commit_hook *h) { struct bch_fs *c = trans->c; @@ -1583,6 +1584,94 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } +static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + + return s->children[1] ?: s->children[0]; +} + +static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) +{ + u32 child; + + while ((child = bch2_snapshot_smallest_child(c, id))) + id = child; + return id; +} + +static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c interior_k, + u32 leaf_id, struct bpos *new_min_pos) +{ + struct btree_iter iter; + struct bpos pos = interior_k.k->p; + struct bkey_s_c k; + struct bkey_i *new; + int ret; + + pos.snapshot = leaf_id; + + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + /* key already overwritten in this snapshot? */ + if (k.k->p.snapshot != interior_k.k->p.snapshot) + goto out; + + if (bpos_eq(*new_min_pos, POS_MIN)) { + *new_min_pos = k.k->p; + new_min_pos->snapshot = leaf_id; + } + + new = bch2_bkey_make_mut_noupdate(trans, interior_k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + new->k.p.snapshot = leaf_id; + ret = bch2_trans_update(trans, &iter, new, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c k, + struct bpos *new_min_pos) +{ + struct bch_fs *c = trans->c; + struct bkey_buf sk; + int ret; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + *new_min_pos = POS_MIN; + + for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); + id < k.k->p.snapshot; + id++) { + if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || + !bch2_snapshot_is_leaf(c, id)) + continue; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); + if (ret) + break; + } + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + int bch2_snapshots_read(struct bch_fs *c) { struct btree_iter iter; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 624a42d1c8b7..dabc9b9d921b 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -263,6 +263,9 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, + struct bkey_s_c, struct bpos *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); -- cgit From 4491283f8d827bdfc6baed27028ae636494bdbac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Aug 2023 18:47:16 -0400 Subject: bcachefs: Fix a double free on invalid bkey Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index e099fe570962..f498513d35f3 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -801,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un bch2_inconsistent_error(c); 
bch2_dump_trans_updates(trans); - printbuf_exit(err); return -EINVAL; } -- cgit From 71aba590297e9b5f71e760b9336ecb3b44c728ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Aug 2023 18:48:09 -0400 Subject: bcachefs: Always check alloc data type Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 101 +++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 18b97416750f..ad82fbddcbf8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -240,7 +240,6 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - int rw = flags & WRITE; if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { prt_printf(err, "bad val size (%u > %lu)", @@ -254,71 +253,55 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (rw == WRITE && - !(flags & BKEY_INVALID_JOURNAL) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { - unsigned i, bp_len = 0; - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) - bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + return -BCH_ERR_invalid_bkey; + } - if (bp_len > a.v->dirty_sectors) { - prt_printf(err, "too many backpointers"); + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + if (a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe) { + prt_printf(err, "empty data type free but have data"); return -BCH_ERR_invalid_bkey; } - } - - if (rw == WRITE) { - if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { - prt_printf(err, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + if (!a.v->dirty_sectors) { + prt_printf(err, "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_cached: + if (!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe) { + prt_printf(err, "data type inconsistency"); return -BCH_ERR_invalid_bkey; } - switch (a.v->data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - if (a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe) { - prt_printf(err, "empty data type free but have data"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - if (!a.v->dirty_sectors) { - prt_printf(err, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_cached: - if (!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe) { - prt_printf(err, "data type inconsistency"); - return -BCH_ERR_invalid_bkey; - } - - if (!a.v->io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { - prt_printf(err, "cached bucket with read_time == 0"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_stripe: - if 
(!a.v->stripe) { - prt_printf(err, "data_type %s but stripe==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; + if (!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { + prt_printf(err, "cached bucket with read_time == 0"); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_stripe: + if (!a.v->stripe) { + prt_printf(err, "data_type %s but stripe==0", + bch2_data_types[a.v->data_type]); + return -BCH_ERR_invalid_bkey; } + break; } return 0; -- cgit From cc07773f158331b5a9f5401e8d50cd7ed43c90ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Aug 2023 20:29:35 -0400 Subject: bcachefs: Put bkey invalid check in commit path in a more useful place When doing updates early in recovery, before we can go RW, we still want to check that keys are valid at commit time - this moves key invalid checking to before the "btree updates to journal" path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index f498513d35f3..eafb0388ef82 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -817,25 +817,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct btree_insert_entry *i; int ret = 0, u64s_delta = 0; -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } -#endif - trans_for_each_update(trans, i) { if (i->cached) continue; @@ -1048,6 +1029,25 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } +#endif + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; -- cgit From 9d2a7bd8b7f747c85a113a4967ffd5d65ef72dfa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 23 Aug 2023 21:20:42 -0400 Subject: bcachefs: Improve bch2_moving_ctxt_to_text() Print more information out about moving contexts - fold in the output of the redundant bch2_data_jobs_to_text(), and also include information relevant to whether move_data() should be blocked. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 44 +++++++++++++++++++------------------------- fs/bcachefs/move.h | 1 - fs/bcachefs/sysfs.c | 5 ----- 3 files changed, 19 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 05272673901d..fb76a1dac74e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1103,46 +1103,40 @@ int bch2_data_job(struct bch_fs *c, return ret; } -void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, "%s", "\n"); - } - mutex_unlock(&c->data_progress_lock); -} - -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { + struct bch_move_stats *stats = ctxt->stats; struct moving_io *io; - prt_printf(out, "%ps:", ctxt->fn); + prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); + prt_newline(out); + + prt_printf(out, " data type %s btree_id %s position: ", + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "reads: %u sectors %u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u", atomic_read(&ctxt->read_ios), - atomic_read(&ctxt->read_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->read_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); - prt_printf(out, "writes: %u sectors %u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u", atomic_read(&ctxt->write_ios), - atomic_read(&ctxt->write_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->write_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); - list_for_each_entry(io, &ctxt->ios, io_list) { + list_for_each_entry(io, &ctxt->ios, io_list) bch2_write_op_to_text(out, &io->write.op); - } mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); @@ -1154,7 +1148,7 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->moving_context_lock); list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, ctxt); + bch2_moving_ctxt_to_text(out, c, ctxt); mutex_unlock(&c->moving_context_lock); } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 547ee7b72c16..c3136abe8587 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -88,7 +88,6 @@ int bch2_data_job(struct bch_fs *, struct bch_ioctl_data); void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_move_init(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ef02e346e334..41c6900c34c1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -248,7 +248,6 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); -read_attribute(data_jobs); read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS @@ -458,9 +457,6 @@ SHOW(bch2_fs) if (attr == &sysfs_io_timers_write) 
bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - if (attr == &sysfs_data_jobs) - bch2_data_jobs_to_text(out, c); - if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); @@ -681,7 +677,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_data_jobs, &sysfs_moving_ctxts, &sysfs_internal_uuid, -- cgit From cba37d81f5c34197e815bcd60f075be232ae6783 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Aug 2023 17:07:50 -0400 Subject: bcachefs: Kill stripe check in bch2_alloc_v4_invalid() Since we set bucket data type to BCH_DATA_stripe based on the data pointer, not just the stripe pointer, it doesn't make sense to check for no stripe in the .key_invalid method - this is a situation that shouldn't happen, but our other fsck/repair code handles it. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ad82fbddcbf8..e36426b52a4a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -296,11 +296,6 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, } break; case BCH_DATA_stripe: - if (!a.v->stripe) { - prt_printf(err, "data_type %s but stripe==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } break; } -- cgit From 097d4cc8fde898334569271c9b3e24d99788ade0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Aug 2023 15:17:31 -0400 Subject: bcachefs: Fix snapshot_skiplist_good() We weren't correctly checking snapshot skiplist nodes - we were checking if they were in the same tree, not if they were an actual ancestor. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 07e5c1b44b06..9da09911466e 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -655,28 +655,18 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) return id; } -static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) { - struct bch_snapshot a; unsigned i; - int ret; - - for (i = 0; i < 3; i++) { - if (!s.parent != !s.skip[i]) - return false; - - if (!s.parent) - continue; - ret = bch2_snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); - if (bch2_err_matches(ret, ENOENT)) - return false; - if (ret) - return ret; - - if (a.tree != s.tree) - return false; - } + for (i = 0; i < 3; i++) + if (!s.parent) { + if (s.skip[i]) + return false; + } else { + if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) + return false; + } return true; } @@ -856,7 +846,7 @@ static int check_snapshot(struct btree_trans *trans, s = u->v; } - ret = snapshot_skiplist_good(trans, s); + ret = snapshot_skiplist_good(trans, k.k->p.offset, s); if (ret < 0) goto err; -- cgit From 197763a70b6a530d959659abb917166a2f193520 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 30 Aug 2023 06:45:59 -0400 Subject: bcachefs: restart journal reclaim thread on ro->rw transitions Commit c2d5ff36065a4 ("bcachefs: Start journal reclaim thread earlier") tweaked reclaim thread management to start a bit earlier in the mount sequence by moving the start call from __bch2_fs_read_write() to bch2_fs_journal_start(). 
This has the side effect of never starting the reclaim thread on a ro->rw transition, which can be observed by monitoring reclaim behavior via the journal_reclaim tracepoints. I.e. once an fs has remounted ro->rw, we only ever rely on direct reclaim from that point forward. Since bch2_journal_reclaim_start() properly handles the case where the reclaim thread has already been created, restore the start call in the read-write helper. This allows the reclaim thread to start early when appropriate and also exit/restart on remounts or freeze cycles. In the latter case it may be possible to simply allow the task to freeze rather than destroy it, but for now just fix the immediate bug. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 60424865980d..29cd71445a94 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -421,6 +421,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) -- cgit From a9a7bbab1469f0c427f90c309720c543e37ab110 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Sep 2023 18:14:08 -0400 Subject: bcachefs: bch2_acl_to_text() We can now print out acls from bch2_xattr_to_text(), when the xattr contains an acl. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 2 +- fs/bcachefs/acl.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++----- fs/bcachefs/acl.h | 6 +++-- fs/bcachefs/xattr.c | 8 +++++++ 4 files changed, 72 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b4fa88dfd484..0a4d2fed66c1 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o bcachefs-y := \ + acl.o \ alloc_background.o \ alloc_foreground.o \ backpointers.o \ @@ -81,5 +82,4 @@ bcachefs-y := \ varint.o \ xattr.o -bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index b1a488860678..ae2036b0fcc4 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -1,18 +1,71 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_POSIX_ACL #include "bcachefs.h" -#include +#include "acl.h" +#include "xattr.h" + #include + +static const char * const acl_types[] = { + [ACL_USER_OBJ] = "user_obj", + [ACL_USER] = "user", + [ACL_GROUP_OBJ] = "group_obj", + [ACL_GROUP] = "group", + [ACL_MASK] = "mask", + [ACL_OTHER] = "other", + NULL, +}; + +void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) +{ + const void *p, *end = value + size; + + if (!value || + size < sizeof(bch_acl_header) || + ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) + return; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + unsigned tag = le16_to_cpu(in->e_tag); + + prt_str(out, acl_types[tag]); + + switch (tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + prt_printf(out, " %o", 
le16_to_cpu(in->e_perm)); + + if (p != end) + prt_char(out, ' '); + } +} + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "fs.h" + +#include #include #include #include -#include "acl.h" -#include "fs.h" -#include "xattr.h" - static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) { return sizeof(bch_acl_header) + diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index bb21d8d696a2..27e7eec0f278 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -7,8 +7,6 @@ struct bch_hash_info; struct bch_inode_info; struct posix_acl; -#ifdef CONFIG_BCACHEFS_POSIX_ACL - #define BCH_ACL_VERSION 0x0001 typedef struct { @@ -26,6 +24,10 @@ typedef struct { __le32 a_version; } bch_acl_header; +void bch2_acl_to_text(struct printbuf *, const void *, size_t); + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 6f6b3caf0607..637174b249a2 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "acl.h" #include "bkey_methods.h" #include "btree_update.h" #include "extents.h" @@ -130,6 +131,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, xattr.v->x_name, le16_to_cpu(xattr.v->x_val_len), (char *) xattr_val(xattr.v)); + + if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || + xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { + prt_char(out, ' '); + bch2_acl_to_text(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + } } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, -- cgit From 5cfd69775eb5460ef78bb5034a37eb0dc52ab65d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Sep 2023 20:10:11 -0400 Subject: bcachefs: Array bounds fixes It's no longer legal to use a zero size array as a flexible array member - this causes UBSAN to complain. This patch switches our zero size arrays to normal flexible array members when possible, and inserts casts in other places (e.g. where we use the zero size array as a marker partway through an array). 
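As a general illustration of the declaration change being made here (plain userspace C, not bcachefs code): a trailing member declared as fields[0] is a GNU zero-length array with a declared bound of 0, which newer compilers and UBSAN's bounds checking can flag when it is indexed, while the C99 flexible array member form fields[] declares no bound and is the intended way to express a variable-length trailer. A minimal sketch, assuming the struct is allocated with its trailer in one block:

/* Illustrative only; the struct and field names are made up. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct record_old {
	unsigned long	hash_seed;
	unsigned short	mode;
	unsigned char	fields[0];	/* GNU zero-length array: declared bound is 0 */
};

struct record_new {
	unsigned long	hash_seed;
	unsigned short	mode;
	unsigned char	fields[];	/* C99 flexible array member: no declared bound */
};

int main(void)
{
	size_t nr = 16;
	/* allocate the header plus nr bytes of trailer in one block */
	struct record_new *r = malloc(sizeof(*r) + nr);

	if (!r)
		return 1;

	memset(r->fields, 0, nr);	/* the trailer is real storage we own */
	r->fields[3] = 42;
	printf("%d\n", r->fields[3]);

	free(r);
	return 0;
}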
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 66 ++++++++++++++++++------------------------- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey.h | 6 ++-- fs/bcachefs/bkey_sort.h | 16 +++++++++-- fs/bcachefs/bset.c | 13 +++++---- fs/bcachefs/btree_io.c | 21 +++++++------- fs/bcachefs/vstructs.h | 6 ++-- 7 files changed, 64 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f17238be494c..1cce2504bca6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -723,7 +723,7 @@ struct bch_inode { __le64 bi_hash_seed; __le32 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v2 { @@ -733,7 +733,7 @@ struct bch_inode_v2 { __le64 bi_hash_seed; __le64 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v3 { @@ -745,7 +745,7 @@ struct bch_inode_v3 { __le64 bi_sectors; __le64 bi_size; __le64 bi_version; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); #define INODEv3_FIELDS_START_INITIAL 6 @@ -1097,20 +1097,20 @@ struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; - __u64 _data[0]; + __u64 _data[]; } __packed __aligned(8); struct bch_indirect_inline_data { struct bch_val v; __le64 refcount; - u8 data[0]; + u8 data[]; }; /* Inline data */ struct bch_inline_data { struct bch_val v; - u8 data[0]; + u8 data[]; }; /* Subvolumes: */ @@ -1223,7 +1223,7 @@ enum bch_sb_field_type { struct bch_sb_field_journal { struct bch_sb_field field; - __le64 buckets[0]; + __le64 buckets[]; }; struct bch_sb_field_journal_v2 { @@ -1232,7 +1232,7 @@ struct bch_sb_field_journal_v2 { struct bch_sb_field_journal_v2_entry { __le64 start; __le64 nr; - } d[0]; + } d[]; }; /* BCH_SB_FIELD_members: */ @@ -1279,7 +1279,7 @@ enum bch_member_state { struct bch_sb_field_members { struct bch_sb_field field; - struct bch_member members[0]; + struct bch_member members[]; }; /* BCH_SB_FIELD_crypt: */ @@ -1377,19 +1377,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type) struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[0]; + __u8 devs[]; } __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[0]; + struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[0]; + __u8 devs[]; } __packed; #define replicas_entry_bytes(_i) \ @@ -1397,7 +1397,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[0]; + struct bch_replicas_entry entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1432,7 +1432,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; - struct bch_disk_group entries[0]; + struct bch_disk_group entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_counters */ @@ -1525,7 +1525,7 @@ enum bch_persistent_counters { struct bch_sb_field_counters { struct bch_sb_field field; - __le64 d[0]; + __le64 d[]; }; /* @@ -1539,10 +1539,8 @@ struct jset_entry { __u8 type; /* designates what this jset holds */ __u8 pad[3]; - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; + struct bkey_i start[0]; + __u64 _data[]; }; struct bch_sb_field_clean { @@ -1553,10 +1551,8 @@ struct bch_sb_field_clean { __le16 _write_clock; __le64 
journal_seq; - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; }; struct journal_seq_blacklist_entry { @@ -1567,10 +1563,8 @@ struct journal_seq_blacklist_entry { struct bch_sb_field_journal_seq_blacklist { struct bch_sb_field field; - union { - struct journal_seq_blacklist_entry start[0]; - __u64 _data[0]; - }; + struct journal_seq_blacklist_entry start[0]; + __u64 _data[]; }; /* Superblock: */ @@ -1706,10 +1700,8 @@ struct bch_sb { struct bch_sb_layout layout; - union { - struct bch_sb_field start[0]; - __le64 _data[0]; - }; + struct bch_sb_field start[0]; + __le64 _data[]; } __packed __aligned(8); /* @@ -2186,10 +2178,8 @@ struct jset { __le64 last_seq; - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); @@ -2294,10 +2284,8 @@ struct bset { __le16 version; __le16 u64s; /* count of d[] in u64s */ - union { - struct bkey_packed start[0]; - __u64 _data[0]; - }; + struct bkey_packed start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 0a5bfe6e9a2d..a3abd9d2d176 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -127,7 +127,7 @@ static void pack_state_finish(struct pack_state *state, struct bkey_packed *k) { EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= k->_data + state->format->key_u64s); + EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); *state->p = state->w; } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 51969a46265e..518450209236 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -52,7 +52,7 @@ struct bkey_s { static inline struct bkey_i *bkey_next(struct bkey_i *k) { - return (struct bkey_i *) (k->_data + k->k.u64s); + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); } #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) @@ -397,7 +397,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format, } #define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) extern const struct bkey_format bch2_bkey_format_current; @@ -732,7 +732,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f) #error edit for your odd byteorder. 
#endif -#define high_word(f, k) ((k)->_data + high_word_offset(f)) +#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) #define next_word(p) nth_word(p, 1) #define prev_word(p) nth_word(p, -1) diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 79cf11d1b4e7..7c0f0b160f18 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -9,14 +9,24 @@ struct sort_iter { struct sort_iter_set { struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; + } data[]; }; -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) { iter->b = b; iter->used = 0; - iter->size = ARRAY_SIZE(iter->data); + iter->size = size; +} + +struct sort_iter_stack { + struct sort_iter iter; + struct sort_iter_set sets[MAX_BSETS + 1]; +}; + +static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) +{ + sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); } static inline void sort_iter_add(struct sort_iter *iter, diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index bcdf28f39b9c..685792137d2a 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; #if 0 @@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx) } struct ro_aux_tree { - struct bkey_float f[0]; + u8 nothing[0]; + struct bkey_float f[]; }; struct rw_aux_tree { @@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, { unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); + return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); } static struct rw_aux_tree *rw_aux_tree(const struct btree *b, @@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b, btree_keys_account_key_add(&b->nr, t - b->set, src); if (src->u64s != clobber_u64s) { - u64 *src_p = where->_data + clobber_u64s; - u64 *dst_p = where->_data + src->u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; + u64 *dst_p = (u64 *) where->_data + src->u64s; EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < (int) clobber_u64s - src->u64s); @@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b, unsigned clobber_u64s) { struct bset_tree *t = bset_tree_last(b); - u64 *src_p = where->_data + clobber_u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; u64 *dst_p = where->_data; bch2_bset_verify_rw_aux_tree(b, t); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cba3c081b1d0..0edbb73a5ec8 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -292,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool filter_whiteouts) { struct btree_node *out; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct bset_tree *t; struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; @@ -301,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); for (t = 
b->set + start_idx; t < b->set + end_idx; t++) { u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); } @@ -320,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -918,8 +918,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->written = 0; iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b); - iter->size = (btree_blocks(c) + 1) * 2; + sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, @@ -1852,7 +1851,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct nonce nonce; unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; @@ -1925,7 +1924,7 @@ do_write: bch2_sort_whiteouts(c, b); - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); bytes = !b->written ? sizeof(struct btree_node) @@ -1940,7 +1939,7 @@ do_write: continue; bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); seq = max(seq, le64_to_cpu(i->journal_seq)); @@ -1969,14 +1968,14 @@ do_write: i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, unwritten_whiteouts_start(c, b), unwritten_whiteouts_end(c, b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = bch2_sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); le16_add_cpu(&i->u64s, u64s); BUG_ON(!b->written && i->u64s != b->data->keys.u64s); diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index 53a694d71967..a6561b4b36a6 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -41,11 +41,11 @@ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) #define vstruct_next(_s) \ - ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_end(_s) \ - ((void *) ((_s)->_data + __vstruct_u64s(_s))) + ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ for (_i = (_s)->start; \ -- cgit From 5b7fbdcd5b04b618178d6339e36435997ef6b086 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Sep 2023 21:14:54 -0400 Subject: bcachefs: Fix silent enum conversion error This changes mark_btree_node_locked() to take an enum btree_node_locked_type, not a six_lock_type, since BTREE_NODE_UNLOCKED is -1 which may cause problems converting back and forth to six_lock_type if short enums are in use. With this change, we never store BTREE_NODE_UNLOCKED in a six_lock_type enum. 
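
[Illustrative note, not part of the patch: the failure mode being guarded against is easiest to see in isolation. If an enum's enumerators are all small and non-negative, an ABI or -fshort-enums build may give it a narrow unsigned underlying type, so storing a -1 sentinel through that type silently becomes 255 and later comparisons against -1 never match. The standalone sketch below uses hypothetical enum names, not the bcachefs ones; on common ABIs plain enums are int-sized and nothing visibly goes wrong, which is what makes the bug silent - build with -fshort-enums to see the mismatch.]

    #include <stdio.h>

    /* analogous to six_lock_type: only small, non-negative values */
    enum lock_type { LOCK_READ, LOCK_INTENT, LOCK_WRITE };

    /* analogous to btree_node_locked_type: includes a -1 sentinel */
    enum locked_type { NODE_UNLOCKED = -1, NODE_READ, NODE_INTENT, NODE_WRITE };

    struct slot {
            /* with -fshort-enums this field may be an unsigned char */
            enum lock_type  t;
    };

    int main(void)
    {
            struct slot s;

            /* storing the sentinel through the wrong enum type ... */
            s.t = (enum lock_type) NODE_UNLOCKED;

            /* ... may no longer compare equal to -1 if the enum is unsigned */
            if ((int) s.t != NODE_UNLOCKED)
                    printf("sentinel lost: stored %d\n", (int) s.t);

            return 0;
    }
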
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 +++++--- fs/bcachefs/btree_key_cache.c | 12 +++++++----- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_update_interior.c | 14 +++++++------- 4 files changed, 20 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 98cf52c5e132..1dbb4d7dfb45 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -689,7 +689,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, t); } bch2_btree_path_level_init(trans, path, b); @@ -764,7 +764,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(trans, path, path->level, lock_type); + mark_btree_node_locked(trans, path, path->level, + (enum btree_node_locked_type) lock_type); bch2_btree_path_level_init(trans, path, b); return 0; } @@ -936,7 +937,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - mark_btree_node_locked(trans, path, level, lock_type); + mark_btree_node_locked(trans, path, level, + (enum btree_node_locked_type) lock_type); path->level = level; bch2_btree_path_level_init(trans, path, b); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index badb541f493f..505e7c365ab7 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -252,7 +252,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, path->l[0].b = (void *) ck; path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -330,7 +330,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; @@ -478,7 +478,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -496,7 +496,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); @@ -578,7 +579,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 22e2cd3914a5..6231e9ffc5d7 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -91,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, 
unsigned level, - enum six_lock_type type) + enum btree_node_locked_type type) { mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c741150e68af..73c950d2788e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -680,7 +680,7 @@ err: */ bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + mark_btree_node_locked(&trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; @@ -1513,12 +1513,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n2); /* @@ -1539,7 +1539,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path2->locks_want++; BUG_ON(btree_node_locked(path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n3); n3->sib_u64s[0] = U16_MAX; @@ -1563,7 +1563,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); if (parent) @@ -1890,7 +1890,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); bkey_init(&delete.k); @@ -1967,7 +1967,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); trace_and_count(c, btree_node_rewrite, c, b); -- cgit From c872afa22420cbbeb8c78656926928b9e2abae18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 16:24:02 -0400 Subject: bcachefs: Fix bch2_propagate_key_to_snapshot_leaves() When we handle a transaction restart in a nested context, we need to return 
-BCH_ERR_transaction_restart_nested because we invalidated the outer context's iterators and locks. bch2_propagate_key_to_snapshot_leaves() wasn't doing this, this patch fixes it to use trans_was_restarted(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 11 +++++------ fs/bcachefs/btree_update.c | 4 +--- fs/bcachefs/fsck.c | 13 +++---------- fs/bcachefs/snapshot.c | 4 +++- 4 files changed, 12 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4469b2e166eb..b885e4e210d4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -276,9 +276,11 @@ int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); -static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { - return restart_count != trans->restart_count; + return restart_count != trans->restart_count + ? -BCH_ERR_transaction_restart_nested + : 0; } void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); @@ -707,10 +709,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, if (!_ret) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ - _ret = -BCH_ERR_transaction_restart_nested; \ - \ - _ret; \ + _ret ?: trans_was_restarted(_trans, _restart_count); \ }) #define for_each_btree_key2(_trans, _iter, _btree_id, \ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 880ce7431894..7368e1e00f53 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -777,9 +777,7 @@ err: } bch2_trans_iter_exit(trans, &iter); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } /* diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 238caeeaf06c..ded9711e44dd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -618,10 +618,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->first_this_inode = true; - if (trans_was_restarted(trans, restart_count)) - return -BCH_ERR_transaction_restart_nested; - - return 0; + return trans_was_restarted(trans, restart_count); } static struct inode_walker_entry * @@ -1089,9 +1086,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) fsck_err: if (ret) bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } struct extent_end { @@ -1509,9 +1504,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) fsck_err: if (ret) bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } static int check_dirent_target(struct btree_trans *trans, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9da09911466e..3ca61ede28d5 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1637,6 +1637,7 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_buf sk; + u32 restart_count = trans->restart_count; int ret; bch2_bkey_buf_init(&sk); @@ 
-1659,7 +1660,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, } bch2_bkey_buf_exit(&sk, c); - return ret; + + return ret ?: trans_was_restarted(trans, restart_count); } int bch2_snapshots_read(struct bch_fs *c) -- cgit From c7afec9bd63dc00047c35f9b747aa2be505533e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 02:13:33 -0400 Subject: bcachefs: Fix bch_sb_handle type blk_mode_t was recently introduced; we should be using it now, instead of fmode_t. Signed-off-by: Kent Overstreet --- fs/bcachefs/super_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 08faeedba326..597a8db73585 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -8,7 +8,7 @@ struct bch_sb_handle { struct bio *bio; void *holder; size_t buffer_size; - fmode_t mode; + blk_mode_t mode; unsigned have_layout:1; unsigned have_bio:1; unsigned fs_sb:1; -- cgit From da187cacb8a59e668ce716214865612ae3921e91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 22:05:50 -0400 Subject: bcachefs: Kill missing inode warnings in bch2_quota_read() bch2_quota_read(), when scanning for inodes, may attempt to look up inodes that have been deleted in the main subvolume - this is not an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 27 +++++++++++++++++++++++++-- fs/bcachefs/inode.h | 3 +++ fs/bcachefs/quota.c | 2 +- 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8114b6e4f202..4548de6e97b2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -318,7 +318,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -int bch2_inode_peek(struct btree_trans *trans, +static int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -349,7 +349,17 @@ int bch2_inode_peek(struct btree_trans *trans, return 0; err: bch2_trans_iter_exit(trans, iter); - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; +} + +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); return ret; } @@ -880,6 +890,19 @@ err: return ret; } +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_find_by_inum_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 22b24405649f..a7464e1b6960 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -118,6 +118,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); int 
bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index ca99772aedc6..60d27f726519 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -572,7 +572,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, if (!s_t.master_subvol) goto advance; - ret = bch2_inode_find_by_inum_trans(trans, + ret = bch2_inode_find_by_inum_nowarn_trans(trans, (subvol_inum) { le32_to_cpu(s_t.master_subvol), k.k->p.offset, -- cgit From e46c181af9e230c4c5dbc701fdadc295d6191eec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Sep 2023 01:37:34 -0400 Subject: bcachefs: Convert more code to bch_err_msg() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_gc.c | 12 ++++----- fs/bcachefs/btree_update_interior.c | 5 ++-- fs/bcachefs/ec.c | 3 +-- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 50 ++++++++++++++++--------------------- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/migrate.c | 3 +-- fs/bcachefs/movinggc.c | 7 +++--- fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/replicas.c | 2 +- fs/bcachefs/snapshot.c | 6 ++--- fs/bcachefs/subvolume.c | 5 ++-- fs/bcachefs/super.c | 43 ++++++++++++++++--------------- 15 files changed, 67 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e36426b52a4a..fcb3d53bb6f3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1931,7 +1931,7 @@ bkey_err: bch2_trans_exit(&trans); if (ret < 0) { - bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "initializing free space"); return ret; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dac2eb76c985..844ac0024683 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1483,7 +1483,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) bch2_alloc_write_key(&trans, &iter, k, metadata_only)); if (ret < 0) { - bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); percpu_ref_put(&ca->ref); break; } @@ -1548,7 +1548,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1998,7 +1998,7 @@ int bch2_gc_gens(struct bch_fs *c) BTREE_INSERT_NOFAIL, gc_btree_gens_key(&trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; } @@ -2011,7 +2011,7 @@ int bch2_gc_gens(struct bch_fs *c) BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; @@ -2083,7 +2083,7 @@ static int bch2_gc_thread(void *arg) ret = bch2_gc_gens(c); #endif if (ret < 0) - bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); debug_check_no_locks_held(); } @@ -2113,7 +2113,7 @@ int bch2_gc_thread_start(struct bch_fs *c) p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { - bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + bch_err_fn(c, PTR_ERR(p)); return PTR_ERR(p); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 73c950d2788e..c5b571f8333c 100644 
--- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2057,7 +2057,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(&trans, a)); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2096,8 +2096,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "%s: error going read-write: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "going read-write"); kfree(a); return; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f58e84a2bf88..67a5453a36d9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1133,8 +1133,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = ec_stripe_update_extents(c, &s->new_stripe); if (ret) { - bch_err(c, "error creating stripe: error updating pointers: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "creating stripe: error updating pointers"); goto err; } err: diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 48431700b83e..08f810992a1b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1890,7 +1890,7 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); if (ret) { - bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "mounting: error getting root inode"); goto err_put_super; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ded9711e44dd..26e0a1ced68a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -80,7 +80,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, if (!ret) *subvol = le32_to_cpu(s.subvol); else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not fonud", snapshot); + bch_err(trans->c, "snapshot %u not found", snapshot); return ret; } @@ -127,8 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu: %s", - inode_nr, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -154,8 +153,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, *snapshot = iter.pos.snapshot; err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu:%u: %s", - inode_nr, *snapshot, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -206,17 +204,16 @@ static int __write_inode(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); } -static int write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) { int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __write_inode(trans, inode, snapshot)); if (ret) - bch_err(trans->c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + bch_err_fn(trans->c, ret); return ret; } @@ -278,7 +275,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, } if (ret && !bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) - bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) return ret; @@ -301,7 +298,7 @@ create_lostfound: 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating lost+found"); return ret; } @@ -365,8 +362,7 @@ static int reattach_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL, __reattach_inode(trans, inode, inode_snapshot)); if (ret) { - bch_err(trans->c, "error reattaching inode %llu: %s", - inode->bi_inum, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); return ret; } @@ -819,7 +815,7 @@ bad_hash: bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) return ret; ret = -BCH_ERR_transaction_restart_nested; @@ -883,7 +879,8 @@ static int check_inode(struct btree_trans *trans, ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) { - bch_err_msg(c, ret, "in fsck: error updating inode"); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "in fsck updating inode"); return ret; } @@ -901,8 +898,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error in fsck: error while deleting inode: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "in fsck deleting inode"); return ret; } @@ -925,8 +921,7 @@ static int check_inode(struct btree_trans *trans, POS(u.bi_inum, U64_MAX), 0, NULL); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error in fsck: error truncating inode: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "in fsck truncating inode"); if (ret) return ret; @@ -951,8 +946,7 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err(c, "error in fsck: error recounting inode sectors: %s", - bch2_err_str(sectors)); + bch_err_msg(c, sectors, "fsck recounting inode sectors"); return sectors; } @@ -971,13 +965,13 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) { - bch_err_msg(c, ret, "in fsck: error updating inode"); + bch_err_msg(c, ret, "in fsck updating inode"); return ret; } } err: fsck_err: - if (ret) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_fn(c, ret); return ret; } @@ -1078,7 +1072,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1496,7 +1490,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1923,7 
+1917,7 @@ static int check_root_trans(struct btree_trans *trans) __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0)); if (ret) { - bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root subvol"); goto err; } @@ -1942,7 +1936,7 @@ static int check_root_trans(struct btree_trans *trans) ret = __write_inode(trans, &root_inode, snapshot); if (ret) - bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root inode"); } err: fsck_err: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 055920c26da6..4b9295a15837 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -834,7 +834,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ca->mi.bucket_size)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); - bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "marking new journal buckets"); break; } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 10e1860dad79..73d135a8f37a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -758,7 +758,7 @@ int bch2_journal_reclaim_start(struct journal *j) "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating journal reclaim thread"); return ret; } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 81c8cdbac285..3d7c5b919421 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -148,8 +148,7 @@ retry: } if (ret) { - bch_err(c, "Error updating btree node key: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "updating btree node key"); break; } next: diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 256431a6dc0c..ac658e99bf57 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -242,7 +242,7 @@ err: ret = 0; if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "from bch2_move_data()"); moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; trace_and_count(c, copygc, c, moved, 0, 0, 0); @@ -320,8 +320,7 @@ static int bch2_copygc_thread(void *arg) ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); if (ret) { - bch_err(c, "error allocating copygc buckets in flight: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } @@ -404,7 +403,7 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating copygc thread"); return ret; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 15ce3ecba0ba..016cf0834b3d 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -350,7 +350,7 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating rebalance thread"); return ret; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 5b591c59bc3e..dbef41cd8593 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -429,7 +429,7 @@ out: return ret; err: - 
bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "adding replicas entry"); goto out; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 3ca61ede28d5..9bab9860b20b 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1385,7 +1385,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "error deleleting dead snapshots: error going rw"); return ret; } } @@ -1401,7 +1401,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) NULL, NULL, 0, bch2_delete_redundant_snapshot(&trans, &iter, k)); if (ret) { - bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting redundant snapshots"); goto err; } @@ -1409,7 +1409,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) POS_MIN, 0, k, bch2_snapshot_set_equiv(&trans, k)); if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); goto err; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0214a98deb4f..b6015a8060ec 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -41,8 +41,7 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_subvolume_delete(trans, iter->pos.offset); if (ret) - bch_err(c, "error deleting subvolume %llu: %s", - iter->pos.offset, bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -296,7 +295,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); if (ret) { - bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting subvolume %u", *id); break; } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 29cd71445a94..1b1a9e539f65 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -994,7 +994,7 @@ out: up_write(&c->state_lock); return ret; err: - bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "starting filesystem"); goto out; } @@ -1459,7 +1459,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) - bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1488,31 +1488,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "dropping data"); goto err; } ret = bch2_dev_remove_alloc(c, ca); if (ret) { - bch_err(ca, "Remove failed, error deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); goto err; } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "flushing journal"); goto err; } ret = bch2_journal_flush(&c->journal); if (ret) { - bch_err(ca, "Remove failed, journal error"); + bch_err(ca, "journal error"); goto err; } ret = bch2_replicas_gc2(c); if (ret) { - 
bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "in replicas_gc2()"); goto err; } @@ -1587,7 +1587,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_read_super(path, &opts, &sb); if (ret) { - bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "reading super"); goto err; } @@ -1603,7 +1603,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_dev_may_add(sb.sb, c); if (ret) { - bch_err(c, "device add error: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); goto err; } @@ -1624,7 +1624,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_dev_journal_alloc(ca); if (ret) { - bch_err(c, "device add error: journal alloc failed"); + bch_err_msg(c, ret, "allocating journal"); goto err; } @@ -1633,7 +1633,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_sb_from_fs(c, ca); if (ret) { - bch_err(c, "device add error: new device superblock too small"); + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1642,8 +1642,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { - bch_err(c, "device add error: new device superblock too small"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1655,8 +1655,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - bch_err(c, "device add error: already have maximum number of devices"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; have_slot: @@ -1666,8 +1666,8 @@ have_slot: mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) { - bch_err(c, "device add error: no room in superblock for member info"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1683,7 +1683,7 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); if (ret) { - bch_err(c, "device add error: error setting label"); + bch_err_msg(c, ret, "creating new label"); goto err_unlock; } } @@ -1695,13 +1695,13 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "marking new superblock"); goto err_late; } ret = bch2_fs_freespace_init(c); if (ret) { - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "initializing free space"); goto err_late; } @@ -1751,7 +1751,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); if (ret) { - bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online", path); goto err; } @@ -1763,8 +1763,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", - path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); goto err; } @@ -1782,7 +1781,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_fs_freespace_init(c); if (ret) - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(c, 
ret, "initializing free space"); up_write(&c->state_lock); return 0; @@ -1837,7 +1836,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = bch2_dev_buckets_resize(c, ca, nbuckets); if (ret) { - bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "resizing buckets"); goto err; } -- cgit From 39791d7de2833ca4dae0061017621ca562748306 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Sep 2023 14:34:56 -0400 Subject: bcachefs: Kill incorrect assertion In the bch2_fs_alloc() error path we call bch2_fs_free() without setting BCH_FS_STOPPING - this is fine. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1b1a9e539f65..7cfc04947717 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -585,8 +585,6 @@ void bch2_fs_free(struct bch_fs *c) { unsigned i; - BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); - mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); -- cgit From aef32bf7cc040fc770199f0c1a0a2cd26f164f45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Sep 2023 19:48:07 -0400 Subject: bcachefs: __bch2_btree_insert() -> bch2_btree_insert_trans() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +++--- fs/bcachefs/btree_update.c | 6 +++--- fs/bcachefs/btree_update.h | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/tests.c | 18 +++++++++--------- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fcb3d53bb6f3..5c60f956b598 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -577,7 +577,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -597,7 +597,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_exit(&trans); @@ -1911,7 +1911,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: + ret = bch2_btree_insert_trans(&trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 7368e1e00f53..3debb29a27f1 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -648,8 +648,8 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, return ret; } -int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) +int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_iter iter; int ret; @@ -676,7 +676,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, u64 *journal_seq, int flags) { return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, 
k, 0)); + bch2_btree_insert_trans(&trans, id, k, 0)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 901c42b57c35..78a92a1cfb47 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -62,7 +62,7 @@ int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bp int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); -int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, +int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c02c8c917a29..77ded7c5a22c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1201,7 +1201,7 @@ not_found: new->k.p = bkey_start_pos(p.k); new->k.p.offset += *idx - start; bch2_key_resize(&new->k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_NORUN); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 26e0a1ced68a..e8cb4448bf2d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1914,7 +1914,7 @@ static int check_root_trans(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(trans, BTREE_ID_subvolumes, + bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0)); if (ret) { bch_err_msg(c, ret, "writing root subvol"); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 72389c7376d6..0187c81e32ad 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -622,7 +622,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) break; } @@ -649,14 +649,14 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) } ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) break; } -- cgit From cbf57db53f311b09de2c17b514e104d421d72871 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Sep 2023 19:50:42 -0400 Subject: bcachefs: 
bch2_trans_update_get_key_cache() Factor out a slowpath into a separate function. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 71 +++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 3debb29a27f1..606e7050a84a 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -466,11 +466,49 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, return 0; } +static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_path *path) +{ + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct bkey_cached *ck; + int ret; + + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + return 0; +} + int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { struct btree_path *path = iter->update_path ?: iter->path; - struct bkey_cached *ck; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -494,34 +532,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, k->k.p)) { - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); - if (unlikely(ret)) - return ret; - - ck = (void *) iter->key_cache_path->l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(iter->key_cache_path); - } + ret = bch2_trans_update_get_key_cache(trans, iter, path); + if (ret) + return ret; path = iter->key_cache_path; } -- cgit From 1809b8cba756d32bd6e976ed4ee64efdf66c6d94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 18:05:17 -0400 Subject: bcachefs: Break up io.c More reorganization, this splits up io.c into - io_read.c - io_misc.c - fallocate, fpunch, truncate - io_write.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 4 +- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_io.h | 2 +- fs/bcachefs/compress.c | 1 - 
fs/bcachefs/data_update.c | 2 +- fs/bcachefs/data_update.h | 2 +- fs/bcachefs/debug.c | 1 - fs/bcachefs/ec.c | 3 +- fs/bcachefs/errcode.c | 7 + fs/bcachefs/errcode.h | 4 + fs/bcachefs/error.c | 1 - fs/bcachefs/fs-io-buffered.c | 3 +- fs/bcachefs/fs-io-direct.c | 3 +- fs/bcachefs/fs-io.c | 3 +- fs/bcachefs/fs-io.h | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/io.c | 3051 ---------------------------------------- fs/bcachefs/io.h | 202 --- fs/bcachefs/io_misc.c | 215 +++ fs/bcachefs/io_misc.h | 12 + fs/bcachefs/io_read.c | 1207 ++++++++++++++++ fs/bcachefs/io_read.h | 158 +++ fs/bcachefs/io_types.h | 165 --- fs/bcachefs/io_write.c | 1670 ++++++++++++++++++++++ fs/bcachefs/io_write.h | 110 ++ fs/bcachefs/io_write_types.h | 96 ++ fs/bcachefs/journal_io.c | 1 - fs/bcachefs/migrate.c | 2 +- fs/bcachefs/move.c | 3 +- fs/bcachefs/move.h | 1 + fs/bcachefs/movinggc.c | 8 - fs/bcachefs/rebalance.c | 2 - fs/bcachefs/reflink.c | 4 +- fs/bcachefs/super-io.c | 1 - fs/bcachefs/super.c | 9 +- 36 files changed, 3510 insertions(+), 3451 deletions(-) delete mode 100644 fs/bcachefs/io.c delete mode 100644 fs/bcachefs/io.h create mode 100644 fs/bcachefs/io_misc.c create mode 100644 fs/bcachefs/io_misc.h create mode 100644 fs/bcachefs/io_read.c create mode 100644 fs/bcachefs/io_read.h delete mode 100644 fs/bcachefs/io_types.h create mode 100644 fs/bcachefs/io_write.c create mode 100644 fs/bcachefs/io_write.h create mode 100644 fs/bcachefs/io_write_types.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0a4d2fed66c1..9c00dabb26ac 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -46,7 +46,9 @@ bcachefs-y := \ fs-io-pagecache.o \ fsck.o \ inode.o \ - io.o \ + io_read.o \ + io_misc.o \ + io_write.o \ journal.o \ journal_io.o \ journal_reclaim.o \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e02749ddc362..8e1888a89011 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -25,7 +25,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "movinggc.h" #include "nocow_locking.h" diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0edbb73a5ec8..00f53cb5d44b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -14,7 +14,7 @@ #include "debug.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "recovery.h" diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index cd99bbb00a5a..7e03dd76fb38 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -7,7 +7,7 @@ #include "btree_locking.h" #include "checksum.h" #include "extents.h" -#include "io_types.h" +#include "io_write_types.h" struct bch_fs; struct btree_write; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6b17f7cc5860..f1651807c2b7 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -3,7 +3,6 @@ #include "checksum.h" #include "compress.h" #include "extents.h" -#include "io.h" #include "super-io.h" #include diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 81518f20d37d..29576c4c109d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -9,7 +9,7 @@ #include "ec.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "keylist.h" #include "move.h" #include "nocow_locking.h" diff --git a/fs/bcachefs/data_update.h 
b/fs/bcachefs/data_update.h index 49e9055cbb52..7ca1f98d7e94 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,7 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" -#include "io_types.h" +#include "io_write_types.h" struct moving_context; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ae47e1854b80..5f3e65f9069e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -19,7 +19,6 @@ #include "extents.h" #include "fsck.h" #include "inode.h" -#include "io.h" #include "super.h" #include diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 67a5453a36d9..40e72b96745a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -11,10 +11,11 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "checksum.h" #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_read.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index dc906fc9176f..8d58f2cca260 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -61,3 +61,10 @@ int __bch2_err_class(int err) return -err; } + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index f7fa87442e98..379d9d7ed333 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -249,4 +249,8 @@ static inline long bch2_err_class(long err) return err < 0 ? __bch2_err_class(err) : err; } +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 39009cf0c448..2a5af8872613 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "io.h" #include "super.h" #define FSCK_ERR_RATELIMIT_NR 10 diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index dc22182d532f..2034d635c718 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -8,7 +8,8 @@ #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include #include diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2b29abd24d56..219bc1124477 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -7,7 +7,8 @@ #include "fs-io.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include #include diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ceab12fb8a8f..0b0b3b0d6c7d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -16,7 +17,7 @@ #include "fsck.h" #include "inode.h" #include "journal.h" -#include "io.h" +#include "io_misc.h" #include "keylist.h" #include "quota.h" #include "reflink.h" diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index bb5b709fa8cf..bc6e8439d40b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -6,7 +6,7 @@ #include "buckets.h" #include "fs.h" -#include "io_types.h" +#include "io_write_types.h" #include "quota.h" #include diff --git 
a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 08f810992a1b..0648874d54f3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -19,7 +19,7 @@ #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" #include "journal.h" #include "keylist.h" #include "quota.h" diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c deleted file mode 100644 index 3c614c864b6e..000000000000 --- a/fs/bcachefs/io.c +++ /dev/null @@ -1,3051 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "clock.h" -#include "data_update.h" -#include "debug.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "extent_update.h" -#include "inode.h" -#include "io.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include - -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - rcu_read_lock(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - rcu_read_unlock(); - - return bch2_rand_range(nr * CONGESTED_MAX) < total; -} - -static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, - u64 now, int rw) -{ - u64 latency_capable = - ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; - /* ideally we'd be taking into account the device's variance here: */ - u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); - s64 latency_over = io_latency - latency_threshold; - - if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ - if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), - &ca->congested); - - ca->congested_last = now; - } else if (atomic_read(&ca->congested) > 0) { - atomic_dec(&ca->congested); - } -} - -void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -{ - atomic64_t *latency = &ca->cur_latency[rw]; - u64 now = local_clock(); - u64 io_latency = time_after64(now, submit_time) - ? 
now - submit_time - : 0; - u64 old, new, v = atomic64_read(latency); - - do { - old = v; - - /* - * If the io latency was reasonably close to the current - * latency, skip doing the update and atomic operation - most of - * the time: - */ - if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0U << 5)) - break; - - new = ewma_add(old, io_latency, 5); - } while ((v = atomic64_cmpxchg(latency, old, new)) != old); - - bch2_congested_acct(ca, io_latency, now, rw); - - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -} - -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - -#endif - -/* Allocate, free from mempool: */ - -void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -{ - struct page *page; - - if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); - } - - return page; -} - -void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) -{ - bool using_mempool = false; - - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Extent update path: */ - -int bch2_sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *usage_increasing, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); - bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); - int ret = 0; - - *usage_increasing = false; - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - - bch2_trans_copy_iter(&iter, extent_iter); - - for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k)); - - *i_sectors_delta += sectors * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); - *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot - ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) - : 0; - - if (!*usage_increasing && - (new->k.p.snapshot != old.k->p.snapshot || - new_replicas > bch2_bkey_replicas(c, old) || - (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *usage_increasing = true; - - if (bkey_ge(old.k->p, new->k.p)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) -{ - struct btree_iter iter; - struct bkey_i *k; - struct bkey_i_inode_v3 *inode; - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; - int ret; - - k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_CACHED); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { - k = bch2_inode_to_v3(trans, k); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - goto err; - } - - inode = bkey_i_to_inode_v3(k); - - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode->v.bi_size)) { - inode->v.bi_size = cpu_to_le64(new_i_size); - inode_update_flags = 0; - } - - if (i_sectors_delta) { - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - - if (inode->k.p.snapshot != iter.snapshot) { - inode->k.p.snapshot = iter.snapshot; - inode_update_flags = 0; - } - - ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - inode_update_flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 new_i_size, - s64 *i_sectors_delta_total, - bool check_enospc) -{ - struct bpos next_pos; - bool usage_increasing; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - int ret; - - /* - * This traverses us the iterator without changing iter->path->pos to - * search_key() (which is pos + 1 for extents): we want there to be a - * path already traversed at iter->pos because - * bch2_trans_extent_update() will use it to attempt extent merging - */ - ret = __bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(trans, iter, k); - if (ret) - return ret; - - next_pos = k->k.p; - - ret = bch2_sum_sector_overwrites(trans, iter, k, - &usage_increasing, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - return ret; - - if (disk_res && - disk_sectors_delta > (s64) disk_res->sectors) { - ret = bch2_disk_reservation_add(trans->c, disk_res, - disk_sectors_delta - disk_res->sectors, - !check_enospc || !usage_increasing - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - } - - /* - * Note: - * We always have to do an inode update - even when i_size/i_sectors - * aren't changing - for fsync to work properly; fsync relies on - * inode->bi_journal_seq which is updated by the trigger code: - */ - ret = bch2_extent_update_i_size_sectors(trans, iter, - min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: - bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); - if (unlikely(ret)) - return ret; - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(iter, next_pos); - return 0; -} - -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - unsigned sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets = { 0 }; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } - - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; - - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - struct bch_extent_ptr *ptr; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; - if (ret) - goto err; - - sectors = min(sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - have_reservation = true; - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); 
- - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -/* - * Returns -BCH_ERR_transacton_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(iter, snapshot); - - /* - * peek_upto() doesn't have ideal semantics for extents: - */ - k = bch2_btree_iter_peek_upto(iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans trans; - struct btree_iter iter; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - -static int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct bkey_buf sk; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; - struct btree_iter iter; - subvol_inum inum = { - .subvol = op->subvol, - .inum = k->k.p.inode, - }; - int ret; - - BUG_ON(!inum.subvol); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - do { - bch2_trans_begin(&trans); - - k = bch2_keylist_front(keys); - bch2_bkey_buf_copy(&sk, c, k); - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, - &sk.k->k.p.snapshot); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = bch2_extent_update(&trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_ge(iter.pos, k->k.p)) - bch2_keylist_pop_front(&op->insert_keys); - else - bch2_cut_front(iter.pos, k); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - return ret; -} - -/* Writes */ - -void bch2_submit_wbio_replicas(struct 
bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, - bool nocow) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - const struct bch_extent_ptr *ptr; - struct bch_write_bio *n; - struct bch_dev *ca; - - BUG_ON(c->opts.nochanges); - - bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || - !c->devs[ptr->dev]); - - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->parent = wbio; - n->split = true; - n->bounce = false; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - bio_inc_remaining(&wbio->bio); - } else { - n = wbio; - n->split = false; - } - - n->c = c; - n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? READ : WRITE); - n->nocow = nocow; - n->submit_time = local_clock(); - n->inode_offset = bkey_start_offset(&k->k); - n->bio.bi_iter.bi_sector = ptr->offset; - - if (likely(n->have_ioref)) { - this_cpu_add(ca->io_done->sectors[WRITE][type], - bio_sectors(&n->bio)); - - bio_set_dev(&n->bio, ca->disk_sb.bdev); - - if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { - bio_endio(&n->bio); - continue; - } - - submit_bio(&n->bio); - } else { - n->bio.bi_status = BLK_STS_REMOVED; - bio_endio(&n->bio); - } - } -} - -static void __bch2_write(struct bch_write_op *); - -static void bch2_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - - EBUG_ON(op->open_buckets.nr); - - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - - if (!(op->flags & BCH_WRITE_MOVE)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - if (op->end_io) - op->end_io(op); -} - -static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -{ - struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n; - - for (src = keys->keys; src != keys->top; src = n) { - n = bkey_next(src); - - if (bkey_extent_is_direct_data(&src->k)) { - bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; - } - - if (dst != src) - memmove_u64s_down(dst, src, src->k.u64s); - dst = bkey_next(dst); - } - - keys->top = dst; - return 0; -} - -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k; - unsigned dev; - int ret = 0; - - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; - } - - /* - * probably not the ideal place to hook this in, but I don't - * particularly want to plumb io_opts all the way through the btree - * update stack right now - */ - for_each_keylist_key(keys, k) - bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - - ret = !(op->flags & BCH_WRITE_MOVE) - ? 
bch2_write_index_default(op) - : bch2_data_update_index_update(op); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - BUG_ON(keylist_sectors(keys) && !ret); - - op->written += sectors_start - keylist_sectors(keys); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); - - bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); - } - - if (ret) - goto err; - } -out: - /* If some a bucket wasn't written, we can't erasure code it: */ - for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); - - bch2_open_buckets_put(c, &op->open_buckets); - return; -err: - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; - goto out; -} - -static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) -{ - if (state != wp->state) { - u64 now = ktime_get_ns(); - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) - wp->time[wp->state] += now - wp->last_state_change; - wp->state = state; - wp->last_state_change = now; - } -} - -static inline void wp_update_state(struct write_point *wp, bool running) -{ - enum write_point_state state; - - state = running ? WRITE_POINT_running : - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - - __wp_update_state(wp, state); -} - -static void bch2_write_index(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct write_point *wp = op->wp; - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - - if ((op->flags & BCH_WRITE_DONE) && - (op->flags & BCH_WRITE_MOVE)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); - if (wp->state == WRITE_POINT_waiting_io) - __wp_update_state(wp, WRITE_POINT_waiting_work); - list_add_tail(&op->wp_list, &wp->writes); - spin_unlock_irqrestore (&wp->writes_lock, flags); - - queue_work(wq, &wp->index_update_work); -} - -static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) -{ - op->wp = wp; - - if (wp->state == WRITE_POINT_stopped) { - spin_lock_irq(&wp->writes_lock); - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock_irq(&wp->writes_lock); - } -} - -void bch2_write_point_do_index_updates(struct work_struct *work) -{ - struct write_point *wp = - container_of(work, struct write_point, index_update_work); - struct bch_write_op *op; - - while (1) { - spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); - wp_update_state(wp, op != NULL); - spin_unlock_irq(&wp->writes_lock); - - if (!op) - break; - - op->flags |= BCH_WRITE_IN_WORKER; - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_DONE)) - __bch2_write(op); - else - bch2_write_done(&op->cl); - } -} - -static void bch2_write_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; - } - - if (wbio->nocow) - set_bit(wbio->dev, op->devs_need_flush->d); - - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); - percpu_ref_put(&ca->io_ref); - } - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (parent) - bio_endio(&parent->bio); - else - closure_put(cl); -} - -static void init_append_extent(struct bch_write_op *op, - struct write_point *wp, - struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - struct bkey_i_extent *e; - - op->pos.offset += crc.uncompressed_size; - - e = bkey_extent_init(op->insert_keys.top); - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; - - if (crc.csum_type || - crc.compression_type || - crc.nonce) - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); - - bch2_keylist_push(&op->insert_keys); -} - -static struct bio *bch2_write_bio_alloc(struct bch_fs *c, - struct write_point *wp, - struct bio *src, - bool *page_alloc_failed, - void *buf) -{ - struct bch_write_bio *wbio; - struct bio *bio; - unsigned output_available = - min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available + - (buf - ? ((unsigned long) buf & (PAGE_SIZE - 1)) - : 0), PAGE_SIZE); - - pages = min(pages, BIO_MAX_VECS); - - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = src->bi_opf; - - if (buf) { - bch2_bio_map(bio, buf, output_available); - return bio; - } - - wbio->bounce = true; - - /* - * We can't use mempool for more than c->sb.encoded_extent_max - * worth of pages, but we'd like to allocate more if we can: - */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); - - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; - - return bio; -} - -static int bch2_write_rechecksum(struct bch_fs *c, - struct bch_write_op *op, - unsigned new_csum_type) -{ - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; - int ret; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - - if (bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); - if (ret) - return ret; - - bio_advance(bio, op->crc.offset << 9); - bio->bi_iter.bi_size = op->crc.live_size << 9; - op->crc = new_crc; - return 0; -} - -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this 
case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -{ - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) - return PREP_ENCODED_OK; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - - /* Can we just write the entire extent as is? */ - if (op->crc.uncompressed_size == op->crc.live_size && - op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - return PREP_ENCODED_DO_WRITE; - } - - /* - * If the data is compressed and we couldn't write the entire extent as - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) - return PREP_ENCODED_ERR; - } - - /* - * No longer have compressed data after this point - data might be - * encrypted: - */ - - /* - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - - /* - * If we want to compress the data, it has to be decrypted: - */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - - return PREP_ENCODED_OK; -} - -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - struct bio **_dst) -{ - struct bch_fs *c = op->c; - struct bio *src = &op->wbio.bio, *dst = src; - struct bvec_iter saved_iter; - void *ec_buf; - unsigned total_output = 0, total_input = 0; - bool bounce = false; - bool page_alloc_failed = false; - int ret, more = 0; - - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; - } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && - 
!(op->flags & BCH_WRITE_PAGES_STABLE)) || - (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - - saved_iter = dst->bi_iter; - - do { - struct bch_extent_crc_unpacked crc = { 0 }; - struct bversion version = op->version; - size_t dst_len, src_len; - - if (page_alloc_failed && - dst->bi_iter.bi_size < (wp->sectors_free << 9) && - dst->bi_iter.bi_size < c->opts.encoded_extent_max) - break; - - BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - - crc.compression_type = op->incompressible - ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_opt - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_opt) - : 0; - if (!crc_is_compressed(crc)) { - dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); - - if (op->csum_type) - dst_len = min_t(unsigned, dst_len, - c->opts.encoded_extent_max); - - if (bounce) { - swap(dst->bi_iter.bi_size, dst_len); - bio_copy_data(dst, src); - swap(dst->bi_iter.bi_size, dst_len); - } - - src_len = dst_len; - } - - BUG_ON(!src_len || !dst_len); - - if (bch2_csum_type_is_encryption(op->csum_type)) { - if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version); - } else { - crc.nonce = op->nonce; - op->nonce += src_len >> 9; - } - } - - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { - u8 compression_type = crc.compression_type; - u16 nonce = crc.nonce; - /* - * Note: when we're using rechecksum(), we need to be - * checksumming @src because it has all the data our - * existing checksum covers - if we bounced (because we - * were trying to compress), @dst will only have the - * part of the data the new checksum will cover. - * - * But normally we want to be checksumming post bounce, - * because part of the reason for bouncing is so the - * data can't be modified (by userspace) while it's in - * flight. - */ - if (bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->csum_type)) - goto csum_err; - /* - * rchecksum_bio sets compression_type on crc from op->crc, - * this isn't always correct as sometimes we're changing - * an extent from uncompressed to incompressible. 
- */ - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->crc.csum_type)) - goto csum_err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; - crc.live_size = src_len >> 9; - - swap(dst->bi_iter.bi_size, dst_len); - ret = bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - if (ret) - goto err; - - crc.csum = bch2_checksum_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - crc.csum_type = op->csum_type; - swap(dst->bi_iter.bi_size, dst_len); - } - - init_append_extent(op, wp, version, crc); - - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); - total_output += dst_len; - total_input += src_len; - } while (dst->bi_iter.bi_size && - src->bi_iter.bi_size && - wp->sectors_free && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - more = src->bi_iter.bi_size != 0; - - dst->bi_iter = saved_iter; - - if (dst == src && more) { - BUG_ON(total_output != total_input); - - dst = bio_split(src, total_input >> 9, - GFP_NOFS, &c->bio_write); - wbio_init(dst)->put_bio = true; - /* copy WRITE_SYNC flag */ - dst->bi_opf = src->bi_opf; - } - - dst->bi_iter.bi_size = total_output; -do_write: - *_dst = dst; - return more; -csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); - ret = -EIO; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); - - return ret; -} - -static bool bch2_extent_is_writeable(struct bch_write_op *op, - struct bkey_s_c k) -{ - struct bch_fs *c = op->c; - struct bkey_s_c_extent e; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - unsigned replicas = 0; - - if (k.k->type != KEY_TYPE_extent) - return false; - - e = bkey_s_c_to_extent(k); - extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.csum_type || - crc_is_compressed(p.crc) || - p.has_ec) - return false; - - replicas += bch2_extent_ptr_durability(c, &p); - } - - return replicas >= op->opts.data_replicas; -} - -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - const struct bch_extent_ptr *ptr; - struct bkey_i *k; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - -static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *orig, - struct bkey_s_c k, - u64 new_i_size) -{ - struct bkey_i *new; - struct bkey_ptrs ptrs; - struct bch_extent_ptr *ptr; - int ret; - - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { - /* trace this */ - return 0; - } - - new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bch2_cut_front(bkey_start_pos(&orig->k), new); - bch2_cut_back(orig->k.p, new); - - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) - ptr->unwritten = 0; - - /* - * Note that we're not calling bch2_subvol_get_snapshot() in this path - - * that was done when we kicked off the write, and here it's important - * that we update the extent that we wrote to - even if a snapshot has 
- * since been created. The write is still outstanding, so we're ok - * w.r.t. snapshot atomicity: - */ - return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: - bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_i *orig; - struct bkey_s_c k; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, ({ - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); - })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); - - bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, - "write error while doing btree update: %s", - bch2_err_str(ret)); - } - - if (ret) { - op->error = ret; - break; - } - } - - bch2_trans_exit(&trans); -} - -static void __bch2_nocow_write_done(struct bch_write_op *op) -{ - bch2_nocow_write_unlock(op); - - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) - bch2_nocow_write_convert_unwritten(op); -} - -static void bch2_nocow_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - - __bch2_nocow_write_done(op); - bch2_write_done(cl); -} - -static void bch2_nocow_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; - struct { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; - } buckets[BCH_REPLICAS_MAX]; - unsigned nr_buckets = 0; - u32 snapshot; - int ret, i; - - if (op->flags & BCH_WRITE_MOVE) - return; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); - if (unlikely(ret)) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); - while (1) { - struct bio *bio = &op->wbio.bio; - - nr_buckets = 0; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - /* fall back to normal cow write path? 
*/ - if (unlikely(k.k->p.snapshot != snapshot || - !bch2_extent_is_writeable(op, k))) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) - break; - - /* Get iorefs before dropping btree locks: */ - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); - buckets[nr_buckets].gen = ptr->gen; - buckets[nr_buckets].l = - bucket_nocow_lock(&c->nocow_locks, - bucket_to_u64(buckets[nr_buckets].b)); - - prefetch(buckets[nr_buckets].l); - - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - - nr_buckets++; - - if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; - } - - /* Unlock before taking nocow locks, doing IO: */ - bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(&trans); - - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - for (i = 0; i < nr_buckets; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); - struct nocow_lock_bucket *l = buckets[i].l; - bool stale; - - __bch2_bucket_nocow_lock(&c->nocow_locks, l, - bucket_to_u64(buckets[i].b), - BUCKET_NOCOW_LOCK_UPDATE); - - rcu_read_lock(); - stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); - rcu_read_unlock(); - - if (unlikely(stale)) - goto err_bucket_stale; - } - - bio = &op->wbio.bio; - if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { - bio = bio_split(bio, k.k->p.offset - op->pos.offset, - GFP_KERNEL, &c->bio_write); - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { - op->flags |= BCH_WRITE_DONE; - } - - op->pos.offset += bio_sectors(bio); - op->written += bio_sectors(bio); - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_DONE) - break; - bch2_btree_iter_advance(&iter); - } -out: - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s: btree lookup error %s", - __func__, bch2_err_str(ret)); - op->error = ret; - op->flags |= BCH_WRITE_DONE; - } - - bch2_trans_exit(&trans); - - /* fallback to cow write path? 
*/ - if (!(op->flags & BCH_WRITE_DONE)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl); - } else { - /* - * XXX - * needs to run out of process context because ei_quota_lock is - * a mutex - */ - continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); - } - return; -err_get_ioref: - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); - - /* Fall back to COW path: */ - goto out; -err_bucket_stale: - while (--i >= 0) - bch2_bucket_nocow_unlock(&c->nocow_locks, - buckets[i].b, - BUCKET_NOCOW_LOCK_UPDATE); - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); - - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; - goto out; -} - -static void __bch2_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct write_point *wp = NULL; - struct bio *bio = NULL; - unsigned nofs_flags; - int ret; - - nofs_flags = memalloc_nofs_save(); - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); - if (op->flags & BCH_WRITE_DONE) - goto out_nofs_restore; - } -again: - memset(&op->failed, 0, sizeof(op->failed)); - - do { - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - - /* +1 for possible cache device: */ - if (op->open_buckets.nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets.v)) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - break; - - /* - * The copygc thread is now global, which means it's no longer - * freeing up space on specific disks, which means that - * allocations for specific disks may hang arbitrarily long: - */ - ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_alloc_sectors_start_trans(&trans, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->watermark, - op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? NULL : &op->cl, &wp)); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - - goto err; - } - - EBUG_ON(!wp); - - bch2_open_bucket_get(c, wp, &op->open_buckets); - ret = bch2_write_extent(op, wp, &bio); - - bch2_alloc_sectors_done_inlined(c, wp); -err: - if (ret <= 0) { - op->flags |= BCH_WRITE_DONE; - - if (ret < 0) { - op->error = ret; - break; - } - } - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - - closure_get(bio->bi_private); - - key_to_write = (void *) (op->insert_keys.keys_p + - key_to_write_offset); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write, false); - } while (ret); - - /* - * Sync or no? - * - * If we're running asynchronously, wne may still want to block - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. 
- */ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_DONE) && - !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_DONE)) - goto again; - bch2_write_done(&op->cl); - } else { - bch2_write_queue(op, wp); - continue_at(&op->cl, bch2_write_index, NULL); - } -out_nofs_restore: - memalloc_nofs_restore(nofs_flags); -} - -static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -{ - struct bio *bio = &op->wbio.bio; - struct bvec_iter iter; - struct bkey_i_inline_data *id; - unsigned sectors; - int ret; - - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - - ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_U64s + DIV_ROUND_UP(data_len, 8)); - if (ret) { - op->error = ret; - goto err; - } - - sectors = bio_sectors(bio); - op->pos.offset += sectors; - - id = bkey_inline_data_init(op->insert_keys.top); - id->k.p = op->pos; - id->k.version = op->version; - id->k.size = sectors; - - iter = bio->bi_iter; - iter.bi_size = data_len; - memcpy_from_bio(id->v.data, bio, iter); - - while (data_len & 7) - id->v.data[data_len++] = '\0'; - set_bkey_val_bytes(&id->k, data_len); - bch2_keylist_push(&op->insert_keys); - - __bch2_write_index(op); -err: - bch2_write_done(&op->cl); -} - -/** - * bch_write - handle a write to a cache device or flash only volume - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
- */ -void bch2_write(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; - struct bch_fs *c = op->c; - unsigned data_len; - - EBUG_ON(op->cl.parent); - BUG_ON(!op->nr_replicas); - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - - op->start_time = local_clock(); - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "misaligned write"); - op->error = -EIO; - goto err; - } - - if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - - if (!(op->flags & BCH_WRITE_MOVE) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, - op->new_i_size - (op->pos.offset << 9)); - - if (c->opts.inline_data && - data_len <= min(block_bytes(c) / 2, 1024U)) { - bch2_write_data_inline(op, data_len); - return; - } - - __bch2_write(op); - return; -err: - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); - if (op->end_io) - op->end_io(op); -} - -static const char * const bch2_write_flags[] = { -#define x(f) #f, - BCH_WRITE_FLAGS() -#undef x - NULL -}; - -void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -{ - prt_str(out, "pos: "); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_str(out, "started: "); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - - prt_str(out, "flags: "); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -/* Cache promotion on read */ - -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ -}; - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), -}; - -static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags) -{ - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; - - if (!opts.promote_target) - return false; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return false; - - if (bkey_extent_is_unwritten(k)) - return false; - - if (bch2_target_congested(c, opts.promote_target)) { - /* XXX trace this */ - return false; - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return false; - - return true; -} - -static void promote_free(struct bch_fs *c, struct promote_op *op) -{ - int ret; - - bch2_data_update_exit(&op->write); - - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; - - 
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bio *bio = &op->write.op.wbio.bio; - - trace_and_count(op->write.op.c, read_promote, &rbio->bio); - - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); -} - -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio) -{ - struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) - goto err; - - op->start_time = local_clock(); - op->pos = pos; - - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) - goto err; - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) - goto err; - - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) - goto err; - - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - ret = bch2_data_update_init(trans, NULL, &op->write, - writepoint_hashed((unsigned long) current), - opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, - btree_id, k); - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - goto err; - } - - op->write.op.end_io = promote_done; - - return op; -err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; - kfree(op); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; -} - -noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) -{ - struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; - - if (!should_promote(c, k, pos, opts, flags)) - return NULL; - - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? 
BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); - if (!promote) - return NULL; - - *bounce = true; - *read_full = promote_full; - return promote; -} - -/* Read */ - -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (rbio->kmalloc) - kfree(rbio); - else - bio_put(&rbio->bio); - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); - bio_endio(&rbio->bio); -} - -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); -retry: - rbio->bio.bi_status = 0; - - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) - goto err; - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { - /* extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; - } - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - trace_and_count(c, read_retry, &rbio->bio); - - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); - - rbio->bio.bi_status = 0; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; - - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, 
rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - __bch2_read(c, rbio, iter, inum, &failed, flags); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) -{ - rbio->retry = retry; - - if (rbio->flags & BCH_READ_IN_RETRY) - return; - - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->version, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? */ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, - __bch2_rbio_narrow_crcs(&trans, rbio)); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. 
- */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; - - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; - } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); - bch2_io_error(ca); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; -decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -} - -static void bch2_read_endio(struct bio *bio) -{ - struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { - trace_and_count(c, read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -EIO; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); - - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; - struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - goto out_read_done; - } -retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); - - /* hole or reservation - just zero fill: */ - if (!pick_ret) - goto hole; - - if (pick_ret < 0) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from"); - goto err; - } - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - 
- /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_IN_RETRY) && - !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); - goto retry_pick; - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); - - if (flags & BCH_READ_NODECODE) { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) - goto hole; - - iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; - bounce = true; - } - - if (orig->opts.promote_target) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - offset_into_extent = 0; - } -get_bio: - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig->opts); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig->opts); - rbio->bio.bi_iter = iter; - rbio->split = true; - } else { - rbio = orig; - 
rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->c = c; - rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); - rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; - rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->version; - rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); - - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); - } - - if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(c, rbio)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { - return 0; - } else { - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->retry; - rbio = bch2_rbio_free(rbio); - - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; - - return ret; - } - -err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; - - orig->bio.bi_status = BLK_STS_IOERR; - goto out_read_done; - -hole: - /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: - */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; -} - -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - 
struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - u32 snapshot; - int ret; - - BUG_ON(flags & BCH_READ_NODECODE); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(&trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(&trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min(sectors, k.k->size - offset_into_extent); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags); - if (ret) - break; - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(&trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - } -} - -void bch2_fs_io_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); - mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); -} - -int bch2_fs_io_init(struct bch_fs *c) -{ - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; - - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_write_init; - - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; - - return 0; -} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h deleted file 
mode 100644 index 831e3f1b7e41..000000000000 --- a/fs/bcachefs/io.h +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_H -#define _BCACHEFS_IO_H - -#include "checksum.h" -#include "bkey_buf.h" -#include "io_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -#define to_rbio(_bio) \ - container_of((_bio), struct bch_read_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -const char *bch2_blk_status_to_str(blk_status_t); - -#define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(DONE) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - unsigned, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -void bch2_write(struct closure *); - -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -int __bch2_read_indirect_extent(struct btree_trans *, unsigned 
*, - struct bkey_buf *); - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) -{ - if (k->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); -} - -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); -} - -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - struct bch_io_failures failed = { .nr = 0 }; - - BUG_ON(rbio->_state); - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; - return rbio; -} - -void bch2_fs_io_exit(struct bch_fs *); -int bch2_fs_io_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c new file mode 100644 index 000000000000..c04e5dacfc8d --- /dev/null +++ b/fs/bcachefs/io_misc.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "extents.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) 
bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct 
btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h new file mode 100644 index 000000000000..46e9ce3251d6 --- /dev/null +++ b/fs/bcachefs/io_misc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_MISC_H +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + unsigned, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c new file mode 100644 index 000000000000..cd62bf730396 --- /dev/null +++ b/fs/bcachefs/io_read.c @@ -0,0 +1,1207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "data_update.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" +#include "trace.h" + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (!opts.promote_target) + return false; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return false; + + if (bkey_extent_is_unwritten(k)) + return false; + + if (bch2_target_congested(c, 
opts.promote_target)) { + /* XXX trace this */ + return false; + } + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool 
*bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct 
bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? 
*/ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
+ */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + 
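+	/*
+	 * bch2_bkey_pick_read_device() returned > 0, so @pick holds a usable
+	 * pointer and @ca is the device it lives on; a return of 0 would have
+	 * meant a hole or reservation (zero filled via the hole path below),
+	 * and < 0 that there was no device to read from.
+	 */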
+ /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + offset_into_extent = 0; + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + 
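+		/*
+		 * Not bouncing and not cloning: read directly into the
+		 * caller's bio. rbio aliases @orig here, so rbio->split stays
+		 * false and we save orig's bi_end_io below instead of setting
+		 * a parent pointer.
+		 */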
rbio->bio.bi_iter = iter; + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + 
struct bch_io_failures *failed, unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(&trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_read_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_read_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h new file mode 100644 index 000000000000..d9c18bb7d403 --- /dev/null +++ b/fs/bcachefs/io_read.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_READ_H +#define _BCACHEFS_IO_READ_H + +#include "bkey_buf.h" + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read 
from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_read_exit(struct bch_fs *); +int bch2_fs_io_read_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_types.h 
b/fs/bcachefs/io_types.h deleted file mode 100644 index 737f16d78c48..000000000000 --- a/fs/bcachefs/io_types.h +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_TYPES_H -#define _BCACHEFS_IO_TYPES_H - -#include "alloc_types.h" -#include "btree_types.h" -#include "buckets_types.h" -#include "extents_types.h" -#include "keylist_types.h" -#include "opts.h" -#include "super_types.h" - -#include -#include - -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 bounce:1, - split:1, - kmalloc:1, - have_ioref:1, - narrow_crcs:1, - hole:1, - retry:2, - context:2; - }; - u16 _state; - }; - - struct bch_devs_list devs_have; - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct promote_op *promote; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - -struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; - struct bch_write_bio *parent; - - u64 submit_time; - u64 inode_offset; - - struct bch_devs_list failed; - u8 dev; - - unsigned split:1, - bounce:1, - put_bio:1, - have_ioref:1, - nocow:1, - used_mempool:1, - first_btree_write:1; - ); - - struct bio bio; -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - void (*end_io)(struct bch_write_op *); - u64 start_time; - - unsigned written; /* sectors */ - u16 flags; - s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ - - unsigned compression_opt:8; - unsigned csum_type:4; - unsigned nr_replicas:4; - unsigned nr_replicas_required:4; - unsigned watermark:3; - unsigned incompressible:1; - unsigned stripe_waited:1; - - struct bch_devs_list devs_have; - u16 target; - u16 nonce; - struct bch_io_opts opts; - - u32 subvol; - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_DATA_ENCODED: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; - - struct write_point *wp; - struct list_head wp_list; - - struct disk_reservation res; - - struct open_buckets open_buckets; - - u64 new_i_size; - s64 i_sectors_delta; - - struct bch_devs_mask failed; - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; - - /* - * Bitmask of devices that have had nocow writes issued to them since - * last flush: - */ - struct bch_devs_mask *devs_need_flush; - - /* Must be last: */ - struct bch_write_bio wbio; -}; - -#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c new file mode 100644 index 000000000000..7f29fd2f05b1 --- /dev/null +++ b/fs/bcachefs/io_write.c @@ -0,0 +1,1670 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "extent_update.h" +#include "inode.h" +#include "io_write.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "rebalance.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; + u64 now = local_clock(); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0U << 5)) + break; + + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); +} + +#endif + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) +{ + struct page *page; + + if (likely(!*using_mempool)) { + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); + } + + return page; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t size) +{ + bool using_mempool = false; + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Extent update path: */ + +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + + bch2_trans_copy_iter(&iter, extent_iter); + + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); + + *i_sectors_delta += sectors * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + + if (bkey_ge(old.k->p, new->k.p)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 new_i_size, + s64 *i_sectors_delta_total, + bool check_enospc) +{ + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + + next_pos = k->k.p; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc || !usage_increasing + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + /* + * Note: + * We always have to do an inode update - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + if (unlikely(ret)) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; +} + +static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; + int ret; + + BUG_ON(!inum.subvol); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + do { + bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_extent_update(&trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(iter.pos, k->k.p)) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + +/* Writes */ + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k, + bool nocow) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + bkey_for_each_ptr(ptrs, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (to_entry(ptr + 1) < ptrs.end) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOFS, &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->parent = wbio; + n->split = true; + n->bounce = false; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + bio_inc_remaining(&wbio->bio); + } else { + n = wbio; + n->split = false; + } + + n->c = c; + n->dev = ptr->dev; + n->have_ioref = nocow || bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? 
READ : WRITE); + n->nocow = nocow; + n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); + n->bio.bi_iter.bi_sector = ptr->offset; + + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + + bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + + submit_bio(&n->bio); + } else { + n->bio.bi_status = BLK_STS_REMOVED; + bio_endio(&n->bio); + } + } +} + +static void __bch2_write(struct bch_write_op *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + EBUG_ON(op->open_buckets.nr); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) + op->end_io(op); +} + +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n; + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; + } + + if (dst != src) + memmove_u64s_down(dst, src, src->k.u64s); + dst = bkey_next(dst); + } + + keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } + + /* + * probably not the ideal place to hook this in, but I don't + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ + for_each_keylist_key(keys, k) + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? 
bch2_write_index_default(op) + : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) + goto err; + } +out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; +} + +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); + + queue_work(wq, &wp->index_update_work); +} + +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); + } +} + +static void bch2_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } + + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); + percpu_ref_put(&ca->io_ref); + } + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (parent) + bio_endio(&parent->bio); + else + closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + struct bkey_i_extent *e; + + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} + +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed, + void *buf) +{ + struct bch_write_bio *wbio; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); + + pages = min(pages, BIO_MAX_VECS); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOFS, &c->bio_write); + wbio = wbio_init(bio); + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + if (buf) { + bch2_bio_map(bio, buf, output_available); + return bio; + } + + wbio->bounce = true; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; + + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; + + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this 
case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return ret; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; + } + + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (crc_is_compressed(op->crc)) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } + + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ + + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_opt || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_OK; +} + +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + void *ec_buf; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; + int ret, more = 0; + + BUG_ON(!bio_sectors(src)); + + ec_buf = bch2_writepoint_ec_buf(c, wp); + + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } + + if (ec_buf || + op->compression_opt || + (op->csum_type && + 
!(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } + + saved_iter = dst->bi_iter; + + do { + struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + + if (page_alloc_failed && + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_opt && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_opt + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_opt) + : 0; + if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } + + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version); + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. 
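+			 * The compression_type and nonce saved above are
+			 * restored immediately below for that reason.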
+ */ + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + total_input += src_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, + GFP_NOFS, &c->bio_write); + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; + } + + dst->bi_iter.bi_size = total_output; +do_write: + *_dst = dst; + return more; +csum_err: + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (to_wbio(dst)->bounce) + bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) + bio_put(dst); + + return ret; +} + +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.csum_type || + crc_is_compressed(p.crc) || + p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has 
+ * since been created. The write is still outstanding, so we're ok + * w.r.t. snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_exit(&trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? 
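(the key may be in a different snapshot, or bch2_extent_is_writeable() may refuse it - e.g. checksummed, compressed or erasure coded pointers, or too few replicas)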
*/ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(&trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; + + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_exit(&trans); + + /* fallback to cow write path? 
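(BCH_WRITE_DONE not being set here means the nocow path gave up part way through: the keylist is reset and the caller, __bch2_write(), redoes the remainder as a normal COW write)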
*/ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (--i >= 0) + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + goto out; +} + +static void __bch2_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct write_point *wp = NULL; + struct bio *bio = NULL; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } +again: + memset(&op->failed, 0, sizeof(op->failed)); + + do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->watermark, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; + + goto err; + } + + EBUG_ON(!wp); + + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { + op->flags |= BCH_WRITE_DONE; + + if (ret < 0) { + op->error = ret; + break; + } + } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + + closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write, false); + } while (ret); + + /* + * Sync or no? + * + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. 
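+	 * In the asynchronous case the op is queued on the write point and
+	 * bch2_write_point_do_index_updates() finishes the index update,
+	 * resubmitting via __bch2_write() if BCH_WRITE_DONE still isn't set.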
+ */ + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { + closure_sync(&op->cl); + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + bch2_write_queue(op, wp); + continue_at(&op->cl, bch2_write_index, NULL); + } +out_nofs_restore: + memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + __bch2_write_index(op); +err: + bch2_write_done(&op->cl); +} + +/** + * bch_write - handle a write to a cache device or flash only volume + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data won't fit in a single open bucket, there will be multiple keys); + * after the data is written it calls bch_journal, and after the keys have been + * added to the next journal write they're inserted into the btree. + * + * If op->discard is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. 
+ */ +void bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; + struct bch_fs *c = op->c; + unsigned data_len; + + EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); + op->error = -EIO; + goto err; + } + + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + + __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + + closure_debug_destroy(&op->cl); + if (op->end_io) + op->end_io(op); +} + +static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +void bch2_fs_io_write_exit(struct bch_fs *c) +{ + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); +} + +int bch2_fs_io_write_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + return 0; +} diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h new file mode 100644 index 000000000000..9323167229ee --- /dev/null +++ b/fs/bcachefs/io_write.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_H +#define _BCACHEFS_IO_WRITE_H + +#include "checksum.h" +#include "io_write_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + 
x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +void bch2_write(struct closure *); + +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +void bch2_fs_io_write_exit(struct bch_fs *); +int bch2_fs_io_write_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h new file mode 100644 index 000000000000..c7f97c2c4805 --- /dev/null +++ b/fs/bcachefs/io_write_types.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_TYPES_H +#define _BCACHEFS_IO_WRITE_TYPES_H + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "extents_types.h" +#include "keylist_types.h" +#include "opts.h" +#include "super_types.h" + +#include +#include + +struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; + struct bch_write_bio *parent; + + u64 submit_time; + u64 inode_offset; + + struct bch_devs_list failed; + u8 dev; + + unsigned split:1, + bounce:1, + put_bio:1, + have_ioref:1, + nocow:1, + used_mempool:1, + first_btree_write:1; + ); + + struct bio bio; +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + void (*end_io)(struct bch_write_op *); + u64 start_time; + + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ + + unsigned compression_opt:8; + unsigned csum_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned watermark:3; + unsigned incompressible:1; + unsigned stripe_waited:1; + + struct bch_devs_list devs_have; + u16 target; + u16 nonce; + struct bch_io_opts opts; + + u32 subvol; + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; + + struct write_point *wp; + struct list_head wp_list; + + struct disk_reservation res; + + struct open_buckets open_buckets; + + u64 new_i_size; + s64 i_sectors_delta; + + struct bch_devs_mask failed; + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + + /* Must be last: */ + struct bch_write_bio wbio; +}; + +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 34740dca4b15..0e606009dc46 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -8,7 +8,6 @@ #include "checksum.h" #include "disk_groups.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 3d7c5b919421..4746dfa7af97 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "errcode.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "migrate.h" diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fb76a1dac74e..ac4df53bfde2 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -14,7 +14,8 @@ #include "errcode.h" #include "error.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal_reclaim.h" #include "keylist.h" #include "move.h" diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index c3136abe8587..cbdd58db8782 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "bcachefs_ioctl.h" #include "btree_iter.h" #include "buckets.h" #include "data_update.h" diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index ac658e99bf57..2371fd61ea58 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -13,25 +13,17 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" -#include "disk_groups.h" #include "errcode.h" #include "error.h" -#include "extents.h" -#include "eytzinger.h" -#include "io.h" -#include "keylist.h" #include "lru.h" #include "move.h" #include "movinggc.h" -#include "super-io.h" #include "trace.h" -#include #include #include #include #include -#include #include struct buckets_in_flight { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 016cf0834b3d..568f1e8e7507 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -8,8 +8,6 @@ #include "compress.h" #include "disk_groups.h" #include "errcode.h" -#include "extents.h" -#include "io.h" #include "move.h" #include "rebalance.h" #include "super-io.h" diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 39f711d5069e..f155428ff395 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -5,9 +5,11 @@ #include "buckets.h" #include "extents.h" #include "inode.h" -#include "io.h" +#include "io_misc.h" 
+#include "io_write.h" #include "reflink.h" #include "subvolume.h" +#include "super-io.h" #include diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f01883e785a5..5a1115396edc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -6,7 +6,6 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7cfc04947717..55176023f15b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -35,7 +35,8 @@ #include "fs-io-direct.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" @@ -483,7 +484,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); - bch2_fs_io_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); @@ -848,7 +850,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: -- cgit From 5902cc283c060f0a006ee9b2f2a64855a09399b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Sep 2023 05:38:30 -0400 Subject: bcachefs: New io_misc.c helpers This pulls the non vfs specific parts of truncate and finsert/fcollapse out of fs-io.c, and moves them to io_misc.c. This is prep work for logging these operations, to make them atomic in the event of a crash. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 209 +++++----------------------------------------- fs/bcachefs/fs-io.h | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/io_misc.c | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/io_misc.h | 3 + 5 files changed, 250 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0b0b3b0d6c7d..b36513eb3d16 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -391,33 +391,12 @@ static int bch2_extend(struct mnt_idmap *idmap, return bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_truncate_finish_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - return 0; -} - -static int bch2_truncate_start_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - u64 *new_i_size = p; - - bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; - bi->bi_size = *new_i_size; - return 0; -} - -int bch2_truncate(struct mnt_idmap *idmap, +int bchfs_truncate(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; - u64 new_i_size = iattr->ia_size; s64 i_sectors_delta = 0; int ret = 0; @@ -466,6 +445,8 @@ int bch2_truncate(struct mnt_idmap *idmap, if (unlikely(ret < 0)) goto err; + truncate_setsize(&inode->v, iattr->ia_size); + /* * When extending, we're going to write the new i_size to disk * immediately so we need to flush anything above the current on disk @@ -487,32 +468,22 @@ int bch2_truncate(struct mnt_idmap *idmap, if (ret) goto err; - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, - &new_i_size, 0); - mutex_unlock(&inode->ei_update_lock); + ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - if (unlikely(ret)) + if (unlikely(ret)) { + /* + * If we error here, VFS caches are now inconsistent with btree + */ + set_bit(EI_INODE_ERROR, &inode->ei_flags); goto err; - - truncate_setsize(&inode->v, iattr->ia_size); - - ret = bch2_fpunch(c, inode_inum(inode), - round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + } bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); - if (unlikely(ret)) - goto err; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); - mutex_unlock(&inode->ei_update_lock); ret = bch2_setattr_nonsize(idmap, inode, iattr); err: @@ -577,175 +548,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_buf copy; - struct btree_trans trans; - struct btree_iter src, dst, del; - loff_t shift, new_size; - u64 src_start; + s64 i_sectors_delta = 0; int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; if (insert) { - if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - return -EFBIG; - if (offset >= inode->v.i_size) return -EINVAL; - - src_start = U64_MAX; - shift = len; } else { if (offset + 
len >= inode->v.i_size) return -EINVAL; - - src_start = offset + len; - shift = -len; } - new_size = inode->v.i_size + shift; - ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; - if (insert) { - i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - } else { - s64 i_sectors_delta = 0; - - ret = bch2_fpunch(c, inode_inum(inode), - offset >> 9, (offset + len) >> 9, - &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (insert) + i_size_write(&inode->v, inode->v.i_size + len); - if (ret) - return ret; - } - - bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, - POS(inode->v.i_ino, src_start >> 9), - BTREE_ITER_INTENT); - bch2_trans_copy_iter(&dst, &src); - bch2_trans_copy_iter(&del, &src); - - while (ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - struct bkey_s_c k; - struct bpos next_pos; - struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); - struct bpos atomic_end; - unsigned trigger_flags = 0; - u32 snapshot; - - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, - inode->ei_subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(&src, snapshot); - bch2_btree_iter_set_snapshot(&dst, snapshot); - bch2_btree_iter_set_snapshot(&del, snapshot); - - bch2_trans_begin(&trans); - - k = insert - ? bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); - if ((ret = bkey_err(k))) - continue; - - if (!k.k || k.k->p.inode != inode->v.i_ino) - break; - - if (insert && - bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) - break; -reassemble: - bch2_bkey_buf_reassemble(©, c, k); - - if (insert && - bkey_lt(bkey_start_pos(k.k), move_pos)) - bch2_cut_front(move_pos, copy.k); - - copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - - ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); - if (ret) - continue; - - if (!bkey_eq(atomic_end, copy.k->k.p)) { - if (insert) { - move_pos = atomic_end; - move_pos.offset -= shift >> 9; - goto reassemble; - } else { - bch2_cut_back(atomic_end, copy.k); - } - } - - bkey_init(&delete.k); - delete.k.p = copy.k->k.p; - delete.k.size = copy.k->k.size; - delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); - - next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; - - if (copy.k->k.size != k.k->size) { - /* We might end up splitting compressed extents: */ - unsigned nr_ptrs = - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); - - ret = bch2_disk_reservation_get(c, &disk_res, - copy.k->k.size, nr_ptrs, - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - } - - ret = bch2_btree_iter_traverse(&del) ?: - bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(c, &disk_res); - - if (!ret) - bch2_btree_iter_set_pos(&src, next_pos); - } - bch2_trans_iter_exit(&trans, &del); - bch2_trans_iter_exit(&trans, &dst); - bch2_trans_iter_exit(&trans, &src); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(©, c); - - if (ret) - return ret; + ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, + insert, &i_sectors_delta); + if (!ret && !insert) + i_size_write(&inode->v, inode->v.i_size - len); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - mutex_lock(&inode->ei_update_lock); - if (!insert) { - i_size_write(&inode->v, new_size); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&inode->ei_update_lock); return ret; } diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index bc6e8439d40b..ca70346e68dc 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -165,7 +165,7 @@ int __must_check bch2_write_inode_size(struct bch_fs *, int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct mnt_idmap *, +int bchfs_truncate(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0648874d54f3..0def3a57bd6d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -798,7 +798,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(idmap, inode, iattr) + ? 
bchfs_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index c04e5dacfc8d..1afea613df4a 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -9,7 +9,10 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "error.h" #include "extents.h" +#include "extent_update.h" +#include "inode.h" #include "io_misc.h" #include "io_write.h" #include "subvolume.h" @@ -213,3 +216,226 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, return ret; } + +static int truncate_set_isize(struct btree_trans *trans, + subvol_inum inum, + u64 new_i_size) +{ + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + (inode_u.bi_size = new_i_size, 0) ?: + bch2_inode_write(trans, &iter, &inode_u); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter fpunch_iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &fpunch_iter, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_INTENT); + + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + truncate_set_isize(&trans, inum, new_i_size)); + if (ret) + goto err; + + ret = bch2_fpunch_at(&trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + goto err; +err: + bch2_trans_iter_exit(&trans, &fpunch_iter); + bch2_trans_exit(&trans); + + bch2_fs_fatal_err_on(ret, c, "%s: error truncating %u:%llu: %s", + __func__, inum.subvol, inum.inum, bch2_err_str(ret)); + return ret; +} + +static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) +{ + struct btree_iter iter; + struct bch_inode_unpacked inode_u; + int ret; + + offset <<= 9; + len <<= 9; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + if (len > 0) { + if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { + ret = -EFBIG; + goto err; + } + + if (offset >= inode_u.bi_size) { + ret = -EINVAL; + goto err; + } + } + + inode_u.bi_size += len; + inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); + + ret = bch2_inode_write(trans, &iter, &inode_u); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 len, bool insert, + s64 *i_sectors_delta) +{ + struct bkey_buf copy; + struct btree_trans trans; + struct btree_iter src = { NULL }, dst = { NULL }, del = { NULL }; + s64 shift = insert ? 
len : -len; + int ret = 0; + + bch2_bkey_buf_init(©); + bch2_trans_init(&trans, c, 0, 1024); + + bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, + POS(inum.inum, U64_MAX), + BTREE_ITER_INTENT); + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); + + if (insert) { + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(&trans, inum, offset, len)); + if (ret) + goto err; + } else { + bch2_btree_iter_set_pos(&src, POS(inum.inum, offset)); + + ret = bch2_fpunch_at(&trans, &src, inum, offset + len, i_sectors_delta); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + bch2_btree_iter_set_pos(&src, POS(inum.inum, offset + len)); + } + + while (ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + struct bkey_s_c k; + struct bpos next_pos; + struct bpos move_pos = POS(inum.inum, offset); + struct bpos atomic_end; + unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); + + bch2_trans_begin(&trans); + + k = insert + ? bch2_btree_iter_peek_prev(&src) + : bch2_btree_iter_peek_upto(&src, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + continue; + + if (!k.k || k.k->p.inode != inum.inum) + break; + + if (insert && + bkey_le(k.k->p, POS(inum.inum, offset))) + break; +reassemble: + bch2_bkey_buf_reassemble(©, c, k); + + if (insert && + bkey_lt(bkey_start_pos(k.k), move_pos)) + bch2_cut_front(move_pos, copy.k); + + copy.k->k.p.offset += shift; + bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); + + ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); + if (ret) + continue; + + if (!bkey_eq(atomic_end, copy.k->k.p)) { + if (insert) { + move_pos = atomic_end; + move_pos.offset -= shift; + goto reassemble; + } else { + bch2_cut_back(atomic_end, copy.k); + } + } + + bkey_init(&delete.k); + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift; + bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + + if (copy.k->k.size != k.k->size) { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, + copy.k->k.size, nr_ptrs, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + } + + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + + if (!ret) + bch2_btree_iter_set_pos(&src, next_pos); + } + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + if (!insert) { + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(&trans, inum, offset, -len)); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(&trans, inum, 0, 0)); + } +err: + bch2_trans_iter_exit(&trans, &del); + bch2_trans_iter_exit(&trans, &dst); + bch2_trans_iter_exit(&trans, &src); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(©, c); + return ret; +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 46e9ce3251d6..894a7a04ba4b 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -9,4 +9,7 @@ int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); +int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); +int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); + #endif /* _BCACHEFS_IO_MISC_H */ -- cgit From aaad530ac6b1c836de4a29d227ab68be97e39a73 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Aug 2023 18:27:41 -0400 Subject: bcachefs: BTREE_ID_logged_ops Add a new btree for long running logged operations - i.e. for logging operations that we can't do within a single btree transaction, so that they can be resumed if we crash. Keys in the logged operations btree will represent operations in progress, with the state of the operation stored in the value. 
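A toy model of that lifecycle, in standalone C with invented names (logged_op_start, logged_op_finish, resume_logged_ops - the real interfaces are in logged_ops.h below): an entry is created before a multi-step operation begins, its value records how far the operation has gotten, the entry is deleted once everything is committed, and recovery resumes whatever entries are still present:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for a key/value in the logged_ops btree. */
struct logged_op {
	bool			live;	/* key exists => operation unfinished */
	const char		*type;	/* e.g. "truncate", "finsert" */
	unsigned long long	pos;	/* how far the operation has gotten */
};

static struct logged_op table[8];	/* toy "btree" */

static int logged_op_start(const char *type)
{
	for (int i = 0; i < 8; i++)
		if (!table[i].live) {
			table[i] = (struct logged_op) { true, type, 0 };
			return i;
		}
	return -1;
}

static void logged_op_finish(int i)
{
	table[i].live = false;	/* delete the key: operation fully committed */
}

/* Recovery pass: resume anything that never reached _finish(). */
static void resume_logged_ops(void)
{
	for (int i = 0; i < 8; i++)
		if (table[i].live)
			printf("resume %s from pos %llu\n",
			       table[i].type, table[i].pos);
}

int main(void)
{
	int i = logged_op_start("truncate");

	table[i].pos = 4096;	/* a crash here leaves the entry live */
	resume_logged_ops();	/* recovery would re-run from pos */
	logged_op_finish(i);
	return 0;
}

The key property is that deleting the entry is the last step, so a surviving entry always means "unfinished", never "already done".
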
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 4 +- fs/bcachefs/btree_update.c | 18 +++++++ fs/bcachefs/btree_update.h | 1 + fs/bcachefs/logged_ops.c | 111 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/logged_ops.h | 18 +++++++ fs/bcachefs/recovery.c | 1 + fs/bcachefs/recovery_types.h | 1 + fs/bcachefs/super.c | 1 + 10 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 fs/bcachefs/logged_ops.c create mode 100644 fs/bcachefs/logged_ops.h (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 9c00dabb26ac..0749731b9072 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -55,6 +55,7 @@ bcachefs-y := \ journal_sb.o \ journal_seq_blacklist.o \ keylist.o \ + logged_ops.o \ lru.o \ mean_and_variance.o \ migrate.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 30b3d7b9f9dc..e80fef1537c9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -454,6 +454,7 @@ enum gc_phase { GC_PHASE_BTREE_bucket_gens, GC_PHASE_BTREE_snapshot_trees, GC_PHASE_BTREE_deleted_inodes, + GC_PHASE_BTREE_logged_ops, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1cce2504bca6..31efa9e381ce 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -2249,7 +2249,9 @@ enum btree_id_flags { x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ - BIT_ULL(KEY_TYPE_set)) + BIT_ULL(KEY_TYPE_set)) \ + x(logged_ops, 17, 0, \ + 0) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 606e7050a84a..823f0da2f502 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -653,6 +653,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_CACHED| BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: @@ -727,6 +728,23 @@ int bch2_btree_delete_at_buffered(struct btree_trans *trans, return bch2_trans_update_buffered(trans, btree, k); } +int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + unsigned update_flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, update_flags); + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 78a92a1cfb47..0596c5e73a3e 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -58,6 +58,7 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); +int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c new file mode 100644 index 000000000000..28a0e7b33e49 --- 
/dev/null +++ b/fs/bcachefs/logged_ops.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "error.h" +#include "logged_ops.h" +#include "super.h" + +struct bch_logged_op_fn { + u8 type; + int (*resume)(struct btree_trans *, struct bkey_i *); +}; + +static const struct bch_logged_op_fn logged_op_fns[] = { +#define x(n) { \ + .type = KEY_TYPE_logged_op_##n, \ + .resume = bch2_resume_logged_op_##n, \ +}, + BCH_LOGGED_OPS() +#undef x +}; + +static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) +{ + for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) + if (logged_op_fns[i].type == type) + return logged_op_fns + i; + return NULL; +} + +static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret; + + if (!fn) + return 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: + fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_resume_logged_ops(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key2(&trans, iter, + BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + resume_logged_op(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + struct btree_iter iter; + int ret; + + ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + if (ret) + return ret; + + k->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_logged_op_start(trans, k)); +} + +void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) +{ + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); + /* + * This needs to be a fatal error because we've left an unfinished + * operation in the logged ops btree. 
+ * + * We should only ever see an error here if the filesystem has already + * been shut down, but make sure of that here: + */ + if (ret) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", + __func__, buf.buf, bch2_err_str(ret)); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h new file mode 100644 index 000000000000..9b758008c6bd --- /dev/null +++ b/fs/bcachefs/logged_ops.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_H +#define _BCACHEFS_LOGGED_OPS_H + +#include "bkey.h" + +#define BCH_LOGGED_OPS() + +static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) +{ + return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); +} + +int bch2_resume_logged_ops(struct bch_fs *); +int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); +void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); + +#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 30efb3c90560..f5f6eea2cbae 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -20,6 +20,7 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "lru.h" +#include "logged_ops.h" #include "move.h" #include "quota.h" #include "recovery.h" diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index abf1f834ec7a..fbfa9d831d6f 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -29,6 +29,7 @@ x(check_subvols, PASS_FSCK) \ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ x(fs_upgrade_for_subvolumes, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ x(check_inodes, PASS_FSCK) \ x(check_extents, PASS_FSCK) \ x(check_dirents, PASS_FSCK) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 55176023f15b..ef11cede1dba 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -791,6 +791,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -- cgit From b030e262b517b6bddc4bfa88ed8d335ef9de7671 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 16:42:30 -0400 Subject: bcachefs: Log truncate operations Previously, we guaranteed atomicity of truncate after unclean shutdown with the BCH_INODE_I_SIZE_DIRTY flag - which required a full scan of the inodes btree. Recently the deleted inodes btree was added so that we no longer have to scan for deleted inodes, but truncate was unfinished and that change left it broken. This patch uses the new logged operations btree to fix truncate atomicity; we now log an operation that can be replayed at the start of a truncate. 
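Sketched with invented helper names (the real sequence is __bch2_resume_logged_op_truncate() in the diff below), the ordering is: log the intent, commit the new i_size, punch the extents past it, then drop the log entry. Each of those steps is idempotent, so recovery can simply re-run everything after the logging step from the surviving entry:

#include <stdint.h>
#include <stdio.h>

/* Invented helpers standing in for the btree transactions. */
static void log_truncate(uint64_t inum, uint64_t new_size)
{
	printf("log entry: truncate inode %llu to %llu\n",
	       (unsigned long long) inum, (unsigned long long) new_size);
}
static void set_i_size(uint64_t inum, uint64_t new_size)
{
	printf("inode %llu: i_size = %llu\n",
	       (unsigned long long) inum, (unsigned long long) new_size);
}
static void punch_above(uint64_t inum, uint64_t new_size)
{
	printf("inode %llu: delete extents past %llu (rounded up to a block)\n",
	       (unsigned long long) inum, (unsigned long long) new_size);
}
static void drop_log_entry(uint64_t inum)
{
	printf("log entry for inode %llu deleted\n", (unsigned long long) inum);
}

static void do_truncate(uint64_t inum, uint64_t new_size)
{
	log_truncate(inum, new_size);	/* 1: record intent                  */
	set_i_size(inum, new_size);	/* 2: commit new i_size              */
	punch_above(inum, new_size);	/* 3: may be interrupted by a crash  */
	drop_log_entry(inum);		/* 4: only now is the op "finished"  */
	/* On recovery, a surviving entry from step 1 re-runs steps 2-4. */
}

int main(void)
{
	do_truncate(42, 12345);
	return 0;
}
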
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 19 ++++++++++--- fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/io_misc.c | 64 +++++++++++++++++++++++++++++++------------ fs/bcachefs/io_misc.h | 9 ++++++ fs/bcachefs/logged_ops.c | 1 + fs/bcachefs/logged_ops.h | 3 +- 6 files changed, 75 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 31efa9e381ce..3c9e788f1c9d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -370,7 +370,8 @@ static inline void bkey_init(struct bkey *k) x(backpointer, 28) \ x(inode_v3, 29) \ x(bucket_gens, 30) \ - x(snapshot_tree, 31) + x(snapshot_tree, 31) \ + x(logged_op_truncate, 32) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -847,8 +848,8 @@ enum { __BCH_INODE_NODUMP = 3, __BCH_INODE_NOATIME = 4, - __BCH_INODE_I_SIZE_DIRTY = 5, - __BCH_INODE_I_SECTORS_DIRTY = 6, + __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */ + __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */ __BCH_INODE_UNLINKED = 7, __BCH_INODE_BACKPTR_UNTRUSTED = 8, @@ -1183,6 +1184,16 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +/* Logged operations btree: */ + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -2251,7 +2262,7 @@ enum btree_id_flags { x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ - 0) + BIT_ULL(KEY_TYPE_logged_op_truncate)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 6547142db428..91e28ee3efff 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -10,6 +10,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "io_misc.h" #include "lru.h" #include "quota.h" #include "reflink.h" diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 1afea613df4a..327b3dd642de 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -15,6 +15,7 @@ #include "inode.h" #include "io_misc.h" #include "io_write.h" +#include "logged_ops.h" #include "subvolume.h" /* Overwrites whatever was present with zeroes: */ @@ -217,6 +218,17 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, return ret; } +/* truncate: */ + +void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); +} + static int truncate_set_isize(struct btree_trans *trans, subvol_inum inum, u64 new_i_size) @@ -233,36 +245,54 @@ static int truncate_set_isize(struct btree_trans *trans, return ret; } -int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter fpunch_iter; + struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 new_i_size = le64_to_cpu(op->v.new_i_size); int ret; - bch2_trans_init(&trans, c, 
BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &fpunch_iter, BTREE_ID_extents, - POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_INTENT); - - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - truncate_set_isize(&trans, inum, new_i_size)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + truncate_set_isize(trans, inum, new_i_size)); if (ret) goto err; - ret = bch2_fpunch_at(&trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_INTENT); + ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + bch2_trans_iter_exit(trans, &fpunch_iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; - if (ret) - goto err; err: - bch2_trans_iter_exit(&trans, &fpunch_iter); - bch2_trans_exit(&trans); - - bch2_fs_fatal_err_on(ret, c, "%s: error truncating %u:%llu: %s", - __func__, inum.subvol, inum.inum, bch2_err_str(ret)); + bch2_logged_op_finish(trans, op_k); return ret; } +int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_truncate(trans, op_k, NULL); +} + +int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +{ + struct bkey_i_logged_op_truncate op; + + bkey_logged_op_truncate_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.new_i_size = cpu_to_le64(new_i_size); + + return bch2_trans_run(c, + bch2_logged_op_start(&trans, &op.k_i) ?: + __bch2_resume_logged_op_truncate(&trans, &op.k_i, i_sectors_delta)); +} + static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) { struct btree_iter iter; diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 894a7a04ba4b..1b792451fff2 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -9,6 +9,15 @@ int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); +void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_truncate_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); + int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 28a0e7b33e49..e133c23ad51c 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -4,6 +4,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "error.h" +#include "io_misc.h" #include "logged_ops.h" #include "super.h" diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h index 9b758008c6bd..b2f2ebea54b6 100644 --- a/fs/bcachefs/logged_ops.h +++ b/fs/bcachefs/logged_ops.h @@ -4,7 +4,8 @@ #include "bkey.h" -#define BCH_LOGGED_OPS() +#define BCH_LOGGED_OPS() \ + x(truncate) static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) { -- cgit From f3e374efbf1e32fc0235d44abc68abae06a8f7ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Sep 2023 19:11:47 -0400 Subject: bcachefs: Log finsert/fcollapse operations Now that we have the logged operations btree, we can make finsert/fcollapse atomic w.r.t. 
unclean shutdown as well. This adds bch_logged_op_finsert to represent the state of an finsert or fcollapse, which is a bit more complicated than truncate since we need to track our position in the "shift extents" operation. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 23 ++++- fs/bcachefs/io_misc.c | 211 ++++++++++++++++++++++++------------------ fs/bcachefs/io_misc.h | 10 ++ fs/bcachefs/logged_ops.h | 3 +- 4 files changed, 152 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 3c9e788f1c9d..c434202f351a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -371,7 +371,8 @@ static inline void bkey_init(struct bkey *k) x(inode_v3, 29) \ x(bucket_gens, 30) \ x(snapshot_tree, 31) \ - x(logged_op_truncate, 32) + x(logged_op_truncate, 32) \ + x(logged_op_finsert, 33) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -1194,6 +1195,23 @@ struct bch_logged_op_truncate { __le64 new_i_size; }; +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -2262,7 +2280,8 @@ enum btree_id_flags { x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ - BIT_ULL(KEY_TYPE_logged_op_truncate)) + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ + BIT_ULL(KEY_TYPE_logged_op_finsert)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 327b3dd642de..b1be70e15c60 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -293,6 +293,18 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec __bch2_resume_logged_op_truncate(&trans, &op.k_i, i_sectors_delta)); } +/* finsert/fcollapse: */ + +void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); + prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); +} + static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) { struct btree_iter iter; @@ -327,145 +339,160 @@ err: return ret; } -int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 len, bool insert, - s64 *i_sectors_delta) +static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) { - struct bkey_buf copy; - struct btree_trans trans; - struct btree_iter src = { NULL }, dst = { NULL }, del = { NULL }; - s64 shift = insert ? 
len : -len; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 dst_offset = le64_to_cpu(op->v.dst_offset); + u64 src_offset = le64_to_cpu(op->v.src_offset); + s64 shift = dst_offset - src_offset; + u64 len = abs(shift); + u64 pos = le64_to_cpu(op->v.pos); + bool insert = shift > 0; int ret = 0; - bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, 0, 1024); - - bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, - POS(inum.inum, U64_MAX), + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, 0), BTREE_ITER_INTENT); - bch2_trans_copy_iter(&dst, &src); - bch2_trans_copy_iter(&del, &src); + + switch (op->v.state) { +case LOGGED_OP_FINSERT_start: + op->v.state = LOGGED_OP_FINSERT_shift_extents; if (insert) { - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - adjust_i_size(&trans, inum, offset, len)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, len) ?: + bch2_logged_op_update(trans, &op->k_i)); if (ret) goto err; } else { - bch2_btree_iter_set_pos(&src, POS(inum.inum, offset)); + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); - ret = bch2_fpunch_at(&trans, &src, inum, offset + len, i_sectors_delta); + ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - bch2_btree_iter_set_pos(&src, POS(inum.inum, offset + len)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_logged_op_update(trans, &op->k_i)); } - while (ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + fallthrough; +case LOGGED_OP_FINSERT_shift_extents: + while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - struct bkey_i delete; + struct bkey_i delete, *copy; struct bkey_s_c k; - struct bpos next_pos; - struct bpos move_pos = POS(inum.inum, offset); - struct bpos atomic_end; - unsigned trigger_flags = 0; + struct bpos src_pos = POS(inum.inum, src_offset); u32 snapshot; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - continue; + goto btree_err; - bch2_btree_iter_set_snapshot(&src, snapshot); - bch2_btree_iter_set_snapshot(&dst, snapshot); - bch2_btree_iter_set_snapshot(&del, snapshot); - - bch2_trans_begin(&trans); + bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek_upto(&src, POS(inum.inum, U64_MAX)); + ? 
bch2_btree_iter_peek_prev(&iter) + : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) - continue; + goto btree_err; - if (!k.k || k.k->p.inode != inum.inum) + if (!k.k || + k.k->p.inode != inum.inum || + bkey_le(k.k->p, POS(inum.inum, src_offset))) break; - if (insert && - bkey_le(k.k->p, POS(inum.inum, offset))) - break; -reassemble: - bch2_bkey_buf_reassemble(©, c, k); + copy = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(copy))) + goto btree_err; if (insert && - bkey_lt(bkey_start_pos(k.k), move_pos)) - bch2_cut_front(move_pos, copy.k); - - copy.k->k.p.offset += shift; - bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); + bkey_lt(bkey_start_pos(k.k), src_pos)) { + bch2_cut_front(src_pos, copy); - ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); - if (ret) - continue; - - if (!bkey_eq(atomic_end, copy.k->k.p)) { - if (insert) { - move_pos = atomic_end; - move_pos.offset -= shift; - goto reassemble; - } else { - bch2_cut_back(atomic_end, copy.k); - } + /* Splitting compressed extent? */ + bch2_disk_reservation_add(c, &disk_res, + copy->k.size * + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), + BCH_DISK_RESERVATION_NOFAIL); } bkey_init(&delete.k); - delete.k.p = copy.k->k.p; - delete.k.size = copy.k->k.size; - delete.k.p.offset -= shift; - bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); + delete.k.p = copy->k.p; + delete.k.p.snapshot = snapshot; + delete.k.size = copy->k.size; - next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; + copy->k.p.offset += shift; + copy->k.p.snapshot = snapshot; - if (copy.k->k.size != k.k->size) { - /* We might end up splitting compressed extents: */ - unsigned nr_ptrs = - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); + op->v.pos = cpu_to_le64(insert ? 
bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_disk_reservation_get(c, &disk_res, - copy.k->k.size, nr_ptrs, - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - } - - ret = bch2_btree_iter_traverse(&del) ?: - bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); +btree_err: bch2_disk_reservation_put(c, &disk_res); - if (!ret) - bch2_btree_iter_set_pos(&src, next_pos); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + pos = le64_to_cpu(op->v.pos); } - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; + op->v.state = LOGGED_OP_FINSERT_finish; if (!insert) { - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - adjust_i_size(&trans, inum, offset, -len)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, shift) ?: + bch2_logged_op_update(trans, &op->k_i)); } else { /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - adjust_i_size(&trans, inum, 0, 0)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, 0, 0) ?: + bch2_logged_op_update(trans, &op->k_i)); + } + + fallthrough; +case LOGGED_OP_FINSERT_finish: + ret = ret; } err: - bch2_trans_iter_exit(&trans, &del); - bch2_trans_iter_exit(&trans, &dst); - bch2_trans_iter_exit(&trans, &src); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(©, c); + bch2_logged_op_finish(trans, op_k); + bch2_trans_iter_exit(trans, &iter); return ret; } + +int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_finsert(trans, op_k, NULL); +} + +int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 len, bool insert, + s64 *i_sectors_delta) +{ + struct bkey_i_logged_op_finsert op; + s64 shift = insert ? len : -len; + + bkey_logged_op_finsert_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.dst_offset = cpu_to_le64(offset + shift); + op.v.src_offset = cpu_to_le64(offset); + op.v.pos = cpu_to_le64(insert ? 
U64_MAX : offset); + + return bch2_trans_run(c, + bch2_logged_op_start(&trans, &op.k_i) ?: + __bch2_resume_logged_op_finsert(&trans, &op.k_i, i_sectors_delta)); +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 1b792451fff2..c9e6ed40e1b8 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -19,6 +19,16 @@ void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); + +void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_finsert_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); + int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); #endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h index b2f2ebea54b6..4d1e786a27a8 100644 --- a/fs/bcachefs/logged_ops.h +++ b/fs/bcachefs/logged_ops.h @@ -5,7 +5,8 @@ #include "bkey.h" #define BCH_LOGGED_OPS() \ - x(truncate) + x(truncate) \ + x(finsert) static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) { -- cgit From feb5cc398120ce09fd7c72d361b3d14d9e280b96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Sep 2023 20:44:33 -0400 Subject: bcachefs: trace_read_nopromote() Add a tracepoint to print the reason a read wasn't promoted. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 9 ++++++++- fs/bcachefs/io_read.c | 38 +++++++++++++++++++++----------------- fs/bcachefs/trace.h | 19 +++++++++++++++++++ 3 files changed, 48 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 379d9d7ed333..56b6ce278648 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -218,7 +218,14 @@ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) + x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ + x(0, nopromote) \ + x(BCH_ERR_nopromote, nopromote_may_not) \ + x(BCH_ERR_nopromote, nopromote_already_promoted) \ + x(BCH_ERR_nopromote, nopromote_unwritten) \ + x(BCH_ERR_nopromote, nopromote_congested) \ + x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cd62bf730396..5ff430e1e244 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -87,33 +87,30 @@ static const struct rhashtable_params bch_promote_params = { .key_len = sizeof(struct bpos), }; -static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, +static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, unsigned flags) { - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; + BUG_ON(!opts.promote_target); - if (!opts.promote_target) - return false; + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) - return false; + return -BCH_ERR_nopromote_already_promoted; if (bkey_extent_is_unwritten(k)) - return false; + return 
-BCH_ERR_nopromote_unwritten; - if (bch2_target_congested(c, opts.promote_target)) { - /* XXX trace this */ - return false; - } + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) - return false; + return -BCH_ERR_nopromote_in_flight; - return true; + return 0; } static void promote_free(struct bch_fs *c, struct promote_op *op) @@ -264,21 +261,28 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); struct promote_op *promote; + int ret; - if (!should_promote(c, k, pos, opts, flags)) - return NULL; + ret = should_promote(c, k, pos, opts, flags); + if (ret) + goto nopromote; promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); - if (!promote) - return NULL; + if (!promote) { + ret = -BCH_ERR_nopromote_enomem; + goto nopromote; + } *bounce = true; *read_full = promote_full; return promote; +nopromote: + trace_read_nopromote(c, ret); + return NULL; } /* Read */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 97fe774237d0..19264492151b 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote, TP_ARGS(bio) ); +TRACE_EVENT(read_nopromote, + TP_PROTO(struct bch_fs *c, int ret), + TP_ARGS(c, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%d,%d ret %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ret) +); + DEFINE_EVENT(bio, read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) -- cgit From 7cb0e6992e5c56c2751e08c8161203ba0566f510 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 13:37:40 +0100 Subject: bcachefs: remove redundant initialization of pointer d The pointer d is being initialized with a value that is never read, it is being re-assigned later on when it is used in a for-loop. The initialization is redundant and can be removed. Cleans up clang-scan build warning: fs/bcachefs/buckets.c:1303:25: warning: Value stored to 'd' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c5b571f8333c..c8d0942650f1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -143,7 +143,7 @@ static size_t btree_node_u64s_with_format(struct btree *b, } /** - * btree_node_format_fits - check if we could rewrite node with a new format + * bch2_btree_node_format_fits - check if we could rewrite node with a new format * * This assumes all keys can pack with the new format -- it just checks if * the re-packed keys would fit inside the node itself. diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 77ded7c5a22c..78139f73636c 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1300,7 +1300,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, static int warned_disk_usage = 0; bool warn = false; unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; -- cgit From 519d6c884509b639c8102f4456ca91e354ae1205 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 13:37:41 +0100 Subject: bcachefs: remove redundant initialization of pointer dst The pointer dst is being initialized with a value that is never read; it is re-assigned later on when it is used in a while-loop. The initialization is redundant and can be removed. Cleans up clang-scan build warning: fs/bcachefs/disk_groups.c:186:30: warning: Value stored to 'dst' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index f36472c4a781..9fa8d7d49f3e 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -183,8 +183,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { struct bch_member *m = mi->members + i; - struct bch_disk_group_cpu *dst = - &cpu_g->entries[BCH_MEMBER_GROUP(m)]; + struct bch_disk_group_cpu *dst; if (!bch2_member_exists(m)) continue; -- cgit From c04cbc0dfd261bb8310cde82996e0360e4c38d39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 13:37:42 +0100 Subject: bcachefs: remove redundant initializations of variables start_offset and end_offset The variables start_offset and end_offset are being initialized with values that are never read, as they are re-assigned later on. The initializations are redundant and can be removed. Cleans up clang-scan build warnings: fs/bcachefs/fs-io.c:243:11: warning: Value stored to 'start_offset' during its initialization is never read [deadcode.DeadStores] fs/bcachefs/fs-io.c:244:11: warning: Value stored to 'end_offset' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b36513eb3d16..ffe9206e4c18 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -241,8 +241,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_folio *s; - unsigned start_offset = start & (PAGE_SIZE - 1); - unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned start_offset; + unsigned end_offset; unsigned i; struct folio *folio; s64 i_sectors_delta = 0; -- cgit From 2a831e4ba9b09e1b30384b65eca7c2ac33aa69b2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 13:37:43 +0100 Subject: bcachefs: remove duplicated assignment to variable offset_into_extent Variable offset_into_extent is being assigned to zero, and a few statements later it is re-assigned again to the same value. The second assignment is redundant and can be removed.
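For illustration, this is the general shape of code that the deadcode.DeadStores checker flags, shown as a standalone sketch with invented names rather than the actual bcachefs code:

	static int square(int n) { return n * n; }

	int example(int n)
	{
		int x = square(n);	/* dead store: overwritten before it is ever read */

		x = square(n + 1);	/* only this value is used */
		return x;
	}
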
Cleans up clang-scan build warning: fs/bcachefs/io.c:2722:3: warning: Value stored to 'offset_into_extent' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 5ff430e1e244..3c6c1396915a 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -897,7 +897,6 @@ retry_pick: pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); - offset_into_extent = 0; } get_bio: if (rbio) { -- cgit From 74c1e4221b6630e864d1efe4e718538dcee5d816 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 13:37:44 +0100 Subject: bcachefs: remove redundant pointer q The pointer q is being assigned a value but it is never read. The assignment and pointer are redundant and can be removed. Cleans up clang scan build warning: fs/bcachefs/quota.c:813:2: warning: Value stored to 'q' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/quota.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 60d27f726519..f16aa3bc9679 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -786,7 +786,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, { struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; - struct bch_memquota_type *q; int ret = 0; if (0) { @@ -810,8 +809,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) return -EINVAL; - q = &c->quotas[type]; - mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { -- cgit From 6bf3766b5211dde53e5a2f88082d71073ea6e7df Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Sep 2023 09:25:27 +0100 Subject: bcachefs: Fix a handful of spelling mistakes in various messages There are several spelling mistakes in error messages. Fix these. 
Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/snapshot.c | 2 +- fs/bcachefs/super-io.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8747c5e19f99..bec62e5b21e5 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -357,7 +357,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ int ret = 0; if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - "backpointer for mising device:\n%s", + "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1dbb4d7dfb45..8d089bbdb1e5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1495,7 +1495,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); + panic("trans path overflow\n"); } static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f5f6eea2cbae..b9dd858fc299 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -562,7 +562,7 @@ static void check_version_upgrade(struct bch_fs *c) if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) prt_str(&buf, "fsck required"); else { - prt_str(&buf, "running recovery passses: "); + prt_str(&buf, "running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9bab9860b20b..cfd70d6dea62 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1385,7 +1385,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { - bch_err_msg(c, ret, "error deleleting dead snapshots: error going rw"); + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); return ret; } } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 5a1115396edc..d908b62c19f7 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -384,7 +384,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { - prt_printf(out, "Bad intenal UUID (got zeroes)"); + prt_printf(out, "Bad internal UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; } -- cgit From fac1250a8cc3af0e45c07ad59d7e1eabf5213688 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:38 -0700 Subject: bcachefs: Fix -Wformat in bch2_set_bucket_needs_journal_commit() When building bcachefs for 32-bit ARM, there is a compiler warning in bch2_set_bucket_needs_journal_commit() due to a debug print using the wrong specifier: fs/bcachefs/buckets_waiting_for_journal.c:137:30: error: format specifies type 'size_t' (aka 'unsigned int') but the argument has type 'unsigned long' [-Werror,-Wformat] 136 | pr_debug("took %zu rehashes, table at %zu/%zu elements", | ~~~ | %lu 137 | nr_rehashes, nr_elements, 1UL << b->t->bits); | ^~~~~~~~~~~~~~~~~ include/linux/printk.h:579:26: note: expanded from macro 'pr_debug' 579 | dynamic_pr_debug(fmt, ##__VA_ARGS__) 
| ~~~ ^~~~~~~~~~~ include/linux/dynamic_debug.h:270:22: note: expanded from macro 'dynamic_pr_debug' 270 | pr_fmt(fmt), ##__VA_ARGS__) | ~~~ ^~~~~~~~~~~ include/linux/dynamic_debug.h:250:59: note: expanded from macro '_dynamic_func_call' 250 | _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__) | ^~~~~~~~~~~ include/linux/dynamic_debug.h:248:65: note: expanded from macro '_dynamic_func_call_cls' 248 | __dynamic_func_call_cls(__UNIQUE_ID(ddebug), cls, fmt, func, ##__VA_ARGS__) | ^~~~~~~~~~~ include/linux/dynamic_debug.h:224:15: note: expanded from macro '__dynamic_func_call_cls' 224 | func(&id, ##__VA_ARGS__); \ | ^~~~~~~~~~~ 1 error generated. On 64-bit architectures, size_t is 'unsigned long', so there is no warning when using %zu but on 32-bit architectures, size_t is 'unsigned int'. Use the correct specifier to resolve the warning. Fixes: 7a82e75ddaef ("bcachefs: New data structure for buckets waiting on journal commit") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index 81ab685cdef9..ec1b636ef78d 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -133,7 +133,7 @@ retry_rehash: b->t = n; kvfree(t); - pr_debug("took %zu rehashes, table at %zu/%zu elements", + pr_debug("took %zu rehashes, table at %zu/%lu elements", nr_rehashes, nr_elements, 1UL << b->t->bits); out: mutex_unlock(&b->lock); -- cgit From f7ed15eb177ffd55e97c5817e2ccaacc364be4cd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:39 -0700 Subject: bcachefs: Fix -Wformat in bch2_btree_key_cache_to_text() When building bcachefs for 32-bit ARM, there is a compiler warning in bch2_btree_key_cache_to_text() due to use of an incorrect format specifier: fs/bcachefs/btree_key_cache.c:1060:36: error: format specifies type 'size_t' (aka 'unsigned int') but the argument has type 'long' [-Werror,-Wformat] 1060 | prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); | ~~~ ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | %ld fs/bcachefs/util.h:223:54: note: expanded from macro 'prt_printf' 223 | #define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) | ^~~~~~~~~~~ 1 error generated. On 64-bit architectures, size_t is 'unsigned long', so there is no warning when using %zu but on 32-bit architectures, size_t is 'unsigned int'. Use '%lu' to match the other format specifiers used in this function for printing values returned from atomic_long_read(). 
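For reference, the underlying printf rule, shown as a small standalone userspace sketch (illustrative only, not kernel code): %zu pairs with size_t and %lu with unsigned long, and the two types only happen to coincide on 64-bit targets.

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		size_t sz = sizeof(long);	/* size_t: always printed with %zu */
		unsigned long nr = 42UL;	/* unsigned long: printed with %lu */

		printf("sizeof(long) = %zu\n", sz);
		printf("nr = %lu\n", nr);	/* using %zu here warns on 32-bit */
		return 0;
	}
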
Fixes: 6d799930ce0f ("bcachefs: btree key cache pcpu freedlist") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 505e7c365ab7..a74ee6d8a7cf 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -1053,7 +1053,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); prt_newline(out); prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); prt_newline(out); -- cgit From 14f63ff3f6617902cd54edb468b906214ab00f34 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:40 -0700 Subject: bcachefs: Fix -Wformat in bch2_alloc_v4_invalid() When building bcachefs for 32-bit ARM, there is a compiler warning in bch2_alloc_v4_invalid() due to use of an incorrect format specifier: fs/bcachefs/alloc_background.c:246:30: error: format specifies type 'unsigned long' but the argument has type 'unsigned int' [-Werror,-Wformat] 245 | prt_printf(err, "bad val size (%u > %lu)", | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | %u 246 | alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~ fs/bcachefs/bkey.h:58:27: note: expanded from macro 'bkey_val_u64s' 58 | #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) | ^ fs/bcachefs/util.h:223:54: note: expanded from macro 'prt_printf' 223 | #define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) | ^~~~~~~~~~~ This expression is of type 'size_t'. On 64-bit architectures, size_t is 'unsigned long', so there is no warning when using %lu but on 32-bit architectures, size_t is 'unsigned int'. Use '%zu', the format specifier for 'size_t' to eliminate the warning. 
Fixes: 11be8e8db283 ("bcachefs: New on disk format: Backpointers") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5c60f956b598..a012fffb1dc4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -242,7 +242,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%u > %lu)", + prt_printf(err, "bad val size (%u > %zu)", alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); return -BCH_ERR_invalid_bkey; } -- cgit From 0940863fd2186c521d91aaf58b28d872fb1bba6c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:41 -0700 Subject: bcachefs: Fix -Wformat in bch2_bucket_gens_invalid() When building bcachefs for 32-bit ARM, there is a compiler warning in bch2_bucket_gens_invalid() due to use of an incorrect format specifier: fs/bcachefs/alloc_background.c:530:10: error: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Werror,-Wformat] 529 | prt_printf(err, "bad val size (%lu != %zu)", | ~~~ | %zu 530 | bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); | ^~~~~~~~~~~~~~~~~~~ fs/bcachefs/util.h:223:54: note: expanded from macro 'prt_printf' 223 | #define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) | ^~~~~~~~~~~ On 64-bit architectures, size_t is 'unsigned long', so there is no warning when using %lu but on 32-bit architectures, size_t is 'unsigned int'. Use '%zu', the format specifier for 'size_t', to eliminate the warning. 
Fixes: 4be0d766a7e9 ("bcachefs: bucket_gens btree") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a012fffb1dc4..ee21aeece39e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -526,7 +526,7 @@ int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { - prt_printf(err, "bad val size (%lu != %zu)", + prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); return -BCH_ERR_invalid_bkey; } -- cgit From 1f70225d7791e67084073e54489440d7cf8017e0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:42 -0700 Subject: bcachefs: Fix -Wincompatible-function-pointer-types-strict from key_invalid callbacks When building bcachefs with -Wincompatible-function-pointer-types-strict, a clang warning designed to catch issues with mismatched function pointer types, which will be fatal at runtime due to kernel Control Flow Integrity (kCFI), there are several instances along the lines of: fs/bcachefs/bkey_methods.c:118:2: error: incompatible function pointer types initializing 'int (*)(const struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *)' with an expression of type 'int (const struct bch_fs *, struct bkey_s_c, unsigned int, struct printbuf *)' [-Werror,-Wincompatible-function-pointer-types-strict] 118 | BCH_BKEY_TYPES() | ^~~~~~~~~~~~~~~~ fs/bcachefs/bcachefs_format.h:342:2: note: expanded from macro 'BCH_BKEY_TYPES' 342 | x(deleted, 0) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~ fs/bcachefs/bkey_methods.c:117:41: note: expanded from macro 'x' 117 | #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, | ^~~~~~~~~~~~~~~~~~~~ :206:1: note: expanded from here 206 | bch2_bkey_ops_deleted | ^~~~~~~~~~~~~~~~~~~~~ fs/bcachefs/bkey_methods.c:34:17: note: expanded from macro 'bch2_bkey_ops_deleted' 34 | .key_invalid = deleted_key_invalid, \ | ^~~~~~~~~~~~~~~~~~~ The flags parameter should be of type 'enum bkey_invalid_flags', not 'unsigned int'. Adjust the type everywhere so that there is no more warning. 
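To make the failure mode concrete, here is a minimal sketch with invented names (not the actual bcachefs bkey_ops table): kCFI checks that the callee's prototype matches the function pointer's type at the indirect call site, so a plain 'unsigned' parameter is not interchangeable with an enum parameter.

	enum my_flags { MY_FLAG_A = 1 };

	struct my_ops {
		int (*check)(int val, enum my_flags flags);
	};

	/* Matches the pointer's prototype exactly; a callee declared with
	 * 'unsigned flags' instead would trip the strict warning and, under
	 * kCFI, trap when called through .check. */
	static int check_strict(int val, enum my_flags flags)
	{
		return val && (flags & MY_FLAG_A);
	}

	static const struct my_ops ops = { .check = check_strict };

	int check_example(int val)
	{
		return ops.check(val, MY_FLAG_A);	/* indirect call verified by kCFI */
	}
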
Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 10 +++++----- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/subvolume.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 91e28ee3efff..82f30ffbfb86 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -26,7 +26,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -40,7 +40,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", @@ -56,7 +56,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -71,7 +71,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -92,7 +92,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index b6015a8060ec..ca03d585a2fa 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -98,7 +98,7 @@ int bch2_check_subvols(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, SUBVOL_POS_MAX)) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 8d4c50f4cd05..bb14f92e8687 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -10,7 +10,7 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ -- cgit From 53eda6f7130adb194cb3b089bc38fc32d9a1f7d5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:43 -0700 Subject: bcachefs: Fix -Wcompare-distinct-pointer-types in do_encrypt() When building bcachefs for 32-bit ARM, there is a warning when using min() to compare a variable of type 'size_t' with an expression of type 'unsigned long': fs/bcachefs/checksum.c:142:22: error: comparison of distinct pointer types ('typeof (len) *' (aka 'unsigned int *') and 'typeof (((1UL) << 12) - offset) *' (aka 'unsigned long *')) [-Werror,-Wcompare-distinct-pointer-types] 142 | 
unsigned pg_len = min(len, PAGE_SIZE - offset); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/minmax.h:69:19: note: expanded from macro 'min' 69 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~~~~~~~~~~ include/linux/minmax.h:38:24: note: expanded from macro '__careful_cmp' 38 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~~~~~~~ include/linux/minmax.h:28:4: note: expanded from macro '__safe_cmp' 28 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~~~~~~~ include/linux/minmax.h:22:28: note: expanded from macro '__typecheck' 22 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~ 1 error generated. On 64-bit architectures, size_t is 'unsigned long', so there is no warning when comparing these two expressions. Use min_t(size_t, ...) for this situation, eliminating the warning. Fixes: 1fb50457684f ("bcachefs: Fix memory corruption in encryption path") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 36939020f67d..ff0c3cd39ee2 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, for (i = 0; i < pages; i++) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min(len, PAGE_SIZE - offset); + unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); buf += pg_len; -- cgit From e82f5f40f2b936063361812cad9338ce792dde2f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 12 Sep 2023 12:15:44 -0700 Subject: bcachefs: Fix -Wcompare-distinct-pointer-types in bch2_copygc_get_buckets() When building bcachefs for 32-bit ARM, there is a warning when using max() to compare an expression involving 'size_t' with an 'unsigned long' literal: fs/bcachefs/movinggc.c:159:21: error: comparison of distinct pointer types ('typeof (16UL) *' (aka 'unsigned long *') and 'typeof (buckets_in_flight->nr / 4) *' (aka 'unsigned int *')) [-Werror,-Wcompare-distinct-pointer-types] 159 | size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/minmax.h:76:19: note: expanded from macro 'max' 76 | #define max(x, y) __careful_cmp(x, y, >) | ^~~~~~~~~~~~~~~~~~~~~~ include/linux/minmax.h:38:24: note: expanded from macro '__careful_cmp' 38 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~~~~~~~ include/linux/minmax.h:28:4: note: expanded from macro '__safe_cmp' 28 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~~~~~~~ include/linux/minmax.h:22:28: note: expanded from macro '__typecheck' 22 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~ 1 error generated. On 64-bit architectures, size_t is 'unsigned long', so there is no warning when comparing these two expressions. Use max_t(size_t, ...) for this situation, eliminating the warning. 
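As a standalone sketch of the underlying issue (the macro below is a simplified stand-in, not the real include/linux/minmax.h): the kernel's min()/max() require both operands to have the same type, and size_t matches unsigned long only on 64-bit targets, which is why min_t()/max_t() are used to cast both operands to one named type.

	#include <stddef.h>

	/* Simplified illustration of the kernel helper, for this sketch only. */
	#define max_t(type, x, y) ((type)(x) > (type)(y) ? (type)(x) : (type)(y))

	size_t nr_to_get(size_t nr_in_flight)
	{
		/* max(16UL, nr_in_flight / 4) would mix unsigned long with size_t,
		 * which are distinct types on 32-bit; casting both to size_t is safe. */
		return max_t(size_t, 16U, nr_in_flight / 4);
	}
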
Fixes: dd49018737d4 ("bcachefs: Rhashtable based buckets_in_flight for copygc") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 2371fd61ea58..83ebb56a3fae 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -148,7 +148,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; -- cgit From 7bba0dc6fc1e18400fe247d6385b1ce8ef566397 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Sep 2023 18:41:09 -0400 Subject: bcachefs: Add a missing prefetch include Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 219bc1124477..4c61cb18c366 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -12,6 +12,7 @@ #include #include +#include #include /* O_DIRECT reads */ -- cgit From b5e85d4d0ccf819df1ee73db41bf388ddd6e1830 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 13 Sep 2023 08:57:56 +0800 Subject: bcachefs: Remove unneeded semicolon ./fs/bcachefs/btree_gc.c:1249:2-3: Unneeded semicolon ./fs/bcachefs/btree_gc.c:1521:2-3: Unneeded semicolon ./fs/bcachefs/btree_gc.c:1575:2-3: Unneeded semicolon ./fs/bcachefs/counters.c:46:2-3: Unneeded semicolon Signed-off-by: Yang Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 6 +++--- fs/bcachefs/counters.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 844ac0024683..3c8ffbbaef4f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1246,7 +1246,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } - }; + } { unsigned nr = fs_usage_u64s(c); @@ -1518,7 +1518,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; rcu_assign_pointer(ca->buckets_gc, buckets); - }; + } bch2_trans_init(&trans, c, 0, 0); @@ -1572,7 +1572,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) g->dirty_sectors = 0; g->cached_sectors = 0; } - }; + } } static int bch2_gc_write_reflink_key(struct btree_trans *trans, diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index 442a9b806a3c..26eb3d82b1cb 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -43,7 +43,7 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, prt_tab(out); prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); prt_newline(out); - }; + } }; int bch2_sb_counters_to_cpu(struct bch_fs *c) -- cgit From 96dea3d599dbc31f59eb786af2ac5079122beb88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Sep 2023 18:41:22 -0400 Subject: bcachefs: Fix W=12 build errors Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 17 ++++----- fs/bcachefs/alloc_foreground.c | 15 +++++--- fs/bcachefs/backpointers.c | 5 +-- fs/bcachefs/bcachefs.h | 2 +- 
fs/bcachefs/bcachefs_format.h | 15 ++++---- fs/bcachefs/bkey.c | 31 ++++++++++----- fs/bcachefs/bkey_methods.c | 6 ++- fs/bcachefs/bset.c | 12 ++++-- fs/bcachefs/btree_cache.c | 15 ++++++-- fs/bcachefs/btree_gc.c | 26 ++++++------- fs/bcachefs/btree_io.c | 36 +++++------------- fs/bcachefs/btree_iter.c | 37 +++++++++++++----- fs/bcachefs/btree_iter.h | 54 +++++++++++++------------- fs/bcachefs/btree_key_cache.c | 2 - fs/bcachefs/btree_trans_commit.c | 10 ++--- fs/bcachefs/btree_types.h | 34 ++++++++--------- fs/bcachefs/btree_update.c | 20 ++++++---- fs/bcachefs/btree_update.h | 6 +-- fs/bcachefs/btree_update_interior.c | 58 +++++++++++++--------------- fs/bcachefs/checksum.c | 12 +++--- fs/bcachefs/checksum.h | 5 +-- fs/bcachefs/compress.c | 7 +--- fs/bcachefs/data_update.c | 4 -- fs/bcachefs/debug.c | 6 +-- fs/bcachefs/disk_groups.c | 12 +++--- fs/bcachefs/errcode.c | 2 - fs/bcachefs/fs-io-buffered.c | 75 ++++++++++++++++++------------------- fs/bcachefs/fs-io-pagecache.c | 23 +++++++----- fs/bcachefs/fs-ioctl.h | 6 +-- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 21 +++++++---- fs/bcachefs/inode.c | 3 +- fs/bcachefs/io_write.c | 14 ++++--- fs/bcachefs/journal.c | 9 ++++- fs/bcachefs/journal_io.c | 27 +++++++------ fs/bcachefs/journal_reclaim.c | 11 ++++-- fs/bcachefs/move.c | 1 - fs/bcachefs/movinggc.c | 26 ++++++------- fs/bcachefs/opts.h | 2 +- fs/bcachefs/printbuf.c | 66 ++++++++++++++++++-------------- fs/bcachefs/recovery.c | 16 ++------ fs/bcachefs/reflink.c | 4 ++ fs/bcachefs/six.c | 1 - fs/bcachefs/snapshot.c | 10 ++--- fs/bcachefs/super-io.c | 4 +- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 30 +-------------- fs/bcachefs/tests.c | 57 +++++++++++++--------------- fs/bcachefs/util.c | 21 +++++------ fs/bcachefs/util.h | 6 +-- fs/bcachefs/varint.c | 24 +++++++----- 51 files changed, 459 insertions(+), 451 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ee21aeece39e..4eab7e59ae93 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1200,15 +1200,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, } if (need_update) { - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - memcpy(k, &g, sizeof(g)); + memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) goto err; } @@ -1354,15 +1354,14 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } if (need_update) { - struct bkey_i *k; + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - k = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto out; - memcpy(k, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, k, 0); + memcpy(u, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, u, 0); } out: fsck_err: diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 8e1888a89011..e73b6c82870a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -502,9 +502,14 @@ again: } /** - * bch_bucket_alloc - allocate a single bucket from a specific device + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object + * @ca: device to allocate from + * @watermark: how important is this allocation? 
+ * @cl: if not NULL, closure to be used to wait if buckets not available + * @usage: for secondarily also returning the current device usage * - * Returns index of bucket on success, 0 on failure + * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, @@ -775,7 +780,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; - struct bch_dev *ca; unsigned i, ec_idx; int ret = 0; @@ -805,8 +809,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->dev); - ob->ec_idx = ec_idx; ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); @@ -1032,10 +1034,13 @@ static int open_bucket_add_buckets(struct btree_trans *trans, /** * should_drop_bucket - check if this is open_bucket should go away + * @ob: open_bucket to predicate on + * @c: filesystem handle * @ca: if set, we're killing buckets for a particular device * @ec: if true, we're shutting down erasure coding and killing all ec * open_buckets * otherwise, return true + * Returns: true if we should kill this open_bucket * * We're killing open_buckets because we're shutting down a device, erasure * coding, or the entire filesystem - check if this open_bucket matches: diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index bec62e5b21e5..82109585439b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -351,7 +351,6 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bch_dev *ca; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; @@ -363,8 +362,6 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bp_pos_to_bucket(c, k.k->p), 0); ret = bkey_err(alloc_k); @@ -629,7 +626,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX }; + struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; int ret = 0; for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e80fef1537c9..9fe3dac4a005 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -371,7 +371,7 @@ BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM #ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; BCH_DEBUG_PARAMS_DEBUG() #undef BCH_DEBUG_PARAM #endif diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c434202f351a..f0d130440baa 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -83,8 +83,8 @@ typedef uuid_t __uuid_t; #endif #define BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ \ static inline __u64 name(const type *k) \ { \ @@ -98,9 
+98,9 @@ static inline void SET_##name(type *k, __u64 v) \ } #define LE_BITMASK(_bits, name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ +static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ \ static inline __u64 name(const type *k) \ { \ @@ -1668,7 +1668,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_max }; -static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; +static const __maybe_unused +unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1975,7 +1976,7 @@ enum bch_csum_type { BCH_CSUM_NR }; -static const unsigned bch_crc_bytes[] = { +static const __maybe_unused unsigned bch_crc_bytes[] = { [BCH_CSUM_none] = 0, [BCH_CSUM_crc32c_nonzero] = 4, [BCH_CSUM_crc32c] = 4, diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index a3abd9d2d176..abdb05507d16 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -308,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format, /** * bch2_bkey_pack_key -- pack just the key, not the value + * @out: packed result + * @in: key to pack + * @format: format of packed result + * + * Returns: true on success, false on failure */ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) + const struct bkey_format *format) { struct pack_state state = pack_state_init(format, out); u64 *w = out->_data; @@ -336,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, /** * bch2_bkey_unpack -- unpack the key and the value + * @b: btree node of @src key (for packed format) + * @dst: unpacked result + * @src: packed input */ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) + const struct bkey_packed *src) { __bkey_unpack_key(b, &dst->k, src); @@ -349,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, /** * bch2_bkey_pack -- pack the key and the value + * @dst: packed result + * @src: unpacked input + * @format: format of packed result + * + * Returns: true on success, false on failure */ -bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, - const struct bkey_format *format) +bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, + const struct bkey_format *format) { struct bkey_packed tmp; - if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + if (!bch2_bkey_pack_key(&tmp, &src->k, format)) return false; - memmove_u64s((u64 *) out + format->key_u64s, - &in->v, - bkey_val_u64s(&in->k)); - memcpy_u64s_small(out, &tmp, format->key_u64s); + memmove_u64s((u64 *) dst + format->key_u64s, + &src->v, + bkey_val_u64s(&src->k)); + memcpy_u64s_small(dst, &tmp, format->key_u64s); return true; } diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 82f30ffbfb86..be9f012fc7be 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -369,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, { const struct bkey_ops *ops; struct bkey uk; - struct bkey_s u; unsigned nr_compat = 5; int i; @@ -434,7 +433,9 @@ void 
__bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; - case 4: + case 4: { + struct bkey_s u; + if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { @@ -451,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); break; + } default: BUG(); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 685792137d2a..cff7486ef446 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k2); printk(" [%zi %zi]", t - b->set, - k->_data - bset(b, t)->_data); + k2->_data - bset(b, t)->_data); } panic("\n"); } @@ -1269,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, } /** - * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a * given position * + * @iter: iterator to initialize + * @b: btree node to search + * @search: search key + * * Main entry point to the lookup code for individual btree nodes: * * NOTE: diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 245ddd92b2d1..ef9492f7e937 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -885,7 +885,7 @@ retry: } if (unlikely(need_relock)) { - int ret = bch2_trans_relock(trans) ?: + ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { six_unlock_type(&b->c.lock, lock_type); @@ -916,11 +916,20 @@ retry: } /** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * + * @trans: btree transaction object + * @path: btree_path being traversed + * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) + * @level: level of btree node being looked up (0 == leaf node) + * @lock_type: SIX_LOCK_read or SIX_LOCK_intent + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) + * * The btree node will have either a read or a write lock held, depending on * the @write parameter. 
+ * + * Returns: btree node or ERR_PTR() */ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, @@ -979,7 +988,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * * relock it specifically: */ if (trans) { - int ret = bch2_trans_relock(trans) ?: + ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { BUG_ON(!trans->restarted); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3c8ffbbaef4f..9496ff16fc91 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -566,8 +566,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bkey_s_c *k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry_c; struct extent_ptr_decoded p = { 0 }; bool do_update = false; struct printbuf buf = PRINTBUF; @@ -577,10 +577,10 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * XXX * use check_bucket_ref here */ - bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); if (!g->gen_valid && (c->opts.reconstruct_alloc || @@ -1217,14 +1217,6 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f -#define copy_stripe_field(_f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u", \ - iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f))) \ - dst->_f = src->_f #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ @@ -1776,6 +1768,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) /** * bch2_gc - walk _all_ references to buckets, and recompute them: * + * @c: filesystem object + * @initial: are we in recovery? + * @metadata_only: are we just checking metadata references, or everything? 
+ * + * Returns: 0 on success, or standard errcode on failure + * * Order matters here: * - Concurrent GC relies on the fact that we have a total ordering for * everything that GC walks - see gc_will_visit_node(), @@ -1985,11 +1983,9 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { - struct btree_iter iter; - struct bkey_s_c k; - c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; + ret = for_each_btree_key_commit(&trans, iter, i, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 00f53cb5d44b..9fa9ed641300 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -336,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_bset->journal_seq = cpu_to_le64(seq); if (sorting_entire_node) { - unsigned u64s = le16_to_cpu(out->keys.u64s); + u64s = le16_to_cpu(out->keys.u64s); BUG_ON(bytes != btree_bytes(c)); @@ -410,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_verify_btree_nr_keys(dst); } -#define SORT_CRIT (4096 / sizeof(u64)) - /* * We're about to add another bset to the btree node, so if there's currently * too many bsets - sort some of them together: @@ -542,6 +540,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } +__printf(8, 9) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, @@ -622,9 +621,6 @@ __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; for_each_bset(b, t) { struct bset *i = bset(b, t); @@ -660,6 +656,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); BUG_ON(bpos_gt(k.k->p, b->data->max_key)); @@ -908,7 +907,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; @@ -1042,8 +1040,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); - - nonblacklisted_written = b->written; } if (ptr_written) { @@ -1061,18 +1057,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true), -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "found bset signature after last bset"); - - /* - * Blacklisted bsets are those that were written after the most recent - * (flush) journal write. 
Since there wasn't a flush, they may not have - * made it to all devices - which means we shouldn't write new bsets - * after them, as that could leave a gap and then reads from that device - * wouldn't find all the bsets in that btree node - which means it's - * important that we start writing new bsets after the most recent _non_ - * blacklisted bset: - */ - blacklisted_written = b->written; - b->written = nonblacklisted_written; } sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1140,9 +1124,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_rw) + if (ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } @@ -1224,19 +1208,17 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); - printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) { - struct printbuf buf = PRINTBUF; - + printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); - printbuf_exit(&buf); bch2_btree_node_rewrite_async(c, b); } + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8d089bbdb1e5..6c064e82c0c8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -488,7 +488,6 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && b->c.level) { - struct bset_tree *t; struct bkey_packed *k, *k2, *p; k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -2048,8 +2047,12 @@ out: } /** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position + * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) { @@ -2186,10 +2189,13 @@ end: } /** - * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal - * to iterator's current position, returning keys from every level of the btree. - * For keys at different levels of the btree that compare equal, the key from - * the lower level (leaf) is returned first. + * bch2_btree_iter_peek_all_levels() - returns the first key greater than or + * equal to iterator's current position, returning keys from every level of the + * btree. For keys at different levels of the btree that compare equal, the key + * from the lower level (leaf) is returned first. + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) { @@ -2280,8 +2286,11 @@ out_no_locked: } /** - * bch2_btree_iter_next: returns first key greater than iterator's current + * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). 
*/ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { @@ -2292,8 +2301,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) } /** - * bch2_btree_iter_peek_prev: returns first key less than or equal to + * bch2_btree_iter_peek_prev() - returns first key less than or equal to * iterator's current position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { @@ -2416,8 +2428,11 @@ out_no_locked: } /** - * bch2_btree_iter_prev: returns first key less than iterator's current + * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { @@ -2832,6 +2847,8 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset * + * Returns: current restart counter, to be used with trans_was_restarted() + * * While iterating over nodes or updating nodes a attempt to lock a btree node * may return BCH_ERR_transaction_restart when the trylock fails. When this * occurs bch2_trans_begin() should be called and the transaction retried. diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b885e4e210d4..360a26b58501 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -674,17 +674,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count; \ - int _ret; \ + int _ret2; \ \ do { \ _restart_count = bch2_trans_begin(_trans); \ - _ret = (_do); \ - } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ + _ret2 = (_do); \ + } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret; \ + _ret2; \ }) /* @@ -699,23 +699,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define nested_lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count, _orig_restart_count; \ - int _ret; \ + int _ret2; \ \ _restart_count = _orig_restart_count = (_trans)->restart_count; \ \ - while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ + while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ _restart_count = bch2_trans_begin(_trans); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) #define for_each_btree_key2(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -723,15 +723,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ if (!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if 
(!bch2_btree_iter_advance(&(_iter))) \ @@ -739,13 +739,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -753,15 +753,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ if (!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ @@ -769,13 +769,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -784,14 +784,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, u32 _restart_count = bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ if (!(_k).k) { \ - _ret = 0; \ + _ret3 = 0; \ break; \ } \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_rewind(&(_iter))) \ @@ -799,7 +799,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_commit(_trans, _iter, _btree_id, \ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index a74ee6d8a7cf..784f889340cd 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -242,8 +242,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } if (ck) { - int ret; - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index eafb0388ef82..e3a0b101cbf8 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -214,7 +214,11 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, } /** - * btree_insert_key - insert a key one key into a leaf node + * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node + * @trans: btree transaction object + * @path: path pointing to @insert's pos + * @insert: key to insert + * @journal_seq: sequence number of journal reservation */ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, struct btree_path *path, @@ -555,7 +559,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; - bool marking = false; int 
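Most of the churn in these iteration macros is renaming their internal locals (_ret becomes _ret2 or _ret3), presumably to avoid shadowing: these are statement-expression macros that routinely nest (a for_each_btree_key2() body wrapped in lockrestart_do(), or a _do expression that declares its own ret), and giving each macro a distinct local keeps one expansion from shadowing another's. A stripped-down illustration with stand-in macros, not the real bcachefs ones:

	/* Both macros used to declare a local named _ret; with distinct names,
	 * nested expansions no longer shadow each other: */
	#define lockrestart_do_like(_do)	({ int _ret2 = (_do); _ret2; })
	#define for_each_key_like(_do)		({ int _ret3 = (_do); _ret3; })

	static int example(void)
	{
		/* the inner statement expression has its own local,
		 * distinct from the outer one's */
		return lockrestart_do_like(for_each_key_like(0));
	}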
ret; if (race_fault()) { @@ -584,9 +587,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, *stopped_at = i; return ret; } - - if (btree_node_type_needs_gc(i->bkey_type)) - marking = true; } if (trans->nr_wb_updates && diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 70398aaa095e..96a03f414dd0 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -184,34 +184,34 @@ struct btree_node_iter { /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -static const u16 BTREE_ITER_SLOTS = 1 << 0; -static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; +static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; +static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const u16 BTREE_ITER_CACHED = 1 << 6; -static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 823f0da2f502..3d126f043db0 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -681,15 +681,17 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback + * @k: key to insert + * @disk_res: must be non-NULL whenever inserting or potentially + * splitting data extents + * @flags: transaction commit flags + * + * Returns: 0 on success, error code on failure */ -int 
bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, + struct disk_reservation *disk_res, int flags) { - return bch2_trans_do(c, disk_res, journal_seq, flags, + return bch2_trans_do(c, disk_res, NULL, flags, bch2_btree_insert_trans(&trans, id, k, 0)); } @@ -847,6 +849,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, k); } +__printf(2, 0) static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; @@ -883,6 +886,7 @@ err: return ret; } +__printf(3, 0) static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) @@ -900,6 +904,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, return ret; } +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) { va_list args; @@ -915,6 +920,7 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) * Use for logging messages during recovery to enable reserved space and avoid * blocking. */ +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) { va_list args; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 0596c5e73a3e..0be980d16007 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -66,7 +66,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, u64 *, int flags); + struct disk_reservation *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); @@ -115,8 +115,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -int bch2_journal_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c8d0942650f1..bac495b382bb 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -145,8 +145,13 @@ static size_t btree_node_u64s_with_format(struct btree *b, /** * bch2_btree_node_format_fits - check if we could rewrite node with a new format * - * This assumes all keys can pack with the new format -- it just checks if - * the re-packed keys would fit inside the node itself. + * @c: filesystem handle + * @b: btree node to rewrite + * @new_f: bkey format to translate keys to + * + * Returns: true if all re-packed keys will be able to fit in a new node. + * + * Assumes all keys will successfully pack with the new format. 
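The __printf() annotations added here are the kernel's wrapper for the format(printf, ...) function attribute: the first number is the format-string argument, the second is the first variadic argument to check it against (0 for the va_list variants __bch2_trans_log_msg() and __bch2_fs_log_msg()). With the annotation in place, format/argument mismatches become compile-time diagnostics at every call site. Illustration (log_example() and nr_errors are made up for the example):

	__printf(2, 3)
	int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...);

	static void log_example(struct bch_fs *c, unsigned nr_errors)
	{
		bch2_fs_log_msg(c, "fixed %u errors", nr_errors);	/* ok */
		bch2_fs_log_msg(c, "fixed %u errors", "lots");		/* now a -Wformat warning */
	}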
*/ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, struct bkey_format *new_f) @@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct write_point *wp; struct btree *b; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets ob = { .nr = 0 }; + struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim @@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - ob = a->ob; + obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); goto mem_alloc; @@ -292,7 +297,7 @@ retry: bkey_btree_ptr_v2_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - bch2_open_bucket_get(c, wp, &ob); + bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(trans, interior_node); @@ -304,7 +309,7 @@ mem_alloc: BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->ob = ob; + b->ob = obs; return b; } @@ -697,15 +702,15 @@ err: * btree_interior_update_lock: */ if (as->b == b) { - struct bset *i = btree_bset_last(b); - BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); if (!ret) { - i->journal_seq = cpu_to_le64( + struct bset *last = btree_bset_last(b); + + last->journal_seq = cpu_to_le64( max(journal_seq, - le64_to_cpu(i->journal_seq))); + le64_to_cpu(last->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -1216,18 +1221,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -/** - * bch_btree_set_root - update the root in memory and on disk - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. However, you must hold an intent lock on the - * old root. - * - * Note: This allocates a journal entry but doesn't add any keys to - * it. All the btree roots are part of every journal write, so there - * is nothing new to be done. This just guarantees that there is a - * journal write. - */ static void bch2_btree_set_root(struct btree_update *as, struct btree_trans *trans, struct btree_path *path, @@ -1341,12 +1334,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, ; while (!bch2_keylist_empty(keys)) { - struct bkey_i *k = bch2_keylist_front(keys); + insert = bch2_keylist_front(keys); - if (bpos_gt(k->k.p, b->key.k.p)) + if (bpos_gt(insert->k.p, b->key.k.p)) break; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); bch2_keylist_pop_front(keys); } } @@ -1661,12 +1654,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } /** - * bch_btree_insert_node - insert bkeys into a given btree node + * bch2_btree_insert_node - insert bkeys into a given btree node * - * @iter: btree iterator + * @as: btree_update object + * @trans: btree_trans object + * @path: path that points to current node + * @b: node to insert keys into * @keys: list of keys to insert - * @hook: insert callback - * @persistent: if not null, @persistent will wait on journal write + * @flags: transaction commit flags + * + * Returns: 0 on success, typically transaction restart error on failure * * Inserts as many keys as it can into a given btree node, splitting it if full. 
* If a split occurred, this function will return early. This can only happen @@ -1934,9 +1931,6 @@ err_free_update: goto out; } -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - */ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ff0c3cd39ee2..3f385d499026 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -366,11 +366,11 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { - unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), b); - b_len -= b; + page_address(ZERO_PAGE(0)), page_len); + b_len -= page_len; } a.lo = (__le64 __force) bch2_checksum_final(&state); a.lo ^= b.lo; @@ -395,9 +395,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, unsigned csum_type; struct bch_csum csum; } splits[3] = { - { crc_a, len_a, new_csum_type }, - { crc_b, len_b, new_csum_type }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + { crc_a, len_a, new_csum_type, { 0 }}, + { crc_b, len_b, new_csum_type, { 0 } }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, }, *i; bool mergeable = crc_old.csum_type == new_csum_type && bch2_checksum_mergeable(new_csum_type); diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index c7b1a8fca685..779f175029a8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -40,10 +40,9 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, */ #define csum_vstruct(_c, _type, _nonce, _i) \ ({ \ - const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ - const void *end = vstruct_end(_i); \ + const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ \ - bch2_checksum(_c, _type, _nonce, start, end - start); \ + bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index f1651807c2b7..1480b64547b0 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -570,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; - bool decompress_workspace_needed; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); struct { @@ -580,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t decompress_workspace; } compression_types[] = { { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), + 0 }, { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, @@ -619,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!(features & (1 << i->feature))) continue; - if (i->decompress_workspace) - decompress_workspace_needed = true; - if (mempool_initialized(&c->compress_workspace[i->type])) continue; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 29576c4c109d..84ca128a59a3 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -49,10 +49,6 @@ static void trace_move_extent_fail2(struct data_update *m, if 
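A few hunks above only add explicit trailing initializers: the { 0 } csum members in splits[] and the 0 decompress_workspace for the lz4 entry. Omitted members of a partial initializer are already zero in C, so this is presumably just making the zeroing explicit to keep -Wmissing-field-initializers style warnings quiet, with no behaviour change. Generic illustration (struct example is invented for it):

	struct example {
		unsigned		type;
		unsigned		len;
		unsigned long long	csum;
	};

	/* Both zero .csum; the second just says so explicitly: */
	static struct example a = { 1, 8 };
	static struct example b = { 1, 8, 0 };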
(insert) { i = 0; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - struct bkey_s new_s; - new_s.k = (void *) new.k; - new_s.v = (void *) new.v; - if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5f3e65f9069e..7593ba04dfb2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -153,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) BUG_ON(b->nsets != 1); for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); - v->mem_ptr = 0; - } + if (k->type == KEY_TYPE_btree_ptr_v2) + ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; v = c->verify_data; bkey_copy(&v->key, &b->key); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 9fa8d7d49f3e..b292dbef7992 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -32,21 +32,21 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, for (i = 0; i < sb->nr_devices; i++) { struct bch_member *m = mi->members + i; - unsigned g; + unsigned group_id; if (!BCH_MEMBER_GROUP(m)) continue; - g = BCH_MEMBER_GROUP(m) - 1; + group_id = BCH_MEMBER_GROUP(m) - 1; - if (g >= nr_groups) { + if (group_id >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", - i, g, nr_groups); + i, group_id, nr_groups); return -BCH_ERR_invalid_sb_disk_groups; } - if (BCH_GROUP_DELETED(&groups->entries[g])) { - prt_printf(err, "disk %u has deleted label %u", i, g); + if (BCH_GROUP_DELETED(&groups->entries[group_id])) { + prt_printf(err, "disk %u has deleted label %u", i, group_id); return -BCH_ERR_invalid_sb_disk_groups; } } diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index 8d58f2cca260..d260ff9bbfeb 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = { NULL }; -#define BCH_ERR_0 0 - static unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 2034d635c718..7650d8b3122a 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -695,12 +695,12 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR_OR_NULL(folio)) goto err_unlock; - if (folio_test_uptodate(folio)) - goto out; - offset = pos - folio_pos(folio); len = min_t(size_t, len, folio_end_pos(folio) - pos); + if (folio_test_uptodate(folio)) + goto out; + /* If we're writing entire folio, don't need to read it in first: */ if (!offset && len == folio_size(folio)) goto out; @@ -801,10 +801,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } -static noinline void folios_trunc(folios *folios, struct folio **fi) +static noinline void folios_trunc(folios *fs, struct folio **fi) { - while (folios->data + folios->nr > fi) { - struct folio *f = darray_pop(folios); + while (fs->data + fs->nr > fi) { + struct folio *f = darray_pop(fs); folio_unlock(f); folio_put(f); @@ -818,35 +818,35 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; - folios folios; + folios fs; struct folio **fi, *f; - unsigned copied = 0, f_offset; - u64 end = pos + len, f_pos; 
+ unsigned copied = 0, f_offset, f_copied; + u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); bch2_folio_reservation_init(c, inode, &res); - darray_init(&folios); + darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, mapping_gfp_mask(mapping), - &folios); + &fs); if (ret) goto out; - BUG_ON(!folios.nr); + BUG_ON(!fs.nr); - f = darray_first(folios); + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } - f = darray_last(folios); + f = darray_last(fs); end = min(end, folio_end_pos(f)); last_folio_pos = folio_pos(f); if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { @@ -859,15 +859,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); if (ret) goto out; f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; /* * XXX: per POSIX and fstests generic/275, on -ENOSPC we're @@ -879,11 +879,11 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, */ ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); if (unlikely(ret)) { - folios_trunc(&folios, fi); - if (!folios.nr) + folios_trunc(&fs, fi); + if (!fs.nr) goto out; - end = min(end, folio_end_pos(darray_last(folios))); + end = min(end, folio_end_pos(darray_last(fs))); break; } @@ -892,18 +892,17 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } if (mapping_writably_mapped(mapping)) - darray_for_each(folios, fi) + darray_for_each(fs, fi) flush_dcache_folio(*fi); f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); - + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); if (!f_copied) { - folios_trunc(&folios, fi); + folios_trunc(&fs, fi); break; } @@ -912,7 +911,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, pos + copied + f_copied < inode->v.i_size) { iov_iter_revert(iter, f_copied); folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&folios, fi); + folios_trunc(&fs, fi); break; } @@ -920,7 +919,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, copied += f_copied; if (f_copied != f_len) { - folios_trunc(&folios, fi + 1); + folios_trunc(&fs, fi + 1); break; } @@ -939,10 +938,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, spin_unlock(&inode->v.i_lock); f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; if (!folio_test_uptodate(f)) folio_mark_uptodate(f); @@ -955,7 +954,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, 
inode->ei_last_dirtied = (unsigned long) current; out: - darray_for_each(folios, fi) { + darray_for_each(fs, fi) { folio_unlock(*fi); folio_put(*fi); } @@ -968,7 +967,7 @@ out: if (last_folio_pos >= inode->v.i_size) truncate_pagecache(&inode->v, inode->v.i_size); - darray_exit(&folios); + darray_exit(&fs); bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 1e60eead2981..4d1612ede484 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -14,7 +14,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, int fgp_flags, gfp_t gfp, - folios *folios) + folios *fs) { struct folio *f; u64 pos = start; @@ -24,7 +24,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, if ((u64) pos >= (u64) start + (1ULL << 20)) fgp_flags &= ~FGP_CREAT; - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); if (ret) break; @@ -32,16 +32,16 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, if (IS_ERR_OR_NULL(f)) break; - BUG_ON(folios->nr && folio_pos(f) != pos); + BUG_ON(fs->nr && folio_pos(f) != pos); pos = folio_end_pos(f); - darray_push(folios, f); + darray_push(fs, f); } - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) ret = -ENOMEM; - return folios->nr ? 0 : ret; + return fs->nr ? 0 : ret; } /* pagecache_block must be held */ @@ -73,12 +73,15 @@ int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, return ret; } +#if 0 +/* Useful for debug tracing: */ static const char * const bch2_folio_sector_states[] = { #define x(n) #n, BCH_FOLIO_SECTOR_STATE() #undef x NULL }; +#endif static inline enum bch_folio_sector_state folio_sector_dirty(enum bch_folio_sector_state state) @@ -177,20 +180,20 @@ static void __bch2_folio_set(struct folio *folio, * extents btree: */ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **folios, unsigned nr_folios) + struct folio **fs, unsigned nr_folios) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bch_folio *s; - u64 offset = folio_sector(folios[0]); + u64 offset = folio_sector(fs[0]); unsigned folio_idx; u32 snapshot; bool need_set = false; int ret; for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -216,7 +219,7 @@ retry: unsigned state = bkey_to_sector_state(k); while (folio_idx < nr_folios) { - struct folio *folio = folios[folio_idx]; + struct folio *folio = fs[folio_idx]; u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index f201980ef2c3..54a9c21a3b83 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -5,7 +5,7 @@ /* Inode flags: */ /* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { +static const __maybe_unused unsigned bch_flags_to_vfs[] = { [__BCH_INODE_SYNC] = S_SYNC, [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, [__BCH_INODE_APPEND] = S_APPEND, @@ -13,7 +13,7 @@ static const unsigned bch_flags_to_vfs[] = { }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { +static const __maybe_unused unsigned 
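The buffered-write and pagecache hunks are mostly a rename (the darray of folios is now called fs, and f/f_len/f_copied are hoisted out so the several passes over the array share one set of locals). For reference, the darray calls used here follow the usual init/fill/iterate/exit pattern; a fragment sketching it with the helpers as they appear in these hunks:

	folios fs;			/* darray of struct folio * */
	struct folio **fi;

	darray_init(&fs);
	/* filled by bch2_filemap_get_contig_folios_d(mapping, pos, end, ..., &fs) */

	darray_for_each(fs, fi) {	/* fi walks a pointer to each element */
		folio_unlock(*fi);
		folio_put(*fi);
	}
	darray_exit(&fs);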
bch_flags_to_uflags[] = { [__BCH_INODE_SYNC] = FS_SYNC_FL, [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, [__BCH_INODE_APPEND] = FS_APPEND_FL, @@ -22,7 +22,7 @@ static const unsigned bch_flags_to_uflags[] = { }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { +static const __maybe_unused unsigned bch_flags_to_xflags[] = { [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0def3a57bd6d..f814e9e0a741 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1661,7 +1661,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) up_write(&c->state_lock); } - if (opts.errors >= 0) + if (opt_defined(opts, errors)) c->opts.errors = opts.errors; err: return bch2_err_class(ret); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e8cb4448bf2d..b9c9ece63175 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -471,7 +471,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, * and @ancestor hasn't been overwritten in @seen * - * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + * @c: filesystem handle + * @seen: list of snapshot ids already seen at current position + * @id: descendent snapshot id + * @ancestor: ancestor snapshot id + * + * Returns: whether key in @ancestor snapshot is visible in @id snapshot */ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) @@ -516,14 +521,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * snapshot id @dst, test whether there is some snapshot in which @dst is * visible. * - * This assumes we're visiting @src keys in natural key order. + * @c: filesystem handle + * @s: list of snapshot IDs already seen at @src + * @src: snapshot ID of src key + * @dst: snapshot ID of dst key + * Returns: true if there is some snapshot in which @dst is visible * - * @s - list of snapshot IDs already seen at @src - * @src - snapshot ID of src key - * @dst - snapshot ID of dst key + * Assumes we're visiting @src keys in natural key order */ -static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) +static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) { return dst <= src ? 
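The bch2_remount() fix above switches from a sentinel test on the option's value to opt_defined(): struct bch_opts carries a per-option "was this explicitly set" bit (see the ._name##_defined = true initializers in the opts.h hunk a little further down), and that bit, not the value, is what should decide whether a remount option overrides the existing one. In effect (a sketch; opt_defined()'s definition isn't shown in this patch):

	/* mount -o remount,errors=...: only override c->opts.errors when the
	 * option was actually passed, not whenever its value happens to be >= 0 */
	if (opt_defined(opts, errors))
		c->opts.errors = opts.errors;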
key_visible_in_snapshot(c, s, dst, src) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4548de6e97b2..81ff2720835b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -120,8 +120,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), - &unpacked); + ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 7f29fd2f05b1..3439e9553325 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -489,7 +489,8 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) } /** - * bch_write_index - after a write, update index to point to new data + * __bch2_write_index - after a write, update index to point to new data + * @op: bch_write_op to process */ static void __bch2_write_index(struct bch_write_op *op) { @@ -526,10 +527,10 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, + insert->k.p.inode, insert->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); } @@ -1179,10 +1180,10 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) })); if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, + insert->k.p.inode, insert->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); } @@ -1546,7 +1547,8 @@ err: } /** - * bch_write - handle a write to a cache device or flash only volume + * bch2_write() - handle a write to a cache device or flash only volume + * @cl: &bch_write_op->cl * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 4b9295a15837..40455e892112 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -588,8 +588,13 @@ out: /** * bch2_journal_flush_seq_async - wait for a journal entry to be written + * @j: journal object + * @seq: seq to flush + * @parent: closure object to wait with + * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, + * -EIO if @seq will never be flushed * - * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, @@ -944,7 +949,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, goto unlock; while (ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0 }; + struct disk_reservation disk_res = { 0, 0, 0 }; /* * note: journal buckets aren't really counted as _sectors_ used yet, so diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0e606009dc46..269c8e8a1d95 100644 --- a/fs/bcachefs/journal_io.c +++ 
b/fs/bcachefs/journal_io.c @@ -237,17 +237,17 @@ static void journal_entry_err_msg(struct printbuf *out, #define journal_entry_err(c, version, jset, entry, msg, ...) \ ({ \ - struct printbuf buf = PRINTBUF; \ + struct printbuf _buf = PRINTBUF; \ \ - journal_entry_err_msg(&buf, version, jset, entry); \ - prt_printf(&buf, msg, ##__VA_ARGS__); \ + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ - mustfix_fsck_err(c, "%s", buf.buf); \ + mustfix_fsck_err(c, "%s", _buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ + bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ @@ -255,7 +255,7 @@ static void journal_entry_err_msg(struct printbuf *out, break; \ } \ \ - printbuf_exit(&buf); \ + printbuf_exit(&_buf); \ true; \ }) @@ -1281,7 +1281,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1379,16 +1379,21 @@ static void __journal_write_alloc(struct journal *j, } /** - * journal_next_bucket - move on to the next journal bucket if possible + * journal_write_alloc - decide where to write next journal entry + * + * @j: journal object + * @w: journal buf (entry to be written) + * + * Returns: 0 on success, or -EROFS on failure */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) +static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = @@ -1812,7 +1817,7 @@ void bch2_journal_write(struct closure *cl) retry_alloc: spin_lock(&j->lock); - ret = journal_write_alloc(j, w, sectors); + ret = journal_write_alloc(j, w); if (ret && j->can_discard) { spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 73d135a8f37a..1f3d5890ff11 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -292,7 +292,6 @@ void bch2_journal_do_discards(struct journal *j) static void bch2_journal_reclaim_fast(struct journal *j) { - struct journal_entry_pin_list temp; bool popped = false; lockdep_assert_held(&j->lock); @@ -303,7 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - fifo_pop(&j->pin, temp); + j->pin.front++; popped = true; } @@ -419,6 +418,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running + * @j: journal object + * @pin: pin to flush */ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) { @@ -579,7 +580,11 @@ static u64 journal_seq_to_flush(struct journal *j) } /** - * bch2_journal_reclaim - free up journal buckets + * __bch2_journal_reclaim - free up journal buckets + * @j: journal object + * @direct: direct or background reclaim? 
+ * @kicked: requested to run since we last ran? + * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ac4df53bfde2..d62b757536a3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -724,7 +724,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, if (!bp.level) { const struct bch_extent_ptr *ptr; - struct bkey_s_c k; unsigned i = 0; k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 83ebb56a3fae..874c9324ab66 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -164,7 +164,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret = 0; + int ret2 = 0; saw++; @@ -173,11 +173,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret >= 0) + ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret2 >= 0) sectors += b.sectors; } - ret; + ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", @@ -304,13 +304,13 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight move_buckets; + struct buckets_in_flight buckets; u64 last, wait; int ret = 0; - memset(&move_buckets, 0, sizeof(move_buckets)); + memset(&buckets, 0, sizeof(buckets)); - ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); if (ret) { bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; @@ -329,12 +329,12 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&trans, &ctxt, &buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&trans, &ctxt, &buckets, true); __refrigerator(false); continue; } @@ -345,7 +345,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(&trans, &ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -355,14 +355,14 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&trans, &ctxt, &move_buckets); + ret = bch2_copygc(&trans, &ctxt, &buckets); c->copygc_running = false; wake_up(&c->copygc_running_wq); } - move_buckets_wait(&trans, &ctxt, &move_buckets, true); - rhashtable_destroy(&move_buckets.table); + move_buckets_wait(&trans, &ctxt, &buckets, true); + rhashtable_destroy(&buckets.table); bch2_trans_exit(&trans); bch2_moving_ctxt_exit(&ctxt); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8a9db110d64f..c21c258e4018 100644 --- a/fs/bcachefs/opts.h +++ 
b/fs/bcachefs/opts.h @@ -469,7 +469,7 @@ struct bch_opts { #undef x }; -static const struct bch_opts bch2_opts_default = { +static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index c41daa180682..de41f9a14492 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) } /** - * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null - * terminated + * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be + * null terminated + * @buf: printbuf to terminate + * Returns: Printbuf contents, as a nul terminated C string */ const char *bch2_printbuf_str(const struct printbuf *buf) { @@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf) } /** - * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. + * @buf: printbuf to exit */ void bch2_printbuf_exit(struct printbuf *buf) { @@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf) } /* - * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop + * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop * * @buf: printbuf to control * @spaces: number of spaces from previous tabpstop @@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_add - add to the current indent level + * bch2_printbuf_indent_add() - add to the current indent level * * @buf: printbuf to control * @spaces: number of spaces to add to the current indent level @@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_sub - subtract from the current indent level + * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control * @spaces: number of spaces to subtract from the current indent level @@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out) } /** - * prt_tab - Advance printbuf to the next tabstop - * - * @buf: printbuf to control + * bch2_prt_tab() - Advance printbuf to the next tabstop + * @out: printbuf to control * * Advance output to the next tabstop by printing spaces. 
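These printbuf comment fixes just retitle the kernel-doc to the bch2_-prefixed names; the usage pattern itself can be seen in the btree_io.c hunk near the top of this patch. A compact sketch of that pattern, using only identifiers that appear elsewhere in the patch:

	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "btree=%s level=%u ",
		   bch2_btree_ids[b->c.btree_id], b->c.level);
	bch2_bpos_to_text(&buf, b->key.k.p);

	bch_info(c, "%s", buf.buf);	/* printbuf contents as a C string */
	printbuf_exit(&buf);		/* frees anything the printbuf allocated */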
*/ @@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf) } /** - * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying * previous output * * @buf: printbuf to control @@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf) } /** - * prt_bytes_indented - Print an array of chars, handling embedded control characters + * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters * - * @out: printbuf to output to - * @str: string to print - * @count: number of bytes to print + * @out: output printbuf + * @str: string to print + * @count: number of bytes to print * * The following contol characters are handled as so: * \n: prt_newline newline that obeys current indent level @@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou } /** - * prt_human_readable_u64 - Print out a u64 in human readable units + * bch2_prt_human_readable_u64() - Print out a u64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) +void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { - bch2_printbuf_make_room(buf, 10); - buf->pos += string_get_size(v, 1, !buf->si_units, - buf->buf + buf->pos, - printbuf_remaining_size(buf)); + bch2_printbuf_make_room(out, 10); + out->pos += string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); } /** - * prt_human_readable_s64 - Print out a s64 in human readable units + * bch2_prt_human_readable_s64() - Print out a s64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) +void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) { if (v < 0) - prt_char(buf, '-'); - bch2_prt_human_readable_u64(buf, abs(v)); + prt_char(out, '-'); + bch2_prt_human_readable_u64(out, abs(v)); } /** - * prt_units_u64 - Print out a u64 according to printbuf unit options + * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) @@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v) } /** - * prt_units_s64 - Print out a s64 according to printbuf unit options + * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b9dd858fc299..f566c94260d6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -423,15 +423,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, - &root_tree.k_i, - NULL, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, - &root_snapshot.k_i, - NULL, NULL, 0) ?: - bch2_btree_insert(c, 
BTREE_ID_subvolumes, - &root_volume.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); if (ret) bch_err_fn(c, ret); return ret; @@ -1010,9 +1004,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, - &packed_inode.inode.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "creating root directory"); goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index f155428ff395..fb605b25b067 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -91,6 +91,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +#if 0 +Currently disabled, needs to be debugged: + bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); @@ -98,6 +101,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); } +#endif int bch2_trans_mark_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7faa27310de4..b684b9f00c1b 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -29,7 +29,6 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); #define SIX_LOCK_HELD_intent (1U << 26) #define SIX_LOCK_HELD_write (1U << 27) #define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) #define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) #define SIX_LOCK_NOSPIN (1U << 31) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index cfd70d6dea62..73fca04011ad 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -507,18 +507,18 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { - struct bkey_i_subvolume *s; + struct bkey_i_subvolume *u; *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); - s = bch2_bkey_get_mut_typed(trans, &iter, + u = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, *subvol_id), 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); + ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; - SET_BCH_SUBVOLUME_SNAP(&s->v, false); + SET_BCH_SUBVOLUME_SNAP(&u->v, false); } return ret; @@ -930,7 +930,7 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) swap(s->children[0], s->children[1]); } -int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d908b62c19f7..e0bd50983bb2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -246,9 +246,9 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, /* XXX: we're not checking that offline device have enough space */ for_each_online_member(ca, c, i) { - struct bch_sb_handle *sb = &ca->disk_sb; + struct bch_sb_handle *dev_sb = 
&ca->disk_sb; - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ef11cede1dba..332951b794b4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -435,7 +435,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + for (i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 41c6900c34c1..1e26c2645ce4 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -113,10 +113,6 @@ do { \ prt_human_readable_s64(out, val); \ } while (0) -#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -#define var_print(_var) sysfs_print(_var, var(_var)) -#define var_hprint(_var) sysfs_hprint(_var, var(_var)) - #define sysfs_strtoul(file, var) \ do { \ if (attr == &sysfs_ ## file) \ @@ -139,30 +135,6 @@ do { \ _v; \ }) -#define strtoul_restrict_or_return(cp, min, max) \ -({ \ - unsigned long __v = 0; \ - int _r = strtoul_safe_restrict(cp, __v, min, max); \ - if (_r) \ - return _r; \ - __v; \ -}) - -#define strtoi_h_or_return(cp) \ -({ \ - u64 _v; \ - int _r = strtoi_h(cp, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -#define sysfs_hatoi(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoi_h(buf, &var) ?: (ssize_t) size; \ -} while (0) - write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); @@ -291,7 +263,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; - int ret; + int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 0187c81e32ad..18ccb37b5a26 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -128,14 +128,13 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("inserting test keys"); for (i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i; - k.k.p.snapshot = U32_MAX; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -194,15 +193,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("inserting test extents"); for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 8; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 8; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -263,14 +261,13 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) pr_info("inserting test keys"); for (i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i * 2; - k.k.p.snapshot = U32_MAX; + 
bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i * 2; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -336,15 +333,14 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) pr_info("inserting test keys"); for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 16; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 16; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -458,8 +454,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); if (ret) bch_err_fn(c, ret); return ret; @@ -546,8 +541,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; @@ -572,8 +566,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 1ab7e247cca6..08bac0ba8d0b 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -112,10 +112,10 @@ got_unit: #define parse_or_ret(cp, _f) \ do { \ - int ret = _f; \ - if (ret < 0) \ - return ret; \ - cp += ret; \ + int _ret = _f; \ + if (_ret < 0) \ + return _ret; \ + cp += _ret; \ } while (0) static int __bch2_strtou64_h(const char *cp, u64 *res) @@ -605,11 +605,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) /** * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * - * @d - the struct bch_ratelimit to update - * - * Returns the amount of time to delay by, in jiffies + * some work + * @d: the struct bch_ratelimit to update + * Returns: the amount of time to delay by, in jiffies */ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) { @@ -622,9 +620,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) /** * bch2_ratelimit_increment() - increment @d by the amount of work done - * - * @d - the struct bch_ratelimit to update - * @done - the amount of work done, in arbitrary units + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units */ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index d34423352f60..849a37ae497c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -776,12 +776,12 @@ static inline void __move_gap(void *array, size_t element_size, #define bubble_sort(_base, _nr, _cmp) \ do { \ - ssize_t _i, _end; \ + ssize_t _i, _last; \ bool _swapped = true; \ \ - for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ + for (_last= (ssize_t) (_nr) - 1; _last 
> 0 && _swapped; --_last) {\ _swapped = false; \ - for (_i = 0; _i < _end; _i++) \ + for (_i = 0; _i < _last; _i++) \ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ swap((_base)[_i], (_base)[_i + 1]); \ _swapped = true; \ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 2a2ab86ed6e1..cb4f33ed9ab3 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -13,10 +13,9 @@ /** * bch2_varint_encode - encode a variable length integer - * @out - destination to encode to - * @v - unsigned integer to encode - * - * Returns the size in bytes of the encoded integer - at most 9 bytes + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes */ int bch2_varint_encode(u8 *out, u64 v) { @@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v) /** * bch2_varint_decode - encode a variable length integer - * @in - varint to decode - * @end - end of buffer to decode from - * @out - on success, decoded integer - * - * Returns the size in bytes of the decoded integer - or -1 on failure (would + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would * have read past the end of the buffer) */ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) @@ -73,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) /** * bch2_varint_encode_fast - fast version of bch2_varint_encode + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes * * This version assumes it's always safe to write 8 bytes to @out, even if the * encoded integer would be smaller. @@ -96,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v) /** * bch2_varint_decode_fast - fast version of bch2_varint_decode + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) * * This version assumes that it is safe to read at most 8 bytes past the end of * @end (we still return an error if the varint extends past @end). -- cgit From 6bd68ec266ad71827ef940151067b67b62fb8fed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Sep 2023 17:16:02 -0400 Subject: bcachefs: Heap allocate btree_trans We're using more stack than we'd like in a number of functions, and btree_trans is the biggest object that we stack allocate. But we have to do a heap allocatation to initialize it anyways, so there's no real downside to heap allocating the entire thing. 
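
To make the conversion pattern concrete, here is a minimal sketch of the old
and new transaction lifecycles. It uses only helpers that appear in the hunks
below (bch2_trans_get(), bch2_trans_put(), for_each_btree_key(),
bch2_trans_iter_exit()); the example function, its choice of btree ID and the
includes are illustrative assumptions, not code taken from this series:

    /* Assumed bcachefs-internal headers, for the sketch only: */
    #include "bcachefs.h"
    #include "btree_iter.h"

    /* Before: the btree_trans lived on the caller's stack and was set up
     * and torn down with bch2_trans_init()/bch2_trans_exit(). */
    static int example_walk_old(struct bch_fs *c)
    {
    	struct btree_trans trans;
    	struct btree_iter iter;
    	struct bkey_s_c k;
    	int ret = 0;

    	bch2_trans_init(&trans, c, 0, 0);
    	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
    			   BTREE_ITER_PREFETCH, k, ret) {
    		/* per-key work */
    	}
    	bch2_trans_iter_exit(&trans, &iter);
    	bch2_trans_exit(&trans);
    	return ret;
    }

    /* After: bch2_trans_get() hands back a heap-allocated btree_trans
     * (from a per-cpu buffer or the btree_trans mempool), every helper
     * takes the pointer directly, and bch2_trans_put() returns it. */
    static int example_walk_new(struct bch_fs *c)
    {
    	struct btree_trans *trans = bch2_trans_get(c);
    	struct btree_iter iter;
    	struct bkey_s_c k;
    	int ret = 0;

    	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
    			   BTREE_ITER_PREFETCH, k, ret) {
    		/* per-key work */
    	}
    	bch2_trans_iter_exit(trans, &iter);
    	bch2_trans_put(trans);
    	return ret;
    }

One-shot callers keep using the bch2_trans_run()/bch2_trans_do() wrappers,
which after this patch obtain the transaction with bch2_trans_get() and
release it with bch2_trans_put() internally (see the btree_update.h hunk
below).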
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 33 +++--- fs/bcachefs/alloc_background.c | 133 +++++++++++------------ fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/backpointers.c | 22 ++-- fs/bcachefs/bcachefs.h | 8 +- fs/bcachefs/btree_gc.c | 95 ++++++++--------- fs/bcachefs/btree_io.c | 13 +-- fs/bcachefs/btree_iter.c | 82 ++++++++------- fs/bcachefs/btree_iter.h | 14 +-- fs/bcachefs/btree_key_cache.c | 12 +-- fs/bcachefs/btree_trans_commit.c | 8 +- fs/bcachefs/btree_types.h | 4 +- fs/bcachefs/btree_update.c | 6 +- fs/bcachefs/btree_update.h | 25 ++--- fs/bcachefs/btree_update_interior.c | 35 +++--- fs/bcachefs/btree_write_buffer.c | 2 +- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/debug.c | 34 +++--- fs/bcachefs/dirent.c | 31 +++--- fs/bcachefs/ec.c | 34 +++--- fs/bcachefs/fs-io-buffered.c | 16 +-- fs/bcachefs/fs-io-direct.c | 14 ++- fs/bcachefs/fs-io-pagecache.c | 14 +-- fs/bcachefs/fs-io.c | 84 +++++++-------- fs/bcachefs/fs.c | 166 ++++++++++++++--------------- fs/bcachefs/fsck.c | 96 +++++++---------- fs/bcachefs/inode.c | 46 ++++---- fs/bcachefs/io_misc.c | 19 ++-- fs/bcachefs/io_read.c | 36 +++---- fs/bcachefs/io_write.c | 43 ++++---- fs/bcachefs/journal.c | 4 +- fs/bcachefs/journal_seq_blacklist.c | 12 +-- fs/bcachefs/logged_ops.c | 4 +- fs/bcachefs/lru.c | 4 +- fs/bcachefs/migrate.c | 24 ++--- fs/bcachefs/move.c | 39 ++++--- fs/bcachefs/movinggc.c | 18 ++-- fs/bcachefs/quota.c | 14 +-- fs/bcachefs/recovery.c | 6 +- fs/bcachefs/reflink.c | 36 +++---- fs/bcachefs/snapshot.c | 58 +++++----- fs/bcachefs/subvolume.c | 6 +- fs/bcachefs/super.c | 7 -- fs/bcachefs/sysfs.c | 10 +- fs/bcachefs/tests.c | 205 ++++++++++++++++-------------------- fs/bcachefs/xattr.c | 30 +++--- 47 files changed, 731 insertions(+), 877 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index ae2036b0fcc4..9653401957b3 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -279,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); if (ret) { if (!bch2_err_matches(ret, ENOENT)) @@ -306,7 +304,7 @@ retry: } xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) @@ -315,8 +313,8 @@ out: if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return acl; } @@ -356,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ 
-364,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -382,30 +379,30 @@ retry: goto btree_err; } - ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); + ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, 0); + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, 0); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); err: - bch2_trans_exit(&trans); mutex_unlock(&inode->ei_update_lock); + bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4eab7e59ae93..19ef7a444c23 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -548,7 +548,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_alloc_v4 a; @@ -559,9 +559,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -574,10 +572,10 @@ int bch2_bucket_gens_init(struct bch_fs *c) pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_btree_insert_trans(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -591,15 +589,15 @@ int bch2_bucket_gens_init(struct bch_fs *c) g.v.gens[offset] = gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (have_bucket_gens_key && !ret) - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_btree_insert_trans(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -608,20 +606,19 @@ int bch2_bucket_gens_init(struct bch_fs *c) int bch2_alloc_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; int ret; down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); if (c->sb.version_upgrade_complete >= 
bcachefs_metadata_version_bucket_gens) { const struct bch_bucket_gens *g; u64 b; - for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; @@ -645,11 +642,11 @@ int bch2_alloc_read(struct bch_fs *c) b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } else { struct bch_alloc_v4 a; - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -662,10 +659,10 @@ int bch2_alloc_read(struct bch_fs *c) *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); if (ret) @@ -1371,27 +1368,25 @@ fsck_err: int bch2_check_alloc_info(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bkey hole; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH); while (1) { struct bpos next; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_get_key_or_real_bucket_hole(&iter, &hole); ret = bkey_err(k); @@ -1404,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (k.k->type) { next = bpos_nosnap_successor(k.k->p); - ret = bch2_check_alloc_key(&trans, + ret = bch2_check_alloc_key(trans, k, &iter, &discard_iter, &freespace_iter, @@ -1414,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(&trans, + ret = bch2_check_alloc_hole_freespace(trans, bkey_start_pos(k.k), &next, &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(&trans, + bch2_check_alloc_hole_bucket_gens(trans, bkey_start_pos(k.k), &next, &bucket_gens_iter); @@ -1426,7 +1421,7 @@ int bch2_check_alloc_info(struct bch_fs *c) goto bkey_err; } - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret) @@ -1439,29 +1434,29 @@ bkey_err: if (ret) break; } - bch2_trans_iter_exit(&trans, &bucket_gens_iter); - bch2_trans_iter_exit(&trans, &freespace_iter); - bch2_trans_iter_exit(&trans, &discard_iter); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &bucket_gens_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + bch2_trans_iter_exit(trans, &iter); if (ret < 0) goto err; - ret = 
for_each_btree_key2(&trans, iter, + ret = for_each_btree_key2(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key2(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_bucket_gens_key(&trans, &iter, k)); + bch2_check_bucket_gens_key(trans, &iter, k)); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1547,10 +1542,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter))); if (ret) bch_err_fn(c, ret); return ret; @@ -1675,29 +1670,25 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; struct bpos discard_pos_done = POS_MAX; int ret; - bch2_trans_init(&trans, c, 0, 0); - /* * We're doing the commit in bch2_discard_one_bucket instead of using * for_each_btree_key_commit() so that we can increment counters after * successful commit: */ - ret = for_each_btree_key2(&trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded)); - - bch2_trans_exit(&trans); + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded))); if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); @@ -1803,15 +1794,13 @@ static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; @@ -1819,11 +1808,11 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); + invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); @@ -1831,7 +1820,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) } } err: - 
bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -1845,7 +1834,7 @@ void bch2_do_invalidates(struct bch_fs *c) static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, unsigned long *last_updated) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; @@ -1853,9 +1842,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_member *m; int ret; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), BTREE_ITER_PREFETCH); /* @@ -1869,7 +1856,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, *last_updated = jiffies; } - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (bkey_ge(iter.pos, end)) { ret = 0; @@ -1889,8 +1876,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(&trans, k, a, true) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_bucket_do_index(trans, k, a, true) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); if (ret) @@ -1900,7 +1887,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, } else { struct bkey_i *freespace; - freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); if (ret) goto bkey_err; @@ -1910,8 +1897,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = bch2_btree_insert_trans(&trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); if (ret) @@ -1926,8 +1913,8 @@ bkey_err: break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret < 0) { bch_err_msg(ca, ret, "initializing free space"); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e73b6c82870a..3bc4abd3d7d5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -602,7 +602,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage))); return ob; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 82109585439b..43defeaa1718 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -390,10 +390,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - bch2_check_btree_backpointer(&trans, &iter, k))); + bch2_check_btree_backpointer(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -723,13 +723,12 @@ static int bch2_get_alloc_in_memory_pos(struct 
btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bpos start = POS_MIN, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, start, &end); if (ret) break; @@ -749,13 +748,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + ret = bch2_check_extents_to_backpointers_pass(trans, start, end); if (ret || bpos_eq(end, SPOS_MAX)) break; start = bpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -824,13 +823,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, int bch2_check_backpointers_to_extents(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_btree_in_memory_pos(&trans, + ret = bch2_get_btree_in_memory_pos(trans, (1U << BTREE_ID_extents)| (1U << BTREE_ID_reflink), ~0, @@ -856,13 +854,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + ret = bch2_check_backpointers_to_extents_pass(trans, start, end); if (ret || !bbpos_cmp(end, BBPOS_MAX)) break; start = bbpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9fe3dac4a005..ad18f3b10af0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -627,8 +627,8 @@ struct journal_keys { size_t size; }; -struct btree_path_buf { - struct btree_path *path; +struct btree_trans_buf { + struct btree_trans *trans; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) @@ -787,9 +787,9 @@ struct bch_fs { /* btree_iter.c: */ struct seqmutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t btree_paths_pool; + mempool_t btree_trans_pool; mempool_t btree_trans_mem_pool; - struct btree_path_buf __percpu *btree_paths_bufs; + struct btree_trans_buf __percpu *btree_trans_bufs; struct srcu_struct btree_trans_barrier; bool btree_trans_barrier_initialized; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9496ff16fc91..693ed067b1a7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -529,13 +529,11 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree *b; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -546,8 +544,8 @@ int bch2_check_topology(struct bch_fs *c) if (btree_node_fake(b)) continue; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(&trans, b); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(trans, b); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { @@ -556,7 +554,7 @@ int bch2_check_topology(struct bch_fs *c) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1068,35 +1066,33 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum 
btree_id r) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); for (i = 0; i < BTREE_ID_NR && !ret; i++) ret = initial - ? bch2_gc_btree_init(&trans, ids[i], metadata_only) - : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + ? bch2_gc_btree_init(trans, ids[i], metadata_only) + : bch2_gc_btree(trans, ids[i], initial, metadata_only); for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { if (!bch2_btree_id_root(c, i)->alive) continue; ret = initial - ? bch2_gc_btree_init(&trans, i, metadata_only) - : bch2_gc_btree(&trans, i, initial, metadata_only); + ? bch2_gc_btree_init(trans, i, metadata_only) + : bch2_gc_btree(trans, i, initial, metadata_only); } if (ret < 0) bch_err_fn(c, ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1458,21 +1454,19 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + bch2_alloc_write_key(trans, &iter, k, metadata_only)); if (ret < 0) { bch_err_fn(c, ret); @@ -1481,14 +1475,14 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? 
ret : 0; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bucket *g; @@ -1504,7 +1498,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + goto err; } buckets->first_bucket = ca->mi.first_bucket; @@ -1512,9 +1507,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1535,13 +1528,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) g->stripe_redundancy = a->stripe_redundancy; } } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - + bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); - return ret; } @@ -1616,7 +1607,7 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; @@ -1625,23 +1616,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + bch2_gc_write_reflink_key(trans, &iter, k, &idx)); c->reflink_gc_nr = 0; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; @@ -1650,10 +1641,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c, if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1671,9 +1662,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c, r->size = k.k->size; r->refcount = 0; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1740,7 +1731,7 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -1748,15 +1739,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(&trans, &iter, k)); + 
bch2_gc_write_stripes_key(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1942,7 +1933,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; @@ -1960,7 +1951,7 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for_each_member_device(ca, c, i) { struct bucket_gens *gens; @@ -1986,26 +1977,26 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(&trans, iter, i, + ret = for_each_btree_key_commit(trans, iter, i, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL, - gc_btree_gens_key(&trans, &iter, k)); + gc_btree_gens_key(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); if (ret) goto err; } - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(&trans, &iter, k)); + bch2_alloc_write_oldest_gen(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); if (ret) @@ -2024,7 +2015,7 @@ err: ca->oldest_gen = NULL; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9fa9ed641300..a869cf6ac7c6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1628,8 +1628,7 @@ err: int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, const struct bkey_i *k, unsigned level) { - return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); - + return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, @@ -1691,15 +1690,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) static void btree_node_write_done(struct bch_fs *c, struct btree *b) { - struct btree_trans trans; - - bch2_trans_init(&trans, c, 0, 0); + struct btree_trans *trans = bch2_trans_get(c); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); } static void btree_node_write_work(struct work_struct *work) @@ -1728,7 +1725,7 @@ static void btree_node_write_work(struct work_struct *work) } } else { ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, + bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6c064e82c0c8..1d79514754d7 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2906,28 +2906,23 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) +static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) { - size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; - size_t updates_bytes = sizeof(struct 
btree_insert_entry) * BTREE_ITER_MAX; - void *p = NULL; + struct btree_trans *trans; - BUG_ON(trans->used_mempool); - -#ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); -#endif - if (!p) { - p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(p, 0, paths_bytes); + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) + return trans; } - trans->paths = p; p += paths_bytes; - trans->updates = p; p += updates_bytes; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at + * paths in other threads + */ + memset(&trans->paths, 0, sizeof(trans->paths)); + return trans; } const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; @@ -2947,11 +2942,14 @@ unsigned bch2_trans_get_fn_idx(const char *fn) return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) +struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { + struct btree_trans *trans; struct btree_transaction_stats *s; + trans = bch2_trans_alloc(c); + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) @@ -2963,8 +2961,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); closure_init_stack(&trans->ref); - bch2_trans_alloc_paths(trans, c); - s = btree_trans_stats(trans); if (s && s->max_mem) { unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); @@ -3010,6 +3006,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ list_add_done: seqmutex_unlock(&c->btree_trans_lock); } + + return trans; } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -3034,7 +3032,7 @@ leaked: #endif } -void bch2_trans_exit(struct btree_trans *trans) +void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct btree_insert_entry *i; @@ -3080,18 +3078,11 @@ void bch2_trans_exit(struct btree_trans *trans) else kfree(trans->mem); -#ifdef __KERNEL__ - /* - * Userspace doesn't have a real percpu implementation: - */ - trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); -#endif - - if (trans->paths) - mempool_free(trans->paths, &c->btree_paths_pool); - - trans->mem = (void *) 0x1; - trans->paths = (void *) 0x1; + /* Userspace doesn't have a real percpu implementation: */ + if (IS_ENABLED(__KERNEL__)) + trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); + if (trans) + mempool_free(trans, &c->btree_trans_pool); } static void __maybe_unused @@ -3169,6 +3160,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) void bch2_fs_btree_iter_exit(struct bch_fs *c) { struct btree_transaction_stats *s; + struct btree_trans *trans; + int cpu; + + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); + if (trans) + panic("%s leaked btree_trans\n", trans->fn); + + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); + free_percpu(c->btree_trans_bufs); for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); @@ -3180,13 +3182,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) if (c->btree_trans_barrier_initialized) 
cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_paths_pool); + mempool_exit(&c->btree_trans_pool); } int bch2_fs_btree_iter_init(struct bch_fs *c) { struct btree_transaction_stats *s; - unsigned nr = BTREE_ITER_MAX; int ret; for (s = c->btree_transaction_stats; @@ -3199,9 +3200,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); seqmutex_init(&c->btree_trans_lock); - ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, - sizeof(struct btree_path) * nr + - sizeof(struct btree_insert_entry) * nr) ?: + c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); + if (!c->btree_trans_bufs) + return -ENOMEM; + + ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, + sizeof(struct btree_trans)) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 360a26b58501..fbe273453db3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -915,21 +915,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); -void bch2_trans_exit(struct btree_trans *); + +struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); +void bch2_trans_put(struct btree_trans *); extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); -#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ -do { \ +#define bch2_trans_get(_c) \ +({ \ static unsigned trans_fn_idx; \ \ if (unlikely(!trans_fn_idx)) \ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - \ - __bch2_trans_init(_trans, _c, trans_fn_idx); \ -} while (0) + __bch2_trans_get(_c, trans_fn_idx); \ +}) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 784f889340cd..29a0b566a4fe 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -704,13 +704,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -727,13 +725,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(&trans, NULL, NULL, 0, - btree_key_cache_flush_pos(&trans, key, seq, + ret = commit_do(trans, NULL, NULL, 0, + btree_key_cache_flush_pos(trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index e3a0b101cbf8..183db5d67a26 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -163,13 +163,11 @@ static int __btree_node_flush(struct journal 
*j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); unsigned long old, new, v; unsigned idx = w - b->writes; - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); v = READ_ONCE(b->flags); do { @@ -188,7 +186,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return 0; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 96a03f414dd0..c9a38e254949 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -452,8 +452,8 @@ struct btree_trans { void *mem; u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path *paths; - struct btree_insert_entry *updates; + struct btree_path paths[BTREE_ITER_MAX]; + struct btree_insert_entry updates[BTREE_ITER_MAX]; struct btree_write_buffered_key *wb_updates; /* update path: */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 3d126f043db0..3342718de45d 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -692,7 +692,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, int flags) { return bch2_trans_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(&trans, id, k, 0)); + bch2_btree_insert_trans(trans, id, k, 0)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, @@ -824,7 +824,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, u64 *journal_seq) { int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, + bch2_btree_delete_range_trans(trans, id, start, end, update_flags, journal_seq)); if (ret == -BCH_ERR_transaction_restart_nested) ret = 0; @@ -898,7 +898,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, } else { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); } return ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 0be980d16007..4bfe602ce8e3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -146,30 +146,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ - bch2_trans_exit(&trans); \ - \ - _ret; \ -}) - #define bch2_trans_run(_c, _do) \ ({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = (_do); \ - bch2_trans_exit(&trans); \ - \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ _ret; \ }) +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) + 
#define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bac495b382bb..7dbf6b6c7f34 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -597,12 +597,11 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); u64 journal_seq = 0; unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -623,7 +622,7 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); @@ -645,13 +644,13 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. */ - ret = commit_do(&trans, &as->disk_res, &journal_seq, + ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_unlock(&trans); + btree_update_nodes_written_trans(trans, as)); + bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s(): error %s", __func__, bch2_err_str(ret)); @@ -660,7 +659,7 @@ err: struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); + path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -683,13 +682,13 @@ err: * we may rarely end up with a locked path besides the one we * have here: */ - bch2_trans_unlock(&trans); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(&trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_trans_unlock(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; - bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + bch2_btree_node_lock_write_nofail(trans, path, &b->c); mutex_lock(&c->btree_interior_update_lock); @@ -729,8 +728,8 @@ err: six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - btree_node_unlock(&trans, path, b->c.level); - bch2_path_put(&trans, path, true); + btree_node_unlock(trans, path, b->c.level); + bch2_path_put(trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -750,7 +749,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -758,8 +757,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as, &trans); - bch2_trans_exit(&trans); + bch2_btree_update_free(as, trans); + 
bch2_trans_put(trans); } static void btree_interior_update_work(struct work_struct *work) @@ -2049,7 +2048,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret; ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(&trans, a)); + async_btree_node_rewrite_trans(trans, a)); if (ret) bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); @@ -2365,7 +2364,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) { - bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 6d2d43b6ff6a..4e6241db518b 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -296,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, mutex_lock(&wb->flush_lock); return bch2_trans_run(c, - __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); + __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); } static inline u64 btree_write_buffer_ref(int idx) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 78139f73636c..994129142d39 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1923,7 +1923,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); if (ret) bch_err_fn(c, ret); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 84ca128a59a3..899ff46de8e0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -303,7 +303,7 @@ out: int bch2_data_update_index_update(struct bch_write_op *op) { - return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } void bch2_data_update_read_done(struct data_update *m, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7593ba04dfb2..404148bd348a 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -366,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -379,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + trans = bch2_trans_get(i->c); + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -408,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct btree *b; ssize_t ret; @@ -424,26 +424,26 @@ 
static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { bch2_btree_node_to_text(&i->buf, i->c, b); i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; - ret = drop_locks_do(&trans, flush_buf(i)); + ret = drop_locks_do(trans, flush_buf(i)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -462,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -475,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; @@ -490,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, } bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a7559ab03802..6c6c8d57d72b 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -479,21 +479,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, const struct qstr *name, subvol_inum *inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret) - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -522,7 +520,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; @@ -533,15 +531,14 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) int ret; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + 
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, SPOS(inum.inum, ctx->pos, snapshot), POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) @@ -549,7 +546,7 @@ retry: dirent = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + ret = bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) break; if (ret) @@ -558,7 +555,7 @@ retry: /* dir_emit() can fault and block: */ bch2_bkey_buf_reassemble(&sk, c, k); dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); name = bch2_dirent_get_name(dirent); @@ -574,16 +571,16 @@ retry: * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it */ - ret = btree_trans_too_many_iters(&trans); + ret = btree_trans_too_many_iters(trans); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 40e72b96745a..8646856e4539 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -476,7 +476,7 @@ err: static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { - return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); + return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe)); } /* recovery read path: */ @@ -788,12 +788,10 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; u64 idx; - bch2_trans_init(&trans, c, 0, 0); - while (1) { mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); @@ -802,15 +800,15 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(&trans, idx)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(trans, idx)); if (ret) { bch_err_fn(c, ret); break; } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -999,24 +997,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; for (i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(&trans, s, i); + ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1124,7 +1120,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, - ec_stripe_key_update(&trans, + ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); if (ret) { @@ -1822,7 +1818,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct 
btree_iter iter; struct bkey_s_c k; const struct bch_stripe *s; @@ -1830,9 +1826,7 @@ int bch2_stripes_read(struct bch_fs *c) unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_stripe) continue; @@ -1855,9 +1849,9 @@ int bch2_stripes_read(struct bch_fs *c) bch2_stripes_heap_insert(c, m, k.k->p.offset); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 7650d8b3122a..58ccc7b91ac7 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -270,7 +270,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; int ret; @@ -280,8 +280,6 @@ void bch2_readahead(struct readahead_control *ractl) ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); - bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(inode); while ((folio = readpage_iter_peek(&readpages_iter))) { @@ -300,31 +298,27 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - bchfs_read(&trans, rbio, inode_inum(inode), + bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); } bch2_pagecache_add_put(inode); - bch2_trans_exit(&trans); + bch2_trans_put(trans); darray_exit(&readpages_iter.folios); } static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum, struct folio *folio) { - struct btree_trans trans; - bch2_folio_create(folio, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - bch2_trans_init(&trans, c, 0, 0); - bchfs_read(&trans, rbio, inum, NULL); - bch2_trans_exit(&trans); + bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); } static void bch2_read_single_folio_end_io(struct bio *bio) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 4c61cb18c366..6a9557e7ecab 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -234,23 +234,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, u64 offset, u64 size, unsigned nr_replicas, bool compressed) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; u64 end = offset + size; u32 snapshot; bool ret = true; int err; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (err) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) @@ -265,11 +263,11 @@ retry: } offset = 
iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(err, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return err ? false : ret; } diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 4d1612ede484..8bd9bcdd27f7 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -182,7 +182,7 @@ static void __bch2_folio_set(struct folio *folio, int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct folio **fs, unsigned nr_folios) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bch_folio *s; @@ -204,15 +204,15 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, return 0; folio_idx = 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); @@ -243,11 +243,11 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ffe9206e4c18..b0e8144ec550 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -207,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } start = iter.pos; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -582,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); struct bch_io_opts opts; int ret = 0; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -604,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 hole_start, hole_end; u32 
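The retry loops in bch2_folio_set(), range_has_data() and the other converted functions keep exactly the same shape as before; only the spelling of the transaction changes. A sketch of that control flow, assuming a hypothetical lookup_step() in place of the per-function body:

	static int retry_example(struct bch_fs *c)
	{
		struct btree_trans *trans = bch2_trans_get(c);
		struct btree_iter iter;
		int ret;
	retry:
		bch2_trans_begin(trans);

		ret = lookup_step(trans, &iter);	/* placeholder: real body elided */

		bch2_trans_iter_exit(trans, &iter);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_put(trans);
		return ret;
	}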
snapshot; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto bkey_err; @@ -643,7 +640,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, &hole_start, &hole_end, opts.data_replicas, true)) - ret = drop_locks_do(&trans, + ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, @@ -666,7 +663,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; } - ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, + ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, sectors, opts, &i_sectors_delta, writepoint_hashed((unsigned long) current)); if (ret) @@ -674,7 +671,7 @@ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - drop_locks_do(&trans, + drop_locks_do(trans, (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); @@ -686,14 +683,14 @@ bkey_err: struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; - bch2_fpunch_at(&trans, &iter, inode_inum(inode), + bch2_fpunch_at(trans, &iter, inode_inum(inode), end_sector, &i_sectors_delta); bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); bch2_quota_reservation_put(c, inode, &quota_res); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -799,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; u32 snapshot; u64 sectors = end - start; u64 pos = start; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inode->v.i_ino, pos, snapshot), 0); - while (!(ret = btree_trans_too_many_iters(&trans)) && + while (!(ret = btree_trans_too_many_iters(trans)) && (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && !(ret = bkey_err(k))) { if (bkey_extent_is_allocation(k.k)) { @@ -830,17 +825,14 @@ retry: bch2_btree_iter_advance(&iter); } pos = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); - - if (ret) - return ret; + bch2_trans_put(trans); - return bch2_quota_reservation_add(c, inode, res, sectors, true); + return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, @@ -933,7 +925,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; subvol_inum inum = inode_inum(inode); @@ -945,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; -
bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), POS(inode->v.i_ino, U64_MAX), 0, k, ret) { @@ -963,12 +955,12 @@ retry: } else if (k.k->p.offset >> 9 > isize) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; @@ -986,7 +978,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; subvol_inum inum = inode_inum(inode); @@ -998,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { @@ -1024,12 +1016,12 @@ retry: offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f814e9e0a741..bfbd4f004edc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -82,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c, inode_set_fn set, void *p, unsigned fields) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; - - bch2_trans_init(&trans, c, 0, 512); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: - (set ? set(&trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + (set ? 
set(trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, fields); + bch2_inode_update_after_write(trans, inode, &inode_u, fields); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -114,7 +112,7 @@ retry: inode_inum(inode).subvol, inode_inum(inode).inum); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? ret : 0; } @@ -182,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; - struct btree_trans trans; + struct btree_trans *trans; struct bch_subvolume subvol; int ret; @@ -196,14 +194,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - bch2_trans_init(&trans, c, 8, 0); - ret = lockrestart_do(&trans, - bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + trans = bch2_trans_get(c); + ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); if (!ret) - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); - bch2_trans_exit(&trans); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + bch2_trans_put(trans); if (ret) { iget_failed(&inode->v); @@ -226,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; @@ -256,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap, if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, - 2048 + (!(flags & BCH_CREATE_TMPFILE) - ? dentry->d_name.len : 0)); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_create_trans(&trans, + ret = bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? 
&dentry->d_name : NULL, @@ -278,9 +274,9 @@ retry: inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(&trans, inum.subvol, true, + ret = bch2_subvolume_get(trans, inum.subvol, true, BTREE_ITER_WITH_UPDATES, &subvol) ?: - bch2_trans_commit(&trans, NULL, &journal_seq, 0); + bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -291,13 +287,13 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } bch2_iget5_set(&inode->v, &inum); - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -337,7 +333,7 @@ err_before_quota: unlock_new_inode(&inode->v); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: posix_acl_release(default_acl); posix_acl_release(acl); @@ -346,7 +342,7 @@ err_trans: if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); make_bad_inode(&inode->v); iput(&inode->v); inode = ERR_PTR(ret); @@ -401,26 +397,25 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct dentry *dentry) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, 0, - bch2_link_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_link_trans(trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } @@ -451,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, + bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, deleting_snapshot)); if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); if (inode_u.bi_subvol) { @@ -479,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_put(trans); return ret; } @@ -543,7 +537,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct 
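__bch2_link() and __bch2_unlink() above go through commit_do(), which, as used in these hunks, evaluates the update expression and commits it, retrying on transaction restart; after the conversion the expression simply receives the trans pointer. A hedged sketch of that shape, where do_update() is a placeholder rather than a real helper:

	static int link_example(struct bch_fs *c, subvol_inum dir_inum)
	{
		struct btree_trans *trans = bch2_trans_get(c);
		int ret;

		/* expression + commit, retried together on restart */
		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
				do_update(trans, dir_inum));	/* placeholder expression */

		bch2_trans_put(trans);
		return ret;
	}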
bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; - struct btree_trans trans; + struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode @@ -560,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - bch2_trans_init(&trans, c, 8, 2048); + trans = bch2_trans_get(c); bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, @@ -587,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = commit_do(&trans, NULL, NULL, 0, - bch2_rename_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_rename_trans(trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, &src_inode_u, @@ -603,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_fs_quota_transfer(c, src_inode, bch_qid(&src_inode->ei_inode), @@ -680,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; @@ -701,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, if (ret) goto err; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -715,29 +709,29 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); err_trans: - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: mutex_unlock(&inode->ei_update_lock); @@ -879,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, { struct bch_fs *c = 
vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; @@ -900,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while (!(ret = btree_trans_too_many_iters(&trans)) && + while (!(ret = btree_trans_too_many_iters(trans)) && (k = bch2_btree_iter_peek_upto(&iter, end)).k && !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; @@ -928,7 +922,7 @@ retry: bch2_bkey_buf_reassemble(&cur, c, k); - ret = bch2_read_indirect_extent(&trans, &data_btree, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) break; @@ -947,7 +941,7 @@ retry: cur.k->k.p.offset += cur.k->k.size; if (have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) @@ -961,18 +955,18 @@ retry: POS(iter.pos.inode, iter.pos.offset + sectors)); } start = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret && have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; @@ -1230,7 +1224,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_inode_info *dir = to_bch_ei(parent->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter1; struct btree_iter iter2; struct bkey_s_c k; @@ -1245,23 +1239,23 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child if (!S_ISDIR(dir->v.i_mode)) return -EINVAL; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(&iter1, snapshot); bch2_btree_iter_set_snapshot(&iter2, snapshot); - ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; @@ -1279,7 +1273,7 @@ retry: } d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) @@ -1301,7 +1295,7 @@ retry: continue; d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret < 0) break; if (ret) @@ -1325,9 +1319,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter1); - bch2_trans_iter_exit(&trans, &iter2); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter1); + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b9c9ece63175..e3d68082fdd3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -987,7 +987,7 @@ noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; @@ -995,16 +995,15 @@ int bch2_check_inodes(struct bch_fs *c) int ret; snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, k, &prev, &s, full)); + check_inode(trans, &iter, k, &prev, &s, full)); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1437,7 +1436,7 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct extent_ends extent_ends; @@ -1446,23 +1445,22 @@ 
int bch2_check_extents(struct bch_fs *c) snapshots_seen_init(&s); extent_ends_init(&extent_ends); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ bch2_disk_reservation_put(c, &res); - check_extent(&trans, &iter, k, &w, &s, &extent_ends); + check_extent(trans, &iter, k, &w, &s, &extent_ends); })) ?: - check_i_sectors(&trans, &w); + check_i_sectors(trans, &w); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -1803,23 +1801,22 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -1873,23 +1870,18 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_xattr(&trans, &iter, k, &hash_info, &inode)); - - bch2_trans_exit(&trans); - + check_xattr(trans, &iter, k, &hash_info, &inode))); if (ret) bch_err_fn(c, ret); return ret; @@ -1958,7 +1950,7 @@ int bch2_check_root(struct bch_fs *c) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - check_root_trans(&trans)); + check_root_trans(trans)); if (ret) bch_err_fn(c, ret); @@ -2110,16 +2102,14 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2136,12 +2126,12 @@ int bch2_check_directory_structure(struct bch_fs *c) if (u.bi_flags & BCH_INODE_UNLINKED) continue; - ret = check_path(&trans, &path, &u, iter.pos.snapshot); + ret = check_path(trans, &path, &u, iter.pos.snapshot); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + 
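bch2_check_xattrs() above is rewritten into the bch2_trans_run() form, which, as used here, supplies the transaction pointer under the name trans for the enclosed expression and releases it when the expression finishes. A sketch of that pattern, with check_one_key() standing in for the real per-key fsck helpers:

	static int walk_example(struct bch_fs *c)
	{
		struct btree_iter iter;
		struct bkey_s_c k;

		/* bch2_trans_run() binds 'trans' for the expression below */
		return bch2_trans_run(c,
			for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS_MIN,
						  BTREE_ITER_PREFETCH, k,
						  NULL, NULL, BTREE_INSERT_NOFAIL,
						  check_one_key(trans, &iter, k)));	/* placeholder */
	}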
bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); darray_exit(&path); if (ret) @@ -2230,15 +2220,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| @@ -2267,8 +2255,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret) bch_err(c, "error in fsck: btree error %i while walking inodes", ret); @@ -2280,7 +2268,7 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; @@ -2289,9 +2277,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2311,12 +2297,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); return ret; } @@ -2367,22 +2353,17 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); - - bch2_trans_exit(&trans); - + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { bch_err(c, "error in fsck: btree error %i while walking inodes", ret); return ret; @@ -2464,13 +2445,12 @@ int bch2_fix_reflink_p(struct bch_fs *c) return 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter, k))); - + fix_reflink_p_key(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 81ff2720835b..8bfd99cb7ad1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -826,7 
+826,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; @@ -834,8 +834,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; - bch2_trans_init(&trans, c, 0, 1024); - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -844,19 +842,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); + ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), BTREE_ITER_INTENT|BTREE_ITER_CACHED); ret = bkey_err(k); @@ -864,7 +862,7 @@ retry: goto err; if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(trans.c, + bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); ret = -EIO; @@ -877,15 +875,15 @@ retry: delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -919,7 +917,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inum, inode)); + bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -1091,14 +1089,12 @@ delete: int bch2_delete_dead_inodes(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush_sync(&trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1108,26 +1104,26 @@ int bch2_delete_dead_inodes(struct bch_fs *c) * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); + ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p)); if (ret < 0) break; if (ret) { if (!test_bit(BCH_FS_RW, &c->flags)) { - bch2_trans_unlock(&trans); + 
bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); } - ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index b1be70e15c60..668493bcfe36 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -198,19 +198,18 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, s64 *i_sectors_delta) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, start), BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; @@ -289,8 +288,8 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec op.v.new_i_size = cpu_to_le64(new_i_size); return bch2_trans_run(c, - bch2_logged_op_start(&trans, &op.k_i) ?: - __bch2_resume_logged_op_truncate(&trans, &op.k_i, i_sectors_delta)); + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); } /* finsert/fcollapse: */ @@ -493,6 +492,6 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, op.v.pos = cpu_to_le64(insert ? 
U64_MAX : offset); return bch2_trans_run(c, - bch2_logged_op_start(&trans, &op.k_i) ?: - __bch2_resume_logged_op_finsert(&trans, &op.k_i, i_sectors_delta)); + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c6c1396915a..9a57da00573d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -359,7 +359,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio struct bch_io_failures *failed, unsigned flags) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; @@ -369,9 +369,8 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio flags |= BCH_READ_MUST_CLONE; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; @@ -382,7 +381,7 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, @@ -393,7 +392,7 @@ retry: goto out; } - ret = __bch2_read_extent(&trans, rbio, bvec_iter, + ret = __bch2_read_extent(trans, rbio, bvec_iter, rbio->read_pos, rbio->data_btree, k, 0, failed, flags); @@ -403,8 +402,8 @@ retry: goto err; out: bch2_rbio_done(rbio); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return; err: @@ -526,7 +525,7 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, - __bch2_rbio_narrow_crcs(&trans, rbio)); + __bch2_rbio_narrow_crcs(trans, rbio)); } /* Inner part that may run in process context */ @@ -1082,7 +1081,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; @@ -1092,16 +1091,15 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), BTREE_ITER_SLOTS); while (1) { @@ -1112,7 +1110,7 @@ retry: * read_extent -> io_time_reset may cause a transaction restart * without returning an error, we need to check for that here: */ - ret = bch2_trans_relock(&trans); + ret = bch2_trans_relock(trans); if (ret) break; @@ -1130,7 +1128,7 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(&trans, &data_btree, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) break; @@ -1149,7 +1147,7 @@ retry: if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(&trans, rbio, bvec_iter, 
iter.pos, + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags); if (ret) @@ -1161,19 +1159,19 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - ret = btree_trans_too_many_iters(&trans); + ret = btree_trans_too_many_iters(trans); if (ret) break; } err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); if (ret) { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 3439e9553325..659330cbe357 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -322,7 +322,7 @@ static int bch2_write_index_default(struct bch_write_op *op) struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; subvol_inum inum = { .subvol = op->subvol, @@ -333,30 +333,29 @@ static int bch2_write_index_default(struct bch_write_op *op) BUG_ON(!inum.subvol); bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); do { - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_keylist_front(keys); bch2_bkey_buf_copy(&sk, c, k); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -369,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; @@ -1163,20 +1162,18 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i *orig; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, ({ - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); if (ret && !bch2_err_matches(ret, EROFS)) { @@ -1194,7 +1191,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); } static void __bch2_nocow_write_done(struct bch_write_op *op) @@ -1218,7 +1215,7 @@ static void bch2_nocow_write_done(struct 
closure *cl) static void bch2_nocow_write(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_ptrs_c ptrs; @@ -1235,15 +1232,15 @@ static void bch2_nocow_write(struct bch_write_op *op) if (op->flags & BCH_WRITE_MOVE) return; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); if (unlikely(ret)) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), BTREE_ITER_SLOTS); while (1) { @@ -1289,7 +1286,7 @@ retry: /* Unlock before taking nocow locks, doing IO: */ bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) @@ -1338,7 +1335,7 @@ retry: bch2_btree_iter_advance(&iter); } out: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -1353,7 +1350,7 @@ err: op->flags |= BCH_WRITE_DONE; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_DONE)) { @@ -1431,7 +1428,7 @@ again: * allocations for specific disks may hang arbitrarily long: */ ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_alloc_sectors_start_trans(&trans, + bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), op->write_point, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 40455e892112..ad80618d1740 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -834,7 +834,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, ca->mi.bucket_size)); if (ret) { @@ -915,7 +915,7 @@ err_unblock: if (ret && !new_fs) for (i = 0; i < nr_got; i++) bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, bu[i], BCH_DATA_free, 0)); err_free: if (!new_fs) diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index d6b9f2cdf8e7..1e1a79405693 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work) struct journal_seq_blacklist_table *t; struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); unsigned i, nr, new_nr; int ret; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR; i++) { struct btree_iter iter; struct btree *b; - bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); b = bch2_btree_iter_peek_node(&iter); @@ -275,10 +273,10 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); 
+ bch2_trans_put(trans); if (ret) return; diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index e133c23ad51c..8640f7dee0de 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -59,9 +59,9 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key2(&trans, iter, + for_each_btree_key2(trans, iter, BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, - resume_logged_op(&trans, &iter, k))); + resume_logged_op(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 3e8b8f2f38a3..215a653322f3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -151,10 +151,10 @@ int bch2_check_lrus(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 4746dfa7af97..e3a51f6d6c9b 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; enum btree_id id; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct closure cl; struct btree *b; @@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); - bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) @@ -141,7 +139,7 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); + ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; @@ -157,7 +155,7 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) goto err; @@ -166,8 +164,8 @@ next: bch2_btree_interior_updates_flush(c); ret = 0; err: - 
bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); + bch2_trans_put(trans); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d62b757536a3..c1aa76f9f845 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -525,7 +525,7 @@ static int __bch2_move_data(struct moving_context *ctxt, struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct data_update_opts data_opts; @@ -533,7 +533,6 @@ static int __bch2_move_data(struct moving_context *ctxt, int ret = 0, ret2; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); if (ctxt->stats) { ctxt->stats->data_type = BCH_DATA_user; @@ -541,15 +540,15 @@ static int __bch2_move_data(struct moving_context *ctxt, ctxt->stats->pos = start; } - bch2_trans_iter_init(&trans, &iter, btree_id, start, + bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); - while (!move_ratelimit(&trans, ctxt)) { - bch2_trans_begin(&trans); + while (!move_ratelimit(trans, ctxt)) { + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); if (!k.k) @@ -570,7 +569,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); if (ret) continue; @@ -585,7 +584,7 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, + ret2 = bch2_move_extent(trans, &iter, ctxt, NULL, io_opts, btree_id, k, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) @@ -593,7 +592,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, &trans); + bch2_move_ctxt_wait_for_io(ctxt, trans); continue; } @@ -610,8 +609,8 @@ next_nondata: bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; @@ -826,15 +825,14 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct moving_context ctxt; int ret; - bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -851,14 +849,13 @@ static int bch2_move_btree(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct btree *b; enum btree_id id; struct data_update_opts data_opts; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); progress_list_add(c, stats); stats->data_type = BCH_DATA_btree; @@ -871,11 +868,11 @@ static int bch2_move_btree(struct 
bch_fs *c, if (!bch2_btree_id_root(c, id)->b) continue; - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) @@ -890,7 +887,7 @@ retry: if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -901,13 +898,13 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (kthread && kthread_should_stop()) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 874c9324ab66..4017120baeee 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -300,7 +300,7 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; - struct btree_trans trans; + struct btree_trans *trans; struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; @@ -317,7 +317,7 @@ static int bch2_copygc_thread(void *arg) } set_freezable(); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, @@ -325,16 +325,16 @@ static int bch2_copygc_thread(void *arg) false); while (!ret && !kthread_should_stop()) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&trans, &ctxt, &buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); __refrigerator(false); continue; } @@ -345,7 +345,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&trans, &ctxt, &buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -355,15 +355,15 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&trans, &ctxt, &buckets); + ret = bch2_copygc(trans, &ctxt, &buckets); c->copygc_running = false; wake_up(&c->copygc_running_wq); } - move_buckets_wait(&trans, &ctxt, &buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); rhashtable_destroy(&buckets.table); - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_moving_ctxt_exit(&ctxt); return 0; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index f16aa3bc9679..36de2f071d80 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -599,7 +599,7 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -614,16 +614,16 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); 
mutex_unlock(&c->sb_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, POS_MIN, BTREE_ITER_PREFETCH, k, __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + for_each_btree_key2(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(&trans, &iter, k)); + bch2_fs_quota_read_inode(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -956,7 +956,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); return bch2_err_class(ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f566c94260d6..1dceb7eeb205 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -165,7 +165,7 @@ static int bch2_journal_replay(struct bch_fs *c) (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim : 0), - bch2_journal_replay_key(&trans, k)); + bch2_journal_replay_key(trans, k)); if (ret) { bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); @@ -466,7 +466,7 @@ noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - __bch2_fs_upgrade_for_subvolumes(&trans)); + __bch2_fs_upgrade_for_subvolumes(trans)); if (ret) bch_err_fn(c, ret); return ret; @@ -1013,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index fb605b25b067..d77d0ea9afff 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -253,7 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; @@ -275,11 +275,11 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_dst); bch2_bkey_buf_init(&new_src); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); - bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); while ((ret == 0 || @@ -287,21 +287,21 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_lt(dst_iter.pos, dst_end)) { struct disk_reservation disk_res = { 0 }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (fatal_signal_pending(current)) { ret = -EINTR; break; } - ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, &src_snapshot); if (ret) continue; bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); - ret = bch2_subvolume_get_snapshot(&trans, 
dst_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; @@ -318,7 +318,7 @@ s64 bch2_remap_range(struct bch_fs *c, continue; if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, min(dst_end.offset, dst_iter.pos.offset + src_iter.pos.offset - src_want.offset), @@ -332,7 +332,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - ret = bch2_make_extent_indirect(&trans, &src_iter, + ret = bch2_make_extent_indirect(trans, &src_iter, new_src.k); if (ret) continue; @@ -360,14 +360,14 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + ret = bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_exit(&trans, &dst_iter); - bch2_trans_iter_exit(&trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + bch2_trans_iter_exit(trans, &src_iter); BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); BUG_ON(bkey_gt(dst_iter.pos, dst_end)); @@ -379,23 +379,23 @@ s64 bch2_remap_range(struct bch_fs *c, struct bch_inode_unpacked inode_u; struct btree_iter inode_iter = { NULL }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 73fca04011ad..ff7f71576d5c 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -610,11 +610,11 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot_tree(&trans, &iter, k))); + check_snapshot_tree(trans, &iter, k))); if (ret) bch_err(c, "error %i checking snapshot trees", ret); @@ -883,11 +883,11 @@ int bch2_check_snapshots(struct bch_fs *c) * the parent's depth already be correct: */ ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(&trans, iter, + for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k))); + check_snapshot(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -1373,7 +1373,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; @@ -1390,30 +1390,30 @@ 
int bch2_delete_dead_snapshots(struct bch_fs *c) } } - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, NULL, NULL, 0, - bch2_delete_redundant_snapshot(&trans, &iter, k)); + bch2_delete_redundant_snapshot(trans, &iter, k)); if (ret) { bch_err_msg(c, ret, "deleting redundant snapshots"); goto err; } - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_snapshot_set_equiv(&trans, k)); + bch2_snapshot_set_equiv(trans, k)); if (ret) { bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); goto err; } - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_snapshot) continue; @@ -1425,7 +1425,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) { bch_err_msg(c, ret, "walking snapshots"); @@ -1440,16 +1440,16 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!btree_type_has_snapshots(id)) continue; - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(&trans, iter, + snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_NOFAIL, - move_key_to_correct_snapshot(&trans, &iter, k)); + move_key_to_correct_snapshot(trans, &iter, k)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); @@ -1460,7 +1460,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } } - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); @@ -1468,23 +1468,23 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (equiv != snapshot) snapshot_list_add(c, &deleted_interior, snapshot); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); /* * Fixing children of deleted snapshots can't be done completely * atomically, if we crash between here and when we delete the interior * nodes some depth fields will be off: */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) goto err; darray_for_each(deleted, i) { - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, *i)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); goto err; @@ -1492,8 +1492,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } darray_for_each(deleted_interior, i) { - ret = commit_do(&trans, NULL, NULL, 0, - 
bch2_snapshot_node_delete(&trans, *i)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); goto err; @@ -1504,7 +1504,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) err: darray_exit(&deleted_interior); darray_exit(&deleted); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1671,11 +1671,11 @@ int bch2_snapshots_read(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k)) ?: - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(trans, k)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); if (ret) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ca03d585a2fa..caf2dd7dafff 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -86,10 +86,10 @@ int bch2_check_subvols(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter, k))); + check_subvol(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -293,7 +293,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); for (id = s.data; id < s.data + s.nr; id++) { - ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); + ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); if (ret) { bch_err_msg(c, ret, "deleting subvolume %u", *id); break; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 332951b794b4..a00dc4a4a2c9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -470,7 +470,6 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; - int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -502,12 +501,7 @@ static void __bch2_fs_free(struct bch_fs *c) percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); - if (c->btree_paths_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - darray_exit(&c->btree_roots_extra); - free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -829,7 +823,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1e26c2645ce4..03dbea4d95ce 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -252,7 +252,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct 
btree_iter iter; struct bkey_s_c k; enum btree_id id; @@ -268,13 +268,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - for_each_btree_key(&trans, iter, id, POS_MIN, + for_each_btree_key(trans, iter, id, POS_MIN, BTREE_ITER_ALL_SNAPSHOTS, k, ret) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -308,10 +308,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c else if (compressed) nr_compressed_extents++; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 18ccb37b5a26..c907b3e00176 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -39,44 +39,43 @@ static int test_delete(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); + bch2_trans_update(trans, &iter, &k.k_i, 0)); if (ret) { bch_err_msg(c, ret, "update error"); goto err; } pr_info("deleting once"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error (first)"); goto err; } pr_info("deleting twice"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error (second)"); goto err; } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_delete_written(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -84,45 +83,41 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); + bch2_trans_update(trans, &iter, &k.k_i, 0)); if (ret) { bch_err_msg(c, ret, "update error"); goto err; } - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_journal_flush_all_pins(&c->journal); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, 
bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error"); goto err; } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); @@ -145,7 +140,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -160,7 +155,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); @@ -173,21 +168,19 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test extents"); @@ -211,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -227,7 +220,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -241,21 +234,19 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); @@ -278,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -296,7 +287,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i >= nr * 2) @@ -314,20 +305,18 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) } ret = 0; err: - 
bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); @@ -351,7 +340,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -370,7 +359,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i == nr) @@ -388,7 +377,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) } ret = 0; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return 0; } @@ -398,43 +387,41 @@ err: */ static int test_peek_end(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } static int test_peek_end_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } @@ -510,7 +497,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.size = len; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, + bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); if (ret) bch_err_fn(c, ret); @@ -533,7 +520,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) /* Test skipping over keys in unrelated snapshots: */ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) { - struct btree_trans trans; + struct btree_trans 
*trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie cookie; @@ -545,15 +532,15 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) if (ret) return ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + trans = bch2_trans_get(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -571,7 +558,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) return ret; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(&trans, U32_MAX, + bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, 2)); @@ -602,38 +589,34 @@ static u64 test_rand(void) static int rand_insert(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k; int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = commit_do(&trans, NULL, NULL, 0, - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_insert_multi(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k[8]; int ret = 0; unsigned j; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { for (j = 0; j < ARRAY_SIZE(k); j++) { bkey_cookie_init(&k[j].k_i); @@ -641,46 +624,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) k[j].k.p.snapshot = U32_MAX; } - ret = commit_do(&trans, NULL, NULL, 0, - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - bch2_btree_insert_trans(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; - 
bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -712,26 +694,25 @@ static int rand_mixed_trans(struct btree_trans *trans, static int rand_mixed(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie cookie; int ret = 0; u64 i, rand; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { rand = test_rand(); - ret = commit_do(&trans, NULL, NULL, 0, - rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + ret = commit_do(trans, NULL, NULL, 0, + rand_mixed_trans(trans, &iter, &cookie, i, rand)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -759,22 +740,20 @@ err: static int rand_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = commit_do(&trans, NULL, NULL, 0, - __do_delete(&trans, pos)); + ret = commit_do(trans, NULL, NULL, 0, + __do_delete(trans, pos)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -787,14 +766,14 @@ static int seq_insert(struct bch_fs *c, u64 nr) bkey_cookie_init(&insert.k_i); return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; insert.k.p = iter.pos; - bch2_trans_update(&trans, &iter, &insert.k_i, 0); + bch2_trans_update(trans, &iter, &insert.k_i, 0); }))); } @@ -804,7 +783,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -816,14 +795,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, &iter, &u.k_i, 0); + bch2_trans_update(trans, &iter, &u.k_i, 0); }))); } diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 637174b249a2..b069b1a62e25 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -307,24 +307,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans trans; + struct btree_trans 
*trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; u32 snapshot; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, SPOS(inum, offset, snapshot), POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) @@ -336,12 +334,12 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) goto out; @@ -366,7 +364,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); + bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); return bch2_err_class(ret); } @@ -381,18 +379,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct bch_inode_unpacked inode_u; - struct btree_trans trans; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = commit_do(&trans, NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + ret = bch2_trans_run(c, + commit_do(trans, NULL, NULL, 0, + bch2_xattr_set(trans, inode_inum(inode), &inode_u, &hash, name, value, size, - handler->flags, flags)); - if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); - bch2_trans_exit(&trans); + handler->flags, flags)) ?: + (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); return bch2_err_class(ret); } -- cgit From a9737e0b38352e984fc67d5694b2c207c6b9679f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 13 Sep 2023 10:14:30 -0400 Subject: bcachefs: add module description to fix modpost warning modpost produces the following warning: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/bcachefs/bcachefs.o Add a module description for bcachefs. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a00dc4a4a2c9..94e296397e32 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -69,6 +69,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); +MODULE_DESCRIPTION("bcachefs filesystem"); #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ -- cgit From 71933fb69b7c5fe5efd2119b645d4fde337a6f3f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 13 Sep 2023 18:44:08 +0200 Subject: bcachefs: Fix use-after-free in bch2_dev_add() If __bch2_dev_attach_bdev() fails, bch2_dev_free() is called twice. Once here and another time in the error handling path. This leads to several use-after-free. Remove the redundant call and only rely on the error handling path. 
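The shape of the bug, sketched with placeholder names (struct ctx, struct dev, dev_alloc(), dev_attach() and dev_free() stand in for the real bcachefs types and helpers, they are not the actual code): freeing the object in the failure branch and then again under the shared error label releases the same allocation twice.

  static int dev_add_sketch(struct ctx *c)
  {
          struct dev *ca = dev_alloc(c);          /* placeholder allocator */
          int ret;

          if (!ca)
                  return -ENOMEM;

          ret = dev_attach(ca);
          if (ret) {
                  dev_free(ca);   /* redundant: the error label below frees it again */
                  goto err;
          }

          return 0;
  err:
          dev_free(ca);           /* double free / use-after-free when reached from the branch above */
          return ret;
  }

Dropping the free in the failure branch, as the hunk below does, leaves the error label as the single owner of the object.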
Fixes: 6a44735653d4 ("bcachefs: Improved superblock-related error messages") Signed-off-by: Christophe JAILLET Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 94e296397e32..9f852a6dd76a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1613,10 +1613,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_dev_usage_init(ca); ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) { - bch2_dev_free(ca); + if (ret) goto err; - } ret = bch2_dev_journal_alloc(ca); if (ret) { -- cgit From 0198b2356b5d0343ce933f8387a74972be6a2b10 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 13 Sep 2023 18:44:09 +0200 Subject: bcachefs: Remove a redundant and harmless bch2_free_super() call Remove a redundant call to bch2_free_super(). This is harmless because bch2_free_super() has a memset() at its end. So a second call would only lead to from kfree(NULL). Remove the redundant call and only rely on the error handling path. Signed-off-by: Christophe JAILLET Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9f852a6dd76a..2990eed85adf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1605,7 +1605,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { - bch2_free_super(&sb); ret = -ENOMEM; goto err; } -- cgit From 3764647b255aafb590d03066d5c1a344463e3637 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 13 Sep 2023 23:08:29 +0200 Subject: bcachefs: Remove undefined behavior in bch2_dev_buckets_reserved() In general it's a good idea to avoid using bare unreachable() because it introduces undefined behavior in compiled code. In this case it even confuses GCC into emitting an empty unused bch2_dev_buckets_reserved.part.0() function. Use BUG() instead, which is nice and defined. While in theory it should never trigger, if something were to go awry and the BCH_WATERMARK_NR case were to actually hit, the failure mode is much more robust. 
Fixes the following warnings: vmlinux.o: warning: objtool: bch2_bucket_alloc_trans() falls through to next function bch2_reset_alloc_cursors() vmlinux.o: warning: objtool: bch2_dev_buckets_reserved.part.0() is missing an ELF size annotation Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index f192809f50cf..0eff05c79c65 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -180,7 +180,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma switch (watermark) { case BCH_WATERMARK_NR: - unreachable(); + BUG(); case BCH_WATERMARK_stripe: reserved += ca->mi.nbuckets >> 6; fallthrough; -- cgit From 439c172bc763fc1ef33246a0fb23920c1e01ffa7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Sep 2023 20:39:31 -0400 Subject: bcachefs: Kill other unreachable() uses Per previous commit, bare unreachable() considered harmful, convert to BUG() Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- fs/bcachefs/ec.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index cff7486ef446..bb73ba9017b0 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1189,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_RO_AUX_TREE: return bset_search_tree(b, t, search, lossy_packed_search); default: - unreachable(); + BUG(); } } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 885ae5d51655..966d165a3b66 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, bch2_ec_do_stripe_creates(c); break; default: - unreachable(); + BUG(); } } -- cgit From 8c2d82a6fe6fa0e3503c56c08d7fc599d66e2b79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Sep 2023 20:33:06 -0400 Subject: bcachefs: Change bucket_lock() to use bit_spin_lock() bucket_lock() previously open coded a spinlock, because we need to cram a spinlock into a single byte. But it turns out not all archs support xchg() on a single byte; since we need struct bucket to be small, this means we have to play fun games with casts and ifdefs for endianness. This fixes building on 32 bit arm, and likely other architectures. Signed-off-by: Kent Overstreet Cc: linux-bcachefs@vger.kernel.org Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 0eff05c79c65..ecbeb7280f87 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -40,15 +40,42 @@ static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, secto for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. 
+ * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. + */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + static inline void bucket_unlock(struct bucket *b) { - smp_store_release(&b->lock, 0); + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + bit_spin_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); } static inline void bucket_lock(struct bucket *b) { - while (xchg(&b->lock, 1)) - cpu_relax(); + bit_spin_lock(BUCKET_LOCK_BITNR, (void *) &b->lock); } static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -- cgit From 301e0237cadfc7c446e16eab6df38073ade3631d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2023 17:58:07 +0300 Subject: bcachefs: chardev: return -EFAULT if copy_to_user() fails The copy_to_user() function returns the number of bytes remaining but we want to return -EFAULT to the user. Fixes: e0750d947352 ("bcachefs: Initial commit") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index fb603df099a5..e5e9fddddfb5 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - return copy_to_user(&user_arg->uuid, - &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); + if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid))) + return -EFAULT; + return 0; } #if 0 @@ -338,7 +339,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); + if (copy_to_user(buf, &e, sizeof(e))) + return -EFAULT; + + return sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -466,9 +470,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, percpu_up_read(&c->mark_lock); kfree(src); - if (!ret) - ret = copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + if (ret) + goto err; + if (copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes)) + ret = -EFAULT; err: kfree(arg); return ret; @@ -513,7 +519,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - return copy_to_user(user_arg, &arg, sizeof(arg)); + if (copy_to_user(user_arg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -550,8 +559,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - ret = copy_to_user((void __user *)(unsigned long)arg.sb, - sb, vstruct_bytes(sb)); + if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb))) + ret = -EFAULT; err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); -- cgit From 4ba985b84de627ba4f257c9843d0dd7146df2180 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2023 17:59:10 +0300 Subject: bcachefs: chardev: fix an integer overflow (32 bit only) On 32 bit systems, "sizeof(*arg) + replica_entries_bytes" can have an integer overflow leading to memory corruption. 
Use size_add() to prevent this. Fixes: b44dd3797034 ("bcachefs: Redo filesystem usage ioctls") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index e5e9fddddfb5..51d671267741 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -421,7 +421,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); if (!arg) return -ENOMEM; -- cgit From 867c1fe0187f6df4dca84a34332e00f21ef80f69 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2023 12:47:44 +0300 Subject: bcachefs: fix error checking in bch2_fs_alloc() There is a typo here where it uses ";" instead of "?:". The result is that bch2_fs_fs_io_direct_init() is called unconditionally and the errors from it are not checked. Fixes: 0060c68159fc ("bcachefs: Split up fs-io.[ch]") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet Reviewed-by: Brian Foster --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2990eed85adf..e94a63a22704 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -852,7 +852,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c); + bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); if (ret) goto err; -- cgit From 3b59fbec86e3cda0fae2ad15139ac458e024fab1 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 14 Sep 2023 14:05:54 +0800 Subject: bcachefs: Remove duplicate include ./fs/bcachefs/btree_update.h: journal.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6573 Signed-off-by: Jiapeng Chong Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 4bfe602ce8e3..9816d2286540 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,7 +4,6 @@ #include "btree_iter.h" #include "journal.h" -#include "journal.h" struct bch_fs; struct btree; -- cgit From 265cc423155d56030e44068680085adb59800326 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 19 Sep 2023 13:38:31 -0700 Subject: bcachefs: Fix -Wself-assign Fixes the following observed error reported by Nathan on IRC. 
fs/bcachefs/io_misc.c:467:6: error: explicitly assigning value of variable of type 'int' to itself [-Werror,-Wself-assign] 467 | ret = ret; | ~~~ ^ ~~~ Reported-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Signed-off-by: Kent Overstreet --- fs/bcachefs/io_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 668493bcfe36..32432bdddac4 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -462,9 +462,9 @@ btree_err: bch2_logged_op_update(trans, &op->k_i)); } - fallthrough; + break; case LOGGED_OP_FINSERT_finish: - ret = ret; + break; } err: bch2_logged_op_finish(trans, op_k); -- cgit From e9a0a26ed05a93034f3d49374dd5ef943db2d5b7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Sep 2023 15:55:40 +0300 Subject: bcachefs: acl: Uninitialized variable in bch2_acl_chmod() The clean up code at the end of the function uses "acl" so it needs to be initialized to NULL. Fixes: 53306e096d91 ("bcachefs: Always check for transaction restarts") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 9653401957b3..6b1579e96dfe 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -417,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; -- cgit From b6c22147e0fcfd16e672093178d940db3ea6923e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Sep 2023 15:56:07 +0300 Subject: bcachefs: acl: Add missing check in bch2_acl_chmod() The "ret = bkey_err(k);" assignment was accidentally left out so the call to bch2_btree_iter_peek_slot() is not checked for errors. Fixes: 53306e096d91 ("bcachefs: Always check for transaction restarts") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 6b1579e96dfe..f3809897f00a 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -427,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, return bch2_err_matches(ret, ENOENT) ? 0 : ret; k = bch2_btree_iter_peek_slot(&iter); - xattr = bkey_s_c_to_xattr(k); + ret = bkey_err(k); if (ret) goto err; + xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); -- cgit From 1f12900ab52304a7c41714b6ad86990eb428fbc9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Sep 2023 15:55:23 +0300 Subject: bcachefs: fs-ioctl: Fix copy_to_user() error code The copy_to_user() function returns the number of bytes that it wasn't able to copy but we want to return -EFAULT to the user. 
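In general terms the fix follows the usual kernel idiom for ioctls that copy a result back to userspace: copy_to_user() returns the number of bytes it could not copy, so any nonzero return is reported as -EFAULT rather than handed back as a positive count. A minimal sketch of that idiom (fsgetxattr_copy_out is a hypothetical wrapper, not a function introduced by this patch; the actual hunk follows below):

  static int fsgetxattr_copy_out(struct fsxattr __user *uarg, const struct fsxattr *fa)
  {
          /* copy_to_user() returns the number of bytes NOT copied */
          if (copy_to_user(uarg, fa, sizeof(*fa)))
                  return -EFAULT;
          return 0;
  }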
Fixes: e0750d947352 ("bcachefs: Initial commit") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 141bcced031e..0679b2f79fd6 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -122,7 +122,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return copy_to_user(arg, &fa, sizeof(fa)); + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; } static int fssetxattr_inode_update_fn(struct btree_trans *trans, -- cgit From d67a72bfc92b0c0dab9c134964090c4700892e67 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Sep 2023 15:56:37 +0300 Subject: bcachefs: snapshot: Add missing assignment in bch2_delete_dead_snapshots() This code accidentally left out the "ret = " assignment so the errors from for_each_btree_key2() are not checked. Fixes: 53534482a250 ("bcachefs: for_each_btree_key2()") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ff7f71576d5c..0acfca00a6af 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1405,8 +1405,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_snapshot_set_equiv(trans, k)); if (ret) { bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); -- cgit From 92b63f5bf0774eab2e62b86f85bb4efb915edef1 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 15 Sep 2023 08:51:51 -0400 Subject: bcachefs: refactor pin put helpers We have a couple journal pin put helpers to handle cases where the journal lock is already held or not. Refactor the helpers to lock and reclaim from the highest level and open code the reclaim from the one caller of the internal variant. The latter call will be moved into the journal buf release helper in a later patch. 
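The resulting calling convention, restated here with the diff markers stripped (the full hunks follow below): the internal variant only drops the reference and reports whether the count hit zero, the public helper takes the journal lock and runs reclaim, and the one caller of the internal variant open-codes the reclaim step itself.

  /* public helper, for callers that do not hold the journal lock */
  void bch2_journal_pin_put(struct journal *j, u64 seq)
  {
          if (__bch2_journal_pin_put(j, seq)) {
                  spin_lock(&j->lock);
                  bch2_journal_reclaim_fast(j);
                  spin_unlock(&j->lock);
          }
  }

  /* open-coded reclaim at the internal variant's one caller (journal entry close) */
  if (__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)))
          bch2_journal_reclaim_fast(j);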
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 ++- fs/bcachefs/journal_reclaim.c | 11 ++++------- fs/bcachefs/journal_reclaim.h | 3 ++- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ad80618d1740..210a2b90bb50 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -204,7 +204,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + if (__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq))) + bch2_journal_reclaim_fast(j); cancel_delayed_work(&j->write_work); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1f3d5890ff11..9a584aaaa2eb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -290,7 +290,7 @@ void bch2_journal_do_discards(struct journal *j) * entry, holding it open to ensure it gets replayed during recovery: */ -static void bch2_journal_reclaim_fast(struct journal *j) +void bch2_journal_reclaim_fast(struct journal *j) { bool popped = false; @@ -310,19 +310,16 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } -void __bch2_journal_pin_put(struct journal *j, u64 seq) +bool __bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - if (atomic_dec_and_test(&pin_list->count)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count); } void bch2_journal_pin_put(struct journal *j, u64 seq) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - if (atomic_dec_and_test(&pin_list->count)) { + if (__bch2_journal_pin_put(j, seq)) { spin_lock(&j->lock); bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 0fd1af120db5..494d1a6eddb0 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } -void __bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_reclaim_fast(struct journal *); +bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -- cgit From fc08031bb84b2b4660406faf9f30db8cdd09d022 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 15 Sep 2023 08:51:52 -0400 Subject: bcachefs: prepare journal buf put to handle pin put bcachefs freeze testing has uncovered some raciness between journal entry open/close and pin list reference count management. The details of the problem are described in a separate patch. In preparation for the associated fix, refactor the journal buffer put path a bit to allow it to eventually handle dropping the pin list reference currently held by an open journal entry. Retain the journal write dispatch helper since the closure code is inlined and we don't want to increase the amount of inline code in the transaction commit path, but rename the function to reflect the purpose of final processing of the journal buffer. 
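For reference, the reworked inline put path from the journal.h hunk below, with the diff markers stripped: journal_state_buf_put() drops the per-buffer count and returns the resulting reservation state, and final processing is dispatched only once that buffer's count reaches zero and it is the next unwritten buffer.

  static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
  {
          union journal_res_state s;

          s = journal_state_buf_put(j, idx);
          if (!journal_state_count(s, idx)) {
                  if (idx == s.unwritten_idx)
                          bch2_journal_buf_put_final(j);
          }
  }

This keeps the fast path inline while routing the slow-path work through bch2_journal_buf_put_final().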
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal.h | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 210a2b90bb50..be61d43458eb 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -134,7 +134,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) /* journal entry close/open: */ -void __bch2_journal_buf_put(struct journal *j) +void bch2_journal_buf_put_final(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 008a2e25a4fa..0a53a2142594 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *); - -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +/* + * Drop reference on a buffer index and return true if the count has hit zero. + */ +static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; @@ -264,9 +265,20 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf2_count = idx == 2, .buf3_count = idx == 3, }).v, &j->reservations.counter); + return s; +} + +void bch2_journal_buf_put_final(struct journal *); - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) - __bch2_journal_buf_put(j); +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) { + if (idx == s.unwritten_idx) + bch2_journal_buf_put_final(j); + } } /* -- cgit From 3e55189b504f961e68e631b72a2ed71991397ef9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 15 Sep 2023 08:51:53 -0400 Subject: bcachefs: fix race between journal entry close and pin set bcachefs freeze testing via fstests generic/390 occasionally reproduces the following BUG from bch2_fs_read_only(): BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); This indicates that one or more dirty key cache keys still exist after the attempt to flush and quiesce the fs. The sequence that leads to this problem actually occurs on unfreeze (ro->rw), and looks something like the following: - Task A begins a transaction commit and acquires journal_res for the current seq. This transaction intends to perform key cache insertion. - Task B begins a bch2_journal_flush() via bch2_sync_fs(). This ends up in journal_entry_want_write(), which closes the current journal entry and drops the reference to the pin list created on entry open. The pin put pops the front of the journal via fast reclaim since the reference count has dropped to 0. - Task A attempts to set the journal pin for the associated cached key, but bch2_journal_pin_set() skips the pin insert because the seq of the transaction reservation is behind the front of the pin list fifo. The end result is that the pin associated with the cached key is not added, which prevents a subsequent reclaim from processing the key and thus leaves it dangling at freeze time. The fundamental cause of this problem is that the front of the journal is allowed to pop before a transaction with outstanding reservation on the associated journal seq is able to add a pin. 
The count for the pin list associated with the seq drops to zero and is prematurely reclaimed as a result. The logical fix for this problem lies in how the journal buffer is managed in similar scenarios where the entry might have been closed before a transaction with outstanding reservations happens to be committed. When a journal entry is opened, the current sequence number is bumped, the associated pin list is initialized with a reference count of 1, and the journal buffer reference count is bumped (via journal_state_inc()). When a journal reservation is acquired, the reservation also acquires a reference on the associated buffer. If the journal entry is closed in the meantime, it drops both the pin and buffer references held by the open entry, but the buffer still has references held by outstanding reservation. After the associated transaction commits, the reservation release drops the associated buffer references and the buffer is written out once the reference count has dropped to zero. The fundamental problem here is that the lifecycle of the pin list reference held by an open journal entry is too short to cover the processing of transactions with outstanding reservations. The simplest way to address this is to expand the pin list reference to the lifecycle of the buffer vs. the shorter lifecycle of the open journal entry. This ensures the pin list for a seq with outstanding reservation cannot be popped and reclaimed before all outstanding reservations have been released, even if the associated journal entry has been closed for further reservations. Move the pin put from journal entry close to where final processing of the journal buffer occurs. Create a duplicate helper to cover the case where the caller doesn't already hold the journal lock. This allows generic/390 to pass reliably. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 21 +++++++++++++-------- fs/bcachefs/journal.h | 20 +++++++++++++++----- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index be61d43458eb..fc3dd5bef386 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -132,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } -/* journal entry close/open: */ - -void bch2_journal_buf_put_final(struct journal *j) +/* + * Final processing when the last reference of a journal buffer has been + * dropped. Drop the pin list reference acquired at journal entry open and write + * the buffer, if requested. 
+ */ +void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + lockdep_assert_held(&j->lock); + + if (__bch2_journal_pin_put(j, seq)) + bch2_journal_reclaim_fast(j); + if (write) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } /* @@ -204,14 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - if (__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq))) - bch2_journal_reclaim_fast(j); - cancel_delayed_work(&j->write_work); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx); + __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 0a53a2142594..491133cc52f3 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -268,16 +268,26 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u return s; } -void bch2_journal_buf_put_final(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64, bool); -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); +} + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { union journal_res_state s; s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { - if (idx == s.unwritten_idx) - bch2_journal_buf_put_final(j); + spin_lock(&j->lock); + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + spin_unlock(&j->lock); } } @@ -298,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx); + bch2_journal_buf_put(j, res->idx, res->seq); res->ref = 0; } -- cgit From f7f6943a8c6dccbd085600bbb7bae4f6f6047dc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Sep 2023 17:09:22 -0400 Subject: bcachefs: Fix copy_to_user() usage in flush_buf() copy_to_user() returns the number of bytes successfully copied - not an errcode. Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 404148bd348a..75a3dc7cbd47 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -319,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); - int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - if (err) - return err; + i->ret += copied; + i->ubuf += copied; + i->size -= copied; + i->buf.pos -= copied; + memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - i->ret += bytes; - i->ubuf += bytes; - i->size -= bytes; - i->buf.pos -= bytes; - memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + if (copied != bytes) + return -EFAULT; } return i->size ? 
0 : i->ret; -- cgit From a55fc65eb2c71da5d422937db767a78c6438f9b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Sep 2023 22:18:39 -0400 Subject: bcachefs: Fix an overflow check When bucket sector counts were changed from u16s to u32s, a few things were missed. This fixes an overflow check, and a truncation that prevented the overflow check from firing. Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 994129142d39..e7f4506f69ca 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - u16 bucket_sectors = !ptr->cached + u32 bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; struct printbuf buf = PRINTBUF; @@ -752,9 +752,9 @@ static int check_bucket_ref(struct btree_trans *trans, goto err; } - if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + if ((u64) bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], -- cgit From 75e0c4789b623db0abae497160202bc5f5d2522e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Sep 2023 22:20:25 -0400 Subject: bcachefs: Fix error checks in bch2_chacha_encrypt_key() crypto_alloc_sync_skcipher() returns an ERR_PTR, not NULL. 
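The underlying pattern (which also applies to the strndup_user() fix further down) is that ERR_PTR-returning allocators must be checked with IS_ERR()/PTR_ERR_OR_ZERO() rather than against NULL. A minimal sketch; alloc_chacha20() is a made-up wrapper name used only for illustration:

    #include <linux/err.h>
    #include <crypto/skcipher.h>

    static int alloc_chacha20(struct crypto_sync_skcipher **out)
    {
        struct crypto_sync_skcipher *tfm =
            crypto_alloc_sync_skcipher("chacha20", 0, 0);
        int ret = PTR_ERR_OR_ZERO(tfm);    /* 0, or -errno if IS_ERR(tfm) */

        if (ret)
            return ret;    /* tfm is an ERR_PTR here, never dereference it */

        *out = tfm;
        return 0;
    }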
Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3f385d499026..c70262b7fd6e 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; - if (!chacha20) { - pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); - return PTR_ERR(chacha20); + ret = PTR_ERR_OR_ZERO(chacha20); + if (ret) { + pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); + return ret; } ret = crypto_skcipher_setkey(&chacha20->base, (void *) key, sizeof(*key)); if (ret) { - pr_err("crypto_skcipher_setkey() error: %i", ret); + pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); goto err; } @@ -578,7 +579,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, /* decrypt real key: */ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); + &sb_key, sizeof(sb_key)); if (ret) goto err; -- cgit From 4b33a1916a351ba3bf5af42fb38a988a5dce11bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Sep 2023 22:26:18 -0400 Subject: bcachefs: bch2_ioctl_disk_resize_journal(): check for integer truncation Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 51d671267741..e8b6733e7f71 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -627,6 +627,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, arg.pad) return -EINVAL; + if (arg.nbuckets > U32_MAX) + return -EINVAL; + ca = bch2_device_lookup(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); -- cgit From cfda31c03315ac8cf2a57142f5a6c7510b3f3706 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Sep 2023 22:36:30 -0400 Subject: bcachefs: drop journal lock before calling journal_write bch2_journal_write() expects process context, it takes journal_lock as needed. Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 269c8e8a1d95..6a3d6a374e9c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1554,6 +1554,7 @@ static void journal_write_done(struct closure *cl) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + spin_unlock(&j->lock); closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { @@ -1566,10 +1567,11 @@ static void journal_write_done(struct closure *cl) * might want to be written now: */ + spin_unlock(&j->lock); mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } else { + spin_unlock(&j->lock); } - - spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) -- cgit From 97ecc23632fad75c43809b3f5010800289884a08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Sep 2023 01:19:53 -0400 Subject: bcachefs: Fix strndup_user() error checking strndup_user() returns an error pointer, not NULL. 
Reported-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index e8b6733e7f71..f69e15dc699c 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) devs[i] = strndup_user((const char __user *)(unsigned long) user_devs[i], PATH_MAX); - if (!devs[i]) { - ret = -ENOMEM; + ret= PTR_ERR_OR_ZERO(devs[i]); + if (ret) goto err; - } } c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); @@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; err = bch2_fs_open_incremental(path); kfree(path); @@ -189,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_add(c, path); kfree(path); @@ -231,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_online(c, path); kfree(path); -- cgit From d04fdf5c1017b9ebfd45efbcc2c8cd95f7f4e30d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Sep 2023 01:20:40 -0400 Subject: bcachefs: snapshots: Use kvfree_rcu_mightsleep() kvfree_rcu() was renamed - not removed. 
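For reference, the single-argument ("headless") variant used below is kvfree_rcu_mightsleep(): it needs no rcu_head embedded in the object, but it may sleep, so it is only usable from process context. A minimal sketch with a hypothetical RCU-protected table pointer:

    static struct snapshot_table __rcu *table;    /* hypothetical example pointer */

    static void table_replace(struct snapshot_table *new, struct snapshot_table *old)
    {
        rcu_assign_pointer(table, new);
        /* Frees 'old' after a grace period; may sleep to allocate or to
         * wait for the grace period directly. */
        kvfree_rcu_mightsleep(old);
    }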
Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0acfca00a6af..f27e8c4fc10a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -143,20 +143,6 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) return ret; } -struct snapshot_t_free_rcu { - struct rcu_head rcu; - struct snapshot_table *t; -}; - -static void snapshot_t_free_rcu(struct rcu_head *rcu) -{ - struct snapshot_t_free_rcu *free_rcu = - container_of(rcu, struct snapshot_t_free_rcu, rcu); - - kvfree(free_rcu->t); - kfree(free_rcu); -} - static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; @@ -177,13 +163,7 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) rcu_assign_pointer(c->snapshots, new); c->snapshot_table_size = new_size; - if (old) { - struct snapshot_t_free_rcu *rcu = - kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); - - rcu->t = old; - call_rcu(&rcu->rcu, snapshot_t_free_rcu); - } + kvfree_rcu_mightsleep(old); return &rcu_dereference_protected(c->snapshots, true)->s[idx]; } @@ -1638,7 +1618,7 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; + int ret = 0; bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); -- cgit From 51c801bc6414e88d686fb3229c54d7dda1508778 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Sep 2023 01:31:00 -0400 Subject: bcachefs: Minor bch2_btree_node_get() smatch fixes - it's no longer possible for trans to be NULL - also, move "wait for read to complete" to the slowpath, __bch2_btree_node_get(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ef9492f7e937..82cf243aa288 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -746,7 +746,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (trans && sync) + if (path && sync) bch2_trans_unlock_noassert(trans); bch2_btree_node_read(c, b, sync); @@ -976,28 +976,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - - bch2_btree_node_wait_on_read(b); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (trans) { - ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); } prefetch(b->aux_data); -- cgit From 40a53b92150fe9576538f775138413c40cdb30cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Sep 2023 01:32:20 -0400 Subject: bcachefs: More minor smatch fixes - fix a few uninitialized return values - return a proper error code in lookup_lostfound() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fsck.c | 2 +- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/move.c | 2 +- fs/bcachefs/super-io.c | 2 +- 6 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 3342718de45d..324767c0ddcc 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -124,7 +124,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; - int ret; + int ret = 0; if (!bch2_snapshot_has_children(c, old_pos.snapshot)) return 0; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 56b6ce278648..64f7176c2a4e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -99,6 +99,7 @@ x(ENOENT, ENOENT_str_hash_set_must_replace) \ x(ENOENT, ENOENT_inode) \ x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_not_directory) \ x(ENOENT, ENOENT_directory_dead) \ x(ENOENT, ENOENT_subvolume) \ x(ENOENT, ENOENT_snapshot_tree) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e3d68082fdd3..206302b0f5ed 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -281,7 +281,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return ret; + return -BCH_ERR_ENOENT_not_directory; } /* diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 659330cbe357..d2a0de886c7a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -930,7 +930,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, do { struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; - size_t dst_len, src_len; + size_t dst_len = 0, src_len 
= 0; if (page_alloc_failed && dst->bi_iter.bi_size < (wp->sectors_free << 9) && diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c1aa76f9f845..39a14e321680 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -627,7 +627,7 @@ int bch2_move_data(struct bch_fs *c, { struct moving_context ctxt; enum btree_id id; - int ret; + int ret = 0; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0bd50983bb2..55bc03d2e8ed 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -185,7 +185,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (sb->sb && sb->buffer_size >= new_buffer_size) return 0; - if (sb->have_layout) { + if (sb->sb && sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { -- cgit From 7239f8e0ee7fb0504d18b9570172688684f3606d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 15 Sep 2023 08:51:54 -0400 Subject: bcachefs: initial freeze/unfreeze support Initial support for the vfs superblock freeze and unfreeze operations. Superblock freeze occurs in stages, where the vfs attempts to quiesce high level write operations, page faults, fs internal operations, and then finally calls into the filesystem for any last stage steps (i.e. log flushing, etc.) before marking the superblock frozen. The majority of write paths are covered by freeze protection (i.e. sb_start_write() and friends) in higher level common code, with the exception of the fs-internal SB_FREEZE_FS stage (i.e. sb_start_intwrite()). This typically maps to active filesystem transactions in a manner that allows the vfs to implement a barrier of internal fs operations during the freeze sequence. This is not a viable model for bcachefs, however, because it utilizes transactions both to populate the journal as well as to perform journal reclaim. This means that mapping intwrite protection to transaction lifecycle or transaction commit is likely to deadlock freeze, as quiescing the journal requires transactional operations blocked by the final stage of freeze. The flipside of this is that bcachefs does already maintain its own internal sets of write references for similar purposes, currently utilized for transitions from read-write to read-only mode. Since this largely mirrors the high level sequence involved with freeze, we can simply invoke this mechanism in the freeze callback to fully quiesce the filesystem in the final stage. This means that while the SB_FREEZE_FS stage is essentially a no-op, the ->freeze_fs() callback that immediately follows begins by performing effectively the same step by quiescing all internal write references. One caveat to this approach is that without integration of internal freeze protection, write operations gated on internal write refs will fail with an internal -EROFS error rather than block on acquiring freeze protection. IOW, this is roughly equivalent to only having support for sb_start_intwrite_trylock(), and not the blocking variant. Many of these paths already use non-blocking internal write refs and so would map into an sb_start_intwrite_trylock() anyways. The only instance of this I've been able to uncover that doesn't explicitly rely on a higher level non-blocking write ref is the bch2_rbio_narrow_crcs() path, which updates crcs in certain read cases, and Kent has pointed out isn't critical if it happens to fail due to read-only status. 
Given that, implement basic freeze support as described above and leave tighter integration with internal freeze protection as a possible future enhancement. There are multiple potential ideas worth exploring here. For example, we could implement a multi-stage freeze callback that might allow bcachefs to quiesce its internal write references without deadlocks, we could integrate intwrite protection with bcachefs' internal write references somehow or another, or perhaps consider implementing blocking support for internal write refs to be used specifically for freeze, etc. In the meantime, this enables functional freeze support and the associated test coverage that comes with it. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bfbd4f004edc..73a3cebd734f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1716,6 +1716,35 @@ static void bch2_put_super(struct super_block *sb) __bch2_fs_stop(c); } +/* + * bcachefs doesn't currently integrate intwrite freeze protection but the + * internal write references serve the same purpose. Therefore reuse the + * read-only transition code to perform the quiesce. The caveat is that we don't + * currently have the ability to block tasks that want a write reference while + * the superblock is frozen. This is fine for now, but we should either add + * blocking support or find a way to integrate sb_start_intwrite() and friends. + */ +static int bch2_freeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + return 0; +} + +static int bch2_unfreeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + down_write(&c->state_lock); + ret = bch2_fs_read_write(c); + up_write(&c->state_lock); + return ret; +} + static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, @@ -1727,10 +1756,8 @@ static const struct super_operations bch_super_operations = { .show_options = bch2_show_options, .remount_fs = bch2_remount, .put_super = bch2_put_super, -#if 0 .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, -#endif }; static int bch2_set_super(struct super_block *s, void *data) -- cgit From d8b6f8c3c6fe7eb279d031c5f7bdde086185f0d1 Mon Sep 17 00:00:00 2001 From: Torge Matthies Date: Thu, 21 Sep 2023 23:25:54 +0200 Subject: bcachefs: Fix changing durability using sysfs Signed-off-by: Torge Matthies Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 03dbea4d95ce..b700be5b6664 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -977,7 +977,7 @@ STORE(bch2_dev) mutex_lock(&c->sb_lock); mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - if (v != BCH_MEMBER_DURABILITY(mi)) { + if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { SET_BCH_MEMBER_DURABILITY(mi, v + 1); bch2_write_super(c); } -- cgit From 82142a5541063addd15c84fcda890675035df8aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Sep 2023 14:19:52 -0400 Subject: bcachefs: Fix a null ptr deref in bch2_get_alloc_in_memory_pos() Reported-by: smatch Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') 
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 43defeaa1718..cc856150a948 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -703,7 +703,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, --btree_nodes; if (!btree_nodes) { - *end = alloc_k.k->p; + *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; break; } -- cgit From eebe8a8459f78bbac5a12d0ef76838929d736ad2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Sep 2023 16:55:03 -0400 Subject: bcachefs: Make sure to initialize equiv when creating new snapshots Previously, equiv was set in the snapshot deletion path, which is where it's needed - equiv, for snapshot ID equivalence classes, would ideally be a private data structure to the snapshot deletion path. But if a new snapshot is created while snapshot deletion is running, move_key_to_correct_snapshot() moves a key to snapshot id 0 - oops. Fixes: https://github.com/koverstreet/bcachefs/issues/593 Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 8 ++++++-- fs/bcachefs/snapshot.h | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f27e8c4fc10a..cdf9eda2ee02 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -343,7 +343,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_WITH_UPDATES, snapshot, s); } -int bch2_snapshot_live(struct btree_trans *trans, u32 id) +static int bch2_snapshot_live(struct btree_trans *trans, u32 id) { struct bch_snapshot v; int ret; @@ -370,7 +370,7 @@ int bch2_snapshot_live(struct btree_trans *trans, u32 id) * it's part of such a linear chain: this correctly sets equivalence classes on * startup if we run leaf to root (i.e. in natural key order). */ -int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; unsigned i, nr_live = 0, live_idx = 0; @@ -1071,6 +1071,10 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; + + mutex_lock(&c->snapshot_table_lock); + snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; + mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index dabc9b9d921b..de215d9d1252 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -235,8 +235,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, struct bch_subvolume *); -int bch2_snapshot_live(struct btree_trans *trans, u32 id); -int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); /* only exported for tests: */ int bch2_snapshot_node_create(struct btree_trans *, u32, -- cgit From b560e32ef78f8e748b74a27009ee417b9072dd88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Sep 2023 17:45:03 -0400 Subject: bcachefs: Always check for invalid bkeys in main commit path Previously, we would check for invalid bkeys at transaction commit time, but only if CONFIG_BCACHEFS_DEBUG=y. This check is important enough to always be on - it appears there's been corruption making it into the journal that would have been caught by it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 183db5d67a26..04c1f4610972 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -776,7 +776,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } -#ifdef CONFIG_BCACHEFS_DEBUG static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, struct printbuf *err) @@ -802,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un return -EINVAL; } -#endif /* * Get journal reservation, take write locks, and attempt to do btree update(s): @@ -1027,7 +1025,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; -#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; @@ -1044,7 +1041,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) return ret; } -#endif if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); -- cgit From 03ef80b469d5d83530ce1ce15be78a40e5300f9b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Sep 2023 18:41:51 -0400 Subject: bcachefs: Ignore unknown mount options This makes mount option handling consistent with other filesystems - options may be handled at different layers, so an option we don't know about might not be intended for us. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 960bb247f3a0..739a2ef80945 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -471,8 +471,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = "0"; } + /* Unknown options are ignored: */ if (id < 0) - goto bad_opt; + continue; if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; -- cgit From 793a06d984511593c6375d219b38cc84f5a71aff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Sep 2023 19:07:16 -0400 Subject: bcachefs: Fixes for building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 20 ++++++++++++++++++++ fs/bcachefs/checksum.h | 3 +++ fs/bcachefs/fs.h | 2 +- fs/bcachefs/io_read.c | 2 ++ fs/bcachefs/super-io.c | 2 +- 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index c70262b7fd6e..1948119edbf4 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -559,6 +559,26 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) return ret; } +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *sb) +{ + key_serial_t key_id; + struct printbuf key_description = PRINTBUF; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); + printbuf_exit(&key_description); + if (key_id < 0) + return errno; + + keyctl_revoke(key_id); + + return 0; +} +#endif + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 
779f175029a8..13998388c545 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -47,6 +47,9 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *); +#endif int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 10e11119ded2..5edf1d4b9e6b 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -197,7 +197,7 @@ int bch2_vfs_init(void); #else -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 9a57da00573d..443c3ea65527 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -24,6 +24,8 @@ #include "subvolume.h" #include "trace.h" +#include + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 55bc03d2e8ed..c9bf342d14aa 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -675,7 +675,7 @@ retry: #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) - sb->mode |= FMODE_BUFFERED; + sb->mode |= BLK_OPEN_BUFFERED; #endif if (!opt_get(*opts, noexcl)) -- cgit From efedfc2ece141389a9f522a298781cc929262701 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Sep 2023 21:05:50 -0400 Subject: bcachefs: nocow locking: Fix lock leak Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d2a0de886c7a..6e4f85eb6ec8 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1376,10 +1376,12 @@ err_get_ioref: /* Fall back to COW path: */ goto out; err_bucket_stale: - while (--i >= 0) + while (i >= 0) { bch2_bucket_nocow_unlock(&c->nocow_locks, buckets[i].b, BUCKET_NOCOW_LOCK_UPDATE); + --i; + } for (i = 0; i < nr_buckets; i++) percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); -- cgit From 1e3b40980b931728f5d2c0ef8352f6aed6799a69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Sep 2023 16:25:06 -0400 Subject: bcachefs: More assertions for nocow locking - assert in shutdown path that no nocow locks are held - check for overflow when taking nocow locks Signed-off-by: Kent Overstreet --- fs/bcachefs/nocow_locking.c | 37 +++++++++++++++++++++++++++++-------- fs/bcachefs/nocow_locking.h | 1 + fs/bcachefs/super.c | 1 + 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 396357cd8f2f..3c21981a4a1c 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_methods.h" #include "nocow_locking.h" #include "util.h" @@ -29,9 +30,10 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc for (i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) { - BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + int v = atomic_sub_return(lock_val, &l->l[i]); - if 
(!atomic_sub_return(lock_val, &l->l[i])) + BUG_ON(v && sign(v) != lock_val); + if (!v) closure_wake_up(&l->wait); return; } @@ -64,6 +66,11 @@ got_entry: if (lock_val > 0 ? v < 0 : v > 0) goto fail; take_lock: + v = atomic_read(&l->l[i]); + /* Overflow? */ + if (v && sign(v + lock_val) != sign(v)) + goto fail; + atomic_add(lock_val, &l->l[i]); spin_unlock(&l->lock); return true; @@ -83,6 +90,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, } void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) + { unsigned i, nr_zero = 0; struct nocow_lock_bucket *l; @@ -102,9 +110,13 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab prt_printf(out, "(%u empty entries)\n", nr_zero); nr_zero = 0; - for (i = 0; i < ARRAY_SIZE(l->l); i++) - if (atomic_read(&l->l[i])) - prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); + for (i = 0; i < ARRAY_SIZE(l->l); i++) { + int v = atomic_read(&l->l[i]); + if (v) { + bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); + prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v)); + } + } prt_newline(out); } @@ -112,12 +124,21 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab prt_printf(out, "(%u empty entries)\n", nr_zero); } +void bch2_fs_nocow_locking_exit(struct bch_fs *c) +{ + struct bucket_nocow_lock_table *t = &c->nocow_locks; + + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) + BUG_ON(atomic_read(&l->l[j])); +} + int bch2_fs_nocow_locking_init(struct bch_fs *c) { - unsigned i; + struct bucket_nocow_lock_table *t = &c->nocow_locks; - for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) - spin_lock_init(&c->nocow_locks.l[i].lock); + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + spin_lock_init(&l->lock); return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index ff8e4af52edc..f9d6a426a960 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -44,6 +44,7 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); +void bch2_fs_nocow_locking_exit(struct bch_fs *); int bch2_fs_nocow_locking_init(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e94a63a22704..1347270e5045 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -484,6 +484,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); + bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); -- cgit From a190cbcfa029b7921cfda484e2a125649496941e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 01:39:25 -0400 Subject: bcachefs: Silence transaction restart error message Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 206302b0f5ed..2ef14adb4cb2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -971,10 +971,10 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(c, ret, "in fsck updating 
inode"); + if (ret) return ret; - } } err: fsck_err: -- cgit From d2a990d1b132c8124a3856706f6b3663b9059bb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 16:02:06 -0400 Subject: bcachefs: bch_err_msg(), bch_err_fn() now filters out transaction restart errors These errors aren't actual errors, and should never be printed - do this in the common helpers. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 12 +++++- fs/bcachefs/fsck.c | 104 +++++++++++++++---------------------------------- fs/bcachefs/inode.c | 4 +- fs/bcachefs/tests.c | 97 ++++++++++++++++++--------------------------- 4 files changed, 80 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ad18f3b10af0..e9d07f9fa1c0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -293,9 +293,17 @@ do { \ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_fn(_c, _ret) \ - bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + #define bch_err_msg(_c, _ret, _msg, ...) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error " _msg " %s", __func__, \ + ##__VA_ARGS__, bch2_err_str(_ret)); \ +} while (0) #define bch_verbose(c, fmt, ...) \ do { \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2ef14adb4cb2..b8f9e7475dc5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -126,8 +126,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -152,8 +151,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); + bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -238,8 +236,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -274,8 +271,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, goto create_lostfound; } - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); if (ret) return ret; @@ -297,8 +293,7 @@ create_lostfound: lostfound, &lostfound_str, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); return ret; } @@ -361,11 +356,7 @@ static int reattach_inode(struct btree_trans *trans, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, __reattach_inode(trans, inode, inode_snapshot)); - if (ret) { - bch_err_msg(trans->c, ret, "reattaching inode %llu", 
inode->bi_inum); - return ret; - } - + bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); return ret; } @@ -821,8 +812,7 @@ bad_hash: (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); if (ret) return ret; ret = -BCH_ERR_transaction_restart_nested; @@ -885,11 +875,9 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; ret = __write_inode(trans, &u, iter->pos.snapshot); - if (ret) { - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "in fsck updating inode"); + bch_err_msg(c, ret, "in fsck updating inode"); + if (ret) return ret; - } if (!bpos_eq(new_min_pos, POS_MIN)) bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); @@ -904,8 +892,7 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "in fsck deleting inode"); + bch_err_msg(c, ret, "in fsck deleting inode"); return ret; } @@ -927,8 +914,7 @@ static int check_inode(struct btree_trans *trans, iter->pos.snapshot), POS(u.bi_inum, U64_MAX), 0, NULL); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "in fsck truncating inode"); + bch_err_msg(c, ret, "in fsck truncating inode"); if (ret) return ret; @@ -953,7 +939,7 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err_msg(c, sectors, "fsck recounting inode sectors"); + bch_err_msg(c, sectors, "in fsck recounting inode sectors"); return sectors; } @@ -971,15 +957,13 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "in fsck updating inode"); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; } err: fsck_err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1004,8 +988,7 @@ int bch2_check_inodes(struct bch_fs *c) snapshots_seen_exit(&s); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1084,8 +1067,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } } fsck_err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret ?: trans_was_restarted(trans, restart_count); } @@ -1419,9 +1401,7 @@ out: err: fsck_err: printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; delete: ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); @@ -1462,8 +1442,7 @@ int bch2_check_extents(struct bch_fs *c) snapshots_seen_exit(&s); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1501,8 +1480,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } } fsck_err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret ?: trans_was_restarted(trans, restart_count); } @@ -1619,9 +1597,7 @@ out: err: fsck_err: printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1785,9 +1761,7 @@ out: err: fsck_err: printbuf_exit(&buf); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1820,9 +1794,7 @@ int bch2_check_dirents(struct bch_fs *c) snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1858,8 +1830,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1882,8 +1853,7 @@ int bch2_check_xattrs(struct bch_fs *c) NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_xattr(trans, &iter, k, &hash_info, &inode))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1915,10 +1885,9 @@ static int check_root_trans(struct btree_trans *trans) BTREE_INSERT_LAZY_RW, bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0)); - if (ret) { - bch_err_msg(c, ret, "writing root subvol"); + bch_err_msg(c, ret, "writing root subvol"); + if (ret) goto err; - } } @@ -1934,8 +1903,7 @@ static int check_root_trans(struct btree_trans *trans) root_inode.bi_inum = inum; ret = __write_inode(trans, &root_inode, snapshot); - if (ret) - bch_err_msg(c, ret, "writing root inode"); + bch_err_msg(c, ret, "writing root inode"); } err: fsck_err: @@ -1951,9 +1919,7 @@ int bch2_check_root(struct bch_fs *c) BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, check_root_trans(trans)); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2090,8 +2056,7 @@ static int check_path(struct btree_trans *trans, } } fsck_err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2133,9 +2098,7 @@ int bch2_check_directory_structure(struct bch_fs *c) bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); darray_exit(&path); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2402,9 +2365,7 @@ int bch2_check_nlinks(struct bch_fs *c) } while (next_iter_range_start != U64_MAX); kvfree(links.d); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -2451,7 +2412,6 @@ int bch2_fix_reflink_p(struct bch_fs *c) BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, fix_reflink_p_key(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8bfd99cb7ad1..4a695a8e7a3b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -357,9 +357,7 @@ int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); return ret; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c907b3e00176..2fc9e60c754b 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -45,28 +45,25 @@ static int test_delete(struct bch_fs *c, u64 nr) ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); - if (ret) { - 
bch_err_msg(c, ret, "update error"); + bch_err_msg(c, ret, "update error"); + if (ret) goto err; - } pr_info("deleting once"); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error (first)"); + bch_err_msg(c, ret, "delete error (first)"); + if (ret) goto err; - } pr_info("deleting twice"); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error (second)"); + bch_err_msg(c, ret, "delete error (second)"); + if (ret) goto err; - } err: bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); @@ -89,10 +86,9 @@ static int test_delete_written(struct bch_fs *c, u64 nr) ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); - if (ret) { - bch_err_msg(c, ret, "update error"); + bch_err_msg(c, ret, "update error"); + if (ret) goto err; - } bch2_trans_unlock(trans); bch2_journal_flush_all_pins(&c->journal); @@ -100,10 +96,9 @@ static int test_delete_written(struct bch_fs *c, u64 nr) ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); - if (ret) { - bch_err_msg(c, ret, "delete error"); + bch_err_msg(c, ret, "delete error"); + if (ret) goto err; - } err: bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); @@ -130,10 +125,9 @@ static int test_iterate(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); @@ -146,10 +140,9 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i++); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); @@ -161,10 +154,9 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != --i); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating backwards"); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) goto err; - } BUG_ON(i); err: @@ -194,10 +186,9 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ck.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); @@ -211,10 +202,9 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = k.k->p.offset; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); @@ -227,10 +217,9 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = bkey_start_offset(k.k); 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating backwards"); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) goto err; - } BUG_ON(i); err: @@ -259,10 +248,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); @@ -276,10 +264,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i += 2; 0; })); - 
if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr * 2); @@ -330,10 +317,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ck.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "insert error"); + bch_err_msg(c, ret, "insert error"); + if (ret) goto err; - } } pr_info("iterating forwards"); @@ -348,10 +334,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i += 16; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards"); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) goto err; - } BUG_ON(i != nr); @@ -371,10 +356,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = k.k->p.offset; 0; })); - if (ret) { - bch_err_msg(c, ret, "error iterating forwards by slots"); + bch_err_msg(c, ret, "error iterating forwards by slots"); + if (ret) goto err; - } ret = 0; err: bch2_trans_put(trans); @@ -442,8 +426,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.version.lo = test_version++; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -499,8 +482,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -569,12 +551,8 @@ static int test_snapshots(struct bch_fs *c, u64 nr) swap(snapids[0], snapids[1]); ret = test_snapshot_filter(c, snapids[0], snapids[1]); - if (ret) { - bch_err_msg(c, ret, "from test_snapshot_filter"); - return ret; - } - - return 0; + bch_err_msg(c, ret, "from test_snapshot_filter"); + return ret; } /* perf tests */ @@ -678,8 +656,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(trans->c, ret, "lookup error"); + bch_err_msg(trans->c, ret, "lookup error"); if (ret) return ret; -- cgit From d281701b00fc857755cd0fc08a415a694d5f49c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 17:11:23 -0400 Subject: bcachefs: Fix looping around bch2_propagate_key_to_snapshot_leaves() Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index cdf9eda2ee02..3ecc17b8d6fc 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1636,9 +1636,15 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || !bch2_snapshot_is_leaf(c, id)) continue; +again: + ret = btree_trans_too_many_iters(trans) ?: + bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto again; + } - ret = commit_do(trans, NULL, NULL, 0, - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); if (ret) break; } -- cgit From 1ee608c65d652af30cf69eaca425d8a5c06712d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 17:20:39 -0400 Subject: bcachefs: Fall back to requesting passphrase directly We 
can only do this in userspace, unfortunately - but kernel keyrings have never seemed to work reliably, so this is a useful fallback. Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 1948119edbf4..50bf4a58f37d 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -534,16 +534,31 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; + key_id = request_key("user", key_description, NULL, + KEY_SPEC_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); - if (key_id < 0) - return -errno; + if (key_id >= 0) + goto got_key; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + + return -errno; +got_key: if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) return -1; return 0; } + +#include "../crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) @@ -556,6 +571,20 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ret = __bch2_request_key(key_description.buf, key); printbuf_exit(&key_description); + +#ifndef __KERNEL__ + if (ret) { + char *passphrase = read_passphrase("Enter passphrase: "); + struct bch_encrypted_key sb_key; + + bch2_passphrase_check(sb, passphrase, + key, &sb_key); + ret = 0; + } +#endif + + /* stash with memfd, pass memfd fd to mount */ + return ret; } -- cgit From 7dcf62c06d11195e8caecd7b2236aa5b07e3ef8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 17:21:21 -0400 Subject: bcachefs: Make btree root read errors recoverable The entire btree will be lost, but that is better than the entire filesystem not being recoverable. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1dceb7eeb205..9dbaf080dcdd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -374,13 +374,12 @@ static int read_btree_roots(struct bch_fs *c) ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - "error reading btree root %s", - bch2_btree_ids[i]); + fsck_err(c, + "error reading btree root %s", + bch2_btree_ids[i]); if (btree_id_is_alloc(i)) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + ret = 0; } } -- cgit From 4220666398de7f5127bab5437b5276b3eb155282 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Sep 2023 14:44:56 -0400 Subject: bcachefs: Fix bch2_inode_delete_keys() bch2_inode_delete_keys() was using BTREE_ITER_NOT_EXTENTS, on the assumption that it would never need to split extents. But that caused a race with extents being split by other threads - specifically, the data move path. Extents iterators have the iterator position pointing to the start of the extent, which avoids the race.
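A toy userspace illustration of the positioning detail the fix relies on (all names and values here are invented; this is not bcachefs code): an extent is indexed by its end position, an extent iterator sits at its start, and a whiteout sized from the iterator position up to min(extent end, range end) covers the peeked key in full.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t iter_pos = 64;		/* extent iterator: start of the peeked extent */
	uint64_t extent_end = 192;	/* the key itself is addressed by its end */
	uint64_t range_end = 4096;	/* end of the inode range being deleted */

	uint64_t whiteout_size = min_u64(extent_end, range_end) - iter_pos;

	printf("whiteout covers [%" PRIu64 ", %" PRIu64 ")\n",
	       iter_pos, iter_pos + whiteout_size);
	return 0;
}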
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 4a695a8e7a3b..bb3f443d8381 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -780,6 +780,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct bkey_i delete; + struct bpos end = POS(inum.inum, U64_MAX); u32 snapshot; int ret = 0; @@ -788,7 +789,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_INTENT); while (1) { bch2_trans_begin(trans); @@ -799,7 +800,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + k = bch2_btree_iter_peek_upto(&iter, end); ret = bkey_err(k); if (ret) goto err; @@ -810,6 +811,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -- cgit From 3c40841cdcf374d661ff14e7a707681342565166 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 27 Sep 2023 07:23:37 -0400 Subject: bcachefs: fix crc32c checksum merge byte order problem An fsstress task on a big endian system (s390x) quickly produces a bunch of CRC errors in the system logs. Most of these are related to the narrow CRCs path, but the fundamental problem can be reduced to a single write and re-read (after dropping caches) of a previously merged extent. The key merge path that handles extent merges eventually calls into bch2_checksum_merge() to combine the CRCs of the associated extents. This code attempts to avoid a byte order swap by feeding the le64 values into the crc32c code, but the latter casts the resulting u64 value down to a u32, which truncates the high bytes where the actual crc value ends up. This results in a CRC value that does not change (since it is merged with a CRC of 0), and checksum failures ensue. Fix the checksum merge code to swap to cpu byte order on the boundaries to the external crc code such that any value casting is handled properly. 
Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 50bf4a58f37d..839f00dce50f 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -362,7 +362,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, state.type = type; bch2_checksum_init(&state); - state.seed = (u64 __force) a.lo; + state.seed = le64_to_cpu(a.lo); BUG_ON(!bch2_checksum_mergeable(type)); @@ -373,7 +373,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, page_address(ZERO_PAGE(0)), page_len); b_len -= page_len; } - a.lo = (__le64 __force) bch2_checksum_final(&state); + a.lo = cpu_to_le64(bch2_checksum_final(&state)); a.lo ^= b.lo; a.hi ^= b.hi; return a; -- cgit From 73bbeaa2de1d429590a1b5ddd706dfeaf6d7d0e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Sep 2023 19:51:29 -0400 Subject: bcachefs: bucket_lock() is now a sleepable lock fsck_err() may sleep - it takes a mutex and may allocate memory, so bucket_lock() needs to be a sleepable lock. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 -- fs/bcachefs/buckets.h | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index e7f4506f69ca..46b6406d772b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -367,7 +367,6 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct printbuf buf = PRINTBUF; percpu_down_read(&c->mark_lock); - buf.atomic++; idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && @@ -795,7 +794,6 @@ static int mark_stripe_bucket(struct btree_trans *trans, /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); - buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); if (g->dirty_sectors || diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ecbeb7280f87..bf8d7f407e9c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -70,12 +70,15 @@ union ulong_byte_assert { static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - bit_spin_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); } static inline void bucket_lock(struct bucket *b) { - bit_spin_lock(BUCKET_LOCK_BITNR, (void *) &b->lock); + wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); } static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -- cgit From 1241df5872b439ff1e5a8c70e0ce82345264904f Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Sun, 24 Sep 2023 23:55:37 -0400 Subject: bcachefs: Add new helper to retrieve bch_member from sb Prep work for introducing bch_sb_field_members_v2 - introduce new helpers that will check for members_v2 if it exists, otherwise using v1 Signed-off-by: Hunter Shaffer Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 22 +++++++++------------- fs/bcachefs/journal_sb.c | 20 ++++++++++---------- fs/bcachefs/replicas.c | 3 +-- fs/bcachefs/sb-members.c | 17 +++++++++++++++++ fs/bcachefs/sb-members.h | 2 ++ fs/bcachefs/super-io.c | 19 ++++++------------- fs/bcachefs/super-io.h | 9 ++++++--- fs/bcachefs/super.c | 35 +++++++++++------------------------ 8 files changed, 62 insertions(+), 65 deletions(-) (limited to 'fs') diff --git 
a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index b292dbef7992..1bf47a6209dd 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -25,19 +25,18 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned nr_groups = disk_groups_nr(groups); unsigned i, len; int ret = 0; for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; + struct bch_member m = bch2_sb_member_get(sb, i); unsigned group_id; - if (!BCH_MEMBER_GROUP(m)) + if (!BCH_MEMBER_GROUP(&m)) continue; - group_id = BCH_MEMBER_GROUP(m) - 1; + group_id = BCH_MEMBER_GROUP(&m) - 1; if (group_id >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", @@ -152,14 +151,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_sb_field_disk_groups *groups; struct bch_disk_groups_cpu *cpu_g, *old_g; unsigned i, g, nr_groups; lockdep_assert_held(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); groups = bch2_sb_get_disk_groups(c->disk_sb.sb); nr_groups = disk_groups_nr(groups); @@ -182,13 +179,13 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) } for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(m)) + if (!bch2_member_exists(&m)) continue; - g = BCH_MEMBER_GROUP(m); + g = BCH_MEMBER_GROUP(&m); while (g) { dst = &cpu_g->entries[g - 1]; __set_bit(i, dst->devs.d); @@ -528,12 +525,11 @@ void bch2_opt_target_to_text(struct printbuf *out, rcu_read_unlock(); } else { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_member *m = mi->members + t.dev; + struct bch_member m = bch2_sb_member_get(sb, t.dev); - if (bch2_dev_exists(sb, mi, t.dev)) { + if (bch2_dev_exists(sb, t.dev)) { prt_printf(out, "Device "); - pr_uuid(out, m->uuid.b); + pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); } else { prt_printf(out, "Bad device %u", t.dev); diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index cc41bff86d6b..3c5a95e29463 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -21,7 +21,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; @@ -45,15 +45,15 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, goto err; } - if (b[0] < le16_to_cpu(m->first_bucket)) { + if (b[0] < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m->first_bucket)); + b[0], le16_to_cpu(m.first_bucket)); goto err; } - if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m->nbuckets)); + b[nr - 1], le64_to_cpu(m.nbuckets)); goto err; } @@ -104,7 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, 
journal_v2); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; @@ -130,15 +130,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, goto err; } - if (b[0].start < le16_to_cpu(m->first_bucket)) { + if (b[0].start < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m->first_bucket)); + b[0].start, le16_to_cpu(m.first_bucket)); goto err; } - if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); goto err; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index dbef41cd8593..a9a694fb0b18 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -805,7 +805,6 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_sb *sb, struct printbuf *err) { - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned i, j; sort_cmp_size(cpu_r->entries, @@ -837,7 +836,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } for (j = 0; j < e->nr_devs; j++) - if (!bch2_dev_exists(sb, mi, e->devs[j])) { + if (!bch2_dev_exists(sb, e->devs[j])) { prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); return -BCH_ERR_invalid_sb_replicas; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 16a2b3389525..0be51d082043 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -7,6 +7,23 @@ #include "super-io.h" /* Code for bch_sb_field_members: */ +static struct bch_member *members_v1_get_mut(struct bch_sb_field_members *mi, int i) +{ + return mi->members + i; +} + +static struct bch_member members_v1_get(struct bch_sb_field_members *mi, int i) +{ + struct bch_member ret, *p = members_v1_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, sizeof(struct bch_member), sizeof(ret))); return ret; +} + +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) +{ + struct bch_sb_field_members *mi1 = bch2_sb_get_members(sb); + return members_v1_get(mi1, i); +} static int bch2_sb_members_validate(struct bch_sb *sb, struct bch_sb_field *f, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 34e1cf6046e3..85d1af6a990c 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); + static inline bool bch2_dev_is_online(struct bch_dev *ca) { return !percpu_ref_is_zero(&ca->io_ref); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c9bf342d14aa..16e3a8aa290d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -485,7 +485,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; - struct bch_sb_field_members *mi = bch2_sb_get_members(src); struct bch_dev *ca; unsigned i; @@ -511,8 +510,10 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); - for_each_member_device(ca, c, i) - ca->mi = bch2_mi_to_cpu(mi->members + i); + 
for_each_member_device(ca, c, i) { + struct bch_member m = bch2_sb_member_get(src, i); + ca->mi = bch2_mi_to_cpu(&m); + } } static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) @@ -1125,7 +1126,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - struct bch_sb_field_members *mi; struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -1133,15 +1133,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - mi = bch2_sb_get_members(sb); - if (mi) { - struct bch_member *m; - - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) - nr_devices += bch2_member_exists(m); - } + for (int i = 0; i < sb->nr_devices; i++) + nr_devices += bch2_dev_exists(sb, i); prt_printf(out, "External UUID:"); prt_tab(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index d51c0a19586f..817c3d790acd 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -6,6 +6,7 @@ #include "eytzinger.h" #include "super_types.h" #include "super.h" +#include "sb-members.h" #include @@ -97,11 +98,13 @@ static inline bool bch2_member_exists(struct bch_member *m) } static inline bool bch2_dev_exists(struct bch_sb *sb, - struct bch_sb_field_members *mi, unsigned dev) { - return dev < sb->nr_devices && - bch2_member_exists(&mi->members[dev]); + if (dev < sb->nr_devices) { + struct bch_member m = bch2_sb_member_get(sb, dev); + return bch2_member_exists(&m); + } + return false; } static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1347270e5045..7ba1ebabc176 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -49,6 +49,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-members.h" #include "snapshot.h" #include "subvolume.h" #include "super.h" @@ -662,7 +663,6 @@ err: static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { - struct bch_sb_field_members *mi; struct bch_fs *c; struct printbuf name = PRINTBUF; unsigned i, iter_size; @@ -858,9 +858,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + if (bch2_dev_exists(c->disk_sb.sb, i) && bch2_dev_alloc(c, i)) { ret = -EEXIST; goto err; @@ -997,16 +996,12 @@ err: static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { - struct bch_sb_field_members *sb_mi; - - sb_mi = bch2_sb_get_members(sb); - if (!sb_mi) - return -BCH_ERR_member_info_missing; + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) return -BCH_ERR_mismatched_block_size; - if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) return -BCH_ERR_bucket_size_too_small; @@ -1017,12 +1012,11 @@ static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) { struct bch_sb *newest = le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; - struct bch_sb_field_members *mi = bch2_sb_get_members(newest); if (!uuid_equal(&fs->uuid, &sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, mi, sb->dev_idx)) + if (!bch2_dev_exists(newest, sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->block_size != sb->block_size) @@ -1192,15 +1186,14 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { - struct bch_member *member = - bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; + struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; - ca = __bch2_dev_alloc(c, member); + ca = __bch2_dev_alloc(c, &member); if (!ca) goto err; @@ -1335,7 +1328,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned i, flags = 0; @@ -1348,10 +1340,9 @@ static bool bch2_fs_may_start(struct bch_fs *c) if (!c->opts.degraded && !c->opts.very_degraded) { mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) + if (!bch2_dev_exists(c->disk_sb.sb, i)) continue; ca = bch_dev_locked(c, i); @@ -1588,7 +1579,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } - dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); @@ -1644,9 +1635,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - mi = bch2_sb_get_members(c->disk_sb.sb); for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) goto have_slot; no_slot: ret = -BCH_ERR_ENOSPC_sb_members; @@ -1875,7 +1865,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, { struct bch_sb_handle *sb = NULL; struct bch_fs *c = NULL; - struct bch_sb_field_members *mi; unsigned i, best_sb = 0; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1906,12 +1895,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, le64_to_cpu(sb[best_sb].sb->seq)) best_sb = i; - mi = bch2_sb_get_members(sb[best_sb].sb); - i = 0; while (i < nr_devices) { if (i != best_sb && - !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { + !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) { pr_info("%pg has been removed, skipping", sb[i].bdev); bch2_free_super(&sb[i]); array_remove_item(sb, nr_devices, i); -- cgit From 3f7b9713dae09df31067c6d1d33c6eb1f617529f Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Mon, 25 Sep 2023 00:02:56 -0400 Subject: bcachefs: New superblock section members_v2 members_v2 has dynamically resizable entries so that we can extend bch_member. The members can no longer be accessed with simple array indexing. Instead, members_v2_get is used to find a member's exact location within the array and returns a copy of that member. Alternatively, members_v2_get_mut retrieves a mutable pointer to a member.
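A minimal userspace sketch of the stride-based access pattern described above; the struct and field names echo the patch but are simplified stand-ins, not the in-tree definitions. Each entry is member_bytes wide on disk, and a read copies at most that many bytes into a full-size struct, so fields added by newer versions read back as zero from older, smaller entries. In the patch itself the same copy-by-value read is what lets the fixed-size v1 layout and the member_bytes-sized v2 layout share one bch_member consumer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct member {
	uint64_t nbuckets;
	uint16_t first_bucket;
	uint16_t bucket_size;
	/* newer fields would be appended here */
};

struct member_table {
	uint16_t member_bytes;		/* size of one on-disk entry */
	unsigned char entries[];	/* nr_devices entries, each member_bytes wide */
};

static struct member member_get(const struct member_table *t, unsigned i)
{
	struct member ret;
	size_t n = t->member_bytes < sizeof(ret) ? t->member_bytes : sizeof(ret);

	memset(&ret, 0, sizeof(ret));
	memcpy(&ret, t->entries + (size_t) i * t->member_bytes, n);
	return ret;
}

int main(void)
{
	struct member m0 = { .nbuckets = 1024, .first_bucket = 16, .bucket_size = 512 };
	struct member_table *t = calloc(1, sizeof(*t) + sizeof(m0));

	t->member_bytes = sizeof(struct member);
	memcpy(t->entries, &m0, sizeof(m0));

	struct member got = member_get(t, 0);
	printf("device 0: %llu buckets\n", (unsigned long long) got.nbuckets);

	free(t);
	return 0;
}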
Signed-off-by: Hunter Shaffer Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/bcachefs_format.h | 14 +- fs/bcachefs/disk_groups.c | 2 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/sb-members.c | 403 +++++++++++++++++++++++++++-------------- fs/bcachefs/sb-members.h | 5 + fs/bcachefs/super-io.c | 1 + fs/bcachefs/super.c | 54 +++--- fs/bcachefs/sysfs.c | 4 +- 9 files changed, 326 insertions(+), 160 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 19ef7a444c23..4059d3d4b7f5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1922,7 +1922,7 @@ bkey_err: } mutex_lock(&c->sb_lock); - m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f0d130440baa..c1a9ba451f04 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1231,7 +1231,8 @@ struct bch_sb_field { x(replicas, 7) \ x(journal_seq_blacklist, 8) \ x(journal_v2, 9) \ - x(counters, 10) + x(counters, 10) \ + x(members_v2, 11) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1279,6 +1280,8 @@ struct bch_member { __le64 flags[2]; }; +#define BCH_MEMBER_V1_BYTES 56 + LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) @@ -1308,7 +1311,14 @@ enum bch_member_state { struct bch_sb_field_members { struct bch_sb_field field; - struct bch_member members[]; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; }; /* BCH_SB_FIELD_crypt: */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1bf47a6209dd..43aad8ba8cc9 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -440,7 +440,7 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) if (ret) return ret; - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_GROUP(mi, v + 1); return 0; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 64f7176c2a4e..7cc083776a2e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -91,6 +91,7 @@ x(ENOSPC, ENOSPC_sb_quota) \ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_members_v2) \ x(ENOSPC, ENOSPC_sb_crypt) \ x(ENOSPC, ENOSPC_btree_slot) \ x(ENOSPC, ENOSPC_snapshot_tree) \ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 0be51d082043..bf9c6c60530a 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -7,9 +7,28 @@ #include "super-io.h" /* Code for bch_sb_field_members: */ + +static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i) +{ + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); +} + +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) +{ + return members_v2_get_mut(bch2_sb_get_members_v2(sb), i); +} + +static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +{ + struct bch_member ret, *p = 
members_v2_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); + return ret; +} + static struct bch_member *members_v1_get_mut(struct bch_sb_field_members *mi, int i) { - return mi->members + i; + return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); } static struct bch_member members_v1_get(struct bch_sb_field_members *mi, int i) @@ -21,170 +40,292 @@ static struct bch_member members_v1_get(struct bch_sb_field_members *mi, int i) struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) { + struct bch_sb_field_members_v2 *mi2 = bch2_sb_get_members_v2(sb); + if (mi2) + return members_v2_get(mi2, i); struct bch_sb_field_members *mi1 = bch2_sb_get_members(sb); return members_v1_get(mi1, i); } -static int bch2_sb_members_validate(struct bch_sb *sb, +static int sb_members_v2_resize_entries(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_get_members_v2(c->disk_sb.sb); + + if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { + unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * + c->disk_sb.sb->nr_devices), 8); + + mi = bch2_sb_resize_members_v2(&c->disk_sb, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members_v2; + + for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { + void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); + memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memset(dst + le16_to_cpu(mi->member_bytes), + 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); + } + mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); + } + return 0; +} + +int bch2_members_v2_init(struct bch_fs *c) +{ + struct bch_sb_field_members *mi1; + struct bch_sb_field_members_v2 *mi2; + + if (!bch2_sb_get_members_v2(c->disk_sb.sb)) { + mi2 = bch2_sb_resize_members_v2(&c->disk_sb, + DIV_ROUND_UP(sizeof(*mi2) + + sizeof(struct bch_member) * c->sb.nr_devices, + sizeof(u64))); + mi1 = bch2_sb_get_members(c->disk_sb.sb); + memcpy(&mi2->_members[0], &mi1->_members[0], + BCH_MEMBER_V1_BYTES * c->sb.nr_devices); + memset(&mi2->pad[0], 0, sizeof(mi2->pad)); + mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); + } + + return sb_members_v2_resize_entries(c); +} + +int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) +{ + struct bch_sb_field_members *mi1; + struct bch_sb_field_members_v2 *mi2; + + mi1 = bch2_sb_resize_members(disk_sb, + DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * + disk_sb->sb->nr_devices, sizeof(u64))); + if (!mi1) + return -BCH_ERR_ENOSPC_sb_members; + + mi2 = bch2_sb_get_members_v2(disk_sb->sb); + + for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) + memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + + return 0; +} + +static int validate_member(struct printbuf *err, + struct bch_member m, + struct bch_sb *sb, + int i) +{ + if (le64_to_cpu(m.nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m.nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m.nbuckets) - + le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m.bucket_size), 
le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + + return 0; +} + +static void member_to_text(struct printbuf *out, + struct bch_member m, + struct bch_sb_field_disk_groups *gi, + struct bch_sb *sb, + int i) +{ + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m.bucket_size); + u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; + + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m.uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m.last_mount) + pr_time(out, le64_to_cpu(m.last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(&m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(&m)) { + unsigned idx = BCH_MEMBER_GROUP(&m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(&m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_members *mi = field_to_type(f, members); unsigned i; - if ((void *) (mi->members + sb->nr_devices) > + if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { prt_printf(err, "too many devices for section size"); return -BCH_ERR_invalid_sb_members; } for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - - if (!bch2_member_exists(m)) - continue; - - if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - 
prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } + struct bch_member m = members_v1_get(mi, i); - if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } + int ret = validate_member(err, m, sb, i); + if (ret) + return ret; } return 0; } -static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) +static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { struct bch_sb_field_members *mi = field_to_type(f, members); struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); unsigned i; for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m->bucket_size); - u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; - - if (!bch2_member_exists(m)) - continue; - - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); - - printbuf_indent_add(out, 2); - - prt_printf(out, "UUID:"); - prt_tab(out); - pr_uuid(out, m->uuid.b); - prt_newline(out); - - prt_printf(out, "Size:"); - prt_tab(out); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - prt_printf(out, "Bucket size:"); - prt_tab(out); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); - prt_newline(out); - - prt_printf(out, "Last mount:"); - prt_tab(out); - if (m->last_mount) - pr_time(out, le64_to_cpu(m->last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", - BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(m)] - : "unknown"); - prt_newline(out); - - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(m)) { - unsigned idx = BCH_MEMBER_GROUP(m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { - prt_printf(out, "(none)"); - } - prt_newline(out); + struct bch_member m = members_v1_get(mi, i); + member_to_text(out, m, gi, sb, i); + } +} - prt_printf(out, "Data allowed:"); - prt_tab(out); - if (BCH_MEMBER_DATA_ALLOWED(m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); - else - prt_printf(out, "(none)"); - prt_newline(out); +const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_members_v1_validate, + .to_text = bch2_sb_members_v1_to_text, +}; - prt_printf(out, "Has data:"); - prt_tab(out); - if (data_have) - prt_bitflags(out, bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); +static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); - prt_newline(out); + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member m = members_v2_get(mi, i); + member_to_text(out, m, gi, sb, i); + } +} + +static int bch2_sb_members_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) - + (void *) mi; - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); - prt_newline(out); + if (mi_bytes > vstruct_bytes(&mi->field)) { + prt_printf(err, "section too small (%zu > %zu)", + mi_bytes, vstruct_bytes(&mi->field)); + return -BCH_ERR_invalid_sb_members; + } - printbuf_indent_sub(out, 2); + for (unsigned i = 0; i < sb->nr_devices; i++) { + int ret = validate_member(err, members_v2_get(mi, i), sb, i); + if (ret) + return ret; } + + return 0; } -const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_members_validate, - .to_text = bch2_sb_members_to_text, +const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { + .validate = bch2_sb_members_v2_validate, + .to_text = bch2_sb_members_v2_to_text, }; diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 85d1af6a990c..72958facbb4f 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,6 +2,9 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H +int bch2_members_v2_init(struct bch_fs *c); +int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) @@ -173,6 +176,8 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) return devs; } +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; + extern const struct bch_sb_field_ops bch_sb_field_ops_members; #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 16e3a8aa290d..8bb9c9a67fe5 100644 --- a/fs/bcachefs/super-io.c +++ 
b/fs/bcachefs/super-io.c @@ -892,6 +892,7 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); bch2_sb_counters_from_cpu(c); + bch_members_cpy_v2_v1(&c->disk_sb); for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7ba1ebabc176..c91c3f7e3328 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -400,6 +400,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); + ret = bch2_members_v2_init(c); + if (ret) + goto err; + ret = bch2_fs_mark_dirty(c); if (ret) goto err; @@ -924,7 +928,6 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - struct bch_sb_field_members *mi; struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); unsigned i; @@ -938,12 +941,17 @@ int bch2_fs_start(struct bch_fs *c) mutex_lock(&c->sb_lock); + ret = bch2_members_v2_init(c); + if (ret) { + mutex_unlock(&c->sb_lock); + goto err; + } + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); - mi = bch2_sb_get_members(c->disk_sb.sb); for_each_online_member(ca, c, i) - mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); @@ -1382,7 +1390,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { - struct bch_sb_field_members *mi; + struct bch_member *m; int ret = 0; if (ca->mi.state == new_state) @@ -1397,8 +1405,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_STATE(m, new_state); bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1454,7 +1462,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { - struct bch_sb_field_members *mi; + struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; int ret; @@ -1542,8 +1550,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * this device must be gone: */ mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); @@ -1566,7 +1574,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; - struct bch_sb_field_members *mi; + struct bch_sb_field_members_v2 *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; @@ -1622,9 +1630,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } - mi = bch2_sb_get_members(ca->disk_sb.sb); + mi = bch2_sb_get_members_v2(ca->disk_sb.sb); - if (!bch2_sb_resize_members(&ca->disk_sb, + if (!bch2_sb_resize_members_v2(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { ret = -BCH_ERR_ENOSPC_sb_members; @@ -1645,20 +1653,21 @@ no_slot: have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - u64s = (sizeof(struct bch_sb_field_members) + - 
sizeof(struct bch_member) * nr_devices) / sizeof(u64); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - mi = bch2_sb_resize_members(&c->disk_sb, u64s); + mi = bch2_sb_resize_members_v2(&c->disk_sb, u64s); if (!mi) { ret = -BCH_ERR_ENOSPC_sb_members; bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); /* success: */ - mi->members[dev_idx] = dev_mi; - mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); + *m = dev_mi; + m->last_mount = cpu_to_le64(ktime_get_real_seconds()); c->disk_sb.sb->nr_devices = nr_devices; ca->disk_sb.sb->dev_idx = dev_idx; @@ -1718,7 +1727,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; - struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; int ret; @@ -1755,9 +1763,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - mi->members[ca->dev_idx].last_mount = + m->last_mount = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); @@ -1799,7 +1807,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bch_member *mi; + struct bch_member *m; int ret = 0; down_write(&c->state_lock); @@ -1829,8 +1837,8 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) goto err; mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - mi->nbuckets = cpu_to_le64(nbuckets); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(nbuckets); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b700be5b6664..eb764b9a4629 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -962,7 +962,7 @@ STORE(bch2_dev) bool v = strtoul_or_return(buf); mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); if (v != BCH_MEMBER_DISCARD(mi)) { SET_BCH_MEMBER_DISCARD(mi, v); @@ -975,7 +975,7 @@ STORE(bch2_dev) u64 v = strtoul_or_return(buf); mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { SET_BCH_MEMBER_DURABILITY(mi, v + 1); -- cgit From 9af26120f06e03ee8cdd0f7d9f767f537f5148bd Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Mon, 25 Sep 2023 00:06:32 -0400 Subject: bcachefs: Rename bch_sb_field_members -> bch_sb_field_members_v1 Signed-off-by: Hunter Shaffer Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 6 +++--- fs/bcachefs/sb-members.c | 22 +++++++++++----------- fs/bcachefs/sb-members.h | 3 +-- fs/bcachefs/super-io.c | 6 +++--- fs/bcachefs/super-io.h | 2 +- 5 files changed, 19 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c1a9ba451f04..45701b1230b7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1222,7 +1222,7 @@ struct bch_sb_field { #define BCH_SB_FIELDS() \ x(journal, 0) \ - x(members, 1) \ + 
x(members_v1, 1) \ x(crypt, 2) \ x(replicas_v0, 3) \ x(quota, 4) \ @@ -1265,7 +1265,7 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members: */ +/* BCH_SB_FIELD_members_v1: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -1309,7 +1309,7 @@ enum bch_member_state { BCH_MEMBER_STATE_NR }; -struct bch_sb_field_members { +struct bch_sb_field_members_v1 { struct bch_sb_field field; struct bch_member _members[]; //Members are now variable size }; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index bf9c6c60530a..b9a75eb5a8eb 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -6,7 +6,7 @@ #include "sb-members.h" #include "super-io.h" -/* Code for bch_sb_field_members: */ +/* Code for bch_sb_field_members_v1: */ static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i) { @@ -26,12 +26,12 @@ static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int return ret; } -static struct bch_member *members_v1_get_mut(struct bch_sb_field_members *mi, int i) +static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) { return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); } -static struct bch_member members_v1_get(struct bch_sb_field_members *mi, int i) +static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) { struct bch_member ret, *p = members_v1_get_mut(mi, i); memset(&ret, 0, sizeof(ret)); @@ -43,7 +43,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) struct bch_sb_field_members_v2 *mi2 = bch2_sb_get_members_v2(sb); if (mi2) return members_v2_get(mi2, i); - struct bch_sb_field_members *mi1 = bch2_sb_get_members(sb); + struct bch_sb_field_members_v1 *mi1 = bch2_sb_get_members_v1(sb); return members_v1_get(mi1, i); } @@ -72,7 +72,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) int bch2_members_v2_init(struct bch_fs *c) { - struct bch_sb_field_members *mi1; + struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; if (!bch2_sb_get_members_v2(c->disk_sb.sb)) { @@ -80,7 +80,7 @@ int bch2_members_v2_init(struct bch_fs *c) DIV_ROUND_UP(sizeof(*mi2) + sizeof(struct bch_member) * c->sb.nr_devices, sizeof(u64))); - mi1 = bch2_sb_get_members(c->disk_sb.sb); + mi1 = bch2_sb_get_members_v1(c->disk_sb.sb); memcpy(&mi2->_members[0], &mi1->_members[0], BCH_MEMBER_V1_BYTES * c->sb.nr_devices); memset(&mi2->pad[0], 0, sizeof(mi2->pad)); @@ -92,10 +92,10 @@ int bch2_members_v2_init(struct bch_fs *c) int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) { - struct bch_sb_field_members *mi1; + struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; - mi1 = bch2_sb_resize_members(disk_sb, + mi1 = bch2_sb_resize_members_v1(disk_sb, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); if (!mi1) @@ -251,7 +251,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { - struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; if ((void *) members_v1_get_mut(mi, sb->nr_devices) > @@ -274,7 +274,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb, static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { - struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = 
bch2_sb_get_disk_groups(sb); unsigned i; @@ -284,7 +284,7 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, } } -const struct bch_sb_field_ops bch_sb_field_ops_members = { +const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { .validate = bch2_sb_members_v1_validate, .to_text = bch2_sb_members_v1_to_text, }; diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 72958facbb4f..430f3457bfd4 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -176,8 +176,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) return devs; } +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; -extern const struct bch_sb_field_ops bch_sb_field_ops_members; - #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8bb9c9a67fe5..4498e24dabdb 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -355,7 +355,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; - struct bch_sb_field_members *mi; + struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; u16 block_size; int ret; @@ -458,7 +458,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } /* members must be validated first: */ - mi = bch2_sb_get_members(sb); + mi = bch2_sb_get_members_v1(sb); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; @@ -469,7 +469,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; vstruct_for_each(sb, f) { - if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; ret = bch2_sb_field_validate(sb, f, out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 817c3d790acd..f7e9099931a7 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -90,7 +90,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_members: */ +/* BCH_SB_FIELD_members_v1: */ static inline bool bch2_member_exists(struct bch_member *m) { -- cgit From 40f7914e8dc87f8530ebbd853036cb370656b947 Mon Sep 17 00:00:00 2001 From: Hunter Shaffer Date: Mon, 25 Sep 2023 00:46:28 -0400 Subject: bcachefs: Add iops fields to bch_member Signed-off-by: Hunter Shaffer Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 28 +++++++++++++++++++++------- fs/bcachefs/opts.c | 5 +++++ fs/bcachefs/opts.h | 1 + fs/bcachefs/sb-members.c | 8 ++++++++ 4 files changed, 35 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 45701b1230b7..99749f3315fe 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1269,6 +1269,19 @@ struct bch_sb_field_journal_v2 { #define BCH_MIN_NR_NBUCKETS (1 << 6) +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + struct bch_member { __uuid_t uuid; __le64 nbuckets; /* device size */ @@ -1277,19 +1290,20 @@ struct bch_member { __le32 pad; __le64 last_mount; /* time_t */ - __le64 flags[2]; + __le64 flags; + __le32 iops[4]; }; #define 
BCH_MEMBER_V1_BYTES 56 -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags[0], 30, 31) + struct bch_member, flags, 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 739a2ef80945..232f50c73a94 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -12,6 +12,11 @@ #define x(t, n, ...) [n] = #t, +const char * const bch2_iops_measurements[] = { + BCH_IOPS_MEASUREMENTS() + NULL +}; + const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() NULL diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c21c258e4018..55014336c5f7 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -10,6 +10,7 @@ struct bch_fs; +extern const char * const bch2_iops_measurements[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index b9a75eb5a8eb..04bde1aaff9f 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "disk_groups.h" +#include "opts.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" @@ -172,6 +173,13 @@ static void member_to_text(struct printbuf *out, prt_units_u64(out, device_size << 9); prt_newline(out); + for (unsigned i = 0; i < BCH_IOPS_NR; i++) { + prt_printf(out, "%s iops:", bch2_iops_measurements[i]); + prt_tab(out); + prt_printf(out, "%u", le32_to_cpu(m.iops[i])); + prt_newline(out); + } + prt_printf(out, "Bucket size:"); prt_tab(out); prt_units_u64(out, bucket_size << 9); -- cgit From dc08c661a291f5e479fdde8322a4c295c69a3aef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Sep 2023 00:50:27 -0400 Subject: bcachefs: Use strsep() in split_devs() Minor refactoring to fix a smatch complaint. 
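[Aside: the patch below replaces an open-coded strchr()-and-terminate loop in split_devs() with strsep(). For readers unfamiliar with the idiom, here is a minimal userspace sketch (illustrative only, not bcachefs code): strsep() advances the string pointer and NUL-terminates each token, so no manual '\0' bookkeeping is needed.]

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* split a colon-separated device list, as split_devs() does */
	char *dup = strdup("/dev/sda:/dev/sdb:/dev/sdc");
	char *p = dup, *s;
	unsigned i = 0;

	while ((s = strsep(&p, ":")))
		printf("dev %u: %s\n", i++, s);

	free(dup);	/* dup still points at the original allocation */
	return 0;
}
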
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 73a3cebd734f..bc009ef497d0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1595,7 +1595,7 @@ static struct bch_fs *bch2_path_to_fs(const char *path) static char **split_devs(const char *_dev_name, unsigned *nr) { char *dev_name = NULL, **devs = NULL, *s; - size_t i, nr_devs = 0; + size_t i = 0, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) @@ -1610,9 +1610,7 @@ static char **split_devs(const char *_dev_name, unsigned *nr) return NULL; } - for (i = 0, s = dev_name; - s; - (s = strchr(s, ':')) && (*s++ = '\0')) + while ((s = strsep(&dev_name, ":"))) devs[i++] = s; *nr = nr_devs; -- cgit From 4fc1f402c6c259b1f44e45c096ac6666925a9b87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Sep 2023 00:54:12 -0400 Subject: bcachefs: Fix another smatch complaint This should be harmless, but initialize last_seq anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9dbaf080dcdd..cd037f2e4235 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -644,7 +644,7 @@ int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; - u64 last_seq, blacklist_seq, journal_seq; + u64 last_seq = 0, blacklist_seq, journal_seq; bool write_sb = false; int ret = 0; -- cgit From 69d1f052d1675c2af7da496f0265f68673328afb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Sep 2023 17:57:21 -0400 Subject: bcachefs: Correctly initialize new buckets on device resize bch2_dev_resize() was never updated for the allocator rewrite with persistent freelists, and it wasn't noticed because the tests weren't running fsck - oops. Fix this by running bch2_dev_freespace_init() for the new buckets. 
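[Aside: a conceptual sketch of the fix, with hypothetical names rather than the bcachefs API: remember the old size before growing, then run freespace init only over the buckets the resize added.]

#include <stdio.h>

static int freespace_init(unsigned long long start, unsigned long long end)
{
	printf("initializing buckets [%llu, %llu)\n", start, end);
	return 0;
}

static int dev_resize(unsigned long long *nbuckets, unsigned long long new_nbuckets)
{
	unsigned long long old_nbuckets = *nbuckets;

	if (new_nbuckets < old_nbuckets)
		return -1;	/* shrinking not supported */

	*nbuckets = new_nbuckets;
	/* the bug was skipping this step, leaving the new buckets unknown
	 * to the freespace btree until fsck noticed */
	return freespace_init(old_nbuckets, new_nbuckets);
}

int main(void)
{
	unsigned long long nbuckets = 1024;

	dev_resize(&nbuckets, 2048);
	return 0;
}
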
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 ++++++++++++--------- fs/bcachefs/alloc_background.h | 1 + fs/bcachefs/super.c | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4059d3d4b7f5..2d516207e223 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1831,29 +1831,33 @@ void bch2_do_invalidates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - unsigned long *last_updated) +int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + u64 bucket_start, u64 bucket_end) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; - struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); + struct bpos end = POS(ca->dev_idx, bucket_end); struct bch_member *m; + unsigned long last_updated = jiffies; int ret; + BUG_ON(bucket_start > bucket_end); + BUG_ON(bucket_end > ca->mi.nbuckets); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_PREFETCH); + POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), + BTREE_ITER_PREFETCH); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { - if (*last_updated + HZ * 10 < jiffies) { + if (last_updated + HZ * 10 < jiffies) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); - *last_updated = jiffies; + last_updated = jiffies; } bch2_trans_begin(trans); @@ -1935,7 +1939,6 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; - unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -1951,7 +1954,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, &last_updated); + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { percpu_ref_put(&ca->ref); bch_err_fn(c, ret); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c0914feb54b5..97042067d2a9 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -245,6 +245,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); } +int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c91c3f7e3328..2014f7816f66 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1808,9 +1808,11 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; + u64 old_nbuckets; int ret = 0; down_write(&c->state_lock); + old_nbuckets = ca->mi.nbuckets; if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); @@ -1843,6 +1845,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bch2_write_super(c); mutex_unlock(&c->sb_lock); + if (ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + if (ret) + goto err; + + /* + * XXX: this is all wrong 
transactionally - we'll be able to do + * this correctly after the disk space accounting rewrite + */ + ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; + } + bch2_recalc_capacity(c); err: up_write(&c->state_lock); -- cgit From c2d81c24123361e5092c88e67d790097308c5b95 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 1 Oct 2023 09:13:54 +0200 Subject: bcachefs: Use struct_size() Use struct_size() instead of hand writing it. This is less verbose and more robust. While at it, prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). Signed-off-by: Christophe JAILLET Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 3 +-- fs/bcachefs/super_types.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 43aad8ba8cc9..3c8aa69b67ea 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -163,8 +163,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) if (!groups) return 0; - cpu_g = kzalloc(sizeof(*cpu_g) + - sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); + cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) return -BCH_ERR_ENOMEM_disk_groups_to_cpu; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 597a8db73585..78d6138db62d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -46,7 +46,7 @@ struct bch_disk_group_cpu { struct bch_disk_groups_cpu { struct rcu_head rcu; unsigned nr; - struct bch_disk_group_cpu entries[]; + struct bch_disk_group_cpu entries[] __counted_by(nr); }; #endif /* _BCACHEFS_SUPER_TYPES_H */ -- cgit From 88d39fd544d44e1243668eb192ff1cc5cdc2d770 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Oct 2023 13:47:31 -0400 Subject: bcachefs: Switch to unsafe_memcpy() in a few places The new fortify checking doesn't work for us in all places; this switches to unsafe_memcpy() where appropriate to silence a few warnings/errors. 
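[Aside: both the struct_size() change above and the unsafe_memcpy() change below revolve around structures that end in a flexible array member. A standalone userspace sketch of that pattern (illustrative only, not kernel code): the allocation size is the header plus n trailing elements, which is exactly what the kernel's struct_size() computes with overflow checking, and writes into the trailing array are what fortify-source bounds checking is suspicious of.]

#include <stdio.h>
#include <stdlib.h>

struct entries {
	unsigned nr;		/* kernel code would annotate: entries[] __counted_by(nr) */
	unsigned entries[];	/* flexible array member */
};

int main(void)
{
	unsigned n = 4;
	/* struct_size(e, entries, n) expresses this arithmetic with
	 * overflow checking instead of open-coding it: */
	struct entries *e = calloc(1, sizeof(*e) + n * sizeof(e->entries[0]));

	if (!e)
		return 1;

	e->nr = n;
	for (unsigned i = 0; i < n; i++)
		e->entries[i] = i;

	printf("nr=%u last=%u\n", e->nr, e->entries[n - 1]);
	free(e);
	return 0;
}
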
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 5 +++-- fs/bcachefs/extents.h | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 46b6406d772b..a1a4b5feadaa 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -473,8 +473,9 @@ static inline int update_replicas_list(struct btree_trans *trans, d = trans->fs_usage_deltas; n = (void *) d->d + d->used; n->delta = sectors; - memcpy((void *) n + offsetof(struct replicas_delta, r), - r, replicas_entry_bytes(r)); + unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), + r, replicas_entry_bytes(r), + "flexible array member embedded in strcuct with padding"); bch2_replicas_entry_sort(&n->r); d->used += b; return 0; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 7ee8d031bb6c..db1863165d62 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -642,9 +642,11 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); + unsafe_memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr), + "Our memcpy target is relative to a zero size array ," + "compiler bounds checking doesn't work here"); k->k.u64s++; break; default: -- cgit From 6929d5e74e473fb8b15875c627f7c5ad963c1889 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Sep 2023 20:24:21 -0400 Subject: bcachefs: Fix handling of unknown bkey types min_val_size was U8_MAX for unknown key types, causing us to flag any known key as invalid - it should have been 0. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index be9f012fc7be..d9fb1fc81f1e 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -121,7 +121,6 @@ const struct bkey_ops bch2_bkey_ops[] = { }; const struct bkey_ops bch2_bkey_null_ops = { - .min_val_size = U8_MAX, }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, -- cgit From be47e0ba4fc92bd065ac0b7ad345913eea1bc6fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Sep 2023 15:49:16 -0400 Subject: bcachefs: KEY_TYPE_error now counts towards i_sectors KEY_TYPE_error is used when all replicas in an extent are marked as failed; it indicates that data was present, but has been lost. So that i_sectors doesn't change when replacing extents with KEY_TYPE_error, we now have to count error keys as allocations - this fixes fsck errors later. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index db1863165d62..978ae5e7e8b1 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -520,6 +520,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reflink_v: case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: + case KEY_TYPE_error: return true; default: return false; -- cgit From 4637429e3946d083eedde1668c17a4fadaf3c0b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Sep 2023 17:49:34 -0400 Subject: bcachefs: bch2_sb_field_get() refactoring Instead of using token pasting to generate methods for each superblock section, just make the type a parameter to bch2_sb_field_get(). 
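[Aside: to make the shape of this refactoring concrete, a toy userspace sketch with hypothetical names, not the bcachefs API: one generic lookup keyed by the field-type enum plus a single cast macro replaces a whole family of token-pasted per-type getters.]

#include <stdio.h>

enum field_type { FIELD_foo, FIELD_bar, FIELD_NR };

struct field { enum field_type type; };
struct field_foo { struct field field; int a; };
struct field_bar { struct field field; int b; };

/* one generic lookup by enum value... */
static struct field *field_get_id(struct field **fields, enum field_type type)
{
	for (unsigned i = 0; i < FIELD_NR; i++)
		if (fields[i] && fields[i]->type == type)
			return fields[i];
	return NULL;
}

/* ...and one macro doing the type conversion, instead of generating
 * field_get_foo(), field_get_bar(), ... from an x() macro list.
 * Here 'field' is the first member, so a cast stands in for the
 * container_of() the real code uses: */
#define field_get(_fields, _name) \
	((struct field_##_name *) field_get_id(_fields, FIELD_##_name))

int main(void)
{
	struct field_foo foo = { .field = { .type = FIELD_foo }, .a = 1 };
	struct field_bar bar = { .field = { .type = FIELD_bar }, .b = 2 };
	struct field *fields[] = { &foo.field, &bar.field };

	printf("foo.a=%d bar.b=%d\n",
	       field_get(fields, foo)->a,
	       field_get(fields, bar)->b);
	return 0;
}
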
Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 9 +++++---- fs/bcachefs/counters.c | 6 +++--- fs/bcachefs/disk_groups.c | 12 ++++++------ fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_sb.c | 2 +- fs/bcachefs/journal_seq_blacklist.c | 16 ++++++++-------- fs/bcachefs/quota.c | 6 +++--- fs/bcachefs/replicas.c | 16 ++++++++-------- fs/bcachefs/sb-clean.c | 4 ++-- fs/bcachefs/sb-members.c | 24 ++++++++++++------------ fs/bcachefs/super-io.c | 18 +++++++++--------- fs/bcachefs/super-io.h | 30 +++++++++--------------------- fs/bcachefs/super.c | 6 +++--- 13 files changed, 71 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 839f00dce50f..3c761ad6b1c8 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -678,7 +678,7 @@ int bch2_disable_encryption(struct bch_fs *c) mutex_lock(&c->sb_lock); - crypt = bch2_sb_get_crypt(c->disk_sb.sb); + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; @@ -712,7 +712,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) mutex_lock(&c->sb_lock); /* Do we already have an encryption key? */ - if (bch2_sb_get_crypt(c->disk_sb.sb)) + if (bch2_sb_field_get(c->disk_sb.sb, crypt)) goto err; ret = bch2_alloc_ciphers(c); @@ -740,7 +740,8 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (ret) goto err; - crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); + crypt = bch2_sb_field_resize(&c->disk_sb, crypt, + sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -BCH_ERR_ENOSPC_sb_crypt; goto err; @@ -781,7 +782,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) goto out; } - crypt = bch2_sb_get_crypt(c->disk_sb.sb); + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c index 26eb3d82b1cb..02a996e06a64 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/counters.c @@ -48,7 +48,7 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, int bch2_sb_counters_to_cpu(struct bch_fs *c) { - struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); u64 val = 0; @@ -66,13 +66,13 @@ int bch2_sb_counters_to_cpu(struct bch_fs *c) int bch2_sb_counters_from_cpu(struct bch_fs *c) { - struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_resize_counters(&c->disk_sb, + ret = bch2_sb_field_resize(&c->disk_sb, counters, sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 3c8aa69b67ea..e00133b6ea51 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -157,7 +157,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) lockdep_assert_held(&c->sb_lock); - groups = bch2_sb_get_disk_groups(c->disk_sb.sb); + groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); nr_groups = disk_groups_nr(groups); if (!groups) @@ -295,7 +295,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, const char *name, unsigned namelen) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + 
bch2_sb_field_get(sb->sb, disk_groups); unsigned i, nr_groups = disk_groups_nr(groups); struct bch_disk_group *g; @@ -313,7 +313,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, sizeof(struct bch_disk_group) * (nr_groups + 1)) / sizeof(u64); - groups = bch2_sb_resize_disk_groups(sb, u64s); + groups = bch2_sb_field_resize(sb, disk_groups, u64s); if (!groups) return -BCH_ERR_ENOSPC_disk_label_add; @@ -337,7 +337,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_field_get(sb->sb, disk_groups); int v = -1; do { @@ -367,7 +367,7 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) if (*next == '.') next++; - groups = bch2_sb_get_disk_groups(sb->sb); + groups = bch2_sb_field_get(sb->sb, disk_groups); v = __bch2_disk_group_find(groups, parent, name, len); if (v < 0) @@ -385,7 +385,7 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb); + bch2_sb_field_get(sb, disk_groups); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index fc3dd5bef386..0e7a9ffa3671 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1170,9 +1170,9 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(sb); + bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = - bch2_sb_get_journal_v2(sb); + bch2_sb_field_get(sb, journal_v2); unsigned i, nr_bvecs; ja->nr = 0; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 3c5a95e29463..ae4fb8c3a2bc 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -194,7 +194,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, if (buckets[i] + 1 != buckets[i + 1]) nr_compacted++; - j = bch2_sb_resize_journal_v2(&ca->disk_sb, + j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return -BCH_ERR_ENOSPC_sb_journal; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 1e1a79405693..f9d9aa95bf3a 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -58,8 +58,8 @@ blacklist_entry_try_merge(struct bch_fs *c, &bl->start[i + 1], sizeof(bl->start[0]) * (nr - i)); - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - sb_blacklist_u64s(nr)); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr)); BUG_ON(!bl); } @@ -79,7 +79,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) int ret = 0; mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); for (i = 0; i < nr; i++) { @@ -100,8 +100,8 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) } } - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, - sb_blacklist_u64s(nr + 1)); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr + 1)); if (!bl) { ret = 
-BCH_ERR_ENOSPC_sb_journal_seq_blacklist; goto out; @@ -158,7 +158,7 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); @@ -281,7 +281,7 @@ retry: return; mutex_lock(&c->sb_lock); - bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) goto out; @@ -306,7 +306,7 @@ retry: bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); if (new_nr != nr) { - bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, new_nr ? sb_blacklist_u64s(new_nr) : 0); BUG_ON(new_nr && !bl); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 36de2f071d80..cb68ae44d597 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -513,12 +513,12 @@ void bch2_fs_quota_init(struct bch_fs *c) static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) { - struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); if (sb_quota) return sb_quota; - sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); if (sb_quota) { unsigned qtype, qc; @@ -536,7 +536,7 @@ static void bch2_sb_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota; unsigned i, j; - sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); if (!sb_quota) return; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index a9a694fb0b18..cef2a0447b86 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -700,9 +700,9 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) struct bch_replicas_cpu new_r = { 0, 0, NULL }; int ret = 0; - if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) + if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); - else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) + else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); if (ret) return ret; @@ -732,13 +732,13 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, for_each_cpu_replicas_entry(r, src) bytes += replicas_entry_bytes(src) - 1; - sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); - sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - @@ -777,13 +777,13 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, if (!need_v1) return bch2_cpu_replicas_to_sb_replicas_v0(c, r); - sb_r = bch2_sb_resize_replicas(&c->disk_sb, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); - sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + sb_r = bch2_sb_field_get(c->disk_sb.sb, 
replicas); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - @@ -998,8 +998,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) struct bch_sb_field_replicas_v0 *replicas_v0; unsigned i, data_has = 0; - replicas = bch2_sb_get_replicas(sb); - replicas_v0 = bch2_sb_get_replicas_v0(sb); + replicas = bch2_sb_field_get(sb, replicas); + replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { struct bch_replicas_entry *r; diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index a3695e56a155..61203d7c8d36 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -137,7 +137,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) int ret; mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); if (fsck_err_on(!sb_clean, c, "superblock marked clean but clean section not present")) { @@ -359,7 +359,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); if (!sb_clean) { bch_err(c, "error resizing superblock while setting filesystem clean"); goto out; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 04bde1aaff9f..6dd85bb996fe 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -16,7 +16,7 @@ static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) { - return members_v2_get_mut(bch2_sb_get_members_v2(sb), i); + return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); } static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) @@ -41,22 +41,22 @@ static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) { - struct bch_sb_field_members_v2 *mi2 = bch2_sb_get_members_v2(sb); + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); if (mi2) return members_v2_get(mi2, i); - struct bch_sb_field_members_v1 *mi1 = bch2_sb_get_members_v1(sb); + struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); return members_v1_get(mi1, i); } static int sb_members_v2_resize_entries(struct bch_fs *c) { - struct bch_sb_field_members_v2 *mi = bch2_sb_get_members_v2(c->disk_sb.sb); + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * c->disk_sb.sb->nr_devices), 8); - mi = bch2_sb_resize_members_v2(&c->disk_sb, u64s); + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) return -BCH_ERR_ENOSPC_sb_members_v2; @@ -76,12 +76,12 @@ int bch2_members_v2_init(struct bch_fs *c) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; - if (!bch2_sb_get_members_v2(c->disk_sb.sb)) { - mi2 = bch2_sb_resize_members_v2(&c->disk_sb, + if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { + mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, DIV_ROUND_UP(sizeof(*mi2) + sizeof(struct bch_member) * c->sb.nr_devices, sizeof(u64))); - mi1 = bch2_sb_get_members_v1(c->disk_sb.sb); + mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); memcpy(&mi2->_members[0], &mi1->_members[0], BCH_MEMBER_V1_BYTES * c->sb.nr_devices); memset(&mi2->pad[0], 0, 
sizeof(mi2->pad)); @@ -96,13 +96,13 @@ int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; - mi1 = bch2_sb_resize_members_v1(disk_sb, + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); if (!mi1) return -BCH_ERR_ENOSPC_sb_members; - mi2 = bch2_sb_get_members_v2(disk_sb->sb); + mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); @@ -283,7 +283,7 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); unsigned i; for (i = 0; i < sb->nr_devices; i++) { @@ -301,7 +301,7 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); unsigned i; for (i = 0; i < sb->nr_devices; i++) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4498e24dabdb..332d41e1c0a3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -96,7 +96,7 @@ const char * const bch2_sb_fields[] = { static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, struct printbuf *); -struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -151,7 +151,7 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, void bch2_sb_field_delete(struct bch_sb_handle *sb, enum bch_sb_field_type type) { - struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); if (f) __bch2_sb_field_resize(sb, f, 0); @@ -225,11 +225,11 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) return 0; } -struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, enum bch_sb_field_type type, unsigned u64s) { - struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; @@ -255,7 +255,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, } } - f = bch2_sb_field_get(sb->sb, type); + f = bch2_sb_field_get_id(sb->sb, type); f = __bch2_sb_field_resize(sb, f, u64s); if (f) f->type = cpu_to_le32(type); @@ -458,7 +458,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } /* members must be validated first: */ - mi = bch2_sb_get_members_v1(sb); + mi = bch2_sb_field_get(sb, members_v1); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; @@ -546,8 +546,8 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; - src_f = bch2_sb_field_get(src, i); - dst_f = bch2_sb_field_get(dst, i); + src_f = bch2_sb_field_get_id(src, i); + dst_f = bch2_sb_field_get_id(dst, i); d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - (dst_f ? le32_to_cpu(dst_f->u64s) : 0); @@ -559,7 +559,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) return ret; dst = dst_handle->sb; - dst_f = bch2_sb_field_get(dst, i); + dst_f = bch2_sb_field_get_id(dst, i); } dst_f = __bch2_sb_field_resize(dst_handle, dst_f, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index f7e9099931a7..b0d8584f475f 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -23,31 +23,19 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c, unsigned, unsigned); -struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); - #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) -#define x(_name, _nr) \ -static inline struct bch_sb_field_##_name * \ -bch2_sb_get_##_name(struct bch_sb *sb) \ -{ \ - return field_to_type(bch2_sb_field_get(sb, \ - BCH_SB_FIELD_##_name), _name); \ -} \ - \ -static inline struct bch_sb_field_##_name * \ -bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -{ \ - return field_to_type(bch2_sb_field_resize(sb, \ - BCH_SB_FIELD_##_name, u64s), _name); \ -} +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); +#define bch2_sb_field_get(_sb, _name) \ + field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) -BCH_SB_FIELDS() -#undef x +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); +#define bch2_sb_field_resize(_sb, _name, _u64s) \ + field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) + +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2014f7816f66..1c775695336d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1630,9 +1630,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } - mi = bch2_sb_get_members_v2(ca->disk_sb.sb); + mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2); - if (!bch2_sb_resize_members_v2(&ca->disk_sb, + if (!bch2_sb_field_resize(&ca->disk_sb, members_v2, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { ret = -BCH_ERR_ENOSPC_sb_members; @@ -1656,7 +1656,7 @@ have_slot: u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + 
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - mi = bch2_sb_resize_members_v2(&c->disk_sb, u64s); + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) { ret = -BCH_ERR_ENOSPC_sb_members; bch_err_msg(c, ret, "setting up new superblock"); -- cgit From 1e2d399970ca1d66fa1c715f4d80a0251f2a122e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Sep 2023 00:46:30 -0400 Subject: bcachefs: Fix snapshot skiplists during snapshot deletion In snapshot deleion, we have to pick new skiplist nodes for entries that point to nodes being deleted. The function that finds a new skiplist node, skipping over entries being deleted, was incorrect: if n = 0, but the parent node is being deleted, we also need to skip over that node. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 3ecc17b8d6fc..b8c32d1cbd76 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1288,6 +1288,9 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, snapshot_id_list *skip) { rcu_read_lock(); + while (snapshot_list_has_id(skip, id)) + id = __bch2_snapshot_parent(c, id); + while (n--) { do { id = __bch2_snapshot_parent(c, id); -- cgit From 37fad9497f5d37d89ed06faa64d580d1451be664 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Sep 2023 01:15:33 -0400 Subject: bcachefs: snapshot_create_lock Add a new lock for snapshot creation - this addresses a few races with logged operations and snapshot deletion. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs-ioctl.c | 14 ++++++++++++-- fs/bcachefs/io_misc.c | 22 ++++++++++++++++++++-- fs/bcachefs/snapshot.c | 13 ++++++++++--- fs/bcachefs/super.c | 1 + 5 files changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e9d07f9fa1c0..53ffa88cae16 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -746,6 +746,7 @@ struct bch_fs { struct snapshot_table __rcu *snapshots; size_t snapshot_table_size; struct mutex snapshot_table_lock; + struct rw_semaphore snapshot_create_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 0679b2f79fd6..6040bd3f0778 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -318,8 +318,8 @@ err: return ret; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -440,6 +440,16 @@ err1: return error; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + down_write(&c->snapshot_create_lock); + long ret = __bch2_ioctl_subvolume_create(c, filp, arg); + up_write(&c->snapshot_create_lock); + + return ret; +} + static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 32432bdddac4..119834cb8f9e 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -287,9 +287,18 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec op.v.inum = cpu_to_le64(inum.inum); op.v.new_i_size 
= cpu_to_le64(new_i_size); - return bch2_trans_run(c, + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, bch2_logged_op_start(trans, &op.k_i) ?: __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; } /* finsert/fcollapse: */ @@ -491,7 +500,16 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, op.v.src_offset = cpu_to_le64(offset); op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); - return bch2_trans_run(c, + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, bch2_logged_op_start(trans, &op.k_i) ?: __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index b8c32d1cbd76..4982468bfe11 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1447,6 +1447,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } } + down_write(&c->snapshot_create_lock); + for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { u32 snapshot = k.k->p.offset; @@ -1457,6 +1459,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err_create_lock; + /* * Fixing children of deleted snapshots can't be done completely * atomically, if we crash between here and when we delete the interior @@ -1467,14 +1472,14 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) - goto err; + goto err_create_lock; darray_for_each(deleted, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); - goto err; + goto err_create_lock; } } @@ -1483,11 +1488,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); - goto err; + goto err_create_lock; } } clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err_create_lock: + up_write(&c->snapshot_create_lock); err: darray_exit(&deleted_interior); darray_exit(&deleted); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1c775695336d..0e85c22672be 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -720,6 +720,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); + init_rwsem(&c->snapshot_create_lock); spin_lock_init(&c->btree_write_error_lock); -- cgit From 795413c5488536476dacff9e565e53a51d7798d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Sep 2023 16:13:00 -0400 Subject: bcachefs: Fix drop_alloc_keys() For consistency with the rest of the reconstruct_alloc option, we should be skipping all alloc keys. 
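[Aside: an illustrative sketch, not bcachefs code, of the shape of this fix: the journal-key filter keeps compacting the array in place, but the predicate now covers every alloc-related btree rather than only the alloc btree itself.]

#include <stdio.h>
#include <stdbool.h>

enum btree { BTREE_alloc, BTREE_backpointers, BTREE_freespace, BTREE_extents, BTREE_inodes };

struct key { enum btree btree_id; };

static bool btree_is_alloc(enum btree id)
{
	switch (id) {
	case BTREE_alloc:
	case BTREE_backpointers:
	case BTREE_freespace:
		return true;
	default:
		return false;
	}
}

/* drop alloc-related keys in place, returning the new length */
static unsigned drop_alloc(struct key *d, unsigned nr)
{
	unsigned src, dst;

	for (src = 0, dst = 0; src < nr; src++)
		if (!btree_is_alloc(d[src].btree_id))
			d[dst++] = d[src];
	return dst;
}

int main(void)
{
	struct key d[] = {
		{ BTREE_alloc }, { BTREE_extents }, { BTREE_freespace }, { BTREE_inodes },
	};
	unsigned nr = drop_alloc(d, 4);

	printf("%u keys left\n", nr);	/* 2: extents and inodes */
	return 0;
}
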
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cd037f2e4235..4cd660650e5b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -35,13 +35,27 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + /* for -o reconstruct_alloc: */ static void drop_alloc_keys(struct journal_keys *keys) { size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (keys->d[src].btree_id != BTREE_ID_alloc) + if (!btree_id_is_alloc(keys->d[src].btree_id)) keys->d[dst++] = keys->d[src]; keys->nr = dst; @@ -332,20 +346,6 @@ static int journal_replay_early(struct bch_fs *c, /* sb clean section: */ -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - static int read_btree_roots(struct bch_fs *c) { unsigned i; -- cgit From 7413ab70cbc15bd8f35a78569ec5a778fb9a3e1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 18 Oct 2023 16:07:32 -0700 Subject: bcachefs: Refactor memcpy into direct assignment The memcpy() in bch2_bkey_append_ptr() is operating on an embedded fake flexible array which looks to the compiler like it has 0 size. This causes W=1 builds to emit warnings due to -Wstringop-overflow: In file included from include/linux/string.h:254, from include/linux/bitmap.h:11, from include/linux/cpumask.h:12, from include/linux/smp.h:13, from include/linux/lockdep.h:14, from include/linux/radix-tree.h:14, from include/linux/backing-dev-defs.h:6, from fs/bcachefs/bcachefs.h:182: fs/bcachefs/extents.c: In function 'bch2_bkey_append_ptr': include/linux/fortify-string.h:57:33: warning: writing 8 bytes into a region of size 0 [-Wstringop-overflow=] 57 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:648:9: note: in expansion of macro '__underlying_memcpy' 648 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:693:26: note: in expansion of macro '__fortify_memcpy_chk' 693 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ fs/bcachefs/extents.c:235:17: note: in expansion of macro 'memcpy' 235 | memcpy((void *) &k->v + bkey_val_bytes(&k->k), | ^~~~~~ fs/bcachefs/bcachefs_format.h:287:33: note: destination object 'v' of size 0 287 | struct bch_val v; | ^ Avoid making any structure changes and just replace the u64 copy into a direct assignment, side-stepping the entire problem. 
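[Aside: a userspace sketch, illustrative only and not the bcachefs code, of the idea in the patch below: compute a typed pointer to where the new entry goes and use struct assignment, so there is no memcpy() into what the compiler sees as a zero-sized region.]

#include <stdio.h>

struct ptr {
	unsigned dev;
	unsigned long long offset;
};

struct key {
	unsigned nr_ptrs;
	/* value area that grows by one struct ptr per append */
	_Alignas(struct ptr) unsigned char val[64];
};

static void append_ptr(struct key *k, struct ptr p)
{
	struct ptr *dest = (struct ptr *) (k->val + k->nr_ptrs * sizeof(p));

	*dest = p;	/* direct assignment instead of memcpy(dest, &p, sizeof(p)) */
	k->nr_ptrs++;
}

int main(void)
{
	struct key k = { 0 };

	append_ptr(&k, (struct ptr) { .dev = 1, .offset = 4096 });
	append_ptr(&k, (struct ptr) { .dev = 2, .offset = 8192 });
	printf("nr_ptrs=%u\n", k.nr_ptrs);
	return 0;
}
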
Cc: Kent Overstreet Cc: Brian Foster Cc: linux-bcachefs@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309192314.VBsjiIm5-lkp@intel.com/ Link: https://lore.kernel.org/r/20231010235609.work.594-kees@kernel.org Signed-off-by: Kees Cook Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 978ae5e7e8b1..879e7d218b6a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -633,6 +633,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { + struct bch_extent_ptr *dest; + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); switch (k->k.type) { @@ -642,12 +644,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - - unsafe_memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr), - "Our memcpy target is relative to a zero size array ," - "compiler bounds checking doesn't work here"); + dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); + *dest = ptr; k->k.u64s++; break; default: -- cgit